diff --git a/CI_SCRIPTS/CPPLINT.cfg b/CI_SCRIPTS/CPPLINT.cfg
new file mode 100644
index 00000000..db5b0f46
--- /dev/null
+++ b/CI_SCRIPTS/CPPLINT.cfg
@@ -0,0 +1 @@
+filter=-whitespace/line_length,-readability/casting,-whitespace/braces,-build/header_guard,-build/include_subdir,-runtime/explicit,-runtime/printf,-runtime/int,-whitespace/end_of_line,-readability/fn_size,-build/include_order,-build/include_what_you_use,-whitespace/indent
diff --git a/CI_SCRIPTS/benchmark_verify.sh b/CI_SCRIPTS/benchmark_verify.sh
new file mode 100644
index 00000000..37cfc79c
--- /dev/null
+++ b/CI_SCRIPTS/benchmark_verify.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# convert a model with X2bolt, run the benchmark binary on the result, and
+# check the benchmark output against an expected result string
+benchmark_verify() {
+    device=$1
+    x2bolt_path=$2
+    benchmark_path=$3
+    model_zoo_directory=$4
+    model_type=$5
+    model_name=$6
+    precision=$7
+    affinity=$8
+    loops=$9
+    result=${10}
+
+    model_directory=${model_zoo_directory}/${model_type}_models/${model_name}
+    if [[ "${precision}" == "FP32" ]]; then
+        precision_suffix="_f32"
+    fi
+    if [[ "${precision}" == "FP16" ]]; then
+        precision_suffix="_f16"
+    fi
+    if [[ "${precision}" == "INT8_Q" ]]; then
+        precision_suffix="_int8"
+    fi
+    model_convert_command="${x2bolt_path} -d ${model_directory} -m ${model_name} -i ${precision}"
+    benchmark_command="${benchmark_path} -m ${model_directory}/${model_name}${precision_suffix}.bolt -a ${affinity} -l ${loops}"
+    if [[ "${device}" == "host" ]]; then
+        ${model_convert_command} > /dev/null && ${benchmark_command} &> engine_result.txt
+    else
+        adb -s ${device} shell "${model_convert_command} && ${benchmark_command}" &> engine_result.txt
+    fi
+
+    avg_time=$(grep -I "avg_time:" ./engine_result.txt)
+    verify_result=$(grep -I "${result}" ./engine_result.txt)
+
+    rm -rf engine_result.txt
+
+    if [[ ${#verify_result} -gt 0 ]]
+    then
+        echo "${model_name} on ${device} in ${precision} precision ${avg_time}"
+    else
+        echo "${model_name} on ${device} in ${precision} precision fail!"
+        exit 1
+    fi
+}
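+# example invocation (paths, serial number and expected string are hypothetical):
+#   benchmark_verify <adb-serial> /data/local/tmp/CI/arm_llvm/tools/X2bolt \
+#       /data/local/tmp/CI/arm_llvm/bin/benchmark /data/local/tmp/CI/model_zoo \
+#       caffe squeezenet FP16 CPU_AFFINITY_HIGH_PERFORMANCE 6 '<expected output substring>'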
diff --git a/CI_SCRIPTS/benchmark_verify_serial.sh b/CI_SCRIPTS/benchmark_verify_serial.sh
new file mode 100644
index 00000000..386a6fec
--- /dev/null
+++ b/CI_SCRIPTS/benchmark_verify_serial.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+script_name=$0
+script_abs=$(readlink -f "$0")
+script_dir=$(dirname $script_abs)
+
+source ${script_dir}/benchmark_verify.sh
+
+BOLT_ROOT=${script_dir}/..
+loops=6
+phone=$1
+
+# arm gnu
+arch=arm_gnu
+x2bolt_path=/data/local/tmp/CI/${arch}/tools/X2bolt
+benchmark_path=/data/local/tmp/CI/${arch}/bin/benchmark
+model_zoo_directory=/data/local/tmp/CI/model_zoo
+#benchmark_verify ${phone} ${x2bolt_path} ${benchmark_path} ${model_zoo_directory} tflite mbmelgan FP32 CPU_AFFINITY_HIGH_PERFORMANCE ${loops} ''
+
+# x86 gnu
+arch=x86_gnu
+x2bolt_path=${BOLT_ROOT}/install_${arch}/tools/X2bolt
+benchmark_path=${BOLT_ROOT}/install_${arch}/examples/benchmark
+model_zoo_directory=/data/bolt/model_zoo
+benchmark_verify host ${x2bolt_path} ${benchmark_path} ${model_zoo_directory} tflite mbmelgan FP32 CPU_AFFINITY_HIGH_PERFORMANCE ${loops} '\-0.295808 0.563926 1.235842'
diff --git a/CI_SCRIPTS/dir_cpplint.sh b/CI_SCRIPTS/dir_cpplint.sh
new file mode 100644
index 00000000..b30b98e5
--- /dev/null
+++ b/CI_SCRIPTS/dir_cpplint.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+script_name=$0
+script_abs=$(readlink -f "$0")
+script_dir=$(dirname $script_abs)
+
+# copy the lint filter config into the target directory so cpplint picks it up
+cp ${script_dir}/CPPLINT.cfg $1
+cd $1
+cpplint --recursive --extensions=cpp,h,hpp,cl .
+rm CPPLINT.cfg
+echo " "
diff --git a/CI_SCRIPTS/format_code.sh b/CI_SCRIPTS/format_code.sh
new file mode 100644
index 00000000..3961883f
--- /dev/null
+++ b/CI_SCRIPTS/format_code.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+script_name=$0
+script_abs=$(readlink -f "$0")
+script_dir=$(dirname $script_abs)
+
+fileSuffix=(h hpp c cpp cl)
+
+cd ${script_dir}/../
+format() {
+    file=$1
+    echo "format: $file"
+    #/data/opt/uncrustify-master/build/uncrustify -c /data/opt/uncrustify-master/forUncrustifySources.cfg -f $file > tmp.cpp
+    #sed -i "s/\/\/ /\/\//g" ./tmp.cpp
+    #sed -i "s/\/\//\/\/ /g" ./tmp.cpp
+    #clang-format -i tmp.cpp
+    #cp tmp.cpp $file
+    #rm tmp.cpp
+    clang-format -i $file
+}
+
+format_all() {
+    dirs=(inference common model_tools compute kit)
+    for suffix in ${fileSuffix[*]}
+    do
+        for dir in ${dirs[*]}
+        do
+            for file in `find $dir -name "*.$suffix"`
+            do
+                format $file
+            done
+        done
+    done
+}
+
+format_change() {
+    key=$1
+    # normalize the full-width colon printed by a Chinese-locale git before cutting on ':'
+    files=`git status | grep "${key}" | sed s/[[:space:]]//g | sed s/：/:/g | cut -d ":" -f 2`
+    for file in ${files[*]}
+    do
+        fresh=false
+        for suffix in ${fileSuffix[*]}
+        do
+            if [[ $file == *.${suffix} ]]; then
+                fresh=true
+            fi
+        done
+        if [[ $fresh == true ]]; then
+            format $file
+        fi
+    done
+}
+
+format_change "modified:"
+# also match git status output when the locale is Chinese ("修改:" means "modified:")
+format_change "修改:"
diff --git a/CI_SCRIPTS/genCommandLines.sh b/CI_SCRIPTS/genCommandLines.sh
new file mode 100644
index 00000000..1a3dd1b8
--- /dev/null
+++ b/CI_SCRIPTS/genCommandLines.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+# cross two value arrays and append every pairwise "a--b" combination
+# to single_combinations.txt
+fun_gen_in_two_arrs() {
+    rm -rf ./single_combinations.txt
+    #touch ./single_combinations.txt
+    local _firstArr=(`echo $1|cut -d " " --output-delimiter=" " -f 1-`)
+    local _firstArrLen=${#_firstArr[@]}
+    local _secondArr=(`echo $2|cut -d " " --output-delimiter=" " -f 1-`)
+    local _secondArrLen=${#_secondArr[@]}
+    index=0
+    for ((i=0;i<_firstArrLen;i++))
+    do
+        for ((j=0;j<_secondArrLen;j++))
+        do
+            elem1=${_firstArr[$i]}
+            elem2=${_secondArr[$j]}
+            combine_str=$elem1"--"$elem2
+            echo $combine_str >> ./single_combinations.txt
+            let index+=1
+        done
+    done
+}
+
+rm -rf ./final_combinations.txt
+while read line
+do
+    if [[ ${line} =~ ^#.* ]]; then
+        continue
+    fi
+    original_strs=()
+    original_index=0
+    for i in $(echo $line| tr "&" "\n")
+    do
+        original_strs[$original_index]=$i
+        let original_index+=1
+    done
+
+    for i in "${!original_strs[@]}";
+    do
+        sub_str=${original_strs[$i]}
+        if [ $i == 0 ]
+        then
+            rm -rf ./single_combinations.txt
+            for j in $(echo $sub_str| tr ";" "\n")
+            do
+                echo $j >> ./single_combinations.txt
+            done
+        else
+            sub_firstArr=()
+            sub_firstIndex=0
+            for line in `cat ./single_combinations.txt`
+            do
+                sub_firstArr[$sub_firstIndex]=$line
+                let sub_firstIndex+=1
+            done
+            sub_secondArr=($(echo "$sub_str"| tr ";" "\n"))
+            fun_gen_in_two_arrs "$(echo ${sub_firstArr[@]})" "$(echo ${sub_secondArr[@]})"
+        fi
+    done
+
+    cat ./single_combinations.txt >> ./final_combinations.txt
+done < $1
+rm -rf ./single_combinations.txt
diff --git a/CI_SCRIPTS/inference_big.txt b/CI_SCRIPTS/inference_big.txt
new file mode 100644
index 00000000..81a7eecb
--- /dev/null
+++ b/CI_SCRIPTS/inference_big.txt
@@ -0,0 +1,66 @@
+# ARMv8+ CPU GNU section
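+# Each non-comment line is a '&'-separated test case; ';' inside a field lists
+# alternative values that genCommandLines.sh expands into one run per value.
+# Inferred field order: model&arch&framework&os&toolchain&device_id&core&precision&target,
+# followed by model-specific data paths, input sizes and runtime flags.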
+asr_convolution_transformer_joint_net&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/asr/asr_convolution_transformer/joint_net&@S+joint_net&@S+joint_net&
+tinybert384&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16;int8&cpu&1&nlp/slot_intent&32+32+32&
+tinybert&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16;int8&cpu&1&nlp/slot_intent&32+32+32&
+tinybert_onnx&arm&onnx&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/slot_intent_onnx&32+32+32&
+nmt&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/machine_translation&32+32+32&
+asr_rnnt&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/asr/asr_rnnt&32+32+32&
+asr_convolution_transformer_encoder&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/asr/asr_convolution_transformer/first_frame&@S+encoder&@S+encoder&
+asr_convolution_transformer_encoder&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/asr/asr_convolution_transformer/second_frame&@S+encoder&@S+encoder&
+asr_convolution_transformer_prediction_net&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/asr/asr_convolution_transformer/prediction_net&@S+prediction_net&@S+prediction_net&
+tts_encoder_decoder&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/tts/encoder_decoder&@S+encoder_decoder&@S+encoder_decoder&
+tts_postnet&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/tts/postnet&@S+postnet&@S+postnet&
+tts_melgan_vocoder&arm&onnx&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/tts/melgan_vocoder&@S+melgan_vocoder&@S+melgan_vocoder&0
+ghostnet&arm&onnx&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR_SC_RAW+@s+1+@t+5+@c+151&2
+mobilenet_v1&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151
+mobilenet_v2&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151
+squeezenet&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16;int8&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151
+mobilenet_v3&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+RGB+@s+0.017+@t+5+@c+151
+fingerprint_resnet18&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&cv/fingerprint&UNKNOWN&
+resnet50&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16;int8&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151
+vad&arm&tflite&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&&&&
+birealnet18&arm&onnx&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp16&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+RGB_SC+@s+1+@t+5+@c+151&0
+# ARMv7 CPU section
+#asr_convolution_transformer_joint_net&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/asr/asr_convolution_transformer/joint_net&@S+joint_net&@S+joint_net&
+tinybert384&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/slot_intent&32+32+32&
+tinybert&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/slot_intent&32+32+32&
+tinybert_onnx&arm&onnx&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/slot_intent_onnx&32+32+32&
+nmt&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/machine_translation&32+32+32&
+asr_rnnt&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/asr/asr_rnnt&32+32+32&
+asr_convolution_transformer_encoder&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/asr/asr_convolution_transformer/first_frame&@S+encoder&@S+encoder&
+asr_convolution_transformer_encoder&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/asr/asr_convolution_transformer/second_frame&@S+encoder&@S+encoder&
+asr_convolution_transformer_prediction_net&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/asr/asr_convolution_transformer/prediction_net&@S+prediction_net&@S+prediction_net&
+tts_encoder_decoder&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/tts/encoder_decoder&@S+encoder_decoder&@S+encoder_decoder&
+tts_postnet&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/tts/postnet&@S+postnet&@S+postnet&
+tts_melgan_vocoder&arm&onnx&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/tts/melgan_vocoder&@S+melgan_vocoder&@S+melgan_vocoder&0
+ghostnet&arm&onnx&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR_SC_RAW+@s+1+@t+5+@c+151&2
+mobilenet_v1&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151
+mobilenet_v2&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151
+squeezenet&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151
+mobilenet_v3&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+RGB+@s+0.017+@t+5+@c+151
+fingerprint_resnet18&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&cv/fingerprint&UNKNOWN&
+resnet50&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151
+vad&arm&tflite&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&&&&
+# X86 CPU section
+tinybert384&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/slot_intent&32+32+32&
+tinybert&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/slot_intent&32+32+32&
+tinybert_onnx&x86&onnx&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/slot_intent_onnx&32+32+32&
+tinybert_disambiguate&arm&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/slot_intent&32+32+32&
+nmt&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/machine_translation&32+32+32&
+nmt_tsc_encoder&arm&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/machine_translation&32+32+32&
+nmt_tsc_decoder&arm&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/machine_translation&32+32+32&
+asr_rnnt&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/asr/asr_rnnt&32+32+32&
+asr_convolution_transformer_encoder&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/asr/asr_convolution_transformer/first_frame&@S+encoder&@S+encoder&
+asr_convolution_transformer_encoder&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/asr/asr_convolution_transformer/second_frame&@S+encoder&@S+encoder&
+asr_convolution_transformer_prediction_net&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/asr/asr_convolution_transformer/prediction_net&@S+prediction_net&@S+prediction_net&
+tts_encoder_decoder&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/tts/encoder_decoder&@S+encoder_decoder&@S+encoder_decoder&
+tts_postnet&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/tts/postnet&@S+postnet&@S+postnet&
+tts_melgan_vocoder&x86&onnx&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/tts/melgan_vocoder&@S+melgan_vocoder&@S+melgan_vocoder&0
+ghostnet&x86&onnx&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR_SC_RAW+@s+1+@t+5+@c+151&2
+mobilenet_v1&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151
+mobilenet_v2&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151
+squeezenet&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151
+mobilenet_v3&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+RGB+@s+0.017+@t+5+@c+151
+fingerprint_resnet18&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/fingerprint&UNKNOWN&
+resnet50&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151
+vad&x86&tflite&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&&&&
diff --git a/CI_SCRIPTS/inference_serial.txt b/CI_SCRIPTS/inference_serial.txt
new file mode 100644
index 00000000..a283f17e
--- /dev/null
+++ b/CI_SCRIPTS/inference_serial.txt
@@ -0,0 +1,19 @@
+# X86 CPU section
+tinybert384&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/slot_intent&32+32+32&
+tinybert&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/slot_intent&32+32+32&
+tinybert_onnx&x86&onnx&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/slot_intent_onnx&32+32+32&
+nmt&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/machine_translation&32+32+32&
+asr_rnnt&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/asr/asr_rnnt&32+32+32&
+asr_convolution_transformer_encoder&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/asr/asr_convolution_transformer/first_frame&@S+encoder&@S+encoder&
+asr_convolution_transformer_encoder&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/asr/asr_convolution_transformer/second_frame&@S+encoder&@S+encoder&
+asr_convolution_transformer_prediction_net&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/asr/asr_convolution_transformer/prediction_net&@S+prediction_net&@S+prediction_net&
+tts_encoder_decoder&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/tts/encoder_decoder&@S+encoder_decoder&@S+encoder_decoder&
+tts_postnet&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/tts/postnet&@S+postnet&@S+postnet&
+tts_melgan_vocoder&x86&onnx&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/tts/melgan_vocoder&@S+melgan_vocoder&@S+melgan_vocoder&0
+#mobilenet_v1&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151
+#mobilenet_v2&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151
+#squeezenet&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151
+mobilenet_v3&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+RGB+@s+0.017+@t+5+@c+151
+fingerprint_resnet18&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/fingerprint&UNKNOWN&
+#resnet50&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151
+vad&x86&tflite&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&&&&
diff --git a/CI_SCRIPTS/inference_small.txt b/CI_SCRIPTS/inference_small.txt
new file mode 100644
index 00000000..16e6abd6
--- /dev/null
+++ b/CI_SCRIPTS/inference_small.txt
@@ -0,0 +1,30 @@
+# ARMv8+ CPU LLVM section
+tinybert384&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16;int8&cpu&1&nlp/slot_intent&32+32+32&
+tinybert&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16;int8&cpu&1&nlp/slot_intent&32+32+32&
+tinybert_onnx&arm&onnx&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/slot_intent_onnx&32+32+32&
+tinybert_disambiguate&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A76&fp32;fp16&cpu&1&nlp/slot_intent&32+32+32&
+nmt&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/machine_translation&32+32+32&
+nmt_tsc_encoder&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A76&fp32;fp16&cpu&1&nlp/machine_translation&32+32+32&
+nmt_tsc_decoder&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A76&fp32;fp16&cpu&1&nlp/machine_translation&32+32+32&
+asr_rnnt&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/asr/asr_rnnt&32+32+32&
+asr_convolution_transformer_encoder&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/asr/asr_convolution_transformer/first_frame&@S+encoder&@S+encoder&
+asr_convolution_transformer_encoder&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/asr/asr_convolution_transformer/second_frame&@S+encoder&@S+encoder&
+asr_convolution_transformer_prediction_net&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/asr/asr_convolution_transformer/prediction_net&@S+prediction_net&@S+prediction_net&
+tts_encoder_decoder&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/tts/encoder_decoder&@S+encoder_decoder&@S+encoder_decoder&
+tts_postnet&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/tts/postnet&@S+postnet&@S+postnet&
+tts_melgan_vocoder&arm&onnx&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/tts/melgan_vocoder&@S+melgan_vocoder&@S+melgan_vocoder&0
+ghostnet&arm&onnx&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A76&fp32;fp16&cpu&1&cv/ILSVRC/n02085620&1*224*224*3&@f+BGR_SC_RAW+@s+1+@t+5+@c+151&2
+mobilenet_v1&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151
+mobilenet_v2&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151
+squeezenet&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16;int8&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151
+mobilenet_v3&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+RGB+@s+0.017+@t+5+@c+151
+fingerprint_resnet18&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&cv/fingerprint&UNKNOWN&
+resnet50&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16;int8&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151
+vad&arm&tflite&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&&&&
+birealnet18&arm&onnx&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp16&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+RGB_SC+@s+1+@t+5+@c+151&0
+# ARMv8+ GPU LLVM section
+mobilenet_v1&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55&fp16&gpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151
+mobilenet_v2&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55&fp16&gpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151
+mobilenet_v3&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55&fp16&gpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+RGB+@s+0.017+@t+5+@c+151
+squeezenet&arm&caffe&ubuntu16_04&llvm&GCL5T19822000030&A55&fp16&gpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151
+#ghostnet&arm&onnx&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55&fp16&gpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR_SC_RAW+@s+1+@t+5+@c+151&3
diff --git a/CI_SCRIPTS/java_api_test.sh b/CI_SCRIPTS/java_api_test.sh
new file mode 100644
index 00000000..600fd566
--- /dev/null
+++ b/CI_SCRIPTS/java_api_test.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+device=$1
+script_name=$0
+script_abs=$(readlink -f "$0")
+script_dir=$(dirname $script_abs)
+BOLT_ROOT=${script_dir}/..
+if [ "${device}" == "x86_HOST" ]; then
+    ci_dir=/data/bolt
+    build_dir=${BOLT_ROOT}/build_x86_gnu
+    install_dir=${BOLT_ROOT}/install_x86_gnu
+else
+    ci_dir=/data/local/tmp/CI
+    build_dir=${BOLT_ROOT}/build_arm_llvm
+    install_dir=${BOLT_ROOT}/install_arm_llvm
+    device_dir=${ci_dir}/java
+fi
+
+current_dir=${PWD}
+
+cd ${build_dir}
+cp ${install_dir}/include/java/* .
+cp ${BOLT_ROOT}/inference/examples/java_api/test_api_java.java .
+javac BoltResult.java || exit 1
+javac BoltModel.java || exit 1
+javac test_api_java.java || exit 1
+
+if [ "${device}" != "x86_HOST" ]; then
+    dx --dex --output=test_java_api.jar *.class || exit 1
+    adb -s ${device} shell rm -rf ${device_dir}
+    adb -s ${device} shell mkdir ${device_dir} || exit 1
+    adb -s ${device} push ${install_dir}/lib/libBoltModel.so ${device_dir} > /dev/null || exit 1
+    if [ -f "${install_dir}/lib/libkernelsource.so" ]; then
+        adb -s ${device} push ${install_dir}/lib/libkernelsource.so ${device_dir} > /dev/null || exit 1
+    fi
+    if [ -f "${BOLT_ROOT}/third_party/arm_llvm/opencl/lib64/libc++_shared.so" ]; then
+        adb -s ${device} push ${BOLT_ROOT}/third_party/arm_llvm/opencl/lib64/libc++_shared.so ${device_dir} > /dev/null || exit 1
+    fi
+    if [ -f "${install_dir}/lib/libOpenCL.so" ]; then
+        adb -s ${device} push ${install_dir}/lib/libOpenCL.so ${device_dir} > /dev/null || exit 1
+    fi
+    adb -s ${device} push ./test_java_api.jar ${device_dir} > /dev/null || exit 1
+
+    adb -s ${device} shell "cd ${device_dir} && export LD_LIBRARY_PATH=/apex/com.android.runtime/lib64/bionic:/system/lib64 && dalvikvm -cp ./test_java_api.jar test_api_java ${device} ${ci_dir}" 2> status.txt
+else
+    java test_api_java ${device} ${ci_dir} 2> status.txt
+fi
+
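+# "$?" below still holds the exit status of the adb/java command above, since
+# an if statement exits with the status of the last command it ran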
+if [ "$?" != 0 ]; then
+    cat status.txt
+    if cat ./status.txt | grep "couldn't find an OpenCL implementation" > /dev/null
+    then
+        echo "GPU environment error"
+    else
+        exit 1
+    fi
+fi
+
+if [ "${device}" != "x86_HOST" ]; then
+    adb -s ${device} shell rm -rf ${device_dir}
+fi
+
+cd ${current_dir}
diff --git a/CI_SCRIPTS/model_tools_test.sh b/CI_SCRIPTS/model_tools_test.sh
new file mode 100644
index 00000000..0b488d4b
--- /dev/null
+++ b/CI_SCRIPTS/model_tools_test.sh
@@ -0,0 +1,183 @@
+#!/bin/bash
+
+script_name=$0
+script_abs=$(readlink -f "$0")
+script_dir=$(dirname $script_abs)
+
+host_bin_dir=""
+host_lib_dir=""
+exe_on_device=false
+use_static_library=true
+memory_reuse=true
+remove=true
+device=""
+cpu_mask="2"
+device_dir=""
+model_zoo_dir=""
+
+print_help() {
+    cat <<EOF
+Usage: ${script_name} [OPTIONS]
+  -b, --bin <path>        run specified program in <path>.
+  -l, --lib <path>        use dynamic library in <path>.
+  -d, --device <device>   run test on <device>.
+  -c, --cpu_mask <mask>   taskset cpu mask (default: 2).
+  -p, --path <path>       run test on device in specified <path>.
+  -m, --model_zoo <dir>   use prepared models in model_zoo (<dir>/[caffe|onnx|tflite]_models).
+  -r, --remove <bool>     remove device tmp directory or not.
+EOF
+    exit 1;
+}
+
+TEMP=`getopt -o b:c:hl:d:p:r:m: --long bin:,cpu_mask:,help,lib:,device:,path:,remove:,model_zoo: \
+     -n ${script_name} -- "$@"`
+if [ $? != 0 ] ; then echo "[ERROR] terminating..." >&2 ; exit 1 ; fi
+eval set -- "$TEMP"
+while true ; do
+    case "$1" in
+        -b|--bin)
+            host_bin_dir=$2
+            echo "[INFO] run test in ${host_bin_dir}" ;
+            shift 2 ;;
+        -c|--cpu_mask)
+            cpu_mask=$2
+            echo "[INFO] CPU mask ${cpu_mask}" ;
+            shift 2 ;;
+        -l|--lib)
+            host_lib_dir=$2
+            use_static_library=false
+            echo "[INFO] use library in ${host_lib_dir}" ;
+            shift 2 ;;
+        -d|--device)
+            device=$2
+            exe_on_device=true
+            echo "[INFO] test on device ${device}" ;
+            shift 2 ;;
+        -m|--model_zoo)
+            model_zoo_dir=$2
+            echo "[INFO] use model_zoo ${model_zoo_dir}" ;
+            shift 2 ;;
+        -p|--path)
+            device_dir=$2
+            echo "[INFO] test on device directory ${device_dir}" ;
+            shift 2 ;;
+        -r|--remove)
+            remove=$2
+            echo "[INFO] clear tmp directory ${remove}" ;
+            shift 2;;
+        -h|--help)
+            print_help ;
+            shift ;;
+        --) shift ;
+            break ;;
+        *) echo "[ERROR] $1" ; exit 1 ;;
+    esac
+done
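+# example invocation (paths and serial are hypothetical):
+#   bash model_tools_test.sh -b install_arm_llvm/tools -l install_arm_llvm/lib \
+#       -d <adb-serial> -p /data/local/tmp/CI -m /data/local/tmp/CI/model_zoo -r true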
+run_command() {
+    params=$*
+
+    prefix="cd ${device_dir}/tmp"
+    if [[ ${memory_reuse} == true ]] ; then
+        prefix="$prefix && export BOLT_MEMORY_REUSE_OPTIMIZATION=ON"
+    else
+        prefix="$prefix && export BOLT_MEMORY_REUSE_OPTIMIZATION=OFF"
+    fi
+    if [[ ${exe_on_device} == true ]] ; then
+        if [[ ${use_static_library} == true ]] ; then
+            adb -s ${device} shell "$prefix && taskset ${cpu_mask} ./${params} || echo '[FAILURE]'" &> status.txt
+        else
+            adb -s ${device} shell "$prefix && export LD_LIBRARY_PATH=. && taskset ${cpu_mask} ./${params} || echo '[FAILURE]'" &> status.txt
+        fi
+    else
+        # ${prefix} is a '&&'-joined command string, so run it through eval; group
+        # the whole pipeline so status.txt is always created for the check below
+        if [[ ${use_static_library} == true ]] ; then
+            (eval "$prefix" && taskset ${cpu_mask} ${host_bin_dir}/${params} || echo '[FAILURE]') &> status.txt
+        else
+            (export LD_LIBRARY_PATH=${host_lib_dir}:${LD_LIBRARY_PATH} && eval "$prefix" && taskset ${cpu_mask} ${host_bin_dir}/${params} || echo '[FAILURE]') &> status.txt
+        fi
+    fi
+    cat status.txt || exit 1
+    if [ `grep -c "\[FAILURE\]" status.txt` -ne '0' ] ; then
+        exit 1
+    fi
+    rm status.txt
+}
+
+if [[ ${exe_on_device} == true ]] ; then
+    adb -s ${device} shell "mkdir ${device_dir}"
+    adb -s ${device} shell "rm -rf ${device_dir}/tmp"
+    adb -s ${device} shell "mkdir ${device_dir}/tmp"
+    adb -s ${device} shell "cp -r ${model_zoo_dir}/* ${device_dir}/tmp/"
+    adb -s ${device} shell "find ${device_dir}/tmp -name \"*\.bolt\" | xargs rm -rf"
+    if [[ ${use_static_library} != true ]] ; then
+        adb -s ${device} push ${host_lib_dir}/libuni.so ${device_dir}/tmp > /dev/null || exit 1
+        adb -s ${device} push ${host_lib_dir}/libmodel_tools.so ${device_dir}/tmp > /dev/null || exit 1
+        adb -s ${device} push ${host_lib_dir}/libmodel_tools_caffe.so ${device_dir}/tmp > /dev/null || exit 1
+        adb -s ${device} push ${host_lib_dir}/libmodel_tools_onnx.so ${device_dir}/tmp > /dev/null || exit 1
+        adb -s ${device} push ${host_lib_dir}/libmodel_tools_tflite.so ${device_dir}/tmp > /dev/null || exit 1
+        bash ${script_dir}/../scripts/push_third_party.sh -l ${script_dir}/../third_party/arm_llvm -d ${device} -p ${device_dir}/tmp -c arm_llvm
+    fi
+    adb -s ${device} push ${host_bin_dir}/X2bolt ${device_dir}/tmp > /dev/null || exit 1
+else
+    mkdir ${host_bin_dir}/tmp
+    cp -r ${model_zoo_dir}/* ${host_bin_dir}/tmp/
+fi
+
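+# X2bolt's -i flag selects the storage precision of the generated .bolt file
+# (FP32, FP16, or quantized INT8_Q), as exercised below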
+# caffe model
+# INT8
+run_command X2bolt -d caffe_models/squeezenet -m squeezenet -i INT8_Q
+run_command X2bolt -d caffe_models/tinybert384 -m tinybert384 -i INT8_Q
+run_command X2bolt -d caffe_models/tinybert -m tinybert -i INT8_Q
+# FP16
+run_command X2bolt -d caffe_models/mobilenet_v1 -m mobilenet_v1 -i FP16
+run_command X2bolt -d caffe_models/mobilenet_v2 -m mobilenet_v2 -i FP16
+run_command X2bolt -d caffe_models/mobilenet_v3 -m mobilenet_v3 -i FP16
+run_command X2bolt -d caffe_models/resnet50 -m resnet50 -i FP16
+run_command X2bolt -d caffe_models/squeezenet -m squeezenet -i FP16
+run_command X2bolt -d caffe_models/fingerprint_resnet18 -m fingerprint_resnet18 -i FP16
+run_command X2bolt -d caffe_models/tinybert384 -m tinybert384 -i FP16
+run_command X2bolt -d caffe_models/tinybert -m tinybert -i FP16
+run_command X2bolt -d caffe_models/tinybert_disambiguate -m tinybert_disambiguate -i FP16
+run_command X2bolt -d caffe_models/nmt -m nmt -i FP16
+run_command X2bolt -d caffe_models/nmt_tsc_encoder -m nmt_tsc_encoder -i FP16
+run_command X2bolt -d caffe_models/nmt_tsc_decoder -m nmt_tsc_decoder -i FP16
+run_command X2bolt -d caffe_models/tts_encoder_decoder -m tts_encoder_decoder -i FP16
+run_command X2bolt -d caffe_models/asr_rnnt -m asr_rnnt -i FP16
+run_command X2bolt -d caffe_models/tts_postnet -m tts_postnet -i FP16
+# FP32
+run_command X2bolt -d caffe_models/mobilenet_v1 -m mobilenet_v1 -i FP32
+run_command X2bolt -d caffe_models/mobilenet_v2 -m mobilenet_v2 -i FP32
+run_command X2bolt -d caffe_models/mobilenet_v3 -m mobilenet_v3 -i FP32
+run_command X2bolt -d caffe_models/resnet50 -m resnet50 -i FP32
+run_command X2bolt -d caffe_models/squeezenet -m squeezenet -i FP32
+run_command X2bolt -d caffe_models/fingerprint_resnet18 -m fingerprint_resnet18 -i FP32
+run_command X2bolt -d caffe_models/tinybert384 -m tinybert384 -i FP32
+run_command X2bolt -d caffe_models/tinybert -m tinybert -i FP32
+run_command X2bolt -d caffe_models/tinybert_disambiguate -m tinybert_disambiguate -i FP32
+run_command X2bolt -d caffe_models/nmt -m nmt -i FP32
+run_command X2bolt -d caffe_models/nmt_tsc_encoder -m nmt_tsc_encoder -i FP32
+run_command X2bolt -d caffe_models/nmt_tsc_decoder -m nmt_tsc_decoder -i FP32
+run_command X2bolt -d caffe_models/tts_encoder_decoder -m tts_encoder_decoder -i FP32
+run_command X2bolt -d caffe_models/asr_rnnt -m asr_rnnt -i FP32
+run_command X2bolt -d caffe_models/tts_postnet -m tts_postnet -i FP32
+
+# onnx model
+# BNN
+run_command X2bolt -d onnx_models/birealnet18 -m birealnet18 -i FP16
+run_command X2bolt -d onnx_models/birealnet18 -m birealnet18 -i FP32
+# FP16
+run_command X2bolt -d onnx_models/tts_melgan_vocoder -m tts_melgan_vocoder -i FP16
+# FP32
+run_command X2bolt -d onnx_models/tts_melgan_vocoder -m tts_melgan_vocoder -i FP32
+
+if [[ ${remove} == true ]] ; then
+    if [[ ${exe_on_device} == true ]] ; then
+        adb -s ${device} shell rm -rf ${device_dir}/tmp
+    else
+        rm -rf ${host_bin_dir}/tmp
+    fi
+fi
diff --git a/CI_SCRIPTS/operator_driver.sh b/CI_SCRIPTS/operator_driver.sh
new file mode 100644
index 00000000..9da4abd7
--- /dev/null
+++ b/CI_SCRIPTS/operator_driver.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+
+script_name=$0
+script_abs=$(readlink -f "$0")
+script_dir=$(dirname $script_abs)
+
+cpu_mask="2"
+exe_host_path=""
+parameter_file_path=""
+exe_on_device=false
+use_static_library=false
+device=""
+device_dir=""
+exe_device_path=""
+
+
+print_help() {
+    cat <<EOF
+Usage: ${script_name} [OPTIONS]
+  -e, --exe <path>        run specified program.
+  -i, --input <path>      parameter file PATH.
+  -s, --static <bool>     use the static library (default: false).
+  -c, --cpu_mask <mask>   taskset cpu mask (default: 2).
+  -d, --device <device>   run test on device.
+  -p, --path <path>       run test on device in specified <path>.
+EOF
+    exit 1;
+}
+
+TEMP=`getopt -o c:d:e:i:p:hs: --long cpu_mask:,device:,exe:,input:,path:,help,static: \
+     -n ${script_name} -- "$@"`
+if [ $? != 0 ] ; then echo "[ERROR] terminating..." >&2 ; exit 1 ; fi
+eval set -- "$TEMP"
+while true ; do
+    case "$1" in
+        -c|--cpu_mask)
+            cpu_mask=$2
+            echo "[INFO] CPU mask '${cpu_mask}'" ;
+            shift 2 ;;
+        -d|--device)
+            device=$2
+            exe_on_device=true
+            echo "[INFO] test on device '${device}'" ;
+            shift 2 ;;
+        -p|--path)
+            device_dir=$2
+            echo "[INFO] test on device path '${device_dir}'" ;
+            shift 2 ;;
+        -s|--static)
+            use_static_library=$2
+            echo "[INFO] use static library: ${use_static_library}" ;
+            shift 2;;
+        -e|--exe)
+            exe_host_path=$2
+            echo "[INFO] exe '${exe_host_path}'" ;
+            shift 2 ;;
+        -i|--input)
+            parameter_file_path=$2
+            echo "[INFO] parameter '${parameter_file_path}'" ;
+            shift 2 ;;
+        -h|--help)
+            print_help ;
+            shift ;;
+        --) shift ;
+            break ;;
+        *) echo "[ERROR] $1" ; exit 1 ;;
+    esac
+done
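+# example invocation (paths are hypothetical):
+#   bash operator_driver.sh -e build/tests/test_convolution -i CI_SCRIPTS/params/convolution.csv -c 2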
+if [ "${exe_host_path}" == "" ] || [ ! -f ${exe_host_path} ] ; then
+    echo "[ERROR] exe '${exe_host_path}' doesn't exist";
+    exit 1
+fi
+
+if [ "${parameter_file_path}" == "" ] || [ ! -f ${parameter_file_path} ] ; then
+    echo "[ERROR] parameter '${parameter_file_path}' doesn't exist";
+    exit 1
+fi
+
+if [[ ${exe_on_device} == true ]] ; then
+    exe_name=${exe_host_path##*/}
+    exe_device_path="${device_dir}/${exe_name}"
+    adb -s ${device} push ${exe_host_path} ${exe_device_path} > /dev/null || exit 1
+fi
+
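+# each non-comment, non-empty line of the parameter file below is one set of
+# command-line arguments passed to the test binary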
+commands=()
+while read line; do
+    commands[${#commands[*]}]=`echo ${line}`
+done < ${parameter_file_path}
+
+for((k=0;k<${#commands[@]};k++)){
+    params=${commands[k]}
+    # filter out the params that start with '#'
+    if [[ ! "$params" =~ ^#.* ]]; then
+        params_len=${#params}
+        if [[ $params_len -gt 0 ]]; then
+            #echo " parameter: ${params}"
+            if [[ ${exe_on_device} == true ]] ; then
+                library_reference=""
+                if [[ ${use_static_library} != true ]] ; then
+                    library_reference="export LD_LIBRARY_PATH=${device_dir} &&"
+                fi
+                adb -s ${device} shell "${library_reference} taskset ${cpu_mask} ${exe_device_path} ${params} || echo '[FAILURE]'" &> status.txt
+
+                cat status.txt || exit 1
+                if [ `grep -c "\[FAILURE\]" status.txt` -ne '0' ] ; then
+                    exit 1
+                fi
+                rm status.txt
+            else
+                if [[ ${use_static_library} != true ]] ; then
+                    export LD_LIBRARY_PATH=${exe_host_path}/../lib:${LD_LIBRARY_PATH}
+                fi
+                taskset ${cpu_mask} ${exe_host_path} ${params} || exit 1
+            fi
+        fi
+    fi
+}
+
+if [[ ${exe_on_device} == true ]] ; then
+    adb -s ${device} shell "rm -rf ${exe_device_path}"
+fi
diff --git a/CI_SCRIPTS/operator_test.sh b/CI_SCRIPTS/operator_test.sh
new file mode 100644
index 00000000..685303ac
--- /dev/null
+++ b/CI_SCRIPTS/operator_test.sh
@@ -0,0 +1,160 @@
+#!/bin/bash
+
+script_name=$0
+script_abs=$(readlink -f "$0")
+script_dir=$(dirname $script_abs)
+driver_script_path="${script_dir}/operator_driver.sh"
+
+host_bin_dir=""
+use_static_library=true
+host_lib_dir=""
+exe_on_device=false
+device=""
+cpu_mask="2"
+device_dir=""
+
+print_help() {
+    cat <<EOF
+Usage: ${script_name} [OPTIONS]
+  -b, --bin <path>        run specified program in <path>.
+  -l, --lib <path>        use specified library in <path>.
+  -d, --device <device>   run test on device.
+  -c, --cpu_mask <mask>   taskset cpu mask (default: 2).
+  -p, --path <path>       run test on device in specified PATH.
+EOF
+    exit 1;
+}
+
+
+TEMP=`getopt -o b:c:hl:d:p: --long bin:,cpu_mask:,help,lib:,device:,path: \
+     -n ${script_name} -- "$@"`
+if [ $? != 0 ] ; then echo "[ERROR] terminating..." >&2 ; exit 1 ; fi
+eval set -- "$TEMP"
+while true ; do
+    case "$1" in
+        -b|--bin)
+            host_bin_dir=$2
+            echo "[INFO] run test in '${host_bin_dir}'" ;
+            shift 2 ;;
+        -c|--cpu_mask)
+            cpu_mask=$2
+            echo "[INFO] CPU mask '${cpu_mask}'" ;
+            shift 2 ;;
+        -l|--lib)
+            use_static_library=false;
+            host_lib_dir=$2
+            echo "[INFO] use library in ${host_lib_dir}" ;
+            shift 2 ;;
+        -d|--device)
+            device=$2
+            exe_on_device=true
+            echo "[INFO] test on device '${device}'" ;
+            shift 2 ;;
+        -p|--path)
+            device_dir=$2
+            echo "[INFO] test on device directory '${device_dir}'" ;
+            shift 2 ;;
+        -h|--help)
+            print_help ;
+            shift ;;
+        --) shift ;
+            break ;;
+        *) echo "[ERROR] $1" ; exit 1 ;;
+    esac
+done
+
+run_command() {
+    params=" -c ${cpu_mask} -e $1 -i $2"
+    if [[ ${exe_on_device} == true ]] ; then
+        params="${params} -p ${device_dir} -d ${device}"
+    fi
+    if [[ ${use_static_library} == true ]] ; then
+        params="${params} -s ${use_static_library}"
+    fi
+    ${driver_script_path} ${params} || exit 1
+}
+
+if [[ ${exe_on_device} == true ]] ; then
+    adb -s ${device} shell "mkdir ${device_dir}"
+    if [[ ${use_static_library} != true ]] ; then
+        adb -s ${device} push ${host_lib_dir}/libuni.so ${device_dir} > /dev/null || exit 1
+        adb -s ${device} push ${host_lib_dir}/libblas_enhance.so ${device_dir} > /dev/null || exit 1
+        adb -s ${device} push ${host_lib_dir}/libtensor.so ${device_dir} > /dev/null || exit 1
+        if [[ -f ${host_lib_dir}/libgcl.so ]] ; then
+            adb -s ${device} push ${host_lib_dir}/libgcl.so ${device_dir} > /dev/null || exit 1
+        fi
+        if [[ -f ${host_lib_dir}/libkernelsource.so ]] ; then
+            adb -s ${device} push ${host_lib_dir}/libkernelsource.so ${device_dir} > /dev/null || exit 1
+        fi
+    fi
+fi
+
+
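+# example invocation (paths and serial are hypothetical):
+#   bash operator_test.sh -b install_arm_llvm/tests -l install_arm_llvm/lib \
+#       -d <adb-serial> -p /data/local/tmp/CI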
+# FP32 & FP16 operator test
+# blas_enhance
+run_command ${host_bin_dir}/test_mmm ${script_dir}/params/mmm.csv
+run_command ${host_bin_dir}/test_mvm ${script_dir}/params/mvm.csv
+
+# tensor_computing
+run_command ${host_bin_dir}/test_activation ${script_dir}/params/activation.csv
+run_command ${host_bin_dir}/test_argmax ${script_dir}/params/argmax.csv
+run_command ${host_bin_dir}/test_attention ${script_dir}/params/attention.csv
+run_command ${host_bin_dir}/test_check ${script_dir}/params/check.csv
+run_command ${host_bin_dir}/test_clip ${script_dir}/params/clip.csv
+run_command ${host_bin_dir}/test_concat ${script_dir}/params/concat.csv
+run_command ${host_bin_dir}/test_convolution ${script_dir}/params/convolution.csv
+run_command ${host_bin_dir}/test_convolution ${script_dir}/params/alexnet_convolution.csv
+run_command ${host_bin_dir}/test_convolution ${script_dir}/params/googlenet_convolution.csv
+run_command ${host_bin_dir}/test_convolution ${script_dir}/params/resnet50_convolution.csv
+run_command ${host_bin_dir}/test_deconvolution ${script_dir}/params/deconvolution.csv
+run_command ${host_bin_dir}/test_depthwise_convolution ${script_dir}/params/mobilenetv1_depthwise_convolution.csv
+run_command ${host_bin_dir}/test_depthwise_convolution ${script_dir}/params/mobilenetv2_depthwise_convolution.csv
+run_command ${host_bin_dir}/test_depthwise_convolution ${script_dir}/params/mobilenetv3_depthwise_convolution.csv
+run_command ${host_bin_dir}/test_dilated_convolution ${script_dir}/params/dilated_convolution.csv
+run_command ${host_bin_dir}/test_detectionoutput ${script_dir}/params/detectionoutput.csv
+run_command ${host_bin_dir}/test_eltwise ${script_dir}/params/eltwise.csv
+run_command ${host_bin_dir}/test_fully_connected ${script_dir}/params/lenet_fully_connected.csv
+run_command ${host_bin_dir}/test_l2normalization ${script_dir}/params/l2normalization.csv
+run_command ${host_bin_dir}/test_non_max_suppression ${script_dir}/params/non_max_suppression.csv
+run_command ${host_bin_dir}/test_padding ${script_dir}/params/padding.csv
+run_command ${host_bin_dir}/test_prelu ${script_dir}/params/prelu.csv
+run_command ${host_bin_dir}/test_power ${script_dir}/params/power.csv
+run_command ${host_bin_dir}/test_pooling ${script_dir}/params/pooling.csv
+run_command ${host_bin_dir}/test_pooling_bp ${script_dir}/params/pooling_bp.csv
+run_command ${host_bin_dir}/test_priorbox ${script_dir}/params/priorbox.csv
+run_command ${host_bin_dir}/test_reshape ${script_dir}/params/reshape.csv
+run_command ${host_bin_dir}/test_reduction ${script_dir}/params/reduction.csv
+run_command ${host_bin_dir}/test_roialign ${script_dir}/params/roialign.csv
+run_command ${host_bin_dir}/test_rnn ${script_dir}/params/rnn.csv
+run_command ${host_bin_dir}/test_scale ${script_dir}/params/scale.csv
+run_command ${host_bin_dir}/test_slice ${script_dir}/params/slice.csv
+run_command ${host_bin_dir}/test_split ${script_dir}/params/split.csv
+run_command ${host_bin_dir}/test_softmax ${script_dir}/params/softmax.csv
+run_command ${host_bin_dir}/test_transpose ${script_dir}/params/transpose.csv
+run_command ${host_bin_dir}/test_tile ${script_dir}/params/tile.csv
+
+# INT8 operator test
+# blas_enhance
+run_command ${host_bin_dir}/test_mmm_int8 ${script_dir}/params/mmm.csv
+run_command ${host_bin_dir}/test_mvm_int8 ${script_dir}/params/mvm.csv
+
+# tensor_computing
+run_command ${host_bin_dir}/test_concat_int8 ${script_dir}/params/concat.csv
+run_command ${host_bin_dir}/test_convolution_int8 ${script_dir}/params/alexnet_convolution.csv
+run_command ${host_bin_dir}/test_convolution_int8 ${script_dir}/params/googlenet_convolution.csv
+run_command ${host_bin_dir}/test_convolution_int8 ${script_dir}/params/resnet50_convolution.csv
+run_command ${host_bin_dir}/test_depthwise_convolution_int8 ${script_dir}/params/mobilenetv1_depthwise_convolution.csv
+run_command ${host_bin_dir}/test_depthwise_convolution_int8 ${script_dir}/params/mobilenetv2_depthwise_convolution.csv
+run_command ${host_bin_dir}/test_depthwise_convolution_int8 ${script_dir}/params/mobilenetv3_depthwise_convolution.csv
+run_command ${host_bin_dir}/test_pooling_int8 ${script_dir}/params/pooling.csv
+
+# BNN operator test
+run_command ${host_bin_dir}/test_convolution_bnn ${script_dir}/params/bnn_convolution.csv
+
+if [[ ${exe_on_device} == true ]] ; then
+    adb -s ${device} shell "rm -rf ${device_dir}"
+fi
diff --git a/scripts/params/activation.csv b/CI_SCRIPTS/params/activation.csv
similarity index 100%
rename from scripts/params/activation.csv
rename to CI_SCRIPTS/params/activation.csv
diff --git a/CI_SCRIPTS/params/alexnet_convolution.csv b/CI_SCRIPTS/params/alexnet_convolution.csv
new file mode 100644
index 00000000..c20a9132
--- /dev/null
+++ b/CI_SCRIPTS/params/alexnet_convolution.csv
@@ -0,0 +1,6 @@
+#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w
+1 3 227 227 96 3 11 11 1 4 0 1 96 55 55
+1 96 27 27 256 96 5 5 1 2 0 1 256 13 13
+1 256 13 13 384 256 3 3 1 1 1 1 384 13 13
+1 384 13 13 384 384 3 3 1 1 1 1 384 13 13
+1 384 13 13 256 384 3 3 1 1 1 1 256 13 13
diff --git a/CI_SCRIPTS/params/argmax.csv b/CI_SCRIPTS/params/argmax.csv
new file mode 100644
index 00000000..9b68c143
--- /dev/null
+++ b/CI_SCRIPTS/params/argmax.csv
@@ -0,0 +1,5 @@
+#in ic ih iw axis
+1 64 24 24 -1
+1 8 100 100 1
+1 8 100 100 2
+1 8 100 100 3
diff --git a/scripts/params/attention.csv
b/CI_SCRIPTS/params/attention.csv similarity index 100% rename from scripts/params/attention.csv rename to CI_SCRIPTS/params/attention.csv diff --git a/CI_SCRIPTS/params/bnn_convolution.csv b/CI_SCRIPTS/params/bnn_convolution.csv new file mode 100644 index 00000000..fb8445f6 --- /dev/null +++ b/CI_SCRIPTS/params/bnn_convolution.csv @@ -0,0 +1,53 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 64 56 56 256 64 1 1 1 1 0 1 256 56 56 +1 64 56 56 64 64 1 1 1 1 0 1 64 56 56 +1 64 56 56 64 64 3 3 1 1 1 1 64 56 56 +1 64 56 56 256 64 1 1 1 1 0 1 256 56 56 +1 256 56 56 64 256 1 1 1 1 0 1 64 56 56 +1 64 56 56 64 64 3 3 1 1 1 1 64 56 56 +1 64 56 56 256 64 1 1 1 1 0 1 256 56 56 +1 256 56 56 64 256 1 1 1 1 0 1 64 56 56 +1 64 56 56 64 64 3 3 1 1 1 1 64 56 56 +1 64 56 56 256 64 1 1 1 1 0 1 256 56 56 +1 256 56 56 512 256 1 1 1 2 0 1 512 28 28 +1 256 56 56 128 256 1 1 1 2 0 1 128 28 28 +1 128 28 28 128 128 3 3 1 1 1 1 128 28 28 +1 128 28 28 512 128 1 1 1 1 0 1 512 28 28 +1 512 28 28 128 512 1 1 1 1 0 1 128 28 28 +1 128 28 28 128 128 3 3 1 1 1 1 128 28 28 +1 128 28 28 512 128 1 1 1 1 0 1 512 28 28 +1 512 28 28 128 512 1 1 1 1 0 1 128 28 28 +1 128 28 28 128 128 3 3 1 1 1 1 128 28 28 +1 128 28 28 512 128 1 1 1 1 0 1 512 28 28 +1 512 28 28 128 512 1 1 1 1 0 1 128 28 28 +1 128 28 28 128 128 3 3 1 1 1 1 128 28 28 +1 128 28 28 512 128 1 1 1 1 0 1 512 28 28 +1 512 28 28 1024 512 1 1 1 2 0 1 1024 14 14 +1 512 28 28 256 512 1 1 1 2 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 2048 1024 1 1 1 2 0 1 2048 7 7 +1 1024 14 14 512 1024 1 1 1 2 0 1 512 7 7 +1 512 7 7 512 512 3 3 1 1 1 1 512 7 7 +1 512 7 7 2048 512 1 1 1 1 0 1 2048 7 7 +1 2048 7 7 512 2048 1 1 1 1 0 1 512 7 7 +1 512 7 7 512 512 3 3 1 1 1 1 512 7 7 +1 512 7 7 2048 512 1 1 1 1 0 1 2048 7 7 +1 2048 7 7 512 2048 1 1 1 1 0 1 512 7 7 +1 512 7 7 512 512 3 3 1 1 1 1 512 7 7 +1 512 7 7 2048 512 1 1 1 1 0 1 2048 7 7 diff --git a/CI_SCRIPTS/params/check.csv b/CI_SCRIPTS/params/check.csv new file mode 100644 index 00000000..b0c30bf5 --- /dev/null +++ b/CI_SCRIPTS/params/check.csv @@ -0,0 +1,3 @@ +#in ic ih iw +1 64 24 24 +1 8 100 100 diff --git a/scripts/params/clip.csv b/CI_SCRIPTS/params/clip.csv similarity index 100% rename from scripts/params/clip.csv rename to CI_SCRIPTS/params/clip.csv diff --git a/CI_SCRIPTS/params/concat.csv b/CI_SCRIPTS/params/concat.csv new file mode 100644 index 00000000..b6260c91 --- /dev/null +++ b/CI_SCRIPTS/params/concat.csv @@ -0,0 +1,3 @@ +#num axis [in ic ih iw]* on oc oh ow +2 1 1 8 16 16 1 16 16 16 1 24 16 16 +2 1 1 16 7 7 1 16 7 7 1 32 7 7 diff --git a/CI_SCRIPTS/params/convolution.csv b/CI_SCRIPTS/params/convolution.csv new file mode 100644 index 00000000..37b60801 --- /dev/null +++ b/CI_SCRIPTS/params/convolution.csv @@ -0,0 +1,17 @@ +#in_n in_c in_h in_w f_n f_c f_h 
f_w group stride padding out_n out_c out_h out_w +1 1 227 227 96 1 11 11 1 4 0 1 96 55 55 +1 2 227 227 96 2 11 11 1 4 0 1 96 55 55 +1 3 227 227 96 3 11 11 1 4 0 1 96 55 55 +1 4 227 227 96 4 11 11 1 4 0 1 96 55 55 +1 5 227 227 96 5 11 11 1 4 0 1 96 55 55 +1 6 227 227 96 6 11 11 1 4 0 1 96 55 55 +1 7 227 227 96 7 11 11 1 4 0 1 96 55 55 +1 8 227 227 96 8 11 11 1 4 0 1 96 55 55 +1 8 11 11 96 4 11 11 2 1 0 1 96 1 1 +1 8 227 227 96 4 11 11 2 4 0 1 96 55 55 +1 9 227 227 96 3 11 11 3 4 0 1 96 55 55 +1 16 227 227 96 8 11 11 2 4 0 1 96 55 55 +1 4 227 227 96 2 3 3 2 1 1 1 96 227 227 +1 8 227 227 96 4 3 3 2 1 1 1 96 227 227 +1 16 227 227 96 8 3 3 2 1 1 1 96 227 227 +1 32 227 227 96 16 3 3 2 1 1 1 96 227 227 diff --git a/CI_SCRIPTS/params/deconvolution.csv b/CI_SCRIPTS/params/deconvolution.csv new file mode 100644 index 00000000..f380ab68 --- /dev/null +++ b/CI_SCRIPTS/params/deconvolution.csv @@ -0,0 +1,12 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 8 132 132 8 8 16 16 1 8 4 1 8 1056 1056 +1 8 4 4 8 8 2 2 1 2 0 1 8 8 8 +1 8 4 4 8 8 4 4 1 2 1 1 8 8 8 +1 8 8 8 8 8 16 16 1 8 4 1 8 64 64 +1 8 32 32 8 8 16 16 1 8 4 1 8 256 256 +1 8 4 4 8 8 2 2 1 2 0 1 8 8 8 +1 128 32 32 128 128 2 2 1 2 0 1 128 64 64 +1 128 3 3 128 128 3 3 1 2 1 1 128 5 5 +1 128 6 6 128 128 3 3 1 3 0 1 128 18 18 +1 64 8 8 1 64 8 8 64 4 2 1 64 32 32 +1 64 16 16 1 64 4 4 64 2 1 1 64 32 32 diff --git a/CI_SCRIPTS/params/detectionoutput.csv b/CI_SCRIPTS/params/detectionoutput.csv new file mode 100644 index 00000000..bba7e1b2 --- /dev/null +++ b/CI_SCRIPTS/params/detectionoutput.csv @@ -0,0 +1,2 @@ +#ih0 iw0 ih1 iw1 in2 ic2 ilens2 oh ow num_class +1 144 1 756 1 2 144 201 6 2 diff --git a/CI_SCRIPTS/params/dilated_convolution.csv b/CI_SCRIPTS/params/dilated_convolution.csv new file mode 100644 index 00000000..b7efe509 --- /dev/null +++ b/CI_SCRIPTS/params/dilated_convolution.csv @@ -0,0 +1,5 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding rate out_n out_c out_h out_w +1 96 27 27 256 96 5 5 1 2 0 2 1 256 10 10 +1 256 13 13 384 256 3 3 1 1 1 2 1 384 11 11 +1 384 13 13 384 384 3 3 1 1 1 3 1 384 9 9 +1 384 13 13 256 384 3 3 1 1 1 4 1 256 7 7 diff --git a/scripts/params/eltwise.csv b/CI_SCRIPTS/params/eltwise.csv similarity index 100% rename from scripts/params/eltwise.csv rename to CI_SCRIPTS/params/eltwise.csv diff --git a/CI_SCRIPTS/params/googlenet_convolution.csv b/CI_SCRIPTS/params/googlenet_convolution.csv new file mode 100644 index 00000000..d3e0a317 --- /dev/null +++ b/CI_SCRIPTS/params/googlenet_convolution.csv @@ -0,0 +1,58 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 3 224 224 64 3 7 7 1 2 3 1 64 112 112 +1 64 56 56 64 64 1 1 1 1 0 1 64 56 56 +1 64 56 56 192 64 3 3 1 1 1 1 192 56 56 +1 192 28 28 64 192 1 1 1 1 0 1 64 28 28 +1 192 28 28 96 192 1 1 1 1 0 1 96 28 28 +1 96 28 28 128 96 3 3 1 1 1 1 128 28 28 +1 192 28 28 16 192 1 1 1 1 0 1 16 28 28 +1 16 28 28 32 16 5 5 1 1 2 1 32 28 28 +1 192 28 28 32 192 1 1 1 1 0 1 32 28 28 +1 256 28 28 128 256 1 1 1 1 0 1 128 28 28 +1 256 28 28 128 256 1 1 1 1 0 1 128 28 28 +1 128 28 28 192 128 3 3 1 1 1 1 192 28 28 +1 256 28 28 32 256 1 1 1 1 0 1 32 28 28 +1 32 28 28 96 32 5 5 1 1 2 1 96 28 28 +1 256 28 28 64 256 1 1 1 1 0 1 64 28 28 +1 480 14 14 192 480 1 1 1 1 0 1 192 14 14 +1 480 14 14 96 480 1 1 1 1 0 1 96 14 14 +1 96 14 14 208 96 3 3 1 1 1 1 208 14 14 +1 480 14 14 16 480 1 1 1 1 0 1 16 14 14 +1 16 14 14 48 16 5 5 1 1 2 1 48 14 14 +1 480 14 14 64 480 1 1 1 1 0 1 64 14 14 +1 512 14 14 160 512 1 1 1 1 0 
1 160 14 14 +1 512 14 14 112 512 1 1 1 1 0 1 112 14 14 +1 112 14 14 224 112 3 3 1 1 1 1 224 14 14 +1 512 14 14 24 512 1 1 1 1 0 1 24 14 14 +1 24 14 14 64 24 5 5 1 1 2 1 64 14 14 +1 512 14 14 64 512 1 1 1 1 0 1 64 14 14 +1 512 14 14 128 512 1 1 1 1 0 1 128 14 14 +1 512 14 14 128 512 1 1 1 1 0 1 128 14 14 +1 128 14 14 256 128 3 3 1 1 1 1 256 14 14 +1 512 14 14 24 512 1 1 1 1 0 1 24 14 14 +1 24 14 14 64 24 5 5 1 1 2 1 64 14 14 +1 512 14 14 64 512 1 1 1 1 0 1 64 14 14 +1 512 14 14 112 512 1 1 1 1 0 1 112 14 14 +1 512 14 14 144 512 1 1 1 1 0 1 144 14 14 +1 144 14 14 288 144 3 3 1 1 1 1 288 14 14 +1 512 14 14 32 512 1 1 1 1 0 1 32 14 14 +1 32 14 14 64 32 5 5 1 1 2 1 64 14 14 +1 512 14 14 64 512 1 1 1 1 0 1 64 14 14 +1 528 14 14 256 528 1 1 1 1 0 1 256 14 14 +1 528 14 14 160 528 1 1 1 1 0 1 160 14 14 +1 160 14 14 320 160 3 3 1 1 1 1 320 14 14 +1 528 14 14 32 528 1 1 1 1 0 1 32 14 14 +1 32 14 14 128 32 5 5 1 1 2 1 128 14 14 +1 528 14 14 128 528 1 1 1 1 0 1 128 14 14 +1 832 7 7 256 832 1 1 1 1 0 1 256 7 7 +1 832 7 7 160 832 1 1 1 1 0 1 160 7 7 +1 160 7 7 320 160 3 3 1 1 1 1 320 7 7 +1 832 7 7 32 832 1 1 1 1 0 1 32 7 7 +1 32 7 7 128 32 5 5 1 1 2 1 128 7 7 +1 832 7 7 128 832 1 1 1 1 0 1 128 7 7 +1 832 7 7 384 832 1 1 1 1 0 1 384 7 7 +1 832 7 7 192 832 1 1 1 1 0 1 192 7 7 +1 192 7 7 384 192 3 3 1 1 1 1 384 7 7 +1 832 7 7 48 832 1 1 1 1 0 1 48 7 7 +1 48 7 7 128 48 5 5 1 1 2 1 128 7 7 +1 832 7 7 128 832 1 1 1 1 0 1 128 7 7 diff --git a/CI_SCRIPTS/params/l2normalization.csv b/CI_SCRIPTS/params/l2normalization.csv new file mode 100644 index 00000000..7e6d2e58 --- /dev/null +++ b/CI_SCRIPTS/params/l2normalization.csv @@ -0,0 +1,2 @@ +#ic ih iw +1 1 128 diff --git a/CI_SCRIPTS/params/lenet_convolution.csv b/CI_SCRIPTS/params/lenet_convolution.csv new file mode 100644 index 00000000..3559a7d1 --- /dev/null +++ b/CI_SCRIPTS/params/lenet_convolution.csv @@ -0,0 +1,3 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 1 32 32 6 1 5 5 1 1 0 1 6 28 28 +1 6 14 14 16 6 5 5 1 1 0 1 16 10 10 diff --git a/scripts/params/lenet_fully_connected.csv b/CI_SCRIPTS/params/lenet_fully_connected.csv similarity index 100% rename from scripts/params/lenet_fully_connected.csv rename to CI_SCRIPTS/params/lenet_fully_connected.csv diff --git a/scripts/params/mmm.csv b/CI_SCRIPTS/params/mmm.csv similarity index 100% rename from scripts/params/mmm.csv rename to CI_SCRIPTS/params/mmm.csv diff --git a/CI_SCRIPTS/params/mobilenetv1_depthwise_convolution.csv b/CI_SCRIPTS/params/mobilenetv1_depthwise_convolution.csv new file mode 100644 index 00000000..319d4cd6 --- /dev/null +++ b/CI_SCRIPTS/params/mobilenetv1_depthwise_convolution.csv @@ -0,0 +1,14 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 32 112 112 64 32 3 3 1 1 1 1 64 112 112 +1 64 112 112 128 64 3 3 1 2 1 1 128 56 56 +1 128 56 56 128 128 3 3 1 1 1 1 128 56 56 +1 128 56 56 256 128 3 3 1 2 1 1 256 28 28 +1 256 28 28 256 256 3 3 1 1 1 1 256 28 28 +1 256 28 28 512 256 3 3 1 2 1 1 512 14 14 +1 512 14 14 512 512 3 3 1 1 1 1 512 14 14 +1 512 14 14 512 512 3 3 1 1 1 1 512 14 14 +1 512 14 14 512 512 3 3 1 1 1 1 512 14 14 +1 512 14 14 512 512 3 3 1 1 1 1 512 14 14 +1 512 14 14 512 512 3 3 1 1 1 1 512 14 14 +1 512 14 14 1024 512 3 3 1 2 1 1 1024 7 7 +1 1024 7 7 1024 1024 3 3 1 1 1 1 1024 7 7 diff --git a/CI_SCRIPTS/params/mobilenetv2_depthwise_convolution.csv b/CI_SCRIPTS/params/mobilenetv2_depthwise_convolution.csv new file mode 100644 index 00000000..0879ed77 --- /dev/null +++ 
b/CI_SCRIPTS/params/mobilenetv2_depthwise_convolution.csv @@ -0,0 +1,18 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 32 112 112 32 32 3 3 1 1 1 1 16 112 112 +1 96 112 112 24 96 3 3 1 2 1 1 24 56 56 +1 144 56 56 24 144 3 3 1 1 1 1 24 56 56 +1 144 56 56 32 144 3 3 1 2 1 1 32 28 28 +1 192 28 28 32 192 3 3 1 1 1 1 32 28 28 +1 192 28 28 32 192 3 3 1 1 1 1 32 28 28 +1 192 28 28 64 192 3 3 1 1 1 1 64 28 28 +1 384 28 28 64 384 3 3 1 1 1 1 64 28 28 +1 384 28 28 64 384 3 3 1 1 1 1 64 28 28 +1 384 28 28 64 384 3 3 1 1 1 1 64 28 28 +1 384 28 28 96 384 3 3 1 2 1 1 96 14 14 +1 576 14 14 96 576 3 3 1 1 1 1 96 14 14 +1 576 14 14 96 576 3 3 1 1 1 1 96 14 14 +1 576 14 14 160 576 3 3 1 2 1 1 160 7 7 +1 960 7 7 160 960 3 3 1 1 1 1 160 7 7 +1 960 7 7 160 960 3 3 1 1 1 1 160 7 7 +1 960 7 7 320 960 3 3 1 1 1 1 320 7 7 diff --git a/CI_SCRIPTS/params/mobilenetv3_convolution.csv b/CI_SCRIPTS/params/mobilenetv3_convolution.csv new file mode 100644 index 00000000..7de16faa --- /dev/null +++ b/CI_SCRIPTS/params/mobilenetv3_convolution.csv @@ -0,0 +1,33 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 3 224 224 16 3 3 3 1 2 0 1 16 112 112 +1 16 112 112 16 16 1 1 1 1 0 1 16 112 112 +1 16 112 112 16 16 1 1 1 1 0 1 16 112 112 +1 16 112 112 64 16 1 1 1 1 0 1 64 112 112 +1 64 56 56 24 64 1 1 1 1 0 1 24 56 56 +1 24 56 56 72 24 1 1 1 1 0 1 72 56 56 +1 72 56 56 24 72 1 1 1 1 0 1 24 56 56 +1 24 56 56 72 24 1 1 1 1 0 1 72 56 56 +1 72 28 28 40 72 1 1 1 1 0 1 40 28 28 +1 40 28 28 120 40 1 1 1 1 0 1 120 28 28 +1 120 28 28 40 120 1 1 1 1 0 1 40 28 28 +1 40 28 28 120 40 1 1 1 1 0 1 120 28 28 +1 120 28 28 40 120 1 1 1 1 0 1 40 28 28 +1 40 28 28 240 40 1 1 1 1 0 1 240 28 28 +1 240 14 14 80 240 1 1 1 1 0 1 80 14 14 +1 80 14 14 200 80 1 1 1 1 0 1 200 14 14 +1 200 14 14 80 200 1 1 1 1 0 1 80 14 14 +1 80 14 14 184 80 1 1 1 1 0 1 184 14 14 +1 184 14 14 80 184 1 1 1 1 0 1 80 14 14 +1 80 14 14 184 80 1 1 1 1 0 1 184 14 14 +1 184 14 14 80 184 1 1 1 1 0 1 80 14 14 +1 80 14 14 480 80 1 1 1 1 0 1 480 14 14 +1 480 14 14 112 480 1 1 1 1 0 1 112 14 14 +1 112 14 14 672 112 1 1 1 1 0 1 672 14 14 +1 672 14 14 112 672 1 1 1 1 0 1 112 14 14 +1 112 14 14 672 112 1 1 1 1 0 1 672 14 14 +1 672 14 14 112 672 1 1 1 1 0 1 112 14 14 +1 112 14 14 672 112 1 1 1 1 0 1 672 14 14 +1 672 7 7 160 672 1 1 1 1 0 1 160 7 7 +1 160 7 7 960 160 1 1 1 1 0 1 960 7 7 +1 960 7 7 160 960 1 1 1 1 0 1 160 7 7 +1 160 7 7 960 160 1 1 1 1 0 1 960 7 7 diff --git a/CI_SCRIPTS/params/mobilenetv3_depthwise_convolution.csv b/CI_SCRIPTS/params/mobilenetv3_depthwise_convolution.csv new file mode 100644 index 00000000..3aa3ca67 --- /dev/null +++ b/CI_SCRIPTS/params/mobilenetv3_depthwise_convolution.csv @@ -0,0 +1,16 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 16 112 112 16 16 3 3 1 1 1 1 16 112 112 +1 64 112 112 24 64 3 3 1 2 0 1 24 56 56 +1 72 56 56 24 72 3 3 1 1 1 1 24 56 56 +1 72 56 56 40 72 5 5 1 2 1 1 40 28 28 +1 120 28 28 40 120 5 5 1 1 2 1 40 28 28 +1 120 28 28 40 120 5 5 1 1 2 1 40 28 28 +1 240 28 28 80 240 3 3 1 2 0 1 80 14 14 +1 200 14 14 80 200 3 3 1 1 1 1 80 14 14 +1 184 14 14 80 184 3 3 1 1 1 1 80 14 14 +1 184 14 14 112 184 3 3 1 1 1 1 80 14 14 +1 480 14 14 112 480 3 3 1 1 1 1 112 14 14 +1 672 14 14 160 672 3 3 1 1 1 1 112 14 14 +1 672 14 14 160 672 5 5 1 1 2 1 160 14 14 +1 672 14 14 160 672 5 5 1 2 1 1 160 7 7 +1 960 7 7 160 960 5 5 1 1 2 1 160 7 7 diff --git a/scripts/params/mvm.csv b/CI_SCRIPTS/params/mvm.csv similarity index 100% rename from 
scripts/params/mvm.csv rename to CI_SCRIPTS/params/mvm.csv diff --git a/CI_SCRIPTS/params/non_max_suppression.csv b/CI_SCRIPTS/params/non_max_suppression.csv new file mode 100644 index 00000000..278a944d --- /dev/null +++ b/CI_SCRIPTS/params/non_max_suppression.csv @@ -0,0 +1,2 @@ +#in0 ic0 ilens0 in1 ic1 ilens1 oh ow max_output_boxes_per_class iou_threshold score_threshold +1 6 4 1 2 6 7 3 3 0.5 0 diff --git a/CI_SCRIPTS/params/normalization.csv b/CI_SCRIPTS/params/normalization.csv new file mode 100644 index 00000000..4ac9d079 --- /dev/null +++ b/CI_SCRIPTS/params/normalization.csv @@ -0,0 +1,3 @@ +#alpha beta in ic ih iw +2 2 1 64 24 24 +4 4 1 8 100 100 diff --git a/CI_SCRIPTS/params/padding.csv b/CI_SCRIPTS/params/padding.csv new file mode 100644 index 00000000..7c8b6240 --- /dev/null +++ b/CI_SCRIPTS/params/padding.csv @@ -0,0 +1,5 @@ +#in ic ih iw bn bc bh bw an ac ah aw mode +1 4 32 32 0 0 1 1 0 0 1 1 0 +1 4 32 32 0 0 2 2 0 0 2 2 0 +1 4 32 32 0 0 3 3 0 0 3 3 0 +1 4 32 32 0 0 4 4 0 0 4 4 0 diff --git a/scripts/params/pipeline.csv b/CI_SCRIPTS/params/pipeline.csv similarity index 100% rename from scripts/params/pipeline.csv rename to CI_SCRIPTS/params/pipeline.csv diff --git a/scripts/params/pooling.csv b/CI_SCRIPTS/params/pooling.csv similarity index 100% rename from scripts/params/pooling.csv rename to CI_SCRIPTS/params/pooling.csv diff --git a/CI_SCRIPTS/params/pooling_bp.csv b/CI_SCRIPTS/params/pooling_bp.csv new file mode 100644 index 00000000..6388af49 --- /dev/null +++ b/CI_SCRIPTS/params/pooling_bp.csv @@ -0,0 +1,2 @@ +#in ic ih iw fn fc fh fw stride padding on oc oh ow +1 16 3 3 1 1 2 2 2 1 1 16 4 4 diff --git a/CI_SCRIPTS/params/power.csv b/CI_SCRIPTS/params/power.csv new file mode 100644 index 00000000..f8ba774c --- /dev/null +++ b/CI_SCRIPTS/params/power.csv @@ -0,0 +1,3 @@ +#len alpha beta power +1000 1.1 0.2 1 +999 -0.2 -0.1 1 diff --git a/CI_SCRIPTS/params/prelu.csv b/CI_SCRIPTS/params/prelu.csv new file mode 100644 index 00000000..7886e1d9 --- /dev/null +++ b/CI_SCRIPTS/params/prelu.csv @@ -0,0 +1,2 @@ +#in ic ih iw +1 16 8 8 diff --git a/CI_SCRIPTS/params/priorbox.csv b/CI_SCRIPTS/params/priorbox.csv new file mode 100644 index 00000000..811ae33d --- /dev/null +++ b/CI_SCRIPTS/params/priorbox.csv @@ -0,0 +1,5 @@ +#in0 ic0 ih0 iw0 in1 ic1 ih1 iw1 min_size max_size flip clip step on oc olens ar1 [ar2] min_size1 max_size1 +1 1 38 38 1 1 300 300 64.0 90.0 1 0 8.0 1 2 23104 2.0 +1 1 38 38 1 1 300 300 64.0 90.0 1 0 8.0 1 2 34656 2.0 3.0 +1 1 38 38 1 1 300 300 64.0 90.0 1 0 8.0 1 2 46208 2.0 32.0 58.0 +1 1 38 38 1 1 300 300 64.0 90.0 1 0 8.0 1 2 69312 2.0 3.0 32.0 58.0 diff --git a/CI_SCRIPTS/params/reduction.csv b/CI_SCRIPTS/params/reduction.csv new file mode 100644 index 00000000..391ab931 --- /dev/null +++ b/CI_SCRIPTS/params/reduction.csv @@ -0,0 +1,6 @@ +#in ic ih iw axesNum axeses +1 64 24 24 1 -1 +1 8 100 100 1 1 +1 8 100 100 1 2 +1 8 100 100 1 3 +1 8 100 100 2 2 3 diff --git a/scripts/params/reshape.csv b/CI_SCRIPTS/params/reshape.csv similarity index 100% rename from scripts/params/reshape.csv rename to CI_SCRIPTS/params/reshape.csv diff --git a/CI_SCRIPTS/params/resnet50_convolution.csv b/CI_SCRIPTS/params/resnet50_convolution.csv new file mode 100644 index 00000000..710ce719 --- /dev/null +++ b/CI_SCRIPTS/params/resnet50_convolution.csv @@ -0,0 +1,54 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 3 224 224 64 3 7 7 1 2 3 1 64 112 112 +1 64 56 56 256 64 1 1 1 1 0 1 256 56 56 +1 64 56 56 64 64 1 1 1 1 0 1 64 56 56 +1 64 56 56 
64 64 3 3 1 1 1 1 64 56 56 +1 64 56 56 256 64 1 1 1 1 0 1 256 56 56 +1 256 56 56 64 256 1 1 1 1 0 1 64 56 56 +1 64 56 56 64 64 3 3 1 1 1 1 64 56 56 +1 64 56 56 256 64 1 1 1 1 0 1 256 56 56 +1 256 56 56 64 256 1 1 1 1 0 1 64 56 56 +1 64 56 56 64 64 3 3 1 1 1 1 64 56 56 +1 64 56 56 256 64 1 1 1 1 0 1 256 56 56 +1 256 56 56 512 256 1 1 1 2 0 1 512 28 28 +1 256 56 56 128 256 1 1 1 2 0 1 128 28 28 +1 128 28 28 128 128 3 3 1 1 1 1 128 28 28 +1 128 28 28 512 128 1 1 1 1 0 1 512 28 28 +1 512 28 28 128 512 1 1 1 1 0 1 128 28 28 +1 128 28 28 128 128 3 3 1 1 1 1 128 28 28 +1 128 28 28 512 128 1 1 1 1 0 1 512 28 28 +1 512 28 28 128 512 1 1 1 1 0 1 128 28 28 +1 128 28 28 128 128 3 3 1 1 1 1 128 28 28 +1 128 28 28 512 128 1 1 1 1 0 1 512 28 28 +1 512 28 28 128 512 1 1 1 1 0 1 128 28 28 +1 128 28 28 128 128 3 3 1 1 1 1 128 28 28 +1 128 28 28 512 128 1 1 1 1 0 1 512 28 28 +1 512 28 28 1024 512 1 1 1 2 0 1 1024 14 14 +1 512 28 28 256 512 1 1 1 2 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 2048 1024 1 1 1 2 0 1 2048 7 7 +1 1024 14 14 512 1024 1 1 1 2 0 1 512 7 7 +1 512 7 7 512 512 3 3 1 1 1 1 512 7 7 +1 512 7 7 2048 512 1 1 1 1 0 1 2048 7 7 +1 2048 7 7 512 2048 1 1 1 1 0 1 512 7 7 +1 512 7 7 512 512 3 3 1 1 1 1 512 7 7 +1 512 7 7 2048 512 1 1 1 1 0 1 2048 7 7 +1 2048 7 7 512 2048 1 1 1 1 0 1 512 7 7 +1 512 7 7 512 512 3 3 1 1 1 1 512 7 7 +1 512 7 7 2048 512 1 1 1 1 0 1 2048 7 7 diff --git a/scripts/params/lstm.csv b/CI_SCRIPTS/params/rnn.csv similarity index 100% rename from scripts/params/lstm.csv rename to CI_SCRIPTS/params/rnn.csv diff --git a/CI_SCRIPTS/params/roialign.csv b/CI_SCRIPTS/params/roialign.csv new file mode 100644 index 00000000..a22b8664 --- /dev/null +++ b/CI_SCRIPTS/params/roialign.csv @@ -0,0 +1,2 @@ +#in0 ic0 ih0 iw0 ih1 iw1 ilens2 on0 oc0 oh0 ow0 output_h output_w sampling_ratio spatial_scale +1 1 10 10 1 4 1 1 1 5 5 5 5 2 1 diff --git a/scripts/params/scale.csv b/CI_SCRIPTS/params/scale.csv similarity index 100% rename from scripts/params/scale.csv rename to CI_SCRIPTS/params/scale.csv diff --git a/scripts/params/slice.csv b/CI_SCRIPTS/params/slice.csv similarity index 100% rename from scripts/params/slice.csv rename to CI_SCRIPTS/params/slice.csv diff --git a/scripts/params/softmax.csv b/CI_SCRIPTS/params/softmax.csv similarity index 100% rename from scripts/params/softmax.csv rename to CI_SCRIPTS/params/softmax.csv diff --git a/scripts/params/split.csv b/CI_SCRIPTS/params/split.csv similarity index 100% rename from scripts/params/split.csv rename to CI_SCRIPTS/params/split.csv diff --git a/CI_SCRIPTS/params/tile.csv b/CI_SCRIPTS/params/tile.csv new file mode 100644 index 00000000..13f2c4e6 --- /dev/null +++ b/CI_SCRIPTS/params/tile.csv @@ -0,0 +1,3 @@ +#in ic ih iw axis tile +1 16 16 16 0 4 +1 64 16 16 3 4 diff --git a/scripts/params/transpose.csv 
b/CI_SCRIPTS/params/transpose.csv similarity index 100% rename from scripts/params/transpose.csv rename to CI_SCRIPTS/params/transpose.csv diff --git a/CI_SCRIPTS/parseAndExeCommands.sh b/CI_SCRIPTS/parseAndExeCommands.sh new file mode 100644 index 00000000..80c2c97e --- /dev/null +++ b/CI_SCRIPTS/parseAndExeCommands.sh @@ -0,0 +1,430 @@ +#!/bin/bash + +declare CONVERTER + +declare BOLT_SUFFIX + +declare TASKSET_STR + +declare EXECUTOR="classification" + +declare CI_PATH="/data/local/tmp/CI" + +declare ARCH="arm" + +declare MODEL_TOOLS_EXE_PATH=${CI_PATH} + +declare ENGINE_EXE_PATH=${CI_PATH} + +declare BOLT_LIB_PATH=${CI_PATH} + +declare CAFFE_MODEL_ZOO_PATH="${CI_PATH}/model_zoo/caffe_models/" + +declare ONNX_MODEL_ZOO_PATH="${CI_PATH}/model_zoo/onnx_models/" + +declare TFLITE_MODEL_ZOO_PATH="${CI_PATH}/model_zoo/tflite_models/" + +declare DYNAMIC_MODEL_PATH_PREFIX + +declare PHONE_SPECIFICATION + +declare TESTING_DATA_PREFIX="${CI_PATH}/testing_data/" + +BOLT_DIR=$(dirname $(readlink -f "$0"))/.. + +function converter_selection() +{ + CONVERTER="X2bolt" + if [ "$1" == "caffe" ] + then + DYNAMIC_MODEL_PATH_PREFIX=$CAFFE_MODEL_ZOO_PATH + return + fi + + if [ "$1" == "onnx" ] + then + DYNAMIC_MODEL_PATH_PREFIX=$ONNX_MODEL_ZOO_PATH + return + fi + + if [ "$1" == "tflite" ] + then + DYNAMIC_MODEL_PATH_PREFIX=$TFLITE_MODEL_ZOO_PATH + return + fi + echo "[ERROR] unsupported model format $1" + exit 1 +} + +function acc_selection() +{ + if [ "$1" == "fp32" ] + then + BOLT_SUFFIX="_f32.bolt" + return + fi + + if [ "$1" == "fp16" ] + then + BOLT_SUFFIX="_f16.bolt" + return + fi + + if [ "$1" == "int8" ] + then + BOLT_SUFFIX="_int8_q.bolt" + return + fi + + echo "[ERROR] unsupported model precision $1" + exit 1 +} + +function core_selection() +{ + if [ "$1" == "A55" ] + then + TASKSET_STR="CPU_AFFINITY_LOW_POWER" + return + fi + + if [ "$1" == "A76" ] + then + TASKSET_STR="CPU_AFFINITY_HIGH_PERFORMANCE" + return + fi + + if [ "$1" == "x86_HOST" ] + then + TASKSET_STR="CPU_AFFINITY_HIGH_PERFORMANCE" + return + fi + + echo "[ERROR] unsupported affinity setting $1" + exit 1 +} + +function arch_selection() +{ + if [ "$1" == "arm" ] + then + return + fi + + if [ "$1" == "x86" ] + then + ARCH="x86" + MODEL_TOOLS_EXE_PATH=${BOLT_DIR} + ENGINE_EXE_PATH=${BOLT_DIR} + CAFFE_MODEL_ZOO_PATH="/data/bolt/model_zoo/caffe_models/" + ONNX_MODEL_ZOO_PATH="/data/bolt/model_zoo/onnx_models/" + TFLITE_MODEL_ZOO_PATH="/data/bolt/model_zoo/tflite_models/" + TESTING_DATA_PREFIX="/data/bolt/testing_data/" + return + fi + + echo "[ERROR] unsupported arch $1" + exit 1 +} + +function device_selection() +{ + if [ "$1" == "cpu" ] + then + return + fi + + if [ "$1" == "gpu" ] + then + TASKSET_STR="GPU" + return + fi + + echo "[ERROR] unsupported device $1" + exit 1 +} + +# device id to phone specification +function deviceId_to_phoneSpecification() +{ + if [ "$1" == "E5B0119506000260" ] + then + PHONE_SPECIFICATION="810" + return + fi + + if [ "$1" == "GCL5T19822000030" ] + then + PHONE_SPECIFICATION="990" + return + fi + + if [ "$1" == "x86_HOST" ] + then + return + fi + + echo "[ERROR] unknown phone device id $1" + exit 1 +} + +combinations=() +commands=() +while read line; do + combinations[${#combinations[*]}]=`echo ${line}` +done < ./final_combinations.txt + +for((k=0;k<${#combinations[@]};k++)){ + line=${combinations[k]} + strs_arr=() + index=0 + for i in $(echo $line| tr "-" "\n") + do + strs_arr[$index]=$i; + let index+=1 + done + + command_line="" + + arch_selection ${strs_arr[1]} + +
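+ # Layout of each "--"-joined combination string (only the fields consumed below):
+ # [0]=model name, [1]=arch (arm/x86), [2]=framework (caffe/onnx/tflite),
+ # [4]=compiler, [5]=adb device id, [6]=core (A55/A76/x86_HOST),
+ # [7]=precision (fp32/fp16/int8), [8]=cpu/gpu, [10]=testing data subdir,
+ # [12]=executor parameters, [13]=number of ops to remove (onnx only)
+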
DL_FRAMEWORK=${strs_arr[2]} + converter_selection $DL_FRAMEWORK + + core_selection ${strs_arr[6]} + + acc_selection ${strs_arr[7]} + + device_selection ${strs_arr[8]} + + # define model converter param + MODEL_NAME=${strs_arr[0]} + + EXECUTOR="classification" + if [[ "$MODEL_NAME" == "tinybert" || "$MODEL_NAME" == "tinybert384" ]] + then + EXECUTOR="tinybert" + fi + if [ "$MODEL_NAME" == "tinybert_onnx" ] + then + EXECUTOR="tinybert_onnx" + fi + if [ "$MODEL_NAME" == "nmt" ] + then + EXECUTOR="nmt" + fi + if [ "$MODEL_NAME" == "asr_rnnt" ] + then + EXECUTOR="asr_rnnt" + fi + if [[ "$MODEL_NAME" == "asr_convolution_transformer_encoder" || "$MODEL_NAME" == "asr_convolution_transformer_prediction_net" + || "$MODEL_NAME" == "asr_convolution_transformer_joint_net" ]] + then + EXECUTOR="asr_convolution_transformer" + fi + if [[ "$MODEL_NAME" == "tts_encoder_decoder" || "$MODEL_NAME" == "tts_postnet" + || "$MODEL_NAME" == "tts_melgan_vocoder" ]] + then + EXECUTOR="tts" + fi + if [ "$MODEL_NAME" == "vad" ] + then + EXECUTOR="vad" + fi + + REMOVE_OP_NUM=0 + if [ "$DL_FRAMEWORK" == "onnx" ] + then + REMOVE_OP_NUM=${strs_arr[13]} + fi + + COMPILER=${strs_arr[4]} + TESTING_DATA_PATH=$TESTING_DATA_PREFIX${strs_arr[10]} + ORIGINAL_PARAM=${strs_arr[12]} + MODEL_PATH=$DYNAMIC_MODEL_PATH_PREFIX$MODEL_NAME"/" + EXECUTE_PARAM= + BOLT_MODEL_PATH=$MODEL_PATH$MODEL_NAME$BOLT_SUFFIX + for i in $(echo $ORIGINAL_PARAM| tr "+" "\n") + do + j=${i/@/-} + EXECUTE_PARAM=$EXECUTE_PARAM" ""$j" + done + + if [ "$ARCH" == "arm" ] + then + mt_command_line=${MODEL_TOOLS_EXE_PATH}/${ARCH}_${COMPILER}"/bin/"$CONVERTER" -d "$MODEL_PATH" -m "$MODEL_NAME + engine_command_line=${ENGINE_EXE_PATH}/${ARCH}_${COMPILER}"/bin/"$EXECUTOR" ""-m "$BOLT_MODEL_PATH" ""-i "$TESTING_DATA_PATH" "$EXECUTE_PARAM" ""-a "$TASKSET_STR + if [ "$MODEL_NAME" == "vad" ] + then + engine_command_line=${ENGINE_EXE_PATH}/${ARCH}_${COMPILER}"/bin/"$EXECUTOR" ""-m "$BOLT_MODEL_PATH" "$EXECUTE_PARAM" ""-a "$TASKSET_STR + fi + fi + if [ "$ARCH" == "x86" ] + then + mt_command_line=${MODEL_TOOLS_EXE_PATH}/"install_"${ARCH}_${COMPILER}"/tools/"$CONVERTER" -d "$MODEL_PATH" -m "$MODEL_NAME + engine_command_line=${ENGINE_EXE_PATH}/"install_"${ARCH}_${COMPILER}"/examples/"$EXECUTOR" ""-m "$BOLT_MODEL_PATH" ""-i "$TESTING_DATA_PATH" "$EXECUTE_PARAM" " + fi + + if [ ${strs_arr[7]} == "fp32" ] + then + mt_command_line=$mt_command_line" -i FP32" + fi + if [ ${strs_arr[7]} == "fp16" ] + then + mt_command_line=$mt_command_line" -i FP16" + fi + if [ ${strs_arr[7]} == "int8" ] + then + mt_command_line=$mt_command_line" -i PTQ && export LD_LIBRARY_PATH=${BOLT_LIB_PATH}/${ARCH}_${COMPILER}/lib && "${MODEL_TOOLS_EXE_PATH}/${ARCH}_${COMPILER}"/bin/post_training_quantization -p "$MODEL_PATH$MODEL_NAME"_ptq_input.bolt" + fi + + if [[ "$DL_FRAMEWORK" == "onnx" && $REMOVE_OP_NUM -gt 0 ]] + then + mt_command_line=$mt_command_line" -r "$REMOVE_OP_NUM + fi + # skip engine run section + if [[ "$MODEL_NAME" == "tinybert_disambiguate" || "$MODEL_NAME" == "nmt_tsc_encoder" || "$MODEL_NAME" == "nmt_tsc_decoder" || "$MODEL_NAME" == "ghostnet" ]] + then + engine_command_line="echo 'avg_time:0ms/sequence'" + fi + if [ "$ARCH" == "arm" ] + then + mt_command_line="export LD_LIBRARY_PATH=${BOLT_LIB_PATH}/${ARCH}_${COMPILER}/lib && "$mt_command_line + engine_command_line="export LD_LIBRARY_PATH=${BOLT_LIB_PATH}/${ARCH}_${COMPILER}/lib && "$engine_command_line + + ADB_COMMAND_PREFIX="adb -s ${strs_arr[5]} shell" + adb_command_line="${ADB_COMMAND_PREFIX} \"${mt_command_line} > 
${CI_PATH}/mt_result.txt && ${engine_command_line} > ${CI_PATH}/engine_result.txt\"" + adb_pull_result_line="adb -s ${strs_arr[5]} pull ${CI_PATH}/mt_result.txt . && adb -s ${strs_arr[5]} pull ${CI_PATH}/engine_result.txt ." + commands[${#commands[*]}]=`echo "${adb_command_line} && ${adb_pull_result_line}"` + fi + if [ "$ARCH" == "x86" ] + then + commands[${#commands[*]}]=`echo "${mt_command_line} > ./mt_result.txt && ${engine_command_line} > ./engine_result.txt"` + fi +} + +rm -rf ./report.csv + +for((k=0;k<${#commands[@]};k++)){ + line=${commands[k]} + echo "Running_Beginning =====> $line" + eval $line || exit 1 + + MT_RUN_RESULT="MT_RUN_UNKNOWN" + + ENGINE_RUN_RESULT="ENGINE_RUN_UNKNOWN" + + TOP_ONE_ACC= + TOP_FIVE_ACC= + MAX_TIME_RESULT= + MIN_TIME_RESULT= + AVG_TIME_RESULT= + MESSAGE="ERROR" + + if cat ./mt_result.txt | grep "$MESSAGE" > /dev/null + then + MT_RUN_RESULT="MT_RUN_FAIL" + echo "Model conversion failed" + exit 1 + else + MT_RUN_RESULT="MT_RUN_PASS" + fi + + if cat ./engine_result.txt | grep "$MESSAGE" > /dev/null + then + ENGINE_RUN_RESULT="ENGINE_RUN_FAIL" + TOP_ONE_ACC="ERROR" + TOP_FIVE_ACC="ERROR" + MAX_TIME_RESULT="ERROR" + MIN_TIME_RESULT="ERROR" + AVG_TIME_RESULT="ERROR" + echo "Error during inference" + exit 1 + else + ENGINE_RUN_RESULT="ENGINE_RUN_PASS" + TOP_ONE_ACC=$(grep -I "top1" ./engine_result.txt) + TOP_FIVE_ACC=$(grep -I "top5" ./engine_result.txt) + MAX_TIME_RESULT=$(grep -I "max_time" ./engine_result.txt) + MIN_TIME_RESULT=$(grep -I "min_time" ./engine_result.txt) + AVG_TIME_RESULT=$(grep -I "avg_time:" ./engine_result.txt) + fi + + if [[ ${#AVG_TIME_RESULT} -lt 1 ]] + then + echo "Undetected error during inference" + exit 1 + fi + + line=${combinations[k]} + final_arr=() + index=0 + for i in $(echo $line| tr "-" "\n") + do + final_arr[$index]=$i; + let index+=1 + done + + result_line="" + + report_index=0 + deviceId_to_phoneSpecification ${final_arr[5]} + final_arr[5]=$PHONE_SPECIFICATION + final_arr[12]="" + CUR_MODEL_NAME=${final_arr[0]} + for value in "${final_arr[@]}"; + do + if [ $report_index == 11 ] + then + break + fi + + if [ $report_index == 0 ] + then + result_line=$value + else + result_line=$result_line","$value + fi + let report_index+=1 + done + + # add segmentation fault check + SEGMENTATION_FAULT_CHECK=$(grep -I "Segmentation fault" ./mt_result.txt) + if [[ ${#SEGMENTATION_FAULT_CHECK} -gt 0 ]] + then + MT_RUN_RESULT="MT_SEGMENTATION_FAULT" + echo "Segmentation fault during model conversion" + exit 1 + fi + + SEGMENTATION_FAULT_CHECK=$(grep -I "Segmentation fault" ./engine_result.txt) + if [[ ${#SEGMENTATION_FAULT_CHECK} -gt 0 ]] + then + ENGINE_RUN_RESULT="ENGINE_SEGMENTATION_FAULT" + echo "Segmentation fault during inference" + exit 1 + fi + + COMPREHENSIVE_RESULT=$MAX_TIME_RESULT"+"$MIN_TIME_RESULT"+"$AVG_TIME_RESULT"+"$TOP_FIVE_ACC"+"$TOP_ONE_ACC + + if [[ "$CUR_MODEL_NAME" == "tinybert" || "$CUR_MODEL_NAME" == "fingerprint_resnet18" || "$CUR_MODEL_NAME" == "nmt" + || "$CUR_MODEL_NAME" == "asr_convolution_transformer_encoder" || "$CUR_MODEL_NAME" == "asr_convolution_transformer_prediction_net" + || "$CUR_MODEL_NAME" == "asr_convolution_transformer_joint_net" || "$CUR_MODEL_NAME" == "asr_rnnt" || "$CUR_MODEL_NAME" == "vad" + || "$CUR_MODEL_NAME" == "tts_encoder_decoder" || "$CUR_MODEL_NAME" == "tts_postnet" + || "$CUR_MODEL_NAME" == "tts_melgan_vocoder" ]] + then + result_line=$result_line","$MT_RUN_RESULT","$ENGINE_RUN_RESULT","$AVG_TIME_RESULT"," + else +
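+ # Classification-style models (this else branch) report the full metric set:
+ # max/min/avg time plus top1/top5 accuracy parsed from engine_result.txt.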
result_line=$result_line","$MT_RUN_RESULT","$ENGINE_RUN_RESULT","$MAX_TIME_RESULT","$MIN_TIME_RESULT","$AVG_TIME_RESULT","$TOP_FIVE_ACC","$TOP_ONE_ACC"," + fi + rm -rf ./mt_result.txt + rm -rf ./engine_result.txt + + echo "Running_Result =====> $result_line" + + echo $result_line >> ./report.csv + echo " " >> ./report.csv + echo " " + echo " " +} + +cat ./report.csv diff --git a/CI_SCRIPTS/transExecutors.sh b/CI_SCRIPTS/transExecutors.sh new file mode 100644 index 00000000..86d56d7f --- /dev/null +++ b/CI_SCRIPTS/transExecutors.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +script_name=$0 +script_abs=$(readlink -f "$0") +script_dir=$(dirname $script_abs) +bolt_dir=${script_dir}/.. +compiler=$1 +device_dir=/data/local/tmp/CI/${compiler} + +echo "[INFO] compiler ${compiler}" + +upload_program() { + host_dir=$1 + device=$2 + device_dir=$3 + + adb -s ${device} shell "rm -rf ${device_dir}" + adb -s ${device} shell "mkdir ${device_dir}" + adb -s ${device} shell "mkdir ${device_dir}/bin ${device_dir}/lib" + for file in `ls ${host_dir}/examples/*` + do + adb -s ${device} push ${file} ${device_dir}/bin > /dev/null || exit 1 + done + for file in `ls ${host_dir}/lib/*.so` + do + adb -s ${device} push ${file} ${device_dir}/lib > /dev/null || exit 1 + done + adb -s ${device} push ${host_dir}/tools/X2bolt ${device_dir}/bin > /dev/null || exit 1 + adb -s ${device} push ${host_dir}/tools/post_training_quantization ${device_dir}/bin > /dev/null || exit 1 + if [[ "${compiler}" == "arm_llvm" ]] || [[ "${compiler}" == "arm_ndkv7" ]]; then + bash ${script_dir}/../scripts/push_third_party.sh -l ${script_dir}/../third_party/${compiler} -d ${device} -p ${device_dir}/lib -c ${compiler} || exit 1 + fi +} + +# Kirin 810 +upload_program ${bolt_dir}/install_${compiler} E5B0119506000260 ${device_dir} + +# Kirin 990 +upload_program ${bolt_dir}/install_${compiler} GCL5T19822000030 ${device_dir} diff --git a/CMakeLists.txt b/CMakeLists.txt index f7aae059..5e01dd06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,134 +1,84 @@ cmake_minimum_required(VERSION 3.2) -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) if (BOLT_CONFIGURE_FILE) include(${BOLT_CONFIGURE_FILE}) else (BOLT_CONFIGURE_FILE) message(FATAL_ERROR " -FATAL: can not find bolt.cmake in directory, +FATAL: can not find bolt.cmake in /common/cmakes directory, please set shell or cmake environment variable BOLT_ROOT. 
") endif (BOLT_CONFIGURE_FILE) -project(bolt C CXX) - +if (USE_IOS_CLANG) + set(CMAKE_SYSTEM_NAME Darwin) + set(CMAKE_SYSTEM_VERSION 1) + set(UNIX True) + set(APPLE True) + set(IOS True) +endif (USE_IOS_CLANG) + +project(cheetah C CXX) + +set_policy() +SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/common/cmakes") +if (USE_CAFFE OR USE_ONNX OR USE_FLOW) + find_package(Protobuf) +endif() +if (USE_TFLITE) + find_package(TFLite) +endif (USE_TFLITE) +if (USE_TENSORFLOW) + find_package(jsoncpp) +endif (USE_TENSORFLOW) if (USE_MALI) - add_subdirectory(gcl/tools/kernel_lib_compile) -endif (USE_MALI) -add_subdirectory(blas-enhance) -add_subdirectory(model-tools) -add_subdirectory(tensor_computing) -add_subdirectory(image) + find_package(Gcl) +endif(USE_MALI) +if (NOT USE_IOS_CLANG) + if (USE_LLVM_CLANG) + set(USE_JNI ON) + else() + find_package(JNI) + if (JNI_FOUND) + set(USE_JNI ON) + endif() + endif() +endif() + +add_subdirectory(common) +add_subdirectory(model_tools) +add_subdirectory(compute) add_subdirectory(inference) -add_subdirectory(tools) -add_subdirectory(kits) -add_subdirectory(tests) add_custom_target(bolt_library ALL - COMMAND ./scripts/build_light_bolt.sh ${CMAKE_BINARY_DIR} ${USE_MALI} ${USE_DEBUG} ${USE_LLVM_CLANG} ${CMAKE_CXX_COMPILER} ${CMAKE_AR} ${CMAKE_STRIP} + COMMAND ./scripts/build_light_bolt.sh ${CMAKE_CXX_COMPILER} ${CMAKE_AR} ${CMAKE_STRIP} ${CMAKE_BINARY_DIR} ${USE_MALI} ${USE_DEBUG} ${USE_LLVM_CLANG} ${USE_ANDROID_LOG} ${USE_IOS_CLANG} ${USE_OPENMP} WORKING_DIRECTORY $ENV{BOLT_ROOT}) +add_dependencies(bolt_library engine model_tools tensor image blas_enhance uni) +add_dependencies(bolt_library engine_static model_tools_static tensor_static image_static blas_enhance_static uni_static) -if (USE_MALI) - add_dependencies(inference kernelbin) - add_dependencies(inference_static kernelbin_static) - install(TARGETS kernelbin kernelbin_static - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) - install(FILES ${CMAKE_BINARY_DIR}/libOpenCL.so - DESTINATION lib) -endif (USE_MALI) -add_dependencies(tensor_computing blas-enhance) -add_dependencies(tensor_computing_static blas-enhance_static) -add_dependencies(inference tensor_computing model-tools image) -add_dependencies(inference_static tensor_computing_static model-tools_static image_static) -add_dependencies(bolt_library inference) -add_dependencies(bolt_library inference_static) - -install(TARGETS blas-enhance blas-enhance_static - tensor_computing tensor_computing_static - model-tools model-tools_static - image image_static - inference inference_static - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) - -if (USE_CAFFE) - add_dependencies(model-tools model-tools_caffe) - add_dependencies(model-tools_static model-tools_caffe_static) - install(TARGETS caffe2bolt - model-tools_caffe model-tools_caffe_static - RUNTIME DESTINATION tools - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) -endif(USE_CAFFE) - -if (USE_ONNX) - add_dependencies(model-tools model-tools_onnx) - add_dependencies(model-tools_static model-tools_onnx_static) - install(TARGETS onnx2bolt - model-tools_onnx model-tools_onnx_static - RUNTIME DESTINATION tools - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) -endif(USE_ONNX) - -if (USE_TFLITE) - add_dependencies(model-tools model-tools_tflite) - add_dependencies(model-tools_static model-tools_tflite_static) - install(TARGETS tflite2bolt - model-tools_tflite model-tools_tflite_static - RUNTIME DESTINATION tools - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) -endif(USE_TFLITE) - 
-install(DIRECTORY model-tools/tools/tensorflow2caffe - model-tools/tools/pytorch2caffe - DESTINATION tools) - -if (USE_LIBRARY_TUNING) - install(TARGETS tensor_computing_library_search - RUNTIME DESTINATION tools) -endif (USE_LIBRARY_TUNING) - -if (BUILD_TEST) - if (USE_INT8) - install(TARGETS ptq_calibration - RUNTIME DESTINATION tools) - endif(USE_INT8) - install(TARGETS classification - bert - tinybert - nmt - asr_rnnt - asr_convolution_transformer - tts - vad - RUNTIME DESTINATION kits) -endif(BUILD_TEST) - -install(DIRECTORY inference/exports/java - inference/exports/c - DESTINATION include) - -install(FILES ${CMAKE_BINARY_DIR}/libBoltModel.so - ${CMAKE_BINARY_DIR}/libbolt.a - ${CMAKE_BINARY_DIR}/libbolt.so +# install section +install(FILES ${CMAKE_BINARY_DIR}/libbolt.a + DESTINATION lib) +if (USE_IOS_CLANG) + install(FILES ${CMAKE_BINARY_DIR}/libbolt.dylib DESTINATION lib) +else (USE_IOS_CLANG) + install(FILES ${CMAKE_BINARY_DIR}/libbolt.so + DESTINATION lib) + install(FILES ${CMAKE_BINARY_DIR}/libBoltModel.so + DESTINATION lib) +endif (USE_IOS_CLANG) execute_process(COMMAND doxygen .Doxyfile WORKING_DIRECTORY $ENV{BOLT_ROOT}) enable_testing() - find_program (BASH_PROGRAM bash) - -if (BASH_PROGRAM) - set(parameters -t $ENV{BOLT_ROOT}/tests/bin -k $ENV{BOLT_ROOT}/kits/bin -p /data/local/tmp/uldra) +if (BASH_PROGRAM AND USE_GENERAL) + set(parameters -t ${CMAKE_INSTALL_PREFIX} -p /data/local/tmp/uldra) if (USE_MALI) set(parameters ${parameters} -g) endif(USE_MALI) - if (USE_DYNAMIC_LIBRARY) - set(parameters ${parameters} -l $ENV{BOLT_ROOT}/install_llvm/lib) + set(parameters ${parameters} -l ${CMAKE_INSTALL_PREFIX}/lib) endif(USE_DYNAMIC_LIBRARY) - add_test (NAME quick_benchmark COMMAND $ENV{BOLT_ROOT}/quick_benchmark.sh ${parameters}) -endif (BASH_PROGRAM) + add_test (NAME quick_benchmark COMMAND $ENV{BOLT_ROOT}/scripts/quick_benchmark.sh ${parameters}) +endif (BASH_PROGRAM AND USE_GENERAL) diff --git a/README.md b/README.md index db758b2a..c2ef47a5 100644 --- a/README.md +++ b/README.md @@ -2,151 +2,66 @@ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -Bolt is a light-weight library for mobile devices. Bolt, as a universal deployment tool for all kinds of neural networks, aims to minimize the inference runtime as much as possible. Higher speed, better security and more efficient memory management are the advantages that Bolt strives to provide. Feel free to make good use of issue submission, or join our QQ chatroom (Chinese): 833345709. +Bolt is a light-weight library for deep learning. Bolt, as a universal deployment tool for all kinds of neural networks, aims to minimize the inference runtime as much as possible. Higher speed, better security and more efficient memory management are the advantages that Bolt strives to provide. Bolt has been widely deployed and used in many departments of HUAWEI company, such as 2012 Laboratory, CBG and HUAWEI Product Lines. Feel free to make good use of issue submission, or **join our QQ chatroom (Chinese): 833345709**. -# Features - -- ### Overview - - Bolt has almost supported all the ARM-A devices incude ARMv7/ARMv8/ARMv8.2/Mali-GPU. FP16/BNN for CPU and FP16 for GPU are highly optimized. Bolt also support FP32 on ARMv7/ARMv8/ARMv8.2 devices. - - Bolt has its own format of model storage, which helps reduce the memory footprint by storing in FP16, INT8 and 1-bit representations when possible. 
We provide model converters for the following formats: - - - caffe - - onnx - - tflite - - For PyTorch and TensorFlow models, please try to convert them to the onnx or tflite format first. We also had some success in converting these models into customized caffe models. - -- ### Verified Networks - - Bolt has shown its high performance in the inference of common CV and NLP neural networks. Some of the representative networks that we have verified are listed below. You can find detailed benchmark information in [docs/BENCHMARK.md](docs/BENCHMARK.md). - - - Squeezenet - - Mobilenet v1, v2, v3 - - Resnet50, [Ghostnet](https://github.com/huawei-noah/ghostnet) (plus FPN detection) - - Birealnet18 (BNN) - - SSD(Resnet) - - Bert, TinyBert, Albert - - Neural Machine Translation - - Automatic Speech Recognition - - Text To Speech - - For MALI GPU FP16 Support - - Squeezenet v1.1 - - Mobilenet v1, v2, v3 - - Ghostnet - - -- ### Inference Graph Optimizers - - Apart from the refined acceleration of convolutions and GeMM for the supported data precisions, Bolt has a easy use and powerful inference graph optimizer. As shown in [model-tools/include](model-tools/include), classic operator fusion is supported. Bolt is also equipped with a Memory Reuse Optmizer, which reassigns the space occupied by a feature map as soon as it is no longer needed as input or output. Most networks that we tested benefit from a two-third reduction in feature map storage. - -- ### Thread Affinity Setting - - Users can specify the preferred policy (high-performance or low-power). Bolt will select the most suitable core and set the thread affinity. - -- ### Algorithm Tuning - - Bolt can tailor-make the algorithm configuration for your specific target device. - -# Documentation - -- ### Installation - -Bolt provides [install.sh](install.sh) for fast installation. The major third-party dependency is protobuf, and some other may come from the original model format that you want to use. You may also need libjpeg for building [tests/classification](tests). - -After configuring [bolt.cmake](bolt.cmake), the compilation can be as simple as: - -```shell -./install.sh -t 48 -c llvm -``` +# Quick Start -For more details, please refer to [docs/INSTALL.md](docs/INSTALL.md) +![](docs/images/QuickStart.PNG) -- ### User Guide +Generally, there are two steps to get started with bolt. It's quite easy for users to get bolt running quickly. -As a user, what you are normally concerned about include the following 4 parts: +1. Conversion: use **[X2bolt](model_tools/tools/X2bolt/X2bolt.cpp)** to convert your model from caffe, onnx, tflite, or tensorflow to .bolt; -- API (We guarantee that the C API will not be changed in the future) -- Model Preparation -- Model Conversion -- Model Inference +2. Inference: run **[benchmark](inference/examples/benchmark/benchmark.cpp)** with the .bolt model and input data to get the inference result (see the command sketch below). -For the details, please refer to [docs/USER_HANDBOOK.md](docs/USER_HANDBOOK.md) + For more details about the usage of the [**X2bolt**](model_tools/tools/X2bolt/X2bolt.cpp) and [**benchmark**](inference/examples/benchmark/benchmark.cpp) tools, see [docs/USER_HANDBOOK.md](docs/USER_HANDBOOK.md). -- ### Developer Guide - - We welcome all kinds of contribution. Before that, let's get familiar with the project structure. - -- ##### Project Structure - - - [uni](uni) hosts the common headers that are used in the project. - - [gcl](gcl) hosts the setup of MALI GPU environment.
- - [image](image) hosts common preprocessing routines for image inputs (e.g. bilinear interpolation). - - [blas-enhance](blas-enhance) hosts the fast implementation of matrix-matrix multiplication and matrix-vector multiplication of FP32, FP16 and INT8. It is referenced by some of the operators in [tensor_computing](tensor_computing). - - [tensor_computing](tensor_computing) hosts the implementation for individual operators. - - [model-tools](model-tools) hosts everything related to model conversion and optimization. - - [inference](inference) hosts the inference engine of neural networks. - - Lastly, [tests](tests) include all the unit tests for the above functionalities. - - To support your own network, you can first try to convert it with the provided tools. If an operator is missing, you can first add the conversion to [model-tools](model-tools). You may then implement the missing computation routine in [tensor_computing](tensor_computing). Please also define a class for your new operator in [inference](inference). - -- ##### Contribution - -All contributions are welcomed. For the details, please refer to [docs/DEVELOPER.md](docs/DEVELOPER.md) - -- ### Benchmark - -We provide a detailed benchmark report for your reference. For more testing information please refer to [docs/BENCHMARK.md](docs/BENCHMARK.md) . - -# Road Map - -#### v0.4.0 - -Future Release 2020-09-01 - -- Yolo support -- TensorFlow model converter - -# Who are using Bolt - -- HUAWEI CBG -- HUAWEI PORTFOLIO - -# FAQ - -1. Why configuring bolt.cmake does not take effect? - - The [install.sh](install.sh) serves as an example of compilation setup, and it overwrites some settings in [bolt.cmake](bolt.cmake). Please check install.sh first. +# Features -2. More details about dependency libraries for cross-compilation? +- ## Support Frameworks - The major dependency is Protobuf. Protoc should be the x86 version but protbuf should be the ARM version. + caffe, onnx, tflite, tensorflow + +- ## Inference Precision -3. Requirements on tensor dimensions? + Float32, Float16, Int8, 1-bit + +- ## Hardware - For optimal performance, Bolt requires the number of output channels to be divisible by 8. For compatibility, Bolt will try to pad the output channels of convolution layers to the nearest multiple of 8. You can turn on USE_DEBUG in [bolt.cmake](bolt.cmake) to check the actual dimensions. + ARM CPU(v7, v8, v8.2), Mali GPU, X86(AVX2) + +- ## Verified Networks -4. Restrictions for BNN? + Bolt has shown its high performance in the inference of common CV and NLP neural networks. Some of the representative networks that we have verified are listed below. You can find detailed benchmark information in [docs/BENCHMARK.md](docs/BENCHMARK.md). - For BNN convolution layers, the number of output channels must be divisible by 32. + | Application | Models | + | ------------- | ------------------------------------------------------------ | + | CV | Squeezenet/Mobilenet_v1/Mobilenet_v2/Mobilenet_v3/Resnet50
/[Ghostnet](https://github.com/huawei-noah/ghostnet)/SSD/Yolov3/Pointnet/...etc. | + | NLP | Bert/TinyBert/Albert/Neural Machine Translation/Text To Speech
/Automatic Speech Recognition/...etc. | + | More DL Tasks | ... | -5. Restrictions on quantization (int8)? + More models than those listed above are supported; users are encouraged to explore further. - For the time being, Bolt only supports post-training int8 quantization. The quantization method is symmetrical for both activation and weight. We have added a calibration tool for image CNN pipelines. Please feel free to report cases of usage failures. +- ## More Advanced Features -6. Requirements for fp16 and int8? + - Graph Optimization + - Thread Affinity + - Algorithm Tuning + - [Time-Series Data Acceleration](docs/USER_HANDBOOK.md#time-series-data-acceleration) - Only arm-v8.2 supports fp16 and int8 dotprod instructions. +# Documentation -7. Restrictions for MALI? +Everything you want to know about bolt is recorded in the detailed documentation stored in [docs](docs). - Only llvm compilation supports MALI computing. +- [How to install bolt with different compilers](docs/INSTALL.md). +- [How to use bolt to run inference on your ML models.](docs/USER_HANDBOOK.md) +- [How to develop bolt to customize more models.](docs/DEVELOPER.md) +- [Benchmark results on some universal models.](docs/BENCHMARK.md) +- [Frequently Asked Questions (FAQ)](docs/FAQ.md) # Acknowledgement -Bolt refers to the following projects: [caffe](https://github.com/BVLC/caffe), [onnx](https://github.com/onnx/onnx), [protobuf](https://github.com/protocolbuffers/protobuf), [flatbuffers](https://github.com/google/flatbuffers), [ncnn](https://github.com/Tencent/ncnn), [mnn](https://github.com/alibaba/MNN), [dabnn](https://github.com/JDAI-CV/dabnn). +Bolt refers to the following projects: [caffe](https://github.com/BVLC/caffe), [onnx](https://github.com/onnx/onnx), [tensorflow](https://github.com/tensorflow/tensorflow), [ncnn](https://github.com/Tencent/ncnn), [mnn](https://github.com/alibaba/MNN), [dabnn](https://github.com/JDAI-CV/dabnn). # License diff --git a/THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.md b/THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.md deleted file mode 100644 index 8a6531cf..00000000 --- a/THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.md +++ /dev/null @@ -1,71 +0,0 @@ -Please note we provide an open source software notice for the third party open source software along with this software and/or this software component contributed by Huawei (in the following just “this SOFTWARE”). The open source software licenses are granted by the respective right holders. - - - -Warranty Disclaimer - -THE OPEN SOURCE SOFTWARE IN THIS SOFTWARE IS DISTRIBUTED IN THE HOPE THAT IT WILL BE USEFUL, BUT WITHOUT ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. SEE THE APPLICABLE LICENSES FOR MORE DETAILS. - - - -Copyright Notice and License Texts - -Software: model-tools/src/caffe/caffe.proto () - -Copyright notice: - -Copyright (c) 2014-2017 The Regents of the University of California(Regents) - -All right reserved. - -License: BSD 2-Clause License - - - -Copyright - -Software:model-tools/src/onnx/onnx.proto () - -Copyright notice: - -Copyright (c) 2017 ONNX Project Contributors - -All rights reserved. - -License: MIT License - - - -Copyright - -Software:model-tools/cmakes/FindProtobuf.cmake () - -Copyright (c) 2008 Google Inc. - -All rights reserved. - -License: BSD License - - - -Copyright - -Software:model-tools/src/tflite/schema_generated.h () - -Copyright (c) 2019 The TensorFlow Authors. - -All rights reserved.
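To make the two-step Quick Start above concrete, here is a minimal command sketch (the model name, directories, and the `install_arm_llvm` prefix are placeholders; the flags follow the CI scripts added in this patch):

```shell
# Step 1: convert a (hypothetical) caffe model to Bolt's format; with -i FP32,
# X2bolt writes mobilenet_v1_f32.bolt next to the source model.
./install_arm_llvm/tools/X2bolt -d ./model_zoo/caffe_models/mobilenet_v1 \
    -m mobilenet_v1 -i FP32

# Step 2: run the converted model for 6 loops with the high-performance affinity.
./install_arm_llvm/examples/benchmark \
    -m ./model_zoo/caffe_models/mobilenet_v1/mobilenet_v1_f32.bolt \
    -a CPU_AFFINITY_HIGH_PERFORMANCE -l 6
```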
- -License: Apache 2.0 - - - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files(the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - - - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - - - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/blas-enhance/CMakeLists.txt b/blas-enhance/CMakeLists.txt deleted file mode 100644 index dd609e02..00000000 --- a/blas-enhance/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -cmake_minimum_required(VERSION 3.2) - -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) -if (BOLT_CONFIGURE_FILE) - include(${BOLT_CONFIGURE_FILE}) -else (BOLT_CONFIGURE_FILE) - message(FATAL_ERROR " -FATAL: can not find bolt.cmake in directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (BOLT_CONFIGURE_FILE) - -project(blas-enhance) - -set_policy() - -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") -find_package(Uni) -find_package(BlasEnhance) - -set_project_install_directory() - -set_c_cxx_flags() - -add_subdirectory(src) diff --git a/blas-enhance/include/blas-enhance.h b/blas-enhance/include/blas-enhance.h deleted file mode 100644 index 5b8b5389..00000000 --- a/blas-enhance/include/blas-enhance.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#ifndef _H_BLAS_ENHANCE -#define _H_BLAS_ENHANCE - -#include "sys.h" -#include "tensor_desc.h" - -#ifdef __cplusplus -extern "C" { -#endif - - EE matrix_matrix_multiply_tmp_bytes(TensorDesc matrixADesc, TensorDesc matrixBDesc, U32* bytes, Arch arch); - - EE matrix_matrix_multiply(TensorDesc matrixADesc, const void* matrixA, - TensorDesc matrixBDesc, const void* matrixB, - U32 bytes, void* tmp, - TensorDesc matrixCDesc, void* matrixC, Arch arch); - - EE matrix_vector_multiply_tmp_bytes(TensorDesc matrixDesc, TensorDesc vectorDesc, U32* bytes, Arch); - - EE matrix_vector_multiply(TensorDesc matrixDesc, const void* matrix, - TensorDesc vectorDesc, const void* vector, - U32 bytes, void* tmp, - TensorDesc resultDesc, void* result, Arch arch); - - inline DataFormat targetFormat4MatrixB(DataType dt) - { - switch (dt) { - case DT_F16: { - return DF_NKN24; - } - case DT_F32: { -#ifdef __aarch64__ - return DF_NKN12; -#else - return DF_NKN8; -#endif - } - case DT_I8: { - return DF_NKN12K4; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - exit(1); - } - } - } - - EE matrix_matrix_multiply_transform_rhs(TensorDesc desc, const void* src, TensorDesc* descTran,void* dst); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/blas-enhance/src/CMakeLists.txt b/blas-enhance/src/CMakeLists.txt deleted file mode 100644 index af3c35c3..00000000 --- a/blas-enhance/src/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -if (USE_GENERAL) - file(GLOB general_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/general/*.cpp) -endif (USE_GENERAL) - -if (USE_NEON) - if (USE_FP16) - file(GLOB arm_fp16_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/fp16/*.cpp) - endif (USE_FP16) - if (USE_FP32) - file(GLOB arm_fp32_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/fp32/*.cpp) - endif (USE_FP32) - if (USE_INT8) - file(GLOB arm_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/*.cpp) - endif (USE_INT8) - file(GLOB arm_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/*.cpp) - set(arm_srcs "${arm_srcs};${arm_fp16_srcs};${arm_fp32_srcs};${arm_int8_srcs}") -endif (USE_NEON) - -file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) -set(srcs "${srcs};${general_srcs};${arm_srcs}") - -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) - -# shared library -ADD_LIBRARY(${PROJECT_NAME} SHARED ${srcs}) - -# static library -ADD_LIBRARY(${PROJECT_NAME}_static STATIC ${srcs}) - -SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") -SET_TARGET_PROPERTIES(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) -SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) diff --git a/blas-enhance/src/cpu/arm/blas_arm.h b/blas-enhance/src/cpu/arm/blas_arm.h deleted file mode 100644 index e69e3c2b..00000000 --- a/blas-enhance/src/cpu/arm/blas_arm.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_BLAS_ARM -#define _H_BLAS_ARM - -#include "error.h" -#include "sys.h" -#include "type.h" - -EE matrix_vector_multiply_tmp_bytes_arm(bool transpose, - DataType dt, U32 *bytes); - -EE mvm_arm(U32 row, U32 col, DataType dt, bool transpose, - const void *matrix, const void *vector, - void *tmp, - void *result, - Arch arch); - -EE matrix_matrix_multiply_tmp_bytes_arm(U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N, - DataType dt, U32 *bytes); - -EE mmm_arm(U32 matrixC_N, U32 matrixC_M, U32 matrixA_K, - DataType matrixADataType, - const void* matrixAData, const void* matrixBData, - void* tmp, - void* matrixCData, - Arch arch); - -inline U32 pad_to_4_multiple(U32 k) -{ - if (k % 4 == 0) { - return k; - } else { - return (k / 4) * 4 + 4; - } -} - -#endif diff --git a/blas-enhance/src/cpu/arm/fp16/blas_fp16.h b/blas-enhance/src/cpu/arm/fp16/blas_fp16.h deleted file mode 100644 index 1f5c57f8..00000000 --- a/blas-enhance/src/cpu/arm/fp16/blas_fp16.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_BLAS_FP16 -#define _H_BLAS_FP16 - -#include "sys.h" -#include "type.h" -#include "error.h" -#include "tensor_desc.h" - - -EE mvm_fp16(U32 row, U32 col, bool transpose, F16* matrix, F16* vector, F16* result, Arch arch); - -void matrix_matrix_multiply_tmp_bytes_fp16(U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes); - -EE matrix_matrix_multiply_transform_rhsN_fp16(TensorDesc desc, F16* src, F16* dst); - -EE matrix_matrix_multiply_transform_rhsT_fp16(TensorDesc desc, F16* src, F16* dst); - -EE mmm_fp16(int M, int N, int K, F16* matrix1, F16* matrix2, F16* tmp, F16* result, Arch arch); - -#endif diff --git a/blas-enhance/src/cpu/arm/fp16/mmm.cpp b/blas-enhance/src/cpu/arm/fp16/mmm.cpp deleted file mode 100644 index 778bc70e..00000000 --- a/blas-enhance/src/cpu/arm/fp16/mmm.cpp +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "type.h" -#include "error.h" -#include "cpu/arm/fp16/blas_fp16.h" -#include "mmm.h" -#include "mmm_common.h" - - -void matrix_matrix_multiply_tmp_bytes_fp16(U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes) -{ - *bytes = row1 * col1 + row2 * col2; - *bytes *= bytesOf(dt); - *bytes += 32; -} - -EE matrix_matrix_multiply_transform_rhsN_fp16(TensorDesc desc, F16* src, F16* dst) -{ - DataType dt; - U32 N, K; - CHECK_STATUS(tensor2dGet(desc, &dt, &K, &N)); - int i = 0; - for (; i < (int)N - 23; i += 24) { - matrix2_trans(24, K, N, src + i, dst + i * K); - } - for (; i < (int)N - 7; i += 8) { - matrix2_trans(8, K, N, src + i, dst + i * K); - } - for (; i < (int)N - 3; i += 4) { - matrix2_trans(4, K, N, src + i, dst + i * K); - } - if ((int)N > i) { - matrix2_trans(N - i, K, N, src + i, dst + i * K); - } - return SUCCESS; -} - -EE matrix_matrix_multiply_transform_rhsT_fp16(TensorDesc desc, F16* src, F16* dst) -{ - DataType dt; - U32 N, K; - CHECK_STATUS(tensor2dGet(desc, &dt, &N, &K)); - int i = 0; - for (; i < (int)N - 23; i += 24) { - matrix1_trans(24, K, K, src + i * K, dst + i * K); - } - for (; i < (int)N - 7; i += 8) { - matrix1_trans(8, K, K, src + i * K, dst + i * K); - } - for (; i < (int)N - 3; i += 4) { - matrix1_trans(4, K, K, src + i * K, dst + i * K); - } - if ((int)N > i) { - matrix1_trans(N - i, K, K, src + i * K, dst + i * K); - } - return SUCCESS; -} - - -EE mmm_fp16(int M, int N, int K, F16* matrix1, F16* matrix2, F16* tmp, F16* result, Arch arch) -{ - EE ret = SUCCESS; - switch (arch) { - case ARM_A55: - mmm_A55(M, N, K, matrix1, matrix2, tmp, result); - break; - case ARM_A76: - mmm_A76(M, N, K, matrix1, matrix2, tmp, result); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/blas-enhance/src/cpu/arm/fp16/mmm.h b/blas-enhance/src/cpu/arm/fp16/mmm.h deleted file mode 100644 index e66541f0..00000000 --- a/blas-enhance/src/cpu/arm/fp16/mmm.h +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_MMM -#define _H_MMM -#include "type.h" - -void mmm_A55(int M, int N, int K, F16* matrix1, F16* matrix2, F16* tmp, F16* result); - -void mmm_A76(int M, int N, int K, F16* matrix1, F16* matrix2, F16* tmp, F16* result); -#endif diff --git a/blas-enhance/src/cpu/arm/fp16/mmm_A55.cpp b/blas-enhance/src/cpu/arm/fp16/mmm_A55.cpp deleted file mode 100644 index fe473339..00000000 --- a/blas-enhance/src/cpu/arm/fp16/mmm_A55.cpp +++ /dev/null @@ -1,790 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include -#include -#include - -#include "type.h" -#include "error.h" -#include "cpu/arm/fp16/mmm_common.h" -#include "cpu/arm/fp16/mmm.h" - - -inline void mmm_4x24_A55(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - //init in0- > v1, w- > v0 - "ld1 {v1.4h}, [%1], #8\n" - "ldr x22, [%1], #8\n" - "ins v1.d[1], x22\n" - "ld1 {v0.4h}, [%2], #8\n" - "mov x26, %0\n" - "ld1 {v5.8h, v6.8h, v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.8h, v9.8h, v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.8h, v12.8h, v13.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.8h, v15.8h, v16.8h}, [x26]\n" - - "mov x20, %3\n" - - "0:\n" - //w- > v4, in0- > v2/v3/v1, out0=v5~v28 - "fmla v5.8h, v1.8h, v0.h[0]\n" - "ld1 {v2.4h}, [%1], #8\n" - "fmla v8.8h, v1.8h, v0.h[1]\n" - "ldr x23, [%1], #8\n" - "fmla v11.8h, v1.8h, v0.h[2]\n" - "ins v2.d[1], x23\n" - "fmla v14.8h, v1.8h, v0.h[3]\n" - "ld1 {v4.4h}, [%2], #8\n" - "fmla v6.8h, v2.8h, v0.h[0]\n" - "ld1 {v3.4h}, [%1], #8\n" - "fmla v9.8h, v2.8h, v0.h[1]\n" - "ldr x24, [%1], #8\n" - "fmla v12.8h, v2.8h, v0.h[2]\n" - "ins v3.d[1], x24\n" - "fmla v15.8h, v2.8h, v0.h[3]\n" - "fmla v7.8h, v3.8h, v0.h[0]\n" - "ld1 {v1.4h}, [%1], #8\n" - "fmla v10.8h, v3.8h, v0.h[1]\n" - "ldr x22, [%1], #8\n" - "fmla v13.8h, v3.8h, v0.h[2]\n" - "ins v1.d[1], x22\n" - "fmla v16.8h, v3.8h, v0.h[3]\n" - - //w- > v0, in0- > v2/v3/v1, out0- > v5~v28 - "fmla v5.8h, v1.8h, v4.h[0]\n" - "ld1 {v2.4h}, [%1], #8\n" - "fmla v8.8h, v1.8h, v4.h[1]\n" - "ldr x23, [%1], #8\n" - "fmla v11.8h, v1.8h, v4.h[2]\n" - "ins v2.d[1], x23\n" - "fmla v14.8h, v1.8h, v4.h[3]\n" - "ld1 {v0.4h}, [%2], #8\n" - "fmla v6.8h, v2.8h, v4.h[0]\n" - "ld1 {v3.4h}, [%1], #8\n" - "fmla v9.8h, v2.8h, v4.h[1]\n" - "ldr x24, [%1], #8\n" - "fmla v12.8h, v2.8h, v4.h[2]\n" - "ins v3.d[1], x24\n" - "fmla v15.8h, v2.8h, v4.h[3]\n" - "fmla v7.8h, v3.8h, v4.h[0]\n" - "ld1 {v1.4h}, [%1], #8\n" - "fmla v10.8h, v3.8h, v4.h[1]\n" - "ldr x22, [%1], #8\n" - "fmla v13.8h, v3.8h, v4.h[2]\n" - "ins v1.d[1], x22\n" - "fmla v16.8h, v3.8h, v4.h[3]\n" - - "subs x20, x20, #0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - "fmla v5.8h, v1.8h, v0.h[0]\n" - "ld1 {v2.4h}, [%1], #8\n" - "fmla v8.8h, v1.8h, v0.h[1]\n" - "ldr x23, [%1], #8\n" - "fmla v11.8h, v1.8h, v0.h[2]\n" - "ins v2.d[1], x23\n" - "fmla v14.8h, v1.8h, v0.h[3]\n" - "fmla v6.8h, v2.8h, v0.h[0]\n" - "ld1 {v3.4h}, [%1], #8\n" - "fmla v9.8h, v2.8h, v0.h[1]\n" - "ldr x24, [%1], #8\n" - "fmla v12.8h, v2.8h, v0.h[2]\n" - "ins v3.d[1], x24\n" - "fmla v15.8h, v2.8h, v0.h[3]\n" - "fmla v7.8h, v3.8h, v0.h[0]\n" - "fmla v10.8h, v3.8h, v0.h[1]\n" - "fmla v13.8h, v3.8h, v0.h[2]\n" - "fmla v16.8h, v3.8h, v0.h[3]\n" - - "1:\n" - "mov x26, %0\n" - "st1 {v5.8h, v6.8h, v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.8h, v9.8h, v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.8h, v12.8h, v13.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.8h, v15.8h, v16.8h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16" - ); -} - -inline void mmm_8x4_A55(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - "mov x26, %0\n" - "ld1 {v5.h}[0], [x26], #2\n" - "ld1 {v6.h}[0], [x26], #2\n" - "ld1 {v7.h}[0], [x26], #2\n" - "ld1 {v8.h}[0], [x26], #2\n" - "sub x26, x26, 
#8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[1], [x26], #2\n" - "ld1 {v6.h}[1], [x26], #2\n" - "ld1 {v7.h}[1], [x26], #2\n" - "ld1 {v8.h}[1], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[2], [x26], #2\n" - "ld1 {v6.h}[2], [x26], #2\n" - "ld1 {v7.h}[2], [x26], #2\n" - "ld1 {v8.h}[2], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[3], [x26], #2\n" - "ld1 {v6.h}[3], [x26], #2\n" - "ld1 {v7.h}[3], [x26], #2\n" - "ld1 {v8.h}[3], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[4], [x26], #2\n" - "ld1 {v6.h}[4], [x26], #2\n" - "ld1 {v7.h}[4], [x26], #2\n" - "ld1 {v8.h}[4], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[5], [x26], #2\n" - "ld1 {v6.h}[5], [x26], #2\n" - "ld1 {v7.h}[5], [x26], #2\n" - "ld1 {v8.h}[5], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[6], [x26], #2\n" - "ld1 {v6.h}[6], [x26], #2\n" - "ld1 {v7.h}[6], [x26], #2\n" - "ld1 {v8.h}[6], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[7], [x26], #2\n" - "ld1 {v6.h}[7], [x26], #2\n" - "ld1 {v7.h}[7], [x26], #2\n" - "ld1 {v8.h}[7], [x26], #2\n" - "add x26, x26, %4\n" - - "mov x20, %3\n" - - "ld1 {v1.4h}, [%2], #8\n" - "ldr x24, [%2], #8\n" - "ins v1.d[1], x24\n" - "ld1 {v2.4h}, [%1], #8\n" - - "0:\n" - "fmla v5.8h, v1.8h, v2.h[0]\n" - "ld1 {v3.4h}, [%2], #8\n" - "fmla v6.8h, v1.8h, v2.h[1]\n" - "ldr x25, [%2], #8\n" - "ld1 {v4.4h}, [%1], #8\n" - "fmla v7.8h, v1.8h, v2.h[2]\n" - "ins v3.d[1], x25\n" - "fmla v8.8h, v1.8h, v2.h[3]\n" - - "fmla v5.8h, v3.8h, v4.h[0]\n" - "ld1 {v1.4h}, [%2], #8\n" - "fmla v6.8h, v3.8h, v4.h[1]\n" - "ldr x24, [%2], #8\n" - "ld1 {v2.4h}, [%1], #8\n" - "fmla v7.8h, v3.8h, v4.h[2]\n" - "ins v1.d[1], x24\n" - "fmla v8.8h, v3.8h, v4.h[3]\n" - - "subs x20, x20, 0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - "fmla v5.8h, v1.8h, v2.h[0]\n" - "fmla v6.8h, v1.8h, v2.h[1]\n" - "fmla v7.8h, v1.8h, v2.h[2]\n" - "fmla v8.8h, v1.8h, v2.h[3]\n" - - "1:\n" - "mov x26, %0\n" - "st1 {v5.h}[0], [x26], #2\n" - "st1 {v6.h}[0], [x26], #2\n" - "st1 {v7.h}[0], [x26], #2\n" - "st1 {v8.h}[0], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[1], [x26], #2\n" - "st1 {v6.h}[1], [x26], #2\n" - "st1 {v7.h}[1], [x26], #2\n" - "st1 {v8.h}[1], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[2], [x26], #2\n" - "st1 {v6.h}[2], [x26], #2\n" - "st1 {v7.h}[2], [x26], #2\n" - "st1 {v8.h}[2], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[3], [x26], #2\n" - "st1 {v6.h}[3], [x26], #2\n" - "st1 {v7.h}[3], [x26], #2\n" - "st1 {v8.h}[3], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[4], [x26], #2\n" - "st1 {v6.h}[4], [x26], #2\n" - "st1 {v7.h}[4], [x26], #2\n" - "st1 {v8.h}[4], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[5], [x26], #2\n" - "st1 {v6.h}[5], [x26], #2\n" - "st1 {v7.h}[5], [x26], #2\n" - "st1 {v8.h}[5], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[6], [x26], #2\n" - "st1 {v6.h}[6], [x26], #2\n" - "st1 {v7.h}[6], [x26], #2\n" - "st1 {v8.h}[6], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[7], [x26], #2\n" - "st1 {v6.h}[7], [x26], #2\n" - "st1 {v7.h}[7], [x26], #2\n" - "st1 {v8.h}[7], [x26], #2\n" - "add x26, x26, %4\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x20","x24","x25","x26","x27", "v1", "v2", "v3","v4", "v5","v6", "v7", "v8" - 
); -} - -inline void mmm_4x8_A55(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - "mov x26, %0\n" - "ld1 {v5.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v6.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.8h}, [x26]\n" - - "mov x20, %3\n" - - "ld1 {v1.4h}, [%1], #8\n" - "ldr x24, [%1], #8\n" - "ins v1.d[1], x24\n" - "ld1 {v2.4h}, [%2], #8\n" - - "0:\n" - "fmla v5.8h, v1.8h, v2.h[0]\n" - "ld1 {v3.4h}, [%1], #8\n" - "fmla v6.8h, v1.8h, v2.h[1]\n" - "ldr x25, [%1], #8\n" - "ld1 {v4.4h}, [%2], #8\n" - "fmla v7.8h, v1.8h, v2.h[2]\n" - "ins v3.d[1], x25\n" - "fmla v8.8h, v1.8h, v2.h[3]\n" - "fmla v5.8h, v3.8h, v4.h[0]\n" - "ld1 {v1.4h}, [%1], #8\n" - "fmla v6.8h, v3.8h, v4.h[1]\n" - "ldr x24, [%1], #8\n" - "ld1 {v2.4h}, [%2], #8\n" - "fmla v7.8h, v3.8h, v4.h[2]\n" - "ins v1.d[1], x24\n" - "fmla v8.8h, v3.8h, v4.h[3]\n" - - "subs x20, x20, 0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - "fmla v5.8h, v1.8h, v2.h[0]\n" - "fmla v6.8h, v1.8h, v2.h[1]\n" - "fmla v7.8h, v1.8h, v2.h[2]\n" - "fmla v8.8h, v1.8h, v2.h[3]\n" - - "1:\n" - "mov x26, %0\n" - "st1 {v5.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v6.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.8h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x20","x24","x25","x26","x27", "v1", "v2", "v3","v4", "v5","v6", "v7", "v8" - ); -} - -inline void mmm_4x4_A55(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - "mov x26, %0\n" - "ld1 {v5.4h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v6.4h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v7.4h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.4h}, [x26]\n" - - "mov x20, %3\n" - - "ld1 {v1.4h}, [%1], #8\n" - "ld1 {v2.4h}, [%2], #8\n" - - "0:\n" - "fmla v5.4h, v1.4h, v2.h[0]\n" - "ld1 {v3.4h}, [%1], #8\n" - "fmla v6.4h, v1.4h, v2.h[1]\n" - "ld1 {v4.4h}, [%2], #8\n" - "fmla v7.4h, v1.4h, v2.h[2]\n" - "fmla v8.4h, v1.4h, v2.h[3]\n" - "fmla v5.4h, v3.4h, v4.h[0]\n" - "ld1 {v1.4h}, [%1], #8\n" - "fmla v6.4h, v3.4h, v4.h[1]\n" - "ld1 {v2.4h}, [%2], #8\n" - "fmla v7.4h, v3.4h, v4.h[2]\n" - "fmla v8.4h, v3.4h, v4.h[3]\n" - - "subs x20, x20, 0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - "fmla v5.4h, v1.4h, v2.h[0]\n" - "fmla v6.4h, v1.4h, v2.h[1]\n" - "fmla v7.4h, v1.4h, v2.h[2]\n" - "fmla v8.4h, v1.4h, v2.h[3]\n" - - "1:\n" - "mov x26, %0\n" - "st1 {v5.4h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v6.4h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v7.4h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.4h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x20","x26","v1", "v2", "v3","v4", "v5","v6", "v7", "v8" - ); -} - -inline void mmm_8x8_A55(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - "mov x26, %0\n" - "ld1 {v5.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v6.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v9.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v12.8h}, [x26]\n" - - "mov x20, %3\n" - - "ld1 {v1.4h}, [%1], #8\n" - "ldr x24, [%1], #8\n" - "ins v1.d[1], x24\n" - "ld1 {v2.4h}, [%2], #8\n" - "ldr x22, [%2], #8\n" - "ins v2.d[1], 
x22\n" - - "0:\n" - "fmla v5.8h, v1.8h, v2.h[0]\n" - "ld1 {v3.4h}, [%1], #8\n" - "fmla v6.8h, v1.8h, v2.h[1]\n" - "ldr x25, [%1], #8\n" - "fmla v7.8h, v1.8h, v2.h[2]\n" - "ins v3.d[1], x25\n" - "fmla v8.8h, v1.8h, v2.h[3]\n" - "ld1 {v4.4h}, [%2], #8\n" - "fmla v9.8h, v1.8h, v2.h[4]\n" - "ldr x23, [%2], #8\n" - "fmla v10.8h, v1.8h, v2.h[5]\n" - "ins v4.d[1], x23\n" - "fmla v11.8h, v1.8h, v2.h[6]\n" - "fmla v12.8h, v1.8h, v2.h[7]\n" - - "fmla v5.8h, v3.8h, v4.h[0]\n" - "ld1 {v1.4h}, [%1], #8\n" - "fmla v6.8h, v3.8h, v4.h[1]\n" - "ldr x24, [%1], #8\n" - "fmla v7.8h, v3.8h, v4.h[2]\n" - "ins v1.d[1], x24\n" - "fmla v8.8h, v3.8h, v4.h[3]\n" - "ld1 {v2.4h}, [%2], #8\n" - "fmla v9.8h, v3.8h, v4.h[4]\n" - "ldr x22, [%2], #8\n" - "fmla v10.8h, v3.8h, v4.h[5]\n" - "ins v2.d[1], x22\n" - "fmla v11.8h, v3.8h, v4.h[6]\n" - "fmla v12.8h, v3.8h, v4.h[7]\n" - - "subs x20, x20, 0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - "fmla v5.8h, v1.8h, v2.h[0]\n" - "fmla v6.8h, v1.8h, v2.h[1]\n" - "fmla v7.8h, v1.8h, v2.h[2]\n" - "fmla v8.8h, v1.8h, v2.h[3]\n" - "fmla v9.8h, v1.8h, v2.h[4]\n" - "fmla v10.8h, v1.8h, v2.h[5]\n" - "fmla v11.8h, v1.8h, v2.h[6]\n" - "fmla v12.8h, v1.8h, v2.h[7]\n" - - "1:\n" - "mov x26, %0\n" - "st1 {v5.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v6.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v9.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v12.8h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", - "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12" - ); -} - -inline void mmm_8x24_A55(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - //init in0- > v1, w- > v0 - "ld1 {v1.4h}, [%1], #8\n" - "ldr x22, [%1], #8\n" - "ins v1.d[1], x22\n" - "ld1 {v0.4h}, [%2], #8\n" - "ldr x21, [%2], #8\n" - "ins v0.d[1], x21\n" - - "mov x26, %0\n" - "ld1 {v5.8h, v6.8h, v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.8h, v9.8h, v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.8h, v12.8h, v13.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.8h, v15.8h, v16.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v17.8h, v18.8h, v19.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v20.8h, v21.8h, v22.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v23.8h, v24.8h, v25.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v26.8h, v27.8h, v28.8h}, [x26]\n" - - "mov x20, %3\n" - - "0:\n" - //w- > v4, in0- > v2/v3/v1, out0=v5~v28 - "ld1 {v2.4h}, [%1], #8\n" - "fmla v5.8h, v1.8h, v0.h[0]\n" - "fmla v8.8h, v1.8h, v0.h[1]\n" - "ldr x23, [%1], #8\n" - "fmla v11.8h, v1.8h, v0.h[2]\n" - "fmla v14.8h, v1.8h, v0.h[3]\n" - "ins v2.d[1], x23\n" - "fmla v17.8h, v1.8h, v0.h[4]\n" - "fmla v20.8h, v1.8h, v0.h[5]\n" - "ld1 {v4.4h}, [%2], #8\n" - "fmla v23.8h, v1.8h, v0.h[6]\n" - "fmla v26.8h, v1.8h, v0.h[7]\n" - - "ld1 {v3.4h}, [%1], #8\n" - "fmla v6.8h, v2.8h, v0.h[0]\n" - "fmla v9.8h, v2.8h, v0.h[1]\n" - "ldr x24, [%1], #8\n" - "fmla v12.8h, v2.8h, v0.h[2]\n" - "fmla v15.8h, v2.8h, v0.h[3]\n" - "ins v3.d[1], x24\n" - "fmla v18.8h, v2.8h, v0.h[4]\n" - "fmla v21.8h, v2.8h, v0.h[5]\n" - "ldr x25, [%2], #8\n" - "fmla v24.8h, v2.8h, v0.h[6]\n" - "fmla v27.8h, v2.8h, v0.h[7]\n" - - "ld1 {v1.4h}, [%1], #8\n" - "fmla v7.8h, v3.8h, v0.h[0]\n" - "fmla v10.8h, v3.8h, 
v0.h[1]\n" - "ldr x22, [%1], #8\n" - "fmla v13.8h, v3.8h, v0.h[2]\n" - "fmla v16.8h, v3.8h, v0.h[3]\n" - "ins v1.d[1], x22\n" - "fmla v19.8h, v3.8h, v0.h[4]\n" - "fmla v22.8h, v3.8h, v0.h[5]\n" - "ins v4.d[1], x25\n" - "fmla v25.8h, v3.8h, v0.h[6]\n" - "fmla v28.8h, v3.8h, v0.h[7]\n" - - //w- > v0, in0- > v2/v3/v1, out0- > v5~v28 - "ld1 {v2.4h}, [%1], #8\n" - "fmla v5.8h, v1.8h, v4.h[0]\n" - "fmla v8.8h, v1.8h, v4.h[1]\n" - "ldr x23, [%1], #8\n" - "fmla v11.8h, v1.8h, v4.h[2]\n" - "fmla v14.8h, v1.8h, v4.h[3]\n" - "ins v2.d[1], x23\n" - "fmla v17.8h, v1.8h, v4.h[4]\n" - "fmla v20.8h, v1.8h, v4.h[5]\n" - "ld1 {v0.4h}, [%2], #8\n" - "fmla v23.8h, v1.8h, v4.h[6]\n" - "fmla v26.8h, v1.8h, v4.h[7]\n" - - "ld1 {v3.4h}, [%1], #8\n" - "fmla v6.8h, v2.8h, v4.h[0]\n" - "fmla v9.8h, v2.8h, v4.h[1]\n" - "ldr x24, [%1], #8\n" - "fmla v12.8h, v2.8h, v4.h[2]\n" - "fmla v15.8h, v2.8h, v4.h[3]\n" - "ins v3.d[1], x24\n" - "fmla v18.8h, v2.8h, v4.h[4]\n" - "fmla v21.8h, v2.8h, v4.h[5]\n" - "ldr x21, [%2], #8\n" - "fmla v24.8h, v2.8h, v4.h[6]\n" - "fmla v27.8h, v2.8h, v4.h[7]\n" - - "ld1 {v1.4h}, [%1], #8\n" - "fmla v7.8h, v3.8h, v4.h[0]\n" - "fmla v10.8h, v3.8h, v4.h[1]\n" - "ldr x22, [%1], #8\n" - "fmla v13.8h, v3.8h, v4.h[2]\n" - "fmla v16.8h, v3.8h, v4.h[3]\n" - "ins v1.d[1], x22\n" - "fmla v19.8h, v3.8h, v4.h[4]\n" - "fmla v22.8h, v3.8h, v4.h[5]\n" - "ins v0.d[1], x21\n" - "fmla v25.8h, v3.8h, v4.h[6]\n" - "subs x20, x20, #0x2\n" - "fmla v28.8h, v3.8h, v4.h[7]\n" - - "bne 0b\n" - - "cbz %5, 1f\n" - "ld1 {v2.4h}, [%1], #8\n" - "fmla v5.8h, v1.8h, v0.h[0]\n" - "fmla v8.8h, v1.8h, v0.h[1]\n" - "ldr x23, [%1], #8\n" - "fmla v11.8h, v1.8h, v0.h[2]\n" - "fmla v14.8h, v1.8h, v0.h[3]\n" - "ins v2.d[1], x23\n" - "fmla v17.8h, v1.8h, v0.h[4]\n" - "fmla v20.8h, v1.8h, v0.h[5]\n" - "fmla v23.8h, v1.8h, v0.h[6]\n" - "fmla v26.8h, v1.8h, v0.h[7]\n" - - "ld1 {v3.4h}, [%1], #8\n" - "fmla v6.8h, v2.8h, v0.h[0]\n" - "fmla v9.8h, v2.8h, v0.h[1]\n" - "ldr x24, [%1], #8\n" - "fmla v12.8h, v2.8h, v0.h[2]\n" - "fmla v15.8h, v2.8h, v0.h[3]\n" - "ins v3.d[1], x24\n" - "fmla v18.8h, v2.8h, v0.h[4]\n" - "fmla v21.8h, v2.8h, v0.h[5]\n" - "fmla v24.8h, v2.8h, v0.h[6]\n" - "fmla v27.8h, v2.8h, v0.h[7]\n" - - "fmla v7.8h, v3.8h, v0.h[0]\n" - "fmla v10.8h, v3.8h, v0.h[1]\n" - "fmla v13.8h, v3.8h, v0.h[2]\n" - "fmla v16.8h, v3.8h, v0.h[3]\n" - "fmla v19.8h, v3.8h, v0.h[4]\n" - "fmla v22.8h, v3.8h, v0.h[5]\n" - "fmla v25.8h, v3.8h, v0.h[6]\n" - "fmla v28.8h, v3.8h, v0.h[7]\n" - - "1:\n" - "mov x26, %0\n" - "st1 {v5.8h, v6.8h, v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.8h, v9.8h, v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.8h, v12.8h, v13.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.8h, v15.8h, v16.8h}, [x26]\n" - "add x26, x26, %4\n" - - "st1 {v17.8h, v18.8h, v19.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v20.8h, v21.8h, v22.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v23.8h, v24.8h, v25.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v26.8h, v27.8h, v28.8h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28" - ); -} - -void mmm_A55(int M, int N, int K, F16* matrix1, F16* matrix2, F16* tmp, F16* result) { - int blockK = K; - int blockM = 192; - F16* matrix1Trans = tmp; - F16* resultCurrent 
= result; - int KInner, MInner, m, n; - for(int k = 0; k < K; k += blockK) { - KInner = UNI_MIN(blockK, K - k); - for(int i = 0; i < M; i+=blockM) { - - MInner = UNI_MIN(blockM, M - i); - - for(n = 0; n <= N - 8; n+=8) { - if (i == 0) { - matrix1_trans(8, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); - } - for(m = 0; m <= (MInner-24); m+=24) { - resultCurrent = result + n * M + m + i; - mmm_8x24_A55(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - for(; m <=(MInner - 8); m+=8) { - resultCurrent = result + n * M + m + i; - mmm_8x8_A55(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - if ((MInner - m) >= 4) { - resultCurrent = result + n * M + m + i; - mmm_8x4_A55(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - m += 4; - } - - if (MInner - m) { - resultCurrent = result + n * M + m + i; - mmm_N8_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - } - - if ((N - n) >= 4) { - - if (i == 0) { - matrix1_trans(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); - } - - for(m = 0; m <= (MInner - 24); m+=24) { - resultCurrent = result + n * M + m + i; - mmm_4x24_A55(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - for(; m <= (MInner - 8); m+=8) { - resultCurrent = result + n * M + m + i; - mmm_4x8_A55(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - if ((MInner - m) >= 4) { - resultCurrent = result + n * M + m + i; - mmm_4x4_A55(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - m += 4; - } - - if (MInner - m) { - resultCurrent = result + n * M + m + i; - mmm_N4_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - n += 4; - - } - - if (N - n) { - if (i == 0) { - matrix1_trans(N-n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); - } - - for(m = 0; m <= (MInner - 24); m+=24) { - resultCurrent = result + n * M + m + i; - mmm_NTail_M24(M, N - n, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - for(; m <= (MInner - 8); m+=8) { - resultCurrent = result + n * M + m + i; - mmm_NTail_M8(M, N - n, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - if ((MInner - m) >= 4) { - resultCurrent = result + n * M + m + i; - mmm_NTail_M4(M, N - n, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - m += 4; - } - - if (MInner - m) { - resultCurrent = result + n * M + m + i; - mmm_NTail_M(MInner - m, M, N - n, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - } - - } - } -} diff --git a/blas-enhance/src/cpu/arm/fp16/mmm_A76.cpp b/blas-enhance/src/cpu/arm/fp16/mmm_A76.cpp deleted file mode 100644 index 83deca1e..00000000 --- a/blas-enhance/src/cpu/arm/fp16/mmm_A76.cpp +++ /dev/null @@ -1,634 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
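All of the *_A55 tile kernels above (and the *_A76 ones that follow) compute the same update and differ only in tile shape and instruction scheduling. A scalar reference of that update, as a sketch: tileN rows come from the packed matrix1 panel, tileM columns from the matrix2 panel, and the asm kernels receive the output row stride in bytes, which is why the drivers pass M*2 for FP16 (here the stride is in elements).

inline void mmm_tile_ref(U32 tileN, U32 tileM, U32 stride, U32 K,
    const F16 *wPanel, const F16 *inPanel, F16 *out)
{
    for (U32 k = 0; k < K; k++) {
        for (U32 n = 0; n < tileN; n++) {
            for (U32 m = 0; m < tileM; m++) {
                // same accumulation as mmm_NTail_M, specialized by the asm kernels
                out[n * stride + m] += wPanel[k * tileN + n] * inPanel[k * tileM + m];
            }
        }
    }
}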
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include - -#include "type.h" -#include "error.h" -#include "cpu/arm/fp16/mmm_common.h" -#include "cpu/arm/fp16/mmm.h" - -#define MMM_FMA_4x8_V5V14s3_V1xV0 "fmla v5.8h, v1.8h, v0.h[0]\n"\ - "fmla v8.8h, v1.8h, v0.h[1]\n"\ - "fmla v11.8h, v1.8h, v0.h[2]\n"\ - "fmla v14.8h, v1.8h, v0.h[3]\n" -#define MMM_FMA_4x8_V17V26s3_V1xV0 "fmla v17.8h, v1.8h, v0.h[4]\n"\ - "fmla v20.8h, v1.8h, v0.h[5]\n"\ - "fmla v23.8h, v1.8h, v0.h[6]\n"\ - "fmla v26.8h, v1.8h, v0.h[7]\n" -#define MMM_FMA_4x8_V6V15s3_V2xV0 "fmla v6.8h, v2.8h, v0.h[0]\n"\ - "fmla v9.8h, v2.8h, v0.h[1]\n"\ - "fmla v12.8h, v2.8h, v0.h[2]\n"\ - "fmla v15.8h, v2.8h, v0.h[3]\n" -#define MMM_FMA_4x8_V18V27s3_V2xV0 "fmla v18.8h, v2.8h, v0.h[4]\n"\ - "fmla v21.8h, v2.8h, v0.h[5]\n"\ - "fmla v24.8h, v2.8h, v0.h[6]\n"\ - "fmla v27.8h, v2.8h, v0.h[7]\n" -#define MMM_FMA_4x8_V7V16s3_V3xV0 "fmla v7.8h, v3.8h, v0.h[0]\n"\ - "fmla v10.8h, v3.8h, v0.h[1]\n"\ - "fmla v13.8h, v3.8h, v0.h[2]\n"\ - "fmla v16.8h, v3.8h, v0.h[3]\n" -#define MMM_FMA_4x8_V19V28s3_V3xV0 "fmla v19.8h, v3.8h, v0.h[4]\n"\ - "fmla v22.8h, v3.8h, v0.h[5]\n"\ - "fmla v25.8h, v3.8h, v0.h[6]\n"\ - "fmla v28.8h, v3.8h, v0.h[7]\n" -#define MMM_FMA_4x8_V5V14s3_V29xV4 "fmla v5.8h, v29.8h, v4.h[0]\n"\ - "fmla v8.8h, v29.8h, v4.h[1]\n"\ - "fmla v11.8h, v29.8h, v4.h[2]\n"\ - "fmla v14.8h, v29.8h, v4.h[3]\n" -#define MMM_FMA_4x8_V17V26s3_V29xV4 "fmla v17.8h, v29.8h, v4.h[4]\n"\ - "fmla v20.8h, v29.8h, v4.h[5]\n"\ - "fmla v23.8h, v29.8h, v4.h[6]\n"\ - "fmla v26.8h, v29.8h, v4.h[7]\n" -#define MMM_FMA_4x8_V6V15s3_V30xV4 "fmla v6.8h, v30.8h, v4.h[0]\n"\ - "fmla v9.8h, v30.8h, v4.h[1]\n"\ - "fmla v12.8h, v30.8h, v4.h[2]\n"\ - "fmla v15.8h, v30.8h, v4.h[3]\n" -#define MMM_FMA_4x8_V18V27s3_V30xV4 "fmla v18.8h, v30.8h, v4.h[4]\n"\ - "fmla v21.8h, v30.8h, v4.h[5]\n"\ - "fmla v24.8h, v30.8h, v4.h[6]\n"\ - "fmla v27.8h, v30.8h, v4.h[7]\n" -#define MMM_FMA_4x8_V7V16s3_V31xV4 "fmla v7.8h, v31.8h, v4.h[0]\n"\ - "fmla v10.8h, v31.8h, v4.h[1]\n"\ - "fmla v13.8h, v31.8h, v4.h[2]\n"\ - "fmla v16.8h, v31.8h, v4.h[3]\n" -#define MMM_FMA_4x8_V19V28s3_V31xV4 "fmla v19.8h, v31.8h, v4.h[4]\n"\ - "fmla v22.8h, v31.8h, v4.h[5]\n"\ - "fmla v25.8h, v31.8h, v4.h[6]\n"\ - "fmla v28.8h, v31.8h, v4.h[7]\n" - -inline void mmm_4x24_A76(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - //init in0- > v1, w- > v0 - "ld1 {v1.8h}, [%1], #16\n" - "ld1 {v0.4h}, [%2], #8\n" - "mov x26, %0\n" - "ld1 {v5.8h, v6.8h, v7.8h}, 
[x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.8h, v9.8h, v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.8h, v12.8h, v13.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.8h, v15.8h, v16.8h}, [x26]\n" - - "mov x20, %3\n" - - "0:\n" - //w- > v4, in0- > v2/v3/v1, out0=v5~v28 - "ld1 {v2.8h}, [%1], #16\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - "ld1 {v3.8h}, [%1], #16\n" - "ld1 {v29.8h}, [%1], #16\n" - MMM_FMA_4x8_V6V15s3_V2xV0 - "ld1 {v4.4h}, [%2], #8\n" - MMM_FMA_4x8_V7V16s3_V3xV0 - - //w- > v0, in0- > v2/v3/v1, out0- > v5~v28 - "ld1 {v30.8h}, [%1], #16\n" - MMM_FMA_4x8_V5V14s3_V29xV4 - "ld1 {v31.8h}, [%1], #16\n" - MMM_FMA_4x8_V6V15s3_V30xV4 - "ld1 {v1.8h}, [%1], #16\n" - "ld1 {v0.4h}, [%2], #8\n" - MMM_FMA_4x8_V7V16s3_V31xV4 - - "subs x20, x20, #0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - "ld1 {v2.8h}, [%1], #16\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - "ld1 {v3.8h}, [%1], #16\n" - MMM_FMA_4x8_V6V15s3_V2xV0 - MMM_FMA_4x8_V7V16s3_V3xV0 - - "1:\n" - "mov x26, %0\n" - "st1 {v5.8h, v6.8h, v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.8h, v9.8h, v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.8h, v12.8h, v13.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.8h, v15.8h, v16.8h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x20", "x26", - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v29", "v30", "v31" - ); -} -inline void mmm_8x4_A76(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - "ld1 {v1.8h}, [%2], #16\n" - "ld1 {v0.4h}, [%1], #8\n" - - "mov x26, %0\n" - "ld1 {v5.h}[0], [x26], #2\n" - "ld1 {v8.h}[0], [x26], #2\n" - "ld1 {v11.h}[0], [x26], #2\n" - "ld1 {v14.h}[0], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[1], [x26], #2\n" - "ld1 {v8.h}[1], [x26], #2\n" - "ld1 {v11.h}[1], [x26], #2\n" - "ld1 {v14.h}[1], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[2], [x26], #2\n" - "ld1 {v8.h}[2], [x26], #2\n" - "ld1 {v11.h}[2], [x26], #2\n" - "ld1 {v14.h}[2], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[3], [x26], #2\n" - "ld1 {v8.h}[3], [x26], #2\n" - "ld1 {v11.h}[3], [x26], #2\n" - "ld1 {v14.h}[3], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[4], [x26], #2\n" - "ld1 {v8.h}[4], [x26], #2\n" - "ld1 {v11.h}[4], [x26], #2\n" - "ld1 {v14.h}[4], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[5], [x26], #2\n" - "ld1 {v8.h}[5], [x26], #2\n" - "ld1 {v11.h}[5], [x26], #2\n" - "ld1 {v14.h}[5], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[6], [x26], #2\n" - "ld1 {v8.h}[6], [x26], #2\n" - "ld1 {v11.h}[6], [x26], #2\n" - "ld1 {v14.h}[6], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[7], [x26], #2\n" - "ld1 {v8.h}[7], [x26], #2\n" - "ld1 {v11.h}[7], [x26], #2\n" - "ld1 {v14.h}[7], [x26], #2\n" - - "mov x20, %3\n" - - "0:\n" - "ld1 {v4.4h}, [%1], #8\n" - "ld1 {v29.8h}, [%2], #16\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - "ld1 {v1.8h}, [%2], #16\n" - "ld1 {v0.4h}, [%1], #8\n" - MMM_FMA_4x8_V5V14s3_V29xV4 - - "subs x20, x20, 0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - - "1:\n" - "mov x26, %0\n" - "st1 {v5.h}[0], [x26], #2\n" - "st1 {v8.h}[0], [x26], #2\n" - "st1 {v11.h}[0], [x26], #2\n" - "st1 {v14.h}[0], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[1], [x26], #2\n" - "st1 {v8.h}[1], [x26], #2\n" - 
"st1 {v11.h}[1], [x26], #2\n" - "st1 {v14.h}[1], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[2], [x26], #2\n" - "st1 {v8.h}[2], [x26], #2\n" - "st1 {v11.h}[2], [x26], #2\n" - "st1 {v14.h}[2], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[3], [x26], #2\n" - "st1 {v8.h}[3], [x26], #2\n" - "st1 {v11.h}[3], [x26], #2\n" - "st1 {v14.h}[3], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[4], [x26], #2\n" - "st1 {v8.h}[4], [x26], #2\n" - "st1 {v11.h}[4], [x26], #2\n" - "st1 {v14.h}[4], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[5], [x26], #2\n" - "st1 {v8.h}[5], [x26], #2\n" - "st1 {v11.h}[5], [x26], #2\n" - "st1 {v14.h}[5], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[6], [x26], #2\n" - "st1 {v8.h}[6], [x26], #2\n" - "st1 {v11.h}[6], [x26], #2\n" - "st1 {v14.h}[6], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[7], [x26], #2\n" - "st1 {v8.h}[7], [x26], #2\n" - "st1 {v11.h}[7], [x26], #2\n" - "st1 {v14.h}[7], [x26], #2\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x20", "x26", "v0", "v1", "v4", "v29", "v5", "v8", "v11", "v14" - ); -} - -inline void mmm_4x8_A76(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - "ld1 {v1.8h}, [%1], #16\n" - "ld1 {v0.4h}, [%2], #8\n" - "mov x26, %0\n" - "ld1 {v5.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.8h}, [x26]\n" - - "mov x20, %3\n" - - "0:\n" - "ld1 {v29.8h}, [%1], #16\n" - "ld1 {v4.4h}, [%2], #8\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - "ld1 {v1.8h}, [%1], #16\n" - "ld1 {v0.4h}, [%2], #8\n" - MMM_FMA_4x8_V5V14s3_V29xV4 - - "subs x20, x20, 0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - - "1:\n" - "mov x26, %0\n" - "st1 {v5.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.8h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x20", "x26", "v0", "v1", "v4", "v5","v8", "v11","v14", "v29" - ); -} - -inline void mmm_4x4_A76(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - "ld1 {v1.4h}, [%1], #8\n" - "ld1 {v0.4h}, [%2], #8\n" - "mov x26, %0\n" - "ld1 {v5.4h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.4h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.4h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.4h}, [x26]\n" - - "mov x20, %3\n" - - "0:\n" - "ld1 {v29.4h}, [%1], #8\n" - "ld1 {v4.4h}, [%2], #8\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - "ld1 {v1.4h}, [%1], #8\n" - "ld1 {v0.4h}, [%2], #8\n" - MMM_FMA_4x8_V5V14s3_V29xV4 - - "subs x20, x20, 0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - - "1:\n" - "mov x26, %0\n" - "st1 {v5.4h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.4h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.4h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.4h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x20", "x26", "v0", "v1", "v4", "v29", "v5", "v8", "v11", "v14" - ); -} - -inline void mmm_8x8_A76(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - "mov x26, %0\n" - "ld1 {v5.8h}, 
[x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v17.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v20.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v23.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v26.8h}, [x26]\n" - - "mov x20, %3\n" - - "ld1 {v1.8h}, [%1], #16\n" - "ld1 {v0.8h}, [%2], #16\n" - - "0:\n" - "ld1 {v29.8h}, [%1], #16\n" - "ld1 {v4.8h}, [%2], #16\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - MMM_FMA_4x8_V17V26s3_V1xV0 - - "ld1 {v1.8h}, [%1], #16\n" - "ld1 {v0.8h}, [%2], #16\n" - MMM_FMA_4x8_V5V14s3_V29xV4 - MMM_FMA_4x8_V17V26s3_V29xV4 - - "subs x20, x20, 0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - MMM_FMA_4x8_V17V26s3_V1xV0 - - "1:\n" - "mov x26, %0\n" - "st1 {v5.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v17.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v20.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v23.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v26.8h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x20", "x26", - "v1", "v0", "v29", "v4", "v5", "v8", "v11", "v14", "v17", "v20", "v23", "v26" - ); -} - -inline void mmm_8x24_A76(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - //init in0- > v1, w- > v0 - "ld1 {v1.8h}, [%1], #16\n" - "ld1 {v0.8h}, [%2], #16\n" - "mov x26, %0\n" - "ld1 {v5.8h, v6.8h, v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.8h, v9.8h, v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.8h, v12.8h, v13.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.8h, v15.8h, v16.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v17.8h, v18.8h, v19.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v20.8h, v21.8h, v22.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v23.8h, v24.8h, v25.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v26.8h, v27.8h, v28.8h}, [x26]\n" - - "mov x20, %3\n" - - "0:\n" - //w- > v4, in0- > v2/v3/v1, out0=v5~v28 - "ld1 {v2.8h}, [%1], #16\n" - "ld1 {v3.8h}, [%1], #16\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - MMM_FMA_4x8_V17V26s3_V1xV0 - - "ld1 {v4.8h}, [%2], #16\n" - MMM_FMA_4x8_V6V15s3_V2xV0 - MMM_FMA_4x8_V18V27s3_V2xV0 - - "ld1 {v29.8h}, [%1], #16\n" - MMM_FMA_4x8_V7V16s3_V3xV0 - MMM_FMA_4x8_V19V28s3_V3xV0 - - //w- > v0, in0- > v2/v3/v1, out0- > v5~v28 - "ld1 {v30.8h}, [%1], #16\n" - "ld1 {v0.8h}, [%2], #16\n" - MMM_FMA_4x8_V5V14s3_V29xV4 - MMM_FMA_4x8_V17V26s3_V29xV4 - - "ld1 {v31.8h}, [%1], #16\n" - MMM_FMA_4x8_V6V15s3_V30xV4 - MMM_FMA_4x8_V18V27s3_V30xV4 - - "ld1 {v1.8h}, [%1], #16\n" - MMM_FMA_4x8_V7V16s3_V31xV4 - "subs x20, x20, #0x2\n" - MMM_FMA_4x8_V19V28s3_V31xV4 - - "bne 0b\n" - - "cbz %5, 1f\n" - "ld1 {v2.8h}, [%1], #16\n" - "ld1 {v3.8h}, [%1], #16\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - MMM_FMA_4x8_V17V26s3_V1xV0 - MMM_FMA_4x8_V6V15s3_V2xV0 - MMM_FMA_4x8_V18V27s3_V2xV0 - MMM_FMA_4x8_V7V16s3_V3xV0 - MMM_FMA_4x8_V19V28s3_V3xV0 - - "1:\n" - "mov x26, %0\n" - "st1 {v5.8h, v6.8h, v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.8h, v9.8h, v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.8h, v12.8h, v13.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.8h, v15.8h, v16.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v17.8h, v18.8h, v19.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v20.8h, v21.8h, v22.8h}, [x26]\n" - "add x26, x26, %4\n" - 
"st1 {v23.8h, v24.8h, v25.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v26.8h, v27.8h, v28.8h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x0", "x20", "x26", - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); -} - -void mmm_A76(int M, int N, int K, F16* matrix1, F16* matrix2, F16* tmp, F16* result) -{ - int blockK = K; - int blockM = 192; - F16* matrix1Trans = tmp; - F16* resultCurrent = result; - int KInner, MInner, m, n; - for(int k = 0; k < K; k += blockK) { - KInner = UNI_MIN(blockK, K - k); - for(int i = 0; i < M; i+=blockM) { - MInner = UNI_MIN(blockM, M - i); - for(n = 0; n <= N - 8; n+=8) { - if (i == 0) { - matrix1_trans(8, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); - } - for(m = 0; m <= (MInner-24); m+=24) { - resultCurrent = result + n * M + m + i; - mmm_8x24_A76(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - for(; m <=(MInner - 8); m+=8) { - resultCurrent = result + n * M + m + i; - mmm_8x8_A76(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - if ((MInner - m) >= 4) { - resultCurrent = result + n * M + m + i; - mmm_8x4_A76(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - m += 4; - } - - if (MInner - m) { - resultCurrent = result + n * M + m + i; - mmm_N8_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - } - - if ((N - n) >= 4) { - if (i == 0) { - matrix1_trans(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); - } - - for(m = 0; m <= (MInner - 24); m+=24) { - resultCurrent = result + n * M + m + i; - mmm_4x24_A76(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - for(; m <= (MInner - 8); m+=8) { - resultCurrent = result + n * M + m + i; - mmm_4x8_A76(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - if ((MInner - m) >= 4) { - resultCurrent = result + n * M + m + i; - mmm_4x4_A76(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - m += 4; - } - - if (MInner - m) { - resultCurrent = result + n * M + m + i; - mmm_N4_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - n += 4; - - } - - if (N - n) { - if (i == 0) { - matrix1_trans(N-n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); - } - - for(m = 0; m <= (MInner - 24); m+=24) { - resultCurrent = result + n * M + m + i; - mmm_NTail_M24(M, N - n, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - for(; m <= (MInner - 8); m+=8) { - resultCurrent = result + n * M + m + i; - mmm_NTail_M8(M, N - n, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - if ((MInner - m) >= 4) { - resultCurrent = result + n * M + m + i; - mmm_NTail_M4(M, N - n, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - m += 4; - } - - if (MInner - m) { - resultCurrent = result + n * M + m + i; - mmm_NTail_M(MInner - m, M, N - n, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - } - - } - } -} diff --git a/blas-enhance/src/cpu/arm/fp16/mmm_common.h b/blas-enhance/src/cpu/arm/fp16/mmm_common.h 
deleted file mode 100644 index 9934677e..00000000 --- a/blas-enhance/src/cpu/arm/fp16/mmm_common.h +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_MMM_COMMON -#define _H_MMM_COMMON -#include <string.h> -#include <arm_neon.h> - -#include "type.h" - - -inline void matrix1_trans(U32 size, U32 blockK, U32 K, F16* src, F16* dst) { - F16* src1 = src; - U32 offset; - for (U32 i = 0; i < blockK; i++) { - for (U32 j = 0; j < size; j++) { - src1 = src + j * K; - offset = 8 * blockK; - if (i % 32 == 0) { - asm volatile( - "prfm pldl2keep, [%0, %1]\n" - :"+r" (src1) - :"r"((I64)offset) - :"memory","cc" - ); - } - *dst++ = *(src1 + i); - } - } -} - -inline void matrix2_trans(U32 size, U32 blockK, U32 M, F16* src, F16* dst) { - for (U32 i = 0; i < blockK; i++) { - asm volatile( - "prfm pldl2keep, [%0, #48]\n" - :"+r" (src) - : - :"memory","cc" - ); - memcpy(dst, src, size * sizeof(F16)); - dst += size; - src += M; - } -} - -inline void mmm_NTail_M24(U32 M, U32 N, U32 K, F16* matrix1, F16* matrix2, F16* result) { - float16x8x3_t mat2, res; - for (U32 i = 0; i < N; i++) { - res = vld3q_f16(result + i * M); - for (U32 q = 0; q < K; q+=1) { - mat2 = vld3q_f16(matrix2 + q * 24); - res.val[0] = vfmaq_n_f16(res.val[0], mat2.val[0], matrix1[q * N + i]); - res.val[1] = vfmaq_n_f16(res.val[1], mat2.val[1], matrix1[q * N + i]); - res.val[2] = vfmaq_n_f16(res.val[2], mat2.val[2], matrix1[q * N + i]); - } - vst3q_f16(result + i * M, res); - } -} - -inline void mmm_NTail_M8(U32 M, U32 N, U32 K, F16* matrix1, F16* matrix2, F16* result) { - float16x8_t mat2, res; - for (U32 i = 0; i < N; i++) { - res = vld1q_f16(result + i * M); - for (U32 q = 0; q < K; q+=1) { - mat2 = vld1q_f16(matrix2 + q * 8); - res = vfmaq_n_f16(res, mat2, matrix1[q * N + i]); - } - vst1q_f16(result + i * M, res); - } -} - -inline void mmm_NTail_M4(U32 M, U32 N, U32 K, F16* matrix1, F16* matrix2, F16* result) { - float16x4_t mat2, res; - for (U32 i = 0; i < N; i++) { - res = vld1_f16(result + i * M); - for (U32 q = 0; q < K; q+=1) { - mat2 = vld1_f16(matrix2 + q * 4); - res = vfma_n_f16(res, mat2, matrix1[q * N + i]); - } - vst1_f16(result + i * M, res); - } -} - -inline void mmm_NTail_M(U32 MInner, U32 M, U32 N, U32 K, F16* matrix1, F16* matrix2, F16* result) { - for(U32 i = 0; i < N; i++) { - for(U32 j = 0; j < MInner; j++) { - for(U32 k = 0; k < K; k++) { - result[i * M + j] += *(matrix1 + k * N + i) * *(matrix2 + k * MInner + j); - } - } - } -} - -inline
void mmm_N8_MTail(U32 MInner, U32 M, U32 K, F16* matrix1, F16* matrix2, F16* result) { - float16x8_t mat1 = {0}, res[4] = {0}; - F16 tmp[8] = {0}; - CHECK_REQUIREMENT(MInner < 4); - - for(U32 i = 0; i < K; i++) { - mat1 = vld1q_f16(matrix1 + i * 8); - for(U32 j = 0; j < MInner; j++) { - res[j] = vfmaq_n_f16(res[j], mat1, matrix2[j + i * MInner]); - } - } - for(U32 p = 0; p < MInner; p++) { - vst1q_f16(tmp, res[p]); - for(U32 q = 0; q < 8; q++) { - result[q * M + p] += tmp[q]; - } - res[p] = vdupq_n_f16(0); - } -} - -inline void mmm_N4_MTail(U32 MInner, U32 M, U32 K, F16* matrix1, F16* matrix2, F16* result) { - float16x4_t mat1 = {0}, res[4] = {0}; - F16 tmp[4] = {0}; - CHECK_REQUIREMENT(MInner < 4); - - for(U32 i = 0; i < K; i++) { - mat1 = vld1_f16(matrix1 + i * 4); - for(U32 j = 0; j < MInner; j++) { - res[j] = vfma_n_f16(res[j], mat1, matrix2[j + i * MInner]); - } - } - for(U32 p = 0; p < MInner; p++) { - vst1_f16(tmp, res[p]); - for(U32 q = 0; q < 4; q++) { - result[q * M + p] += tmp[q]; - } - res[p] = vdup_n_f16(0); - } -} -#endif diff --git a/blas-enhance/src/cpu/arm/fp16/mvm.cpp b/blas-enhance/src/cpu/arm/fp16/mvm.cpp deleted file mode 100644 index 6944482e..00000000 --- a/blas-enhance/src/cpu/arm/fp16/mvm.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "error.h" -#include "cpu/arm/fp16/blas_fp16.h" -#include "cpu/arm/fp16/mvm.h" - - -EE mvm_fp16(U32 row, U32 col, bool transpose, F16* matrix, F16* vector, F16* result, Arch arch) { - EE ret = SUCCESS; - switch (arch) { - case ARM_A55: - mvm_A55(row, col, transpose, matrix, vector, result); - break; - case ARM_A76: - mvm_A76(row, col, transpose, matrix, vector, result); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/blas-enhance/src/cpu/arm/fp16/mvm.h b/blas-enhance/src/cpu/arm/fp16/mvm.h deleted file mode 100644 index a4b059d1..00000000 --- a/blas-enhance/src/cpu/arm/fp16/mvm.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
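mvm_fp16 above only selects the per-core kernel; the row and column logic lives in mvm_A55.cpp and mvm_A76.cpp below. A minimal usage sketch (the dimensions and the ARM_A76 target are assumed for illustration; the kernels accumulate into result, so it must start at zero or hold a bias):

F16 matrix[64 * 128];    // row-major 64 x 128 for the non-transposed path
F16 vector[128];         // fill matrix and vector before the call
F16 result[64] = {0};    // read as the initial value, then accumulated in place
CHECK_STATUS(mvm_fp16(64, 128, false, matrix, vector, result, ARM_A76));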
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_MVM -#define _H_MVM - -#include "type.h" - -void mvm_A55(U32 row, U32 col, bool transpose, F16* matrix, F16* vector, F16* result); - -void mvm_A76(U32 row, U32 col, bool transpose, F16* matrix, F16* vector, F16* result); -#endif diff --git a/blas-enhance/src/cpu/arm/fp16/mvm_A55.cpp b/blas-enhance/src/cpu/arm/fp16/mvm_A55.cpp deleted file mode 100644 index e468f4e3..00000000 --- a/blas-enhance/src/cpu/arm/fp16/mvm_A55.cpp +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
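Both mvm_row kernels that follow keep four 8-lane FP16 accumulators (v9 to v12, one per matrix row group) and collapse them with the four faddp instructions after the main loop. An intrinsics sketch of that reduction (assumed equivalent; vpaddq_f16 is the same pairwise add mvm_common.h uses):

static inline float16x4_t mvm_reduce4_sketch(
    float16x8_t acc0, float16x8_t acc1, float16x8_t acc2, float16x8_t acc3)
{
    float16x8_t t01 = vpaddq_f16(acc0, acc1);  // faddp v13.8h, v9.8h, v10.8h
    float16x8_t t23 = vpaddq_f16(acc2, acc3);  // faddp v14.8h, v11.8h, v12.8h
    float16x8_t t = vpaddq_f16(t01, t23);      // faddp v15.8h, v13.8h, v14.8h
    t = vpaddq_f16(t, t);                      // faddp v17.8h, v15.8h, v15.8h
    return vget_low_f16(t);                    // lanes 0..3: the four dot products
}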
- - -#include - -#include "mvm_common.h" -#include "mvm.h" - - -inline void mvm_row_kernel_A55(U32 N, U32 K, F16* matrix, F16* vector, F16* result) { - U32 KTail = K % 8; - U32 KInner = K - KTail; - F16* w0 = matrix; - F16* w1 = matrix + K * N/2; - F16* w2 = matrix + K * 2 * N/2; - F16* w3 = matrix + K * 3 * N/2; - - asm volatile( - "mov x19, %5\n" - "ld1 {v18.h}[0], [x19]\n" - "add x19, x19, %8\n" - "ld1 {v18.h}[1], [x19]\n" - "add x19, x19, %8\n" - "ld1 {v18.h}[2], [x19]\n" - "add x19, x19, %8\n" - "ld1 {v18.h}[3], [x19]\n" - - "movi v17.8h, #0x0\n" - "movi v16.8h, #0x0\n" - "movi v9.8h, #0x0\n" - "movi v10.8h, #0x0\n" - "movi v11.8h, #0x0\n" - "movi v12.8h, #0x0\n" - "mov x20, %6\n" - "cmp x20, #0x0\n" - "beq 3f\n" - "0:\n" - - "ld1 {v0.4h}, [%0], #8\n" - "ldr x15, [%0], #8\n" - "ins v0.d[1], x15\n" - - "ld1 {v1.4h}, [%1], #8\n" - "ld1 {v2.4h}, [%2], #8\n" - "ldr x21, [%1], #8\n" - "ldr x22, [%2], #8\n" - "ins v1.d[1], x21\n" - "ins v2.d[1], x22\n" - - "ld1 {v3.4h}, [%3], #8\n" - "fmla v9.8h, v1.8h, v0.8h\n" - "ld1 {v4.4h}, [%4], #8\n" - "fmla v10.8h, v2.8h, v0.8h\n" - "ldr x23, [%3], #8\n" - "ldr x24, [%4], #8\n" - "ins v3.d[1], x23\n" - "ins v4.d[1], x24\n" - "fmla v11.8h, v3.8h, v0.8h\n" - "fmla v12.8h, v4.8h, v0.8h\n" - - "subs x20, x20, 0x8\n" - "bne 0b\n" - - "faddp v13.8h, v9.8h, v10.8h\n" - "faddp v14.8h, v11.8h, v12.8h\n" - "faddp v15.8h, v13.8h, v14.8h\n" - "faddp v17.8h, v15.8h, v15.8h\n" - "3:\n" - "mov x16, %7\n" - "cmp x16, #0x0\n" - "beq 2f\n" - - "1:\n" - "ld1 {v8.h}[0], [%0], #2\n" - - "ld1 {v1.h}[0], [%1], #2\n" - "ld1 {v1.h}[1], [%2], #2\n" - "ld1 {v1.h}[2], [%3], #2\n" - "ld1 {v1.h}[3], [%4], #2\n" - "fmla v16.8h, v1.8h, v8.h[0]\n" - - "subs x16, x16, 0x1\n" - "bne 1b\n" - - "fadd v17.8h, v17.8h, v16.8h\n" - - "2:\n" - - "fadd v17.8h, v17.8h, v18.8h\n" - - "mov x19, %5\n" - "st1 {v17.h}[0], [x19]\n" - "add x19, x19, %8\n" - "st1 {v17.h}[1], [x19]\n" - "add x19, x19, %8\n" - "st1 {v17.h}[2], [x19]\n" - "add x19, x19, %8\n" - "st1 {v17.h}[3], [x19]\n" - - :"+r" (vector), - "+r" (w0), - "+r" (w1), - "+r" (w2), - "+r" (w3), - "+r" (result) - :"r" ((I64)KInner), - "r" ((I64)KTail), - "r" ((I64)N) - :"memory", "cc", "x19", "x20", "x21", "x22", "x23", "x24", "x15", "x16", - "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18" - ); -} - -inline void mvm_row_A55(U32 numRows, U32 numColumns, F16* matrix, F16* vector, F16* result) { - //Actual layout is NK, and vector is K - U32 N = numRows; - U32 K = numColumns; - U32 NTail = N % 4; - U32 NInner = N / 4; - for(U32 i = 0; i < NInner; i++) { - mvm_row_kernel_A55(NInner * 2, K, matrix + i * K, vector, result + i); - } - if (NTail != 0) { - mvm_row_tail(NTail, K, matrix + (N - NTail) * K, vector, result + (N - NTail)); - } -} - -void mvm_A55(U32 row, U32 col, bool transpose, F16* matrix, F16* vector, F16* result) { - if (transpose) - mvm_col(row, col, matrix, vector, result); - else - mvm_row_A55(row, col, matrix, vector, result); -} diff --git a/blas-enhance/src/cpu/arm/fp16/mvm_A76.cpp b/blas-enhance/src/cpu/arm/fp16/mvm_A76.cpp deleted file mode 100644 index 135bfa51..00000000 --- a/blas-enhance/src/cpu/arm/fp16/mvm_A76.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
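mvm_A76.cpp below is the same kernel with one scheduling change: the in-order A55 splits each 128-bit load into a 64-bit ld1 plus an ldr/ins pair so the loads can dual-issue next to the fmla stream, while the out-of-order A76 issues plain full-width loads:

// A55 idiom (from mvm_row_kernel_A55 above):
//   "ld1 {v1.4h}, [%1], #8\n"  "ldr x21, [%1], #8\n"  "ins v1.d[1], x21\n"
// A76 idiom (from mvm_row_kernel_A76 below):
//   "ld1 {v1.8h}, [%1], #16\n"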
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include - -#include "mvm_common.h" -#include "mvm.h" - - -inline void mvm_row_kernel_A76(U32 N, U32 K, F16* matrix, F16* vector, F16* result) { - U32 KTail = K % 8; - U32 KInner = K - KTail; - F16* w0 = matrix; - F16* w1 = matrix + K * N/2; - F16* w2 = matrix + K * 2 * N/2; - F16* w3 = matrix + K * 3 * N/2; - asm volatile( - "mov x19, %5\n" - "ld1 {v18.h}[0], [x19]\n" - "add x19, x19, %8\n" - "ld1 {v18.h}[1], [x19]\n" - "add x19, x19, %8\n" - "ld1 {v18.h}[2], [x19]\n" - "add x19, x19, %8\n" - "ld1 {v18.h}[3], [x19]\n" - - "movi v17.8h, #0x0\n" - "movi v16.8h, #0x0\n" - "movi v9.8h, #0x0\n" - "movi v10.8h, #0x0\n" - "movi v11.8h, #0x0\n" - "movi v12.8h, #0x0\n" - "mov x20, %6\n" - "cmp x20, #0x0\n" - "beq 3f\n" - "0:\n" - - "ld1 {v0.8h}, [%0], #16\n" - "ld1 {v1.8h}, [%1], #16\n" - "ld1 {v2.8h}, [%2], #16\n" - "ld1 {v3.8h}, [%3], #16\n" - "ld1 {v4.8h}, [%4], #16\n" - - "fmla v9.8h, v1.8h, v0.8h\n" - "fmla v10.8h, v2.8h, v0.8h\n" - "fmla v11.8h, v3.8h, v0.8h\n" - "fmla v12.8h, v4.8h, v0.8h\n" - - "subs x20, x20, 0x8\n" - "bne 0b\n" - - "faddp v13.8h, v9.8h, v10.8h\n" - "faddp v14.8h, v11.8h, v12.8h\n" - "faddp v15.8h, v13.8h, v14.8h\n" - "faddp v17.8h, v15.8h, v15.8h\n" - "3:\n" - "mov x16, %7\n" - "cmp x16, #0x0\n" - "beq 2f\n" - - "1:\n" - "ld1 {v8.h}[0], [%0], #2\n" - - "ld1 {v1.h}[0], [%1], #2\n" - "ld1 {v1.h}[1], [%2], #2\n" - "ld1 {v1.h}[2], [%3], #2\n" - "ld1 {v1.h}[3], [%4], #2\n" - "fmla v16.8h, v1.8h, v8.h[0]\n" - - "subs x16, x16, 0x1\n" - "bne 1b\n" - - "fadd v17.8h, v17.8h, v16.8h\n" - - "2:\n" - - "fadd v17.8h, v17.8h, v18.8h\n" - "mov x19, %5\n" - "st1 {v17.h}[0], [x19]\n" - "add x19, x19, %8\n" - "st1 {v17.h}[1], [x19]\n" - "add x19, x19, %8\n" - "st1 {v17.h}[2], [x19]\n" - "add x19, x19, %8\n" - "st1 {v17.h}[3], [x19]\n" - :"+r" (vector), - "+r" (w0), - "+r" (w1), - "+r" (w2), - "+r" (w3), - "+r" (result) - :"r" ((I64)KInner), - "r" ((I64)KTail), - "r" ((I64)N) - :"memory", "cc", "x19", "x20", "x21", "x22", "x23", "x24", "x15", "x16", - "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18" - ); -} - -inline void mvm_row_A76(U32 numRows, U32 numColumns, F16* matrix, F16* vector, F16* result) { - //Actual layout is NK, and vector is K - U32 N = numRows; - U32 K = numColumns; - U32 NTail = N % 4; - U32 NInner = N / 4; - for (U32 i = 0; i < NInner; i++) { - mvm_row_kernel_A76(NInner * 2, K, matrix + i * K, vector, result + i); - } - if (NTail != 0) { - mvm_row_tail(NTail, K, matrix + (N 
- NTail) * K, vector, result + (N - NTail)); - } -} - -void mvm_A76(U32 row, U32 col, bool transpose, F16* matrix, F16* vector, F16* result) { - if (transpose) - mvm_col(row, col, matrix, vector, result); - else - mvm_row_A76(row, col, matrix, vector, result); -} diff --git a/blas-enhance/src/cpu/arm/fp16/mvm_common.h b/blas-enhance/src/cpu/arm/fp16/mvm_common.h deleted file mode 100644 index dfad769d..00000000 --- a/blas-enhance/src/cpu/arm/fp16/mvm_common.h +++ /dev/null @@ -1,260 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_MVM_COMMON -#define _H_MVM_COMMON - -#include -#include "type.h" -#include "arm_neon_expand.h" - -inline void mvm_row_tail(U32 N, U32 K, F16* matrix, F16* vector, F16* result) { - float16x8_t vec, res, mat; - U32 KTail = K % 8; - U32 KInner = K - KTail; - - for (U32 i = 0; i < N; i+=1) { - res = vdupq_n_f16(0); - - for (U32 j = 0; j < KInner; j+=8) { - vec = vld1q_f16(&vector[j]); - mat = vld1q_f16(&matrix[j + K * i]); - res = vfmaq_f16(res, vec, mat); - } - result[i] += vaddvq_f16(res); - - if (KTail != 0) { - for (U32 p = 0; p < KTail; p+=1) { - result[i] += vector[p + KInner] * matrix[KInner + p + K * i]; - } - } - - } -} - -inline void mvm_col_tail(U32 N, U32 K, F16* matrix, F16* vector, F16* result) { - float16x8_t tmp, res, mat; - U32 NTail = N % 8; - U32 NInner = N - NTail; - - for (U32 i = 0; i < K; i+=1) { - for (U32 j = 0; j < NInner; j+=8) { - tmp = vld1q_f16(result + j); - mat = vld1q_f16(&matrix[j + N * i]); - res = vfmaq_n_f16(tmp, mat, vector[i]); - vst1q_f16(result + j, res); - } - if (NTail != 0) { - for (U32 p = 0; p < NTail; p+=1) { - result[NInner + p] += vector[i] * matrix[NInner + N * i + p]; - } - } - } -} - -inline void mvm_col_kernel(U32 N, U32 K, F16* matrix, F16* vector, F16* result) { - float16x8_t mat[4] = {0}; - - F16* w0 = matrix; - F16* w1 = matrix + K * N; - F16* w2 = matrix + 2 * K * N; - F16* w3 = matrix + 3 * K * N; - - U32 N_tail = N % 8; - U32 N_inner = N - N_tail; - - for(U32 i = 0; i < K; i+=1) { - for(U32 j = 0; j < N_inner; j+=8) { - - float16x8_t res[4] = {0}; - - res[3] = vld1q_f16(result + j); - mat[0] = vld1q_f16(w0); - mat[1] = vld1q_f16(w1); - mat[2] = vld1q_f16(w2); - mat[3] = vld1q_f16(w3); - - res[0] = vfmaq_n_f16(res[3], mat[0], vector[i]); - res[1] = vfmaq_n_f16(res[0], mat[1], vector[K + i]); - res[2] = vfmaq_n_f16(res[1], mat[2], vector[2 * K + i]); - res[3] = vfmaq_n_f16(res[2], mat[3], vector[3 * K + i]); - - w0 += 8; - 
w1 += 8; - w2 += 8; - w3 += 8; - vst1q_f16(result + j, res[3]); - - } - if (N_tail != 0) { - for(U32 p = 0; p < N_tail; p+=1) { - result[N_inner + p] += vector[i] * *w0++; - result[N_inner + p] += vector[i + K] * *w1++; - result[N_inner + p] += vector[i + 2 * K] * *w2++; - result[N_inner + p] += vector[i + 3 * K] * *w3++; - } - } - } -} - -inline void mvm_col_kernel_4x8(U32 N, U32 K, F16* matrix, F16* vector, F16* result) { - F16* result_end8 = result + N / 8 * 8; - F16* result_end = result + N; - asm volatile( - "mov x20, %0\n" - "add x21, x20, %5\n" - "add x22, x21, %5\n" - "add x23, x22, %5\n" - "mov x24, %1\n" - "add x25, x24, %6\n" - "add x26, x25, %6\n" - "add x27, x26, %6\n" - "mov x29, x21\n" - - "00:\n" - "cmp x20, x29\n" - "bge 01f\n" - "ldr h0, [x20], 2\n" - "dup v0.8h, v0.h[0]\n" - "ldr h1, [x21], 2\n" - "dup v1.8h, v1.h[0]\n" - "ldr h2, [x22], 2\n" - "dup v2.8h, v2.h[0]\n" - "ldr h3, [x23], 2\n" - "dup v3.8h, v3.h[0]\n" - - "mov x28, %2\n" - - "10:\n" - "cmp x28, %3\n" - "bge 11f\n" - "ldr q4, [x28]\n" - "ldr q8, [x24], 16\n" - "ldr q9, [x25], 16\n" - "ldr q10, [x26], 16\n" - "fmla v4.8h, v8.8h, v0.8h\n" - "ldr q11, [x27], 16\n" - "fmla v4.8h, v9.8h, v1.8h\n" - "fmla v4.8h, v10.8h, v2.8h\n" - "fmla v4.8h, v11.8h, v3.8h\n" - "str q4, [x28], 16\n" - "b 10b\n" - - "11:\n" - "cmp x28, %4\n" - "bge 12f\n" - "ldr h4, [x28]\n" - "ldr h8, [x24], 2\n" - "ldr h9, [x25], 2\n" - "ldr h10, [x26], 2\n" - "fmla h4, h8, v0.h[0]\n" - "ldr h11, [x27], 2\n" - "fmla h4, h9, v1.h[0]\n" - "fmla h4, h10, v2.h[0]\n" - "fmla h4, h11, v3.h[0]\n" - "str h4, [x28], 2\n" - "b 11b\n" - - "12:\n" - "b 00b\n" - "01:\n" - :"+r" (vector), - "+r" (matrix), - "+r" (result), - "+r" (result_end8), - "+r" (result_end) - :"r" ((I64)K*2), - "r" ((I64)K*N*2) - :"memory", "cc", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29", - "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11" - ); -} - -inline void mvm_row_kernel(U32 N, U32 K, F16* matrix, F16* vector, F16* result) { - float16x8_t res[4] = {0}, mat[4] = {0} , vec; - float16x8_t tmp[6] = {0}; - - F16* w0 = matrix; - F16* w1 = matrix + K * N; - F16* w2 = matrix + 2 * K * N; - F16* w3 = matrix + 3 * K * N; - - U32 K_tail = K % 8; - U32 K_inner = K - K_tail; - - for (U32 i = 0; i < N; i+=1) { - for (U32 j = 0; j < K_inner; j+=8) { - - vec = vld1q_f16(&vector[j]); - - mat[0] = vld1q_f16(w0); - mat[1] = vld1q_f16(w1); - mat[2] = vld1q_f16(w2); - mat[3] = vld1q_f16(w3); - for(U32 k = 0; k < 4; k++) { - res[k] = vfmaq_f16(res[k], vec , mat[k]); - } - w0 += 8; - w1 += 8; - w2 += 8; - w3 += 8; - - } - - for(U32 m = 0; m < 2; m++) { - tmp[m] = vpaddq_f16(res[m * 2], res[m * 2 + 1]); - } - tmp[4] = vpaddq_f16(tmp[0], tmp[1]); - tmp[5] = vpaddq_f16(tmp[4], tmp[3]); - F16 addbias; - for(U32 n = 0; n < 4; n++) { - vst1q_lane_f16_builtin(&addbias, tmp[5], n); - result[i + N * n] += addbias; - res[n] = vdupq_n_f16(0); - } - - if (K_tail != 0) { - for (U32 p = 0; p < K_tail; p += 1) { - *(result + i) += vector[p + K_inner] * *w0++; - *(result + N + i) += vector[p + K_inner] * *w1++; - *(result + 2*N + i) += vector[p + K_inner] * *w2++; - *(result + 3*N + i) += vector[p + K_inner] * *w3++; - } - } - - } -} - -inline void mvm_col(U32 numRows, U32 numColumns, F16* matrix, F16* vector, F16* result) { - //Actual layout is KN, and vector is K - U32 N = numRows; - U32 K = numColumns; - U32 KInner = K / 4; - U32 KTail = K % 4; - mvm_col_kernel_4x8(N, KInner, matrix, vector, result); - if (KTail != 0) { - mvm_col_tail(N, KTail, matrix + (K - KTail) * N, vector + (K - 
KTail), result); - } -} - -//N is number of rows, K for columns -inline void mvm_row(U32 N, U32 K, F16* matrix, F16* vector, F16* result) { - U32 NInner = (N / 4); - U32 NTail = N % 4 ; - mvm_row_kernel(NInner, K, matrix, vector, result); - if (NTail != 0) { - mvm_row_tail(NTail, K, matrix + (N - NTail) * K, vector, result + N - NTail); - } -} -#endif diff --git a/blas-enhance/src/cpu/arm/fp32/blas_fp32.h b/blas-enhance/src/cpu/arm/fp32/blas_fp32.h deleted file mode 100644 index 2fc08be2..00000000 --- a/blas-enhance/src/cpu/arm/fp32/blas_fp32.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_BLAS_FP32 -#define _H_BLAS_FP32 - -#include "sys.h" -#include "type.h" -#include "error.h" -#include "tensor_desc.h" -#include "arm_neon_expand.h" - -void mvm_col_V8(U32 row, U32 col, F32* matrix, F32* vector, F32* result); - -void mvm_row_V8(U32 row, U32 col, F32* matrix, F32* vector, F32* result); - -inline EE mvm_fp32(U32 row, U32 col, bool transpose, F32* matrix, F32* vector, F32* result) -{ - if (transpose) { - mvm_col_V8(row, col, matrix, vector, result); - } else { - mvm_row_V8(row, col, matrix, vector, result); - } - return SUCCESS; -} - -void matrix_matrix_multiply_tmp_bytes_fp32(U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes); - -EE matrix_matrix_multiply_transform_rhsN_fp32(TensorDesc desc, F32* src, F32* dst); - -EE matrix_matrix_multiply_transform_rhsT_fp32(TensorDesc desc, F32* src, F32* dst); - -#ifdef __aarch64__ -EE mmm_fp32_V8(int M, int N, int K, F32* matrix1, F32* matrix2, F32* tmp, F32* result); -#else -EE mmm_fp32_V7(int M, int N, int K, F32* matrix1, F32* matrix2, F32* tmp, F32* result); -#endif - -#endif diff --git a/blas-enhance/src/cpu/arm/fp32/mmm_V7.cpp b/blas-enhance/src/cpu/arm/fp32/mmm_V7.cpp deleted file mode 100644 index e1adf11b..00000000 --- a/blas-enhance/src/cpu/arm/fp32/mmm_V7.cpp +++ /dev/null @@ -1,511 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
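In the column path just above, the matrix is stored K x N, so each vector element scales one full matrix row into the N-element result; mvm_col_kernel_4x8 merely performs this for four chunks of the K dimension at once. A scalar reference of the update (a sketch matching mvm_col_tail):

static inline void mvm_col_ref(U32 N, U32 K,
    const F16 *matrix, const F16 *vector, F16 *result)
{
    for (U32 k = 0; k < K; k++) {        // one matrix row per vector element
        for (U32 n = 0; n < N; n++) {
            result[n] += vector[k] * matrix[k * N + n];
        }
    }
}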
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#ifndef __aarch64__ -#include -#include "cpu/arm/fp32/blas_fp32.h" -#include "error.h" -#include "type.h" - -void matrix_matrix_multiply_tmp_bytes_fp32( - U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes) -{ - *bytes = row1 * col1 + row2 * col2; - *bytes *= bytesOf(dt); - *bytes += 32; -} - -void matrix1_trans(U32 size, U32 blockK, U32 K, F32 *src, F32 *dst) -{ - F32 *src1 = src; - for (U32 i = 0; i < blockK; i++) { - for (U32 j = 0; j < size; j++) { - src1 = src + j * K; - if (i % 16 == 0) { - __builtin_prefetch(src1 + 16); - } - *dst++ = *(src1 + i); - } - } -} - -void matrix2_trans(U32 size, U32 blockK, U32 M, F32 *src, F32 *dst) -{ - for (U32 i = 0; i < blockK; i++) { - if (i % 16 == 0) { - __builtin_prefetch(src + 16); - } - memcpy(dst, src, size * sizeof(F32)); - dst += size; - src += M; - } -} - -EE matrix_matrix_multiply_transform_rhsN_fp32(TensorDesc desc, F32 *src, F32 *dst) -{ - DataType dt; - U32 N, K; - CHECK_STATUS(tensor2dGet(desc, &dt, &K, &N)); - int i = 0; - for (; i < (int)N - 7; i += 8) { - matrix2_trans(8, K, N, src + i, dst + i * K); - } - for (; i < (int)N - 3; i += 4) { - matrix2_trans(4, K, N, src + i, dst + i * K); - } - if ((int)N > i) { - matrix2_trans(N - i, K, N, src + i, dst + i * K); - } - return SUCCESS; -} - -EE matrix_matrix_multiply_transform_rhsT_fp32(TensorDesc desc, F32 *src, F32 *dst) -{ - DataType dt; - U32 N, K; - CHECK_STATUS(tensor2dGet(desc, &dt, &N, &K)); - int i = 0; - for (; i < (int)N - 7; i += 8) { - matrix1_trans(8, K, K, src + i * K, dst + i * K); - } - for (; i < (int)N - 3; i += 4) { - matrix1_trans(4, K, K, src + i * K, dst + i * K); - } - if ((int)N > i) { - matrix1_trans(N - i, K, K, src + i * K, dst + i * K); - } - return SUCCESS; -} - -void mmm_NTail_M8(U32 M, U32 N, U32 K, F32 *matrix1, F32 *matrix2, F32 *result) -{ - float32x4x2_t mat2, res; - for (U32 i = 0; i < N; i++) { - res = vld2q_f32(result + i * M); - for (U32 q = 0; q < K; q++) { - mat2 = vld2q_f32(matrix2 + q * 8); - res.val[0] = vfmaq_n_f32(res.val[0], mat2.val[0], matrix1[q * N + i]); - res.val[1] = vfmaq_n_f32(res.val[1], mat2.val[1], matrix1[q * N + i]); - } - vst2q_f32(result + i * M, res); - } -} - -void mmm_NTail_M4(U32 M, U32 N, U32 K, F32 *matrix1, F32 *matrix2, F32 *result) -{ - float32x4_t mat2, res; - for (U32 i = 0; i < N; i++) { - res = vld1q_f32(result + i * M); - for (U32 q = 0; q < K; q++) { - mat2 = vld1q_f32(matrix2 + q * 4); - res = vfmaq_n_f32(res, mat2, matrix1[q * N + i]); - } - 
vst1q_f32(result + i * M, res); - } -} - -void mmm_NTail_M(U32 MInner, U32 M, U32 N, U32 K, F32 *matrix1, F32 *matrix2, F32 *result) -{ - for (U32 i = 0; i < N; i++) { - for (U32 j = 0; j < MInner; j++) { - for (U32 k = 0; k < K; k++) { - result[i * M + j] += *(matrix1 + k * N + i) * *(matrix2 + k * MInner + j); - } - } - } -} - -void mmm_N6_MTail(U32 MInner, U32 M, U32 K, F32 *matrix1, F32 *matrix2, F32 *result) -{ - float32x2_t mat1[3] = {0}, res[4][3] = {{0}}; - F32 tmp[6] = {0}; - CHECK_REQUIREMENT(MInner < 4); - - for (U32 i = 0; i < K; i++) { - mat1[0] = vld1_f32(matrix1 + i * 6); - mat1[1] = vld1_f32(matrix1 + i * 6 + 2); - mat1[2] = vld1_f32(matrix1 + i * 6 + 4); - for (U32 j = 0; j < MInner; j++) { - res[j][0] = vmla_n_f32(res[j][0], mat1[0], matrix2[j + i * MInner]); - res[j][1] = vmla_n_f32(res[j][1], mat1[1], matrix2[j + i * MInner]); - res[j][2] = vmla_n_f32(res[j][2], mat1[2], matrix2[j + i * MInner]); - } - } - for (U32 p = 0; p < MInner; p++) { - vst1_f32(tmp, res[p][0]); - vst1_f32(tmp + 2, res[p][1]); - vst1_f32(tmp + 4, res[p][2]); - for (U32 q = 0; q < 6; q++) { - result[q * M + p] += tmp[q]; - } - } -} - -void mmm_N4_MTail(U32 MInner, U32 M, U32 K, F32 *matrix1, F32 *matrix2, F32 *result) -{ - float32x4_t mat1 = {0}, res[4] = {0}; - F32 tmp[4] = {0}; - CHECK_REQUIREMENT(MInner < 4); - - for (U32 i = 0; i < K; i++) { - mat1 = vld1q_f32(matrix1 + i * 4); - for (U32 j = 0; j < MInner; j++) { - res[j] = vfmaq_n_f32(res[j], mat1, matrix2[j + i * MInner]); - } - } - for (U32 p = 0; p < MInner; p++) { - vst1q_f32(tmp, res[p]); - for (U32 q = 0; q < 4; q++) { - result[q * M + p] += tmp[q]; - } - } -} - -void mmm_4x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) -{ - asm volatile( - "vld1.f32 {d0-d1}, [%[in]]!\n" - - "vld1.f32 {d4-d5}, [%[w]]!\n" - - //K- > r2 - "mov r2, %[K]\n" - - //give out address to r1 - "mov r1, %[out]\n" - - //load in bias - "vld1.f32 {d8-d9}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d12-d13}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d16-d17}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d20-d21}, [r1]\n" - - //Computation loop - "0:\n" - - "vmla.f32 q4, q2, d0[0]\n" - "vmla.f32 q6, q2, d0[1]\n" - "vmla.f32 q8, q2, d1[0]\n" - "vmla.f32 q10, q2, d1[1]\n" - - "vld1.f32 {d4-d5}, [%[w]]!\n" - "subs r2, r2, #1\n" - - "vld1.f32 {d0-d1}, [%[in]]!\n" - "bne 0b\n" - - //give out address to r1 - "mov r1, %[out]\n" - - "vst1.f32 {d8-d9}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d12-d13}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d16-d17}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d20-d21}, [r1]\n" - - : [in]"+r"(in), [w]"+r"(w), [out]"+r"(out) - : [K]"r"(K), [offset]"r"(offset) - : "memory", "cc", "q0", "q2", "q4", "q6", "q8", "q10", "r1", "r2"); -} - -void mmm_6x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) -{ - asm volatile( - "vld1.f32 {d0-d2}, [%[in]]!\n" - - "vld1.f32 {d4-d5}, [%[w]]!\n" - - //K- > r2 - "mov r2, %[K]\n" - - //give out address to r1 - "mov r1, %[out]\n" - - //load in bias - "vld1.f32 {d8-d9}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d12-d13}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d16-d17}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d20-d21}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d24-d25}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d28-d29}, [r1]\n" - - //Computation loop - "0:\n" - - "vmla.f32 q4, q2, d0[0]\n" - "vmla.f32 q6, q2, d0[1]\n" - "vmla.f32 q8, q2, d1[0]\n" - "vmla.f32 q10, q2, d1[1]\n" - "vmla.f32 q12, q2, d2[0]\n" - "vmla.f32 q14, q2, d2[1]\n" - - 
"vld1.f32 {d4-d5}, [%[w]]!\n" - "subs r2, r2, #1\n" - - "vld1.f32 {d0-d2}, [%[in]]!\n" - "bne 0b\n" - - //give out address to r1 - "mov r1, %[out]\n" - - "vst1.f32 {d8-d9}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d12-d13}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d16-d17}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d20-d21}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d24-d25}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d28-d29}, [r1]\n" - - : [in]"+r"(in), [w]"+r"(w), [out]"+r"(out) - : [K]"r"(K), [offset]"r"(offset) - : "memory", "cc", "q0", "q1", "q2", "q4", "q6", "q8", "q10", - "q12", "q14", "r1", "r2"); -} - -void mmm_4x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) -{ - asm volatile( - "vld1.f32 {d0-d1}, [%[in]]!\n" - - "vld1.f32 {d4-d7}, [%[w]]!\n" - - //K- > r2 - "mov r2, %[K]\n" - - //give out address to r1 - "mov r1, %[out]\n" - - //load in bias - "vld1.f32 {d8-d11}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d12-d15}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d16-d19}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d20-d23}, [r1]\n" - - //Computation loop - "0:\n" - - "vmla.f32 q4, q2, d0[0]\n" - "vmla.f32 q6, q2, d0[1]\n" - "vmla.f32 q8, q2, d1[0]\n" - "vmla.f32 q10, q2, d1[1]\n" - - "vld1.f32 {d4-d5}, [%[w]]!\n" - - "vmla.f32 q5, q3, d0[0]\n" - "vmla.f32 q7, q3, d0[1]\n" - "vmla.f32 q9, q3, d1[0]\n" - "vmla.f32 q11, q3, d1[1]\n" - - "vld1.f32 {d6-d7}, [%[w]]!\n" - "subs r2, r2, #1\n" - - "vld1.f32 {d0-d1}, [%[in]]!\n" - "bne 0b\n" - - //give out address to r1 - "mov r1, %[out]\n" - - "vst1.f32 {d8-d11}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d12-d15}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d16-d19}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d20-d23}, [r1]\n" - - : [in]"+r"(in), [w]"+r"(w), [out]"+r"(out) - : [K]"r"(K), [offset]"r"(offset) - : "memory", "cc", "q0", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "r1", "r2"); -} - -void mmm_6x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) -{ - asm volatile( - "vld1.f32 {d0-d2}, [%[in]]!\n" - - "vld1.f32 {d4-d7}, [%[w]]!\n" - - //K- > r2 - "mov r2, %[K]\n" - - //give out address to r1 - "mov r1, %[out]\n" - - //load in bias - "vld1.f32 {d8-d11}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d12-d15}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d16-d19}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d20-d23}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d24-d27}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d28-d31}, [r1]\n" - - //Computation loop - "0:\n" - - "vmla.f32 q4, q2, d0[0]\n" - "vmla.f32 q6, q2, d0[1]\n" - "vmla.f32 q8, q2, d1[0]\n" - "vmla.f32 q10, q2, d1[1]\n" - "vmla.f32 q12, q2, d2[0]\n" - "vmla.f32 q14, q2, d2[1]\n" - - "vld1.f32 {d4-d5}, [%[w]]!\n" - - "vmla.f32 q5, q3, d0[0]\n" - "vmla.f32 q7, q3, d0[1]\n" - "vmla.f32 q9, q3, d1[0]\n" - "vmla.f32 q11, q3, d1[1]\n" - "vmla.f32 q13, q3, d2[0]\n" - "vmla.f32 q15, q3, d2[1]\n" - - "vld1.f32 {d6-d7}, [%[w]]!\n" - "subs r2, r2, #1\n" - - "vld1.f32 {d0-d2}, [%[in]]!\n" - "bne 0b\n" - - //give out address to r1 - "mov r1, %[out]\n" - - "vst1.f32 {d8-d11}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d12-d15}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d16-d19}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d20-d23}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d24-d27}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d28-d31}, [r1]\n" - - : [in]"+r"(in), [w]"+r"(w), [out]"+r"(out) - : [K]"r"(K), [offset]"r"(offset) - : "memory", "cc", "q0", 
"q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15", "r1", "r2"); -} - -EE mmm_fp32_V7(int M, int N, int K, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *result) -{ - int blockK = K; - int blockM = 96; - F32 *matrix1Trans = tmp; - F32 *resultCurrent = result; - int KInner, MInner, m, n; - for (int k = 0; k < K; k += blockK) { - KInner = UNI_MIN(blockK, K - k); - for (int i = 0; i < M; i += blockM) { - MInner = UNI_MIN(blockM, M - i); - for (n = 0; n <= N - 6; n += 6) { - if (i == 0) { - matrix1_trans(6, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); - } - for (m = 0; m <= (MInner - 8); m += 8) { - resultCurrent = result + n * M + m + i; - mmm_6x8(M * 4, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, - resultCurrent); - } - - if ((MInner - m) >= 4) { - resultCurrent = result + n * M + m + i; - mmm_6x4(M * 4, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, - resultCurrent); - m += 4; - } - - if (MInner - m) { - resultCurrent = result + n * M + m + i; - mmm_N6_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, - matrix2 + (i + m) * KInner, resultCurrent); - } - } - - if ((N - n) >= 4) { - if (i == 0) { - matrix1_trans(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); - } - - for (m = 0; m <= (MInner - 8); m += 8) { - resultCurrent = result + n * M + m + i; - mmm_4x8(M * 4, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, - resultCurrent); - } - - if ((MInner - m) >= 4) { - resultCurrent = result + n * M + m + i; - mmm_4x4(M * 4, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, - resultCurrent); - m += 4; - } - - if (MInner - m) { - resultCurrent = result + n * M + m + i; - mmm_N4_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, - matrix2 + (i + m) * KInner, resultCurrent); - } - - n += 4; - } - - if (N - n) { - if (i == 0) { - matrix1_trans(N - n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); - } - - for (m = 0; m <= (MInner - 8); m += 8) { - resultCurrent = result + n * M + m + i; - mmm_NTail_M8(M, N - n, KInner, matrix1Trans + n * KInner, - matrix2 + (i + m) * KInner, resultCurrent); - } - - if ((MInner - m) >= 4) { - resultCurrent = result + n * M + m + i; - mmm_NTail_M4(M, N - n, KInner, matrix1Trans + n * KInner, - matrix2 + (i + m) * KInner, resultCurrent); - m += 4; - } - - if (MInner - m) { - resultCurrent = result + n * M + m + i; - mmm_NTail_M(MInner - m, M, N - n, KInner, matrix1Trans + n * KInner, - matrix2 + (i + m) * KInner, resultCurrent); - } - } - } - } - return SUCCESS; -} -#endif diff --git a/blas-enhance/src/cpu/arm/fp32/mvm_col_V8.cpp b/blas-enhance/src/cpu/arm/fp32/mvm_col_V8.cpp deleted file mode 100644 index 90174e72..00000000 --- a/blas-enhance/src/cpu/arm/fp32/mvm_col_V8.cpp +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include <arm_neon.h> -#include "type.h" -#include "blas_fp32.h" - - -inline void mvm_col_tail(U32 N, U32 K, F32* matrix, F32* vector, F32* result) -{ - float32x4_t tmp, res, mat; - U32 NTail = N % 4; - U32 NInner = N - NTail; - - for (U32 i = 0; i < K; i++) { - for (U32 j = 0; j < NInner; j += 4) { - tmp = vld1q_f32(result + j); - mat = vld1q_f32(&matrix[j + N * i]); - res = vfmaq_n_f32(tmp, mat, vector[i]); - vst1q_f32(result + j, res); - } - if (NTail != 0) { - for (U32 p = 0; p < NTail; p++) { - result[NInner + p] += vector[i] * matrix[NInner + N * i + p]; - } - } - } -} - -void mvm_col_kernel(U32 N, U32 K, F32* matrix, F32* vector, F32* result) -{ - float32x4_t mat[4] = {0}; - - F32* w0 = matrix; - F32* w1 = matrix + K * N; - F32* w2 = matrix + 2 * K * N; - F32* w3 = matrix + 3 * K * N; - - U32 N_tail = N % 4; - U32 N_inner = N - N_tail; - - for(U32 i = 0; i < K; i++) { - for(U32 j = 0; j < N_inner; j += 4) { - - float32x4_t res[4] = {0}; - - res[3] = vld1q_f32(result + j); - mat[0] = vld1q_f32(w0); - mat[1] = vld1q_f32(w1); - mat[2] = vld1q_f32(w2); - mat[3] = vld1q_f32(w3); - - res[0] = vfmaq_n_f32(res[3], mat[0], vector[i]); - res[1] = vfmaq_n_f32(res[0], mat[1], vector[K + i]); - res[2] = vfmaq_n_f32(res[1], mat[2], vector[2 * K + i]); - res[3] = vfmaq_n_f32(res[2], mat[3], vector[3 * K + i]); - - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - vst1q_f32(result + j, res[3]); - } - if (N_tail != 0) { - for(U32 p = 0; p < N_tail; p++) { - result[N_inner + p] += vector[i] * *w0++; - result[N_inner + p] += vector[i + K] * *w1++; - result[N_inner + p] += vector[i + 2 * K] * *w2++; - result[N_inner + p] += vector[i + 3 * K] * *w3++; - } - } - } -} - -void mvm_col_V8(U32 numRows, U32 numColumns, F32* matrix, F32* vector, F32* result) -{ - //Actual layout is KN, and vector is K - U32 N = numRows; - U32 K = numColumns; - U32 KInner = K / 4; - U32 KTail = K % 4; - mvm_col_kernel(N, KInner, matrix, vector, result); - if (KTail != 0) { - mvm_col_tail(N, KTail, matrix + (K - KTail) * N, vector + (K - KTail), result); - } -} diff --git a/blas-enhance/src/cpu/arm/fp32/mvm_row_V8.cpp b/blas-enhance/src/cpu/arm/fp32/mvm_row_V8.cpp deleted file mode 100644 index 0ffb68c8..00000000 --- a/blas-enhance/src/cpu/arm/fp32/mvm_row_V8.cpp +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "blas_fp32.h" - - -void mvm_row_tail(U32 N, U32 K, F32* matrix, F32* vector, F32* result) -{ - float32x4_t vec, res, mat; - U32 KTail = K % 4; - U32 KInner = K - KTail; - - for (U32 i = 0; i < N; i++) { - res = vdupq_n_f32(0); - - for (U32 j = 0; j < KInner; j += 4) { - vec = vld1q_f32(&vector[j]); - mat = vld1q_f32(&matrix[j + K * i]); - res = vfmaq_f32(res, vec, mat); - } - result[i] += vaddvq_f32(res); - - if (KTail != 0) { - for (U32 p = 0; p < KTail; p++) { - result[i] += vector[p + KInner] * matrix[KInner + p + K * i]; - } - } - - } -} - -void mvm_row_kernel(U32 N, U32 K, F32* matrix, F32* vector, F32* result) -{ -#ifdef __aarch64__ - I32 KTail = K % 4; - I32 KInner = K - KTail; - F32* w0 = matrix; - F32* w1 = matrix + K * N; - F32* w2 = matrix + K * 2 * N; - F32* w3 = matrix + K * 3 * N; - asm volatile( - "mov x19, %5\n" - "ld1 {v18.s}[0], [x19]\n" - "add x19, x19, %8\n" - "ld1 {v18.s}[1], [x19]\n" - "add x19, x19, %8\n" - "ld1 {v18.s}[2], [x19]\n" - "add x19, x19, %8\n" - "ld1 {v18.s}[3], [x19]\n" - - "movi v17.4s, #0x0\n" - "movi v16.4s, #0x0\n" - "movi v9.4s, #0x0\n" - "movi v10.4s, #0x0\n" - "movi v11.4s, #0x0\n" - "movi v12.4s, #0x0\n" - "mov x20, %6\n" - "cmp x20, #0x0\n" - "beq 3f\n" - "0:\n" - - "ld1 {v0.4s}, [%0], #16\n" - "ld1 {v1.4s}, [%1], #16\n" - "ld1 {v2.4s}, [%2], #16\n" - "ld1 {v3.4s}, [%3], #16\n" - "ld1 {v4.4s}, [%4], #16\n" - - "fmla v9.4s, v1.4s, v0.4s\n" - "fmla v10.4s, v2.4s, v0.4s\n" - "fmla v11.4s, v3.4s, v0.4s\n" - "fmla v12.4s, v4.4s, v0.4s\n" - - "subs x20, x20, #4\n" - "bne 0b\n" - - "faddp v13.4s, v9.4s, v10.4s\n" - "faddp v14.4s, v11.4s, v12.4s\n" - "faddp v17.4s, v13.4s, v14.4s\n" - "3:\n" - "mov x16, %7\n" - "cmp x16, #0x0\n" - "beq 2f\n" - - "1:\n" - "ld1 {v8.s}[0], [%0], #4\n" - - "ld1 {v1.s}[0], [%1], #4\n" - "ld1 {v1.s}[1], [%2], #4\n" - "ld1 {v1.s}[2], [%3], #4\n" - "ld1 {v1.s}[3], [%4], #4\n" - "fmla v16.4s, v1.4s, v8.s[0]\n" - - "subs x16, x16, 0x1\n" - "bne 1b\n" - - "fadd v17.4s, v17.4s, v16.4s\n" - - "2:\n" - - "fadd v17.4s, v17.4s, v18.4s\n" - "mov x19, %5\n" - "st1 {v17.s}[0], [x19]\n" - "add x19, x19, %8\n" - "st1 {v17.s}[1], [x19]\n" - "add x19, x19, %8\n" - "st1 {v17.s}[2], [x19]\n" - "add x19, x19, %8\n" - "st1 {v17.s}[3], [x19]\n" - :"+r" (vector), - "+r" (w0), - "+r" (w1), - "+r" (w2), - "+r" (w3), - "+r" (result) - :"r" ((I64)KInner), - "r" ((I64)KTail), - "r" ((I64)N*4) - :"memory", "cc", "x19", "x20", "x21", "x22", "x23", "x24", "x15", "x16", - "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18" - ); -#else - // TODO - std::cerr << "[ERROR] currently not support ARMv7 row MVM" < i) { - matrix2_trans_int8(N - i, K, N, src + i, dst + i * K4); - } - return SUCCESS; -} - -EE matrix_matrix_multiply_transform_rhsT_int8(TensorDesc desc, INT8* src, INT8* dst) -{ - DataType dt; - U32 N, K; - CHECK_STATUS(tensor2dGet(desc, &dt, &N, &K)); - U32 K4 = pad_to_4_multiple(K); - int i = 0; - for (; i < (int)N - 11; i += 12) { - matrix1_trans_int8(12, K, K, src + i * K, dst + i * K4); - } - for (; i < (int)N - 7; i += 
8) { - matrix1_trans_int8(8, K, K, src + i * K, dst + i * K4); - } - for (; i < (int)N - 3; i += 4) { - matrix1_trans_int8(4, K, K, src + i * K, dst + i * K4); - } - if ((int)N > i) { - matrix1_trans_int8(N - i, K, K, src + i * K, dst + i * K4); - } - return SUCCESS; -} - -EE mmm_int8(int M, int N, int K, INT8* matrix1, INT8* matrix2, INT8* tmp, I32* result, Arch arch) { - EE ret = SUCCESS; - switch (arch) { - case ARM_A55: - mmm_A55(M, N, K, matrix1, matrix2, tmp, result); - break; - case ARM_A76: - mmm_A76(M, N, K, matrix1, matrix2, tmp, result); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/blas-enhance/src/cpu/arm/int8/mmm.h b/blas-enhance/src/cpu/arm/int8/mmm.h deleted file mode 100644 index 81a9683c..00000000 --- a/blas-enhance/src/cpu/arm/int8/mmm.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_MMM -#define _H_MMM - -#include "type.h" - -void mmm_A55(int M, int N, int K, INT8* matrix1, INT8* matrix2, INT8* tmp, I32* result); - -void mmm_A76(int M, int N, int K, INT8* matrix1, INT8* matrix2, INT8* tmp, I32* result); -#endif diff --git a/blas-enhance/src/cpu/arm/int8/mmm_A55.cpp b/blas-enhance/src/cpu/arm/int8/mmm_A55.cpp deleted file mode 100644 index 56facbf8..00000000 --- a/blas-enhance/src/cpu/arm/int8/mmm_A55.cpp +++ /dev/null @@ -1,733 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifdef _USE_INT8 -#include <string.h> -#include <arm_neon.h> -#include "cpu/arm/blas_arm.h" -#include "cpu/arm/int8/mmm_common.h" -#include "cpu/arm/int8/mmm.h" - -inline void mmm_4x4_A55(U32 offset, U32 K, INT8* in, INT8* w, I32* out) { - asm volatile( - //init in->v1, w->v0 - "ldr d1, [%0]\n" - "ldr x16, [%0, 8]\n" - "ins v1.d[1], x16\n" - - "ldr d0, [%1]\n" - "ldr x17, [%1, 8]\n" - "ins v0.d[1], x17\n" - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K->x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - //load in bias - "ldr q5, [x26]\n" - "add x26, x26, %4\n" - - "ldr q7, [x26]\n" - "add x26, x26, %4\n" - - "ldr q9, [x26]\n" - "add x26, x26, %4\n" - - "ldr q11, [x26]\n" - - //Computation loop - "0:\n" - - "ldr d3, [x3, 16]!\n" - "ldr x16, [x3, 24]\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d29, [x0, 16]!\n" - "ldr x17, [x0, 24]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "ins v3.d[1], x16\n" - "subs x2, x2, #4\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "ins v29.d[1], x17\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - "mov v1.16b, v3.16b\n" - "mov v0.16b, v29.16b\n" - "bne 0b\n" - - "1:\n" - - //give out address to x26 - "mov x26, %2\n" - - "str q5, [x26]\n" - "add x26, x26, %4\n" - - "str q7, [x26]\n" - "add x26, x26, %4\n" - - "str q9, [x26]\n" - "add x26, x26, %4\n" - - "str q11, [x26]\n" - - :"+r" (in), - "+r" (w), - "+r" (out) - :"r" ((I64)K), - "r" ((I64)offset) - :"memory","cc","v30","v29","v11","v9","v7","v5","v3","v1","v0","x26","x16","x17","x3","x2","x0" - ); -} - -inline void mmm_8x4_A55(U32 offset, U32 K, INT8* in, INT8* w, I32* out) { - asm volatile( - // init in-> v1, w-> v0 - "ldr q1, [%0]\n" - "ldr q0, [%1]\n" - - // give in address to x3 - "mov x3, %0\n" - - // give w address to x0 - "mov x0, %1\n" - - // K-> x2 - "mov x2, %3\n" - - // give out address to x26 - "mov x26, %2\n" - - // load in bias - "ldr q5, [x26]\n" - "add x26, x26, %4\n" - - "ldr q7, [x26]\n" - "add x26, x26, %4\n" - - "ldr q9, [x26]\n" - "add x26, x26, %4\n" - - "ldr q11, [x26]\n" - "add x26, x26, %4\n" - - "ldr q13, [x26]\n" - "add x26, x26, %4\n" - - "ldr q15, [x26]\n" - "add x26, x26, %4\n" - - "ldr q17, [x26]\n" - "add x26, x26, %4\n" - - "ldr q19, [x26]\n" - - //Computation loop - "0:\n" - - "ldr d3, [x3, 16]\n" - "ldr x16, [x3, 24]\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d29, [x0, 16]!\n" - "ldr x17, [x0, 8]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "ins v3.d[1], x16\n" - "ldr d30, [x3, 32]!\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "ins v29.d[1], x17\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - - "ldr x16, [x3, 8]\n" - "subs x2, x2, #4\n" - "sdot v13.4s, v0.16b, v3.4b[0]\n" - "ins v30.d[1], x16\n" - "sdot v15.4s, v0.16b, v3.4b[1]\n" - "sdot v17.4s, v0.16b, v3.4b[2]\n" - "mov v1.16b, v30.16b\n" - "sdot v19.4s, v0.16b, v3.4b[3]\n" - "mov v0.16b, v29.16b\n" - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "str q5, [x26]\n" - "add x26, x26, %4\n" - - "str q7, [x26]\n" - "add x26, x26, %4\n" - - "str q9, [x26]\n" - "add x26, x26, %4\n" - - "str q11, [x26]\n" - "add x26, x26, %4\n" - - "str q13, [x26]\n" - "add x26, x26, %4\n" - - "str q15, [x26]\n" - "add x26, x26, %4\n" - - "str q17, [x26]\n" - "add x26, x26, %4\n" - - "str q19, [x26]\n" - - :"+r" (in), - "+r" (w), - "+r" (out) - :"r" ((I64)K), - "r" ((I64)offset) - :"memory","cc","v30","v29","v19","v17","v15","v13","v11", - "v9","v7","v5","v3","v1","v0","x26","x16","x17","x3","x2","x0" - ); -} - -inline void mmm_4x8_A55(U32 offset, U32 K, INT8* in, INT8* w, I32* out) { - asm volatile( - //init in->v1, w->v0 - "ldr d1, [%0]\n" - "ldr x16, [%0, 8]\n" - "ins v1.d[1], x16\n" - - "ldr d0, [%1]\n" - "ldr x17, [%1, 8]\n" - "ins v0.d[1], x17\n" - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K->x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - "ld1 {v5.4s, v6.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v7.4s, v8.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.4s, v12.4s}, [x26]\n" - - /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - */ - - //Computation loop - "0:\n" - - "ldr d29, [x0, 16]\n" - "ldr x17, [x0, 24]\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d3, [x3, 16]!\n" - "ldr x16, [x3, 8]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "ins v29.d[1], x17\n" - "subs x2, x2, #4\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "ins v3.d[1], x16\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "ldr d0, [x0, 32]!\n" - "ldr x17, [x0, 8]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "ins v0.d[1], x17\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - "mov v1.16b, v3.16b\n" - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "st1 {v5.4s, v6.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v7.4s, v8.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.4s, v12.4s}, [x26]\n" - - :"+r" (in), - "+r" (w), - "+r" (out) - :"r" ((I64)K), - "r" ((I64)offset) - :"memory","cc","v29","v12","v11","v10","v9","v8","v7","v6","v5","v3","v1","v0", - "x26","x16","x17","x3","x2","x0" - ); -} - -inline void mmm_8x8_A55(U32 offset, U32 K, INT8* in, INT8* w, I32* out) { - asm volatile( - //init in->v1, w->v0 - "ldr d1, [%0]\n" - "ldr x16, [%0, 8]\n" - "ins v1.d[1], x16\n" - - "ldr d0, [%1]\n" - "ldr x17, [%1, 8]\n" - "ins v0.d[1], x17\n" - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K->x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - //load in bias - "ld1 {v5.4s, v6.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v7.4s, v8.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.4s, v12.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v13.4s, v14.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v15.4s, v16.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v17.4s, v18.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v19.4s, v20.4s}, [x26]\n" - - /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - - 13 14 - 15 16 - 17 18 - 19 20 - */ - - //Computation loop - "0:\n" - - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d3, [x3, 16]!\n" - "ldr x16, [x3, 8]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "ldr d29, [x0, 16]\n" - "ldr x17, [x0, 24]\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "ins v3.d[1], x16\n" - "ldr d30, [x3, 16]!\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - "ins v29.d[1], x17\n" - - "sdot v13.4s, v0.16b, v3.4b[0]\n" - "ldr x16, [x3, 8]\n" - "subs x2, x2, #4\n" - "sdot v15.4s, v0.16b, v3.4b[1]\n" - "sdot v17.4s, v0.16b, v3.4b[2]\n" - "ins v30.d[1], x16\n" - "sdot v19.4s, v0.16b, v3.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "ldr d0, [x0, 32]!\n" - "ldr x17, [x0, 8]\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" -
"sdot v12.4s, v29.16b, v1.4b[3]\n" - - "sdot v14.4s, v29.16b, v3.4b[0]\n" - "ins v0.d[1], x17\n" - "mov v1.16b, v30.16b\n" - "sdot v16.4s, v29.16b, v3.4b[1]\n" - "sdot v18.4s, v29.16b, v3.4b[2]\n" - "sdot v20.4s, v29.16b, v3.4b[3]\n" - - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "st1 {v5.4s, v6.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v7.4s, v8.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.4s, v12.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v13.4s, v14.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v15.4s, v16.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v17.4s, v18.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v19.4s, v20.4s}, [x26]\n" - - :"+r" (in), - "+r" (w), - "+r" (out) - :"r" ((I64)K), - "r" ((I64)offset) - :"memory", "cc", "v30", "v29", "v20", "v19", "v18", "v17", "v16", "v15", "v14", "v13", "v12", "v11", "v10", - "v9", "v8", "v7", "v6", "v5", "v3", "v1", "v0", - "x26", "x16", "x17", "x3", "x2", "x0" - ); -} - -inline void mmm_4x12_A55(U32 offset, U32 K, INT8* in, INT8* w, I32* out) -{ - asm volatile( - //init in->v1, w->v0 - "ldr q1, [%0]\n" - - "ldr q0, [%1]\n" - - "ldr q29, [%1, 16]\n" // prefetch one more w - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K->x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - //load in bias - "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.4s, v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.4s, v12.4s, v13.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.4s, v15.4s, v16.4s}, [x26]\n" - - /* Layout - 5 6 7 - 8 9 10 - 11 12 13 - 14 15 16 - */ - - //Computation loop - "0:\n" - // in(x3): v1 - // w(x0): v0 v29 v30 - - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d30, [x0, 32]\n" - "sdot v8.4s, v0.16b, v1.4b[1]\n" - "ldr x16, [x0, 40]\n" - "sdot v11.4s, v0.16b, v1.4b[2]\n" - "ldr d2, [x3, 16]!\n" // input of next round - "sdot v14.4s, v0.16b, v1.4b[3]\n" - "ldr x17, [x3, 8]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "ins v30.d[1], x16\n" - "sdot v9.4s, v29.16b, v1.4b[1]\n" - "ldr d0, [x0, 48]!\n" // first w of next round - "sdot v12.4s, v29.16b, v1.4b[2]\n" - "ins v2.d[1], x17\n" - "sdot v15.4s, v29.16b, v1.4b[3]\n" - "ldr x16, [x0, 8]\n" - - "sdot v7.4s, v30.16b, v1.4b[0]\n" - "ldr d29, [x0, 16]\n" - "sdot v10.4s, v30.16b, v1.4b[1]\n" - "ldr x19, [x0, 24]\n" - "ins v0.d[1], x16\n" - "sdot v13.4s, v30.16b, v1.4b[2]\n" - "subs x2, x2, #4\n" - "sdot v16.4s, v30.16b, v1.4b[3]\n" - - "mov v1.16b, v2.16b\n" - "ins v29.d[1], x19\n" - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.4s, v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.4s, v12.4s, v13.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.4s, v15.4s, v16.4s}, [x26]\n" - - :"+r"(in), - "+r"(w), - "+r"(out) - :"r"((I64)K), - "r"((I64)offset) - :"memory","cc","v30","v29","v16","v15","v14","v13","v12","v11","v10","v9","v8","v7","v6","v5","v3","v2","v1","v0","x26","x19","x16","x17","x3","x2","x0" - ); -} - -inline void mmm_8x12_A55(U32 offset, U32 K, INT8* in, INT8* w, I32* out) -{ - asm volatile( - //init in->v1, w->v0 - "ldr q1, [%0]\n" - - "ldr q0, [%1]\n" - - "ldr q29, [%1, 16]\n" // prefetch one more w - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K->x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - //load in bias - "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" - 
"add x26, x26, %4\n" - "ld1 {v8.4s, v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.4s, v12.4s, v13.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.4s, v15.4s, v16.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v17.4s, v18.4s, v19.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v20.4s, v21.4s, v22.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v23.4s, v24.4s, v25.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v26.4s, v27.4s, v28.4s}, [x26]\n" - - /* Layout - 5 6 7 - 8 9 10 - 11 12 13 - 14 15 16 - - 17 18 19 - 20 21 22 - 23 24 25 - 26 27 28 - */ - - //Computation loop - "0:\n" - // in(x3): v1 v2 - // w(x0): v0 v29 v30 - - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d30, [x0, 32]\n" - "sdot v8.4s, v0.16b, v1.4b[1]\n" - "ldr x16, [x0, 40]\n" - "sdot v11.4s, v0.16b, v1.4b[2]\n" - "ldr d2, [x3, 16]\n" - "sdot v14.4s, v0.16b, v1.4b[3]\n" - "ldr x17, [x3, 24]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "ins v30.d[1], x16\n" - "sdot v9.4s, v29.16b, v1.4b[1]\n" - "ldr d3, [x0, 48]!\n" // first w of next round - "sdot v12.4s, v29.16b, v1.4b[2]\n" - "ins v2.d[1], x17\n" - "sdot v15.4s, v29.16b, v1.4b[3]\n" - "ldr x16, [x0, 8]\n" - - "sdot v7.4s, v30.16b, v1.4b[0]\n" - "subs x2, x2, #4\n" - "sdot v10.4s, v30.16b, v1.4b[1]\n" - "ins v3.d[1], x16\n" - "sdot v13.4s, v30.16b, v1.4b[2]\n" - "sdot v16.4s, v30.16b, v1.4b[3]\n" - - "sdot v17.4s, v0.16b, v2.4b[0]\n" - "ldr d1, [x3, 32]!\n" - "sdot v20.4s, v0.16b, v2.4b[1]\n" - "ldr x17, [x3, 8]\n" - "sdot v23.4s, v0.16b, v2.4b[2]\n" - "sdot v26.4s, v0.16b, v2.4b[3]\n" - - "sdot v18.4s, v29.16b, v2.4b[0]\n" - "mov v0.16b, v3.16b\n" - "sdot v21.4s, v29.16b, v2.4b[1]\n" - "ins v1.d[1], x17\n" - "sdot v24.4s, v29.16b, v2.4b[2]\n" - "sdot v27.4s, v29.16b, v2.4b[3]\n" - - "sdot v19.4s, v30.16b, v2.4b[0]\n" - "ldr d29, [x0, 16]\n" - "sdot v22.4s, v30.16b, v2.4b[1]\n" - "ldr x16, [x0, 24]\n" - "sdot v25.4s, v30.16b, v2.4b[2]\n" - "sdot v28.4s, v30.16b, v2.4b[3]\n" - "ins v29.d[1], x16\n" - - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.4s, v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.4s, v12.4s, v13.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.4s, v15.4s, v16.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v17.4s, v18.4s, v19.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v20.4s, v21.4s, v22.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v23.4s, v24.4s, v25.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v26.4s, v27.4s, v28.4s}, [x26]\n" - - :"+r"(in), - "+r"(w), - "+r"(out) - :"r"((I64)K), - "r"((I64)offset) - :"memory","cc","v30","v29","v28","v27","v26","v25","v24","v23","v22","v21","v20","v19","v18","v17","v16","v15","v14","v13","v12","v11","v10","v9","v8","v7","v6","v5","v3","v2","v1","v0","x26","x16","x17","x3","x2","x0" - ); -} - -void mmm_A55(int M, int N, int K, INT8* matrix1, INT8* matrix2, INT8* tmp, I32* result) -{ - int blockK = K; - int K4 = pad_to_4_multiple(K); - int blockM = 96; - INT8* matrix1Trans = tmp; - I32* resultCurrent = result; - - int KInner, MInner, m, n; - for (int k = 0; k < K; k += blockK) { - KInner = UNI_MIN(blockK, K - k); //K for this inner iteration - for (int i = 0; i < M; i+=blockM) { - MInner = UNI_MIN(blockM, M - i); //M for this inner iteration - for(n = 0; n <= N - 8; n+=8){ - if(i == 0){ - matrix1_trans_n8(KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); - } - - for(m = 0; m <= (MInner-12); m+=12){ - resultCurrent = result + n * M + m + i; - mmm_8x12_A55(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); 
- } - for(; m <=(MInner - 8); m+=8){ - resultCurrent = result + n * M + m + i; - mmm_8x8_A55(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - if((MInner - m) >= 4){ - resultCurrent = result + n * M + m + i; - mmm_8x4_A55(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - m += 4; - } - - if(MInner - m){ - resultCurrent = result + n * M + m + i; - mmm_N8_MTail(MInner - m, M, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - } - - if((N - n) >= 4){ - - if(i == 0){ - matrix1_trans_int8(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); - } - - for(m = 0; m <= (MInner - 12); m+=12){ - resultCurrent = result + n * M + m + i; - mmm_4x12_A55(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - for(; m <= (MInner - 8); m+=8){ - resultCurrent = result + n * M + m + i; - mmm_4x8_A55(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - if((MInner - m) >= 4){ - resultCurrent = result + n * M + m + i; - mmm_4x4_A55(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - m += 4; - } - - if(MInner - m){ - resultCurrent = result + n * M + m + i; - mmm_N4_MTail(MInner - m, M, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - n += 4; - } - - if (N - n) { - if(i == 0){ - matrix1_trans_int8(N-n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); - } - - for(m = 0; m <= (MInner - 12); m+=12){ - resultCurrent = result + n * M + m + i; - mmm_NTail_M12(M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - for(; m <= (MInner - 8); m+=8){ - resultCurrent = result + n * M + m + i; - mmm_NTail_M8(M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - if((MInner - m) >= 4){ - resultCurrent = result + n * M + m + i; - mmm_NTail_M4(M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - m += 4; - } - - if(MInner - m){ - resultCurrent = result + n * M + m + i; - mmm_NTail_M(MInner - m, M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - } - } - } -} -#endif diff --git a/blas-enhance/src/cpu/arm/int8/mmm_A76.cpp b/blas-enhance/src/cpu/arm/int8/mmm_A76.cpp deleted file mode 100644 index a3cca0c9..00000000 --- a/blas-enhance/src/cpu/arm/int8/mmm_A76.cpp +++ /dev/null @@ -1,685 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
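Note on the two mmm_* drivers deleted here: blockK is pinned to K, K is padded up to a multiple of 4 (K4 = pad_to_4_multiple(K)) because every SDOT lane consumes a group of four int8 values, each panel of matrix1 is packed once per M-sweep (the i == 0 branch), and the asm kernels are dispatched over N in strides of 8/4/tail and over M in strides of 12/8/4/tail. Below is a minimal scalar sketch of just that tiling and dispatch logic; kernel_block and mmm_sketch are illustrative names, not part of the deleted sources, and the scalar loop stands in for the assembly kernels.

    #include <cstdint>

    // Scalar stand-in for one register-blocked kernel: for an nb x mb tile,
    // result[n][m] += sum_k A[n][k] * B[m][k]. Both operands are assumed packed
    // tile-contiguously over K4 (K rounded up to a multiple of 4), matching the
    // NKn(size)k4 / MKm(size)k4 layouts produced by the packing routines.
    static void kernel_block(int nb, int mb, int K4, const int8_t *a,
        const int8_t *b, int32_t *c, int ldc)
    {
        for (int n = 0; n < nb; n++) {
            for (int m = 0; m < mb; m++) {
                int32_t acc = c[n * ldc + m];
                for (int k = 0; k < K4; k++) {
                    acc += int32_t(a[n * K4 + k]) * int32_t(b[m * K4 + k]);
                }
                c[n * ldc + m] = acc;
            }
        }
    }

    // Same N-{8,4,tail} by M-{12,8,4,tail} dispatch order as mmm_A55/mmm_A76,
    // with every asm kernel replaced by kernel_block.
    void mmm_sketch(int M, int N, int K, const int8_t *a_packed,
        const int8_t *b_packed, int32_t *result)
    {
        const int K4 = (K + 3) / 4 * 4;  // zero-padded groups keep sums exact
        int n = 0;
        while (n < N) {
            const int nb = (N - n >= 8) ? 8 : (N - n >= 4) ? 4 : N - n;
            int m = 0;
            while (m < M) {
                const int mb =
                    (M - m >= 12) ? 12 : (M - m >= 8) ? 8 : (M - m >= 4) ? 4 : M - m;
                kernel_block(nb, mb, K4, a_packed + n * K4, b_packed + m * K4,
                    result + n * M + m, M);
                m += mb;
            }
            n += nb;
        }
    }

The 8x12 tile leads the dispatch because its 24 int32 accumulator registers (v5-v28) plus the operand registers nearly fill the 32 NEON registers, which is why it is the most profitable shape on these cores.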
- - -#ifdef _USE_INT8 -#include <string.h> -#include <arm_neon.h> -#include "cpu/arm/blas_arm.h" -#include "cpu/arm/int8/mmm_common.h" -#include "cpu/arm/int8/mmm.h" - -inline void mmm_4x4_A76(U32 offset, U32 K, INT8* in, INT8* w, I32* out) -{ - asm volatile( - //init in->v1, w->v0 - "ldr q1, [%0]\n" - - "ldr q0, [%1]\n" - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K->x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - //load in bias - "ldr q5, [x26]\n" - "add x26, x26, %4\n" - - "ldr q7, [x26]\n" - "add x26, x26, %4\n" - - "ldr q9, [x26]\n" - "add x26, x26, %4\n" - - "ldr q11, [x26]\n" - - //Computation loop - "0:\n" - - "ldr q3, [x3, 16]!\n" - "ldr q29, [x0, 16]!\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "subs x2, x2, #4\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - "mov v1.16b, v3.16b\n" - "mov v0.16b, v29.16b\n" - "bne 0b\n" - - "1:\n" - - //give out address to x26 - "mov x26, %2\n" - - "str q5, [x26]\n" - "add x26, x26, %4\n" - - "str q7, [x26]\n" - "add x26, x26, %4\n" - - "str q9, [x26]\n" - "add x26, x26, %4\n" - - "str q11, [x26]\n" - - :"+r" (in), - "+r" (w), - "+r" (out) - :"r" ((I64)K), - "r" ((I64)offset) - :"memory","cc","v30","v29","v11","v9","v7","v5","v3","v1","v0","x26","x3","x2","x0" - ); -} - -inline void mmm_8x4_A76(U32 offset, U32 K, INT8* in, INT8* w, I32* out) -{ - asm volatile( - //init in->v1, w->v0 - "ldr q1, [%0]\n" - - "ldr q0, [%1]\n" - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K->x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - //load in bias - "ldr q5, [x26]\n" - "add x26, x26, %4\n" - - "ldr q7, [x26]\n" - "add x26, x26, %4\n" - - "ldr q9, [x26]\n" - "add x26, x26, %4\n" - - "ldr q11, [x26]\n" - "add x26, x26, %4\n" - - "ldr q13, [x26]\n" - "add x26, x26, %4\n" - - "ldr q15, [x26]\n" - "add x26, x26, %4\n" - - "ldr q17, [x26]\n" - "add x26, x26, %4\n" - - "ldr q19, [x26]\n" - - //Computation loop - "0:\n" - - "ldr q3, [x3, 16]\n" - "ldr q29, [x0, 16]!\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - - "subs x2, x2, #4\n" - "sdot v13.4s, v0.16b, v3.4b[0]\n" - "sdot v15.4s, v0.16b, v3.4b[1]\n" - "ldr q1, [x3, 32]!\n" - "sdot v17.4s, v0.16b, v3.4b[2]\n" - "sdot v19.4s, v0.16b, v3.4b[3]\n" - "mov v0.16b, v29.16b\n" - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "str q5, [x26]\n" - "add x26, x26, %4\n" - - "str q7, [x26]\n" - "add x26, x26, %4\n" - - "str q9, [x26]\n" - "add x26, x26, %4\n" - - "str q11, [x26]\n" - "add x26, x26, %4\n" - - "str q13, [x26]\n" - "add x26, x26, %4\n" - - "str q15, [x26]\n" - "add x26, x26, %4\n" - - "str q17, [x26]\n" - "add x26, x26, %4\n" - - "str q19, [x26]\n" - - :"+r" (in), - "+r" (w), - "+r" (out) - :"r" ((I64)K), - "r" ((I64)offset) - :"memory","cc","v30","v29","v19","v17","v15","v13","v11", - "v9","v7","v5","v3","v1","v0","x26","x3","x2","x0" - ); -} - -inline void mmm_4x8_A76(U32 offset, U32 K, INT8* in, INT8* w, I32* out) -{ - asm volatile( - //init in->v1, w->v0 - "ldr q1, [%0]\n" - - "ldr q0, [%1]\n" - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K->x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - "ld1 {v5.4s, v6.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v7.4s, v8.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v9.4s, v10.4s}, [x26]\n" -
"add x26, x26, %4\n" - "ld1 {v11.4s, v12.4s}, [x26]\n" - - /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - */ - - //Computation loop - "0:\n" - - "ldr q29, [x0, 16]\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr q3, [x3, 16]!\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "subs x2, x2, #4\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "ldr q0, [x0, 32]!\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - "mov v1.16b, v3.16b\n" - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "st1 {v5.4s, v6.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v7.4s, v8.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.4s, v12.4s}, [x26]\n" - - :"+r" (in), - "+r" (w), - "+r" (out) - :"r" ((I64)K), - "r" ((I64)offset) - :"memory","cc","v29","v12","v11","v10","v9","v8","v7","v6","v5","v3","v1","v0", - "x26","x3","x2","x0" - ); -} - -inline void mmm_8x8_A76(U32 offset, U32 K, INT8* in, INT8* w, I32* out) -{ - asm volatile( - //init in- > v1, w- > v0 - "ldr q1, [%0]\n" - - "ldr q0, [%1]\n" - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K- > x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - //load in bias - "ld1 {v5.4s, v6.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v7.4s, v8.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.4s, v12.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v13.4s, v14.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v15.4s, v16.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v17.4s, v18.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v19.4s, v20.4s}, [x26]\n" - - /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - - 13 14 - 15 16 - 17 18 - 19 20 - */ - - //Computation loop - "0:\n" - - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr q3, [x3, 16]!\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "ldr q29, [x0, 16]\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - - "sdot v13.4s, v0.16b, v3.4b[0]\n" - "subs x2, x2, #4\n" - "sdot v15.4s, v0.16b, v3.4b[1]\n" - "sdot v17.4s, v0.16b, v3.4b[2]\n" - "sdot v19.4s, v0.16b, v3.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "ldr q0, [x0, 32]!\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - - "sdot v14.4s, v29.16b, v3.4b[0]\n" - "sdot v16.4s, v29.16b, v3.4b[1]\n" - "ldr q1, [x3, 16]!\n" - "sdot v18.4s, v29.16b, v3.4b[2]\n" - "sdot v20.4s, v29.16b, v3.4b[3]\n" - - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "st1 {v5.4s, v6.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v7.4s, v8.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.4s, v12.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v13.4s, v14.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v15.4s, v16.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v17.4s, v18.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v19.4s, v20.4s}, [x26]\n" - - :"+r" (in), - "+r" (w), - "+r" (out) - :"r" ((I64)K), - "r" ((I64)offset) - :"memory", "cc", "v29", "v20", "v19", "v18", "v17", "v16", "v15", "v14", "v13", "v12", "v11", "v10", - "v9", "v8", "v7", "v6", "v5", "v3", "v1", "v0", - "x26", "x3", "x2", "x0" - ); -} - -inline void mmm_4x12_A76(U32 offset, U32 K, INT8* in, INT8* w, I32* out) -{ - asm volatile( - //init in->v1, w->v0 - "ldr q1, [%0]\n" - - "ldr q0, [%1]\n" - - "ldr q29, [%1, 16]\n" // prefetch one more w 
- - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K->x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - //load in bias - "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.4s, v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.4s, v12.4s, v13.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.4s, v15.4s, v16.4s}, [x26]\n" - - /* Layout - 5 6 7 - 8 9 10 - 11 12 13 - 14 15 16 - */ - - //Computation loop - "0:\n" - // in(x3): v1 - // w(x0): v0 v29 v30 - - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr q30, [x0, 32]\n" - "sdot v8.4s, v0.16b, v1.4b[1]\n" - "sdot v11.4s, v0.16b, v1.4b[2]\n" - "ldr q2, [x3, 16]!\n" // input of next round - "sdot v14.4s, v0.16b, v1.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v9.4s, v29.16b, v1.4b[1]\n" - "ldr q0, [x0, 48]!\n" // first w of next round - "sdot v12.4s, v29.16b, v1.4b[2]\n" - "sdot v15.4s, v29.16b, v1.4b[3]\n" - - "sdot v7.4s, v30.16b, v1.4b[0]\n" - "ldr q29, [x0, 16]\n" - "sdot v10.4s, v30.16b, v1.4b[1]\n" - "sdot v13.4s, v30.16b, v1.4b[2]\n" - "subs x2, x2, #4\n" - "sdot v16.4s, v30.16b, v1.4b[3]\n" - - "mov v1.16b, v2.16b\n" - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.4s, v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.4s, v12.4s, v13.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.4s, v15.4s, v16.4s}, [x26]\n" - - :"+r"(in), - "+r"(w), - "+r"(out) - :"r"((I64)K), - "r"((I64)offset) - :"memory","cc","v30","v29","v16","v15","v14","v13","v12","v11","v10", - "v9","v8","v7","v6","v5","v3","v2","v1","v0","x26","x19","x3","x2","x0" - ); -} - -inline void mmm_8x12_A76(U32 offset, U32 K, INT8* in, INT8* w, I32* out) -{ - asm volatile( - //init in->v1, w->v0 - "ldr q1, [%0]\n" - - "ldr q0, [%1]\n" - - "ldr q29, [%1, 16]\n" // prefetch one more w - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K->x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - //load in bias - "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.4s, v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.4s, v12.4s, v13.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.4s, v15.4s, v16.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v17.4s, v18.4s, v19.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v20.4s, v21.4s, v22.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v23.4s, v24.4s, v25.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v26.4s, v27.4s, v28.4s}, [x26]\n" - - /* Layout - 5 6 7 - 8 9 10 - 11 12 13 - 14 15 16 - - 17 18 19 - 20 21 22 - 23 24 25 - 26 27 28 - */ - - //Computation loop - "0:\n" - // in(x3): v1 v2 - // w(x0): v0 v29 v30 - - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr q30, [x0, 32]\n" - "sdot v8.4s, v0.16b, v1.4b[1]\n" - "sdot v11.4s, v0.16b, v1.4b[2]\n" - "ldr q2, [x3, 16]\n" - "sdot v14.4s, v0.16b, v1.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v9.4s, v29.16b, v1.4b[1]\n" - "ldr q3, [x0, 48]!\n" // first w of next round - "sdot v12.4s, v29.16b, v1.4b[2]\n" - "sdot v15.4s, v29.16b, v1.4b[3]\n" - - "sdot v7.4s, v30.16b, v1.4b[0]\n" - "subs x2, x2, #4\n" - "sdot v10.4s, v30.16b, v1.4b[1]\n" - "sdot v13.4s, v30.16b, v1.4b[2]\n" - "sdot v16.4s, v30.16b, v1.4b[3]\n" - - "sdot v17.4s, v0.16b, v2.4b[0]\n" - "ldr q1, [x3, 32]!\n" - "sdot v20.4s, v0.16b, v2.4b[1]\n" - "sdot v23.4s, v0.16b, v2.4b[2]\n" - "sdot v26.4s, v0.16b, v2.4b[3]\n" - - "sdot v18.4s, v29.16b, v2.4b[0]\n" - "mov v0.16b, 
v3.16b\n" - "sdot v21.4s, v29.16b, v2.4b[1]\n" - "sdot v24.4s, v29.16b, v2.4b[2]\n" - "sdot v27.4s, v29.16b, v2.4b[3]\n" - - "sdot v19.4s, v30.16b, v2.4b[0]\n" - "ldr q29, [x0, 16]\n" - "sdot v22.4s, v30.16b, v2.4b[1]\n" - "sdot v25.4s, v30.16b, v2.4b[2]\n" - "sdot v28.4s, v30.16b, v2.4b[3]\n" - - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.4s, v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.4s, v12.4s, v13.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.4s, v15.4s, v16.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v17.4s, v18.4s, v19.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v20.4s, v21.4s, v22.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v23.4s, v24.4s, v25.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v26.4s, v27.4s, v28.4s}, [x26]\n" - - :"+r"(in), - "+r"(w), - "+r"(out) - :"r"((I64)K), - "r"((I64)offset) - :"memory","cc","v30","v29","v28","v27","v26","v25","v24","v23","v22","v21","v20", - "v19","v18","v17","v16","v15","v14","v13","v12","v11","v10","v9","v8","v7","v6", - "v5","v3","v2","v1","v0","x26","x3","x2","x0" - ); -} - -void mmm_A76(int M, int N, int K, INT8* matrix1, INT8* matrix2, INT8* tmp, I32* result) -{ - int blockK = K; - U32 K4 = pad_to_4_multiple(K); - int blockM = 96; - INT8* matrix1Trans = tmp; - I32* resultCurrent = result; - - int KInner, MInner, m, n; - for (int k = 0; k < K; k += blockK) { - KInner = UNI_MIN(blockK, K - k);//K for this inner iteration - for (int i = 0; i < M; i+=blockM) { - MInner = UNI_MIN(blockM, M - i);//M for this inner iteration - for(n = 0; n <= N - 8; n+=8){ - if (i == 0) { - matrix1_trans_n8(KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); - } - - for(m = 0; m <= (MInner-12); m+=12){ - resultCurrent = result + n * M + m + i; - mmm_8x12_A76(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - for(; m <=(MInner - 8); m+=8){ - resultCurrent = result + n * M + m + i; - mmm_8x8_A76(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - if((MInner - m) >= 4){ - resultCurrent = result + n * M + m + i; - mmm_8x4_A76(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - m += 4; - } - - if(MInner - m){ - resultCurrent = result + n * M + m + i; - mmm_N8_MTail(MInner - m, M, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - } - - if((N - n) >= 4){ - - if(i == 0){ - matrix1_trans_int8(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); - } - - for(m = 0; m <= (MInner - 12); m+=12){ - resultCurrent = result + n * M + m + i; - mmm_4x12_A76(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - for(; m <= (MInner - 8); m+=8){ - resultCurrent = result + n * M + m + i; - mmm_4x8_A76(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - if((MInner - m) >= 4){ - resultCurrent = result + n * M + m + i; - mmm_4x4_A76(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - m += 4; - } - - if(MInner - m){ - resultCurrent = result + n * M + m + i; - mmm_N4_MTail(MInner - m, M, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - n += 4; - } - - if (N - n) { - if(i == 0){ - matrix1_trans_int8(N-n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); - } - - for(m = 0; m <= (MInner - 12); m+=12){ - resultCurrent = result + n * M + m + i; - mmm_NTail_M12(M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - for(; m <= (MInner - 8); m+=8){ 
- resultCurrent = result + n * M + m + i; - mmm_NTail_M8(M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - if((MInner - m) >= 4){ - resultCurrent = result + n * M + m + i; - mmm_NTail_M4(M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - m += 4; - } - - if(MInner - m){ - resultCurrent = result + n * M + m + i; - mmm_NTail_M(MInner - m, M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - } - } - } -} -#endif diff --git a/blas-enhance/src/cpu/arm/int8/mmm_common.h b/blas-enhance/src/cpu/arm/int8/mmm_common.h deleted file mode 100644 index 9c2ec833..00000000 --- a/blas-enhance/src/cpu/arm/int8/mmm_common.h +++ /dev/null @@ -1,489 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
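All of the kernels in these deleted files lean on the Armv8.2 dot-product extension: one SDOT instruction multiplies groups of four int8 values and accumulates each group's sum into an int32 lane, which is why every packing routine in mmm_common.h below zero-pads K to a multiple of 4 (a zero group contributes nothing to the sum). Here is a scalar model of the two SDOT forms used throughout, written against the documented instruction semantics; the function names are illustrative only.

    #include <cstdint>

    // Model of "sdot vd.4s, vn.16b, vm.16b" (the vdotq_s32 intrinsic): lane d
    // of the int32 accumulator absorbs the dot product of the d-th group of
    // four int8 values from each operand.
    void sdot_model(int32_t acc[4], const int8_t a[16], const int8_t b[16])
    {
        for (int lane = 0; lane < 4; lane++) {
            for (int i = 0; i < 4; i++) {
                acc[lane] += int32_t(a[4 * lane + i]) * int32_t(b[4 * lane + i]);
            }
        }
    }

    // Model of the indexed form "sdot vd.4s, vn.16b, vm.4b[idx]"
    // (vdotq_laneq_s32): every lane reuses the idx-th 4-byte group of b, which
    // is how one packed input column is broadcast against four packed rows.
    void sdot_lane_model(int32_t acc[4], const int8_t a[16], const int8_t b[16],
        int idx)
    {
        for (int lane = 0; lane < 4; lane++) {
            for (int i = 0; i < 4; i++) {
                acc[lane] += int32_t(a[4 * lane + i]) * int32_t(b[4 * idx + i]);
            }
        }
    }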
- - -#ifndef _H_MMM_COMMON -#define _H_MMM_COMMON - -#ifdef _USE_INT8 -#include <string.h> -#include <arm_neon.h> - -#include "type.h" -#include "error.h" -#include "arm_neon_expand.h" - -inline void matrix1_trans_n8(U32 blockK, U32 K, INT8* src, INT8* dst) -{ - // Move k4 as one I32 - I32* dst1 = (I32*)dst; - - I32 *in[8]; - for (U32 i=0; i<8; i++) { - in[i] = (I32*)(src + i * K); - } - U32 k = 0; - for (; k < blockK - 7; k += 8) { - if(k % 64 == 0){ - asm volatile( - "prfm pldl2keep, [%[in0], 64]\n" - "prfm pldl2keep, [%[in1], 64]\n" - "prfm pldl2keep, [%[in2], 64]\n" - "prfm pldl2keep, [%[in3], 64]\n" - "prfm pldl2keep, [%[in4], 64]\n" - "prfm pldl2keep, [%[in5], 64]\n" - "prfm pldl2keep, [%[in6], 64]\n" - "prfm pldl2keep, [%[in7], 64]\n" - :[in0]"+r"(in[0]), - [in1]"+r"(in[1]), - [in2]"+r"(in[2]), - [in3]"+r"(in[3]), - [in4]"+r"(in[4]), - [in5]"+r"(in[5]), - [in6]"+r"(in[6]), - [in7]"+r"(in[7]) - : - :"memory","cc" - ); - } - asm volatile( - "ldr d0, [%[in0]], 8\n" - "ldr d1, [%[in1]], 8\n" - "ldr d2, [%[in2]], 8\n" - "ldr d3, [%[in3]], 8\n" - "ldr d4, [%[in4]], 8\n" - "ldr d5, [%[in5]], 8\n" - "ldr d6, [%[in6]], 8\n" - "ldr d7, [%[in7]], 8\n" - - "zip1 v8.2s, v0.2s, v1.2s\n" - "zip2 v12.2s, v0.2s, v1.2s\n" - "zip1 v9.2s, v2.2s, v3.2s\n" - "zip2 v13.2s, v2.2s, v3.2s\n" - "zip1 v10.2s, v4.2s, v5.2s\n" - "zip2 v14.2s, v4.2s, v5.2s\n" - "zip1 v11.2s, v6.2s, v7.2s\n" - "zip2 v15.2s, v6.2s, v7.2s\n" - - "str d8, [%[out]]\n" - "str d9, [%[out], 8]\n" - "str d10, [%[out], 16]\n" - "str d11, [%[out], 24]\n" - "str d12, [%[out], 32]\n" - "str d13, [%[out], 40]\n" - "str d14, [%[out], 48]\n" - "str d15, [%[out], 56]\n" - :[in0]"+r"(in[0]), - [in1]"+r"(in[1]), - [in2]"+r"(in[2]), - [in3]"+r"(in[3]), - [in4]"+r"(in[4]), - [in5]"+r"(in[5]), - [in6]"+r"(in[6]), - [in7]"+r"(in[7]) - :[out]"r"(dst1) - :"memory","cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - ); - dst1 += 16; - } - - if (k < blockK - 3) { - for (U32 i = 0; i < 8; i++) { - dst1[0] = in[i][0]; - dst1++; - in[i]++; - } - k += 4; - } - - if (k < blockK) { - U32 kTail = blockK - k; - INT8 *dstI8 = (INT8*)dst1; - INT8 *inI[8]; - for (U32 i = 0; i < 8; i++) { - inI[i] = (INT8*)in[i]; - } - for (U32 i = 0; i < 8; i++) { - for (U32 j = 0; j < 4; j++) { - if (j < kTail) { - dstI8[i * 4 + j] = inI[i][j]; - } else { - dstI8[i * 4 + j] = 0; - } - } - } - } -} - -//Trans from NK to NKn(size)k4 -inline void matrix1_trans_int8(U32 size, U32 blockK, U32 K, INT8* src, INT8* dst) -{ - // Move k4 as one I32 - I32* src1; - I32* dst1 = (I32*)dst; - U32 offset = 64; - - U32 i = 0; - for (; i < blockK/4; i++) { - for (U32 j = 0; j < size; j++) { - src1 = (I32*)(src + j * K); - - if(i % 16 == 0){ - asm volatile( - "prfm pldl2keep, [%0, %1]\n" - :"+r"(src1) - :"r"((I64)offset) - :"memory","cc" - ); - } - *dst1++ = *(src1 + i); - } - } - U32 kTail = blockK % 4; - if (kTail > 0) { - INT8 *srcI8; - INT8 *dstI8 = (INT8*)dst1; - for (U32 j = 0; j < size; j++) { - srcI8 = src + j * K + i * 4; - for (U32 k = 0; k < 4; k++) { - if (k < kTail) { - dstI8[j * 4 + k] = srcI8[k]; - } else { - dstI8[j * 4 + k] = 0; - } - } - } - } -} - -inline void matrix2_trans_m12(U32 blockK, U32 M, INT8* src, INT8* dst) -{ - INT8* src1 = src; - INT8* dst1 = dst; - U32 offset = 4 * M; - - U32 i = 0; - for (; i < blockK - 3; i += 4) { - // Prefetch for the next iteration - asm volatile( - "prfm pldl2keep, [%0, %1]\n" - :"+r"(src1) - :"r"((I64)offset) - :"memory","cc" - ); - - INT8 *in12[4]; - for (U32 j=0; j<4; j++) { - in12[j] = src1 + j * M; - } - src1
-
-inline void matrix2_trans_m12(U32 blockK, U32 M, INT8* src, INT8* dst)
-{
-    INT8* src1 = src;
-    INT8* dst1 = dst;
-    U32 offset = 4 * M;
-
-    U32 i = 0;
-    for (; i < blockK - 3; i += 4) {
-        // Prefetch for the next iteration
-        asm volatile(
-            "prfm pldl2keep, [%0, %1]\n"
-            :"+r"(src1)
-            :"r"((I64)offset)
-            :"memory","cc"
-        );
-
-        INT8 *in12[4];
-        for (U32 j=0; j<4; j++) {
-            in12[j] = src1 + j * M;
-        }
-        src1 += offset;
-
-        asm volatile(
-            "ldr d0, [%[in0]]\n"
-            "ldr d1, [%[in1]]\n"
-            "ldr d2, [%[in2]]\n"
-            "ldr d3, [%[in3]]\n"
-            "zip1 v4.8b, v0.8b, v1.8b\n"
-            "zip2 v5.8b, v0.8b, v1.8b\n"
-            "zip1 v6.8b, v2.8b, v3.8b\n"
-            "zip2 v7.8b, v2.8b, v3.8b\n"
-
-            "zip1 v0.4h, v4.4h, v6.4h\n"
-            "zip2 v1.4h, v4.4h, v6.4h\n"
-            "zip1 v2.4h, v5.4h, v7.4h\n"
-            "zip2 v3.4h, v5.4h, v7.4h\n"
-            "str d0, [%[out]]\n"
-            "str d1, [%[out], 8]\n"
-            "str d2, [%[out], 16]\n"
-            "str d3, [%[out], 24]\n"
-            :
-            :[in0]"r"(in12[0]),
-             [in1]"r"(in12[1]),
-             [in2]"r"(in12[2]),
-             [in3]"r"(in12[3]),
-             [out]"r"(dst1)
-            :"memory","cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-        );
-
-        for (U32 j=0; j<4; j++) {
-            for (U32 k=0; k<4; k++) {
-                dst1[32 + j*4 + k] = in12[k][8+j];
-            }
-        }
-
-        dst1 += 48;
-    }
-    if (i < blockK) {
-        U32 kTail = blockK - i;
-
-        INT8 *in12[4];
-        INT8 zero[12] = {0};
-        for (U32 j = 0; j < 4; j++) {
-            if (j < kTail) {
-                in12[j] = src1 + j * M;
-            } else {
-                in12[j] = zero;
-            }
-        }
-
-        asm volatile(
-            "ldr d0, [%[in0]]\n"
-            "ldr d1, [%[in1]]\n"
-            "ldr d2, [%[in2]]\n"
-            "ldr d3, [%[in3]]\n"
-            "zip1 v4.8b, v0.8b, v1.8b\n"
-            "zip2 v5.8b, v0.8b, v1.8b\n"
-            "zip1 v6.8b, v2.8b, v3.8b\n"
-            "zip2 v7.8b, v2.8b, v3.8b\n"
-
-            "zip1 v0.4h, v4.4h, v6.4h\n"
-            "zip2 v1.4h, v4.4h, v6.4h\n"
-            "zip1 v2.4h, v5.4h, v7.4h\n"
-            "zip2 v3.4h, v5.4h, v7.4h\n"
-            "str d0, [%[out]]\n"
-            "str d1, [%[out], 8]\n"
-            "str d2, [%[out], 16]\n"
-            "str d3, [%[out], 24]\n"
-            :
-            :[in0]"r"(in12[0]),
-             [in1]"r"(in12[1]),
-             [in2]"r"(in12[2]),
-             [in3]"r"(in12[3]),
-             [out]"r"(dst1)
-            :"memory","cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-        );
-
-        for (U32 j = 0; j < 4; j++) {
-            for (U32 k = 0; k < 4; k++) {
-                dst1[32 + j * 4 + k] = in12[k][8 + j];
-            }
-        }
-    }
-}
-
-//Trans from KM to MKm(size)k4
-inline void matrix2_trans_int8(U32 size, U32 blockK, U32 M, INT8* src, INT8* dst)
-{
-    INT8* src1 = src;
-    INT8* dst1 = dst;
-    U32 offset = 4 * M;
-
-    U32 i = 0;
-    for(; i < blockK - 3; i += 4){
-        src1 = src + i * M;
-        asm volatile(
-            "prfm pldl2keep, [%0, %1]\n"
-            :"+r"(src1)
-            :"r"((I64)offset)
-            :"memory","cc"
-        );
-        for(U32 j = 0; j < size; j++){
-            src1 = src + i * M + j;
-            for (U32 k = 0; k < 4; k++){
-                *dst1 = *src1;
-                dst1++;
-                src1 += M;
-            }
-        }
-    }
-    if (i < blockK) {
-        U32 kTail = blockK - i;
-        for (U32 j = 0; j < size; j++) {
-            src1 = src + i * M + j;
-            for (U32 k = 0; k < 4; k++) {
-                if (k < kTail) {
-                    *dst1 = *src1;
-                    dst1++;
-                    src1 += M;
-                } else {
-                    *dst1 = 0;
-                    dst1++;
-                }
-            }
-        }
-    }
-}
-
-inline void mmm_N8_MTail(U32 MInner, U32 M, U32 K, INT8* matrix1, INT8* matrix2, I32* result)
-{
-    int8x16_t mat1[2];
-    int8x16_t mat2;
-    int32x4_t res[4][2] = {{0}};
-    I32 tmp[8] = {0};
-
-    CHECK_REQUIREMENT(MInner < 4);
-
-    for(U32 i = 0; i < K; i+=4){
-        mat1[0] = vld1q_s8(matrix1 + i * 8);
-        mat1[1] = vld1q_s8(matrix1 + i * 8 + 16);
-
-        mat2 = vld1q_s8(matrix2 + i * MInner);
-
-        for(U32 j = 0; j < MInner; j++){
-            res[j][0] = vdotq_laneq_s32_builtin(res[j][0], mat1[0], mat2, j);
-            res[j][1] = vdotq_laneq_s32_builtin(res[j][1], mat1[1], mat2, j);
-        }
-    }
-    for(U32 p = 0; p < MInner; p++){
-        vst1q_s32(tmp, res[p][0]);
-        vst1q_s32(tmp+4, res[p][1]);
-        for(U32 q = 0; q < 8; q++){
-            result[q * M + p] += tmp[q];
-        }
-        res[p][0] = vdupq_n_s32(0);
-        res[p][1] = vdupq_n_s32(0);
-    }
-}
-
-inline void mmm_N4_MTail(U32 MInner, U32 M, U32 K, INT8* matrix1, INT8* matrix2, I32* result)
-{
-    int8x16_t mat1 = {0};
-    int8x16_t mat2 = {0};
-    int32x4_t res[4] = {0};
-    I32 tmp[8] = {0};
-
-    CHECK_REQUIREMENT(MInner < 4);
-
-    for(U32 i = 0; i < K; i+=4){
-        mat1 = vld1q_s8(matrix1 + i * 8);
-
-        mat2 = vld1q_s8(matrix2 + i * MInner);
-
-        for(U32 j = 0; j < MInner; j++){
-            res[j] = vdotq_laneq_s32_builtin(res[j], mat1, mat2, j);
-        }
-    }
-    for(U32 p = 0; p < MInner; p++){
-        vst1q_s32(tmp, res[p]);
-        for(U32 q = 0; q < 8; q++){
-            result[q * M + p] += tmp[q];
-        }
-        res[p] = vdupq_n_s32(0);
-    }
-}
-
-inline void mmm_NTail_M12(U32 M, U32 N, U32 K, INT8* matrix1, INT8* matrix2, I32* result) {
-    int8x16_t mat1 = {0};
-    int8x16_t mat2[3] = {0};
-    int32x4_t res[4][3] = {{0}};
-
-    for (U32 i = 0; i < N; i++) {
-        res[i][0] = vld1q_s32(result + i*M);
-        res[i][1] = vld1q_s32(result + i*M + 4);
-        res[i][2] = vld1q_s32(result + i*M + 8);
-    }
-
-    for (U32 q=0; q
-#include <arm_neon.h>
-#include <string.h>
-
-
-inline void mvm_col_tail(U32 N, U32 K, INT8* matrix, INT8* vector, I32* result) {
-    for (U32 n = 0; n < N; n++) {
-        I32 tmp = 0;
-        for (U32 k = 0; k < K; k++) {
-            tmp += vector[k] * matrix[k*N + n];
-        }
-        result[n] += tmp;
-    }
-}
-
-inline void mvm_row_tail(U32 N, U32 K, INT8* matrix, INT8* vector, I32* result) {
-    INT8* cur_row = matrix;
-    for (U32 n = 0; n < N; n++) {
-        I32 tmp = 0;
-        for(U32 k = 0; k < K; k++) {
-            tmp += vector[k] * cur_row[k];
-        }
-        result[n] += tmp;
-        cur_row += K;
-    }
-}
-
-inline void mvm_row_kernel(U32 Nbatch, U32 K, INT8* matrix, INT8* vector, I32* result) {
-    U32 N = Nbatch * 4;
-    int8x16_t mat[4], v;
-    U32 K_tail = K % 16;
-    U32 K_inner = K - K_tail;
-    for (U32 n = 0; n < N; n+=4) {
-        int32x4_t res[4] = {0};
-        int32x4_t bias;
-
-        INT8* w0 = matrix + n * K;
-        INT8* w1 = w0 + K;
-        INT8* w2 = w1 + K;
-        INT8* w3 = w2 + K;
-
-        for (U32 k = 0; k < K_inner; k+=16) {
-            v = vld1q_s8(vector + k);
-            mat[0] = vld1q_s8(w0);
-            mat[1] = vld1q_s8(w1);
-            mat[2] = vld1q_s8(w2);
-            mat[3] = vld1q_s8(w3);
-
-            res[0] = vdotq_s32(res[0], mat[0], v);
-            res[1] = vdotq_s32(res[1], mat[1], v);
-            res[2] = vdotq_s32(res[2], mat[2], v);
-            res[3] = vdotq_s32(res[3], mat[3], v);
-
-            w0 += 16;
-            w1 += 16;
-            w2 += 16;
-            w3 += 16;
-        }
-        bias = vld1q_s32(result + n);
-
-        res[0] = vpaddq_s32(res[0], res[1]);
-        res[2] = vpaddq_s32(res[2], res[3]);
-        res[0] = vpaddq_s32(res[0], res[2]);
-        res[0] = vaddq_s32(res[0], bias);
-
-        vst1q_s32(result + n, res[0]);
-
-        if (K_tail != 0) {
-            I32 tmp[4] = {0};
-            for(U32 p = K_inner; p < K; p++) {
-                tmp[0] += vector[p] * *w0++;
-                tmp[1] += vector[p] * *w1++;
-                tmp[2] += vector[p] * *w2++;
-                tmp[3] += vector[p] * *w3++;
-            }
-            result[n] += tmp[0];
-            result[n+1] += tmp[1];
-            result[n+2] += tmp[2];
-            result[n+3] += tmp[3];
-        }
-    }
-}
-
-inline void mvm_col(U32 numRows, U32 numColumns, INT8* matrix, INT8* vector, I32*tmp, I32* result) {
-    //Actual layout is KN, and vector is K
-    U32 N = numRows;
-    U32 K = numColumns;
-    U32 NTail = N % 64;
-    U32 NInner = N - NTail;
-
-    for (U32 n = 0; n < NInner; n+=64) {
-        memset(tmp, 0, sizeof(I32)*64);
-        for (U32 k = 0; k < K; k++) {
-            for(U32 i = 0; i < 64; i++) {
-                tmp[i] += vector[k] * matrix[k * N + n + i];
-            }
-        }
-
-        for (U32 i = 0; i < 64; i++) {
-            result[n + i] += tmp[i];
-        }
-    }
-
-    memset(tmp, 0, sizeof(I32)*64);
-    for (U32 k = 0; k < K; k++) {
-        for(U32 i = 0; i < NTail; i++) {
-            tmp[i] += vector[k] * matrix[k * N + NInner + i];
-        }
-        for(U32 i=0; i < NTail; i++) {
-            result[NInner + i] += tmp[i];
-        }
-    }
-}
-
-inline void mvm_row(U32 numRows, U32 numColumns, INT8* matrix, INT8* vector, I32* result) {
-    //Actual layout is NK, and vector is K
-    U32 N = numRows;
-    U32 K = numColumns;
-    U32 Nbatch = N / 4;
-    U32 NTail = N % 4;
-
-    mvm_row_kernel(Nbatch, K, matrix, vector, result);
-
-    if (NTail != 0) {
-        mvm_row_tail(NTail, K, matrix + (N - NTail) * K, vector, result + N - NTail);
-    }
-}
-#endif
-#endif
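
The row-major kernel above leans entirely on the Armv8.2 SDOT instruction. As a reading aid, here is a scalar model of what each `vdotq_s32(acc, a, b)` call accumulates (a sketch for readers, not code from this change):

```cpp
#include <cstdint>

// For each of the four 32-bit lanes, SDOT adds the dot product of the
// corresponding four int8 pairs into the accumulator lane.
static inline void sdot_model(int32_t acc[4], const int8_t a[16], const int8_t b[16])
{
    for (int lane = 0; lane < 4; lane++) {
        int32_t sum = 0;
        for (int j = 0; j < 4; j++) {
            sum += int32_t(a[lane * 4 + j]) * int32_t(b[lane * 4 + j]);
        }
        acc[lane] += sum;
    }
}
```

This is why the packing routines group k values in fours: one SDOT consumes four consecutive int8 per lane.
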
diff --git a/blas-enhance/src/cpu/arm/mmm.cpp b/blas-enhance/src/cpu/arm/mmm.cpp
deleted file mode 100644
index 8ba0abab..00000000
--- a/blas-enhance/src/cpu/arm/mmm.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#include "error.h"
-#include "type.h"
-#include "blas-enhance.h"
-#include "cpu/arm/blas_arm.h"
-#ifdef _USE_FP16
-#include "cpu/arm/fp16/blas_fp16.h"
-#endif
-#ifdef _USE_FP32
-#include "cpu/arm/fp32/blas_fp32.h"
-#endif
-#ifdef _USE_INT8
-#include "cpu/arm/int8/blas_int8.h"
-#endif
-
-
-EE matrix_matrix_multiply_tmp_bytes_arm(U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N,
-    DataType dt, U32 *bytes)
-{
-    EE ret = SUCCESS;
-    switch (dt) {
-#ifdef _USE_FP16
-        case DT_F16: {
-            matrix_matrix_multiply_tmp_bytes_fp16(matrixA_M, matrixA_K, matrixB_K, matrixB_N, dt, bytes);
-            break;
-        }
-#endif
-#ifdef _USE_FP32
-        case DT_F32: {
-            matrix_matrix_multiply_tmp_bytes_fp32(matrixA_M, matrixA_K, matrixB_K, matrixB_N, dt, bytes);
-            break;
-        }
-#endif
-#ifdef _USE_INT8
-        case DT_I8: {
-            matrix_matrix_multiply_tmp_bytes_int8(matrixA_M, matrixA_K, matrixB_K, matrixB_N, dt, bytes);
-            break;
-        }
-#endif
-        default:
-            ret = NOT_SUPPORTED;
-            break;
-    }
-
-    return ret;
-}
-
-EE matrix_matrix_multiply_transform_rhsN(TensorDesc desc, const void* src, TensorDesc* descTran, void* dst)
-{
-    EE ret = SUCCESS;
-    switch (desc.dt) {
-#ifdef _USE_FP16
-        case DT_F16: {
-            ret = matrix_matrix_multiply_transform_rhsN_fp16(desc, (F16*)src, (F16*)dst);
-            break;
-        }
-#endif
-#ifdef _USE_FP32
-        case DT_F32: {
-            ret = matrix_matrix_multiply_transform_rhsN_fp32(desc, (F32*)src, (F32*)dst);
-            break;
-        }
-#endif
-#ifdef _USE_INT8
-        case DT_I8: {
-            ret = matrix_matrix_multiply_transform_rhsN_int8(desc, (INT8*)src, (INT8*)dst);
-            break;
-        }
-#endif
-        default:
-            ret = NOT_SUPPORTED;
-            break;
-    }
-    (*descTran) = desc;
-    (*descTran).df = targetFormat4MatrixB(desc.dt);
-    return ret;
-}
-
-EE matrix_matrix_multiply_transform_rhsT(TensorDesc desc, const void* src, TensorDesc* descTran, void* dst)
-{
-    EE ret = SUCCESS;
-    switch (desc.dt) {
-#ifdef _USE_FP16
-        case DT_F16: {
-            ret = matrix_matrix_multiply_transform_rhsT_fp16(desc, (F16*)src, (F16*)dst);
-            break;
-        }
-#endif
-#ifdef _USE_FP32
-        case DT_F32: {
-            ret = matrix_matrix_multiply_transform_rhsT_fp32(desc, (F32*)src, (F32*)dst);
-            break;
-        }
-#endif
-#ifdef _USE_INT8
-        case DT_I8: {
-            ret = matrix_matrix_multiply_transform_rhsT_int8(desc, (INT8*)src, (INT8*)dst);
-            break;
-        }
-#endif
-        default:
-            ret = NOT_SUPPORTED;
-            break;
-    }
-    (*descTran) = desc;
-    (*descTran).df = targetFormat4MatrixB(desc.dt);
-    return ret;
-}
-
-EE matrix_matrix_multiply_transform_rhs(TensorDesc desc, const void* src, TensorDesc* descTran, void* dst)
-{
-    if (desc.df == targetFormat4MatrixB(desc.dt)) {
-        return SUCCESS;
-    }
-    EE ret = SUCCESS;
-    switch (desc.df) {
-        case DF_NORMAL: {
-            ret = matrix_matrix_multiply_transform_rhsN(desc, src, descTran, dst);
-            break;
-        }
-        case DF_TRANSPOSE: {
-            ret = matrix_matrix_multiply_transform_rhsT(desc, src, descTran, dst);
-            break;
-        }
-        default:
-            ret = NOT_SUPPORTED;
-            break;
-    }
-    return ret;
-}
-
-EE mmm_arm(U32 matrixC_N, U32 matrixC_M, U32 matrixA_K,
-    DataType dt,
-    const void* matrixAData, const void* matrixBData,
-    void* tmp,
-    void* matrixCData,
-    Arch arch)
-{
-    EE ret = SUCCESS;
-    switch (dt) {
-#ifdef _USE_FP16
-        case DT_F16: {
-            ret = mmm_fp16(matrixC_N, matrixC_M, matrixA_K, (F16*)matrixAData, (F16*)matrixBData, (F16*)tmp, (F16*)matrixCData, arch);
-            break;
-        }
-#endif
-#ifdef _USE_FP32
-        case DT_F32: {
-#ifdef __aarch64__
-            ret = mmm_fp32_V8(matrixC_N, matrixC_M, matrixA_K, (F32*)matrixAData, (F32*)matrixBData, (F32*)tmp, (F32*)matrixCData);
-#else
-            ret = mmm_fp32_V7(matrixC_N, matrixC_M, matrixA_K, (F32*)matrixAData, (F32*)matrixBData, (F32*)tmp, (F32*)matrixCData);
-#endif
-            break;
-        }
-#endif
-#ifdef _USE_INT8
-        case DT_I8: {
-            ret = mmm_int8(matrixC_N, matrixC_M, matrixA_K, (INT8*)matrixAData, (INT8*)matrixBData, (INT8*)tmp, (I32*)matrixCData, arch);
-            break;
-        }
-#endif
-        default:
-            ret = NOT_SUPPORTED;
-            break;
-    }
-    return ret;
-}
diff --git a/blas-enhance/src/cpu/arm/mvm.cpp b/blas-enhance/src/cpu/arm/mvm.cpp
deleted file mode 100644
index 58de421e..00000000
--- a/blas-enhance/src/cpu/arm/mvm.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-#include "error.h"
-#include "type.h"
-#include "cpu/arm/blas_arm.h"
-#ifdef _USE_FP16
-#include "cpu/arm/fp16/blas_fp16.h"
-#endif
-#ifdef _USE_FP32
-#include "cpu/arm/fp32/blas_fp32.h"
-#endif
-#ifdef _USE_INT8
-#include "cpu/arm/int8/blas_int8.h"
-#endif
-
-EE matrix_vector_multiply_tmp_bytes_arm(bool transpose,
-    DataType dt, U32 *bytes)
-{
-    if (nullptr == bytes)
-        CHECK_STATUS(NULL_POINTER);
-    switch (dt) {
-#ifdef _USE_FP16
-        case DT_F16:
-            *bytes = 0;
-            break;
-#endif
-#ifdef _USE_FP32
-        case DT_F32:
-            *bytes = 0;
-            break;
-#endif
-#ifdef _USE_INT8
-        case DT_I8: {
-            if (transpose)
-                *bytes = 64 * sizeof(I32);
-            break;
-        }
-#endif
-        default:
-            break;
-    }
-    return SUCCESS;
-}
-
-EE mvm_arm(U32 row, U32 col, DataType dt, bool transpose,
-    const void *matrix, const void *vector,
-    void *tmp,
-    void *result,
-    Arch arch)
-{
-    EE ret = SUCCESS;
-    switch (dt) {
-#ifdef _USE_FP16
-        case DT_F16:
-            ret = mvm_fp16(row, col, transpose, (F16*)matrix, (F16*)vector, (F16*)result, arch);
-            break;
-#endif
-#ifdef _USE_FP32
-        case DT_F32:
-            ret = mvm_fp32(row, col, transpose, (F32*)matrix, (F32*)vector, (F32*)result);
-            break;
-#endif
-#ifdef _USE_INT8
-        case DT_I8:
-            ret = mvm_int8(row, col, transpose, (INT8*)matrix, (INT8*)vector, (I32*)tmp, (I32*)result);
-            break;
-#endif
-        default:
-            ret = NOT_SUPPORTED;
-            break;
-    }
-    return ret;
-}
diff --git a/blas-enhance/src/cpu/general/blas_general.h b/blas-enhance/src/cpu/general/blas_general.h
deleted file mode 100644
index fe978a35..00000000
--- a/blas-enhance/src/cpu/general/blas_general.h
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _H_BLAS_GENERAL
-#define _H_BLAS_GENERAL
-
-#include "sys.h"
-#include "type.h"
-
-
-EE mvm_general(U32 row, U32 col, DataType dt, bool transpose, const void *matrix, const void *vector, void *result);
-
-EE mmm_general(U32 matrixC_N, U32 matrixC_M, U32 matrixA_K,
-    bool transposeA, bool transposeB,
-    DataType matrixADataType,
-    const void* matrixAData, const void* matrixBData,
-    void* matrixCData);
-
-#endif
diff --git a/blas-enhance/src/cpu/general/mmm.cpp b/blas-enhance/src/cpu/general/mmm.cpp
deleted file mode 100644
index 4b6cc208..00000000
--- a/blas-enhance/src/cpu/general/mmm.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#include "error.h"
-#include "type.h"
-#include "cpu/general/blas_general.h"
-
-
-template<typename T1, typename T2>
-inline void mmm(U32 N, U32 M, U32 K, bool transposeA, bool transposeB, T1* matrixA, T1* matrixB, T2* matrixC) {
-    for (U32 i =0; i < M; i++) {
-        for (U32 n = 0; n < N; n++) {
-            F32 value = 0;
-            for (U32 j = 0; j < K; j++) {
-                U32 indexA = 0, indexB = 0;
-                if (transposeA)
-                    indexA = j * M + i;
-                else
-                    indexA = i * K + j;
-                if (transposeB)
-                    indexB = n * K + j;
-                else
-                    indexB = j * N + n;
-                value += matrixA[indexA] * matrixB[indexB];
-            }
-            matrixC[i * N + n] += value;
-        }
-    }
-}
-
-EE mmm_general(U32 matrixC_N, U32 matrixC_M, U32 matrixA_K,
-    bool transposeA, bool transposeB,
-    DataType dt,
-    const void* matrixAData, const void* matrixBData,
-    void* matrixCData)
-{
-    EE ret = SUCCESS;
-    switch (dt) {
-#ifdef _USE_FP16
-        case DT_F16: {
-            mmm(matrixC_N, matrixC_M, matrixA_K, transposeA, transposeB, (F16*)matrixAData, (F16*)matrixBData, (F16*)matrixCData);
-            break;
-        }
-#endif
-#ifdef _USE_INT8
-        case DT_I8: {
-            mmm(matrixC_N, matrixC_M, matrixA_K, transposeA, transposeB, (INT8*)matrixAData, (INT8*)matrixBData, (I32*)matrixCData);
-            break;
-        }
-#endif
-#ifdef _USE_FP32
-        case DT_F32: {
-            mmm(matrixC_N, matrixC_M, matrixA_K, transposeA, transposeB, (F32*)matrixAData, (F32*)matrixBData, (F32*)matrixCData);
-            break;
-        }
-#endif
-        default:
-            ret = NOT_SUPPORTED;
-            break;
-    }
-    return ret;
-}
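
The naive kernel above is the ground-truth path (`CPU_GENERAL`) the NEON kernels are checked against. A minimal standalone check of its semantics (assuming row-major storage and no transposes; not code from this change):

```cpp
#include <cassert>
#include <cstdint>

int main()
{
    // A is 2x3 (M x K), B is 3x2 (K x N), C is 2x2 (M x N), all row-major.
    float A[6] = {1, 2, 3, 4, 5, 6};
    float B[6] = {7, 8, 9, 10, 11, 12};
    float C[4] = {0, 0, 0, 0};
    const uint32_t M = 2, N = 2, K = 3;
    for (uint32_t i = 0; i < M; i++) {
        for (uint32_t n = 0; n < N; n++) {
            float value = 0;
            for (uint32_t j = 0; j < K; j++) {
                value += A[i * K + j] * B[j * N + n];  // transposeA/B == false
            }
            C[i * N + n] += value;  // note: C is accumulated, not overwritten
        }
    }
    assert(C[0] == 58 && C[1] == 64 && C[2] == 139 && C[3] == 154);
    return 0;
}
```
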
diff --git a/blas-enhance/src/cpu/general/mvm.cpp b/blas-enhance/src/cpu/general/mvm.cpp
deleted file mode 100644
index a8f9a75f..00000000
--- a/blas-enhance/src/cpu/general/mvm.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#include "error.h"
-#include "type.h"
-#include "cpu/general/blas_general.h"
-
-template<typename T1, typename T2>
-inline void mvm(U32 M, U32 K, bool transpose, T1* mat, T1 *vec, T2* res) {
-    if (! transpose) {
-        for (U32 i = 0; i < M; i++) {
-            F32 out_f = 0;
-            for (U32 j = 0; j < K; j++) {
-                out_f += mat[i * K + j] * vec[j];
-            }
-            res[i] += out_f;
-        }
-    } else {
-        for (U32 i = 0; i < M; i++) {
-            F32 out_f = 0;
-            for (U32 j = 0; j < K; j++) {
-                out_f += mat[j * M + i] * vec[j];
-            }
-            res[i] += out_f;
-        }
-    }
-}
-
-EE mvm_general(U32 row, U32 col, DataType dt, bool transpose, const void *matrix, const void *vector, void *result) {
-    EE ret = SUCCESS;
-    switch (dt) {
-#ifdef _USE_FP16
-        case DT_F16:
-            mvm(row, col, transpose, (F16*)matrix, (F16*)vector, (F16*)result);
-            break;
-#endif
-#ifdef _USE_INT8
-        case DT_I8:
-            mvm(row, col, transpose, (INT8*)matrix, (INT8*)vector, (I32*)result);
-            break;
-#endif
-#ifdef _USE_FP32
-        case DT_F32:
-            mvm(row, col, transpose, (F32*)matrix, (F32*)vector, (F32*)result);
-            break;
-#endif
-        default:
-            ret = NOT_SUPPORTED;
-            break;
-    }
-    return ret;
-}
diff --git a/blas-enhance/src/mmm.cpp b/blas-enhance/src/mmm.cpp
deleted file mode 100644
index 2e9e238d..00000000
--- a/blas-enhance/src/mmm.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#include "blas-enhance.h"
-#ifdef _USE_GENERAL
-#include "cpu/general/blas_general.h"
-#endif
-#ifdef _USE_NEON
-#include "cpu/arm/blas_arm.h"
-#endif
-
-EE matrix_matrix_multiply_tmp_bytes(TensorDesc matrixADesc, TensorDesc matrixBDesc, U32* bytes, Arch arch)
-{
-    DataType matrixADataType, matrixBDataType;
-    U32 matrixA_M, matrixA_K, matrixB_K, matrixB_N;
-    CHECK_STATUS(tensor2dGet(matrixADesc, &matrixADataType, &matrixA_M, &matrixA_K));
-    CHECK_STATUS(tensor2dGet(matrixBDesc, &matrixBDataType, &matrixB_K, &matrixB_N));
-
-    EE ret = NOT_SUPPORTED;
-    if (arch == CPU_GENERAL) {
-#ifdef _USE_GENERAL
-        ret = SUCCESS;
-#endif
-#ifdef _USE_NEON
-    } else {
-        ret = matrix_matrix_multiply_tmp_bytes_arm(matrixA_M, matrixA_K, matrixB_K, matrixB_N, matrixADataType, bytes);
-#endif
-    }
-    return ret;
-}
-
-EE matrix_matrix_multiply(TensorDesc matrixADesc, const void* matrixAData,
-    TensorDesc matrixBDesc, const void* matrixBData,
-    U32 bytes, void* tmp,
-    TensorDesc matrixCDesc, void* matrixCData,
-    Arch arch)
-{
-    if (bytes != 0 && tmp == nullptr) {
-        CHECK_STATUS(NULL_POINTER);
-    }
-    if (nullptr == matrixAData || nullptr == matrixBData || nullptr == matrixCData) {
-        CHECK_STATUS(NULL_POINTER);
-    }
-
-    DataType matrixADataType, matrixBDataType, matrixCDataType;
-    DataFormat matrixADataFormat, matrixBDataFormat;
-    U32 matrixA_M, matrixA_K, matrixB_K, matrixB_N, matrixC_M, matrixC_N;
-    CHECK_STATUS(tensor2dfGet(matrixADesc, &matrixADataType, &matrixADataFormat, &matrixA_M, &matrixA_K));
-    CHECK_STATUS(tensor2dfGet(matrixBDesc, &matrixBDataType, &matrixBDataFormat, &matrixB_K, &matrixB_N));
-    CHECK_STATUS(tensor2dGet(matrixCDesc, &matrixCDataType, &matrixC_M, &matrixC_N));
-
-    if (matrixADataType != matrixBDataType)
-        CHECK_STATUS(NOT_MATCH);
-    if (matrixADataType != matrixCDataType)
-        if (matrixADataType != DT_I8 || matrixCDataType != DT_I32)
-            CHECK_STATUS(NOT_MATCH);
-
-    bool transposeA = false, transposeB = false;
-    if (matrixADataFormat == DF_TRANSPOSE) {
-        std::swap(matrixA_M, matrixA_K);
-        transposeA = true;
-    }
-    if (matrixBDataFormat == DF_TRANSPOSE) {
-        std::swap(matrixB_K, matrixB_N);
-        transposeB = true;
-    }
-    if (matrixA_M != matrixC_M || matrixB_N != matrixC_N || matrixA_K != matrixB_K)
-        CHECK_STATUS(NOT_MATCH);
-
-    EE ret = NOT_SUPPORTED;
-    if (arch == CPU_GENERAL) {
-#ifdef _USE_GENERAL
-        ret = mmm_general(matrixC_N, matrixC_M, matrixA_K, transposeA, transposeB, matrixADataType, matrixAData, matrixBData, matrixCData);
-#endif
-#ifdef _USE_NEON
-    } else {
-        TensorDesc tranDescB;
-        U8 *dataB = (U8*)matrixBData;
-        if (matrixBDataFormat != targetFormat4MatrixB(matrixBDataType)) {
-            U32 K = matrixA_K;
-            if (DT_I8 == matrixADataType) {
-                K = pad_to_4_multiple(K);
-            }
-            dataB = ((U8*)tmp) + matrixA_M * K * bytesOf(matrixADataType);
-            ret = matrix_matrix_multiply_transform_rhs(matrixBDesc, matrixBData, &tranDescB, dataB);
-        }
-        ret = mmm_arm(matrixC_N, matrixC_M, matrixA_K, matrixADataType, matrixAData, dataB, tmp, matrixCData, arch);
-#endif
-    }
-    return ret;
-}
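
For context, a hedged usage sketch of the deleted driver above, built only from the signatures visible in this diff. The `tensor2df` descriptor helper is an assumption (a helper of that shape is used elsewhere in the repo); treat this as a sketch, not a drop-in test:

```cpp
#include <vector>
#include "blas-enhance.h"

// Sketch: C (M x N) += A (M x K) * B (K x N), all row-major FP32.
// tensor2df(...) is assumed to build a 2D TensorDesc with a DataFormat.
void run_mmm_sketch(U32 M, U32 K, U32 N, const F32 *A, const F32 *B, F32 *C, Arch arch)
{
    TensorDesc descA = tensor2df(DT_F32, DF_NORMAL, M, K);
    TensorDesc descB = tensor2df(DT_F32, DF_NORMAL, K, N);
    TensorDesc descC = tensor2df(DT_F32, DF_NORMAL, M, N);

    U32 bytes = 0;
    matrix_matrix_multiply_tmp_bytes(descA, descB, &bytes, arch);  // scratch for B packing
    std::vector<U8> tmp(bytes);

    matrix_matrix_multiply(descA, A, descB, B, bytes, tmp.data(), descC, C, arch);
}
```

Note how the NEON branch of `matrix_matrix_multiply` repacks B into the scratch buffer via `matrix_matrix_multiply_transform_rhs` before dispatching to `mmm_arm`; the scratch query exists precisely so callers can preallocate that packing space.
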
diff --git a/blas-enhance/src/mvm.cpp b/blas-enhance/src/mvm.cpp
deleted file mode 100644
index bcdddeb1..00000000
--- a/blas-enhance/src/mvm.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#include "blas-enhance.h"
-#ifdef _USE_GENERAL
-#include "cpu/general/blas_general.h"
-#endif
-#ifdef _USE_NEON
-#include "cpu/arm/blas_arm.h"
-#endif
-
-
-EE matrix_vector_multiply_tmp_bytes(TensorDesc matrixDesc, TensorDesc vectorDesc, U32* bytes, Arch arch)
-{
-    UNUSED(vectorDesc);
-
-    bool transpose = (matrixDesc.df == DF_TRANSPOSE);
-    EE ret = NOT_SUPPORTED;
-    if (arch == CPU_GENERAL) {
-#ifdef _USE_GENERAL
-        ret = SUCCESS;
-#endif
-#ifdef _USE_NEON
-    } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) {
-        ret = matrix_vector_multiply_tmp_bytes_arm(transpose, matrixDesc.dt, bytes);
-#endif
-    }
-    return ret;
-}
-
-EE matrix_vector_multiply(TensorDesc matrixDesc, const void* matrix,
-    TensorDesc vectorDesc, const void* vector,
-    U32 bytes, void* tmp,
-    TensorDesc resultDesc, void* result,
-    Arch arch)
-{
-    if (bytes != 0 && tmp == nullptr) {
-        CHECK_STATUS(NULL_POINTER);
-    }
-    if (nullptr == matrix || nullptr == vector || nullptr == result) {
-        CHECK_STATUS(NULL_POINTER);
-    }
-    DataType matrixDataType, vectorDataType, resultDataType;
-    DataFormat matrixDataFormat;
-    U32 matrixRow, matrixColumn, vectorColumn, resultColumn;
-    CHECK_STATUS(tensor2dfGet(matrixDesc, &matrixDataType, &matrixDataFormat, &matrixRow, &matrixColumn));
-    CHECK_STATUS(tensor1dGet(vectorDesc, &vectorDataType, &vectorColumn));
-    CHECK_STATUS(tensor1dGet(resultDesc, &resultDataType, &resultColumn));
-
-    if (matrixDataType != vectorDataType)
-        CHECK_STATUS(NOT_MATCH);
-    if (matrixDataType != resultDataType)
-        if (matrixDataType != DT_I8 || resultDataType != DT_I32)
-            CHECK_STATUS(NOT_MATCH);
-
-    if (matrixRow != resultColumn || matrixColumn != vectorColumn)
-        CHECK_STATUS(NOT_MATCH);
-
-    bool transpose = (matrixDataFormat == DF_TRANSPOSE);
-    EE ret = NOT_SUPPORTED;
-    if (arch == CPU_GENERAL) {
-#ifdef _USE_GENERAL
-        ret = mvm_general(matrixRow, matrixColumn, matrixDataType, transpose, matrix, vector, result);
-#endif
-#ifdef _USE_NEON
-    } else {
-        ret = mvm_arm(matrixRow, matrixColumn, matrixDataType, transpose, matrix, vector, tmp, result, arch);
-#endif
-    }
-    return ret;
-}
diff --git a/bolt.cmake b/bolt.cmake
deleted file mode 100644
index adbcd27e..00000000
--- a/bolt.cmake
+++ /dev/null
@@ -1,207 +0,0 @@
-option(USE_CROSS_COMPILE "set use cross compile or not" ON)
-option(USE_GNU_GCC "set use GNU gcc compiler or not" OFF)
-option(USE_LLVM_CLANG "set use LLVM clang compiler or not" OFF)
-option(USE_DEBUG "set use debug information or not" OFF)
-option(USE_DYNAMIC_LIBRARY "set use dynamic library or not" OFF)
-option(USE_MINSIZEREL ".so lib will be 300KB smaller but performance will be affected" OFF)
-
-# model-tools variable
-option(USE_CAFFE "set use caffe model as input or not" ON)
-option(USE_ONNX "set use onnx model as input or not" ON)
-option(USE_TFLITE "set use tflite model as input or not" ON)
-
-# blas-enhance tensor_computing
-option(USE_GENERAL "set use CPU serial code or not" ON)
-option(USE_NEON "set use ARM NEON instruction or not" ON)
-option(USE_ARMV7 "set use ARMv7 NEON instruction or not" OFF)
-option(USE_ARMV8 "set use ARMv8 NEON instruction or not" ON)
-option(USE_FP32 "set use ARM NEON FP32 instruction or not" ON)
-option(USE_FP16 "set use ARM NEON FP16 instruction or not" ON)
-option(USE_F16_MIX_PRECISION "set use ARM NEON mix precision f16/f32 instruction or not" ON)
-option(USE_INT8 "set use ARM NEON INT8 instruction or not" ON)
-option(BUILD_TEST "set to build unit test or not" OFF)
-option(USE_OPENMP "set use OpenMP for parallel or not" ON)
-option(USE_MALI "set use mali for parallel or not" ON)
-option(USE_LIBRARY_TUNING "set use algorithm tuning or not" ON)
-
-set(BOLT_ROOT $ENV{BOLT_ROOT})
-
-function (set_policy)
-    cmake_policy(SET CMP0074 NEW)
-endfunction(set_policy)
-
-macro (set_c_cxx_flags)
-    set(COMMON_FLAGS "-W -Wall -Wextra -Wno-unused-command-line-argument -Wno-unused-parameter -O3")
-
-    if (USE_LIBRARY_TUNING)
-        set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_LIBRARY_TUNING")
-    endif(USE_LIBRARY_TUNING)
-
-    if (BUILD_TEST)
-        set(COMMON_FLAGS "${COMMON_FLAGS} -D_BUILD_TEST")
-    endif(BUILD_TEST)
-
-    if (USE_DEBUG)
-        set(COMMON_FLAGS "${COMMON_FLAGS} -D_DEBUG")
-        if (USE_LLVM_CLANG)
-            set(COMMON_FLAGS "${COMMON_FLAGS} -llog")
-        endif(USE_LLVM_CLANG)
-    endif(USE_DEBUG)
-
-    if (USE_GENERAL)
-        set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_GENERAL")
-    endif(USE_GENERAL)
-
-    if (USE_MALI)
-        set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_MALI")
-    endif(USE_MALI)
-
-    if (USE_NEON)
-        set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_NEON")
-
-        if (USE_ARMV8)
-            set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_ARMV8")
-        endif (USE_ARMV8)
-
-        if (USE_ARMV7)
-            set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_ARMV7 -march=armv7-a -mfloat-abi=softfp -mfpu=neon-vfpv4")
-            if (USE_LLVM_CLANG)
-                set(COMMON_FLAGS "${COMMON_FLAGS} -Wl,--allow-multiple-definition")
-            endif (USE_LLVM_CLANG)
-        endif (USE_ARMV7)
-
-        if (USE_FP32)
-            set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_FP32")
-        endif (USE_FP32)
-
-        if (USE_FP16)
-            set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_FP16")
-            if (USE_F16_MIX_PRECISION)
-                set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_F16_MIX_PRECISION")
-            endif (USE_F16_MIX_PRECISION)
-            if (USE_INT8)
-                if (USE_LLVM_CLANG)
-                    set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_INT8 -march=armv8-a+fp16+dotprod")
-                else (USE_LLVM_CLANG)
-                    set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_INT8 -march=armv8.2-a+fp16+dotprod")
-                endif (USE_LLVM_CLANG)
-            else (USE_INT8)
-                if (USE_LLVM_CLANG)
-                    set(COMMON_FLAGS "${COMMON_FLAGS} -march=armv8-a+fp16")
-                else (USE_LLVM_CLANG)
-                    set(COMMON_FLAGS "${COMMON_FLAGS} -march=armv8.2-a+fp16")
-                endif (USE_LLVM_CLANG)
-            endif (USE_INT8)
-        endif (USE_FP16)
-    endif(USE_NEON)
-
-    if (USE_CAFFE)
-        set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_CAFFE_MODEL")
-    endif()
-    if (USE_ONNX)
-        set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_ONNX_MODEL")
-    endif()
-    if (USE_TFLITE)
-        set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_TFLITE_MODEL")
-    endif()
-
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}")
-    link_libraries("-static-libstdc++")
-
-    if (USE_DEBUG)
-        set(CMAKE_BUILD_TYPE "Debug")
-    elseif (USE_MINSIZEREL)
-        set(CMAKE_BUILD_TYPE "MinSizeRel")
-    endif (USE_DEBUG)
-endmacro(set_c_cxx_flags)
-
-macro (set_test_c_cxx_flags)
-    if (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF")
-        if (USE_CROSS_COMPILE)
-            if (USE_GNU_GCC)
-                set(COMMON_FLAGS "${COMMON_FLAGS} -static")
-            endif(USE_GNU_GCC)
-        endif(USE_CROSS_COMPILE)
-    endif(${USE_DYNAMIC_LIBRARY} STREQUAL "OFF")
-
-    if (USE_LLVM_CLANG)
-        if (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF")
-            set(COMMON_FLAGS "${COMMON_FLAGS} -Wl,-allow-shlib-undefined, -static-libstdc++")
-        else (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF")
-            set(COMMON_FLAGS "${COMMON_FLAGS} -Wl,-allow-shlib-undefined")
-        endif(${USE_DYNAMIC_LIBRARY} STREQUAL "OFF")
-    endif(USE_LLVM_CLANG)
-
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}")
-endmacro (set_test_c_cxx_flags)
-
-macro (set_project_install_directory)
-    SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/bin)
-    SET(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/lib)
-endmacro (set_project_install_directory)
-
-function(blas_enhance name)
-    add_executable(${name} ${name}.cpp)
-    add_dependencies(${name} blas-enhance)
-    add_dependencies(${name} blas-enhance_static)
-    target_link_libraries(${name} ${BLAS_ENHANCE_LIBRARY})
-endfunction()
-
-function(tensor_computing name)
-    add_executable(${name} ${name}.cpp)
-    add_dependencies(${name} tensor_computing)
-    add_dependencies(${name} tensor_computing_static)
-    target_link_libraries(${name} ${TENSOR_COMPUTING_LIBRARIES})
-    if(USE_MALI)
-        target_link_libraries(${name} ${OPENCL_LIBRARIES})
-    endif(USE_MALI)
-endfunction()
-
-function(image name)
-    add_executable(${name} ${name}.cpp)
-    add_dependencies(${name} image)
-    add_dependencies(${name} image_static)
-    target_link_libraries(${name} ${IMAGE_LIBRARIES})
-endfunction()
-
-function(model_tools name)
-    add_executable(${name} ${name}.cpp)
-    if (USE_CAFFE)
-        add_dependencies(${name} model-tools)
-        add_dependencies(${name} model-tools_static)
-        add_dependencies(${name} model-tools_caffe)
-        add_dependencies(${name} model-tools_caffe_static)
-        TARGET_LINK_LIBRARIES(${name} ${MODEL_TOOLS_LIBRARIES})
-    endif (USE_CAFFE)
-
-    if (USE_ONNX)
-        add_dependencies(${name} model-tools)
-        add_dependencies(${name} model-tools_static)
-        add_dependencies(${name} model-tools_onnx)
-        add_dependencies(${name} model-tools_onnx_static)
-        TARGET_LINK_LIBRARIES(${name} ${MODEL_TOOLS_LIBRARIES})
-    endif (USE_ONNX)
-
-    if (USE_TFLITE)
-        add_dependencies(${name} model-tools)
-        add_dependencies(${name} model-tools_static)
-        add_dependencies(${name} model-tools_tflite)
-        add_dependencies(${name} model-tools_tflite_static)
-        TARGET_LINK_LIBRARIES(${name} ${MODEL_TOOLS_LIBRARIES})
-    endif (USE_TFLITE)
-endfunction()
-
-function(inference name src_name)
-    add_executable(${name} ${src_name})
-    if (USE_DYNAMIC_LIBRARY)
-        TARGET_LINK_LIBRARIES(${name} inference)
-    else (USE_DYNAMIC_LIBRARY)
-        TARGET_LINK_LIBRARIES(${name} inference_static)
-    endif (USE_DYNAMIC_LIBRARY)
-    TARGET_LINK_LIBRARIES(${name} ${INFERENCE_LIBRARIES} ${JPEG_LIBRARY})
-    if (USE_MALI)
-        TARGET_LINK_LIBRARIES(${name} ${KERNELBIN_LIBRARIES} ${OPENCL_LIBRARIES})
-    endif (USE_MALI)
-endfunction()
diff --git a/cmakes/FindBlasEnhance.cmake b/cmakes/FindBlasEnhance.cmake
deleted file mode 100644
index 2b3f4af8..00000000
--- a/cmakes/FindBlasEnhance.cmake
+++ /dev/null
@@ -1,26 +0,0 @@
-set(BLAS_ENHANCE_PROJECT_NAME "blas-enhance")
-unset(BLAS_ENHANCE_ROOT)
-find_path(BLAS_ENHANCE_ROOT NAMES ${BLAS_ENHANCE_PROJECT_NAME} HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT})
-set(BLAS_ENHANCE_ROOT "${BLAS_ENHANCE_ROOT}/${BLAS_ENHANCE_PROJECT_NAME}")
-
-set(BLAS_ENHANCE_INCLUDE_DIR "${BLAS_ENHANCE_ROOT}/include")
-if (USE_DYNAMIC_LIBRARY)
-    set(BLAS_ENHANCE_LIBRARY "${BLAS_ENHANCE_ROOT}/lib/lib${BLAS_ENHANCE_PROJECT_NAME}.so")
-else (USE_DYNAMIC_LIBRARY)
-    set(BLAS_ENHANCE_LIBRARY "${BLAS_ENHANCE_ROOT}/lib/lib${BLAS_ENHANCE_PROJECT_NAME}.a")
-endif (USE_DYNAMIC_LIBRARY)
-
-if (BLAS_ENHANCE_INCLUDE_DIR AND BLAS_ENHANCE_LIBRARY)
-    set(BLAS_ENHANCE_FOUND true)
-endif (BLAS_ENHANCE_INCLUDE_DIR AND BLAS_ENHANCE_LIBRARY)
-
-if (BLAS_ENHANCE_FOUND)
-    include_directories(include ${BLAS_ENHANCE_INCLUDE_DIR})
-    message(STATUS "Found ${BLAS_ENHANCE_PROJECT_NAME}.h: ${BLAS_ENHANCE_INCLUDE_DIR}")
-    message(STATUS "Found ${BLAS_ENHANCE_PROJECT_NAME}: ${BLAS_ENHANCE_LIBRARY}")
-else (BLAS_ENHANCE_FOUND)
-    message(FATAL_ERROR "
-FATAL: can not find ${BLAS_ENHANCE_PROJECT_NAME} library in /${BLAS_ENHANCE_PROJECT_NAME}/[include/lib] directory,
-       please set shell or cmake environment variable BOLT_ROOT.
-    ")
-endif (BLAS_ENHANCE_FOUND)
diff --git a/cmakes/FindGcl.cmake b/cmakes/FindGcl.cmake
deleted file mode 100644
index 5f4973c6..00000000
--- a/cmakes/FindGcl.cmake
+++ /dev/null
@@ -1,55 +0,0 @@
-find_path(GCL_ROOT NAMES gcl HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT})
-set(GCL_ROOT "${GCL_ROOT}/gcl")
-
-set(GCL_INCLUDE_DIR "${GCL_ROOT}/include")
-
-if (GCL_INCLUDE_DIR)
-    set(GCL_FOUND true)
-endif (GCL_INCLUDE_DIR)
-
-if (GCL_FOUND)
-    include_directories(include ${GCL_INCLUDE_DIR})
-    message(STATUS "Found gcl.h: ${GCL_INCLUDE_DIR}")
-else (GCL_FOUND)
-    message(FATAL_ERROR "
-FATAL: can not find gcl.h in /gcl/include directory,
-       please set shell or cmake environment variable BOLT_ROOT.
-    ")
-endif (GCL_FOUND)
-
-find_package(OpenCL)
-
-set(GCL_KERNELBIN_INCLUDE_DIR "${GCL_ROOT}/kernelBin/include")
-if (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF")
-    set(GCL_KERNELBIN_LIBRARY "${GCL_ROOT}/tools/kernel_lib_compile/lib/libkernelbin.a")
-else (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF")
-    set(GCL_KERNELBIN_LIBRARY "${GCL_ROOT}/tools/kernel_lib_compile/lib/libkernelbin.so")
-endif (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF")
-
-if(GCL_KERNELBIN_INCLUDE_DIR)
-    set(KERNELBIN_HEAD_FOUND true)
-endif(GCL_KERNELBIN_INCLUDE_DIR)
-
-if(GCL_KERNELBIN_LIBRARY)
-    set(KERNELBIN_LIB_FOUND true)
-endif(GCL_KERNELBIN_LIBRARY)
-
-
-if (KERNELBIN_HEAD_FOUND)
-    include_directories(include ${GCL_KERNELBIN_INCLUDE_DIR})
-    message(STATUS "Found kernel bin head file: ${GCL_KERNELBIN_INCLUDE_DIR}")
-else (KERNELBIN_HEAD_FOUND)
-    message(FATAL_ERROR "
-FATAL: can not find kernelbin header files in /gcl/kernelBin/include directory,
-       please set shell or cmake environment variable BOLT_ROOT.
-    ")
-endif (KERNELBIN_HEAD_FOUND)
-
-if (KERNELBIN_LIB_FOUND)
-    set(KERNELBIN_LIBRARIES "${GCL_KERNELBIN_LIBRARY}")
-else (KERNELBIN_LIB_FOUND)
-    message(FATAL_ERROR "
-FATAL: can not find libkernelbin.a in /gcl/tools/kernel_lib_compile directory,
-       please set shell or cmake environment variable BOLT_ROOT.
-    ")
-endif (KERNELBIN_LIB_FOUND)
- ") -endif (KERNELBIN_LIB_FOUND) diff --git a/cmakes/FindImage.cmake b/cmakes/FindImage.cmake deleted file mode 100644 index e6c0f197..00000000 --- a/cmakes/FindImage.cmake +++ /dev/null @@ -1,33 +0,0 @@ -unset(IMAGE_ROOT) -find_path(IMAGE_ROOT NAMES image HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT}) -set(IMAGE_ROOT "${IMAGE_ROOT}/image") - -set(IMAGE_INCLUDE_DIR "${IMAGE_ROOT}/include") -if (USE_DYNAMIC_LIBRARY) - set(IMAGE_LIBRARY "${IMAGE_ROOT}/lib/libimage.so") -else (USE_DYNAMIC_LIBRARY) - set(IMAGE_LIBRARY "${IMAGE_ROOT}/lib/libimage.a") -endif (USE_DYNAMIC_LIBRARY) - -if (IMAGE_INCLUDE_DIR AND IMAGE_LIBRARY) - set(IMAGE_FOUND true) -endif (IMAGE_INCLUDE_DIR AND IMAGE_LIBRARY) - -if (IMAGE_FOUND) - if (USE_GNU_GCC) - set(IMAGE_LIBRARIES "${IMAGE_LIBRARY};-lpthread;-ldl") - endif(USE_GNU_GCC) - if (USE_LLVM_CLANG) - set(IMAGE_LIBRARIES "${IMAGE_LIBRARY}") - endif(USE_LLVM_CLANG) - - include_directories(include ${IMAGE_INCLUDE_DIR}) - - message(STATUS "Found image.h: ${IMAGE_INCLUDE_DIR}") - message(STATUS "Found image: ${IMAGE_LIBRARY}") -else (IMAGE_FOUND) - message(FATAL_ERROR " -FATAL: can not find image library in /image/lib directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (IMAGE_FOUND) diff --git a/cmakes/FindInference.cmake b/cmakes/FindInference.cmake deleted file mode 100644 index 951cbcaf..00000000 --- a/cmakes/FindInference.cmake +++ /dev/null @@ -1,34 +0,0 @@ -set(INFERENCE_PROJECT_NAME "inference") -unset(INFERENCE_ROOT) -find_path(INFERENCE_ROOT NAMES ${INFERENCE_PROJECT_NAME} HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT}) -set(INFERENCE_ROOT "${INFERENCE_ROOT}/${INFERENCE_PROJECT_NAME}") - -set(INFERENCE_INCLUDE_DIR "${INFERENCE_ROOT}/include") -if (USE_DYNAMIC_LIBRARY) - set(INFERENCE_LIBRARY "${INFERENCE_ROOT}/lib/lib${INFERENCE_PROJECT_NAME}.so") -else (USE_DYNAMIC_LIBRARY) - set(INFERENCE_LIBRARY "${INFERENCE_ROOT}/lib/lib${INFERENCE_PROJECT_NAME}.a") -endif (USE_DYNAMIC_LIBRARY) - -if (INFERENCE_INCLUDE_DIR AND INFERENCE_LIBRARY) - set(INFERENCE_FOUND true) -endif (INFERENCE_INCLUDE_DIR AND INFERENCE_LIBRARY) - -find_package(BlasEnhance) -find_package(TensorComputing) -find_package(ModelTools) -find_package(Image) - -if (INFERENCE_FOUND) - set(INFERENCE_LIBRARIES "${INFERENCE_LIBRARY};${IMAGE_LIBRARY};${MODEL_TOOLS_LIBRARY};${TENSOR_COMPUTING_LIBRARY};${BLAS_ENHANCE_LIBRARY}") - include_directories(include ${INFERENCE_INCLUDE_DIR}) - message(STATUS "Found ${INFERENCE_PROJECT_NAME}.hpp: ${INFERENCE_INCLUDE_DIR}") - message(STATUS "Found ${INFERENCE_PROJECT_NAME}: ${INFERENCE_LIBRARIES}") -else (INFERENCE_FOUND) - message(FATAL_ERROR " -FATAL: can not find ${INFERENCE_PROJECT_NAME} library in /${INFERENCE_PROJECT_NAME}/[include/lib] directory, - please set shell or cmake environment variable BOLT_ROOT. 
- ") -endif (INFERENCE_FOUND) - -message(STATUS ${INFERENCE_LIBRARIES}) diff --git a/cmakes/FindModelTools.cmake b/cmakes/FindModelTools.cmake deleted file mode 100644 index 3b283eea..00000000 --- a/cmakes/FindModelTools.cmake +++ /dev/null @@ -1,37 +0,0 @@ -set(MODEL_TOOLS_PROJECT_NAME "model-tools") -unset(MODEL_TOOLS_ROOT) -find_path(MODEL_TOOLS_ROOT NAMES ${MODEL_TOOLS_PROJECT_NAME} HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT}) -set(MODEL_TOOLS_ROOT "${MODEL_TOOLS_ROOT}/${MODEL_TOOLS_PROJECT_NAME}") - -set(MODEL_TOOLS_INCLUDE_DIR "${MODEL_TOOLS_ROOT}/include") -if (USE_DYNAMIC_LIBRARY) - set(MODEL_TOOLS_LIBRARY "${MODEL_TOOLS_ROOT}/lib/lib${MODEL_TOOLS_PROJECT_NAME}.so") -else (USE_DYNAMIC_LIBRARY) - set(MODEL_TOOLS_LIBRARY "${MODEL_TOOLS_ROOT}/lib/lib${MODEL_TOOLS_PROJECT_NAME}.a") -endif (USE_DYNAMIC_LIBRARY) - -if (MODEL_TOOLS_INCLUDE_DIR AND MODEL_TOOLS_LIBRARY) - set(MODEL_TOOLS_FOUND true) -endif (MODEL_TOOLS_INCLUDE_DIR AND MODEL_TOOLS_LIBRARY) - -if (USE_CAFFE) - find_package(ModelToolsCaffe) -endif (USE_CAFFE) -if (USE_ONNX) - find_package(ModelToolsOnnx) -endif(USE_ONNX) -if (USE_TFLITE) - find_package(ModelToolsTFLite) -endif (USE_TFLITE) - -if (MODEL_TOOLS_FOUND) - set(MODEL_TOOLS_LIBRARIES "${MODEL_TOOLS_LIBRARY};${MODEL_TOOLS_CAFFE_LIBRARIES};${MODEL_TOOLS_ONNX_LIBRARIES};${MODEL_TOOLS_TFLITE_LIBRARIES}") - include_directories(include ${MODEL_TOOLS_INCLUDE_DIR}) - message(STATUS "Found ${MODEL_TOOLS_PROJECT_NAME}.h: ${MODEL_TOOLS_INCLUDE_DIR}") - message(STATUS "Found ${MODEL_TOOLS_PROJECT_NAME}: ${MODEL_TOOLS_LIBRARIES}") -else (MODEL_TOOLS_FOUND) - message(FATAL_ERROR " -FATAL: can not find lib${MODEL_TOOLS_PROJECT_NAME}.* library in /model-tools/lib directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (MODEL_TOOLS_FOUND) diff --git a/cmakes/FindModelToolsCaffe.cmake b/cmakes/FindModelToolsCaffe.cmake deleted file mode 100644 index dace7a38..00000000 --- a/cmakes/FindModelToolsCaffe.cmake +++ /dev/null @@ -1,33 +0,0 @@ -set(MODEL_TOOLS_PROJECT_NAME "model-tools") -unset(MODEL_TOOLS_ROOT) -find_path(MODEL_TOOLS_ROOT NAMES ${MODEL_TOOLS_PROJECT_NAME} HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT}) -set(MODEL_TOOLS_ROOT "${MODEL_TOOLS_ROOT}/${MODEL_TOOLS_PROJECT_NAME}") - -if (USE_DYNAMIC_LIBRARY) - set(MODEL_TOOLS_CAFFE_LIBRARY "${MODEL_TOOLS_ROOT}/lib/lib${MODEL_TOOLS_PROJECT_NAME}_caffe.so") -else (USE_DYNAMIC_LIBRARY) - set(MODEL_TOOLS_CAFFE_LIBRARY "${MODEL_TOOLS_ROOT}/lib/lib${MODEL_TOOLS_PROJECT_NAME}_caffe.a") -endif (USE_DYNAMIC_LIBRARY) - -if (MODEL_TOOLS_CAFFE_LIBRARY) - set(MODEL_TOOLS_CAFFE_FOUND true) -endif (MODEL_TOOLS_CAFFE_LIBRARY) - -find_package(Protobuf) - -if (MODEL_TOOLS_CAFFE_FOUND) - if (USE_GNU_GCC) - set(MODEL_TOOLS_CAFFE_LIBRARIES "${MODEL_TOOLS_CAFFE_LIBRARY};${Protobuf_LIBRARY};-lpthread") - endif(USE_GNU_GCC) - if (USE_LLVM_CLANG) - set(MODEL_TOOLS_CAFFE_LIBRARIES "${MODEL_TOOLS_CAFFE_LIBRARY};${Protobuf_LIBRARY};-lz") - endif(USE_LLVM_CLANG) - - include_directories(include ${Protobuf_INCLUDE_DIR}) - message(STATUS "Found ${MODEL_TOOLS_PROJECT_NAME}_caffe: ${MODEL_TOOLS_CAFFE_LIBRARY}") -else (MODEL_TOOLS_CAFFE_FOUND) - message(FATAL_ERROR " -FATAL: can not find lib${MODEL_TOOLS_PROJECT_NAME}_caffe.* library in /model-tools/lib directory, - please set shell or cmake environment variable BOLT_ROOT. 
- ") -endif (MODEL_TOOLS_CAFFE_FOUND) diff --git a/cmakes/FindModelToolsOnnx.cmake b/cmakes/FindModelToolsOnnx.cmake deleted file mode 100644 index c3c597c9..00000000 --- a/cmakes/FindModelToolsOnnx.cmake +++ /dev/null @@ -1,33 +0,0 @@ -set(MODEL_TOOLS_PROJECT_NAME "model-tools") -unset(MODEL_TOOLS_ROOT) -find_path(MODEL_TOOLS_ROOT NAMES ${MODEL_TOOLS_PROJECT_NAME} HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT}) -set(MODEL_TOOLS_ROOT "${MODEL_TOOLS_ROOT}/${MODEL_TOOLS_PROJECT_NAME}") - -if (USE_DYNAMIC_LIBRARY) - set(MODEL_TOOLS_ONNX_LIBRARY "${MODEL_TOOLS_ROOT}/lib/lib${MODEL_TOOLS_PROJECT_NAME}_onnx.so") -else (USE_DYNAMIC_LIBRARY) - set(MODEL_TOOLS_ONNX_LIBRARY "${MODEL_TOOLS_ROOT}/lib/lib${MODEL_TOOLS_PROJECT_NAME}_onnx.a") -endif (USE_DYNAMIC_LIBRARY) - -if (MODEL_TOOLS_ONNX_LIBRARY) - set(MODEL_TOOLS_ONNX_FOUND true) -endif (MODEL_TOOLS_ONNX_LIBRARY) - -find_package(Protobuf) - -if (MODEL_TOOLS_ONNX_FOUND) - if (USE_GNU_GCC) - set(MODEL_TOOLS_ONNX_LIBRARIES "${MODEL_TOOLS_ONNX_LIBRARY};${Protobuf_LIBRARY};-lpthread") - endif(USE_GNU_GCC) - if (USE_LLVM_CLANG) - set(MODEL_TOOLS_ONNX_LIBRARIES "${MODEL_TOOLS_ONNX_LIBRARY};${Protobuf_LIBRARY}") - endif(USE_LLVM_CLANG) - - include_directories(include ${Protobuf_INCLUDE_DIR}) - message(STATUS "Found ${MODEL_TOOLS_PROJECT_NAME}_onnx: ${MODEL_TOOLS_ONNX_LIBRARY}") -else (MODEL_TOOLS_ONNX_FOUND) - message(FATAL_ERROR " -FATAL: can not find lib${MODEL_TOOLS_PROJECT_NAME}_onnx.* library in /model-tools/lib directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (MODEL_TOOLS_ONNX_FOUND) diff --git a/cmakes/FindModelToolsTFLite.cmake b/cmakes/FindModelToolsTFLite.cmake deleted file mode 100644 index 66b91d0d..00000000 --- a/cmakes/FindModelToolsTFLite.cmake +++ /dev/null @@ -1,33 +0,0 @@ -set(MODEL_TOOLS_PROJECT_NAME "model-tools") -unset(MODEL_TOOLS_ROOT) -find_path(MODEL_TOOLS_ROOT NAMES ${MODEL_TOOLS_PROJECT_NAME} HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT}) -set(MODEL_TOOLS_ROOT "${MODEL_TOOLS_ROOT}/${MODEL_TOOLS_PROJECT_NAME}") - -if (USE_DYNAMIC_LIBRARY) - set(MODEL_TOOLS_TFLITE_LIBRARY "${MODEL_TOOLS_ROOT}/lib/lib${MODEL_TOOLS_PROJECT_NAME}_tflite.so") -else (USE_DYNAMIC_LIBRARY) - set(MODEL_TOOLS_TFLITE_LIBRARY "${MODEL_TOOLS_ROOT}/lib/lib${MODEL_TOOLS_PROJECT_NAME}_tflite.a") -endif (USE_DYNAMIC_LIBRARY) - -if (MODEL_TOOLS_TFLITE_LIBRARY) - set(MODEL_TOOLS_TFLITE_FOUND true) -endif (MODEL_TOOLS_TFLITE_LIBRARY) - -find_package(FlatBuffers) -find_package(TFLite) - -if (MODEL_TOOLS_TFLITE_FOUND) - include_directories(include ${FlatBuffers_INCLUDE_DIR}) - if (USE_GNU_GCC) - set(MODEL_TOOLS_TFLITE_LIBRARIES "${MODEL_TOOLS_TFLITE_LIBRARY};-lpthread") - endif(USE_GNU_GCC) - if (USE_LLVM_CLANG) - set(MODEL_TOOLS_TFLITE_LIBRARIES "${MODEL_TOOLS_TFLITE_LIBRARY}") - endif(USE_LLVM_CLANG) - message(STATUS "Found ${MODEL_TOOLS_PROJECT_NAME}_tflite: ${MODEL_TOOLS_TFLITE_LIBRARY}") -else (MODEL_TOOLS_TFLITE_FOUND) - message(FATAL_ERROR " -FATAL: can not find lib${MODEL_TOOLS_PROJECT_NAME}_tflite.* library in /model-tools/lib directory, - please set shell or cmake environment variable BOLT_ROOT. 
- ") -endif (MODEL_TOOLS_TFLITE_FOUND) diff --git a/cmakes/FindTFLite.cmake b/cmakes/FindTFLite.cmake deleted file mode 100644 index 437b1329..00000000 --- a/cmakes/FindTFLite.cmake +++ /dev/null @@ -1,15 +0,0 @@ -find_path(TFLITE_INCLUDE_DIR NAMES schema_generated.h HINTS ${TFLite_ROOT}/include $ENV{TFLite_ROOT}/include) - -if (TFLITE_INCLUDE_DIR) - set(TFLITE_FOUND true) -endif (TFLITE_INCLUDE_DIR) - -if (TFLITE_FOUND) - include_directories(include ${TFLITE_INCLUDE_DIR}) - message(STATUS "Found schema_generated.h: ${TFLITE_INCLUDE_DIR}") -else (TFLITE_FOUND) - message(FATAL_ERROR " -FATAL: can not find schema_generated.h in /include directory, - please set shell environment variable TFLite_ROOT. - ") -endif (TFLITE_FOUND) diff --git a/cmakes/FindTensorComputing.cmake b/cmakes/FindTensorComputing.cmake deleted file mode 100644 index e8970f50..00000000 --- a/cmakes/FindTensorComputing.cmake +++ /dev/null @@ -1,29 +0,0 @@ -set(TENSOR_COMPUTING_PROJECT_NAME "tensor_computing") -unset(TENSOR_COMPUTING_ROOT) -find_path(TENSOR_COMPUTING_ROOT NAMES ${TENSOR_COMPUTING_PROJECT_NAME} HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT}) -set(TENSOR_COMPUTING_ROOT "${TENSOR_COMPUTING_ROOT}/${TENSOR_COMPUTING_PROJECT_NAME}") - -set(TENSOR_COMPUTING_INCLUDE_DIR "${TENSOR_COMPUTING_ROOT}/include") -if (USE_DYNAMIC_LIBRARY) - set(TENSOR_COMPUTING_LIBRARY "${TENSOR_COMPUTING_ROOT}/lib/lib${TENSOR_COMPUTING_PROJECT_NAME}.so") -else (USE_DYNAMIC_LIBRARY) - set(TENSOR_COMPUTING_LIBRARY "${TENSOR_COMPUTING_ROOT}/lib/lib${TENSOR_COMPUTING_PROJECT_NAME}.a") -endif (USE_DYNAMIC_LIBRARY) - -if (TENSOR_COMPUTING_INCLUDE_DIR AND TENSOR_COMPUTING_LIBRARY) - set(TENSOR_COMPUTING_FOUND true) -endif (TENSOR_COMPUTING_INCLUDE_DIR AND TENSOR_COMPUTING_LIBRARY) - -find_package(BlasEnhance) - -if (TENSOR_COMPUTING_FOUND) - set(TENSOR_COMPUTING_LIBRARIES "${TENSOR_COMPUTING_LIBRARY};${BLAS_ENHANCE_LIBRARY}") - include_directories(include ${TENSOR_COMPUTING_INCLUDE_DIR}) - message(STATUS "Found ${TENSOR_COMPUTING_PROJECT_NAME}.h: ${TENSOR_COMPUTING_INCLUDE_DIR}") - message(STATUS "Found ${TENSOR_COMPUTING_PROJECT_NAME}: ${TENSOR_COMPUTING_LIBRARIES}") -else (TENSOR_COMPUTING_FOUND) - message(FATAL_ERROR " -FATAL: can not find ${TENSOR_COMPUTING_PROJECT_NAME} library in /${TENSOR_COMPUTING_PROJECT_NAME}/[include/lib] directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (TENSOR_COMPUTING_FOUND) diff --git a/cmakes/FindUni.cmake b/cmakes/FindUni.cmake deleted file mode 100644 index a2084b1b..00000000 --- a/cmakes/FindUni.cmake +++ /dev/null @@ -1,18 +0,0 @@ -find_path(UNI_ROOT NAMES uni HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT}) -set(UNI_ROOT "${UNI_ROOT}/uni") - -set(UNI_INCLUDE_DIR "${UNI_ROOT}/include") - -if (UNI_INCLUDE_DIR) - set(UNI_FOUND true) -endif (UNI_INCLUDE_DIR) - -if (UNI_FOUND) - include_directories(include ${UNI_INCLUDE_DIR}) - message(STATUS "Found type.h: ${UNI_INCLUDE_DIR}") -else (UNI_FOUND) - message(FATAL_ERROR " -FATAL: can not find uni library in /uni/[include/lib] directory, - please set shell or cmake environment variable BOLT_ROOT. 
- ") -endif (UNI_FOUND) diff --git a/cmakes/Findjpeg.cmake b/cmakes/Findjpeg.cmake deleted file mode 100644 index 10c2b766..00000000 --- a/cmakes/Findjpeg.cmake +++ /dev/null @@ -1,22 +0,0 @@ -find_path(JPEG_INCLUDE_DIR NAMES jpeglib.h HINTS ${JPEG_ROOT}/include $ENV{JPEG_ROOT}/include) - -if (USE_DYNAMIC_LIBRARY) - find_library(JPEG_LIBRARY NAMES libjpeg.so HINTS ${JPEG_ROOT}/lib $ENV{JPEG_ROOT}/lib) -else (USE_DYNAMIC_LIBRARY) - find_library(JPEG_LIBRARY NAMES libjpeg.a HINTS ${JPEG_ROOT}/lib $ENV{JPEG_ROOT}/lib) -endif (USE_DYNAMIC_LIBRARY) - -if (JPEG_INCLUDE_DIR AND JPEG_LIBRARY) - set(JPEG_FOUND true) -endif (JPEG_INCLUDE_DIR AND JPEG_LIBRARY) - -if (JPEG_FOUND) - include_directories(include ${JPEG_INCLUDE_DIR}) - message(STATUS "Found jpeglib.h: ${JPEG_INCLUDE_DIR}") - message(STATUS "Found jpeg: ${JPEG_LIBRARY}") -else (JPEG_FOUND) - message(FATAL_ERROR " -FATAL: can not find jpeg library in /[include|lib] directory, - please set shell environment variable JPEG_ROOT. - ") -endif (JPEG_FOUND) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt new file mode 100644 index 00000000..21a7eb10 --- /dev/null +++ b/common/CMakeLists.txt @@ -0,0 +1,18 @@ +cmake_minimum_required(VERSION 3.2) + +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) +if (BOLT_CONFIGURE_FILE) + include(${BOLT_CONFIGURE_FILE}) +else (BOLT_CONFIGURE_FILE) + message(FATAL_ERROR " +FATAL: can not find bolt.cmake in /common/cmakes directory, + please set shell or cmake environment variable BOLT_ROOT. + ") +endif (BOLT_CONFIGURE_FILE) + +project(common) + +add_subdirectory(uni) +if (USE_MALI) + add_subdirectory(gcl) +endif (USE_MALI) diff --git a/common/cmakes/FindFFTW.cmake b/common/cmakes/FindFFTW.cmake new file mode 100644 index 00000000..e2f59c4a --- /dev/null +++ b/common/cmakes/FindFFTW.cmake @@ -0,0 +1,23 @@ +find_path(FFTW_INCLUDE_DIR NAMES fftw3.h HINTS $ENV{FFTW_ROOT}/include ${FFTW_ROOT}/include) + +if (USE_DYNAMIC_LIBRARY) + find_library(FFTW_LIBRARY NAMES libfftw3f.so HINTS $ENV{FFTW_ROOT}/lib ${FFTW_ROOT}/lib) +else (USE_DYNAMIC_LIBRARY) + find_library(FFTW_LIBRARY NAMES libfftw3f.a HINTS $ENV{FFTW_ROOT}/lib ${FFTW_ROOT}/lib) +endif (USE_DYNAMIC_LIBRARY) + +if (FFTW_INCLUDE_DIR AND FFTW_LIBRARY) + set(FFTW_FOUND true) +endif (FFTW_INCLUDE_DIR AND FFTW_LIBRARY) + +if (FFTW_FOUND) + include_directories(${FFTW_INCLUDE_DIR}) + set(FFTW_LIBRARIES "${FFTW_LIBRARY}") + message(STATUS "Found fftw3f.h: ${FFTW_INCLUDE_DIR}") + message(STATUS "Found fftw3: ${FFTW_LIBRARIES}") +else (FFTW_FOUND) + message(FATAL_ERROR " +FATAL: can not find fftw library in /[include|lib] directory, + please set shell environment variable FFTW_ROOT. 
+ ") +endif (FFTW_FOUND) diff --git a/cmakes/FindFlatBuffers.cmake b/common/cmakes/FindFlatBuffers.cmake similarity index 88% rename from cmakes/FindFlatBuffers.cmake rename to common/cmakes/FindFlatBuffers.cmake index fc61fc8c..9fa15283 100644 --- a/cmakes/FindFlatBuffers.cmake +++ b/common/cmakes/FindFlatBuffers.cmake @@ -1,5 +1,5 @@ -find_path(FlatBuffers_INCLUDE_DIR NAMES flatbuffers/flatbuffers.h HINTS ${FlatBuffers_ROOT}/include - $ENV{FlatBuffers_ROOT}/include +find_path(FlatBuffers_INCLUDE_DIR NAMES flatbuffers/flatbuffers.h HINTS $ENV{FlatBuffers_ROOT}/include + ${FlatBuffers_ROOT}/include /usr/local/include) if (FlatBuffers_INCLUDE_DIR) diff --git a/common/cmakes/FindGcl.cmake b/common/cmakes/FindGcl.cmake new file mode 100644 index 00000000..8fab6e42 --- /dev/null +++ b/common/cmakes/FindGcl.cmake @@ -0,0 +1,56 @@ +find_path(GCL_ROOT NAMES gcl HINTS ${BOLT_ROOT}/common $ENV{BOLT_ROOT}/common) +set(GCL_ROOT "${GCL_ROOT}/gcl") + +set(GCL_INCLUDE_DIR "${GCL_ROOT}/include") + +if (GCL_INCLUDE_DIR) + set(GCL_FOUND true) +endif (GCL_INCLUDE_DIR) + +if (GCL_FOUND) + include_directories(${GCL_INCLUDE_DIR}) + message(STATUS "Found gcl.h: ${GCL_INCLUDE_DIR}") +else (GCL_FOUND) + message(FATAL_ERROR " +FATAL: can not find gcl.h in /gcl/include directory, + please set shell or cmake environment variable BOLT_ROOT. + ") +endif (GCL_FOUND) + +find_package(OpenCL) + +set(GCL_KERNELSOURCE_INCLUDE_DIR "${GCL_ROOT}/tools/kernel_source_compile/include") + +if (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") + set(GCL_KERNELSOURCE_LIBRARY "${GCL_ROOT}/tools/kernel_source_compile/lib/libkernelsource.a") +else (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") + set(GCL_KERNELSOURCE_LIBRARY "${GCL_ROOT}/tools/kernel_source_compile/lib/libkernelsource.so") +endif (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") + +if(GCL_KERNELSOURCE_INCLUDE_DIR) + set(KERNELSOURCE_INCLUDE_FOUND true) +endif(GCL_KERNELSOURCE_INCLUDE_DIR) + +if(GCL_KERNELSOURCE_LIBRARY) + set(KERNELSOURCE_LIB_FOUND true) +endif(GCL_KERNELSOURCE_LIBRARY) + +if (KERNELSOURCE_INCLUDE_FOUND) + include_directories(${GCL_KERNELSOURCE_INCLUDE_DIR}) + message(STATUS "Found libkernelsource.h: ${GCL_KERNELSOURCE_INCLUDE_DIR}") +else (KERNELSOURCE_INCLUDE_FOUND) + message(FATAL_ERROR " +FATAL: can not find libkernelsource.h in /gcl/tools/kernel_source_compile/include/ directory, + please set shell or cmake environment variable BOLT_ROOT. + ") +endif (KERNELSOURCE_INCLUDE_FOUND) + +if (KERNELSOURCE_LIB_FOUND) + set(KERNELSOURCE_LIBRARIES "${GCL_KERNELSOURCE_LIBRARY}") + message(STATUS "Found kernelsource: ${KERNELSOURCE_LIBRARIES}") +else (KERNELSOURCE_LIB_FOUND) + message(FATAL_ERROR " +FATAL: can not find libkernelsource.a in /gcl/tools/kernel_source_compile/lib directory, + please set shell or cmake environment variable BOLT_ROOT. 
+ ") +endif (KERNELSOURCE_LIB_FOUND) diff --git a/common/cmakes/FindJNI.cmake b/common/cmakes/FindJNI.cmake new file mode 100644 index 00000000..ae7e662d --- /dev/null +++ b/common/cmakes/FindJNI.cmake @@ -0,0 +1,14 @@ +find_path(JNI_INCLUDE_DIR NAMES jni.h HINTS $ENV{JNI_ROOT}/include ${JNI_ROOT}/include $ENV{JNI_ROOT}/include/linux ${JNI_ROOT}/include/linux) +find_path(JNI_MD_INCLUDE_DIR NAMES jni_md.h HINTS $ENV{JNI_ROOT}/include ${JNI_ROOT}/include $ENV{JNI_ROOT}/include/linux ${JNI_ROOT}/include/linux) + +if (JNI_INCLUDE_DIR AND JNI_MD_INCLUDE_DIR) + set(JNI_FOUND true) +else (JNI_INCLUDE_DIR AND JNI_MD_INCLUDE_DIR) + set(JNI_FOUND false) +endif (JNI_INCLUDE_DIR AND JNI_MD_INCLUDE_DIR) + +if (JNI_FOUND) + message(STATUS "Found jni.h: ${JNI_INCLUDE_DIR}") +else (JNI_FOUND) + message(WARNING "WARNING: can not find jni.h/jni_md.h in /include or /include/linux directory, so can not use Java API. If you want to use Java API, please set shell or cmake environment variable JNI_ROOT.") +endif (JNI_FOUND) diff --git a/cmakes/FindOpenCL.cmake b/common/cmakes/FindOpenCL.cmake similarity index 77% rename from cmakes/FindOpenCL.cmake rename to common/cmakes/FindOpenCL.cmake index f77b9f0d..8b8fdf2a 100644 --- a/cmakes/FindOpenCL.cmake +++ b/common/cmakes/FindOpenCL.cmake @@ -1,6 +1,6 @@ -find_path(OPENCL_INCLUDE_DIR NAMES CL/cl.h HINTS ${OpenCL_ROOT}/include $ENV{OpenCL_ROOT}/include /usr/local/include) -find_path(OPENCL_LIB_DIR NAMES libOpenCL.so HINTS ${OpenCL_ROOT}/lib64 $ENV{OpenCL_ROOT}/lib64 /usr/local/lib) -find_path(GLES_MALI_LIB_DIR NAMES libGLES_mali.so HINT ${OpenCL_ROOT}/lib64 $ENV{OpenCL_ROOT}/lib64 /usr/local/lib) +find_path(OPENCL_INCLUDE_DIR NAMES CL/cl.h HINTS $ENV{OpenCL_ROOT}/include ${OpenCL_ROOT}/include /usr/local/include) +find_path(OPENCL_LIB_DIR NAMES libOpenCL.so HINTS $ENV{OpenCL_ROOT}/lib64 ${OpenCL_ROOT}/lib64 /usr/local/lib) +find_path(GLES_MALI_LIB_DIR NAMES libGLES_mali.so HINT $ENV{OpenCL_ROOT}/lib64 ${OpenCL_ROOT}/lib64 /usr/local/lib) if (OPENCL_INCLUDE_DIR) set(OPENCL_HEAD_FOUND true) @@ -19,7 +19,6 @@ if(GLES_MALI_LIB_DIR) endif(GLES_MALI_LIB_DIR) if (OPENCL_HEAD_FOUND) - include_directories(include ${OPENCL_INCLUDE_DIR}) message(STATUS "Found CL/cl.h: ${OPENCL_INCLUDE_DIR}") else (OPENCL_HEAD_FOUND) message(FATAL_ERROR " diff --git a/cmakes/FindProtobuf.cmake b/common/cmakes/FindProtobuf.cmake similarity index 84% rename from cmakes/FindProtobuf.cmake rename to common/cmakes/FindProtobuf.cmake index 3ca86118..d6e4bafa 100644 --- a/cmakes/FindProtobuf.cmake +++ b/common/cmakes/FindProtobuf.cmake @@ -1,12 +1,20 @@ -find_path(Protobuf_INCLUDE_DIR NAMES google/protobuf/service.h HINTS ${Protobuf_ROOT}/include $ENV{Protobuf_ROOT}/include) +if (Protobuf_FOUND) + return() +endif (Protobuf_FOUND) + +find_path(Protobuf_INCLUDE_DIR NAMES google/protobuf/service.h HINTS $ENV{Protobuf_ROOT}/include ${Protobuf_ROOT}/include) if (USE_DYNAMIC_LIBRARY) - find_library(Protobuf_LIBRARY NAMES libprotobuf.so HINTS ${Protobuf_ROOT}/lib $ENV{Protobuf_ROOT}/lib) + if (USE_IOS_CLANG) + find_library(Protobuf_LIBRARY NAMES libprotobuf.dylib HINTS $ENV{Protobuf_ROOT}/lib ${Protobuf_ROOT}/lib) + else (USE_IOS_CLANG) + find_library(Protobuf_LIBRARY NAMES libprotobuf.so HINTS $ENV{Protobuf_ROOT}/lib ${Protobuf_ROOT}/lib) + endif (USE_IOS_CLANG) else (USE_DYNAMIC_LIBRARY) - find_library(Protobuf_LIBRARY NAMES libprotobuf.a HINTS ${Protobuf_ROOT}/lib $ENV{Protobuf_ROOT}/lib) + find_library(Protobuf_LIBRARY NAMES libprotobuf.a HINTS $ENV{Protobuf_ROOT}/lib ${Protobuf_ROOT}/lib) endif 
(USE_DYNAMIC_LIBRARY) -find_program(Protobuf_PROTOC_EXECUTABLE NAMES protoc HINTS ${Protobuf_ROOT}/bin $ENV{Protobuf_ROOT}/bin) +find_program(Protobuf_PROTOC_EXECUTABLE NAMES protoc HINTS $ENV{Protobuf_ROOT}/bin ${Protobuf_ROOT}/bin) #set(Protobuf_DEBUG ON) if (Protobuf_INCLUDE_DIR AND Protobuf_LIBRARY AND Protobuf_PROTOC_EXECUTABLE) @@ -50,12 +58,11 @@ if (Protobuf_FOUND) message(STATUS "Found Protobuf: ${Protobuf_LIBRARY}") else (Protobuf_FOUND) message(FATAL_ERROR " -FATAL: can not find protobuf library in /uni/[include/lib] directory, +FATAL: can not find protobuf library in /[include/lib] directory, please set shell environment variable Protobuf_ROOT. ") endif (Protobuf_FOUND) - function(protobuf_generate) set(_options APPEND_PATH DESCRIPTORS) set(_singleargs LANGUAGE OUT_VAR EXPORT_MACRO PROTOC_OUT_DIR) @@ -150,7 +157,8 @@ function(protobuf_generate) set(_generated_srcs) foreach(_ext ${protobuf_generate_GENERATE_EXTENSIONS}) - list(APPEND _generated_srcs "${protobuf_generate_PROTOC_OUT_DIR}/${_possible_rel_dir}${_basename}${_ext}") + #list(APPEND _generated_srcs "${protobuf_generate_PROTOC_OUT_DIR}/${_possible_rel_dir}${_basename}${_ext}") + list(APPEND _generated_srcs "${protobuf_generate_PROTOC_OUT_DIR}/${_basename}${_ext}") endforeach() if(protobuf_generate_DESCRIPTORS AND protobuf_generate_LANGUAGE STREQUAL cpp) @@ -230,38 +238,38 @@ endfunction() function(PROTOBUF_GENERATE_PYTHON SRCS) - if(NOT ARGN) - message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") - return() - endif() + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") + return() + endif() - if(PROTOBUF_GENERATE_CPP_APPEND_PATH) - set(_append_arg APPEND_PATH) - endif() + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + set(_append_arg APPEND_PATH) + endif() - if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) - set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") - endif() + if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) + set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") + endif() - if(DEFINED Protobuf_IMPORT_DIRS) - set(_import_arg IMPORT_DIRS ${Protobuf_IMPORT_DIRS}) - endif() + if(DEFINED Protobuf_IMPORT_DIRS) + set(_import_arg IMPORT_DIRS ${Protobuf_IMPORT_DIRS}) + endif() - set(_outvar) - protobuf_generate(${_append_arg} LANGUAGE python OUT_VAR _outvar ${_import_arg} PROTOS ${ARGN}) - set(${SRCS} ${_outvar} PARENT_SCOPE) + set(_outvar) + protobuf_generate(${_append_arg} LANGUAGE python OUT_VAR _outvar ${_import_arg} PROTOS ${ARGN}) + set(${SRCS} ${_outvar} PARENT_SCOPE) endfunction() if(Protobuf_INCLUDE_DIR) if(Protobuf_PROTOC_EXECUTABLE) - if(NOT TARGET protobuf::protoc) - add_executable(protobuf::protoc IMPORTED) - if(EXISTS "${Protobuf_PROTOC_EXECUTABLE}") - set_target_properties(protobuf::protoc PROPERTIES - IMPORTED_LOCATION "${Protobuf_PROTOC_EXECUTABLE}") - endif() + if(NOT TARGET protobuf::protoc) + add_executable(protobuf::protoc IMPORTED) + if(EXISTS "${Protobuf_PROTOC_EXECUTABLE}") + set_target_properties(protobuf::protoc PROPERTIES + IMPORTED_LOCATION "${Protobuf_PROTOC_EXECUTABLE}") endif() + endif() endif() endif() diff --git a/common/cmakes/FindTFLite.cmake b/common/cmakes/FindTFLite.cmake new file mode 100644 index 00000000..5dcaf1cc --- /dev/null +++ b/common/cmakes/FindTFLite.cmake @@ -0,0 +1,16 @@ +find_path(TFLITE_INCLUDE_DIR NAMES tensorflow/lite/schema/schema_generated.h HINTS $ENV{TFLite_ROOT}/include ${TFLite_ROOT}/include) + +if (TFLITE_INCLUDE_DIR) + set(TFLITE_FOUND true) 
+endif (TFLITE_INCLUDE_DIR) +find_package(FlatBuffers) + +if (TFLITE_FOUND) + message(STATUS "Found tensorflow/lite/schema/schema_generated.h: ${TFLITE_INCLUDE_DIR}") + set(TFLITE_INCLUDE_DIR "${TFLITE_INCLUDE_DIR};${FlatBuffers_INCLUDE_DIR}") +else (TFLITE_FOUND) + message(FATAL_ERROR " +FATAL: can not find tensorflow/lite/schema/schema_generated.h in /include directory, + please set shell environment variable TFLite_ROOT. + ") +endif (TFLITE_FOUND) diff --git a/common/cmakes/Findjpeg.cmake b/common/cmakes/Findjpeg.cmake new file mode 100644 index 00000000..fa5e8ae7 --- /dev/null +++ b/common/cmakes/Findjpeg.cmake @@ -0,0 +1,25 @@ +find_path(JPEG_INCLUDE_DIR NAMES jpeglib.h HINTS $ENV{JPEG_ROOT}/include ${JPEG_ROOT}/include) + +if (USE_DYNAMIC_LIBRARY) + if (USE_IOS_CLANG) + find_library(JPEG_LIBRARY NAMES libjpeg.dylib HINTS $ENV{JPEG_ROOT}/lib ${JPEG_ROOT}/lib) + else (USE_IOS_CLANG) + find_library(JPEG_LIBRARY NAMES libjpeg.so HINTS $ENV{JPEG_ROOT}/lib ${JPEG_ROOT}/lib) + endif (USE_IOS_CLANG) +else (USE_DYNAMIC_LIBRARY) + find_library(JPEG_LIBRARY NAMES libjpeg.a HINTS $ENV{JPEG_ROOT}/lib ${JPEG_ROOT}/lib) +endif (USE_DYNAMIC_LIBRARY) + +if (JPEG_INCLUDE_DIR AND JPEG_LIBRARY) + set(JPEG_FOUND true) +endif (JPEG_INCLUDE_DIR AND JPEG_LIBRARY) + +if (JPEG_FOUND) + message(STATUS "Found jpeglib.h: ${JPEG_INCLUDE_DIR}") + message(STATUS "Found jpeg: ${JPEG_LIBRARY}") +else (JPEG_FOUND) + message(FATAL_ERROR " +FATAL: can not find jpeg library in /[include|lib] directory, + please set shell environment variable JPEG_ROOT. + ") +endif (JPEG_FOUND) diff --git a/common/cmakes/Findjsoncpp.cmake b/common/cmakes/Findjsoncpp.cmake new file mode 100644 index 00000000..ba4ae1d5 --- /dev/null +++ b/common/cmakes/Findjsoncpp.cmake @@ -0,0 +1,25 @@ +find_path(JSONCPP_INCLUDE_DIR NAMES json/json.h HINTS $ENV{JSONCPP_ROOT}/include ${JSONCPP_ROOT}/include) + +if (USE_DYNAMIC_LIBRARY) + if (USE_IOS_CLANG) + find_library(JSONCPP_LIBRARY NAMES libjsoncpp.dylib HINTS $ENV{JSONCPP_ROOT}/lib ${JSONCPP_ROOT}/lib) + else (USE_IOS_CLANG) + find_library(JSONCPP_LIBRARY NAMES libjsoncpp.so HINTS $ENV{JSONCPP_ROOT}/lib ${JSONCPP_ROOT}/lib) + endif (USE_IOS_CLANG) +else (USE_DYNAMIC_LIBRARY) + find_library(JSONCPP_LIBRARY NAMES libjsoncpp.a HINTS $ENV{JSONCPP_ROOT}/lib ${JSONCPP_ROOT}/lib) +endif (USE_DYNAMIC_LIBRARY) + +if (JSONCPP_INCLUDE_DIR AND JSONCPP_LIBRARY) + set(JSONCPP_FOUND true) +endif (JSONCPP_INCLUDE_DIR AND JSONCPP_LIBRARY) + +if (JSONCPP_FOUND) + message(STATUS "Found jsoncpplib.h: ${JSONCPP_INCLUDE_DIR}") + message(STATUS "Found jsoncpp: ${JSONCPP_LIBRARY}") +else (JSONCPP_FOUND) + message(FATAL_ERROR " +FATAL: can not find jsoncpp library in /[include|lib] directory, + please set shell environment variable JSONCPP_ROOT. 
+ ") +endif (JSONCPP_FOUND) diff --git a/common/cmakes/bolt.cmake b/common/cmakes/bolt.cmake new file mode 100644 index 00000000..5d4bae6c --- /dev/null +++ b/common/cmakes/bolt.cmake @@ -0,0 +1,412 @@ +option(USE_CROSS_COMPILE "set use cross compile or not" ON) +option(USE_GNU_GCC "set use GNU gcc compiler or not" OFF) +option(USE_LLVM_CLANG "set use LLVM clang compiler or not" OFF) +option(USE_IOS_CLANG "set use ios compiler or not" OFF) +option(USE_DYNAMIC_LIBRARY "set use dynamic library or not" OFF) +option(USE_MINSIZEREL ".so lib will be 300KB smaller but performance will be affected" OFF) + +option(USE_ANDROID_LOG "set use Android log or not" OFF) +option(USE_DEBUG "set use debug information or not" OFF) +option(USE_PROFILE "set use profile information or not" OFF) +option(USE_PROFILE_STATISTICS "set use profile statistics information or not" OFF) +option(USE_THREAD_SAFE "set use thread safe or not" OFF) + +# model_tools variable +option(USE_CAFFE "set use caffe model as input or not" ON) +option(USE_ONNX "set use onnx model as input or not" ON) +option(USE_TFLITE "set use tflite model as input or not" ON) +option(USE_TENSORFLOW "set use tensorflow model as input or not" ON) + +# blas_enhance tensor +option(USE_GENERAL "set use CPU serial code or not" ON) +option(USE_X86 "set use X86 instruction or not" OFF) +option(USE_NEON "set use ARM NEON instruction or not" OFF) +option(USE_ARMV7 "set use ARMv7 NEON instruction or not" OFF) +option(USE_ARMV8 "set use ARMv8 NEON instruction or not" ON) +option(USE_MALI "set use mali for parallel or not" OFF) +option(USE_FP32 "set use ARM NEON FP32 instruction or not" ON) +option(USE_FP16 "set use ARM NEON FP16 instruction or not" ON) +option(USE_F16_MIX_PRECISION "set use ARM NEON mix precision f16/f32 instruction or not" ON) +option(USE_INT8 "set use ARM NEON INT8 instruction or not" ON) + +option(USE_OPENMP "set use openmp to run test(tinybert) or not" OFF) +option(USE_LIBRARY_TUNING "set use algorithm tuning or not" OFF) +option(USE_FLOW "set whether to use flow or not" ON) + +option(BUILD_TEST "set to build unit test or not" OFF) + +set(BOLT_ROOT $ENV{BOLT_ROOT}) + +function (set_policy) + if (POLICY CMP0074) + cmake_policy(SET CMP0074 NEW) + endif() +endfunction(set_policy) + +macro (set_c_cxx_flags) + set(COMMON_FLAGS "-W -Wall -Wextra -Wno-unused-command-line-argument -Wno-unused-parameter -O3 -fPIC -fstack-protector") + + if (USE_OPENMP) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_OPENMP -fopenmp") + endif(USE_OPENMP) + + if (USE_LIBRARY_TUNING) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_LIBRARY_TUNING") + endif(USE_LIBRARY_TUNING) + + if (BUILD_TEST) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_BUILD_TEST") + endif(BUILD_TEST) + + if (USE_DEBUG) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_DEBUG") + endif(USE_DEBUG) + + if (USE_JNI) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_JNI") + endif(USE_JNI) + + if (USE_LLVM_CLANG AND USE_ANDROID_LOG) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_ANDROID_LOG -llog") + endif(USE_LLVM_CLANG AND USE_ANDROID_LOG) + + if (USE_PROFILE) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_PROFILE") + endif(USE_PROFILE) + + if (USE_PROFILE_STATISTICS) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_PROFILE_STATISTICS") + endif(USE_PROFILE_STATISTICS) + + if (USE_THREAD_SAFE) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_THREAD_SAFE") + endif(USE_THREAD_SAFE) + + if (USE_GENERAL) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_GENERAL") + endif(USE_GENERAL) + + if (USE_MALI) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_MALI") + endif(USE_MALI) + + if 
(USE_X86) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_X86 -mavx2 -mfma") + endif(USE_X86) + + if (USE_IOS_CLANG) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_IOS") + endif(USE_IOS_CLANG) + + if (USE_FP32) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_FP32") + endif (USE_FP32) + + if (USE_NEON) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_NEON") + + if (USE_ARMV8) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_ARMV8") + endif (USE_ARMV8) + + if (USE_ARMV7) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_ARMV7 -march=armv7-a -mfloat-abi=softfp -mfpu=neon-vfpv4") + if (USE_LLVM_CLANG) + set(COMMON_FLAGS "${COMMON_FLAGS} -Wl,--allow-multiple-definition") + endif (USE_LLVM_CLANG) + endif (USE_ARMV7) + + if (USE_FP16) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_FP16") + if (USE_F16_MIX_PRECISION) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_F16_MIX_PRECISION") + endif (USE_F16_MIX_PRECISION) + if (USE_INT8) + if (USE_LLVM_CLANG) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_INT8 -march=armv8-a+fp16+dotprod") + else (USE_LLVM_CLANG) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_INT8 -march=armv8.2-a+fp16+dotprod") + endif (USE_LLVM_CLANG) + else (USE_INT8) + if (USE_LLVM_CLANG) + set(COMMON_FLAGS "${COMMON_FLAGS} -march=armv8-a+fp16") + else (USE_LLVM_CLANG) + set(COMMON_FLAGS "${COMMON_FLAGS} -march=armv8.2-a+fp16") + endif (USE_LLVM_CLANG) + endif (USE_INT8) + endif (USE_FP16) + endif(USE_NEON) + + if (USE_CAFFE) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_CAFFE") + endif() + if (USE_ONNX) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_ONNX") + endif() + if (USE_TFLITE) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_TFLITE") + endif() + if (USE_TENSORFLOW) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_TENSORFLOW") + endif() + + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}") + if (USE_X86) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + endif (USE_X86) + + if (USE_DEBUG) + set(CMAKE_BUILD_TYPE "Debug") + elseif (USE_MINSIZEREL) + set(CMAKE_BUILD_TYPE "MinSizeRel") + endif (USE_DEBUG) +endmacro(set_c_cxx_flags) + +macro (set_test_c_cxx_flags) + if (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") + if (USE_CROSS_COMPILE) + if (USE_GNU_GCC) + set(COMMON_FLAGS "${COMMON_FLAGS} -static") + endif(USE_GNU_GCC) + endif(USE_CROSS_COMPILE) + endif(${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") + + if (USE_IOS_CLANG) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_IOS") + endif(USE_IOS_CLANG) + + if (USE_LLVM_CLANG) + if (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") + set(COMMON_FLAGS "${COMMON_FLAGS} -Wl,-allow-shlib-undefined, -static-libstdc++") + else (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") + set(COMMON_FLAGS "${COMMON_FLAGS} -Wl,-allow-shlib-undefined") + endif(${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") + endif(USE_LLVM_CLANG) + + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}") +endmacro (set_test_c_cxx_flags) + +macro (set_project_install_directory) + set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/bin) + set(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/lib) +endmacro (set_project_install_directory) + +if(USE_DYNAMIC_LIBRARY) + set(uni_library uni) + set(gcl_library gcl) + set(kernelsource_library kernelsource) + set(blas_enhance_library blas_enhance) + set(tensor_library tensor) + set(image_library image) + set(model_tools_caffe_library model_tools_caffe) + set(model_tools_onnx_library model_tools_onnx) + set(model_tools_tflite_library model_tools_tflite) + set(model_tools_tensorflow_library model_tools_tensorflow) + 
set(model_tools_library model_tools)
+    set(engine_library engine)
+    set(flow_library flow)
+else()
+    set(uni_library uni_static)
+    set(gcl_library gcl_static)
+    set(kernelsource_library kernelsource_static)
+    set(blas_enhance_library blas_enhance_static)
+    set(tensor_library tensor_static)
+    set(image_library image_static)
+    set(model_tools_caffe_library model_tools_caffe_static)
+    set(model_tools_onnx_library model_tools_onnx_static)
+    set(model_tools_tflite_library model_tools_tflite_static)
+    set(model_tools_tensorflow_library model_tools_tensorflow_static)
+    set(model_tools_library model_tools_static)
+    set(engine_library engine_static)
+    set(flow_library flow_static)
+endif()
+
+macro(include_uni)
+    include_directories(${BOLT_ROOT}/common/uni/include)
+endmacro()
+
+macro(link_uni name)
+    target_link_libraries(${name} ${uni_library})
+    if (USE_THREAD_SAFE AND USE_GNU_GCC)
+        target_link_libraries(${name} -lpthread)
+    endif (USE_THREAD_SAFE AND USE_GNU_GCC)
+endmacro()
+
+macro(include_gcl)
+    include_directories(${BOLT_ROOT}/common/gcl/include)
+    include_directories(${BOLT_ROOT}/common/gcl/tools/kernel_source_compile/include)
+    include_directories(${OPENCL_INCLUDE_DIR})
+    include_uni()
+endmacro()
+
+macro(link_opencl name)
+    if (USE_MALI)
+        target_link_libraries(${name} ${OPENCL_LIBRARIES})
+    endif(USE_MALI)
+endmacro()
+
+macro(link_gcl name)
+    if (USE_MALI)
+        target_link_libraries(${name} ${gcl_library} ${kernelsource_library})
+        link_opencl(${name})
+    endif (USE_MALI)
+endmacro()
+
+macro(include_memory)
+    include_directories(${BOLT_ROOT}/common/memory/include)
+    include_uni()
+    include_gcl()
+endmacro()
+
+macro(include_blas_enhance)
+    include_directories(${BOLT_ROOT}/compute/blas_enhance/include)
+    include_uni()
+endmacro()
+
+macro(link_blas_enhance name)
+    target_link_libraries(${name} ${blas_enhance_library})
+    link_uni(${name})
+endmacro()
+
+macro(include_tensor)
+    include_directories(${BOLT_ROOT}/compute/tensor/include)
+    include_blas_enhance()
+    include_gcl()
+    include_memory()
+endmacro()
+
+macro(link_tensor name)
+    target_link_libraries(${name} ${tensor_library} ${blas_enhance_library})
+    link_blas_enhance(${name})
+    link_gcl(${name})
+endmacro()
+
+macro(include_image)
+    include_directories(${BOLT_ROOT}/compute/image/include)
+    include_tensor()
+endmacro()
+
+macro(link_image name)
+    target_link_libraries(${name} ${image_library})
+    link_tensor(${name})
+endmacro()
+
+macro(include_protobuf)
+    include_directories(${Protobuf_INCLUDE_DIR})
+endmacro()
+
+macro(link_protobuf name)
+    target_link_libraries(${name} ${Protobuf_LIBRARY})
+    if (USE_GNU_GCC)
+        target_link_libraries(${name} ${Protobuf_LIBRARY} -lpthread)
+    endif(USE_GNU_GCC)
+    if (USE_LLVM_CLANG)
+        target_link_libraries(${name} ${Protobuf_LIBRARY} -lz)
+    endif(USE_LLVM_CLANG)
+endmacro()
+
+macro(include_model_tools)
+    include_directories(${BOLT_ROOT}/model_tools/include)
+    include_uni()
+endmacro()
+
+macro(link_model_tools name)
+    target_link_libraries(${name} ${model_tools_library})
+    if(USE_CAFFE)
+        target_link_libraries(${name} ${model_tools_caffe_library})
+    endif()
+    if(USE_ONNX)
+        target_link_libraries(${name} ${model_tools_onnx_library})
+    endif()
+    if(USE_TFLITE)
+        target_link_libraries(${name} ${model_tools_tflite_library})
+    endif()
+    if(USE_TENSORFLOW)
+        target_link_libraries(${name} ${model_tools_tensorflow_library})
+        target_link_libraries(${name} ${JSONCPP_LIBRARY})
+    endif()
+    if(USE_CAFFE OR USE_ONNX)
+        link_protobuf(${name})
+    endif()
+    link_uni(${name})
+endmacro()
+
+macro(model_tools_test name
src_name) + include_directories(${BOLT_ROOT}/model_tools/include) + add_executable(${name} ${src_name}) + link_model_tools(${name}) +endmacro() + +macro(include_engine) + if (BUILD_TEST) + include_directories(${JPEG_INCLUDE_DIR}) + endif (BUILD_TEST) + include_directories(${BOLT_ROOT}/inference/engine/include) + if (USE_JNI) + include_directories(${JNI_INCLUDE_DIR}) + include_directories(${JNI_MD_INCLUDE_DIR}) + endif (USE_JNI) + include_model_tools() + include_tensor() + include_image() +endmacro() + +macro(link_engine name) + target_link_libraries(${name} ${engine_library}) + if (BUILD_TEST) + target_link_libraries(${name} ${JPEG_LIBRARY}) + endif (BUILD_TEST) + link_model_tools(${name}) + target_link_libraries(${name} ${image_library} ${tensor_library} ${blas_enhance_library}) + link_gcl(${name}) + link_uni(${name}) +endmacro() + +macro(engine_test name src_name) + add_executable(${name} ${src_name}) + link_engine(${name}) +endmacro() + +macro(include_flow) + include_directories(${BOLT_ROOT}/inference/flow/include) + include_engine() +endmacro() + +macro(flow_test name src_name) + include_protobuf() + include_directories(${BOLT_ROOT}/flow/include) + if ("${name}" STREQUAL "flow_asr") + set_policy() + find_package(FFTW) + add_executable(${name} ${src_name}) + target_link_libraries(${name} ${FFTW_LIBRARIES}) + else () + add_executable(${name} ${src_name}) + endif() + target_link_libraries(${name} ${flow_library}) + link_engine(${name}) + link_protobuf(${name}) + add_dependencies(${name} flow.pb.h) +endmacro() + +macro(include_train) + include_model_tools() + include_tensor() + include_image() +endmacro() + +macro(link_train name) + target_link_libraries(${name} RaulLib) + link_model_tools(${name}) + target_link_libraries(${name} ${image_library} ${tensor_library} ${blas_enhance_library}) + link_gcl(${name}) + link_uni(${name}) +endmacro() + +macro(train_test name src_name) + include_directories(${BOLT_ROOT}/training/include) + add_executable(${name} ${src_name}) + link_train(${name}) +endmacro() diff --git a/common/gcl/CMakeLists.txt b/common/gcl/CMakeLists.txt new file mode 100644 index 00000000..8b44ba49 --- /dev/null +++ b/common/gcl/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.2) + +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) +if (BOLT_CONFIGURE_FILE) + include(${BOLT_CONFIGURE_FILE}) +else (BOLT_CONFIGURE_FILE) + message(FATAL_ERROR " +FATAL: can not find bolt.cmake in directory, + please set shell or cmake environment variable BOLT_ROOT. + ") +endif (BOLT_CONFIGURE_FILE) + +project(gcl) + +set_c_cxx_flags() + +include_gcl() + +add_subdirectory(src) +add_subdirectory(tools/kernel_source_compile) diff --git a/common/gcl/include/context.h b/common/gcl/include/context.h new file mode 100644 index 00000000..f6e1a793 --- /dev/null +++ b/common/gcl/include/context.h @@ -0,0 +1,212 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_CONTEXT
+#define _H_CONTEXT
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief create an OpenCL Context on the given platform
+ *
+ * @param platform input, context will be created on this platform
+ * @param num_devices input, context will be created on num_devices devices
+ * @param devices input, the devices the created context contains
+ * @param context output, return context created
+ *
+ * @return
+ *
+ */
+inline EE create_context(Platform platform, U32 num_devices, Device *devices, Context *context)
+{
+    if (NULL == context) {
+        return NULL_POINTER;
+    }
+
+    I32 ret;
+    cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
+    *context = clCreateContext(properties, num_devices, devices, NULL, NULL, &ret);
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief get context information
+ *
+ * @warning please free the memory allocated by this function
+ **/
+inline EE get_context_info(Context context, cl_context_info info, void **value, U32 *len)
+{
+    if (NULL == value) {
+        return NULL_POINTER;
+    }
+
+    size_t size;
+    I32 ret = clGetContextInfo(context, info, 0, NULL, &size);
+    if (CL_SUCCESS == ret) {
+        if (NULL != len) {
+            *len = size;
+        }
+        void *data = malloc(size);
+        if (NULL == data) {
+            return ALLOC_FAILED;
+        }
+        ret = clGetContextInfo(context, info, size, data, NULL);
+        if (CL_SUCCESS == ret) {
+            *value = data;
+        } else {
+            free(data);
+        }
+    }
+
+    map_cl_error_2_ee(ret);
+}
+
+inline EE retain_context(Context context)
+{
+    I32 ret = clRetainContext(context);
+    map_cl_error_2_ee(ret);
+}
+
+inline EE release_context(Context context)
+{
+    I32 ret = clReleaseContext(context);
+    map_cl_error_2_ee(ret);
+}
+
+inline EE create_command_queue_properties(
+    Context context, Device device, cl_queue_properties *properties, CommandQueue *queue)
+{
+    if (NULL == queue) {
+        return NULL_POINTER;
+    }
+    I32 ret;
+    *queue = clCreateCommandQueueWithProperties(context, device, properties, &ret);
+    map_cl_error_2_ee(ret);
+}
+/*
+    inline EE create_command_queue(Context context, Device device,
+        cl_command_queue_properties properties, CommandQueue* queue) {
+        if(NULL == queue) return NULL_POINTER;
+        I32 ret;
+        * queue = clCreateCommandQueue(context, device, properties, &ret);
+        map_cl_error_2_ee(ret);
+    }
+ */
+/**
+ * @brief get information of command queue
+ *
+ * @warning please free memory associated with value
+ *
+ **/
+inline EE get_command_queue_info(
+    CommandQueue queue, cl_command_queue_info info, void **value, size_t *len)
+{
+    if (NULL == value) {
+        return NULL_POINTER;
+    }
+
+    size_t size;
+    I32 ret = clGetCommandQueueInfo(queue, info, 0, NULL, &size);
+    if (CL_SUCCESS == ret) {
+        if (NULL != len) {
+            *len = size;
+        }
+        void *data = malloc(size);
+        if (NULL == data) {
+            return ALLOC_FAILED;
+        }
+        ret = clGetCommandQueueInfo(queue, info, size, data, NULL);
+        if (CL_SUCCESS == ret) {
+            *value = data;
+        } else {
+            free(data);
+        }
+    }
+
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief get context of command queue
+ *
+ **/
+inline EE command_queue_get_context(CommandQueue queue, Context *context)
+{
+    if (NULL == context) {
+        return NULL_POINTER;
+    }
+    I32 ret = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(Context), context, NULL);
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief get device of command queue
+ *
+ **/
+inline EE command_queue_get_device(CommandQueue queue, Device *device)
+{
+    if (NULL == device) {
+        return NULL_POINTER;
+    }
+    I32 ret = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(Device), device, NULL);
+    map_cl_error_2_ee(ret);
+}
+
+inline EE retain_command_queue(CommandQueue queue)
+{
+    I32 ret = clRetainCommandQueue(queue);
+    map_cl_error_2_ee(ret);
+}
+
+inline EE release_command_queue(CommandQueue queue)
+{
+    I32 ret = clReleaseCommandQueue(queue);
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief flush the command queue, issuing all queued commands for execution
+ **/
+inline EE flush(CommandQueue queue)
+{
+    I32 ret = clFlush(queue);
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief wait until all queued commands finish
+ **/
+inline EE finish(CommandQueue queue)
+{
+    I32 ret = clFinish(queue);
+    map_cl_error_2_ee(ret);
+}
+
+inline EE check_queue_profiling(CommandQueue queue, bool *enable)
+{
+    cl_bitfield prop;
+    I32 ret = clGetCommandQueueInfo(queue, CL_QUEUE_PROPERTIES, sizeof(prop), &prop, NULL);
+    if ((prop | CL_QUEUE_PROFILING_ENABLE) == prop) {
+        *enable = true;
+    } else {
+        *enable = false;
+    }
+    map_cl_error_2_ee(ret);
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/common/gcl/include/event.h b/common/gcl/include/event.h
new file mode 100644
index 00000000..9db350f4
--- /dev/null
+++ b/common/gcl/include/event.h
@@ -0,0 +1,160 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
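Taken together, the context/queue helpers above and the event helpers in the header below cover the whole queue lifecycle. A minimal usage sketch follows, under these assumptions: get_platforms and platform_get_devices come from platform.h (not part of this hunk) and are called with the same signatures gcl_create_handle uses further below; CHECK_STATUS is the framework's status-checking macro.

    Platform *platforms;
    U32 numPlatform;
    CHECK_STATUS(get_platforms(&numPlatform, &platforms));

    Device *devices;
    U32 numDevice;
    CHECK_STATUS(platform_get_devices(platforms[0], CL_DEVICE_TYPE_GPU, &numDevice, &devices));

    Context context;
    CHECK_STATUS(create_context(platforms[0], numDevice, devices, &context));

    // ask for a profiling-capable queue, mirroring gcl_create_handle's _DEBUG path
    cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
    CommandQueue queue;
    CHECK_STATUS(create_command_queue_properties(context, devices[0], props, &queue));

    bool profiling = false;
    CHECK_STATUS(check_queue_profiling(queue, &profiling));  // true for the props above

    // ... enqueue kernels here, then drain and tear everything down ...
    CHECK_STATUS(finish(queue));
    CHECK_STATUS(release_command_queue(queue));
    CHECK_STATUS(release_context(context));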
+
+#ifndef EVENT_H_
+#define EVENT_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief wait for events to complete
+ **/
+inline EE wait_events(U32 num_events, const Event *event_list)
+{
+    I32 ret = clWaitForEvents(num_events, event_list);
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief get information about an event
+ *
+ * @warning please free memory associated with value
+ **/
+inline EE get_event_info(cl_event event, cl_event_info info, void **value, size_t *size)
+{
+    size_t len;
+    I32 ret = clGetEventInfo(event, info, 0, NULL, &len);
+    if (CL_SUCCESS == ret) {
+        if (NULL != size) {
+            *size = len;
+        }
+        void *data = malloc(len);
+        if (NULL == data) {
+            return ALLOC_FAILED;
+        }
+        ret = clGetEventInfo(event, info, len, data, NULL);
+        if (CL_SUCCESS == ret) {
+            *value = data;
+        } else {
+            free(data);
+        }
+    }
+
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief increase the reference count of an event
+ **/
+inline EE retain_event(Event event)
+{
+    I32 ret = clRetainEvent(event);
+    map_cl_error_2_ee(ret);
+}
+
+inline EE release_event(Event event)
+{
+    I32 ret = clReleaseEvent(event);
+    map_cl_error_2_ee(ret);
+}
+
+inline EE enqueue_barrier_wait_lists(
+    CommandQueue queue, U32 num_wait_events, const Event *wait_events, Event *event)
+{
+    I32 ret = clEnqueueBarrierWithWaitList(queue, num_wait_events, wait_events, event);
+    map_cl_error_2_ee(ret);
+}
+
+inline EE event_counting_time(
+    Event *event, double *t_queue, double *t_submit, double *t_start, double *t_end, double *t_execute)
+{
+    cl_ulong queued, submit, start, end;
+    CHECK_STATUS(wait_events(1, event));
+    I32 ret;
+    ret = clGetEventProfilingInfo(
+        *event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &queued, NULL);
+    if (ret) {
+        map_cl_error_2_ee(ret);
+    }
+    ret = clGetEventProfilingInfo(
+        *event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit, NULL);
+    if (ret) {
+        map_cl_error_2_ee(ret);
+    }
+    ret =
+        clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
+    if (ret) {
+        map_cl_error_2_ee(ret);
+    }
+    ret = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
+    if (ret) {
+        map_cl_error_2_ee(ret);
+    }
+
+    double t0, t1, t2, t3, t4;
+    t0 = (double)(queued)*1e-03;
+    t1 = (double)(submit)*1e-03;
+    t2 = (double)(start)*1e-03;
+    t3 = (double)(end)*1e-03;
+    t4 = ((double)(end) - (double)(start)) * 1e-03;
+
+    if (t_queue) {
+        *t_queue = t0;
+    }
+    if (t_submit) {
+        *t_submit = t1;
+    }
+    if (t_start) {
+        *t_start = t2;
+    }
+    if (t_end) {
+        *t_end = t3;
+    }
+    if (t_execute) {
+        *t_execute = t4;
+    }
+    return SUCCESS;
+}
+/**
+ * @brief get profiling information
+ **/
+inline EE event_get_profiling_info(Event event, cl_profiling_info info, void **value, size_t *size)
+{
+    size_t len;
+    I32 ret = clGetEventProfilingInfo(event, info, 0, NULL, &len);
+    if (CL_SUCCESS == ret) {
+        if (NULL != size) {
+            *size = len;
+        }
+        void *data = malloc(len);
+        if (NULL == data) {
+            return ALLOC_FAILED;
+        }
+        ret = clGetEventProfilingInfo(event, info, len, data, NULL);
+        if (CL_SUCCESS == ret) {
+            *value = data;
+        } else {
+            free(data);
+        }
+    }
+
+    map_cl_error_2_ee(ret);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/gcl/include/gcl.h b/common/gcl/include/gcl.h
similarity index 83%
rename from gcl/include/gcl.h
rename to common/gcl/include/gcl.h
index fcab35d5..2a0ab241 100644
--- a/gcl/include/gcl.h
+++ b/common/gcl/include/gcl.h
@@ -1,20 +1,18 @@
 // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - #ifndef _H_GCL #define _H_GCL #include "gcl_func.h" +#include "gclmem_desc_infer.h" #endif - diff --git a/common/gcl/include/gcl_common.h b/common/gcl/include/gcl_common.h new file mode 100644 index 00000000..46cbb05d --- /dev/null +++ b/common/gcl/include/gcl_common.h @@ -0,0 +1,275 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
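The common header below fixes the error-handling convention every wrapper in this patch follows: the raw cl_int status is translated into the framework's EE type by map_cl_error_2_ee, which returns SUCCESS on CL_SUCCESS and otherwise logs the call site and returns GCL_ERROR. A sketch of a wrapper written in that style; retain_mem_object is illustrative here and not part of this patch:

    inline EE retain_mem_object(Mem mem)
    {
        I32 ret = clRetainMemObject(mem);  // raw OpenCL call returns a cl_int status
        map_cl_error_2_ee(ret);            // SUCCESS on CL_SUCCESS, otherwise log + GCL_ERROR
    }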
+
+#ifndef H_GCL_COMMON
+#define H_GCL_COMMON
+#define CL_TARGET_OPENCL_VERSION 200
+
+#include "tensor_desc.h"
+#include "gcl_kernel_type.h"
+#include "CL/cl.h"
+#include <iostream>
+#include <string>
+#include <map>
+#include <vector>
+#include <stdlib.h>
+/**
+ * @file
+ */
+#define ERROR_CASE(x) \
+    case x:           \
+        return (#x)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef cl_platform_id Platform;
+typedef cl_device_id Device;
+typedef cl_context Context;
+typedef cl_command_queue CommandQueue;
+typedef cl_program Program;
+typedef cl_mem Mem;
+typedef cl_sampler Sampler;
+typedef cl_kernel Kernel;
+typedef cl_event Event;
+typedef cl_mem_flags MemFlags;
+typedef cl_image_format ImgFormat;
+
+inline CI8 *map_cl_error_2_string(cl_int err)
+{
+    switch (err) {
+        ERROR_CASE(CL_SUCCESS);
+        ERROR_CASE(CL_DEVICE_NOT_FOUND);
+        ERROR_CASE(CL_DEVICE_NOT_AVAILABLE);
+        ERROR_CASE(CL_COMPILER_NOT_AVAILABLE);
+        ERROR_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
+        ERROR_CASE(CL_OUT_OF_RESOURCES);
+        ERROR_CASE(CL_OUT_OF_HOST_MEMORY);
+        ERROR_CASE(CL_PROFILING_INFO_NOT_AVAILABLE);
+        ERROR_CASE(CL_MEM_COPY_OVERLAP);
+        ERROR_CASE(CL_IMAGE_FORMAT_MISMATCH);
+        ERROR_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
+        ERROR_CASE(CL_BUILD_PROGRAM_FAILURE);
+        ERROR_CASE(CL_MAP_FAILURE);
+#ifdef CL_VERSION_1_1
+        ERROR_CASE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
+        ERROR_CASE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
+#endif
+#ifdef CL_VERSION_1_2
+        ERROR_CASE(CL_COMPILE_PROGRAM_FAILURE);
+        ERROR_CASE(CL_LINKER_NOT_AVAILABLE);
+        ERROR_CASE(CL_LINK_PROGRAM_FAILURE);
+        ERROR_CASE(CL_DEVICE_PARTITION_FAILED);
+        ERROR_CASE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
+#endif
+        ERROR_CASE(CL_INVALID_VALUE);
+        ERROR_CASE(CL_INVALID_DEVICE_TYPE);
+        ERROR_CASE(CL_INVALID_PLATFORM);
+        ERROR_CASE(CL_INVALID_DEVICE);
+        ERROR_CASE(CL_INVALID_CONTEXT);
+        ERROR_CASE(CL_INVALID_QUEUE_PROPERTIES);
+        ERROR_CASE(CL_INVALID_COMMAND_QUEUE);
+        ERROR_CASE(CL_INVALID_HOST_PTR);
+        ERROR_CASE(CL_INVALID_MEM_OBJECT);
+        ERROR_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
+        ERROR_CASE(CL_INVALID_IMAGE_SIZE);
+        ERROR_CASE(CL_INVALID_SAMPLER);
+        ERROR_CASE(CL_INVALID_BINARY);
+        ERROR_CASE(CL_INVALID_BUILD_OPTIONS);
+        ERROR_CASE(CL_INVALID_PROGRAM);
+        ERROR_CASE(CL_INVALID_PROGRAM_EXECUTABLE);
+        ERROR_CASE(CL_INVALID_KERNEL_NAME);
+        ERROR_CASE(CL_INVALID_KERNEL_DEFINITION);
+        ERROR_CASE(CL_INVALID_KERNEL);
+        ERROR_CASE(CL_INVALID_ARG_INDEX);
+        ERROR_CASE(CL_INVALID_ARG_VALUE);
+        ERROR_CASE(CL_INVALID_ARG_SIZE);
+        ERROR_CASE(CL_INVALID_KERNEL_ARGS);
+        ERROR_CASE(CL_INVALID_WORK_DIMENSION);
+        ERROR_CASE(CL_INVALID_WORK_GROUP_SIZE);
+        ERROR_CASE(CL_INVALID_WORK_ITEM_SIZE);
+        ERROR_CASE(CL_INVALID_GLOBAL_OFFSET);
+        ERROR_CASE(CL_INVALID_EVENT_WAIT_LIST);
+        ERROR_CASE(CL_INVALID_EVENT);
+        ERROR_CASE(CL_INVALID_OPERATION);
+        ERROR_CASE(CL_INVALID_GL_OBJECT);
+        ERROR_CASE(CL_INVALID_BUFFER_SIZE);
+        ERROR_CASE(CL_INVALID_MIP_LEVEL);
+        ERROR_CASE(CL_INVALID_GLOBAL_WORK_SIZE);
+#ifdef CL_VERSION_1_1
+        ERROR_CASE(CL_INVALID_PROPERTY);
+#endif
+#ifdef CL_VERSION_1_2
+        ERROR_CASE(CL_INVALID_IMAGE_DESCRIPTOR);
+        ERROR_CASE(CL_INVALID_COMPILER_OPTIONS);
+        ERROR_CASE(CL_INVALID_LINKER_OPTIONS);
+        ERROR_CASE(CL_INVALID_DEVICE_PARTITION_COUNT);
+#endif
+#ifdef CL_VERSION_2_0
+        ERROR_CASE(CL_INVALID_PIPE_SIZE);
+        ERROR_CASE(CL_INVALID_DEVICE_QUEUE);
+#endif
+#ifdef CL_VERSION_2_2
+        ERROR_CASE(CL_INVALID_SPEC_ID);
+        ERROR_CASE(CL_MAX_SIZE_RESTRICTION_EXCEEDED);
+#endif
+
+        default:
+            return "CL_UNKNOWN_ERROR";
+    }
+}
+
+#define map_cl_error_2_ee(err)                                                               \
+    {                                                                                        \
+        if (err == 0)                                                                        \
+            return SUCCESS;                                                                  \
+        UNI_ERROR_LOG("GCLAPI error in: File: %s Line: %d Func name is: %s GCLERROR = %s\n", \
+            __FILE__, __LINE__, __func__, map_cl_error_2_string(err));                       \
+        return GCL_ERROR;                                                                    \
+    }
+
+inline EE has_dedicated_local(Device device, I32 *b)
+{
+    // query the local memory type directly into a value of the right type
+    cl_device_local_mem_type type;
+    I32 ret = clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(type), &type, nullptr);
+    if (CL_SUCCESS == ret) {
+        *b = (type == CL_LOCAL);
+    }
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ *@ enum define
+ **/
+typedef enum {
+    GCL_MEM_BUF = 0,
+    GCL_MEM_IMG_1D = 1,
+    GCL_MEM_IMG_2D = 2,
+    GCL_MEM_IMG_3D = 3
+} GCLMemType;
+
+typedef enum {
+    HOST_TO_DEVICE_BUF = 0,
+    HOST_TO_DEVICE_IMG = 1,
+    DEVICE_BUF_TO_HOST = 2,
+    DEVICE_IMG_TO_HOST = 3,
+    DEVICE_BUF_TO_BUF = 4,
+    DEVICE_BUF_TO_IMG = 5,
+    DEVICE_IMG_TO_BUF = 6,
+    DEVICE_IMG_TO_IMG = 7
+} GCLMemTransType;
+/**
+ *@ struct define
+ **/
+struct GCLKernelInfo {
+    Kernel kernel = NULL;
+    U32 dim = 0;
+    U32 gs[3] = {0};
+    U32 ls[3] = {0};
+    std::string name;
+};
+
+struct GCLHandle {
+    Platform *platforms;
+    U32 numPlatform;
+    U32 platformId;
+
+    Device *devices;
+    U32 numDevice;
+    U32 deviceId;
+    cl_device_type deviceType;
+
+    Context context;
+    CommandQueue queue;
+    CommandQueue queue_profiling;
+    bool existProfilingQueue;
+
+    Event eventObj;
+    Event *eventPtr;
+    U32 numWaitEvents;
+    Event *waitEvents;
+    double t_execute;
+    double t_total;
+
+    std::string deviceName;
+    std::map<std::string, Kernel> kernelMap;
+    std::map<std::string, Program> programMap;
+    std::vector<GCLKernelInfo> *kernelVec;
+    std::string curOpName;
+    void *kernel_source;
+    void *kernel_binmap_handle;
+    void *kernel_binmap;
+    bool useBinMap;
+    std::string common_source_opt;
+    std::string common_source_ext;
+    Program source_head[1];
+    CI8 *source_head_name[1];
+};
+
+typedef struct GCLHandle *GCLHandle_t;
+
+struct GCLHandleConfig {
+    CI8 *deviceBinmapName;
+};
+
+typedef GCLHandleConfig *GCLHandleConfig_t;
+
+struct GCLMemDesc {
+    U32 dims[6];
+    U32 nDims;
+    DataType dt;
+    DataFormat df;
+
+    U32 stride[3];
+    U32 offset[3];
+    GCLMemType memType;
+    DataFormat memFormat;
+    U32 byteSize;
+    U32 num;
+    MemFlags flags;
+    ImgFormat imgFormat;
+    void *host_ptr;
+    bool need_pad;
+};
+typedef struct GCLMemDesc *GCLMemDesc_t;
+struct GCLMem {
+    Mem mem;
+    GCLMemDesc desc;
+    std::vector<Mem> subMem;
+    std::vector<U8 *> mapPtrArray;
+};
+typedef struct GCLMem *GCLMem_t;
+
+typedef struct {
+    I32 algorithm;
+    U32 best_w[6];
+    U32 best_c[6];
+    U32 best_k[6];
+} ForwardRunInfoMali;
+typedef ForwardRunInfoMali *ForwardRunInfoMali_t;
+
+typedef struct {
+    GCLHandle_t handle;
+    GCLMemDesc_t gclmemInputDesc;
+    GCLMemDesc_t gclmemOutputDesc;
+    GCLMemDesc_t gclmemFilterDesc;
+    ForwardRunInfoMali_t forwardRunInfo;
+} MaliPara;
+typedef MaliPara *MaliPara_t;
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/common/gcl/include/gcl_engine.h b/common/gcl/include/gcl_engine.h
new file mode 100644
index 00000000..25ff32eb
--- /dev/null
+++ b/common/gcl/include/gcl_engine.h
@@ -0,0 +1,66 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
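gcl_engine.h below packages the per-operator kernel bookkeeping into two macros. A sketch of how an operator class is expected to splice them in; OperatorBase, MyOclOperator, and the run_prepare body are illustrative, and the algorithmMap member referenced by REGISTER_OCL_OPERATOR_RUN is assumed to come from the real operator base class in the full tree:

    class OperatorBase {
    public:
        virtual ~OperatorBase() {}
        virtual void run() = 0;
    };

    class MyOclOperator : public OperatorBase {
    public:
        // assumed member: the real operator base supplies this (see algorithm_map.h)
        std::shared_ptr<AlgorithmMap> algorithmMap;

        void run_prepare()
        {
            // record this operator's kernels into opKernelVec,
            // e.g. via gcl_create_kernel(...) + gcl_set_kernelVec(...)
        }
        ~MyOclOperator()
        {
            DESTROY_OCL_KERNEL  // release the recorded kernels on teardown
        }
        // supplies run(), the two lazy-init flags, and opKernelVec;
        // setMALIArchInfo(...) is what initializes the flags to true
        REGISTER_OCL_OPERATOR_RUN
    };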
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef H_OCL_ENGINE
+#define H_OCL_ENGINE
+
+#include "sys.h"
+#include "ocl_context.h"
+
+#define REGISTER_OCL_OPERATOR_RUN                                                                 \
+    virtual void run() override                                                                   \
+    {                                                                                             \
+        GCLHandle_t handle = OCLContext::getInstance().handle.get();                              \
+        handle->kernelVec = &this->opKernelVec;                                                   \
+        if (this->needSetKernelVec) {                                                             \
+            run_prepare();                                                                        \
+            this->needSetKernelVec = false;                                                       \
+            if (this->needSelectKernelLS) {                                                       \
+                CHECK_STATUS(gcl_infer_best_kernelVec_ls_with_map(handle, this->algorithmMap));   \
+                this->needSelectKernelLS = false;                                                 \
+            }                                                                                     \
+        }                                                                                         \
+        CHECK_STATUS(gcl_run_kernelVec(handle));                                                  \
+    }                                                                                             \
+                                                                                                  \
+private:                                                                                          \
+    bool needSetKernelVec;                                                                        \
+    bool needSelectKernelLS;                                                                      \
+    std::vector<GCLKernelInfo> opKernelVec;
+
+#define DESTROY_OCL_KERNEL                                           \
+    GCLHandle_t handle = OCLContext::getInstance().handle.get();     \
+    handle->kernelVec = &this->opKernelVec;                          \
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
+
+inline void setMALIArchInfo(
+    ArchInfo *archInfo, ForwardRunInfoMali *runInfo, bool *needSetKernelVec, bool *needSelectKernelLS)
+{
+    if (runInfo != nullptr) {
+        runInfo->algorithm = 0;
+        runInfo->best_w[0] = 1;
+        runInfo->best_w[1] = 1;
+        runInfo->best_c[0] = 1;
+        runInfo->best_c[1] = 1;
+        runInfo->best_k[0] = 1;
+        runInfo->best_k[1] = 1;
+    }
+    MaliPara *maliPara = (MaliPara *)malloc(sizeof(MaliPara));
+    maliPara->handle = OCLContext::getInstance().handle.get();
+    maliPara->forwardRunInfo = runInfo;
+    archInfo->arch = MALI;
+    archInfo->archPara = (void *)maliPara;
+    *needSetKernelVec = true;
+    *needSelectKernelLS = true;
+}
+#endif  // H_OCL_ENGINE
diff --git a/common/gcl/include/gcl_func.h b/common/gcl/include/gcl_func.h
new file mode 100644
index 00000000..d8529df9
--- /dev/null
+++ b/common/gcl/include/gcl_func.h
@@ -0,0 +1,1351 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef H_GCL_FUNC
+#define H_GCL_FUNC
+
+#include <iostream>
+#include <string>
+#include <stdlib.h>
+#include <dlfcn.h>
+#include "gcl_common.h"
+#include "platform.h"
+#include "context.h"
+#include "program.h"
+#include "memory.h"
+#include "kernel.h"
+#include "event.h"
+#include "gcl_kernel_binmap.h"
+#include "gcl_kernel_source.h"
+#include "libkernelsource.h"
+#include "algorithm_map.h"
+#include "types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+inline EE gcl_regist_binMap(GCLHandle_t handle)
+{
+    std::string deviceName = handle->deviceName;
+    std::string libKernelBinName = "lib" + deviceName + "_map.so";
+    char *err;
+    void *dvm_handle = dlopen(libKernelBinName.c_str(), RTLD_LAZY);
+    if (dvm_handle) {
+        std::string func = "create_" + deviceName + "_kernelbin_map";
+        gcl_kernel_binmap *(*create_kernelbin_map)();
+        dlerror();
+        create_kernelbin_map = (gcl_kernel_binmap * (*)()) dlsym(dvm_handle, func.c_str());
+        if ((err = dlerror()) != NULL) {
+            UNI_ERROR_LOG(
+                "Get %s in %s failed, error %s\n", func.c_str(), libKernelBinName.c_str(), err);
+            dlclose(dvm_handle);
+            return NULL_POINTER;
+        }
+        gcl_kernel_binmap *kernel_binmap = create_kernelbin_map();
+        handle->kernel_binmap = (void *)kernel_binmap;
+        handle->useBinMap = true;
+        handle->kernel_binmap_handle = dvm_handle;
+    } else {
+        UNI_DEBUG_LOG("dlopen %s failed, %s, creating kernels from source code instead\n",
+            libKernelBinName.c_str(), dlerror());
+    }
+    return SUCCESS;
+}
+
+inline EE gcl_regist_sourceMap(GCLHandle_t handle)
+{
+    gcl_kernel_source *kernel_source = (gcl_kernel_source *)new kernel_source_executor();
+    handle->kernel_source = kernel_source;
+    KernelOption *common_opt;
+    if (!kernel_source->get_option("common", &common_opt)) {
+        UNI_ERROR_LOG("the common option doesn't exist in optionMap\n");
+        CHECK_STATUS(NULL_POINTER);
+    }
+    handle->common_source_opt = common_opt->option;
+    handle->common_source_ext = "#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable\n";
+    handle->common_source_ext += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+    handle->source_head_name[0] = "kernel_def.h";
+    KernelSource *head_source;
+    if (!kernel_source->get_source("kernel_def", &head_source)) {
+        UNI_ERROR_LOG("the kernel_def source doesn't exist in sourceMap\n");
+        CHECK_STATUS(NULL_POINTER);
+    }
+    CHECK_STATUS(create_program_from_source(
+        handle->context, (U32 *)&head_source->len, head_source->data, handle->source_head));
+    return SUCCESS;
+}
+
+inline EE gcl_get_device_name(GCLHandle_t handle)
+{
+    cl_device_id device = handle->devices[handle->deviceId];
+    U32 len;
+    I8 *data;
+    CHECK_STATUS(get_device_info(device, CL_DEVICE_NAME, (void **)&data, &len));
+    I8 devName[64];
+    for (U32 i = 0; i < len - 1; i++) {
+        if (data[i] == '-') {
+            data[i] = '_';
+        }
+        if (data[i] == ' ') {
+            data[i] = '_';
+        }
+        devName[i] = data[i];
+    }
+    U32 version_len;
+    free(data);
+    CHECK_STATUS(get_device_info(device, CL_DEVICE_VERSION, (void **)&data, &version_len));
+    std::string deviceV = std::string(data);
+    U32 be = deviceV.find("r");
+    U32 end = deviceV.find("p", be + 1);
+    std::string numV = deviceV.substr(be + 1,
end - be - 1); + U32 i = atoi(numV.c_str()); + if (i >= 14) { + devName[len - 1] = 'p'; + devName[len] = '\0'; + } else { + devName[len - 1] = '\0'; + } + free(data); + handle->deviceName = devName; + return SUCCESS; +} + +inline EE gcl_create_handle(GCLHandle_t *handlePtr) +{ + if (handlePtr == NULL) { + UNI_ERROR_LOG("the handlePtr set to gcl_create_handle is NULL\n"); + } + GCLHandle_t handle = new GCLHandle(); + handle->platformId = 0; + handle->deviceId = 0; + handle->deviceType = CL_DEVICE_TYPE_GPU; + handle->eventPtr = nullptr; + handle->numWaitEvents = 0; + handle->waitEvents = nullptr; + handle->t_execute = 0; + handle->t_total = 0; + handle->curOpName = "unknow"; + handle->deviceName = "unknow"; + handle->kernel_source = nullptr; + handle->kernel_binmap = nullptr; + handle->kernel_binmap_handle = nullptr; + handle->common_source_opt = "unknow"; + handle->common_source_ext = "unknow"; + handle->source_head_name[0] = "unknow"; + handle->useBinMap = false; + handle->existProfilingQueue = false; + U32 platformId = handle->platformId; + U32 deviceId = handle->deviceId; + CHECK_STATUS(get_platforms(&handle->numPlatform, &handle->platforms)); + CHECK_STATUS(platform_get_devices( + handle->platforms[platformId], handle->deviceType, &handle->numDevice, &handle->devices)); + CHECK_STATUS(create_context( + handle->platforms[platformId], handle->numDevice, handle->devices, &handle->context)); + cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, 0, 0}; +#ifdef _DEBUG + handle->eventPtr = &handle->eventObj; + props[1] = props[1] | CL_QUEUE_PROFILING_ENABLE; +#endif + CHECK_STATUS(create_command_queue_properties( + handle->context, handle->devices[deviceId], props, &handle->queue)); + CHECK_STATUS(gcl_get_device_name(handle)); + CHECK_STATUS(gcl_regist_binMap(handle)); + if (!handle->useBinMap) { + CHECK_STATUS(gcl_regist_sourceMap(handle)); + } + *handlePtr = handle; + return SUCCESS; +} + +inline void gcl_destroy_handle(GCLHandle_t handle) +{ + U32 deviceId = handle->deviceId; + CHECK_STATUS(finish(handle->queue)); + for (auto k : handle->programMap) { + CHECK_STATUS(release_program(k.second)); + } + for (auto k : handle->kernelMap) { + CHECK_STATUS(release_kernel(k.second)); + } + if (handle->useBinMap) { + delete (gcl_kernel_binmap *)handle->kernel_binmap; + dlclose(handle->kernel_binmap_handle); + } else { + CHECK_STATUS(release_program(handle->source_head[0])); + delete (gcl_kernel_source *)handle->kernel_source; + } + handle->kernelMap.clear(); + if (handle->existProfilingQueue) { + CHECK_STATUS(finish(handle->queue_profiling)); + CHECK_STATUS(release_command_queue(handle->queue_profiling)); + } + CHECK_STATUS(release_command_queue(handle->queue)); + CHECK_STATUS(release_context(handle->context)); + CHECK_STATUS(release_device(handle->devices[deviceId])); + free(handle->devices); + free(handle->platforms); + delete handle; +} + +inline EE gcl_enable_queue_profiling(GCLHandle_t handle) +{ +#ifndef _DEBUG + handle->eventPtr = &handle->eventObj; + bool enableProfiling; + CHECK_STATUS(check_queue_profiling(handle->queue, &enableProfiling)); + if (enableProfiling) { + return SUCCESS; + } + if (!handle->existProfilingQueue) { + cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, 0, 0}; + props[1] = props[1] | CL_QUEUE_PROFILING_ENABLE; + CHECK_STATUS(create_command_queue_properties( + handle->context, handle->devices[handle->deviceId], props, &handle->queue_profiling)); + handle->existProfilingQueue = true; + } + CommandQueue tmpQueue = handle->queue; + handle->queue = 
handle->queue_profiling; + handle->queue_profiling = tmpQueue; +#endif + return SUCCESS; +} + +inline EE gcl_off_queue_profiling(GCLHandle_t handle) +{ +#ifndef _DEBUG + handle->eventPtr = NULL; + bool enableProfiling; + CHECK_STATUS(check_queue_profiling(handle->queue, &enableProfiling)); + if (!enableProfiling) { + return SUCCESS; + } + CHECK_STATUS(check_queue_profiling(handle->queue_profiling, &enableProfiling)); + if (!enableProfiling) { + CHECK_STATUS(finish(handle->queue)); + CommandQueue tmpQueue = handle->queue; + handle->queue = handle->queue_profiling; + handle->queue_profiling = tmpQueue; + } else { + return NOT_SUPPORTED; + } +#endif + return SUCCESS; +} + +inline GCLMemDesc gcl_mem_desc(U32 stride[], U32 offset[], DataType dt, DataFormat memFormat) +{ + GCLMemDesc desc; + U32 s0, s1, s2; + s0 = stride[0]; + s1 = stride[1]; + s2 = stride[2]; + desc.stride[0] = s0; + desc.stride[1] = s1; + desc.stride[2] = s2; + desc.offset[0] = offset[0]; + desc.offset[1] = offset[1]; + desc.offset[2] = offset[2]; + desc.memFormat = memFormat; + desc.memType = GCL_MEM_BUF; + desc.num = s0 * s1 * s2; + desc.byteSize = s0 * s1 * s2 * bytesOf(dt); + desc.flags = CL_MEM_READ_WRITE; + desc.host_ptr = NULL; + desc.imgFormat.image_channel_order = CL_RGBA; + desc.imgFormat.image_channel_data_type = CL_HALF_FLOAT; + desc.need_pad = false; + return desc; +} + +inline GCLMem_t gcl_create_gclmem() +{ + GCLMem_t ret = new GCLMem; + ret->mem = NULL; + U32 str[3] = {0, 0, 0}; + U32 off[3] = {0, 0, 0}; + ret->desc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); + return ret; +} + +inline EE gcl_release_subMem(GCLMem_t gclMem) +{ + if (gclMem == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (gclMem->subMem.size()) { + for (auto p : gclMem->subMem) { + CHECK_STATUS(release_memory(p)); + } + gclMem->subMem.clear(); + } + return SUCCESS; +} + +inline EE gcl_release_memory(GCLMem_t gclMem) +{ + if (gclMem == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (gclMem->mem) { + CHECK_STATUS(release_memory(gclMem->mem)); + gclMem->mem = NULL; + } + return SUCCESS; +} + +inline void gcl_destroy_gclmem(GCLMem_t mem) +{ + CHECK_STATUS(gcl_release_subMem(mem)); + CHECK_STATUS(gcl_release_memory(mem)); + delete mem; +} + +inline EE gcl_finish(GCLHandle_t handle) +{ + CHECK_STATUS(finish(handle->queue)); + return SUCCESS; +} + +inline EE gcl_unmap_memory(GCLHandle_t handle, GCLMem_t gclMem) +{ + for (auto p : gclMem->mapPtrArray) { + CHECK_STATUS(enqueue_unmap_memory(handle->queue, gclMem->mem, (void *)p, + handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); +#ifdef _DEBUG + double executeTime = 0; + CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); + CHECK_STATUS(release_event(handle->eventObj)); + UNI_DEBUG_LOG( + "DATAUNMAP>>> enqueue_unmap_memory runInfo: executeTime = %f us\n", executeTime); + CHECK_STATUS(gcl_finish(handle)); +#endif + } + if (gclMem->mapPtrArray.size()) { + gclMem->mapPtrArray.clear(); + } + return SUCCESS; +} + +inline EE gcl_produce_program_kernel_with_source(GCLHandle_t handle, + U32 *len, + CI8 *src, + CI8 *option, + Program *program, + U32 numKernel, + Kernel *kernels) +{ + U32 deviceId = handle->deviceId; + CHECK_STATUS(create_build_program_from_source( + handle->context, len, src, handle->devices[deviceId], option, program)); + CHECK_STATUS(create_kernels_in_program(*program, numKernel, kernels)); + return SUCCESS; +} + +inline EE gcl_get_program_info(Program program, U8 **binary, U32 *len) +{ + CHECK_STATUS(get_program_binary(program, 
binary, len)); + return SUCCESS; +} + +inline EE gcl_kernelmap_put(GCLHandle_t handle, std::string kernelName, Kernel kernel) +{ + handle->kernelMap.insert(std::pair<std::string, Kernel>(kernelName, kernel)); + return SUCCESS; +} + +inline Kernel gcl_kernelmap_get(GCLHandle_t handle, std::string kernelName) +{ + auto it = handle->kernelMap.find(std::string(kernelName)); + if (it == handle->kernelMap.end()) { + CHECK_STATUS(NOT_MATCH); + } + return it->second; +} + +inline EE gcl_create_kernel_binary(GCLHandle_t handle, CI8 *kernelName, Kernel *kernel) +{ + std::string binmapname = handle->deviceName; + std::string binmap_kernelname = binmapname + "_" + std::string(kernelName); + gcl_kernel_binmap *kernel_binmap = (gcl_kernel_binmap *)handle->kernel_binmap; + KernelBin *binmap; + if (!kernel_binmap->get(binmap_kernelname, &binmap)) { + UNI_ERROR_LOG( + "failed to get kernel %s from the %s kernel_binmap\n", kernelName, binmapname.c_str()); + return NULL_POINTER; + } + + U32 length = binmap->len; + CU8 *data = binmap->data; + I32 binsta; + Program program; + CI8 *options = ""; + Device device = handle->devices[handle->deviceId]; + CHECK_STATUS( + create_program_from_binary(handle->context, device, &length, &data, &binsta, &program)); + CHECK_STATUS(build_program(program, device, options)); + CHECK_STATUS(create_kernel(program, kernelName, kernel)); + CHECK_STATUS(release_program(program)); + return SUCCESS; +} + +inline EE gcl_create_kernel_with_source_map(GCLHandle_t handle, CI8 *kernelName, Kernel *kernel) +{ + Program program; + auto it = handle->programMap.find(kernelName); + if (it == handle->programMap.end()) { + gcl_kernel_source *kernel_source = (gcl_kernel_source *)handle->kernel_source; + KernelOption *option_ptr; + KernelSource *source_ptr; + CI8 *sourceName; + std::string option; + std::string optionName = kernelName; + bool use_common_opt; + if (!kernel_source->get_option(optionName, &option_ptr)) { + sourceName = kernelName; + option = ""; + use_common_opt = true; + } else { + use_common_opt = option_ptr->use_common_opt; + sourceName = option_ptr->sourceName; + option = option_ptr->option; + } + if (use_common_opt) { + option = handle->common_source_opt + option; + } + if (!kernel_source->get_source(sourceName, &source_ptr)) { + UNI_ERROR_LOG("the kernel source %s doesn't exist in the sourceMap\n", sourceName); + CHECK_STATUS(NULL_POINTER); + } + + U32 len = source_ptr->len + handle->common_source_ext.size(); + std::string source = source_ptr->data; + source = handle->common_source_ext + source; + bool use_kernel_def_head = source_ptr->use_kernel_def_head; + CHECK_STATUS(create_program_from_source(handle->context, &len, source.c_str(), &program)); + Device device = handle->devices[handle->deviceId]; + if (use_kernel_def_head) { + CHECK_STATUS(compile_program( + program, device, option.c_str(), 1, handle->source_head, handle->source_head_name)); + CHECK_STATUS(link_program(handle->context, device, NULL, 1, &program, &program)); + } else { + CHECK_STATUS(build_program(program, device, option.c_str())); + } + handle->programMap.insert(std::pair<std::string, Program>(kernelName, program)); + } else { + program = it->second; + } + CHECK_STATUS(create_kernel(program, kernelName, kernel)); + return SUCCESS; +} + +inline EE gcl_create_kernel(GCLHandle_t handle, CI8 *kernelName, Kernel *kernel) +{ + if (handle->useBinMap) { + CHECK_STATUS(gcl_create_kernel_binary(handle, kernelName, kernel)); + } else { + CHECK_STATUS(gcl_create_kernel_with_source_map(handle, kernelName, kernel)); + } + return SUCCESS; +} + +inline EE gcl_get_kernel_from_map(GCLHandle_t handle, CI8 
*kernelName, Kernel *kernel) +{ + std::string binmapname = handle->deviceName; + std::string binmap_kernelname = binmapname + "_" + std::string(kernelName); + if (handle->kernelMap.find(binmap_kernelname) == handle->kernelMap.end()) { + CHECK_STATUS(gcl_create_kernel(handle, kernelName, kernel)); + CHECK_STATUS(gcl_kernelmap_put(handle, binmap_kernelname, *kernel)); + } else { + *kernel = gcl_kernelmap_get(handle, binmap_kernelname); + } + return SUCCESS; +} + +inline EE gcl_set_kernelVec(GCLHandle_t handle, + Kernel kernel, + U32 work_dim, + U32 global_work_size[], + U32 local_work_size[], + CI8 *kernelName = NULL) +{ + GCLKernelInfo kernelInfo; + kernelInfo.kernel = kernel; + kernelInfo.dim = work_dim; + // guard against the NULL default so std::string never sees a null pointer + kernelInfo.name = handle->curOpName + "_" + std::string((kernelName != NULL) ? kernelName : "unknown"); + switch (work_dim) { + case 1: { + kernelInfo.gs[0] = global_work_size[0]; + kernelInfo.gs[1] = 1; + kernelInfo.gs[2] = 1; + kernelInfo.ls[0] = local_work_size[0]; + kernelInfo.ls[1] = 0; + kernelInfo.ls[2] = 0; + break; + } + case 2: { + kernelInfo.gs[0] = global_work_size[0]; + kernelInfo.gs[1] = global_work_size[1]; + kernelInfo.gs[2] = 1; + kernelInfo.ls[0] = local_work_size[0]; + kernelInfo.ls[1] = local_work_size[1]; + kernelInfo.ls[2] = 0; + break; + } + case 3: { + kernelInfo.gs[0] = global_work_size[0]; + kernelInfo.gs[1] = global_work_size[1]; + kernelInfo.gs[2] = global_work_size[2]; + kernelInfo.ls[0] = local_work_size[0]; + kernelInfo.ls[1] = local_work_size[1]; + kernelInfo.ls[2] = local_work_size[2]; + break; + } + default: + return NOT_SUPPORTED; + } + handle->kernelVec->push_back(kernelInfo); + return SUCCESS; +} + +inline EE gcl_run_kernelVec(GCLHandle_t handle, U32 *index = NULL) +{ + CommandQueue queue = handle->queue; + U32 numWaitEvents = handle->numWaitEvents; + Event *waitEvents = handle->waitEvents; + Event *eventPtr = handle->eventPtr; + U32 runBe; + U32 runEnd; + if (index) { + runBe = index[0]; + runEnd = index[1]; + } else { + runBe = 0; + runEnd = handle->kernelVec->size(); + } + for (U32 i = runBe; i < runEnd; ++i) { + auto kernelInfo = (*handle->kernelVec)[i]; + CHECK_STATUS(enqueue_ndrange_kernel(queue, kernelInfo.kernel, kernelInfo.dim, NULL, + kernelInfo.gs, kernelInfo.ls, numWaitEvents, waitEvents, eventPtr)); +#ifdef _DEBUG + double executeTime = 0; + CHECK_STATUS(event_counting_time(eventPtr, NULL, NULL, NULL, NULL, &executeTime)); + CHECK_STATUS(release_event(*eventPtr)); + handle->t_execute = executeTime; + UNI_DEBUG_LOG( + "KERNEL>>> %s runInfo: executeTime = %f us\n", kernelInfo.name.c_str(), executeTime); + CHECK_STATUS(gcl_finish(handle)); +#endif + } + return SUCCESS; +} + +inline EE gcl_run_kernelVec_timing( + GCLHandle_t handle, U32 be, U32 end, std::vector<double> *kernelArrayTime = NULL) +{ + bool enableProfiling; + CHECK_STATUS(check_queue_profiling(handle->queue, &enableProfiling)); + if (enableProfiling) { + double executeTime = 0; + double totalTime = 0; + CommandQueue queue = handle->queue; + U32 numWaitEvents = handle->numWaitEvents; + Event *waitEvents = handle->waitEvents; + Event *eventPtr = handle->eventPtr; + for (U32 i = be; i < end; ++i) { + auto kernelInfo = (*handle->kernelVec)[i]; + for (U32 j = 0; j < 3; j++) { + CHECK_STATUS(enqueue_ndrange_kernel(queue, kernelInfo.kernel, kernelInfo.dim, NULL, + kernelInfo.gs, kernelInfo.ls, numWaitEvents, waitEvents, eventPtr)); + CHECK_STATUS( + event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); + CHECK_STATUS(release_event(handle->eventObj)); + } + UNI_DEBUG_LOG("KERNEL>>> %s runInfo: executeTime 
= %f us\n", kernelInfo.name.c_str(), + executeTime); + CHECK_STATUS(gcl_finish(handle)); + totalTime += executeTime; + if (kernelArrayTime) { + (*kernelArrayTime).push_back(executeTime); + } + } + handle->t_execute = totalTime; + return SUCCESS; + } + return NOT_SUPPORTED; +} + +inline EE gcl_clean_kernelVec(GCLHandle_t handle) +{ + for (U32 i = 0; i < handle->kernelVec->size(); i++) { + auto k = (*handle->kernelVec)[i]; + CHECK_STATUS(release_kernel(k.kernel)); + } + handle->kernelVec->clear(); + return SUCCESS; +} + +inline EE gcl_run_kernel( + GCLHandle_t handle, Kernel kernel, U32 work_dim, U32 *gs, U32 *ls, CI8 *kernelName = NULL) +{ + CHECK_STATUS(enqueue_ndrange_kernel(handle->queue, kernel, work_dim, NULL, gs, ls, + handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); +#ifdef _DEBUG + std::string name = "unknown kernel"; + if (kernelName) { + name = handle->curOpName + "_" + std::string(kernelName); + } + double executeTime = 0; + CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); + CHECK_STATUS(release_event(handle->eventObj)); + handle->t_execute = executeTime; + UNI_DEBUG_LOG("KERNEL>>> %s runInfo: executeTime = %f us\n", name.c_str(), executeTime); + CHECK_STATUS(gcl_finish(handle)); +#else + UNUSED(kernelName); +#endif + return SUCCESS; +} + +inline U32 get_next_ls_size(U32 ls_size) +{ + return (ls_size << 1); +} +inline EE gcl_run_kernel_select_ls(GCLHandle_t handle, GCLKernelInfo *kernelInfo) +{ + auto kernel = kernelInfo->kernel; + auto work_dim = kernelInfo->dim; + auto gs = kernelInfo->gs; + double minTime = DBL_MAX; + double time; + U32 test_ls[3]; + U32 best_ls[3]; + U32 test_gs[3]; + U32 maxSize = 384; + U32 gs_x = 256; + U32 gs_y = (work_dim > 1) ? 256 : 1; + U32 gs_z = (work_dim > 2) ? 
gs[2] : 1; + for (U32 z = 1; z <= gs_z; z = get_next_ls_size(z)) { + if (0 != gs_z % z) { + continue; + } + for (U32 y = 1; y <= gs_y; y = get_next_ls_size(y)) { + if (0 != gs_y % y) { + continue; + } + for (U32 x = 1; x <= gs_x; x = get_next_ls_size(x)) { + if (0 != gs_x % x) { + continue; + } + U32 total = x * y * z; + if (total <= maxSize) { + test_gs[0] = (gs[0] + x - 1) / x * x; + test_gs[1] = (gs[1] + y - 1) / y * y; + test_gs[2] = (gs[2] + z - 1) / z * z; + test_ls[0] = x; + test_ls[1] = y; + test_ls[2] = z; + CHECK_STATUS( + enqueue_ndrange_kernel(handle->queue, kernel, work_dim, NULL, test_gs, + test_ls, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); + CHECK_STATUS( + event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &time)); + if (minTime > time) { + minTime = time; + best_ls[0] = test_ls[0]; + best_ls[1] = test_ls[1]; + best_ls[2] = test_ls[2]; + } + CHECK_STATUS(release_event(handle->eventObj)); + } + } + } + } + test_ls[0] = 0; + test_ls[1] = 0; + test_ls[2] = 0; + CHECK_STATUS(enqueue_ndrange_kernel(handle->queue, kernel, work_dim, NULL, gs, test_ls, + handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); + CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &time)); + if (minTime > time) { + minTime = time; + best_ls[0] = test_ls[0]; + best_ls[1] = test_ls[1]; + best_ls[2] = test_ls[2]; + } + CHECK_STATUS(release_event(handle->eventObj)); + if (best_ls[0] != 0 && best_ls[1] != 0 && best_ls[2] != 0) { + kernelInfo->gs[0] = (gs[0] + best_ls[0] - 1) / best_ls[0] * best_ls[0]; + kernelInfo->gs[1] = (gs[1] + best_ls[1] - 1) / best_ls[1] * best_ls[1]; + kernelInfo->gs[2] = (gs[2] + best_ls[2] - 1) / best_ls[2] * best_ls[2]; + } + kernelInfo->ls[0] = best_ls[0]; + kernelInfo->ls[1] = best_ls[1]; + kernelInfo->ls[2] = best_ls[2]; + handle->t_execute = minTime; + UNI_DEBUG_LOG("SELECT LS KERNEL>>> %s runInfo: best ls = %u %u %u executeTime = %f us\n", + kernelInfo->name.c_str(), best_ls[0], best_ls[1], best_ls[2], minTime); + return SUCCESS; +} + +inline EE gcl_run_kernelVec_select_ls(GCLHandle_t handle, std::vector<U32> kernelIndex) +{ + if (kernelIndex.size() == 0) { + return SUCCESS; + } + CHECK_STATUS(gcl_enable_queue_profiling(handle)); + for (auto index : kernelIndex) { + auto kernelInfo = (*handle->kernelVec)[index]; + CHECK_STATUS(gcl_run_kernel_select_ls(handle, &kernelInfo)); + (*handle->kernelVec)[index].gs[0] = kernelInfo.gs[0]; + (*handle->kernelVec)[index].gs[1] = kernelInfo.gs[1]; + (*handle->kernelVec)[index].gs[2] = kernelInfo.gs[2]; + (*handle->kernelVec)[index].ls[0] = kernelInfo.ls[0]; + (*handle->kernelVec)[index].ls[1] = kernelInfo.ls[1]; + (*handle->kernelVec)[index].ls[2] = kernelInfo.ls[2]; + } + CHECK_STATUS(gcl_off_queue_profiling(handle)); + return SUCCESS; +} + +inline EE gcl_infer_best_kernelVec_ls_with_map( + GCLHandle_t handle, std::shared_ptr<AlgorithmMap> algoMap) +{ + std::vector<U32> kernelIndex; + U32 len = handle->kernelVec->size(); + for (U32 i = 0; i < len; i++) { + auto kernelInfo = (*handle->kernelVec)[i]; + U32 gs[3]; + U32 ls[3]; + bool findKernelThreadInfo = false; + findKernelThreadInfo = algoMap->getKernelThreadInfoFromMap(kernelInfo.name, gs, ls); + if (findKernelThreadInfo) { + (*handle->kernelVec)[i].gs[0] = gs[0]; + (*handle->kernelVec)[i].gs[1] = gs[1]; + (*handle->kernelVec)[i].gs[2] = gs[2]; + (*handle->kernelVec)[i].ls[0] = ls[0]; + (*handle->kernelVec)[i].ls[1] = ls[1]; + (*handle->kernelVec)[i].ls[2] = ls[2]; + } else { + kernelIndex.push_back(i); + } + } + 
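// Only kernels whose thread info was absent from the algo map are re-tuned by + // the call below; the follow-up loop then records gs/ls for every kernel so + // subsequent runs can skip local-size selection entirely. + 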
CHECK_STATUS(gcl_run_kernelVec_select_ls(handle, kernelIndex)); + for (U32 i = 0; i < len; i++) { + auto kernelInfo = (*handle->kernelVec)[i]; + algoMap->setKernelThreadInfoToMap(kernelInfo.name, kernelInfo.gs, kernelInfo.ls); + } + return SUCCESS; +} + +#ifdef _DEBUG +inline EE gcl_run_kernel_profiling( + GCLHandle_t handle, Kernel kernel, U32 work_dim, U32 *gs, U32 *ls, CI8 *kernelName = NULL) +{ + std::string name = "unknown kernel"; + if (kernelName) { + name = kernelName; + } + std::ostringstream debugLog; + debugLog << "KERNEL>>> " << name << " runInfo: "; + double totalTime = 0; + double executeTime = 0; + U32 loop = 10; + for (U32 i = 0; i < loop; i++) { + double t; + CHECK_STATUS(enqueue_ndrange_kernel(handle->queue, kernel, work_dim, NULL, gs, ls, + handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); + CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &t)); + CHECK_STATUS(release_event(handle->eventObj)); + debugLog << "loop " << i << " executeTime = " << t << " us " << std::endl; + totalTime += t; + } + executeTime = totalTime / loop; + debugLog << "executeTime = " << executeTime << " us for " << loop << " times average"; + UNI_DEBUG_LOG("%s\n", debugLog.str().c_str()); + CHECK_STATUS(gcl_finish(handle)); + return SUCCESS; +} +#endif + +inline EE gcl_create_memory(GCLHandle_t handle, GCLMem_t gclMem) +{ + GCLMemDesc_t desc = &gclMem->desc; + switch (desc->memType) { + case GCL_MEM_BUF: { + CHECK_STATUS(create_buffer( + handle->context, desc->flags, desc->byteSize, desc->host_ptr, &gclMem->mem)); + break; + } + case GCL_MEM_IMG_1D: { + CHECK_STATUS(create_image1D(handle->context, desc->flags, &desc->imgFormat, + desc->stride[0], 0, desc->host_ptr, &gclMem->mem)); + break; + } + case GCL_MEM_IMG_2D: { + CHECK_STATUS(create_image2D(handle->context, desc->flags, &desc->imgFormat, + desc->stride[0], desc->stride[1], 0, desc->host_ptr, &gclMem->mem)); + break; + } + case GCL_MEM_IMG_3D: { + CHECK_STATUS( + create_image3D(handle->context, desc->flags, &desc->imgFormat, desc->stride[0], + desc->stride[1], desc->stride[2], 0, 0, desc->host_ptr, &gclMem->mem)); + break; + } + default: + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE gcl_trans_memory(GCLHandle_t handle, + void *src, + void *dst, + U32 *size, + GCLMemTransType type, + cl_bool blocking, + U32 *offset = NULL) +{ +#ifdef _DEBUG + std::string debug_info = "DATATRANS>>> "; +#endif + switch (type) { + case HOST_TO_DEVICE_BUF: { + U8 *hostPtr = (U8 *)src; + GCLMem_t gclMem = (GCLMem_t)dst; + U32 dstOff = (offset) ? offset[0] : 0; + CHECK_STATUS(enqueue_write_buffer(handle->queue, gclMem->mem, blocking, dstOff, *size, + hostPtr, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); +#ifdef _DEBUG + debug_info += "enqueue_write_buffer runInfo: "; +#endif + break; + } + case HOST_TO_DEVICE_IMG: { + U8 *hostPtr = (U8 *)src; + GCLMem_t gclMem = (GCLMem_t)dst; + U32 origin[3] = {0, 0, 0}; + if (offset) { + origin[0] = offset[0]; + origin[1] = offset[1]; + origin[2] = offset[2]; + } + CHECK_STATUS(enqueue_write_image(handle->queue, gclMem->mem, blocking, origin, size, 0, + 0, hostPtr, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); +#ifdef _DEBUG + debug_info += "enqueue_write_image runInfo: "; +#endif + break; + } + case DEVICE_BUF_TO_HOST: { + U8 *hostPtr = (U8 *)dst; + GCLMem_t gclMem = (GCLMem_t)src; + U32 srcOff = (offset) ? 
offset[0] : 0; + CHECK_STATUS(enqueue_read_buffer(handle->queue, gclMem->mem, blocking, srcOff, *size, + hostPtr, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); +#ifdef _DEBUG + debug_info += "enqueue_read_buffer runInfo: "; +#endif + break; + } + case DEVICE_IMG_TO_HOST: { + U8 *hostPtr = (U8 *)dst; + GCLMem_t gclMem = (GCLMem_t)src; + U32 origin[3] = {0, 0, 0}; + if (offset) { + origin[0] = offset[0]; + origin[1] = offset[1]; + origin[2] = offset[2]; + } + CHECK_STATUS(enqueue_read_image(handle->queue, gclMem->mem, blocking, origin, size, 0, + 0, hostPtr, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); +#ifdef _DEBUG + debug_info += "enqueue_read_image runInfo: "; +#endif + break; + } + case DEVICE_BUF_TO_BUF: { + GCLMem_t srcBuf = (GCLMem_t)src; + GCLMem_t dstBuf = (GCLMem_t)dst; + U32 srcOff = 0; + U32 dstOff = 0; + if (offset) { + srcOff = offset[0]; + dstOff = offset[1]; + } + CHECK_STATUS(enqueue_copy_buffer(handle->queue, srcBuf->mem, dstBuf->mem, srcOff, + dstOff, *size, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); +#ifdef _DEBUG + debug_info += "enqueue_copy_buffer runInfo: "; +#endif + break; + } + case DEVICE_BUF_TO_IMG: { + GCLMem_t srcBuf = (GCLMem_t)src; + GCLMem_t dstImg = (GCLMem_t)dst; + U32 origin[3] = {0, 0, 0}; + U32 srcOff = 0; + if (offset) { + srcOff = offset[0]; + origin[0] = offset[1]; + origin[1] = offset[2]; + origin[2] = offset[3]; + } + CHECK_STATUS(enqueue_copy_buffer_to_image(handle->queue, srcBuf->mem, dstImg->mem, + srcOff, origin, size, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)) +#ifdef _DEBUG + debug_info += "enqueue_copy_buffer_to_image runInfo: "; +#endif + break; + } + case DEVICE_IMG_TO_BUF: { + GCLMem_t srcImg = (GCLMem_t)src; + GCLMem_t dstBuf = (GCLMem_t)dst; + U32 origin[3] = {0, 0, 0}; + U32 dstOff = 0; + if (offset) { + origin[0] = offset[0]; + origin[1] = offset[1]; + origin[2] = offset[2]; + dstOff = offset[3]; + } + CHECK_STATUS(enqueue_copy_image_to_buffer(handle->queue, srcImg->mem, dstBuf->mem, + origin, size, dstOff, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)) +#ifdef _DEBUG + debug_info += "enqueue_copy_image_to_buffer runInfo: "; +#endif + break; + } + case DEVICE_IMG_TO_IMG: { + return NOT_SUPPORTED; + break; + } + default: + return NOT_SUPPORTED; + } +#ifdef _DEBUG + double executeTime = 0; + CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); + CHECK_STATUS(release_event(handle->eventObj)); + UNI_DEBUG_LOG("%sexecuteTime = %f us\n", debug_info.c_str(), executeTime); + CHECK_STATUS(gcl_finish(handle)); +#endif + return SUCCESS; +} + +inline EE gcl_trans_buffer_rect(GCLHandle_t handle, + void *src, + void *dst, + U32 *host_org, + U32 *buf_org, + U32 *region, + U32 host_row_pitch, + U32 host_slice_pitch, + U32 buf_row_pitch, + U32 buf_slice_pitch, + GCLMemTransType type, + cl_bool blocking) +{ +#ifdef _DEBUG + std::string debug_info = "DATATRANS>>> "; +#endif + switch (type) { + case HOST_TO_DEVICE_BUF: { + GCLMem_t dstBuf = (GCLMem_t)dst; + CHECK_STATUS(enqueue_write_buffer_rect(handle->queue, dstBuf->mem, blocking, buf_org, + host_org, region, buf_row_pitch, buf_slice_pitch, host_row_pitch, host_slice_pitch, + src, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); +#ifdef _DEBUG + debug_info = "enqueue_write_buffer_rect runInfo: "; +#endif + break; + } + case DEVICE_BUF_TO_HOST: { + return NOT_SUPPORTED; + break; + } + default: + return NOT_SUPPORTED; + } +#ifdef _DEBUG + double executeTime = 0; + 
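// Note: event_counting_time below reads the profiling timestamps of the event + // recorded by the enqueue above, which requires a queue created with + // CL_QUEUE_PROFILING_ENABLE (the _DEBUG build sets that flag at handle creation). + 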
CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); + CHECK_STATUS(release_event(handle->eventObj)); + UNI_DEBUG_LOG("%sexecuteTime = %f us\n", debug_info.c_str(), executeTime); + CHECK_STATUS(gcl_finish(handle)); +#endif + return SUCCESS; +} + +inline EE gcl_map_memory( + GCLHandle_t handle, GCLMem_t gclMem, U32 *offset, U32 *size, cl_map_flags flags, cl_bool blocking) +{ +#ifdef _DEBUG + std::string debug_info = "DATATMAP>>> "; +#endif + if (gclMem->desc.memType == GCL_MEM_BUF) { + U8 *map_ptr; + CHECK_STATUS(enqueue_map_buffer(handle->queue, gclMem->mem, blocking, flags, *offset, *size, + handle->numWaitEvents, handle->waitEvents, handle->eventPtr, (void **)&map_ptr)); + gclMem->mapPtrArray.push_back(map_ptr); +#ifdef _DEBUG + debug_info = "enqueue_map_buffer runInfo: "; +#endif + } else { + return NOT_SUPPORTED; + } +#ifdef _DEBUG + double executeTime = 0; + CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); + CHECK_STATUS(release_event(handle->eventObj)); + UNI_DEBUG_LOG("%sexecuteTime = %f us\n", debug_info.c_str(), executeTime); + CHECK_STATUS(gcl_finish(handle)); +#endif + return SUCCESS; +} + +inline EE gcl_fill_memory_zero(GCLHandle_t handle, GCLMem_t gclMem) +{ +#ifdef _DEBUG + std::string debug_info = "FILLMEM>>> "; +#endif + if (gclMem->desc.memType == GCL_MEM_BUF) { +#ifdef _DEBUG + debug_info = "enqueue_fill_buffer runInfo: "; +#endif + U8 pat_val = 0; + CHECK_STATUS(enqueue_fill_buffer(handle->queue, gclMem->mem, &pat_val, sizeof(pat_val), 0, + gclMem->desc.byteSize, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); + } else { +#ifdef _DEBUG + debug_info = "enqueue_fill_image runInfo: "; +#endif + F32 color[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + U32 origin[3] = {0, 0, 0}; + U32 region[3]; + region[0] = gclMem->desc.stride[0]; + region[1] = gclMem->desc.stride[1]; + region[2] = gclMem->desc.stride[2]; + CHECK_STATUS(enqueue_fill_image(handle->queue, gclMem->mem, color, origin, region, + handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); + } +#ifdef _DEBUG + double executeTime = 0; + CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); + CHECK_STATUS(release_event(handle->eventObj)); + UNI_DEBUG_LOG("%sexecuteTime = %f us\n", debug_info.c_str(), executeTime); + CHECK_STATUS(gcl_finish(handle)); +#endif + return SUCCESS; +} + +inline EE gcl_get_mem_size(GCLMem_t gclMem, U32 *size) +{ + CHECK_STATUS(get_memory_size(gclMem->mem, size)); + return SUCCESS; +} + +inline EE gcl_create_sub_buffer(U32 size, U32 *offset, GCLMem_t src, Mem *subbuf) +{ + CHECK_STATUS(create_sub_buffer(src->mem, CL_MEM_READ_WRITE, *offset, size, subbuf)); + src->subMem.push_back(*subbuf); + *offset += (size + 1023) / 1024 * 1024; + return SUCCESS; +} +#ifdef __cplusplus +} +#endif +template <typename Tuple, U32 N> +struct DummpyWrapper { + static void set_kernel_arg_wrapper(Kernel kernel, const Tuple &t) + { + DummpyWrapper<Tuple, N - 1>::set_kernel_arg_wrapper(kernel, t); + auto arg = std::get<N - 1>(t); + set_kernel_arg(kernel, N - 1, sizeof(arg), (void *)&arg); + } +}; + +template <typename Tuple> +struct DummpyWrapper<Tuple, 0> { + static void set_kernel_arg_wrapper(Kernel kernel, const Tuple &t) + { + UNUSED(kernel); + UNUSED(t); + } +}; + +template <typename... Args> +inline EE gcl_set_kernelArgs(Kernel kernel, Args... args) +{ + std::tuple<Args...> t = std::make_tuple(args...); + DummpyWrapper<std::tuple<Args...>, sizeof...(Args)>::set_kernel_arg_wrapper(kernel, t); + return SUCCESS; +} + +inline std::string gclMemDesc2Str(GCLMemDesc desc) +{ + char buff[128]; + snprintf(buff, sizeof(buff), "memFormat: %d, ", desc.memFormat); + std::string descStr = buff; + descStr += "stride("; + for (U32 i = 0; i < 3; i++) { + descStr += std::to_string(desc.stride[i]); + if (i < 2) { + descStr += ","; + } + } + descStr += "), "; + descStr += "offset("; + for (U32 i = 0; i < 3; i++) { + descStr += std::to_string(desc.offset[i]); + if (i < 2) { + descStr += ","; + } + } + descStr += ")"; + return descStr; +} +#ifdef _DEBUG +template <typename T> +inline EE gcl_print_memory(GCLHandle_t handle, GCLMem_t gclMem, CI8 *gclMemName = NULL) +{ + UNUSED(handle); + UNUSED(gclMem); + UNUSED(gclMemName); + return SUCCESS; +} + +template <typename T> +inline EE gcl_print_buffer(GCLHandle_t handle, Mem buf, U32 num, CI8 *bufferName = NULL) +{ + UNUSED(handle); + UNUSED(buf); + UNUSED(num); + UNUSED(bufferName); + return SUCCESS; +} + +template <typename T> +inline EE gcl_check_buf(GCLHandle_t handle, Mem buf, U32 size, bool write2bin, CI8 *dataName = NULL) +{ + U32 num = size / sizeof(T); + U8 *hostPtr = new U8[size]; + F32 *hostPtrTran = new F32[num]; + CHECK_STATUS(enqueue_read_buffer(handle->queue, buf, CL_TRUE, 0, size, hostPtr, + handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); + T *val = (T *)hostPtr; + for (U32 i = 0; i < num; i++) { + hostPtrTran[i] = (F32)val[i]; + } + + if (write2bin) { + FILE *outfile; + if (!dataName) { + dataName = "unknown"; + } + std::string fileName = dataName; + replace(fileName.begin(), fileName.end(), '/', '_'); + replace(fileName.begin(), fileName.end(), '.', '_'); + replace(fileName.begin(), fileName.end(), ' ', '_'); + fileName += "_gpu"; + fileName += ".out"; + outfile = fopen(fileName.c_str(), "wb"); + if (outfile == NULL) { + UNI_DEBUG_LOG("warning: fopen outfile %s failed\n", fileName.c_str()); + delete[] hostPtr; + delete[] hostPtrTran; + return SUCCESS; + } + fwrite(hostPtrTran, sizeof(float), num, outfile); + fclose(outfile); + } else { + //U32 len = (num > 64) ? 64 : num;
+ U32 len = num; + std::string line = "GPU result : "; + for (U32 i = 0; i < len; i++) { + if (i % 8 == 0) { + line = line + "\n\t"; + } + line = line + std::to_string(hostPtrTran[i]) + " "; + } + UNI_DEBUG_LOG("%s\n", line.c_str()); + } + delete[] hostPtr; + delete[] hostPtrTran; + return SUCCESS; +} +template <typename T> +inline std::string gcl_check_data(GCLHandle_t handle, + GCLMemDesc memDesc, + void *ptr, + U32 len, + U32 ptrType, + bool write2bin, + CI8 *dataName = NULL) +{ + /*ptrType: + * GPU: 0 + * CPU: 1 + */ + DataFormat tdf; + DataType tdt; + U32 tn, tc, th, tw; + U32 dims; + tn = 1; + tc = 1; + th = 1; + tw = 1; + dims = memDesc.nDims; + tdt = memDesc.dt; + tdf = memDesc.df; + tw = memDesc.dims[0]; + if (dims > 1) { + th = memDesc.dims[1]; + } + if (dims > 2) { + tc = memDesc.dims[2]; + } + if (dims > 3) { + tn = memDesc.dims[3]; + } + if (dims > 4) { + CHECK_STATUS(NOT_SUPPORTED); + } + U32 num = tn * tc * th * tw; + // value-initialize so an unsupported format dumps zeros instead of garbage + F32 *hostPtrTran = new F32[num](); + if (!dataName) { + dataName = "unknown"; + } + + if (ptrType == 0) { + GCLMem_t mem = (GCLMem_t)ptr; + GCLMemDesc desc = memDesc; + GCLMemType type = desc.memType; + DataFormat df = desc.memFormat; + U8 *hostPtr = nullptr; + U32 s0 = desc.stride[0]; + U32 s1 = desc.stride[1]; + U32 off0 = desc.offset[0]; + U32 off1 = desc.offset[1]; + U32 byteSize = desc.byteSize; + hostPtr = new U8[(size_t)byteSize]; + + GCLMemTransType tranType = DEVICE_BUF_TO_HOST; + U32 size[3] = {byteSize, 1, 1}; + if (type == GCL_MEM_IMG_1D) { + tranType = DEVICE_IMG_TO_HOST; + size[0] = s0; + } + gcl_trans_memory(handle, (void *)mem, (void *)hostPtr, size, tranType, CL_TRUE); + + T *val = (T *)hostPtr; + if (df == DF_NCWHC4) { + if (tdf == DF_NCHW) { + for (U32 i = 0; i < num; i++) { + U32 iw = i % tw; + U32 ih = (i / tw) % th; + U32 ic = i / (tw * th); + hostPtrTran[i] = + (float)(val[((ic / 4) * s1 + iw + off1) * s0 * 4 + (ih + off0) * 4 + (ic & 3)]); + } + } + if (tdf == DF_MKT) { + for (U32 i = 0; i < num; i++) { + U32 ih = i % tw; + U32 ic = i / tw; + U32 in_off = ((ic / 4) * s1 + off1) * s0 * 4 + (ih + off0) * 4 + (ic & 3); + hostPtrTran[i] = (float)val[in_off]; + } + } + } else if (df == DF_NCHW || df == DF_NHWC) { + for (U32 i = 0; i < num; i++) { + U32 iw = i % tw; + U32 ih = (i / tw) % th; + U32 ic = i / (tw * th); + hostPtrTran[i] = (float)(val[(ic * s1 + ih + off1) * s0 + (iw + off0)]); + } + } else if (df == DF_NORMAL) { + for (U32 i = 0; i < num; i++) { + hostPtrTran[i] = (float)val[i]; + } + } else { + UNI_DEBUG_LOG( + "warning: write GPU memory %s to bin, format not supported: %d\n", dataName, (int)df); + } + delete[] hostPtr; + } + + if (ptrType == 1) { + T *val = (T *)ptr; + if (tdf == DF_NCHWC8) { + for (U32 i = 0; i < num; i++) { + U32 iw = i % tw; + U32 ih = (i / tw) % th; + U32 ic = i / (tw * th); + hostPtrTran[i] = (float)(val[((ic / 8) * th + ih) * tw * 8 + iw * 8 + (ic & 7)]); + } + } else if (tdf == DF_NORMAL || tdf == DF_NCHW) { + for (U32 i = 0; i < num; i++) { + hostPtrTran[i] = (float)(val[i]); + } + } else if (tdf == DF_MTK) { + for (U32 i = 0; i < num; i++) { + U32 it = i % th; + U32 ik = i / th; + U32 in_off = it * tw + ik; + hostPtrTran[i] = (float)(val[in_off]); // written as MKT to compare with the GPU layout + } + } else { + UNI_DEBUG_LOG( + "warning: write CPU memory %s to bin, format not supported: %d\n", dataName, (int)tdf); + } + } + if (write2bin) { + FILE *outfile; + std::string fileName = dataName; + replace(fileName.begin(), fileName.end(), '/', '_'); + 
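// '/' in tensor names would otherwise be read as a directory separator; + // '.' and ' ' are normalized next so the dump file name stays filesystem-safe. + 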
replace(fileName.begin(), fileName.end(), '.', '_'); + replace(fileName.begin(), fileName.end(), ' ', '_'); + if (ptrType == 0) { + fileName += "_gpu"; + } + if (ptrType == 1) { + fileName += "_cpu"; + } + fileName += ".out"; + + outfile = fopen(fileName.c_str(), "wb"); + if (outfile == NULL) { + UNI_DEBUG_LOG("warning: fopen outfile %s failed\n", fileName.c_str()); + } else { + fwrite(hostPtrTran, sizeof(float), num, outfile); + fclose(outfile); + } + } + std::string line = "GPU result nchw: "; + if (len > num) { + len = num; + } + for (U32 i = 0; i < len; i++) { + if (i % 8 == 0) { + line = line + "\n\t"; + } + line = line + std::to_string(hostPtrTran[i]) + " "; + } + delete[] hostPtrTran; + return line; +} +#endif +#endif diff --git a/common/gcl/include/gcl_kernel_binmap.h b/common/gcl/include/gcl_kernel_binmap.h new file mode 100644 index 00000000..6f95158c --- /dev/null +++ b/common/gcl/include/gcl_kernel_binmap.h @@ -0,0 +1,148 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
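+// Usage sketch (illustrative only; the "MaliG76p_binmap" class and kernel names +// below are hypothetical): a generated source file subclasses gcl_kernel_binmap, +// fills the map in its constructor with put("MaliG76p_conv_direct_s1", {bin, len}), +// and ends with REGISTER_GCLKERNELMAP(MaliG76p_binmap). At runtime the factory +// instantiates the map by name, and gcl_create_kernel_binary() looks kernels up +// under the key "<deviceName>_<kernelName>". 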
+ +#ifndef GCL_KERNELBIN_MAP +#define GCL_KERNELBIN_MAP + +#include "gcl_kernel_type.h" +#include <unordered_map> +#include <mutex> +typedef GCLKernelBin KernelBin; + +class gcl_kernel_binmap { +public: + gcl_kernel_binmap() + {} + std::unordered_map<std::string, KernelBin> &binMap() + { + return binMap_; + } + + void put(std::string kernelname, KernelBin kernelbin) + { + std::lock_guard<std::mutex> lock(mtx_); + auto it = binMap_.find(kernelname); + if (it == binMap_.end()) { + binMap_.insert({kernelname, kernelbin}); + } + } + + bool get(std::string kernelname, KernelBin **kernelbin_ptr) + { + std::lock_guard<std::mutex> lock(mtx_); + auto it = binMap_.find(kernelname); + if (it == binMap_.end()) { + return false; + } + *kernelbin_ptr = &it->second; + return true; + } + +private: + std::unordered_map<std::string, KernelBin> binMap_; + std::mutex mtx_; +}; + +class gcl_kernel_binmap_container { +public: + static gcl_kernel_binmap_container *instance() + { + static gcl_kernel_binmap_container sInst; + return &sInst; + } + void put(std::string kernel_binmap_name, std::unique_ptr<gcl_kernel_binmap> kernel_binmap) + { + std::lock_guard<std::mutex> lock(mtx_); + auto it = kernel_binmap_container_.find(kernel_binmap_name); + if (it == kernel_binmap_container_.end()) { + kernel_binmap_container_.insert( + std::make_pair(kernel_binmap_name, std::move(kernel_binmap))); + } + } + bool get(std::string kernel_binmap_name, gcl_kernel_binmap **kernel_binmap_ptr) + { + std::lock_guard<std::mutex> lock(mtx_); + auto it = kernel_binmap_container_.find(kernel_binmap_name); + if (it == kernel_binmap_container_.end()) { + return false; + } + *kernel_binmap_ptr = it->second.get(); + return true; + } + +private: + gcl_kernel_binmap_container() + {} + std::unordered_map<std::string, std::unique_ptr<gcl_kernel_binmap>> kernel_binmap_container_; + std::mutex mtx_; +}; + +class gcl_kernel_binmap_factory { +public: + static gcl_kernel_binmap_factory *instance() + { + static gcl_kernel_binmap_factory sInst; + return &sInst; + } + typedef gcl_kernel_binmap *(*PFN_GCLKERNELMAP_CREATOR)(); + void register_gcl_kernel_binmap( + const std::string &kernel_binmap_name, PFN_GCLKERNELMAP_CREATOR pfnCreator) + { + std::lock_guard<std::mutex> lock(mtx_); + auto it = creators_.find(kernel_binmap_name); + if (it == creators_.end()) { + creators_.insert({kernel_binmap_name, pfnCreator}); + } + } + bool create_gcl_kernel_binmap(const std::string &kernel_binmap_name) + { + std::lock_guard<std::mutex> lock(mtx_); + auto it = creators_.find(kernel_binmap_name); + if (it == creators_.end()) { + printf("the kernel_binmap creator %s doesn't exist in kernel_binmap factory\n", + kernel_binmap_name.c_str()); + return false; + } + PFN_GCLKERNELMAP_CREATOR pfn = it->second; + gcl_kernel_binmap_container::instance()->put( + kernel_binmap_name, std::unique_ptr<gcl_kernel_binmap>(pfn())); + return true; + } + +private: + gcl_kernel_binmap_factory() + {} + std::unordered_map<std::string, PFN_GCLKERNELMAP_CREATOR> creators_; + std::mutex mtx_; +}; + +#define REGISTER_GCLKERNELMAP_CREATOR_IMPL(kernel_binmap_name) \ + namespace { \ + static gcl_kernel_binmap *kernel_binmap_name##_gcl_kernel_binmap_pfn() \ + { \ + return new kernel_binmap_name(); \ + } \ + class kernel_binmap_name##_gcl_kernel_binmap_loader { \ + public: \ + kernel_binmap_name##_gcl_kernel_binmap_loader() \ + { \ + gcl_kernel_binmap_factory::instance()->register_gcl_kernel_binmap( \ + #kernel_binmap_name, kernel_binmap_name##_gcl_kernel_binmap_pfn); \ + } \ + }; \ + static kernel_binmap_name##_gcl_kernel_binmap_loader kernel_binmap_name##_sLoader; \ + } + +#define REGISTER_GCLKERNELMAP(kernel_binmap_name) \ + REGISTER_GCLKERNELMAP_CREATOR_IMPL(kernel_binmap_name) +#endif diff --git a/common/gcl/include/gcl_kernel_source.h 
b/common/gcl/include/gcl_kernel_source.h new file mode 100644 index 00000000..4284d9f5 --- /dev/null +++ b/common/gcl/include/gcl_kernel_source.h @@ -0,0 +1,85 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef GCL_KERNEL_SOURCE +#define GCL_KERNEL_SOURCE + +#include "gcl_kernel_type.h" +#include "error.h" + +#include <string> +#include <unordered_map> +typedef GCLKernelSource KernelSource; +typedef GCLKernelOption KernelOption; + +class gcl_kernel_source { +public: + gcl_kernel_source() + { + UNI_DEBUG_LOG("gcl_kernel_source %p constructor\n", (char *)this); + } + ~gcl_kernel_source() + { + UNI_DEBUG_LOG("gcl_kernel_source %p destructor\n", (char *)this); + } + + std::unordered_map<std::string, KernelSource> &kernelSourceMap() + { + return kernelSourceMap_; + } + std::unordered_map<std::string, KernelOption> &kernelOptionMap() + { + return kernelOptionMap_; + } + + void put_source(std::string kernelname, KernelSource kernelSource) + { + auto it = kernelSourceMap_.find(kernelname); + if (it == kernelSourceMap_.end()) { + kernelSourceMap_.insert({kernelname, kernelSource}); + } + } + + bool get_source(std::string kernelname, KernelSource **kernelSource_ptr) + { + auto it = kernelSourceMap_.find(kernelname); + if (it == kernelSourceMap_.end()) { + return false; + } + *kernelSource_ptr = &it->second; + return true; + } + + void put_option(std::string kernelname, KernelOption kernelOption) + { + auto it = kernelOptionMap_.find(kernelname); + if (it == kernelOptionMap_.end()) { + kernelOptionMap_.insert({kernelname, kernelOption}); + } + } + + bool get_option(std::string kernelname, KernelOption **kernelOption_ptr) + { + auto it = kernelOptionMap_.find(kernelname); + if (it == kernelOptionMap_.end()) { + return false; + } + *kernelOption_ptr = &it->second; + return true; + } + +private: + std::unordered_map<std::string, KernelSource> kernelSourceMap_; + std::unordered_map<std::string, KernelOption> kernelOptionMap_; +}; +#endif diff --git a/common/gcl/include/gcl_kernel_type.h b/common/gcl/include/gcl_kernel_type.h new file mode 100644 index 00000000..6979e004 --- /dev/null +++ b/common/gcl/include/gcl_kernel_type.h @@ -0,0 +1,20 @@ +#ifndef H_GCL_KERNEL_TYPE_H +#define H_GCL_KERNEL_TYPE_H + +struct GCLKernelBin { + const unsigned char *data; + const unsigned int len; +}; + +struct GCLKernelSource { + const char *data; + const unsigned int len; + bool use_kernel_def_head; +}; + +struct GCLKernelOption { + const char *option; + const char *sourceName; + bool use_common_opt; +}; +#endif diff --git a/common/gcl/include/gclmem_desc_infer.h 
b/common/gcl/include/gclmem_desc_infer.h new file mode 100644 index 00000000..4c646e37 --- /dev/null +++ b/common/gcl/include/gclmem_desc_infer.h @@ -0,0 +1,713 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _GCLMEM_DESC_INFER +#define _GCLMEM_DESC_INFER +#include +#include +#include "gcl_func.h" + +inline EE infer_gclmem_desc_nchwc3_to_nchw(U32 iw, + U32 ih, + U32 ic, + U32 pw, + U32 ph, + U32 ow, + U32 oh, + U32 oc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + bool need_pad = false) +{ + /*Intend to deprecate this API*/ + if (gclmemInputDesc == nullptr || gclmemOutputDesc == nullptr) { + return NULL_POINTER; + } + U32 s0, s1, s2; + s0 = ow; + s1 = oh; + s2 = oc; + + U32 num, byteSize; + num = s0 * s1 * s2; + byteSize = num * bytesOf(DT_F16); + gclmemOutputDesc->stride[0] = s0; + gclmemOutputDesc->stride[1] = s1; + gclmemOutputDesc->stride[2] = s2; + gclmemOutputDesc->offset[0] = 0; + gclmemOutputDesc->offset[1] = 0; + gclmemOutputDesc->offset[2] = 0; + gclmemOutputDesc->num = num; + gclmemOutputDesc->byteSize = byteSize; + + U32 pw_org, ph_org; + U32 s0_org, s1_org, s2_org; + U32 byteSize_org; + + s0_org = gclmemInputDesc->stride[0]; + s1_org = gclmemInputDesc->stride[1]; + s2_org = gclmemInputDesc->stride[2]; + pw_org = gclmemInputDesc->offset[0]; + ph_org = gclmemInputDesc->offset[1]; + byteSize_org = gclmemInputDesc->byteSize; + bool need_pad_org = gclmemInputDesc->need_pad; + if (byteSize_org != 0 && gclmemInputDesc->memFormat != DF_NCHWC3) { + return NOT_SUPPORTED; + } + + pw = (pw > pw_org) ? pw : pw_org; + ph = (ph > ph_org) ? ph : ph_org; + + s0 = iw + (pw << 1); + s1 = ih + (ph << 1); + s2 = (ic + 2) / 3; + s0 = (s0 > s0_org) ? s0 : s0_org; + s1 = (s1 > s1_org) ? s1 : s1_org; + s2 = (s2 > s2_org) ? s2 : s2_org; + + num = s0 * s1 * s2 * 3; + byteSize = num * bytesOf(DT_F16); + byteSize = (byteSize > byteSize_org) ? 
byteSize : byteSize_org; + + gclmemInputDesc->stride[0] = s0; + gclmemInputDesc->stride[1] = s1; + gclmemInputDesc->stride[2] = s2; + gclmemInputDesc->offset[0] = pw; + gclmemInputDesc->offset[1] = ph; + gclmemInputDesc->offset[2] = 0; + gclmemInputDesc->num = num; + gclmemInputDesc->byteSize = byteSize; + + gclmemInputDesc->memType = GCL_MEM_BUF; + gclmemInputDesc->memFormat = DF_NCHWC3; + gclmemInputDesc->flags = CL_MEM_READ_WRITE; + gclmemInputDesc->host_ptr = NULL; + gclmemOutputDesc->memType = GCL_MEM_BUF; + gclmemOutputDesc->memFormat = DF_NCHW; + gclmemOutputDesc->flags = CL_MEM_READ_WRITE; + gclmemOutputDesc->host_ptr = NULL; + gclmemOutputDesc->need_pad = need_pad | need_pad_org; + return SUCCESS; +} + +inline EE trans_gclmem_desc_nchw_ncwhc4( + U32 iw, U32 ih, U32 ic, U32 pw, U32 ph, DataType dt, GCLMemDesc_t gclmemDesc, bool need_pad = false) +{ + U32 s0, s1, s2; + U32 num, byteSize; + U32 pw_org, ph_org; + U32 s0_org, s1_org, s2_org; + U32 byteSize_org; + + if (gclmemDesc) { + if (gclmemDesc->memFormat != DF_NCHW) { + return NOT_SUPPORTED; + } + s0_org = gclmemDesc->stride[1]; + s1_org = gclmemDesc->stride[0]; + s2_org = gclmemDesc->stride[2]; + ph_org = gclmemDesc->offset[1]; + pw_org = gclmemDesc->offset[0]; + if (pw_org == 0 && ph_org == 0) { + if (s2_org == 1 && (s0_org == 1 || s1_org == 1)) { + s2_org = (s0_org == 1) ? s1_org : s0_org; + s0_org = 1; + s1_org = 1; + } + } + s2_org = (s2_org + 3) / 4; + byteSize_org = gclmemDesc->byteSize; + bool need_pad_org = gclmemDesc->need_pad; + if (pw == 0 && ph == 0) { + if (ic == 1 && (iw == 1 || ih == 1)) { + ic = (iw == 1) ? ih : iw; + iw = 1; + ih = 1; + } + } + ph = (ph > ph_org) ? ph : ph_org; + pw = (pw > pw_org) ? pw : pw_org; + + s0 = ih + (ph << 1); + s1 = iw + (pw << 1); + s2 = (ic + 3) / 4; + s0 = (s0 > s0_org) ? s0 : s0_org; + s1 = (s1 > s1_org) ? s1 : s1_org; + s2 = (s2 > s2_org) ? s2 : s2_org; + num = s0 * s1 * s2 * 4; + byteSize = num * bytesOf(dt); + byteSize = (byteSize > byteSize_org) ? 
byteSize : byteSize_org; + + gclmemDesc->stride[0] = s0; + gclmemDesc->stride[1] = s1; + gclmemDesc->stride[2] = s2; + gclmemDesc->offset[0] = ph; + gclmemDesc->offset[1] = pw; + gclmemDesc->offset[2] = 0; + gclmemDesc->num = num; + gclmemDesc->byteSize = byteSize; + gclmemDesc->memType = GCL_MEM_BUF; + gclmemDesc->memFormat = DF_NCWHC4; + gclmemDesc->flags = CL_MEM_READ_WRITE; + gclmemDesc->host_ptr = NULL; + gclmemDesc->need_pad = need_pad | need_pad_org; + } + return SUCCESS; +} +inline EE infer_gclmem_desc_ncwhc4(U32 iw, + U32 ih, + U32 ic, + U32 pw, + U32 ph, + U32 ow, + U32 oh, + U32 oc, + DataType idt, + DataType odt, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + bool need_pad = false) +{ + U32 s0, s1, s2; + U32 num, byteSize; + U32 pw_org, ph_org; + U32 s0_org, s1_org, s2_org; + U32 byteSize_org; + if (gclmemOutputDesc) { + s0 = oh; + s1 = ow; + s2 = (oc + 3) / 4; + num = s0 * s1 * s2 * 4; + byteSize = num * bytesOf(odt); + gclmemOutputDesc->stride[0] = s0; + gclmemOutputDesc->stride[1] = s1; + gclmemOutputDesc->stride[2] = s2; + gclmemOutputDesc->offset[0] = 0; + gclmemOutputDesc->offset[1] = 0; + gclmemOutputDesc->offset[2] = 0; + gclmemOutputDesc->num = num; + gclmemOutputDesc->byteSize = byteSize; + gclmemOutputDesc->memType = GCL_MEM_BUF; + gclmemOutputDesc->memFormat = DF_NCWHC4; + gclmemOutputDesc->flags = CL_MEM_READ_WRITE; + gclmemOutputDesc->host_ptr = NULL; + } + + if (gclmemInputDesc) { + byteSize_org = gclmemInputDesc->byteSize; + if (byteSize_org != 0 && gclmemInputDesc->memFormat != DF_NCWHC4) { + return trans_gclmem_desc_nchw_ncwhc4(iw, ih, ic, pw, ph, idt, gclmemInputDesc, need_pad); + } + s0_org = gclmemInputDesc->stride[0]; + s1_org = gclmemInputDesc->stride[1]; + s2_org = gclmemInputDesc->stride[2]; + ph_org = gclmemInputDesc->offset[0]; + pw_org = gclmemInputDesc->offset[1]; + if (pw_org == 0 && ph_org == 0) { + if (s2_org == 1 && (s0_org == 1 || s1_org == 1)) { + s2_org = (s0_org == 1) ? s1_org : s0_org; + s0_org = 1; + s1_org = 1; + } + } + bool need_pad_org = gclmemInputDesc->need_pad; + if (pw == 0 && ph == 0) { + if (ic == 1 && (iw == 1 || ih == 1)) { + ic = (iw == 1) ? ih : iw; + iw = 1; + ih = 1; + } + } + + ph = (ph > ph_org) ? ph : ph_org; + pw = (pw > pw_org) ? pw : pw_org; + + s0 = ih + (ph << 1); + s1 = iw + (pw << 1); + s2 = (ic + 3) / 4; + s0 = (s0 > s0_org) ? s0 : s0_org; + s1 = (s1 > s1_org) ? s1 : s1_org; + s2 = (s2 > s2_org) ? s2 : s2_org; + num = s0 * s1 * s2 * 4; + byteSize = num * bytesOf(idt); + byteSize = (byteSize > byteSize_org) ? 
byteSize : byteSize_org; + + gclmemInputDesc->stride[0] = s0; + gclmemInputDesc->stride[1] = s1; + gclmemInputDesc->stride[2] = s2; + gclmemInputDesc->offset[0] = ph; + gclmemInputDesc->offset[1] = pw; + gclmemInputDesc->offset[2] = 0; + gclmemInputDesc->num = num; + gclmemInputDesc->byteSize = byteSize; + gclmemInputDesc->memType = GCL_MEM_BUF; + gclmemInputDesc->memFormat = DF_NCWHC4; + gclmemInputDesc->flags = CL_MEM_READ_WRITE; + gclmemInputDesc->host_ptr = NULL; + gclmemInputDesc->need_pad = need_pad | need_pad_org; + } + return SUCCESS; +} + +inline EE infer_gclmem_desc_nhwc(U32 iw, + U32 ih, + U32 ic, + U32 pc, + U32 pw, + U32 ow, + U32 oh, + U32 oc, + DataType idt, + DataType odt, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + bool need_pad = false) +{ + U32 s0, s1, s2; + U32 num, byteSize; + U32 pc_org, pw_org; + U32 s0_org, s1_org, s2_org; + U32 byteSize_org; + + if (gclmemOutputDesc) { + s0 = oc; + s1 = ow; + s2 = oh; + num = s0 * s1 * s2; + byteSize = num * bytesOf(odt); + gclmemOutputDesc->stride[0] = s0; + gclmemOutputDesc->stride[1] = s1; + gclmemOutputDesc->stride[2] = s2; + gclmemOutputDesc->offset[0] = 0; + gclmemOutputDesc->offset[1] = 0; + gclmemOutputDesc->offset[2] = 0; + gclmemOutputDesc->num = num; + gclmemOutputDesc->byteSize = byteSize; + gclmemOutputDesc->memType = GCL_MEM_BUF; + gclmemOutputDesc->memFormat = DF_NHWC; + gclmemOutputDesc->flags = CL_MEM_READ_WRITE; + gclmemOutputDesc->host_ptr = NULL; + } + + if (gclmemInputDesc) { + s0_org = gclmemInputDesc->stride[0]; + s1_org = gclmemInputDesc->stride[1]; + s2_org = gclmemInputDesc->stride[2]; + pc_org = gclmemInputDesc->offset[0]; + pw_org = gclmemInputDesc->offset[1]; + byteSize_org = gclmemInputDesc->byteSize; + bool need_pad_org = gclmemInputDesc->need_pad; + if (byteSize_org != 0 && gclmemInputDesc->memFormat != DF_NHWC) { + return NOT_SUPPORTED; + } + + pc = (pc > pc_org) ? pc : pc_org; + pw = (pw > pw_org) ? pw : pw_org; + s0 = ic + (pc << 1); + s1 = iw + (pw << 1); + s2 = ih; + s0 = (s0 > s0_org) ? s0 : s0_org; + s1 = (s1 > s1_org) ? s1 : s1_org; + s2 = (s2 > s2_org) ? s2 : s2_org; + + num = s0 * s1 * s2; + byteSize = num * bytesOf(idt); + byteSize = (byteSize > byteSize_org) ? 
byteSize : byteSize_org; + gclmemInputDesc->stride[0] = s0; + gclmemInputDesc->stride[1] = s1; + gclmemInputDesc->stride[2] = s2; + gclmemInputDesc->offset[0] = pc; + gclmemInputDesc->offset[1] = pw; + gclmemInputDesc->offset[2] = 0; + gclmemInputDesc->num = num; + gclmemInputDesc->byteSize = byteSize; + gclmemInputDesc->memType = GCL_MEM_BUF; + gclmemInputDesc->memFormat = DF_NHWC; + gclmemInputDesc->flags = CL_MEM_READ_WRITE; + gclmemInputDesc->host_ptr = NULL; + gclmemInputDesc->need_pad = need_pad | need_pad_org; + } + return SUCCESS; +} + +inline EE infer_gclmem_desc_nchw(U32 iw, + U32 ih, + U32 ic, + U32 pw, + U32 ph, + U32 ow, + U32 oh, + U32 oc, + DataType idt, + DataType odt, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + bool need_pad = false) +{ + U32 s0, s1, s2; + U32 num, byteSize; + U32 pw_org, ph_org; + U32 s0_org, s1_org, s2_org; + U32 byteSize_org; + + if (gclmemOutputDesc) { + s0 = ow; + s1 = oh; + s2 = oc; + num = s0 * s1 * s2; + byteSize = num * bytesOf(odt); + gclmemOutputDesc->stride[0] = s0; + gclmemOutputDesc->stride[1] = s1; + gclmemOutputDesc->stride[2] = s2; + gclmemOutputDesc->offset[0] = 0; + gclmemOutputDesc->offset[1] = 0; + gclmemOutputDesc->offset[2] = 0; + gclmemOutputDesc->num = num; + gclmemOutputDesc->byteSize = byteSize; + gclmemOutputDesc->memType = GCL_MEM_BUF; + gclmemOutputDesc->memFormat = DF_NCHW; + gclmemOutputDesc->flags = CL_MEM_READ_WRITE; + gclmemOutputDesc->host_ptr = NULL; + } + + if (gclmemInputDesc) { + s0_org = gclmemInputDesc->stride[0]; + s1_org = gclmemInputDesc->stride[1]; + s2_org = gclmemInputDesc->stride[2]; + pw_org = gclmemInputDesc->offset[0]; + ph_org = gclmemInputDesc->offset[1]; + byteSize_org = gclmemInputDesc->byteSize; + bool need_pad_org = gclmemInputDesc->need_pad; + if (byteSize_org != 0 && gclmemInputDesc->memFormat != DF_NCHW) { + return NOT_SUPPORTED; + } + + pw = (pw > pw_org) ? pw : pw_org; + ph = (ph > ph_org) ? ph : ph_org; + s0 = iw + (pw << 1); + s1 = ih + (ph << 1); + s2 = ic; + s0 = (s0 > s0_org) ? s0 : s0_org; + s1 = (s1 > s1_org) ? s1 : s1_org; + s2 = (s2 > s2_org) ? s2 : s2_org; + + num = s0 * s1 * s2; + byteSize = num * bytesOf(idt); + byteSize = (byteSize > byteSize_org) ? 
byteSize : byteSize_org; + gclmemInputDesc->stride[0] = s0; + gclmemInputDesc->stride[1] = s1; + gclmemInputDesc->stride[2] = s2; + gclmemInputDesc->offset[0] = pw; + gclmemInputDesc->offset[1] = ph; + gclmemInputDesc->offset[2] = 0; + gclmemInputDesc->num = num; + gclmemInputDesc->byteSize = byteSize; + gclmemInputDesc->memType = GCL_MEM_BUF; + gclmemInputDesc->memFormat = DF_NCHW; + gclmemInputDesc->flags = CL_MEM_READ_WRITE; + gclmemInputDesc->host_ptr = NULL; + gclmemInputDesc->need_pad = need_pad | need_pad_org; + } + return SUCCESS; +} + +inline void get_nlp_mkt_val(TensorDesc desc, DataType *dt, U32 *m, U32 *k, U32 *t) +{ + if (dt) { + *dt = desc.dt; + } + if (desc.df == DF_MTK) { + if (m) { + *m = desc.dims[2]; + } + if (t) { + *t = desc.dims[1]; + } + if (k) { + *k = desc.dims[0]; + } + } else if (desc.df == DF_MKT) { + if (m) { + *m = desc.dims[2]; + } + if (k) { + *k = desc.dims[1]; + } + if (t) { + *t = desc.dims[0]; + } + } else { + CHECK_STATUS(NOT_MATCH); + } +} + +inline void map_nlp_mkt_to_ncwhc4(U32 m, U32 k, U32 t, U32 *gw, U32 *gh, U32 *gc) +{ + if (gw) { + *gw = 1; + } + if (gh) { + *gh = t; + } + if (gc) { + *gc = (k + 3) / 4 * m; + } +} + +inline void get_gclmem_dim( + GCLMemDesc desc, U32 *w_str, U32 *h_str, U32 *c_str, U32 *w_off, U32 *h_off) +{ + if (desc.memFormat == DF_NCHW) { + if (w_str) { + *w_str = desc.stride[0]; + } + if (h_str) { + *h_str = desc.stride[1]; + } + if (c_str) { + *c_str = desc.stride[2]; + } + if (w_off) { + *w_off = desc.offset[0]; + } + if (h_off) { + *h_off = desc.offset[1]; + } + } else if (desc.memFormat == DF_NCWHC4) { + if (w_str) { + *w_str = desc.stride[1]; + } + if (h_str) { + *h_str = desc.stride[0]; + } + if (c_str) { + *c_str = desc.stride[2]; + } + if (w_off) { + *w_off = desc.offset[1]; + } + if (h_off) { + *h_off = desc.offset[0]; + } + } else if (desc.memFormat == DF_NHWC) { + if (w_str) { + *w_str = desc.stride[1]; + } + if (h_str) { + *h_str = desc.stride[2]; + } + if (c_str) { + *c_str = desc.stride[0]; + } + if (w_off) { + *w_off = desc.offset[1]; + } + if (h_off) { + *h_off = desc.offset[0]; + } + } else { + CHECK_STATUS(NOT_SUPPORTED); + } +} + +inline EE fill_output_zero(GCLHandle_t handle, GCLMem_t output, TensorDesc outputDesc) +{ + GCLMemDesc outGCLDesc = output->desc; + if (!outGCLDesc.need_pad) { + return SUCCESS; + } + DataType dt; + U32 ow_str, oh_str, oc_str; + get_gclmem_dim(outGCLDesc, &ow_str, &oh_str, &oc_str, NULL, NULL); + char kernelname[128]; + U32 gs = ow_str * oh_str * oc_str; + U32 ls = 0; + U32 dim = 1; + Kernel kernel; + U32 ow, oh; + if (outGCLDesc.memFormat == DF_NCWHC4) { + if (outputDesc.df == DF_NCHW || outputDesc.df == DF_MKT || outputDesc.df == DF_MTK) { + if (outputDesc.df == DF_NCHW) { + tensorSelectGet(outputDesc, &dt, NULL, NULL, NULL, &oh, &ow); + } + if (outputDesc.df == DF_MKT || outputDesc.df == DF_MTK) { + get_nlp_mkt_val(outputDesc, &dt, NULL, NULL, &oh); + ow = 1; + } + if (ow_str != ow || oh_str != oh) { + if (dt == DT_F16) { + sprintf(kernelname, "fill_memory_zero_vec4_f16"); + } else if (dt == DT_I32 || dt == DT_U32) { + sprintf(kernelname, "fill_memory_zero_vec4_i32"); + } else { + return NOT_SUPPORTED; + } + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, gs * 4, 0, gs, output->mem)); + gcl_set_kernelVec(handle, kernel, dim, &gs, &ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs, &ls, kernelname)); +#endif + } + return SUCCESS; + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + } else 
if (outGCLDesc.memFormat == DF_NCHW || outGCLDesc.memFormat == DF_NHWC) { + if (outputDesc.df == DF_NCHW || outputDesc.df == DF_NORMAL || outputDesc.df == DF_NHWC) { + tensorSelectGet(outputDesc, &dt, NULL, NULL, NULL, &oh, &ow); + if (ow_str != ow || oh_str != oh) { + if (dt == DT_F16) { + sprintf(kernelname, "fill_memory_zero_vec4_f16"); + } else if (dt == DT_I32 || dt == DT_U32) { + sprintf(kernelname, "fill_memory_zero_vec4_i32"); + } else { + return NOT_SUPPORTED; + } + U32 len = gs; + gs = (gs + 3) / 4; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, len, 0, gs, output->mem)); + gcl_set_kernelVec(handle, kernel, dim, &gs, &ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs, &ls, kernelname)); +#endif + } + return SUCCESS; + + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + } + return NOT_SUPPORTED; +} + +inline GCLMemDesc gclmem_build_desc() +{ + GCLMemDesc desc; + for (U32 i = 0; i < 6; i++) { + desc.dims[i] = 0; + } + for (U32 i = 0; i < 3; i++) { + desc.stride[i] = 0; + desc.offset[i] = 0; + } + desc.nDims = 4; + desc.dt = DT_U8; + desc.df = DF_NCHW; + desc.memFormat = DF_NCWHC4; + desc.memType = GCL_MEM_BUF; + desc.byteSize = 0; + desc.num = 0; + desc.flags = CL_MEM_READ_WRITE; + desc.imgFormat.image_channel_order = CL_RGBA; + desc.imgFormat.image_channel_data_type = CL_HALF_FLOAT; + desc.host_ptr = NULL; + desc.need_pad = false; + return desc; +} + +inline EE gclmem_set_desc_padding(GCLMemDesc *desc, + U32 *stride, + U32 *offset, + DataType dt, + DataFormat mf, + GCLMemType mt, + MemFlags flags, + void *host_ptr = NULL) +{ + if (desc == NULL) { + return NULL_POINTER; + } + desc->stride[0] = stride[0]; + desc->stride[1] = stride[1]; + desc->stride[2] = stride[2]; + desc->offset[0] = offset[0]; + desc->offset[1] = offset[1]; + desc->offset[2] = offset[2]; + desc->memFormat = mf; + desc->memType = mt; + desc->flags = flags; + desc->host_ptr = host_ptr; + U32 num = 0; + U32 bytes = 0; + if (mf == DF_NHWC || mf == DF_NCHW || mt != GCL_MEM_BUF) { + num = stride[0] * stride[1] * stride[2]; + } else if (mf == DF_NCWHC4) { + num = stride[0] * stride[1] * stride[2] * 4; + } else { + return NOT_SUPPORTED; + } + bytes = num * bytesOf(dt); + if (mt != GCL_MEM_BUF) { + bytes = bytes * 4; + } + desc->num = num; + desc->byteSize = bytes; + return SUCCESS; +} + +inline EE gclmem_get_desc_non_padding( + GCLMemDesc desc, DataType *dt, DataFormat *df, U32 *num, U32 *numChannels, U32 *height, U32 *width) +{ + U32 ndims = desc.nDims; + if (dt) { + *dt = desc.dt; + } + if (df) { + *df = desc.df; + } + if (desc.df == DF_MKT) { + if (num) { + *num = desc.dims[2]; + } + if (numChannels) { + *numChannels = desc.dims[1]; + } + if (height) { + *height = desc.dims[0]; + } + if (width) { + *width = 1; + } + } else if (desc.df == DF_MTK) { + if (num) { + *num = desc.dims[2]; + } + if (numChannels) { + *numChannels = desc.dims[0]; + } + if (height) { + *height = desc.dims[1]; + } + if (width) { + *width = 1; + } + } else { + if (width) { + *width = desc.dims[0]; + } + if (height) { + *height = (ndims > 1) ? desc.dims[1] : 1; + } + if (numChannels) { + *numChannels = (ndims > 2) ? desc.dims[2] : 1; + } + if (num) { + *num = (ndims > 3) ? desc.dims[3] : 1; + } + } + return SUCCESS; +} + +#endif diff --git a/common/gcl/include/kernel.h b/common/gcl/include/kernel.h new file mode 100644 index 00000000..bcc5aa63 --- /dev/null +++ b/common/gcl/include/kernel.h @@ -0,0 +1,167 @@ +// Copyright (C) 2019. 
Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef KERNEL_H_ +#define KERNEL_H_ +#include "types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief get information of kernel + * @warning please free the memory associated with value + **/ +inline EE get_kernel_info(Kernel kernel, cl_kernel_info info, void **value, size_t *size) +{ + if (NULL == value) { + return NULL_POINTER; + } + + size_t len; + cl_int ret = clGetKernelInfo(kernel, info, 0, NULL, &len); + if (CL_SUCCESS == ret) { + if (NULL != size) { + *size = len; + } + void *data = malloc(len); + if (NULL == data) { + return ALLOC_FAILED; + } + ret = clGetKernelInfo(kernel, info, len, data, NULL); + if (CL_SUCCESS == ret) { + *value = data; + } else { + free(data); + } + } + + map_cl_error_2_ee(ret); +} + +/** + * @brief get workgroup information of kernel + * @warning please free the memory associated with value + **/ +inline EE get_kernel_workgroup_info( + Kernel kernel, Device device, cl_kernel_work_group_info info, void **value, size_t *size) +{ + if (NULL == value) { + return NULL_POINTER; + } + + size_t len; + cl_int ret = clGetKernelWorkGroupInfo(kernel, device, info, 0, NULL, &len); + if (CL_SUCCESS == ret) { + if (NULL != size) { + *size = len; + } + void *data = malloc(len); + if (NULL == data) { + return ALLOC_FAILED; + } + // query again to actually fill the allocated buffer, mirroring get_kernel_info + ret = clGetKernelWorkGroupInfo(kernel, device, info, len, data, NULL); + if (CL_SUCCESS == ret) { + *value = data; + } else { + free(data); + } + } + + map_cl_error_2_ee(ret); +} + +inline EE create_kernels_in_program(Program program, U32 num_kernel, Kernel *kernels) +{ + if (kernels == nullptr) { + return NULL_POINTER; + } + I32 ret = clCreateKernelsInProgram(program, num_kernel, kernels, NULL); + map_cl_error_2_ee(ret); +} + +inline EE create_kernel(Program program, CI8 *name, Kernel *kernel) +{ + if (kernel == nullptr) { + return NULL_POINTER; + } + I32 ret; + *kernel = clCreateKernel(program, name, &ret); + map_cl_error_2_ee(ret); +} + +inline EE retain_kernel(Kernel kernel) +{ + cl_int ret = clRetainKernel(kernel); + map_cl_error_2_ee(ret); +} + +inline EE release_kernel(Kernel kernel) +{ + cl_int ret = clReleaseKernel(kernel); + map_cl_error_2_ee(ret); +} + +inline EE set_kernel_arg(Kernel kernel, U32 arg_index, U32 arg_size, const void *arg_value) +{ + cl_int ret = clSetKernelArg(kernel, arg_index, arg_size, arg_value); + map_cl_error_2_ee(ret); +} +/* + inline EE clone_kernel(Kernel src_kernel, Kernel* dst_kernel) { + // TODO + I32 ret; + dst_kernel = clCloneKernel(src_kernel, &ret); + map_cl_error_2_ee(ret); + } + */ +inline EE enqueue_ndrange_kernel(CommandQueue queue, + Kernel kernel, + U32 work_dim, + CU32 
*global_work_offset, + CU32 *global_work_size, + CU32 *local_work_size, + U32 num_events_in_wait_list, + const Event *event_in_wait_list, + Event *event) +{ + I32 ret; + UNUSED(global_work_offset); + UNUSED(local_work_size); + switch (work_dim) { + case 1: { + size_t gs = global_work_size[0]; + size_t ls = local_work_size[0]; + size_t *ls_ptr = (ls == 0) ? NULL : &ls; + ret = clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, &gs, ls_ptr, + num_events_in_wait_list, event_in_wait_list, event); + break; + } + case 2: { + size_t gs[2] = {global_work_size[0], global_work_size[1]}; + size_t ls[2] = {local_work_size[0], local_work_size[1]}; + size_t *ls_ptr = (ls[0] == 0 || ls[1] == 0) ? NULL : ls; + ret = clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, gs, ls_ptr, + num_events_in_wait_list, event_in_wait_list, event); + break; + } + case 3: { + size_t gs[3] = {global_work_size[0], global_work_size[1], global_work_size[2]}; + size_t ls[3] = {local_work_size[0], local_work_size[1], local_work_size[2]}; + size_t *ls_ptr = (ls[0] == 0 || ls[1] == 0 || ls[2] == 0) ? NULL : ls; + ret = clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, gs, ls_ptr, + num_events_in_wait_list, event_in_wait_list, event); + break; + } + default: + return NOT_SUPPORTED; + } + map_cl_error_2_ee(ret); +} + +#ifdef __cplusplus +} +#endif +#endif diff --git a/common/gcl/include/memory.h b/common/gcl/include/memory.h new file mode 100644 index 00000000..7f9b5e02 --- /dev/null +++ b/common/gcl/include/memory.h @@ -0,0 +1,661 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
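Note on the enqueue_ndrange_kernel wrapper defined in kernel.h above: a zero entry in local_work_size is translated into a NULL local-size pointer, which asks the OpenCL driver to pick the work-group size itself. A minimal calling sketch, assuming `queue` and `kernel` are valid objects obtained through the surrounding wrappers (both names are placeholders):

    // Dispatch a 2-D kernel; ls = {0, 0} lets the driver choose the
    // work-group size because the wrapper passes NULL to OpenCL.
    CU32 gs[2] = {1024, 768};  // global work size per dimension
    CU32 ls[2] = {0, 0};       // 0 => driver-selected local size
    CHECK_STATUS(enqueue_ndrange_kernel(queue, kernel, 2, NULL, gs, ls, 0, NULL, NULL));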
+ +#ifndef _H_BUFFER +#define _H_BUFFER + +#include "event.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief get memory information + * + **/ +inline EE get_mememory_info(Mem mem, cl_mem_info info, void **value, U32 *len) +{ + if (NULL == value) { + return NULL_POINTER; + } + + size_t size; + I32 ret = clGetMemObjectInfo(mem, info, 0, NULL, &size); + if (CL_SUCCESS == ret) { + if (NULL != len) { + *len = size; + } + void *data = malloc(size); + if (NULL == data) { + return NULL_POINTER; + } + ret = clGetMemObjectInfo(mem, info, size, data, NULL); + if (CL_SUCCESS == ret) { + *value = data; + } + } + + map_cl_error_2_ee(ret); +} + +#if defined(CL_VERSION_1_2) + +inline EE create_image1D(Context context, + cl_mem_flags flags, + const cl_image_format *format, + U32 len, + U32 pitch, + void *host_ptr, + Mem *image) +{ + cl_image_desc image_desc; + image_desc.image_type = CL_MEM_OBJECT_IMAGE1D; + image_desc.image_width = len; + image_desc.image_height = 1; + image_desc.image_depth = 1; + image_desc.image_array_size = 1; + image_desc.image_row_pitch = pitch; + image_desc.image_slice_pitch = 0; + image_desc.num_mip_levels = 0; + image_desc.num_samples = 0; + image_desc.buffer = NULL; + + I32 ret; + Mem temp = clCreateImage(context, flags, format, &image_desc, host_ptr, &ret); + *image = temp; + map_cl_error_2_ee(ret); +} + +/** + * @brief create 1d image buffer + * + **/ +inline EE create_image1D_buffer(Context context, + cl_mem_flags flags, + const cl_image_format *format, + U32 len, + const cl_mem buffer, + Mem *image) +{ + cl_image_desc image_desc; + image_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + image_desc.image_width = len; + image_desc.image_height = 1; + image_desc.image_depth = 1; + image_desc.image_array_size = 1; + image_desc.image_row_pitch = len; + image_desc.image_slice_pitch = len; + image_desc.num_mip_levels = 0; + image_desc.num_samples = 0; + image_desc.buffer = buffer; + + I32 ret; + Mem temp = clCreateImage(context, flags, format, &image_desc, NULL, &ret); + if (CL_SUCCESS == ret) { + *image = temp; + } + map_cl_error_2_ee(ret); +} +#endif + +/** + * @brief create 2d image object + * + **/ +inline EE create_image2D(Context cont, + cl_mem_flags flags, + cl_image_format *format, + U32 width, + U32 height, + U32 pitch, + void *host_ptr, + Mem *mem) +{ + I32 ret; +#if defined(CL_VERSION_1_2) + cl_image_desc image_desc; + image_desc.image_type = CL_MEM_OBJECT_IMAGE2D; + image_desc.image_width = width; + image_desc.image_height = height; + image_desc.image_depth = 1; + image_desc.image_array_size = 1; + image_desc.image_row_pitch = pitch; + image_desc.image_slice_pitch = 0; + image_desc.num_mip_levels = 0; + image_desc.num_samples = 0; + image_desc.buffer = NULL; + + Mem temp = clCreateImage(cont, flags, format, &image_desc, host_ptr, &ret); +#else + Mem temp = clCreateImage2D(cont, flags, format, width, height, pitch, host_ptr, &ret); +#endif + if (CL_SUCCESS == ret) { + *mem = temp; + } + + map_cl_error_2_ee(ret); +} + +#if defined(CL_VERSION_1_2) +/** + * @brief create 2d image buffer object + * + **/ +inline EE create_image2D_array(Context cont, + cl_mem_flags flags, + cl_image_format *format, + U32 width, + U32 height, + U32 pitch, + U32 arraySize, + void *host_ptr, + Mem *mem) +{ + I32 ret; + cl_image_desc image_desc; + image_desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY; + image_desc.image_width = width; + image_desc.image_height = height; + image_desc.image_depth = 1; + image_desc.image_array_size = arraySize; + image_desc.image_row_pitch = pitch; + 
image_desc.image_slice_pitch = 0; + image_desc.num_mip_levels = 0; + image_desc.num_samples = 0; + image_desc.buffer = NULL; + + *mem = clCreateImage(cont, flags, format, &image_desc, host_ptr, &ret); + map_cl_error_2_ee(ret); +} +#endif + +/** + * @brief create 3d image object + * + **/ +inline EE create_image3D(Context cont, + cl_mem_flags flags, + cl_image_format *format, + U32 width, + U32 height, + U32 depth, + U32 rowPitch, + U32 slicePitch, + void *host_ptr, + Mem *mem) +{ + I32 ret; +#if defined(CL_VERSION_1_2) + cl_image_desc image_desc; + image_desc.image_type = CL_MEM_OBJECT_IMAGE3D; + image_desc.image_width = width; + image_desc.image_height = height; + image_desc.image_depth = depth; + image_desc.image_array_size = 1; + image_desc.image_row_pitch = rowPitch; + image_desc.image_slice_pitch = slicePitch; + image_desc.num_mip_levels = 0; + image_desc.num_samples = 0; + image_desc.buffer = NULL; + + Mem temp = clCreateImage(cont, flags, format, &image_desc, host_ptr, &ret); +#else + Mem temp = clCreateImage3D( + cont, flags, format, width, height, depth, rowPitch, slicePitch, host_ptr, &ret); +#endif + if (CL_SUCCESS == ret) { + *mem = temp; + } + + map_cl_error_2_ee(ret); +} + +/** + * @brief get image information + * + **/ +inline EE get_image_info(Mem mem, cl_mem_info info, void **value, U32 *len) +{ + size_t size; + I32 ret = clGetImageInfo(mem, info, 0, NULL, &size); + if (CL_SUCCESS == ret) { + if (NULL != len) { + *len = size; + } + + void *data = malloc(size); + if (NULL == data) { + return NULL_POINTER; + } + ret = clGetImageInfo(mem, info, size, data, NULL); + if (CL_SUCCESS == ret) { + *value = data; + } + } + + map_cl_error_2_ee(ret); +} + +/** + * @brief get supported image format + * + * @warning please free memory associated with format + **/ +inline EE get_supported_image_formats( + Context cont, cl_mem_flags flags, cl_mem_object_type type, cl_image_format **format, U32 *num) +{ + if (NULL == format) { + return NULL_POINTER; + } + + U32 len; + I32 ret = clGetSupportedImageFormats(cont, flags, type, 0, NULL, &len); + if (CL_SUCCESS == ret) { + if (NULL != num) { + *num = len; + } + cl_image_format *data = (cl_image_format *)malloc(len); + if (NULL == data) { + return NULL_POINTER; + } + ret = clGetSupportedImageFormats(cont, flags, type, len, data, 0); + if (CL_SUCCESS == ret) { + *format = data; + } + } + + map_cl_error_2_ee(ret); +} + +inline EE retain_memory(Mem mem) +{ + I32 ret = clRetainMemObject(mem); + map_cl_error_2_ee(ret); +} + +inline EE release_memory(Mem mem) +{ + I32 ret = clReleaseMemObject(mem); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_unmap_memory(CommandQueue queue, + Mem mem, + void *mapped_ptr, + I32 num_wait_events, + const Event *wait_events, + Event *event) +{ + I32 ret = clEnqueueUnmapMemObject(queue, mem, mapped_ptr, num_wait_events, wait_events, event); + + map_cl_error_2_ee(ret); +} + +inline EE create_buffer(Context context, cl_mem_flags flags, U32 size, void *host_ptr, Mem *buffe) +{ + I32 ret; + size_t len = size; + *buffe = clCreateBuffer(context, flags, len, host_ptr, &ret); + map_cl_error_2_ee(ret); +} + +inline EE create_sub_buffer(Mem buffer, cl_mem_flags flags, U32 offset, U32 size, Mem *sub) +{ + I32 ret; + cl_buffer_region region = {offset, size}; + *sub = clCreateSubBuffer(buffer, flags, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &ret); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_read_buffer(CommandQueue queue, + Mem buffer, + cl_bool blocking, + U32 offset, + U32 size, + void *ptr, + U32 num_wait_events, + const Event 
*wait_events, + Event *event) +{ + I32 ret = clEnqueueReadBuffer( + queue, buffer, blocking, offset, size, ptr, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} + +/* + inline EE enqueue_read_buffer_rect(CommandQueue queue, Mem buffer, cl_bool blocking, + const U32 *buffer_origin, const U32 *host_origin, const U32 *region, + U32 buffer_row_pitch, U32 buffer_slice_pitch, U32 host_row_pitch, + U32 host_slice_pitch, void *ptr, U32 num_wait_events, + const Event *wait_events, Event *event) { + + I32 ret = clEnqueueReadBufferRect(queue, buffer, blocking, + buffer_origin, host_origin, region, + buffer_row_pitch, buffer_slice_pitch, host_row_pitch, + host_slice_pitch, ptr, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); + } + */ +inline EE enqueue_write_buffer(CommandQueue queue, + Mem buffer, + cl_bool blocking, + U32 offset, + U32 size, + const void *ptr, + U32 num_wait_events, + const Event *wait_events, + Event *event) +{ + I32 ret = clEnqueueWriteBuffer( + queue, buffer, blocking, offset, size, ptr, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_fill_buffer(CommandQueue queue, + Mem buffer, + const void *pattern, + U32 pattern_size, + U32 offset, + U32 size, + U32 num_wait_events, + const Event *wait_events, + Event *event) +{ + size_t pat_size = pattern_size; + size_t off = offset; + size_t si = size; + I32 ret = clEnqueueFillBuffer( + queue, buffer, pattern, pat_size, off, si, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_write_buffer_rect(CommandQueue queue, + Mem buffer, + cl_bool blocking_write, + const U32 *buffer_origin, + const U32 *host_origin, + const U32 *region, + U32 buffer_row_pitch, + U32 buffer_slice_pitch, + U32 host_row_pitch, + U32 host_slice_pitch, + const void *ptr, + U32 num_wait_events, + const Event *wait_events, + Event *event) +{ + size_t b_ori[3]; + size_t h_ori[3]; + size_t reg[3]; + size_t b_rp = buffer_row_pitch; + size_t b_sp = buffer_slice_pitch; + size_t h_rp = host_row_pitch; + size_t h_sp = host_slice_pitch; + for (U32 i = 0; i < 3; i++) { + b_ori[i] = buffer_origin[i]; + h_ori[i] = host_origin[i]; + reg[i] = region[i]; + } + I32 ret = clEnqueueWriteBufferRect(queue, buffer, blocking_write, b_ori, h_ori, reg, b_rp, b_sp, + h_rp, h_sp, ptr, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_copy_buffer(CommandQueue queue, + Mem src_buffer, + Mem dst_buffer, + U32 src_offset, + U32 dst_offset, + U32 size, + U32 num_wait_events, + const Event *wait_events, + Event *event) +{ + I32 ret = clEnqueueCopyBuffer(queue, src_buffer, dst_buffer, src_offset, dst_offset, size, + num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} + +/* + EE enqueue_copy_buffer_rect(CommandQueue queue, Mem src_buffer, Mem dst_buffer, + const U32 *src_origin, const U32 *dst_origin, const U32 *region, + U32 src_row_pitch, U32 src_slice_pitch, U32 dst_row_pitch, + U32 dst_slice_pitch, U32 num_wait_events, + const Event *wait_events, Event *event) { + I32 ret = clEnqueueCopyBufferRect(queue, src_buffer, dst_buffer, + const size_t *src_origin, const size_t *dst_origin, const size_t *region, + src_row_pitch, src_slice_pitch, dst_row_pitch, + dst_slice_pitch, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); + } + */ + +inline EE enqueue_map_buffer(CommandQueue queue, + Mem buffer, + cl_bool blocking_map, + cl_map_flags map_flags, + U32 offset, + U32 size, + U32 num_wait_events, + const Event *wait_events, + Event 
*event, + void **ptr) +{ + I32 ret; + *ptr = clEnqueueMapBuffer(queue, buffer, blocking_map, map_flags, offset, size, num_wait_events, + wait_events, event, &ret); + map_cl_error_2_ee(ret); +} + +inline EE create_image(Context context, + cl_mem_flags flags, + const cl_image_format *image_format, + const cl_image_desc *image_desc, + void *host_ptr, + Mem *mem) +{ + I32 ret; + *mem = clCreateImage(context, flags, image_format, image_desc, host_ptr, &ret); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_read_image(CommandQueue queue, + Mem image, + cl_bool blocking_read, + const U32 *origin, + const U32 *region, + U32 row_pitch, + U32 slice_pitch, + void *ptr, + U32 num_wait_events, + const Event *wait_events, + Event *event) +{ + size_t org[3]; + size_t reg[3]; + for (U32 i = 0; i < 3; ++i) { + org[i] = (size_t)origin[i]; + reg[i] = (size_t)region[i]; + } + I32 ret = clEnqueueReadImage(queue, image, blocking_read, org, reg, row_pitch, slice_pitch, ptr, + num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_write_image(CommandQueue queue, + Mem image, + cl_bool blocking_write, + const U32 *origin, + const U32 *region, + U32 input_row_pitch, + U32 input_slice_pitch, + const void *ptr, + U32 num_wait_events, + const Event *wait_events, + Event *event) +{ + size_t org[3]; + size_t reg[3]; + for (U32 i = 0; i < 3; ++i) { + org[i] = (size_t)origin[i]; + reg[i] = (size_t)region[i]; + } + I32 ret = clEnqueueWriteImage(queue, image, blocking_write, org, reg, input_row_pitch, + input_slice_pitch, ptr, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_fill_image(CommandQueue queue, + Mem image, + const void *fill_color, + const U32 *origin, + const U32 *region, + U32 num_wait_events, + const Event *wait_events, + Event *event) +{ + size_t org[3]; + size_t reg[3]; + for (U32 i = 0; i < 3; ++i) { + org[i] = (size_t)origin[i]; + reg[i] = (size_t)region[i]; + } + I32 ret = + clEnqueueFillImage(queue, image, fill_color, org, reg, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_copy_image_to_buffer(CommandQueue queue, + Mem src_image, + Mem dst_buffer, + const U32 *src_origin, + const U32 *region, + U32 dst_offset, + U32 num_wait_events, + const cl_event *wait_events, + cl_event *event) +{ + size_t org[3]; + size_t reg[3]; + for (U32 i = 0; i < 3; ++i) { + org[i] = (size_t)src_origin[i]; + reg[i] = (size_t)region[i]; + } + I32 ret = clEnqueueCopyImageToBuffer( + queue, src_image, dst_buffer, org, reg, dst_offset, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_copy_buffer_to_image(CommandQueue queue, + Mem src_buffer, + Mem dst_image, + U32 src_offset, + const U32 *dst_origin, + const U32 *region, + U32 num_wait_events, + const cl_event *wait_events, + cl_event *event) +{ + size_t org[3]; + size_t reg[3]; + for (U32 i = 0; i < 3; ++i) { + org[i] = (size_t)dst_origin[i]; + reg[i] = (size_t)region[i]; + } + I32 ret = clEnqueueCopyBufferToImage( + queue, src_buffer, dst_image, src_offset, org, reg, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} +/* + + EE enqueue_copy_image(CommandQueue queue, Mem src_image, Mem dst_image, + const U32 *src_origin, const U32 *dst_origin, const U32 *region, + U32 num_wait_events, const cl_event *wait_events, cl_event *event) { + I32 ret = clEnqueueCopyImage(queue, src_image, dst_image, + const size_t *src_origin, const size_t *dst_origin, const size_t *region, + num_wait_events, wait_events, event); + 
map_cl_error_2_ee(ret); + } + + + + EE enqueue_map_image(CommandQueue queue, Mem image, cl_bool blocking_map, + cl_map_flags map_flags, const U32 *origin, const U32 *region, + U32 *image_row_pitch, U32 *image_slice_pitch, U32 num_wait_events, + const cl_event *wait_events, cl_event *event, void* *ptr) { + I32 ret; + * ptr = clEnqueueMapImage(queue, image, blocking_map, + map_flags, const size_t *origin, const size_t *region, + size_t *image_row_pitch, size_t *image_slice_pitch, + num_wait_events, wait_events, event, &ret); + map_cl_error_2_ee(ret); + } + */ + +inline EE create_sampler(Context context, const cl_sampler_properties *properties, Sampler *s) +{ + I32 ret; + *s = clCreateSamplerWithProperties(context, properties, &ret); + map_cl_error_2_ee(ret); +} + +inline EE retain_sampler(Sampler s) +{ + I32 ret = clRetainSampler(s); + map_cl_error_2_ee(ret); +} + +inline EE release_sampler(Sampler s) +{ + I32 ret = clReleaseSampler(s); + map_cl_error_2_ee(ret); +} + +inline EE get_sampler_info(Sampler s, cl_sampler_info info, void **value, size_t *len) +{ + if (NULL == value) { + return NULL_POINTER; + } + + size_t size; + I32 ret = clGetSamplerInfo(s, info, 0, NULL, &size); + if (CL_SUCCESS == ret) { + if (NULL != len) { + *len = size; + } + void *data = malloc(size); + if (NULL == data) { + return NULL_POINTER; + } + ret = clGetSamplerInfo(s, info, size, data, NULL); + if (CL_SUCCESS == ret) { + *value = data; + } + } + + map_cl_error_2_ee(ret); +} + +inline EE get_memory_size(Mem memory, U32 *size) +{ + size_t len; + int ret = clGetMemObjectInfo(memory, CL_MEM_SIZE, sizeof(len), &len, NULL); + *size = len; + map_cl_error_2_ee(ret); +} +#ifdef __cplusplus +} +#endif + +#endif diff --git a/common/gcl/include/ocl_context.h b/common/gcl/include/ocl_context.h new file mode 100644 index 00000000..8cdd38a8 --- /dev/null +++ b/common/gcl/include/ocl_context.h @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
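The ocl_context.h header that follows wraps the whole GPU state in a lazily constructed singleton, so call sites never create a platform, context, or queue by hand. A usage sketch under that assumption (`buf` is a hypothetical cl_mem created elsewhere; GCLHandle_t is the handle pointer type used throughout gcl):

    // Fetch the process-wide context and query a buffer's size with the
    // memory.h helper defined above.
    GCLHandle_t h = OCLContext::getInstance().handle.get();
    U32 bytes = 0;
    CHECK_STATUS(get_memory_size(buf, &bytes));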
+
+#ifndef H_OCL_CONTEXT
+#define H_OCL_CONTEXT
+
+#include "gcl_common.h"
+
+class OCLContext {
+public:
+    static OCLContext &getInstance();
+
+protected:
+    OCLContext();
+    ~OCLContext();
+
+private:
+    void setDeviceName();
+    void registerBinaryKernelMap();
+    void registerSourceKernelMap();
+    void registerSourceKernelsExt();
+
+public:
+    std::shared_ptr<GCLHandle> handle;
+};
+#endif
diff --git a/common/gcl/include/ocl_data_alloc.h b/common/gcl/include/ocl_data_alloc.h
new file mode 100644
index 00000000..6052defb
--- /dev/null
+++ b/common/gcl/include/ocl_data_alloc.h
@@ -0,0 +1,35 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#ifndef _OCL_DATA_ALLOC
+#define _OCL_DATA_ALLOC
+
+#include "gcl_common.h"
+#include "gcl_func.h"
+#include "ocl_context.h"
+
+inline GCLMem_t ocl_alloc_gclmem(GCLMemDesc desc)
+{
+    GCLMem_t gclmem = gcl_create_gclmem();
+    gclmem->desc = desc;
+    CHECK_STATUS(gcl_create_memory(OCLContext::getInstance().handle.get(), gclmem));
+    return gclmem;
+}
+
+inline void ocl_release_gclmem(GCLMem_t mem)
+{
+    CHECK_STATUS(gcl_unmap_memory(OCLContext::getInstance().handle.get(), mem));
+    CHECK_STATUS(gcl_release_subMem(mem));
+    CHECK_STATUS(gcl_release_memory(mem));
+    delete mem;
+}
+#endif
diff --git a/common/gcl/include/ocl_data_trans.h b/common/gcl/include/ocl_data_trans.h
new file mode 100644
index 00000000..b6dd08b2
--- /dev/null
+++ b/common/gcl/include/ocl_data_trans.h
@@ -0,0 +1,33 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +#ifndef _OCL_DATA_TRANS +#define _OCL_DATA_TRANS + +#include "types.h" +#include "tensor_desc.h" +#include "gcl_common.h" + +EE ocl_set_input(GCLHandle_t handle, + GCLMem_t input, + TensorDesc hostDesc, + const U8 *hostPtr, + GCLMem_t tmpBuf, + bool blocking); + +EE ocl_get_output(GCLHandle_t handle, const GCLMem_t input, TensorDesc hostDesc, bool blocking); + +EE ocl_trans_mem( + GCLHandle_t handle, GCLMem_t src, GCLMemDesc srcDesc, GCLMem_t dst, GCLMemDesc dstDesc); + +EE ocl_map_mem(GCLHandle_t handle, GCLMem_t gclMem, GCLMemDesc desc); +#endif diff --git a/common/gcl/include/ocl_desc_trans.h b/common/gcl/include/ocl_desc_trans.h new file mode 100644 index 00000000..2ff33ee2 --- /dev/null +++ b/common/gcl/include/ocl_desc_trans.h @@ -0,0 +1,31 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +#ifndef _OCL_DESC_TRANS +#define _OCL_DESC_TRANS + +#include "tensor.hpp" +#include "memory_ocl.hpp" +#include "gcl_common.h" + +inline void ocl_set_desc(Tensor *tensor, GCLMemDesc desc) +{ + OclMemory *mem = (OclMemory *)tensor->get_memory(); + mem->padding(desc); +}; + +inline GCLMemDesc ocl_get_desc(Tensor tensor) +{ + OclMemory *mem = (OclMemory *)tensor.get_memory(); + return mem->get_desc(); +} +#endif diff --git a/common/gcl/include/platform.h b/common/gcl/include/platform.h new file mode 100644 index 00000000..1d33cd35 --- /dev/null +++ b/common/gcl/include/platform.h @@ -0,0 +1,500 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_PLATFORM
+#define _H_PLATFORM
+
+#include <string.h>
+#include <string>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef enum {
+    VENDOR_ARM = 0,
+} PlatformVendor;
+
+inline EE get_platforms(U32 *numPlatforms, Platform **platforms)
+{
+    if (NULL == platforms || NULL == numPlatforms) {
+        return NULL_POINTER;
+    }
+    U32 num;
+    I32 ret = clGetPlatformIDs(0, NULL, &num);
+    if (CL_SUCCESS == ret) {
+        *numPlatforms = num;
+        Platform *p = (Platform *)malloc(num * sizeof(Platform));
+        if (NULL == p) {
+            return ALLOC_FAILED;
+        }
+
+        ret = clGetPlatformIDs(num, p, NULL);
+        if (CL_SUCCESS != ret) {
+            free(p);
+        } else {
+            *platforms = p;
+        }
+    }
+
+    map_cl_error_2_ee(ret);
+}
+
+static cl_bool stringContains(char *big, const char *s)
+{
+    for (unsigned int i = 0; i < strlen(big); i++) {
+        big[i] = tolower(big[i]);
+    }
+    std::string str(big);
+    return std::string::npos != str.find(s);
+}
+
+/**
+ * @brief get information from platform
+ *
+ * @param value value associated with info, memory is allocated by this
+ *        function
+ * @param len the length of value, returned by this function
+ *
+ **/
+
+inline EE get_platform_info(Platform platform, cl_platform_info info, void **value, U32 *len)
+{
+    if (NULL == len || NULL == value) {
+        return NULL_POINTER;
+    }
+    size_t sizeRet;
+    I32 ret = clGetPlatformInfo(platform, info, 0, NULL, &sizeRet);
+    if (CL_SUCCESS == ret) {
+        if (len) {
+            *len = (U32)sizeRet;
+        }
+        void *data = malloc(sizeRet + 1);
+        if (NULL == data) {
+            return ALLOC_FAILED;
+        }
+
+        ret = clGetPlatformInfo(platform, info, sizeRet + 1, data, NULL);
+        if (CL_SUCCESS != ret) {
+            free(data);
+        } else {
+            *value = data;
+        }
+    }
+
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief select platform by vendor
+ *
+ * @param vendor the vendor of the platform we want
+ * @param platform output, the selected platform
+ *
+ **/
+inline EE select_platform(PlatformVendor vendor, Platform *platform)
+{
+    if (NULL == platform) {
+        return NULL_POINTER;
+    }
+
+    const static char *key[] = {"arm", "qualcomm"};
+    U32 num_platforms;
+    Platform *platforms = NULL;
+    EE ret = get_platforms(&num_platforms, &platforms);
+    if (SUCCESS == ret) {
+        const char *platform_vendor = key[vendor];
+        for (U32 i = 0; i < num_platforms; i++) {
+            Platform p = platforms[i];
+            U32 nameLen;
+            char *name;
+            ret = get_platform_info(p, CL_PLATFORM_NAME, (void **)&name, &nameLen);
+            if (SUCCESS == ret) {
+                if (stringContains(name, platform_vendor)) {
+                    *platform = p;
+                }
+                free(name);
+            }
+        }
+    }
+    free(platforms);
+
+    map_cl_error_2_ee(ret);
+}
+
+#define CHAR_PLATFORM_INFO(info, str)                          \
+    {                                                          \
+        EE ret = get_platform_info(p, info, &value, &len);     \
+        if (SUCCESS == ret) {                                  \
+            char *tmp = (char *)value;                         \
+            tmp[len] = '\0';                                   \
+            printf(str ": %s\n", tmp);                         \
+            free(value);                                       \
+        } else {                                               \
+            map_cl_error_2_ee(ret);                            \
+        }                                                      \
+    }
+
+/**
+ * @brief list information about platform
+ *
+ */
+inline EE list_platform_info(Platform p)
+{
+    void *value;
+    U32 len;
+
CHAR_PLATFORM_INFO(CL_PLATFORM_PROFILE, "\t Profile"); + CHAR_PLATFORM_INFO(CL_PLATFORM_VERSION, "\t Version "); + CHAR_PLATFORM_INFO(CL_PLATFORM_NAME, "\t Name "); + CHAR_PLATFORM_INFO(CL_PLATFORM_VENDOR, "\t Vendor "); + CHAR_PLATFORM_INFO(CL_PLATFORM_EXTENSIONS, "\t Extensions "); + + return SUCCESS; +} + +/** + * @brief get devices in platform, and allocate space for storing devices + * @warning please free space of devices allocated in this function + * + * @param p input, specify platform, device will be retrived from this platform + * @param type input, specify device type + * @param num_devices output, return device number with type in platform p + * @param devices output, return devices + * + * @return + * 0 means sucess + * -1 means fail + * + */ +inline EE platform_get_devices( + Platform platform, cl_device_type type, U32 *num_devices, Device **devices) +{ + if (NULL == devices || NULL == num_devices) { + return NULL_POINTER; + } + + U32 num; + I32 ret = clGetDeviceIDs(platform, type, 0, NULL, &num); + if (CL_SUCCESS == ret) { + *num_devices = num; + + Device *did = (Device *)malloc(num * sizeof(Device)); + if (NULL == did) { + return ALLOC_FAILED; + } + + ret = clGetDeviceIDs(platform, type, num, did, NULL); + if (CL_SUCCESS != ret) { + free(did); + } else { + *devices = did; + } + } + map_cl_error_2_ee(ret); +} + +inline EE create_sub_device( + Device device, const cl_device_partition_property *properties, U32 *num_devices, Device **devices) +{ + U32 len; + I32 ret = clCreateSubDevices(device, properties, 0, NULL, &len); + if (CL_SUCCESS == ret) { + if (NULL != num_devices) { + *num_devices = len; + } + Device *d = (Device *)malloc(sizeof(Device) * len); + if (NULL == d) { + return ALLOC_FAILED; + } + ret = clCreateSubDevices(device, properties, len, d, NULL); + if (CL_SUCCESS == ret) { + *devices = d; + } else { + free(d); + } + } + map_cl_error_2_ee(ret); +} + +inline EE retain_device(Device device) +{ + I32 ret = clRetainDevice(device); + map_cl_error_2_ee(ret); +} + +inline EE release_device(Device device) +{ + I32 ret = clReleaseDevice(device); + map_cl_error_2_ee(ret); +} + +/** + * + *@brief get device information + * + * @warning please free memory space allocated for value + * + **/ + +inline EE get_device_info(Device device, cl_device_info info, void **value, U32 *len) +{ + if (NULL == value) { + return NULL_POINTER; + } + + size_t size; + I32 ret = clGetDeviceInfo(device, info, 0, NULL, &size); + if (CL_SUCCESS == ret) { + if (NULL != len) { + *len = (U32)(size); + } + void *data = malloc(size); + if (NULL == data) { + return ALLOC_FAILED; + } + ret = clGetDeviceInfo(device, info, size, data, NULL); + if (CL_SUCCESS != ret) { + free(data); + } else { + *value = data; + } + } + + map_cl_error_2_ee(ret); +} + +#define V_Q_Info(device, info, type, str, modifier) \ + { \ + type v; \ + I32 ret = clGetDeviceInfo(device, info, sizeof(type), &v, NULL); \ + if (CL_SUCCESS != ret) { \ + map_cl_error_2_ee(ret); \ + } \ + \ + printf(str "%" modifier "\n", v); \ + } + +#define B_Q_Info(device, info, str) \ + { \ + cl_bool v; \ + I32 ret = clGetDeviceInfo(device, info, sizeof(cl_bool), &v, NULL); \ + if (CL_SUCCESS != ret) { \ + map_cl_error_2_ee(ret); \ + } \ + \ + printf(str "%s\n", v ? 
"Yes" : "NO"); \ + } + +#define STR_Q_Info(device, info, str) \ + { \ + size_t len; \ + I32 ret = clGetDeviceInfo(device, info, 0, NULL, &len); \ + if (SUCCESS != ret) { \ + map_cl_error_2_ee(ret); \ + } \ + \ + char *v = (char *)malloc(len + 1); \ + ret = clGetDeviceInfo(device, info, len, v, NULL); \ + if (SUCCESS != ret) { \ + map_cl_error_2_ee(ret); \ + } \ + \ + v[len] = '\0'; \ + printf(str "%s\n", v); \ + free(v); \ + } + +/** + * @brief list all attributes of device + * + * @param device input + * + * @return + * 0 : success + * -1: error + */ +inline EE list_device_info(Device device) +{ + printf("..........Device Info..............\n"); + STR_Q_Info(device, CL_DEVICE_NAME, "Device name : "); + V_Q_Info(device, CL_DEVICE_ADDRESS_BITS, U32, "Address Bits : ", "u"); + B_Q_Info(device, CL_DEVICE_AVAILABLE, "Device Available : "); + B_Q_Info(device, CL_DEVICE_COMPILER_AVAILABLE, "Device Compiler Available : "); + B_Q_Info(device, CL_DEVICE_ENDIAN_LITTLE, "Device is little Endian : "); + B_Q_Info(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC Supported : "); + STR_Q_Info(device, CL_DEVICE_EXTENSIONS, "Device Extensions : "); + STR_Q_Info(device, CL_DEVICE_OPENCL_C_VERSION, "OpenCL C Version : "); + STR_Q_Info(device, CL_DEVICE_PROFILE, "Device Profile : "); + V_Q_Info(device, CL_DEVICE_PROFILING_TIMER_RESOLUTION, size_t, "Timer Resolution : ", "ld"); + { + cl_device_fp_config v; + I32 ret = clGetDeviceInfo( + device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &v, NULL); + if (CL_SUCCESS != ret) { + map_cl_error_2_ee(ret); + } + + if (v & CL_FP_DENORM) { + printf("Device Support Denorm Single Float \n"); + } + if (v & CL_FP_INF_NAN) { + printf("Device Support Single Float INF NAN\n"); + } + if (v & CL_FP_ROUND_TO_NEAREST) { + printf("Device Support Single Float Round to Nearest\n"); + } + if (v & CL_FP_ROUND_TO_ZERO) { + printf("Device Support Single Float Round to Zero \n"); + } + if (v & CL_FP_ROUND_TO_INF) { + printf("Device Support Single Float Round to Inf\n"); + } + if (v & CL_FP_FMA) { + printf("Device Support Single Float FMA\n"); + } + if (v & CL_FP_SOFT_FLOAT) { + printf("Device does not Support Hardware Single Float\n"); + } + } + + STR_Q_Info(device, CL_DEVICE_VENDOR, "Device Vendor : "); + V_Q_Info(device, CL_DEVICE_VENDOR_ID, U32, "Device Vendor ID : ", "u"); + STR_Q_Info(device, CL_DEVICE_VERSION, "Device Version : "); + STR_Q_Info(device, CL_DRIVER_VERSION, "Driver Version : "); + B_Q_Info(device, CL_DEVICE_HOST_UNIFIED_MEMORY, "Unified Memory Supported : "); + V_Q_Info(device, CL_DEVICE_MAX_PARAMETER_SIZE, size_t, "Max Parameter Size : ", "ld"); + + printf("..............Global Memory Configuration.............\n"); + V_Q_Info(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong, "Max Memory Allocate Size : ", "lu"); + V_Q_Info(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, U32, "Max Base Address Align Size : ", "u"); + V_Q_Info(device, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, U32, "Min Data Type align Size :", "u"); + + V_Q_Info(device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong, "Global Memory Cache Size : ", "lu"); + { + cl_device_mem_cache_type v; + I32 ret = clGetDeviceInfo( + device, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(cl_device_mem_cache_type), &v, NULL); + if (CL_SUCCESS != ret) { + map_cl_error_2_ee(ret); + } + switch (v) { + case CL_NONE: + printf("Global Memory does not have Cache \n"); + break; + case CL_READ_ONLY_CACHE: + printf("Global Memory has Readonly Cache \n"); + break; + case CL_READ_WRITE_CACHE: + printf("Global Memory has Read Write Cache \n"); + 
break; + default: + printf("Unknown Global Memory Cache type \n"); + break; + } + } + + V_Q_Info( + device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, U32, "Global Memory, Cacheline Size : ", "u"); + V_Q_Info(device, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong, "Global Memory Size : ", "lu"); + // CL_DEVICE_HALF_FP_CONFIG + + printf("..................Image Information...................\n"); + B_Q_Info(device, CL_DEVICE_IMAGE_SUPPORT, "Image Supported : "); + V_Q_Info(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, size_t, "2D Image Max Height : ", "ld"); + V_Q_Info(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, size_t, "2D Image Max Width : ", "ld"); + V_Q_Info(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, size_t, "3D Image Max Depth : ", "ld"); + V_Q_Info(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, size_t, "3D Image Max Height : ", "ld"); + V_Q_Info(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, size_t, "3D Image Max Width : ", "ld"); + V_Q_Info(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, U32, "Max Read Image Args : ", "u"); + V_Q_Info(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, U32, "Max Write Image Args : ", "u"); + V_Q_Info(device, CL_DEVICE_MAX_SAMPLERS, U32, "Max Samples : ", "u"); + + printf(".................Local Memory...............................\n"); + V_Q_Info(device, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong, "Local Memory Size : ", "lu"); + { + cl_device_local_mem_type v; + I32 ret = clGetDeviceInfo( + device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(cl_device_local_mem_type), &v, NULL); + if (CL_SUCCESS != ret) { + map_cl_error_2_ee(ret); + } + switch (v) { + case CL_LOCAL: + printf("Device has Dedicate Local Memory\n"); + break; + case CL_GLOBAL: + printf("Local Memory uses Global Memory\n"); + break; + default: + printf("%d\n", __LINE__); + } + } + + printf("...................CU Information...........................\n"); + V_Q_Info(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, U32, "Max Clock Frequency : ", "u"); + V_Q_Info(device, CL_DEVICE_MAX_COMPUTE_UNITS, U32, "Max Compute Units : ", "u"); + + printf(".................Constant Memory Information.............\n"); + V_Q_Info(device, CL_DEVICE_MAX_CONSTANT_ARGS, U32, "Max Constant Args : ", "u"); + V_Q_Info( + device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong, "Max Constant Buffer Size : ", "lu"); + + printf("...................ND Range Information........................\n"); + V_Q_Info(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t, "Max Work Group Size : ", "ld"); + V_Q_Info(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, U32, "Work Item Dimensions : ", "u"); + + { + size_t v[3]; + I32 ret = + clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, &v, NULL); + if (CL_SUCCESS != ret) { + map_cl_error_2_ee(ret); + } + printf("Max Work Item size : %ld %ld %ld\n", v[0], v[1], v[2]); + } + + printf(".....................Vector Information..................\n"); + V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, U32, "Native Vector Width Char : ", "u"); + V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, U32, "Native Vector Width Short : ", "u"); + V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, U32, "Native Vector Width Int : ", "u"); + V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, U32, "Native Vector Width Long : ", "u"); + V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, U32, "Native Vector Width Float : ", "u"); + V_Q_Info( + device, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, U32, "Native Vector Width Double : ", "u"); + V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, U32, "Native Vector Width Half : ", "u"); + + V_Q_Info( + device, 
CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, U32, "Preferred Vector Width Char : ", "u"); + V_Q_Info(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, U32, + "Preferred Vector Width Short : ", "u"); + V_Q_Info( + device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, U32, "Preferred Vector Width Int : ", "u"); + V_Q_Info( + device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, U32, "Preferred Vector Width Long : ", "u"); + V_Q_Info(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, U32, + "Preferred Vector Width Float : ", "u"); + V_Q_Info(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, U32, + "Preferred Vector Width Double : ", "u"); + V_Q_Info( + device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, U32, "Preferred Vector Width Half : ", "u"); + + return SUCCESS; +} + +#if defined(__cplusplus) +} +#endif +#endif diff --git a/common/gcl/include/program.h b/common/gcl/include/program.h new file mode 100644 index 00000000..c8dc15cd --- /dev/null +++ b/common/gcl/include/program.h @@ -0,0 +1,303 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
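program.h below exposes both online compilation (source to program) and offline caching (program to binary to program), which is what the kernel-binmap machinery later in this patch builds on. A sketch of the source-to-binary round trip, assuming a valid `ctx`/`dev` pair and a hypothetical kernel string `kernelSrc`; per the header's warning, the returned binary is not freed by the caller:

    U8 *bin = NULL;
    U32 binLen = 0;
    U32 srcLen = (U32)strlen(kernelSrc);
    // Compile once, then keep bin/binLen around; they can later be fed to
    // create_build_program_from_binary to skip online compilation.
    CHECK_STATUS(get_program_binary_from_source(
        ctx, &srcLen, kernelSrc, dev, NULL, &bin, &binLen));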
+
+#ifndef PROGRAM_H_
+#define PROGRAM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#define check_build_program_error(ret, program, device)                           \
+    {                                                                             \
+        if (SUCCESS != ret) {                                                     \
+            void *buildLog;                                                       \
+            U32 buildLogSize;                                                     \
+            ret = get_program_build_info(                                         \
+                program, device, CL_PROGRAM_BUILD_LOG, &buildLog, &buildLogSize); \
+            if (SUCCESS == ret) {                                                 \
+                printf("build log of device %s\n", (char *)buildLog);             \
+                free(buildLog);                                                   \
+            }                                                                     \
+        }                                                                         \
+    }
+
+/**
+ * @brief get build information of program
+ * @warning please free the memory associated with value
+ **/
+
+inline EE get_program_build_info(
+    Program program, Device device, cl_program_build_info info, void **value, U32 *size)
+{
+    if (NULL == value) {
+        return NULL_POINTER;
+    }
+
+    size_t len;
+    I32 ret = clGetProgramBuildInfo(program, device, info, 0, NULL, &len);
+    if (CL_SUCCESS == ret) {
+        if (NULL != size) {
+            *size = len;
+        }
+        void *data = malloc(len);
+        if (NULL == data) {
+            return ALLOC_FAILED;
+        }
+        ret = clGetProgramBuildInfo(program, device, info, len, data, NULL);
+        if (CL_SUCCESS == ret) {
+            *value = data;
+        } else {
+            free(data);
+        }
+    }
+
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief create program from source code
+ *
+ * @param context input, specify the associated context
+ * @param str input, source code
+ * @param program output, the created program
+ *
+ **/
+
+inline EE create_program_from_source(Context context, U32 *len, CI8 *str, Program *program)
+{
+    I32 ret;
+    size_t length = (size_t)(*len);
+    *program = clCreateProgramWithSource(context, 1, &str, &length, &ret);
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief create program from binary code
+ *
+ * @param context input, specify the associated context
+ * @param device input, the device to compile the code for
+ * @param length input, the length of the binary
+ * @param binary input, the program binary
+ * @param binary_status output, compile status of the binary
+ * @param program output, the created program
+ *
+ **/
+
+inline EE create_program_from_binary(Context context,
+    const Device device,
+    U32 *length,
+    CU8 **binary,
+    I32 *binary_status,
+    Program *program)
+{
+    I32 ret;
+    size_t len = *length;
+    *program = clCreateProgramWithBinary(context, 1, &device, &len, binary, binary_status, &ret);
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief build program
+ *
+ **/
+
+inline EE build_program(Program program, Device device, CI8 *options)
+{
+    I32 ret = clBuildProgram(program, 1, &device, options, NULL, NULL);
+    if (CL_SUCCESS != ret) {
+        check_build_program_error(ret, program, device);
+    }
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief create program from source then build it
+ *
+ * @param context input, specify the associated context
+ * @param source input, source code
+ * @param device input, source will be built on this device
+ * @param options input, options for compiling source
+ * @param program output, created and built program
+ *
+ */
+
+inline EE create_build_program_from_source(
+    Context context, U32 *length, CI8 *source, Device device, CI8 *options, Program *program)
+{
+    if (NULL == program) {
+        return NULL_POINTER;
+    }
+    Program prog;
+    EE ret = create_program_from_source(context, length, source, &prog);
+    if (SUCCESS != ret) {
+        return ret;
+    }
+    ret = build_program(prog, device, options);
+    *program = prog;
+    return ret;
+}
+
+/**
+ * @brief create program from binary then build it
+ *
+ **/
+
+inline EE create_build_program_from_binary(Context context,
+    Device device,
+    U32 *length,
+    CU8 **binary,
+    CI8 *options,
+    I32 *binary_status,
Program *program) +{ + if (NULL == program) { + return NULL_POINTER; + } + Program prog; + EE ret; + create_program_from_binary(context, device, length, binary, binary_status, &prog); + ret = build_program(prog, device, options); + *program = prog; + map_cl_error_2_ee(ret); +} + +/** + * @brief get information of program + * @warning please free memory associate with value + **/ + +inline EE get_program_info(Program program, cl_program_info info, void **value, U32 *size) +{ + if (NULL == value) { + return NULL_POINTER; + } + size_t len; + I32 ret = clGetProgramInfo(program, info, 0, NULL, &len); + if (CL_SUCCESS == ret) { + if (NULL != size) { + *size = len; + } + void *data = malloc(len); + if (NULL == data) { + return ALLOC_FAILED; + } + ret = clGetProgramInfo(program, info, len, data, NULL); + if (CL_SUCCESS == ret) { + *value = data; + } else { + free(data); + } + } + map_cl_error_2_ee(ret); +} + +/** + * @brief get information of program + * @warning please free memory associate with value + **/ +inline EE get_program_binary(Program program, U8 **binary, U32 *len) +{ + size_t size; + I32 ret = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); + if (CL_SUCCESS == ret) { + *len = (U32)(size); + void *data = malloc(size); + if (NULL == data) { + return ALLOC_FAILED; + } + ret = clGetProgramInfo( + program, CL_PROGRAM_BINARIES, size, &data, NULL); //waring: need set &data + if (CL_SUCCESS == ret) { + *binary = (U8 *)(data); + } else { + free(data); + } + } + map_cl_error_2_ee(ret); +} + +/** + * @brief get binary of source code + * + * @warning please don't free binary, it is return by ocl + * + **/ + +inline EE get_program_binary_from_source( + Context context, U32 *length, CI8 *str, Device device, CI8 *options, U8 **binary, U32 *len) +{ + if (NULL == binary) { + return NULL_POINTER; + } + + Program program; + EE ret = create_build_program_from_source(context, length, str, device, options, &program); + if (SUCCESS == ret) { + ret = get_program_binary(program, binary, len); + } + return ret; +} + +/* + inline EE create_program_from_il(Context context, + const void *il, U32 length, Program *program) { + //TODO + I32 ret; + * program = clCreateProgramWithIL(context, il, length, &ret); + map_cl_error_2_ee(ret); + } + */ + +inline EE release_program(Program program) +{ + map_cl_error_2_ee(clReleaseProgram(program)); +} + +inline EE compile_program(Program program, + const Device device, + CI8 *options, + U32 num_input_headers, + const Program *input_headers, + CI8 **header_include_names) +{ + I32 ret = clCompileProgram(program, 1, &device, options, num_input_headers, input_headers, + header_include_names, NULL, NULL); + if (CL_SUCCESS != ret) { + check_build_program_error(ret, program, device); + } + map_cl_error_2_ee(ret); +} + +inline EE link_program(Context context, + const Device device, + CI8 *options, + U32 num_input_programs, + const Program *input_programs, + Program *program) +{ + I32 ret; + *program = clLinkProgram( + context, 1, &device, options, num_input_programs, input_programs, NULL, NULL, &ret); + map_cl_error_2_ee(ret); +} + +inline EE unload_platform_compiler(Platform p) +{ + I32 ret = clUnloadPlatformCompiler(p); + map_cl_error_2_ee(ret); +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/common/gcl/src/CMakeLists.txt b/common/gcl/src/CMakeLists.txt new file mode 100644 index 00000000..ef8301af --- /dev/null +++ b/common/gcl/src/CMakeLists.txt @@ -0,0 +1,14 @@ +file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) + +# shared library 
+add_library(${PROJECT_NAME} SHARED ${srcs}) + +# static library +add_library(${PROJECT_NAME}_static STATIC ${srcs}) + +set_target_properties(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") +set_target_properties(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +install(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) diff --git a/common/gcl/src/ocl_context.cpp b/common/gcl/src/ocl_context.cpp new file mode 100644 index 00000000..8e348277 --- /dev/null +++ b/common/gcl/src/ocl_context.cpp @@ -0,0 +1,187 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
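In the ocl_context.cpp constructor below, _DEBUG builds create the command queue with CL_QUEUE_PROFILING_ENABLE, which is what makes per-kernel timing possible. A sketch of how such timing is read back, using only the standard clGetEventProfilingInfo API (`evt` is a hypothetical cl_event returned by an enqueue call on that queue):

    cl_ulong t0 = 0, t1 = 0;
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(t0), &t0, NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(t1), &t1, NULL);
    double ms = (double)(t1 - t0) * 1e-6;  // timestamps are in nanoseconds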
+ +#include "gcl_func.h" +#include "ocl_context.h" + +OCLContext::OCLContext() +{ + UNI_DEBUG_LOG("OCLContext %p constructor start\n", (char *)this); + this->handle = std::shared_ptr(new GCLHandle()); + this->handle->platformId = 0; + this->handle->deviceId = 0; + this->handle->deviceType = CL_DEVICE_TYPE_GPU; + this->handle->eventPtr = nullptr; + this->handle->numWaitEvents = 0; + this->handle->waitEvents = nullptr; + this->handle->t_execute = 0; + this->handle->t_total = 0; + this->handle->curOpName = "unknow"; + this->handle->deviceName = "unknow"; + this->handle->kernel_source = nullptr; + this->handle->kernel_binmap = nullptr; + this->handle->kernel_binmap_handle = nullptr; + this->handle->common_source_opt = "unknow"; + this->handle->common_source_ext = "unknow"; + this->handle->source_head_name[0] = "unknow"; + this->handle->useBinMap = false; + this->handle->existProfilingQueue = false; + CHECK_STATUS(get_platforms(&(this->handle->numPlatform), &(this->handle->platforms))); + CHECK_STATUS(platform_get_devices(this->handle->platforms[this->handle->platformId], + this->handle->deviceType, &this->handle->numDevice, &this->handle->devices)); + CHECK_STATUS(create_context(this->handle->platforms[this->handle->platformId], + this->handle->numDevice, this->handle->devices, &this->handle->context)); + cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, 0, 0}; +#ifdef _DEBUG + this->handle->eventPtr = &this->handle->eventObj; + props[1] = props[1] | CL_QUEUE_PROFILING_ENABLE; +#endif + CHECK_STATUS(create_command_queue_properties(this->handle->context, + this->handle->devices[this->handle->deviceId], props, &this->handle->queue)); + this->setDeviceName(); + this->registerBinaryKernelMap(); + if (!this->handle->useBinMap) { + this->registerSourceKernelMap(); + this->registerSourceKernelsExt(); + } + UNI_DEBUG_LOG("OCLContext %p constructor end\n", (char *)this); +} + +OCLContext::~OCLContext() +{ + UNI_DEBUG_LOG("OCLContext %p deconstructor start\n", (char *)this); + if (this->handle->platforms == nullptr) { + return; + } + CHECK_STATUS(finish(this->handle->queue)); + for (auto k : this->handle->programMap) { + CHECK_STATUS(release_program(k.second)); + } + for (auto k : this->handle->kernelMap) { + CHECK_STATUS(release_kernel(k.second)); + } + if (this->handle->useBinMap) { + delete (gcl_kernel_binmap *)this->handle->kernel_binmap; + dlclose(this->handle->kernel_binmap_handle); + } else { + CHECK_STATUS(release_program(this->handle->source_head[0])); + delete (gcl_kernel_source *)this->handle->kernel_source; + } + this->handle->kernelMap.clear(); + if (this->handle->existProfilingQueue) { + CHECK_STATUS(finish(this->handle->queue_profiling)); + CHECK_STATUS(release_command_queue(this->handle->queue_profiling)); + } + CHECK_STATUS(release_command_queue(this->handle->queue)); + CHECK_STATUS(release_context(this->handle->context)); + CHECK_STATUS(release_device(this->handle->devices[this->handle->deviceId])); + free(this->handle->devices); + free(this->handle->platforms); + UNI_DEBUG_LOG("OCLContext %p deconstructor end\n", (char *)this); +} + +void OCLContext::setDeviceName() +{ + cl_device_id device = this->handle->devices[this->handle->deviceId]; + U32 len; + I8 *data; + CHECK_STATUS(get_device_info(device, CL_DEVICE_NAME, (void **)&data, &len)); + I8 devName[64]; + for (U32 i = 0; i < len - 1; i++) { + if (data[i] == '-') { + data[i] = '_'; + } + if (data[i] == ' ') { + data[i] = '_'; + } + devName[i] = data[i]; + } + U32 version_len; + free(data); + CHECK_STATUS(get_device_info(device, 
CL_DEVICE_VERSION, (void **)&data, &version_len)); + std::string deviceV = std::string(data); + U32 be = deviceV.find("r"); + U32 end = deviceV.find("p", be + 1); + std::string numV = deviceV.substr(be + 1, end - be - 1); + U32 i = atoi(numV.c_str()); + if (i >= 14) { + devName[len - 1] = 'p'; + devName[len] = '\0'; + } else { + devName[len - 1] = '\0'; + } + free(data); + this->handle->deviceName = devName; +} + +void OCLContext::registerBinaryKernelMap() +{ + std::string libKernelBinName = "lib" + this->handle->deviceName + "_map.so"; + char *err; + void *dvm_handle = dlopen(libKernelBinName.c_str(), RTLD_LAZY); + if (dvm_handle) { + std::string func = "create_" + this->handle->deviceName + "_kernelbin_map"; + gcl_kernel_binmap *(*create_kernelbin_map)(); + dlerror(); + create_kernelbin_map = (gcl_kernel_binmap * (*)()) dlsym(dvm_handle, func.c_str()); + if ((err = dlerror()) != NULL) { + UNI_ERROR_LOG( + "Get %s in %s failed, error %s\n", func.c_str(), libKernelBinName.c_str(), err); + dlclose(dvm_handle); + } + gcl_kernel_binmap *kernel_binmap = create_kernelbin_map(); + this->handle->kernel_binmap = (void *)kernel_binmap; + this->handle->useBinMap = true; + this->handle->kernel_binmap_handle = dvm_handle; + } else { + UNI_DEBUG_LOG("try to dlopen %s failed, %s, create kernel from source code\n", + libKernelBinName.c_str(), dlerror()); + } +} + +void OCLContext::registerSourceKernelMap() +{ + gcl_kernel_source *kernel_source = new kernel_source_executor(); + this->handle->kernel_source = kernel_source; + KernelOption *common_opt; + if (!kernel_source->get_option("common", &common_opt)) { + UNI_ERROR_LOG("the common doesn't exist in optionMap\n"); + CHECK_STATUS(NULL_POINTER); + } + this->handle->common_source_opt = common_opt->option; + this->handle->common_source_ext = "#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable\n"; + this->handle->common_source_ext += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; + this->handle->source_head_name[0] = "kernel_def.h"; + KernelSource *head_source; + if (!kernel_source->get_source("kernel_def", &head_source)) { + UNI_ERROR_LOG("the kernel_def doesn't exist in sourceMap\n"); + CHECK_STATUS(NULL_POINTER); + } + CHECK_STATUS(create_program_from_source(this->handle->context, (U32 *)&head_source->len, + head_source->data, this->handle->source_head)); +} + +void OCLContext::registerSourceKernelsExt() +{ + Kernel tmpK; + CHECK_STATUS(gcl_get_kernel_from_map(this->handle.get(), "padding_input_gclmem", &tmpK)); + CHECK_STATUS(gcl_get_kernel_from_map(this->handle.get(), "mem_trans_nchw_to_ncwhc4", &tmpK)); + CHECK_STATUS(gcl_get_kernel_from_map(this->handle.get(), "mem_trans_ncwhc4_to_nchw", &tmpK)); + CHECK_STATUS(gcl_get_kernel_from_map(this->handle.get(), "mem_trans_ncwhc4_to_mtk", &tmpK)); +} + +OCLContext &OCLContext::getInstance() +{ + static OCLContext _instance; + return _instance; +} diff --git a/common/gcl/src/ocl_data_trans.cpp b/common/gcl/src/ocl_data_trans.cpp new file mode 100644 index 00000000..9057614d --- /dev/null +++ b/common/gcl/src/ocl_data_trans.cpp @@ -0,0 +1,279 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gcl_common.h" +#include "gcl_func.h" +#include "gclmem_desc_infer.h" +#include "ocl_data_trans.h" + +EE ocl_set_input(GCLHandle_t handle, + GCLMem_t input, + TensorDesc hostDesc, + const U8 *hostPtr, + GCLMem_t tmpBuf, + bool blocking) +{ + GCLMemDesc desc = input->desc; + if (desc.memType == GCL_MEM_BUF) { + U32 size = tensorNumBytes(hostDesc); + Kernel kernel; + U32 iw, ih, ic, in; + DataType hdt; + DataFormat hdf; + if (hostDesc.df == DF_NCHW || hostDesc.df == DF_NHWC) { + tensorSelectGet(hostDesc, &hdt, &hdf, &in, &ic, &ih, &iw); + } else if (hostDesc.df == DF_NORMAL) { + tensor2dGet(hostDesc, &hdt, &hdf, &ih, &iw); + ic = 1; + in = 1; + hdf = DF_NORMAL; + } else { + return NOT_SUPPORTED; + } + if (hdf == DF_NCHW) { + U32 ow, oh, pw, ph; + ow = input->desc.stride[0]; + oh = input->desc.stride[1]; + pw = input->desc.offset[0]; + ph = input->desc.offset[1]; + if (desc.memFormat == DF_NCHW || (ow == 1 && oh == 1 && pw == 0 && ph == 0)) { + GCLMem_t dst = (iw == ow && ih == oh) ? 
input : tmpBuf; + CHECK_STATUS(gcl_trans_memory( + handle, (void *)hostPtr, (void *)dst, &size, HOST_TO_DEVICE_BUF, CL_TRUE)); + if (iw != ow || ih != oh) { + CHECK_STATUS(gcl_get_kernel_from_map(handle, "padding_input_gclmem", &kernel)); + CHECK_STATUS( + gcl_set_kernelArgs(kernel, iw, ih, pw, ph, ow, oh, tmpBuf->mem, input->mem)); + U32 gs[3] = {(ow + 3) / 4 * 4, (oh + 3) / 4 * 4, ic}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + CHECK_STATUS( + gcl_run_kernel(handle, kernel, dim, gs, ls, "padding_input_gclmem")); + } + return SUCCESS; + } + + if (desc.memFormat == DF_NCWHC4) { + if (hdt != DT_F16) { + return NOT_SUPPORTED; + } + oh = input->desc.stride[0]; + ow = input->desc.stride[1]; + ph = input->desc.offset[0]; + pw = input->desc.offset[1]; + gcl_trans_memory( + handle, (void *)hostPtr, (void *)tmpBuf, &size, HOST_TO_DEVICE_BUF, blocking); + CHECK_STATUS(gcl_get_kernel_from_map(handle, "mem_trans_nchw_to_ncwhc4", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw, ih, 0, 0, ow, oh, pw, ph, iw, ih, ic, + iw, ih, ic, 0, 0, tmpBuf->mem, input->mem)); + U32 gs[3] = {(iw + 3) / 4, ih, (ic + 3) / 4 * in}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + CHECK_STATUS( + gcl_run_kernel(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_ncwhc4")); + return SUCCESS; + } + return NOT_SUPPORTED; + } + + if (hdf == DF_NHWC) { + U32 oc, ow, pc, pw; + oc = input->desc.stride[0]; + ow = input->desc.stride[1]; + pc = input->desc.offset[0]; + pw = input->desc.offset[1]; + if (desc.memFormat == DF_NHWC) { + if (ic == oc && iw == ow && pc == 0 && pw == 0) { + gcl_trans_memory(handle, (void *)hostPtr, (void *)input, &size, + HOST_TO_DEVICE_BUF, blocking); + return SUCCESS; + } + } + return NOT_SUPPORTED; + } + + if (hdf == DF_NORMAL) { + U32 oh, ow, ph, pw; + ow = input->desc.stride[0]; + oh = input->desc.stride[1]; + pw = input->desc.offset[0]; + ph = input->desc.offset[1]; + if (desc.memFormat == DF_NCHW) { + if (iw == ow && ih == oh && pw == 0 && ph == 0) { + gcl_trans_memory(handle, (void *)hostPtr, (void *)input, &size, + HOST_TO_DEVICE_BUF, blocking); + return SUCCESS; + } + } + return NOT_SUPPORTED; + } + } + return NOT_SUPPORTED; +} + +EE ocl_get_output(GCLHandle_t handle, const GCLMem_t input, TensorDesc hostDesc, bool blocking) +{ + GCLMemDesc desc = input->desc; + Kernel kernel; + DataType host_dt; + DataFormat host_df, device_df; + U32 ow, oh, oc, on; + U32 iw, ih, ic, pw, ph; + tensorSelectGet(hostDesc, &host_dt, &host_df, &on, &oc, &oh, &ow); + U32 size = tensorNumBytes(hostDesc); + U32 offset = 0; + get_gclmem_dim(desc, &iw, &ih, &ic, &pw, &ph); + device_df = desc.memFormat; + if (desc.byteSize < size) { + CHECK_STATUS(NOT_MATCH); + } + + if (device_df == DF_NCWHC4 && (host_df == DF_NCHW || host_df == DF_NORMAL) && + host_dt == DT_F16 && (ih != 1 || iw != 1)) { + if (desc.byteSize < size * 2) { + CHECK_STATUS(NOT_MATCH); + } + offset = iw * ih * ic * 4; + CHECK_STATUS(gcl_get_kernel_from_map(handle, "mem_trans_ncwhc4_to_nchw", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw, ih, pw, ph, ow, oh, 0, 0, ow, oh, oc, ow, oh, + oc, 0, offset, input->mem, input->mem)); + U32 gs[3] = {oh, (ow + 3) >> 2, (oc + 3) / 4 * on}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "mem_trans_ncwhc4_to_nchw")); + offset = offset * bytesOf(host_dt); + } + + if (device_df == DF_NCWHC4 && host_df == DF_MKT) { + if (desc.byteSize < size * 2) { + CHECK_STATUS(NOT_MATCH); + } + offset = iw * ih * ic * 4; + U32 gs[2] = {oh, (oc + 3) / 4}; + U32 ls[2] = {0, 0}; + 
U32 dim = 2;
+        CHECK_STATUS(gcl_get_kernel_from_map(handle, "mem_trans_ncwhc4_to_mtk", &kernel));
+        CHECK_STATUS(gcl_set_kernelArgs(
+            kernel, ih, iw, ph, pw, oc, offset, gs[0], gs[1], input->mem, input->mem));
+        CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "mem_trans_ncwhc4_to_mtk"));
+        offset = offset * bytesOf(host_dt);
+    }
+    CHECK_STATUS(gcl_map_memory(handle, input, &offset, &size, CL_MAP_READ, blocking));
+    return SUCCESS;
+}
+
+EE ocl_trans_mem(
+    GCLHandle_t handle, GCLMem_t src, GCLMemDesc srcDesc, GCLMem_t dst, GCLMemDesc dstDesc)
+{
+    if (srcDesc.memType == dstDesc.memType && srcDesc.memType == GCL_MEM_BUF) {
+        U32 sw_str, sh_str, sc_str, sw_off, sh_off;
+        U32 dw_str, dh_str, dc_str, dw_off, dh_off;
+        DataFormat sf, df;
+        sf = srcDesc.memFormat;
+        df = dstDesc.memFormat;
+        get_gclmem_dim(srcDesc, &sw_str, &sh_str, &sc_str, &sw_off, &sh_off);
+        get_gclmem_dim(dstDesc, &dw_str, &dh_str, &dc_str, &dw_off, &dh_off);
+        U32 gs[3] = {0, 0, 0};
+        U32 ls[3] = {0, 0, 0};
+        U32 dim = 3;
+        Mem srcMem = src->mem;
+        Mem dstMem = dst->mem;
+        Kernel kernel;
+        if (sf == df) {
+            if (sw_str == dw_str && sh_str == dh_str && sc_str == dc_str && sw_off == dw_off &&
+                sh_off == dh_off) {
+                U32 len = srcDesc.num;
+                gs[0] = (len + 3) / 4;
+                ls[0] = 0;
+                dim = 1;
+                CHECK_STATUS(gcl_create_kernel(handle, "copy_f16", &kernel));
+                CHECK_STATUS(gcl_set_kernelArgs(kernel, len, len, 0, 0, gs[0], srcMem, dstMem));
+                gcl_set_kernelVec(handle, kernel, dim, gs, ls, "copy_f16");
+            } else if (sf == DF_NCHW && sw_off == 0 && sh_off == 0 && sc_str == dc_str) {
+                gs[0] = (dw_str + 3) / 4 * 4;
+                gs[1] = (dh_str + 3) / 4 * 4;
+                gs[2] = dc_str;
+                dim = 3;
+                CHECK_STATUS(gcl_create_kernel(handle, "padding_input_gclmem", &kernel));
+                CHECK_STATUS(gcl_set_kernelArgs(
+                    kernel, sw_str, sh_str, dw_off, dh_off, dw_str, dh_str, srcMem, dstMem));
+                gcl_set_kernelVec(handle, kernel, dim, gs, ls, "padding_input_gclmem");
+            } else {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+        } else if (sf == DF_NCHW && df == DF_NCWHC4) {
+            U32 iw, ih, ic;
+            TensorDesc cpuDesc = tensor4df(srcDesc.dt, srcDesc.df, srcDesc.dims[3], srcDesc.dims[2],
+                srcDesc.dims[1], srcDesc.dims[0]);
+            tensorSelectGet(cpuDesc, NULL, NULL, NULL, &ic, &ih, &iw);
+            gs[0] = (iw + 3) / 4;
+            gs[1] = ih;
+            gs[2] = (ic + 3) / 4;
+            U32 ls[3] = {0, 0, 0};
+            U32 dim = 3;
+            CHECK_STATUS(gcl_create_kernel(handle, "mem_trans_nchw_to_ncwhc4", &kernel));
+            CHECK_STATUS(gcl_set_kernelArgs(kernel, sw_str, sh_str, sw_off, sh_off, dw_str, dh_str,
+                dw_off, dh_off, iw, ih, ic, iw, ih, ic, 0, 0, srcMem, dstMem));
+            gcl_set_kernelVec(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_ncwhc4");
+        } else {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+    } else {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    return SUCCESS;
+}
+
+EE ocl_map_mem(GCLHandle_t handle, GCLMem_t gclMem, GCLMemDesc desc)
+{
+    DataType dt;
+    DataFormat df;
+    U32 n, c, h, w;
+    CHECK_STATUS(gclmem_get_desc_non_padding(desc, &dt, &df, &n, &c, &h, &w));
+
+    DataFormat mf = desc.memFormat;
+    U32 w_str, h_str, c_str, w_off, h_off;
+    get_gclmem_dim(desc, &w_str, &h_str, &c_str, &w_off, &h_off);
+    bool needTrans = true;
+    U32 offset = 0;
+    U32 size = n * c * h * w * bytesOf(dt);
+    if (w_str == w && h_str == h && c_str == c && mf != DF_NCWHC4) {
+        needTrans = false;
+    }
+    if (w_str == 1 && h_str == 1 && mf == DF_NCWHC4) {
+        needTrans = false;
+    }
+    if (needTrans) {
+        if (mf == DF_NCWHC4) {
+            U32 gs[3] = {h, (w + 3) >> 2, (c + 3) / 4 * n};
+            U32 ls[3] = {0, 0, 0};
+            U32 dim = 3;
+            Kernel kernel;
+            Mem buf = gclMem->mem;
+            offset = desc.num;
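+            // In-place layout transform: the NCWHC4 data is unpacked to NCHW inside
+            // the same buffer, starting at element offset desc.num (past the packed
+            // data); only that unpacked region is mapped back to the host below.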
+            CHECK_STATUS(gcl_get_kernel_from_map(handle, "mem_trans_ncwhc4_to_nchw", &kernel));
+            CHECK_STATUS(gcl_set_kernelArgs(kernel, w_str, h_str, w_off, h_off, w, h, 0, 0, w, h, c,
+                w, h, c, 0, offset, buf, buf));
+            CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "mem_trans_ncwhc4_to_nchw"));
+            offset = desc.num * bytesOf(dt);
+        } else {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+    }
+    CHECK_STATUS(gcl_map_memory(handle, gclMem, &offset, &size, CL_MAP_READ, CL_TRUE));
+    return SUCCESS;
+}
diff --git a/common/gcl/tools/device_info/CMakeLists.txt b/common/gcl/tools/device_info/CMakeLists.txt
new file mode 100644
index 00000000..d9dbd182
--- /dev/null
+++ b/common/gcl/tools/device_info/CMakeLists.txt
@@ -0,0 +1,28 @@
+cmake_minimum_required(VERSION 3.2)
+
+file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake)
+if (BOLT_CONFIGURE_FILE)
+    set(USE_LLVM_CLANG ON)
+    set(USE_GNU_GCC OFF)
+    set(USE_MALI ON)
+    set(USE_DYNAMIC_LIBRARY OFF)
+    include(${BOLT_CONFIGURE_FILE})
+else (BOLT_CONFIGURE_FILE)
+    message(FATAL_ERROR "
+FATAL: can not find bolt.cmake in <BOLT_ROOT>/common/cmakes directory,
+       please set shell or cmake environment variable BOLT_ROOT.
+    ")
+endif (BOLT_CONFIGURE_FILE)
+
+project(gclinfo)
+
+set_policy()
+
+find_package(Gcl)
+
+set_c_cxx_flags()
+
+set_test_c_cxx_flags()
+
+add_executable(gcl_info clinfo.cpp)
+target_link_libraries(gcl_info ${OPENCL_LIBRARIES} log -Wl,-allow-shlib-undefined, -static-libstdc++)
diff --git a/gcl/tools/device_info/clinfo.cpp b/common/gcl/tools/device_info/clinfo.cpp
similarity index 81%
rename from gcl/tools/device_info/clinfo.cpp
rename to common/gcl/tools/device_info/clinfo.cpp
index 97530a7b..a705d882 100644
--- a/gcl/tools/device_info/clinfo.cpp
+++ b/common/gcl/tools/device_info/clinfo.cpp
@@ -1,23 +1,22 @@
 // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#include"gcl.h"
-int main(){
+#include "gcl.h"
+int main()
+{
     GCLHandle_t handle;
     CHECK_STATUS(gcl_create_handle(&handle));
     CHECK_STATUS(list_device_info(handle->devices[handle->deviceId]));
-    CHECK_STATUS(gcl_destroy_handle(handle));
+    gcl_destroy_handle(handle);
     return 0;
 }
diff --git a/common/gcl/tools/gcl_sample/CMakeLists.txt b/common/gcl/tools/gcl_sample/CMakeLists.txt
new file mode 100644
index 00000000..0fdb0401
--- /dev/null
+++ b/common/gcl/tools/gcl_sample/CMakeLists.txt
@@ -0,0 +1,23 @@
+cmake_minimum_required(VERSION 3.4.1)
+
+file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake)
+if (BOLT_CONFIGURE_FILE)
+    include(${BOLT_CONFIGURE_FILE})
+else (BOLT_CONFIGURE_FILE)
+    message(FATAL_ERROR "
+FATAL: can not find bolt.cmake in <BOLT_ROOT>/common/cmakes directory,
+       please set shell or cmake environment variable BOLT_ROOT.
+    ")
+endif (BOLT_CONFIGURE_FILE)
+
+project(sample)
+
+set_policy()
+
+find_package(Gcl)
+
+set_c_cxx_flags()
+set_test_c_cxx_flags()
+
+add_executable(sample sample.cpp)
+target_link_libraries(sample ${KERNELSOURCE_LIBRARIES} ${OPENCL_LIBRARIES} log -Wl,-allow-shlib-undefined, -static-libstdc++)
diff --git a/common/gcl/tools/gcl_sample/build.sh b/common/gcl/tools/gcl_sample/build.sh
new file mode 100644
index 00000000..8abf4138
--- /dev/null
+++ b/common/gcl/tools/gcl_sample/build.sh
@@ -0,0 +1,20 @@
+options="-DUSE_CROSS_COMPILE=ON \
+         -DBUILD_TEST=ON \
+         -DUSE_GNU_GCC=OFF \
+         -DUSE_LLVM_CLANG=ON \
+         -DUSE_MALI=ON \
+         -DUSE_NEON=ON \
+         -DUSE_DYNAMIC_LIBRARY=OFF \
+         -DCMAKE_C_COMPILER=`which aarch64-linux-android21-clang` \
+         -DCMAKE_CXX_COMPILER=`which aarch64-linux-android21-clang++` \
+         -DCMAKE_STRIP=`which aarch64-linux-android-strip` "
+rm -rf ./build_gcl_sample
+mkdir ./build_gcl_sample
+cd ./build_gcl_sample
+cmake .. ${options}
+make -j33
+cd ..
+
+
+
+
diff --git a/gcl/tools/gcl_sample/cl/sample.cl b/common/gcl/tools/gcl_sample/cl/sample.cl
similarity index 75%
rename from gcl/tools/gcl_sample/cl/sample.cl
rename to common/gcl/tools/gcl_sample/cl/sample.cl
index 555f5271..c2ceaa8d 100644
--- a/gcl/tools/gcl_sample/cl/sample.cl
+++ b/common/gcl/tools/gcl_sample/cl/sample.cl
@@ -11,23 +11,36 @@
 // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-__kernel void sample(const int iw_str, const int iwh_str, const int fc, const int flt_str, const int ow_str, const int oh_str, const int bx, const int by, - __global const T* in, __global const T* flt, __global const T* bias, __global T* out){ +__kernel void sample(const int iw_str, + const int iwh_str, + const int fc, + const int flt_str, + const int ow_str, + const int oh_str, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __global const T *bias, + __global T *out) +{ const int idx = get_global_id(0); const int idy = get_global_id(1); const int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; + if (idx >= bx || idy >= by) { + return; + } T3 flt_val; T3 in_val; - T out_val; + T out_val; out_val = bias[idz]; int flt_off = idz * flt_str; - int in_off = idy * iw_str + idx; - for(int i = 0 ; i < fc; ++i) { - for(uchar j = 0; j < 3; ++j) { + int in_off = idy * iw_str + idx; + for (int i = 0; i < fc; ++i) { + for (uchar j = 0; j < 3; ++j) { flt_val = vload3(0, flt + flt_off + j * 3); - in_val = vload3(0, in + in_off + j * iw_str); + in_val = vload3(0, in + in_off + j * iw_str); out_val += flt_val.x * in_val.x; out_val += flt_val.y * in_val.y; out_val += flt_val.z * in_val.z; @@ -35,7 +48,7 @@ __kernel void sample(const int iw_str, const int iwh_str, const int fc, const in flt_off += 9; in_off += iwh_str; } - + int out_off = (idz * oh_str + idy) * ow_str + idx; out[out_off] = out_val; } diff --git a/common/gcl/tools/gcl_sample/sample.cpp b/common/gcl/tools/gcl_sample/sample.cpp new file mode 100644 index 00000000..da6f3154 --- /dev/null +++ b/common/gcl/tools/gcl_sample/sample.cpp @@ -0,0 +1,168 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
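+
+// Sample flow: create a GCL handle, fill an FP16 NCWHC4 input tensor, a 3x3
+// filter and a bias with random data, upload them to device buffers, then build
+// and time the conv_direct_s1_fn_spe_nchw_3{3..8} kernel variants with
+// gcl_run_kernelVec_timing (the result is checked in _DEBUG builds).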
+#ifdef _USE_FP16
+#include "gcl.h"
+#include "types.h"
+#include "libkernelsource.h"
+
+void setMemDesc(GCLMem_t mem,
+    DataType dt,
+    DataFormat ft,
+    GCLMemType mt,
+    U32 s0,
+    U32 s1,
+    U32 s2,
+    U32 off0,
+    U32 off1,
+    U32 off2)
+{
+    mem->desc.stride[0] = s0 + 2 * off0;
+    mem->desc.stride[1] = s1 + 2 * off1;
+    mem->desc.stride[2] = s2;
+    mem->desc.offset[0] = off0;
+    mem->desc.offset[1] = off1;
+    mem->desc.offset[2] = off2;
+    // num and byteSize must cover the padded strides; otherwise the host-side
+    // fill loops in main() would overrun the allocation.
+    mem->desc.num = (s0 + 2 * off0) * (s1 + 2 * off1) * s2;
+    mem->desc.byteSize = mem->desc.num * bytesOf(dt);
+    mem->desc.memFormat = ft;
+    mem->desc.memType = mt;
+}
+
+int main()
+{
+    GCLHandle_t handle;
+    CHECK_STATUS(gcl_create_handle(&handle));
+    U32 iw, ih, ic, in;
+    U32 fw, fh, fc, fn;
+    U32 sv, pv;
+    U32 ow, oh, oc, on;
+
+    iw = 1440;
+    ih = 960;
+    ic = 4;
+    in = 1;
+
+    fw = 3;
+    fh = 3;
+    fc = 4;
+    fn = 1;
+
+    ow = iw;
+    oh = ih;
+    oc = fn;
+    on = in;
+
+    sv = 1;
+    pv = 1;
+
+    TensorDesc outDesc = tensor4d(DT_F16, on, oc, oh, ow);
+    GCLMem_t input = gcl_create_gclmem();
+    GCLMem_t flt = gcl_create_gclmem();
+    GCLMem_t bias = gcl_create_gclmem();
+    GCLMem_t output = gcl_create_gclmem();
+    setMemDesc(input, DT_F16, DF_NCWHC4, GCL_MEM_BUF, iw + 8, ih, ic, pv, pv, 0);
+    setMemDesc(flt, DT_F16, DF_NCWHC4, GCL_MEM_BUF, fw * fh, fc, fn, 0, 0, 0);
+    setMemDesc(bias, DT_F16, DF_NCHW, GCL_MEM_BUF, fn, 1, 1, 0, 0, 0);
+    setMemDesc(output, DT_F16, DF_NCHW, GCL_MEM_BUF, ow, oh, oc * 4, 0, 0, 0);
+    CHECK_STATUS(gcl_create_memory(handle, input));
+    CHECK_STATUS(gcl_create_memory(handle, flt));
+    CHECK_STATUS(gcl_create_memory(handle, bias));
+    CHECK_STATUS(gcl_create_memory(handle, output));
+
+    U8 *iptr = new U8[input->desc.byteSize];
+    U8 *fptr = new U8[flt->desc.byteSize];
+    U8 *bptr = new U8[bias->desc.byteSize];
+
+    F16 *ival = (F16 *)iptr;
+    F16 *fval = (F16 *)fptr;
+    F16 *bval = (F16 *)bptr;
+    for (U32 i = 0; i < input->desc.num; i++) {
+        ival[i] = (rand() & 1023) / 1024.0 - 0.5;
+        U32 s0 = input->desc.stride[0];
+        U32 s1 = input->desc.stride[1];
+        U32 j = i % (s0 * s1);
+        U32 h = j % s1;
+        U32 w = j / s1;
+        if (h < pv || w < pv) {
+            ival[i] = 0;
+        }
+        if (h >= ih + pv || w >= iw + pv) {
+            ival[i] = 0;
+        }
+    }
+
+    for (U32 i = 0; i < flt->desc.num; i++) {
+        fval[i] = (rand() & 1023) / 1024.0 - 0.5;
+    }
+
+    for (U32 i = 0; i < bias->desc.num; i++) {
+        bval[i] = (rand() & 1023) / 1024.0 - 0.5;
+    }
+
+    CHECK_STATUS(gcl_trans_memory(
+        handle, (void *)iptr, (void *)input, &input->desc.byteSize, HOST_TO_DEVICE_BUF, CL_TRUE));
+    CHECK_STATUS(gcl_trans_memory(
+        handle, (void *)fptr, (void *)flt, &flt->desc.byteSize, HOST_TO_DEVICE_BUF, CL_TRUE));
+    CHECK_STATUS(gcl_trans_memory(
+        handle, (void *)bptr, (void *)bias, &bias->desc.byteSize, HOST_TO_DEVICE_BUF, CL_TRUE));
+
+    Kernel kernel;
+    char kernelname[128];
+    U32 be = 0;
+    U32 end = 0;
+    for (int i = 3; i <= 8; i++) {
+        sprintf(kernelname, "conv_direct_s1_fn_spe_nchw_3%d", i);
+        CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel));
+        U32 iw_str = input->desc.stride[0];
+        U32 ih_str = input->desc.stride[1];
+        U32 ihw_str = iw_str * ih_str;
+        U32 ic_str = (input->desc.stride[2] + 3) / 4;
+        U32 ih_off = input->desc.offset[0] - pv;
+        U32 iw_off = input->desc.offset[1] - pv;
+        U32 sh = 1;
+        U32 ow_str = output->desc.stride[0];
+        U32 oh_str = output->desc.stride[1];
+        U32 ohw_str = ow_str * oh_str;
+        U32 ow_off = output->desc.offset[0];
+        U32 oh_off = output->desc.offset[1];
+
+        U32 gs[2];
+        gs[0] = oh;
+        gs[1] = (ow + i - 1) / i;
+        U32 dim = 2;
+        U32 ls[2] = {0, 0};
+        CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, oh_str,
+            ow_str, ohw_str, oh_off, ow_off, ow, sh, gs[0], gs[1], input->mem, flt->mem, bias->mem,
+            output->mem));
+        CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname));
+        end = handle->kernelVec.size();
+        CHECK_STATUS(gcl_run_kernelVec_timing(handle, be, end));
+        CHECK_STATUS(gcl_run_kernelVec_timing(handle, be, end));
+        CHECK_STATUS(gcl_run_kernelVec_timing(handle, be, end));
+        CHECK_STATUS(gcl_run_kernelVec_timing(handle, be, end));
+        be = end;
+#ifdef _DEBUG
+        CHECK_STATUS(gcl_check_data(handle, outDesc, output, 0, false));
+#endif
+    }
+
+    delete[] iptr;
+    delete[] fptr;
+    delete[] bptr;
+    gcl_destroy_gclmem(input);
+    gcl_destroy_gclmem(flt);
+    gcl_destroy_gclmem(bias);
+    gcl_destroy_gclmem(output);
+    gcl_destroy_handle(handle);
+}
+#endif
diff --git a/common/gcl/tools/kernel_lib_compile/CMakeLists.txt b/common/gcl/tools/kernel_lib_compile/CMakeLists.txt
new file mode 100644
index 00000000..8fc8b455
--- /dev/null
+++ b/common/gcl/tools/kernel_lib_compile/CMakeLists.txt
@@ -0,0 +1,34 @@
+cmake_minimum_required(VERSION 3.2)
+
+
+file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake)
+if (BOLT_CONFIGURE_FILE)
+    include(${BOLT_CONFIGURE_FILE})
+else (BOLT_CONFIGURE_FILE)
+    message(FATAL_ERROR "
+FATAL: can not find bolt.cmake in <BOLT_ROOT>/common/cmakes directory,
+       please set shell or cmake environment variable BOLT_ROOT.
+    ")
+endif (BOLT_CONFIGURE_FILE)
+
+project(KERNELBIN)
+
+set_policy()
+
+find_package(Gcl)
+include_directories(${PROJECT_SOURCE_DIR}/include)
+set_project_install_directory()
+
+set_c_cxx_flags()
+
+execute_process(
+    COMMAND bash buildKernelLib.sh
+    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+)
+
+file(GLOB_RECURSE kernel_src_list "src/*.cpp")
+add_library(kernelbin SHARED ${kernel_src_list})
+add_library(kernelbin_static STATIC ${kernel_src_list})
+set_target_properties(kernelbin_static PROPERTIES OUTPUT_NAME "kernelbin")
+set_target_properties(kernelbin PROPERTIES CLEAN_DIRECT_OUTPUT 1)
+set_target_properties(kernelbin_static PROPERTIES CLEAN_DIRECT_OUTPUT 1)
diff --git a/gcl/tools/kernel_lib_compile/buildKernelLib.sh b/common/gcl/tools/kernel_lib_compile/buildKernelLib.sh
similarity index 100%
rename from gcl/tools/kernel_lib_compile/buildKernelLib.sh
rename to common/gcl/tools/kernel_lib_compile/buildKernelLib.sh
diff --git a/common/gcl/tools/kernel_lib_compile/device_name/CMakeLists.txt b/common/gcl/tools/kernel_lib_compile/device_name/CMakeLists.txt
new file mode 100644
index 00000000..e311eeea
--- /dev/null
+++ b/common/gcl/tools/kernel_lib_compile/device_name/CMakeLists.txt
@@ -0,0 +1,24 @@
+cmake_minimum_required(VERSION 3.2)
+
+file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake)
+if (BOLT_CONFIGURE_FILE)
+    include(${BOLT_CONFIGURE_FILE})
+else (BOLT_CONFIGURE_FILE)
+    message(FATAL_ERROR "
+FATAL: can not find bolt.cmake in <BOLT_ROOT>/common/cmakes directory,
+       please set shell or cmake environment variable BOLT_ROOT.
+    ")
+endif (BOLT_CONFIGURE_FILE)
+
+project(deviceName)
+
+set_policy()
+
+set_c_cxx_flags()
+
+set_test_c_cxx_flags()
+
+find_package(Gcl)
+
+add_executable(gcl_device_name device_name.cpp)
+target_link_libraries(gcl_device_name ${OPENCL_LIBRARIES} -Wl,-allow-shlib-undefined, -static-libstdc++)
diff --git a/common/gcl/tools/kernel_lib_compile/device_name/device_name.cpp b/common/gcl/tools/kernel_lib_compile/device_name/device_name.cpp
new file mode 100644
index 00000000..3f3ccac7
--- /dev/null
+++ b/common/gcl/tools/kernel_lib_compile/device_name/device_name.cpp
@@ -0,0 +1,26 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "gcl.h"
+#include <stdio.h>
+
+int main()
+{
+    GCLHandle_t handle;
+    CHECK_STATUS(gcl_create_handle(&handle));
+    FILE *fp = fopen("deviceNameFile", "w");
+    fwrite(handle->deviceName.c_str(), handle->deviceName.length(), 1, fp);
+    fclose(fp);
+    gcl_destroy_handle(handle);
+    return 0;
+}
diff --git a/common/gcl/tools/kernel_lib_compile/kernel_bin/CMakeLists.txt b/common/gcl/tools/kernel_lib_compile/kernel_bin/CMakeLists.txt
new file mode 100644
index 00000000..0081c2f4
--- /dev/null
+++ b/common/gcl/tools/kernel_lib_compile/kernel_bin/CMakeLists.txt
@@ -0,0 +1,24 @@
+cmake_minimum_required(VERSION 3.2)
+
+file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake)
+if (BOLT_CONFIGURE_FILE)
+    include(${BOLT_CONFIGURE_FILE})
+else (BOLT_CONFIGURE_FILE)
+    message(FATAL_ERROR "
+FATAL: can not find bolt.cmake in <BOLT_ROOT>/common/cmakes directory,
+       please set shell or cmake environment variable BOLT_ROOT.
+    ")
+endif (BOLT_CONFIGURE_FILE)
+
+project(gclBinary)
+
+set_policy()
+
+find_package(Gcl)
+
+set_c_cxx_flags()
+
+set_test_c_cxx_flags()
+
+add_executable(gcl_binary clbinary.cpp)
+target_link_libraries(gcl_binary ${OPENCL_LIBRARIES} -Wl,-allow-shlib-undefined, -static-libstdc++)
diff --git a/common/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp b/common/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp
new file mode 100644
index 00000000..c6f2e89d
--- /dev/null
+++ b/common/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp
@@ -0,0 +1,195 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "gcl.h"
+#include <getopt.h>
+
+const char *imagesource = "#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable\n";
+
+const char *half16source = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+
+void printHelp()
+{
+    printf("please follow the Linux command-line conventions, or the program will not work\n");
+    printf("the program only supports OpenCL kernel compilation for now\n");
+    printf("-i or --input to specify OpenCL input cl source file name\n");
+    printf("-o or --output to specify OpenCL output binary file name\n");
+    printf("-O or --options to specify OpenCL compiling options\n");
+}
+
+bool GetFileLength(CI8 *filename, U32 *len)
+{
+    if ((NULL == filename) || (0 == strlen(filename))) {
+        return false;
+    }
+    FILE *fp = fopen(filename, "rb");
+    if (NULL == fp) {
+        return false;
+    }
+    rewind(fp);
+    if (0 != fseek(fp, 0, SEEK_END)) {
+        return false;
+    }
+    *len = ftell(fp);
+    fclose(fp);
+    return true;
+}
+
+bool LoadBinFile(CI8 *filename, I8 *str, U32 len)
+{
+    if ((NULL == filename) || (0 == strlen(filename))) {
+        return false;
+    }
+    FILE *fp = fopen(filename, "rb");
+    if (NULL == fp) {
+        return false;
+    }
+    rewind(fp);
+    if (len != fread(str, sizeof(char), len, fp)) {
+        fclose(fp);
+        return false;
+    }
+    fclose(fp);
+    return true;
+}
+
+bool StoreToBinFile(CI8 *filename, U32 length, CU8 *s)
+{
+    if ((NULL == s) || (NULL == filename) || (0 == strlen(filename))) {
+        return false;
+    }
+    FILE *fp = fopen(filename, "wb");
+    if (NULL == fp) {
+        return false;
+    }
+    if (length != fwrite(s, sizeof(char), length, fp)) {
+        fclose(fp);
+        return false;
+    }
+    fclose(fp);
+    return true;
+}
+
+void parseCommandLine(I32 argc, I8 *argv[], I8 **inputFilename, I8 **outputFilename, I8 **options)
+{
+    const struct option long_options[] = {{"input", 1, nullptr, 'i'}, {"output", 1, nullptr, 'o'},
+        {"options", 1, nullptr, 'O'}, {nullptr, 0, nullptr, 0}};
+    bool setInput = false;
+    bool setOutput = false;
+    bool setOptions = false;
+    int optionIndex = 0;
+    int ch;
+    while ((ch = getopt_long(argc, argv, "i:o:O:", long_options, &optionIndex)) != -1) {
+        switch (ch) {
+            case 'i':
+                printf("input file name is %s\n", optarg);
+                *inputFilename = optarg;
+                if (setInput) {
+                    printf("you specify input file name twice, program will exit\n");
+                    exit(0);
+                }
+                setInput = true;
+                break;
+            case 'o':
+                printf("output file name is %s\n", optarg);
+                *outputFilename = optarg;
+                if (setOutput) {
+                    printf("you specify output file name twice, program will exit\n");
+                    exit(0);
+                }
+                setOutput = true;
+                break;
+            case 'O':
+                printf("options is %s\n", optarg);
+                *options = optarg;
+                if (setOptions) {
+                    printf("you specify compiling options twice, program will exit\n");
+                    exit(0);
+                }
+                setOptions = true;
+                break;
+            default:
+                printf("unsupported option: %c\n", ch);
+        }
+    }
+    if (!setInput) {
+        printf("you didn't specify the input cl file name, program will exit\n");
+        exit(0);
+    }
+    if (!setOutput) {
+        printf("you didn't specify the output file name, program will exit\n");
+        exit(0);
+    }
+    if (!setOptions) {
+        printf("you didn't specify the options for compiling the cl file, default is empty\n");
+        *options = (char *)"";
+    }
+}
+
+int main(I32 argc, I8 *argv[])
+{
+    if (1 == argc) {
+        printHelp();
+        return 0;
+    }
+
+    I8 *FLAGS_inputFilename;
+    I8 *FLAGS_outputFilename;
+    I8 *FLAGS_options;
+    parseCommandLine(argc, argv, &FLAGS_inputFilename, &FLAGS_outputFilename, &FLAGS_options);
+
+    GCLHandle_t handle;
+    CHECK_STATUS(gcl_create_handle(&handle));
+    U32 imageLen = 0;
+#ifdef CL_VERSION_1_2
+    imageLen = strlen(imagesource);
+#endif
+    U32 half16Len = strlen(half16source);
+    U32 clcodeLen = 0;
+    bool FileStatus = GetFileLength(FLAGS_inputFilename, &clcodeLen);
+    if (!FileStatus) {
+        printf("get file length failed\n");
+        return 0;
+    }
+    U32 srcLen = imageLen + half16Len + clcodeLen;
+    I8 *source = new I8[srcLen];
+#ifdef CL_VERSION_1_2
+    memcpy(source, imagesource, imageLen);
+#endif
+    memcpy(source + imageLen, half16source, half16Len);
+    FileStatus = LoadBinFile(FLAGS_inputFilename, source + imageLen + half16Len, clcodeLen);
+    if (!FileStatus) {
+        printf("load bin file failed\n");
+        delete[] source;
+        return 0;
+    }
+
+    Program program;
+    U32 numKernel = 1;
+    Kernel kernel;
+    U32 size = 0;
+    U8 *binary;
+
+    CHECK_STATUS(gcl_produce_program_kernel_with_source(
+        handle, &srcLen, source, FLAGS_options, &program, numKernel, &kernel));
+    CHECK_STATUS(gcl_get_program_info(program, &binary, &size));
+    FileStatus = StoreToBinFile(FLAGS_outputFilename, size, binary);
+    if (!FileStatus) {
+        printf("store bin file failed\n");
+    }
+    free(binary);
+    delete[] source;
+    CHECK_STATUS(release_program(program));
+    CHECK_STATUS(release_kernel(kernel));
+    gcl_destroy_handle(handle);
+}
diff --git a/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp b/common/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp
similarity index 76%
rename from gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp
rename to common/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp
index d0db4ec5..3b73ffba 100644
--- a/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp
+++ b/common/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp
@@ -1,18 +1,16 @@
 // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
 #include <stdio.h>
 #include <string.h>
 #include <string>
@@ -20,7 +18,8 @@
 #include <sstream>
 #include <fstream>
 
-int main(int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
     char *binName;
     char *cppName;
     char *charName;
@@ -28,29 +27,29 @@
     std::string binFile;
     std::string cppFile;
 
-    if(argc == 3) {
+    if (argc == 3) {
         binName = argv[1];
         binFile = binName;
         charName = strtok(binName, ".");
         cppFile = std::string(charName) + ".cpp";
         int len = strlen(charName);
-        for(int i = len - 1; i > -1 ; --i) {
-            if(charName[i] == '/') {
-                charName = &charName[i+1];
+        for (int i = len - 1; i > -1; --i) {
+            if (charName[i] == '/') {
+                charName = &charName[i + 1];
                 break;
             }
         }
        binMapName = argv[2];
-    } else if(argc == 4) {
+    } else if (argc == 4) {
        binName = argv[1];
        binFile = binName;
        cppName = argv[2];
        cppFile = cppName;
        charName = strtok(cppName, ".");
        int len = strlen(charName);
-        for(int i = len - 1; i > -1 ; --i) {
-            if(charName[i] == '/') {
-                charName = &charName[i+1];
+        for (int i = len - 1; i > -1; --i) {
+            if (charName[i] == '/') {
+                charName = &charName[i + 1];
                break;
            }
        }
@@ -60,7 +59,7 @@
     }
 
     FILE *fpbin = fopen(binFile.c_str(), "rb");
-    if(fpbin == NULL) {
+    if (fpbin == NULL) {
         printf("file %s open error\n", binFile.c_str());
         return 1;
     }
@@ -76,17 +75,19 @@
     templen << filelen;
     std::string filelen_st = templen.str();
 
-    std::string str = "#include \"inline_" + std::string(binMapName) + ".h\"\n\nCU32 " + std::string(charName) + "_len = " + filelen_st + ";\nCU8 " + std::string(charName) + "[] = {";
+    std::string str = "#include \"inline_" + std::string(binMapName) + ".h\"\n\nCU32 " +
+        std::string(charName) + "_len = " + filelen_st + ";\nCU8 " + std::string(charName) +
+        "[] = {";
 
     unsigned char charRead;
     std::string appendBuf;
 
-    for(int i = 0 ;i < filelen ; i++ ) {
+    for (int i = 0; i < filelen; i++) {
         appendBuf.clear();
-        if(i % 20 == 0) {
+        if (i % 20 == 0) {
             appendBuf += "\n";
         }
-        if(1 != fread(&charRead, 1, 1, fpbin)) {
+        if (1 != fread(&charRead, 1, 1, fpbin)) {
             printf("file %s read error\n", binFile.c_str());
             fclose(fpbin);
             return 1;
@@ -95,9 +96,8 @@
         sprintf(tempstr, "0x%02x", charRead);
         appendBuf += std::string(tempstr);
 
-        if(i == filelen -1) {
-        }
-        else if(i % 20 == 19) {
+        if (i == filelen - 1) {
+        } else if (i % 20 == 19) {
             appendBuf += ",";
         } else {
             appendBuf += ", ";
@@ -108,7 +108,7 @@
     str += "};";
 
     std::ofstream file;
-    file.open (cppFile.c_str());
+    file.open(cppFile.c_str());
     file << str;
     file.close();
diff --git a/common/gcl/tools/kernel_lib_compile/sh/adbDeviceNum.sh b/common/gcl/tools/kernel_lib_compile/sh/adbDeviceNum.sh
new file mode 100644
index 00000000..da1d2da3
--- /dev/null
+++ b/common/gcl/tools/kernel_lib_compile/sh/adbDeviceNum.sh
@@ -0,0 +1,12 @@
+# Collect attached devices from `adb devices`; after word-splitting, every other
+# array entry is the literal "device" status word, so drop the odd entries and
+# keep only the serial numbers.
+adbDeviceNum=($(adb devices | sed 's/\r//' | grep ".device$"))
+i=0
+length=${#adbDeviceNum[@]}
+while [ "$i" -lt "$length" ]; do
+    if ((i%2!=0)); then
+        unset adbDeviceNum[i]
+    fi
+    ((i++))
+done
+# Override the detected list with the fixed CI device list.
+adbDeviceNum=(E5B0119506000260 GCL5T19822000030)
diff --git a/common/gcl/tools/kernel_lib_compile/sh/buildKernelBin.sh b/common/gcl/tools/kernel_lib_compile/sh/buildKernelBin.sh
new file mode 100644
index 00000000..faaa49bc
--- /dev/null
+++ b/common/gcl/tools/kernel_lib_compile/sh/buildKernelBin.sh
@@ -0,0 +1,56 @@
+#build kernel bin on device#
+#if devices name are the same, the build will only execute once#
+
+function push_directory() {
+    # Push files one by one; pushing a whole directory is not portable across adb versions.
+    for file in $(ls $1)
+    do
+        echo "Pushing ${file} to $2"
+        adb -s ${dNum} push $1/${file} $2
+    done
+}
+
+index=1
+for dNum in "${adbDeviceNum[@]}"; do
+    adb -s ${dNum} shell "rm -rf ${kernelBuildPath}"
+    adb -s ${dNum} shell "mkdir ${kernelBuildPath}"
+    adb -s ${dNum} push gcl_device_name ${kernelBuildPath}
+    adb -s ${dNum} shell "cd ${kernelBuildPath} && chmod +x gcl_device_name && ./gcl_device_name"
+    adb -s ${dNum} shell "cd ${kernelBuildPath} && cat ${deviceNameFile} >> ${dNum}.dn"
+    adb -s ${dNum} pull ${kernelBuildPath}/${dNum}.dn ${namePath}
+    dname=$(awk '{print $1}' ${namePath}/${dNum}.dn)
+    deviceNamesAll[$index]="${dname}"
+    dnameS=0
+    for ((j=1;j<index;j++)); do
+        if [ "${deviceNamesAll[$j]}" == "${dname}" ]; then
+            dnameS=1
+        fi
+    done
+    if [ ${dnameS} == 0 ]; then
+        push_directory ${clPath} ${kernelBuildPath}
+        adb -s ${dNum} push gcl_binary ${kernelBuildPath}
+        adb -s ${dNum} shell "cd ${kernelBuildPath} && chmod +x gcl_binary"
+        for shFile in ${shPath}/compile/*.sh; do
+            adb -s ${dNum} push ${shFile} ${kernelBuildPath}
+            adb -s ${dNum} shell "cd ${kernelBuildPath} && sh ${shFile##*/} > tmp.sh && chmod +x tmp.sh && ./tmp.sh"
+        done
+        adb -s ${dNum} shell "cd ${kernelBuildPath} && mkdir bin"
+        adb -s ${dNum} shell "cd ${kernelBuildPath} && cp *.bin ${kernelBuildPath}/bin"
+        adb -s ${dNum} pull ${kernelBuildPath}/bin ${binPath}/${dname}
+        if [ -d ${binPath}/${dname}/bin ]; then
+            mv ${binPath}/${dname}/bin/* ${binPath}/${dname}
+            rm -r ${binPath}/${dname}/bin
+            echo ${binPath}/${dname}
+        fi
+        adb -s ${dNum} shell "rm -rf ${kernelBuildPath}"
+        echo ${dname} >> ${dNameFile}
+    fi
+    index=`expr $index + 1`
+done
diff --git a/gcl/tools/kernel_lib_compile/sh/buildKernelLibConfig.sh b/common/gcl/tools/kernel_lib_compile/sh/buildKernelLibConfig.sh
similarity index 84%
rename from gcl/tools/kernel_lib_compile/sh/buildKernelLibConfig.sh
rename to common/gcl/tools/kernel_lib_compile/sh/buildKernelLibConfig.sh
index 50cf821a..38fb3254 100644
--- a/gcl/tools/kernel_lib_compile/sh/buildKernelLibConfig.sh
+++ b/common/gcl/tools/kernel_lib_compile/sh/buildKernelLibConfig.sh
@@ -4,8 +4,9 @@ workPath=$(pwd)
 #set file.cl dir#
 tensorCLPath=${BOLT_ROOT}/tensor_computing/src/gpu/mali/cl
 sampleCLPath=${BOLT_ROOT}/gcl/tools/gcl_sample/cl
-CLPath=(${tensorCLPath} ${sampleCLPath}) -deviceNameFile=deviceBinmapNameFile +imageCLPath=${BOLT_ROOT}/image/src/gpu/mali/cl +CLPath=(${tensorCLPath} ${sampleCLPath} ${imageCLPath}) +deviceNameFile=deviceNameFile #get kernel compile option sh# shPath=${workPath}/sh @@ -20,7 +21,8 @@ srcPath=${workPath}/src incPath=${workPath}/include clPath=${workPath}/cl namePath=${workPath}/name -kernelBuildPath=/data/local/tmp/boltKernelBuild +TimeFlag=`adb -s ${adbDeviceNum[0]} shell "date +%s_%N"` +kernelBuildPath=/data/local/tmp/${TimeFlag} bin2charPath=${workPath}/kernel_bin2char kernelBinPath=${workPath}/kernel_bin deviceNamePath=${workPath}/device_name @@ -46,7 +48,8 @@ cmake_options="-DUSE_CROSS_COMPILE=ON \ -DUSE_LLVM_CLANG=ON \ -DUSE_MALI=ON \ -DUSE_DYNAMIC_LIBRARY=ON \ - -DBUILD_TEST=ON " + -DBUILD_TEST=ON \ + -DUSE_THREAD_SAFE=OFF" cmake .. ${cmake_options} make -j33 cp gcl_binary ${workPath} diff --git a/gcl/tools/kernel_lib_compile/sh/compile/activation.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/activation.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/activation.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/activation.sh diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/argmax_x.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/argmax_x.sh new file mode 100644 index 00000000..dae036ea --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/argmax_x.sh @@ -0,0 +1,12 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "argmax_x.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}.bin --options=\"${copt}\" + echo ./gcl_binary --input=$file --output=${file%.*}_index.bin --options=\"${copt} -DUSE_INDEX\" + fi + fi + done + + + diff --git a/gcl/tools/kernel_lib_compile/sh/compile/bilateral_slice_apply_c12.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/bilateral_slice_apply_c12.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/bilateral_slice_apply_c12.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/bilateral_slice_apply_c12.sh diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/channel_resize.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/channel_resize.sh new file mode 100644 index 00000000..b277c214 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/channel_resize.sh @@ -0,0 +1,14 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "channel_resize.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}.bin --options=\"${copt}\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw.bin --options=\"${copt} -DINPUT_NCHW -DOUTPUT_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_ncwhc4.bin --options=\"${copt} -DINPUT_NCHW -DOUTPUT_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_nchw.bin --options=\"${copt} -DINPUT_NCWHC4 -DOUTPUT_NCHW\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/common.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/common.sh new file mode 100644 index 00000000..16be6935 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/common.sh @@ -0,0 +1,22 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + clFileName=${file%.*} + speConfig=0 + for filesh in *.sh + do + if [ "${filesh##*.}"x = "sh"x ];then + shFileName=${filesh%.*} + if [ "$clFileName" = "$shFileName" ];then + speConfig=1; + fi + fi + done + if [ $speConfig -eq 0 ]; then + echo 
./gcl_binary --input=$file --output=${file%.*}.bin --options=\"${copt}\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/concat.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/concat.sh new file mode 100644 index 00000000..85f82916 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/concat.sh @@ -0,0 +1,26 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "concat.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_w1.bin --options=\"${copt} -D N=1 -D AXIS_W\" + echo ./gcl_binary --input=$file --output=${file%.*}_w2.bin --options=\"${copt} -D N=2 -D AXIS_W\" + echo ./gcl_binary --input=$file --output=${file%.*}_w3.bin --options=\"${copt} -D N=3 -D AXIS_W\" + echo ./gcl_binary --input=$file --output=${file%.*}_w4.bin --options=\"${copt} -D N=4 -D AXIS_W\" + echo ./gcl_binary --input=$file --output=${file%.*}_h1.bin --options=\"${copt} -D N=1 -D AXIS_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h2.bin --options=\"${copt} -D N=2 -D AXIS_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h3.bin --options=\"${copt} -D N=3 -D AXIS_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h4.bin --options=\"${copt} -D N=4 -D AXIS_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_c1.bin --options=\"${copt} -D N=1 -D AXIS_C\" + echo ./gcl_binary --input=$file --output=${file%.*}_c2.bin --options=\"${copt} -D N=2 -D AXIS_C\" + echo ./gcl_binary --input=$file --output=${file%.*}_c3.bin --options=\"${copt} -D N=3 -D AXIS_C\" + echo ./gcl_binary --input=$file --output=${file%.*}_c4.bin --options=\"${copt} -D N=4 -D AXIS_C\" + echo ./gcl_binary --input=$file --output=${file%.*}_nonalign_c_p1_1.bin --options=\"${copt} -D N=1 -D AXIS_C -D NON_ALIGN_C\" + echo ./gcl_binary --input=$file --output=${file%.*}_nonalign_c_p1_2.bin --options=\"${copt} -D N=2 -D AXIS_C -D NON_ALIGN_C\" + echo ./gcl_binary --input=$file --output=${file%.*}_nonalign_c_p1_3.bin --options=\"${copt} -D N=3 -D AXIS_C -D NON_ALIGN_C\" + echo ./gcl_binary --input=$file --output=${file%.*}_nonalign_c_p1_4.bin --options=\"${copt} -D N=4 -D AXIS_C -D NON_ALIGN_C\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s1.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s1.sh new file mode 100644 index 00000000..cc053ba3 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s1.sh @@ -0,0 +1,150 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "conv_depthwise_s1.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_37.bin 
--options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + + + echo ./gcl_binary --input=$file --output=${file%.*}_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_55.bin 
--options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_71.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=4 -D Fsq=49 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_72.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=5 -D Fsq=49 
-DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_73.bin --options=\"${copt} -D F=7 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=49 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_74.bin --options=\"${copt} -D F=7 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=49 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_75.bin --options=\"${copt} -D F=7 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=49 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_76.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=49 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_77.bin --options=\"${copt} -D F=7 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=49 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_78.bin --options=\"${copt} -D F=7 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=49 -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_71.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_72.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_73.bin --options=\"${copt} -D F=7 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=49 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_74.bin --options=\"${copt} -D F=7 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=49 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_75.bin --options=\"${copt} -D F=7 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=49 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_76.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=49 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_77.bin --options=\"${copt} -D F=7 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=49 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_78.bin --options=\"${copt} -D F=7 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=49 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_71.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_72.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_73.bin --options=\"${copt} -D F=7 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=49 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_74.bin --options=\"${copt} -D F=7 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=49 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_75.bin --options=\"${copt} -D F=7 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=49 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_76.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=49 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_77.bin --options=\"${copt} -D F=7 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=49 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file 
--output=${file%.*}_relu6_78.bin --options=\"${copt} -D F=7 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=49 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D 
Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + + + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -DUSE_NCWH -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D 
Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\"
+
+
+                echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+            fi
+        fi
+    done
+
+
diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s2.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s2.sh
new file mode 100644
index 00000000..eb79dea9
--- /dev/null
+++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s2.sh
@@ -0,0 +1,117 @@
+for file in *
+    do
+        if [ "${file##*.}"x = "cl"x ];then
+            if [[ "${file}" == "conv_depthwise_s2.cl" ]];then
+                echo ./gcl_binary --input=$file --output=${file%.*}_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_HALF\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_HALF\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_HALF\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -DUSE_HALF -DBASIC_REG\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -DUSE_HALF -DBASIC_REG\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_HALF -DBASIC_REG\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_HALF -DBASIC_REG\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_HALF -DBASIC_REG\"
+
+                echo ./gcl_binary --input=$file --output=${file%.*}_relu_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_HALF -DUSE_RELU\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_relu_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_HALF -DUSE_RELU\"
+                echo ./gcl_binary --input=$file
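+                # Option legend for the conv_depthwise_s* variants (inferred from the
+                # generated names; these meanings are editorial assumptions, not taken
+                # from the .cl sources):
+                #   F  = filter width/height, Fsq = F*F
+                #   ON = outputs computed per work-item
+                #   IN / LN = input/load register counts, UN = unroll count
+                #   BASIC_REG (spelled BASICE_REG by the stride-1 scripts) gates a
+                #       register-constrained fallback used for the larger ON variants
+                #   USE_RELU / USE_RELU6 fuse the activation; USE_NCWH selects NCWH output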
--output=${file%.*}_relu_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_HALF 
-DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 
-DUSE_NCWH -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_51.bin 
--options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 
-DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + fi + fi + done + + + diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_trans_fltbuf.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_trans_fltbuf.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_trans_fltbuf.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_trans_fltbuf.sh diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_3d_sw1_nchw_to_ncwhc4.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_3d_sw1_nchw_to_ncwhc4.sh new file mode 100644 index 00000000..162e901c --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_3d_sw1_nchw_to_ncwhc4.sh @@ -0,0 +1,11 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "conv_direct_3d_sw1_nchw_to_ncwhc4.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_relu_752.bin --options=\"${copt} -D FWH=7 -D FT=5 -D FWHT=245 -D ON=2 -DUSE_RELU\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_3d_sw2_nchw_to_ncwhc4.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_3d_sw2_nchw_to_ncwhc4.sh new file mode 100644 index 00000000..7fc02394 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_3d_sw2_nchw_to_ncwhc4.sh @@ -0,0 +1,11 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "conv_direct_3d_sw2_nchw_to_ncwhc4.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_relu_755.bin --options=\"${copt} -D FWH=7 -D FT=5 -D FWHT=245 -D ON=5 -DUSE_RELU\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1.sh new file mode 100644 index 00000000..af751df8 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1.sh @@ -0,0 +1,311 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "conv_direct_s1.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary 
--input=$file --output=${file%.*}_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_152.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_134.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=4 -DUSE_HALF\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_114.bin --options=\"${copt} -D 
F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_134.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_134.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU6\" + + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D 
LN=5 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_134.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_GELU\" + + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo 
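+                # The eltwise4 builds (USE_ELTWISE_NCWHC4) are assumed to fuse an
+                # element-wise add of a second NCWHC4-layout tensor into the convolution
+                # output; their F/ON/IN/LN/KN options read as in the plain builds above.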
./gcl_binary --input=$file --output=${file%.*}_eltwise4_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_134.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + + echo ./gcl_binary --input=$file --output=${file%.*}_h_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DREUSE_H\" + + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU -DREUSE_H\" + + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu6_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu6_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu6_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6 -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu6_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6 -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu6_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU6 -DREUSE_H\" + + echo ./gcl_binary --input=$file --output=${file%.*}_h_gelu_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_gelu_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU -DREUSE_H\" + echo ./gcl_binary 
--input=$file --output=${file%.*}_h_gelu_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_GELU -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_gelu_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_GELU -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_gelu_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_GELU -DREUSE_H\" + + echo ./gcl_binary --input=$file --output=${file%.*}_311.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_321.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_331.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_341.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_351.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_361.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_371.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_381.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_311.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_321.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_331.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_341.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_351.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_361.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_371.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_381.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_332.bin 
--options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_332.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_332.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU6\" +# echo ./gcl_binary --input=$file --output=${file%.*}_314.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=4 -DUSE_HALF\" +# echo ./gcl_binary --input=$file --output=${file%.*}_324.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=4 -DUSE_HALF\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_314.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=4 -DUSE_HALF -DUSE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_324.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=4 -DUSE_HALF -DUSE_RELU\" + + echo ./gcl_binary --input=$file --output=${file%.*}_211.bin --options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_221.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=3 -D LN=3 -D UN=2 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_231.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_241.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_251.bin --options=\"${copt} -D F=2 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=4 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_261.bin --options=\"${copt} -D F=2 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_271.bin --options=\"${copt} -D F=2 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=4 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_281.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=4 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file 
--output=${file%.*}_relu_211.bin --options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_221.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=3 -D LN=3 -D UN=2 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_231.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_241.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_251.bin --options=\"${copt} -D F=2 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_261.bin --options=\"${copt} -D F=2 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_271.bin --options=\"${copt} -D F=2 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_281.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_212.bin --options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_222.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=3 -D LN=3 -D UN=2 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_232.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_242.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_212.bin --options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_222.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=3 -D LN=3 -D UN=2 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_232.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_242.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU\" + + + + + echo ./gcl_binary --input=$file --output=${file%.*}_411.bin --options=\"${copt} -D F=4 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=16 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_421.bin --options=\"${copt} -D F=4 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=16 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_431.bin --options=\"${copt} -D F=4 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=16 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_441.bin --options=\"${copt} -D F=4 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=16 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_451.bin --options=\"${copt} -D F=4 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=16 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_461.bin --options=\"${copt} -D F=4 -D ON=6 -D IN=6 -D LN=5 
-D UN=5 -D Fsq=16 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_471.bin --options=\"${copt} -D F=4 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=16 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_481.bin --options=\"${copt} -D F=4 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=16 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_411.bin --options=\"${copt} -D F=4 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU \" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_421.bin --options=\"${copt} -D F=4 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU \" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_431.bin --options=\"${copt} -D F=4 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_441.bin --options=\"${copt} -D F=4 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_451.bin --options=\"${copt} -D F=4 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_461.bin --options=\"${copt} -D F=4 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_471.bin --options=\"${copt} -D F=4 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_481.bin --options=\"${copt} -D F=4 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_411.bin --options=\"${copt} -D F=4 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU6 \" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_421.bin --options=\"${copt} -D F=4 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU6 \" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_431.bin --options=\"${copt} -D F=4 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_441.bin --options=\"${copt} -D F=4 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_451.bin --options=\"${copt} -D F=4 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_461.bin --options=\"${copt} -D F=4 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_471.bin --options=\"${copt} -D F=4 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_481.bin --options=\"${copt} -D F=4 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_412.bin --options=\"${copt} -D F=4 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=16 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_422.bin --options=\"${copt} -D F=4 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=16 -D 
KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_432.bin --options=\"${copt} -D F=4 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=16 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_442.bin --options=\"${copt} -D F=4 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=16 -D KN=2 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_412.bin --options=\"${copt} -D F=4 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_422.bin --options=\"${copt} -D F=4 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_432.bin --options=\"${copt} -D F=4 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_442.bin --options=\"${copt} -D F=4 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_412.bin --options=\"${copt} -D F=4 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_422.bin --options=\"${copt} -D F=4 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_432.bin --options=\"${copt} -D F=4 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_442.bin --options=\"${copt} -D F=4 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + + + + + echo ./gcl_binary --input=$file --output=${file%.*}_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_531.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_541.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_551.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU \" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU \" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_531.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo 
./gcl_binary --input=$file --output=${file%.*}_relu_541.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_551.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 \" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 \" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_531.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_541.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_551.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_542.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_542.bin 
--options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_542.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_514.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=4 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_524.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=4 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_514.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=4 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_524.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=4 -DUSE_HALF -DUSE_RELU\" + + + echo ./gcl_binary --input=$file --output=${file%.*}_711.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_721.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_731.bin --options=\"${copt} -D F=7 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=49 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_741.bin --options=\"${copt} -D F=7 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=49 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_751.bin --options=\"${copt} -D F=7 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=49 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_761.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=49 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_771.bin --options=\"${copt} -D F=7 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=49 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_781.bin --options=\"${copt} -D F=7 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=49 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_711.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU \" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_721.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU \" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_731.bin --options=\"${copt} -D F=7 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_741.bin --options=\"${copt} -D F=7 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_751.bin --options=\"${copt} -D F=7 -D 
ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_761.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_771.bin --options=\"${copt} -D F=7 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_781.bin --options=\"${copt} -D F=7 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_711.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU6 \" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_721.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU6 \" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_731.bin --options=\"${copt} -D F=7 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_741.bin --options=\"${copt} -D F=7 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_751.bin --options=\"${copt} -D F=7 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_761.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_771.bin --options=\"${copt} -D F=7 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_781.bin --options=\"${copt} -D F=7 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_712.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_722.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_732.bin --options=\"${copt} -D F=7 -D ON=3 -D IN=9 -D LN=9 -D UN=8 -D Fsq=49 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_742.bin --options=\"${copt} -D F=7 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=49 -D KN=2 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_712.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_722.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_732.bin --options=\"${copt} -D F=7 -D ON=3 -D IN=9 -D LN=9 -D UN=8 -D Fsq=49 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_742.bin --options=\"${copt} -D F=7 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=49 -D KN=2 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_712.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -D KN=2 -DUSE_HALF 
-DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_722.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_732.bin --options=\"${copt} -D F=7 -D ON=3 -D IN=9 -D LN=9 -D UN=8 -D Fsq=49 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_742.bin --options=\"${copt} -D F=7 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=49 -D KN=2 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_714.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -D KN=4 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_724.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -D KN=4 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_714.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -D KN=4 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_724.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -D KN=4 -DUSE_HALF -DUSE_RELU\" + + + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_fn_spe.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_fn_spe.sh new file mode 100644 index 00000000..4e76a6c7 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_fn_spe.sh @@ -0,0 +1,44 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "conv_direct_s1_fn_spe.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_18.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_18.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_18.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_18.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -DUSE_NCHW -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_nchw_18.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -DUSE_RELU -DUSE_NCHW -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_nchw_18.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -DUSE_RELU6 -DUSE_NCHW -DUSE_HALF\" + + echo ./gcl_binary --input=$file --output=${file%.*}_28.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=9 -D LN=9 -D UN=8 -D Fsq=4 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_28.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=9 -D LN=9 -D UN=8 -D Fsq=4 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_28.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=9 -D LN=9 -D UN=8 -D Fsq=4 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_28.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=9 -D LN=9 -D UN=8 -D Fsq=4 -DUSE_NCHW -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_nchw_28.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=9 -D LN=9 -D UN=8 -D Fsq=4 -DUSE_RELU -DUSE_NCHW -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_nchw_28.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=9 -D LN=9 -D UN=8 -D Fsq=4 -DUSE_RELU6 -DUSE_NCHW -DUSE_HALF\" + + echo ./gcl_binary 
--input=$file --output=${file%.*}_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=10 -D LN=10 -D UN=9 -D Fsq=9 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=10 -D LN=10 -D UN=9 -D Fsq=9 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=10 -D LN=10 -D UN=9 -D Fsq=9 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=10 -D LN=10 -D UN=9 -D Fsq=9 -DUSE_NCHW -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_nchw_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=10 -D LN=10 -D UN=9 -D Fsq=9 -DUSE_RELU -DUSE_NCHW -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_nchw_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=10 -D LN=10 -D UN=9 -D Fsq=9 -DUSE_RELU6 -DUSE_NCHW -DUSE_HALF\" + + echo ./gcl_binary --input=$file --output=${file%.*}_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=12 -D LN=12 -D UN=11 -D Fsq=25 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=12 -D LN=12 -D UN=11 -D Fsq=25 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=12 -D LN=12 -D UN=11 -D Fsq=25 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=12 -D LN=12 -D UN=11 -D Fsq=25 -DUSE_NCHW -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_nchw_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=12 -D LN=12 -D UN=11 -D Fsq=25 -DUSE_RELU -DUSE_NCHW -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_nchw_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=12 -D LN=12 -D UN=11 -D Fsq=25 -DUSE_RELU6 -DUSE_NCHW -DUSE_HALF\" + + echo ./gcl_binary --input=$file --output=${file%.*}_76.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=12 -D LN=12 -D UN=11 -D Fsq=49 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_76.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=12 -D LN=12 -D UN=11 -D Fsq=49 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_76.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=12 -D LN=12 -D UN=11 -D Fsq=49 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_76.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=12 -D LN=12 -D UN=11 -D Fsq=49 -DUSE_NCHW -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_nchw_76.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=12 -D LN=12 -D UN=11 -D Fsq=49 -DUSE_RELU -DUSE_NCHW -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_nchw_76.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=12 -D LN=12 -D UN=11 -D Fsq=49 -DUSE_RELU6 -DUSE_NCHW -DUSE_HALF\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_nchw_to_ncwhc4.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_nchw_to_ncwhc4.sh new file mode 100644 index 00000000..bb4898f0 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_nchw_to_ncwhc4.sh @@ -0,0 +1,22 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "conv_direct_s1_nchw_to_ncwhc4.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_18.bin --options=\"${copt} -D F=1 -D ON=8 -D Fsq=1 -DUSE_HALF\" + echo 
./gcl_binary --input=$file --output=${file%.*}_36.bin --options=\"${copt} -D F=3 -D ON=6 -D Fsq=9 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_54.bin --options=\"${copt} -D F=5 -D ON=4 -D Fsq=25 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_72.bin --options=\"${copt} -D F=7 -D ON=2 -D Fsq=49 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_18.bin --options=\"${copt} -D F=1 -D ON=8 -D Fsq=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_36.bin --options=\"${copt} -D F=3 -D ON=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_54.bin --options=\"${copt} -D F=5 -D ON=4 -D Fsq=25 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_72.bin --options=\"${copt} -D F=7 -D ON=2 -D Fsq=49 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_18.bin --options=\"${copt} -D F=1 -D ON=8 -D Fsq=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_36.bin --options=\"${copt} -D F=3 -D ON=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_54.bin --options=\"${copt} -D F=5 -D ON=4 -D Fsq=25 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_72.bin --options=\"${copt} -D F=7 -D ON=2 -D Fsq=49 -DUSE_HALF -DUSE_RELU6\" + fi + fi + done + + + diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_spe_f1c3k1.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_spe_f1c3k1.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_spe_f1c3k1.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_spe_f1c3k1.sh diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2.sh new file mode 100644 index 00000000..b17f648e --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2.sh @@ -0,0 +1,213 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "conv_direct_s2.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file 
--output=${file%.*}_relu_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 
-DUSE_HALF -DUSE_RELU\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_134.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=4 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_134.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\" + + + echo ./gcl_binary --input=$file --output=${file%.*}_211.bin --options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_221.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=4 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_231.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=4 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_241.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=4 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_251.bin --options=\"${copt} -D F=2 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=4 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_261.bin --options=\"${copt} -D F=2 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_271.bin --options=\"${copt} -D F=2 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=4 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_281.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=4 -D KN=1 -DUSE_HALF -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_211.bin --options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_221.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_231.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_241.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=4 -D KN=1 -DUSE_HALF 
-DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_251.bin --options=\"${copt} -D F=2 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_261.bin --options=\"${copt} -D F=2 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_271.bin --options=\"${copt} -D F=2 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_281.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_211.bin --options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_221.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_231.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_241.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_251.bin --options=\"${copt} -D F=2 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_261.bin --options=\"${copt} -D F=2 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_271.bin --options=\"${copt} -D F=2 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_281.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_212.bin --options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_222.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_232.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_242.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_HALF -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_212.bin --options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_222.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_232.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_242.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_212.bin 
--options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_222.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_232.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_242.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + + + + echo ./gcl_binary --input=$file --output=${file%.*}_311.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_321.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_331.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_341.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_351.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_361.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_371.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_381.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_311.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_321.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_331.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_341.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_351.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_361.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_371.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_381.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_311.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file 
--output=${file%.*}_relu6_321.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_331.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_341.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_351.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_361.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_371.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_381.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_332.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_332.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_332.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" +# echo ./gcl_binary --input=$file --output=${file%.*}_314.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=4 -DUSE_HALF\" +# echo ./gcl_binary --input=$file --output=${file%.*}_324.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=4 -DUSE_HALF\" +# echo ./gcl_binary 
--input=$file --output=${file%.*}_relu_314.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=4 -DUSE_HALF -DUSE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_324.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=4 -DUSE_HALF -DUSE_RELU\" + + + + echo ./gcl_binary --input=$file --output=${file%.*}_411.bin --options=\"${copt} -D F=4 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=16 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_421.bin --options=\"${copt} -D F=4 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=16 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_431.bin --options=\"${copt} -D F=4 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=16 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_441.bin --options=\"${copt} -D F=4 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=16 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_451.bin --options=\"${copt} -D F=4 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=16 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_461.bin --options=\"${copt} -D F=4 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=16 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_471.bin --options=\"${copt} -D F=4 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=16 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_481.bin --options=\"${copt} -D F=4 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=16 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_411.bin --options=\"${copt} -D F=4 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_421.bin --options=\"${copt} -D F=4 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_431.bin --options=\"${copt} -D F=4 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_441.bin --options=\"${copt} -D F=4 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_451.bin --options=\"${copt} -D F=4 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_461.bin --options=\"${copt} -D F=4 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_471.bin --options=\"${copt} -D F=4 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_481.bin --options=\"${copt} -D F=4 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_412.bin --options=\"${copt} -D F=4 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_422.bin --options=\"${copt} -D F=4 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_432.bin --options=\"${copt} -D F=4 -D ON=3 -D IN=3 -D LN=2 
-D UN=2 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_442.bin --options=\"${copt} -D F=4 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + + + + + echo ./gcl_binary --input=$file --output=${file%.*}_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_531.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_541.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_551.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_531.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_541.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_551.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_531.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + 
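# The .bin suffix encodes each variant as [relu[6]_]<F><ON><KN>: F is the filter +# size (Fsq = F*F in this script), ON the outputs computed per work-item, and KN +# the output-channel unrolling factor, so the _relu6_531 build above is the F=5, +# ON=3, KN=1 kernel with fused ReLU6. IN/LN/UN appear to size the register-cached +# input window, and BASIC_REG selects the reduced-register fallback path. +# Hypothetical standalone use (assumes gcl_binary is on PATH, conv_direct_s2.cl +# sits in the working directory, and copt is exported by the calling build script): +#     sh conv_direct_s2.sh > cmds.sh && sh cmds.sh +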
echo ./gcl_binary --input=$file --output=${file%.*}_relu6_541.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_551.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=2 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_542.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_542.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_542.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" +# echo ./gcl_binary --input=$file --output=${file%.*}_514.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=4 -DUSE_HALF\" +# echo ./gcl_binary --input=$file --output=${file%.*}_524.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=4 -DUSE_HALF\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_514.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=4 -DUSE_HALF -DUSE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_524.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 
-D KN=4 -DUSE_HALF -DUSE_RELU\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2_nchw_to_ncwhc4.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2_nchw_to_ncwhc4.sh new file mode 100644 index 00000000..b1f03360 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2_nchw_to_ncwhc4.sh @@ -0,0 +1,22 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "conv_direct_s2_nchw_to_ncwhc4.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_18.bin --options=\"${copt} -D F=1 -D ON=8 -D Fsq=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_37.bin --options=\"${copt} -D F=3 -D ON=7 -D Fsq=9 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_56.bin --options=\"${copt} -D F=5 -D ON=6 -D Fsq=25 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_75.bin --options=\"${copt} -D F=7 -D ON=5 -D Fsq=49 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_18.bin --options=\"${copt} -D F=1 -D ON=8 -D Fsq=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_37.bin --options=\"${copt} -D F=3 -D ON=7 -D Fsq=9 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_56.bin --options=\"${copt} -D F=5 -D ON=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_75.bin --options=\"${copt} -D F=7 -D ON=5 -D Fsq=49 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_18.bin --options=\"${copt} -D F=1 -D ON=8 -D Fsq=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_37.bin --options=\"${copt} -D F=3 -D ON=7 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_56.bin --options=\"${copt} -D F=5 -D ON=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_75.bin --options=\"${copt} -D F=7 -D ON=5 -D Fsq=49 -DUSE_HALF -DUSE_RELU6\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_spe_fwhs1.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_spe_fwhs1.sh new file mode 100644 index 00000000..4d1249dc --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_spe_fwhs1.sh @@ -0,0 +1,25 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "conv_direct_spe_fwhs1.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_1.bin --options=\"${copt} -D OC=1\" + echo ./gcl_binary --input=$file --output=${file%.*}_2.bin --options=\"${copt} -D OC=2\" + echo ./gcl_binary --input=$file --output=${file%.*}_3.bin --options=\"${copt} -D OC=3\" + echo ./gcl_binary --input=$file --output=${file%.*}_4.bin --options=\"${copt} -D OC=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_8.bin --options=\"${copt} -D OC=8\" + echo ./gcl_binary --input=$file --output=${file%.*}_16.bin --options=\"${copt} -D OC=16\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_4.bin --options=\"${copt} -D OC=4 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_8.bin --options=\"${copt} -D OC=8 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_16.bin --options=\"${copt} -D OC=16 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4.bin --options=\"${copt} -D OC=4 -D USE_RELU\" + echo ./gcl_binary --input=$file 
--output=${file%.*}_relu_8.bin --options=\"${copt} -D OC=8 -D USE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_16.bin --options=\"${copt} -D OC=16 -D USE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4.bin --options=\"${copt} -D OC=4 -D USE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_8.bin --options=\"${copt} -D OC=8 -D USE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_16.bin --options=\"${copt} -D OC=16 -D USE_RELU6\" + fi + fi + done + + + diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_trans_fltbuf.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_trans_fltbuf.sh similarity index 83% rename from gcl/tools/kernel_lib_compile/sh/compile/conv_direct_trans_fltbuf.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_trans_fltbuf.sh index 9c3bed24..5cb4f7be 100644 --- a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_trans_fltbuf.sh +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_trans_fltbuf.sh @@ -3,8 +3,10 @@ for file in * if [ "${file##*.}"x = "cl"x ];then if [[ "${file}" == "conv_direct_trans_fltbuf.cl" ]];then echo ./gcl_binary --input=$file --output=${file%.*}_14.bin --options=\"${copt} -D C=1 -D K=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_41.bin --options=\"${copt} -D C=4 -D K=1\" echo ./gcl_binary --input=$file --output=${file%.*}_44.bin --options=\"${copt} -D C=4 -D K=4\" echo ./gcl_binary --input=$file --output=${file%.*}_48.bin --options=\"${copt} -D C=4 -D K=8\" + echo ./gcl_binary --input=$file --output=${file%.*}_416.bin --options=\"${copt} -D C=4 -D K=16\" echo ./gcl_binary --input=$file --output=${file%.*}_10.bin --options=\"${copt} -D C=1 -D K=0\" echo ./gcl_binary --input=$file --output=${file%.*}_20.bin --options=\"${copt} -D C=2 -D K=0\" echo ./gcl_binary --input=$file --output=${file%.*}_30.bin --options=\"${copt} -D C=3 -D K=0\" diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_wh_s1.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_wh_s1.sh new file mode 100644 index 00000000..4a40a181 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_wh_s1.sh @@ -0,0 +1,143 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "conv_direct_wh_s1.cl" ]];then + # W=4 H=1 + echo ./gcl_binary --input=$file --output=${file%.*}_4111.bin --options=\"${copt} -D W=4 -D H=1 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_4121.bin --options=\"${copt} -D W=4 -D H=1 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_4131.bin --options=\"${copt} -D W=4 -D H=1 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_4141.bin --options=\"${copt} -D W=4 -D H=1 -D ON=4 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_4151.bin --options=\"${copt} -D W=4 -D H=1 -D ON=5 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_4161.bin --options=\"${copt} -D W=4 -D H=1 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_4171.bin --options=\"${copt} -D W=4 -D H=1 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=4 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo 
./gcl_binary --input=$file --output=${file%.*}_4181.bin --options=\"${copt} -D W=4 -D H=1 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=4 -D KN=1 -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4111.bin --options=\"${copt} -D W=4 -D H=1 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=1 -D USE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4121.bin --options=\"${copt} -D W=4 -D H=1 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=1 -D USE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4131.bin --options=\"${copt} -D W=4 -D H=1 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=4 -D KN=1 -D USE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4141.bin --options=\"${copt} -D W=4 -D H=1 -D ON=4 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -D USE_RELU -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4151.bin --options=\"${copt} -D W=4 -D H=1 -D ON=5 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -D USE_RELU -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4161.bin --options=\"${copt} -D W=4 -D H=1 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -D USE_RELU -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4171.bin --options=\"${copt} -D W=4 -D H=1 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=4 -D KN=1 -D USE_RELU -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4181.bin --options=\"${copt} -D W=4 -D H=1 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=4 -D KN=1 -D USE_RELU -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4111.bin --options=\"${copt} -D W=4 -D H=1 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=1 -D USE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4121.bin --options=\"${copt} -D W=4 -D H=1 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=1 -D USE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4131.bin --options=\"${copt} -D W=4 -D H=1 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=4 -D KN=1 -D USE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4141.bin --options=\"${copt} -D W=4 -D H=1 -D ON=4 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -D USE_RELU6 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4151.bin --options=\"${copt} -D W=4 -D H=1 -D ON=5 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -D USE_RELU6 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4161.bin --options=\"${copt} -D W=4 -D H=1 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -D USE_RELU6 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4171.bin --options=\"${copt} -D W=4 -D H=1 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=4 -D KN=1 -D USE_RELU6 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4181.bin --options=\"${copt} -D W=4 -D H=1 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=4 -D KN=1 -D USE_RELU6 -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_4112.bin --options=\"${copt} -D W=4 -D H=1 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_4122.bin --options=\"${copt} -D W=4 -D H=1 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_4132.bin --options=\"${copt} -D W=4 -D 
H=1 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_4142.bin --options=\"${copt} -D W=4 -D H=1 -D ON=4 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=2 -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4112.bin --options=\"${copt} -D W=4 -D H=1 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4122.bin --options=\"${copt} -D W=4 -D H=1 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=2 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4132.bin --options=\"${copt} -D W=4 -D H=1 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=4 -D KN=2 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4142.bin --options=\"${copt} -D W=4 -D H=1 -D ON=4 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=2 -DUSE_RELU -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4112.bin --options=\"${copt} -D W=4 -D H=1 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4122.bin --options=\"${copt} -D W=4 -D H=1 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4132.bin --options=\"${copt} -D W=4 -D H=1 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=4 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4142.bin --options=\"${copt} -D W=4 -D H=1 -D ON=4 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=2 -DUSE_RELU6 -DUSE_HALF -D BASICE_REG\" + + # W=3 H=1 + echo ./gcl_binary --input=$file --output=${file%.*}_3111.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3121.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=3 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3131.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=3 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3141.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3151.bin --options=\"${copt} -D W=3 -D H=1 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=3 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3161.bin --options=\"${copt} -D W=3 -D H=1 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=3 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3171.bin --options=\"${copt} -D W=3 -D H=1 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=3 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3181.bin --options=\"${copt} -D W=3 -D H=1 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=3 -D KN=1 -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3111.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3121.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=3 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3131.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=3 -D KN=1 -DUSE_RELU 
-DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3141.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=1 -DUSE_RELU -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3151.bin --options=\"${copt} -D W=3 -D H=1 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=3 -D KN=1 -DUSE_RELU -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3161.bin --options=\"${copt} -D W=3 -D H=1 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=3 -D KN=1 -DUSE_RELU -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3171.bin --options=\"${copt} -D W=3 -D H=1 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=3 -D KN=1 -DUSE_RELU -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3181.bin --options=\"${copt} -D W=3 -D H=1 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=3 -D KN=1 -DUSE_RELU -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3111.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3121.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=3 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3131.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=3 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3141.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=1 -DUSE_RELU6 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3151.bin --options=\"${copt} -D W=3 -D H=1 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=3 -D KN=1 -DUSE_RELU6 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3161.bin --options=\"${copt} -D W=3 -D H=1 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=3 -D KN=1 -DUSE_RELU6 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3171.bin --options=\"${copt} -D W=3 -D H=1 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=3 -D KN=1 -DUSE_RELU6 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3181.bin --options=\"${copt} -D W=3 -D H=1 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=3 -D KN=1 -DUSE_RELU6 -DUSE_HALF -D BASICE_REG\" + + + echo ./gcl_binary --input=$file --output=${file%.*}_3112.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3122.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=3 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3132.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=3 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3142.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=2 -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3112.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=2 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3122.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=3 -D KN=2 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3132.bin --options=\"${copt} -D W=3 -D H=1 
-D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=3 -D KN=2 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3142.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=2 -DUSE_RELU -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3112.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3122.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=3 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3132.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=3 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3142.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=2 -DUSE_RELU6 -DUSE_HALF -D BASICE_REG\" + + # W=1 H=4 + echo ./gcl_binary --input=$file --output=${file%.*}_1411.bin --options=\"${copt} -D W=1 -D H=4 -D ON=1 -D IN=1 -D LN=1 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1421.bin --options=\"${copt} -D W=1 -D H=4 -D ON=2 -D IN=2 -D LN=2 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1431.bin --options=\"${copt} -D W=1 -D H=4 -D ON=3 -D IN=3 -D LN=3 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1441.bin --options=\"${copt} -D W=1 -D H=4 -D ON=4 -D IN=4 -D LN=4 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1451.bin --options=\"${copt} -D W=1 -D H=4 -D ON=5 -D IN=5 -D LN=5 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1461.bin --options=\"${copt} -D W=1 -D H=4 -D ON=6 -D IN=6 -D LN=6 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1471.bin --options=\"${copt} -D W=1 -D H=4 -D ON=7 -D IN=7 -D LN=7 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1481.bin --options=\"${copt} -D W=1 -D H=4 -D ON=8 -D IN=8 -D LN=8 -D Fsq=4 -D KN=1 -DUSE_HALF\" + + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1411.bin --options=\"${copt} -D W=1 -D H=4 -D ON=1 -D IN=1 -D LN=1 -D Fsq=4 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1421.bin --options=\"${copt} -D W=1 -D H=4 -D ON=2 -D IN=2 -D LN=2 -D Fsq=4 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1431.bin --options=\"${copt} -D W=1 -D H=4 -D ON=3 -D IN=3 -D LN=3 -D Fsq=4 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1441.bin --options=\"${copt} -D W=1 -D H=4 -D ON=4 -D IN=4 -D LN=4 -D Fsq=4 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1451.bin --options=\"${copt} -D W=1 -D H=4 -D ON=5 -D IN=5 -D LN=5 -D Fsq=4 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1461.bin --options=\"${copt} -D W=1 -D H=4 -D ON=6 -D IN=6 -D LN=6 -D Fsq=4 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1471.bin --options=\"${copt} -D W=1 -D H=4 -D ON=7 -D IN=7 -D LN=7 -D Fsq=4 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1481.bin --options=\"${copt} -D W=1 -D H=4 -D ON=8 -D IN=8 -D LN=8 -D Fsq=4 -D KN=1 -DUSE_RELU -DUSE_HALF\" + + 
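The digits in each output suffix map one-to-one onto the -D options: for these conv_direct_wh kernels the pattern is <activation>_<W><H><ON><KN>.bin, so _relu_1441 means W=1, H=4, ON=4, KN=1, while IN/LN/UN size the per-work-item register blocking and BASICE_REG (spelled BASIC_REG in the stride-2 script below) selects the fallback path that stages inputs through basic registers. A minimal sketch of how such a name can be recomposed from the tiling parameters, using a hypothetical helper that is not part of this commit:

    # hypothetical helper: rebuild the .bin name from the tiling parameters
    kernel_bin_name() {
        local base=$1 w=$2 h=$3 on=$4 kn=$5 act=$6   # act may be empty, "relu" or "relu6"
        echo "${base}${act:+_${act}}_${w}${h}${on}${kn}.bin"
    }
    # kernel_bin_name "${file%.*}" 1 4 4 1 relu  ->  <kernel>_relu_1441.bin
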
echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1411.bin --options=\"${copt} -D W=1 -D H=4 -D ON=1 -D IN=1 -D LN=1 -D Fsq=4 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1421.bin --options=\"${copt} -D W=1 -D H=4 -D ON=2 -D IN=2 -D LN=2 -D Fsq=4 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1431.bin --options=\"${copt} -D W=1 -D H=4 -D ON=3 -D IN=3 -D LN=3 -D Fsq=4 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1441.bin --options=\"${copt} -D W=1 -D H=4 -D ON=4 -D IN=4 -D LN=4 -D Fsq=4 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1451.bin --options=\"${copt} -D W=1 -D H=4 -D ON=5 -D IN=5 -D LN=5 -D Fsq=4 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1461.bin --options=\"${copt} -D W=1 -D H=4 -D ON=6 -D IN=6 -D LN=6 -D Fsq=4 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1471.bin --options=\"${copt} -D W=1 -D H=4 -D ON=7 -D IN=7 -D LN=7 -D Fsq=4 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1481.bin --options=\"${copt} -D W=1 -D H=4 -D ON=8 -D IN=8 -D LN=8 -D Fsq=4 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + + + echo ./gcl_binary --input=$file --output=${file%.*}_1412.bin --options=\"${copt} -D W=1 -D H=4 -D ON=1 -D IN=1 -D LN=1 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1422.bin --options=\"${copt} -D W=1 -D H=4 -D ON=2 -D IN=2 -D LN=2 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1432.bin --options=\"${copt} -D W=1 -D H=4 -D ON=3 -D IN=3 -D LN=3 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1442.bin --options=\"${copt} -D W=1 -D H=4 -D ON=4 -D IN=4 -D LN=4 -D Fsq=4 -D KN=2 -DUSE_HALF\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1412.bin --options=\"${copt} -D W=1 -D H=4 -D ON=1 -D IN=1 -D LN=1 -D Fsq=4 -D KN=2 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1422.bin --options=\"${copt} -D W=1 -D H=4 -D ON=2 -D IN=2 -D LN=2 -D Fsq=4 -D KN=2 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1432.bin --options=\"${copt} -D W=1 -D H=4 -D ON=3 -D IN=3 -D LN=3 -D Fsq=4 -D KN=2 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1442.bin --options=\"${copt} -D W=1 -D H=4 -D ON=4 -D IN=4 -D LN=4 -D Fsq=4 -D KN=2 -DUSE_RELU -DUSE_HALF\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1412.bin --options=\"${copt} -D W=1 -D H=4 -D ON=1 -D IN=1 -D LN=1 -D Fsq=4 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1422.bin --options=\"${copt} -D W=1 -D H=4 -D ON=2 -D IN=2 -D LN=2 -D Fsq=4 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1432.bin --options=\"${copt} -D W=1 -D H=4 -D ON=3 -D IN=3 -D LN=3 -D Fsq=4 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1442.bin --options=\"${copt} -D W=1 -D H=4 -D ON=4 -D IN=4 -D LN=4 -D Fsq=4 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + fi + fi + done + + + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_wh_s2.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_wh_s2.sh new file mode 100644 index 00000000..7bf77fac --- /dev/null +++ 
b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_wh_s2.sh @@ -0,0 +1,54 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "conv_direct_wh_s2.cl" ]];then + # W=3 H=1 Stride = 2 + echo ./gcl_binary --input=$file --output=${file%.*}_3111.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3121.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=3 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3131.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=3 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3141.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3151.bin --options=\"${copt} -D W=3 -D H=1 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=3 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3161.bin --options=\"${copt} -D W=3 -D H=1 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=3 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3171.bin --options=\"${copt} -D W=3 -D H=1 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=3 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3181.bin --options=\"${copt} -D W=3 -D H=1 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=3 -D KN=1 -DUSE_HALF -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3111.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3121.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3131.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3141.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3151.bin --options=\"${copt} -D W=3 -D H=1 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3161.bin --options=\"${copt} -D W=3 -D H=1 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3171.bin --options=\"${copt} -D W=3 -D H=1 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3181.bin --options=\"${copt} -D W=3 -D H=1 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3111.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3121.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3131.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=3 -D 
LN=2 -D UN=2 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3141.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3151.bin --options=\"${copt} -D W=3 -D H=1 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3161.bin --options=\"${copt} -D W=3 -D H=1 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3171.bin --options=\"${copt} -D W=3 -D H=1 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3181.bin --options=\"${copt} -D W=3 -D H=1 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_3112.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3122.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=3 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3132.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=3 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3142.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=2 -DUSE_HALF -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3112.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3122.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=3 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3132.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=3 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3142.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3112.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3122.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=3 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3132.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=3 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3142.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=2 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + fi + fi + done + + + + + diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_gemm36_tn.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_gemm36_tn.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/conv_wino_gemm36_tn.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_gemm36_tn.sh diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_rotate_fltbuf.sh 
b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_rotate_fltbuf.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/conv_wino_rotate_fltbuf.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_rotate_fltbuf.sh diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_outbuf.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_outbuf.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_outbuf.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_outbuf.sh diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_picbuf_left.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_picbuf_left.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_picbuf_left.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_picbuf_left.sh diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_picbuf_right.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_picbuf_right.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_picbuf_right.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_picbuf_right.sh diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/copy.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/copy.sh new file mode 100644 index 00000000..4f988f0c --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/copy.sh @@ -0,0 +1,16 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "copy.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_f16.bin --options=\"${copt} -D DT=f16\" + echo ./gcl_binary --input=$file --output=${file%.*}_i32.bin --options=\"-D T=int -D T2=int2 -D T3=int3 -D T4=int4 -D DT=i32\" + echo ./gcl_binary --input=$file --output=${file%.*}_u32.bin --options=\"-D T=uint -D T2=uint2 -D T3=uint3 -D T4=uint4 -D DT=u32\" + echo ./gcl_binary --input=$file --output=${file%.*}_with_block_index_f16.bin --options=\"${copt} -D DT=f16 -D USE_BLOCK_INDEX\" + echo ./gcl_binary --input=$file --output=${file%.*}_with_block_index_i32.bin --options=\"-D T=int -D T2=int2 -D T3=int3 -D T4=int4 -D DT=i32 -D USE_BLOCK_INDEX\" + echo ./gcl_binary --input=$file --output=${file%.*}_with_block_index_u32.bin --options=\"-D T=uint -D T2=uint2 -D T3=uint3 -D T4=uint4 -D DT=u32 -D USE_BLOCK_INDEX\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/deconv_gemm_f2s2.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/deconv_gemm_f2s2.sh new file mode 100644 index 00000000..70634810 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/deconv_gemm_f2s2.sh @@ -0,0 +1,30 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "deconv_gemm_f2s2.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_12.bin --options=\"${copt} -D ON=1 -D IN=1 -D LN=1 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_22.bin --options=\"${copt} -D ON=2 -D IN=2 -D LN=2 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_32.bin --options=\"${copt} -D ON=3 -D IN=3 -D LN=3 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_42.bin --options=\"${copt} -D ON=4 -D IN=4 -D LN=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_14.bin --options=\"${copt} -D ON=1 -D IN=1 -D LN=1 -D KN=4 -DUSE_HALF\" + 
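Note that every compile script in this directory only echoes its gcl_binary command lines; nothing in this diff executes them, and both ${copt} and the working directory are expected to be supplied by an outer harness that is not shown here. Under that assumption, a minimal driver could look like the following sketch (names and flags illustrative, not from this commit):

    # hypothetical driver: source each generator and run what it prints
    copt="-D T=half -D T2=half2 -D T3=half3 -D T4=half4"   # assumed half-precision defines; set elsewhere in reality
    for gen in sh/compile/*.sh; do
        . "$gen"                      # emits one "./gcl_binary ..." line per kernel variant
    done | while read -r cmd; do
        eval "$cmd"                   # compile the .cl kernel into its specialized .bin
    done
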
echo ./gcl_binary --input=$file --output=${file%.*}_24.bin --options=\"${copt} -D ON=2 -D IN=2 -D LN=2 -D KN=4 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_34.bin --options=\"${copt} -D ON=3 -D IN=3 -D LN=3 -D KN=4 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_22.bin --options=\"${copt} -D ON=2 -D IN=2 -D LN=2 -D KN=2 -DUSE_HALF -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_42.bin --options=\"${copt} -D ON=4 -D IN=4 -D LN=4 -D KN=2 -DUSE_HALF -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_24.bin --options=\"${copt} -D ON=2 -D IN=2 -D LN=2 -D KN=4 -DUSE_HALF -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_12.bin --options=\"${copt} -D ON=1 -D IN=1 -D LN=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_22.bin --options=\"${copt} -D ON=2 -D IN=2 -D LN=2 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_32.bin --options=\"${copt} -D ON=3 -D IN=3 -D LN=3 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_42.bin --options=\"${copt} -D ON=4 -D IN=4 -D LN=4 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_14.bin --options=\"${copt} -D ON=1 -D IN=1 -D LN=1 -D KN=4 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_24.bin --options=\"${copt} -D ON=2 -D IN=2 -D LN=2 -D KN=4 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_34.bin --options=\"${copt} -D ON=3 -D IN=3 -D LN=3 -D KN=4 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu_22.bin --options=\"${copt} -D ON=2 -D IN=2 -D LN=2 -D KN=2 -DUSE_HALF -DREUSE_H -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu_42.bin --options=\"${copt} -D ON=4 -D IN=4 -D LN=4 -D KN=2 -DUSE_HALF -DREUSE_H -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu_24.bin --options=\"${copt} -D ON=2 -D IN=2 -D LN=2 -D KN=4 -DUSE_HALF -DREUSE_H -DUSE_RELU\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/deconv_gemm_trans_fltbuf.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/deconv_gemm_trans_fltbuf.sh new file mode 100644 index 00000000..f8cd5dbc --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/deconv_gemm_trans_fltbuf.sh @@ -0,0 +1,13 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "deconv_gemm_trans_fltbuf.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_14.bin --options=\"${copt} -D C=1 -D K=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_24.bin --options=\"${copt} -D C=2 -D K=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_44.bin --options=\"${copt} -D C=4 -D K=4\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/eltwise.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/eltwise.sh new file mode 100644 index 00000000..c7039f76 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/eltwise.sh @@ -0,0 +1,69 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "eltwise.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_max1.bin --options=\"${copt} -D N=1 -D TP=max -DUSE_MAX\" + echo ./gcl_binary --input=$file --output=${file%.*}_max2.bin --options=\"${copt} -D N=2 -D TP=max -DUSE_MAX\" + echo ./gcl_binary --input=$file --output=${file%.*}_max3.bin 
--options=\"${copt} -D N=3 -D TP=max -DUSE_MAX\" + echo ./gcl_binary --input=$file --output=${file%.*}_max4.bin --options=\"${copt} -D N=4 -D TP=max -DUSE_MAX\" + + echo ./gcl_binary --input=$file --output=${file%.*}_sum1.bin --options=\"${copt} -D N=1 -D TP=sum -DUSE_SUM\" + echo ./gcl_binary --input=$file --output=${file%.*}_sum2.bin --options=\"${copt} -D N=2 -D TP=sum -DUSE_SUM\" + echo ./gcl_binary --input=$file --output=${file%.*}_sum3.bin --options=\"${copt} -D N=3 -D TP=sum -DUSE_SUM\" + echo ./gcl_binary --input=$file --output=${file%.*}_sum4.bin --options=\"${copt} -D N=4 -D TP=sum -DUSE_SUM\" + + echo ./gcl_binary --input=$file --output=${file%.*}_prod1.bin --options=\"${copt} -D N=1 -D TP=prod -DUSE_PROD\" + echo ./gcl_binary --input=$file --output=${file%.*}_prod2.bin --options=\"${copt} -D N=2 -D TP=prod -DUSE_PROD\" + echo ./gcl_binary --input=$file --output=${file%.*}_prod3.bin --options=\"${copt} -D N=3 -D TP=prod -DUSE_PROD\" + echo ./gcl_binary --input=$file --output=${file%.*}_prod4.bin --options=\"${copt} -D N=4 -D TP=prod -DUSE_PROD\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_max1.bin --options=\"${copt} -D N=1 -D TP=max -DUSE_MAX -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_max2.bin --options=\"${copt} -D N=2 -D TP=max -DUSE_MAX -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_max3.bin --options=\"${copt} -D N=3 -D TP=max -DUSE_MAX -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_max4.bin --options=\"${copt} -D N=4 -D TP=max -DUSE_MAX -DUSE_RELU\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_sum1.bin --options=\"${copt} -D N=1 -D TP=sum -DUSE_SUM -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_sum2.bin --options=\"${copt} -D N=2 -D TP=sum -DUSE_SUM -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_sum3.bin --options=\"${copt} -D N=3 -D TP=sum -DUSE_SUM -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_sum4.bin --options=\"${copt} -D N=4 -D TP=sum -DUSE_SUM -DUSE_RELU\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_prod1.bin --options=\"${copt} -D N=1 -D TP=prod -DUSE_PROD -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_prod2.bin --options=\"${copt} -D N=2 -D TP=prod -DUSE_PROD -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_prod3.bin --options=\"${copt} -D N=3 -D TP=prod -DUSE_PROD -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_prod4.bin --options=\"${copt} -D N=4 -D TP=prod -DUSE_PROD -DUSE_RELU\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_max1.bin --options=\"${copt} -D N=1 -D TP=max -DUSE_MAX -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_max2.bin --options=\"${copt} -D N=2 -D TP=max -DUSE_MAX -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_max3.bin --options=\"${copt} -D N=3 -D TP=max -DUSE_MAX -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_max4.bin --options=\"${copt} -D N=4 -D TP=max -DUSE_MAX -DUSE_NCHW\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_sum1.bin --options=\"${copt} -D N=1 -D TP=sum -DUSE_SUM -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_sum2.bin --options=\"${copt} -D N=2 -D TP=sum -DUSE_SUM -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_sum3.bin --options=\"${copt} -D N=3 -D TP=sum -DUSE_SUM -DUSE_NCHW\" + echo ./gcl_binary 
--input=$file --output=${file%.*}_nchw_sum4.bin --options=\"${copt} -D N=4 -D TP=sum -DUSE_SUM -DUSE_NCHW\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_prod1.bin --options=\"${copt} -D N=1 -D TP=prod -DUSE_PROD -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_prod2.bin --options=\"${copt} -D N=2 -D TP=prod -DUSE_PROD -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_prod3.bin --options=\"${copt} -D N=3 -D TP=prod -DUSE_PROD -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_prod4.bin --options=\"${copt} -D N=4 -D TP=prod -DUSE_PROD -DUSE_NCHW\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_max1.bin --options=\"${copt} -D N=1 -D TP=max -DUSE_MAX -DUSE_RELU -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_max2.bin --options=\"${copt} -D N=2 -D TP=max -DUSE_MAX -DUSE_RELU -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_max3.bin --options=\"${copt} -D N=3 -D TP=max -DUSE_MAX -DUSE_RELU -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_max4.bin --options=\"${copt} -D N=4 -D TP=max -DUSE_MAX -DUSE_RELU -DUSE_NCHW\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_sum1.bin --options=\"${copt} -D N=1 -D TP=sum -DUSE_SUM -DUSE_RELU -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_sum2.bin --options=\"${copt} -D N=2 -D TP=sum -DUSE_SUM -DUSE_RELU -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_sum3.bin --options=\"${copt} -D N=3 -D TP=sum -DUSE_SUM -DUSE_RELU -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_sum4.bin --options=\"${copt} -D N=4 -D TP=sum -DUSE_SUM -DUSE_RELU -DUSE_NCHW\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_prod1.bin --options=\"${copt} -D N=1 -D TP=prod -DUSE_PROD -DUSE_RELU -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_prod2.bin --options=\"${copt} -D N=2 -D TP=prod -DUSE_PROD -DUSE_RELU -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_prod3.bin --options=\"${copt} -D N=3 -D TP=prod -DUSE_PROD -DUSE_RELU -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_prod4.bin --options=\"${copt} -D N=4 -D TP=prod -DUSE_PROD -DUSE_RELU -DUSE_NCHW\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/eltwise_broadcast.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/eltwise_broadcast.sh new file mode 100644 index 00000000..37ad824c --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/eltwise_broadcast.sh @@ -0,0 +1,22 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "eltwise_broadcast.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_max0.bin --options=\"${copt} -D N=0 -D TP=max -DUSE_MAX\" + echo ./gcl_binary --input=$file --output=${file%.*}_max1.bin --options=\"${copt} -D N=1 -D TP=max -DUSE_MAX\" + echo ./gcl_binary --input=$file --output=${file%.*}_max2.bin --options=\"${copt} -D N=2 -D TP=max -DUSE_MAX\" + echo ./gcl_binary --input=$file --output=${file%.*}_max3.bin --options=\"${copt} -D N=3 -D TP=max -DUSE_MAX\" + + echo ./gcl_binary --input=$file --output=${file%.*}_sum0.bin --options=\"${copt} -D N=0 -D TP=sum -DUSE_SUM\" + echo ./gcl_binary --input=$file --output=${file%.*}_sum1.bin --options=\"${copt} -D N=1 -D TP=sum -DUSE_SUM\" + echo ./gcl_binary --input=$file --output=${file%.*}_sum2.bin 
--options=\"${copt} -D N=2 -D TP=sum -DUSE_SUM\" + echo ./gcl_binary --input=$file --output=${file%.*}_sum3.bin --options=\"${copt} -D N=3 -D TP=sum -DUSE_SUM\" + + echo ./gcl_binary --input=$file --output=${file%.*}_prod0.bin --options=\"${copt} -D N=0 -D TP=prod -DUSE_PROD\" + echo ./gcl_binary --input=$file --output=${file%.*}_prod1.bin --options=\"${copt} -D N=1 -D TP=prod -DUSE_PROD\" + echo ./gcl_binary --input=$file --output=${file%.*}_prod2.bin --options=\"${copt} -D N=2 -D TP=prod -DUSE_PROD\" + echo ./gcl_binary --input=$file --output=${file%.*}_prod3.bin --options=\"${copt} -D N=3 -D TP=prod -DUSE_PROD\" + fi + fi + done + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/eltwise_spe_nchw_c.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/eltwise_spe_nchw_c.sh new file mode 100644 index 00000000..c879ea4d --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/eltwise_spe_nchw_c.sh @@ -0,0 +1,13 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "eltwise_spe_nchw_c.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_max.bin --options=\"${copt} -D TP=max -DUSE_MAX\" + echo ./gcl_binary --input=$file --output=${file%.*}_sum.bin --options=\"${copt} -D TP=sum -DUSE_SUM\" + echo ./gcl_binary --input=$file --output=${file%.*}_prod.bin --options=\"${copt} -D TP=prod -DUSE_PROD\" + fi + fi + done + + + diff --git a/gcl/tools/kernel_lib_compile/sh/compile/fc_trans_fltbuf.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/fc_trans_fltbuf.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/fc_trans_fltbuf.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/fc_trans_fltbuf.sh diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/fill_memory_zero.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/fill_memory_zero.sh new file mode 100644 index 00000000..7926db3c --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/fill_memory_zero.sh @@ -0,0 +1,12 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "fill_memory_zero.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_f16.bin --options=\"${copt} -D DT=f16\" + echo ./gcl_binary --input=$file --output=${file%.*}_i32.bin --options=\"-D T=int -D DT=i32\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/fill_memory_zero_vec4.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/fill_memory_zero_vec4.sh new file mode 100644 index 00000000..a29cf5dd --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/fill_memory_zero_vec4.sh @@ -0,0 +1,12 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "fill_memory_zero_vec4.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_f16.bin --options=\"${copt} -D DT=f16\" + echo ./gcl_binary --input=$file --output=${file%.*}_i32.bin --options=\"-D T=int -D T2=int2 -D T3=int3 -D T4=int4 -D DT=i32\" + fi + fi + done + + + diff --git a/gcl/tools/kernel_lib_compile/sh/compile/gemm_nt.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/gemm_nt.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/gemm_nt.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/gemm_nt.sh diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/gemm_tn.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/gemm_tn.sh new file mode 100644 index 00000000..6ed69c02 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/gemm_tn.sh @@ -0,0 +1,375 @@ +for file 
in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == gemm_tn.cl ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D UN=0 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D UN=1 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D UN=2 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D UN=3 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D UN=4 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D UN=5 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D UN=6 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D UN=7 -DUSE_NCWHC4\" + + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D UN=0 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D UN=1 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D UN=2 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D UN=3 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D UN=4 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D UN=5 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D UN=6 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D UN=7 -DUSE_NCWHC4\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D UN=1 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_RELU\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D UN=1 -DUSE_NCWHC4 -DUSE_RELU\" 
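gemm_tn.cl is specialized over a full grid of register tiles: each work-item computes LM rows by LN columns of the output, and for the NCWHC4 variants UN is consistently LN-1 (the inner unroll bound). Since every variant differs only in those two digits, the hand-written list is equivalent to a generator loop such as this sketch (not part of the commit):

    # equivalent generator for the relu_ncwhc4 LM x LN grid
    for lm in 4 8; do
        for ln in 1 2 3 4 5 6 7 8; do
            un=$((ln - 1))
            echo ./gcl_binary --input=$file \
                --output=${file%.*}_relu_ncwhc4_${lm}${ln}.bin \
                --options=\"${copt} -D LM=${lm} -D LN=${ln} -D UN=${un} -DUSE_NCWHC4 -DUSE_RELU\"
        done
    done
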
+ echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_RELU\" + +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D UN=1 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_GELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D UN=1 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_GELU\" + + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D UN=1 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file 
--output=${file%.*}_eltwise1_ncwhc4_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D UN=1 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D UN=1 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" 
+ echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D UN=1 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_13.bin --options=\"${copt} -D LM=1 -D LN=3 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_14.bin --options=\"${copt} -D LM=1 -D LN=4 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_15.bin --options=\"${copt} -D LM=1 -D LN=5 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_16.bin --options=\"${copt} -D LM=1 -D LN=6 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_17.bin --options=\"${copt} -D LM=1 -D LN=7 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_18.bin --options=\"${copt} -D LM=1 -D LN=8 -D NO_BIAS\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_22.bin --options=\"${copt} -D LM=2 -D LN=2 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_23.bin --options=\"${copt} -D LM=2 -D LN=3 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_24.bin --options=\"${copt} -D LM=2 -D LN=4 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_25.bin --options=\"${copt} -D LM=2 -D LN=5 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_26.bin --options=\"${copt} -D LM=2 -D LN=6 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_27.bin --options=\"${copt} -D LM=2 -D LN=7 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_28.bin --options=\"${copt} -D LM=2 -D LN=8 -D NO_BIAS\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_31.bin --options=\"${copt} -D LM=3 -D LN=1 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_32.bin --options=\"${copt} -D LM=3 -D LN=2 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_33.bin --options=\"${copt} -D LM=3 -D LN=3 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_34.bin --options=\"${copt} -D LM=3 -D LN=4 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_35.bin --options=\"${copt} -D LM=3 -D LN=5 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_36.bin --options=\"${copt} -D LM=3 -D LN=6 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_37.bin --options=\"${copt} -D LM=3 -D LN=7 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_38.bin --options=\"${copt} -D LM=3 -D LN=8 -D 
NO_BIAS\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D NO_BIAS\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_51.bin --options=\"${copt} -D LM=5 -D LN=1 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_52.bin --options=\"${copt} -D LM=5 -D LN=2 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_53.bin --options=\"${copt} -D LM=5 -D LN=3 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_54.bin --options=\"${copt} -D LM=5 -D LN=4 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_55.bin --options=\"${copt} -D LM=5 -D LN=5 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_56.bin --options=\"${copt} -D LM=5 -D LN=6 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_57.bin --options=\"${copt} -D LM=5 -D LN=7 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_58.bin --options=\"${copt} -D LM=5 -D LN=8 -D NO_BIAS\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_61.bin --options=\"${copt} -D LM=6 -D LN=1 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_62.bin --options=\"${copt} -D LM=6 -D LN=2 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_63.bin --options=\"${copt} -D LM=6 -D LN=3 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_64.bin --options=\"${copt} -D LM=6 -D LN=4 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_65.bin --options=\"${copt} -D LM=6 -D LN=5 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_66.bin --options=\"${copt} -D LM=6 -D LN=6 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_67.bin --options=\"${copt} -D LM=6 -D LN=7 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_68.bin --options=\"${copt} -D LM=6 -D LN=8 -D NO_BIAS\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_71.bin --options=\"${copt} -D LM=7 -D LN=1 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_72.bin --options=\"${copt} -D LM=7 -D LN=2 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_73.bin --options=\"${copt} -D LM=7 -D LN=3 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_74.bin --options=\"${copt} -D LM=7 -D LN=4 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_75.bin --options=\"${copt} -D LM=7 -D LN=5 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_76.bin --options=\"${copt} -D LM=7 -D LN=6 -D NO_BIAS\" + echo ./gcl_binary 
--input=$file --output=${file%.*}_nobias_77.bin --options=\"${copt} -D LM=7 -D LN=7 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_78.bin --options=\"${copt} -D LM=7 -D LN=8 -D NO_BIAS\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D NO_BIAS\" + + echo ./gcl_binary --input=$file --output=${file%.*}_13.bin --options=\"${copt} -D LM=1 -D LN=3\" + echo ./gcl_binary --input=$file --output=${file%.*}_14.bin --options=\"${copt} -D LM=1 -D LN=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_15.bin --options=\"${copt} -D LM=1 -D LN=5\" + echo ./gcl_binary --input=$file --output=${file%.*}_16.bin --options=\"${copt} -D LM=1 -D LN=6\" + echo ./gcl_binary --input=$file --output=${file%.*}_17.bin --options=\"${copt} -D LM=1 -D LN=7\" + echo ./gcl_binary --input=$file --output=${file%.*}_18.bin --options=\"${copt} -D LM=1 -D LN=8\" + + echo ./gcl_binary --input=$file --output=${file%.*}_22.bin --options=\"${copt} -D LM=2 -D LN=2\" + echo ./gcl_binary --input=$file --output=${file%.*}_23.bin --options=\"${copt} -D LM=2 -D LN=3\" + echo ./gcl_binary --input=$file --output=${file%.*}_24.bin --options=\"${copt} -D LM=2 -D LN=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_25.bin --options=\"${copt} -D LM=2 -D LN=5\" + echo ./gcl_binary --input=$file --output=${file%.*}_26.bin --options=\"${copt} -D LM=2 -D LN=6\" + echo ./gcl_binary --input=$file --output=${file%.*}_27.bin --options=\"${copt} -D LM=2 -D LN=7\" + echo ./gcl_binary --input=$file --output=${file%.*}_28.bin --options=\"${copt} -D LM=2 -D LN=8\" + + echo ./gcl_binary --input=$file --output=${file%.*}_31.bin --options=\"${copt} -D LM=3 -D LN=1\" + echo ./gcl_binary --input=$file --output=${file%.*}_32.bin --options=\"${copt} -D LM=3 -D LN=2\" + echo ./gcl_binary --input=$file --output=${file%.*}_33.bin --options=\"${copt} -D LM=3 -D LN=3\" + echo ./gcl_binary --input=$file --output=${file%.*}_34.bin --options=\"${copt} -D LM=3 -D LN=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_35.bin --options=\"${copt} -D LM=3 -D LN=5\" + echo ./gcl_binary --input=$file --output=${file%.*}_36.bin --options=\"${copt} -D LM=3 -D LN=6\" + echo ./gcl_binary --input=$file --output=${file%.*}_37.bin --options=\"${copt} -D LM=3 -D LN=7\" + echo ./gcl_binary --input=$file --output=${file%.*}_38.bin --options=\"${copt} -D LM=3 -D LN=8\" + + echo ./gcl_binary --input=$file --output=${file%.*}_41.bin --options=\"${copt} -D LM=4 -D LN=1\" + echo ./gcl_binary --input=$file --output=${file%.*}_42.bin --options=\"${copt} -D LM=4 -D LN=2\" + echo ./gcl_binary --input=$file --output=${file%.*}_43.bin --options=\"${copt} -D LM=4 -D LN=3\" + echo 
./gcl_binary --input=$file --output=${file%.*}_44.bin --options=\"${copt} -D LM=4 -D LN=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_45.bin --options=\"${copt} -D LM=4 -D LN=5\" + echo ./gcl_binary --input=$file --output=${file%.*}_46.bin --options=\"${copt} -D LM=4 -D LN=6\" + echo ./gcl_binary --input=$file --output=${file%.*}_47.bin --options=\"${copt} -D LM=4 -D LN=7\" + echo ./gcl_binary --input=$file --output=${file%.*}_48.bin --options=\"${copt} -D LM=4 -D LN=8\" + + echo ./gcl_binary --input=$file --output=${file%.*}_51.bin --options=\"${copt} -D LM=5 -D LN=1\" + echo ./gcl_binary --input=$file --output=${file%.*}_52.bin --options=\"${copt} -D LM=5 -D LN=2\" + echo ./gcl_binary --input=$file --output=${file%.*}_53.bin --options=\"${copt} -D LM=5 -D LN=3\" + echo ./gcl_binary --input=$file --output=${file%.*}_54.bin --options=\"${copt} -D LM=5 -D LN=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_55.bin --options=\"${copt} -D LM=5 -D LN=5\" + echo ./gcl_binary --input=$file --output=${file%.*}_56.bin --options=\"${copt} -D LM=5 -D LN=6\" + echo ./gcl_binary --input=$file --output=${file%.*}_57.bin --options=\"${copt} -D LM=5 -D LN=7\" + echo ./gcl_binary --input=$file --output=${file%.*}_58.bin --options=\"${copt} -D LM=5 -D LN=8\" + + echo ./gcl_binary --input=$file --output=${file%.*}_61.bin --options=\"${copt} -D LM=6 -D LN=1\" + echo ./gcl_binary --input=$file --output=${file%.*}_62.bin --options=\"${copt} -D LM=6 -D LN=2\" + echo ./gcl_binary --input=$file --output=${file%.*}_63.bin --options=\"${copt} -D LM=6 -D LN=3\" + echo ./gcl_binary --input=$file --output=${file%.*}_64.bin --options=\"${copt} -D LM=6 -D LN=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_65.bin --options=\"${copt} -D LM=6 -D LN=5\" + echo ./gcl_binary --input=$file --output=${file%.*}_66.bin --options=\"${copt} -D LM=6 -D LN=6\" + echo ./gcl_binary --input=$file --output=${file%.*}_67.bin --options=\"${copt} -D LM=6 -D LN=7\" + echo ./gcl_binary --input=$file --output=${file%.*}_68.bin --options=\"${copt} -D LM=6 -D LN=8\" + + echo ./gcl_binary --input=$file --output=${file%.*}_71.bin --options=\"${copt} -D LM=7 -D LN=1\" + echo ./gcl_binary --input=$file --output=${file%.*}_72.bin --options=\"${copt} -D LM=7 -D LN=2\" + echo ./gcl_binary --input=$file --output=${file%.*}_73.bin --options=\"${copt} -D LM=7 -D LN=3\" + echo ./gcl_binary --input=$file --output=${file%.*}_74.bin --options=\"${copt} -D LM=7 -D LN=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_75.bin --options=\"${copt} -D LM=7 -D LN=5\" + echo ./gcl_binary --input=$file --output=${file%.*}_76.bin --options=\"${copt} -D LM=7 -D LN=6\" + echo ./gcl_binary --input=$file --output=${file%.*}_77.bin --options=\"${copt} -D LM=7 -D LN=7\" + echo ./gcl_binary --input=$file --output=${file%.*}_78.bin --options=\"${copt} -D LM=7 -D LN=8\" + + echo ./gcl_binary --input=$file --output=${file%.*}_81.bin --options=\"${copt} -D LM=8 -D LN=1\" + echo ./gcl_binary --input=$file --output=${file%.*}_82.bin --options=\"${copt} -D LM=8 -D LN=2\" + echo ./gcl_binary --input=$file --output=${file%.*}_83.bin --options=\"${copt} -D LM=8 -D LN=3\" + echo ./gcl_binary --input=$file --output=${file%.*}_84.bin --options=\"${copt} -D LM=8 -D LN=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_85.bin --options=\"${copt} -D LM=8 -D LN=5\" + echo ./gcl_binary --input=$file --output=${file%.*}_86.bin --options=\"${copt} -D LM=8 -D LN=6\" + echo ./gcl_binary --input=$file --output=${file%.*}_87.bin --options=\"${copt} -D LM=8 
-D LN=7\" + echo ./gcl_binary --input=$file --output=${file%.*}_88.bin --options=\"${copt} -D LM=8 -D LN=8\" + +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_13.bin --options=\"${copt} -D LM=1 -D LN=3 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_14.bin --options=\"${copt} -D LM=1 -D LN=4 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_15.bin --options=\"${copt} -D LM=1 -D LN=5 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_16.bin --options=\"${copt} -D LM=1 -D LN=6 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_17.bin --options=\"${copt} -D LM=1 -D LN=7 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_18.bin --options=\"${copt} -D LM=1 -D LN=8 -D USE_RELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_22.bin --options=\"${copt} -D LM=2 -D LN=2 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_23.bin --options=\"${copt} -D LM=2 -D LN=3 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_24.bin --options=\"${copt} -D LM=2 -D LN=4 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_25.bin --options=\"${copt} -D LM=2 -D LN=5 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_26.bin --options=\"${copt} -D LM=2 -D LN=6 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_27.bin --options=\"${copt} -D LM=2 -D LN=7 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_28.bin --options=\"${copt} -D LM=2 -D LN=8 -D USE_RELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_31.bin --options=\"${copt} -D LM=3 -D LN=1 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_32.bin --options=\"${copt} -D LM=3 -D LN=2 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_33.bin --options=\"${copt} -D LM=3 -D LN=3 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_34.bin --options=\"${copt} -D LM=3 -D LN=4 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_35.bin --options=\"${copt} -D LM=3 -D LN=5 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_36.bin --options=\"${copt} -D LM=3 -D LN=6 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_37.bin --options=\"${copt} -D LM=3 -D LN=7 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_38.bin --options=\"${copt} -D LM=3 -D LN=8 -D USE_RELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D USE_RELU\" +# +# echo ./gcl_binary --input=$file 
--output=${file%.*}_relu_51.bin --options=\"${copt} -D LM=5 -D LN=1 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_52.bin --options=\"${copt} -D LM=5 -D LN=2 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_53.bin --options=\"${copt} -D LM=5 -D LN=3 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_54.bin --options=\"${copt} -D LM=5 -D LN=4 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_55.bin --options=\"${copt} -D LM=5 -D LN=5 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_56.bin --options=\"${copt} -D LM=5 -D LN=6 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_57.bin --options=\"${copt} -D LM=5 -D LN=7 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_58.bin --options=\"${copt} -D LM=5 -D LN=8 -D USE_RELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_61.bin --options=\"${copt} -D LM=6 -D LN=1 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_62.bin --options=\"${copt} -D LM=6 -D LN=2 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_63.bin --options=\"${copt} -D LM=6 -D LN=3 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_64.bin --options=\"${copt} -D LM=6 -D LN=4 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_65.bin --options=\"${copt} -D LM=6 -D LN=5 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_66.bin --options=\"${copt} -D LM=6 -D LN=6 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_67.bin --options=\"${copt} -D LM=6 -D LN=7 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_68.bin --options=\"${copt} -D LM=6 -D LN=8 -D USE_RELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_71.bin --options=\"${copt} -D LM=7 -D LN=1 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_72.bin --options=\"${copt} -D LM=7 -D LN=2 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_73.bin --options=\"${copt} -D LM=7 -D LN=3 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_74.bin --options=\"${copt} -D LM=7 -D LN=4 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_75.bin --options=\"${copt} -D LM=7 -D LN=5 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_76.bin --options=\"${copt} -D LM=7 -D LN=6 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_77.bin --options=\"${copt} -D LM=7 -D LN=7 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_78.bin --options=\"${copt} -D LM=7 -D LN=8 -D USE_RELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_87.bin 
--options=\"${copt} -D LM=8 -D LN=7 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D USE_RELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_13.bin --options=\"${copt} -D LM=1 -D LN=3 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_14.bin --options=\"${copt} -D LM=1 -D LN=4 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_15.bin --options=\"${copt} -D LM=1 -D LN=5 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_16.bin --options=\"${copt} -D LM=1 -D LN=6 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_17.bin --options=\"${copt} -D LM=1 -D LN=7 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_18.bin --options=\"${copt} -D LM=1 -D LN=8 -D USE_GELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_22.bin --options=\"${copt} -D LM=2 -D LN=2 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_23.bin --options=\"${copt} -D LM=2 -D LN=3 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_24.bin --options=\"${copt} -D LM=2 -D LN=4 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_25.bin --options=\"${copt} -D LM=2 -D LN=5 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_26.bin --options=\"${copt} -D LM=2 -D LN=6 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_27.bin --options=\"${copt} -D LM=2 -D LN=7 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_28.bin --options=\"${copt} -D LM=2 -D LN=8 -D USE_GELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_31.bin --options=\"${copt} -D LM=3 -D LN=1 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_32.bin --options=\"${copt} -D LM=3 -D LN=2 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_33.bin --options=\"${copt} -D LM=3 -D LN=3 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_34.bin --options=\"${copt} -D LM=3 -D LN=4 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_35.bin --options=\"${copt} -D LM=3 -D LN=5 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_36.bin --options=\"${copt} -D LM=3 -D LN=6 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_37.bin --options=\"${copt} -D LM=3 -D LN=7 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_38.bin --options=\"${copt} -D LM=3 -D LN=8 -D USE_GELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_48.bin --options=\"${copt} -D LM=4 -D 
LN=8 -D USE_GELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_51.bin --options=\"${copt} -D LM=5 -D LN=1 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_52.bin --options=\"${copt} -D LM=5 -D LN=2 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_53.bin --options=\"${copt} -D LM=5 -D LN=3 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_54.bin --options=\"${copt} -D LM=5 -D LN=4 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_55.bin --options=\"${copt} -D LM=5 -D LN=5 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_56.bin --options=\"${copt} -D LM=5 -D LN=6 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_57.bin --options=\"${copt} -D LM=5 -D LN=7 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_58.bin --options=\"${copt} -D LM=5 -D LN=8 -D USE_GELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_61.bin --options=\"${copt} -D LM=6 -D LN=1 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_62.bin --options=\"${copt} -D LM=6 -D LN=2 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_63.bin --options=\"${copt} -D LM=6 -D LN=3 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_64.bin --options=\"${copt} -D LM=6 -D LN=4 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_65.bin --options=\"${copt} -D LM=6 -D LN=5 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_66.bin --options=\"${copt} -D LM=6 -D LN=6 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_67.bin --options=\"${copt} -D LM=6 -D LN=7 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_68.bin --options=\"${copt} -D LM=6 -D LN=8 -D USE_GELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_71.bin --options=\"${copt} -D LM=7 -D LN=1 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_72.bin --options=\"${copt} -D LM=7 -D LN=2 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_73.bin --options=\"${copt} -D LM=7 -D LN=3 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_74.bin --options=\"${copt} -D LM=7 -D LN=4 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_75.bin --options=\"${copt} -D LM=7 -D LN=5 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_76.bin --options=\"${copt} -D LM=7 -D LN=6 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_77.bin --options=\"${copt} -D LM=7 -D LN=7 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_78.bin --options=\"${copt} -D LM=7 -D LN=8 -D USE_GELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D USE_GELU\" +# echo 
./gcl_binary --input=$file --output=${file%.*}_gelu_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D USE_GELU\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_nchw_to_ncwhc4.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_nchw_to_ncwhc4.sh new file mode 100644 index 00000000..fe6d4969 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_nchw_to_ncwhc4.sh @@ -0,0 +1,13 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "mem_trans_nchw_to_ncwhc4.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}.bin --options=\"${copt}\" + echo ./gcl_binary --input=$file --output=${file%.*}_input_tran.bin --options=\"${copt} -DINPUT_TRAN\" + echo ./gcl_binary --input=$file --output=${file%.*}_output_tran.bin --options=\"${copt} -DOUTPUT_TRAN\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_ncwhc4_to_nchw.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_ncwhc4_to_nchw.sh new file mode 100644 index 00000000..ced01b7f --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_ncwhc4_to_nchw.sh @@ -0,0 +1,12 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "mem_trans_ncwhc4_to_nchw.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}.bin --options=\"${copt}\" + echo ./gcl_binary --input=$file --output=${file%.*}_output_tran.bin --options=\"${copt} -DOUTPUT_TRAN\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_ncwhc4_to_ncwhc4.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_ncwhc4_to_ncwhc4.sh new file mode 100644 index 00000000..e975b63a --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_ncwhc4_to_ncwhc4.sh @@ -0,0 +1,12 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "mem_trans_ncwhc4_to_ncwhc4.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}.bin --options=\"${copt}\" + echo ./gcl_binary --input=$file --output=${file%.*}_output_tran.bin --options=\"${copt} -DOUTPUT_TRAN\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/normalization.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/normalization.sh new file mode 100644 index 00000000..4abc0e04 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/normalization.sh @@ -0,0 +1,12 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "normalization.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_c1.bin --options=\"${copt} -D USE_C1 \" + echo ./gcl_binary --input=$file --output=${file%.*}.bin --options=\"${copt}\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/power.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/power.sh new file mode 100644 index 00000000..6cb49036 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/power.sh @@ -0,0 +1,12 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "power.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_f16.bin --options=\"${copt} -D DT=f16\" + echo ./gcl_binary --input=$file --output=${file%.*}_i32.bin --options=\"-D T=int -D T2=int2 -D T3=int3 -D T4=int4 -D DT=i32\" + fi + fi + done + + + diff --git 
a/common/gcl/tools/kernel_lib_compile/sh/compile/prelu.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/prelu.sh new file mode 100644 index 00000000..440302d9 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/prelu.sh @@ -0,0 +1,12 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "prelu.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_noprop.bin --options=\"${copt} -D MD=noprop \" + echo ./gcl_binary --input=$file --output=${file%.*}_prop.bin --options=\"${copt} -D MD=prop -DUSE_SAME \" + fi + fi + done + + + diff --git a/gcl/tools/kernel_lib_compile/sh/compile/sample.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/sample.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/sample.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/sample.sh diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/scale.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/scale.sh new file mode 100644 index 00000000..d3e3e61b --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/scale.sh @@ -0,0 +1,14 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "scale.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_nobeta.bin --options=\"${copt} -D MD=nobeta \" + echo ./gcl_binary --input=$file --output=${file%.*}_beta.bin --options=\"${copt} -D MD=beta -DUSE_BETA\" + echo ./gcl_binary --input=$file --output=${file%.*}1_nobeta.bin --options=\"${copt} -D MD=nobeta -DUSE_SAME\" + echo ./gcl_binary --input=$file --output=${file%.*}1_beta.bin --options=\"${copt} -D MD=beta -DUSE_BETA -DUSE_SAME\" + fi + fi + done + + + diff --git a/gcl/tools/kernel_lib_compile/sh/compile/slice_h.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/slice_h.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/slice_h.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/slice_h.sh diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/transpose_nchw.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/transpose_nchw.sh new file mode 100644 index 00000000..af18b26a --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/transpose_nchw.sh @@ -0,0 +1,14 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "transpose_nchw.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_0231.bin --options=\"${copt} -D OC=2 -D OH=3 -D OW=1\" + echo ./gcl_binary --input=$file --output=${file%.*}_0213.bin --options=\"${copt} -D OC=2 -D OH=1 -D OW=3\" + echo ./gcl_binary --input=$file --output=${file%.*}_0312.bin --options=\"${copt} -D OC=3 -D OH=1 -D OW=2\" + echo ./gcl_binary --input=$file --output=${file%.*}_0321.bin --options=\"${copt} -D OC=3 -D OH=2 -D OW=1\" + fi + fi + done + + + diff --git a/gcl/tools/kernel_lib_compile/sh/packKernelBin.sh b/common/gcl/tools/kernel_lib_compile/sh/packKernelBin.sh similarity index 97% rename from gcl/tools/kernel_lib_compile/sh/packKernelBin.sh rename to common/gcl/tools/kernel_lib_compile/sh/packKernelBin.sh index f6d5d34a..3de6d230 100644 --- a/gcl/tools/kernel_lib_compile/sh/packKernelBin.sh +++ b/common/gcl/tools/kernel_lib_compile/sh/packKernelBin.sh @@ -39,7 +39,7 @@ for((i=1;i $srcPath/$InlineHead echo "#define _${UpperInlineHead}_H" >> $srcPath/$InlineHead - echo "#include \""type.h"\"" >> $srcPath/$InlineHead + echo "#include \""types.h"\"" >> $srcPath/$InlineHead echo >> $srcPath/$InlineHead echo "#include \""${Head}"\"" > $srcPath/$InlineCpp diff --git 
a/gcl/tools/kernel_lib_compile/sh/sh.config b/common/gcl/tools/kernel_lib_compile/sh/sh.config similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/sh.config rename to common/gcl/tools/kernel_lib_compile/sh/sh.config diff --git a/common/gcl/tools/kernel_source_compile/CMakeLists.txt b/common/gcl/tools/kernel_source_compile/CMakeLists.txt new file mode 100644 index 00000000..11429d21 --- /dev/null +++ b/common/gcl/tools/kernel_source_compile/CMakeLists.txt @@ -0,0 +1,35 @@ +cmake_minimum_required(VERSION 3.2) + +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) +if (BOLT_CONFIGURE_FILE) + include(${BOLT_CONFIGURE_FILE}) +else (BOLT_CONFIGURE_FILE) + message(FATAL_ERROR " +FATAL: can not find bolt.cmake in directory, + please set shell or cmake environment variable BOLT_ROOT. + ") +endif (BOLT_CONFIGURE_FILE) + +project(kernelsource) + +set_c_cxx_flags() + +execute_process( + COMMAND bash buildKernelSourceLib.sh + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" +) + +file(GLOB source_srcs "src/cl/*.cpp") +file(GLOB option_srcs "src/option/*.cpp") +set(kernel_source_list "${source_srcs};${option_srcs}") + +add_library(${PROJECT_NAME} SHARED ${kernel_source_list}) +add_library(${PROJECT_NAME}_static STATIC ${kernel_source_list}) +set_target_properties(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") +set_target_properties(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +install(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) +install(FILES ${CMAKE_BINARY_DIR}/libOpenCL.so + DESTINATION lib) diff --git a/common/gcl/tools/kernel_source_compile/buildKernelSourceLib.sh b/common/gcl/tools/kernel_source_compile/buildKernelSourceLib.sh new file mode 100644 index 00000000..9d157e46 --- /dev/null +++ b/common/gcl/tools/kernel_source_compile/buildKernelSourceLib.sh @@ -0,0 +1,22 @@ +workPath=${BOLT_ROOT}/common/gcl/tools/kernel_source_compile +#echo "Build OpenCL kernel source in ${workPath}" +cd ${workPath} + +if [ -d "src" ]; then + rm -rf src +fi +mkdir src +mkdir src/cl +mkdir src/option + +if [ -d "include" ]; then + rm -rf include +fi +mkdir include + +headfile=${BOLT_ROOT}/common/uni/include/ +cd ${workPath}/kernel_cl2char/ +g++ -g -std=c++11 cl2char.cpp -o gcl_cl2char -I ${headfile} +./gcl_cl2char +rm gcl_cl2char +cd ${workPath} diff --git a/common/gcl/tools/kernel_source_compile/kernel_cl2char/cl2char.cpp b/common/gcl/tools/kernel_source_compile/kernel_cl2char/cl2char.cpp new file mode 100644 index 00000000..d6fd8fde --- /dev/null +++ b/common/gcl/tools/kernel_source_compile/kernel_cl2char/cl2char.cpp @@ -0,0 +1,540 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <dirent.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <unistd.h> +#include <fstream> +#include <iostream> +#include <map> +#include <string> +#include <vector> +#include "types.h" + +typedef struct { + std::string kernel; + U32 len; + bool use_kernel_def_head; +} KernelInfo; + +typedef struct { + std::string sourceName; + std::string option; + bool use_common_opt; +} OptionInfo; + +inline std::vector<std::string> buildFileNames(std::string path, std::string postfix) +{ + struct dirent *dirTp; + DIR *handle = opendir(path.c_str()); + std::vector<std::string> names; + if (handle != NULL) { + while ((dirTp = readdir(handle)) != NULL) { + std::string clFileName = dirTp->d_name; + U32 len = clFileName.size(); + U32 postfix_len = postfix.size(); + if (len > postfix_len) { + if (clFileName.substr(len - postfix_len) == postfix) { + clFileName.erase(len - postfix_len, postfix_len); + names.push_back(clFileName); + } + } + } + } else { + UNI_ERROR_LOG("opendir %s failed\n", path.c_str()); + } + closedir(handle); + return names; +}
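+ +// buildClMap mmaps every file selected by clNamesIndex (cumulative per-directory end offsets), strips "//" and +// "/* */" comments, drops '\r' and duplicated blank lines, records whether the file includes kernel_def.h, and +// finally escapes '\\' and '"' and closes each line with \n" so the content can be emitted as a C string literal.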
File Name: %s\n", fileName.c_str()); + } + std::string fileContent = (const char *)bytes; + int note_pos = -1; + int j = 0; + + for (; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '/' && note_pos < 0) { + if (fileContent[j + 1] == '/') { + note_pos = j; + continue; + } + } + + if (fileContent[j] == '\n' && note_pos >= 0) { + fileContent.erase(note_pos, j - note_pos); + j = note_pos; + note_pos = -1; + } + } + note_pos = -1; + for (j = 0; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '/' && note_pos < 0) { + if (fileContent[j + 1] == '*') { + note_pos = j; + continue; + } + } + + if (fileContent[j] == '*' && note_pos >= 0) { + if (fileContent[j + 1] == '/') { + fileContent.erase(note_pos, j - note_pos + 2); + j = note_pos; + note_pos = -1; + } + } + } + + for (j = 0; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '\r') { + fileContent.erase(j, 1); + j = j - 1; + } + } + + for (j = 0; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '\n') { + if (fileContent[j + 1] == '\n') { + fileContent.erase(j, 1); + j = j - 1; + } + } + } + if (fileContent[0] == '\n') { + fileContent.erase(0, 1); + } + if (fileContent[fileContent.size() - 1] == '\n') { + fileContent.erase(fileContent.size() - 1, 1); + } + kernelInfo.len = fileContent.size(); + + std::string kernel_def_head = "kernel_def.h"; + kernelInfo.use_kernel_def_head = false; + if (fileContent.find(kernel_def_head, 0) != -1) { + kernelInfo.use_kernel_def_head = true; + } + + std::string substr_a = "\\"; + std::string substr_b = "\\n\""; + std::string substr_c = "\""; + U32 sublen_a = substr_a.size(); + U32 sublen_b = substr_b.size(); + U32 sublen_c = substr_c.size(); + + for (j = 0; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '\\') { + fileContent.insert(j, substr_a); + j += sublen_a + 1; + } + } + + for (j = 0; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '"') { + fileContent.insert(j, substr_a); + j += sublen_a + 1; + } + } + + for (j = 0; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '\n') { + fileContent.insert(j, substr_b); + j += sublen_b + 1; + fileContent.insert(j, substr_c); + j += sublen_c; + } + } + fileContent.insert(0, substr_c); + fileContent.insert(fileContent.size(), substr_b); + kernelInfo.kernel = fileContent; + clMap[clName] = kernelInfo; + munmap(bytes, fileLength); + if (-1 != fd) { + close(fd); + } + } + } + return clMap; +} + +inline std::map buildClOptionMap( + std::vector optionNames, std::string optionPath, std::string postfix) +{ + std::map optionMap; + for (int i = 0; i < optionNames.size(); i++) { + std::string optionName = optionNames[i]; + std::string fileName = optionPath + optionName + postfix; + int fd = open(fileName.c_str(), O_RDONLY); + if (-1 == fd) { + UNI_ERROR_LOG("Cannot open .bolt file. Name: %s\n", fileName.c_str()); + } + + struct stat ss; + if (-1 == fstat(fd, &ss)) { + UNI_ERROR_LOG("Cannot get size from file descriptor. File Name: %s\n", fileName.c_str()); + } + + int fileLength = ss.st_size; + char *bytes = (char *)mmap(nullptr, fileLength, PROT_READ, MAP_SHARED, fd, 0); + if (MAP_FAILED == bytes) { + UNI_ERROR_LOG("Mmap failed. 
File Name: %s\n", fileName.c_str()); + } + std::string fileContent = (const char *)bytes; + int note_pos = -1; + int j = 0; + + for (; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '#' && note_pos < 0) { + note_pos = j; + continue; + } + + if (fileContent[j] == '\n' && note_pos >= 0) { + fileContent.erase(note_pos, j - note_pos); + j = note_pos; + note_pos = -1; + } + } + + for (j = 0; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '\r') { + fileContent.erase(j, 1); + j = j - 1; + } + } + + for (j = 0; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '\n') { + if (fileContent[j + 1] == '\n') { + fileContent.erase(j, 1); + j = j - 1; + } + } + } + if (fileContent[0] == '\n') { + fileContent.erase(0, 1); + } + if (fileContent[fileContent.size() - 1] == '\n') { + fileContent.erase(fileContent.size() - 1, 1); + } + optionMap[optionName] = fileContent; + munmap(bytes, fileLength); + if (-1 != fd) { + close(fd); + } + } + return optionMap; +} + +inline std::map buildClOptionExpandMap( + std::map optionMap) +{ + std::map optionMapExpand; + std::string output_flag = "--output="; + std::string option_flag = "\\\""; + std::string postfix = ".bin"; + std::string replace_name = "${file%.*}"; + std::string common_opt = "${copt}"; + for (auto p : optionMap) { + std::string name = p.first; + std::string option = p.second; + OptionInfo optionInfo; + optionInfo.sourceName = name; + int pos = option.find(output_flag, 0); + while (pos != -1) { + int be = pos + output_flag.size(); + int end = option.find(" ", be); + std::string expandName = option.substr(be, end - be); + expandName.erase(expandName.size() - postfix.size(), postfix.size()); + expandName.replace(0, replace_name.size(), name); + + pos = option.find(option_flag, end); + be = pos + option_flag.size(); + end = option.find(option_flag, be); + std::string expandOption = option.substr(be, end - be); + int common_opt_pos = expandOption.find(common_opt, 0); + if (common_opt_pos == -1) { + optionInfo.use_common_opt = false; + } else { + optionInfo.use_common_opt = true; + if (name == "common") { + expandOption.replace(0, common_opt.size(), + "-D T=half -D T2=half2 -D T3=half3 -D T4=half4 -D T8=half8 -D T16=half16 " + "-DUSE_HALF"); + } else { + expandOption.erase(common_opt_pos, common_opt.size()); + } + } + pos = option.find(output_flag, end); + optionInfo.option = expandOption; + optionMapExpand[expandName] = optionInfo; + } + } + return optionMapExpand; +} + +inline std::string produce_inline_cl_source_head(std::vector clNames) +{ + std::string source_head = ""; + for (auto p : clNames) { + std::string func = "source_" + p; + source_head += "extern bool " + func + "_head;\n"; + source_head += "extern const unsigned int " + func + "_len;\n"; + source_head += "extern const char " + func + "[];\n"; + } + return source_head; +} + +inline std::string produce_inline_cl_option_head(std::vector optionNamesExpand) +{ + std::string option_head = ""; + for (auto p : optionNamesExpand) { + std::string func = "option_" + p; + option_head += "extern bool " + func + "_common;\n"; + option_head += "extern const char " + func + "_source_name[];\n"; + option_head += "extern const char " + func + "[];\n"; + } + return option_head; +} + +inline std::string produce_inline_cl_source(std::vector clNames) +{ + std::string source = ""; + for (auto p : clNames) { + std::string func = "source_" + p; + source += " put_source(\"" + p + "\", " + "{" + func + ", " + func + "_len, " + func + + "_head});\n"; + } + return source; +} + +inline 
+ +int main() +{ + CI8 *boltEnv = getenv("BOLT_ROOT"); + if (boltEnv == NULL) { + UNI_ERROR_LOG("BOLT_ROOT env value has not been set successfully\n"); + } + std::string boltPath = boltEnv; + CI8 lastFlag = boltPath[boltPath.length() - 1]; + if (lastFlag != '/') { + boltPath += "/"; + } + std::string tensorComputingClPath = "compute/tensor/src/gpu/mali/cl/"; + std::string imageClPath = "compute/image/src/gpu/mali/cl/"; + tensorComputingClPath = boltPath + tensorComputingClPath; + imageClPath = boltPath + imageClPath; + + std::string clOptionPath = "common/gcl/tools/kernel_lib_compile/sh/compile/"; + clOptionPath = boltPath + clOptionPath; + + // std::string samplePath = "gcl/tools/gcl_sample/cl/"; + // samplePath = boltPath + samplePath; + + std::vector<std::string> clPath; + clPath.push_back(tensorComputingClPath); + clPath.push_back(imageClPath); + // clPath.push_back(samplePath); + + std::vector<std::string> clNames; + std::vector<std::string> headNames; + std::vector<int> clNamesIndex; + std::vector<int> headNamesIndex; + + for (auto p : clPath) { + std::vector<std::string> clName; + std::vector<std::string> headName; + headName = buildFileNames(p, ".h"); + clName = buildFileNames(p, ".cl"); + clNames.insert(clNames.end(), clName.begin(), clName.end()); + headNames.insert(headNames.end(), headName.begin(), headName.end()); + clNamesIndex.push_back(clNames.size()); + headNamesIndex.push_back(headNames.size()); + } + + std::vector<std::string> clOptionNames; + std::vector<std::string> clOptionNamesExpand; + clOptionNames = buildFileNames(clOptionPath, ".sh"); + + std::map<std::string, KernelInfo> headMap; + std::map<std::string, KernelInfo> clMap; + std::map<std::string, std::string> clOptionMap; + std::map<std::string, OptionInfo> clOptionMapExpand; + headMap = buildClMap(headNames, clPath, headNamesIndex, ".h"); + clMap = buildClMap(clNames, clPath, clNamesIndex, ".cl"); + clOptionMap = buildClOptionMap(clOptionNames, clOptionPath, ".sh"); + + std::string filePath = "common/gcl/tools/kernel_source_compile/include/"; + filePath = boltPath + filePath; + std::string
kernel_source_executor; + kernel_source_executor = "#ifndef _LIBKERNELSOURCE_H\n"; + kernel_source_executor += "#define _LIBKERNELSOURCE_H\n"; + kernel_source_executor += "#include \"gcl_kernel_source.h\"\n"; + kernel_source_executor += "class kernel_source_executor : public gcl_kernel_source {\n"; + kernel_source_executor += "public:\n"; + kernel_source_executor += " kernel_source_executor() {\n"; + kernel_source_executor += " loadKernelSource();\n"; + kernel_source_executor += " loadKernelOption();\n"; + kernel_source_executor += " }\n"; + kernel_source_executor += " void loadKernelSource();\n"; + kernel_source_executor += " void loadKernelOption();\n"; + kernel_source_executor += "};\n"; + kernel_source_executor += "#endif\n"; + write_to_file(kernel_source_executor, filePath, "libkernelsource.h"); + + filePath = "common/gcl/tools/kernel_source_compile/src/cl/"; + filePath = boltPath + filePath; + std::string inline_cl_source_head; + inline_cl_source_head = "#ifndef _INLINE_CL_SOURCE_HEAD\n"; + inline_cl_source_head += "#define _INLINE_CL_SOURCE_HEAD\n"; + inline_cl_source_head += produce_inline_cl_source_head(headNames); + inline_cl_source_head += produce_inline_cl_source_head(clNames); + inline_cl_source_head += "#endif\n "; + write_to_file(inline_cl_source_head, filePath, "inline_cl_source_head.h"); + + std::string inline_cl_source; + inline_cl_source = "#include \"libkernelsource.h\"\n"; + inline_cl_source += "#include \"inline_cl_source_head.h\"\n"; + inline_cl_source += "void kernel_source_executor::loadKernelSource() {\n"; + inline_cl_source += produce_inline_cl_source(headNames); + inline_cl_source += produce_inline_cl_source(clNames); + inline_cl_source += "}\n"; + write_to_file(inline_cl_source, filePath, "inline_cl_source.cpp"); + + std::string kernel_source = "#include \"inline_cl_source_head.h\"\n"; + for (auto p : headMap) { + std::string name = p.first; + KernelInfo kernelInfo = p.second; + kernel_source += produce_kernel_source(name, kernelInfo); + } + for (auto p : clMap) { + std::string name = p.first; + KernelInfo kernelInfo = p.second; + kernel_source += produce_kernel_source(name, kernelInfo); + } + write_to_file(kernel_source, filePath, "gcl_kernel_source.cpp"); + + clOptionMapExpand = buildClOptionExpandMap(clOptionMap); + for (auto p : clOptionMapExpand) { + clOptionNamesExpand.push_back(p.first); + } + filePath = "common/gcl/tools/kernel_source_compile/src/option/"; + filePath = boltPath + filePath; + std::string inline_cl_option_head; + inline_cl_option_head = "#ifndef _INLINE_CL_OPTION_HEAD\n"; + inline_cl_option_head += "#define _INLINE_CL_OPTION_HEAD\n"; + inline_cl_option_head += produce_inline_cl_option_head(clOptionNamesExpand); + inline_cl_option_head += "#endif\n "; + write_to_file(inline_cl_option_head, filePath, "inline_cl_option_head.h"); + + std::string inline_cl_option; + inline_cl_option = "#include \"libkernelsource.h\"\n"; + inline_cl_option += "#include \"inline_cl_option_head.h\"\n"; + inline_cl_option += "void kernel_source_executor::loadKernelOption() {\n"; + inline_cl_option += produce_inline_cl_option(clOptionNamesExpand); + inline_cl_option += "}\n"; + write_to_file(inline_cl_option, filePath, "inline_cl_option.cpp"); + + std::string option_source = "#include \"inline_cl_option_head.h\"\n"; + for (auto p : clOptionMapExpand) { + std::string name = p.first; + OptionInfo optionInfo = p.second; + option_source += produce_option_source(name, optionInfo); + } + write_to_file(option_source, filePath, "gcl_kernel_option.cpp"); + return 0; +} diff 
--git a/common/memory/include/memory.hpp b/common/memory/include/memory.hpp new file mode 100644 index 00000000..2a968edc --- /dev/null +++ b/common/memory/include/memory.hpp @@ -0,0 +1,52 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _MEMORY_H +#define _MEMORY_H + +#include <memory> +#include "tensor_desc.h" + +typedef enum { OCLMem = 0, CPUMem = 1 } MemoryType; + +class Memory { +public: + Memory() + {} + + virtual ~Memory() = default; + + virtual MemoryType get_mem_type() = 0; + + virtual std::shared_ptr<Memory> clone(bool allocate = true) = 0; + + virtual void resize(TensorDesc desc) = 0; + + virtual void alloc() = 0; + + virtual EE reuse(Memory *other) = 0; + + virtual EE copy_from(Memory *other) = 0; + + virtual EE copy_to(Memory *other) + { + return other->copy_from(this); + } + + virtual U32 length() = 0; + virtual U32 bytes() = 0; + virtual U32 capacity() = 0; + virtual std::string string(U32 num, F32 factor) = 0; + virtual F32 element(U32 index) = 0; +}; +#endif
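The Memory interface keeps shape bookkeeping (resize) separate from storage (alloc): a tensor can be re-described repeatedly and storage is only re-acquired when the requested bytes exceed the retained capacity, while copy_to is simple double dispatch onto the destination's copy_from. A minimal sketch of that lifecycle, assuming only the declarations above and the CpuMemory subclass added below (the helper name is ours, not part of the patch):

    #include "memory_cpu.hpp"

    void lifecycle_sketch(TensorDesc big, TensorDesc small)
    {
        CpuMemory a;
        a.resize(big);    // record the descriptor; nothing allocated yet
        a.alloc();        // storage acquired, capacity() >= bytes()
        a.resize(small);  // shrinking keeps the existing buffer
        CpuMemory b;
        b.resize(small);
        b.alloc();
        a.copy_to(&b);    // dispatches to b.copy_from(&a), copying min(src, dst) bytes
    }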
diff --git a/common/memory/include/memory_cpu.hpp b/common/memory/include/memory_cpu.hpp new file mode 100644 index 00000000..a5018345 --- /dev/null +++ b/common/memory/include/memory_cpu.hpp @@ -0,0 +1,172 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _MEMORY_CPU_H +#define _MEMORY_CPU_H +#include <memory> +#include "memory.hpp" + +class CpuMemory : public Memory { +public: + CpuMemory() + { + this->capacitySize = 0; + this->allocated = false; + } + + ~CpuMemory() = default; + + std::shared_ptr<Memory> clone(bool allocate) override + { + CpuMemory *mem = new CpuMemory(); + mem->desc = this->desc; + if (allocate) { + mem->alloc(); + } + return std::shared_ptr<Memory>(mem); + } + + MemoryType get_mem_type() override + { + return CPUMem; + } + + void resize(TensorDesc desc) override + { + this->desc = desc; + if (tensorNumBytes(desc) > this->capacity()) { + this->allocated = false; + } + } + + void alloc() override + { + auto size = this->bytes(); + if (!this->allocated && size > this->capacity()) { + this->capacitySize = size; + this->val = std::shared_ptr<U8>((U8 *)operator new(size)); + } + this->allocated = true; + } + + TensorDesc get_desc() + { + return this->desc; + } + + void set_ptr(U8 *val) + { + this->set_shared_ptr(std::shared_ptr<U8>(val)); + } + + void *get_ptr() + { + return this->val.get(); + } + + void set_shared_ptr(std::shared_ptr<U8> val) + { + this->val = val; + this->allocated = true; + this->capacitySize = this->bytes(); + } + + std::shared_ptr<U8> get_shared_ptr() + { + return this->val; + } + + U32 length() override + { + return tensorNumElements(this->desc); + } + + U32 bytes() override + { + return tensorNumBytes(this->desc); + } + + U32 capacity() override + { + return this->capacitySize; + } + + EE reuse(Memory *other) override + { + EE ret; + if (other->get_mem_type() != CPUMem) { + ret = this->copy_from(other); + } else { + U32 other_size = other->capacity(); + if (other_size >= this->bytes()) { + this->set_shared_ptr(((CpuMemory *)other)->get_shared_ptr()); + this->capacitySize = other->capacity(); + ret = SUCCESS; + } else { + UNI_ERROR_LOG("Small CPU memory can not meet big CPU memory demand\n"); + ret = NOT_SUPPORTED; + } + } + return ret; + } + + EE copy_from(Memory *other) override + { + if (!this->allocated) { + this->alloc(); + } + if (CPUMem == other->get_mem_type()) { + auto *src = ((CpuMemory *)other)->val.get(); + auto *dst = this->val.get(); + auto dst_size = this->bytes(); + auto src_size = other->bytes(); + U32 min_size = UNI_MIN(src_size, dst_size); + U32 max_size = UNI_MAX(src_size, dst_size); + if (min_size <= 0) { + min_size = max_size; + } + UNI_memcpy(dst, src, min_size); + } else { + // TODO: copying from non-CPU (GPU) memory is not implemented yet + } + return SUCCESS; + } + + std::string string(U32 num, F32 factor) override + { + std::string line = "desc: " + tensorDesc2Str(this->desc) + " data:"; + for (U32 i = 0; i < num; i++) { + line = line + std::to_string(this->element(i) * factor) + " "; + } + return line; + } + + F32 element(U32 index) override + { + U8 *res = (U8 *)this->get_ptr(); + U32 offset = bytesOf(this->desc.dt) * index; + F32 value; + transformToFloat(this->desc.dt, res + offset, &value, 1); + return value; + } + +private: + // actual byte capacity of the buffer held by val + U32 capacitySize; + std::shared_ptr<U8> val; + + TensorDesc desc; + + bool allocated; +}; +#endif
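CpuMemory::reuse lets one tensor alias another tensor's buffer whenever the donor's capacity already covers the consumer's byte demand, falling back to a real copy across memory types; this is the hook a memory-reuse pass can go through. An illustrative sketch under the same assumptions as above:

    void reuse_sketch(TensorDesc big, TensorDesc small)
    {
        CpuMemory donor;
        donor.resize(big);
        donor.alloc();
        CpuMemory view;
        view.resize(small);
        if (SUCCESS == view.reuse(&donor)) {
            // view and donor now share one allocation; no copy was made
        }
    }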
diff --git a/common/memory/include/memory_ocl.hpp b/common/memory/include/memory_ocl.hpp new file mode 100644 index 00000000..9129c7d9 --- /dev/null +++ b/common/memory/include/memory_ocl.hpp @@ -0,0 +1,294 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _MEMORY_OCL_H +#define _MEMORY_OCL_H + +#include "memory.hpp" +#include "memory_cpu.hpp" +#include "gcl.h" +#include "ocl_data_alloc.h" +#include "ocl_data_trans.h" + +class OclMemory : public Memory { +public: + OclMemory() + { + memset(&(this->desc), 0, sizeof(GCLMemDesc)); + this->desc.memFormat = DF_NCHW; + this->allocated = false; + this->mapped = false; + this->capacitySize = 0; + } + + ~OclMemory() = default; + + MemoryType get_mem_type() override + { + return OCLMem; + } + + std::shared_ptr<Memory> clone(bool allocate) override + { + OclMemory *mem = new OclMemory(); + mem->desc = this->desc; + if (allocate) { + mem->alloc(); + } + return std::shared_ptr<Memory>(mem); + } + + void resize(TensorDesc desc) override + { + this->desc.nDims = desc.nDims; + for (U32 i = 0; i < desc.nDims; i++) { + this->desc.dims[i] = desc.dims[i]; + } + this->desc.dt = desc.dt; + this->desc.df = desc.df; + if (this->desc.byteSize == 0) { + this->desc.memType = GCL_MEM_BUF; + this->desc.flags = CL_MEM_READ_WRITE; + } + if (tensorNumBytes(desc) > this->capacity()) { + this->allocated = false; + } + } + + void padding(GCLMemDesc desc) + { + if (desc.byteSize > this->capacity()) { + this->allocated = false; + } + for (U32 i = 0; i < 3; i++) { + this->desc.stride[i] = desc.stride[i]; + this->desc.offset[i] = desc.offset[i]; + } + this->desc.memType = desc.memType; + this->desc.memFormat = desc.memFormat; + this->desc.byteSize = desc.byteSize; + this->desc.num = desc.num; + this->desc.flags = desc.flags; + this->desc.imgFormat = desc.imgFormat; + this->desc.host_ptr = desc.host_ptr; + this->desc.need_pad = desc.need_pad; + } + + void alloc() override + { + if (this->desc.byteSize == 0) { + U32 num = (this->desc.nDims == 0) ? 0 : 1; + for (U32 i = 0; i < this->desc.nDims; i++) { + num *= this->desc.dims[i]; + } + this->desc.byteSize = num * bytesOf(this->desc.dt); + } + U32 size = this->desc.byteSize; + if (!this->allocated && size > this->capacity()) { + GCLMem_t mem = ocl_alloc_gclmem(this->desc); + this->val = std::shared_ptr<GCLMem>(mem, ocl_release_gclmem); + this->allocated = true; + this->capacitySize = size; + } + } + + GCLMemDesc get_desc() + { + return this->desc; + }
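+ + // copy_from supports: (1) CPU -> GPU, either by allocating directly from a (zero-padded) host pointer with + // CL_MEM_COPY_HOST_PTR or by uploading through gcl_trans_memory(HOST_TO_DEVICE_BUF); (2) GPU image <-> buffer + // transforms between two OclMemory objects; plain buffer -> buffer copies are rejected as NOT_SUPPORTED.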
+ EE copy_from(Memory *other) override + { + EE ret = SUCCESS; + if (other->get_mem_type() == CPUMem) { + U32 size = ((CpuMemory *)other)->bytes(); + void *host_ptr = ((CpuMemory *)other)->get_ptr(); + if (!allocated) { + U8 *tmp = nullptr; + if (size < this->desc.byteSize) { + tmp = (U8 *)operator new(this->desc.byteSize); + memset(tmp, 0, this->desc.byteSize); + memcpy(tmp, host_ptr, size); + host_ptr = tmp; + } + this->desc.host_ptr = host_ptr; + this->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + this->alloc(); + if (tmp) { + operator delete(tmp); + } + } else { + this->val->desc = this->desc; //TODO: delete after splitting desc from GCLMem + if (size > this->desc.byteSize) { + size = this->desc.byteSize; + } + CHECK_STATUS(gcl_trans_memory(OCLContext::getInstance().handle.get(), host_ptr, + this->val.get(), &size, HOST_TO_DEVICE_BUF, CL_TRUE)); + } + } else if (other->get_mem_type() == OCLMem) { + if (!allocated) { + this->alloc(); + } else { + GCLMemDesc srcDesc = ((OclMemory *)other)->get_desc(); + GCLMemType srcMt = srcDesc.memType; + GCLMemType dstMt = this->desc.memType; + void *srcPtr = ((OclMemory *)other)->get_ptr(); + void *dstPtr = this->val.get(); + if (srcMt != GCL_MEM_BUF && dstMt == GCL_MEM_BUF) { + if (srcDesc.byteSize > this->desc.byteSize) { + CHECK_STATUS(NOT_MATCH); + } + U32 region[3] = {srcDesc.stride[0], srcDesc.stride[1], srcDesc.stride[2]}; + CHECK_STATUS(gcl_trans_memory(OCLContext::getInstance().handle.get(), srcPtr, + dstPtr, region, DEVICE_IMG_TO_BUF, CL_TRUE)); + } else if (srcMt == GCL_MEM_BUF && dstMt != GCL_MEM_BUF) { + if (this->desc.byteSize > srcDesc.byteSize) { + CHECK_STATUS(NOT_MATCH); + } + U32 region[3] = { + this->desc.stride[0], this->desc.stride[1], this->desc.stride[2]}; + CHECK_STATUS(gcl_trans_memory(OCLContext::getInstance().handle.get(), srcPtr, + dstPtr, region, DEVICE_BUF_TO_IMG, CL_TRUE)); + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + } + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + return ret; + } + + void *get_ptr() + { + if (allocated) { + this->val->desc = this->desc; //TODO: delete after splitting desc from GCLMem + } + return this->val.get(); + } + + void set_shared_ptr(std::shared_ptr<GCLMem> val) + { + this->val = val; + this->allocated = true; + this->capacitySize = this->bytes(); + } + + std::shared_ptr<GCLMem> get_shared_ptr() + { + if (allocated) { + this->val->desc = this->desc; //TODO: delete after splitting desc from GCLMem + } + return this->val; + }
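+ + // mapped_alloc allocates with CL_MEM_ALLOC_HOST_PTR so the buffer can later be mapped into the host address + // space (zero-copy on shared-memory GPUs such as Mali); get_mapped_ptr() triggers ocl_map_mem and returns the + // most recent mapping. Note that byteSize is doubled before the allocation.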
*)other)->get_shared_ptr(); + this->allocated = true; + this->capacitySize = other->capacity(); + ret = SUCCESS; + } else { + UNI_ERROR_LOG("small OCL memory can not meet big OCL memory demand\n"); + ret = NOT_SUPPORTED; + } + } + return ret; + } + + U32 length() override + { + return this->desc.num; + } + + U32 bytes() override + { + return this->desc.byteSize; + } + + U32 capacity() override + { + return this->capacitySize; + } + + std::string string(U32 num, F32 factor) override + { + std::string line = "desc: " + gclMemDesc2Str(this->desc) + "data: \n"; +#ifdef _DEBUG + DataType dt = (this->desc.dt == DT_U8) ? DT_F16 : this->desc.dt; + switch (dt) { + case DT_F16: + line += gcl_check_data( + OCLContext::getInstance().handle.get(), this->desc, get_ptr(), num, 0, false); + break; + case DT_I32: + line += gcl_check_data( + OCLContext::getInstance().handle.get(), this->desc, get_ptr(), num, 0, false); + break; + default: + UNI_ERROR_LOG("Currently not support to get %d type OCL Memory\n", this->desc.dt); + break; + } +#else + if (mapped) { + for (U32 i = 0; i < num; i++) { + line += std::to_string(this->element(i) * factor) + " "; + } + } +#endif + return line; + } + + F32 element(U32 index) override + { + F32 result = 0; + if (this->mapped) { + F16 *res = (F16 *)this->val->mapPtrArray.back(); + result = res[index]; + } else { + UNI_ERROR_LOG("Currently not support to get element on OCL memory\n"); + } + return result; + } + +private: + GCLMemDesc desc; + std::shared_ptr val; + U32 capacitySize; + bool allocated; + bool mapped; +}; +#endif diff --git a/common/memory/include/tensor.hpp b/common/memory/include/tensor.hpp new file mode 100644 index 00000000..25021765 --- /dev/null +++ b/common/memory/include/tensor.hpp @@ -0,0 +1,168 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
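+
+// Usage sketch (comments only, not part of the build): create a CPU tensor,
+// fill it, and mirror it on the GPU. Assumes an OpenCL build (_USE_MALI) and
+// uses only the APIs declared in this header and in memory_ocl.h;
+// `host_input` is a hypothetical float buffer of 128 elements.
+//
+//   Tensor cpu = Tensor::alloc_sized<CPUMem>(tensor1d(DT_F32, 128));
+//   memcpy(((CpuMemory *)cpu.get_memory())->get_ptr(), host_input, cpu.bytes());
+//   Tensor gpu(OCLMem);
+//   gpu.resize(cpu.get_desc());
+//   gpu.copy_from(&cpu);  // OclMemory::copy_from allocates lazily, then uploads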
+
+#ifndef _TENSOR_H
+#define _TENSOR_H
+
+#include <memory>
+#include <string>
+#include <cstring>
+#include "memory_cpu.hpp"
+#ifdef _USE_MALI
+#include "memory_ocl.hpp"
+#endif
+
+class Tensor {
+public:
+    Tensor(MemoryType memoryType = CPUMem)
+    {
+        if (memoryType == CPUMem) {
+            this->val = std::shared_ptr<Memory>(new CpuMemory());
+        } else {
+#ifdef _USE_MALI
+            this->val = std::shared_ptr<Memory>(new OclMemory());
+#else
+            UNI_ERROR_LOG("GPU Tensor is not supported in this build\n");
+#endif
+        }
+        this->scale = std::shared_ptr<F32>(new F32(-1.0));
+    }
+
+    Tensor clone(bool allocate = true)
+    {
+        Tensor tensor = *this;
+        tensor.val = this->val->clone(allocate);
+        tensor.scale = std::shared_ptr<F32>(new F32(tensor.get_scale()));
+        return tensor;
+    }
+
+    void resize(TensorDesc desc)
+    {
+        this->desc = desc;
+        this->val->resize(desc);
+    }
+
+    void alloc()
+    {
+        this->val->alloc();
+    }
+
+    template <MemoryType type>
+    static Tensor alloc_sized(TensorDesc desc)
+    {
+        Tensor tensor(type);
+        tensor.resize(desc);
+        tensor.alloc();
+        return tensor;
+    }
+
+    TensorDesc get_desc()
+    {
+        return this->desc;
+    }
+
+    void set_scale(F32 scale)
+    {
+        *(this->scale) = scale;
+    }
+
+    F32 get_scale()
+    {
+        return *(this->scale);
+    }
+
+    void reuse(Tensor *other)
+    {
+        this->val->reuse(other->val.get());
+    }
+
+    void copy_from(Tensor *other)
+    {
+        this->desc = other->desc;
+        memcpy(this->scale.get(), other->scale.get(), sizeof(F32));
+        this->val->copy_from(other->val.get());
+    }
+
+    void copy_to(Tensor *other)
+    {
+        other->copy_from(this);
+    }
+
+    Memory *get_memory()
+    {
+        return this->val.get();
+    }
+
+    std::shared_ptr<Memory> get_shared_memory()
+    {
+        return this->val;
+    }
+
+    U32 length()
+    {
+        return this->val->length();
+    }
+
+    U32 bytes()
+    {
+        return this->val->bytes();
+    }
+
+    U32 capacity()
+    {
+        return this->val->capacity();
+    }
+
+    std::string string(int length = -1)
+    {
+        int num = tensorNumElements(this->desc);
+        if (length >= 0 && length < num) {
+            num = length;
+        }
+        F32 factor = this->get_scale();
+        factor = (factor == -1) ? 1 : factor;
+        std::string line = this->val->string(num, factor);
+        return line;
+    }
+
+    F32 element(U32 index)
+    {
+        F32 factor = this->get_scale();
+        factor = (factor == -1) ? 1 : factor;
+        return this->val->element(index) * factor;
+    }
+
+private:
+    TensorDesc desc;
+    std::shared_ptr<Memory> val;
+    std::shared_ptr<F32> scale;
+};
+
+#include "sys.h"
+
+// deprecated API, this will be removed
+inline void *get_ptr_from_tensor(Tensor tensor, Arch arch)
+{
+    void *ptr = nullptr;
+    if (arch == MALI) {
+#ifdef _USE_MALI
+        ptr = ((OclMemory *)(tensor.get_memory()))->get_ptr();
+#endif
+    } else {
+        ptr = ((CpuMemory *)(tensor.get_memory()))->get_ptr();
+    }
+    return ptr;
+}
+
+#endif  // _TENSOR_H diff --git a/common/uni/CMakeLists.txt b/common/uni/CMakeLists.txt new file mode 100644 index 00000000..b4669c13 --- /dev/null +++ b/common/uni/CMakeLists.txt @@ -0,0 +1,19 @@ +cmake_minimum_required(VERSION 3.2)
+
+file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake)
+if (BOLT_CONFIGURE_FILE)
+    include(${BOLT_CONFIGURE_FILE})
+else (BOLT_CONFIGURE_FILE)
+    message(FATAL_ERROR "
+FATAL: can not find bolt.cmake in directory,
+       please set shell or cmake environment variable BOLT_ROOT.
+ ") +endif (BOLT_CONFIGURE_FILE) + +project(uni) + +set_c_cxx_flags() + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) + +add_subdirectory(src) diff --git a/common/uni/include/algorithm_map.h b/common/uni/include/algorithm_map.h new file mode 100644 index 00000000..80440e6c --- /dev/null +++ b/common/uni/include/algorithm_map.h @@ -0,0 +1,400 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _ALGORITHM_MAP_H +#define _ALGORITHM_MAP_H + +#include +#include +#include "thread_affinity.h" +#include "op_type.h" +#include "types.h" + +class AlgorithmMap { +public: + AlgorithmMap(Arch arch, std::string modelName, std::string deviceName, DataType dt) + { + this->algorithmFileName = "algorithmInfo_"; + this->algorithmFileName += deviceName; + this->algorithmFileName += "_"; + this->algorithmFileName += modelName; + this->algorithmFileName += "_"; + this->algorithmFileName += std::to_string(arch); + this->algorithmFileName += "_"; + this->algorithmFileName += std::to_string(dt); + this->hasAlgorithmFile = false; + this->arch = arch; + this->commonAlgoFileName = "commonAlgoInfo_"; + this->commonAlgoFileName += deviceName; + this->commonAlgoFileName += "_"; + this->commonAlgoFileName += std::to_string(arch); + this->commonAlgoFileName += "_"; + this->commonAlgoFileName += std::to_string(dt); + this->hasCommonAlgoFile = false; + } + + void setAlgorithmInfoToMap( + std::string name, I32 *algorithmArray, U32 arrayNum, bool commonAlgo = false) + { + std::string algoInfo = "/"; + for (U32 i = 0; i < arrayNum; i++) { + algoInfo += std::to_string(algorithmArray[i]); + algoInfo += "/"; + } + if (!commonAlgo) { + this->algorithmMap[name] = algoInfo; + } else { + this->commonAlgoMap[name] = algoInfo; + } + } + + bool getAlgorithmInfoFromMap( + std::string name, I32 *algorithmArray, U32 arrayNum, bool commonAlgo = false) + { + std::string algoInfo; + if (!commonAlgo) { + if (this->algorithmMap.find(name) == this->algorithmMap.end()) { + return false; + } + algoInfo = this->algorithmMap[name]; + } else { + if (this->commonAlgoMap.find(name) == this->commonAlgoMap.end()) { + return false; + } + algoInfo = this->commonAlgoMap[name]; + } + U32 be = algoInfo.find_first_of("/"); + U32 end; + for (U32 i = 0; i < arrayNum; i++) { + end = algoInfo.find("/", be + 1); + algorithmArray[i] = std::stoi(algoInfo.substr(be + 1, end - be - 1)); + be = end; + } + return true; + } + + void loadAlgorithmMapFromFileStream(const char 
*algoFileStream) + { + U32 be = 0; + be = readFileStreamForMap(algoFileStream, be, &this->algorithmMap); +#ifdef _USE_MALI + be = readFileStreamForMap(algoFileStream, be, &this->kernelThreadMap); +#endif + be = readFileStreamForMap(algoFileStream, be, &this->commonAlgoMap); + if (algorithmMap.size()) { + this->hasAlgorithmFile = true; + } + if (commonAlgoMap.size()) { + this->hasCommonAlgoFile = true; + } + } + + void loadAlgorithmMapFromText(std::string algorithmMapPath) + { + if (algorithmMapPath == std::string("")) { + UNI_DEBUG_LOG("load algorithm file failed, path is not set \n"); + return; + } + CI8 lastFlag = algorithmMapPath[algorithmMapPath.length() - 1]; + if (strcmp(&lastFlag, "/") != 0) { + algorithmMapPath += "/"; + } + this->hasAlgorithmFile = readTextForMap(algorithmFileName, algorithmMapPath, &algorithmMap); + this->hasCommonAlgoFile = + readTextForMap(commonAlgoFileName, algorithmMapPath, &commonAlgoMap); + } + + void saveAlgorithmMapToText(std::string algorithmMapPath) + { + if (algorithmMapPath == std::string("")) { + UNI_DEBUG_LOG("save algorithm file failed, path is not set \n"); + return; + } + if (this->hasAlgorithmFile) { + return; + } + CI8 lastFlag = algorithmMapPath[algorithmMapPath.length() - 1]; + if (strcmp(&lastFlag, "/") != 0) { + algorithmMapPath += "/"; + } + saveMapToText( + this->algorithmFileName, algorithmMapPath, this->algorithmMap, this->hasAlgorithmFile); + saveMapToText(this->commonAlgoFileName, algorithmMapPath, this->commonAlgoMap, + this->hasCommonAlgoFile); + } + + void getCommonAlgoMapPara(U32 *ic_step, + U32 *ihw_step, + U32 *fn_step, + U32 *ic_max, + U32 *ihw_max, + U32 *fn_max, + std::set *fwh, + std::set *stride) + { + if (ic_step) { + *ic_step = 16; + } + if (ihw_step) { + *ihw_step = 16; + } + if (fn_step) { + *fn_step = 16; + } + if (ic_max) { + *ic_max = 640; + } + if (ihw_max) { + *ihw_max = 640; + } + if (fn_max) { + *fn_max = 640; + } + if (fwh) { + (*fwh).insert(1); + (*fwh).insert(2); + (*fwh).insert(3); + (*fwh).insert(4); + (*fwh).insert(5); + (*fwh).insert(7); + } + if (stride) { + (*stride).insert(1); + (*stride).insert(2); + } + } + + void setCommonAlgoInfoToMap(OperatorType opType, + DataType dt, + U32 ic, + U32 ih, + U32 iw, + U32 fn, + U32 fh, + U32 fw, + U32 sh, + U32 sw, + I32 *algorithmArray, + U32 arrayNum) + { + std::string algoName = getCommonAlgoName(opType, dt, ic, ih, iw, fn, fh, fw, sh, sw); + setAlgorithmInfoToMap(algoName, algorithmArray, arrayNum, true); + } + + bool getCommonAlgoInfoFromMap(OperatorType opType, + DataType dt, + U32 ic, + U32 ih, + U32 iw, + U32 fn, + U32 fh, + U32 fw, + U32 sh, + U32 sw, + I32 *algorithmArray, + U32 arrayNum) + { + if (this->commonAlgoMap.size() == 0) { + return false; + } + U32 ic_step, ihw_step, fn_step, ic_max, ihw_max, fn_max; + std::set fwh; + std::set stride; + getCommonAlgoMapPara( + &ic_step, &ihw_step, &fn_step, &ic_max, &ihw_max, &fn_max, &fwh, &stride); + ic = ((ic + ic_step - 1) / ic_step) * ic_step; + ih = ((ih + ihw_step - 1) / ihw_step) * ihw_step; + iw = ((iw + ihw_step - 1) / ihw_step) * ihw_step; + fn = ((fn + fn_step - 1) / fn_step) * fn_step; + ic = (ic > ic_max) ? ic_max : ic; + ih = (ih > ihw_max) ? ihw_max : ih; + iw = (iw > ihw_max) ? ihw_max : iw; + fn = (fn > fn_max) ? fn_max : fn; + fw = (fw < fh) ? 
fh : fw; + while (fwh.find(fw) == fwh.end()) { + fw--; + } + while (stride.find(sw) == stride.end()) { + sw--; + } + std::string algoName = getCommonAlgoName(opType, dt, ic, ih, iw, fn, fh, fw, sh, sw); + return getAlgorithmInfoFromMap(algoName, algorithmArray, arrayNum, true); + } + +#ifdef _USE_MALI + void setKernelThreadInfoToMap(std::string name, U32 gs[3], U32 ls[3]) + { + std::string kernelThreadInfo = "/"; + for (U32 i = 0; i < 3; i++) { + kernelThreadInfo += std::to_string(gs[i]); + kernelThreadInfo += "/"; + } + for (U32 i = 0; i < 3; i++) { + kernelThreadInfo += std::to_string(ls[i]); + kernelThreadInfo += "/"; + } + kernelThreadMap[name] = kernelThreadInfo; + } + + bool getKernelThreadInfoFromMap(std::string name, U32 *gs, U32 *ls) + { + bool findKernelInfo = kernelThreadMap.count(name); + if (!findKernelInfo) { + return findKernelInfo; + } + std::string kernelThreadInfo = kernelThreadMap[name]; + U32 be = kernelThreadInfo.find_first_of("/"); + U32 end; + for (U32 i = 0; i < 3; i++) { + end = kernelThreadInfo.find("/", be + 1); + gs[i] = std::stoi(kernelThreadInfo.substr(be + 1, end - be - 1)); + be = end; + } + for (U32 i = 0; i < 3; i++) { + end = kernelThreadInfo.find("/", be + 1); + ls[i] = std::stoi(kernelThreadInfo.substr(be + 1, end - be - 1)); + be = end; + } + return findKernelInfo; + } +#endif + +private: + U32 readFileStreamForMap( + const char *algoFileStream, U32 be, std::map *targetMap) + { + int num; + std::string content(algoFileStream); + std::string numString = ""; + std::string nameString = ""; + std::string infoString = ""; + while (content[be] == '\n' || content[be] == '\r' || content[be] == '\t' || + content[be] == ' ') { + be++; + } + if (be >= content.size()) { + return content.size(); + } + if (content[be] == '\0') { + return be; + } + while (content[be] != '\n') { + numString += content[be]; + be++; + } + num = (numString.size()) ? 
std::stoi(numString) : 0; + for (int i = 0; i < num; i++) { + be++; + while (content[be] != ' ') { + nameString += content[be]; + be++; + } + be++; + while (content[be] != '\n') { + infoString += content[be]; + be++; + } + (*targetMap)[nameString] = infoString; + nameString = ""; + infoString = ""; + } + return be++; + } + + bool readTextForMap( + std::string fileName, std::string path, std::map *targetMap) + { + std::string fullyFileName = path + fileName; + FILE *file = fopen(fullyFileName.c_str(), "r"); + if (!file || feof(file)) { + return false; + } + UNI_INFO_LOG("load algorithmFile %s\n", fullyFileName.c_str()); + int num = 0; + fscanf(file, "%d", &num); + char operatorName[100]; + char algorithm[100]; + for (int i = 0; i < num; i++) { + fscanf(file, "%s %s", operatorName, algorithm); + (*targetMap)[operatorName] = algorithm; + } +#ifdef _USE_MALI + if (this->arch == MALI && fileName == this->algorithmFileName) { + fscanf(file, "%d", &num); + char kernelName[100]; + char kernelThreadInfo[100]; + for (int i = 0; i < num; i++) { + fscanf(file, "%s %s", kernelName, kernelThreadInfo); + kernelThreadMap[kernelName] = kernelThreadInfo; + } + } +#endif + fclose(file); + return true; + } + + void saveMapToText(std::string fileName, + std::string path, + std::map targetMap, + bool noNeedSave) + { + if (noNeedSave) { + return; + } + if (targetMap.size() > 0) { + std::string fullyFileName = path + fileName; + UNI_DEBUG_LOG("save algorithmFile %s\n", fullyFileName.c_str()); + FILE *file = fopen(fullyFileName.c_str(), "w"); + fprintf(file, "%ld\n", (I64)targetMap.size()); + for (auto iter : targetMap) { + fprintf(file, "%s %s\n", iter.first.c_str(), iter.second.c_str()); + } +#ifdef _USE_MALI + if (this->arch == MALI && fileName == this->algorithmFileName) { + fprintf(file, "%ld\n", (I64)kernelThreadMap.size()); + for (auto iter : kernelThreadMap) { + fprintf(file, "%s %s\n", iter.first.c_str(), iter.second.c_str()); + } + } +#endif + fclose(file); + } + } + + std::string getCommonAlgoName( + OperatorType opType, DataType dt, U32 ic, U32 ih, U32 iw, U32 fn, U32 fh, U32 fw, U32 sh, U32 sw) + { + std::string algoName = "op" + std::to_string(opType) + "dt" + std::to_string(dt); + algoName += "ic" + std::to_string(ic); + algoName += "ih" + std::to_string(ih); + algoName += "iw" + std::to_string(iw); + algoName += "fn" + std::to_string(fn); + algoName += "fh" + std::to_string(fh); + algoName += "fw" + std::to_string(fw); + algoName += "sh" + std::to_string(sh); + algoName += "sw" + std::to_string(sw); + return algoName; + } + + std::map algorithmMap; + std::string algorithmFileName; + Arch arch; + bool hasAlgorithmFile; +#ifdef _USE_MALI + std::map kernelThreadMap; +#endif + std::map commonAlgoMap; + std::string commonAlgoFileName; + bool hasCommonAlgoFile; +}; +#endif diff --git a/common/uni/include/arm_neon_expand.h b/common/uni/include/arm_neon_expand.h new file mode 100644 index 00000000..74df296f --- /dev/null +++ b/common/uni/include/arm_neon_expand.h @@ -0,0 +1,337 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_ARM_NEON_EXPAND +#define _H_ARM_NEON_EXPAND +#include +#include +#include + +#include "types.h" +#include "error.h" + +#ifndef __aarch64__ +inline float32x4_t vdivq_f32(float32x4_t a, float32x4_t b) +{ + float32x4_t b_recip = vrecpeq_f32(b); + b_recip = vmulq_f32(vrecpsq_f32(b, b_recip), b_recip); + return vmulq_f32(a, b_recip); +} + +inline float vmaxvq_f32(float32x4_t x) +{ + float32x2_t max = vmax_f32(vget_low_f32(x), vget_high_f32(x)); + max = vpmax_f32(max, max); + return vget_lane_f32(max, 0); +} + +#ifndef __ANDROID__ +inline float32x4_t vfmaq_f32(float32x4_t c, float32x4_t a, float32_t b) +{ + return vmlaq_f32(c, a, vdupq_n_f32(b)); +} + +inline float32x4_t vfmaq_n_f32(float32x4_t c, float32x4_t a, float32_t b) +{ + return vfmaq_f32(c, a, vdupq_n_f32(b)); +} +#endif + +inline float vaddvq_f32(float32x4_t x) +{ + float32x2_t sum = vadd_f32(vget_low_f32(x), vget_high_f32(x)); + sum = vpadd_f32(sum, sum); + return vget_lane_f32(sum, 0); +} + +inline unsigned int vaddvq_u32(uint32x4_t x) +{ + uint32x2_t sum = vadd_u32(vget_low_u32(x), vget_high_u32(x)); + sum = vpadd_u32(sum, sum); + return vget_lane_u32(sum, 0); +} +#endif + +inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array &coeffs) +{ + float32x4_t A = vfmaq_f32(coeffs[0], coeffs[4], x); + float32x4_t B = vfmaq_f32(coeffs[2], coeffs[6], x); + float32x4_t C = vfmaq_f32(coeffs[1], coeffs[5], x); + float32x4_t D = vfmaq_f32(coeffs[3], coeffs[7], x); + float32x4_t x2 = vmulq_f32(x, x); + float32x4_t x4 = vmulq_f32(x2, x2); + float32x4_t res = vfmaq_f32(vfmaq_f32(A, B, x2), vfmaq_f32(C, D, x2), x4); + return res; +} + +inline float32x4_t vexpq_f32_03_percent_error(float32x4_t x) +{ + const std::array exp_tab = {{ + vdupq_n_f32(1.f), + vdupq_n_f32(0.0416598916054f), + vdupq_n_f32(0.500000596046f), + vdupq_n_f32(0.0014122662833f), + vdupq_n_f32(1.00000011921f), + vdupq_n_f32(0.00833693705499f), + vdupq_n_f32(0.166665703058f), + vdupq_n_f32(0.000195780929062f), + }}; + + x = vminq_f32(x, vdupq_n_f32(88.3762626647949f)); + + static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); + static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); + static const float32x4_t CONST_0 = vdupq_n_f32(0.f); + static const int32x4_t CONST_NEGATIVE_14 = vdupq_n_s32(-14); + + int32x4_t m = vcvtq_s32_f32(vmulq_f32(x, CONST_INV_LN2)); + float32x4_t val = vfmsq_f32(x, vcvtq_f32_s32(m), CONST_LN2); + + float32x4_t poly = vtaylor_polyq_f32(val, exp_tab); + + poly = 
vreinterpretq_f32_s32(vqaddq_s32(vreinterpretq_s32_f32(poly), vqshlq_n_s32(m, 23))); + poly = vbslq_f32(vcltq_s32(m, CONST_NEGATIVE_14), CONST_0, poly); + + return poly; +} + +inline float32x4_t vlogq_f32(float32x4_t x) +{ + uint32x4_t ux = vreinterpretq_u32_f32(x); + float32x4_t fx = vcvtq_f32_u32(ux); + // fx * (1.0f / (1 << 23)) + fx = vmulq_f32(fx, vdivq_f32(vdupq_n_f32(1.0f), vcvtq_f32_u32(vshlq_n_u32(vdupq_n_u32(1), 23)))); + + uint32x4_t umx = + vorrq_u32(vandq_u32(ux, vdupq_n_u32(0x007FFFFF)), vshlq_n_u32(vdupq_n_u32(0x7e), 23)); + float32x4_t mx = vreinterpretq_f32_u32(umx); + + const float32x4_t c_124_22551499 = vdupq_n_f32(124.22551499f); + const float32x4_t c_1_498030302 = vdupq_n_f32(1.498030302f); + const float32x4_t c_1_725877999 = vdupq_n_f32(1.72587999f); + const float32x4_t c_0_3520087068 = vdupq_n_f32(0.3520887068f); + + float32x4_t tmp = vdivq_f32(c_1_725877999, vaddq_f32(c_0_3520087068, mx)); + tmp = vaddq_f32(c_124_22551499, tmp); + tmp = vfmaq_f32(tmp, c_1_498030302, mx); + const float32x4_t c_0_69314718 = vdupq_n_f32(0.69314718f); + float32x4_t result_v = vmulq_f32(vsubq_f32(fx, tmp), c_0_69314718); + result_v = vbslq_f32(vcltq_f32(x, vdupq_n_f32(0)), vdupq_n_f32(NAN), result_v); + result_v = vbslq_f32(vceqq_f32(x, vdupq_n_f32(0)), vdupq_n_f32(-INFINITY), result_v); + return result_v; +} + +inline float32x4_t vsigmoidq_f32(float32x4_t x) +{ + float32x4_t one_v = vdupq_n_f32(1.f); + return vrecpeq_f32(vaddq_f32(vexpq_f32_03_percent_error(vnegq_f32(x)), one_v)); +} + +inline float32x4_t vtanhq_f32(float32x4_t x) +{ + float32x4_t one_v = vdupq_n_f32(1.f); + float32x4_t two_v = vdupq_n_f32(2.f); + float32x4_t e_2G_v = vexpq_f32_03_percent_error(vmulq_f32(two_v, x)); + // float32x4_t result_v = vfmsq_f32(one_v, two_v, vrecpeq_f32(vaddq_f32(e_2G_v, one_v))); + float32x4_t result_v = vsubq_f32(one_v, vdivq_f32(two_v, vaddq_f32(one_v, e_2G_v))); + return result_v; +} + +#ifdef _USE_FP16 + +inline float16x8_t vaddq_f16_f32(float16x8_t a, float16x8_t b) +{ +#ifdef _USE_F16_MIX_PRECISION + float32x4_t a0 = vcvt_f32_f16(vget_low_f16(a)); + float32x4_t a1 = vcvt_f32_f16(vget_high_f16(a)); + float32x4_t b0 = vcvt_f32_f16(vget_low_f16(b)); + float32x4_t b1 = vcvt_f32_f16(vget_high_f16(b)); + return vcombine_f16(vcvt_f16_f32(vaddq_f32(a0, b0)), vcvt_f16_f32(vaddq_f32(a1, b1))); +#else + return vaddq_f16(a, b); +#endif +} + +inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const std::array &coeffs) +{ + float16x8_t A = vfmaq_f16(coeffs[0], coeffs[4], x); + float16x8_t B = vfmaq_f16(coeffs[2], coeffs[6], x); + float16x8_t C = vfmaq_f16(coeffs[1], coeffs[5], x); + float16x8_t D = vfmaq_f16(coeffs[3], coeffs[7], x); + float16x8_t x2 = vmulq_f16(x, x); + float16x8_t x4 = vmulq_f16(x2, x2); + float16x8_t res = vfmaq_f16(vfmaq_f16(A, B, x2), vfmaq_f16(C, D, x2), x4); + return res; +} + +inline float16x8_t vexpq_f16_03_percent_error(float16x8_t x) +{ + const std::array exp_tab = {{ + vdupq_n_f16(1.f), + vdupq_n_f16(0.0416598916054f), + vdupq_n_f16(0.500000596046f), + vdupq_n_f16(0.0014122662833f), + vdupq_n_f16(1.00000011921f), + vdupq_n_f16(0.00833693705499f), + vdupq_n_f16(0.166665703058f), + vdupq_n_f16(0.000195780929062f), + }}; + + x = vminq_f16(x, vdupq_n_f16(11.0898664884f)); + + static const float16x8_t CONST_LN2 = vdupq_n_f16(0.6931471805f); + static const float16x8_t CONST_INV_LN2 = vdupq_n_f16(1.4426950408f); + static const float16x8_t CONST_0 = vdupq_n_f16(0.f); + static const int16x8_t CONST_NEGATIVE_14 = vdupq_n_s16(-14); + + int16x8_t m = vcvtq_s16_f16(vmulq_f16(x, 
CONST_INV_LN2)); + float16x8_t val = vfmsq_f16(x, vcvtq_f16_s16(m), CONST_LN2); + + float16x8_t poly = vtaylor_polyq_f16(val, exp_tab); + + poly = vreinterpretq_f16_s16(vqaddq_s16(vreinterpretq_s16_f16(poly), vqshlq_n_s16(m, 10))); + poly = vbslq_f16(vcltq_s16(m, CONST_NEGATIVE_14), CONST_0, poly); + + return poly; +} + +inline float16x8_t vexpq_f16_4_percent_error_half_time(float16x8_t x) +{ + x = vminq_f16(x, vdupq_n_f16(11.0898664884f)); + static const float16x8_t CONST_Y = vdupq_n_f16(1477.3197217792); + static const float16x8_t CONST_B = vdupq_n_f16(15301.3197217792); + float16x8_t in1, in3; + int16x8_t in2; + x = vmaxq_f16(x, vdupq_n_f16(-10)); + in1 = vfmaq_f16(CONST_B, CONST_Y, x); + in2 = vcvtq_s16_f16(in1); + in3 = vreinterpretq_f16_s16(in2); + return in3; +} + +inline float16x8_t vexpq_f16_f32(float16x8_t a) +{ +#ifdef _USE_F16_MIX_PRECISION + float32x4_t a0 = vcvt_f32_f16(vget_low_f16(a)); + float32x4_t a1 = vcvt_f32_f16(vget_high_f16(a)); + return vcombine_f16( + vcvt_f16_f32(vexpq_f32_03_percent_error(a0)), vcvt_f16_f32(vexpq_f32_03_percent_error(a1))); +#else + return vexpq_f16_03_percent_error(a); +#endif +} + +inline float16x8_t vlogq_f16(float16x8_t x) +{ + float32x4_t a0 = vcvt_f32_f16(vget_low_f16(x)); + float32x4_t a1 = vcvt_f32_f16(vget_high_f16(x)); + return vcombine_f16(vcvt_f16_f32(vlogq_f32(a0)), vcvt_f16_f32(vlogq_f32(a1))); +} + +inline float16x8_t vsigmoidq_f16(float16x8_t x) +{ +#ifdef _USE_F16_MIX_PRECISION + float32x4_t x0 = vcvt_f32_f16(vget_low_f16(x)); + float32x4_t x1 = vcvt_f32_f16(vget_high_f16(x)); + float16x8_t y = vcombine_f16(vcvt_f16_f32(vsigmoidq_f32(x0)), vcvt_f16_f32(vsigmoidq_f32(x1))); + return y; +#else + float16x8_t one_v = vdupq_n_f16(1.f); + return vrecpeq_f16(vaddq_f16_f32(vexpq_f16_03_percent_error(vnegq_f16(x)), one_v)); +#endif +} + +inline float16x8_t vtanhq_f16(float16x8_t x) +{ +#ifdef _USE_F16_MIX_PRECISION + float32x4_t x0 = vcvt_f32_f16(vget_low_f16(x)); + float32x4_t x1 = vcvt_f32_f16(vget_high_f16(x)); + float16x8_t y = vcombine_f16(vcvt_f16_f32(vtanhq_f32(x0)), vcvt_f16_f32(vtanhq_f32(x1))); + return y; +#else + float16x8_t one_v = vdupq_n_f16(1.f); + float16x8_t two_v = vdupq_n_f16(2.f); + float16x8_t e_2G_v = vexpq_f16_03_percent_error(vmulq_f16(two_v, x)); + // float16x8_t result_v = vfmsq_f16(one_v, two_v, vrecpeq_f16(vaddq_f16(e_2G_v, one_v))); + float16x8_t result_v = vsubq_f16(one_v, vdivq_f16(two_v, vaddq_f16(one_v, e_2G_v))); + return result_v; +#endif +} + +inline F32 vaddvq_f16(float16x8_t x) +{ + float32x4_t a = vcvt_f32_f16(vget_high_f16(x)); + float32x4_t b = vcvt_f32_f16(vget_low_f16(x)); + F32 sum = vaddvq_f32(vaddq_f32(a, b)); + return sum; +} + +inline void vst1q_lane_f16_builtin(F16 *address, float16x8_t vec, const int laneId) +{ + switch (laneId) { + case 0: + vst1q_lane_f16(address, vec, 0); + break; + case 1: + vst1q_lane_f16(address, vec, 1); + break; + case 2: + vst1q_lane_f16(address, vec, 2); + break; + case 3: + vst1q_lane_f16(address, vec, 3); + break; + case 4: + vst1q_lane_f16(address, vec, 4); + break; + case 5: + vst1q_lane_f16(address, vec, 5); + break; + case 6: + vst1q_lane_f16(address, vec, 6); + break; + case 7: + vst1q_lane_f16(address, vec, 7); + break; + default: + CHECK_REQUIREMENT(0); + } +} +#endif + +#ifdef _USE_INT8 +inline int32x4_t vdotq_laneq_s32_builtin(int32x4_t c, int8x16_t a, int8x16_t b, const int laneId) +{ + int32x4_t ret; + switch (laneId) { + case 0: + ret = vdotq_laneq_s32(c, a, b, 0); + break; + case 1: + ret = vdotq_laneq_s32(c, a, b, 1); + break; + case 2: + ret = 
vdotq_laneq_s32(c, a, b, 2);
+            break;
+        case 3:
+            ret = vdotq_laneq_s32(c, a, b, 3);
+            break;
+        default:
+            CHECK_REQUIREMENT(0);
+            ret = vdupq_n_s32(0);
+            break;
+    }
+    return ret;
+}
+#endif
+#endif diff --git a/common/uni/include/error.h b/common/uni/include/error.h new file mode 100644 index 00000000..af899235 --- /dev/null +++ b/common/uni/include/error.h @@ -0,0 +1,183 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_ERROR
+#define _H_ERROR
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <sys/syscall.h>
+
+#ifdef __GLIBC__
+#define UNI_THREADID pid_t tid = syscall(SYS_gettid);
+#else
+#ifdef _USE_IOS
+#include <pthread.h>
+#define UNI_THREADID                       \
+    uint64_t tid64;                        \
+    pthread_threadid_np(NULL, &tid64);     \
+    pid_t tid = (pid_t)tid64;
+#else
+#define UNI_THREADID pid_t tid = gettid();
+#endif
+#endif
+
+#ifdef _THREAD_SAFE
+extern pthread_mutex_t uniThreadMutex;
+#endif
+
+#ifdef _USE_ANDROID_LOG
+#include <android/log.h>
+#define UNI_LOGD(...)                                                 \
+    {                                                                 \
+        __android_log_print(ANDROID_LOG_DEBUG, "Bolt", __VA_ARGS__);  \
+        printf(__VA_ARGS__);                                          \
+        fflush(stdout);                                               \
+    }
+#define UNI_EXIT
+#else
+#define UNI_LOGD(...)              \
+    {                              \
+        printf(__VA_ARGS__);       \
+        fflush(stdout);            \
+    }
+#define UNI_EXIT exit(1);
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#ifdef _THREAD_SAFE
+#define UNI_THREAD_SAFE(func)                  \
+    pthread_mutex_lock(&uniThreadMutex);       \
+    func;                                      \
+    pthread_mutex_unlock(&uniThreadMutex);
+#else
+#define UNI_THREAD_SAFE(func) func;
+#endif
+#define UNI_CI_LOG(...) printf(__VA_ARGS__);
+#define UNI_INFO_LOG(...)                           \
+    {                                               \
+        UNI_THREADID                                \
+        UNI_THREAD_SAFE({                           \
+            UNI_LOGD("[INFO] thread %d ", tid);     \
+            UNI_LOGD(__VA_ARGS__);                  \
+        })                                          \
+    }
+#define UNI_WARNING_LOG(...)                        \
+    {                                               \
+        UNI_THREADID                                \
+        UNI_THREAD_SAFE({                           \
+            UNI_LOGD("[WARNING] thread %d ", tid);  \
+            UNI_LOGD(__VA_ARGS__);                  \
+        })                                          \
+    }
+#define UNI_ERROR_LOG(...)                          \
+    {                                               \
+        UNI_THREADID                                \
+        UNI_THREAD_SAFE({                           \
+            UNI_LOGD("[ERROR] thread %d ", tid);    \
+            UNI_LOGD(__VA_ARGS__);                  \
+        })                                          \
+        UNI_EXIT;                                   \
+    }
+#ifdef _DEBUG
+#define UNI_DEBUG_LOG(...)                          \
+    {                                               \
+        UNI_THREADID                                \
+        UNI_THREAD_SAFE({                           \
+            UNI_LOGD("[DEBUG] thread %d ", tid);    \
+            UNI_LOGD(__VA_ARGS__);                  \
+        })                                          \
+    }
+#else
+#define UNI_DEBUG_LOG(...)
+#endif +#define CHECK_REQUIREMENT(status) \ + if (!(status)) { \ + UNI_ERROR_LOG("%s %s line %d requirement mismatch\n", __FILE__, __func__, __LINE__); \ + } +#define CHECK_STATUS(ee) \ + { \ + EE status = (ee); \ + if (status != SUCCESS) { \ + UNI_ERROR_LOG( \ + "%s %s line %d got an error: %s\n", __FILE__, __func__, __LINE__, ee2str(status)); \ + } \ + } + +inline void UNI_PROFILE_INFO(const char *name, const char *category, long start, long duration) +{ +#ifdef _PROFILE + int pid = 0; + UNI_THREADID; + UNI_THREAD_SAFE({ + UNI_LOGD("[PROFILE] thread %d ", tid); + UNI_LOGD("{\"name\": \"%s\", \"cat\": \"%s\", \"ph\": \"X\", \"pid\": \"%d\", \"tid\": " + "\"%d\", \"ts\": %ld, \"dur\": %ld},\n", + name, category, pid, tid, start, duration); + }); +#endif +} + +typedef enum { + SUCCESS = 0, + NULL_POINTER = 1, + NOT_MATCH = 2, + NOT_FOUND = 3, + ALLOC_FAILED = 4, + NOT_IMPLEMENTED = 50, + NOT_SUPPORTED = 51, + GCL_ERROR = 52, + FILE_ERROR = 53, + UNKNOWN = 99 +} EE; + +inline const char *ee2str(EE ee) +{ + const char *ret = 0; + switch (ee) { + case SUCCESS: + ret = "SUCCESS"; + break; + case NULL_POINTER: + ret = "Null Pointer"; + break; + case NOT_MATCH: + ret = "Not Match"; + break; + case NOT_FOUND: + ret = "Not Found"; + break; + case NOT_IMPLEMENTED: + ret = "Not Implemented"; + break; + case NOT_SUPPORTED: + ret = "Not Supported"; + break; + case FILE_ERROR: + ret = "Error with file system"; + break; + default: + ret = "Unknown"; + break; + } + return ret; +} +#ifdef __cplusplus +} +#endif + +#endif diff --git a/common/uni/include/graph.h b/common/uni/include/graph.h new file mode 100644 index 00000000..8e3c6081 --- /dev/null +++ b/common/uni/include/graph.h @@ -0,0 +1,294 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
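+
+// Usage sketch for the error.h utilities above (comments only; `load_weights`
+// and "model.bin" are hypothetical):
+//
+//   EE load_weights(const char *path) {
+//       FILE *file = fopen(path, "rb");
+//       if (file == NULL) {
+//           UNI_WARNING_LOG("can not open %s\n", path);
+//           return FILE_ERROR;
+//       }
+//       fclose(file);
+//       return SUCCESS;
+//   }
+//
+//   CHECK_STATUS(load_weights("model.bin"));  // on failure, logs ee2str(...)
+//                                             // with file/line, then UNI_EXIT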
+
+#ifndef UNI_INCLUDE_GRAPH_H_
+#define UNI_INCLUDE_GRAPH_H_
+
+#include <fstream>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#ifdef _USE_XCODE
+#include "coded_stream.h"
+#include "zero_copy_stream_impl.h"
+#include "text_format.h"
+#include "message.h"
+#else
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/io/zero_copy_stream_impl.h>
+#include <google/protobuf/text_format.h>
+#include <google/protobuf/message.h>
+#endif
+
+#include "error.h"
+#include "tensor_desc.h"
+#include "thread_affinity.h"
+
+template <typename ComputeNode, typename DataTensor, typename GraphParameter>
+class Graph {
+public:
+    Graph()
+    {}
+
+    ~Graph()
+    {}
+
+    Graph clone()
+    {
+        UNI_DEBUG_LOG("graph %s clone begin\n", this->name.c_str());
+        Graph graph = *this;
+        for (unsigned int i = 0; i < this->nodes.size(); i++) {
+            // deep-copy the nodes into the clone, not back into *this
+            graph.nodes[i] = this->nodes[i].clone();
+        }
+        CHECK_STATUS(graph.manageDataTensors());
+        CHECK_STATUS(graph.manageTmpBuffer());
+        UNI_DEBUG_LOG("graph %s clone end\n", this->name.c_str());
+        return graph;
+    }
+
+    void init(std::string graphPath)
+    {
+        UNI_DEBUG_LOG("load and build graph from %s begin\n", graphPath.c_str());
+        GraphParameter graphParameter;
+        CHECK_REQUIREMENT(load(graphPath, (google::protobuf::Message *)(&graphParameter)));
+        this->name = graphParameter.name();
+
+        for (int i = 0; i < graphParameter.output_size(); i++) {
+            this->outputs.insert(graphParameter.output(i));
+        }
+        for (int i = 0, index = 0; i < graphParameter.node_size(); i++) {
+            ComputeNode node;
+            auto nodeParameter = graphParameter.node(i);
+            node.setNodeParameter(nodeParameter);
+            if (nodeParameter.type() == std::string("Input")) {
+                DataTensor *tensor = new DataTensor();
+                tensor->resize(extractInputTensorDescFromNode(node));
+                CHECK_REQUIREMENT(nodeParameter.output_size() == 1);
+                this->tensors[nodeParameter.output(0)] = std::shared_ptr<DataTensor>(tensor);
+                continue;
+            }
+
+            this->nodes.push_back(node);
+            index++;
+        }
+        UNI_DEBUG_LOG("load and build graph from %s end\n", graphPath.c_str());
+    }
+
+    EE ready(DataType precision, AffinityPolicy affinityPolicy, int gpuId)
+    {
+        UNI_DEBUG_LOG("graph %s ready begin\n", this->name.c_str());
+        CHECK_STATUS(managePrecision(precision));
+        if (gpuId >= 0) {
+            affinityPolicy = AFFINITY_GPU;
+        }
+        CHECK_STATUS(initInference(affinityPolicy));
+        CHECK_STATUS(manageDataTensors());
+        CHECK_STATUS(manageTmpBuffer());
+        for (unsigned int i = 0; i < this->nodes.size(); i++) {
+            this->nodes[i].ready();
+        }
+        UNI_DEBUG_LOG("graph %s ready end\n", this->name.c_str());
+        return SUCCESS;
+    }
+
+    EE setRuntime(int cpuId, Arch arch)
+    {
+        UNI_DEBUG_LOG(
+            "graph %s setRuntime(core:%d arch:%d) begin\n", this->name.c_str(), cpuId, arch);
+        for (unsigned int i = 0; i < this->nodes.size(); i++) {
+            this->nodes[i].setRuntime(cpuId, arch);
+        }
+        UNI_DEBUG_LOG("graph %s setRuntime end\n", this->name.c_str());
+        return SUCCESS;
+    }
+
+    EE run(std::map<std::string, std::shared_ptr<DataTensor>> tensors)
+    {
+        UNI_DEBUG_LOG("graph %s run begin\n", this->name.c_str());
+        CHECK_STATUS(setData(tensors));
+        for (unsigned int i = 0; i < this->nodes.size(); i++) {
+            this->nodes[i].run();
+        }
+        UNI_DEBUG_LOG("graph %s run end\n", this->name.c_str());
+        return SUCCESS;
+    }
+
+private:
+    std::string name;
+    std::vector<ComputeNode> nodes;
+    std::map<std::string, std::shared_ptr<DataTensor>> tensors;
+    std::shared_ptr<DataTensor> tmpDataTensor;
+    std::set<std::string> outputs;
+
+    bool load(std::string graphPath, google::protobuf::Message *message)
+    {
+        std::ifstream fileStream(graphPath, std::ifstream::in);
+        bool ret = false;
+        if (fileStream.is_open()) {
+            google::protobuf::io::IstreamInputStream input(&fileStream);
+            ret = google::protobuf::TextFormat::Parse(&input, message);
+            fileStream.close();
+        } else {
+            UNI_ERROR_LOG("can not load graph from %s\n", graphPath.c_str());
+        }
+        return ret;
+    }
+
+    
TensorDesc extractInputTensorDescFromNode(ComputeNode node) + { + auto nodeParameter = node.getNodeParameter(); + std::map types = {{"FLOAT32", DT_F32}, {"FLOAT16", DT_F16}, + {"UINT32", DT_U32}, {"INT8", DT_I8}, {"UINT8", DT_U8}}; + std::map formats = { + {"NCHW", DF_NCHW}, {"NCHWC8", DF_NCHWC8}, {"MTK", DF_MTK}, {"NORMAL", DF_NORMAL}}; + TensorDesc desc; + if (types.find(nodeParameter.input_type()) != types.end()) { + desc.dt = types[nodeParameter.input_type()]; + } else { + UNI_ERROR_LOG( + "graph unsupported input data type %s\n", nodeParameter.input_type().c_str()); + } + if (formats.find(nodeParameter.input_format()) != formats.end()) { + desc.df = formats[nodeParameter.input_format()]; + } else { + UNI_ERROR_LOG( + "graph unsupported input data format %s\n", nodeParameter.input_format().c_str()); + } + desc.nDims = nodeParameter.input_dim_size(); + for (unsigned int i = 0; i < desc.nDims; i++) { + desc.dims[i] = nodeParameter.input_dim(desc.nDims - 1 - i); + } + return desc; + } + + EE inferOutputSize() + { + UNI_DEBUG_LOG("graph %s infer output size begin\n", this->name.c_str()); + for (unsigned int i = 0; i < this->nodes.size(); i++) { + CHECK_STATUS(this->nodes[i].inferOutputSize()); + } + UNI_DEBUG_LOG("graph %s infer output size end\n", this->name.c_str()); + return SUCCESS; + } + + EE setNodeInputOutput() + { + UNI_DEBUG_LOG("graph %s set node input and output begin\n", this->name.c_str()); + for (unsigned int i = 0; i < this->nodes.size(); i++) { + auto nodeParameter = this->nodes[i].getNodeParameter(); + std::map> nodeInputs, nodeOutputs; + for (int j = 0; j < nodeParameter.input_size(); j++) { + std::string nodeInputName = nodeParameter.input(j); + nodeInputs[nodeInputName] = tensors[nodeInputName]; + } + this->nodes[i].setInput(nodeInputs); + + for (int j = 0; j < nodeParameter.output_size(); j++) { + std::string nodeOutputName = nodeParameter.output(j); + nodeOutputs[nodeOutputName] = tensors[nodeOutputName]; + } + this->nodes[i].setOutput(nodeOutputs); + } + CHECK_STATUS(inferOutputSize()); + UNI_DEBUG_LOG("graph %s set node input and output end\n", this->name.c_str()); + return SUCCESS; + } + + EE manageDataTensors() + { + UNI_DEBUG_LOG("graph %s manage tensors begin\n", this->name.c_str()); + for (unsigned int i = 0; i < this->nodes.size(); i++) { + auto nodeParameter = this->nodes[i].getNodeParameter(); + for (int j = 0; j < nodeParameter.output_size(); j++) { + DataTensor *tensor = new DataTensor(); + std::string nodeOutputName = nodeParameter.output(j); + this->tensors[nodeOutputName] = std::shared_ptr(tensor); + } + } + CHECK_STATUS(setNodeInputOutput()); + for (auto tensor : this->tensors) { + if (this->outputs.find(tensor.first) == this->outputs.end()) { + tensor.second->alloc(); + } + } + UNI_DEBUG_LOG("graph %s manage tensors end\n", this->name.c_str()); + return SUCCESS; + } + + EE managePrecision(DataType dataType) + { + UNI_DEBUG_LOG("graph %s manage precision(%d) begin\n", this->name.c_str(), dataType); + for (unsigned int i = 0; i < this->nodes.size(); i++) { + this->nodes[i].setPrecision(dataType); + } + UNI_DEBUG_LOG("graph %s manage precision end\n", this->name.c_str()); + return SUCCESS; + } + + EE initInference(AffinityPolicy affinityPolicy) + { + UNI_DEBUG_LOG("graph %s init inference(%d) begin\n", this->name.c_str(), affinityPolicy); + for (unsigned int i = 0; i < this->nodes.size(); i++) { + this->nodes[i].initInference(affinityPolicy); + } + UNI_DEBUG_LOG("graph %s init inference end\n", this->name.c_str()); + return SUCCESS; + } + + unsigned int 
inferTmpBufferSize() + { + UNI_DEBUG_LOG("graph %s infer tmp buffer size begin\n", this->name.c_str()); + unsigned int maxTmpBufferSize = 0; + for (unsigned int i = 0; i < this->nodes.size(); i++) { + unsigned int tmpBufferSize = this->nodes[i].getTmpBufferSize(); + if (tmpBufferSize > maxTmpBufferSize) { + maxTmpBufferSize = tmpBufferSize; + } + } + UNI_DEBUG_LOG("graph %s infer tmp buffer size end\n", this->name.c_str()); + return maxTmpBufferSize; + } + + EE manageTmpBuffer() + { + UNI_DEBUG_LOG("graph %s manage tmp buffer begin\n", this->name.c_str()); + unsigned int maxTmpBufferSize = inferTmpBufferSize(); + this->tmpDataTensor = std::shared_ptr(new DataTensor()); + this->tmpDataTensor->resize(tensor1d(DT_U8, maxTmpBufferSize)); + for (unsigned int i = 0; i < this->nodes.size(); i++) { + this->nodes[i].setTmpBuffer(this->tmpDataTensor); + } + UNI_DEBUG_LOG("graph %s manage tmp buffer end\n", this->name.c_str()); + return SUCCESS; + } + + EE setData(std::map> tensors) + { + UNI_DEBUG_LOG("graph %s set data from upper begin\n", this->name.c_str()); + for (auto tensor : tensors) { + if (this->tensors.find(tensor.first) != this->tensors.end()) { + this->tensors[tensor.first] = tensor.second; + } else { + UNI_ERROR_LOG("graph %s can not find %s to set as input or output\n", + this->name.c_str(), tensor.first.c_str()); + } + } + CHECK_STATUS(setNodeInputOutput()); + UNI_DEBUG_LOG("graph %s set data from upper end\n", this->name.c_str()); + return SUCCESS; + } +}; +#endif // UNI_INCLUDE_GRAPH_H_ diff --git a/model-tools/include/model_print.h b/common/uni/include/model_print.h similarity index 84% rename from model-tools/include/model_print.h rename to common/uni/include/model_print.h index 7d8f95a8..f486e342 100644 --- a/model-tools/include/model_print.h +++ b/common/uni/include/model_print.h @@ -1,29 +1,27 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_MODEL_PRINT #define _H_MODEL_PRINT -#include "model_tools.h" +#include "types.h" #ifdef __cplusplus extern "C" { #endif - void print_header(const ModelSpec ms); -void print_operator_tensor_relationship(const ModelSpec ms, bool delete_deprecated_op=false); +void print_operator_tensor_relationship(const ModelSpec ms, bool delete_deprecated_op = false); void print_weights(const ModelSpec ms); diff --git a/common/uni/include/model_serialize_deserialize.hpp b/common/uni/include/model_serialize_deserialize.hpp new file mode 100644 index 00000000..0c6bd471 --- /dev/null +++ b/common/uni/include/model_serialize_deserialize.hpp @@ -0,0 +1,70 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
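+
+// Usage sketch tying this header to model_print.h above (comments only;
+// "model.bolt" is a hypothetical path):
+//
+//   ModelSpec ms;
+//   CHECK_STATUS(deserialize_model_from_file("model.bolt", &ms));
+//   print_header(ms);
+//   print_operator_tensor_relationship(ms);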
+
+#ifndef _H_MODEL_SERIALIZE_DESERIALIZE
+#define _H_MODEL_SERIALIZE_DESERIALIZE
+
+#include <string>
+#include "types.h"
+
+int get_operator_parameter_size(OperatorType operatorType);
+
+#if defined(_BUILD_TEST) || defined(_USE_CAFFE) || defined(_USE_ONNX) || defined(_USE_TFLITE) || \
+    defined(_USE_TENSORFLOW)
+EE serialize_header(const ModelSpec *spec, std::string *tmp);
+
+EE serialize_operators(const ModelSpec *spec, std::string *tmp);
+
+EE serialize_weights(const ModelSpec *spec, std::string *tmp);
+
+EE serialize_model(const ModelSpec *spec, std::string *bytes);
+
+EE write_to_file(std::string *bytes, const char *fn);
+
+EE serialize_model_to_file(const ModelSpec *spec, const char *fn);
+
+EE ms_datatype_converter(ModelSpec *original_ms,
+    ModelSpec *target_ms,
+    DataConvertType convert_mode,
+    std::string storageMode);
+#endif
+
+EE deserialize_header(char *bytes, ModelSpec *spec, U32 *pos);
+
+EE deserialize_operator(char *bytes, ModelSpec *spec, U32 *pos);
+
+EE deserialize_weight(char *bytes, ModelSpec *spec, U32 *pos);
+
+EE operator_relationship(ModelSpec *spec);
+
+EE deserialize_model_from_file(const char *fn, ModelSpec *spec, bool useFileStream = false);
+
+inline std::string concat_dir_file(std::string dir, std::string file)
+{
+    std::string ret;
+    if (!dir.empty()) {
+        int len = dir.size();
+        char &last = dir.at(len - 1);
+        if ('/' != last) {
+            ret = dir + '/';
+        } else {
+            ret = dir;
+        }
+        ret += file;
+    } else {
+        ret = file;
+    }
+
+    return ret;
+}
+#endif diff --git a/common/uni/include/op_type.h b/common/uni/include/op_type.h new file mode 100644 index 00000000..9804af72 --- /dev/null +++ b/common/uni/include/op_type.h @@ -0,0 +1,127 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
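+
+// OperatorTypeName() below returns a name table indexed by OperatorType, e.g.:
+//
+//   const char *name = OperatorTypeName()[OT_Conv];  // yields "OT_Conv"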
+ +#ifndef _H_OP_TYPE +#define _H_OP_TYPE + +#ifdef __cplusplus +extern "C" { +#endif + +// please add OperatorType and OperatorTypeName at the same time +typedef enum { + OT_None = 0, + OT_Input = 1, + OT_Conv = 2, + OT_Deconvolution = 3, + OT_FC = 4, + OT_RNN = 5, + OT_MatMul = 6, + OT_Resize = 7, + OT_BilateralSliceApply = 8, + OT_Pooling = 9, + OT_Scale = 10, + OT_PRelu = 11, + OT_BatchNorm = 12, + OT_LayerNorm = 13, + OT_L2Normalization = 14, + OT_Reduction = 15, + OT_ArgMax = 16, + OT_Softmax = 17, + OT_SoftmaxWithLoss = 18, + OT_LogSoftmax = 19, + + OT_Clip = 20, + OT_Power = 21, + OT_Sigmoid = 22, + OT_Relu = 23, + OT_Relu6 = 24, + OT_HSwish = 25, + OT_HSigmoid = 26, + OT_Gelu = 27, + OT_TanH = 28, + OT_Mish = 29, + OT_Erf = 30, + + OT_Gather = 31, + OT_Embedding = 32, + OT_Pad = 33, + OT_Eltwise = 34, + OT_Concat = 35, + OT_Slice = 36, + OT_TfSlice = 37, + + OT_Cast = 38, + OT_Shape = 39, + OT_ConstantOfShape = 40, + OT_Transpose = 41, + OT_Reshape = 42, + OT_Squeeze = 43, + OT_Unsqueeze = 44, + OT_Space2Depth = 45, + OT_Depth2Space = 46, + OT_Constant = 47, + + OT_ChannelResize = 48, + OT_PreAllocatedMemory = 49, + OT_SharedWeight = 50, + OT_Copy = 51, + OT_Check = 52, + OT_Repeat = 53, + OT_Jump = 54, + OT_Attention = 55, + OT_AttentionMask = 56, + OT_RelativePositionEmbedding = 57, + OT_RelativeShift = 58, + OT_PriorBox = 59, + OT_DetectionOutput = 60, + OT_Yolov3DetectionOutput = 61, + OT_MultiHeadAttention = 62, + OT_SqDiff = 63, + OT_Tile = 64, + OT_Splice = 65, + OT_Neg = 66, + OT_Greater = 67 // Temporary support for special case +} OperatorType; + +inline const char *const *OperatorTypeName() +{ + static const char *const names[] = {"OT_None", "OT_Input", "OT_Conv", "OT_Deconvolution", + "OT_FC", "OT_RNN", "OT_MatMul", "OT_Resize", "OT_BilateralSliceApply", "OT_Pooling", + + "OT_Scale", "OT_PRelu", "OT_BatchNorm", "OT_LayerNorm", "OT_L2Normalization", + "OT_Reduction", "OT_ArgMax", "OT_Softmax", "OT_SoftmaxWithLoss", "OT_LogSoftmax", + + "OT_Clip", "OT_Power", "OT_Sigmoid", "OT_Relu", "OT_Relu6", "OT_HSwish", "OT_HSigmoid", + "OT_Gelu", "OT_TanH", "OT_Mish", + + "OT_Erf", "OT_Gather", "OT_Embedding", "OT_Pad", "OT_Eltwise", "OT_Concat", "OT_Slice", + "OT_TfSlice", "OT_Cast", "OT_Shape", + + "OT_ConstantOfShape", "OT_Transpose", "OT_Reshape", "OT_Squeeze", "OT_Unsqueeze", + "OT_Space2Depth", "OT_Depth2Space", "OT_Constant", "OT_ChannelResize", + "OT_PreAllocatedMemory", + + "OT_SharedWeight", "OT_Copy", "OT_Check", "OT_Repeat", "OT_Jump", "OT_Attention", + "OT_AttentionMask", "OT_RelativePositionEmbedding", "OT_RelativeShift", "OT_PriorBox", + + "OT_DetectionOutput", "OT_Yolov3DetectionOutput", "OT_MultiHeadAttention", "OT_SqDiff", + "OT_Tile", "OT_Splice", "OT_Neg", "OT_Greater"}; + return names; +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/common/uni/include/parse_command.h b/common/uni/include/parse_command.h new file mode 100644 index 00000000..d5d03c1a --- /dev/null +++ b/common/uni/include/parse_command.h @@ -0,0 +1,312 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_PARSE_COMMAND +#define _H_PARSE_COMMAND +#include +#include +#include +#include "types.h" +#include "error.h" + +#ifdef _USE_FP16 + +inline U32 getBinFileSize(CI8 *dataPath, CI8 *dataName) +{ + std::string filePath = dataPath; + CI8 lastFlag = filePath[filePath.length() - 1]; + if (strcmp(&lastFlag, "/") != 0) { + filePath += "/"; + } + std::string fileName = dataName; + fileName = filePath + fileName; + FILE *file = fopen(fileName.c_str(), "rb"); + if (file == NULL) { + UNI_WARNING_LOG("waring fopen %s failed\n", fileName.c_str()); + return 0; + } + fseek(file, 0, SEEK_END); + U32 size = (U32)ftell(file); + fseek(file, 0, SEEK_SET); + fclose(file); + return size; +} + +inline void writeF16ToF32Bin(F16 *data, U32 num, CI8 *dataPath, CI8 *dataName) +{ + std::string filePath = dataPath; + CI8 lastFlag = filePath[filePath.length() - 1]; + if (strcmp(&lastFlag, "/") != 0) { + filePath += "/"; + } + std::string fileName = dataName; + fileName = filePath + fileName; + FILE *outfile = fopen(fileName.c_str(), "wb"); + if (outfile == NULL) { + UNI_WARNING_LOG("waring fopen %s failed\n", fileName.c_str()); + return; + } + F32 *dataTran = new F32[num]; + for (U32 i = 0; i < num; i++) { + dataTran[i] = (F32)data[i]; + } + fwrite(dataTran, sizeof(float), num, outfile); + fclose(outfile); + delete[] dataTran; +} + +inline void readF32BinToF16(F16 *data, U32 num, CI8 *dataPath, CI8 *dataName) +{ + std::string filePath = dataPath; + CI8 lastFlag = filePath[filePath.length() - 1]; + if (strcmp(&lastFlag, "/") != 0) { + filePath += "/"; + } + std::string fileName = dataName; + fileName = filePath + fileName; + FILE *infile = fopen(fileName.c_str(), "rb"); + if (infile == NULL) { + UNI_WARNING_LOG("waring fopen %s failed\n", fileName.c_str()); + return; + } + F32 *dataTran = new F32[num]; + fread(dataTran, sizeof(float), num, infile); + for (U32 i = 0; i < num; i++) { + data[i] = (F16)dataTran[i]; + } + fclose(infile); + delete[] dataTran; +} + +#endif +const struct option long_options[]{ + {"model", 1, nullptr, 'm'}, + {"inputPath", 1, nullptr, 'i'}, + {"archInfo", 1, nullptr, 'a'}, + {"algoPath", 1, nullptr, 'p'}, + {"imageFormat", 1, nullptr, 'f'}, + {"scaleValue", 1, nullptr, 's'}, + {"topK", 1, nullptr, 't'}, + {"correctLable", 1, nullptr, 'c'}, + {"loopTime", 1, nullptr, 'l'}, + {"subNetworkName", 1, nullptr, 'S'}, + {"help", 1, nullptr, 'h'}, + {"readInputBinName", 1, nullptr, 1}, + {"writeOutputBinName", 1, nullptr, 2}, +}; + +const char 
optstring[] = "m:i:a:p:f:s:t:c:l:S:h:"; + +typedef struct { + std::pair model; + std::pair inputPath; + std::pair archInfo; + std::pair algoPath; + std::pair imageFormat; + std::pair scaleValue; + std::pair topK; + std::pair correctLable; + std::pair loopTime; + std::pair subNetworkName; + std::pair readInputBinName; + std::pair writeOutputBinName; +} ParseRes; +typedef ParseRes *ParseRes_t; + +inline void init_parse_res(ParseRes_t parse_res) +{ + parse_res->model.second = false; + parse_res->inputPath.second = false; + parse_res->archInfo.second = false; + parse_res->algoPath.second = false; + parse_res->imageFormat.second = false; + parse_res->scaleValue.second = false; + parse_res->topK.second = false; + parse_res->correctLable.second = false; + parse_res->loopTime.second = false; + parse_res->subNetworkName.second = false; + parse_res->readInputBinName.second = false; + parse_res->writeOutputBinName.second = false; +} + +inline void help_examples() +{ + std::cout << "<<<<<<<<<<<<<<<<<<<< Parameters specification for examples >>>>>>>>>>>>>>>>>>>>" + << std::endl; + std::cout << "--model " + << " or -m: " + << "--required-- " + << "specific bolt model" << std::endl; + std::cout << "--archInfo " + << " or -a: " + << "--optional-- " + << "specific running arch: CPU_AFFINITY_HIGH_PERFORMANCE/CPU_AFFINITY_LOW_POWER/GPU," + << " the defaule value is CPU_AFFINITY_HIGH_PERFORMANCE" << std::endl; + std::cout << "--inputPath " + << " or -i: " + << "--optional-- " + << "specific file path to read input data" << std::endl; + std::cout << "--algoPath " + << " or -p: " + << "--optional-- " + << "specific file path to read or write algorithm auto tunning result" << std::endl; + std::cout << "--imageFormat " + << " or -f: " + << "--optional-- " + << "specific imageFormat if the input is an image: " + "BGR/RGB/RGB_SC/BGR_SC_RAW/BGR_SC_R," + << " the default value is RGB" << std::endl; + std::cout << "--scaleValue " + << " or -s: " + << "--optional-- " + << "specific scaleValue for image classification, the default value is 1" << std::endl; + std::cout << "--topK " + << " or -t: " + << "--optional-- " + << "specific topK value for image classification, the default value is 5" << std::endl; + std::cout << "--correctLable " + << " or -c: " + << "--optional-- " + << "specific correctLable for image classification, the deault value is -1" + << std::endl; + std::cout << "--loopTime " + << " or -l: " + << "--optional-- " + << "specific loopTime for running set_input + run + get_output, the deault value is 1" + << std::endl; + std::cout << "--subNetworkName" + << " or -S: " + << "--optional-- " + << "specific subNetworkName for:" << std::endl; + std::cout << " asr convolution transformer: encoder/prediction_net_ln/joint_net, the " + "default value is encoder" + << std::endl; + std::cout << " nmt_tsc : encoder/decoder" << std::endl; + std::cout << " tts : " + "encoder_decoder/postnet/melgan_vocoder/tinybert, the default value is " + "encoder_decoder" + << std::endl; + std::cout << "--readInputBinName " + << "--optional-- " + << "specific read input as binary, the binary should be float value with nchw format" + << std::endl; + std::cout << "--writeOutputBinName " + << "--optional-- " + << "specific save output as binary, the binary will be float value with nchw format" + << std::endl; +} + +inline void help(std::string name) +{ + if (name == "examples") { + help_examples(); + } +} + +inline void parseCommandLine(int argc, char *argv[], ParseRes_t parse_res, std::string name) +{ + int c = 0; + int optionIndex; + 
+    ImageFormat imageFormat;
+    std::cout << "[PARAMETERS INFO]:" << std::endl;
+    if (argc == 1) {
+        help(name);
+    }
+    while ((c = getopt_long(argc, argv, optstring, long_options, &optionIndex)) != -1) {
+        switch (c) {
+            case 'm':
+                parse_res->model.first = optarg;
+                parse_res->model.second = true;
+                std::cout << " - " << parse_res->model.first << std::endl;
+                break;
+            case 'i':
+                parse_res->inputPath.first = optarg;
+                parse_res->inputPath.second = true;
+                std::cout << " - " << parse_res->inputPath.first << std::endl;
+                break;
+            case 'a':
+                parse_res->archInfo.first = optarg;
+                parse_res->archInfo.second = true;
+                std::cout << " - " << parse_res->archInfo.first << std::endl;
+                break;
+            case 'p':
+                parse_res->algoPath.first = optarg;
+                parse_res->algoPath.second = true;
+                std::cout << " - " << parse_res->algoPath.first << std::endl;
+                break;
+            case 'f':
+                if (std::string(optarg) == std::string("RGB")) {
+                    imageFormat = RGB;
+                } else if (std::string(optarg) == std::string("BGR")) {
+                    imageFormat = BGR;
+                } else if (std::string(optarg) == std::string("RGB_SC")) {
+                    imageFormat = RGB_SC;
+                } else if (std::string(optarg) == std::string("BGR_SC_RAW")) {
+                    imageFormat = BGR_SC_RAW;
+                } else if (std::string(optarg) == std::string("RGB_SC_RAW")) {
+                    imageFormat = RGB_SC_RAW;
+                } else {
+                    imageFormat = RGB;
+                    std::cout << "Unsupported image format, defaulting to RGB" << std::endl;
+                }
+                parse_res->imageFormat.first = imageFormat;
+                parse_res->imageFormat.second = true;
+                std::cout << " - " << optarg << std::endl;
+                break;
+            case 's':
+                parse_res->scaleValue.first = atof(optarg);
+                parse_res->scaleValue.second = true;
+                std::cout << " - " << parse_res->scaleValue.first << std::endl;
+                break;
+            case 't':
+                parse_res->topK.first = atoi(optarg);
+                parse_res->topK.second = true;
+                std::cout << " - " << parse_res->topK.first << std::endl;
+                break;
+            case 'l':
+                parse_res->loopTime.first = atoi(optarg);
+                parse_res->loopTime.second = true;
+                std::cout << " - " << parse_res->loopTime.first << std::endl;
+                break;
+            case 'c':
+                parse_res->correctLable.first = atoi(optarg);
+                parse_res->correctLable.second = true;
+                std::cout << " - " << parse_res->correctLable.first << std::endl;
+                break;
+            case 'S':
+                parse_res->subNetworkName.first = optarg;
+                parse_res->subNetworkName.second = true;
+                std::cout << " - " << parse_res->subNetworkName.first << std::endl;
+                break;
+            case 1:
+                parse_res->readInputBinName.first = optarg;
+                parse_res->readInputBinName.second = true;
+                std::cout << " - " << parse_res->readInputBinName.first << std::endl;
+                break;
+            case 2:
+                parse_res->writeOutputBinName.first = optarg;
+                parse_res->writeOutputBinName.second = true;
+                std::cout << " - " << parse_res->writeOutputBinName.first << std::endl;
+                break;
+            case 'h':
+                help(name);
+                break;
+            default:
+                help(name);
+                break;
+        }
+    }
+}
+#endif
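A minimal driver for these parsing helpers might look like the sketch below; the surrounding main function and the "examples" name are illustrative, and only -m is marked required by the help text above:

    ParseRes parse_res;
    init_parse_res(&parse_res);
    parseCommandLine(argc, argv, &parse_res, "examples");
    if (!parse_res.model.second) {  // the only --required-- flag
        help("examples");
        return 1;
    }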
diff --git a/common/uni/include/profiling.h b/common/uni/include/profiling.h
new file mode 100644
index 00000000..6d601a28
--- /dev/null
+++ b/common/uni/include/profiling.h
@@ -0,0 +1,49 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_PROFILING
+#define _H_PROFILING
+
+#include "ut_util.h"
+
+std::string extract_class_function(std::string &&pretty_function);
+std::string extract_file_function(std::string &&pretty_function);
+
+#define __CLASS_FUNCTION__ extract_class_function(std::string(__PRETTY_FUNCTION__))
+#define __FILE_FUNCTION__ \
+    extract_file_function(std::string(__FILE__) + "::" + std::string(__FUNCTION__))
+
+void ut_time_init();
+void ut_time_process(
+    const std::string &name, const std::string &category, double time_start_ms, double time_end_ms);
+void ut_time_statistics();
+
+#ifdef _PROFILE_STATISTICS
+#define UNI_TIME_INIT ut_time_init();
+#define UNI_TIME_STATISTICS ut_time_statistics();
+#else
+#define UNI_TIME_INIT
+#define UNI_TIME_STATISTICS
+#endif
+
+#ifdef _PROFILE
+#define UNI_PROFILE(func, name, category) \
+    double profile_time_start_ms = ut_time_ms(); \
+    func; \
+    double profile_time_end_ms = ut_time_ms(); \
+    ut_time_process(name, category, profile_time_start_ms, profile_time_end_ms);
+#else
+#define UNI_PROFILE(func, name, category) func;
+#endif
+#endif  // _H_PROFILING
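A sketch of how these macros wrap a call site, assuming a build with -D_PROFILE and -D_PROFILE_STATISTICS; run_network() stands in for any callable statement. Note that each UNI_PROFILE expansion declares its own timing locals, so two expansions in the same scope need separate blocks:

    UNI_TIME_INIT
    {
        UNI_PROFILE(run_network(), std::string("run_network"), std::string("inference"));
    }
    UNI_TIME_STATISTICS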
diff --git a/common/uni/include/schedule.h b/common/uni/include/schedule.h
new file mode 100644
index 00000000..c9bd0441
--- /dev/null
+++ b/common/uni/include/schedule.h
@@ -0,0 +1,245 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef FLOW_INCLUDE_SCHEDULE_H_
+#define FLOW_INCLUDE_SCHEDULE_H_
+
+#define _USE_WEIGHT_SHARE
+
+#include <map>
+#include <pthread.h>
+#include <queue>
+#include <vector>
+
+#include "graph.h"
+#include "task.h"
+
+template <typename GraphParameter>
+class Schedule {
+public:
+    Schedule()
+    {
+        pthread_mutex_init(&(this->taskQueueLock), NULL);
+        pthread_cond_init(&(this->condition), NULL);
+        this->threadNum = 0;
+        this->stop = false;
+    }
+
+    ~Schedule()
+    {
+        pthread_mutex_lock(&(this->taskQueueLock));
+        pthread_mutex_unlock(&(this->taskQueueLock));
+        pthread_mutex_destroy(&(this->taskQueueLock));
+        pthread_cond_destroy(&(this->condition));
+        delete[] this->threads;
+    }
+
+    int init(std::vector<std::string> graphPath,
+        DataType dataType,
+        AffinityPolicy affinityPolicy,
+        int threadNum,
+        bool useGPU)
+    {
+        UNI_DEBUG_LOG("schedule init begin\n");
+        if (threadNum <= 0) {
+            return 1;
+        }
+        if (pthread_mutex_init(&(this->taskQueueLock), NULL)) {
+            return 1;
+        }
+        if (pthread_cond_init(&(this->condition), NULL)) {
+            return 1;
+        }
+        this->precision = dataType;
+        this->deviceInfo = get_cpu_info(affinityPolicy);
+        this->graphPath = graphPath;
+
+#ifdef _USE_WEIGHT_SHARE
+        for (unsigned int i = 0; i < graphPath.size(); i++) {
+            this->graph[graphPath[i]].init(graphPath[i]);
+            this->graph[graphPath[i]].ready(this->precision, this->deviceInfo.affinityPolicy, -1);
+        }
+#endif
+        int cpuId;
+        if (this->deviceInfo.affinityPolicy == AFFINITY_CPU_LOW_POWER) {
+            cpuId = 3;
+        } else {
+            cpuId = 4;
+        }
+        set_thread_affinity(0, &cpuId, 1);
+        this->threadNum = threadNum;
+        this->threads = new pthread_t[threadNum];
+        for (int i = 0; i < threadNum; i++) {
+            if (pthread_create(this->threads + i, NULL, worker, reinterpret_cast<void *>(this)) !=
+                0) {
+                this->end();
+                UNI_ERROR_LOG("schedule failed to create thread pool\n");
+                return 1;
+            }
+        }
+        this->useGPU = useGPU;
+        UNI_DEBUG_LOG("schedule init end\n");
+        return 0;
+    }
+
+    int end()
+    {
+        UNI_DEBUG_LOG("schedule exit begin\n");
+        if (pthread_mutex_lock(&(this->taskQueueLock)) != 0) {
+            return 1;
+        }
+
+        this->stop = true;
+
+        if ((pthread_cond_broadcast(&(this->condition)) != 0) ||
+            (pthread_mutex_unlock(&(this->taskQueueLock)) != 0)) {
+            return 1;
+        }
+
+        for (int i = 0; i < this->threadNum; i++) {
+            if (pthread_join(this->threads[i], NULL) != 0) {
+                return 1;
+            }
+        }
+        UNI_DEBUG_LOG("schedule exit end\n");
+        return 0;
+    }
+
+    int enqueue(Task *task)
+    {
+        UNI_DEBUG_LOG("schedule enqueue task begin\n");
+        if (this->threadNum == 0 || task == nullptr) {
+            UNI_WARNING_LOG("schedule enqueue task failed because the task is null or the "
+                            "schedule is not initialized\n");
+            return 1;
+        }
+        if (pthread_mutex_lock(&(this->taskQueueLock)) != 0) {
+            UNI_WARNING_LOG("schedule enqueue task failed because the task queue lock can not "
+                            "be acquired\n");
+            return 1;
+        }
+        if (this->stop) {
+            UNI_WARNING_LOG("schedule enqueue task failed because the schedule has ended\n");
+            pthread_mutex_unlock(&(this->taskQueueLock));
+            return 1;
+        }
+        this->taskQueue.push(task);
+        if (pthread_cond_signal(&(this->condition)) != 0) {
+            UNI_WARNING_LOG("schedule enqueue task failed because no worker can be signaled\n");
+            pthread_mutex_unlock(&(this->taskQueueLock));
+            return 1;
+        }
+        pthread_mutex_unlock(&(this->taskQueueLock));
+        UNI_DEBUG_LOG("schedule enqueue task end\n");
+        return 0;
+    }
+
+private:
+    int threadNum;
+    pthread_mutex_t taskQueueLock;
+    std::queue<Task *> taskQueue;
+    pthread_cond_t condition;
+    pthread_t *threads;
+    int stop;
+
+    std::vector<std::string> graphPath;
+    std::map<std::string, Graph<GraphParameter>> graph;
+
+    bool useGPU;
+    DeviceInfo deviceInfo;
+    DataType precision;
+
+    int getThreadId(pthread_t tid)
+    {
+        for (int i = 0; i < this->threadNum; i++) {
+            if (this->threads[i] == tid) {
+                return i;
+            }
+        }
+        return -1;
+    }
+
+    static void *worker(void *_schedule)
+    {
+        Schedule *schedule = reinterpret_cast<Schedule *>(_schedule);
+        int threadId = schedule->getThreadId(pthread_self());
+        UNI_DEBUG_LOG("worker(%d) begin\n", threadId);
+        std::map<std::string, Graph<GraphParameter>> threadPrivateGraph;
+        double timeStart = ut_time_ms();
+#ifdef _USE_WEIGHT_SHARE
+        int gpuId = -1, cpuId = -1;
+        Arch arch = MALI;
+        if (schedule->useGPU && threadId == schedule->threadNum - 1) {
+            gpuId = 0;
+            for (unsigned int i = 0; i < schedule->graphPath.size(); i++) {
+                threadPrivateGraph[schedule->graphPath[i]].init(schedule->graphPath[i]);
+                threadPrivateGraph[schedule->graphPath[i]].ready(
+                    schedule->precision, schedule->deviceInfo.affinityPolicy, gpuId);
+            }
+        }
+        if (gpuId < 0) {
+            if (schedule->deviceInfo.affinityPolicy == AFFINITY_CPU_HIGH_PERFORMANCE) {
+                cpuId = schedule->deviceInfo.cpuNum - 1 - threadId;
+            } else {
+                cpuId = threadId;
+            }
+            arch = schedule->deviceInfo.archs[cpuId];
+            if (threadId == 0) {
+                threadPrivateGraph = schedule->graph;
+                for (unsigned int i = 0; i < schedule->graphPath.size(); i++) {
+                    threadPrivateGraph[schedule->graphPath[i]].setRuntime(cpuId, arch);
+                }
+            } else {
+                for (unsigned int i = 0; i < schedule->graphPath.size(); i++) {
+                    threadPrivateGraph[schedule->graphPath[i]] =
+                        schedule->graph[schedule->graphPath[i]].clone();
+                    threadPrivateGraph[schedule->graphPath[i]].setRuntime(cpuId, arch);
+                }
+            }
+        }
+#else
+        for (unsigned int i = 0; i < schedule->graphPath.size(); i++) {
+            threadPrivateGraph[schedule->graphPath[i]].init(schedule->graphPath[i]);
+            threadPrivateGraph[schedule->graphPath[i]].ready(
+                schedule->precision, schedule->deviceInfo.affinityPolicy, -1);
+            threadPrivateGraph[schedule->graphPath[i]].setRuntime(6, ARM_A76);
+        }
+#endif
+        UNI_DEBUG_LOG("start to wait task\n");
+        double timeEnd = ut_time_ms();
+        UNI_PROFILE_INFO("graphs init", "init", timeStart * 1000, (timeEnd - timeStart) * 1000);
+        while (1) {
+            pthread_mutex_lock(&(schedule->taskQueueLock));
+            while (schedule->taskQueue.empty() && !(schedule->stop)) {
+                pthread_cond_wait(&(schedule->condition), &(schedule->taskQueueLock));
+            }
+            if (schedule->stop) {
+                break;
+            }
+
+            Task *task = nullptr;
+            if (!(schedule->taskQueue.empty())) {
+                task = schedule->taskQueue.front();
+                schedule->taskQueue.pop();
+            }
+            pthread_mutex_unlock(&(schedule->taskQueueLock));
+            if (task != nullptr) {
+                threadPrivateGraph[task->graphPath].run(task->data);
+                task->status = TASK_END;
+            }
+        }
+
+        pthread_mutex_unlock(&(schedule->taskQueueLock));
+        UNI_DEBUG_LOG("worker end\n");
+        pthread_exit(NULL);
+        return (NULL);
+    }
+};
+#endif  // FLOW_INCLUDE_SCHEDULE_H_
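A minimal usage sketch for the thread pool above, assuming the reconstructed GraphParameter template argument from graph.h; the graph path and the empty input map are illustrative:

    Schedule<GraphParameter> schedule;
    std::vector<std::string> graphs = {"/data/local/tmp/graph.prototxt"};  // illustrative path
    schedule.init(graphs, DT_F32, AFFINITY_CPU_HIGH_PERFORMANCE, 2, false);
    std::map<std::string, std::shared_ptr<Tensor>> inputs;  // filled by the caller
    Task task(graphs[0], inputs);
    schedule.enqueue(&task);
    while (task.status != TASK_END) {}  // the worker flips status when the graph has run
    schedule.end();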
diff --git a/common/uni/include/sys.h b/common/uni/include/sys.h
new file mode 100644
index 00000000..097e13c7
--- /dev/null
+++ b/common/uni/include/sys.h
@@ -0,0 +1,54 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_SYS
+#define _H_SYS
+
+#if defined(_USE_GENERAL) || defined(_USE_NEON) || defined(_USE_X86)
+#define _USE_CPU
+#endif
+#define IS_GENERAL(arch) (arch == CPU_GENERAL)
+#define IS_X86_AVX2(arch) (arch == X86_AVX2)
+#define IS_ARM_V7(arch) (arch == ARM_V7)
+#define IS_ARM_V8(arch) (arch == ARM_V8)
+#define IS_ARM_A55(arch) (arch == ARM_A55)
+#define IS_ARM_A76(arch) (arch == ARM_A76)
+#define IS_ARM_LG_V8(arch) (IS_ARM_A55(arch) || IS_ARM_A76(arch))
+#define IS_ARM(arch) (IS_ARM_LG_V8(arch) || IS_ARM_V8(arch) || IS_ARM_V7(arch))
+#define IS_CPU(arch) (IS_GENERAL(arch) || IS_X86_AVX2(arch) || IS_ARM(arch))
+#define IS_MALI_GPU(arch) (arch == MALI)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+    CPU_GENERAL = 1,
+    MALI = 2,
+    ARM_V7 = 3,
+    ARM_V8 = 4,
+    ARM_A55 = 5,
+    ARM_A76 = 6,
+    X86_AVX2 = 7,
+} Arch;
+
+typedef struct {
+    Arch arch;
+    void *archPara;
+} ArchInfo;
+typedef ArchInfo *ArchInfo_t;
+#ifdef __cplusplus
+}
+#endif
+
+#endif
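The Arch enum and IS_* predicates above are the dispatch switches used throughout the kernels; a small sketch with an illustrative value:

    ArchInfo archInfo;
    archInfo.arch = ARM_A76;  // e.g. as reported by CPU detection
    if (IS_ARM_LG_V8(archInfo.arch)) {
        // take the A55/A76 optimized path
    } else if (IS_MALI_GPU(archInfo.arch)) {
        // take the OpenCL path
    }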
diff --git a/common/uni/include/task.h b/common/uni/include/task.h
new file mode 100644
index 00000000..963b7385
--- /dev/null
+++ b/common/uni/include/task.h
@@ -0,0 +1,130 @@
+/**
+ * @file
+ * @brief Task API Document
+ *
+ * @copyright
+ * @code
+ * Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * @endcode
+ */
+
+#ifndef UNI_INCLUDE_TASK_H_
+#define UNI_INCLUDE_TASK_H_
+
+#include <iostream>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensor.hpp"
+#include "profiling.h"
+
+/** task status */
+typedef enum TaskStatus {
+    TASK_CREATE,  ///< task is created
+    TASK_READY,   ///< task can be processed
+    TASK_RUN,     ///< task is being processed
+    TASK_END      ///< task has been finished
+} TaskStatus;
+
+class Task {
+public:
+    /**
+     * @brief Task constructor
+     *
+     * @return
+     */
+    Task()
+    {
+        this->status = TASK_CREATE;
+    }
+
+    /**
+     * @brief Task constructor
+     * @param graphPath predefined flow graph file path
+     * @param data graph input data
+     *
+     * @return
+     */
+    Task(std::string graphPath, std::map<std::string, std::shared_ptr<Tensor>> data)
+    {
+        this->set(ut_time_ms(), graphPath, data, TASK_READY);
+    }
+
+    /**
+     * @brief Task constructor
+     * @param id timestamp of the time-series data
+     * @param graphPath predefined flow graph file path
+     * @param data graph input data map
+     *
+     * @return
+     */
+    Task(int id, std::string graphPath, std::map<std::string, std::shared_ptr<Tensor>> data)
+    {
+        this->set(id, graphPath, data, TASK_READY);
+    }
+
+    /**
+     * @brief Task copy constructor
+     * @param task copy from task to generate new Task
+     *
+     * @return
+     */
+    Task(Task *task)
+    {
+        this->set(task->id, task->graphPath, task->data, task->status);
+    }
+
+    /**
+     * @brief Task set function
+     * @param id timestamp of the time-series data
+     * @param graphPath predefined flow graph file path
+     * @param data graph input data map
+     * @param status task status
+     *
+     * @return
+     */
+    void set(int id,
+        std::string graphPath,
+        std::map<std::string, std::shared_ptr<Tensor>> data,
+        TaskStatus status)
+    {
+        this->id = id;
+        this->graphPath = graphPath;
+        this->data = data;
+        this->status = status;
+    }
+
+    friend std::ostream &operator<<(std::ostream &os, const Task &task)
+    {
+        os << "Task " << task.id << "(timestamp " << task.id << ", status " << task.status
+           << ", graph " << task.graphPath << ", data " << std::endl;
+        for (auto iter : task.data) {
+            os << "tensor name " << iter.first << " " << iter.second->string(1) << std::endl;
+        }
+        os << ")";
+        return os;
+    }
+
+    /** time stamp */
+    int id;
+    /** task status */
+    TaskStatus status;
+    /** predefined flow graph file path */
+    std::string graphPath;
+    /** graph data */
+    std::map<std::string, std::shared_ptr<Tensor>> data;
+};
+#endif  // UNI_INCLUDE_TASK_H_
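A short sketch of constructing and printing a Task; the empty input map is illustrative and would normally carry the graph's named input tensors:

    std::map<std::string, std::shared_ptr<Tensor>> inputs;  // filled by the caller
    Task task(0, "graph.prototxt", inputs);  // explicit timestamp 0
    Task copy(&task);                        // the pointer copy constructor above
    std::cout << copy << std::endl;          // uses the friend operator<<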
diff --git a/common/uni/include/tensor_desc.h b/common/uni/include/tensor_desc.h
new file mode 100644
index 00000000..c2d4ade0
--- /dev/null
+++ b/common/uni/include/tensor_desc.h
@@ -0,0 +1,516 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_TENSOR_DESC
+#define _H_TENSOR_DESC
+#include <limits.h>
+#include <math.h>
+#include <string>
+#include <vector>
+#include "error.h"
+
+#define UNUSED(x) (void)x
+#define UNI_MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define UNI_MAX(a, b) (((a) > (b)) ? (a) : (b))
+#define UNI_ABS(a) (((a) > 0) ? (a) : (-1 * (a)))
+#define UNI_F16_MIN -65504.0f
+#define UNI_F16_MAX 65504.0f
+#define NAME_LEN 128
+#ifdef __cplusplus
extern "C" {
+#endif
+int UNI_ISNAN(float a);
+int UNI_ISINF(float a);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef _USE_X86
+#include <immintrin.h>
+#endif
+#if defined(_USE_NEON) || defined(_USE_MALI)
+#include <arm_neon.h>
+#ifdef __aarch64__
+typedef __fp16 F16;
+#endif
+typedef int8_t INT8;
+#else
+typedef char INT8;
+#endif
+typedef unsigned char U8;
+typedef const unsigned char CU8;
+typedef char I8;
+typedef const char CI8;
+typedef unsigned int U32;
+typedef const unsigned int CU32;
+typedef int I32;
+typedef const int CI32;
+typedef float F32;
+typedef double F64;
+typedef long I64;
+typedef unsigned char BIN8;
+
+typedef enum {
+    RGB_SC = 0,  // scale and center crop
+    RGB = 1,
+    BGR = 2,
+    RGB_RAW = 3,
+    RGB_SC_RAW = 4,
+    BGR_SC_RAW = 5
+} ImageFormat;
+
+typedef enum {
+    DT_U8 = 0,
+    DT_I8 = 1,
+    DT_U32 = 2,
+    DT_I32 = 3,
+    DT_F16 = 4,
+    DT_F16_8Q = 5,
+    DT_F32 = 6,
+    DT_BIN01 = 7,
+    DT_BIN11 = 8,
+    DT_NUM = 9
+} DataType;
+
+inline U32 bytesOf(DataType dt)
+{
+    // for the binary data types, divide the number of elements by 8 first
+    U32 bytes[] = {1, 1, 4, 4, 2, 2, 4, 1, 1, 8};
+    return dt < DT_NUM ? bytes[dt] : 0;
+}
+
+typedef enum {
+    DF_NCHW,
+    DF_NCHWN16,    // vectorize for N=16, for filter
+    DF_NCHWC8,     // vectorize for C=8, for input and output
+    DF_HWNCN16,    // vectorize for N=16, for filter in winograd
+    DF_NHWCN16,    // im2col + GEMM, for filter
+    DF_NHWCN8,     // vectorize for N=8, not used
+    DF_HWNCN8C4,   // int8 filter for winograd
+    DF_NCHWN8C4,   // int8 im2col + GEMM, for filter
+    DF_NCHWN8HW4,  // int8 im2col + GEMM in the first layer, for filter
+    DF_NCHWN16C8,  // bnn im2col + GEMM, for filter
+    DF_NCHWCxN32,  // x86 AVX2 direct conv, for filter
+    DF_NCHWCxN24,  // x86 AVX2 conv 1x1, for filter
+    DF_NCHWC24,    // x86 AVX2 depthwise conv, for filter
+    DF_TRANSPOSE,  // vectorize for COL_MAJOR
+    DF_NORMAL,     // vectorize for ROW_MAJOR
+    DF_MTK,        // RNN input, M: batch, T: step, K: x_dim
+    DF_MKT,        // RNN input, M: batch, T: step, K: x_dim
+    DF_NK,         // MMM/MVM filter, N: col_num, K: row_num
+    DF_NKN16,      // MMM/MVM filter, vectorized for N=16
+    DF_NKN32,      // MMM/MVM filter, vectorized for N=32
+    DF_NKN64,      // MMM/MVM filter, vectorized for N=64
+    DF_NKN32K4,    // int8 MVM filter, vectorized for N=32
+    DF_NCWHC4,     // ocl mali input and output
+    DF_NCHWC3,     // ocl mali support input rgb
+    DF_NHWC,       // ocl mali support input/output
+    DF_NCHWN4C4,   // ocl mali conv filter
+    DF_NCHWN4,     // ocl mali conv filter
+    DF_HWCN,       // ocl mali filter
+    DF_NCWHN4C4,   // ocl mali fc filter
+    DF_NHWCN4,     // ocl mali filter
+    DF_CHWNC4,     // ocl mali filter
+    DF_CHWNC8,     // ocl mali filter
+    DF_CHWNC16,    // ocl mali filter
+    DF_CHWC8_NCN8, // fp32 dw_conv, vectorized for C8 and N8
+    DF_RGB,
+    DF_HWNCN8,     // fp32 filter for winograd
+    DF_NKN24,      // Optimized MMM filter for FP16
+#ifdef __aarch64__
+    DF_NKN12,      // Optimized MMM filter for FP32
+#else
+    DF_NKN8,       // Optimized MMM filter for FP32
+#endif
+    DF_NKN12K4,    // Optimized MMM filter for INT8
+    DF_NCTHW,      // conv 3d
+} DataFormat;
+
+typedef struct {
+    DataType dt = DT_U8;
+    DataFormat df;
+    U32 nDims = 0;
+    U32 dims[6] = {0};
+} TensorDesc;
+
+inline TensorDesc tensor5df(
+    DataType dt, DataFormat df, U32 num, U32 numChannels, U32 height, U32 width, U32 align)
+{
+    TensorDesc ret;
+    ret.dt = dt;
+    ret.df = df;
+    ret.nDims = 5;
+    ret.dims[0] = align;
+    ret.dims[1] = width;
+    ret.dims[2] = height;
+    ret.dims[3] = numChannels;
+    ret.dims[4] = num;
+    return ret;
+}
+
+inline TensorDesc tensor5d(DataType dt, U32 num, U32 numChannels, U32 height, U32 width, U32 align)
+{
+    return tensor5df(dt, DF_NCHW, num, numChannels, height, width, align);
+}
+
+inline TensorDesc tensor4df(
+    DataType dt, DataFormat df, U32 num, U32 numChannels, U32 height, U32 width)
+{
+    TensorDesc ret;
+    ret.dt = dt;
+    ret.df = df;
+    ret.nDims = 4;
+    ret.dims[0] = width;
+    ret.dims[1] = height;
+    ret.dims[2] = numChannels;
+    ret.dims[3] = num;
+    return ret;
+}
+
+inline TensorDesc tensor4d(DataType dt, U32 num, U32 numChannels, U32 height, U32 width)
+{
+    return tensor4df(dt, DF_NCHW, num, numChannels, height, width);
+}
+
+inline TensorDesc tensor3df(DataType dt, DataFormat df, U32 numChannels, U32 height, U32 width)
+{
+    TensorDesc ret = tensor4df(dt, df, 1, numChannels, height, width);
+    ret.nDims = 3;
+    return ret;
+}
+
+inline TensorDesc tensor3d(DataType dt, U32 numChannels, U32 height, U32 width)
+{
+    return tensor3df(dt, DF_NCHW, numChannels, height, width);
+}
+
+inline TensorDesc tensor2df(DataType dt, DataFormat df, U32 numRows, U32 numColumns)
+{
+    TensorDesc ret = tensor3df(dt, df, 1, numRows, numColumns);
+    ret.nDims = 2;
+    return ret;
+}
+
+inline TensorDesc tensor2d(DataType dt, U32 numRows, U32 numColumns)
+{
+    TensorDesc ret = tensor3d(dt, 1, numRows, numColumns);
+    ret.nDims = 2;
+    return ret;
+}
+
+inline TensorDesc tensor1d(DataType dt, U32 len)
+{
+    TensorDesc ret = tensor2d(dt, 1, len);
+    ret.nDims = 1;
+    return ret;
+}
+
+inline EE tensor1dGet(TensorDesc desc, DataType *dt, DataFormat *df, U32 *len)
+{
+    if (nullptr == len || nullptr == dt || nullptr == df) {
+        return NULL_POINTER;
+    }
+    if (1 != desc.nDims) {
+        return NOT_MATCH;
+    }
+
+    *df = desc.df;
+    *dt = desc.dt;
+    *len = desc.dims[0];
+    return SUCCESS;
+}
+
+inline EE tensor2dGet(TensorDesc desc, DataType *dt, DataFormat *df, U32 *numRows, U32 *numColumns)
+{
+    if (nullptr == numColumns || nullptr == numRows || nullptr == dt || nullptr == df) {
+        return NULL_POINTER;
+    }
+    if (2 != desc.nDims) {
+        return NOT_MATCH;
+    }
+
+    *df = desc.df;
+    *dt = desc.dt;
+    *numColumns = desc.dims[0];
+    *numRows = desc.dims[1];
+    return SUCCESS;
+}
+
+inline EE tensor3dGet(
+    TensorDesc desc, DataType *dt, DataFormat *df, U32 *numChannels, U32 *height, U32 *width)
+{
+    if (nullptr == numChannels || nullptr == height || nullptr == width || nullptr == dt ||
+        nullptr == df) {
+        return NULL_POINTER;
+    }
+    if (3 != desc.nDims) {
+        return NOT_MATCH;
+    }
+
+    *dt = desc.dt;
+    *df = desc.df;
+    *width = desc.dims[0];
+    *height = desc.dims[1];
+    *numChannels = desc.dims[2];
+    return SUCCESS;
+}
+
+inline EE tensor4dGet(
+    TensorDesc desc, DataType *dt, DataFormat *df, U32 *num, U32 *numChannels, U32 *height, U32 *width)
+{
+    if (nullptr == num || nullptr == numChannels || nullptr == height || nullptr == width ||
+        nullptr == dt || nullptr == df) {
+        return NULL_POINTER;
+    }
+    if (4 != desc.nDims) {
+        return NOT_MATCH;
+    }
+
+    *dt = desc.dt;
+    *df = desc.df;
+    *width = desc.dims[0];
+    *height = desc.dims[1];
+    *numChannels = desc.dims[2];
+    *num = desc.dims[3];
+    return SUCCESS;
+}
+
+inline EE tensorSelectGet(TensorDesc desc,
+    DataType *dt,
+    DataFormat *df,
+    U32 *num,
+    U32 *numChannels,
+    U32 *height,
+    U32 *width,
+    U32 *time = NULL)
+{
+    U32 ndims = desc.nDims;
+    if (dt) {
+        *dt = desc.dt;
+    }
+    if (df) {
+        *df = desc.df;
+    }
+    if (time && ndims < 5) {
+        *time = 1;
+    }
+    if (desc.df == DF_MKT) {
+        if (num) {
+            *num = desc.dims[2];
+        }
+        if (numChannels) {
+            *numChannels = desc.dims[1];
+        }
+        if (height) {
+            *height = desc.dims[0];
+        }
+        if (width) {
+            *width = 1;
+        }
+    } else if (desc.df == DF_MTK) {
+        if (num) {
+            *num = desc.dims[2];
+        }
+        if (numChannels) {
+            *numChannels = desc.dims[0];
+        }
+        if (height) {
+            *height = desc.dims[1];
+        }
+        if (width) {
+            *width = 1;
+        }
+    } else if (desc.df == DF_NCTHW) {
+        if (width) {
+            *width = desc.dims[0];
+        }
+        if (height) {
+            *height = desc.dims[1];
+        }
+        if (time) {
+            *time = desc.dims[2];
+        }
+        if (numChannels) {
+            *numChannels = desc.dims[3];
+        }
+        if (num) {
+            *num = desc.dims[4];
+        }
+    } else {
+        if (width) {
+            *width = desc.dims[0];
+        }
+        if (height) {
+            *height = (ndims > 1) ? desc.dims[1] : 1;
+        }
+        if (numChannels) {
+            *numChannels = (ndims > 2) ? desc.dims[2] : 1;
+        }
+        if (num) {
+            *num = (ndims > 3) ? desc.dims[3] : 1;
+        }
+    }
+    return SUCCESS;
+}
+
+inline U32 tensorNumElements(TensorDesc desc)
+{
+    if (desc.nDims == 0) {
+        return 0;
+    }
+    U32 ret = 1;
+    for (U32 i = 0; i < desc.nDims; i++) {
+        ret *= desc.dims[i];
+    }
+    return ret;
+}
+
+inline U32 tensorNumBytes(TensorDesc desc)
+{
+    if (desc.dt == DT_BIN01 || desc.dt == DT_BIN11) {
+        return tensorNumElements(desc) / 8;
+    } else {
+        return tensorNumElements(desc) * bytesOf(desc.dt);
+    }
+}
+
+inline U8 tensorIs1d(TensorDesc desc)
+{
+    return 1 == desc.nDims;
+}
+
+inline U8 tensorIs2d(TensorDesc desc)
+{
+    return 2 == desc.nDims;
+}
+
+inline U8 tensorIs3d(TensorDesc desc)
+{
+    return 3 == desc.nDims;
+}
+
+inline U8 tensorIs4d(TensorDesc desc)
+{
+    return 4 == desc.nDims;
+}
+
+inline std::string tensorDesc2Str(TensorDesc desc)
+{
+    std::string descStr = "dt:" + std::to_string(desc.dt) + " df:" + std::to_string(desc.df) +
+        " dims:" + std::to_string(desc.nDims);
+
+    if (desc.nDims > 0) {
+        descStr += "(";
+    }
+    for (I32 i = int(desc.nDims) - 1; i >= 0; i--) {
+        descStr += std::to_string(desc.dims[i]);
+        if (i > 0) {
+            descStr += ",";
+        } else {
+            descStr += ")";
+        }
+    }
+
+    return descStr;
+}
+
+inline int tensorDescIsValid(TensorDesc desc)
+{
+    if (desc.dt < 0 || desc.dt >= DT_NUM) {
+        return 0;
+    }
+
+    if (desc.df < 0 || desc.df > DF_NCTHW) {
+        return 0;
+    }
+
+    if (desc.nDims > 6) {
+        return 0;
+    }
+
+    for (U32 i = 0; i < desc.nDims; i++) {
+        if (desc.dims[i] > INT_MAX) {
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+inline DataFormat getTensorDefaultDataFormat(int nDims)
+{
+    DataFormat df = DF_NORMAL;
+    switch (nDims) {
+        case 2:
+            df = DF_NORMAL;
+            break;
+        case 3:
+            df = DF_MTK;
+            break;
+        case 4:
+            df = DF_NCHW;
+            break;
+        default:
+            break;
+    }
+    return df;
+}
+
+inline std::vector<U32> calculateLocalIndex(U32 index, U32 *dims, U32 nDims)
+{
+    std::vector<U32> indexes(nDims);
+    for (U32 i = 0; i < nDims; i++) {
+        indexes[i] = index % dims[i];
+        index /= dims[i];
+    }
+    return indexes;
+}
+
+inline U32 calculateGlobalIndex(U32 *indexes, U32 *dims, U32 nDims)
+{
+    U32 index = 0;
+    for (int i = ((int)nDims) - 1; i >= 0; i--) {
+        index = index * dims[i] + indexes[i];
+    }
+    return index;
+}
+
+void UNI_memcpy(void *dst, const void *src, int size);
+
+void UNI_init(U32 num, DataType dt, F32 val, void *dst);
+
+EE array_transpose(DataType dt,
+    U32 *inputDims,
+    const void *input,
+    U32 *outputDims,
+    void *output,
+    U32 *transposeDims,
+    int dimsNum);
+
+void transformFromFloat(DataType dataType, float *src, void *dst, int num, float scale = 1.0);
+
+void transformToFloat(DataType dataType, void *src, float *dst, int num, float scale = 1.0);
+
+EE transformToNCHW(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output);
+
+EE transformToNHWC(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output);
+
+EE transformNCHWToNCHWC8(
+    TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output);
+
+EE transformNCHWC8ToNCHWC8ByGroup(
+    TensorDesc inputDesc, const void *input, int group, TensorDesc outputDesc, void *output);
+
+EE transposeFilter(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output);
+#endif
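A small worked example of the descriptor helpers above; the numbers follow directly from the definitions (DT_F32 = 6, DF_NCHW = 0, 4 bytes per FP32 element):

    // a 1x3x224x224 FP32 NCHW descriptor
    TensorDesc desc = tensor4df(DT_F32, DF_NCHW, 1, 3, 224, 224);
    U32 elements = tensorNumElements(desc);          // 150528
    U32 bytes = tensorNumBytes(desc);                // 602112
    std::cout << tensorDesc2Str(desc) << std::endl;  // dt:6 df:0 dims:4(1,3,224,224)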
diff --git a/common/uni/include/thread_affinity.h b/common/uni/include/thread_affinity.h
new file mode 100644
index 00000000..c133f2a0
--- /dev/null
+++ b/common/uni/include/thread_affinity.h
@@ -0,0 +1,535 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_THREAD_AFFINITY
+#define _H_THREAD_AFFINITY
+
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include "sys.h"
+#include "error.h"
+#include "tensor_desc.h"
+
+#ifdef _USE_X86
+#define __cpuid(data, eaxIn, ecxIn) \
+    __asm__ __volatile__("cpuid\n" \
+                         : "=a"(data[0]), "=b"(data[1]), "=c"(data[2]), "=d"(data[3]) \
+                         : "0"(eaxIn), "2"(ecxIn))
+#endif
+
+const int CPU_MAX_NUMBER = 64;
+#ifdef _USE_OPENMP
+const int OMP_NUM_THREADS = 2;
+#else
+const int OMP_NUM_THREADS = 1;
+#endif
+
+typedef enum {
+    AFFINITY_CPU_LOW_POWER = 0,
+    AFFINITY_CPU_HIGH_PERFORMANCE = 1,
+    AFFINITY_GPU = 2
+} AffinityPolicy;
+
+typedef struct CpuStat {
+    unsigned long idle;
+    unsigned long total;
+} CpuStat;
+
+typedef struct DeviceInfo {
+    int cpuNum;
+    Arch archs[CPU_MAX_NUMBER];
+    long freqs[CPU_MAX_NUMBER];
+    float occupys[CPU_MAX_NUMBER];
+    int cpuids[CPU_MAX_NUMBER];
+    CpuStat cpuStats[CPU_MAX_NUMBER];
+
+    float maxOccupy;
+    AffinityPolicy affinityPolicy;
+    Arch schedule;
+} DeviceInfo;
+
+inline const char *const *AffinityPolicyNames()
+{
+    static const char *const names[] = {
+        "CPU_AFFINITY_LOW_POWER", "CPU_AFFINITY_HIGH_PERFORMANCE", "GPU"};
+    return names;
+}
+
+inline const AffinityPolicy *AffinityPolicies()
+{
+    static const AffinityPolicy policies[] = {
+        AFFINITY_CPU_LOW_POWER, AFFINITY_CPU_HIGH_PERFORMANCE, AFFINITY_GPU};
+    return policies;
+}
+
+inline int get_cpus_num()
+{
+#ifdef _USE_IOS
+    return 6;
+#else
+    const int bufferSize = 1024;
+    char buffer[bufferSize];
+    FILE *fp = fopen("/proc/cpuinfo", "rb");
+    if (!fp) {
+        return 1;
+    }
+
+    int cpuNum = 0;
+    while (!feof(fp)) {
+        char *status = fgets(buffer, bufferSize, fp);
+        if (!status) {
+            break;
+        }
+
+        if (memcmp(buffer, "processor", 9) == 0) {
+            cpuNum++;
+        }
+    }
+    fclose(fp);
+    if (cpuNum > CPU_MAX_NUMBER) {
+        cpuNum = CPU_MAX_NUMBER;
+    }
+    return cpuNum;
+#endif
+}
+
+inline void get_cpus_arch(Arch *archs, int cpuNum)
+{
+#ifdef _USE_IOS
+    for (int cpuid = 0; cpuid < cpuNum; cpuid++) {
+        archs[cpuid] = ARM_A76;
+    }
+    return;
+#endif
+    FILE *fp = fopen("/proc/cpuinfo", "rb");
+    *archs = CPU_GENERAL;
+    if (!fp) {
+        return;
+    }
+
+#if defined(_USE_FP32) && defined(_USE_X86)
+    U32 data[4] = {};
+    const U32 &ebx = data[1];
+    const U32 &ecx = data[2];
+
+    const U32 osxsave = 1U << 0;
+    const U32 avx = 1U << 1;
+    const U32 avx2 = 1U << 2;
+
+    U32 cpuArch = 0;
+    __cpuid(data, 0, 0);
+    __cpuid(data, 1, 0);
+    if (ecx & (1U << 27)) {
+        cpuArch |= osxsave;
+    }
+    if (cpuArch & osxsave) {
+        if (ecx & (1U << 28)) {
+            cpuArch |= avx;
+        }
+    }
+    __cpuid(data, 7, 0);
+    if ((cpuArch & avx) && (ebx & (1U << 5))) {
+        cpuArch |= avx2;
+    }
+
+    if (cpuArch & avx2) {
+        archs[0] = X86_AVX2;
+    } else {
+        UNI_WARNING_LOG("AVX2 is not available, using the general implementation.\n");
+    }
+#endif
+
+    int cpuid = 0;
+#ifdef _USE_NEON
+    const int bufferSize = 1024;
+    char buffer[bufferSize];
+    while (!feof(fp)) {
+        char *status = fgets(buffer, bufferSize, fp);
+        if (!status) {
+            break;
+        }
+
+        if (memcmp(buffer, "CPU part", 8) == 0) {
+            Arch arch = ARM_V8;
+            int id = 0;
+            sscanf(buffer, "CPU part\t: %x", &id);
+            switch (id) {
+                case 0xc07:
+                    arch = ARM_V7;
+                    break;
+                case 0xc0f:
+                    arch = ARM_V7;
+                    break;
+                case 0xd01:
+                    arch = ARM_A76;
+                    break;
+                case 0xd03:
+                    arch = ARM_V8;
+                    break;
+                case 0xd05:
+                    arch = ARM_A55;
+                    break;
+                case 0xd07:
+                    arch = ARM_V8;
+                    break;
+                case 0xd08:
+                    arch = ARM_V8;
+                    break;
+                case 0xd09:
+                    arch = ARM_V8;
+                    break;
+                case 0xd0a:
+                    arch = ARM_A76;
+                    break;
+                case 0xd0b:
+                    arch = ARM_A76;
+                    break;
+                case 0xd0d:
+                    arch = ARM_A76;
+                    break;
+                case 0xd40:
+                    arch = ARM_A76;
+                    break;
+                case 0xd41:
+                    arch = ARM_A76;
+                    break;
+                case 0xd44:
+                    arch = ARM_A76;
+                    break;
+                case 0x804:
+                    arch = ARM_A76;
+                    break;
+                case 0x805:
+                    arch = ARM_A55;
+                    break;
+                case 0x802:
+                    arch = ARM_A76;
+                    break;
+                case 0x803:
+                    arch = ARM_A55;
+                    break;
+                case 0x801:
+                    arch = ARM_V8;
+                    break;
+                case 0x800:
+                    arch = ARM_V8;
+                    break;
+                case 0x205:
+                    arch = ARM_V8;
+                    break;
+                default:
+                    UNI_WARNING_LOG("unknown CPU %d arch %x, set to ARM_V8\n", cpuid, id);
+                    break;
+            }
+            archs[cpuid++] = arch;
+        }
+    }
+#endif
+    for (; cpuid < cpuNum; cpuid++) {
+        archs[cpuid] = archs[0];
+    }
+    fclose(fp);
+}
+
+inline long get_cpu_freq(int cpuid)
+{
+    char path[256];
+    FILE *fp = NULL;
+    if (fp == NULL) {
+        snprintf(
+            path, sizeof(path), "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
+        fp = fopen(path, "rb");
+    }
+    if (fp == NULL) {
+        snprintf(
+            path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid);
+        fp = fopen(path, "rb");
+    }
+    if (fp == NULL) {
+        snprintf(
+            path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
+        fp = fopen(path, "rb");
+    }
+
+    long maxFrequency = -1;
+    if (fp == NULL) {
+        printf("[WARNING] can not get CPU max frequency\n");
+    } else {
+        fscanf(fp, "%ld", &maxFrequency);
+        fclose(fp);
+    }
+    return maxFrequency;
+}
+
+inline void get_cpus_freq(long *freqs, int cpuNum)
+{
+    for (int i = 0; i < cpuNum; i++) {
+        freqs[i] = get_cpu_freq(i);
+    }
+}
+
+inline void get_cpus_occupy(CpuStat *cpuStat, float *cpuOccupy, int cpuNum)
+{
+    const int bufferSize = 1024;
+    char buffer[bufferSize];
+    char name[32];
+    unsigned long user, nice, system, idle, iowait, irq, softirq, total;
+    FILE *fp = fopen("/proc/stat", "rb");
+    if (!fp) {
+        for (int i = 0; i < cpuNum; i++) {
+            cpuOccupy[i] = 0;
+        }
+        return;
+    }
+
+    // skip total statistics
+    fgets(buffer, bufferSize, fp);
+
+    for (int i = 0; i < cpuNum; i++) {
+        fgets(buffer, bufferSize, fp);
+        sscanf(buffer, "%s %lu %lu %lu %lu %lu %lu %lu", name, &user, &nice, &system, &idle,
+            &iowait, &irq, &softirq);
+        total = user + nice + system + idle + iowait + irq + softirq;
+        cpuOccupy[i] = 0;
+        if (cpuStat[i].total != 0) {
+            float idleTime = idle - cpuStat[i].idle;
+            float totalTime = total - cpuStat[i].total;
+            if (totalTime != 0) {
+                cpuOccupy[i] = 1.0 - idleTime / totalTime;
+            }
+        }
+        cpuStat[i].idle = idle;
+        cpuStat[i].total = total;
+    }
+    fclose(fp);
+}
+
+inline void swap_variable(void *a, void *b, const int size)
+{
+    char buffer[size];
+    memcpy(buffer, a, size);
+    memcpy(a, b, size);
+    memcpy(b, buffer, size);
+}
+
+inline void disable_cpus(float *occupys, int *cpuids, int cpuNum, float cpuOccupyMax)
+{
+    for (int i = 0; i < cpuNum; i++) {
+        if (occupys[i] > cpuOccupyMax) {
+            cpuids[i] = -1;
+        }
+    }
+}
+
+inline void sort_cpus_by_arch_freq_occupy(
+    Arch *archs, long *freqs, float *occupys, int *cpuids, int cpuNum, float cpuOccupyMax)
+{
+    for (int i = 0; i < cpuNum; i++) {
+        cpuids[i] = i;
+    }
+
+    for (int i = 1; i < cpuNum; i++) {
+        for (int j = i - 1; j >= 0; j--) {
+            if (archs[j + 1] < archs[j]) {
+                swap_variable(&archs[j], &archs[j + 1], sizeof(Arch));
+                swap_variable(&freqs[j], &freqs[j + 1], sizeof(long));
+                swap_variable(&cpuids[j], &cpuids[j + 1], sizeof(int));
+                swap_variable(&occupys[j], &occupys[j + 1], sizeof(float));
+                continue;
+            }
+            if (archs[j + 1] == archs[j]) {
+                if (freqs[j + 1] < freqs[j]) {
+                    swap_variable(&archs[j], &archs[j + 1], sizeof(Arch));
+                    swap_variable(&freqs[j], &freqs[j + 1], sizeof(long));
+                    swap_variable(&cpuids[j], &cpuids[j + 1], sizeof(int));
+                    swap_variable(&occupys[j], &occupys[j + 1], sizeof(float));
+                    continue;
+                }
+                if (freqs[j + 1] >= freqs[j]) {
+                    continue;
+                }
+            }
+            if (archs[j + 1] > archs[j]) {
+                continue;
+            }
+        }
+    }
+    disable_cpus(occupys, cpuids, cpuNum, cpuOccupyMax);
+}
+
+inline int set_thread_affinity(int threadid, const int *cpuids, int num)
+{
+#ifndef _USE_IOS
+#ifdef __GLIBC__
+    pid_t tid = syscall(SYS_gettid);
+#else
+    pid_t tid = gettid();
+#endif
+    cpu_set_t mask;
+    CPU_ZERO(&mask);
+    for (int i = 0; i < num; i++) {
+        UNI_DEBUG_LOG("bind thread %d to core %d\n", threadid, cpuids[i]);
+        CPU_SET(cpuids[i], &mask);
+    }
+    int status = syscall(__NR_sched_setaffinity, tid, sizeof(mask), &mask);
+    if (status) {
+        UNI_WARNING_LOG("fail to set affinity %d\n", status);
+        return -1;
+    }
+#endif
+    return 0;
+}
+
+inline AffinityPolicy thread_affinity_get_policy_by_name(const char *name)
+{
+    int nameLength = strlen(name);
+    for (int i = 0; i < 3; i++) {
+        const char *target = AffinityPolicyNames()[i];
+        int targetLength = strlen(target);
+        if (nameLength < targetLength) {
+            continue;
+        }
+        int match = 1;
+        for (int j = 0; j < targetLength; j++) {
+            if (name[j] == target[j] || name[j] == target[j] + 32) {
+                continue;
+            } else {
+                match = 0;
+                break;
+            }
+        }
+        if (match) {
+            return AffinityPolicies()[i];
+        }
+    }
+    return AFFINITY_CPU_HIGH_PERFORMANCE;
+}
+
+inline Arch thread_affinity_set_by_policy(
+    Arch *archs, int *cpuids, int cpuNum, AffinityPolicy policy, int threadId)
+{
+    if (threadId >= cpuNum) {
+        UNI_WARNING_LOG("can not allocate more cores for thread %d\n", threadId);
+        return CPU_GENERAL;
+    }
+    if (policy == AFFINITY_GPU) {
+        return MALI;
+    }
+#ifndef _USE_OPENMP
+    int cpuid;
+    Arch arch;
+    int i = cpuNum - 1 - threadId;
+    switch (policy) {
+        case AFFINITY_CPU_LOW_POWER: {
+            i = threadId;
+            while (cpuids[i] == -1 && i < cpuNum - 1) {
+                i++;
+            }
+            break;
+        }
+        case AFFINITY_CPU_HIGH_PERFORMANCE: {
+            i = cpuNum - 1 - threadId;
+            while (cpuids[i] == -1 && i > 0) {
+                i--;
+            }
+            break;
+        }
+        default: {
+            break;
+        }
+    }
+    cpuid = cpuids[i];
+    arch = archs[i];
+    set_thread_affinity(threadId, &cpuid, 1);
+#else
+    int index = 0;
+    for (int i = 0; i < cpuNum; i++) {
+        if (policy == AFFINITY_CPU_LOW_POWER && archs[index] > archs[i]) {
+            index = i;
+        }
+        if (policy == AFFINITY_CPU_HIGH_PERFORMANCE && archs[index] < archs[i]) {
+            index = i;
+        }
+    }
+    int count = 0;
+    int candidates[CPU_MAX_NUMBER];
+    for (int i = 0; i < cpuNum; i++) {
+        if (archs[index] == archs[i]) {
+            candidates[count++] = i;
+        }
+    }
+    set_thread_affinity(threadId, candidates, count);
+    Arch arch = archs[index];
+#endif
+    return arch;
+}
+
+inline void thread_affinity_set_by_arch(
+    Arch *archs, int *cpuids, int cpuNum, Arch arch, int threadId)
+{
+    if (threadId >= cpuNum) {
+        UNI_WARNING_LOG("can not allocate more cores for thread %d\n", threadId);
+        return;
+    }
+    if (IS_MALI_GPU(arch)) {
+        return;
+    }
+    int count = 0;
+    int cpuid = -1;
+    for (int i = 0; i < cpuNum; i++) {
+        if (archs[i] == arch && cpuids[i] != -1) {
+            if (count == threadId) {
+                cpuid = cpuids[i];
+                break;
+            } else {
+                count++;
+            }
+        }
+    }
+    if (cpuid != -1) {
+        set_thread_affinity(threadId, &cpuid, 1);
+    } else {
+        UNI_WARNING_LOG("there are not enough cores of arch %d for thread %d\n", arch, threadId);
+    }
+}
+
+inline DeviceInfo get_cpu_info(AffinityPolicy affinityPolicy)
+{
+    DeviceInfo deviceInfo;
+    deviceInfo.affinityPolicy = affinityPolicy;
+    deviceInfo.cpuNum = get_cpus_num();
+    deviceInfo.maxOccupy = 0.5;
+    get_cpus_arch(deviceInfo.archs, deviceInfo.cpuNum);
+    get_cpus_freq(deviceInfo.freqs, deviceInfo.cpuNum);
+    for (int i = 0; i < deviceInfo.cpuNum; i++) {
+        deviceInfo.cpuStats[i].total = 0;
+    }
+    get_cpus_occupy(deviceInfo.cpuStats, deviceInfo.occupys, deviceInfo.cpuNum);
+    return deviceInfo;
+}
+
+inline void set_cpu_dynamic(DeviceInfo *deviceInfo, int threadId)
+{
+    if (deviceInfo->affinityPolicy == AFFINITY_GPU) {
+        deviceInfo->schedule = MALI;
+        return;
+    }
+    get_cpus_occupy(deviceInfo->cpuStats, deviceInfo->occupys, deviceInfo->cpuNum);
+    sort_cpus_by_arch_freq_occupy(deviceInfo->archs, deviceInfo->freqs, deviceInfo->occupys,
+        deviceInfo->cpuids, deviceInfo->cpuNum, deviceInfo->maxOccupy);
+    deviceInfo->schedule = thread_affinity_set_by_policy(deviceInfo->archs, deviceInfo->cpuids,
+        deviceInfo->cpuNum, deviceInfo->affinityPolicy, threadId);
+}
+#endif
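Putting the affinity helpers together, a caller typically samples the device once and then re-pins per thread; a minimal sketch using only the functions defined above:

    DeviceInfo info = get_cpu_info(AFFINITY_CPU_HIGH_PERFORMANCE);
    // re-sample occupancy, sort cores, and pin the calling thread (threadId 0)
    set_cpu_dynamic(&info, 0);
    // info.schedule now names the Arch this thread was bound to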
diff --git a/common/uni/include/types.h b/common/uni/include/types.h
new file mode 100644
index 00000000..6ae1a128
--- /dev/null
+++ b/common/uni/include/types.h
@@ -0,0 +1,618 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_TYPES
+#define _H_TYPES
+
+#include <math.h>
+#include "tensor_desc.h"
+#include "op_type.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static const int sg_boltVersion = 20201120;
+static const int sg_magicNumber = 1141119;
+
+typedef enum { POOLING_MAX, POOLING_MEAN } PoolingMode;
+
+typedef enum { CEIL, FLOOR } RoundMode;
+
+typedef enum {
+    ELTWISE_SUM,
+    ELTWISE_MAX,
+    ELTWISE_MIN,
+    ELTWISE_PROD,
+    ELTWISE_SUB,
+    ELTWISE_DIV,
+    ELTWISE_SQRT,
+    ELTWISE_ERF
+} EltwiseMode;
+
+typedef enum {
+    ACTIVATION_NULL,
+    ACTIVATION_RELU,
+    ACTIVATION_RELU6,
+    ACTIVATION_H_SWISH,
+    ACTIVATION_H_SIGMOID,
+    ACTIVATION_SIGMOID,
+    ACTIVATION_TANH,
+    ACTIVATION_GELU,
+    ACTIVATION_MISH,
+    ACTIVATION_GREATER
+} ActivationMode;
+
+typedef enum { BSliceApply_NULL, BSliceApply_CONV } BilateralSliceApplyMode;
+
+typedef enum {
+    Convolution_Pointwise,
+    Convolution_Dilation,
+    Convolution_Depthwise,
+    Convolution_Depthwise_Pointwise,
+    Convolution_Deconvolution,
+    Convolution_Depthwise_Deconvolution
+} ConvolutionMode;
+
+typedef enum { Pad_Constant, Pad_Reflect, Pad_Edge, Pad_Symmetric } PadMode;
+
+typedef enum { CHECK_EQUAL, CHECK_GREATEQUAL, CHECK_GREAT } CheckMode;
+
+typedef enum {
+    REDUCTION_SUM,
+    REDUCTION_MEAN,
+    REDUCTION_STD_DEVIATION,
+    REDUCTION_SCALAR_PRODUCT
+} ReductionMode;
+
+typedef enum { KeepPrecision, ToFloat, ToInt } CastPrecisionMode;
+
+typedef enum { F32_to_F32, F32_to_F16, F32_to_I8 } DataConvertType;
+
+typedef enum { RNN_RNN, RNN_LSTM, RNN_GRU, RNN_GRU_LBR } RNNMode;
+
+#pragma pack(8)
+typedef struct {
+    ActivationMode mode;
+    float value[4] = {0, 0, 0, 0};
+} ActivationParamSpec;
+
+typedef struct {
+    bool propagate_down;
+} PReLUParamSpec;
+
+typedef enum {
+    CONVOLUTION_NO_TMP_MEM,
+    CONVOLUTION_FASTEST,
+    CONVOLUTION_TUNNING,
+    CONVOLUTION_LIBRARY_SEARCH,
+} ConvolutionPolicy;
+
+typedef enum {
+    CONVOLUTION_ALGORITHM_POINTWISE,
+    CONVOLUTION_ALGORITHM_DIRECT,
+    CONVOLUTION_ALGORITHM_IM2COL_GEMM,
+    CONVOLUTION_ALGORITHM_GEMM,
+    CONVOLUTION_ALGORITHM_GEMM_ICNCHW,
+    CONVOLUTION_ALGORITHM_WINOGRAD,
+    CONVOLUTION_ALGORITHM_BNN,
+    CONVOLUTION_ALGORITHM_DIRECT_SPE_CK,
+    CONVOLUTION_ALGORITHM_GROUP_DECONV,
+    CONVOLUTION_ALGORITHM_NULL
+} ConvolutionForwardAlgorithm;
+
+typedef struct {
+    F32 xmin;
+    F32 ymin;
+    F32 xmax;
+    F32 ymax;
+    U32 label;
+} BoxRect;
+
+typedef struct {
+    U32 label;
+    I64 box_index;
+} BoxInfo;
+
+typedef struct {
+    U32 max_output_boxes_per_class;
+    F32 iou_threshold;
+    F32 score_threshold;
+} NonMaxSuppressionParamSpec;
+
+typedef struct {
+    U32 output_h;
+    U32 output_w;
+    U32 sampling_ratio;
+    F32 spatial_scale;
+} RoiAlignParamSpec;
+
+typedef enum {
+    DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT,
+    DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT,
+    DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT_NO_PADDING,
+    DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_3X3S1P1,
+    DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM,
+    DEPTHWISE_CONVOLUTION_ALGORITHM_NULL
+} DepthwiseConvolutionForwardAlgorithm;
+
+typedef struct {
+    char mode[NAME_LEN];
+    U32 sizes[2];
+    float scales[4];
+    U32 num_sizes;
+    U32 num_scales;
+} ResizeParamSpec;
+
+typedef struct {
+    int gather_axis;
+} GatherParamSpec;
+
+typedef struct {
+    int axes[8];
+    int axes_num;
+} SqueezeParamSpec;
+
+typedef struct {
+    int axes[8];
+    int axes_num;
+} UnsqueezeParamSpec;
+
+typedef struct {
+    CastPrecisionMode castPrecision;
+} CastParamSpec;
+
+typedef struct {
+    int axis;
+    int num_concat;
+} ScaleParamSpec;
+
+typedef struct {
+    float neg_slope;
+} ReLUParamSpec;
+
+typedef struct {
+    float coeff_values[8];
+    int coeff_size;
+} EltwiseSumSpec;
+
+typedef struct {
+    float min;
+    float max;
+} ClipParamSpec;
+
+typedef union {
+    ReLUParamSpec relu_spec;
+    ClipParamSpec clip_spec;
+} ActivationSpec;
+
+typedef struct {
+    EltwiseMode elt_mode;
+    EltwiseSumSpec elt_sum_spec;
+    ActivationMode activation_type;
+    ActivationSpec activation_spec;
+} EltwiseParamSpec;
+
+typedef struct {
+    U32 num_outputs;
+    U32 kernel_t;
+    U32 kernel_h;
+    U32 kernel_w;
+    U32 stride_t;
+    U32 stride_h;
+    U32 stride_w;
+    U32 padding_before;
+    U32 padding_after;
+    U32 padding_top;
+    U32 padding_bottom;
+    U32 padding_left;
+    U32 padding_right;
+    U32 group;
+    U32 dilatedRate_t;
+    U32 dilatedRate_h;
+    U32 dilatedRate_w;
+    U32 num_outputs_origin;
+    ConvolutionMode convolution_type;
+    ActivationMode dw_activation_type;
+    ActivationMode pw_activation_type;
+    ActivationSpec activation_spec;
+} ConvolutionParamSpec;
+
+typedef struct {
+    U32 kernel_t;
+    U32 kernel_h;
+    U32 kernel_w;
+    U32 stride_t;
+    U32 stride_h;
+    U32 stride_w;
+    U32 padding_before;
+    U32 padding_after;
+    U32 padding_top;
+    U32 padding_bottom;
+    U32 padding_left;
+    U32 padding_right;
+    RoundMode rm;
+    PoolingMode mode;
+} PoolingParamSpec;
+
+typedef struct {
+    U32 num_outputs;
+    U32 num_slices;
+    I32 slice_point[32];
+} FullyConnectedParamSpec;
+
+typedef struct {
+    int axis;
+    F32 eps;
+    F32 gama;
+    F32 momentum;
+} BatchNormParamSpec;
+
+typedef struct {
+    U32 before;
+    U32 after;
+    U32 top;
+    U32 bottom;
+    U32 left;
+    U32 right;
+    F32 constant_value;
+    PadMode pad_mode;
+} PadParamSpec;
+
+typedef struct {
+    U32 input_dim;
+    U32 num_output;
+    bool bias_term;
+    bool transpose;
+    int axis;
+} EmbedParamSpec;
+
+typedef struct {
+    float scale;
+    float shift;
+    float power;
+} PowerParamSpec;
+
+typedef struct {
+    I32 shape_dims[8];
+    I32 shape_size;
+    I32 axis;
+    I32 num_axes;
+} ReshapeParamSpec;
+
+typedef struct {
+    I32 slice_points[8];
+    U32 slice_size;
+    I32 axis;
+} SliceParamSpec;
+
+typedef struct {
+    U32 trans_dims[8];
+    U32 trans_size;
+} TransposeParamSpec;
+
+typedef struct {
+    U32 num_heads;
+    U32 from_sequence_length;
+    U32 to_sequence_length;
+} AttentionParamSpec;
+
+typedef struct {
+    RNNMode mode;
+    U32 numOutput;
+    I32 steps;
+    I32 numProjection;
+    float zoneoutCell;
+    float zoneoutOutput;
+
+    bool biDirection;
+    float forgetBias;
+    ActivationMode activationMode;
+} RNNParamSpec;
+
+typedef struct {
+    U32 coefficient_len;
+    BilateralSliceApplyMode mode;
+    bool has_offset;
+} BilateralSliceApplyParamSpec;
+
+typedef struct {
+    I32 axes[8];
+    I32 axes_num;
+    ReductionMode reduction_mode;
+    float coeff;
+    bool keep_dim;
+} ReductionParamSpec;
+
+typedef struct {
+    I32 axis;
+} ArgMaxParamSpec;
+
+typedef struct {
+    I32 src_dims[3];
+    I32 dst_dims[3];
+    I32 length;
+} CopyParamSpec;
+
+typedef struct {
+    CheckMode check_mode;
+} CheckParamSpec;
+
+typedef struct {
+    int loops;
+    int axis;
+} RepeatParamSpec;
+
+typedef struct {
+    TensorDesc desc;
+} PreAllocatedMemoryParamSpec;
+
+typedef struct {
+    TensorDesc desc;
+} SharedWeightParamSpec;
+
+typedef struct {
+    bool transpose_a;
+    bool transpose_b;
+} MatMulParamSpec;
+
+typedef struct {
+    int attention_length;
+    float mask;
+    bool same_length;
+} AttentionMaskParamSpec;
+
+typedef struct {
+    int axis;
+    int shift_length;
+} RelativeShiftParamSpec;
+
+typedef struct {
+    int axis;
+    int num_concat;
+} ConcatParamSpec;
+
+typedef struct {
+    int axis;
+} SoftmaxParamSpec;
+
+typedef struct {
+    int begin[8];
+    int end[8];
+    int strides[8];
+    char begin_mask[8];
+    char end_mask[8];
+    char ellipsis_mask[8];
+    char new_axis_mask[8];
+    char shrink_axis_mask[8];
+    U32 dim_size;
+} TfSliceParamSpec;
+
+typedef struct {
+    F32 min_sizes[2];
+    F32 max_sizes[2];
+    F32 aspect_ratios[2];
+    U32 flip;
+    U32 clip;
+    F32 variances[4];
+    U32 image_h;
+    U32 image_w;
+    F32 step_h;
+    F32 step_w;
+    F32 offset;
+} PriorBoxParamSpec;
+
+typedef struct {
+    U32 num_class;
+    F32 nms_threshold;
+    U32 nms_top_k;
+    U32 keep_top_k;
+    F32 confidence_threshold;
+} DetectionOutputParamSpec;
+
+typedef struct {
+    U32 num_class;
+    U32 num_box;
+    F32 confidence_threshold;
+    F32 nms_threshold;
+    F32 biases[18];
+    U32 anchors_scale[3];
+    U32 mask_group_num;
+    U32 mask[9];
+} Yolov3DetectionOutputParamSpec;
+
+typedef struct {
+    char symmetric[NAME_LEN];
+    int group;
+    int channel_before;
+    int channel_after;
+} ChannelResizeParamSpec;
+
+typedef struct {
+    int blockSize;
+} Space2DepthParamSpec;
+
+typedef struct {
+    int blockSize;
+    I8 reMode[8];
+} Depth2SpaceParamSpec;
+
+typedef struct {
+    int repeatsInfo[8];
+    int dimsSize;
+    int axis;
+} TileParamSpec;
+
+typedef struct {
+    U32 numIndices;
+    int outputDim;
+} SpliceParamSpec;
+
+typedef struct {
+    FullyConnectedParamSpec fc_desc[6];
+    PowerParamSpec power_spec;
+    bool eltwiseWithLayerNormIn[2];
+    ActivationMode actiMode;
+    ReshapeParamSpec reshapeDesc[4];
+    EltwiseParamSpec eltwiseDesc[2];
+} MultiheadAttentionParamSpec;
+
+typedef union ParameterSpec {
+    ParameterSpec()
+    {}
+    ConvolutionParamSpec conv_spec;
+    FullyConnectedParamSpec fc_spec;
+    RNNParamSpec rnn_spec;
+    MatMulParamSpec matmul_spec;
+    ResizeParamSpec resize_spec;
+    BilateralSliceApplyParamSpec bilateral_slice_apply_spec;
+    PoolingParamSpec pooling_spec;
+    ScaleParamSpec scale_spec;
+    BatchNormParamSpec bn_spec;
+    ReductionParamSpec reduction_spec;
+    ArgMaxParamSpec argmax_spec;
+    SoftmaxParamSpec softmax_spec;
+    ClipParamSpec clip_spec;
+    PowerParamSpec power_spec;
+    ReLUParamSpec relu_spec;
+    GatherParamSpec gather_spec;
+    EmbedParamSpec embed_spec;
+    PadParamSpec pad_spec;
+    EltwiseParamSpec eltwise_spec;
+    ConcatParamSpec concat_spec;
+    SliceParamSpec slice_spec;
+    TfSliceParamSpec tfslice_spec;
+    CastParamSpec cast_spec;
+    TransposeParamSpec transpose_spec;
+    ReshapeParamSpec reshape_spec;
+    SqueezeParamSpec squeeze_spec;
+    UnsqueezeParamSpec unsqueeze_spec;
+    Space2DepthParamSpec space2depth_spec;
+    Depth2SpaceParamSpec depth2space_spec;
+    ChannelResizeParamSpec channel_resize_spec;
+    PreAllocatedMemoryParamSpec preallocated_memory_spec;
+    SharedWeightParamSpec shared_weight_spec;
+    CopyParamSpec copy_spec;
+    CheckParamSpec check_spec;
+    RepeatParamSpec repeat_spec;
+    AttentionParamSpec attention_spec;
+    AttentionMaskParamSpec attention_mask_spec;
+    RelativeShiftParamSpec relative_shift_spec;
+    PriorBoxParamSpec prior_box_spec;
+    DetectionOutputParamSpec detection_output_spec;
+    Yolov3DetectionOutputParamSpec yolov3_detection_output_spec;
+    MultiheadAttentionParamSpec multiheadAttention_spec;
+    TileParamSpec tile_spec;
+    SpliceParamSpec splice_spec;
+} ParameterSpec;
+
+typedef struct {
+    int num_scale;
+    F32 *scale;
+} QuantSpec;
+
+typedef struct {
+    I8 name[NAME_LEN];
+    OperatorType type;
+    U32 num_inputs;
+    I8 **input_tensors_name;
+    U32 num_outputs;
+    I8 **output_tensors_name;
+    I32 *tensor_positions;
+    U32 num_quant_feature;
+    QuantSpec *feature_scale;
+    ParameterSpec ps;
+} OperatorSpec;
+
+typedef struct {
+    I8 op_name[NAME_LEN];
+    DataType mdt = DT_U8;
+    U32 bytes_of_weight = 0;
+    U8 *weight;
+    U32 bytes_of_vec = 0;
+    U8 *vec;
+    U32 num_quant_scale;  // Merged FC may have multiple weight scales
+    QuantSpec *weight_scale;
+} WeightSpec;
+
+typedef struct {
+    I8 op[NAME_LEN];
+    U32 num_inputs;
+    I8 **input_op_names;
+    U32 num_outputs;
+    I8 **output_op_names;
+} OperatorRelationshipMapEntry;
+
+typedef struct {
+    I32 version;
+    I32 magic_number;
+
+    I8 model_name[NAME_LEN];
+    DataType dt;
+
+    I32 num_inputs;
+    I8 **input_names;
+    TensorDesc *input_dims;
+
+    I32 num_outputs;
+    I8 **output_names;
+
+    I32 num_operator_specs;
+    OperatorSpec *ops;
+
+    I32 num_weight_specs;
+    WeightSpec *ws;
+
+    I32 num_op_tensor_entries;
+    OperatorRelationshipMapEntry *op_relationship_entries;
+} ModelSpec;
+#pragma pack()
+
+#ifdef __cplusplus
+}
+#endif
+
+OperatorSpec mt_create_operator(
+    const char *name, OperatorType type, U32 num_inputs, U32 num_outputs);
+
+EE mt_insert_operator(ModelSpec *ms, int index, OperatorSpec newOperator);
+
+WeightSpec mt_create_weight(
+    const char *name, DataType dataType, U32 bytesOfWeight, U32 bytesOfVec, U32 numQuantScale);
+
+bool isDeprecatedOp(OperatorType opType);
+
+bool isDeprecatedOpWeight(const ModelSpec *spec, int index);
+
+EE str_copy(I8 *dst, const I8 *src, I32 src_len, I32 dst_len = NAME_LEN);
+
+void *mt_new_storage(size_t size);
+
+inline INT8 round_towards_zero(F32 num, bool clamp = true)
+{
+    INT8 ret;
+    if (clamp) {
+        if (num > 127.0) {
+            return 127;
+        } else if (num < -127.0) {
+            return -127;
+        }
+    }
+    if (num > 0) {
+        ret = floor(num);
+    } else {
+        ret = ceil(num);
+    }
+    return ret;
+}
+
+#endif
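The quantization helper above rounds toward zero and saturates to the signed 8-bit range; a few worked values follow directly from the definition:

    round_towards_zero(3.7f);    // -> 3   (floor of a positive value)
    round_towards_zero(-3.7f);   // -> -3  (ceil of a negative value)
    round_towards_zero(200.0f);  // -> 127 (clamped to the INT8 range)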
+
+#ifndef _H_UT_UTIL
+#define _H_UT_UTIL
+
+#include <iostream>
+#include <stdlib.h>
+#include <string>
+#include <string.h>
+#include <sys/time.h>
+
+#include "sys.h"
+#include "types.h"
+#include "error.h"
+
+#if defined(_USE_NEON)
+const Arch UT_ARCH = ARM_A76;
+#elif defined(_USE_X86)
+const Arch UT_ARCH = X86_AVX2;
+#else
+const Arch UT_ARCH = CPU_GENERAL;
+#endif
+
+// whether to verify results
+const int UT_CHECK = 1;
+
+// loop times to benchmark
+const int UT_LOOPS = 6;
+
+// init data type
+typedef enum UT_RANDOM_TYPE {
+    UT_INIT_RANDOM,  // random
+    UT_INIT_NEG,     // random & < 0
+    UT_INIT_POS,     // random & > 0
+    UT_INIT_ZERO     // 0
+} UT_RANDOM_TYPE;
+
+// generate random data
+inline F32 ut_init_s(DataType dt, UT_RANDOM_TYPE type)
+{
+    if (type == UT_INIT_ZERO) {
+        return 0;
+    }
+
+    F32 s = 0;
+    if (0
+#ifdef _USE_FP32
+        || dt == DT_F32
+#endif
+#ifdef _USE_FP16
+        || dt == DT_F16
+#endif
+    ) {
+        s = rand() % 1000 / 1000.0 - 0.5;
+    } else {
+        s = rand() % 100 - 50;
+    }
+
+    if (type == UT_INIT_NEG) {
+        s = (s > 0) ? (s * -1) : s;
+    }
+    if (type == UT_INIT_POS) {
+        s = (s < 0) ? (s * -1) : s;
+    }
+    return s;
+}
+
+// generate random array
+inline void ut_init_v(U8 *data, U32 len, DataType dt, UT_RANDOM_TYPE type)
+{
+    if (data == nullptr) {
+        return;
+    }
+
+    for (U32 i = 0; i < len; i++) {
+        switch (dt) {
+#ifdef _USE_FP32
+            case DT_F32: {
+                F32 *dataPtr = (F32 *)data;
+                dataPtr[i] = ut_init_s(dt, type);
+                break;
+            }
+#endif
+#ifdef _USE_FP16
+            case DT_F16: {
+                F16 *dataPtr = (F16 *)data;
+                dataPtr[i] = ut_init_s(dt, type);
+                break;
+            }
+#endif
+            case DT_I32: {
+                I32 *dataPtr = (I32 *)data;
+                dataPtr[i] = ut_init_s(dt, type);
+                break;
+            }
+            case DT_U32: {
+                U32 *dataPtr = (U32 *)data;
+                dataPtr[i] = ut_init_s(dt, type);
+                break;
+            }
+            case DT_I8: {
+                INT8 *dataPtr = (INT8 *)data;
+                dataPtr[i] = ut_init_s(dt, type);
+                break;
+            }
+            case DT_BIN11: {
+                BIN8 *dataPtr = (BIN8 *)data;
+                dataPtr[i] = ut_init_s(dt, type);
+                break;
+            }
+            case DT_BIN01: {
+                BIN8 *dataPtr = (BIN8 *)data;
+                dataPtr[i] = ut_init_s(dt, type);
+                break;
+            }
+            default:
+                UNI_ERROR_LOG("unsupported data type in ut_init_v\n");
+        }
+    }
+}
+
+inline U8 *ut_input_v(U32 len, DataType dt, UT_RANDOM_TYPE type)
+{
+    U8 *data = (U8 *)malloc(len * bytesOf(dt));
+    ut_init_v(data, len, dt, type);
+
+    return data;
+}
+
+// unit test element check
+inline void ut_check_s(F32 a, F32 b, F32 threshold, std::string file, int line, int index)
+{
+    if (!((a <= b + threshold) && (a >= b - threshold))) {
+        UNI_ERROR_LOG("check in %s at line %d, %d @ %f %f\n", file.c_str(), line, index, a, b);
+    }
+}
+
+// unit test array check
+inline void ut_check_v(
+    void *A, void *B, U32 len, DataType dt, F32 threshold, std::string file, int line)
+{
+    F32 a = 0, b = 0;
+    for (U32 i = 0; i < len; i++) {
+        switch (dt) {
+#ifdef _USE_FP32
+            case DT_F32:
+                a = ((F32 *)A)[i];
+                b = ((F32 *)B)[i];
+                break;
+#endif
+#ifdef _USE_FP16
+            case DT_F16:
+                a = ((F16 *)A)[i];
+                b = ((F16 *)B)[i];
+                break;
+#endif
+            case DT_I32:
+                a = ((I32 *)A)[i];
+                b = ((I32 *)B)[i];
+                break;
+            case DT_U32:
+                a = ((U32 *)A)[i];
+                b = ((U32 *)B)[i];
+                break;
+            case DT_I8:
+                a = ((INT8 *)A)[i];
+                b = ((INT8 *)B)[i];
+                break;
+            case DT_BIN11:
+                a = ((BIN8 *)A)[i];
+                b = ((BIN8 *)B)[i];
+                break;
+            case DT_BIN01:
+                a = ((BIN8 *)A)[i];
+                b = ((BIN8 *)B)[i];
+                break;
+            default:
+                UNI_ERROR_LOG("unsupported data type in ut_check_v(array, array)\n");
+        }
+        ut_check_s(a, b, threshold, file, line, i);
+    }
+}
+
+inline void ut_check_v(void *A, F32 val, U32 len, DataType dt, std::string file, int line)
+{
+    F32 a;
+    for (U32 i = 0; i < len; i++) {
switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + a = ((F32 *)A)[i]; + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + a = ((F16 *)A)[i]; + break; +#endif + case DT_I32: + a = ((I32 *)A)[i]; + break; + case DT_U32: + a = ((U32 *)A)[i]; + break; + case DT_BIN11: + a = ((BIN8 *)A)[i]; + break; + case DT_BIN01: + a = ((BIN8 *)A)[i]; + break; + default: + UNI_ERROR_LOG("unsupported data type in ut_check_v(array, scalar)\n"); + } + ut_check_s(a, val, 0, file, line, i); + } +} + +inline void ut_check_a(void *A, void *B, U32 len, DataType dt) +{ + U32 e0, e1, e2, e3, e4, e5, e6; + e0 = 0; + e1 = 0; + e2 = 0; + e3 = 0; + e4 = 0; + e5 = 0; + e6 = 0; + F32 a = 1, b = 0, diff; + F32 d0, d1, d2, d3, d4, d5; + F32 maxrel = -1.0; + F32 maxabs = -1.0; + F32 max_a0, max_b0, max_a1, max_b1; + U32 max_n0, max_n1; + switch (dt) { + case DT_F32: + d0 = 1; + d1 = 0.1; + d2 = 0.01; + d3 = 0.001; + d4 = 0.0001; + d5 = 0.00001; + break; +#ifdef _USE_FP16 + case DT_F16: + d0 = 1; + d1 = 0.1; + d2 = 0.01; + d3 = 0.001; + d4 = 0.0001; + d5 = 0.00001; + break; +#endif + case DT_U8: + d0 = 30; + d1 = 20; + d2 = 10; + d3 = 5; + d4 = 3; + d5 = 2; + break; + default: + UNI_ERROR_LOG("unsupported data type in ut_check_a(array, array)\n"); + } + + for (U32 i = 0; i < len; i++) { + switch (dt) { + case DT_F32: + a = ((F32 *)A)[i]; + b = ((F32 *)B)[i]; + break; +#ifdef _USE_FP16 + case DT_F16: + a = ((F16 *)A)[i]; + b = ((F16 *)B)[i]; + break; +#endif + case DT_U8: + a = ((U8 *)A)[i]; + b = ((U8 *)B)[i]; + break; + default: + break; + } + + if (UNI_ISNAN((float)a) || UNI_ISINF((float)a)) { + UNI_ERROR_LOG("nan or inf value in ut_check_a of input A\n"); + return; + } + if (UNI_ISNAN((float)b) || UNI_ISINF((float)b)) { + UNI_ERROR_LOG("nan or inf value in ut_check_a of input B\n"); + return; + } + + diff = a - b; + if (diff < 0) { + diff = -diff; + } + if (diff > maxabs) { + maxabs = diff; + max_a0 = a; + max_b0 = b; + max_n0 = i; + } + F32 tmp = diff * 2 / (a + b + 0.000001); + if (tmp > maxrel) { + maxrel = tmp; + max_a1 = a; + max_b1 = b; + max_n1 = i; + } + if (diff >= d0) { + e0++; + continue; + } + if (diff >= d1) { + e1++; + continue; + } + if (diff >= d2) { + e2++; + continue; + } + if (diff >= d3) { + e3++; + continue; + } + if (diff >= d4) { + e4++; + continue; + } + if (diff >= d5) { + e5++; + continue; + } + e6++; + } + std::cout << "abs(diff) >= " << std::scientific << d0 << " number = " << std::dec << e0 + << std::endl; + std::cout << "abs(diff) >= " << std::scientific << d1 << " number = " << std::dec << e1 + << std::endl; + std::cout << "abs(diff) >= " << std::scientific << d2 << " number = " << std::dec << e2 + << std::endl; + std::cout << "abs(diff) >= " << std::scientific << d3 << " number = " << std::dec << e3 + << std::endl; + std::cout << "abs(diff) >= " << std::scientific << d4 << " number = " << std::dec << e4 + << std::endl; + std::cout << "abs(diff) >= " << std::scientific << d5 << " number = " << std::dec << e5 + << std::endl; + std::cout << "others number = " << e6 << std::endl; + std::cout << "number " << max_n0 << " is " + << "maxabs = " << std::fixed << maxabs << " a = " << max_a0 << " b = " << max_b0 + << std::endl; + std::cout << "number " << max_n1 << " is " + << "maxrel = " << std::fixed << maxrel << " a = " << max_a1 << " b = " << max_b1 + << std::endl; +} +// benchmark time +inline double ut_time_ms() +{ + struct timeval tv; + gettimeofday(&tv, NULL); + double time = tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0; + return time; +} + +inline double ut_time_s() +{ + return ut_time_ms() / 1000.0; 
+} + +// calculate GFLOPS +inline double ut_gflops(double ops, double time_ms) +{ + return 1e-6 * ops / time_ms; +} + +// uniform log message +inline void ut_log(DataType dt, char *call, double ops, double time_ms) +{ + UNI_INFO_LOG("%ubit, %s,\tTIME %10.6lfms,\tGFLOPS %10.6lf\n", (U32)bytesOf(dt) * 8, call, + time_ms, ut_gflops(ops, time_ms)); +} + +inline void initialization_zero(void *ptr, int bytesOfNum) +{ + memset(ptr, 0, bytesOfNum); + return; +} + +#endif diff --git a/common/uni/include/x86_avx2_expand.h b/common/uni/include/x86_avx2_expand.h new file mode 100644 index 00000000..880f2431 --- /dev/null +++ b/common/uni/include/x86_avx2_expand.h @@ -0,0 +1,140 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
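The ut_* helpers above form the whole unit-test harness: generate inputs, time a kernel, verify against a reference, and log GFLOPS. A minimal sketch of a test body; some_kernel is a stand-in for the operator under test and the operation count is made up:

    #include "ut_util.h"

    int test_some_kernel()
    {
        U32 len = 1024;
        U8 *input = ut_input_v(len, DT_F32, UT_INIT_RANDOM);
        U8 *output = ut_input_v(len, DT_F32, UT_INIT_ZERO);
        U8 *reference = ut_input_v(len, DT_F32, UT_INIT_ZERO);
        double start = ut_time_ms();
        for (int i = 0; i < UT_LOOPS; i++) {
            // some_kernel(input, output, len);  // stand-in for the op under test
        }
        double end = ut_time_ms();
        if (UT_CHECK) {
            ut_check_v(output, reference, len, DT_F32, 0.001, __FILE__, __LINE__);
        }
        // operation count (2 * len) is invented for the example
        ut_log(DT_F32, (char *)"some_kernel", 2.0 * len, (end - start) / UT_LOOPS);
        free(input);
        free(output);
        free(reference);
        return 0;
    }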
+
+#ifndef CHEETAH_X86_AVX2_EXPAND_H
+#define CHEETAH_X86_AVX2_EXPAND_H
+#include <immintrin.h>
+#include <math.h>
+#include "types.h"
+#include "error.h"
+
+// horizontal add u32
+inline unsigned int _mm256_hadd_u32(__m256i x)
+{
+    __m128i low = _mm256_extracti128_si256(x, 0);
+    __m128i high = _mm256_extracti128_si256(x, 1);
+    __m128i sum = _mm_add_epi32(low, high);
+    int one = _mm_extract_epi32(sum, 0);
+    int two = _mm_extract_epi32(sum, 1);
+    int three = _mm_extract_epi32(sum, 2);
+    int four = _mm_extract_epi32(sum, 3);
+
+    return (one + two + three + four);
+}
+
+inline __m256 _mm256_log_ps(__m256 x)
+{
+    static const __m256 CONST_one = _mm256_set1_ps(1.0f);
+    static const __m256 CONST_two = _mm256_set1_ps(2.0f);
+    static const __m256 CONST_neg_one = _mm256_set1_ps(-1.0f);
+    F32 i = 30;
+    __m256 n = _mm256_set1_ps(i);
+    __m256 nk = _mm256_add_ps(_mm256_mul_ps(CONST_two, n), CONST_one);
+    x = _mm256_div_ps(_mm256_add_ps(x, CONST_neg_one), _mm256_add_ps(x, CONST_one));
+    __m256 xx = _mm256_mul_ps(x, x);
+    __m256 y = _mm256_div_ps(CONST_one, nk);
+    for (; i > 0; i--) {
+        nk = _mm256_sub_ps(nk, CONST_two);
+        y = _mm256_add_ps(_mm256_div_ps(CONST_one, nk), _mm256_mul_ps(xx, y));
+    }
+
+    y = _mm256_mul_ps(CONST_two, _mm256_mul_ps(x, y));
+    return y;
+}
+
+inline __m256 _mm256_exp_ps(__m256 x)
+{
+    // the max and min x in exp(x) in 32-bit float range
+    __m256 max_upper_bound = _mm256_set1_ps(88.3762626647949f);
+    __m256 min_lower_bound = _mm256_set1_ps(-87.3365447504019f);
+
+    x = _mm256_min_ps(x, max_upper_bound);
+    x = _mm256_max_ps(x, min_lower_bound);
+
+    __m256 t, f, p, r;
+    __m256i i, j;
+
+    const __m256 l2e = _mm256_set1_ps(1.442695041f); /* log2(e) */
+    const __m256 l2h = _mm256_set1_ps(-6.93145752e-1f); /* -log(2)_hi */
+    const __m256 l2l = _mm256_set1_ps(-1.42860677e-6f); /* -log(2)_lo */
+    const __m256 c0 = _mm256_set1_ps(0.008301110f);
+    const __m256 c1 = _mm256_set1_ps(0.041906696f);
+    const __m256 c2 = _mm256_set1_ps(0.166674897f);
+    const __m256 c3 = _mm256_set1_ps(0.499990642f);
+    const __m256 c4 = _mm256_set1_ps(0.999999762f);
+    const __m256 c5 = _mm256_set1_ps(1.000000000f);
+
+    /* exp(x) = 2^i * e^f; i = rint (log2(e) * x), f = x - log(2) * i */
+    t = _mm256_mul_ps(x, l2e); /* t = log2(e) * x */
+    r = _mm256_round_ps(t, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); /* r = rint (t) */
+
+    f = _mm256_fmadd_ps(r, l2h, x); /* x - log(2)_hi * r */
+    f = _mm256_fmadd_ps(r, l2l, f); /* f = x - log(2)_hi * r - log(2)_lo * r */
+
+    i = _mm256_cvtps_epi32(t); /* i = (int)rint(t) */
+
+    /* p ~= exp (f), -log(2)/2 <= f <= log(2)/2 */
+    p = c0; /* c0 */
+    p = _mm256_fmadd_ps(p, f, c1); /* c0*f+c1 */
+    p = _mm256_fmadd_ps(p, f, c2); /* (c0*f+c1)*f+c2 */
+    p = _mm256_fmadd_ps(p, f, c3); /* ((c0*f+c1)*f+c2)*f+c3 */
+    p = _mm256_fmadd_ps(p, f, c4); /* (((c0*f+c1)*f+c2)*f+c3)*f+c4 */
+    p = _mm256_fmadd_ps(p, f, c5); /* ((((c0*f+c1)*f+c2)*f+c3)*f+c4)*f+c5 ~= exp(f) */
+    /* exp(x) = 2^i * p */
+    j = _mm256_slli_epi32(i, 23); /* i << 23 */
+    r = _mm256_castsi256_ps(_mm256_add_epi32(j, _mm256_castps_si256(p))); /* r = p * 2^i */
+
+    return r;
+}
+
+inline __m256 _mm256_sigmod_ps(__m256 x)
+{
+    __m256 one_v = _mm256_set1_ps(1.f);
+    __m256 neg_one_v = _mm256_set1_ps(-1.f);
+    return _mm256_rcp_ps(_mm256_add_ps(_mm256_exp_ps(_mm256_mul_ps(x, neg_one_v)), one_v));
+}
+
+inline __m256 _mm256_tanh_ps(__m256 x)
+{
+    __m256 one_v = _mm256_set1_ps(1.f);
+    __m256 two_v = _mm256_set1_ps(2.f);
+    __m256 e_2G_v = _mm256_exp_ps(_mm256_mul_ps(two_v, x));
+    __m256 result_v = _mm256_sub_ps(one_v, _mm256_div_ps(two_v,
_mm256_add_ps(one_v, e_2G_v))); + return result_v; +} + +// horizontal add, sum array to f32 +inline F32 _mm256_sum_ps(__m256 x) +{ + __m128 low = _mm256_extractf128_ps(x, 0); + __m128 high = _mm256_extractf128_ps(x, 1); + __m128 sum = _mm_hadd_ps(low, high); + low = _mm_hadd_ps(sum, sum); + high = _mm_permute_ps(low, 0b01); + sum = _mm_add_ss(low, high); + return _mm_cvtss_f32(sum); +} + +// horizontal max +inline F32 _mm256_hmax_ps(__m256 x) +{ + __m128 low = _mm256_extractf128_ps(x, 0); + __m128 high = _mm256_extractf128_ps(x, 1); + __m128 max = _mm_max_ps(low, high); + high = _mm_permute_ps(max, 0b1110); + low = _mm_max_ps(max, high); + high = _mm_permute_ps(low, 0b01); + max = _mm_max_ss(low, high); + return _mm_cvtss_f32(max); +} +#endif //CHEETAH_X86_AVX2_EXPAND_H diff --git a/common/uni/src/CMakeLists.txt b/common/uni/src/CMakeLists.txt new file mode 100644 index 00000000..ef8301af --- /dev/null +++ b/common/uni/src/CMakeLists.txt @@ -0,0 +1,14 @@ +file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) + +# shared library +add_library(${PROJECT_NAME} SHARED ${srcs}) + +# static library +add_library(${PROJECT_NAME}_static STATIC ${srcs}) + +set_target_properties(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") +set_target_properties(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +install(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) diff --git a/common/uni/src/model_deserialize.cpp b/common/uni/src/model_deserialize.cpp new file mode 100644 index 00000000..d06929ff --- /dev/null +++ b/common/uni/src/model_deserialize.cpp @@ -0,0 +1,565 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
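The deserializer added below rebuilds a ModelSpec from a flat .bolt image in three passes (header, operators, weights) and then derives op-op relationships. A minimal consumer sketch, assuming the declarations come from model_serialize_deserialize.hpp; the file name is hypothetical:

    #include <stdio.h>
    #include "model_serialize_deserialize.hpp"

    int main()
    {
        ModelSpec spec;
        // false: the first argument is a file path to mmap; true would mean it
        // already points at the model bytes in memory.
        if (SUCCESS == deserialize_model_from_file("model_f32.bolt", &spec, false)) {
            for (I32 i = 0; i < spec.num_operator_specs; i++) {
                printf("op %3d: %s\n", i, spec.ops[i].name);
            }
        }
        return 0;
    }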
+
+#include <fcntl.h>
+#include <map>
+#include <set>
+#include <string>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <vector>
+#include "model_serialize_deserialize.hpp"
+#include "profiling.h"
+
+int get_operator_parameter_size(OperatorType operatorType)
+{
+    std::map<OperatorType, int> operatorParameterSizeMap = {{OT_Conv, sizeof(ConvolutionParamSpec)},
+        {OT_Deconvolution, sizeof(ConvolutionParamSpec)}, {OT_FC, sizeof(FullyConnectedParamSpec)},
+        {OT_RNN, sizeof(RNNParamSpec)}, {OT_MatMul, sizeof(MatMulParamSpec)},
+        {OT_Resize, sizeof(ResizeParamSpec)},
+        {OT_BilateralSliceApply, sizeof(BilateralSliceApplyParamSpec)},
+        {OT_Pooling, sizeof(PoolingParamSpec)}, {OT_Scale, sizeof(ScaleParamSpec)},
+        {OT_BatchNorm, sizeof(BatchNormParamSpec)}, {OT_Reduction, sizeof(ReductionParamSpec)},
+        {OT_ArgMax, sizeof(ArgMaxParamSpec)}, {OT_Softmax, sizeof(SoftmaxParamSpec)},
+        {OT_Clip, sizeof(ClipParamSpec)}, {OT_Power, sizeof(PowerParamSpec)},
+        {OT_Relu, sizeof(ReLUParamSpec)}, {OT_Gather, sizeof(GatherParamSpec)},
+        {OT_Embedding, sizeof(EmbedParamSpec)}, {OT_Pad, sizeof(PadParamSpec)},
+        {OT_Eltwise, sizeof(EltwiseParamSpec)}, {OT_Concat, sizeof(ConcatParamSpec)},
+        {OT_Slice, sizeof(SliceParamSpec)}, {OT_TfSlice, sizeof(TfSliceParamSpec)},
+        {OT_Cast, sizeof(CastParamSpec)}, {OT_Transpose, sizeof(TransposeParamSpec)},
+        {OT_Reshape, sizeof(ReshapeParamSpec)}, {OT_Squeeze, sizeof(SqueezeParamSpec)},
+        {OT_Unsqueeze, sizeof(UnsqueezeParamSpec)}, {OT_Space2Depth, sizeof(Space2DepthParamSpec)},
+        {OT_Depth2Space, sizeof(Depth2SpaceParamSpec)},
+        {OT_ChannelResize, sizeof(ChannelResizeParamSpec)},
+        {OT_PreAllocatedMemory, sizeof(PreAllocatedMemoryParamSpec)},
+        {OT_SharedWeight, sizeof(SharedWeightParamSpec)}, {OT_Copy, sizeof(CopyParamSpec)},
+        {OT_Check, sizeof(CheckParamSpec)}, {OT_Repeat, sizeof(RepeatParamSpec)},
+        {OT_Attention, sizeof(AttentionParamSpec)},
+        {OT_AttentionMask, sizeof(AttentionMaskParamSpec)},
+        {OT_RelativePositionEmbedding, sizeof(EmbedParamSpec)},
+        {OT_RelativeShift, sizeof(RelativeShiftParamSpec)}, {OT_PriorBox, sizeof(PriorBoxParamSpec)},
+        {OT_DetectionOutput, sizeof(DetectionOutputParamSpec)},
+        {OT_Yolov3DetectionOutput, sizeof(Yolov3DetectionOutputParamSpec)},
+        {OT_MultiHeadAttention, sizeof(MultiheadAttentionParamSpec)},
+        {OT_Tile, sizeof(TileParamSpec)}, {OT_Splice, sizeof(SpliceParamSpec)}};
+    int size;
+    if (operatorParameterSizeMap.find(operatorType) == operatorParameterSizeMap.end()) {
+        size = 0;
+    } else {
+        size = operatorParameterSizeMap[operatorType];
+    }
+    return size;
+}
+
+EE operator_relationship(ModelSpec *spec)
+{
+    std::map<std::string, bool> opCanInChange;
+    std::set<std::string> inplaceTensors;
+    std::map<std::string, int> inplaceTensorInNum;
+    std::map<std::string, int> inplaceTensorOutNum;
+    std::map<std::string, std::vector<std::string>> opInTensorNew;
+    std::map<std::string, std::string> opOutTensorNew;
+    std::map<std::string, std::string> tensorOpMapping;
+    std::map<std::string, std::vector<std::string>> tensorFlowsToOpSet;
+
+    for (int i = 0; i < spec->num_operator_specs; i++) {
+        if (spec->ops[i].num_inputs == 1 && spec->ops[i].num_outputs == 1) {
+            std::string inputTensorStr = spec->ops[i].input_tensors_name[0];
+            std::string outputTensorStr = spec->ops[i].output_tensors_name[0];
+            if (inputTensorStr.compare(outputTensorStr) == 0) {
+                inplaceTensors.insert(inputTensorStr);
+                opCanInChange.insert(std::make_pair(inputTensorStr, true));
+            }
+        }
+    }
+
+    for (int i = 0; i < spec->num_operator_specs; i++) {
+        std::string currentOpName = spec->ops[i].name;
+        int in_tensor_number = spec->ops[i].num_inputs;
+        std::vector<std::string> inTensorVec;
+
+        // dealing with the relationship of op -- input tensors
+        for (int j = 0; j < in_tensor_number; j++) {
+            std::string tmpInTensor = spec->ops[i].input_tensors_name[j];
+            if (inplaceTensors.find(tmpInTensor) != inplaceTensors.end()) {  // check whether this tensor is used inplace
+                int inId;
+                if (inplaceTensorInNum.find(tmpInTensor) == inplaceTensorInNum.end()) {
+                    inId = 1;
+                    inplaceTensorInNum.insert(std::make_pair(tmpInTensor, inId));
+                    opCanInChange[tmpInTensor] = true;
+                } else {
+                    if (opCanInChange[tmpInTensor] == false) {
+                        inId = inplaceTensorInNum[tmpInTensor] + 1;
+                        // inplaceTensorInNum.insert(std::make_pair(tmpInTensor, inId));
+                        inplaceTensorInNum[tmpInTensor] = inId;
+                        opCanInChange[tmpInTensor] = true;
+                    } else {
+                        inId = inplaceTensorInNum[tmpInTensor];
+                        opCanInChange[tmpInTensor] = true;
+                    }
+                }
+                std::string tmpInTensorChanged = tmpInTensor + "_" + std::to_string(inId);
+                inTensorVec.push_back(tmpInTensorChanged);
+
+                if (tensorFlowsToOpSet.find(tmpInTensorChanged) == tensorFlowsToOpSet.end()) {
+                    std::vector<std::string> tmpVector;
+                    tmpVector.push_back(currentOpName);
+                    tensorFlowsToOpSet.insert(std::make_pair(tmpInTensorChanged, tmpVector));
+                } else {
+                    tensorFlowsToOpSet[tmpInTensorChanged].push_back(currentOpName);
+                }
+
+            } else {
+                inTensorVec.push_back(tmpInTensor);
+
+                if (tensorFlowsToOpSet.find(tmpInTensor) == tensorFlowsToOpSet.end()) {
+                    std::vector<std::string> tmpVector;
+                    tmpVector.push_back(currentOpName);
+                    tensorFlowsToOpSet.insert(std::make_pair(tmpInTensor, tmpVector));
+                } else {
+                    tensorFlowsToOpSet[tmpInTensor].push_back(currentOpName);
+                }
+            }
+        }
+        opInTensorNew.insert(std::make_pair(currentOpName, inTensorVec));
+
+        // dealing with the relationship of op -- output tensors
+        std::string tmpOutTensor = spec->ops[i].output_tensors_name[0];
+        if (inplaceTensors.find(tmpOutTensor) != inplaceTensors.end()) {
+            // todo
+            int outId;
+            if (inplaceTensorOutNum.find(tmpOutTensor) == inplaceTensorOutNum.end()) {
+                outId = 1;
+                inplaceTensorOutNum.insert(std::make_pair(tmpOutTensor, outId));
+                opCanInChange[tmpOutTensor] = false;
+            } else {
+                outId = inplaceTensorOutNum[tmpOutTensor] + 1;
+                // inplaceTensorOutNum.insert(std::make_pair(tmpOutTensor, outId)); can not update
+                inplaceTensorOutNum[tmpOutTensor] = outId;
+                opCanInChange[tmpOutTensor] = false;
+            }
+            std::string tmpOutTensorChanged = tmpOutTensor + "_" + std::to_string(outId);
+            opOutTensorNew.insert(std::make_pair(currentOpName, tmpOutTensorChanged));
+            tensorOpMapping.insert(std::make_pair(tmpOutTensorChanged, currentOpName));
+        } else {
+            opOutTensorNew.insert(std::make_pair(currentOpName, tmpOutTensor));
+            tensorOpMapping.insert(std::make_pair(tmpOutTensor, currentOpName));
+        }
+    }
+
+    // assign op-op relationship
+    int opNum = spec->num_operator_specs;
+    spec->num_op_tensor_entries = opNum;
+    OperatorSpec *opsPtr2 = spec->ops;
+    OperatorRelationshipMapEntry *oprmePtr = (OperatorRelationshipMapEntry *)mt_new_storage(
+        sizeof(OperatorRelationshipMapEntry) * opNum);
+    spec->op_relationship_entries = oprmePtr;
+    for (int j = 0; j < opNum; j++) {
+        str_copy(oprmePtr[j].op, opsPtr2[j].name, NAME_LEN);
+        int opInOpNum = opInTensorNew[opsPtr2[j].name].size();
+        oprmePtr[j].num_inputs = opInOpNum;
+        oprmePtr[j].input_op_names = (I8 **)mt_new_storage(opInOpNum * sizeof(I8 *));
+        for (int k = 0; k < opInOpNum; k++) {
+            oprmePtr[j].input_op_names[k] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8));
+            std::string ten_name = opInTensorNew[opsPtr2[j].name][k];
+            std::string tensor2op = tensorOpMapping[ten_name];
+            str_copy(oprmePtr[j].input_op_names[k], tensor2op.c_str(), tensor2op.length());
+        }
+
+        int opOutOpNum = tensorFlowsToOpSet[opOutTensorNew[opsPtr2[j].name]].size();
+        oprmePtr[j].num_outputs = opOutOpNum;
+        oprmePtr[j].output_op_names = (I8 **)mt_new_storage(opOutOpNum * sizeof(I8 *));
+        for (int k = 0; k < opOutOpNum; k++) {
+            oprmePtr[j].output_op_names[k] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8));
+            std::string tensor2op = tensorFlowsToOpSet[opOutTensorNew[opsPtr2[j].name]][k];
+            str_copy(oprmePtr[j].output_op_names[k], tensor2op.c_str(), tensor2op.length());
+        }
+    }
+    return SUCCESS;
+}
+
+template <typename T>
+void dequantize_int8_weight(int num, F32 scale, INT8 *q, T *d)
+{
+    F32 factor = 1 / scale;
+    T table[255];
+    int base = -127;
+    for (int i = 0; i < 255; i++) {
+        table[i] = factor * base;
+        base++;
+    }
+    T *mid = table + 127;
+    for (int i = 0; i < num; i++) {
+        d[i] = *(mid + q[i]);
+    }
+}
+
+inline void dequantize_fp16(int num, unsigned short *q, F32 *d)
+{
+#if defined(_USE_NEON) && defined(__aarch64__)
+    F16 *half = (F16 *)q;
+#else
+    U32 *word = (U32 *)d;
+#endif
+
+    for (int i = 0; i < num; i++) {
+#if defined(_USE_NEON) && defined(__aarch64__)
+        d[i] = half[i];
+#else
+        unsigned short value = q[i];
+        unsigned short sign = (value & 0x8000) >> 15;
+        unsigned short exponent = (value & 0x7c00) >> 10;
+        unsigned short significand = value & 0x03FF;
+
+        U32 u;
+        if (exponent == 0) {
+            if (significand == 0) {
+                u = sign << 31;
+            } else {
+                exponent = 0;
+                while (0 == (significand & 0x200)) {
+                    significand <<= 1;
+                    exponent++;
+                }
+                significand <<= 1;
+                significand &= 0x3FF;
+                u = (sign << 31) | ((-exponent + (-15 + 127)) << 23) | (significand << 13);
+            }
+        } else if (exponent == 0x1F) {
+            u = (sign << 31) | (0xFF << 23) | (significand << 13);
+        } else {
+            u = (sign << 31) | ((exponent + (-15 + 127)) << 23) | (significand << 13);
+        }
+        word[i] = u;
+#endif
+    }
+}
+
+EE deserialize_header(const char *bytes, ModelSpec *spec, U32 *pos)
+{
+    const char *pointer = bytes + *pos;
+    memcpy(&spec->version, pointer, sizeof(I32));
+    pointer += sizeof(I32);
+    *pos += sizeof(I32);
+    if (spec->version != sg_boltVersion) {
+        UNI_ERROR_LOG("X2bolt version is [%d], but your model version is [%d].\n"
+            "Please update X2bolt to version [%d].\n",
+            sg_boltVersion, spec->version, spec->version);
+        CHECK_STATUS(NOT_MATCH);
+        return NOT_MATCH;
+    }
+
+    memcpy(&spec->magic_number, pointer, sizeof(I32));
+    pointer += sizeof(I32);
+    *pos += sizeof(I32);
+    if (spec->magic_number != sg_magicNumber) {
+        UNI_ERROR_LOG(
+            "magic_number does not match: code %d, bolt model %d\n", sg_magicNumber, spec->magic_number);
+        CHECK_STATUS(NOT_MATCH);
+        return NOT_MATCH;
+    }
+
+    str_copy(spec->model_name, pointer, NAME_LEN);
+    pointer += NAME_LEN;
+    *pos += NAME_LEN;
+
+    spec->dt = *((DataType *)pointer);
+    pointer += sizeof(DataType);
+    *pos += sizeof(DataType);
+
+    spec->num_inputs = *((I32 *)pointer);
+    pointer += sizeof(I32);
+    *pos += sizeof(I32);
+
+    spec->input_names = (I8 **)mt_new_storage(spec->num_inputs * sizeof(I8 *));
+    for (int i = 0; i < spec->num_inputs; i++) {
+        spec->input_names[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8));
+        str_copy(spec->input_names[i], pointer, NAME_LEN);
+        pointer += NAME_LEN;
+        *pos += NAME_LEN;
+    }
+
+    spec->input_dims = (TensorDesc *)mt_new_storage(spec->num_inputs * sizeof(TensorDesc));
+    memcpy(spec->input_dims, pointer, spec->num_inputs * sizeof(TensorDesc));
+    pointer += spec->num_inputs * sizeof(TensorDesc);
+    *pos += spec->num_inputs * sizeof(TensorDesc);
+
+    spec->num_outputs = *((I32 *)pointer);
+    pointer += sizeof(I32);
+    *pos += sizeof(I32);
+
+    spec->output_names = (I8 **)mt_new_storage(spec->num_outputs * sizeof(I8 *));
+    for (int i = 0; i <
spec->num_outputs; i++) { + spec->output_names[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(spec->output_names[i], pointer, NAME_LEN); + pointer += NAME_LEN; + *pos += NAME_LEN; + } + return SUCCESS; +} + +EE deserialize_operator(const char *bytes, ModelSpec *spec, U32 *pos) +{ + const char *pointer = bytes + *pos; + I32 *p4numOperatorSpecs = (I32 *)pointer; + spec->num_operator_specs = *p4numOperatorSpecs; + pointer += sizeof(U32); + *pos += sizeof(U32); + + OperatorSpec *ptr = + (OperatorSpec *)mt_new_storage(spec->num_operator_specs * sizeof(OperatorSpec)); + spec->ops = ptr; + for (int i = 0; i < spec->num_operator_specs; i++) { + str_copy(ptr[i].name, pointer, NAME_LEN); + pointer += NAME_LEN * sizeof(I8); + *pos += NAME_LEN * sizeof(I8); + + ptr[i].type = *((OperatorType *)pointer); + pointer += sizeof(OperatorType); + *pos += sizeof(OperatorType); + + ptr[i].num_inputs = *((U32 *)pointer); + pointer += sizeof(U32); + *pos += sizeof(U32); + + ptr[i].input_tensors_name = (I8 **)mt_new_storage(ptr[i].num_inputs * sizeof(I8 *)); + for (U32 j = 0; j < ptr[i].num_inputs; j++) { + ptr[i].input_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(ptr[i].input_tensors_name[j], pointer, NAME_LEN); + pointer += NAME_LEN * sizeof(I8); + *pos += NAME_LEN * sizeof(I8); + } + + ptr[i].num_outputs = *((U32 *)pointer); + pointer += sizeof(U32); + *pos += sizeof(U32); + + ptr[i].output_tensors_name = (I8 **)mt_new_storage(ptr[i].num_outputs * sizeof(I8 *)); + for (U32 j = 0; j < ptr[i].num_outputs; j++) { + ptr[i].output_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(ptr[i].output_tensors_name[j], pointer, NAME_LEN); + pointer += NAME_LEN * sizeof(I8); + *pos += NAME_LEN * sizeof(I8); + } + + U32 numTensors = ptr[i].num_inputs + ptr[i].num_outputs; + ptr[i].tensor_positions = (I32 *)mt_new_storage(numTensors * sizeof(I32)); + memcpy(ptr[i].tensor_positions, pointer, numTensors * sizeof(I32)); + pointer += numTensors * sizeof(I32); + *pos += numTensors * sizeof(I32); + + ptr[i].num_quant_feature = *((U32 *)pointer); + pointer += sizeof(U32); + *pos += sizeof(U32); + + if (0 != ptr[i].num_quant_feature) { + ptr[i].feature_scale = + (QuantSpec *)mt_new_storage(ptr[i].num_quant_feature * sizeof(QuantSpec)); + } else { + ptr[i].feature_scale = nullptr; + } + for (U32 j = 0; j < ptr[i].num_quant_feature; j++) { + ptr[i].feature_scale[j].num_scale = *((int *)pointer); + int num = ptr[i].feature_scale[j].num_scale; + pointer += sizeof(int); + *pos += sizeof(int); + + ptr[i].feature_scale[j].scale = (F32 *)mt_new_storage(num * sizeof(F32)); + memcpy(ptr[i].feature_scale[j].scale, pointer, num * sizeof(F32)); + pointer += num * sizeof(F32); + *pos += num * sizeof(F32); + } + + int operatorParameterSize = get_operator_parameter_size(ptr[i].type); + memcpy(&(ptr[i].ps), pointer, operatorParameterSize); + pointer += operatorParameterSize; + *pos += operatorParameterSize; + } + + return SUCCESS; +} + +EE deserialize_weight(const char *bytes, ModelSpec *spec, U32 *pos) +{ + const char *pointer = bytes + *pos; + I32 *p4numWeightSpecs = (I32 *)pointer; + spec->num_weight_specs = *p4numWeightSpecs; + pointer += sizeof(U32); + *pos += sizeof(U32); + + WeightSpec *ptr = (WeightSpec *)mt_new_storage(spec->num_weight_specs * sizeof(WeightSpec)); + spec->ws = ptr; + for (int i = 0; i < spec->num_weight_specs; i++) { + U32 *length = (U32 *)pointer; + pointer += sizeof(U32); + *pos += sizeof(U32); + U32 weightBiasBytes = 0; + + str_copy(ptr[i].op_name, pointer, 
NAME_LEN); + pointer += NAME_LEN; + *pos += NAME_LEN; + + memcpy(&(ptr[i].mdt), pointer, sizeof(DataType)); + pointer += sizeof(U32); + *pos += sizeof(U32); + + bool quantFP16 = false; + bool quantInt8 = false; + if (DT_F16 == ptr[i].mdt && DT_F32 == spec->dt) { + ptr[i].mdt = DT_F32; + quantFP16 = true; + } else if (DT_I8 == ptr[i].mdt && DT_I8 != spec->dt) { + ptr[i].mdt = (spec->dt == DT_F16_8Q) ? DT_F16 : spec->dt; + quantInt8 = true; + } + + memcpy(&(ptr[i].bytes_of_weight), pointer, sizeof(U32)); + U32 alignSize = ptr[i].bytes_of_weight; + + if (quantFP16) { + ptr[i].bytes_of_weight *= 2; + } + if (quantInt8) { + ptr[i].bytes_of_weight *= bytesOf(ptr[i].mdt); + } + pointer += sizeof(U32); + *pos += sizeof(U32); + + ptr[i].weight = (U8 *)mt_new_storage(ptr[i].bytes_of_weight); + U8 *serialWeight = (U8 *)pointer; + + pointer += alignSize; + *pos += alignSize; + weightBiasBytes += alignSize; + + memcpy(&(ptr[i].bytes_of_vec), pointer, sizeof(U32)); + pointer += sizeof(U32); + *pos += sizeof(U32); + + alignSize = ptr[i].bytes_of_vec; + if (quantFP16) { + ptr[i].bytes_of_vec *= 2; + } + U8 *serialBias = nullptr; + if (0 != ptr[i].bytes_of_vec) { + serialBias = (U8 *)pointer; + ptr[i].vec = (U8 *)mt_new_storage(ptr[i].bytes_of_vec); + } else { + ptr[i].vec = nullptr; + } + + pointer += alignSize; + *pos += alignSize; + weightBiasBytes += alignSize; + + memcpy(&(ptr[i].num_quant_scale), pointer, sizeof(U32)); + pointer += sizeof(U32); + *pos += sizeof(U32); + + if (0 != ptr[i].num_quant_scale) { + ptr[i].weight_scale = + (QuantSpec *)mt_new_storage(ptr[i].num_quant_scale * sizeof(QuantSpec)); + } + for (U32 j = 0; j < ptr[i].num_quant_scale; j++) { + ptr[i].weight_scale[j].num_scale = *((int *)pointer); + int num = ptr[i].weight_scale[j].num_scale; + pointer += sizeof(int); + *pos += sizeof(int); + + ptr[i].weight_scale[j].scale = (F32 *)mt_new_storage(num * sizeof(F32)); + memcpy(ptr[i].weight_scale[j].scale, pointer, num * sizeof(F32)); + pointer += num * sizeof(F32); + *pos += num * sizeof(F32); + } + + CHECK_REQUIREMENT(*length == weightBiasBytes); + + if (quantFP16) { + dequantize_fp16(ptr[i].bytes_of_weight / 4, (unsigned short *)serialWeight, (F32 *)ptr[i].weight); + dequantize_fp16(ptr[i].bytes_of_vec / 4, (unsigned short *)serialBias, (F32 *)ptr[i].vec); + } else { + if (quantInt8) { + CHECK_REQUIREMENT(1 == ptr[i].num_quant_scale && 1 == ptr[i].weight_scale[0].num_scale); + F32 scale = ptr[i].weight_scale[0].scale[0]; + if (DT_F32 == ptr[i].mdt) { + dequantize_int8_weight( + ptr[i].bytes_of_weight / 4, scale, (INT8 *)serialWeight, (F32 *)ptr[i].weight); + } else { + #ifdef __aarch64__ + dequantize_int8_weight( + ptr[i].bytes_of_weight / 2, scale, (INT8 *)serialWeight, (F16 *)ptr[i].weight); + #endif + } + } else { + memcpy(ptr[i].weight, serialWeight, ptr[i].bytes_of_weight); + } + memcpy(ptr[i].vec, serialBias, ptr[i].bytes_of_vec); + } + } + return SUCCESS; +} + +EE deserialize_model(const char *bytes, ModelSpec *spec) +{ + U32 pos = 0; + CHECK_STATUS(deserialize_header(bytes, spec, &pos)); + CHECK_STATUS(deserialize_operator(bytes, spec, &pos)); + CHECK_STATUS(deserialize_weight(bytes, spec, &pos)); + CHECK_STATUS(operator_relationship(spec)); + return SUCCESS; +} + +EE deserialize_model_from_file(const char *fn, ModelSpec *spec, bool useFileStream) +{ + UNI_PROFILE( + { + char *bytes = nullptr; + int fd; + int fileLength; + if (useFileStream) { + bytes = (char *)fn; + } else { + fd = open(fn, O_RDONLY); + if (-1 == fd) { + UNI_ERROR_LOG("Cannot open .bolt file. 
Name: %s\n", fn); + return FILE_ERROR; + } + + struct stat ss; + if (-1 == fstat(fd, &ss)) { + UNI_ERROR_LOG("Cannot get size from file descriptor. File Name: %s\n", fn); + return FILE_ERROR; + } + + fileLength = ss.st_size; + bytes = (char *)mmap(nullptr, fileLength, PROT_READ, MAP_SHARED, fd, 0); + if (MAP_FAILED == bytes) { + UNI_ERROR_LOG("Mmap failed. File Name: %s\n", fn); + return FILE_ERROR; + } + } + + CHECK_STATUS(deserialize_model(bytes, spec)); + + if (!useFileStream) { + munmap(bytes, fileLength); + if (-1 != fd) { + close(fd); + } + } + }, + std::string("deserialize_model_from_file"), std::string("prepare")); + return SUCCESS; +} diff --git a/common/uni/src/model_print.cpp b/common/uni/src/model_print.cpp new file mode 100644 index 00000000..847856bc --- /dev/null +++ b/common/uni/src/model_print.cpp @@ -0,0 +1,127 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
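The printers added below walk a deserialized ModelSpec section by section; print_ms at the end of the file chains all four. A usage sketch (file name hypothetical):

    #include "model_print.h"

    // after deserialize_model_from_file("model_f32.bolt", &spec, false):
    // print_ms(spec);  // dumps header, ops, weights and op-op relationships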
+ +#include +#include "model_print.h" +#include "types.h" + +void print_header(const ModelSpec ms) +{ + printf("[Model] %s\n [Input]", ms.model_name); + for (int i = 0; i < ms.num_inputs; i++) { + printf(" %s(%s)", ms.input_names[i], tensorDesc2Str(ms.input_dims[i]).c_str()); + } + printf("\n [Output]"); + for (int i = 0; i < ms.num_outputs; i++) { + printf(" %s", ms.output_names[i]); + } + printf("\n"); +} + +void print_operator_tensor_relationship(const ModelSpec ms, bool deleteDeprecatedOp) +{ + int number = ms.num_operator_specs; + printf(" [Ops] %d\n", number); + for (int i = 0; i < number; i++) { + if (deleteDeprecatedOp) { + if (isDeprecatedOp(ms.ops[i].type)) { + continue; + } + } + printf(" Op %3d %32s %16s|", i, ms.ops[i].name, OperatorTypeName()[ms.ops[i].type]); + for (U32 j = 0; j < ms.ops[i].num_inputs; j++) { + printf(" %s", ms.ops[i].input_tensors_name[j]); + } + printf(" ->"); + for (U32 j = 0; j < ms.ops[i].num_outputs; j++) { + printf(" %s", ms.ops[i].output_tensors_name[j]); + } + if (nullptr != ms.ops[i].tensor_positions) { + printf(" tensor position:"); + for (U32 j = 0; j < ms.ops[i].num_inputs + ms.ops[i].num_outputs; j++) { + printf(" %d", ms.ops[i].tensor_positions[j]); + } + } + if (nullptr != ms.ops[i].feature_scale) { + printf(" quant scale:"); + for (U32 j = 0; j < ms.ops[i].num_quant_feature; j++) { + printf(" %f", ms.ops[i].feature_scale[j].scale[0]); + } + } + printf("\n"); + } +} + +void print_weights(const ModelSpec ms) +{ + int number = ms.num_weight_specs; + printf(" [Weights] %d\n", number); + for (int i = 0; i < number; i++) { + if (isDeprecatedOpWeight(&ms, i)) { + printf(" Weight %3d %32s | Delete mdt %d weight: %p %uB bias: %p %uB\n", i, + ms.ws[i].op_name, ms.ws[i].mdt, ms.ws[i].weight, ms.ws[i].bytes_of_weight, + ms.ws[i].vec, ms.ws[i].bytes_of_vec); + continue; + } + + printf(" Weight %3d %32s | Retain mdt %d weight: %p %uB bias: %p %uB example: ", i, + ms.ws[i].op_name, ms.ws[i].mdt, ms.ws[i].weight, ms.ws[i].bytes_of_weight, ms.ws[i].vec, + ms.ws[i].bytes_of_vec); + if (ms.ws[i].bytes_of_weight > 0 && ms.ws[i].weight != nullptr) { + F32 value; + transformToFloat(ms.ws[i].mdt, ms.ws[i].weight, &value, 1); + printf("%f", value); + } else if ((ms.ws[i].bytes_of_weight == 0 && ms.ws[i].weight != nullptr) || + (ms.ws[i].bytes_of_weight != 0 && ms.ws[i].weight == nullptr)) { + UNI_ERROR_LOG("weight is null but size is not zero\n"); + } + if (ms.ws[i].bytes_of_vec > 0 && ms.ws[i].vec != nullptr) { + DataType dt = ms.ws[i].mdt; + if (DT_BIN01 == ms.ws[i].mdt || DT_BIN11 == ms.ws[i].mdt) { + dt = DT_F16; + } + F32 value; + transformToFloat(dt, ms.ws[i].vec, &value, 1); + printf(",%f", value); + } else if ((ms.ws[i].bytes_of_vec == 0 && ms.ws[i].vec != nullptr) || + (ms.ws[i].bytes_of_vec != 0 && ms.ws[i].vec == nullptr)) { + UNI_ERROR_LOG("vec is null but size is not zero\n"); + } + printf("\n"); + } +} + +void print_relationship(const ModelSpec ms) +{ + int number = ms.num_op_tensor_entries; + printf(" [Relationships] %d\n", number); + for (int i = 0; i < number; i++) { + printf(" Relation %3d %32s |", i, ms.op_relationship_entries[i].op); + for (U32 j = 0; j < ms.op_relationship_entries[i].num_inputs; j++) { + printf(" %s", ms.op_relationship_entries[i].input_op_names[j]); + } + printf(" ->"); + for (U32 j = 0; j < ms.op_relationship_entries[i].num_outputs; j++) { + printf(" %s", ms.op_relationship_entries[i].output_op_names[j]); + } + printf("\n"); + } +} + +void print_ms(const ModelSpec ms) +{ + print_header(ms); + 
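+    // The single-argument call relies on a default value for the
+    // deleteDeprecatedOp parameter, presumably declared in model_print.h.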
print_operator_tensor_relationship(ms); + print_weights(ms); + print_relationship(ms); +} diff --git a/common/uni/src/model_serialize.cpp b/common/uni/src/model_serialize.cpp new file mode 100644 index 00000000..92bf670f --- /dev/null +++ b/common/uni/src/model_serialize.cpp @@ -0,0 +1,307 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include +#include "model_serialize_deserialize.hpp" +#include "types.h" + +EE serialize_header(const ModelSpec *spec, std::string *tmp) +{ + U32 bufSize = sizeof(I32) * 2 + sizeof(I8) * NAME_LEN + sizeof(DataType) + sizeof(I32) + + sizeof(I8) * NAME_LEN * spec->num_inputs + sizeof(TensorDesc) * spec->num_inputs + + sizeof(I32) + sizeof(I8) * NAME_LEN * spec->num_outputs; + I8 *data = (I8 *)mt_new_storage(bufSize); + + I32 *pointer4version = (I32 *)data; + memcpy(pointer4version, &spec->version, sizeof(I32)); + pointer4version += 1; // the pointer datatype(I32) of add 1 means 4 steps + + I32 *pointer4magicNumber = (I32 *)pointer4version; + memcpy(pointer4magicNumber, &spec->magic_number, sizeof(I32)); + pointer4magicNumber += 1; + + I8 *pointer4modelName = (I8 *)pointer4magicNumber; + str_copy(pointer4modelName, spec->model_name, NAME_LEN); + pointer4modelName += NAME_LEN; + + DataType *pointer4dt = (DataType *)pointer4modelName; + *pointer4dt = spec->dt; + pointer4dt++; + + I32 *pointer4numInputs = (I32 *)pointer4dt; + *pointer4numInputs = spec->num_inputs; + pointer4numInputs++; + + I8 *pointer4InputNames = (I8 *)pointer4numInputs; + for (int i = 0; i < spec->num_inputs; i++) { + str_copy(pointer4InputNames, spec->input_names[i], NAME_LEN); + pointer4InputNames += NAME_LEN; + } + + TensorDesc *pointer4TensorDesc = (TensorDesc *)pointer4InputNames; + memcpy(pointer4TensorDesc, spec->input_dims, sizeof(TensorDesc) * spec->num_inputs); + pointer4TensorDesc += spec->num_inputs; + + I32 *pointer4numOutputs = (I32 *)pointer4TensorDesc; + *pointer4numOutputs = spec->num_outputs; + pointer4numOutputs++; + + I8 *pointer4outputNames = (I8 *)pointer4numOutputs; + for (int i = 0; i < spec->num_outputs; i++) { + str_copy(pointer4outputNames, spec->output_names[i], NAME_LEN); + pointer4outputNames += NAME_LEN; + } + + tmp->clear(); + CHECK_REQUIREMENT((U32)(pointer4outputNames - data) == bufSize); + tmp->assign(data, data + bufSize); + delete data; + return SUCCESS; +} + +U32 operator_memory_size(OperatorSpec *ops) +{ + // sizeof(U32) * 4 : type + num_inputs + num_output + 
num_quant_feature + U32 allocatedBufferSize = sizeof(I8) * NAME_LEN + sizeof(U32) * 4 + + ops->num_inputs * NAME_LEN * sizeof(I8) + ops->num_outputs * NAME_LEN * sizeof(I8) + + (ops->num_inputs + ops->num_outputs) * sizeof(I32) + get_operator_parameter_size(ops->type); + + for (U32 i = 0; i < ops->num_quant_feature; i++) { + allocatedBufferSize += sizeof(int); // num_scale + allocatedBufferSize += ops->feature_scale[i].num_scale * sizeof(F32); + } + return allocatedBufferSize; +} + +EE serialize_operators(const ModelSpec *spec, std::string *tmp) +{ + OperatorSpec *opsTmp = spec->ops; + int removeOpNum = 0; + U32 bufSize = sizeof(I32); + for (int i = 0; i < spec->num_operator_specs; i++) { + if (isDeprecatedOp(opsTmp->type)) { + removeOpNum++; + } else { + bufSize += operator_memory_size(opsTmp); + } + opsTmp++; + } + + char *data = (char *)mt_new_storage(bufSize); + + I32 *pointer4numOperatorSpecs = (I32 *)data; + *pointer4numOperatorSpecs = spec->num_operator_specs - removeOpNum; // attention + pointer4numOperatorSpecs++; + + OperatorSpec *opsPointer = spec->ops; + I8 *pointer4opsName = (I8 *)pointer4numOperatorSpecs; + + for (int i = 0; i < spec->num_operator_specs; i++) { + if (isDeprecatedOp(opsPointer[i].type)) { + continue; + } + + str_copy(pointer4opsName, opsPointer[i].name, NAME_LEN); // to copy the name of op + pointer4opsName += NAME_LEN; + + U32 *pointer4opsType = (U32 *)pointer4opsName; + *pointer4opsType = opsPointer[i].type; + pointer4opsType++; + + U32 *pointer4opsNumInputs = pointer4opsType; + *pointer4opsNumInputs = opsPointer[i].num_inputs; + pointer4opsNumInputs++; + + I8 *pointer4opsInputTensorsName = (I8 *)pointer4opsNumInputs; + for (U32 j = 0; j < opsPointer[i].num_inputs; j++) { + str_copy(pointer4opsInputTensorsName, opsPointer[i].input_tensors_name[j], NAME_LEN); + pointer4opsInputTensorsName += NAME_LEN; + } + + U32 *pointer4opsNumOutputs = (U32 *)pointer4opsInputTensorsName; + *pointer4opsNumOutputs = opsPointer[i].num_outputs; + pointer4opsNumOutputs++; + + I8 *pointer4opsOutputTensorsName = (I8 *)pointer4opsNumOutputs; + for (U32 j = 0; j < opsPointer[i].num_outputs; j++) { + str_copy(pointer4opsOutputTensorsName, opsPointer[i].output_tensors_name[j], NAME_LEN); + pointer4opsOutputTensorsName += NAME_LEN; + } + + I32 *pointer4tensorPos = (I32 *)pointer4opsOutputTensorsName; + U32 numTensors = opsPointer[i].num_inputs + opsPointer[i].num_outputs; + if (nullptr != opsPointer[i].tensor_positions) { + memcpy(pointer4tensorPos, opsPointer[i].tensor_positions, numTensors * sizeof(I32)); + } else { + for (U32 j = 0; j < numTensors; j++) { + pointer4tensorPos[j] = -1; + } + } + pointer4tensorPos += numTensors; + + U32 *pointer4numint8 = (U32 *)pointer4tensorPos; + *pointer4numint8 = opsPointer[i].num_quant_feature; + pointer4numint8++; + + int *pointer4quant = (int *)pointer4numint8; + for (U32 j = 0; j < opsPointer[i].num_quant_feature; j++) { + *pointer4quant = opsPointer[i].feature_scale[j].num_scale; + int num = *pointer4quant; + pointer4quant++; + memcpy(pointer4quant, opsPointer[i].feature_scale[j].scale, num * sizeof(F32)); + pointer4quant += num; + } + + char *pointer4parameterSpecs = (char *)pointer4quant; + int operatorParameterSize = get_operator_parameter_size(opsPointer[i].type); + memcpy(pointer4parameterSpecs, &(opsPointer[i].ps), operatorParameterSize); + pointer4parameterSpecs += operatorParameterSize; + pointer4opsName = (I8 *)pointer4parameterSpecs; + } + + tmp->clear(); + CHECK_REQUIREMENT((U32)(pointer4opsName - data) == bufSize); + 
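+    // pointer4opsName must have advanced exactly bufSize bytes here; the
+    // requirement above catches any drift between the estimate computed by
+    // operator_memory_size and the bytes actually written.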
tmp->assign(data, data + bufSize); + delete data; + return SUCCESS; +} + +EE serialize_weights(const ModelSpec *spec, std::string *tmp) +{ + WeightSpec *tmpPointer = spec->ws; + U32 bufSize = sizeof(I32); + U32 weightCount = 0; + for (int i = 0; i < spec->num_weight_specs; i++) { + if (isDeprecatedOpWeight(spec, i)) { + continue; + } + + // U32 x 5: length, mdt, bytes_of_weight, bytes_of_vec, num_quant_scale + bufSize += sizeof(I8) * NAME_LEN + sizeof(U32) * 5 + tmpPointer[i].bytes_of_weight + + tmpPointer[i].bytes_of_vec; + for (U32 j = 0; j < tmpPointer[i].num_quant_scale; j++) { + bufSize += sizeof(int); // num_scale + bufSize += tmpPointer[i].weight_scale[j].num_scale * sizeof(F32); + } + + weightCount++; + } + char *data = (char *)mt_new_storage(bufSize); + + I32 *pointer4numWeightSpecs = (I32 *)data; + *pointer4numWeightSpecs = weightCount; + pointer4numWeightSpecs++; + + WeightSpec *wsPointer = spec->ws; + char *pointer4wsOpName = (char *)pointer4numWeightSpecs; + for (int i = 0; i < spec->num_weight_specs; i++) { + if (isDeprecatedOpWeight(spec, i)) { + continue; + } + + U32 *length = (U32 *)pointer4wsOpName; + U32 len; + len = wsPointer[i].bytes_of_weight + wsPointer[i].bytes_of_vec; + *length = len; + pointer4wsOpName += sizeof(U32); + + str_copy(pointer4wsOpName, wsPointer[i].op_name, NAME_LEN); + pointer4wsOpName += NAME_LEN; + + U32 *pointer4wsMdt = (U32 *)pointer4wsOpName; + *pointer4wsMdt = wsPointer[i].mdt; + pointer4wsMdt++; + + U32 *pointer4wsBytesOfWeight = (U32 *)pointer4wsMdt; + *pointer4wsBytesOfWeight = wsPointer[i].bytes_of_weight; + pointer4wsBytesOfWeight++; + + U8 *pointer4wsWeight = (U8 *)pointer4wsBytesOfWeight; + memcpy(pointer4wsWeight, wsPointer[i].weight, wsPointer[i].bytes_of_weight); + pointer4wsWeight += wsPointer[i].bytes_of_weight; + + U32 *pointer4wsBytesOfVec = (U32 *)pointer4wsWeight; + *pointer4wsBytesOfVec = wsPointer[i].bytes_of_vec; + pointer4wsBytesOfVec++; + + U8 *pointer4wsVec = (U8 *)pointer4wsBytesOfVec; + memcpy(pointer4wsVec, wsPointer[i].vec, wsPointer[i].bytes_of_vec); + pointer4wsVec += wsPointer[i].bytes_of_vec; + + U32 *pointer4numquant = (U32 *)pointer4wsVec; + *pointer4numquant = wsPointer[i].num_quant_scale; + pointer4numquant++; + + int *pointer4quant = (int *)pointer4numquant; + for (U32 j = 0; j < wsPointer[i].num_quant_scale; j++) { + *pointer4quant = wsPointer[i].weight_scale[j].num_scale; + int num = *pointer4quant; + pointer4quant++; + memcpy(pointer4quant, wsPointer[i].weight_scale[j].scale, num * sizeof(F32)); + pointer4quant += num; + } + + pointer4wsOpName = (char *)pointer4quant; + } + + tmp->clear(); + CHECK_REQUIREMENT((U32)(pointer4wsOpName - data) == bufSize); + tmp->assign(data, data + bufSize); + delete data; + return SUCCESS; +} + +EE serialize_model(const ModelSpec *spec, std::string *bytes) +{ + bytes->clear(); + std::string tmp; + + CHECK_STATUS(serialize_header(spec, &tmp)); + *bytes += tmp; + + CHECK_STATUS(serialize_operators(spec, &tmp)); + *bytes += tmp; + + CHECK_STATUS(serialize_weights(spec, &tmp)); + *bytes += tmp; + return SUCCESS; +} + +EE write_to_file(std::string *bytes, const char *fn) +{ + std::ofstream out(fn); + if (!out) { + return FILE_ERROR; + } + out << *bytes; + out.close(); + return SUCCESS; +} + +EE serialize_model_to_file(const ModelSpec *spec, const char *fn) +{ + std::string bytes = ""; + CHECK_STATUS(serialize_model(spec, &bytes)); + CHECK_STATUS(write_to_file(&bytes, fn)); + return SUCCESS; +} + +#if defined(_USE_CAFFE) || defined(_USE_ONNX) || defined(_USE_TFLITE) || 
defined(_USE_TENSORFLOW)
+EE mt_store(CI8 *dir, CI8 *mfn, const ModelSpec *md)
+{
+    std::string completePath = concat_dir_file(dir, mfn);
+    serialize_model_to_file(md, completePath.c_str());
+    return SUCCESS;
+}
+#endif
diff --git a/common/uni/src/profiling.cpp b/common/uni/src/profiling.cpp
new file mode 100644
index 00000000..a3bbca68
--- /dev/null
+++ b/common/uni/src/profiling.cpp
@@ -0,0 +1,91 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <algorithm>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "profiling.h"
+
+#ifdef _THREAD_SAFE
+pthread_mutex_t uniThreadMutex = PTHREAD_MUTEX_INITIALIZER;
+#endif
+
+std::string extract_class_function(std::string &&pretty_function)
+{
+    auto pos = pretty_function.find('(');
+    if (pos != std::string::npos) {
+        pretty_function.erase(pretty_function.begin() + pos, pretty_function.end());
+    }
+
+    pos = pretty_function.rfind(' ');
+    if (pos != std::string::npos) {
+        pretty_function.erase(pretty_function.begin(), pretty_function.begin() + pos + 1);
+    }
+
+    return std::move(pretty_function);
+}
+
+std::string extract_file_function(std::string &&pretty_function)
+{
+    auto pos = pretty_function.find('(');
+    if (pos != std::string::npos) {
+        pretty_function.erase(pretty_function.begin() + pos, pretty_function.end());
+    }
+
+    pos = pretty_function.rfind('/');
+    if (pos != std::string::npos) {
+        pretty_function.erase(pretty_function.begin(), pretty_function.begin() + pos + 1);
+    }
+
+    return std::move(pretty_function);
+}
+
+std::map<std::string, double> time_statistics;
+
+void ut_time_init()
+{
+    UNI_THREAD_SAFE(time_statistics.clear());
+}
+
+void ut_time_process(
+    const std::string &name, const std::string &category, double time_start_ms, double time_end_ms)
+{
+#ifdef _PROFILE
+    UNI_PROFILE_INFO(
+        name.c_str(), category.c_str(), time_start_ms * 1000, (time_end_ms - time_start_ms) * 1000);
+#endif
+#ifdef _PROFILE_STATISTICS
+    double duration = time_end_ms - time_start_ms;
+    UNI_THREAD_SAFE({
+        if (time_statistics.find(category) == time_statistics.end()) {
+            time_statistics[category] = duration;
+        } else {
+            time_statistics[category] += duration;
+        }
+    });
+#endif
+}
+
+void ut_time_statistics()
+{
+    std::vector<std::pair<std::string, double>> vec(time_statistics.begin(), time_statistics.end());
+    sort(vec.begin(), vec.end(),
+        [&](const std::pair<std::string, double> &a, const std::pair<std::string, double> &b) {
+            return (a.second > b.second);
+        });
+    for (U32 i = 0; i < vec.size(); ++i) {
+        UNI_INFO_LOG("%s\t%lfms\n", vec[i].first.c_str(), vec[i].second);
+    }
+}
diff --git
a/common/uni/src/tensor_desc.cpp b/common/uni/src/tensor_desc.cpp new file mode 100644 index 00000000..9f46fd6b --- /dev/null +++ b/common/uni/src/tensor_desc.cpp @@ -0,0 +1,614 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include +#include "tensor_desc.h" + +void UNI_memcpy(void *dst, const void *src, int size) +{ + if (src == dst || size <= 0 || dst == nullptr || src == nullptr) { + return; + } + memcpy(dst, src, size); +} + +void UNI_init(U32 num, DataType dt, F32 val, void *dst) +{ + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: { + F16 v = val; + F16 *arr = (F16 *)dst; + for (U32 i = 0; i < num; i++) { + arr[i] = v; + } + break; + } +#endif + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } +} + +void transformFromFloat(DataType dataType, float *src, void *dst, int num, float scale) +{ + switch (dataType) { + case DT_F32: { + UNI_memcpy(dst, src, sizeof(float) * num); + break; + } + case DT_U32: { + U32 *ptr = (U32 *)dst; + for (int i = 0; i < num; i++) { + ptr[i] = src[i]; + } + break; + } + case DT_I32: { + I32 *ptr = (I32 *)dst; + for (int i = 0; i < num; i++) { + ptr[i] = src[i]; + } + break; + } +#ifdef __aarch64__ + case DT_F16: { + F16 *ptr = (F16 *)dst; + for (int i = 0; i < num; i++) { + ptr[i] = src[i]; + } + break; + } + case DT_F16_8Q: { + F16 *ptr = (F16 *)dst; + for (int i = 0; i < num; i++) { + ptr[i] = src[i]; + } + break; + } +#endif + case DT_I8: { + INT8 *ptr = (INT8 *)dst; + for (int i = 0; i < num; i++) { + ptr[i] = src[i] * scale; + } + break; + } + case DT_U8: { + U8 *ptr = (U8 *)dst; + for (int i = 0; i < num; i++) { + ptr[i] = src[i]; + } + break; + } + default: { + UNI_ERROR_LOG("not unsupport transform float to %d type data\n", dataType); + break; + } + } +} + +void transformToFloat(DataType dataType, void *src, float *dst, int num, float scale) +{ + switch (dataType) { + case DT_F32: { + UNI_memcpy(dst, src, sizeof(float) * num); + break; + } + case DT_U32: { + U32 *ptr = (U32 *)src; + for (int i = 0; i < num; i++) { + dst[i] = ptr[i]; + } + break; + } + case DT_I32: { + I32 *ptr = (I32 *)src; + for (int i = 0; i < num; i++) { + dst[i] = ptr[i]; + } + break; + } +#ifdef __aarch64__ + case DT_F16: { + F16 *ptr = (F16 *)src; + for (int i = 0; i < num; i++) { + dst[i] = ptr[i]; + } + break; + } + case DT_F16_8Q: { + F16 *ptr = (F16 *)src; + for (int i = 0; i < num; i++) { + dst[i] = ptr[i]; + } + break; + } +#endif + case DT_I8: { + INT8 *ptr = (INT8 *)src; + for (int i = 
0; i < num; i++) { + dst[i] = ptr[i] / scale; + } + break; + } + case DT_U8: { + U8 *ptr = (U8 *)src; + for (int i = 0; i < num; i++) { + dst[i] = ptr[i]; + } + break; + } + case DT_BIN01: { + BIN8 *ptr = (BIN8 *)src; + for (int i = 0; i < num; i++) { + std::bitset<8> Val(((BIN8 *)ptr)[i / 8]); + if (Val.test(7 - (i % 8))) { + dst[i] = 1.0; + } else { + dst[i] = 0; + } + } + break; + } + case DT_BIN11: { + BIN8 *ptr = (BIN8 *)src; + for (int i = 0; i < num; i++) { + std::bitset<8> Val(((BIN8 *)ptr)[i / 8]); + if (Val.test(7 - (i % 8))) { + dst[i] = 1.0; + } else { + dst[i] = -1.0; + } + } + break; + } + default: { + UNI_ERROR_LOG("not unsupport transform %d type data to float\n", dataType); + break; + } + } +} + +template +static void transformToNCHWKernel( + TensorDesc inputDesc, const T *input, TensorDesc outputDesc, T *output) +{ + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw, on, oc, oh, ow; + if (tensorIs2d(inputDesc)) { + CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &in, &iw)); + ic = 1; + ih = 1; + } else if (tensorIs3d(inputDesc)) { + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ih, &iw)); + ic = 1; + } else if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + } else { + UNI_ERROR_LOG("not support transform %d-dim tensor to NCHW format\n", (int)inputDesc.nDims); + return; + } + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 ihiw = ih * iw; + U32 size = tensorNumElements(outputDesc); + switch (idf) { + case DF_NCHW: { + CHECK_REQUIREMENT(tensorNumElements(inputDesc) == size); + if (output != input) { + memcpy(output, input, size); + } + break; + } + case DF_NCHWC8: { + CHECK_REQUIREMENT(ic % 8 == 0); + ic /= 8; + for (U32 n = 0, srcIndex = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 hw = 0; hw < ihiw; hw++) { + for (U32 c8 = 0; c8 < 8; c8++, srcIndex++) { + U32 c_o = c * 8 + c8; + // support channel cut + if (c_o < oc) { + U32 dstIndex = (n * oc + c_o) * ihiw + hw; + output[dstIndex] = input[srcIndex]; + } + } + } + } + } + break; + } + case DF_NHWCN8: { + CHECK_REQUIREMENT(in % 8 == 0); + in /= 8; + for (U32 o = 0, srcIndex = 0; o < in; o++) { + for (U32 hw = 0; hw < ihiw; hw++) { + for (U32 c = 0; c < ic; c++) { + for (U32 o8 = 0; o8 < 8; o8++, srcIndex++) { + U32 dstIndex = ((o * 8 + o8) * ic + c) * ihiw + hw; + output[dstIndex] = input[srcIndex]; + } + } + } + } + break; + } + case DF_NHWC: { + CHECK_REQUIREMENT(tensorNumElements(inputDesc) == size); + for (U32 o = 0, srcIndex = 0; o < in; o++) { + for (U32 hw = 0; hw < ihiw; hw++) { + for (U32 cc = 0; cc < ic; cc++, srcIndex++) { + U32 dstIndex = (o * ic + cc) * ihiw + hw; + output[dstIndex] = input[srcIndex]; + } + } + } + break; + } + default: { + UNI_ERROR_LOG("not support transform %d format tensor to NCHW format\n", idf); + } + } +} + +EE transformToNCHW(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output) +{ + if (nullptr == input || nullptr == output) { + return NULL_POINTER; + } + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + transformToNCHWKernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + transformToNCHWKernel(inputDesc, (F16 *)input, outputDesc, (F16 *)output); + break; + } +#endif +#ifdef _USE_INT8 + case DT_I8: { + transformToNCHWKernel(inputDesc, (INT8 *)input, outputDesc, (INT8 *)output); + break; + } +#endif + default: { + return NOT_SUPPORTED; + } + } + return SUCCESS; +} + 
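+// Layout note: in DF_NCHWC8 the channel axis is tiled by 8, so element
+// (n, c, h, w) of a CxHxW tensor lives at
+// ((n * (C/8) + c/8) * (H*W) + h*W + w) * 8 + (c%8).
+// transformToNCHW above and transformToNHWCKernel below are index remappings
+// built on this layout.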
+template <typename T> +static void transformToNHWCKernel( + TensorDesc inputDesc, const T *input, TensorDesc outputDesc, T *output) +{ + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw, on, oc, oh, ow; + if (tensorIs2d(inputDesc)) { + CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &in, &iw)); + ic = 1; + ih = 1; + } else if (tensorIs3d(inputDesc)) { + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ih, &iw)); + ic = 1; + } else if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + } else { + UNI_ERROR_LOG("not support transform %d-dim tensor to NHWC format\n", (int)inputDesc.nDims); + return; + } + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 size = tensorNumElements(outputDesc); + U32 ihiw = ih * iw; + switch (idf) { + case DF_NHWC: { + CHECK_REQUIREMENT(tensorNumElements(inputDesc) == size); + if (input != output) { + // size counts elements, memcpy needs bytes + memcpy(output, input, size * sizeof(T)); + } + break; + } + case DF_NCHW: { + CHECK_REQUIREMENT(tensorNumElements(inputDesc) == size); + for (U32 o = 0, srcIndex = 0; o < in; o++) { + for (U32 cc = 0; cc < ic; cc++) { + for (U32 hw = 0; hw < ihiw; hw++, srcIndex++) { + U32 dstIndex = (o * ihiw + hw) * ic + cc; + output[dstIndex] = input[srcIndex]; + } + } + } + break; + } + case DF_NCHWC8: { + CHECK_REQUIREMENT(ic % 8 == 0); + ic /= 8; + for (U32 n = 0, srcIndex = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 hw = 0; hw < ihiw; hw++) { + for (U32 c8 = 0; c8 < 8; c8++, srcIndex++) { + U32 dstIndex = ((n * ihiw + hw) * ic + c) * 8 + c8; + output[dstIndex] = input[srcIndex]; + } + } + } + } + break; + } + default: { + UNI_ERROR_LOG("not support transform %d format tensor to NHWC format\n", idf); + } + } +} + +EE transformToNHWC(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output) +{ + if (nullptr == input || nullptr == output) { + return NULL_POINTER; + } + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + transformToNHWCKernel<F32>(inputDesc, (F32 *)input, outputDesc, (F32 *)output); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + transformToNHWCKernel<F16>(inputDesc, (F16 *)input, outputDesc, (F16 *)output); + break; + } +#endif +#ifdef _USE_INT8 + case DT_I8: { + transformToNHWCKernel<INT8>(inputDesc, (INT8 *)input, outputDesc, (INT8 *)output); + break; + } +#endif + default: { + return NOT_SUPPORTED; + } + } + return SUCCESS; +} + +EE transformNCHWToNCHWC8( + TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output) +{ + if (input == NULL || output == NULL) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw, on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + CHECK_REQUIREMENT(in == on && idf == DF_NCHW && odf == DF_NCHWC8 && idt == odt && ic <= oc && + ih == oh && iw == ow); + int elementSize = bytesOf(idt); + oc /= 8; + U32 ohow = oh * ow; + const U8 *inputPtr = (const U8 *)input; + U8 *outputPtr = (U8 *)output; + for (U32 n = 0, dstIndex = 0; n < on; n++) { + for (U32 c = 0; c < oc; c++) { + for (U32 hw = 0; hw < ohow; hw++) { + for (U32 c8 = 0; c8 < 8; c8++, dstIndex += elementSize) { + U32 c_i = c * 8 + c8; + // support channel padding + if (c_i < ic) { + U32 srcIndex = ((n * ic + c_i) * ohow + hw) * elementSize; + memcpy(outputPtr + dstIndex, inputPtr + srcIndex, elementSize); + } else { + memset(outputPtr + dstIndex, 0, elementSize); + } + } + } + } + } + return SUCCESS; +} + +EE transformNCHWC8ToNCHWC8ByGroup( + TensorDesc inputDesc, const void *input, int group, TensorDesc outputDesc, void *output) +{ + U32 inputSize = tensorNumElements(inputDesc); + U32 outputSize = tensorNumElements(outputDesc); + if (group <= 1 || inputSize == outputSize) { + if (input != output) { + // outputSize counts elements, memcpy needs bytes + memcpy(output, input, outputSize * bytesOf(outputDesc.dt)); + } + return SUCCESS; + } + + U32 channelAlignSize = 8; + DataType dtBefore, dtAfter; + DataFormat dfBefore, dfAfter; + U32 batch, channelBefore, hBefore, wBefore; + U32 batchAfter, channelAfter, hAfter, wAfter; + CHECK_STATUS( + tensor4dGet(inputDesc, &dtBefore, &dfBefore, &batch, &channelBefore, &hBefore, &wBefore)); + CHECK_STATUS( + tensor4dGet(outputDesc, &dtAfter, &dfAfter, &batchAfter, &channelAfter, &hAfter, &wAfter)); + CHECK_REQUIREMENT(dtBefore == dtAfter); + CHECK_REQUIREMENT(dfBefore == DF_NCHWC8 && dfAfter == DF_NCHWC8); + CHECK_REQUIREMENT(batch == batchAfter); + CHECK_REQUIREMENT(hBefore == hAfter); + CHECK_REQUIREMENT(wBefore == wAfter); + U32 channelGroupSizeBefore = channelBefore / group; + U32 channelGroupSizeAfter = channelAfter / group; + U32 channelTileSizeBefore = channelBefore / channelAlignSize; + U32 channelTileSizeAfter = channelAfter / channelAlignSize; + U32 elementSize = bytesOf(dtBefore); + U32 hw = hBefore * wBefore; + for (U32 n = 0; n < batch; n++) { + for (I32 g = 0, channelIdAfter = 0; g < group; g++) { + for (U32 c = 0; c < channelGroupSizeAfter; c++, channelIdAfter++) { + U32 channelIdBefore = g * channelGroupSizeBefore + c; + U32 channelTileBefore = channelIdBefore / channelAlignSize; + U32 channelTileAfter = channelIdAfter / channelAlignSize; + U32 channelLocalBefore = channelIdBefore % channelAlignSize; + U32 channelLocalAfter = channelIdAfter % channelAlignSize; + U32 indexBefore = + (((n * channelTileSizeBefore + channelTileBefore) * hw) * channelAlignSize + + channelLocalBefore) * + elementSize; + U32 indexAfter = + (((n * channelTileSizeAfter + channelTileAfter) * hw) * channelAlignSize + + channelLocalAfter) * + elementSize; + U32 stepSize = channelAlignSize * elementSize; + U32 indexBeforeUpper = indexBefore + stepSize * hw; + while (indexBefore < indexBeforeUpper) { + memcpy((U8 *)output + indexAfter, (const U8 *)input + indexBefore, elementSize); + indexBefore += stepSize; + indexAfter += stepSize; + } + } + } + } + return SUCCESS; +} + +EE transposeFilter(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output) +{ + if (input == NULL || output == NULL) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw, on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + CHECK_REQUIREMENT(idf == odf); + const U8 *inputPtr = (const U8 *)input; + U8 *outputPtr = (U8 *)output; + + switch (idf) { + case DF_NHWCN8: { + CHECK_REQUIREMENT(in % 8 == 0); + in /= 8; + U32 hwMax = ih * iw - 1; + + U32 innerSize = bytesOf(idt) * ic * 8; + + for (U32 o = 0; o < in; o++) { + for (U32 hw = 0; hw < ih * iw; hw++) { + U32 srcIndex = o * ih * iw * innerSize + hw * innerSize; + U32 dstIndex = o * ih * iw * innerSize + (hwMax - hw) * innerSize; + memcpy(outputPtr + dstIndex, inputPtr + srcIndex, innerSize); + } + } + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + return SUCCESS; +} + +EE array_transpose(DataType dt, + U32 *inputDims, + const void *input, + U32 *outputDims, + void *output, + U32 *transposeDims, + int dimsNum) +{ + U32 sizeInner = 1; + I32 sizeInnerIndex = 0; + for (I32 i = dimsNum - 1; i >= 0; i--) { + if ((I32)transposeDims[i] == i) { + sizeInner *= inputDims[dimsNum - 1 - i]; + sizeInnerIndex++; + } else { + break; + } + } + U32 inputSize = 1, outputSize = 1; + for (int i = 0; i < dimsNum; i++) { + inputSize *= inputDims[i]; + outputSize *= outputDims[i]; + } + CHECK_REQUIREMENT(inputSize == outputSize); + outputSize = outputSize / sizeInner; + + std::vector<U32> inputLocalIndex(dimsNum); + const U8 *inputPtr = (const U8 *)input; + U8 *outputPtr = (U8 *)output; + U32 tileSize = sizeInner * bytesOf(dt); + for (U32 i = 0; i < outputSize; i++) { + U32 outputIndex = i; + for (I32 j = sizeInnerIndex; j < dimsNum; j++) { + U32 value = outputIndex % outputDims[j]; + outputIndex /= outputDims[j]; + inputLocalIndex[dimsNum - 1 - transposeDims[dimsNum - 1 - j]] = value; + } + U32 inputIndex = 0; + for (I32 j = dimsNum - 1; j > sizeInnerIndex; j--) { + inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1]; + } + inputIndex += inputLocalIndex[sizeInnerIndex]; + memcpy(outputPtr + i * tileSize, inputPtr + inputIndex * tileSize, tileSize); + } + + return SUCCESS; +} + +EE array_transpose_naive(DataType dt, + U32 *inputDims, + const void *input, + U32 *outputDims, + void *output, + U32 *transposeDims, + int dimsNum) +{ + if (dimsNum <= 1) { + return SUCCESS; + } + U32 inputSize = 1, outputSize = 1; + for (int i = 0; i < dimsNum; i++) { + inputSize *= inputDims[i]; + outputSize *= outputDims[i]; + } + std::vector<U32> inputLocalIndex(dimsNum); + const U8 *inputPtr = (const U8 *)input; + U8 *outputPtr = (U8 *)output; + U32 tileSize = bytesOf(dt); + for (U32 i = 0; i < outputSize; i++) { + U32 outputIndex = i; + for (I32 j = 0; j < dimsNum; j++) { + U32 value = outputIndex % outputDims[j]; + outputIndex /= outputDims[j]; + inputLocalIndex[dimsNum - 1 - transposeDims[dimsNum - 1 - j]] = value; + } + U32 inputIndex = 0; + for (I32 j = dimsNum - 1; j > 0; j--) { + inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1]; + } + inputIndex += inputLocalIndex[0]; + memcpy(outputPtr + i * tileSize, inputPtr + inputIndex * tileSize, tileSize); + } + + return SUCCESS; +} diff --git a/common/uni/src/types.cpp b/common/uni/src/types.cpp new file mode 100644 index 00000000..0bfdec06 --- /dev/null +++ b/common/uni/src/types.cpp @@ -0,0 +1,126 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
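The array_transpose above avoids element-by-element copies when the permutation leaves a trailing run of dimensions in place: that run is collapsed into sizeInner, and each memcpy then moves one contiguous tile of sizeInner elements instead of a single value. A toy illustration of the same idea (hypothetical helper, not the library's):

```cpp
#include <cstring>
#include <cstdio>

// Transpose dims 0 and 1 of an a x b x t float tensor; the untouched
// trailing dim t is the contiguous tile that moves in one memcpy.
static void transpose01(const float *src, float *dst, int a, int b, int t)
{
    for (int i = 0; i < a; i++)
        for (int j = 0; j < b; j++)
            std::memcpy(dst + (j * a + i) * t,  // one tile per memcpy
                        src + (i * b + j) * t,
                        sizeof(float) * t);
}

int main()
{
    float src[2 * 3 * 4], dst[3 * 2 * 4];
    for (int i = 0; i < 24; i++) src[i] = i;
    transpose01(src, dst, 2, 3, 4);
    printf("%g\n", dst[4]);  // dst[0][1][0] == src[1][0][0] == 12
    return 0;
}
```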
+ +#include <string.h> +#include "types.h" +#include "ut_util.h" + +OperatorSpec mt_create_operator(const char *name, OperatorType type, U32 num_inputs, U32 num_outputs) +{ + OperatorSpec newOperator; + initialization_zero(&(newOperator), sizeof(OperatorSpec)); + U32 length = UNI_MIN(strlen(name), NAME_LEN - 1); + str_copy(newOperator.name, name, length); + if (length < NAME_LEN) { + newOperator.name[length] = '\0'; + } + newOperator.type = type; + newOperator.num_inputs = num_inputs; + newOperator.input_tensors_name = (I8 **)mt_new_storage(num_inputs * sizeof(I8 *)); + for (U32 i = 0; i < num_inputs; i++) { + newOperator.input_tensors_name[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + } + newOperator.num_outputs = num_outputs; + newOperator.output_tensors_name = (I8 **)mt_new_storage(num_outputs * sizeof(I8 *)); + for (U32 i = 0; i < num_outputs; i++) { + newOperator.output_tensors_name[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + } + newOperator.tensor_positions = NULL; + newOperator.num_quant_feature = 0; + newOperator.feature_scale = NULL; + return newOperator; +} + +EE mt_insert_operator(ModelSpec *ms, int index, OperatorSpec newOperator) +{ + if (nullptr == ms) { + return NULL_POINTER; + } + OperatorSpec *operatorList = + (OperatorSpec *)mt_new_storage(sizeof(OperatorSpec) * (ms->num_operator_specs + 1)); + for (int i = 0; i < index; i++) { + operatorList[i] = ms->ops[i]; + } + operatorList[index] = newOperator; + for (int i = index; i < ms->num_operator_specs; i++) { + operatorList[i + 1] = ms->ops[i]; + } + delete ms->ops; + ms->ops = operatorList; + ms->num_operator_specs++; + return SUCCESS; +} + +WeightSpec mt_create_weight( + const char *name, DataType dataType, U32 bytesOfWeight, U32 bytesOfVec, U32 numQuantScale) +{ + WeightSpec newWeight; + initialization_zero(&(newWeight), sizeof(WeightSpec)); + U32 length = UNI_MIN(strlen(name), NAME_LEN - 1); + str_copy(newWeight.op_name, name, length); + if (length < NAME_LEN) { + newWeight.op_name[length] = '\0'; + } + newWeight.mdt = dataType; + newWeight.bytes_of_weight = bytesOfWeight; + newWeight.weight = (U8 *)mt_new_storage(bytesOfWeight); + newWeight.bytes_of_vec = bytesOfVec; + newWeight.vec = (U8 *)mt_new_storage(bytesOfVec); + newWeight.num_quant_scale = numQuantScale; + newWeight.weight_scale = (QuantSpec *)mt_new_storage(sizeof(QuantSpec) * numQuantScale); + return newWeight; +} + +bool isDeprecatedOp(OperatorType opType) +{ + return (opType == OT_None) ? true : false; +} + +bool isDeprecatedOpWeight(const ModelSpec *spec, int index) +{ + if (index >= spec->num_weight_specs) { + return true; + } else { + if (spec->ws[index].bytes_of_weight == 0 && spec->ws[index].bytes_of_vec == 0) { + return true; + } else { + return false; + } + } +} + +EE str_copy(I8 *dst, const I8 *src, I32 srcLen, I32 dstLen) +{ + memset(dst, 0, dstLen); + I32 copyLen = NAME_LEN - 1; + if (copyLen > srcLen) { + copyLen = srcLen; + } + memcpy(dst, src, copyLen * sizeof(I8)); + return SUCCESS; +} + +void *mt_new_storage(size_t size) +{ + if (size == 0) { + return nullptr; + } else { + U8 *s = (U8 *)operator new(size); + return (void *)s; + } +} diff --git a/common/uni/src/uni.cpp b/common/uni/src/uni.cpp new file mode 100644 index 00000000..506f4260 --- /dev/null +++ b/common/uni/src/uni.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
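For context, a sketch of how the types.cpp helpers above combine when a graph-surgery pass splices a node into a loaded model. The operator type OT_Relu, the tensor names, and the insertion index are illustrative assumptions; mt_create_operator, str_copy, and mt_insert_operator are the functions defined above (str_copy's three-argument form relies on the default dstLen its declaration provides, as the calls above do):

```cpp
// Hypothetical usage fragment; assumes "types.h" and a ModelSpec *ms
// populated by the model loader, with names and index made up.
OperatorSpec relu = mt_create_operator("relu1", OT_Relu, 1, 1);
str_copy(relu.input_tensors_name[0], "conv1_out", strlen("conv1_out"));
str_copy(relu.output_tensors_name[0], "relu1_out", strlen("relu1_out"));
CHECK_STATUS(mt_insert_operator(ms, insertIndex, relu));  // shifts ops[insertIndex..] right by one
```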
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_desc.h" + +#include <math.h> + +extern "C" int UNI_ISINF(float a) +{ +#ifdef isinf + return isinf(a); +#else +#if __cplusplus < 201103L + return isinf(a); +#else + return std::isinf(a); +#endif +#endif +} + +extern "C" int UNI_ISNAN(float a) +{ +#ifdef isnan + return isnan(a); +#else +#if __cplusplus < 201103L + return isnan(a); +#else + return std::isnan(a); +#endif +#endif +} diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt new file mode 100644 index 00000000..1ed03bab --- /dev/null +++ b/compute/CMakeLists.txt @@ -0,0 +1,17 @@ +cmake_minimum_required(VERSION 3.2) + +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) +if (BOLT_CONFIGURE_FILE) + include(${BOLT_CONFIGURE_FILE}) +else (BOLT_CONFIGURE_FILE) + message(FATAL_ERROR " +FATAL: can not find bolt.cmake in /common/cmakes directory, + please set shell or cmake environment variable BOLT_ROOT. + ") +endif (BOLT_CONFIGURE_FILE) + +project(compute) + +add_subdirectory(blas_enhance) +add_subdirectory(tensor) +add_subdirectory(image) diff --git a/compute/blas_enhance/CMakeLists.txt b/compute/blas_enhance/CMakeLists.txt new file mode 100644 index 00000000..70601eef --- /dev/null +++ b/compute/blas_enhance/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.2) + +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) +if (BOLT_CONFIGURE_FILE) + include(${BOLT_CONFIGURE_FILE}) +else (BOLT_CONFIGURE_FILE) + message(FATAL_ERROR " +FATAL: can not find bolt.cmake in directory, + please set shell or cmake environment variable BOLT_ROOT. + ") +endif (BOLT_CONFIGURE_FILE) + +project(blas_enhance) + +set_c_cxx_flags() + +include_blas_enhance() + +add_subdirectory(src) +add_subdirectory(tests) diff --git a/compute/blas_enhance/include/blas_enhance.h b/compute/blas_enhance/include/blas_enhance.h new file mode 100644 index 00000000..7cae3bcc --- /dev/null +++ b/compute/blas_enhance/include/blas_enhance.h @@ -0,0 +1,105 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
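The UNI_ISINF/UNI_ISNAN wrappers above paper over a C/C++ difference: C99's math.h provides isinf/isnan as macros, while C++11's cmath may replace them with std::isinf/std::isnan, so the wrapper checks for the macro first and exposes one extern "C" symbol either way. A minimal caller (sketch; links against uni.cpp above and assumes IEEE-754 float behavior):

```cpp
#include <cstdio>
#include <limits>

extern "C" int UNI_ISINF(float a);
extern "C" int UNI_ISNAN(float a);

int main()
{
    float inf = std::numeric_limits<float>::infinity();
    printf("%d %d\n", UNI_ISINF(inf), UNI_ISNAN(inf - inf));  // 1 1 (inf - inf is NaN)
    return 0;
}
```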
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_BLAS_ENHANCE +#define _H_BLAS_ENHANCE + +#include "sys.h" +#include "tensor_desc.h" +#include "types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +EE matrix_matrix_multiply_tmp_bytes( + TensorDesc matrixADesc, TensorDesc matrixBDesc, U32 *bytes, Arch arch); + +EE matrix_matrix_multiply(TensorDesc matrixADesc, + const void *matrixA, + TensorDesc matrixBDesc, + const void *matrixB, + U32 bytes, + void *tmp, + TensorDesc matrixCDesc, + void *matrixC, + Arch arch); + +EE matrix_vector_multiply_tmp_bytes(TensorDesc matrixDesc, TensorDesc vectorDesc, U32 *bytes, Arch); + +EE matrix_vector_multiply(TensorDesc matrixDesc, + const void *matrix, + TensorDesc vectorDesc, + const void *vector, + U32 bytes, + void *tmp, + TensorDesc resultDesc, + void *result, + Arch arch); + +inline DataFormat targetFormat4MatrixB(DataType dt) +{ + switch (dt) { + case DT_F16: { + return DF_NKN24; + } + case DT_F32: { +#ifdef __aarch64__ + return DF_NKN12; +#else + return DF_NKN8; +#endif + } + case DT_I8: { + return DF_NKN12K4; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + return DF_NCHWC8; + } + } +} + +inline DataFormat targetFormat4mvmMatrix(DataType dt) +{ + switch (dt) { + case DT_I8: { + return DF_NKN32K4; + } + case DT_F16: { + return DF_NKN64; + } + case DT_F32: { + return DF_NKN16; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + return DF_NCHWC8; + } + } +} + +EE matrix_matrix_multiply_transform_rhs( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, Arch arch); + +EE matrix_vector_multiply_transform_weight( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, Arch arch); + +EE vector_vector_axpby( + F32 a, TensorDesc xDesc, const void *x, F32 b, TensorDesc yDesc, void *y, Arch arch); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/compute/blas_enhance/src/CMakeLists.txt b/compute/blas_enhance/src/CMakeLists.txt new file mode 100644 index 00000000..23572214 --- /dev/null +++ b/compute/blas_enhance/src/CMakeLists.txt @@ -0,0 +1,44 @@ +if (USE_GENERAL) + file(GLOB general_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/general/*.cpp) +endif (USE_GENERAL) + +if (USE_X86) + file(GLOB x86_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/x86/*.cpp) + if (USE_FP32) + file(GLOB x86_fp32_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/x86/fp32/*.cpp) + endif (USE_FP32) + set(x86_srcs "${x86_srcs};${x86_fp32_srcs};") +endif (USE_X86) + +if (USE_NEON) + if (USE_FP16) + file(GLOB arm_fp16_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/fp16/*.cpp) + endif 
(USE_FP16) + if (USE_FP32) + file(GLOB arm_fp32_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/fp32/*.cpp) + endif (USE_FP32) + if (USE_INT8) + file(GLOB arm_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/*.cpp) + endif (USE_INT8) + file(GLOB arm_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/*.cpp) + set(arm_srcs "${arm_srcs};${arm_fp16_srcs};${arm_fp32_srcs};${arm_int8_srcs}") +endif (USE_NEON) + +file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) +set(srcs "${srcs};${general_srcs};${arm_srcs};${x86_srcs}") + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +# shared library +add_library(${PROJECT_NAME} SHARED ${srcs}) +target_link_libraries(${PROJECT_NAME} LINK_PUBLIC uni) + +# static library +add_library(${PROJECT_NAME}_static STATIC ${srcs}) + +set_target_properties(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") +set_target_properties(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +install(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) diff --git a/compute/blas_enhance/src/axpby.cpp b/compute/blas_enhance/src/axpby.cpp new file mode 100644 index 00000000..6f7cb448 --- /dev/null +++ b/compute/blas_enhance/src/axpby.cpp @@ -0,0 +1,53 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
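The axpby files that follow implement the BLAS-style "axpby" extension, y := a*x + b*y, at three levels: a public entry point that validates the tensor descriptors, an Arm dispatcher, and per-precision kernels. A scalar reference of the operation, useful for checking any vector path elementwise (sketch, float only):

```cpp
#include <cstdio>

// Reference: y := a*x + b*y over len elements.
static void axpby_ref(int len, float a, const float *x, float b, float *y)
{
    for (int i = 0; i < len; i++) {
        y[i] = a * x[i] + b * y[i];
    }
}

int main()
{
    float x[4] = {1, 2, 3, 4}, y[4] = {10, 10, 10, 10};
    axpby_ref(4, 2.0f, x, 0.5f, y);
    printf("%g %g\n", y[0], y[3]);  // 7 13
    return 0;
}
```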
+ +#include "blas_enhance.h" +#ifdef _USE_GENERAL +#include "cpu/general/blas_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/blas_arm.h" +#endif + +EE vector_vector_axpby( + F32 a, TensorDesc xDesc, const void *x, F32 b, TensorDesc yDesc, void *y, Arch arch) +{ + if (nullptr == x || nullptr == y) { + CHECK_STATUS(NULL_POINTER); + } + DataType xDataType, yDataType; + DataFormat xDataFormat, yDataFormat; + U32 xLen, yLen; + CHECK_STATUS(tensor1dGet(xDesc, &xDataType, &xDataFormat, &xLen)); + CHECK_STATUS(tensor1dGet(yDesc, &yDataType, &yDataFormat, &yLen)); + + if (xDataType != yDataType) { + CHECK_STATUS(NOT_MATCH); + } + + if (xLen != yLen) { + CHECK_STATUS(NOT_MATCH); + } + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = axpby_general(yLen, yDataType, a, x, b, y); +#endif +#ifdef _USE_NEON + } else { + ret = axpby_arm(yLen, yDataType, a, x, b, y, arch); +#endif + } + return ret; +} diff --git a/compute/blas_enhance/src/cpu/arm/axpby.cpp b/compute/blas_enhance/src/cpu/arm/axpby.cpp new file mode 100644 index 00000000..89bc34a2 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/axpby.cpp @@ -0,0 +1,46 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "error.h" +#include "types.h" +#include "cpu/arm/blas_arm.h" +#ifdef _USE_FP16 +#include "cpu/arm/fp16/blas_fp16.h" +#endif +#ifdef _USE_FP32 +#include "cpu/arm/fp32/blas_fp32.h" +#endif + +EE axpby_arm(U32 len, DataType dt, F32 a, const void *x, F32 b, void *y, Arch arch) +{ + EE ret = SUCCESS; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + if (ARM_A55 != arch && ARM_A76 != arch) { + return NOT_SUPPORTED; + } + ret = axpby_fp16(len, a, (F16 *)x, b, (F16 *)y); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + ret = axpby_fp32(len, a, (F32 *)x, b, (F32 *)y); + break; +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/blas_enhance/src/cpu/arm/blas_arm.h b/compute/blas_enhance/src/cpu/arm/blas_arm.h new file mode 100644 index 00000000..e52b72fa --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/blas_arm.h @@ -0,0 +1,64 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
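vector_vector_axpby and axpby_arm above show blas_enhance's two-level dispatch: #ifdef guards compile in only the backends the build enables, and a run-time switch on Arch and DataType picks among them, with NOT_SUPPORTED as the fall-through rather than a crash. The same shape in miniature (illustrative names, not the library's):

```cpp
#include <cstdio>

enum EE { SUCCESS = 0, NOT_SUPPORTED = 1 };
enum DataType { DT_F32, DT_F16 };

static EE scale_f32(int n, float a, float *y)
{
    for (int i = 0; i < n; i++) y[i] *= a;
    return SUCCESS;
}

static EE scale(DataType dt, int n, float a, void *y)
{
    EE ret = NOT_SUPPORTED;
    switch (dt) {
#ifdef _USE_FP32  // compiled in only when the build enables fp32
    case DT_F32:
        ret = scale_f32(n, a, (float *)y);
        break;
#endif
    default:  // unknown or disabled type: report, don't crash
        break;
    }
    return ret;
}

int main()
{
    float y[2] = {1, 2};
    printf("%d\n", scale(DT_F32, 2, 3.0f, y));  // 0 if built with -D_USE_FP32, else 1
    return 0;
}
```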
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_BLAS_ARM +#define _H_BLAS_ARM + +#include "error.h" +#include "sys.h" +#include "tensor_desc.h" + +EE matrix_vector_multiply_tmp_bytes_arm(bool transpose, DataType dt, U32 *bytes); + +EE matrix_vector_multiply_transform_weight_arm( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst); + +EE mvm_arm(U32 row, + U32 col, + DataType dt, + DataFormat df, + const void *matrix, + const void *vector, + void *tmp, + void *result, + Arch arch); + +EE matrix_matrix_multiply_tmp_bytes_arm( + U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N, DataType dt, U32 *bytes); + +EE matrix_matrix_multiply_transform_rhs_arm( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst); + +EE mmm_arm(U32 matrixC_N, + U32 matrixC_M, + U32 matrixA_K, + DataType matrixADataType, + bool transposeA, + const void *matrixAData, + const void *matrixBData, + void *tmp, + void *matrixCData, + Arch arch); + +inline U32 pad_to_4_multiple(U32 k) +{ + if (k % 4 == 0) { + return k; + } else { + return (k / 4) * 4 + 4; + } +} + +EE axpby_arm(U32 len, DataType dt, F32 a, const void *x, F32 b, void *y, Arch arch); + +#endif diff --git a/compute/blas_enhance/src/cpu/arm/fp16/axpby.cpp b/compute/blas_enhance/src/cpu/arm/fp16/axpby.cpp new file mode 100644 index 00000000..9f3589af --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp16/axpby.cpp @@ -0,0 +1,34 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "error.h" +#include "cpu/arm/fp16/blas_fp16.h" + +EE axpby_fp16(U32 len, F32 a, const F16 *x, F32 b, F16 *y) +{ + EE ret = SUCCESS; + float16x8_t alpha = vdupq_n_f16(a); + float16x8_t beta = vdupq_n_f16(b); + I32 i = 0; + for (; i < ((I32)len) - 7; i += 8) { + float16x8_t out = vld1q_f16(y + i); + float16x8_t in = vld1q_f16(x + i); + out = vmulq_f16(out, beta); + out = vfmaq_f16(out, alpha, in); + vst1q_f16(y + i, out); + } + for (; i < (I32)len; i++) { + y[i] = a * x[i] + b * y[i]; + } + return ret; +} diff --git a/compute/blas_enhance/src/cpu/arm/fp16/blas_fp16.h b/compute/blas_enhance/src/cpu/arm/fp16/blas_fp16.h new file mode 100644 index 00000000..fbc144d7 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp16/blas_fp16.h @@ -0,0 +1,40 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_FP16 +#ifndef _H_BLAS_FP16 +#define _H_BLAS_FP16 + +#include "sys.h" +#include "types.h" +#include "error.h" +#include "tensor_desc.h" + +EE matrix_vector_multiply_transform_weight_fp16(TensorDesc desc, F16 *src, F16 *dst); + +EE mvm_fp16(U32 row, U32 col, DataFormat df, F16 *matrix, F16 *vector, F16 *result, Arch arch); + +void matrix_matrix_multiply_tmp_bytes_fp16( + U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes); + +EE matrix_matrix_multiply_transform_rhsN_fp16(TensorDesc desc, F16 *src, F16 *dst); + +EE matrix_matrix_multiply_transform_rhsT_fp16(TensorDesc desc, F16 *src, F16 *dst); + +EE mmm_fp16( + int M, int N, int K, bool transposeA, F16 *matrix1, F16 *matrix2, F16 *tmp, F16 *result, Arch arch); + +EE axpby_fp16(U32 len, F32 a, const F16 *x, F32 b, F16 *y); + +#endif +#endif diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mmm.cpp b/compute/blas_enhance/src/cpu/arm/fp16/mmm.cpp new file mode 100644 index 00000000..8b7fc590 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp16/mmm.cpp @@ -0,0 +1,88 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
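axpby_fp16 above is the canonical NEON kernel shape: broadcast the scalars once (vdupq_n_f16), run a main loop that retires one full 8-lane register per iteration with a fused multiply-add (vfmaq_f16), then finish the remainder with scalar code so any length is handled. The same pattern at fp32, which compiles without half-precision support (sketch; assumes an AArch64 toolchain providing arm_neon.h):

```cpp
#include <arm_neon.h>

void axpby_f32_neon(int len, float a, const float *x, float b, float *y)
{
    float32x4_t alpha = vdupq_n_f32(a);
    float32x4_t beta = vdupq_n_f32(b);
    int i = 0;
    for (; i < len - 3; i += 4) {      // full 4-lane registers
        float32x4_t vy = vld1q_f32(y + i);
        float32x4_t vx = vld1q_f32(x + i);
        vy = vmulq_f32(vy, beta);      // y *= b
        vy = vfmaq_f32(vy, alpha, vx); // y += a * x (fused)
        vst1q_f32(y + i, vy);
    }
    for (; i < len; i++) {             // scalar tail, same arithmetic
        y[i] = a * x[i] + b * y[i];
    }
}
```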
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "types.h" +#include "error.h" +#include "cpu/arm/fp16/blas_fp16.h" +#include "mmm.h" +#include "mmm_common.h" + +void matrix_matrix_multiply_tmp_bytes_fp16( + U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes) +{ + *bytes = row1 * col1 + row2 * col2; + *bytes *= bytesOf(dt); + *bytes += 32; +} + +EE matrix_matrix_multiply_transform_rhsN_fp16(TensorDesc desc, F16 *src, F16 *dst) +{ + DataType dt; + DataFormat df; + U32 N, K; + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); + int i = 0; + for (; i < (int)N - 23; i += 24) { + matrix2_trans(24, K, N, src + i, dst + i * K); + } + for (; i < (int)N - 7; i += 8) { + matrix2_trans(8, K, N, src + i, dst + i * K); + } + for (; i < (int)N - 3; i += 4) { + matrix2_trans(4, K, N, src + i, dst + i * K); + } + if ((int)N > i) { + matrix2_trans(N - i, K, N, src + i, dst + i * K); + } + return SUCCESS; +} + +EE matrix_matrix_multiply_transform_rhsT_fp16(TensorDesc desc, F16 *src, F16 *dst) +{ + DataType dt; + DataFormat df; + U32 N, K; + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); + int i = 0; + for (; i < (int)N - 23; i += 24) { + matrix1_trans(24, K, K, src + i * K, dst + i * K); + } + for (; i < (int)N - 7; i += 8) { + matrix1_trans(8, K, K, src + i * K, dst + i * K); + } + for (; i < (int)N - 3; i += 4) { + matrix1_trans(4, K, K, src + i * K, dst + i * K); + } + if ((int)N > i) { + matrix1_trans(N - i, K, K, src + i * K, dst + i * K); + } + return SUCCESS; +} + +EE mmm_fp16( + int M, int N, int K, bool transposeA, F16 *matrix1, F16 *matrix2, F16 *tmp, F16 *result, Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: + mmm_A55(M, N, K, transposeA, matrix1, matrix2, tmp, result); + break; + case ARM_A76: + mmm_A76(M, N, K, transposeA, matrix1, matrix2, tmp, result); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mmm.h b/compute/blas_enhance/src/cpu/arm/fp16/mmm.h new file mode 100644 index 00000000..a724ea98 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp16/mmm.h @@ -0,0 +1,23 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
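The rhsN/rhsT transforms above repack matrix B into column panels 24, 8, then 4 wide (plus a tail), matching the micro-kernel widths, so the K loop in the kernels reads B as one contiguous stream. What one panel pack does, written out as a runnable sketch (matrix2_trans's behavior is inferred from its call sites here, minus any prefetching):

```cpp
#include <cstdio>

// Pack a w-column panel of a K x N row-major matrix into contiguous
// k-major form: w consecutive elements per k step.
static void pack_panel(int w, int K, int N, const float *src, float *dst)
{
    for (int k = 0; k < K; k++) {
        for (int j = 0; j < w; j++) {
            *dst++ = src[k * N + j];
        }
    }
}

int main()
{
    float B[2 * 5], panel[2 * 4];  // K = 2, N = 5, panel width 4
    for (int i = 0; i < 10; i++) {
        B[i] = i;
    }
    pack_panel(4, 2, 5, B, panel);
    printf("%g %g\n", panel[3], panel[4]);  // 3 (k=0, j=3), 5 (k=1, j=0)
    return 0;
}
```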
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_MMM +#define _H_MMM +#include "types.h" + +void mmm_A55( + int M, int N, int K, bool transposeA, F16 *matrix1, F16 *matrix2, F16 *tmp, F16 *result); + +void mmm_A76( + int M, int N, int K, bool transposeA, F16 *matrix1, F16 *matrix2, F16 *tmp, F16 *result); +#endif diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mmm_A55.cpp b/compute/blas_enhance/src/cpu/arm/fp16/mmm_A55.cpp new file mode 100644 index 00000000..9166bde4 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp16/mmm_A55.cpp @@ -0,0 +1,783 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
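The A55 kernels below keep an output tile resident in NEON registers for the entire K loop (mmm_8x24 uses v5..v28 for an 8x24 tile, nearly the whole register file) and interleave loads with fmla. The split loads (a 64-bit ld1 plus an ldr into a general register, then ins to complete the 128-bit vector) are the usual Cortex-A55 tuning trick: the in-order A55 can reportedly dual-issue a 64-bit load with a vector FMA, but not a full 128-bit load. The register-tiling idea in portable form (simplified sketch; scalar accumulators stand in for the v-registers):

```cpp
#include <cstdio>

// c (ldc-strided) accumulates a 4x4 tile of a^T * b, with a and b packed
// k-major like the transformed operands above: 4 values per k step.
static void mmm_4x4_ref(int K, int ldc, const float *a, const float *b, float *c)
{
    float acc[4][4] = {{0}};
    for (int k = 0; k < K; k++) {
        for (int i = 0; i < 4; i++) {
            for (int j = 0; j < 4; j++) {
                acc[i][j] += a[k * 4 + i] * b[k * 4 + j];  // one fmla lane
            }
        }
    }
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 4; j++) {
            c[i * ldc + j] += acc[i][j];  // single writeback, like the st1 block at "1:"
        }
    }
}

int main()
{
    float a[8], b[8], c[16] = {0};  // K = 2
    for (int i = 0; i < 8; i++) {
        a[i] = 1;
        b[i] = i;
    }
    mmm_4x4_ref(2, 4, a, b, c);
    printf("%g\n", c[0]);  // b[0] + b[4] = 4
    return 0;
}
```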
+ +#include +#include +#include + +#include "types.h" +#include "error.h" +#include "cpu/arm/fp16/mmm_common.h" +#include "cpu/arm/fp16/mmm.h" + +inline void mmm_4x24_A55(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile( + // init in0- > v1, w- > v0 + "ld1 {v1.4h}, [%1], #8\n" + "ldr x22, [%1], #8\n" + "ins v1.d[1], x22\n" + "ld1 {v0.4h}, [%2], #8\n" + "mov x26, %0\n" + "ld1 {v5.8h, v6.8h, v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.8h, v9.8h, v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.8h, v12.8h, v13.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.8h, v15.8h, v16.8h}, [x26]\n" + + "mov x20, %3\n" + + "0:\n" + // w- > v4, in0- > v2/v3/v1, out0=v5~v28 + "fmla v5.8h, v1.8h, v0.h[0]\n" + "ld1 {v2.4h}, [%1], #8\n" + "fmla v8.8h, v1.8h, v0.h[1]\n" + "ldr x23, [%1], #8\n" + "fmla v11.8h, v1.8h, v0.h[2]\n" + "ins v2.d[1], x23\n" + "fmla v14.8h, v1.8h, v0.h[3]\n" + "ld1 {v4.4h}, [%2], #8\n" + "fmla v6.8h, v2.8h, v0.h[0]\n" + "ld1 {v3.4h}, [%1], #8\n" + "fmla v9.8h, v2.8h, v0.h[1]\n" + "ldr x24, [%1], #8\n" + "fmla v12.8h, v2.8h, v0.h[2]\n" + "ins v3.d[1], x24\n" + "fmla v15.8h, v2.8h, v0.h[3]\n" + "fmla v7.8h, v3.8h, v0.h[0]\n" + "ld1 {v1.4h}, [%1], #8\n" + "fmla v10.8h, v3.8h, v0.h[1]\n" + "ldr x22, [%1], #8\n" + "fmla v13.8h, v3.8h, v0.h[2]\n" + "ins v1.d[1], x22\n" + "fmla v16.8h, v3.8h, v0.h[3]\n" + + // w- > v0, in0- > v2/v3/v1, out0- > v5~v28 + "fmla v5.8h, v1.8h, v4.h[0]\n" + "ld1 {v2.4h}, [%1], #8\n" + "fmla v8.8h, v1.8h, v4.h[1]\n" + "ldr x23, [%1], #8\n" + "fmla v11.8h, v1.8h, v4.h[2]\n" + "ins v2.d[1], x23\n" + "fmla v14.8h, v1.8h, v4.h[3]\n" + "ld1 {v0.4h}, [%2], #8\n" + "fmla v6.8h, v2.8h, v4.h[0]\n" + "ld1 {v3.4h}, [%1], #8\n" + "fmla v9.8h, v2.8h, v4.h[1]\n" + "ldr x24, [%1], #8\n" + "fmla v12.8h, v2.8h, v4.h[2]\n" + "ins v3.d[1], x24\n" + "fmla v15.8h, v2.8h, v4.h[3]\n" + "fmla v7.8h, v3.8h, v4.h[0]\n" + "ld1 {v1.4h}, [%1], #8\n" + "fmla v10.8h, v3.8h, v4.h[1]\n" + "ldr x22, [%1], #8\n" + "fmla v13.8h, v3.8h, v4.h[2]\n" + "ins v1.d[1], x22\n" + "fmla v16.8h, v3.8h, v4.h[3]\n" + + "subs x20, x20, #0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" + "fmla v5.8h, v1.8h, v0.h[0]\n" + "ld1 {v2.4h}, [%1], #8\n" + "fmla v8.8h, v1.8h, v0.h[1]\n" + "ldr x23, [%1], #8\n" + "fmla v11.8h, v1.8h, v0.h[2]\n" + "ins v2.d[1], x23\n" + "fmla v14.8h, v1.8h, v0.h[3]\n" + "fmla v6.8h, v2.8h, v0.h[0]\n" + "ld1 {v3.4h}, [%1], #8\n" + "fmla v9.8h, v2.8h, v0.h[1]\n" + "ldr x24, [%1], #8\n" + "fmla v12.8h, v2.8h, v0.h[2]\n" + "ins v3.d[1], x24\n" + "fmla v15.8h, v2.8h, v0.h[3]\n" + "fmla v7.8h, v3.8h, v0.h[0]\n" + "fmla v10.8h, v3.8h, v0.h[1]\n" + "fmla v13.8h, v3.8h, v0.h[2]\n" + "fmla v16.8h, v3.8h, v0.h[3]\n" + + "1:\n" + "mov x26, %0\n" + "st1 {v5.8h, v6.8h, v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.8h, v9.8h, v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.8h, v12.8h, v13.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.8h, v15.8h, v16.8h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", + "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16"); +} + +inline void mmm_8x4_A55(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile("mov x26, %0\n" + "ld1 {v5.h}[0], [x26], #2\n" + "ld1 {v6.h}[0], [x26], #2\n" + "ld1 {v7.h}[0], [x26], #2\n" + "ld1 {v8.h}[0], [x26], #2\n" + "sub x26, x26, #8\n" + "add 
x26, x26, %4\n" + "ld1 {v5.h}[1], [x26], #2\n" + "ld1 {v6.h}[1], [x26], #2\n" + "ld1 {v7.h}[1], [x26], #2\n" + "ld1 {v8.h}[1], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[2], [x26], #2\n" + "ld1 {v6.h}[2], [x26], #2\n" + "ld1 {v7.h}[2], [x26], #2\n" + "ld1 {v8.h}[2], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[3], [x26], #2\n" + "ld1 {v6.h}[3], [x26], #2\n" + "ld1 {v7.h}[3], [x26], #2\n" + "ld1 {v8.h}[3], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[4], [x26], #2\n" + "ld1 {v6.h}[4], [x26], #2\n" + "ld1 {v7.h}[4], [x26], #2\n" + "ld1 {v8.h}[4], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[5], [x26], #2\n" + "ld1 {v6.h}[5], [x26], #2\n" + "ld1 {v7.h}[5], [x26], #2\n" + "ld1 {v8.h}[5], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[6], [x26], #2\n" + "ld1 {v6.h}[6], [x26], #2\n" + "ld1 {v7.h}[6], [x26], #2\n" + "ld1 {v8.h}[6], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[7], [x26], #2\n" + "ld1 {v6.h}[7], [x26], #2\n" + "ld1 {v7.h}[7], [x26], #2\n" + "ld1 {v8.h}[7], [x26], #2\n" + "add x26, x26, %4\n" + + "mov x20, %3\n" + + "ld1 {v1.4h}, [%2], #8\n" + "ldr x24, [%2], #8\n" + "ins v1.d[1], x24\n" + "ld1 {v2.4h}, [%1], #8\n" + + "0:\n" + "fmla v5.8h, v1.8h, v2.h[0]\n" + "ld1 {v3.4h}, [%2], #8\n" + "fmla v6.8h, v1.8h, v2.h[1]\n" + "ldr x25, [%2], #8\n" + "ld1 {v4.4h}, [%1], #8\n" + "fmla v7.8h, v1.8h, v2.h[2]\n" + "ins v3.d[1], x25\n" + "fmla v8.8h, v1.8h, v2.h[3]\n" + + "fmla v5.8h, v3.8h, v4.h[0]\n" + "ld1 {v1.4h}, [%2], #8\n" + "fmla v6.8h, v3.8h, v4.h[1]\n" + "ldr x24, [%2], #8\n" + "ld1 {v2.4h}, [%1], #8\n" + "fmla v7.8h, v3.8h, v4.h[2]\n" + "ins v1.d[1], x24\n" + "fmla v8.8h, v3.8h, v4.h[3]\n" + + "subs x20, x20, 0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" + "fmla v5.8h, v1.8h, v2.h[0]\n" + "fmla v6.8h, v1.8h, v2.h[1]\n" + "fmla v7.8h, v1.8h, v2.h[2]\n" + "fmla v8.8h, v1.8h, v2.h[3]\n" + + "1:\n" + "mov x26, %0\n" + "st1 {v5.h}[0], [x26], #2\n" + "st1 {v6.h}[0], [x26], #2\n" + "st1 {v7.h}[0], [x26], #2\n" + "st1 {v8.h}[0], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[1], [x26], #2\n" + "st1 {v6.h}[1], [x26], #2\n" + "st1 {v7.h}[1], [x26], #2\n" + "st1 {v8.h}[1], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[2], [x26], #2\n" + "st1 {v6.h}[2], [x26], #2\n" + "st1 {v7.h}[2], [x26], #2\n" + "st1 {v8.h}[2], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[3], [x26], #2\n" + "st1 {v6.h}[3], [x26], #2\n" + "st1 {v7.h}[3], [x26], #2\n" + "st1 {v8.h}[3], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[4], [x26], #2\n" + "st1 {v6.h}[4], [x26], #2\n" + "st1 {v7.h}[4], [x26], #2\n" + "st1 {v8.h}[4], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[5], [x26], #2\n" + "st1 {v6.h}[5], [x26], #2\n" + "st1 {v7.h}[5], [x26], #2\n" + "st1 {v8.h}[5], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[6], [x26], #2\n" + "st1 {v6.h}[6], [x26], #2\n" + "st1 {v7.h}[6], [x26], #2\n" + "st1 {v8.h}[6], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[7], [x26], #2\n" + "st1 {v6.h}[7], [x26], #2\n" + "st1 {v7.h}[7], [x26], #2\n" + "st1 {v8.h}[7], [x26], #2\n" + "add x26, x26, %4\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x20", "x24", "x25", "x26", "x27", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8"); +} + +inline 
void mmm_4x8_A55(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile("mov x26, %0\n" + "ld1 {v5.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v6.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.8h}, [x26]\n" + + "mov x20, %3\n" + + "ld1 {v1.4h}, [%1], #8\n" + "ldr x24, [%1], #8\n" + "ins v1.d[1], x24\n" + "ld1 {v2.4h}, [%2], #8\n" + + "0:\n" + "fmla v5.8h, v1.8h, v2.h[0]\n" + "ld1 {v3.4h}, [%1], #8\n" + "fmla v6.8h, v1.8h, v2.h[1]\n" + "ldr x25, [%1], #8\n" + "ld1 {v4.4h}, [%2], #8\n" + "fmla v7.8h, v1.8h, v2.h[2]\n" + "ins v3.d[1], x25\n" + "fmla v8.8h, v1.8h, v2.h[3]\n" + "fmla v5.8h, v3.8h, v4.h[0]\n" + "ld1 {v1.4h}, [%1], #8\n" + "fmla v6.8h, v3.8h, v4.h[1]\n" + "ldr x24, [%1], #8\n" + "ld1 {v2.4h}, [%2], #8\n" + "fmla v7.8h, v3.8h, v4.h[2]\n" + "ins v1.d[1], x24\n" + "fmla v8.8h, v3.8h, v4.h[3]\n" + + "subs x20, x20, 0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" + "fmla v5.8h, v1.8h, v2.h[0]\n" + "fmla v6.8h, v1.8h, v2.h[1]\n" + "fmla v7.8h, v1.8h, v2.h[2]\n" + "fmla v8.8h, v1.8h, v2.h[3]\n" + + "1:\n" + "mov x26, %0\n" + "st1 {v5.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v6.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.8h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x20", "x24", "x25", "x26", "x27", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8"); +} + +inline void mmm_4x4_A55(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile("mov x26, %0\n" + "ld1 {v5.4h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v6.4h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v7.4h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.4h}, [x26]\n" + + "mov x20, %3\n" + + "ld1 {v1.4h}, [%1], #8\n" + "ld1 {v2.4h}, [%2], #8\n" + + "0:\n" + "fmla v5.4h, v1.4h, v2.h[0]\n" + "ld1 {v3.4h}, [%1], #8\n" + "fmla v6.4h, v1.4h, v2.h[1]\n" + "ld1 {v4.4h}, [%2], #8\n" + "fmla v7.4h, v1.4h, v2.h[2]\n" + "fmla v8.4h, v1.4h, v2.h[3]\n" + "fmla v5.4h, v3.4h, v4.h[0]\n" + "ld1 {v1.4h}, [%1], #8\n" + "fmla v6.4h, v3.4h, v4.h[1]\n" + "ld1 {v2.4h}, [%2], #8\n" + "fmla v7.4h, v3.4h, v4.h[2]\n" + "fmla v8.4h, v3.4h, v4.h[3]\n" + + "subs x20, x20, 0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" + "fmla v5.4h, v1.4h, v2.h[0]\n" + "fmla v6.4h, v1.4h, v2.h[1]\n" + "fmla v7.4h, v1.4h, v2.h[2]\n" + "fmla v8.4h, v1.4h, v2.h[3]\n" + + "1:\n" + "mov x26, %0\n" + "st1 {v5.4h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v6.4h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v7.4h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.4h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x20", "x26", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"); +} + +inline void mmm_8x8_A55(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile("mov x26, %0\n" + "ld1 {v5.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v6.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v9.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v12.8h}, [x26]\n" + + "mov x20, %3\n" + + "ld1 {v1.4h}, [%1], #8\n" + "ldr x24, [%1], #8\n" + "ins v1.d[1], x24\n" + "ld1 {v2.4h}, [%2], #8\n" + "ldr x22, [%2], #8\n" + "ins v2.d[1], x22\n" + + "0:\n" + "fmla v5.8h, 
v1.8h, v2.h[0]\n" + "ld1 {v3.4h}, [%1], #8\n" + "fmla v6.8h, v1.8h, v2.h[1]\n" + "ldr x25, [%1], #8\n" + "fmla v7.8h, v1.8h, v2.h[2]\n" + "ins v3.d[1], x25\n" + "fmla v8.8h, v1.8h, v2.h[3]\n" + "ld1 {v4.4h}, [%2], #8\n" + "fmla v9.8h, v1.8h, v2.h[4]\n" + "ldr x23, [%2], #8\n" + "fmla v10.8h, v1.8h, v2.h[5]\n" + "ins v4.d[1], x23\n" + "fmla v11.8h, v1.8h, v2.h[6]\n" + "fmla v12.8h, v1.8h, v2.h[7]\n" + + "fmla v5.8h, v3.8h, v4.h[0]\n" + "ld1 {v1.4h}, [%1], #8\n" + "fmla v6.8h, v3.8h, v4.h[1]\n" + "ldr x24, [%1], #8\n" + "fmla v7.8h, v3.8h, v4.h[2]\n" + "ins v1.d[1], x24\n" + "fmla v8.8h, v3.8h, v4.h[3]\n" + "ld1 {v2.4h}, [%2], #8\n" + "fmla v9.8h, v3.8h, v4.h[4]\n" + "ldr x22, [%2], #8\n" + "fmla v10.8h, v3.8h, v4.h[5]\n" + "ins v2.d[1], x22\n" + "fmla v11.8h, v3.8h, v4.h[6]\n" + "fmla v12.8h, v3.8h, v4.h[7]\n" + + "subs x20, x20, 0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" + "fmla v5.8h, v1.8h, v2.h[0]\n" + "fmla v6.8h, v1.8h, v2.h[1]\n" + "fmla v7.8h, v1.8h, v2.h[2]\n" + "fmla v8.8h, v1.8h, v2.h[3]\n" + "fmla v9.8h, v1.8h, v2.h[4]\n" + "fmla v10.8h, v1.8h, v2.h[5]\n" + "fmla v11.8h, v1.8h, v2.h[6]\n" + "fmla v12.8h, v1.8h, v2.h[7]\n" + + "1:\n" + "mov x26, %0\n" + "st1 {v5.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v6.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v9.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v12.8h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v1", + "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12"); +} + +inline void mmm_8x24_A55(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile( + // init in0- > v1, w- > v0 + "ld1 {v1.4h}, [%1], #8\n" + "ldr x22, [%1], #8\n" + "ins v1.d[1], x22\n" + "ld1 {v0.4h}, [%2], #8\n" + "ldr x21, [%2], #8\n" + "ins v0.d[1], x21\n" + + "mov x26, %0\n" + "ld1 {v5.8h, v6.8h, v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.8h, v9.8h, v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.8h, v12.8h, v13.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.8h, v15.8h, v16.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v17.8h, v18.8h, v19.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v20.8h, v21.8h, v22.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v23.8h, v24.8h, v25.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v26.8h, v27.8h, v28.8h}, [x26]\n" + + "mov x20, %3\n" + + "0:\n" + // w- > v4, in0- > v2/v3/v1, out0=v5~v28 + "ld1 {v2.4h}, [%1], #8\n" + "fmla v5.8h, v1.8h, v0.h[0]\n" + "fmla v8.8h, v1.8h, v0.h[1]\n" + "ldr x23, [%1], #8\n" + "fmla v11.8h, v1.8h, v0.h[2]\n" + "fmla v14.8h, v1.8h, v0.h[3]\n" + "ins v2.d[1], x23\n" + "fmla v17.8h, v1.8h, v0.h[4]\n" + "fmla v20.8h, v1.8h, v0.h[5]\n" + "ld1 {v4.4h}, [%2], #8\n" + "fmla v23.8h, v1.8h, v0.h[6]\n" + "fmla v26.8h, v1.8h, v0.h[7]\n" + + "ld1 {v3.4h}, [%1], #8\n" + "fmla v6.8h, v2.8h, v0.h[0]\n" + "fmla v9.8h, v2.8h, v0.h[1]\n" + "ldr x24, [%1], #8\n" + "fmla v12.8h, v2.8h, v0.h[2]\n" + "fmla v15.8h, v2.8h, v0.h[3]\n" + "ins v3.d[1], x24\n" + "fmla v18.8h, v2.8h, v0.h[4]\n" + "fmla v21.8h, v2.8h, v0.h[5]\n" + "ldr x25, [%2], #8\n" + "fmla v24.8h, v2.8h, v0.h[6]\n" + "fmla v27.8h, v2.8h, v0.h[7]\n" + + "ld1 {v1.4h}, [%1], #8\n" + "fmla v7.8h, v3.8h, v0.h[0]\n" + "fmla v10.8h, v3.8h, v0.h[1]\n" + "ldr x22, [%1], #8\n" + "fmla 
v13.8h, v3.8h, v0.h[2]\n" + "fmla v16.8h, v3.8h, v0.h[3]\n" + "ins v1.d[1], x22\n" + "fmla v19.8h, v3.8h, v0.h[4]\n" + "fmla v22.8h, v3.8h, v0.h[5]\n" + "ins v4.d[1], x25\n" + "fmla v25.8h, v3.8h, v0.h[6]\n" + "fmla v28.8h, v3.8h, v0.h[7]\n" + + // w- > v0, in0- > v2/v3/v1, out0- > v5~v28 + "ld1 {v2.4h}, [%1], #8\n" + "fmla v5.8h, v1.8h, v4.h[0]\n" + "fmla v8.8h, v1.8h, v4.h[1]\n" + "ldr x23, [%1], #8\n" + "fmla v11.8h, v1.8h, v4.h[2]\n" + "fmla v14.8h, v1.8h, v4.h[3]\n" + "ins v2.d[1], x23\n" + "fmla v17.8h, v1.8h, v4.h[4]\n" + "fmla v20.8h, v1.8h, v4.h[5]\n" + "ld1 {v0.4h}, [%2], #8\n" + "fmla v23.8h, v1.8h, v4.h[6]\n" + "fmla v26.8h, v1.8h, v4.h[7]\n" + + "ld1 {v3.4h}, [%1], #8\n" + "fmla v6.8h, v2.8h, v4.h[0]\n" + "fmla v9.8h, v2.8h, v4.h[1]\n" + "ldr x24, [%1], #8\n" + "fmla v12.8h, v2.8h, v4.h[2]\n" + "fmla v15.8h, v2.8h, v4.h[3]\n" + "ins v3.d[1], x24\n" + "fmla v18.8h, v2.8h, v4.h[4]\n" + "fmla v21.8h, v2.8h, v4.h[5]\n" + "ldr x21, [%2], #8\n" + "fmla v24.8h, v2.8h, v4.h[6]\n" + "fmla v27.8h, v2.8h, v4.h[7]\n" + + "ld1 {v1.4h}, [%1], #8\n" + "fmla v7.8h, v3.8h, v4.h[0]\n" + "fmla v10.8h, v3.8h, v4.h[1]\n" + "ldr x22, [%1], #8\n" + "fmla v13.8h, v3.8h, v4.h[2]\n" + "fmla v16.8h, v3.8h, v4.h[3]\n" + "ins v1.d[1], x22\n" + "fmla v19.8h, v3.8h, v4.h[4]\n" + "fmla v22.8h, v3.8h, v4.h[5]\n" + "ins v0.d[1], x21\n" + "fmla v25.8h, v3.8h, v4.h[6]\n" + "subs x20, x20, #0x2\n" + "fmla v28.8h, v3.8h, v4.h[7]\n" + + "bne 0b\n" + + "cbz %5, 1f\n" + "ld1 {v2.4h}, [%1], #8\n" + "fmla v5.8h, v1.8h, v0.h[0]\n" + "fmla v8.8h, v1.8h, v0.h[1]\n" + "ldr x23, [%1], #8\n" + "fmla v11.8h, v1.8h, v0.h[2]\n" + "fmla v14.8h, v1.8h, v0.h[3]\n" + "ins v2.d[1], x23\n" + "fmla v17.8h, v1.8h, v0.h[4]\n" + "fmla v20.8h, v1.8h, v0.h[5]\n" + "fmla v23.8h, v1.8h, v0.h[6]\n" + "fmla v26.8h, v1.8h, v0.h[7]\n" + + "ld1 {v3.4h}, [%1], #8\n" + "fmla v6.8h, v2.8h, v0.h[0]\n" + "fmla v9.8h, v2.8h, v0.h[1]\n" + "ldr x24, [%1], #8\n" + "fmla v12.8h, v2.8h, v0.h[2]\n" + "fmla v15.8h, v2.8h, v0.h[3]\n" + "ins v3.d[1], x24\n" + "fmla v18.8h, v2.8h, v0.h[4]\n" + "fmla v21.8h, v2.8h, v0.h[5]\n" + "fmla v24.8h, v2.8h, v0.h[6]\n" + "fmla v27.8h, v2.8h, v0.h[7]\n" + + "fmla v7.8h, v3.8h, v0.h[0]\n" + "fmla v10.8h, v3.8h, v0.h[1]\n" + "fmla v13.8h, v3.8h, v0.h[2]\n" + "fmla v16.8h, v3.8h, v0.h[3]\n" + "fmla v19.8h, v3.8h, v0.h[4]\n" + "fmla v22.8h, v3.8h, v0.h[5]\n" + "fmla v25.8h, v3.8h, v0.h[6]\n" + "fmla v28.8h, v3.8h, v0.h[7]\n" + + "1:\n" + "mov x26, %0\n" + "st1 {v5.8h, v6.8h, v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.8h, v9.8h, v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.8h, v12.8h, v13.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.8h, v15.8h, v16.8h}, [x26]\n" + "add x26, x26, %4\n" + + "st1 {v17.8h, v18.8h, v19.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v20.8h, v21.8h, v22.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v23.8h, v24.8h, v25.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v26.8h, v27.8h, v28.8h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", + "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28"); +} + +void mmm_A55(int M, int N, int K, bool transposeA, F16 *matrix1, F16 *matrix2, F16 *tmp, F16 *result) +{ + int blockK = K; + int blockM = 192; + F16 *matrix1Trans = tmp; + F16 *resultCurrent = result; + int KInner, MInner, m, n; + 
for (int k = 0; k < K; k += blockK) { + KInner = UNI_MIN(blockK, K - k); + for (int i = 0; i < M; i += blockM) { + MInner = UNI_MIN(blockM, M - i); + + for (n = 0; n <= N - 8; n += 8) { + if (i == 0) { + if (transposeA) { + matrix2_trans(8, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans(8, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } + } + for (m = 0; m <= (MInner - 24); m += 24) { + resultCurrent = result + n * M + m + i; + mmm_8x24_A55(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_8x8_A55(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_8x4_A55(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N8_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + } + + if ((N - n) >= 4) { + if (i == 0) { + if (transposeA) { + matrix2_trans(4, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } + } + + for (m = 0; m <= (MInner - 24); m += 24) { + resultCurrent = result + n * M + m + i; + mmm_4x24_A55(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_4x8_A55(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_4x4_A55(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N4_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + n += 4; + } + + if (N - n) { + if (i == 0) { + if (transposeA) { + matrix2_trans(N - n, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans( + N - n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } + } + + for (m = 0; m <= (MInner - 24); m += 24) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M24(M, N - n, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M8(M, N - n, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M4(M, N - n, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M(MInner - m, M, N - n, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + } + } + } +} diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mmm_A76.cpp b/compute/blas_enhance/src/cpu/arm/fp16/mmm_A76.cpp new file mode 100644 index 00000000..74df49e2 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp16/mmm_A76.cpp @@ -0,0 +1,592 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
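mmm_A55 above is the pack-and-block driver: M is tiled by blockM = 192 so the packed panels stay cache-resident, matrix1 is repacked only on the first M block (i == 0) and reused for every later block, and kernel selection cascades 24/8/4/tail along M and 8/4/tail along N (the result stride is passed as M * 2, i.e. in bytes for fp16, since the kernels advance raw pointers). The loop skeleton in simplified, runnable form (stand-in scalar kernel, no packing):

```cpp
#include <vector>
#include <cstdio>

// Stand-in for the micro-kernel cascade: computes one nw x mw tile of
// C += A * B^T with both operands stored k-major per row/column.
static void micro_tile(int K, int M, const float *A, const float *B, float *C,
    int n0, int m0, int nw, int mw)
{
    for (int n = n0; n < n0 + nw; n++) {
        for (int m = m0; m < m0 + mw; m++) {
            float s = 0;
            for (int k = 0; k < K; k++) {
                s += A[n * K + k] * B[m * K + k];
            }
            C[n * M + m] += s;
        }
    }
}

int main()
{
    const int N = 3, M = 5, K = 4, blockM = 2;  // blockM = 192 in the real driver
    std::vector<float> A(N * K, 1.0f), B(M * K, 2.0f), C(N * M, 0.0f);
    for (int i = 0; i < M; i += blockM) {
        int MInner = (blockM < M - i) ? blockM : M - i;  // UNI_MIN above
        micro_tile(K, M, A.data(), B.data(), C.data(), 0, i, N, MInner);
    }
    printf("%g\n", C[0]);  // 8 = sum over K of 1 * 2
    return 0;
}
```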
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <string.h> +#include <arm_neon.h> + +#include "types.h" +#include "error.h" +#include "cpu/arm/fp16/mmm_common.h" +#include "cpu/arm/fp16/mmm.h" + +#define MMM_FMA_4x8_V5V14s3_V1xV0 \ + "fmla v5.8h, v1.8h, v0.h[0]\n" \ + "fmla v8.8h, v1.8h, v0.h[1]\n" \ + "fmla v11.8h, v1.8h, v0.h[2]\n" \ + "fmla v14.8h, v1.8h, v0.h[3]\n" +#define MMM_FMA_4x8_V17V26s3_V1xV0 \ + "fmla v17.8h, v1.8h, v0.h[4]\n" \ + "fmla v20.8h, v1.8h, v0.h[5]\n" \ + "fmla v23.8h, v1.8h, v0.h[6]\n" \ + "fmla v26.8h, v1.8h, v0.h[7]\n" +#define MMM_FMA_4x8_V6V15s3_V2xV0 \ + "fmla v6.8h, v2.8h, v0.h[0]\n" \ + "fmla v9.8h, v2.8h, v0.h[1]\n" \ + "fmla v12.8h, v2.8h, v0.h[2]\n" \ + "fmla v15.8h, v2.8h, v0.h[3]\n" +#define MMM_FMA_4x8_V18V27s3_V2xV0 \ + "fmla v18.8h, v2.8h, v0.h[4]\n" \ + "fmla v21.8h, v2.8h, v0.h[5]\n" \ + "fmla v24.8h, v2.8h, v0.h[6]\n" \ + "fmla v27.8h, v2.8h, v0.h[7]\n" +#define MMM_FMA_4x8_V7V16s3_V3xV0 \ + "fmla v7.8h, v3.8h, v0.h[0]\n" \ + "fmla v10.8h, v3.8h, v0.h[1]\n" \ + "fmla v13.8h, v3.8h, v0.h[2]\n" \ + "fmla v16.8h, v3.8h, v0.h[3]\n" +#define MMM_FMA_4x8_V19V28s3_V3xV0 \ + "fmla v19.8h, v3.8h, v0.h[4]\n" \ + "fmla v22.8h, v3.8h, v0.h[5]\n" \ + "fmla v25.8h, v3.8h, v0.h[6]\n" \ + "fmla v28.8h, v3.8h, v0.h[7]\n" +#define MMM_FMA_4x8_V5V14s3_V29xV4 \ + "fmla v5.8h, v29.8h, v4.h[0]\n" \ + "fmla v8.8h, v29.8h, v4.h[1]\n" \ + "fmla v11.8h, v29.8h, v4.h[2]\n" \ + "fmla v14.8h, v29.8h, v4.h[3]\n" +#define MMM_FMA_4x8_V17V26s3_V29xV4 \ + "fmla v17.8h, v29.8h, v4.h[4]\n" \ + "fmla v20.8h, v29.8h, v4.h[5]\n" \ + "fmla v23.8h, v29.8h, v4.h[6]\n" \ + "fmla v26.8h, v29.8h, v4.h[7]\n" +#define MMM_FMA_4x8_V6V15s3_V30xV4 \ + "fmla v6.8h, v30.8h, v4.h[0]\n" \ + "fmla v9.8h, v30.8h, v4.h[1]\n" \ + "fmla v12.8h, v30.8h, v4.h[2]\n" \ + "fmla v15.8h, v30.8h, v4.h[3]\n" +#define MMM_FMA_4x8_V18V27s3_V30xV4 \ + "fmla v18.8h, v30.8h, v4.h[4]\n" \ + "fmla v21.8h, v30.8h, v4.h[5]\n" \ + "fmla v24.8h, v30.8h, v4.h[6]\n" \ + "fmla v27.8h, v30.8h, v4.h[7]\n" +#define MMM_FMA_4x8_V7V16s3_V31xV4 \ + "fmla v7.8h, v31.8h, v4.h[0]\n" \ + "fmla v10.8h, v31.8h, v4.h[1]\n" \ + "fmla v13.8h, v31.8h, v4.h[2]\n" \ + "fmla v16.8h, v31.8h, v4.h[3]\n" +#define MMM_FMA_4x8_V19V28s3_V31xV4 \ + "fmla v19.8h, v31.8h, v4.h[4]\n" \ + "fmla v22.8h, v31.8h, v4.h[5]\n" \ + "fmla v25.8h, v31.8h, v4.h[6]\n" \ + "fmla v28.8h, v31.8h, v4.h[7]\n" + +inline void mmm_4x24_A76(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile( + // init: in0 -> v1, w -> v0 + "ld1 {v1.8h}, [%1], 
#16\n" + "ld1 {v0.4h}, [%2], #8\n" + "mov x26, %0\n" + "ld1 {v5.8h, v6.8h, v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.8h, v9.8h, v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.8h, v12.8h, v13.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.8h, v15.8h, v16.8h}, [x26]\n" + + "mov x20, %3\n" + + "0:\n" + // w- > v4, in0- > v2/v3/v1, out0=v5~v28 + "ld1 {v2.8h}, [%1], #16\n" MMM_FMA_4x8_V5V14s3_V1xV0 "ld1 {v3.8h}, [%1], #16\n" + "ld1 {v29.8h}, [%1], #16\n" MMM_FMA_4x8_V6V15s3_V2xV0 "ld1 {v4.4h}, [%2], " + "#8\n" MMM_FMA_4x8_V7V16s3_V3xV0 + + // w- > v0, in0- > v2/v3/v1, out0- > v5~v28 + "ld1 {v30.8h}, [%1], #16\n" MMM_FMA_4x8_V5V14s3_V29xV4 + "ld1 {v31.8h}, [%1], #16\n" MMM_FMA_4x8_V6V15s3_V30xV4 "ld1 {v1.8h}, [%1], #16\n" + "ld1 {v0.4h}, [%2], #8\n" MMM_FMA_4x8_V7V16s3_V31xV4 + + "subs x20, x20, #0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" + "ld1 {v2.8h}, [%1], #16\n" MMM_FMA_4x8_V5V14s3_V1xV0 + "ld1 {v3.8h}, [%1], #16\n" MMM_FMA_4x8_V6V15s3_V2xV0 MMM_FMA_4x8_V7V16s3_V3xV0 + + "1:\n" + "mov x26, %0\n" + "st1 {v5.8h, v6.8h, v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.8h, v9.8h, v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.8h, v12.8h, v13.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.8h, v15.8h, v16.8h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x20", "x26", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v29", "v30", "v31"); +} +inline void mmm_8x4_A76(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile("ld1 {v1.8h}, [%2], #16\n" + "ld1 {v0.4h}, [%1], #8\n" + + "mov x26, %0\n" + "ld1 {v5.h}[0], [x26], #2\n" + "ld1 {v8.h}[0], [x26], #2\n" + "ld1 {v11.h}[0], [x26], #2\n" + "ld1 {v14.h}[0], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[1], [x26], #2\n" + "ld1 {v8.h}[1], [x26], #2\n" + "ld1 {v11.h}[1], [x26], #2\n" + "ld1 {v14.h}[1], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[2], [x26], #2\n" + "ld1 {v8.h}[2], [x26], #2\n" + "ld1 {v11.h}[2], [x26], #2\n" + "ld1 {v14.h}[2], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[3], [x26], #2\n" + "ld1 {v8.h}[3], [x26], #2\n" + "ld1 {v11.h}[3], [x26], #2\n" + "ld1 {v14.h}[3], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[4], [x26], #2\n" + "ld1 {v8.h}[4], [x26], #2\n" + "ld1 {v11.h}[4], [x26], #2\n" + "ld1 {v14.h}[4], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[5], [x26], #2\n" + "ld1 {v8.h}[5], [x26], #2\n" + "ld1 {v11.h}[5], [x26], #2\n" + "ld1 {v14.h}[5], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[6], [x26], #2\n" + "ld1 {v8.h}[6], [x26], #2\n" + "ld1 {v11.h}[6], [x26], #2\n" + "ld1 {v14.h}[6], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[7], [x26], #2\n" + "ld1 {v8.h}[7], [x26], #2\n" + "ld1 {v11.h}[7], [x26], #2\n" + "ld1 {v14.h}[7], [x26], #2\n" + + "mov x20, %3\n" + + "0:\n" + "ld1 {v4.4h}, [%1], #8\n" + "ld1 {v29.8h}, [%2], #16\n" MMM_FMA_4x8_V5V14s3_V1xV0 "ld1 {v1.8h}, [%2], #16\n" + "ld1 {v0.4h}, [%1], #8\n" MMM_FMA_4x8_V5V14s3_V29xV4 + + "subs x20, x20, 0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" MMM_FMA_4x8_V5V14s3_V1xV0 + + "1:\n" + "mov x26, %0\n" + "st1 {v5.h}[0], [x26], #2\n" + "st1 {v8.h}[0], [x26], #2\n" + "st1 {v11.h}[0], [x26], #2\n" + "st1 {v14.h}[0], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[1], [x26], 
#2\n" + "st1 {v8.h}[1], [x26], #2\n" + "st1 {v11.h}[1], [x26], #2\n" + "st1 {v14.h}[1], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[2], [x26], #2\n" + "st1 {v8.h}[2], [x26], #2\n" + "st1 {v11.h}[2], [x26], #2\n" + "st1 {v14.h}[2], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[3], [x26], #2\n" + "st1 {v8.h}[3], [x26], #2\n" + "st1 {v11.h}[3], [x26], #2\n" + "st1 {v14.h}[3], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[4], [x26], #2\n" + "st1 {v8.h}[4], [x26], #2\n" + "st1 {v11.h}[4], [x26], #2\n" + "st1 {v14.h}[4], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[5], [x26], #2\n" + "st1 {v8.h}[5], [x26], #2\n" + "st1 {v11.h}[5], [x26], #2\n" + "st1 {v14.h}[5], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[6], [x26], #2\n" + "st1 {v8.h}[6], [x26], #2\n" + "st1 {v11.h}[6], [x26], #2\n" + "st1 {v14.h}[6], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[7], [x26], #2\n" + "st1 {v8.h}[7], [x26], #2\n" + "st1 {v11.h}[7], [x26], #2\n" + "st1 {v14.h}[7], [x26], #2\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x20", "x26", "v0", "v1", "v4", "v29", "v5", "v8", "v11", "v14"); +} + +inline void mmm_4x8_A76(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile("ld1 {v1.8h}, [%1], #16\n" + "ld1 {v0.4h}, [%2], #8\n" + "mov x26, %0\n" + "ld1 {v5.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.8h}, [x26]\n" + + "mov x20, %3\n" + + "0:\n" + "ld1 {v29.8h}, [%1], #16\n" + "ld1 {v4.4h}, [%2], #8\n" MMM_FMA_4x8_V5V14s3_V1xV0 "ld1 {v1.8h}, [%1], #16\n" + "ld1 {v0.4h}, [%2], #8\n" MMM_FMA_4x8_V5V14s3_V29xV4 + + "subs x20, x20, 0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" MMM_FMA_4x8_V5V14s3_V1xV0 + + "1:\n" + "mov x26, %0\n" + "st1 {v5.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.8h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x20", "x26", "v0", "v1", "v4", "v5", "v8", "v11", "v14", "v29"); +} + +inline void mmm_4x4_A76(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile("ld1 {v1.4h}, [%1], #8\n" + "ld1 {v0.4h}, [%2], #8\n" + "mov x26, %0\n" + "ld1 {v5.4h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.4h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.4h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.4h}, [x26]\n" + + "mov x20, %3\n" + + "0:\n" + "ld1 {v29.4h}, [%1], #8\n" + "ld1 {v4.4h}, [%2], #8\n" MMM_FMA_4x8_V5V14s3_V1xV0 "ld1 {v1.4h}, [%1], #8\n" + "ld1 {v0.4h}, [%2], #8\n" MMM_FMA_4x8_V5V14s3_V29xV4 + + "subs x20, x20, 0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" MMM_FMA_4x8_V5V14s3_V1xV0 + + "1:\n" + "mov x26, %0\n" + "st1 {v5.4h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.4h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.4h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.4h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x20", "x26", "v0", "v1", "v4", "v29", "v5", "v8", "v11", "v14"); +} + +inline void mmm_8x8_A76(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile("mov x26, %0\n" + "ld1 {v5.8h}, [x26]\n" + "add x26, 
x26, %4\n" + "ld1 {v8.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v17.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v20.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v23.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v26.8h}, [x26]\n" + + "mov x20, %3\n" + + "ld1 {v1.8h}, [%1], #16\n" + "ld1 {v0.8h}, [%2], #16\n" + + "0:\n" + "ld1 {v29.8h}, [%1], #16\n" + "ld1 {v4.8h}, [%2], #16\n" MMM_FMA_4x8_V5V14s3_V1xV0 MMM_FMA_4x8_V17V26s3_V1xV0 + + "ld1 {v1.8h}, [%1], #16\n" + "ld1 {v0.8h}, [%2], #16\n" MMM_FMA_4x8_V5V14s3_V29xV4 MMM_FMA_4x8_V17V26s3_V29xV4 + + "subs x20, x20, 0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" MMM_FMA_4x8_V5V14s3_V1xV0 MMM_FMA_4x8_V17V26s3_V1xV0 + + "1:\n" + "mov x26, %0\n" + "st1 {v5.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v17.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v20.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v23.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v26.8h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x20", "x26", "v1", "v0", "v29", "v4", "v5", "v8", "v11", "v14", + "v17", "v20", "v23", "v26"); +} + +inline void mmm_8x24_A76(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile( + // init in0- > v1, w- > v0 + "ld1 {v1.8h}, [%1], #16\n" + "ld1 {v0.8h}, [%2], #16\n" + "mov x26, %0\n" + "ld1 {v5.8h, v6.8h, v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.8h, v9.8h, v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.8h, v12.8h, v13.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.8h, v15.8h, v16.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v17.8h, v18.8h, v19.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v20.8h, v21.8h, v22.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v23.8h, v24.8h, v25.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v26.8h, v27.8h, v28.8h}, [x26]\n" + + "mov x20, %3\n" + + "0:\n" + // w- > v4, in0- > v2/v3/v1, out0=v5~v28 + "ld1 {v2.8h}, [%1], #16\n" + "ld1 {v3.8h}, [%1], #16\n" MMM_FMA_4x8_V5V14s3_V1xV0 MMM_FMA_4x8_V17V26s3_V1xV0 + + "ld1 {v4.8h}, [%2], #16\n" MMM_FMA_4x8_V6V15s3_V2xV0 MMM_FMA_4x8_V18V27s3_V2xV0 + + "ld1 {v29.8h}, [%1], #16\n" MMM_FMA_4x8_V7V16s3_V3xV0 MMM_FMA_4x8_V19V28s3_V3xV0 + + // w- > v0, in0- > v2/v3/v1, out0- > v5~v28 + "ld1 {v30.8h}, [%1], #16\n" + "ld1 {v0.8h}, [%2], #16\n" MMM_FMA_4x8_V5V14s3_V29xV4 MMM_FMA_4x8_V17V26s3_V29xV4 + + "ld1 {v31.8h}, [%1], #16\n" MMM_FMA_4x8_V6V15s3_V30xV4 MMM_FMA_4x8_V18V27s3_V30xV4 + + "ld1 {v1.8h}, [%1], #16\n" MMM_FMA_4x8_V7V16s3_V31xV4 "subs x20, x20, " + "#0x2\n" MMM_FMA_4x8_V19V28s3_V31xV4 + + "bne 0b\n" + + "cbz %5, 1f\n" + "ld1 {v2.8h}, [%1], #16\n" + "ld1 {v3.8h}, [%1], #16\n" MMM_FMA_4x8_V5V14s3_V1xV0 MMM_FMA_4x8_V17V26s3_V1xV0 + MMM_FMA_4x8_V6V15s3_V2xV0 MMM_FMA_4x8_V18V27s3_V2xV0 MMM_FMA_4x8_V7V16s3_V3xV0 + MMM_FMA_4x8_V19V28s3_V3xV0 + + "1:\n" + "mov x26, %0\n" + "st1 {v5.8h, v6.8h, v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.8h, v9.8h, v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.8h, v12.8h, v13.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.8h, v15.8h, v16.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v17.8h, v18.8h, v19.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v20.8h, v21.8h, v22.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v23.8h, v24.8h, v25.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 
{v26.8h, v27.8h, v28.8h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x0", "x20", "x26", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); +} + +void mmm_A76(int M, int N, int K, bool transposeA, F16 *matrix1, F16 *matrix2, F16 *tmp, F16 *result) +{ + int blockK = K; + int blockM = 192; + F16 *matrix1Trans = tmp; + F16 *resultCurrent = result; + int KInner, MInner, m, n; + for (int k = 0; k < K; k += blockK) { + KInner = UNI_MIN(blockK, K - k); + for (int i = 0; i < M; i += blockM) { + MInner = UNI_MIN(blockM, M - i); + for (n = 0; n <= N - 8; n += 8) { + if (i == 0) { + if (transposeA) { + matrix2_trans(8, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans(8, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } + } + for (m = 0; m <= (MInner - 24); m += 24) { + resultCurrent = result + n * M + m + i; + mmm_8x24_A76(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_8x8_A76(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_8x4_A76(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N8_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + } + + if ((N - n) >= 4) { + if (i == 0) { + if (transposeA) { + matrix2_trans(4, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } + } + + for (m = 0; m <= (MInner - 24); m += 24) { + resultCurrent = result + n * M + m + i; + mmm_4x24_A76(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_4x8_A76(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_4x4_A76(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N4_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + n += 4; + } + + if (N - n) { + if (i == 0) { + if (transposeA) { + matrix2_trans(N - n, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans( + N - n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } + } + + for (m = 0; m <= (MInner - 24); m += 24) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M24(M, N - n, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M8(M, N - n, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M4(M, N - n, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + m 
+= 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M(MInner - m, M, N - n, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + } + } + } +} diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mmm_common.h b/compute/blas_enhance/src/cpu/arm/fp16/mmm_common.h new file mode 100644 index 00000000..2538f643 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp16/mmm_common.h @@ -0,0 +1,143 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_MMM_COMMON +#define _H_MMM_COMMON +#include <string.h> +#include <arm_neon.h> + +#include "types.h" + +inline void matrix1_trans(U32 size, U32 blockK, U32 K, F16 *src, F16 *dst) +{ + F16 *src1 = src; + U32 offset; + for (U32 i = 0; i < blockK; i++) { + for (U32 j = 0; j < size; j++) { + src1 = src + j * K; + offset = 8 * blockK; + if (i % 32) { + asm volatile("prfm pldl2keep, [%0, %1]\n" + : "+r"(src1) + : "r"((I64)offset) + : "memory", "cc"); + } + *dst++ = *(src1 + i); + } + } +} + +inline void matrix2_trans(U32 size, U32 blockK, U32 M, F16 *src, F16 *dst) +{ + for (U32 i = 0; i < blockK; i++) { + asm volatile("prfm pldl2keep, [%0, #48]\n" : "+r"(src) : : "memory", "cc"); + memcpy(dst, src, size * sizeof(F16)); + dst += size; + src += M; + } +} + +inline void mmm_NTail_M24(U32 M, U32 N, U32 K, F16 *matrix1, F16 *matrix2, F16 *result) +{ + float16x8x3_t mat2, res; + for (U32 i = 0; i < N; i++) { + res = vld3q_f16(result + i * M); + for (U32 q = 0; q < K; q += 1) { + mat2 = vld3q_f16(matrix2 + q * 24); + res.val[0] = vfmaq_n_f16(res.val[0], mat2.val[0], matrix1[q * N + i]); + res.val[1] = vfmaq_n_f16(res.val[1], mat2.val[1], matrix1[q * N + i]); + res.val[2] = vfmaq_n_f16(res.val[2], mat2.val[2], matrix1[q * N + i]); + } + vst3q_f16(result + i * M, res); + } +} + +inline void mmm_NTail_M8(U32 M, U32 N, U32 K, F16 *matrix1, F16 *matrix2, F16 *result) +{ + float16x8_t mat2, res; + for (U32 i = 0; i < N; i++) { + res = vld1q_f16(result + i * M); + for (U32 q = 0; q < K; q += 1) { + mat2 = vld1q_f16(matrix2 + q * 8); + res = vfmaq_n_f16(res, mat2, matrix1[q * N + i]); + } + vst1q_f16(result + i * M, res); + } +} + +inline void mmm_NTail_M4(U32 M, U32 N, U32 K, F16 *matrix1, F16 *matrix2, F16 *result) +{ + float16x4_t mat2, res; + for (U32 i = 0; i < N; i++) { + res = vld1_f16(result + i * M); + for (U32 q = 0; q < K; q += 1) { + mat2 = vld1_f16(matrix2 + q * 4); + res = vfma_n_f16(res, mat2, matrix1[q * N + i]); + } + vst1_f16(result + i 
* M, res); + } +} + +inline void mmm_NTail_M(U32 MInner, U32 M, U32 N, U32 K, F16 *matrix1, F16 *matrix2, F16 *result) +{ + for (U32 i = 0; i < N; i++) { + for (U32 j = 0; j < MInner; j++) { + for (U32 k = 0; k < K; k++) { + result[i * M + j] += *(matrix1 + k * N + i) * *(matrix2 + k * MInner + j); + } + } + } +} + +inline void mmm_N8_MTail(U32 MInner, U32 M, U32 K, F16 *matrix1, F16 *matrix2, F16 *result) +{ + float16x8_t mat1 = {0}, res[4] = {0}; + F16 tmp[8] = {0}; + CHECK_REQUIREMENT(MInner < 4); + + for (U32 i = 0; i < K; i++) { + mat1 = vld1q_f16(matrix1 + i * 8); + for (U32 j = 0; j < MInner; j++) { + res[j] = vfmaq_n_f16(res[j], mat1, matrix2[j + i * MInner]); + } + } + for (U32 p = 0; p < MInner; p++) { + vst1q_f16(tmp, res[p]); + for (U32 q = 0; q < 8; q++) { + result[q * M + p] += tmp[q]; + } + res[p] = vdupq_n_f16(0); + } +} + +inline void mmm_N4_MTail(U32 MInner, U32 M, U32 K, F16 *matrix1, F16 *matrix2, F16 *result) +{ + float16x4_t mat1 = {0}, res[4] = {0}; + F16 tmp[4] = {0}; + CHECK_REQUIREMENT(MInner < 4); + + for (U32 i = 0; i < K; i++) { + mat1 = vld1_f16(matrix1 + i * 4); + for (U32 j = 0; j < MInner; j++) { + res[j] = vfma_n_f16(res[j], mat1, matrix2[j + i * MInner]); + } + } + for (U32 p = 0; p < MInner; p++) { + vst1_f16(tmp, res[p]); + for (U32 q = 0; q < 4; q++) { + result[q * M + p] += tmp[q]; + } + res[p] = vdup_n_f16(0); + } +} +#endif diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mvm.cpp b/compute/blas_enhance/src/cpu/arm/fp16/mvm.cpp new file mode 100644 index 00000000..1dc5c366 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp16/mvm.cpp @@ -0,0 +1,119 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
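+ +// fp16 matrix-vector entry point. When the weight has been pre-packed to DF_NKN64 +// (64-row panels produced by matrix_vector_multiply_transform_weight_fp16 below), +// mvm_fp16 takes the packed path (mvm_pack, 64 results per pass); otherwise it +// dispatches to the Cortex-A55/A76 kernels, treating DF_TRANSPOSE as column-major.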
+ +#include "error.h" +#include "cpu/arm/fp16/blas_fp16.h" +#include "cpu/arm/fp16/mvm.h" +#include "cpu/arm/fp16/mmm_common.h" +#include "cpu/arm/fp16/mvm_common.h" + +EE matrix_vector_multiply_transform_weight_fp16(TensorDesc desc, F16 *src, F16 *dst) +{ + DataType dt; + DataFormat df; + U32 N, K; + EE ret = SUCCESS; + int i = 0; + switch (desc.df) { + case DF_NORMAL: { + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); + for (; i < (int)N - 63; i += 64) { + matrix1_trans(64, K, K, src + i * K, dst + i * K); + } + if (i < (int)N) { + memcpy(dst + i * K, src + i * K, (N - i) * K * bytesOf(DT_F16)); + } + break; + } + case DF_TRANSPOSE: { + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); + for (; i < (int)N - 63; i += 64) { + matrix2_trans(64, K, N, src + i, dst + i * K); + } + if (i < (int)N) { + int base = i; + F16 *basePtr = dst + i * K; + for (int j = 0; j < (int)K; j++) { + for (; i < (int)N; i++) { + basePtr[(i - base) * K + j] = src[j * N + i]; + } + } + } + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +void mvm_kernel_fp16(U32 rounds, U32 K, F16 *matrix, F16 *vector, F16 *result) +{ + U32 N = rounds * 64; + float16x8_t mat[8]; + F16 v; + float16x8_t res[8]; + + for (U32 n = 0; n < N; n += 64) { + F16 *bufMov = matrix + n * K; + for (int i = 0; i < 8; i++) { + res[i] = vld1q_f16(result + n + i * 8); + } + for (U32 k = 0; k < K; k++) { + v = vector[k]; + for (int i = 0; i < 8; i++) { + mat[i] = vld1q_f16(bufMov + i * 8); + } + for (int i = 0; i < 8; i++) { + res[i] = vfmaq_n_f16(res[i], mat[i], v); + } + bufMov += 64; + } + for (int i = 0; i < 8; i++) { + vst1q_f16(result + n + i * 8, res[i]); + } + } +} + +void mvm_pack(U32 row, U32 col, F16 *matrix, F16 *vector, F16 *result) +{ + U32 rounds = row / 64; + U32 nTail = row % 64; + + mvm_kernel_fp16(rounds, col, matrix, vector, result); + if (0 != nTail) { + mvm_row_tail(nTail, col, matrix + (row - nTail) * col, vector, result + (row - nTail)); + } +} + +EE mvm_fp16(U32 row, U32 col, DataFormat df, F16 *matrix, F16 *vector, F16 *result, Arch arch) +{ + EE ret = SUCCESS; + if (DF_NKN64 == df) { + mvm_pack(row, col, matrix, vector, result); + return ret; + } + switch (arch) { + case ARM_A55: + mvm_A55(row, col, DF_TRANSPOSE == df, matrix, vector, result); + break; + case ARM_A76: + mvm_A76(row, col, DF_TRANSPOSE == df, matrix, vector, result); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mvm.h b/compute/blas_enhance/src/cpu/arm/fp16/mvm.h new file mode 100644 index 00000000..6764411c --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp16/mvm.h @@ -0,0 +1,22 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_MVM +#define _H_MVM + +#include "types.h" + +void mvm_A55(U32 row, U32 col, bool transpose, F16 *matrix, F16 *vector, F16 *result); + +void mvm_A76(U32 row, U32 col, bool transpose, F16 *matrix, F16 *vector, F16 *result); +#endif diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mvm_A55.cpp b/compute/blas_enhance/src/cpu/arm/fp16/mvm_A55.cpp new file mode 100644 index 00000000..4a23f27c --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp16/mvm_A55.cpp @@ -0,0 +1,138 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
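+ +// Row-major fp16 matrix-vector kernel tuned for the Cortex-A55: four matrix rows per +// call, 8-wide fp16 fmla accumulators reduced with pairwise faddp, and a lane-by-lane +// loop for the K % 8 tail. The 128-bit loads are split into ld1/ldr pairs recombined +// with ins, a common scheduling idiom for the A55's in-order pipeline (compare the +// straight ld1 loads in the A76 variant).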
+ +#include <arm_neon.h> + +#include "mvm_common.h" +#include "mvm.h" + +inline void mvm_row_kernel_A55(U32 N, U32 K, F16 *matrix, F16 *vector, F16 *result) +{ + U32 KTail = K % 8; + U32 KInner = K - KTail; + F16 *w0 = matrix; + F16 *w1 = matrix + K * N / 2; + F16 *w2 = matrix + K * 2 * N / 2; + F16 *w3 = matrix + K * 3 * N / 2; + + asm volatile("mov x19, %5\n" + "ld1 {v18.h}[0], [x19]\n" + "add x19, x19, %8\n" + "ld1 {v18.h}[1], [x19]\n" + "add x19, x19, %8\n" + "ld1 {v18.h}[2], [x19]\n" + "add x19, x19, %8\n" + "ld1 {v18.h}[3], [x19]\n" + + "movi v17.8h, #0x0\n" + "movi v16.8h, #0x0\n" + "movi v9.8h, #0x0\n" + "movi v10.8h, #0x0\n" + "movi v11.8h, #0x0\n" + "movi v12.8h, #0x0\n" + "mov x20, %6\n" + "cmp x20, #0x0\n" + "beq 3f\n" + "0:\n" + + "ld1 {v0.4h}, [%0], #8\n" + "ldr x15, [%0], #8\n" + "ins v0.d[1], x15\n" + + "ld1 {v1.4h}, [%1], #8\n" + "ld1 {v2.4h}, [%2], #8\n" + "ldr x21, [%1], #8\n" + "ldr x22, [%2], #8\n" + "ins v1.d[1], x21\n" + "ins v2.d[1], x22\n" + + "ld1 {v3.4h}, [%3], #8\n" + "fmla v9.8h, v1.8h, v0.8h\n" + "ld1 {v4.4h}, [%4], #8\n" + "fmla v10.8h, v2.8h, v0.8h\n" + "ldr x23, [%3], #8\n" + "ldr x24, [%4], #8\n" + "ins v3.d[1], x23\n" + "ins v4.d[1], x24\n" + "fmla v11.8h, v3.8h, v0.8h\n" + "fmla v12.8h, v4.8h, v0.8h\n" + + "subs x20, x20, 0x8\n" + "bne 0b\n" + + "faddp v13.8h, v9.8h, v10.8h\n" + "faddp v14.8h, v11.8h, v12.8h\n" + "faddp v15.8h, v13.8h, v14.8h\n" + "faddp v17.8h, v15.8h, v15.8h\n" + "3:\n" + "mov x16, %7\n" + "cmp x16, #0x0\n" + "beq 2f\n" + + "1:\n" + "ld1 {v8.h}[0], [%0], #2\n" + + "ld1 {v1.h}[0], [%1], #2\n" + "ld1 {v1.h}[1], [%2], #2\n" + "ld1 {v1.h}[2], [%3], #2\n" + "ld1 {v1.h}[3], [%4], #2\n" + "fmla v16.8h, v1.8h, v8.h[0]\n" + + "subs x16, x16, 0x1\n" + "bne 1b\n" + + "fadd v17.8h, v17.8h, v16.8h\n" + + "2:\n" + + "fadd v17.8h, v17.8h, v18.8h\n" + + "mov x19, %5\n" + "st1 {v17.h}[0], [x19]\n" + "add x19, x19, %8\n" + "st1 {v17.h}[1], [x19]\n" + "add x19, x19, %8\n" + "st1 {v17.h}[2], [x19]\n" + "add x19, x19, %8\n" + "st1 {v17.h}[3], [x19]\n" + + : "+r"(vector), "+r"(w0), "+r"(w1), "+r"(w2), "+r"(w3), "+r"(result) + : "r"((I64)KInner), "r"((I64)KTail), "r"((I64)N) + : "memory", "cc", "x19", "x20", "x21", "x22", "x23", "x24", "x15", "x16", "v0", + "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18"); +} + +inline void mvm_row_A55(U32 numRows, U32 numColumns, F16 *matrix, F16 *vector, F16 *result) +{ + // Actual layout is NK, and vector is K + U32 N = numRows; + U32 K = numColumns; + U32 NTail = N % 4; + U32 NInner = N / 4; + for (U32 i = 0; i < NInner; i++) { + mvm_row_kernel_A55(NInner * 2, K, matrix + i * K, vector, result + i); + } + if (NTail != 0) { + mvm_row_tail(NTail, K, matrix + (N - NTail) * K, vector, result + (N - NTail)); + } +} + +void mvm_A55(U32 row, U32 col, bool transpose, F16 *matrix, F16 *vector, F16 *result) +{ + if (transpose) { + mvm_col(row, col, matrix, vector, result); + } else { + mvm_row_A55(row, col, matrix, vector, result); + } +} diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mvm_A76.cpp b/compute/blas_enhance/src/cpu/arm/fp16/mvm_A76.cpp new file mode 100644 index 00000000..99450729 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp16/mvm_A76.cpp @@ -0,0 +1,124 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <arm_neon.h> + +#include "mvm_common.h" +#include "mvm.h" + +inline void mvm_row_kernel_A76(U32 N, U32 K, F16 *matrix, F16 *vector, F16 *result) +{ + U32 KTail = K % 8; + U32 KInner = K - KTail; + F16 *w0 = matrix; + F16 *w1 = matrix + K * N / 2; + F16 *w2 = matrix + K * 2 * N / 2; + F16 *w3 = matrix + K * 3 * N / 2; + asm volatile("mov x19, %5\n" + "ld1 {v18.h}[0], [x19]\n" + "add x19, x19, %8\n" + "ld1 {v18.h}[1], [x19]\n" + "add x19, x19, %8\n" + "ld1 {v18.h}[2], [x19]\n" + "add x19, x19, %8\n" + "ld1 {v18.h}[3], [x19]\n" + + "movi v17.8h, #0x0\n" + "movi v16.8h, #0x0\n" + "movi v9.8h, #0x0\n" + "movi v10.8h, #0x0\n" + "movi v11.8h, #0x0\n" + "movi v12.8h, #0x0\n" + "mov x20, %6\n" + "cmp x20, #0x0\n" + "beq 3f\n" + "0:\n" + + "ld1 {v0.8h}, [%0], #16\n" + "ld1 {v1.8h}, [%1], #16\n" + "ld1 {v2.8h}, [%2], #16\n" + "ld1 {v3.8h}, [%3], #16\n" + "ld1 {v4.8h}, [%4], #16\n" + + "fmla v9.8h, v1.8h, v0.8h\n" + "fmla v10.8h, v2.8h, v0.8h\n" + "fmla v11.8h, v3.8h, v0.8h\n" + "fmla v12.8h, v4.8h, v0.8h\n" + + "subs x20, x20, 0x8\n" + "bne 0b\n" + + "faddp v13.8h, v9.8h, v10.8h\n" + "faddp v14.8h, v11.8h, v12.8h\n" + "faddp v15.8h, v13.8h, v14.8h\n" + "faddp v17.8h, v15.8h, v15.8h\n" + "3:\n" + "mov x16, %7\n" + "cmp x16, #0x0\n" + "beq 2f\n" + + "1:\n" + "ld1 {v8.h}[0], [%0], #2\n" + + "ld1 {v1.h}[0], [%1], #2\n" + "ld1 {v1.h}[1], [%2], #2\n" + "ld1 {v1.h}[2], [%3], #2\n" + "ld1 {v1.h}[3], [%4], #2\n" + "fmla v16.8h, v1.8h, v8.h[0]\n" + + "subs x16, x16, 0x1\n" + "bne 1b\n" + + "fadd v17.8h, v17.8h, v16.8h\n" + + "2:\n" + + "fadd v17.8h, v17.8h, v18.8h\n" + "mov x19, %5\n" + "st1 {v17.h}[0], [x19]\n" + "add x19, x19, %8\n" + "st1 {v17.h}[1], [x19]\n" + "add x19, x19, %8\n" + "st1 {v17.h}[2], [x19]\n" + "add x19, x19, %8\n" + "st1 {v17.h}[3], [x19]\n" + : "+r"(vector), "+r"(w0), "+r"(w1), "+r"(w2), "+r"(w3), "+r"(result) + : "r"((I64)KInner), "r"((I64)KTail), "r"((I64)N) + : "memory", "cc", "x19", "x20", "x21", "x22", "x23", "x24", "x15", "x16", "v0", + "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18"); +} + +inline void mvm_row_A76(U32 numRows, U32 numColumns, F16 *matrix, F16 *vector, F16 *result) +{ + // Actual layout is NK, and vector is K + U32 N = numRows; + U32 K = numColumns; + U32 NTail = N % 4; + U32 NInner = N / 4; + for (U32 i = 0; i < NInner; i++) { + mvm_row_kernel_A76(NInner * 2, K, matrix + i * K, vector, result + i); + } + if (NTail != 0) { + mvm_row_tail(NTail, K, matrix + (N - NTail) * K, 
vector, result + (N - NTail)); + } +} + +void mvm_A76(U32 row, U32 col, bool transpose, F16 *matrix, F16 *vector, F16 *result) +{ + if (transpose) { + mvm_col(row, col, matrix, vector, result); + } else { + mvm_row_A76(row, col, matrix, vector, result); + } +} diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mvm_common.h b/compute/blas_enhance/src/cpu/arm/fp16/mvm_common.h new file mode 100644 index 00000000..d8f8ed02 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp16/mvm_common.h @@ -0,0 +1,253 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_MVM_COMMON +#define _H_MVM_COMMON + +#include <arm_neon.h> +#include "types.h" +#include "arm_neon_expand.h" + +inline void mvm_row_tail(U32 N, U32 K, F16 *matrix, F16 *vector, F16 *result) +{ + float16x8_t vec, res, mat; + U32 KTail = K % 8; + U32 KInner = K - KTail; + + for (U32 i = 0; i < N; i += 1) { + res = vdupq_n_f16(0); + + for (U32 j = 0; j < KInner; j += 8) { + vec = vld1q_f16(&vector[j]); + mat = vld1q_f16(&matrix[j + K * i]); + res = vfmaq_f16(res, vec, mat); + } + result[i] += vaddvq_f16(res); + + if (KTail != 0) { + for (U32 p = 0; p < KTail; p += 1) { + result[i] += vector[p + KInner] * matrix[KInner + p + K * i]; + } + } + } +} + +inline void mvm_col_tail(U32 N, U32 K, F16 *matrix, F16 *vector, F16 *result) +{ + float16x8_t tmp, res, mat; + U32 NTail = N % 8; + U32 NInner = N - NTail; + + for (U32 i = 0; i < K; i += 1) { + for (U32 j = 0; j < NInner; j += 8) { + tmp = vld1q_f16(result + j); + mat = vld1q_f16(&matrix[j + N * i]); + res = vfmaq_n_f16(tmp, mat, vector[i]); + vst1q_f16(result + j, res); + } + if (NTail != 0) { + for (U32 p = 0; p < NTail; p += 1) { + result[NInner + p] += vector[i] * matrix[NInner + N * i + p]; + } + } + } +} + +inline void mvm_col_kernel(U32 N, U32 K, F16 *matrix, F16 *vector, F16 *result) +{ + float16x8_t mat[4] = {0}; + + F16 *w0 = matrix; + F16 *w1 = matrix + K * N; + F16 *w2 = matrix + 2 * K * N; + F16 *w3 = matrix + 3 * K * N; + + U32 N_tail = N % 8; + U32 N_inner = N - N_tail; + + for (U32 i = 0; i < K; i += 1) { + for (U32 j = 0; j < N_inner; j += 8) { + float16x8_t res[4] = {0}; + + res[3] = vld1q_f16(result + j); + mat[0] = vld1q_f16(w0); + mat[1] = vld1q_f16(w1); + mat[2] = vld1q_f16(w2); + mat[3] = vld1q_f16(w3); + + res[0] = vfmaq_n_f16(res[3], mat[0], vector[i]); + res[1] = vfmaq_n_f16(res[0], mat[1], vector[K + i]); + res[2] = vfmaq_n_f16(res[1], mat[2], vector[2 * K + i]); + res[3] = vfmaq_n_f16(res[2], mat[3], 
vector[3 * K + i]); + + w0 += 8; + w1 += 8; + w2 += 8; + w3 += 8; + vst1q_f16(result + j, res[3]); + } + if (N_tail != 0) { + for (U32 p = 0; p < N_tail; p += 1) { + result[N_inner + p] += vector[i] * *w0++; + result[N_inner + p] += vector[i + K] * *w1++; + result[N_inner + p] += vector[i + 2 * K] * *w2++; + result[N_inner + p] += vector[i + 3 * K] * *w3++; + } + } + } +} + +inline void mvm_col_kernel_4x8(U32 N, U32 K, F16 *matrix, F16 *vector, F16 *result) +{ + F16 *result_end8 = result + N / 8 * 8; + F16 *result_end = result + N; + asm volatile("mov x20, %0\n" + "add x21, x20, %5\n" + "add x22, x21, %5\n" + "add x23, x22, %5\n" + "mov x24, %1\n" + "add x25, x24, %6\n" + "add x26, x25, %6\n" + "add x27, x26, %6\n" + "mov x29, x21\n" + + "00:\n" + "cmp x20, x29\n" + "bge 01f\n" + "ldr h0, [x20], 2\n" + "dup v0.8h, v0.h[0]\n" + "ldr h1, [x21], 2\n" + "dup v1.8h, v1.h[0]\n" + "ldr h2, [x22], 2\n" + "dup v2.8h, v2.h[0]\n" + "ldr h3, [x23], 2\n" + "dup v3.8h, v3.h[0]\n" + + "mov x28, %2\n" + + "10:\n" + "cmp x28, %3\n" + "bge 11f\n" + "ldr q4, [x28]\n" + "ldr q8, [x24], 16\n" + "ldr q9, [x25], 16\n" + "ldr q10, [x26], 16\n" + "fmla v4.8h, v8.8h, v0.8h\n" + "ldr q11, [x27], 16\n" + "fmla v4.8h, v9.8h, v1.8h\n" + "fmla v4.8h, v10.8h, v2.8h\n" + "fmla v4.8h, v11.8h, v3.8h\n" + "str q4, [x28], 16\n" + "b 10b\n" + + "11:\n" + "cmp x28, %4\n" + "bge 12f\n" + "ldr h4, [x28]\n" + "ldr h8, [x24], 2\n" + "ldr h9, [x25], 2\n" + "ldr h10, [x26], 2\n" + "fmla h4, h8, v0.h[0]\n" + "ldr h11, [x27], 2\n" + "fmla h4, h9, v1.h[0]\n" + "fmla h4, h10, v2.h[0]\n" + "fmla h4, h11, v3.h[0]\n" + "str h4, [x28], 2\n" + "b 11b\n" + + "12:\n" + "b 00b\n" + "01:\n" + : "+r"(vector), "+r"(matrix), "+r"(result), "+r"(result_end8), "+r"(result_end) + : "r"((I64)K * 2), "r"((I64)K * N * 2) + : "memory", "cc", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", + "x29", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11"); +} + +inline void mvm_row_kernel(U32 N, U32 K, F16 *matrix, F16 *vector, F16 *result) +{ + float16x8_t res[4] = {0}, mat[4] = {0}, vec; + float16x8_t tmp[6] = {0}; + + F16 *w0 = matrix; + F16 *w1 = matrix + K * N; + F16 *w2 = matrix + 2 * K * N; + F16 *w3 = matrix + 3 * K * N; + + U32 K_tail = K % 8; + U32 K_inner = K - K_tail; + + for (U32 i = 0; i < N; i += 1) { + for (U32 j = 0; j < K_inner; j += 8) { + vec = vld1q_f16(&vector[j]); + + mat[0] = vld1q_f16(w0); + mat[1] = vld1q_f16(w1); + mat[2] = vld1q_f16(w2); + mat[3] = vld1q_f16(w3); + for (U32 k = 0; k < 4; k++) { + res[k] = vfmaq_f16(res[k], vec, mat[k]); + } + w0 += 8; + w1 += 8; + w2 += 8; + w3 += 8; + } + + for (U32 m = 0; m < 2; m++) { + tmp[m] = vpaddq_f16(res[m * 2], res[m * 2 + 1]); + } + tmp[4] = vpaddq_f16(tmp[0], tmp[1]); + tmp[5] = vpaddq_f16(tmp[4], tmp[3]); + F16 addbias; + for (U32 n = 0; n < 4; n++) { + vst1q_lane_f16_builtin(&addbias, tmp[5], n); + result[i + N * n] += addbias; + res[n] = vdupq_n_f16(0); + } + + if (K_tail != 0) { + for (U32 p = 0; p < K_tail; p += 1) { + *(result + i) += vector[p + K_inner] * *w0++; + *(result + N + i) += vector[p + K_inner] * *w1++; + *(result + 2 * N + i) += vector[p + K_inner] * *w2++; + *(result + 3 * N + i) += vector[p + K_inner] * *w3++; + } + } + } +} + +inline void mvm_col(U32 numRows, U32 numColumns, F16 *matrix, F16 *vector, F16 *result) +{ + // Actual layout is KN, and vector is K + U32 N = numRows; + U32 K = numColumns; + U32 KInner = K / 4; + U32 KTail = K % 4; + mvm_col_kernel_4x8(N, KInner, matrix, vector, result); + if (KTail != 0) { + mvm_col_tail(N, KTail, matrix + (K 
- KTail) * N, vector + (K - KTail), result); + } +} + +// N is number of rows, K for columns +inline void mvm_row(U32 N, U32 K, F16 *matrix, F16 *vector, F16 *result) +{ + U32 NInner = (N / 4); + U32 NTail = N % 4; + mvm_row_kernel(NInner, K, matrix, vector, result); + if (NTail != 0) { + mvm_row_tail(NTail, K, matrix + (N - NTail) * K, vector, result + N - NTail); + } +} +#endif diff --git a/compute/blas_enhance/src/cpu/arm/fp32/axpby.cpp b/compute/blas_enhance/src/cpu/arm/fp32/axpby.cpp new file mode 100644 index 00000000..a1761246 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp32/axpby.cpp @@ -0,0 +1,34 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "error.h" +#include "cpu/arm/fp32/blas_fp32.h" + +EE axpby_fp32(U32 len, F32 a, const F32 *x, F32 b, F32 *y) +{ + EE ret = SUCCESS; + float32x4_t alpha = vdupq_n_f32(a); + float32x4_t beta = vdupq_n_f32(b); + I32 i = 0; + for (; i < ((I32)len) - 3; i += 4) { + float32x4_t out = vld1q_f32(y + i); + float32x4_t in = vld1q_f32(x + i); + out = vmulq_f32(out, beta); + out = vmlaq_f32(out, alpha, in); + vst1q_f32(y + i, out); + } + for (; i < (I32)len; i++) { + y[i] = a * x[i] + b * y[i]; + } + return ret; +} diff --git a/compute/blas_enhance/src/cpu/arm/fp32/blas_fp32.h b/compute/blas_enhance/src/cpu/arm/fp32/blas_fp32.h new file mode 100644 index 00000000..0834ca99 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp32/blas_fp32.h @@ -0,0 +1,97 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_BLAS_FP32 +#define _H_BLAS_FP32 + +#include "sys.h" +#include "types.h" +#include "error.h" +#include "tensor_desc.h" +#include "arm_neon_expand.h" + +EE matrix_vector_multiply_transform_weight_fp32(TensorDesc desc, F32 *src, F32 *dst); + +void mvm_col_fp32(U32 row, U32 col, F32 *matrix, F32 *vector, F32 *result); + +void mvm_row_fp32(U32 row, U32 col, F32 *matrix, F32 *vector, F32 *result); + +EE mvm_fp32(U32 row, U32 col, DataFormat df, F32 *matrix, F32 *vector, F32 *result); + +void matrix_matrix_multiply_tmp_bytes_fp32( + U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes); + +EE matrix_matrix_multiply_transform_rhsN_fp32(TensorDesc desc, F32 *src, F32 *dst); + +EE matrix_matrix_multiply_transform_rhsT_fp32(TensorDesc desc, F32 *src, F32 *dst); + +#ifdef __aarch64__ +EE mmm_fp32_V8( + int M, int N, int K, bool transposeA, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *result); +#else +EE mmm_fp32_V7( + int M, int N, int K, bool transposeA, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *result); +#endif + +EE axpby_fp32(U32 len, F32 a, const F32 *x, F32 b, F32 *y); + +inline void matrix1_trans(U32 size, U32 blockK, U32 K, F32 *src, F32 *dst) +{ + F32 *src1 = src; + for (U32 i = 0; i < blockK; i++) { + for (U32 j = 0; j < size; j++) { + src1 = src + j * K; + if (i % 16 == 0) { + __builtin_prefetch(src1 + 16); + } + *dst++ = *(src1 + i); + } + } +} + +inline void matrix2_trans(U32 size, U32 blockK, U32 M, F32 *src, F32 *dst) +{ + for (U32 i = 0; i < blockK; i++) { + if (i % 16 == 0) { + __builtin_prefetch(src + 16); + } + memcpy(dst, src, size * sizeof(F32)); + dst += size; + src += M; + } +} + +inline void mvm_row_tail(U32 N, U32 K, F32 *matrix, F32 *vector, F32 *result) +{ + float32x4_t vec, res, mat; + U32 KTail = K % 4; + U32 KInner = K - KTail; + + for (U32 i = 0; i < N; i++) { + res = vdupq_n_f32(0); + + for (U32 j = 0; j < KInner; j += 4) { + vec = vld1q_f32(&vector[j]); + mat = vld1q_f32(&matrix[j + K * i]); + res = vfmaq_f32(res, vec, mat); + } + result[i] += vaddvq_f32(res); + + if (KTail != 0) { + for (U32 p = 0; p < KTail; p++) { + result[i] += vector[p + KInner] * matrix[KInner + p + K * i]; + } + } + } +} +#endif diff --git a/compute/blas_enhance/src/cpu/arm/fp32/mmm_V7.cpp b/compute/blas_enhance/src/cpu/arm/fp32/mmm_V7.cpp new file mode 100644 index 00000000..30845d43 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp32/mmm_V7.cpp @@ -0,0 +1,497 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef __aarch64__ +#include <arm_neon.h> +#include "cpu/arm/fp32/blas_fp32.h" +#include "error.h" +#include "types.h" + +void matrix_matrix_multiply_tmp_bytes_fp32( + U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes) +{ + *bytes = row1 * col1 + row2 * col2; + *bytes *= bytesOf(dt); + *bytes += 32; +} + +EE matrix_matrix_multiply_transform_rhsN_fp32(TensorDesc desc, F32 *src, F32 *dst) +{ + DataType dt; + DataFormat df; + U32 N, K; + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); + int i = 0; + for (; i < (int)N - 7; i += 8) { + matrix2_trans(8, K, N, src + i, dst + i * K); + } + for (; i < (int)N - 3; i += 4) { + matrix2_trans(4, K, N, src + i, dst + i * K); + } + if ((int)N > i) { + matrix2_trans(N - i, K, N, src + i, dst + i * K); + } + return SUCCESS; +} + +EE matrix_matrix_multiply_transform_rhsT_fp32(TensorDesc desc, F32 *src, F32 *dst) +{ + DataType dt; + DataFormat df; + U32 N, K; + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); + int i = 0; + for (; i < (int)N - 7; i += 8) { + matrix1_trans(8, K, K, src + i * K, dst + i * K); + } + for (; i < (int)N - 3; i += 4) { + matrix1_trans(4, K, K, src + i * K, dst + i * K); + } + if ((int)N > i) { + matrix1_trans(N - i, K, K, src + i * K, dst + i * K); + } + return SUCCESS; +} + +void mmm_NTail_M8(U32 M, U32 N, U32 K, F32 *matrix1, F32 *matrix2, F32 *result) +{ + float32x4x2_t mat2, res; + for (U32 i = 0; i < N; i++) { + res = vld2q_f32(result + i * M); + for (U32 q = 0; q < K; q++) { + mat2 = vld2q_f32(matrix2 + q * 8); + res.val[0] = vfmaq_n_f32(res.val[0], mat2.val[0], matrix1[q * N + i]); + res.val[1] = vfmaq_n_f32(res.val[1], mat2.val[1], matrix1[q * N + i]); + } + vst2q_f32(result + i * M, res); + } +} + +void mmm_NTail_M4(U32 M, U32 N, U32 K, F32 *matrix1, F32 *matrix2, F32 *result) +{ + float32x4_t mat2, res; + for (U32 i = 0; i < N; i++) { + res = vld1q_f32(result + i * M); + for (U32 q = 0; q < K; q++) { + mat2 = vld1q_f32(matrix2 + q * 4); + res = vfmaq_n_f32(res, mat2, matrix1[q * N + i]); + } + vst1q_f32(result + i * M, res); + } +} + +void mmm_NTail_M(U32 MInner, U32 M, U32 N, U32 K, F32 *matrix1, F32 *matrix2, F32 *result) +{ + for (U32 i = 0; i < N; i++) { + for (U32 j = 0; j < MInner; j++) { + for (U32 k = 0; k < K; k++) { + result[i * M + j] += *(matrix1 + k * N + i) * *(matrix2 + k * MInner + j); + } + } + } +} + +void mmm_N6_MTail(U32 MInner, U32 M, U32 K, F32 *matrix1, F32 *matrix2, F32 *result) +{ + float32x2_t mat1[3] = {0}, res[4][3] = {{0}}; + F32 tmp[6] = {0}; + CHECK_REQUIREMENT(MInner < 4); + + for (U32 i = 0; i < K; i++) { + mat1[0] = vld1_f32(matrix1 + i * 6); + mat1[1] = vld1_f32(matrix1 + i * 6 + 2); + mat1[2] = vld1_f32(matrix1 + i * 6 + 4); + for (U32 j = 0; j < MInner; j++) { + res[j][0] = vmla_n_f32(res[j][0], mat1[0], matrix2[j + i * MInner]); + res[j][1] = vmla_n_f32(res[j][1], mat1[1], matrix2[j + i * MInner]); + res[j][2] = vmla_n_f32(res[j][2], mat1[2], matrix2[j + i * MInner]); + } + } + for (U32 p = 0; p < MInner; p++) { + vst1_f32(tmp, res[p][0]); + vst1_f32(tmp + 2, res[p][1]); + vst1_f32(tmp + 4, res[p][2]); + 
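// accumulate the six spilled lanes into the M-strided result column + 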
for (U32 q = 0; q < 6; q++) { + result[q * M + p] += tmp[q]; + } + } +} + +void mmm_N4_MTail(U32 MInner, U32 M, U32 K, F32 *matrix1, F32 *matrix2, F32 *result) +{ + float32x4_t mat1 = {0}, res[4] = {0}; + F32 tmp[4] = {0}; + CHECK_REQUIREMENT(MInner < 4); + + for (U32 i = 0; i < K; i++) { + mat1 = vld1q_f32(matrix1 + i * 4); + for (U32 j = 0; j < MInner; j++) { + res[j] = vfmaq_n_f32(res[j], mat1, matrix2[j + i * MInner]); + } + } + for (U32 p = 0; p < MInner; p++) { + vst1q_f32(tmp, res[p]); + for (U32 q = 0; q < 4; q++) { + result[q * M + p] += tmp[q]; + } + } +} + +void mmm_4x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) +{ + asm volatile("vld1.f32 {d0-d1}, [%[in]]!\n" + + "vld1.f32 {d4-d5}, [%[w]]!\n" + + // K- > r2 + "mov r2, %[K]\n" + + // give out address to r1 + "mov r1, %[out]\n" + + // load in bias + "vld1.f32 {d8-d9}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d12-d13}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d16-d17}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d20-d21}, [r1]\n" + + // Computation loop + "0:\n" + + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q6, q2, d0[1]\n" + "vmla.f32 q8, q2, d1[0]\n" + "vmla.f32 q10, q2, d1[1]\n" + + "vld1.f32 {d4-d5}, [%[w]]!\n" + "subs r2, r2, #1\n" + + "vld1.f32 {d0-d1}, [%[in]]!\n" + "bne 0b\n" + + // give out address to r1 + "mov r1, %[out]\n" + + "vst1.f32 {d8-d9}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d12-d13}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d16-d17}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d20-d21}, [r1]\n" + + : [in] "+r"(in), [w] "+r"(w), [out] "+r"(out) + : [K] "r"(K), [offset] "r"(offset) + : "memory", "cc", "q0", "q2", "q4", "q6", "q8", "q10", "r1", "r2"); +} + +void mmm_6x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) +{ + asm volatile( + "vld1.f32 {d0-d2}, [%[in]]!\n" + + "vld1.f32 {d4-d5}, [%[w]]!\n" + + // K- > r2 + "mov r2, %[K]\n" + + // give out address to r1 + "mov r1, %[out]\n" + + // load in bias + "vld1.f32 {d8-d9}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d12-d13}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d16-d17}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d20-d21}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d24-d25}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d28-d29}, [r1]\n" + + // Computation loop + "0:\n" + + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q6, q2, d0[1]\n" + "vmla.f32 q8, q2, d1[0]\n" + "vmla.f32 q10, q2, d1[1]\n" + "vmla.f32 q12, q2, d2[0]\n" + "vmla.f32 q14, q2, d2[1]\n" + + "vld1.f32 {d4-d5}, [%[w]]!\n" + "subs r2, r2, #1\n" + + "vld1.f32 {d0-d2}, [%[in]]!\n" + "bne 0b\n" + + // give out address to r1 + "mov r1, %[out]\n" + + "vst1.f32 {d8-d9}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d12-d13}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d16-d17}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d20-d21}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d24-d25}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d28-d29}, [r1]\n" + + : [in] "+r"(in), [w] "+r"(w), [out] "+r"(out) + : [K] "r"(K), [offset] "r"(offset) + : "memory", "cc", "q0", "q1", "q2", "q4", "q6", "q8", "q10", "q12", "q14", "r1", "r2"); +} + +void mmm_4x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) +{ + asm volatile("vld1.f32 {d0-d1}, [%[in]]!\n" + + "vld1.f32 {d4-d7}, [%[w]]!\n" + + // K- > r2 + "mov r2, %[K]\n" + + // give out address to r1 + "mov r1, %[out]\n" + + // load in bias + "vld1.f32 {d8-d11}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d12-d15}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d16-d19}, [r1]\n" + 
"add r1, r1, %[offset]\n" + "vld1.f32 {d20-d23}, [r1]\n" + + // Computation loop + "0:\n" + + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q6, q2, d0[1]\n" + "vmla.f32 q8, q2, d1[0]\n" + "vmla.f32 q10, q2, d1[1]\n" + + "vld1.f32 {d4-d5}, [%[w]]!\n" + + "vmla.f32 q5, q3, d0[0]\n" + "vmla.f32 q7, q3, d0[1]\n" + "vmla.f32 q9, q3, d1[0]\n" + "vmla.f32 q11, q3, d1[1]\n" + + "vld1.f32 {d6-d7}, [%[w]]!\n" + "subs r2, r2, #1\n" + + "vld1.f32 {d0-d1}, [%[in]]!\n" + "bne 0b\n" + + // give out address to r1 + "mov r1, %[out]\n" + + "vst1.f32 {d8-d11}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d12-d15}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d16-d19}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d20-d23}, [r1]\n" + + : [in] "+r"(in), [w] "+r"(w), [out] "+r"(out) + : [K] "r"(K), [offset] "r"(offset) + : "memory", "cc", "q0", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", + "q11", "r1", "r2"); +} + +void mmm_6x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) +{ + asm volatile("vld1.f32 {d0-d2}, [%[in]]!\n" + + "vld1.f32 {d4-d7}, [%[w]]!\n" + + // K- > r2 + "mov r2, %[K]\n" + + // give out address to r1 + "mov r1, %[out]\n" + + // load in bias + "vld1.f32 {d8-d11}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d12-d15}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d16-d19}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d20-d23}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d24-d27}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d28-d31}, [r1]\n" + + // Computation loop + "0:\n" + + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q6, q2, d0[1]\n" + "vmla.f32 q8, q2, d1[0]\n" + "vmla.f32 q10, q2, d1[1]\n" + "vmla.f32 q12, q2, d2[0]\n" + "vmla.f32 q14, q2, d2[1]\n" + + "vld1.f32 {d4-d5}, [%[w]]!\n" + + "vmla.f32 q5, q3, d0[0]\n" + "vmla.f32 q7, q3, d0[1]\n" + "vmla.f32 q9, q3, d1[0]\n" + "vmla.f32 q11, q3, d1[1]\n" + "vmla.f32 q13, q3, d2[0]\n" + "vmla.f32 q15, q3, d2[1]\n" + + "vld1.f32 {d6-d7}, [%[w]]!\n" + "subs r2, r2, #1\n" + + "vld1.f32 {d0-d2}, [%[in]]!\n" + "bne 0b\n" + + // give out address to r1 + "mov r1, %[out]\n" + + "vst1.f32 {d8-d11}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d12-d15}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d16-d19}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d20-d23}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d24-d27}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d28-d31}, [r1]\n" + + : [in] "+r"(in), [w] "+r"(w), [out] "+r"(out) + : [K] "r"(K), [offset] "r"(offset) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15", "r1", "r2"); +} + +EE mmm_fp32_V7( + int M, int N, int K, bool transposeA, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *result) +{ + int blockK = K; + int blockM = 96; + F32 *matrix1Trans = tmp; + F32 *resultCurrent = result; + int KInner, MInner, m, n; + for (int k = 0; k < K; k += blockK) { + KInner = UNI_MIN(blockK, K - k); + for (int i = 0; i < M; i += blockM) { + MInner = UNI_MIN(blockM, M - i); + for (n = 0; n <= N - 6; n += 6) { + if (i == 0) { + if (transposeA) { + matrix2_trans(6, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans(6, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } + } + for (m = 0; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_6x8(M * 4, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, + resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_6x4(M * 4, KInner, matrix1Trans + n 
* KInner, matrix2 + (i + m) * KInner, + resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N6_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + } + + if ((N - n) >= 4) { + if (i == 0) { + if (transposeA) { + matrix2_trans(4, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } + } + + for (m = 0; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_4x8(M * 4, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, + resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_4x4(M * 4, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, + resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N4_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + n += 4; + } + + if (N - n) { + if (i == 0) { + if (transposeA) { + matrix2_trans(N - n, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans( + N - n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } + } + + for (m = 0; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M8(M, N - n, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M4(M, N - n, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M(MInner - m, M, N - n, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + } + } + } + return SUCCESS; +} +#endif diff --git a/blas-enhance/src/cpu/arm/fp32/mmm_V8.cpp b/compute/blas_enhance/src/cpu/arm/fp32/mmm_V8.cpp similarity index 87% rename from blas-enhance/src/cpu/arm/fp32/mmm_V8.cpp rename to compute/blas_enhance/src/cpu/arm/fp32/mmm_V8.cpp index e305cf78..d9d7c3a4 100644 --- a/blas-enhance/src/cpu/arm/fp32/mmm_V8.cpp +++ b/compute/blas_enhance/src/cpu/arm/fp32/mmm_V8.cpp @@ -15,7 +15,7 @@ #include #include "cpu/arm/fp32/blas_fp32.h" #include "error.h" -#include "type.h" +#include "types.h" void matrix_matrix_multiply_tmp_bytes_fp32( U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes) @@ -25,42 +25,12 @@ void matrix_matrix_multiply_tmp_bytes_fp32( *bytes += 32; } -void matrix1_trans(U32 size, U32 blockK, U32 K, F32 *src, F32 *dst) -{ - F32 *src1 = src; - U32 offset; - for (U32 i = 0; i < blockK; i++) { - for (U32 j = 0; j < size; j++) { - src1 = src + j * K; - offset = 64; - if (i % 16 == 0) { - asm volatile("prfm pldl2keep, [%0, %1]\n" - : "+r"(src1) - : "r"((I64)offset) - : "memory", "cc"); - } - *dst++ = *(src1 + i); - } - } -} - -void matrix2_trans(U32 size, U32 blockK, U32 M, F32 *src, F32 *dst) -{ - for (U32 i = 0; i < blockK; i++) { - if (i % 16 == 0) { - asm volatile("prfm pldl2keep, [%0, #64]\n" : "+r"(src) : : "memory", "cc"); - } - memcpy(dst, src, size * sizeof(F32)); - dst += size; - src += M; - } -} - EE matrix_matrix_multiply_transform_rhsN_fp32(TensorDesc desc, F32 *src, F32 *dst) { DataType dt; + DataFormat df; U32 N, K; - CHECK_STATUS(tensor2dGet(desc, &dt, &K, &N)); + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); int i = 0; for (; i < (int)N - 11; i += 12) { matrix2_trans(12, K, N, src + i, dst + 
i * K); @@ -80,8 +50,9 @@ EE matrix_matrix_multiply_transform_rhsN_fp32(TensorDesc desc, F32 *src, F32 *ds EE matrix_matrix_multiply_transform_rhsT_fp32(TensorDesc desc, F32 *src, F32 *dst) { DataType dt; + DataFormat df; U32 N, K; - CHECK_STATUS(tensor2dGet(desc, &dt, &N, &K)); + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); int i = 0; for (; i < (int)N - 11; i += 12) { matrix1_trans(12, K, K, src + i * K, dst + i * K); @@ -197,24 +168,24 @@ void mmm_N4_MTail(U32 MInner, U32 M, U32 K, F32 *matrix1, F32 *matrix2, F32 *res void mmm_4x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) { asm volatile( - //init in- > v1, w- > v0 + // init in- > v1, w- > v0 "ldr q1, [%0]\n" "ldr q0, [%1]\n" - //give in address to x3 + // give in address to x3 "mov x3, %0\n" - //give w address to x0 + // give w address to x0 "mov x0, %1\n" - //K- > x2 + // K- > x2 "mov x2, %3\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" - //load in bias + // load in bias "ldr q5, [x26]\n" "add x26, x26, %4\n" @@ -226,7 +197,7 @@ void mmm_4x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "ldr q11, [x26]\n" - //Computation loop + // Computation loop "0:\n" "ldr q3, [x3, 16]!\n" @@ -242,7 +213,7 @@ void mmm_4x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "1:\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" "str q5, [x26]\n" @@ -265,24 +236,24 @@ void mmm_4x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) void mmm_8x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) { asm volatile( - //init in- > v1, w- > v0 + // init in- > v1, w- > v0 "ldr q1, [%0]\n" "ldr q0, [%1]\n" - //give in address to x3 + // give in address to x3 "mov x3, %0\n" - //give w address to x0 + // give w address to x0 "mov x0, %1\n" - //K- > x2 + // K- > x2 "mov x2, %3\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" - //load in bias + // load in bias "ldr q5, [x26]\n" "add x26, x26, %4\n" @@ -306,7 +277,7 @@ void mmm_8x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "ldr q19, [x26]\n" - //Computation loop + // Computation loop "0:\n" "ldr q3, [x3, 16]\n" @@ -325,7 +296,7 @@ void mmm_8x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "mov v0.16b, v29.16b\n" "bne 0b\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" "str q5, [x26]\n" @@ -360,21 +331,21 @@ void mmm_8x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) void mmm_4x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) { asm volatile( - //init in- > v1, w- > v0 + // init in- > v1, w- > v0 "ldr q1, [%0]\n" "ldr q0, [%1]\n" - //give in address to x3 + // give in address to x3 "mov x3, %0\n" - //give w address to x0 + // give w address to x0 "mov x0, %1\n" - //K- > x2 + // K- > x2 "mov x2, %3\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" "ld1 {v5.4s, v6.4s}, [x26]\n" @@ -386,13 +357,13 @@ void mmm_4x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "ld1 {v11.4s, v12.4s}, [x26]\n" /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - */ + * 5 6 + * 7 8 + * 9 10 + * 11 12 + */ - //Computation loop + // Computation loop "0:\n" "ldr q29, [x0, 16]\n" @@ -411,7 +382,7 @@ void mmm_4x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "mov v1.16b, v3.16b\n" "bne 0b\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" "st1 {v5.4s, v6.4s}, [x26]\n" @@ -431,24 +402,24 @@ void mmm_4x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) void mmm_8x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) { asm volatile( - //init in- > v1, w- > v0 + // init in- > v1, w- > v0 "ldr q1, [%0]\n" "ldr q0, [%1]\n" 
- //give in address to x3 + // give in address to x3 "mov x3, %0\n" - //give w address to x0 + // give w address to x0 "mov x0, %1\n" - //K- > x2 + // K- > x2 "mov x2, %3\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" - //load in bias + // load in bias "ld1 {v5.4s, v6.4s}, [x26]\n" "add x26, x26, %4\n" "ld1 {v7.4s, v8.4s}, [x26]\n" @@ -466,18 +437,18 @@ void mmm_8x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "ld1 {v19.4s, v20.4s}, [x26]\n" /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - - 13 14 - 15 16 - 17 18 - 19 20 - */ - - //Computation loop + 5 6 + 7 8 + 9 10 + 11 12 + + 13 14 + 15 16 + 17 18 + 19 20 + */ + + // Computation loop "0:\n" "fmla v5.4s, v0.4s, v1.s[0]\n" @@ -507,7 +478,7 @@ void mmm_8x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "bne 0b\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" "st1 {v5.4s, v6.4s}, [x26]\n" @@ -535,26 +506,26 @@ void mmm_8x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) void mmm_4x12(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) { asm volatile( - //init in->v1, w->v0 + // init in->v1, w->v0 "ldr q1, [%0]\n" "ldr q0, [%1]\n" "ldr q29, [%1, 16]\n" // prefetch one more w - //give in address to x3 + // give in address to x3 "mov x3, %0\n" - //give w address to x0 + // give w address to x0 "mov x0, %1\n" - //K->x2 + // K->x2 "mov x2, %3\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" - //load in bias + // load in bias "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" "add x26, x26, %4\n" "ld1 {v8.4s, v9.4s, v10.4s}, [x26]\n" @@ -564,13 +535,13 @@ void mmm_4x12(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "ld1 {v14.4s, v15.4s, v16.4s}, [x26]\n" /* Layout - 5 6 7 - 8 9 10 - 11 12 13 - 14 15 16 - */ + 5 6 7 + 8 9 10 + 11 12 13 + 14 15 16 + */ - //Computation loop + // Computation loop "0:\n" // in(x3): v1 // w(x0): v0 v29 v30 @@ -598,7 +569,7 @@ void mmm_4x12(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "mov v1.16b, v2.16b\n" "bne 0b\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" @@ -618,26 +589,26 @@ void mmm_4x12(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) void mmm_8x12(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) { asm volatile( - //init in->v1, w->v0 + // init in->v1, w->v0 "ldr q1, [%0]\n" "ldr q0, [%1]\n" "ldr q29, [%1, 16]\n" // prefetch one more w - //give in address to x3 + // give in address to x3 "mov x3, %0\n" - //give w address to x0 + // give w address to x0 "mov x0, %1\n" - //K->x2 + // K->x2 "mov x2, %3\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" - //load in bias + // load in bias "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" "add x26, x26, %4\n" "ld1 {v8.4s, v9.4s, v10.4s}, [x26]\n" @@ -655,18 +626,18 @@ void mmm_8x12(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "ld1 {v26.4s, v27.4s, v28.4s}, [x26]\n" /* Layout - 5 6 7 - 8 9 10 - 11 12 13 - 14 15 16 - - 17 18 19 - 20 21 22 - 23 24 25 - 26 27 28 - */ - - //Computation loop + 5 6 7 + 8 9 10 + 11 12 13 + 14 15 16 + + 17 18 19 + 20 21 22 + 23 24 25 + 26 27 28 + */ + + // Computation loop "0:\n" // in(x3): v1 v2 // w(x0): v0 v29 v30 @@ -710,7 +681,7 @@ void mmm_8x12(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "bne 0b\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" @@ -736,7 +707,8 @@ void mmm_8x12(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "v7", "v6", "v5", "v3", "v2", "v1", "v0", "x26", "x3", "x2", "x0"); } -EE mmm_fp32_V8(int M, int N, int K, F32 *matrix1, F32 
*matrix2, F32 *tmp, F32 *result) +EE mmm_fp32_V8( + int M, int N, int K, bool transposeA, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *result) { int blockK = K; int blockM = 96; @@ -749,7 +721,11 @@ EE mmm_fp32_V8(int M, int N, int K, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *r MInner = UNI_MIN(blockM, M - i); for (n = 0; n <= N - 8; n += 8) { if (i == 0) { - matrix1_trans(8, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + if (transposeA) { + matrix2_trans(8, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans(8, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } } for (m = 0; m <= (MInner - 12); m += 12) { resultCurrent = result + n * M + m + i; @@ -778,7 +754,11 @@ EE mmm_fp32_V8(int M, int N, int K, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *r if ((N - n) >= 4) { if (i == 0) { - matrix1_trans(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + if (transposeA) { + matrix2_trans(4, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } } for (m = 0; m <= (MInner - 12); m += 12) { @@ -811,7 +791,12 @@ EE mmm_fp32_V8(int M, int N, int K, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *r if (N - n) { if (i == 0) { - matrix1_trans(N - n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + if (transposeA) { + matrix2_trans(N - n, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans( + N - n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } } for (m = 0; m <= (MInner - 12); m += 12) { diff --git a/compute/blas_enhance/src/cpu/arm/fp32/mvm.cpp b/compute/blas_enhance/src/cpu/arm/fp32/mvm.cpp new file mode 100644 index 00000000..ec364bcf --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp32/mvm.cpp @@ -0,0 +1,118 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
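+
+// Reference semantics for this file (a minimal C sketch, not part of the build):
+// for the row-major DF_NORMAL case, mvm_fp32 computes
+// result[i] += sum_k matrix[i * K + k] * vector[k], i.e.
+//     for (U32 i = 0; i < row; i++)
+//         for (U32 k = 0; k < col; k++)
+//             result[i] += matrix[i * col + k] * vector[k];
+// DF_NKN16 computes the same product on a weight matrix pre-packed into 16-row strips.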
+ +#include "error.h" +#include "cpu/arm/fp32/blas_fp32.h" + +EE matrix_vector_multiply_transform_weight_fp32(TensorDesc desc, F32 *src, F32 *dst) +{ + DataType dt; + DataFormat df; + U32 N, K; + EE ret = SUCCESS; + int i = 0; + switch (desc.df) { + case DF_NORMAL: { + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); + for (; i < (int)N - 15; i += 16) { + matrix1_trans(16, K, K, src + i * K, dst + i * K); + } + if (i < (int)N) { + memcpy(dst + i * K, src + i * K, (N - i) * K * bytesOf(DT_F32)); + } + break; + } + case DF_TRANSPOSE: { + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); + for (; i < (int)N - 15; i += 16) { + matrix2_trans(16, K, N, src + i, dst + i * K); + } + if (i < (int)N) { + int base = i; + F32 *basePtr = dst + i * K; + for (int j = 0; j < (int)K; j++) { + for (; i < (int)N; i++) { + basePtr[(i - base) * K + j] = src[j * N + i]; + } + } + } + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +void mvm_kernel_fp32(U32 rounds, U32 K, F32 *matrix, F32 *vector, F32 *result) +{ + U32 N = rounds * 16; + float32x4_t mat[4]; + F32 v; + float32x4_t res[4]; + + for (U32 n = 0; n < N; n += 16) { + F32 *bufMov = matrix + n * K; + for (int i = 0; i < 4; i++) { + res[i] = vld1q_f32(result + n + i * 4); + } + for (U32 k = 0; k < K; k++) { + v = vector[k]; + for (int i = 0; i < 4; i++) { + mat[i] = vld1q_f32(bufMov + i * 4); + } + for (int i = 0; i < 4; i++) { + res[i] = vfmaq_n_f32(res[i], mat[i], v); + } + bufMov += 16; + } + for (int i = 0; i < 4; i++) { + vst1q_f32(result + n + i * 4, res[i]); + } + } +} + +void mvm_pack_fp32(U32 row, U32 col, F32 *matrix, F32 *vector, F32 *result) +{ + U32 rounds = row / 16; + U32 nTail = row % 16; + + mvm_kernel_fp32(rounds, col, matrix, vector, result); + if (0 != nTail) { + mvm_row_tail(nTail, col, matrix + (row - nTail) * col, vector, result + (row - nTail)); + } +} + +EE mvm_fp32(U32 row, U32 col, DataFormat df, F32 *matrix, F32 *vector, F32 *result) +{ + EE ret = SUCCESS; + switch (df) { + case DF_NKN16: { + mvm_pack_fp32(row, col, matrix, vector, result); + break; + } + case DF_NORMAL: { + mvm_row_fp32(row, col, matrix, vector, result); + break; + } + case DF_TRANSPOSE: { + mvm_col_fp32(row, col, matrix, vector, result); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/blas_enhance/src/cpu/arm/fp32/mvm_col.cpp b/compute/blas_enhance/src/cpu/arm/fp32/mvm_col.cpp new file mode 100644 index 00000000..9dc1e256 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp32/mvm_col.cpp @@ -0,0 +1,94 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <arm_neon.h>
+#include "types.h"
+#include "blas_fp32.h"
+
+inline void mvm_col_tail(U32 N, U32 K, F32 *matrix, F32 *vector, F32 *result)
+{
+    float32x4_t tmp, res, mat;
+    U32 NTail = N % 4;
+    U32 NInner = N - NTail;
+
+    for (U32 i = 0; i < K; i++) {
+        for (U32 j = 0; j < NInner; j += 4) {
+            tmp = vld1q_f32(result + j);
+            mat = vld1q_f32(&matrix[j + N * i]);
+            res = vfmaq_n_f32(tmp, mat, vector[i]);
+            vst1q_f32(result + j, res);
+        }
+        if (NTail != 0) {
+            for (U32 p = 0; p < NTail; p++) {
+                result[NInner + p] += vector[i] * matrix[NInner + N * i + p];
+            }
+        }
+    }
+}
+
+void mvm_col_kernel(U32 N, U32 K, F32 *matrix, F32 *vector, F32 *result)
+{
+    float32x4_t mat[4] = {0};
+
+    F32 *w0 = matrix;
+    F32 *w1 = matrix + K * N;
+    F32 *w2 = matrix + 2 * K * N;
+    F32 *w3 = matrix + 3 * K * N;
+
+    U32 N_tail = N % 4;
+    U32 N_inner = N - N_tail;
+
+    for (U32 i = 0; i < K; i++) {
+        for (U32 j = 0; j < N_inner; j += 4) {
+            float32x4_t res[4] = {0};
+
+            res[3] = vld1q_f32(result + j);
+            mat[0] = vld1q_f32(w0);
+            mat[1] = vld1q_f32(w1);
+            mat[2] = vld1q_f32(w2);
+            mat[3] = vld1q_f32(w3);
+
+            res[0] = vfmaq_n_f32(res[3], mat[0], vector[i]);
+            res[1] = vfmaq_n_f32(res[0], mat[1], vector[K + i]);
+            res[2] = vfmaq_n_f32(res[1], mat[2], vector[2 * K + i]);
+            res[3] = vfmaq_n_f32(res[2], mat[3], vector[3 * K + i]);
+
+            w0 += 4;
+            w1 += 4;
+            w2 += 4;
+            w3 += 4;
+            vst1q_f32(result + j, res[3]);
+        }
+        if (N_tail != 0) {
+            for (U32 p = 0; p < N_tail; p++) {
+                result[N_inner + p] += vector[i] * *w0++;
+                result[N_inner + p] += vector[i + K] * *w1++;
+                result[N_inner + p] += vector[i + 2 * K] * *w2++;
+                result[N_inner + p] += vector[i + 3 * K] * *w3++;
+            }
+        }
+    }
+}
+
+void mvm_col_fp32(U32 numRows, U32 numColumns, F32 *matrix, F32 *vector, F32 *result)
+{
+    // Actual layout is KN, and vector is K
+    U32 N = numRows;
+    U32 K = numColumns;
+    U32 KInner = K / 4;
+    U32 KTail = K % 4;
+    mvm_col_kernel(N, KInner, matrix, vector, result);
+    if (KTail != 0) {
+        mvm_col_tail(N, KTail, matrix + (K - KTail) * N, vector + (K - KTail), result);
+    }
+}
diff --git a/compute/blas_enhance/src/cpu/arm/fp32/mvm_row.cpp b/compute/blas_enhance/src/cpu/arm/fp32/mvm_row.cpp
new file mode 100644
index 00000000..ac5efee5
--- /dev/null
+++ b/compute/blas_enhance/src/cpu/arm/fp32/mvm_row.cpp
@@ -0,0 +1,182 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "blas_fp32.h" + +void mvm_row_kernel(U32 N, U32 K, F32 *matrix, F32 *vector, F32 *result) +{ + I32 KTail = K % 4; + I32 KInner = K - KTail; + F32 *w0 = matrix; + F32 *w1 = matrix + K * N; + F32 *w2 = matrix + K * 2 * N; + F32 *w3 = matrix + K * 3 * N; +#ifdef __aarch64__ + asm volatile("mov x19, %5\n" + "ld1 {v18.s}[0], [x19]\n" + "add x19, x19, %8\n" + "ld1 {v18.s}[1], [x19]\n" + "add x19, x19, %8\n" + "ld1 {v18.s}[2], [x19]\n" + "add x19, x19, %8\n" + "ld1 {v18.s}[3], [x19]\n" + + "movi v17.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "mov x20, %6\n" + "cmp x20, #0x0\n" + "beq 3f\n" + "0:\n" + + "ld1 {v0.4s}, [%0], #16\n" + "ld1 {v1.4s}, [%1], #16\n" + "ld1 {v2.4s}, [%2], #16\n" + "ld1 {v3.4s}, [%3], #16\n" + "ld1 {v4.4s}, [%4], #16\n" + + "fmla v9.4s, v1.4s, v0.4s\n" + "fmla v10.4s, v2.4s, v0.4s\n" + "fmla v11.4s, v3.4s, v0.4s\n" + "fmla v12.4s, v4.4s, v0.4s\n" + + "subs x20, x20, #4\n" + "bne 0b\n" + + "faddp v13.4s, v9.4s, v10.4s\n" + "faddp v14.4s, v11.4s, v12.4s\n" + "faddp v17.4s, v13.4s, v14.4s\n" + "3:\n" + "mov x16, %7\n" + "cmp x16, #0x0\n" + "beq 2f\n" + + "1:\n" + "ld1 {v8.s}[0], [%0], #4\n" + + "ld1 {v1.s}[0], [%1], #4\n" + "ld1 {v1.s}[1], [%2], #4\n" + "ld1 {v1.s}[2], [%3], #4\n" + "ld1 {v1.s}[3], [%4], #4\n" + "fmla v16.4s, v1.4s, v8.s[0]\n" + + "subs x16, x16, 0x1\n" + "bne 1b\n" + + "fadd v17.4s, v17.4s, v16.4s\n" + + "2:\n" + + "fadd v17.4s, v17.4s, v18.4s\n" + "mov x19, %5\n" + "st1 {v17.s}[0], [x19]\n" + "add x19, x19, %8\n" + "st1 {v17.s}[1], [x19]\n" + "add x19, x19, %8\n" + "st1 {v17.s}[2], [x19]\n" + "add x19, x19, %8\n" + "st1 {v17.s}[3], [x19]\n" + : "+r"(vector), "+r"(w0), "+r"(w1), "+r"(w2), "+r"(w3), "+r"(result) + : "r"((I64)KInner), "r"((I64)KTail), "r"((I64)N * 4) + : "memory", "cc", "x19", "x20", "x21", "x22", "x23", "x24", "x15", "x16", "v0", + "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", + "v17", "v18"); +#else + asm volatile("mov r3, %[result]\n" + "vld1.f32 {d30[0]}, [r3], %[stride]\n" + "vld1.f32 {d30[1]}, [r3], %[stride]\n" + "vld1.f32 {d31[0]}, [r3], %[stride]\n" + "vld1.f32 {d31[1]}, [r3]\n" + + "veor q6, q6, q6\n" + "veor q5, q5, q5\n" + "veor q9, q9, q9\n" + "veor q10, q10, q10\n" + "veor q11, q11, q11\n" + "veor q12, q12, q12\n" + "mov r3, %[KInner]\n" + "cmp r3, #0\n" + "beq 3f\n" + "0:\n" + + "vld1.f32 {d0-d1}, [%[vector]]!\n" + "vld1.f32 {d2-d3}, [%[w0]]!\n" + "vld1.f32 {d4-d5}, [%[w1]]!\n" + "vld1.f32 {d6-d7}, [%[w2]]!\n" + "vld1.f32 {d8-d9}, [%[w3]]!\n" + + "vmla.f32 q9, q1, q0\n" + "vmla.f32 q10, q2, q0\n" + "vmla.f32 q11, q3, q0\n" + "vmla.f32 q12, q4, q0\n" + + "subs r3, r3, #4\n" + "bne 0b\n" + + "vpadd.f32 d26, d18, d20\n" + "vpadd.f32 d27, d19, d21\n" + "vpadd.f32 d28, d22, d24\n" + "vpadd.f32 d29, d23, d25\n" + "vadd.f32 d12, d26, d27\n" + "vadd.f32 d13, d28, d29\n" + "3:\n" + "mov r3, %[KTail]\n" + "cmp r3, #0\n" + "beq 2f\n" + + "1:\n" + "vld1.f32 {d0[0]}, [%[vector]]!\n" + "vld1.f32 {d2[0]}, [%[w0]]!\n" + "vld1.f32 {d2[1]}, [%[w1]]!\n" + "vld1.f32 {d3[0]}, [%[w2]]!\n" + "vld1.f32 {d3[1]}, [%[w3]]!\n" + "vmla.f32 q5, q1, d0[0]\n" + + "subs r3, r3, #1\n" + "bne 1b\n" + + "vadd.f32 q6, q6, q5\n" + + "2:\n" + + "vadd.f32 q6, q6, q15\n" + 
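+                 // q6 now holds the four row dot products plus the original
+                 // result values preloaded into q15; the strided stores below
+                 // scatter them back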
"mov r3, %[result]\n" + "vst1.f32 {d12[0]}, [r3], %[stride]\n" + "vst1.f32 {d12[1]}, [r3], %[stride]\n" + "vst1.f32 {d13[0]}, [r3], %[stride]\n" + "vst1.f32 {d13[1]}, [r3]\n" + : [vector] "+r"(vector), [w0] "+r"(w0), [w1] "+r"(w1), [w2] "+r"(w2), + [w3] "+r"(w3), [result] "+r"(result) + : [KInner] "r"(KInner), [KTail] "r"(KTail), [stride] "r"(N * 4) + : "memory", "cc", "r3", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +#endif +} + +void mvm_row_fp32(U32 numRows, U32 numColumns, F32 *matrix, F32 *vector, F32 *result) +{ + // Actual layout is NK, and vector is K + U32 N = numRows; + U32 K = numColumns; + U32 NTail = N % 4; + U32 NInner = N / 4; + for (U32 i = 0; i < NInner; i++) { + mvm_row_kernel(NInner, K, matrix + i * K, vector, result + i); + } + if (NTail != 0) { + mvm_row_tail(NTail, K, matrix + (N - NTail) * K, vector, result + (N - NTail)); + } +} diff --git a/compute/blas_enhance/src/cpu/arm/int8/blas_int8.h b/compute/blas_enhance/src/cpu/arm/int8/blas_int8.h new file mode 100644 index 00000000..035a770f --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/int8/blas_int8.h @@ -0,0 +1,43 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_BLAS_INT8 +#define _H_BLAS_INT8 + +#include "sys.h" +#include "types.h" +#include "error.h" +#include "tensor_desc.h" + +EE matrix_vector_multiply_transform_weight_int8(TensorDesc desc, INT8 *src, INT8 *dst); + +EE mvm_int8(U32 row, U32 col, DataFormat df, INT8 *matrix, INT8 *vector, I32 *tmp, I32 *result); + +void matrix_matrix_multiply_tmp_bytes_int8( + U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes); + +EE matrix_matrix_multiply_transform_rhsN_int8(TensorDesc desc, INT8 *src, INT8 *dst); + +EE matrix_matrix_multiply_transform_rhsT_int8(TensorDesc desc, INT8 *src, INT8 *dst); + +EE mmm_int8(int M, + int N, + int K, + bool transposeA, + INT8 *matrix1, + INT8 *matrix2, + INT8 *tmp, + I32 *result, + Arch arch); + +#endif diff --git a/compute/blas_enhance/src/cpu/arm/int8/mmm.cpp b/compute/blas_enhance/src/cpu/arm/int8/mmm.cpp new file mode 100644 index 00000000..89d0d667 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/int8/mmm.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "cpu/arm/blas_arm.h" +#include "cpu/arm/int8/blas_int8.h" +#include "cpu/arm/int8/mmm.h" +#include "cpu/arm/int8/mmm_common.h" + +void matrix_matrix_multiply_tmp_bytes_int8( + U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes) +{ + col1 = pad_to_4_multiple(col1); + row2 = pad_to_4_multiple(row2); + *bytes = row1 * col1 + row2 * col2; + *bytes *= bytesOf(dt); + *bytes += 32; +} + +EE matrix_matrix_multiply_transform_rhsN_int8(TensorDesc desc, INT8 *src, INT8 *dst) +{ + DataType dt; + DataFormat df; + U32 N, K; + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); + U32 K4 = pad_to_4_multiple(K); + int i = 0; + for (; i < (int)N - 11; i += 12) { + matrix2_trans_m12(K, N, src + i, dst + i * K4); + } + for (; i < (int)N - 7; i += 8) { + matrix2_trans_int8(8, K, N, src + i, dst + i * K4); + } + for (; i < (int)N - 3; i += 4) { + matrix2_trans_int8(4, K, N, src + i, dst + i * K4); + } + if ((int)N > i) { + matrix2_trans_int8(N - i, K, N, src + i, dst + i * K4); + } + return SUCCESS; +} + +EE matrix_matrix_multiply_transform_rhsT_int8(TensorDesc desc, INT8 *src, INT8 *dst) +{ + DataType dt; + DataFormat df; + U32 N, K; + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); + U32 K4 = pad_to_4_multiple(K); + int i = 0; + for (; i < (int)N - 11; i += 12) { + matrix1_trans_int8(12, K, K, src + i * K, dst + i * K4); + } + for (; i < (int)N - 7; i += 8) { + matrix1_trans_int8(8, K, K, src + i * K, dst + i * K4); + } + for (; i < (int)N - 3; i += 4) { + matrix1_trans_int8(4, K, K, src + i * K, dst + i * K4); + } + if ((int)N > i) { + matrix1_trans_int8(N - i, K, K, src + i * K, dst + i * K4); + } + return SUCCESS; +} + +EE mmm_int8( + int M, int N, int K, bool transposeA, INT8 *matrix1, INT8 *matrix2, INT8 *tmp, I32 *result, Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: + mmm_A55(M, N, K, transposeA, matrix1, matrix2, tmp, result); + break; + case ARM_A76: + mmm_A76(M, N, K, transposeA, matrix1, matrix2, tmp, result); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/blas_enhance/src/cpu/arm/int8/mmm.h b/compute/blas_enhance/src/cpu/arm/int8/mmm.h new file mode 100644 index 00000000..9a433664 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/int8/mmm.h @@ -0,0 +1,24 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_MMM +#define _H_MMM + +#include "types.h" + +void mmm_A55( + int M, int N, int K, bool transposeA, INT8 *matrix1, INT8 *matrix2, INT8 *tmp, I32 *result); + +void mmm_A76( + int M, int N, int K, bool transposeA, INT8 *matrix1, INT8 *matrix2, INT8 *tmp, I32 *result); +#endif diff --git a/compute/blas_enhance/src/cpu/arm/int8/mmm_A55.cpp b/compute/blas_enhance/src/cpu/arm/int8/mmm_A55.cpp new file mode 100644 index 00000000..479f726b --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/int8/mmm_A55.cpp @@ -0,0 +1,741 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
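+
+// Note on the sdot kernels below: each "sdot vD.4s, vW.16b, vI.4b[i]" accumulates
+// four dot products of 4 int8 values into 32-bit lanes; one lane, as a sketch
+// (not part of the build):
+//     out[d] += w[4*d] * in[4*i] + w[4*d + 1] * in[4*i + 1]
+//             + w[4*d + 2] * in[4*i + 2] + w[4*d + 3] * in[4*i + 3];
+// This is why K is padded to a multiple of 4 (pad_to_4_multiple) before the kernels run.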
+
+#ifdef _USE_INT8
+#include <arm_neon.h>
+#include <string.h>
+#include "cpu/arm/blas_arm.h"
+#include "cpu/arm/int8/mmm_common.h"
+#include "cpu/arm/int8/mmm.h"
+
+inline void mmm_4x4_A55(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out)
+{
+    asm volatile(
+        // init in-> v1, w-> v0
+        "ldr d1, [%0]\n"
+        "ldr x16, [%0, 8]\n"
+        "ins v1.d[1], x16\n"
+
+        "ldr d0, [%1]\n"
+        "ldr x17, [%1, 8]\n"
+        "ins v0.d[1], x17\n"
+
+        // give in address to x3
+        "mov x3, %0\n"
+
+        // give w address to x0
+        "mov x0, %1\n"
+
+        // K-> x2
+        "mov x2, %3\n"
+
+        // give out address to x26
+        "mov x26, %2\n"
+
+        // load in bias
+        "ldr q5, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "ldr q7, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "ldr q9, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "ldr q11, [x26]\n"
+
+        // Computation loop
+        "0:\n"
+
+        "ldr d3, [x3, 16]!\n"
+        "ldr x16, [x3, 8]\n"
+        "sdot v5.4s, v0.16b, v1.4b[0]\n"
+        "ldr d29, [x0, 16]!\n"
+        "ldr x17, [x0, 8]\n"
+        "sdot v7.4s, v0.16b, v1.4b[1]\n"
+        "ins v3.d[1], x16\n"
+        "subs x2, x2, #4\n"
+        "sdot v9.4s, v0.16b, v1.4b[2]\n"
+        "ins v29.d[1], x17\n"
+        "sdot v11.4s, v0.16b, v1.4b[3]\n"
+        "mov v1.16b, v3.16b\n"
+        "mov v0.16b, v29.16b\n"
+        "bne 0b\n"
+
+        "1:\n"
+
+        // give out address to x26
+        "mov x26, %2\n"
+
+        "str q5, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "str q7, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "str q9, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "str q11, [x26]\n"
+
+        : "+r"(in), "+r"(w), "+r"(out)
+        : "r"((I64)K), "r"((I64)offset)
+        : "memory", "cc", "v30", "v29", "v11", "v9", "v7", "v5", "v3", "v1", "v0", "x26", "x16",
+        "x17", "x3", "x2", "x0");
+}
+
+inline void mmm_8x4_A55(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out)
+{
+    asm volatile(
+        // init in-> v1, w-> v0
+        "ldr q1, [%0]\n"
+        "ldr q0, [%1]\n"
+
+        // give in address to x3
+        "mov x3, %0\n"
+
+        // give w address to x0
+        "mov x0, %1\n"
+
+        // K-> x2
+        "mov x2, %3\n"
+
+        // give out address to x26
+        "mov x26, %2\n"
+
+        // load in bias
+        "ldr q5, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "ldr q7, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "ldr q9, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "ldr q11, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "ldr q13, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "ldr q15, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "ldr q17, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "ldr q19, [x26]\n"
+
+        // Computation loop
+        "0:\n"
+
+        "ldr d3, [x3, 16]\n"
+        "ldr x16, [x3, 24]\n"
+        "sdot v5.4s, v0.16b, v1.4b[0]\n"
+        "ldr d29, [x0, 16]!\n"
+        "ldr x17, [x0, 8]\n"
+        "sdot v7.4s, v0.16b, v1.4b[1]\n"
+        "ins v3.d[1], x16\n"
+        "ldr d30, [x3, 32]!\n"
+        "sdot v9.4s, v0.16b, v1.4b[2]\n"
+        "ins v29.d[1], x17\n"
+        "sdot v11.4s, v0.16b, v1.4b[3]\n"
+
+        "ldr x16, [x3, 8]\n"
+        "subs x2, x2, #4\n"
+        "sdot v13.4s, v0.16b, v3.4b[0]\n"
+        "ins v30.d[1], x16\n"
+        "sdot v15.4s, v0.16b, v3.4b[1]\n"
+        "sdot v17.4s, v0.16b, v3.4b[2]\n"
+        "mov v1.16b, v30.16b\n"
+        "sdot v19.4s, v0.16b, v3.4b[3]\n"
+        "mov v0.16b, v29.16b\n"
+        "bne 0b\n"
+
+        // give out address to x26
+        "mov x26, %2\n"
+
+        "str q5, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "str q7, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "str q9, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "str q11, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "str q13, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "str q15, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "str q17, [x26]\n"
+        "add x26, x26, %4\n"
+
+        "str q19, [x26]\n"
+
+        : "+r"(in), "+r"(w), "+r"(out)
+        : "r"((I64)K), "r"((I64)offset)
+        : "memory", "cc", "v30", "v29", "v19", "v17", "v15", "v13", "v11", "v9", "v7", "v5", "v3",
+        "v1", "v0", "x26", "x16", "x17", "x3", "x2", "x0");
+}
+
+inline void mmm_4x8_A55(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out)
+{
+    asm volatile(
+        // init in-> v1, w-> v0
+        "ldr d1, [%0]\n"
+        "ldr x16, [%0, 8]\n"
+        "ins v1.d[1], x16\n"
+
+        "ldr d0, [%1]\n"
+        "ldr x17, [%1, 8]\n"
+        "ins v0.d[1], x17\n"
+
+        // give in address to x3
+        "mov x3, %0\n"
+
+        // give w address to x0
+        "mov x0, %1\n"
+
+        // K-> x2
+        "mov x2, %3\n"
+
+        // give out address to x26
+        "mov x26, %2\n"
+
+        "ld1 {v5.4s, v6.4s}, [x26]\n"
+        "add x26, x26, %4\n"
+        "ld1 {v7.4s, v8.4s}, [x26]\n"
+        "add x26, x26, %4\n"
+        "ld1 {v9.4s, v10.4s}, [x26]\n"
+        "add x26, x26, %4\n"
+        "ld1 {v11.4s, v12.4s}, [x26]\n"
+
+        /* Layout
+         * 5 6
+         * 7 8
+         * 9 10
+         * 11 12
+         */
+
+        // Computation loop
+        "0:\n"
+
+        "ldr d29, [x0, 16]\n"
+        "ldr x17, [x0, 24]\n"
+        "sdot v5.4s, v0.16b, v1.4b[0]\n"
+        "ldr d3, [x3, 16]!\n"
+        "ldr x16, [x3, 8]\n"
+        "sdot v7.4s, v0.16b, v1.4b[1]\n"
+        "ins v29.d[1], x17\n"
+        "subs x2, x2, #4\n"
+        "sdot v9.4s, v0.16b, v1.4b[2]\n"
+        "ins v3.d[1], x16\n"
+        "sdot v11.4s, v0.16b, v1.4b[3]\n"
+
+        "sdot v6.4s, v29.16b, v1.4b[0]\n"
+        "ldr d0, [x0, 32]!\n"
+        "ldr x17, [x0, 8]\n"
+        "sdot v8.4s, v29.16b, v1.4b[1]\n"
+        "sdot v10.4s, v29.16b, v1.4b[2]\n"
+        "ins v0.d[1], x17\n"
+        "sdot v12.4s, v29.16b, v1.4b[3]\n"
+        "mov v1.16b, v3.16b\n"
+        "bne 0b\n"
+
+        // give out address to x26
+        "mov x26, %2\n"
+
+        "st1 {v5.4s, v6.4s}, [x26]\n"
+        "add x26, x26, %4\n"
+        "st1 {v7.4s, v8.4s}, [x26]\n"
+        "add x26, x26, %4\n"
+        "st1 {v9.4s, v10.4s}, [x26]\n"
+        "add x26, x26, %4\n"
+        "st1 {v11.4s, v12.4s}, [x26]\n"
+
+        : "+r"(in), "+r"(w), "+r"(out)
+        : "r"((I64)K), "r"((I64)offset)
+        : "memory", "cc", "v29", "v12", "v11", "v10", "v9", "v8", "v7", "v6", "v5", "v3", "v1",
+        "v0", "x26", "x16", "x17", "x3", "x2", "x0");
+}
+
+inline void mmm_8x8_A55(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out)
+{
+    asm volatile(
+        // init in-> v1, w-> v0
+        "ldr d1, [%0]\n"
+        "ldr x16, [%0, 8]\n"
+        "ins v1.d[1], x16\n"
+
+        "ldr d0, [%1]\n"
+        "ldr x17, [%1, 8]\n"
+        "ins v0.d[1], x17\n"
+
+        // give in address to x3
+        "mov x3, %0\n"
+
+        // give w address to x0
+        "mov x0, %1\n"
+
+        // K-> x2
+        "mov x2, %3\n"
+
+        // give out address to x26
+        "mov x26, %2\n"
+
+        // load in bias
+        "ld1 {v5.4s, v6.4s}, [x26]\n"
+        "add x26, x26, %4\n"
+        "ld1 {v7.4s, v8.4s}, [x26]\n"
+        "add x26, x26, %4\n"
+        "ld1 {v9.4s, v10.4s}, [x26]\n"
+        "add x26, x26, %4\n"
+        "ld1 {v11.4s, v12.4s}, [x26]\n"
+        "add x26, x26, %4\n"
+        "ld1 {v13.4s, v14.4s}, [x26]\n"
+        "add x26, x26, %4\n"
+        "ld1 {v15.4s, v16.4s}, [x26]\n"
+        "add x26, x26, %4\n"
+        "ld1 {v17.4s, v18.4s}, [x26]\n"
+        "add x26, x26, %4\n"
+        "ld1 {v19.4s, v20.4s}, [x26]\n"
+
+        /* Layout
+           5 6
+           7 8
+           9 10
+           11 12
+
+           13 14
+           15 16
+           17 18
+           19 20
+         */
+
+        // Computation loop
+        "0:\n"
+
+        "sdot v5.4s, v0.16b, v1.4b[0]\n"
+        "ldr d3, [x3, 16]!\n"
+        "ldr x16, [x3, 8]\n"
+        "sdot v7.4s, v0.16b, v1.4b[1]\n"
+        "ldr d29, [x0, 16]\n"
+        "ldr x17, [x0, 24]\n"
+        "sdot v9.4s, v0.16b, v1.4b[2]\n"
+        "ins v3.d[1], x16\n"
+        "ldr d30, [x3, 16]!\n"
+        "sdot v11.4s, v0.16b, v1.4b[3]\n"
+        "ins v29.d[1], x17\n"
+
+        "sdot v13.4s, v0.16b, v3.4b[0]\n"
+        "ldr x16, [x3, 8]\n"
+        "subs x2, x2, #4\n"
+        "sdot v15.4s, v0.16b, v3.4b[1]\n"
+        "sdot v17.4s, v0.16b, v3.4b[2]\n"
+        "ins v30.d[1], x16\n"
+        "sdot v19.4s, v0.16b, v3.4b[3]\n"
+
+        "sdot v6.4s, v29.16b, v1.4b[0]\n"
+        "sdot v8.4s, v29.16b, v1.4b[1]\n"
+        "ldr d0, [x0, 32]!\n"
+        "ldr x17, [x0, 8]\n"
+        "sdot v10.4s, v29.16b, v1.4b[2]\n"
+        "sdot v12.4s, v29.16b, v1.4b[3]\n"
+
+        "sdot v14.4s, v29.16b, v3.4b[0]\n"
+        "ins v0.d[1], x17\n"
+        "mov v1.16b, v30.16b\n"
+        "sdot v16.4s, v29.16b, v3.4b[1]\n"
+        "sdot v18.4s, v29.16b, v3.4b[2]\n"
+        "sdot
v20.4s, v29.16b, v3.4b[3]\n" + + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "st1 {v5.4s, v6.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v7.4s, v8.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.4s, v12.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v13.4s, v14.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v15.4s, v16.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v17.4s, v18.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v19.4s, v20.4s}, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v30", "v29", "v20", "v19", "v18", "v17", "v16", "v15", "v14", "v13", + "v12", "v11", "v10", "v9", "v8", "v7", "v6", "v5", "v3", "v1", "v0", "x26", "x16", "x17", + "x3", "x2", "x0"); +} + +inline void mmm_4x12_A55(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ + asm volatile( + // init in->v1, w->v0 + "ldr q1, [%0]\n" + + "ldr q0, [%1]\n" + + "ldr q29, [%1, 16]\n" // prefetch one more w + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K->x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + // load in bias + "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.4s, v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.4s, v12.4s, v13.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.4s, v15.4s, v16.4s}, [x26]\n" + + /* Layout + 5 6 7 + 8 9 10 + 11 12 13 + 14 15 16 + */ + + // Computation loop + "0:\n" + // in(x3): v1 + // w(x0): v0 v29 v30 + + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d30, [x0, 32]\n" + "sdot v8.4s, v0.16b, v1.4b[1]\n" + "ldr x16, [x0, 40]\n" + "sdot v11.4s, v0.16b, v1.4b[2]\n" + "ldr d2, [x3, 16]!\n" // input of next round + "sdot v14.4s, v0.16b, v1.4b[3]\n" + "ldr x17, [x3, 8]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "ins v30.d[1], x16\n" + "sdot v9.4s, v29.16b, v1.4b[1]\n" + "ldr d0, [x0, 48]!\n" // first w of next round + "sdot v12.4s, v29.16b, v1.4b[2]\n" + "ins v2.d[1], x17\n" + "sdot v15.4s, v29.16b, v1.4b[3]\n" + "ldr x16, [x0, 8]\n" + + "sdot v7.4s, v30.16b, v1.4b[0]\n" + "ldr d29, [x0, 16]\n" + "sdot v10.4s, v30.16b, v1.4b[1]\n" + "ldr x19, [x0, 24]\n" + "ins v0.d[1], x16\n" + "sdot v13.4s, v30.16b, v1.4b[2]\n" + "subs x2, x2, #4\n" + "sdot v16.4s, v30.16b, v1.4b[3]\n" + + "mov v1.16b, v2.16b\n" + "ins v29.d[1], x19\n" + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.4s, v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.4s, v12.4s, v13.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.4s, v15.4s, v16.4s}, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v30", "v29", "v16", "v15", "v14", "v13", "v12", "v11", "v10", "v9", "v8", + "v7", "v6", "v5", "v3", "v2", "v1", "v0", "x26", "x19", "x16", "x17", "x3", "x2", "x0"); +} + +inline void mmm_8x12_A55(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ + asm volatile( + // init in->v1, w->v0 + "ldr q1, [%0]\n" + + "ldr q0, [%1]\n" + + "ldr q29, [%1, 16]\n" // prefetch one more w + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K->x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + // load in bias + "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.4s, v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.4s, v12.4s, v13.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.4s, v15.4s, v16.4s}, 
[x26]\n" + "add x26, x26, %4\n" + "ld1 {v17.4s, v18.4s, v19.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v20.4s, v21.4s, v22.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v23.4s, v24.4s, v25.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v26.4s, v27.4s, v28.4s}, [x26]\n" + + /* Layout + 5 6 7 + 8 9 10 + 11 12 13 + 14 15 16 + + 17 18 19 + 20 21 22 + 23 24 25 + 26 27 28 + */ + + // Computation loop + "0:\n" + // in(x3): v1 v2 + // w(x0): v0 v29 v30 + + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d30, [x0, 32]\n" + "sdot v8.4s, v0.16b, v1.4b[1]\n" + "ldr x16, [x0, 40]\n" + "sdot v11.4s, v0.16b, v1.4b[2]\n" + "ldr d2, [x3, 16]\n" + "sdot v14.4s, v0.16b, v1.4b[3]\n" + "ldr x17, [x3, 24]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "ins v30.d[1], x16\n" + "sdot v9.4s, v29.16b, v1.4b[1]\n" + "ldr d3, [x0, 48]!\n" // first w of next round + "sdot v12.4s, v29.16b, v1.4b[2]\n" + "ins v2.d[1], x17\n" + "sdot v15.4s, v29.16b, v1.4b[3]\n" + "ldr x16, [x0, 8]\n" + + "sdot v7.4s, v30.16b, v1.4b[0]\n" + "subs x2, x2, #4\n" + "sdot v10.4s, v30.16b, v1.4b[1]\n" + "ins v3.d[1], x16\n" + "sdot v13.4s, v30.16b, v1.4b[2]\n" + "sdot v16.4s, v30.16b, v1.4b[3]\n" + + "sdot v17.4s, v0.16b, v2.4b[0]\n" + "ldr d1, [x3, 32]!\n" + "sdot v20.4s, v0.16b, v2.4b[1]\n" + "ldr x17, [x3, 8]\n" + "sdot v23.4s, v0.16b, v2.4b[2]\n" + "sdot v26.4s, v0.16b, v2.4b[3]\n" + + "sdot v18.4s, v29.16b, v2.4b[0]\n" + "mov v0.16b, v3.16b\n" + "sdot v21.4s, v29.16b, v2.4b[1]\n" + "ins v1.d[1], x17\n" + "sdot v24.4s, v29.16b, v2.4b[2]\n" + "sdot v27.4s, v29.16b, v2.4b[3]\n" + + "sdot v19.4s, v30.16b, v2.4b[0]\n" + "ldr d29, [x0, 16]\n" + "sdot v22.4s, v30.16b, v2.4b[1]\n" + "ldr x16, [x0, 24]\n" + "sdot v25.4s, v30.16b, v2.4b[2]\n" + "sdot v28.4s, v30.16b, v2.4b[3]\n" + "ins v29.d[1], x16\n" + + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.4s, v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.4s, v12.4s, v13.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.4s, v15.4s, v16.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v17.4s, v18.4s, v19.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v20.4s, v21.4s, v22.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v23.4s, v24.4s, v25.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v26.4s, v27.4s, v28.4s}, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v30", "v29", "v28", "v27", "v26", "v25", "v24", "v23", "v22", "v21", + "v20", "v19", "v18", "v17", "v16", "v15", "v14", "v13", "v12", "v11", "v10", "v9", "v8", + "v7", "v6", "v5", "v3", "v2", "v1", "v0", "x26", "x16", "x17", "x3", "x2", "x0"); +} + +void mmm_A55( + int M, int N, int K, bool transposeA, INT8 *matrix1, INT8 *matrix2, INT8 *tmp, I32 *result) +{ + int blockK = K; + int K4 = pad_to_4_multiple(K); + int blockM = 96; + INT8 *matrix1Trans = tmp; + I32 *resultCurrent = result; + + int KInner, MInner, m, n; + for (int k = 0; k < K; k += blockK) { + KInner = UNI_MIN(blockK, K - k); // K for this inner iteration + for (int i = 0; i < M; i += blockM) { + MInner = UNI_MIN(blockM, M - i); // M for this inner iteration + for (n = 0; n <= N - 8; n += 8) { + if (i == 0) { + if (transposeA) { + matrix2_trans_int8(8, KInner, N, matrix1 + n, matrix1Trans + n * K4); + } else { + matrix1_trans_n8(KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); + } + } + + for (m = 0; m <= (MInner - 12); m += 12) { + resultCurrent = result + n * M + m + i; + mmm_8x12_A55( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, 
resultCurrent); + } + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_8x8_A55( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_8x4_A55( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N8_MTail(MInner - m, M, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, + resultCurrent); + } + } + + if ((N - n) >= 4) { + if (i == 0) { + if (transposeA) { + matrix2_trans_int8(4, KInner, N, matrix1 + n, matrix1Trans + n * K4); + } else { + matrix1_trans_int8(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); + } + } + + for (m = 0; m <= (MInner - 12); m += 12) { + resultCurrent = result + n * M + m + i; + mmm_4x12_A55( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_4x8_A55( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_4x4_A55( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N4_MTail(MInner - m, M, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, + resultCurrent); + } + n += 4; + } + + if (N - n) { + if (i == 0) { + if (transposeA) { + matrix2_trans_int8(N - n, KInner, N, matrix1 + n, matrix1Trans + n * K4); + } else { + matrix1_trans_int8( + N - n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); + } + } + + for (m = 0; m <= (MInner - 12); m += 12) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M12( + M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M8( + M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M4( + M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M(MInner - m, M, N - n, K4, matrix1Trans + n * K4, + matrix2 + (i + m) * K4, resultCurrent); + } + } + } + } +} +#endif diff --git a/compute/blas_enhance/src/cpu/arm/int8/mmm_A76.cpp b/compute/blas_enhance/src/cpu/arm/int8/mmm_A76.cpp new file mode 100644 index 00000000..5abe7852 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/int8/mmm_A76.cpp @@ -0,0 +1,685 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_INT8 +#include +#include +#include "cpu/arm/blas_arm.h" +#include "cpu/arm/int8/mmm_common.h" +#include "cpu/arm/int8/mmm.h" + +inline void mmm_4x4_A76(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ + asm volatile( + // init in- > v1, w- > v0 + "ldr q1, [%0]\n" + + "ldr q0, [%1]\n" + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K- > x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + // load in bias + "ldr q5, [x26]\n" + "add x26, x26, %4\n" + + "ldr q7, [x26]\n" + "add x26, x26, %4\n" + + "ldr q9, [x26]\n" + "add x26, x26, %4\n" + + "ldr q11, [x26]\n" + + // Computation loop + "0:\n" + + "ldr q3, [x3, 16]!\n" + "ldr q29, [x0, 16]!\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "subs x2, x2, #4\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + "mov v1.16b, v3.16b\n" + "mov v0.16b, v29.16b\n" + "bne 0b\n" + + "1:\n" + + // give out address to x26 + "mov x26, %2\n" + + "str q5, [x26]\n" + "add x26, x26, %4\n" + + "str q7, [x26]\n" + "add x26, x26, %4\n" + + "str q9, [x26]\n" + "add x26, x26, %4\n" + + "str q11, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v30", "v29", "v11", "v9", "v7", "v5", "v3", "v1", "v0", "x26", "x3", + "x2", "x0"); +} + +inline void mmm_8x4_A76(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ + asm volatile( + // init in- > v1, w- > v0 + "ldr q1, [%0]\n" + + "ldr q0, [%1]\n" + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K- > x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + // load in bias + "ldr q5, [x26]\n" + "add x26, x26, %4\n" + + "ldr q7, [x26]\n" + "add x26, x26, %4\n" + + "ldr q9, [x26]\n" + "add x26, x26, %4\n" + + "ldr q11, [x26]\n" + "add x26, x26, %4\n" + + "ldr q13, [x26]\n" + "add x26, x26, %4\n" + + "ldr q15, [x26]\n" + "add x26, x26, %4\n" + + "ldr q17, [x26]\n" + "add x26, x26, %4\n" + + "ldr q19, [x26]\n" + + // Computation loop + "0:\n" + + "ldr q3, [x3, 16]\n" + "ldr q29, [x0, 16]!\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "subs x2, x2, #4\n" + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "ldr q1, [x3, 32]!\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + "mov v0.16b, v29.16b\n" + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "str q5, [x26]\n" + "add x26, x26, %4\n" + + "str q7, [x26]\n" + "add x26, x26, %4\n" + + "str q9, [x26]\n" + "add x26, x26, %4\n" + + "str q11, [x26]\n" + "add x26, x26, %4\n" + + "str q13, [x26]\n" + "add x26, x26, %4\n" + + "str q15, [x26]\n" + "add x26, x26, %4\n" + + "str q17, [x26]\n" + "add x26, x26, %4\n" + + "str q19, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v30", "v29", "v19", "v17", "v15", "v13", "v11", "v9", "v7", "v5", "v3", + "v1", "v0", 
"x26", "x3", "x2", "x0"); +} + +inline void mmm_4x8_A76(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ + asm volatile( + // init in- > v1, w- > v0 + "ldr q1, [%0]\n" + + "ldr q0, [%1]\n" + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K- > x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + "ld1 {v5.4s, v6.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v7.4s, v8.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.4s, v12.4s}, [x26]\n" + + /* Layout + * 5 6 + * 7 8 + * 9 10 + * 11 12 + */ + + // Computation loop + "0:\n" + + "ldr q29, [x0, 16]\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr q3, [x3, 16]!\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "subs x2, x2, #4\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "ldr q0, [x0, 32]!\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + "mov v1.16b, v3.16b\n" + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "st1 {v5.4s, v6.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v7.4s, v8.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.4s, v12.4s}, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v29", "v12", "v11", "v10", "v9", "v8", "v7", "v6", "v5", "v3", "v1", + "v0", "x26", "x3", "x2", "x0"); +} + +inline void mmm_8x8_A76(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ + asm volatile( + // init in- > v1, w- > v0 + "ldr q1, [%0]\n" + + "ldr q0, [%1]\n" + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K- > x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + // load in bias + "ld1 {v5.4s, v6.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v7.4s, v8.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.4s, v12.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v13.4s, v14.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v15.4s, v16.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v17.4s, v18.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v19.4s, v20.4s}, [x26]\n" + + /* Layout + 5 6 + 7 8 + 9 10 + 11 12 + + 13 14 + 15 16 + 17 18 + 19 20 + */ + + // Computation loop + "0:\n" + + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr q3, [x3, 16]!\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ldr q29, [x0, 16]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "subs x2, x2, #4\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "ldr q0, [x0, 32]!\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + + "sdot v14.4s, v29.16b, v3.4b[0]\n" + "sdot v16.4s, v29.16b, v3.4b[1]\n" + "ldr q1, [x3, 16]!\n" + "sdot v18.4s, v29.16b, v3.4b[2]\n" + "sdot v20.4s, v29.16b, v3.4b[3]\n" + + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "st1 {v5.4s, v6.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v7.4s, v8.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.4s, v12.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v13.4s, v14.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v15.4s, v16.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v17.4s, v18.4s}, 
[x26]\n" + "add x26, x26, %4\n" + "st1 {v19.4s, v20.4s}, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v29", "v20", "v19", "v18", "v17", "v16", "v15", "v14", "v13", "v12", + "v11", "v10", "v9", "v8", "v7", "v6", "v5", "v3", "v1", "v0", "x26", "x3", "x2", "x0"); +} + +inline void mmm_4x12_A76(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ + asm volatile( + // init in->v1, w->v0 + "ldr q1, [%0]\n" + + "ldr q0, [%1]\n" + + "ldr q29, [%1, 16]\n" // prefetch one more w + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K->x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + // load in bias + "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.4s, v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.4s, v12.4s, v13.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.4s, v15.4s, v16.4s}, [x26]\n" + + /* Layout + 5 6 7 + 8 9 10 + 11 12 13 + 14 15 16 + */ + + // Computation loop + "0:\n" + // in(x3): v1 + // w(x0): v0 v29 v30 + + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr q30, [x0, 32]\n" + "sdot v8.4s, v0.16b, v1.4b[1]\n" + "sdot v11.4s, v0.16b, v1.4b[2]\n" + "ldr q2, [x3, 16]!\n" // input of next round + "sdot v14.4s, v0.16b, v1.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v9.4s, v29.16b, v1.4b[1]\n" + "ldr q0, [x0, 48]!\n" // first w of next round + "sdot v12.4s, v29.16b, v1.4b[2]\n" + "sdot v15.4s, v29.16b, v1.4b[3]\n" + + "sdot v7.4s, v30.16b, v1.4b[0]\n" + "ldr q29, [x0, 16]\n" + "sdot v10.4s, v30.16b, v1.4b[1]\n" + "sdot v13.4s, v30.16b, v1.4b[2]\n" + "subs x2, x2, #4\n" + "sdot v16.4s, v30.16b, v1.4b[3]\n" + + "mov v1.16b, v2.16b\n" + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.4s, v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.4s, v12.4s, v13.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.4s, v15.4s, v16.4s}, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v30", "v29", "v16", "v15", "v14", "v13", "v12", "v11", "v10", "v9", "v8", + "v7", "v6", "v5", "v3", "v2", "v1", "v0", "x26", "x19", "x3", "x2", "x0"); +} + +inline void mmm_8x12_A76(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ + asm volatile( + // init in->v1, w->v0 + "ldr q1, [%0]\n" + + "ldr q0, [%1]\n" + + "ldr q29, [%1, 16]\n" // prefetch one more w + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K->x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + // load in bias + "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.4s, v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.4s, v12.4s, v13.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.4s, v15.4s, v16.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v17.4s, v18.4s, v19.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v20.4s, v21.4s, v22.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v23.4s, v24.4s, v25.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v26.4s, v27.4s, v28.4s}, [x26]\n" + + /* Layout + 5 6 7 + 8 9 10 + 11 12 13 + 14 15 16 + + 17 18 19 + 20 21 22 + 23 24 25 + 26 27 28 + */ + + // Computation loop + "0:\n" + // in(x3): v1 v2 + // w(x0): v0 v29 v30 + + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr q30, [x0, 32]\n" + "sdot v8.4s, v0.16b, v1.4b[1]\n" + "sdot v11.4s, v0.16b, v1.4b[2]\n" + "ldr q2, [x3, 16]\n" + "sdot v14.4s, v0.16b, v1.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + 
"sdot v9.4s, v29.16b, v1.4b[1]\n" + "ldr q3, [x0, 48]!\n" // first w of next round + "sdot v12.4s, v29.16b, v1.4b[2]\n" + "sdot v15.4s, v29.16b, v1.4b[3]\n" + + "sdot v7.4s, v30.16b, v1.4b[0]\n" + "subs x2, x2, #4\n" + "sdot v10.4s, v30.16b, v1.4b[1]\n" + "sdot v13.4s, v30.16b, v1.4b[2]\n" + "sdot v16.4s, v30.16b, v1.4b[3]\n" + + "sdot v17.4s, v0.16b, v2.4b[0]\n" + "ldr q1, [x3, 32]!\n" + "sdot v20.4s, v0.16b, v2.4b[1]\n" + "sdot v23.4s, v0.16b, v2.4b[2]\n" + "sdot v26.4s, v0.16b, v2.4b[3]\n" + + "sdot v18.4s, v29.16b, v2.4b[0]\n" + "mov v0.16b, v3.16b\n" + "sdot v21.4s, v29.16b, v2.4b[1]\n" + "sdot v24.4s, v29.16b, v2.4b[2]\n" + "sdot v27.4s, v29.16b, v2.4b[3]\n" + + "sdot v19.4s, v30.16b, v2.4b[0]\n" + "ldr q29, [x0, 16]\n" + "sdot v22.4s, v30.16b, v2.4b[1]\n" + "sdot v25.4s, v30.16b, v2.4b[2]\n" + "sdot v28.4s, v30.16b, v2.4b[3]\n" + + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.4s, v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.4s, v12.4s, v13.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.4s, v15.4s, v16.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v17.4s, v18.4s, v19.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v20.4s, v21.4s, v22.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v23.4s, v24.4s, v25.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v26.4s, v27.4s, v28.4s}, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v30", "v29", "v28", "v27", "v26", "v25", "v24", "v23", "v22", "v21", + "v20", "v19", "v18", "v17", "v16", "v15", "v14", "v13", "v12", "v11", "v10", "v9", "v8", + "v7", "v6", "v5", "v3", "v2", "v1", "v0", "x26", "x3", "x2", "x0"); +} + +void mmm_A76( + int M, int N, int K, bool transposeA, INT8 *matrix1, INT8 *matrix2, INT8 *tmp, I32 *result) +{ + int blockK = K; + U32 K4 = pad_to_4_multiple(K); + int blockM = 96; + INT8 *matrix1Trans = tmp; + I32 *resultCurrent = result; + + int KInner, MInner, m, n; + for (int k = 0; k < K; k += blockK) { + KInner = UNI_MIN(blockK, K - k); // K for this inner iteration + for (int i = 0; i < M; i += blockM) { + MInner = UNI_MIN(blockM, M - i); // M for this inner iteration + for (n = 0; n <= N - 8; n += 8) { + if (i == 0) { + if (transposeA) { + matrix2_trans_int8(8, KInner, N, matrix1 + n, matrix1Trans + n * K4); + } else { + matrix1_trans_n8(KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); + } + } + + for (m = 0; m <= (MInner - 12); m += 12) { + resultCurrent = result + n * M + m + i; + mmm_8x12_A76( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_8x8_A76( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_8x4_A76( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N8_MTail(MInner - m, M, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, + resultCurrent); + } + } + + if ((N - n) >= 4) { + if (i == 0) { + if (transposeA) { + matrix2_trans_int8(4, KInner, N, matrix1 + n, matrix1Trans + n * K4); + } else { + matrix1_trans_int8(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); + } + } + + for (m = 0; m <= (MInner - 12); m += 12) { + resultCurrent = result + n * M + m + i; + mmm_4x12_A76( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, 
resultCurrent); + } + + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_4x8_A76( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_4x4_A76( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N4_MTail(MInner - m, M, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, + resultCurrent); + } + n += 4; + } + + if (N - n) { + if (i == 0) { + if (transposeA) { + matrix2_trans_int8(N - n, KInner, N, matrix1 + n, matrix1Trans + n * K4); + } else { + matrix1_trans_int8( + N - n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); + } + } + + for (m = 0; m <= (MInner - 12); m += 12) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M12( + M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M8( + M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M4( + M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M(MInner - m, M, N - n, K4, matrix1Trans + n * K4, + matrix2 + (i + m) * K4, resultCurrent); + } + } + } + } +} +#endif diff --git a/compute/blas_enhance/src/cpu/arm/int8/mmm_common.h b/compute/blas_enhance/src/cpu/arm/int8/mmm_common.h new file mode 100644 index 00000000..b5c213c2 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/int8/mmm_common.h @@ -0,0 +1,455 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+
+#ifndef _H_MMM_COMMON
+#define _H_MMM_COMMON
+
+#ifdef _USE_INT8
+#include <string.h>
+#include <arm_neon.h>
+
+#include "types.h"
+#include "error.h"
+#include "arm_neon_expand.h"
+
+inline void matrix1_trans_n8(U32 blockK, U32 K, INT8 *src, INT8 *dst)
+{
+    // Move k4 as one I32
+    I32 *dst1 = (I32 *)dst;
+
+    I32 *in[8];
+    for (U32 i = 0; i < 8; i++) {
+        in[i] = (I32 *)(src + i * K);
+    }
+    U32 k = 0;
+    for (; k < blockK - 7; k += 8) {
+        if (k % 64 == 0) {
+            asm volatile("prfm pldl2keep, [%[in0], 64]\n"
+                         "prfm pldl2keep, [%[in1], 64]\n"
+                         "prfm pldl2keep, [%[in2], 64]\n"
+                         "prfm pldl2keep, [%[in3], 64]\n"
+                         "prfm pldl2keep, [%[in4], 64]\n"
+                         "prfm pldl2keep, [%[in5], 64]\n"
+                         "prfm pldl2keep, [%[in6], 64]\n"
+                         "prfm pldl2keep, [%[in7], 64]\n"
+                         : [in0] "+r"(in[0]), [in1] "+r"(in[1]), [in2] "+r"(in[2]),
+                         [in3] "+r"(in[3]), [in4] "+r"(in[4]), [in5] "+r"(in[5]), [in6] "+r"(in[6]),
+                         [in7] "+r"(in[7])
+                         :
+                         : "memory", "cc");
+        }
+        asm volatile("ldr d0, [%[in0]], 8\n"
+                     "ldr d1, [%[in1]], 8\n"
+                     "ldr d2, [%[in2]], 8\n"
+                     "ldr d3, [%[in3]], 8\n"
+                     "ldr d4, [%[in4]], 8\n"
+                     "ldr d5, [%[in5]], 8\n"
+                     "ldr d6, [%[in6]], 8\n"
+                     "ldr d7, [%[in7]], 8\n"
+
+                     "zip1 v8.2s, v0.2s, v1.2s\n"
+                     "zip2 v12.2s, v0.2s, v1.2s\n"
+                     "zip1 v9.2s, v2.2s, v3.2s\n"
+                     "zip2 v13.2s, v2.2s, v3.2s\n"
+                     "zip1 v10.2s, v4.2s, v5.2s\n"
+                     "zip2 v14.2s, v4.2s, v5.2s\n"
+                     "zip1 v11.2s, v6.2s, v7.2s\n"
+                     "zip2 v15.2s, v6.2s, v7.2s\n"
+
+                     "str d8, [%[out]]\n"
+                     "str d9, [%[out], 8]\n"
+                     "str d10, [%[out], 16]\n"
+                     "str d11, [%[out], 24]\n"
+                     "str d12, [%[out], 32]\n"
+                     "str d13, [%[out], 40]\n"
+                     "str d14, [%[out], 48]\n"
+                     "str d15, [%[out], 56]\n"
+                     : [in0] "+r"(in[0]), [in1] "+r"(in[1]), [in2] "+r"(in[2]), [in3] "+r"(in[3]),
+                     [in4] "+r"(in[4]), [in5] "+r"(in[5]), [in6] "+r"(in[6]), [in7] "+r"(in[7])
+                     : [out] "r"(dst1)
+                     : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+                     "v10", "v11", "v12", "v13", "v14", "v15");
+        dst1 += 16;
+    }
+
+    if (k < blockK - 3) {
+        for (U32 i = 0; i < 8; i++) {
+            dst1[0] = in[i][0];
+            dst1++;
+            in[i]++;
+        }
+        k += 4;
+    }
+
+    if (k < blockK) {
+        U32 kTail = blockK - k;
+        INT8 *dstI8 = (INT8 *)dst1;
+        INT8 *inI[8];
+        for (U32 i = 0; i < 8; i++) {
+            inI[i] = (INT8 *)in[i];
+        }
+        for (U32 i = 0; i < 8; i++) {
+            for (U32 j = 0; j < 4; j++) {
+                if (j < kTail) {
+                    dstI8[i * 4 + j] = inI[i][j];
+                } else {
+                    dstI8[i * 4 + j] = 0;
+                }
+            }
+        }
+    }
+}
+
+// Trans from NK to NKn(size)k4
+inline void matrix1_trans_int8(U32 size, U32 blockK, U32 K, INT8 *src, INT8 *dst)
+{
+    // Move k4 as one I32
+    I32 *src1;
+    I32 *dst1 = (I32 *)dst;
+    U32 offset = 64;
+
+    U32 i = 0;
+    for (; i < blockK / 4; i++) {
+        for (U32 j = 0; j < size; j++) {
+            src1 = (I32 *)(src + j * K);
+
+            if (i % 16 == 0) {
+                asm volatile("prfm pldl2keep, [%0, %1]\n"
+                             : "+r"(src1)
+                             : "r"((I64)offset)
+                             : "memory", "cc");
+            }
+            *dst1++ = *(src1 + i);
+        }
+    }
+    U32 kTail = blockK % 4;
+    if (kTail > 0) {
+        INT8 *srcI8;
+        INT8 *dstI8 = (INT8 *)dst1;
+        for (U32 j = 0; j < size; j++) {
+            srcI8 = src + j * K + i * 4;
+            for (U32 k = 0; k < 4; k++) {
+                if (k < kTail) {
+                    dstI8[j * 4 + k] = srcI8[k];
+                } else {
+                    dstI8[j * 4 + k] = 0;
+                }
+            }
+        }
+    }
+}
+
+inline void matrix2_trans_m12(U32 blockK, U32 M, INT8 *src, INT8 *dst)
+{
+    INT8 *src1 = src;
+    INT8 *dst1 = dst;
+    U32 offset = 4 * M;
+
+    U32 i = 0;
+    for (; i < blockK - 3; i += 4) {
+        // Prefetch for the next iteration
+        asm volatile("prfm pldl2keep, [%0, %1]\n" : "+r"(src1) : "r"((I64)offset) : "memory", "cc");
+
+        INT8 *in12[4];
+        for (U32 j = 0; j < 4; j++) {
+            in12[j] = src1 + j * M;
+        }
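+        // The asm block below transposes the first 8 of these 12 columns with
+        // byte- and halfword-level zips: each output d-register ends up holding
+        // two columns as contiguous k4 groups. The scalar loop after it copies
+        // the remaining columns 8..11 into the last 16 bytes of the 48-byte
+        // m12k4 group.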
src1 += offset; + + asm volatile("ldr d0, [%[in0]]\n" + "ldr d1, [%[in1]]\n" + "ldr d2, [%[in2]]\n" + "ldr d3, [%[in3]]\n" + "zip1 v4.8b, v0.8b, v1.8b\n" + "zip2 v5.8b, v0.8b, v1.8b\n" + "zip1 v6.8b, v2.8b, v3.8b\n" + "zip2 v7.8b, v2.8b, v3.8b\n" + + "zip1 v0.4h, v4.4h, v6.4h\n" + "zip2 v1.4h, v4.4h, v6.4h\n" + "zip1 v2.4h, v5.4h, v7.4h\n" + "zip2 v3.4h, v5.4h, v7.4h\n" + "str d0, [%[out]]\n" + "str d1, [%[out], 8]\n" + "str d2, [%[out], 16]\n" + "str d3, [%[out], 24]\n" + : + : [in0] "r"(in12[0]), [in1] "r"(in12[1]), [in2] "r"(in12[2]), + [in3] "r"(in12[3]), [out] "r"(dst1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + + for (U32 j = 0; j < 4; j++) { + for (U32 k = 0; k < 4; k++) { + dst1[32 + j * 4 + k] = in12[k][8 + j]; + } + } + + dst1 += 48; + } + if (i < blockK) { + U32 kTail = blockK - i; + + INT8 *in12[4]; + INT8 zero[12] = {0}; + for (U32 j = 0; j < 4; j++) { + if (j < kTail) { + in12[j] = src1 + j * M; + } else { + in12[j] = zero; + } + } + + asm volatile("ldr d0, [%[in0]]\n" + "ldr d1, [%[in1]]\n" + "ldr d2, [%[in2]]\n" + "ldr d3, [%[in3]]\n" + "zip1 v4.8b, v0.8b, v1.8b\n" + "zip2 v5.8b, v0.8b, v1.8b\n" + "zip1 v6.8b, v2.8b, v3.8b\n" + "zip2 v7.8b, v2.8b, v3.8b\n" + + "zip1 v0.4h, v4.4h, v6.4h\n" + "zip2 v1.4h, v4.4h, v6.4h\n" + "zip1 v2.4h, v5.4h, v7.4h\n" + "zip2 v3.4h, v5.4h, v7.4h\n" + "str d0, [%[out]]\n" + "str d1, [%[out], 8]\n" + "str d2, [%[out], 16]\n" + "str d3, [%[out], 24]\n" + : + : [in0] "r"(in12[0]), [in1] "r"(in12[1]), [in2] "r"(in12[2]), + [in3] "r"(in12[3]), [out] "r"(dst1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + + for (U32 j = 0; j < 4; j++) { + for (U32 k = 0; k < 4; k++) { + dst1[32 + j * 4 + k] = in12[k][8 + j]; + } + } + } +} + +// Trans from KM to MKm(size)k4 +inline void matrix2_trans_int8(U32 size, U32 blockK, U32 M, INT8 *src, INT8 *dst) +{ + INT8 *src1 = src; + INT8 *dst1 = dst; + U32 offset = 4 * M; + + U32 i = 0; + for (; i < blockK - 3; i += 4) { + src1 = src + i * M; + asm volatile("prfm pldl2keep, [%0, %1]\n" : "+r"(src1) : "r"((I64)offset) : "memory", "cc"); + for (U32 j = 0; j < size; j++) { + src1 = src + i * M + j; + for (U32 k = 0; k < 4; k++) { + *dst1 = *src1; + dst1++; + src1 += M; + } + } + } + if (i < blockK) { + U32 kTail = blockK - i; + for (U32 j = 0; j < size; j++) { + src1 = src + i * M + j; + for (U32 k = 0; k < 4; k++) { + if (k < kTail) { + *dst1 = *src1; + dst1++; + src1 += M; + } else { + *dst1 = 0; + dst1++; + } + } + } + } +} + +inline void mmm_N8_MTail(U32 MInner, U32 M, U32 K, INT8 *matrix1, INT8 *matrix2, I32 *result) +{ + int8x16_t mat1[2]; + int8x16_t mat2; + int32x4_t res[4][2] = {{0}}; + I32 tmp[8] = {0}; + + CHECK_REQUIREMENT(MInner < 4); + + for (U32 i = 0; i < K; i += 4) { + mat1[0] = vld1q_s8(matrix1 + i * 8); + mat1[1] = vld1q_s8(matrix1 + i * 8 + 16); + + mat2 = vld1q_s8(matrix2 + i * MInner); + + for (U32 j = 0; j < MInner; j++) { + res[j][0] = vdotq_laneq_s32_builtin(res[j][0], mat1[0], mat2, j); + res[j][1] = vdotq_laneq_s32_builtin(res[j][1], mat1[1], mat2, j); + } + } + for (U32 p = 0; p < MInner; p++) { + vst1q_s32(tmp, res[p][0]); + vst1q_s32(tmp + 4, res[p][1]); + for (U32 q = 0; q < 8; q++) { + result[q * M + p] += tmp[q]; + } + res[p][0] = vdupq_n_s32(0); + res[p][1] = vdupq_n_s32(0); + } +} + +inline void mmm_N4_MTail(U32 MInner, U32 M, U32 K, INT8 *matrix1, INT8 *matrix2, I32 *result) +{ + int8x16_t mat1 = {0}; + int8x16_t mat2 = {0}; + int32x4_t res[4] = {0}; + I32 tmp[8] = {0}; + + CHECK_REQUIREMENT(MInner < 4); + + for (U32 i = 0; i < K; i 
+= 4) {
+        // 4-row packing: each k4 group holds 4 rows x 4 bytes, so the byte stride is i * 4
+        mat1 = vld1q_s8(matrix1 + i * 4);
+
+        mat2 = vld1q_s8(matrix2 + i * MInner);
+
+        for (U32 j = 0; j < MInner; j++) {
+            res[j] = vdotq_laneq_s32_builtin(res[j], mat1, mat2, j);
+        }
+    }
+    for (U32 p = 0; p < MInner; p++) {
+        vst1q_s32(tmp, res[p]);
+        for (U32 q = 0; q < 4; q++) {  // only 4 rows in this block
+            result[q * M + p] += tmp[q];
+        }
+        res[p] = vdupq_n_s32(0);
+    }
+}
+
+inline void mmm_NTail_M12(U32 M, U32 N, U32 K, INT8 *matrix1, INT8 *matrix2, I32 *result)
+{
+    int8x16_t mat1 = {0};
+    int8x16_t mat2[3] = {0};
+    int32x4_t res[4][3] = {{0}};
+
+    for (U32 i = 0; i < N; i++) {
+        res[i][0] = vld1q_s32(result + i * M);
+        res[i][1] = vld1q_s32(result + i * M + 4);
+        res[i][2] = vld1q_s32(result + i * M + 8);
+    }
+
+    for (U32 q = 0; q < K; q += 4) {
+        mat1 = vld1q_s8(matrix1 + q * N);
+
+        mat2[0] = vld1q_s8(matrix2 + q * 12);
+        mat2[1] = vld1q_s8(matrix2 + q * 12 + 16);
+        mat2[2] = vld1q_s8(matrix2 + q * 12 + 32);
+
+        for (U32 n = 0; n < N; n++) {
+            res[n][0] = vdotq_laneq_s32_builtin(res[n][0], mat2[0], mat1, n);
+            res[n][1] = vdotq_laneq_s32_builtin(res[n][1], mat2[1], mat1, n);
+            res[n][2] = vdotq_laneq_s32_builtin(res[n][2], mat2[2], mat1, n);
+        }
+    }
+
+    for (U32 i = 0; i < N; i++) {
+        vst1q_s32(result + i * M, res[i][0]);
+        vst1q_s32(result + i * M + 4, res[i][1]);
+        vst1q_s32(result + i * M + 8, res[i][2]);
+    }
+}
+
+inline void mmm_NTail_M8(U32 M, U32 N, U32 K, INT8 *matrix1, INT8 *matrix2, I32 *result)
+{
+    int8x16_t mat1 = {0};
+    int8x16_t mat2[2] = {0};
+    int32x4_t res[4][2] = {{0}};
+
+    for (U32 i = 0; i < N; i++) {
+        res[i][0] = vld1q_s32(result + i * M);
+        res[i][1] = vld1q_s32(result + i * M + 4);
+    }
+
+    for (U32 q = 0; q < K; q += 4) {
+        mat1 = vld1q_s8(matrix1 + q * N);
+
+        mat2[0] = vld1q_s8(matrix2 + q * 8);
+        mat2[1] = vld1q_s8(matrix2 + q * 8 + 16);
+
+        for (U32 n = 0; n < N; n++) {
+            res[n][0] = vdotq_laneq_s32_builtin(res[n][0], mat2[0], mat1, n);
+            res[n][1] = vdotq_laneq_s32_builtin(res[n][1], mat2[1], mat1, n);
+        }
+    }
+
+    for (U32 i = 0; i < N; i++) {
+        vst1q_s32(result + i * M, res[i][0]);
+        vst1q_s32(result + i * M + 4, res[i][1]);
+    }
+}
+
+inline void mmm_NTail_M4(U32 M, U32 N, U32 K, INT8 *matrix1, INT8 *matrix2, I32 *result)
+{
+    int8x16_t mat1 = {0};
+    int8x16_t mat2 = {0};
+    int32x4_t res[4] = {0};
+
+    for (U32 i = 0; i < N; i++) {
+        res[i] = vld1q_s32(result + i * M);
+    }
+
+    for (U32 q = 0; q < K; q += 4) {
+        mat1 = vld1q_s8(matrix1 + q * N);
+
+        mat2 = vld1q_s8(matrix2 + q * 4);
+
+        for (U32 n = 0; n < N; n++) {
+            res[n] = vdotq_laneq_s32_builtin(res[n], mat2, mat1, n);
+        }
+    }
+
+    for (U32 i = 0; i < N; i++) {
+        vst1q_s32(result + i * M, res[i]);
+    }
+}
+
+// matrix2 has been transformed to MKm(MInner)K4
+inline void mmm_NTail_M(U32 MInner, U32 M, U32 N, U32 K, INT8 *matrix1, INT8 *matrix2, I32 *result)
+{
+    int8x16_t mat1 = {0};
+    int8x16_t mat2 = {0};
+    int32x4_t res[3] = {0};
+    I32 buf[4];
+
+    // for (U32 i = 0; i < N; i++) {
+    //     res[i] = vld1q_s32(result + i*M);
+    // }
+
+    for (U32 q = 0; q < K; q += 4) {
+        mat1 = vld1q_s8(matrix1 + q * N);
+
+        mat2 = vld1q_s8(matrix2 + q * MInner);
+
+        for (U32 n = 0; n < N; n++) {
+            res[n] = vdotq_laneq_s32_builtin(res[n], mat2, mat1, n);
+        }
+    }
+
+    for (U32 i = 0; i < N; i++) {
+        vst1q_s32(buf, res[i]);
+        for (U32 j = 0; j < MInner; j++) {
+            result[i * M + j] += buf[j];
+        }
+    }
+}
+#endif
+#endif
diff --git a/compute/blas_enhance/src/cpu/arm/int8/mvm.cpp b/compute/blas_enhance/src/cpu/arm/int8/mvm.cpp
new file mode 100644
index 00000000..97e9fb6a
--- /dev/null
+++ b/compute/blas_enhance/src/cpu/arm/int8/mvm.cpp
@@ -0,0 +1,168 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifdef _USE_INT8
+#include "cpu/arm/blas_arm.h"
+#include "cpu/arm/int8/blas_int8.h"
+#include "cpu/arm/int8/mvm.h"
+#include "cpu/arm/int8/mmm_common.h"
+
+EE matrix_vector_multiply_transform_weight_int8(TensorDesc desc, INT8 *src, INT8 *dst)
+{
+    DataType dt;
+    DataFormat df;
+    U32 N, K;
+    EE ret = SUCCESS;
+    int i = 0;
+    switch (desc.df) {
+        case DF_NORMAL: {
+            CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K));
+            U32 K4 = pad_to_4_multiple(K);
+            for (; i < (int)N - 31; i += 32) {
+                matrix1_trans_int8(32, K, K, src + i * K, dst + i * K4);
+            }
+            if (i < (int)N) {
+                memcpy(dst + i * K4, src + i * K, (N - i) * K * bytesOf(DT_I8));
+            }
+            break;
+        }
+        case DF_TRANSPOSE: {
+            CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N));
+            U32 K4 = pad_to_4_multiple(K);
+            for (; i < (int)N - 31; i += 32) {
+                matrix2_trans_int8(32, K, N, src + i, dst + i * K4);
+            }
+            if (i < (int)N) {
+                int base = i;
+                INT8 *basePtr = dst + i * K4;
+                for (int j = 0; j < (int)K; j++) {
+                    // restart at the first tail row for every column j
+                    for (i = base; i < (int)N; i++) {
+                        basePtr[(i - base) * K + j] = src[j * N + i];
+                    }
+                }
+            }
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
+
+void mvm_row_pack(U32 Nbatch, U32 K, INT8 *matrix, INT8 *vector, I32 *result)
+{
+    U32 N = Nbatch * 32;
+    int8x16_t mat[16];
+    int8x8_t v;
+    int32x4_t res[8];
+    U32 K_tail = K % 4;
+    U32 K_inner = K - K_tail;
+    U32 K4 = pad_to_4_multiple(K);
+
+    for (U32 n = 0; n < N; n += 32) {
+        INT8 *bufMov = matrix + n * K4;
+        if (K_inner > 0) {
+            for (int i = 0; i < 8; i++) {
+                res[i] = vld1q_s32(result + n + i * 4);
+            }
+            U32 k = 0;
+            for (; k + 8 <= K_inner; k += 8) {
+                v = vld1_s8(vector + k);
+                for (int i = 0; i < 8; i++) {
+                    mat[i] = vld1q_s8(bufMov + i * 16);
+                }
+                for (int i = 0; i < 8; i++) {
+                    res[i] = vdotq_lane_s32(res[i], mat[i], v, 0);
+                }
+                for (int i = 8; i < 16; i++) {
+                    mat[i] = vld1q_s8(bufMov + i * 16);
+                }
+                for (int i = 0; i < 8; i++) {
+                    res[i] = vdotq_lane_s32(res[i], mat[i + 8], v, 1);
+                }
+                bufMov += 256;
+            }
+            if (K_inner > k) {
+                // 4 elements remain (K_inner % 8 == 4): reload the last 8 vector
+                // bytes and use lane 1; this assumes k >= 4, i.e. K_inner >= 8
+                v = vld1_s8(vector + k - 4);
+                for (int i = 0; i < 8; i++) {
+                    mat[i] = vld1q_s8(bufMov + i * 16);
+                }
+                for (int i = 0; i < 8; i++) {
+                    res[i] = vdotq_lane_s32(res[i], mat[i], v, 1);
+                }
+                bufMov += 128;
+            }
+
+            for (int i = 0; i < 8; i++) {
+                vst1q_s32(result + n + i * 4, res[i]);
+            }
+        }
+        if (K_tail > 0) {
+            for (int i = 0; i < 32; i++) {
+                I32 tmp = 0;
+                for (int j = 0; j < (int)K_tail; j++) {
+                    tmp += vector[K_inner +
j] * bufMov[j]; + } + result[n + i] += tmp; + bufMov += 4; + } + } + } +} + +void mvm_row(U32 numRows, U32 numColumns, DataFormat df, INT8 *matrix, INT8 *vector, I32 *result) +{ + // Actual layout is NK, and vector is K + U32 N = numRows; + U32 K = numColumns; + switch (df) { + case DF_NORMAL: { + U32 Nbatch = N / 8; + U32 NTail = N % 8; + + mvm_row_unpack(Nbatch, K, matrix, vector, result); + + if (NTail != 0) { + mvm_row_tail(NTail, K, matrix + (N - NTail) * K, vector, result + N - NTail); + } + break; + } + case DF_NKN32K4: { + U32 Nbatch = N / 32; + U32 NTail = N % 32; + + mvm_row_pack(Nbatch, K, matrix, vector, result); + + if (NTail != 0) { + U32 K4 = pad_to_4_multiple(K); + mvm_row_tail(NTail, K, matrix + (N - NTail) * K4, vector, result + N - NTail); + } + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } +} + +EE mvm_int8(U32 row, U32 col, DataFormat df, INT8 *matrix, INT8 *vector, I32 *tmp, I32 *result) +{ + if (DF_TRANSPOSE == df) { + mvm_col(row, col, matrix, vector, tmp, result); + } else { + mvm_row(row, col, df, matrix, vector, result); + } + return SUCCESS; +} +#endif diff --git a/compute/blas_enhance/src/cpu/arm/int8/mvm.h b/compute/blas_enhance/src/cpu/arm/int8/mvm.h new file mode 100644 index 00000000..ef9ad23e --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/int8/mvm.h @@ -0,0 +1,128 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
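+// Reference semantics for the NEON kernels below (a scalar sketch): the
+// row-major paths compute result[n] += sum over k of matrix[n * K + k] *
+// vector[k] with I32 accumulation, and the column-major path (mvm_col)
+// computes result[n] += sum over k of matrix[k * N + n] * vector[k]. The
+// vectorized variants only change the traversal order; the packed
+// DF_NKN32K4 path in mvm.cpp additionally pads K up to a multiple of 4 so
+// each dot-product (SDOT) consumes a full 4-byte group per lane.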
+
+#ifndef _H_MVM
+#define _H_MVM
+
+#ifdef _USE_INT8
+#include <string.h>
+#include <arm_neon.h>
+
+inline void mvm_col_tail(U32 N, U32 K, INT8 *matrix, INT8 *vector, I32 *result)
+{
+    for (U32 n = 0; n < N; n++) {
+        I32 tmp = 0;
+        for (U32 k = 0; k < K; k++) {
+            tmp += vector[k] * matrix[k * N + n];
+        }
+        result[n] += tmp;
+    }
+}
+
+inline void mvm_row_tail(U32 N, U32 K, INT8 *matrix, INT8 *vector, I32 *result)
+{
+    INT8 *cur_row = matrix;
+    for (U32 n = 0; n < N; n++) {
+        I32 tmp = 0;
+        for (U32 k = 0; k < K; k++) {
+            tmp += vector[k] * cur_row[k];
+        }
+        result[n] += tmp;
+        cur_row += K;
+    }
+}
+
+inline void mvm_row_unpack(U32 Nbatch, U32 K, INT8 *matrix, INT8 *vector, I32 *result)
+{
+    U32 N = Nbatch * 8;
+    int8x16_t mat[8], v;
+    U32 K_tail = K % 16;
+    U32 K_inner = K - K_tail;
+    for (U32 n = 0; n < N; n += 8) {
+        int32x4_t res[8] = {0};
+        int32x4_t bias[2];
+
+        INT8 *w[8];
+        for (int i = 0; i < 8; i++) {
+            w[i] = matrix + (n + i) * K;
+        }
+
+        for (U32 k = 0; k < K_inner; k += 16) {
+            v = vld1q_s8(vector + k);
+            for (int i = 0; i < 8; i++) {
+                mat[i] = vld1q_s8(w[i] + k);
+            }
+            for (int i = 0; i < 8; i++) {
+                res[i] = vdotq_s32(res[i], mat[i], v);
+            }
+        }
+        bias[0] = vld1q_s32(result + n);
+        bias[1] = vld1q_s32(result + n + 4);
+
+        res[0] = vpaddq_s32(res[0], res[1]);
+        res[4] = vpaddq_s32(res[4], res[5]);
+        res[2] = vpaddq_s32(res[2], res[3]);
+        res[6] = vpaddq_s32(res[6], res[7]);
+        res[0] = vpaddq_s32(res[0], res[2]);
+        res[4] = vpaddq_s32(res[4], res[6]);
+        res[0] = vaddq_s32(res[0], bias[0]);
+        res[4] = vaddq_s32(res[4], bias[1]);
+
+        vst1q_s32(result + n, res[0]);
+        vst1q_s32(result + n + 4, res[4]);
+
+        if (K_tail != 0) {
+            for (int i = 0; i < 8; i++) {
+                I32 tmp = 0;
+                for (U32 p = K_inner; p < K; p++) {
+                    tmp += vector[p] * w[i][p];
+                }
+                result[n + i] += tmp;
+            }
+        }
+    }
+}
+
+inline void mvm_col(U32 numRows, U32 numColumns, INT8 *matrix, INT8 *vector, I32 *tmp, I32 *result)
+{
+    // Actual layout is KN, and vector is K
+    U32 N = numRows;
+    U32 K = numColumns;
+    U32 NTail = N % 64;
+    U32 NInner = N - NTail;
+
+    for (U32 n = 0; n < NInner; n += 64) {
+        memset(tmp, 0, sizeof(I32) * 64);
+        for (U32 k = 0; k < K; k++) {
+            for (U32 i = 0; i < 64; i++) {
+                tmp[i] += vector[k] * matrix[k * N + n + i];
+            }
+        }
+
+        for (U32 i = 0; i < 64; i++) {
+            result[n + i] += tmp[i];
+        }
+    }
+
+    memset(tmp, 0, sizeof(I32) * 64);
+    for (U32 k = 0; k < K; k++) {
+        for (U32 i = 0; i < NTail; i++) {
+            tmp[i] += vector[k] * matrix[k * N + NInner + i];
+        }
+    }
+    // accumulate into result once, after the k loop has finished
+    for (U32 i = 0; i < NTail; i++) {
+        result[NInner + i] += tmp[i];
+    }
+}
+#endif
+#endif
diff --git a/compute/blas_enhance/src/cpu/arm/mmm.cpp b/compute/blas_enhance/src/cpu/arm/mmm.cpp
new file mode 100644
index 00000000..e2d57d2e
--- /dev/null
+++ b/compute/blas_enhance/src/cpu/arm/mmm.cpp
@@ -0,0 +1,194 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "error.h" +#include "types.h" +#include "blas_enhance.h" +#include "cpu/arm/blas_arm.h" +#ifdef _USE_FP16 +#include "cpu/arm/fp16/blas_fp16.h" +#endif +#ifdef _USE_FP32 +#include "cpu/arm/fp32/blas_fp32.h" +#endif +#ifdef _USE_INT8 +#include "cpu/arm/int8/blas_int8.h" +#endif + +EE matrix_matrix_multiply_tmp_bytes_arm( + U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N, DataType dt, U32 *bytes) +{ + EE ret = SUCCESS; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: { + matrix_matrix_multiply_tmp_bytes_fp16( + matrixA_M, matrixA_K, matrixB_K, matrixB_N, dt, bytes); + break; + } +#endif +#ifdef _USE_FP32 + case DT_F32: { + matrix_matrix_multiply_tmp_bytes_fp32( + matrixA_M, matrixA_K, matrixB_K, matrixB_N, dt, bytes); + break; + } +#endif +#ifdef _USE_INT8 + case DT_I8: { + matrix_matrix_multiply_tmp_bytes_int8( + matrixA_M, matrixA_K, matrixB_K, matrixB_N, dt, bytes); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + + return ret; +} + +static EE matrix_matrix_multiply_transform_rhsN( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) +{ + EE ret = SUCCESS; + switch (desc.dt) { +#ifdef _USE_FP16 + case DT_F16: { + ret = matrix_matrix_multiply_transform_rhsN_fp16(desc, (F16 *)src, (F16 *)dst); + break; + } +#endif +#ifdef _USE_FP32 + case DT_F32: { + ret = matrix_matrix_multiply_transform_rhsN_fp32(desc, (F32 *)src, (F32 *)dst); + break; + } +#endif +#ifdef _USE_INT8 + case DT_I8: { + ret = matrix_matrix_multiply_transform_rhsN_int8(desc, (INT8 *)src, (INT8 *)dst); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + (*descTran) = desc; + (*descTran).df = targetFormat4MatrixB(desc.dt); + return ret; +} + +static EE matrix_matrix_multiply_transform_rhsT( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) +{ + EE ret = SUCCESS; + switch (desc.dt) { +#ifdef _USE_FP16 + case DT_F16: { + ret = matrix_matrix_multiply_transform_rhsT_fp16(desc, (F16 *)src, (F16 *)dst); + break; + } +#endif +#ifdef _USE_FP32 + case DT_F32: { + ret = matrix_matrix_multiply_transform_rhsT_fp32(desc, (F32 *)src, (F32 *)dst); + break; + } +#endif +#ifdef _USE_INT8 + case DT_I8: { + ret = matrix_matrix_multiply_transform_rhsT_int8(desc, (INT8 *)src, (INT8 *)dst); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + (*descTran) = desc; + (*descTran).df = targetFormat4MatrixB(desc.dt); + std::swap((*descTran).dims[0], (*descTran).dims[1]); + return ret; +} + +EE matrix_matrix_multiply_transform_rhs_arm( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) +{ + if (desc.df == targetFormat4MatrixB(desc.dt)) { + return SUCCESS; + } + EE ret = SUCCESS; + switch (desc.df) { + case DF_NORMAL: { + ret = matrix_matrix_multiply_transform_rhsN(desc, src, descTran, dst); + break; + } + case DF_TRANSPOSE: { + ret = matrix_matrix_multiply_transform_rhsT(desc, src, descTran, dst); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE mmm_arm(U32 matrixC_N, + U32 matrixC_M, + U32 matrixA_K, + DataType dt, + bool 
transposeA, + const void *matrixAData, + const void *matrixBData, + void *tmp, + void *matrixCData, + Arch arch) +{ + EE ret = SUCCESS; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: { + ret = mmm_fp16(matrixC_N, matrixC_M, matrixA_K, transposeA, (F16 *)matrixAData, + (F16 *)matrixBData, (F16 *)tmp, (F16 *)matrixCData, arch); + break; + } +#endif +#ifdef _USE_FP32 + case DT_F32: { +#ifdef __aarch64__ + ret = mmm_fp32_V8(matrixC_N, matrixC_M, matrixA_K, transposeA, (F32 *)matrixAData, + (F32 *)matrixBData, (F32 *)tmp, (F32 *)matrixCData); +#else + ret = mmm_fp32_V7(matrixC_N, matrixC_M, matrixA_K, transposeA, (F32 *)matrixAData, + (F32 *)matrixBData, (F32 *)tmp, (F32 *)matrixCData); +#endif + break; + } +#endif +#ifdef _USE_INT8 + case DT_I8: { + ret = mmm_int8(matrixC_N, matrixC_M, matrixA_K, transposeA, (INT8 *)matrixAData, + (INT8 *)matrixBData, (INT8 *)tmp, (I32 *)matrixCData, arch); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/blas_enhance/src/cpu/arm/mvm.cpp b/compute/blas_enhance/src/cpu/arm/mvm.cpp new file mode 100644 index 00000000..1329ceaa --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/mvm.cpp @@ -0,0 +1,128 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
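+// Typical call sequence for the dispatchers in this file (a sketch only;
+// descriptor handling simplified): the weight matrix is repacked once with
+// matrix_vector_multiply_transform_weight_arm, and mvm_arm is then invoked
+// per call with the transformed descriptor's data type and format:
+//
+//     TensorDesc tranDesc;
+//     matrix_vector_multiply_transform_weight_arm(desc, matrix, &tranDesc, packed);
+//     mvm_arm(row, col, tranDesc.dt, tranDesc.df, packed, vector, tmp, result, arch);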
+ +#include "error.h" +#include "types.h" +#include "blas_enhance.h" +#include "cpu/arm/blas_arm.h" +#ifdef _USE_FP16 +#include "cpu/arm/fp16/blas_fp16.h" +#endif +#ifdef _USE_FP32 +#include "cpu/arm/fp32/blas_fp32.h" +#endif +#ifdef _USE_INT8 +#include "cpu/arm/int8/blas_int8.h" +#endif + +EE matrix_vector_multiply_tmp_bytes_arm(bool transpose, DataType dt, U32 *bytes) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + *bytes = 0; + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + *bytes = 0; + break; +#endif +#ifdef _USE_INT8 + case DT_I8: { + if (transpose) { + *bytes = 64 * sizeof(I32); + } + break; + } +#endif + default: + break; + } + return SUCCESS; +} + +EE matrix_vector_multiply_transform_weight_arm( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) +{ + if (desc.df == targetFormat4mvmMatrix(desc.dt)) { + return SUCCESS; + } + EE ret = SUCCESS; + switch (desc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = matrix_vector_multiply_transform_weight_fp32(desc, (F32 *)src, (F32 *)dst); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = matrix_vector_multiply_transform_weight_fp16(desc, (F16 *)src, (F16 *)dst); + break; + } +#endif +#ifdef _USE_INT8 + case DT_I8: { + ret = matrix_vector_multiply_transform_weight_int8(desc, (INT8 *)src, (INT8 *)dst); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + *descTran = desc; + if (DF_TRANSPOSE == desc.df) { + std::swap((*descTran).dims[0], (*descTran).dims[1]); + } + descTran->df = targetFormat4mvmMatrix(desc.dt); + return ret; +} + +EE mvm_arm(U32 row, + U32 col, + DataType dt, + DataFormat df, + const void *matrix, + const void *vector, + void *tmp, + void *result, + Arch arch) +{ + EE ret = SUCCESS; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + ret = mvm_fp16(row, col, df, (F16 *)matrix, (F16 *)vector, (F16 *)result, arch); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + ret = mvm_fp32(row, col, df, (F32 *)matrix, (F32 *)vector, (F32 *)result); + break; +#endif +#ifdef _USE_INT8 + case DT_I8: + ret = mvm_int8(row, col, df, (INT8 *)matrix, (INT8 *)vector, (I32 *)tmp, (I32 *)result); + break; +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/blas_enhance/src/cpu/general/axpby.cpp b/compute/blas_enhance/src/cpu/general/axpby.cpp new file mode 100644 index 00000000..a24a23fe --- /dev/null +++ b/compute/blas_enhance/src/cpu/general/axpby.cpp @@ -0,0 +1,45 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "error.h"
+#include "types.h"
+#include "cpu/general/blas_general.h"
+
+template <typename T>
+inline void axpby(U32 len, F32 a, T *x, F32 b, T *y)
+{
+    for (U32 i = 0; i < len; i++) {
+        y[i] = a * x[i] + b * y[i];
+    }
+}
+
+EE axpby_general(U32 len, DataType dt, F32 a, const void *x, F32 b, void *y)
+{
+    EE ret = SUCCESS;
+    switch (dt) {
+#ifdef _USE_FP16
+        case DT_F16:
+            axpby(len, a, (F16 *)x, b, (F16 *)y);
+            break;
+#endif
+#ifdef _USE_FP32
+        case DT_F32:
+            axpby(len, a, (F32 *)x, b, (F32 *)y);
+            break;
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/blas_enhance/src/cpu/general/blas_general.h b/compute/blas_enhance/src/cpu/general/blas_general.h
new file mode 100644
index 00000000..aa1a9b44
--- /dev/null
+++ b/compute/blas_enhance/src/cpu/general/blas_general.h
@@ -0,0 +1,40 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_BLAS_GENERAL
+#define _H_BLAS_GENERAL
+
+#include "sys.h"
+#include "types.h"
+
+EE mvm_general(U32 row,
+    U32 col,
+    DataType dt,
+    bool transpose,
+    const void *matrix,
+    const void *vector,
+    void *result);
+
+EE mmm_general(U32 matrixC_N,
+    U32 matrixC_M,
+    U32 matrixA_K,
+    bool transposeA,
+    bool transposeB,
+    DataType matrixADataType,
+    const void *matrixAData,
+    const void *matrixBData,
+    void *matrixCData);
+
+EE axpby_general(U32 len, DataType dt, F32 a, const void *x, F32 b, void *y);
+
+#endif
diff --git a/compute/blas_enhance/src/cpu/general/mmm.cpp b/compute/blas_enhance/src/cpu/general/mmm.cpp
new file mode 100644
index 00000000..c041110c
--- /dev/null
+++ b/compute/blas_enhance/src/cpu/general/mmm.cpp
@@ -0,0 +1,82 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "error.h"
+#include "types.h"
+#include "cpu/general/blas_general.h"
+
+template <typename T1, typename T2>
+inline void mmm(
+    U32 N, U32 M, U32 K, bool transposeA, bool transposeB, T1 *matrixA, T1 *matrixB, T2 *matrixC)
+{
+    for (U32 i = 0; i < M; i++) {
+        for (U32 n = 0; n < N; n++) {
+            F32 value = 0;
+            for (U32 j = 0; j < K; j++) {
+                U32 indexA = 0, indexB = 0;
+                if (transposeA) {
+                    indexA = j * M + i;
+                } else {
+                    indexA = i * K + j;
+                }
+                if (transposeB) {
+                    indexB = n * K + j;
+                } else {
+                    indexB = j * N + n;
+                }
+                value += matrixA[indexA] * matrixB[indexB];
+            }
+            matrixC[i * N + n] += value;
+        }
+    }
+}
+
+EE mmm_general(U32 matrixC_N,
+    U32 matrixC_M,
+    U32 matrixA_K,
+    bool transposeA,
+    bool transposeB,
+    DataType dt,
+    const void *matrixAData,
+    const void *matrixBData,
+    void *matrixCData)
+{
+    EE ret = SUCCESS;
+    switch (dt) {
+#ifdef _USE_FP16
+        case DT_F16: {
+            mmm(matrixC_N, matrixC_M, matrixA_K, transposeA, transposeB,
+                (F16 *)matrixAData, (F16 *)matrixBData, (F16 *)matrixCData);
+            break;
+        }
+#endif
+#ifdef _USE_INT8
+        case DT_I8: {
+            mmm(matrixC_N, matrixC_M, matrixA_K, transposeA, transposeB,
+                (INT8 *)matrixAData, (INT8 *)matrixBData, (I32 *)matrixCData);
+            break;
+        }
+#endif
+#ifdef _USE_FP32
+        case DT_F32: {
+            mmm(matrixC_N, matrixC_M, matrixA_K, transposeA, transposeB,
+                (F32 *)matrixAData, (F32 *)matrixBData, (F32 *)matrixCData);
+            break;
+        }
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/blas_enhance/src/cpu/general/mvm.cpp b/compute/blas_enhance/src/cpu/general/mvm.cpp
new file mode 100644
index 00000000..7decf55c
--- /dev/null
+++ b/compute/blas_enhance/src/cpu/general/mvm.cpp
@@ -0,0 +1,65 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "error.h"
+#include "types.h"
+#include "cpu/general/blas_general.h"
+
+template <typename T1, typename T2>
+inline void mvm(U32 M, U32 K, bool transpose, T1 *mat, T1 *vec, T2 *res)
+{
+    if (!transpose) {
+        for (U32 i = 0; i < M; i++) {
+            F32 out_f = 0;
+            for (U32 j = 0; j < K; j++) {
+                out_f += mat[i * K + j] * vec[j];
+            }
+            res[i] += out_f;
+        }
+    } else {
+        for (U32 i = 0; i < M; i++) {
+            F32 out_f = 0;
+            for (U32 j = 0; j < K; j++) {
+                out_f += mat[j * M + i] * vec[j];
+            }
+            res[i] += out_f;
+        }
+    }
+}
+
+EE mvm_general(
+    U32 row, U32 col, DataType dt, bool transpose, const void *matrix, const void *vector, void *result)
+{
+    EE ret = SUCCESS;
+    switch (dt) {
+#ifdef _USE_FP16
+        case DT_F16:
+            mvm(row, col, transpose, (F16 *)matrix, (F16 *)vector, (F16 *)result);
+            break;
+#endif
+#ifdef _USE_INT8
+        case DT_I8:
+            mvm(row, col, transpose, (INT8 *)matrix, (INT8 *)vector, (I32 *)result);
+            break;
+#endif
+#ifdef _USE_FP32
+        case DT_F32:
+            mvm(row, col, transpose, (F32 *)matrix, (F32 *)vector, (F32 *)result);
+            break;
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/blas_enhance/src/cpu/x86/blas_x86.h b/compute/blas_enhance/src/cpu/x86/blas_x86.h
new file mode 100644
index 00000000..ff6a3792
--- /dev/null
+++ b/compute/blas_enhance/src/cpu/x86/blas_x86.h
@@ -0,0 +1,47 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ +#ifndef _H_BLAS_X86 +#define _H_BLAS_X86 + +#include "error.h" +#include "sys.h" +#include "tensor_desc.h" + +EE matrix_vector_multiply_tmp_bytes_x86(bool transpose, DataType dt, U32 *bytes); + +EE mvm_x86(U32 row, + U32 col, + DataType dt, + bool transpose, + const void *matrix, + const void *vector, + void *result); + +EE matrix_matrix_multiply_tmp_bytes_x86( + U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N, DataType dt, U32 *bytes); + +EE matrix_matrix_multiply_transform_rhs_x86( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst); + +EE mmm_x86(U32 matrixC_N, + U32 matrixC_M, + U32 matrixA_K, + DataType matrixADataType, + bool transposeA, + const void *matrixAData, + const void *matrixBData, + void *tmp, + void *matrixCData); + +#endif diff --git a/compute/blas_enhance/src/cpu/x86/fp32/blas_fp32.h b/compute/blas_enhance/src/cpu/x86/fp32/blas_fp32.h new file mode 100644 index 00000000..1f839440 --- /dev/null +++ b/compute/blas_enhance/src/cpu/x86/fp32/blas_fp32.h @@ -0,0 +1,46 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_BLAS_FP32 +#define _H_BLAS_FP32 + +#include "sys.h" +#include "types.h" +#include "error.h" +#include "tensor_desc.h" + +void mvm_col_fp32(U32 row, U32 col, F32 *matrix, F32 *vector, F32 *result); + +void mvm_row_fp32(U32 row, U32 col, F32 *matrix, F32 *vector, F32 *result); + +inline EE mvm_avx2_fp32(U32 row, U32 col, bool transpose, F32 *matrix, F32 *vector, F32 *result) +{ + if (transpose) { + mvm_col_fp32(row, col, matrix, vector, result); + } else { + mvm_row_fp32(row, col, matrix, vector, result); + } + return SUCCESS; +} + +void matrix_matrix_multiply_tmp_bytes_fp32( + U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes); + +EE matrix_matrix_multiply_transform_rhsN_fp32(TensorDesc desc, F32 *src, F32 *dst); + +EE matrix_matrix_multiply_transform_rhsT_fp32(TensorDesc desc, F32 *src, F32 *dst); + +EE mmm_avx2_fp32( + int M, int N, int K, bool transposeA, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *result); + +#endif \ No newline at end of file diff --git a/compute/blas_enhance/src/cpu/x86/fp32/mmm_avx2.cpp b/compute/blas_enhance/src/cpu/x86/fp32/mmm_avx2.cpp new file mode 100644 index 00000000..ae62add8 --- /dev/null +++ b/compute/blas_enhance/src/cpu/x86/fp32/mmm_avx2.cpp @@ -0,0 +1,1445 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/fp32/blas_fp32.h" +#include "error.h" +#include "types.h" + +#define UNROLL_K 4 +#define UNROLL_N 24 +#define UNROLL_M 4 +#define BOLCK_M_DIM 768 +#define BOLCK_K_DIM 768 +#define align_addr(addr, unit) (((uintptr_t)addr + unit - 1) / unit * unit) + +typedef void (*kernel_func)( + U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 ldc); + +void matrix_matrix_multiply_tmp_bytes_fp32( + U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes) +{ + *bytes = row1 * col1 + row2 * col2; + *bytes *= sizeof(dt); + *bytes += 32; +} + +void matrix1_trans(U32 size, U32 blockK, U32 K, F32 *src, F32 *dst) +{ + U32 remain = size % 4; + size = size / 4 * 4; + __m128i vindex = _mm_set_epi32(K * 3, K * 2, K, 0); + for (U32 i = 0; i < blockK; ++i) { + U32 j; + for (j = 0; j < size; j += 4) { + if (i % 16 == 0) { + _mm_prefetch(src + i + j * K + 16, _MM_HINT_NTA); + _mm_prefetch(src + i + (j + 1) * K + 16, _MM_HINT_NTA); + _mm_prefetch(src + i + (j + 2) * K + 16, _MM_HINT_NTA); + _mm_prefetch(src + i + (j + 3) * K + 16, _MM_HINT_NTA); + } + _mm_store_ps(dst, _mm_i32gather_ps(src + i + j * K, vindex, 4)); + dst += 4; + } + for (; j < remain; ++j) { + if (i % 16 == 0) { + _mm_prefetch(src + i + (j + size) * K + 16, _MM_HINT_NTA); + } + *(dst++) = *(src + i + j * K); + } + } +} + +void matrix2_trans(U32 size, U32 blockK, U32 M, F32 *src, F32 *dst) +{ + for (U32 i = 0; i < blockK; i++) { + for (U32 j = 0; j < size; j += 16) { + _mm_prefetch(src + M + j, _MM_HINT_NTA); + } + memcpy(dst, src, size * sizeof(F32)); + dst += size; + src += M; + } +} + +EE matrix_matrix_multiply_transform_rhsN_fp32(TensorDesc desc, F32 *src, F32 *dst) +{ + DataType dt; + DataFormat df; + U32 N, K, blockSizeK, unrollSizeN; + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); + F32 unrollSize[4] = {4, 8, 16, 24}; + + // buffer addr algined to 32 + F32 *packB = (F32 *)align_addr(dst, 32); + for (U32 bk = 0; bk < K; bk += blockSizeK) { + blockSizeK = UNI_MIN(BOLCK_K_DIM, K - bk); + for (U32 un = 0; un < N; un += unrollSizeN) { + unrollSizeN = UNI_MIN(UNROLL_N, N - un); + unrollSizeN = UNI_MIN(unrollSize[unrollSizeN / 8], unrollSizeN); + matrix2_trans(unrollSizeN, blockSizeK, N, src + un, packB); + packB += unrollSizeN * blockSizeK; + } + src += blockSizeK * N; + } + return SUCCESS; +} + +EE matrix_matrix_multiply_transform_rhsT_fp32(TensorDesc desc, F32 *src, F32 *dst) +{ + DataType dt; + DataFormat df; + U32 N, K, blockSizeK, unrollSizeN; + 
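+    // As in the rhsN transform above, B is repacked one K-block (at most
+    // BOLCK_K_DIM rows) at a time into column panels whose widths come from
+    // the {4, 8, 16, 24} table: unrollSize[unrollSizeN >> 3] maps a remaining
+    // width of 4..7 to 4, 8..15 to 8, 16..23 to 16 and 24 to 24 (smaller tails
+    // pass through the outer UNI_MIN), so each AVX2 kernel streams a single
+    // contiguous, 32-byte-aligned panel.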
CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); + F32 unrollSize[4] = {4, 8, 16, 24}; + + // buffer addr aligned to 32 + F32 *packB = (F32 *)align_addr(dst, 32); + for (U32 bk = 0; bk < K; bk += blockSizeK) { + blockSizeK = UNI_MIN(BOLCK_K_DIM, K - bk); + for (U32 un = 0; un < N; un += unrollSizeN) { + unrollSizeN = UNI_MIN(UNROLL_N, N - un); + unrollSizeN = UNI_MIN(unrollSize[unrollSizeN >> 3], unrollSizeN); + matrix1_trans(unrollSizeN, blockSizeK, K, src + un * K, packB); + packB += unrollSizeN * blockSizeK; + } + src += blockSizeK; + } + return SUCCESS; +} + +void mmm_avx2_4x24_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__("vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" + "vxorps %%ymm8, %%ymm8, %%ymm8 \n\t" + "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t" + "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" + "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_4x24_end \n\t" + ".align 16 \n\t" + ".k_loop_4x24: \n\t" + + "prefetcht0 0x140(%1) \n\t" + "prefetcht0 0x180(%1) \n\t" + "prefetcht0 0x140(%2) \n\t" + + "vmovaps (%1), %%ymm12 \n\t" + "vmovaps 0x20(%1), %%ymm13 \n\t" + "vmovaps 0x40(%1), %%ymm14 \n\t" + "vbroadcastss 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vbroadcastss 0x4(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vbroadcastss 0x8(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0xC(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "prefetcht0 0x1C0(%1) \n\t" + + "vmovaps 0x60(%1), %%ymm12 \n\t" + "vmovaps 0x80(%1), %%ymm13 \n\t" + "vmovaps 0xA0(%1), %%ymm14 \n\t" + "vbroadcastss 0x10(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vbroadcastss 0x14(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vbroadcastss 0x18(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x1C(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "prefetcht0 0x200(%1) \n\t" + "prefetcht0 0x240(%1) \n\t" + + "vmovaps 0xC0(%1), %%ymm12 \n\t" + "vmovaps 0xE0(%1), %%ymm13 \n\t" + "vmovaps 0x100(%1), %%ymm14 \n\t" + "vbroadcastss 0x20(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vbroadcastss 0x24(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, 
%%ymm14, %%ymm5 \n\t" + "vbroadcastss 0x28(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x2C(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "prefetcht0 0x280(%1) \n\t" + + "vmovaps 0x120(%1), %%ymm12 \n\t" + "vmovaps 0x140(%1), %%ymm13 \n\t" + "vmovaps 0x160(%1), %%ymm14 \n\t" + "vbroadcastss 0x30(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vbroadcastss 0x34(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vbroadcastss 0x38(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x3C(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add $0x180, %1 \n\t" + "add $0x40, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_4x24 \n\t" + ".align 16 \n\t" + ".k_loop_4x24_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_4x24_remain_end \n\t" + ".k_loop_4x24_remain: \n\t" + "vmovaps (%1), %%ymm12 \n\t" + "vmovaps 0x20(%1), %%ymm13 \n\t" + "vmovaps 0x40(%1), %%ymm14 \n\t" + "vbroadcastss 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vbroadcastss 0x4(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vbroadcastss 0x8(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0xC(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + "add $0x60, %1 \n\t" + "add $0x10, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_4x24_remain \n\t" + + ".k_loop_4x24_remain_end: \n\t" + "mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + "prefetcht0 0x40(%3) \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "prefetcht0 0x40(%3, %%rax) \n\t" + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" + "vaddps 0x40(%3), %%ymm2, %%ymm2 \n\t" + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + "vmovups %%ymm2, 0x40(%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "prefetcht0 0x40(%3, %%rax) \n\t" + "vaddps (%3), %%ymm3, %%ymm3 \n\t" + "vaddps 0x20(%3), %%ymm4, %%ymm4 \n\t" + "vaddps 0x40(%3), %%ymm5, %%ymm5 \n\t" + "vmovups %%ymm3, (%3) \n\t" + "vmovups %%ymm4, 0x20(%3) \n\t" + "vmovups %%ymm5, 0x40(%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "prefetcht0 0x40(%3, %%rax) \n\t" + "vaddps (%3), %%ymm6, %%ymm6 \n\t" + "vaddps 0x20(%3), %%ymm7, %%ymm7 \n\t" + "vaddps 0x40(%3), %%ymm8, %%ymm8 \n\t" + "vmovups %%ymm6, (%3) \n\t" + "vmovups %%ymm7, 0x20(%3) \n\t" + "vmovups %%ymm8, 0x40(%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 0x40(%3) \n\t" + "vaddps (%3), %%ymm9, %%ymm9 \n\t" + "vaddps 0x20(%3), 
%%ymm10, %%ymm10 \n\t" + "vaddps 0x40(%3), %%ymm11, %%ymm11 \n\t" + "vmovups %%ymm9, (%3) \n\t" + "vmovups %%ymm10, 0x20(%3) \n\t" + "vmovups %%ymm11, 0x40(%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", + "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", + "%ymm13", "%ymm14", "%ymm15", "memory"); +} + +void mmm_avx2_4x16_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__("vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_4x16_end \n\t" + ".align 16 \n\t" + ".k_loop_4x16: \n\t" + + "prefetcht0 0x140(%1) \n\t" + "prefetcht0 0x140(%2) \n\t" + + "vmovaps (%1), %%ymm8 \n\t" + "vmovaps 0x20(%1), %%ymm9 \n\t" + "vbroadcastss 0x0(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" + "vbroadcastss 0x4(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" + "vbroadcastss 0x8(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" + "vbroadcastss 0xC(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" + + "prefetcht0 0x180(%1) \n\t" + + "vmovaps 0x40(%1), %%ymm8 \n\t" + "vmovaps 0x60(%1), %%ymm9 \n\t" + "vbroadcastss 0x10(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" + "vbroadcastss 0x14(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" + "vbroadcastss 0x18(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" + "vbroadcastss 0x1C(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" + + "prefetcht0 0x1C0(%1) \n\t" + + "vmovaps 0x80(%1), %%ymm8 \n\t" + "vmovaps 0xA0(%1), %%ymm9 \n\t" + "vbroadcastss 0x20(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" + "vbroadcastss 0x24(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" + "vbroadcastss 0x28(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" + "vbroadcastss 0x2C(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" + + "prefetcht0 0x200(%1) \n\t" + + "vmovaps 0xC0(%1), %%ymm8 \n\t" + "vmovaps 0xE0(%1), %%ymm9 \n\t" + "vbroadcastss 0x30(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" + "vbroadcastss 0x34(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" + "vbroadcastss 0x38(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" + "vbroadcastss 0x3C(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" + + "add $0x100, %1 \n\t" + "add $0x40, %2 \n\t" + + "sub $1, 
%%ecx \n\t" + "jg .k_loop_4x16 \n\t" + ".align 16 \n\t" + ".k_loop_4x16_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_4x16_remain_end \n\t" + ".k_loop_4x16_remain: \n\t" + "vmovaps (%1), %%ymm8 \n\t" + "vmovaps 0x20(%1), %%ymm9 \n\t" + "vbroadcastss 0x0(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" + "vbroadcastss 0x4(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" + "vbroadcastss 0x8(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" + "vbroadcastss 0xC(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" + "add $0x40, %1 \n\t" + "add $0x10, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_4x16_remain \n\t" + + ".k_loop_4x16_remain_end: \n\t" + "mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%ymm2, %%ymm2 \n\t" + "vaddps 0x20(%3), %%ymm3, %%ymm3 \n\t" + "vmovups %%ymm2, (%3) \n\t" + "vmovups %%ymm3, 0x20(%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%ymm4, %%ymm4 \n\t" + "vaddps 0x20(%3), %%ymm5, %%ymm5 \n\t" + "vmovups %%ymm4, (%3) \n\t" + "vmovups %%ymm5, 0x20(%3) \n\t" + "add %%rax, %3 \n\t" + "vaddps (%3), %%ymm6, %%ymm6 \n\t" + "vaddps 0x20(%3), %%ymm7, %%ymm7 \n\t" + "vmovups %%ymm6, (%3) \n\t" + "vmovups %%ymm7, 0x20(%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", + "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "memory"); +} + +void mmm_avx2_4x8_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_4x8_end \n\t" + ".align 16 \n\t" + ".k_loop_4x8: \n\t" + + "prefetcht0 0x140(%1) \n\t" + "prefetcht0 0x140(%2) \n\t" + + "vmovaps (%1), %%ymm4 \n\t" + "vbroadcastss 0x0(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" + "vbroadcastss 0x4(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" + "vbroadcastss 0x8(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" + "vbroadcastss 0xC(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" + + "vmovaps 0x20(%1), %%ymm4 \n\t" + "vbroadcastss 0x10(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" + "vbroadcastss 0x14(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" + "vbroadcastss 0x18(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" + "vbroadcastss 0x1C(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" + + "prefetcht0 0x180(%1) \n\t" + + "vmovaps 0x40(%1), %%ymm4 \n\t" + "vbroadcastss 0x20(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" + "vbroadcastss 0x24(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" + "vbroadcastss 0x28(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" + "vbroadcastss 0x2C(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" + + "vmovaps 0x60(%1), %%ymm4 \n\t" + 
"vbroadcastss 0x30(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" + "vbroadcastss 0x34(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" + "vbroadcastss 0x38(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" + "vbroadcastss 0x3C(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" + + "add $0x80, %1 \n\t" + "add $0x40, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_4x8 \n\t" + ".align 16 \n\t" + ".k_loop_4x8_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_4x8_remain_end \n\t" + ".k_loop_4x8_remain: \n\t" + "vmovaps (%1), %%ymm4 \n\t" + "vbroadcastss 0x0(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" + "vbroadcastss 0x4(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" + "vbroadcastss 0x8(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" + "vbroadcastss 0xC(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" + "add $0x20, %1 \n\t" + "add $0x10, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_4x8_remain \n\t" + + ".k_loop_4x8_remain_end: \n\t" + "mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vmovups %%ymm0, (%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%ymm1, %%ymm1 \n\t" + "vmovups %%ymm1, (%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%ymm2, %%ymm2 \n\t" + "vmovups %%ymm2, (%3) \n\t" + "add %%rax, %3 \n\t" + "vaddps (%3), %%ymm3, %%ymm3 \n\t" + "vmovups %%ymm3, (%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "memory"); +} + +void mmm_avx2_4x4_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__( + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_4x4_end \n\t" + ".align 16 \n\t" + ".k_loop_4x4: \n\t" + + "prefetcht0 0x140(%1) \n\t" + "prefetcht0 0x140(%2) \n\t" + + "vmovaps (%1), %%xmm4 \n\t" + "vbroadcastss 0x0(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" + "vbroadcastss 0x4(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 \n\t" + "vbroadcastss 0x8(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" + "vbroadcastss 0xC(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" + + "vmovaps 0x10(%1), %%xmm4 \n\t" + "vbroadcastss 0x10(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" + "vbroadcastss 0x14(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 \n\t" + "vbroadcastss 0x18(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" + "vbroadcastss 0x1C(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" + + "vmovaps 0x20(%1), %%xmm4 \n\t" + "vbroadcastss 0x20(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" + "vbroadcastss 0x24(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 \n\t" + "vbroadcastss 0x28(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" + "vbroadcastss 0x2C(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" + + "vmovaps 0x30(%1), %%xmm4 \n\t" + "vbroadcastss 0x30(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" + "vbroadcastss 0x34(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 \n\t" + "vbroadcastss 0x38(%2), %%xmm5 \n\t" + 
"vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" + "vbroadcastss 0x3C(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" + + "add $0x40, %1 \n\t" + "add $0x40, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_4x4 \n\t" + ".align 16 \n\t" + ".k_loop_4x4_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_4x4_remain_end \n\t" + + ".k_loop_4x4_remain: \n\t" + "vmovaps (%1), %%xmm4 \n\t" + "vbroadcastss 0x0(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" + "vbroadcastss 0x4(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 \n\t" + "vbroadcastss 0x8(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" + "vbroadcastss 0xC(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" + "add $0x10, %1 \n\t" + "add $0x10, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_4x4_remain \n\t" + + ".k_loop_4x4_remain_end: \n\t" + "mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "vmovups %%xmm0, (%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%xmm1, %%xmm1 \n\t" + "vmovups %%xmm1, (%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%xmm2, %%xmm2 \n\t" + "vmovups %%xmm2, (%3) \n\t" + "add %%rax, %3 \n\t" + "vaddps (%3), %%xmm3, %%xmm3 \n\t" + "vmovups %%xmm3, (%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "memory"); +} + +void mmm_avx2_2x24_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__("mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_2x24_end \n\t" + + ".align 16 \n\t" + ".k_loop_2x24: \n\t" + + "prefetcht0 0x140(%1) \n\t" + "prefetcht0 0x180(%1) \n\t" + + "vmovaps (%1), %%ymm6 \n\t" + "vmovaps 0x20(%1), %%ymm7 \n\t" + "vmovaps 0x40(%1), %%ymm8 \n\t" + "vbroadcastss 0x0(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" + "vbroadcastss 0x4(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + "vfmadd231ps %%ymm9, %%ymm8, %%ymm5 \n\t" + + "prefetcht0 0x1C0(%1) \n\t" + + "vmovaps 0x60(%1), %%ymm6 \n\t" + "vmovaps 0x80(%1), %%ymm7 \n\t" + "vmovaps 0xA0(%1), %%ymm8 \n\t" + "vbroadcastss 0x8(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" + "vbroadcastss 0xC(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + "vfmadd231ps %%ymm9, %%ymm8, %%ymm5 \n\t" + + "prefetcht0 0x200(%1) \n\t" + "prefetcht0 0x240(%1) \n\t" + + "vmovaps 0xC0(%1), %%ymm6 \n\t" + "vmovaps 0xE0(%1), %%ymm7 \n\t" + "vmovaps 0x100(%1), %%ymm8 \n\t" + "vbroadcastss 0x10(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" + "vbroadcastss 0x14(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + "vfmadd231ps 
%%ymm9, %%ymm8, %%ymm5 \n\t" + + "prefetcht0 0x280(%1) \n\t" + + "vmovaps 0x120(%1), %%ymm6 \n\t" + "vmovaps 0x140(%1), %%ymm7 \n\t" + "vmovaps 0x160(%1), %%ymm8 \n\t" + "vbroadcastss 0x18(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" + "vbroadcastss 0x1C(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + "vfmadd231ps %%ymm9, %%ymm8, %%ymm5 \n\t" + + "add $0x180, %1 \n\t" + "add $0x20, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_2x24 \n\t" + ".align 16 \n\t" + ".k_loop_2x24_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_2x24_remain_end \n\t" + + ".align 16 \n\t" + ".k_loop_2x24_remain: \n\t" + "vmovaps (%1), %%ymm6 \n\t" + "vmovaps 0x20(%1), %%ymm7 \n\t" + "vmovaps 0x40(%1), %%ymm8 \n\t" + "vbroadcastss 0x0(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" + "vbroadcastss 0x4(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + "vfmadd231ps %%ymm9, %%ymm8, %%ymm5 \n\t" + "add $0x60, %1 \n\t" + "add $0x8, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_2x24_remain \n\t" + + ".align 16 \n\t" + ".k_loop_2x24_remain_end: \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "prefetcht0 0x40(%3, %%rax) \n\t" + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" + "vaddps 0x40(%3), %%ymm2, %%ymm2 \n\t" + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + "vmovups %%ymm2, 0x40(%3) \n\t" + "add %%rax, %3 \n\t" + "vaddps (%3), %%ymm3, %%ymm3 \n\t" + "vaddps 0x20(%3), %%ymm4, %%ymm4 \n\t" + "vaddps 0x40(%3), %%ymm5, %%ymm5 \n\t" + "vmovups %%ymm3, (%3) \n\t" + "vmovups %%ymm4, 0x20(%3) \n\t" + "vmovups %%ymm5, 0x40(%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", + "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "memory"); +} + +void mmm_avx2_2x16_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__("mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_2x16_end \n\t" + + ".align 16 \n\t" + ".k_loop_2x16: \n\t" + + "prefetcht0 0x140(%1) \n\t" + + "vmovaps (%1), %%ymm6 \n\t" + "vmovaps 0x20(%1), %%ymm7 \n\t" + "vbroadcastss 0x0(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vbroadcastss 0x4(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + + "prefetcht0 0x180(%1) \n\t" + + "vmovaps 0x40(%1), %%ymm6 \n\t" + "vmovaps 0x60(%1), %%ymm7 \n\t" + "vbroadcastss 0x8(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vbroadcastss 0xC(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + + "prefetcht0 0x1C0(%1) \n\t" + + "vmovaps 0x80(%1), %%ymm6 \n\t" + "vmovaps 0xA0(%1), %%ymm7 \n\t" + "vbroadcastss 0x10(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vbroadcastss 0x14(%2), %%ymm9 \n\t" 
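+                         // two C rows per step: ymm0/ymm1 accumulate row 0, ymm3/ymm4 accumulate row 1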
+ "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + + "prefetcht0 0x200(%1) \n\t" + + "vmovaps 0xC0(%1), %%ymm6 \n\t" + "vmovaps 0xE0(%1), %%ymm7 \n\t" + "vbroadcastss 0x18(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vbroadcastss 0x1C(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + + "add $0x100, %1 \n\t" + "add $0x20, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_2x16 \n\t" + ".align 16 \n\t" + ".k_loop_2x16_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_2x16_remain_end \n\t" + + ".align 16 \n\t" + ".k_loop_2x16_remain: \n\t" + "vmovaps (%1), %%ymm6 \n\t" + "vmovaps 0x20(%1), %%ymm7 \n\t" + "vbroadcastss 0x0(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vbroadcastss 0x4(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + "add $0x40, %1 \n\t" + "add $0x8, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_2x16_remain \n\t" + + ".align 16 \n\t" + ".k_loop_2x16_remain_end: \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%ymm3, %%ymm3 \n\t" + "vaddps 0x20(%3), %%ymm4, %%ymm4 \n\t" + "vmovups %%ymm3, (%3) \n\t" + "vmovups %%ymm4, 0x20(%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm3", "%ymm4", "%ymm6", + "%ymm7", "%ymm9", "memory"); +} + +void mmm_avx2_2x8_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__("mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_2x8_end \n\t" + + ".align 16 \n\t" + ".k_loop_2x8: \n\t" + + "prefetcht0 0x140(%1) \n\t" + "vmovaps (%1), %%ymm2 \n\t" + "vbroadcastss 0x0(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" + "vbroadcastss 0x4(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" + + "vmovaps 0x20(%1), %%ymm2 \n\t" + "vbroadcastss 0x8(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" + "vbroadcastss 0xC(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" + + "prefetcht0 0x180(%1) \n\t" + "vmovaps 0x40(%1), %%ymm2 \n\t" + "vbroadcastss 0x10(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" + "vbroadcastss 0x14(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" + + "vmovaps 0x60(%1), %%ymm2 \n\t" + "vbroadcastss 0x18(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" + "vbroadcastss 0x1C(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" + + "add $0x80, %1 \n\t" + "add $0x20, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_2x8 \n\t" + ".align 16 \n\t" + ".k_loop_2x8_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_2x8_remain_end \n\t" + + ".align 16 \n\t" + ".k_loop_2x8_remain: \n\t" + "vmovaps (%1), %%ymm2 \n\t" + "vbroadcastss 0x0(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" + "vbroadcastss 0x4(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" + "add $0x20, %1 \n\t" + "add $0x8, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_2x8_remain 
\n\t" + + ".align 16 \n\t" + ".k_loop_2x8_remain_end: \n\t" + + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vmovups %%ymm0, (%3) \n\t" + "add %%rax, %3 \n\t" + "vaddps (%3), %%ymm1, %%ymm1 \n\t" + "vmovups %%ymm1, (%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory"); +} + +void mmm_avx2_2x4_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__("mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_2x4_end \n\t" + + ".align 16 \n\t" + ".k_loop_2x4: \n\t" + + "prefetcht0 0x140(%1) \n\t" + "vmovaps (%1), %%xmm2 \n\t" + "vbroadcastss 0x0(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" + "vbroadcastss 0x4(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" + + "vmovaps 0x10(%1), %%xmm2 \n\t" + "vbroadcastss 0x8(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" + "vbroadcastss 0xC(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" + + "vmovaps 0x20(%1), %%xmm2 \n\t" + "vbroadcastss 0x10(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" + "vbroadcastss 0x14(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" + + "vmovaps 0x30(%1), %%xmm2 \n\t" + "vbroadcastss 0x18(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" + "vbroadcastss 0x1C(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" + + "add $0x40, %1 \n\t" + "add $0x20, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_2x4 \n\t" + ".align 16 \n\t" + ".k_loop_2x4_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_2x4_remain_end \n\t" + + ".align 16 \n\t" + ".k_loop_2x4_remain: \n\t" + "vmovaps (%1), %%xmm2 \n\t" + "vbroadcastss 0x0(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" + "vbroadcastss 0x4(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" + "add $0x10, %1 \n\t" + "add $0x8, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_2x4_remain \n\t" + + ".align 16 \n\t" + ".k_loop_2x4_remain_end: \n\t" + + "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "vmovups %%xmm0, (%3) \n\t" + "add %%rax, %3 \n\t" + "vaddps (%3), %%xmm1, %%xmm1 \n\t" + "vmovups %%xmm1, (%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "memory"); +} + +void mmm_avx2_1x24_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__("mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_1x24_end \n\t" + + ".align 16 \n\t" + ".k_loop_1x24: \n\t" + + "prefetcht0 0x140(%1) \n\t" + "prefetcht0 0x180(%1) \n\t" + + "vmovaps (%1), %%ymm3 \n\t" + "vmovaps 0x20(%1), %%ymm4 \n\t" + "vmovaps 0x40(%1), %%ymm5 \n\t" + "vbroadcastss 0x0(%2), %%ymm6 \n\t" + "vfmadd231ps %%ymm6, %%ymm3, %%ymm0 \n\t" + "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" + "vfmadd231ps %%ymm6, %%ymm5, %%ymm2 \n\t" + + "prefetcht0 0x1C0(%1) \n\t" + + "vmovaps 0x60(%1), %%ymm3 \n\t" + "vmovaps 0x80(%1), %%ymm4 \n\t" + "vmovaps 0xA0(%1), %%ymm5 \n\t" + "vbroadcastss 0x4(%2), %%ymm6 \n\t" + "vfmadd231ps %%ymm6, %%ymm3, %%ymm0 \n\t" + "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" + "vfmadd231ps 
%%ymm6, %%ymm5, %%ymm2 \n\t" + + "prefetcht0 0x200(%1) \n\t" + "prefetcht0 0x240(%1) \n\t" + + "vmovaps 0xC0(%1), %%ymm3 \n\t" + "vmovaps 0xE0(%1), %%ymm4 \n\t" + "vmovaps 0x100(%1), %%ymm5 \n\t" + "vbroadcastss 0x8(%2), %%ymm6 \n\t" + "vfmadd231ps %%ymm6, %%ymm3, %%ymm0 \n\t" + "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" + "vfmadd231ps %%ymm6, %%ymm5, %%ymm2 \n\t" + + "prefetcht0 0x280(%1) \n\t" + + "vmovaps 0x120(%1), %%ymm3 \n\t" + "vmovaps 0x140(%1), %%ymm4 \n\t" + "vmovaps 0x160(%1), %%ymm5 \n\t" + "vbroadcastss 0xC(%2), %%ymm6 \n\t" + "vfmadd231ps %%ymm6, %%ymm3, %%ymm0 \n\t" + "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" + "vfmadd231ps %%ymm6, %%ymm5, %%ymm2 \n\t" + + "add $0x180, %1 \n\t" + "add $0x10, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_1x24 \n\t" + ".align 16 \n\t" + ".k_loop_1x24_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_1x24_remain_end \n\t" + + ".align 16 \n\t" + ".k_loop_1x24_remain: \n\t" + "vmovaps (%1), %%ymm3 \n\t" + "vmovaps 0x20(%1), %%ymm4 \n\t" + "vmovaps 0x40(%1), %%ymm5 \n\t" + "vbroadcastss (%2), %%ymm6 \n\t" + "vfmadd231ps %%ymm6, %%ymm3, %%ymm0 \n\t" + "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" + "vfmadd231ps %%ymm6, %%ymm5, %%ymm2 \n\t" + "add $0x60, %1 \n\t" + "add $0x4, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_1x24_remain \n\t" + + ".align 16 \n\t" + ".k_loop_1x24_remain_end: \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "prefetcht0 0x40(%3, %%rax) \n\t" + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" + "vaddps 0x40(%3), %%ymm2, %%ymm2 \n\t" + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + "vmovups %%ymm2, 0x40(%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", + "%ymm5", "%ymm6", "memory"); +} + +void mmm_avx2_1x16_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__( + "mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_1x16_end \n\t" + + ".align 16 \n\t" + ".k_loop_1x16: \n\t" + + "prefetcht0 0x140(%1) \n\t" + + "vmovaps (%1), %%ymm2 \n\t" + "vmovaps 0x20(%1), %%ymm3 \n\t" + "vbroadcastss (%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm2, %%ymm0 \n\t" + "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" + + "prefetcht0 0x180(%1) \n\t" + + "vmovaps 0x40(%1), %%ymm2 \n\t" + "vmovaps 0x60(%1), %%ymm3 \n\t" + "vbroadcastss 0x4(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm2, %%ymm0 \n\t" + "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" + + "prefetcht0 0x1C0(%1) \n\t" + + "vmovaps 0x80(%1), %%ymm2 \n\t" + "vmovaps 0xA0(%1), %%ymm3 \n\t" + "vbroadcastss 0x8(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm2, %%ymm0 \n\t" + "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" + + "prefetcht0 0x200(%1) \n\t" + + "vmovaps 0xC0(%1), %%ymm2 \n\t" + "vmovaps 0xE0(%1), %%ymm3 \n\t" + "vbroadcastss 0xC(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm2, %%ymm0 \n\t" + "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" + + "add $0x100, %1 \n\t" + "add $0x10, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_1x16 \n\t" + ".align 16 \n\t" + ".k_loop_1x16_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_1x16_remain_end \n\t" + + ".align 16 \n\t" + ".k_loop_1x16_remain: \n\t" + "vmovaps (%1), %%ymm2 \n\t" + "vmovaps 0x20(%1), %%ymm3 \n\t" + "vbroadcastss 0x0(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm2, 
%%ymm0 \n\t" + "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" + "add $0x40, %1 \n\t" + "add $0x4, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_1x16_remain \n\t" + + ".align 16 \n\t" + ".k_loop_1x16_remain_end: \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm5", "memory"); +} + +void mmm_avx2_1x8_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__("mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_1x8_end \n\t" + + ".align 16 \n\t" + ".k_loop_1x8: \n\t" + + "prefetcht0 0x140(%1) \n\t" + "vmovaps (%1), %%ymm1 \n\t" + "vbroadcastss (%2), %%ymm2 \n\t" + "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" + + "vmovaps 0x20(%1), %%ymm1 \n\t" + "vbroadcastss 0x4(%2), %%ymm2 \n\t" + "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" + + "prefetcht0 0x180(%1) \n\t" + "vmovaps 0x40(%1), %%ymm1 \n\t" + "vbroadcastss 0x8(%2), %%ymm2 \n\t" + "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" + + "vmovaps 0x60(%1), %%ymm1 \n\t" + "vbroadcastss 0xC(%2), %%ymm2 \n\t" + "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" + + "add $0x80, %1 \n\t" + "add $0x10, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_1x8 \n\t" + ".align 16 \n\t" + ".k_loop_1x8_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_1x8_remain_end \n\t" + + ".align 16 \n\t" + ".k_loop_1x8_remain: \n\t" + "vmovaps (%1), %%ymm1 \n\t" + "vbroadcastss (%2), %%ymm2 \n\t" + "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" + "add $0x20, %1 \n\t" + "add $0x4, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_1x8_remain \n\t" + + ".align 16 \n\t" + ".k_loop_1x8_remain_end: \n\t" + + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vmovups %%ymm0, (%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "memory"); +} + +void mmm_avx2_1x4_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__("mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_1x4_end \n\t" + ".align 16 \n\t" + ".k_loop_1x4: \n\t" + + "prefetcht0 0x40(%1) \n\t" + + "vmovaps (%1), %%xmm1 \n\t" + "vbroadcastss 0x0(%2), %%xmm2 \n\t" + "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" + + "vmovaps 0x10(%1), %%xmm1 \n\t" + "vbroadcastss 0x4(%2), %%xmm2 \n\t" + "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" + + "vmovaps 0x20(%1), %%xmm1 \n\t" + "vbroadcastss 0x8(%2), %%xmm2 \n\t" + "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" + + "vmovaps 0x30(%1), %%xmm1 \n\t" + "vbroadcastss 0xC(%2), %%xmm2 \n\t" + "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" + + "add $0x40, %1 \n\t" + "add $0x10, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_1x4 \n\t" + ".align 16 \n\t" + ".k_loop_1x4_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_1x4_remain_end \n\t" + + ".align 16 \n\t" + ".k_loop_1x4_remain: \n\t" + "vmovaps (%1), %%xmm1 \n\t" + "vbroadcastss 0x0(%2), %%xmm2 \n\t" + "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" + "add $0x10, %1 \n\t" + "add $0x4, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_1x4_remain \n\t" + + ".align 16 \n\t" + ".k_loop_1x4_remain_end: \n\t" + + 
"vaddps (%3), %%xmm0, %%xmm0 \n\t" + "vmovups %%xmm0, (%3) \n\t" + "add %%rax, %3 \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%xmm0", "%xmm1", "%xmm2", "memory"); +} + +void mmm_avx2_n_mtail(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + for (U32 i = 0; i < um; ++i) { + for (U32 j = 0; j < un; ++j) { + for (U32 k = 0; k < bk; ++k) { + matrixC[i * N + j] += matrixA[k * um + i] * matrixB[k * un + j]; + } + } + } +} + +EE mmm_avx2_fp32( + int N, int M, int K, bool transposeA, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *result) +{ + // buffer addr algined to 32 + F32 *packA = (F32 *)align_addr(tmp, 32); + F32 *packB = (F32 *)align_addr(matrix2, 32); + U32 blockSizeM, blockSizeK, blockSizeN, unrollSizeM; + F32 *curA, *curB, *curC; + kernel_func kernel[3][5] = { + {mmm_avx2_n_mtail, mmm_avx2_1x4_asm, mmm_avx2_1x8_asm, mmm_avx2_1x16_asm, mmm_avx2_1x24_asm}, + {mmm_avx2_n_mtail, mmm_avx2_2x4_asm, mmm_avx2_2x8_asm, mmm_avx2_2x16_asm, mmm_avx2_2x24_asm}, + {mmm_avx2_n_mtail, mmm_avx2_4x4_asm, mmm_avx2_4x8_asm, mmm_avx2_4x16_asm, mmm_avx2_4x24_asm}}; + F32 unrollNSize[4] = {4, 8, 16, 24}; + F32 unrollMSize[3] = {1, 2, 4}; + + for (int k = 0; k < K; k += blockSizeK) { + blockSizeK = UNI_MIN(BOLCK_K_DIM, K - k); + for (int j = 0; j < M; j += blockSizeM) { + blockSizeM = UNI_MIN(BOLCK_M_DIM, M - j); + for (int n = 0; n < N; n += blockSizeN) { + blockSizeN = UNI_MIN(UNROLL_N, N - n); + blockSizeN = UNI_MIN(unrollNSize[blockSizeN >> 3], blockSizeN); + curB = packB + k * N + n * blockSizeK; + for (U32 m = 0; m < blockSizeM; m += unrollSizeM) { + unrollSizeM = UNI_MIN(UNROLL_M, blockSizeM - m); + unrollSizeM = unrollMSize[unrollSizeM >> 1]; + curA = packA + m * blockSizeK; + if (n == 0) { + if (transposeA) { + matrix2_trans( + unrollSizeM, blockSizeK, M, matrix1 + (j + m) + k * M, curA); + } else { + matrix1_trans( + unrollSizeM, blockSizeK, K, matrix1 + k + (j + m) * K, curA); + } + } + curC = result + (m + j) * N + n; + kernel[unrollSizeM >> 1][(blockSizeN >> 3) + (blockSizeN > 3)]( + unrollSizeM, blockSizeN, blockSizeK, curA, curB, curC, N); + } + } + } + } + return SUCCESS; +} diff --git a/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_col.cpp b/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_col.cpp new file mode 100644 index 00000000..0ea6b860 --- /dev/null +++ b/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_col.cpp @@ -0,0 +1,565 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/fp32/blas_fp32.h" +#include "error.h" +#include "types.h" + +#define UNROLL_K 4 + +typedef void (*kernel_func)(U32 N, F32 *matrix, F32 *vector, F32 *result); + +void mvm_col_avx2_4_32(U32 N, F32 *matrix, F32 *vector, F32 *result) +{ + __asm__ __volatile__("mov %0, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + "mov %1, %%rdx \n\t" + "add %%rax, %%rdx \n\t" + "mov %%rdx, %%r9 \n\t" + "add %%rax, %%r9 \n\t" + "mov %%r9, %%r10 \n\t" + "add %%rax, %%r10 \n\t" + + "mov %0, %%ecx \n\t" + "cmp $0x20, %%ecx \n\t" + "jl .n_loop_32_end \n\t" + ".align 16 \n\t" + ".n_loop_32: \n\t" + "prefetcht0 0x100(%3) \n\t" + "prefetcht0 0x140(%3) \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm1 \n\t" + "vmovups 0x40(%3), %%ymm2 \n\t" + "vmovups 0x60(%3), %%ymm3 \n\t" + + "prefetcht0 0x140(%1) \n\t" + "prefetcht0 0x180(%1) \n\t" + "vmovups (%1), %%ymm12 \n\t" + "vmovups 0x20(%1), %%ymm13 \n\t" + "vmovups 0x40(%1), %%ymm14 \n\t" + "vmovups 0x60(%1), %%ymm11 \n\t" + "vbroadcastss 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm3 \n\t" + + "prefetcht0 0x100(%%rdx) \n\t" + "prefetcht0 0x140(%%rdx) \n\t" + "vmovups (%%rdx), %%ymm12 \n\t" + "vmovups 0x20(%%rdx), %%ymm13 \n\t" + "vmovups 0x40(%%rdx), %%ymm14 \n\t" + "vmovups 0x60(%%rdx), %%ymm11 \n\t" + "vbroadcastss 0x4(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm3 \n\t" + + "prefetcht0 0x100(%%r9) \n\t" + "prefetcht0 0x140(%%r9) \n\t" + "vmovups (%%r9), %%ymm12 \n\t" + "vmovups 0x20(%%r9), %%ymm13 \n\t" + "vmovups 0x40(%%r9), %%ymm14 \n\t" + "vmovups 0x60(%%r9), %%ymm11 \n\t" + "vbroadcastss 0x8(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm3 \n\t" + + "prefetcht0 0x100(%%r10) \n\t" + "prefetcht0 0x140(%%r10) \n\t" + "vmovups (%%r10), %%ymm12 \n\t" + "vmovups 0x20(%%r10), %%ymm13 \n\t" + "vmovups 0x40(%%r10), %%ymm14 \n\t" + "vmovups 0x60(%%r10), %%ymm11 \n\t" + "vbroadcastss 0xC(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm3 \n\t" + + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + "vmovups %%ymm2, 0x40(%3) \n\t" + "vmovups %%ymm3, 0x60(%3) \n\t" + + "add $0x80, %1 \n\t" + "add $0x80, %%rdx \n\t" + "add $0x80, %%r9 \n\t" + "add $0x80, %%r10 \n\t" + "add $0x80, %3 \n\t" + + "sub $0x20, %%ecx \n\t" + "cmp $0x20, %%ecx \n\t" + "jge .n_loop_32 \n\t" + + ".align 16 \n\t" + ".n_loop_32_end: \n\t" + "cmp $0x10, %%ecx \n\t" + "jl .n_loop_remain_16_end \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm1 \n\t" + "vmovups (%1), %%ymm12 \n\t" + "vmovups 0x20(%1), %%ymm13 \n\t" + "vbroadcastss 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vmovups (%%rdx), %%ymm12 \n\t" + 
"vmovups 0x20(%%rdx), %%ymm13 \n\t" + "vbroadcastss 0x4(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vmovups (%%r9), %%ymm12 \n\t" + "vmovups 0x20(%%r9), %%ymm13 \n\t" + "vbroadcastss 0x8(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vmovups (%%r10), %%ymm12 \n\t" + "vmovups 0x20(%%r10), %%ymm13 \n\t" + "vbroadcastss 0xC(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + + "add $0x40, %1 \n\t" + "add $0x40, %%rdx \n\t" + "add $0x40, %%r9 \n\t" + "add $0x40, %%r10 \n\t" + "add $0x40, %3 \n\t" + "sub $0x10, %%ecx \n\t" + + ".n_loop_remain_16_end: \n\t" + "cmp $0x8, %%ecx \n\t" + "jl .n_loop_remain_8_end \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups (%1), %%ymm12 \n\t" + "vbroadcastss 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovups (%%rdx), %%ymm12 \n\t" + "vbroadcastss 0x4(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovups (%%r9), %%ymm12 \n\t" + "vbroadcastss 0x8(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovups (%%r10), %%ymm12 \n\t" + "vbroadcastss 0xC(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovups %%ymm0, (%3) \n\t" + + "add $0x20, %1 \n\t" + "add $0x20, %%rdx \n\t" + "add $0x20, %%r9 \n\t" + "add $0x20, %%r10 \n\t" + "add $0x20, %3 \n\t" + "sub $0x8, %%ecx \n\t" + + ".align 16 \n\t" + ".n_loop_remain_8_end: \n\t" + "cmp $0x4, %%ecx \n\t" + "jl .n_loop_remain_4_end \n\t" + "vmovups (%3), %%xmm0 \n\t" + "vmovups (%1), %%xmm12 \n\t" + "vbroadcastss 0x0(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovups (%%rdx), %%xmm12 \n\t" + "vbroadcastss 0x4(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovups (%%r9), %%xmm12 \n\t" + "vbroadcastss 0x8(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovups (%%r10), %%xmm12 \n\t" + "vbroadcastss 0xC(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovups %%xmm0, (%3) \n\t" + + "add $0x10, %1 \n\t" + "add $0x10, %%rdx \n\t" + "add $0x10, %%r9 \n\t" + "add $0x10, %%r10 \n\t" + "add $0x10, %3 \n\t" + "sub $0x4, %%ecx \n\t" + + ".align 16 \n\t" + ".n_loop_remain_4_end: \n\t" + "cmp $0x2, %%ecx \n\t" + "jl .n_loop_remain_2_end \n\t" + "vmovsd (%3), %%xmm0 \n\t" + "vmovsd (%1), %%xmm12 \n\t" + "vbroadcastss 0x0(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovsd (%%rdx), %%xmm12 \n\t" + "vbroadcastss 0x4(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovsd (%%r9), %%xmm12 \n\t" + "vbroadcastss 0x8(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovsd (%%r10), %%xmm12 \n\t" + "vbroadcastss 0xC(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovsd %%xmm0, (%3) \n\t" + + "add $0x8, %1 \n\t" + "add $0x8, %%rdx \n\t" + "add $0x8, %%r9 \n\t" + "add $0x8, %%r10 \n\t" + "add $0x8, %3 \n\t" + "sub $0x2, %%ecx \n\t" + + ".align 16 \n\t" + ".n_loop_remain_2_end: \n\t" + "and $0x1, %%ecx \n\t" + "je .n_loop_remain_1_end \n\t" + "vmovss (%3), %%xmm0 \n\t" + "vmovss (%1), %%xmm12 \n\t" + "vmovss 0x0(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovss (%%rdx), %%xmm12 \n\t" + "vmovss 0x4(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovss (%%r9), %%xmm12 
\n\t" + "vmovss 0x8(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovss (%%r10), %%xmm12 \n\t" + "vmovss 0xC(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovss %%xmm0, (%3) \n\t" + + ".align 16 \n\t" + ".n_loop_remain_1_end: \n\t" + : + : "r"(N), "r"(matrix), "r"(vector), "r"(result) + : "%eax", "%rax", "%ecx", "%rdx", "%r9", "%r10", "%ymm0", "%ymm1", "%ymm2", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", "%xmm0", "%xmm12", "%xmm15", + "memory"); +} + +void mvm_col_avx2_2_32(U32 N, F32 *matrix, F32 *vector, F32 *result) +{ + __asm__ __volatile__("mov %0, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + "mov %1, %%rdx \n\t" + "add %%rax, %%rdx \n\t" + + "mov %0, %%ecx \n\t" + "cmp $0x20, %%ecx \n\t" + "jl .k2_n_loop_32_end \n\t" + ".align 16 \n\t" + ".k2_n_loop_32: \n\t" + "prefetcht0 0x100(%3) \n\t" + "prefetcht0 0x140(%3) \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm1 \n\t" + "vmovups 0x40(%3), %%ymm2 \n\t" + "vmovups 0x60(%3), %%ymm3 \n\t" + + "prefetcht0 0x100(%1) \n\t" + "prefetcht0 0x140(%1) \n\t" + "vmovups (%1), %%ymm12 \n\t" + "vmovups 0x20(%1), %%ymm13 \n\t" + "vmovups 0x40(%1), %%ymm14 \n\t" + "vmovups 0x60(%1), %%ymm11 \n\t" + "vbroadcastss 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm3 \n\t" + + "prefetcht0 0x100(%%rdx) \n\t" + "prefetcht0 0x140(%%rdx) \n\t" + "vmovups (%%rdx), %%ymm12 \n\t" + "vmovups 0x20(%%rdx), %%ymm13 \n\t" + "vmovups 0x40(%%rdx), %%ymm14 \n\t" + "vmovups 0x60(%%rdx), %%ymm11 \n\t" + "vbroadcastss 0x4(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm3 \n\t" + + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + "vmovups %%ymm2, 0x40(%3) \n\t" + "vmovups %%ymm3, 0x60(%3) \n\t" + + "add $0x80, %1 \n\t" + "add $0x80, %%rdx \n\t" + "add $0x80, %3 \n\t" + + "sub $0x20, %%ecx \n\t" + "cmp $0x20, %%ecx \n\t" + "jge .k2_n_loop_32 \n\t" + + ".align 16 \n\t" + ".k2_n_loop_32_end: \n\t" + "cmp $0x10, %%ecx \n\t" + "jl .k2_n_loop_remain_16_end \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm1 \n\t" + "vmovups (%1), %%ymm12 \n\t" + "vmovups 0x20(%1), %%ymm13 \n\t" + "vbroadcastss 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vmovups (%%rdx), %%ymm12 \n\t" + "vmovups 0x20(%%rdx), %%ymm13 \n\t" + "vbroadcastss 0x4(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + + "add $0x40, %1 \n\t" + "add $0x40, %%rdx \n\t" + "add $0x40, %3 \n\t" + "sub $0x10, %%ecx \n\t" + + ".k2_n_loop_remain_16_end: \n\t" + "cmp $0x8, %%ecx \n\t" + "jl .k2_n_loop_remain_8_end \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups (%1), %%ymm12 \n\t" + "vbroadcastss 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovups (%%rdx), %%ymm12 \n\t" + "vbroadcastss 0x4(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovups %%ymm0, (%3) \n\t" + + "add $0x20, %1 \n\t" + "add $0x20, %%rdx \n\t" + "add $0x20, %3 \n\t" + "sub $0x8, %%ecx \n\t" + + ".align 16 \n\t" + ".k2_n_loop_remain_8_end: \n\t" + "cmp $0x4, %%ecx \n\t" + "jl .k2_n_loop_remain_4_end \n\t" + "vmovups 
(%3), %%xmm0 \n\t" + "vmovups (%1), %%xmm12 \n\t" + "vbroadcastss 0x0(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovups (%%rdx), %%xmm12 \n\t" + "vbroadcastss 0x4(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + "vmovups %%xmm0, (%3) \n\t" + + "add $0x10, %1 \n\t" + "add $0x10, %%rdx \n\t" + "add $0x10, %3 \n\t" + "sub $0x4, %%ecx \n\t" + + ".align 16 \n\t" + ".k2_n_loop_remain_4_end: \n\t" + "cmp $0x2, %%ecx \n\t" + "jl .k2_n_loop_remain_2_end \n\t" + "vmovsd (%3), %%xmm0 \n\t" + "vmovsd (%1), %%xmm12 \n\t" + "vbroadcastss 0x0(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovsd (%%rdx), %%xmm12 \n\t" + "vbroadcastss 0x4(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + "vmovsd %%xmm0, (%3) \n\t" + + "add $0x8, %1 \n\t" + "add $0x8, %%rdx \n\t" + "add $0x8, %3 \n\t" + "sub $0x2, %%ecx \n\t" + + ".align 16 \n\t" + ".k2_n_loop_remain_2_end: \n\t" + "and $0x1, %%ecx \n\t" + "je .k2_n_loop_remain_1_end \n\t" + "vmovss (%3), %%xmm0 \n\t" + "vmovss (%1), %%xmm12 \n\t" + "vmovss 0x0(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovss (%%rdx), %%xmm12 \n\t" + "vmovss 0x4(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + "vmovss %%xmm0, (%3) \n\t" + + ".align 16 \n\t" + ".k2_n_loop_remain_1_end: \n\t" + : + : "r"(N), "r"(matrix), "r"(vector), "r"(result) + : "%eax", "%rax", "%ecx", "%rdx", "%ymm0", "%ymm1", "%ymm2", "%ymm12", + "%ymm13", "%ymm14", "%ymm15", "%xmm0", "%xmm12", "%xmm15", "memory"); +} + +void mvm_col_avx2_1_32(U32 N, F32 *matrix, F32 *vector, F32 *result) +{ + __asm__ __volatile__("mov %0, %%ecx \n\t" + "cmp $0x20, %%ecx \n\t" + "jl .k1_n_loop_32_end \n\t" + ".align 16 \n\t" + ".k1_n_loop_32: \n\t" + "prefetcht0 0x100(%3) \n\t" + "prefetcht0 0x140(%3) \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm1 \n\t" + "vmovups 0x40(%3), %%ymm2 \n\t" + "vmovups 0x60(%3), %%ymm3 \n\t" + + "prefetcht0 0x100(%1) \n\t" + "prefetcht0 0x140(%1) \n\t" + "vmovups (%1), %%ymm12 \n\t" + "vmovups 0x20(%1), %%ymm13 \n\t" + "vmovups 0x40(%1), %%ymm14 \n\t" + "vmovups 0x60(%1), %%ymm11 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm3 \n\t" + + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + "vmovups %%ymm2, 0x40(%3) \n\t" + "vmovups %%ymm3, 0x60(%3) \n\t" + + "add $0x80, %1 \n\t" + "add $0x80, %3 \n\t" + + "sub $0x20, %%ecx \n\t" + "cmp $0x20, %%ecx \n\t" + "jge .k1_n_loop_32 \n\t" + + ".align 16 \n\t" + ".k1_n_loop_32_end: \n\t" + "cmp $0x10, %%ecx \n\t" + "jl .k1_n_loop_remain_16_end \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm1 \n\t" + "vmovups (%1), %%ymm12 \n\t" + "vmovups 0x20(%1), %%ymm13 \n\t" + "vbroadcastss 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + + "add $0x40, %1 \n\t" + "add $0x40, %3 \n\t" + "sub $0x10, %%ecx \n\t" + + ".k1_n_loop_remain_16_end: \n\t" + "cmp $0x8, %%ecx \n\t" + "jl .k1_n_loop_remain_8_end \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups (%1), %%ymm12 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vmovups %%ymm0, (%3) \n\t" + + "add $0x20, %1 \n\t" + "add $0x20, %3 \n\t" + "sub $0x8, %%ecx \n\t" + + ".align 16 \n\t" + 
".k1_n_loop_remain_8_end: \n\t" + "cmp $0x4, %%ecx \n\t" + "jl .k1_n_loop_remain_4_end \n\t" + "vmovups (%3), %%xmm0 \n\t" + "vmovups (%1), %%xmm12 \n\t" + "vbroadcastss (%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "vmovups %%xmm0, (%3) \n\t" + + "add $0x10, %1 \n\t" + "add $0x10, %3 \n\t" + "sub $0x4, %%ecx \n\t" + + ".align 16 \n\t" + ".k1_n_loop_remain_4_end: \n\t" + "cmp $0x2, %%ecx \n\t" + "jl .k1_n_loop_remain_2_end \n\t" + "vmovsd (%3), %%xmm0 \n\t" + "vmovsd (%1), %%xmm12 \n\t" + "vbroadcastss (%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "vmovsd %%xmm0, (%3) \n\t" + + "add $0x8, %1 \n\t" + "add $0x8, %3 \n\t" + "sub $0x2, %%ecx \n\t" + + ".align 16 \n\t" + ".k1_n_loop_remain_2_end: \n\t" + "and $0x1, %%ecx \n\t" + "je .k1_n_loop_remain_1_end \n\t" + "vmovss (%3), %%xmm0 \n\t" + "vmovss (%1), %%xmm12 \n\t" + "vmovss (%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + "vmovss %%xmm0, (%3) \n\t" + + ".align 16 \n\t" + ".k1_n_loop_remain_1_end: \n\t" + : + : "r"(N), "r"(matrix), "r"(vector), "r"(result) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "%xmm0", "%xmm12", "%xmm15", "memory"); +} + +void mvm_col_fp32(U32 numRows, U32 numColumns, F32 *matrix, F32 *vector, F32 *result) +{ + // Actual layout is KN, and vector is K + U32 blockKSize = 0; + kernel_func kernel[3] = {mvm_col_avx2_1_32, mvm_col_avx2_2_32, mvm_col_avx2_4_32}; + U32 unrollKSize[3] = {1, 2, 4}; + for (U32 bk = 0; bk < numColumns; bk += blockKSize) { + blockKSize = UNI_MIN(numColumns - bk, 4); + blockKSize = unrollKSize[blockKSize >> 1]; + kernel[blockKSize >> 1](numRows, matrix + bk * numRows, vector + bk, result); + } +} diff --git a/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_row.cpp b/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_row.cpp new file mode 100644 index 00000000..6030bc9f --- /dev/null +++ b/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_row.cpp @@ -0,0 +1,540 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/x86/fp32/blas_fp32.h" +#include "error.h" +#include "types.h" + +#define UNROLL_N 4 +#define BOLCK_K_DIM 512 + +typedef void (*kernel_func)(U32 bk, U32 lda, F32 *matrix, F32 *vector, F32 *result); + +void mvm_row_avx_4_32(U32 bk, U32 lda, F32 *matrix, F32 *vector, F32 *result) +{ + __asm__ __volatile__("vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + + "mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + "mov %1, %%rdx \n\t" + "add %%rax, %%rdx \n\t" + "mov %%rdx, %%r9 \n\t" + "add %%rax, %%r9 \n\t" + "mov %%r9, %%r10 \n\t" + "add %%rax, %%r10 \n\t" + + "mov %0, %%ecx \n\t" + "shr $5, %%ecx \n\t" + "je .k_loop_32_end \n\t" + ".align 16 \n\t" + ".k_loop_32: \n\t" + + "prefetcht0 0x100(%1) \n\t" + "prefetcht0 0x140(%1) \n\t" + "prefetcht0 0x100(%%rdx) \n\t" + "prefetcht0 0x140(%%rdx) \n\t" + + "vmovups (%2), %%ymm12 \n\t" + "vmovups 0x20(%2), %%ymm13 \n\t" + "vmovups 0x40(%2), %%ymm14 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + + "vmovups (%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vmovups 0x40(%1), %%ymm6 \n\t" + "vmovups 0x60(%1), %%ymm7 \n\t" + "vmovups (%%rdx), %%ymm8 \n\t" + "vmovups 0x20(%%rdx), %%ymm9 \n\t" + "vmovups 0x40(%%rdx), %%ymm10 \n\t" + "vmovups 0x60(%%rdx), %%ymm11 \n\t" + "vfmadd231ps %%ymm12, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm12, %%ymm8, %%ymm1 \n\t" + "vfmadd231ps %%ymm13, %%ymm5, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm9, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm10, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm7, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm1 \n\t" + + "prefetcht0 0x100(%%r9) \n\t" + "prefetcht0 0x140(%%r9) \n\t" + "prefetcht0 0x100(%%r10) \n\t" + "prefetcht0 0x140(%%r10) \n\t" + + "vmovups (%%r9), %%ymm4 \n\t" + "vmovups 0x20(%%r9), %%ymm5 \n\t" + "vmovups 0x40(%%r9), %%ymm6 \n\t" + "vmovups 0x60(%%r9), %%ymm7 \n\t" + "vmovups (%%r10), %%ymm8 \n\t" + "vmovups 0x20(%%r10), %%ymm9 \n\t" + "vmovups 0x40(%%r10), %%ymm10 \n\t" + "vmovups 0x60(%%r10), %%ymm11 \n\t" + "vfmadd231ps %%ymm12, %%ymm4, %%ymm2 \n\t" + "vfmadd231ps %%ymm12, %%ymm8, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm5, %%ymm2 \n\t" + "vfmadd231ps %%ymm13, %%ymm9, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm6, %%ymm2 \n\t" + "vfmadd231ps %%ymm14, %%ymm10, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm7, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm3 \n\t" + + "add $0x80, %1 \n\t" + "add $0x80, %2 \n\t" + "add $0x80, %%rdx \n\t" + "add $0x80, %%r9 \n\t" + "add $0x80, %%r10 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_32 \n\t" + + ".align 16 \n\t" + ".k_loop_32_end: \n\t" + "mov %0, %%ecx \n\t" + "and $0x1F, %%ecx \n\t" + "cmp $0x10, %%ecx \n\t" + "jl .k_loop_remain_16_end \n\t" + + "vmovups (%2), %%ymm12 \n\t" + "vmovups 0x20(%2), %%ymm13 \n\t" + + "vmovups (%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vmovups (%%rdx), %%ymm8 \n\t" + "vmovups 0x20(%%rdx), %%ymm9 \n\t" + "vmovups (%%r9), %%ymm6 \n\t" + "vmovups 0x20(%%r9), %%ymm7 \n\t" + "vmovups (%%r10), %%ymm10 \n\t" + "vmovups 0x20(%%r10), %%ymm11 \n\t" + "vfmadd231ps %%ymm12, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm12, %%ymm8, %%ymm1 \n\t" + "vfmadd231ps %%ymm12, %%ymm6, %%ymm2 \n\t" + "vfmadd231ps %%ymm12, %%ymm10, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm5, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm9, %%ymm1 \n\t" + "vfmadd231ps %%ymm13, %%ymm7, %%ymm2 \n\t" + "vfmadd231ps %%ymm13, %%ymm11, %%ymm3 \n\t" + + "add $0x40, %1 
\n\t" + "add $0x40, %2 \n\t" + "add $0x40, %%rdx \n\t" + "add $0x40, %%r9 \n\t" + "add $0x40, %%r10 \n\t" + "sub $0x10, %%ecx \n\t" + + ".align 16 \n\t" + ".k_loop_remain_16_end: \n\t" + "cmp $0x8, %%ecx \n\t" + "jl .k_loop_remain_8_end \n\t" + "vmovups (%2), %%ymm12 \n\t" + "vmovups (%1), %%ymm4 \n\t" + "vmovups (%%rdx), %%ymm6 \n\t" + "vmovups (%%r9), %%ymm8 \n\t" + "vmovups (%%r10), %%ymm10 \n\t" + "vfmadd231ps %%ymm4, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm6, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm8, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm10, %%ymm12, %%ymm3 \n\t" + "add $0x20, %1 \n\t" + "add $0x20, %2 \n\t" + "add $0x20, %%rdx \n\t" + "add $0x20, %%r9 \n\t" + "add $0x20, %%r10 \n\t" + "sub $0x8, %%ecx \n\t" + + ".align 16 \n\t" + ".k_loop_remain_8_end: \n\t" + "vperm2f128 $0x1, %%ymm0, %%ymm0, %%ymm12 \n\t" + "vperm2f128 $0x1, %%ymm1, %%ymm1, %%ymm13 \n\t" + "vperm2f128 $0x1, %%ymm2, %%ymm2, %%ymm14 \n\t" + "vperm2f128 $0x1, %%ymm3, %%ymm3, %%ymm15 \n\t" + "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm13, %%ymm1, %%ymm1 \n\t" + "vaddps %%ymm14, %%ymm2, %%ymm2 \n\t" + "vaddps %%ymm15, %%ymm3, %%ymm3 \n\t" + + "cmp $0x4, %%ecx \n\t" + "jl .k_loop_remain_4_end \n\t" + "vmovups (%2), %%xmm12 \n\t" + "vmovups (%1), %%xmm4 \n\t" + "vmovups (%%rdx), %%xmm6 \n\t" + "vmovups (%%r9), %%xmm8 \n\t" + "vmovups (%%r10), %%xmm10 \n\t" + "vfmadd231ps %%xmm4, %%xmm12, %%xmm0 \n\t" + "vfmadd231ps %%xmm6, %%xmm12, %%xmm1 \n\t" + "vfmadd231ps %%xmm8, %%xmm12, %%xmm2 \n\t" + "vfmadd231ps %%xmm10,%%xmm12, %%xmm3 \n\t" + "add $0x10, %1 \n\t" + "add $0x10, %2 \n\t" + "add $0x10, %%rdx \n\t" + "add $0x10, %%r9 \n\t" + "add $0x10, %%r10 \n\t" + "sub $0x4, %%ecx \n\t" + + ".align 16 \n\t" + ".k_loop_remain_4_end: \n\t" + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "cmp $0x2, %%ecx \n\t" + "jl .k_loop_remain_2_end \n\t" + "vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm8, %%xmm8, %%xmm8 \n\t" + "vxorps %%xmm10, %%xmm10, %%xmm10 \n\t" + "vmovsd (%2), %%xmm12 \n\t" + "vmovsd (%1), %%xmm4 \n\t" + "vmovsd (%%rdx), %%xmm6 \n\t" + "vmovsd (%%r9), %%xmm8 \n\t" + "vmovsd (%%r10), %%xmm10 \n\t" + "vfmadd231ps %%xmm4, %%xmm12, %%xmm0 \n\t" + "vfmadd231ps %%xmm6, %%xmm12, %%xmm1 \n\t" + "vfmadd231ps %%xmm8, %%xmm12, %%xmm2 \n\t" + "vfmadd231ps %%xmm10, %%xmm12, %%xmm3 \n\t" + "add $0x8, %1 \n\t" + "add $0x8, %2 \n\t" + "add $0x8, %%rdx \n\t" + "add $0x8, %%r9 \n\t" + "add $0x8, %%r10 \n\t" + "sub $0x2, %%ecx \n\t" + + ".align 16 \n\t" + ".k_loop_remain_2_end: \n\t" + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "and $0x1, %%ecx \n\t" + "je .k_loop_remain_1_end \n\t" + "vxorps %%xmm12,%%xmm12, %%xmm12 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm8, %%xmm8, %%xmm8 \n\t" + "vxorps %%xmm10, %%xmm10, %%xmm10 \n\t" + "vmovss (%2), %%xmm12 \n\t" + "vmovss (%1), %%xmm4 \n\t" + "vmovss (%%rdx), %%xmm6 \n\t" + "vmovss (%%r9), %%xmm8 \n\t" + "vmovss (%%r10), %%xmm10 \n\t" + "vfmadd231ps %%xmm4, %%xmm12, %%xmm0 \n\t" + "vfmadd231ps %%xmm6, %%xmm12, %%xmm1 \n\t" + "vfmadd231ps %%xmm8, %%xmm12, %%xmm2 \n\t" + "vfmadd231ps %%xmm10, %%xmm12, %%xmm3 \n\t" + + ".align 16 \n\t" + ".k_loop_remain_1_end: \n\t" + "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "vmovss 
%%xmm0, (%3) \n\t" + "vaddps 0x4(%3), %%xmm1, %%xmm1 \n\t" + "vmovss %%xmm1, 0x4(%3) \n\t" + "vaddps 0x8(%3), %%xmm2, %%xmm2 \n\t" + "vmovss %%xmm2, 0x8(%3) \n\t" + "vaddps 0xC(%3), %%xmm3, %%xmm3 \n\t" + "vmovss %%xmm3, 0xC(%3) \n\t" + : + : "r"(bk), "r"(matrix), "r"(vector), "r"(result), "r"(lda) + : "%eax", "%rax", "%ecx", "%rdx", "%r9", "%r10", "%ymm0", "%ymm1", "%ymm2", + "%ymm3", "%ymm4", "%ymm6", "%ymm8", "%ymm10", "%ymm12", "%ymm13", "%ymm14", + "%ymm15", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm6", "%xmm8", + "%xmm10", "%xmm12", "memory"); +} + +void mvm_row_avx_2_32(U32 bk, U32 lda, F32 *matrix, F32 *vector, F32 *result) +{ + __asm__ __volatile__("vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + + "mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + "mov %1, %%rdx \n\t" + "add %%rax, %%rdx \n\t" + + "mov %0, %%ecx \n\t" + "shr $5, %%ecx \n\t" + "je .n2_k_loop_32_end \n\t" + ".align 16 \n\t" + ".n2_k_loop_32: \n\t" + + "prefetcht0 0x100(%1) \n\t" + "prefetcht0 0x140(%1) \n\t" + "prefetcht0 0x100(%%rdx) \n\t" + "prefetcht0 0x140(%%rdx) \n\t" + + "vmovups (%2), %%ymm12 \n\t" + "vmovups 0x20(%2), %%ymm13 \n\t" + "vmovups 0x40(%2), %%ymm14 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + + "vmovups (%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vmovups 0x40(%1), %%ymm6 \n\t" + "vmovups 0x60(%1), %%ymm7 \n\t" + "vmovups (%%rdx), %%ymm8 \n\t" + "vmovups 0x20(%%rdx), %%ymm9 \n\t" + "vmovups 0x40(%%rdx), %%ymm10 \n\t" + "vmovups 0x60(%%rdx), %%ymm11 \n\t" + "vfmadd231ps %%ymm12, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm12, %%ymm8, %%ymm1 \n\t" + "vfmadd231ps %%ymm13, %%ymm5, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm9, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm10, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm7, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm1 \n\t" + + "add $0x80, %1 \n\t" + "add $0x80, %2 \n\t" + "add $0x80, %%rdx \n\t" + "sub $1, %%ecx \n\t" + "jg .n2_k_loop_32 \n\t" + + ".align 16 \n\t" + ".n2_k_loop_32_end: \n\t" + "mov %0, %%ecx \n\t" + "and $0x1F, %%ecx \n\t" + "cmp $0x10, %%ecx \n\t" + "jl .n2_k_loop_remain_16_end \n\t" + + "vmovups (%2), %%ymm12 \n\t" + "vmovups 0x20(%2), %%ymm13 \n\t" + + "vmovups (%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vmovups (%%rdx), %%ymm8 \n\t" + "vmovups 0x20(%%rdx), %%ymm9 \n\t" + "vfmadd231ps %%ymm12, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm12, %%ymm8, %%ymm1 \n\t" + "vfmadd231ps %%ymm13, %%ymm5, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm9, %%ymm1 \n\t" + + "add $0x40, %1 \n\t" + "add $0x40, %2 \n\t" + "add $0x40, %%rdx \n\t" + "sub $0x10, %%ecx \n\t" + + ".align 16 \n\t" + ".n2_k_loop_remain_16_end: \n\t" + "cmp $0x8, %%ecx \n\t" + "jl .n2_k_loop_remain_8_end \n\t" + "vmovups (%2), %%ymm12 \n\t" + "vmovups (%1), %%ymm4 \n\t" + "vmovups (%%rdx), %%ymm6 \n\t" + "vfmadd231ps %%ymm4, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm6, %%ymm12, %%ymm1 \n\t" + "add $0x20, %1 \n\t" + "add $0x20, %2 \n\t" + "add $0x20, %%rdx \n\t" + "sub $0x8, %%ecx \n\t" + + ".align 16 \n\t" + ".n2_k_loop_remain_8_end: \n\t" + "vperm2f128 $0x1, %%ymm0, %%ymm0, %%ymm12 \n\t" + "vperm2f128 $0x1, %%ymm1, %%ymm1, %%ymm13 \n\t" + "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm13, %%ymm1, %%ymm1 \n\t" + + "cmp $0x4, %%ecx \n\t" + "jl .n2_k_loop_remain_4_end \n\t" + "vmovups (%2), %%xmm12 \n\t" + "vmovups (%1), %%xmm4 \n\t" + "vmovups (%%rdx), %%xmm6 \n\t" + "vfmadd231ps %%xmm4, %%xmm12, %%xmm0 \n\t" + "vfmadd231ps %%xmm6, %%xmm12, %%xmm1 \n\t" + 
"add $0x10, %1 \n\t" + "add $0x10, %2 \n\t" + "add $0x10, %%rdx \n\t" + "sub $0x4, %%ecx \n\t" + + ".align 16 \n\t" + ".n2_k_loop_remain_4_end: \n\t" + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + + "cmp $0x2, %%ecx \n\t" + "jl .n2_k_loop_remain_2_end \n\t" + "vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vmovsd (%2), %%xmm12 \n\t" + "vmovsd (%1), %%xmm4 \n\t" + "vmovsd (%%rdx), %%xmm6 \n\t" + "vfmadd231ps %%xmm4, %%xmm12, %%xmm0 \n\t" + "vfmadd231ps %%xmm6, %%xmm12, %%xmm1 \n\t" + "add $0x8, %1 \n\t" + "add $0x8, %2 \n\t" + "add $0x8, %%rdx \n\t" + "sub $0x2, %%ecx \n\t" + + ".align 16 \n\t" + ".n2_k_loop_remain_2_end: \n\t" + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "and $1, %%ecx \n\t" + "je .n2_k_loop_remain_1_end \n\t" + "vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vmovss (%2), %%xmm12 \n\t" + "vmovss (%1), %%xmm4 \n\t" + "vmovss (%%rdx), %%xmm6 \n\t" + "vfmadd231ps %%xmm4, %%xmm12, %%xmm0 \n\t" + "vfmadd231ps %%xmm6, %%xmm12, %%xmm1 \n\t" + + ".align 16 \n\t" + ".n2_k_loop_remain_1_end: \n\t" + "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "vmovss %%xmm0, (%3) \n\t" + "vaddps 0x4(%3), %%xmm1, %%xmm1 \n\t" + "vmovss %%xmm1, 0x4(%3) \n\t" + : + : "r"(bk), "r"(matrix), "r"(vector), "r"(result), "r"(lda) + : "%eax", "%rax", "%ecx", "%rdx", "%r9", "%r10", "%ymm0", "%ymm1", "%ymm4", + "%ymm6", "%ymm12", "%ymm13", "%xmm0", "%xmm1", "%xmm4", "%xmm6", "%xmm12", + "%xmm13", "memory"); +} + +void mvm_row_avx_1_32(U32 bk, U32 lda, F32 *matrix, F32 *vector, F32 *result) +{ + __asm__ __volatile__("vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + + "mov %0, %%ecx \n\t" + "shr $5, %%ecx \n\t" + "je .n1_k_loop_32_end \n\t" + ".align 16 \n\t" + ".n1_k_loop_32: \n\t" + + "prefetcht0 0x100(%1) \n\t" + "prefetcht0 0x140(%1) \n\t" + + "vmovups (%2), %%ymm12 \n\t" + "vmovups 0x20(%2), %%ymm13 \n\t" + "vmovups 0x40(%2), %%ymm14 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + + "vmovups (%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vmovups 0x40(%1), %%ymm6 \n\t" + "vmovups 0x60(%1), %%ymm7 \n\t" + "vfmadd231ps %%ymm12, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm5, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm7, %%ymm0 \n\t" + + "add $0x80, %1 \n\t" + "add $0x80, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .n1_k_loop_32 \n\t" + ".align 16 \n\t" + ".n1_k_loop_32_end: \n\t" + "mov %0, %%ecx \n\t" + "and $0x1F, %%ecx \n\t" + "cmp $0x10, %%ecx \n\t" + "jl .n1_k_loop_remain_16_end \n\t" + + "vmovups (%2), %%ymm12 \n\t" + "vmovups 0x20(%2), %%ymm13 \n\t" + + "vmovups (%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vfmadd231ps %%ymm12, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm5, %%ymm0 \n\t" + + "add $0x40, %1 \n\t" + "add $0x40, %2 \n\t" + "sub $0x10, %%ecx \n\t" + + ".align 16 \n\t" + ".n1_k_loop_remain_16_end: \n\t" + "cmp $0x8, %%ecx \n\t" + "jl .n1_k_loop_remain_8_end \n\t" + "vmovups (%2), %%ymm12 \n\t" + "vmovups (%1), %%ymm4 \n\t" + "vfmadd231ps %%ymm4, %%ymm12, %%ymm0 \n\t" + "add $0x20, %1 \n\t" + "add $0x20, %2 \n\t" + "sub $0x8, %%ecx \n\t" + + ".align 16 \n\t" + ".n1_k_loop_remain_8_end: \n\t" + "vperm2f128 $0x1, %%ymm0, %%ymm0, %%ymm13 \n\t" + "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" + + "cmp $0x4, %%ecx \n\t" + "jl .n1_k_loop_remain_4_end \n\t" + "vmovups (%2), %%xmm12 \n\t" + "vmovups (%1), %%xmm4 \n\t" + "vfmadd231ps %%xmm4, %%xmm12, 
%%xmm0 \n\t" + "add $0x10, %1 \n\t" + "add $0x10, %2 \n\t" + "sub $0x4, %%ecx \n\t" + + ".align 16 \n\t" + ".n1_k_loop_remain_4_end: \n\t" + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + + "cmp $0x2, %%ecx \n\t" + "jl .n1_k_loop_remain_2_end \n\t" + "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vmovsd (%2), %%xmm12 \n\t" + "vmovsd (%1), %%xmm4 \n\t" + "vfmadd231ps %%xmm4, %%xmm12, %%xmm0 \n\t" + "add $0x8, %1 \n\t" + "add $0x8, %2 \n\t" + "sub $0x2, %%ecx \n\t" + + ".align 16 \n\t" + ".n1_k_loop_remain_2_end: \n\t" + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "and $1, %%ecx \n\t" + "je .n1_k_loop_remain_1_end \n\t" + "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vmovss (%2), %%xmm12 \n\t" + "vmovss (%1), %%xmm4 \n\t" + "vfmadd231ps %%xmm4, %%xmm12, %%xmm0 \n\t" + + ".align 16 \n\t" + ".n1_k_loop_remain_1_end: \n\t" + "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "vmovss %%xmm0, (%3) \n\t" + : + : "r"(bk), "r"(matrix), "r"(vector), "r"(result), "r"(lda) + : "%eax", "%rax", "%ecx", "%rdx", "%r9", "%r10", "%ymm0", "%ymm4", + "%ymm12", "%ymm13", "%xmm0", "%xmm4", "%xmm12", "memory"); +} + +void mvm_row_fp32(U32 numRows, U32 numColumns, F32 *matrix, F32 *vector, F32 *result) +{ + // Actual layout is NK, and vector is K + U32 blockKSize = 0, blockNSize = 0; + kernel_func kernel[3] = {mvm_row_avx_1_32, mvm_row_avx_2_32, mvm_row_avx_4_32}; + U32 unrollNSize[3] = {1, 2, 4}; + for (U32 bk = 0; bk < numColumns; bk += blockKSize) { + blockKSize = UNI_MIN(numColumns - bk, BOLCK_K_DIM); + for (U32 bn = 0; bn < numRows; bn += blockNSize) { + blockNSize = UNI_MIN(numRows - bn, UNROLL_N); + blockNSize = unrollNSize[blockNSize >> 1]; + kernel[blockNSize >> 1]( + blockKSize, numColumns, matrix + bn * numColumns + bk, vector + bk, result + bn); + } + } +} \ No newline at end of file diff --git a/compute/blas_enhance/src/cpu/x86/mmm.cpp b/compute/blas_enhance/src/cpu/x86/mmm.cpp new file mode 100644 index 00000000..b1f7e436 --- /dev/null +++ b/compute/blas_enhance/src/cpu/x86/mmm.cpp @@ -0,0 +1,130 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "error.h" +#include "types.h" +#include "blas_enhance.h" +#include "cpu/x86/blas_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/blas_fp32.h" +#endif + +EE matrix_matrix_multiply_tmp_bytes_x86( + U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N, DataType dt, U32 *bytes) +{ + EE ret = SUCCESS; + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: { + matrix_matrix_multiply_tmp_bytes_fp32( + matrixA_M, matrixA_K, matrixB_K, matrixB_N, dt, bytes); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + + return ret; +} + +static EE matrix_matrix_multiply_transform_rhsN( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) +{ + EE ret = SUCCESS; + switch (desc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = matrix_matrix_multiply_transform_rhsN_fp32(desc, (F32 *)src, (F32 *)dst); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + (*descTran) = desc; + (*descTran).df = targetFormat4MatrixB(desc.dt); + return ret; +} + +static EE matrix_matrix_multiply_transform_rhsT( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) +{ + EE ret = SUCCESS; + switch (desc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = matrix_matrix_multiply_transform_rhsT_fp32(desc, (F32 *)src, (F32 *)dst); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + (*descTran) = desc; + (*descTran).df = targetFormat4MatrixB(desc.dt); + std::swap((*descTran).dims[0], (*descTran).dims[1]); + return ret; +} + +EE matrix_matrix_multiply_transform_rhs_x86( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) +{ + if (desc.df == targetFormat4MatrixB(desc.dt)) { + return SUCCESS; + } + EE ret = SUCCESS; + switch (desc.df) { + case DF_NORMAL: { + ret = matrix_matrix_multiply_transform_rhsN(desc, src, descTran, dst); + break; + } + case DF_TRANSPOSE: { + ret = matrix_matrix_multiply_transform_rhsT(desc, src, descTran, dst); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE mmm_x86(U32 matrixC_N, + U32 matrixC_M, + U32 matrixA_K, + DataType dt, + bool transposeA, + const void *matrixAData, + const void *matrixBData, + void *tmp, + void *matrixCData) +{ + EE ret = SUCCESS; + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = mmm_avx2_fp32(matrixC_N, matrixC_M, matrixA_K, transposeA, (F32 *)matrixAData, + (F32 *)matrixBData, (F32 *)tmp, (F32 *)matrixCData); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/blas_enhance/src/cpu/x86/mvm.cpp b/compute/blas_enhance/src/cpu/x86/mvm.cpp new file mode 100644 index 00000000..740d793f --- /dev/null +++ b/compute/blas_enhance/src/cpu/x86/mvm.cpp @@ -0,0 +1,55 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "error.h" +#include "types.h" +#include "blas_enhance.h" +#include "cpu/x86/blas_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/blas_fp32.h" +#endif + +EE matrix_vector_multiply_tmp_bytes_x86(bool transpose, DataType dt, U32 *bytes) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + *bytes = 0; + break; +#endif + default: + // Initialize the output for unsupported types as well, so callers + // never read an indeterminate value. + *bytes = 0; + break; + } + return SUCCESS; +} + +EE mvm_x86( + U32 row, U32 col, DataType dt, bool transpose, const void *matrix, const void *vector, void *result) +{ + EE ret = SUCCESS; + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = mvm_avx2_fp32(row, col, transpose, (F32 *)matrix, (F32 *)vector, (F32 *)result); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/blas_enhance/src/mmm.cpp b/compute/blas_enhance/src/mmm.cpp new file mode 100644 index 00000000..8a7b7612 --- /dev/null +++ b/compute/blas_enhance/src/mmm.cpp @@ -0,0 +1,167 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
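+// Architecture dispatch for C = A * B. The temporary-size query, the one-off +// packing of the right-hand matrix, and the multiply itself are separate +// entry points so that constant weights can be transformed once and reused +// across calls.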
+ +#include "blas_enhance.h" +#ifdef _USE_GENERAL +#include "cpu/general/blas_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/blas_arm.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/blas_x86.h" +#endif + +EE matrix_matrix_multiply_tmp_bytes( + TensorDesc matrixADesc, TensorDesc matrixBDesc, U32 *bytes, Arch arch) +{ + DataType matrixADataType, matrixBDataType; + DataFormat matrixADataFormat, matrixBDataFormat; + U32 matrixA_M, matrixA_K, matrixB_K, matrixB_N; + CHECK_STATUS( + tensor2dGet(matrixADesc, &matrixADataType, &matrixADataFormat, &matrixA_M, &matrixA_K)); + CHECK_STATUS( + tensor2dGet(matrixBDesc, &matrixBDataType, &matrixBDataFormat, &matrixB_K, &matrixB_N)); + if (matrixBDesc.df == DF_TRANSPOSE) { + std::swap(matrixB_K, matrixB_N); + } + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = matrix_matrix_multiply_tmp_bytes_x86( + matrixA_M, matrixA_K, matrixB_K, matrixB_N, matrixADataType, bytes); +#endif +#ifdef _USE_NEON + } else { + ret = matrix_matrix_multiply_tmp_bytes_arm( + matrixA_M, matrixA_K, matrixB_K, matrixB_N, matrixADataType, bytes); +#endif + } + return ret; +} + +EE matrix_matrix_multiply_transform_rhs( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, Arch arch) +{ + EE ret = NOT_SUPPORTED; +#ifdef _USE_NEON + if (IS_ARM(arch)) { + ret = matrix_matrix_multiply_transform_rhs_arm(desc, src, descTran, dst); + } +#endif +#ifdef _USE_GENERAL + if (IS_GENERAL(arch)) { + memcpy(dst, src, tensorNumBytes(desc)); + (*descTran) = desc; + ret = SUCCESS; + } +#endif +#ifdef _USE_X86 + if (IS_X86_AVX2(arch)) { + ret = matrix_matrix_multiply_transform_rhs_x86(desc, src, descTran, dst); + } +#endif + return ret; +} + +EE matrix_matrix_multiply(TensorDesc matrixADesc, + const void *matrixAData, + TensorDesc matrixBDesc, + const void *matrixBData, + U32 bytes, + void *tmp, + TensorDesc matrixCDesc, + void *matrixCData, + Arch arch) +{ + if (bytes != 0 && tmp == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (nullptr == matrixAData || nullptr == matrixBData || nullptr == matrixCData) { + CHECK_STATUS(NULL_POINTER); + } + + DataType matrixADataType, matrixBDataType, matrixCDataType; + DataFormat matrixADataFormat, matrixBDataFormat, matrixCDataFormat; + U32 matrixA_M, matrixA_K, matrixB_K, matrixB_N, matrixC_M, matrixC_N; + CHECK_STATUS( + tensor2dGet(matrixADesc, &matrixADataType, &matrixADataFormat, &matrixA_M, &matrixA_K)); + CHECK_STATUS( + tensor2dGet(matrixBDesc, &matrixBDataType, &matrixBDataFormat, &matrixB_K, &matrixB_N)); + CHECK_STATUS( + tensor2dGet(matrixCDesc, &matrixCDataType, &matrixCDataFormat, &matrixC_M, &matrixC_N)); + + if (matrixADataType != matrixBDataType) { + CHECK_STATUS(NOT_MATCH); + } + if (matrixADataType != matrixCDataType) { + if (matrixADataType != DT_I8 || matrixCDataType != DT_I32) { + CHECK_STATUS(NOT_MATCH); + } + } + + bool transposeA = false, transposeB = false; + if (matrixADataFormat == DF_TRANSPOSE) { + std::swap(matrixA_M, matrixA_K); + transposeA = true; + } + if (matrixBDataFormat == DF_TRANSPOSE) { + std::swap(matrixB_K, matrixB_N); + transposeB = true; + } + if (matrixA_M != matrixC_M || matrixB_N != matrixC_N || matrixA_K != matrixB_K) { + CHECK_STATUS(NOT_MATCH); + } + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = mmm_general(matrixC_N, matrixC_M, matrixA_K, transposeA, transposeB, matrixADataType, + matrixAData, matrixBData, matrixCData); +#endif +#ifdef _USE_X86 + } 
else if (IS_X86_AVX2(arch)) { + TensorDesc tranDescB; + U8 *dataB = (U8 *)matrixBData; + if (matrixBDataFormat != targetFormat4MatrixB(matrixBDataType)) { + dataB = ((U8 *)tmp) + matrixA_M * matrixA_K * bytesOf(matrixADataType); + ret = matrix_matrix_multiply_transform_rhs_x86( + matrixBDesc, matrixBData, &tranDescB, dataB); + } + ret = mmm_x86(matrixC_N, matrixC_M, matrixA_K, matrixADataType, transposeA, matrixAData, + dataB, tmp, matrixCData); +#endif +#ifdef _USE_NEON + } else { + TensorDesc tranDescB; + U8 *dataB = (U8 *)matrixBData; + if (matrixBDataFormat != targetFormat4MatrixB(matrixBDataType)) { + U32 K = matrixA_K; + if (DT_I8 == matrixADataType) { + K = pad_to_4_multiple(K); + } + dataB = ((U8 *)tmp) + matrixA_M * K * bytesOf(matrixADataType); + ret = matrix_matrix_multiply_transform_rhs_arm( + matrixBDesc, matrixBData, &tranDescB, dataB); + } + ret = mmm_arm(matrixC_N, matrixC_M, matrixA_K, matrixADataType, transposeA, matrixAData, + dataB, tmp, matrixCData, arch); +#endif + } + return ret; +} diff --git a/compute/blas_enhance/src/mvm.cpp b/compute/blas_enhance/src/mvm.cpp new file mode 100644 index 00000000..518b5e02 --- /dev/null +++ b/compute/blas_enhance/src/mvm.cpp @@ -0,0 +1,125 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
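+// Architecture dispatch for result = matrix * vector. A minimal caller-side +// sketch mirroring tests/test_mvm.cpp in this patch (buffers are allocated +// by the caller): +// +//   TensorDesc mat_desc = tensor2df(DT_F32, DF_NORMAL, m, k); +//   TensorDesc vec_desc = tensor1d(DT_F32, k); +//   TensorDesc res_desc = tensor1d(DT_F32, m); +//   U32 bytes = 0; +//   matrix_vector_multiply_tmp_bytes(mat_desc, vec_desc, &bytes, arch); +//   matrix_vector_multiply(mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res, arch);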
+ +#include "blas_enhance.h" +#ifdef _USE_GENERAL +#include "cpu/general/blas_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/blas_arm.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/blas_x86.h" +#endif + +EE matrix_vector_multiply_tmp_bytes( + TensorDesc matrixDesc, TensorDesc vectorDesc, U32 *bytes, Arch arch) +{ + UNUSED(vectorDesc); + + bool transpose = (matrixDesc.df == DF_TRANSPOSE); + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = matrix_vector_multiply_tmp_bytes_x86(transpose, matrixDesc.dt, bytes); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = matrix_vector_multiply_tmp_bytes_arm(transpose, matrixDesc.dt, bytes); +#endif + } + return ret; +} + +EE matrix_vector_multiply_transform_weight( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, Arch arch) +{ + EE ret = NOT_SUPPORTED; +#ifdef _USE_NEON + if (IS_ARM(arch)) { + ret = matrix_vector_multiply_transform_weight_arm(desc, src, descTran, dst); + } +#endif +#ifdef _USE_GENERAL + if (IS_GENERAL(arch)) { + memcpy(dst, src, tensorNumBytes(desc)); + (*descTran) = desc; + ret = SUCCESS; + } +#endif + return ret; +} + +EE matrix_vector_multiply(TensorDesc matrixDesc, + const void *matrix, + TensorDesc vectorDesc, + const void *vector, + U32 bytes, + void *tmp, + TensorDesc resultDesc, + void *result, + Arch arch) +{ + if (bytes != 0 && tmp == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (nullptr == matrix || nullptr == vector || nullptr == result) { + CHECK_STATUS(NULL_POINTER); + } + DataType matrixDataType, vectorDataType, resultDataType; + DataFormat matrixDataFormat, vectorDataFormat, resultDataFormat; + U32 matrixRow, matrixColumn, vectorColumn, resultColumn; + CHECK_STATUS( + tensor2dGet(matrixDesc, &matrixDataType, &matrixDataFormat, &matrixRow, &matrixColumn)); + CHECK_STATUS(tensor1dGet(vectorDesc, &vectorDataType, &vectorDataFormat, &vectorColumn)); + CHECK_STATUS(tensor1dGet(resultDesc, &resultDataType, &resultDataFormat, &resultColumn)); + + if (matrixDataType != vectorDataType) { + CHECK_STATUS(NOT_MATCH); + } + if (matrixDataType != resultDataType) { + if (matrixDataType != DT_I8 || resultDataType != DT_I32) { + CHECK_STATUS(NOT_MATCH); + } + } + + bool transpose = (matrixDataFormat == DF_TRANSPOSE); + if (transpose) { + std::swap(matrixRow, matrixColumn); + } + if (matrixRow != resultColumn || matrixColumn != vectorColumn) { + CHECK_STATUS(NOT_MATCH); + } + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = + mvm_general(matrixRow, matrixColumn, matrixDataType, transpose, matrix, vector, result); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = mvm_x86(matrixRow, matrixColumn, matrixDataType, transpose, matrix, vector, result); +#endif +#ifdef _USE_NEON + } else { + ret = mvm_arm(matrixRow, matrixColumn, matrixDataType, matrixDataFormat, matrix, vector, + tmp, result, arch); +#endif + } + return ret; +} diff --git a/compute/blas_enhance/tests/.CMakeLists.txt.swp b/compute/blas_enhance/tests/.CMakeLists.txt.swp new file mode 100644 index 00000000..1f220ac0 Binary files /dev/null and b/compute/blas_enhance/tests/.CMakeLists.txt.swp differ diff --git a/compute/blas_enhance/tests/CMakeLists.txt b/compute/blas_enhance/tests/CMakeLists.txt new file mode 100644 index 00000000..716463ea --- /dev/null +++ b/compute/blas_enhance/tests/CMakeLists.txt @@ -0,0 +1,13 @@ +function(blas_enhance_test name) + add_executable(${name} 
${name}.cpp) + link_blas_enhance(${name}) + install(TARGETS ${name} + RUNTIME DESTINATION tests) +endfunction() + +set_test_c_cxx_flags() + +blas_enhance_test(test_mmm) +blas_enhance_test(test_mvm) +blas_enhance_test(test_mmm_int8) +blas_enhance_test(test_mvm_int8) diff --git a/compute/blas_enhance/tests/test_mmm.cpp b/compute/blas_enhance/tests/test_mmm.cpp new file mode 100644 index 00000000..99448541 --- /dev/null +++ b/compute/blas_enhance/tests/test_mmm.cpp @@ -0,0 +1,87 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <stdlib.h> +#include "blas_enhance.h" +#include "ut_util.h" + +int mmmTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 4); + U32 m = atoi(argv[1]); + U32 k = atoi(argv[2]); + U32 n = atoi(argv[3]); + + TensorDesc A_desc = tensor2df(dt, DF_TRANSPOSE, k, m); + TensorDesc B_desc = tensor2df(dt, DF_NORMAL, k, n); + TensorDesc tranDescB; + TensorDesc C_desc = tensor2df(dt, DF_NORMAL, m, n); + + U32 bytes = 0; + U8 *A = ut_input_v(m * k, dt, UT_INIT_RANDOM); + U8 *B = ut_input_v(k * n, dt, UT_INIT_RANDOM); + U8 *B_tran = ut_input_v(k * n + 32, dt, UT_INIT_ZERO); + U8 *C = ut_input_v(m * n, dt, UT_INIT_ZERO); + U8 *C_ref = ut_input_v(m * n, dt, UT_INIT_ZERO); + CHECK_STATUS(matrix_matrix_multiply_tmp_bytes(A_desc, B_desc, &bytes, UT_ARCH)); + U8 *tmp = ut_input_v(bytes / bytesOf(dt), dt, UT_INIT_ZERO); + + matrix_matrix_multiply_transform_rhs(B_desc, B, &tranDescB, B_tran, UT_ARCH); + if (UT_CHECK) { + CHECK_STATUS( + matrix_matrix_multiply(A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, UT_ARCH)); + + // naive reference implementation + CHECK_STATUS( + matrix_matrix_multiply(A_desc, A, B_desc, B, bytes, tmp, C_desc, C_ref, CPU_GENERAL)); + + // check + ut_check_v(C, C_ref, m * n, dt, 10, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + matrix_matrix_multiply(A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, UT_ARCH); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u)+(%u %u)=(%u %u)", m, k, k, n, m, n); + sprintf(buffer, "%20s, %80s", "MatrixMultiply", params); + double ops = 2.0 * m * n * k + 1.0 * m * n; + ut_log(dt, buffer, ops, time); + + free(A); + free(B); + free(B_tran); + free(C); + free(C_ref); + free(tmp); + + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16
+ mmmTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + mmmTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/blas_enhance/tests/test_mmm_int8.cpp b/compute/blas_enhance/tests/test_mmm_int8.cpp new file mode 100644 index 00000000..c32a7352 --- /dev/null +++ b/compute/blas_enhance/tests/test_mmm_int8.cpp @@ -0,0 +1,83 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <stdlib.h> +#include "blas_enhance.h" +#include "ut_util.h" + +int main(int argc, char **argv) +{ +#ifdef _USE_INT8 + CHECK_REQUIREMENT(argc == 4); + U32 m = atoi(argv[1]); + U32 k = atoi(argv[2]); + U32 n = atoi(argv[3]); + + DataType dt = DT_I8; + DataType odt = DT_I32; + TensorDesc A_desc = tensor2df(dt, DF_TRANSPOSE, k, m); + TensorDesc B_desc = tensor2df(dt, DF_NORMAL, k, n); + TensorDesc tranDescB; + TensorDesc C_desc = tensor2df(odt, DF_NORMAL, m, n); + + U32 bytes = 0; + U32 k4 = k; + if (k4 % 4 != 0) { + k4 = (k4 / 4) * 4 + 4; + } + INT8 *A = (INT8 *)ut_input_v(m * k, DT_I8, UT_INIT_RANDOM); + INT8 *B = (INT8 *)ut_input_v(k * n, DT_I8, UT_INIT_RANDOM); + INT8 *B_tran = (INT8 *)ut_input_v(k4 * n + 32, DT_I8, UT_INIT_ZERO); + I32 *C = (I32 *)ut_input_v(m * n, DT_I32, UT_INIT_ZERO); + I32 *C_ref = (I32 *)ut_input_v(m * n, DT_I32, UT_INIT_ZERO); + CHECK_STATUS(matrix_matrix_multiply_tmp_bytes(A_desc, B_desc, &bytes, UT_ARCH)); + INT8 *tmp = (INT8 *)ut_input_v(bytes, DT_I8, UT_INIT_ZERO); + + matrix_matrix_multiply_transform_rhs(B_desc, B, &tranDescB, B_tran, UT_ARCH); + if (UT_CHECK) { + CHECK_STATUS( + matrix_matrix_multiply(A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, UT_ARCH)); + + // naive reference implementation + CHECK_STATUS( + matrix_matrix_multiply(A_desc, A, B_desc, B, bytes, tmp, C_desc, C_ref, CPU_GENERAL)); + + // check + ut_check_v(C, C_ref, m * n, DT_I32, 1, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + matrix_matrix_multiply(A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, UT_ARCH); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u)+(%u %u)=(%u %u)", m, k, k, n, m, n); + sprintf(buffer, "%20s, %80s", "MatrixMultiply", params); + double ops = 2.0 * m * n * k + 1.0 * m * n; + ut_log(DT_I8, buffer, ops, time); + + free(A); + free(B); + free(B_tran); + free(C); + free(C_ref); + free(tmp); +#endif + return 0; +}
diff --git a/compute/blas_enhance/tests/test_mvm.cpp b/compute/blas_enhance/tests/test_mvm.cpp new file mode 100644 index 00000000..5d6443ca --- /dev/null +++ b/compute/blas_enhance/tests/test_mvm.cpp @@ -0,0 +1,96 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <stdlib.h> +#include "blas_enhance.h" +#include "ut_util.h" + +int mvmTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 3); + U32 m = atoi(argv[1]); + U32 k = atoi(argv[2]); + + float threshold = 0.0001; + if (dt == DT_F16) { + threshold = 0.05; + } + DataFormat df = DF_NORMAL; + U32 vc, rc; + if (df == DF_NORMAL) { + vc = k; + rc = m; + } else { + vc = m; + rc = k; + } + + TensorDesc mat_desc = tensor2df(dt, df, rc, vc); + TensorDesc vec_desc = tensor1d(dt, vc); + TensorDesc res_desc = tensor1d(dt, rc); + + U8 *mat = ut_input_v(m * k, dt, UT_INIT_RANDOM); + U8 *vec = ut_input_v(vc, dt, UT_INIT_RANDOM); + U8 *res = ut_input_v(rc, dt, UT_INIT_ZERO); + U8 *res_ref = ut_input_v(rc, dt, UT_INIT_ZERO); + + U32 bytes = 0; + CHECK_STATUS(matrix_vector_multiply_tmp_bytes(mat_desc, vec_desc, &bytes, UT_ARCH)); + U8 *tmp = ut_input_v(bytes / bytesOf(dt), dt, UT_INIT_ZERO); + // check + if (UT_CHECK) { + CHECK_STATUS(matrix_vector_multiply( + mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res, UT_ARCH)); + + // naive reference implementation + CHECK_STATUS(matrix_vector_multiply( + mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res_ref, CPU_GENERAL)); + + ut_check_v(res, res_ref, rc, dt, threshold, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + matrix_vector_multiply(mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res, UT_ARCH); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u)+(%u)=(%u)", m, k, vc, rc); + sprintf(buffer, "%20s, %80s", "MatrixVectorMultiply", params); + double ops = 2.0 * m * k; + ut_log(dt, buffer, ops, time); + + free(mat); + free(vec); + free(tmp); + free(res); + free(res_ref); + + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + mvmTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + mvmTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/blas_enhance/tests/test_mvm_int8.cpp b/compute/blas_enhance/tests/test_mvm_int8.cpp new file mode 100644 index 00000000..5e5a82aa
--- /dev/null +++ b/compute/blas_enhance/tests/test_mvm_int8.cpp @@ -0,0 +1,92 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <stdlib.h> +#include "blas_enhance.h" +#include "ut_util.h" + +int main(int argc, char **argv) +{ +#ifdef _USE_INT8 + CHECK_REQUIREMENT(argc == 3); + U32 m = atoi(argv[1]); + U32 k = atoi(argv[2]); + + DataFormat df = DF_NORMAL; + DataType dt = DT_I8; + DataType odt = DT_I32; + U32 vc, rc; + if (df == DF_NORMAL) { + vc = k; + rc = m; + } else { + vc = m; + rc = k; + } + + TensorDesc mat_desc = tensor2df(dt, df, m, k); + TensorDesc tranDesc; + TensorDesc vec_desc = tensor1d(dt, vc); + TensorDesc res_desc = tensor1d(odt, rc); + + U32 k4 = k; + if (k4 % 4 != 0) { + k4 = (k4 / 4) * 4 + 4; + } + + INT8 *mat = (INT8 *)ut_input_v(m * k, DT_I8, UT_INIT_RANDOM); + INT8 *matTran = (INT8 *)ut_input_v(m * k4, DT_I8, UT_INIT_ZERO); + INT8 *vec = (INT8 *)ut_input_v(vc, DT_I8, UT_INIT_RANDOM); + I32 *res = (I32 *)ut_input_v(rc, DT_I32, UT_INIT_ZERO); + I32 *res_ref = (I32 *)ut_input_v(rc, DT_I32, UT_INIT_ZERO); + + matrix_vector_multiply_transform_weight(mat_desc, mat, &tranDesc, matTran, UT_ARCH); + + U32 bytes; + CHECK_STATUS(matrix_vector_multiply_tmp_bytes(mat_desc, vec_desc, &bytes, UT_ARCH)); + I32 *tmp = (I32 *)ut_input_v(bytes / bytesOf(DT_I32), DT_I32, UT_INIT_ZERO); + // check + if (UT_CHECK) { + CHECK_STATUS(matrix_vector_multiply( + tranDesc, matTran, vec_desc, vec, bytes, tmp, res_desc, res, UT_ARCH)); + + // naive reference implementation + CHECK_STATUS(matrix_vector_multiply( + mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res_ref, CPU_GENERAL)); + + ut_check_v(res, res_ref, rc, DT_I32, 1, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + matrix_vector_multiply(tranDesc, matTran, vec_desc, vec, bytes, tmp, res_desc, res, UT_ARCH); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u)+(%u)=(%u)", m, k, vc, rc); + sprintf(buffer, "%20s, %80s", "MatrixVectorMultiply", params); + double ops = 2.0 * m * k; + ut_log(DT_I8, buffer, ops, time); + + free(mat); + free(matTran); + free(vec); + free(tmp); + free(res); + free(res_ref); +#endif + return 0; +} diff --git a/compute/image/CMakeLists.txt b/compute/image/CMakeLists.txt new file mode 100644 index 00000000..e9e5af57 --- /dev/null +++ b/compute/image/CMakeLists.txt @@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 3.2) + +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) +if (BOLT_CONFIGURE_FILE) + include(${BOLT_CONFIGURE_FILE}) +else (BOLT_CONFIGURE_FILE) + message(FATAL_ERROR " +FATAL: can not find bolt.cmake in directory, + please set shell or cmake environment variable BOLT_ROOT. + ") +endif (BOLT_CONFIGURE_FILE) + +project(image) + +set_c_cxx_flags() + +include_image() + +add_subdirectory(src) +add_subdirectory(tests) diff --git a/compute/image/include/image.h b/compute/image/include/image.h new file mode 100644 index 00000000..274a5066 --- /dev/null +++ b/compute/image/include/image.h @@ -0,0 +1,38 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_IMAGE +#define _H_IMAGE + +#include "tensor.hpp" +#include "tensor_desc.h" +#include "sys.h" + +#ifdef _USE_MALI +#include "gcl.h" +#include "ocl_desc_trans.h" +#endif + +typedef struct { + DataType paramDT; +} ResizeDesc; + +EE resize_infer_output_size(Tensor *inputTensor, + ResizeDesc resizeDesc, + void *params, + Tensor *outputTensor, + U32 *outputBytes, + ArchInfo_t archInfo); + +EE resize(Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo); +#endif diff --git a/image/include/image_processing.hpp b/compute/image/include/image_processing.hpp similarity index 76% rename from image/include/image_processing.hpp rename to compute/image/include/image_processing.hpp index f5ac79fd..5aa20860 100644 --- a/image/include/image_processing.hpp +++ b/compute/image/include/image_processing.hpp @@ -1,27 +1,28 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_IMAGEPROCESSING #define _H_IMAGEPROCESSING #include #include #include "tensor_desc.h" +#include "tensor.hpp" #include "error.h" std::shared_ptr load_fake_image(TensorDesc inputDesc); -std::shared_ptr load_resize_image(TensorDesc rgbDesc, void* rgb, TensorDesc imageDesc, ImageFormat targetImageFormat, float scaleValue); +std::shared_ptr load_resize_image( + Tensor rgbTensor, TensorDesc imageDesc, ImageFormat targetImageFormat, float scaleValue); #endif diff --git a/compute/image/src/CMakeLists.txt b/compute/image/src/CMakeLists.txt new file mode 100644 index 00000000..0fccaf61 --- /dev/null +++ b/compute/image/src/CMakeLists.txt @@ -0,0 +1,31 @@ +if (USE_GENERAL) + file(GLOB general_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/general/*.cpp) +endif (USE_GENERAL) + +if (USE_NEON) + file(GLOB arm_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/*.cpp) +endif (USE_NEON) + +if (USE_MALI) + file(GLOB mali_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/mali/*.cpp) + file(GLOB mali_fp16_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/mali/fp16/*.cpp) +endif (USE_MALI) + +file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) +set(srcs "${srcs};${general_srcs};${arm_srcs};${mali_srcs};${mali_fp16_srcs}") + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +# shared library +add_library(${PROJECT_NAME} SHARED ${srcs}) +target_link_libraries (${PROJECT_NAME} LINK_PUBLIC uni) + +# static library +add_library(${PROJECT_NAME}_static STATIC ${srcs}) + +set_target_properties(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") +set_target_properties(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +install(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) diff --git 
a/image/src/cpu/arm/image_arm.h b/compute/image/src/cpu/arm/image_arm.h similarity index 79% rename from image/src/cpu/arm/image_arm.h rename to compute/image/src/cpu/arm/image_arm.h index a67f1349..a42c596d 100644 --- a/image/src/cpu/arm/image_arm.h +++ b/compute/image/src/cpu/arm/image_arm.h @@ -1,17 +1,16 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_IMAGE_ARM #define _H_IMAGE_ARM @@ -21,6 +20,5 @@ #include "image.h" #include "arm_neon_expand.h" -EE resize_bilinear_arm(TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output); +EE resize_bilinear_arm(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output); #endif diff --git a/compute/image/src/cpu/arm/resize_bilinear.cpp b/compute/image/src/cpu/arm/resize_bilinear.cpp new file mode 100644 index 00000000..2c8e868d --- /dev/null +++ b/compute/image/src/cpu/arm/resize_bilinear.cpp @@ -0,0 +1,241 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <math.h> +#include <string.h> +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "image.h" +#include "cpu/arm/image_arm.h" + +#ifdef _USE_FP16 +EE resize_bilinear_fp16(TensorDesc inputDesc, F16 *inArray, TensorDesc outputDesc, F16 *outArray) +{ + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (idf != DF_NCHWC8 || odf != DF_NCHWC8) { + CHECK_STATUS(NOT_MATCH); + } + + F32 strideH = (F32)(ih - 1) / (F32)(oh - 1); + F32 strideW = (F32)(iw - 1) / (F32)(ow - 1); + + oc /= 8; + + for (U32 n = 0; n < on; n++) { + for (U32 c = 0; c < oc; c++) { + I32 outBase = n * oc * oh * ow + c * oh * ow * 8; + I32 inBase = n * oc * ih * iw + c * ih * iw * 8; + for (U32 h = 0; h < oh; h++) { + for (U32 w = 0; w < ow; w++) { + if (h == 0 && w == 0) { + memcpy(outArray + outBase, inArray + inBase, 8 * bytesOf(DT_F16)); + continue; + } + if (h == 0 && w == ow - 1) { + memcpy(outArray + outBase + w * 8, inArray + inBase + (iw - 1) * 8, + 8 * bytesOf(DT_F16)); + continue; + } + if (h == oh - 1 && w == 0) { + memcpy(outArray + outBase + h * ow * 8, + inArray + inBase + (ih - 1) * iw * 8, 8 * bytesOf(DT_F16)); + continue; + } + if (h == oh - 1 && w == ow - 1) { + memcpy(outArray + outBase + h * ow * 8 + w * 8, + inArray + inBase + (ih - 1) * iw * 8 + (iw - 1) * 8, + 8 * bytesOf(DT_F16)); + continue; + } + + F32 hC = strideH * h; + F32 wC = strideW * w; + + I32 hT = floor(hC); + I32 hB = ceil(hC); + I32 wL = floor(wC); + I32 wR = ceil(wC); + + if (hT == hB && wL == wR) { + memcpy(outArray + outBase + h * ow * 8 + w * 8, + inArray + inBase + hT * iw * 8 + wL * 8, 8 * bytesOf(DT_F16)); + } else if (hT == hB) { + float16x8_t res = {0}; + float16x8_t vecL = vld1q_f16(inArray + inBase + hT * iw * 8 + wL * 8); + float16x8_t vecR = vld1q_f16(inArray + inBase + hT * iw * 8 + wR * 8); + res = vfmaq_n_f16(res, vecL, wR - wC); + res = vfmaq_n_f16(res, vecR, wC - wL); + vst1q_f16(outArray + outBase + h * ow * 8 + w * 8, res); + } else if (wL == wR) { + float16x8_t res = {0}; + float16x8_t vecT = vld1q_f16(inArray + inBase + hT * iw * 8 + wL * 8); + float16x8_t vecB = vld1q_f16(inArray + inBase + hB * iw * 8 + wL * 8); + res = vfmaq_n_f16(res, vecT, hB - hC); + res = vfmaq_n_f16(res, vecB, hC - hT); + vst1q_f16(outArray + outBase + h * ow * 8 + w * 8, res); + } else { + float16x8_t res = {0}; + float16x8_t vecTL = vld1q_f16(inArray + inBase + hT * iw * 8 + wL * 8); + float16x8_t vecTR = vld1q_f16(inArray + inBase + hT * iw * 8 + wR * 8); + float16x8_t vecBL = vld1q_f16(inArray + inBase + hB * iw * 8 + wL * 8); + float16x8_t vecBR = vld1q_f16(inArray + inBase + hB * iw * 8 + wR * 8); + res = vfmaq_n_f16(res, vecTL, (hB - hC) * (wR - wC)); + res = vfmaq_n_f16(res, vecTR, (hB - hC) * (wC - wL)); + res = vfmaq_n_f16(res, vecBL, (hC - hT) * (wR - wC)); + res = vfmaq_n_f16(res, vecBR, (hC - hT) * (wC - wL)); + vst1q_f16(outArray + outBase + h * ow * 8 + w * 8, res); + } + } + } + } + } + return SUCCESS; +} +#endif + +#ifdef _USE_FP32 +EE resize_bilinear_fp32(TensorDesc inputDesc, F32 *inArray, TensorDesc outputDesc, F32 *outArray) +{ + DataType idt, odt; + DataFormat idf, 
odf; + U32 in, ic, ih, iw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (idf != DF_NCHWC8 || odf != DF_NCHWC8) { + CHECK_STATUS(NOT_MATCH); + } + + F32 strideH = (F32)(ih - 1) / (F32)(oh - 1); + F32 strideW = (F32)(iw - 1) / (F32)(ow - 1); + + oc /= 8; + + for (U32 n = 0; n < on; n++) { + for (U32 c = 0; c < oc; c++) { + I32 outBase = n * oc * oh * ow + c * oh * ow * 8; + I32 inBase = n * oc * ih * iw + c * ih * iw * 8; + for (U32 h = 0; h < oh; h++) { + for (U32 w = 0; w < ow; w++) { + if (h == 0 && w == 0) { + memcpy(outArray + outBase, inArray + inBase, 8 * bytesOf(DT_F32)); + continue; + } + if (h == 0 && w == ow - 1) { + memcpy(outArray + outBase + w * 8, inArray + inBase + (iw - 1) * 8, + 8 * bytesOf(DT_F32)); + continue; + } + if (h == oh - 1 && w == 0) { + memcpy(outArray + outBase + h * ow * 8, + inArray + inBase + (ih - 1) * iw * 8, 8 * bytesOf(DT_F32)); + continue; + } + if (h == oh - 1 && w == ow - 1) { + memcpy(outArray + outBase + h * ow * 8 + w * 8, + inArray + inBase + (ih - 1) * iw * 8 + (iw - 1) * 8, + 8 * bytesOf(DT_F32)); + continue; + } + + F32 hC = strideH * h; + F32 wC = strideW * w; + + I32 hT = floor(hC); + I32 hB = ceil(hC); + I32 wL = floor(wC); + I32 wR = ceil(wC); + + if (hT == hB && wL == wR) { + memcpy(outArray + outBase + h * ow * 8 + w * 8, + inArray + inBase + hT * iw * 8 + wL * 8, 8 * bytesOf(DT_F32)); + } else if (hT == hB) { + float32x4_t res[2] = {0}; + float32x4_t vecL = vld1q_f32(inArray + inBase + hT * iw * 8 + wL * 8); + float32x4_t vecL1 = vld1q_f32(inArray + inBase + hT * iw * 8 + wL * 8 + 4); + float32x4_t vecR = vld1q_f32(inArray + inBase + hT * iw * 8 + wR * 8); + float32x4_t vecR1 = vld1q_f32(inArray + inBase + hT * iw * 8 + wR * 8 + 4); + res[0] = vfmaq_n_f32(res[0], vecL, wR - wC); + res[1] = vfmaq_n_f32(res[1], vecL1, wR - wC); + res[0] = vfmaq_n_f32(res[0], vecR, wC - wL); + res[1] = vfmaq_n_f32(res[1], vecR1, wC - wL); + vst1q_f32(outArray + outBase + h * ow * 8 + w * 8, res[0]); + vst1q_f32(outArray + outBase + h * ow * 8 + w * 8 + 4, res[1]); + } else if (wL == wR) { + float32x4_t res[2] = {0}; + float32x4_t vecT = vld1q_f32(inArray + inBase + hT * iw * 8 + wL * 8); + float32x4_t vecT1 = vld1q_f32(inArray + inBase + hT * iw * 8 + wL * 8 + 4); + float32x4_t vecB = vld1q_f32(inArray + inBase + hB * iw * 8 + wL * 8); + float32x4_t vecB1 = vld1q_f32(inArray + inBase + hB * iw * 8 + wL * 8 + 4); + res[0] = vfmaq_n_f32(res[0], vecT, hB - hC); + res[1] = vfmaq_n_f32(res[1], vecT1, hB - hC); + res[0] = vfmaq_n_f32(res[0], vecB, hC - hT); + res[1] = vfmaq_n_f32(res[1], vecB1, hC - hT); + vst1q_f32(outArray + outBase + h * ow * 8 + w * 8, res[0]); + vst1q_f32(outArray + outBase + h * ow * 8 + w * 8 + 4, res[1]); + } else { + float32x4_t res[2] = {0}; + float32x4_t vecTL = vld1q_f32(inArray + inBase + hT * iw * 8 + wL * 8); + float32x4_t vecTL1 = vld1q_f32(inArray + inBase + hT * iw * 8 + wL * 8 + 4); + float32x4_t vecTR = vld1q_f32(inArray + inBase + hT * iw * 8 + wR * 8); + float32x4_t vecTR1 = vld1q_f32(inArray + inBase + hT * iw * 8 + wR * 8 + 4); + float32x4_t vecBL = vld1q_f32(inArray + inBase + hB * iw * 8 + wL * 8); + float32x4_t vecBL1 = vld1q_f32(inArray + inBase + hB * iw * 8 + wL * 8 + 4); + float32x4_t vecBR = vld1q_f32(inArray + inBase + hB * iw * 8 + wR * 8); + float32x4_t vecBR1 = vld1q_f32(inArray + inBase + hB * iw * 8 + wR * 8 + 4); + res[0] = vfmaq_n_f32(res[0], vecTL, (hB - hC) * (wR - wC)); + 
res[1] = vfmaq_n_f32(res[1], vecTL1, (hB - hC) * (wR - wC)); + res[0] = vfmaq_n_f32(res[0], vecTR, (hB - hC) * (wC - wL)); + res[1] = vfmaq_n_f32(res[1], vecTR1, (hB - hC) * (wC - wL)); + res[0] = vfmaq_n_f32(res[0], vecBL, (hC - hT) * (wR - wC)); + res[1] = vfmaq_n_f32(res[1], vecBL1, (hC - hT) * (wR - wC)); + res[0] = vfmaq_n_f32(res[0], vecBR, (hC - hT) * (wC - wL)); + res[1] = vfmaq_n_f32(res[1], vecBR1, (hC - hT) * (wC - wL)); + vst1q_f32(outArray + outBase + h * ow * 8 + w * 8, res[0]); + vst1q_f32(outArray + outBase + h * ow * 8 + w * 8 + 4, res[1]); + } + } + } + } + } + return SUCCESS; +} +#endif + +EE resize_bilinear_arm(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { +#ifdef _USE_FP16 + case DT_F16: + ret = resize_bilinear_fp16(inputDesc, (F16 *)input, outputDesc, (F16 *)output); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + ret = resize_bilinear_fp32(inputDesc, (F32 *)input, outputDesc, (F32 *)output); + break; +#endif + default: + return NOT_SUPPORTED; + } + return ret; +} diff --git a/compute/image/src/cpu/general/image_general.h b/compute/image/src/cpu/general/image_general.h new file mode 100644 index 00000000..ca83b318 --- /dev/null +++ b/compute/image/src/cpu/general/image_general.h @@ -0,0 +1,91 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
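Both NEON branches above (fp16 and fp32) implement the same four-case bilinear rule; the corner and integral-coordinate special cases exist because the naive weights collapse to zero when floor and ceil coincide. A minimal scalar sketch of the per-pixel weighting for a single-channel float plane, with illustrative names only (not part of the patch):

#include <math.h>

// One output sample of align-corners bilinear resize; (hC, wC) is the
// fractional source coordinate computed from strideH * h and strideW * w.
static float bilinear_at(const float *src, int iw, float hC, float wC)
{
    int hT = (int)floorf(hC), hB = (int)ceilf(hC);
    int wL = (int)floorf(wC), wR = (int)ceilf(wC);
    if (hT == hB && wL == wR) {  // integral coordinate: direct copy
        return src[hT * iw + wL];
    }
    if (hT == hB) {  // horizontal blend only
        return src[hT * iw + wL] * (wR - wC) + src[hT * iw + wR] * (wC - wL);
    }
    if (wL == wR) {  // vertical blend only
        return src[hT * iw + wL] * (hB - hC) + src[hB * iw + wL] * (hC - hT);
    }
    return src[hT * iw + wL] * (hB - hC) * (wR - wC)   // top-left
        + src[hT * iw + wR] * (hB - hC) * (wC - wL)    // top-right
        + src[hB * iw + wL] * (hC - hT) * (wR - wC)    // bottom-left
        + src[hB * iw + wR] * (hC - hT) * (wC - wL);   // bottom-right
}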
+ +#ifndef _H_IMAGE_GENERAL +#define _H_IMAGE_GENERAL + +#include "error.h" +#include "sys.h" +#include "tensor_desc.h" +#include "image.h" + +EE resize_bilinear_general(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output); + +template <typename T> +inline EE from_nchwc8_to_nchw(TensorDesc *desc, T *data) +{ + if (desc == nullptr || data == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + CHECK_STATUS(tensor4dGet(*desc, &idt, &idf, &in, &ic, &ih, &iw)); + if (idf != DF_NCHWC8) { + CHECK_STATUS(NOT_MATCH); + } + + *desc = tensor4df(idt, DF_NCHW, in, ic, ih, iw); + + T *tmp = (T *)malloc(tensorNumBytes(*desc)); + ic /= 8; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 hw = 0; hw < ih * iw; hw++) { + for (U32 c8 = 0; c8 < 8; c8++) { + tmp[n * ic * 8 * ih * iw + (c * 8 + c8) * ih * iw + hw] = + data[n * ic * ih * iw * 8 + c * ih * iw * 8 + hw * 8 + c8]; + } + } + } + } + memcpy(data, tmp, tensorNumBytes(*desc)); + free(tmp); + return SUCCESS; +} + +template <typename T> +inline EE from_nchw_to_nchwc8(TensorDesc *desc, T *data) +{ + if (desc == nullptr || data == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + CHECK_STATUS(tensor4dGet(*desc, &idt, &idf, &in, &ic, &ih, &iw)); + if (idf != DF_NCHW) { + CHECK_STATUS(NOT_MATCH); + } + + *desc = tensor4df(idt, DF_NCHWC8, in, ic, ih, iw); + + T *tmp = (T *)malloc(tensorNumBytes(*desc)); + ic /= 8; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 hw = 0; hw < ih * iw; hw++) { + for (U32 c8 = 0; c8 < 8; c8++) { + tmp[n * ic * ih * iw * 8 + c * ih * iw * 8 + hw * 8 + c8] = + data[n * ic * 8 * ih * iw + (c * 8 + c8) * ih * iw + hw]; + } + } + } + } + memcpy(data, tmp, tensorNumBytes(*desc)); + free(tmp); + return SUCCESS; +} +#endif diff --git a/compute/image/src/cpu/general/resize_bilinear.cpp b/compute/image/src/cpu/general/resize_bilinear.cpp new file mode 100644 index 00000000..9068d1ef --- /dev/null +++ b/compute/image/src/cpu/general/resize_bilinear.cpp @@ -0,0 +1,138 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
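The two templates above convert between plain NCHW and the blocked NCHWC8 layout, where channels are grouped in blocks of eight and the in-block lane is the innermost index. A self-contained sketch of the index mapping they implement, for one element of a hypothetical shape (not part of the patch; assumes ic is a multiple of 8, as the callers require):

#include <stdio.h>

int main()
{
    unsigned ic = 16, ih = 4, iw = 4;  // hypothetical shape, batch index 0
    unsigned c = 9, h = 2, w = 3;      // a logical NCHW coordinate
    unsigned hw = h * iw + w;
    unsigned nchw = c * ih * iw + hw;                   // plain NCHW offset
    unsigned blk = c / 8, c8 = c % 8;                   // block 1, lane 1
    unsigned nchwc8 = (blk * ih * iw + hw) * 8 + c8;    // blocked offset
    printf("NCHW %u <-> NCHWC8 %u\n", nchw, nchwc8);    // prints 155 <-> 217
    return 0;
}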
+ +#include <math.h> +#include <string.h> +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "image.h" +#include "cpu/general/image_general.h" + +template <typename IT, typename OT> +EE resize_bilinear(TensorDesc inputDesc, IT *inArray, TensorDesc outputDesc, OT *outArray) +{ + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (idf == DF_NCHWC8) { + CHECK_STATUS(from_nchwc8_to_nchw(&inputDesc, inArray)); + } + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + if (idf != DF_NCHW && idf != DF_RGB) { + CHECK_STATUS(NOT_MATCH); + } + + F32 strideH = (F32)(ih - 1) / (F32)(oh - 1); + F32 strideW = (F32)(iw - 1) / (F32)(ow - 1); + + for (U32 n = 0; n < on; n++) { + for (U32 c = 0; c < oc; c++) { + I32 outBase = n * oc * oh * ow + c * oh * ow; + I32 inBase = n * oc * ih * iw + c * ih * iw; + for (U32 h = 0; h < oh; h++) { + for (U32 w = 0; w < ow; w++) { + if (h == 0 && w == 0) { + outArray[outBase] = inArray[inBase]; + continue; + } + if (h == 0 && w == ow - 1) { + outArray[outBase + w] = inArray[inBase + iw - 1]; + continue; + } + if (h == oh - 1 && w == 0) { + outArray[outBase + h * ow] = inArray[inBase + (ih - 1) * iw]; + continue; + } + if (h == oh - 1 && w == ow - 1) { + outArray[outBase + h * ow + w] = inArray[inBase + (ih - 1) * iw + iw - 1]; + continue; + } + + F32 hC = strideH * h; + F32 wC = strideW * w; + + I32 hT = floor(hC); + I32 hB = ceil(hC); + I32 wL = floor(wC); + I32 wR = ceil(wC); + + if (hT == hB && wL == wR) { + outArray[outBase + h * ow + w] = inArray[inBase + hT * iw + wL]; + } else if (hT == hB) { + outArray[outBase + h * ow + w] = inArray[inBase + hT * iw + wL] * (wR - wC) + + inArray[inBase + hT * iw + wR] * (wC - wL); + } else if (wL == wR) { + outArray[outBase + h * ow + w] = inArray[inBase + hT * iw + wL] * (hB - hC) + + inArray[inBase + hB * iw + wL] * (hC - hT); + } else { + F32 factorTL = (hB - hC) * (wR - wC); + F32 factorTR = (hB - hC) * (wC - wL); + F32 factorBL = (hC - hT) * (wR - wC); + F32 factorBR = (hC - hT) * (wC - wL); + + outArray[outBase + h * ow + w] = inArray[inBase + hT * iw + wL] * factorTL; + outArray[outBase + h * ow + w] += inArray[inBase + hT * iw + wR] * factorTR; + outArray[outBase + h * ow + w] += inArray[inBase + hB * iw + wL] * factorBL; + outArray[outBase + h * ow + w] += inArray[inBase + hB * iw + wR] * factorBR; + } + } + } + } + } + + if (odf == DF_NCHWC8) { + outputDesc.df = DF_NCHW; + CHECK_STATUS(from_nchw_to_nchwc8(&outputDesc, outArray)); + } + return SUCCESS; +} + +EE resize_bilinear_general(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { +#ifdef __aarch64__ + case DT_F16: { + ret = resize_bilinear(inputDesc, (F16 *)input, outputDesc, (F16 *)output); + break; + } +#endif +#ifdef _USE_FP32 + case DT_F32: { + ret = resize_bilinear(inputDesc, (F32 *)input, outputDesc, (F32 *)output); + break; + } +#endif + case DT_U8: { +#ifdef __aarch64__ + if (DT_F16 == outputDesc.dt) { + ret = resize_bilinear(inputDesc, (U8 *)input, outputDesc, (F16 *)output); + } +#endif +#ifdef _USE_FP32 + if (DT_F32 == outputDesc.dt) { + ret = resize_bilinear(inputDesc, (U8 *)input, outputDesc, (F32 *)output); + } +#endif + break; + } + default: + return NOT_SUPPORTED; + } + return ret; +} diff --git a/compute/image/src/gpu/mali/cl/resize_bilinear.cl b/compute/image/src/gpu/mali/cl/resize_bilinear.cl new 
file mode 100644 index 00000000..6aa3c70e --- /dev/null +++ b/compute/image/src/gpu/mali/cl/resize_bilinear.cl @@ -0,0 +1,72 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__kernel void resize_bilinear(const int ih, + const int ih_str, + const int ih_off, + const int iw, + const int iw_str, + const int iw_off, + const int oh, + const int oh_str, + const int oh_off, + const int ow, + const int ow_str, + const int ow_off, + const float ratioh, + const float ratiow, + __global T *input, + __global T *output) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + + if (idx >= oh || idy >= ow) { + return; + } + + float2 posi; + float2 ratio; + ratio.x = ratioh; + ratio.y = ratiow; + + posi.x = (float)idx * ratio.x; + posi.y = (float)idy * ratio.y; + + int4 tblr; + tblr.x = max(0, (int)floor(posi.y)); // T + tblr.y = min(tblr.x + 1, iw - 1); // B + tblr.z = max(0, (int)floor(posi.x)); // L + tblr.w = min(tblr.z + 1, ih - 1); // R + + int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; + int4 in_off; + in_off.x = (idz * iw_str + tblr.x + iw_off) * ih_str + tblr.z + ih_off; // TL_off + in_off.y = (idz * iw_str + tblr.x + iw_off) * ih_str + tblr.w + ih_off; // TR_off + in_off.z = (idz * iw_str + tblr.y + iw_off) * ih_str + tblr.z + ih_off; // BL_off + in_off.w = (idz * iw_str + tblr.y + iw_off) * ih_str + tblr.w + ih_off; // BR_off + + T4 val_TL, val_TR, val_BL, val_BR; + val_TL = vload4(0, input + (in_off.x << 2)); + val_TR = vload4(0, input + (in_off.y << 2)); + val_BL = vload4(0, input + (in_off.z << 2)); + val_BR = vload4(0, input + (in_off.w << 2)); + float dif1 = posi.x - (float)tblr.z; // C-L + float dif2 = posi.y - (float)tblr.x; // C-T + + T4 top = mad((val_TR - val_TL), dif1, val_TL); + T4 bottom = mad((val_BR - val_BL), dif1, val_BL); + T4 out = mad((bottom - top), dif2, top); + vstore4(out, 0, output + (out_off << 2)); +} diff --git a/compute/image/src/gpu/mali/cl/resize_bilinear_nchw.cl b/compute/image/src/gpu/mali/cl/resize_bilinear_nchw.cl new file mode 100644 index 00000000..1a452573 --- /dev/null +++ b/compute/image/src/gpu/mali/cl/resize_bilinear_nchw.cl @@ -0,0 +1,72 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__kernel void resize_bilinear_nchw(const int ih, + const int ih_str, + const int ih_off, + const int iw, + const int iw_str, + const int iw_off, + const int oh, + const int oh_str, + const int oh_off, + const int ow, + const int ow_str, + const int ow_off, + const float ratioh, + const float ratiow, + __global T *input, + __global T *output) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + + if (idx >= ow || idy >= oh) { + return; + } + + float2 posi; + float2 ratio; + ratio.x = ratiow; + ratio.y = ratioh; + + posi.x = (float)idx * ratio.x; + posi.y = (float)idy * ratio.y; + + int4 tblr; + tblr.x = max(0, (int)floor(posi.x)); // L + tblr.y = min(tblr.x + 1, iw - 1); // R + tblr.z = max(0, (int)floor(posi.y)); // T + tblr.w = min(tblr.z + 1, ih - 1); // B + + int4 in_off; + in_off.x = (idz * ih_str + tblr.z + ih_off) * iw_str + tblr.x + iw_off; // TL_off + in_off.y = (idz * ih_str + tblr.z + ih_off) * iw_str + tblr.y + iw_off; // TR_off + in_off.z = (idz * ih_str + tblr.w + ih_off) * iw_str + tblr.x + iw_off; // BL_off + in_off.w = (idz * ih_str + tblr.w + ih_off) * iw_str + tblr.y + iw_off; // BR_off + + T val_TL, val_TR, val_BL, val_BR; + val_TL = input[in_off.x]; + val_TR = input[in_off.y]; + val_BL = input[in_off.z]; + val_BR = input[in_off.w]; + float dif1 = posi.x - (float)tblr.x; // C-L + float dif2 = posi.y - (float)tblr.z; // C-T + + float top = mad((float)(val_TR - val_TL), dif1, (float)val_TL); + float bottom = mad((float)(val_BR - val_BL), dif1, (float)val_BL); + T out = mad((bottom - top), dif2, top); + int out_off = (idz * oh_str + idy + oh_off) * ow_str + idx + ow_off; + output[out_off] = out; +} diff --git a/compute/image/src/gpu/mali/fp16/resize_bilinear_mali_fp16.cpp b/compute/image/src/gpu/mali/fp16/resize_bilinear_mali_fp16.cpp new file mode 100644 index 00000000..f098bd75 --- /dev/null +++ b/compute/image/src/gpu/mali/fp16/resize_bilinear_mali_fp16.cpp @@ -0,0 +1,113 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "image.h" +#include "gpu/mali/fp16/resize_bilinear_mali_fp16.h" +#include <stdio.h> + +inline EE resize_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt) { + return NOT_SUPPORTED; + } + if (outputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE resize_bilinear_core_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + U32 iw, ih, ic, in; + U32 ow, oh, oc, on; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + F32 v_ratio[2] = {(F32)(ih - 1) / (F32)(oh - 1), (F32)(iw - 1) / (F32)(ow - 1)}; + + U32 iw_str, ih_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + + cl_mem inbuf = input->mem; + cl_mem outbuf = output->mem; + + U32 gs[3] = {oh, ow, (oc + 3) / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "resize_bilinear", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, ih_str, ih_off, iw, iw_str, iw_off, oh, oh_str, + oh_off, ow, ow_str, ow_off, v_ratio[0], v_ratio[1], inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "resize_bilinear"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "resize_bilinear")); + CHECK_STATUS(gcl_print_memory(handle, input, "resize_input")); + CHECK_STATUS(gcl_print_memory(handle, output, "resize_output")); +#endif + return SUCCESS; +} + +inline EE resize_bilinear_core_nchw_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + U32 iw, ih, ic, in; + U32 ow, oh, oc, on; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + F32 v_ratio[2] = {(F32)(ih - 1) / (F32)(oh - 1), (F32)(iw - 1) / (F32)(ow - 1)}; + + U32 iw_str, ih_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + + cl_mem inbuf = input->mem; + cl_mem outbuf = output->mem; + char kernelname[128]; + sprintf(kernelname, 
"resize_bilinear_nchw"); + U32 gs[3] = {ow, oh, oc}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, ih_str, ih_off, iw, iw_str, iw_off, oh, oh_str, + oh_off, ow, ow_str, ow_off, v_ratio[0], v_ratio[1], inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + +EE resize_bilinear_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + CHECK_STATUS(resize_checkpara_mali_fp16(inputDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + if (input->desc.memFormat == DF_NCHW) { + CHECK_STATUS( + resize_bilinear_core_nchw_mali_fp16(handle, inputDesc, input, outputDesc, output)); + } else { + CHECK_STATUS(resize_bilinear_core_mali_fp16(handle, inputDesc, input, outputDesc, output)); + } + return SUCCESS; +} diff --git a/compute/image/src/gpu/mali/fp16/resize_bilinear_mali_fp16.h b/compute/image/src/gpu/mali/fp16/resize_bilinear_mali_fp16.h new file mode 100644 index 00000000..02bc8a63 --- /dev/null +++ b/compute/image/src/gpu/mali/fp16/resize_bilinear_mali_fp16.h @@ -0,0 +1,20 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _RESIZE_MALI_FP16 +#define _RESIZE_MALI_FP16 +#include "image.h" + +EE resize_bilinear_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output); +#endif diff --git a/compute/image/src/gpu/mali/image_mali.h b/compute/image/src/gpu/mali/image_mali.h new file mode 100644 index 00000000..110e67d6 --- /dev/null +++ b/compute/image/src/gpu/mali/image_mali.h @@ -0,0 +1,30 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_IMAGE_MALI +#define _H_IMAGE_MALI + +#include "image.h" + +EE resize_infer_output_size_mali(TensorDesc inputDesc, + ResizeDesc resizeDesc, + void *params, + TensorDesc *outputDesc, + U32 *outputBytes, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE resize_bilinear_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output); + +#endif diff --git a/compute/image/src/gpu/mali/resize_bilinear.cpp b/compute/image/src/gpu/mali/resize_bilinear.cpp new file mode 100644 index 00000000..bd5280cb --- /dev/null +++ b/compute/image/src/gpu/mali/resize_bilinear.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
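One convention to note before the dispatch code below: every backend in this patch (the general CPU path, the NEON path, and both OpenCL kernels) derives its stride or ratio as (in - 1) / (out - 1), so the first and last samples of each axis land exactly on the input corners, i.e. "align corners" semantics. A tiny self-check under assumed sizes:

#include <assert.h>

int main()
{
    int ih = 4, oh = 7;                                // hypothetical heights
    float ratioh = (float)(ih - 1) / (float)(oh - 1);  // 0.5f
    assert(0.0f * ratioh == 0.0f);                     // first output row reads input row 0
    assert((oh - 1) * ratioh == (float)(ih - 1));      // last output row reads input row ih - 1
    return 0;
}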
+ +#include "gpu/mali/image_mali.h" +#include "gpu/mali/fp16/resize_bilinear_mali_fp16.h" + +EE resize_infer_output_size_mali(TensorDesc inputDesc, + ResizeDesc resizeDesc, + void *params, + TensorDesc *outputDesc, + U32 *outputBytes, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + if (outputDesc == nullptr || gclmemInputDesc == nullptr || outputBytes == nullptr || + gclmemOutputDesc == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + U32 oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + switch (resizeDesc.paramDT) { + case DT_F32: { + F32 *scales = (F32 *)params; + oh = ih * scales[0]; + ow = iw * scales[1]; + break; + } + case DT_U32: { + U32 *len = (U32 *)params; + oh = len[0]; + ow = len[1]; + break; + } + default: { + return NOT_SUPPORTED; + } + } + *outputDesc = tensor4df(idt, DF_NCHW, in, ic, oh, ow); + *outputBytes = tensorNumBytes(*outputDesc); + if ((idf == gclmemInputDesc->byteSize == 0 || gclmemInputDesc->memFormat == DF_NCHW) && ic <= 2) { + CHECK_STATUS(infer_gclmem_desc_nchw( + iw, ih, ic, 0, 0, ow, oh, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + } else { + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, ow, oh, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + } + return SUCCESS; +} + +inline EE resize_checkpara_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + return SUCCESS; +} + +EE resize_bilinear_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(resize_checkpara_mali(handle, inputDesc, input, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = resize_bilinear_mali_fp16(handle, inputDesc, input, outputDesc, output); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/image/src/image_processing.cpp b/compute/image/src/image_processing.cpp new file mode 100644 index 00000000..d7feecdd --- /dev/null +++ b/compute/image/src/image_processing.cpp @@ -0,0 +1,267 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include <math.h> +#include <memory> +#include <string.h> +#include "image.h" +#include "tensor_desc.h" +#include "tensor.hpp" +#include "types.h" +#include "error.h" + +template <typename T> +std::shared_ptr<Tensor> get_resize_image( + Tensor rgbTensor, TensorDesc imageDesc, ImageFormat targetImageFormat, float scaleValue) +{ + ArchInfo archInfo; + auto arch = CPU_GENERAL; + archInfo.arch = arch; + DataType rgbDt = DT_F16, imageDt = DT_F16; + DataFormat rgbDf = DF_RGB, imageDf = DF_RGB; + U32 rgbNum = 0, rgbChannel = 0, rgbHeight = 0, rgbWidth = 0; + U32 imageNum = 0, imageChannel = 0, imageHeight = 0, imageWidth = 0; + TensorDesc rgbDesc = rgbTensor.get_desc(); + CHECK_STATUS(tensor4dGet(rgbDesc, &rgbDt, &rgbDf, &rgbNum, &rgbChannel, &rgbHeight, &rgbWidth)); + CHECK_REQUIREMENT(rgbDf == DF_RGB); + CHECK_REQUIREMENT(rgbChannel == 3); + CHECK_REQUIREMENT(rgbNum == 1); + + CHECK_STATUS(tensor4dGet( + imageDesc, &imageDt, &imageDf, &imageNum, &imageChannel, &imageHeight, &imageWidth)); + CHECK_REQUIREMENT(imageDf == DF_NCHW); + CHECK_REQUIREMENT(imageNum == 1); + + U32 height = rgbHeight; + U32 width = rgbWidth; + + Tensor temp; + std::shared_ptr<Tensor> transferSpaceTensor(new Tensor()); + transferSpaceTensor->resize(imageDesc); + transferSpaceTensor->alloc(); + T *transferSpacePtrMov = (T *)get_ptr_from_tensor(*transferSpaceTensor, arch); + + // magic number + float meanRGB[3] = {122.6789143406786, 116.66876761696767, 104.0069879317889}; + float meanRGBSC[3] = {0.485, 0.456, 0.406}; + float stdRGBSC[3] = {0.229, 0.224, 0.225}; + + U32 transform[3]; + switch (targetImageFormat) { + case RGB: + transform[0] = 0; + transform[1] = 1; + transform[2] = 2; + break; + case BGR: + transform[0] = 2; + transform[1] = 1; + transform[2] = 0; + break; + case BGR_SC_RAW: + transform[0] = 2; + transform[1] = 1; + transform[2] = 0; + break; + case RGB_SC: + transform[0] = 0; + transform[1] = 1; + transform[2] = 2; + break; + case RGB_RAW: + transform[0] = 0; + transform[1] = 1; + transform[2] = 2; + break; + case RGB_SC_RAW: + transform[0] = 0; + transform[1] = 1; + transform[2] = 2; + break; + default: + UNI_ERROR_LOG("[ERROR] unsupported image format\n"); + return nullptr; + } + + // consider the dataformat + if (targetImageFormat == RGB_SC) { // Specific for Birealnet18, scale short edge to 224 first + F32 scale = 224.0 / UNI_MIN(height, width); + if (height < width) { + height = 224; + width = (U32)(scale * width + 0.5); + } else { + height = (U32)(scale * height + 0.5); + width = 224; + } + Tensor scaleTensor; + TensorDesc scaledDesc = tensor4df(imageDt, imageDf, imageNum, imageChannel, height, width); + scaleTensor.resize(scaledDesc); + scaleTensor.alloc(); + resize(rgbTensor, temp, scaleTensor, &archInfo); + + U32 h0 = (U32)((height - 224) * 0.5); + U32 w0 = (U32)((width - 224) * 0.5); + + T *scaled = (T *)get_ptr_from_tensor(scaleTensor, arch); + for (U32 c : transform) { + for (U32 h = h0; h < h0 + imageHeight; h++) { + for (U32 w = w0; w < w0 + imageWidth; w++) { + T value = (scaled[c * height * width + h * width + w] / 255 - meanRGBSC[c]) / + stdRGBSC[c]; + CHECK_REQUIREMENT(!UNI_ISNAN(value)); + *transferSpacePtrMov = value; + transferSpacePtrMov++; + } + } + } + } else if (targetImageFormat == RGB_RAW) { + resize(rgbTensor, temp, *transferSpaceTensor.get(), &archInfo); + } else if (targetImageFormat == RGB_SC_RAW || targetImageFormat == BGR_SC_RAW) { + F32 scale = 256.0 / UNI_MIN(height, width); + if (height < width) { + height = 256; + width = (U32)(scale * (F32)width + 0.5); + } else { + height = (U32)(scale * (F32)height + 0.5); + width = 256; 
+ } + Tensor scaleTensor; + TensorDesc scaledDesc = tensor4df(imageDt, imageDf, imageNum, imageChannel, height, width); + scaleTensor.resize(scaledDesc); + scaleTensor.alloc(); + resize(rgbTensor, temp, scaleTensor, &archInfo); + + U32 h0 = (U32)((height - 224) * 0.5); + U32 w0 = (U32)((width - 224) * 0.5); + + T *scaled = (T *)get_ptr_from_tensor(scaleTensor, arch); + for (U32 c : transform) { + for (U32 h = h0; h < h0 + 224; h++) { + memcpy(transferSpacePtrMov, scaled + c * height * width + h * width + w0, + 224 * bytesOf(imageDt)); + transferSpacePtrMov += 224; + } + } + } else { + Tensor scaleTensor; + scaleTensor.resize(imageDesc); + scaleTensor.alloc(); + resize(rgbTensor, temp, scaleTensor, &archInfo); + + T *resized = (T *)get_ptr_from_tensor(scaleTensor, arch); + for (U32 c : transform) { + for (U32 h = 0; h < imageHeight; h++) { + for (U32 w = 0; w < imageWidth; w++) { + T value = (resized[c * imageHeight * imageWidth + h * imageWidth + w] - + 1.0 * meanRGB[c]) * + scaleValue; + CHECK_REQUIREMENT(!UNI_ISNAN(value)); + *transferSpacePtrMov = value; + transferSpacePtrMov++; + } + } + } + } + return transferSpaceTensor; +} + +// CImg loads images in RGB order +// OpenCV loads images in BGR order +// PIL loads images in RGB order +// scikit-image loads images in RGB order +// If you want to use another format, please set targetImageFormat +// numpy uses OpenCV to load images + +// Assume most networks require 224*224 inputs +std::shared_ptr<Tensor> load_resize_image( + Tensor rgbTensor, TensorDesc imageDesc, ImageFormat targetImageFormat, float scaleValue) +{ + DataType imageDt = DT_F32; + DataFormat imageDf; + U32 imageNum, imageChannel, imageHeight, imageWidth; + + CHECK_STATUS(tensor4dGet( + imageDesc, &imageDt, &imageDf, &imageNum, &imageChannel, &imageHeight, &imageWidth)); + + switch (imageDt) { +#ifdef __aarch64__ + case DT_F16: { + return get_resize_image<F16>(rgbTensor, imageDesc, targetImageFormat, scaleValue); + } +#endif +#ifdef _USE_FP32 + case DT_F32: { + return get_resize_image<F32>(rgbTensor, imageDesc, targetImageFormat, scaleValue); + } +#endif + default: { + CHECK_STATUS(NOT_SUPPORTED); + return nullptr; + } + } +} + +template <typename T> +std::shared_ptr<U8> gen_fake_image(TensorDesc inputDesc) +{ + DataType dt; + DataFormat df; + U32 in = 0, ic = 0, ih = 0, iw = 0; + CHECK_STATUS(tensor4dGet(inputDesc, &dt, &df, &in, &ic, &ih, &iw)); + CHECK_REQUIREMENT(df == DF_NCHW); + CHECK_REQUIREMENT(in == 1); + + U32 totalBytes = tensorNumBytes(inputDesc); + + // depending on the data type, malloc the corresponding space + T *transferSpacePtr = (T *)operator new(totalBytes); + T *transferSpacePtrMov = transferSpacePtr; + + // consider the dataformat + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < ih; h++) { + for (U32 w = 0; w < iw; w++) { + *transferSpacePtrMov = 1; + transferSpacePtrMov++; + } + } + } + + std::shared_ptr<U8> val((U8 *)transferSpacePtr); + return val; +} + +std::shared_ptr<U8> load_fake_image(TensorDesc inputDesc) +{ + DataType dt = DT_F32; + DataFormat df; + U32 in, ic, ih, iw; + CHECK_STATUS(tensor4dGet(inputDesc, &dt, &df, &in, &ic, &ih, &iw)); + + switch (dt) { +#ifdef __aarch64__ + case DT_F16: { + return gen_fake_image<F16>(inputDesc); + } +#endif +#ifdef _USE_FP32 + case DT_F32: { + return gen_fake_image<F32>(inputDesc); + } +#endif + default: { + CHECK_STATUS(NOT_SUPPORTED); + return nullptr; + } + } +} diff --git a/compute/image/src/resize.cpp b/compute/image/src/resize.cpp new file mode 100644 index 00000000..9f97b86e --- /dev/null +++ b/compute/image/src/resize.cpp @@ 
-0,0 +1,157 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "image.h" +#ifdef _USE_GENERAL +#include "cpu/general/image_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/image_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/image_mali.h" +#endif +#include <string.h> + +// params is a pointer to either the target size or the resize ratios +// When resizeDesc specifies DT_U32, params should point to target sizes (height and width) +// When resizeDesc specifies DT_F32, params should point to resize ratios +EE resize_infer_output_size_cpu(TensorDesc inputDesc, + ResizeDesc resizeDesc, + void *params, + TensorDesc *outputDesc, + U32 *outputBytes) +{ + if (nullptr == outputDesc || nullptr == outputBytes) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + U32 oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + + switch (resizeDesc.paramDT) { + case DT_F32: { + F32 *scales = (F32 *)params; + oh = ih * scales[0]; + ow = iw * scales[1]; + break; + } + case DT_U32: { + U32 *len = (U32 *)params; + oh = len[0]; + ow = len[1]; + break; + } + default: { + return NOT_SUPPORTED; + } + } + *outputDesc = tensor4df(idt, idf, in, ic, oh, ow); + *outputBytes = tensorNumBytes(*outputDesc); + return SUCCESS; +} + +EE resize_infer_output_size(Tensor *inputTensor, + ResizeDesc resizeDesc, + void *params, + Tensor *outputTensor, + U32 *outputBytes, + ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = resize_infer_output_size_mali(inputDesc, resizeDesc, params, &outputDesc, outputBytes, + &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = resize_infer_output_size_cpu(inputDesc, resizeDesc, params, &outputDesc, outputBytes); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE resize(Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); 
+ void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + CHECK_REQUIREMENT(in == on && ic == oc); + + if (ih == oh && iw == ow && archInfo->arch != MALI) { + memcpy(output, input, tensorNumBytes(inputDesc)); + return SUCCESS; + } + + TensorDesc inDescARM = inputDesc; + U8 *inputARM = (U8 *)input; + TensorDesc outDescARM = outputDesc; + U8 *outputARM = (U8 *)output; + if (DF_NCHWC8 != inputDesc.df && IS_ARM(arch)) { + U32 paddedC = (inputDesc.dims[2] + 7) / 8 * 8; + inDescARM.dims[2] = paddedC; + inDescARM.df = DF_NCHWC8; + outDescARM.dims[2] = paddedC; + outDescARM.df = DF_NCHWC8; + inputARM = (U8 *)tmp; + outputARM = inputARM + tensorNumBytes(inDescARM); + transformNCHWToNCHWC8(inputDesc, input, inDescARM, inputARM); + } + EE ret = NOT_SUPPORTED; + + if (IS_GENERAL(arch) || IS_X86_AVX2(arch)) { +#if defined(_USE_GENERAL) || defined(_USE_X86) + ret = resize_bilinear_general(inputDesc, input, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = resize_bilinear_arm(inDescARM, inputARM, outDescARM, outputARM); +#endif + +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = resize_bilinear_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, outputDesc, (GCLMem_t)output); + +#endif + } + if (DF_NCHWC8 != outputDesc.df && IS_ARM(arch)) { + transformToNCHW(outDescARM, outputARM, outputDesc, output); + } + return ret; +} diff --git a/compute/image/tests/CMakeLists.txt b/compute/image/tests/CMakeLists.txt new file mode 100644 index 00000000..60dcb75c --- /dev/null +++ b/compute/image/tests/CMakeLists.txt @@ -0,0 +1,12 @@ +function(image_test name) + add_executable(${name} ${name}.cpp) + link_image(${name}) +endfunction() + +set_test_c_cxx_flags() + +#image_test(test_image_processing) +#image_test(test_image_resize) +if (USE_MALI) + image_test(test_image_resize_ocl test_image_resize_ocl.cpp) +endif (USE_MALI) diff --git a/tests/test_image_processing.cpp b/compute/image/tests/test_image_processing.cpp similarity index 81% rename from tests/test_image_processing.cpp rename to compute/image/tests/test_image_processing.cpp index 95c833d9..0a1ff9c0 100644 --- a/tests/test_image_processing.cpp +++ b/compute/image/tests/test_image_processing.cpp @@ -1,25 +1,24 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "ut_util.h" #include "tensor_desc.h" #include "image_processing.hpp" - -int main() { +int main() +{ TensorDesc rgbDesc = tensor4df(DT_U8, DF_RGB, 1, 3, 1280, 960); - U8* rgb = ut_input_v(tensorNumElements(rgbDesc), DT_U8, UT_INIT_POS); + U8 *rgb = ut_input_v(tensorNumElements(rgbDesc), DT_U8, UT_INIT_POS); TensorDesc imageDesc = tensor4df(DT_F32, DF_NCHW, 1, 3, 224, 224); load_resize_image(rgbDesc, rgb, imageDesc, RGB, 0.017); diff --git a/compute/image/tests/test_image_resize.cpp b/compute/image/tests/test_image_resize.cpp new file mode 100644 index 00000000..e40b48d2 --- /dev/null +++ b/compute/image/tests/test_image_resize.cpp @@ -0,0 +1,104 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
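The unit test that follows takes eight positional arguments, input then output N/C/H/W, and requires batch 1 with both channel counts multiples of 8. A hypothetical invocation that doubles each spatial dimension: ./test_image_resize 1 8 16 16 1 8 32 32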
+ +#include +#include "image.h" +#include "ut_util.h" + +int resizeTest(int argc, char *argv[], DataType dt) +{ + CHECK_REQUIREMENT(argc == 9); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // output + U32 on = atoi(argv[5]); + U32 oc = atoi(argv[6]); + U32 oh = atoi(argv[7]); + U32 ow = atoi(argv[8]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + CHECK_REQUIREMENT(in == 1 && on == 1); + CHECK_REQUIREMENT(ic % 8 == 0 && oc % 8 == 0); + + TensorDesc inputDesc, outputDesc; + ResizeDesc resizeDesc; + inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + + resizeDesc.paramDT = DT_F32; + F32 scales[2]; + scales[0] = (F32)oh / (F32)ih; + scales[1] = (F32)ow / (F32)iw; + + // setup input, filter + U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + U8 *input_ref = ut_input_v(in * ic * ih * iw, dt, UT_INIT_ZERO); + memcpy(input_ref, input, bytesOf(dt) * in * ic * ih * iw); + + // setup output + U32 outputBytes; + CHECK_STATUS(resize_infer_output_size( + inputDesc, resizeDesc, scales, &outputDesc, &outputBytes, &archInfo)); + CHECK_REQUIREMENT(tensorNumElements(outputDesc) == on * oc * oh * ow); + U32 output_size = outputBytes / bytesOf(dt); + U8 *output = ut_input_v(output_size, dt, UT_INIT_ZERO); + U8 *output_ref = ut_input_v(output_size, dt, UT_INIT_ZERO); + + if (UT_CHECK) { + CHECK_STATUS(resize(inputDesc, input, nullptr, outputDesc, output, &archInfo)); + + // naive implement + CHECK_STATUS(resize(inputDesc, input_ref, nullptr, outputDesc, output_ref, &archInfo_org)); + + // check + ut_check_v(output, output_ref, output_size, dt, 0.05, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(resize(inputDesc, input_ref, nullptr, outputDesc, output_ref, &archInfo_org)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)=>(%u %u %u %u)", in, ic, ih, iw, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Resize", params); + double ops = 15.0 * on * oc * oh * ow; + ut_log(dt, buffer, ops, time); + + free(input); + free(output); + free(input_ref); + free(output_ref); + return 0; +} + +int main(int argc, char *argv[]) +{ +#ifdef _USE_FP16 + resizeTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + resizeTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/image/tests/test_image_resize_ocl.cpp b/compute/image/tests/test_image_resize_ocl.cpp new file mode 100644 index 00000000..41371b70 --- /dev/null +++ b/compute/image/tests/test_image_resize_ocl.cpp @@ -0,0 +1,164 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "types.h" +#include "image.h" +#include "ut_util.h" +#include "gcl.h" +#include "libkernelsource.h" +#include "tensor_computing.h" + +#ifdef _USE_FP16 +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} +int resizeTest(int argc, char *argv[], DataType dt) +{ + CHECK_REQUIREMENT(argc == 9); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // output + U32 on = atoi(argv[5]); + U32 oc = atoi(argv[6]); + U32 oh = atoi(argv[7]); + U32 ow = atoi(argv[8]); + + CHECK_REQUIREMENT(in == 1 && on == 1); + + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + TensorDesc inputDesc_cpu, inputDesc_gpu, outputDesc_cpu, outputDesc_gpu; + ResizeDesc resizeDesc; + inputDesc_cpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + inputDesc_gpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + + resizeDesc.paramDT = DT_F32; + F32 scales[2]; + scales[0] = (F32)oh / (F32)ih; + scales[1] = (F32)ow / (F32)iw; + + // setup input + U8 *input_cpu = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + + Tensor inputTensorCpu; + inputTensorCpu.resize(inputDesc_cpu); + inputTensorCpu.alloc(); + memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc_cpu)); + Tensor outputTensorCpu; + Tensor tmpTensorCpu; + U32 outputBytes; + CHECK_STATUS(resize_infer_output_size( + &inputTensorCpu, resizeDesc, scales, &outputTensorCpu, &outputBytes, &archInfo_org)); + outputTensorCpu.alloc(); + + // naive implement + // CPU output + CHECK_STATUS(resize(inputTensorCpu, tmpTensorCpu, outputTensorCpu, &archInfo_org)); + std::shared_ptr<GCLHandle> handleSharedPtr = OCLContext::getInstance().handle; + GCLHandle_t handle = handleSharedPtr.get(); + std::vector<Kernel> kernelVec; + handle->kernelVec = &kernelVec; + Tensor inputTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + inputTensor.resize(inputDesc_gpu); + U8 *output_gpu = NULL; + + MaliPara maliPara; + maliPara.handle = handle; + archInfo.archPara = &maliPara; + + CHECK_STATUS(resize_infer_output_size( + &inputTensor, resizeDesc, scales, &outputTensor, &outputBytes, &archInfo)); + U32 maxBytes = 0; + U32 tmpBytes = 0; + + GCLMem_t output = alloc_map(outputTensor); + GCLMem_t input = alloc(inputTensor); + CHECK_STATUS(gcl_fill_memory_zero(handle, input)); + tmpBytes = tensorNumBytes(inputDesc_gpu); + maxBytes = (tmpBytes > maxBytes) ? 
tmpBytes : maxBytes; + GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes); + CHECK_STATUS(ocl_set_input(handle, input, inputDesc_gpu, input_cpu, tmpbuf, true)); + + CHECK_STATUS(resize(inputTensor, tmpTensor, outputTensor, &archInfo)); + /*warp up*/ + UNI_INFO_LOG("Warp up gpu:\n") + for (U32 i = 0; i < 2; i++) { + CHECK_STATUS(gcl_run_kernelVec(handle)); + } + + UNI_INFO_LOG("Run:\n") +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size())); + double time = handle->t_execute * 0.001; +#else + CHECK_STATUS(gcl_run_kernelVec(handle)); +#endif + outputDesc_gpu = outputTensor.get_desc(); + ; + CHECK_STATUS(ocl_get_output(handle, output, outputDesc_gpu, true)); + output_gpu = output->mapPtrArray.back(); + + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)->(%u %u %u %u)", in, ic, ih, iw, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "bilinear", params); +#ifdef _DEBUG + double ops = on * oc * oh * ow * 4; // TO DO + ut_log(dt, buffer, ops, time); +#endif + ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), on * oc * ow * oh, dt); + CHECK_STATUS(gcl_finish(handle)); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + free(input_cpu); + return 0; +} +#endif + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + resizeTest(argc, argv, DT_F16); +#endif + return 0; +} diff --git a/compute/tensor/CMakeLists.txt b/compute/tensor/CMakeLists.txt new file mode 100644 index 00000000..b0bd366e --- /dev/null +++ b/compute/tensor/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.2) + +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) +if (BOLT_CONFIGURE_FILE) + include(${BOLT_CONFIGURE_FILE}) +else (BOLT_CONFIGURE_FILE) + message(FATAL_ERROR " +FATAL: can not find bolt.cmake in directory, + please set shell or cmake environment variable BOLT_ROOT. + ") +endif (BOLT_CONFIGURE_FILE) + +project(tensor) + +set_c_cxx_flags() + +include_tensor() + +add_subdirectory(src) +add_subdirectory(tests) diff --git a/compute/tensor/include/tensor_computing.h b/compute/tensor/include/tensor_computing.h new file mode 100644 index 00000000..42816f46 --- /dev/null +++ b/compute/tensor/include/tensor_computing.h @@ -0,0 +1,702 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
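+
+// Usage sketch (illustrative only; the local tensor names are hypothetical,
+// and it assumes a CPU_GENERAL build plus the POOLING_MAX/CEIL enum values
+// and createPoolingParamSpec helper from tensor_computing_type.h). Most
+// operators declared below follow the same infer-size / alloc / execute flow:
+//
+//     ArchInfo archInfo;
+//     archInfo.arch = CPU_GENERAL;
+//     Tensor in, out, tmp;
+//     in.resize(tensor4df(DT_F32, DF_NCHW, 1, 8, 16, 16));
+//     in.alloc();
+//     PoolingParamSpec p = createPoolingParamSpec(
+//         POOLING_MAX, 2, 2, 2, 2, 0, 0, 0, 0, CEIL);
+//     CHECK_STATUS(pooling_infer_output_size(&in, p, &out, &archInfo));
+//     out.alloc();
+//     U32 tmpBytes = 0;
+//     CHECK_STATUS(pooling_infer_forward_tmp_bytes(in, out, &tmpBytes, &archInfo));
+//     tmp.resize(tensor1d(DT_U8, tmpBytes));
+//     tmp.alloc();
+//     CHECK_STATUS(pooling(in, p, tmp, out, &archInfo));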
+
+#ifndef _H_TENSOR_COMPUTING
+#define _H_TENSOR_COMPUTING
+
+#include <vector>
+#include "sys.h"
+#include "types.h"
+#include "tensor.hpp"
+#include "tensor_computing_type.h"
+
+EE convolution_infer_output_size(Tensor *inputTensor,
+ Tensor filterTensor,
+ ConvolutionParamSpec convParamSpec,
+ Tensor *outputTensor,
+ DataType targetDataType,
+ ArchInfo_t archInfo);
+
+EE convolution_infer_forward_algorithm(Tensor inputTensor,
+ Tensor filterTensor,
+ Tensor outputTensor,
+ ConvolutionParamSpec convParamSpec,
+ ConvolutionPolicy policy,
+ ConvolutionForwardAlgorithm *algorithm,
+ DataType targetDataType,
+ ActivationParamSpec activationDesc,
+ ArchInfo_t archInfo);
+
+EE convolution_transform_filter_bytes(Tensor filterTensor,
+ ConvolutionParamSpec convParamSpec,
+ ConvolutionForwardAlgorithm algorithm,
+ U32 *bytes,
+ ArchInfo_t archInfo);
+
+EE convolution_transform_filter(Tensor filterTensor,
+ ConvolutionParamSpec convParamSpec,
+ ConvolutionForwardAlgorithm algorithm,
+ Tensor tmpTensor,
+ Tensor *ftmTensor,
+ ArchInfo_t archInfo);
+
+EE convolution_infer_forward_tmp_bytes(Tensor inputTensor,
+ Tensor filterTensor,
+ Tensor outputTensor,
+ ConvolutionParamSpec convParamSpec,
+ ConvolutionForwardAlgorithm algorithm,
+ U32 *bytes,
+ ArchInfo_t archInfo);
+
+EE convolution(Tensor inputTensor,
+ Tensor filterTensor,
+ ConvolutionParamSpec convParamSpec,
+ ConvolutionForwardAlgorithm algorithm,
+ void *scale,
+ Tensor biasTensor,
+ Tensor tmpTensor,
+ Tensor outputTensor,
+ ActivationParamSpec activationDesc,
+ ArchInfo_t archInfo);
+
+EE deconvolution_infer_output_size(Tensor *inputTensor,
+ Tensor filterTensor,
+ ConvolutionParamSpec convParamSpec,
+ Tensor *outputTensor,
+ DataType targetDataType,
+ ArchInfo_t archInfo);
+
+EE deconvolution_transform_filter_bytes(Tensor filterTensor,
+ ConvolutionParamSpec convParamSpec,
+ ConvolutionForwardAlgorithm algorithm,
+ U32 *bytes,
+ ArchInfo_t archInfo);
+
+EE deconvolution_transform_filter(Tensor filterTensor,
+ ConvolutionParamSpec convParamSpec,
+ ConvolutionForwardAlgorithm algorithm,
+ Tensor tmpTensor,
+ Tensor *ftmTensor,
+ ArchInfo_t archInfo);
+
+EE deconvolution_infer_forward_algorithm(Tensor inputTensor,
+ Tensor filterTensor,
+ Tensor outputTensor,
+ ConvolutionParamSpec convParamSpec,
+ ConvolutionPolicy policy,
+ ConvolutionForwardAlgorithm *algorithm,
+ DataType targetDataType,
+ ActivationParamSpec activationDesc,
+ ArchInfo_t archInfo);
+
+EE deconvolution_infer_forward_tmp_bytes(Tensor inputTensor,
+ Tensor filterTensor,
+ Tensor outputTensor,
+ ConvolutionParamSpec convParamSpec,
+ ConvolutionForwardAlgorithm algorithm,
+ U32 *bytes,
+ ArchInfo_t archInfo);
+
+EE deconvolution(Tensor inputTensor,
+ Tensor filterTensor,
+ ConvolutionParamSpec convParamSpec,
+ ConvolutionForwardAlgorithm algorithm,
+ void *scale,
+ Tensor biasTensor,
+ Tensor tmpTensor,
+ Tensor outputTensor,
+ ActivationParamSpec activationDesc,
+ ArchInfo_t archInfo);
+
+EE depthwise_pointwise_convolution_infer_output_size(Tensor *inputTensor,
+ Tensor dwFilterTensor,
+ Tensor pwFilterTensor,
+ ConvolutionParamSpec convParamSpec,
+ Tensor *outputTensor,
+ DataType targetDataType,
+ ArchInfo_t archInfo);
+
+EE depthwise_pointwise_convolution_infer_forward_algorithm(Tensor inputTensor,
+ Tensor dwFilterTensor,
+ Tensor pwFilterTensor,
+ Tensor outputTensor,
+ ConvolutionParamSpec convParamSpec,
+ ConvolutionPolicy policy,
+ DepthwiseConvolutionForwardAlgorithm *algorithm,
+ DataType targetDataType,
+ ActivationParamSpec depthwiseActivationParamSpec,
+ ActivationParamSpec pointwiseActivationParamSpec,
+ ArchInfo_t archInfo);
+
+EE depthwise_pointwise_convolution_infer_forward_tmp_bytes(Tensor inputTensor,
+ Tensor dwFilterTensor,
+ Tensor pwFilterTensor,
+ Tensor outputTensor,
+ ConvolutionParamSpec convParamSpec,
+ DepthwiseConvolutionForwardAlgorithm algorithm,
+ U32 *bytes,
+ ArchInfo_t archInfo);
+
+EE depthwise_pointwise_convolution_transform_filter_bytes(Tensor dwFilterTensor,
+ Tensor pwFilterTensor,
+ ConvolutionParamSpec convParamSpec,
+ DepthwiseConvolutionForwardAlgorithm algorithm,
+ U32 *dwBytes,
+ U32 *pwBytes,
+ ArchInfo_t archInfo);
+
+EE depthwise_pointwise_convolution_transform_filter(Tensor dwFilterTensor,
+ Tensor pwFilterTensor,
+ ConvolutionParamSpec convParamSpec,
+ DepthwiseConvolutionForwardAlgorithm algorithm,
+ Tensor *dwFtm,
+ Tensor *pwFtm,
+ ArchInfo_t archInfo);
+
+EE depthwise_pointwise_convolution(Tensor inputTensor,
+ Tensor dwFilterTensor,
+ Tensor pwFilterTensor,
+ ConvolutionParamSpec convParamSpec,
+ DepthwiseConvolutionForwardAlgorithm algorithm,
+ Tensor dwBiasTensor,
+ Tensor pwBiasTensor,
+ Tensor tmpTensor,
+ Tensor outputTensor,
+ ActivationParamSpec depthwiseActivationParamSpec,
+ ActivationParamSpec pointwiseActivationParamSpec,
+ ArchInfo_t archInfo);
+
+EE depthwise_convolution_infer_output_size(Tensor *inputTensor,
+ Tensor filterTensor,
+ ConvolutionParamSpec convParamSpec,
+ Tensor *outputTensor,
+ DataType targetDataType,
+ ArchInfo_t archInfo);
+
+EE depthwise_convolution_infer_forward_algorithm(Tensor inputTensor,
+ Tensor filterTensor,
+ Tensor outputTensor,
+ ConvolutionParamSpec convParamSpec,
+ ConvolutionPolicy policy,
+ DepthwiseConvolutionForwardAlgorithm *algorithm,
+ DataType targetDataType,
+ ActivationParamSpec depthwiseActivationParamSpec,
+ ArchInfo_t archInfo);
+
+EE depthwise_convolution_transform_filter_bytes(Tensor filterTensor,
+ ConvolutionParamSpec convParamSpec,
+ DepthwiseConvolutionForwardAlgorithm algorithm,
+ U32 *bytes,
+ ArchInfo_t archInfo);
+
+EE depthwise_convolution_transform_filter(Tensor filterTensor,
+ ConvolutionParamSpec convParamSpec,
+ DepthwiseConvolutionForwardAlgorithm algorithm,
+ Tensor *ftmTensor,
+ ArchInfo_t archInfo);
+
+EE depthwise_convolution_infer_forward_tmp_bytes(Tensor inputTensor,
+ Tensor filterTensor,
+ Tensor outputTensor,
+ ConvolutionParamSpec convParamSpec,
+ DepthwiseConvolutionForwardAlgorithm algorithm,
+ U32 *bytes,
+ ArchInfo_t archInfo);
+
+EE depthwise_convolution(Tensor inputTensor,
+ Tensor filterTensor,
+ ConvolutionParamSpec convParamSpec,
+ DepthwiseConvolutionForwardAlgorithm algorithm,
+ Tensor biasTensor,
+ Tensor tmpTensor,
+ Tensor outputTensor,
+ ActivationParamSpec depthwiseActivationParamSpec,
+ ArchInfo_t archInfo);
+
+EE detectionoutput_infer_output_size(std::vector<Tensor *> inputTensor,
+ DetectionOutputParamSpec detectionOutputParamSpec,
+ Tensor *outputTensor,
+ ArchInfo_t archInfo);
+
+EE detectionoutput(std::vector<Tensor> inputTensor,
+ DetectionOutputParamSpec detectionOutputParamSpec,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE pooling_infer_output_size(Tensor *inputTensor,
+ PoolingParamSpec poolingParamSpec,
+ Tensor *outputTensor,
+ ArchInfo_t archInfo);
+
+EE pooling_infer_forward_tmp_bytes(
+ Tensor inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo);
+
+EE pooling(Tensor inputTensor,
+ PoolingParamSpec poolingParamSpec,
+ Tensor tmpTensor,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE pooling_bp(
+ Tensor inputTensor, PoolingParamSpec poolingParamSpec, Tensor outputTensor, ArchInfo_t archInfo);
+
+EE priorbox_infer_output_size(std::vector<Tensor *> inputTensor,
+ PriorBoxParamSpec priorBoxParamSpec,
+ Tensor *outputTensor,
+ ArchInfo_t archInfo);
+
+EE priorbox(std::vector<Tensor> inputTensor,
+ PriorBoxParamSpec priorBoxParamSpec,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE activation_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE activation(
+ Tensor inputTensor, ActivationParamSpec activationDesc, Tensor outputTensor, ArchInfo_t archInfo);
+
+EE concat_infer_output_size(
+ std::vector<Tensor *> inputTensor, ConcatParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE concat_infer_forward_tmp_bytes(std::vector<Tensor> inputTensor, U32 *bytes, ArchInfo_t archInfo);
+
+EE concat(std::vector<Tensor> inputTensor,
+ ConcatParamSpec p,
+ Tensor tmpTensor,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE eltwise_infer_output_size(
+ std::vector<Tensor *> inputTensor, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE eltwise_infer_forward_tmp_bytes(
+ std::vector<Tensor> inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo);
+
+EE eltwise(std::vector<Tensor> inputTensor,
+ EltwiseParamSpec eltwiseDesc,
+ Tensor tmpTensor,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE split_infer_output_size(Tensor *inputTensor, std::vector<Tensor *> output);
+
+EE split(Tensor inputTensor, std::vector<Tensor> outputTensor, ArchInfo_t archInfo);
+
+EE fully_connected_infer_output_size(
+ Tensor *inputTensor, Tensor filterTensor, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE fully_connected_infer_forward_algorithm(
+ Tensor inputTensor, Tensor filterTensor, Tensor outputTensor, ArchInfo_t archInfo);
+
+EE fully_connected_infer_forward_tmp_bytes(
+ Tensor inputTensor, Tensor filterTensor, U32 *bytes, ArchInfo_t archInfo);
+
+EE fully_connected_transform_filter_bytes(Tensor filterTensor, U32 *bytes, ArchInfo_t archInfo);
+
+EE fully_connected_transform_filter(
+ Tensor inputTensor, Tensor filterTensor, Tensor *ftmTensor, ArchInfo_t archInfo);
+
+EE fully_connected(Tensor inputTensor,
+ Tensor filterTensor,
+ Tensor biasTensor,
+ Tensor tmpTensor,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE softmax_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE softmax(Tensor inputTensor,
+ SoftmaxParamSpec p,
+ Tensor tmpTensor,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE softmax_infer_forward_tmp_bytes(Tensor inputTensor, U32 *bytes, ArchInfo_t archInfo);
+
+EE rnn_infer_output_size(
+ Tensor *inputTensor, RNNParamSpec rnnParamSpec, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE rnn_infer_forward_tmp_bytes(Tensor inputTensor,
+ Tensor filterTensor,
+ Tensor outputTensor,
+ RNNParamSpec rnnParamSpec,
+ U32 *bytes,
+ ArchInfo_t archInfo);
+
+EE rnn_transform_filter_bytes(
+ std::vector<Tensor> filterTensor, RNNParamSpec rnnParamSpec, U32 *bytes, ArchInfo_t archInfo);
+
+EE rnn_transform_filter(std::vector<Tensor> filterTensor,
+ RNNParamSpec rnnParamSpec,
+ std::vector<Tensor *> ftmTensor,
+ ArchInfo_t archInfo);
+
+EE rnn(Tensor inputTensor,
+ std::vector<Tensor> filterTensors,
+ std::vector<Tensor> biasTensors,
+ RNNParamSpec rnnParamSpec,
+ Tensor tmpTensor,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE rnncell_infer_output_size(std::vector<Tensor *> inputTensor,
+ RNNParamSpec rnnParamSpec,
+ Tensor *outputTensor,
+ ArchInfo_t archInfo);
+
+EE rnncell_infer_forward_tmp_bytes(Tensor inputTensor,
+ Tensor filterTensor,
+ Tensor outputTensor,
+ RNNParamSpec rnnParamSpec,
+ U32 *bytes,
+ ArchInfo_t archInfo);
+
+EE rnncell_infer_forward_algorithm(Tensor xTensor,
+ Tensor filterTensor,
+ Tensor biasTensor,
+ RNNParamSpec rnncellDesc,
+ U32 batchStrideX,
+ U32 batchStrideH,
+ Tensor hTensor,
+ ArchInfo_t archInfo);
+
+EE rnncell(Tensor xTensor,
+ std::vector<Tensor> filterTensors,
+ std::vector<Tensor> biasTensors,
+ Tensor stateTensor,
+ RNNParamSpec rnnParamSpec,
+ U32 batchStrideX,
+ U32 batchStrideH,
+ U32 tmpOffset,
+ Tensor tmpTensor,
+ Tensor hTensor,
+ ArchInfo_t archInfo);
+
+EE scale_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE scale(Tensor inputTensor,
+ void *alpha,
+ void *beta,
+ ScaleParamSpec p,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE prelu_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE prelu(Tensor inputTensor,
+ Tensor weightTensor,
+ PReLUParamSpec preluDesc,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE normalization_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE layer_normalization(Tensor inputTensor,
+ Tensor alphaTensor,
+ Tensor betaTensor,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE slice_infer_output_size(
+ Tensor *inputTensor, SliceParamSpec p, std::vector<Tensor *> outputTensor, ArchInfo_t archInfo);
+
+EE slice(
+ Tensor inputTensor, SliceParamSpec p, std::vector<Tensor> outputTensor, ArchInfo_t archInfo);
+
+EE tfslice_infer_output_size(
+ Tensor *inputTensor, TfSliceParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE tfslice(Tensor inputTensor, TfSliceParamSpec p, Tensor outputTensor, ArchInfo_t archInfo);
+
+EE transpose_infer_output_size(
+ Tensor *inputTensor, TransposeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE transpose_infer_forward_tmp_bytes(
+ Tensor inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo);
+
+EE transpose(Tensor inputTensor,
+ TransposeParamSpec p,
+ Tensor tmpTensor,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE matmul_infer_output_size(Tensor *matrixATensor,
+ bool transposeA,
+ Tensor *matrixBTensor,
+ bool transposeB,
+ Tensor *matrixCTensor,
+ ArchInfo_t archInfo);
+
+EE matmul_infer_forward_algorithm(Tensor matrixATensor,
+ bool transposeA,
+ Tensor matrixBTensor,
+ bool transposeB,
+ Tensor matrixCTensor,
+ ArchInfo_t archInfo);
+
+EE matmul_infer_forward_tmp_bytes(Tensor matrixATensor,
+ bool transposeA,
+ Tensor matrixBTensor,
+ bool transposeB,
+ U32 *bytes,
+ ArchInfo_t archInfo);
+
+EE matmul(Tensor matrixATensor,
+ bool transposeA,
+ Tensor matrixBTensor,
+ bool transposeB,
+ Tensor tmpTensor,
+ Tensor matrixCTensor,
+ ArchInfo_t archInfo);
+
+EE reshape_infer_output_size(
+ Tensor *inputTensor, ReshapeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE reshape_infer_forward_tmp_bytes(
+ Tensor inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo);
+
+EE reshape(Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo);
+
+EE attention_infer_output_size(Tensor *inputTensor, AttentionParamSpec p, Tensor *outputTensor);
+
+EE attention(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo);
+
+EE power_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE power(Tensor inputTensor, PowerParamSpec p, Tensor outputTensor, ArchInfo_t archInfo);
+
+EE clip_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE clip(Tensor inputTensor, ClipParamSpec p, Tensor outputTensor, ArchInfo_t archInfo);
+
+EE bilateral_slice_apply_infer_output_size(Tensor *inputTensor,
+ Tensor *guideTensor,
+ Tensor *gridTensor,
+ BilateralSliceApplyParamSpec p,
+ Tensor *outputTensor,
+ ArchInfo_t archInfo);
+
+EE bilateral_slice_apply_infer_forward_tmp_bytes(Tensor inputTensor,
+ Tensor guideTensor,
+ Tensor gridTensor,
+ BilateralSliceApplyParamSpec p,
+ U32 *bytes,
+ ArchInfo_t archInfo);
+
+EE bilateral_slice_apply(Tensor inputTensor,
+ Tensor guideTensor,
+ Tensor gridTensor,
+ BilateralSliceApplyParamSpec p,
+ Tensor tmpTensor,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE argmax_infer_output_size(
+ Tensor *inputTensor, ArgMaxParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE argmax_infer_forward_tmp_bytes(
+ Tensor inputTensor, ArgMaxParamSpec p, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo);
+
+EE argmax(
+ Tensor inputTensor, ArgMaxParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo);
+
+EE reduction_infer_output_size(
+ Tensor *inputTensor, Tensor maskTensor, ReductionParamSpec p, Tensor *outputTensor);
+
+EE reduction_infer_forward_tmp_bytes(
+ Tensor inputTensor, ReductionParamSpec p, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo);
+
+EE reduction(Tensor inputTensor,
+ Tensor maskTensor,
+ ReductionParamSpec p,
+ Tensor tmpTensor,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE check_infer_output_size(
+ std::vector<Tensor *> inputTensor, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE check(Tensor inputTensorA,
+ Tensor inputTensorB,
+ CheckParamSpec p,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE squeeze_infer_output_size(
+ Tensor *inputTensor, SqueezeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE squeeze(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo);
+
+EE unsqueeze_infer_output_size(
+ Tensor *inputTensor, UnsqueezeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE unsqueeze(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo);
+
+EE space2depth_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE space2depth(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo);
+
+EE depth2space_infer_output_size(
+ Tensor *inputTensor, Depth2SpaceParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE depth2space_infer_forward_tmp_bytes(
+ Tensor inputTensor, Depth2SpaceParamSpec p, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo);
+
+EE depth2space(Tensor inputTensor,
+ Depth2SpaceParamSpec p,
+ Tensor tmpTensor,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE attention_mask(
+ Tensor inputTensor, AttentionMaskParamSpec p, Tensor outputTensor, ArchInfo_t archInfo);
+
+EE attention_mask_infer_output_size(Tensor *inputTensor, Tensor *outputTensor);
+
+EE padding_infer_output_size(
+ Tensor *inputTensor, PadParamSpec padParamSpec, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE padding(Tensor inputTensor, PadParamSpec padParamSpec, Tensor outputTensor, ArchInfo_t archInfo);
+
+EE embedding_infer_output_size(Tensor *inputTensor,
+ EmbedParamSpec p,
+ DataType outputDt,
+ Tensor *outputTensor,
+ ArchInfo_t archInfo);
+
+EE embedding(Tensor inputTensor,
+ Tensor weightTensor,
+ EmbedParamSpec p,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE yolov3detectionoutput_infer_output_size(std::vector<Tensor *> inputTensor,
+ Yolov3DetectionOutputParamSpec yolov3DetectionOutputParamSpec,
+ Tensor *outputTensor,
+ ArchInfo_t archInfo);
+
+EE yolov3detectionoutput(std::vector<Tensor> inputTensor,
+ Yolov3DetectionOutputParamSpec yolov3DetectionOutputParamSpec,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE preallocated_memory_infer_output_size(Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE preallocated_memory(Tensor outputTensor, ArchInfo_t archInfo);
+
+EE copy_infer_output_size(std::vector<Tensor *> inputTensor, ArchInfo_t archInfo);
+
+EE copy(std::vector<Tensor> inputTensor,
+ U32 srcOffset,
+ U32 dstOffset,
+ U32 srcStride,
+ U32 dstStride,
+ U32 length,
+ ArchInfo_t archInfo);
+
+EE non_max_suppression_infer_output_size(std::vector<Tensor *> inputTensor,
+ NonMaxSuppressionParamSpec p,
+ Tensor *outputTensor,
+ ArchInfo_t archInfo);
+
+EE non_max_suppression(std::vector<Tensor> inputTensor,
+ NonMaxSuppressionParamSpec p,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE roialign_infer_output_size(std::vector<Tensor *> inputTensor,
+ RoiAlignParamSpec p,
+ Tensor *outputTensor,
+ ArchInfo_t archInfo);
+
+EE roialign(
+ std::vector<Tensor> inputTensor, RoiAlignParamSpec p, Tensor outputTensor, ArchInfo_t archInfo);
+
+EE multihead_attention_infer_output_size(Tensor *inputTensor,
+ std::vector<Tensor> filterTensor,
+ Tensor *outputTensor,
+ U32 *firstFCSliceNum,
+ ArchInfo_t archInfo);
+
+EE multihead_attention_infer_forward_algorithm(Tensor inputTensor,
+ std::vector<Tensor> filterTensor,
+ void *multiplyAlpha,
+ void *multiplyBeta,
+ U32 *firstFCSliceNum,
+ U32 matmulSliceLen,
+ std::vector<bool> eltwiseWithLayerNormIn,
+ ActivationMode activation,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE multihead_attention_infer_forward_tmp_bytes(Tensor inputTensor,
+ std::vector<Tensor> filterTensor,
+ std::vector<bool> eltwiseWithLayerNormIn,
+ U32 *firstFCSliceNum,
+ U32 matmulSliceLen,
+ U32 *bytes,
+ ArchInfo_t archInfo);
+
+EE multihead_attention_transform_filter_bytes(
+ std::vector<Tensor> filterTensor, U32 *bytes, ArchInfo_t archInfo);
+
+EE multihead_attention_transform_filter(
+ std::vector<Tensor> filterTensor, std::vector<Tensor *> ftmTensor, ArchInfo_t archInfo);
+
+EE multihead_attention(Tensor inputTensor,
+ std::vector<Tensor> filterTensor,
+ std::vector<Tensor> biasTensor,
+ std::vector<Tensor> layerNormAlphaTensor,
+ std::vector<Tensor> layerNormBetaTensor,
+ void *multiplyAlpha,
+ void *multiplyBeta,
+ U32 *firstFCSliceNum,
+ U32 matmulSliceLen,
+ std::vector<bool> eltwiseWithLayerNormIn,
+ ActivationMode activation,
+ Tensor tmpTensor,
+ Tensor outputTensor,
+ ArchInfo_t archInfo);
+
+EE channel_resize_infer_output_size(
+ Tensor *inputTensor, ChannelResizeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE channel_resize(
+ Tensor inputTensor, ChannelResizeParamSpec p, Tensor outputTensor, ArchInfo_t archInfo);
+
+EE l2normalization_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE l2normalization(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo);
+
+EE tile_infer_output_size(
+ Tensor *inputTensor, TileParamSpec tileParamSpec, Tensor *outputTensor, ArchInfo_t archInfo);
+
+EE tile(Tensor inputTensor, TileParamSpec tileParamSpec, Tensor outputTensor, ArchInfo_t archInfo);
+
+EE quantize_tensor(TensorDesc dDesc, const void *data, TensorDesc *qDesc, void *qData, void *scale);
+
+#if defined(_USE_NEON) && defined(_USE_INT8)
+void dequantize_int8_to_fp16(U32 len, INT8 *q, F32 scale, F16 *d);
+
+void dequantize_int32_to_fp16(
+ U32 len, I32 *q, F32 scale, F16 *d, U32 biasLen = 0, F16 *biasPtr = nullptr);
+#endif
+
+#ifdef _USE_FP16
+void update_histogram(U32 len, const F16 *data, int numBins, F32 interval, F32 *histo);
+#endif
+
+std::vector<F32> compress_histogram(std::vector<F32> &histogram, F32 numPerBin, F32 last_max);
+
+std::vector<F32> compute_scale_with_KL(std::vector<F32> &histogram, F32 interval);
+#endif
diff --git a/tensor_computing/include/tensor_computing_library_algorithm_search.h b/compute/tensor/include/tensor_computing_library_algorithm_search.h similarity index 84% rename from tensor_computing/include/tensor_computing_library_algorithm_search.h rename to compute/tensor/include/tensor_computing_library_algorithm_search.h index 6aaa5b2d..c7febed1 100644 --- a/tensor_computing/include/tensor_computing_library_algorithm_search.h +++ b/compute/tensor/include/tensor_computing_library_algorithm_search.h @@ -1,17 +1,16 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), 
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
 #ifndef _H_TENSOR_COMPUTING_LIBRARY_ALGORITHM_SEARCH
 #define _H_TENSOR_COMPUTING_LIBRARY_ALGORITHM_SEARCH
@@ -19,8 +18,7 @@
 #include <map>
 #include <string>
-#include "type.h"
-#include "tensor_computing_type.h"
+#include "types.h"
 extern std::map libraryAlgorithmMap;
 extern std::map libraryAlgorithmParameters;
@@ -31,7 +29,7 @@ void saveLibraryAlgorithmMapToTxt();
 std::string getConvolutionAlgorithmMapNameFromInput(TensorDesc inputDesc,
     TensorDesc filterDesc,
-    ConvolutionDesc convDesc,
+    ConvolutionParamSpec convParamSpec,
     DataType targetDataType);
 #endif
 #endif
diff --git a/compute/tensor/include/tensor_computing_type.h b/compute/tensor/include/tensor_computing_type.h new file mode 100644 index 00000000..959233a9 --- /dev/null +++ b/compute/tensor/include/tensor_computing_type.h @@ -0,0 +1,70 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_TENSOR_COMPUTING_TYPE
+#define _H_TENSOR_COMPUTING_TYPE
+
+#include <vector>
+#include "types.h"
+#include "tensor.hpp"
+
+#ifdef _USE_MALI
+#include "gcl.h"
+#include "ocl_desc_trans.h"
+#define ALIGN(len, align_num) ((len + align_num - 1) / align_num * align_num)
+#endif
+
+ConvolutionParamSpec createConvolutionParamSpec(U32 group,
+ U32 kernelH,
+ U32 kernelW,
+ U32 strideH,
+ U32 strideW,
+ U32 paddingT,
+ U32 paddingB,
+ U32 paddingL,
+ U32 paddingR,
+ U32 dilateH,
+ U32 dilateW,
+ U32 num_outputs,
+ ConvolutionMode convMode);
+
+FullyConnectedParamSpec createFullyConnectedParamSpec(
+ U32 num_outputs, U32 num_slices, I32 *slice_point);
+
+PoolingParamSpec createPoolingParamSpec(PoolingMode pm,
+ U32 ksH,
+ U32 ksW,
+ U32 strideH,
+ U32 strideW,
+ U32 paddingT,
+ U32 paddingB,
+ U32 paddingL,
+ U32 paddingR,
+ RoundMode rm);
+
+ReshapeParamSpec createReshapeParamSpec(I32 *shape_dims, I32 shape_size, I32 axis, I32 num_axes);
+
+ClipParamSpec createClipParamSpec(float min, float max);
+
+SqueezeParamSpec createSqueezeParamSpec(int *axes, int axes_num);
+
+std::vector<TensorDesc> get_desc_from_tensors(std::vector<Tensor> tensors);
+std::vector<TensorDesc> get_desc_from_tensor_ptrs(std::vector<Tensor *> tensors);
+
+std::vector<F32> get_scale_from_tensors(std::vector<Tensor> tensors);
+
+template <typename T>
+std::vector<T> get_data_from_tensors(std::vector<Tensor> tensors, Arch arch);
+template <typename T>
+std::vector<T> get_data_from_tensor_ptrs(std::vector<Tensor *> tensors, Arch arch);
+#endif
diff --git a/compute/tensor/src/CMakeLists.txt b/compute/tensor/src/CMakeLists.txt new file mode 100644 index 00000000..39950f23 --- /dev/null +++ b/compute/tensor/src/CMakeLists.txt @@ -0,0 +1,55 @@ +if (USE_GENERAL)
+ file(GLOB general_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/general/*.cpp)
+ file(GLOB cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/*.cpp)
+endif (USE_GENERAL)
+
+if (USE_NEON)
+ if (USE_FP32)
+ file(GLOB arm_fp32_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/fp32/*.cpp)
+ endif (USE_FP32)
+ if (USE_FP16)
+ file(GLOB arm_fp16_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/fp16/*.cpp)
+ file(GLOB arm_bnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/bnn/*.cpp)
+ endif (USE_FP16)
+ if (USE_INT8)
+ file(GLOB arm_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/*.cpp)
+ endif (USE_INT8)
+ file(GLOB arm_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/*.cpp)
+ set(arm_srcs "${arm_srcs};${arm_fp16_srcs};${arm_fp32_srcs};${arm_int8_srcs};${arm_bnn_srcs}")
+ file(GLOB cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/*.cpp)
+endif (USE_NEON)
+
+if (USE_X86)
+ if (USE_FP32)
+ 
file(GLOB x86_fp32_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/x86/fp32/*.cpp) + endif (USE_FP32) + file(GLOB x86_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/x86/*.cpp) + set(x86_srcs "${x86_srcs};${x86_fp32_srcs}") + file(GLOB cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/*.cpp) +endif (USE_X86) + +if (USE_MALI) + file(GLOB mali_fp16_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/mali/fp16/*.cpp) + file(GLOB mali_uchar_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/mali/uchar/*.cpp) + file(GLOB mali_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/mali/*.cpp) + set(mali_srcs "${mali_srcs};${mali_fp16_srcs};${mali_uchar_srcs}") +endif (USE_MALI) + +file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) +set(srcs "${srcs};${general_srcs};${arm_srcs};${cpu_srcs};${mali_srcs};${x86_srcs}") + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +# shared library +add_library(${PROJECT_NAME} SHARED ${srcs}) +target_link_libraries(${PROJECT_NAME} LINK_PUBLIC blas_enhance uni) + +# static library +add_library(${PROJECT_NAME}_static STATIC ${srcs}) + +set_target_properties(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") +set_target_properties(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +install(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) diff --git a/compute/tensor/src/activation.cpp b/compute/tensor/src/activation.cpp new file mode 100644 index 00000000..ec24a289 --- /dev/null +++ b/compute/tensor/src/activation.cpp @@ -0,0 +1,80 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
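+
+// Usage sketch (illustrative only; `in`, `out` and `archInfo` stand for a
+// tensor pair and arch info prepared as in the tests, and ACTIVATION_RELU is
+// assumed to be one of the ActivationMode values from types.h). Activation is
+// shape-preserving, so the inferred output descriptor is a copy of the input:
+//
+//     ActivationParamSpec spec;
+//     spec.mode = ACTIVATION_RELU;
+//     CHECK_STATUS(activation_infer_output_size(&in, &out, &archInfo));
+//     out.alloc();
+//     CHECK_STATUS(activation(in, spec, out, &archInfo));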
+ +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +inline EE activation_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + + *outputDesc = inputDesc; + return SUCCESS; +} + +EE activation_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = activation_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = activation_infer_output_size_cpu(inputDesc, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE activation( + Tensor inputTensor, ActivationParamSpec activationDesc, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = activation_cpu(inputDesc, input, activationDesc, outputDesc, output, arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = activation_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, outputDesc, (GCLMem_t)output, activationDesc.mode); +#endif + } + return ret; +} diff --git a/compute/tensor/src/argmax.cpp b/compute/tensor/src/argmax.cpp new file mode 100644 index 00000000..7cf2d414 --- /dev/null +++ b/compute/tensor/src/argmax.cpp @@ -0,0 +1,99 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
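+
+// Shape note for the CPU path implemented below: argmax removes the reduced
+// axis and emits DT_U32 indices. For example, a (1, 3, 4, 5) NCHW input with
+// p.axis = 1 yields a (1, 4, 5) DT_U32 output; a negative axis counts back
+// from the last dimension.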
+ +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE argmax( + Tensor inputTensor, ArgMaxParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#if defined(_USE_CPU) + ret = argmax_cpu(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + ret = argmax_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, p, + (GCLMem_t)tmp, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} + +EE argmax_infer_forward_tmp_bytes( + Tensor inputTensor, ArgMaxParamSpec p, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) +{ + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + ret = argmax_infer_forward_tmp_bytes_mali(inputDesc, p, outputDesc, bytes); +#endif + } else { + *bytes = 0; + ret = SUCCESS; + } + return ret; +} + +EE argmax_infer_output_size( + Tensor *inputTensor, ArgMaxParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = argmax_infer_output_size_mali( + inputDesc, p, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + outputDesc = inputDesc; + int axis = p.axis; + if (axis < 0) { + axis += inputDesc.nDims; + } + axis = inputDesc.nDims - 1 - axis; + for (int i = axis; i < (I32)(inputDesc.nDims) - 1; i++) { + outputDesc.dims[i] = outputDesc.dims[i + 1]; + } + outputDesc.nDims = inputDesc.nDims - 1; + outputDesc.dt = DT_U32; + outputTensor->resize(outputDesc); + ret = SUCCESS; + } + return ret; +} diff --git a/compute/tensor/src/attention.cpp b/compute/tensor/src/attention.cpp new file mode 100644 index 00000000..71d7db2c --- /dev/null +++ b/compute/tensor/src/attention.cpp @@ -0,0 +1,66 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
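+
+// Shape note for attention_infer_output_size below: a 2D input of shape
+// (batch, sequenceLength) is mapped to a 4D NCHW output of shape
+// (batch, num_heads, from_sequence_length, to_sequence_length), with the
+// head and sequence sizes taken from AttentionParamSpec.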
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif + +EE attention(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = attention_general(inputDesc, input, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + UNI_WARNING_LOG("The x86 attention operator is not optimized now.\n"); + ret = attention_general(inputDesc, input, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = attention_arm(inputDesc, input, outputDesc, output); +#endif + } + return ret; +} + +EE attention_infer_output_size(Tensor *inputTensor, AttentionParamSpec p, Tensor *outputTensor) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + DataType dt; + DataFormat df; + U32 batch, sequenceLength; + CHECK_STATUS(tensor2dGet(inputDesc, &dt, &df, &batch, &sequenceLength)); + outputDesc = + tensor4df(dt, DF_NCHW, batch, p.num_heads, p.from_sequence_length, p.to_sequence_length); + outputTensor->resize(outputDesc); + return SUCCESS; +} diff --git a/compute/tensor/src/attention_mask.cpp b/compute/tensor/src/attention_mask.cpp new file mode 100644 index 00000000..7c2a1609 --- /dev/null +++ b/compute/tensor/src/attention_mask.cpp @@ -0,0 +1,69 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
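+
+// Numerical note for attention_mask below: for 2-byte (fp16) inputs the mask
+// value is clamped to 10000 before use, since fp16 tops out near 65504 and a
+// larger additive mask could overflow once applied to the scores.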
+ +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif + +EE attention_mask( + Tensor inputTensor, AttentionMaskParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + // reinit mask value to avoid overflow + if (bytesOf(inputDesc.dt) == 2 && p.mask > 10000) { + p.mask = 10000; + } + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = attention_mask_general(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = attention_mask_x86(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = attention_mask_arm(inputDesc, input, p, outputDesc, output); +#endif + } + return ret; +} + +EE attention_mask_infer_output_size(Tensor *inputTensor, Tensor *outputTensor) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + if (inputDesc.nDims < 2) { + return NOT_MATCH; + } + outputTensor->resize(inputDesc); + return SUCCESS; +} diff --git a/compute/tensor/src/bilateral_slice_apply.cpp b/compute/tensor/src/bilateral_slice_apply.cpp new file mode 100644 index 00000000..47c20c77 --- /dev/null +++ b/compute/tensor/src/bilateral_slice_apply.cpp @@ -0,0 +1,118 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
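+
+// Support note: bilateral_slice_apply is only implemented for the MALI GPU
+// path below; the CPU helper returns NOT_SUPPORTED, so the op cannot run
+// with a CPU arch in this version.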
+ +#include "tensor_computing.h" +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +inline EE bilateral_slice_apply_infer_output_size_cpu() +{ + return NOT_SUPPORTED; +} + +EE bilateral_slice_apply_infer_output_size(Tensor *inputTensor, + Tensor *guideTensor, + Tensor *gridTensor, + BilateralSliceApplyParamSpec p, + Tensor *outputTensor, + ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (guideTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (gridTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + TensorDesc guideDesc = guideTensor->get_desc(); + TensorDesc gridDesc = gridTensor->get_desc(); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemGuideDesc = ocl_get_desc(*guideTensor); + GCLMemDesc gclmemGridDesc = ocl_get_desc(*gridTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = bilateral_slice_apply_infer_output_size_mali(inputDesc, guideDesc, gridDesc, p, + &outputDesc, &gclmemInputDesc, &gclmemGuideDesc, &gclmemGridDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(guideTensor, gclmemGuideDesc); + ocl_set_desc(gridTensor, gclmemGridDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } + outputTensor->resize(outputDesc); + return ret; +} + +EE bilateral_slice_apply_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor guideTensor, + Tensor gridTensor, + BilateralSliceApplyParamSpec p, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc guideDesc = guideTensor.get_desc(); + TensorDesc gridDesc = gridTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + ret = bilateral_slice_apply_infer_forward_tmp_bytes_mali(inputDesc, guideDesc, gridDesc, p, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, bytes); +#endif + } + return ret; +} + +EE bilateral_slice_apply(Tensor inputTensor, + Tensor guideTensor, + Tensor gridTensor, + BilateralSliceApplyParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + TensorDesc guideDesc = guideTensor.get_desc(); + void *guide = get_ptr_from_tensor(guideTensor, arch); + TensorDesc gridDesc = gridTensor.get_desc(); + void *grid = get_ptr_from_tensor(gridTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + ret = bilateral_slice_apply_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, guideDesc, (GCLMem_t)guide, gridDesc, (GCLMem_t)grid, p, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, tmpBytes, (GCLMem_t)tmp, outputDesc, + (GCLMem_t)output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/channel_resize.cpp b/compute/tensor/src/channel_resize.cpp new file mode 100644 index 00000000..cb7c80f4 --- /dev/null +++ b/compute/tensor/src/channel_resize.cpp @@ -0,0 +1,66 @@ +// Copyright (C) 2019. 
Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "tensor_computing.h"
+#ifdef _USE_GENERAL
+#include "cpu/general/tensor_computing_general.h"
+#endif
+#ifdef _USE_NEON
+#include "cpu/arm/tensor_computing_arm.h"
+#endif
+#ifdef _USE_MALI
+#include "gpu/mali/tensor_computing_mali.h"
+#endif
+
+EE channel_resize(
+ Tensor inputTensor, ChannelResizeParamSpec p, Tensor outputTensor, ArchInfo_t archInfo)
+{
+ auto arch = archInfo->arch;
+ EE ret = NOT_SUPPORTED;
+ if (IS_MALI_GPU(arch)) {
+#ifdef _USE_MALI
+ TensorDesc inputDesc = inputTensor.get_desc();
+ void *input = get_ptr_from_tensor(inputTensor, arch);
+ TensorDesc outputDesc = outputTensor.get_desc();
+ void *output = get_ptr_from_tensor(outputTensor, arch);
+ ret = channel_resize_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc,
+ (GCLMem_t)input, p, outputDesc, (GCLMem_t)output);
+#endif
+ }
+ return ret;
+}
+
+EE channel_resize_infer_output_size(
+ Tensor *inputTensor, ChannelResizeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo)
+{
+ if (inputTensor == nullptr) {
+ CHECK_STATUS(NULL_POINTER);
+ }
+ if (outputTensor == nullptr) {
+ CHECK_STATUS(NULL_POINTER);
+ }
+ TensorDesc outputDesc = outputTensor->get_desc();
+ if (IS_MALI_GPU(archInfo->arch)) {
+#ifdef _USE_MALI
+ TensorDesc inputDesc = inputTensor->get_desc();
+ GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor);
+ GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor);
+ CHECK_STATUS(channel_resize_infer_output_size_mali(
+ inputDesc, p, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc));
+ ocl_set_desc(inputTensor, gclmemInputDesc);
+ ocl_set_desc(outputTensor, gclmemOutputDesc);
+#endif
+ }
+ outputTensor->resize(outputDesc);
+ return SUCCESS;
+}
diff --git a/compute/tensor/src/check.cpp b/compute/tensor/src/check.cpp new file mode 100644 index 00000000..f5e4b826 --- /dev/null +++ b/compute/tensor/src/check.cpp @@ -0,0 +1,96 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "tensor_computing.h"
+#ifdef _USE_GENERAL
+#include "cpu/general/tensor_computing_general.h"
+#endif
+#ifdef _USE_X86
+#include "cpu/x86/tensor_computing_x86.h"
+#endif
+#ifdef _USE_NEON
+#include "cpu/arm/tensor_computing_arm.h"
+#endif
+#ifdef _USE_MALI
+#include "gpu/mali/tensor_computing_mali.h"
+#endif
+
+EE check(Tensor inputTensorA,
+ Tensor inputTensorB,
+ CheckParamSpec p,
+ Tensor outputTensor,
+ ArchInfo_t archInfo)
+{
+ auto arch = archInfo->arch;
+ TensorDesc inputDescA = inputTensorA.get_desc();
+ void *inputA = get_ptr_from_tensor(inputTensorA, arch);
+ TensorDesc inputDescB = inputTensorB.get_desc();
+ void *inputB = get_ptr_from_tensor(inputTensorB, arch);
+ TensorDesc outputDesc = outputTensor.get_desc();
+ void *output = get_ptr_from_tensor(outputTensor, arch);
+ EE ret = NOT_SUPPORTED;
+ if (IS_GENERAL(arch)) {
+#ifdef _USE_GENERAL
+ ret = check_general(inputDescA, inputA, inputDescB, inputB, p, outputDesc, output);
+#endif
+#ifdef _USE_X86
+ } else if (IS_X86_AVX2(arch)) {
+ ret = check_x86(inputDescA, inputA, inputDescB, inputB, p, outputDesc, output);
+#endif
+#ifdef _USE_NEON
+ } else if (IS_ARM(arch)) {
+ ret = check_arm(inputDescA, inputA, inputDescB, inputB, p, outputDesc, output);
+#endif
+#ifdef _USE_MALI
+ } else if (IS_MALI_GPU(arch)) {
+ ret = check_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDescA, (GCLMem_t)inputA,
+ inputDescB, (GCLMem_t)inputB, p, outputDesc, (GCLMem_t)output);
+#endif
+ }
+ return ret;
+}
+
+EE check_infer_output_size(
+ std::vector<Tensor *> inputTensor, Tensor *outputTensor, ArchInfo_t archInfo)
+{
+ EE ret = NOT_SUPPORTED;
+ if (outputTensor == nullptr) {
+ CHECK_STATUS(NULL_POINTER);
+ }
+ for (auto p : inputTensor) {
+ if (p == nullptr) {
+ CHECK_STATUS(NULL_POINTER);
+ }
+ }
+ TensorDesc inputDesc = inputTensor[0]->get_desc();
+ TensorDesc outputDesc = outputTensor->get_desc();
+ if (IS_MALI_GPU(archInfo->arch)) {
+#ifdef _USE_MALI
+ GCLMemDesc gclmemInputDescA = ocl_get_desc(*(inputTensor[0]));
+ GCLMemDesc gclmemInputDescB = ocl_get_desc(*(inputTensor[1]));
+ GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor);
+ ret = check_infer_output_size_mali(
+ inputDesc, &outputDesc, &gclmemInputDescA, &gclmemInputDescB, &gclmemOutputDesc);
+ ocl_set_desc(inputTensor[0], gclmemInputDescA);
+ ocl_set_desc(inputTensor[1], gclmemInputDescB);
+ ocl_set_desc(outputTensor, gclmemOutputDesc);
+#endif
+ } else {
+ outputDesc.dt = DT_I32;
+ outputDesc.nDims = 1;
+ outputDesc.dims[0] =
inputDesc.dims[inputDesc.nDims - 1]; + ret = SUCCESS; + } + outputTensor->resize(outputDesc); + return ret; +} diff --git a/compute/tensor/src/clip.cpp b/compute/tensor/src/clip.cpp new file mode 100644 index 00000000..994462d8 --- /dev/null +++ b/compute/tensor/src/clip.cpp @@ -0,0 +1,78 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +inline EE clip_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + *outputDesc = inputDesc; + return SUCCESS; +} + +EE clip_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = clip_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = clip_infer_output_size_cpu(inputDesc, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE clip(Tensor inputTensor, ClipParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = clip_cpu(inputDesc, input, p, outputDesc, output, arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = clip_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, p, + outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/concat.cpp b/compute/tensor/src/concat.cpp new file mode 100644 index 00000000..4ca0a88b --- /dev/null +++ b/compute/tensor/src/concat.cpp @@ -0,0 +1,184 @@ +// Copyright (C) 
2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <vector> +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +inline void processInputDescs(std::vector<TensorDesc> *inputDesc, I32 axis) +{ + int inputNum = inputDesc->size(); + int axisInfo = (axis > 0) ? axis : ((*inputDesc)[0].nDims + axis); + axisInfo = (*inputDesc)[0].nDims - 1 - axisInfo; + for (int i = 0; i < (int)(*inputDesc)[0].nDims; i++) { + if (i == axisInfo) { + continue; + } + U32 minDim = (*inputDesc)[0].dims[i]; + for (int j = 1; j < inputNum; j++) { + if ((*inputDesc)[j].dims[i] < minDim) { + minDim = (*inputDesc)[j].dims[i]; + } + } + if (minDim == 0) { + continue; + } + for (int j = 0; j < inputNum; j++) { + (*inputDesc)[j].dims[i] = minDim; + } + } +} + +inline EE concat_infer_output_size_cpu( + std::vector<TensorDesc> inputDesc, ConcatParamSpec p, TensorDesc *outputDesc) +{ + if (inputDesc.size() < 1) { + CHECK_STATUS(NOT_MATCH); + } + if (inputDesc.size() == 1) { + *outputDesc = inputDesc[0]; + return SUCCESS; + } + + bool hasC8 = false; + for (U32 i = 1; i < inputDesc.size(); i++) { + if (inputDesc[i].nDims != 0) { + *outputDesc = inputDesc[i]; + } + if (inputDesc[i].df == DF_NCHWC8) { + hasC8 = true; + } + } + I32 dim = outputDesc->nDims; + int axis = p.axis; + axis = (axis + dim) % dim; + axis = dim - 1 - axis; + outputDesc->dims[axis] = 0; + + for (U32 i = 0; i < inputDesc.size(); i++) { + if (inputDesc[i].nDims == 0) { + continue; + } + + if (inputDesc[i].nDims != (U32)dim) { + return NOT_MATCH; + } + + for (I32 j = 0; j < dim; j++) { + if (j == axis) { + outputDesc->dims[j] += inputDesc[i].dims[j]; + } else { + outputDesc->dims[j] = UNI_MAX(inputDesc[i].dims[j], outputDesc->dims[j]); + if (inputDesc[i].dims[j] != 0 && outputDesc->dims[j] != 0 && + outputDesc->dims[j] != inputDesc[i].dims[j]) { + return NOT_MATCH; + } + } + } + } + + if ((outputDesc->dims[3] % 8 == 0) && hasC8) { + outputDesc->df = DF_NCHWC8; + } + + if ((outputDesc->df == DF_NCHWC8) && (outputDesc->dims[2] % 8 != 0)) { + outputDesc->df = DF_NCHW; + } + + return SUCCESS; +} + +EE concat_infer_output_size( + std::vector<Tensor *> inputTensor, ConcatParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + std::vector<TensorDesc> inputDesc = get_desc_from_tensor_ptrs(inputTensor); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if
(IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + std::vector<GCLMemDesc> gclmemInputDescs; + for (auto p : inputTensor) { + gclmemInputDescs.push_back(ocl_get_desc(*p)); + } + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = concat_infer_output_size_mali( + inputDesc, p, &outputDesc, gclmemInputDescs.data(), &gclmemOutputDesc); + for (U32 i = 0; i < inputTensor.size(); i++) { + ocl_set_desc(inputTensor[i], gclmemInputDescs[i]); + } + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + processInputDescs(&inputDesc, p.axis); + ret = concat_infer_output_size_cpu(inputDesc, p, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE concat_infer_forward_tmp_bytes(std::vector<Tensor> inputTensor, U32 *bytes, ArchInfo_t archInfo) +{ + std::vector<TensorDesc> inputDesc = get_desc_from_tensors(inputTensor); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + ret = concat_infer_forward_tmp_bytes_mali(inputDesc, bytes); +#endif + } else { + *bytes = 0; + for (auto p : inputDesc) { + *bytes += tensorNumBytes(p); + } + ret = SUCCESS; + } + return ret; +} + +EE concat(std::vector<Tensor> inputTensor, + ConcatParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + std::vector<TensorDesc> inputDesc = get_desc_from_tensors(inputTensor); + std::vector<F32> inputScale = get_scale_from_tensors(inputTensor); + std::vector<void *> input = get_data_from_tensors(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + F32 outputScale = outputTensor.get_scale(); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + processInputDescs(&inputDesc, p.axis); + ret = concat_cpu( + inputDesc, input, inputScale.data(), p, tmp, outputDesc, output, &outputScale); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = concat_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, input, NULL, p, + (GCLMem_t)tmp, outputDesc, (GCLMem_t)output, NULL); +#endif + } + outputTensor.set_scale(outputScale); + return ret; +} diff --git a/compute/tensor/src/convolution.cpp b/compute/tensor/src/convolution.cpp new file mode 100644 index 00000000..261dc90e --- /dev/null +++ b/compute/tensor/src/convolution.cpp @@ -0,0 +1,334 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
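+ +// Added commentary, not part of the original file: the CPU shape inference below uses the standard dilated-convolution geometry. As a worked example with assumed numbers, a 3x3 filter with dilateH = 2 has fhDilated = (3 - 1) * 2 + 1 = 5, so ih = 32 with paddingT = paddingB = 2 and strideH = 1 gives oh = (32 + 4 - 5) / 1 + 1 = 32, i.e. the spatial size is preserved.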
+ +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif + +inline EE convolution_infer_output_size_cpu(TensorDesc inputDesc, + TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc *outputDesc, + DataType targetDataType) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt; + DataFormat idf, fdf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + I32 oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + if (fh < 1 || fw < 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + + I32 strideH = convParamSpec.stride_h; + I32 strideW = convParamSpec.stride_w; + I32 paddingT = convParamSpec.padding_top; + I32 paddingB = convParamSpec.padding_bottom; + I32 paddingL = convParamSpec.padding_left; + I32 paddingR = convParamSpec.padding_right; + I32 dilateH = convParamSpec.dilatedRate_h; + I32 dilateW = convParamSpec.dilatedRate_w; + + U32 fhDilated = (fh - 1) * dilateH + 1; + U32 fwDilated = (fw - 1) * dilateW + 1; + oh = (ih + paddingT + paddingB - fhDilated) / strideH + 1; + ow = (iw + paddingL + paddingR - fwDilated) / strideW + 1; + + if (fn % 8 != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + + *outputDesc = tensor4df(targetDataType, DF_NCHWC8, in, fn, oh, ow); + return SUCCESS; +} + +EE convolution_infer_output_size(Tensor *inputTensor, + Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + Tensor *outputTensor, + DataType targetDataType, + ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = convolution_infer_output_size_mali( + inputDesc, filterDesc, convParamSpec, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = convolution_infer_output_size_cpu( + inputDesc, filterDesc, convParamSpec, &outputDesc, targetDataType); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE convolution_infer_forward_algorithm(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + ConvolutionForwardAlgorithm *algorithm, + DataType targetDataType, + ActivationParamSpec activationDesc, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = convolution_infer_forward_algorithm_x86( + inputDesc, filterDesc, outputDesc, convParamSpec, policy, algorithm, targetDataType); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = 
convolution_infer_forward_algorithm_arm( + inputDesc, filterDesc, outputDesc, convParamSpec, policy, algorithm, targetDataType); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + GCLMemDesc gclmemInputDesc = ocl_get_desc(inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(outputTensor); + ret = convolution_infer_forward_algorithm_mali(((MaliPara_t)(archInfo->archPara))->handle, + inputDesc, filterDesc, convParamSpec, outputDesc, gclmemInputDesc, gclmemOutputDesc, + policy, activationDesc.mode, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } + return ret; +} + +EE convolution_transform_filter_bytes(Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc filterDesc = filterTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + *bytes = tensorNumBytes(filterDesc); + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = convolution_transform_filter_bytes_x86(filterDesc, convParamSpec, algorithm, bytes); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = convolution_transform_filter_bytes_arm(filterDesc, convParamSpec, algorithm, bytes); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = convolution_transform_filter_bytes_mali(filterDesc, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, + ((MaliPara_t)(archInfo->archPara))->gclmemFilterDesc, bytes); +#endif + } + return ret; +} + +EE convolution_transform_filter(Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + Tensor tmpTensor, + Tensor *ftmTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc filterDesc = filterTensor.get_desc(); + void *filter = get_ptr_from_tensor(filterTensor, arch); + TensorDesc ftmDesc = ftmTensor->get_desc(); + void *filterTransformed = get_ptr_from_tensor(*ftmTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + UNI_memcpy(filterTransformed, filter, tensorNumBytes(filterDesc)); + ftmDesc = filterDesc; + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = convolution_transform_filter_x86( + filterDesc, filter, convParamSpec, algorithm, &ftmDesc, filterTransformed); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = convolution_transform_filter_arm( + filterDesc, filter, convParamSpec, algorithm, &ftmDesc, filterTransformed); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + ret = convolution_transform_filter_mali(((MaliPara_t)(archInfo->archPara))->handle, + filterDesc, (GCLMem_t)filter, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, + (GCLMem_t)tmp, &ftmDesc, (GCLMem_t)filterTransformed); +#endif + } + ftmTensor->resize(ftmDesc); + return ret; +} + +EE convolution_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = 
convolution_infer_forward_tmp_bytes_x86( + inputDesc, filterDesc, outputDesc, convParamSpec, algorithm, bytes); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = convolution_infer_forward_tmp_bytes_arm( + inputDesc, filterDesc, outputDesc, convParamSpec, algorithm, bytes); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = convolution_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, outputDesc, + convParamSpec, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, bytes); +#endif + } + return ret; +} + +inline void convolution_process_bnn_scale( + U8 **bias, U8 **scale, TensorDesc *biasDesc, TensorDesc *scaleDesc) +{ + U32 vecLen = tensorNumElements(*biasDesc) / 2; + biasDesc->dims[0] = vecLen; + *scaleDesc = *biasDesc; + *scale = *bias; + *bias += vecLen * bytesOf(DT_F16); +} + +EE convolution(Tensor inputTensor, + Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + void *scale, + Tensor biasTensor, + Tensor tmpTensor, + Tensor outputTensor, + ActivationParamSpec activationDesc, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + if (3 == inputDesc.nDims) { + inputDesc = tensor4df(inputDesc.dt, DF_NCHW, inputDesc.dims[2], inputDesc.dims[1], inputDesc.dims[0], 1); + } + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc filterDesc = filterTensor.get_desc(); + void *filter = get_ptr_from_tensor(filterTensor, arch); + TensorDesc biasDesc = biasTensor.get_desc(); + void *bias = get_ptr_from_tensor(biasTensor, arch); + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + TensorDesc scaleDesc = filterDesc; + + EE ret = NOT_SUPPORTED; +#ifdef _USE_FP16 + if (IS_GENERAL(arch) || (IS_ARM(arch))) { + if (filterDesc.dt == DT_BIN01 || filterDesc.dt == DT_BIN11) { + U8 *biasPtr = (U8 *)get_ptr_from_tensor(biasTensor, arch); + U8 *scalePtr = nullptr; + convolution_process_bnn_scale(&biasPtr, &scalePtr, &biasDesc, &scaleDesc); + bias = biasPtr; + scale = scalePtr; + } + } +#endif + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = convolution_general(inputDesc, input, filterDesc, filter, convParamSpec, scaleDesc, + scale, biasDesc, bias, outputDesc, output, activationDesc); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = convolution_x86(inputDesc, input, filterDesc, filter, convParamSpec, algorithm, + scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc, + archInfo->arch); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = convolution_arm(inputDesc, input, filterDesc, filter, convParamSpec, algorithm, + scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc, + archInfo->arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = convolution_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, filterDesc, (GCLMem_t)filter, convParamSpec, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, scaleDesc, (GCLMem_t)scale, + biasDesc, (GCLMem_t)bias, tmpBytes, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output, + activationDesc.mode); +#endif + } + return ret; +} diff --git a/compute/tensor/src/copy.cpp b/compute/tensor/src/copy.cpp new file mode 100644 index 00000000..6d79ed46 --- /dev/null +++ b/compute/tensor/src/copy.cpp @@ -0,0 +1,66 @@ +// Copyright (C) 2019. 
Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE copy_infer_output_size(std::vector<Tensor *> inputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + std::vector<TensorDesc> inputDesc = get_desc_from_tensor_ptrs(inputTensor); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + std::vector<GCLMemDesc> gclmemInputDescs; + for (auto p : inputTensor) { + gclmemInputDescs.push_back(ocl_get_desc(*p)); + } + ret = copy_infer_output_size_mali(inputDesc, gclmemInputDescs.data()); + for (U32 i = 0; i < inputTensor.size(); i++) { + ocl_set_desc(inputTensor[i], gclmemInputDescs[i]); + } +#endif + } + return ret; +} + +EE copy(std::vector<Tensor> inputTensor, + U32 srcOffset, + U32 dstOffset, + U32 srcStride, + U32 dstStride, + U32 length, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + std::vector<TensorDesc> inputDesc = get_desc_from_tensors(inputTensor); + std::vector<void *> input = get_data_from_tensors(inputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + ret = copy_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, input, srcOffset, + dstOffset, srcStride, dstStride, length); +#endif +#ifdef _USE_CPU + } else { + memcpy((U8 *)input[1] + bytesOf(inputDesc[1].dt) * dstOffset, + (U8 *)input[0] + bytesOf(inputDesc[0].dt) * srcOffset, + length * bytesOf(inputDesc[0].dt)); + ret = SUCCESS; +#endif + } + return ret; +} diff --git a/compute/tensor/src/cpu/activation.cpp b/compute/tensor/src/cpu/activation.cpp new file mode 100644 index 00000000..fb9cf6e4 --- /dev/null +++ b/compute/tensor/src/cpu/activation.cpp @@ -0,0 +1,32 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/tensor_computing_cpu.h" +#include "cpu/cpu_functions.h" + +EE activation_cpu(TensorDesc inputDesc, + void *input, + ActivationParamSpec activationDesc, + TensorDesc outputDesc, + void *output, + Arch arch) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt = inputDesc.dt; + U32 len = tensorNumElements(inputDesc); + CHECK_REQUIREMENT(len == tensorNumElements(outputDesc)); + ArrayActivationFunction activation_func = get_array_activation_function(arch); + return activation_func(idt, input, len, activationDesc, output); +} diff --git a/compute/tensor/src/cpu/argmax.cpp b/compute/tensor/src/cpu/argmax.cpp new file mode 100644 index 00000000..a5bb6ba6 --- /dev/null +++ b/compute/tensor/src/cpu/argmax.cpp @@ -0,0 +1,86 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
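+ +// Added commentary, not part of the original file: dims are stored innermost-first, so the code below computes loopInner as the product of the dims below the reduction axis; array_argmax then scans input[j], input[j + stride], ... and index / stride maps the flat winner back to a coordinate along the reduced axis. As an assumed example, argmax over the channel axis of an NCHW tensor uses stride = H * W and performs N * H * W independent strided scans of length C.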
+ +#include "cpu/tensor_computing_cpu.h" + +template <typename T> +static U32 array_argmax(const T *input, U32 len, U32 stride) +{ + U32 index = 0; + U32 j = stride; + for (U32 i = 1; i < len; i++, j += stride) { + if (input[j] > input[index]) { + index = j; + } + } + return index / stride; +} + +template <typename T> +static EE argmax(TensorDesc inputDesc, const T *input, I32 axis, TensorDesc outputDesc, U32 *output) +{ + UNUSED(outputDesc); + + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + if (axis < 0) { + axis = inputDesc.nDims + axis; + } + axis = inputDesc.nDims - 1 - axis; + U32 loopInner = 1; + for (int i = 0; i < axis; i++) { + loopInner *= inputDesc.dims[i]; + } + U32 loopOuter = 1; + for (U32 i = axis + 1; i < inputDesc.nDims; i++) { + loopOuter *= inputDesc.dims[i]; + } + + U32 len = inputDesc.dims[axis]; + for (U32 i = 0; i < loopOuter; i++) { + for (U32 j = 0; j < loopInner; j++) { + const T *array = input + i * (len * loopInner) + j; + output[i * loopInner + j] = array_argmax(array, len, loopInner); + } + } + return SUCCESS; +} + +EE argmax_cpu( + TensorDesc inputDesc, const void *input, ArgMaxParamSpec p, TensorDesc outputDesc, void *output) +{ + DataType idt = inputDesc.dt; + EE ret = SUCCESS; + int axis = p.axis; + switch (idt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = argmax<F32>(inputDesc, (const F32 *)input, axis, outputDesc, (U32 *)output); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = argmax<F16>(inputDesc, (const F16 *)input, axis, outputDesc, (U32 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + + return ret; +} diff --git a/compute/tensor/src/cpu/arm/arm_functions.h b/compute/tensor/src/cpu/arm/arm_functions.h new file mode 100644 index 00000000..0db1458f --- /dev/null +++ b/compute/tensor/src/cpu/arm/arm_functions.h @@ -0,0 +1,249 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
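+ +// Added commentary, not part of the original header: every helper below is a thin DataType dispatcher over the fp16/fp32/int8 kernels included above; a data type whose kernel was not compiled in falls through to CHECK_STATUS(NOT_SUPPORTED) rather than silently computing in a different precision.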
+ +#ifndef _H_ARM_FUNCTIONS +#define _H_ARM_FUNCTIONS + +#include "cpu/cpu_functions_template.h" +#ifdef _USE_FP16 +#include "cpu/arm/fp16/arm_functions_fp16.h" +#endif +#ifdef _USE_FP32 +#include "cpu/arm/fp32/arm_functions_fp32.h" +#endif +#ifdef _USE_INT8 +#include "cpu/arm/int8/arm_functions_int8.h" +#endif + +// array sum +inline F32 array_sum_arm(DataType dt, const void *data, I32 len) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + result = array_sum_f16((const F16 *)data, len); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + result = array_sum_f32((const F32 *)data, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +// array mean +inline F32 array_mean_arm(DataType dt, const void *data, I32 len) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + result = array_mean_f16((const F16 *)data, len); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + result = array_mean_f32((const F32 *)data, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +// array var +inline F32 array_var_arm(DataType dt, const void *data, I32 len, F32 mean) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + result = array_var_f16((const F16 *)data, len, mean); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + result = array_var_f32((const F32 *)data, len, mean); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +// array max +inline F32 array_max_arm(DataType dt, const void *data, I32 len) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + result = array_max_f16((const F16 *)data, len); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + result = array_max_f32((const F32 *)data, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +inline F32 array_maxabs_arm(DataType dt, const void *data, I32 len) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + result = array_maxabs_f16((const F16 *)data, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +inline void array_scale_arm( + DataType dt, const void *input, void *output, I32 len, F32 alpha, F32 beta) +{ + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + array_scale_f16((const F16 *)input, (F16 *)output, len, alpha, beta); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + array_scale_f32((const F32 *)input, (F32 *)output, len, alpha, beta); + break; +#endif + case DT_I32: + array_scale_template((const I32 *)input, (I32 *)output, len, alpha, beta); + break; + case DT_U32: + array_scale_template((const U32 *)input, (U32 *)output, len, alpha, beta); + break; + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +inline void array_power_arm(DataType dt, void *input, void *output, I32 len, F32 power) +{ + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + array_power_f16((F16 *)input, (F16 *)output, len, power); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + array_power_f32((F32 *)input, (F32 *)output, len, power); + break; +#endif + case DT_I32: + array_power_template((I32 *)input, (I32 *)output, len, power); + break; + case DT_U32: + array_power_template((U32 *)input, (U32 *)output, len, power); + break; + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +inline EE array_activation_arm( + DataType dt, void *input, U32 len, ActivationParamSpec activationDesc, void *output) +{ + EE ret = SUCCESS; + switch 
(dt) { +#ifdef _USE_FP16 + case DT_F16: + ret = activation_fp16((F16 *)input, len, activationDesc, (F16 *)output); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + ret = activation_fp32((F32 *)input, len, activationDesc, (F32 *)output); + break; +#endif +#ifdef _USE_INT8 + case DT_I8: { + ret = activation_int8((INT8 *)input, len, activationDesc, (INT8 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +inline void array_add_arm(DataType dt, const void *inputA, const void *inputB, void *output, I32 len) +{ + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + array_add_f16((const F16 *)inputA, (const F16 *)inputB, (F16 *)output, len); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + array_add_f32((const F32 *)inputA, (const F32 *)inputB, (F32 *)output, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +inline void array_square_and_add_arm( + DataType dt, const void *inputA, const void *inputB, void *output, I32 len) +{ + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + array_square_and_add_f16((const F16 *)inputA, (const F16 *)inputB, (F16 *)output, len); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + array_square_and_add_f32((const F32 *)inputA, (const F32 *)inputB, (F32 *)output, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} +#endif diff --git a/compute/tensor/src/cpu/arm/attention.cpp b/compute/tensor/src/cpu/arm/attention.cpp new file mode 100644 index 00000000..c8637194 --- /dev/null +++ b/compute/tensor/src/cpu/arm/attention.cpp @@ -0,0 +1,53 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
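+ +// Added commentary, not part of the original file: attention_arm dispatches on data type only; the (batch, numHeads, fromSequenceLength, toSequenceLength) geometry is read from the 4-d output descriptor, and the 2-d input is the flattened buffer that the fp16/fp32 kernels expand into that layout.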
+ +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif + +EE attention_arm(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output) +{ + DataType dt; + DataFormat df; + U32 batch, numHeads, fromSequenceLength, toSequenceLength; + CHECK_REQUIREMENT(tensorIs2d(inputDesc)); + CHECK_REQUIREMENT(tensorIs4d(outputDesc)); + CHECK_STATUS(tensor4dGet( + outputDesc, &dt, &df, &batch, &numHeads, &fromSequenceLength, &toSequenceLength)); + + EE ret = SUCCESS; + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = attention_fp32(batch, numHeads, fromSequenceLength, toSequenceLength, + (const F32 *)input, (F32 *)output); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = attention_fp16(batch, numHeads, fromSequenceLength, toSequenceLength, + (const F16 *)input, (F16 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/attention_mask.cpp b/compute/tensor/src/cpu/arm/attention_mask.cpp new file mode 100644 index 00000000..e79b0936 --- /dev/null +++ b/compute/tensor/src/cpu/arm/attention_mask.cpp @@ -0,0 +1,49 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif + +EE attention_mask_arm(TensorDesc inputDesc, + const void *input, + AttentionMaskParamSpec p, + TensorDesc outputDesc, + void *output) +{ + DataType idt = inputDesc.dt; + EE ret = SUCCESS; + switch (idt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = attention_mask_fp32(inputDesc, (const F32 *)input, p, outputDesc, (F32 *)output); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = attention_mask_fp16(inputDesc, (const F16 *)input, p, outputDesc, (F16 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + + return ret; +} diff --git a/compute/tensor/src/cpu/arm/bnn/convolution.cpp b/compute/tensor/src/cpu/arm/bnn/convolution.cpp new file mode 100644 index 00000000..bdf74d24 --- /dev/null +++ b/compute/tensor/src/cpu/arm/bnn/convolution.cpp @@ -0,0 +1,74 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_FP16 +#include "cpu/arm/bnn/tensor_computing_bnn.h" + +EE convolution_bnn(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filter, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scale, + TensorDesc biasDesc, + const F16 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + if (nullptr == input || nullptr == filter || nullptr == output || nullptr == scale || + nullptr == bias || nullptr == tmp) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (idt != DT_F16) { + CHECK_STATUS(NOT_MATCH); + } + if (odt != DT_F16) { + CHECK_STATUS(NOT_MATCH); + } + if (idf != DF_NCHWC8 || odf != DF_NCHWC8) { + CHECK_STATUS(NOT_MATCH); + } + + EE ret = SUCCESS; + switch (fdt) { + case DT_BIN01: + ret = convolution_dorefa(inputDesc, (F16 *)input, filterDesc, (BIN8 *)filter, + convParamSpec, scaleDesc, (F16 *)scale, biasDesc, (F16 *)bias, tmpBytes, tmp, + outputDesc, (F16 *)output, activationDesc, arch); + break; + case DT_BIN11: + ret = convolution_xnor(inputDesc, (F16 *)input, filterDesc, (BIN8 *)filter, + convParamSpec, scaleDesc, (F16 *)scale, biasDesc, (F16 *)bias, tmpBytes, tmp, + outputDesc, (F16 *)output, activationDesc, arch); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} +#endif diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_dorefa.h b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa.h new file mode 100644 index 00000000..8b586610 --- /dev/null +++ b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa.h @@ -0,0 +1,86 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_CONVOLUTION_DOREFA +#define _H_CONVOLUTION_DOREFA + +#ifdef _USE_FP16 +#include <arm_neon.h> +#include <string.h> +#include "sys.h" +#include "types.h" +#include "error.h" + +EE convolution_dorefa_A55(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scaleArray, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc); + +EE convolution_dorefa_A76(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scaleArray, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc); + +inline EE convolution_dorefa(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filter, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scale, + TensorDesc biasDesc, + const F16 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: + ret = convolution_dorefa_A55(inputDesc, input, filterDesc, filter, convParamSpec, + scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); + break; + case ARM_A76: + ret = convolution_dorefa_A76(inputDesc, input, filterDesc, filter, convParamSpec, + scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); + break; + default: + return NOT_SUPPORTED; + } + return ret; +} +#endif +#endif diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A55.cpp b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A55.cpp new file mode 100644 index 00000000..b6f0fef9 --- /dev/null +++ b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A55.cpp @@ -0,0 +1,779 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_FP16 +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" + +#include "cpu/arm/bnn/convolution_dorefa.h" + +EE convolution_dorefa_A55(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scaleArray, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(scaleDesc); + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(activationDesc); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if (fdf != DF_NCHWN16C8) { + CHECK_STATUS(NOT_MATCH); + } + if (!(ic == fc && oc == fn)) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + U32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + + BIN8 *inArray = ((BIN8 *)tmp) + ic * ihiw + 8 * fh * fw * ic; // ic has been divided by 8 + BIN8 *inArray_pad; + for (U32 n = 0; n < in; n++) { + const F16 *in = input + n * ic * ih * iw * 8; + for (U32 i = 0; i < ic * ih * iw; i++) { + BIN8 temp = 0; + for (U32 j = 0; j < 8; j++) { + if (in[i * 8 + j] >= 0.5) { + temp |= (1 << (7 - j)); // set + } + } + inArray[i] = temp; + } + + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw; // ic has been divided by 8 + } else { + // copy input into a input with padding + inArray_pad = (BIN8 *)tmp; + BIN8 *inArray_pad_mov = inArray_pad; + BIN8 *inArray_mov = inArray + n * ic * ih * iw; + for (U32 c = 0; c < ic; c++) { // All divide by 8 + for (U32 h = 0; h < paddingT; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(DT_BIN01)); + inArray_pad_mov += iw_pad; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * bytesOf(DT_BIN01)); 
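+ // added note: the left padding of this row was just zeroed; advance past it, copy the iw bytes of bit-packed activations for the row, then zero the right padding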
+ inArray_pad_mov += paddingL; + memcpy(inArray_pad_mov, inArray_mov, iw * bytesOf(DT_BIN01)); + inArray_pad_mov += iw; + inArray_mov += iw; + memset(inArray_pad_mov, 0, paddingR * bytesOf(DT_BIN01)); + inArray_pad_mov += paddingR; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(DT_BIN01)); + inArray_pad_mov += iw_pad; + } + } + } + for (U32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw8c8 + im2col + U32 in_h[8]; + U32 in_w[8]; + for (U32 i = 0; i < 8; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw8c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + // NHWChw8c8 + BIN8 *in_order_hw8c8 = in_order + c * fh * fw * 8 + fh_idx * fw * 8 + + fw_idx * 8; // This 8 comes from hw8 + for (U32 i = 0; i < 8; i++) { + in_order_hw8c8[i] = *(in_hw8c8 + in_h[i] * iw_pad + in_w[i]); + } + } + } + } + + // compute + for (U32 o = 0; o < oc; o += + 2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. + BIN8 *in_hw0 = in_order; + const BIN8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; // ic has been divided by 8 + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // scale and bias + const F16 *s_o0 = s0; + const F16 *s_o1 = s1; + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + /* Layout + 5 6 + 7 8 + 9 10 + 11 12 + + 13 14 + 15 16 + 17 18 + 19 20 + */ + "eor v5.16b, v5.16b, v5.16b\n" + "ldr d29, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "eor v7.16b, v7.16b, v7.16b\n" + "ldr x2, [%[f_0], #8]\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ins v0.d[1], x2\n" + "eor v9.16b, v9.16b, v9.16b\n" + "dup v1.16b, v29.b[0]\n" // duplicate a full register + "eor v10.16b, v10.16b, v10.16b\n" + "dup v2.16b, v29.b[1]\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + "eor v13.16b, v13.16b, v13.16b\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + + "0:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "mov x9, %[fhfw]\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + "mov x4, #4\n" + + "1:\n" + "and v3.16b, v1.16b, v0.16b\n" + "ldr d30, [x0, 16]!\n" // next filter + + "and v4.16b, v2.16b, v0.16b\n" + "ldr x1, [x0, 8]\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter + "dup v2.16b, v29.b[3]\n" + + "add v22.16b, v22.16b, v4.16b\n" + "ins v30.d[1], x1\n" + + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" 
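+ // added note: cnt is a per-byte popcount of the AND-masked bits; for DT_BIN01 data, AND followed by popcount accumulates the binary dot product that the ucvtf/fmla epilogue later scales by the conv scale and offsets by the bias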
+ "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[4]\n" + "add v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[5]\n" + "add v24.16b, v24.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[6]\n" + "add v25.16b, v25.16b, v3.16b\n" + "dup v2.16b, v29.b[7]\n" + "add v26.16b, v26.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "ldr d29, [x3, 8]!\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "mov v0.16b, v30.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "add v27.16b, v27.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "add v28.16b, v28.16b, v4.16b\n" + "bne 1b\n" + + "movi v3.16b, #1\n" + "umlal v5.8h, v21.8b, v3.8b\n" + "umlal v7.8h, v22.8b, v3.8b\n" + "umlal v9.8h, v23.8b, v3.8b\n" + "umlal v11.8h, v24.8b, v3.8b\n" + "umlal v13.8h, v25.8b, v3.8b\n" + "umlal v15.8h, v26.8b, v3.8b\n" + "umlal v17.8h, v27.8b, v3.8b\n" + "umlal v19.8h, v28.8b, v3.8b\n" + + "umlal2 v6.8h, v21.16b, v3.16b\n" + "umlal2 v8.8h, v22.16b, v3.16b\n" + "umlal2 v10.8h, v23.16b, v3.16b\n" + "umlal2 v12.8h, v24.16b, v3.16b\n" + "umlal2 v14.8h, v25.16b, v3.16b\n" + "umlal2 v16.8h, v26.16b, v3.16b\n" + "umlal2 v18.8h, v27.16b, v3.16b\n" + "umlal2 v20.8h, v28.16b, v3.16b\n" + + "subs x9, x9, #1\n" + "beq 4f\n" // 1x1, continue with the next 32 input channels + + "2:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + "mov x4, #32\n" // Assume 256 will not happen + "3:\n" + "and v3.16b, v1.16b, v0.16b\n" + "ldr d30, [x0, 16]!\n" // next filter + + "and v4.16b, v2.16b, v0.16b\n" + "ldr x1, [x0, 8]\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "uqadd v21.16b, v21.16b, v3.16b\n" + "dup v2.16b, v29.b[3]\n" + + "uqadd v22.16b, v22.16b, v4.16b\n" + "ins v30.d[1], x1\n" + + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[4]\n" + "uqadd v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[5]\n" + "uqadd v24.16b, v24.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[6]\n" + "uqadd v25.16b, v25.16b, v3.16b\n" + "dup v2.16b, v29.b[7]\n" + "uqadd v26.16b, v26.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "ldr d29, [x3, 8]!\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "mov v0.16b, v30.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "uqadd v27.16b, v27.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "uqadd v28.16b, v28.16b, v4.16b\n" + "bne 3b\n" + + "movi v3.16b, #1\n" + "umlal v5.8h, v21.8b, v3.8b\n" + "umlal v7.8h, v22.8b, v3.8b\n" + "umlal v9.8h, v23.8b, v3.8b\n" + "umlal v11.8h, v24.8b, v3.8b\n" + "umlal v13.8h, v25.8b, v3.8b\n" + "umlal v15.8h, v26.8b, v3.8b\n" + "umlal v17.8h, v27.8b, v3.8b\n" + "umlal v19.8h, v28.8b, v3.8b\n" + + "umlal2 v6.8h, v21.16b, v3.16b\n" + "umlal2 v8.8h, v22.16b, v3.16b\n" + "umlal2 v10.8h, v23.16b, v3.16b\n" + "umlal2 v12.8h, v24.16b, v3.16b\n" + "umlal2 v14.8h, v25.16b, v3.16b\n" + "umlal2 v16.8h, v26.16b, v3.16b\n" + "umlal2 v18.8h, v27.16b, v3.16b\n" + "umlal2 v20.8h, v28.16b, v3.16b\n" + + "subs x9, x9, #8\n" + "bne 2b\n" + + "4:\n" 
// Wrap up computation for 32 input channels + "subs x2, x2, #32\n" + "bne 0b\n" + + // pipelined + "ucvtf v5.8h, v5.8h\n" + "ucvtf v6.8h, v6.8h\n" + "ldr q21, [%[b_0]]\n" + "ucvtf v7.8h, v7.8h\n" + "ldr q22, [%[b_1]]\n" + "ucvtf v8.8h, v8.8h\n" + "ldr q23, [%[s_0]]\n" + "ucvtf v9.8h, v9.8h\n" + "ldr q24, [%[s_1]]\n" + "ucvtf v10.8h, v10.8h\n" + "ucvtf v11.8h, v11.8h\n" + "mov v1.16b, v21.16b\n" + "ucvtf v12.8h, v12.8h\n" + "mov v2.16b, v22.16b\n" + "ucvtf v13.8h, v13.8h\n" + "fmla v1.8h, v5.8h, v23.8h\n" + "ucvtf v14.8h, v14.8h\n" + "fmla v2.8h, v6.8h, v24.8h\n" + "ucvtf v15.8h, v15.8h\n" + "mov v3.16b, v21.16b\n" + "ucvtf v16.8h, v16.8h\n" + "mov v4.16b, v22.16b\n" + "ucvtf v17.8h, v17.8h\n" + "fmla v3.8h, v7.8h, v23.8h\n" + "ucvtf v18.8h, v18.8h\n" + "fmla v4.8h, v8.8h, v24.8h\n" + "ucvtf v19.8h, v19.8h\n" + "mov v5.16b, v21.16b\n" + "ucvtf v20.8h, v20.8h\n" + "mov v6.16b, v22.16b\n" + + "fmla v5.8h, v9.8h, v23.8h\n" + "mov v7.16b, v21.16b\n" + "fmla v6.8h, v10.8h, v24.8h\n" + "mov v8.16b, v22.16b\n" + "fmla v7.8h, v11.8h, v23.8h\n" + "mov v9.16b, v21.16b\n" + "fmla v8.8h, v12.8h, v24.8h\n" + "mov v10.16b, v22.16b\n" + "fmla v9.8h, v13.8h, v23.8h\n" + "mov v11.16b, v21.16b\n" + "fmla v10.8h, v14.8h, v24.8h\n" + "mov v12.16b, v22.16b\n" + "fmla v11.8h, v15.8h, v23.8h\n" + "mov v13.16b, v21.16b\n" + "fmla v12.8h, v16.8h, v24.8h\n" + "mov v14.16b, v22.16b\n" + "fmla v13.8h, v17.8h, v23.8h\n" + "mov v15.16b, v21.16b\n" + "fmla v14.8h, v18.8h, v24.8h\n" + "mov v16.16b, v22.16b\n" + "fmla v15.8h, v19.8h, v23.8h\n" + "fmla v16.8h, v20.8h, v24.8h\n" + + "str q1, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q5, [%[out_0], #32]\n" // out_o0hw2 + "str q7, [%[out_0], #48]\n" // out_o0hw3 + "str q9, [%[out_0], #64]\n" // out_o0hw4 + "str q11, [%[out_0], #80]\n" // out_o0hw5 + "str q13, [%[out_0], #96]\n" // out_o0hw6 + "str q15, [%[out_0], #112]\n" // out_o0hw7 + + "str q2, [%[out_1]]\n" // out_o1hw0 + "str q4, [%[out_1], #16]\n" // out_o1hw1 + "str q6, [%[out_1], #32]\n" // out_o1hw2 + "str q8, [%[out_1], #48]\n" // out_o1hw3 + "str q10, [%[out_1], #64]\n" // out_o1hw4 + "str q12, [%[out_1], #80]\n" // out_o1hw5 + "str q14, [%[out_1], #96]\n" // out_o1hw6 + "str q16, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [fhfw] "r"((I64)fh * fw), [s_0] "r"(s_o0), + [s_1] "r"(s_o1), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", + "x1", "x2", "x3", "x4", "x9"); + s0 += 16; + s1 += 16; + b0 += 16; + b1 += 16; + } + } + // ohow_remainder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + + for (U32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw4c8 + im2col + U32 in_h[4]; + U32 in_w[4]; + for (U32 i = 0; i < 4; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw4c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + // NHWChw4c8 + BIN8 *in_order_hw4c8 = + 
in_order + c * fh * fw * 4 + fh_idx * fw * 4 + fw_idx * 4; + for (U32 i = 0; i < 4; i++) { + in_order_hw4c8[i] = *(in_hw4c8 + in_h[i] * iw_pad + in_w[i]); + } + } + } + } + + // compute + for (U32 o = 0; o < oc; o += + 2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. + BIN8 *in_hw0 = in_order; + const BIN8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; // ic has been divided by 8 + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // scale and bias + const F16 *s_o0 = s0; + const F16 *s_o1 = s1; + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + /* Layout + 5 6 + 7 8 + 9 10 + 11 12 + */ + "eor v5.16b, v5.16b, v5.16b\n" + "ldr s29, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "eor v7.16b, v7.16b, v7.16b\n" + "ldr x2, [%[f_0], #8]\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ins v0.d[1], x2\n" + "eor v9.16b, v9.16b, v9.16b\n" + "dup v1.16b, v29.b[0]\n" // duplicate a full register + "eor v10.16b, v10.16b, v10.16b\n" + "dup v2.16b, v29.b[1]\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + + "0:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "mov x9, %[fhfw]\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + + "mov x4, #4\n" + + "1:\n" + "and v3.16b, v1.16b, v0.16b\n" + "ldr d30, [x0, 16]!\n" // next filter + + "and v4.16b, v2.16b, v0.16b\n" + "ldr x1, [x0, 8]\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter + "dup v2.16b, v29.b[3]\n" + + "add v22.16b, v22.16b, v4.16b\n" + "ins v30.d[1], x1\n" + + "and v3.16b, v1.16b, v0.16b\n" + "ldr s29, [x3, 4]!\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "mov v0.16b, v30.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "add v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "add v24.16b, v24.16b, v4.16b\n" + "bne 1b\n" + + "movi v3.16b, #1\n" + "umlal v5.8h, v21.8b, v3.8b\n" + "umlal v7.8h, v22.8b, v3.8b\n" + "umlal v9.8h, v23.8b, v3.8b\n" + "umlal v11.8h, v24.8b, v3.8b\n" + + "umlal2 v6.8h, v21.16b, v3.16b\n" + "umlal2 v8.8h, v22.16b, v3.16b\n" + "umlal2 v10.8h, v23.16b, v3.16b\n" + "umlal2 v12.8h, v24.16b, v3.16b\n" + + "subs x9, x9, #1\n" + "beq 4f\n" // 1x1, continue with the next 32 input channels + + "2:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + + "mov x4, #32\n" // Assume 256 will not happen + "3:\n" + "and v3.16b, v1.16b, v0.16b\n" + "ldr d30, [x0, 16]!\n" // next filter + + "and v4.16b, v2.16b, v0.16b\n" + "ldr x1, [x0, 8]\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "uqadd v21.16b, v21.16b, v3.16b\n" + "dup v2.16b, v29.b[3]\n" + + "uqadd v22.16b, v22.16b, v4.16b\n" + "ins v30.d[1], x1\n" + + "and v3.16b, v1.16b, v0.16b\n" + "ldr s29, [x3, 4]!\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "mov v0.16b, v30.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "uqadd v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "uqadd v24.16b, v24.16b, v4.16b\n" + "bne 3b\n" + + "movi v3.16b, #1\n" + "umlal v5.8h, v21.8b, 
v3.8b\n" + "umlal v7.8h, v22.8b, v3.8b\n" + "umlal v9.8h, v23.8b, v3.8b\n" + "umlal v11.8h, v24.8b, v3.8b\n" + + "umlal2 v6.8h, v21.16b, v3.16b\n" + "umlal2 v8.8h, v22.16b, v3.16b\n" + "umlal2 v10.8h, v23.16b, v3.16b\n" + "umlal2 v12.8h, v24.16b, v3.16b\n" + + "subs x9, x9, #8\n" + "bne 2b\n" + + "4:\n" // Wrap up computation for 32 input channels + "subs x2, x2, #32\n" + "bne 0b\n" + + // pipelined + "ucvtf v5.8h, v5.8h\n" + "ucvtf v6.8h, v6.8h\n" + "ldr q21, [%[b_0]]\n" + "ucvtf v7.8h, v7.8h\n" + "ldr q22, [%[b_1]]\n" + "ucvtf v8.8h, v8.8h\n" + "ldr q23, [%[s_0]]\n" + "ucvtf v9.8h, v9.8h\n" + "ldr q24, [%[s_1]]\n" + "ucvtf v10.8h, v10.8h\n" + "ucvtf v11.8h, v11.8h\n" + "mov v1.16b, v21.16b\n" + "ucvtf v12.8h, v12.8h\n" + "mov v2.16b, v22.16b\n" + "fmla v1.8h, v5.8h, v23.8h\n" + "fmla v2.8h, v6.8h, v24.8h\n" + "mov v3.16b, v21.16b\n" + "mov v4.16b, v22.16b\n" + "fmla v3.8h, v7.8h, v23.8h\n" + "fmla v4.8h, v8.8h, v24.8h\n" + "mov v5.16b, v21.16b\n" + "mov v6.16b, v22.16b\n" + + "fmla v5.8h, v9.8h, v23.8h\n" + "mov v7.16b, v21.16b\n" + "fmla v6.8h, v10.8h, v24.8h\n" + "mov v8.16b, v22.16b\n" + "fmla v7.8h, v11.8h, v23.8h\n" + "fmla v8.8h, v12.8h, v24.8h\n" + + "str q1, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q5, [%[out_0], #32]\n" // out_o0hw2 + "str q7, [%[out_0], #48]\n" // out_o0hw3 + + "str q2, [%[out_1]]\n" // out_o1hw0 + "str q4, [%[out_1], #16]\n" // out_o1hw1 + "str q6, [%[out_1], #32]\n" // out_o1hw2 + "str q8, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [fhfw] "r"((I64)fh * fw), [s_0] "r"(s_o0), + [s_1] "r"(s_o1), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v21", "v22", "v23", "v24", + "v29", "v30", "x0", "x1", "x2", "x3", "x4", "x9"); + s0 += 16; + s1 += 16; + b0 += 16; + b1 += 16; + } + } + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (U32 hw = ohow_s; hw < ohow; hw++) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ih_pad * iw_pad; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw1c8 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw1c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + BIN8 *in_0 = in_hw1c8 + in_h_0 * iw_pad + in_w_0; + BIN8 *in_order_hw1c8 = in_order + c * fh * fw + fh_idx * fw + fw_idx; + *in_order_hw1c8 = (*in_0); + } + } + } + // compute + for (U32 o = 0; o < oc; o += 2) { + BIN8 *in_hw0 = in_order; + const BIN8 *f_o = filterArray + o * 8 * fh * fw * ic; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + + uint16x8_t sum[2] = {0}; + uint8x8_t v1 = vdup_n_u8(1); + for (U32 i = 0; i < ic * 8; i += 32) { + uint8x8_t sub0[2] = {0}; + + for (U32 j = 0; j < 4; j++) { + uint8x8_t f_0 = vld1_u8(f_o); + uint8x8_t f_1 = vld1_u8(f_o + 8); + f_o += 16; + uint8x8_t in_1 = vdup_n_u8(*in_hw0); + in_hw0++; + f_0 = vand_u8(in_1, f_0); + f_1 = vand_u8(in_1, f_1); + f_0 = vcnt_u8(f_0); + f_1 = vcnt_u8(f_1); + sub0[0] = vadd_u8(sub0[0], f_0); + sub0[1] = vadd_u8(sub0[1], f_1); + } + sum[0] = vmlal_u8(sum[0], sub0[0], v1); + 
sum[1] = vmlal_u8(sum[1], sub0[1], v1); + + for (U32 j = 1; j < fh * fw; j += 8) { + uint8x8_t sub1[2] = {0}; + for (U32 k = 0; k < 32; k++) { + uint8x8_t f_0 = vld1_u8(f_o); + uint8x8_t f_1 = vld1_u8(f_o + 8); + f_o += 16; + uint8x8_t in_1 = vdup_n_u8(*in_hw0); + in_hw0++; + f_0 = vand_u8(in_1, f_0); + f_1 = vand_u8(in_1, f_1); + f_0 = vcnt_u8(f_0); + f_1 = vcnt_u8(f_1); + sub1[0] = vadd_u8(sub1[0], f_0); + sub1[1] = vadd_u8(sub1[1], f_1); + } + sum[0] = vmlal_u8(sum[0], sub1[0], v1); + sum[1] = vmlal_u8(sum[1], sub1[1], v1); + } + } + + float16x8_t res_o0 = vcvtq_f16_u16(sum[0]); + float16x8_t res_o1 = vcvtq_f16_u16(sum[1]); + float16x8_t scale_o0 = vld1q_f16(s0); + s0 += 16; + float16x8_t scale_o1 = vld1q_f16(s1); + s1 += 16; + float16x8_t bias_o0 = vld1q_f16(b0); + b0 += 16; + float16x8_t bias_o1 = vld1q_f16(b1); + b1 += 16; + res_o0 = vmulq_f16(res_o0, scale_o0); + res_o1 = vmulq_f16(res_o1, scale_o1); + res_o0 = vaddq_f16(res_o0, bias_o0); + res_o1 = vaddq_f16(res_o1, bias_o1); + vst1q_f16(out_o0hw0, res_o0); + vst1q_f16(out_o1hw0, res_o1); + } + } + } + return SUCCESS; +} +#endif diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A76.cpp b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A76.cpp new file mode 100644 index 00000000..420ba6ef --- /dev/null +++ b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A76.cpp @@ -0,0 +1,759 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
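For reference, the arithmetic that the DoReFa kernel above (and the A76 variant below) implements per output channel reduces to popcount(activation AND weight) over bit-packed operands, followed by a per-channel scale and bias. A minimal scalar sketch of the same computation, assuming GCC/Clang builtins and ignoring the NCHWc8 blocking and saturating 8-bit accumulation (uqadd/umlal) that the real kernels use:

#include <cstdint>
#include <cstddef>

// Scalar model of the DoReFa binary dot product: activations and weights are
// packed 8 channels per byte; a set bit encodes a binarized 1, so the dot
// product is the popcount of the bitwise AND (what the and/cnt/add asm computes).
static inline float dorefa_dot(const uint8_t *act, const uint8_t *wgt,
    size_t bytes, float scale, float bias)
{
    uint32_t ones = 0;
    for (size_t i = 0; i < bytes; i++) {
        ones += (uint32_t)__builtin_popcount(act[i] & wgt[i]);
    }
    return (float)ones * scale + bias;  // matches the ucvtf + fmla tail above
}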
+ +#ifdef _USE_FP16 +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" + +#include "cpu/arm/bnn/convolution_dorefa.h" + +EE convolution_dorefa_A76(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scaleArray, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(scaleDesc); + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(activationDesc); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if (fdf != DF_NCHWN16C8) { + CHECK_STATUS(NOT_MATCH); + } + if (!(ic == fc && oc == fn)) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + U32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + + BIN8 *inArray = ((BIN8 *)tmp) + ic * ihiw + 8 * fh * fw * ic; // ic has been divided by 8 + BIN8 *inArray_pad; + for (U32 n = 0; n < in; n++) { + const F16 *in = input + n * ic * ih * iw * 8; + for (U32 i = 0; i < ic * ih * iw; i++) { + BIN8 temp = 0; + for (U32 j = 0; j < 8; j++) { + if (in[i * 8 + j] >= 0.5) { + temp |= (1 << (7 - j)); // set + } + } + inArray[i] = temp; + } + + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw; // ic has been divided by 8 + } else { + // copy input into a input with padding + inArray_pad = (BIN8 *)tmp; + BIN8 *inArray_pad_mov = inArray_pad; + BIN8 *inArray_mov = inArray + n * ic * ih * iw; + for (U32 c = 0; c < ic; c++) { // All divide by 8 + for (U32 h = 0; h < paddingT; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(DT_BIN01)); + inArray_pad_mov += iw_pad; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * bytesOf(DT_BIN01)); + inArray_pad_mov += paddingL; + memcpy(inArray_pad_mov, inArray_mov, iw * bytesOf(DT_BIN01)); + inArray_pad_mov += iw; + inArray_mov += iw; + memset(inArray_pad_mov, 0, paddingR * bytesOf(DT_BIN01)); + inArray_pad_mov += paddingR; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(DT_BIN01)); + inArray_pad_mov += iw_pad; + } + } + } + for (U32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw8c8 + im2col + U32 in_h[8]; + U32 in_w[8]; + for (U32 i = 0; i < 8; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw8c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + // NHWChw8c8 + BIN8 
*in_order_hw8c8 = in_order + c * fh * fw * 8 + fh_idx * fw * 8 + + fw_idx * 8; // This 8 comes from hw8 + for (U32 i = 0; i < 8; i++) { + in_order_hw8c8[i] = *(in_hw8c8 + in_h[i] * iw_pad + in_w[i]); + } + } + } + } + + // compute + for (U32 o = 0; o < oc; o += + 2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. + BIN8 *in_hw0 = in_order; + const BIN8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; // ic has been divided by 8 + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // scale and bias + const F16 *s_o0 = s0; + const F16 *s_o1 = s1; + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr d29, [%[in_0]]\n" // in_0 + "ldr q0, [%[f_0]]\n" // f_0 + /* Layout + 5 6 + 7 8 + 9 10 + 11 12 + + 13 14 + 15 16 + 17 18 + 19 20 + */ + "eor v5.16b, v5.16b, v5.16b\n" + "eor v6.16b, v6.16b, v6.16b\n" + "eor v7.16b, v7.16b, v7.16b\n" + "eor v8.16b, v8.16b, v8.16b\n" + "eor v9.16b, v9.16b, v9.16b\n" + "dup v1.16b, v29.b[0]\n" // duplicate a full register + "eor v10.16b, v10.16b, v10.16b\n" + "dup v2.16b, v29.b[1]\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + "eor v13.16b, v13.16b, v13.16b\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + + "0:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "mov x9, %[fhfw]\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + "mov x4, #4\n" + + "1:\n" + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter + "dup v2.16b, v29.b[3]\n" + + "add v22.16b, v22.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[4]\n" + "add v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[5]\n" + "add v24.16b, v24.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[6]\n" + "add v25.16b, v25.16b, v3.16b\n" + "dup v2.16b, v29.b[7]\n" + "add v26.16b, v26.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "ldr d29, [x3, 8]!\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "ldr q0, [x0, 16]!\n" // next filter + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "add v27.16b, v27.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "add v28.16b, v28.16b, v4.16b\n" + "bne 1b\n" + + "movi v3.16b, #1\n" + "umlal v5.8h, v21.8b, v3.8b\n" + "umlal v7.8h, v22.8b, v3.8b\n" + "umlal v9.8h, v23.8b, v3.8b\n" + "umlal v11.8h, v24.8b, v3.8b\n" + "umlal v13.8h, v25.8b, v3.8b\n" + "umlal v15.8h, v26.8b, v3.8b\n" + "umlal v17.8h, v27.8b, v3.8b\n" + "umlal v19.8h, v28.8b, v3.8b\n" + + "umlal2 v6.8h, v21.16b, v3.16b\n" + "umlal2 v8.8h, v22.16b, v3.16b\n" + "umlal2 v10.8h, v23.16b, v3.16b\n" + "umlal2 v12.8h, v24.16b, v3.16b\n" + "umlal2 
v14.8h, v25.16b, v3.16b\n" + "umlal2 v16.8h, v26.16b, v3.16b\n" + "umlal2 v18.8h, v27.16b, v3.16b\n" + "umlal2 v20.8h, v28.16b, v3.16b\n" + + "subs x9, x9, #1\n" + "beq 4f\n" // 1x1, continue with the next 32 input channels + + "2:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + "mov x4, #32\n" // Assume 256 will not happen + "3:\n" + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "uqadd v21.16b, v21.16b, v3.16b\n" + "dup v2.16b, v29.b[3]\n" + + "uqadd v22.16b, v22.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[4]\n" + "uqadd v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[5]\n" + "uqadd v24.16b, v24.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[6]\n" + "uqadd v25.16b, v25.16b, v3.16b\n" + "dup v2.16b, v29.b[7]\n" + "uqadd v26.16b, v26.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "ldr d29, [x3, 8]!\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "ldr q0, [x0, 16]!\n" // next filter + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "uqadd v27.16b, v27.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "uqadd v28.16b, v28.16b, v4.16b\n" + "bne 3b\n" + + "movi v3.16b, #1\n" + "umlal v5.8h, v21.8b, v3.8b\n" + "umlal v7.8h, v22.8b, v3.8b\n" + "umlal v9.8h, v23.8b, v3.8b\n" + "umlal v11.8h, v24.8b, v3.8b\n" + "umlal v13.8h, v25.8b, v3.8b\n" + "umlal v15.8h, v26.8b, v3.8b\n" + "umlal v17.8h, v27.8b, v3.8b\n" + "umlal v19.8h, v28.8b, v3.8b\n" + + "umlal2 v6.8h, v21.16b, v3.16b\n" + "umlal2 v8.8h, v22.16b, v3.16b\n" + "umlal2 v10.8h, v23.16b, v3.16b\n" + "umlal2 v12.8h, v24.16b, v3.16b\n" + "umlal2 v14.8h, v25.16b, v3.16b\n" + "umlal2 v16.8h, v26.16b, v3.16b\n" + "umlal2 v18.8h, v27.16b, v3.16b\n" + "umlal2 v20.8h, v28.16b, v3.16b\n" + + "subs x9, x9, #8\n" + "bne 2b\n" + + "4:\n" // Wrap up computation for 32 input channels + "subs x2, x2, #32\n" + "bne 0b\n" + + // pipelined + "ucvtf v5.8h, v5.8h\n" + "ucvtf v6.8h, v6.8h\n" + "ldr q21, [%[b_0]]\n" + "ucvtf v7.8h, v7.8h\n" + "ldr q22, [%[b_1]]\n" + "ucvtf v8.8h, v8.8h\n" + "ldr q23, [%[s_0]]\n" + "ucvtf v9.8h, v9.8h\n" + "ldr q24, [%[s_1]]\n" + "ucvtf v10.8h, v10.8h\n" + "ucvtf v11.8h, v11.8h\n" + "mov v1.16b, v21.16b\n" + "ucvtf v12.8h, v12.8h\n" + "mov v2.16b, v22.16b\n" + "ucvtf v13.8h, v13.8h\n" + "fmla v1.8h, v5.8h, v23.8h\n" + "ucvtf v14.8h, v14.8h\n" + "fmla v2.8h, v6.8h, v24.8h\n" + "ucvtf v15.8h, v15.8h\n" + "mov v3.16b, v21.16b\n" + "ucvtf v16.8h, v16.8h\n" + "mov v4.16b, v22.16b\n" + "ucvtf v17.8h, v17.8h\n" + "fmla v3.8h, v7.8h, v23.8h\n" + "ucvtf v18.8h, v18.8h\n" + "fmla v4.8h, v8.8h, v24.8h\n" + "ucvtf v19.8h, v19.8h\n" + "mov v5.16b, v21.16b\n" + "ucvtf v20.8h, v20.8h\n" + "mov v6.16b, v22.16b\n" + + "fmla v5.8h, v9.8h, v23.8h\n" + "mov v7.16b, v21.16b\n" + "fmla v6.8h, v10.8h, v24.8h\n" + "mov v8.16b, v22.16b\n" + "fmla v7.8h, v11.8h, v23.8h\n" + "mov v9.16b, v21.16b\n" + "fmla v8.8h, v12.8h, v24.8h\n" + "mov v10.16b, v22.16b\n" + "fmla v9.8h, v13.8h, v23.8h\n" + "mov v11.16b, v21.16b\n" + "fmla v10.8h, v14.8h, v24.8h\n" + "mov v12.16b, 
v22.16b\n" + "fmla v11.8h, v15.8h, v23.8h\n" + "mov v13.16b, v21.16b\n" + "fmla v12.8h, v16.8h, v24.8h\n" + "mov v14.16b, v22.16b\n" + "fmla v13.8h, v17.8h, v23.8h\n" + "mov v15.16b, v21.16b\n" + "fmla v14.8h, v18.8h, v24.8h\n" + "mov v16.16b, v22.16b\n" + "fmla v15.8h, v19.8h, v23.8h\n" + "fmla v16.8h, v20.8h, v24.8h\n" + + "str q1, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q5, [%[out_0], #32]\n" // out_o0hw2 + "str q7, [%[out_0], #48]\n" // out_o0hw3 + "str q9, [%[out_0], #64]\n" // out_o0hw4 + "str q11, [%[out_0], #80]\n" // out_o0hw5 + "str q13, [%[out_0], #96]\n" // out_o0hw6 + "str q15, [%[out_0], #112]\n" // out_o0hw7 + + "str q2, [%[out_1]]\n" // out_o1hw0 + "str q4, [%[out_1], #16]\n" // out_o1hw1 + "str q6, [%[out_1], #32]\n" // out_o1hw2 + "str q8, [%[out_1], #48]\n" // out_o1hw3 + "str q10, [%[out_1], #64]\n" // out_o1hw4 + "str q12, [%[out_1], #80]\n" // out_o1hw5 + "str q14, [%[out_1], #96]\n" // out_o1hw6 + "str q16, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [fhfw] "r"((I64)fh * fw), [s_0] "r"(s_o0), + [s_1] "r"(s_o1), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", + "x1", "x2", "x3", "x4", "x9"); + s0 += 16; + s1 += 16; + b0 += 16; + b1 += 16; + } + } + // ohow_remainder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + + for (U32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw4c8 + im2col + U32 in_h[4]; + U32 in_w[4]; + for (U32 i = 0; i < 4; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw4c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + // NHWChw4c8 + BIN8 *in_order_hw4c8 = + in_order + c * fh * fw * 4 + fh_idx * fw * 4 + fw_idx * 4; + for (U32 i = 0; i < 4; i++) { + in_order_hw4c8[i] = *(in_hw4c8 + in_h[i] * iw_pad + in_w[i]); + } + } + } + } + + // compute + for (U32 o = 0; o < oc; o += + 2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. 
+ BIN8 *in_hw0 = in_order; + const BIN8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; // ic has been divided by 8 + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // scale and bias + const F16 *s_o0 = s0; + const F16 *s_o1 = s1; + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q0, [%[f_0]]\n" // f_0 + "ldr s29, [%[in_0]]\n" // in_0 + /* Layout + 5 6 + 7 8 + 9 10 + 11 12 + */ + "eor v5.16b, v5.16b, v5.16b\n" + "eor v6.16b, v6.16b, v6.16b\n" + "eor v7.16b, v7.16b, v7.16b\n" + "eor v8.16b, v8.16b, v8.16b\n" + "eor v9.16b, v9.16b, v9.16b\n" + "dup v1.16b, v29.b[0]\n" // duplicate a full register + "eor v10.16b, v10.16b, v10.16b\n" + "dup v2.16b, v29.b[1]\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + + "0:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "mov x9, %[fhfw]\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + + "mov x4, #4\n" + + "1:\n" + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter + "dup v2.16b, v29.b[3]\n" + + "add v22.16b, v22.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "ldr s29, [x3, 4]!\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "ldr q0, [x0, 16]!\n" // next filter + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "add v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "add v24.16b, v24.16b, v4.16b\n" + "bne 1b\n" + + "movi v3.16b, #1\n" + "umlal v5.8h, v21.8b, v3.8b\n" + "umlal v7.8h, v22.8b, v3.8b\n" + "umlal v9.8h, v23.8b, v3.8b\n" + "umlal v11.8h, v24.8b, v3.8b\n" + + "umlal2 v6.8h, v21.16b, v3.16b\n" + "umlal2 v8.8h, v22.16b, v3.16b\n" + "umlal2 v10.8h, v23.16b, v3.16b\n" + "umlal2 v12.8h, v24.16b, v3.16b\n" + + "subs x9, x9, #1\n" + "beq 4f\n" // 1x1, continue with the next 32 input channels + + "2:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + + "mov x4, #32\n" // Assume 256 will not happen + "3:\n" + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "uqadd v21.16b, v21.16b, v3.16b\n" + "dup v2.16b, v29.b[3]\n" + + "uqadd v22.16b, v22.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "ldr s29, [x3, 4]!\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "ldr q0, [x0, 16]!\n" // next filter + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "uqadd v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "uqadd v24.16b, v24.16b, v4.16b\n" + "bne 3b\n" + + "movi v3.16b, #1\n" + "umlal v5.8h, v21.8b, v3.8b\n" + "umlal v7.8h, v22.8b, v3.8b\n" + "umlal v9.8h, v23.8b, v3.8b\n" + "umlal v11.8h, v24.8b, v3.8b\n" + + "umlal2 v6.8h, v21.16b, v3.16b\n" + "umlal2 v8.8h, v22.16b, v3.16b\n" + "umlal2 v10.8h, v23.16b, v3.16b\n" + "umlal2 v12.8h, v24.16b, v3.16b\n" + + "subs x9, x9, #8\n" + "bne 2b\n" + + "4:\n" // Wrap up computation for 32 input channels + "subs x2, x2, #32\n" + "bne 0b\n" + + // pipelined + "ucvtf v5.8h, v5.8h\n" + "ucvtf v6.8h, v6.8h\n" + "ldr q21, [%[b_0]]\n" + "ucvtf v7.8h, v7.8h\n" + 
"ldr q22, [%[b_1]]\n" + "ucvtf v8.8h, v8.8h\n" + "ldr q23, [%[s_0]]\n" + "ucvtf v9.8h, v9.8h\n" + "ldr q24, [%[s_1]]\n" + "ucvtf v10.8h, v10.8h\n" + "ucvtf v11.8h, v11.8h\n" + "mov v1.16b, v21.16b\n" + "ucvtf v12.8h, v12.8h\n" + "mov v2.16b, v22.16b\n" + "fmla v1.8h, v5.8h, v23.8h\n" + "fmla v2.8h, v6.8h, v24.8h\n" + "mov v3.16b, v21.16b\n" + "mov v4.16b, v22.16b\n" + "fmla v3.8h, v7.8h, v23.8h\n" + "fmla v4.8h, v8.8h, v24.8h\n" + "mov v5.16b, v21.16b\n" + "mov v6.16b, v22.16b\n" + + "fmla v5.8h, v9.8h, v23.8h\n" + "mov v7.16b, v21.16b\n" + "fmla v6.8h, v10.8h, v24.8h\n" + "mov v8.16b, v22.16b\n" + "fmla v7.8h, v11.8h, v23.8h\n" + "fmla v8.8h, v12.8h, v24.8h\n" + + "str q1, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q5, [%[out_0], #32]\n" // out_o0hw2 + "str q7, [%[out_0], #48]\n" // out_o0hw3 + + "str q2, [%[out_1]]\n" // out_o1hw0 + "str q4, [%[out_1], #16]\n" // out_o1hw1 + "str q6, [%[out_1], #32]\n" // out_o1hw2 + "str q8, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [fhfw] "r"((I64)fh * fw), [s_0] "r"(s_o0), + [s_1] "r"(s_o1), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v21", "v22", "v23", "v24", + "v29", "v30", "x0", "x1", "x2", "x3", "x4", "x9"); + s0 += 16; + s1 += 16; + b0 += 16; + b1 += 16; + } + } + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (U32 hw = ohow_s; hw < ohow; hw++) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ih_pad * iw_pad; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw1c8 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw1c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + BIN8 *in_0 = in_hw1c8 + in_h_0 * iw_pad + in_w_0; + BIN8 *in_order_hw1c8 = in_order + c * fh * fw + fh_idx * fw + fw_idx; + *in_order_hw1c8 = (*in_0); + } + } + } + // compute + for (U32 o = 0; o < oc; o += 2) { + BIN8 *in_hw0 = in_order; + const BIN8 *f_o = filterArray + o * 8 * fh * fw * ic; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + + uint16x8_t sum[2] = {0}; + uint8x8_t v1 = vdup_n_u8(1); + for (U32 i = 0; i < ic * 8; i += 32) { + uint8x8_t sub0[2] = {0}; + + for (U32 j = 0; j < 4; j++) { + uint8x8_t f_0 = vld1_u8(f_o); + uint8x8_t f_1 = vld1_u8(f_o + 8); + f_o += 16; + uint8x8_t in_1 = vdup_n_u8(*in_hw0); + in_hw0++; + f_0 = vand_u8(in_1, f_0); + f_1 = vand_u8(in_1, f_1); + f_0 = vcnt_u8(f_0); + f_1 = vcnt_u8(f_1); + sub0[0] = vadd_u8(sub0[0], f_0); + sub0[1] = vadd_u8(sub0[1], f_1); + } + sum[0] = vmlal_u8(sum[0], sub0[0], v1); + sum[1] = vmlal_u8(sum[1], sub0[1], v1); + + for (U32 j = 1; j < fh * fw; j += 8) { + uint8x8_t sub1[2] = {0}; + for (U32 k = 0; k < 32; k++) { + uint8x8_t f_0 = vld1_u8(f_o); + uint8x8_t f_1 = vld1_u8(f_o + 8); + f_o += 16; + uint8x8_t in_1 = vdup_n_u8(*in_hw0); + in_hw0++; + f_0 = vand_u8(in_1, f_0); + f_1 = vand_u8(in_1, f_1); + f_0 = vcnt_u8(f_0); + f_1 = vcnt_u8(f_1); + sub1[0] = vadd_u8(sub1[0], f_0); + sub1[1] = vadd_u8(sub1[1], f_1); + } + sum[0] = vmlal_u8(sum[0], sub1[0], v1); + sum[1] = 
vmlal_u8(sum[1], sub1[1], v1); + } + } + + float16x8_t res_o0 = vcvtq_f16_u16(sum[0]); + float16x8_t res_o1 = vcvtq_f16_u16(sum[1]); + float16x8_t scale_o0 = vld1q_f16(s0); + s0 += 16; + float16x8_t scale_o1 = vld1q_f16(s1); + s1 += 16; + float16x8_t bias_o0 = vld1q_f16(b0); + b0 += 16; + float16x8_t bias_o1 = vld1q_f16(b1); + b1 += 16; + res_o0 = vmulq_f16(res_o0, scale_o0); + res_o1 = vmulq_f16(res_o1, scale_o1); + res_o0 = vaddq_f16(res_o0, bias_o0); + res_o1 = vaddq_f16(res_o1, bias_o1); + vst1q_f16(out_o0hw0, res_o0); + vst1q_f16(out_o1hw0, res_o1); + } + } + } + return SUCCESS; +} +#endif diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_transform_bnn.h b/compute/tensor/src/cpu/arm/bnn/convolution_transform_bnn.h new file mode 100644 index 00000000..c30d5ffd --- /dev/null +++ b/compute/tensor/src/cpu/arm/bnn/convolution_transform_bnn.h @@ -0,0 +1,92 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
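Both DoReFa kernels binarize FP16 activations on the fly with threshold 0.5 and pack eight consecutive channel values into one BIN8 byte, most significant bit first; the XNOR kernels below do the same with threshold 0. A standalone sketch of that packing step, assuming an AArch64 toolchain where __fp16 is available:

#include <cstdint>

// Pack 8 FP16 channel values into one byte, MSB first, mirroring the
// "temp |= (1 << (7 - j))" loops above; threshold is 0.5 for DoReFa
// (DT_BIN01) and 0 for XNOR (DT_BIN11).
static inline uint8_t pack_bin8(const __fp16 *in, float threshold)
{
    uint8_t b = 0;
    for (int j = 0; j < 8; j++) {
        if ((float)in[j] >= threshold) {
            b |= (uint8_t)(1u << (7 - j));
        }
    }
    return b;
}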
+ +#ifndef _H_CONVOLUTION_TRANSFORM_BNN +#define _H_CONVOLUTION_TRANSFORM_BNN + +#ifdef _USE_FP16 +#include <bitset> +#include <string.h> + +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "tensor_computing.h" + +inline void bitwise_copy(BIN8 srcVal, U32 srcBit, BIN8 *dest, U32 destBit) +{ + std::bitset<8> Src(srcVal); + if (Src.test(srcBit)) { + *dest |= (1 << destBit); // Set bit + } else { + *dest &= ~(1 << destBit); // Clear bit + } +} + +inline EE convolution_transform_filter_bnn( + TensorDesc filterDesc, const BIN8 *filterArray, TensorDesc *ftmDesc, BIN8 *ftmArray) +{ + /* + * NCHW => (N/16)*(C/8)*(H*W)*n16*c8 + */ + if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) { + CHECK_STATUS(NULL_POINTER); + } + + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + switch (fdf) { + case DF_NCHWN16C8: + // Everything is ready + memcpy(ftmArray, filterArray, fn * fc * fh * fw / 8 * bytesOf(fdt)); + break; + case DF_NCHW: { + /* + * NCHW => NCHWN16C8 + * Now assume fn is divisible by 32 + */ + U32 oc = fn / 16; + U32 ic = fc / 8; + for (U32 o = 0; o < oc; o++) { + for (U32 c = 0; c < ic; c++) { + for (U32 hw = 0; hw < fh * fw; hw++) { + for (U32 o16 = 0; o16 < 16; o16++) { + for (U32 c8 = 0; c8 < 8; c8++) { + U32 ftmBitPos = o * fh * fw * ic * 128 + c * fh * fw * 128 + + hw * 128 + o16 * 8 + c8; + U32 ftmSlot = ftmBitPos / 8; + U32 ftmBitNo = 7 - (ftmBitPos % 8); + + U32 filterBitPos = + (o * 16 + o16) * ic * 8 * fh * fw + (c * 8 + c8) * fh * fw + hw; + U32 filterSlot = filterBitPos / 8; + U32 filterBitNo = 7 - (filterBitPos % 8); + bitwise_copy(filterArray[filterSlot], filterBitNo, + ftmArray + ftmSlot, ftmBitNo); + } + } + } + } + } + break; + } + default: + return NOT_MATCH; + } + *ftmDesc = tensor4df(fdt, DF_NCHWN16C8, fn, fc, fh, fw); + return SUCCESS; +} +#endif +#endif diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_xnor.h b/compute/tensor/src/cpu/arm/bnn/convolution_xnor.h new file mode 100644 index 00000000..a78684cd --- /dev/null +++ b/compute/tensor/src/cpu/arm/bnn/convolution_xnor.h @@ -0,0 +1,86 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
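The transform above packs 16 output channels by 8 input channels, i.e. 128 bits, per spatial position of each (o, c) block pair. A tiny standalone check of that index arithmetic for a hypothetical 1x1 filter with a single c-block (not Bolt code, just the same formulas re-evaluated), which can be handy when porting the transform:

#include <cassert>
#include <cstdint>

int main()
{
    // fh = fw = 1, ic = 1 (ic already divided by 8); pick o = 1, o16 = 3, c8 = 5.
    const uint32_t fh = 1, fw = 1, ic = 1, o = 1, c = 0, hw = 0, o16 = 3, c8 = 5;
    uint32_t ftmBitPos = o * fh * fw * ic * 128 + c * fh * fw * 128 + hw * 128 + o16 * 8 + c8;
    assert(ftmBitPos == 157);  // lands in byte 19, bit number 7 - (157 % 8) = 2
    assert(ftmBitPos / 8 == 19 && 7 - (ftmBitPos % 8) == 2);
    // With one c-block and a 1x1 filter the source position happens to coincide:
    uint32_t filterBitPos = (o * 16 + o16) * ic * 8 * fh * fw + (c * 8 + c8) * fh * fw + hw;
    assert(filterBitPos == 157);
    return 0;
}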
+ +#ifndef _H_CONVOLUTION_XNOR +#define _H_CONVOLUTION_XNOR + +#ifdef _USE_FP16 +#include +#include +#include "sys.h" +#include "types.h" +#include "error.h" + +EE convolution_xnor_A55(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scaleArray, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc); + +EE convolution_xnor_A76(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scaleArray, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc); + +inline EE convolution_xnor(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filter, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scale, + TensorDesc biasDesc, + const F16 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: + ret = convolution_xnor_A55(inputDesc, input, filterDesc, filter, convParamSpec, + scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); + break; + case ARM_A76: + ret = convolution_xnor_A76(inputDesc, input, filterDesc, filter, convParamSpec, + scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); + break; + default: + return NOT_SUPPORTED; + } + return ret; +} +#endif +#endif diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A55.cpp b/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A55.cpp new file mode 100644 index 00000000..fe5d6395 --- /dev/null +++ b/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A55.cpp @@ -0,0 +1,786 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
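The inline wrapper above routes a single call site to the scheduling-specific kernels, which differ mainly in load idioms (the A55 version splits 128-bit loads into ldr d / ldr x / ins pairs for dual issue, while A76 uses plain ldr q). A hedged usage sketch follows; run_bnn_layer and the descriptor shapes are hypothetical, the buffers are assumed to have been prepared as elsewhere in this patch, and the scale/bias descriptors are placeholders since the kernels mark them UNUSED:

// Illustrative only: assumes Bolt's headers are on the include path and that
// the filter (DF_NCHWN16C8), scale, bias and tmp buffers were prepared as in
// this patch (fn multiple of 32, fc multiple of 8, as the kernel comments require).
#include "cpu/arm/bnn/convolution_xnor.h"

EE run_bnn_layer(const F16 *in, const BIN8 *flt, const F16 *scale,
    const F16 *bias, U32 tmpBytes, void *tmp, F16 *out, Arch arch)
{
    TensorDesc inDesc = tensor4df(DT_F16, DF_NCHWC8, 1, 32, 16, 16);
    TensorDesc fltDesc = tensor4df(DT_BIN11, DF_NCHWN16C8, 32, 32, 3, 3);
    TensorDesc outDesc = tensor4df(DT_F16, DF_NCHWC8, 1, 32, 16, 16);
    ConvolutionParamSpec p;
    p.stride_h = p.stride_w = 1;
    p.padding_top = p.padding_bottom = p.padding_left = p.padding_right = 1;
    ActivationParamSpec act;
    act.mode = ACTIVATION_NULL;
    return convolution_xnor(inDesc, in, fltDesc, flt, p, inDesc, scale, inDesc,
        bias, tmpBytes, tmp, outDesc, out, act, arch);  // arch: ARM_A55 or ARM_A76
}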
+ +#ifdef _USE_FP16 +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" + +#include "cpu/arm/bnn/convolution_xnor.h" + +EE convolution_xnor_A55(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scaleArray, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(scaleDesc); + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(activationDesc); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if (fdf != DF_NCHWN16C8) { + CHECK_STATUS(NOT_MATCH); + } + if (!(ic == fc && oc == fn)) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + U32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + + BIN8 *inArray = ((BIN8 *)tmp) + ic * ihiw + 8 * fh * fw * ic; // ic has been divided by 8 + BIN8 *inArray_pad; + + for (U32 n = 0; n < in; n++) { + const F16 *in = input + n * ic * ih * iw * 8; + for (U32 i = 0; i < ic * ih * iw; i++) { + BIN8 temp = 0; + for (U32 j = 0; j < 8; j++) { + if (in[i * 8 + j] >= 0) { + temp |= (1 << (7 - j)); // set + } + } + inArray[i] = temp; + } + + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw; // ic has been divided by 8 + } else { + // copy input into a input with padding + inArray_pad = (BIN8 *)tmp; + BIN8 *inArray_pad_mov = inArray_pad; + BIN8 *inArray_mov = inArray + n * ic * ih * iw; + for (U32 c = 0; c < ic; c++) { // All divide by 8 + for (U32 h = 0; h < paddingT; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(DT_BIN11)); + inArray_pad_mov += iw_pad; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * bytesOf(DT_BIN11)); + inArray_pad_mov += paddingL; + memcpy(inArray_pad_mov, inArray_mov, iw * bytesOf(DT_BIN11)); + inArray_pad_mov += iw; + inArray_mov += iw; + memset(inArray_pad_mov, 0, paddingR * bytesOf(DT_BIN11)); + inArray_pad_mov += paddingR; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(DT_BIN11)); + inArray_pad_mov += iw_pad; + } + } + } + // ohow / 8 + short base_s = fh * fw * ic * 8; // For xnorNet, actual_sum = base_s - 2 * noOf1sFromXOR + short base_v[8]; // Assume the base can be represented as int16 + for (U32 i = 0; i < 8; i++) { + base_v[i] = base_s; + } + for (U32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw8c8 + im2col + U32 in_h[8]; + U32 in_w[8]; + for (U32 i = 0; i < 8; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + 
for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw8c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + // NHWChw8c8 + BIN8 *in_order_hw8c8 = in_order + c * fh * fw * 8 + fh_idx * fw * 8 + + fw_idx * 8; // This 8 comes from hw8 + for (U32 i = 0; i < 8; i++) { + in_order_hw8c8[i] = *(in_hw8c8 + in_h[i] * iw_pad + in_w[i]); + } + } + } + } + + // compute + for (U32 o = 0; o < oc; o += + 2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. + BIN8 *in_hw0 = in_order; + const BIN8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; // ic has been divided by 8 + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // scale and bias + const F16 *s_o0 = s0; + const F16 *s_o1 = s1; + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q4, [%[base]]\n" + "mov v5.16b, v4.16b\n" + "ldr d29, [%[in_0]]\n" // in_0 + "mov v6.16b, v4.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "mov v7.16b, v4.16b\n" + "ldr x2, [%[f_0], #8]\n" + "mov v8.16b, v4.16b\n" + "ins v0.d[1], x2\n" + "mov v9.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" // duplicate a full register + "mov v10.16b, v4.16b\n" + "dup v2.16b, v29.b[1]\n" + "mov v11.16b, v4.16b\n" + "mov v12.16b, v4.16b\n" + "mov v13.16b, v4.16b\n" + "mov v14.16b, v4.16b\n" + "mov v15.16b, v4.16b\n" + "mov v16.16b, v4.16b\n" + "mov v17.16b, v4.16b\n" + "mov v18.16b, v4.16b\n" + "mov v19.16b, v4.16b\n" + "mov v20.16b, v4.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + + "0:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "mov x9, %[fhfw]\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + "mov x4, #4\n" + + "1:\n" + "eor v3.16b, v1.16b, v0.16b\n" + "ldr d30, [x0, 16]!\n" // next filter + + "eor v4.16b, v2.16b, v0.16b\n" + "ldr x1, [x0, 8]\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter + "dup v2.16b, v29.b[3]\n" + + "add v22.16b, v22.16b, v4.16b\n" + "ins v30.d[1], x1\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[4]\n" + "add v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[5]\n" + "add v24.16b, v24.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[6]\n" + "add v25.16b, v25.16b, v3.16b\n" + "dup v2.16b, v29.b[7]\n" + "add v26.16b, v26.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "ldr d29, [x3, 8]!\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "mov v0.16b, v30.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "add v27.16b, v27.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "add v28.16b, v28.16b, v4.16b\n" + "bne 1b\n" + + "movi v3.16b, #2\n" + "umlsl v5.8h, v21.8b, v3.8b\n" + "umlsl v7.8h, v22.8b, v3.8b\n" + "umlsl v9.8h, v23.8b, v3.8b\n" + "umlsl v11.8h, v24.8b, v3.8b\n" + "umlsl v13.8h, v25.8b, v3.8b\n" + "umlsl v15.8h, v26.8b, v3.8b\n" + "umlsl v17.8h, v27.8b, v3.8b\n" + "umlsl v19.8h, v28.8b, v3.8b\n" + + "umlsl2 v6.8h, 
v21.16b, v3.16b\n" + "umlsl2 v8.8h, v22.16b, v3.16b\n" + "umlsl2 v10.8h, v23.16b, v3.16b\n" + "umlsl2 v12.8h, v24.16b, v3.16b\n" + "umlsl2 v14.8h, v25.16b, v3.16b\n" + "umlsl2 v16.8h, v26.16b, v3.16b\n" + "umlsl2 v18.8h, v27.16b, v3.16b\n" + "umlsl2 v20.8h, v28.16b, v3.16b\n" + + "subs x9, x9, #1\n" + "beq 4f\n" // 1x1, continue with the next 32 input channels + + "2:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + "mov x4, #32\n" // Assume 256 will not happen + "3:\n" + "eor v3.16b, v1.16b, v0.16b\n" + "ldr d30, [x0, 16]!\n" // next filter + + "eor v4.16b, v2.16b, v0.16b\n" + "ldr x1, [x0, 8]\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "uqadd v21.16b, v21.16b, v3.16b\n" + "dup v2.16b, v29.b[3]\n" + + "uqadd v22.16b, v22.16b, v4.16b\n" + "ins v30.d[1], x1\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[4]\n" + "uqadd v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[5]\n" + "uqadd v24.16b, v24.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[6]\n" + "uqadd v25.16b, v25.16b, v3.16b\n" + "dup v2.16b, v29.b[7]\n" + "uqadd v26.16b, v26.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "ldr d29, [x3, 8]!\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "mov v0.16b, v30.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "uqadd v27.16b, v27.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "uqadd v28.16b, v28.16b, v4.16b\n" + "bne 3b\n" + + "movi v3.16b, #2\n" // actual sum = base - 2 * noOf1s + "umlsl v5.8h, v21.8b, v3.8b\n" + "umlsl v7.8h, v22.8b, v3.8b\n" + "umlsl v9.8h, v23.8b, v3.8b\n" + "umlsl v11.8h, v24.8b, v3.8b\n" + "umlsl v13.8h, v25.8b, v3.8b\n" + "umlsl v15.8h, v26.8b, v3.8b\n" + "umlsl v17.8h, v27.8b, v3.8b\n" + "umlsl v19.8h, v28.8b, v3.8b\n" + + "umlsl2 v6.8h, v21.16b, v3.16b\n" + "umlsl2 v8.8h, v22.16b, v3.16b\n" + "umlsl2 v10.8h, v23.16b, v3.16b\n" + "umlsl2 v12.8h, v24.16b, v3.16b\n" + "umlsl2 v14.8h, v25.16b, v3.16b\n" + "umlsl2 v16.8h, v26.16b, v3.16b\n" + "umlsl2 v18.8h, v27.16b, v3.16b\n" + "umlsl2 v20.8h, v28.16b, v3.16b\n" + + "subs x9, x9, #8\n" + "bne 2b\n" + + "4:\n" // Wrap up computation for 32 input channels + "subs x2, x2, #32\n" + "bne 0b\n" + + // pipelined + "scvtf v5.8h, v5.8h\n" + "scvtf v6.8h, v6.8h\n" + "ldr q21, [%[b_0]]\n" + "scvtf v7.8h, v7.8h\n" + "ldr q22, [%[b_1]]\n" + "scvtf v8.8h, v8.8h\n" + "ldr q23, [%[s_0]]\n" + "scvtf v9.8h, v9.8h\n" + "ldr q24, [%[s_1]]\n" + "scvtf v10.8h, v10.8h\n" + "scvtf v11.8h, v11.8h\n" + "mov v1.16b, v21.16b\n" + "scvtf v12.8h, v12.8h\n" + "mov v2.16b, v22.16b\n" + "scvtf v13.8h, v13.8h\n" + "fmla v1.8h, v5.8h, v23.8h\n" + "scvtf v14.8h, v14.8h\n" + "fmla v2.8h, v6.8h, v24.8h\n" + "scvtf v15.8h, v15.8h\n" + "mov v3.16b, v21.16b\n" + "scvtf v16.8h, v16.8h\n" + "mov v4.16b, v22.16b\n" + "scvtf v17.8h, v17.8h\n" + "fmla v3.8h, v7.8h, v23.8h\n" + "scvtf v18.8h, v18.8h\n" + "fmla v4.8h, v8.8h, v24.8h\n" + "scvtf v19.8h, v19.8h\n" + "mov v5.16b, v21.16b\n" + "scvtf v20.8h, v20.8h\n" + "mov v6.16b, v22.16b\n" + + "fmla v5.8h, v9.8h, v23.8h\n" + "mov v7.16b, v21.16b\n" + "fmla v6.8h, v10.8h, v24.8h\n" + "mov 
v8.16b, v22.16b\n" + "fmla v7.8h, v11.8h, v23.8h\n" + "mov v9.16b, v21.16b\n" + "fmla v8.8h, v12.8h, v24.8h\n" + "mov v10.16b, v22.16b\n" + "fmla v9.8h, v13.8h, v23.8h\n" + "mov v11.16b, v21.16b\n" + "fmla v10.8h, v14.8h, v24.8h\n" + "mov v12.16b, v22.16b\n" + "fmla v11.8h, v15.8h, v23.8h\n" + "mov v13.16b, v21.16b\n" + "fmla v12.8h, v16.8h, v24.8h\n" + "mov v14.16b, v22.16b\n" + "fmla v13.8h, v17.8h, v23.8h\n" + "mov v15.16b, v21.16b\n" + "fmla v14.8h, v18.8h, v24.8h\n" + "mov v16.16b, v22.16b\n" + "fmla v15.8h, v19.8h, v23.8h\n" + "fmla v16.8h, v20.8h, v24.8h\n" + + "str q1, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q5, [%[out_0], #32]\n" // out_o0hw2 + "str q7, [%[out_0], #48]\n" // out_o0hw3 + "str q9, [%[out_0], #64]\n" // out_o0hw4 + "str q11, [%[out_0], #80]\n" // out_o0hw5 + "str q13, [%[out_0], #96]\n" // out_o0hw6 + "str q15, [%[out_0], #112]\n" // out_o0hw7 + + "str q2, [%[out_1]]\n" // out_o1hw0 + "str q4, [%[out_1], #16]\n" // out_o1hw1 + "str q6, [%[out_1], #32]\n" // out_o1hw2 + "str q8, [%[out_1], #48]\n" // out_o1hw3 + "str q10, [%[out_1], #64]\n" // out_o1hw4 + "str q12, [%[out_1], #80]\n" // out_o1hw5 + "str q14, [%[out_1], #96]\n" // out_o1hw6 + "str q16, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [fhfw] "r"((I64)fh * fw), [base] "r"(base_v), + [s_0] "r"(s_o0), [s_1] "r"(s_o1), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", + "x1", "x2", "x3", "x4", "x9"); + s0 += 16; + s1 += 16; + b0 += 16; + b1 += 16; + } + } + // ohow_remainder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + + for (U32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw4c8 + im2col + U32 in_h[4]; + U32 in_w[4]; + for (U32 i = 0; i < 4; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw4c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + // NHWChw8c8 + BIN8 *in_order_hw4c8 = + in_order + c * fh * fw * 4 + fh_idx * fw * 4 + fw_idx * 4; + for (U32 i = 0; i < 4; i++) { + in_order_hw4c8[i] = *(in_hw4c8 + in_h[i] * iw_pad + in_w[i]); + } + } + } + } + + // compute + for (U32 o = 0; o < oc; o += + 2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. 
+ BIN8 *in_hw0 = in_order; + const BIN8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; // ic has been divided by 8 + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // scale and bias + const F16 *s_o0 = s0; + const F16 *s_o1 = s1; + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q4, [%[base]]\n" + /* Layout + 5 6 + 7 8 + 9 10 + 11 12 + */ + "mov v5.16b, v4.16b\n" + "ldr s29, [%[in_0]]\n" // in_0 + "mov v6.16b, v4.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "mov v7.16b, v4.16b\n" + "ldr x2, [%[f_0], #8]\n" + "mov v8.16b, v4.16b\n" + "ins v0.d[1], x2\n" + "mov v9.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" // duplicate a full register + "mov v10.16b, v4.16b\n" + "dup v2.16b, v29.b[1]\n" + "mov v11.16b, v4.16b\n" + "mov v12.16b, v4.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + + "0:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "mov x9, %[fhfw]\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + + "mov x4, #4\n" + + "1:\n" + "eor v3.16b, v1.16b, v0.16b\n" + "ldr d30, [x0, 16]!\n" // next filter + + "eor v4.16b, v2.16b, v0.16b\n" + "ldr x1, [x0, 8]\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter + "dup v2.16b, v29.b[3]\n" + + "add v22.16b, v22.16b, v4.16b\n" + "ins v30.d[1], x1\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "ldr s29, [x3, 4]!\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "mov v0.16b, v30.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "add v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "add v24.16b, v24.16b, v4.16b\n" + "bne 1b\n" + + "movi v3.16b, #2\n" + "umlsl v5.8h, v21.8b, v3.8b\n" + "umlsl v7.8h, v22.8b, v3.8b\n" + "umlsl v9.8h, v23.8b, v3.8b\n" + "umlsl v11.8h, v24.8b, v3.8b\n" + + "umlsl2 v6.8h, v21.16b, v3.16b\n" + "umlsl2 v8.8h, v22.16b, v3.16b\n" + "umlsl2 v10.8h, v23.16b, v3.16b\n" + "umlsl2 v12.8h, v24.16b, v3.16b\n" + + "subs x9, x9, #1\n" + "beq 4f\n" // 1x1, continue with the next 32 input channels + + "2:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + + "mov x4, #32\n" // Assume 256 will not happen + "3:\n" + "eor v3.16b, v1.16b, v0.16b\n" + "ldr d30, [x0, 16]!\n" // next filter + + "eor v4.16b, v2.16b, v0.16b\n" + "ldr x1, [x0, 8]\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "uqadd v21.16b, v21.16b, v3.16b\n" + "dup v2.16b, v29.b[3]\n" + + "uqadd v22.16b, v22.16b, v4.16b\n" + "ins v30.d[1], x1\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "ldr s29, [x3, 4]!\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "mov v0.16b, v30.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "uqadd v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "uqadd v24.16b, v24.16b, v4.16b\n" + "bne 3b\n" + + "movi v3.16b, #2\n" + "umlsl v5.8h, v21.8b, v3.8b\n" + "umlsl v7.8h, v22.8b, v3.8b\n" + "umlsl v9.8h, v23.8b, v3.8b\n" + "umlsl v11.8h, v24.8b, v3.8b\n" + + "umlsl2 v6.8h, v21.16b, v3.16b\n" + "umlsl2 v8.8h, v22.16b, v3.16b\n" + "umlsl2 v10.8h, v23.16b, v3.16b\n" + "umlsl2 v12.8h, v24.16b, v3.16b\n" + + "subs x9, x9, #8\n" + "bne 2b\n" + + "4:\n" // Wrap up computation for 32 input channels + 
"subs x2, x2, #32\n" + "bne 0b\n" + + // pipelined + "scvtf v5.8h, v5.8h\n" + "scvtf v6.8h, v6.8h\n" + "ldr q21, [%[b_0]]\n" + "scvtf v7.8h, v7.8h\n" + "ldr q22, [%[b_1]]\n" + "scvtf v8.8h, v8.8h\n" + "ldr q23, [%[s_0]]\n" + "scvtf v9.8h, v9.8h\n" + "ldr q24, [%[s_1]]\n" + "scvtf v10.8h, v10.8h\n" + "scvtf v11.8h, v11.8h\n" + "mov v1.16b, v21.16b\n" + "scvtf v12.8h, v12.8h\n" + "mov v2.16b, v22.16b\n" + "fmla v1.8h, v5.8h, v23.8h\n" + "fmla v2.8h, v6.8h, v24.8h\n" + "mov v3.16b, v21.16b\n" + "mov v4.16b, v22.16b\n" + "fmla v3.8h, v7.8h, v23.8h\n" + "fmla v4.8h, v8.8h, v24.8h\n" + "mov v5.16b, v21.16b\n" + "mov v6.16b, v22.16b\n" + + "fmla v5.8h, v9.8h, v23.8h\n" + "mov v7.16b, v21.16b\n" + "fmla v6.8h, v10.8h, v24.8h\n" + "mov v8.16b, v22.16b\n" + "fmla v7.8h, v11.8h, v23.8h\n" + "fmla v8.8h, v12.8h, v24.8h\n" + + "str q1, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q5, [%[out_0], #32]\n" // out_o0hw2 + "str q7, [%[out_0], #48]\n" // out_o0hw3 + + "str q2, [%[out_1]]\n" // out_o1hw0 + "str q4, [%[out_1], #16]\n" // out_o1hw1 + "str q6, [%[out_1], #32]\n" // out_o1hw2 + "str q8, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [fhfw] "r"((I64)fh * fw), [base] "r"(base_v), + [s_0] "r"(s_o0), [s_1] "r"(s_o1), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v21", "v22", "v23", "v24", + "v29", "v30", "x0", "x1", "x2", "x3", "x4", "x9"); + s0 += 16; + s1 += 16; + b0 += 16; + b1 += 16; + } + } + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (U32 hw = ohow_s; hw < ohow; hw++) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw1c8 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw1c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + BIN8 *in_0 = in_hw1c8 + in_h_0 * iw_pad + in_w_0; + BIN8 *in_order_hw1c8 = in_order + c * fh * fw + fh_idx * fw + fw_idx; + *in_order_hw1c8 = (*in_0); + } + } + } + // compute + for (U32 o = 0; o < oc; o += 2) { + BIN8 *in_hw0 = in_order; + const BIN8 *f_o = filterArray + o * 8 * fh * fw * ic; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + + uint16x8_t sum[2] = {0}; + uint8x8_t v2 = vdup_n_u8(2); + for (U32 i = 0; i < ic * 8; i += 32) { + uint8x8_t sub0[2] = {0}; + + for (U32 j = 0; j < 4; j++) { + uint8x8_t f_0 = vld1_u8(f_o); + uint8x8_t f_1 = vld1_u8(f_o + 8); + f_o += 16; + uint8x8_t in_1 = vdup_n_u8(*in_hw0); + in_hw0++; + f_0 = veor_u8(in_1, f_0); + f_1 = veor_u8(in_1, f_1); + f_0 = vcnt_u8(f_0); + f_1 = vcnt_u8(f_1); + sub0[0] = vadd_u8(sub0[0], f_0); + sub0[1] = vadd_u8(sub0[1], f_1); + } + sum[0] = vmlal_u8(sum[0], sub0[0], v2); + sum[1] = vmlal_u8(sum[1], sub0[1], v2); + + for (U32 j = 1; j < fh * fw; j += 8) { + uint8x8_t sub1[2] = {0}; + for (U32 k = 0; k < 32; k++) { + uint8x8_t f_0 = vld1_u8(f_o); + uint8x8_t f_1 = vld1_u8(f_o + 8); + f_o += 16; + uint8x8_t in_1 = vdup_n_u8(*in_hw0); + in_hw0++; + f_0 = veor_u8(in_1, f_0); + f_1 = veor_u8(in_1, f_1); + f_0 = 
vcnt_u8(f_0); + f_1 = vcnt_u8(f_1); + sub1[0] = vadd_u8(sub1[0], f_0); + sub1[1] = vadd_u8(sub1[1], f_1); + } + sum[0] = vmlal_u8(sum[0], sub1[0], v2); + sum[1] = vmlal_u8(sum[1], sub1[1], v2); + } + } + short temp[16]; + vst1q_u16((uint16_t *)temp, sum[0]); + vst1q_u16((uint16_t *)(temp + 8), sum[1]); + int16x8_t base_abs = vdupq_n_s16(base_s); + int16x8_t ssum[2]; + ssum[0] = vld1q_s16(temp); + ssum[1] = vld1q_s16(temp + 8); + ssum[0] = vsubq_s16(base_abs, ssum[0]); + ssum[1] = vsubq_s16(base_abs, ssum[1]); + + float16x8_t res_o0 = vcvtq_f16_s16(ssum[0]); + float16x8_t res_o1 = vcvtq_f16_s16(ssum[1]); + float16x8_t scale_o0 = vld1q_f16(s0); + s0 += 16; + float16x8_t scale_o1 = vld1q_f16(s1); + s1 += 16; + float16x8_t bias_o0 = vld1q_f16(b0); + b0 += 16; + float16x8_t bias_o1 = vld1q_f16(b1); + b1 += 16; + res_o0 = vmulq_f16(res_o0, scale_o0); + res_o1 = vmulq_f16(res_o1, scale_o1); + res_o0 = vaddq_f16(res_o0, bias_o0); + res_o1 = vaddq_f16(res_o1, bias_o1); + vst1q_f16(out_o0hw0, res_o0); + vst1q_f16(out_o1hw0, res_o1); + } + } + } + return SUCCESS; +} +#endif diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A76.cpp b/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A76.cpp new file mode 100644 index 00000000..163444a4 --- /dev/null +++ b/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A76.cpp @@ -0,0 +1,774 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
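All of the kernels in this file lean on one bitwise identity, spelled out in the comments below as actual_sum = base_s - 2 * noOf1sFromXOR: when activations and weights are binarized to ±1 and packed one value per bit, a K-element dot product collapses to K minus twice the number of disagreeing bit positions. A minimal scalar sketch of the identity (standalone illustration; the function name is ours, not part of this file):

    #include <cstdint>

    // Values in {-1, +1} are stored one per bit (1 -> +1, 0 -> -1). XOR marks
    // exactly the positions where the two signs disagree, so over K packed
    // values: sum(a_i * w_i) = (K - disagree) - disagree = K - 2 * disagree.
    int xnor_dot(const uint8_t *a, const uint8_t *w, int bytes)
    {
        int disagree = 0;
        for (int i = 0; i < bytes; i++) {
            disagree += __builtin_popcount(a[i] ^ w[i]);  // cnt/vcnt in the kernels
        }
        return bytes * 8 - 2 * disagree;
    }

The vector code amortizes this by accumulating popcounts across 16 byte lanes at a time and applying the base - 2 * count correction once per block with umlsl/umlsl2 against accumulators preloaded with base_v.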
+ +#ifdef _USE_FP16 +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" + +#include "cpu/arm/bnn/convolution_xnor.h" + +EE convolution_xnor_A76(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scaleArray, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(scaleDesc); + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(activationDesc); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if (fdf != DF_NCHWN16C8) { + CHECK_STATUS(NOT_MATCH); + } + if (!(ic == fc && oc == fn)) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + U32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + + BIN8 *inArray = ((BIN8 *)tmp) + ic * ihiw + 8 * fh * fw * ic; // ic has been divided by 8 + BIN8 *inArray_pad; + + for (U32 n = 0; n < in; n++) { + const F16 *in = input + n * ic * ih * iw * 8; + for (U32 i = 0; i < ic * ih * iw; i++) { + BIN8 temp = 0; + for (U32 j = 0; j < 8; j++) { + if (in[i * 8 + j] >= 0) { + temp |= (1 << (7 - j)); // set + } + } + inArray[i] = temp; + } + + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw; // ic has been divided by 8 + } else { + // copy input into a input with padding + inArray_pad = (BIN8 *)tmp; + BIN8 *inArray_pad_mov = inArray_pad; + BIN8 *inArray_mov = inArray + n * ic * ih * iw; + for (U32 c = 0; c < ic; c++) { // All divide by 8 + for (U32 h = 0; h < paddingT; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(DT_BIN11)); + inArray_pad_mov += iw_pad; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * bytesOf(DT_BIN11)); + inArray_pad_mov += paddingL; + memcpy(inArray_pad_mov, inArray_mov, iw * bytesOf(DT_BIN11)); + inArray_pad_mov += iw; + inArray_mov += iw; + memset(inArray_pad_mov, 0, paddingR * bytesOf(DT_BIN11)); + inArray_pad_mov += paddingR; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(DT_BIN11)); + inArray_pad_mov += iw_pad; + } + } + } + // ohow / 8 + short base_s = fh * fw * ic * 8; // For xnorNet, actual_sum = base_s - 2 * noOf1sFromXOR + short base_v[8]; // Assume the base can be represented as int16 + for (U32 i = 0; i < 8; i++) { + base_v[i] = base_s; + } + for (U32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw8c8 + im2col + U32 in_h[8]; + U32 in_w[8]; + for (U32 i = 0; i < 8; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + 
for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw8c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + // NHWChw8c8 + BIN8 *in_order_hw8c8 = in_order + c * fh * fw * 8 + fh_idx * fw * 8 + + fw_idx * 8; // This 8 comes from hw8 + for (U32 i = 0; i < 8; i++) { + in_order_hw8c8[i] = *(in_hw8c8 + in_h[i] * iw_pad + in_w[i]); + } + } + } + } + + // compute + for (U32 o = 0; o < oc; o += + 2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. + BIN8 *in_hw0 = in_order; + const BIN8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; // ic has been divided by 8 + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // scale and bias + const F16 *s_o0 = s0; + const F16 *s_o1 = s1; + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q4, [%[base]]\n" + "ldr q0, [%[f_0]]\n" // f_0 + "ldr d29, [%[in_0]]\n" // in_0 + /* Layout + 5 6 + 7 8 + 9 10 + 11 12 + + 13 14 + 15 16 + 17 18 + 19 20 + */ + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "mov v8.16b, v4.16b\n" + "mov v9.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" // duplicate a full register + "mov v10.16b, v4.16b\n" + "dup v2.16b, v29.b[1]\n" + "mov v11.16b, v4.16b\n" + "mov v12.16b, v4.16b\n" + "mov v13.16b, v4.16b\n" + "mov v14.16b, v4.16b\n" + "mov v15.16b, v4.16b\n" + "mov v16.16b, v4.16b\n" + "mov v17.16b, v4.16b\n" + "mov v18.16b, v4.16b\n" + "mov v19.16b, v4.16b\n" + "mov v20.16b, v4.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + + "0:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "mov x9, %[fhfw]\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + "mov x4, #4\n" + + "1:\n" + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter + "dup v2.16b, v29.b[3]\n" + + "add v22.16b, v22.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[4]\n" + "add v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[5]\n" + "add v24.16b, v24.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[6]\n" + "add v25.16b, v25.16b, v3.16b\n" + "dup v2.16b, v29.b[7]\n" + "add v26.16b, v26.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "ldr d29, [x3, 8]!\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "ldr q0, [x0, 16]!\n" // next filter + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "add v27.16b, v27.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "add v28.16b, v28.16b, v4.16b\n" + "bne 1b\n" + + "movi v3.16b, #2\n" + "umlsl v5.8h, v21.8b, v3.8b\n" + "umlsl v7.8h, v22.8b, v3.8b\n" + "umlsl v9.8h, v23.8b, v3.8b\n" + "umlsl v11.8h, v24.8b, v3.8b\n" + "umlsl v13.8h, v25.8b, v3.8b\n" + "umlsl v15.8h, v26.8b, v3.8b\n" + "umlsl v17.8h, v27.8b, v3.8b\n" + "umlsl v19.8h, v28.8b, v3.8b\n" + + "umlsl2 v6.8h, v21.16b, v3.16b\n" + "umlsl2 v8.8h, v22.16b, 
v3.16b\n" + "umlsl2 v10.8h, v23.16b, v3.16b\n" + "umlsl2 v12.8h, v24.16b, v3.16b\n" + "umlsl2 v14.8h, v25.16b, v3.16b\n" + "umlsl2 v16.8h, v26.16b, v3.16b\n" + "umlsl2 v18.8h, v27.16b, v3.16b\n" + "umlsl2 v20.8h, v28.16b, v3.16b\n" + + "subs x9, x9, #1\n" + "beq 4f\n" // 1x1, continue with the next 32 input channels + + "2:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + "mov x4, #32\n" // Assume 256 will not happen + "3:\n" + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "uqadd v21.16b, v21.16b, v3.16b\n" + "dup v2.16b, v29.b[3]\n" + + "uqadd v22.16b, v22.16b, v4.16b\n" + "subs x4, x4, #1\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[4]\n" + "uqadd v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[5]\n" + "uqadd v24.16b, v24.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[6]\n" + "uqadd v25.16b, v25.16b, v3.16b\n" + "dup v2.16b, v29.b[7]\n" + "uqadd v26.16b, v26.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "ldr d29, [x3, 8]!\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "ldr q0, [x0, 16]!\n" // next filter + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "uqadd v27.16b, v27.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "uqadd v28.16b, v28.16b, v4.16b\n" + "bne 3b\n" + + "movi v3.16b, #2\n" // actual sum = base - 2 * noOf1s + "umlsl v5.8h, v21.8b, v3.8b\n" + "umlsl v7.8h, v22.8b, v3.8b\n" + "umlsl v9.8h, v23.8b, v3.8b\n" + "umlsl v11.8h, v24.8b, v3.8b\n" + "umlsl v13.8h, v25.8b, v3.8b\n" + "umlsl v15.8h, v26.8b, v3.8b\n" + "umlsl v17.8h, v27.8b, v3.8b\n" + "umlsl v19.8h, v28.8b, v3.8b\n" + + "umlsl2 v6.8h, v21.16b, v3.16b\n" + "umlsl2 v8.8h, v22.16b, v3.16b\n" + "umlsl2 v10.8h, v23.16b, v3.16b\n" + "umlsl2 v12.8h, v24.16b, v3.16b\n" + "umlsl2 v14.8h, v25.16b, v3.16b\n" + "umlsl2 v16.8h, v26.16b, v3.16b\n" + "umlsl2 v18.8h, v27.16b, v3.16b\n" + "umlsl2 v20.8h, v28.16b, v3.16b\n" + + "subs x9, x9, #8\n" + "bne 2b\n" + + "4:\n" // Wrap up computation for 32 input channels + "subs x2, x2, #32\n" + "bne 0b\n" + + // pipelined + "scvtf v5.8h, v5.8h\n" + "scvtf v6.8h, v6.8h\n" + "ldr q21, [%[b_0]]\n" + "ldr q22, [%[b_1]]\n" + "scvtf v7.8h, v7.8h\n" + "scvtf v8.8h, v8.8h\n" + "ldr q23, [%[s_0]]\n" + "ldr q24, [%[s_1]]\n" + "scvtf v9.8h, v9.8h\n" + "scvtf v10.8h, v10.8h\n" + "scvtf v11.8h, v11.8h\n" + "scvtf v12.8h, v12.8h\n" + "mov v1.16b, v21.16b\n" + "mov v2.16b, v22.16b\n" + "scvtf v13.8h, v13.8h\n" + "scvtf v14.8h, v14.8h\n" + "fmla v1.8h, v5.8h, v23.8h\n" + "fmla v2.8h, v6.8h, v24.8h\n" + "scvtf v15.8h, v15.8h\n" + "scvtf v16.8h, v16.8h\n" + "mov v3.16b, v21.16b\n" + "mov v4.16b, v22.16b\n" + "scvtf v17.8h, v17.8h\n" + "scvtf v18.8h, v18.8h\n" + "fmla v3.8h, v7.8h, v23.8h\n" + "fmla v4.8h, v8.8h, v24.8h\n" + "scvtf v19.8h, v19.8h\n" + "scvtf v20.8h, v20.8h\n" + "mov v5.16b, v21.16b\n" + "mov v6.16b, v22.16b\n" + + "fmla v5.8h, v9.8h, v23.8h\n" + "mov v7.16b, v21.16b\n" + "fmla v6.8h, v10.8h, v24.8h\n" + "mov v8.16b, v22.16b\n" + "fmla v7.8h, v11.8h, v23.8h\n" + "mov v9.16b, v21.16b\n" + "fmla v8.8h, v12.8h, v24.8h\n" + "mov 
v10.16b, v22.16b\n" + "fmla v9.8h, v13.8h, v23.8h\n" + "mov v11.16b, v21.16b\n" + "fmla v10.8h, v14.8h, v24.8h\n" + "mov v12.16b, v22.16b\n" + "fmla v11.8h, v15.8h, v23.8h\n" + "mov v13.16b, v21.16b\n" + "fmla v12.8h, v16.8h, v24.8h\n" + "mov v14.16b, v22.16b\n" + "fmla v13.8h, v17.8h, v23.8h\n" + "mov v15.16b, v21.16b\n" + "fmla v14.8h, v18.8h, v24.8h\n" + "mov v16.16b, v22.16b\n" + "fmla v15.8h, v19.8h, v23.8h\n" + "fmla v16.8h, v20.8h, v24.8h\n" + + "str q1, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q5, [%[out_0], #32]\n" // out_o0hw2 + "str q7, [%[out_0], #48]\n" // out_o0hw3 + "str q9, [%[out_0], #64]\n" // out_o0hw4 + "str q11, [%[out_0], #80]\n" // out_o0hw5 + "str q13, [%[out_0], #96]\n" // out_o0hw6 + "str q15, [%[out_0], #112]\n" // out_o0hw7 + + "str q2, [%[out_1]]\n" // out_o1hw0 + "str q4, [%[out_1], #16]\n" // out_o1hw1 + "str q6, [%[out_1], #32]\n" // out_o1hw2 + "str q8, [%[out_1], #48]\n" // out_o1hw3 + "str q10, [%[out_1], #64]\n" // out_o1hw4 + "str q12, [%[out_1], #80]\n" // out_o1hw5 + "str q14, [%[out_1], #96]\n" // out_o1hw6 + "str q16, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [fhfw] "r"((I64)fh * fw), [base] "r"(base_v), + [s_0] "r"(s_o0), [s_1] "r"(s_o1), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x0", "x2", "x3", + "x4", "x9"); + s0 += 16; + s1 += 16; + b0 += 16; + b1 += 16; + } + } + // ohow_remainder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + + for (U32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw4c8 + im2col + U32 in_h[4]; + U32 in_w[4]; + for (U32 i = 0; i < 4; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw4c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + // NHWChw8c8 + BIN8 *in_order_hw4c8 = + in_order + c * fh * fw * 4 + fh_idx * fw * 4 + fw_idx * 4; + for (U32 i = 0; i < 4; i++) { + in_order_hw4c8[i] = *(in_hw4c8 + in_h[i] * iw_pad + in_w[i]); + } + } + } + } + + // compute + for (U32 o = 0; o < oc; o += + 2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. 
+ BIN8 *in_hw0 = in_order; + const BIN8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; // ic has been divided by 8 + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // scale and bias + const F16 *s_o0 = s0; + const F16 *s_o1 = s1; + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q4, [%[base]]\n" + "ldr q0, [%[f_0]]\n" // f_0 + "ldr s29, [%[in_0]]\n" // in_0 + /* Layout + 5 6 + 7 8 + 9 10 + 11 12 + */ + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "mov v8.16b, v4.16b\n" + "mov v9.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" // duplicate a full register + "mov v10.16b, v4.16b\n" + "dup v2.16b, v29.b[1]\n" + "mov v11.16b, v4.16b\n" + "mov v12.16b, v4.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + + "0:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "mov x9, %[fhfw]\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + + "mov x4, #4\n" + + "1:\n" + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter + "dup v2.16b, v29.b[3]\n" + + "add v22.16b, v22.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "ldr s29, [x3, 4]!\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "ldr q0, [x0, 16]!\n" // next filter + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "add v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "add v24.16b, v24.16b, v4.16b\n" + "bne 1b\n" + + "movi v3.16b, #2\n" + "umlsl v5.8h, v21.8b, v3.8b\n" + "umlsl v7.8h, v22.8b, v3.8b\n" + "umlsl v9.8h, v23.8b, v3.8b\n" + "umlsl v11.8h, v24.8b, v3.8b\n" + + "umlsl2 v6.8h, v21.16b, v3.16b\n" + "umlsl2 v8.8h, v22.16b, v3.16b\n" + "umlsl2 v10.8h, v23.16b, v3.16b\n" + "umlsl2 v12.8h, v24.16b, v3.16b\n" + + "subs x9, x9, #1\n" + "beq 4f\n" // 1x1, continue with the next 32 input channels + + "2:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + + "mov x4, #32\n" // Assume 256 will not happen + "3:\n" + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "uqadd v21.16b, v21.16b, v3.16b\n" + "dup v2.16b, v29.b[3]\n" + + "uqadd v22.16b, v22.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "ldr s29, [x3, 4]!\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "ldr q0, [x0, 16]!\n" // next filter + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "uqadd v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "uqadd v24.16b, v24.16b, v4.16b\n" + "bne 3b\n" + + "movi v3.16b, #2\n" + "umlsl v5.8h, v21.8b, v3.8b\n" + "umlsl v7.8h, v22.8b, v3.8b\n" + "umlsl v9.8h, v23.8b, v3.8b\n" + "umlsl v11.8h, v24.8b, v3.8b\n" + + "umlsl2 v6.8h, v21.16b, v3.16b\n" + "umlsl2 v8.8h, v22.16b, v3.16b\n" + "umlsl2 v10.8h, v23.16b, v3.16b\n" + "umlsl2 v12.8h, v24.16b, v3.16b\n" + + "subs x9, x9, #8\n" + "bne 2b\n" + + "4:\n" // Wrap up computation for 32 input channels + "subs x2, x2, #32\n" + "bne 0b\n" + + // pipelined + "scvtf v5.8h, v5.8h\n" + "scvtf v6.8h, v6.8h\n" + "ldr q21, [%[b_0]]\n" + "scvtf v7.8h, v7.8h\n" + "ldr q22, [%[b_1]]\n" + "scvtf v8.8h, v8.8h\n" 
+ "ldr q23, [%[s_0]]\n" + "scvtf v9.8h, v9.8h\n" + "ldr q24, [%[s_1]]\n" + "scvtf v10.8h, v10.8h\n" + "scvtf v11.8h, v11.8h\n" + "mov v1.16b, v21.16b\n" + "scvtf v12.8h, v12.8h\n" + "mov v2.16b, v22.16b\n" + "fmla v1.8h, v5.8h, v23.8h\n" + "fmla v2.8h, v6.8h, v24.8h\n" + "mov v3.16b, v21.16b\n" + "mov v4.16b, v22.16b\n" + "fmla v3.8h, v7.8h, v23.8h\n" + "fmla v4.8h, v8.8h, v24.8h\n" + "mov v5.16b, v21.16b\n" + "mov v6.16b, v22.16b\n" + + "fmla v5.8h, v9.8h, v23.8h\n" + "mov v7.16b, v21.16b\n" + "fmla v6.8h, v10.8h, v24.8h\n" + "mov v8.16b, v22.16b\n" + "fmla v7.8h, v11.8h, v23.8h\n" + "fmla v8.8h, v12.8h, v24.8h\n" + + "str q1, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q5, [%[out_0], #32]\n" // out_o0hw2 + "str q7, [%[out_0], #48]\n" // out_o0hw3 + + "str q2, [%[out_1]]\n" // out_o1hw0 + "str q4, [%[out_1], #16]\n" // out_o1hw1 + "str q6, [%[out_1], #32]\n" // out_o1hw2 + "str q8, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [fhfw] "r"((I64)fh * fw), [base] "r"(base_v), + [s_0] "r"(s_o0), [s_1] "r"(s_o1), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v21", "v22", "v23", "v24", + "v29", "v30", "x0", "x1", "x2", "x3", "x4", "x9"); + s0 += 16; + s1 += 16; + b0 += 16; + b1 += 16; + } + } + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (U32 hw = ohow_s; hw < ohow; hw++) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw1c8 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw1c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + BIN8 *in_0 = in_hw1c8 + in_h_0 * iw_pad + in_w_0; + BIN8 *in_order_hw1c8 = in_order + c * fh * fw + fh_idx * fw + fw_idx; + *in_order_hw1c8 = (*in_0); + } + } + } + // compute + for (U32 o = 0; o < oc; o += 2) { + BIN8 *in_hw0 = in_order; + const BIN8 *f_o = filterArray + o * 8 * fh * fw * ic; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + + uint16x8_t sum[2] = {0}; + uint8x8_t v2 = vdup_n_u8(2); + for (U32 i = 0; i < ic * 8; i += 32) { + uint8x8_t sub0[2] = {0}; + + for (U32 j = 0; j < 4; j++) { + uint8x8_t f_0 = vld1_u8(f_o); + uint8x8_t f_1 = vld1_u8(f_o + 8); + f_o += 16; + uint8x8_t in_1 = vdup_n_u8(*in_hw0); + in_hw0++; + f_0 = veor_u8(in_1, f_0); + f_1 = veor_u8(in_1, f_1); + f_0 = vcnt_u8(f_0); + f_1 = vcnt_u8(f_1); + sub0[0] = vadd_u8(sub0[0], f_0); + sub0[1] = vadd_u8(sub0[1], f_1); + } + sum[0] = vmlal_u8(sum[0], sub0[0], v2); + sum[1] = vmlal_u8(sum[1], sub0[1], v2); + + for (U32 j = 1; j < fh * fw; j += 8) { + uint8x8_t sub1[2] = {0}; + for (U32 k = 0; k < 32; k++) { + uint8x8_t f_0 = vld1_u8(f_o); + uint8x8_t f_1 = vld1_u8(f_o + 8); + f_o += 16; + uint8x8_t in_1 = vdup_n_u8(*in_hw0); + in_hw0++; + f_0 = veor_u8(in_1, f_0); + f_1 = veor_u8(in_1, f_1); + f_0 = vcnt_u8(f_0); + f_1 = vcnt_u8(f_1); + sub1[0] = vadd_u8(sub1[0], f_0); + sub1[1] = vadd_u8(sub1[1], f_1); + } + sum[0] = vmlal_u8(sum[0], sub1[0], v2); + sum[1] = vmlal_u8(sum[1], sub1[1], v2); + } + } + 
short temp[16]; + vst1q_u16((uint16_t *)temp, sum[0]); + vst1q_u16((uint16_t *)(temp + 8), sum[1]); + int16x8_t base_abs = vdupq_n_s16(base_s); + int16x8_t ssum[2]; + ssum[0] = vld1q_s16(temp); + ssum[1] = vld1q_s16(temp + 8); + ssum[0] = vsubq_s16(base_abs, ssum[0]); + ssum[1] = vsubq_s16(base_abs, ssum[1]); + + float16x8_t res_o0 = vcvtq_f16_s16(ssum[0]); + float16x8_t res_o1 = vcvtq_f16_s16(ssum[1]); + float16x8_t scale_o0 = vld1q_f16(s0); + s0 += 16; + float16x8_t scale_o1 = vld1q_f16(s1); + s1 += 16; + float16x8_t bias_o0 = vld1q_f16(b0); + b0 += 16; + float16x8_t bias_o1 = vld1q_f16(b1); + b1 += 16; + res_o0 = vmulq_f16(res_o0, scale_o0); + res_o1 = vmulq_f16(res_o1, scale_o1); + res_o0 = vaddq_f16(res_o0, bias_o0); + res_o1 = vaddq_f16(res_o1, bias_o1); + vst1q_f16(out_o0hw0, res_o0); + vst1q_f16(out_o1hw0, res_o1); + } + } + } + return SUCCESS; +} +#endif diff --git a/compute/tensor/src/cpu/arm/bnn/tensor_computing_bnn.h b/compute/tensor/src/cpu/arm/bnn/tensor_computing_bnn.h new file mode 100644 index 00000000..38f4f28e --- /dev/null +++ b/compute/tensor/src/cpu/arm/bnn/tensor_computing_bnn.h @@ -0,0 +1,38 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_TENSOR_COMPUTING_BNN +#define _H_TENSOR_COMPUTING_BNN + +#ifdef _USE_FP16 +#include "cpu/arm/bnn/convolution_transform_bnn.h" +#include "cpu/arm/bnn/convolution_dorefa.h" +#include "cpu/arm/bnn/convolution_xnor.h" + +EE convolution_bnn(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filter, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scale, + TensorDesc biasDesc, + const F16 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *output, + ActivationParamSpec activationDesc, + Arch arch); +#endif +#endif diff --git a/compute/tensor/src/cpu/arm/check.cpp b/compute/tensor/src/cpu/arm/check.cpp new file mode 100644 index 00000000..e4e1ac81 --- /dev/null +++ b/compute/tensor/src/cpu/arm/check.cpp @@ -0,0 +1,115 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "cpu/arm/tensor_computing_arm.h"
+#include "arm_neon_expand.h"
+#ifdef _USE_FP32
+#include "cpu/arm/fp32/tensor_computing_fp32.h"
+#endif
+#ifdef _USE_FP16
+#include "cpu/arm/fp16/tensor_computing_fp16.h"
+#endif
+
+static EE check_u32(TensorDesc inputDescA,
+    const U32 *inputA,
+    TensorDesc inputDescB,
+    const U32 *inputB,
+    CheckMode checkMode,
+    TensorDesc outputDesc,
+    I32 *output)
+{
+    if (nullptr == inputA || nullptr == inputB || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+
+    if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+
+    U32 size = tensorNumElements(inputDescA);
+    U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1];
+    if (tensorNumElements(outputDesc) != loopOuter) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+    I32 length = size / loopOuter;
+    for (U32 j = 0; j < loopOuter; j++) {
+        const U32 *arrayA = inputA + j * length;
+        const U32 *arrayB = inputB + j * length;
+        switch (checkMode) {
+            case CHECK_EQUAL: {
+                uint32x4_t count_v = vdupq_n_u32(0);
+                I32 i = 0;
+                for (; i < length - 3; i += 4) {
+                    uint32x4_t a = vld1q_u32(arrayA + i);
+                    uint32x4_t b = vld1q_u32(arrayB + i);
+                    // vceqq_u32 sets matching lanes to all-ones (i.e. -1), so
+                    // subtracting the mask adds 1 to the counter per match
+                    count_v = vsubq_u32(count_v, vceqq_u32(a, b));
+                }
+                I32 count = vaddvq_u32(count_v);
+                for (; i < length; i++) {
+                    if (arrayA[i] == arrayB[i]) {
+                        count++;
+                    }
+                }
+                output[j] = (count == length);
+                break;
+            }
+            default:
+                CHECK_STATUS(NOT_SUPPORTED);
+                break;
+        }
+    }
+    return SUCCESS;
+}
+
+EE check_arm(TensorDesc inputDescA,
+    const void *inputA,
+    TensorDesc inputDescB,
+    const void *inputB,
+    CheckParamSpec p,
+    TensorDesc outputDesc,
+    void *output)
+{
+    DataType idt = inputDescA.dt;
+    EE ret = SUCCESS;
+    switch (idt) {
+#ifdef _USE_FP32
+        case DT_F32: {
+            ret = check_fp32(inputDescA, (const F32 *)inputA, inputDescB, (const F32 *)inputB,
+                p.check_mode, outputDesc, (I32 *)output);
+            break;
+        }
+#endif
+#ifdef _USE_FP16
+        case DT_F16: {
+            ret = check_fp16(inputDescA, (const F16 *)inputA, inputDescB, (const F16 *)inputB,
+                p.check_mode, outputDesc, (I32 *)output);
+            break;
+        }
+#endif
+        case DT_U32: {
+            ret = check_u32(inputDescA, (const U32 *)inputA, inputDescB, (const U32 *)inputB,
+                p.check_mode, outputDesc, (I32 *)output);
+            break;
+        }
+        case DT_I32: {
+            // equality is bit-identical, so the unsigned kernel also serves I32
+            ret = check_u32(inputDescA, (const U32 *)inputA, inputDescB, (const U32 *)inputB,
+                p.check_mode, outputDesc, (I32 *)output);
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+
+    return ret;
+}
diff --git 
a/compute/tensor/src/cpu/arm/clip.cpp b/compute/tensor/src/cpu/arm/clip.cpp new file mode 100644 index 00000000..e5de09c7 --- /dev/null +++ b/compute/tensor/src/cpu/arm/clip.cpp @@ -0,0 +1,44 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif + +EE clip_arm(TensorDesc inputDesc, void *input, ClipParamSpec p, TensorDesc outputDesc, void *output) +{ + UNUSED(outputDesc); + EE ret = SUCCESS; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = clip_fp32((F32 *)input, (F32 *)output, tensorNumElements(inputDesc), p.min, p.max); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = clip_fp16((F16 *)input, (F16 *)output, tensorNumElements(inputDesc), p.min, p.max); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/convolution.cpp b/compute/tensor/src/cpu/arm/convolution.cpp new file mode 100644 index 00000000..a14c003e --- /dev/null +++ b/compute/tensor/src/cpu/arm/convolution.cpp @@ -0,0 +1,492 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
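clip_arm above is only a precision dispatcher; the kernels it selects clamp every element into [p.min, p.max]. A scalar reference of that contract (a sketch of the assumed semantics behind clip_fp32/clip_fp16, which presumably vectorize the same min/max pair):

    #include <algorithm>

    // Reference semantics for the clip kernels: y[i] = min(max(x[i], lo), hi).
    template <typename T>
    static void clip_ref(const T *x, T *y, int len, float lo, float hi)
    {
        for (int i = 0; i < len; i++) {
            y[i] = std::min(std::max(x[i], (T)lo), (T)hi);
        }
    }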
+
+#include <cstring>  // memset
+#include <cfloat>   // FLT_MAX
+#include "cpu/arm/tensor_computing_arm.h"
+#ifdef _USE_FP32
+#include "cpu/arm/fp32/tensor_computing_fp32.h"
+#endif
+#ifdef _USE_FP16
+#include "cpu/arm/fp16/tensor_computing_fp16.h"
+#endif
+#ifdef _USE_INT8
+#include "cpu/arm/int8/tensor_computing_int8.h"
+#endif
+#ifdef _USE_FP16
+#include "cpu/arm/bnn/tensor_computing_bnn.h"
+#endif
+#include "ut_util.h"
+
+EE convolution_infer_forward_algorithm_arm(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionPolicy policy,
+    ConvolutionForwardAlgorithm *algorithm,
+    DataType targetDataType)
+{
+    UNUSED(outputDesc);
+    if (nullptr == algorithm) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    if (*algorithm != CONVOLUTION_ALGORITHM_NULL) {
+        return SUCCESS;
+    }
+
+    EE ret = SUCCESS;
+    if (policy == CONVOLUTION_FASTEST) {
+        DataType idt, fdt;
+        DataFormat idf, fdf;
+        U32 in, ic, ih, iw;
+        U32 fn, fc, fh, fw;
+        CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+        CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw));
+        U32 group = convParamSpec.group;
+        U32 strideH = convParamSpec.stride_h;
+        U32 strideW = convParamSpec.stride_w;
+        U32 paddingT = convParamSpec.padding_top;
+        U32 paddingB = convParamSpec.padding_bottom;
+        U32 paddingL = convParamSpec.padding_left;
+        U32 paddingR = convParamSpec.padding_right;
+        U32 dilateH = convParamSpec.dilatedRate_h;
+        U32 dilateW = convParamSpec.dilatedRate_w;
+        if (dilateH > 1 || dilateW > 1) {
+            *algorithm = CONVOLUTION_ALGORITHM_GEMM;
+            return SUCCESS;
+        }
+
+        if ((idf != DF_NCHWC8 || ic / group % 8 != 0) && DT_I8 != idt) {
+            *algorithm = CONVOLUTION_ALGORITHM_GEMM_ICNCHW;
+        } else if (fh == 3 && fw == 3 && strideH == 1 && strideW == 1 && paddingT == 1 &&
+            paddingB == 1 && paddingL == 1 && paddingR == 1) {
+            *algorithm = CONVOLUTION_ALGORITHM_WINOGRAD;
+        } else {
+            *algorithm = CONVOLUTION_ALGORITHM_GEMM;
+        }
+
+        switch (targetDataType) {
+            case DT_BIN01: {
+                *algorithm = CONVOLUTION_ALGORITHM_BNN;
+                break;
+            }
+            case DT_BIN11: {
+                *algorithm = CONVOLUTION_ALGORITHM_BNN;
+                break;
+            }
+            case DT_I8: {
+                if (*algorithm == CONVOLUTION_ALGORITHM_WINOGRAD) {
+                    *algorithm = CONVOLUTION_ALGORITHM_GEMM;
+                }
+                break;
+            }
+            default:
+                break;
+        }
+
+#ifndef __aarch64__
+        if (CONVOLUTION_ALGORITHM_GEMM_ICNCHW != *algorithm) {
+            *algorithm = CONVOLUTION_ALGORITHM_GEMM;
+        }
+        return SUCCESS;
+#endif
+    } else if (policy == CONVOLUTION_TUNNING) {
+        std::vector<ConvolutionForwardAlgorithm> convolutionAlgorithms = {
+            CONVOLUTION_ALGORITHM_GEMM, CONVOLUTION_ALGORITHM_GEMM_ICNCHW,
+            CONVOLUTION_ALGORITHM_WINOGRAD};  // assumed candidate set to benchmark
+        U32 filterBytes = 0;
+        U32 tmpBytes = 0;
+        for (U32 i = 0; i < convolutionAlgorithms.size(); i++) {
+            U32 bytes = 0;
+            CHECK_STATUS(convolution_transform_filter_bytes_arm(
+                filterDesc, convParamSpec, convolutionAlgorithms[i], &bytes));
+            filterBytes = (bytes > filterBytes) ? bytes : filterBytes;
+            CHECK_STATUS(convolution_infer_forward_tmp_bytes_arm(inputDesc, filterDesc, outputDesc,
+                convParamSpec, convolutionAlgorithms[i], &bytes));
+            tmpBytes = (bytes > tmpBytes) ? bytes : tmpBytes;
+        }
+        TensorDesc biasDesc = tensor1d(filterDesc.dt, outputDesc.dims[3]);
+        TensorDesc scaleDesc = tensor1d(DT_F32, outputDesc.dims[2]);
+        U8 *input = ut_input_v(tensorNumElements(inputDesc), inputDesc.dt, UT_INIT_RANDOM);
+        U8 *filter = ut_input_v(tensorNumElements(filterDesc), filterDesc.dt, UT_INIT_RANDOM);
+        U8 *filterTransformed =
+            ut_input_v(filterBytes / bytesOf(filterDesc.dt), filterDesc.dt, UT_INIT_RANDOM);
+        U8 *bias = ut_input_v(tensorNumElements(biasDesc), biasDesc.dt, UT_INIT_RANDOM);
+        U8 *scale = ut_input_v(tensorNumElements(scaleDesc), scaleDesc.dt, UT_INIT_RANDOM);
+        U8 *tmp = ut_input_v(tmpBytes / bytesOf(inputDesc.dt), inputDesc.dt, UT_INIT_ZERO);
+        U8 *output = ut_input_v(tensorNumElements(outputDesc), outputDesc.dt, UT_INIT_ZERO);
+        U32 algorithmIndex = 0;
+        ActivationParamSpec activationDesc;
+        activationDesc.mode = ACTIVATION_RELU;
+        activationDesc.value[0] = 0;
+        double timeMin = FLT_MAX;  // hoisted out of the loop so the fastest candidate wins
+        for (U32 i = 0; i < convolutionAlgorithms.size(); i++) {
+            TensorDesc ftmDesc;
+            CHECK_STATUS(convolution_transform_filter_arm(filterDesc, filter, convParamSpec,
+                convolutionAlgorithms[i], &ftmDesc, filterTransformed));
+
+            memset(tmp, 0, tmpBytes);
+            double timeStart = ut_time_ms();
+            CHECK_STATUS(convolution_arm(inputDesc, input, ftmDesc, filterTransformed,
+                convParamSpec, convolutionAlgorithms[i], scaleDesc, scale, biasDesc, bias, tmpBytes,
+                tmp, outputDesc, output, activationDesc, ARM_A76));
+            double timeEnd = ut_time_ms();
+            if (timeMin > timeEnd - timeStart) {
+                timeMin = timeEnd - timeStart;
+                algorithmIndex = i;
+            }
+        }
+        free(input);
+        free(filter);
+        free(filterTransformed);
+        free(bias);
+        free(scale);
+        free(tmp);
+        free(output);
+        *algorithm = convolutionAlgorithms[algorithmIndex];
+        ret = SUCCESS;
+    } else {
+        ret = NOT_SUPPORTED;
+    }
+    return ret;
+}
+
+EE convolution_transform_filter_bytes_arm(TensorDesc filterDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    U32 *bytes)
+{
+    if (nullptr == bytes) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    EE ret = SUCCESS;
+
+    DataType fdt;
+    DataFormat fdf;
+    U32 fn, fc, fh, fw;
+    CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw));
+    U32 fnAlignSize = 8;
+    if (filterDesc.dt == DT_F16) {
+        fnAlignSize = 16;
+    }
+    U32 fnGroupSize = fn / convParamSpec.group;
+    U32 fnPadding = (fnGroupSize / fnAlignSize + ((fnGroupSize % fnAlignSize) == 0 ?
0 : 1)) * + fnAlignSize * convParamSpec.group; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_WINOGRAD: + *bytes = fnPadding * fc * 6 * 6; + break; + case CONVOLUTION_ALGORITHM_DIRECT: + *bytes = fnPadding * fc * fh * fw; + break; + case CONVOLUTION_ALGORITHM_GEMM: + *bytes = fnPadding * fc * fh * fw; + break; + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: + *bytes = fnPadding * fc * fh * fw; + break; + case CONVOLUTION_ALGORITHM_BNN: + *bytes = fnPadding * fc * fh * fw; + break; + default: + return NOT_SUPPORTED; + } + *bytes *= bytesOf(fdt); + + switch (filterDesc.dt) { + case DT_BIN01: { + *bytes /= 8; + break; + } + case DT_BIN11: { + *bytes /= 8; + break; + } + default: + break; + } + *bytes += 32; + return ret; +} + +EE convolution_transform_filter_arm(TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = convolution_transform_filter_fp32(filterDesc, (F32 *)filter, convParamSpec, + algorithm, ftmDesc, (F32 *)filterTransformed); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = convolution_transform_filter_fp16(filterDesc, (F16 *)filter, convParamSpec, + algorithm, ftmDesc, (F16 *)filterTransformed); + break; + } +#endif +#ifdef _USE_INT8 + case DT_I8: { + ret = convolution_transform_filter_int8( + filterDesc, filter, convParamSpec, algorithm, ftmDesc, filterTransformed); + break; + } + case DT_F16_8Q: { + ret = convolution_transform_filter_int8( + filterDesc, filter, convParamSpec, algorithm, ftmDesc, filterTransformed); + break; + } +#endif +#ifdef _USE_FP16 + case DT_BIN01: { + ret = convolution_transform_filter_bnn( + filterDesc, (BIN8 *)filter, ftmDesc, (BIN8 *)filterTransformed); + break; + } + case DT_BIN11: { + ret = convolution_transform_filter_bnn( + filterDesc, (BIN8 *)filter, ftmDesc, (BIN8 *)filterTransformed); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + U32 tile_size = 0; + switch (fdt) { + case DT_F32: +#ifdef __aarch64__ + tile_size = 12; +#else + tile_size = 6; +#endif + break; + case DT_F16: + tile_size = 8; + break; + case DT_I8: + tile_size = 12; + break; + case DT_BIN01: + tile_size = 0; + break; + case DT_BIN11: + tile_size = 0; + break; + default: + return NOT_SUPPORTED; + } + EE ret = SUCCESS; + U32 element_size = bytesOf(idt); + *bytes = (ic * ih_pad * iw_pad) * element_size; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + break; + case CONVOLUTION_ALGORITHM_GEMM: + *bytes += tile_size * fh * fw * ic * OMP_NUM_THREADS * 
element_size; + if (fdt == DT_I8) { + *bytes += ic * ih * iw; + } + if (odt == DT_I8) { + // scaled bias + results before quantization + *bytes += (oc + on * oc * oh * ow) * bytesOf(DT_I32); + } + break; + case CONVOLUTION_ALGORITHM_WINOGRAD: { + U32 tile_h = (oh + 3) / 4; + U32 tile_w = (ow + 3) / 4; + U32 pad_left = paddingL; + U32 pad_right = paddingR + (tile_w * 4 - ow); + U32 pad_top = paddingT; + U32 pad_bottom = paddingB + (tile_h * 4 - oh); + ih_pad = ih + pad_top + pad_bottom; + iw_pad = iw + pad_left + pad_right; + *bytes = ic * ih_pad * iw_pad * element_size; + if (fdt == DT_F32) { + *bytes += (ic + 8) * 6 * 6 * 12 * element_size; + } else if (fdt == DT_F16) { + *bytes += (ic + oc) * 6 * 6 * 8 * element_size; + } else if (fdt == DT_I8) { + // itm (int16 for int8 inputs) and otm (otm just contains o8 each time) + *bytes += (ic + 8) * 6 * 6 * 12 * bytesOf(DT_F16); + // quantized transformed input + *bytes += ic * 6 * 6 * 12; + if (odt == DT_I8) { + // Output before quantization + *bytes += on * oc * oh * ow * bytesOf(DT_F16); + } + } else { + ret = NOT_SUPPORTED; + } + break; + } + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: + *bytes += tile_size * fh * fw * ic * element_size; + break; + case CONVOLUTION_ALGORITHM_BNN: + *bytes += (8 * fh * fw * ic + ic * ih * iw) * element_size; + *bytes /= 8; + break; + default: + ret = NOT_MATCH; + break; + } + if (DT_I8 == fdt && DF_NCHW == idf) { + CHECK_REQUIREMENT(ic % 8 == 0); + *bytes += tensorNumBytes(inputDesc); + } + *bytes += 32; + + // pre data processing space for not complete NCHWC8 group convolution input + U32 icGroupSize = ic / convParamSpec.group; + if (idf == DF_NCHWC8 && icGroupSize % 8 != 0) { + *bytes += tensorNumBytes(inputDesc); + } + return ret; +} + +EE convolution_arm(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc scaleDesc, + const void *scale, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + UNUSED(scaleDesc); + UNUSED(scale); + U32 group = convParamSpec.group; + U32 batchAxis = inputDesc.nDims - 1; + U32 dataChannelAxis = inputDesc.nDims - 2; + U32 filterChannelAxis = filterDesc.nDims - 1; + U32 biasChannelAxis = 0; + CHECK_REQUIREMENT(inputDesc.dims[batchAxis] == 1); + U32 icGroupSize = inputDesc.dims[dataChannelAxis] / group; + // pre data processing space for not complete NCHWC8 group convolution input + void *inputTransform; + if (inputDesc.df == DF_NCHWC8 && icGroupSize % 8 != 0) { + TensorDesc tmpInputDesc = inputDesc; + tmpInputDesc.df = DF_NCHW; + transformToNCHW(inputDesc, input, tmpInputDesc, tmp); + inputTransform = tmp; + tmp = (U8 *)tmp + tensorNumBytes(tmpInputDesc); + tmpBytes -= tensorNumBytes(tmpInputDesc); + inputDesc.df = DF_NCHW; + } else { + inputTransform = input; + } + TensorDesc tmpInputDesc = inputDesc; + tmpInputDesc.dims[dataChannelAxis] /= group; + TensorDesc tmpOutputDesc = outputDesc; + tmpOutputDesc.dims[dataChannelAxis] /= group; + TensorDesc tmpFilterDesc = filterDesc; + tmpFilterDesc.dims[filterChannelAxis] /= group; + TensorDesc tmpBiasDesc = biasDesc; + tmpBiasDesc.dims[biasChannelAxis] /= group; + EE ret = SUCCESS; + for (U32 g = 0; g < group; g++) { + void *tmpInput = (U8 *)inputTransform + g * tensorNumBytes(tmpInputDesc); + const void *tmpFilter = (U8 *)filter + g * tensorNumBytes(tmpFilterDesc); + const void *tmpBias = (U8 *)bias + g * 
tensorNumBytes(tmpBiasDesc); + void *tmpOutput = (U8 *)output + g * tensorNumBytes(tmpOutputDesc); + switch (filterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = convolution_fp32(tmpInputDesc, (F32 *)tmpInput, tmpFilterDesc, + (F32 *)tmpFilter, convParamSpec, algorithm, tmpBiasDesc, (F32 *)tmpBias, + tmpBytes, tmp, tmpOutputDesc, (F32 *)tmpOutput, activationDesc, arch); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = convolution_fp16(tmpInputDesc, (F16 *)tmpInput, tmpFilterDesc, + (F16 *)tmpFilter, convParamSpec, algorithm, tmpBiasDesc, (F16 *)tmpBias, + tmpBytes, tmp, tmpOutputDesc, (F16 *)tmpOutput, activationDesc, arch); + break; + } +#endif +#ifdef _USE_INT8 + case DT_I8: { + ret = convolution_int8(tmpInputDesc, (INT8 *)tmpInput, tmpFilterDesc, + (INT8 *)tmpFilter, (F16 *)scale, convParamSpec, algorithm, tmpBiasDesc, + (F16 *)tmpBias, tmpBytes, tmp, tmpOutputDesc, tmpOutput, activationDesc, arch); + break; + } +#endif +#ifdef _USE_FP16 + case DT_BIN01: { + ret = convolution_bnn(tmpInputDesc, (F16 *)tmpInput, tmpFilterDesc, + (BIN8 *)tmpFilter, convParamSpec, scaleDesc, (F16 *)scale, tmpBiasDesc, + (F16 *)tmpBias, tmpBytes, tmp, tmpOutputDesc, (F16 *)tmpOutput, activationDesc, + arch); + break; + } + case DT_BIN11: { + ret = convolution_bnn(tmpInputDesc, (F16 *)tmpInput, tmpFilterDesc, + (BIN8 *)tmpFilter, convParamSpec, scaleDesc, (F16 *)scale, tmpBiasDesc, + (F16 *)tmpBias, tmpBytes, tmp, tmpOutputDesc, (F16 *)tmpOutput, activationDesc, + arch); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/deconvolution.cpp b/compute/tensor/src/cpu/arm/deconvolution.cpp new file mode 100644 index 00000000..6c13bc8f --- /dev/null +++ b/compute/tensor/src/cpu/arm/deconvolution.cpp @@ -0,0 +1,49 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
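The CONVOLUTION_ALGORITHM_WINOGRAD case in convolution_infer_forward_tmp_bytes_arm above sizes its buffers for F(4x4, 3x3): the output is tiled into 4x4 blocks, each block consumes a 6x6 input patch, and the right/bottom paddings are grown until the tiling covers the whole output. The same arithmetic in isolation (the sizes are an assumed example, not taken from this patch):

    #include <cstdio>

    int main()
    {
        unsigned oh = 14, ow = 14;       // output of a 3x3 stride-1 pad-1 conv
        unsigned ih = 14, iw = 14;
        unsigned padT = 1, padB = 1, padL = 1, padR = 1;
        unsigned tile_h = (oh + 3) / 4;  // ceil(oh / 4) output tiles per column
        unsigned tile_w = (ow + 3) / 4;
        unsigned ih_pad = ih + padT + padB + (tile_h * 4 - oh);
        unsigned iw_pad = iw + padL + padR + (tile_w * 4 - ow);
        printf("%ux%u tiles, padded input %ux%u\n", tile_h, tile_w, ih_pad, iw_pad);
        return 0;  // prints: 4x4 tiles, padded input 18x18
    }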
+ +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif + +EE deconvolution_transform_filter_arm(TensorDesc filterDesc, + const void *filter, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = deconvolution_transform_filter_fp32( + filterDesc, (F32 *)filter, algorithm, ftmDesc, (F32 *)filterTransformed); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = deconvolution_transform_filter_fp16( + filterDesc, (F16 *)filter, algorithm, ftmDesc, (F16 *)filterTransformed); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/depthwise_convolution.cpp b/compute/tensor/src/cpu/arm/depthwise_convolution.cpp new file mode 100644 index 00000000..c19516fa --- /dev/null +++ b/compute/tensor/src/cpu/arm/depthwise_convolution.cpp @@ -0,0 +1,125 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif +#ifdef _USE_INT8 +#include "cpu/arm/int8/tensor_computing_int8.h" +#endif + +EE depthwise_convolution_transform_filter_arm(TensorDesc filterDesc, + const void *filter, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed) +{ + DataFormat ftmDataFormat; + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + ftmDataFormat = DF_NCHWC8; + break; + default: + return NOT_MATCH; + } + *ftmDesc = filterDesc; + ftmDesc->df = ftmDataFormat; + EE ret = NOT_SUPPORTED; + if (filterDesc.df == ftmDataFormat) { + memcpy(filterTransformed, filter, tensorNumBytes(filterDesc)); + ret = SUCCESS; + } else if (filterDesc.df == DF_NCHW) { + if (ftmDataFormat == DF_NCHWC8) { + ret = transformNCHWToNCHWC8(filterDesc, filter, *ftmDesc, filterTransformed); + } + } + return ret; +} + +EE depthwise_convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *bytes) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + EE ret = SUCCESS; + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + *bytes = ic * ih_pad * iw_pad; + break; + default: { + ret = NOT_MATCH; + *bytes = 0; + break; + } + } + *bytes *= bytesOf(idt); + + switch (filterDesc.dt) { +#ifdef _USE_INT8 + case DT_I8: { + *bytes += ic * oh * ow * sizeof(I32); + break; + } +#endif + default: + break; + } + *bytes += 32; + return ret; +} + +EE depthwise_convolution_arm(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec depthwiseActivationParamSpec, + Arch arch) +{ + TensorDesc blankTensorDesc; + ActivationParamSpec blankActivationParamSpec; + return depthwise_pointwise_convolution_arm(inputDesc, input, filterDesc, filter, blankTensorDesc, + nullptr, convParamSpec, algorithm, blankTensorDesc, bias, biasDesc, nullptr, tmpBytes, tmp, + outputDesc, output, depthwiseActivationParamSpec, blankActivationParamSpec, arch); +} diff --git a/compute/tensor/src/cpu/arm/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/arm/depthwise_pointwise_convolution.cpp new file mode 100644 index 00000000..34058bdf --- /dev/null +++ b/compute/tensor/src/cpu/arm/depthwise_pointwise_convolution.cpp @@ -0,0 +1,211 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif +#ifdef _USE_INT8 +#include "cpu/arm/int8/tensor_computing_int8.h" +#endif + +EE depthwise_pointwise_convolution_infer_forward_algorithm_arm(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + DepthwiseConvolutionForwardAlgorithm *algorithm, + DataType targetDataType) +{ + UNUSED(policy); + if (nullptr == algorithm) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = SUCCESS; + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + *algorithm = DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT; + if (convParamSpec.dilatedRate_h != 1 || convParamSpec.dilatedRate_w != 1) { + return ret; + } + + switch (targetDataType) { + case DT_F16: { + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if (fh == 3 && fw == 3 && strideH == 1 && strideW == 1 && paddingT == 1 && + paddingB == 1 && paddingL == 1 && paddingR == 1 && ow % 4 == 0 && ow >= 12) { + *algorithm = DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_3X3S1P1; + } + break; + } + default: { + break; + } + } + return ret; +} + +EE depthwise_pointwise_convolution_transform_filter_arm(TensorDesc dwFilterDesc, + const void *dwFilter, + TensorDesc pwFilterDesc, + const void *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc *dwFtmDesc, + void *dwFilterTransformed, + TensorDesc *pwFtmDesc, + void *pwFilterTransformed) +{ + EE ret = depthwise_convolution_transform_filter_arm(dwFilterDesc, dwFilter, + DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, dwFtmDesc, dwFilterTransformed); + if (ret == SUCCESS) { + convParamSpec.group = 1; + ret = convolution_transform_filter_arm(pwFilterDesc, pwFilter, convParamSpec, + CONVOLUTION_ALGORITHM_GEMM, pwFtmDesc, pwFilterTransformed); + 
} + return ret; +} + +EE depthwise_pointwise_convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *bytes) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + EE ret = SUCCESS; + switch (algorithm) { + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: + *bytes = ic * ih_pad * iw_pad + ic * oh * ow; + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT_NO_PADDING: + *bytes = ic * oh * ow; + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_3X3S1P1: + *bytes = ic * oh * ow + ic * 8; + break; + default: { + ret = NOT_MATCH; + *bytes = 0; + break; + } + } + *bytes *= bytesOf(idt); + + switch (dwFilterDesc.dt) { +#ifdef _USE_INT8 + case DT_I8: { + *bytes += ic * oh * ow * sizeof(I32); + break; + } +#endif + default: + break; + } + *bytes += 32; + return ret; +} + +EE depthwise_pointwise_convolution_arm(TensorDesc inputDesc, + void *input, + TensorDesc dwFilterDesc, + const void *dwFilter, + TensorDesc pwFilterDesc, + const void *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const void *dwBias, + TensorDesc pwBiasDesc, + const void *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + EE ret = SUCCESS; + switch (dwFilterDesc.dt) { +#ifdef _USE_FP16 + case DT_F16: { + ret = depthwise_pointwise_convolution_fp16(inputDesc, (F16 *)input, dwFilterDesc, + (const F16 *)dwFilter, pwFilterDesc, (const F16 *)pwFilter, convParamSpec, + algorithm, dwBiasDesc, (const F16 *)dwBias, pwBiasDesc, (const F16 *)pwBias, + tmpBytes, tmp, outputDesc, (F16 *)output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); + break; + } +#endif +#ifdef _USE_FP32 + case DT_F32: { + ret = depthwise_pointwise_convolution_fp32(inputDesc, (F32 *)input, dwFilterDesc, + (const F32 *)dwFilter, pwFilterDesc, (const F32 *)pwFilter, convParamSpec, + algorithm, dwBiasDesc, (const F32 *)dwBias, pwBiasDesc, (const F32 *)pwBias, + tmpBytes, tmp, outputDesc, (F32 *)output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); + break; + } +#endif +#ifdef _USE_INT8 + case DT_I8: { + ret = depthwise_pointwise_convolution_int8(inputDesc, (INT8 *)input, dwFilterDesc, + (const INT8 *)dwFilter, pwFilterDesc, (const INT8 *)pwFilter, convParamSpec, + algorithm, dwBiasDesc, (const I32 *)dwBias, pwBiasDesc, (const I32 *)pwBias, + tmpBytes, tmp, outputDesc, (I32 *)output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/eltwise.cpp b/compute/tensor/src/cpu/arm/eltwise.cpp new file mode 100644 index 00000000..fcc8db99 --- /dev/null +++ 
b/compute/tensor/src/cpu/arm/eltwise.cpp
@@ -0,0 +1,50 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <vector>
+#include "cpu/arm/tensor_computing_arm.h"
+#ifdef _USE_FP32
+#include "cpu/arm/fp32/tensor_computing_fp32.h"
+#endif
+#ifdef _USE_FP16
+#include "cpu/arm/fp16/tensor_computing_fp16.h"
+#endif
+
+EE eltwise_arm(DataType dataType,
+    std::vector<void *> input,
+    std::vector<U32> inputSize,
+    U32 num,
+    U32 len,
+    void *output,
+    EltwiseMode eltwiseMode)
+{
+    EE ret = SUCCESS;
+    switch (dataType) {
+#ifdef _USE_FP32
+        case DT_F32: {
+            ret = eltwise_fp32(input, inputSize, num, len, output, eltwiseMode);
+            break;
+        }
+#endif
+#ifdef _USE_FP16
+        case DT_F16: {
+            ret = eltwise_fp16(input, inputSize, num, len, output, eltwiseMode);
+            break;
+        }
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/cpu/arm/fp16/arm_functions_fp16.h b/compute/tensor/src/cpu/arm/fp16/arm_functions_fp16.h
new file mode 100644
index 00000000..8cca0af4
--- /dev/null
+++ b/compute/tensor/src/cpu/arm/fp16/arm_functions_fp16.h
@@ -0,0 +1,432 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
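+
+// FP16 array helpers shared by the ARM kernels: reductions (sum, mean, variance,
+// max), elementwise scale/power, and the fused activation routine. Each helper
+// walks the data 8 (or 4) lanes at a time with NEON intrinsics and finishes the
+// remaining elements in a scalar tail loop.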
+
+#ifndef _H_ARM_FUNCTIONS_FP16
+#define _H_ARM_FUNCTIONS_FP16
+
+#include <math.h>
+#include <string.h>
+#include "arm_neon_expand.h"
+#include "types.h"
+
+// array sum
+inline F32 array_sum_f16(const F16 *data, I32 len)
+{
+    if (len <= 0) {
+        return 0;
+    }
+
+    I32 i = 0;
+    F32 sum_s = 0;
+    float16x8_t sum_v = vdupq_n_f16(0);
+    for (i = 0; i < len - 7; i += 8) {
+        float16x8_t in = vld1q_f16(data + i);
+        sum_v = vaddq_f16(sum_v, in);
+    }
+    sum_s += vaddvq_f16(sum_v);
+    for (; i < len; i++) {
+        sum_s += data[i];
+    }
+    return sum_s;
+}
+
+// array mean
+inline F32 array_mean_f16(const F16 *data, I32 len)
+{
+    if (len <= 0) {
+        return 0;
+    }
+    return array_sum_f16(data, len) / len;
+}
+
+// array var
+inline F32 array_var_f16(const F16 *data, I32 len, F32 mean)
+{
+    if (len <= 0) {
+        return 0;
+    }
+
+    I32 i = 0;
+    F32 sum_s = 0;
+    float32x4_t mean_v = vdupq_n_f32(mean);
+    for (i = 0; i < len - 3; i += 4) {
+        float16x4_t in = vld1_f16(data + i);
+        float32x4_t in_f32 = vcvt_f32_f16(in);
+        float32x4_t tmp_v = vsubq_f32(in_f32, mean_v);
+        float32x4_t sum_v = vmulq_f32(tmp_v, tmp_v);
+        sum_s += vaddvq_f32(sum_v);
+    }
+    for (; i < len; i++) {
+        F16 in = data[i];
+        F32 tmp = in - mean;
+        sum_s += tmp * tmp;
+    }
+    return sum_s / len;
+}
+
+// array max
+inline F16 array_max_f16(const F16 *data, I32 len)
+{
+    F16 max_s = data[0];
+    I32 i = 0;
+    if (len >= 8) {
+        float16x8_t max_v, tmp_v;
+        max_v = vld1q_f16(data);
+        for (i = 8; i < len - 7; i += 8) {
+            tmp_v = vld1q_f16(data + i);
+            max_v = vmaxq_f16(tmp_v, max_v);
+        }
+        max_s = vmaxvq_f16(max_v);
+    }
+
+    for (; i < len; i++) {
+        if (data[i] > max_s) {
+            max_s = data[i];
+        }
+    }
+
+    return max_s;
+}
+
+inline F16 array_maxabs_f16(const F16 *data, I32 len)
+{
+    F16 max_s = abs(data[0]);
+    I32 i = 0;
+    if (len >= 8) {
+        float16x8_t max_v, tmp_v;
+        max_v = vld1q_f16(data);
+        max_v = vabsq_f16(max_v);
+        for (i = 8; i < len - 7; i += 8) {
+            tmp_v = vld1q_f16(data + i);
+            tmp_v = vabsq_f16(tmp_v);
+            max_v = vmaxq_f16(tmp_v, max_v);
+        }
+        max_s = vmaxvq_f16(max_v);
+    }
+
+    for (; i < len; i++) {
+        if (abs(data[i]) > max_s) {
+            max_s = abs(data[i]);
+        }
+    }
+
+    return max_s;
+}
+
+inline void array_scale_f16(const F16 *input, F16 *output, I32 len, F32 alpha, F32 beta)
+{
+    I32 i = 0;
+#ifdef _USE_F16_MIX_PRECISION
+    float32x4_t alpha_v = vdupq_n_f32(alpha);
+    float32x4_t beta_v = vdupq_n_f32(beta);
+    for (i = 0; i < len - 3; i += 4) {
+        float16x4_t in = vld1_f16(input + i);
+        float32x4_t in_f32 = vcvt_f32_f16(in);
+        float32x4_t result = vfmaq_f32(beta_v, alpha_v, in_f32);
+        vst1_f16(output + i, vcvt_f16_f32(result));
+    }
+#else
+    float16x8_t alpha_v = vdupq_n_f16(alpha);
+    float16x8_t beta_v = vdupq_n_f16(beta);
+    for (i = 0; i < len - 7; i += 8) {
+        float16x8_t in = vld1q_f16(input + i);
+        float16x8_t tmp_v = vfmaq_f16(beta_v, alpha_v, in);
+        vst1q_f16(output + i, tmp_v);
+    }
+#endif
+    for (; i < len; i++) {
+        output[i] = alpha * input[i] + beta;
+    }
+}
+
+inline void array_power_f16(F16 *input, F16 *output, I32 len, F32 power)
+{
+    I32 i = 0;
+    if (power == -1) {
+#ifdef _USE_F16_MIX_PRECISION
+        float32x4_t one_v = vdupq_n_f32(1);
+        for (i = 0; i < len - 3; i += 4) {
+            float16x4_t in = vld1_f16(input + i);
+            float32x4_t in_f32 = vcvt_f32_f16(in);
+            float32x4_t result = vdivq_f32(one_v, in_f32);
+            vst1_f16(output + i, vcvt_f16_f32(result));
+        }
+#else
+        float16x8_t one_v = vdupq_n_f16(1);
+        for (i = 0; i < len - 7; i += 8) {
+            float16x8_t in = vld1q_f16(input + i);
+            float16x8_t tmp_v = vdivq_f16(one_v, in);
+            vst1q_f16(output + i, tmp_v);
+        }
+#endif
+    } else if (power == 0.5) {
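+        // power == 0.5 reduces to an elementwise square root, e.g.
+        // array_power_f16(x, y, len, 0.5) computes y[i] = sqrt(x[i])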
+#ifdef _USE_F16_MIX_PRECISION + for (i = 0; i < len - 3; i += 4) { + float16x4_t in = vld1_f16(input + i); + float32x4_t in_f32 = vcvt_f32_f16(in); + float32x4_t result = vsqrtq_f32(in_f32); + vst1_f16(output + i, vcvt_f16_f32(result)); + } +#else + for (i = 0; i < len - 7; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t tmp_v = vsqrtq_f16(in); + vst1q_f16(output + i, tmp_v); + } +#endif + } else if (power == 1) { + if (input != output) { + memcpy(output, input, len * sizeof(F16)); + } + i = len; + } else if (power == 2) { +#ifdef _USE_F16_MIX_PRECISION + for (i = 0; i < len - 3; i += 4) { + float16x4_t in = vld1_f16(input + i); + float32x4_t in_f32 = vcvt_f32_f16(in); + float32x4_t result = vmulq_f32(in_f32, in_f32); + vst1_f16(output + i, vcvt_f16_f32(result)); + } +#else + for (i = 0; i < len - 7; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t tmp_v = vmulq_f16(in, in); + vst1q_f16(output + i, tmp_v); + } +#endif + } + for (; i < len; i++) { + output[i] = powf(input[i], power); + } +} + +inline EE activation_fp16(F16 *input, U32 len, ActivationParamSpec activationDesc, F16 *output) +{ + float16x8_t in, out; + float16x8_t zero = vdupq_n_f16(float16_t(0.)); + float16x8_t one = vdupq_n_f16(float16_t(1.)); + float16x8_t three = vdupq_n_f16(float16_t(3.)); + float16x8_t six = vdupq_n_f16(float16_t(6.)); + U32 len_main = len / 8; + U32 len_tail = len % 8; + + F16 value; + switch (activationDesc.mode) { + case ACTIVATION_NULL: { + break; + } + case ACTIVATION_RELU: { + if (activationDesc.value[0] == 0) { + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f16(input); + out = vmaxq_f16(zero, in); + vst1q_f16(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + output[i] = (input[i] < 0) ? 0 : input[i]; + } + } else { + float16x8_t scale = vdupq_n_f16(activationDesc.value[0]); + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f16(input); + float16x8_t tmp = vmulq_f16(scale, in); + out = vmaxq_f16(tmp, in); + vst1q_f16(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + float tmp = activationDesc.value[0] * input[i]; + output[i] = (input[i] < tmp) ? tmp : input[i]; + } + } + break; + } + case ACTIVATION_RELU6: { + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f16(input); + out = vmaxq_f16(zero, in); + out = vminq_f16(six, out); + vst1q_f16(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = (input[i] < 0) ? 0 : input[i]; + if (value > 6) { + value = 6; + } + output[i] = value; + } + break; + } + case ACTIVATION_H_SIGMOID: { + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f16(input); + out = vaddq_f16(in, three); + out = vmaxq_f16(out, zero); + out = vminq_f16(out, six); + out = vdivq_f16(out, six); + vst1q_f16(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = input[i] + 3; + value = (value < 0) ? 0 : value; + value = (value > 6) ? 6 : value; + value = value / 6; + output[i] = value; + } + break; + } + case ACTIVATION_H_SWISH: { + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f16(input); + out = vaddq_f16(in, three); + out = vmaxq_f16(out, zero); + out = vminq_f16(out, six); + out = vdivq_f16(out, six); + out = vmulq_f16(out, in); + vst1q_f16(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = input[i] + 3; + value = (value < 0) ? 0 : value; + value = (value > 6) ? 
6 : value; + value = input[i] * value; + value = value / 6; + output[i] = value; + } + break; + } + case ACTIVATION_GELU: { + F16 two_div_PI_sqrt = sqrt(2 / 3.14159265358979323846); + float16x8_t vec0 = vdupq_n_f16(two_div_PI_sqrt); + float16x8_t vec1 = vdupq_n_f16(float16_t(0.044715)); + float16x8_t vec2 = vdupq_n_f16(float16_t(0.5)); + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f16(input); + out = vmulq_f16(in, in); + out = vmulq_f16(out, in); + out = vfmaq_f16(in, vec1, out); + out = vmulq_f16(vec0, out); + out = vtanhq_f16(out); + out = vaddq_f16(one, out); + out = vmulq_f16(vec2, out); + out = vmulq_f16(in, out); + vst1q_f16(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = input[i]; + value = two_div_PI_sqrt * (value + 0.044715 * powf(value, 3)); + value = 1.0 - 2.0 / (exp(2.0 * value) + 1.0); + value = 0.5 * (1.0 + value); + value = input[i] * value; + output[i] = value; + } + break; + } + case ACTIVATION_TANH: { + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f16(input); + out = vtanhq_f16(in); + vst1q_f16(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = 1.0 - 2.0 / (exp(2.0 * input[i]) + 1.0); + output[i] = value; + } + break; + } + case ACTIVATION_SIGMOID: { + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f16(input); + out = vsigmoidq_f16(in); + vst1q_f16(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = 1.0 / (1.0 + exp(-1.0 * input[i])); + output[i] = value; + } + break; + } + case ACTIVATION_MISH: { + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f16(input); + out = vmulq_f16( + in, vtanhq_f16(vlogq_f16(vaddq_f16(vexpq_f16_03_percent_error(in), one)))); + vst1q_f16(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = input[i] * tanh(log(exp(input[i]) + 1.0)); + output[i] = value; + } + break; + } + case ACTIVATION_GREATER: { + for (U32 i = 0; i < len; i++) { + output[i] = input[i] > 1 ? 1 : 0; + } + break; + } + default: + return NOT_SUPPORTED; + } + + return SUCCESS; +} + +inline void array_add_f16(const F16 *inputA, const F16 *inputB, F16 *output, I32 len) +{ + I32 i = 0; + for (i = 0; i < len - 7; i += 8) { + float16x8_t a = vld1q_f16(inputA + i); + float16x8_t b = vld1q_f16(inputB + i); + float16x8_t c = vaddq_f16(a, b); + vst1q_f16(output + i, c); + } + + for (; i < len; i++) { + output[i] = inputA[i] + inputB[i]; + } +} + +inline void array_square_and_add_f16(const F16 *inputA, const F16 *inputB, F16 *output, I32 len) +{ + I32 i = 0; + for (i = 0; i < len - 7; i += 8) { + float16x8_t a = vld1q_f16(inputA + i); + float16x8_t b = vld1q_f16(inputB + i); + b = vmulq_f16(b, b); + float16x8_t c = vaddq_f16(a, b); + vst1q_f16(output + i, c); + } + + for (; i < len; i++) { + output[i] = inputA[i] + inputB[i] * inputB[i]; + } +} + +#endif diff --git a/compute/tensor/src/cpu/arm/fp16/attention.cpp b/compute/tensor/src/cpu/arm/fp16/attention.cpp new file mode 100644 index 00000000..050203ab --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/attention.cpp @@ -0,0 +1,79 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <string.h>
+#include "cpu/arm/fp16/tensor_computing_fp16.h"
+
+EE attention_fp16(U32 batch,
+    U32 numHeads,
+    I32 fromSequenceLength,
+    I32 toSequenceLength,
+    const F16 *input,
+    F16 *output)
+{
+    if (nullptr == input || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+
+    F16 mask_s = -10000.0;
+    I32 count = array_sum_f16(input, toSequenceLength);
+    I32 valid = UNI_MIN(count, fromSequenceLength);
+    float16x8_t mask_v = vdupq_n_f16(float16_t(mask_s));
+    float16x8_t one_v = vdupq_n_f16(float16_t(1.0));
+    for (U32 n = 0; n < batch; n++) {
+        for (U32 i = 0; i < numHeads; i++) {
+            if (i == 0) {
+                for (I32 j = 0; j < valid; j++) {
+                    if (j == 0) {
+                        I32 k = 0;
+                        for (; k < toSequenceLength - 7; k += 8) {
+                            float16x8_t in_v = vld1q_f16(input + k);
+                            float16x8_t tmp_v = vsubq_f16(one_v, in_v);
+                            tmp_v = vmulq_f16(tmp_v, mask_v);
+                            vst1q_f16(output + k, tmp_v);
+                        }
+                        for (; k < toSequenceLength; k++) {
+                            F16 value = (1 - input[k]) * mask_s;
+                            output[k] = value;
+                        }
+                    } else {
+                        memcpy(
+                            output + j * toSequenceLength, output, toSequenceLength * sizeof(F16));
+                    }
+                }
+
+                for (I32 j = valid; j < fromSequenceLength; j++) {
+                    if (j == valid) {
+                        I32 k = 0;
+                        for (; k < toSequenceLength - 7; k += 8) {
+                            vst1q_f16(output + j * toSequenceLength + k, mask_v);
+                        }
+                        for (; k < toSequenceLength; k++) {
+                            output[j * toSequenceLength + k] = mask_s;
+                        }
+                    } else {
+                        memcpy(output + j * toSequenceLength, output + valid * toSequenceLength,
+                            toSequenceLength * sizeof(F16));
+                    }
+                }
+            } else {
+                memcpy(output + i * fromSequenceLength * toSequenceLength, output,
+                    fromSequenceLength * toSequenceLength * sizeof(F16));
+            }
+        }
+
+        input += toSequenceLength;
+        output += numHeads * fromSequenceLength * toSequenceLength;
+    }
+    return SUCCESS;
+}
diff --git a/compute/tensor/src/cpu/arm/fp16/attention_mask.cpp b/compute/tensor/src/cpu/arm/fp16/attention_mask.cpp
new file mode 100644
index 00000000..afad68e5
--- /dev/null
+++ b/compute/tensor/src/cpu/arm/fp16/attention_mask.cpp
@@ -0,0 +1,82 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <string.h>
+#include <vector>
+#include "cpu/arm/fp16/tensor_computing_fp16.h"
+
+EE attention_mask_fp16(TensorDesc inputDesc,
+    const F16 *input,
+    AttentionMaskParamSpec p,
+    TensorDesc outputDesc,
+    F16 *output)
+{
+    UNUSED(outputDesc);
+    if (nullptr == input || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    I32 attentionLength = p.attention_length;
+    bool sameLength = p.same_length;
+    float maskValue = p.mask;
+    int qlen = inputDesc.dims[1];
+    int klen = inputDesc.dims[0];
+    int mlen = klen - qlen;
+    I32 length = qlen * klen;
+    std::vector<F16> mask;
+    if (attentionLength < 0) {
+        mask = std::vector<F16>(length, 0);
+    } else {
+        mask = std::vector<F16>(length, 1);
+        for (int i = 0; i < qlen; i++) {
+            int start, loops;
+            if (attentionLength > 0) {
+                int end = mlen + i;
+                start = UNI_MAX(end - attentionLength, 0);
+                loops = end - start + 1;
+            } else {
+                if (sameLength) {
+                    start = i;
+                    loops = qlen + 1;
+                } else {
+                    start = 0;
+                    loops = i + qlen + 1;
+                }
+            }
+            loops = UNI_MAX(loops, 0);
+            start = UNI_MIN(start, klen);
+            if (start + loops > klen) {
+                loops = UNI_MAX(klen - start, 0);
+            }
+            memset(&mask[i * klen + start], 0, sizeof(F16) * loops);
+        }
+    }
+    I32 loops = tensorNumElements(inputDesc) / length;
+    float16x8_t one_v = vdupq_n_f16(1);
+    float16x8_t mask_value_v = vdupq_n_f16(maskValue);
+    for (int i = 0, index = 0; i < loops; i++) {
+        int j = 0;
+        for (; j < length - 7; j += 8) {
+            float16x8_t in = vld1q_f16(input + index);
+            float16x8_t mask_v = vld1q_f16(&mask[j]);
+            float16x8_t tmp_v = vsubq_f16(one_v, mask_v);
+            tmp_v = vmulq_f16(in, tmp_v);
+            tmp_v = vfmsq_f16(tmp_v, mask_value_v, mask_v);
+            vst1q_f16(output + index, tmp_v);
+            index += 8;
+        }
+        for (; j < length; j++) {
+            output[index] = input[index] * (1 - mask[j]) - maskValue * mask[j];
+            index++;
+        }
+    }
+    return SUCCESS;
+}
diff --git a/compute/tensor/src/cpu/arm/fp16/check.cpp b/compute/tensor/src/cpu/arm/fp16/check.cpp
new file mode 100644
index 00000000..139677cd
--- /dev/null
+++ b/compute/tensor/src/cpu/arm/fp16/check.cpp
@@ -0,0 +1,99 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "cpu/arm/fp16/tensor_computing_fp16.h"
+
+EE check_fp16(TensorDesc inputDescA,
+    const F16 *inputA,
+    TensorDesc inputDescB,
+    const F16 *inputB,
+    CheckMode checkMode,
+    TensorDesc outputDesc,
+    I32 *output)
+{
+    if (nullptr == inputA || nullptr == inputB || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+
+    if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+
+    U32 size = tensorNumElements(inputDescA);
+    U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1];
+    I32 length = size / loopOuter;
+    if (tensorNumElements(outputDesc) != loopOuter) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+    for (U32 j = 0; j < loopOuter; j++) {
+        const F16 *arrayA = inputA + j * length;
+        const F16 *arrayB = inputB + j * length;
+        switch (checkMode) {
+            case CHECK_GREAT: {
+                uint16x8_t count_v = vdupq_n_u16(0);
+                I32 i = 0;
+                for (; i < length - 7; i += 8) {
+                    float16x8_t a = vld1q_f16(arrayA + i);
+                    float16x8_t b = vld1q_f16(arrayB + i);
+                    // vcgtq_f16 sets passing lanes to all-ones (0xFFFF), so
+                    // subtracting the mask accumulates +1 per passing element
+                    count_v = vsubq_u16(count_v, vcgtq_f16(a, b));
+                }
+                I32 count = vaddvq_u16(count_v);
+                for (; i < length; i++) {
+                    if (arrayA[i] > arrayB[i]) {
+                        count++;
+                    }
+                }
+                output[j] = (count == length);
+                break;
+            }
+            case CHECK_GREATEQUAL: {
+                uint16x8_t count_v = vdupq_n_u16(0);
+                I32 i = 0;
+                for (; i < length - 7; i += 8) {
+                    float16x8_t a = vld1q_f16(arrayA + i);
+                    float16x8_t b = vld1q_f16(arrayB + i);
+                    count_v = vsubq_u16(count_v, vcgeq_f16(a, b));
+                }
+                I32 count = vaddvq_u16(count_v);
+                for (; i < length; i++) {
+                    if (arrayA[i] >= arrayB[i]) {
+                        count++;
+                    }
+                }
+                output[j] = (count == length);
+                break;
+            }
+            case CHECK_EQUAL: {
+                uint16x8_t count_v = vdupq_n_u16(0);
+                I32 i = 0;
+                for (; i < length - 7; i += 8) {
+                    float16x8_t a = vld1q_f16(arrayA + i);
+                    float16x8_t b = vld1q_f16(arrayB + i);
+                    count_v = vsubq_u16(count_v, vceqq_f16(a, b));
+                }
+                I32 count = vaddvq_u16(count_v);
+                for (; i < length; i++) {
+                    if (arrayA[i] == arrayB[i]) {
+                        count++;
+                    }
+                }
+                output[j] = (count == length);
+                break;
+            }
+            default:
+                CHECK_STATUS(NOT_SUPPORTED);
+                break;
+        }
+    }
+    return SUCCESS;
+}
diff --git a/compute/tensor/src/cpu/arm/fp16/clip.cpp b/compute/tensor/src/cpu/arm/fp16/clip.cpp
new file mode 100644
index 00000000..3f19ae9e
--- /dev/null
+++ b/compute/tensor/src/cpu/arm/fp16/clip.cpp
@@ -0,0 +1,38 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/tensor_computing_fp16.h" + +EE clip_fp16(F16 *input, F16 *output, I32 len, F32 minValue, F32 maxValue) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + float16x8_t min_v = vdupq_n_f16(minValue); + float16x8_t max_v = vdupq_n_f16(maxValue); + + I32 i = 0; + for (i = 0; i < len - 7; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t tmp_v = vminq_f16(max_v, vmaxq_f16(min_v, in)); + vst1q_f16(output + i, tmp_v); + } + for (; i < len; i++) { + F16 value = input[i]; + value = (value > minValue) ? value : minValue; + value = (value < maxValue) ? value : maxValue; + output[i] = value; + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp16/convolution.cpp b/compute/tensor/src/cpu/arm/fp16/convolution.cpp new file mode 100644 index 00000000..8349c7ca --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution.cpp @@ -0,0 +1,87 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
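+
+// FP16 convolution entry point: validates the tensor descriptors, then dispatches
+// to the kernel chosen during algorithm search. In this backend the usual sequence
+// is to infer the forward algorithm, transform the filter for that algorithm, and
+// finally call this dispatcher with the transformed filter and a scratch buffer.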
+ +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#include "cpu/arm/fp16/convolution_winograd.h" +#include "cpu/arm/fp16/convolution_gemm.h" +#include "cpu/arm/fp16/convolution_gemm_icnchw.h" +#include "cpu/arm/fp16/convolution_direct.h" + +EE convolution_fp16(TensorDesc inputDesc, + F16 *input, + TensorDesc filterDesc, + const F16 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const F16 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + if (nullptr == input || nullptr == filter || nullptr == output || nullptr == bias || + nullptr == tmp) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (!(idt == DT_F16 && fdt == DT_F16 && odt == DT_F16)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(odf == DF_NCHWC8)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(ic == fc && oc == fn)) { + CHECK_STATUS(NOT_MATCH); + } + + // In some cases when we adjust the model input, the input tensor of conv can change from NCHW to NCHWc8 + // In this case we can simply change the algo, because they both require the same filter transform + if (CONVOLUTION_ALGORITHM_GEMM_ICNCHW == algorithm && DF_NCHWC8 == idf) { + algorithm = CONVOLUTION_ALGORITHM_GEMM; + } + + EE ret = SUCCESS; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = convolution_direct(inputDesc, input, filterDesc, filter, convParamSpec, biasDesc, + bias, tmpBytes, tmp, outputDesc, output, activationDesc, arch); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = convolution_gemm(inputDesc, input, filterDesc, filter, convParamSpec, biasDesc, + bias, tmpBytes, tmp, outputDesc, output, activationDesc, arch); + break; + case CONVOLUTION_ALGORITHM_WINOGRAD: + ret = convolution_winograd(inputDesc, input, filterDesc, filter, convParamSpec, + biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc, arch); + break; + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: + ret = convolution_gemm_icnchw(inputDesc, input, filterDesc, filter, convParamSpec, + biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc, arch); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_direct.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_direct.cpp new file mode 100644 index 00000000..3782db73 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution_direct.cpp @@ -0,0 +1,500 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <string.h>
+
+#include "cpu/arm/fp16/convolution_direct.h"
+
+EE convolution_direct(TensorDesc inputDesc,
+    F16 *inArray,
+    TensorDesc filterDesc,
+    const F16 *filterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const F16 *biasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F16 *outArray,
+    ActivationParamSpec activationDesc,
+    Arch arch)
+{
+    UNUSED(biasDesc);
+    UNUSED(tmpBytes);
+    UNUSED(arch);
+
+    DataType idt, fdt, odt;
+    DataFormat idf, fdf, odf;
+    U32 in, ic, ih, iw;
+    U32 fn, fc, fh, fw;
+    U32 on, oc, oh, ow;
+    CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+    CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw));
+    CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
+    U32 strideH = convParamSpec.stride_h;
+    U32 strideW = convParamSpec.stride_w;
+    U32 paddingT = convParamSpec.padding_top;
+    U32 paddingB = convParamSpec.padding_bottom;
+    U32 paddingL = convParamSpec.padding_left;
+    U32 paddingR = convParamSpec.padding_right;
+
+    if (fdf != DF_NCHWN16) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+
+    oc /= 8;
+    ic /= 8;
+    U32 ih_pad = ih + paddingT + paddingB;
+    U32 iw_pad = iw + paddingL + paddingR;
+
+    // naive, no blocking, in: NCHWc8, out: NOHWo8, filter: OCHWo16, no bias
+
+    EE ret = SUCCESS;
+    for (U32 n = 0; n < in; n++) {
+        // copy input into a input with padding
+        F16 *inArray_pad = (F16 *)tmp;
+        F16 *inArray_pad_mov = inArray_pad;
+        F16 *inArray_mov = inArray + n * ic * ih * iw * 8;
+        for (U32 c = 0; c < ic; c++) {
+            for (U32 h = 0; h < paddingT; h++) {
+                memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt));
+                inArray_pad_mov += iw_pad * 8;
+            }
+            for (U32 h = paddingT; h < ih_pad - paddingB; h++) {
+                memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt));
+                inArray_pad_mov += paddingL * 8;
+                memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt));
+                inArray_pad_mov += iw * 8;
+                inArray_mov += iw * 8;
+                memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt));
+                inArray_pad_mov += paddingR * 8;
+            }
+            for (U32 h = ih_pad - paddingB; h < ih_pad; h++) {
+                memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt));
+                inArray_pad_mov += iw_pad * 8;
+            }
+        }
+
+        // compute
+        const F16 *f0 = filterArray;
+        const F16 *f1 = f0 + fh * fw * 16;
+        const F16 *f2 = f0 + fh * fw * 16 * 2;
+        const F16 *f3 = f0 + fh * fw * 16 * 3;
+        const F16 *f4 = f0 + fh * fw * 16 * 4;
+        const F16 *f5 = f0 + fh * fw * 16 * 5;
+        const F16 *f6 = f0 + fh * fw * 16 * 6;
+        const F16 *f7 = f0 + fh * fw * 16 * 7;
+
+        F16 *outo0h0 = outArray + n * oc * oh * ow * 8;
+        F16 *outo1h0 = outo0h0 + oh * ow * 8;
+        F16 *outo0h1 = outo0h0 + ow * 8;
+        F16 *outo1h1 = outo1h0 + ow * 8;
+        for (U32 o = 0; o < oc; o += 2) {
+            for (U32 c = 0; c < ic; c++) {
+                F16 *out_o0h0 = outo0h0;
+                F16 *out_o1h0 = outo1h0;
+                F16 *out_o0h1 = outo0h1;
+                F16 *out_o1h1 = outo1h1;
+
+                // the asm kernel below accumulates a 2 (height) x 4 (width) output
+                // tile for two output-channel groups per iteration; the in_h0w*
+                // and in_h1w* pointers walk the matching input positions
+                F16 *in_h0w0 = inArray_pad + n * ic * ih_pad * iw_pad * 8 + c * ih_pad * iw_pad * 8;
+                F16 *in_h0w1 = in_h0w0 + strideW * 8;
+                F16 *in_h0w2 = in_h0w0 + strideW * 8 * 2;
+                F16 *in_h0w3 = in_h0w0 + strideW * 8 * 3;
+
F16 *in_h1w0 = in_h0w0 + strideH * iw_pad * 8; + F16 *in_h1w1 = in_h1w0 + strideW * 8; + F16 *in_h1w2 = in_h1w0 + strideW * 8 * 2; + F16 *in_h1w3 = in_h1w0 + strideW * 8 * 3; + + for (U32 h = 0; h < oh; h += 2) { + for (U32 w = 0; w < ow; w += 4) { + const F16 *f_c0 = f0; + const F16 *f_c1 = f1; + const F16 *f_c2 = f2; + const F16 *f_c3 = f3; + const F16 *f_c4 = f4; + const F16 *f_c5 = f5; + const F16 *f_c6 = f6; + const F16 *f_c7 = f7; + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + __asm__ __volatile__( + "ldr d16, [%[f_c0]]\n" + "ldr x4, [%[f_c0], #8]\n" + "ins v16.d[1], x4\n" + "ldr d0, [%[in_h0w0]]\n" + "ldr x0, [%[in_h0w0], #8]\n" + "ins v0.d[1], x0\n" + "ldr d1, [%[in_h0w1]]\n" + "ldr x1, [%[in_h0w1], #8]\n" + "ins v1.d[1], x1\n" + "ldr d2, [%[in_h0w2]]\n" + "ldr x2, [%[in_h0w2], #8]\n" + "ins v2.d[1], x2\n" + "ldr d3, [%[in_h0w3]]\n" + "ldr x3, [%[in_h0w3], #8]\n" + "ins v3.d[1], x3\n" + "ldr d4, [%[in_h1w0]]\n" + "ldr x0, [%[in_h1w0], #8]\n" + "ins v4.d[1], x0\n" + "ldr d5, [%[in_h1w1]]\n" + "ldr x1, [%[in_h1w1], #8]\n" + "ins v5.d[1], x1\n" + "ldr d6, [%[in_h1w2]]\n" + "ldr x2, [%[in_h1w2], #8]\n" + "ins v6.d[1], x2\n" + "ldr d7, [%[in_h1w3]]\n" + "ldr x3, [%[in_h1w3], #8]\n" + "ins v7.d[1], x3\n" + "ldr d8, [%[out_o0h0]]\n" + "ldr x0, [%[out_o0h0], #8]\n" + "ins v8.d[1], x0\n" + "ldr d9, [%[out_o0h0], #16]\n" + "ldr x1, [%[out_o0h0], #24]\n" + "ins v9.d[1], x1\n" + "ldr d10, [%[out_o0h0], #32]\n" + "ldr x2, [%[out_o0h0], #40]\n" + "ins v10.d[1], x2\n" + "ldr d11, [%[out_o0h0], #48]\n" + "ldr x3, [%[out_o0h0], #56]\n" + "ins v11.d[1], x3\n" + "ldr d12, [%[out_o0h1]]\n" + "ldr x0, [%[out_o0h1], #8]\n" + "ins v12.d[1], x0\n" + "ldr d13, [%[out_o0h1], #16]\n" + "ldr x1, [%[out_o0h1], #24]\n" + "ins v13.d[1], x1\n" + "ldr d14, [%[out_o0h1], #32]\n" + "ldr x2, [%[out_o0h1], #40]\n" + "ins v14.d[1], x2\n" + "ldr d15, [%[out_o0h1], #48]\n" + "ldr x3, [%[out_o0h1], #56]\n" + "ins v15.d[1], x3\n" + + "fmla v8.8h, v16.8h, v0.h[0]\n" + "ldr d18, [%[out_o1h0]]\n" + "fmla v9.8h, v16.8h, v1.h[0]\n" + "ldr x0, [%[out_o1h0], #8]\n" + "fmla v10.8h, v16.8h, v2.h[0]\n" + "ldr d17, [%[f_c1]]\n" + "fmla v11.8h, v16.8h, v3.h[0]\n" + "ldr x5, [%[f_c1], #8]\n" + "fmla v12.8h, v16.8h, v4.h[0]\n" + "ins v17.d[1], x5\n" + "fmla v13.8h, v16.8h, v5.h[0]\n" + "ins v18.d[1], x0\n" + "fmla v14.8h, v16.8h, v6.h[0]\n" + "ldr d19, [%[out_o1h0], #16]\n" + "fmla v15.8h, v16.8h, v7.h[0]\n" + "ldr x1, [%[out_o1h0], #24]\n" + "fmla v8.8h, v17.8h, v0.h[1]\n" + "ins v19.d[1], x1\n" + "fmla v9.8h, v17.8h, v1.h[1]\n" + "fmla v10.8h, v17.8h, v2.h[1]\n" + "ldr d16, [%[f_c2]]\n" + "fmla v11.8h, v17.8h, v3.h[1]\n" + "ldr x5, [%[f_c2], #8]\n" + "fmla v12.8h, v17.8h, v4.h[1]\n" + "ins v16.d[1], x5\n" + "fmla v13.8h, v17.8h, v5.h[1]\n" + "ldr d20, [%[out_o1h0], #32]\n" + "fmla v14.8h, v17.8h, v6.h[1]\n" + "ldr x2, [%[out_o1h0], #40]\n" + "fmla v15.8h, v17.8h, v7.h[1]\n" + "ins v20.d[1], x2\n" + "fmla v8.8h, v16.8h, v0.h[2]\n" + "ldr d21, [%[out_o1h0], #48]\n" + "fmla v9.8h, v16.8h, v1.h[2]\n" + "fmla v10.8h, v16.8h, v2.h[2]\n" + "ldr d17, [%[f_c3]]\n" + "fmla v11.8h, v16.8h, v3.h[2]\n" + "ldr x5, [%[f_c3], #8]\n" + "fmla v12.8h, v16.8h, v4.h[2]\n" + "ins v17.d[1], x5\n" + "fmla v13.8h, v16.8h, v5.h[2]\n" + "ldr x3, [%[out_o1h0], #56]\n" + "fmla v14.8h, v16.8h, v6.h[2]\n" + "ins v21.d[1], x3\n" + "fmla v15.8h, v16.8h, v7.h[2]\n" + "ldr d22, [%[out_o1h1]]\n" + "fmla v8.8h, v17.8h, v0.h[3]\n" + "ldr x0, [%[out_o1h1], #8]\n" + "fmla v9.8h, v17.8h, v1.h[3]\n" + "ins v22.d[1], x0\n" + 
"fmla v10.8h, v17.8h, v2.h[3]\n" + "ldr d16, [%[f_c4]]\n" + "fmla v11.8h, v17.8h, v3.h[3]\n" + "ldr x5, [%[f_c4], #8]\n" + "fmla v12.8h, v17.8h, v4.h[3]\n" + "ins v16.d[1], x5\n" + "fmla v13.8h, v17.8h, v5.h[3]\n" + "ldr d23, [%[out_o1h1], #16]\n" + "fmla v14.8h, v17.8h, v6.h[3]\n" + "ldr x1, [%[out_o1h1], #24]\n" + "fmla v15.8h, v17.8h, v7.h[3]\n" + "ins v23.d[1], x1\n" + "fmla v8.8h, v16.8h, v0.h[4]\n" + "fmla v9.8h, v16.8h, v1.h[4]\n" + "fmla v10.8h, v16.8h, v2.h[4]\n" + "ldr d17, [%[f_c5]]\n" + "fmla v11.8h, v16.8h, v3.h[4]\n" + "ldr x5, [%[f_c5], #8]\n" + "fmla v12.8h, v16.8h, v4.h[4]\n" + "ins v17.d[1], x5\n" + "fmla v13.8h, v16.8h, v5.h[4]\n" + "ldr d24, [%[out_o1h1], #32]\n" + "fmla v14.8h, v16.8h, v6.h[4]\n" + "ldr x2, [%[out_o1h1], #40]\n" + "fmla v15.8h, v16.8h, v7.h[4]\n" + "ins v24.d[1], x2\n" + "fmla v8.8h, v17.8h, v0.h[5]\n" + "fmla v9.8h, v17.8h, v1.h[5]\n" + "fmla v10.8h, v17.8h, v2.h[5]\n" + "ldr d16, [%[f_c6]]\n" + "fmla v11.8h, v17.8h, v3.h[5]\n" + "ldr x5, [%[f_c6], #8]\n" + "fmla v12.8h, v17.8h, v4.h[5]\n" + "ins v16.d[1], x5\n" + "fmla v13.8h, v17.8h, v5.h[5]\n" + "ldr d25, [%[out_o1h1], #48]\n" + "fmla v14.8h, v17.8h, v6.h[5]\n" + "ldr x3, [%[out_o1h1], #56]\n" + "fmla v15.8h, v17.8h, v7.h[5]\n" + "ins v25.d[1], x3\n" + "fmla v8.8h, v16.8h, v0.h[6]\n" + "fmla v9.8h, v16.8h, v1.h[6]\n" + "fmla v10.8h, v16.8h, v2.h[6]\n" + "ldr d17, [%[f_c7]]\n" + "fmla v11.8h, v16.8h, v3.h[6]\n" + "ldr x5, [%[f_c7], #8]\n" + "fmla v12.8h, v16.8h, v4.h[6]\n" + "ins v17.d[1], x5\n" + "fmla v13.8h, v16.8h, v5.h[6]\n" + "fmla v14.8h, v16.8h, v6.h[6]\n" + "fmla v15.8h, v16.8h, v7.h[6]\n" + "fmla v8.8h, v17.8h, v0.h[7]\n" + "fmla v9.8h, v17.8h, v1.h[7]\n" + "fmla v10.8h, v17.8h, v2.h[7]\n" + "ldr d16, [%[f_c0], #16]\n" + "fmla v11.8h, v17.8h, v3.h[7]\n" + "ldr x4, [%[f_c0], #24]\n" + "fmla v12.8h, v17.8h, v4.h[7]\n" + "ins v16.d[1], x4\n" + "fmla v13.8h, v17.8h, v5.h[7]\n" + "fmla v14.8h, v17.8h, v6.h[7]\n" + "fmla v15.8h, v17.8h, v7.h[7]\n" + + "fmla v18.8h, v16.8h, v0.h[0]\n" + "fmla v19.8h, v16.8h, v1.h[0]\n" + "fmla v20.8h, v16.8h, v2.h[0]\n" + "ldr d17, [%[f_c1], #16]\n" + "fmla v21.8h, v16.8h, v3.h[0]\n" + "ldr x5, [%[f_c1], #24]\n" + "fmla v22.8h, v16.8h, v4.h[0]\n" + "ins v17.d[1], x5\n" + "fmla v23.8h, v16.8h, v5.h[0]\n" + "fmla v24.8h, v16.8h, v6.h[0]\n" + "fmla v25.8h, v16.8h, v7.h[0]\n" + "fmla v18.8h, v17.8h, v0.h[1]\n" + "fmla v19.8h, v17.8h, v1.h[1]\n" + "fmla v20.8h, v17.8h, v2.h[1]\n" + "ldr d16, [%[f_c2], #16]\n" + "fmla v21.8h, v17.8h, v3.h[1]\n" + "ldr x4, [%[f_c2], #24]\n" + "fmla v22.8h, v17.8h, v4.h[1]\n" + "ins v16.d[1], x4\n" + "fmla v23.8h, v17.8h, v5.h[1]\n" + "fmla v24.8h, v17.8h, v6.h[1]\n" + "fmla v25.8h, v17.8h, v7.h[1]\n" + "fmla v18.8h, v16.8h, v0.h[2]\n" + "fmla v19.8h, v16.8h, v1.h[2]\n" + "fmla v20.8h, v16.8h, v2.h[2]\n" + "ldr d17, [%[f_c3], #16]\n" + "fmla v21.8h, v16.8h, v3.h[2]\n" + "ldr x5, [%[f_c3], #24]\n" + "fmla v22.8h, v16.8h, v4.h[2]\n" + "ins v17.d[1], x5\n" + "fmla v23.8h, v16.8h, v5.h[2]\n" + "fmla v24.8h, v16.8h, v6.h[2]\n" + "fmla v25.8h, v16.8h, v7.h[2]\n" + "fmla v18.8h, v17.8h, v0.h[3]\n" + "fmla v19.8h, v17.8h, v1.h[3]\n" + "fmla v20.8h, v17.8h, v2.h[3]\n" + "ldr d16, [%[f_c4], #16]\n" + "fmla v21.8h, v17.8h, v3.h[3]\n" + "ldr x4, [%[f_c4], #24]\n" + "fmla v22.8h, v17.8h, v4.h[3]\n" + "ins v16.d[1], x4\n" + "fmla v23.8h, v17.8h, v5.h[3]\n" + "fmla v24.8h, v17.8h, v6.h[3]\n" + "fmla v25.8h, v17.8h, v7.h[3]\n" + "fmla v18.8h, v16.8h, v0.h[4]\n" + "fmla v19.8h, v16.8h, v1.h[4]\n" + "fmla v20.8h, v16.8h, v2.h[4]\n" + "ldr d17, [%[f_c5], 
#16]\n" + "fmla v21.8h, v16.8h, v3.h[4]\n" + "ldr x5, [%[f_c5], #24]\n" + "fmla v22.8h, v16.8h, v4.h[4]\n" + "ins v17.d[1], x5\n" + "fmla v23.8h, v16.8h, v5.h[4]\n" + "fmla v24.8h, v16.8h, v6.h[4]\n" + "fmla v25.8h, v16.8h, v7.h[4]\n" + "fmla v18.8h, v17.8h, v0.h[5]\n" + "fmla v19.8h, v17.8h, v1.h[5]\n" + "fmla v20.8h, v17.8h, v2.h[5]\n" + "ldr d16, [%[f_c6], #16]\n" + "fmla v21.8h, v17.8h, v3.h[5]\n" + "ldr x4, [%[f_c6], #24]\n" + "fmla v22.8h, v17.8h, v4.h[5]\n" + "ins v16.d[1], x4\n" + "fmla v23.8h, v17.8h, v5.h[5]\n" + "fmla v24.8h, v17.8h, v6.h[5]\n" + "fmla v25.8h, v17.8h, v7.h[5]\n" + "fmla v18.8h, v16.8h, v0.h[6]\n" + "fmla v19.8h, v16.8h, v1.h[6]\n" + "fmla v20.8h, v16.8h, v2.h[6]\n" + "ldr d17, [%[f_c7], #16]\n" + "fmla v21.8h, v16.8h, v3.h[6]\n" + "ldr x5, [%[f_c7], #24]\n" + "fmla v22.8h, v16.8h, v4.h[6]\n" + "ins v17.d[1], x5\n" + "fmla v23.8h, v16.8h, v5.h[6]\n" + "fmla v24.8h, v16.8h, v6.h[6]\n" + "fmla v25.8h, v16.8h, v7.h[6]\n" + "fmla v18.8h, v17.8h, v0.h[7]\n" + "fmla v19.8h, v17.8h, v1.h[7]\n" + "fmla v20.8h, v17.8h, v2.h[7]\n" + "fmla v21.8h, v17.8h, v3.h[7]\n" + "fmla v22.8h, v17.8h, v4.h[7]\n" + "fmla v23.8h, v17.8h, v5.h[7]\n" + "fmla v24.8h, v17.8h, v6.h[7]\n" + "fmla v25.8h, v17.8h, v7.h[7]\n" + "str q8, [%[out_o0h0]]\n" + "str q9, [%[out_o0h0], #16]\n" + "str q10, [%[out_o0h0], #32]\n" + "str q11, [%[out_o0h0], #48]\n" + "str q12, [%[out_o0h1]]\n" + "str q13, [%[out_o0h1], #16]\n" + "str q14, [%[out_o0h1], #32]\n" + "str q15, [%[out_o0h1], #48]\n" + "str q18, [%[out_o1h0]]\n" + "str q19, [%[out_o1h0], #16]\n" + "str q20, [%[out_o1h0], #32]\n" + "str q21, [%[out_o1h0], #48]\n" + "str q22, [%[out_o1h1]]\n" + "str q23, [%[out_o1h1], #16]\n" + "str q24, [%[out_o1h1], #32]\n" + "str q25, [%[out_o1h1], #48]\n" + + : [out_o0h0] "+r"(out_o0h0), [out_o0h1] "+r"(out_o0h1), + [out_o1h0] "+r"(out_o1h0), [out_o1h1] "+r"(out_o1h1) + : [in_h0w0] "r"(in_h0w0), [in_h0w1] "r"(in_h0w1), + [in_h0w2] "r"(in_h0w2), [in_h0w3] "r"(in_h0w3), + [in_h1w0] "r"(in_h1w0), [in_h1w1] "r"(in_h1w1), + [in_h1w2] "r"(in_h1w2), [in_h1w3] "r"(in_h1w3), + [f_c0] "r"(f_c0), [f_c1] "r"(f_c1), [f_c2] "r"(f_c2), + [f_c3] "r"(f_c3), [f_c4] "r"(f_c4), [f_c5] "r"(f_c5), + [f_c6] "r"(f_c6), [f_c7] "r"(f_c7) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "x0", "x1", "x2", "x3", "x4", "x5"); + f_c0 += 16; + f_c1 += 16; + f_c2 += 16; + f_c3 += 16; + f_c4 += 16; + f_c5 += 16; + f_c6 += 16; + f_c7 += 16; + in_h0w0 += 8; + in_h0w1 += 8; + in_h0w2 += 8; + in_h0w3 += 8; + in_h1w0 += 8; + in_h1w1 += 8; + in_h1w2 += 8; + in_h1w3 += 8; + } + in_h0w0 += iw_pad * 8 - fw * 8; + in_h0w1 += iw_pad * 8 - fw * 8; + in_h0w2 += iw_pad * 8 - fw * 8; + in_h0w3 += iw_pad * 8 - fw * 8; + in_h1w0 += iw_pad * 8 - fw * 8; + in_h1w1 += iw_pad * 8 - fw * 8; + in_h1w2 += iw_pad * 8 - fw * 8; + in_h1w3 += iw_pad * 8 - fw * 8; + } + in_h0w0 = in_h0w0 + 4 * strideW * 8 - fh * iw_pad * 8; + in_h0w1 = in_h0w1 + 4 * strideW * 8 - fh * iw_pad * 8; + in_h0w2 = in_h0w2 + 4 * strideW * 8 - fh * iw_pad * 8; + in_h0w3 = in_h0w3 + 4 * strideW * 8 - fh * iw_pad * 8; + in_h1w0 = in_h1w0 + 4 * strideW * 8 - fh * iw_pad * 8; + in_h1w1 = in_h1w1 + 4 * strideW * 8 - fh * iw_pad * 8; + in_h1w2 = in_h1w2 + 4 * strideW * 8 - fh * iw_pad * 8; + in_h1w3 = in_h1w3 + 4 * strideW * 8 - fh * iw_pad * 8; + out_o0h0 += 32; + out_o1h0 += 32; + out_o0h1 += 32; + out_o1h1 += 32; + } + in_h0w0 += 2 * strideH * iw_pad * 8 - 
ow * strideW * 8; + in_h0w1 += 2 * strideH * iw_pad * 8 - ow * strideW * 8; + in_h0w2 += 2 * strideH * iw_pad * 8 - ow * strideW * 8; + in_h0w3 += 2 * strideH * iw_pad * 8 - ow * strideW * 8; + in_h1w0 += 2 * strideH * iw_pad * 8 - ow * strideW * 8; + in_h1w1 += 2 * strideH * iw_pad * 8 - ow * strideW * 8; + in_h1w2 += 2 * strideH * iw_pad * 8 - ow * strideW * 8; + in_h1w3 += 2 * strideH * iw_pad * 8 - ow * strideW * 8; + out_o0h0 += ow * 8; + out_o1h0 += ow * 8; + out_o0h1 += ow * 8; + out_o1h1 += ow * 8; + } + f0 += 8 * fh * fw * 16; + f1 += 8 * fh * fw * 16; + f2 += 8 * fh * fw * 16; + f3 += 8 * fh * fw * 16; + f4 += 8 * fh * fw * 16; + f5 += 8 * fh * fw * 16; + f6 += 8 * fh * fw * 16; + f7 += 8 * fh * fw * 16; + } + outo0h0 += 2 * oh * ow * 8; + outo1h0 += 2 * oh * ow * 8; + outo0h1 += 2 * oh * ow * 8; + outo1h1 += 2 * oh * ow * 8; + } + + // bias + F16 *out = outArray; + float16x8_t v_0 = vmovq_n_f16(0); + for (U32 o = 0; o < oc; o++) { + float16x8_t v_b = vld1q_f16(biasArray + o * 8); + for (U32 hw = 0; hw < oh * ow; hw++) { + float16x8_t v = vld1q_f16(out); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + vst1q_f16(out, vaddq_f16(v, v_b)); + break; + case ACTIVATION_RELU: + vst1q_f16(out, vmaxq_f16(vaddq_f16(v, v_b), v_0)); + break; + default: + return NOT_SUPPORTED; + } + out += 8; + } + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_direct.h b/compute/tensor/src/cpu/arm/fp16/convolution_direct.h new file mode 100644 index 00000000..ffec13ac --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution_direct.h @@ -0,0 +1,33 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_CONVOLUTION_DIRECT +#define _H_CONVOLUTION_DIRECT +#include "sys.h" +#include "types.h" +#include "error.h" + +EE convolution_direct(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc, + Arch arch); +#endif diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm.h b/compute/tensor/src/cpu/arm/fp16/convolution_gemm.h new file mode 100644 index 00000000..ca11c77c --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm.h @@ -0,0 +1,76 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_CONVOLUTION_GEMM +#define _H_CONVOLUTION_GEMM + +#include "sys.h" +#include "types.h" +#include "error.h" + +EE convolution_gemm_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc); + +EE convolution_gemm_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc); + +inline EE convolution_gemm(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc, + Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: + ret = convolution_gemm_A55(inputDesc, inArray, filterDesc, filterArray, convParamSpec, + biasDesc, biasArray, tmpBytes, tmp, outputDesc, outArray, activationDesc); + break; + case ARM_A76: + ret = convolution_gemm_A76(inputDesc, inArray, filterDesc, filterArray, convParamSpec, + biasDesc, biasArray, tmpBytes, tmp, outputDesc, outArray, activationDesc); + break; + default: + return NOT_SUPPORTED; + } + return ret; +} +#endif diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A55.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A55.cpp new file mode 100644 index 00000000..ed6cee0d --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A55.cpp @@ -0,0 +1,975 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <string.h>
+#include "cpu/arm/fp16/convolution_gemm.h"
+
+EE convolution_gemm_A55(TensorDesc inputDesc,
+    F16 *inArray,
+    TensorDesc filterDesc,
+    const F16 *filterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const F16 *biasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F16 *outArray,
+    ActivationParamSpec activationDesc)
+{
+    UNUSED(biasDesc);
+    UNUSED(tmpBytes);
+
+    DataType idt, fdt, odt;
+    DataFormat idf, fdf, odf;
+    U32 in, ic, ih, iw;
+    U32 fn, fc, fh, fw;
+    U32 on, oc, oh, ow;
+    CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+    CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw));
+    CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
+    U32 strideH = convParamSpec.stride_h;
+    U32 strideW = convParamSpec.stride_w;
+    U32 paddingT = convParamSpec.padding_top;
+    U32 paddingB = convParamSpec.padding_bottom;
+    U32 paddingL = convParamSpec.padding_left;
+    U32 paddingR = convParamSpec.padding_right;
+    U32 dilateH = convParamSpec.dilatedRate_h;
+    U32 dilateW = convParamSpec.dilatedRate_w;
+
+    if (fdf != DF_NHWCN16) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+
+    oc /= 8;
+    ic /= 8;
+    U32 ih_pad = ih + paddingT + paddingB;
+    U32 iw_pad = iw + paddingL + paddingR;
+    I32 ohow = oh * ow;
+    U32 ihiw = ih_pad * iw_pad;
+    F16 *inArray_pad;
+    EE ret = SUCCESS;
+    for (U32 n = 0; n < in; n++) {
+        if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) {
+            inArray_pad = inArray + n * ic * ih * iw * 8;
+        } else {
+            // copy input into a input with padding
+            inArray_pad = (F16 *)tmp;
+            F16 *inArray_pad_mov = inArray_pad;
+            F16 *inArray_mov = inArray + n * ic * ih * iw * 8;
+            for (U32 c = 0; c < ic; c++) {
+                for (U32 h = 0; h < paddingT; h++) {
+                    memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt));
+                    inArray_pad_mov += iw_pad * 8;
+                }
+                for (U32 h = paddingT; h < ih_pad - paddingB; h++) {
+                    memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt));
+                    inArray_pad_mov += paddingL * 8;
+                    memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt));
+                    inArray_pad_mov += iw * 8;
+                    inArray_mov += iw * 8;
+                    memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt));
+                    inArray_pad_mov += paddingR * 8;
+                }
+                for (U32 h = ih_pad - paddingB; h < ih_pad; h++) {
+                    memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt));
+                    inArray_pad_mov += iw_pad * 8;
+                }
+            }
+        }
+        // ohow / 8
+        for (I32 hw = 0; hw < ohow - 7; hw += 8) {
+            const F16 *b0 = biasArray;
+            const F16 *b1 = biasArray + 8;
+            const F16 *f_o0c0 = filterArray;
+            F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad * 8;
+            // pack input
+            // NCHWc8 => NHWChw8 + im2col
+            U32 in_h_0 = (hw / ow) * strideH;
+            U32 in_w_0 = (hw % ow) * strideW;
+            U32 in_h_1 = ((hw + 1) / ow) * strideH;
+            U32 in_w_1 = ((hw + 1) % ow) * strideW;
+            U32 in_h_2 = ((hw + 2) / ow) * strideH;
+            U32 in_w_2 = ((hw + 2) % ow) * strideW;
+            U32 in_h_3 = ((hw + 3) / ow) * strideH;
+            U32 in_w_3 = ((hw + 3) % ow) * strideW;
+            U32 in_h_4 = ((hw + 4) / ow) * strideH;
+            U32 in_w_4 = ((hw + 4) % ow) * strideW;
+            U32 in_h_5 = ((hw + 5) / ow) *
strideH; + U32 in_w_5 = ((hw + 5) % ow) * strideW; + U32 in_h_6 = ((hw + 6) / ow) * strideH; + U32 in_w_6 = ((hw + 6) % ow) * strideW; + U32 in_h_7 = ((hw + 7) / ow) * strideH; + U32 in_w_7 = ((hw + 7) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw8c8 = inArray_pad + c * ihiw * 8 + fh_idx * dilateH * iw_pad * 8 + + fw_idx * dilateW * 8; + F16 *in_0 = in_hw8c8 + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_1 = in_hw8c8 + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F16 *in_2 = in_hw8c8 + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F16 *in_3 = in_hw8c8 + in_h_3 * iw_pad * 8 + in_w_3 * 8; + F16 *in_4 = in_hw8c8 + in_h_4 * iw_pad * 8 + in_w_4 * 8; + F16 *in_5 = in_hw8c8 + in_h_5 * iw_pad * 8 + in_w_5 * 8; + F16 *in_6 = in_hw8c8 + in_h_6 * iw_pad * 8 + in_w_6 * 8; + F16 *in_7 = in_hw8c8 + in_h_7 * iw_pad * 8 + in_w_7 * 8; + + // NHWChw8 + F16 *in_pack_c8hw8 = + in_pack + fh_idx * fw * ic * 8 * 8 + fw_idx * ic * 8 * 8 + c * 8 * 8; + /* + * for (U32 c8 = 0; c8 < 8; c8++) { + * for (U32 hw8 = 0; hw8 < 8; hw8++) { + * in_pack_c8hw8[c8*8 + hw8] = in_hw8c8[hw8*8 + c8]; + * } + * } + */ + float16x8_t v0 = vld1q_f16(in_0); + float16x8_t v1 = vld1q_f16(in_1); + float16x8_t v2 = vld1q_f16(in_2); + float16x8_t v3 = vld1q_f16(in_3); + float16x8_t v4 = vld1q_f16(in_4); + float16x8_t v5 = vld1q_f16(in_5); + float16x8_t v6 = vld1q_f16(in_6); + float16x8_t v7 = vld1q_f16(in_7); + vst1q_f16(in_pack_c8hw8, + vzip1q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8, + vzip2q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 2, + vzip1q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 3, + vzip2q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 4, + vzip1q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 5, + vzip2q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 6, + vzip1q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 7, + vzip2q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc) - 1; o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr x1, [%[in_0], #8]\n" + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ins v0.d[1], x1\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 + "ldr x2, [%[f_0], #8]\n" + "mov v7.16b, 
v22.16b\n" // out_o0hw5 + "ins v18.d[1], x2\n" + "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 + "ldr x3, [%[f_0], #24]\n" + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ins v19.d[1], x3\n" + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov v16.16b, v23.16b\n" // out_o1hw6 + "mov v17.16b, v23.16b\n" // out_o1hw7 + "0:\n" + "ldr d1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr x1, [%[in_0], #24]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "ins v20.d[1], x2\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "fmla v8.8h, v18.8h, v0.h[6]\n" + "ldr x3, [%[f_0], #56]\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + "ins v21.d[1], x3\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + + "ldr d0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr x1, [%[in_0], #40]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "ldr x2, [%[f_0], #72]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "ins v18.d[1], x2\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "fmla v8.8h, v20.8h, v1.h[6]\n" + "ldr x3, [%[f_0], #88]\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "ins v19.d[1], x3\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "add %[f_0], %[f_0], #64\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "subs x0, x0, #2\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "fmla v15.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "x0", "x1", "x2", "x3"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v6.8h, v6.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, 
#0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + "fmax v8.8h, v8.8h, v31.8h\n" + "fmax v9.8h, v9.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + "fmax v11.8h, v11.8h, v31.8h\n" + "fmax v12.8h, v12.8h, v31.8h\n" + "fmax v13.8h, v13.8h, v31.8h\n" + "fmax v14.8h, v14.8h, v31.8h\n" + "fmax v15.8h, v15.8h, v31.8h\n" + "fmax v16.8h, v16.8h, v31.8h\n" + "fmax v17.8h, v17.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + "fmin v8.8h, v8.8h, v30.8h\n" + "fmin v9.8h, v9.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + "fmin v11.8h, v11.8h, v30.8h\n" + "fmin v12.8h, v12.8h, v30.8h\n" + "fmin v13.8h, v13.8h, v30.8h\n" + "fmin v14.8h, v14.8h, v30.8h\n" + "fmin v15.8h, v15.8h, v30.8h\n" + "fmin v16.8h, v16.8h, v30.8h\n" + "fmin v17.8h, v17.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q6, [%[out_0], #64]\n" // out_o0hw4 + "str q7, [%[out_0], #80]\n" // out_o0hw5 + "str q8, [%[out_0], #96]\n" // out_o0hw6 + "str q9, [%[out_0], #112]\n" // out_o0hw7 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + "str q14, [%[out_1], #64]\n" // out_o1hw4 + "str q15, [%[out_1], #80]\n" // out_o1hw5 + "str q16, [%[out_1], #96]\n" // out_o1hw6 + "str q17, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v12.16b\n" // out_o0hw0 + "ldr x1, [%[in_0], #8]\n" + "mov v3.16b, v12.16b\n" // out_o0hw1 + "ins v0.d[1], x1\n" + "mov v4.16b, v12.16b\n" // out_o0hw2 + "ldr d10, [%[f_0]]\n" // f_o0c0 + "mov v5.16b, v12.16b\n" // out_o0hw3 + "ldr x2, [%[f_0], #8]\n" + "mov v6.16b, v12.16b\n" // out_o0hw4 + "ins v10.d[1], x2\n" + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 + "0:\n" + "ldr d1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v10.8h, v0.h[0]\n" + "ldr x1, [%[in_0], #24]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "ldr d11, [%[f_0], #16]\n" // f_o0c0 + "fmla v5.8h, v10.8h, v0.h[3]\n" + "ldr x2, [%[f_0], #24]\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "ins v11.d[1], x2\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "subs x0, x0, #2\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + + "ldr d0, [%[in_0], 
#32]\n" // in_hw0 + "fmla v2.8h, v11.8h, v1.h[0]\n" + "ldr x1, [%[in_0], #40]\n" + "fmla v3.8h, v11.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v4.8h, v11.8h, v1.h[2]\n" + "ldr d10, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v11.8h, v1.h[3]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v6.8h, v11.8h, v1.h[4]\n" + "ins v10.d[1], x2\n" + "fmla v7.8h, v11.8h, v1.h[5]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v8.8h, v11.8h, v1.h[6]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v9.8h, v11.8h, v1.h[7]\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "x0", "x1", "x2"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__( + "eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v6.8h, v6.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + "fmax v8.8h, v8.8h, v31.8h\n" + "fmax v9.8h, v9.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + "fmin v8.8h, v8.8h, v30.8h\n" + "fmin v9.8h, v9.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw0 + "str q4, [%[out_0], #32]\n" // out_o0hw0 + "str q5, [%[out_0], #48]\n" // out_o0hw0 + "str q6, [%[out_0], #64]\n" // out_o0hw0 + "str q7, [%[out_0], #80]\n" // out_o0hw0 + "str q8, [%[out_0], #96]\n" // out_o0hw0 + "str q9, [%[out_0], #112]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"); + } + } + + // ohow_reminder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + // U32 ohow_s = (ohow/8)*8; + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw4c8 = inArray_pad + c * ihiw * 8 + fh_idx * dilateH * iw_pad * 8 + + fw_idx * dilateW * 8; + F16 *in_0 = in_hw4c8 + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_1 = in_hw4c8 + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F16 *in_2 = 
in_hw4c8 + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F16 *in_3 = in_hw4c8 + in_h_3 * iw_pad * 8 + in_w_3 * 8; + F16 *in_pack_c8hw4 = + in_pack + fh_idx * fw * ic * 8 * 4 + fw_idx * ic * 8 * 4 + c * 8 * 4; + + /* + * for (U32 c8 = 0; c8 < 8; c8++) { + * for (U32 hw4 = 0; hw4 < 4; hw4++) { + * in_pack_c8hw4[c8*4 + hw4] = in_hw4c8[hw4*8 + c8]; + * } + * } + */ + + __asm__ __volatile__( + "ldr q0, [%[in_0]]\n" + "ldr q1, [%[in_1]]\n" + "ldr q2, [%[in_2]]\n" + "ldr q3, [%[in_3]]\n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[in_pack_0]]\n" + : [in_pack_0] "+r"(in_pack_c8hw4) + : [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3) + : "memory", "cc", "v0", "v1", "v2", "v3"); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ldr x2, [%[f_0], #8]\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ins v18.d[1], x2\n" + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "ldr x3, [%[f_0], #24]\n" + "mov v12.16b, v23.16b\n" // out_o1hw2 + "ins v19.d[1], x3\n" + "mov v13.16b, v23.16b\n" // out_o1hw3 + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ins v20.d[1], x2\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "ldr x3, [%[f_0], #56]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "ins v21.d[1], x3\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "subs x0, x0, #2\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr x2, [%[f_0], #72]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "ins v18.d[1], x2\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "ldr x3, [%[f_0], #88]\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "ins v19.d[1], x3\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "add %[in_0], %[in_0], #16\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "add %[f_0], %[f_0], #64\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v10", "v11", "v12", + "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0", "x1", "x2", "x3"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v10", + 
"v11", "v12", "v13"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + "fmax v11.8h, v11.8h, v31.8h\n" + "fmax v12.8h, v12.8h, v31.8h\n" + "fmax v13.8h, v13.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + "fmin v11.8h, v11.8h, v30.8h\n" + "fmin v12.8h, v12.8h, v30.8h\n" + "fmin v13.8h, v13.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", + "v12", "v13", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ldr x2, [%[f_0], #8]\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ins v18.d[1], x2\n" + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #16]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr x2, [%[f_0], #24]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ins v20.d[1], x2\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "subs x0, x0, #2\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ins v18.d[1], x2\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #16\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v18", + "v20", "v22", "x0", "x1", "x2"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, 
v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5"); + } + } + + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw1c8 = inArray_pad + c * ihiw * 8 + fh_idx * dilateH * iw_pad * 8 + + fw_idx * dilateW * 8; + F16 *in_0 = in_hw1c8 + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_pack_c8hw1 = + in_pack + fh_idx * fw * ic * 8 + fw_idx * ic * 8 + c * 8; + /* + * for (U32 c8 = 0; c8 < 8; c8++) { + * in_pack_c8hw1[c8] = in_0[c8]; + * } + */ + memcpy(in_pack_c8hw1, in_0, 8 * bytesOf(idt)); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr x2, [%[f_0], #8]\n" + "ins v18.d[1], x2\n" + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "ldr x3, [%[f_0], #24]\n" + "ins v19.d[1], x3\n" + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "ldr x2, [%[f_0], #40]\n" + "ins v20.d[1], x2\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "subs x0, x0, #2\n" + "ldr x3, [%[f_0], #56]\n" + "ins v21.d[1], x3\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v10.8h, v21.8h, v1.h[0]\n" + "ldr x2, [%[f_0], #72]\n" + "ins v18.d[1], x2\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "add %[in_0], %[in_0], #4\n" + "ldr x3, [%[f_0], #88]\n" + "ins v19.d[1], x3\n" + "add %[f_0], %[f_0], #64\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", + "v23", "x0", "x1", "x2", "x3"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v10.8h, v10.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v10"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + + "fmin 
v2.8h, v2.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v10", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q10, [%[out_1]]\n" // out_o1hw0 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v10"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr d22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "ldr x2, [%[f_0], #8]\n" + "ins v18.d[1], x2\n" + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #16]\n" // f_o0c0 + "subs x0, x0, #2\n" + "ldr x2, [%[f_0], #24]\n" + "ins v20.d[1], x2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #32]\n" // f_o0c0 + "ldr x2, [%[f_0], #40]\n" + "ins v18.d[1], x2\n" + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", + "x0", "x1", "x2"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + : + : + : "memory", "cc", "v1", "v2"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmin v2.8h, v2.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2"); + } + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A76.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A76.cpp new file mode 100644 index 00000000..028e32b0 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A76.cpp @@ -0,0 +1,893 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <arm_neon.h> +#include "cpu/arm/fp16/convolution_gemm.h" + +EE convolution_gemm_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (fdf != DF_NHWCN16) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + I32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + F16 *inArray_pad; + EE ret = SUCCESS; + for (U32 n = 0; n < in; n++) { + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw * 8; + } else { + // copy the input into a zero-padded buffer + inArray_pad = (F16 *)tmp; + F16 *inArray_pad_mov = inArray_pad; + F16 *inArray_mov = inArray + n * ic * ih * iw * 8; + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < paddingT; h++) { + memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += iw_pad * 8; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt)); + inArray_pad_mov += paddingL * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt)); + inArray_pad_mov += paddingR * 8; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { + memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += iw_pad * 8; + } + } + } + // ohow / 8 + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw8 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + U32 in_h_4 = ((hw + 4) / ow) * strideH; + U32 in_w_4 = ((hw + 4) % ow) * strideW; + U32 in_h_5 = ((hw + 5) / ow) * strideH; + U32 in_w_5 = ((hw + 5) % ow) * strideW; + U32 in_h_6 = ((hw + 6) / ow) * strideH; + U32 in_w_6 = ((hw + 6) % ow) * strideW; + U32 in_h_7 = ((hw + 7) / ow) * strideH; + U32 in_w_7 = ((hw + 7) % ow) *
strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw8c8 = inArray_pad + c * ihiw * 8 + fh_idx * dilateH * iw_pad * 8 + + fw_idx * dilateW * 8; + F16 *in_0 = in_hw8c8 + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_1 = in_hw8c8 + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F16 *in_2 = in_hw8c8 + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F16 *in_3 = in_hw8c8 + in_h_3 * iw_pad * 8 + in_w_3 * 8; + F16 *in_4 = in_hw8c8 + in_h_4 * iw_pad * 8 + in_w_4 * 8; + F16 *in_5 = in_hw8c8 + in_h_5 * iw_pad * 8 + in_w_5 * 8; + F16 *in_6 = in_hw8c8 + in_h_6 * iw_pad * 8 + in_w_6 * 8; + F16 *in_7 = in_hw8c8 + in_h_7 * iw_pad * 8 + in_w_7 * 8; + + // NHWChw8 + F16 *in_pack_c8hw8 = + in_pack + fh_idx * fw * ic * 8 * 8 + fw_idx * ic * 8 * 8 + c * 8 * 8; + /* + * for (U32 c8 = 0; c8 < 8; c8++) { + * for (U32 hw8 = 0; hw8 < 8; hw8++) { + * in_pack_c8hw8[c8*8 + hw8] = in_hw8c8[hw8*8 + c8]; + * } + * } + */ + float16x8_t v0 = vld1q_f16(in_0); + float16x8_t v1 = vld1q_f16(in_1); + float16x8_t v2 = vld1q_f16(in_2); + float16x8_t v3 = vld1q_f16(in_3); + float16x8_t v4 = vld1q_f16(in_4); + float16x8_t v5 = vld1q_f16(in_5); + float16x8_t v6 = vld1q_f16(in_6); + float16x8_t v7 = vld1q_f16(in_7); + vst1q_f16(in_pack_c8hw8, + vzip1q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8, + vzip2q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 2, + vzip1q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 3, + vzip2q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 4, + vzip1q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 5, + vzip2q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 6, + vzip1q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 7, + vzip2q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc) - 1; o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 + "mov v7.16b, v22.16b\n" // out_o0hw5 + "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov 
v16.16b, v23.16b\n" // out_o1hw6 + "mov v17.16b, v23.16b\n" // out_o1hw7 + "0:\n" + "ldr q1, [%[in_0], #16]\n" // in_hw0 + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr q21, [%[f_0], #48]\n" // f_o1c0 + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "fmla v8.8h, v18.8h, v0.h[6]\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + "subs x0, x0, #2\n" + + "ldr q0, [%[in_0], #32]\n" // in_hw0 + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "fmla v4.8h, v20.8h, v1.h[2]\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "fmla v8.8h, v20.8h, v1.h[6]\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #32\n" + "add %[f_0], %[f_0], #64\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "fmla v15.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "x0"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v6.8h, v6.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + "fmax v8.8h, v8.8h, v31.8h\n" + "fmax v9.8h, v9.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + "fmax v11.8h, v11.8h, v31.8h\n" + "fmax v12.8h, v12.8h, v31.8h\n" + "fmax v13.8h, v13.8h, v31.8h\n" + "fmax v14.8h, v14.8h, v31.8h\n" + "fmax v15.8h, v15.8h, v31.8h\n" + "fmax v16.8h, v16.8h, v31.8h\n" + "fmax v17.8h, v17.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + "fmin v8.8h, v8.8h, v30.8h\n" + "fmin 
v9.8h, v9.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + "fmin v11.8h, v11.8h, v30.8h\n" + "fmin v12.8h, v12.8h, v30.8h\n" + "fmin v13.8h, v13.8h, v30.8h\n" + "fmin v14.8h, v14.8h, v30.8h\n" + "fmin v15.8h, v15.8h, v30.8h\n" + "fmin v16.8h, v16.8h, v30.8h\n" + "fmin v17.8h, v17.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q6, [%[out_0], #64]\n" // out_o0hw4 + "str q7, [%[out_0], #80]\n" // out_o0hw5 + "str q8, [%[out_0], #96]\n" // out_o0hw6 + "str q9, [%[out_0], #112]\n" // out_o0hw7 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + "str q14, [%[out_1], #64]\n" // out_o1hw4 + "str q15, [%[out_1], #80]\n" // out_o1hw5 + "str q16, [%[out_1], #96]\n" // out_o1hw6 + "str q17, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr q0, [%[in_0]]\n" // in_hw0 + "ldr q10, [%[f_0]]\n" // f_o0c0 + "mov v2.16b, v12.16b\n" // out_o0hw0 + "mov v3.16b, v12.16b\n" // out_o0hw1 + "mov v4.16b, v12.16b\n" // out_o0hw2 + "mov v5.16b, v12.16b\n" // out_o0hw3 + "mov v6.16b, v12.16b\n" // out_o0hw4 + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 + "0:\n" + "ldr q1, [%[in_0], #16]\n" // in_hw0 + "ldr q11, [%[f_0], #16]\n" // f_o0c0 + "fmla v2.8h, v10.8h, v0.h[0]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "fmla v5.8h, v10.8h, v0.h[3]\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + "subs x0, x0, #2\n" + + "ldr q0, [%[in_0], #32]\n" // in_hw0 + "ldr q10, [%[f_0], #32]\n" // f_o0c0 + "fmla v2.8h, v11.8h, v1.h[0]\n" + "fmla v3.8h, v11.8h, v1.h[1]\n" + "fmla v4.8h, v11.8h, v1.h[2]\n" + "fmla v5.8h, v11.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #32\n" + "add %[f_0], %[f_0], #32\n" + "fmla v6.8h, v11.8h, v1.h[4]\n" + "fmla v7.8h, v11.8h, v1.h[5]\n" + "fmla v8.8h, v11.8h, v1.h[6]\n" + "fmla v9.8h, v11.8h, v1.h[7]\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "x0"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__( + "eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v6.8h, v6.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v8.8h, v8.8h, 
v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + "fmax v8.8h, v8.8h, v31.8h\n" + "fmax v9.8h, v9.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + "fmin v8.8h, v8.8h, v30.8h\n" + "fmin v9.8h, v9.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw0 + "str q4, [%[out_0], #32]\n" // out_o0hw0 + "str q5, [%[out_0], #48]\n" // out_o0hw0 + "str q6, [%[out_0], #64]\n" // out_o0hw0 + "str q7, [%[out_0], #80]\n" // out_o0hw0 + "str q8, [%[out_0], #96]\n" // out_o0hw0 + "str q9, [%[out_0], #112]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"); + } + } + + // ohow_reminder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw4c8 = inArray_pad + c * ihiw * 8 + fh_idx * dilateH * iw_pad * 8 + + fw_idx * dilateW * 8; + F16 *in_0 = in_hw4c8 + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_1 = in_hw4c8 + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F16 *in_2 = in_hw4c8 + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F16 *in_3 = in_hw4c8 + in_h_3 * iw_pad * 8 + in_w_3 * 8; + F16 *in_pack_c8hw4 = + in_pack + fh_idx * fw * ic * 8 * 4 + fw_idx * ic * 8 * 4 + c * 8 * 4; + + /* + * for (U32 c8 = 0; c8 < 8; c8++) { + * for (U32 hw4 = 0; hw4 < 4; hw4++) { + * in_pack_c8hw4[c8*4 + hw4] = in_hw4c8[hw4*8 + c8]; + * } + * } + */ + + __asm__ __volatile__( + "ldr q0, [%[in_0]]\n" + "ldr q1, [%[in_1]]\n" + "ldr q2, [%[in_2]]\n" + "ldr q3, [%[in_3]]\n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[in_pack_0]]\n" + : [in_pack_0] "+r"(in_pack_c8hw4) + : [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3) + : "memory", "cc", "v0", "v1", "v2", "v3"); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "ldr d0, [%[in_0]]\n" // in_hw0 + "ldr q18, [%[f_0]]\n" // 
f_o0c0 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr q21, [%[f_0], #48]\n" // f_o1c0 + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "subs x0, x0, #2\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "fmla v4.8h, v20.8h, v1.h[2]\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #16\n" + "add %[f_0], %[f_0], #64\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v10", "v11", "v12", + "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v10", + "v11", "v12", "v13"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + "fmax v11.8h, v11.8h, v31.8h\n" + "fmax v12.8h, v12.8h, v31.8h\n" + "fmax v13.8h, v13.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + "fmin v11.8h, v11.8h, v30.8h\n" + "fmin v12.8h, v12.8h, v30.8h\n" + "fmin v13.8h, v13.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", + "v12", "v13", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc 
- 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr q22, [%[b_0]]\n" // b_o0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "ldr q20, [%[f_0], #16]\n" // f_o0c0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "subs x0, x0, #2\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "ldr q18, [%[f_0], #32]\n" // f_o0c0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #16\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v18", + "v20", "v22", "x0"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5"); + } + } + + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw1c8 = inArray_pad + c * ihiw * 8 + fh_idx * dilateH * iw_pad * 8 + + fw_idx * dilateW * 8; + F16 *in_0 = in_hw1c8 + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_pack_c8hw1 = + in_pack + fh_idx * fw * ic * 8 + fw_idx * ic * 8 + c * 8; + /* + * for (U32 c8 = 0; c8 < 8; c8++) { + * in_pack_c8hw1[c8] = in_0[c8]; + * } + */ + memcpy(in_pack_c8hw1, in_0, 8 * bytesOf(idt)); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "ldr h0, [%[in_0]]\n" // in_hw0 + "ldr q18, 
[%[f_0]]\n" // f_o0c0 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "ldr q21, [%[f_0], #48]\n" // f_o1c0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "subs x0, x0, #2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #64\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", + "v23", "x0"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v10.8h, v10.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v10"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v10", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q10, [%[out_1]]\n" // out_o1hw0 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v10"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr h0, [%[in_0]]\n" // in_hw0 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "ldr q20, [%[f_0], #16]\n" // f_o0c0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "subs x0, x0, #2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "ldr q18, [%[f_0], #32]\n" // f_o0c0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + : + : + : "memory", "cc", "v1", "v2"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmin v2.8h, v2.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2"); + } + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw.h b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw.h new 
file mode 100644 index 00000000..17778b77 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw.h @@ -0,0 +1,79 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_CONVOLUTION_GEMM_ICNCHW +#define _H_CONVOLUTION_GEMM_ICNCHW + +#include <arm_neon.h> +#include "sys.h" +#include "types.h" +#include "error.h" + +EE convolution_gemm_icnchw_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc); + +EE convolution_gemm_icnchw_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc); + +inline EE convolution_gemm_icnchw(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc, + Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: + ret = convolution_gemm_icnchw_A55(inputDesc, inArray, filterDesc, filterArray, + convParamSpec, biasDesc, biasArray, tmpBytes, tmp, outputDesc, outArray, + activationDesc); + break; + case ARM_A76: + ret = convolution_gemm_icnchw_A76(inputDesc, inArray, filterDesc, filterArray, + convParamSpec, biasDesc, biasArray, tmpBytes, tmp, outputDesc, outArray, + activationDesc); + break; + default: + return NOT_SUPPORTED; + } + return ret; +}
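+ +/* + * The _icnchw variants differ from convolution_gemm only in the input layout they + * consume: the input is plain NCHW (for example a network's first convolution, where ic + * is small and usually not a multiple of 8), so ic is not tiled by 8 and the im2col pack + * gathers one F16 scalar per output position instead of transposing NCHWc8 vectors. + * A condensed sketch of the hw8 pack (pseudocode; the implementations that follow unroll + * i = 0..7 as in_h_0..in_h_7 and in_w_0..in_w_7): + * + * for (U32 i = 0; i < 8; i++) { // output positions hw .. hw+7 + * in_pack_hw8[i] = in_hw[in_h[i] * iw_pad + in_w[i]]; // one scalar per position + * } + */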
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/convolution_gemm_icnchw.h" + +EE convolution_gemm_icnchw_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (fdf != DF_NHWCN16) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + I32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + F16 *inArray_pad; + EE ret = SUCCESS; + for (U32 n = 0; n < in; n++) { + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw; + } else { + // copy input into a input with padding + inArray_pad = (F16 *)tmp; + F16 *inArray_pad_mov = inArray_pad; + F16 *inArray_mov = inArray + n * ic * ih * iw; + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < paddingT; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(idt)); + inArray_pad_mov += iw_pad; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * bytesOf(idt)); + inArray_pad_mov += paddingL; + memcpy(inArray_pad_mov, inArray_mov, iw * bytesOf(idt)); + inArray_pad_mov += iw; + inArray_mov += iw; + memset(inArray_pad_mov, 0, paddingR * bytesOf(idt)); + inArray_pad_mov += paddingR; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(idt)); + inArray_pad_mov += iw_pad; + } + } + } + + // ohow / 8 + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 
*)tmp) + ic * ih_pad * iw_pad; + // pack input + // NCHW => NHWChw8 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + U32 in_h_4 = ((hw + 4) / ow) * strideH; + U32 in_w_4 = ((hw + 4) % ow) * strideW; + U32 in_h_5 = ((hw + 5) / ow) * strideH; + U32 in_w_5 = ((hw + 5) % ow) * strideW; + U32 in_h_6 = ((hw + 6) / ow) * strideH; + U32 in_w_6 = ((hw + 6) % ow) * strideW; + U32 in_h_7 = ((hw + 7) / ow) * strideH; + U32 in_w_7 = ((hw + 7) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F16 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F16 *in_1 = in_hw + in_h_1 * iw_pad + in_w_1; + F16 *in_2 = in_hw + in_h_2 * iw_pad + in_w_2; + F16 *in_3 = in_hw + in_h_3 * iw_pad + in_w_3; + F16 *in_4 = in_hw + in_h_4 * iw_pad + in_w_4; + F16 *in_5 = in_hw + in_h_5 * iw_pad + in_w_5; + F16 *in_6 = in_hw + in_h_6 * iw_pad + in_w_6; + F16 *in_7 = in_hw + in_h_7 * iw_pad + in_w_7; + F16 *in_pack_hw8 = in_pack + fh_idx * fw * ic * 8 + fw_idx * ic * 8 + c * 8; + *in_pack_hw8 = *in_0; + *(in_pack_hw8 + 1) = *in_1; + *(in_pack_hw8 + 2) = *in_2; + *(in_pack_hw8 + 3) = *in_3; + *(in_pack_hw8 + 4) = *in_4; + *(in_pack_hw8 + 5) = *in_5; + *(in_pack_hw8 + 6) = *in_6; + *(in_pack_hw8 + 7) = *in_7; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__("ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr x1, [%[in_0], #8]\n" + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ins v0.d[1], x1\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 + "ldr x2, [%[f_0], #8]\n" + "mov v7.16b, v22.16b\n" // out_o0hw5 + "ins v18.d[1], x2\n" + "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 + "ldr x3, [%[f_0], #24]\n" + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ins v19.d[1], x3\n" + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov v16.16b, v23.16b\n" // out_o1hw6 + "mov v17.16b, v23.16b\n" // out_o1hw7 + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr d1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr x1, [%[in_0], #24]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "ins v20.d[1], x2\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "fmla v8.8h, v18.8h, v0.h[6]\n" + "ldr x3, [%[f_0], #56]\n" + 
"fmla v9.8h, v18.8h, v0.h[7]\n" + "ins v21.d[1], x3\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + + "ldr d0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr x1, [%[in_0], #40]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "ldr x2, [%[f_0], #72]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "ins v18.d[1], x2\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "fmla v8.8h, v20.8h, v1.h[6]\n" + "ldr x3, [%[f_0], #88]\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "ins v19.d[1], x3\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "add %[f_0], %[f_0], #64\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "sub x0, x0, #2\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "fmla v15.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "fmla v8.8h, v18.8h, v0.h[6]\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x0", + "x1", "x2", "x3"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v6.8h, v6.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + "fmax v8.8h, v8.8h, v31.8h\n" + "fmax v9.8h, v9.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + "fmax v11.8h, v11.8h, v31.8h\n" + "fmax v12.8h, v12.8h, v31.8h\n" + "fmax v13.8h, v13.8h, v31.8h\n" + "fmax 
v14.8h, v14.8h, v31.8h\n" + "fmax v15.8h, v15.8h, v31.8h\n" + "fmax v16.8h, v16.8h, v31.8h\n" + "fmax v17.8h, v17.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + "fmin v8.8h, v8.8h, v30.8h\n" + "fmin v9.8h, v9.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + "fmin v11.8h, v11.8h, v30.8h\n" + "fmin v12.8h, v12.8h, v30.8h\n" + "fmin v13.8h, v13.8h, v30.8h\n" + "fmin v14.8h, v14.8h, v30.8h\n" + "fmin v15.8h, v15.8h, v30.8h\n" + "fmin v16.8h, v16.8h, v30.8h\n" + "fmin v17.8h, v17.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q6, [%[out_0], #64]\n" // out_o0hw4 + "str q7, [%[out_0], #80]\n" // out_o0hw5 + "str q8, [%[out_0], #96]\n" // out_o0hw6 + "str q9, [%[out_0], #112]\n" // out_o0hw7 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + "str q14, [%[out_1], #64]\n" // out_o1hw4 + "str q15, [%[out_1], #80]\n" // out_o1hw5 + "str q16, [%[out_1], #96]\n" // out_o1hw6 + "str q17, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v12.16b\n" // out_o0hw0 + "ldr x1, [%[in_0], #8]\n" + "mov v3.16b, v12.16b\n" // out_o0hw1 + "ins v0.d[1], x1\n" + "mov v4.16b, v12.16b\n" // out_o0hw2 + "ldr d10, [%[f_0]]\n" // f_o0c0 + "mov v5.16b, v12.16b\n" // out_o0hw3 + "ldr x2, [%[f_0], #8]\n" + "mov v6.16b, v12.16b\n" // out_o0hw4 + "ins v10.d[1], x2\n" + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr d1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v10.8h, v0.h[0]\n" + "ldr x1, [%[in_0], #24]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "ldr d11, [%[f_0], #16]\n" // f_o0c0 + "fmla v5.8h, v10.8h, v0.h[3]\n" + "ldr x2, [%[f_0], #24]\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "ins v11.d[1], x2\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "sub x0, x0, #2\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + + "ldr d0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v11.8h, v1.h[0]\n" + "ldr x1, [%[in_0], #40]\n" + "fmla v3.8h, v11.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v4.8h, v11.8h, v1.h[2]\n" + "ldr d10, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v11.8h, v1.h[3]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v6.8h, v11.8h, v1.h[4]\n" + "ins v10.d[1], x2\n" + "fmla v7.8h, v11.8h, v1.h[5]\n" + "add %[in_0], %[in_0], #32\n" + "fmla 
v8.8h, v11.8h, v1.h[6]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v9.8h, v11.8h, v1.h[7]\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v10.8h, v0.h[0]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "fmla v5.8h, v10.8h, v0.h[3]\n" + "add %[f_0], %[f_0], #16\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "x0", "x1", "x2"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__( + "eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v6.8h, v6.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + "fmax v8.8h, v8.8h, v31.8h\n" + "fmax v9.8h, v9.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + "fmin v8.8h, v8.8h, v30.8h\n" + "fmin v9.8h, v9.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q6, [%[out_0], #64]\n" // out_o0hw4 + "str q7, [%[out_0], #80]\n" // out_o0hw5 + "str q8, [%[out_0], #96]\n" // out_o0hw6 + "str q9, [%[out_0], #112]\n" // out_o0hw7 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"); + } + } + // ohow_remainder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad; + + // pack input + // NCHW => NHWChw4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F16 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F16 *in_1 = in_hw + in_h_1 * iw_pad + in_w_1; + F16 *in_2 = in_hw + in_h_2 * iw_pad + in_w_2; + F16 *in_3 = in_hw + in_h_3 * iw_pad + in_w_3; + F16 *in_pack_hw4 = in_pack + fh_idx *
fw * ic * 4 + fw_idx * ic * 4 + c * 4; + *in_pack_hw4 = *in_0; + *(in_pack_hw4 + 1) = *in_1; + *(in_pack_hw4 + 2) = *in_2; + *(in_pack_hw4 + 3) = *in_3; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__("ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ldr x2, [%[f_0], #8]\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ins v18.d[1], x2\n" + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "ldr x3, [%[f_0], #24]\n" + "mov v12.16b, v23.16b\n" // out_o1hw2 + "ins v19.d[1], x3\n" + "mov v13.16b, v23.16b\n" // out_o1hw3 + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ins v20.d[1], x2\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "ldr x3, [%[f_0], #56]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "ins v21.d[1], x3\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "sub x0, x0, #2\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr x2, [%[f_0], #72]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "ins v18.d[1], x2\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "ldr x3, [%[f_0], #88]\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "ins v19.d[1], x3\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "add %[in_0], %[in_0], #16\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "add %[f_0], %[f_0], #64\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v10", + "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", + "x0", "x1", "x2", "x3"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v10", + "v11", "v12", "v13"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + 
"fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + "fmax v11.8h, v11.8h, v31.8h\n" + "fmax v12.8h, v12.8h, v31.8h\n" + "fmax v13.8h, v13.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + "fmin v11.8h, v11.8h, v30.8h\n" + "fmin v12.8h, v12.8h, v30.8h\n" + "fmin v13.8h, v13.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", + "v12", "v13", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ldr x2, [%[f_0], #8]\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ins v18.d[1], x2\n" + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #16]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr x2, [%[f_0], #24]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ins v20.d[1], x2\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "sub x0, x0, #2\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ins v18.d[1], x2\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #16\n" + "add %[f_0], %[f_0], #32\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "add %[f_0], %[f_0], #16\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v18", + "v20", "v22", "x0", "x1", "x2"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + + 
"fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5"); + } + } + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad; + // pack input + // NCHWc8 => NHWChw4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F16 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F16 *in_pack_hw1 = in_pack + fh_idx * fw * ic + fw_idx * ic + c; + /* + * for (U32 c8 = 0; c8 < 8; c8++) { + * in_pack_c8hw1[c8] = in_0[c8]; + * } + */ + *in_pack_hw1 = *in_0; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__("ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr x2, [%[f_0], #8]\n" + "ins v18.d[1], x2\n" + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "ldr x3, [%[f_0], #24]\n" + "ins v19.d[1], x3\n" + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "ldr x2, [%[f_0], #40]\n" + "ins v20.d[1], x2\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "sub x0, x0, #2\n" + "ldr x3, [%[f_0], #56]\n" + "ins v21.d[1], x3\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v10.8h, v21.8h, v1.h[0]\n" + "ldr x2, [%[f_0], #72]\n" + "ins v18.d[1], x2\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "add %[in_0], %[in_0], #4\n" + "ldr x3, [%[f_0], #88]\n" + "ins v19.d[1], x3\n" + "add %[f_0], %[f_0], #64\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", + "v21", "v22", "v23", "x0", "x1", "x2", "x3"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v10.8h, v10.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v10"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" 
// zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v10", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q10, [%[out_1]]\n" // out_o1hw0 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v10"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr d22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "ldr x2, [%[f_0], #8]\n" + "ins v18.d[1], x2\n" + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #16]\n" // f_o0c0 + "sub x0, x0, #2\n" + "ldr x2, [%[f_0], #24]\n" + "ins v20.d[1], x2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #32]\n" // f_o0c0 + "ldr x2, [%[f_0], #40]\n" + "ins v18.d[1], x2\n" + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #32\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "add %[f_0], %[f_0], #16\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", + "x0", "x1", "x2"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + : + : + : "memory", "cc", "v1", "v2"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmin v2.8h, v2.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2"); + } + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A76.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A76.cpp new file mode 100644 index 00000000..c6a6aa60 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A76.cpp @@ -0,0 +1,920 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/convolution_gemm_icnchw.h" + +EE convolution_gemm_icnchw_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (fdf != DF_NHWCN16) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + I32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + F16 *inArray_pad; + EE ret = SUCCESS; + for (U32 n = 0; n < in; n++) { + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw; + } else { + // copy input into a input with padding + inArray_pad = (F16 *)tmp; + F16 *inArray_pad_mov = inArray_pad; + F16 *inArray_mov = inArray + n * ic * ih * iw; + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < paddingT; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(idt)); + inArray_pad_mov += iw_pad; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * bytesOf(idt)); + inArray_pad_mov += paddingL; + memcpy(inArray_pad_mov, inArray_mov, iw * bytesOf(idt)); + inArray_pad_mov += iw; + inArray_mov += iw; + memset(inArray_pad_mov, 0, paddingR * bytesOf(idt)); + inArray_pad_mov += paddingR; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(idt)); + inArray_pad_mov += iw_pad; + } + } + } + + // ohow / 8 + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad; + // pack input + // NCHW => NHWChw8 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + U32 in_h_4 = ((hw + 4) / ow) * strideH; + U32 in_w_4 = ((hw + 4) % ow) * strideW; + U32 in_h_5 = ((hw + 5) / ow) * strideH; + U32 in_w_5 = ((hw + 5) % ow) * strideW; + U32 in_h_6 
= ((hw + 6) / ow) * strideH; + U32 in_w_6 = ((hw + 6) % ow) * strideW; + U32 in_h_7 = ((hw + 7) / ow) * strideH; + U32 in_w_7 = ((hw + 7) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F16 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F16 *in_1 = in_hw + in_h_1 * iw_pad + in_w_1; + F16 *in_2 = in_hw + in_h_2 * iw_pad + in_w_2; + F16 *in_3 = in_hw + in_h_3 * iw_pad + in_w_3; + F16 *in_4 = in_hw + in_h_4 * iw_pad + in_w_4; + F16 *in_5 = in_hw + in_h_5 * iw_pad + in_w_5; + F16 *in_6 = in_hw + in_h_6 * iw_pad + in_w_6; + F16 *in_7 = in_hw + in_h_7 * iw_pad + in_w_7; + F16 *in_pack_hw8 = in_pack + fh_idx * fw * ic * 8 + fw_idx * ic * 8 + c * 8; + *in_pack_hw8 = *in_0; + *(in_pack_hw8 + 1) = *in_1; + *(in_pack_hw8 + 2) = *in_2; + *(in_pack_hw8 + 3) = *in_3; + *(in_pack_hw8 + 4) = *in_4; + *(in_pack_hw8 + 5) = *in_5; + *(in_pack_hw8 + 6) = *in_6; + *(in_pack_hw8 + 7) = *in_7; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__("ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 + "mov v7.16b, v22.16b\n" // out_o0hw5 + "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov v16.16b, v23.16b\n" // out_o1hw6 + "mov v17.16b, v23.16b\n" // out_o1hw7 + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr q1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v18.8h, v0.h[3]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "ldr q21, [%[f_0], #48]\n" // f_o1c0 + "fmla v8.8h, v18.8h, v0.h[6]\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + + "ldr q0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "fmla v8.8h, v20.8h, v1.h[6]\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "add %[f_0], %[f_0], #64\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "sub x0, x0, #2\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + 
"fmla v15.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "fmla v8.8h, v18.8h, v0.h[6]\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x0"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v6.8h, v6.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + "fmax v8.8h, v8.8h, v31.8h\n" + "fmax v9.8h, v9.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + "fmax v11.8h, v11.8h, v31.8h\n" + "fmax v12.8h, v12.8h, v31.8h\n" + "fmax v13.8h, v13.8h, v31.8h\n" + "fmax v14.8h, v14.8h, v31.8h\n" + "fmax v15.8h, v15.8h, v31.8h\n" + "fmax v16.8h, v16.8h, v31.8h\n" + "fmax v17.8h, v17.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + "fmin v8.8h, v8.8h, v30.8h\n" + "fmin v9.8h, v9.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + "fmin v11.8h, v11.8h, v30.8h\n" + "fmin v12.8h, v12.8h, v30.8h\n" + "fmin v13.8h, v13.8h, v30.8h\n" + "fmin v14.8h, v14.8h, v30.8h\n" + "fmin v15.8h, v15.8h, v30.8h\n" + "fmin v16.8h, v16.8h, v30.8h\n" + "fmin v17.8h, v17.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q6, [%[out_0], #64]\n" // out_o0hw4 + "str q7, [%[out_0], #80]\n" // out_o0hw5 + "str 
q8, [%[out_0], #96]\n" // out_o0hw6 + "str q9, [%[out_0], #112]\n" // out_o0hw7 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + "str q14, [%[out_1], #64]\n" // out_o1hw4 + "str q15, [%[out_1], #80]\n" // out_o1hw5 + "str q16, [%[out_1], #96]\n" // out_o1hw6 + "str q17, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v12.16b\n" // out_o0hw0 + "mov v3.16b, v12.16b\n" // out_o0hw1 + "mov v4.16b, v12.16b\n" // out_o0hw2 + "ldr q10, [%[f_0]]\n" // f_o0c0 + "mov v5.16b, v12.16b\n" // out_o0hw3 + "mov v6.16b, v12.16b\n" // out_o0hw4 + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr q1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v10.8h, v0.h[0]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "ldr q11, [%[f_0], #16]\n" // f_o0c0 + "fmla v5.8h, v10.8h, v0.h[3]\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "sub x0, x0, #2\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + + "ldr q0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v11.8h, v1.h[0]\n" + "fmla v3.8h, v11.8h, v1.h[1]\n" + "fmla v4.8h, v11.8h, v1.h[2]\n" + "ldr q10, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v11.8h, v1.h[3]\n" + "fmla v6.8h, v11.8h, v1.h[4]\n" + "fmla v7.8h, v11.8h, v1.h[5]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v8.8h, v11.8h, v1.h[6]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v9.8h, v11.8h, v1.h[7]\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v10.8h, v0.h[0]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "fmla v5.8h, v10.8h, v0.h[3]\n" + "add %[f_0], %[f_0], #16\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "x0"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__( + "eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v6.8h, v6.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, 
v7.8h, v31.8h\n" + "fmax v8.8h, v8.8h, v31.8h\n" + "fmax v9.8h, v9.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + "fmin v8.8h, v8.8h, v30.8h\n" + "fmin v9.8h, v9.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q6, [%[out_0], #64]\n" // out_o0hw4 + "str q7, [%[out_0], #80]\n" // out_o0hw5 + "str q8, [%[out_0], #96]\n" // out_o0hw6 + "str q9, [%[out_0], #112]\n" // out_o0hw7 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"); + } + } + // ohow_remainder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad; + + // pack input + // NCHW => NHWChw4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F16 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F16 *in_1 = in_hw + in_h_1 * iw_pad + in_w_1; + F16 *in_2 = in_hw + in_h_2 * iw_pad + in_w_2; + F16 *in_3 = in_hw + in_h_3 * iw_pad + in_w_3; + F16 *in_pack_hw4 = in_pack + fh_idx * fw * ic * 4 + fw_idx * ic * 4 + c * 4; + *in_pack_hw4 = *in_0; + *(in_pack_hw4 + 1) = *in_1; + *(in_pack_hw4 + 2) = *in_2; + *(in_pack_hw4 + 3) = *in_3; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__("ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr q21, [%[f_0], #48]\n" // f_o1c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "sub x0, x0, #2\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr
q18, [%[f_0], #64]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "add %[in_0], %[in_0], #16\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "add %[f_0], %[f_0], #64\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v10", + "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", + "x0"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v10", + "v11", "v12", "v13"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + "fmax v11.8h, v11.8h, v31.8h\n" + "fmax v12.8h, v12.8h, v31.8h\n" + "fmax v13.8h, v13.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + "fmin v11.8h, v11.8h, v30.8h\n" + "fmin v12.8h, v12.8h, v30.8h\n" + "fmin v13.8h, v13.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", + "v12", "v13", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr q22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #16]\n" // 
f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "sub x0, x0, #2\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #16\n" + "add %[f_0], %[f_0], #32\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "add %[f_0], %[f_0], #16\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v18", + "v20", "v22", "x0"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5"); + } + } + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad; + // pack input + // NCHWc8 => NHWChw4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F16 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F16 *in_pack_hw1 = in_pack + fh_idx * fw * ic + fw_idx * ic + c; + /* + * for (U32 c8 = 0; c8 < 8; c8++) { + * in_pack_c8hw1[c8] = in_0[c8]; + * } + */ + *in_pack_hw1 = *in_0; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__("ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "ldr q21, [%[f_0], #48]\n" // 
f_o1c0 + "sub x0, x0, #2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + "fmla v10.8h, v21.8h, v1.h[0]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #64\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", + "v21", "v22", "v23", "x0"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v10.8h, v10.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v10"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v10", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q10, [%[out_1]]\n" // out_o1hw0 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v10"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q18, [%[f_0]]\n" // f_o0c0 + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #16]\n" // f_o0c0 + "sub x0, x0, #2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f_0], #32]\n" // f_o0c0 + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #32\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "add %[f_0], %[f_0], #16\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + : + : + : "memory", "cc", "v1", "v2"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmin v2.8h, v2.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2"); + } + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_transform.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_transform.cpp new file mode 100644 index 00000000..8bd7b6bb --- /dev/null +++ 
b/compute/tensor/src/cpu/arm/fp16/convolution_transform.cpp @@ -0,0 +1,189 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#include "cpu/arm/fp16/convolution_winograd_transform.h" + +static EE convolution_transform_filter_kernel_fp16(TensorDesc filterDesc, + const F16 *filterArray, + TensorDesc *ftmDesc, + F16 *ftmArray, + DataFormat ftmDataFormat) +{ + if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + if (fdf == ftmDataFormat) { + *ftmDesc = filterDesc; + memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + return SUCCESS; + } + if (fdf != DF_NCHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + EE ret = SUCCESS; + switch (ftmDataFormat) { + case DF_NHWCN16: { + /* + * NCHW => NHWCN16 + * if there is remainder, it should be NHWCN8 + */ + U32 oc = fn / 16; + for (U32 o = 0; o < oc; o++) { + for (U32 hw = 0; hw < fh * fw; hw++) { + for (U32 c = 0; c < fc; c++) { + for (U32 o16 = 0; o16 < 16; o16++) { + ftmArray[o * fh * fw * fc * 16 + hw * fc * 16 + c * 16 + o16] = + filterArray[(o * 16 + o16) * fc * fh * fw + c * fh * fw + hw]; + } + } + } + } + if (fn != oc * 16) { + for (U32 hw = 0; hw < fh * fw; hw++) { + for (U32 c = 0; c < fc; c++) { + for (U32 o8 = 0; o8 < 8; o8++) { + ftmArray[(oc * 16) * fh * fw * fc + hw * fc * 8 + c * 8 + o8] = + filterArray[(oc * 16 + o8) * fc * fh * fw + c * fh * fw + hw]; + } + } + } + } + *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); + break; + } + case DF_NCHWN16: { + /* + * NCHW => NCHWN16 + */ + U32 oc = fn / 16; + for (U32 o = 0; o < oc; o++) { + for (U32 chw = 0; chw < fc * fh * fw; chw++) { + for (U32 o16 = 0; o16 < 16; o16++) { + ftmArray[o * fc * fh * fw * 16 + chw * 16 + o16] = + filterArray[(o * 16 + o16) * fc * fh * fw + chw]; + } + } + } + *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); + break; + } + case DF_HWNCN16: { + for (U32 o = 0; o < fn / 16; o++) { + for (U32 c = 0; c < fc; c++) { + U32 f_off_0 = (o * 16) * fc * fh * fw + c * fh * fw; + U32 f_off_1 = (o * 16 + 8) * fc * fh * fw + c * fh * fw; + U32 ftm_off_0 = o * 36 * fc * 16 + c * 16; + U32 ftm_off_1 = o * 36 * fc * 16 + c * 16 + 8; + F16 F[9][8]; + F16 *F_ptr[9]; + F16 *Fw[36]; + for (U32 hw = 0; hw < 9; hw++) { + for (U32 oo = 0; oo < 8; oo++) { + 
F[hw][oo] = filterArray[f_off_0 + hw + oo * fc * fh * fw]; + } + F_ptr[hw] = F[hw]; + } + for (U32 hw = 0; hw < 36; hw++) { + Fw[hw] = ftmArray + ftm_off_0 + hw * fc * 16; + } + trans_W_4x4_3x3(Fw, F_ptr); + for (U32 hw = 0; hw < 9; hw++) { + for (U32 oo = 0; oo < 8; oo++) { + F[hw][oo] = filterArray[f_off_1 + hw + oo * fc * fh * fw]; + } + F_ptr[hw] = F[hw]; + } + for (U32 hw = 0; hw < 36; hw++) { + Fw[hw] = ftmArray + ftm_off_1 + hw * fc * 16; + } + trans_W_4x4_3x3(Fw, F_ptr); + } + } + U32 oc = (fn / 16) * 16; + if (oc != fn) { + for (U32 c = 0; c < fc; c++) { + U32 f_off_0 = oc * fc * fh * fw + c * fh * fw; + U32 ftm_off_0 = oc * 36 * fc + c * 8; + F16 F[9][8]; + F16 *F_ptr[9]; + F16 *Fw[36]; + for (U32 hw = 0; hw < 9; hw++) { + for (U32 oo = 0; oo < 8; oo++) { + F[hw][oo] = filterArray[f_off_0 + hw + oo * fc * fh * fw]; + } + F_ptr[hw] = F[hw]; + } + for (U32 hw = 0; hw < 36; hw++) { + Fw[hw] = ftmArray + ftm_off_0 + hw * fc * 8; + } + trans_W_4x4_3x3(Fw, F_ptr); + } + } + *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, 6, 6); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE convolution_transform_filter_fp16(TensorDesc filterDesc, + const F16 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F16 *filterTransformed) +{ + DataFormat ftmDataFormat; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_WINOGRAD: + ftmDataFormat = DF_HWNCN16; + break; + case CONVOLUTION_ALGORITHM_DIRECT: + ftmDataFormat = DF_NCHWN16; + break; + case CONVOLUTION_ALGORITHM_GEMM: + ftmDataFormat = DF_NHWCN16; + break; + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: + ftmDataFormat = DF_NHWCN16; + break; + default: + return NOT_MATCH; + } + + U32 channelAxis = filterDesc.nDims - 1; + TensorDesc tmpFilterDesc = filterDesc; + tmpFilterDesc.dims[channelAxis] /= convParamSpec.group; + U32 fnPadding = tmpFilterDesc.dims[channelAxis]; + if (fnPadding % 8 != 0) { + fnPadding = (fnPadding / 8 + 1) * 8; + } + U32 originalTileSize = tensorNumElements(tmpFilterDesc); + for (U32 g = 0; g < convParamSpec.group; g++) { + CHECK_STATUS(convolution_transform_filter_kernel_fp16( + tmpFilterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat)); + U32 newTileSize = tensorNumElements(*ftmDesc) / tmpFilterDesc.dims[channelAxis] * fnPadding; + filter += originalTileSize; + filterTransformed += newTileSize; + } + ftmDesc->dims[channelAxis] = filterDesc.dims[channelAxis]; + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_winograd.h b/compute/tensor/src/cpu/arm/fp16/convolution_winograd.h new file mode 100644 index 00000000..898338b8 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution_winograd.h @@ -0,0 +1,78 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_CONVOLUTION_WINOGRAD +#define _H_CONVOLUTION_WINOGRAD + +#include "sys.h" +#include "types.h" +#include "error.h" + +EE convolution_winograd_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc); + +EE convolution_winograd_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc); + +inline EE convolution_winograd(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc, + Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: + ret = convolution_winograd_A55(inputDesc, inArray, filterDesc, filterArray, + convParamSpec, biasDesc, biasArray, tmpBytes, tmp, outputDesc, outArray, + activationDesc); + break; + case ARM_A76: + ret = convolution_winograd_A76(inputDesc, inArray, filterDesc, filterArray, + convParamSpec, biasDesc, biasArray, tmpBytes, tmp, outputDesc, outArray, + activationDesc); + break; + default: + return NOT_SUPPORTED; + } + return ret; +} +#endif diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A55.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A55.cpp new file mode 100644 index 00000000..dc206e87 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A55.cpp @@ -0,0 +1,859 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
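+ +// Kernel note: this file implements the Winograd F(4x4, 3x3) forward pass scheduled for the in-order Cortex-A55. +// Its inner loops split each 128-bit vector load into "ldr d" + "ldr x" + "ins" pairs, presumably so the 64-bit +// loads can dual-issue with the fmla stream; the A76 variant later in this diff issues plain "ldr q" loads instead.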
+ +#include "cpu/arm/fp16/convolution_winograd_transform.h" +#include "cpu/arm/fp16/convolution_winograd.h" + +EE convolution_winograd_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if (fdf != DF_HWNCN16) { + CHECK_STATUS(NOT_MATCH); + } + if (!(fh == 6 && fw == 6)) { + CHECK_STATUS(NOT_SUPPORTED); + } + + oc /= 8; + ic /= 8; + + U32 tile_h = (oh + 3) / 4; + U32 tile_w = (ow + 3) / 4; + I32 tiles = tile_h * tile_w; // num of 6x6 tiles + U32 pad_left = paddingL; + U32 pad_right = paddingR + (tile_w * 4 - ow); + U32 pad_w_mod_4 = tile_w * 4 - ow; + U32 pad_top = paddingT; + U32 pad_bottom = paddingB + (tile_h * 4 - oh); + U32 pad_h_mod_4 = tile_h * 4 - oh; + U32 ih_pad = ih + pad_top + pad_bottom; + U32 iw_pad = iw + pad_left + pad_right; + // tmp = in_pad + itm + otm + // in_pad: ic*ih_pad*iw_pad*8 + // itm: 6*6*ic*8*8 + // otm: oc*6*6*8*8 + F16 *inArray_pad = (F16 *)tmp; + F16 *itmArray = inArray_pad + ic * ih_pad * iw_pad * 8; + F16 *otmArray = itmArray + 6 * 6 * ic * 8 * 8; + + EE ret = SUCCESS; + // copy input into a input with padding + for (U32 n = 0; n < in; n++) { + F16 *inArray_pad_mov = inArray_pad; + F16 *inArray_mov = inArray + n * ic * ih * iw * 8; + for (U32 c = 0; c < ic; c++) { + memset(inArray_pad_mov, 0, pad_top * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_top * iw_pad * 8; + for (U32 h = pad_top; h < ih_pad - pad_bottom; h++) { + memset(inArray_pad_mov, 0, pad_left * 8 * bytesOf(idt)); + inArray_pad_mov += pad_left * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, pad_right * 8 * bytesOf(idt)); + inArray_pad_mov += pad_right * 8; + } + memset(inArray_pad_mov, 0, pad_bottom * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_bottom * iw_pad * 8; + } + + // tiles / 8 + for (I32 hw = 0; hw < tiles - 7; hw += 8) { + const F16 *ftm_0 = filterArray; + F16 *otm_0 = otmArray; + // in trans + // NCHWc8 => (6*6)*C*c8*hw8 + for (U32 c = 0; c < ic; c++) { + F16 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F16 *Iw_ptr[36]; + F16 Iw0[36][8]; + F16 *I0[36]; + F16 Iw1[36][8]; + F16 *I1[36]; + F16 Iw2[36][8]; + F16 *I2[36]; + F16 Iw3[36][8]; + F16 *I3[36]; + F16 Iw4[36][8]; + F16 *I4[36]; + F16 Iw5[36][8]; + F16 *I5[36]; + F16 Iw6[36][8]; + F16 *I6[36]; + F16 Iw7[36][8]; + F16 *I7[36]; + F16 *itmArray_mov = itmArray + c * 8 * 8; + U32 h0 = (hw / tile_w) * 4; + U32 w0 = (hw % tile_w) * 4; + U32 h1 = ((hw + 1) / tile_w) * 4; + U32 w1 = ((hw + 1) % tile_w) * 4; + U32 h2 = ((hw + 2) / tile_w) * 4; + U32 w2 = ((hw + 2) % tile_w) * 4; + U32 h3 = ((hw + 3) / tile_w) * 4; + U32 w3 = ((hw + 3) % tile_w) * 4; + U32 h4 = ((hw + 4) / tile_w) * 4; + U32 w4 = ((hw + 4) % tile_w) * 
4; + U32 h5 = ((hw + 5) / tile_w) * 4; + U32 w5 = ((hw + 5) % tile_w) * 4; + U32 h6 = ((hw + 6) / tile_w) * 4; + U32 w6 = ((hw + 6) % tile_w) * 4; + U32 h7 = ((hw + 7) / tile_w) * 4; + U32 w7 = ((hw + 7) % tile_w) * 4; + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + I1[i * 6 + j] = inArray_pad_mov + (h1 + i) * iw_pad * 8 + (w1 + j) * 8; + I2[i * 6 + j] = inArray_pad_mov + (h2 + i) * iw_pad * 8 + (w2 + j) * 8; + I3[i * 6 + j] = inArray_pad_mov + (h3 + i) * iw_pad * 8 + (w3 + j) * 8; + I4[i * 6 + j] = inArray_pad_mov + (h4 + i) * iw_pad * 8 + (w4 + j) * 8; + I5[i * 6 + j] = inArray_pad_mov + (h5 + i) * iw_pad * 8 + (w5 + j) * 8; + I6[i * 6 + j] = inArray_pad_mov + (h6 + i) * iw_pad * 8 + (w6 + j) * 8; + I7[i * 6 + j] = inArray_pad_mov + (h7 + i) * iw_pad * 8 + (w7 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + trans_I_4x4_3x3(Iw_ptr, I0); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw1[i]; + } + trans_I_4x4_3x3(Iw_ptr, I1); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw2[i]; + } + trans_I_4x4_3x3(Iw_ptr, I2); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw3[i]; + } + trans_I_4x4_3x3(Iw_ptr, I3); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw4[i]; + } + trans_I_4x4_3x3(Iw_ptr, I4); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw5[i]; + } + trans_I_4x4_3x3(Iw_ptr, I5); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw6[i]; + } + trans_I_4x4_3x3(Iw_ptr, I6); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw7[i]; + } + trans_I_4x4_3x3(Iw_ptr, I7); + for (U32 i = 0; i < 36; i++) { + F16 *itm = itmArray_mov + i * ic * 8 * 8; + + // for (U32 c8 = 0; c8 < 8; c8++) { + // itm[c8*8] = Iw0[i][c8]; + // itm[c8*8 + 1] = Iw1[i][c8]; + // itm[c8*8 + 2] = Iw2[i][c8]; + // itm[c8*8 + 3] = Iw3[i][c8]; + // itm[c8*8 + 4] = Iw4[i][c8]; + // itm[c8*8 + 5] = Iw5[i][c8]; + // itm[c8*8 + 6] = Iw6[i][c8]; + // itm[c8*8 + 7] = Iw7[i][c8]; + // } + + float16x8_t v0 = vld1q_f16(Iw0[i]); + float16x8_t v1 = vld1q_f16(Iw1[i]); + float16x8_t v2 = vld1q_f16(Iw2[i]); + float16x8_t v3 = vld1q_f16(Iw3[i]); + float16x8_t v4 = vld1q_f16(Iw4[i]); + float16x8_t v5 = vld1q_f16(Iw5[i]); + float16x8_t v6 = vld1q_f16(Iw6[i]); + float16x8_t v7 = vld1q_f16(Iw7[i]); + vst1q_f16(itm, + vzip1q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(itm + 8, + vzip2q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 2, + vzip1q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 3, + vzip2q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 4, + vzip1q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 5, + vzip2q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 6, + vzip1q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 7, + vzip2q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + } + } + for (I32 o = 0; o < I32(oc - 1); o += 2) { + const F16 *b_0 = biasArray + o * 8; + const F16 
*b_1 = b_0 + 8; + F16 *itm_0 = itmArray; + // dot prod + // (6*6)*C*c8*hw8 times O*(6*6)*C*c8*o16 = O*(6*6)*hw8*o16 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__("mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr d0, [%[in]]\n" // in_hw0 + "eor v4.16b, v4.16b, v4.16b\n" // out_o0hw1 + "ldr x1, [%[in], #8]\n" + "eor v6.16b, v6.16b, v6.16b\n" // out_o0hw2 + "ins v0.d[1], x1\n" + "eor v8.16b, v8.16b, v8.16b\n" // out_o0hw3 + "ldr d18, [%[f]]\n" // f_o0c0 + "eor v10.16b, v10.16b, v10.16b\n" // out_o0hw4 + "ldr x2, [%[f], #8]\n" + "eor v12.16b, v12.16b, v12.16b\n" // out_o0hw5 + "ins v18.d[1], x2\n" + "eor v14.16b, v14.16b, v14.16b\n" // out_o0hw6 + "ldr d19, [%[f], #16]\n" // f_o1c0 + "eor v16.16b, v16.16b, v16.16b\n" // out_o0hw7 + "ldr x3, [%[f], #24]\n" + "eor v3.16b, v3.16b, v3.16b\n" // out_o1hw0 + "ins v19.d[1], x3\n" + "eor v5.16b, v5.16b, v5.16b\n" // out_o1hw1 + "eor v7.16b, v7.16b, v7.16b\n" // out_o1hw2 + "eor v9.16b, v9.16b, v9.16b\n" // out_o1hw3 + "eor v11.16b, v11.16b, v11.16b\n" // out_o1hw4 + "eor v13.16b, v13.16b, v13.16b\n" // out_o1hw5 + "eor v15.16b, v15.16b, v15.16b\n" // out_o1hw6 + "eor v17.16b, v17.16b, v17.16b\n" // out_o1hw7 + "0:\n" + "ldr d1, [%[in], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr x1, [%[in], #24]\n" + "fmla v4.8h, v18.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v6.8h, v18.8h, v0.h[2]\n" + "ldr d20, [%[f], #32]\n" // f_o0c0 + "fmla v8.8h, v18.8h, v0.h[3]\n" + "ldr x2, [%[f], #40]\n" + "fmla v10.8h, v18.8h, v0.h[4]\n" + "ins v20.d[1], x2\n" + "fmla v12.8h, v18.8h, v0.h[5]\n" + "ldr d21, [%[f], #48]\n" // f_o1c0 + "fmla v14.8h, v18.8h, v0.h[6]\n" + "ldr x3, [%[f], #56]\n" + "fmla v16.8h, v18.8h, v0.h[7]\n" + "ins v21.d[1], x3\n" + "fmla v3.8h, v19.8h, v0.h[0]\n" + "fmla v5.8h, v19.8h, v0.h[1]\n" + "fmla v7.8h, v19.8h, v0.h[2]\n" + "fmla v9.8h, v19.8h, v0.h[3]\n" + "fmla v11.8h, v19.8h, v0.h[4]\n" + "fmla v13.8h, v19.8h, v0.h[5]\n" + "fmla v15.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + + "ldr d0, [%[in], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr x1, [%[in], #40]\n" + "fmla v4.8h, v20.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v6.8h, v20.8h, v1.h[2]\n" + "ldr d18, [%[f], #64]\n" // f_o0c0 + "fmla v8.8h, v20.8h, v1.h[3]\n" + "ldr x2, [%[f], #72]\n" + "fmla v10.8h, v20.8h, v1.h[4]\n" + "ins v18.d[1], x2\n" + "fmla v12.8h, v20.8h, v1.h[5]\n" + "ldr d19, [%[f], #80]\n" // f_o1c0 + "fmla v14.8h, v20.8h, v1.h[6]\n" + "ldr x3, [%[f], #88]\n" + "fmla v16.8h, v20.8h, v1.h[7]\n" + "ins v19.d[1], x3\n" + "fmla v3.8h, v21.8h, v1.h[0]\n" + "add %[in], %[in], #32\n" + "fmla v5.8h, v21.8h, v1.h[1]\n" + "add %[f], %[f], #64\n" + "fmla v7.8h, v21.8h, v1.h[2]\n" + "subs x0, x0, #2\n" + "fmla v9.8h, v21.8h, v1.h[3]\n" + "fmla v11.8h, v21.8h, v1.h[4]\n" + "fmla v13.8h, v21.8h, v1.h[5]\n" + "fmla v15.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "bne 0b\n" + "st1 { v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" + "st1 { v6.8h, v7.8h, v8.8h, v9.8h}, [%[out]], #64\n" + "st1 {v10.8h, v11.8h, v12.8h, v13.8h}, [%[out]], #64\n" + "st1 {v14.8h, v15.8h, v16.8h, v17.8h}, [%[out]], #64\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "x0", "x1", "x2", + "x3"); + } + // out trans + // O*(6*6)*hw8*o16 => NOHWo8 + for (U32 hw8 = 0; hw8 < 8; hw8++) { + U32 h = (hw + hw8) / tile_w; 
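+ // hw + hw8 is the linear tile index; h and w locate the tile in the tile grid, and each tile covers a 4x4 patch of output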
+ U32 w = (hw + hw8) % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + F16 *out_1 = out_0 + oh * ow * 8; + U32 otm_off_0 = o * 8 * 36 * 8 + hw8 * 16; + U32 otm_off_1 = otm_off_0 + 8; + + F16 *Ow_0[36]; + F16 *Ow_1[36]; + F16 *O_0[16]; + F16 *O_1[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 8 * 16; + Ow_1[idx] = otmArray + otm_off_1 + idx * 8 * 16; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = out_1 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + if (oc & 1) { + F16 *itm_0 = itmArray; + const F16 *ftm_0 = filterArray + (oc - 1) * 36 * ic * 8 * 8; + F16 *otm_0 = otmArray + (oc - 1) * 36 * 8 * 8; + const F16 *b_0 = biasArray + (oc - 1) * 8; + // dot prod + // (6*6)*C*c8*hw8 times O*(6*6)*C*c8*o8 = O*(6*6)*hw8*o8 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__("mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr d0, [%[in]]\n" // in_hw0 + "eor v3.16b, v3.16b, v3.16b\n" // out_o0hw1 + "ldr x1, [%[in], #8]\n" + "eor v4.16b, v4.16b, v4.16b\n" // out_o0hw2 + "ins v0.d[1], x1\n" + "eor v5.16b, v5.16b, v5.16b\n" // out_o0hw3 + "ldr d18, [%[f]]\n" // f_o0c0 + "eor v6.16b, v6.16b, v6.16b\n" // out_o0hw4 + "ldr x2, [%[f], #8]\n" + "eor v7.16b, v7.16b, v7.16b\n" // out_o0hw5 + "ins v18.d[1], x2\n" + "eor v8.16b, v8.16b, v8.16b\n" // out_o0hw6 + "eor v9.16b, v9.16b, v9.16b\n" // out_o0hw7 + "0:\n" + "ldr d1, [%[in], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr x1, [%[in], #24]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ldr d20, [%[f], #16]\n" // f_o0c0 + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr x2, [%[f], #24]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "ins v20.d[1], x2\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "fmla v8.8h, v18.8h, v0.h[6]\n" + "subs x0, x0, #2\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + + "ldr d0, [%[in], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr x1, [%[in], #40]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr d18, [%[f], #32]\n" // f_o0c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "ldr x2, [%[f], #40]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "ins v18.d[1], x2\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "add %[in], %[in], #32\n" + "fmla v8.8h, v20.8h, v1.h[6]\n" + "add %[f], %[f], #32\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "bne 0b\n" + "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" + "st1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%[out]], #64\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v18", "v20", "x0", "x1", "x2"); + } + // out trans + // O*(6*6)*hw8*o8 => NOWHo8 + for (U32 hw8 = 0; hw8 < 8; hw8++) { + U32 h = (hw + hw8) / tile_w; + U32 w = (hw + hw8) % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + (oc - 1) * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + U32 otm_off_0 = (oc - 1) * 8 * 36 * 8 + hw8 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 8 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { 
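+ // gather the 16 NCHWc8 destination pointers of this 4x4 output tile for the inverse transform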
+ O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + } + + // tiles_reminder % 8 / 4 + I32 tiles_s = (tiles / 8) * 8; + for (I32 hw = tiles_s; hw < tiles - 3; hw += 4) { + const F16 *ftm_0 = filterArray; + F16 *otm_0 = otmArray; + // in trans + // NCHWc8 => (6*6)*C*c8*hw4 + for (U32 c = 0; c < ic; c++) { + F16 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F16 *Iw_ptr[36]; + F16 Iw0[36][8]; + F16 *I0[36]; + F16 Iw1[36][8]; + F16 *I1[36]; + F16 Iw2[36][8]; + F16 *I2[36]; + F16 Iw3[36][8]; + F16 *I3[36]; + F16 *itmArray_mov = itmArray + c * 8 * 4; + U32 h0 = (hw / tile_w) * 4; + U32 w0 = (hw % tile_w) * 4; + U32 h1 = ((hw + 1) / tile_w) * 4; + U32 w1 = ((hw + 1) % tile_w) * 4; + U32 h2 = ((hw + 2) / tile_w) * 4; + U32 w2 = ((hw + 2) % tile_w) * 4; + U32 h3 = ((hw + 3) / tile_w) * 4; + U32 w3 = ((hw + 3) % tile_w) * 4; + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + I1[i * 6 + j] = inArray_pad_mov + (h1 + i) * iw_pad * 8 + (w1 + j) * 8; + I2[i * 6 + j] = inArray_pad_mov + (h2 + i) * iw_pad * 8 + (w2 + j) * 8; + I3[i * 6 + j] = inArray_pad_mov + (h3 + i) * iw_pad * 8 + (w3 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + trans_I_4x4_3x3(Iw_ptr, I0); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw1[i]; + } + trans_I_4x4_3x3(Iw_ptr, I1); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw2[i]; + } + trans_I_4x4_3x3(Iw_ptr, I2); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw3[i]; + } + trans_I_4x4_3x3(Iw_ptr, I3); + for (U32 i = 0; i < 36; i++) { + F16 *itm = itmArray_mov + i * ic * 8 * 4; + + // for (U32 c8 = 0; c8 < 8; c8++) { + // itm[c8*4] = Iw0[i][c8]; + // itm[c8*4 + 1] = Iw1[i][c8]; + // itm[c8*4 + 2] = Iw2[i][c8]; + // itm[c8*4 + 3] = Iw3[i][c8]; + // } + + __asm__ __volatile__("ldr q0, [%[in_0]]\n" + "ldr q1, [%[in_1]]\n" + "ldr q2, [%[in_2]]\n" + "ldr q3, [%[in_3]]\n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[itm]]\n" + : [itm] "+r"(itm) + : [in_0] "r"(Iw0[i]), [in_1] "r"(Iw1[i]), + [in_2] "r"(Iw2[i]), [in_3] "r"(Iw3[i]) + : "memory", "cc", "v0", "v1", "v2", "v3"); + } + } + for (I32 o = 0; o < I32(oc - 1); o += 2) { + const F16 *b_0 = biasArray + o * 8; + const F16 *b_1 = b_0 + 8; + F16 *itm_0 = itmArray; + // dot prod + // (6*6)*C*c8*hw4 times O*(6*6)*C*c8*o16 = O*(6*6)*hw4*o16 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__("mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr d0, [%[in]]\n" // in_hw0 + "eor v4.16b, v4.16b, v4.16b\n" // out_o0hw1 + "ldr d18, [%[f]]\n" // f_o0c0 + "eor v6.16b, v6.16b, v6.16b\n" // out_o0hw2 + "ldr x2, [%[f], #8]\n" // f_o0c0 + "eor v8.16b, v8.16b, v8.16b\n" // out_o0hw3 + "ins v18.d[1], x2\n" // f_o0c0 + "ldr d19, [%[f], #16]\n" // f_o1c0 + "eor v3.16b, v3.16b, v3.16b\n" // out_o1hw0 + "ldr x3, [%[f], #24]\n" // f_o1c0 + "eor v5.16b, v5.16b, v5.16b\n" // out_o1hw1 + "ins v19.d[1], x3\n" // f_o1c0 + "eor v7.16b, v7.16b, v7.16b\n" // out_o1hw2 + "eor v9.16b, v9.16b, v9.16b\n" // out_o1hw3 + "0:\n" + "ldr d1, [%[in], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f], #32]\n" // f_o0c0 + "fmla v4.8h, v18.8h, v0.h[1]\n" + "ldr x2, [%[f], #40]\n" // f_o0c0 + "fmla v6.8h, v18.8h, v0.h[2]\n" + "ins v20.d[1], x2\n" // f_o0c0 + "fmla v8.8h, v18.8h, v0.h[3]\n" + "ldr d21, [%[f], #48]\n" // f_o1c0 + "fmla v3.8h, v19.8h, v0.h[0]\n" + "ldr 
x3, [%[f], #56]\n" // f_o1c0 + "fmla v5.8h, v19.8h, v0.h[1]\n" + "ins v21.d[1], x3\n" // f_o1c0 + "fmla v7.8h, v19.8h, v0.h[2]\n" + "subs x0, x0, #2\n" + "fmla v9.8h, v19.8h, v0.h[3]\n" + + "ldr d0, [%[in], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f], #64]\n" // f_o0c0 + "fmla v4.8h, v20.8h, v1.h[1]\n" + "ldr x2, [%[f], #72]\n" // f_o0c0 + "fmla v6.8h, v20.8h, v1.h[2]\n" + "ins v18.d[1], x2\n" // f_o0c0 + "fmla v8.8h, v20.8h, v1.h[3]\n" + "ldr d19, [%[f], #80]\n" // f_o1c0 + "fmla v3.8h, v21.8h, v1.h[0]\n" + "ldr x3, [%[f], #88]\n" // f_o1c0 + "fmla v5.8h, v21.8h, v1.h[1]\n" + "ins v19.d[1], x3\n" // f_o1c0 + "fmla v7.8h, v21.8h, v1.h[2]\n" + "add %[in], %[in], #16\n" + "fmla v9.8h, v21.8h, v1.h[3]\n" + "add %[f], %[f], #64\n" + "bne 0b\n" + "st1 { v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" + "st1 { v6.8h, v7.8h, v8.8h, v9.8h}, [%[out]], #64\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v18", "v19", "v20", "v21", "x0", "x1", + "x2", "x3"); + } + // out trans + // O*(6*6)*hw4*o16 => NOWHo8 + for (U32 hw4 = 0; hw4 < 4; hw4++) { + U32 h = (hw + hw4) / tile_w; + U32 w = (hw + hw4) % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + F16 *out_1 = out_0 + oh * ow * 8; + U32 otm_off_0 = o * 8 * 36 * 4 + hw4 * 16; + U32 otm_off_1 = otm_off_0 + 8; + + F16 *Ow_0[36]; + F16 *Ow_1[36]; + F16 *O_0[16]; + F16 *O_1[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 4 * 16; + Ow_1[idx] = otmArray + otm_off_1 + idx * 4 * 16; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = out_1 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + if (oc & 1) { + F16 *itm_0 = itmArray; + const F16 *ftm_0 = filterArray + (oc - 1) * 8 * 36 * ic * 8; + F16 *otm_0 = otmArray + (oc - 1) * 8 * 36 * 4; + const F16 *b_0 = biasArray + (oc - 1) * 8; + // dot prod + // (6*6)*C*c8*hw4 times O*(6*6)*C*c8*o8 = O*(6*6)*hw4*o8 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__("mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr d0, [%[in]]\n" // in_hw0 + "eor v3.16b, v3.16b, v3.16b\n" // out_o0hw1 + "ldr d18, [%[f]]\n" // f_o0c0 + "eor v4.16b, v4.16b, v4.16b\n" // out_o0hw2 + "ldr x2, [%[f], #8]\n" + "eor v5.16b, v5.16b, v5.16b\n" // out_o0hw3 + "ins v18.d[1], x2\n" + "0:\n" + "ldr d1, [%[in], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f], #16]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr x2, [%[f], #24]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ins v20.d[1], x2\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "subs x0, x0, #2\n" + + "ldr d0, [%[in], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f], #32]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr x2, [%[f], #40]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ins v18.d[1], x2\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in], %[in], #16\n" + "add %[f], %[f], #32\n" + "bne 0b\n" + "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", 
"v4", "v5", + "v18", "v20", "x0", "x2"); + } + // out trans + // O*(6*6)*hw4*o8 => NOWHo8 + for (U32 hw4 = 0; hw4 < 4; hw4++) { + U32 h = (hw + hw4) / tile_w; + U32 w = (hw + hw4) % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + (oc - 1) * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + U32 otm_off_0 = (oc - 1) * 8 * 36 * 4 + hw4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 4 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + } + + // tiles_reminder % 4 + tiles_s = (tiles / 4) * 4; + for (I32 hw = tiles_s; hw < tiles; hw++) { + const F16 *ftm_0 = filterArray; + F16 *otm_0 = otmArray; + // in trans + // NCHWc8 => (6*6)*C*c8*hw1 + for (U32 c = 0; c < ic; c++) { + F16 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F16 *Iw_ptr[36]; + F16 Iw0[36][8]; + F16 *I0[36]; + F16 *itmArray_mov = itmArray + c * 8; + U32 h0 = (hw / tile_w) * 4; + U32 w0 = (hw % tile_w) * 4; + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + trans_I_4x4_3x3(Iw_ptr, I0); + for (U32 i = 0; i < 36; i++) { + F16 *itm = itmArray_mov + i * ic * 8; + + // for (U32 c8 = 0; c8 < 8; c8++) { + // itm[c8] = Iw0[i][c8]; + // } + memcpy(itm, Iw0[i], 8 * bytesOf(idt)); + } + } + for (I32 o = 0; o < I32(oc - 1); o += 2) { + const F16 *b_0 = biasArray + o * 8; + const F16 *b_1 = b_0 + 8; + F16 *itm_0 = itmArray; + // dot prod + // (6*6)*C*c8*hw1 times O*(6*6)*C*c8*o16 = O*(6*6)*hw1*o16 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__("mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr h0, [%[in]]\n" // in_hw0 + "eor v3.16b, v3.16b, v3.16b\n" // out_o1hw0 + "ldr d18, [%[f]]\n" // f_o0c0 + "ldr x2, [%[f], #8]\n" // f_o0c0 + "ins v18.d[1], x2\n" // f_o0c0 + "ldr d19, [%[f], #16]\n" // f_o1c0 + "ldr x3, [%[f], #24]\n" // f_o1c0 + "ins v19.d[1], x3\n" // f_o1c0 + "0:\n" + "ldr h1, [%[in], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f], #32]\n" // f_o0c0 + "fmla v3.8h, v19.8h, v0.h[0]\n" + "ldr x2, [%[f], #40]\n" // f_o0c0 + "ins v20.d[1], x2\n" // f_o0c0 + "ldr d21, [%[f], #48]\n" // f_o1c0 + "subs x0, x0, #2\n" + "ldr x3, [%[f], #56]\n" // f_o1c0 + "ins v21.d[1], x3\n" // f_o1c0 + + "ldr h0, [%[in], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f], #64]\n" // f_o0c0 + "fmla v3.8h, v21.8h, v1.h[0]\n" + "ldr x2, [%[f], #72]\n" // f_o0c0 + "ins v18.d[1], x2\n" // f_o0c0 + "ldr d19, [%[f], #80]\n" // f_o1c0 + "add %[in], %[in], #4\n" + "ldr x3, [%[f], #88]\n" // f_o1c0 + "ins v19.d[1], x3\n" // f_o1c0 + "add %[f], %[f], #64\n" + "bne 0b\n" + "st1 {v2.8h, v3.8h}, [%[out]], #32\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v18", "v19", + "v20", "v21", "x0", "x2", "x3"); + } + // out trans + // O*(6*6)*hw1*o16 => NOWHo8 + U32 h = hw / tile_w; + U32 w = hw % tile_w; + F16 *out_0 = + outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + h * 4 * ow * 8 + w * 4 * 8; + F16 *out_1 = out_0 + oh * ow * 8; + U32 otm_off_0 = o * 8 * 36; + U32 otm_off_1 = otm_off_0 + 8; + + F16 *Ow_0[36]; + F16 *Ow_1[36]; + F16 *O_0[16]; + F16 *O_1[16]; + for 
(U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 16; + Ow_1[idx] = otmArray + otm_off_1 + idx * 16; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = out_1 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + if (oc & 1) { + F16 *itm_0 = itmArray; + const F16 *ftm_0 = filterArray + (oc - 1) * 8 * 36 * ic * 8; + F16 *otm_0 = otmArray + (oc - 1) * 8 * 36; + const F16 *b_0 = biasArray + (oc - 1) * 8; + // dot prod + // (6*6)*C*c8*hw1 times O*(6*6)*C*c8*o8 = O*(6*6)*hw1*o8 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__( + "mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr s0, [%[in]]\n" // in_hw0 + "ldr d18, [%[f]]\n" // f_o0c0 + "ldr x2, [%[f], #8]\n" + "ins v18.d[1], x2\n" + "0:\n" + "ldr h1, [%[in], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f], #16]\n" // f_o0c0 + "ldr x2, [%[f], #24]\n" + "ins v20.d[1], x2\n" + "subs x0, x0, #2\n" + + "ldr h0, [%[in], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f], #32]\n" // f_o0c0 + "ldr x2, [%[f], #40]\n" + "ins v18.d[1], x2\n" + "add %[in], %[in], #4\n" + "add %[f], %[f], #32\n" + "bne 0b\n" + "st1 {v2.8h}, [%[out]], #16\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v18", "v20", "x0", "x2"); + } + // out trans + // O*(6*6)*hw1*o8 => NOWHo8 + U32 h = hw / tile_w; + U32 w = hw % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + (oc - 1) * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + U32 otm_off_0 = (oc - 1) * 8 * 36; + + F16 *Ow_0[36]; + F16 *O_0[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A76.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A76.cpp new file mode 100644 index 00000000..a47a5952 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A76.cpp @@ -0,0 +1,725 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/convolution_winograd_transform.h" +#include "cpu/arm/fp16/convolution_winograd.h" + +EE convolution_winograd_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if (fdf != DF_HWNCN16) { + CHECK_STATUS(NOT_MATCH); + } + if (!(fh == 6 && fw == 6)) { + CHECK_STATUS(NOT_SUPPORTED); + } + + oc /= 8; + ic /= 8; + + U32 tile_h = (oh + 3) / 4; + U32 tile_w = (ow + 3) / 4; + // num of 6x6 tiles + I32 tiles = tile_h * tile_w; + U32 pad_left = paddingL; + U32 pad_right = paddingR + (tile_w * 4 - ow); + U32 pad_w_mod_4 = tile_w * 4 - ow; + U32 pad_top = paddingT; + U32 pad_bottom = paddingB + (tile_h * 4 - oh); + U32 pad_h_mod_4 = tile_h * 4 - oh; + U32 ih_pad = ih + pad_top + pad_bottom; + U32 iw_pad = iw + pad_left + pad_right; + // tmp = in_pad + itm + otm + // in_pad: ic*ih_pad*iw_pad*8 + // itm: 6*6*ic*8*8 + // otm: oc*6*6*8*8 + F16 *inArray_pad = (F16 *)tmp; + F16 *itmArray = inArray_pad + ic * ih_pad * iw_pad * 8; + F16 *otmArray = itmArray + 6 * 6 * ic * 8 * 8; + + EE ret = SUCCESS; + // copy input into a input with padding + for (U32 n = 0; n < in; n++) { + F16 *inArray_pad_mov = inArray_pad; + F16 *inArray_mov = inArray + n * ic * ih * iw * 8; + for (U32 c = 0; c < ic; c++) { + memset(inArray_pad_mov, 0, pad_top * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_top * iw_pad * 8; + for (U32 h = pad_top; h < ih_pad - pad_bottom; h++) { + memset(inArray_pad_mov, 0, pad_left * 8 * bytesOf(idt)); + inArray_pad_mov += pad_left * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, pad_right * 8 * bytesOf(idt)); + inArray_pad_mov += pad_right * 8; + } + memset(inArray_pad_mov, 0, pad_bottom * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_bottom * iw_pad * 8; + } + + // tiles / 8 + for (I32 hw = 0; hw < tiles - 7; hw += 8) { + const F16 *ftm_0 = filterArray; + F16 *otm_0 = otmArray; + // in trans + // NCHWc8 => (6*6)*C*c8*hw8 + for (U32 c = 0; c < ic; c++) { + F16 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F16 *itmArray_mov = itmArray + c * 8 * 8; + F16 *Iw_ptr[36]; + F16 Iw[8][36][8]; + F16 *I[8][36]; + U32 h[8]; + U32 w[8]; + for (U32 index = 0; index < 8; index++) { + h[index] = ((hw + index) / tile_w) * 4; + w[index] = ((hw + index) % tile_w) * 4; + } + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + for (U32 index = 0; index < 8; index++) { + I[index][i * 6 + j] = + 
inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8; + } + } + } + for (U32 index = 0; index < 8; index++) { + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw[index][i]; + } + trans_I_4x4_3x3(Iw_ptr, I[index]); + } + for (U32 i = 0; i < 36; i++) { + F16 *itm = itmArray_mov + i * ic * 8 * 8; + float16x8_t v0 = vld1q_f16(Iw[0][i]); + float16x8_t v1 = vld1q_f16(Iw[1][i]); + float16x8_t v2 = vld1q_f16(Iw[2][i]); + float16x8_t v3 = vld1q_f16(Iw[3][i]); + float16x8_t v4 = vld1q_f16(Iw[4][i]); + float16x8_t v5 = vld1q_f16(Iw[5][i]); + float16x8_t v6 = vld1q_f16(Iw[6][i]); + float16x8_t v7 = vld1q_f16(Iw[7][i]); + vst1q_f16(itm, + vzip1q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(itm + 8, + vzip2q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 2, + vzip1q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 3, + vzip2q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 4, + vzip1q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 5, + vzip2q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 6, + vzip1q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 7, + vzip2q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + } + } + for (I32 o = 0; o < I32(oc - 1); o += 2) { + const F16 *b_0 = biasArray + o * 8; + const F16 *b_1 = b_0 + 8; + F16 *itm_0 = itmArray; + // dot prod + // (6*6)*C*c8*hw8 times O*(6*6)*C*c8*o16 = O*(6*6)*hw8*o16 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__("mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr q0, [%[in]]\n" // in_hw0 + "eor v4.16b, v4.16b, v4.16b\n" // out_o0hw1 + "eor v6.16b, v6.16b, v6.16b\n" // out_o0hw2 + "eor v8.16b, v8.16b, v8.16b\n" // out_o0hw3 + "ldr q18, [%[f]]\n" // f_o0c0 + "eor v10.16b, v10.16b, v10.16b\n" // out_o0hw4 + "eor v12.16b, v12.16b, v12.16b\n" // out_o0hw5 + "eor v14.16b, v14.16b, v14.16b\n" // out_o0hw6 + "ldr q19, [%[f], #16]\n" // f_o1c0 + "eor v16.16b, v16.16b, v16.16b\n" // out_o0hw7 + "eor v3.16b, v3.16b, v3.16b\n" // out_o1hw0 + "eor v5.16b, v5.16b, v5.16b\n" // out_o1hw1 + "eor v7.16b, v7.16b, v7.16b\n" // out_o1hw2 + "eor v9.16b, v9.16b, v9.16b\n" // out_o1hw3 + "eor v11.16b, v11.16b, v11.16b\n" // out_o1hw4 + "eor v13.16b, v13.16b, v13.16b\n" // out_o1hw5 + "eor v15.16b, v15.16b, v15.16b\n" // out_o1hw6 + "eor v17.16b, v17.16b, v17.16b\n" // out_o1hw7 + "0:\n" + "ldr q1, [%[in], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v4.8h, v18.8h, v0.h[1]\n" + "ldr q20, [%[f], #32]\n" // f_o0c0 + "fmla v6.8h, v18.8h, v0.h[2]\n" + "fmla v8.8h, v18.8h, v0.h[3]\n" + "ldr q21, [%[f], #48]\n" // f_o1c0 + "fmla v10.8h, v18.8h, v0.h[4]\n" + "fmla v12.8h, v18.8h, v0.h[5]\n" + "fmla v14.8h, v18.8h, v0.h[6]\n" + "fmla v16.8h, v18.8h, v0.h[7]\n" + "fmla v3.8h, v19.8h, v0.h[0]\n" + "fmla v5.8h, v19.8h, v0.h[1]\n" + "fmla v7.8h, v19.8h, v0.h[2]\n" + "fmla v9.8h, v19.8h, v0.h[3]\n" + "fmla v11.8h, v19.8h, v0.h[4]\n" + "fmla v13.8h, v19.8h, v0.h[5]\n" 
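+ // even registers v2-v16 hold out_o0hw0-7, odd registers v3-v17 hold out_o1hw0-7 (two 8-channel output blocks per pass)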
+ "fmla v15.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + + "ldr q0, [%[in], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "fmla v4.8h, v20.8h, v1.h[1]\n" + "ldr q18, [%[f], #64]\n" // f_o0c0 + "fmla v6.8h, v20.8h, v1.h[2]\n" + "fmla v8.8h, v20.8h, v1.h[3]\n" + "ldr q19, [%[f], #80]\n" // f_o1c0 + "fmla v10.8h, v20.8h, v1.h[4]\n" + "fmla v12.8h, v20.8h, v1.h[5]\n" + "fmla v14.8h, v20.8h, v1.h[6]\n" + "fmla v16.8h, v20.8h, v1.h[7]\n" + "fmla v3.8h, v21.8h, v1.h[0]\n" + "add %[in], %[in], #32\n" + "fmla v5.8h, v21.8h, v1.h[1]\n" + "add %[f], %[f], #64\n" + "fmla v7.8h, v21.8h, v1.h[2]\n" + "subs x0, x0, #2\n" + "fmla v9.8h, v21.8h, v1.h[3]\n" + "fmla v11.8h, v21.8h, v1.h[4]\n" + "fmla v13.8h, v21.8h, v1.h[5]\n" + "fmla v15.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "bne 0b\n" + "st1 { v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" + "st1 { v6.8h, v7.8h, v8.8h, v9.8h}, [%[out]], #64\n" + "st1 {v10.8h, v11.8h, v12.8h, v13.8h}, [%[out]], #64\n" + "st1 {v14.8h, v15.8h, v16.8h, v17.8h}, [%[out]], #64\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "x0"); + } + // out trans + // O*(6*6)*hw8*o16 => NOHWo8 + for (U32 hw8 = 0; hw8 < 8; hw8++) { + U32 h = (hw + hw8) / tile_w; + U32 w = (hw + hw8) % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + F16 *out_1 = out_0 + oh * ow * 8; + U32 otm_off_0 = o * 8 * 36 * 8 + hw8 * 16; + U32 otm_off_1 = otm_off_0 + 8; + + F16 *Ow_0[36]; + F16 *Ow_1[36]; + F16 *O_0[16]; + F16 *O_1[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 8 * 16; + Ow_1[idx] = otmArray + otm_off_1 + idx * 8 * 16; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = out_1 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + if (oc & 1) { + F16 *itm_0 = itmArray; + const F16 *ftm_0 = filterArray + (oc - 1) * 36 * ic * 8 * 8; + F16 *otm_0 = otmArray + (oc - 1) * 36 * 8 * 8; + const F16 *b_0 = biasArray + (oc - 1) * 8; + // dot prod + // (6*6)*C*c8*hw8 times O*(6*6)*C*c8*o8 = O*(6*6)*hw8*o8 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__("mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr q0, [%[in]]\n" // in_hw0 + "eor v3.16b, v3.16b, v3.16b\n" // out_o0hw1 + "eor v4.16b, v4.16b, v4.16b\n" // out_o0hw2 + "eor v5.16b, v5.16b, v5.16b\n" // out_o0hw3 + "ldr q18, [%[f]]\n" // f_o0c0 + "eor v6.16b, v6.16b, v6.16b\n" // out_o0hw4 + "eor v7.16b, v7.16b, v7.16b\n" // out_o0hw5 + "eor v8.16b, v8.16b, v8.16b\n" // out_o0hw6 + "eor v9.16b, v9.16b, v9.16b\n" // out_o0hw7 + "0:\n" + "ldr q1, [%[in], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ldr q20, [%[f], #16]\n" // f_o0c0 + "fmla v5.8h, v18.8h, v0.h[3]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "fmla v8.8h, v18.8h, v0.h[6]\n" + "subs x0, x0, #2\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + + "ldr q0, [%[in], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "fmla 
v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr q18, [%[f], #32]\n" // f_o0c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "add %[in], %[in], #32\n" + "fmla v8.8h, v20.8h, v1.h[6]\n" + "add %[f], %[f], #32\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "bne 0b\n" + "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" + "st1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%[out]], #64\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v18", "v20", "x0"); + } + // out trans + // O*(6*6)*hw8*o8 => NOWHo8 + for (U32 hw8 = 0; hw8 < 8; hw8++) { + U32 h = (hw + hw8) / tile_w; + U32 w = (hw + hw8) % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + (oc - 1) * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + U32 otm_off_0 = (oc - 1) * 8 * 36 * 8 + hw8 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 8 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + } + + // tiles_reminder % 8 / 4 + I32 tiles_s = (tiles / 8) * 8; + for (I32 hw = tiles_s; hw < tiles - 3; hw += 4) { + const F16 *ftm_0 = filterArray; + F16 *otm_0 = otmArray; + // in trans + // NCHWc8 => (6*6)*C*c8*hw4 + for (U32 c = 0; c < ic; c++) { + F16 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F16 *Iw_ptr[36]; + F16 Iw0[36][8]; + F16 *I0[36]; + F16 Iw1[36][8]; + F16 *I1[36]; + F16 Iw2[36][8]; + F16 *I2[36]; + F16 Iw3[36][8]; + F16 *I3[36]; + F16 *itmArray_mov = itmArray + c * 8 * 4; + U32 h0 = (hw / tile_w) * 4; + U32 w0 = (hw % tile_w) * 4; + U32 h1 = ((hw + 1) / tile_w) * 4; + U32 w1 = ((hw + 1) % tile_w) * 4; + U32 h2 = ((hw + 2) / tile_w) * 4; + U32 w2 = ((hw + 2) % tile_w) * 4; + U32 h3 = ((hw + 3) / tile_w) * 4; + U32 w3 = ((hw + 3) % tile_w) * 4; + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + I1[i * 6 + j] = inArray_pad_mov + (h1 + i) * iw_pad * 8 + (w1 + j) * 8; + I2[i * 6 + j] = inArray_pad_mov + (h2 + i) * iw_pad * 8 + (w2 + j) * 8; + I3[i * 6 + j] = inArray_pad_mov + (h3 + i) * iw_pad * 8 + (w3 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + trans_I_4x4_3x3(Iw_ptr, I0); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw1[i]; + } + trans_I_4x4_3x3(Iw_ptr, I1); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw2[i]; + } + trans_I_4x4_3x3(Iw_ptr, I2); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw3[i]; + } + trans_I_4x4_3x3(Iw_ptr, I3); + for (U32 i = 0; i < 36; i++) { + F16 *itm = itmArray_mov + i * ic * 8 * 4; + + // for (U32 c8 = 0; c8 < 8; c8++) { + // itm[c8*4] = Iw0[i][c8]; + // itm[c8*4 + 1] = Iw1[i][c8]; + // itm[c8*4 + 2] = Iw2[i][c8]; + // itm[c8*4 + 3] = Iw3[i][c8]; + // } + + __asm__ __volatile__("ldr q0, [%[in_0]]\n" + "ldr q1, [%[in_1]]\n" + "ldr q2, [%[in_2]]\n" + "ldr q3, [%[in_3]]\n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[itm]]\n" + : [itm] "+r"(itm) + : [in_0] "r"(Iw0[i]), [in_1] "r"(Iw1[i]), + [in_2] "r"(Iw2[i]), [in_3] "r"(Iw3[i]) + : "memory", "cc", "v0", "v1", "v2", "v3"); + } + } + for (I32 o = 0; o < I32(oc - 1); o += 2) { + const F16 *b_0 = biasArray + o * 8; + const F16 *b_1 = b_0 + 8; + F16 *itm_0 = itmArray; + // dot 
prod + // (6*6)*C*c8*hw4 times O*(6*6)*C*c8*o16 = O*(6*6)*hw4*o16 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__("mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr d0, [%[in]]\n" // in_hw0 + "eor v4.16b, v4.16b, v4.16b\n" // out_o0hw1 + "ldr q18, [%[f]]\n" // f_o0c0 + "eor v6.16b, v6.16b, v6.16b\n" // out_o0hw2 + "eor v8.16b, v8.16b, v8.16b\n" // out_o0hw3 + "ldr q19, [%[f], #16]\n" // f_o1c0 + "eor v3.16b, v3.16b, v3.16b\n" // out_o1hw0 + "eor v5.16b, v5.16b, v5.16b\n" // out_o1hw1 + "eor v7.16b, v7.16b, v7.16b\n" // out_o1hw2 + "eor v9.16b, v9.16b, v9.16b\n" // out_o1hw3 + "0:\n" + "ldr d1, [%[in], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f], #32]\n" // f_o0c0 + "fmla v4.8h, v18.8h, v0.h[1]\n" + "fmla v6.8h, v18.8h, v0.h[2]\n" + "fmla v8.8h, v18.8h, v0.h[3]\n" + "ldr q21, [%[f], #48]\n" // f_o1c0 + "fmla v3.8h, v19.8h, v0.h[0]\n" + "fmla v5.8h, v19.8h, v0.h[1]\n" + "fmla v7.8h, v19.8h, v0.h[2]\n" + "subs x0, x0, #2\n" + "fmla v9.8h, v19.8h, v0.h[3]\n" + + "ldr d0, [%[in], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f], #64]\n" // f_o0c0 + "fmla v4.8h, v20.8h, v1.h[1]\n" + "fmla v6.8h, v20.8h, v1.h[2]\n" + "fmla v8.8h, v20.8h, v1.h[3]\n" + "ldr q19, [%[f], #80]\n" // f_o1c0 + "fmla v3.8h, v21.8h, v1.h[0]\n" + "fmla v5.8h, v21.8h, v1.h[1]\n" + "fmla v7.8h, v21.8h, v1.h[2]\n" + "add %[in], %[in], #16\n" + "fmla v9.8h, v21.8h, v1.h[3]\n" + "add %[f], %[f], #64\n" + "bne 0b\n" + "st1 { v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" + "st1 { v6.8h, v7.8h, v8.8h, v9.8h}, [%[out]], #64\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v18", "v19", "v20", "v21", "x0"); + } + // out trans + // O*(6*6)*hw4*o16 => NOWHo8 + for (U32 hw4 = 0; hw4 < 4; hw4++) { + U32 h = (hw + hw4) / tile_w; + U32 w = (hw + hw4) % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + F16 *out_1 = out_0 + oh * ow * 8; + U32 otm_off_0 = o * 8 * 36 * 4 + hw4 * 16; + U32 otm_off_1 = otm_off_0 + 8; + + F16 *Ow_0[36]; + F16 *Ow_1[36]; + F16 *O_0[16]; + F16 *O_1[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 4 * 16; + Ow_1[idx] = otmArray + otm_off_1 + idx * 4 * 16; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = out_1 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + if (oc & 1) { + F16 *itm_0 = itmArray; + const F16 *ftm_0 = filterArray + (oc - 1) * 8 * 36 * ic * 8; + F16 *otm_0 = otmArray + (oc - 1) * 8 * 36 * 4; + const F16 *b_0 = biasArray + (oc - 1) * 8; + // dot prod + // (6*6)*C*c8*hw4 times O*(6*6)*C*c8*o8 = O*(6*6)*hw4*o8 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__( + "mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr d0, [%[in]]\n" // in_hw0 + "eor v3.16b, v3.16b, v3.16b\n" // out_o0hw1 + "ldr q18, [%[f]]\n" // f_o0c0 + "eor v4.16b, v4.16b, v4.16b\n" // out_o0hw2 + "eor v5.16b, v5.16b, v5.16b\n" // out_o0hw3 + "0:\n" + "ldr d1, [%[in], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f], #16]\n" // f_o0c0 + "fmla v3.8h, v18.8h, 
v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "subs x0, x0, #2\n" + + "ldr d0, [%[in], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f], #32]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in], %[in], #16\n" + "add %[f], %[f], #32\n" + "bne 0b\n" + "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v18", "v20", "x0"); + } + // out trans + // O*(6*6)*hw4*o8 => NOWHo8 + for (U32 hw4 = 0; hw4 < 4; hw4++) { + U32 h = (hw + hw4) / tile_w; + U32 w = (hw + hw4) % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + (oc - 1) * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + U32 otm_off_0 = (oc - 1) * 8 * 36 * 4 + hw4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 4 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + } + + // tiles_reminder % 4 + tiles_s = (tiles / 4) * 4; + for (I32 hw = tiles_s; hw < tiles; hw++) { + const F16 *ftm_0 = filterArray; + F16 *otm_0 = otmArray; + // in trans + // NCHWc8 => (6*6)*C*c8*hw1 + for (U32 c = 0; c < ic; c++) { + F16 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F16 *Iw_ptr[36]; + F16 Iw0[36][8]; + F16 *I0[36]; + F16 *itmArray_mov = itmArray + c * 8; + U32 h0 = (hw / tile_w) * 4; + U32 w0 = (hw % tile_w) * 4; + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + trans_I_4x4_3x3(Iw_ptr, I0); + for (U32 i = 0; i < 36; i++) { + F16 *itm = itmArray_mov + i * ic * 8; + + // for (U32 c8 = 0; c8 < 8; c8++) { + // itm[c8] = Iw0[i][c8]; + // } + memcpy(itm, Iw0[i], 8 * bytesOf(idt)); + } + } + for (I32 o = 0; o < I32(oc - 1); o += 2) { + const F16 *b_0 = biasArray + o * 8; + const F16 *b_1 = b_0 + 8; + F16 *itm_0 = itmArray; + // dot prod + // (6*6)*C*c8*hw1 times O*(6*6)*C*c8*o16 = O*(6*6)*hw1*o16 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__( + "mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr h0, [%[in]]\n" // in_hw0 + "eor v3.16b, v3.16b, v3.16b\n" // out_o1hw0 + "ldr q18, [%[f]]\n" // f_o0c0 + "ldr q19, [%[f], #16]\n" // f_o1c0 + "0:\n" + "ldr h1, [%[in], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f], #32]\n" // f_o0c0 + "fmla v3.8h, v19.8h, v0.h[0]\n" + "ldr q21, [%[f], #48]\n" // f_o1c0 + "subs x0, x0, #2\n" + + "ldr h0, [%[in], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f], #64]\n" // f_o0c0 + "fmla v3.8h, v21.8h, v1.h[0]\n" + "ldr q19, [%[f], #80]\n" // f_o1c0 + "add %[in], %[in], #4\n" + "add %[f], %[f], #64\n" + "bne 0b\n" + "st1 {v2.8h, v3.8h}, [%[out]], #32\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v18", "v19", "v20", "v21", "x0"); + } + // out trans + // O*(6*6)*hw1*o16 => NOWHo8 + U32 h = hw / tile_w; + U32 w = hw % tile_w; + F16 *out_0 = + outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + h * 4 * ow * 8 + w * 4 * 8; + F16 *out_1 = out_0 + oh * ow * 8; + U32 
otm_off_0 = o * 8 * 36; + U32 otm_off_1 = otm_off_0 + 8; + + F16 *Ow_0[36]; + F16 *Ow_1[36]; + F16 *O_0[16]; + F16 *O_1[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 16; + Ow_1[idx] = otmArray + otm_off_1 + idx * 16; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = out_1 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + if (oc & 1) { + F16 *itm_0 = itmArray; + const F16 *ftm_0 = filterArray + (oc - 1) * 8 * 36 * ic * 8; + F16 *otm_0 = otmArray + (oc - 1) * 8 * 36; + const F16 *b_0 = biasArray + (oc - 1) * 8; + // dot prod + // (6*6)*C*c8*hw1 times O*(6*6)*C*c8*o8 = O*(6*6)*hw1*o8 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__("mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr s0, [%[in]]\n" // in_hw0 + "ldr q18, [%[f]]\n" // f_o0c0 + "0:\n" + "ldr h1, [%[in], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f], #16]\n" // f_o0c0 + "subs x0, x0, #2\n" + + "ldr h0, [%[in], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f], #32]\n" // f_o0c0 + "add %[in], %[in], #4\n" + "add %[f], %[f], #32\n" + "bne 0b\n" + "st1 {v2.8h}, [%[out]], #16\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v18", "v20", "x0"); + } + // out trans + // O*(6*6)*hw1*o8 => NOWHo8 + U32 h = hw / tile_w; + U32 w = hw % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + (oc - 1) * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + U32 otm_off_0 = (oc - 1) * 8 * 36; + + F16 *Ow_0[36]; + F16 *O_0[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_winograd_transform.h b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_transform.h new file mode 100644 index 00000000..45c7206d --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_transform.h @@ -0,0 +1,508 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_WINOGRAD_TRANSFORM +#define _H_WINOGRAD_TRANSFORM + +#include +#include +#include "cpu/arm/fp16/arm_functions_fp16.h" + +inline void trans_W_4x4_3x3(F16 *Fw[36], F16 *const F[9]) +{ + F16 T[6][3][8]; + + float16x8_t v_01666 = vmovq_n_f16(0.1666666666666667f); + float16x8_t v_minus_01666 = vmovq_n_f16(-0.1666666666666667f); + float16x8_t v_00833 = vmovq_n_f16(0.0833333333333333f); + float16x8_t v_minus_00833 = vmovq_n_f16(-0.0833333333333333f); + float16x8_t v_004166 = vmovq_n_f16(0.0416666666666667f); + float16x8_t v_025 = vmovq_n_f16(0.25f); + + for (U32 i = 0; i < 3; i++) { + float16x8_t v_F0 = vld1q_f16(F[0 * 3 + i]); + float16x8_t v_F1 = vld1q_f16(F[1 * 3 + i]); + float16x8_t v_F2 = vld1q_f16(F[2 * 3 + i]); + + float16x8_t v_t0 = vmulq_f16(v_01666, v_F2); + float16x8_t v_t1 = vsubq_f16(vmulq_f16(v_minus_01666, v_F0), v_t0); + float16x8_t v_t2 = vfmaq_f16(v_t0, v_004166, v_F0); + + float16x8_t v_T0 = vmulq_f16(v_025, v_F0); + float16x8_t v_T1 = vfmaq_f16(v_t1, v_minus_01666, v_F1); + float16x8_t v_T2 = vfmaq_f16(v_t1, v_01666, v_F1); + float16x8_t v_T3 = vfmaq_f16(v_t2, v_00833, v_F1); + float16x8_t v_T4 = vfmaq_f16(v_t2, v_minus_00833, v_F1); + + vst1q_f16(T[0][i], v_T0); + vst1q_f16(T[1][i], v_T1); + vst1q_f16(T[2][i], v_T2); + vst1q_f16(T[3][i], v_T3); + vst1q_f16(T[4][i], v_T4); + vst1q_f16(T[5][i], v_F2); + } + for (U32 i = 0; i < 6; i++) { + float16x8_t v_T0 = vld1q_f16(T[i][0]); + float16x8_t v_T1 = vld1q_f16(T[i][1]); + float16x8_t v_T2 = vld1q_f16(T[i][2]); + + float16x8_t v_t0 = vmulq_f16(v_01666, v_T2); + float16x8_t v_t1 = vsubq_f16(vmulq_f16(v_minus_01666, v_T0), v_t0); + float16x8_t v_t2 = vfmaq_f16(v_t0, v_004166, v_T0); + + float16x8_t v_Fw0 = vmulq_f16(v_025, v_T0); + float16x8_t v_Fw1 = vfmaq_f16(v_t1, v_minus_01666, v_T1); + float16x8_t v_Fw2 = vfmaq_f16(v_t1, v_01666, v_T1); + float16x8_t v_Fw3 = vfmaq_f16(v_t2, v_00833, v_T1); + float16x8_t v_Fw4 = vfmaq_f16(v_t2, v_minus_00833, v_T1); + + vst1q_f16(Fw[i * 6 + 0], v_Fw0); + vst1q_f16(Fw[i * 6 + 1], v_Fw1); + vst1q_f16(Fw[i * 6 + 2], v_Fw2); + vst1q_f16(Fw[i * 6 + 3], v_Fw3); + vst1q_f16(Fw[i * 6 + 4], v_Fw4); + vst1q_f16(Fw[i * 6 + 5], v_T2); + } +} + +inline EE trans_O_4x4_3x3(F16 *const Ow[36], + F16 *O[16], + const F16 *bias, + U32 h, + U32 w, + U32 _pad_h_mod_4, + U32 _pad_w_mod_4, + U32 oh, + U32 ow, + ActivationParamSpec activationDesc) +{ + F16 T[4][6][8]; + // bias + float16x8_t v_b = vld1q_f16(bias); + + float16x8_t v_0 = vmovq_n_f16(0); + float16x8_t v_2 = vmovq_n_f16(2); + float16x8_t v_4 = vmovq_n_f16(4); + float16x8_t v_8 = vmovq_n_f16(8); + + for (U32 i = 0; i < 6; i++) { + float16x8_t v_Ow0 = vld1q_f16(Ow[i]); + float16x8_t v_Ow1 = vld1q_f16(Ow[1 * 6 + i]); + float16x8_t v_Ow2 = vld1q_f16(Ow[2 * 6 + i]); + float16x8_t v_Ow3 = vld1q_f16(Ow[3 * 6 + i]); + float16x8_t v_Ow4 = vld1q_f16(Ow[4 * 6 + i]); + float16x8_t v_Ow5 = vld1q_f16(Ow[5 * 6 + i]); + + float16x8_t v_t0 = vaddq_f16(v_Ow1, v_Ow2); + float16x8_t v_t1 = vaddq_f16(v_Ow3, v_Ow4); + float16x8_t v_t2 = vsubq_f16(v_Ow1, v_Ow2); + float16x8_t v_t3 = vsubq_f16(v_Ow3, v_Ow4); + + float16x8_t v_T0 = vaddq_f16(vaddq_f16(v_t0, v_t1), v_Ow0); + float16x8_t v_T1 = vfmaq_f16(v_t2, v_t3, v_2); + float16x8_t v_T2 = vfmaq_f16(v_t0, v_t1, v_4); + float16x8_t v_T3 = 
vaddq_f16(vfmaq_f16(v_t2, v_t3, v_8), v_Ow5); + + vst1q_f16(T[0][i], v_T0); + vst1q_f16(T[1][i], v_T1); + vst1q_f16(T[2][i], v_T2); + vst1q_f16(T[3][i], v_T3); + } + + U32 pad_h_mod_4 = 0, pad_w_mod_4 = 0; + if (h == oh && w == ow) { + pad_h_mod_4 = _pad_h_mod_4; + pad_w_mod_4 = _pad_w_mod_4; + } else if (h == oh) { + pad_h_mod_4 = _pad_h_mod_4; + } else if (w == ow) { + pad_w_mod_4 = _pad_w_mod_4; + } + + for (U32 i = 0; i < 4 - pad_h_mod_4; i++) { + float16x8_t v_T0 = vld1q_f16(T[i][0]); + float16x8_t v_T1 = vld1q_f16(T[i][1]); + float16x8_t v_T2 = vld1q_f16(T[i][2]); + float16x8_t v_T3 = vld1q_f16(T[i][3]); + float16x8_t v_T4 = vld1q_f16(T[i][4]); + float16x8_t v_T5 = vld1q_f16(T[i][5]); + + float16x8_t v_t0 = vaddq_f16(v_T1, v_T2); + float16x8_t v_t1 = vaddq_f16(v_T3, v_T4); + float16x8_t v_t2 = vsubq_f16(v_T1, v_T2); + float16x8_t v_t3 = vsubq_f16(v_T3, v_T4); + + float16x8_t v_O0 = vaddq_f16(vaddq_f16(v_t0, v_t1), v_T0); + float16x8_t v_O1 = vfmaq_f16(v_t2, v_t3, v_2); + float16x8_t v_O2 = vfmaq_f16(v_t0, v_t1, v_4); + float16x8_t v_O3 = vaddq_f16(vfmaq_f16(v_t2, v_t3, v_8), v_T5); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: { + if (pad_w_mod_4 == 0) { + vst1q_f16(O[i * 4 + 0], vaddq_f16(v_O0, v_b)); + vst1q_f16(O[i * 4 + 1], vaddq_f16(v_O1, v_b)); + vst1q_f16(O[i * 4 + 2], vaddq_f16(v_O2, v_b)); + vst1q_f16(O[i * 4 + 3], vaddq_f16(v_O3, v_b)); + } else if (pad_w_mod_4 == 1) { + vst1q_f16(O[i * 4 + 0], vaddq_f16(v_O0, v_b)); + vst1q_f16(O[i * 4 + 1], vaddq_f16(v_O1, v_b)); + vst1q_f16(O[i * 4 + 2], vaddq_f16(v_O2, v_b)); + } else if (pad_w_mod_4 == 2) { + vst1q_f16(O[i * 4 + 0], vaddq_f16(v_O0, v_b)); + vst1q_f16(O[i * 4 + 1], vaddq_f16(v_O1, v_b)); + } else if (pad_w_mod_4 == 3) { + vst1q_f16(O[i * 4 + 0], vaddq_f16(v_O0, v_b)); + } + break; + } + case ACTIVATION_RELU: { + if (pad_w_mod_4 == 0) { + vst1q_f16(O[i * 4 + 0], vmaxq_f16(vaddq_f16(v_O0, v_b), v_0)); + vst1q_f16(O[i * 4 + 1], vmaxq_f16(vaddq_f16(v_O1, v_b), v_0)); + vst1q_f16(O[i * 4 + 2], vmaxq_f16(vaddq_f16(v_O2, v_b), v_0)); + vst1q_f16(O[i * 4 + 3], vmaxq_f16(vaddq_f16(v_O3, v_b), v_0)); + } else if (pad_w_mod_4 == 1) { + vst1q_f16(O[i * 4 + 0], vmaxq_f16(vaddq_f16(v_O0, v_b), v_0)); + vst1q_f16(O[i * 4 + 1], vmaxq_f16(vaddq_f16(v_O1, v_b), v_0)); + vst1q_f16(O[i * 4 + 2], vmaxq_f16(vaddq_f16(v_O2, v_b), v_0)); + } else if (pad_w_mod_4 == 2) { + vst1q_f16(O[i * 4 + 0], vmaxq_f16(vaddq_f16(v_O0, v_b), v_0)); + vst1q_f16(O[i * 4 + 1], vmaxq_f16(vaddq_f16(v_O1, v_b), v_0)); + } else if (pad_w_mod_4 == 3) { + vst1q_f16(O[i * 4 + 0], vmaxq_f16(vaddq_f16(v_O0, v_b), v_0)); + } + break; + } + case ACTIVATION_SIGMOID: { + if (pad_w_mod_4 == 0) { + vst1q_f16(O[i * 4 + 0], vsigmoidq_f16(vaddq_f16(v_O0, v_b))); + vst1q_f16(O[i * 4 + 1], vsigmoidq_f16(vaddq_f16(v_O1, v_b))); + vst1q_f16(O[i * 4 + 2], vsigmoidq_f16(vaddq_f16(v_O2, v_b))); + vst1q_f16(O[i * 4 + 3], vsigmoidq_f16(vaddq_f16(v_O3, v_b))); + } else if (pad_w_mod_4 == 1) { + vst1q_f16(O[i * 4 + 0], vsigmoidq_f16(vaddq_f16(v_O0, v_b))); + vst1q_f16(O[i * 4 + 1], vsigmoidq_f16(vaddq_f16(v_O1, v_b))); + vst1q_f16(O[i * 4 + 2], vsigmoidq_f16(vaddq_f16(v_O2, v_b))); + } else if (pad_w_mod_4 == 2) { + vst1q_f16(O[i * 4 + 0], vsigmoidq_f16(vaddq_f16(v_O0, v_b))); + vst1q_f16(O[i * 4 + 1], vsigmoidq_f16(vaddq_f16(v_O1, v_b))); + } else if (pad_w_mod_4 == 3) { + vst1q_f16(O[i * 4 + 0], vsigmoidq_f16(vaddq_f16(v_O0, v_b))); + } + break; + } + default: + return NOT_SUPPORTED; + } + } + return SUCCESS; +} + +inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) 
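+// Both passes below apply the standard Winograd F(4x4, 3x3) input transform
+// Iw = B^T * I * B, first over rows and then over columns. For reference, the
+// temporaries t0..t5 factor the shared subexpressions of
+//
+//          |  4   0  -5   0   1   0 |
+//          |  0  -4  -4   1   1   0 |
+//   B^T =  |  0   4  -4  -1   1   0 |
+//          |  0  -2  -1   2   1   0 |
+//          |  0   2  -1  -2   1   0 |
+//          |  0   4   0  -5   0   1 |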
+{ + F16 T[6][6][8]; + + float16x8_t v_4 = vmovq_n_f16(4); + float16x8_t v_minus_4 = vmovq_n_f16(-4); + float16x8_t v_2 = vmovq_n_f16(2); + float16x8_t v_minus_5 = vmovq_n_f16(-5); + + for (U32 i = 0; i < 6; i++) { + float16x8_t v_I0 = vld1q_f16(I[0 * 6 + i]); + float16x8_t v_I1 = vld1q_f16(I[1 * 6 + i]); + float16x8_t v_I2 = vld1q_f16(I[2 * 6 + i]); + float16x8_t v_I3 = vld1q_f16(I[3 * 6 + i]); + float16x8_t v_I4 = vld1q_f16(I[4 * 6 + i]); + float16x8_t v_I5 = vld1q_f16(I[5 * 6 + i]); + + float16x8_t v_t0 = vfmaq_f16(v_I4, v_I2, v_minus_4); + float16x8_t v_t1 = vfmaq_f16(v_I3, v_I1, v_minus_4); + float16x8_t v_t2 = vsubq_f16(v_I4, v_I2); + float16x8_t v_t3 = vmulq_f16(vsubq_f16(v_I3, v_I1), v_2); + float16x8_t v_t4 = vfmaq_f16(v_I4, v_I0, v_4); + float16x8_t v_t5 = vfmaq_f16(v_I5, v_I1, v_4); + + float16x8_t v_T0 = vfmaq_f16(v_t4, v_I2, v_minus_5); + float16x8_t v_T1 = vaddq_f16(v_t1, v_t0); + float16x8_t v_T2 = vsubq_f16(v_t0, v_t1); + float16x8_t v_T3 = vaddq_f16(v_t3, v_t2); + float16x8_t v_T4 = vsubq_f16(v_t2, v_t3); + float16x8_t v_T5 = vfmaq_f16(v_t5, v_I3, v_minus_5); + + vst1q_f16(T[0][i], v_T0); + vst1q_f16(T[1][i], v_T1); + vst1q_f16(T[2][i], v_T2); + vst1q_f16(T[3][i], v_T3); + vst1q_f16(T[4][i], v_T4); + vst1q_f16(T[5][i], v_T5); + } + + for (U32 i = 0; i < 6; i++) { + float16x8_t v_T0 = vld1q_f16(T[i][0]); + float16x8_t v_T1 = vld1q_f16(T[i][1]); + float16x8_t v_T2 = vld1q_f16(T[i][2]); + float16x8_t v_T3 = vld1q_f16(T[i][3]); + float16x8_t v_T4 = vld1q_f16(T[i][4]); + float16x8_t v_T5 = vld1q_f16(T[i][5]); + + float16x8_t v_t0 = vfmaq_f16(v_T4, v_T2, v_minus_4); + float16x8_t v_t1 = vfmaq_f16(v_T3, v_T1, v_minus_4); + float16x8_t v_t2 = vsubq_f16(v_T4, v_T2); + float16x8_t v_t3 = vmulq_f16(vsubq_f16(v_T3, v_T1), v_2); + float16x8_t v_t4 = vfmaq_f16(v_T4, v_T0, v_4); + float16x8_t v_t5 = vfmaq_f16(v_T5, v_T1, v_4); + + float16x8_t v_Iw0 = vfmaq_f16(v_t4, v_T2, v_minus_5); + float16x8_t v_Iw1 = vaddq_f16(v_t1, v_t0); + float16x8_t v_Iw2 = vsubq_f16(v_t0, v_t1); + float16x8_t v_Iw3 = vaddq_f16(v_t3, v_t2); + float16x8_t v_Iw4 = vsubq_f16(v_t2, v_t3); + float16x8_t v_Iw5 = vfmaq_f16(v_t5, v_T3, v_minus_5); + + F16 max = vmaxvq_f16(v_Iw0); + F16 min = vminvq_f16(v_Iw0); + if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { + F16 check[8]; + vst1q_f16(check, v_Iw0); + for (U32 c = 0; c < 8; c++) { + F16 tmp = check[c]; + if (UNI_ISINF(tmp)) { + if (tmp > 0) { + check[c] = 65504; // FMAX for F16 + } else { + check[c] = -65504; + } + } else if (UNI_ISNAN(tmp)) { + tmp = (T[i][0][c] - T[i][2][c]) * 4; + if (UNI_ISINF(tmp)) { + if (tmp > 0) { + tmp = 65504; // FMAX for F16 + } else { + tmp = -65504; + } + } + F16 diff = T[i][4][c] - T[i][2][c]; + tmp += diff; + if (UNI_ISINF(tmp)) { + if (diff > 0) { + tmp = 65504; + } else { + tmp = -65504; + } + } + check[c] = tmp; + } + } + memcpy(Iw[i * 6 + 0], check, 8 * bytesOf(DT_F16)); + } else { + vst1q_f16(Iw[i * 6 + 0], v_Iw0); + } + + max = vmaxvq_f16(v_Iw1); + min = vminvq_f16(v_Iw1); + if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { + F16 check[8]; + vst1q_f16(check, v_Iw1); + for (U32 c = 0; c < 8; c++) { + F16 tmp = check[c]; + if (UNI_ISINF(tmp)) { + if (tmp > 0) { + check[c] = 65504; // FMAX for F16 + } else { + check[c] = -65504; + } + } else if (UNI_ISNAN(tmp)) { + tmp = (T[i][1][c] + T[i][2][c]) * -4; + if (UNI_ISINF(tmp)) { + if (tmp > 0) { + tmp = 65504; // FMAX for F16 + } else { + tmp = -65504; + } + } + F16 sum = T[i][3][c] + T[i][4][c]; + tmp += sum; + if (UNI_ISINF(tmp)) { 
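+                        // the recomputed value still overflows in FP16:
+                        // saturate, taking the sign from the (T3 + T4) term
+                        // that was just added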
+ if (sum > 0) { + tmp = 65504; + } else { + tmp = -65504; + } + } + check[c] = tmp; + } + } + memcpy(Iw[i * 6 + 1], check, 8 * bytesOf(DT_F16)); + } else { + vst1q_f16(Iw[i * 6 + 1], v_Iw1); + } + + max = vmaxvq_f16(v_Iw2); + min = vminvq_f16(v_Iw2); + if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { + F16 check[8]; + vst1q_f16(check, v_Iw2); + for (U32 c = 0; c < 8; c++) { + F16 tmp = check[c]; + if (UNI_ISINF(tmp)) { + if (tmp > 0) { + check[c] = 65504; // FMAX for F16 + } else { + check[c] = -65504; + } + } else if (UNI_ISNAN(tmp)) { + tmp = (T[i][1][c] - T[i][2][c]) * 4; + if (UNI_ISINF(tmp)) { + if (tmp > 0) { + tmp = 65504; // FMAX for F16 + } else { + tmp = -65504; + } + } + F16 diff = T[i][4][c] - T[i][3][c]; + tmp += diff; + if (UNI_ISINF(tmp)) { + if (diff > 0) { + tmp = 65504; + } else { + tmp = -65504; + } + } + check[c] = tmp; + } + } + memcpy(Iw[i * 6 + 2], check, 8 * bytesOf(DT_F16)); + } else { + vst1q_f16(Iw[i * 6 + 2], v_Iw2); + } + + max = vmaxvq_f16(v_Iw3); + min = vminvq_f16(v_Iw3); + if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { + F16 check[8]; + vst1q_f16(check, v_Iw3); + for (U32 c = 0; c < 8; c++) { + F16 tmp = check[c]; + if (UNI_ISINF(tmp)) { + if (tmp > 0) { + check[c] = 65504; // FMAX for F16 + } else { + check[c] = -65504; + } + } else if (UNI_ISNAN(tmp)) { + tmp = (T[i][3][c] - T[i][1][c]) * 2; + if (UNI_ISINF(tmp)) { + if (tmp > 0) { + tmp = 65504; // FMAX for F16 + } else { + tmp = -65504; + } + } + F16 diff = T[i][4][c] - T[i][2][c]; + tmp += diff; + if (UNI_ISINF(tmp)) { + if (diff > 0) { + tmp = 65504; + } else { + tmp = -65504; + } + } + check[c] = tmp; + } + } + memcpy(Iw[i * 6 + 3], check, 8 * bytesOf(DT_F16)); + } else { + vst1q_f16(Iw[i * 6 + 3], v_Iw3); + } + + max = vmaxvq_f16(v_Iw4); + min = vminvq_f16(v_Iw4); + if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { + F16 check[8]; + vst1q_f16(check, v_Iw4); + for (U32 c = 0; c < 8; c++) { + F16 tmp = check[c]; + if (UNI_ISINF(tmp)) { + if (tmp > 0) { + check[c] = 65504; // FMAX for F16 + } else { + check[c] = -65504; + } + } else if (UNI_ISNAN(tmp)) { + tmp = (T[i][1][c] - T[i][3][c]) * 2; + if (UNI_ISINF(tmp)) { + if (tmp > 0) { + tmp = 65504; // FMAX for F16 + } else { + tmp = -65504; + } + } + F16 diff = T[i][4][c] - T[i][2][c]; + tmp += diff; + if (UNI_ISINF(tmp)) { + if (diff > 0) { + tmp = 65504; + } else { + tmp = -65504; + } + } + check[c] = tmp; + } + } + memcpy(Iw[i * 6 + 4], check, 8 * bytesOf(DT_F16)); + } else { + vst1q_f16(Iw[i * 6 + 4], v_Iw4); + } + + max = vmaxvq_f16(v_Iw5); + min = vminvq_f16(v_Iw5); + if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { + F16 check[8]; + vst1q_f16(check, v_Iw5); + for (U32 c = 0; c < 8; c++) { + F16 tmp = check[c]; + if (UNI_ISINF(tmp)) { + if (tmp > 0) { + check[c] = 65504; // FMAX for F16 + } else { + check[c] = -65504; + } + } else if (UNI_ISNAN(tmp)) { + tmp = (T[i][1][c] - T[i][3][c]) * 4; + if (UNI_ISINF(tmp)) { + if (tmp > 0) { + tmp = 65504; // FMAX for F16 + } else { + tmp = -65504; + } + } + F16 diff = T[i][5][c] - T[i][3][c]; + tmp += diff; + if (UNI_ISINF(tmp)) { + if (diff > 0) { + tmp = 65504; + } else { + tmp = -65504; + } + } + check[c] = tmp; + } + } + memcpy(Iw[i * 6 + 5], check, 8 * bytesOf(DT_F16)); + } else { + vst1q_f16(Iw[i * 6 + 5], v_Iw5); + } + } +} +#endif diff --git a/compute/tensor/src/cpu/arm/fp16/deconvolution_transform.cpp b/compute/tensor/src/cpu/arm/fp16/deconvolution_transform.cpp new file mode 100644 index 
00000000..f54f6ad7 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/deconvolution_transform.cpp @@ -0,0 +1,97 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/transform_functions.h" +#include "cpu/arm/fp16/tensor_computing_fp16.h" + +inline EE deconvolution_transform_filter_kernel_fp16(TensorDesc filterDesc, + const F16 *filterArray, + TensorDesc *ftmDesc, + F16 *ftmArray, + DataFormat ftmDataFormat) +{ + // Procedure should be the same, but fhfw is reversed + if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + if (fdf == ftmDataFormat) { + *ftmDesc = filterDesc; + memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + return SUCCESS; + } + if (fdf != DF_NCHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + EE ret = SUCCESS; + switch (ftmDataFormat) { + case DF_NHWCN16: { + /* + * CNHW => NHWCN16 + * if there is remainder, it should be NHWCN8 + */ + *ftmDesc = tensor4df(fdt, ftmDataFormat, fc, fn, fh, fw); + transformCNHWToNHWCNx(filterDesc, filterArray, *ftmDesc, ftmArray); + break; + } + case DF_HWNCN16: { + /* + * CNHW => NHWCN16 + NHWCN8 if there is remainder divided by 16 + */ + *ftmDesc = tensor4df(fdt, ftmDataFormat, fc, fn, 6, 6); + transformCNHWToHWNCNx(filterDesc, filterArray, *ftmDesc, ftmArray); + break; + } + case DF_NCHWC8: { + *ftmDesc = tensor4df(fdt, DF_NCHWC8, fn, fc, fh, fw); + transformCNHWToNCHWC8(filterDesc, filterArray, *ftmDesc, ftmArray); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE deconvolution_transform_filter_fp16(TensorDesc filterDesc, + const F16 *filter, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F16 *filterTransformed) +{ + DataFormat ftmDataFormat; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_WINOGRAD: + ftmDataFormat = DF_HWNCN16; + break; + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: + ftmDataFormat = DF_NHWCN16; + break; + case CONVOLUTION_ALGORITHM_GEMM: + ftmDataFormat = DF_NHWCN16; + break; + case CONVOLUTION_ALGORITHM_GROUP_DECONV: + ftmDataFormat = DF_NCHWC8; + break; + default: + return NOT_MATCH; + } + EE ret = deconvolution_transform_filter_kernel_fp16( + filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat); + CHECK_STATUS(ret); + return ret; +} diff --git 
a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution.cpp new file mode 100644 index 00000000..af32a944 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#include "cpu/arm/fp16/depthwise_pointwise_convolution_direct.h" +#include "cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h" +#include "cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h" + +EE depthwise_pointwise_convolution_fp16(TensorDesc inputDesc, + F16 *input, + TensorDesc dwFilterDesc, + const F16 *dwFilter, + TensorDesc pwFilterDesc, + const F16 *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const F16 *dwBias, + TensorDesc pwBiasDesc, + const F16 *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + if (nullptr == input || nullptr == dwFilter || nullptr == output || nullptr == dwBias || + nullptr == tmp) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (!(idt == DT_F16 && fdt == DT_F16 && odt == DT_F16)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(idf == DF_NCHWC8 && odf == DF_NCHWC8)) { + CHECK_STATUS(NOT_MATCH); + } + if (ic != fc) { + CHECK_STATUS(NOT_MATCH); + } + + EE ret = SUCCESS; + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_pointwise_convolution_direct(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, + tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_pointwise_convolution_direct(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, + 
tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT_NO_PADDING: + ret = depthwise_pointwise_convolution_direct_no_padding(inputDesc, input, dwFilterDesc, + dwFilter, pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, + pwBias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_3X3S1P1: + ret = depthwise_pointwise_convolution_3x3s1p1(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, + tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h new file mode 100644 index 00000000..1a6e644c --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h @@ -0,0 +1,95 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
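+
+// Fused depthwise (3x3, stride 1, pad 1) + pointwise convolution for FP16.
+// A hand-scheduled kernel is declared per target micro-architecture
+// (in-order Cortex-A55 vs. out-of-order Cortex-A76); the inline wrapper at
+// the bottom of this header dispatches on Arch, so callers such as
+// depthwise_pointwise_convolution_fp16() never name a specific variant.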
+ +#ifndef _H_DEPTHWISE_POINTWISE_CONVOLUTION_3X3S1P1 +#define _H_DEPTHWISE_POINTWISE_CONVOLUTION_3X3S1P1 + +#include "sys.h" +#include "types.h" +#include "error.h" + +EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec); + +EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec); + +inline EE depthwise_pointwise_convolution_3x3s1p1(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: + ret = depthwise_pointwise_convolution_3x3s1p1_A55(inputDesc, inArray, dwFilterDesc, + dwFilterArray, pwFilterDesc, pwFilterArray, convParamSpec, dwBiasDesc, dwBiasArray, + pwBiasDesc, pwBiasArray, tmpBytes, tmp, outputDesc, outArray, + depthwiseActivationParamSpec, pointwiseActivationParamSpec); + break; + case ARM_A76: + ret = depthwise_pointwise_convolution_3x3s1p1_A76(inputDesc, inArray, dwFilterDesc, + dwFilterArray, pwFilterDesc, pwFilterArray, convParamSpec, dwBiasDesc, dwBiasArray, + pwBiasDesc, pwBiasArray, tmpBytes, tmp, outputDesc, outArray, + depthwiseActivationParamSpec, pointwiseActivationParamSpec); + break; + default: + return NOT_SUPPORTED; + } + return ret; +} +#endif diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A55.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A55.cpp new file mode 100644 index 00000000..7b30c26c --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A55.cpp @@ -0,0 +1,1610 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h" + +EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) +{ + UNUSED(tmpBytes); + UNUSED(convParamSpec); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (dwFilterDesc.df != DF_NCHWC8 || pwFilterDesc.df != DF_NHWCN16) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + + I32 ohow = oh * ow; + F16 *pwArray = (F16 *)tmp; + + for (U32 n = 0; n < in; n++) { + // dw_conv + padding + for (U32 c = 0; c < ic; c++) { + const F16 *b = dwBiasArray + c * 8; + F16 *in_c = inArray + c * ih * iw * 8; + const F16 *f = dwFilterArray + c * fh * fw * 8; + F16 *out = pwArray + c * ohow * 8; + F16 *in0 = in_c; + F16 *in1 = in0 + iw * 8; + F16 *in2 = in1 + iw * 8; + __asm__ __volatile__( + "mov x0, %[w]\n" + "ldr q28, [%[b]]\n" + "ldr q3, [%[f], #48]\n" + "ldr q4, [%[f], #64]\n" + "ldr q5, [%[f], #80]\n" + "ldr q6, [%[f], #96]\n" + "ldr q7, [%[f], #112]\n" + "ldr q8, [%[f], #128]\n" + "ldr q13, [%[in_0]]\n" + "ldr q14, [%[in_0], #16]\n" + "ldr q15, [%[in_0], #32]\n" + "ldr q16, [%[in_0], #48]\n" + "ldr q18, [%[in_1]]\n" + "ldr q19, [%[in_1], #16]\n" + "ldr q20, [%[in_1], #32]\n" + "ldr q21, [%[in_1], #48]\n" + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 + + "ldr q17, [%[in_0], #64]\n" + "fmla v10.8h, v3.8h, v13.8h\n" + "fmla v11.8h, v3.8h, v14.8h\n" + "fmla v12.8h, v3.8h, v15.8h\n" + "ldr q22, [%[in_1], #64]\n" + "fmla v10.8h, v6.8h, v18.8h\n" + "fmla v11.8h, v6.8h, v19.8h\n" + "fmla v12.8h, v6.8h, v20.8h\n" + + "fmla v9.8h, v4.8h, v13.8h\n" + "fmla v10.8h, v4.8h, v14.8h\n" + "fmla v11.8h, v4.8h, v15.8h\n" + "fmla v12.8h, v4.8h, v16.8h\n" + "fmla v9.8h, v7.8h, v18.8h\n" + "fmla v10.8h, v7.8h, v19.8h\n" + "fmla v11.8h, v7.8h, v20.8h\n" + "fmla v12.8h, v7.8h, v21.8h\n" + + "ldr q13, [%[in_0], #80]\n" + "fmla v9.8h, v5.8h, v14.8h\n" + "fmla v10.8h, v5.8h, v15.8h\n" + "fmla v11.8h, v5.8h, v16.8h\n" + "fmla v12.8h, v5.8h, v17.8h\n" + "ldr q18, [%[in_1], #80]\n" + "fmla v9.8h, v8.8h, v19.8h\n" + "fmla v10.8h, v8.8h, v20.8h\n" + "fmla v11.8h, v8.8h, v21.8h\n" + "fmla v12.8h, v8.8h, v22.8h\n" + + "mov v14.16b, v17.16b\n" + "mov v19.16b, v22.16b\n" + "mov v15.16b, v13.16b\n" + "mov v20.16b, v18.16b\n" + "mov v13.16b, v16.16b\n" + "mov 
v18.16b, v21.16b\n" + "ldr q16, [%[in_0], #96]\n" + "ldr q21, [%[in_1], #96]\n" + "add %[in_0], %[in_0], #48\n" + "add %[in_1], %[in_1], #48\n" + + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "bne 111f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + + "111:\n" + "cmp %[depthwiseActivationMode], %[am_relu6]\n" + "bne 112f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) + "fmin v10.8h, v10.8h, v22.8h\n" + "fmin v11.8h, v11.8h, v22.8h\n" + "fmin v12.8h, v12.8h, v22.8h\n" + + "112:\n" + "cmp %[depthwiseActivationMode], %[am_h_swish]\n" + "bne 113f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three + "fadd v27.8h, v9.8h, v22.8h\n" + "fadd v29.8h, v10.8h, v22.8h\n" + "fadd v30.8h, v11.8h, v22.8h\n" + "fadd v31.8h, v12.8h, v22.8h\n" + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v27.8h, v27.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v17.8h\n" + "fmax v30.8h, v30.8h, v17.8h\n" + "fmax v31.8h, v31.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v22.8h\n" + "fmin v29.8h, v29.8h, v22.8h\n" + "fmin v30.8h, v30.8h, v22.8h\n" + "fmin v31.8h, v31.8h, v22.8h\n" + "fdiv v27.8h, v27.8h, v22.8h\n" + "fdiv v29.8h, v29.8h, v22.8h\n" + "fdiv v30.8h, v30.8h, v22.8h\n" + "fdiv v31.8h, v31.8h, v22.8h\n" + "fmul v9.8h, v27.8h, v9.8h\n" + "fmul v10.8h, v29.8h, v10.8h\n" + "fmul v11.8h, v30.8h, v11.8h\n" + "fmul v12.8h, v31.8h, v12.8h\n" + + "113:\n" + "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" + + "0:\n" + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 + + "ldr d17, [%[in_0], #64]\n" + "fmla v9.8h, v3.8h, v13.8h\n" + "ldr x1, [%[in_0], #72]\n" + "fmla v10.8h, v3.8h, v14.8h\n" + "ins v17.d[1], x1\n" + "fmla v11.8h, v3.8h, v15.8h\n" + "ldr d22, [%[in_1], #64]\n" + "fmla v12.8h, v3.8h, v16.8h\n" + "ldr x2, [%[in_1], #72]\n" + "fmla v9.8h, v6.8h, v18.8h\n" + "ins v22.d[1], x2\n" + "fmla v10.8h, v6.8h, v19.8h\n" + "fmla v11.8h, v6.8h, v20.8h\n" + "fmla v12.8h, v6.8h, v21.8h\n" + + "ldr d13, [%[in_0], #80]\n" + "fmla v9.8h, v4.8h, v14.8h\n" + "ldr x1, [%[in_0], #88]\n" + "fmla v10.8h, v4.8h, v15.8h\n" + "ins v13.d[1], x1\n" + "fmla v11.8h, v4.8h, v16.8h\n" + "ldr d18, [%[in_1], #80]\n" + "fmla v12.8h, v4.8h, v17.8h\n" + "ldr x2, [%[in_1], #88]\n" + "fmla v9.8h, v7.8h, v19.8h\n" + "ins v18.d[1], x2\n" + "fmla v10.8h, v7.8h, v20.8h\n" + "fmla v11.8h, v7.8h, v21.8h\n" + "fmla v12.8h, v7.8h, v22.8h\n" + + "ldr d14, [%[in_0], #96]\n" + "fmla v9.8h, v5.8h, v15.8h\n" + "ldr x1, [%[in_0], #104]\n" + "fmla v10.8h, v5.8h, v16.8h\n" + "ins v14.d[1], x1\n" + "fmla v11.8h, v5.8h, v17.8h\n" + "ldr d19, [%[in_1], #96]\n" + "fmla v12.8h, v5.8h, v13.8h\n" + "ldr x2, [%[in_1], #104]\n" + "fmla v9.8h, v8.8h, v20.8h\n" + "ins v19.d[1], x2\n" + "fmla v10.8h, v8.8h, v21.8h\n" + "fmla v11.8h, v8.8h, v22.8h\n" + "fmla v12.8h, v8.8h, v18.8h\n" + + "ldr d16, [%[in_0], #112]\n" + "mov v15.16b, v14.16b\n" + "ldr x1, [%[in_0], #120]\n" + "mov v20.16b, v19.16b\n" + "ins v16.d[1], x1\n" + "mov v14.16b, v13.16b\n" + "ldr d21, [%[in_1], #112]\n" + "mov v19.16b, v18.16b\n" + "ldr x2, [%[in_1], #120]\n" + 
"mov v13.16b, v17.16b\n" + "ins v21.d[1], x2\n" + "mov v18.16b, v22.16b\n" + + "add %[in_0], %[in_0], #64\n" + "add %[in_1], %[in_1], #64\n" + + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "bne 211f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + + "211:\n" + "cmp %[depthwiseActivationMode], %[am_relu6]\n" + "bne 212f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) + "fmin v10.8h, v10.8h, v22.8h\n" + "fmin v11.8h, v11.8h, v22.8h\n" + "fmin v12.8h, v12.8h, v22.8h\n" + + "212:\n" + "cmp %[depthwiseActivationMode], %[am_h_swish]\n" + "bne 213f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three + "fadd v27.8h, v9.8h, v22.8h\n" + "fadd v29.8h, v10.8h, v22.8h\n" + "fadd v30.8h, v11.8h, v22.8h\n" + "fadd v31.8h, v12.8h, v22.8h\n" + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v27.8h, v27.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v17.8h\n" + "fmax v30.8h, v30.8h, v17.8h\n" + "fmax v31.8h, v31.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v22.8h\n" + "fmin v29.8h, v29.8h, v22.8h\n" + "fmin v30.8h, v30.8h, v22.8h\n" + "fmin v31.8h, v31.8h, v22.8h\n" + "fdiv v27.8h, v27.8h, v22.8h\n" + "fdiv v29.8h, v29.8h, v22.8h\n" + "fdiv v30.8h, v30.8h, v22.8h\n" + "fdiv v31.8h, v31.8h, v22.8h\n" + "fmul v9.8h, v27.8h, v9.8h\n" + "fmul v10.8h, v29.8h, v10.8h\n" + "fmul v11.8h, v30.8h, v11.8h\n" + "fmul v12.8h, v31.8h, v12.8h\n" + + "213:\n" + "subs x0, x0, #4\n" + "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" + "bne 0b\n" + + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 + + "ldr q17, [%[in_0], #64]\n" + "fmla v9.8h, v3.8h, v13.8h\n" + "fmla v10.8h, v3.8h, v14.8h\n" + "fmla v11.8h, v3.8h, v15.8h\n" + "fmla v12.8h, v3.8h, v16.8h\n" + "ldr q22, [%[in_1], #64]\n" + "fmla v9.8h, v6.8h, v18.8h\n" + "fmla v10.8h, v6.8h, v19.8h\n" + "fmla v11.8h, v6.8h, v20.8h\n" + "fmla v12.8h, v6.8h, v21.8h\n" + + "fmla v9.8h, v4.8h, v14.8h\n" + "fmla v10.8h, v4.8h, v15.8h\n" + "fmla v11.8h, v4.8h, v16.8h\n" + "fmla v12.8h, v4.8h, v17.8h\n" + "fmla v9.8h, v7.8h, v19.8h\n" + "fmla v10.8h, v7.8h, v20.8h\n" + "fmla v11.8h, v7.8h, v21.8h\n" + "fmla v12.8h, v7.8h, v22.8h\n" + + "fmla v9.8h, v5.8h, v15.8h\n" + "fmla v10.8h, v5.8h, v16.8h\n" + "fmla v11.8h, v5.8h, v17.8h\n" + "fmla v9.8h, v8.8h, v20.8h\n" + "fmla v10.8h, v8.8h, v21.8h\n" + "fmla v11.8h, v8.8h, v22.8h\n" + + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "bne 311f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + + "311:\n" + "cmp %[depthwiseActivationMode], %[am_relu6]\n" + "bne 312f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) + "fmin v10.8h, v10.8h, v22.8h\n" + "fmin v11.8h, v11.8h, v22.8h\n" + "fmin 
v12.8h, v12.8h, v22.8h\n" + + "312:\n" + "cmp %[depthwiseActivationMode], %[am_h_swish]\n" + "bne 313f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three + "fadd v27.8h, v9.8h, v22.8h\n" + "fadd v29.8h, v10.8h, v22.8h\n" + "fadd v30.8h, v11.8h, v22.8h\n" + "fadd v31.8h, v12.8h, v22.8h\n" + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v27.8h, v27.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v17.8h\n" + "fmax v30.8h, v30.8h, v17.8h\n" + "fmax v31.8h, v31.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v22.8h\n" + "fmin v29.8h, v29.8h, v22.8h\n" + "fmin v30.8h, v30.8h, v22.8h\n" + "fmin v31.8h, v31.8h, v22.8h\n" + "fdiv v27.8h, v27.8h, v22.8h\n" + "fdiv v29.8h, v29.8h, v22.8h\n" + "fdiv v30.8h, v30.8h, v22.8h\n" + "fdiv v31.8h, v31.8h, v22.8h\n" + "fmul v9.8h, v27.8h, v9.8h\n" + "fmul v10.8h, v29.8h, v10.8h\n" + "fmul v11.8h, v30.8h, v11.8h\n" + "fmul v12.8h, v31.8h, v12.8h\n" + + "313:\n" + "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" + : [out] "+r"(out), [in_0] "+r"(in0), [in_1] "+r"(in1) + : [f] "r"(f), [b] "r"(b), [w] "r"((I64)ow - 8), + [depthwiseActivationMode] "r"((I64)depthwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", + "x3"); + + for (U32 h = 0; h < oh - 2; h++) { + in0 = in_c + h * iw * 8; + in1 = in0 + iw * 8; + in2 = in1 + iw * 8; + __asm__ __volatile__( + "mov x0, %[w]\n" + "ldr q28, [%[b]]\n" + "ldr q0, [%[f]]\n" + "ldr q1, [%[f], #16]\n" + "ldr q2, [%[f], #32]\n" + "ldr q3, [%[f], #48]\n" + "ldr q4, [%[f], #64]\n" + "ldr q5, [%[f], #80]\n" + "ldr q6, [%[f], #96]\n" + "ldr q7, [%[f], #112]\n" + "ldr q8, [%[f], #128]\n" + "ldr q13, [%[in_0]]\n" + "ldr q14, [%[in_0], #16]\n" + "ldr q15, [%[in_0], #32]\n" + "ldr q16, [%[in_0], #48]\n" + "ldr q18, [%[in_1]]\n" + "ldr q19, [%[in_1], #16]\n" + "ldr q20, [%[in_1], #32]\n" + "ldr q21, [%[in_1], #48]\n" + "ldr q23, [%[in_2]]\n" + "ldr q24, [%[in_2], #16]\n" + "ldr q25, [%[in_2], #32]\n" + "ldr q26, [%[in_2], #48]\n" + + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 + + "ldr q17, [%[in_0], #64]\n" + "fmla v10.8h, v0.8h, v13.8h\n" + "fmla v11.8h, v0.8h, v14.8h\n" + "fmla v12.8h, v0.8h, v15.8h\n" + "ldr q22, [%[in_1], #64]\n" + "fmla v10.8h, v3.8h, v18.8h\n" + "fmla v11.8h, v3.8h, v19.8h\n" + "fmla v12.8h, v3.8h, v20.8h\n" + "ldr q27, [%[in_2], #64]\n" + "fmla v10.8h, v6.8h, v23.8h\n" + "fmla v11.8h, v6.8h, v24.8h\n" + "fmla v12.8h, v6.8h, v25.8h\n" + + "fmla v9.8h, v1.8h, v13.8h\n" + "fmla v10.8h, v1.8h, v14.8h\n" + "fmla v11.8h, v1.8h, v15.8h\n" + "fmla v12.8h, v1.8h, v16.8h\n" + "fmla v9.8h, v4.8h, v18.8h\n" + "fmla v10.8h, v4.8h, v19.8h\n" + "fmla v11.8h, v4.8h, v20.8h\n" + "fmla v12.8h, v4.8h, v21.8h\n" + "fmla v9.8h, v7.8h, v23.8h\n" + "fmla v10.8h, v7.8h, v24.8h\n" + "fmla v11.8h, v7.8h, v25.8h\n" + "fmla v12.8h, v7.8h, v26.8h\n" + + "ldr q13, [%[in_0], #80]\n" + "fmla v9.8h, v2.8h, v14.8h\n" + "fmla v10.8h, v2.8h, v15.8h\n" + "fmla v11.8h, v2.8h, v16.8h\n" + "fmla v12.8h, v2.8h, v17.8h\n" + "ldr q18, [%[in_1], #80]\n" + "fmla v9.8h, v5.8h, v19.8h\n" + "fmla v10.8h, v5.8h, v20.8h\n" + "fmla v11.8h, v5.8h, v21.8h\n" + "fmla v12.8h, v5.8h, v22.8h\n" + 
"ldr q23, [%[in_2], #80]\n" + "fmla v9.8h, v8.8h, v24.8h\n" + "fmla v10.8h, v8.8h, v25.8h\n" + "fmla v11.8h, v8.8h, v26.8h\n" + "fmla v12.8h, v8.8h, v27.8h\n" + + "mov v14.16b, v17.16b\n" + "mov v19.16b, v22.16b\n" + "mov v24.16b, v27.16b\n" + "mov v15.16b, v13.16b\n" + "mov v20.16b, v18.16b\n" + "mov v25.16b, v23.16b\n" + "mov v13.16b, v16.16b\n" + "mov v18.16b, v21.16b\n" + "mov v23.16b, v26.16b\n" + "ldr q16, [%[in_0], #96]\n" + "ldr q21, [%[in_1], #96]\n" + "ldr q26, [%[in_2], #96]\n" + "add %[in_0], %[in_0], #48\n" + "add %[in_1], %[in_1], #48\n" + "add %[in_2], %[in_2], #48\n" + + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "bne 111f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + + "111:\n" + "cmp %[depthwiseActivationMode], %[am_relu6]\n" + "bne 112f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) + "fmin v10.8h, v10.8h, v22.8h\n" + "fmin v11.8h, v11.8h, v22.8h\n" + "fmin v12.8h, v12.8h, v22.8h\n" + + "112:\n" + "cmp %[depthwiseActivationMode], %[am_h_swish]\n" + "bne 113f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three + "fadd v27.8h, v9.8h, v22.8h\n" + "fadd v29.8h, v10.8h, v22.8h\n" + "fadd v30.8h, v11.8h, v22.8h\n" + "fadd v31.8h, v12.8h, v22.8h\n" + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v27.8h, v27.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v17.8h\n" + "fmax v30.8h, v30.8h, v17.8h\n" + "fmax v31.8h, v31.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v22.8h\n" + "fmin v29.8h, v29.8h, v22.8h\n" + "fmin v30.8h, v30.8h, v22.8h\n" + "fmin v31.8h, v31.8h, v22.8h\n" + "fdiv v27.8h, v27.8h, v22.8h\n" + "fdiv v29.8h, v29.8h, v22.8h\n" + "fdiv v30.8h, v30.8h, v22.8h\n" + "fdiv v31.8h, v31.8h, v22.8h\n" + "fmul v9.8h, v27.8h, v9.8h\n" + "fmul v10.8h, v29.8h, v10.8h\n" + "fmul v11.8h, v30.8h, v11.8h\n" + "fmul v12.8h, v31.8h, v12.8h\n" + + "113:\n" + "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" + + "0:\n" + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 + + "ldr d17, [%[in_0], #64]\n" + "fmla v9.8h, v0.8h, v13.8h\n" + "ldr x1, [%[in_0], #72]\n" + "fmla v10.8h, v0.8h, v14.8h\n" + "ins v17.d[1], x1\n" + "fmla v11.8h, v0.8h, v15.8h\n" + "ldr d22, [%[in_1], #64]\n" + "fmla v12.8h, v0.8h, v16.8h\n" + "ldr x2, [%[in_1], #72]\n" + "fmla v9.8h, v3.8h, v18.8h\n" + "ins v22.d[1], x2\n" + "fmla v10.8h, v3.8h, v19.8h\n" + "ldr d27, [%[in_2], #64]\n" + "fmla v11.8h, v3.8h, v20.8h\n" + "ldr x3, [%[in_2], #72]\n" + "fmla v12.8h, v3.8h, v21.8h\n" + "ins v27.d[1], x3\n" + "fmla v9.8h, v6.8h, v23.8h\n" + "fmla v10.8h, v6.8h, v24.8h\n" + "fmla v11.8h, v6.8h, v25.8h\n" + "fmla v12.8h, v6.8h, v26.8h\n" + + "ldr d13, [%[in_0], #80]\n" + "fmla v9.8h, v1.8h, v14.8h\n" + "ldr x1, [%[in_0], #88]\n" + "fmla v10.8h, v1.8h, v15.8h\n" + "ins v13.d[1], x1\n" + "fmla v11.8h, v1.8h, v16.8h\n" + "ldr d18, [%[in_1], #80]\n" + "fmla v12.8h, v1.8h, v17.8h\n" + "ldr x2, [%[in_1], #88]\n" + "fmla v9.8h, v4.8h, v19.8h\n" + "ins v18.d[1], x2\n" + "fmla v10.8h, v4.8h, v20.8h\n" + "ldr d23, [%[in_2], #80]\n" + "fmla v11.8h, v4.8h, v21.8h\n" + "ldr x3, [%[in_2], #88]\n" + 
"fmla v12.8h, v4.8h, v22.8h\n" + "ins v23.d[1], x3\n" + "fmla v9.8h, v7.8h, v24.8h\n" + "fmla v10.8h, v7.8h, v25.8h\n" + "fmla v11.8h, v7.8h, v26.8h\n" + "fmla v12.8h, v7.8h, v27.8h\n" + + "ldr d14, [%[in_0], #96]\n" + "fmla v9.8h, v2.8h, v15.8h\n" + "ldr x1, [%[in_0], #104]\n" + "fmla v10.8h, v2.8h, v16.8h\n" + "ins v14.d[1], x1\n" + "fmla v11.8h, v2.8h, v17.8h\n" + "ldr d19, [%[in_1], #96]\n" + "fmla v12.8h, v2.8h, v13.8h\n" + "ldr x2, [%[in_1], #104]\n" + "fmla v9.8h, v5.8h, v20.8h\n" + "ins v19.d[1], x2\n" + "fmla v10.8h, v5.8h, v21.8h\n" + "ldr d24, [%[in_2], #96]\n" + "fmla v11.8h, v5.8h, v22.8h\n" + "ldr x3, [%[in_2], #104]\n" + "fmla v12.8h, v5.8h, v18.8h\n" + "ins v24.d[1], x3\n" + "fmla v9.8h, v8.8h, v25.8h\n" + "fmla v10.8h, v8.8h, v26.8h\n" + "fmla v11.8h, v8.8h, v27.8h\n" + "fmla v12.8h, v8.8h, v23.8h\n" + + "ldr d16, [%[in_0], #112]\n" + "mov v15.16b, v14.16b\n" + "ldr x1, [%[in_0], #120]\n" + "mov v20.16b, v19.16b\n" + "ins v16.d[1], x1\n" + "mov v25.16b, v24.16b\n" + "ldr d21, [%[in_1], #112]\n" + "mov v14.16b, v13.16b\n" + "ldr x2, [%[in_1], #120]\n" + "mov v19.16b, v18.16b\n" + "ins v21.d[1], x2\n" + "mov v24.16b, v23.16b\n" + "ldr d26, [%[in_2], #112]\n" + "mov v13.16b, v17.16b\n" + "ldr x3, [%[in_2], #120]\n" + "mov v18.16b, v22.16b\n" + "ins v26.d[1], x3\n" + "mov v23.16b, v27.16b\n" + + "add %[in_0], %[in_0], #64\n" + "add %[in_1], %[in_1], #64\n" + "add %[in_2], %[in_2], #64\n" + + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "bne 211f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + + "211:\n" + "cmp %[depthwiseActivationMode], %[am_relu6]\n" + "bne 212f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) + "fmin v10.8h, v10.8h, v22.8h\n" + "fmin v11.8h, v11.8h, v22.8h\n" + "fmin v12.8h, v12.8h, v22.8h\n" + + "212:\n" + "cmp %[depthwiseActivationMode], %[am_h_swish]\n" + "bne 213f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three + "fadd v27.8h, v9.8h, v22.8h\n" + "fadd v29.8h, v10.8h, v22.8h\n" + "fadd v30.8h, v11.8h, v22.8h\n" + "fadd v31.8h, v12.8h, v22.8h\n" + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v27.8h, v27.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v17.8h\n" + "fmax v30.8h, v30.8h, v17.8h\n" + "fmax v31.8h, v31.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v22.8h\n" + "fmin v29.8h, v29.8h, v22.8h\n" + "fmin v30.8h, v30.8h, v22.8h\n" + "fmin v31.8h, v31.8h, v22.8h\n" + "fdiv v27.8h, v27.8h, v22.8h\n" + "fdiv v29.8h, v29.8h, v22.8h\n" + "fdiv v30.8h, v30.8h, v22.8h\n" + "fdiv v31.8h, v31.8h, v22.8h\n" + "fmul v9.8h, v27.8h, v9.8h\n" + "fmul v10.8h, v29.8h, v10.8h\n" + "fmul v11.8h, v30.8h, v11.8h\n" + "fmul v12.8h, v31.8h, v12.8h\n" + + "213:\n" + "subs x0, x0, #4\n" + "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" + "bne 0b\n" + + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 + + "ldr q17, [%[in_0], #64]\n" + "fmla v9.8h, v0.8h, v13.8h\n" + "fmla v10.8h, v0.8h, v14.8h\n" + "fmla v11.8h, v0.8h, v15.8h\n" + "fmla v12.8h, v0.8h, v16.8h\n" + "ldr q22, [%[in_1], #64]\n" + "fmla v9.8h, v3.8h, v18.8h\n" + "fmla 
v10.8h, v3.8h, v19.8h\n" + "fmla v11.8h, v3.8h, v20.8h\n" + "fmla v12.8h, v3.8h, v21.8h\n" + "ldr q27, [%[in_2], #64]\n" + "fmla v9.8h, v6.8h, v23.8h\n" + "fmla v10.8h, v6.8h, v24.8h\n" + "fmla v11.8h, v6.8h, v25.8h\n" + "fmla v12.8h, v6.8h, v26.8h\n" + + "fmla v9.8h, v1.8h, v14.8h\n" + "fmla v10.8h, v1.8h, v15.8h\n" + "fmla v11.8h, v1.8h, v16.8h\n" + "fmla v12.8h, v1.8h, v17.8h\n" + "fmla v9.8h, v4.8h, v19.8h\n" + "fmla v10.8h, v4.8h, v20.8h\n" + "fmla v11.8h, v4.8h, v21.8h\n" + "fmla v12.8h, v4.8h, v22.8h\n" + "fmla v9.8h, v7.8h, v24.8h\n" + "fmla v10.8h, v7.8h, v25.8h\n" + "fmla v11.8h, v7.8h, v26.8h\n" + "fmla v12.8h, v7.8h, v27.8h\n" + + "fmla v9.8h, v2.8h, v15.8h\n" + "fmla v10.8h, v2.8h, v16.8h\n" + "fmla v11.8h, v2.8h, v17.8h\n" + "fmla v9.8h, v5.8h, v20.8h\n" + "fmla v10.8h, v5.8h, v21.8h\n" + "fmla v11.8h, v5.8h, v22.8h\n" + "fmla v9.8h, v8.8h, v25.8h\n" + "fmla v10.8h, v8.8h, v26.8h\n" + "fmla v11.8h, v8.8h, v27.8h\n" + + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "bne 311f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + + "311:\n" + "cmp %[depthwiseActivationMode], %[am_relu6]\n" + "bne 312f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) + "fmin v10.8h, v10.8h, v22.8h\n" + "fmin v11.8h, v11.8h, v22.8h\n" + "fmin v12.8h, v12.8h, v22.8h\n" + + "312:\n" + "cmp %[depthwiseActivationMode], %[am_h_swish]\n" + "bne 313f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three + "fadd v27.8h, v9.8h, v22.8h\n" + "fadd v29.8h, v10.8h, v22.8h\n" + "fadd v30.8h, v11.8h, v22.8h\n" + "fadd v31.8h, v12.8h, v22.8h\n" + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v27.8h, v27.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v17.8h\n" + "fmax v30.8h, v30.8h, v17.8h\n" + "fmax v31.8h, v31.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v22.8h\n" + "fmin v29.8h, v29.8h, v22.8h\n" + "fmin v30.8h, v30.8h, v22.8h\n" + "fmin v31.8h, v31.8h, v22.8h\n" + "fdiv v27.8h, v27.8h, v22.8h\n" + "fdiv v29.8h, v29.8h, v22.8h\n" + "fdiv v30.8h, v30.8h, v22.8h\n" + "fdiv v31.8h, v31.8h, v22.8h\n" + "fmul v9.8h, v27.8h, v9.8h\n" + "fmul v10.8h, v29.8h, v10.8h\n" + "fmul v11.8h, v30.8h, v11.8h\n" + "fmul v12.8h, v31.8h, v12.8h\n" + + "313:\n" + "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" + : [out] "+r"(out), [in_0] "+r"(in0), [in_1] "+r"(in1), [in_2] "+r"(in2) + : [f] "r"(f), [b] "r"(b), [w] "r"((I64)ow - 8), + [depthwiseActivationMode] "r"((I64)depthwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "x0", "x1", "x2", "x3"); + } + in0 = in_c + (ih - 2) * iw * 8; + in1 = in0 + iw * 8; + in2 = in1 + iw * 8; + __asm__ __volatile__( + "mov x0, %[w]\n" + "ldr q28, [%[b]]\n" + "ldr q0, [%[f]]\n" + "ldr q1, [%[f], #16]\n" + "ldr q2, [%[f], #32]\n" + "ldr q3, [%[f], #48]\n" + "ldr q4, [%[f], #64]\n" + "ldr q5, [%[f], #80]\n" + "ldr q13, 
[%[in_0]]\n" + "ldr q14, [%[in_0], #16]\n" + "ldr q15, [%[in_0], #32]\n" + "ldr q16, [%[in_0], #48]\n" + "ldr q18, [%[in_1]]\n" + "ldr q19, [%[in_1], #16]\n" + "ldr q20, [%[in_1], #32]\n" + "ldr q21, [%[in_1], #48]\n" + + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 + + "ldr q17, [%[in_0], #64]\n" + "fmla v10.8h, v0.8h, v13.8h\n" + "fmla v11.8h, v0.8h, v14.8h\n" + "fmla v12.8h, v0.8h, v15.8h\n" + "ldr q22, [%[in_1], #64]\n" + "fmla v10.8h, v3.8h, v18.8h\n" + "fmla v11.8h, v3.8h, v19.8h\n" + "fmla v12.8h, v3.8h, v20.8h\n" + + "fmla v9.8h, v1.8h, v13.8h\n" + "fmla v10.8h, v1.8h, v14.8h\n" + "fmla v11.8h, v1.8h, v15.8h\n" + "fmla v12.8h, v1.8h, v16.8h\n" + "fmla v9.8h, v4.8h, v18.8h\n" + "fmla v10.8h, v4.8h, v19.8h\n" + "fmla v11.8h, v4.8h, v20.8h\n" + "fmla v12.8h, v4.8h, v21.8h\n" + + "ldr q13, [%[in_0], #80]\n" + "fmla v9.8h, v2.8h, v14.8h\n" + "fmla v10.8h, v2.8h, v15.8h\n" + "fmla v11.8h, v2.8h, v16.8h\n" + "fmla v12.8h, v2.8h, v17.8h\n" + "ldr q18, [%[in_1], #80]\n" + "fmla v9.8h, v5.8h, v19.8h\n" + "fmla v10.8h, v5.8h, v20.8h\n" + "fmla v11.8h, v5.8h, v21.8h\n" + "fmla v12.8h, v5.8h, v22.8h\n" + + "mov v14.16b, v17.16b\n" + "mov v19.16b, v22.16b\n" + "mov v15.16b, v13.16b\n" + "mov v20.16b, v18.16b\n" + "mov v13.16b, v16.16b\n" + "mov v18.16b, v21.16b\n" + "ldr q16, [%[in_0], #96]\n" + "ldr q21, [%[in_1], #96]\n" + "add %[in_0], %[in_0], #48\n" + "add %[in_1], %[in_1], #48\n" + + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "bne 111f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + + "111:\n" + "cmp %[depthwiseActivationMode], %[am_relu6]\n" + "bne 112f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) + "fmin v10.8h, v10.8h, v22.8h\n" + "fmin v11.8h, v11.8h, v22.8h\n" + "fmin v12.8h, v12.8h, v22.8h\n" + + "112:\n" + "cmp %[depthwiseActivationMode], %[am_h_swish]\n" + "bne 113f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three + "fadd v27.8h, v9.8h, v22.8h\n" + "fadd v29.8h, v10.8h, v22.8h\n" + "fadd v30.8h, v11.8h, v22.8h\n" + "fadd v31.8h, v12.8h, v22.8h\n" + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v27.8h, v27.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v17.8h\n" + "fmax v30.8h, v30.8h, v17.8h\n" + "fmax v31.8h, v31.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v22.8h\n" + "fmin v29.8h, v29.8h, v22.8h\n" + "fmin v30.8h, v30.8h, v22.8h\n" + "fmin v31.8h, v31.8h, v22.8h\n" + "fdiv v27.8h, v27.8h, v22.8h\n" + "fdiv v29.8h, v29.8h, v22.8h\n" + "fdiv v30.8h, v30.8h, v22.8h\n" + "fdiv v31.8h, v31.8h, v22.8h\n" + "fmul v9.8h, v27.8h, v9.8h\n" + "fmul v10.8h, v29.8h, v10.8h\n" + "fmul v11.8h, v30.8h, v11.8h\n" + "fmul v12.8h, v31.8h, v12.8h\n" + + "113:\n" + "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" + + "0:\n" + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 + + "ldr d17, [%[in_0], #64]\n" + "fmla v9.8h, v0.8h, v13.8h\n" + "ldr x1, [%[in_0], #72]\n" + "fmla v10.8h, v0.8h, v14.8h\n" + "ins v17.d[1], x1\n" + "fmla v11.8h, v0.8h, v15.8h\n" + "ldr 
d22, [%[in_1], #64]\n" + "fmla v12.8h, v0.8h, v16.8h\n" + "ldr x2, [%[in_1], #72]\n" + "fmla v9.8h, v3.8h, v18.8h\n" + "ins v22.d[1], x2\n" + "fmla v10.8h, v3.8h, v19.8h\n" + "fmla v11.8h, v3.8h, v20.8h\n" + "fmla v12.8h, v3.8h, v21.8h\n" + + "ldr d13, [%[in_0], #80]\n" + "fmla v9.8h, v1.8h, v14.8h\n" + "ldr x1, [%[in_0], #88]\n" + "fmla v10.8h, v1.8h, v15.8h\n" + "ins v13.d[1], x1\n" + "fmla v11.8h, v1.8h, v16.8h\n" + "ldr d18, [%[in_1], #80]\n" + "fmla v12.8h, v1.8h, v17.8h\n" + "ldr x2, [%[in_1], #88]\n" + "fmla v9.8h, v4.8h, v19.8h\n" + "ins v18.d[1], x2\n" + "fmla v10.8h, v4.8h, v20.8h\n" + "fmla v11.8h, v4.8h, v21.8h\n" + "fmla v12.8h, v4.8h, v22.8h\n" + + "ldr d14, [%[in_0], #96]\n" + "fmla v9.8h, v2.8h, v15.8h\n" + "ldr x1, [%[in_0], #104]\n" + "fmla v10.8h, v2.8h, v16.8h\n" + "ins v14.d[1], x1\n" + "fmla v11.8h, v2.8h, v17.8h\n" + "ldr d19, [%[in_1], #96]\n" + "fmla v12.8h, v2.8h, v13.8h\n" + "ldr x2, [%[in_1], #104]\n" + "fmla v9.8h, v5.8h, v20.8h\n" + "ins v19.d[1], x2\n" + "fmla v10.8h, v5.8h, v21.8h\n" + "fmla v11.8h, v5.8h, v22.8h\n" + "fmla v12.8h, v5.8h, v18.8h\n" + + "ldr d16, [%[in_0], #112]\n" + "mov v15.16b, v14.16b\n" + "ldr x1, [%[in_0], #120]\n" + "mov v20.16b, v19.16b\n" + "ins v16.d[1], x1\n" + "ldr d21, [%[in_1], #112]\n" + "mov v14.16b, v13.16b\n" + "ldr x2, [%[in_1], #120]\n" + "mov v19.16b, v18.16b\n" + "ins v21.d[1], x2\n" + "mov v13.16b, v17.16b\n" + "mov v18.16b, v22.16b\n" + + "add %[in_0], %[in_0], #64\n" + "add %[in_1], %[in_1], #64\n" + + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "bne 211f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + + "211:\n" + "cmp %[depthwiseActivationMode], %[am_relu6]\n" + "bne 212f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) + "fmin v10.8h, v10.8h, v22.8h\n" + "fmin v11.8h, v11.8h, v22.8h\n" + "fmin v12.8h, v12.8h, v22.8h\n" + + "212:\n" + "cmp %[depthwiseActivationMode], %[am_h_swish]\n" + "bne 213f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three + "fadd v27.8h, v9.8h, v22.8h\n" + "fadd v29.8h, v10.8h, v22.8h\n" + "fadd v30.8h, v11.8h, v22.8h\n" + "fadd v31.8h, v12.8h, v22.8h\n" + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v27.8h, v27.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v17.8h\n" + "fmax v30.8h, v30.8h, v17.8h\n" + "fmax v31.8h, v31.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v22.8h\n" + "fmin v29.8h, v29.8h, v22.8h\n" + "fmin v30.8h, v30.8h, v22.8h\n" + "fmin v31.8h, v31.8h, v22.8h\n" + "fdiv v27.8h, v27.8h, v22.8h\n" + "fdiv v29.8h, v29.8h, v22.8h\n" + "fdiv v30.8h, v30.8h, v22.8h\n" + "fdiv v31.8h, v31.8h, v22.8h\n" + "fmul v9.8h, v27.8h, v9.8h\n" + "fmul v10.8h, v29.8h, v10.8h\n" + "fmul v11.8h, v30.8h, v11.8h\n" + "fmul v12.8h, v31.8h, v12.8h\n" + + "213:\n" + "subs x0, x0, #4\n" + "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" + "bne 0b\n" + + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 + + "ldr q17, [%[in_0], #64]\n" + "fmla v9.8h, v0.8h, v13.8h\n" + "fmla v10.8h, v0.8h, v14.8h\n" + "fmla v11.8h, v0.8h, v15.8h\n" + "fmla v12.8h, v0.8h, v16.8h\n" + 
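// Right-edge tail: with pad = 1 the last output column has no input column to
// its right, so the third-column filter taps (v2 and v5) are deliberately not
// applied to v12 in the accumulation below.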
"ldr q22, [%[in_1], #64]\n" + "fmla v9.8h, v3.8h, v18.8h\n" + "fmla v10.8h, v3.8h, v19.8h\n" + "fmla v11.8h, v3.8h, v20.8h\n" + "fmla v12.8h, v3.8h, v21.8h\n" + + "fmla v9.8h, v1.8h, v14.8h\n" + "fmla v10.8h, v1.8h, v15.8h\n" + "fmla v11.8h, v1.8h, v16.8h\n" + "fmla v12.8h, v1.8h, v17.8h\n" + "fmla v9.8h, v4.8h, v19.8h\n" + "fmla v10.8h, v4.8h, v20.8h\n" + "fmla v11.8h, v4.8h, v21.8h\n" + "fmla v12.8h, v4.8h, v22.8h\n" + + "fmla v9.8h, v2.8h, v15.8h\n" + "fmla v10.8h, v2.8h, v16.8h\n" + "fmla v11.8h, v2.8h, v17.8h\n" + "fmla v9.8h, v5.8h, v20.8h\n" + "fmla v10.8h, v5.8h, v21.8h\n" + "fmla v11.8h, v5.8h, v22.8h\n" + + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "bne 311f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + + "311:\n" + "cmp %[depthwiseActivationMode], %[am_relu6]\n" + "bne 312f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) + "fmin v10.8h, v10.8h, v22.8h\n" + "fmin v11.8h, v11.8h, v22.8h\n" + "fmin v12.8h, v12.8h, v22.8h\n" + + "312:\n" + "cmp %[depthwiseActivationMode], %[am_h_swish]\n" + "bne 313f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three + "fadd v27.8h, v9.8h, v22.8h\n" + "fadd v29.8h, v10.8h, v22.8h\n" + "fadd v30.8h, v11.8h, v22.8h\n" + "fadd v31.8h, v12.8h, v22.8h\n" + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v27.8h, v27.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v17.8h\n" + "fmax v30.8h, v30.8h, v17.8h\n" + "fmax v31.8h, v31.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v22.8h\n" + "fmin v29.8h, v29.8h, v22.8h\n" + "fmin v30.8h, v30.8h, v22.8h\n" + "fmin v31.8h, v31.8h, v22.8h\n" + "fdiv v27.8h, v27.8h, v22.8h\n" + "fdiv v29.8h, v29.8h, v22.8h\n" + "fdiv v30.8h, v30.8h, v22.8h\n" + "fdiv v31.8h, v31.8h, v22.8h\n" + "fmul v9.8h, v27.8h, v9.8h\n" + "fmul v10.8h, v29.8h, v10.8h\n" + "fmul v11.8h, v30.8h, v11.8h\n" + "fmul v12.8h, v31.8h, v12.8h\n" + + "313:\n" + "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" + : [out] "+r"(out), [in_0] "+r"(in0), [in_1] "+r"(in1) + : [f] "r"(f), [b] "r"(b), [w] "r"((I64)ow - 8), + [depthwiseActivationMode] "r"((I64)depthwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", + "x3"); + } + + // pw_conv + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 8; + const F16 *f_o0c0 = pwFilterArray; + F16 *in_pack = pwArray + ohow * ic * 8; + // pack input + // NCHWc8 => NHWChw8 + for (U32 c = 0; c < ic; c++) { + F16 *in_pack_c8hw8 = in_pack + c * 8 * 8; + // it is 2% faster than in_hw8c8 = ... + hw*8; Amazing! 
+ F16 *in_hw8c8 = pwArray + c * ohow * 8; + // + // for (U32 c8 = 0; c8 < 8; c8++) { + // for (U32 hw8 = 0; hw8 < 8; hw8++) { + // in_pack_c8hw8[c8*8 + hw8] = in_hw8c8[hw8*8 + c8]; + // } + // } + // + float16x8_t v0 = vld1q_f16(in_hw8c8 + hw * 8); + float16x8_t v1 = vld1q_f16(in_hw8c8 + hw * 8 + 8); + float16x8_t v2 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 2); + float16x8_t v3 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 3); + float16x8_t v4 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 4); + float16x8_t v5 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 5); + float16x8_t v6 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 6); + float16x8_t v7 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 7); + vst1q_f16(in_pack_c8hw8, + vzip1q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8, + vzip2q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 2, + vzip1q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 3, + vzip2q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 4, + vzip1q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 5, + vzip2q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 6, + vzip1q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 7, + vzip2q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + } + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr x1, [%[in_0], #8]\n" + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ins v0.d[1], x1\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 + "ldr x2, [%[f_0], #8]\n" + "mov v7.16b, v22.16b\n" // out_o0hw5 + "ins v18.d[1], x2\n" + "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 + "ldr x3, [%[f_0], #24]\n" + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ins v19.d[1], x3\n" + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov v16.16b, v23.16b\n" // out_o1hw6 + "mov v17.16b, v23.16b\n" // out_o1hw7 + "0:\n" + "ldr d1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr x1, [%[in_0], #24]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr x2, [%[f_0], #40]\n" + 
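// Inner-loop note: the ic loop is unrolled by two and software-pipelined for
// the in-order A55. While the fmla chain consumes the current input vector
// (v0) and filter pair (v18/v19), the next input (v1) and filters (v20/v21)
// are fetched as split ldr d / ldr x / ins sequences, whose 64-bit loads can
// dual-issue with the multiply-accumulates.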
"fmla v6.8h, v18.8h, v0.h[4]\n" + "ins v20.d[1], x2\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "fmla v8.8h, v18.8h, v0.h[6]\n" + "ldr x3, [%[f_0], #56]\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + "ins v21.d[1], x3\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + + "ldr d0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr x1, [%[in_0], #40]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "ldr x2, [%[f_0], #72]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "ins v18.d[1], x2\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "fmla v8.8h, v20.8h, v1.h[6]\n" + "ldr x3, [%[f_0], #88]\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "ins v19.d[1], x3\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "add %[f_0], %[f_0], #64\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "subs x0, x0, #2\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "fmla v15.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + "fmin v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v1.8h\n" + "fmin v16.8h, v16.8h, v1.8h\n" + "fmin v17.8h, v17.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v19.8h, v2.8h, v18.8h\n" + "fadd v20.8h, v3.8h, v18.8h\n" + "fadd v21.8h, v4.8h, v18.8h\n" + "fadd 
v22.8h, v5.8h, v18.8h\n" + "fadd v23.8h, v6.8h, v18.8h\n" + "fadd v24.8h, v7.8h, v18.8h\n" + "fadd v25.8h, v8.8h, v18.8h\n" + "fadd v26.8h, v9.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmax v23.8h, v23.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v0.8h\n" + "fmax v25.8h, v25.8h, v0.8h\n" + "fmax v26.8h, v26.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fmin v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v1.8h\n" + "fmin v25.8h, v25.8h, v1.8h\n" + "fmin v26.8h, v26.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fdiv v23.8h, v23.8h, v1.8h\n" + "fdiv v24.8h, v24.8h, v1.8h\n" + "fdiv v25.8h, v25.8h, v1.8h\n" + "fdiv v26.8h, v26.8h, v1.8h\n" + "fmul v2.8h, v19.8h, v2.8h\n" + "fmul v3.8h, v20.8h, v3.8h\n" + "fmul v4.8h, v21.8h, v4.8h\n" + "fmul v5.8h, v22.8h, v5.8h\n" + "fmul v6.8h, v23.8h, v6.8h\n" + "fmul v7.8h, v24.8h, v7.8h\n" + "fmul v8.8h, v25.8h, v8.8h\n" + "fmul v9.8h, v26.8h, v9.8h\n" + + "fadd v19.8h, v10.8h, v18.8h\n" + "fadd v20.8h, v11.8h, v18.8h\n" + "fadd v21.8h, v12.8h, v18.8h\n" + "fadd v22.8h, v13.8h, v18.8h\n" + "fadd v23.8h, v14.8h, v18.8h\n" + "fadd v24.8h, v15.8h, v18.8h\n" + "fadd v25.8h, v16.8h, v18.8h\n" + "fadd v26.8h, v17.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmax v23.8h, v23.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v0.8h\n" + "fmax v25.8h, v25.8h, v0.8h\n" + "fmax v26.8h, v26.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fmin v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v1.8h\n" + "fmin v25.8h, v25.8h, v1.8h\n" + "fmin v26.8h, v26.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fdiv v23.8h, v23.8h, v1.8h\n" + "fdiv v24.8h, v24.8h, v1.8h\n" + "fdiv v25.8h, v25.8h, v1.8h\n" + "fdiv v26.8h, v26.8h, v1.8h\n" + "fmul v10.8h, v19.8h, v10.8h\n" + "fmul v11.8h, v20.8h, v11.8h\n" + "fmul v12.8h, v21.8h, v12.8h\n" + "fmul v13.8h, v22.8h, v13.8h\n" + "fmul v14.8h, v23.8h, v14.8h\n" + "fmul v15.8h, v24.8h, v15.8h\n" + "fmul v16.8h, v25.8h, v16.8h\n" + "fmul v17.8h, v26.8h, v17.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q6, [%[out_0], #64]\n" // out_o0hw4 + "str q7, [%[out_0], #80]\n" // out_o0hw5 + "str q8, [%[out_0], #96]\n" // out_o0hw6 + "str q9, [%[out_0], #112]\n" // out_o0hw7 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + "str q14, [%[out_1], #64]\n" // out_o1hw4 + "str q15, [%[out_1], #80]\n" // out_o1hw5 + "str q16, [%[out_1], #96]\n" // out_o1hw6 + "str q17, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + 
[am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v12.16b\n" // out_o0hw0 + "ldr x1, [%[in_0], #8]\n" + "mov v3.16b, v12.16b\n" // out_o0hw1 + "ins v0.d[1], x1\n" + "mov v4.16b, v12.16b\n" // out_o0hw2 + "ldr d10, [%[f_0]]\n" // f_o0c0 + "mov v5.16b, v12.16b\n" // out_o0hw3 + "ldr x2, [%[f_0], #8]\n" + "mov v6.16b, v12.16b\n" // out_o0hw4 + "ins v10.d[1], x2\n" + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 + "0:\n" + "ldr d1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v10.8h, v0.h[0]\n" + "ldr x1, [%[in_0], #24]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "ldr d11, [%[f_0], #16]\n" // f_o0c0 + "fmla v5.8h, v10.8h, v0.h[3]\n" + "ldr x2, [%[f_0], #24]\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "ins v11.d[1], x2\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "subs x0, x0, #2\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + + "ldr d0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v11.8h, v1.h[0]\n" + "ldr x1, [%[in_0], #40]\n" + "fmla v3.8h, v11.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v4.8h, v11.8h, v1.h[2]\n" + "ldr d10, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v11.8h, v1.h[3]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v6.8h, v11.8h, v1.h[4]\n" + "ins v10.d[1], x2\n" + "fmla v7.8h, v11.8h, v1.h[5]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v8.8h, v11.8h, v1.h[6]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v9.8h, v11.8h, v1.h[7]\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v10.8h, #0x42, lsl #8\n" // three + "fadd v11.8h, v2.8h, v10.8h\n" + "fadd v12.8h, v3.8h, v10.8h\n" + "fadd v13.8h, v4.8h, v10.8h\n" + "fadd v14.8h, v5.8h, v10.8h\n" + "fadd v15.8h, v6.8h, v10.8h\n" + "fadd v16.8h, v7.8h, v10.8h\n" + "fadd v17.8h, v8.8h, 
v10.8h\n" + "fadd v18.8h, v9.8h, v10.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + "fmin v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v1.8h\n" + "fmin v16.8h, v16.8h, v1.8h\n" + "fmin v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v1.8h\n" + "fdiv v11.8h, v11.8h, v1.8h\n" + "fdiv v12.8h, v12.8h, v1.8h\n" + "fdiv v13.8h, v13.8h, v1.8h\n" + "fdiv v14.8h, v14.8h, v1.8h\n" + "fdiv v15.8h, v15.8h, v1.8h\n" + "fdiv v16.8h, v16.8h, v1.8h\n" + "fdiv v17.8h, v17.8h, v1.8h\n" + "fdiv v18.8h, v18.8h, v1.8h\n" + "fmul v2.8h, v11.8h, v2.8h\n" + "fmul v3.8h, v12.8h, v3.8h\n" + "fmul v4.8h, v13.8h, v4.8h\n" + "fmul v5.8h, v14.8h, v5.8h\n" + "fmul v6.8h, v15.8h, v6.8h\n" + "fmul v7.8h, v16.8h, v7.8h\n" + "fmul v8.8h, v17.8h, v8.8h\n" + "fmul v9.8h, v18.8h, v9.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw0 + "str q4, [%[out_0], #32]\n" // out_o0hw0 + "str q5, [%[out_0], #48]\n" // out_o0hw0 + "str q6, [%[out_0], #64]\n" // out_o0hw0 + "str q7, [%[out_0], #80]\n" // out_o0hw0 + "str q8, [%[out_0], #96]\n" // out_o0hw0 + "str q9, [%[out_0], #112]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2"); + } + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A76.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A76.cpp new file mode 100644 index 00000000..e9340602 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A76.cpp @@ -0,0 +1,1520 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h" + +EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) +{ + UNUSED(tmpBytes); + UNUSED(convParamSpec); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (dwFilterDesc.df != DF_NCHWC8 || pwFilterDesc.df != DF_NHWCN16) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + + I32 ohow = oh * ow; + F16 *pwArray = (F16 *)tmp; + + for (U32 n = 0; n < in; n++) { + // dw_conv + padding + for (U32 c = 0; c < ic; c++) { + const F16 *b = dwBiasArray + c * 8; + F16 *in_c = inArray + c * ih * iw * 8; + const F16 *f = dwFilterArray + c * fh * fw * 8; + F16 *out = pwArray + c * ohow * 8; + F16 *in0 = in_c; + F16 *in1 = in0 + iw * 8; + F16 *in2 = in1 + iw * 8; + __asm__ __volatile__( + "mov x0, %[w]\n" + "ldr q28, [%[b]]\n" + "ldr q3, [%[f], #48]\n" + "ldr q4, [%[f], #64]\n" + "ldr q5, [%[f], #80]\n" + "ldr q6, [%[f], #96]\n" + "ldr q7, [%[f], #112]\n" + "ldr q8, [%[f], #128]\n" + "ldr q13, [%[in_0]]\n" + "ldr q14, [%[in_0], #16]\n" + "ldr q15, [%[in_0], #32]\n" + "ldr q16, [%[in_0], #48]\n" + "ldr q18, [%[in_1]]\n" + "ldr q19, [%[in_1], #16]\n" + "ldr q20, [%[in_1], #32]\n" + "ldr q21, [%[in_1], #48]\n" + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 + + "ldr q17, [%[in_0], #64]\n" + "fmla v10.8h, v3.8h, v13.8h\n" + "fmla v11.8h, v3.8h, v14.8h\n" + "fmla v12.8h, v3.8h, v15.8h\n" + "ldr q22, [%[in_1], #64]\n" + "fmla v10.8h, v6.8h, v18.8h\n" + "fmla v11.8h, v6.8h, v19.8h\n" + "fmla v12.8h, v6.8h, v20.8h\n" + + "fmla v9.8h, v4.8h, v13.8h\n" + "fmla v10.8h, v4.8h, v14.8h\n" + "fmla v11.8h, v4.8h, v15.8h\n" + "fmla v12.8h, v4.8h, v16.8h\n" + "fmla v9.8h, v7.8h, v18.8h\n" + "fmla v10.8h, v7.8h, v19.8h\n" + "fmla v11.8h, v7.8h, v20.8h\n" + "fmla v12.8h, v7.8h, v21.8h\n" + + "ldr q13, [%[in_0], #80]\n" + "fmla v9.8h, v5.8h, v14.8h\n" + "fmla v10.8h, v5.8h, v15.8h\n" + "fmla v11.8h, v5.8h, v16.8h\n" + "fmla v12.8h, v5.8h, v17.8h\n" + "ldr q18, [%[in_1], #80]\n" + "fmla v9.8h, v8.8h, v19.8h\n" + "fmla v10.8h, v8.8h, v20.8h\n" + "fmla v11.8h, v8.8h, v21.8h\n" + "fmla v12.8h, v8.8h, v22.8h\n" + + "mov v14.16b, v17.16b\n" + "mov v19.16b, v22.16b\n" + "mov v15.16b, v13.16b\n" + "mov v20.16b, v18.16b\n" + "mov v13.16b, v16.16b\n" + "mov v18.16b, v21.16b\n" + "ldr q16, [%[in_0], #96]\n" + "ldr q21, [%[in_1], #96]\n" + "add %[in_0], %[in_0], #48\n" + "add %[in_1], %[in_1], #48\n" + + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "bne 111f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + + "111:\n" + "cmp 
%[depthwiseActivationMode], %[am_relu6]\n" + "bne 112f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) + "fmin v10.8h, v10.8h, v22.8h\n" + "fmin v11.8h, v11.8h, v22.8h\n" + "fmin v12.8h, v12.8h, v22.8h\n" + + "112:\n" + "cmp %[depthwiseActivationMode], %[am_h_swish]\n" + "bne 113f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three + "fadd v27.8h, v9.8h, v22.8h\n" + "fadd v29.8h, v10.8h, v22.8h\n" + "fadd v30.8h, v11.8h, v22.8h\n" + "fadd v31.8h, v12.8h, v22.8h\n" + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v27.8h, v27.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v17.8h\n" + "fmax v30.8h, v30.8h, v17.8h\n" + "fmax v31.8h, v31.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v22.8h\n" + "fmin v29.8h, v29.8h, v22.8h\n" + "fmin v30.8h, v30.8h, v22.8h\n" + "fmin v31.8h, v31.8h, v22.8h\n" + "fdiv v27.8h, v27.8h, v22.8h\n" + "fdiv v29.8h, v29.8h, v22.8h\n" + "fdiv v30.8h, v30.8h, v22.8h\n" + "fdiv v31.8h, v31.8h, v22.8h\n" + "fmul v9.8h, v27.8h, v9.8h\n" + "fmul v10.8h, v29.8h, v10.8h\n" + "fmul v11.8h, v30.8h, v11.8h\n" + "fmul v12.8h, v31.8h, v12.8h\n" + + "113:\n" + "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" + + "0:\n" + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 + + "ldr q17, [%[in_0], #64]\n" + "fmla v9.8h, v3.8h, v13.8h\n" + "fmla v10.8h, v3.8h, v14.8h\n" + "fmla v11.8h, v3.8h, v15.8h\n" + "ldr q22, [%[in_1], #64]\n" + "fmla v12.8h, v3.8h, v16.8h\n" + "fmla v9.8h, v6.8h, v18.8h\n" + "fmla v10.8h, v6.8h, v19.8h\n" + "fmla v11.8h, v6.8h, v20.8h\n" + "fmla v12.8h, v6.8h, v21.8h\n" + + "ldr q13, [%[in_0], #80]\n" + "fmla v9.8h, v4.8h, v14.8h\n" + "fmla v10.8h, v4.8h, v15.8h\n" + "fmla v11.8h, v4.8h, v16.8h\n" + "ldr q18, [%[in_1], #80]\n" + "fmla v12.8h, v4.8h, v17.8h\n" + "fmla v9.8h, v7.8h, v19.8h\n" + "fmla v10.8h, v7.8h, v20.8h\n" + "fmla v11.8h, v7.8h, v21.8h\n" + "fmla v12.8h, v7.8h, v22.8h\n" + + "ldr q14, [%[in_0], #96]\n" + "fmla v9.8h, v5.8h, v15.8h\n" + "fmla v10.8h, v5.8h, v16.8h\n" + "fmla v11.8h, v5.8h, v17.8h\n" + "ldr q19, [%[in_1], #96]\n" + "fmla v12.8h, v5.8h, v13.8h\n" + "fmla v9.8h, v8.8h, v20.8h\n" + "fmla v10.8h, v8.8h, v21.8h\n" + "fmla v11.8h, v8.8h, v22.8h\n" + "fmla v12.8h, v8.8h, v18.8h\n" + + "ldr q16, [%[in_0], #112]\n" + "mov v15.16b, v14.16b\n" + "mov v20.16b, v19.16b\n" + "mov v14.16b, v13.16b\n" + "ldr q21, [%[in_1], #112]\n" + "mov v19.16b, v18.16b\n" + "mov v13.16b, v17.16b\n" + "mov v18.16b, v22.16b\n" + + "add %[in_0], %[in_0], #64\n" + "add %[in_1], %[in_1], #64\n" + + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "bne 211f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + + "211:\n" + "cmp %[depthwiseActivationMode], %[am_relu6]\n" + "bne 212f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) + "fmin v10.8h, v10.8h, v22.8h\n" + "fmin v11.8h, v11.8h, v22.8h\n" + "fmin v12.8h, 
v12.8h, v22.8h\n" + + "212:\n" + "cmp %[depthwiseActivationMode], %[am_h_swish]\n" + "bne 213f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three + "fadd v27.8h, v9.8h, v22.8h\n" + "fadd v29.8h, v10.8h, v22.8h\n" + "fadd v30.8h, v11.8h, v22.8h\n" + "fadd v31.8h, v12.8h, v22.8h\n" + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v27.8h, v27.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v17.8h\n" + "fmax v30.8h, v30.8h, v17.8h\n" + "fmax v31.8h, v31.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v22.8h\n" + "fmin v29.8h, v29.8h, v22.8h\n" + "fmin v30.8h, v30.8h, v22.8h\n" + "fmin v31.8h, v31.8h, v22.8h\n" + "fdiv v27.8h, v27.8h, v22.8h\n" + "fdiv v29.8h, v29.8h, v22.8h\n" + "fdiv v30.8h, v30.8h, v22.8h\n" + "fdiv v31.8h, v31.8h, v22.8h\n" + "fmul v9.8h, v27.8h, v9.8h\n" + "fmul v10.8h, v29.8h, v10.8h\n" + "fmul v11.8h, v30.8h, v11.8h\n" + "fmul v12.8h, v31.8h, v12.8h\n" + + "213:\n" + "subs x0, x0, #4\n" + "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" + "bne 0b\n" + + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 + + "ldr q17, [%[in_0], #64]\n" + "fmla v9.8h, v3.8h, v13.8h\n" + "fmla v10.8h, v3.8h, v14.8h\n" + "fmla v11.8h, v3.8h, v15.8h\n" + "fmla v12.8h, v3.8h, v16.8h\n" + "ldr q22, [%[in_1], #64]\n" + "fmla v9.8h, v6.8h, v18.8h\n" + "fmla v10.8h, v6.8h, v19.8h\n" + "fmla v11.8h, v6.8h, v20.8h\n" + "fmla v12.8h, v6.8h, v21.8h\n" + + "fmla v9.8h, v4.8h, v14.8h\n" + "fmla v10.8h, v4.8h, v15.8h\n" + "fmla v11.8h, v4.8h, v16.8h\n" + "fmla v12.8h, v4.8h, v17.8h\n" + "fmla v9.8h, v7.8h, v19.8h\n" + "fmla v10.8h, v7.8h, v20.8h\n" + "fmla v11.8h, v7.8h, v21.8h\n" + "fmla v12.8h, v7.8h, v22.8h\n" + + "fmla v9.8h, v5.8h, v15.8h\n" + "fmla v10.8h, v5.8h, v16.8h\n" + "fmla v11.8h, v5.8h, v17.8h\n" + "fmla v9.8h, v8.8h, v20.8h\n" + "fmla v10.8h, v8.8h, v21.8h\n" + "fmla v11.8h, v8.8h, v22.8h\n" + + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "bne 311f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + + "311:\n" + "cmp %[depthwiseActivationMode], %[am_relu6]\n" + "bne 312f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) + "fmin v10.8h, v10.8h, v22.8h\n" + "fmin v11.8h, v11.8h, v22.8h\n" + "fmin v12.8h, v12.8h, v22.8h\n" + + "312:\n" + "cmp %[depthwiseActivationMode], %[am_h_swish]\n" + "bne 313f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three + "fadd v27.8h, v9.8h, v22.8h\n" + "fadd v29.8h, v10.8h, v22.8h\n" + "fadd v30.8h, v11.8h, v22.8h\n" + "fadd v31.8h, v12.8h, v22.8h\n" + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v27.8h, v27.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v17.8h\n" + "fmax v30.8h, v30.8h, v17.8h\n" + "fmax v31.8h, v31.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v22.8h\n" + "fmin v29.8h, v29.8h, v22.8h\n" + "fmin v30.8h, v30.8h, v22.8h\n" + "fmin v31.8h, v31.8h, v22.8h\n" + "fdiv v27.8h, v27.8h, v22.8h\n" + "fdiv v29.8h, v29.8h, v22.8h\n" + "fdiv v30.8h, v30.8h, v22.8h\n" + "fdiv v31.8h, v31.8h, v22.8h\n" + "fmul v9.8h, v27.8h, v9.8h\n" + "fmul v10.8h, v29.8h, v10.8h\n" + "fmul v11.8h, v30.8h, v11.8h\n" + "fmul 
v12.8h, v31.8h, v12.8h\n" + + "313:\n" + "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" + : [out] "+r"(out), [in_0] "+r"(in0), [in_1] "+r"(in1) + : [f] "r"(f), [b] "r"(b), [w] "r"((I64)ow - 8), + [depthwiseActivationMode] "r"((I64)depthwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", + "x3"); + + for (U32 h = 0; h < oh - 2; h++) { + in0 = in_c + h * iw * 8; + in1 = in0 + iw * 8; + in2 = in1 + iw * 8; + __asm__ __volatile__( + "mov x0, %[w]\n" + "ldr q28, [%[b]]\n" + "ldr q0, [%[f]]\n" + "ldr q1, [%[f], #16]\n" + "ldr q2, [%[f], #32]\n" + "ldr q3, [%[f], #48]\n" + "ldr q4, [%[f], #64]\n" + "ldr q5, [%[f], #80]\n" + "ldr q6, [%[f], #96]\n" + "ldr q7, [%[f], #112]\n" + "ldr q8, [%[f], #128]\n" + "ldr q13, [%[in_0]]\n" + "ldr q14, [%[in_0], #16]\n" + "ldr q15, [%[in_0], #32]\n" + "ldr q16, [%[in_0], #48]\n" + "ldr q18, [%[in_1]]\n" + "ldr q19, [%[in_1], #16]\n" + "ldr q20, [%[in_1], #32]\n" + "ldr q21, [%[in_1], #48]\n" + "ldr q23, [%[in_2]]\n" + "ldr q24, [%[in_2], #16]\n" + "ldr q25, [%[in_2], #32]\n" + "ldr q26, [%[in_2], #48]\n" + + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 + + "ldr q17, [%[in_0], #64]\n" + "fmla v10.8h, v0.8h, v13.8h\n" + "fmla v11.8h, v0.8h, v14.8h\n" + "fmla v12.8h, v0.8h, v15.8h\n" + "ldr q22, [%[in_1], #64]\n" + "fmla v10.8h, v3.8h, v18.8h\n" + "fmla v11.8h, v3.8h, v19.8h\n" + "fmla v12.8h, v3.8h, v20.8h\n" + "ldr q27, [%[in_2], #64]\n" + "fmla v10.8h, v6.8h, v23.8h\n" + "fmla v11.8h, v6.8h, v24.8h\n" + "fmla v12.8h, v6.8h, v25.8h\n" + + "fmla v9.8h, v1.8h, v13.8h\n" + "fmla v10.8h, v1.8h, v14.8h\n" + "fmla v11.8h, v1.8h, v15.8h\n" + "fmla v12.8h, v1.8h, v16.8h\n" + "fmla v9.8h, v4.8h, v18.8h\n" + "fmla v10.8h, v4.8h, v19.8h\n" + "fmla v11.8h, v4.8h, v20.8h\n" + "fmla v12.8h, v4.8h, v21.8h\n" + "fmla v9.8h, v7.8h, v23.8h\n" + "fmla v10.8h, v7.8h, v24.8h\n" + "fmla v11.8h, v7.8h, v25.8h\n" + "fmla v12.8h, v7.8h, v26.8h\n" + + "ldr q13, [%[in_0], #80]\n" + "fmla v9.8h, v2.8h, v14.8h\n" + "fmla v10.8h, v2.8h, v15.8h\n" + "fmla v11.8h, v2.8h, v16.8h\n" + "fmla v12.8h, v2.8h, v17.8h\n" + "ldr q18, [%[in_1], #80]\n" + "fmla v9.8h, v5.8h, v19.8h\n" + "fmla v10.8h, v5.8h, v20.8h\n" + "fmla v11.8h, v5.8h, v21.8h\n" + "fmla v12.8h, v5.8h, v22.8h\n" + "ldr q23, [%[in_2], #80]\n" + "fmla v9.8h, v8.8h, v24.8h\n" + "fmla v10.8h, v8.8h, v25.8h\n" + "fmla v11.8h, v8.8h, v26.8h\n" + "fmla v12.8h, v8.8h, v27.8h\n" + + "mov v14.16b, v17.16b\n" + "mov v19.16b, v22.16b\n" + "mov v24.16b, v27.16b\n" + "mov v15.16b, v13.16b\n" + "mov v20.16b, v18.16b\n" + "mov v25.16b, v23.16b\n" + "mov v13.16b, v16.16b\n" + "mov v18.16b, v21.16b\n" + "mov v23.16b, v26.16b\n" + "ldr q16, [%[in_0], #96]\n" + "ldr q21, [%[in_1], #96]\n" + "ldr q26, [%[in_2], #96]\n" + "add %[in_0], %[in_0], #48\n" + "add %[in_1], %[in_1], #48\n" + "add %[in_2], %[in_2], #48\n" + + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "bne 111f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, 
v17.8h\n" + + "111:\n" + "cmp %[depthwiseActivationMode], %[am_relu6]\n" + "bne 112f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) + "fmin v10.8h, v10.8h, v22.8h\n" + "fmin v11.8h, v11.8h, v22.8h\n" + "fmin v12.8h, v12.8h, v22.8h\n" + + "112:\n" + "cmp %[depthwiseActivationMode], %[am_h_swish]\n" + "bne 113f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three + "fadd v27.8h, v9.8h, v22.8h\n" + "fadd v29.8h, v10.8h, v22.8h\n" + "fadd v30.8h, v11.8h, v22.8h\n" + "fadd v31.8h, v12.8h, v22.8h\n" + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v27.8h, v27.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v17.8h\n" + "fmax v30.8h, v30.8h, v17.8h\n" + "fmax v31.8h, v31.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v22.8h\n" + "fmin v29.8h, v29.8h, v22.8h\n" + "fmin v30.8h, v30.8h, v22.8h\n" + "fmin v31.8h, v31.8h, v22.8h\n" + "fdiv v27.8h, v27.8h, v22.8h\n" + "fdiv v29.8h, v29.8h, v22.8h\n" + "fdiv v30.8h, v30.8h, v22.8h\n" + "fdiv v31.8h, v31.8h, v22.8h\n" + "fmul v9.8h, v27.8h, v9.8h\n" + "fmul v10.8h, v29.8h, v10.8h\n" + "fmul v11.8h, v30.8h, v11.8h\n" + "fmul v12.8h, v31.8h, v12.8h\n" + + "113:\n" + "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" + + "0:\n" + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 + + "ldr q17, [%[in_0], #64]\n" + "fmla v9.8h, v0.8h, v13.8h\n" + "fmla v10.8h, v0.8h, v14.8h\n" + "fmla v11.8h, v0.8h, v15.8h\n" + "ldr q22, [%[in_1], #64]\n" + "fmla v12.8h, v0.8h, v16.8h\n" + "fmla v9.8h, v3.8h, v18.8h\n" + "fmla v10.8h, v3.8h, v19.8h\n" + "ldr q27, [%[in_2], #64]\n" + "fmla v11.8h, v3.8h, v20.8h\n" + "fmla v12.8h, v3.8h, v21.8h\n" + "fmla v9.8h, v6.8h, v23.8h\n" + "fmla v10.8h, v6.8h, v24.8h\n" + "fmla v11.8h, v6.8h, v25.8h\n" + "fmla v12.8h, v6.8h, v26.8h\n" + + "ldr q13, [%[in_0], #80]\n" + "fmla v9.8h, v1.8h, v14.8h\n" + "fmla v10.8h, v1.8h, v15.8h\n" + "fmla v11.8h, v1.8h, v16.8h\n" + "ldr q18, [%[in_1], #80]\n" + "fmla v12.8h, v1.8h, v17.8h\n" + "fmla v9.8h, v4.8h, v19.8h\n" + "fmla v10.8h, v4.8h, v20.8h\n" + "ldr q23, [%[in_2], #80]\n" + "fmla v11.8h, v4.8h, v21.8h\n" + "fmla v12.8h, v4.8h, v22.8h\n" + "fmla v9.8h, v7.8h, v24.8h\n" + "fmla v10.8h, v7.8h, v25.8h\n" + "fmla v11.8h, v7.8h, v26.8h\n" + "fmla v12.8h, v7.8h, v27.8h\n" + + "ldr q14, [%[in_0], #96]\n" + "fmla v9.8h, v2.8h, v15.8h\n" + "fmla v10.8h, v2.8h, v16.8h\n" + "fmla v11.8h, v2.8h, v17.8h\n" + "ldr q19, [%[in_1], #96]\n" + "fmla v12.8h, v2.8h, v13.8h\n" + "fmla v9.8h, v5.8h, v20.8h\n" + "fmla v10.8h, v5.8h, v21.8h\n" + "ldr q24, [%[in_2], #96]\n" + "fmla v11.8h, v5.8h, v22.8h\n" + "fmla v12.8h, v5.8h, v18.8h\n" + "fmla v9.8h, v8.8h, v25.8h\n" + "fmla v10.8h, v8.8h, v26.8h\n" + "fmla v11.8h, v8.8h, v27.8h\n" + "fmla v12.8h, v8.8h, v23.8h\n" + + "ldr q16, [%[in_0], #112]\n" + "mov v15.16b, v14.16b\n" + "mov v20.16b, v19.16b\n" + "mov v25.16b, v24.16b\n" + "ldr q21, [%[in_1], #112]\n" + "mov v14.16b, v13.16b\n" + "mov v19.16b, v18.16b\n" + "mov v24.16b, v23.16b\n" + "ldr q26, [%[in_2], #112]\n" + "mov v13.16b, v17.16b\n" + "mov v18.16b, v22.16b\n" + "mov v23.16b, v27.16b\n" + + "add %[in_0], %[in_0], #64\n" + "add %[in_1], %[in_1], #64\n" + "add %[in_2], %[in_2], #64\n" + + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be 
reuse + "bne 211f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + + "211:\n" + "cmp %[depthwiseActivationMode], %[am_relu6]\n" + "bne 212f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) + "fmin v10.8h, v10.8h, v22.8h\n" + "fmin v11.8h, v11.8h, v22.8h\n" + "fmin v12.8h, v12.8h, v22.8h\n" + + "212:\n" + "cmp %[depthwiseActivationMode], %[am_h_swish]\n" + "bne 213f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three + "fadd v27.8h, v9.8h, v22.8h\n" + "fadd v29.8h, v10.8h, v22.8h\n" + "fadd v30.8h, v11.8h, v22.8h\n" + "fadd v31.8h, v12.8h, v22.8h\n" + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v27.8h, v27.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v17.8h\n" + "fmax v30.8h, v30.8h, v17.8h\n" + "fmax v31.8h, v31.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v22.8h\n" + "fmin v29.8h, v29.8h, v22.8h\n" + "fmin v30.8h, v30.8h, v22.8h\n" + "fmin v31.8h, v31.8h, v22.8h\n" + "fdiv v27.8h, v27.8h, v22.8h\n" + "fdiv v29.8h, v29.8h, v22.8h\n" + "fdiv v30.8h, v30.8h, v22.8h\n" + "fdiv v31.8h, v31.8h, v22.8h\n" + "fmul v9.8h, v27.8h, v9.8h\n" + "fmul v10.8h, v29.8h, v10.8h\n" + "fmul v11.8h, v30.8h, v11.8h\n" + "fmul v12.8h, v31.8h, v12.8h\n" + + "213:\n" + "subs x0, x0, #4\n" + "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" + "bne 0b\n" + + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 + + "ldr q17, [%[in_0], #64]\n" + "fmla v9.8h, v0.8h, v13.8h\n" + "fmla v10.8h, v0.8h, v14.8h\n" + "fmla v11.8h, v0.8h, v15.8h\n" + "fmla v12.8h, v0.8h, v16.8h\n" + "ldr q22, [%[in_1], #64]\n" + "fmla v9.8h, v3.8h, v18.8h\n" + "fmla v10.8h, v3.8h, v19.8h\n" + "fmla v11.8h, v3.8h, v20.8h\n" + "fmla v12.8h, v3.8h, v21.8h\n" + "ldr q27, [%[in_2], #64]\n" + "fmla v9.8h, v6.8h, v23.8h\n" + "fmla v10.8h, v6.8h, v24.8h\n" + "fmla v11.8h, v6.8h, v25.8h\n" + "fmla v12.8h, v6.8h, v26.8h\n" + + "fmla v9.8h, v1.8h, v14.8h\n" + "fmla v10.8h, v1.8h, v15.8h\n" + "fmla v11.8h, v1.8h, v16.8h\n" + "fmla v12.8h, v1.8h, v17.8h\n" + "fmla v9.8h, v4.8h, v19.8h\n" + "fmla v10.8h, v4.8h, v20.8h\n" + "fmla v11.8h, v4.8h, v21.8h\n" + "fmla v12.8h, v4.8h, v22.8h\n" + "fmla v9.8h, v7.8h, v24.8h\n" + "fmla v10.8h, v7.8h, v25.8h\n" + "fmla v11.8h, v7.8h, v26.8h\n" + "fmla v12.8h, v7.8h, v27.8h\n" + + "fmla v9.8h, v2.8h, v15.8h\n" + "fmla v10.8h, v2.8h, v16.8h\n" + "fmla v11.8h, v2.8h, v17.8h\n" + "fmla v9.8h, v5.8h, v20.8h\n" + "fmla v10.8h, v5.8h, v21.8h\n" + "fmla v11.8h, v5.8h, v22.8h\n" + "fmla v9.8h, v8.8h, v25.8h\n" + "fmla v10.8h, v8.8h, v26.8h\n" + "fmla v11.8h, v8.8h, v27.8h\n" + + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "bne 311f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + + "311:\n" + "cmp %[depthwiseActivationMode], %[am_relu6]\n" + "bne 312f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, 
v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) + "fmin v10.8h, v10.8h, v22.8h\n" + "fmin v11.8h, v11.8h, v22.8h\n" + "fmin v12.8h, v12.8h, v22.8h\n" + + "312:\n" + "cmp %[depthwiseActivationMode], %[am_h_swish]\n" + "bne 313f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three + "fadd v27.8h, v9.8h, v22.8h\n" + "fadd v29.8h, v10.8h, v22.8h\n" + "fadd v30.8h, v11.8h, v22.8h\n" + "fadd v31.8h, v12.8h, v22.8h\n" + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v27.8h, v27.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v17.8h\n" + "fmax v30.8h, v30.8h, v17.8h\n" + "fmax v31.8h, v31.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v22.8h\n" + "fmin v29.8h, v29.8h, v22.8h\n" + "fmin v30.8h, v30.8h, v22.8h\n" + "fmin v31.8h, v31.8h, v22.8h\n" + "fdiv v27.8h, v27.8h, v22.8h\n" + "fdiv v29.8h, v29.8h, v22.8h\n" + "fdiv v30.8h, v30.8h, v22.8h\n" + "fdiv v31.8h, v31.8h, v22.8h\n" + "fmul v9.8h, v27.8h, v9.8h\n" + "fmul v10.8h, v29.8h, v10.8h\n" + "fmul v11.8h, v30.8h, v11.8h\n" + "fmul v12.8h, v31.8h, v12.8h\n" + + "313:\n" + "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" + : [out] "+r"(out), [in_0] "+r"(in0), [in_1] "+r"(in1), [in_2] "+r"(in2) + : [f] "r"(f), [b] "r"(b), [w] "r"((I64)ow - 8), + [depthwiseActivationMode] "r"((I64)depthwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "x0", "x1", "x2", "x3"); + } + in0 = in_c + (ih - 2) * iw * 8; + in1 = in0 + iw * 8; + in2 = in1 + iw * 8; + __asm__ __volatile__( + "mov x0, %[w]\n" + "ldr q28, [%[b]]\n" + "ldr q0, [%[f]]\n" + "ldr q1, [%[f], #16]\n" + "ldr q2, [%[f], #32]\n" + "ldr q3, [%[f], #48]\n" + "ldr q4, [%[f], #64]\n" + "ldr q5, [%[f], #80]\n" + "ldr q13, [%[in_0]]\n" + "ldr q14, [%[in_0], #16]\n" + "ldr q15, [%[in_0], #32]\n" + "ldr q16, [%[in_0], #48]\n" + "ldr q18, [%[in_1]]\n" + "ldr q19, [%[in_1], #16]\n" + "ldr q20, [%[in_1], #32]\n" + "ldr q21, [%[in_1], #48]\n" + + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 + + "ldr q17, [%[in_0], #64]\n" + "fmla v10.8h, v0.8h, v13.8h\n" + "fmla v11.8h, v0.8h, v14.8h\n" + "fmla v12.8h, v0.8h, v15.8h\n" + "ldr q22, [%[in_1], #64]\n" + "fmla v10.8h, v3.8h, v18.8h\n" + "fmla v11.8h, v3.8h, v19.8h\n" + "fmla v12.8h, v3.8h, v20.8h\n" + + "fmla v9.8h, v1.8h, v13.8h\n" + "fmla v10.8h, v1.8h, v14.8h\n" + "fmla v11.8h, v1.8h, v15.8h\n" + "fmla v12.8h, v1.8h, v16.8h\n" + "fmla v9.8h, v4.8h, v18.8h\n" + "fmla v10.8h, v4.8h, v19.8h\n" + "fmla v11.8h, v4.8h, v20.8h\n" + "fmla v12.8h, v4.8h, v21.8h\n" + + "ldr q13, [%[in_0], #80]\n" + "fmla v9.8h, v2.8h, v14.8h\n" + "fmla v10.8h, v2.8h, v15.8h\n" + "fmla v11.8h, v2.8h, v16.8h\n" + "fmla v12.8h, v2.8h, v17.8h\n" + "ldr q18, [%[in_1], #80]\n" + "fmla v9.8h, v5.8h, v19.8h\n" + "fmla v10.8h, v5.8h, v20.8h\n" + "fmla v11.8h, v5.8h, v21.8h\n" + "fmla v12.8h, v5.8h, v22.8h\n" + + "mov v14.16b, v17.16b\n" + "mov v19.16b, v22.16b\n" + "mov v15.16b, v13.16b\n" + "mov v20.16b, v18.16b\n" + "mov v13.16b, v16.16b\n" + "mov v18.16b, v21.16b\n" + "ldr q16, [%[in_0], #96]\n" + "ldr q21, [%[in_1], #96]\n" + "add %[in_0], %[in_0], #48\n" + "add %[in_1], %[in_1], #48\n" + + "cmp 
%[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "bne 111f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + + "111:\n" + "cmp %[depthwiseActivationMode], %[am_relu6]\n" + "bne 112f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) + "fmin v10.8h, v10.8h, v22.8h\n" + "fmin v11.8h, v11.8h, v22.8h\n" + "fmin v12.8h, v12.8h, v22.8h\n" + + "112:\n" + "cmp %[depthwiseActivationMode], %[am_h_swish]\n" + "bne 113f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three + "fadd v27.8h, v9.8h, v22.8h\n" + "fadd v29.8h, v10.8h, v22.8h\n" + "fadd v30.8h, v11.8h, v22.8h\n" + "fadd v31.8h, v12.8h, v22.8h\n" + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v27.8h, v27.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v17.8h\n" + "fmax v30.8h, v30.8h, v17.8h\n" + "fmax v31.8h, v31.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v22.8h\n" + "fmin v29.8h, v29.8h, v22.8h\n" + "fmin v30.8h, v30.8h, v22.8h\n" + "fmin v31.8h, v31.8h, v22.8h\n" + "fdiv v27.8h, v27.8h, v22.8h\n" + "fdiv v29.8h, v29.8h, v22.8h\n" + "fdiv v30.8h, v30.8h, v22.8h\n" + "fdiv v31.8h, v31.8h, v22.8h\n" + "fmul v9.8h, v27.8h, v9.8h\n" + "fmul v10.8h, v29.8h, v10.8h\n" + "fmul v11.8h, v30.8h, v11.8h\n" + "fmul v12.8h, v31.8h, v12.8h\n" + + "113:\n" + "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" + + "0:\n" + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 + + "ldr q17, [%[in_0], #64]\n" + "fmla v9.8h, v0.8h, v13.8h\n" + "fmla v10.8h, v0.8h, v14.8h\n" + "fmla v11.8h, v0.8h, v15.8h\n" + "ldr q22, [%[in_1], #64]\n" + "fmla v12.8h, v0.8h, v16.8h\n" + "fmla v9.8h, v3.8h, v18.8h\n" + "fmla v10.8h, v3.8h, v19.8h\n" + "fmla v11.8h, v3.8h, v20.8h\n" + "fmla v12.8h, v3.8h, v21.8h\n" + + "ldr q13, [%[in_0], #80]\n" + "fmla v9.8h, v1.8h, v14.8h\n" + "fmla v10.8h, v1.8h, v15.8h\n" + "fmla v11.8h, v1.8h, v16.8h\n" + "ldr q18, [%[in_1], #80]\n" + "fmla v12.8h, v1.8h, v17.8h\n" + "fmla v9.8h, v4.8h, v19.8h\n" + "fmla v10.8h, v4.8h, v20.8h\n" + "fmla v11.8h, v4.8h, v21.8h\n" + "fmla v12.8h, v4.8h, v22.8h\n" + + "ldr q14, [%[in_0], #96]\n" + "fmla v9.8h, v2.8h, v15.8h\n" + "fmla v10.8h, v2.8h, v16.8h\n" + "fmla v11.8h, v2.8h, v17.8h\n" + "ldr q19, [%[in_1], #96]\n" + "fmla v12.8h, v2.8h, v13.8h\n" + "fmla v9.8h, v5.8h, v20.8h\n" + "fmla v10.8h, v5.8h, v21.8h\n" + "fmla v11.8h, v5.8h, v22.8h\n" + "fmla v12.8h, v5.8h, v18.8h\n" + + "ldr q16, [%[in_0], #112]\n" + "mov v15.16b, v14.16b\n" + "mov v20.16b, v19.16b\n" + "ldr q21, [%[in_1], #112]\n" + "mov v14.16b, v13.16b\n" + "mov v19.16b, v18.16b\n" + "mov v13.16b, v17.16b\n" + "mov v18.16b, v22.16b\n" + + "add %[in_0], %[in_0], #64\n" + "add %[in_1], %[in_1], #64\n" + + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "bne 211f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + + "211:\n" + "cmp %[depthwiseActivationMode], %[am_relu6]\n" + "bne 212f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi 
v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) + "fmin v10.8h, v10.8h, v22.8h\n" + "fmin v11.8h, v11.8h, v22.8h\n" + "fmin v12.8h, v12.8h, v22.8h\n" + + "212:\n" + "cmp %[depthwiseActivationMode], %[am_h_swish]\n" + "bne 213f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three + "fadd v27.8h, v9.8h, v22.8h\n" + "fadd v29.8h, v10.8h, v22.8h\n" + "fadd v30.8h, v11.8h, v22.8h\n" + "fadd v31.8h, v12.8h, v22.8h\n" + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v27.8h, v27.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v17.8h\n" + "fmax v30.8h, v30.8h, v17.8h\n" + "fmax v31.8h, v31.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v22.8h\n" + "fmin v29.8h, v29.8h, v22.8h\n" + "fmin v30.8h, v30.8h, v22.8h\n" + "fmin v31.8h, v31.8h, v22.8h\n" + "fdiv v27.8h, v27.8h, v22.8h\n" + "fdiv v29.8h, v29.8h, v22.8h\n" + "fdiv v30.8h, v30.8h, v22.8h\n" + "fdiv v31.8h, v31.8h, v22.8h\n" + "fmul v9.8h, v27.8h, v9.8h\n" + "fmul v10.8h, v29.8h, v10.8h\n" + "fmul v11.8h, v30.8h, v11.8h\n" + "fmul v12.8h, v31.8h, v12.8h\n" + + "213:\n" + "subs x0, x0, #4\n" + "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" + "bne 0b\n" + + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 + + "ldr q17, [%[in_0], #64]\n" + "fmla v9.8h, v0.8h, v13.8h\n" + "fmla v10.8h, v0.8h, v14.8h\n" + "fmla v11.8h, v0.8h, v15.8h\n" + "fmla v12.8h, v0.8h, v16.8h\n" + "ldr q22, [%[in_1], #64]\n" + "fmla v9.8h, v3.8h, v18.8h\n" + "fmla v10.8h, v3.8h, v19.8h\n" + "fmla v11.8h, v3.8h, v20.8h\n" + "fmla v12.8h, v3.8h, v21.8h\n" + + "fmla v9.8h, v1.8h, v14.8h\n" + "fmla v10.8h, v1.8h, v15.8h\n" + "fmla v11.8h, v1.8h, v16.8h\n" + "fmla v12.8h, v1.8h, v17.8h\n" + "fmla v9.8h, v4.8h, v19.8h\n" + "fmla v10.8h, v4.8h, v20.8h\n" + "fmla v11.8h, v4.8h, v21.8h\n" + "fmla v12.8h, v4.8h, v22.8h\n" + + "fmla v9.8h, v2.8h, v15.8h\n" + "fmla v10.8h, v2.8h, v16.8h\n" + "fmla v11.8h, v2.8h, v17.8h\n" + "fmla v9.8h, v5.8h, v20.8h\n" + "fmla v10.8h, v5.8h, v21.8h\n" + "fmla v11.8h, v5.8h, v22.8h\n" + + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "bne 311f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + + "311:\n" + "cmp %[depthwiseActivationMode], %[am_relu6]\n" + "bne 312f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) + "fmax v10.8h, v10.8h, v17.8h\n" + "fmax v11.8h, v11.8h, v17.8h\n" + "fmax v12.8h, v12.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) + "fmin v10.8h, v10.8h, v22.8h\n" + "fmin v11.8h, v11.8h, v22.8h\n" + "fmin v12.8h, v12.8h, v22.8h\n" + + "312:\n" + "cmp %[depthwiseActivationMode], %[am_h_swish]\n" + "bne 313f\n" + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three + "fadd v27.8h, v9.8h, v22.8h\n" + "fadd v29.8h, v10.8h, v22.8h\n" + "fadd v30.8h, v11.8h, v22.8h\n" + "fadd v31.8h, v12.8h, v22.8h\n" + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v27.8h, v27.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v17.8h\n" + "fmax v30.8h, v30.8h, v17.8h\n" + "fmax v31.8h, v31.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v22.8h\n" + "fmin v29.8h, v29.8h, v22.8h\n" + 
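// (Same activation epilogue as the earlier tiles: within each kernel, labels
// 11x handle the left-edge tile, 21x the steady-state loop and 31x the
// right-edge tail, each followed by a 64-byte st1 of four output vectors.)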
"fmin v30.8h, v30.8h, v22.8h\n" + "fmin v31.8h, v31.8h, v22.8h\n" + "fdiv v27.8h, v27.8h, v22.8h\n" + "fdiv v29.8h, v29.8h, v22.8h\n" + "fdiv v30.8h, v30.8h, v22.8h\n" + "fdiv v31.8h, v31.8h, v22.8h\n" + "fmul v9.8h, v27.8h, v9.8h\n" + "fmul v10.8h, v29.8h, v10.8h\n" + "fmul v11.8h, v30.8h, v11.8h\n" + "fmul v12.8h, v31.8h, v12.8h\n" + + "313:\n" + "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" + : [out] "+r"(out), [in_0] "+r"(in0), [in_1] "+r"(in1) + : [f] "r"(f), [b] "r"(b), [w] "r"((I64)ow - 8), + [depthwiseActivationMode] "r"((I64)depthwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", + "x3"); + } + + // pw_conv + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 8; + const F16 *f_o0c0 = pwFilterArray; + F16 *in_pack = pwArray + ohow * ic * 8; + // pack input + // NCHWc8 => NHWChw8 + for (U32 c = 0; c < ic; c++) { + F16 *in_pack_c8hw8 = in_pack + c * 8 * 8; + // it is 2% faster than in_hw8c8 = ... + hw*8; Amazing! + F16 *in_hw8c8 = pwArray + c * ohow * 8; + // + // for (U32 c8 = 0; c8 < 8; c8++) { + // for (U32 hw8 = 0; hw8 < 8; hw8++) { + // in_pack_c8hw8[c8*8 + hw8] = in_hw8c8[hw8*8 + c8]; + // } + // } + // + float16x8_t v0 = vld1q_f16(in_hw8c8 + hw * 8); + float16x8_t v1 = vld1q_f16(in_hw8c8 + hw * 8 + 8); + float16x8_t v2 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 2); + float16x8_t v3 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 3); + float16x8_t v4 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 4); + float16x8_t v5 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 5); + float16x8_t v6 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 6); + float16x8_t v7 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 7); + vst1q_f16(in_pack_c8hw8, + vzip1q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8, + vzip2q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 2, + vzip1q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 3, + vzip2q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 4, + vzip1q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 5, + vzip2q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 6, + vzip1q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 7, + vzip2q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + } + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, 
[%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 + "mov v7.16b, v22.16b\n" // out_o0hw5 + "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov v16.16b, v23.16b\n" // out_o1hw6 + "mov v17.16b, v23.16b\n" // out_o1hw7 + "0:\n" + "ldr q1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v18.8h, v0.h[3]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "ldr q21, [%[f_0], #48]\n" // f_o1c0 + "fmla v8.8h, v18.8h, v0.h[6]\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + + "ldr q0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "fmla v8.8h, v20.8h, v1.h[6]\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "add %[f_0], %[f_0], #64\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "subs x0, x0, #2\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "fmla v15.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + 
"fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + "fmin v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v1.8h\n" + "fmin v16.8h, v16.8h, v1.8h\n" + "fmin v17.8h, v17.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v19.8h, v2.8h, v18.8h\n" + "fadd v20.8h, v3.8h, v18.8h\n" + "fadd v21.8h, v4.8h, v18.8h\n" + "fadd v22.8h, v5.8h, v18.8h\n" + "fadd v23.8h, v6.8h, v18.8h\n" + "fadd v24.8h, v7.8h, v18.8h\n" + "fadd v25.8h, v8.8h, v18.8h\n" + "fadd v26.8h, v9.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmax v23.8h, v23.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v0.8h\n" + "fmax v25.8h, v25.8h, v0.8h\n" + "fmax v26.8h, v26.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fmin v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v1.8h\n" + "fmin v25.8h, v25.8h, v1.8h\n" + "fmin v26.8h, v26.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fdiv v23.8h, v23.8h, v1.8h\n" + "fdiv v24.8h, v24.8h, v1.8h\n" + "fdiv v25.8h, v25.8h, v1.8h\n" + "fdiv v26.8h, v26.8h, v1.8h\n" + "fmul v2.8h, v19.8h, v2.8h\n" + "fmul v3.8h, v20.8h, v3.8h\n" + "fmul v4.8h, v21.8h, v4.8h\n" + "fmul v5.8h, v22.8h, v5.8h\n" + "fmul v6.8h, v23.8h, v6.8h\n" + "fmul v7.8h, v24.8h, v7.8h\n" + "fmul v8.8h, v25.8h, v8.8h\n" + "fmul v9.8h, v26.8h, v9.8h\n" + + "fadd v19.8h, v10.8h, v18.8h\n" + "fadd v20.8h, v11.8h, v18.8h\n" + "fadd v21.8h, v12.8h, v18.8h\n" + "fadd v22.8h, v13.8h, v18.8h\n" + "fadd v23.8h, v14.8h, v18.8h\n" + "fadd v24.8h, v15.8h, v18.8h\n" + "fadd v25.8h, v16.8h, v18.8h\n" + "fadd v26.8h, v17.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmax v23.8h, v23.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v0.8h\n" + "fmax v25.8h, v25.8h, v0.8h\n" + "fmax v26.8h, v26.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fmin v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v1.8h\n" + "fmin v25.8h, v25.8h, v1.8h\n" + "fmin v26.8h, v26.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fdiv v23.8h, v23.8h, v1.8h\n" + "fdiv v24.8h, v24.8h, v1.8h\n" + "fdiv v25.8h, v25.8h, v1.8h\n" + "fdiv v26.8h, v26.8h, v1.8h\n" + "fmul v10.8h, v19.8h, v10.8h\n" + "fmul v11.8h, v20.8h, v11.8h\n" + "fmul v12.8h, v21.8h, v12.8h\n" + "fmul v13.8h, v22.8h, v13.8h\n" + "fmul v14.8h, v23.8h, v14.8h\n" + "fmul v15.8h, v24.8h, v15.8h\n" + "fmul v16.8h, v25.8h, v16.8h\n" + "fmul v17.8h, v26.8h, v17.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q6, [%[out_0], #64]\n" // out_o0hw4 + 
"str q7, [%[out_0], #80]\n" // out_o0hw5 + "str q8, [%[out_0], #96]\n" // out_o0hw6 + "str q9, [%[out_0], #112]\n" // out_o0hw7 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + "str q14, [%[out_1], #64]\n" // out_o1hw4 + "str q15, [%[out_1], #80]\n" // out_o1hw5 + "str q16, [%[out_1], #96]\n" // out_o1hw6 + "str q17, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v12.16b\n" // out_o0hw0 + "mov v3.16b, v12.16b\n" // out_o0hw1 + "mov v4.16b, v12.16b\n" // out_o0hw2 + "ldr q10, [%[f_0]]\n" // f_o0c0 + "mov v5.16b, v12.16b\n" // out_o0hw3 + "mov v6.16b, v12.16b\n" // out_o0hw4 + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 + "0:\n" + "ldr q1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v10.8h, v0.h[0]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "ldr q11, [%[f_0], #16]\n" // f_o0c0 + "fmla v5.8h, v10.8h, v0.h[3]\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "subs x0, x0, #2\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + + "ldr q0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v11.8h, v1.h[0]\n" + "fmla v3.8h, v11.8h, v1.h[1]\n" + "fmla v4.8h, v11.8h, v1.h[2]\n" + "ldr q10, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v11.8h, v1.h[3]\n" + "fmla v6.8h, v11.8h, v1.h[4]\n" + "fmla v7.8h, v11.8h, v1.h[5]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v8.8h, v11.8h, v1.h[6]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v9.8h, v11.8h, v1.h[7]\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" 
+ "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v10.8h, #0x42, lsl #8\n" // three + "fadd v11.8h, v2.8h, v10.8h\n" + "fadd v12.8h, v3.8h, v10.8h\n" + "fadd v13.8h, v4.8h, v10.8h\n" + "fadd v14.8h, v5.8h, v10.8h\n" + "fadd v15.8h, v6.8h, v10.8h\n" + "fadd v16.8h, v7.8h, v10.8h\n" + "fadd v17.8h, v8.8h, v10.8h\n" + "fadd v18.8h, v9.8h, v10.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + "fmin v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v1.8h\n" + "fmin v16.8h, v16.8h, v1.8h\n" + "fmin v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v1.8h\n" + "fdiv v11.8h, v11.8h, v1.8h\n" + "fdiv v12.8h, v12.8h, v1.8h\n" + "fdiv v13.8h, v13.8h, v1.8h\n" + "fdiv v14.8h, v14.8h, v1.8h\n" + "fdiv v15.8h, v15.8h, v1.8h\n" + "fdiv v16.8h, v16.8h, v1.8h\n" + "fdiv v17.8h, v17.8h, v1.8h\n" + "fdiv v18.8h, v18.8h, v1.8h\n" + "fmul v2.8h, v11.8h, v2.8h\n" + "fmul v3.8h, v12.8h, v3.8h\n" + "fmul v4.8h, v13.8h, v4.8h\n" + "fmul v5.8h, v14.8h, v5.8h\n" + "fmul v6.8h, v15.8h, v6.8h\n" + "fmul v7.8h, v16.8h, v7.8h\n" + "fmul v8.8h, v17.8h, v8.8h\n" + "fmul v9.8h, v18.8h, v9.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw0 + "str q4, [%[out_0], #32]\n" // out_o0hw0 + "str q5, [%[out_0], #48]\n" // out_o0hw0 + "str q6, [%[out_0], #64]\n" // out_o0hw0 + "str q7, [%[out_0], #80]\n" // out_o0hw0 + "str q8, [%[out_0], #96]\n" // out_o0hw0 + "str q9, [%[out_0], #112]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2"); + } + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct.h b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct.h new file mode 100644 index 00000000..ef34d886 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct.h @@ -0,0 +1,96 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT +#define _H_DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT + +#include +#include "sys.h" +#include "types.h" +#include "error.h" + +EE depthwise_pointwise_convolution_direct_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec); + +EE depthwise_pointwise_convolution_direct_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec); + +inline EE depthwise_pointwise_convolution_direct(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: + ret = depthwise_pointwise_convolution_direct_A55(inputDesc, inArray, dwFilterDesc, + dwFilterArray, pwFilterDesc, pwFilterArray, convParamSpec, dwBiasDesc, dwBiasArray, + pwBiasDesc, pwBiasArray, tmpBytes, tmp, outputDesc, outArray, + depthwiseActivationParamSpec, pointwiseActivationParamSpec); + break; + case ARM_A76: + ret = depthwise_pointwise_convolution_direct_A76(inputDesc, inArray, dwFilterDesc, + dwFilterArray, pwFilterDesc, pwFilterArray, convParamSpec, dwBiasDesc, dwBiasArray, + pwBiasDesc, pwBiasArray, tmpBytes, tmp, outputDesc, outArray, + depthwiseActivationParamSpec, pointwiseActivationParamSpec); + break; + default: + return NOT_SUPPORTED; + } + return ret; +} +#endif diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A55.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A55.cpp new file mode 100644 index 00000000..2998aa5a --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A55.cpp @@ -0,0 +1,1417 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/depthwise_pointwise_convolution_direct.h" + +EE depthwise_pointwise_convolution_direct_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) +{ + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (dwFilterDesc.df != DF_NCHWC8) { + CHECK_STATUS(NOT_MATCH); + } + if (pwFilterArray != nullptr && pwFilterDesc.df != DF_NHWCN16) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + U32 ihiw = ih * iw; + I32 ohow = oh * ow; + F16 *pwArray = (F16 *)tmp + ic * ih_pad * iw_pad * 8; + for (U32 n = 0; n < in; n++) { + // copy the input into a padded buffer + F16 *inArray_pad = (F16 *)tmp; + F16 *inArray_pad_mov = inArray_pad; + F16 *inArray_mov = inArray + n * ic * ihiw * 8; + for (U32 c = 0; c < ic; c++) { + if (paddingT > 0) { + memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingT * iw_pad * 8; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingL * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingR * 8; + } + if (paddingB > 0) { + memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); +
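// note (added annotation): the memset above zero-fills the bottom-padding rows; the pointer now advances past them (NCHWC8 stores 8 fp16 channel values per pixel, hence the * 8) +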
inArray_pad_mov += paddingB * iw_pad * 8; + } + + // dw_conv + const F16 *b = dwBiasArray + c * 8; + F16 *in_pad = inArray_pad + c * ih_pad * iw_pad * 8; + const F16 *f = dwFilterArray + c * fh * fw * 8; + // ohow / 8 + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + U32 in_h_4 = (hw + 4) / ow * strideH; + U32 in_w_4 = (hw + 4) % ow * strideW; + U32 in_h_5 = (hw + 5) / ow * strideH; + U32 in_w_5 = (hw + 5) % ow * strideW; + U32 in_h_6 = (hw + 6) / ow * strideH; + U32 in_w_6 = (hw + 6) % ow * strideW; + U32 in_h_7 = (hw + 7) / ow * strideH; + U32 in_w_7 = (hw + 7) % ow * strideW; + // TODO: handle asm combined with C code. There is no guarantee that the compiler will not use vector registers in the surrounding C. + __asm__ __volatile__( + "ldr q8, [%[b]]\n" + "mov v0.16b, v8.16b\n" + "mov v1.16b, v8.16b\n" + "mov v2.16b, v8.16b\n" + "mov v3.16b, v8.16b\n" + "mov v4.16b, v8.16b\n" + "mov v5.16b, v8.16b\n" + "mov v6.16b, v8.16b\n" + "mov v7.16b, v8.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F16 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F16 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F16 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F16 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F16 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + F16 *in_4 = in_idx + in_h_4 * iw_pad * 8 + in_w_4 * 8; + F16 *in_5 = in_idx + in_h_5 * iw_pad * 8 + in_w_5 * 8; + F16 *in_6 = in_idx + in_h_6 * iw_pad * 8 + in_w_6 * 8; + F16 *in_7 = in_idx + in_h_7 * iw_pad * 8 + in_w_7 * 8; + __asm__ __volatile__("ldr q17, [%[f0]]\n" + "ldr q9, [%[in0]]\n" + "ldr q10, [%[in1]]\n" + "ldr q11, [%[in2]]\n" + "ldr q12, [%[in3]]\n" + "ldr q13, [%[in4]]\n" + "ldr q14, [%[in5]]\n" + "ldr q15, [%[in6]]\n" + "ldr q16, [%[in7]]\n" + "fmla v0.8h, v9.8h, v17.8h\n" + "fmla v1.8h, v10.8h, v17.8h\n" + "fmla v2.8h, v11.8h, v17.8h\n" + "fmla v3.8h, v12.8h, v17.8h\n" + "fmla v4.8h, v13.8h, v17.8h\n" + "fmla v5.8h, v14.8h, v17.8h\n" + "fmla v6.8h, v15.8h, v17.8h\n" + "fmla v7.8h, v16.8h, v17.8h\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), + [in3] "r"(in_3), [in4] "r"(in_4), [in5] "r"(in_5), + [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v0.8h, v0.8h, v31.8h\n" + "fmax v1.8h, v1.8h, v31.8h\n" + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v31"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v0.8h, v0.8h, v31.8h\n" + "fmax v1.8h, v1.8h, v31.8h\n" +
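// note (added annotation): v31 was zeroed by the eor and v30 holds six; the fmax pass floors every accumulator lane at zero, and the fmin pass that follows caps it at six, completing relu6 +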
"fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + "fmin v0.8h, v0.8h, v30.8h\n" + "fmin v1.8h, v1.8h, v30.8h\n" + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v30", "v31"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__("movi v29.8h, #0x42, lsl #8\n" // three + "movi v30.8h, #0x46, lsl #8\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v21.8h, v0.8h, v29.8h\n" + "fadd v22.8h, v1.8h, v29.8h\n" + "fadd v23.8h, v2.8h, v29.8h\n" + "fadd v24.8h, v3.8h, v29.8h\n" + "fadd v25.8h, v4.8h, v29.8h\n" + "fadd v26.8h, v5.8h, v29.8h\n" + "fadd v27.8h, v6.8h, v29.8h\n" + "fadd v28.8h, v7.8h, v29.8h\n" + "fmax v21.8h, v21.8h, v31.8h\n" + "fmax v22.8h, v22.8h, v31.8h\n" + "fmax v23.8h, v23.8h, v31.8h\n" + "fmax v24.8h, v24.8h, v31.8h\n" + "fmax v25.8h, v25.8h, v31.8h\n" + "fmax v26.8h, v26.8h, v31.8h\n" + "fmax v27.8h, v27.8h, v31.8h\n" + "fmax v28.8h, v28.8h, v31.8h\n" + "fmin v21.8h, v21.8h, v30.8h\n" + "fmin v22.8h, v22.8h, v30.8h\n" + "fmin v23.8h, v23.8h, v30.8h\n" + "fmin v24.8h, v24.8h, v30.8h\n" + "fmin v25.8h, v25.8h, v30.8h\n" + "fmin v26.8h, v26.8h, v30.8h\n" + "fmin v27.8h, v27.8h, v30.8h\n" + "fmin v28.8h, v28.8h, v30.8h\n" + "fdiv v21.8h, v21.8h, v30.8h\n" + "fdiv v22.8h, v22.8h, v30.8h\n" + "fdiv v23.8h, v23.8h, v30.8h\n" + "fdiv v24.8h, v24.8h, v30.8h\n" + "fdiv v25.8h, v25.8h, v30.8h\n" + "fdiv v26.8h, v26.8h, v30.8h\n" + "fdiv v27.8h, v27.8h, v30.8h\n" + "fdiv v28.8h, v28.8h, v30.8h\n" + "fmul v0.8h, v0.8h, v21.8h\n" + "fmul v1.8h, v1.8h, v22.8h\n" + "fmul v2.8h, v2.8h, v23.8h\n" + "fmul v3.8h, v3.8h, v24.8h\n" + "fmul v4.8h, v4.8h, v25.8h\n" + "fmul v5.8h, v5.8h, v26.8h\n" + "fmul v6.8h, v6.8h, v27.8h\n" + "fmul v7.8h, v7.8h, v28.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (pwFilterArray != nullptr) { + F16 *pw_pack_0 = pwArray + hw * ic * 8 + c * 8 * 8; + __asm__ __volatile__("zip1 v8.8h, v0.8h, v4.8h\n" + "zip1 v9.8h, v2.8h, v6.8h\n" + "zip1 v10.8h, v1.8h, v5.8h\n" + "zip1 v11.8h, v3.8h, v7.8h\n" + "zip2 v0.8h, v0.8h, v4.8h\n" + "zip2 v2.8h, v2.8h, v6.8h\n" + "zip2 v1.8h, v1.8h, v5.8h\n" + "zip2 v3.8h, v3.8h, v7.8h\n" + "zip1 v12.8h, v8.8h, v9.8h\n" + "zip1 v13.8h, v10.8h, v11.8h\n" + "zip2 v8.8h, v8.8h, v9.8h\n" + "zip2 v10.8h, v10.8h, v11.8h\n" + "zip1 v14.8h, v0.8h, v2.8h\n" + "zip1 v15.8h, v1.8h, v3.8h\n" + "zip2 v0.8h, v0.8h, v2.8h\n" + "zip2 v1.8h, v1.8h, v3.8h\n" + "zip1 v16.8h, v12.8h, v13.8h\n" + "zip2 v12.8h, v12.8h, v13.8h\n" + "zip1 v17.8h, v8.8h, v10.8h\n" + "zip2 v8.8h, v8.8h, v10.8h\n" + "zip1 v18.8h, v14.8h, v15.8h\n" + "zip2 v14.8h, v14.8h, v15.8h\n" + "zip1 v19.8h, v0.8h, v1.8h\n" + "zip2 v0.8h, v0.8h, v1.8h\n" + "str q16, [%[pw0]]\n" + "str q12, [%[pw0], #16]\n" + "str q17, [%[pw0], #32]\n" + "str q8, [%[pw0], #48]\n" + "str q18, [%[pw0], #64]\n" + "str q14, [%[pw0], #80]\n" + "str q19, [%[pw0], #96]\n" + "str q0, [%[pw0], #112]\n" + : [pw0] "+r"(pw_pack_0) + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", 
"v18", "v19"); + } else { + F16 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + __asm__ __volatile__("str q0, [%[out]]\n" + "str q1, [%[out], #16]\n" + "str q2, [%[out], #32]\n" + "str q3, [%[out], #48]\n" + "str q4, [%[out], #64]\n" + "str q5, [%[out], #80]\n" + "str q6, [%[out], #96]\n" + "str q7, [%[out], #112]\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19"); + } + } + + // ohow_reminder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + // TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. + __asm__ __volatile__("ldr q8, [%[b]]\n" + "mov v0.16b, v8.16b\n" + "mov v1.16b, v8.16b\n" + "mov v2.16b, v8.16b\n" + "mov v3.16b, v8.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v0", "v1", "v2", "v3", "v8"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F16 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F16 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F16 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F16 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F16 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + __asm__ __volatile__("ldr q17, [%[f0]]\n" + "ldr q9, [%[in0]]\n" + "ldr q10, [%[in1]]\n" + "ldr q11, [%[in2]]\n" + "ldr q12, [%[in3]]\n" + "fmla v0.8h, v9.8h, v17.8h\n" + "fmla v1.8h, v10.8h, v17.8h\n" + "fmla v2.8h, v11.8h, v17.8h\n" + "fmla v3.8h, v12.8h, v17.8h\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), + [in3] "r"(in_3), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v9", "v10", + "v11", "v12", "v17"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v0.8h, v0.8h, v31.8h\n" + "fmax v1.8h, v1.8h, v31.8h\n" + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v0.8h, v0.8h, v31.8h\n" + "fmax v1.8h, v1.8h, v31.8h\n" + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmin v0.8h, v0.8h, v30.8h\n" + "fmin v1.8h, v1.8h, v30.8h\n" + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v30", "v31"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__("movi v29.8h, #0x42, lsl #8\n" // three + "movi v30.8h, #0x46, lsl #8\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v25.8h, v0.8h, v29.8h\n" + "fadd v26.8h, v1.8h, v29.8h\n" + "fadd v27.8h, v2.8h, v29.8h\n" + "fadd v28.8h, v3.8h, v29.8h\n" + "fmax v25.8h, v25.8h, v31.8h\n" + "fmax v26.8h, v26.8h, v31.8h\n" + "fmax v27.8h, v27.8h, v31.8h\n" + "fmax v28.8h, v28.8h, v31.8h\n" + "fmin v25.8h, v25.8h, v30.8h\n" + "fmin v26.8h, v26.8h, v30.8h\n" + "fmin v27.8h, v27.8h, v30.8h\n" + 
"fmin v28.8h, v28.8h, v30.8h\n" + "fdiv v25.8h, v25.8h, v30.8h\n" + "fdiv v26.8h, v26.8h, v30.8h\n" + "fdiv v27.8h, v27.8h, v30.8h\n" + "fdiv v28.8h, v28.8h, v30.8h\n" + "fmul v0.8h, v0.8h, v25.8h\n" + "fmul v1.8h, v1.8h, v26.8h\n" + "fmul v2.8h, v2.8h, v27.8h\n" + "fmul v3.8h, v3.8h, v28.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (pwFilterArray != nullptr) { + F16 *pw_pack_0 = pwArray + hw * ic * 8 + c * 8 * 4; + __asm__ __volatile__("st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[pw0]]\n" + : [pw0] "+r"(pw_pack_0) + : + : "memory", "cc", "v0", "v1", "v2", "v3"); + } else { + F16 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + __asm__ __volatile__("st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[out]]\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "v0", "v1", "v2", "v3"); + } + } + + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + // TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. + __asm__ __volatile__("ldr q8, [%[b]]\n" + "mov v0.16b, v8.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v0"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F16 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F16 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F16 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + __asm__ __volatile__("ldr q17, [%[f0]]\n" + "ldr q9, [%[in0]]\n" + "fmla v0.8h, v9.8h, v17.8h\n" + : + : [in0] "r"(in_0), [f0] "r"(f_0) + : "memory", "cc", "v0", "v9", "v17"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v0.8h, v0.8h, v31.8h\n" + : + : + : "memory", "cc", "v0", "v31"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v0.8h, v0.8h, v31.8h\n" + "fmin v0.8h, v0.8h, v30.8h\n" + : + : + : "memory", "cc", "v0", "v30", "v31"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__("movi v29.8h, #0x42, lsl #8\n" // three + "movi v30.8h, #0x46, lsl #8\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v28.8h, v0.8h, v29.8h\n" + "fmax v28.8h, v28.8h, v31.8h\n" + "fmin v28.8h, v28.8h, v30.8h\n" + "fdiv v28.8h, v28.8h, v30.8h\n" + "fmul v0.8h, v0.8h, v28.8h\n" + : + : + : "memory", "cc", "v0", "v28", "v29", "v30", "v31"); + break; + } + default: + return NOT_SUPPORTED; + } + + F16 *out_ptr; + if (pwFilterArray != nullptr) { + out_ptr = pwArray + hw * ic * 8 + c * 8; + } else { + out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + } + __asm__ __volatile__("str q0, [%[out]]\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "v0"); + } + } + + if (pwFilterArray == nullptr) { + continue; + } + // pw_conv + // ohow / 8 + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 8; + F16 *in_pack = pwArray + hw * ic * 8; + const F16 *f_o0c0 = pwFilterArray; + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr d22, 
[%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr x1, [%[in_0], #8]\n" + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ins v0.d[1], x1\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 + "ldr x2, [%[f_0], #8]\n" + "mov v7.16b, v22.16b\n" // out_o0hw5 + "ins v18.d[1], x2\n" + "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 + "ldr x3, [%[f_0], #24]\n" + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ins v19.d[1], x3\n" + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov v16.16b, v23.16b\n" // out_o1hw6 + "mov v17.16b, v23.16b\n" // out_o1hw7 + "0:\n" + "ldr d1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr x1, [%[in_0], #24]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "ins v20.d[1], x2\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "fmla v8.8h, v18.8h, v0.h[6]\n" + "ldr x3, [%[f_0], #56]\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + "ins v21.d[1], x3\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + + "ldr d0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr x1, [%[in_0], #40]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "ldr x2, [%[f_0], #72]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "ins v18.d[1], x2\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "fmla v8.8h, v20.8h, v1.h[6]\n" + "ldr x3, [%[f_0], #88]\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "ins v19.d[1], x3\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "add %[f_0], %[f_0], #64\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "subs x0, x0, #2\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "fmla v15.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // 
zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + "fmin v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v1.8h\n" + "fmin v16.8h, v16.8h, v1.8h\n" + "fmin v17.8h, v17.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v19.8h, v2.8h, v18.8h\n" + "fadd v20.8h, v3.8h, v18.8h\n" + "fadd v21.8h, v4.8h, v18.8h\n" + "fadd v22.8h, v5.8h, v18.8h\n" + "fadd v23.8h, v6.8h, v18.8h\n" + "fadd v24.8h, v7.8h, v18.8h\n" + "fadd v25.8h, v8.8h, v18.8h\n" + "fadd v26.8h, v9.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmax v23.8h, v23.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v0.8h\n" + "fmax v25.8h, v25.8h, v0.8h\n" + "fmax v26.8h, v26.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fmin v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v1.8h\n" + "fmin v25.8h, v25.8h, v1.8h\n" + "fmin v26.8h, v26.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fdiv v23.8h, v23.8h, v1.8h\n" + "fdiv v24.8h, v24.8h, v1.8h\n" + "fdiv v25.8h, v25.8h, v1.8h\n" + "fdiv v26.8h, v26.8h, v1.8h\n" + "fmul v2.8h, v19.8h, v2.8h\n" + "fmul v3.8h, v20.8h, v3.8h\n" + "fmul v4.8h, v21.8h, v4.8h\n" + "fmul v5.8h, v22.8h, v5.8h\n" + "fmul v6.8h, v23.8h, v6.8h\n" + "fmul v7.8h, v24.8h, v7.8h\n" + "fmul v8.8h, v25.8h, v8.8h\n" + "fmul v9.8h, v26.8h, v9.8h\n" + + "fadd v19.8h, v10.8h, v18.8h\n" + "fadd v20.8h, v11.8h, v18.8h\n" + "fadd v21.8h, v12.8h, v18.8h\n" + "fadd v22.8h, v13.8h, v18.8h\n" + "fadd v23.8h, v14.8h, v18.8h\n" + "fadd v24.8h, v15.8h, v18.8h\n" + "fadd v25.8h, v16.8h, v18.8h\n" + "fadd v26.8h, v17.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmax v23.8h, v23.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v0.8h\n" + "fmax v25.8h, v25.8h, v0.8h\n" + "fmax v26.8h, v26.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fmin v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v1.8h\n" + "fmin v25.8h, v25.8h, v1.8h\n" + "fmin v26.8h, v26.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fdiv v23.8h, v23.8h, v1.8h\n" + "fdiv 
v24.8h, v24.8h, v1.8h\n" + "fdiv v25.8h, v25.8h, v1.8h\n" + "fdiv v26.8h, v26.8h, v1.8h\n" + "fmul v10.8h, v19.8h, v10.8h\n" + "fmul v11.8h, v20.8h, v11.8h\n" + "fmul v12.8h, v21.8h, v12.8h\n" + "fmul v13.8h, v22.8h, v13.8h\n" + "fmul v14.8h, v23.8h, v14.8h\n" + "fmul v15.8h, v24.8h, v15.8h\n" + "fmul v16.8h, v25.8h, v16.8h\n" + "fmul v17.8h, v26.8h, v17.8h\n" + + "13:\n" + "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out_0]], #64\n" + "st1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%[out_0]], #64\n" + "st1 {v10.8h, v11.8h, v12.8h, v13.8h}, [%[out_1]], #64\n" + "st1 {v14.8h, v15.8h, v16.8h, v17.8h}, [%[out_1]], #64\n" + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v12.16b\n" // out_o0hw0 + "ldr x1, [%[in_0], #8]\n" + "mov v3.16b, v12.16b\n" // out_o0hw1 + "ins v0.d[1], x1\n" + "mov v4.16b, v12.16b\n" // out_o0hw2 + "ldr d10, [%[f_0]]\n" // f_o0c0 + "mov v5.16b, v12.16b\n" // out_o0hw3 + "ldr x2, [%[f_0], #8]\n" + "mov v6.16b, v12.16b\n" // out_o0hw4 + "ins v10.d[1], x2\n" + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 + "0:\n" + "ldr d1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v10.8h, v0.h[0]\n" + "ldr x1, [%[in_0], #24]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "ldr d11, [%[f_0], #16]\n" // f_o0c0 + "fmla v5.8h, v10.8h, v0.h[3]\n" + "ldr x2, [%[f_0], #24]\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "ins v11.d[1], x2\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "subs x0, x0, #2\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + + "ldr d0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v11.8h, v1.h[0]\n" + "ldr x1, [%[in_0], #40]\n" + "fmla v3.8h, v11.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v4.8h, v11.8h, v1.h[2]\n" + "ldr d10, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v11.8h, v1.h[3]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v6.8h, v11.8h, v1.h[4]\n" + "ins v10.d[1], x2\n" + "fmla v7.8h, v11.8h, v1.h[5]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v8.8h, v11.8h, v1.h[6]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v9.8h, v11.8h, v1.h[7]\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six 
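+ // note (added annotation): movi #0x46, lsl #8 writes 0x4600 into every fp16 lane, which decodes to 6.0 (sign 0, exponent 0b10001, mantissa 0); the #0x42 form used in the h_swish branch below likewise gives 0x4200 = 3.0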
+ "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v10.8h, #0x42, lsl #8\n" // three + "fadd v11.8h, v2.8h, v10.8h\n" + "fadd v12.8h, v3.8h, v10.8h\n" + "fadd v13.8h, v4.8h, v10.8h\n" + "fadd v14.8h, v5.8h, v10.8h\n" + "fadd v15.8h, v6.8h, v10.8h\n" + "fadd v16.8h, v7.8h, v10.8h\n" + "fadd v17.8h, v8.8h, v10.8h\n" + "fadd v18.8h, v9.8h, v10.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + "fmin v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v1.8h\n" + "fmin v16.8h, v16.8h, v1.8h\n" + "fmin v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v1.8h\n" + "fdiv v11.8h, v11.8h, v1.8h\n" + "fdiv v12.8h, v12.8h, v1.8h\n" + "fdiv v13.8h, v13.8h, v1.8h\n" + "fdiv v14.8h, v14.8h, v1.8h\n" + "fdiv v15.8h, v15.8h, v1.8h\n" + "fdiv v16.8h, v16.8h, v1.8h\n" + "fdiv v17.8h, v17.8h, v1.8h\n" + "fdiv v18.8h, v18.8h, v1.8h\n" + "fmul v2.8h, v11.8h, v2.8h\n" + "fmul v3.8h, v12.8h, v3.8h\n" + "fmul v4.8h, v13.8h, v4.8h\n" + "fmul v5.8h, v14.8h, v5.8h\n" + "fmul v6.8h, v15.8h, v6.8h\n" + "fmul v7.8h, v16.8h, v7.8h\n" + "fmul v8.8h, v17.8h, v8.8h\n" + "fmul v9.8h, v18.8h, v9.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw0 + "str q4, [%[out_0], #32]\n" // out_o0hw0 + "str q5, [%[out_0], #48]\n" // out_o0hw0 + "str q6, [%[out_0], #64]\n" // out_o0hw0 + "str q7, [%[out_0], #80]\n" // out_o0hw0 + "str q8, [%[out_0], #96]\n" // out_o0hw0 + "str q9, [%[out_0], #112]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2"); + } + } + + // ohow_remainder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 8; + const F16 *f_o0c0 = pwFilterArray; + F16 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "mov x0, %[ic]\n" // ic_blk + "mov 
v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ldr x2, [%[f_0], #8]\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ins v18.d[1], x2\n" + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "ldr x3, [%[f_0], #24]\n" + "mov v12.16b, v23.16b\n" // out_o1hw2 + "ins v19.d[1], x3\n" + "mov v13.16b, v23.16b\n" // out_o1hw3 + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ins v20.d[1], x2\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "ldr x3, [%[f_0], #56]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "ins v21.d[1], x3\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "subs x0, x0, #2\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr x2, [%[f_0], #72]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "ins v18.d[1], x2\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "ldr x3, [%[f_0], #88]\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "ins v19.d[1], x3\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "add %[in_0], %[in_0], #16\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "add %[f_0], %[f_0], #64\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v6.8h, v2.8h, v18.8h\n" + "fadd v7.8h, v3.8h, v18.8h\n" + "fadd v8.8h, v4.8h, v18.8h\n" + "fadd v9.8h, v5.8h, v18.8h\n" + "fadd v19.8h, v10.8h, v18.8h\n" + "fadd v20.8h, v11.8h, v18.8h\n" + "fadd v21.8h, v12.8h, v18.8h\n" + "fadd v22.8h, v13.8h, v18.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" 
+ "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fdiv v6.8h, v6.8h, v1.8h\n" + "fdiv v7.8h, v7.8h, v1.8h\n" + "fdiv v8.8h, v8.8h, v1.8h\n" + "fdiv v9.8h, v9.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fmul v2.8h, v6.8h, v2.8h\n" + "fmul v3.8h, v7.8h, v3.8h\n" + "fmul v4.8h, v8.8h, v4.8h\n" + "fmul v5.8h, v9.8h, v5.8h\n" + "fmul v10.8h, v19.8h, v10.8h\n" + "fmul v11.8h, v20.8h, v11.8h\n" + "fmul v12.8h, v21.8h, v12.8h\n" + "fmul v13.8h, v22.8h, v13.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0", + "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ldr x2, [%[f_0], #8]\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ins v18.d[1], x2\n" + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #16]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr x2, [%[f_0], #24]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ins v20.d[1], x2\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "subs x0, x0, #2\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ins v18.d[1], x2\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #16\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi 
v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v6.8h, v2.8h, v18.8h\n" + "fadd v7.8h, v3.8h, v18.8h\n" + "fadd v8.8h, v4.8h, v18.8h\n" + "fadd v9.8h, v5.8h, v18.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fdiv v6.8h, v6.8h, v1.8h\n" + "fdiv v7.8h, v7.8h, v1.8h\n" + "fdiv v8.8h, v8.8h, v1.8h\n" + "fdiv v9.8h, v9.8h, v1.8h\n" + "fmul v2.8h, v6.8h, v2.8h\n" + "fmul v3.8h, v7.8h, v3.8h\n" + "fmul v4.8h, v8.8h, v4.8h\n" + "fmul v5.8h, v9.8h, v5.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v18", "v20", "v22", "x0", "x1", "x2"); + } + } + + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 8; + const F16 *f_o0c0 = pwFilterArray; + F16 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr x2, [%[f_0], #8]\n" + "ins v18.d[1], x2\n" + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "ldr x3, [%[f_0], #24]\n" + "ins v19.d[1], x3\n" + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "ldr x2, [%[f_0], #40]\n" + "ins v20.d[1], x2\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "subs x0, x0, #2\n" + "ldr x3, [%[f_0], #56]\n" + "ins v21.d[1], x3\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v10.8h, v21.8h, v1.h[0]\n" + "ldr x2, [%[f_0], #72]\n" + "ins v18.d[1], x2\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "add %[in_0], %[in_0], #4\n" + "ldr x3, [%[f_0], #88]\n" + "ins v19.d[1], x3\n" + "add %[f_0], %[f_0], #64\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi 
v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v19.8h, v2.8h, v18.8h\n" + "fadd v20.8h, v10.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fmul v2.8h, v19.8h, v2.8h\n" + "fmul v10.8h, v20.8h, v10.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q10, [%[out_1]]\n" // out_o1hw0 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", + "v23", "x0", "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "ldr x2, [%[f_0], #8]\n" + "ins v18.d[1], x2\n" + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #16]\n" // f_o0c0 + "subs x0, x0, #2\n" + "ldr x2, [%[f_0], #24]\n" + "ins v20.d[1], x2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #32]\n" // f_o0c0 + "ldr x2, [%[f_0], #40]\n" + "ins v18.d[1], x2\n" + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v20.8h, v2.8h, v18.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fmul v2.8h, v20.8h, v2.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0", "x1", + "x2"); + } + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A76.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A76.cpp new file mode 100644 index 00000000..46d0c628 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A76.cpp @@ -0,0 +1,1334 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. 
All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/depthwise_pointwise_convolution_direct.h" + +EE depthwise_pointwise_convolution_direct_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) +{ + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (dwFilterDesc.df != DF_NCHWC8) { + CHECK_STATUS(NOT_MATCH); + } + if (pwFilterArray != nullptr && pwFilterDesc.df != DF_NHWCN16) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + U32 ihiw = ih * iw; + I32 ohow = oh * ow; + F16 *pwArray = (F16 *)tmp + ic * ih_pad * iw_pad * 8; + for (U32 n = 0; n < in; n++) { + F16 *inArray_pad = (F16 *)tmp; + F16 *inArray_pad_mov = inArray_pad; + F16 *inArray_mov = inArray + n * ic * ihiw * 8; + for (U32 c = 0; c < ic; c++) { + if (paddingT > 0) { + memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingT * iw_pad * 8; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingL * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingR * 8; + } + if (paddingB > 0) { + memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); + inArray_pad_mov += 
paddingB * iw_pad * 8; + } + + // dw_conv + const F16 *b = dwBiasArray + c * 8; + F16 *in_pad = inArray_pad + c * ih_pad * iw_pad * 8; + const F16 *f = dwFilterArray + c * fh * fw * 8; + // ohow / 8 + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + U32 in_h_4 = (hw + 4) / ow * strideH; + U32 in_w_4 = (hw + 4) % ow * strideW; + U32 in_h_5 = (hw + 5) / ow * strideH; + U32 in_w_5 = (hw + 5) % ow * strideW; + U32 in_h_6 = (hw + 6) / ow * strideH; + U32 in_w_6 = (hw + 6) % ow * strideW; + U32 in_h_7 = (hw + 7) / ow * strideH; + U32 in_w_7 = (hw + 7) % ow * strideW; + // TODO handle asm combined with C. There is no guarantee that the compiler will not use vector registers in the C code. + __asm__ __volatile__( + "ldr q8, [%[b]]\n" + "mov v0.16b, v8.16b\n" + "mov v1.16b, v8.16b\n" + "mov v2.16b, v8.16b\n" + "mov v3.16b, v8.16b\n" + "mov v4.16b, v8.16b\n" + "mov v5.16b, v8.16b\n" + "mov v6.16b, v8.16b\n" + "mov v7.16b, v8.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F16 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F16 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F16 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F16 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F16 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + F16 *in_4 = in_idx + in_h_4 * iw_pad * 8 + in_w_4 * 8; + F16 *in_5 = in_idx + in_h_5 * iw_pad * 8 + in_w_5 * 8; + F16 *in_6 = in_idx + in_h_6 * iw_pad * 8 + in_w_6 * 8; + F16 *in_7 = in_idx + in_h_7 * iw_pad * 8 + in_w_7 * 8; + __asm__ __volatile__("ldr q17, [%[f0]]\n" + "ldr q9, [%[in0]]\n" + "ldr q10, [%[in1]]\n" + "ldr q11, [%[in2]]\n" + "ldr q12, [%[in3]]\n" + "ldr q13, [%[in4]]\n" + "ldr q14, [%[in5]]\n" + "ldr q15, [%[in6]]\n" + "ldr q16, [%[in7]]\n" + "fmla v0.8h, v9.8h, v17.8h\n" + "fmla v1.8h, v10.8h, v17.8h\n" + "fmla v2.8h, v11.8h, v17.8h\n" + "fmla v3.8h, v12.8h, v17.8h\n" + "fmla v4.8h, v13.8h, v17.8h\n" + "fmla v5.8h, v14.8h, v17.8h\n" + "fmla v6.8h, v15.8h, v17.8h\n" + "fmla v7.8h, v16.8h, v17.8h\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), + [in3] "r"(in_3), [in4] "r"(in_4), [in5] "r"(in_5), + [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v0.8h, v0.8h, v31.8h\n" + "fmax v1.8h, v1.8h, v31.8h\n" + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v31"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v0.8h, v0.8h, v31.8h\n" + "fmax v1.8h, v1.8h, v31.8h\n" + "fmax v2.8h, v2.8h,
v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + "fmin v0.8h, v0.8h, v30.8h\n" + "fmin v1.8h, v1.8h, v30.8h\n" + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v30", "v31"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__("movi v29.8h, #0x42, lsl #8\n" // three + "movi v30.8h, #0x46, lsl #8\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v21.8h, v0.8h, v29.8h\n" + "fadd v22.8h, v1.8h, v29.8h\n" + "fadd v23.8h, v2.8h, v29.8h\n" + "fadd v24.8h, v3.8h, v29.8h\n" + "fadd v25.8h, v4.8h, v29.8h\n" + "fadd v26.8h, v5.8h, v29.8h\n" + "fadd v27.8h, v6.8h, v29.8h\n" + "fadd v28.8h, v7.8h, v29.8h\n" + "fmax v21.8h, v21.8h, v31.8h\n" + "fmax v22.8h, v22.8h, v31.8h\n" + "fmax v23.8h, v23.8h, v31.8h\n" + "fmax v24.8h, v24.8h, v31.8h\n" + "fmax v25.8h, v25.8h, v31.8h\n" + "fmax v26.8h, v26.8h, v31.8h\n" + "fmax v27.8h, v27.8h, v31.8h\n" + "fmax v28.8h, v28.8h, v31.8h\n" + "fmin v21.8h, v21.8h, v30.8h\n" + "fmin v22.8h, v22.8h, v30.8h\n" + "fmin v23.8h, v23.8h, v30.8h\n" + "fmin v24.8h, v24.8h, v30.8h\n" + "fmin v25.8h, v25.8h, v30.8h\n" + "fmin v26.8h, v26.8h, v30.8h\n" + "fmin v27.8h, v27.8h, v30.8h\n" + "fmin v28.8h, v28.8h, v30.8h\n" + "fdiv v21.8h, v21.8h, v30.8h\n" + "fdiv v22.8h, v22.8h, v30.8h\n" + "fdiv v23.8h, v23.8h, v30.8h\n" + "fdiv v24.8h, v24.8h, v30.8h\n" + "fdiv v25.8h, v25.8h, v30.8h\n" + "fdiv v26.8h, v26.8h, v30.8h\n" + "fdiv v27.8h, v27.8h, v30.8h\n" + "fdiv v28.8h, v28.8h, v30.8h\n" + "fmul v0.8h, v0.8h, v21.8h\n" + "fmul v1.8h, v1.8h, v22.8h\n" + "fmul v2.8h, v2.8h, v23.8h\n" + "fmul v3.8h, v3.8h, v24.8h\n" + "fmul v4.8h, v4.8h, v25.8h\n" + "fmul v5.8h, v5.8h, v26.8h\n" + "fmul v6.8h, v6.8h, v27.8h\n" + "fmul v7.8h, v7.8h, v28.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (pwFilterArray != nullptr) { + F16 *pw_pack_0 = pwArray + hw * ic * 8 + c * 8 * 8; + __asm__ __volatile__("zip1 v8.8h, v0.8h, v4.8h\n" + "zip1 v9.8h, v2.8h, v6.8h\n" + "zip1 v10.8h, v1.8h, v5.8h\n" + "zip1 v11.8h, v3.8h, v7.8h\n" + "zip2 v0.8h, v0.8h, v4.8h\n" + "zip2 v2.8h, v2.8h, v6.8h\n" + "zip2 v1.8h, v1.8h, v5.8h\n" + "zip2 v3.8h, v3.8h, v7.8h\n" + "zip1 v12.8h, v8.8h, v9.8h\n" + "zip1 v13.8h, v10.8h, v11.8h\n" + "zip2 v8.8h, v8.8h, v9.8h\n" + "zip2 v10.8h, v10.8h, v11.8h\n" + "zip1 v14.8h, v0.8h, v2.8h\n" + "zip1 v15.8h, v1.8h, v3.8h\n" + "zip2 v0.8h, v0.8h, v2.8h\n" + "zip2 v1.8h, v1.8h, v3.8h\n" + "zip1 v16.8h, v12.8h, v13.8h\n" + "zip2 v12.8h, v12.8h, v13.8h\n" + "zip1 v17.8h, v8.8h, v10.8h\n" + "zip2 v8.8h, v8.8h, v10.8h\n" + "zip1 v18.8h, v14.8h, v15.8h\n" + "zip2 v14.8h, v14.8h, v15.8h\n" + "zip1 v19.8h, v0.8h, v1.8h\n" + "zip2 v0.8h, v0.8h, v1.8h\n" + "str q16, [%[pw0]]\n" + "str q12, [%[pw0], #16]\n" + "str q17, [%[pw0], #32]\n" + "str q8, [%[pw0], #48]\n" + "str q18, [%[pw0], #64]\n" + "str q14, [%[pw0], #80]\n" + "str q19, [%[pw0], #96]\n" + "str q0, [%[pw0], #112]\n" + : [pw0] "+r"(pw_pack_0) + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19"); + } 
else { + F16 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + __asm__ __volatile__("str q0, [%[out]]\n" + "str q1, [%[out], #16]\n" + "str q2, [%[out], #32]\n" + "str q3, [%[out], #48]\n" + "str q4, [%[out], #64]\n" + "str q5, [%[out], #80]\n" + "str q6, [%[out], #96]\n" + "str q7, [%[out], #112]\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19"); + } + } + + // ohow_remainder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + // TODO handle asm combined with C. There is no guarantee that the compiler will not use vector registers in the C code. + __asm__ __volatile__("ldr q8, [%[b]]\n" + "mov v0.16b, v8.16b\n" + "mov v1.16b, v8.16b\n" + "mov v2.16b, v8.16b\n" + "mov v3.16b, v8.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v0", "v1", "v2", "v3", "v8"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F16 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F16 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F16 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F16 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F16 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + __asm__ __volatile__("ldr q17, [%[f0]]\n" + "ldr q9, [%[in0]]\n" + "ldr q10, [%[in1]]\n" + "ldr q11, [%[in2]]\n" + "ldr q12, [%[in3]]\n" + "fmla v0.8h, v9.8h, v17.8h\n" + "fmla v1.8h, v10.8h, v17.8h\n" + "fmla v2.8h, v11.8h, v17.8h\n" + "fmla v3.8h, v12.8h, v17.8h\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), + [in3] "r"(in_3), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v9", "v10", + "v11", "v12", "v17"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v0.8h, v0.8h, v31.8h\n" + "fmax v1.8h, v1.8h, v31.8h\n" + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v0.8h, v0.8h, v31.8h\n" + "fmax v1.8h, v1.8h, v31.8h\n" + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmin v0.8h, v0.8h, v30.8h\n" + "fmin v1.8h, v1.8h, v30.8h\n" + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v30", "v31"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__("movi v29.8h, #0x42, lsl #8\n" // three + "movi v30.8h, #0x46, lsl #8\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v25.8h, v0.8h, v29.8h\n" + "fadd v26.8h, v1.8h, v29.8h\n" + "fadd v27.8h, v2.8h, v29.8h\n" + "fadd v28.8h, v3.8h, v29.8h\n" + "fmax v25.8h, v25.8h, v31.8h\n" + "fmax v26.8h, v26.8h, v31.8h\n" + "fmax v27.8h, v27.8h, v31.8h\n" + "fmax v28.8h, v28.8h, v31.8h\n" + "fmin v25.8h, v25.8h, v30.8h\n" + "fmin v26.8h, v26.8h, v30.8h\n" + "fmin v27.8h, v27.8h, v30.8h\n" + "fmin v28.8h,
v28.8h, v30.8h\n" + "fdiv v25.8h, v25.8h, v30.8h\n" + "fdiv v26.8h, v26.8h, v30.8h\n" + "fdiv v27.8h, v27.8h, v30.8h\n" + "fdiv v28.8h, v28.8h, v30.8h\n" + "fmul v0.8h, v0.8h, v25.8h\n" + "fmul v1.8h, v1.8h, v26.8h\n" + "fmul v2.8h, v2.8h, v27.8h\n" + "fmul v3.8h, v3.8h, v28.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (pwFilterArray != nullptr) { + F16 *pw_pack_0 = pwArray + hw * ic * 8 + c * 8 * 4; + __asm__ __volatile__("st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[pw0]]\n" + : [pw0] "+r"(pw_pack_0) + : + : "memory", "cc", "v0", "v1", "v2", "v3"); + } else { + F16 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + __asm__ __volatile__("st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[out]]\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "v0", "v1", "v2", "v3"); + } + } + + // ohow_remainder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + // TODO handle asm combined with C. There is no guarantee that the compiler will not use vector registers in the C code. + __asm__ __volatile__("ldr q8, [%[b]]\n" + "mov v0.16b, v8.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v0"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F16 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F16 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F16 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + __asm__ __volatile__("ldr q17, [%[f0]]\n" + "ldr q9, [%[in0]]\n" + "fmla v0.8h, v9.8h, v17.8h\n" + : + : [in0] "r"(in_0), [f0] "r"(f_0) + : "memory", "cc", "v0", "v9", "v17"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v0.8h, v0.8h, v31.8h\n" + : + : + : "memory", "cc", "v0", "v31"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v0.8h, v0.8h, v31.8h\n" + "fmin v0.8h, v0.8h, v30.8h\n" + : + : + : "memory", "cc", "v0", "v30", "v31"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__("movi v29.8h, #0x42, lsl #8\n" // three + "movi v30.8h, #0x46, lsl #8\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v28.8h, v0.8h, v29.8h\n" + "fmax v28.8h, v28.8h, v31.8h\n" + "fmin v28.8h, v28.8h, v30.8h\n" + "fdiv v28.8h, v28.8h, v30.8h\n" + "fmul v0.8h, v0.8h, v28.8h\n" + : + : + : "memory", "cc", "v0", "v28", "v29", "v30", "v31"); + break; + } + default: + return NOT_SUPPORTED; + } + + F16 *out_ptr; + if (pwFilterArray != nullptr) { + out_ptr = pwArray + hw * ic * 8 + c * 8; + } else { + out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + } + __asm__ __volatile__("str q0, [%[out]]\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "v0"); + } + } + + if (pwFilterArray == nullptr) { + continue; + } + // pw_conv + // ohow / 8 + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 8; + F16 *in_pack = pwArray + hw * ic * 8; + const F16 *f_o0c0 = pwFilterArray; + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0
+ "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 + "mov v7.16b, v22.16b\n" // out_o0hw5 + "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov v16.16b, v23.16b\n" // out_o1hw6 + "mov v17.16b, v23.16b\n" // out_o1hw7 + "0:\n" + "ldr q1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v18.8h, v0.h[3]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "ldr q21, [%[f_0], #48]\n" // f_o1c0 + "fmla v8.8h, v18.8h, v0.h[6]\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + + "ldr q0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "fmla v8.8h, v20.8h, v1.h[6]\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "add %[f_0], %[f_0], #64\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "subs x0, x0, #2\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "fmla v15.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + "fmin v2.8h, v2.8h, 
v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + "fmin v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v1.8h\n" + "fmin v16.8h, v16.8h, v1.8h\n" + "fmin v17.8h, v17.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v19.8h, v2.8h, v18.8h\n" + "fadd v20.8h, v3.8h, v18.8h\n" + "fadd v21.8h, v4.8h, v18.8h\n" + "fadd v22.8h, v5.8h, v18.8h\n" + "fadd v23.8h, v6.8h, v18.8h\n" + "fadd v24.8h, v7.8h, v18.8h\n" + "fadd v25.8h, v8.8h, v18.8h\n" + "fadd v26.8h, v9.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmax v23.8h, v23.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v0.8h\n" + "fmax v25.8h, v25.8h, v0.8h\n" + "fmax v26.8h, v26.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fmin v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v1.8h\n" + "fmin v25.8h, v25.8h, v1.8h\n" + "fmin v26.8h, v26.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fdiv v23.8h, v23.8h, v1.8h\n" + "fdiv v24.8h, v24.8h, v1.8h\n" + "fdiv v25.8h, v25.8h, v1.8h\n" + "fdiv v26.8h, v26.8h, v1.8h\n" + "fmul v2.8h, v19.8h, v2.8h\n" + "fmul v3.8h, v20.8h, v3.8h\n" + "fmul v4.8h, v21.8h, v4.8h\n" + "fmul v5.8h, v22.8h, v5.8h\n" + "fmul v6.8h, v23.8h, v6.8h\n" + "fmul v7.8h, v24.8h, v7.8h\n" + "fmul v8.8h, v25.8h, v8.8h\n" + "fmul v9.8h, v26.8h, v9.8h\n" + + "fadd v19.8h, v10.8h, v18.8h\n" + "fadd v20.8h, v11.8h, v18.8h\n" + "fadd v21.8h, v12.8h, v18.8h\n" + "fadd v22.8h, v13.8h, v18.8h\n" + "fadd v23.8h, v14.8h, v18.8h\n" + "fadd v24.8h, v15.8h, v18.8h\n" + "fadd v25.8h, v16.8h, v18.8h\n" + "fadd v26.8h, v17.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmax v23.8h, v23.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v0.8h\n" + "fmax v25.8h, v25.8h, v0.8h\n" + "fmax v26.8h, v26.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fmin v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v1.8h\n" + "fmin v25.8h, v25.8h, v1.8h\n" + "fmin v26.8h, v26.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fdiv v23.8h, v23.8h, v1.8h\n" + "fdiv v24.8h, v24.8h, v1.8h\n" + "fdiv v25.8h, v25.8h, v1.8h\n" + "fdiv v26.8h, v26.8h, v1.8h\n" + "fmul v10.8h, v19.8h, v10.8h\n" + "fmul v11.8h, v20.8h, v11.8h\n" + "fmul v12.8h, v21.8h, v12.8h\n" + "fmul v13.8h, v22.8h, v13.8h\n" + "fmul v14.8h, v23.8h, v14.8h\n" + "fmul v15.8h, v24.8h, v15.8h\n" + "fmul v16.8h, v25.8h, v16.8h\n" + "fmul v17.8h, v26.8h, v17.8h\n" + + "13:\n" + "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out_0]], #64\n" + "st1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%[out_0]], #64\n" + "st1 {v10.8h, v11.8h, v12.8h, v13.8h}, [%[out_1]], #64\n" + "st1 {v14.8h, v15.8h, 
v16.8h, v17.8h}, [%[out_1]], #64\n" + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v12.16b\n" // out_o0hw0 + "mov v3.16b, v12.16b\n" // out_o0hw1 + "mov v4.16b, v12.16b\n" // out_o0hw2 + "ldr q10, [%[f_0]]\n" // f_o0c0 + "mov v5.16b, v12.16b\n" // out_o0hw3 + "mov v6.16b, v12.16b\n" // out_o0hw4 + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 + "0:\n" + "ldr q1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v10.8h, v0.h[0]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "ldr q11, [%[f_0], #16]\n" // f_o0c0 + "fmla v5.8h, v10.8h, v0.h[3]\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "subs x0, x0, #2\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + + "ldr q0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v11.8h, v1.h[0]\n" + "fmla v3.8h, v11.8h, v1.h[1]\n" + "fmla v4.8h, v11.8h, v1.h[2]\n" + "ldr q10, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v11.8h, v1.h[3]\n" + "fmla v6.8h, v11.8h, v1.h[4]\n" + "fmla v7.8h, v11.8h, v1.h[5]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v8.8h, v11.8h, v1.h[6]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v9.8h, v11.8h, v1.h[7]\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v10.8h, #0x42, lsl #8\n" // three + "fadd v11.8h, v2.8h, v10.8h\n" + "fadd v12.8h, v3.8h, v10.8h\n" + "fadd v13.8h, v4.8h, v10.8h\n" + "fadd v14.8h, v5.8h, v10.8h\n" + "fadd v15.8h, v6.8h, v10.8h\n" + "fadd 
v16.8h, v7.8h, v10.8h\n" + "fadd v17.8h, v8.8h, v10.8h\n" + "fadd v18.8h, v9.8h, v10.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + "fmin v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v1.8h\n" + "fmin v16.8h, v16.8h, v1.8h\n" + "fmin v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v1.8h\n" + "fdiv v11.8h, v11.8h, v1.8h\n" + "fdiv v12.8h, v12.8h, v1.8h\n" + "fdiv v13.8h, v13.8h, v1.8h\n" + "fdiv v14.8h, v14.8h, v1.8h\n" + "fdiv v15.8h, v15.8h, v1.8h\n" + "fdiv v16.8h, v16.8h, v1.8h\n" + "fdiv v17.8h, v17.8h, v1.8h\n" + "fdiv v18.8h, v18.8h, v1.8h\n" + "fmul v2.8h, v11.8h, v2.8h\n" + "fmul v3.8h, v12.8h, v3.8h\n" + "fmul v4.8h, v13.8h, v4.8h\n" + "fmul v5.8h, v14.8h, v5.8h\n" + "fmul v6.8h, v15.8h, v6.8h\n" + "fmul v7.8h, v16.8h, v7.8h\n" + "fmul v8.8h, v17.8h, v8.8h\n" + "fmul v9.8h, v18.8h, v9.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q6, [%[out_0], #64]\n" // out_o0hw4 + "str q7, [%[out_0], #80]\n" // out_o0hw5 + "str q8, [%[out_0], #96]\n" // out_o0hw6 + "str q9, [%[out_0], #112]\n" // out_o0hw7 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2"); + } + } + + // ohow_remainder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 8; + const F16 *f_o0c0 = pwFilterArray; + F16 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr q21, [%[f_0], #48]\n" // f_o1c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "subs x0, x0, #2\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + "fmla v3.8h,
v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "add %[in_0], %[in_0], #16\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "add %[f_0], %[f_0], #64\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v6.8h, v2.8h, v18.8h\n" + "fadd v7.8h, v3.8h, v18.8h\n" + "fadd v8.8h, v4.8h, v18.8h\n" + "fadd v9.8h, v5.8h, v18.8h\n" + "fadd v19.8h, v10.8h, v18.8h\n" + "fadd v20.8h, v11.8h, v18.8h\n" + "fadd v21.8h, v12.8h, v18.8h\n" + "fadd v22.8h, v13.8h, v18.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fdiv v6.8h, v6.8h, v1.8h\n" + "fdiv v7.8h, v7.8h, v1.8h\n" + "fdiv v8.8h, v8.8h, v1.8h\n" + "fdiv v9.8h, v9.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fmul v2.8h, v6.8h, v2.8h\n" + "fmul v3.8h, v7.8h, v3.8h\n" + "fmul v4.8h, v8.8h, v4.8h\n" + "fmul v5.8h, v9.8h, v5.8h\n" + "fmul v10.8h, v19.8h, v10.8h\n" + "fmul v11.8h, v20.8h, v11.8h\n" + "fmul v12.8h, v21.8h, v12.8h\n" + "fmul v13.8h, v22.8h, v13.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] 
"r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0", + "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #16]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "subs x0, x0, #2\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #16\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v6.8h, v2.8h, v18.8h\n" + "fadd v7.8h, v3.8h, v18.8h\n" + "fadd v8.8h, v4.8h, v18.8h\n" + "fadd v9.8h, v5.8h, v18.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fdiv v6.8h, v6.8h, v1.8h\n" + "fdiv v7.8h, v7.8h, v1.8h\n" + "fdiv v8.8h, v8.8h, v1.8h\n" + "fdiv v9.8h, v9.8h, v1.8h\n" + "fmul v2.8h, v6.8h, v2.8h\n" + "fmul v3.8h, v7.8h, v3.8h\n" + "fmul v4.8h, v8.8h, v4.8h\n" + "fmul v5.8h, v9.8h, v5.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v18", "v20", "v22", "x0", "x1", "x2"); + } + } + + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 
8; + const F16 *f_o0c0 = pwFilterArray; + F16 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "ldr q21, [%[f_0], #48]\n" // f_o1c0 + "subs x0, x0, #2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + "fmla v10.8h, v21.8h, v1.h[0]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #64\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v19.8h, v2.8h, v18.8h\n" + "fadd v20.8h, v10.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fmul v2.8h, v19.8h, v2.8h\n" + "fmul v10.8h, v20.8h, v10.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q10, [%[out_1]]\n" // out_o1hw0 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", + "v23", "x0", "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #16]\n" // f_o0c0 + "subs x0, x0, #2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f_0], #32]\n" // f_o0c0 + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + 
"fmax v2.8h, v2.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v20.8h, v2.8h, v18.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fmul v2.8h, v20.8h, v2.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0", "x1", + "x2"); + } + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h new file mode 100644 index 00000000..8e684953 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h @@ -0,0 +1,148 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef _H_DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT_NO_PADDING +#define _H_DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT_NO_PADDING + +#include <arm_neon.h> +#include "sys.h" +#include "types.h" +#include "error.h" + +// Accumulate one NCHWc8 output vector (eight channels) at output position hw; out-of-range filter taps are skipped by clipping the filter window, so no padded input copy is required. +inline void calc_eight_channel_elements(I32 hw, + I32 ih_base, + I32 ih, + I32 iw, + I32 fh, + I32 fw, + I32 ow, + F16 *inArray, + I32 strideH, + I32 strideW, + I32 paddingT, + I32 paddingL, + const F16 *filterArray, + float16x8_t bias, + F16 *output) +{ + I32 h = hw / ow; + I32 w = hw % ow; + float16x8_t v0 = bias; + I32 ih_start = h * strideH - paddingT; + I32 iw_start = w * strideW - paddingL; + I32 fh_start = 0; + if (ih_start < 0) { + fh_start -= ih_start; + } + I32 fw_start = 0; + if (iw_start < 0) { + fw_start -= iw_start; + } + for (I32 fh_idx = fh_start; fh_idx < fh; fh_idx++) { + I32 ih_idx = ih_start + fh_idx; + if (ih_idx >= ih) { + break; + } + I32 iw_base = ((ih_base + ih_idx) * iw); + I32 filter_index = (fh_idx * fw + fw_start) * 8; + for (I32 fw_idx = fw_start; fw_idx < fw; fw_idx++, filter_index += 8) { + I32 iw_idx = iw_start + fw_idx; + if (iw_idx >= iw) { + break; + } + { + U32 in_index = (iw_base + iw_idx) * 8; + float16x8_t v1 = vld1q_f16(inArray + in_index); + float16x8_t v2 = vld1q_f16(filterArray + filter_index); + v0 = vfmaq_f16(v0, v1, v2); + } + } + } + vst1q_f16(output, v0); +} + +EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec); + +EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec); + +inline EE depthwise_pointwise_convolution_direct_no_padding(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: + ret = depthwise_pointwise_convolution_direct_no_padding_A55(inputDesc, inArray, + dwFilterDesc, dwFilterArray, pwFilterDesc, pwFilterArray, convParamSpec, dwBiasDesc, + dwBiasArray, pwBiasDesc, pwBiasArray, tmpBytes, tmp, outputDesc, outArray, + depthwiseActivationParamSpec, pointwiseActivationParamSpec); + break; + case ARM_A76: + ret = depthwise_pointwise_convolution_direct_no_padding_A76(inputDesc, inArray, + dwFilterDesc, dwFilterArray, pwFilterDesc, pwFilterArray, convParamSpec, dwBiasDesc, + dwBiasArray, pwBiasDesc, pwBiasArray, tmpBytes, tmp, outputDesc, outArray, +
depthwiseActivationParamSpec, pointwiseActivationParamSpec); + break; + default: + return NOT_SUPPORTED; + } + return ret; +} +#endif diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp new file mode 100644 index 00000000..e86fe2c9 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp @@ -0,0 +1,997 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h" +#include "cpu/arm/fp16/arm_functions_fp16.h" + +EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) +{ + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingL = convParamSpec.padding_left; + + if (dwFilterDesc.df != DF_NCHWC8 || pwFilterDesc.df != DF_NHWCN16) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + + I32 ohow = oh * ow; + F16 *pwArray = (F16 *)tmp; + + F16 buffer[8]; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + const F16 *f = dwFilterArray + c * fh * fw * 8; + const F16 *b = dwBiasArray + c * 8; + float16x8_t vv0 = vld1q_f16(b); + + I32 iter = 0; + U32 ih_base = ((n * ic) + c) * ih; + // nhwchw8 + for (; iter < ohow - 7; iter += 8) { + U32 out_base = iter * ic * 8 + c * 8 * 8; + for (I32 j = 0; j < 8; j++) { + I32 hw = iter + j; + calc_eight_channel_elements(hw, ih_base, ih, iw, fh, fw, ow, inArray, strideH, + strideW, paddingT, paddingL, f, vv0, buffer); + 
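+// buffer now holds the eight channels of output position hw. Apply the depthwise activation through the C intrinsic path (activation_fp16), then scatter the eight halves into the NHWChw8 tile so that each channel's eight consecutive output positions lie contiguously, the input layout consumed by the pointwise kernels further down.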
CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationParamSpec, buffer)); + U32 out_index = out_base + j; + for (I32 i = 0; i < 8; i++, out_index += 8) { + pwArray[out_index] = buffer[i]; + } + } + } + // nhwchw4 + for (; iter < ohow - 3; iter += 4) { + U32 out_base = iter * ic * 8 + c * 8 * 4; + for (I32 j = 0; j < 4; j++) { + I32 hw = iter + j; + calc_eight_channel_elements(hw, ih_base, ih, iw, fh, fw, ow, inArray, strideH, + strideW, paddingT, paddingL, f, vv0, buffer); + CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationParamSpec, buffer)); + U32 out_index = out_base + j; + for (I32 i = 0; i < 8; i++, out_index += 4) { + pwArray[out_index] = buffer[i]; + } + } + } + // nhwchw1 + for (; iter < ohow; iter++) { + U32 out_base = iter * ic * 8 + c * 8; + for (I32 j = 0; j < 1; j++) { + I32 hw = iter + j; + calc_eight_channel_elements(hw, ih_base, ih, iw, fh, fw, ow, inArray, strideH, + strideW, paddingT, paddingL, f, vv0, buffer); + CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationParamSpec, buffer)); + U32 out_index = out_base + j; + for (I32 i = 0; i < 8; i++, out_index++) { + pwArray[out_index] = buffer[i]; + } + } + } + } + + // pw_conv + // ohow / 8 + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 8; + F16 *in_pack = pwArray + hw * ic * 8; + const F16 *f_o0c0 = pwFilterArray; + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr x1, [%[in_0], #8]\n" + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ins v0.d[1], x1\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 + "ldr x2, [%[f_0], #8]\n" + "mov v7.16b, v22.16b\n" // out_o0hw5 + "ins v18.d[1], x2\n" + "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 + "ldr x3, [%[f_0], #24]\n" + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ins v19.d[1], x3\n" + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov v16.16b, v23.16b\n" // out_o1hw6 + "mov v17.16b, v23.16b\n" // out_o1hw7 + "0:\n" + "ldr d1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr x1, [%[in_0], #24]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "ins v20.d[1], x2\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "fmla v8.8h, v18.8h, v0.h[6]\n" + "ldr x3, [%[f_0], #56]\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + "ins v21.d[1], x3\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, 
v0.h[7]\n" + + "ldr d0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr x1, [%[in_0], #40]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "ldr x2, [%[f_0], #72]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "ins v18.d[1], x2\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "fmla v8.8h, v20.8h, v1.h[6]\n" + "ldr x3, [%[f_0], #88]\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "ins v19.d[1], x3\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "add %[f_0], %[f_0], #64\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "subs x0, x0, #2\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "fmla v15.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + "fmin v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v1.8h\n" + "fmin v16.8h, v16.8h, v1.8h\n" + "fmin v17.8h, v17.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v19.8h, v2.8h, v18.8h\n" + "fadd v20.8h, v3.8h, v18.8h\n" + "fadd v21.8h, v4.8h, v18.8h\n" + "fadd v22.8h, v5.8h, v18.8h\n" + "fadd v23.8h, v6.8h, v18.8h\n" + "fadd v24.8h, v7.8h, v18.8h\n" + "fadd v25.8h, v8.8h, v18.8h\n" + "fadd v26.8h, v9.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmax v23.8h, v23.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v0.8h\n" + "fmax v25.8h, v25.8h, v0.8h\n" + "fmax v26.8h, v26.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, 
v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fmin v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v1.8h\n" + "fmin v25.8h, v25.8h, v1.8h\n" + "fmin v26.8h, v26.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fdiv v23.8h, v23.8h, v1.8h\n" + "fdiv v24.8h, v24.8h, v1.8h\n" + "fdiv v25.8h, v25.8h, v1.8h\n" + "fdiv v26.8h, v26.8h, v1.8h\n" + "fmul v2.8h, v19.8h, v2.8h\n" + "fmul v3.8h, v20.8h, v3.8h\n" + "fmul v4.8h, v21.8h, v4.8h\n" + "fmul v5.8h, v22.8h, v5.8h\n" + "fmul v6.8h, v23.8h, v6.8h\n" + "fmul v7.8h, v24.8h, v7.8h\n" + "fmul v8.8h, v25.8h, v8.8h\n" + "fmul v9.8h, v26.8h, v9.8h\n" + + "fadd v19.8h, v10.8h, v18.8h\n" + "fadd v20.8h, v11.8h, v18.8h\n" + "fadd v21.8h, v12.8h, v18.8h\n" + "fadd v22.8h, v13.8h, v18.8h\n" + "fadd v23.8h, v14.8h, v18.8h\n" + "fadd v24.8h, v15.8h, v18.8h\n" + "fadd v25.8h, v16.8h, v18.8h\n" + "fadd v26.8h, v17.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmax v23.8h, v23.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v0.8h\n" + "fmax v25.8h, v25.8h, v0.8h\n" + "fmax v26.8h, v26.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fmin v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v1.8h\n" + "fmin v25.8h, v25.8h, v1.8h\n" + "fmin v26.8h, v26.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fdiv v23.8h, v23.8h, v1.8h\n" + "fdiv v24.8h, v24.8h, v1.8h\n" + "fdiv v25.8h, v25.8h, v1.8h\n" + "fdiv v26.8h, v26.8h, v1.8h\n" + "fmul v10.8h, v19.8h, v10.8h\n" + "fmul v11.8h, v20.8h, v11.8h\n" + "fmul v12.8h, v21.8h, v12.8h\n" + "fmul v13.8h, v22.8h, v13.8h\n" + "fmul v14.8h, v23.8h, v14.8h\n" + "fmul v15.8h, v24.8h, v15.8h\n" + "fmul v16.8h, v25.8h, v16.8h\n" + "fmul v17.8h, v26.8h, v17.8h\n" + + "13:\n" + "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out_0]], #64\n" + "st1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%[out_0]], #64\n" + "st1 {v10.8h, v11.8h, v12.8h, v13.8h}, [%[out_1]], #64\n" + "st1 {v14.8h, v15.8h, v16.8h, v17.8h}, [%[out_1]], #64\n" + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v12.16b\n" // out_o0hw0 + "ldr x1, [%[in_0], #8]\n" + "mov v3.16b, v12.16b\n" // out_o0hw1 + "ins v0.d[1], x1\n" + "mov v4.16b, v12.16b\n" // out_o0hw2 + "ldr d10, [%[f_0]]\n" // f_o0c0 + "mov v5.16b, v12.16b\n" // out_o0hw3 + "ldr x2, [%[f_0], #8]\n" + "mov v6.16b, v12.16b\n" // 
out_o0hw4 + "ins v10.d[1], x2\n" + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 + "0:\n" + "ldr d1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v10.8h, v0.h[0]\n" + "ldr x1, [%[in_0], #24]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "ldr d11, [%[f_0], #16]\n" // f_o0c0 + "fmla v5.8h, v10.8h, v0.h[3]\n" + "ldr x2, [%[f_0], #24]\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "ins v11.d[1], x2\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "subs x0, x0, #2\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + + "ldr d0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v11.8h, v1.h[0]\n" + "ldr x1, [%[in_0], #40]\n" + "fmla v3.8h, v11.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v4.8h, v11.8h, v1.h[2]\n" + "ldr d10, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v11.8h, v1.h[3]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v6.8h, v11.8h, v1.h[4]\n" + "ins v10.d[1], x2\n" + "fmla v7.8h, v11.8h, v1.h[5]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v8.8h, v11.8h, v1.h[6]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v9.8h, v11.8h, v1.h[7]\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v10.8h, #0x42, lsl #8\n" // three + "fadd v11.8h, v2.8h, v10.8h\n" + "fadd v12.8h, v3.8h, v10.8h\n" + "fadd v13.8h, v4.8h, v10.8h\n" + "fadd v14.8h, v5.8h, v10.8h\n" + "fadd v15.8h, v6.8h, v10.8h\n" + "fadd v16.8h, v7.8h, v10.8h\n" + "fadd v17.8h, v8.8h, v10.8h\n" + "fadd v18.8h, v9.8h, v10.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + "fmin v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v1.8h\n" + "fmin v16.8h, v16.8h, v1.8h\n" + "fmin v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v1.8h\n" + "fdiv v11.8h, v11.8h, v1.8h\n" + "fdiv v12.8h, v12.8h, v1.8h\n" + "fdiv v13.8h, v13.8h, v1.8h\n" + "fdiv v14.8h, v14.8h, v1.8h\n" + "fdiv v15.8h, v15.8h, v1.8h\n" + "fdiv v16.8h, v16.8h, v1.8h\n" + "fdiv v17.8h, v17.8h, v1.8h\n" + "fdiv v18.8h, v18.8h, v1.8h\n" + "fmul v2.8h, v11.8h, v2.8h\n" + "fmul v3.8h, v12.8h, v3.8h\n" + "fmul v4.8h, v13.8h, v4.8h\n" + "fmul v5.8h, v14.8h, v5.8h\n" + 
"fmul v6.8h, v15.8h, v6.8h\n" + "fmul v7.8h, v16.8h, v7.8h\n" + "fmul v8.8h, v17.8h, v8.8h\n" + "fmul v9.8h, v18.8h, v9.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw0 + "str q4, [%[out_0], #32]\n" // out_o0hw0 + "str q5, [%[out_0], #48]\n" // out_o0hw0 + "str q6, [%[out_0], #64]\n" // out_o0hw0 + "str q7, [%[out_0], #80]\n" // out_o0hw0 + "str q8, [%[out_0], #96]\n" // out_o0hw0 + "str q9, [%[out_0], #112]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2"); + } + } + + // ohow_remainder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 8; + const F16 *f_o0c0 = pwFilterArray; + F16 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ldr x2, [%[f_0], #8]\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ins v18.d[1], x2\n" + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "ldr x3, [%[f_0], #24]\n" + "mov v12.16b, v23.16b\n" // out_o1hw2 + "ins v19.d[1], x3\n" + "mov v13.16b, v23.16b\n" // out_o1hw3 + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ins v20.d[1], x2\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "ldr x3, [%[f_0], #56]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "ins v21.d[1], x3\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "subs x0, x0, #2\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr x2, [%[f_0], #72]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "ins v18.d[1], x2\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "ldr x3, [%[f_0], #88]\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "ins v19.d[1], x3\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "add %[in_0], %[in_0], #16\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "add %[f_0], %[f_0], #64\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax 
v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v6.8h, v2.8h, v18.8h\n" + "fadd v7.8h, v3.8h, v18.8h\n" + "fadd v8.8h, v4.8h, v18.8h\n" + "fadd v9.8h, v5.8h, v18.8h\n" + "fadd v19.8h, v10.8h, v18.8h\n" + "fadd v20.8h, v11.8h, v18.8h\n" + "fadd v21.8h, v12.8h, v18.8h\n" + "fadd v22.8h, v13.8h, v18.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fdiv v6.8h, v6.8h, v1.8h\n" + "fdiv v7.8h, v7.8h, v1.8h\n" + "fdiv v8.8h, v8.8h, v1.8h\n" + "fdiv v9.8h, v9.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fmul v2.8h, v6.8h, v2.8h\n" + "fmul v3.8h, v7.8h, v3.8h\n" + "fmul v4.8h, v8.8h, v4.8h\n" + "fmul v5.8h, v9.8h, v5.8h\n" + "fmul v10.8h, v19.8h, v10.8h\n" + "fmul v11.8h, v20.8h, v11.8h\n" + "fmul v12.8h, v21.8h, v12.8h\n" + "fmul v13.8h, v22.8h, v13.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0", + "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "mov x0, %[ic]\n" // ic_blk + "mov 
v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ldr x2, [%[f_0], #8]\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ins v18.d[1], x2\n" + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #16]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr x2, [%[f_0], #24]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ins v20.d[1], x2\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "subs x0, x0, #2\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ins v18.d[1], x2\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #16\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v6.8h, v2.8h, v18.8h\n" + "fadd v7.8h, v3.8h, v18.8h\n" + "fadd v8.8h, v4.8h, v18.8h\n" + "fadd v9.8h, v5.8h, v18.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fdiv v6.8h, v6.8h, v1.8h\n" + "fdiv v7.8h, v7.8h, v1.8h\n" + "fdiv v8.8h, v8.8h, v1.8h\n" + "fdiv v9.8h, v9.8h, v1.8h\n" + "fmul v2.8h, v6.8h, v2.8h\n" + "fmul v3.8h, v7.8h, v3.8h\n" + "fmul v4.8h, v8.8h, v4.8h\n" + "fmul v5.8h, v9.8h, v5.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v18", "v20", "v22", "x0", "x1", "x2"); + } + } + + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 8; + const F16 *f_o0c0 = pwFilterArray; + F16 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + 
"ldr d23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr x2, [%[f_0], #8]\n" + "ins v18.d[1], x2\n" + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "ldr x3, [%[f_0], #24]\n" + "ins v19.d[1], x3\n" + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "ldr x2, [%[f_0], #40]\n" + "ins v20.d[1], x2\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "subs x0, x0, #2\n" + "ldr x3, [%[f_0], #56]\n" + "ins v21.d[1], x3\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v10.8h, v21.8h, v1.h[0]\n" + "ldr x2, [%[f_0], #72]\n" + "ins v18.d[1], x2\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "add %[in_0], %[in_0], #4\n" + "ldr x3, [%[f_0], #88]\n" + "ins v19.d[1], x3\n" + "add %[f_0], %[f_0], #64\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v19.8h, v2.8h, v18.8h\n" + "fadd v20.8h, v10.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fmul v2.8h, v19.8h, v2.8h\n" + "fmul v10.8h, v20.8h, v10.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q10, [%[out_1]]\n" // out_o1hw0 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", + "v23", "x0", "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "ldr x2, [%[f_0], #8]\n" + "ins v18.d[1], x2\n" + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #16]\n" // f_o0c0 + "subs x0, x0, #2\n" + "ldr x2, [%[f_0], #24]\n" + "ins v20.d[1], x2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #32]\n" // f_o0c0 + "ldr x2, [%[f_0], #40]\n" + "ins v18.d[1], x2\n" + "add 
%[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v20.8h, v2.8h, v18.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fmul v2.8h, v20.8h, v2.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0", "x1", + "x2"); + } + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp new file mode 100644 index 00000000..24bcfb4a --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp @@ -0,0 +1,915 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
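+// NOTE: this A76 kernel computes the same depthwise + pointwise convolution as
+// the A55 kernel above; the two differ only in load scheduling. The A55 version
+// splits each 128-bit vector load into a 64-bit "ldr d" + "ldr x" + "ins" triple:
+//
+//     ldr d18, [%[f_0]]        // low half of the filter vector
+//     ldr x2,  [%[f_0], #8]    // high half via a general-purpose register
+//     ins v18.d[1], x2         // merge into the upper half of v18
+//
+// which presumably dual-issues better on the in-order A55 pipeline, while this
+// variant uses plain full-width "ldr q18, [%[f_0]]" loads, which the
+// out-of-order A76 core handles efficiently on its own.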
+ +#include "cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h" +#include "cpu/arm/fp16/arm_functions_fp16.h" + +EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) +{ + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingL = convParamSpec.padding_left; + + if (dwFilterDesc.df != DF_NCHWC8 || pwFilterDesc.df != DF_NHWCN16) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + + I32 ohow = oh * ow; + F16 *pwArray = (F16 *)tmp; + + F16 buffer[8]; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + const F16 *f = dwFilterArray + c * fh * fw * 8; + const F16 *b = dwBiasArray + c * 8; + float16x8_t vv0 = vld1q_f16(b); + + I32 iter = 0; + U32 ih_base = ((n * ic) + c) * ih; + // nhwchw8 + for (; iter < ohow - 7; iter += 8) { + U32 out_base = iter * ic * 8 + c * 8 * 8; + for (I32 j = 0; j < 8; j++) { + I32 hw = iter + j; + calc_eight_channel_elements(hw, ih_base, ih, iw, fh, fw, ow, inArray, strideH, + strideW, paddingT, paddingL, f, vv0, buffer); + CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationParamSpec, buffer)); + U32 out_index = out_base + j; + for (I32 i = 0; i < 8; i++, out_index += 8) { + pwArray[out_index] = buffer[i]; + } + } + } + // nhwchw4 + for (; iter < ohow - 3; iter += 4) { + U32 out_base = iter * ic * 8 + c * 8 * 4; + for (I32 j = 0; j < 4; j++) { + I32 hw = iter + j; + calc_eight_channel_elements(hw, ih_base, ih, iw, fh, fw, ow, inArray, strideH, + strideW, paddingT, paddingL, f, vv0, buffer); + CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationParamSpec, buffer)); + U32 out_index = out_base + j; + for (I32 i = 0; i < 8; i++, out_index += 4) { + pwArray[out_index] = buffer[i]; + } + } + } + // nhwchw1 + for (; iter < ohow; iter++) { + U32 out_base = iter * ic * 8 + c * 8; + for (I32 j = 0; j < 1; j++) { + I32 hw = iter + j; + calc_eight_channel_elements(hw, ih_base, ih, iw, fh, fw, ow, inArray, strideH, + strideW, paddingT, paddingL, f, vv0, buffer); + CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationParamSpec, buffer)); + U32 out_index = out_base + j; + for (I32 i = 0; i < 8; i++, out_index++) { + pwArray[out_index] = buffer[i]; + } + } + } + } + + // pw_conv + // ohow / 8 + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 8; + F16 *in_pack = pwArray + hw * ic * 8; + const F16 *f_o0c0 = pwFilterArray; + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr 
q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 + "mov v7.16b, v22.16b\n" // out_o0hw5 + "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov v16.16b, v23.16b\n" // out_o1hw6 + "mov v17.16b, v23.16b\n" // out_o1hw7 + "0:\n" + "ldr q1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v18.8h, v0.h[3]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "ldr q21, [%[f_0], #48]\n" // f_o1c0 + "fmla v8.8h, v18.8h, v0.h[6]\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + + "ldr q0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "fmla v8.8h, v20.8h, v1.h[6]\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "add %[f_0], %[f_0], #64\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "subs x0, x0, #2\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "fmla v15.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, 
v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + "fmin v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v1.8h\n" + "fmin v16.8h, v16.8h, v1.8h\n" + "fmin v17.8h, v17.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v19.8h, v2.8h, v18.8h\n" + "fadd v20.8h, v3.8h, v18.8h\n" + "fadd v21.8h, v4.8h, v18.8h\n" + "fadd v22.8h, v5.8h, v18.8h\n" + "fadd v23.8h, v6.8h, v18.8h\n" + "fadd v24.8h, v7.8h, v18.8h\n" + "fadd v25.8h, v8.8h, v18.8h\n" + "fadd v26.8h, v9.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmax v23.8h, v23.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v0.8h\n" + "fmax v25.8h, v25.8h, v0.8h\n" + "fmax v26.8h, v26.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fmin v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v1.8h\n" + "fmin v25.8h, v25.8h, v1.8h\n" + "fmin v26.8h, v26.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fdiv v23.8h, v23.8h, v1.8h\n" + "fdiv v24.8h, v24.8h, v1.8h\n" + "fdiv v25.8h, v25.8h, v1.8h\n" + "fdiv v26.8h, v26.8h, v1.8h\n" + "fmul v2.8h, v19.8h, v2.8h\n" + "fmul v3.8h, v20.8h, v3.8h\n" + "fmul v4.8h, v21.8h, v4.8h\n" + "fmul v5.8h, v22.8h, v5.8h\n" + "fmul v6.8h, v23.8h, v6.8h\n" + "fmul v7.8h, v24.8h, v7.8h\n" + "fmul v8.8h, v25.8h, v8.8h\n" + "fmul v9.8h, v26.8h, v9.8h\n" + + "fadd v19.8h, v10.8h, v18.8h\n" + "fadd v20.8h, v11.8h, v18.8h\n" + "fadd v21.8h, v12.8h, v18.8h\n" + "fadd v22.8h, v13.8h, v18.8h\n" + "fadd v23.8h, v14.8h, v18.8h\n" + "fadd v24.8h, v15.8h, v18.8h\n" + "fadd v25.8h, v16.8h, v18.8h\n" + "fadd v26.8h, v17.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmax v23.8h, v23.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v0.8h\n" + "fmax v25.8h, v25.8h, v0.8h\n" + "fmax v26.8h, v26.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fmin v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v1.8h\n" + "fmin v25.8h, v25.8h, v1.8h\n" + "fmin v26.8h, v26.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fdiv v23.8h, v23.8h, v1.8h\n" + "fdiv v24.8h, v24.8h, v1.8h\n" + "fdiv v25.8h, v25.8h, v1.8h\n" + "fdiv v26.8h, v26.8h, v1.8h\n" + "fmul v10.8h, v19.8h, v10.8h\n" + "fmul v11.8h, v20.8h, v11.8h\n" + "fmul v12.8h, v21.8h, v12.8h\n" + "fmul v13.8h, v22.8h, v13.8h\n" + "fmul v14.8h, v23.8h, v14.8h\n" + "fmul v15.8h, v24.8h, v15.8h\n" + "fmul v16.8h, v25.8h, v16.8h\n" + "fmul v17.8h, v26.8h, v17.8h\n" + + "13:\n" + "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out_0]], #64\n" + "st1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%[out_0]], #64\n" + "st1 {v10.8h, v11.8h, v12.8h, v13.8h}, [%[out_1]], #64\n" 
+ "st1 {v14.8h, v15.8h, v16.8h, v17.8h}, [%[out_1]], #64\n" + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v12.16b\n" // out_o0hw0 + "mov v3.16b, v12.16b\n" // out_o0hw1 + "mov v4.16b, v12.16b\n" // out_o0hw2 + "ldr q10, [%[f_0]]\n" // f_o0c0 + "mov v5.16b, v12.16b\n" // out_o0hw3 + "mov v6.16b, v12.16b\n" // out_o0hw4 + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 + "0:\n" + "ldr q1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v10.8h, v0.h[0]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "ldr q11, [%[f_0], #16]\n" // f_o0c0 + "fmla v5.8h, v10.8h, v0.h[3]\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "subs x0, x0, #2\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + + "ldr q0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v11.8h, v1.h[0]\n" + "fmla v3.8h, v11.8h, v1.h[1]\n" + "fmla v4.8h, v11.8h, v1.h[2]\n" + "ldr q10, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v11.8h, v1.h[3]\n" + "fmla v6.8h, v11.8h, v1.h[4]\n" + "fmla v7.8h, v11.8h, v1.h[5]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v8.8h, v11.8h, v1.h[6]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v9.8h, v11.8h, v1.h[7]\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v10.8h, #0x42, lsl #8\n" // three + "fadd v11.8h, v2.8h, v10.8h\n" + "fadd v12.8h, v3.8h, v10.8h\n" + "fadd v13.8h, v4.8h, v10.8h\n" + "fadd v14.8h, v5.8h, v10.8h\n" + "fadd v15.8h, v6.8h, 
v10.8h\n" + "fadd v16.8h, v7.8h, v10.8h\n" + "fadd v17.8h, v8.8h, v10.8h\n" + "fadd v18.8h, v9.8h, v10.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + "fmin v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v1.8h\n" + "fmin v16.8h, v16.8h, v1.8h\n" + "fmin v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v1.8h\n" + "fdiv v11.8h, v11.8h, v1.8h\n" + "fdiv v12.8h, v12.8h, v1.8h\n" + "fdiv v13.8h, v13.8h, v1.8h\n" + "fdiv v14.8h, v14.8h, v1.8h\n" + "fdiv v15.8h, v15.8h, v1.8h\n" + "fdiv v16.8h, v16.8h, v1.8h\n" + "fdiv v17.8h, v17.8h, v1.8h\n" + "fdiv v18.8h, v18.8h, v1.8h\n" + "fmul v2.8h, v11.8h, v2.8h\n" + "fmul v3.8h, v12.8h, v3.8h\n" + "fmul v4.8h, v13.8h, v4.8h\n" + "fmul v5.8h, v14.8h, v5.8h\n" + "fmul v6.8h, v15.8h, v6.8h\n" + "fmul v7.8h, v16.8h, v7.8h\n" + "fmul v8.8h, v17.8h, v8.8h\n" + "fmul v9.8h, v18.8h, v9.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw0 + "str q4, [%[out_0], #32]\n" // out_o0hw0 + "str q5, [%[out_0], #48]\n" // out_o0hw0 + "str q6, [%[out_0], #64]\n" // out_o0hw0 + "str q7, [%[out_0], #80]\n" // out_o0hw0 + "str q8, [%[out_0], #96]\n" // out_o0hw0 + "str q9, [%[out_0], #112]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2"); + } + } + + // ohow_remainder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 8; + const F16 *f_o0c0 = pwFilterArray; + F16 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr q21, [%[f_0], #48]\n" // f_o1c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "subs x0, x0, #2\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + 
"fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "add %[in_0], %[in_0], #16\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "add %[f_0], %[f_0], #64\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v6.8h, v2.8h, v18.8h\n" + "fadd v7.8h, v3.8h, v18.8h\n" + "fadd v8.8h, v4.8h, v18.8h\n" + "fadd v9.8h, v5.8h, v18.8h\n" + "fadd v19.8h, v10.8h, v18.8h\n" + "fadd v20.8h, v11.8h, v18.8h\n" + "fadd v21.8h, v12.8h, v18.8h\n" + "fadd v22.8h, v13.8h, v18.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fdiv v6.8h, v6.8h, v1.8h\n" + "fdiv v7.8h, v7.8h, v1.8h\n" + "fdiv v8.8h, v8.8h, v1.8h\n" + "fdiv v9.8h, v9.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fmul v2.8h, v6.8h, v2.8h\n" + "fmul v3.8h, v7.8h, v3.8h\n" + "fmul v4.8h, v8.8h, v4.8h\n" + "fmul v5.8h, v9.8h, v5.8h\n" + "fmul v10.8h, v19.8h, v10.8h\n" + "fmul v11.8h, v20.8h, v11.8h\n" + "fmul v12.8h, v21.8h, v12.8h\n" + "fmul v13.8h, v22.8h, v13.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] 
"r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0", + "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #16]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "subs x0, x0, #2\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #16\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v6.8h, v2.8h, v18.8h\n" + "fadd v7.8h, v3.8h, v18.8h\n" + "fadd v8.8h, v4.8h, v18.8h\n" + "fadd v9.8h, v5.8h, v18.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fdiv v6.8h, v6.8h, v1.8h\n" + "fdiv v7.8h, v7.8h, v1.8h\n" + "fdiv v8.8h, v8.8h, v1.8h\n" + "fdiv v9.8h, v9.8h, v1.8h\n" + "fmul v2.8h, v6.8h, v2.8h\n" + "fmul v3.8h, v7.8h, v3.8h\n" + "fmul v4.8h, v8.8h, v4.8h\n" + "fmul v5.8h, v9.8h, v5.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v18", "v20", "v22", "x0", "x1", "x2"); + } + } + + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 
8; + const F16 *f_o0c0 = pwFilterArray; + F16 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "ldr q21, [%[f_0], #48]\n" // f_o1c0 + "subs x0, x0, #2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + "fmla v10.8h, v21.8h, v1.h[0]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #64\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v19.8h, v2.8h, v18.8h\n" + "fadd v20.8h, v10.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fmul v2.8h, v19.8h, v2.8h\n" + "fmul v10.8h, v20.8h, v10.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q10, [%[out_1]]\n" // out_o1hw0 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", + "v23", "x0", "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #16]\n" // f_o0c0 + "subs x0, x0, #2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f_0], #32]\n" // f_o0c0 + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + 
"fmax v2.8h, v2.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v20.8h, v2.8h, v18.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fmul v2.8h, v20.8h, v2.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0", "x1", + "x2"); + } + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp16/eltwise.cpp b/compute/tensor/src/cpu/arm/fp16/eltwise.cpp new file mode 100644 index 00000000..6f8fc40f --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/eltwise.cpp @@ -0,0 +1,86 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#include "cpu/cpu_functions.h" + +EE eltwise_fp16(std::vector input, + std::vector inputSize, + U32 num, + U32 len, + void *output, + EltwiseMode eltwiseMode) +{ + F16 buffer[8]; + U32 len_tail = len % 8; + U32 len_main = len - len_tail; + F16 *output_ptr = (F16 *)output; + F16 *tmp = buffer; + for (U32 i = 0; i < len_main; i += 8) { + get_vector((F16 *)input[0], inputSize[0], &tmp, 8, i, 8, buffer); + float16x8_t tmp_v = vld1q_f16(tmp); + for (U32 j = 1; j < num; j++) { + get_vector((F16 *)input[j], inputSize[j], &tmp, 8, i, 8, buffer); + float16x8_t value_v = vld1q_f16(tmp); + switch (eltwiseMode) { + case ELTWISE_SUM: + tmp_v = vaddq_f16(tmp_v, value_v); + break; + case ELTWISE_MAX: + tmp_v = vmaxq_f16(tmp_v, value_v); + break; + case ELTWISE_PROD: + tmp_v = vmulq_f16(tmp_v, value_v); + break; + case ELTWISE_SUB: + tmp_v = vsubq_f16(tmp_v, value_v); + break; + case ELTWISE_DIV: + tmp_v = vdivq_f16(tmp_v, value_v); + break; + default: + return NOT_SUPPORTED; + } + } + vst1q_f16(output_ptr + i, tmp_v); + } + for (U32 i = len_main; i < len; i++) { + get_vector((F16 *)input[0], inputSize[0], &tmp, 8, i, 1, buffer); + F32 tmp_s = tmp[0]; + for (U32 j = 1; j < num; j++) { + get_vector((F16 *)input[j], inputSize[j], &tmp, 8, i, 1, buffer); + F32 value_s = tmp[0]; + switch (eltwiseMode) { + case ELTWISE_SUM: + tmp_s = value_s + tmp_s; + break; + case ELTWISE_MAX: + tmp_s = (value_s > tmp_s) ? value_s : tmp_s; + break; + case ELTWISE_PROD: + tmp_s *= value_s; + break; + case ELTWISE_SUB: + tmp_s -= value_s; + break; + case ELTWISE_DIV: + tmp_s /= value_s; + break; + default: + return NOT_SUPPORTED; + } + } + output_ptr[i] = tmp_s; + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp16/lstm.cpp b/compute/tensor/src/cpu/arm/fp16/lstm.cpp new file mode 100644 index 00000000..ec00702b --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/lstm.cpp @@ -0,0 +1,263 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include +#include "cpu/arm/fp16/tensor_computing_fp16.h" + +void mvm_nkn32(U32 fn, U32 fk, const F16 *filterArray, F16 *input, F16 *output) +{ + for (U32 n = 0; n < fn; n++) { + F16 *in = input; + const F16 *f = filterArray + n * fk * 32; + __asm__ __volatile__("ldr s0, [%[in]]\n" + "ldr q1, [%[out]]\n" + "ldr q2, [%[out], #16]\n" + "ldr q3, [%[out], #32]\n" + "ldr q4, [%[out], #48]\n" + "mov x0, %[k]\n" + "ldr q5, [%[f]]\n" + "ldr q6, [%[f], #16]\n" + "ldr q7, [%[f], #32]\n" + "ldr q8, [%[f], #48]\n" + "0:\n" + "prfm pldl2strm, [%[f], #4096]\n" + "prfm pldl1strm, [%[f], #1024]\n" + "ldr d9, [%[f], #64]\n" + "fmla v1.8h, v5.8h, v0.h[0]\n" + "ldr x9, [%[f], #72]\n" + "ins v9.d[1], x9\n" + "ldr d10, [%[f], #80]\n" + "fmla v2.8h, v6.8h, v0.h[0]\n" + "ldr x10, [%[f], #88]\n" + "ins v10.d[1], x10\n" + "ldr d11, [%[f], #96]\n" + "fmla v3.8h, v7.8h, v0.h[0]\n" + "ldr x11, [%[f], #104]\n" + "ins v11.d[1], x11\n" + "ldr d12, [%[f], #112]\n" + "fmla v4.8h, v8.8h, v0.h[0]\n" + "ldr x12, [%[f], #120]\n" + "ins v12.d[1], x12\n" + + "ldr d5, [%[f], #128]\n" + "fmla v1.8h, v9.8h, v0.h[1]\n" + "ldr x5, [%[f], #136]\n" + "ins v5.d[1], x5\n" + "ldr d6, [%[f], #144]\n" + "fmla v2.8h, v10.8h, v0.h[1]\n" + "ldr x6, [%[f], #152]\n" + "ins v6.d[1], x6\n" + "ldr d7, [%[f], #160]\n" + "fmla v3.8h, v11.8h, v0.h[1]\n" + "ldr x7, [%[f], #168]\n" + "ins v7.d[1], x7\n" + "ldr d8, [%[f], #176]\n" + "fmla v4.8h, v12.8h, v0.h[1]\n" + "ldr x8, [%[f], #184]\n" + "add %[in], %[in], #4\n" + "ins v8.d[1], x8\n" + "add %[f], %[f], #128\n" + "ldr s0, [%[in]]\n" + "sub x0, x0, #2\n" + + "cmp x0, #3\n" + "bgt 0b\n" + "ldr q9, [%[f], #64]\n" + "ldr q10, [%[f], #80]\n" + "ldr q11, [%[f], #96]\n" + "ldr q12, [%[f], #112]\n" + "fmla v1.8h, v5.8h, v0.h[0]\n" + "fmla v2.8h, v6.8h, v0.h[0]\n" + "fmla v3.8h, v7.8h, v0.h[0]\n" + "fmla v4.8h, v8.8h, v0.h[0]\n" + "fmla v1.8h, v9.8h, v0.h[1]\n" + "fmla v2.8h, v10.8h, v0.h[1]\n" + "fmla v3.8h, v11.8h, v0.h[1]\n" + "fmla v4.8h, v12.8h, v0.h[1]\n" + "cmp x0, #3\n" + "bne 1f\n" + "ldr h0, [%[in], #4]\n" + "ldr q5, [%[f], #128]\n" + "ldr q6, [%[f], #144]\n" + "ldr q7, [%[f], #160]\n" + "ldr q8, [%[f], #176]\n" + "fmla v1.8h, v5.8h, v0.h[0]\n" + "fmla v2.8h, v6.8h, v0.h[0]\n" + "fmla v3.8h, v7.8h, v0.h[0]\n" + "fmla v4.8h, v8.8h, v0.h[0]\n" + + "1:\n" + "str q1, [%[out]]\n" + "str q2, [%[out], #16]\n" + "str q3, [%[out], #32]\n" + "str q4, [%[out], #48]\n" + : [out] "+r"(output), [f] "+r"(f), [in] "+r"(in) + : [k] "r"((I64)fk) + : "memory", "cc", "x0", "x5", "x6", "x7", "x8", "x9", "x10", "x11", + "x12", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12"); + output += 32; + } +} + +EE rnncell_fp16(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(arch); + if (nullptr == currentX || nullptr == filter || nullptr == bias || nullptr == state || + nullptr == tmp || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ix; + U32 on, oh; + U32 fk, fn; + CHECK_STATUS(tensor2dGet(xDesc, &idt, &idf, &in, &ix)); + CHECK_STATUS(tensor2dGet(filterDesc[0], &fdt, &fdf, &fn, &fk)); + CHECK_STATUS(tensor2dGet(hDesc, &odt, &odf, &on, &oh)); + if (fdf != DF_NKN32) { + CHECK_STATUS(NOT_MATCH); + } + fn /= 32; + 
+ U32 batch = in; + I32 xDim = ix; + I32 hDim = rnnParamSpec.numOutput; + I32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection + : rnnParamSpec.numOutput; + if (!(idt == DT_F16 && fdt == DT_F16 && odt == DT_F16)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(4 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { + CHECK_STATUS(NOT_MATCH); + } + F32 forgetBias = rnnParamSpec.forgetBias; + ActivationMode activationMode = rnnParamSpec.activationMode; + if (activationMode != ACTIVATION_TANH) { + CHECK_STATUS(NOT_SUPPORTED); + } + + const F16 *currentXArray = (const F16 *)currentX; + F16 *lastStateArray = (F16 *)state; + F16 *lastHArray = lastStateArray + column; + F16 *tmpArray = (F16 *)tmp; + F16 *currentStateArray = (F16 *)state; + F16 *currentHArray = currentStateArray + column; + F16 *outputArray = (F16 *)output; + F16 *xhArray = tmpArray; + F16 *intermediateH = xhArray + (xDim + hDim); + U32 lastStateStride = column + hDim; + U32 lastHStride = column + hDim; + U32 currentStateStride = column + hDim; + U32 currentHStride = column + hDim; + float16x8_t forgetBiasVector = vdupq_n_f16(forgetBias); + for (U32 m = 0; m < batch; m++) { + F16 *lastBatchH = lastHArray + m * lastHStride; + memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F16)); + memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(F16)); + + memcpy(intermediateH, bias[0], column * 4 * sizeof(F16)); + mvm_nkn32(fn, fk, (const F16 *)filter[0], xhArray, intermediateH); + + F16 *out_i = intermediateH; + F16 *out_g = out_i + column; + F16 *out_f = out_i + column * 2; + F16 *out_o = out_i + column * 3; + + F16 *lastBatchState = lastStateArray + m * lastStateStride; + F16 *currentBatchState = currentStateArray + m * currentStateStride; + F16 *currentBatchH = currentHArray + m * currentHStride; + F16 *currentOutput = outputArray + m * batchStrideH; + + F16 *tmpState, *tmpHH, *tmpH; + if (rnnParamSpec.zoneoutCell == 0) { + tmpState = currentBatchState; + } else { + tmpState = out_i; + } + if (rnnParamSpec.numProjection > 0) { + tmpHH = out_g; + tmpH = currentOutput; + } else { + tmpHH = currentOutput; + tmpH = out_g; + } + + I32 h = 0; + for (; h < column - 7; h += 8) { + float16x8_t out_i_v = vld1q_f16(out_i + h); + float16x8_t out_g_v = vld1q_f16(out_g + h); + float16x8_t out_f_v = vld1q_f16(out_f + h); + float16x8_t out_o_v = vld1q_f16(out_o + h); + float16x8_t C_v = vld1q_f16(lastBatchState + h); + float16x8_t I_v = vsigmoidq_f16(out_i_v); + float16x8_t F_v = vsigmoidq_f16(vaddq_f16(out_f_v, forgetBiasVector)); + float16x8_t O_v = vsigmoidq_f16(out_o_v); + float16x8_t G_v = vtanhq_f16(out_g_v); + C_v = vaddq_f16_f32(vmulq_f16(C_v, F_v), vmulq_f16(I_v, G_v)); + float16x8_t out_hidden_v = vmulq_f16(O_v, vtanhq_f16(C_v)); + vst1q_f16(tmpState + h, C_v); + vst1q_f16(tmpHH + h, out_hidden_v); + } + for (; h < column; h++) { + F16 C_s = lastBatchState[h]; + F16 I_s = 1.0 / (1.0 + exp(-out_i[h])); + F16 F_s = 1.0 / (1.0 + exp(-(out_f[h] + forgetBias))); + F16 O_s = 1.0 / (1.0 + exp(-out_o[h])); + F16 G_s = tanh(out_g[h]); + C_s = C_s * F_s + I_s * G_s; + F16 value = O_s * tanh(C_s); + tmpState[h] = C_s; + tmpHH[h] = value; + } + if (rnnParamSpec.zoneoutCell != 0) { + array_scale_f16(tmpState, tmpState, column, 1 - rnnParamSpec.zoneoutCell, 0); + array_scale_f16(lastBatchState, lastBatchState, column, rnnParamSpec.zoneoutCell, 0); + array_add_f16(tmpState, lastBatchState, currentBatchState, column); + } + + if (rnnParamSpec.numProjection > 0) { + memset(tmpH, 0, sizeof(F16) * hDim); + mvm_nkn32(hDim / 32, 
rnnParamSpec.numProjection, (const F16 *)filter[1], tmpHH, tmpH); + } + if (rnnParamSpec.zoneoutOutput != 0) { + if (rnnParamSpec.numProjection > 0) { + array_scale_f16(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + } else { + array_scale_f16(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + } + array_scale_f16(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneoutOutput, 0); + array_add_f16(out_f, lastBatchH, currentBatchH, hDim); + } else { + memcpy(currentBatchH, currentOutput, sizeof(F16) * hDim); + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp16/normalization.cpp b/compute/tensor/src/cpu/arm/fp16/normalization.cpp new file mode 100644 index 00000000..503e2970 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/normalization.cpp @@ -0,0 +1,62 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
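layer_normalization_fp16 below normalizes each innermost slice to zero mean and unit variance before applying the learned per-element affine. Per element the computation is y[i] = alpha[i] * (x[i] - mean) / sqrt(var + eps) + beta[i], with eps = 1e-6 as in the kernel. A scalar sketch (illustrative only):

#include <math.h>

static void layer_norm_ref(const float *x, const float *alpha, const float *beta, float *y, int len)
{
    float mean = 0.0f, var = 0.0f;
    for (int i = 0; i < len; i++) mean += x[i];
    mean /= len;
    for (int i = 0; i < len; i++) var += (x[i] - mean) * (x[i] - mean);
    var /= len;                                  // biased variance, matching array_var_f16
    float inv_std = 1.0f / sqrtf(var + 1e-6f);
    for (int i = 0; i < len; i++) y[i] = alpha[i] * (x[i] - mean) * inv_std + beta[i];
}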
+ +#include <math.h> +#include "cpu/arm/fp16/tensor_computing_fp16.h" + +inline void array_norm_scale_fp16( + F16 *input, F16 *output, I32 len, F32 mean, F32 var, F16 *alpha, F16 *beta) +{ + F32 eps = 1e-6; + F32 std_value = sqrt(var + eps); + float16x8_t mean_v = vdupq_n_f16(mean); + float16x8_t std_v = vdupq_n_f16(std_value); + + I32 i = 0; + for (i = 0; i < len - 7; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t alpha_v = vld1q_f16(alpha + i); + float16x8_t beta_v = vld1q_f16(beta + i); + + float16x8_t tmp_v = vsubq_f16(in, mean_v); + tmp_v = vdivq_f16(tmp_v, std_v); + tmp_v = vfmaq_f16(beta_v, alpha_v, tmp_v); + vst1q_f16(output + i, tmp_v); + } + for (; i < len; i++) { + output[i] = alpha[i] * (input[i] - mean) / std_value + beta[i]; + } +} + +EE layer_normalization_fp16( + TensorDesc inputDesc, F16 *input, F16 *alpha, F16 *beta, TensorDesc outputDesc, F16 *output) +{ + UNUSED(outputDesc); + if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + U32 size = tensorNumElements(inputDesc); + I32 size_inner = inputDesc.dims[0]; + I32 size_outer = size / size_inner; + for (I32 i = 0; i < size_outer; i++) { + F16 *current_input = input + i * size_inner; + F16 *current_output = output + i * size_inner; + F32 mean = array_mean_f16(current_input, size_inner); + F32 var = array_var_f16(current_input, size_inner, mean); + + array_norm_scale_fp16(current_input, current_output, size_inner, mean, var, alpha, beta); + } + + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp16/pooling.cpp b/compute/tensor/src/cpu/arm/fp16/pooling.cpp new file mode 100644 index 00000000..b8d87d25 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/pooling.cpp @@ -0,0 +1,91 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/tensor_computing_fp16.h" + +EE pooling_c8_fp16(const F16 *input, + U32 stride, + int hstart, + int hend, + int wstart, + int wend, + F16 *output, + PoolingParamSpec poolingParamSpec) +{ + EE ret = SUCCESS; + PoolingMode pm = poolingParamSpec.mode; + float16x8_t in1, out1; + float16x8_t poolSize = vdupq_n_f16(float16_t((hend - hstart) * (wend - wstart))); + out1 = vdupq_n_f16(float16_t((pm == POOLING_MAX) ?
UNI_F16_MIN : 0)); + for (int kernelH = hstart; kernelH < hend; kernelH++) { + for (int kernelW = wstart; kernelW < wend; kernelW++) { + const U32 index = (kernelH * stride + kernelW) * 8; + in1 = vld1q_f16(input + index); + switch (pm) { + case POOLING_MAX: + out1 = vmaxq_f16(in1, out1); + break; + case POOLING_MEAN: + out1 = vaddq_f16(out1, in1); + break; + default: + ret = NOT_SUPPORTED; + break; + } + } + } + vst1q_f16(output, ((pm == POOLING_MAX) ? out1 : vdivq_f16(out1, poolSize))); + return ret; +} + +EE pooling_c8_big_fp16(const F16 *input, + U32 stride, + int hstart, + int hend, + int wstart, + int wend, + F16 *output, + int poolSize) +{ + EE ret = SUCCESS; + float32x4_t out0, out1; + float32x4_t p = vdupq_n_f32(poolSize); + float16x4_t in0, in1, temp0, temp1; + temp0 = vdup_n_f16(0); + temp1 = temp0; + out0 = vdupq_n_f32(0); + out1 = out0; + int count = 0; + for (int kernelH = hstart; kernelH < hend; kernelH++) { + for (int kernelW = wstart; kernelW < wend; kernelW++, count++) { + const U32 index = (kernelH * stride + kernelW) * 8; + in0 = vld1_f16(input + index); + in1 = vld1_f16(input + index + 4); + temp0 = vadd_f16(temp0, in0); + temp1 = vadd_f16(temp1, in1); + if (count % 256 == 255) { + out0 = vaddq_f32(out0, vcvt_f32_f16(temp0)); + out1 = vaddq_f32(out1, vcvt_f32_f16(temp1)); + temp0 = vdup_n_f16(0); + temp1 = temp0; + } + } + } + out0 = vaddq_f32(out0, vcvt_f32_f16(temp0)); + out1 = vaddq_f32(out1, vcvt_f32_f16(temp1)); + out0 = vdivq_f32(out0, p); + out1 = vdivq_f32(out1, p); + vst1_f16(output, vcvt_f16_f32(out0)); + vst1_f16(output + 4, vcvt_f16_f32(out1)); + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/prelu.cpp b/compute/tensor/src/cpu/arm/fp16/prelu.cpp new file mode 100644 index 00000000..a8fa835c --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/prelu.cpp @@ -0,0 +1,61 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
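prelu_fp16 below computes a compare mask with vcleq_f16 and selects between x and slope * x with vbslq_f16. Element-wise it reduces to this sketch; propagate_down selects one shared slope instead of a per-channel slope:

// PReLU reference: negative inputs are scaled, positive ones pass through.
static inline float prelu_ref(float x, float slope)
{
    return (x <= 0.0f) ? slope * x : x;
}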
+ +#include "cpu/arm/tensor_computing_arm.h" +#include "cpu/arm/fp16/tensor_computing_fp16.h" + +EE prelu_fp16(TensorDesc inputDesc, + F16 *input, + F16 *weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + F16 *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + if (tensorIs4d(inputDesc) && tensorIs4d(outputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + if (idf != DF_NCHWC8) { + CHECK_STATUS(NOT_SUPPORTED); + } + } else { + return NOT_SUPPORTED; + } + + CHECK_REQUIREMENT(in == on && ic == oc && ih == oh && iw == ow); + ic /= 8; + float16x8_t slope; + uint16x8_t mask; + float16x8_t in0, out0; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 hw = 0; hw < ih * iw; hw++) { + slope = preluDesc.propagate_down ? vdupq_n_f16(weight[0]) + : vld1q_f16(weight + c * 8); + in0 = vld1q_f16(input); + mask = vcleq_f16(in0, vdupq_n_f16(0.f)); + float16x8_t tmp = vmulq_f16(in0, slope); + out0 = vbslq_f16(mask, tmp, in0); + vst1q_f16(output, out0); + input += 8; + output += 8; + } + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp16/quantize.cpp b/compute/tensor/src/cpu/arm/fp16/quantize.cpp new file mode 100644 index 00000000..8adfa1cd --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/quantize.cpp @@ -0,0 +1,158 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include <math.h> +#include <string.h> +#include "cpu/arm/fp16/tensor_computing_fp16.h" + +inline void apply_scale_f16(U32 numData, F16 *array, F16 scale, INT8 *qArray, bool clamp) +{ + for (U32 i = 0; i < numData; i++) { + F32 tmp = array[i]; + tmp *= scale; + qArray[i] = round_towards_zero(tmp, clamp); + } +} + +EE quantize_tensor_fp16( + TensorDesc dDesc, const void *data, TensorDesc *qDesc, void *qData, F16 *scale) +{ + if (nullptr == data || nullptr == qDesc || nullptr == qData || nullptr == scale) { + CHECK_STATUS(NULL_POINTER); + } + DataType dt; + DataFormat df; + U32 n, c, h, w; + if (tensorIs2d(dDesc)) { + CHECK_STATUS(tensor2dGet(dDesc, &dt, &df, &n, &w)); + c = 1; + h = 1; + } else if (tensorIs3d(dDesc)) { + CHECK_STATUS(tensor3dGet(dDesc, &dt, &df, &n, &h, &w)); + c = 1; + } else { + CHECK_STATUS(tensor4dGet(dDesc, &dt, &df, &n, &c, &h, &w)); + } + + switch (dt) { + case DT_F16: { + switch (df) { + case DF_HWNCN8C4: { // winograd + F16 *array = (F16 *)data; + for (U32 idx = 0; idx < 36; idx++) { + float16x8_t tmp_v = vld1q_f16(array + idx * 8 * c); + float16x8_t max_v = tmp_v; + float16x8_t min_v = tmp_v; + + for (U32 o = 0; o < n; o += 8) { + F16 *base = array + o * 36 * c + idx * 8 * c; + for (U32 i = 0; i < 8 * c; i += 8) { + tmp_v = vld1q_f16(base + i); + max_v = vmaxq_f16(max_v, tmp_v); + min_v = vminq_f16(min_v, tmp_v); + } + } + + F16 max = vmaxvq_f16(max_v); + F16 min = vminvq_f16(min_v); + if (max == 0 && min == 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (max > 0 && min < 0) { + F16 scale_max = 127.0 / max; + F16 scale_min = -127.0 / min; + scale[idx] = (scale_max < scale_min) ? scale_max : scale_min; + } else if (max < 0) { + scale[idx] = -127.0 / min; + } else { // min > 0 + scale[idx] = 127.0 / max; + } + + INT8 *qArray = (INT8 *)qData; + for (U32 o = 0; o < n; o += 8) { + U32 base = o * 36 * c + idx * 8 * c; + apply_scale_f16(8 * c, array + base, scale[idx], qArray + base, false); + } + } + *qDesc = tensor4df(DT_I8, df, n, c, h, w); + break; + } + default: { + if (tensorIs2d(dDesc)) { + *qDesc = tensor2df(DT_I8, df, n, w); + } else if (tensorIs3d(dDesc)) { + *qDesc = tensor3df(DT_I8, df, n, h, w); + } else { + *qDesc = tensor4df(DT_I8, df, n, c, h, w); + } + F16 *array = (F16 *)data; + float16x8_t tmp_v = vld1q_f16(array); + float16x8_t max_v = tmp_v; + float16x8_t min_v = tmp_v; + + U32 numData = n * c * h * w; + CHECK_REQUIREMENT(numData >= 8); + U32 i = 8; + for (; i < numData - 7; i += 8) { + tmp_v = vld1q_f16(array + i); + max_v = vmaxq_f16(max_v, tmp_v); + min_v = vminq_f16(min_v, tmp_v); + } + + F16 max = vmaxvq_f16(max_v); + F16 min = vminvq_f16(min_v); + + for (; i < numData; i++) { + F16 tmp = array[i]; + if (tmp > max) { + max = tmp; + } + if (tmp < min) { + min = tmp; + } + } + if (max == 0 && min == 0) { + *scale = 1; + memset(qData, 0, tensorNumBytes(*qDesc)); + break; + } + F16 scaleRaw; + if (max > 0 && min < 0) { + F32 scale_max = 127.0 / max; + F32 scale_min = -127.0 / min; + scaleRaw = (scale_max < scale_min) ?
scale_max : scale_min; + } else if (max < 0) { + scaleRaw = -127.0 / min; + } else { // min > 0 + scaleRaw = 127.0 / max; + } + UNI_DEBUG_LOG("%f is the max FP16 value, and min value is %f\n", max, min); + if (*scale < scaleRaw) { + *scale = scaleRaw; + } + + INT8 *qArray = (INT8 *)qData; + apply_scale_f16(numData, array, *scale, qArray, (*scale) != scaleRaw); + break; + } + } + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + break; + } + } + UNI_DEBUG_LOG("%f is the quantization scale\n", scale[0]); + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp16/scale.cpp b/compute/tensor/src/cpu/arm/fp16/scale.cpp new file mode 100644 index 00000000..80fb60ad --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/scale.cpp @@ -0,0 +1,123 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <arm_neon.h> +#include "cpu/arm/fp16/tensor_computing_fp16.h" + +EE scale_nchwc8_fp16( + F16 *input, F16 *alpha, F16 *beta, I32 in, I32 ic, I32 elements_per_channel, F16 *output) +{ + float16x8_t one = vdupq_n_f16(1.); + float16x8_t zero = vdupq_n_f16(0.); + U32 index = 0; + for (I32 n = 0; n < in; n++) { + for (I32 c = 0; c < ic; c += 8) { + float16x8_t alpha_vec = (alpha == nullptr) ? one : vld1q_f16(alpha + c); + float16x8_t beta_vec = (beta == nullptr) ? zero : vld1q_f16(beta + c); + for (I32 i = 0; i < elements_per_channel; i++) { + float16x8_t in_vec = vld1q_f16(input + index); + float16x8_t out_vec = vfmaq_f16(beta_vec, alpha_vec, in_vec); + vst1q_f16(output + index, out_vec); + index += 8; + } + } + } + return SUCCESS; +} + +EE scale_nchw_fp16( + F16 *input, F16 *alpha, F16 *beta, I32 in, I32 ic, I32 elements_per_channel, F16 *output) +{ + float16x8_t one = vdupq_n_f16(1.); + float16x8_t zero = vdupq_n_f16(0.); + U32 index = 0; + for (I32 n = 0; n < in; n++) { + for (I32 c = 0; c < ic; c++) { + float16x8_t alpha_vec = (alpha == nullptr) ? one : vdupq_n_f16(alpha[c]); + float16x8_t beta_vec = (beta == nullptr) ? zero : vdupq_n_f16(beta[c]); + I32 i = 0; + for (; i < elements_per_channel - 7; i += 8) { + float16x8_t in_vec = vld1q_f16(input + index); + float16x8_t out_vec = vfmaq_f16(beta_vec, alpha_vec, in_vec); + vst1q_f16(output + index, out_vec); + index += 8; + } + for (; i < elements_per_channel; i++) { + float alpha_s = (alpha == nullptr) ? 1 : alpha[c]; + float beta_s = (beta == nullptr) ?
0 : beta[c]; + output[index] = alpha_s * input[index] + beta_s; + index++; + } + } + } + return SUCCESS; +} + +EE scale_nhwc_fp16( + F16 *input, F16 *alpha, F16 *beta, I32 in, I32 ic, I32 elements_per_channel, F16 *output) +{ + float16x8_t one = vdupq_n_f16(1.); + float16x8_t zero = vdupq_n_f16(0.); + U32 index = 0; + for (I32 n = 0; n < in; n++) { + for (I32 i = 0; i < elements_per_channel; i++) { + I32 c = 0; + for (; c < ic - 7; c += 8) { + float16x8_t alpha_vec = (alpha == nullptr) ? one : vld1q_f16(alpha + c); + float16x8_t beta_vec = (beta == nullptr) ? zero : vld1q_f16(beta + c); + float16x8_t in_vec = vld1q_f16(input + index); + float16x8_t out_vec = vfmaq_f16(beta_vec, alpha_vec, in_vec); + vst1q_f16(output + index, out_vec); + index += 8; + } + for (; c < ic; c++) { + float alpha_s = (alpha == nullptr) ? 1 : alpha[c]; + float beta_s = (beta == nullptr) ? 0 : beta[c]; + output[index] = alpha_s * input[index] + beta_s; + index++; + } + } + } + return SUCCESS; +} + +EE scale_fp16(F16 *input, + I32 axis, + I32 nDims, + F16 *alpha, + F16 *beta, + I32 in, + I32 ic, + I32 elements_per_channel, + F16 *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = SUCCESS; + // If ic is 1, it means that weights/vectors have only one param, so we need use the calculation logic of nchw. + if (axis == 1 || axis == 0 || ic == 1) { + ret = scale_nchw_fp16(input, alpha, beta, in, ic, elements_per_channel, output); + CHECK_STATUS(ret); + } else if (axis == nDims - 1) { + ret = scale_nhwc_fp16(input, alpha, beta, in, ic, elements_per_channel, output); + CHECK_STATUS(ret); + } else if (axis == nDims) { + ret = scale_nchwc8_fp16(input, alpha, beta, in, ic, elements_per_channel, output); + CHECK_STATUS(ret); + } else { + ret = NOT_SUPPORTED; + CHECK_STATUS(ret); + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/softmax.cpp b/compute/tensor/src/cpu/arm/fp16/softmax.cpp new file mode 100644 index 00000000..4a7396ce --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/softmax.cpp @@ -0,0 +1,139 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
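Both softmax paths below subtract the running maximum before exponentiating, which keeps exp() inside fp16 range; vexpq_f16_f32 evaluates the exponential in fp32 for accuracy. The scalar equivalent (reference sketch, assuming <math.h>):

#include <math.h>

// Numerically stable softmax over one row.
static void softmax_ref(const float *x, float *y, int n)
{
    float m = x[0];
    for (int i = 1; i < n; i++) m = (x[i] > m) ? x[i] : m;
    float sum = 0.0f;
    for (int i = 0; i < n; i++) { y[i] = expf(x[i] - m); sum += y[i]; }
    for (int i = 0; i < n; i++) y[i] /= sum;
}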
+ +#include <math.h> +#include <string.h> +#include "cpu/arm/fp16/tensor_computing_fp16.h" + +void softmax_lastAxis_fp16(const F16 *input, I32 loopOuter, I32 loops, F16 *output) +{ + for (I32 i = 0; i < loopOuter; i++) { + const F16 *inputPtr = input + i * loops; + F16 *outputPtr = output + i * loops; + + float16x8_t max_v, sub_v, sum_v, tmp_v; + F32 max_s, tmp_s; + max_s = array_max_f16(inputPtr, loops); + max_v = vdupq_n_f16(max_s); + sum_v = vdupq_n_f16(0); + + I32 j = 0; + F32 sum_s = 0; + for (j = 0; j < loops - 7; j += 8) { + float16x8_t in = vld1q_f16(inputPtr + j); + sub_v = vsubq_f16(in, max_v); + tmp_v = vexpq_f16_f32(sub_v); + sum_v = vaddq_f16(sum_v, tmp_v); + vst1q_f16(outputPtr + j, tmp_v); + } + sum_s += vaddvq_f16(sum_v); + for (; j < loops; j++) { + tmp_s = exp(inputPtr[j] - max_s); + outputPtr[j] = tmp_s; + sum_s += tmp_s; + } + array_scale_f16(outputPtr, outputPtr, loops, 1.0 / sum_s, 0); + } +} + +void softmax_anyAxis_fp16(const F16 *input, I32 loopOuter, I32 loops, I32 loopInner, F16 *output) +{ + std::vector<F16> buffer(loopInner * 2); + F16 *maxBuffer = &buffer[0]; + F16 *sumBuffer = &buffer[loopInner]; + I32 k = 0; + for (I32 i = 0; i < loopOuter; i++) { + const F16 *inputPtrBase = input + i * loops * loopInner; + F16 *outputPtrBase = output + i * loops * loopInner; + + memcpy(maxBuffer, inputPtrBase, loopInner * sizeof(F16)); + memset(sumBuffer, 0, loopInner * sizeof(F16)); + for (I32 j = 1; j < loops; j++) { + const F16 *inputPtr = inputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { + float16x8_t in_v = vld1q_f16(inputPtr + k); + float16x8_t out_v = vld1q_f16(maxBuffer + k); + float16x8_t max_v = vmaxq_f16(in_v, out_v); + vst1q_f16(maxBuffer + k, max_v); + } + for (; k < loopInner; k++) { + maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]); + } + } + for (I32 j = 0; j < loops; j++) { + const F16 *inputPtr = inputPtrBase + j * loopInner; + F16 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { + float16x8_t in_v = vld1q_f16(inputPtr + k); + float16x8_t max_v = vld1q_f16(maxBuffer + k); + float16x8_t sub_v = vsubq_f16(in_v, max_v); + float16x8_t exp_v = vexpq_f16_f32(sub_v); + float16x8_t sum_v = vld1q_f16(sumBuffer + k); + sum_v = vaddq_f16(sum_v, exp_v); + vst1q_f16(sumBuffer + k, sum_v); + vst1q_f16(outputPtr + k, exp_v); + } + for (; k < loopInner; k++) { + outputPtr[k] = exp(inputPtr[k] - maxBuffer[k]); + sumBuffer[k] += outputPtr[k]; + } + } + for (I32 j = 0; j < loops; j++) { + F16 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { + float16x8_t out_v = vld1q_f16(outputPtr + k); + float16x8_t sum_v = vld1q_f16(sumBuffer + k); + out_v = vdivq_f16(out_v, sum_v); + vst1q_f16(outputPtr + k, out_v); + } + for (; k < loopInner; k++) { + outputPtr[k] /= sumBuffer[k]; + } + } + } +} + +EE softmax_fp16(TensorDesc inputDesc, const F16 *input, int axis, TensorDesc outputDesc, F16 *output) +{ + UNUSED(outputDesc); + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + U32 size = tensorNumElements(inputDesc); + axis = (axis + inputDesc.nDims) % inputDesc.nDims; + axis = inputDesc.nDims - 1 - axis; + I32 loops = inputDesc.dims[axis]; + + I32 loopInner = 1; + for (int i = 0; i < axis; i++) { + loopInner *= inputDesc.dims[i]; + } + U32 loopOuter = size / loops / loopInner; + + if (loopInner == 1) { + if (DF_NCHWC8 == inputDesc.df && 4 == inputDesc.nDims && + (inputDesc.dims[1] != 1 || inputDesc.dims[0] != 1)) { + CHECK_REQUIREMENT(2 != axis); + loopInner *= 8; + loopOuter /= 8;
+ softmax_anyAxis_fp16(input, loopOuter, loops, loopInner, output); + } else { + softmax_lastAxis_fp16(input, loopOuter, loops, output); + } + } else { + CHECK_REQUIREMENT(DF_NCHWC8 != inputDesc.df); + softmax_anyAxis_fp16(input, loopOuter, loops, loopInner, output); + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp16/tensor_computing_fp16.h b/compute/tensor/src/cpu/arm/fp16/tensor_computing_fp16.h new file mode 100644 index 00000000..c6129be3 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/tensor_computing_fp16.h @@ -0,0 +1,178 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_TENSOR_COMPUTING_FP16 +#define _H_TENSOR_COMPUTING_FP16 +#include <vector> + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "cpu/arm/fp16/arm_functions_fp16.h" + +EE convolution_transform_filter_fp16(TensorDesc filterDesc, + const F16 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F16 *filterTransformed); + +EE convolution_fp16(TensorDesc inputDesc, + F16 *input, + TensorDesc filterDesc, + const F16 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const F16 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *output, + ActivationParamSpec activationDesc, + Arch arch); + +EE deconvolution_transform_filter_fp16(TensorDesc filterDesc, + const F16 *filter, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F16 *filterTransformed); + +EE pooling_c8_fp16(const F16 *input, + U32 stride, + int hstart, + int hend, + int wstart, + int wend, + F16 *output, + PoolingParamSpec poolingParamSpec); + +EE pooling_c8_big_fp16(const F16 *input, + U32 stride, + int hstart, + int hend, + int wstart, + int wend, + F16 *output, + int poolSize); + +EE softmax_fp16( + TensorDesc inputDesc, const F16 *input, int axis, TensorDesc outputDesc, F16 *output); + +EE attention_fp16(U32 batch, + U32 numHeads, + I32 fromSequenceLength, + I32 toSequenceLength, + const F16 *input, + F16 *output); + +EE clip_fp16(F16 *input, F16 *output, I32 len, F32 minValue, F32 maxValue); + +EE concat_fp16(std::vector<TensorDesc> inputDesc, + std::vector<F16 *> input, + F16 *inputScale, + TensorDesc outputDesc, + F16 *output, + F16 *outputScale, + U32 concatDim); + +EE depthwise_pointwise_convolution_fp16(TensorDesc inputDesc, + F16 *input, + TensorDesc dwFilterDesc, + const F16 *dwFilter, + TensorDesc pwFilterDesc,
+ const F16 *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const F16 *dwBias, + TensorDesc pwBiasDesc, + const F16 *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch); + +EE eltwise_fp16(std::vector<void *> input, + std::vector<U32> inputSize, + U32 num, + U32 len, + void *output, + EltwiseMode eltwiseMode); + +EE rnncell_fp16(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch); + +EE power_fp16(TensorDesc inputDesc, + F16 *input, + F32 scale, + F32 shift, + F32 power, + TensorDesc outputDesc, + F16 *output); + +EE layer_normalization_fp16( + TensorDesc inputDesc, F16 *input, F16 *alpha, F16 *beta, TensorDesc outputDesc, F16 *output); + +EE scale_fp16(F16 *input, + I32 axis, + I32 nDims, + F16 *alpha, + F16 *beta, + I32 in, + I32 ic, + I32 elements_per_channel, + F16 *output); + +EE check_fp16(TensorDesc inputDescA, + const F16 *inputA, + TensorDesc inputDescB, + const F16 *inputB, + CheckMode checkMode, + TensorDesc outputDesc, + I32 *output); + +EE quantize_tensor_fp16( + TensorDesc dDesc, const void *data, TensorDesc *qDesc, void *qData, F16 *scale); + +EE attention_mask_fp16(TensorDesc inputDesc, + const F16 *input, + AttentionMaskParamSpec p, + TensorDesc outputDesc, + F16 *output); + +EE prelu_fp16(TensorDesc inputDesc, + F16 *input, + F16 *weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + F16 *output); +#endif diff --git a/compute/tensor/src/cpu/arm/fp32/arm_functions_fp32.h b/compute/tensor/src/cpu/arm/fp32/arm_functions_fp32.h new file mode 100644 index 00000000..07e9a976 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/arm_functions_fp32.h @@ -0,0 +1,369 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
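Every helper in the fp32 function header below follows the same shape: a NEON main loop over four-lane chunks, then a scalar tail for the len % 4 leftovers. The skeleton, using the same alpha * x + beta scaling as array_scale_f32 (sketch, assuming <arm_neon.h> on AArch64; not part of the diff):

#include <arm_neon.h>

static void scale_ref(const float *in, float *out, int len, float alpha, float beta)
{
    float32x4_t a = vdupq_n_f32(alpha), b = vdupq_n_f32(beta);
    int i = 0;
    for (; i < len - 3; i += 4) {        // vector main loop, 4 lanes per step
        vst1q_f32(out + i, vfmaq_f32(b, a, vld1q_f32(in + i)));  // b + a*x
    }
    for (; i < len; i++) {               // scalar tail
        out[i] = alpha * in[i] + beta;
    }
}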
+ +#ifndef _H_ARM_FUNCTIONS_FP32 +#define _H_ARM_FUNCTIONS_FP32 + +#include <math.h> +#include "arm_neon_expand.h" +#include "types.h" + +// array sum +inline F32 array_sum_f32(const F32 *data, I32 len) +{ + if (len <= 0) { + return 0; + } + + I32 i = 0; + F32 sum_s = 0; + float32x4_t sum_v = vdupq_n_f32(0); + for (i = 0; i < len - 3; i += 4) { + float32x4_t in = vld1q_f32(data + i); + sum_v = vaddq_f32(sum_v, in); + } + sum_s += vaddvq_f32(sum_v); + for (; i < len; i++) { + sum_s += data[i]; + } + return sum_s; +} + +// array mean +inline F32 array_mean_f32(const F32 *data, I32 len) +{ + if (len <= 0) { + return 0; + } + return array_sum_f32(data, len) / len; +} + +// array var +inline F32 array_var_f32(const F32 *data, I32 len, F32 mean) +{ + if (len <= 0) { + return 0; + } + + I32 i = 0; + F32 sum_s = 0; + float32x4_t mean_v = vdupq_n_f32(mean); + for (i = 0; i < len - 3; i += 4) { + float32x4_t in = vld1q_f32(data + i); + float32x4_t tmp_v = vsubq_f32(in, mean_v); + float32x4_t sum_v = vmulq_f32(tmp_v, tmp_v); + sum_s += vaddvq_f32(sum_v); + } + for (; i < len; i++) { + F32 in = data[i]; + F32 tmp = in - mean; + sum_s += tmp * tmp; + } + return sum_s / len; +} + +// array max +inline F32 array_max_f32(const F32 *data, I32 len) +{ + F32 max_s = data[0]; + I32 i = 0; + if (len >= 4) { + float32x4_t max_v, tmp_v; + max_v = vld1q_f32(data); + for (i = 4; i < len - 3; i += 4) { + tmp_v = vld1q_f32(data + i); + max_v = vmaxq_f32(tmp_v, max_v); + } + max_s = vmaxvq_f32(max_v); + } + + for (; i < len; i++) { + if (data[i] > max_s) { + max_s = data[i]; + } + } + + return max_s; +} + +inline void array_scale_f32(const F32 *input, F32 *output, I32 len, F32 alpha, F32 beta) +{ + float32x4_t alpha_v = vdupq_n_f32(alpha); + float32x4_t beta_v = vdupq_n_f32(beta); + I32 i = 0; + for (i = 0; i < len - 3; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t tmp_v = vfmaq_f32(beta_v, alpha_v, in); + vst1q_f32(output + i, tmp_v); + } + for (; i < len; i++) { + output[i] = alpha * input[i] + beta; + } +} + +inline void array_power_f32(F32 *input, F32 *output, I32 len, F32 power) +{ + I32 i = 0; + if (power == -1) { + float32x4_t one_v = vdupq_n_f32(1); + for (i = 0; i < len - 3; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t tmp_v = vdivq_f32(one_v, in); + vst1q_f32(output + i, tmp_v); + } + } else if (power == 0.5) { +#ifdef __aarch64__ + for (i = 0; i < len - 3; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t tmp_v = vsqrtq_f32(in); + vst1q_f32(output + i, tmp_v); + } +#endif + } else if (power == 1) { + if (input != output) { + memcpy(output, input, len * sizeof(F32)); + } + i = len; + } else if (power == 2) { + for (i = 0; i < len - 3; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t tmp_v = vmulq_f32(in, in); + vst1q_f32(output + i, tmp_v); + } + } + for (; i < len; i++) { + output[i] = powf(input[i], power); + } +} + +inline EE activation_fp32(F32 *input, U32 len, ActivationParamSpec activationDesc, F32 *output) +{ + float32x4_t in, out; + float32x4_t zero = vdupq_n_f32(0.); + float32x4_t one = vdupq_n_f32(1.); + float32x4_t three = vdupq_n_f32(3.); + float32x4_t six = vdupq_n_f32(6.); + U32 len_main = len / 4; + U32 len_tail = len % 4; + + F32 value; + switch (activationDesc.mode) { + case ACTIVATION_NULL: { + break; + } + case ACTIVATION_RELU: { + if (activationDesc.value[0] == 0) { + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f32(input); + out = vmaxq_f32(zero, in); + vst1q_f32(output, out); + input += 4; + output += 4; + } + for (U32 i
= 0; i < len_tail; i++) { + output[i] = (input[i] < 0) ? 0 : input[i]; + } + } else { + float32x4_t scale = vdupq_n_f32(activationDesc.value[0]); + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f32(input); + float32x4_t tmp = vmulq_f32(in, scale); + out = vmaxq_f32(tmp, in); + vst1q_f32(output, out); + input += 4; + output += 4; + } + for (U32 i = 0; i < len_tail; i++) { + float tmp = activationDesc.value[0] * input[i]; + output[i] = (input[i] < tmp) ? tmp : input[i]; + } + } + break; + } + case ACTIVATION_RELU6: { + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f32(input); + out = vmaxq_f32(zero, in); + out = vminq_f32(six, out); + vst1q_f32(output, out); + input += 4; + output += 4; + } + for (U32 i = 0; i < len_tail; i++) { + value = (input[i] < 0) ? 0 : input[i]; + if (value > 6) { + value = 6; + } + output[i] = value; + } + break; + } + case ACTIVATION_H_SIGMOID: { + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f32(input); + out = vaddq_f32(in, three); + out = vmaxq_f32(out, zero); + out = vminq_f32(out, six); + out = vdivq_f32(out, six); + vst1q_f32(output, out); + input += 4; + output += 4; + } + for (U32 i = 0; i < len_tail; i++) { + value = input[i] + 3; + value = (value < 0) ? 0 : value; + value = (value > 6) ? 6 : value; + value = value / 6; + output[i] = value; + } + break; + } + case ACTIVATION_H_SWISH: { + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f32(input); + out = vaddq_f32(in, three); + out = vmaxq_f32(out, zero); + out = vminq_f32(out, six); + out = vdivq_f32(out, six); + out = vmulq_f32(out, in); + vst1q_f32(output, out); + input += 4; + output += 4; + } + for (U32 i = 0; i < len_tail; i++) { + value = input[i] + 3; + value = (value < 0) ? 0 : value; + value = (value > 6) ? 6 : value; + value = input[i] * value; + value = value / 6; + output[i] = value; + } + break; + } + case ACTIVATION_GELU: { + F32 two_div_PI_sqrt = sqrt(2 / 3.14159265358979323846); + float32x4_t vec0 = vdupq_n_f32(two_div_PI_sqrt); + float32x4_t vec1 = vdupq_n_f32(0.044715); + float32x4_t vec2 = vdupq_n_f32(0.5); + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f32(input); + out = vmulq_f32(in, in); + out = vmulq_f32(out, in); + out = vfmaq_f32(in, vec1, out); + out = vmulq_f32(vec0, out); + out = vtanhq_f32(out); + out = vaddq_f32(one, out); + out = vmulq_f32(vec2, out); + out = vmulq_f32(in, out); + vst1q_f32(output, out); + input += 4; + output += 4; + } + for (U32 i = 0; i < len_tail; i++) { + value = input[i]; + value = two_div_PI_sqrt * (value + 0.044715 * powf(value, 3)); + value = 1.0 - 2.0 / (exp(2.0 * value) + 1.0); + value = 0.5 * (1.0 + value); + value = input[i] * value; + output[i] = value; + } + break; + } + case ACTIVATION_TANH: { + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f32(input); + out = vtanhq_f32(in); + vst1q_f32(output, out); + input += 4; + output += 4; + } + for (U32 i = 0; i < len_tail; i++) { + value = 1.0 - 2.0 / (exp(2.0 * input[i]) + 1.0); + output[i] = value; + } + break; + } + case ACTIVATION_SIGMOID: { + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f32(input); + out = vsigmoidq_f32(in); + vst1q_f32(output, out); + input += 4; + output += 4; + } + for (U32 i = 0; i < len_tail; i++) { + value = 1.0 / (1.0 + exp(-1.0 * input[i])); + output[i] = value; + } + break; + } + case ACTIVATION_MISH: { + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f32(input); + out = vmulq_f32( + in, vtanhq_f32(vlogq_f32(vaddq_f32(vexpq_f32_03_percent_error(in), one)))); + vst1q_f32(output, out); + input += 4; + output += 4; + } + for (U32 i = 0; i < 
len_tail; i++) { + value = input[i] * tanh(log(exp(input[i]) + 1.0)); + output[i] = value; + } + break; + } + case ACTIVATION_GREATER: { + for (U32 i = 0; i < len; i++) { + output[i] = input[i] > 1 ? 1 : 0; + } + break; + } + default: + return NOT_SUPPORTED; + } + + return SUCCESS; +} + +inline void array_add_f32(const F32 *inputA, const F32 *inputB, F32 *output, I32 len) +{ + I32 i = 0; + for (i = 0; i < len - 3; i += 4) { + float32x4_t a = vld1q_f32(inputA + i); + float32x4_t b = vld1q_f32(inputB + i); + float32x4_t c = vaddq_f32(a, b); + vst1q_f32(output + i, c); + } + + for (; i < len; i++) { + output[i] = inputA[i] + inputB[i]; + } +} + +inline void array_square_and_add_f32(const F32 *inputA, const F32 *inputB, F32 *output, I32 len) +{ + I32 i = 0; + for (i = 0; i < len - 3; i += 4) { + float32x4_t a = vld1q_f32(inputA + i); + float32x4_t b = vld1q_f32(inputB + i); + b = vmulq_f32(b, b); + float32x4_t c = vaddq_f32(a, b); + vst1q_f32(output + i, c); + } + + for (; i < len; i++) { + output[i] = inputA[i] + inputB[i] * inputB[i]; + } +} + +#endif diff --git a/compute/tensor/src/cpu/arm/fp32/attention.cpp b/compute/tensor/src/cpu/arm/fp32/attention.cpp new file mode 100644 index 00000000..6861cae6 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/attention.cpp @@ -0,0 +1,79 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
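attention_fp32 below turns a 0/1 validity row into an additive mask: valid positions contribute 0 and padded positions a large negative constant (-10000), so they vanish after softmax; rows past the valid length are filled with the constant outright, and identical rows and heads are replicated with memcpy instead of being recomputed. The per-element step (sketch):

// Additive attention mask from a 0/1 validity row (illustrative).
static void attention_row_ref(const float *valid, float *mask, int n)
{
    const float NEG = -10000.0f;
    for (int k = 0; k < n; k++) {
        mask[k] = (1.0f - valid[k]) * NEG;  // 0 where valid, -10000 where padded
    }
}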
+ +#include <string.h> +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +EE attention_fp32(U32 batch, + U32 numHeads, + I32 fromSequenceLength, + I32 toSequenceLength, + const F32 *input, + F32 *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + F32 mask_s = -10000.0; + I32 count = array_sum_f32(input, toSequenceLength); + I32 valid = UNI_MIN(count, fromSequenceLength); + float32x4_t mask_v = vdupq_n_f32(mask_s); + float32x4_t one_v = vdupq_n_f32(1.0); + for (U32 n = 0; n < batch; n++) { + for (U32 i = 0; i < numHeads; i++) { + if (i == 0) { + for (I32 j = 0; j < valid; j++) { + if (j == 0) { + I32 k = 0; + for (; k < toSequenceLength - 3; k += 4) { + float32x4_t in_v = vld1q_f32(input + k); + float32x4_t tmp_v = vsubq_f32(one_v, in_v); + tmp_v = vmulq_f32(tmp_v, mask_v); + vst1q_f32(output + k, tmp_v); + } + for (; k < toSequenceLength; k++) { + F32 value = (1 - input[k]) * mask_s; + output[k] = value; + } + } else { + memcpy( + output + j * toSequenceLength, output, toSequenceLength * sizeof(F32)); + } + } + + for (I32 j = valid; j < fromSequenceLength; j++) { + if (j == valid) { + I32 k = 0; + for (; k < toSequenceLength - 3; k += 4) { + vst1q_f32(output + j * toSequenceLength + k, mask_v); + } + for (; k < toSequenceLength; k++) { + output[j * toSequenceLength + k] = mask_s; + } + } else { + memcpy(output + j * toSequenceLength, output + valid * toSequenceLength, + toSequenceLength * sizeof(F32)); + } + } + } else { + memcpy(output + i * fromSequenceLength * toSequenceLength, output, + fromSequenceLength * toSequenceLength * sizeof(F32)); + } + } + + input += toSequenceLength; + output += numHeads * fromSequenceLength * toSequenceLength; + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp32/attention_mask.cpp b/compute/tensor/src/cpu/arm/fp32/attention_mask.cpp new file mode 100644 index 00000000..3a34c6dc --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/attention_mask.cpp @@ -0,0 +1,82 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
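attention_mask_fp32 below first builds a qlen x klen band mask of 0s (visible) and 1s (masked) from attention_length and same_length, then applies it additively. Per element the application step is (sketch):

// Apply a precomputed 0/1 mask with penalty maskValue (reference form).
static inline float apply_mask_ref(float x, float mask, float maskValue)
{
    return x * (1.0f - mask) - maskValue * mask;  // mask == 1 gives -maskValue
}

The vfmsq_f32 call in the kernel fuses the final multiply-subtract, computing tmp - maskValue * mask in one instruction per four lanes.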
+ +#include <string.h> +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +EE attention_mask_fp32(TensorDesc inputDesc, + const F32 *input, + AttentionMaskParamSpec p, + TensorDesc outputDesc, + F32 *output) +{ + UNUSED(outputDesc); + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + I32 attentionLength = p.attention_length; + bool sameLength = p.same_length; + float maskValue = p.mask; + int qlen = inputDesc.dims[1]; + int klen = inputDesc.dims[0]; + int mlen = klen - qlen; + I32 length = qlen * klen; + std::vector<F32> mask; + if (attentionLength < 0) { + mask = std::vector<F32>(length, 0); + } else { + mask = std::vector<F32>(length, 1); + for (int i = 0; i < qlen; i++) { + int start, loops; + if (attentionLength > 0) { + int end = mlen + i; + start = UNI_MAX(end - attentionLength, 0); + loops = end - start + 1; + } else { + if (sameLength) { + start = i; + loops = qlen + 1; + } else { + start = 0; + loops = i + qlen + 1; + } + } + loops = UNI_MAX(loops, 0); + start = UNI_MIN(start, klen); + if (start + loops > klen) { + loops = UNI_MAX(klen - start, 0); + } + memset(&mask[i * klen + start], 0, sizeof(F32) * loops); + } + } + I32 loops = tensorNumElements(inputDesc) / length; + float32x4_t one_v = vdupq_n_f32(1); + float32x4_t mask_value_v = vdupq_n_f32(maskValue); + for (int i = 0, index = 0; i < loops; i++) { + int j = 0; + for (; j < length - 3; j += 4) { + float32x4_t in = vld1q_f32(input + index); + float32x4_t mask_v = vld1q_f32(&mask[j]); + float32x4_t tmp_v = vsubq_f32(one_v, mask_v); + tmp_v = vmulq_f32(in, tmp_v); + tmp_v = vfmsq_f32(tmp_v, mask_value_v, mask_v); + vst1q_f32(output + index, tmp_v); + index += 4; + } + for (; j < length; j++) { + output[index] = input[index] * (1 - mask[j]) - maskValue * mask[j]; + index++; + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp32/check.cpp b/compute/tensor/src/cpu/arm/fp32/check.cpp new file mode 100644 index 00000000..1e6894c7 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/check.cpp @@ -0,0 +1,99 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
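check_fp32 below counts how many element pairs satisfy a predicate. NEON comparisons return all ones (0xFFFFFFFF, i.e. (U32)-1) per true lane, so subtracting the compare mask from an accumulator increments each lane's count, and vaddvq_u32 then folds the four lane counts into one. A self-contained sketch of the idiom (assuming <arm_neon.h> on AArch64):

#include <arm_neon.h>

// Count a[i] > b[i] with NEON compare masks (sketch).
static int count_greater_ref(const float *a, const float *b, int len)
{
    uint32x4_t cnt = vdupq_n_u32(0);
    int i = 0;
    for (; i < len - 3; i += 4) {
        // a true lane is (U32)-1, so subtracting adds 1 to that lane's count
        cnt = vsubq_u32(cnt, vcgtq_f32(vld1q_f32(a + i), vld1q_f32(b + i)));
    }
    int count = (int)vaddvq_u32(cnt);
    for (; i < len; i++) count += (a[i] > b[i]);
    return count;
}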
+
+#include "cpu/arm/fp32/tensor_computing_fp32.h"
+
+EE check_fp32(TensorDesc inputDescA,
+    const F32 *inputA,
+    TensorDesc inputDescB,
+    const F32 *inputB,
+    CheckMode checkMode,
+    TensorDesc outputDesc,
+    I32 *output)
+{
+    if (nullptr == inputA || nullptr == inputB || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+
+    if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+
+    U32 size = tensorNumElements(inputDescA);
+    U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1];
+    I32 length = size / loopOuter;
+    if (tensorNumElements(outputDesc) != loopOuter) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+    for (U32 j = 0; j < loopOuter; j++) {
+        const F32 *arrayA = inputA + j * length;
+        const F32 *arrayB = inputB + j * length;
+        switch (checkMode) {
+            case CHECK_GREAT: {
+                uint32x4_t count_v = vdupq_n_u32(0);
+                I32 i = 0;
+                for (; i < length - 3; i += 4) {
+                    float32x4_t a = vld1q_f32(arrayA + i);
+                    float32x4_t b = vld1q_f32(arrayB + i);
+                    // the compare mask is all-ones per true lane; subtracting adds 1 per match
+                    count_v = vsubq_u32(count_v, vcgtq_f32(a, b));
+                }
+                I32 count = vaddvq_u32(count_v);
+                for (; i < length; i++) {
+                    if (arrayA[i] > arrayB[i]) {
+                        count++;
+                    }
+                }
+                output[j] = (count == length);
+                break;
+            }
+            case CHECK_GREATEQUAL: {
+                uint32x4_t count_v = vdupq_n_u32(0);
+                I32 i = 0;
+                for (; i < length - 3; i += 4) {
+                    float32x4_t a = vld1q_f32(arrayA + i);
+                    float32x4_t b = vld1q_f32(arrayB + i);
+                    count_v = vsubq_u32(count_v, vcgeq_f32(a, b));
+                }
+                I32 count = vaddvq_u32(count_v);
+                for (; i < length; i++) {
+                    if (arrayA[i] >= arrayB[i]) {
+                        count++;
+                    }
+                }
+                output[j] = (count == length);
+                break;
+            }
+            case CHECK_EQUAL: {
+                uint32x4_t count_v = vdupq_n_u32(0);
+                I32 i = 0;
+                for (; i < length - 3; i += 4) {
+                    float32x4_t a = vld1q_f32(arrayA + i);
+                    float32x4_t b = vld1q_f32(arrayB + i);
+                    count_v = vsubq_u32(count_v, vceqq_f32(a, b));
+                }
+                I32 count = vaddvq_u32(count_v);
+                for (; i < length; i++) {
+                    if (arrayA[i] == arrayB[i]) {
+                        count++;
+                    }
+                }
+                output[j] = (count == length);
+                break;
+            }
+            default:
+                CHECK_STATUS(NOT_SUPPORTED);
+                break;
+        }
+    }
+    return SUCCESS;
+}
diff --git a/compute/tensor/src/cpu/arm/fp32/clip.cpp b/compute/tensor/src/cpu/arm/fp32/clip.cpp
new file mode 100644
index 00000000..a0b591be
--- /dev/null
+++ b/compute/tensor/src/cpu/arm/fp32/clip.cpp
@@ -0,0 +1,38 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
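+
+// Reviewer note (added commentary, not from the original authors): the kernel
+// below is a plain clamp. A scalar sketch of the vector loop is
+//     output[i] = UNI_MIN(maxValue, UNI_MAX(minValue, input[i]));
+// with the NEON path handling four floats per iteration and the trailing
+// scalar loop covering the remaining len % 4 elements.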
+ +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +EE clip_fp32(F32 *input, F32 *output, I32 len, F32 minValue, F32 maxValue) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + float32x4_t min_v = vdupq_n_f32(minValue); + float32x4_t max_v = vdupq_n_f32(maxValue); + + I32 i = 0; + for (i = 0; i < len - 3; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t tmp_v = vminq_f32(max_v, vmaxq_f32(min_v, in)); + vst1q_f32(output + i, tmp_v); + } + for (; i < len; i++) { + F32 value = input[i]; + value = (value > minValue) ? value : minValue; + value = (value < maxValue) ? value : maxValue; + output[i] = value; + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp32/convolution.cpp b/compute/tensor/src/cpu/arm/fp32/convolution.cpp new file mode 100644 index 00000000..42f878c1 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/convolution.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
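+
+// Reviewer note (added commentary, not from the original authors): this file only
+// dispatches; the actual kernels live in convolution_gemm_V8.cpp (AArch64),
+// convolution_gemm_V7.cpp (ARMv7) and the icnchw/winograd variants. A caller-side
+// sketch (descriptor and arch values here are hypothetical) would be:
+//     ActivationParamSpec act;
+//     act.mode = ACTIVATION_RELU;
+//     CHECK_STATUS(convolution_fp32(inDesc, in, filterDesc, filter, convSpec,
+//         CONVOLUTION_ALGORITHM_GEMM, biasDesc, bias, tmpBytes, tmp, outDesc, out,
+//         act, ARM_A76));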
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +EE convolution_fp32(TensorDesc inputDesc, + F32 *input, + TensorDesc filterDesc, + const F32 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const F32 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + UNUSED(arch); + if (nullptr == input || nullptr == filter || nullptr == output || nullptr == bias || + nullptr == tmp) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(odf == DF_NCHWC8)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(ic == fc && oc == fn)) { + CHECK_STATUS(NOT_MATCH); + } + + // In some cases when we adjust the model input, the input tensor of conv can change from NCHW to NCHWc8 + // In this case we can simply change the algo, because they both require the same filter transform + if (CONVOLUTION_ALGORITHM_GEMM_ICNCHW == algorithm && DF_NCHWC8 == idf) { + algorithm = CONVOLUTION_ALGORITHM_GEMM; + } + + EE ret = SUCCESS; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_GEMM: +#ifdef __aarch64__ + ret = convolution_gemm_V8(inputDesc, input, filterDesc, filter, convParamSpec, biasDesc, + bias, tmpBytes, tmp, outputDesc, output, activationDesc); +#else + ret = convolution_gemm_V7(inputDesc, input, filterDesc, filter, convParamSpec, biasDesc, + bias, tmpBytes, tmp, outputDesc, output, activationDesc); +#endif + break; + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: +#ifdef __aarch64__ + ret = convolution_gemm_icnchw_V8(inputDesc, input, filterDesc, filter, convParamSpec, + biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); +#else + ret = convolution_gemm_icnchw_V7(inputDesc, input, filterDesc, filter, convParamSpec, + biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); +#endif + break; + case CONVOLUTION_ALGORITHM_WINOGRAD: + ret = convolution_winograd_V8(inputDesc, input, filterDesc, filter, convParamSpec, + biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp32/convolution_gemm_V7.cpp b/compute/tensor/src/cpu/arm/fp32/convolution_gemm_V7.cpp new file mode 100644 index 00000000..392d5180 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/convolution_gemm_V7.cpp @@ -0,0 +1,677 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef __aarch64__
+#include "cpu/arm/fp32/tensor_computing_fp32.h"
+#include <string.h>
+#ifdef _USE_OPENMP
+#include <omp.h>
+#endif
+
+EE convolution_gemm_V7(TensorDesc inputDesc,
+    F32 *inArray,
+    TensorDesc filterDesc,
+    const F32 *filterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const F32 *biasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F32 *outArray,
+    ActivationParamSpec activationDesc)
+{
+    UNUSED(biasDesc);
+    UNUSED(tmpBytes);
+
+    DataType idt, fdt, odt;
+    DataFormat idf, fdf, odf;
+    U32 in, ic, ih, iw;
+    U32 fn, fc, fh, fw;
+    U32 on, oc, oh, ow;
+    CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+    CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw));
+    CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
+    U32 paddingT = convParamSpec.padding_top;
+    U32 paddingB = convParamSpec.padding_bottom;
+    U32 paddingL = convParamSpec.padding_left;
+    U32 paddingR = convParamSpec.padding_right;
+
+    if (fdf != DF_NHWCN8) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+
+    oc /= 8;
+    ic /= 8;
+    U32 ih_pad = ih + paddingT + paddingB;
+    U32 iw_pad = iw + paddingL + paddingR;
+    I32 ohow = oh * ow;
+    U32 ihiw = ih_pad * iw_pad;
+    F32 *inArray_pad;
+    EE ret = SUCCESS;
+    for (U32 n = 0; n < in; n++) {
+        if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) {
+            inArray_pad = inArray + n * ic * ih * iw * 8;
+        } else {
+            // copy the input into a zero-padded buffer
+            inArray_pad = (F32 *)tmp;
+            F32 *inArray_pad_mov = inArray_pad;
+            F32 *inArray_mov = inArray + n * ic * ih * iw * 8;
+            for (U32 c = 0; c < ic; c++) {
+                for (U32 h = 0; h < paddingT; h++) {
+                    memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt));
+                    inArray_pad_mov += iw_pad * 8;
+                }
+                for (U32 h = paddingT; h < ih_pad - paddingB; h++) {
+                    memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt));
+                    inArray_pad_mov += paddingL * 8;
+                    memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt));
+                    inArray_pad_mov += iw * 8;
+                    inArray_mov += iw * 8;
+                    memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt));
+                    inArray_pad_mov += paddingR * 8;
+                }
+                for (U32 h = ih_pad - paddingB; h < ih_pad; h++) {
+                    memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt));
+                    inArray_pad_mov += iw_pad * 8;
+                }
+            }
+        }
+
+        // ohow / 6
+#ifdef _USE_OPENMP
+#pragma omp parallel for num_threads(OMP_NUM_THREADS)
+#endif
+        for (I32 hw = 0; hw < ohow - 5; hw += 6) {
+            const F32 *b0 = biasArray;
+            const F32 *b1 = biasArray + 4;
+#ifdef _USE_OPENMP
+            // For the NDK on ARMv7, an OpenMP loop cannot reference more than 14 outside
+            // variables, so these values are re-derived inside the parallel body.
+            U32 paddingT = convParamSpec.padding_top;
+            U32 paddingB = convParamSpec.padding_bottom;
+            U32 paddingL = convParamSpec.padding_left;
+            U32 paddingR = convParamSpec.padding_right;
+            U32 fh = filterDesc.dims[1];
+            U32 fw = filterDesc.dims[0];
+            U32 thread_private_buffer_offset = 6 * fh * fw * ic * 8 * omp_get_thread_num();
+#else
+            U32 thread_private_buffer_offset = 0;
+#endif
+            F32 *in_pack = ((F32 *)tmp) + ic * ihiw * 8 + thread_private_buffer_offset;
+            // pack input
+            // NCHWc8 => NHWChw6 + 
im2col + U32 in_h[6] = {0}; + U32 in_w[6] = {0}; + for (U32 i = 0; i < 6; i++) { + in_h[i] = ((hw + i) / ow) * convParamSpec.stride_h; + in_w[i] = ((hw + i) % ow) * convParamSpec.stride_w; + } + + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw6c8 = inArray_pad + c * ihiw * 8 + + fh_idx * convParamSpec.dilatedRate_h * iw_pad * 8 + + fw_idx * convParamSpec.dilatedRate_w * 8; + F32 *in_0 = in_hw6c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + F32 *in_1 = in_hw6c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + F32 *in_2 = in_hw6c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + F32 *in_3 = in_hw6c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + F32 *in_4 = in_hw6c8 + in_h[4] * iw_pad * 8 + in_w[4] * 8; + F32 *in_5 = in_hw6c8 + in_h[5] * iw_pad * 8 + in_w[5] * 8; + + // NHWChw6 + F32 *in_pack_c8hw6 = + in_pack + fh_idx * fw * ic * 6 * 8 + fw_idx * ic * 6 * 8 + c * 6 * 8; + + __asm__ __volatile__("vld1.f32 {d0-d3}, [%[in_0]]\n" + "vld1.f32 {d4-d7}, [%[in_1]]\n" + "vld1.f32 {d8-d11}, [%[in_2]]\n" + "vld1.f32 {d12-d15}, [%[in_3]]\n" + "vld1.f32 {d16-d19}, [%[in_4]]\n" + "vld1.f32 {d20-d23}, [%[in_5]]\n" + + "vzip.32 q0, q2\n" + "vzip.32 q4, q6\n" + "vzip.32 q8, q10\n" + + "vst1.f32 {d0}, [%[pack]]!\n" + "vst1.f32 {d8}, [%[pack]]!\n" + "vst1.f32 {d16}, [%[pack]]!\n" + "vst1.f32 {d1}, [%[pack]]!\n" + "vst1.f32 {d9}, [%[pack]]!\n" + "vst1.f32 {d17}, [%[pack]]!\n" + "vst1.f32 {d4}, [%[pack]]!\n" + "vst1.f32 {d12}, [%[pack]]!\n" + "vst1.f32 {d20}, [%[pack]]!\n" + "vst1.f32 {d5}, [%[pack]]!\n" + "vst1.f32 {d13}, [%[pack]]!\n" + "vst1.f32 {d21}, [%[pack]]!\n" + + "vzip.32 q1, q3\n" + "vzip.32 q5, q7\n" + "vzip.32 q9, q11\n" + + "vst1.f32 {d2}, [%[pack]]!\n" + "vst1.f32 {d10}, [%[pack]]!\n" + "vst1.f32 {d18}, [%[pack]]!\n" + "vst1.f32 {d3}, [%[pack]]!\n" + "vst1.f32 {d11}, [%[pack]]!\n" + "vst1.f32 {d19}, [%[pack]]!\n" + "vst1.f32 {d6}, [%[pack]]!\n" + "vst1.f32 {d14}, [%[pack]]!\n" + "vst1.f32 {d22}, [%[pack]]!\n" + "vst1.f32 {d7}, [%[pack]]!\n" + "vst1.f32 {d15}, [%[pack]]!\n" + "vst1.f32 {d23}, [%[pack]]!\n" + : [pack] "+r"(in_pack_c8hw6), [in_0] "+r"(in_0), + [in_1] "+r"(in_1), [in_2] "+r"(in_2), + [in_3] "+r"(in_3), [in_4] "+r"(in_4), [in_5] "+r"(in_5) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11"); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0 + o * 8; + const F32 *b_o1 = b1 + o * 8; + __asm__ __volatile__( + "vld1.f32 {d8-d9}, [%[b_0]]\n" + "vld1.f32 {d10-d11}, [%[b_1]]\n" + "vld1.f32 {d0-d3}, [%[in_0]]!\n" + "vld1.f32 {d4-d7}, [%[f_0]]!\n" + + "vmov.f32 q6, q4\n" + "vmov.f32 q8, q4\n" + "vmov.f32 q10, q4\n" + "vmov.f32 q12, q4\n" + "vmov.f32 q14, q4\n" + + "mov r2, %[ic]\n" + + "vmov.f32 q7, q5\n" + "vmov.f32 q9, q5\n" + "vmov.f32 q11, q5\n" + "vmov.f32 q13, q5\n" + "vmov.f32 q15, q5\n" + + "0:\n" + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q6, q2, d0[1]\n" + "vmla.f32 q8, q2, d1[0]\n" + "vmla.f32 q10, q2, d1[1]\n" + "vmla.f32 q12, q2, d2[0]\n" + "vmla.f32 q14, q2, d2[1]\n" + + "vld1.f32 {d4-d5}, [%[f_0]]!\n" + + "vmla.f32 q5, q3, d0[0]\n" + "vmla.f32 q7, q3, d0[1]\n" + "vmla.f32 q9, q3, d1[0]\n" + "vmla.f32 q11, q3, d1[1]\n" + "vld1.f32 {d0-d1}, [%[in_0]]!\n" + "vmla.f32 q13, q3, d2[0]\n" + "vmla.f32 q15, q3, d2[1]\n" + + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + "subs r2, r2, #4\n" 
+ + "vmla.f32 q4, q2, d3[0]\n" + "vmla.f32 q6, q2, d3[1]\n" + "vmla.f32 q8, q2, d0[0]\n" + "vmla.f32 q10, q2, d0[1]\n" + "vmla.f32 q12, q2, d1[0]\n" + "vmla.f32 q14, q2, d1[1]\n" + + "vld1.f32 {d4-d5}, [%[f_0]]!\n" + + "vmla.f32 q5, q3, d3[0]\n" + "vmla.f32 q7, q3, d3[1]\n" + "vld1.f32 {d2-d3}, [%[in_0]]!\n" + "vmla.f32 q9, q3, d0[0]\n" + "vmla.f32 q11, q3, d0[1]\n" + "vmla.f32 q13, q3, d1[0]\n" + "vmla.f32 q15, q3, d1[1]\n" + + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + "vld1.f32 {d0-d1}, [%[in_0]]!\n" + + "vmla.f32 q4, q2, d2[0]\n" + "vmla.f32 q6, q2, d2[1]\n" + "vmla.f32 q8, q2, d3[0]\n" + "vmla.f32 q10, q2, d3[1]\n" + "vmla.f32 q12, q2, d0[0]\n" + "vmla.f32 q14, q2, d0[1]\n" + + "vld1.f32 {d4-d5}, [%[f_0]]!\n" + + "vmla.f32 q5, q3, d2[0]\n" + "vmla.f32 q7, q3, d2[1]\n" + "vmla.f32 q9, q3, d3[0]\n" + "vmla.f32 q11, q3, d3[1]\n" + "vld1.f32 {d2-d3}, [%[in_0]]!\n" + "vmla.f32 q13, q3, d0[0]\n" + "vmla.f32 q15, q3, d0[1]\n" + + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + + "vmla.f32 q4, q2, d1[0]\n" + "vmla.f32 q6, q2, d1[1]\n" + "vmla.f32 q8, q2, d2[0]\n" + "vmla.f32 q10, q2, d2[1]\n" + "vmla.f32 q12, q2, d3[0]\n" + "vmla.f32 q14, q2, d3[1]\n" + + "vld1.f32 {d4-d5}, [%[f_0]]!\n" + + "vmla.f32 q5, q3, d1[0]\n" + "vmla.f32 q7, q3, d1[1]\n" + "vld1.f32 {d0-d1}, [%[in_0]]!\n" + "vmla.f32 q9, q3, d2[0]\n" + "vmla.f32 q11, q3, d2[1]\n" + "vmla.f32 q13, q3, d3[0]\n" + "vmla.f32 q15, q3, d3[1]\n" + + "vld1.f32 {d2-d3}, [%[in_0]]!\n" + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + "bne 0b\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15", "r2"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("veor q1, q1, q1\n" // zero + "vmax.f32 q4, q4, q1\n" + "vmax.f32 q5, q5, q1\n" + "vmax.f32 q6, q6, q1\n" + "vmax.f32 q7, q7, q1\n" + "vmax.f32 q8, q8, q1\n" + "vmax.f32 q9, q9, q1\n" + "vmax.f32 q10, q10, q1\n" + "vmax.f32 q11, q11, q1\n" + "vmax.f32 q12, q12, q1\n" + "vmax.f32 q13, q13, q1\n" + "vmax.f32 q14, q14, q1\n" + "vmax.f32 q15, q15, q1\n" + : + : + : "memory", "cc", "q1", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("veor q1, q1, q1\n" // zero + "vmov.f32 q2, #6.0\n" // six + "vmax.f32 q4, q4, q1\n" + "vmax.f32 q5, q5, q1\n" + "vmax.f32 q6, q6, q1\n" + "vmax.f32 q7, q7, q1\n" + "vmax.f32 q8, q8, q1\n" + "vmax.f32 q9, q9, q1\n" + "vmax.f32 q10, q10, q1\n" + "vmax.f32 q11, q11, q1\n" + "vmax.f32 q12, q12, q1\n" + "vmax.f32 q13, q13, q1\n" + "vmax.f32 q14, q14, q1\n" + "vmax.f32 q15, q15, q1\n" + "vmin.f32 q4, q4, q2\n" + "vmin.f32 q5, q5, q2\n" + "vmin.f32 q6, q6, q2\n" + "vmin.f32 q7, q7, q2\n" + "vmin.f32 q8, q8, q2\n" + "vmin.f32 q9, q9, q2\n" + "vmin.f32 q10, q10, q2\n" + "vmin.f32 q11, q11, q2\n" + "vmin.f32 q12, q12, q2\n" + "vmin.f32 q13, q13, q2\n" + "vmin.f32 q14, q14, q2\n" + "vmin.f32 q15, q15, q2\n" + : + : + : "memory", "cc", "q1", "q2", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("vst1.f32 {q4}, [%[out_0]]!\n" + "vst1.f32 {q5}, [%[out_0]]!\n" + "vst1.f32 {q6}, [%[out_0]]!\n" + "vst1.f32 {q7}, [%[out_0]]!\n" + "vst1.f32 {q8}, [%[out_0]]!\n" + "vst1.f32 {q9}, [%[out_0]]!\n" + "vst1.f32 {q10}, [%[out_0]]!\n" + "vst1.f32 
{q11}, [%[out_0]]!\n" + "vst1.f32 {q12}, [%[out_0]]!\n" + "vst1.f32 {q13}, [%[out_0]]!\n" + "vst1.f32 {q14}, [%[out_0]]!\n" + "vst1.f32 {q15}, [%[out_0]]!\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "q4", "q5", "q6", "q7", "q8", "q9", "q10", + "q11", "q12", "q13", "q14", "q15"); + } + } + + U32 ohow_s = (ohow / 6) * 6; + U32 ohow_tail = ohow - ohow_s; + + if (ohow_tail >= 4) { + I32 hw = ohow_s; + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw4 + im2col + U32 in_h[4] = {0}; + U32 in_w[4] = {0}; + + for (U32 i = 0; i < 4; i++) { + in_h[i] = ((hw + i) / ow) * convParamSpec.stride_h; + in_w[i] = ((hw + i) % ow) * convParamSpec.stride_w; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw4c8 = inArray_pad + c * ihiw * 8 + + fh_idx * convParamSpec.dilatedRate_h * iw_pad * 8 + + fw_idx * convParamSpec.dilatedRate_w * 8; + F32 *in_0 = in_hw4c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + F32 *in_1 = in_hw4c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + F32 *in_2 = in_hw4c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + F32 *in_3 = in_hw4c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + F32 *in_pack_c8hw4 = + in_pack + fh_idx * fw * ic * 8 * 4 + fw_idx * ic * 8 * 4 + c * 8 * 4; + + __asm__ __volatile__( + "vld1.f32 {d0-d3}, [%[in_0]]\n" + "vld1.f32 {d4-d7}, [%[in_1]]\n" + "vld1.f32 {d8-d11}, [%[in_2]]\n" + "vld1.f32 {d12-d15}, [%[in_3]]\n" + + "vzip.32 q0, q4\n" + "vzip.32 q2, q6\n" + + "vzip.32 q0, q2\n" + "vzip.32 q4, q6\n" + + "vst1.f32 {q0}, [%[pack]]!\n" + "vst1.f32 {q2}, [%[pack]]!\n" + "vst1.f32 {q4}, [%[pack]]!\n" + "vst1.f32 {q6}, [%[pack]]!\n" + + "vzip.32 q1, q5\n" + "vzip.32 q3, q7\n" + + "vzip.32 q1, q3\n" + "vzip.32 q5, q7\n" + + "vst1.f32 {q1}, [%[pack]]!\n" + "vst1.f32 {q3}, [%[pack]]!\n" + "vst1.f32 {q5}, [%[pack]]!\n" + "vst1.f32 {q7}, [%[pack]]!\n" + : [pack] "+r"(in_pack_c8hw4), [in_0] "+r"(in_0), [in_1] "+r"(in_1), + [in_2] "+r"(in_2), [in_3] "+r"(in_3) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "vld1.f32 {d8-d9}, [%[b_0]]\n" + "vld1.f32 {d10-d11}, [%[b_1]]\n" + "vld1.f32 {d0-d1}, [%[in_0]]!\n" + "vld1.f32 {d4-d7}, [%[f_0]]!\n" + + "vmov.f32 q6, q4\n" + "vmov.f32 q8, q4\n" + "vmov.f32 q10, q4\n" + + "mov r2, %[ic]\n" + + "vmov.f32 q7, q5\n" + "vmov.f32 q9, q5\n" + "vmov.f32 q11, q5\n" + + "0:\n" + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q6, q2, d0[1]\n" + "vmla.f32 q8, q2, d1[0]\n" + "vmla.f32 q10, q2, d1[1]\n" + + "vld1.f32 {d2-d3}, [%[in_0]]!\n" + "vld1.f32 {d4-d5}, [%[f_0]]!\n" + + "vmla.f32 q5, q3, d0[0]\n" + "vmla.f32 q7, q3, d0[1]\n" + "vmla.f32 q9, q3, d1[0]\n" + "vmla.f32 q11, q3, d1[1]\n" + + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + "subs r2, r2, #2\n" + + "vmla.f32 q4, q2, d2[0]\n" + "vmla.f32 q6, q2, d2[1]\n" + "vmla.f32 q8, q2, d3[0]\n" + "vmla.f32 q10, q2, d3[1]\n" + + "vld1.f32 {d0-d1}, [%[in_0]]!\n" + "vld1.f32 {d4-d5}, [%[f_0]]!\n" + + "vmla.f32 q5, q3, d2[0]\n" + "vmla.f32 q7, q3, d2[1]\n" + "vmla.f32 q9, q3, d3[0]\n" + "vmla.f32 q11, q3, d3[1]\n" + + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] 
"r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "r2"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("veor q1, q1, q1\n" // zero + "vmax.f32 q4, q4, q1\n" + "vmax.f32 q5, q5, q1\n" + "vmax.f32 q6, q6, q1\n" + "vmax.f32 q7, q7, q1\n" + "vmax.f32 q8, q8, q1\n" + "vmax.f32 q9, q9, q1\n" + "vmax.f32 q10, q10, q1\n" + "vmax.f32 q11, q11, q1\n" + : + : + : "memory", "cc", "q1", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("veor q1, q1, q1\n" // zero + "vmov.f32 q2, #6.0\n" // six + "vmax.f32 q4, q4, q1\n" + "vmax.f32 q5, q5, q1\n" + "vmax.f32 q6, q6, q1\n" + "vmax.f32 q7, q7, q1\n" + "vmax.f32 q8, q8, q1\n" + "vmax.f32 q9, q9, q1\n" + "vmax.f32 q10, q10, q1\n" + "vmax.f32 q11, q11, q1\n" + "vmin.f32 q4, q4, q2\n" + "vmin.f32 q5, q5, q2\n" + "vmin.f32 q6, q6, q2\n" + "vmin.f32 q7, q7, q2\n" + "vmin.f32 q8, q8, q2\n" + "vmin.f32 q9, q9, q2\n" + "vmin.f32 q10, q10, q2\n" + "vmin.f32 q11, q11, q2\n" + : + : + : "memory", "cc", "q1", "q2", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "vst1.f32 {q4}, [%[out_0]]!\n" + "vst1.f32 {q5}, [%[out_0]]!\n" + "vst1.f32 {q6}, [%[out_0]]!\n" + "vst1.f32 {q7}, [%[out_0]]!\n" + "vst1.f32 {q8}, [%[out_0]]!\n" + "vst1.f32 {q9}, [%[out_0]]!\n" + "vst1.f32 {q10}, [%[out_0]]!\n" + "vst1.f32 {q11}, [%[out_0]]!\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"); + b0 += 8; + b1 += 8; + } + ohow_s += 4; + ohow_tail -= 4; + } + + // I32 ohow_s = (ohow / 4) * 4; + + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHW => NCHWc8hw1 + im2col + U32 in_h_0 = (hw / ow) * convParamSpec.stride_h; + U32 in_w_0 = (hw % ow) * convParamSpec.stride_w; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw1c8 = inArray_pad + c * ihiw * 8 + + fh_idx * convParamSpec.dilatedRate_h * iw_pad * 8 + + fw_idx * convParamSpec.dilatedRate_w * 8; + F32 *in_0 = in_hw1c8 + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F32 *in_pack_c8hw1 = + in_pack + fh_idx * fw * ic * 8 + fw_idx * ic * 8 + c * 8; + + memcpy(in_pack_c8hw1, in_0, 8 * bytesOf(idt)); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "vld1.f32 {d8-d9}, [%[b_0]]\n" + "vld1.f32 {d10-d11}, [%[b_1]]\n" + "vld1.f32 {d0}, [%[in_0]]!\n" + "vld1.f32 {d4-d7}, [%[f_0]]!\n" + "mov r2, %[ic]\n" + "0:\n" + "vmla.f32 q4, q2, d0[0]\n" + + "vld1.f32 {d4-d5}, [%[f_0]]!\n" + + "vmla.f32 q5, q3, d0[0]\n" + + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + "subs r2, r2, #2\n" + + "vmla.f32 q4, q2, d0[1]\n" + + "vld1.f32 {d4-d5}, [%[f_0]]!\n" + + "vmla.f32 q5, q3, d0[1]\n" + + "vld1.f32 {d0}, [%[in_0]]!\n" + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "q0", "q2", "q3", "q4", "q5", 
"r2"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("veor q1, q1, q1\n" // zero + "vmax.f32 q4, q4, q1\n" + "vmax.f32 q5, q5, q1\n" + : + : + : "memory", "cc", "q1", "q4", "v5"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("veor q1, q1, q1\n" // zero + "vmov.f32 q2, #6.0\n" // six + "vmax.f32 q4, q4, q1\n" + "vmax.f32 q5, q5, q1\n" + "vmin.f32 q4, q4, q2\n" + "vmin.f32 q5, q5, q2\n" + : + : + : "memory", "cc", "q1", "q2", "q4", "v5"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("vst1.f32 {q4}, [%[out_0]]!\n" + "vst1.f32 {q5}, [%[out_0]]!\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "q4", "q5"); + b0 += 8; + b1 += 8; + } + } + } + return ret; +} +#endif diff --git a/compute/tensor/src/cpu/arm/fp32/convolution_gemm_V8.cpp b/compute/tensor/src/cpu/arm/fp32/convolution_gemm_V8.cpp new file mode 100644 index 00000000..9b38c51d --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/convolution_gemm_V8.cpp @@ -0,0 +1,1010 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+
+#ifdef __aarch64__
+#include "cpu/arm/fp32/tensor_computing_fp32.h"
+#include <string.h>
+#ifdef _USE_OPENMP
+#include <omp.h>
+#endif
+
+EE convolution_gemm_V8(TensorDesc inputDesc,
+    F32 *inArray,
+    TensorDesc filterDesc,
+    const F32 *filterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const F32 *biasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F32 *outArray,
+    ActivationParamSpec activationDesc)
+{
+    UNUSED(biasDesc);
+    UNUSED(tmpBytes);
+
+    DataType idt, fdt, odt;
+    DataFormat idf, fdf, odf;
+    U32 in, ic, ih, iw;
+    U32 fn, fc, fh, fw;
+    U32 on, oc, oh, ow;
+    CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+    CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw));
+    CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
+    U32 strideH = convParamSpec.stride_h;
+    U32 strideW = convParamSpec.stride_w;
+    U32 paddingT = convParamSpec.padding_top;
+    U32 paddingB = convParamSpec.padding_bottom;
+    U32 paddingL = convParamSpec.padding_left;
+    U32 paddingR = convParamSpec.padding_right;
+    U32 dilateH = convParamSpec.dilatedRate_h;
+    U32 dilateW = convParamSpec.dilatedRate_w;
+
+    if (fdf != DF_NHWCN8) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+
+    oc /= 8;
+    ic /= 8;
+    U32 ih_pad = ih + paddingT + paddingB;
+    U32 iw_pad = iw + paddingL + paddingR;
+    I32 ohow = oh * ow;
+    U32 ihiw = ih_pad * iw_pad;
+    F32 *inArray_pad;
+    EE ret = SUCCESS;
+    for (U32 n = 0; n < in; n++) {
+        if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) {
+            inArray_pad = inArray + n * ic * ih * iw * 8;
+        } else {
+            // copy the input into a zero-padded buffer
+            inArray_pad = (F32 *)tmp;
+            F32 *inArray_pad_mov = inArray_pad;
+            F32 *inArray_mov = inArray + n * ic * ih * iw * 8;
+            for (U32 c = 0; c < ic; c++) {
+                for (U32 h = 0; h < paddingT; h++) {
+                    memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt));
+                    inArray_pad_mov += iw_pad * 8;
+                }
+                for (U32 h = paddingT; h < ih_pad - paddingB; h++) {
+                    memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt));
+                    inArray_pad_mov += paddingL * 8;
+                    memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt));
+                    inArray_pad_mov += iw * 8;
+                    inArray_mov += iw * 8;
+                    memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt));
+                    inArray_pad_mov += paddingR * 8;
+                }
+                for (U32 h = ih_pad - paddingB; h < ih_pad; h++) {
+                    memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt));
+                    inArray_pad_mov += iw_pad * 8;
+                }
+            }
+        }
+        // ohow / 12
+#ifdef _USE_OPENMP
+#pragma omp parallel for num_threads(OMP_NUM_THREADS)
+#endif
+        for (I32 hw = 0; hw < ohow - 11; hw += 12) {
+            const F32 *b0 = biasArray;
+            const F32 *b1 = biasArray + 4;
+#ifdef _USE_OPENMP
+            U32 thread_private_buffer_offset = 12 * fh * fw * ic * 8 * omp_get_thread_num();
+#else
+            U32 thread_private_buffer_offset = 0;
+#endif
+            F32 *in_pack = ((F32 *)tmp) + ic * ihiw * 8 + thread_private_buffer_offset;
+            // pack input
+            // NCHWc8 => NHWChw12 + im2col
+            U32 in_h[12] = {0};
+            U32 in_w[12] = {0};
+            for (U32 i = 0; i < 12; i++) {
+                in_h[i] = ((hw + i) / ow) * strideH;
+                in_w[i] = ((hw + i) % ow) * strideW;
+            }
+
+            for (U32 c = 0; c < ic; c++) {
+                for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) {
+                    for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) {
+                        F32 *in_hw12c8 = inArray_pad + c * ihiw * 8 +
+                            fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8;
+                        F32 *in_0 = in_hw12c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8;
+                        F32 *in_1 = in_hw12c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8;
+                        F32 *in_2 = in_hw12c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8;
+                        F32 *in_3 = in_hw12c8 + in_h[3] * iw_pad * 8 + 
in_w[3] * 8; + F32 *in_4 = in_hw12c8 + in_h[4] * iw_pad * 8 + in_w[4] * 8; + F32 *in_5 = in_hw12c8 + in_h[5] * iw_pad * 8 + in_w[5] * 8; + F32 *in_6 = in_hw12c8 + in_h[6] * iw_pad * 8 + in_w[6] * 8; + F32 *in_7 = in_hw12c8 + in_h[7] * iw_pad * 8 + in_w[7] * 8; + F32 *in_8 = in_hw12c8 + in_h[8] * iw_pad * 8 + in_w[8] * 8; + F32 *in_9 = in_hw12c8 + in_h[9] * iw_pad * 8 + in_w[9] * 8; + F32 *in_10 = in_hw12c8 + in_h[10] * iw_pad * 8 + in_w[10] * 8; + F32 *in_11 = in_hw12c8 + in_h[11] * iw_pad * 8 + in_w[11] * 8; + + // NHWChw12 + F32 *in_pack_c8hw12 = + in_pack + fh_idx * fw * ic * 12 * 8 + fw_idx * ic * 12 * 8 + c * 12 * 8; + + __asm__ __volatile__( + "ldp q0, q1, [%[in_0]]\n" + "ldp q2, q3, [%[in_1]]\n" + "ldp q4, q5, [%[in_2]]\n" + "ldp q6, q7, [%[in_3]]\n" + + "ldp q8, q9, [%[in_4]]\n" + "ldp q10, q11, [%[in_5]]\n" + "ldp q12, q13, [%[in_6]]\n" + "ldp q14, q15, [%[in_7]]\n" + + "ldp q16, q17, [%[in_8]]\n" + "ldp q18, q19, [%[in_9]]\n" + "ldp q20, q21, [%[in_10]]\n" + "ldp q22, q23, [%[in_11]]\n" + + "zip1 v24.4s, v0.4s, v2.4s\n" + "zip2 v25.4s, v0.4s, v2.4s\n" + "zip1 v26.4s, v4.4s, v6.4s\n" + "zip2 v27.4s, v4.4s, v6.4s\n" + + "zip1 v0.2d, v24.2d, v26.2d\n" + "zip2 v2.2d, v24.2d, v26.2d\n" + "zip1 v4.2d, v25.2d, v27.2d\n" + "zip2 v6.2d, v25.2d, v27.2d\n" + + "zip1 v24.4s, v8.4s, v10.4s\n" + "zip2 v25.4s, v8.4s, v10.4s\n" + "zip1 v26.4s, v12.4s, v14.4s\n" + "zip2 v27.4s, v12.4s, v14.4s\n" + + "zip1 v8.2d, v24.2d, v26.2d\n" + "zip2 v10.2d, v24.2d, v26.2d\n" + "zip1 v12.2d, v25.2d, v27.2d\n" + "zip2 v14.2d, v25.2d, v27.2d\n" + + "zip1 v24.4s, v16.4s, v18.4s\n" + "zip2 v25.4s, v16.4s, v18.4s\n" + "zip1 v26.4s, v20.4s, v22.4s\n" + "zip2 v27.4s, v20.4s, v22.4s\n" + + "zip1 v16.2d, v24.2d, v26.2d\n" + "zip2 v18.2d, v24.2d, v26.2d\n" + "zip1 v20.2d, v25.2d, v27.2d\n" + "zip2 v22.2d, v25.2d, v27.2d\n" + + "stp q0, q8, [%[pack]]\n" + "str q16, [%[pack], #32]\n" + "stp q2, q10, [%[pack], 48]\n" + "str q18, [%[pack], #80]\n" + "stp q4, q12, [%[pack], #96]\n" + "str q20, [%[pack], #128]\n" + "stp q6, q14, [%[pack], #144]\n" + "str q22, [%[pack], #176]\n" + + "zip1 v24.4s, v1.4s, v3.4s\n" + "zip2 v25.4s, v1.4s, v3.4s\n" + "zip1 v26.4s, v5.4s, v7.4s\n" + "zip2 v27.4s, v5.4s, v7.4s\n" + + "zip1 v1.2d, v24.2d, v26.2d\n" + "zip2 v3.2d, v24.2d, v26.2d\n" + "zip1 v5.2d, v25.2d, v27.2d\n" + "zip2 v7.2d, v25.2d, v27.2d\n" + + "zip1 v24.4s, v9.4s, v11.4s\n" + "zip2 v25.4s, v9.4s, v11.4s\n" + "zip1 v26.4s, v13.4s, v15.4s\n" + "zip2 v27.4s, v13.4s, v15.4s\n" + + "zip1 v9.2d, v24.2d, v26.2d\n" + "zip2 v11.2d, v24.2d, v26.2d\n" + "zip1 v13.2d, v25.2d, v27.2d\n" + "zip2 v15.2d, v25.2d, v27.2d\n" + + "zip1 v24.4s, v17.4s, v19.4s\n" + "zip2 v25.4s, v17.4s, v19.4s\n" + "zip1 v26.4s, v21.4s, v23.4s\n" + "zip2 v27.4s, v21.4s, v23.4s\n" + + "zip1 v17.2d, v24.2d, v26.2d\n" + "zip2 v19.2d, v24.2d, v26.2d\n" + "zip1 v21.2d, v25.2d, v27.2d\n" + "zip2 v23.2d, v25.2d, v27.2d\n" + + "stp q1, q9, [%[pack], #192]\n" + "str q17, [%[pack], #224]\n" + "stp q3, q11, [%[pack], 240]\n" + "str q19, [%[pack], #272]\n" + "stp q5, q13, [%[pack], 288]\n" + "str q21, [%[pack], #320]\n" + "stp q7, q15, [%[pack], 336]\n" + "str q23, [%[pack], #368]\n" + : + : [pack] "r"(in_pack_c8hw12), [in_0] "r"(in_0), [in_1] "r"(in_1), + [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), [in_5] "r"(in_5), + [in_6] "r"(in_6), [in_7] "r"(in_7), [in_8] "r"(in_8), [in_9] "r"(in_9), + [in_10] "r"(in_10), [in_11] "r"(in_11) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", 
"v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0 + o * 8; + const F32 *b_o1 = b1 + o * 8; + __asm__ __volatile__( + "ldr q27, [%[b_0]]\n" + "ldr q28, [%[b_1]]\n" + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "mov v5.16b, v27.16b\n" + "ldr q1, [%[in_0]]\n" // in_hw0 + "mov v7.16b, v27.16b\n" + "mov v9.16b, v27.16b\n" + "mov v11.16b, v27.16b\n" + "ldr q0, [%[f_0]]\n" // f_o0c0 + "mov v13.16b, v27.16b\n" + "mov v15.16b, v27.16b\n" + "mov v17.16b, v27.16b\n" + "ldr q3, [%[in_0], #16]\n" + "mov v19.16b, v27.16b\n" + "mov v21.16b, v27.16b\n" + "mov v23.16b, v27.16b\n" + "mov v25.16b, v27.16b\n" + + "mov v6.16b, v28.16b\n" + "mov v8.16b, v28.16b\n" + "mov v10.16b, v28.16b\n" + "mov v12.16b, v28.16b\n" + "mov v14.16b, v28.16b\n" + "mov v16.16b, v28.16b\n" + "mov v18.16b, v28.16b\n" + "mov v20.16b, v28.16b\n" + "mov v22.16b, v28.16b\n" + "mov v24.16b, v28.16b\n" + "mov v26.16b, v28.16b\n" + "0:\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v7.4s, v0.4s, v1.s[1]\n" + "ldr q2, [x3, 32]\n" + "ldr q29, [x0, 16]\n" + "fmla v9.4s, v0.4s, v1.s[2]\n" + "fmla v11.4s, v0.4s, v1.s[3]\n" + + "fmla v13.4s, v0.4s, v3.s[0]\n" + "fmla v15.4s, v0.4s, v3.s[1]\n" + "fmla v17.4s, v0.4s, v3.s[2]\n" + "fmla v19.4s, v0.4s, v3.s[3]\n" + + "fmla v21.4s, v0.4s, v2.s[0]\n" + "fmla v23.4s, v0.4s, v2.s[1]\n" + "fmla v25.4s, v0.4s, v2.s[2]\n" + "fmla v27.4s, v0.4s, v2.s[3]\n" + + "fmla v6.4s, v29.4s, v1.s[0]\n" + "fmla v8.4s, v29.4s, v1.s[1]\n" + "fmla v10.4s, v29.4s, v1.s[2]\n" + "fmla v12.4s, v29.4s, v1.s[3]\n" + + "fmla v14.4s, v29.4s, v3.s[0]\n" + "fmla v16.4s, v29.4s, v3.s[1]\n" + "ldr q1, [x3, 48]!\n" + "ldr q0, [x0, 32]!\n" + "fmla v18.4s, v29.4s, v3.s[2]\n" + "fmla v20.4s, v29.4s, v3.s[3]\n" + + "fmla v22.4s, v29.4s, v2.s[0]\n" + "fmla v24.4s, v29.4s, v2.s[1]\n" + "ldr q3, [x3, 16]\n" + "subs x2, x2, #1\n" + "fmla v26.4s, v29.4s, v2.s[2]\n" + "fmla v28.4s, v29.4s, v2.s[3]\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", "x1", "x2", + "x3"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + 
: + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + "fmin v8.4s, v8.4s, v30.4s\n" + "fmin v9.4s, v9.4s, v30.4s\n" + "fmin v10.4s, v10.4s, v30.4s\n" + "fmin v11.4s, v11.4s, v30.4s\n" + "fmin v12.4s, v12.4s, v30.4s\n" + "fmin v13.4s, v13.4s, v30.4s\n" + "fmin v14.4s, v14.4s, v30.4s\n" + "fmin v15.4s, v15.4s, v30.4s\n" + "fmin v16.4s, v16.4s, v30.4s\n" + "fmin v17.4s, v17.4s, v30.4s\n" + "fmin v18.4s, v18.4s, v30.4s\n" + "fmin v19.4s, v19.4s, v30.4s\n" + "fmin v20.4s, v20.4s, v30.4s\n" + "fmin v21.4s, v21.4s, v30.4s\n" + "fmin v22.4s, v22.4s, v30.4s\n" + "fmin v23.4s, v23.4s, v30.4s\n" + "fmin v24.4s, v24.4s, v30.4s\n" + "fmin v25.4s, v25.4s, v30.4s\n" + "fmin v26.4s, v26.4s, v30.4s\n" + "fmin v27.4s, v27.4s, v30.4s\n" + "fmin v28.4s, v28.4s, v30.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v30"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + "str q13, [%[out_0], #128]\n" + "str q14, [%[out_0], #144]\n" + "str q15, [%[out_0], #160]\n" + "str q16, [%[out_0], #176]\n" + "str q17, [%[out_0], #192]\n" + "str q18, [%[out_0], #208]\n" + "str q19, [%[out_0], #224]\n" + "str q20, [%[out_0], #240]\n" + "str q21, [%[out_0], #256]\n" + "str q22, [%[out_0], #272]\n" + "str q23, [%[out_0], #288]\n" + "str q24, [%[out_0], #304]\n" + "str q25, [%[out_0], #320]\n" + "str q26, [%[out_0], #336]\n" + "str q27, [%[out_0], #352]\n" + "str q28, [%[out_0], #368]\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28"); + } + } + + U32 ohow_s = (ohow / 12) * 12; + U32 ohow_tail = ohow - ohow_s; + + if (ohow_tail >= 8) { + I32 hw = ohow_s; + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw8 + im2col + U32 in_h[8] = {0}; + U32 in_w[8] = {0}; + + for (U32 i = 0; i < 8; 
i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw8c8 = inArray_pad + c * ihiw * 8 + fh_idx * dilateH * iw_pad * 8 + + fw_idx * dilateW * 8; + F32 *in_0 = in_hw8c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + F32 *in_1 = in_hw8c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + F32 *in_2 = in_hw8c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + F32 *in_3 = in_hw8c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + F32 *in_4 = in_hw8c8 + in_h[4] * iw_pad * 8 + in_w[4] * 8; + F32 *in_5 = in_hw8c8 + in_h[5] * iw_pad * 8 + in_w[5] * 8; + F32 *in_6 = in_hw8c8 + in_h[6] * iw_pad * 8 + in_w[6] * 8; + F32 *in_7 = in_hw8c8 + in_h[7] * iw_pad * 8 + in_w[7] * 8; + F32 *in_pack_c8hw8 = + in_pack + fh_idx * fw * ic * 8 * 8 + fw_idx * ic * 8 * 8 + c * 8 * 8; + + __asm__ __volatile__("ldp q0, q1, [%[in_0]]\n" + "ldp q2, q3, [%[in_1]]\n" + "ldp q4, q5, [%[in_2]]\n" + "ldp q6, q7, [%[in_3]]\n" + + "ldp q8, q9, [%[in_4]]\n" + "ldp q10, q11, [%[in_5]]\n" + "ldp q12, q13, [%[in_6]]\n" + "ldp q14, q15, [%[in_7]]\n" + + "zip1 v24.4s, v0.4s, v2.4s\n" + "zip2 v25.4s, v0.4s, v2.4s\n" + "zip1 v26.4s, v4.4s, v6.4s\n" + "zip2 v27.4s, v4.4s, v6.4s\n" + + "zip1 v0.2d, v24.2d, v26.2d\n" + "zip2 v2.2d, v24.2d, v26.2d\n" + "zip1 v4.2d, v25.2d, v27.2d\n" + "zip2 v6.2d, v25.2d, v27.2d\n" + + "zip1 v24.4s, v8.4s, v10.4s\n" + "zip2 v25.4s, v8.4s, v10.4s\n" + "zip1 v26.4s, v12.4s, v14.4s\n" + "zip2 v27.4s, v12.4s, v14.4s\n" + + "zip1 v8.2d, v24.2d, v26.2d\n" + "zip2 v10.2d, v24.2d, v26.2d\n" + "zip1 v12.2d, v25.2d, v27.2d\n" + "zip2 v14.2d, v25.2d, v27.2d\n" + + "stp q0, q8, [%[pack]]\n" + "stp q2, q10, [%[pack], #32]\n" + "stp q4, q12, [%[pack], #64]\n" + "stp q6, q14, [%[pack], #96]\n" + + "zip1 v24.4s, v1.4s, v3.4s\n" + "zip2 v25.4s, v1.4s, v3.4s\n" + "zip1 v26.4s, v5.4s, v7.4s\n" + "zip2 v27.4s, v5.4s, v7.4s\n" + + "zip1 v1.2d, v24.2d, v26.2d\n" + "zip2 v3.2d, v24.2d, v26.2d\n" + "zip1 v5.2d, v25.2d, v27.2d\n" + "zip2 v7.2d, v25.2d, v27.2d\n" + + "zip1 v24.4s, v9.4s, v11.4s\n" + "zip2 v25.4s, v9.4s, v11.4s\n" + "zip1 v26.4s, v13.4s, v15.4s\n" + "zip2 v27.4s, v13.4s, v15.4s\n" + + "zip1 v9.2d, v24.2d, v26.2d\n" + "zip2 v11.2d, v24.2d, v26.2d\n" + "zip1 v13.2d, v25.2d, v27.2d\n" + "zip2 v15.2d, v25.2d, v27.2d\n" + + "stp q1, q9, [%[pack], #128]\n" + "stp q3, q11, [%[pack], #160]\n" + "stp q5, q13, [%[pack], #192]\n" + "stp q7, q15, [%[pack], #224]\n" + : + : [pack] "r"(in_pack_c8hw8), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), + [in_4] "r"(in_4), [in_5] "r"(in_5), [in_6] "r"(in_6), + [in_7] "r"(in_7) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v24", "v25", "v26", "v27"); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "ldr q27, [%[b_0]]\n" + "ldr q28, [%[b_1]]\n" + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "mov v5.16b, v27.16b\n" + "ldr q1, [%[in_0]]\n" // in_hw0 + "mov v7.16b, v27.16b\n" + "mov v9.16b, v27.16b\n" + "mov v11.16b, v27.16b\n" + "ldr q0, [%[f_0]]\n" // f_o0c0 + "mov v13.16b, v27.16b\n" + "mov v15.16b, v27.16b\n" + "mov 
v17.16b, v27.16b\n" + "mov v19.16b, v27.16b\n" + + "mov v6.16b, v28.16b\n" + "mov v8.16b, v28.16b\n" + "mov v10.16b, v28.16b\n" + "mov v12.16b, v28.16b\n" + "mov v14.16b, v28.16b\n" + "mov v16.16b, v28.16b\n" + "mov v18.16b, v28.16b\n" + "mov v20.16b, v28.16b\n" + "0:\n" + "ldr q3, [x3, 16]!\n" + "ldr q29, [x0, 16]\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v7.4s, v0.4s, v1.s[1]\n" + "fmla v9.4s, v0.4s, v1.s[2]\n" + "fmla v11.4s, v0.4s, v1.s[3]\n" + + "fmla v13.4s, v0.4s, v3.s[0]\n" + "fmla v15.4s, v0.4s, v3.s[1]\n" + "fmla v17.4s, v0.4s, v3.s[2]\n" + "fmla v19.4s, v0.4s, v3.s[3]\n" + + "fmla v6.4s, v29.4s, v1.s[0]\n" + "fmla v8.4s, v29.4s, v1.s[1]\n" + "fmla v10.4s, v29.4s, v1.s[2]\n" + "fmla v12.4s, v29.4s, v1.s[3]\n" + + "fmla v14.4s, v29.4s, v3.s[0]\n" + "fmla v16.4s, v29.4s, v3.s[1]\n" + "ldr q1, [x3, 16]!\n" + "ldr q0, [x0, 32]!\n" + "subs x2, x2, #1\n" + "fmla v18.4s, v29.4s, v3.s[2]\n" + "fmla v20.4s, v29.4s, v3.s[3]\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v27", "v28", + "v29", "x0", "x1", "x2", "x3"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + "fmin v8.4s, v8.4s, v30.4s\n" + "fmin v9.4s, v9.4s, v30.4s\n" + "fmin v10.4s, v10.4s, v30.4s\n" + "fmin v11.4s, v11.4s, v30.4s\n" + "fmin v12.4s, v12.4s, v30.4s\n" + "fmin v13.4s, v13.4s, v30.4s\n" + "fmin v14.4s, v14.4s, v30.4s\n" + "fmin v15.4s, v15.4s, v30.4s\n" + "fmin v16.4s, v16.4s, v30.4s\n" + "fmin v17.4s, v17.4s, v30.4s\n" + "fmin v18.4s, v18.4s, v30.4s\n" + "fmin v19.4s, v19.4s, v30.4s\n" + "fmin v20.4s, v20.4s, v30.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v30"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q5, [%[out_0]]\n" + "str q6, 
[%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + "str q13, [%[out_0], #128]\n" + "str q14, [%[out_0], #144]\n" + "str q15, [%[out_0], #160]\n" + "str q16, [%[out_0], #176]\n" + "str q17, [%[out_0], #192]\n" + "str q18, [%[out_0], #208]\n" + "str q19, [%[out_0], #224]\n" + "str q20, [%[out_0], #240]\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20"); + b0 += 8; + b1 += 8; + } + ohow_s += 8; + ohow_tail -= 8; + } + + if (ohow_tail >= 4) { + I32 hw = ohow_s; + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw4 + im2col + U32 in_h[4] = {0}; + U32 in_w[4] = {0}; + + for (U32 i = 0; i < 4; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw4c8 = inArray_pad + c * ihiw * 8 + fh_idx * dilateH * iw_pad * 8 + + fw_idx * dilateW * 8; + F32 *in_0 = in_hw4c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + F32 *in_1 = in_hw4c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + F32 *in_2 = in_hw4c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + F32 *in_3 = in_hw4c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + F32 *in_pack_c8hw4 = + in_pack + fh_idx * fw * ic * 8 * 4 + fw_idx * ic * 8 * 4 + c * 8 * 4; + + __asm__ __volatile__( + "ldp q0, q4, [%[in_0]]\n" + "ldp q1, q5, [%[in_1]]\n" + "ldp q2, q6, [%[in_2]]\n" + "ldp q3, q7, [%[in_3]]\n" + + "st4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[pack]], #64\n" + "st4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[pack]]\n" + : [pack] "+r"(in_pack_c8hw4) + : [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "ldr q27, [%[b_0]]\n" + "ldr q28, [%[b_1]]\n" + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "mov v5.16b, v27.16b\n" + "ldr q1, [%[in_0]]\n" // in_hw0 + "mov v7.16b, v27.16b\n" + "mov v9.16b, v27.16b\n" + "mov v11.16b, v27.16b\n" + "ldr q0, [%[f_0]]\n" // f_o0c0 + + "mov v6.16b, v28.16b\n" + "mov v8.16b, v28.16b\n" + "mov v10.16b, v28.16b\n" + "mov v12.16b, v28.16b\n" + "0:\n" + "ldr q3, [x3, 16]!\n" + "ldr q29, [x0, 16]\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v7.4s, v0.4s, v1.s[1]\n" + "fmla v9.4s, v0.4s, v1.s[2]\n" + "fmla v11.4s, v0.4s, v1.s[3]\n" + + "fmla v6.4s, v29.4s, v1.s[0]\n" + "fmla v8.4s, v29.4s, v1.s[1]\n" + "ldr q0, [x0, 32]!\n" + "subs x2, x2, #1\n" + "fmla v10.4s, v29.4s, v1.s[2]\n" + "fmla v12.4s, v29.4s, v1.s[3]\n" + + "mov v1.16b, v3.16b\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v27", "v28", "v29", "x0", "x1", "x2", "x3"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case 
ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + "fmin v8.4s, v8.4s, v30.4s\n" + "fmin v9.4s, v9.4s, v30.4s\n" + "fmin v10.4s, v10.4s, v30.4s\n" + "fmin v11.4s, v11.4s, v30.4s\n" + "fmin v12.4s, v12.4s, v30.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v30"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12"); + b0 += 8; + b1 += 8; + } + ohow_s += 4; + ohow_tail -= 4; + } + + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHW => NCHWc8hw1 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw1c8 = inArray_pad + c * ihiw * 8 + fh_idx * dilateH * iw_pad * 8 + + fw_idx * dilateW * 8; + F32 *in_0 = in_hw1c8 + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F32 *in_pack_c8hw1 = + in_pack + fh_idx * fw * ic * 8 + fw_idx * ic * 8 + c * 8; + + memcpy(in_pack_c8hw1, in_0, 8 * bytesOf(idt)); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "ldr q5, [%[b_0]]\n" + "ldr q6, [%[b_1]]\n" + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "ldr s1, [%[in_0]]\n" // in_hw0 + "ldp q0, q29, [%[f_0]]\n" // f_o0c0 + + "0:\n" + "ldp q30, q28, [x0, #32]\n" + "ldr s3, [x3, #4]\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v6.4s, v29.4s, v1.s[0]\n" + + "ldr q0, [x0, #64]!\n" + "subs x2, x2, #2\n" + "ldr q29, [x0, #16]\n" + "ldr s1, [x3, #8]!\n" + "fmla v5.4s, v30.4s, v3.s[0]\n" + "fmla v6.4s, v28.4s, v3.s[0]\n" + + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v28", "v29", "v30", "x0", "x1", + "x2", "x3"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ 
__volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v30"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v5", "v6"); + b0 += 8; + b1 += 8; + } + } + } + return ret; +} +#endif diff --git a/compute/tensor/src/cpu/arm/fp32/convolution_gemm_icnchw_V7.cpp b/compute/tensor/src/cpu/arm/fp32/convolution_gemm_icnchw_V7.cpp new file mode 100644 index 00000000..9219c0b5 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/convolution_gemm_icnchw_V7.cpp @@ -0,0 +1,396 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef __aarch64__ +#include <string.h> +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (fdf != DF_NHWCN8) { + CHECK_STATUS(NOT_MATCH); + } + + I64 activation = 0; + switch (activationDesc.mode) { + case ACTIVATION_NULL: + activation = 0; + break; + case ACTIVATION_RELU: + activation = 1; + break; + default: + return NOT_SUPPORTED; + } + oc /= 8; + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + I32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + F32 *inArray_pad; + EE ret = SUCCESS; + for (U32 n = 0; n < in; n++) { + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw; + } else { + // copy input into a padded input buffer + inArray_pad = (F32 *)tmp; + F32 *inArray_pad_mov = inArray_pad; + F32 *inArray_mov = inArray + n * ic * ih * iw; + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < paddingT; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(idt)); + inArray_pad_mov += iw_pad; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * bytesOf(idt)); + inArray_pad_mov += paddingL; + memcpy(inArray_pad_mov, inArray_mov, iw * bytesOf(idt)); + inArray_pad_mov += iw; + inArray_mov += iw; + memset(inArray_pad_mov, 0, paddingR * bytesOf(idt)); + inArray_pad_mov += paddingR; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(idt)); + inArray_pad_mov += iw_pad; + } + } + } + // ohow / 6 + for (I32 hw = 0; hw < ohow - 5; hw += 6) { + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad; + // pack input + // NCHW => NHWChw6 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + U32 in_h_4 = ((hw + 4) / ow) * strideH; + U32 in_w_4 = ((hw + 4) % ow) * strideW; + U32 in_h_5 = ((hw + 5) / ow) * strideH; + U32 in_w_5 = ((hw + 5) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F32 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F32 *in_1 = in_hw + in_h_1 * iw_pad + 
in_w_1; + F32 *in_2 = in_hw + in_h_2 * iw_pad + in_w_2; + F32 *in_3 = in_hw + in_h_3 * iw_pad + in_w_3; + F32 *in_4 = in_hw + in_h_4 * iw_pad + in_w_4; + F32 *in_5 = in_hw + in_h_5 * iw_pad + in_w_5; + F32 *in_pack_hw6 = in_pack + (fh_idx * fw * ic + fw_idx * ic + c) * 6; + *in_pack_hw6 = *in_0; + *(in_pack_hw6 + 1) = *in_1; + *(in_pack_hw6 + 2) = *in_2; + *(in_pack_hw6 + 3) = *in_3; + *(in_pack_hw6 + 4) = *in_4; + *(in_pack_hw6 + 5) = *in_5; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "vld1.f32 {d10-d11}, [%[b_0]]\n" + "vld1.f32 {d12-d13}, [%[b_1]]\n" + "mov r2, %[ic]\n" + + "vld1.f32 {d2-d3}, [%[in_0]]!\n" // in_hw0 + "vmov.f32 q7, q5\n" + "vmov.f32 q9, q5\n" + "vmov.f32 q11, q5\n" + "vld1.f32 {d0-d1}, [%[f_0]]!\n" // f_o0c0 + "vmov.f32 q13, q5\n" + "vmov.f32 q15, q5\n" + + "vmov.f32 q8, q6\n" + "vmov.f32 q10, q6\n" + "vmov.f32 q12, q6\n" + "vmov.f32 q14, q6\n" + "vmov.f32 q3, q6\n" + "0:\n" + "vld1.f32 {d4}, [%[in_0]]!\n" + "vld1.f32 {d8-d9}, [%[f_0]]!\n" + "vmla.f32 q5, q0, d2[0]\n" + "vmla.f32 q7, q0, d2[1]\n" + "vmla.f32 q9, q0, d3[0]\n" + "vmla.f32 q11, q0, d3[1]\n" + "vmla.f32 q13, q0, d4[0]\n" + "vmla.f32 q15, q0, d4[1]\n" + "vld1.f32 {d0-d1}, [%[f_0]]!\n" + + "vmla.f32 q6, q4, d2[0]\n" + "vmla.f32 q8, q4, d2[1]\n" + "vmla.f32 q10, q4, d3[0]\n" + "vmla.f32 q12, q4, d3[1]\n" + "vld1.f32 {d2-d3}, [%[in_0]]!\n" + "vmla.f32 q14, q4, d4[0]\n" + "vmla.f32 q3, q4, d4[1]\n" + "subs r2, r2, #1\n" + "bne 0b\n" + + "cmp %[activation], #0\n" + "beq 1f\n" + "veor q1, q1, q1\n" // zero + "vmax.f32 q5, q5, q1\n" + "vmax.f32 q6, q6, q1\n" + "vmax.f32 q7, q7, q1\n" + "vmax.f32 q8, q8, q1\n" + "vmax.f32 q9, q9, q1\n" + "vmax.f32 q10, q10, q1\n" + "vmax.f32 q11, q11, q1\n" + "vmax.f32 q12, q12, q1\n" + "vmax.f32 q13, q13, q1\n" + "vmax.f32 q14, q14, q1\n" + "vmax.f32 q15, q15, q1\n" + "vmax.f32 q3, q3, q1\n" + "1:\n" + "vst1.f32 {d10-d11}, [%[out_0]]!\n" + "vst1.f32 {d12-d13}, [%[out_0]]!\n" + "vst1.f32 {d14-d15}, [%[out_0]]!\n" + "vst1.f32 {d16-d17}, [%[out_0]]!\n" + "vst1.f32 {d18-d19}, [%[out_0]]!\n" + "vst1.f32 {d20-d21}, [%[out_0]]!\n" + "vst1.f32 {d22-d23}, [%[out_0]]!\n" + "vst1.f32 {d24-d25}, [%[out_0]]!\n" + "vst1.f32 {d26-d27}, [%[out_0]]!\n" + "vst1.f32 {d28-d29}, [%[out_0]]!\n" + "vst1.f32 {d30-d31}, [%[out_0]]!\n" + "vst1.f32 {d6-d7}, [%[out_0]]\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [activation] "r"(activation) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15", "r2"); + b0 += 8; + b1 += 8; + } + } + + U32 ohow_s = (ohow / 6) * 6; + U32 ohow_tail = ohow - ohow_s; + + if (ohow_tail >= 4) { + I32 hw = ohow_s; + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad; + // pack input + // NCHW => NHWChw4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; 
fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F32 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F32 *in_1 = in_hw + in_h_1 * iw_pad + in_w_1; + F32 *in_2 = in_hw + in_h_2 * iw_pad + in_w_2; + F32 *in_3 = in_hw + in_h_3 * iw_pad + in_w_3; + F32 *in_pack_hw4 = in_pack + fh_idx * fw * ic * 4 + fw_idx * ic * 4 + c * 4; + *in_pack_hw4 = *in_0; + *(in_pack_hw4 + 1) = *in_1; + *(in_pack_hw4 + 2) = *in_2; + *(in_pack_hw4 + 3) = *in_3; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "vld1.f32 {d10-d11}, [%[b_0]]\n" + "vld1.f32 {d12-d13}, [%[b_1]]\n" + "mov r2, %[ic]\n" + + "vld1.f32 {d2-d3}, [%[in_0]]!\n" // in_hw0 + "vmov.f32 q7, q5\n" + "vmov.f32 q9, q5\n" + "vmov.f32 q11, q5\n" + "vld1.f32 {d0-d1}, [%[f_0]]!\n" // f_o0c0 + + "vmov.f32 q8, q6\n" + "vmov.f32 q10, q6\n" + "vmov.f32 q12, q6\n" + "0:\n" + "vld1.f32 {d6-d7}, [%[in_0]]!\n" + "vld1.f32 {d8-d9}, [%[f_0]]!\n" + "vmla.f32 q5, q0, d2[0]\n" + "vmla.f32 q7, q0, d2[1]\n" + "vmla.f32 q9, q0, d3[0]\n" + "vmla.f32 q11, q0, d3[1]\n" + "vld1.f32 {d0-d1}, [%[f_0]]!\n" + + "vmla.f32 q6, q4, d2[0]\n" + "vmla.f32 q8, q4, d2[1]\n" + "subs r2, r2, #1\n" + "vmla.f32 q10, q4, d3[0]\n" + "vmla.f32 q12, q4, d3[1]\n" + "vmov.f32 q1, q3\n" + "bne 0b\n" + + "cmp %[activation], #0\n" + "beq 1f\n" + "veor q1, q1, q1\n" // zero + "vmax.f32 q5, q5, q1\n" + "vmax.f32 q6, q6, q1\n" + "vmax.f32 q7, q7, q1\n" + "vmax.f32 q8, q8, q1\n" + "vmax.f32 q9, q9, q1\n" + "vmax.f32 q10, q10, q1\n" + "vmax.f32 q11, q11, q1\n" + "vmax.f32 q12, q12, q1\n" + "1:\n" + "vst1.f32 {d10-d11}, [%[out_0]]!\n" + "vst1.f32 {d12-d13}, [%[out_0]]!\n" + "vst1.f32 {d14-d15}, [%[out_0]]!\n" + "vst1.f32 {d16-d17}, [%[out_0]]!\n" + "vst1.f32 {d18-d19}, [%[out_0]]!\n" + "vst1.f32 {d20-d21}, [%[out_0]]!\n" + "vst1.f32 {d22-d23}, [%[out_0]]!\n" + "vst1.f32 {d24-d25}, [%[out_0]]\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [activation] "r"(activation) + : "memory", "cc", "q0", "q1", "q3", "q5", "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q4", "r2"); + b0 += 8; + b1 += 8; + } + ohow_s += 4; + ohow_tail -= 4; + } + + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad; + // pack input + // NCHW => NCHWc8hw1 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F32 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F32 *in_pack_hw1 = in_pack + fh_idx * fw * ic + fw_idx * ic + c; + *in_pack_hw1 = *in_0; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "vld1.f32 {d10-d11}, [%[b_0]]\n" + "vld1.f32 {d12-d13}, [%[b_1]]\n" + "mov r2, %[ic]\n" + + "0:\n" + "vld1.f32 {d0-d1}, 
[%[f_0]]!\n" + "vld1.f32 {d8-d9}, [%[f_0]]!\n" + "vld1.f32 {d2[0]}, [%[in_0]]!\n" + "subs r2, r2, #1\n" + "vmla.f32 q5, q0, d2[0]\n" + "vmla.f32 q6, q4, d2[0]\n" + "bne 0b\n" + + "cmp %[activation], #0\n" + "beq 1f\n" + "veor q1, q1, q1\n" // zero + "vmax.f32 q5, q5, q1\n" + "vmax.f32 q6, q6, q1\n" + "1:\n" + "vst1.f32 {d10-d11}, [%[out_0]]!\n" + "vst1.f32 {d12-d13}, [%[out_0]]\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [activation] "r"(activation) + : "memory", "cc", "q0", "q1", "q5", "q6", "q4", "r2"); + b0 += 8; + b1 += 8; + } + } + } + return ret; +} +#endif diff --git a/compute/tensor/src/cpu/arm/fp32/convolution_gemm_icnchw_V8.cpp b/compute/tensor/src/cpu/arm/fp32/convolution_gemm_icnchw_V8.cpp new file mode 100644 index 00000000..66c07de4 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/convolution_gemm_icnchw_V8.cpp @@ -0,0 +1,845 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifdef __aarch64__ +#include <string.h> +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +EE convolution_gemm_icnchw_V8(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (fdf != DF_NHWCN8) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + I32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + F32 *inArray_pad; + EE ret = SUCCESS; + for (U32 n = 0; n < in; n++) { + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw; + } else { + // copy input into a padded input buffer + inArray_pad = (F32 *)tmp; + F32 *inArray_pad_mov = inArray_pad; + F32 *inArray_mov = inArray + n * ic * ih * iw; + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < paddingT; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(idt)); + inArray_pad_mov += iw_pad; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * bytesOf(idt)); + inArray_pad_mov += paddingL; + memcpy(inArray_pad_mov, inArray_mov, iw * bytesOf(idt)); + inArray_pad_mov += iw; + inArray_mov += iw; + memset(inArray_pad_mov, 0, paddingR * bytesOf(idt)); + inArray_pad_mov += paddingR; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(idt)); + inArray_pad_mov += iw_pad; + } + } + } + // ohow / 12 + for (I32 hw = 0; hw < ohow - 11; hw += 12) { + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad; + // pack input + // NCHW => NHWChw12 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + U32 in_h_4 = ((hw + 4) / ow) * strideH; + U32 in_w_4 = ((hw + 4) % ow) * strideW; + U32 in_h_5 = ((hw + 5) / ow) * strideH; + U32 in_w_5 = ((hw + 5) % ow) * strideW; + U32 in_h_6 = ((hw + 6) / ow) * strideH; + U32 in_w_6 = ((hw + 6) % ow) * strideW; + U32 in_h_7 = ((hw + 7) / ow) * strideH; + U32 in_w_7 = ((hw + 7) % ow) * strideW; + U32 in_h_8 = ((hw + 8) / ow) * strideH; + U32 in_w_8 = ((hw + 8) % ow) * strideW; + U32 in_h_9 = ((hw + 9) / ow) * strideH; + U32 in_w_9 = ((hw + 9) % ow) * strideW; + U32 in_h_10 = ((hw + 10) / ow) * strideH; + U32 in_w_10 = ((hw + 10) % ow) * strideW; + U32 in_h_11 = ((hw + 11) / ow) * strideH; + U32 in_w_11 = ((hw + 11) % 
ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F32 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F32 *in_1 = in_hw + in_h_1 * iw_pad + in_w_1; + F32 *in_2 = in_hw + in_h_2 * iw_pad + in_w_2; + F32 *in_3 = in_hw + in_h_3 * iw_pad + in_w_3; + F32 *in_4 = in_hw + in_h_4 * iw_pad + in_w_4; + F32 *in_5 = in_hw + in_h_5 * iw_pad + in_w_5; + F32 *in_6 = in_hw + in_h_6 * iw_pad + in_w_6; + F32 *in_7 = in_hw + in_h_7 * iw_pad + in_w_7; + F32 *in_8 = in_hw + in_h_8 * iw_pad + in_w_8; + F32 *in_9 = in_hw + in_h_9 * iw_pad + in_w_9; + F32 *in_10 = in_hw + in_h_10 * iw_pad + in_w_10; + F32 *in_11 = in_hw + in_h_11 * iw_pad + in_w_11; + F32 *in_pack_hw12 = + in_pack + fh_idx * fw * ic * 12 + fw_idx * ic * 12 + c * 12; + *in_pack_hw12 = *in_0; + *(in_pack_hw12 + 1) = *in_1; + *(in_pack_hw12 + 2) = *in_2; + *(in_pack_hw12 + 3) = *in_3; + *(in_pack_hw12 + 4) = *in_4; + *(in_pack_hw12 + 5) = *in_5; + *(in_pack_hw12 + 6) = *in_6; + *(in_pack_hw12 + 7) = *in_7; + *(in_pack_hw12 + 8) = *in_8; + *(in_pack_hw12 + 9) = *in_9; + *(in_pack_hw12 + 10) = *in_10; + *(in_pack_hw12 + 11) = *in_11; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__("ldr q5, [%[b_0]]\n" + "ldr q6, [%[b_1]]\n" + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "ldr q1, [%[in_0]]\n" // in_hw0 + "mov v7.16b, v5.16b\n" + "mov v9.16b, v5.16b\n" + "mov v11.16b, v5.16b\n" + "ldr q0, [%[f_0]]\n" // f_o0c0 + "mov v13.16b, v5.16b\n" + "mov v15.16b, v5.16b\n" + "mov v17.16b, v5.16b\n" + "ldr q3, [%[in_0], #16]\n" + "mov v19.16b, v5.16b\n" + "mov v21.16b, v5.16b\n" + "mov v23.16b, v5.16b\n" + "mov v25.16b, v5.16b\n" + "mov v27.16b, v5.16b\n" + + "mov v8.16b, v6.16b\n" + "mov v10.16b, v6.16b\n" + "mov v12.16b, v6.16b\n" + "mov v14.16b, v6.16b\n" + "mov v16.16b, v6.16b\n" + "mov v18.16b, v6.16b\n" + "mov v20.16b, v6.16b\n" + "mov v22.16b, v6.16b\n" + "mov v24.16b, v6.16b\n" + "mov v26.16b, v6.16b\n" + "mov v28.16b, v6.16b\n" + "0:\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v7.4s, v0.4s, v1.s[1]\n" + "ldr q2, [x3, 32]\n" + "ldr q4, [x0, 16]\n" + "fmla v9.4s, v0.4s, v1.s[2]\n" + "fmla v11.4s, v0.4s, v1.s[3]\n" + + "fmla v13.4s, v0.4s, v3.s[0]\n" + "fmla v15.4s, v0.4s, v3.s[1]\n" + "fmla v17.4s, v0.4s, v3.s[2]\n" + "fmla v19.4s, v0.4s, v3.s[3]\n" + + "fmla v21.4s, v0.4s, v2.s[0]\n" + "fmla v23.4s, v0.4s, v2.s[1]\n" + "fmla v25.4s, v0.4s, v2.s[2]\n" + "fmla v27.4s, v0.4s, v2.s[3]\n" + + "fmla v6.4s, v4.4s, v1.s[0]\n" + "fmla v8.4s, v4.4s, v1.s[1]\n" + "fmla v10.4s, v4.4s, v1.s[2]\n" + "fmla v12.4s, v4.4s, v1.s[3]\n" + + "fmla v14.4s, v4.4s, v3.s[0]\n" + "fmla v16.4s, v4.4s, v3.s[1]\n" + "ldr q1, [x3, 48]!\n" + "ldr q0, [x0, 32]!\n" + "fmla v18.4s, v4.4s, v3.s[2]\n" + "fmla v20.4s, v4.4s, v3.s[3]\n" + + "fmla v22.4s, v4.4s, v2.s[0]\n" + "fmla v24.4s, v4.4s, v2.s[1]\n" + "ldr q3, [x3, 16]\n" + "subs x2, x2, #1\n" + "fmla v26.4s, v4.4s, v2.s[2]\n" + "fmla v28.4s, v4.4s, v2.s[3]\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", 
"v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v4", "v30", "x0", "x1", "x2", "x3"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + "fmin v8.4s, v8.4s, v30.4s\n" + "fmin v9.4s, v9.4s, v30.4s\n" + "fmin v10.4s, v10.4s, v30.4s\n" + "fmin v11.4s, v11.4s, v30.4s\n" + "fmin v12.4s, v12.4s, v30.4s\n" + "fmin v13.4s, v13.4s, v30.4s\n" + "fmin v14.4s, v14.4s, v30.4s\n" + "fmin v15.4s, v15.4s, v30.4s\n" + "fmin v16.4s, v16.4s, v30.4s\n" + "fmin v17.4s, v17.4s, v30.4s\n" + "fmin v18.4s, v18.4s, v30.4s\n" + "fmin v19.4s, v19.4s, v30.4s\n" + "fmin v20.4s, v20.4s, v30.4s\n" + "fmin v21.4s, v21.4s, v30.4s\n" + "fmin v22.4s, v22.4s, v30.4s\n" + "fmin v23.4s, v23.4s, v30.4s\n" + "fmin v24.4s, v24.4s, v30.4s\n" + "fmin v25.4s, v25.4s, v30.4s\n" + "fmin v26.4s, v26.4s, v30.4s\n" + "fmin v27.4s, v27.4s, v30.4s\n" + "fmin v28.4s, v28.4s, v30.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v30"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], 
#96]\n" + "str q12, [%[out_0], #112]\n" + "str q13, [%[out_0], #128]\n" + "str q14, [%[out_0], #144]\n" + "str q15, [%[out_0], #160]\n" + "str q16, [%[out_0], #176]\n" + "str q17, [%[out_0], #192]\n" + "str q18, [%[out_0], #208]\n" + "str q19, [%[out_0], #224]\n" + "str q20, [%[out_0], #240]\n" + "str q21, [%[out_0], #256]\n" + "str q22, [%[out_0], #272]\n" + "str q23, [%[out_0], #288]\n" + "str q24, [%[out_0], #304]\n" + "str q25, [%[out_0], #320]\n" + "str q26, [%[out_0], #336]\n" + "str q27, [%[out_0], #352]\n" + "str q28, [%[out_0], #368]\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28"); + b0 += 8; + b1 += 8; + } + } + + U32 ohow_s = (ohow / 12) * 12; + U32 ohow_tail = ohow - ohow_s; + + if (ohow_tail >= 8) { + I32 hw = ohow_s; + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad; + // pack input + // NCHW => NHWChw8 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + U32 in_h_4 = ((hw + 4) / ow) * strideH; + U32 in_w_4 = ((hw + 4) % ow) * strideW; + U32 in_h_5 = ((hw + 5) / ow) * strideH; + U32 in_w_5 = ((hw + 5) % ow) * strideW; + U32 in_h_6 = ((hw + 6) / ow) * strideH; + U32 in_w_6 = ((hw + 6) % ow) * strideW; + U32 in_h_7 = ((hw + 7) / ow) * strideH; + U32 in_w_7 = ((hw + 7) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F32 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F32 *in_1 = in_hw + in_h_1 * iw_pad + in_w_1; + F32 *in_2 = in_hw + in_h_2 * iw_pad + in_w_2; + F32 *in_3 = in_hw + in_h_3 * iw_pad + in_w_3; + F32 *in_4 = in_hw + in_h_4 * iw_pad + in_w_4; + F32 *in_5 = in_hw + in_h_5 * iw_pad + in_w_5; + F32 *in_6 = in_hw + in_h_6 * iw_pad + in_w_6; + F32 *in_7 = in_hw + in_h_7 * iw_pad + in_w_7; + F32 *in_pack_hw8 = in_pack + fh_idx * fw * ic * 8 + fw_idx * ic * 8 + c * 8; + *in_pack_hw8 = *in_0; + *(in_pack_hw8 + 1) = *in_1; + *(in_pack_hw8 + 2) = *in_2; + *(in_pack_hw8 + 3) = *in_3; + *(in_pack_hw8 + 4) = *in_4; + *(in_pack_hw8 + 5) = *in_5; + *(in_pack_hw8 + 6) = *in_6; + *(in_pack_hw8 + 7) = *in_7; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__("ldr q5, [%[b_0]]\n" + "ldr q6, [%[b_1]]\n" + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "ldr q1, [%[in_0]]\n" // in_hw0 + "mov v7.16b, v5.16b\n" + "mov v9.16b, v5.16b\n" + "mov v11.16b, v5.16b\n" + "ldr q0, [%[f_0]]\n" // f_o0c0 + "mov v13.16b, v5.16b\n" + "mov v15.16b, v5.16b\n" + "mov v17.16b, v5.16b\n" + "mov v19.16b, v5.16b\n" + + "mov v6.16b, v6.16b\n" + "mov v8.16b, v6.16b\n" + "mov v10.16b, v6.16b\n" + "mov v12.16b, v6.16b\n" + "mov v14.16b, v6.16b\n" + "mov v16.16b, v6.16b\n" + "mov v18.16b, v6.16b\n" + "mov 
v20.16b, v6.16b\n" + "0:\n" + "ldr q3, [x3, 16]!\n" + "ldr q4, [x0, 16]\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v7.4s, v0.4s, v1.s[1]\n" + "fmla v9.4s, v0.4s, v1.s[2]\n" + "fmla v11.4s, v0.4s, v1.s[3]\n" + + "fmla v13.4s, v0.4s, v3.s[0]\n" + "fmla v15.4s, v0.4s, v3.s[1]\n" + "fmla v17.4s, v0.4s, v3.s[2]\n" + "fmla v19.4s, v0.4s, v3.s[3]\n" + + "fmla v6.4s, v4.4s, v1.s[0]\n" + "fmla v8.4s, v4.4s, v1.s[1]\n" + "fmla v10.4s, v4.4s, v1.s[2]\n" + "fmla v12.4s, v4.4s, v1.s[3]\n" + + "fmla v14.4s, v4.4s, v3.s[0]\n" + "fmla v16.4s, v4.4s, v3.s[1]\n" + "ldr q1, [x3, 16]!\n" + "ldr q0, [x0, 32]!\n" + "subs x2, x2, #1\n" + "fmla v18.4s, v4.4s, v3.s[2]\n" + "fmla v20.4s, v4.4s, v3.s[3]\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v4", "x0", "x1", "x2", "x3"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + "fmin v8.4s, v8.4s, v30.4s\n" + "fmin v9.4s, v9.4s, v30.4s\n" + "fmin v10.4s, v10.4s, v30.4s\n" + "fmin v11.4s, v11.4s, v30.4s\n" + "fmin v12.4s, v12.4s, v30.4s\n" + "fmin v13.4s, v13.4s, v30.4s\n" + "fmin v14.4s, v14.4s, v30.4s\n" + "fmin v15.4s, v15.4s, v30.4s\n" + "fmin v16.4s, v16.4s, v30.4s\n" + "fmin v17.4s, v17.4s, v30.4s\n" + "fmin v18.4s, v18.4s, v30.4s\n" + "fmin v19.4s, v19.4s, v30.4s\n" + "fmin v20.4s, v20.4s, v30.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v30"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + "str q13, [%[out_0], #128]\n" + "str q14, [%[out_0], #144]\n" + "str q15, 
[%[out_0], #160]\n" + "str q16, [%[out_0], #176]\n" + "str q17, [%[out_0], #192]\n" + "str q18, [%[out_0], #208]\n" + "str q19, [%[out_0], #224]\n" + "str q20, [%[out_0], #240]\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20"); + b0 += 8; + b1 += 8; + } + ohow_s += 8; + ohow_tail -= 8; + } + + if (ohow_tail >= 4) { + I32 hw = ohow_s; + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad; + // pack input + // NCHW => NHWChw4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F32 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F32 *in_1 = in_hw + in_h_1 * iw_pad + in_w_1; + F32 *in_2 = in_hw + in_h_2 * iw_pad + in_w_2; + F32 *in_3 = in_hw + in_h_3 * iw_pad + in_w_3; + F32 *in_pack_hw4 = in_pack + fh_idx * fw * ic * 4 + fw_idx * ic * 4 + c * 4; + *in_pack_hw4 = *in_0; + *(in_pack_hw4 + 1) = *in_1; + *(in_pack_hw4 + 2) = *in_2; + *(in_pack_hw4 + 3) = *in_3; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__("ldr q5, [%[b_0]]\n" + "ldr q6, [%[b_1]]\n" + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "ldr q1, [%[in_0]]\n" // in_hw0 + "mov v7.16b, v5.16b\n" + "mov v9.16b, v5.16b\n" + "mov v11.16b, v5.16b\n" + "ldr q0, [%[f_0]]\n" // f_o0c0 + + "mov v6.16b, v6.16b\n" + "mov v8.16b, v6.16b\n" + "mov v10.16b, v6.16b\n" + "mov v12.16b, v6.16b\n" + "0:\n" + "ldr q3, [x3, 16]!\n" + "ldr q4, [x0, 16]\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v7.4s, v0.4s, v1.s[1]\n" + "fmla v9.4s, v0.4s, v1.s[2]\n" + "fmla v11.4s, v0.4s, v1.s[3]\n" + + "fmla v6.4s, v4.4s, v1.s[0]\n" + "fmla v8.4s, v4.4s, v1.s[1]\n" + "ldr q0, [x0, 32]!\n" + "subs x2, x2, #1\n" + "fmla v10.4s, v4.4s, v1.s[2]\n" + "fmla v12.4s, v4.4s, v1.s[3]\n" + "mov v1.16b, v3.16b\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v4", "x0", "x1", "x2", "x3"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v5.4s, v5.4s, v1.4s\n" + 
"fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + "fmin v8.4s, v8.4s, v30.4s\n" + "fmin v9.4s, v9.4s, v30.4s\n" + "fmin v10.4s, v10.4s, v30.4s\n" + "fmin v11.4s, v11.4s, v30.4s\n" + "fmin v12.4s, v12.4s, v30.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v30"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12"); + b0 += 8; + b1 += 8; + } + ohow_s += 4; + ohow_tail -= 4; + } + + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad; + // pack input + // NCHW => NCHWc8hw1 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F32 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F32 *in_pack_hw1 = in_pack + fh_idx * fw * ic + fw_idx * ic + c; + *in_pack_hw1 = *in_0; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "ldr q5, [%[b_0]]\n" + "ldr q6, [%[b_1]]\n" + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "0:\n" + "ldr q0, [x0], #16\n" + "subs x2, x2, #1\n" + "ldr q4, [x0], #16\n" + "ldr s1, [x3], #4\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v6.4s, v4.4s, v1.s[0]\n" + + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v5", "v6", "v4", "x0", "x1", "x2", "x3"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v30"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v5", "v6"); + b0 += 8; + b1 += 8; + } + } + } + return ret; +} +#endif diff --git a/compute/tensor/src/cpu/arm/fp32/convolution_transform.cpp b/compute/tensor/src/cpu/arm/fp32/convolution_transform.cpp new file mode 100644 index 
00000000..5378e368 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/convolution_transform.cpp @@ -0,0 +1,144 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <string.h> +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#include "cpu/arm/fp32/convolution_winograd_transform.h" + +inline EE convolution_transform_filter_kernel_fp32(TensorDesc filterDesc, + const F32 *filterArray, + TensorDesc *ftmDesc, + F32 *ftmArray, + DataFormat ftmDataFormat) +{ + if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + if (fdf == ftmDataFormat) { + *ftmDesc = filterDesc; + memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + return SUCCESS; + } + if (fdf != DF_NCHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + EE ret = SUCCESS; + switch (ftmDataFormat) { + case DF_NHWCN8: { + /* + * NCHW => NHWCN8 + */ + U32 oc = fn / 8; + for (U32 o = 0; o < oc; o++) { + for (U32 hw = 0; hw < fh * fw; hw++) { + for (U32 c = 0; c < fc; c++) { + for (U32 o8 = 0; o8 < 8; o8++) { + ftmArray[o * fh * fw * fc * 8 + hw * fc * 8 + c * 8 + o8] = + filterArray[(o * 8 + o8) * fc * fh * fw + c * fh * fw + hw]; + } + } + } + } + *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); + break; + } + case DF_HWNCN8: { + for (U32 o = 0; o < fn / 8; o++) { + for (U32 c = 0; c < fc; c++) { + // Process N4 at a time; two passes cover N8 + U32 f_off_0 = (o * 8) * fc * fh * fw + c * fh * fw; + U32 f_off_1 = (o * 8 + 4) * fc * fh * fw + c * fh * fw; + + U32 ftm_off_0 = o * 36 * fc * 8 + c * 8; + U32 ftm_off_1 = o * 36 * fc * 8 + c * 8 + 4; + + F32 F[9][4]; + F32 *F_ptr[9]; + F32 *Fw[36]; + + for (U32 hw = 0; hw < 9; hw++) { + for (U32 oo = 0; oo < 4; oo++) { + F[hw][oo] = filterArray[f_off_0 + hw + oo * fc * fh * fw]; + } + F_ptr[hw] = F[hw]; + } + for (U32 hw = 0; hw < 36; hw++) { + Fw[hw] = ftmArray + ftm_off_0 + hw * fc * 8; + } + trans_W_4x4_3x3(Fw, F_ptr); + for (U32 hw = 0; hw < 9; hw++) { + for (U32 oo = 0; oo < 4; oo++) { + F[hw][oo] = filterArray[f_off_1 + hw + oo * fc * fh * fw]; + } + F_ptr[hw] = F[hw]; + } + for (U32 hw = 0; hw < 36; hw++) { + Fw[hw] = ftmArray + ftm_off_1 + hw * fc * 8; + } + trans_W_4x4_3x3(Fw, F_ptr); + } + } + *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, 6, 6); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE 
convolution_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed) +{ + DataFormat ftmDataFormat; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_GEMM: + ftmDataFormat = DF_NHWCN8; + break; + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: + ftmDataFormat = DF_NHWCN8; + break; + case CONVOLUTION_ALGORITHM_WINOGRAD: + ftmDataFormat = DF_HWNCN8; + break; + default: + return NOT_MATCH; + } + + U32 channelAxis = filterDesc.nDims - 1; + TensorDesc tmpFilterDesc = filterDesc; + tmpFilterDesc.dims[channelAxis] /= convParamSpec.group; + U32 fnPadding = tmpFilterDesc.dims[channelAxis]; + if (fnPadding % 8 != 0) { + fnPadding = (fnPadding / 8 + 1) * 8; + } + U32 originalTileSize = tensorNumElements(tmpFilterDesc); + for (U32 g = 0; g < convParamSpec.group; g++) { + CHECK_STATUS(convolution_transform_filter_kernel_fp32( + tmpFilterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat)); + U32 newTileSize = tensorNumElements(*ftmDesc) / tmpFilterDesc.dims[channelAxis] * fnPadding; + filter += originalTileSize; + filterTransformed += newTileSize; + } + ftmDesc->dims[channelAxis] = filterDesc.dims[channelAxis]; + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp32/convolution_winograd_V8.cpp b/compute/tensor/src/cpu/arm/fp32/convolution_winograd_V8.cpp new file mode 100644 index 00000000..8b39981a --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/convolution_winograd_V8.cpp @@ -0,0 +1,861 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
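convolution_winograd_V8 below implements F(4x4, 3x3): the padded input is read as overlapping 6x6 tiles, each yielding a 4x4 block of outputs, hence tile_h = (oh + 3) / 4, tile_w = (ow + 3) / 4, and the fh == 6 && fw == 6 check on the transformed filter. Its scratch buffer is carved into three regions exactly as the in-code comment describes; a sketch of that partition follows (partition_winograd_tmp is a hypothetical helper, and ic here is already the number of channel groups, i.e. divided by 8):

// Sketch only (not part of the patch): mirrors the "tmp = in_pad + itm + otm"
// layout used by the Winograd kernel, in units of F32 elements.
static void partition_winograd_tmp(F32 *tmp, U32 ic, U32 ih_pad, U32 iw_pad,
    F32 **in_pad, F32 **itm, F32 **otm)
{
    *in_pad = tmp;                              // padded input: ic*ih_pad*iw_pad*8
    *itm = *in_pad + ic * ih_pad * iw_pad * 8;  // transformed input: 6*6*ic*12*8
    *otm = *itm + 6 * 6 * ic * 12 * 8;          // transformed output: 6*6*12*8
}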
+ +#include "cpu/arm/fp32/convolution_winograd_transform.h" +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +EE convolution_winograd_V8(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc) +{ +#ifdef __aarch64__ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if (fdf != DF_HWNCN8) { + CHECK_STATUS(NOT_MATCH); + } + if (!(fh == 6 && fw == 6)) { + CHECK_STATUS(NOT_SUPPORTED); + } + + oc /= 8; + ic /= 8; + + U32 tile_h = (oh + 3) / 4; + U32 tile_w = (ow + 3) / 4; + // num of 6x6 tiles + I32 tiles = tile_h * tile_w; + U32 pad_left = paddingL; + U32 pad_right = paddingR + (tile_w * 4 - ow); + U32 pad_w_mod_4 = tile_w * 4 - ow; + U32 pad_top = paddingT; + U32 pad_bottom = paddingB + (tile_h * 4 - oh); + U32 pad_h_mod_4 = tile_h * 4 - oh; + U32 ih_pad = ih + pad_top + pad_bottom; + U32 iw_pad = iw + pad_left + pad_right; + // tmp = in_pad + itm + otm + // in_pad: ic*ih_pad*iw_pad*8 + // itm: 6*6*ic*12*8 + // otm: 6*6*12*8 + F32 *inArray_pad = (F32 *)tmp; + F32 *itmArray = inArray_pad + ic * ih_pad * iw_pad * 8; + F32 *otmArray = itmArray + 6 * 6 * ic * 12 * 8; + + EE ret = SUCCESS; + // copy input into a input with padding + for (U32 n = 0; n < in; n++) { + F32 *inArray_pad_mov = inArray_pad; + F32 *inArray_mov = inArray + n * ic * ih * iw * 8; + for (U32 c = 0; c < ic; c++) { + memset(inArray_pad_mov, 0, pad_top * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_top * iw_pad * 8; + for (U32 h = pad_top; h < ih_pad - pad_bottom; h++) { + memset(inArray_pad_mov, 0, pad_left * 8 * bytesOf(idt)); + inArray_pad_mov += pad_left * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, pad_right * 8 * bytesOf(idt)); + inArray_pad_mov += pad_right * 8; + } + memset(inArray_pad_mov, 0, pad_bottom * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_bottom * iw_pad * 8; + } + + // tiles / 12 + for (I32 hw = 0; hw < tiles - 11; hw += 12) { + // in trans + // NCHWc8 => (6*6)*C*c8*hw12 + for (U32 c = 0; c < ic; c++) { + F32 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F32 *itmArray_mov = itmArray + c * 12 * 8; + F32 *Iw_ptr0[36]; + F32 *Iw_ptr1[36]; + F32 Iw[12][36][8]; + F32 *I0[12][36]; + F32 *I1[12][36]; + U32 h[12]; + U32 w[12]; + for (U32 index = 0; index < 12; index++) { + h[index] = ((hw + index) / tile_w) * 4; + w[index] = ((hw + index) % tile_w) * 4; + } + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + for (U32 index = 0; index < 12; index++) { + I0[index][i * 6 + j] = + inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8; + I1[index][i * 6 + j] = inArray_pad_mov + (h[index] + i) * iw_pad * 8 + + (w[index] + j) * 8 + 4; + } + } + } + for (U32 index = 0; index < 12; index++) { + for (U32 i = 0; i < 36; i++) { + 
Iw_ptr0[i] = Iw[index][i]; + Iw_ptr1[i] = Iw_ptr0[i] + 4; + } + trans_I_4x4_3x3(Iw_ptr0, I0[index]); + trans_I_4x4_3x3(Iw_ptr1, I1[index]); + } + for (U32 i = 0; i < 36; i++) { + F32 *itm = itmArray_mov + i * ic * 8 * 12; + + __asm__ __volatile__( + "ldp q0, q1, [%[in_0]]\n" + "ldp q2, q3, [%[in_1]]\n" + "ldp q4, q5, [%[in_2]]\n" + "ldp q6, q7, [%[in_3]]\n" + + "ldp q8, q9, [%[in_4]]\n" + "ldp q10, q11, [%[in_5]]\n" + "ldp q12, q13, [%[in_6]]\n" + "ldp q14, q15, [%[in_7]]\n" + + "ldp q16, q17, [%[in_8]]\n" + "ldp q18, q19, [%[in_9]]\n" + "ldp q20, q21, [%[in_10]]\n" + "ldp q22, q23, [%[in_11]]\n" + + "zip1 v24.4s, v0.4s, v2.4s\n" + "zip2 v25.4s, v0.4s, v2.4s\n" + "zip1 v26.4s, v4.4s, v6.4s\n" + "zip2 v27.4s, v4.4s, v6.4s\n" + + "zip1 v0.2d, v24.2d, v26.2d\n" + "zip2 v2.2d, v24.2d, v26.2d\n" + "zip1 v4.2d, v25.2d, v27.2d\n" + "zip2 v6.2d, v25.2d, v27.2d\n" + + "zip1 v24.4s, v8.4s, v10.4s\n" + "zip2 v25.4s, v8.4s, v10.4s\n" + "zip1 v26.4s, v12.4s, v14.4s\n" + "zip2 v27.4s, v12.4s, v14.4s\n" + + "zip1 v8.2d, v24.2d, v26.2d\n" + "zip2 v10.2d, v24.2d, v26.2d\n" + "zip1 v12.2d, v25.2d, v27.2d\n" + "zip2 v14.2d, v25.2d, v27.2d\n" + + "zip1 v24.4s, v16.4s, v18.4s\n" + "zip2 v25.4s, v16.4s, v18.4s\n" + "zip1 v26.4s, v20.4s, v22.4s\n" + "zip2 v27.4s, v20.4s, v22.4s\n" + + "zip1 v16.2d, v24.2d, v26.2d\n" + "zip2 v18.2d, v24.2d, v26.2d\n" + "zip1 v20.2d, v25.2d, v27.2d\n" + "zip2 v22.2d, v25.2d, v27.2d\n" + + "stp q0, q8, [%[pack]]\n" + "str q16, [%[pack], #32]\n" + "stp q2, q10, [%[pack], 48]\n" + "str q18, [%[pack], #80]\n" + "stp q4, q12, [%[pack], #96]\n" + "str q20, [%[pack], #128]\n" + "stp q6, q14, [%[pack], #144]\n" + "str q22, [%[pack], #176]\n" + + "zip1 v24.4s, v1.4s, v3.4s\n" + "zip2 v25.4s, v1.4s, v3.4s\n" + "zip1 v26.4s, v5.4s, v7.4s\n" + "zip2 v27.4s, v5.4s, v7.4s\n" + + "zip1 v1.2d, v24.2d, v26.2d\n" + "zip2 v3.2d, v24.2d, v26.2d\n" + "zip1 v5.2d, v25.2d, v27.2d\n" + "zip2 v7.2d, v25.2d, v27.2d\n" + + "zip1 v24.4s, v9.4s, v11.4s\n" + "zip2 v25.4s, v9.4s, v11.4s\n" + "zip1 v26.4s, v13.4s, v15.4s\n" + "zip2 v27.4s, v13.4s, v15.4s\n" + + "zip1 v9.2d, v24.2d, v26.2d\n" + "zip2 v11.2d, v24.2d, v26.2d\n" + "zip1 v13.2d, v25.2d, v27.2d\n" + "zip2 v15.2d, v25.2d, v27.2d\n" + + "zip1 v24.4s, v17.4s, v19.4s\n" + "zip2 v25.4s, v17.4s, v19.4s\n" + "zip1 v26.4s, v21.4s, v23.4s\n" + "zip2 v27.4s, v21.4s, v23.4s\n" + + "zip1 v17.2d, v24.2d, v26.2d\n" + "zip2 v19.2d, v24.2d, v26.2d\n" + "zip1 v21.2d, v25.2d, v27.2d\n" + "zip2 v23.2d, v25.2d, v27.2d\n" + + "stp q1, q9, [%[pack], #192]\n" + "str q17, [%[pack], #224]\n" + "stp q3, q11, [%[pack], 240]\n" + "str q19, [%[pack], #272]\n" + "stp q5, q13, [%[pack], 288]\n" + "str q21, [%[pack], #320]\n" + "stp q7, q15, [%[pack], 336]\n" + "str q23, [%[pack], #368]\n" + : + : [pack] "r"(itm), [in_0] "r"(Iw[0][i]), [in_1] "r"(Iw[1][i]), + [in_2] "r"(Iw[2][i]), [in_3] "r"(Iw[3][i]), [in_4] "r"(Iw[4][i]), + [in_5] "r"(Iw[5][i]), [in_6] "r"(Iw[6][i]), [in_7] "r"(Iw[7][i]), + [in_8] "r"(Iw[8][i]), [in_9] "r"(Iw[9][i]), [in_10] "r"(Iw[10][i]), + [in_11] "r"(Iw[11][i]) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); + } + } + for (I32 o = 0; o < I32(oc); o++) { + const F32 *b_0 = biasArray + o * 8; + const F32 *b_1 = b_0 + 4; + // dot prod + // (6*6)*C*c8*hw12 times O*(6*6)*C*c8*o8 = O*(6*6)*hw12*o8 + for (U32 idx = 0; idx < 36; idx++) { + F32 *itm_0 = itmArray + idx * 12 * ic * 8; + const F32 *f_o0c0 = 
filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F32 *out_o0hw0 = otmArray + idx * 12 * 8; + __asm__ __volatile__( + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "eor v5.16b, v5.16b, v5.16b\n" + "ldr q1, [%[in_0]]\n" // in_hw0 + "eor v6.16b, v6.16b, v6.16b\n" + "eor v7.16b, v7.16b, v7.16b\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr q0, [%[f_0]]\n" // f_o0c0 + "eor v9.16b, v9.16b, v9.16b\n" + "eor v10.16b, v10.16b, v10.16b\n" + "eor v11.16b, v11.16b, v11.16b\n" + "ldr q3, [%[in_0], #16]\n" + "eor v12.16b, v12.16b, v12.16b\n" + "eor v13.16b, v13.16b, v13.16b\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + + "eor v16.16b, v16.16b, v16.16b\n" + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + "0:\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v7.4s, v0.4s, v1.s[1]\n" + "ldr q2, [x3, 32]\n" + "ldr q29, [x0, 16]\n" + "fmla v9.4s, v0.4s, v1.s[2]\n" + "fmla v11.4s, v0.4s, v1.s[3]\n" + + "fmla v13.4s, v0.4s, v3.s[0]\n" + "fmla v15.4s, v0.4s, v3.s[1]\n" + "fmla v17.4s, v0.4s, v3.s[2]\n" + "fmla v19.4s, v0.4s, v3.s[3]\n" + + "fmla v21.4s, v0.4s, v2.s[0]\n" + "fmla v23.4s, v0.4s, v2.s[1]\n" + "fmla v25.4s, v0.4s, v2.s[2]\n" + "fmla v27.4s, v0.4s, v2.s[3]\n" + + "fmla v6.4s, v29.4s, v1.s[0]\n" + "fmla v8.4s, v29.4s, v1.s[1]\n" + "fmla v10.4s, v29.4s, v1.s[2]\n" + "fmla v12.4s, v29.4s, v1.s[3]\n" + + "fmla v14.4s, v29.4s, v3.s[0]\n" + "fmla v16.4s, v29.4s, v3.s[1]\n" + "ldr q1, [x3, 48]!\n" + "ldr q0, [x0, 32]!\n" + "fmla v18.4s, v29.4s, v3.s[2]\n" + "fmla v20.4s, v29.4s, v3.s[3]\n" + + "fmla v22.4s, v29.4s, v2.s[0]\n" + "fmla v24.4s, v29.4s, v2.s[1]\n" + "ldr q3, [x3, 16]\n" + "subs x2, x2, #1\n" + "fmla v26.4s, v29.4s, v2.s[2]\n" + "fmla v28.4s, v29.4s, v2.s[3]\n" + "bne 0b\n" + "str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + "str q13, [%[out_0], #128]\n" + "str q14, [%[out_0], #144]\n" + "str q15, [%[out_0], #160]\n" + "str q16, [%[out_0], #176]\n" + "str q17, [%[out_0], #192]\n" + "str q18, [%[out_0], #208]\n" + "str q19, [%[out_0], #224]\n" + "str q20, [%[out_0], #240]\n" + "str q21, [%[out_0], #256]\n" + "str q22, [%[out_0], #272]\n" + "str q23, [%[out_0], #288]\n" + "str q24, [%[out_0], #304]\n" + "str q25, [%[out_0], #320]\n" + "str q26, [%[out_0], #336]\n" + "str q27, [%[out_0], #352]\n" + "str q28, [%[out_0], #368]\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(itm_0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", + "x1", "x2", "x3"); + } + // out trans + // O*(6*6)*hw12*o8 => NOHWo8 + for (U32 hw12 = 0; hw12 < 12; hw12++) { + U32 h = (hw + hw12) / tile_w; + U32 w = (hw + hw12) % tile_w; + F32 *out_0 = outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + + F32 *Ow_0[36]; 
+ F32 *Ow_1[36]; + F32 *O_0[16]; + F32 *O_1[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 12 * 8 + hw12 * 8; + Ow_1[idx] = Ow_0[idx] + 4; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = O_0[i * 4 + j] + 4; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + } + + // tiles_reminder % 12 / 8 + I32 tiles_s = (tiles / 12) * 12; + I32 tiles_tail = tiles - tiles_s; + + if (tiles_tail >= 8) { + I32 hw = tiles_s; + // in trans + // NCHWc8 => (6*6)*C*c8*hw8 + for (U32 c = 0; c < ic; c++) { + F32 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F32 *itmArray_mov = itmArray + c * 8 * 8; + F32 *Iw_ptr0[36]; + F32 *Iw_ptr1[36]; + F32 Iw[8][36][8]; + F32 *I0[8][36]; + F32 *I1[8][36]; + U32 h[8]; + U32 w[8]; + for (U32 index = 0; index < 8; index++) { + h[index] = ((hw + index) / tile_w) * 4; + w[index] = ((hw + index) % tile_w) * 4; + } + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + for (U32 index = 0; index < 8; index++) { + I0[index][i * 6 + j] = + inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8; + I1[index][i * 6 + j] = inArray_pad_mov + (h[index] + i) * iw_pad * 8 + + (w[index] + j) * 8 + 4; + } + } + } + for (U32 index = 0; index < 8; index++) { + for (U32 i = 0; i < 36; i++) { + Iw_ptr0[i] = Iw[index][i]; + Iw_ptr1[i] = Iw_ptr0[i] + 4; + } + trans_I_4x4_3x3(Iw_ptr0, I0[index]); + trans_I_4x4_3x3(Iw_ptr1, I1[index]); + } + for (U32 i = 0; i < 36; i++) { + F32 *itm = itmArray_mov + i * ic * 8 * 8; + + __asm__ __volatile__( + "ldp q0, q1, [%[in_0]]\n" + "ldp q2, q3, [%[in_1]]\n" + "ldp q4, q5, [%[in_2]]\n" + "ldp q6, q7, [%[in_3]]\n" + + "ldp q8, q9, [%[in_4]]\n" + "ldp q10, q11, [%[in_5]]\n" + "ldp q12, q13, [%[in_6]]\n" + "ldp q14, q15, [%[in_7]]\n" + + "zip1 v24.4s, v0.4s, v2.4s\n" + "zip2 v25.4s, v0.4s, v2.4s\n" + "zip1 v26.4s, v4.4s, v6.4s\n" + "zip2 v27.4s, v4.4s, v6.4s\n" + + "zip1 v0.2d, v24.2d, v26.2d\n" + "zip2 v2.2d, v24.2d, v26.2d\n" + "zip1 v4.2d, v25.2d, v27.2d\n" + "zip2 v6.2d, v25.2d, v27.2d\n" + + "zip1 v24.4s, v8.4s, v10.4s\n" + "zip2 v25.4s, v8.4s, v10.4s\n" + "zip1 v26.4s, v12.4s, v14.4s\n" + "zip2 v27.4s, v12.4s, v14.4s\n" + + "zip1 v8.2d, v24.2d, v26.2d\n" + "zip2 v10.2d, v24.2d, v26.2d\n" + "zip1 v12.2d, v25.2d, v27.2d\n" + "zip2 v14.2d, v25.2d, v27.2d\n" + + "stp q0, q8, [%[pack]]\n" + "stp q2, q10, [%[pack], #32]\n" + "stp q4, q12, [%[pack], #64]\n" + "stp q6, q14, [%[pack], #96]\n" + + "zip1 v24.4s, v1.4s, v3.4s\n" + "zip2 v25.4s, v1.4s, v3.4s\n" + "zip1 v26.4s, v5.4s, v7.4s\n" + "zip2 v27.4s, v5.4s, v7.4s\n" + + "zip1 v1.2d, v24.2d, v26.2d\n" + "zip2 v3.2d, v24.2d, v26.2d\n" + "zip1 v5.2d, v25.2d, v27.2d\n" + "zip2 v7.2d, v25.2d, v27.2d\n" + + "zip1 v24.4s, v9.4s, v11.4s\n" + "zip2 v25.4s, v9.4s, v11.4s\n" + "zip1 v26.4s, v13.4s, v15.4s\n" + "zip2 v27.4s, v13.4s, v15.4s\n" + + "zip1 v9.2d, v24.2d, v26.2d\n" + "zip2 v11.2d, v24.2d, v26.2d\n" + "zip1 v13.2d, v25.2d, v27.2d\n" + "zip2 v15.2d, v25.2d, v27.2d\n" + + "stp q1, q9, [%[pack], #128]\n" + "stp q3, q11, [%[pack], #160]\n" + "stp q5, q13, [%[pack], #192]\n" + "stp q7, q15, [%[pack], #224]\n" + : + : [pack] "r"(itm), [in_0] "r"(Iw[0][i]), [in_1] "r"(Iw[1][i]), + [in_2] "r"(Iw[2][i]), [in_3] "r"(Iw[3][i]), [in_4] "r"(Iw[4][i]), + [in_5] "r"(Iw[5][i]), 
[in_6] "r"(Iw[6][i]), [in_7] "r"(Iw[7][i]) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v24", "v25", "v26", "v27"); + } + } + for (I32 o = 0; o < I32(oc); o++) { + const F32 *b_0 = biasArray + o * 8; + const F32 *b_1 = b_0 + 4; + // dot prod + // (6*6)*C*c8*hw8 times O*(6*6)*C*c8*o8 = O*(6*6)*hw8*o8 + for (U32 idx = 0; idx < 36; idx++) { + F32 *itm_0 = itmArray + idx * 8 * ic * 8; + const F32 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F32 *out_o0hw0 = otmArray + idx * 8 * 8; + __asm__ __volatile__( + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "eor v5.16b, v5.16b, v5.16b\n" + "ldr q1, [%[in_0]]\n" // in_hw0 + "eor v6.16b, v6.16b, v6.16b\n" + "eor v7.16b, v7.16b, v7.16b\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr q0, [%[f_0]]\n" // f_o0c0 + "eor v9.16b, v9.16b, v9.16b\n" + "eor v10.16b, v10.16b, v10.16b\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + "eor v13.16b, v13.16b, v13.16b\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + + "eor v16.16b, v16.16b, v16.16b\n" + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + "0:\n" + "ldr q3, [x3, 16]!\n" + "ldr q29, [x0, 16]\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v7.4s, v0.4s, v1.s[1]\n" + "fmla v9.4s, v0.4s, v1.s[2]\n" + "fmla v11.4s, v0.4s, v1.s[3]\n" + + "fmla v13.4s, v0.4s, v3.s[0]\n" + "fmla v15.4s, v0.4s, v3.s[1]\n" + "fmla v17.4s, v0.4s, v3.s[2]\n" + "fmla v19.4s, v0.4s, v3.s[3]\n" + + "fmla v6.4s, v29.4s, v1.s[0]\n" + "fmla v8.4s, v29.4s, v1.s[1]\n" + "fmla v10.4s, v29.4s, v1.s[2]\n" + "fmla v12.4s, v29.4s, v1.s[3]\n" + + "fmla v14.4s, v29.4s, v3.s[0]\n" + "fmla v16.4s, v29.4s, v3.s[1]\n" + "ldr q1, [x3, 16]!\n" + "ldr q0, [x0, 32]!\n" + "subs x2, x2, #1\n" + "fmla v18.4s, v29.4s, v3.s[2]\n" + "fmla v20.4s, v29.4s, v3.s[3]\n" + "bne 0b\n" + "str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + "str q13, [%[out_0], #128]\n" + "str q14, [%[out_0], #144]\n" + "str q15, [%[out_0], #160]\n" + "str q16, [%[out_0], #176]\n" + "str q17, [%[out_0], #192]\n" + "str q18, [%[out_0], #208]\n" + "str q19, [%[out_0], #224]\n" + "str q20, [%[out_0], #240]\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(itm_0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v27", + "v28", "v29", "x0", "x1", "x2", "x3"); + } + // out trans + // O*(6*6)*hw8*o8 => NOHWo8 + for (U32 hw8 = 0; hw8 < 8; hw8++) { + U32 h = (hw + hw8) / tile_w; + U32 w = (hw + hw8) % tile_w; + F32 *out_0 = outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + + F32 *Ow_0[36]; + F32 *Ow_1[36]; + F32 *O_0[16]; + F32 *O_1[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 8 * 8 + hw8 * 8; + Ow_1[idx] = Ow_0[idx] + 4; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = O_0[i * 4 + j] + 4; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + 
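+                    // the second call covers channels 4..7 of the o8 block
+                    // (Ow_1/O_1 are the +4 views, b_1 the matching bias half)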
CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + tiles_s += 8; + tiles_tail -= 8; + } + + if (tiles_tail >= 4) { + I32 hw = tiles_s; + // in trans + // NCHWc8 => (6*6)*C*c8*hw4 + for (U32 c = 0; c < ic; c++) { + F32 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F32 *itmArray_mov = itmArray + c * 8 * 4; + F32 *Iw_ptr0[36]; + F32 *Iw_ptr1[36]; + F32 Iw[4][36][8]; + F32 *I0[4][36]; + F32 *I1[4][36]; + U32 h[4]; + U32 w[4]; + for (U32 index = 0; index < 4; index++) { + h[index] = ((hw + index) / tile_w) * 4; + w[index] = ((hw + index) % tile_w) * 4; + } + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + for (U32 index = 0; index < 4; index++) { + I0[index][i * 6 + j] = + inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8; + I1[index][i * 6 + j] = inArray_pad_mov + (h[index] + i) * iw_pad * 8 + + (w[index] + j) * 8 + 4; + } + } + } + for (U32 index = 0; index < 4; index++) { + for (U32 i = 0; i < 36; i++) { + Iw_ptr0[i] = Iw[index][i]; + Iw_ptr1[i] = Iw_ptr0[i] + 4; + } + trans_I_4x4_3x3(Iw_ptr0, I0[index]); + trans_I_4x4_3x3(Iw_ptr1, I1[index]); + } + for (U32 i = 0; i < 36; i++) { + F32 *itm = itmArray_mov + i * ic * 8 * 4; + + __asm__ __volatile__( + "ldp q0, q4, [%[in_0]]\n" + "ldp q1, q5, [%[in_1]]\n" + "ldp q2, q6, [%[in_2]]\n" + "ldp q3, q7, [%[in_3]]\n" + + "st4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[pack]], #64\n" + "st4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[pack]]\n" + : + : [pack] "r"(itm), [in_0] "r"(Iw[0][i]), [in_1] "r"(Iw[1][i]), + [in_2] "r"(Iw[2][i]), [in_3] "r"(Iw[3][i]) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + } + } + for (I32 o = 0; o < I32(oc); o++) { + const F32 *b_0 = biasArray + o * 8; + const F32 *b_1 = b_0 + 4; + // dot prod + // (6*6)*C*c8*hw4 times O*(6*6)*C*c8*o8 = O*(6*6)*hw4*o8 + for (U32 idx = 0; idx < 36; idx++) { + F32 *itm_0 = itmArray + idx * 4 * ic * 8; + const F32 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F32 *out_o0hw0 = otmArray + idx * 4 * 8; + __asm__ __volatile__( + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "eor v5.16b, v5.16b, v5.16b\n" + "ldr q1, [%[in_0]]\n" // in_hw0 + "eor v6.16b, v6.16b, v6.16b\n" + "eor v7.16b, v7.16b, v7.16b\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr q0, [%[f_0]]\n" // f_o0c0 + "eor v9.16b, v9.16b, v9.16b\n" + "eor v10.16b, v10.16b, v10.16b\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + "0:\n" + "ldr q3, [x3, 16]!\n" + "ldr q29, [x0, 16]\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v7.4s, v0.4s, v1.s[1]\n" + "fmla v9.4s, v0.4s, v1.s[2]\n" + "fmla v11.4s, v0.4s, v1.s[3]\n" + + "fmla v6.4s, v29.4s, v1.s[0]\n" + "fmla v8.4s, v29.4s, v1.s[1]\n" + "ldr q0, [x0, 32]!\n" + "subs x2, x2, #1\n" + "fmla v10.4s, v29.4s, v1.s[2]\n" + "fmla v12.4s, v29.4s, v1.s[3]\n" + + "mov v1.16b, v3.16b\n" + "bne 0b\n" + "str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(itm_0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v27", "v28", "v29", "x0", "x1", "x2", "x3"); + } + // out trans + // O*(6*6)*hw4*o8 => NOHWo8 + for (U32 hw4 = 0; hw4 < 4; hw4++) { + U32 h = (hw 
+ hw4) / tile_w; + U32 w = (hw + hw4) % tile_w; + F32 *out_0 = outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + + F32 *Ow_0[36]; + F32 *Ow_1[36]; + F32 *O_0[16]; + F32 *O_1[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 4 * 8 + hw4 * 8; + Ow_1[idx] = Ow_0[idx] + 4; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = O_0[i * 4 + j] + 4; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + tiles_s += 4; + tiles_tail -= 4; + } + + for (I32 hw = tiles_s; hw < tiles; hw++) { + // in trans + // NCHWc8 => (6*6)*C*c8*hw1 + for (U32 c = 0; c < ic; c++) { + F32 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F32 *itmArray_mov = itmArray + c * 8; + F32 *Iw_ptr0[36]; + F32 *Iw_ptr1[36]; + F32 Iw[36][8]; + F32 *I0[36]; + F32 *I1[36]; + U32 h = (hw / tile_w) * 4; + U32 w = (hw % tile_w) * 4; + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h + i) * iw_pad * 8 + (w + j) * 8; + I1[i * 6 + j] = inArray_pad_mov + (h + i) * iw_pad * 8 + (w + j) * 8 + 4; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr0[i] = Iw[i]; + Iw_ptr1[i] = Iw_ptr0[i] + 4; + } + trans_I_4x4_3x3(Iw_ptr0, I0); + trans_I_4x4_3x3(Iw_ptr1, I1); + for (U32 i = 0; i < 36; i++) { + F32 *itm = itmArray_mov + i * ic * 8; + memcpy(itm, Iw[i], 8 * bytesOf(idt)); + } + } + for (I32 o = 0; o < I32(oc); o++) { + const F32 *b_0 = biasArray + o * 8; + const F32 *b_1 = b_0 + 4; + // dot prod + // (6*6)*C*c8*hw1 times O*(6*6)*C*c8*o8 = O*(6*6)*hw1*o8 + for (U32 idx = 0; idx < 36; idx++) { + F32 *itm_0 = itmArray + idx * ic * 8; + const F32 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F32 *out_o0hw0 = otmArray + idx * 8; + __asm__ __volatile__( + "ldr s1, [%[in_0]]\n" // in_hw0 + "ldp q0, q29, [%[f_0]]\n" // f_o0c0 + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "eor v5.16b, v5.16b, v5.16b\n" + "eor v6.16b, v6.16b, v6.16b\n" + "0:\n" + "ldp q30, q28, [x0, #32]\n" + "ldr s3, [x3, #4]\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v6.4s, v29.4s, v1.s[0]\n" + + "ldr q0, [x0, #64]!\n" + "subs x2, x2, #2\n" + "ldr q29, [x0, #16]\n" + "ldr s1, [x3, #8]!\n" + "fmla v5.4s, v30.4s, v3.s[0]\n" + "fmla v6.4s, v28.4s, v3.s[0]\n" + "bne 0b\n" + "str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(itm_0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v28", "v29", "v30", "x0", + "x1", "x2", "x3"); + } + // out trans + // O*(6*6)*hw1*o8 => NOHWo8 + U32 h = hw / tile_w; + U32 w = hw % tile_w; + F32 *out_0 = + outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + h * 4 * ow * 8 + w * 4 * 8; + + F32 *Ow_0[36]; + F32 *Ow_1[36]; + F32 *O_0[16]; + F32 *O_1[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 8; + Ow_1[idx] = Ow_0[idx] + 4; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = O_0[i * 4 + j] + 4; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, 
b_1, h, w, pad_h_mod_4, pad_w_mod_4,
+                    tile_h - 1, tile_w - 1, activationDesc));
+            }
+        }
+    }
+    return ret;
+#else
+    // TODO
+    UNI_ERROR_LOG("[ERROR] Winograd convolution is not supported on ARMv7 yet\n");
+#endif
+}
diff --git a/compute/tensor/src/cpu/arm/fp32/convolution_winograd_transform.h b/compute/tensor/src/cpu/arm/fp32/convolution_winograd_transform.h
new file mode 100644
index 00000000..0b203e33
--- /dev/null
+++ b/compute/tensor/src/cpu/arm/fp32/convolution_winograd_transform.h
@@ -0,0 +1,265 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_WINOGRAD_TRANSFORM_FP32
+#define _H_WINOGRAD_TRANSFORM_FP32
+
+#ifdef _USE_FP32
+#include <math.h>
+#include <arm_neon.h>
+#include "cpu/arm/fp32/arm_functions_fp32.h"
+
+// weight transform for Winograd F(4x4, 3x3): each 3x3 kernel becomes a 6x6 tile
+inline void trans_W_4x4_3x3(float *WTM[36], float *W[9])
+{
+    float T[6][3][4];
+
+    float32x4_t v_01666 = vmovq_n_f32(0.1666666666666667f);
+    float32x4_t v_minus_01666 = vmovq_n_f32(-0.1666666666666667f);
+    float32x4_t v_00833 = vmovq_n_f32(0.0833333333333333f);
+    float32x4_t v_minus_00833 = vmovq_n_f32(-0.0833333333333333f);
+    float32x4_t v_004166 = vmovq_n_f32(0.0416666666666667f);
+    float32x4_t v_025 = vmovq_n_f32(0.25f);
+
+    for (int i = 0; i < 3; i++) {
+        float32x4_t v_W0 = vld1q_f32(W[0 * 3 + i]);
+        float32x4_t v_W1 = vld1q_f32(W[1 * 3 + i]);
+        float32x4_t v_W2 = vld1q_f32(W[2 * 3 + i]);
+
+        float32x4_t v_t0 = vmulq_f32(v_01666, v_W2);
+        float32x4_t v_t1 = vsubq_f32(vmulq_f32(v_minus_01666, v_W0), v_t0);
+        float32x4_t v_t2 = vfmaq_f32(v_t0, v_004166, v_W0);
+
+        float32x4_t v_T0 = vmulq_f32(v_025, v_W0);
+        float32x4_t v_T1 = vfmaq_f32(v_t1, v_minus_01666, v_W1);
+        float32x4_t v_T2 = vfmaq_f32(v_t1, v_01666, v_W1);
+        float32x4_t v_T3 = vfmaq_f32(v_t2, v_00833, v_W1);
+        float32x4_t v_T4 = vfmaq_f32(v_t2, v_minus_00833, v_W1);
+
+        vst1q_f32(T[0][i], v_T0);
+        vst1q_f32(T[1][i], v_T1);
+        vst1q_f32(T[2][i], v_T2);
+        vst1q_f32(T[3][i], v_T3);
+        vst1q_f32(T[4][i], v_T4);
+        vst1q_f32(T[5][i], v_W2);
+    }
+    for (int i = 0; i < 6; i++) {
+        float32x4_t v_T0 = vld1q_f32(T[i][0]);
+        float32x4_t v_T1 = vld1q_f32(T[i][1]);
+        float32x4_t v_T2 = vld1q_f32(T[i][2]);
+
+        float32x4_t v_t0 = vmulq_f32(v_01666, v_T2);
+        float32x4_t v_t1 = vsubq_f32(vmulq_f32(v_minus_01666, v_T0), v_t0);
+        float32x4_t v_t2 = vfmaq_f32(v_t0, v_004166, v_T0);
+
+        float32x4_t v_WTM0 = vmulq_f32(v_025, v_T0);
+        float32x4_t v_WTM1 = vfmaq_f32(v_t1, v_minus_01666, v_T1);
+        float32x4_t v_WTM2 = vfmaq_f32(v_t1, v_01666, v_T1);
+        float32x4_t v_WTM3 =
vfmaq_f32(v_t2, v_00833, v_T1); + float32x4_t v_WTM4 = vfmaq_f32(v_t2, v_minus_00833, v_T1); + + vst1q_f32(WTM[i * 6 + 0], v_WTM0); + vst1q_f32(WTM[i * 6 + 1], v_WTM1); + vst1q_f32(WTM[i * 6 + 2], v_WTM2); + vst1q_f32(WTM[i * 6 + 3], v_WTM3); + vst1q_f32(WTM[i * 6 + 4], v_WTM4); + vst1q_f32(WTM[i * 6 + 5], v_T2); + } +} + +inline EE trans_O_4x4_3x3(float *OTM[36], + float *O[16], + const float *bias, + U32 h, + U32 w, + U32 _pad_h_mod_4, + U32 _pad_w_mod_4, + U32 oh, + U32 ow, + ActivationParamSpec activationDesc) +{ + float T[4][6][4]; + // bias + float32x4_t v_b = vld1q_f32(bias); + + float32x4_t v_0 = vmovq_n_f32(0); + float32x4_t v_2 = vmovq_n_f32(2); + float32x4_t v_4 = vmovq_n_f32(4); + float32x4_t v_8 = vmovq_n_f32(8); + + for (int i = 0; i < 6; i++) { + float32x4_t v_OTM0 = vld1q_f32(OTM[i]); + float32x4_t v_OTM1 = vld1q_f32(OTM[1 * 6 + i]); + float32x4_t v_OTM2 = vld1q_f32(OTM[2 * 6 + i]); + float32x4_t v_OTM3 = vld1q_f32(OTM[3 * 6 + i]); + float32x4_t v_OTM4 = vld1q_f32(OTM[4 * 6 + i]); + float32x4_t v_OTM5 = vld1q_f32(OTM[5 * 6 + i]); + + float32x4_t v_t0 = vaddq_f32(v_OTM1, v_OTM2); + float32x4_t v_t1 = vaddq_f32(v_OTM3, v_OTM4); + float32x4_t v_t2 = vsubq_f32(v_OTM1, v_OTM2); + float32x4_t v_t3 = vsubq_f32(v_OTM3, v_OTM4); + + float32x4_t v_T0 = vaddq_f32(vaddq_f32(v_t0, v_t1), v_OTM0); + float32x4_t v_T1 = vfmaq_f32(v_t2, v_t3, v_2); + float32x4_t v_T2 = vfmaq_f32(v_t0, v_t1, v_4); + float32x4_t v_T3 = vaddq_f32(vfmaq_f32(v_t2, v_t3, v_8), v_OTM5); + + vst1q_f32(T[0][i], v_T0); + vst1q_f32(T[1][i], v_T1); + vst1q_f32(T[2][i], v_T2); + vst1q_f32(T[3][i], v_T3); + } + + U32 pad_h_mod_4 = 0, pad_w_mod_4 = 0; + if (h == oh && w == ow) { + pad_h_mod_4 = _pad_h_mod_4; + pad_w_mod_4 = _pad_w_mod_4; + } else if (h == oh) { + pad_h_mod_4 = _pad_h_mod_4; + } else if (w == ow) { + pad_w_mod_4 = _pad_w_mod_4; + } + + for (U32 i = 0; i < 4 - pad_h_mod_4; i++) { + float32x4_t v_T0 = vld1q_f32(T[i][0]); + float32x4_t v_T1 = vld1q_f32(T[i][1]); + float32x4_t v_T2 = vld1q_f32(T[i][2]); + float32x4_t v_T3 = vld1q_f32(T[i][3]); + float32x4_t v_T4 = vld1q_f32(T[i][4]); + float32x4_t v_T5 = vld1q_f32(T[i][5]); + + float32x4_t v_t0 = vaddq_f32(v_T1, v_T2); + float32x4_t v_t1 = vaddq_f32(v_T3, v_T4); + float32x4_t v_t2 = vsubq_f32(v_T1, v_T2); + float32x4_t v_t3 = vsubq_f32(v_T3, v_T4); + + float32x4_t v_O0 = vaddq_f32(vaddq_f32(v_t0, v_t1), v_T0); + float32x4_t v_O1 = vfmaq_f32(v_t2, v_t3, v_2); + float32x4_t v_O2 = vfmaq_f32(v_t0, v_t1, v_4); + float32x4_t v_O3 = vaddq_f32(vfmaq_f32(v_t2, v_t3, v_8), v_T5); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: { + if (pad_w_mod_4 == 0) { + vst1q_f32(O[i * 4 + 0], vaddq_f32(v_O0, v_b)); + vst1q_f32(O[i * 4 + 1], vaddq_f32(v_O1, v_b)); + vst1q_f32(O[i * 4 + 2], vaddq_f32(v_O2, v_b)); + vst1q_f32(O[i * 4 + 3], vaddq_f32(v_O3, v_b)); + } else if (pad_w_mod_4 == 1) { + vst1q_f32(O[i * 4 + 0], vaddq_f32(v_O0, v_b)); + vst1q_f32(O[i * 4 + 1], vaddq_f32(v_O1, v_b)); + vst1q_f32(O[i * 4 + 2], vaddq_f32(v_O2, v_b)); + } else if (pad_w_mod_4 == 2) { + vst1q_f32(O[i * 4 + 0], vaddq_f32(v_O0, v_b)); + vst1q_f32(O[i * 4 + 1], vaddq_f32(v_O1, v_b)); + } else if (pad_w_mod_4 == 3) { + vst1q_f32(O[i * 4 + 0], vaddq_f32(v_O0, v_b)); + } + break; + } + case ACTIVATION_RELU: { + if (pad_w_mod_4 == 0) { + vst1q_f32(O[i * 4 + 0], vmaxq_f32(vaddq_f32(v_O0, v_b), v_0)); + vst1q_f32(O[i * 4 + 1], vmaxq_f32(vaddq_f32(v_O1, v_b), v_0)); + vst1q_f32(O[i * 4 + 2], vmaxq_f32(vaddq_f32(v_O2, v_b), v_0)); + vst1q_f32(O[i * 4 + 3], vmaxq_f32(vaddq_f32(v_O3, v_b), v_0)); + } 
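+                // pad_w_mod_4 rightmost columns of the 4x4 tile fall outside the
+                // real output, so each branch below stores one fewer vector per
+                // padded column (rows are trimmed by the 4 - pad_h_mod_4 loop bound)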
else if (pad_w_mod_4 == 1) { + vst1q_f32(O[i * 4 + 0], vmaxq_f32(vaddq_f32(v_O0, v_b), v_0)); + vst1q_f32(O[i * 4 + 1], vmaxq_f32(vaddq_f32(v_O1, v_b), v_0)); + vst1q_f32(O[i * 4 + 2], vmaxq_f32(vaddq_f32(v_O2, v_b), v_0)); + } else if (pad_w_mod_4 == 2) { + vst1q_f32(O[i * 4 + 0], vmaxq_f32(vaddq_f32(v_O0, v_b), v_0)); + vst1q_f32(O[i * 4 + 1], vmaxq_f32(vaddq_f32(v_O1, v_b), v_0)); + } else if (pad_w_mod_4 == 3) { + vst1q_f32(O[i * 4 + 0], vmaxq_f32(vaddq_f32(v_O0, v_b), v_0)); + } + break; + } + default: + return NOT_SUPPORTED; + } + } + return SUCCESS; +} + +inline void trans_I_4x4_3x3(float *ITM[36], float *I[36]) +{ + float T[6][6][4]; + + float32x4_t v_4 = vmovq_n_f32(4); + float32x4_t v_minus_4 = vmovq_n_f32(-4); + float32x4_t v_2 = vmovq_n_f32(2); + float32x4_t v_minus_5 = vmovq_n_f32(-5); + + for (int i = 0; i < 6; i++) { + float32x4_t v_I0 = vld1q_f32(I[0 * 6 + i]); + float32x4_t v_I1 = vld1q_f32(I[1 * 6 + i]); + float32x4_t v_I2 = vld1q_f32(I[2 * 6 + i]); + float32x4_t v_I3 = vld1q_f32(I[3 * 6 + i]); + float32x4_t v_I4 = vld1q_f32(I[4 * 6 + i]); + float32x4_t v_I5 = vld1q_f32(I[5 * 6 + i]); + + float32x4_t v_t0 = vfmaq_f32(v_I4, v_I2, v_minus_4); + float32x4_t v_t1 = vfmaq_f32(v_I3, v_I1, v_minus_4); + float32x4_t v_t2 = vsubq_f32(v_I4, v_I2); + float32x4_t v_t3 = vmulq_f32(vsubq_f32(v_I3, v_I1), v_2); + float32x4_t v_t4 = vfmaq_f32(v_I4, v_I0, v_4); + float32x4_t v_t5 = vfmaq_f32(v_I5, v_I1, v_4); + + float32x4_t v_T0 = vfmaq_f32(v_t4, v_I2, v_minus_5); + float32x4_t v_T1 = vaddq_f32(v_t1, v_t0); + float32x4_t v_T2 = vsubq_f32(v_t0, v_t1); + float32x4_t v_T3 = vaddq_f32(v_t3, v_t2); + float32x4_t v_T4 = vsubq_f32(v_t2, v_t3); + float32x4_t v_T5 = vfmaq_f32(v_t5, v_I3, v_minus_5); + + vst1q_f32(T[0][i], v_T0); + vst1q_f32(T[1][i], v_T1); + vst1q_f32(T[2][i], v_T2); + vst1q_f32(T[3][i], v_T3); + vst1q_f32(T[4][i], v_T4); + vst1q_f32(T[5][i], v_T5); + } + + for (int i = 0; i < 6; i++) { + float32x4_t v_T0 = vld1q_f32(T[i][0]); + float32x4_t v_T1 = vld1q_f32(T[i][1]); + float32x4_t v_T2 = vld1q_f32(T[i][2]); + float32x4_t v_T3 = vld1q_f32(T[i][3]); + float32x4_t v_T4 = vld1q_f32(T[i][4]); + float32x4_t v_T5 = vld1q_f32(T[i][5]); + + float32x4_t v_t0 = vfmaq_f32(v_T4, v_T2, v_minus_4); + float32x4_t v_t1 = vfmaq_f32(v_T3, v_T1, v_minus_4); + float32x4_t v_t2 = vsubq_f32(v_T4, v_T2); + float32x4_t v_t3 = vmulq_f32(vsubq_f32(v_T3, v_T1), v_2); + float32x4_t v_t4 = vfmaq_f32(v_T4, v_T0, v_4); + float32x4_t v_t5 = vfmaq_f32(v_T5, v_T1, v_4); + + float32x4_t v_ITM0 = vfmaq_f32(v_t4, v_T2, v_minus_5); + float32x4_t v_ITM1 = vaddq_f32(v_t1, v_t0); + float32x4_t v_ITM2 = vsubq_f32(v_t0, v_t1); + float32x4_t v_ITM3 = vaddq_f32(v_t3, v_t2); + float32x4_t v_ITM4 = vsubq_f32(v_t2, v_t3); + float32x4_t v_ITM5 = vfmaq_f32(v_t5, v_T3, v_minus_5); + + vst1q_f32(ITM[i * 6 + 0], v_ITM0); + vst1q_f32(ITM[i * 6 + 1], v_ITM1); + vst1q_f32(ITM[i * 6 + 2], v_ITM2); + vst1q_f32(ITM[i * 6 + 3], v_ITM3); + vst1q_f32(ITM[i * 6 + 4], v_ITM4); + vst1q_f32(ITM[i * 6 + 5], v_ITM5); + } +} +#endif +#endif diff --git a/compute/tensor/src/cpu/arm/fp32/deconvolution_transform.cpp b/compute/tensor/src/cpu/arm/fp32/deconvolution_transform.cpp new file mode 100644 index 00000000..4ed5962a --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/deconvolution_transform.cpp @@ -0,0 +1,89 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
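+
+// Filter-layout dispatch for deconvolution (deconvolution_transform_filter_fp32
+// below): GEMM and GEMM_ICNCHW request DF_NHWCN8, WINOGRAD requests DF_HWNCN8
+// with 6x6 pre-transformed kernels, and GROUP_DECONV requests DF_NCHWC8.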
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/transform_functions.h" +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +inline EE deconvolution_transform_filter_kernel_fp32(TensorDesc filterDesc, + const F32 *filterArray, + TensorDesc *ftmDesc, + F32 *ftmArray, + DataFormat ftmDataFormat) +{ + if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + if (fdf == ftmDataFormat) { + *ftmDesc = filterDesc; + memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + return SUCCESS; + } + if (fdf != DF_NCHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + EE ret = SUCCESS; + switch (ftmDataFormat) { + case DF_NHWCN8: { + *ftmDesc = tensor4df(fdt, ftmDataFormat, fc, fn, fh, fw); + transformCNHWToNHWCNx(filterDesc, filterArray, *ftmDesc, ftmArray); + break; + } + case DF_HWNCN8: { + *ftmDesc = tensor4df(fdt, ftmDataFormat, fc, fn, 6, 6); + transformCNHWToHWNCNx(filterDesc, filterArray, *ftmDesc, ftmArray); + break; + } + case DF_NCHWC8: { + *ftmDesc = tensor4df(fdt, DF_NCHWC8, fn, fc, fh, fw); + transformCNHWToNCHWC8(filterDesc, filterArray, *ftmDesc, ftmArray); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE deconvolution_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed) +{ + DataFormat ftmDataFormat; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_GEMM: + ftmDataFormat = DF_NHWCN8; + break; + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: + ftmDataFormat = DF_NHWCN8; + break; + case CONVOLUTION_ALGORITHM_WINOGRAD: + ftmDataFormat = DF_HWNCN8; + break; + case CONVOLUTION_ALGORITHM_GROUP_DECONV: + ftmDataFormat = DF_NCHWC8; + break; + default: + return NOT_MATCH; + } + EE ret = deconvolution_transform_filter_kernel_fp32( + filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat); + CHECK_STATUS(ret); + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution.cpp new file mode 100644 index 00000000..530383ae --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution.cpp @@ -0,0 +1,75 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
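+
+// Dispatcher for the fused depthwise + pointwise convolution: checks FP32 dtypes,
+// NCHWC8 input/output and matching channel counts, then routes to the AArch64 (V8)
+// or ARMv7 (V7) direct kernel.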
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#include "cpu/arm/fp32/depthwise_pointwise_convolution.h" + +EE depthwise_pointwise_convolution_fp32(TensorDesc inputDesc, + F32 *input, + TensorDesc dwFilterDesc, + const F32 *dwFilter, + TensorDesc pwFilterDesc, + const F32 *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const F32 *dwBias, + TensorDesc pwBiasDesc, + const F32 *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + UNUSED(arch); + if (nullptr == input || nullptr == dwFilter || nullptr == output || nullptr == dwBias || + nullptr == tmp) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(idf == DF_NCHWC8 && odf == DF_NCHWC8)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(ic == fc)) { + CHECK_STATUS(NOT_MATCH); + } + + EE ret = NOT_MATCH; + if (algorithm == DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT || + algorithm == DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT) { +#ifdef __aarch64__ + ret = depthwise_pointwise_convolution_direct_V8(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, tmpBytes, + tmp, outputDesc, output, depthwiseActivationParamSpec, pointwiseActivationParamSpec); +#else + ret = depthwise_pointwise_convolution_direct_V7(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, tmpBytes, + tmp, outputDesc, output, depthwiseActivationParamSpec, pointwiseActivationParamSpec); +#endif + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution.h b/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution.h new file mode 100644 index 00000000..f27d0db3 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution.h @@ -0,0 +1,59 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. 
All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_DEPTHWISE_POINTWISE_CONVOLUTION +#define _H_DEPTHWISE_POINTWISE_CONVOLUTION + +#include "sys.h" +#include "tensor_desc.h" +#include "types.h" + +#ifdef __aarch64__ +EE depthwise_pointwise_convolution_direct_V8(TensorDesc inputDesc, + F32 *inArray, + TensorDesc dwFilterDesc, + const F32 *dwFilterArray, + TensorDesc pwFilterDesc, + const F32 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F32 *dwBiasArray, + TensorDesc pwBiasDesc, + const F32 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec); +#else +EE depthwise_pointwise_convolution_direct_V7(TensorDesc inputDesc, + F32 *inArray, + TensorDesc dwFilterDesc, + const F32 *dwFilterArray, + TensorDesc pwFilterDesc, + const F32 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F32 *dwBiasArray, + TensorDesc pwBiasDesc, + const F32 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec); +#endif + +#endif diff --git a/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V7.cpp b/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V7.cpp new file mode 100644 index 00000000..4848caba --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V7.cpp @@ -0,0 +1,699 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef __aarch64__
+#include "cpu/arm/fp32/depthwise_pointwise_convolution.h"
+
+EE depthwise_pointwise_convolution_direct_V7(TensorDesc inputDesc,
+    F32 *inArray,
+    TensorDesc dwFilterDesc,
+    const F32 *dwFilterArray,
+    TensorDesc pwFilterDesc,
+    const F32 *pwFilterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc dwBiasDesc,
+    const F32 *dwBiasArray,
+    TensorDesc pwBiasDesc,
+    const F32 *pwBiasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F32 *outArray,
+    ActivationParamSpec depthwiseActivationParamSpec,
+    ActivationParamSpec pointwiseActivationParamSpec)
+{
+    UNUSED(dwBiasDesc);
+    UNUSED(pwBiasDesc);
+    UNUSED(tmpBytes);
+
+    DataType idt, fdt, odt;
+    DataFormat idf, fdf, odf;
+    U32 in, ic, ih, iw;
+    U32 fn, fc, fh, fw;
+    U32 on, oc, oh, ow;
+    CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+    CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw));
+    CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
+    U32 strideH = convParamSpec.stride_h;
+    U32 strideW = convParamSpec.stride_w;
+    U32 paddingT = convParamSpec.padding_top;
+    U32 paddingB = convParamSpec.padding_bottom;
+    U32 paddingL = convParamSpec.padding_left;
+    U32 paddingR = convParamSpec.padding_right;
+    U32 dilateH = convParamSpec.dilatedRate_h;
+    U32 dilateW = convParamSpec.dilatedRate_w;
+
+    if (dwFilterDesc.df != DF_NCHWC8) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+    if (pwFilterArray != nullptr && pwFilterDesc.df != DF_NHWCN8) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+
+    oc /= 8;
+    ic /= 8;
+
+    U32 ih_pad = ih + paddingT + paddingB;
+    U32 iw_pad = iw + paddingL + paddingR;
+    U32 ihiw = ih * iw;
+    I32 ohow = oh * ow;
+    F32 *pwArray = (F32 *)tmp + ic * ih_pad * iw_pad * 8;
+
+    for (U32 n = 0; n < in; n++) {
+        // copy input into a padded input buffer
+        F32 *inArray_pad = (F32 *)tmp;
+        F32 *inArray_pad_mov = inArray_pad;
+        F32 *inArray_mov = inArray + n * ic * ihiw * 8;
+        for (U32 c = 0; c < ic; c++) {
+            if (paddingT > 0) {
+                memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt));
+                inArray_pad_mov += paddingT * iw_pad * 8;
+            }
+            for (U32 h = paddingT; h < ih_pad - paddingB; h++) {
+                memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt));
+                inArray_pad_mov += paddingL * 8;
+                memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt));
+                inArray_pad_mov += iw * 8;
+                inArray_mov += iw * 8;
+                memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt));
+                inArray_pad_mov += paddingR * 8;
+            }
+            if (paddingB > 0) {
+                memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt));
+                inArray_pad_mov += paddingB * iw_pad * 8;
+            }
+
+            const F32 *b = dwBiasArray + c * 8;
+            F32 *in_pad = inArray_pad + c * ih_pad * iw_pad * 8;
+            const F32 *f = dwFilterArray + c * fh * fw * 8;
+            // ohow / 4
+            for (I32 hw = 0; hw < ohow - 3; hw += 4) {
+                U32 in_h_0 = hw / ow * strideH;
+                U32 in_w_0 = hw % ow * strideW;
+                U32 in_h_1 = (hw + 1) / ow * strideH;
+                U32 in_w_1 = (hw + 1) % ow * strideW;
+                U32 in_h_2 = (hw + 2) / ow * strideH;
+                U32 in_w_2 = (hw + 2) % ow * strideW;
+                U32 in_h_3 = (hw + 3) / ow * strideH;
+                U32 in_w_3 = (hw + 3) % ow * strideW;
+
+                __asm__ __volatile__(
+                    "vld1.f32 {d0-d3}, [%[b]]\n"
+                    "vmov.f32 q2, q0\n"
+                    "vmov.f32 q3, q1\n"
+                    "vmov.f32 q4, q0\n"
+                    "vmov.f32 q5, q1\n"
+                    "vmov.f32 q6, q0\n"
"vmov.f32 q7, q1\n" + : + : [b] "r"(b) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F32 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F32 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F32 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F32 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F32 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F32 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + + __asm__ __volatile__("vld1.f32 {d28-d31}, [%[f0]]\n" + "vld1.f32 {d16-d19}, [%[in0]]\n" + "vld1.f32 {d20-d23}, [%[in1]]\n" + "vld1.f32 {d24-d27}, [%[in2]]\n" + + "vmla.f32 q0, q8, q14\n" + "vmla.f32 q1, q9, q15\n" + "vld1.f32 {d16-d19}, [%[in3]]\n" + "vmla.f32 q2, q10, q14\n" + "vmla.f32 q3, q11, q15\n" + "vmla.f32 q4, q12, q14\n" + "vmla.f32 q5, q13, q15\n" + "vmla.f32 q6, q8, q14\n" + "vmla.f32 q7, q9, q15\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), + [in3] "r"(in_3), [f0] "r"(f_0) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("veor q15, q15, q15\n" // zero + "vmax.f32 q0, q0, q15\n" + "vmax.f32 q1, q1, q15\n" + "vmax.f32 q2, q2, q15\n" + "vmax.f32 q3, q3, q15\n" + "vmax.f32 q4, q4, q15\n" + "vmax.f32 q5, q5, q15\n" + "vmax.f32 q6, q6, q15\n" + "vmax.f32 q7, q7, q15\n" + : + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q15"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("veor q15, q15, q15\n" // zero + "vmov.f32 q14, #6.0\n" // six + "vmax.f32 q0, q0, q15\n" + "vmax.f32 q1, q1, q15\n" + "vmax.f32 q2, q2, q15\n" + "vmax.f32 q3, q3, q15\n" + "vmax.f32 q4, q4, q15\n" + "vmax.f32 q5, q5, q15\n" + "vmax.f32 q6, q6, q15\n" + "vmax.f32 q7, q7, q15\n" + + "vmin.f32 q0, q0, q14\n" + "vmin.f32 q1, q1, q14\n" + "vmin.f32 q2, q2, q14\n" + "vmin.f32 q3, q3, q14\n" + "vmin.f32 q4, q4, q14\n" + "vmin.f32 q5, q5, q14\n" + "vmin.f32 q6, q6, q14\n" + "vmin.f32 q7, q7, q14\n" + : + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q14", "q15"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__("vmov.f32 q13, #3.0\n" // three + "vmov.f32 q14, #6.0\n" // six + "veor q15, q15, q15\n" // zero + "vadd.f32 q8, q0, q13\n" + "vadd.f32 q9, q1, q13\n" + "vadd.f32 q10, q2, q13\n" + "vadd.f32 q11, q3, q13\n" + "vmax.f32 q8, q8, q15\n" + "vmax.f32 q9, q9, q15\n" + "vmax.f32 q10, q10, q15\n" + "vmax.f32 q11, q11, q15\n" + "vmin.f32 q8, q8, q14\n" + "vmin.f32 q9, q9, q14\n" + "vmin.f32 q10, q10, q14\n" + "vmin.f32 q11, q11, q14\n" + "vrecpe.f32 q12, q14\n" + "vrecps.f32 q14, q14, q12\n" + "vmul.f32 q12, q14, q12\n" + "vmul.f32 q8, q8, q12\n" + "vmul.f32 q9, q9, q12\n" + "vmul.f32 q10, q10, q12\n" + "vmul.f32 q11, q11, q12\n" + "vmov.f32 q14, #6.0\n" // six + "vmul.f32 q0, q0, q8\n" + "vmul.f32 q1, q1, q9\n" + "vmul.f32 q2, q2, q10\n" + "vmul.f32 q3, q3, q11\n" + + "vadd.f32 q8, q4, q13\n" + "vadd.f32 q9, q5, q13\n" + "vadd.f32 q10, q6, q13\n" + "vadd.f32 q11, q7, q13\n" + "vmax.f32 q8, q8, q15\n" + "vmax.f32 q9, q9, q15\n" + "vmax.f32 q10, q10, q15\n" + "vmax.f32 q11, q11, q15\n" + "vmin.f32 q8, q8, q14\n" + "vmin.f32 q9, q9, q14\n" + "vmin.f32 q10, q10, q14\n" + "vmin.f32 q11, q11, q14\n" + "vrecpe.f32 q12, q14\n" + "vrecps.f32 
q14, q14, q12\n" + "vmul.f32 q12, q14, q12\n" + "vmul.f32 q8, q8, q12\n" + "vmul.f32 q9, q9, q12\n" + "vmul.f32 q10, q10, q12\n" + "vmul.f32 q11, q11, q12\n" + "vmul.f32 q4, q4, q8\n" + "vmul.f32 q5, q5, q9\n" + "vmul.f32 q6, q6, q10\n" + "vmul.f32 q7, q7, q11\n" + : + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (pwFilterArray != nullptr) { + F32 *pw_pack_0 = pwArray + hw * ic * 8 + c * 4 * 8; + __asm__ __volatile__( + "vzip.32 q0, q4\n" + "vzip.32 q2, q6\n" + "vzip.32 q1, q5\n" + "vzip.32 q3, q7\n" + + "vzip.32 q0, q2\n" + "vzip.32 q4, q6\n" + "vzip.32 q1, q3\n" + "vzip.32 q5, q7\n" + + "vst1.f32 {q0}, [%[pw0]]!\n" + "vst1.f32 {q2}, [%[pw0]]!\n" + "vst1.f32 {q4}, [%[pw0]]!\n" + "vst1.f32 {q6}, [%[pw0]]!\n" + "vst1.f32 {q1}, [%[pw0]]!\n" + "vst1.f32 {q3}, [%[pw0]]!\n" + "vst1.f32 {q5}, [%[pw0]]!\n" + "vst1.f32 {q7}, [%[pw0]]!\n" + : [pw0] "+r"(pw_pack_0) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + } else { + F32 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + __asm__ __volatile__( + "vstm %[out], {d0-d15}\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + } + } + + // ohow_reminder % 4 + U32 ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + + __asm__ __volatile__("vld1.f32 {d0-d3}, [%[b]]\n" + : + : [b] "r"(b) + : "memory", "cc", "q0", "q1"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F32 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F32 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F32 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + __asm__ __volatile__( + "vld1.f32 {d28-d31}, [%[f0]]\n" + "vld1.f32 {d24-d27}, [%[in0]]\n" + + "vmla.f32 q0, q12, q14\n" + "vmla.f32 q1, q13, q15\n" + : + : [in0] "r"(in_0), [f0] "r"(f_0) + : "memory", "cc", "q0", "q1", "q12", "q13", "q14", "q15"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("veor q15, q15, q15\n" // zero + "vmax.f32 q0, q0, q15\n" + "vmax.f32 q1, q1, q15\n" + : + : + : "memory", "cc", "q0", "q1", "q15"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("veor q15, q15, q15\n" // zero + "vmov.f32 q14, #6.0\n" // six + "vmax.f32 q0, q0, q15\n" + "vmax.f32 q1, q1, q15\n" + + "vmin.f32 q0, q0, q14\n" + "vmin.f32 q1, q1, q14\n" + : + : + : "memory", "cc", "q0", "q1", "q14", "q15"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__( + "vmov.f32 q13, #3.0\n" // three + "vmov.f32 q14, #6.0\n" // six + "veor q15, q15, q15\n" // zero + "vadd.f32 q11, q0, q13\n" + "vadd.f32 q12, q1, q13\n" + + "vmax.f32 q11, q11, q15\n" + "vmax.f32 q12, q12, q15\n" + + "vmin.f32 q11, q11, q14\n" + "vmin.f32 q12, q12, q14\n" + + "vrecpe.f32 q13, q14\n" + "vrecps.f32 q14, q14, q13\n" + "vmul.f32 q14, q14, q13\n" + "vmul.f32 q11, q11, q14\n" + "vmul.f32 q12, q12, q14\n" + + "vmul.f32 q0, q0, q11\n" + "vmul.f32 q1, q1, q12\n" + : + : + : "memory", "cc", "q0", "q1", "q11", "q12", "q13", "q14", "q15"); + break; + } + default: + return NOT_SUPPORTED; + } + + F32 *out_ptr; + if (pwFilterArray != nullptr) { + out_ptr = pwArray + hw * ic * 8 + c * 8; + } else { + out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + } + __asm__ 
__volatile__("vst1.f32 {d0-d3}, [%[pw0]]\n" + : [pw0] "+r"(out_ptr) + : + : "memory", "cc", "q0", "q1"); + } + } + + if (pwFilterArray == nullptr) { + continue; + } + // pw_conv + // ohow / 4 + for (I32 hw = 0; hw < ohow - 3; hw += 4) { + const F32 *b0 = pwBiasArray; + const F32 *b1 = b0 + 4; + F32 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = pwFilterArray + o * 8 * ic * 8; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__("vld1.f32 {d0-d1}, [%[b_0]]\n" + "vld1.f32 {d2-d3}, [%[b_1]]\n" + "vld1.f32 {d12-d13}, [%[in_0]]!\n" + "vld1.f32 {d20-d23}, [%[f_0]]!\n" + + "vmov.f32 q2, q0\n" + "vmov.f32 q4, q0\n" + "vmov.f32 q8, q0\n" + + "mov r2, %[ic]\n" + + "vmov.f32 q3, q1\n" + "vmov.f32 q5, q1\n" + "vmov.f32 q9, q1\n" + + "0:\n" + "vmla.f32 q0, q10, d12[0]\n" + "vmla.f32 q2, q10, d12[1]\n" + "vmla.f32 q4, q10, d13[0]\n" + "vmla.f32 q8, q10, d13[1]\n" + + "vld1.f32 {d14-d15}, [%[in_0]]!\n" + "vld1.f32 {d20-d21}, [%[f_0]]!\n" + + "vmla.f32 q1, q11, d12[0]\n" + "vmla.f32 q3, q11, d12[1]\n" + "vmla.f32 q5, q11, d13[0]\n" + "vmla.f32 q9, q11, d13[1]\n" + + "vld1.f32 {d22-d23}, [%[f_0]]!\n" + "subs r2, r2, #2\n" + + "vmla.f32 q0, q10, d14[0]\n" + "vmla.f32 q2, q10, d14[1]\n" + "vmla.f32 q4, q10, d15[0]\n" + "vmla.f32 q8, q10, d15[1]\n" + + "vld1.f32 {d12-d13}, [%[in_0]]!\n" + "vld1.f32 {d20-d21}, [%[f_0]]!\n" + + "vmla.f32 q1, q11, d14[0]\n" + "vmla.f32 q3, q11, d14[1]\n" + "vmla.f32 q5, q11, d15[0]\n" + "vmla.f32 q9, q11, d15[1]\n" + + "vld1.f32 {d22-d23}, [%[f_0]]!\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", + "q7", "q8", "q9", "q10", "q11", "r2"); + + // activation + switch (pointwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("veor q15, q15, q15\n" // zero + "vmax.f32 q0, q0, q15\n" + "vmax.f32 q1, q1, q15\n" + "vmax.f32 q2, q2, q15\n" + "vmax.f32 q3, q3, q15\n" + "vmax.f32 q4, q4, q15\n" + "vmax.f32 q5, q5, q15\n" + "vmax.f32 q8, q8, q15\n" + "vmax.f32 q9, q9, q15\n" + : + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "q8", "q9", "q15"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("veor q15, q15, q15\n" // zero + "vmov.f32 q14, #6.0\n" // six + "vmax.f32 q0, q0, q15\n" + "vmax.f32 q1, q1, q15\n" + "vmax.f32 q2, q2, q15\n" + "vmax.f32 q3, q3, q15\n" + "vmax.f32 q4, q4, q15\n" + "vmax.f32 q5, q5, q15\n" + "vmax.f32 q8, q8, q15\n" + "vmax.f32 q9, q9, q15\n" + + "vmin.f32 q0, q0, q14\n" + "vmin.f32 q1, q1, q14\n" + "vmin.f32 q2, q2, q14\n" + "vmin.f32 q3, q3, q14\n" + "vmin.f32 q4, q4, q14\n" + "vmin.f32 q5, q5, q14\n" + "vmin.f32 q8, q8, q14\n" + "vmin.f32 q9, q9, q14\n" + : + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "q8", "q9", "q14", "q15"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__("vmov.f32 q6, q8\n" + "vmov.f32 q7, q9\n" + + "vmov.f32 q13, #3.0\n" // three + "vmov.f32 q14, #6.0\n" // six + "veor q15, q15, q15\n" // zero + "vadd.f32 q8, q0, q13\n" + "vadd.f32 q9, q1, q13\n" + "vadd.f32 q10, q2, q13\n" + "vadd.f32 q11, q3, q13\n" + "vmax.f32 q8, q8, q15\n" + "vmax.f32 q9, q9, q15\n" + "vmax.f32 q10, q10, q15\n" + "vmax.f32 q11, q11, q15\n" + "vmin.f32 q8, q8, q14\n" + "vmin.f32 q9, q9, q14\n" + "vmin.f32 q10, q10, q14\n" + "vmin.f32 q11, q11, 
q14\n" + "vrecpe.f32 q12, q14\n" + "vrecps.f32 q14, q14, q12\n" + "vmul.f32 q12, q14, q12\n" + "vmul.f32 q8, q8, q12\n" + "vmul.f32 q9, q9, q12\n" + "vmul.f32 q10, q10, q12\n" + "vmul.f32 q11, q11, q12\n" + "vmov.f32 q14, #6.0\n" // six + "vmul.f32 q0, q0, q8\n" + "vmul.f32 q1, q1, q9\n" + "vmul.f32 q2, q2, q10\n" + "vmul.f32 q3, q3, q11\n" + + "vadd.f32 q8, q4, q13\n" + "vadd.f32 q9, q5, q13\n" + "vadd.f32 q10, q6, q13\n" + "vadd.f32 q11, q7, q13\n" + "vmax.f32 q8, q8, q15\n" + "vmax.f32 q9, q9, q15\n" + "vmax.f32 q10, q10, q15\n" + "vmax.f32 q11, q11, q15\n" + "vmin.f32 q8, q8, q14\n" + "vmin.f32 q9, q9, q14\n" + "vmin.f32 q10, q10, q14\n" + "vmin.f32 q11, q11, q14\n" + "vrecpe.f32 q12, q14\n" + "vrecps.f32 q14, q14, q12\n" + "vmul.f32 q12, q14, q12\n" + "vmul.f32 q8, q8, q12\n" + "vmul.f32 q9, q9, q12\n" + "vmul.f32 q10, q10, q12\n" + "vmul.f32 q11, q11, q12\n" + "vmul.f32 q4, q4, q8\n" + "vmul.f32 q5, q5, q9\n" + "vmul.f32 q6, q6, q10\n" + "vmul.f32 q7, q7, q11\n" + + "vmov.f32 q8, q6\n" + "vmov.f32 q9, q7\n" + : + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15"); + break; + } + default: + return NOT_SUPPORTED; + } + + __asm__ __volatile__( + "vst1.f32 {d0-d3}, [%[out_0]]!\n" + "vst1.f32 {d4-d7}, [%[out_0]]!\n" + "vst1.f32 {d8-d11}, [%[out_0]]!\n" + "vst1.f32 {d16-d19}, [%[out_0]]!\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9"); + b0 += 8; + b1 += 8; + } + } + + // ohow_reminder % 4 + U32 ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F32 *b0 = pwBiasArray; + const F32 *b1 = b0 + 4; + F32 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = pwFilterArray + o * 8 * ic * 8; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__("vld1.f32 {d0-d1}, [%[b_0]]\n" + "vld1.f32 {d2-d3}, [%[b_1]]\n" + "vld1.f32 {d8}, [%[in_0]]!\n" + "vld1.f32 {d4-d7}, [%[f_0]]!\n" + "mov r2, %[ic]\n" + "0:\n" + "vmla.f32 q0, q2, d8[0]\n" + + "vld1.f32 {d4-d5}, [%[f_0]]!\n" + + "vmla.f32 q1, q3, d8[0]\n" + + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + "subs r2, r2, #2\n" + + "vmla.f32 q0, q2, d8[1]\n" + + "vld1.f32 {d4-d5}, [%[f_0]]!\n" + + "vmla.f32 q1, q3, d8[1]\n" + + "vld1.f32 {d8}, [%[in_0]]!\n" + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "r2"); + + switch (pointwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("veor q15, q15, q15\n" // zero + "vmax.f32 q0, q0, q15\n" + "vmax.f32 q1, q1, q15\n" + : + : + : "memory", "cc", "q0", "q1", "q15"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("veor q15, q15, q15\n" // zero + "vmov.f32 q14, #6.0\n" // six + "vmax.f32 q0, q0, q15\n" + "vmax.f32 q1, q1, q15\n" + + "vmin.f32 q0, q0, q14\n" + "vmin.f32 q1, q1, q14\n" + : + : + : "memory", "cc", "q0", "q1", "q14", "q15"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__( + "vmov.f32 q13, #3.0\n" // three + "vmov.f32 q14, #6.0\n" // six + "veor q15, q15, q15\n" // zero + "vadd.f32 q11, q0, q13\n" + "vadd.f32 q12, q1, q13\n" + + "vmax.f32 q11, q11, q15\n" + "vmax.f32 q12, q12, q15\n" + + "vmin.f32 q11, q11, q14\n" + "vmin.f32 q12, q12, q14\n" + + "vrecpe.f32 
q13, q14\n" + "vrecps.f32 q14, q14, q13\n" + "vmul.f32 q14, q14, q13\n" + "vmul.f32 q11, q11, q14\n" + "vmul.f32 q12, q12, q14\n" + + "vmul.f32 q0, q0, q11\n" + "vmul.f32 q1, q1, q12\n" + : + : + : "memory", "cc", "q0", "q1", "q11", "q12", "q13", "q14", "q15"); + break; + } + default: + return NOT_SUPPORTED; + } + + __asm__ __volatile__("vst1.f32 {d0-d3}, [%[out_0]]\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "q0", "q1"); + b0 += 8; + b1 += 8; + } + } + } + return SUCCESS; +} +#endif diff --git a/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V8.cpp b/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V8.cpp new file mode 100644 index 00000000..b3fc32ad --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V8.cpp @@ -0,0 +1,1264 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
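+
+// AArch64 direct kernel: the depthwise stage walks eight output pixels per
+// iteration (ohow / 8) and keeps sixteen accumulators v0-v15 alive for one c8
+// channel block. h-swish, x * clip(x + 3, 0, 6) / 6, is evaluated here with
+// fdiv, whereas the ARMv7 kernel above approximates the multiply by 1/6 with a
+// vrecpe/vrecps Newton-Raphson step.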
+ +#ifdef __aarch64__ +#include "cpu/arm/fp32/depthwise_pointwise_convolution.h" + +EE depthwise_pointwise_convolution_direct_V8(TensorDesc inputDesc, + F32 *inArray, + TensorDesc dwFilterDesc, + const F32 *dwFilterArray, + TensorDesc pwFilterDesc, + const F32 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F32 *dwBiasArray, + TensorDesc pwBiasDesc, + const F32 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) +{ + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (dwFilterDesc.df != DF_NCHWC8) { + CHECK_STATUS(NOT_MATCH); + } + if (pwFilterArray != nullptr && pwFilterDesc.df != DF_NHWCN8) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + U32 ihiw = ih * iw; + I32 ohow = oh * ow; + F32 *pwArray = (F32 *)tmp + ic * ih_pad * iw_pad * 8; + for (U32 n = 0; n < in; n++) { + F32 *inArray_pad = (F32 *)tmp; + F32 *inArray_pad_mov = inArray_pad; + F32 *inArray_mov = inArray + n * ic * ihiw * 8; + for (U32 c = 0; c < ic; c++) { + if (paddingT > 0) { + memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingT * iw_pad * 8; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingL * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingR * 8; + } + if (paddingB > 0) { + memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingB * iw_pad * 8; + } + + // dw_conv + const F32 *b = dwBiasArray + c * 8; + F32 *in_pad = inArray_pad + c * ih_pad * iw_pad * 8; + const F32 *f = dwFilterArray + c * fh * fw * 8; + // ohow / 8 + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + U32 in_h_4 = (hw + 4) / ow * strideH; + U32 in_w_4 = (hw + 4) % ow * strideW; + U32 in_h_5 = (hw + 5) / ow * strideH; + U32 in_w_5 = (hw + 5) % ow * strideW; + U32 in_h_6 = (hw + 6) / ow * strideH; + U32 in_w_6 = (hw + 6) % ow * strideW; + U32 in_h_7 = (hw + 7) / ow * strideH; + U32 in_w_7 = (hw + 7) % ow * strideW; + + __asm__ __volatile__("ldr q14, [%[b]]\n" + "ldr q15, [%[b], #16]\n" + "mov v0.16b, v14.16b\n" + "mov v1.16b, v15.16b\n" + "mov v2.16b, v14.16b\n" + "mov v3.16b, v15.16b\n" + "mov v4.16b, 
v14.16b\n" + "mov v5.16b, v15.16b\n" + "mov v6.16b, v14.16b\n" + "mov v7.16b, v15.16b\n" + "mov v8.16b, v14.16b\n" + "mov v9.16b, v15.16b\n" + "mov v10.16b, v14.16b\n" + "mov v11.16b, v15.16b\n" + "mov v12.16b, v14.16b\n" + "mov v13.16b, v15.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F32 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F32 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F32 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F32 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F32 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F32 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + F32 *in_4 = in_idx + in_h_4 * iw_pad * 8 + in_w_4 * 8; + F32 *in_5 = in_idx + in_h_5 * iw_pad * 8 + in_w_5 * 8; + F32 *in_6 = in_idx + in_h_6 * iw_pad * 8 + in_w_6 * 8; + F32 *in_7 = in_idx + in_h_7 * iw_pad * 8 + in_w_7 * 8; + __asm__ __volatile__("ldp q16, q17, [%[f0]]\n" + "ldp q30, q31, [%[in0]]\n" + "ldp q18, q19, [%[in1]]\n" + "ldp q20, q21, [%[in2]]\n" + "ldp q22, q23, [%[in3]]\n" + "ldp q24, q25, [%[in4]]\n" + "ldp q26, q27, [%[in5]]\n" + "ldp q28, q29, [%[in6]]\n" + + "fmla v0.4s, v30.4s, v16.4s\n" + "fmla v1.4s, v31.4s, v17.4s\n" + "fmla v2.4s, v18.4s, v16.4s\n" + "ldp q30, q31, [%[in7]]\n" + "fmla v3.4s, v19.4s, v17.4s\n" + "fmla v4.4s, v20.4s, v16.4s\n" + "fmla v5.4s, v21.4s, v17.4s\n" + "fmla v6.4s, v22.4s, v16.4s\n" + "fmla v7.4s, v23.4s, v17.4s\n" + "fmla v8.4s, v24.4s, v16.4s\n" + "fmla v9.4s, v25.4s, v17.4s\n" + "fmla v10.4s, v26.4s, v16.4s\n" + "fmla v11.4s, v27.4s, v17.4s\n" + "fmla v12.4s, v28.4s, v16.4s\n" + "fmla v13.4s, v29.4s, v17.4s\n" + "fmla v14.4s, v30.4s, v16.4s\n" + "fmla v15.4s, v31.4s, v17.4s\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), + [in3] "r"(in_3), [in4] "r"(in_4), [in5] "r"(in_5), + [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "v31"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v0.4s, v0.4s, v31.4s\n" + "fmax v1.4s, v1.4s, v31.4s\n" + "fmax v2.4s, v2.4s, v31.4s\n" + "fmax v3.4s, v3.4s, v31.4s\n" + "fmax v4.4s, v4.4s, v31.4s\n" + "fmax v5.4s, v5.4s, v31.4s\n" + "fmax v6.4s, v6.4s, v31.4s\n" + "fmax v7.4s, v7.4s, v31.4s\n" + "fmax v8.4s, v8.4s, v31.4s\n" + "fmax v9.4s, v9.4s, v31.4s\n" + "fmax v10.4s, v10.4s, v31.4s\n" + "fmax v11.4s, v11.4s, v31.4s\n" + "fmax v12.4s, v12.4s, v31.4s\n" + "fmax v13.4s, v13.4s, v31.4s\n" + "fmax v14.4s, v14.4s, v31.4s\n" + "fmax v15.4s, v15.4s, v31.4s\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v31"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v0.4s, v0.4s, v31.4s\n" + "fmax v1.4s, v1.4s, v31.4s\n" + "fmax v2.4s, v2.4s, v31.4s\n" + "fmax v3.4s, v3.4s, v31.4s\n" + "fmax v4.4s, v4.4s, v31.4s\n" + "fmax v5.4s, v5.4s, v31.4s\n" + "fmax v6.4s, v6.4s, v31.4s\n" + "fmax v7.4s, v7.4s, v31.4s\n" 
+ "fmax v8.4s, v8.4s, v31.4s\n" + "fmax v9.4s, v9.4s, v31.4s\n" + "fmax v10.4s, v10.4s, v31.4s\n" + "fmax v11.4s, v11.4s, v31.4s\n" + "fmax v12.4s, v12.4s, v31.4s\n" + "fmax v13.4s, v13.4s, v31.4s\n" + "fmax v14.4s, v14.4s, v31.4s\n" + "fmax v15.4s, v15.4s, v31.4s\n" + + "fmin v0.4s, v0.4s, v30.4s\n" + "fmin v1.4s, v1.4s, v30.4s\n" + "fmin v2.4s, v2.4s, v30.4s\n" + "fmin v3.4s, v3.4s, v30.4s\n" + "fmin v4.4s, v4.4s, v30.4s\n" + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + "fmin v8.4s, v8.4s, v30.4s\n" + "fmin v9.4s, v9.4s, v30.4s\n" + "fmin v10.4s, v10.4s, v30.4s\n" + "fmin v11.4s, v11.4s, v30.4s\n" + "fmin v12.4s, v12.4s, v30.4s\n" + "fmin v13.4s, v13.4s, v30.4s\n" + "fmin v14.4s, v14.4s, v30.4s\n" + "fmin v15.4s, v15.4s, v30.4s\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v30", "v31"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__("fmov v29.4s, 3.0\n" // three + "fmov v30.4s, 6.0\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v22.4s, v0.4s, v29.4s\n" + "fadd v23.4s, v1.4s, v29.4s\n" + "fadd v16.4s, v2.4s, v29.4s\n" + "fadd v17.4s, v3.4s, v29.4s\n" + "fadd v18.4s, v4.4s, v29.4s\n" + "fadd v19.4s, v5.4s, v29.4s\n" + "fadd v20.4s, v6.4s, v29.4s\n" + "fadd v21.4s, v7.4s, v29.4s\n" + + "fmax v22.4s, v22.4s, v31.4s\n" + "fmax v23.4s, v23.4s, v31.4s\n" + "fmax v16.4s, v16.4s, v31.4s\n" + "fmax v17.4s, v17.4s, v31.4s\n" + "fmax v18.4s, v18.4s, v31.4s\n" + "fmax v19.4s, v19.4s, v31.4s\n" + "fmax v20.4s, v20.4s, v31.4s\n" + "fmax v21.4s, v21.4s, v31.4s\n" + + "fmin v22.4s, v22.4s, v30.4s\n" + "fmin v23.4s, v23.4s, v30.4s\n" + "fmin v16.4s, v16.4s, v30.4s\n" + "fmin v17.4s, v17.4s, v30.4s\n" + "fmin v18.4s, v18.4s, v30.4s\n" + "fmin v19.4s, v19.4s, v30.4s\n" + "fmin v20.4s, v20.4s, v30.4s\n" + "fmin v21.4s, v21.4s, v30.4s\n" + + "fdiv v22.4s, v22.4s, v30.4s\n" + "fdiv v23.4s, v23.4s, v30.4s\n" + "fdiv v16.4s, v16.4s, v30.4s\n" + "fdiv v17.4s, v17.4s, v30.4s\n" + "fdiv v18.4s, v18.4s, v30.4s\n" + "fdiv v19.4s, v19.4s, v30.4s\n" + "fdiv v20.4s, v20.4s, v30.4s\n" + "fdiv v21.4s, v21.4s, v30.4s\n" + + "fmul v0.4s, v0.4s, v22.4s\n" + "fmul v1.4s, v1.4s, v23.4s\n" + "fmul v2.4s, v2.4s, v16.4s\n" + "fmul v3.4s, v3.4s, v17.4s\n" + "fmul v4.4s, v4.4s, v18.4s\n" + "fmul v5.4s, v5.4s, v19.4s\n" + "fmul v6.4s, v6.4s, v20.4s\n" + "fmul v7.4s, v7.4s, v21.4s\n" + + "fadd v22.4s, v8.4s, v29.4s\n" + "fadd v23.4s, v9.4s, v29.4s\n" + "fadd v16.4s, v10.4s, v29.4s\n" + "fadd v17.4s, v11.4s, v29.4s\n" + "fadd v18.4s, v12.4s, v29.4s\n" + "fadd v19.4s, v13.4s, v29.4s\n" + "fadd v20.4s, v14.4s, v29.4s\n" + "fadd v21.4s, v15.4s, v29.4s\n" + + "fmax v22.4s, v22.4s, v31.4s\n" + "fmax v23.4s, v23.4s, v31.4s\n" + "fmax v16.4s, v16.4s, v31.4s\n" + "fmax v17.4s, v17.4s, v31.4s\n" + "fmax v18.4s, v18.4s, v31.4s\n" + "fmax v19.4s, v19.4s, v31.4s\n" + "fmax v20.4s, v20.4s, v31.4s\n" + "fmax v21.4s, v21.4s, v31.4s\n" + + "fmin v22.4s, v22.4s, v30.4s\n" + "fmin v23.4s, v23.4s, v30.4s\n" + "fmin v16.4s, v16.4s, v30.4s\n" + "fmin v17.4s, v17.4s, v30.4s\n" + "fmin v18.4s, v18.4s, v30.4s\n" + "fmin v19.4s, v19.4s, v30.4s\n" + "fmin v20.4s, v20.4s, v30.4s\n" + "fmin v21.4s, v21.4s, v30.4s\n" + + "fdiv v22.4s, v22.4s, v30.4s\n" + "fdiv v23.4s, v23.4s, v30.4s\n" + "fdiv v16.4s, v16.4s, v30.4s\n" + "fdiv v17.4s, v17.4s, v30.4s\n" + "fdiv v18.4s, v18.4s, v30.4s\n" + "fdiv v19.4s, v19.4s, v30.4s\n" + "fdiv v20.4s, v20.4s, v30.4s\n" + "fdiv v21.4s, v21.4s, v30.4s\n" + + 
"fmul v8.4s, v8.4s, v22.4s\n" + "fmul v9.4s, v9.4s, v23.4s\n" + "fmul v10.4s, v10.4s, v16.4s\n" + "fmul v11.4s, v11.4s, v17.4s\n" + "fmul v12.4s, v12.4s, v18.4s\n" + "fmul v13.4s, v13.4s, v19.4s\n" + "fmul v14.4s, v14.4s, v20.4s\n" + "fmul v15.4s, v15.4s, v21.4s\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v29", "v30", "v31"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (pwFilterArray != nullptr) { + F32 *pw_pack_0 = pwArray + hw * ic * 8 + c * 8 * 8; + __asm__ __volatile__("zip1 v16.4s, v0.4s, v2.4s\n" + "zip2 v17.4s, v0.4s, v2.4s\n" + "zip1 v18.4s, v4.4s, v6.4s\n" + "zip2 v19.4s, v4.4s, v6.4s\n" + "zip1 v0.2d, v16.2d, v18.2d\n" + "zip2 v2.2d, v16.2d, v18.2d\n" + "zip1 v4.2d, v17.2d, v19.2d\n" + "zip2 v6.2d, v17.2d, v19.2d\n" + + "zip1 v16.4s, v8.4s, v10.4s\n" + "zip2 v17.4s, v8.4s, v10.4s\n" + "zip1 v18.4s, v12.4s, v14.4s\n" + "zip2 v19.4s, v12.4s, v14.4s\n" + "zip1 v8.2d, v16.2d, v18.2d\n" + "zip2 v10.2d, v16.2d, v18.2d\n" + "zip1 v12.2d, v17.2d, v19.2d\n" + "zip2 v14.2d, v17.2d, v19.2d\n" + + "zip1 v16.4s, v1.4s, v3.4s\n" + "zip2 v17.4s, v1.4s, v3.4s\n" + "zip1 v18.4s, v5.4s, v7.4s\n" + "zip2 v19.4s, v5.4s, v7.4s\n" + "zip1 v1.2d, v16.2d, v18.2d\n" + "zip2 v3.2d, v16.2d, v18.2d\n" + "zip1 v5.2d, v17.2d, v19.2d\n" + "zip2 v7.2d, v17.2d, v19.2d\n" + + "zip1 v16.4s, v9.4s, v11.4s\n" + "zip2 v17.4s, v9.4s, v11.4s\n" + "zip1 v18.4s, v13.4s, v15.4s\n" + "zip2 v19.4s, v13.4s, v15.4s\n" + "zip1 v9.2d, v16.2d, v18.2d\n" + "zip2 v11.2d, v16.2d, v18.2d\n" + "zip1 v13.2d, v17.2d, v19.2d\n" + "zip2 v15.2d, v17.2d, v19.2d\n" + + "str q0, [%[pw0]]\n" + "str q8, [%[pw0], #16]\n" + "str q2, [%[pw0], #32]\n" + "str q10, [%[pw0], #48]\n" + "str q4, [%[pw0], #64]\n" + "str q12, [%[pw0], #80]\n" + "str q6, [%[pw0], #96]\n" + "str q14, [%[pw0], #112]\n" + "str q1, [%[pw0], #128]\n" + "str q9, [%[pw0], #144]\n" + "str q3, [%[pw0], #160]\n" + "str q11, [%[pw0], #176]\n" + "str q5, [%[pw0], #192]\n" + "str q13, [%[pw0], #208]\n" + "str q7, [%[pw0], #224]\n" + "str q15, [%[pw0], #240]\n" + : [pw0] "+r"(pw_pack_0) + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19"); + } else { + F32 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + __asm__ __volatile__("stp q0, q1, [%[out]]\n" + "stp q2, q3, [%[out], #32]\n" + "stp q4, q5, [%[out], #64]\n" + "stp q6, q7, [%[out], #96]\n" + "stp q8, q9, [%[out], #128]\n" + "stp q10, q11, [%[out], #160]\n" + "stp q12, q13, [%[out], #192]\n" + "stp q14, q15, [%[out], #224]\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"); + } + } + + // ohow_reminder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + + __asm__ __volatile__( + "ldr q14, [%[b]]\n" + "ldr q15, [%[b], #16]\n" + "mov v0.16b, v14.16b\n" + "mov v1.16b, v15.16b\n" + "mov v2.16b, v14.16b\n" + "mov v3.16b, v15.16b\n" + "mov v4.16b, v14.16b\n" + "mov v5.16b, v15.16b\n" + "mov v6.16b, v14.16b\n" + "mov 
v7.16b, v15.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v14", "v15"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F32 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F32 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F32 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F32 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F32 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F32 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + __asm__ __volatile__("ldp q14, q15, [%[f0]]\n" + "ldp q16, q17, [%[in0]]\n" + "ldp q18, q19, [%[in1]]\n" + "ldp q20, q21, [%[in2]]\n" + "ldp q22, q23, [%[in3]]\n" + + "fmla v0.4s, v16.4s, v14.4s\n" + "fmla v1.4s, v17.4s, v15.4s\n" + "fmla v2.4s, v18.4s, v14.4s\n" + "fmla v3.4s, v19.4s, v15.4s\n" + "fmla v4.4s, v20.4s, v14.4s\n" + "fmla v5.4s, v21.4s, v15.4s\n" + "fmla v6.4s, v22.4s, v14.4s\n" + "fmla v7.4s, v23.4s, v15.4s\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), + [in3] "r"(in_3), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v0.4s, v0.4s, v31.4s\n" + "fmax v1.4s, v1.4s, v31.4s\n" + "fmax v2.4s, v2.4s, v31.4s\n" + "fmax v3.4s, v3.4s, v31.4s\n" + "fmax v4.4s, v4.4s, v31.4s\n" + "fmax v5.4s, v5.4s, v31.4s\n" + "fmax v6.4s, v6.4s, v31.4s\n" + "fmax v7.4s, v7.4s, v31.4s\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v31"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v0.4s, v0.4s, v31.4s\n" + "fmax v1.4s, v1.4s, v31.4s\n" + "fmax v2.4s, v2.4s, v31.4s\n" + "fmax v3.4s, v3.4s, v31.4s\n" + "fmax v4.4s, v4.4s, v31.4s\n" + "fmax v5.4s, v5.4s, v31.4s\n" + "fmax v6.4s, v6.4s, v31.4s\n" + "fmax v7.4s, v7.4s, v31.4s\n" + + "fmin v0.4s, v0.4s, v30.4s\n" + "fmin v1.4s, v1.4s, v30.4s\n" + "fmin v2.4s, v2.4s, v30.4s\n" + "fmin v3.4s, v3.4s, v30.4s\n" + "fmin v4.4s, v4.4s, v30.4s\n" + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v30", "v31"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__("fmov v29.4s, 3.0\n" // three + "fmov v30.4s, 6.0\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v14.4s, v0.4s, v29.4s\n" + "fadd v15.4s, v1.4s, v29.4s\n" + "fadd v16.4s, v2.4s, v29.4s\n" + "fadd v17.4s, v3.4s, v29.4s\n" + "fadd v18.4s, v4.4s, v29.4s\n" + "fadd v19.4s, v5.4s, v29.4s\n" + "fadd v20.4s, v6.4s, v29.4s\n" + "fadd v21.4s, v7.4s, v29.4s\n" + + "fmax v14.4s, v14.4s, v31.4s\n" + "fmax v15.4s, v15.4s, v31.4s\n" + "fmax v16.4s, v16.4s, v31.4s\n" + "fmax v17.4s, v17.4s, v31.4s\n" + "fmax v18.4s, v18.4s, v31.4s\n" + "fmax v19.4s, v19.4s, v31.4s\n" + "fmax v20.4s, v20.4s, v31.4s\n" + "fmax v21.4s, v21.4s, v31.4s\n" + + "fmin v14.4s, v14.4s, v30.4s\n" + "fmin v15.4s, v15.4s, v30.4s\n" + "fmin v16.4s, v16.4s, v30.4s\n" + "fmin v17.4s, v17.4s, v30.4s\n" + "fmin v18.4s, v18.4s, v30.4s\n" + "fmin v19.4s, v19.4s, v30.4s\n" + "fmin v20.4s, v20.4s, v30.4s\n" + "fmin v21.4s, v21.4s, v30.4s\n" + + "fdiv v14.4s, v14.4s, v30.4s\n" + "fdiv 
v15.4s, v15.4s, v30.4s\n" + "fdiv v16.4s, v16.4s, v30.4s\n" + "fdiv v17.4s, v17.4s, v30.4s\n" + "fdiv v18.4s, v18.4s, v30.4s\n" + "fdiv v19.4s, v19.4s, v30.4s\n" + "fdiv v20.4s, v20.4s, v30.4s\n" + "fdiv v21.4s, v21.4s, v30.4s\n" + + "fmul v0.4s, v0.4s, v14.4s\n" + "fmul v1.4s, v1.4s, v15.4s\n" + "fmul v2.4s, v2.4s, v16.4s\n" + "fmul v3.4s, v3.4s, v17.4s\n" + "fmul v4.4s, v4.4s, v18.4s\n" + "fmul v5.4s, v5.4s, v19.4s\n" + "fmul v6.4s, v6.4s, v20.4s\n" + "fmul v7.4s, v7.4s, v21.4s\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v29", "v30", "v31"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (pwFilterArray != nullptr) { + F32 *pw_pack_0 = pwArray + hw * ic * 8 + c * 8 * 4; + __asm__ __volatile__("zip1 v16.4s, v0.4s, v2.4s\n" + "zip2 v17.4s, v0.4s, v2.4s\n" + "zip1 v18.4s, v4.4s, v6.4s\n" + "zip2 v19.4s, v4.4s, v6.4s\n" + "zip1 v0.2d, v16.2d, v18.2d\n" + "zip2 v2.2d, v16.2d, v18.2d\n" + "zip1 v4.2d, v17.2d, v19.2d\n" + "zip2 v6.2d, v17.2d, v19.2d\n" + + "zip1 v16.4s, v1.4s, v3.4s\n" + "zip2 v17.4s, v1.4s, v3.4s\n" + "zip1 v18.4s, v5.4s, v7.4s\n" + "zip2 v19.4s, v5.4s, v7.4s\n" + "zip1 v1.2d, v16.2d, v18.2d\n" + "zip2 v3.2d, v16.2d, v18.2d\n" + "zip1 v5.2d, v17.2d, v19.2d\n" + "zip2 v7.2d, v17.2d, v19.2d\n" + + "str q0, [%[pw0]]\n" + "str q2, [%[pw0], #16]\n" + "str q4, [%[pw0], #32]\n" + "str q6, [%[pw0], #48]\n" + "str q1, [%[pw0], #64]\n" + "str q3, [%[pw0], #80]\n" + "str q5, [%[pw0], #96]\n" + "str q7, [%[pw0], #112]\n" + : [pw0] "+r"(pw_pack_0) + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v16", "v17", "v18", "v19"); + } else { + F32 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + __asm__ __volatile__( + "stp q0, q1, [%[out]]\n" + "stp q2, q3, [%[out], #32]\n" + "stp q4, q5, [%[out], #64]\n" + "stp q6, q7, [%[out], #96]\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + } + } + + // ohow_remainder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + + __asm__ __volatile__("ldr q0, [%[b]]\n" + "ldr q1, [%[b], #16]\n" + : + : [b] "r"(b) + : "memory", "cc", "v0", "v1"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F32 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F32 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F32 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + __asm__ __volatile__("ldp q14, q15, [%[f0]]\n" + "ldp q16, q17, [%[in0]]\n" + + "fmla v0.4s, v16.4s, v14.4s\n" + "fmla v1.4s, v17.4s, v15.4s\n" + : + : [in0] "r"(in_0), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v14", "v15"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v0.4s, v0.4s, v31.4s\n" + "fmax v1.4s, v1.4s, v31.4s\n" + : + : + : "memory", "cc", "v0", "v1", "v31"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v0.4s, v0.4s, v31.4s\n" + "fmax v1.4s, v1.4s, v31.4s\n" + + "fmin v0.4s, v0.4s, v30.4s\n" + "fmin v1.4s, v1.4s, v30.4s\n" + : + : + : "memory", "cc", "v0", "v1", "v30", "v31"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__( + "fmov v29.4s, 3.0\n" // three + "fmov v30.4s,
6.0\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v14.4s, v0.4s, v29.4s\n" + "fadd v15.4s, v1.4s, v29.4s\n" + + "fmax v14.4s, v14.4s, v31.4s\n" + "fmax v15.4s, v15.4s, v31.4s\n" + + "fmin v14.4s, v14.4s, v30.4s\n" + "fmin v15.4s, v15.4s, v30.4s\n" + + "fdiv v14.4s, v14.4s, v30.4s\n" + "fdiv v15.4s, v15.4s, v30.4s\n" + + "fmul v0.4s, v0.4s, v14.4s\n" + "fmul v1.4s, v1.4s, v15.4s\n" + : + : + : "memory", "cc", "v0", "v1", "v14", "v15", "v29", "v30", "v31"); + break; + } + default: + return NOT_SUPPORTED; + } + + F32 *out_ptr; + if (pwFilterArray != nullptr) { + out_ptr = pwArray + hw * ic * 8 + c * 8; + } else { + out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + } + __asm__ __volatile__("stp q0, q1, [%[out]]\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "v0", "v1"); + } + } + + if (pwFilterArray == nullptr) { + continue; + } + // pw_conv + // ohow / 8 + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + const F32 *b0 = pwBiasArray; + const F32 *b1 = b0 + 4; + F32 *in_pack = pwArray + hw * ic * 8; + const F32 *f_o0c0 = pwFilterArray; + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "ldr q24, [%[b_0]]\n" // b_O0o[0:3] + "ldr q25, [%[b_1]]\n" // b_O1o[0:3] + "mov x0, %[ic]\n" // ic_blk + "mov v4.16b, v24.16b\n" + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v5.16b, v24.16b\n" + "ldr q1, [%[in_0], #16]\n" // in_hw0 + "mov v6.16b, v24.16b\n" + "ldr q20, [%[f_0]]\n" // f_o0c0 + "mov v7.16b, v24.16b\n" + "ldr q21, [%[f_0], #16]\n" // f_o0c0 + "mov v8.16b, v24.16b\n" + "mov v9.16b, v24.16b\n" + "mov v10.16b, v24.16b\n" + "mov v11.16b, v24.16b\n" + "mov v12.16b, v25.16b\n" + "mov v13.16b, v25.16b\n" + "mov v14.16b, v25.16b\n" + "mov v15.16b, v25.16b\n" + "mov v16.16b, v25.16b\n" + "mov v17.16b, v25.16b\n" + "mov v18.16b, v25.16b\n" + "mov v19.16b, v25.16b\n" + + "0:\n" + "fmla v4.4s, v20.4s, v0.s[0]\n" + "ldr q2, [%[in_0], #32]\n" + "fmla v5.4s, v20.4s, v0.s[1]\n" + "ldr q3, [%[in_0], #48]\n" + "fmla v6.4s, v20.4s, v0.s[2]\n" + "ldr q22, [%[f_0], #32]\n" + "fmla v7.4s, v20.4s, v0.s[3]\n" + "ldr q23, [%[f_0], #48]\n" + "fmla v8.4s, v20.4s, v1.s[0]\n" + "fmla v9.4s, v20.4s, v1.s[1]\n" + "fmla v10.4s, v20.4s, v1.s[2]\n" + "fmla v11.4s, v20.4s, v1.s[3]\n" + "fmla v12.4s, v21.4s, v0.s[0]\n" + "fmla v13.4s, v21.4s, v0.s[1]\n" + "fmla v14.4s, v21.4s, v0.s[2]\n" + "fmla v15.4s, v21.4s, v0.s[3]\n" + "fmla v16.4s, v21.4s, v1.s[0]\n" + "fmla v17.4s, v21.4s, v1.s[1]\n" + "fmla v18.4s, v21.4s, v1.s[2]\n" + "fmla v19.4s, v21.4s, v1.s[3]\n" + + "fmla v4.4s, v22.4s, v2.s[0]\n" + "ldr q0, [%[in_0], #64]!\n" + "fmla v5.4s, v22.4s, v2.s[1]\n" + "ldr q1, [%[in_0], #16]\n" + "fmla v6.4s, v22.4s, v2.s[2]\n" + "ldr q20, [%[f_0], #64]!\n" + "fmla v7.4s, v22.4s, v2.s[3]\n" + "ldr q21, [%[f_0], #16]\n" + "fmla v8.4s, v22.4s, v3.s[0]\n" + "fmla v9.4s, v22.4s, v3.s[1]\n" + "fmla v10.4s, v22.4s, v3.s[2]\n" + "fmla v11.4s, v22.4s, v3.s[3]\n" + "fmla v12.4s, v23.4s, v2.s[0]\n" + "fmla v13.4s, v23.4s, v2.s[1]\n" + "fmla v14.4s, v23.4s, v2.s[2]\n" + "fmla v15.4s, v23.4s, v2.s[3]\n" + "fmla v16.4s, v23.4s, v3.s[0]\n" + "fmla v17.4s, v23.4s, v3.s[1]\n" + "fmla v18.4s, v23.4s, v3.s[2]\n" + "fmla v19.4s, v23.4s, v3.s[3]\n" + "subs x0, x0, #2\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v4.4s, v4.4s, v31.4s\n" + "fmax v5.4s, v5.4s, v31.4s\n" + "fmax v6.4s, v6.4s, 
v31.4s\n" + "fmax v7.4s, v7.4s, v31.4s\n" + "fmax v8.4s, v8.4s, v31.4s\n" + "fmax v9.4s, v9.4s, v31.4s\n" + "fmax v10.4s, v10.4s, v31.4s\n" + "fmax v11.4s, v11.4s, v31.4s\n" + "fmax v12.4s, v12.4s, v31.4s\n" + "fmax v13.4s, v13.4s, v31.4s\n" + "fmax v14.4s, v14.4s, v31.4s\n" + "fmax v15.4s, v15.4s, v31.4s\n" + "fmax v16.4s, v16.4s, v31.4s\n" + "fmax v17.4s, v17.4s, v31.4s\n" + "fmax v18.4s, v18.4s, v31.4s\n" + "fmax v19.4s, v19.4s, v31.4s\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v4.4s, v4.4s, v31.4s\n" + "fmax v5.4s, v5.4s, v31.4s\n" + "fmax v6.4s, v6.4s, v31.4s\n" + "fmax v7.4s, v7.4s, v31.4s\n" + "fmax v8.4s, v8.4s, v31.4s\n" + "fmax v9.4s, v9.4s, v31.4s\n" + "fmax v10.4s, v10.4s, v31.4s\n" + "fmax v11.4s, v11.4s, v31.4s\n" + "fmax v12.4s, v12.4s, v31.4s\n" + "fmax v13.4s, v13.4s, v31.4s\n" + "fmax v14.4s, v14.4s, v31.4s\n" + "fmax v15.4s, v15.4s, v31.4s\n" + "fmax v16.4s, v16.4s, v31.4s\n" + "fmax v17.4s, v17.4s, v31.4s\n" + "fmax v18.4s, v18.4s, v31.4s\n" + "fmax v19.4s, v19.4s, v31.4s\n" + + "fmin v4.4s, v4.4s, v30.4s\n" + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + "fmin v8.4s, v8.4s, v30.4s\n" + "fmin v9.4s, v9.4s, v30.4s\n" + "fmin v10.4s, v10.4s, v30.4s\n" + "fmin v11.4s, v11.4s, v30.4s\n" + "fmin v12.4s, v12.4s, v30.4s\n" + "fmin v13.4s, v13.4s, v30.4s\n" + "fmin v14.4s, v14.4s, v30.4s\n" + "fmin v15.4s, v15.4s, v30.4s\n" + "fmin v16.4s, v16.4s, v30.4s\n" + "fmin v17.4s, v17.4s, v30.4s\n" + "fmin v18.4s, v18.4s, v30.4s\n" + "fmin v19.4s, v19.4s, v30.4s\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "fmov v29.4s, 3.0\n" // three + "fmov v30.4s, 6.0\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v20.4s, v4.4s, v29.4s\n" + "fadd v21.4s, v5.4s, v29.4s\n" + "fadd v22.4s, v6.4s, v29.4s\n" + "fadd v23.4s, v7.4s, v29.4s\n" + "fadd v24.4s, v8.4s, v29.4s\n" + "fadd v25.4s, v9.4s, v29.4s\n" + "fadd v26.4s, v10.4s, v29.4s\n" + "fadd v27.4s, v11.4s, v29.4s\n" + + "fmax v20.4s, v20.4s, v31.4s\n" + "fmax v21.4s, v21.4s, v31.4s\n" + "fmax v22.4s, v22.4s, v31.4s\n" + "fmax v23.4s, v23.4s, v31.4s\n" + "fmax v24.4s, v24.4s, v31.4s\n" + "fmax v25.4s, v25.4s, v31.4s\n" + "fmax v26.4s, v26.4s, v31.4s\n" + "fmax v27.4s, v27.4s, v31.4s\n" + + "fmin v20.4s, v20.4s, v30.4s\n" + "fmin v21.4s, v21.4s, v30.4s\n" + "fmin v22.4s, v22.4s, v30.4s\n" + "fmin v23.4s, v23.4s, v30.4s\n" + "fmin v24.4s, v24.4s, v30.4s\n" + "fmin v25.4s, v25.4s, v30.4s\n" + "fmin v26.4s, v26.4s, v30.4s\n" + "fmin v27.4s, v27.4s, v30.4s\n" + + "fdiv v20.4s, v20.4s, v30.4s\n" + "fdiv v21.4s, v21.4s, v30.4s\n" + "fdiv v22.4s, v22.4s, v30.4s\n" + "fdiv v23.4s, v23.4s, v30.4s\n" + "fdiv v24.4s, v24.4s, v30.4s\n" + "fdiv v25.4s, v25.4s, v30.4s\n" + "fdiv v26.4s, v26.4s, v30.4s\n" + "fdiv v27.4s, v27.4s, v30.4s\n" + + "fmul v4.4s, v4.4s, v20.4s\n" + "fmul v5.4s, v5.4s, v21.4s\n" + "fmul v6.4s, v6.4s, v22.4s\n" + "fmul v7.4s, v7.4s, v23.4s\n" + "fmul v8.4s, v8.4s, v24.4s\n" + "fmul v9.4s, v9.4s, v25.4s\n" + "fmul v10.4s, v10.4s, v26.4s\n" + "fmul v11.4s, v11.4s, v27.4s\n" + + "fadd v20.4s, v12.4s, v29.4s\n" + "fadd v21.4s, v13.4s, v29.4s\n" + "fadd v22.4s, v14.4s, v29.4s\n" + "fadd v23.4s, v15.4s, v29.4s\n" + "fadd v24.4s, v16.4s, v29.4s\n" + "fadd v25.4s, v17.4s, v29.4s\n" + "fadd v26.4s, v18.4s, v29.4s\n" + "fadd v27.4s, v19.4s, v29.4s\n" + + "fmax v20.4s, v20.4s, v31.4s\n" + "fmax v21.4s, v21.4s, v31.4s\n" + "fmax 
v22.4s, v22.4s, v31.4s\n" + "fmax v23.4s, v23.4s, v31.4s\n" + "fmax v24.4s, v24.4s, v31.4s\n" + "fmax v25.4s, v25.4s, v31.4s\n" + "fmax v26.4s, v26.4s, v31.4s\n" + "fmax v27.4s, v27.4s, v31.4s\n" + + "fmin v20.4s, v20.4s, v30.4s\n" + "fmin v21.4s, v21.4s, v30.4s\n" + "fmin v22.4s, v22.4s, v30.4s\n" + "fmin v23.4s, v23.4s, v30.4s\n" + "fmin v24.4s, v24.4s, v30.4s\n" + "fmin v25.4s, v25.4s, v30.4s\n" + "fmin v26.4s, v26.4s, v30.4s\n" + "fmin v27.4s, v27.4s, v30.4s\n" + + "fdiv v20.4s, v20.4s, v30.4s\n" + "fdiv v21.4s, v21.4s, v30.4s\n" + "fdiv v22.4s, v22.4s, v30.4s\n" + "fdiv v23.4s, v23.4s, v30.4s\n" + "fdiv v24.4s, v24.4s, v30.4s\n" + "fdiv v25.4s, v25.4s, v30.4s\n" + "fdiv v26.4s, v26.4s, v30.4s\n" + "fdiv v27.4s, v27.4s, v30.4s\n" + + "fmul v12.4s, v12.4s, v20.4s\n" + "fmul v13.4s, v13.4s, v21.4s\n" + "fmul v14.4s, v14.4s, v22.4s\n" + "fmul v15.4s, v15.4s, v23.4s\n" + "fmul v16.4s, v16.4s, v24.4s\n" + "fmul v17.4s, v17.4s, v25.4s\n" + "fmul v18.4s, v18.4s, v26.4s\n" + "fmul v19.4s, v19.4s, v27.4s\n" + + "13:\n" + "str q4, [%[out_0]], #16\n" + "str q12, [%[out_0]], #16\n" + "str q5, [%[out_0]], #16\n" + "str q13, [%[out_0]], #16\n" + "str q6, [%[out_0]], #16\n" + "str q14, [%[out_0]], #16\n" + "str q7, [%[out_0]], #16\n" + "str q15, [%[out_0]], #16\n" + "str q8, [%[out_0]], #16\n" + "str q16, [%[out_0]], #16\n" + "str q9, [%[out_0]], #16\n" + "str q17, [%[out_0]], #16\n" + "str q10, [%[out_0]], #16\n" + "str q18, [%[out_0]], #16\n" + "str q11, [%[out_0]], #16\n" + "str q19, [%[out_0]], #16\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v29", "v30", "v31", "x0", + "x1", "x2", "x3"); + b0 += 8; + b1 += 8; + } + } + + // ohow_remainder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F32 *b0 = pwBiasArray; + const F32 *b1 = b0 + 4; + const F32 *f_o0c0 = pwFilterArray; + F32 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "ldr q24, [%[b_0]]\n" // b_o0 + "ldr q25, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "mov v4.16b, v24.16b\n" + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v5.16b, v24.16b\n" + "mov v6.16b, v24.16b\n" + "ldr q20, [%[f_0]]\n" // f_o0c0 + "mov v7.16b, v24.16b\n" + "ldr q21, [%[f_0], #16]\n" // f_o0c0 + "mov v12.16b, v25.16b\n" + "mov v13.16b, v25.16b\n" + "mov v14.16b, v25.16b\n" + "mov v15.16b, v25.16b\n" + + "0:\n" + "fmla v4.4s, v20.4s, v0.s[0]\n" + "ldr q2, [%[in_0], #16]\n" + "fmla v5.4s, v20.4s, v0.s[1]\n" + "ldr q22, [%[f_0], #32]\n" + "fmla v6.4s, v20.4s, v0.s[2]\n" + "ldr q23, [%[f_0], #48]\n" + "fmla v7.4s, v20.4s, v0.s[3]\n" + "fmla v12.4s, v21.4s, v0.s[0]\n" + "fmla v13.4s, v21.4s, v0.s[1]\n" + "fmla v14.4s, v21.4s, v0.s[2]\n" + "fmla v15.4s, v21.4s, v0.s[3]\n" + + "fmla v4.4s, v22.4s, v2.s[0]\n" + "ldr q0, [%[in_0], #32]!\n" + "fmla v5.4s, v22.4s, v2.s[1]\n" + "ldr q20, [%[f_0], #64]!\n" + "fmla v6.4s, v22.4s, v2.s[2]\n" + "ldr 
q21, [%[f_0], #16]\n" + "fmla v7.4s, v22.4s, v2.s[3]\n" + "fmla v12.4s, v23.4s, v2.s[0]\n" + "fmla v13.4s, v23.4s, v2.s[1]\n" + "fmla v14.4s, v23.4s, v2.s[2]\n" + "fmla v15.4s, v23.4s, v2.s[3]\n" + "subs x0, x0, #2\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v4.4s, v4.4s, v31.4s\n" + "fmax v5.4s, v5.4s, v31.4s\n" + "fmax v6.4s, v6.4s, v31.4s\n" + "fmax v7.4s, v7.4s, v31.4s\n" + "fmax v12.4s, v12.4s, v31.4s\n" + "fmax v13.4s, v13.4s, v31.4s\n" + "fmax v14.4s, v14.4s, v31.4s\n" + "fmax v15.4s, v15.4s, v31.4s\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v4.4s, v4.4s, v31.4s\n" + "fmax v5.4s, v5.4s, v31.4s\n" + "fmax v6.4s, v6.4s, v31.4s\n" + "fmax v7.4s, v7.4s, v31.4s\n" + "fmax v12.4s, v12.4s, v31.4s\n" + "fmax v13.4s, v13.4s, v31.4s\n" + "fmax v14.4s, v14.4s, v31.4s\n" + "fmax v15.4s, v15.4s, v31.4s\n" + + "fmin v4.4s, v4.4s, v30.4s\n" + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + "fmin v12.4s, v12.4s, v30.4s\n" + "fmin v13.4s, v13.4s, v30.4s\n" + "fmin v14.4s, v14.4s, v30.4s\n" + "fmin v15.4s, v15.4s, v30.4s\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "fmov v29.4s, 3.0\n" // three + "fmov v30.4s, 6.0\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v20.4s, v4.4s, v29.4s\n" + "fadd v21.4s, v5.4s, v29.4s\n" + "fadd v22.4s, v6.4s, v29.4s\n" + "fadd v23.4s, v7.4s, v29.4s\n" + "fadd v24.4s, v12.4s, v29.4s\n" + "fadd v25.4s, v13.4s, v29.4s\n" + "fadd v26.4s, v14.4s, v29.4s\n" + "fadd v27.4s, v15.4s, v29.4s\n" + + "fmax v20.4s, v20.4s, v31.4s\n" + "fmax v21.4s, v21.4s, v31.4s\n" + "fmax v22.4s, v22.4s, v31.4s\n" + "fmax v23.4s, v23.4s, v31.4s\n" + "fmax v24.4s, v24.4s, v31.4s\n" + "fmax v25.4s, v25.4s, v31.4s\n" + "fmax v26.4s, v26.4s, v31.4s\n" + "fmax v27.4s, v27.4s, v31.4s\n" + + "fmin v20.4s, v20.4s, v30.4s\n" + "fmin v21.4s, v21.4s, v30.4s\n" + "fmin v22.4s, v22.4s, v30.4s\n" + "fmin v23.4s, v23.4s, v30.4s\n" + "fmin v24.4s, v24.4s, v30.4s\n" + "fmin v25.4s, v25.4s, v30.4s\n" + "fmin v26.4s, v26.4s, v30.4s\n" + "fmin v27.4s, v27.4s, v30.4s\n" + + "fdiv v20.4s, v20.4s, v30.4s\n" + "fdiv v21.4s, v21.4s, v30.4s\n" + "fdiv v22.4s, v22.4s, v30.4s\n" + "fdiv v23.4s, v23.4s, v30.4s\n" + "fdiv v24.4s, v24.4s, v30.4s\n" + "fdiv v25.4s, v25.4s, v30.4s\n" + "fdiv v26.4s, v26.4s, v30.4s\n" + "fdiv v27.4s, v27.4s, v30.4s\n" + + "fmul v4.4s, v4.4s, v20.4s\n" + "fmul v5.4s, v5.4s, v21.4s\n" + "fmul v6.4s, v6.4s, v22.4s\n" + "fmul v7.4s, v7.4s, v23.4s\n" + "fmul v12.4s, v12.4s, v24.4s\n" + "fmul v13.4s, v13.4s, v25.4s\n" + "fmul v14.4s, v14.4s, v26.4s\n" + "fmul v15.4s, v15.4s, v27.4s\n" + + "13:\n" + "str q4, [%[out_0]]\n" + "str q12, [%[out_0], #16]\n" + "str q5, [%[out_0], #32]\n" + "str q13, [%[out_0], #48]\n" + "str q6, [%[out_0], #64]\n" + "str q14, [%[out_0], #80]\n" + "str q7, [%[out_0], #96]\n" + "str q15, [%[out_0], #112]\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v2", "v4", "v5", "v6", "v7", "v12", "v13", "v14", + "v15", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v29", 
"v30", + "v31", "x0", "x1", "x2", "x3"); + b0 += 8; + b1 += 8; + } + } + + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F32 *b0 = pwBiasArray; + const F32 *b1 = b0 + 4; + const F32 *f_o0c0 = pwFilterArray; + F32 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "ldr q4, [%[b_0]]\n" // b_o0 + "ldr q12, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "ldr s0, [%[in_0]]\n" // in_hw0 + "ldr q20, [%[f_0]]\n" // f_o0c0 + "ldr q21, [%[f_0], #16]\n" + "0:\n" + "ldr s2, [%[in_0], #4]\n" + "ldr q22, [%[f_0], #32]\n" + "ldr q23, [%[f_0], #48]\n" + "fmla v4.4s, v20.4s, v0.s[0]\n" + "fmla v12.4s, v21.4s, v0.s[0]\n" + + "ldr s0, [%[in_0], #8]!\n" + "ldr q20, [%[f_0], #64]!\n" + "ldr q21, [%[f_0], #16]\n" + "fmla v4.4s, v22.4s, v2.s[0]\n" + "fmla v12.4s, v23.4s, v2.s[0]\n" + "subs x0, x0, #2\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v4.4s, v4.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v0.4s\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v4.4s, v4.4s, v31.4s\n" + "fmax v12.4s, v12.4s, v31.4s\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "fmov v29.4s, 3.0\n" // three + "fmov v30.4s, 6.0\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v20.4s, v4.4s, v29.4s\n" + "fadd v24.4s, v12.4s, v29.4s\n" + + "fmax v20.4s, v20.4s, v31.4s\n" + "fmax v24.4s, v24.4s, v31.4s\n" + + "fmin v20.4s, v20.4s, v30.4s\n" + "fmin v24.4s, v24.4s, v30.4s\n" + + "fdiv v20.4s, v20.4s, v30.4s\n" + "fdiv v24.4s, v24.4s, v30.4s\n" + + "fmul v4.4s, v4.4s, v20.4s\n" + "fmul v12.4s, v12.4s, v24.4s\n" + + "13:\n" + "str q4, [%[out_0]]\n" + "str q12, [%[out_0], #16]\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v4", "v12", "v20", "v24", "v29", "v30", + "v31", "x0", "x1", "x2", "x3"); + b0 += 8; + b1 += 8; + } + } + } + return SUCCESS; +} +#endif diff --git a/compute/tensor/src/cpu/arm/fp32/eltwise.cpp b/compute/tensor/src/cpu/arm/fp32/eltwise.cpp new file mode 100644 index 00000000..94d9f147 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/eltwise.cpp @@ -0,0 +1,87 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
diff --git a/compute/tensor/src/cpu/arm/fp32/eltwise.cpp b/compute/tensor/src/cpu/arm/fp32/eltwise.cpp new file mode 100644 index 00000000..94d9f147 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/eltwise.cpp @@ -0,0 +1,87 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#include "cpu/cpu_functions.h" + +EE eltwise_fp32(std::vector<void *> input, + std::vector<U32> inputSize, + U32 num, + U32 len, + void *output, + EltwiseMode eltwiseMode) +{ + F32 buffer[4]; + U32 len_tail = len % 4; + U32 len_main = len - len_tail; + + F32 *tmp = buffer; + F32 *output_ptr = (F32 *)output; + for (U32 i = 0; i < len_main; i += 4) { + get_vector((F32 *)input[0], inputSize[0], &tmp, 4, i, 4, buffer); + float32x4_t tmp_v = vld1q_f32(tmp); + for (U32 j = 1; j < num; j++) { + get_vector((F32 *)input[j], inputSize[j], &tmp, 4, i, 4, buffer); + float32x4_t value_v = vld1q_f32(tmp); + switch (eltwiseMode) { + case ELTWISE_SUM: + tmp_v = vaddq_f32(tmp_v, value_v); + break; + case ELTWISE_MAX: + tmp_v = vmaxq_f32(tmp_v, value_v); + break; + case ELTWISE_PROD: + tmp_v = vmulq_f32(tmp_v, value_v); + break; + case ELTWISE_SUB: + tmp_v = vsubq_f32(tmp_v, value_v); + break; + case ELTWISE_DIV: + tmp_v = vdivq_f32(tmp_v, value_v); + break; + default: + return NOT_SUPPORTED; + } + } + vst1q_f32(output_ptr + i, tmp_v); + } + for (U32 i = len_main; i < len; i++) { + get_vector((F32 *)input[0], inputSize[0], &tmp, 4, i, 1, buffer); + F32 tmp_s = tmp[0]; + for (U32 j = 1; j < num; j++) { + get_vector((F32 *)input[j], inputSize[j], &tmp, 4, i, 1, buffer); + F32 value_s = tmp[0]; + switch (eltwiseMode) { + case ELTWISE_SUM: + tmp_s = value_s + tmp_s; + break; + case ELTWISE_MAX: + tmp_s = (value_s > tmp_s) ? value_s : tmp_s; + break; + case ELTWISE_PROD: + tmp_s *= value_s; + break; + case ELTWISE_SUB: + tmp_s -= value_s; + break; + case ELTWISE_DIV: + tmp_s /= value_s; + break; + default: + return NOT_SUPPORTED; + } + } + output_ptr[i] = tmp_s; + } + return SUCCESS; +}
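eltwise_fp32 above follows the usual NEON main-loop/scalar-tail split: four lanes per iteration, with get_vector (a repo helper) materializing a 4-element window from each input so differently sized operands can participate. The pattern in isolation, for the ELTWISE_SUM case with equal-length inputs (a sketch; get_vector's broadcasting behavior is deliberately elided):

#include <arm_neon.h>

// 4-lane main loop plus scalar tail for an elementwise sum; the kernel above
// applies the same split to MAX/PROD/SUB/DIV through one switch per vector.
static void eltwise_sum_ref(const float *a, const float *b, float *out, unsigned len)
{
    unsigned i = 0;
    for (; i + 4 <= len; i += 4) {
        vst1q_f32(out + i, vaddq_f32(vld1q_f32(a + i), vld1q_f32(b + i)));
    }
    for (; i < len; i++) {  // handles the len % 4 leftovers
        out[i] = a[i] + b[i];
    }
}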
diff --git a/compute/tensor/src/cpu/arm/fp32/lstm.cpp b/compute/tensor/src/cpu/arm/fp32/lstm.cpp new file mode 100644 index 00000000..9365902a --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/lstm.cpp @@ -0,0 +1,467 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <string.h> +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +// NKN32 matrix-vector multiply: filter block n packs 32 output columns, so +// out[n*32 .. n*32+31] accumulates the whole length-fk input against block n. +void mvm_nkn32(U32 fn, U32 fk, const F32 *filterArray, F32 *input, F32 *output) +{ +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 n = 0; n < fn; n++) { + F32 *in = input; + const F32 *f = filterArray + n * fk * 32; + F32 *out = output + n * 32; +#ifdef __aarch64__ + __asm__ __volatile__("ldr d0, [%[in]]\n" + "ldr q1, [%[out]]\n" + "ldr q2, [%[out], #16]\n" + "ldr q3, [%[out], #32]\n" + "ldr q4, [%[out], #48]\n" + "ldr q13, [%[out], #64]\n" + "ldr q14, [%[out], #80]\n" + "ldr q15, [%[out], #96]\n" + "ldr q16, [%[out], #112]\n" + "mov x0, %[k]\n" + "ldr q5, [%[f]]\n" + "ldr q6, [%[f], #16]\n" + "ldr q7, [%[f], #32]\n" + "ldr q8, [%[f], #48]\n" + "ldr q17, [%[f], #64]\n" + "ldr q18, [%[f], #80]\n" + "ldr q19, [%[f], #96]\n" + "ldr q20, [%[f], #112]\n" + "0:\n" + "prfm pldl2strm, [%[f], #4096]\n" + "prfm pldl1strm, [%[f], #1024]\n" + "ldr d9, [%[f], #128]\n" + "fmla v1.4s, v5.4s, v0.s[0]\n" + "ldr x9, [%[f], #136]\n" + "ins v9.d[1], x9\n" + "ldr d10, [%[f], #144]\n" + "fmla v2.4s, v6.4s, v0.s[0]\n" + "ldr x10, [%[f], #152]\n" + "ins v10.d[1], x10\n" + "ldr d11, [%[f], #160]\n" + "fmla v3.4s, v7.4s, v0.s[0]\n" + "ldr x11, [%[f], #168]\n" + "ins v11.d[1], x11\n" + "ldr d12, [%[f], #176]\n" + "fmla v4.4s, v8.4s, v0.s[0]\n" + "ldr x12, [%[f], #184]\n" + "ins v12.d[1], x12\n" + "ldr d21, [%[f], #192]\n" + "fmla v13.4s, v17.4s, v0.s[0]\n" + "ldr x9, [%[f], #200]\n" + "ins v21.d[1], x9\n" + "ldr d22, [%[f], #208]\n" + "fmla v14.4s, v18.4s, v0.s[0]\n" + "ldr x10, [%[f], #216]\n" + "ins v22.d[1], x10\n" + "ldr d23, [%[f], #224]\n" + "fmla v15.4s, v19.4s, v0.s[0]\n" + "ldr x11, [%[f], #232]\n" + "ins v23.d[1], x11\n" + "ldr d24, [%[f], #240]\n" + "fmla v16.4s, v20.4s, v0.s[0]\n" + "ldr x12, [%[f], #248]\n" + "ins v24.d[1], x12\n" + + "add %[f], %[f], #256\n" + "ldr d5, [%[f]]\n" + "fmla v1.4s, v9.4s, v0.s[1]\n" + "ldr x5, [%[f], #8]\n" + "ins v5.d[1], x5\n" + "ldr d6, [%[f], #16]\n" + "fmla v2.4s, v10.4s, v0.s[1]\n" + "ldr x6, [%[f], #24]\n" + "ins v6.d[1], x6\n" + "ldr d7, [%[f], #32]\n" + "fmla v3.4s, v11.4s, v0.s[1]\n" + "ldr x7, [%[f], #40]\n" + "ins v7.d[1], x7\n" + "ldr d8, [%[f], #48]\n" + "fmla v4.4s, v12.4s, v0.s[1]\n" + "ldr x8, [%[f], #56]\n" + "ins v8.d[1], x8\n" + "ldr d17, [%[f], #64]\n" + "fmla v13.4s, v21.4s, v0.s[1]\n" + "ldr x5, [%[f], #72]\n" + "ins v17.d[1], x5\n" + "ldr d18, [%[f], #80]\n" + "fmla v14.4s, v22.4s, v0.s[1]\n" + "ldr x6, [%[f], #88]\n" + "ins v18.d[1], x6\n" + "ldr d19, [%[f], #96]\n" + "fmla v15.4s, v23.4s, v0.s[1]\n" + "ldr x7, [%[f], #104]\n" + "ins v19.d[1], x7\n" + "ldr d20, [%[f], #112]\n" + "fmla v16.4s, v24.4s, v0.s[1]\n" + "ldr x8, [%[f], #120]\n" + "add %[in], %[in], #8\n" + "ins v20.d[1], x8\n" + + "ldr d0, [%[in]]\n" + "sub x0, x0, #2\n" + + "cmp x0, #3\n" + "bgt 0b\n" + "ldr q9, [%[f], #128]\n" + "ldr q10, [%[f], #144]\n" + "ldr q11, [%[f], #160]\n" + "ldr q12, [%[f], #176]\n" + "ldr q21, [%[f], #192]\n" + "ldr q22, [%[f], #208]\n" + "ldr q23, [%[f], #224]\n" + "ldr q24, [%[f], #240]\n" + "fmla v1.4s, v5.4s, v0.s[0]\n" + "fmla v2.4s, v6.4s, v0.s[0]\n" + "fmla v3.4s, v7.4s, v0.s[0]\n" + "fmla v4.4s, v8.4s, v0.s[0]\n" + "fmla v13.4s, v17.4s, v0.s[0]\n" + "fmla v14.4s, v18.4s, v0.s[0]\n"
+ "fmla v15.4s, v19.4s, v0.s[0]\n" + "fmla v16.4s, v20.4s, v0.s[0]\n" + "fmla v1.4s, v9.4s, v0.s[1]\n" + "fmla v2.4s, v10.4s, v0.s[1]\n" + "fmla v3.4s, v11.4s, v0.s[1]\n" + "fmla v4.4s, v12.4s, v0.s[1]\n" + "fmla v13.4s, v21.4s, v0.s[1]\n" + "fmla v14.4s, v22.4s, v0.s[1]\n" + "fmla v15.4s, v23.4s, v0.s[1]\n" + "fmla v16.4s, v24.4s, v0.s[1]\n" + "cmp x0, #3\n" + "bne 1f\n" + "add %[f], %[f], #256\n" + "ldr s0, [%[in], #8]\n" + "ldr q5, [%[f]]\n" + "ldr q6, [%[f], #16]\n" + "ldr q7, [%[f], #32]\n" + "ldr q8, [%[f], #48]\n" + "ldr q17, [%[f], #64]\n" + "ldr q18, [%[f], #80]\n" + "ldr q19, [%[f], #96]\n" + "ldr q20, [%[f], #112]\n" + "fmla v1.4s, v5.4s, v0.s[0]\n" + "fmla v2.4s, v6.4s, v0.s[0]\n" + "fmla v3.4s, v7.4s, v0.s[0]\n" + "fmla v4.4s, v8.4s, v0.s[0]\n" + "fmla v13.4s, v17.4s, v0.s[0]\n" + "fmla v14.4s, v18.4s, v0.s[0]\n" + "fmla v15.4s, v19.4s, v0.s[0]\n" + "fmla v16.4s, v20.4s, v0.s[0]\n" + + "1:\n" + "str q1, [%[out]]\n" + "str q2, [%[out], #16]\n" + "str q3, [%[out], #32]\n" + "str q4, [%[out], #48]\n" + "str q13, [%[out], #64]\n" + "str q14, [%[out], #80]\n" + "str q15, [%[out], #96]\n" + "str q16, [%[out], #112]\n" + : [out] "+r"(out), [f] "+r"(f), [in] "+r"(in) + : [k] "r"((I64)fk) + : "memory", "cc", "x0", "x5", "x6", "x7", "x8", "x9", "x10", "x11", + "x12", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24"); +#else + __asm__ __volatile__("vld1.f32 {d0[0]}, [%[in]]!\n" + "mov r2, %[out]\n" + "mov r3, %[out]\n" + "vld1.f32 {d2-d3}, [r2]!\n" + "vld1.f32 {d4-d5}, [r2]!\n" + "vld1.f32 {d6-d7}, [r2]!\n" + "vld1.f32 {d8-d9}, [r2]!\n" + "vld1.f32 {d18-d19}, [%[f]]!\n" + "vld1.f32 {d20-d21}, [%[f]]!\n" + "vld1.f32 {d22-d23}, [%[f]]!\n" + "vld1.f32 {d24-d25}, [%[f]]!\n" + "mov r4, %[k]\n" + "vld1.f32 {d10-d11}, [r2]!\n" + "vld1.f32 {d12-d13}, [r2]!\n" + "vld1.f32 {d14-d15}, [r2]!\n" + "vld1.f32 {d16-d17}, [r2]\n" + "vld1.f32 {d26-d27}, [%[f]]!\n" + "vld1.f32 {d28-d29}, [%[f]]!\n" + "0:\n" + "cmp r4, #3\n" + "ble 3f\n" + "pld [%[f], #374]\n" + "vld1.f32 {d30[0]}, [%[in]]!\n" + "vmla.f32 q1, q9, d0[0]\n" + "vld1.f32 {d18-d19}, [%[f]]!\n" + "vmla.f32 q2, q10, d0[0]\n" + "vld1.f32 {d20-d21}, [%[f]]!\n" + "vmla.f32 q3, q11, d0[0]\n" + "vld1.f32 {d22-d23}, [%[f]]!\n" + "vmla.f32 q4, q12, d0[0]\n" + "vld1.f32 {d24-d25}, [%[f]]!\n" + "vmla.f32 q5, q13, d0[0]\n" + "vld1.f32 {d26-d27}, [%[f]]!\n" + "vmla.f32 q6, q14, d0[0]\n" + "vld1.f32 {d28-d29}, [%[f]]!\n" + "vmla.f32 q7, q9, d0[0]\n" + "vld1.f32 {d18-d19}, [%[f]]!\n" + "vmla.f32 q8, q10, d0[0]\n" + "vld1.f32 {d20-d21}, [%[f]]!\n" + + "pld [%[f], #374]\n" + "vmov.f32 q0, q15\n" + "vld1.f32 {d30[0]}, [%[in]]!\n" + "vmla.f32 q1, q11, d0[0]\n" + "vld1.f32 {d22-d23}, [%[f]]!\n" + "vmla.f32 q2, q12, d0[0]\n" + "vld1.f32 {d24-d25}, [%[f]]!\n" + "vmla.f32 q3, q13, d0[0]\n" + "vld1.f32 {d26-d27}, [%[f]]!\n" + "vmla.f32 q4, q14, d0[0]\n" + "vld1.f32 {d28-d29}, [%[f]]!\n" + "vmla.f32 q5, q9, d0[0]\n" + "vld1.f32 {d18-d19}, [%[f]]!\n" + "vmla.f32 q6, q10, d0[0]\n" + "vld1.f32 {d20-d21}, [%[f]]!\n" + "vmla.f32 q7, q11, d0[0]\n" + "vld1.f32 {d22-d23}, [%[f]]!\n" + "vmla.f32 q8, q12, d0[0]\n" + "vld1.f32 {d24-d25}, [%[f]]!\n" + + "sub r4, r4, #3\n" + + "pld [%[f], #374]\n" + "vmov.f32 q0, q15\n" + "vld1.f32 {d30[0]}, [%[in]]!\n" + "vmla.f32 q1, q13, d0[0]\n" + "vld1.f32 {d26-d27}, [%[f]]!\n" + "vmla.f32 q2, q14, d0[0]\n" + "vld1.f32 {d28-d29}, [%[f]]!\n" + "vmla.f32 q3, q9, d0[0]\n" + "vld1.f32 {d18-d19}, [%[f]]!\n" + "vmla.f32 q4, q10, 
d0[0]\n" + "vld1.f32 {d20-d21}, [%[f]]!\n" + "vmla.f32 q5, q11, d0[0]\n" + "vld1.f32 {d22-d23}, [%[f]]!\n" + "vmla.f32 q6, q12, d0[0]\n" + "vld1.f32 {d24-d25}, [%[f]]!\n" + "vmla.f32 q7, q13, d0[0]\n" + "vld1.f32 {d26-d27}, [%[f]]!\n" + "vmla.f32 q8, q14, d0[0]\n" + "vld1.f32 {d28-d29}, [%[f]]!\n" + "vmov.f32 q0, q15\n" + "b 0b\n" + "3:\n" + "sub r4, r4, #1\n" + "vmla.f32 q1, q9, d0[0]\n" + "vld1.f32 {d18-d19}, [%[f]]!\n" + "vmla.f32 q2, q10, d0[0]\n" + "vld1.f32 {d20-d21}, [%[f]]!\n" + "vmla.f32 q3, q11, d0[0]\n" + "vmla.f32 q4, q12, d0[0]\n" + "vmla.f32 q5, q13, d0[0]\n" + "vmla.f32 q6, q14, d0[0]\n" + "vmla.f32 q7, q9, d0[0]\n" + "vmla.f32 q8, q10, d0[0]\n" + + "1:\n" + "cmp r4, #0\n" + "beq 2f\n" + "sub r4, r4, #1\n" + "vld1.f32 {d0[0]}, [%[in]]!\n" + "vld1.f32 {d18-d19}, [%[f]]!\n" + "vld1.f32 {d20-d21}, [%[f]]!\n" + "vld1.f32 {d22-d23}, [%[f]]!\n" + "vld1.f32 {d24-d25}, [%[f]]!\n" + "vmla.f32 q1, q9, d0[0]\n" + "vld1.f32 {d18-d19}, [%[f]]!\n" + "vmla.f32 q2, q10, d0[0]\n" + "vld1.f32 {d20-d21}, [%[f]]!\n" + "vmla.f32 q3, q11, d0[0]\n" + "vld1.f32 {d22-d23}, [%[f]]!\n" + "vmla.f32 q4, q12, d0[0]\n" + "vld1.f32 {d24-d25}, [%[f]]!\n" + "vmla.f32 q5, q9, d0[0]\n" + "vmla.f32 q6, q10, d0[0]\n" + "vmla.f32 q7, q11, d0[0]\n" + "vmla.f32 q8, q12, d0[0]\n" + "b 1b\n" + + "2:\n" + "vst1.f32 {d2-d3}, [r3]!\n" + "vst1.f32 {d4-d5}, [r3]!\n" + "vst1.f32 {d6-d7}, [r3]!\n" + "vst1.f32 {d8-d9}, [r3]!\n" + "vst1.f32 {d10-d11}, [r3]!\n" + "vst1.f32 {d12-d13}, [r3]!\n" + "vst1.f32 {d14-d15}, [r3]!\n" + "vst1.f32 {d16-d17}, [r3]\n" + : [f] "+r"(f), [in] "+r"(in) + : [k] "r"(fk), [out] "r"(out) + : "memory", "cc", "r2", "r3", "r4", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); +#endif + } +} + +EE rnncell_fp32(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(arch); + if (nullptr == currentX || nullptr == filter || nullptr == bias || nullptr == state || + nullptr == tmp || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ix; + U32 on, oh; + U32 fk, fn; + CHECK_STATUS(tensor2dGet(xDesc, &idt, &idf, &in, &ix)); + CHECK_STATUS(tensor2dGet(filterDesc[0], &fdt, &fdf, &fn, &fk)); + CHECK_STATUS(tensor2dGet(hDesc, &odt, &odf, &on, &oh)); + if (fdf != DF_NKN32) { + CHECK_STATUS(NOT_MATCH); + } + fn /= 32; + + U32 batch = in; + I32 xDim = ix; + I32 hDim = rnnParamSpec.numOutput; + I32 column = (rnnParamSpec.numProjection > 0) ? 
rnnParamSpec.numProjection + : rnnParamSpec.numOutput; + if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(4 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { + CHECK_STATUS(NOT_MATCH); + } + F32 forgetBias = rnnParamSpec.forgetBias; + ActivationMode activationMode = rnnParamSpec.activationMode; + if (activationMode != ACTIVATION_TANH) { + CHECK_STATUS(NOT_SUPPORTED); + } + + const F32 *currentXArray = (const F32 *)currentX; + F32 *lastStateArray = (F32 *)state; + F32 *lastHArray = lastStateArray + column; + F32 *tmpArray = (F32 *)tmp; + F32 *currentStateArray = (F32 *)state; + F32 *currentHArray = currentStateArray + column; + F32 *outputArray = (F32 *)output; + F32 *xhArray = tmpArray; + F32 *intermediateH = xhArray + (xDim + hDim); + U32 lastStateStride = column + hDim; + U32 lastHStride = column + hDim; + U32 currentStateStride = column + hDim; + U32 currentHStride = column + hDim; + float32x4_t forgetBiasVector = vdupq_n_f32(forgetBias); + for (U32 m = 0; m < batch; m++) { + F32 *lastBatchH = lastHArray + m * lastHStride; + memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); + memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); + + memcpy(intermediateH, bias[0], column * 4 * sizeof(F32)); + mvm_nkn32(fn, fk, (const F32 *)filter[0], xhArray, intermediateH); + F32 *out_i = intermediateH; + F32 *out_g = out_i + column; + F32 *out_f = out_i + column * 2; + F32 *out_o = out_i + column * 3; + + F32 *lastBatchState = lastStateArray + m * lastStateStride; + F32 *currentBatchState = currentStateArray + m * currentStateStride; + F32 *currentBatchH = currentHArray + m * currentHStride; + F32 *currentOutput = outputArray + m * batchStrideH; + + F32 *tmpState, *tmpHH, *tmpH; + if (rnnParamSpec.zoneoutCell == 0) { + tmpState = currentBatchState; + } else { + tmpState = out_i; + } + if (rnnParamSpec.numProjection > 0) { + tmpHH = out_g; + tmpH = currentOutput; + } else { + tmpHH = currentOutput; + tmpH = out_g; + } + + I32 h = 0; + for (; h < column - 3; h += 4) { + float32x4_t out_i_v = vld1q_f32(out_i + h); + float32x4_t out_g_v = vld1q_f32(out_g + h); + float32x4_t out_f_v = vld1q_f32(out_f + h); + float32x4_t out_o_v = vld1q_f32(out_o + h); + float32x4_t C_v = vld1q_f32(lastBatchState + h); + float32x4_t I_v = vsigmoidq_f32(out_i_v); + float32x4_t F_v = vsigmoidq_f32(vaddq_f32(out_f_v, forgetBiasVector)); + float32x4_t O_v = vsigmoidq_f32(out_o_v); + float32x4_t G_v = vtanhq_f32(out_g_v); + C_v = vaddq_f32(vmulq_f32(C_v, F_v), vmulq_f32(I_v, G_v)); + float32x4_t out_hidden_v = vmulq_f32(O_v, vtanhq_f32(C_v)); + vst1q_f32(tmpState + h, C_v); + vst1q_f32(tmpHH + h, out_hidden_v); + } + for (; h < column; h++) { + F32 C_s = lastBatchState[h]; + F32 I_s = 1.0 / (1.0 + exp(-out_i[h])); + F32 F_s = 1.0 / (1.0 + exp(-(out_f[h] + forgetBias))); + F32 O_s = 1.0 / (1.0 + exp(-out_o[h])); + F32 G_s = tanh(out_g[h]); + C_s = C_s * F_s + I_s * G_s; + F32 value = O_s * tanh(C_s); + tmpState[h] = C_s; + tmpHH[h] = value; + } + if (rnnParamSpec.zoneoutCell != 0) { + array_scale_f32(tmpState, tmpState, column, 1 - rnnParamSpec.zoneoutCell, 0); + array_scale_f32(lastBatchState, lastBatchState, column, rnnParamSpec.zoneoutCell, 0); + array_add_f32(tmpState, lastBatchState, currentBatchState, column); + } + + if (rnnParamSpec.numProjection > 0) { + memset(tmpH, 0, sizeof(F32) * hDim); + mvm_nkn32(hDim / 32, rnnParamSpec.numProjection, (const F32 *)filter[1], tmpHH, tmpH); + } + if (rnnParamSpec.zoneoutOutput != 0) { + if 
(rnnParamSpec.numProjection > 0) { + array_scale_f32(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + } else { + array_scale_f32(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + } + array_scale_f32(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneoutOutput, 0); + array_add_f32(out_f, lastBatchH, currentBatchH, hDim); + } else { + memcpy(currentBatchH, currentOutput, sizeof(F32) * hDim); + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp32/normalization.cpp b/compute/tensor/src/cpu/arm/fp32/normalization.cpp new file mode 100644 index 00000000..6604b485 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/normalization.cpp @@ -0,0 +1,62 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
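The file below vectorizes standard layer normalization: per inner vector, y = alpha * (x - mean) / sqrt(var + eps) + beta with eps = 1e-6. Scalar equivalent of the normalize-and-scale step, for reference (a sketch, not part of the patch):

#include <math.h>

// What array_norm_scale_fp32 computes per element once mean and var are known.
static void layer_norm_ref(const float *x, const float *alpha, const float *beta,
    float *y, int len, float mean, float var)
{
    const float std_value = sqrtf(var + 1e-6f);
    for (int i = 0; i < len; i++) {
        y[i] = alpha[i] * (x[i] - mean) / std_value + beta[i];
    }
}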
+ +#include <math.h> +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +inline void array_norm_scale_fp32( + F32 *input, F32 *output, I32 len, F32 mean, F32 var, F32 *alpha, F32 *beta) +{ + F32 eps = 1e-6; + F32 std_value = sqrt(var + eps); + float32x4_t mean_v = vdupq_n_f32(mean); + float32x4_t std_v = vdupq_n_f32(std_value); + + I32 i = 0; + for (i = 0; i < len - 3; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t alpha_v = vld1q_f32(alpha + i); + float32x4_t beta_v = vld1q_f32(beta + i); + + float32x4_t tmp_v = vsubq_f32(in, mean_v); + tmp_v = vdivq_f32(tmp_v, std_v); + tmp_v = vfmaq_f32(beta_v, alpha_v, tmp_v); + vst1q_f32(output + i, tmp_v); + } + for (; i < len; i++) { + output[i] = alpha[i] * (input[i] - mean) / std_value + beta[i]; + } +} + +EE layer_normalization_fp32( + TensorDesc inputDesc, F32 *input, F32 *alpha, F32 *beta, TensorDesc outputDesc, F32 *output) +{ + UNUSED(outputDesc); + if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + U32 size = tensorNumElements(inputDesc); + I32 size_inner = inputDesc.dims[0]; + I32 size_outer = size / size_inner; + for (I32 i = 0; i < size_outer; i++) { + F32 *current_input = input + i * size_inner; + F32 *current_output = output + i * size_inner; + F32 mean = array_mean_f32(current_input, size_inner); + F32 var = array_var_f32(current_input, size_inner, mean); + + array_norm_scale_fp32(current_input, current_output, size_inner, mean, var, alpha, beta); + } + + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp32/pooling.cpp b/compute/tensor/src/cpu/arm/fp32/pooling.cpp new file mode 100644 index 00000000..0249b731 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/pooling.cpp @@ -0,0 +1,86 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <float.h> +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +EE pooling_c8_fp32(const F32 *input, + U32 stride, + int hstart, + int hend, + int wstart, + int wend, + F32 *output, + PoolingParamSpec poolingParamSpec) +{ + EE ret = SUCCESS; + PoolingMode pm = poolingParamSpec.mode; + float32x4_t in0, in1, out0, out1; + float32x4_t poolSize = vdupq_n_f32((hend - hstart) * (wend - wstart)); + out0 = vdupq_n_f32((pm == POOLING_MAX) ? -FLT_MAX : 0); + out1 = out0; + for (int kernelH = hstart; kernelH < hend; kernelH++) { + for (int kernelW = wstart; kernelW < wend; kernelW++) { + const U32 index = (kernelH * stride + kernelW) * 8; + in0 = vld1q_f32(input + index); + in1 = vld1q_f32(input + index + 4); + switch (pm) { + case POOLING_MAX: { + out0 = vmaxq_f32(in0, out0); + out1 = vmaxq_f32(in1, out1); + break; + } + case POOLING_MEAN: { + out0 = vaddq_f32(out0, in0); + out1 = vaddq_f32(out1, in1); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + } + } + vst1q_f32(output, ((pm == POOLING_MAX) ? out0 : vdivq_f32(out0, poolSize))); + vst1q_f32(output + 4, ((pm == POOLING_MAX) ? out1 : vdivq_f32(out1, poolSize))); + return ret; +} + +EE pooling_bp_c8_fp32(const F32 *input, + int hstart, + int hend, + int wstart, + int wend, + F32 *output, + U32 stride, + PoolingParamSpec poolingParamSpec) +{ + EE ret = SUCCESS; + PoolingMode pm = poolingParamSpec.mode; + if (pm != POOLING_MEAN) { + ret = NOT_SUPPORTED; + } + float32x4_t poolSize = vdupq_n_f32((hend - hstart) * (wend - wstart)); + float32x4_t in0 = vdivq_f32(vld1q_f32(input), poolSize); + float32x4_t in1 = vdivq_f32(vld1q_f32(input + 4), poolSize); + for (int kernelH = hstart; kernelH < hend; kernelH++) { + for (int kernelW = wstart; kernelW < wend; kernelW++) { + U32 index = (kernelH * stride + kernelW) * 8; + float32x4_t out0 = vaddq_f32(vld1q_f32(output + index), in0); + float32x4_t out1 = vaddq_f32(vld1q_f32(output + index + 4), in1); + vst1q_f32(output + index, out0); + vst1q_f32(output + index + 4, out1); + } + } + return ret; +}
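pooling_c8_fp32 above assumes NCHWc8, so one spatial position is 8 consecutive floats and every window position costs exactly two q-register loads. Scalar equivalent of the mean path, using the same ((h * stride + w) * 8) addressing (illustrative sketch only):

// Mean pooling over one c8 block; the POOLING_MAX path differs only in
// taking a running max against a -FLT_MAX-initialized accumulator.
static void mean_pool_c8_ref(const float *input, unsigned stride,
    int hstart, int hend, int wstart, int wend, float *output)
{
    const float poolSize = (float)((hend - hstart) * (wend - wstart));
    for (int c = 0; c < 8; c++) {
        float sum = 0.0f;
        for (int h = hstart; h < hend; h++) {
            for (int w = wstart; w < wend; w++) {
                sum += input[(h * stride + w) * 8 + c];
            }
        }
        output[c] = sum / poolSize;
    }
}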
+ +#include "cpu/arm/tensor_computing_arm.h" +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +EE prelu_fp32(TensorDesc inputDesc, + F32 *input, + F32 *weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + F32 *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + if (tensorIs4d(inputDesc) && tensorIs4d(outputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + if (idf != DF_NCHWC8) { + CHECK_STATUS(NOT_SUPPORTED); + } + } else { + return NOT_SUPPORTED; + } + + CHECK_REQUIREMENT(in == on && ic == oc && ih == oh && iw == ow); + ic /= 8; + float32x4_t slope0, slope1; + uint32x4_t mask0, mask1; + float32x4_t in0, in1, out0, out1; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 hw = 0; hw < ih * iw; hw++) { + slope0 = preluDesc.propagate_down ? vdupq_n_f32(weight[0]) + : vld1q_f32(weight + c * 8); + slope1 = preluDesc.propagate_down ? vdupq_n_f32(weight[0]) + : vld1q_f32(weight + c * 8 + 4); + in0 = vld1q_f32(input); + in1 = vld1q_f32(input + 4); + mask0 = vcleq_f32(in0, vdupq_n_f32(0.f)); + mask1 = vcleq_f32(in1, vdupq_n_f32(0.f)); + float32x4_t tmp0 = vmulq_f32(in0, slope0); + float32x4_t tmp1 = vmulq_f32(in1, slope1); + out0 = vbslq_f32(mask0, tmp0, in0); + out1 = vbslq_f32(mask1, tmp1, in1); + vst1q_f32(output, out0); + vst1q_f32(output + 4, out1); + input += 8; + output += 8; + } + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp32/scale.cpp b/compute/tensor/src/cpu/arm/fp32/scale.cpp new file mode 100644 index 00000000..882ed072 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/scale.cpp @@ -0,0 +1,126 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +EE scale_nchwc8_fp32( + F32 *input, F32 *alpha, F32 *beta, I32 in, I32 ic, I32 elements_per_channel, F32 *output) +{ + float32x4_t in_vec, out_vec; + float32x4_t one = vdupq_n_f32(float32_t(1.)); + float32x4_t zero = vdupq_n_f32(float32_t(0.)); + U32 index = 0; + for (I32 n = 0; n < in; n++) { + for (I32 c = 0; c < ic; c += 8) { + float32x4_t alpha_vec0 = (alpha == nullptr) ? one : vld1q_f32(alpha + c); + float32x4_t alpha_vec1 = (alpha == nullptr) ? 
one : vld1q_f32(alpha + c + 4); + float32x4_t beta_vec0 = (beta == nullptr) ? zero : vld1q_f32(beta + c); + float32x4_t beta_vec1 = (beta == nullptr) ? zero : vld1q_f32(beta + c + 4); + for (I32 i = 0; i < elements_per_channel; i++) { + in_vec = vld1q_f32(input + index); + out_vec = vfmaq_f32(beta_vec0, alpha_vec0, in_vec); + vst1q_f32(output + index, out_vec); + + in_vec = vld1q_f32(input + index + 4); + out_vec = vfmaq_f32(beta_vec1, alpha_vec1, in_vec); + vst1q_f32(output + index + 4, out_vec); + index += 8; + } + } + } + return SUCCESS; +} + +EE scale_nchw_fp32( + F32 *input, F32 *alpha, F32 *beta, I32 in, I32 ic, I32 elements_per_channel, F32 *output) +{ + float32x4_t one = vdupq_n_f32(1.); + float32x4_t zero = vdupq_n_f32(0.); + U32 index = 0; + for (I32 n = 0; n < in; n++) { + for (I32 c = 0; c < ic; c++) { + float32x4_t alpha_vec = (alpha == nullptr) ? one : vdupq_n_f32(alpha[c]); + float32x4_t beta_vec = (beta == nullptr) ? zero : vdupq_n_f32(beta[c]); + I32 i = 0; + for (; i < elements_per_channel - 3; i += 4) { + float32x4_t in_vec = vld1q_f32(input + index); + float32x4_t out_vec = vfmaq_f32(beta_vec, alpha_vec, in_vec); + vst1q_f32(output + index, out_vec); + index += 4; + } + for (; i < elements_per_channel; i++) { + float alpha_s = (alpha == nullptr) ? 1 : alpha[c]; + float beta_s = (beta == nullptr) ? 0 : beta[c]; + output[index] = alpha_s * input[index] + beta_s; + index++; + } + } + } + return SUCCESS; +} + +EE scale_nhwc_fp32( + F32 *input, F32 *alpha, F32 *beta, I32 in, I32 ic, I32 elements_per_channel, F32 *output) +{ + float32x4_t one = vdupq_n_f32(1.); + float32x4_t zero = vdupq_n_f32(0.); + U32 index = 0; + for (I32 n = 0; n < in; n++) { + for (I32 i = 0; i < elements_per_channel; i++) { + I32 c = 0; + for (; c < ic - 3; c += 4) { + float32x4_t alpha_vec = (alpha == nullptr) ? one : vld1q_f32(alpha + c); + float32x4_t beta_vec = (beta == nullptr) ? zero : vld1q_f32(beta + c); + float32x4_t in_vec = vld1q_f32(input + index); + float32x4_t out_vec = vfmaq_f32(beta_vec, alpha_vec, in_vec); + vst1q_f32(output + index, out_vec); + index += 4; + } + for (; c < ic; c++) { + float alpha_s = (alpha == nullptr) ? 1 : alpha[c]; + float beta_s = (beta == nullptr) ? 0 : beta[c]; + output[index] = alpha_s * input[index] + beta_s; + index++; + } + } + } + return SUCCESS; +} + +EE scale_fp32(F32 *input, + I32 axis, + I32 nDims, + F32 *alpha, + F32 *beta, + I32 in, + I32 ic, + I32 elements_per_channel, + F32 *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = SUCCESS; + // If ic is 1, it means that weights/vectors have only one param, so we need use the calculation logic of nchw. + if (axis == 1 || axis == 0 || ic == 1) { + ret = scale_nchw_fp32(input, alpha, beta, in, ic, elements_per_channel, output); + } else if (axis == nDims - 1) { + ret = scale_nhwc_fp32(input, alpha, beta, in, ic, elements_per_channel, output); + } else if (axis == nDims) { + ret = scale_nchwc8_fp32(input, alpha, beta, in, ic, elements_per_channel, output); + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp32/softmax.cpp b/compute/tensor/src/cpu/arm/fp32/softmax.cpp new file mode 100644 index 00000000..04597bc8 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/softmax.cpp @@ -0,0 +1,139 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
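The three scale variants above differ only in how the per-channel parameters are broadcast; the arithmetic is y = alpha[c] * x + beta[c] in every case. A self-contained scalar model of the scale_nchw_fp32 path, with made-up sizes and values for illustration:

#include <cstdio>

int main()
{
    // ic = 2 channels, elements_per_channel = 3, single batch
    float input[6] = {1, 2, 3, 4, 5, 6};
    float alpha[2] = {2.0f, 0.5f};
    float beta[2] = {1.0f, 0.0f};
    float output[6];
    int index = 0;
    for (int c = 0; c < 2; c++) {
        for (int i = 0; i < 3; i++, index++) {
            output[index] = alpha[c] * input[index] + beta[c];
        }
    }
    for (int i = 0; i < 6; i++) {
        printf("%.1f ", output[i]);  // prints: 3.0 5.0 7.0 2.0 2.5 3.0
    }
    return 0;
}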
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <math.h> +#include <string.h> +#include <vector> +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +void softmax_lastAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, F32 *output) +{ + for (I32 i = 0; i < loopOuter; i++) { + const F32 *inputPtr = input + i * loops; + F32 *outputPtr = output + i * loops; + + float32x4_t max_v, sub_v, sum_v, tmp_v; + F32 max_s, tmp_s; + max_s = array_max_f32(inputPtr, loops); + max_v = vdupq_n_f32(max_s); + sum_v = vdupq_n_f32(0); + + I32 j = 0; + F32 sum_s = 0; + for (j = 0; j < loops - 3; j += 4) { + float32x4_t in = vld1q_f32(inputPtr + j); + sub_v = vsubq_f32(in, max_v); + tmp_v = vexpq_f32_03_percent_error(sub_v); + sum_v = vaddq_f32(sum_v, tmp_v); + vst1q_f32(outputPtr + j, tmp_v); + } + sum_s += vaddvq_f32(sum_v); + for (; j < loops; j++) { + tmp_s = exp(inputPtr[j] - max_s); + outputPtr[j] = tmp_s; + sum_s += tmp_s; + } + array_scale_f32(outputPtr, outputPtr, loops, 1.0 / sum_s, 0); + } +} + +void softmax_anyAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, I32 loopInner, F32 *output) +{ + std::vector<F32> buffer(loopInner * 2); + F32 *maxBuffer = &buffer[0]; + F32 *sumBuffer = &buffer[loopInner]; + I32 k = 0; + for (I32 i = 0; i < loopOuter; i++) { + const F32 *inputPtrBase = input + i * loops * loopInner; + F32 *outputPtrBase = output + i * loops * loopInner; + + memcpy(maxBuffer, inputPtrBase, loopInner * sizeof(F32)); + memset(sumBuffer, 0, loopInner * sizeof(F32)); + for (I32 j = 1; j < loops; j++) { + const F32 *inputPtr = inputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 3; k += 4) { + float32x4_t in_v = vld1q_f32(inputPtr + k); + float32x4_t out_v = vld1q_f32(maxBuffer + k); + float32x4_t max_v = vmaxq_f32(in_v, out_v); + vst1q_f32(maxBuffer + k, max_v); + } + for (; k < loopInner; k++) { + maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]); + } + } + for (I32 j = 0; j < loops; j++) { + const F32 *inputPtr = inputPtrBase + j * loopInner; + F32 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 3; k += 4) { + float32x4_t in_v = vld1q_f32(inputPtr + k); + float32x4_t max_v = vld1q_f32(maxBuffer + k); + float32x4_t sub_v = vsubq_f32(in_v, max_v); + float32x4_t exp_v = vexpq_f32_03_percent_error(sub_v); + float32x4_t sum_v = vld1q_f32(sumBuffer + k); + sum_v = vaddq_f32(sum_v, exp_v); + vst1q_f32(sumBuffer + k, sum_v); + vst1q_f32(outputPtr + k, exp_v); + } + for (; k < loopInner; k++) { + outputPtr[k] = exp(inputPtr[k] - maxBuffer[k]); + sumBuffer[k] += outputPtr[k]; + } + } + for (I32 j = 0; j <
loops; j++) { + F32 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 3; k += 4) { + float32x4_t out_v = vld1q_f32(outputPtr + k); + float32x4_t sum_v = vld1q_f32(sumBuffer + k); + out_v = vdivq_f32(out_v, sum_v); + vst1q_f32(outputPtr + k, out_v); + } + for (; k < loopInner; k++) { + outputPtr[k] /= sumBuffer[k]; + } + } + } +} + +EE softmax_fp32(TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output) +{ + UNUSED(outputDesc); + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + U32 size = tensorNumElements(inputDesc); + axis = (axis + inputDesc.nDims) % inputDesc.nDims; + axis = inputDesc.nDims - 1 - axis; + I32 loops = inputDesc.dims[axis]; + + I32 loopInner = 1; + for (int i = 0; i < axis; i++) { + loopInner *= inputDesc.dims[i]; + } + U32 loopOuter = size / loops / loopInner; + + if (loopInner == 1) { + if (DF_NCHWC8 == inputDesc.df && 4 == inputDesc.nDims && + (inputDesc.dims[1] != 1 || inputDesc.dims[0] != 1)) { + CHECK_REQUIREMENT(2 != axis); + loopInner *= 8; + loopOuter /= 8; + softmax_anyAxis_fp32(input, loopOuter, loops, loopInner, output); + } else { + softmax_lastAxis_fp32(input, loopOuter, loops, output); + } + } else { + CHECK_REQUIREMENT(DF_NCHWC8 != inputDesc.df); + softmax_anyAxis_fp32(input, loopOuter, loops, loopInner, output); + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp32/tensor_computing_fp32.h b/compute/tensor/src/cpu/arm/fp32/tensor_computing_fp32.h new file mode 100644 index 00000000..76d2b38f --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/tensor_computing_fp32.h @@ -0,0 +1,242 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
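The two axis-normalization lines in softmax_fp32 above are easy to misread because TensorDesc.dims is stored innermost-first. A worked example, assuming the NCHW convention dims = {W, H, C, N}:

#include <cstdio>

int main()
{
    int nDims = 4;                  // 4-D NCHW tensor, dims = {W, H, C, N}
    int axis = -3;                  // user-facing channel axis in negative form
    axis = (axis + nDims) % nDims;  // -> 1, same as the positive form axis = 1
    axis = nDims - 1 - axis;        // -> 2, index of C in the innermost-first dims
    // so loops = dims[2] = C, loopInner = dims[0] * dims[1] = W * H,
    // and loopInner != 1 selects the softmax_anyAxis_fp32 path
    printf("dims index = %d\n", axis);
    return 0;
}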
+ +#ifndef _H_TENSOR_COMPUTING_FP32 +#define _H_TENSOR_COMPUTING_FP32 +#include <vector> +#include "sys.h" +#include "error.h" +#include "thread_affinity.h" +#include "types.h" +#include "cpu/arm/fp32/arm_functions_fp32.h" + +EE convolution_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed); + +EE convolution_fp32(TensorDesc inputDesc, + F32 *input, + TensorDesc filterDesc, + const F32 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const F32 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *output, + ActivationParamSpec activationDesc, + Arch arch); + +#ifdef __aarch64__ +EE convolution_gemm_V8(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc); +#else +EE convolution_gemm_V7(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc); +#endif + +#ifdef __aarch64__ +EE convolution_gemm_icnchw_V8(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc); +#else +EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc); +#endif + +EE convolution_winograd_V8(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc); + +EE deconvolution_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed); + +EE pooling_c8_fp32(const F32 *input, + U32 stride, + int hstart, + int hend, + int wstart, + int wend, + F32 *output, + PoolingParamSpec poolingParamSpec); + +EE pooling_bp_c8_fp32(const F32 *input, + int hstart, + int hend, + int wstart, + int wend, + F32 *output, + U32 stride, + PoolingParamSpec poolingParamSpec); + +EE softmax_fp32( + TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output); + +EE concat_fp32(std::vector<TensorDesc> inputDesc, + std::vector<void *> input, + TensorDesc outputDesc, + void *output, + U32 concatDim); + +EE attention_fp32(U32 batch, + U32 numHeads, + I32 fromSequenceLength, + I32 toSequenceLength, + const F32 *input, + F32 *output); + +EE clip_fp32(F32 *input, F32 *output, I32 len, F32 minValue, F32 maxValue); + +EE depthwise_pointwise_convolution_fp32(TensorDesc inputDesc, + F32 *input, + TensorDesc dwFilterDesc, + const F32 *dwFilter, + TensorDesc pwFilterDesc, + const F32 *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const F32 *dwBias, + TensorDesc pwBiasDesc, + const F32 *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch); + +EE eltwise_fp32(std::vector<void *> input, + std::vector<U32> inputSize, + U32 num, + U32 len, + void *output, + EltwiseMode eltwiseMode); + +EE rnncell_fp32(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch); + +EE power_fp32(TensorDesc inputDesc, + F32 *input, + F32 scale, + F32 shift, + F32 power, + TensorDesc outputDesc, + F32 *output); + +EE layer_normalization_fp32( + TensorDesc inputDesc, F32 *input, F32 *alpha, F32 *beta, TensorDesc outputDesc, F32 *output); + +EE scale_fp32(F32 *input, + I32 axis, + I32 nDims, + F32 *alpha, + F32 *beta, + I32 in, + I32 ic, + I32 elements_per_channel, + F32 *output); + +EE check_fp32(TensorDesc inputDescA, + const F32 *inputA, + TensorDesc inputDescB, + const F32 *inputB, + CheckMode checkMode, + TensorDesc outputDesc, + I32 *output); + +EE attention_mask_fp32(TensorDesc inputDesc, + const F32 *input, + AttentionMaskParamSpec p, + TensorDesc outputDesc, + F32 *output); + +EE prelu_fp32(TensorDesc inputDesc, + F32 *input, + F32 *weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + F32 *output); +#endif diff --git a/tensor_computing/src/cpu/arm/int8/arm_functions_int8.h b/compute/tensor/src/cpu/arm/int8/arm_functions_int8.h similarity index 81% rename from tensor_computing/src/cpu/arm/int8/arm_functions_int8.h rename to compute/tensor/src/cpu/arm/int8/arm_functions_int8.h index 10ebb20a..d1c1cebe 100644 --- a/tensor_computing/src/cpu/arm/int8/arm_functions_int8.h +++ b/compute/tensor/src/cpu/arm/int8/arm_functions_int8.h @@ -1,35 +1,35 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_ARM_FUNCTIONS_INT8 #define _H_ARM_FUNCTIONS_INT8 #include "arm_neon_expand.h" -inline EE activation_int8(INT8* input, U32 len, ActivationDesc activationDesc, INT8* output) +inline EE activation_int8(INT8 *input, U32 len, ActivationParamSpec activationDesc, INT8 *output) { int8x16_t in, out; - int8x16_t zero = vdupq_n_s8(0); + int8x16_t zero = vdupq_n_s8(0); U32 len_main = len / 16; U32 len_tail = len % 16; - switch (activationDesc.mode){ + switch (activationDesc.mode) { case ACTIVATION_NULL: { break; } case ACTIVATION_RELU: { - if (activationDesc.value[0] != 0) + if (activationDesc.value[0] != 0) { return NOT_SUPPORTED; + } for (U32 i = 0; i < len_main; i++) { in = vld1q_s8(input); out = vmaxq_s8(zero, in); diff --git a/compute/tensor/src/cpu/arm/int8/concat.cpp b/compute/tensor/src/cpu/arm/int8/concat.cpp new file mode 100644 index 00000000..d0acd676 --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/concat.cpp @@ -0,0 +1,144 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
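activation_int8 above follows the usual pattern of a 16-lane NEON main loop plus a scalar tail. A stand-alone sketch of that pattern, specialized to ReLU; relu_int8_ref is a hypothetical helper for illustration, not part of this patch:

#include <arm_neon.h>

static void relu_int8_ref(const int8_t *input, unsigned int len, int8_t *output)
{
    int8x16_t zero = vdupq_n_s8(0);
    unsigned int lenMain = len / 16;
    unsigned int lenTail = len % 16;
    for (unsigned int i = 0; i < lenMain; i++) {
        int8x16_t in = vld1q_s8(input + i * 16);        // 16 int8 lanes at a time
        vst1q_s8(output + i * 16, vmaxq_s8(zero, in));  // max(0, x) per lane
    }
    for (unsigned int i = len - lenTail; i < len; i++) {
        output[i] = (input[i] > 0) ? input[i] : 0;      // leftover elements
    }
}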
+ +#ifdef _USE_INT8 +#include <arm_neon.h> +#include <string.h> +#include "cpu/arm/int8/tensor_computing_int8.h" + +EE concat_int8(std::vector<TensorDesc> inputDesc, + std::vector<void *> input, + F32 *inputScale, + int concatDim, + TensorDesc outputDesc, + void *output, + F32 *outputScale) +{ + if (inputDesc.size() < 1) { + CHECK_STATUS(NOT_MATCH); + } + if (inputDesc.size() == 1) { + memcpy(output, input[0], tensorNumBytes(outputDesc)); + return SUCCESS; + } + if (concatDim != 0 && concatDim != 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + + F32 min_scale = inputScale[0]; + U32 min_idx = 0; + + for (U32 i = 1; i < input.size(); i++) { + if (min_scale > inputScale[i]) { + min_scale = inputScale[i]; + min_idx = i; + } + } + *outputScale = min_scale; + + for (U32 i = 0; i < input.size(); i++) { + if (i == min_idx) { + continue; + } + + INT8 *narr = (INT8 *)input[i]; + F32 rescale = min_scale / inputScale[i]; + if (rescale >= 0.9921) { // rescale >= 127/128, so no stored value would change (even 127 stays 127) + continue; + } + INT8 factor = rescale * 128; + + if (factor < 2) { + continue; + } + + int8x8_t fact = vdup_n_s8(factor); + + U32 num = tensorNumElements(inputDesc[i]); + U32 i32 = num / 32; + + int8x8_t in[4]; + int16x8_t in16[4]; + + for (U32 k = 0; k < i32; k++) { + for (U32 j = 0; j < 4; j++) { + in[j] = vld1_s8(narr + j * 8); + } + for (U32 j = 0; j < 4; j++) { + in16[j] = vmull_s8(in[j], fact); + } + in[0] = vqshrn_n_s16(in16[0], 7); + for (U32 j = 1; j < 4; j++) { + in[j] = vqshrn_n_s16(in16[j], 7); + vst1_s8(narr + j * 8 - 8, in[j - 1]); + } + vst1_s8(narr + 24, in[3]); + + narr += 32; + } + + U32 remainder = num - i32 * 32; + + for (U32 j = 0; j < remainder; j += 8) { + int8x8_t in = vld1_s8(narr + j); + int16x8_t in16 = vmull_s8(in, fact); + in = vqshrn_n_s16(in16, 7); + vst1_s8(narr + j, in); + } + } + + DataType odt, idt; + DataFormat odf, idf; + U32 on = 0, oc = 0, oh = 0, ow = 0, in = 0, ic = 0, ih = 0, iw = 0; + U32 copySize; + + if (tensorIs4d(outputDesc)) { + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + if (odt != DT_I8) { + CHECK_STATUS(NOT_MATCH); + } + + INT8 *out_ptr = (INT8 *)output; + + // batch + if (concatDim == 0) { + for (U32 i = 0; i < inputDesc.size(); i++) { + copySize = tensorNumElements(inputDesc[i]) * sizeof(INT8); + + memcpy(out_ptr, input[i], copySize); + out_ptr = out_ptr + copySize; + } + return SUCCESS; + } + // channel + if (concatDim == 1) { + for (U32 j = 0; j < on; j++) { + for (U32 i = 0; i < inputDesc.size(); i++) { + CHECK_STATUS(tensor4dGet(inputDesc[i], &idt, &idf, &in, &ic, &ih, &iw)); + if (odf != idf) { + CHECK_STATUS(NOT_MATCH); + } + + copySize = tensorNumElements(inputDesc[i]) / in * sizeof(INT8); + + memcpy(out_ptr, (INT8 *)input[i] + j * copySize, copySize); + out_ptr = out_ptr + copySize; + } + } + return SUCCESS; + } + } else { + return NOT_MATCH; + } + return NOT_SUPPORTED; +} +#endif diff --git a/compute/tensor/src/cpu/arm/int8/convolution.cpp b/compute/tensor/src/cpu/arm/int8/convolution.cpp new file mode 100644 index 00000000..269ab97a --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/convolution.cpp @@ -0,0 +1,94 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
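The in-place rescaling in concat_int8 above is plain fixed-point arithmetic: a tensor quantized with scale s_i is re-expressed in the common scale s_min by multiplying each INT8 value by factor = (s_min / s_i) * 128 and shifting right by 7, which is what the vmull_s8/vqshrn_n_s16 pair implements. A scalar model with illustrative scale values (saturation omitted):

#include <cstdio>

int main()
{
    float inputScale = 52.0f, minScale = 39.0f;         // illustrative values
    float rescale = minScale / inputScale;              // 0.75
    signed char factor = (signed char)(rescale * 128);  // 96
    signed char x = 100;
    short wide = (short)x * factor;            // vmull_s8: widen to int16
    signed char y = (signed char)(wide >> 7);  // vqshrn_n_s16(..., 7): narrow back
    printf("%d -> %d (expect about %d)\n", x, y, (int)(x * rescale));  // 100 -> 75
    return 0;
}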
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_INT8 +#include "types.h" +#include "cpu/arm/int8/tensor_computing_int8.h" +#include "cpu/arm/int8/convolution_winograd.h" +#include "cpu/arm/int8/convolution_gemm.h" + +EE convolution_int8(TensorDesc inputDesc, + const INT8 *input, + TensorDesc filterDesc, + const INT8 *filter, + F16 *scales, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const F16 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + if (nullptr == input || nullptr == filter || nullptr == output || nullptr == bias || + nullptr == tmp) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (idt != DT_I8 && idt != DT_F16) { + CHECK_STATUS(NOT_MATCH); + } + if (fdt != DT_I8) { + CHECK_STATUS(NOT_MATCH); + } + if (odt != DT_F16 && odt != DT_I8) { + CHECK_STATUS(NOT_MATCH); + } + if (odf != DF_NCHWC8) { + CHECK_STATUS(NOT_MATCH); + } + if (!(ic == fc && oc == fn)) { + CHECK_STATUS(NOT_MATCH); + } + + const INT8 *inputPtr = input; + INT8 *tmpPtr = (INT8 *)tmp; + if (idf == DF_NCHW) { + TensorDesc prevDesc = inputDesc; + inputDesc.df = DF_NCHWC8; + CHECK_STATUS(transformNCHWToNCHWC8(prevDesc, input, inputDesc, tmpPtr)); + inputPtr = tmpPtr; + tmpPtr += tensorNumBytes(inputDesc); + tmpBytes -= tensorNumBytes(inputDesc); + } + + EE ret = SUCCESS; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_WINOGRAD: + ret = convolution_winograd(inputDesc, inputPtr, scales, filterDesc, filter, scales + 2, + convParamSpec, biasDesc, bias, tmpBytes, tmpPtr, outputDesc, output, scales + 1, + activationDesc, arch); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = convolution_gemm(inputDesc, inputPtr, scales, filterDesc, filter, scales + 2, + convParamSpec, biasDesc, bias, tmpBytes, tmpPtr, outputDesc, output, scales + 1, + activationDesc, arch); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} +#endif diff --git a/compute/tensor/src/cpu/arm/int8/convolution_gemm.h b/compute/tensor/src/cpu/arm/int8/convolution_gemm.h new file mode 100644 index 00000000..4ef9117a --- /dev/null +++ 
b/compute/tensor/src/cpu/arm/int8/convolution_gemm.h @@ -0,0 +1,502 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_CONVOLUTION_GEMM +#define _H_CONVOLUTION_GEMM +#ifdef _USE_INT8 +#include + +#include "sys.h" +#include "types.h" + +template +EE convolution_gemm_A55(TensorDesc inputDesc, + const void *input, + F16 *inputScale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am); + +template +EE convolution_gemm_A76(TensorDesc inputDesc, + const void *input, + F16 *inputScale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am); + +inline EE convolution_gemm(TensorDesc inputDesc, + const void *input, + F16 *inputScale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am, + Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: { + ret = convolution_gemm_A55(inputDesc, input, inputScale, filterDesc, filter, + filterScale, convParamSpec, biasDesc, bias, tmpBytes, tmp, outputDesc, output, + outputScale, am); + break; + } + case ARM_A76: { + ret = convolution_gemm_A76(inputDesc, input, inputScale, filterDesc, filter, + filterScale, convParamSpec, biasDesc, bias, tmpBytes, tmp, outputDesc, output, + outputScale, am); + break; + } + default: { + ret = NOT_SUPPORTED; + break; + } + } + return ret; +} + +inline EE quantize_I32(U32 num_v, I32 *out_d, I32 factor, F32 scale, INT8 *out_q) +{ + // num_v is the number of q-form vectors (I32) + I32 *arr_d = out_d; + I32 fact = factor; + INT8 *arr_q = out_q; + U32 i28 = num_v / 28; // The number of iterations, each handling 28 vectors + + if (i28 > 0) { + __asm__ __volatile__("ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" + "ldr s0, [%[factor]]\n" + "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n" + "mov x1, %[i]\n" + "ld4 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[out_d]], #64\n" + "dup v0.4s, v0.s[0]\n" + "ld4 {v13.4s, v14.4s, 
v15.4s, v16.4s}, [%[out_d]], #64\n" + "ld4 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[out_d]], #64\n" + "ld4 {v21.4s, v22.4s, v23.4s, v24.4s}, [%[out_d]], #64\n" + + "0:\n" + "ld4 {v25.4s, v26.4s, v27.4s, v28.4s}, [%[out_d]], #64\n" + "subs x1, x1, #1\n" + + "mul v4.4s, v4.4s, v0.4s\n" + "mul v3.4s, v3.4s, v0.4s\n" + "mul v2.4s, v2.4s, v0.4s\n" + "mul v1.4s, v1.4s, v0.4s\n" + + "mul v8.4s, v8.4s, v0.4s\n" + "sri v4.4s, v3.4s, #8\n" + "mul v7.4s, v7.4s, v0.4s\n" + "sri v2.4s, v1.4s, #8\n" + "mul v6.4s, v6.4s, v0.4s\n" + "mul v5.4s, v5.4s, v0.4s\n" + "sri v4.4s, v2.4s, #16\n" + + "mul v12.4s, v12.4s, v0.4s\n" + "sri v8.4s, v7.4s, #8\n" + "mul v11.4s, v11.4s, v0.4s\n" + "sri v6.4s, v5.4s, #8\n" + "mul v10.4s, v10.4s, v0.4s\n" + "str q4, [%[out_q]], #16\n" + "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" + "mul v9.4s, v9.4s, v0.4s\n" + "sri v8.4s, v6.4s, #16\n" + + "mul v16.4s, v16.4s, v0.4s\n" + "sri v12.4s, v11.4s, #8\n" + "mul v15.4s, v15.4s, v0.4s\n" + "sri v10.4s, v9.4s, #8\n" + "mul v14.4s, v14.4s, v0.4s\n" + "str q8, [%[out_q]], #16\n" + "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n" + "mul v13.4s, v13.4s, v0.4s\n" + "sri v12.4s, v10.4s, #16\n" + + "mul v20.4s, v20.4s, v0.4s\n" + "sri v16.4s, v15.4s, #8\n" + "mul v19.4s, v19.4s, v0.4s\n" + "sri v14.4s, v13.4s, #8\n" + "mul v18.4s, v18.4s, v0.4s\n" + "str q12, [%[out_q]], #16\n" + "ld4 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[out_d]], #64\n" + "mul v17.4s, v17.4s, v0.4s\n" + "sri v16.4s, v14.4s, #16\n" + + "mul v24.4s, v24.4s, v0.4s\n" + "sri v20.4s, v19.4s, #8\n" + "mul v23.4s, v23.4s, v0.4s\n" + "sri v18.4s, v17.4s, #8\n" + "mul v22.4s, v22.4s, v0.4s\n" + "str q16, [%[out_q]], #16\n" + "ld4 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[out_d]], #64\n" + "mul v21.4s, v21.4s, v0.4s\n" + "sri v20.4s, v18.4s, #16\n" + + "mul v28.4s, v28.4s, v0.4s\n" + "sri v24.4s, v23.4s, #8\n" + "mul v27.4s, v27.4s, v0.4s\n" + "sri v22.4s, v21.4s, #8\n" + "mul v26.4s, v26.4s, v0.4s\n" + "str q20, [%[out_q]], #16\n" + "ld4 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[out_d]], #64\n" + "mul v25.4s, v25.4s, v0.4s\n" + "sri v24.4s, v22.4s, #16\n" + + "sri v28.4s, v27.4s, #8\n" + "sri v26.4s, v25.4s, #8\n" + "str q24, [%[out_q]], #16\n" + "sri v28.4s, v26.4s, #16\n" + "ld4 {v21.4s, v22.4s, v23.4s, v24.4s}, [%[out_d]], #64\n" + "str q28, [%[out_q]], #16\n" + "bne 0b\n" + : [out_d] "+r"(arr_d), [out_q] "+r"(arr_q) + : [factor] "r"(&fact), [i] "r"((I64)i28) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", + "x1"); + arr_d -= 96; // Prefetched 24 extra vectors + } + + U32 remainder = num_v - i28 * 28; + + if (remainder % 4) { + for (U32 i = 0; i < 8; i++) { + arr_q[i] = round_towards_zero(arr_d[i] * scale); + } + arr_d += 8; + arr_q += 8; + remainder -= 2; + } + + switch (remainder) { + case 24: { + __asm__ __volatile__("ldr s0, [%[factor]]\n" + "dup v0.4s, v0.s[0]\n" + + "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" + "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n" + "ld4 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[out_d]], #64\n" + "ld4 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[out_d]], #64\n" + "ld4 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[out_d]], #64\n" + "ld4 {v21.4s, v22.4s, v23.4s, v24.4s}, [%[out_d]], #64\n" + + "mul v4.4s, v4.4s, v0.4s\n" + "mul v3.4s, v3.4s, v0.4s\n" + "mul v2.4s, v2.4s, v0.4s\n" + "mul v1.4s, v1.4s, v0.4s\n" + + "mul v8.4s, v8.4s, v0.4s\n" + "sri v4.4s, v3.4s, #8\n" + "mul v7.4s, v7.4s, 
v0.4s\n" + "sri v2.4s, v1.4s, #8\n" + "mul v6.4s, v6.4s, v0.4s\n" + "mul v5.4s, v5.4s, v0.4s\n" + "sri v4.4s, v2.4s, #16\n" + + "mul v12.4s, v12.4s, v0.4s\n" + "sri v8.4s, v7.4s, #8\n" + "mul v11.4s, v11.4s, v0.4s\n" + "sri v6.4s, v5.4s, #8\n" + "mul v10.4s, v10.4s, v0.4s\n" + "str q4, [%[out_q]], #16\n" + "mul v9.4s, v9.4s, v0.4s\n" + "sri v8.4s, v6.4s, #16\n" + + "mul v16.4s, v16.4s, v0.4s\n" + "sri v12.4s, v11.4s, #8\n" + "mul v15.4s, v15.4s, v0.4s\n" + "sri v10.4s, v9.4s, #8\n" + "mul v14.4s, v14.4s, v0.4s\n" + "str q8, [%[out_q]], #16\n" + "mul v13.4s, v13.4s, v0.4s\n" + "sri v12.4s, v10.4s, #16\n" + + "mul v20.4s, v20.4s, v0.4s\n" + "sri v16.4s, v15.4s, #8\n" + "mul v19.4s, v19.4s, v0.4s\n" + "sri v14.4s, v13.4s, #8\n" + "mul v18.4s, v18.4s, v0.4s\n" + "str q12, [%[out_q]], #16\n" + "mul v17.4s, v17.4s, v0.4s\n" + "sri v16.4s, v14.4s, #16\n" + + "mul v24.4s, v24.4s, v0.4s\n" + "sri v20.4s, v19.4s, #8\n" + "mul v23.4s, v23.4s, v0.4s\n" + "sri v18.4s, v17.4s, #8\n" + "mul v22.4s, v22.4s, v0.4s\n" + "str q16, [%[out_q]], #16\n" + "mul v21.4s, v21.4s, v0.4s\n" + "sri v20.4s, v18.4s, #16\n" + + "sri v24.4s, v23.4s, #8\n" + "sri v22.4s, v21.4s, #8\n" + "str q20, [%[out_q]], #16\n" + "sri v24.4s, v22.4s, #16\n" + + "str q24, [%[out_q]], #16\n" + : [out_d] "+r"(arr_d), [out_q] "+r"(arr_q) + : [factor] "r"(&fact) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "x1"); + break; + } + case 20: { + __asm__ __volatile__("ldr s0, [%[factor]]\n" + "dup v0.4s, v0.s[0]\n" + + "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" + "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n" + "ld4 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[out_d]], #64\n" + "ld4 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[out_d]], #64\n" + "ld4 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[out_d]], #64\n" + + "mul v4.4s, v4.4s, v0.4s\n" + "mul v3.4s, v3.4s, v0.4s\n" + "mul v2.4s, v2.4s, v0.4s\n" + "mul v1.4s, v1.4s, v0.4s\n" + + "mul v8.4s, v8.4s, v0.4s\n" + "sri v4.4s, v3.4s, #8\n" + "mul v7.4s, v7.4s, v0.4s\n" + "sri v2.4s, v1.4s, #8\n" + "mul v6.4s, v6.4s, v0.4s\n" + "mul v5.4s, v5.4s, v0.4s\n" + "sri v4.4s, v2.4s, #16\n" + + "mul v12.4s, v12.4s, v0.4s\n" + "sri v8.4s, v7.4s, #8\n" + "mul v11.4s, v11.4s, v0.4s\n" + "sri v6.4s, v5.4s, #8\n" + "mul v10.4s, v10.4s, v0.4s\n" + "str q4, [%[out_q]], #16\n" + "mul v9.4s, v9.4s, v0.4s\n" + "sri v8.4s, v6.4s, #16\n" + + "mul v16.4s, v16.4s, v0.4s\n" + "sri v12.4s, v11.4s, #8\n" + "mul v15.4s, v15.4s, v0.4s\n" + "sri v10.4s, v9.4s, #8\n" + "mul v14.4s, v14.4s, v0.4s\n" + "str q8, [%[out_q]], #16\n" + "mul v13.4s, v13.4s, v0.4s\n" + "sri v12.4s, v10.4s, #16\n" + + "mul v20.4s, v20.4s, v0.4s\n" + "sri v16.4s, v15.4s, #8\n" + "mul v19.4s, v19.4s, v0.4s\n" + "sri v14.4s, v13.4s, #8\n" + "mul v18.4s, v18.4s, v0.4s\n" + "str q12, [%[out_q]], #16\n" + "mul v17.4s, v17.4s, v0.4s\n" + "sri v16.4s, v14.4s, #16\n" + + "sri v20.4s, v19.4s, #8\n" + "sri v18.4s, v17.4s, #8\n" + "str q16, [%[out_q]], #16\n" + "sri v20.4s, v18.4s, #16\n" + + "str q20, [%[out_q]], #16\n" + : [out_d] "+r"(arr_d), [out_q] "+r"(arr_q) + : [factor] "r"(&fact) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "x1"); + break; + } + case 16: { + __asm__ __volatile__("ldr s0, [%[factor]]\n" + "dup v0.4s, v0.s[0]\n" + + "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" + "ld4 {v5.4s, v6.4s, 
v7.4s, v8.4s}, [%[out_d]], #64\n" + "ld4 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[out_d]], #64\n" + "ld4 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[out_d]], #64\n" + + "mul v4.4s, v4.4s, v0.4s\n" + "mul v3.4s, v3.4s, v0.4s\n" + "mul v2.4s, v2.4s, v0.4s\n" + "mul v1.4s, v1.4s, v0.4s\n" + + "mul v8.4s, v8.4s, v0.4s\n" + "sri v4.4s, v3.4s, #8\n" + "mul v7.4s, v7.4s, v0.4s\n" + "sri v2.4s, v1.4s, #8\n" + "mul v6.4s, v6.4s, v0.4s\n" + "mul v5.4s, v5.4s, v0.4s\n" + "sri v4.4s, v2.4s, #16\n" + + "mul v12.4s, v12.4s, v0.4s\n" + "sri v8.4s, v7.4s, #8\n" + "mul v11.4s, v11.4s, v0.4s\n" + "sri v6.4s, v5.4s, #8\n" + "mul v10.4s, v10.4s, v0.4s\n" + "str q4, [%[out_q]], #16\n" + "mul v9.4s, v9.4s, v0.4s\n" + "sri v8.4s, v6.4s, #16\n" + + "mul v16.4s, v16.4s, v0.4s\n" + "sri v12.4s, v11.4s, #8\n" + "mul v15.4s, v15.4s, v0.4s\n" + "sri v10.4s, v9.4s, #8\n" + "mul v14.4s, v14.4s, v0.4s\n" + "str q8, [%[out_q]], #16\n" + "mul v13.4s, v13.4s, v0.4s\n" + "sri v12.4s, v10.4s, #16\n" + + "sri v16.4s, v15.4s, #8\n" + "sri v14.4s, v13.4s, #8\n" + "str q12, [%[out_q]], #16\n" + "sri v16.4s, v14.4s, #16\n" + + "str q16, [%[out_q]], #16\n" + : [out_d] "+r"(arr_d), [out_q] "+r"(arr_q) + : [factor] "r"(&fact) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "x1"); + break; + } + case 12: { + __asm__ __volatile__("ldr s0, [%[factor]]\n" + "dup v0.4s, v0.s[0]\n" + + "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" + "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n" + "ld4 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[out_d]], #64\n" + + "mul v4.4s, v4.4s, v0.4s\n" + "mul v3.4s, v3.4s, v0.4s\n" + "mul v2.4s, v2.4s, v0.4s\n" + "mul v1.4s, v1.4s, v0.4s\n" + + "mul v8.4s, v8.4s, v0.4s\n" + "sri v4.4s, v3.4s, #8\n" + "mul v7.4s, v7.4s, v0.4s\n" + "sri v2.4s, v1.4s, #8\n" + "mul v6.4s, v6.4s, v0.4s\n" + "mul v5.4s, v5.4s, v0.4s\n" + "sri v4.4s, v2.4s, #16\n" + + "mul v12.4s, v12.4s, v0.4s\n" + "sri v8.4s, v7.4s, #8\n" + "mul v11.4s, v11.4s, v0.4s\n" + "sri v6.4s, v5.4s, #8\n" + "mul v10.4s, v10.4s, v0.4s\n" + "str q4, [%[out_q]], #16\n" + "mul v9.4s, v9.4s, v0.4s\n" + "sri v8.4s, v6.4s, #16\n" + + "sri v12.4s, v11.4s, #8\n" + "sri v10.4s, v9.4s, #8\n" + "str q8, [%[out_q]], #16\n" + "sri v12.4s, v10.4s, #16\n" + + "str q12, [%[out_q]], #16\n" + : [out_d] "+r"(arr_d), [out_q] "+r"(arr_q) + : [factor] "r"(&fact) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "x1"); + break; + } + case 8: { + __asm__ __volatile__( + "ldr s0, [%[factor]]\n" + "dup v0.4s, v0.s[0]\n" + + "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" + "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n" + + "mul v4.4s, v4.4s, v0.4s\n" + "mul v3.4s, v3.4s, v0.4s\n" + "mul v2.4s, v2.4s, v0.4s\n" + "mul v1.4s, v1.4s, v0.4s\n" + + "mul v8.4s, v8.4s, v0.4s\n" + "sri v4.4s, v3.4s, #8\n" + "mul v7.4s, v7.4s, v0.4s\n" + "sri v2.4s, v1.4s, #8\n" + "mul v6.4s, v6.4s, v0.4s\n" + "mul v5.4s, v5.4s, v0.4s\n" + "sri v4.4s, v2.4s, #16\n" + + "sri v8.4s, v7.4s, #8\n" + "sri v6.4s, v5.4s, #8\n" + "str q4, [%[out_q]], #16\n" + "sri v8.4s, v6.4s, #16\n" + + "str q8, [%[out_q]], #16\n" + : [out_d] "+r"(arr_d), [out_q] "+r"(arr_q) + : [factor] "r"(&fact) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "x1"); + break; + } + case 4: { + __asm__ __volatile__("ldr s0, [%[factor]]\n" + "dup v0.4s, v0.s[0]\n" + + "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" + + "mul v4.4s, v4.4s, v0.4s\n" + "mul v3.4s, v3.4s, v0.4s\n" + "mul 
v2.4s, v2.4s, v0.4s\n" + "mul v1.4s, v1.4s, v0.4s\n" + + "sri v4.4s, v3.4s, #8\n" + "sri v2.4s, v1.4s, #8\n" + "sri v4.4s, v2.4s, #16\n" + + "str q4, [%[out_q]], #16\n" + : [out_d] "+r"(arr_d), [out_q] "+r"(arr_q) + : [factor] "r"(&fact) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "x1"); + break; + } + case 0: { + break; + } + default: { + return UNKNOWN; + } + } + return SUCCESS; +} +#endif +#endif diff --git a/compute/tensor/src/cpu/arm/int8/convolution_gemm_A55.cpp b/compute/tensor/src/cpu/arm/int8/convolution_gemm_A55.cpp new file mode 100644 index 00000000..6023715e --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/convolution_gemm_A55.cpp @@ -0,0 +1,1619 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_INT8 +#include +#include "cpu/arm/int8/convolution_gemm.h" + +template +EE convolution_gemm_A55(TensorDesc inputDesc, + const void *input, + F16 *inputScale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + // still im2col + gemm with a smaller buffer + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (fdf != DF_NCHWN8C4) { + return NOT_MATCH; + } + + I64 conv_relu_bool = (activationDesc.mode == ACTIVATION_RELU) ? 1 : 0; + I64 out_f16_bool = (odt == DT_F16) ? 
1 : 0; + I64 scale_known_bool = 0; + if (*outputScale > 0 || ACTIVATION_RELU6 == activationDesc.mode) { + scale_known_bool = 1; + } + + INT8 *inArray = (INT8 *)input; // It will be updated if there is quantization + INT8 *filterArray = (INT8 *)filter; + F16 *outArray = (F16 *)output; + F16 *biasArray = (F16 *)bias; + INT8 *in_pad = (INT8 *)tmp; + + // both input and output are stored with C8 + oc /= 8; + ic /= 8; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + I32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + + I32 *biasScaled = (I32 *)(in_pad + ic * ihiw * 8 + 12 * fh * fw * ic * 8); // Initialize + + // double start, end; + I32 max_i32[4] = {0}; // To record max I32 values + I32 min_i32[4] = {0}; // To record min I32 values + + for (U32 n = 0; n < in; n++) { // for each batch + F16 scale_i = 1.0; + + // quantize input if necessary + if (idt == DT_F16) { + // start = get_current_time_int8(); + F16 *in = ((F16 *)input) + n * ic * ih * iw * 8; + inArray = in_pad + ic * ihiw * 8 + + 12 * fh * fw * ic * 8; // After the space for padding and packing + + U32 numData = ic * ih * iw * 8; + if (*inputScale > 0) { + scale_i = *inputScale; + } else { + float16x8_t temp_v = vld1q_f16(in); + float16x8_t max_v = temp_v; + float16x8_t min_v = temp_v; + + for (U32 i = 8; i < numData; i += 8) { + temp_v = vld1q_f16(in + i); + max_v = vmaxq_f16(max_v, temp_v); + min_v = vminq_f16(min_v, temp_v); + } + + F16 max = vmaxvq_f16(max_v); + F16 min = vminvq_f16(min_v); + + if (max == 0 && min == 0) { + return NOT_SUPPORTED; + } + if (max > 0 && min < 0) { + F16 scale_max = 127.0 / max; + F16 scale_min = -127.0 / min; + scale_i = (scale_max < scale_min) ? scale_max : scale_min; + } else if (max < 0) { + scale_i = -127.0 / min; + } else { // min > 0 + scale_i = 127.0 / max; + } + } + for (U32 i = 0; i < numData; i++) { + F32 temp = in[i] * scale_i; + inArray[i] = round_towards_zero(temp, (*inputScale) != scale_i); + } + *inputScale = scale_i; + } else { + scale_i = *inputScale; + } + + if (1 == scale_known_bool) { + if (ACTIVATION_RELU6 == activationDesc.mode) { + *outputScale = 127.0 / 6.0; + } + F32 scaleInt = (*outputScale / *inputScale) / *filterScale; + I32 thresholdP = 127.0 / scaleInt; + I32 thresholdN = 0; + if (ACTIVATION_RELU6 != activationDesc.mode) { + thresholdN = thresholdP * -1; + } + + for (U32 i = 0; i < 4; i++) { + max_i32[i] = thresholdP; + min_i32[i] = thresholdN; + } + } + + if (odt == DT_I8) { // Scale the bias + if (idt == DT_F16) { + biasScaled += ic * ih * iw * 8 / bytesOf(DT_I32); // After the quantized input + } + F32 scale = (*inputScale) * (*filterScale); + for (U32 i = 0; i < oc * 8; i++) { + biasScaled[i] = round(scale * biasArray[i]); + } + } + + F32 factor_s = 1.0 / ((F32)scale_i) / ((F32)(*filterScale)); + F32 factor_v[4]; + for (U32 i = 0; i < 4; i++) { + factor_v[i] = factor_s; + } + + INT8 *inArray_pad; + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw * 8; // use this batch directly + } else { + // copy input into an input with padding + inArray_pad = (INT8 *)tmp; + INT8 *inArray_pad_mov = inArray_pad; + INT8 *inArray_mov = inArray + n * ic * ih * iw * 8; + for (U32 c = 0; c < ic; c++) { // for each 8 channels + for (U32 h = 0; h < paddingT; h++) { // Upper rows of 0 + memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(DT_I8)); // 8 comes from C8 + inArray_pad_mov += iw_pad * 8; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { // for each middle-section rows + 
memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(DT_I8)); // padding on the left + inArray_pad_mov += paddingL * 8; // 8 comes from C8 + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(DT_I8)); // Copy input row + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset( + inArray_pad_mov, 0, paddingR * 8 * bytesOf(DT_I8)); // padding on the right + inArray_pad_mov += paddingR * 8; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { // Bottom rows of 0 + memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(DT_I8)); + inArray_pad_mov += iw_pad * 8; + } + } + } + // ohow / 12 (12x8) + for (I32 hw = 0; hw < ohow - 11; hw += 12) { // Remainder will be handled later + F16 *b0 = biasArray; + I32 *b0_s = biasScaled; + INT8 *in_pack = ((INT8 *)tmp) + ic * ih_pad * iw_pad * 8; // After the padded input + // pack input + // NCHWc8 => NHWChw12c4 + im2col + U32 in_h[12]; + U32 in_w[12]; + + for (U32 i = 0; i < 12; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { // for each 8 channels + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + INT8 *in_hw12c8 = inArray_pad + c * ihiw * 8 + + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + + INT8 *in_0 = in_hw12c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + INT8 *in_1 = in_hw12c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + INT8 *in_2 = in_hw12c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + INT8 *in_3 = in_hw12c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + INT8 *in_4 = in_hw12c8 + in_h[4] * iw_pad * 8 + in_w[4] * 8; + INT8 *in_5 = in_hw12c8 + in_h[5] * iw_pad * 8 + in_w[5] * 8; + INT8 *in_6 = in_hw12c8 + in_h[6] * iw_pad * 8 + in_w[6] * 8; + INT8 *in_7 = in_hw12c8 + in_h[7] * iw_pad * 8 + in_w[7] * 8; + INT8 *in_8 = in_hw12c8 + in_h[8] * iw_pad * 8 + in_w[8] * 8; + INT8 *in_9 = in_hw12c8 + in_h[9] * iw_pad * 8 + in_w[9] * 8; + INT8 *in_10 = in_hw12c8 + in_h[10] * iw_pad * 8 + in_w[10] * 8; + INT8 *in_11 = in_hw12c8 + in_h[11] * iw_pad * 8 + in_w[11] * 8; + + // in_pack (tmp) is reused for each tile + // NHWChw12c4 + INT8 *in_pack_0 = + in_pack + c * fh * fw * 12 * 8 + fh_idx * fw * 12 * 4 + fw_idx * 12 * 4; + INT8 *in_pack_1 = in_pack_0 + fh * fw * 12 * 4; + + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + + "ldr d5, [%[in_5]]\n" + "ins v0.d[1], x2\n" + + "ldr x7, [%[in_7]]\n" + "ins v1.d[1], x3\n" + + "ldr d8, [%[in_8]]\n" + "ins v4.d[1], x6\n" + + "trn1 v20.4s, v0.4s, v1.4s\n" + "ins v5.d[1], x7\n" + + "trn2 v21.4s, v0.4s, v1.4s\n" + "ldr x10, [%[in_10]]\n" + + "ldr d9, [%[in_9]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + + "trn2 v25.4s, v4.4s, v5.4s\n" + "ldr x11, [%[in_11]]\n" + + "str q20, [%[pack_0]]\n" + "ins v8.d[1], x10\n" + + "str q24, [%[pack_0], #16]\n" + "ins v9.d[1], x11\n" + + "trn1 v28.4s, v8.4s, v9.4s\n" + "str q21, [%[pack_1]]\n" + + "trn2 v29.4s, v8.4s, v9.4s\n" + "str q25, [%[pack_1], #16]\n" + + "str q28, [%[pack_0], #32]\n" + "str q29, [%[pack_1], #32]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), + [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7), [in_8] "r"(in_8), + [in_9] "r"(in_9), [in_10] "r"(in_10), [in_11] "r"(in_11) + : "memory", "cc", "v0", "v1", "v4", "v5", "v8", "v9", "v20", "v21", + "v24", "v25", "v28", "v29", "x2", "x3", "x6", "x7", "x10", "x11"); + } + } + } + + // compute + for 
(U32 o = 0; o < oc; o++) { // 8 output channels at a time + INT8 *in_hw0 = in_pack; + INT8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + I32 *out_buf = biasScaled + oc * 8 + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + F16 *b_0 = b0; + I32 *b_0_s = b0_s; + __asm__ __volatile__( + "cbz %[out_f16], 8f\n" + "eor v5.16b, v5.16b, v5.16b\n" + "ldr d1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "ldr x1, [%[in_0], #8]\n" + "eor v7.16b, v7.16b, v7.16b\n" + "ins v1.d[1], x1\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" + "ldr x2, [%[f_0], #8]\n" + "eor v10.16b, v10.16b, v10.16b\n" + "ins v0.d[1], x2\n" + "eor v11.16b, v11.16b, v11.16b\n" + "ldr d3, [%[in_0], #16]\n" // in_1 + "eor v12.16b, v12.16b, v12.16b\n" + "ldr x3, [%[in_0], #24]\n" + "eor v13.16b, v13.16b, v13.16b\n" + "ins v3.d[1], x3\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + "b 7f\n" + + "8:\n" + "ldp q29, q30, [%[b_0_s]]\n" + "mov v5.16b, v29.16b\n" + "ldr d1, [%[in_0]]\n" // in_0 + "mov v7.16b, v29.16b\n" + "ldr x1, [%[in_0], #8]\n" + "mov v9.16b, v29.16b\n" + "ins v1.d[1], x1\n" + "mov v11.16b, v29.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "mov v13.16b, v29.16b\n" + "ldr x2, [%[f_0], #8]\n" + "mov v15.16b, v29.16b\n" + "ins v0.d[1], x2\n" + "mov v17.16b, v29.16b\n" + "ldr d3, [%[in_0], #16]\n" // in_1 + "mov v19.16b, v29.16b\n" + "ldr x3, [%[in_0], #24]\n" + "mov v21.16b, v29.16b\n" + "ins v3.d[1], x3\n" + "mov v23.16b, v29.16b\n" + "mov v25.16b, v29.16b\n" + "mov v27.16b, v29.16b\n" + + "mov v6.16b, v30.16b\n" + "mov v8.16b, v30.16b\n" + "mov v10.16b, v30.16b\n" + "mov v12.16b, v30.16b\n" + "mov v14.16b, v30.16b\n" + "mov v16.16b, v30.16b\n" + "mov v18.16b, v30.16b\n" + "mov v20.16b, v30.16b\n" + "mov v22.16b, v30.16b\n" + "mov v24.16b, v30.16b\n" + "mov v26.16b, v30.16b\n" + "mov v28.16b, v30.16b\n" + + "7:\n" + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d2, [x3, 32]\n" + "ldr x16, [x3, 40]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ldr d29, [x0, 16]\n" + "ldr x17, [x0, 24]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "ins v2.d[1], x16\n" + "ldr d30, [x3, 48]!\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + "ins v29.d[1], x17\n" + + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "ldr x16, [x3, 8]\n" + "subs x2, x2, #4\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "ins v30.d[1], x16\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + + "sdot v21.4s, v0.16b, v2.4b[0]\n" + "sdot v23.4s, v0.16b, v2.4b[1]\n" + "sdot v25.4s, v0.16b, v2.4b[2]\n" + "sdot v27.4s, v0.16b, v2.4b[3]\n" + + "sdot v14.4s, v29.16b, v3.4b[0]\n" + "sdot v16.4s, v29.16b, v3.4b[1]\n" + "ldr d0, [x0, 32]!\n" + "ldr x17, [x0, 8]\n" + "sdot v18.4s, v29.16b, v3.4b[2]\n" + "sdot v20.4s, v29.16b, v3.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "ldr d3, [x3, 16]\n" + 
"ldr x16, [x3, 24]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + + "ins v0.d[1], x17\n" + "ins v3.d[1], x16\n" + + "sdot v22.4s, v29.16b, v2.4b[0]\n" + "mov v1.16b, v30.16b\n" + "sdot v24.4s, v29.16b, v2.4b[1]\n" + "sdot v26.4s, v29.16b, v2.4b[2]\n" + "sdot v28.4s, v29.16b, v2.4b[3]\n" + + "bne 0b\n" + "cbz %[out_f16], 6f\n" + + "scvtf v5.4s, v5.4s\n" + "scvtf v6.4s, v6.4s\n" + "ldr d1, [%[factor]]\n" + "ldr x1, [%[factor], #8]\n" + "scvtf v7.4s, v7.4s\n" + "scvtf v8.4s, v8.4s\n" + "ins v1.d[1], x1\n" + "scvtf v9.4s, v9.4s\n" + "ldr d0, [%[b_0]]\n" + "ldr x0, [%[b_0], #8]\n" + "scvtf v10.4s, v10.4s\n" + "scvtf v11.4s, v11.4s\n" + "ins v0.d[1], x0\n" + "scvtf v12.4s, v12.4s\n" + "scvtf v13.4s, v13.4s\n" + "scvtf v14.4s, v14.4s\n" + "scvtf v15.4s, v15.4s\n" + "scvtf v16.4s, v16.4s\n" + "scvtf v17.4s, v17.4s\n" + "scvtf v18.4s, v18.4s\n" + "scvtf v19.4s, v19.4s\n" + "scvtf v20.4s, v20.4s\n" + "scvtf v21.4s, v21.4s\n" + "scvtf v22.4s, v22.4s\n" + "scvtf v23.4s, v23.4s\n" + "scvtf v24.4s, v24.4s\n" + "scvtf v25.4s, v25.4s\n" + "scvtf v26.4s, v26.4s\n" + "scvtf v27.4s, v27.4s\n" + "scvtf v28.4s, v28.4s\n" + + "fmul v5.4s, v1.4s, v5.4s\n" + "fmul v6.4s, v1.4s, v6.4s\n" + "fmul v7.4s, v1.4s, v7.4s\n" + "fmul v8.4s, v1.4s, v8.4s\n" + "fmul v9.4s, v1.4s, v9.4s\n" + "fmul v10.4s, v1.4s, v10.4s\n" + "fmul v11.4s, v1.4s, v11.4s\n" + "fmul v12.4s, v1.4s, v12.4s\n" + "fmul v13.4s, v1.4s, v13.4s\n" + "fmul v14.4s, v1.4s, v14.4s\n" + "fmul v15.4s, v1.4s, v15.4s\n" + "fmul v16.4s, v1.4s, v16.4s\n" + "fmul v17.4s, v1.4s, v17.4s\n" + "fmul v18.4s, v1.4s, v18.4s\n" + "fmul v19.4s, v1.4s, v19.4s\n" + "fmul v20.4s, v1.4s, v20.4s\n" + "fmul v21.4s, v1.4s, v21.4s\n" + "fmul v22.4s, v1.4s, v22.4s\n" + "fmul v23.4s, v1.4s, v23.4s\n" + "fmul v24.4s, v1.4s, v24.4s\n" + "fmul v25.4s, v1.4s, v25.4s\n" + "fmul v26.4s, v1.4s, v26.4s\n" + "fmul v27.4s, v1.4s, v27.4s\n" + "fmul v28.4s, v1.4s, v28.4s\n" + + "fcvtn v5.4h, v5.4s\n" + "fcvtn v7.4h, v7.4s\n" + "fcvtn v9.4h, v9.4s\n" + "fcvtn v11.4h, v11.4s\n" + "fcvtn v13.4h, v13.4s\n" + "fcvtn v15.4h, v15.4s\n" + "fcvtn v17.4h, v17.4s\n" + "fcvtn v19.4h, v19.4s\n" + "fcvtn v21.4h, v21.4s\n" + "fcvtn v23.4h, v23.4s\n" + "fcvtn v25.4h, v25.4s\n" + "fcvtn v27.4h, v27.4s\n" + + "fcvtn2 v5.8h, v6.4s\n" + "fcvtn2 v7.8h, v8.4s\n" + "fcvtn2 v9.8h, v10.4s\n" + "fcvtn2 v11.8h, v12.4s\n" + "fcvtn2 v13.8h, v14.4s\n" + "fcvtn2 v15.8h, v16.4s\n" + "fcvtn2 v17.8h, v18.4s\n" + "fcvtn2 v19.8h, v20.4s\n" + "fcvtn2 v21.8h, v22.4s\n" + "fcvtn2 v23.8h, v24.4s\n" + "fcvtn2 v25.8h, v26.4s\n" + "fcvtn2 v27.8h, v28.4s\n" + + "fadd v5.8h, v0.8h, v5.8h\n" + "fadd v7.8h, v0.8h, v7.8h\n" + "fadd v9.8h, v0.8h, v9.8h\n" + "fadd v11.8h, v0.8h, v11.8h\n" + "fadd v13.8h, v0.8h, v13.8h\n" + "fadd v15.8h, v0.8h, v15.8h\n" + "fadd v17.8h, v0.8h, v17.8h\n" + "fadd v19.8h, v0.8h, v19.8h\n" + "fadd v21.8h, v0.8h, v21.8h\n" + "fadd v23.8h, v0.8h, v23.8h\n" + "fadd v25.8h, v0.8h, v25.8h\n" + "fadd v27.8h, v0.8h, v27.8h\n" + + "cbz %[conv_relu], 1f\n" + "eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + "fmax v19.8h, v19.8h, v1.8h\n" + "fmax v21.8h, v21.8h, v1.8h\n" + "fmax v23.8h, v23.8h, v1.8h\n" + "fmax v25.8h, v25.8h, v1.8h\n" + "fmax v27.8h, v27.8h, v1.8h\n" + + "1:\n" + "str q5, [%[out_0]]\n" + "str q7, [%[out_0], #16]\n" + "str q9, [%[out_0], #32]\n" + "str q11, 
[%[out_0], #48]\n" + "str q13, [%[out_0], #64]\n" + "str q15, [%[out_0], #80]\n" + "str q17, [%[out_0], #96]\n" + "str q19, [%[out_0], #112]\n" + "str q21, [%[out_0], #128]\n" + "str q23, [%[out_0], #144]\n" + "str q25, [%[out_0], #160]\n" + "str q27, [%[out_0], #176]\n" + "b 5f\n" + + "6:\n" + "ldr q0, [%[min]]\n" + "ldr q30, [%[max]]\n" + "cbz %[conv_relu], 2f\n" + "eor v1.16b, v1.16b, v1.16b\n" // zero + "smax v5.4s, v5.4s, v1.4s\n" + "smax v6.4s, v6.4s, v1.4s\n" + "smax v7.4s, v7.4s, v1.4s\n" + "smax v8.4s, v8.4s, v1.4s\n" + "smax v9.4s, v9.4s, v1.4s\n" + "smax v10.4s, v10.4s, v1.4s\n" + "smax v11.4s, v11.4s, v1.4s\n" + "smax v12.4s, v12.4s, v1.4s\n" + "smax v13.4s, v13.4s, v1.4s\n" + "smax v14.4s, v14.4s, v1.4s\n" + "smax v15.4s, v15.4s, v1.4s\n" + "smax v16.4s, v16.4s, v1.4s\n" + "smax v17.4s, v17.4s, v1.4s\n" + "smax v18.4s, v18.4s, v1.4s\n" + "smax v19.4s, v19.4s, v1.4s\n" + "smax v20.4s, v20.4s, v1.4s\n" + "smax v21.4s, v21.4s, v1.4s\n" + "smax v22.4s, v22.4s, v1.4s\n" + "smax v23.4s, v23.4s, v1.4s\n" + "smax v24.4s, v24.4s, v1.4s\n" + "smax v25.4s, v25.4s, v1.4s\n" + "smax v26.4s, v26.4s, v1.4s\n" + "smax v27.4s, v27.4s, v1.4s\n" + "smax v28.4s, v28.4s, v1.4s\n" + + "2:\n" + "cbz %[scale_known], 7f\n" + "smax v5.4s, v5.4s, v0.4s\n" + "smin v5.4s, v5.4s, v30.4s\n" + "smax v6.4s, v6.4s, v0.4s\n" + "smin v6.4s, v6.4s, v30.4s\n" + "smax v7.4s, v7.4s, v0.4s\n" + "smin v7.4s, v7.4s, v30.4s\n" + "smax v8.4s, v8.4s, v0.4s\n" + "smin v8.4s, v8.4s, v30.4s\n" + "smax v9.4s, v9.4s, v0.4s\n" + "smin v9.4s, v9.4s, v30.4s\n" + "smax v10.4s, v10.4s, v0.4s\n" + "smin v10.4s, v10.4s, v30.4s\n" + "smax v11.4s, v11.4s, v0.4s\n" + "smin v11.4s, v11.4s, v30.4s\n" + "smax v12.4s, v12.4s, v0.4s\n" + "smin v12.4s, v12.4s, v30.4s\n" + "smax v13.4s, v13.4s, v0.4s\n" + "smin v13.4s, v13.4s, v30.4s\n" + "smax v14.4s, v14.4s, v0.4s\n" + "smin v14.4s, v14.4s, v30.4s\n" + "smax v15.4s, v15.4s, v0.4s\n" + "smin v15.4s, v15.4s, v30.4s\n" + "smax v16.4s, v16.4s, v0.4s\n" + "smin v16.4s, v16.4s, v30.4s\n" + "smax v17.4s, v17.4s, v0.4s\n" + "smin v17.4s, v17.4s, v30.4s\n" + "smax v18.4s, v18.4s, v0.4s\n" + "smin v18.4s, v18.4s, v30.4s\n" + "smax v19.4s, v19.4s, v0.4s\n" + "smin v19.4s, v19.4s, v30.4s\n" + "smax v20.4s, v20.4s, v0.4s\n" + "smin v20.4s, v20.4s, v30.4s\n" + "smax v21.4s, v21.4s, v0.4s\n" + "smin v21.4s, v21.4s, v30.4s\n" + "smax v22.4s, v22.4s, v0.4s\n" + "smin v22.4s, v22.4s, v30.4s\n" + "smax v23.4s, v23.4s, v0.4s\n" + "smin v23.4s, v23.4s, v30.4s\n" + "smax v24.4s, v24.4s, v0.4s\n" + "smin v24.4s, v24.4s, v30.4s\n" + "smax v25.4s, v25.4s, v0.4s\n" + "smin v25.4s, v25.4s, v30.4s\n" + "smax v26.4s, v26.4s, v0.4s\n" + "smin v26.4s, v26.4s, v30.4s\n" + "smax v27.4s, v27.4s, v0.4s\n" + "smin v27.4s, v27.4s, v30.4s\n" + "smax v28.4s, v28.4s, v0.4s\n" + "smin v28.4s, v28.4s, v30.4s\n" + + "str q5, [%[out_buf]]\n" + "str q6, [%[out_buf], 16]\n" + "str q7, [%[out_buf], 32]\n" + "str q8, [%[out_buf], 48]\n" + "str q9, [%[out_buf], 64]\n" + "str q10, [%[out_buf], 80]\n" + "str q11, [%[out_buf], 96]\n" + "str q12, [%[out_buf], 112]\n" + "str q13, [%[out_buf], 128]\n" + "str q14, [%[out_buf], 144]\n" + "str q15, [%[out_buf], 160]\n" + "str q16, [%[out_buf], 176]\n" + "str q17, [%[out_buf], 192]\n" + "str q18, [%[out_buf], 208]\n" + "str q19, [%[out_buf], 224]\n" + "str q20, [%[out_buf], 240]\n" + "str q21, [%[out_buf], 256]\n" + "str q22, [%[out_buf], 272]\n" + "str q23, [%[out_buf], 288]\n" + "str q24, [%[out_buf], 304]\n" + "str q25, [%[out_buf], 320]\n" + "str q26, [%[out_buf], 336]\n" + "str q27, 
[%[out_buf], 352]\n" + "str q28, [%[out_buf], 368]\n" + "b 5f\n" + + "7:\n" + "smax v30.4s, v5.4s, v30.4s\n" + "smin v0.4s, v5.4s, v0.4s\n" + "str q5, [%[out_buf]]\n" + "smax v30.4s, v6.4s, v30.4s\n" + "smin v0.4s, v6.4s, v0.4s\n" + "str q6, [%[out_buf], 16]\n" + "smax v30.4s, v7.4s, v30.4s\n" + "smin v0.4s, v7.4s, v0.4s\n" + "str q7, [%[out_buf], 32]\n" + "smax v30.4s, v8.4s, v30.4s\n" + "smin v0.4s, v8.4s, v0.4s\n" + "str q8, [%[out_buf], 48]\n" + "smax v30.4s, v9.4s, v30.4s\n" + "smin v0.4s, v9.4s, v0.4s\n" + "str q9, [%[out_buf], 64]\n" + "smax v30.4s, v10.4s, v30.4s\n" + "smin v0.4s, v10.4s, v0.4s\n" + "str q10, [%[out_buf], 80]\n" + "smax v30.4s, v11.4s, v30.4s\n" + "smin v0.4s, v11.4s, v0.4s\n" + "str q11, [%[out_buf], 96]\n" + "smax v30.4s, v12.4s, v30.4s\n" + "smin v0.4s, v12.4s, v0.4s\n" + "str q12, [%[out_buf], 112]\n" + "smax v30.4s, v13.4s, v30.4s\n" + "smin v0.4s, v13.4s, v0.4s\n" + "str q13, [%[out_buf], 128]\n" + + "smax v30.4s, v14.4s, v30.4s\n" + "smin v0.4s, v14.4s, v0.4s\n" + "str q14, [%[out_buf], 144]\n" + "smax v30.4s, v15.4s, v30.4s\n" + "smin v0.4s, v15.4s, v0.4s\n" + "str q15, [%[out_buf], 160]\n" + "smax v30.4s, v16.4s, v30.4s\n" + "smin v0.4s, v16.4s, v0.4s\n" + "str q16, [%[out_buf], 176]\n" + "smax v30.4s, v17.4s, v30.4s\n" + "smin v0.4s, v17.4s, v0.4s\n" + "str q17, [%[out_buf], 192]\n" + "smax v30.4s, v18.4s, v30.4s\n" + "smin v0.4s, v18.4s, v0.4s\n" + "str q18, [%[out_buf], 208]\n" + "smax v30.4s, v19.4s, v30.4s\n" + "smin v0.4s, v19.4s, v0.4s\n" + "str q19, [%[out_buf], 224]\n" + "smax v30.4s, v20.4s, v30.4s\n" + "smin v0.4s, v20.4s, v0.4s\n" + "str q20, [%[out_buf], 240]\n" + "smax v30.4s, v21.4s, v30.4s\n" + "smin v0.4s, v21.4s, v0.4s\n" + "str q21, [%[out_buf], 256]\n" + "smax v30.4s, v22.4s, v30.4s\n" + "smin v0.4s, v22.4s, v0.4s\n" + "str q22, [%[out_buf], 272]\n" + "smax v30.4s, v23.4s, v30.4s\n" + "smin v0.4s, v23.4s, v0.4s\n" + "str q23, [%[out_buf], 288]\n" + "smax v30.4s, v24.4s, v30.4s\n" + "smin v0.4s, v24.4s, v0.4s\n" + "str q24, [%[out_buf], 304]\n" + "smax v30.4s, v25.4s, v30.4s\n" + "smin v0.4s, v25.4s, v0.4s\n" + "str q25, [%[out_buf], 320]\n" + "smax v30.4s, v26.4s, v30.4s\n" + "smin v0.4s, v26.4s, v0.4s\n" + "str q26, [%[out_buf], 336]\n" + "smax v30.4s, v27.4s, v30.4s\n" + "smin v0.4s, v27.4s, v0.4s\n" + "str q27, [%[out_buf], 352]\n" + "smax v30.4s, v28.4s, v30.4s\n" + "smin v0.4s, v28.4s, v0.4s\n" + "str q28, [%[out_buf], 368]\n" + + "str q30, [%[max]]\n" + "str q0, [%[min]]\n" + + "5:\n" + : + : [out_0] "r"(out_o0hw0), [out_buf] "r"(out_buf), [in_0] "r"(in_hw0), + [f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_0), + [b_0_s] "r"(b_0_s), [factor] "r"(factor_v), [max] "r"(max_i32), + [min] "r"(min_i32), [conv_relu] "r"(conv_relu_bool), + [out_f16] "r"(out_f16_bool), [scale_known] "r"(scale_known_bool) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", "x1", "x2", + "x3", "x17", "x16"); + b0 += 8; + b0_s += 8; + } + } + + // ohow_reminder % 12 / 8 + I32 ohow_s = (ohow / 12) * 12; + I32 ohow_tail = ohow - ohow_s; + + if (ohow_tail >= 8) { + I32 hw = ohow_s; + F16 *b0 = biasArray; + I32 *b0_s = biasScaled; + INT8 *in_pack = ((INT8 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw8c4 + im2col + U32 in_h[8]; + U32 in_w[8]; + + for (U32 i = 0; i < 8; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * 
strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + INT8 *in_hw8c8 = inArray_pad + c * ihiw * 8 + + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_hw8c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + INT8 *in_1 = in_hw8c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + INT8 *in_2 = in_hw8c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + INT8 *in_3 = in_hw8c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + INT8 *in_4 = in_hw8c8 + in_h[4] * iw_pad * 8 + in_w[4] * 8; + INT8 *in_5 = in_hw8c8 + in_h[5] * iw_pad * 8 + in_w[5] * 8; + INT8 *in_6 = in_hw8c8 + in_h[6] * iw_pad * 8 + in_w[6] * 8; + INT8 *in_7 = in_hw8c8 + in_h[7] * iw_pad * 8 + in_w[7] * 8; + INT8 *in_pack_0 = + in_pack + c * fh * fw * 8 * 8 + fh_idx * fw * 8 * 4 + fw_idx * 8 * 4; + INT8 *in_pack_1 = in_pack_0 + fh * fw * 8 * 4; + + __asm__ __volatile__("ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins v4.d[1], x6\n" + "ins v5.d[1], x7\n" + + "str q20, [%[pack_0]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "str q21, [%[pack_1]]\n" + "str q24, [%[pack_0], #16]\n" + "str q25, [%[pack_1], #16]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), + [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), + [in_3] "r"(in_3), [in_4] "r"(in_4), [in_5] "r"(in_5), + [in_6] "r"(in_6), [in_7] "r"(in_7) + : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", + "v24", "v25", "x2", "x3", "x6", "x7"); + } + } + } + + // compute + for (U32 o = 0; o < oc; o++) { + INT8 *in_hw0 = in_pack; + INT8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + I32 *out_buf = biasScaled + oc * 8 + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + F16 *b_0 = b0; + I32 *b_0_s = b0_s; + __asm__ __volatile__( + "cbz %[out_f16], 8f\n" + "eor v5.16b, v5.16b, v5.16b\n" + "ldr d1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "ldr x1, [%[in_0], #8]\n" + "eor v7.16b, v7.16b, v7.16b\n" + "ins v1.d[1], x1\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" + "ldr x2, [%[f_0], #8]\n" + "eor v10.16b, v10.16b, v10.16b\n" + "ins v0.d[1], x2\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + "eor v13.16b, v13.16b, v13.16b\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + "b 7f\n" + + "8:\n" + "ldp q29, q30, [%[b_0_s]]\n" + "mov v5.16b, v29.16b\n" + "ldr d1, [%[in_0]]\n" // in_0 + "mov v7.16b, v29.16b\n" + "ldr x1, [%[in_0], #8]\n" + "mov v9.16b, v29.16b\n" + "ins v1.d[1], x1\n" + "mov v11.16b, v29.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "mov v13.16b, v29.16b\n" + "ldr x2, [%[f_0], #8]\n" + "mov v15.16b, v29.16b\n" + "ins v0.d[1], x2\n" + "mov v17.16b, v29.16b\n" + "mov v19.16b, v29.16b\n" + + "mov v6.16b, v30.16b\n" + "mov v8.16b, v30.16b\n" + "mov v10.16b, v30.16b\n" + "mov v12.16b, v30.16b\n" + "mov v14.16b, v30.16b\n" + "mov v16.16b, v30.16b\n" + "mov v18.16b, v30.16b\n" + "mov v20.16b, v30.16b\n" + + "7:\n" + + // give in address to x3 + "mov x3, 
%[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d3, [x3, 16]!\n" + "ldr x16, [x3, 8]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ldr d29, [x0, 16]\n" + "ldr x17, [x0, 24]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "ins v3.d[1], x16\n" + "ldr d30, [x3, 16]!\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + "ins v29.d[1], x17\n" + + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "ldr x16, [x3, 8]\n" + "subs x2, x2, #4\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "ins v30.d[1], x16\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "ldr d0, [x0, 32]!\n" + "ldr x17, [x0, 8]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + + "sdot v14.4s, v29.16b, v3.4b[0]\n" + "ins v0.d[1], x17\n" + "mov v1.16b, v30.16b\n" + "sdot v16.4s, v29.16b, v3.4b[1]\n" + "sdot v18.4s, v29.16b, v3.4b[2]\n" + "sdot v20.4s, v29.16b, v3.4b[3]\n" + + "bne 0b\n" + "cbz %[out_f16], 6f\n" + "scvtf v5.4s, v5.4s\n" + "scvtf v6.4s, v6.4s\n" + "ldr d1, [%[factor]]\n" + "ldr x1, [%[factor], #8]\n" + "scvtf v7.4s, v7.4s\n" + "scvtf v8.4s, v8.4s\n" + "ins v1.d[1], x1\n" + "scvtf v9.4s, v9.4s\n" + "ldr d0, [%[b_0]]\n" + "ldr x0, [%[b_0], #8]\n" + "scvtf v10.4s, v10.4s\n" + "scvtf v11.4s, v11.4s\n" + "ins v0.d[1], x0\n" + "scvtf v12.4s, v12.4s\n" + "scvtf v13.4s, v13.4s\n" + "scvtf v14.4s, v14.4s\n" + "scvtf v15.4s, v15.4s\n" + "scvtf v16.4s, v16.4s\n" + "scvtf v17.4s, v17.4s\n" + "scvtf v18.4s, v18.4s\n" + "scvtf v19.4s, v19.4s\n" + "scvtf v20.4s, v20.4s\n" + + "fmul v5.4s, v1.4s, v5.4s\n" + "fmul v6.4s, v1.4s, v6.4s\n" + "fmul v7.4s, v1.4s, v7.4s\n" + "fmul v8.4s, v1.4s, v8.4s\n" + "fmul v9.4s, v1.4s, v9.4s\n" + "fmul v10.4s, v1.4s, v10.4s\n" + "fmul v11.4s, v1.4s, v11.4s\n" + "fmul v12.4s, v1.4s, v12.4s\n" + "fmul v13.4s, v1.4s, v13.4s\n" + "fmul v14.4s, v1.4s, v14.4s\n" + "fmul v15.4s, v1.4s, v15.4s\n" + "fmul v16.4s, v1.4s, v16.4s\n" + "fmul v17.4s, v1.4s, v17.4s\n" + "fmul v18.4s, v1.4s, v18.4s\n" + "fmul v19.4s, v1.4s, v19.4s\n" + "fmul v20.4s, v1.4s, v20.4s\n" + + "fcvtn v5.4h, v5.4s\n" + "fcvtn v7.4h, v7.4s\n" + "fcvtn v9.4h, v9.4s\n" + "fcvtn v11.4h, v11.4s\n" + "fcvtn v13.4h, v13.4s\n" + "fcvtn v15.4h, v15.4s\n" + "fcvtn v17.4h, v17.4s\n" + "fcvtn v19.4h, v19.4s\n" + + "fcvtn2 v5.8h, v6.4s\n" + "fcvtn2 v7.8h, v8.4s\n" + "fcvtn2 v9.8h, v10.4s\n" + "fcvtn2 v11.8h, v12.4s\n" + "fcvtn2 v13.8h, v14.4s\n" + "fcvtn2 v15.8h, v16.4s\n" + "fcvtn2 v17.8h, v18.4s\n" + "fcvtn2 v19.8h, v20.4s\n" + + "fadd v5.8h, v0.8h, v5.8h\n" + "fadd v7.8h, v0.8h, v7.8h\n" + "fadd v9.8h, v0.8h, v9.8h\n" + "fadd v11.8h, v0.8h, v11.8h\n" + "fadd v13.8h, v0.8h, v13.8h\n" + "fadd v15.8h, v0.8h, v15.8h\n" + "fadd v17.8h, v0.8h, v17.8h\n" + "fadd v19.8h, v0.8h, v19.8h\n" + + "cbz %[conv_relu], 1f\n" + "eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + "fmax v19.8h, v19.8h, v1.8h\n" + + "1:\n" + "str q5, [%[out_0]]\n" + "str q7, [%[out_0], #16]\n" + "str q9, [%[out_0], #32]\n" + "str q11, [%[out_0], #48]\n" + "str q13, [%[out_0], #64]\n" + "str q15, [%[out_0], #80]\n" + "str q17, [%[out_0], #96]\n" + "str q19, [%[out_0], #112]\n" + "b 5f\n" + + "6:\n" + "ldr q0, [%[min]]\n" + "ldr q30, [%[max]]\n" + "cbz %[conv_relu], 2f\n" + "eor v1.16b, v1.16b, v1.16b\n" // 
zero + "smax v5.4s, v5.4s, v1.4s\n" + "smax v6.4s, v6.4s, v1.4s\n" + "smax v7.4s, v7.4s, v1.4s\n" + "smax v8.4s, v8.4s, v1.4s\n" + "smax v9.4s, v9.4s, v1.4s\n" + "smax v10.4s, v10.4s, v1.4s\n" + "smax v11.4s, v11.4s, v1.4s\n" + "smax v12.4s, v12.4s, v1.4s\n" + "smax v13.4s, v13.4s, v1.4s\n" + "smax v14.4s, v14.4s, v1.4s\n" + "smax v15.4s, v15.4s, v1.4s\n" + "smax v16.4s, v16.4s, v1.4s\n" + "smax v17.4s, v17.4s, v1.4s\n" + "smax v18.4s, v18.4s, v1.4s\n" + "smax v19.4s, v19.4s, v1.4s\n" + "smax v20.4s, v20.4s, v1.4s\n" + + "2:\n" + "cbz %[scale_known], 7f\n" + "smax v5.4s, v5.4s, v0.4s\n" + "smin v5.4s, v5.4s, v30.4s\n" + "smax v6.4s, v6.4s, v0.4s\n" + "smin v6.4s, v6.4s, v30.4s\n" + "smax v7.4s, v7.4s, v0.4s\n" + "smin v7.4s, v7.4s, v30.4s\n" + "smax v8.4s, v8.4s, v0.4s\n" + "smin v8.4s, v8.4s, v30.4s\n" + "smax v9.4s, v9.4s, v0.4s\n" + "smin v9.4s, v9.4s, v30.4s\n" + "smax v10.4s, v10.4s, v0.4s\n" + "smin v10.4s, v10.4s, v30.4s\n" + "smax v11.4s, v11.4s, v0.4s\n" + "smin v11.4s, v11.4s, v30.4s\n" + "smax v12.4s, v12.4s, v0.4s\n" + "smin v12.4s, v12.4s, v30.4s\n" + "smax v13.4s, v13.4s, v0.4s\n" + "smin v13.4s, v13.4s, v30.4s\n" + "smax v14.4s, v14.4s, v0.4s\n" + "smin v14.4s, v14.4s, v30.4s\n" + "smax v15.4s, v15.4s, v0.4s\n" + "smin v15.4s, v15.4s, v30.4s\n" + "smax v16.4s, v16.4s, v0.4s\n" + "smin v16.4s, v16.4s, v30.4s\n" + "smax v17.4s, v17.4s, v0.4s\n" + "smin v17.4s, v17.4s, v30.4s\n" + "smax v18.4s, v18.4s, v0.4s\n" + "smin v18.4s, v18.4s, v30.4s\n" + "smax v19.4s, v19.4s, v0.4s\n" + "smin v19.4s, v19.4s, v30.4s\n" + "smax v20.4s, v20.4s, v0.4s\n" + "smin v20.4s, v20.4s, v30.4s\n" + + "str q5, [%[out_buf]]\n" + "str q6, [%[out_buf], 16]\n" + "str q7, [%[out_buf], 32]\n" + "str q8, [%[out_buf], 48]\n" + "str q9, [%[out_buf], 64]\n" + "str q10, [%[out_buf], 80]\n" + "str q11, [%[out_buf], 96]\n" + "str q12, [%[out_buf], 112]\n" + "str q13, [%[out_buf], 128]\n" + "str q14, [%[out_buf], 144]\n" + "str q15, [%[out_buf], 160]\n" + "str q16, [%[out_buf], 176]\n" + "str q17, [%[out_buf], 192]\n" + "str q18, [%[out_buf], 208]\n" + "str q19, [%[out_buf], 224]\n" + "str q20, [%[out_buf], 240]\n" + "b 5f\n" + + "7:\n" + "smax v30.4s, v5.4s, v30.4s\n" + "smin v0.4s, v5.4s, v0.4s\n" + "str q5, [%[out_buf]]\n" + "smax v30.4s, v6.4s, v30.4s\n" + "smin v0.4s, v6.4s, v0.4s\n" + "str q6, [%[out_buf], 16]\n" + "smax v30.4s, v7.4s, v30.4s\n" + "smin v0.4s, v7.4s, v0.4s\n" + "str q7, [%[out_buf], 32]\n" + "smax v30.4s, v8.4s, v30.4s\n" + "smin v0.4s, v8.4s, v0.4s\n" + "str q8, [%[out_buf], 48]\n" + "smax v30.4s, v9.4s, v30.4s\n" + "smin v0.4s, v9.4s, v0.4s\n" + "str q9, [%[out_buf], 64]\n" + "smax v30.4s, v10.4s, v30.4s\n" + "smin v0.4s, v10.4s, v0.4s\n" + "str q10, [%[out_buf], 80]\n" + "smax v30.4s, v11.4s, v30.4s\n" + "smin v0.4s, v11.4s, v0.4s\n" + "str q11, [%[out_buf], 96]\n" + "smax v30.4s, v12.4s, v30.4s\n" + "smin v0.4s, v12.4s, v0.4s\n" + "str q12, [%[out_buf], 112]\n" + "smax v30.4s, v13.4s, v30.4s\n" + "smin v0.4s, v13.4s, v0.4s\n" + "str q13, [%[out_buf], 128]\n" + + "smax v30.4s, v14.4s, v30.4s\n" + "smin v0.4s, v14.4s, v0.4s\n" + "str q14, [%[out_buf], 144]\n" + "smax v30.4s, v15.4s, v30.4s\n" + "smin v0.4s, v15.4s, v0.4s\n" + "str q15, [%[out_buf], 160]\n" + "smax v30.4s, v16.4s, v30.4s\n" + "smin v0.4s, v16.4s, v0.4s\n" + "str q16, [%[out_buf], 176]\n" + "smax v30.4s, v17.4s, v30.4s\n" + "smin v0.4s, v17.4s, v0.4s\n" + "str q17, [%[out_buf], 192]\n" + "smax v30.4s, v18.4s, v30.4s\n" + "smin v0.4s, v18.4s, v0.4s\n" + "str q18, [%[out_buf], 208]\n" + "smax v30.4s, v19.4s, v30.4s\n" + "smin 
v0.4s, v19.4s, v0.4s\n" + "str q19, [%[out_buf], 224]\n" + "smax v30.4s, v20.4s, v30.4s\n" + "smin v0.4s, v20.4s, v0.4s\n" + "str q20, [%[out_buf], 240]\n" + + "str q30, [%[max]]\n" + "str q0, [%[min]]\n" + "5:\n" + : + : [out_0] "r"(out_o0hw0), [out_buf] "r"(out_buf), [in_0] "r"(in_hw0), + [f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_0), + [b_0_s] "r"(b_0_s), [factor] "r"(factor_v), [max] "r"(max_i32), + [min] "r"(min_i32), [conv_relu] "r"(conv_relu_bool), + [out_f16] "r"(out_f16_bool), [scale_known] "r"(scale_known_bool) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v29", "v30", + "x0", "x1", "x2", "x3", "x17", "x16"); + b0 += 8; + b0_s += 8; + } + ohow_s += 8; + ohow_tail -= 8; + } + + if (ohow_tail >= 4) { + I32 hw = ohow_s; + F16 *b0 = biasArray; + I32 *b0_s = biasScaled; + INT8 *in_pack = ((INT8 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw4c4 + im2col + U32 in_h[4]; + U32 in_w[4]; + + for (U32 i = 0; i < 4; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + INT8 *in_hw4c8 = inArray_pad + c * ihiw * 8 + + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_hw4c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + INT8 *in_1 = in_hw4c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + INT8 *in_2 = in_hw4c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + INT8 *in_3 = in_hw4c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + INT8 *in_pack_0 = + in_pack + c * fh * fw * 4 * 8 + fh_idx * fw * 4 * 4 + fw_idx * 4 * 4; + INT8 *in_pack_1 = in_pack_0 + fh * fw * 4 * 4; + + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + "str q20, [%[pack_0]]\n" + "str q21, [%[pack_1]]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3) + : "memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3"); + } + } + } + + // compute + for (U32 o = 0; o < oc; o++) { + INT8 *in_hw0 = in_pack; + INT8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + I32 *out_buf = biasScaled + oc * 8 + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + F16 *b_0 = b0; + I32 *b_0_s = b0_s; + __asm__ __volatile__( + "cbz %[out_f16], 8f\n" + "eor v5.16b, v5.16b, v5.16b\n" + "ldr d1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "ldr x1, [%[in_0], #8]\n" + "eor v7.16b, v7.16b, v7.16b\n" + "ins v1.d[1], x1\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + + "eor v9.16b, v9.16b, v9.16b\n" + "ldr x2, [%[f_0], #8]\n" + "eor v10.16b, v10.16b, v10.16b\n" + "ins v0.d[1], x2\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + "b 7f\n" + + "8:\n" + "ldp q29, q30, [%[b_0_s]]\n" + "ldr d1, [%[in_0]]\n" // in_0 + "mov v5.16b, v29.16b\n" + "ldr x1, [%[in_0], #8]\n" + "mov v7.16b, v29.16b\n" + "ins v1.d[1], x1\n" + "mov v9.16b, v29.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "mov v11.16b, v29.16b\n" + "ldr x2, [%[f_0], #8]\n" + + "mov v6.16b, v30.16b\n" + "ins v0.d[1], x2\n" + "mov v8.16b, v30.16b\n" + "mov v10.16b, v30.16b\n" + "mov v12.16b, v30.16b\n" + + "7:\n" + + // give in address 
to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "ldr d29, [x0, 16]\n" + "ldr x17, [x0, 24]\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d3, [x3, 16]!\n" + "ldr x16, [x3, 8]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ins v29.d[1], x17\n" + "subs x2, x2, #4\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "ins v3.d[1], x16\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "ldr d0, [x0, 32]!\n" + "ldr x17, [x0, 8]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "ins v0.d[1], x17\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + "mov v1.16b, v3.16b\n" + + "bne 0b\n" + "cbz %[out_f16], 6f\n" + + "scvtf v5.4s, v5.4s\n" + "scvtf v6.4s, v6.4s\n" + "ldr d1, [%[factor]]\n" + "ldr x1, [%[factor], #8]\n" + "scvtf v7.4s, v7.4s\n" + "scvtf v8.4s, v8.4s\n" + "ins v1.d[1], x1\n" + "scvtf v9.4s, v9.4s\n" + "ldr d0, [%[b_0]]\n" + "ldr x0, [%[b_0], #8]\n" + "scvtf v10.4s, v10.4s\n" + "scvtf v11.4s, v11.4s\n" + "ins v0.d[1], x0\n" + "scvtf v12.4s, v12.4s\n" + + "fmul v5.4s, v1.4s, v5.4s\n" + "fmul v6.4s, v1.4s, v6.4s\n" + "fmul v7.4s, v1.4s, v7.4s\n" + "fmul v8.4s, v1.4s, v8.4s\n" + "fmul v9.4s, v1.4s, v9.4s\n" + "fmul v10.4s, v1.4s, v10.4s\n" + "fmul v11.4s, v1.4s, v11.4s\n" + "fmul v12.4s, v1.4s, v12.4s\n" + + "fcvtn v5.4h, v5.4s\n" + "fcvtn v7.4h, v7.4s\n" + "fcvtn v9.4h, v9.4s\n" + "fcvtn v11.4h, v11.4s\n" + + "fcvtn2 v5.8h, v6.4s\n" + "fcvtn2 v7.8h, v8.4s\n" + "fcvtn2 v9.8h, v10.4s\n" + "fcvtn2 v11.8h, v12.4s\n" + + "fadd v5.8h, v0.8h, v5.8h\n" + "fadd v7.8h, v0.8h, v7.8h\n" + "fadd v9.8h, v0.8h, v9.8h\n" + "fadd v11.8h, v0.8h, v11.8h\n" + + "cbz %[conv_relu], 1f\n" + "eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + + "1:\n" + "str q5, [%[out_0]]\n" + "str q7, [%[out_0], #16]\n" + "str q9, [%[out_0], #32]\n" + "str q11, [%[out_0], #48]\n" + "b 5f\n" + + "6:\n" + "ldr q0, [%[min]]\n" + "ldr q30, [%[max]]\n" + "cbz %[conv_relu], 2f\n" + "eor v1.16b, v1.16b, v1.16b\n" // zero + "smax v5.4s, v5.4s, v1.4s\n" + "smax v6.4s, v6.4s, v1.4s\n" + "smax v7.4s, v7.4s, v1.4s\n" + "smax v8.4s, v8.4s, v1.4s\n" + "smax v9.4s, v9.4s, v1.4s\n" + "smax v10.4s, v10.4s, v1.4s\n" + "smax v11.4s, v11.4s, v1.4s\n" + "smax v12.4s, v12.4s, v1.4s\n" + + "2:\n" + "cbz %[scale_known], 7f\n" + "smax v5.4s, v5.4s, v0.4s\n" + "smin v5.4s, v5.4s, v30.4s\n" + "smax v6.4s, v6.4s, v0.4s\n" + "smin v6.4s, v6.4s, v30.4s\n" + "smax v7.4s, v7.4s, v0.4s\n" + "smin v7.4s, v7.4s, v30.4s\n" + "smax v8.4s, v8.4s, v0.4s\n" + "smin v8.4s, v8.4s, v30.4s\n" + "smax v9.4s, v9.4s, v0.4s\n" + "smin v9.4s, v9.4s, v30.4s\n" + "smax v10.4s, v10.4s, v0.4s\n" + "smin v10.4s, v10.4s, v30.4s\n" + "smax v11.4s, v11.4s, v0.4s\n" + "smin v11.4s, v11.4s, v30.4s\n" + "smax v12.4s, v12.4s, v0.4s\n" + "smin v12.4s, v12.4s, v30.4s\n" + + "str q5, [%[out_buf]]\n" + "str q6, [%[out_buf], 16]\n" + "str q7, [%[out_buf], 32]\n" + "str q8, [%[out_buf], 48]\n" + "str q9, [%[out_buf], 64]\n" + "str q10, [%[out_buf], 80]\n" + "str q11, [%[out_buf], 96]\n" + "str q12, [%[out_buf], 112]\n" + "b 5f\n" + + "7:\n" + "smax v30.4s, v5.4s, v30.4s\n" + "smin v0.4s, v5.4s, v0.4s\n" + "str q5, [%[out_buf]]\n" + "smax v30.4s, v6.4s, v30.4s\n" + "smin v0.4s, v6.4s, v0.4s\n" + "str q6, [%[out_buf], 16]\n" + "smax v30.4s, v7.4s, v30.4s\n" + "smin v0.4s, v7.4s, v0.4s\n" + "str q7, [%[out_buf], 32]\n" + "smax v30.4s, v8.4s, v30.4s\n" + "smin v0.4s, v8.4s, v0.4s\n" + "str 
q8, [%[out_buf], 48]\n" + "smax v30.4s, v9.4s, v30.4s\n" + "smin v0.4s, v9.4s, v0.4s\n" + "str q9, [%[out_buf], 64]\n" + "smax v30.4s, v10.4s, v30.4s\n" + "smin v0.4s, v10.4s, v0.4s\n" + "str q10, [%[out_buf], 80]\n" + "smax v30.4s, v11.4s, v30.4s\n" + "smin v0.4s, v11.4s, v0.4s\n" + "str q11, [%[out_buf], 96]\n" + "smax v30.4s, v12.4s, v30.4s\n" + "smin v0.4s, v12.4s, v0.4s\n" + "str q12, [%[out_buf], 112]\n" + + "str q30, [%[max]]\n" + "str q0, [%[min]]\n" + "5:\n" + : + : [out_0] "r"(out_o0hw0), [out_buf] "r"(out_buf), [in_0] "r"(in_hw0), + [f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_0), + [b_0_s] "r"(b_0_s), [factor] "r"(factor_v), [max] "r"(max_i32), + [min] "r"(min_i32), [conv_relu] "r"(conv_relu_bool), + [out_f16] "r"(out_f16_bool), [scale_known] "r"(scale_known_bool) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v29", "v30", "x0", "x1", "x2", "x3", "x17", "x16"); + b0 += 8; + b0_s += 8; + } + ohow_s += 4; + } + + for (I32 hw = ohow_s; hw < ohow; hw++) { + F16 *b0 = biasArray; + I32 *b0_s = biasScaled; + INT8 *in_pack = ((INT8 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw1c4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + INT8 *in_hw1c8 = inArray_pad + c * ihiw * 8 + + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_hw1c8 + in_h_0 * iw_pad * 8 + in_w_0 * 8; + INT8 *in_pack_0 = in_pack + c * fh * fw * 8 + fh_idx * fw * 4 + fw_idx * 4; + INT8 *in_pack_1 = in_pack_0 + fh * fw * 4; + + memcpy(in_pack_0, in_0, 4 * bytesOf(DT_I8)); + memcpy(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); + } + } + } + + // compute + for (U32 o = 0; o < oc; o++) { + INT8 *in_hw = in_pack; + INT8 *f_o = filterArray + o * 8 * fh * fw * ic * 8; + I32 *out_buf = biasScaled + oc * 8 + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + + int32x4_t res[2] = {0}; + if (out_f16_bool == 0) { + res[0] = vld1q_s32(b0_s); + res[1] = vld1q_s32(b0_s + 4); + } + + for (U32 c = 0; c < ic * fh * fw; c++) { + int8x8_t in_2 = vld1_s8(in_hw); + in_hw += 8; + int8x16_t f_8o[4]; + f_8o[0] = vld1q_s8(f_o); + f_8o[1] = vld1q_s8(f_o + 16); + res[0] = vdotq_lane_s32(res[0], f_8o[0], in_2, 0); + res[1] = vdotq_lane_s32(res[1], f_8o[1], in_2, 0); + + f_8o[2] = vld1q_s8(f_o + 32); + f_8o[3] = vld1q_s8(f_o + 48); + f_o += 64; + res[0] = vdotq_lane_s32(res[0], f_8o[2], in_2, 1); + res[1] = vdotq_lane_s32(res[1], f_8o[3], in_2, 1); + } + if (out_f16_bool == 1) { + float32x4_t fac = vld1q_f32(factor_v); + float32x4_t resf0 = vcvtq_f32_s32(res[0]); + float32x4_t resf1 = vcvtq_f32_s32(res[1]); + resf0 = vmulq_f32(resf0, fac); + resf1 = vmulq_f32(resf1, fac); + + float16x4_t bias0 = vld1_f16(b0); + float16x4_t bias1 = vld1_f16(b0 + 4); + float16x4_t resh0 = vcvt_f16_f32(resf0); + float16x4_t resh1 = vcvt_f16_f32(resf1); + resh0 = vadd_f16(resh0, bias0); + resh1 = vadd_f16(resh1, bias1); + + if (conv_relu_bool) { + float16x4_t z = vdup_n_f16(0); + resh0 = vmax_f16(resh0, z); + resh1 = vmax_f16(resh1, z); + } + vst1_f16(out_o0hw0, resh0); + vst1_f16(out_o0hw0 + 4, resh1); + } else { + int32x4_t max = vld1q_s32(max_i32); + int32x4_t min = vld1q_s32(min_i32); + if (conv_relu_bool) { + int32x4_t z = vdupq_n_s32(0); + res[0] = vmaxq_s32(res[0], z); + res[1] = vmaxq_s32(res[1], z); + } + if (1 == scale_known_bool)
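// Known output scale: clamp the I32 results into the precomputed [min_i32, max_i32] + // window before storing; in the else branch below, the running max/min are updated + // instead so a requantization scale can be chosen once the whole output map is done.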
{ + res[0] = vmaxq_s32(min, res[0]); + res[1] = vmaxq_s32(min, res[1]); + res[0] = vminq_s32(max, res[0]); + res[1] = vminq_s32(max, res[1]); + } else { + max = vmaxq_s32(max, res[0]); + min = vminq_s32(min, res[0]); + max = vmaxq_s32(max, res[1]); + min = vminq_s32(min, res[1]); + vst1q_s32(max_i32, max); + vst1q_s32(min_i32, min); + } + vst1q_s32(out_buf, res[0]); + vst1q_s32(out_buf + 4, res[1]); + } + + b0 += 8; + b0_s += 8; + } + } + } + + EE ret = SUCCESS; + if (out_f16_bool == 0) { + I32 factor; + F32 scale_o; + + if (1 == scale_known_bool) { + scale_o = (*outputScale / *inputScale) / *filterScale; + factor = 127 * 16777216 / max_i32[0]; // Q24 fixed-point multiplier (127 / max), consumed by quantize_I32 + } else { + I32 max = max_i32[0]; + I32 min = min_i32[0]; + for (U32 i = 1; i < 4; i++) { + if (max < max_i32[i]) { + max = max_i32[i]; + } + if (min > min_i32[i]) { + min = min_i32[i]; + } + } + + if (max == 0 && min == 0) { + return NOT_SUPPORTED; + } + + if (max > 0 && min < 0) { + I32 factor_max = 127 * 16777216 / max; + I32 factor_min = -127 * 16777216 / min; + factor = (factor_max < factor_min) ? factor_max : factor_min; + scale_o = (factor_max < factor_min) ? (127.0 / max) : (-127.0 / min); + } else if (max > 0) { + factor = 127 * 16777216 / max; + scale_o = 127.0 / max; + } else { + factor = -127 * 16777216 / min; + scale_o = -127.0 / min; + } + *outputScale = (*inputScale) * (*filterScale) * scale_o; + } + + U32 num_v = oc * ohow * 2; // Number of q-form vectors + I32 *out_buf = biasScaled + oc * 8; + INT8 *out_q = (INT8 *)output; + + ret = quantize_I32(num_v, out_buf, factor, scale_o, out_q); + } + return ret; +} + +template EE convolution_gemm_A55<F16>(TensorDesc inputDesc, + const void *input, + F16 *inputScale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec activationDesc); + +template EE convolution_gemm_A55<INT8>(TensorDesc inputDesc, + const void *input, + F16 *inputScale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec activationDesc); +#endif diff --git a/compute/tensor/src/cpu/arm/int8/convolution_gemm_A76.cpp b/compute/tensor/src/cpu/arm/int8/convolution_gemm_A76.cpp new file mode 100644 index 00000000..4fe4e040 --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/convolution_gemm_A76.cpp @@ -0,0 +1,1554 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_INT8 +#include <arm_neon.h> +#include "cpu/arm/int8/convolution_gemm.h" + +template <typename OT> +EE convolution_gemm_A76(TensorDesc inputDesc, + const void *input, + F16 *inputScale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + // still im2col + gemm with a smaller buffer + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (fdf != DF_NCHWN8C4) { + return NOT_MATCH; + } + + I64 conv_relu_bool = (activationDesc.mode == ACTIVATION_RELU) ? 1 : 0; + I64 out_f16_bool = (odt == DT_F16) ? 1 : 0; + I64 scale_known_bool = 0; + if (*outputScale > 0 || ACTIVATION_RELU6 == activationDesc.mode) { + scale_known_bool = 1; + } + + INT8 *inArray = (INT8 *)input; // It will be updated if there is quantization + INT8 *filterArray = (INT8 *)filter; + F16 *outArray = (F16 *)output; + F16 *biasArray = (F16 *)bias; + INT8 *in_pad = (INT8 *)tmp; + + // both input and output are stored with C8 + oc /= 8; + ic /= 8; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + I32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + + I32 *biasScaled = (I32 *)(in_pad + ic * ihiw * 8 + 12 * fh * fw * ic * 8); // Initialize + + // double start, end; + I32 max_i32[4] = {0}; // To record max I32 values + I32 min_i32[4] = {0}; // To record min I32 values + + for (U32 n = 0; n < in; n++) { // for each batch + F16 scale_i = 1.0; + + // quantize input if necessary + if (idt == DT_F16) { + // start = get_current_time_int8(); + F16 *in = ((F16 *)input) + n * ic * ih * iw * 8; + inArray = in_pad + ic * ihiw * 8 + + 12 * fh * fw * ic * 8; // After the space for padding and packing + + U32 numData = ic * ih * iw * 8; + if (*inputScale > 0) { + scale_i = *inputScale; + } else { + float16x8_t temp_v = vld1q_f16(in); + float16x8_t max_v = temp_v; + float16x8_t min_v = temp_v; + + for (U32 i = 8; i < numData; i += 8) { + temp_v = vld1q_f16(in + i); + max_v = vmaxq_f16(max_v, temp_v); + min_v = vminq_f16(min_v, temp_v); + } + + F16 max = vmaxvq_f16(max_v); + F16 min = vminvq_f16(min_v); + + if (max == 0 && min == 0) { + return NOT_SUPPORTED; + } + if (max > 0 && min < 0) { + F16 scale_max = 127.0 / max; + F16 scale_min = -127.0 / min; + scale_i = (scale_max < scale_min) ?
scale_max : scale_min; + } else if (max < 0) { + scale_i = -127.0 / min; + } else { // min > 0 + scale_i = 127.0 / max; + } + } + for (U32 i = 0; i < numData; i++) { + F32 temp = in[i] * scale_i; + inArray[i] = round_towards_zero(temp, (*inputScale) != scale_i); + } + *inputScale = scale_i; + } else { + scale_i = *inputScale; + } + + if (1 == scale_known_bool) { + if (ACTIVATION_RELU6 == activationDesc.mode) { + *outputScale = 127.0 / 6.0; + } + F32 scaleInt = (*outputScale / *inputScale) / *filterScale; + I32 thresholdP = 127.0 / scaleInt; + I32 thresholdN = 0; + if (ACTIVATION_RELU6 != activationDesc.mode) { + thresholdN = thresholdP * -1; + } + + for (U32 i = 0; i < 4; i++) { + max_i32[i] = thresholdP; + min_i32[i] = thresholdN; + } + } + + if (odt == DT_I8) { // Scale the bias + if (idt == DT_F16) { + biasScaled += ic * ih * iw * 8 / bytesOf(DT_I32); // After the quantized input + } + F32 scale = (*inputScale) * (*filterScale); + for (U32 i = 0; i < oc * 8; i++) { + biasScaled[i] = round(scale * biasArray[i]); + } + } + + F32 factor_s = 1.0 / ((F32)scale_i) / ((F32)(*filterScale)); + F32 factor_v[4]; + for (U32 i = 0; i < 4; i++) { + factor_v[i] = factor_s; + } + + INT8 *inArray_pad; + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw * 8; // use this batch directly + } else { + // copy input into an input with padding + inArray_pad = (INT8 *)tmp; + INT8 *inArray_pad_mov = inArray_pad; + INT8 *inArray_mov = inArray + n * ic * ih * iw * 8; + for (U32 c = 0; c < ic; c++) { // for each 8 channels + for (U32 h = 0; h < paddingT; h++) { // Upper rows of 0 + memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(DT_I8)); // 8 comes from C8 + inArray_pad_mov += iw_pad * 8; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { // for each middle-section rows + memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(DT_I8)); // padding on the left + inArray_pad_mov += paddingL * 8; // 8 comes from C8 + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(DT_I8)); // Copy input row + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset( + inArray_pad_mov, 0, paddingR * 8 * bytesOf(DT_I8)); // padding on the right + inArray_pad_mov += paddingR * 8; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { // Bottom rows of 0 + memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(DT_I8)); + inArray_pad_mov += iw_pad * 8; + } + } + } + // ohow / 12 (12x8) + for (I32 hw = 0; hw < ohow - 11; hw += 12) { // Remainder will be handled later + F16 *b0 = biasArray; + I32 *b0_s = biasScaled; + INT8 *in_pack = ((INT8 *)tmp) + ic * ih_pad * iw_pad * 8; // After the padded input + // pack input + // NCHWc8 => NHWChw12c4 + im2col + U32 in_h[12]; + U32 in_w[12]; + + for (U32 i = 0; i < 12; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { // for each 8 channels + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + INT8 *in_hw12c8 = inArray_pad + c * ihiw * 8 + + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + + INT8 *in_0 = in_hw12c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + INT8 *in_1 = in_hw12c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + INT8 *in_2 = in_hw12c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + INT8 *in_3 = in_hw12c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + INT8 *in_4 = in_hw12c8 + in_h[4] * iw_pad * 8 + in_w[4] * 8; + INT8 *in_5 = in_hw12c8 + in_h[5] * iw_pad * 8 + in_w[5] * 8; + INT8 *in_6 = in_hw12c8 + in_h[6] * iw_pad * 8 + 
in_w[6] * 8; + INT8 *in_7 = in_hw12c8 + in_h[7] * iw_pad * 8 + in_w[7] * 8; + INT8 *in_8 = in_hw12c8 + in_h[8] * iw_pad * 8 + in_w[8] * 8; + INT8 *in_9 = in_hw12c8 + in_h[9] * iw_pad * 8 + in_w[9] * 8; + INT8 *in_10 = in_hw12c8 + in_h[10] * iw_pad * 8 + in_w[10] * 8; + INT8 *in_11 = in_hw12c8 + in_h[11] * iw_pad * 8 + in_w[11] * 8; + + // in_pack (tmp) is reused for each tile + // NHWChw12c4 + INT8 *in_pack_0 = + in_pack + c * fh * fw * 12 * 8 + fh_idx * fw * 12 * 4 + fw_idx * 12 * 4; + INT8 *in_pack_1 = in_pack_0 + fh * fw * 12 * 4; + + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + + "ldr d5, [%[in_5]]\n" + "ins v0.d[1], x2\n" + + "ldr x7, [%[in_7]]\n" + "ins v1.d[1], x3\n" + + "ldr d8, [%[in_8]]\n" + "ins v4.d[1], x6\n" + + "trn1 v20.4s, v0.4s, v1.4s\n" + "ins v5.d[1], x7\n" + + "trn2 v21.4s, v0.4s, v1.4s\n" + "ldr x10, [%[in_10]]\n" + + "ldr d9, [%[in_9]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + + "trn2 v25.4s, v4.4s, v5.4s\n" + "ldr x11, [%[in_11]]\n" + + "str q20, [%[pack_0]]\n" + "ins v8.d[1], x10\n" + + "str q24, [%[pack_0], #16]\n" + "ins v9.d[1], x11\n" + + "trn1 v28.4s, v8.4s, v9.4s\n" + "str q21, [%[pack_1]]\n" + + "trn2 v29.4s, v8.4s, v9.4s\n" + "str q25, [%[pack_1], #16]\n" + + "str q28, [%[pack_0], #32]\n" + "str q29, [%[pack_1], #32]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), + [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7), [in_8] "r"(in_8), + [in_9] "r"(in_9), [in_10] "r"(in_10), [in_11] "r"(in_11) + : "memory", "cc", "v0", "v1", "v4", "v5", "v8", "v9", "v20", "v21", + "v24", "v25", "v28", "v29", "x2", "x3", "x6", "x7", "x10", "x11"); + } + } + } + + // compute + for (U32 o = 0; o < oc; o++) { // 8 output channels at a time + INT8 *in_hw0 = in_pack; + INT8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + I32 *out_buf = biasScaled + oc * 8 + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + F16 *b_0 = b0; + I32 *b_0_s = b0_s; + __asm__ __volatile__( + "cbz %[out_f16], 8f\n" + "eor v5.16b, v5.16b, v5.16b\n" + "ldr q1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "eor v7.16b, v7.16b, v7.16b\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr q0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" + "eor v10.16b, v10.16b, v10.16b\n" + "eor v11.16b, v11.16b, v11.16b\n" + "ldr q3, [%[in_0], #16]\n" // in_1 + "eor v12.16b, v12.16b, v12.16b\n" + "eor v13.16b, v13.16b, v13.16b\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + "b 7f\n" + + "8:\n" + "ldp q29, q30, [%[b_0_s]]\n" + "mov v5.16b, v29.16b\n" + "ldr q1, [%[in_0]]\n" // in_0 + "mov v7.16b, v29.16b\n" + "mov v9.16b, v29.16b\n" + "mov v11.16b, v29.16b\n" + "ldr q0, [%[f_0]]\n" // f_0 + "mov v13.16b, v29.16b\n" + "mov v15.16b, v29.16b\n" + "mov v17.16b, v29.16b\n" + "ldr q3, [%[in_0], #16]\n" // in_1 + "mov v19.16b, 
v29.16b\n" + "mov v21.16b, v29.16b\n" + "mov v23.16b, v29.16b\n" + "mov v25.16b, v29.16b\n" + "mov v27.16b, v29.16b\n" + + "mov v6.16b, v30.16b\n" + "mov v8.16b, v30.16b\n" + "mov v10.16b, v30.16b\n" + "mov v12.16b, v30.16b\n" + "mov v14.16b, v30.16b\n" + "mov v16.16b, v30.16b\n" + "mov v18.16b, v30.16b\n" + "mov v20.16b, v30.16b\n" + "mov v22.16b, v30.16b\n" + "mov v24.16b, v30.16b\n" + "mov v26.16b, v30.16b\n" + "mov v28.16b, v30.16b\n" + + "7:\n" + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "ldr q2, [x3, 32]\n" + "ldr q29, [x0, 16]\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + + "sdot v21.4s, v0.16b, v2.4b[0]\n" + "sdot v23.4s, v0.16b, v2.4b[1]\n" + "sdot v25.4s, v0.16b, v2.4b[2]\n" + "sdot v27.4s, v0.16b, v2.4b[3]\n" + + "sdot v14.4s, v29.16b, v3.4b[0]\n" + "sdot v16.4s, v29.16b, v3.4b[1]\n" + "ldr q0, [x0, 32]!\n" + "subs x2, x2, #4\n" + "sdot v18.4s, v29.16b, v3.4b[2]\n" + "sdot v20.4s, v29.16b, v3.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + + "ldr q1, [x3, 48]!\n" + "ldr q3, [x3, 16]\n" + "sdot v22.4s, v29.16b, v2.4b[0]\n" + "sdot v24.4s, v29.16b, v2.4b[1]\n" + "sdot v26.4s, v29.16b, v2.4b[2]\n" + "sdot v28.4s, v29.16b, v2.4b[3]\n" + + "bne 0b\n" + "cbz %[out_f16], 6f\n" + + "scvtf v5.4s, v5.4s\n" + "scvtf v6.4s, v6.4s\n" + "ldr q1, [%[factor]]\n" + "scvtf v7.4s, v7.4s\n" + "scvtf v8.4s, v8.4s\n" + "scvtf v9.4s, v9.4s\n" + "ldr q0, [%[b_0]]\n" + "scvtf v10.4s, v10.4s\n" + "scvtf v11.4s, v11.4s\n" + "scvtf v12.4s, v12.4s\n" + "scvtf v13.4s, v13.4s\n" + "scvtf v14.4s, v14.4s\n" + "scvtf v15.4s, v15.4s\n" + "scvtf v16.4s, v16.4s\n" + "scvtf v17.4s, v17.4s\n" + "scvtf v18.4s, v18.4s\n" + "scvtf v19.4s, v19.4s\n" + "scvtf v20.4s, v20.4s\n" + "scvtf v21.4s, v21.4s\n" + "scvtf v22.4s, v22.4s\n" + "scvtf v23.4s, v23.4s\n" + "scvtf v24.4s, v24.4s\n" + "scvtf v25.4s, v25.4s\n" + "scvtf v26.4s, v26.4s\n" + "scvtf v27.4s, v27.4s\n" + "scvtf v28.4s, v28.4s\n" + + "fmul v5.4s, v1.4s, v5.4s\n" + "fmul v6.4s, v1.4s, v6.4s\n" + "fmul v7.4s, v1.4s, v7.4s\n" + "fmul v8.4s, v1.4s, v8.4s\n" + "fmul v9.4s, v1.4s, v9.4s\n" + "fmul v10.4s, v1.4s, v10.4s\n" + "fmul v11.4s, v1.4s, v11.4s\n" + "fmul v12.4s, v1.4s, v12.4s\n" + "fmul v13.4s, v1.4s, v13.4s\n" + "fmul v14.4s, v1.4s, v14.4s\n" + "fmul v15.4s, v1.4s, v15.4s\n" + "fmul v16.4s, v1.4s, v16.4s\n" + "fmul v17.4s, v1.4s, v17.4s\n" + "fmul v18.4s, v1.4s, v18.4s\n" + "fmul v19.4s, v1.4s, v19.4s\n" + "fmul v20.4s, v1.4s, v20.4s\n" + "fmul v21.4s, v1.4s, v21.4s\n" + "fmul v22.4s, v1.4s, v22.4s\n" + "fmul v23.4s, v1.4s, v23.4s\n" + "fmul v24.4s, v1.4s, v24.4s\n" + "fmul v25.4s, v1.4s, v25.4s\n" + "fmul v26.4s, v1.4s, v26.4s\n" + "fmul v27.4s, v1.4s, v27.4s\n" + "fmul v28.4s, v1.4s, v28.4s\n" + + "fcvtn v5.4h, v5.4s\n" + "fcvtn v7.4h, v7.4s\n" + "fcvtn v9.4h, v9.4s\n" + "fcvtn v11.4h, v11.4s\n" + "fcvtn v13.4h, v13.4s\n" + "fcvtn v15.4h, v15.4s\n" + "fcvtn v17.4h, v17.4s\n" + "fcvtn v19.4h, v19.4s\n" + "fcvtn v21.4h, v21.4s\n" + "fcvtn v23.4h, v23.4s\n" + "fcvtn v25.4h, v25.4s\n" + "fcvtn v27.4h, v27.4s\n" + + "fcvtn2 v5.8h, v6.4s\n" + "fcvtn2 v7.8h, v8.4s\n" + "fcvtn2 v9.8h, v10.4s\n" + "fcvtn2 v11.8h, 
v12.4s\n" + "fcvtn2 v13.8h, v14.4s\n" + "fcvtn2 v15.8h, v16.4s\n" + "fcvtn2 v17.8h, v18.4s\n" + "fcvtn2 v19.8h, v20.4s\n" + "fcvtn2 v21.8h, v22.4s\n" + "fcvtn2 v23.8h, v24.4s\n" + "fcvtn2 v25.8h, v26.4s\n" + "fcvtn2 v27.8h, v28.4s\n" + + "fadd v5.8h, v0.8h, v5.8h\n" + "fadd v7.8h, v0.8h, v7.8h\n" + "fadd v9.8h, v0.8h, v9.8h\n" + "fadd v11.8h, v0.8h, v11.8h\n" + "fadd v13.8h, v0.8h, v13.8h\n" + "fadd v15.8h, v0.8h, v15.8h\n" + "fadd v17.8h, v0.8h, v17.8h\n" + "fadd v19.8h, v0.8h, v19.8h\n" + "fadd v21.8h, v0.8h, v21.8h\n" + "fadd v23.8h, v0.8h, v23.8h\n" + "fadd v25.8h, v0.8h, v25.8h\n" + "fadd v27.8h, v0.8h, v27.8h\n" + + "cbz %[conv_relu], 1f\n" + "eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + "fmax v19.8h, v19.8h, v1.8h\n" + "fmax v21.8h, v21.8h, v1.8h\n" + "fmax v23.8h, v23.8h, v1.8h\n" + "fmax v25.8h, v25.8h, v1.8h\n" + "fmax v27.8h, v27.8h, v1.8h\n" + + "1:\n" + "str q5, [%[out_0]]\n" + "str q7, [%[out_0], #16]\n" + "str q9, [%[out_0], #32]\n" + "str q11, [%[out_0], #48]\n" + "str q13, [%[out_0], #64]\n" + "str q15, [%[out_0], #80]\n" + "str q17, [%[out_0], #96]\n" + "str q19, [%[out_0], #112]\n" + "str q21, [%[out_0], #128]\n" + "str q23, [%[out_0], #144]\n" + "str q25, [%[out_0], #160]\n" + "str q27, [%[out_0], #176]\n" + "b 5f\n" + + "6:\n" + "ldr q0, [%[min]]\n" + "ldr q30, [%[max]]\n" + "cbz %[conv_relu], 2f\n" + "eor v1.16b, v1.16b, v1.16b\n" // zero + "smax v5.4s, v5.4s, v1.4s\n" + "smax v6.4s, v6.4s, v1.4s\n" + "smax v7.4s, v7.4s, v1.4s\n" + "smax v8.4s, v8.4s, v1.4s\n" + "smax v9.4s, v9.4s, v1.4s\n" + "smax v10.4s, v10.4s, v1.4s\n" + "smax v11.4s, v11.4s, v1.4s\n" + "smax v12.4s, v12.4s, v1.4s\n" + "smax v13.4s, v13.4s, v1.4s\n" + "smax v14.4s, v14.4s, v1.4s\n" + "smax v15.4s, v15.4s, v1.4s\n" + "smax v16.4s, v16.4s, v1.4s\n" + "smax v17.4s, v17.4s, v1.4s\n" + "smax v18.4s, v18.4s, v1.4s\n" + "smax v19.4s, v19.4s, v1.4s\n" + "smax v20.4s, v20.4s, v1.4s\n" + "smax v21.4s, v21.4s, v1.4s\n" + "smax v22.4s, v22.4s, v1.4s\n" + "smax v23.4s, v23.4s, v1.4s\n" + "smax v24.4s, v24.4s, v1.4s\n" + "smax v25.4s, v25.4s, v1.4s\n" + "smax v26.4s, v26.4s, v1.4s\n" + "smax v27.4s, v27.4s, v1.4s\n" + "smax v28.4s, v28.4s, v1.4s\n" + + "2:\n" + "cbz %[scale_known], 7f\n" + "smax v5.4s, v5.4s, v0.4s\n" + "smin v5.4s, v5.4s, v30.4s\n" + "smax v6.4s, v6.4s, v0.4s\n" + "smin v6.4s, v6.4s, v30.4s\n" + "smax v7.4s, v7.4s, v0.4s\n" + "smin v7.4s, v7.4s, v30.4s\n" + "smax v8.4s, v8.4s, v0.4s\n" + "smin v8.4s, v8.4s, v30.4s\n" + "smax v9.4s, v9.4s, v0.4s\n" + "smin v9.4s, v9.4s, v30.4s\n" + "smax v10.4s, v10.4s, v0.4s\n" + "smin v10.4s, v10.4s, v30.4s\n" + "smax v11.4s, v11.4s, v0.4s\n" + "smin v11.4s, v11.4s, v30.4s\n" + "smax v12.4s, v12.4s, v0.4s\n" + "smin v12.4s, v12.4s, v30.4s\n" + "smax v13.4s, v13.4s, v0.4s\n" + "smin v13.4s, v13.4s, v30.4s\n" + "smax v14.4s, v14.4s, v0.4s\n" + "smin v14.4s, v14.4s, v30.4s\n" + "smax v15.4s, v15.4s, v0.4s\n" + "smin v15.4s, v15.4s, v30.4s\n" + "smax v16.4s, v16.4s, v0.4s\n" + "smin v16.4s, v16.4s, v30.4s\n" + "smax v17.4s, v17.4s, v0.4s\n" + "smin v17.4s, v17.4s, v30.4s\n" + "smax v18.4s, v18.4s, v0.4s\n" + "smin v18.4s, v18.4s, v30.4s\n" + "smax v19.4s, v19.4s, v0.4s\n" + "smin v19.4s, v19.4s, v30.4s\n" + "smax v20.4s, v20.4s, v0.4s\n" + "smin v20.4s, v20.4s, v30.4s\n" + "smax v21.4s, v21.4s, v0.4s\n" + "smin v21.4s, v21.4s, v30.4s\n" + "smax 
v22.4s, v22.4s, v0.4s\n" + "smin v22.4s, v22.4s, v30.4s\n" + "smax v23.4s, v23.4s, v0.4s\n" + "smin v23.4s, v23.4s, v30.4s\n" + "smax v24.4s, v24.4s, v0.4s\n" + "smin v24.4s, v24.4s, v30.4s\n" + "smax v25.4s, v25.4s, v0.4s\n" + "smin v25.4s, v25.4s, v30.4s\n" + "smax v26.4s, v26.4s, v0.4s\n" + "smin v26.4s, v26.4s, v30.4s\n" + "smax v27.4s, v27.4s, v0.4s\n" + "smin v27.4s, v27.4s, v30.4s\n" + "smax v28.4s, v28.4s, v0.4s\n" + "smin v28.4s, v28.4s, v30.4s\n" + + "str q5, [%[out_buf]]\n" + "str q6, [%[out_buf], 16]\n" + "str q7, [%[out_buf], 32]\n" + "str q8, [%[out_buf], 48]\n" + "str q9, [%[out_buf], 64]\n" + "str q10, [%[out_buf], 80]\n" + "str q11, [%[out_buf], 96]\n" + "str q12, [%[out_buf], 112]\n" + "str q13, [%[out_buf], 128]\n" + "str q14, [%[out_buf], 144]\n" + "str q15, [%[out_buf], 160]\n" + "str q16, [%[out_buf], 176]\n" + "str q17, [%[out_buf], 192]\n" + "str q18, [%[out_buf], 208]\n" + "str q19, [%[out_buf], 224]\n" + "str q20, [%[out_buf], 240]\n" + "str q21, [%[out_buf], 256]\n" + "str q22, [%[out_buf], 272]\n" + "str q23, [%[out_buf], 288]\n" + "str q24, [%[out_buf], 304]\n" + "str q25, [%[out_buf], 320]\n" + "str q26, [%[out_buf], 336]\n" + "str q27, [%[out_buf], 352]\n" + "str q28, [%[out_buf], 368]\n" + "b 5f\n" + + "7:\n" + "smax v30.4s, v5.4s, v30.4s\n" + "smin v0.4s, v5.4s, v0.4s\n" + "str q5, [%[out_buf]]\n" + "smax v30.4s, v6.4s, v30.4s\n" + "smin v0.4s, v6.4s, v0.4s\n" + "str q6, [%[out_buf], 16]\n" + "smax v30.4s, v7.4s, v30.4s\n" + "smin v0.4s, v7.4s, v0.4s\n" + "str q7, [%[out_buf], 32]\n" + "smax v30.4s, v8.4s, v30.4s\n" + "smin v0.4s, v8.4s, v0.4s\n" + "str q8, [%[out_buf], 48]\n" + "smax v30.4s, v9.4s, v30.4s\n" + "smin v0.4s, v9.4s, v0.4s\n" + "str q9, [%[out_buf], 64]\n" + "smax v30.4s, v10.4s, v30.4s\n" + "smin v0.4s, v10.4s, v0.4s\n" + "str q10, [%[out_buf], 80]\n" + "smax v30.4s, v11.4s, v30.4s\n" + "smin v0.4s, v11.4s, v0.4s\n" + "str q11, [%[out_buf], 96]\n" + "smax v30.4s, v12.4s, v30.4s\n" + "smin v0.4s, v12.4s, v0.4s\n" + "str q12, [%[out_buf], 112]\n" + "smax v30.4s, v13.4s, v30.4s\n" + "smin v0.4s, v13.4s, v0.4s\n" + "str q13, [%[out_buf], 128]\n" + + "smax v30.4s, v14.4s, v30.4s\n" + "smin v0.4s, v14.4s, v0.4s\n" + "str q14, [%[out_buf], 144]\n" + "smax v30.4s, v15.4s, v30.4s\n" + "smin v0.4s, v15.4s, v0.4s\n" + "str q15, [%[out_buf], 160]\n" + "smax v30.4s, v16.4s, v30.4s\n" + "smin v0.4s, v16.4s, v0.4s\n" + "str q16, [%[out_buf], 176]\n" + "smax v30.4s, v17.4s, v30.4s\n" + "smin v0.4s, v17.4s, v0.4s\n" + "str q17, [%[out_buf], 192]\n" + "smax v30.4s, v18.4s, v30.4s\n" + "smin v0.4s, v18.4s, v0.4s\n" + "str q18, [%[out_buf], 208]\n" + "smax v30.4s, v19.4s, v30.4s\n" + "smin v0.4s, v19.4s, v0.4s\n" + "str q19, [%[out_buf], 224]\n" + "smax v30.4s, v20.4s, v30.4s\n" + "smin v0.4s, v20.4s, v0.4s\n" + "str q20, [%[out_buf], 240]\n" + "smax v30.4s, v21.4s, v30.4s\n" + "smin v0.4s, v21.4s, v0.4s\n" + "str q21, [%[out_buf], 256]\n" + "smax v30.4s, v22.4s, v30.4s\n" + "smin v0.4s, v22.4s, v0.4s\n" + "str q22, [%[out_buf], 272]\n" + "smax v30.4s, v23.4s, v30.4s\n" + "smin v0.4s, v23.4s, v0.4s\n" + "str q23, [%[out_buf], 288]\n" + "smax v30.4s, v24.4s, v30.4s\n" + "smin v0.4s, v24.4s, v0.4s\n" + "str q24, [%[out_buf], 304]\n" + "smax v30.4s, v25.4s, v30.4s\n" + "smin v0.4s, v25.4s, v0.4s\n" + "str q25, [%[out_buf], 320]\n" + "smax v30.4s, v26.4s, v30.4s\n" + "smin v0.4s, v26.4s, v0.4s\n" + "str q26, [%[out_buf], 336]\n" + "smax v30.4s, v27.4s, v30.4s\n" + "smin v0.4s, v27.4s, v0.4s\n" + "str q27, [%[out_buf], 352]\n" + "smax v30.4s, v28.4s, v30.4s\n" + 
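// Unknown-scale path (label 7): the raw I32 results are stored while each vector is + // folded into the running v30 = max / v0 = min accumulators, written back to + // max_i32 / min_i32 below so the Q24 requantization factor (roughly 127 * 2^24 / max) + // can be derived once the whole output map has been produced.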
"smin v0.4s, v28.4s, v0.4s\n" + "str q28, [%[out_buf], 368]\n" + + "str q30, [%[max]]\n" + "str q0, [%[min]]\n" + + "5:\n" + : + : [out_0] "r"(out_o0hw0), [out_buf] "r"(out_buf), [in_0] "r"(in_hw0), + [f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_0), + [b_0_s] "r"(b_0_s), [factor] "r"(factor_v), [max] "r"(max_i32), + [min] "r"(min_i32), [conv_relu] "r"(conv_relu_bool), + [out_f16] "r"(out_f16_bool), [scale_known] "r"(scale_known_bool) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", "x1", "x2", + "x3", "x17", "x16"); + b0 += 8; + b0_s += 8; + } + } + + // ohow_reminder % 12 / 8 + I32 ohow_s = (ohow / 12) * 12; + I32 ohow_tail = ohow - ohow_s; + + if (ohow_tail >= 8) { + I32 hw = ohow_s; + F16 *b0 = biasArray; + I32 *b0_s = biasScaled; + INT8 *in_pack = ((INT8 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw8c4 + im2col + U32 in_h[8]; + U32 in_w[8]; + + for (U32 i = 0; i < 8; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + INT8 *in_hw8c8 = inArray_pad + c * ihiw * 8 + + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_hw8c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + INT8 *in_1 = in_hw8c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + INT8 *in_2 = in_hw8c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + INT8 *in_3 = in_hw8c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + INT8 *in_4 = in_hw8c8 + in_h[4] * iw_pad * 8 + in_w[4] * 8; + INT8 *in_5 = in_hw8c8 + in_h[5] * iw_pad * 8 + in_w[5] * 8; + INT8 *in_6 = in_hw8c8 + in_h[6] * iw_pad * 8 + in_w[6] * 8; + INT8 *in_7 = in_hw8c8 + in_h[7] * iw_pad * 8 + in_w[7] * 8; + INT8 *in_pack_0 = + in_pack + c * fh * fw * 8 * 8 + fh_idx * fw * 8 * 4 + fw_idx * 8 * 4; + INT8 *in_pack_1 = in_pack_0 + fh * fw * 8 * 4; + + __asm__ __volatile__("ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins v4.d[1], x6\n" + "ins v5.d[1], x7\n" + + "str q20, [%[pack_0]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "str q21, [%[pack_1]]\n" + "str q24, [%[pack_0], #16]\n" + "str q25, [%[pack_1], #16]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), + [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), + [in_3] "r"(in_3), [in_4] "r"(in_4), [in_5] "r"(in_5), + [in_6] "r"(in_6), [in_7] "r"(in_7) + : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", + "v24", "v25", "x2", "x3", "x6", "x7"); + } + } + } + + // compute + for (U32 o = 0; o < oc; o++) { + INT8 *in_hw0 = in_pack; + INT8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + I32 *out_buf = biasScaled + oc * 8 + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + F16 *b_0 = b0; + I32 *b_0_s = b0_s; + __asm__ __volatile__( + "cbz %[out_f16], 8f\n" + "eor v5.16b, v5.16b, v5.16b\n" + "ldr q1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "eor v7.16b, v7.16b, v7.16b\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr q0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" + 
"eor v10.16b, v10.16b, v10.16b\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + "eor v13.16b, v13.16b, v13.16b\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + "b 7f\n" + + "8:\n" + "ldp q29, q30, [%[b_0_s]]\n" + "ldr q1, [%[in_0]]\n" // in_0 + "ldr q0, [%[f_0]]\n" // f_0 + "mov v5.16b, v29.16b\n" + "mov v7.16b, v29.16b\n" + "mov v9.16b, v29.16b\n" + "mov v11.16b, v29.16b\n" + "mov v13.16b, v29.16b\n" + "mov v15.16b, v29.16b\n" + "mov v17.16b, v29.16b\n" + "mov v19.16b, v29.16b\n" + + "mov v6.16b, v30.16b\n" + "mov v8.16b, v30.16b\n" + "mov v10.16b, v30.16b\n" + "mov v12.16b, v30.16b\n" + "mov v14.16b, v30.16b\n" + "mov v16.16b, v30.16b\n" + "mov v18.16b, v30.16b\n" + "mov v20.16b, v30.16b\n" + + "7:\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "ldr q3, [x3, 16]!\n" + "ldr q29, [x0, 16]\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + + "ldr q0, [x0, 32]!\n" + "subs x2, x2, #4\n" + "ldr q1, [x3, 16]!\n" + "sdot v14.4s, v29.16b, v3.4b[0]\n" + "sdot v16.4s, v29.16b, v3.4b[1]\n" + "sdot v18.4s, v29.16b, v3.4b[2]\n" + "sdot v20.4s, v29.16b, v3.4b[3]\n" + + "bne 0b\n" + "cbz %[out_f16], 6f\n" + "scvtf v5.4s, v5.4s\n" + "scvtf v6.4s, v6.4s\n" + "ldr q1, [%[factor]]\n" + "scvtf v7.4s, v7.4s\n" + "scvtf v8.4s, v8.4s\n" + "scvtf v9.4s, v9.4s\n" + "ldr q0, [%[b_0]]\n" + "scvtf v10.4s, v10.4s\n" + "scvtf v11.4s, v11.4s\n" + "scvtf v12.4s, v12.4s\n" + "scvtf v13.4s, v13.4s\n" + "scvtf v14.4s, v14.4s\n" + "scvtf v15.4s, v15.4s\n" + "scvtf v16.4s, v16.4s\n" + "scvtf v17.4s, v17.4s\n" + "scvtf v18.4s, v18.4s\n" + "scvtf v19.4s, v19.4s\n" + "scvtf v20.4s, v20.4s\n" + + "fmul v5.4s, v1.4s, v5.4s\n" + "fmul v6.4s, v1.4s, v6.4s\n" + "fmul v7.4s, v1.4s, v7.4s\n" + "fmul v8.4s, v1.4s, v8.4s\n" + "fmul v9.4s, v1.4s, v9.4s\n" + "fmul v10.4s, v1.4s, v10.4s\n" + "fmul v11.4s, v1.4s, v11.4s\n" + "fmul v12.4s, v1.4s, v12.4s\n" + "fmul v13.4s, v1.4s, v13.4s\n" + "fmul v14.4s, v1.4s, v14.4s\n" + "fmul v15.4s, v1.4s, v15.4s\n" + "fmul v16.4s, v1.4s, v16.4s\n" + "fmul v17.4s, v1.4s, v17.4s\n" + "fmul v18.4s, v1.4s, v18.4s\n" + "fmul v19.4s, v1.4s, v19.4s\n" + "fmul v20.4s, v1.4s, v20.4s\n" + + "fcvtn v5.4h, v5.4s\n" + "fcvtn v7.4h, v7.4s\n" + "fcvtn v9.4h, v9.4s\n" + "fcvtn v11.4h, v11.4s\n" + "fcvtn v13.4h, v13.4s\n" + "fcvtn v15.4h, v15.4s\n" + "fcvtn v17.4h, v17.4s\n" + "fcvtn v19.4h, v19.4s\n" + + "fcvtn2 v5.8h, v6.4s\n" + "fcvtn2 v7.8h, v8.4s\n" + "fcvtn2 v9.8h, v10.4s\n" + "fcvtn2 v11.8h, v12.4s\n" + "fcvtn2 v13.8h, v14.4s\n" + "fcvtn2 v15.8h, v16.4s\n" + "fcvtn2 v17.8h, v18.4s\n" + "fcvtn2 v19.8h, v20.4s\n" + + "fadd v5.8h, v0.8h, v5.8h\n" + "fadd v7.8h, v0.8h, v7.8h\n" + "fadd v9.8h, v0.8h, v9.8h\n" + "fadd v11.8h, v0.8h, v11.8h\n" + "fadd v13.8h, v0.8h, v13.8h\n" + "fadd v15.8h, v0.8h, v15.8h\n" + "fadd v17.8h, v0.8h, v17.8h\n" + "fadd v19.8h, v0.8h, v19.8h\n" + + "cbz %[conv_relu], 1f\n" + "eor 
v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + "fmax v19.8h, v19.8h, v1.8h\n" + + "1:\n" + "str q5, [%[out_0]]\n" + "str q7, [%[out_0], #16]\n" + "str q9, [%[out_0], #32]\n" + "str q11, [%[out_0], #48]\n" + "str q13, [%[out_0], #64]\n" + "str q15, [%[out_0], #80]\n" + "str q17, [%[out_0], #96]\n" + "str q19, [%[out_0], #112]\n" + "b 5f\n" + + "6:\n" + "ldr q0, [%[min]]\n" + "ldr q30, [%[max]]\n" + "cbz %[conv_relu], 2f\n" + "eor v1.16b, v1.16b, v1.16b\n" // zero + "smax v5.4s, v5.4s, v1.4s\n" + "smax v6.4s, v6.4s, v1.4s\n" + "smax v7.4s, v7.4s, v1.4s\n" + "smax v8.4s, v8.4s, v1.4s\n" + "smax v9.4s, v9.4s, v1.4s\n" + "smax v10.4s, v10.4s, v1.4s\n" + "smax v11.4s, v11.4s, v1.4s\n" + "smax v12.4s, v12.4s, v1.4s\n" + "smax v13.4s, v13.4s, v1.4s\n" + "smax v14.4s, v14.4s, v1.4s\n" + "smax v15.4s, v15.4s, v1.4s\n" + "smax v16.4s, v16.4s, v1.4s\n" + "smax v17.4s, v17.4s, v1.4s\n" + "smax v18.4s, v18.4s, v1.4s\n" + "smax v19.4s, v19.4s, v1.4s\n" + "smax v20.4s, v20.4s, v1.4s\n" + + "2:\n" + "cbz %[scale_known], 7f\n" + "smax v5.4s, v5.4s, v0.4s\n" + "smin v5.4s, v5.4s, v30.4s\n" + "smax v6.4s, v6.4s, v0.4s\n" + "smin v6.4s, v6.4s, v30.4s\n" + "smax v7.4s, v7.4s, v0.4s\n" + "smin v7.4s, v7.4s, v30.4s\n" + "smax v8.4s, v8.4s, v0.4s\n" + "smin v8.4s, v8.4s, v30.4s\n" + "smax v9.4s, v9.4s, v0.4s\n" + "smin v9.4s, v9.4s, v30.4s\n" + "smax v10.4s, v10.4s, v0.4s\n" + "smin v10.4s, v10.4s, v30.4s\n" + "smax v11.4s, v11.4s, v0.4s\n" + "smin v11.4s, v11.4s, v30.4s\n" + "smax v12.4s, v12.4s, v0.4s\n" + "smin v12.4s, v12.4s, v30.4s\n" + "smax v13.4s, v13.4s, v0.4s\n" + "smin v13.4s, v13.4s, v30.4s\n" + "smax v14.4s, v14.4s, v0.4s\n" + "smin v14.4s, v14.4s, v30.4s\n" + "smax v15.4s, v15.4s, v0.4s\n" + "smin v15.4s, v15.4s, v30.4s\n" + "smax v16.4s, v16.4s, v0.4s\n" + "smin v16.4s, v16.4s, v30.4s\n" + "smax v17.4s, v17.4s, v0.4s\n" + "smin v17.4s, v17.4s, v30.4s\n" + "smax v18.4s, v18.4s, v0.4s\n" + "smin v18.4s, v18.4s, v30.4s\n" + "smax v19.4s, v19.4s, v0.4s\n" + "smin v19.4s, v19.4s, v30.4s\n" + "smax v20.4s, v20.4s, v0.4s\n" + "smin v20.4s, v20.4s, v30.4s\n" + + "str q5, [%[out_buf]]\n" + "str q6, [%[out_buf], 16]\n" + "str q7, [%[out_buf], 32]\n" + "str q8, [%[out_buf], 48]\n" + "str q9, [%[out_buf], 64]\n" + "str q10, [%[out_buf], 80]\n" + "str q11, [%[out_buf], 96]\n" + "str q12, [%[out_buf], 112]\n" + "str q13, [%[out_buf], 128]\n" + "str q14, [%[out_buf], 144]\n" + "str q15, [%[out_buf], 160]\n" + "str q16, [%[out_buf], 176]\n" + "str q17, [%[out_buf], 192]\n" + "str q18, [%[out_buf], 208]\n" + "str q19, [%[out_buf], 224]\n" + "str q20, [%[out_buf], 240]\n" + "b 5f\n" + + "7:\n" + "smax v30.4s, v5.4s, v30.4s\n" + "smin v0.4s, v5.4s, v0.4s\n" + "str q5, [%[out_buf]]\n" + "smax v30.4s, v6.4s, v30.4s\n" + "smin v0.4s, v6.4s, v0.4s\n" + "str q6, [%[out_buf], 16]\n" + "smax v30.4s, v7.4s, v30.4s\n" + "smin v0.4s, v7.4s, v0.4s\n" + "str q7, [%[out_buf], 32]\n" + "smax v30.4s, v8.4s, v30.4s\n" + "smin v0.4s, v8.4s, v0.4s\n" + "str q8, [%[out_buf], 48]\n" + "smax v30.4s, v9.4s, v30.4s\n" + "smin v0.4s, v9.4s, v0.4s\n" + "str q9, [%[out_buf], 64]\n" + "smax v30.4s, v10.4s, v30.4s\n" + "smin v0.4s, v10.4s, v0.4s\n" + "str q10, [%[out_buf], 80]\n" + "smax v30.4s, v11.4s, v30.4s\n" + "smin v0.4s, v11.4s, v0.4s\n" + "str q11, [%[out_buf], 96]\n" + "smax v30.4s, v12.4s, v30.4s\n" + "smin v0.4s, v12.4s, 
v0.4s\n" + "str q12, [%[out_buf], 112]\n" + "smax v30.4s, v13.4s, v30.4s\n" + "smin v0.4s, v13.4s, v0.4s\n" + "str q13, [%[out_buf], 128]\n" + + "smax v30.4s, v14.4s, v30.4s\n" + "smin v0.4s, v14.4s, v0.4s\n" + "str q14, [%[out_buf], 144]\n" + "smax v30.4s, v15.4s, v30.4s\n" + "smin v0.4s, v15.4s, v0.4s\n" + "str q15, [%[out_buf], 160]\n" + "smax v30.4s, v16.4s, v30.4s\n" + "smin v0.4s, v16.4s, v0.4s\n" + "str q16, [%[out_buf], 176]\n" + "smax v30.4s, v17.4s, v30.4s\n" + "smin v0.4s, v17.4s, v0.4s\n" + "str q17, [%[out_buf], 192]\n" + "smax v30.4s, v18.4s, v30.4s\n" + "smin v0.4s, v18.4s, v0.4s\n" + "str q18, [%[out_buf], 208]\n" + "smax v30.4s, v19.4s, v30.4s\n" + "smin v0.4s, v19.4s, v0.4s\n" + "str q19, [%[out_buf], 224]\n" + "smax v30.4s, v20.4s, v30.4s\n" + "smin v0.4s, v20.4s, v0.4s\n" + "str q20, [%[out_buf], 240]\n" + + "str q30, [%[max]]\n" + "str q0, [%[min]]\n" + "5:\n" + : + : [out_0] "r"(out_o0hw0), [out_buf] "r"(out_buf), [in_0] "r"(in_hw0), + [f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_0), + [b_0_s] "r"(b_0_s), [factor] "r"(factor_v), [max] "r"(max_i32), + [min] "r"(min_i32), [conv_relu] "r"(conv_relu_bool), + [out_f16] "r"(out_f16_bool), [scale_known] "r"(scale_known_bool) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v29", "v30", + "x0", "x1", "x2", "x3", "x17", "x16"); + b0 += 8; + b0_s += 8; + } + ohow_s += 8; + ohow_tail -= 8; + } + + if (ohow_tail >= 4) { + I32 hw = ohow_s; + F16 *b0 = biasArray; + I32 *b0_s = biasScaled; + INT8 *in_pack = ((INT8 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw4c4 + im2col + U32 in_h[4]; + U32 in_w[4]; + + for (U32 i = 0; i < 4; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + INT8 *in_hw4c8 = inArray_pad + c * ihiw * 8 + + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_hw4c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + INT8 *in_1 = in_hw4c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + INT8 *in_2 = in_hw4c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + INT8 *in_3 = in_hw4c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + INT8 *in_pack_0 = + in_pack + c * fh * fw * 4 * 8 + fh_idx * fw * 4 * 4 + fw_idx * 4 * 4; + INT8 *in_pack_1 = in_pack_0 + fh * fw * 4 * 4; + + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + "str q20, [%[pack_0]]\n" + "str q21, [%[pack_1]]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3) + : "memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3"); + } + } + } + + // compute + for (U32 o = 0; o < oc; o++) { + INT8 *in_hw0 = in_pack; + INT8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + I32 *out_buf = biasScaled + oc * 8 + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + F16 *b_0 = b0; + I32 *b_0_s = b0_s; + __asm__ __volatile__( + "cbz %[out_f16], 8f\n" + "eor v5.16b, v5.16b, v5.16b\n" + "ldr q1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "eor v7.16b, v7.16b, v7.16b\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr q0, [%[f_0]]\n" // f_0 + + "eor v9.16b, v9.16b, 
v9.16b\n" + "eor v10.16b, v10.16b, v10.16b\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + "b 7f\n" + + "8:\n" + "ldp q29, q30, [%[b_0_s]]\n" + "ldr q1, [%[in_0]]\n" // in_0 + "ldr q0, [%[f_0]]\n" // f_0 + "mov v5.16b, v29.16b\n" + "mov v7.16b, v29.16b\n" + "mov v9.16b, v29.16b\n" + "mov v11.16b, v29.16b\n" + + "mov v6.16b, v30.16b\n" + "mov v8.16b, v30.16b\n" + "mov v10.16b, v30.16b\n" + "mov v12.16b, v30.16b\n" + + "7:\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "ldr q29, [x0, 16]\n" + "ldr q3, [x3, 16]!\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "subs x2, x2, #4\n" + "ldr q0, [x0, 32]!\n" + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + "mov v1.16b, v3.16b\n" + + "bne 0b\n" + "cbz %[out_f16], 6f\n" + + "scvtf v5.4s, v5.4s\n" + "scvtf v6.4s, v6.4s\n" + "ldr q1, [%[factor]]\n" + "scvtf v7.4s, v7.4s\n" + "scvtf v8.4s, v8.4s\n" + "scvtf v9.4s, v9.4s\n" + "ldr q0, [%[b_0]]\n" + "scvtf v10.4s, v10.4s\n" + "scvtf v11.4s, v11.4s\n" + "scvtf v12.4s, v12.4s\n" + + "fmul v5.4s, v1.4s, v5.4s\n" + "fmul v6.4s, v1.4s, v6.4s\n" + "fmul v7.4s, v1.4s, v7.4s\n" + "fmul v8.4s, v1.4s, v8.4s\n" + "fmul v9.4s, v1.4s, v9.4s\n" + "fmul v10.4s, v1.4s, v10.4s\n" + "fmul v11.4s, v1.4s, v11.4s\n" + "fmul v12.4s, v1.4s, v12.4s\n" + + "fcvtn v5.4h, v5.4s\n" + "fcvtn v7.4h, v7.4s\n" + "fcvtn v9.4h, v9.4s\n" + "fcvtn v11.4h, v11.4s\n" + + "fcvtn2 v5.8h, v6.4s\n" + "fcvtn2 v7.8h, v8.4s\n" + "fcvtn2 v9.8h, v10.4s\n" + "fcvtn2 v11.8h, v12.4s\n" + + "fadd v5.8h, v0.8h, v5.8h\n" + "fadd v7.8h, v0.8h, v7.8h\n" + "fadd v9.8h, v0.8h, v9.8h\n" + "fadd v11.8h, v0.8h, v11.8h\n" + + "cbz %[conv_relu], 1f\n" + "eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + + "1:\n" + "str q5, [%[out_0]]\n" + "str q7, [%[out_0], #16]\n" + "str q9, [%[out_0], #32]\n" + "str q11, [%[out_0], #48]\n" + "b 5f\n" + + "6:\n" + "ldr q0, [%[min]]\n" + "ldr q30, [%[max]]\n" + "cbz %[conv_relu], 2f\n" + "eor v1.16b, v1.16b, v1.16b\n" // zero + "smax v5.4s, v5.4s, v1.4s\n" + "smax v6.4s, v6.4s, v1.4s\n" + "smax v7.4s, v7.4s, v1.4s\n" + "smax v8.4s, v8.4s, v1.4s\n" + "smax v9.4s, v9.4s, v1.4s\n" + "smax v10.4s, v10.4s, v1.4s\n" + "smax v11.4s, v11.4s, v1.4s\n" + "smax v12.4s, v12.4s, v1.4s\n" + + "2:\n" + "cbz %[scale_known], 7f\n" + "smax v5.4s, v5.4s, v0.4s\n" + "smin v5.4s, v5.4s, v30.4s\n" + "smax v6.4s, v6.4s, v0.4s\n" + "smin v6.4s, v6.4s, v30.4s\n" + "smax v7.4s, v7.4s, v0.4s\n" + "smin v7.4s, v7.4s, v30.4s\n" + "smax v8.4s, v8.4s, v0.4s\n" + "smin v8.4s, v8.4s, v30.4s\n" + "smax v9.4s, v9.4s, v0.4s\n" + "smin v9.4s, v9.4s, v30.4s\n" + "smax v10.4s, v10.4s, v0.4s\n" + "smin v10.4s, v10.4s, v30.4s\n" + "smax v11.4s, v11.4s, v0.4s\n" + "smin v11.4s, v11.4s, v30.4s\n" + "smax v12.4s, v12.4s, v0.4s\n" + "smin v12.4s, v12.4s, v30.4s\n" + + "str q5, [%[out_buf]]\n" + "str q6, [%[out_buf], 16]\n" + "str q7, [%[out_buf], 32]\n" + "str q8, [%[out_buf], 48]\n" + "str q9, [%[out_buf], 64]\n" + "str q10, [%[out_buf], 80]\n" + "str q11, [%[out_buf], 96]\n" + "str q12, [%[out_buf], 112]\n" + "b 5f\n" + + "7:\n" + "smax v30.4s, v5.4s, v30.4s\n" + "smin v0.4s, v5.4s, v0.4s\n" + "str q5, [%[out_buf]]\n" + "smax v30.4s, 
v6.4s, v30.4s\n" + "smin v0.4s, v6.4s, v0.4s\n" + "str q6, [%[out_buf], 16]\n" + "smax v30.4s, v7.4s, v30.4s\n" + "smin v0.4s, v7.4s, v0.4s\n" + "str q7, [%[out_buf], 32]\n" + "smax v30.4s, v8.4s, v30.4s\n" + "smin v0.4s, v8.4s, v0.4s\n" + "str q8, [%[out_buf], 48]\n" + "smax v30.4s, v9.4s, v30.4s\n" + "smin v0.4s, v9.4s, v0.4s\n" + "str q9, [%[out_buf], 64]\n" + "smax v30.4s, v10.4s, v30.4s\n" + "smin v0.4s, v10.4s, v0.4s\n" + "str q10, [%[out_buf], 80]\n" + "smax v30.4s, v11.4s, v30.4s\n" + "smin v0.4s, v11.4s, v0.4s\n" + "str q11, [%[out_buf], 96]\n" + "smax v30.4s, v12.4s, v30.4s\n" + "smin v0.4s, v12.4s, v0.4s\n" + "str q12, [%[out_buf], 112]\n" + + "str q30, [%[max]]\n" + "str q0, [%[min]]\n" + "5:\n" + : + : [out_0] "r"(out_o0hw0), [out_buf] "r"(out_buf), [in_0] "r"(in_hw0), + [f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_0), + [b_0_s] "r"(b_0_s), [factor] "r"(factor_v), [max] "r"(max_i32), + [min] "r"(min_i32), [conv_relu] "r"(conv_relu_bool), + [out_f16] "r"(out_f16_bool), [scale_known] "r"(scale_known_bool) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v29", "x0", "x1", "x2", "x3", "x17", "x16"); + b0 += 8; + b0_s += 8; + } + ohow_s += 4; + } + + for (I32 hw = ohow_s; hw < ohow; hw++) { + F16 *b0 = biasArray; + I32 *b0_s = biasScaled; + INT8 *in_pack = ((INT8 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw1c4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + INT8 *in_hw1c8 = inArray_pad + c * ihiw * 8 + + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_hw1c8 + in_h_0 * iw_pad * 8 + in_w_0 * 8; + INT8 *in_pack_0 = in_pack + c * fh * fw * 8 + fh_idx * fw * 4 + fw_idx * 4; + INT8 *in_pack_1 = in_pack_0 + fh * fw * 4; + + memcpy(in_pack_0, in_0, 4 * bytesOf(DT_I8)); + memcpy(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); + } + } + } + + // compute + for (U32 o = 0; o < oc; o++) { + INT8 *in_hw = in_pack; + INT8 *f_o = filterArray + o * 8 * fh * fw * ic * 8; + I32 *out_buf = biasScaled + oc * 8 + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + + int32x4_t res[2] = {0}; + if (out_f16_bool == 0) { + res[0] = vld1q_s32(b0_s); + res[1] = vld1q_s32(b0_s + 4); + } + + for (U32 c = 0; c < ic * fh * fw; c++) { + int8x8_t in_2 = vld1_s8(in_hw); + in_hw += 8; + int8x16_t f_8o[4]; + f_8o[0] = vld1q_s8(f_o); + f_8o[1] = vld1q_s8(f_o + 16); + res[0] = vdotq_lane_s32(res[0], f_8o[0], in_2, 0); + res[1] = vdotq_lane_s32(res[1], f_8o[1], in_2, 0); + + f_8o[2] = vld1q_s8(f_o + 32); + f_8o[3] = vld1q_s8(f_o + 48); + f_o += 64; + res[0] = vdotq_lane_s32(res[0], f_8o[2], in_2, 1); + res[1] = vdotq_lane_s32(res[1], f_8o[3], in_2, 1); + } + if (out_f16_bool == 1) { + float32x4_t fac = vld1q_f32(factor_v); + float32x4_t resf0 = vcvtq_f32_s32(res[0]); + float32x4_t resf1 = vcvtq_f32_s32(res[1]); + resf0 = vmulq_f32(resf0, fac); + resf1 = vmulq_f32(resf1, fac); + + float16x4_t bias0 = vld1_f16(b0); + float16x4_t bias1 = vld1_f16(b0 + 4); + float16x4_t resh0 = vcvt_f16_f32(resf0); + float16x4_t resh1 = vcvt_f16_f32(resf1); + resh0 = vadd_f16(resh0, bias0); + resh1 = vadd_f16(resh1, bias1); + + if (conv_relu_bool) { + float16x4_t z = vdup_n_f16(0); + resh0 = vmax_f16(resh0, z); + resh1 = vmax_f16(resh1, z); + } + vst1_f16(out_o0hw0, resh0); + vst1_f16(out_o0hw0 + 4, 
resh1); + } else { + int32x4_t max = vld1q_s32(max_i32); + int32x4_t min = vld1q_s32(min_i32); + if (conv_relu_bool) { + int32x4_t z = vdupq_n_s32(0); + res[0] = vmaxq_s32(res[0], z); + res[1] = vmaxq_s32(res[1], z); + } + if (1 == scale_known_bool) { + res[0] = vmaxq_s32(min, res[0]); + res[1] = vmaxq_s32(min, res[1]); + res[0] = vminq_s32(max, res[0]); + res[1] = vminq_s32(max, res[1]); + } else { + max = vmaxq_s32(max, res[0]); + min = vminq_s32(min, res[0]); + max = vmaxq_s32(max, res[1]); + min = vminq_s32(min, res[1]); + vst1q_s32(max_i32, max); + vst1q_s32(min_i32, min); + } + vst1q_s32(out_buf, res[0]); + vst1q_s32(out_buf + 4, res[1]); + } + + b0 += 8; + b0_s += 8; + } + } + } + + EE ret = SUCCESS; + if (out_f16_bool == 0) { + I32 factor; + F32 scale_o; + + if (1 == scale_known_bool) { + scale_o = (*outputScale / *inputScale) / *filterScale; + factor = 127 * 16777216 / max_i32[0]; + } else { + I32 max = max_i32[0]; + I32 min = min_i32[0]; + for (U32 i = 1; i < 4; i++) { + if (max < max_i32[i]) { + max = max_i32[i]; + } + if (min > min_i32[i]) { + min = min_i32[i]; + } + } + + if (max == 0 && min == 0) { + return NOT_SUPPORTED; + } + + if (max > 0 && min < 0) { + I32 factor_max = 127 * 16777216 / max; + I32 factor_min = -127 * 16777216 / min; + factor = (factor_max < factor_min) ? factor_max : factor_min; + scale_o = (factor_max < factor_min) ? (127.0 / max) : (-127.0 / min); + } else if (max > 0) { + factor = 127 * 16777216 / max; + scale_o = 127.0 / max; + } else { + factor = -127 * 16777216 / min; + scale_o = -127.0 / min; + } + *outputScale = (*inputScale) * (*filterScale) * scale_o; + } + + U32 num_v = oc * ohow * 2; // Number of q-form vectors + I32 *out_buf = biasScaled + oc * 8; + INT8 *out_q = (INT8 *)output; + + ret = quantize_I32(num_v, out_buf, factor, scale_o, out_q); + } + return ret; +} + +// Explicit instantiations for the two data types this kernel supports. +template EE convolution_gemm_A76<F16>(TensorDesc inputDesc, + const void *input, + F16 *inputScale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec activationDesc); + +template EE convolution_gemm_A76<INT8>(TensorDesc inputDesc, + const void *input, + F16 *inputScale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec activationDesc); +#endif diff --git a/compute/tensor/src/cpu/arm/int8/convolution_transform.cpp b/compute/tensor/src/cpu/arm/int8/convolution_transform.cpp new file mode 100644 index 00000000..7bdeb659 --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/convolution_transform.cpp @@ -0,0 +1,193 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_INT8 +#include "cpu/arm/int8/tensor_computing_int8.h" +#include "cpu/arm/fp16/convolution_winograd_transform.h" + +#include <string.h> // memcpy +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "tensor_computing.h" + +inline EE convolution_transform_filter_kernel_int8(TensorDesc filterDesc, + const void *filter, + TensorDesc *ftmDesc, + void *ftm, + DataFormat ftmDataFormat) +{ + if (nullptr == filter || nullptr == ftmDesc || nullptr == ftm) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + if (fdf == ftmDataFormat) { + *ftmDesc = filterDesc; + memcpy(ftm, filter, fn * fc * fh * fw * bytesOf(fdt)); + return SUCCESS; + } + if (fdf != DF_NCHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + EE ret = SUCCESS; + switch (ftmDataFormat) { + case DF_NCHWN8C4: { + INT8 *filterArray = (INT8 *)filter; + INT8 *ftmArray = (INT8 *)ftm; + U32 oc = fn / 8; + U32 fc_quad = fc / 4; + for (U32 o = 0; o < oc; o++) { + for (U32 c = 0; c < fc_quad; c++) { + for (U32 hw = 0; hw < fh * fw; hw++) { + for (U32 o8 = 0; o8 < 8; o8++) { + for (U32 c4 = 0; c4 < 4; c4++) { + ftmArray[o * fh * fw * fc * 8 + c * fh * fw * 32 + hw * 32 + + o8 * 4 + c4] = filterArray[(o * 8 + o8) * fc * fh * fw + + (c * 4 + c4) * fh * fw + hw]; + } + } + } + } + } + break; + } + case DF_HWNCN8C4: { + F16 *filterArray = (F16 *)filter; + F16 *ftmArray = (F16 *)ftm; + for (U32 o = 0; o < fn / 8; o++) { + for (U32 c = 0; c < fc / 4; c++) { + // Each time deal with N2C4; 4 times we have N8C4 + U32 f_off_0 = (o * 8) * fc * fh * fw + c * 4 * fh * fw; + U32 f_off_1 = (o * 8 + 2) * fc * fh * fw + c * 4 * fh * fw; + U32 f_off_2 = (o * 8 + 4) * fc * fh * fw + c * 4 * fh * fw; + U32 f_off_3 = (o * 8 + 6) * fc * fh * fw + c * 4 * fh * fw; + + U32 ftm_off_0 = o * 36 * fc * 8 + c * 32; + U32 ftm_off_1 = o * 36 * fc * 8 + c * 32 + 8; + U32 ftm_off_2 = o * 36 * fc * 8 + c * 32 + 16; + U32 ftm_off_3 = o * 36 * fc * 8 + c * 32 + 24; + + F16 F[9][8]; // N2C4 at a time + F16 *F_ptr[9]; + F16 *Fw[36]; + + for (U32 hw = 0; hw < 9; hw++) { + for (U32 oo = 0; oo < 2; oo++) { + for (U32 cc = 0; cc < 4; cc++) { + F[hw][oo * 4 + cc] = + filterArray[f_off_0 + hw + oo * fc * fh * fw + cc * fh * fw]; + } + } + F_ptr[hw] = F[hw]; + } + for (U32 hw = 0; hw < 36; hw++) { + Fw[hw] = ftmArray + ftm_off_0 + hw * fc * 8; // Each hw fills N8*fc + } + trans_W_4x4_3x3(Fw, F_ptr); + + for (U32 hw = 0; hw < 9; hw++) { + for (U32 oo = 0; oo < 2; oo++) { + for (U32 cc = 0; cc < 4; cc++) { + F[hw][oo * 4 + cc] = + filterArray[f_off_1 + hw + oo * fc * fh * fw + cc * fh * fw]; + } + } + F_ptr[hw] = F[hw]; + } + for (U32 hw = 0; hw < 36; hw++) { + Fw[hw] = ftmArray + ftm_off_1 + hw * fc * 8; // Each hw fills N8*fc + } + trans_W_4x4_3x3(Fw, F_ptr); + + for (U32 hw = 0; hw < 9; hw++) { + for (U32 oo = 0; oo < 2; oo++) { + for (U32 cc = 0; cc < 4; cc++) { + F[hw][oo * 4 + cc] = + filterArray[f_off_2 + hw + oo * fc * fh * fw + cc * fh * fw]; + } + } + F_ptr[hw] = F[hw]; + } + for (U32 hw = 0; hw
< 36; hw++) { + Fw[hw] = ftmArray + ftm_off_2 + hw * fc * 8; // Each hw fills N8*fc + } + trans_W_4x4_3x3(Fw, F_ptr); + + for (U32 hw = 0; hw < 9; hw++) { + for (U32 oo = 0; oo < 2; oo++) { + for (U32 cc = 0; cc < 4; cc++) { + F[hw][oo * 4 + cc] = + filterArray[f_off_3 + hw + oo * fc * fh * fw + cc * fh * fw]; + } + } + F_ptr[hw] = F[hw]; + } + for (U32 hw = 0; hw < 36; hw++) { + Fw[hw] = ftmArray + ftm_off_3 + hw * fc * 8; // Each hw fills N8*fc + } + trans_W_4x4_3x3(Fw, F_ptr); + } + } + fdt = DT_F16; + fh = 6; + fw = 6; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); + return ret; +} + +EE convolution_transform_filter_int8(TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed) +{ + DataFormat ftmDataFormat; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_WINOGRAD: + ftmDataFormat = DF_HWNCN8C4; + break; + case CONVOLUTION_ALGORITHM_GEMM: + ftmDataFormat = DF_NCHWN8C4; + break; + default: + return NOT_MATCH; + } + U32 channelAxis = filterDesc.nDims - 1; + TensorDesc tmpFilterDesc = filterDesc; + tmpFilterDesc.dims[channelAxis] /= convParamSpec.group; + U32 fnPadding = tmpFilterDesc.dims[channelAxis]; + if (fnPadding % 8 != 0) { + fnPadding = (fnPadding / 8 + 1) * 8; + } + U32 originalTileSize = tensorNumElements(tmpFilterDesc); + for (U32 g = 0; g < convParamSpec.group; g++) { + CHECK_STATUS(convolution_transform_filter_kernel_int8( + tmpFilterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat)); + U32 newTileSize = tensorNumElements(*ftmDesc) / tmpFilterDesc.dims[channelAxis] * fnPadding; + filter = (const U8 *)filter + originalTileSize * bytesOf(filterDesc.dt); + filterTransformed = (U8 *)filterTransformed + newTileSize * bytesOf(ftmDesc->dt); + } + ftmDesc->dims[channelAxis] = filterDesc.dims[channelAxis]; + return SUCCESS; +} +#endif diff --git a/compute/tensor/src/cpu/arm/int8/convolution_winograd.h b/compute/tensor/src/cpu/arm/int8/convolution_winograd.h new file mode 100644 index 00000000..8e764994 --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/convolution_winograd.h @@ -0,0 +1,181 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
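+ +// INT8 Winograd (F(4x4, 3x3)) convolution: declarations of the A55/A76 tuned +// kernels, an arch-based dispatcher, and helpers that give each of the 36 +// Winograd transform slices its own symmetric int8 scale. As a worked example +// of the scale selection below: a slice with max = 3.5 and min = -2.0 gets +// scale = min(127 / 3.5, 127 / 2.0) = 36.29, and each value x is then stored +// as round_towards_zero(x * 36.29).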
+ +#ifndef _H_CONVOLUTION_WINOGRAD +#define _H_CONVOLUTION_WINOGRAD + +#ifdef _USE_INT8 +#include "sys.h" +#include "types.h" + +template <typename OT> +EE convolution_winograd_A55(TensorDesc inputDesc, + const void *input, + F16 *input_scale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am); + +template <typename OT> +EE convolution_winograd_A76(TensorDesc inputDesc, + const void *input, + F16 *input_scale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am); + +inline EE convolution_winograd(TensorDesc inputDesc, + const void *input, + F16 *input_scale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am, + Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: + ret = convolution_winograd_A55(inputDesc, input, input_scale, filterDesc, filter, + filterScale, convParamSpec, biasDesc, bias, tmpBytes, tmp, outputDesc, output, + outputScale, am); + break; + case ARM_A76: + ret = convolution_winograd_A76(inputDesc, input, input_scale, filterDesc, filter, + filterScale, convParamSpec, biasDesc, bias, tmpBytes, tmp, outputDesc, output, + outputScale, am); + break; + default: + return NOT_SUPPORTED; + } + return ret; +} + +inline void apply_scale_f16(U32 numData, F16 *array, F16 scale, INT8 *qArray, bool clamp = true) +{ + for (U32 i = 0; i < numData; i++) { + F32 tmp = array[i] * scale; + qArray[i] = round_towards_zero(tmp, clamp); + } +} + +inline void quantize_wino_input(F16 *itmArray, U32 len_per_36, INT8 *inQ, F32 *inputScale) +{ + U32 numData = len_per_36; + F32 scale; + + for (U32 idx = 0; idx < 36; idx++) { + F16 *in = itmArray + idx * numData; + float16x8_t temp_v = vld1q_f16(in); + float16x8_t max_v = temp_v; + float16x8_t min_v = temp_v; + + for (U32 i = 8; i < numData; i += 8) { + temp_v = vld1q_f16(in + i); + max_v = vmaxq_f16(max_v, temp_v); + min_v = vminq_f16(min_v, temp_v); + } + + F16 max = vmaxvq_f16(max_v); + F16 min = vminvq_f16(min_v); + + if (max == 0 && min == 0) { + inputScale[idx] = 0.0; // We can skip this dotprod later + continue; + } + if (max > 0 && min < 0) { + F32 scale_max = 127.0 / max; + F32 scale_min = -127.0 / min; + scale = (scale_max < scale_min) ?
scale_max : scale_min; + } else if (max < 0) { + scale = -127.0 / min; + } else { // min > 0 + scale = 127.0 / max; + } + + INT8 *base = inQ + idx * numData; + apply_scale_f16(numData, in, scale, base); + inputScale[idx] = scale; + } +} + +inline void quantize_wino_input_s16( + short *itmArray, U32 len_per_36, INT8 *inQ, F32 *inputScale, F16 input_scale) +{ + U32 numData = len_per_36; + F32 scale; + + for (U32 idx = 0; idx < 36; idx++) { + short *in = itmArray + idx * numData; + int16x8_t temp_v = vld1q_s16(in); + int16x8_t max_v = temp_v; + int16x8_t min_v = temp_v; + + for (U32 i = 8; i < numData; i += 8) { + temp_v = vld1q_s16(in + i); + max_v = vmaxq_s16(max_v, temp_v); + min_v = vminq_s16(min_v, temp_v); + } + + short max = vmaxvq_s16(max_v); + short min = vminvq_s16(min_v); + + if (max == 0 && min == 0) { + inputScale[idx] = 0.0; // We can skip this dotprod later + continue; + } + if (max > 0 && min < 0) { + F32 scaleMax = 127.0 / max; + F32 scaleMin = -127.0 / min; + scale = (scaleMax < scaleMin) ? scaleMax : scaleMin; + } else if (max < 0) { + scale = -127.0 / min; + } else { // min > 0 + scale = 127.0 / max; + } + + INT8 *base = inQ + idx * numData; + for (U32 i = 0; i < numData; i++) { + base[i] = round_towards_zero(scale * in[i], false); + } + inputScale[idx] = input_scale * scale; + } +} +#endif +#endif diff --git a/compute/tensor/src/cpu/arm/int8/convolution_winograd_A55.cpp b/compute/tensor/src/cpu/arm/int8/convolution_winograd_A55.cpp new file mode 100644 index 00000000..5f577370 --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/convolution_winograd_A55.cpp @@ -0,0 +1,1487 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_INT8 +#include "cpu/arm/int8/convolution_winograd_transform.h" +#include "cpu/arm/int8/convolution_winograd.h" + +template <typename OT> +EE convolution_winograd_A55(TensorDesc inputDesc, + const void *input, + F16 *input_scale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + // not truly one_step: compute hw12*(6*6)*ic at a time.
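+ // The blocks below all follow one pattern: transform each output tile's + // 6x6 input patch into 36 slices, quantize every slice with its own scale + // via quantize_wino_input / quantize_wino_input_s16, pack to hwXc4, run + // one int8 GEMM per slice, dequantize with factor_v[idx] = + // 1 / (inputScale[idx] * filterScale[idx]), and finish with the inverse + // transform trans_O. Tiles are consumed 12 at a time, with 8- and 4-wide + // tail paths further down.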
+ DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if (fdf != DF_HWNCN8C4) { + return NOT_MATCH; + } + if (!(fh == 6 && fw == 6)) { + return NOT_MATCH; + } + + // Assume IT is the same as OT + OT *inArray = (OT *)input; + INT8 *filterArray = (INT8 *)filter; + F16 *outArray = (F16 *)output; + F16 *biasArray = (F16 *)bias; + + // both input and output are stored with C8 + oc /= 8; + ic /= 8; + + U32 tile_h = (oh + 3) / 4; + U32 tile_w = (ow + 3) / 4; + I32 tiles = tile_h * tile_w; // num of 6x6 tiles + U32 pad_left = paddingL; + U32 pad_right = paddingR + (tile_w * 4 - ow); + U32 pad_w_mod_4 = tile_w * 4 - ow; + U32 pad_top = paddingT; + U32 pad_bottom = paddingB + (tile_h * 4 - oh); + U32 pad_h_mod_4 = tile_h * 4 - oh; + U32 ih_pad = ih + pad_top + pad_bottom; + U32 iw_pad = iw + pad_left + pad_right; + + U32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + + // tmp = in_pad + itm + otm + inQ + ... + // in_pad: ic*ih_pad*iw_pad*8 + // itm: 6*6*ic*12*8 (int16 or fp16) + // otm: 6*6*12*8 (F16) + // inQ: 6*6*ic*12*8 (int8) + OT *inArray_pad = (OT *)tmp; + short *itmArray = (short *)(inArray_pad + ic * ihiw * 8); // will be cast to fp16 for fp16 inputs + F16 *otmArray = (F16 *)(itmArray + 6 * 6 * ic * 12 * 8); + INT8 *inQ = (INT8 *)(otmArray + 6 * 6 * 12 * 8); + if (DT_I8 == odt) { + outArray = (F16 *)(inQ + 6 * 6 * ic * 12 * 8); // After otmArray and pack + } + + // To track the range of the final outputs and prepare for quantization + F16 max[8] = {0}; + F16 min[8] = {0}; + + for (U32 n = 0; n < in; n++) { // for each batch + OT *inArray_pad_mov = inArray_pad; + OT *inArray_mov = inArray + n * ic * ih * iw * 8; + for (U32 c = 0; c < ic; c++) { + memset(inArray_pad_mov, 0, pad_top * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_top * iw_pad * 8; + for (U32 h = pad_top; h < ih_pad - pad_bottom; h++) { + memset(inArray_pad_mov, 0, pad_left * 8 * bytesOf(idt)); + inArray_pad_mov += pad_left * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, pad_right * 8 * bytesOf(idt)); + inArray_pad_mov += pad_right * 8; + } + memset(inArray_pad_mov, 0, pad_bottom * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_bottom * iw_pad * 8; + } + + // tiles / 12 + for (I32 hw = 0; hw < tiles - 11; hw += 12) { + // in trans + // NCHWc8 => (6*6)*(C/4)*hw12*c4 + // transform hw1c8 at a time, so we need 12 times to cover hw12c8 + // pack into hw12c4 after quantizing (reuse the space of itmArray) + for (U32 c = 0; c < ic; c++) { + OT *inArray_pad_mov = inArray_pad + c * ihiw * 8; + short *Iw_ptr[36]; + short *Iw0[36]; + OT *I0[36]; + short *Iw1[36]; + OT *I1[36]; + short *Iw2[36]; + OT *I2[36]; + short *Iw3[36]; + OT *I3[36]; + short *Iw4[36]; + OT *I4[36]; + short *Iw5[36]; + OT *I5[36]; + short *Iw6[36]; + OT *I6[36]; + short *Iw7[36]; + OT *I7[36]; + short *Iw8[36]; + OT *I8[36]; + short *Iw9[36]; + OT *I9[36]; + short *Iw10[36]; + OT *I10[36]; + short *Iw11[36]; + OT *I11[36]; + + // Store transformed hw12c8 to itmArray + for (U32 i = 0; i < 36; 
i++) { + Iw0[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12; + Iw1[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 1 * 8; + Iw2[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 2 * 8; + Iw3[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 3 * 8; + Iw4[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 4 * 8; + Iw5[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 5 * 8; + Iw6[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 6 * 8; + Iw7[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 7 * 8; + Iw8[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 8 * 8; + Iw9[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 9 * 8; + Iw10[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 10 * 8; + Iw11[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 11 * 8; + } + + U32 h0 = (hw / tile_w) * 4; // stride is 4 + U32 w0 = (hw % tile_w) * 4; + U32 h1 = ((hw + 1) / tile_w) * 4; + U32 w1 = ((hw + 1) % tile_w) * 4; + U32 h2 = ((hw + 2) / tile_w) * 4; + U32 w2 = ((hw + 2) % tile_w) * 4; + U32 h3 = ((hw + 3) / tile_w) * 4; + U32 w3 = ((hw + 3) % tile_w) * 4; + U32 h4 = ((hw + 4) / tile_w) * 4; + U32 w4 = ((hw + 4) % tile_w) * 4; + U32 h5 = ((hw + 5) / tile_w) * 4; + U32 w5 = ((hw + 5) % tile_w) * 4; + U32 h6 = ((hw + 6) / tile_w) * 4; + U32 w6 = ((hw + 6) % tile_w) * 4; + U32 h7 = ((hw + 7) / tile_w) * 4; + U32 w7 = ((hw + 7) % tile_w) * 4; + U32 h8 = ((hw + 8) / tile_w) * 4; + U32 w8 = ((hw + 8) % tile_w) * 4; + U32 h9 = ((hw + 9) / tile_w) * 4; + U32 w9 = ((hw + 9) % tile_w) * 4; + U32 h10 = ((hw + 10) / tile_w) * 4; + U32 w10 = ((hw + 10) % tile_w) * 4; + U32 h11 = ((hw + 11) / tile_w) * 4; + U32 w11 = ((hw + 11) % tile_w) * 4; + + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + I1[i * 6 + j] = inArray_pad_mov + (h1 + i) * iw_pad * 8 + (w1 + j) * 8; + I2[i * 6 + j] = inArray_pad_mov + (h2 + i) * iw_pad * 8 + (w2 + j) * 8; + I3[i * 6 + j] = inArray_pad_mov + (h3 + i) * iw_pad * 8 + (w3 + j) * 8; + I4[i * 6 + j] = inArray_pad_mov + (h4 + i) * iw_pad * 8 + (w4 + j) * 8; + I5[i * 6 + j] = inArray_pad_mov + (h5 + i) * iw_pad * 8 + (w5 + j) * 8; + I6[i * 6 + j] = inArray_pad_mov + (h6 + i) * iw_pad * 8 + (w6 + j) * 8; + I7[i * 6 + j] = inArray_pad_mov + (h7 + i) * iw_pad * 8 + (w7 + j) * 8; + I8[i * 6 + j] = inArray_pad_mov + (h8 + i) * iw_pad * 8 + (w8 + j) * 8; + I9[i * 6 + j] = inArray_pad_mov + (h9 + i) * iw_pad * 8 + (w9 + j) * 8; + I10[i * 6 + j] = inArray_pad_mov + (h10 + i) * iw_pad * 8 + (w10 + j) * 8; + I11[i * 6 + j] = inArray_pad_mov + (h11 + i) * iw_pad * 8 + (w11 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I0); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I0); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw1[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I1); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I1); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw2[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I2); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I2); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw3[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I3); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I3); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw4[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I4); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I4); + } + for (U32 i = 0; i < 
36; i++) { + Iw_ptr[i] = Iw5[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I5); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I5); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw6[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I6); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I6); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw7[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I7); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I7); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw8[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I8); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I8); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw9[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I9); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I9); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw10[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I10); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I10); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw11[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I11); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I11); + } + } + + F32 inputScale[36]; + + if (DT_I8 == idt) { + quantize_wino_input_s16(itmArray, 12 * ic * 8, inQ, inputScale, *input_scale); + } else { + quantize_wino_input((F16 *)itmArray, 12 * ic * 8, inQ, inputScale); + } + + F32 factor_v[36][4]; + for (U32 i = 0; i < 36; i++) { + if (inputScale[i] == 0) { + factor_v[i][0] = 0; + continue; + } else { + factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; + } + + factor_v[i][1] = factor_v[i][0]; + factor_v[i][2] = factor_v[i][0]; + factor_v[i][3] = factor_v[i][0]; + } + + INT8 *in_pack = (INT8 *)itmArray; // Reuse the space + + for (U32 idx = 0; idx < 36; idx++) { + if (factor_v[idx][0] == 0) { // input pixels are all 0 + continue; + } + for (U32 c = 0; c < ic; c++) { // for each 8 channels + INT8 *in_hw12c8 = inQ + idx * 12 * ic * 8 + c * 12 * 8; + + INT8 *in_0 = in_hw12c8; + INT8 *in_1 = in_hw12c8 + 1 * 8; + INT8 *in_2 = in_hw12c8 + 2 * 8; + INT8 *in_3 = in_hw12c8 + 3 * 8; + INT8 *in_4 = in_hw12c8 + 4 * 8; + INT8 *in_5 = in_hw12c8 + 5 * 8; + INT8 *in_6 = in_hw12c8 + 6 * 8; + INT8 *in_7 = in_hw12c8 + 7 * 8; + INT8 *in_8 = in_hw12c8 + 8 * 8; + INT8 *in_9 = in_hw12c8 + 9 * 8; + INT8 *in_10 = in_hw12c8 + 10 * 8; + INT8 *in_11 = in_hw12c8 + 11 * 8; + + // NHWChw12c4 + INT8 *in_pack_0 = in_pack + idx * 12 * ic * 8 + c * 12 * 8; + INT8 *in_pack_1 = in_pack_0 + 12 * 4; + + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins v4.d[1], x6\n" + "ins v5.d[1], x7\n" + "ldr d8, [%[in_8]]\n" + "ldr x10, [%[in_10]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "ldr d9, [%[in_9]]\n" + "ldr x11, [%[in_11]]\n" + "ins v8.d[1], x10\n" + "ins v9.d[1], x11\n" + + "str q20, [%[pack_0]]\n" + "trn1 v28.4s, v8.4s, v9.4s\n" + "trn2 v29.4s, v8.4s, v9.4s\n" + "str q24, [%[pack_0], #16]\n" + "str q28, [%[pack_0], #32]\n" + "str q21, [%[pack_1]]\n" + "str q25, [%[pack_1], #16]\n" + "str q29, [%[pack_1], #32]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), 
[in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), + [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7), [in_8] "r"(in_8), + [in_9] "r"(in_9), [in_10] "r"(in_10), [in_11] "r"(in_11) + : "memory", "cc", "v0", "v1", "v4", "v5", "v8", "v9", "v20", "v21", "v24", + "v25", "v28", "v29", "x2", "x3", "x6", "x7", "x10", "x11"); + } + } + + // compute + for (U32 o = 0; o < oc; o++) { // 8 output channels at a time + // bias + F16 *b_0 = biasArray + o * 8; + for (U32 idx = 0; idx < 36; idx++) { + INT8 *in_hw0 = in_pack + idx * 12 * ic * 8; + INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F16 *out_o0hw0 = otmArray + idx * 12 * 8; + if (factor_v[idx][0] == 0) { // input pixels are all 0 + memset(out_o0hw0, 0, 12 * 8 * sizeof(OT)); + continue; + } + F32 *fac = factor_v[idx]; + __asm__ __volatile__("eor v5.16b, v5.16b, v5.16b\n" + "ldr d1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "ldr x1, [%[in_0], #8]\n" + "eor v7.16b, v7.16b, v7.16b\n" + "ins v1.d[1], x1\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" + "ldr x2, [%[f_0], #8]\n" + "eor v10.16b, v10.16b, v10.16b\n" + "ins v0.d[1], x2\n" + "eor v11.16b, v11.16b, v11.16b\n" + "ldr d3, [%[in_0], #16]\n" // in_1 + "eor v12.16b, v12.16b, v12.16b\n" + "ldr x3, [%[in_0], #24]\n" + "eor v13.16b, v13.16b, v13.16b\n" + "ins v3.d[1], x3\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d2, [x3, 32]\n" + "ldr x16, [x3, 40]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ldr d29, [x0, 16]\n" + "ldr x17, [x0, 24]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "ins v2.d[1], x16\n" + "ldr d30, [x3, 48]!\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + "ins v29.d[1], x17\n" + + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "ldr x16, [x3, 8]\n" + "subs x2, x2, #4\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "ins v30.d[1], x16\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + + "sdot v21.4s, v0.16b, v2.4b[0]\n" + "sdot v23.4s, v0.16b, v2.4b[1]\n" + "sdot v25.4s, v0.16b, v2.4b[2]\n" + "sdot v27.4s, v0.16b, v2.4b[3]\n" + + "sdot v14.4s, v29.16b, v3.4b[0]\n" + "sdot v16.4s, v29.16b, v3.4b[1]\n" + "ldr d0, [x0, 32]!\n" + "ldr x17, [x0, 8]\n" + "sdot v18.4s, v29.16b, v3.4b[2]\n" + "sdot v20.4s, v29.16b, v3.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "ldr d3, [x3, 16]\n" + "ldr x16, [x3, 24]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + + "ins v0.d[1], x17\n" + "ins v3.d[1], x16\n" + + "sdot v22.4s, v29.16b, v2.4b[0]\n" + "mov v1.16b, v30.16b\n" + "sdot v24.4s, v29.16b, v2.4b[1]\n" + "sdot v26.4s, v29.16b, v2.4b[2]\n" + "sdot v28.4s, v29.16b, v2.4b[3]\n" + + "bne 0b\n" + "scvtf v5.4s, v5.4s\n" + "scvtf v6.4s, v6.4s\n" + "ldr d1, [%[factor]]\n" + "ldr x1, [%[factor], #8]\n" + "scvtf v7.4s, v7.4s\n" + "scvtf v8.4s, 
v8.4s\n" + "ins v1.d[1], x1\n" + "scvtf v9.4s, v9.4s\n" + "scvtf v10.4s, v10.4s\n" + "scvtf v11.4s, v11.4s\n" + "scvtf v12.4s, v12.4s\n" + "scvtf v13.4s, v13.4s\n" + "scvtf v14.4s, v14.4s\n" + "scvtf v15.4s, v15.4s\n" + "scvtf v16.4s, v16.4s\n" + "scvtf v17.4s, v17.4s\n" + "scvtf v18.4s, v18.4s\n" + "scvtf v19.4s, v19.4s\n" + "scvtf v20.4s, v20.4s\n" + "scvtf v21.4s, v21.4s\n" + "scvtf v22.4s, v22.4s\n" + "scvtf v23.4s, v23.4s\n" + "scvtf v24.4s, v24.4s\n" + "scvtf v25.4s, v25.4s\n" + "scvtf v26.4s, v26.4s\n" + "scvtf v27.4s, v27.4s\n" + "scvtf v28.4s, v28.4s\n" + + "fmul v5.4s, v1.4s, v5.4s\n" + "fmul v6.4s, v1.4s, v6.4s\n" + "fmul v7.4s, v1.4s, v7.4s\n" + "fmul v8.4s, v1.4s, v8.4s\n" + "fmul v9.4s, v1.4s, v9.4s\n" + "fmul v10.4s, v1.4s, v10.4s\n" + "fmul v11.4s, v1.4s, v11.4s\n" + "fmul v12.4s, v1.4s, v12.4s\n" + "fmul v13.4s, v1.4s, v13.4s\n" + "fmul v14.4s, v1.4s, v14.4s\n" + "fmul v15.4s, v1.4s, v15.4s\n" + "fmul v16.4s, v1.4s, v16.4s\n" + "fmul v17.4s, v1.4s, v17.4s\n" + "fmul v18.4s, v1.4s, v18.4s\n" + "fmul v19.4s, v1.4s, v19.4s\n" + "fmul v20.4s, v1.4s, v20.4s\n" + "fmul v21.4s, v1.4s, v21.4s\n" + "fmul v22.4s, v1.4s, v22.4s\n" + "fmul v23.4s, v1.4s, v23.4s\n" + "fmul v24.4s, v1.4s, v24.4s\n" + "fmul v25.4s, v1.4s, v25.4s\n" + "fmul v26.4s, v1.4s, v26.4s\n" + "fmul v27.4s, v1.4s, v27.4s\n" + "fmul v28.4s, v1.4s, v28.4s\n" + + "fcvtn v5.4h, v5.4s\n" + "fcvtn v7.4h, v7.4s\n" + "fcvtn v9.4h, v9.4s\n" + "fcvtn v11.4h, v11.4s\n" + "fcvtn v13.4h, v13.4s\n" + "fcvtn v15.4h, v15.4s\n" + "fcvtn v17.4h, v17.4s\n" + "fcvtn v19.4h, v19.4s\n" + "fcvtn v21.4h, v21.4s\n" + "fcvtn v23.4h, v23.4s\n" + "fcvtn v25.4h, v25.4s\n" + "fcvtn v27.4h, v27.4s\n" + + "fcvtn2 v5.8h, v6.4s\n" + "fcvtn2 v7.8h, v8.4s\n" + "fcvtn2 v9.8h, v10.4s\n" + "fcvtn2 v11.8h, v12.4s\n" + "fcvtn2 v13.8h, v14.4s\n" + "fcvtn2 v15.8h, v16.4s\n" + "fcvtn2 v17.8h, v18.4s\n" + "fcvtn2 v19.8h, v20.4s\n" + "fcvtn2 v21.8h, v22.4s\n" + "fcvtn2 v23.8h, v24.4s\n" + "fcvtn2 v25.8h, v26.4s\n" + "fcvtn2 v27.8h, v28.4s\n" + + "str q5, [%[out_0]]\n" + "str q7, [%[out_0], #16]\n" + "str q9, [%[out_0], #32]\n" + "str q11, [%[out_0], #48]\n" + "str q13, [%[out_0], #64]\n" + "str q15, [%[out_0], #80]\n" + "str q17, [%[out_0], #96]\n" + "str q19, [%[out_0], #112]\n" + "str q21, [%[out_0], #128]\n" + "str q23, [%[out_0], #144]\n" + "str q25, [%[out_0], #160]\n" + "str q27, [%[out_0], #176]\n" + : + : [out_0] "r"(out_o0hw0), [in_0] "r"(in_hw0), + [f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8), [factor] "r"(fac) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", + "x1", "x2", "x3", "x17", "x16"); + } + // out trans + // (6*6)*hw12*o8 => NOHWo8 + for (U32 hw12 = 0; hw12 < 12; hw12++) { + U32 h = (hw + hw12) / tile_w; + U32 w = (hw + hw12) % tile_w; + F16 *out_0 = + outArray + n * oc * ohow * 8 + o * ohow * 8 + h * 4 * ow * 8 + w * 4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 12 * 8 + hw12 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h - 1, tile_w - 1, + max, min, am); + } + } + } + + // tiles_reminder % 12 / 8 + I32 tiles_s = (tiles / 12) * 12; + I32 tiles_tail = tiles - tiles_s; + + if (tiles_tail >= 8) { + I32 hw = tiles_s; + // in trans + // NCHWc8 => 
(6*6)*(C/4)*hw8*c4 + // transform hw1c8 at a time, so we need 8 times to cover hw8c8 + // pack into hw8c4 after quantizing (reuse the space of itmArray) + for (U32 c = 0; c < ic; c++) { + OT *inArray_pad_mov = inArray_pad + c * ihiw * 8; + short *Iw_ptr[36]; + short *Iw0[36]; + OT *I0[36]; + short *Iw1[36]; + OT *I1[36]; + short *Iw2[36]; + OT *I2[36]; + short *Iw3[36]; + OT *I3[36]; + short *Iw4[36]; + OT *I4[36]; + short *Iw5[36]; + OT *I5[36]; + short *Iw6[36]; + OT *I6[36]; + short *Iw7[36]; + OT *I7[36]; + + // Store transformed hw12c8 to itmArray + for (U32 i = 0; i < 36; i++) { + Iw0[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8; + Iw1[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 1 * 8; + Iw2[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 2 * 8; + Iw3[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 3 * 8; + Iw4[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 4 * 8; + Iw5[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 5 * 8; + Iw6[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 6 * 8; + Iw7[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 7 * 8; + } + + U32 h0 = (hw / tile_w) * 4; // stride is 4 + U32 w0 = (hw % tile_w) * 4; + U32 h1 = ((hw + 1) / tile_w) * 4; + U32 w1 = ((hw + 1) % tile_w) * 4; + U32 h2 = ((hw + 2) / tile_w) * 4; + U32 w2 = ((hw + 2) % tile_w) * 4; + U32 h3 = ((hw + 3) / tile_w) * 4; + U32 w3 = ((hw + 3) % tile_w) * 4; + U32 h4 = ((hw + 4) / tile_w) * 4; + U32 w4 = ((hw + 4) % tile_w) * 4; + U32 h5 = ((hw + 5) / tile_w) * 4; + U32 w5 = ((hw + 5) % tile_w) * 4; + U32 h6 = ((hw + 6) / tile_w) * 4; + U32 w6 = ((hw + 6) % tile_w) * 4; + U32 h7 = ((hw + 7) / tile_w) * 4; + U32 w7 = ((hw + 7) % tile_w) * 4; + + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + I1[i * 6 + j] = inArray_pad_mov + (h1 + i) * iw_pad * 8 + (w1 + j) * 8; + I2[i * 6 + j] = inArray_pad_mov + (h2 + i) * iw_pad * 8 + (w2 + j) * 8; + I3[i * 6 + j] = inArray_pad_mov + (h3 + i) * iw_pad * 8 + (w3 + j) * 8; + I4[i * 6 + j] = inArray_pad_mov + (h4 + i) * iw_pad * 8 + (w4 + j) * 8; + I5[i * 6 + j] = inArray_pad_mov + (h5 + i) * iw_pad * 8 + (w5 + j) * 8; + I6[i * 6 + j] = inArray_pad_mov + (h6 + i) * iw_pad * 8 + (w6 + j) * 8; + I7[i * 6 + j] = inArray_pad_mov + (h7 + i) * iw_pad * 8 + (w7 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I0); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I0); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw1[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I1); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I1); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw2[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I2); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I2); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw3[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I3); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I3); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw4[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I4); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I4); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw5[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I5); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I5); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw6[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, 
(INT8 *const *)I6); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I6); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw7[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I7); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I7); + } + } + + F32 inputScale[36]; + + if (idt == DT_I8) { + quantize_wino_input_s16(itmArray, 8 * ic * 8, inQ, inputScale, *input_scale); + } else { + quantize_wino_input((F16 *)itmArray, 8 * ic * 8, inQ, inputScale); + } + + F32 factor_v[36][4]; + for (U32 i = 0; i < 36; i++) { + if (inputScale[i] == 0) { + factor_v[i][0] = 0; + continue; + } else { + factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; + } + factor_v[i][1] = factor_v[i][0]; + factor_v[i][2] = factor_v[i][0]; + factor_v[i][3] = factor_v[i][0]; + } + + INT8 *in_pack = (INT8 *)itmArray; // Reuse the space + + for (U32 idx = 0; idx < 36; idx++) { + if (factor_v[idx][0] == 0) { // input pixels are all 0 + continue; + } + for (U32 c = 0; c < ic; c++) { // for each 8 channels + INT8 *in_hw8c8 = inQ + idx * 8 * ic * 8 + c * 8 * 8; + + INT8 *in_0 = in_hw8c8; + INT8 *in_1 = in_hw8c8 + 1 * 8; + INT8 *in_2 = in_hw8c8 + 2 * 8; + INT8 *in_3 = in_hw8c8 + 3 * 8; + INT8 *in_4 = in_hw8c8 + 4 * 8; + INT8 *in_5 = in_hw8c8 + 5 * 8; + INT8 *in_6 = in_hw8c8 + 6 * 8; + INT8 *in_7 = in_hw8c8 + 7 * 8; + + // NHWChw8c4 + INT8 *in_pack_0 = in_pack + idx * 8 * ic * 8 + c * 8 * 8; + INT8 *in_pack_1 = in_pack_0 + 8 * 4; + + __asm__ __volatile__("ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins v4.d[1], x6\n" + "ins v5.d[1], x7\n" + + "str q20, [%[pack_0]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "str q21, [%[pack_1]]\n" + "str q24, [%[pack_0], #16]\n" + "str q25, [%[pack_1], #16]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), + [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), + [in_3] "r"(in_3), [in_4] "r"(in_4), [in_5] "r"(in_5), + [in_6] "r"(in_6), [in_7] "r"(in_7) + : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", + "v24", "v25", "x2", "x3", "x6", "x7"); + } + } + + // compute + for (U32 o = 0; o < oc; o++) { // 8 output channels at a time + // bias + F16 *b_0 = biasArray + o * 8; + for (U32 idx = 0; idx < 36; idx++) { + INT8 *in_hw0 = in_pack + idx * 8 * ic * 8; + INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F16 *out_o0hw0 = otmArray + idx * 8 * 8; + if (factor_v[idx][0] == 0) { // input pixels are all 0 + memset(out_o0hw0, 0, 8 * 8 * sizeof(OT)); + continue; + } + F32 *fac = factor_v[idx]; + __asm__ __volatile__( + // Bias should be applied after transform + "eor v5.16b, v5.16b, v5.16b\n" + "ldr d1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "ldr x1, [%[in_0], #8]\n" + "eor v7.16b, v7.16b, v7.16b\n" + "ins v1.d[1], x1\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" + "ldr x2, [%[f_0], #8]\n" + "eor v10.16b, v10.16b, v10.16b\n" + "ins v0.d[1], x2\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + "eor v13.16b, v13.16b, v13.16b\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor 
v20.16b, v20.16b, v20.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d3, [x3, 16]!\n" + "ldr x16, [x3, 8]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ldr d29, [x0, 16]\n" + "ldr x17, [x0, 24]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "ins v3.d[1], x16\n" + "ldr d30, [x3, 16]!\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + "ins v29.d[1], x17\n" + + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "ldr x16, [x3, 8]\n" + "subs x2, x2, #4\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "ins v30.d[1], x16\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "ldr d0, [x0, 32]!\n" + "ldr x17, [x0, 8]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + + "sdot v14.4s, v29.16b, v3.4b[0]\n" + "ins v0.d[1], x17\n" + "mov v1.16b, v30.16b\n" + "sdot v16.4s, v29.16b, v3.4b[1]\n" + "sdot v18.4s, v29.16b, v3.4b[2]\n" + "sdot v20.4s, v29.16b, v3.4b[3]\n" + + "bne 0b\n" + "scvtf v5.4s, v5.4s\n" + "scvtf v6.4s, v6.4s\n" + "ldr d1, [%[factor]]\n" + "ldr x1, [%[factor], #8]\n" + "scvtf v7.4s, v7.4s\n" + "scvtf v8.4s, v8.4s\n" + "ins v1.d[1], x1\n" + "scvtf v9.4s, v9.4s\n" + "scvtf v10.4s, v10.4s\n" + "scvtf v11.4s, v11.4s\n" + "scvtf v12.4s, v12.4s\n" + "scvtf v13.4s, v13.4s\n" + "scvtf v14.4s, v14.4s\n" + "scvtf v15.4s, v15.4s\n" + "scvtf v16.4s, v16.4s\n" + "scvtf v17.4s, v17.4s\n" + "scvtf v18.4s, v18.4s\n" + "scvtf v19.4s, v19.4s\n" + "scvtf v20.4s, v20.4s\n" + + "fmul v5.4s, v1.4s, v5.4s\n" + "fmul v6.4s, v1.4s, v6.4s\n" + "fmul v7.4s, v1.4s, v7.4s\n" + "fmul v8.4s, v1.4s, v8.4s\n" + "fmul v9.4s, v1.4s, v9.4s\n" + "fmul v10.4s, v1.4s, v10.4s\n" + "fmul v11.4s, v1.4s, v11.4s\n" + "fmul v12.4s, v1.4s, v12.4s\n" + "fmul v13.4s, v1.4s, v13.4s\n" + "fmul v14.4s, v1.4s, v14.4s\n" + "fmul v15.4s, v1.4s, v15.4s\n" + "fmul v16.4s, v1.4s, v16.4s\n" + "fmul v17.4s, v1.4s, v17.4s\n" + "fmul v18.4s, v1.4s, v18.4s\n" + "fmul v19.4s, v1.4s, v19.4s\n" + "fmul v20.4s, v1.4s, v20.4s\n" + + "fcvtn v5.4h, v5.4s\n" + "fcvtn v7.4h, v7.4s\n" + "fcvtn v9.4h, v9.4s\n" + "fcvtn v11.4h, v11.4s\n" + "fcvtn v13.4h, v13.4s\n" + "fcvtn v15.4h, v15.4s\n" + "fcvtn v17.4h, v17.4s\n" + "fcvtn v19.4h, v19.4s\n" + + "fcvtn2 v5.8h, v6.4s\n" + "fcvtn2 v7.8h, v8.4s\n" + "fcvtn2 v9.8h, v10.4s\n" + "fcvtn2 v11.8h, v12.4s\n" + "fcvtn2 v13.8h, v14.4s\n" + "fcvtn2 v15.8h, v16.4s\n" + "fcvtn2 v17.8h, v18.4s\n" + "fcvtn2 v19.8h, v20.4s\n" + + "str q5, [%[out_0]]\n" + "str q7, [%[out_0], #16]\n" + "str q9, [%[out_0], #32]\n" + "str q11, [%[out_0], #48]\n" + "str q13, [%[out_0], #64]\n" + "str q15, [%[out_0], #80]\n" + "str q17, [%[out_0], #96]\n" + "str q19, [%[out_0], #112]\n" + : + : [out_0] "r"(out_o0hw0), [in_0] "r"(in_hw0), [f_0] "r"(f_o0c0), + [ic] "r"((I64)ic * 8), [factor] "r"(fac) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v29", + "v30", "x0", "x1", "x2", "x3", "x17", "x16"); + } + // out trans + // (6*6)*hw8*o8 => NOHWo8 + for (U32 hw8 = 0; hw8 < 8; hw8++) { + U32 h = (hw + hw8) / tile_w; + U32 w = (hw + hw8) % tile_w; + F16 *out_0 = + outArray + n * oc * ohow * 8 + o * ohow * 8 + h * 4 * ow * 8 + w * 4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 8 * 8 + hw8 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 
4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h - 1, tile_w - 1, + max, min, am); + } + } + tiles_s += 8; + tiles_tail -= 8; + } + + if (tiles_tail >= 4) { + I32 hw = tiles_s; + // in trans + // NCHWc8 => (6*6)*(C/4)*hw4*c4 + // transform hw4c8 at a time, so we need 4 times to cover hw4c8 + // pack into hw4c4 after quantizing (reuse the space of itmArray) + for (U32 c = 0; c < ic; c++) { + OT *inArray_pad_mov = inArray_pad + c * ihiw * 8; + short *Iw_ptr[36]; + short *Iw0[36]; + OT *I0[36]; + short *Iw1[36]; + OT *I1[36]; + short *Iw2[36]; + OT *I2[36]; + short *Iw3[36]; + OT *I3[36]; + + // Store transformed hw4c8 to itmArray + for (U32 i = 0; i < 36; i++) { + Iw0[i] = itmArray + i * 4 * ic * 8 + c * 4 * 8; + Iw1[i] = itmArray + i * 4 * ic * 8 + c * 4 * 8 + 1 * 8; + Iw2[i] = itmArray + i * 4 * ic * 8 + c * 4 * 8 + 2 * 8; + Iw3[i] = itmArray + i * 4 * ic * 8 + c * 4 * 8 + 3 * 8; + } + + U32 h0 = (hw / tile_w) * 4; // stride is 4 + U32 w0 = (hw % tile_w) * 4; + U32 h1 = ((hw + 1) / tile_w) * 4; + U32 w1 = ((hw + 1) % tile_w) * 4; + U32 h2 = ((hw + 2) / tile_w) * 4; + U32 w2 = ((hw + 2) % tile_w) * 4; + U32 h3 = ((hw + 3) / tile_w) * 4; + U32 w3 = ((hw + 3) % tile_w) * 4; + + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + I1[i * 6 + j] = inArray_pad_mov + (h1 + i) * iw_pad * 8 + (w1 + j) * 8; + I2[i * 6 + j] = inArray_pad_mov + (h2 + i) * iw_pad * 8 + (w2 + j) * 8; + I3[i * 6 + j] = inArray_pad_mov + (h3 + i) * iw_pad * 8 + (w3 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I0); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I0); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw1[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I1); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I1); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw2[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I2); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I2); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw3[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I3); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I3); + } + } + + F32 inputScale[36]; + + if (idt == DT_I8) { + quantize_wino_input_s16(itmArray, 4 * ic * 8, inQ, inputScale, *input_scale); + } else { + quantize_wino_input((F16 *)itmArray, 4 * ic * 8, inQ, inputScale); + } + + F32 factor_v[36][4]; + for (U32 i = 0; i < 36; i++) { + if (inputScale[i] == 0) { + factor_v[i][0] = 0; + continue; + } else { + factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; + } + factor_v[i][1] = factor_v[i][0]; + factor_v[i][2] = factor_v[i][0]; + factor_v[i][3] = factor_v[i][0]; + } + + F16 *b0 = biasArray; + INT8 *in_pack = (INT8 *)itmArray; // Reuse the space + + for (U32 idx = 0; idx < 36; idx++) { + if (factor_v[idx][0] == 0) { // input pixels are all 0 + continue; + } + for (U32 c = 0; c < ic; c++) { // for each 8 channels + INT8 *in_hw4c8 = inQ + idx * 4 * ic * 8 + c * 4 * 8; + + INT8 *in_0 = in_hw4c8; + INT8 *in_1 = in_hw4c8 + 1 * 8; + INT8 *in_2 = in_hw4c8 + 2 * 8; + INT8 *in_3 = in_hw4c8 + 3 * 8; + + // NHWChw8c4 + INT8 *in_pack_0 = in_pack + idx * 4 * ic * 8 + c * 4 * 8; + INT8 *in_pack_1 = in_pack_0 + 4 * 4; + + __asm__ __volatile__("ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + 
"ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + "str q20, [%[pack_0]]\n" + "str q21, [%[pack_1]]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), + [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), + [in_3] "r"(in_3) + : "memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3"); + } + } + + // compute + for (U32 o = 0; o < oc; o++) { // 8 output channels at a time + // bias + F16 *b_0 = b0 + o * 8; + for (U32 idx = 0; idx < 36; idx++) { + INT8 *in_hw0 = in_pack + idx * 4 * ic * 8; + INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F16 *out_o0hw0 = otmArray + idx * 4 * 8; + if (factor_v[idx][0] == 0) { + memset(out_o0hw0, 0, 4 * 8 * sizeof(OT)); + continue; + } + F32 *fac = factor_v[idx]; + __asm__ __volatile__("eor v5.16b, v5.16b, v5.16b\n" + "ldr d1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "ldr x1, [%[in_0], #8]\n" + "eor v7.16b, v7.16b, v7.16b\n" + "ins v1.d[1], x1\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + + "eor v9.16b, v9.16b, v9.16b\n" + "ldr x2, [%[f_0], #8]\n" + "eor v10.16b, v10.16b, v10.16b\n" + "ins v0.d[1], x2\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "ldr d29, [x0, 16]\n" + "ldr x17, [x0, 24]\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d3, [x3, 16]!\n" + "ldr x16, [x3, 8]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ins v29.d[1], x17\n" + "subs x2, x2, #4\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "ins v3.d[1], x16\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "ldr d0, [x0, 32]!\n" + "ldr x17, [x0, 8]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "ins v0.d[1], x17\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + "mov v1.16b, v3.16b\n" + + "bne 0b\n" + + "scvtf v5.4s, v5.4s\n" + "scvtf v6.4s, v6.4s\n" + "ldr d1, [%[factor]]\n" + "ldr x1, [%[factor], #8]\n" + "scvtf v7.4s, v7.4s\n" + "scvtf v8.4s, v8.4s\n" + "ins v1.d[1], x1\n" + "scvtf v9.4s, v9.4s\n" + "scvtf v10.4s, v10.4s\n" + "scvtf v11.4s, v11.4s\n" + "scvtf v12.4s, v12.4s\n" + + "fmul v5.4s, v1.4s, v5.4s\n" + "fmul v6.4s, v1.4s, v6.4s\n" + "fmul v7.4s, v1.4s, v7.4s\n" + "fmul v8.4s, v1.4s, v8.4s\n" + "fmul v9.4s, v1.4s, v9.4s\n" + "fmul v10.4s, v1.4s, v10.4s\n" + "fmul v11.4s, v1.4s, v11.4s\n" + "fmul v12.4s, v1.4s, v12.4s\n" + + "fcvtn v5.4h, v5.4s\n" + "fcvtn v7.4h, v7.4s\n" + "fcvtn v9.4h, v9.4s\n" + "fcvtn v11.4h, v11.4s\n" + + "fcvtn2 v5.8h, v6.4s\n" + "fcvtn2 v7.8h, v8.4s\n" + "fcvtn2 v9.8h, v10.4s\n" + "fcvtn2 v11.8h, v12.4s\n" + + "str q5, [%[out_0]]\n" + "str q7, [%[out_0], #16]\n" + "str q9, [%[out_0], #32]\n" + "str q11, [%[out_0], #48]\n" + : + : [out_0] "r"(out_o0hw0), [in_0] "r"(in_hw0), + [f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8), [factor] "r"(fac) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v29", "x0", "x1", "x2", + "x3", "x17", "x16"); + } + // out trans + // (6*6)*hw4*o8 => NOHWo8 + for (U32 hw4 = 0; hw4 < 4; hw4++) { + U32 h = (hw + hw4) / tile_w; + U32 w = (hw + hw4) % tile_w; + F16 *out_0 = + outArray + n * oc * ohow * 8 + o * ohow * 8 + h * 4 * ow * 8 + w * 4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 4 * 8 + hw4 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] 
= out_0 + i * ow * 8 + j * 8; + } + } + trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h - 1, tile_w - 1, + max, min, am); + } + } + tiles_s += 4; + } + + for (I32 hw = tiles_s; hw < tiles; hw++) { + // in trans + // NCHWc8 => (6*6)*(C/4)*hw1*c4 + // transform hw1c8 + // pack into hw1c4 after quantizing (reuse the space of itmArray) + for (U32 c = 0; c < ic; c++) { + OT *inArray_pad_mov = inArray_pad + c * ihiw * 8; + short *Iw_ptr[36]; + short *Iw0[36]; + OT *I0[36]; + + // Store transformed hw12c8 to itmArray + for (U32 i = 0; i < 36; i++) { + Iw0[i] = itmArray + i * ic * 8 + c * 8; + } + + U32 h0 = (hw / tile_w) * 4; // stride is 4 + U32 w0 = (hw % tile_w) * 4; + + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + } + } + + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I0); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I0); + } + } + + F32 inputScale[36]; + + if (idt == DT_I8) { + quantize_wino_input_s16(itmArray, ic * 8, inQ, inputScale, *input_scale); + } else { + quantize_wino_input((F16 *)itmArray, ic * 8, inQ, inputScale); + } + + F32 factor_v[36][4]; + for (U32 i = 0; i < 36; i++) { + if (inputScale[i] == 0) { + factor_v[i][0] = 0; + continue; + } else { + factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; + } + factor_v[i][1] = factor_v[i][0]; + factor_v[i][2] = factor_v[i][0]; + factor_v[i][3] = factor_v[i][0]; + } + + F16 *b0 = biasArray; + INT8 *in_pack = (INT8 *)itmArray; // Reuse the space + + for (U32 idx = 0; idx < 36; idx++) { + if (factor_v[idx][0] == 0) { + continue; + } + for (U32 c = 0; c < ic; c++) { // for each 8 channels + INT8 *in_0 = inQ + idx * ic * 8 + c * 8; + + // NHWChw8c4 + INT8 *in_pack_0 = in_pack + idx * ic * 8 + c * 8; + INT8 *in_pack_1 = in_pack_0 + 4; + + memcpy(in_pack_0, in_0, 4 * bytesOf(DT_I8)); + memcpy(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); + } + } + + // compute + for (U32 o = 0; o < oc; o++) { // 8 output channels at a time + // bias + F16 *b_0 = b0 + o * 8; + for (U32 idx = 0; idx < 36; idx++) { + INT8 *in_hw = in_pack + idx * ic * 8; + INT8 *f_o = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F16 *out_o0hw0 = otmArray + idx * 8; + if (factor_v[idx][0] == 0) { + memset(out_o0hw0, 0, 8 * sizeof(OT)); + continue; + } + int32x4_t res[2] = {0}; + + for (U32 c = 0; c < ic; c++) { + int8x8_t in_2 = vld1_s8(in_hw); + in_hw += 8; + int8x16_t f_8o[4]; + f_8o[0] = vld1q_s8(f_o); + f_8o[1] = vld1q_s8(f_o + 16); + res[0] = vdotq_lane_s32(res[0], f_8o[0], in_2, 0); + res[1] = vdotq_lane_s32(res[1], f_8o[1], in_2, 0); + + f_8o[2] = vld1q_s8(f_o + 32); + f_8o[3] = vld1q_s8(f_o + 48); + f_o += 64; + res[0] = vdotq_lane_s32(res[0], f_8o[2], in_2, 1); + res[1] = vdotq_lane_s32(res[1], f_8o[3], in_2, 1); + } + float32x4_t fac = vld1q_f32(factor_v[idx]); + float32x4_t resf0 = vcvtq_f32_s32(res[0]); + float32x4_t resf1 = vcvtq_f32_s32(res[1]); + resf0 = vmulq_f32(resf0, fac); + resf1 = vmulq_f32(resf1, fac); + + float16x4_t resh0 = vcvt_f16_f32(resf0); + float16x4_t resh1 = vcvt_f16_f32(resf1); + + vst1_f16(out_o0hw0, resh0); + vst1_f16(out_o0hw0 + 4, resh1); + } + // out trans + // (6*6)*hw1*o8 => NOHWo8 + U32 h = hw / tile_w; + U32 w = hw % tile_w; + F16 *out_0 = + outArray + n * oc * ohow * 8 + o * ohow * 8 + h * 4 * ow * 8 + w * 4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 8; + } + for 
(U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h - 1, tile_w - 1, max, + min, am); + } + } + } + + if (DT_I8 == odt) { + F16 max_s = max[0]; + F16 min_s = min[0]; + for (U32 i = 1; i < 8; i++) { + if (max_s < max[i]) { + max_s = max[i]; + } + if (min_s > min[i]) { + min_s = min[i]; + } + } + + if (max_s == 0 && min_s == 0) { + return NOT_SUPPORTED; + } + + F16 scale_o; + if (max_s > 0 && min_s < 0) { + F16 scale_max = 127.0 / max_s; + F16 scale_min = -127.0 / min_s; + scale_o = (scale_max < scale_min) ? scale_max : scale_min; + } else if (max_s > 0) { + scale_o = 127.0 / max_s; + } else { + scale_o = -127.0 / min_s; + } + *outputScale = scale_o; + + apply_scale_f16(on * oc * ohow * 8, outArray, scale_o, (INT8 *)output); + } + return SUCCESS; +} + +template EE convolution_winograd_A55<INT8>(TensorDesc inputDesc, + const void *input, + F16 *input_scale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am); + +template EE convolution_winograd_A55<F16>(TensorDesc inputDesc, + const void *input, + F16 *input_scale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am); +#endif diff --git a/compute/tensor/src/cpu/arm/int8/convolution_winograd_A76.cpp b/compute/tensor/src/cpu/arm/int8/convolution_winograd_A76.cpp new file mode 100644 index 00000000..9d997df0 --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/convolution_winograd_A76.cpp @@ -0,0 +1,1440 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
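For reference: the A55 kernel above and the A76 kernel below both finish by requantizing their FP16 outputs to int8 with a single symmetric scale, chosen so that neither the largest positive nor the most negative tracked output overflows [-127, 127]. A minimal standalone sketch of that selection (illustrative names, plain float in place of F16; a sketch, not part of the diff):

#include <algorithm>

// Mirrors the scale_o selection at the end of both Winograd kernels.
// The kernels return NOT_SUPPORTED earlier when max_s == min_s == 0.
static inline float select_output_scale(float max_s, float min_s)
{
    if (max_s > 0 && min_s < 0) {
        // Both signs present: require max_s * scale <= 127 and
        // min_s * scale >= -127, i.e. take the smaller candidate scale.
        return std::min(127.0f / max_s, -127.0f / min_s);
    } else if (max_s > 0) {
        return 127.0f / max_s;  // outputs are all non-negative
    }
    return -127.0f / min_s;     // outputs are all non-positive
}

// Example: max_s = 6.35, min_s = -2.0 -> min(20.0, 63.5) = 20.0, so
// round(x * 20.0) stays within [-127, 127] for every tracked output.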
+ +#ifdef _USE_INT8 +#include "cpu/arm/int8/convolution_winograd_transform.h" +#include "cpu/arm/int8/convolution_winograd.h" + +template <typename OT> +EE convolution_winograd_A76(TensorDesc inputDesc, + const void *input, + F16 *input_scale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + // not truly one-step. Compute hw12*(6*6)*ic at one time. + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if (fdf != DF_HWNCN8C4) { + return NOT_MATCH; + } + if (!(fh == 6 && fw == 6)) { + return NOT_MATCH; + } + + // Assume IT is the same as OT + OT *inArray = (OT *)input; + INT8 *filterArray = (INT8 *)filter; + F16 *outArray = (F16 *)output; + F16 *biasArray = (F16 *)bias; + + // both input and output are stored with C8 + oc /= 8; + ic /= 8; + + U32 tile_h = (oh + 3) / 4; + U32 tile_w = (ow + 3) / 4; + I32 tiles = tile_h * tile_w; // num of 6x6 tiles + U32 pad_left = paddingL; + U32 pad_right = paddingR + (tile_w * 4 - ow); + U32 pad_w_mod_4 = tile_w * 4 - ow; + U32 pad_top = paddingT; + U32 pad_bottom = paddingB + (tile_h * 4 - oh); + U32 pad_h_mod_4 = tile_h * 4 - oh; + U32 ih_pad = ih + pad_top + pad_bottom; + U32 iw_pad = iw + pad_left + pad_right; + + U32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + + // tmp = in_pad + itm + otm + inQ + ... 
+ // in_pad: ic*ih_pad*iw_pad*8 + // itm: 6*6*ic*12*8 (int16 or fp16) + // otm: 6*6*12*8 (F16) + // inQ: 6*6*ic*12*8 (int8) + OT *inArray_pad = (OT *)tmp; + short *itmArray = (short *)(inArray_pad + ic * ihiw * 8); // will be cast to fp16 for fp16 inputs + F16 *otmArray = (F16 *)(itmArray + 6 * 6 * ic * 12 * 8); + INT8 *inQ = (INT8 *)(otmArray + 6 * 6 * 12 * 8); + if (DT_I8 == odt) { + outArray = (F16 *)(inQ + 6 * 6 * ic * 12 * 8); // After otmArray and pack + } + + // To track the range of the final outputs and prepare for quantization + F16 max[8] = {0}; + F16 min[8] = {0}; + + for (U32 n = 0; n < in; n++) { // for each batch + OT *inArray_pad_mov = inArray_pad; + OT *inArray_mov = inArray + n * ic * ih * iw * 8; + for (U32 c = 0; c < ic; c++) { + memset(inArray_pad_mov, 0, pad_top * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_top * iw_pad * 8; + for (U32 h = pad_top; h < ih_pad - pad_bottom; h++) { + memset(inArray_pad_mov, 0, pad_left * 8 * bytesOf(idt)); + inArray_pad_mov += pad_left * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, pad_right * 8 * bytesOf(idt)); + inArray_pad_mov += pad_right * 8; + } + memset(inArray_pad_mov, 0, pad_bottom * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_bottom * iw_pad * 8; + } + + // tiles / 12 + for (I32 hw = 0; hw < tiles - 11; hw += 12) { + // in trans + // NCHWc8 => (6*6)*(C/4)*hw12*c4 + // transform hw1c8 at a time, so we need 12 times to cover hw12c8 + // pack into hw12c4 after quantizing (reuse the space of itmArray) + for (U32 c = 0; c < ic; c++) { + OT *inArray_pad_mov = inArray_pad + c * ihiw * 8; + short *Iw_ptr[36]; + short *Iw0[36]; + OT *I0[36]; + short *Iw1[36]; + OT *I1[36]; + short *Iw2[36]; + OT *I2[36]; + short *Iw3[36]; + OT *I3[36]; + short *Iw4[36]; + OT *I4[36]; + short *Iw5[36]; + OT *I5[36]; + short *Iw6[36]; + OT *I6[36]; + short *Iw7[36]; + OT *I7[36]; + short *Iw8[36]; + OT *I8[36]; + short *Iw9[36]; + OT *I9[36]; + short *Iw10[36]; + OT *I10[36]; + short *Iw11[36]; + OT *I11[36]; + + // Store transformed hw12c8 to itmArray + for (U32 i = 0; i < 36; i++) { + Iw0[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12; + Iw1[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 1 * 8; + Iw2[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 2 * 8; + Iw3[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 3 * 8; + Iw4[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 4 * 8; + Iw5[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 5 * 8; + Iw6[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 6 * 8; + Iw7[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 7 * 8; + Iw8[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 8 * 8; + Iw9[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 9 * 8; + Iw10[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 10 * 8; + Iw11[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 11 * 8; + } + + U32 h0 = (hw / tile_w) * 4; // stride is 4 + U32 w0 = (hw % tile_w) * 4; + U32 h1 = ((hw + 1) / tile_w) * 4; + U32 w1 = ((hw + 1) % tile_w) * 4; + U32 h2 = ((hw + 2) / tile_w) * 4; + U32 w2 = ((hw + 2) % tile_w) * 4; + U32 h3 = ((hw + 3) / tile_w) * 4; + U32 w3 = ((hw + 3) % tile_w) * 4; + U32 h4 = ((hw + 4) / tile_w) * 4; + U32 w4 = ((hw + 4) % tile_w) * 4; + U32 h5 = ((hw + 5) / tile_w) * 4; + U32 w5 = ((hw + 5) % tile_w) * 4; + U32 h6 = ((hw + 6) / tile_w) * 4; + U32 w6 = ((hw + 6) % tile_w) * 4; + U32 h7 = ((hw + 7) / tile_w) * 4; + U32 w7 = ((hw + 7) % tile_w) * 4; + U32 h8 = ((hw + 8) / tile_w) * 4; + U32 w8 = ((hw + 8) % tile_w) * 4; + 
U32 h9 = ((hw + 9) / tile_w) * 4; + U32 w9 = ((hw + 9) % tile_w) * 4; + U32 h10 = ((hw + 10) / tile_w) * 4; + U32 w10 = ((hw + 10) % tile_w) * 4; + U32 h11 = ((hw + 11) / tile_w) * 4; + U32 w11 = ((hw + 11) % tile_w) * 4; + + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + I1[i * 6 + j] = inArray_pad_mov + (h1 + i) * iw_pad * 8 + (w1 + j) * 8; + I2[i * 6 + j] = inArray_pad_mov + (h2 + i) * iw_pad * 8 + (w2 + j) * 8; + I3[i * 6 + j] = inArray_pad_mov + (h3 + i) * iw_pad * 8 + (w3 + j) * 8; + I4[i * 6 + j] = inArray_pad_mov + (h4 + i) * iw_pad * 8 + (w4 + j) * 8; + I5[i * 6 + j] = inArray_pad_mov + (h5 + i) * iw_pad * 8 + (w5 + j) * 8; + I6[i * 6 + j] = inArray_pad_mov + (h6 + i) * iw_pad * 8 + (w6 + j) * 8; + I7[i * 6 + j] = inArray_pad_mov + (h7 + i) * iw_pad * 8 + (w7 + j) * 8; + I8[i * 6 + j] = inArray_pad_mov + (h8 + i) * iw_pad * 8 + (w8 + j) * 8; + I9[i * 6 + j] = inArray_pad_mov + (h9 + i) * iw_pad * 8 + (w9 + j) * 8; + I10[i * 6 + j] = inArray_pad_mov + (h10 + i) * iw_pad * 8 + (w10 + j) * 8; + I11[i * 6 + j] = inArray_pad_mov + (h11 + i) * iw_pad * 8 + (w11 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I0); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I0); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw1[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I1); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I1); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw2[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I2); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I2); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw3[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I3); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I3); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw4[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I4); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I4); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw5[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I5); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I5); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw6[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I6); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I6); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw7[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I7); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I7); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw8[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I8); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I8); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw9[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I9); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I9); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw10[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I10); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I10); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw11[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I11); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I11); + } + } + + F32 inputScale[36]; + + if (DT_I8 == idt) { + 
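// int8 input: requantize the 16-bit transformed tiles to int8, producing one scale per 6x6 slot in inputScale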
quantize_wino_input_s16(itmArray, 12 * ic * 8, inQ, inputScale, *input_scale); + } else { + quantize_wino_input((F16 *)itmArray, 12 * ic * 8, inQ, inputScale); + } + + F32 factor_v[36][4]; + for (U32 i = 0; i < 36; i++) { + if (inputScale[i] == 0) { + factor_v[i][0] = 0; + continue; + } else { + factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; + } + + factor_v[i][1] = factor_v[i][0]; + factor_v[i][2] = factor_v[i][0]; + factor_v[i][3] = factor_v[i][0]; + } + + INT8 *in_pack = (INT8 *)itmArray; // Reuse the space + + for (U32 idx = 0; idx < 36; idx++) { + if (factor_v[idx][0] == 0) { // input pixels are all 0 + continue; + } + for (U32 c = 0; c < ic; c++) { // for each 8 channels + INT8 *in_hw12c8 = inQ + idx * 12 * ic * 8 + c * 12 * 8; + + INT8 *in_0 = in_hw12c8; + INT8 *in_1 = in_hw12c8 + 1 * 8; + INT8 *in_2 = in_hw12c8 + 2 * 8; + INT8 *in_3 = in_hw12c8 + 3 * 8; + INT8 *in_4 = in_hw12c8 + 4 * 8; + INT8 *in_5 = in_hw12c8 + 5 * 8; + INT8 *in_6 = in_hw12c8 + 6 * 8; + INT8 *in_7 = in_hw12c8 + 7 * 8; + INT8 *in_8 = in_hw12c8 + 8 * 8; + INT8 *in_9 = in_hw12c8 + 9 * 8; + INT8 *in_10 = in_hw12c8 + 10 * 8; + INT8 *in_11 = in_hw12c8 + 11 * 8; + + // NHWChw12c4 + INT8 *in_pack_0 = in_pack + idx * 12 * ic * 8 + c * 12 * 8; + INT8 *in_pack_1 = in_pack_0 + 12 * 4; + + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins v4.d[1], x6\n" + "ins v5.d[1], x7\n" + "ldr d8, [%[in_8]]\n" + "ldr x10, [%[in_10]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "ldr d9, [%[in_9]]\n" + "ldr x11, [%[in_11]]\n" + "ins v8.d[1], x10\n" + "ins v9.d[1], x11\n" + + "str q20, [%[pack_0]]\n" + "trn1 v28.4s, v8.4s, v9.4s\n" + "trn2 v29.4s, v8.4s, v9.4s\n" + "str q24, [%[pack_0], #16]\n" + "str q28, [%[pack_0], #32]\n" + "str q21, [%[pack_1]]\n" + "str q25, [%[pack_1], #16]\n" + "str q29, [%[pack_1], #32]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), + [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7), [in_8] "r"(in_8), + [in_9] "r"(in_9), [in_10] "r"(in_10), [in_11] "r"(in_11) + : "memory", "cc", "v0", "v1", "v4", "v5", "v8", "v9", "v20", "v21", "v24", + "v25", "v28", "v29", "x2", "x3", "x6", "x7", "x10", "x11"); + } + } + + // compute + for (U32 o = 0; o < oc; o++) { // 8 output channels at a time + // bias + F16 *b_0 = biasArray + o * 8; + for (U32 idx = 0; idx < 36; idx++) { + INT8 *in_hw0 = in_pack + idx * 12 * ic * 8; + INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F16 *out_o0hw0 = otmArray + idx * 12 * 8; + if (factor_v[idx][0] == 0) { // input pixels are all 0 + memset(out_o0hw0, 0, 12 * 8 * sizeof(OT)); + continue; + } + F32 *fac = factor_v[idx]; + __asm__ __volatile__("eor v5.16b, v5.16b, v5.16b\n" + "ldr q1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "eor v7.16b, v7.16b, v7.16b\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr q0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" + "eor v10.16b, v10.16b, v10.16b\n" + "eor v11.16b, v11.16b, v11.16b\n" + "ldr q3, [%[in_0], #16]\n" // in_1 + "eor v12.16b, v12.16b, v12.16b\n" + "eor v13.16b, v13.16b, v13.16b\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + + 
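// continue zeroing: v5-v28 hold the int32 accumulators for 12 hw positions x 8 output channels (two q-registers per position)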
"eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ldr q2, [x3, 32]\n" + "ldr q29, [x0, 16]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + + "sdot v21.4s, v0.16b, v2.4b[0]\n" + "sdot v23.4s, v0.16b, v2.4b[1]\n" + "sdot v25.4s, v0.16b, v2.4b[2]\n" + "sdot v27.4s, v0.16b, v2.4b[3]\n" + + "sdot v14.4s, v29.16b, v3.4b[0]\n" + "sdot v16.4s, v29.16b, v3.4b[1]\n" + + "sdot v18.4s, v29.16b, v3.4b[2]\n" + "sdot v20.4s, v29.16b, v3.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "ldr q0, [x0, 32]!\n" + "subs x2, x2, #4\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + + "ldr q1, [x3, 48]!\n" + "ldr q3, [x3, 16]\n" + "sdot v22.4s, v29.16b, v2.4b[0]\n" + "sdot v24.4s, v29.16b, v2.4b[1]\n" + "sdot v26.4s, v29.16b, v2.4b[2]\n" + "sdot v28.4s, v29.16b, v2.4b[3]\n" + + "bne 0b\n" + "scvtf v5.4s, v5.4s\n" + "scvtf v6.4s, v6.4s\n" + "ldr q1, [%[factor]]\n" + "scvtf v7.4s, v7.4s\n" + "scvtf v8.4s, v8.4s\n" + "scvtf v9.4s, v9.4s\n" + "scvtf v10.4s, v10.4s\n" + "scvtf v11.4s, v11.4s\n" + "scvtf v12.4s, v12.4s\n" + "scvtf v13.4s, v13.4s\n" + "scvtf v14.4s, v14.4s\n" + "scvtf v15.4s, v15.4s\n" + "scvtf v16.4s, v16.4s\n" + "scvtf v17.4s, v17.4s\n" + "scvtf v18.4s, v18.4s\n" + "scvtf v19.4s, v19.4s\n" + "scvtf v20.4s, v20.4s\n" + "scvtf v21.4s, v21.4s\n" + "scvtf v22.4s, v22.4s\n" + "scvtf v23.4s, v23.4s\n" + "scvtf v24.4s, v24.4s\n" + "scvtf v25.4s, v25.4s\n" + "scvtf v26.4s, v26.4s\n" + "scvtf v27.4s, v27.4s\n" + "scvtf v28.4s, v28.4s\n" + + "fmul v5.4s, v1.4s, v5.4s\n" + "fmul v6.4s, v1.4s, v6.4s\n" + "fmul v7.4s, v1.4s, v7.4s\n" + "fmul v8.4s, v1.4s, v8.4s\n" + "fmul v9.4s, v1.4s, v9.4s\n" + "fmul v10.4s, v1.4s, v10.4s\n" + "fmul v11.4s, v1.4s, v11.4s\n" + "fmul v12.4s, v1.4s, v12.4s\n" + "fmul v13.4s, v1.4s, v13.4s\n" + "fmul v14.4s, v1.4s, v14.4s\n" + "fmul v15.4s, v1.4s, v15.4s\n" + "fmul v16.4s, v1.4s, v16.4s\n" + "fmul v17.4s, v1.4s, v17.4s\n" + "fmul v18.4s, v1.4s, v18.4s\n" + "fmul v19.4s, v1.4s, v19.4s\n" + "fmul v20.4s, v1.4s, v20.4s\n" + "fmul v21.4s, v1.4s, v21.4s\n" + "fmul v22.4s, v1.4s, v22.4s\n" + "fmul v23.4s, v1.4s, v23.4s\n" + "fmul v24.4s, v1.4s, v24.4s\n" + "fmul v25.4s, v1.4s, v25.4s\n" + "fmul v26.4s, v1.4s, v26.4s\n" + "fmul v27.4s, v1.4s, v27.4s\n" + "fmul v28.4s, v1.4s, v28.4s\n" + + "fcvtn v5.4h, v5.4s\n" + "fcvtn v7.4h, v7.4s\n" + "fcvtn v9.4h, v9.4s\n" + "fcvtn v11.4h, v11.4s\n" + "fcvtn v13.4h, v13.4s\n" + "fcvtn v15.4h, v15.4s\n" + "fcvtn v17.4h, v17.4s\n" + "fcvtn v19.4h, v19.4s\n" + "fcvtn v21.4h, v21.4s\n" + "fcvtn v23.4h, v23.4s\n" + "fcvtn v25.4h, v25.4s\n" + "fcvtn v27.4h, v27.4s\n" + + "fcvtn2 v5.8h, v6.4s\n" + "fcvtn2 v7.8h, v8.4s\n" + "fcvtn2 v9.8h, v10.4s\n" + "fcvtn2 v11.8h, v12.4s\n" + "fcvtn2 v13.8h, v14.4s\n" + "fcvtn2 v15.8h, v16.4s\n" + "fcvtn2 
v17.8h, v18.4s\n" + "fcvtn2 v19.8h, v20.4s\n" + "fcvtn2 v21.8h, v22.4s\n" + "fcvtn2 v23.8h, v24.4s\n" + "fcvtn2 v25.8h, v26.4s\n" + "fcvtn2 v27.8h, v28.4s\n" + + "str q5, [%[out_0]]\n" + "str q7, [%[out_0], #16]\n" + "str q9, [%[out_0], #32]\n" + "str q11, [%[out_0], #48]\n" + "str q13, [%[out_0], #64]\n" + "str q15, [%[out_0], #80]\n" + "str q17, [%[out_0], #96]\n" + "str q19, [%[out_0], #112]\n" + "str q21, [%[out_0], #128]\n" + "str q23, [%[out_0], #144]\n" + "str q25, [%[out_0], #160]\n" + "str q27, [%[out_0], #176]\n" + : + : [out_0] "r"(out_o0hw0), [in_0] "r"(in_hw0), + [f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8), [factor] "r"(fac) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "x0", "x1", "x2", + "x3", "x17", "x16"); + } + // out trans + // (6*6)*hw12*o8 => NOHWo8 + for (U32 hw12 = 0; hw12 < 12; hw12++) { + U32 h = (hw + hw12) / tile_w; + U32 w = (hw + hw12) % tile_w; + F16 *out_0 = + outArray + n * oc * ohow * 8 + o * ohow * 8 + h * 4 * ow * 8 + w * 4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 12 * 8 + hw12 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h - 1, tile_w - 1, + max, min, am); + } + } + } + + // tiles_reminder % 12 / 8 + I32 tiles_s = (tiles / 12) * 12; + I32 tiles_tail = tiles - tiles_s; + + if (tiles_tail >= 8) { + I32 hw = tiles_s; + // in trans + // NCHWc8 => (6*6)*(C/4)*hw8*c4 + // transform hw1c8 at a time, so we need 8 times to cover hw8c8 + // pack into hw8c4 after quantizing (reuse the space of itmArray) + for (U32 c = 0; c < ic; c++) { + OT *inArray_pad_mov = inArray_pad + c * ihiw * 8; + short *Iw_ptr[36]; + short *Iw0[36]; + OT *I0[36]; + short *Iw1[36]; + OT *I1[36]; + short *Iw2[36]; + OT *I2[36]; + short *Iw3[36]; + OT *I3[36]; + short *Iw4[36]; + OT *I4[36]; + short *Iw5[36]; + OT *I5[36]; + short *Iw6[36]; + OT *I6[36]; + short *Iw7[36]; + OT *I7[36]; + + // Store transformed hw12c8 to itmArray + for (U32 i = 0; i < 36; i++) { + Iw0[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8; + Iw1[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 1 * 8; + Iw2[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 2 * 8; + Iw3[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 3 * 8; + Iw4[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 4 * 8; + Iw5[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 5 * 8; + Iw6[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 6 * 8; + Iw7[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 7 * 8; + } + + U32 h0 = (hw / tile_w) * 4; // stride is 4 + U32 w0 = (hw % tile_w) * 4; + U32 h1 = ((hw + 1) / tile_w) * 4; + U32 w1 = ((hw + 1) % tile_w) * 4; + U32 h2 = ((hw + 2) / tile_w) * 4; + U32 w2 = ((hw + 2) % tile_w) * 4; + U32 h3 = ((hw + 3) / tile_w) * 4; + U32 w3 = ((hw + 3) % tile_w) * 4; + U32 h4 = ((hw + 4) / tile_w) * 4; + U32 w4 = ((hw + 4) % tile_w) * 4; + U32 h5 = ((hw + 5) / tile_w) * 4; + U32 w5 = ((hw + 5) % tile_w) * 4; + U32 h6 = ((hw + 6) / tile_w) * 4; + U32 w6 = ((hw + 6) % tile_w) * 4; + U32 h7 = ((hw + 7) / tile_w) * 4; + U32 w7 = ((hw + 7) % tile_w) * 4; + + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + I1[i * 6 + j] = inArray_pad_mov + (h1 + i) * iw_pad * 8 + (w1 + j) * 8; + I2[i * 6 + j] = 
inArray_pad_mov + (h2 + i) * iw_pad * 8 + (w2 + j) * 8; + I3[i * 6 + j] = inArray_pad_mov + (h3 + i) * iw_pad * 8 + (w3 + j) * 8; + I4[i * 6 + j] = inArray_pad_mov + (h4 + i) * iw_pad * 8 + (w4 + j) * 8; + I5[i * 6 + j] = inArray_pad_mov + (h5 + i) * iw_pad * 8 + (w5 + j) * 8; + I6[i * 6 + j] = inArray_pad_mov + (h6 + i) * iw_pad * 8 + (w6 + j) * 8; + I7[i * 6 + j] = inArray_pad_mov + (h7 + i) * iw_pad * 8 + (w7 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I0); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I0); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw1[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I1); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I1); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw2[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I2); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I2); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw3[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I3); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I3); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw4[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I4); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I4); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw5[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I5); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I5); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw6[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I6); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I6); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw7[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I7); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I7); + } + } + + F32 inputScale[36]; + + if (idt == DT_I8) { + quantize_wino_input_s16(itmArray, 8 * ic * 8, inQ, inputScale, *input_scale); + } else { + quantize_wino_input((F16 *)itmArray, 8 * ic * 8, inQ, inputScale); + } + + F32 factor_v[36][4]; + for (U32 i = 0; i < 36; i++) { + if (inputScale[i] == 0) { + factor_v[i][0] = 0; + continue; + } else { + factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; + } + factor_v[i][1] = factor_v[i][0]; + factor_v[i][2] = factor_v[i][0]; + factor_v[i][3] = factor_v[i][0]; + } + + INT8 *in_pack = (INT8 *)itmArray; // Reuse the space + + for (U32 idx = 0; idx < 36; idx++) { + if (factor_v[idx][0] == 0) { // input pixels are all 0 + continue; + } + for (U32 c = 0; c < ic; c++) { // for each 8 channels + INT8 *in_hw8c8 = inQ + idx * 8 * ic * 8 + c * 8 * 8; + + INT8 *in_0 = in_hw8c8; + INT8 *in_1 = in_hw8c8 + 1 * 8; + INT8 *in_2 = in_hw8c8 + 2 * 8; + INT8 *in_3 = in_hw8c8 + 3 * 8; + INT8 *in_4 = in_hw8c8 + 4 * 8; + INT8 *in_5 = in_hw8c8 + 5 * 8; + INT8 *in_6 = in_hw8c8 + 6 * 8; + INT8 *in_7 = in_hw8c8 + 7 * 8; + + // NHWChw8c4 + INT8 *in_pack_0 = in_pack + idx * 8 * ic * 8 + c * 8 * 8; + INT8 *in_pack_1 = in_pack_0 + 8 * 4; + + __asm__ __volatile__("ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins v4.d[1], x6\n" + "ins v5.d[1], x7\n" + + "str q20, [%[pack_0]]\n" + "trn1 
v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "str q21, [%[pack_1]]\n" + "str q24, [%[pack_0], #16]\n" + "str q25, [%[pack_1], #16]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), + [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), + [in_3] "r"(in_3), [in_4] "r"(in_4), [in_5] "r"(in_5), + [in_6] "r"(in_6), [in_7] "r"(in_7) + : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", + "v24", "v25", "x2", "x3", "x6", "x7"); + } + } + + // compute + for (U32 o = 0; o < oc; o++) { // 8 output channels at a time + // bias + F16 *b_0 = biasArray + o * 8; + for (U32 idx = 0; idx < 36; idx++) { + INT8 *in_hw0 = in_pack + idx * 8 * ic * 8; + INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F16 *out_o0hw0 = otmArray + idx * 8 * 8; + if (factor_v[idx][0] == 0) { // input pixels are all 0 + memset(out_o0hw0, 0, 8 * 8 * sizeof(OT)); + continue; + } + F32 *fac = factor_v[idx]; + __asm__ __volatile__( + // Bias should be applied after transform + "eor v5.16b, v5.16b, v5.16b\n" + "ldr q1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "eor v7.16b, v7.16b, v7.16b\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr q0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" + "eor v10.16b, v10.16b, v10.16b\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + "eor v13.16b, v13.16b, v13.16b\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "ldr q3, [x3, 16]!\n" + "ldr q29, [x0, 16]\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "ldr q0, [x0, 32]!\n" + "subs x2, x2, #4\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + + "sdot v14.4s, v29.16b, v3.4b[0]\n" + "ldr q1, [x3, 16]!\n" + "sdot v16.4s, v29.16b, v3.4b[1]\n" + "sdot v18.4s, v29.16b, v3.4b[2]\n" + "sdot v20.4s, v29.16b, v3.4b[3]\n" + + "bne 0b\n" + "scvtf v5.4s, v5.4s\n" + "scvtf v6.4s, v6.4s\n" + "ldr q1, [%[factor]]\n" + "scvtf v7.4s, v7.4s\n" + "scvtf v8.4s, v8.4s\n" + "scvtf v9.4s, v9.4s\n" + "scvtf v10.4s, v10.4s\n" + "scvtf v11.4s, v11.4s\n" + "scvtf v12.4s, v12.4s\n" + "scvtf v13.4s, v13.4s\n" + "scvtf v14.4s, v14.4s\n" + "scvtf v15.4s, v15.4s\n" + "scvtf v16.4s, v16.4s\n" + "scvtf v17.4s, v17.4s\n" + "scvtf v18.4s, v18.4s\n" + "scvtf v19.4s, v19.4s\n" + "scvtf v20.4s, v20.4s\n" + + "fmul v5.4s, v1.4s, v5.4s\n" + "fmul v6.4s, v1.4s, v6.4s\n" + "fmul v7.4s, v1.4s, v7.4s\n" + "fmul v8.4s, v1.4s, v8.4s\n" + "fmul v9.4s, v1.4s, v9.4s\n" + "fmul v10.4s, v1.4s, v10.4s\n" + "fmul v11.4s, v1.4s, v11.4s\n" + "fmul v12.4s, v1.4s, v12.4s\n" + "fmul v13.4s, v1.4s, v13.4s\n" + "fmul v14.4s, v1.4s, v14.4s\n" + "fmul v15.4s, v1.4s, v15.4s\n" + "fmul v16.4s, v1.4s, v16.4s\n" + "fmul v17.4s, v1.4s, v17.4s\n" + "fmul v18.4s, v1.4s, v18.4s\n" + "fmul v19.4s, v1.4s, v19.4s\n" + "fmul v20.4s, v1.4s, v20.4s\n" + + "fcvtn v5.4h, v5.4s\n" + "fcvtn v7.4h, v7.4s\n" + "fcvtn v9.4h, v9.4s\n" + "fcvtn v11.4h, v11.4s\n" + 
"fcvtn v13.4h, v13.4s\n" + "fcvtn v15.4h, v15.4s\n" + "fcvtn v17.4h, v17.4s\n" + "fcvtn v19.4h, v19.4s\n" + + "fcvtn2 v5.8h, v6.4s\n" + "fcvtn2 v7.8h, v8.4s\n" + "fcvtn2 v9.8h, v10.4s\n" + "fcvtn2 v11.8h, v12.4s\n" + "fcvtn2 v13.8h, v14.4s\n" + "fcvtn2 v15.8h, v16.4s\n" + "fcvtn2 v17.8h, v18.4s\n" + "fcvtn2 v19.8h, v20.4s\n" + + "str q5, [%[out_0]]\n" + "str q7, [%[out_0], #16]\n" + "str q9, [%[out_0], #32]\n" + "str q11, [%[out_0], #48]\n" + "str q13, [%[out_0], #64]\n" + "str q15, [%[out_0], #80]\n" + "str q17, [%[out_0], #96]\n" + "str q19, [%[out_0], #112]\n" + : + : [out_0] "r"(out_o0hw0), [in_0] "r"(in_hw0), [f_0] "r"(f_o0c0), + [ic] "r"((I64)ic * 8), [factor] "r"(fac) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v29", + "v30", "x0", "x1", "x2", "x3", "x17", "x16"); + } + // out trans + // (6*6)*hw8*o8 => NOHWo8 + for (U32 hw8 = 0; hw8 < 8; hw8++) { + U32 h = (hw + hw8) / tile_w; + U32 w = (hw + hw8) % tile_w; + F16 *out_0 = + outArray + n * oc * ohow * 8 + o * ohow * 8 + h * 4 * ow * 8 + w * 4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 8 * 8 + hw8 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h - 1, tile_w - 1, + max, min, am); + } + } + tiles_s += 8; + tiles_tail -= 8; + } + + if (tiles_tail >= 4) { + I32 hw = tiles_s; + // in trans + // NCHWc8 => (6*6)*(C/4)*hw4*c4 + // transform hw4c8 at a time, so we need 4 times to cover hw4c8 + // pack into hw4c4 after quantizing (reuse the space of itmArray) + for (U32 c = 0; c < ic; c++) { + OT *inArray_pad_mov = inArray_pad + c * ihiw * 8; + short *Iw_ptr[36]; + short *Iw0[36]; + OT *I0[36]; + short *Iw1[36]; + OT *I1[36]; + short *Iw2[36]; + OT *I2[36]; + short *Iw3[36]; + OT *I3[36]; + + // Store transformed hw4c8 to itmArray + for (U32 i = 0; i < 36; i++) { + Iw0[i] = itmArray + i * 4 * ic * 8 + c * 4 * 8; + Iw1[i] = itmArray + i * 4 * ic * 8 + c * 4 * 8 + 1 * 8; + Iw2[i] = itmArray + i * 4 * ic * 8 + c * 4 * 8 + 2 * 8; + Iw3[i] = itmArray + i * 4 * ic * 8 + c * 4 * 8 + 3 * 8; + } + + U32 h0 = (hw / tile_w) * 4; // stride is 4 + U32 w0 = (hw % tile_w) * 4; + U32 h1 = ((hw + 1) / tile_w) * 4; + U32 w1 = ((hw + 1) % tile_w) * 4; + U32 h2 = ((hw + 2) / tile_w) * 4; + U32 w2 = ((hw + 2) % tile_w) * 4; + U32 h3 = ((hw + 3) / tile_w) * 4; + U32 w3 = ((hw + 3) % tile_w) * 4; + + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + I1[i * 6 + j] = inArray_pad_mov + (h1 + i) * iw_pad * 8 + (w1 + j) * 8; + I2[i * 6 + j] = inArray_pad_mov + (h2 + i) * iw_pad * 8 + (w2 + j) * 8; + I3[i * 6 + j] = inArray_pad_mov + (h3 + i) * iw_pad * 8 + (w3 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I0); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I0); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw1[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I1); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I1); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw2[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I2); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I2); + } + for (U32 i = 0; i < 36; i++) 
{ + Iw_ptr[i] = Iw3[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I3); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I3); + } + } + + F32 inputScale[36]; + + if (idt == DT_I8) { + quantize_wino_input_s16(itmArray, 4 * ic * 8, inQ, inputScale, *input_scale); + } else { + quantize_wino_input((F16 *)itmArray, 4 * ic * 8, inQ, inputScale); + } + + F32 factor_v[36][4]; + for (U32 i = 0; i < 36; i++) { + if (inputScale[i] == 0) { + factor_v[i][0] = 0; + continue; + } else { + factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; + } + factor_v[i][1] = factor_v[i][0]; + factor_v[i][2] = factor_v[i][0]; + factor_v[i][3] = factor_v[i][0]; + } + + F16 *b0 = biasArray; + INT8 *in_pack = (INT8 *)itmArray; // Reuse the space + + for (U32 idx = 0; idx < 36; idx++) { + if (factor_v[idx][0] == 0) { // input pixels are all 0 + continue; + } + for (U32 c = 0; c < ic; c++) { // for each 8 channels + INT8 *in_hw4c8 = inQ + idx * 4 * ic * 8 + c * 4 * 8; + + INT8 *in_0 = in_hw4c8; + INT8 *in_1 = in_hw4c8 + 1 * 8; + INT8 *in_2 = in_hw4c8 + 2 * 8; + INT8 *in_3 = in_hw4c8 + 3 * 8; + + // NHWChw8c4 + INT8 *in_pack_0 = in_pack + idx * 4 * ic * 8 + c * 4 * 8; + INT8 *in_pack_1 = in_pack_0 + 4 * 4; + + __asm__ __volatile__("ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + "str q20, [%[pack_0]]\n" + "str q21, [%[pack_1]]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), + [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), + [in_3] "r"(in_3) + : "memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3"); + } + } + + // compute + for (U32 o = 0; o < oc; o++) { // 8 output channels at a time + // bias + F16 *b_0 = b0 + o * 8; + for (U32 idx = 0; idx < 36; idx++) { + INT8 *in_hw0 = in_pack + idx * 4 * ic * 8; + INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F16 *out_o0hw0 = otmArray + idx * 4 * 8; + if (factor_v[idx][0] == 0) { + memset(out_o0hw0, 0, 4 * 8 * sizeof(OT)); + continue; + } + F32 *fac = factor_v[idx]; + __asm__ __volatile__("eor v5.16b, v5.16b, v5.16b\n" + "eor v6.16b, v6.16b, v6.16b\n" + "ldr q1, [%[in_0]]\n" // in_0 + "ldr q0, [%[f_0]]\n" // f_0 + "eor v7.16b, v7.16b, v7.16b\n" + "eor v8.16b, v8.16b, v8.16b\n" + + "eor v9.16b, v9.16b, v9.16b\n" + "eor v10.16b, v10.16b, v10.16b\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "ldr q29, [x0, 16]\n" + "ldr q3, [x3, 16]!\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "subs x2, x2, #4\n" + "ldr q0, [x0, 32]!\n" + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + "mov v1.16b, v3.16b\n" + "bne 0b\n" + + "scvtf v5.4s, v5.4s\n" + "scvtf v6.4s, v6.4s\n" + "ldr q1, [%[factor]]\n" + "scvtf v7.4s, v7.4s\n" + "scvtf v8.4s, v8.4s\n" + "scvtf v9.4s, v9.4s\n" + "scvtf v10.4s, v10.4s\n" + "scvtf v11.4s, v11.4s\n" + "scvtf v12.4s, v12.4s\n" + + "fmul v5.4s, v1.4s, v5.4s\n" + "fmul v6.4s, v1.4s, v6.4s\n" + "fmul v7.4s, v1.4s, v7.4s\n" + "fmul v8.4s, v1.4s, v8.4s\n" + "fmul v9.4s, v1.4s, v9.4s\n" + "fmul v10.4s, v1.4s, v10.4s\n" + "fmul v11.4s, v1.4s, v11.4s\n" + "fmul v12.4s, v1.4s, v12.4s\n" + 
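// narrow the dequantized f32 accumulators to f16: fcvtn fills the low half of each vector, fcvtn2 the high half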
+ "fcvtn v5.4h, v5.4s\n" + "fcvtn v7.4h, v7.4s\n" + "fcvtn v9.4h, v9.4s\n" + "fcvtn v11.4h, v11.4s\n" + + "fcvtn2 v5.8h, v6.4s\n" + "fcvtn2 v7.8h, v8.4s\n" + "fcvtn2 v9.8h, v10.4s\n" + "fcvtn2 v11.8h, v12.4s\n" + + "str q5, [%[out_0]]\n" + "str q7, [%[out_0], #16]\n" + "str q9, [%[out_0], #32]\n" + "str q11, [%[out_0], #48]\n" + : + : [out_0] "r"(out_o0hw0), [in_0] "r"(in_hw0), + [f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8), [factor] "r"(fac) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v29", "x0", "x1", "x2", + "x3", "x17", "x16"); + } + // out trans + // (6*6)*hw4*o8 => NOHWo8 + for (U32 hw4 = 0; hw4 < 4; hw4++) { + U32 h = (hw + hw4) / tile_w; + U32 w = (hw + hw4) % tile_w; + F16 *out_0 = + outArray + n * oc * ohow * 8 + o * ohow * 8 + h * 4 * ow * 8 + w * 4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 4 * 8 + hw4 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h - 1, tile_w - 1, + max, min, am); + } + } + tiles_s += 4; + } + + for (I32 hw = tiles_s; hw < tiles; hw++) { + // in trans + // NCHWc8 => (6*6)*(C/4)*hw1*c4 + // transform hw1c8 + // pack into hw1c4 after quantizing (reuse the space of itmArray) + for (U32 c = 0; c < ic; c++) { + OT *inArray_pad_mov = inArray_pad + c * ihiw * 8; + short *Iw_ptr[36]; + short *Iw0[36]; + OT *I0[36]; + + // Store transformed hw12c8 to itmArray + for (U32 i = 0; i < 36; i++) { + Iw0[i] = itmArray + i * ic * 8 + c * 8; + } + + U32 h0 = (hw / tile_w) * 4; // stride is 4 + U32 w0 = (hw % tile_w) * 4; + + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + } + } + + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I0); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I0); + } + } + + F32 inputScale[36]; + + if (idt == DT_I8) { + quantize_wino_input_s16(itmArray, ic * 8, inQ, inputScale, *input_scale); + } else { + quantize_wino_input((F16 *)itmArray, ic * 8, inQ, inputScale); + } + + F32 factor_v[36][4]; + for (U32 i = 0; i < 36; i++) { + if (inputScale[i] == 0) { + factor_v[i][0] = 0; + continue; + } else { + factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; + } + factor_v[i][1] = factor_v[i][0]; + factor_v[i][2] = factor_v[i][0]; + factor_v[i][3] = factor_v[i][0]; + } + + F16 *b0 = biasArray; + INT8 *in_pack = (INT8 *)itmArray; // Reuse the space + + for (U32 idx = 0; idx < 36; idx++) { + if (factor_v[idx][0] == 0) { + continue; + } + for (U32 c = 0; c < ic; c++) { // for each 8 channels + INT8 *in_0 = inQ + idx * ic * 8 + c * 8; + + // NHWChw8c4 + INT8 *in_pack_0 = in_pack + idx * ic * 8 + c * 8; + INT8 *in_pack_1 = in_pack_0 + 4; + + memcpy(in_pack_0, in_0, 4 * bytesOf(DT_I8)); + memcpy(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); + } + } + + // compute + for (U32 o = 0; o < oc; o++) { // 8 output channels at a time + // bias + F16 *b_0 = b0 + o * 8; + for (U32 idx = 0; idx < 36; idx++) { + INT8 *in_hw = in_pack + idx * ic * 8; + INT8 *f_o = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F16 *out_o0hw0 = otmArray + idx * 8; + if (factor_v[idx][0] == 0) { + memset(out_o0hw0, 0, 8 * sizeof(OT)); + continue; + } + int32x4_t res[2] = {0}; + + for (U32 c = 0; c < ic; c++) { + int8x8_t in_2 = vld1_s8(in_hw); + in_hw += 8; + 
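// in_2 carries two c4 groups of 8 quantized inputs; each vdotq_lane below dots one group with a 16-byte filter block covering 4 output channels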
int8x16_t f_8o[4]; + f_8o[0] = vld1q_s8(f_o); + f_8o[1] = vld1q_s8(f_o + 16); + res[0] = vdotq_lane_s32(res[0], f_8o[0], in_2, 0); + res[1] = vdotq_lane_s32(res[1], f_8o[1], in_2, 0); + + f_8o[2] = vld1q_s8(f_o + 32); + f_8o[3] = vld1q_s8(f_o + 48); + f_o += 64; + res[0] = vdotq_lane_s32(res[0], f_8o[2], in_2, 1); + res[1] = vdotq_lane_s32(res[1], f_8o[3], in_2, 1); + } + float32x4_t fac = vld1q_f32(factor_v[idx]); + float32x4_t resf0 = vcvtq_f32_s32(res[0]); + float32x4_t resf1 = vcvtq_f32_s32(res[1]); + resf0 = vmulq_f32(resf0, fac); + resf1 = vmulq_f32(resf1, fac); + + float16x4_t resh0 = vcvt_f16_f32(resf0); + float16x4_t resh1 = vcvt_f16_f32(resf1); + + vst1_f16(out_o0hw0, resh0); + vst1_f16(out_o0hw0 + 4, resh1); + } + // out trans + // (6*6)*hw1*o8 => NOHWo8 + U32 h = hw / tile_w; + U32 w = hw % tile_w; + F16 *out_0 = + outArray + n * oc * ohow * 8 + o * ohow * 8 + h * 4 * ow * 8 + w * 4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h - 1, tile_w - 1, max, + min, am); + } + } + } + + if (DT_I8 == odt) { + F16 max_s = max[0]; + F16 min_s = min[0]; + for (U32 i = 1; i < 8; i++) { + if (max_s < max[i]) { + max_s = max[i]; + } + if (min_s > min[i]) { + min_s = min[i]; + } + } + + if (max_s == 0 && min_s == 0) { + return NOT_SUPPORTED; + } + + F16 scale_o; + if (max_s > 0 && min_s < 0) { + F16 scale_max = 127.0 / max_s; + F16 scale_min = -127.0 / min_s; + scale_o = (scale_max < scale_min) ? scale_max : scale_min; + } else if (max_s > 0) { + scale_o = 127.0 / max_s; + } else { + scale_o = -127.0 / min_s; + } + *outputScale = scale_o; + + apply_scale_f16(on * oc * ohow * 8, outArray, scale_o, (INT8 *)output, false); + } + return SUCCESS; +} + +template EE convolution_winograd_A76<INT8>(TensorDesc inputDesc, + const void *input, + F16 *input_scale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am); + +template EE convolution_winograd_A76<F16>(TensorDesc inputDesc, + const void *input, + F16 *input_scale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am); +#endif diff --git a/compute/tensor/src/cpu/arm/int8/convolution_winograd_transform.h b/compute/tensor/src/cpu/arm/int8/convolution_winograd_transform.h new file mode 100644 index 00000000..60f56183 --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/convolution_winograd_transform.h @@ -0,0 +1,313 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_CONVOLUTION_WINOGRAD_TRANSFORM +#define _H_CONVOLUTION_WINOGRAD_TRANSFORM + +#ifdef _USE_INT8 +#include +#include +#include "types.h" +#include "error.h" +#include "cpu/arm/fp16/convolution_winograd_transform.h" + +inline void trans_I_int8(short *Iw[36], INT8 *const I[36]) +{ + short T[6][6][8]; + + int8x8_t v_4 = vmov_n_s8(4); + int8x8_t v_minus_4 = vmov_n_s8(-4); + int8x8_t v_minus_5 = vmov_n_s8(-5); + + for (U32 i = 0; i < 6; i++) { + int8x8_t v_I0 = vld1_s8(I[0 * 6 + i]); + int8x8_t v_I1 = vld1_s8(I[1 * 6 + i]); + int8x8_t v_I2 = vld1_s8(I[2 * 6 + i]); + int8x8_t v_I3 = vld1_s8(I[3 * 6 + i]); + int8x8_t v_I4 = vld1_s8(I[4 * 6 + i]); + int8x8_t v_I5 = vld1_s8(I[5 * 6 + i]); + + // Reorder to accelerate + int16x8_t v_t0 = vmull_s8(v_I2, v_minus_4); + + int16x8_t v_t1 = vmull_s8(v_I1, v_minus_4); + + int16x8_t v_t2 = vsubl_s8(v_I4, v_I2); + + int16x8_t v_t3 = vsubl_s8(v_I3, v_I1); + + v_t0 = vaddw_s8(v_t0, v_I4); + + v_t1 = vaddw_s8(v_t1, v_I3); + + v_t3 = vmulq_n_s16(v_t3, 2); + + int16x8_t v_t4 = vmull_s8(v_I0, v_4); + + int16x8_t v_t5 = vmull_s8(v_I1, v_4); + + int16x8_t v_T0 = vmull_s8(v_I2, v_minus_5); + + int16x8_t v_T1 = vaddq_s16(v_t1, v_t0); + + v_t4 = vaddw_s8(v_t4, v_I4); + + v_t5 = vaddw_s8(v_t5, v_I5); + + v_T0 = vaddq_s16(v_T0, v_t4); + + int16x8_t v_T2 = vsubq_s16(v_t0, v_t1); + + int16x8_t v_T3 = vaddq_s16(v_t3, v_t2); + + int16x8_t v_T4 = vsubq_s16(v_t2, v_t3); + + int16x8_t v_T5 = vmull_s8(v_I3, v_minus_5); + + vst1q_s16(T[0][i], v_T0); + vst1q_s16(T[1][i], v_T1); + vst1q_s16(T[2][i], v_T2); + vst1q_s16(T[3][i], v_T3); + v_T5 = vaddq_s16(v_T5, v_t5); + vst1q_s16(T[4][i], v_T4); + vst1q_s16(T[5][i], v_T5); + } + + for (U32 i = 0; i < 6; i++) { + int16x8_t v_T0 = vld1q_s16(T[i][0]); + int16x8_t v_T1 = vld1q_s16(T[i][1]); + int16x8_t v_T2 = vld1q_s16(T[i][2]); + int16x8_t v_T3 = vld1q_s16(T[i][3]); + int16x8_t v_T4 = vld1q_s16(T[i][4]); + int16x8_t v_T5 = vld1q_s16(T[i][5]); + + int16x8_t v_t0 = vmlaq_n_s16(v_T4, v_T2, -4); + int16x8_t v_t1 = vmlaq_n_s16(v_T3, v_T1, -4); + int16x8_t v_t2 = vsubq_s16(v_T4, v_T2); + int16x8_t v_t3 = vsubq_s16(v_T3, v_T1); + int16x8_t v_t4 = vmlaq_n_s16(v_T4, v_T0, 4); + int16x8_t v_t5 = vmlaq_n_s16(v_T5, v_T1, 4); + + v_t3 = vmulq_n_s16(v_t3, 2); + + int16x8_t v_Iw0 = vmlaq_n_s16(v_t4, v_T2, -5); + int16x8_t v_Iw1 = vaddq_s16(v_t1, v_t0); + int16x8_t v_Iw2 = vsubq_s16(v_t0, v_t1); + int16x8_t v_Iw3 = vaddq_s16(v_t3, v_t2); + int16x8_t v_Iw4 = vsubq_s16(v_t2, v_t3); + int16x8_t v_Iw5 = 
vmlaq_n_s16(v_t5, v_T3, -5); + + vst1q_s16(Iw[i * 6 + 0], v_Iw0); + vst1q_s16(Iw[i * 6 + 1], v_Iw1); + vst1q_s16(Iw[i * 6 + 2], v_Iw2); + vst1q_s16(Iw[i * 6 + 3], v_Iw3); + vst1q_s16(Iw[i * 6 + 4], v_Iw4); + vst1q_s16(Iw[i * 6 + 5], v_Iw5); + } +} + +inline void trans_O(F16 *const Ow[36], + F16 *O[16], + const F16 *bias, + U32 h, + U32 w, + U32 _pad_h_mod_4, + U32 _pad_w_mod_4, + U32 oh, + U32 ow, + F16 *max, + F16 *min, + ActivationParamSpec activationDesc) +{ + F16 T[4][6][8]; + // bias + float16x8_t v_b = vld1q_f16(bias); + + float16x8_t v_0 = vmovq_n_f16(0); + float16x8_t v_2 = vmovq_n_f16(2); + float16x8_t v_4 = vmovq_n_f16(4); + float16x8_t v_8 = vmovq_n_f16(8); + + for (U32 i = 0; i < 6; i++) { + float16x8_t v_Ow0 = vld1q_f16(Ow[i]); + float16x8_t v_Ow1 = vld1q_f16(Ow[1 * 6 + i]); + float16x8_t v_Ow2 = vld1q_f16(Ow[2 * 6 + i]); + float16x8_t v_Ow3 = vld1q_f16(Ow[3 * 6 + i]); + float16x8_t v_Ow4 = vld1q_f16(Ow[4 * 6 + i]); + float16x8_t v_Ow5 = vld1q_f16(Ow[5 * 6 + i]); + + float16x8_t v_t0 = vaddq_f16(v_Ow1, v_Ow2); + float16x8_t v_t1 = vaddq_f16(v_Ow3, v_Ow4); + float16x8_t v_t2 = vsubq_f16(v_Ow1, v_Ow2); + float16x8_t v_t3 = vsubq_f16(v_Ow3, v_Ow4); + + float16x8_t v_T0 = vaddq_f16(v_t0, v_t1); + float16x8_t v_T1 = vfmaq_f16(v_t2, v_t3, v_2); + float16x8_t v_T2 = vfmaq_f16(v_t0, v_t1, v_4); + float16x8_t v_T3 = vfmaq_f16(v_t2, v_t3, v_8); + v_T0 = vaddq_f16(v_T0, v_Ow0); + v_T3 = vaddq_f16(v_T3, v_Ow5); + + vst1q_f16(T[0][i], v_T0); + vst1q_f16(T[1][i], v_T1); + vst1q_f16(T[2][i], v_T2); + vst1q_f16(T[3][i], v_T3); + } + + float16x8_t max_v = vld1q_f16(max); + float16x8_t min_v = vld1q_f16(min); + + U32 pad_h_mod_4 = 0, pad_w_mod_4 = 0; + if (h == oh && w == ow) { + pad_h_mod_4 = _pad_h_mod_4; + pad_w_mod_4 = _pad_w_mod_4; + } else if (h == oh) { + pad_h_mod_4 = _pad_h_mod_4; + } else if (w == ow) { + pad_w_mod_4 = _pad_w_mod_4; + } + + for (U32 i = 0; i < 4 - pad_h_mod_4; i++) { + float16x8_t v_T0 = vld1q_f16(T[i][0]); + float16x8_t v_T1 = vld1q_f16(T[i][1]); + float16x8_t v_T2 = vld1q_f16(T[i][2]); + float16x8_t v_T3 = vld1q_f16(T[i][3]); + float16x8_t v_T4 = vld1q_f16(T[i][4]); + float16x8_t v_T5 = vld1q_f16(T[i][5]); + + float16x8_t v_t0 = vaddq_f16(v_T1, v_T2); + float16x8_t v_t1 = vaddq_f16(v_T3, v_T4); + float16x8_t v_t2 = vsubq_f16(v_T1, v_T2); + float16x8_t v_t3 = vsubq_f16(v_T3, v_T4); + + float16x8_t v_O0 = vaddq_f16(v_t0, v_t1); + float16x8_t v_O1 = vfmaq_f16(v_t2, v_t3, v_2); + float16x8_t v_O2 = vfmaq_f16(v_t0, v_t1, v_4); + float16x8_t v_O3 = vfmaq_f16(v_t2, v_t3, v_8); + v_O0 = vaddq_f16(v_O0, v_T0); + v_O3 = vaddq_f16(v_O3, v_T5); + + float16x8_t temp; + + if (activationDesc.mode == ACTIVATION_RELU) { + if (pad_w_mod_4 == 0) { + temp = vmaxq_f16(vaddq_f16(v_O0, v_b), v_0); + max_v = vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 0], temp); + + temp = vmaxq_f16(vaddq_f16(v_O1, v_b), v_0); + max_v = vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 1], temp); + + temp = vmaxq_f16(vaddq_f16(v_O2, v_b), v_0); + max_v = vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 2], temp); + + temp = vmaxq_f16(vaddq_f16(v_O3, v_b), v_0); + max_v = vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 3], temp); + } else if (pad_w_mod_4 == 1) { + temp = vmaxq_f16(vaddq_f16(v_O0, v_b), v_0); + max_v = vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 0], temp); + + temp = vmaxq_f16(vaddq_f16(v_O1, v_b), v_0); + max_v = 
vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 1], temp); + + temp = vmaxq_f16(vaddq_f16(v_O2, v_b), v_0); + max_v = vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 2], temp); + } else if (pad_w_mod_4 == 2) { + temp = vmaxq_f16(vaddq_f16(v_O0, v_b), v_0); + max_v = vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 0], temp); + + temp = vmaxq_f16(vaddq_f16(v_O1, v_b), v_0); + max_v = vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 1], temp); + } else if (pad_w_mod_4 == 3) { + temp = vmaxq_f16(vaddq_f16(v_O0, v_b), v_0); + max_v = vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 0], temp); + } + } else { + if (pad_w_mod_4 == 0) { + temp = vaddq_f16(v_O0, v_b); + max_v = vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 0], temp); + + temp = vaddq_f16(v_O1, v_b); + max_v = vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 1], temp); + + temp = vaddq_f16(v_O2, v_b); + max_v = vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 2], temp); + + temp = vaddq_f16(v_O3, v_b); + max_v = vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 3], temp); + } else if (pad_w_mod_4 == 1) { + temp = vaddq_f16(v_O0, v_b); + max_v = vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 0], temp); + + temp = vaddq_f16(v_O1, v_b); + max_v = vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 1], temp); + + temp = vaddq_f16(v_O2, v_b); + max_v = vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 2], temp); + } else if (pad_w_mod_4 == 2) { + temp = vaddq_f16(v_O0, v_b); + max_v = vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 0], temp); + + temp = vaddq_f16(v_O1, v_b); + max_v = vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 1], temp); + } else if (pad_w_mod_4 == 3) { + temp = vaddq_f16(v_O0, v_b); + max_v = vmaxq_f16(max_v, temp); + min_v = vminq_f16(min_v, temp); + vst1q_f16(O[i * 4 + 0], temp); + } + } + } + + vst1q_f16(max, max_v); + vst1q_f16(min, min_v); +} +#endif +#endif diff --git a/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.cpp new file mode 100644 index 00000000..04c23b4a --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.cpp @@ -0,0 +1,73 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/int8/tensor_computing_int8.h" +#include "cpu/arm/int8/depthwise_pointwise_convolution.h" + +EE depthwise_pointwise_convolution_int8(TensorDesc inputDesc, + INT8 *input, + TensorDesc dwFilterDesc, + const INT8 *dwFilter, + TensorDesc pwFilterDesc, + const INT8 *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const I32 *dwBias, + TensorDesc pwBiasDesc, + const I32 *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + I32 *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + if (nullptr == input || nullptr == dwFilter || nullptr == output || nullptr == dwBias || + nullptr == tmp) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (!(idt == DT_I8 && fdt == DT_I8 && odt == DT_I32)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(idf == DF_NCHWC8 && odf == DF_NCHWC8)) { + CHECK_STATUS(NOT_MATCH); + } + if (ic != fc) { + CHECK_STATUS(NOT_MATCH); + } + + EE ret = SUCCESS; + switch (algorithm) { + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_pointwise_convolution_direct(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, + tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.h b/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.h new file mode 100644 index 00000000..a731826d --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.h @@ -0,0 +1,39 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
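Both the dispatcher above and the kernels that follow require DF_NCHWC8 tensors: channels are grouped into blocks of 8, with the block stored innermost, so a single 8-byte vector load fetches one full channel block. As a sketch of this layout, the flat offset of element (n, c, h, w) works out as follows (helper name hypothetical, for illustration only):

static inline U32 nchwc8_offset(U32 n, U32 c, U32 h, U32 w, U32 C, U32 H, U32 W)
{
    // Stored as (n, c / 8, h, w, c % 8), with the 8-channel block innermost.
    return (((n * (C / 8) + c / 8) * H + h) * W + w) * 8 + (c % 8);
}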
+ +#ifndef _H_DEPTHWISE_POINTWISE_CONVOLUTION +#define _H_DEPTHWISE_POINTWISE_CONVOLUTION + +#include "sys.h" +#include "tensor_desc.h" +#include "tensor_computing_type.h" + +EE depthwise_pointwise_convolution_direct(TensorDesc inputDesc, + INT8 *inArray, + TensorDesc dwFilterDesc, + const INT8 *dwFilterArray, + TensorDesc pwFilterDesc, + const INT8 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const I32 *dwBiasArray, + TensorDesc pwBiasDesc, + const I32 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + I32 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch); +#endif diff --git a/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution_direct.cpp b/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution_direct.cpp new file mode 100644 index 00000000..77ec8489 --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution_direct.cpp @@ -0,0 +1,1865 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
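The direct implementation declared above and defined below proceeds in stages for each image: zero-pad every NCHWC8 channel block, run the depthwise convolution with widening multiply-accumulates (sshll to int16, then smlal/smlal2 into int32 accumulators), apply the optional depthwise activation, requantize the int32 intermediate to int8 with a dynamically chosen scale, and finish with the pointwise 1x1 convolution using sdot. A scalar reference of the depthwise stage for one output pixel and one channel block, assuming unit dilation (function name hypothetical, for illustration only):

static void dw_conv_ref_pixel(const INT8 *in_pad, const INT8 *f, const I32 *bias,
    I32 out[8], U32 oh_idx, U32 ow_idx, U32 fh, U32 fw, U32 strideH, U32 strideW, U32 iw_pad)
{
    for (U32 k = 0; k < 8; k++) {
        out[k] = bias[k];  // accumulators start at the bias, as the asm preloads them
    }
    for (U32 i = 0; i < fh; i++) {
        for (U32 j = 0; j < fw; j++) {
            const INT8 *x = in_pad + ((oh_idx * strideH + i) * iw_pad + ow_idx * strideW + j) * 8;
            const INT8 *w = f + (i * fw + j) * 8;
            for (U32 k = 0; k < 8; k++) {
                out[k] += (I32)x[k] * (I32)w[k];  // widening MAC, as smlal/smlal2 do
            }
        }
    }
}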
+ +#include "cpu/arm/int8/depthwise_pointwise_convolution.h" + +EE depthwise_pointwise_convolution_direct(TensorDesc inputDesc, + INT8 *inArray, + TensorDesc dwFilterDesc, + const INT8 *dwFilterArray, + TensorDesc pwFilterDesc, + const INT8 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const I32 *dwBiasArray, + TensorDesc pwBiasDesc, + const I32 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + I32 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + UNUSED(tmpBytes); + UNUSED(arch); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (dwFilterDesc.df != DF_NCHWC8 || pwFilterDesc.df != DF_NCHWN8C4) { + CHECK_STATUS(NOT_MATCH); + } + if (pwFilterArray == nullptr) { + return NOT_SUPPORTED; + } + + oc /= 8; + ic /= 8; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + U32 ihiw = ih * iw; + I32 ohow = oh * ow; + INT8 *pwArray = (INT8 *)tmp + ic * ih_pad * iw_pad * 8; + I32 *dw_out = (I32 *)(pwArray + ic * ohow * 8); + + for (U32 n = 0; n < in; n++) { + // copy input into a input with padding + INT8 *inArray_pad = (INT8 *)tmp; + INT8 *inArray_pad_mov = inArray_pad; + INT8 *inArray_mov = inArray + n * ic * ihiw * 8; + for (U32 c = 0; c < ic; c++) { + if (paddingT > 0) { + memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += paddingT * iw_pad * 8; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt)); + inArray_pad_mov += paddingL * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt)); + inArray_pad_mov += paddingR * 8; + } + if (paddingB > 0) { + memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += paddingB * iw_pad * 8; + } + + // dw_conv + const I32 *b = dwBiasArray + c * 8; + INT8 *in_pad = inArray_pad + c * ih_pad * iw_pad * 8; + const INT8 *f = dwFilterArray + c * fh * fw * 8; + + // ohow / 12 + for (I32 hw = 0; hw < ohow - 11; hw += 12) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + U32 in_h_4 = (hw + 4) / ow * strideH; + U32 in_w_4 = (hw + 4) % ow * strideW; + U32 in_h_5 = (hw + 5) / ow * strideH; + U32 in_w_5 = (hw + 5) % ow * strideW; + U32 in_h_6 = (hw + 6) / ow * strideH; + U32 in_w_6 = (hw + 6) % ow * strideW; + U32 in_h_7 = (hw + 7) / ow * strideH; + U32 in_w_7 = (hw + 7) % ow * strideW; + U32 in_h_8 = (hw + 8) / ow * strideH; + U32 in_w_8 = (hw + 8) % ow * strideW; + 
U32 in_h_9 = (hw + 9) / ow * strideH; + U32 in_w_9 = (hw + 9) % ow * strideW; + U32 in_h_10 = (hw + 10) / ow * strideH; + U32 in_w_10 = (hw + 10) % ow * strideW; + U32 in_h_11 = (hw + 11) / ow * strideH; + U32 in_w_11 = (hw + 11) % ow * strideW; + + I32 *pw_pack_0 = dw_out + hw * ic * 8 + c * 12 * 8; + I32 *pw_pack_1 = pw_pack_0 + 48; // Second half + // TODO handle asm combined with c. No guarantee that compiler will not use vec reg in c. + __asm__ __volatile__("ldr d29, [%[b]]\n" // b_0 + "ldr x1, [%[b], #8]\n" + "ins v29.d[1], x1\n" + "ldr d30, [%[b], #16]\n" // b_1 + "ldr x2, [%[b], #24]\n" + "ins v30.d[1], x2\n" + "mov v5.16b, v29.16b\n" + "mov v7.16b, v29.16b\n" + "mov v9.16b, v29.16b\n" + "mov v11.16b, v29.16b\n" + "mov v13.16b, v29.16b\n" + "mov v15.16b, v29.16b\n" + "mov v17.16b, v29.16b\n" + "mov v19.16b, v29.16b\n" + "mov v21.16b, v29.16b\n" + "mov v23.16b, v29.16b\n" + "mov v25.16b, v29.16b\n" + "mov v27.16b, v29.16b\n" + + "mov v6.16b, v30.16b\n" + "mov v8.16b, v30.16b\n" + "mov v10.16b, v30.16b\n" + "mov v12.16b, v30.16b\n" + "mov v14.16b, v30.16b\n" + "mov v16.16b, v30.16b\n" + "mov v18.16b, v30.16b\n" + "mov v20.16b, v30.16b\n" + "mov v22.16b, v30.16b\n" + "mov v24.16b, v30.16b\n" + "mov v26.16b, v30.16b\n" + "mov v28.16b, v30.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "x1", "x2"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const INT8 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + INT8 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + INT8 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + INT8 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + INT8 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + INT8 *in_4 = in_idx + in_h_4 * iw_pad * 8 + in_w_4 * 8; + INT8 *in_5 = in_idx + in_h_5 * iw_pad * 8 + in_w_5 * 8; + INT8 *in_6 = in_idx + in_h_6 * iw_pad * 8 + in_w_6 * 8; + INT8 *in_7 = in_idx + in_h_7 * iw_pad * 8 + in_w_7 * 8; + INT8 *in_8 = in_idx + in_h_8 * iw_pad * 8 + in_w_8 * 8; + INT8 *in_9 = in_idx + in_h_9 * iw_pad * 8 + in_w_9 * 8; + INT8 *in_10 = in_idx + in_h_10 * iw_pad * 8 + in_w_10 * 8; + INT8 *in_11 = in_idx + in_h_11 * iw_pad * 8 + in_w_11 * 8; + __asm__ __volatile__( + "ldr d29, [%[f0]]\n" + "ldr d0, [%[in0]]\n" + "ldr d1, [%[in1]]\n" + "ldr d2, [%[in2]]\n" + "sshll v29.8h, v29.8b, #0\n" + "ldr d30, [%[in3]]\n" + "sshll v0.8h, v0.8b, #0\n" + "sshll v1.8h, v1.8b, #0\n" + + "smlal v5.4s, v29.4h, v0.4h\n" + "sshll v2.8h, v2.8b, #0\n" + "smlal2 v6.4s, v29.8h, v0.8h\n" + "sshll v30.8h, v30.8b, #0\n" + "smlal v7.4s, v29.4h, v1.4h\n" + "ldr d0, [%[in4]]\n" + "smlal2 v8.4s, v29.8h, v1.8h\n" + "smlal v9.4s, v29.4h, v2.4h\n" + "ldr d1, [%[in5]]\n" + "smlal2 v10.4s, v29.8h, v2.8h\n" + "sshll v0.8h, v0.8b, #0\n" + "smlal v11.4s, v29.4h, v30.4h\n" + "ldr d2, [%[in6]]\n" + "smlal2 v12.4s, v29.8h, v30.8h\n" + "sshll v1.8h, v1.8b, #0\n" + + "smlal v13.4s, v29.4h, v0.4h\n" + "ldr d30, [%[in7]]\n" + "smlal2 v14.4s, v29.8h, v0.8h\n" + "sshll v2.8h, v2.8b, #0\n" + "smlal v15.4s, v29.4h, v1.4h\n" + "ldr d0, [%[in8]]\n" + "smlal2 v16.4s, v29.8h, v1.8h\n" + "sshll v30.8h, v30.8b, #0\n" + "smlal v17.4s, v29.4h, v2.4h\n" + "ldr d1, [%[in9]]\n" + "smlal2 v18.4s, v29.8h, v2.8h\n" + "sshll v0.8h, v0.8b, #0\n" + "smlal v19.4s, v29.4h, v30.4h\n" + "ldr d2, 
[%[in10]]\n" + "smlal2 v20.4s, v29.8h, v30.8h\n" + "sshll v1.8h, v1.8b, #0\n" + + "smlal v21.4s, v29.4h, v0.4h\n" + "ldr d30, [%[in11]]\n" + "smlal2 v22.4s, v29.8h, v0.8h\n" + "sshll v2.8h, v2.8b, #0\n" + "smlal v23.4s, v29.4h, v1.4h\n" + "sshll v30.8h, v30.8b, #0\n" + "smlal2 v24.4s, v29.8h, v1.8h\n" + "smlal v25.4s, v29.4h, v2.4h\n" + "smlal2 v26.4s, v29.8h, v2.8h\n" + "smlal v27.4s, v29.4h, v30.4h\n" + "smlal2 v28.4s, v29.8h, v30.8h\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), [in3] "r"(in_3), + [in4] "r"(in_4), [in5] "r"(in_5), [in6] "r"(in_6), [in7] "r"(in_7), + [in8] "r"(in_8), [in9] "r"(in_9), [in10] "r"(in_10), [in11] "r"(in_11), + [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: { + break; + } + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v0.16b, v0.16b, v0.16b\n" // zero + + "smax v5.4s, v0.4s, v5.4s\n" + "smax v6.4s, v0.4s, v6.4s\n" + "smax v7.4s, v0.4s, v7.4s\n" + "smax v8.4s, v0.4s, v8.4s\n" + "smax v9.4s, v0.4s, v9.4s\n" + "smax v10.4s, v0.4s, v10.4s\n" + "smax v11.4s, v0.4s, v11.4s\n" + "smax v12.4s, v0.4s, v12.4s\n" + "smax v13.4s, v0.4s, v13.4s\n" + "smax v14.4s, v0.4s, v14.4s\n" + "smax v15.4s, v0.4s, v15.4s\n" + "smax v16.4s, v0.4s, v16.4s\n" + "smax v17.4s, v0.4s, v17.4s\n" + "smax v18.4s, v0.4s, v18.4s\n" + "smax v19.4s, v0.4s, v19.4s\n" + "smax v20.4s, v0.4s, v20.4s\n" + "smax v21.4s, v0.4s, v21.4s\n" + "smax v22.4s, v0.4s, v22.4s\n" + "smax v23.4s, v0.4s, v23.4s\n" + "smax v24.4s, v0.4s, v24.4s\n" + "smax v25.4s, v0.4s, v25.4s\n" + "smax v26.4s, v0.4s, v26.4s\n" + "smax v27.4s, v0.4s, v27.4s\n" + "smax v28.4s, v0.4s, v28.4s\n" + : + : + : "memory", "cc", "v0", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28"); + break; + } + case ACTIVATION_RELU6: { + INT8 *pw_in0 = pwArray + hw * ic * 8 + c * 12 * 8; + INT8 *pw_in1 = pw_in0 + 48; + __asm__ __volatile__("eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v30.4s, #6\n" // six + + "smax v5.4s, v0.4s, v5.4s\n" + "smax v6.4s, v0.4s, v6.4s\n" + "smax v7.4s, v0.4s, v7.4s\n" + "smax v8.4s, v0.4s, v8.4s\n" + "smax v9.4s, v0.4s, v9.4s\n" + "smax v10.4s, v0.4s, v10.4s\n" + "smax v11.4s, v0.4s, v11.4s\n" + "smax v12.4s, v0.4s, v12.4s\n" + "smax v13.4s, v0.4s, v13.4s\n" + "smax v14.4s, v0.4s, v14.4s\n" + "smax v15.4s, v0.4s, v15.4s\n" + "smax v16.4s, v0.4s, v16.4s\n" + "smax v17.4s, v0.4s, v17.4s\n" + "smax v18.4s, v0.4s, v18.4s\n" + "smax v19.4s, v0.4s, v19.4s\n" + "smax v20.4s, v0.4s, v20.4s\n" + "smax v21.4s, v0.4s, v21.4s\n" + "smax v22.4s, v0.4s, v22.4s\n" + "smax v23.4s, v0.4s, v23.4s\n" + "smax v24.4s, v0.4s, v24.4s\n" + "smax v25.4s, v0.4s, v25.4s\n" + "smax v26.4s, v0.4s, v26.4s\n" + "smax v27.4s, v0.4s, v27.4s\n" + "smax v28.4s, v0.4s, v28.4s\n" + + "smin v5.4s, v30.4s, v5.4s\n" + "smin v6.4s, v30.4s, v6.4s\n" + "smin v7.4s, v30.4s, v7.4s\n" + "smin v8.4s, v30.4s, v8.4s\n" + "smin v9.4s, v30.4s, v9.4s\n" + "smin v10.4s, v30.4s, v10.4s\n" + "smin v11.4s, v30.4s, v11.4s\n" + "smin v12.4s, v30.4s, v12.4s\n" + "smin v13.4s, v30.4s, v13.4s\n" + "smin v14.4s, v30.4s, v14.4s\n" + "smin v15.4s, v30.4s, v15.4s\n" + "smin v16.4s, v30.4s, v16.4s\n" + "smin v17.4s, v30.4s, v17.4s\n" + "smin v18.4s, v30.4s, v18.4s\n" + 
"smin v19.4s, v30.4s, v19.4s\n" + "smin v20.4s, v30.4s, v20.4s\n" + "smin v21.4s, v30.4s, v21.4s\n" + "smin v22.4s, v30.4s, v22.4s\n" + "smin v23.4s, v30.4s, v23.4s\n" + "smin v24.4s, v30.4s, v24.4s\n" + "smin v25.4s, v30.4s, v25.4s\n" + "smin v26.4s, v30.4s, v26.4s\n" + "smin v27.4s, v30.4s, v27.4s\n" + "smin v28.4s, v30.4s, v28.4s\n" + + // No need to quantize for ReLU6 + "sqshl v5.4s, v5.4s, #2\n" + "sqshl v6.4s, v6.4s, #2\n" + "sqshl v7.4s, v7.4s, #2\n" + "sqshl v8.4s, v8.4s, #2\n" + "sqshl v9.4s, v9.4s, #2\n" + "sqshl v10.4s, v10.4s, #2\n" + "sqshl v11.4s, v11.4s, #2\n" + "sqshl v12.4s, v12.4s, #2\n" + "sqshl v13.4s, v13.4s, #2\n" + "sqshl v14.4s, v14.4s, #2\n" + "sqshl v15.4s, v15.4s, #2\n" + "sqshl v16.4s, v16.4s, #2\n" + "sqshl v17.4s, v17.4s, #2\n" + "sqshl v18.4s, v18.4s, #2\n" + "sqshl v19.4s, v19.4s, #2\n" + "sqshl v20.4s, v20.4s, #2\n" + "sqshl v21.4s, v21.4s, #2\n" + "sqshl v22.4s, v22.4s, #2\n" + "sqshl v23.4s, v23.4s, #2\n" + "sqshl v24.4s, v24.4s, #2\n" + "sqshl v25.4s, v25.4s, #2\n" + "sqshl v26.4s, v26.4s, #2\n" + "sqshl v27.4s, v27.4s, #2\n" + "sqshl v28.4s, v28.4s, #2\n" + + "sqshrn v5.4h, v5.4s, #1\n" + "sqshrn v9.4h, v9.4s, #1\n" + "sqshrn2 v5.8h, v7.4s, #1\n" + "sqshrn2 v9.8h, v11.4s, #1\n" + "sqshrn v13.4h, v13.4s, #1\n" + "sqshrn v17.4h, v17.4s, #1\n" + "sqshrn2 v13.8h, v15.4s, #1\n" + "sqshrn2 v17.8h, v19.4s, #1\n" + + "sqshrn v21.4h, v21.4s, #1\n" + "sqshrn v25.4h, v25.4s, #1\n" + "sqshrn2 v21.8h, v23.4s, #1\n" + "sqshrn2 v25.8h, v27.4s, #1\n" + + "sqshrn v5.8b, v5.8h, #1\n" + "sqshrn v13.8b, v13.8h, #1\n" + "sqshrn v21.8b, v21.8h, #1\n" + + "sqshrn2 v5.16b, v9.8h, #1\n" + "sqshrn2 v13.16b, v17.8h, #1\n" + "sqshrn2 v21.16b, v25.8h, #1\n" + "str q5, [%[in0]]\n" + "str q13, [%[in0], #16]\n" + "str q21, [%[in0], #32]\n" + + "sqshrn v6.4h, v6.4s, #1\n" + "sqshrn v10.4h, v10.4s, #1\n" + "sqshrn2 v6.8h, v8.4s, #1\n" + "sqshrn2 v10.8h, v12.4s, #1\n" + + "sqshrn v14.4h, v14.4s, #1\n" + "sqshrn v18.4h, v18.4s, #1\n" + "sqshrn2 v14.8h, v16.4s, #1\n" + "sqshrn2 v18.8h, v20.4s, #1\n" + + "sqshrn v22.4h, v22.4s, #1\n" + "sqshrn v26.4h, v26.4s, #1\n" + "sqshrn2 v22.8h, v24.4s, #1\n" + "sqshrn2 v26.8h, v28.4s, #1\n" + + "sqshrn v6.8b, v6.8h, #1\n" + "sqshrn v14.8b, v14.8h, #1\n" + "sqshrn v22.8b, v22.8h, #1\n" + + "sqshrn2 v6.16b, v10.8h, #1\n" + "sqshrn2 v14.16b, v18.8h, #1\n" + "sqshrn2 v22.16b, v26.8h, #1\n" + "str q6, [%[in1]]\n" + "str q14, [%[in1], #16]\n" + "str q22, [%[in1], #32]\n" + : + : [in0] "r"(pw_in0), [in1] "r"(pw_in1) + : "memory", "cc", "v0", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v30"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (depthwiseActivationParamSpec.mode != ACTIVATION_RELU6) { + __asm__ __volatile__("str q5, [%[pw0]]\n" + "str q7, [%[pw0], #16]\n" + "str q9, [%[pw0], #32]\n" + "str q11, [%[pw0], #48]\n" + "str q13, [%[pw0], #64]\n" + "str q15, [%[pw0], #80]\n" + "str q17, [%[pw0], #96]\n" + "str q19, [%[pw0], #112]\n" + "str q21, [%[pw0], #128]\n" + "str q23, [%[pw0], #144]\n" + "str q25, [%[pw0], #160]\n" + "str q27, [%[pw0], #176]\n" + + "str q6, [%[pw1]]\n" + "str q8, [%[pw1], #16]\n" + "str q10, [%[pw1], #32]\n" + "str q12, [%[pw1], #48]\n" + "str q14, [%[pw1], #64]\n" + "str q16, [%[pw1], #80]\n" + "str q18, [%[pw1], #96]\n" + "str q20, [%[pw1], #112]\n" + "str q22, [%[pw1], #128]\n" + "str q24, [%[pw1], #144]\n" + "str q26, [%[pw1], #160]\n" + "str q28, [%[pw1], #176]\n" + : + : [pw0] 
"r"(pw_pack_0), [pw1] "r"(pw_pack_1) + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28"); + } + } + + // ohow_reminder % 12 / 8 + U32 ohow_s = (ohow / 12) * 12; + U32 ohow_tail = ohow - ohow_s; + + if (ohow_tail >= 8) { + U32 hw = ohow_s; + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + U32 in_h_4 = ((hw + 4) / ow) * strideH; + U32 in_w_4 = ((hw + 4) % ow) * strideW; + U32 in_h_5 = ((hw + 5) / ow) * strideH; + U32 in_w_5 = ((hw + 5) % ow) * strideW; + U32 in_h_6 = ((hw + 6) / ow) * strideH; + U32 in_w_6 = ((hw + 6) % ow) * strideW; + U32 in_h_7 = ((hw + 7) / ow) * strideH; + U32 in_w_7 = ((hw + 7) % ow) * strideW; + I32 *pw_pack_0 = dw_out + hw * ic * 8 + c * 8 * 8; + I32 *pw_pack_1 = pw_pack_0 + 32; + // TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. + __asm__ __volatile__("ldr d29, [%[b]]\n" // b_0 + "ldr x1, [%[b], #8]\n" + "ins v29.d[1], x1\n" + "ldr d30, [%[b], #16]\n" // b_1 + "ldr x2, [%[b], #24]\n" + "ins v30.d[1], x2\n" + "mov v5.16b, v29.16b\n" + "mov v7.16b, v29.16b\n" + "mov v9.16b, v29.16b\n" + "mov v11.16b, v29.16b\n" + "mov v13.16b, v29.16b\n" + "mov v15.16b, v29.16b\n" + "mov v17.16b, v29.16b\n" + "mov v19.16b, v29.16b\n" + + "mov v6.16b, v30.16b\n" + "mov v8.16b, v30.16b\n" + "mov v10.16b, v30.16b\n" + "mov v12.16b, v30.16b\n" + "mov v14.16b, v30.16b\n" + "mov v16.16b, v30.16b\n" + "mov v18.16b, v30.16b\n" + "mov v20.16b, v30.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v29", "v30", "x1", "x2"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const INT8 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + INT8 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + INT8 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + INT8 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + INT8 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + INT8 *in_4 = in_idx + in_h_4 * iw_pad * 8 + in_w_4 * 8; + INT8 *in_5 = in_idx + in_h_5 * iw_pad * 8 + in_w_5 * 8; + INT8 *in_6 = in_idx + in_h_6 * iw_pad * 8 + in_w_6 * 8; + INT8 *in_7 = in_idx + in_h_7 * iw_pad * 8 + in_w_7 * 8; + __asm__ __volatile__("ldr d29, [%[f0]]\n" + "ldr d0, [%[in0]]\n" + "ldr d1, [%[in1]]\n" + "ldr d2, [%[in2]]\n" + "sshll v29.8h, v29.8b, #0\n" + "ldr d30, [%[in3]]\n" + "sshll v0.8h, v0.8b, #0\n" + "sshll v1.8h, v1.8b, #0\n" + + "smlal v5.4s, v29.4h, v0.4h\n" + "sshll v2.8h, v2.8b, #0\n" + "smlal2 v6.4s, v29.8h, v0.8h\n" + "sshll v30.8h, v30.8b, #0\n" + "smlal v7.4s, v29.4h, v1.4h\n" + "ldr d0, [%[in4]]\n" + "smlal2 v8.4s, v29.8h, v1.8h\n" + "smlal v9.4s, v29.4h, v2.4h\n" + "ldr d1, [%[in5]]\n" + "smlal2 v10.4s, v29.8h, v2.8h\n" + "sshll v0.8h, v0.8b, #0\n" + "smlal v11.4s, v29.4h, v30.4h\n" + "ldr d2, [%[in6]]\n" + "smlal2 v12.4s, v29.8h, v30.8h\n" + "sshll v1.8h, v1.8b, #0\n" + + "smlal v13.4s, v29.4h, v0.4h\n" + "ldr d30, [%[in7]]\n" + "smlal2 v14.4s, v29.8h, v0.8h\n" + "sshll v2.8h, v2.8b, #0\n" + "smlal v15.4s, v29.4h, v1.4h\n" + "smlal2 v16.4s, v29.8h, 
v1.8h\n" + "sshll v30.8h, v30.8b, #0\n" + "smlal v17.4s, v29.4h, v2.4h\n" + "smlal2 v18.4s, v29.8h, v2.8h\n" + "smlal v19.4s, v29.4h, v30.4h\n" + "smlal2 v20.4s, v29.8h, v30.8h\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), + [in3] "r"(in_3), [in4] "r"(in_4), [in5] "r"(in_5), + [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v29", "v30"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: { + break; + } + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v0.16b, v0.16b, v0.16b\n" // zero + + "smax v5.4s, v0.4s, v5.4s\n" + "smax v6.4s, v0.4s, v6.4s\n" + "smax v7.4s, v0.4s, v7.4s\n" + "smax v8.4s, v0.4s, v8.4s\n" + "smax v9.4s, v0.4s, v9.4s\n" + "smax v10.4s, v0.4s, v10.4s\n" + "smax v11.4s, v0.4s, v11.4s\n" + "smax v12.4s, v0.4s, v12.4s\n" + "smax v13.4s, v0.4s, v13.4s\n" + "smax v14.4s, v0.4s, v14.4s\n" + "smax v15.4s, v0.4s, v15.4s\n" + "smax v16.4s, v0.4s, v16.4s\n" + "smax v17.4s, v0.4s, v17.4s\n" + "smax v18.4s, v0.4s, v18.4s\n" + "smax v19.4s, v0.4s, v19.4s\n" + "smax v20.4s, v0.4s, v20.4s\n" + : + : + : "memory", "cc", "v0", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20"); + break; + } + case ACTIVATION_RELU6: { + INT8 *pw_in0 = pwArray + hw * ic * 8 + c * 8 * 8; + INT8 *pw_in1 = pw_in0 + 32; + __asm__ __volatile__("eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v30.4s, #6\n" // six + + "smax v5.4s, v0.4s, v5.4s\n" + "smax v6.4s, v0.4s, v6.4s\n" + "smax v7.4s, v0.4s, v7.4s\n" + "smax v8.4s, v0.4s, v8.4s\n" + "smax v9.4s, v0.4s, v9.4s\n" + "smax v10.4s, v0.4s, v10.4s\n" + "smax v11.4s, v0.4s, v11.4s\n" + "smax v12.4s, v0.4s, v12.4s\n" + "smax v13.4s, v0.4s, v13.4s\n" + "smax v14.4s, v0.4s, v14.4s\n" + "smax v15.4s, v0.4s, v15.4s\n" + "smax v16.4s, v0.4s, v16.4s\n" + "smax v17.4s, v0.4s, v17.4s\n" + "smax v18.4s, v0.4s, v18.4s\n" + "smax v19.4s, v0.4s, v19.4s\n" + "smax v20.4s, v0.4s, v20.4s\n" + + "smin v5.4s, v30.4s, v5.4s\n" + "smin v6.4s, v30.4s, v6.4s\n" + "smin v7.4s, v30.4s, v7.4s\n" + "smin v8.4s, v30.4s, v8.4s\n" + "smin v9.4s, v30.4s, v9.4s\n" + "smin v10.4s, v30.4s, v10.4s\n" + "smin v11.4s, v30.4s, v11.4s\n" + "smin v12.4s, v30.4s, v12.4s\n" + "smin v13.4s, v30.4s, v13.4s\n" + "smin v14.4s, v30.4s, v14.4s\n" + "smin v15.4s, v30.4s, v15.4s\n" + "smin v16.4s, v30.4s, v16.4s\n" + "smin v17.4s, v30.4s, v17.4s\n" + "smin v18.4s, v30.4s, v18.4s\n" + "smin v19.4s, v30.4s, v19.4s\n" + "smin v20.4s, v30.4s, v20.4s\n" + + // No need to quantize for ReLU6 + "sqshl v5.4s, v5.4s, #2\n" + "sqshl v6.4s, v6.4s, #2\n" + "sqshl v7.4s, v7.4s, #2\n" + "sqshl v8.4s, v8.4s, #2\n" + "sqshl v9.4s, v9.4s, #2\n" + "sqshl v10.4s, v10.4s, #2\n" + "sqshl v11.4s, v11.4s, #2\n" + "sqshl v12.4s, v12.4s, #2\n" + "sqshl v13.4s, v13.4s, #2\n" + "sqshl v14.4s, v14.4s, #2\n" + "sqshl v15.4s, v15.4s, #2\n" + "sqshl v16.4s, v16.4s, #2\n" + "sqshl v17.4s, v17.4s, #2\n" + "sqshl v18.4s, v18.4s, #2\n" + "sqshl v19.4s, v19.4s, #2\n" + "sqshl v20.4s, v20.4s, #2\n" + + "sqshrn v5.4h, v5.4s, #1\n" + "sqshrn v9.4h, v9.4s, #1\n" + "sqshrn2 v5.8h, v7.4s, #1\n" + "sqshrn2 v9.8h, v11.4s, #1\n" + + "sqshrn v13.4h, v13.4s, #1\n" + "sqshrn v17.4h, v17.4s, #1\n" + "sqshrn2 v13.8h, v15.4s, #1\n" + "sqshrn2 v17.8h, v19.4s, #1\n" + + "sqshrn v5.8b, v5.8h, #1\n" + "sqshrn v13.8b, v13.8h, #1\n" + + "sqshrn2 v5.16b, v9.8h, #1\n" + "sqshrn2 v13.16b, v17.8h, #1\n" + 
"str q5, [%[in0]]\n" + "str q13, [%[in0], #16]\n" + + "sqshrn v6.4h, v6.4s, #1\n" + "sqshrn v10.4h, v10.4s, #1\n" + "sqshrn2 v6.8h, v8.4s, #1\n" + "sqshrn2 v10.8h, v12.4s, #1\n" + + "sqshrn v14.4h, v14.4s, #1\n" + "sqshrn v18.4h, v18.4s, #1\n" + "sqshrn2 v14.8h, v16.4s, #1\n" + "sqshrn2 v18.8h, v20.4s, #1\n" + + "sqshrn v6.8b, v6.8h, #1\n" + "sqshrn v14.8b, v14.8h, #1\n" + + "sqshrn2 v6.16b, v10.8h, #1\n" + "sqshrn2 v14.16b, v18.8h, #1\n" + "str q6, [%[in1]]\n" + "str q14, [%[in1], #16]\n" + : + : [in0] "r"(pw_in0), [in1] "r"(pw_in1) + : "memory", "cc", "v0", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v30"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (depthwiseActivationParamSpec.mode != ACTIVATION_RELU6) { + __asm__ __volatile__("str q5, [%[pw0]]\n" + "str q7, [%[pw0], #16]\n" + "str q9, [%[pw0], #32]\n" + "str q11, [%[pw0], #48]\n" + "str q13, [%[pw0], #64]\n" + "str q15, [%[pw0], #80]\n" + "str q17, [%[pw0], #96]\n" + "str q19, [%[pw0], #112]\n" + + "str q6, [%[pw1]]\n" + "str q8, [%[pw1], #16]\n" + "str q10, [%[pw1], #32]\n" + "str q12, [%[pw1], #48]\n" + "str q14, [%[pw1], #64]\n" + "str q16, [%[pw1], #80]\n" + "str q18, [%[pw1], #96]\n" + "str q20, [%[pw1], #112]\n" + : + : [pw0] "r"(pw_pack_0), [pw1] "r"(pw_pack_1) + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20"); + } + ohow_s += 8; + ohow_tail -= 8; + } + + if (ohow_tail >= 4) { + U32 hw = ohow_s; + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + I32 *pw_pack_0 = dw_out + hw * ic * 8 + c * 4 * 8; + I32 *pw_pack_1 = pw_pack_0 + 16; + // TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. 
+ __asm__ __volatile__("ldr d29, [%[b]]\n" // b_0 + "ldr x1, [%[b], #8]\n" + "ins v29.d[1], x1\n" + "ldr d30, [%[b], #16]\n" // b_1 + "ldr x2, [%[b], #24]\n" + "ins v30.d[1], x2\n" + "mov v5.16b, v29.16b\n" + "mov v7.16b, v29.16b\n" + "mov v9.16b, v29.16b\n" + "mov v11.16b, v29.16b\n" + + "mov v6.16b, v30.16b\n" + "mov v8.16b, v30.16b\n" + "mov v10.16b, v30.16b\n" + "mov v12.16b, v30.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v29", "v30", "x1", "x2"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const INT8 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + INT8 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + INT8 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + INT8 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + INT8 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + __asm__ __volatile__("ldr d29, [%[f0]]\n" + "ldr d0, [%[in0]]\n" + "ldr d1, [%[in1]]\n" + "ldr d2, [%[in2]]\n" + "sshll v29.8h, v29.8b, #0\n" + "ldr d30, [%[in3]]\n" + "sshll v0.8h, v0.8b, #0\n" + "sshll v1.8h, v1.8b, #0\n" + + "smlal v5.4s, v29.4h, v0.4h\n" + "sshll v2.8h, v2.8b, #0\n" + "smlal2 v6.4s, v29.8h, v0.8h\n" + "sshll v30.8h, v30.8b, #0\n" + "smlal v7.4s, v29.4h, v1.4h\n" + "smlal2 v8.4s, v29.8h, v1.8h\n" + "smlal v9.4s, v29.4h, v2.4h\n" + "smlal2 v10.4s, v29.8h, v2.8h\n" + "smlal v11.4s, v29.4h, v30.4h\n" + "smlal2 v12.4s, v29.8h, v30.8h\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), + [in3] "r"(in_3), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v29", "v30"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: { + break; + } + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v0.16b, v0.16b, v0.16b\n" // zero + + "smax v5.4s, v0.4s, v5.4s\n" + "smax v6.4s, v0.4s, v6.4s\n" + "smax v7.4s, v0.4s, v7.4s\n" + "smax v8.4s, v0.4s, v8.4s\n" + "smax v9.4s, v0.4s, v9.4s\n" + "smax v10.4s, v0.4s, v10.4s\n" + "smax v11.4s, v0.4s, v11.4s\n" + "smax v12.4s, v0.4s, v12.4s\n" + : + : + : "memory", "cc", "v0", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12"); + break; + } + case ACTIVATION_RELU6: { + INT8 *pw_in0 = pwArray + hw * ic * 8 + c * 4 * 8; + INT8 *pw_in1 = pw_in0 + 16; + __asm__ __volatile__("eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v30.4s, #6\n" // six + + "smax v5.4s, v0.4s, v5.4s\n" + "smax v6.4s, v0.4s, v6.4s\n" + "smax v7.4s, v0.4s, v7.4s\n" + "smax v8.4s, v0.4s, v8.4s\n" + "smax v9.4s, v0.4s, v9.4s\n" + "smax v10.4s, v0.4s, v10.4s\n" + "smax v11.4s, v0.4s, v11.4s\n" + "smax v12.4s, v0.4s, v12.4s\n" + + "smin v5.4s, v30.4s, v5.4s\n" + "smin v6.4s, v30.4s, v6.4s\n" + "smin v7.4s, v30.4s, v7.4s\n" + "smin v8.4s, v30.4s, v8.4s\n" + "smin v9.4s, v30.4s, v9.4s\n" + "smin v10.4s, v30.4s, v10.4s\n" + "smin v11.4s, v30.4s, v11.4s\n" + "smin v12.4s, v30.4s, v12.4s\n" + + // No need to quantize for ReLU6 + "sqshl v5.4s, v5.4s, #2\n" + "sqshl v6.4s, v6.4s, #2\n" + "sqshl v7.4s, v7.4s, #2\n" + "sqshl v8.4s, v8.4s, #2\n" + "sqshl v9.4s, v9.4s, #2\n" + "sqshl v10.4s, v10.4s, #2\n" + "sqshl v11.4s, v11.4s, #2\n" + "sqshl v12.4s, v12.4s, #2\n" + + "sqshrn v5.4h, v5.4s, #1\n" + "sqshrn v9.4h, v9.4s, #1\n" + "sqshrn2 v5.8h, v7.4s, #1\n" + "sqshrn2 v9.8h, v11.4s, #1\n" + + "sqshrn v5.8b, v5.8h, #1\n" + "sqshrn2 v5.16b, v9.8h, #1\n" + "str q5, [%[in0]]\n" + + "sqshrn v6.4h, v6.4s, #1\n" + "sqshrn v10.4h, 
v10.4s, #1\n" + "sqshrn2 v6.8h, v8.4s, #1\n" + "sqshrn2 v10.8h, v12.4s, #1\n" + + "sqshrn v6.8b, v6.8h, #1\n" + + "sqshrn2 v6.16b, v10.8h, #1\n" + "str q6, [%[in1]]\n" + : + : [in0] "r"(pw_in0), [in1] "r"(pw_in1) + : "memory", "cc", "v0", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v30"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (depthwiseActivationParamSpec.mode != ACTIVATION_RELU6) { + __asm__ __volatile__( + "str q5, [%[pw0]]\n" + "str q7, [%[pw0], #16]\n" + "str q9, [%[pw0], #32]\n" + "str q11, [%[pw0], #48]\n" + + "str q6, [%[pw1]]\n" + "str q8, [%[pw1], #16]\n" + "str q10, [%[pw1], #32]\n" + "str q12, [%[pw1], #48]\n" + : + : [pw0] "r"(pw_pack_0), [pw1] "r"(pw_pack_1) + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12"); + } + ohow_s += 4; + ohow_tail -= 4; + } + + // ohow_reminder % 4 + for (I32 hw = ohow_s; hw < ohow; hw++) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + I32 *pw_pack_0 = dw_out + hw * ic * 8 + c * 8; + I32 *pw_pack_1 = pw_pack_0 + 4; + // TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. + __asm__ __volatile__("ldr d5, [%[b]]\n" // b_0 + "ldr x1, [%[b], #8]\n" + "ins v5.d[1], x1\n" + "ldr d6, [%[b], #16]\n" // b_1 + "ldr x2, [%[b], #24]\n" + "ins v6.d[1], x2\n" + : + : [b] "r"(b) + : "memory", "cc", "v5", "v6", "x1", "x2"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const INT8 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + INT8 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + __asm__ __volatile__("ldr d29, [%[f0]]\n" + "ldr d0, [%[in0]]\n" + "sshll v29.8h, v29.8b, #0\n" + "sshll v0.8h, v0.8b, #0\n" + "smlal v5.4s, v29.4h, v0.4h\n" + "smlal2 v6.4s, v29.8h, v0.8h\n" + : + : [in0] "r"(in_0), [f0] "r"(f_0) + : "memory", "cc", "v0", "v5", "v6", "v29"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: { + break; + } + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v0.16b, v0.16b, v0.16b\n" // zero + + "smax v5.4s, v0.4s, v5.4s\n" + "smax v6.4s, v0.4s, v6.4s\n" + : + : + : "memory", "cc", "v0", "v5", "v6"); + break; + } + case ACTIVATION_RELU6: { + INT8 *pw_in0 = pwArray + hw * ic * 8 + c * 8; + __asm__ __volatile__("eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v30.4s, #6\n" // six + + "smax v5.4s, v0.4s, v5.4s\n" + "smax v6.4s, v0.4s, v6.4s\n" + + "smin v5.4s, v30.4s, v5.4s\n" + "smin v6.4s, v30.4s, v6.4s\n" + + // No need to quantize for ReLU6 + "sqshl v5.4s, v5.4s, #2\n" + "sqshl v6.4s, v6.4s, #2\n" + + "sqshrn v5.4h, v5.4s, #1\n" + "sqshrn2 v5.8h, v6.4s, #1\n" + + "sqshrn v5.8b, v5.8h, #1\n" + "str d5, [%[in0]]\n" + : + : [in0] "r"(pw_in0) + : "memory", "cc", "v0", "v5", "v6", "v30"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (depthwiseActivationParamSpec.mode != ACTIVATION_RELU6) { + __asm__ __volatile__("str q5, [%[pw0]]\n" + "str q6, [%[pw1]]\n" + : + : [pw0] "r"(pw_pack_0), [pw1] "r"(pw_pack_1) + : "memory", "cc", "v5", "v6"); + } + } + } + + I32 scale = 1; + if (depthwiseActivationParamSpec.mode != ACTIVATION_RELU6) { + // quantization + I32 factor = 16777216; // 24 bits + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: { + I32 max_s = dw_out[0]; + I32 min_s = dw_out[0]; + for (U32 i = 1; i < ohow * ic * 8; i++) { + I32 cur = dw_out[i]; + if (cur > max_s) { + max_s = cur; + } + if (cur < min_s) { + min_s = cur; + } + } + + if (max_s 
<= 127 && min_s >= -127) { // No need to scale + break; + } + + if (max_s == 0 && min_s == 0) { + break; + } + + if (max_s > 0 && min_s < 0) { + I32 factor_p = (factor * 127) / max_s; + I32 factor_n = (factor * -127) / min_s; + factor = (factor_p < factor_n) ? factor_p : factor_n; + } else if (max_s < 0) { + factor = (factor * -127) / min_s; + } else { // min_s > 0 + factor = (factor * 127) / max_s; + } + scale = 16777216 / factor; + break; + } + case ACTIVATION_RELU: { + I32 max_s = dw_out[0]; + for (U32 i = 1; i < ohow * ic * 8; i++) { + I32 cur = dw_out[i]; + if (cur > max_s) { + max_s = cur; + } + } + if (max_s <= 127) { // No need to scale + break; + } + + if (max_s == 0) { + break; + } + + factor = (factor * 127) / max_s; + scale = 16777216 / factor; + break; + } + default: + return NOT_SUPPORTED; + } + I32 factor_v[4]; + for (U32 i = 0; i < 4; i++) { + factor_v[i] = factor; + } + __asm__ __volatile__("ldr q0, [%[factor]]\n" + "mov x0, %[dw_out]\n" + "mov x1, %[pw_in]\n" + "mov x2, %[num]\n" + "0:\n" + "ldr q1, [x0], #16\n" + "ldr q2, [x0], #16\n" + "mul v1.4s, v0.4s, v1.4s\n" + "mul v2.4s, v0.4s, v2.4s\n" + + "shrn v1.4h, v1.4s, #16\n" + "shrn2 v1.8h, v2.4s, #16\n" + + "shrn v1.8b, v1.8h, #8\n" + "subs x2, x2, #8\n" + + "str d1, [x1], #8\n" + "bne 0b\n" + : + : [factor] "r"(factor_v), [dw_out] "r"(dw_out), + [pw_in] "r"(pwArray), [num] "r"((I64)ohow * ic * 8) + : "memory", "cc", "v0", "v1", "v2", "x0", "x1", "x2"); + } + + I32 scale_v[4]; + for (U32 i = 0; i < 4; i++) { + scale_v[i] = scale; + } + + // pw_conv + const INT8 *f_base = pwFilterArray; + + // ohow / 12 + for (I32 hw = 0; hw < ohow - 11; hw += 12) { + const I32 *b0 = pwBiasArray; + const I32 *b1 = b0 + 4; + INT8 *in_pack = pwArray + hw * ic * 8; + for (U32 o = 0; o < oc; o++) { + INT8 *in_hw0 = in_pack; + const INT8 *f_o0c0 = f_base + o * 8 * ic * 8; + I32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const I32 *b_0 = b0; + const I32 *b_1 = b1; + __asm__ __volatile__( + // Bias should be applied after scaling + "eor v5.16b, v5.16b, v5.16b\n" + "ldr d1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "ldr x1, [%[in_0], #8]\n" + "eor v7.16b, v7.16b, v7.16b\n" + "ins v1.d[1], x1\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" + "ldr x2, [%[f_0], #8]\n" + "eor v10.16b, v10.16b, v10.16b\n" + "ins v0.d[1], x2\n" + "eor v11.16b, v11.16b, v11.16b\n" + "ldr d3, [%[in_0], #16]\n" // in_1 + "eor v12.16b, v12.16b, v12.16b\n" + "ldr x3, [%[in_0], #24]\n" + "eor v13.16b, v13.16b, v13.16b\n" + "ins v3.d[1], x3\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d2, [x3, 32]\n" + "ldr x16, [x3, 40]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ldr d29, [x0, 16]\n" + "ldr x17, [x0, 24]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "ins v2.d[1], x16\n" + "ldr d30, [x3, 48]!\n" + "sdot v11.4s, 
v0.16b, v1.4b[3]\n" + "ins v29.d[1], x17\n" + + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "ldr x16, [x3, 8]\n" + "subs x2, x2, #4\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "ins v30.d[1], x16\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + + "sdot v21.4s, v0.16b, v2.4b[0]\n" + "sdot v23.4s, v0.16b, v2.4b[1]\n" + "sdot v25.4s, v0.16b, v2.4b[2]\n" + "sdot v27.4s, v0.16b, v2.4b[3]\n" + + "sdot v14.4s, v29.16b, v3.4b[0]\n" + "sdot v16.4s, v29.16b, v3.4b[1]\n" + "ldr d0, [x0, 32]!\n" + "ldr x17, [x0, 8]\n" + "sdot v18.4s, v29.16b, v3.4b[2]\n" + "sdot v20.4s, v29.16b, v3.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "ldr d3, [x3, 16]\n" + "ldr x16, [x3, 24]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + + "ins v0.d[1], x17\n" + "ins v3.d[1], x16\n" + + "sdot v22.4s, v29.16b, v2.4b[0]\n" + "mov v1.16b, v30.16b\n" + "sdot v24.4s, v29.16b, v2.4b[1]\n" + "sdot v26.4s, v29.16b, v2.4b[2]\n" + "sdot v28.4s, v29.16b, v2.4b[3]\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu6]\n" // No need to scale for relu6 + "ldr q3, [%[b_0]]\n" + "ldr q4, [%[b_1]]\n" + "beq 11f\n" + + "ldr q0, [%[scale]]\n" + "mul v5.4s, v0.4s, v5.4s\n" + "mul v6.4s, v0.4s, v6.4s\n" + "mul v7.4s, v0.4s, v7.4s\n" + "mul v8.4s, v0.4s, v8.4s\n" + "mul v9.4s, v0.4s, v9.4s\n" + "mul v10.4s, v0.4s, v10.4s\n" + "mul v11.4s, v0.4s, v11.4s\n" + "mul v12.4s, v0.4s, v12.4s\n" + "mul v13.4s, v0.4s, v13.4s\n" + "mul v14.4s, v0.4s, v14.4s\n" + "mul v15.4s, v0.4s, v15.4s\n" + "mul v16.4s, v0.4s, v16.4s\n" + "mul v17.4s, v0.4s, v17.4s\n" + "mul v18.4s, v0.4s, v18.4s\n" + "mul v19.4s, v0.4s, v19.4s\n" + "mul v20.4s, v0.4s, v20.4s\n" + "mul v21.4s, v0.4s, v21.4s\n" + "mul v22.4s, v0.4s, v22.4s\n" + "mul v23.4s, v0.4s, v23.4s\n" + "mul v24.4s, v0.4s, v24.4s\n" + "mul v25.4s, v0.4s, v25.4s\n" + "mul v26.4s, v0.4s, v26.4s\n" + "mul v27.4s, v0.4s, v27.4s\n" + "mul v28.4s, v0.4s, v28.4s\n" + + "add v5.4s, v3.4s, v5.4s\n" + "add v6.4s, v4.4s, v6.4s\n" + "add v7.4s, v3.4s, v7.4s\n" + "add v8.4s, v4.4s, v8.4s\n" + "add v9.4s, v3.4s, v9.4s\n" + "add v10.4s, v4.4s, v10.4s\n" + "add v11.4s, v3.4s, v11.4s\n" + "add v12.4s, v4.4s, v12.4s\n" + "add v13.4s, v3.4s, v13.4s\n" + "add v14.4s, v4.4s, v14.4s\n" + "add v15.4s, v3.4s, v15.4s\n" + "add v16.4s, v4.4s, v16.4s\n" + "add v17.4s, v3.4s, v17.4s\n" + "add v18.4s, v4.4s, v18.4s\n" + "add v19.4s, v3.4s, v19.4s\n" + "add v20.4s, v4.4s, v20.4s\n" + "add v21.4s, v3.4s, v21.4s\n" + "add v22.4s, v4.4s, v22.4s\n" + "add v23.4s, v3.4s, v23.4s\n" + "add v24.4s, v4.4s, v24.4s\n" + "add v25.4s, v3.4s, v25.4s\n" + "add v26.4s, v4.4s, v26.4s\n" + "add v27.4s, v3.4s, v27.4s\n" + "add v28.4s, v4.4s, v28.4s\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 13f\n" + "eor v1.16b, v1.16b, v1.16b\n" // zero + "smax v5.4s, v5.4s, v1.4s\n" + "smax v6.4s, v6.4s, v1.4s\n" + "smax v7.4s, v7.4s, v1.4s\n" + "smax v8.4s, v8.4s, v1.4s\n" + "smax v9.4s, v9.4s, v1.4s\n" + "smax v10.4s, v10.4s, v1.4s\n" + "smax v11.4s, v11.4s, v1.4s\n" + "smax v12.4s, v12.4s, v1.4s\n" + "smax v13.4s, v13.4s, v1.4s\n" + "smax v14.4s, v14.4s, v1.4s\n" + "smax v15.4s, v15.4s, v1.4s\n" + "smax v16.4s, v16.4s, v1.4s\n" + "smax v17.4s, v17.4s, v1.4s\n" + "smax v18.4s, v18.4s, v1.4s\n" + "smax v19.4s, v19.4s, v1.4s\n" + "smax v20.4s, v20.4s, v1.4s\n" + "smax v21.4s, v21.4s, v1.4s\n" + "smax v22.4s, v22.4s, v1.4s\n" + "smax v23.4s, v23.4s, v1.4s\n" + "smax v24.4s, v24.4s, v1.4s\n" + "smax v25.4s, v25.4s, v1.4s\n" + "smax v26.4s, v26.4s, v1.4s\n" + "smax 
v27.4s, v27.4s, v1.4s\n" + "smax v28.4s, v28.4s, v1.4s\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 13f\n" + // Apply bias + "add v5.4s, v3.4s, v5.4s\n" + "add v6.4s, v4.4s, v6.4s\n" + "add v7.4s, v3.4s, v7.4s\n" + "add v8.4s, v4.4s, v8.4s\n" + "add v9.4s, v3.4s, v9.4s\n" + "add v10.4s, v4.4s, v10.4s\n" + "add v11.4s, v3.4s, v11.4s\n" + "add v12.4s, v4.4s, v12.4s\n" + "add v13.4s, v3.4s, v13.4s\n" + "add v14.4s, v4.4s, v14.4s\n" + "add v15.4s, v3.4s, v15.4s\n" + "add v16.4s, v4.4s, v16.4s\n" + "add v17.4s, v3.4s, v17.4s\n" + "add v18.4s, v4.4s, v18.4s\n" + "add v19.4s, v3.4s, v19.4s\n" + "add v20.4s, v4.4s, v20.4s\n" + "add v21.4s, v3.4s, v21.4s\n" + "add v22.4s, v4.4s, v22.4s\n" + "add v23.4s, v3.4s, v23.4s\n" + "add v24.4s, v4.4s, v24.4s\n" + "add v25.4s, v3.4s, v25.4s\n" + "add v26.4s, v4.4s, v26.4s\n" + "add v27.4s, v3.4s, v27.4s\n" + "add v28.4s, v4.4s, v28.4s\n" + + "eor v1.16b, v0.16b, v0.16b\n" // zero + "movi v2.4s, #6\n" // six + "smax v5.4s, v5.4s, v1.4s\n" + "smax v6.4s, v6.4s, v1.4s\n" + "smax v7.4s, v7.4s, v1.4s\n" + "smax v8.4s, v8.4s, v1.4s\n" + "smax v9.4s, v9.4s, v1.4s\n" + "smax v10.4s, v10.4s, v1.4s\n" + "smax v11.4s, v11.4s, v1.4s\n" + "smax v12.4s, v12.4s, v1.4s\n" + "smax v13.4s, v13.4s, v1.4s\n" + "smax v14.4s, v14.4s, v1.4s\n" + "smax v15.4s, v15.4s, v1.4s\n" + "smax v16.4s, v16.4s, v1.4s\n" + "smax v17.4s, v17.4s, v1.4s\n" + "smax v18.4s, v18.4s, v1.4s\n" + "smax v19.4s, v19.4s, v1.4s\n" + "smax v20.4s, v20.4s, v1.4s\n" + "smax v21.4s, v21.4s, v1.4s\n" + "smax v22.4s, v22.4s, v1.4s\n" + "smax v23.4s, v23.4s, v1.4s\n" + "smax v24.4s, v24.4s, v1.4s\n" + "smax v25.4s, v25.4s, v1.4s\n" + "smax v26.4s, v26.4s, v1.4s\n" + "smax v27.4s, v27.4s, v1.4s\n" + "smax v28.4s, v28.4s, v1.4s\n" + + "smin v5.4s, v5.4s, v2.4s\n" + "smin v6.4s, v6.4s, v2.4s\n" + "smin v7.4s, v7.4s, v2.4s\n" + "smin v8.4s, v8.4s, v2.4s\n" + "smin v9.4s, v9.4s, v2.4s\n" + "smin v10.4s, v10.4s, v2.4s\n" + "smin v11.4s, v11.4s, v2.4s\n" + "smin v12.4s, v12.4s, v2.4s\n" + "smin v13.4s, v13.4s, v2.4s\n" + "smin v14.4s, v14.4s, v2.4s\n" + "smin v15.4s, v15.4s, v2.4s\n" + "smin v16.4s, v16.4s, v2.4s\n" + "smin v17.4s, v17.4s, v2.4s\n" + "smin v18.4s, v18.4s, v2.4s\n" + "smin v19.4s, v19.4s, v2.4s\n" + "smin v20.4s, v20.4s, v2.4s\n" + "smin v21.4s, v21.4s, v2.4s\n" + "smin v22.4s, v22.4s, v2.4s\n" + "smin v23.4s, v23.4s, v2.4s\n" + "smin v24.4s, v24.4s, v2.4s\n" + "smin v25.4s, v25.4s, v2.4s\n" + "smin v26.4s, v26.4s, v2.4s\n" + "smin v27.4s, v27.4s, v2.4s\n" + "smin v28.4s, v28.4s, v2.4s\n" + + "13:\n" + "str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + "str q13, [%[out_0], #128]\n" + "str q14, [%[out_0], #144]\n" + "str q15, [%[out_0], #160]\n" + "str q16, [%[out_0], #176]\n" + "str q17, [%[out_0], #192]\n" + "str q18, [%[out_0], #208]\n" + "str q19, [%[out_0], #224]\n" + "str q20, [%[out_0], #240]\n" + "str q21, [%[out_0], #256]\n" + "str q22, [%[out_0], #272]\n" + "str q23, [%[out_0], #288]\n" + "str q24, [%[out_0], #304]\n" + "str q25, [%[out_0], #320]\n" + "str q26, [%[out_0], #336]\n" + "str q27, [%[out_0], #352]\n" + "str q28, [%[out_0], #368]\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_0), [b_1] "r"(b_1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), 
[am_relu6] "r"((I64)ACTIVATION_RELU6), + [scale] "r"(scale_v) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", + "x1", "x2", "x3", "x17", "x16"); + b0 += 8; + b1 += 8; + } + } + + // ohow_reminder % 12 / 8 + U32 ohow_s = (ohow / 12) * 12; + U32 ohow_tail = ohow - ohow_s; + + if (ohow_tail >= 8) { + U32 hw = ohow_s; + const I32 *b0 = pwBiasArray; + const I32 *b1 = b0 + 4; + INT8 *in_pack = pwArray + hw * ic * 8; + for (U32 o = 0; o < oc; o++) { + INT8 *in_hw0 = in_pack; + const INT8 *f_o0c0 = f_base + o * 8 * ic * 8; + I32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const I32 *b_0 = b0; + const I32 *b_1 = b1; + __asm__ __volatile__( + // Bias should be applied after scaling + "eor v5.16b, v5.16b, v5.16b\n" + "ldr d1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "ldr x1, [%[in_0], #8]\n" + "eor v7.16b, v7.16b, v7.16b\n" + "ins v1.d[1], x1\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" + "ldr x2, [%[f_0], #8]\n" + "eor v10.16b, v10.16b, v10.16b\n" + "ins v0.d[1], x2\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + "eor v13.16b, v13.16b, v13.16b\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d3, [x3, 16]!\n" + "ldr x16, [x3, 8]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ldr d29, [x0, 16]\n" + "ldr x17, [x0, 24]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "ins v3.d[1], x16\n" + "ldr d30, [x3, 16]!\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + "ins v29.d[1], x17\n" + + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "ldr x16, [x3, 8]\n" + "subs x2, x2, #4\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "ins v30.d[1], x16\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "ldr d0, [x0, 32]!\n" + "ldr x17, [x0, 8]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + + "sdot v14.4s, v29.16b, v3.4b[0]\n" + "ins v0.d[1], x17\n" + "mov v1.16b, v30.16b\n" + "sdot v16.4s, v29.16b, v3.4b[1]\n" + "sdot v18.4s, v29.16b, v3.4b[2]\n" + "sdot v20.4s, v29.16b, v3.4b[3]\n" + + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu6]\n" // No need to scale for relu6 + "ldr q3, [%[b_0]]\n" + "ldr q4, [%[b_1]]\n" + "beq 11f\n" + + "ldr q0, [%[scale]]\n" + "mul v5.4s, v0.4s, v5.4s\n" + "mul v6.4s, v0.4s, v6.4s\n" + "mul v7.4s, v0.4s, v7.4s\n" + "mul v8.4s, v0.4s, v8.4s\n" + "mul v9.4s, v0.4s, v9.4s\n" + "mul v10.4s, v0.4s, v10.4s\n" + "mul v11.4s, v0.4s, v11.4s\n" + "mul v12.4s, v0.4s, v12.4s\n" + "mul v13.4s, v0.4s, v13.4s\n" + "mul v14.4s, v0.4s, v14.4s\n" + "mul v15.4s, v0.4s, v15.4s\n" + "mul v16.4s, v0.4s, v16.4s\n" + "mul v17.4s, v0.4s, v17.4s\n" + "mul v18.4s, v0.4s, v18.4s\n" + "mul v19.4s, v0.4s, v19.4s\n" + "mul v20.4s, v0.4s, v20.4s\n" + + "add v5.4s, v3.4s, v5.4s\n" + "add v6.4s, v4.4s, v6.4s\n" + "add v7.4s, v3.4s, v7.4s\n" + "add v8.4s, v4.4s, v8.4s\n" + "add v9.4s, v3.4s, v9.4s\n" + "add v10.4s, v4.4s, v10.4s\n" 
+ "add v11.4s, v3.4s, v11.4s\n" + "add v12.4s, v4.4s, v12.4s\n" + "add v13.4s, v3.4s, v13.4s\n" + "add v14.4s, v4.4s, v14.4s\n" + "add v15.4s, v3.4s, v15.4s\n" + "add v16.4s, v4.4s, v16.4s\n" + "add v17.4s, v3.4s, v17.4s\n" + "add v18.4s, v4.4s, v18.4s\n" + "add v19.4s, v3.4s, v19.4s\n" + "add v20.4s, v4.4s, v20.4s\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 13f\n" + "eor v1.16b, v1.16b, v1.16b\n" // zero + "smax v5.4s, v5.4s, v1.4s\n" + "smax v6.4s, v6.4s, v1.4s\n" + "smax v7.4s, v7.4s, v1.4s\n" + "smax v8.4s, v8.4s, v1.4s\n" + "smax v9.4s, v9.4s, v1.4s\n" + "smax v10.4s, v10.4s, v1.4s\n" + "smax v11.4s, v11.4s, v1.4s\n" + "smax v12.4s, v12.4s, v1.4s\n" + "smax v13.4s, v13.4s, v1.4s\n" + "smax v14.4s, v14.4s, v1.4s\n" + "smax v15.4s, v15.4s, v1.4s\n" + "smax v16.4s, v16.4s, v1.4s\n" + "smax v17.4s, v17.4s, v1.4s\n" + "smax v18.4s, v18.4s, v1.4s\n" + "smax v19.4s, v19.4s, v1.4s\n" + "smax v20.4s, v20.4s, v1.4s\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 13f\n" + // Apply bias + "add v5.4s, v3.4s, v5.4s\n" + "add v6.4s, v4.4s, v6.4s\n" + "add v7.4s, v3.4s, v7.4s\n" + "add v8.4s, v4.4s, v8.4s\n" + "add v9.4s, v3.4s, v9.4s\n" + "add v10.4s, v4.4s, v10.4s\n" + "add v11.4s, v3.4s, v11.4s\n" + "add v12.4s, v4.4s, v12.4s\n" + "add v13.4s, v3.4s, v13.4s\n" + "add v14.4s, v4.4s, v14.4s\n" + "add v15.4s, v3.4s, v15.4s\n" + "add v16.4s, v4.4s, v16.4s\n" + "add v17.4s, v3.4s, v17.4s\n" + "add v18.4s, v4.4s, v18.4s\n" + "add v19.4s, v3.4s, v19.4s\n" + "add v20.4s, v4.4s, v20.4s\n" + + "eor v1.16b, v0.16b, v0.16b\n" // zero + "movi v2.4s, #6\n" // six + "smax v5.4s, v5.4s, v1.4s\n" + "smax v6.4s, v6.4s, v1.4s\n" + "smax v7.4s, v7.4s, v1.4s\n" + "smax v8.4s, v8.4s, v1.4s\n" + "smax v9.4s, v9.4s, v1.4s\n" + "smax v10.4s, v10.4s, v1.4s\n" + "smax v11.4s, v11.4s, v1.4s\n" + "smax v12.4s, v12.4s, v1.4s\n" + "smax v13.4s, v13.4s, v1.4s\n" + "smax v14.4s, v14.4s, v1.4s\n" + "smax v15.4s, v15.4s, v1.4s\n" + "smax v16.4s, v16.4s, v1.4s\n" + "smax v17.4s, v17.4s, v1.4s\n" + "smax v18.4s, v18.4s, v1.4s\n" + "smax v19.4s, v19.4s, v1.4s\n" + "smax v20.4s, v20.4s, v1.4s\n" + + "smin v5.4s, v5.4s, v2.4s\n" + "smin v6.4s, v6.4s, v2.4s\n" + "smin v7.4s, v7.4s, v2.4s\n" + "smin v8.4s, v8.4s, v2.4s\n" + "smin v9.4s, v9.4s, v2.4s\n" + "smin v10.4s, v10.4s, v2.4s\n" + "smin v11.4s, v11.4s, v2.4s\n" + "smin v12.4s, v12.4s, v2.4s\n" + "smin v13.4s, v13.4s, v2.4s\n" + "smin v14.4s, v14.4s, v2.4s\n" + "smin v15.4s, v15.4s, v2.4s\n" + "smin v16.4s, v16.4s, v2.4s\n" + "smin v17.4s, v17.4s, v2.4s\n" + "smin v18.4s, v18.4s, v2.4s\n" + "smin v19.4s, v19.4s, v2.4s\n" + "smin v20.4s, v20.4s, v2.4s\n" + + "13:\n" + "str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + "str q13, [%[out_0], #128]\n" + "str q14, [%[out_0], #144]\n" + "str q15, [%[out_0], #160]\n" + "str q16, [%[out_0], #176]\n" + "str q17, [%[out_0], #192]\n" + "str q18, [%[out_0], #208]\n" + "str q19, [%[out_0], #224]\n" + "str q20, [%[out_0], #240]\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_0), [b_1] "r"(b_1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [scale] "r"(scale_v) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", 
"v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v29", "v30", "x0", "x1", "x2", "x3", "x17", "x16"); + b0 += 8; + b1 += 8; + } + ohow_s += 8; + ohow_tail -= 8; + } + + if (ohow_tail >= 4) { + U32 hw = ohow_s; + const I32 *b0 = pwBiasArray; + const I32 *b1 = b0 + 4; + INT8 *in_pack = pwArray + hw * ic * 8; + for (U32 o = 0; o < oc; o++) { + INT8 *in_hw0 = in_pack; + const INT8 *f_o0c0 = f_base + o * 8 * ic * 8; + I32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const I32 *b_0 = b0; + const I32 *b_1 = b1; + __asm__ __volatile__( + // Bias should be applied after scaling + "eor v5.16b, v5.16b, v5.16b\n" + "ldr d1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "ldr x1, [%[in_0], #8]\n" + "eor v7.16b, v7.16b, v7.16b\n" + "ins v1.d[1], x1\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" + "ldr x2, [%[f_0], #8]\n" + "eor v10.16b, v10.16b, v10.16b\n" + "ins v0.d[1], x2\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "ldr d29, [x0, 16]\n" + "ldr x17, [x0, 24]\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d3, [x3, 16]!\n" + "ldr x16, [x3, 8]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ins v29.d[1], x17\n" + "subs x2, x2, #4\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "ins v3.d[1], x16\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "ldr d0, [x0, 32]!\n" + "ldr x17, [x0, 8]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "ins v0.d[1], x17\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + "mov v1.16b, v3.16b\n" + + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu6]\n" // No need to scale for relu6 + "ldr q3, [%[b_0]]\n" + "ldr q4, [%[b_1]]\n" + "beq 11f\n" + + "ldr q0, [%[scale]]\n" + "mul v5.4s, v0.4s, v5.4s\n" + "mul v6.4s, v0.4s, v6.4s\n" + "mul v7.4s, v0.4s, v7.4s\n" + "mul v8.4s, v0.4s, v8.4s\n" + "mul v9.4s, v0.4s, v9.4s\n" + "mul v10.4s, v0.4s, v10.4s\n" + "mul v11.4s, v0.4s, v11.4s\n" + "mul v12.4s, v0.4s, v12.4s\n" + + "add v5.4s, v3.4s, v5.4s\n" + "add v6.4s, v4.4s, v6.4s\n" + "add v7.4s, v3.4s, v7.4s\n" + "add v8.4s, v4.4s, v8.4s\n" + "add v9.4s, v3.4s, v9.4s\n" + "add v10.4s, v4.4s, v10.4s\n" + "add v11.4s, v3.4s, v11.4s\n" + "add v12.4s, v4.4s, v12.4s\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 13f\n" + "eor v1.16b, v1.16b, v1.16b\n" // zero + "smax v5.4s, v5.4s, v1.4s\n" + "smax v6.4s, v6.4s, v1.4s\n" + "smax v7.4s, v7.4s, v1.4s\n" + "smax v8.4s, v8.4s, v1.4s\n" + "smax v9.4s, v9.4s, v1.4s\n" + "smax v10.4s, v10.4s, v1.4s\n" + "smax v11.4s, v11.4s, v1.4s\n" + "smax v12.4s, v12.4s, v1.4s\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 13f\n" + // Apply bias + "add v5.4s, v3.4s, v5.4s\n" + "add v6.4s, v4.4s, v6.4s\n" + "add v7.4s, v3.4s, v7.4s\n" + "add v8.4s, v4.4s, v8.4s\n" + "add v9.4s, v3.4s, v9.4s\n" + "add v10.4s, v4.4s, v10.4s\n" + "add v11.4s, v3.4s, v11.4s\n" + "add v12.4s, v4.4s, v12.4s\n" + + "eor v1.16b, v0.16b, v0.16b\n" // zero + "movi v2.4s, #0x06\n" // six + "smax v5.4s, v5.4s, v1.4s\n" + "smax v6.4s, v6.4s, v1.4s\n" + "smax v7.4s, v7.4s, v1.4s\n" + "smax v8.4s, v8.4s, v1.4s\n" + "smax v9.4s, v9.4s, v1.4s\n" + "smax v10.4s, v10.4s, v1.4s\n" + "smax v11.4s, v11.4s, v1.4s\n" + "smax v12.4s, v12.4s, v1.4s\n" + + "smin v5.4s, v5.4s, v2.4s\n" + "smin v6.4s, v6.4s, v2.4s\n" + "smin v7.4s, v7.4s, v2.4s\n" + 
"smin v8.4s, v8.4s, v2.4s\n" + "smin v9.4s, v9.4s, v2.4s\n" + "smin v10.4s, v10.4s, v2.4s\n" + "smin v11.4s, v11.4s, v2.4s\n" + "smin v12.4s, v12.4s, v2.4s\n" + + "13:\n" + "str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_0), [b_1] "r"(b_1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [scale] "r"(scale_v) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v29", "v30", "x0", "x1", "x2", "x3", "x17", "x16"); + b0 += 8; + b1 += 8; + } + ohow_s += 4; + ohow_tail -= 4; + } + + for (I32 hw = ohow_s; hw < ohow; hw++) { + const I32 *b0 = pwBiasArray; + INT8 *in_pack = pwArray + hw * ic * 8; + + // compute + for (U32 o = 0; o < oc; o++) { + INT8 *in_hw = in_pack; + const INT8 *f_o = f_base + o * 8 * ic * 8; + I32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + + int32x4_t res[2] = {0}; + + for (U32 c = 0; c < ic; c++) { + int8x8_t in_2 = vld1_s8(in_hw); + in_hw += 8; + int8x16_t f_8o[4]; + f_8o[0] = vld1q_s8(f_o); + f_8o[1] = vld1q_s8(f_o + 16); + res[0] = vdotq_lane_s32(res[0], f_8o[0], in_2, 0); + res[1] = vdotq_lane_s32(res[1], f_8o[1], in_2, 0); + + f_8o[2] = vld1q_s8(f_o + 32); + f_8o[3] = vld1q_s8(f_o + 48); + f_o += 64; + res[0] = vdotq_lane_s32(res[0], f_8o[2], in_2, 1); + res[1] = vdotq_lane_s32(res[1], f_8o[3], in_2, 1); + } + + if (pointwiseActivationParamSpec.mode != ACTIVATION_RELU6 && scale != 1) { // Scale + int32x4_t sc = vld1q_s32(scale_v); + res[0] = vmulq_s32(res[0], sc); + res[1] = vmulq_s32(res[1], sc); + } + + int32x4_t bias[2]; + bias[0] = vld1q_s32(b0); + bias[1] = vld1q_s32(b0 + 4); + + res[0] = vaddq_s32(res[0], bias[0]); + res[1] = vaddq_s32(res[1], bias[1]); + + switch (pointwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + int32x4_t z = vdupq_n_s32(0); + res[0] = vmaxq_s32(res[0], z); + res[1] = vmaxq_s32(res[1], z); + break; + } + case ACTIVATION_RELU6: { + int32x4_t z = vdupq_n_s32(0); + int32x4_t s = vdupq_n_s32(6); + res[0] = vmaxq_s32(res[0], z); + res[1] = vmaxq_s32(res[1], z); + res[0] = vminq_s32(res[0], s); + res[1] = vminq_s32(res[1], s); + break; + } + default: + return NOT_SUPPORTED; + } + vst1q_s32(out_o0hw0, res[0]); + vst1q_s32(out_o0hw0 + 4, res[1]); + b0 += 8; + } + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/int8/pooling.cpp b/compute/tensor/src/cpu/arm/int8/pooling.cpp new file mode 100644 index 00000000..9a3bf24b --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/pooling.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/int8/tensor_computing_int8.h" + +EE pooling_c8_int8(const INT8 *input, + U32 stride, + int hstart, + int hend, + int wstart, + int wend, + INT8 *output, + PoolingParamSpec poolingParamSpec, + void *scale) +{ + EE ret = SUCCESS; + PoolingMode pm = poolingParamSpec.mode; + U32 kernelSizeH = poolingParamSpec.kernel_h; + U32 kernelSizeW = poolingParamSpec.kernel_w; + if (kernelSizeH * kernelSizeW > 256 && pm == POOLING_MEAN) { + ret = NOT_SUPPORTED; + } + short khkw = kernelSizeH * kernelSizeW; + short factor = 256 / khkw; + F32 *inputScale = (F32 *)scale; + F32 *outputScale = inputScale + 1; + switch (pm) { + case POOLING_MAX: { + *outputScale = *inputScale; + break; + } + case POOLING_MEAN: { + *outputScale = *inputScale * factor * khkw / 256; + break; + } + default: { + ret = NOT_SUPPORTED; + break; + } + } + int8x8_t in1, out1; + int16x8_t out_mean = {0}; + out1 = vdup_n_s8(-128); + short pool_size = (hend - hstart) * (wend - wstart); + for (int kernelH = hstart; kernelH < hend; kernelH++) { + for (int kernelW = wstart; kernelW < wend; kernelW++) { + const U32 index = (kernelH * stride + kernelW) * 8; + in1 = vld1_s8(input + index); + switch (pm) { + case POOLING_MAX: + out1 = vmax_s8(out1, in1); + break; + case POOLING_MEAN: + out_mean = vaddw_s8(out_mean, in1); + break; + default: + ret = NOT_SUPPORTED; + break; + } + } + } + if (pm == POOLING_MEAN) { + short pool_factor = factor * khkw / pool_size; + if (pool_factor > 1) { + out_mean = vmulq_n_s16(out_mean, pool_factor); + } + out1 = vshrn_n_s16(out_mean, 8); + } + vst1_s8(output, out1); + return ret; +} diff --git a/compute/tensor/src/cpu/arm/int8/quantize.cpp b/compute/tensor/src/cpu/arm/int8/quantize.cpp new file mode 100644 index 00000000..c0675ac5 --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/quantize.cpp @@ -0,0 +1,109 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <string.h> +#include <arm_neon.h> +#include "cpu/arm/int8/tensor_computing_int8.h" +#include "cpu/arm/int8/convolution_gemm.h" + +EE quantize_tensor_int32( + TensorDesc dDesc, const void *data, TensorDesc *qDesc, void *qData, F32 *scale) +{ + if (nullptr == data || nullptr == qDesc || nullptr == qData || nullptr == scale) { + CHECK_STATUS(NULL_POINTER); + } + DataType dt; + DataFormat df; + U32 n, c, h, w; + if (tensorIs2d(dDesc)) { + CHECK_STATUS(tensor2dGet(dDesc, &dt, &df, &n, &w)); + c = 1; + h = 1; + } else if (tensorIs3d(dDesc)) { + CHECK_STATUS(tensor3dGet(dDesc, &dt, &df, &n, &h, &w)); + c = 1; + } else { + CHECK_STATUS(tensor4dGet(dDesc, &dt, &df, &n, &c, &h, &w)); + } + switch (dt) { + case DT_I32: { + I32 *array = (I32 *)data; + int32x4_t tmp_v = vld1q_s32(array); + int32x4_t max_v = tmp_v; + int32x4_t min_v = tmp_v; + + U32 numData = n * c * h * w; + CHECK_REQUIREMENT(numData >= 4); + U32 i = 4; + for (; i < numData - 3; i += 4) { + tmp_v = vld1q_s32(array + i); + max_v = vmaxq_s32(max_v, tmp_v); + min_v = vminq_s32(min_v, tmp_v); + } + + I32 max = vmaxvq_s32(max_v); + I32 min = vminvq_s32(min_v); + for (; i < numData; i++) { + I32 tmp = array[i]; + if (tmp > max) { + max = tmp; + } + if (tmp < min) { + min = tmp; + } + } + if (max == 0 && min == 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + + I32 factor; + F32 scaleO; + if (max > 0 && min < 0) { + I32 factor_max = 127 * 16777216 / max; + I32 factor_min = -127 * 16777216 / min; + factor = (factor_max < factor_min) ? factor_max : factor_min; + scaleO = (factor_max < factor_min) ? (127.0 / max) : (-127.0 / min); + } else if (max > 0) { + factor = 127 * 16777216 / max; + scaleO = 127.0 / max; + } else { + factor = -127 * 16777216 / min; + scaleO = -127.0 / min; + } + UNI_DEBUG_LOG("%d is the max I32 value, %d is the min value, and %f is the derived " + "scale\n", + max, min, scaleO); + *scale *= scaleO; + + U32 main = numData / 16; + INT8 *qArray = (INT8 *)qData; + CHECK_STATUS(quantize_I32(main * 4, array, factor, scaleO, qArray)); + for (U32 i = main * 16; i < numData; i++) { + qArray[i] = array[i] * scaleO; + } + + if (tensorIs2d(dDesc)) { + *qDesc = tensor2df(DT_I8, df, n, w); + } else if (tensorIs3d(dDesc)) { + *qDesc = tensor3df(DT_I8, df, n, h, w); + } else { + *qDesc = tensor4df(DT_I8, df, n, c, h, w); + } + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/int8/tensor_computing_int8.h b/compute/tensor/src/cpu/arm/int8/tensor_computing_int8.h new file mode 100644 index 00000000..6fc9c92d --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/tensor_computing_int8.h @@ -0,0 +1,97 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_TENSOR_COMPUTING_INT8 +#define _H_TENSOR_COMPUTING_INT8 +#ifdef _USE_INT8 +#include <vector> +#include "sys.h" +#include "types.h" +#include "error.h" +#include "cpu/arm/int8/arm_functions_int8.h" + +EE convolution_infer_forward_algorithm_int8(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + ConvolutionForwardAlgorithm *algorithm); + +EE convolution_transform_filter_bytes_int8( + TensorDesc filterDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes); + +EE convolution_transform_filter_int8(TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed); + +EE convolution_int8(TensorDesc inputDesc, + const INT8 *input, + TensorDesc filterDesc, + const INT8 *filter, + F16 *scales, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const F16 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc, + Arch arch); + +EE depthwise_pointwise_convolution_int8(TensorDesc inputDesc, + INT8 *input, + TensorDesc dwFilterDesc, + const INT8 *dwFilter, + TensorDesc pwFilterDesc, + const INT8 *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const I32 *dwBias, + TensorDesc pwBiasDesc, + const I32 *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + I32 *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch); + +EE pooling_c8_int8(const INT8 *input, + U32 stride, + int hstart, + int hend, + int wstart, + int wend, + INT8 *output, + PoolingParamSpec poolingParamSpec, + void *scale); + +EE concat_int8(std::vector<TensorDesc> inputDesc, + std::vector<void *> input, + F32 *inputScale, + int axis, + TensorDesc outputDesc, + void *output, + F32 *outputScale); + +EE quantize_tensor_int32( + TensorDesc dDesc, const void *data, TensorDesc *qDesc, void *qData, F32 *scale); +#endif +#endif diff --git a/compute/tensor/src/cpu/arm/normalization.cpp b/compute/tensor/src/cpu/arm/normalization.cpp new file mode 100644 index 00000000..a26d8bc3 --- /dev/null +++ b/compute/tensor/src/cpu/arm/normalization.cpp @@ -0,0 +1,47 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif + +EE layer_normalization_arm( + TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output) +{ + DataType idt = inputDesc.dt; + EE ret = SUCCESS; + switch (idt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = layer_normalization_fp32( + inputDesc, (F32 *)input, (F32 *)alpha, (F32 *)beta, outputDesc, (F32 *)output); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = layer_normalization_fp16( + inputDesc, (F16 *)input, (F16 *)alpha, (F16 *)beta, outputDesc, (F16 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/padding.cpp b/compute/tensor/src/cpu/arm/padding.cpp new file mode 100644 index 00000000..a3a4ef79 --- /dev/null +++ b/compute/tensor/src/cpu/arm/padding.cpp @@ -0,0 +1,126 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
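Note: layer_normalization_arm above only dispatches on precision; the arithmetic lives in layer_normalization_fp32/fp16. As a reference for what those kernels compute, here is a minimal scalar sketch of standard layer normalization over one vector (the eps value and the per-element alpha/beta layout are assumptions for illustration, not taken from this patch):

#include <cmath>
#include <cstddef>

// y[i] = alpha[i] * (x[i] - mean) / sqrt(var + eps) + beta[i]
void layer_norm_reference(const float *x, const float *alpha, const float *beta, float *y, size_t len)
{
    const float eps = 1e-6f;  // assumed epsilon, for illustration only
    float mean = 0.0f, var = 0.0f;
    for (size_t i = 0; i < len; i++) {
        mean += x[i];
    }
    mean /= len;
    for (size_t i = 0; i < len; i++) {
        var += (x[i] - mean) * (x[i] - mean);
    }
    var /= len;
    const float inv_std = 1.0f / std::sqrt(var + eps);
    for (size_t i = 0; i < len; i++) {
        y[i] = alpha[i] * (x[i] - mean) * inv_std + beta[i];
    }
}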
+ +#include "types.h" +#include "cpu/arm/tensor_computing_arm.h" +#include + +EE padding_arm(TensorDesc inputDesc, + const void *input, + PadParamSpec padParamSpec, + TensorDesc outputDesc, + void *output) +{ + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + CHECK_REQUIREMENT(in == on); + CHECK_REQUIREMENT(ic == oc); + U32 alignSize = 1; + if (idf == DF_NCHWC8) { + alignSize = 8; + } + ic /= alignSize; + oc /= alignSize; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < ih; h++) { + const U8 *inPtr = + (const U8 *)input + (((n * ic + c) * ih + h) * iw) * alignSize * bytesOf(idt); + U8 *outPtr = (U8 *)output + + (((n * oc + c) * oh + (padParamSpec.top + h)) * ow) * alignSize * bytesOf(odt); + if (padParamSpec.pad_mode == Pad_Constant) { + memset(outPtr, 0, padParamSpec.left * alignSize * bytesOf(odt)); + outPtr += padParamSpec.left * alignSize * bytesOf(odt); + memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + outPtr += iw * alignSize * bytesOf(odt); + memset(outPtr, 0, padParamSpec.right * alignSize * bytesOf(odt)); + } else { + for (U32 w = 0; w < padParamSpec.left; w++) { + U32 index = 0; + if (padParamSpec.pad_mode == Pad_Reflect) { + index = (padParamSpec.left - w) * alignSize * bytesOf(idt); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + index = (padParamSpec.left - w - 1) * alignSize * bytesOf(idt); + } + memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + outPtr += alignSize * bytesOf(idt); + } + memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + outPtr += iw * alignSize * bytesOf(odt); + for (U32 w = 0; w < padParamSpec.right; w++) { + U32 index = (iw - 1) * alignSize * bytesOf(idt); + if (padParamSpec.pad_mode == Pad_Reflect) { + index = (iw - w - 2) * alignSize * bytesOf(idt); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + index = (iw - w - 1) * alignSize * bytesOf(idt); + } + memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + outPtr += alignSize * bytesOf(idt); + } + } + } + U8 *outPtr = (U8 *)output + (((n * oc + c) * oh) * ow) * alignSize * bytesOf(odt); + for (U32 h = 0; h < padParamSpec.top; h++) { + U32 index = h * ow * alignSize * bytesOf(odt); + if (padParamSpec.pad_mode == Pad_Constant) { + memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Edge) { + memcpy(outPtr + index, + outPtr + (padParamSpec.top * ow * alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Reflect) { + memcpy(outPtr + index, + outPtr + + ((padParamSpec.top + padParamSpec.top - h) * ow * alignSize * + bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + memcpy(outPtr + index, + outPtr + + ((padParamSpec.top + padParamSpec.top - h - 1) * ow * alignSize * + bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else { + return NOT_SUPPORTED; + } + } + for (U32 h = 0; h < padParamSpec.bottom; h++) { + U32 index = (padParamSpec.top + ih + h) * ow * alignSize * bytesOf(odt); + if (padParamSpec.pad_mode == Pad_Constant) { + memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Edge) { + memcpy(outPtr + index, + outPtr + ((padParamSpec.top + ih - 1) * ow * alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else 
if (padParamSpec.pad_mode == Pad_Reflect) { + // memcpy(outPtr+index, outPtr+((padParamSpec.top+ih-2-h)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); + memcpy(outPtr + index, + outPtr + + ((padParamSpec.top + ih - 1 - padParamSpec.bottom + h) * ow * + alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + memcpy(outPtr + index, + outPtr + ((padParamSpec.top + ih - 1 - h) * ow * alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else { + return NOT_SUPPORTED; + } + } + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/pooling.cpp b/compute/tensor/src/cpu/arm/pooling.cpp new file mode 100644 index 00000000..3c726e5b --- /dev/null +++ b/compute/tensor/src/cpu/arm/pooling.cpp @@ -0,0 +1,177 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
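Note: the width loops in padding_arm implement four border rules that differ only in which source column fills each padded cell. Below is a self-contained 1-D sketch of the left-border indexing (a hypothetical helper mirroring the index math above; Reflect assumes pad < in.size()):

#include <vector>

enum PadMode { Constant, Edge, Reflect, Symmetric };

// For input [a b c] and pad = 2:
//   Constant  -> 0 0 a b c
//   Edge      -> a a a b c
//   Reflect   -> c b a b c  (mirror, border element not repeated)
//   Symmetric -> b a a b c  (mirror, border element repeated)
std::vector<float> pad_left(const std::vector<float> &in, int pad, PadMode mode)
{
    std::vector<float> out;
    for (int w = 0; w < pad; w++) {
        switch (mode) {
            case Constant:
                out.push_back(0.0f);
                break;
            case Edge:
                out.push_back(in.front());
                break;
            case Reflect:
                out.push_back(in[pad - w]);  // same index rule as the Pad_Reflect branch
                break;
            case Symmetric:
                out.push_back(in[pad - w - 1]);  // same index rule as the Pad_Symmetric branch
                break;
        }
    }
    out.insert(out.end(), in.begin(), in.end());
    return out;
}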
+ +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif +#ifdef _USE_INT8 +#include "cpu/arm/int8/tensor_computing_int8.h" +#endif + +EE pooling_arm(TensorDesc inputDesc, + const void *input, + PoolingParamSpec poolingParamSpec, + void *scale, + TensorDesc outputDesc, + void *output) +{ + EE ret = SUCCESS; + if (nullptr == input || nullptr == output) { + ret = NULL_POINTER; + } + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (idt != odt) { + ret = NOT_MATCH; + } + if (in != on || ic != oc) { + ret = NOT_MATCH; + } + if (idf != DF_NCHWC8 || odf != idf) { + ret = NOT_MATCH; + } + + U32 strideH = poolingParamSpec.stride_h; + U32 strideW = poolingParamSpec.stride_w; + U32 paddingT = poolingParamSpec.padding_top; + U32 paddingL = poolingParamSpec.padding_left; + U32 kernelSizeH = poolingParamSpec.kernel_h; + U32 kernelSizeW = poolingParamSpec.kernel_w; + if (paddingT >= kernelSizeH || paddingL >= kernelSizeW) { + ret = NOT_SUPPORTED; + } + + ic /= 8; + const U8 *inputPtr = (const U8 *)input; + U8 *outputPtr = (U8 *)output; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < oh; h++) { + for (U32 w = 0; w < ow; w++, outputPtr += 8 * bytesOf(odt)) { + int hstart = UNI_MAX((int)h * (int)strideH - (int)paddingT, 0); + int wstart = UNI_MAX((int)w * (int)strideW - (int)paddingL, 0); + int hend = UNI_MIN(hstart + kernelSizeH, ih); + int wend = UNI_MIN(wstart + kernelSizeW, iw); + int poolSize = (hend - hstart) * (wend - wstart); + switch (idt) { +#ifdef _USE_FP32 + case DT_F32: + ret = pooling_c8_fp32((const F32 *)inputPtr, iw, hstart, hend, wstart, + wend, (F32 *)outputPtr, poolingParamSpec); + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + // Global average pooling kernel can be very big. 
Accumulate to FP32 to protect accuracy + if (poolSize > 256 && poolingParamSpec.mode == POOLING_MEAN) { + ret = pooling_c8_big_fp16((const F16 *)inputPtr, iw, hstart, hend, wstart, + wend, (F16 *)outputPtr, poolSize); + } else { + ret = pooling_c8_fp16((const F16 *)inputPtr, iw, hstart, hend, wstart, + wend, (F16 *)outputPtr, poolingParamSpec); + } + break; +#endif +#ifdef _USE_INT8 + case DT_I8: + ret = pooling_c8_int8((const INT8 *)inputPtr, iw, hstart, hend, wstart, + wend, (INT8 *)outputPtr, poolingParamSpec, scale); + break; +#endif + default: + ret = NOT_SUPPORTED; + break; + } + } + } + inputPtr += ih * iw * 8 * bytesOf(idt); + } + } + return ret; +} + +EE pooling_bp_arm(TensorDesc inputDesc, + const void *input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + void *output) +{ + EE ret = SUCCESS; + if (nullptr == input || nullptr == output) { + ret = NULL_POINTER; + } + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (idt != odt) { + ret = NOT_MATCH; + } + if (in != on || ic != oc) { + ret = NOT_MATCH; + } + if (idf != DF_NCHWC8 || odf != idf) { + ret = NOT_MATCH; + } + + U32 strideH = poolingParamSpec.stride_h; + U32 strideW = poolingParamSpec.stride_w; + U32 paddingT = poolingParamSpec.padding_top; + U32 paddingL = poolingParamSpec.padding_left; + U32 kernelSizeH = poolingParamSpec.kernel_h; + U32 kernelSizeW = poolingParamSpec.kernel_w; + if (paddingT >= kernelSizeH || paddingL >= kernelSizeW) { + ret = NOT_SUPPORTED; + } + + ic /= 8; + const U8 *inputPtr = (const U8 *)input; + U8 *outputPtr = (U8 *)output; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < ih; h++) { + for (U32 w = 0; w < iw; w++, inputPtr += 8 * bytesOf(idt)) { + int hstart = (int)h * (int)strideH - (int)paddingT; + int wstart = (int)w * (int)strideW - (int)paddingL; + int hend = UNI_MIN(hstart + kernelSizeH, oh); + int wend = UNI_MIN(wstart + kernelSizeW, ow); + hstart = UNI_MAX(hstart, 0); + wstart = UNI_MAX(wstart, 0); + switch (idt) { +#ifdef _USE_FP32 + case DT_F32: + ret = pooling_bp_c8_fp32((const F32 *)inputPtr, hstart, hend, wstart, + wend, (F32 *)outputPtr, ow, poolingParamSpec); + break; +#endif + default: + ret = NOT_SUPPORTED; + break; + } + } + } + outputPtr += oh * ow * 8 * bytesOf(odt); + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/prelu.cpp b/compute/tensor/src/cpu/arm/prelu.cpp new file mode 100644 index 00000000..bb9881f0 --- /dev/null +++ b/compute/tensor/src/cpu/arm/prelu.cpp @@ -0,0 +1,50 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif + +EE prelu_arm(TensorDesc inputDesc, + void *input, + void *weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + void *output) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = prelu_fp32( + inputDesc, (F32 *)input, (F32 *)weight, preluDesc, outputDesc, (F32 *)output); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = prelu_fp16( + inputDesc, (F16 *)input, (F16 *)weight, preluDesc, outputDesc, (F16 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/quantize.cpp b/compute/tensor/src/cpu/arm/quantize.cpp new file mode 100644 index 00000000..60c04e31 --- /dev/null +++ b/compute/tensor/src/cpu/arm/quantize.cpp @@ -0,0 +1,44 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif +#ifdef _USE_INT8 +#include "cpu/arm/int8/tensor_computing_int8.h" +#endif + +EE quantize_tensor_arm( + TensorDesc dDesc, const void *data, TensorDesc *qDesc, void *qData, void *scale) +{ + EE ret = SUCCESS; + switch (dDesc.dt) { +#ifdef _USE_FP16 + case DT_F16: { + ret = quantize_tensor_fp16(dDesc, data, qDesc, qData, (F16 *)scale); + break; + } +#endif +#ifdef _USE_INT8 + case DT_I32: { + ret = quantize_tensor_int32(dDesc, data, qDesc, qData, (F32 *)scale); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/rnn.cpp b/compute/tensor/src/cpu/arm/rnn.cpp new file mode 100644 index 00000000..d5313f11 --- /dev/null +++ b/compute/tensor/src/cpu/arm/rnn.cpp @@ -0,0 +1,59 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif + +EE rnncell_arm(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch) +{ + EE ret = SUCCESS; + switch (xDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = rnncell_fp32(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, tmpBytes, + tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, output, arch); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = rnncell_fp16(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, tmpBytes, + tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, output, arch); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/scale.cpp b/compute/tensor/src/cpu/arm/scale.cpp new file mode 100644 index 00000000..84ea909b --- /dev/null +++ b/compute/tensor/src/cpu/arm/scale.cpp @@ -0,0 +1,61 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
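Note: rnncell_arm is again precision dispatch only; the gate math lives in rnncell_fp32/fp16 and is configured by RNNParamSpec. As a reminder of the standard LSTM step such a cell computes (the gate ordering and any projection options are assumptions here, not read from this patch), a scalar sketch:

#include <cmath>

static inline float sigmoidf(float x)
{
    return 1.0f / (1.0f + std::exp(-x));
}

// One scalar LSTM unit; pre_* are pre-activation gate values
// (x_t and h_{t-1} already multiplied by their weights, bias added).
void lstm_unit_step(float pre_i, float pre_f, float pre_o, float pre_g, float *c, float *h)
{
    float i = sigmoidf(pre_i);   // input gate
    float f = sigmoidf(pre_f);   // forget gate
    float o = sigmoidf(pre_o);   // output gate
    float g = std::tanh(pre_g);  // candidate cell state
    *c = f * (*c) + i * g;       // cell state update
    *h = o * std::tanh(*c);      // hidden state / cell output
}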
+ +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif + +EE scale_arm(TensorDesc inputDesc, + void *input, + void *alpha, + void *beta, + ScaleParamSpec p, + TensorDesc outputDesc, + void *output) +{ + UNUSED(outputDesc); + U32 length = tensorNumElements(inputDesc); + int axis = (p.axis + inputDesc.nDims) % inputDesc.nDims; + I32 in = inputDesc.dims[inputDesc.nDims - 1]; + I32 ic = inputDesc.dims[inputDesc.nDims - 1 - axis]; + I32 elements_per_channel = length / (in * ic); + if (inputDesc.df == DF_NCHWC8) { + axis = inputDesc.nDims; + } + EE ret = SUCCESS; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = scale_fp32((F32 *)input, axis, inputDesc.nDims, (F32 *)alpha, (F32 *)beta, in, ic, + elements_per_channel, (F32 *)output); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = scale_fp16((F16 *)input, axis, inputDesc.nDims, (F16 *)alpha, (F16 *)beta, in, ic, + elements_per_channel, (F16 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + + return ret; +} diff --git a/compute/tensor/src/cpu/arm/softmax.cpp b/compute/tensor/src/cpu/arm/softmax.cpp new file mode 100644 index 00000000..88ebb474 --- /dev/null +++ b/compute/tensor/src/cpu/arm/softmax.cpp @@ -0,0 +1,46 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif + +EE softmax_arm( + TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output) +{ + DataType idt = inputDesc.dt; + EE ret = SUCCESS; + switch (idt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = softmax_fp32(inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = softmax_fp16(inputDesc, (const F16 *)input, p.axis, outputDesc, (F16 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + + return ret; +} diff --git a/compute/tensor/src/cpu/arm/tensor_computing_arm.h b/compute/tensor/src/cpu/arm/tensor_computing_arm.h new file mode 100644 index 00000000..678ab1da --- /dev/null +++ b/compute/tensor/src/cpu/arm/tensor_computing_arm.h @@ -0,0 +1,227 @@ +// Copyright (C) 2019. 
Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_TENSOR_COMPUTING_ARM +#define _H_TENSOR_COMPUTING_ARM + +#include <vector> +#include "sys.h" +#include "types.h" + +EE attention_arm(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output); + +EE clip_arm(TensorDesc inputDesc, void *input, ClipParamSpec p, TensorDesc outputDesc, void *output); + +EE convolution_infer_forward_algorithm_arm(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + ConvolutionForwardAlgorithm *algorithm, + DataType targetDataType); + +EE convolution_transform_filter_bytes_arm(TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes); + +EE convolution_transform_filter_arm(TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed); + +EE convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes); + +EE convolution_arm(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc scaleDesc, + const void *scale, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc, + Arch arch); + +EE deconvolution_transform_filter_arm(TensorDesc filterDesc, + const void *filter, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed); + +EE depthwise_pointwise_convolution_infer_forward_algorithm_arm(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + DepthwiseConvolutionForwardAlgorithm *algorithm, + DataType targetDataType); + +EE depthwise_pointwise_convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *bytes); + +EE depthwise_pointwise_convolution_transform_filter_arm(TensorDesc dwFilterDesc, + const void *dwFilter, + TensorDesc
pwFilterDesc, + const void *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc *dwFtmDesc, + void *dwFilterTransformed, + TensorDesc *pwFtmDesc, + void *pwFilterTransformed); + +EE depthwise_pointwise_convolution_arm(TensorDesc inputDesc, + void *input, + TensorDesc dwFilterDesc, + const void *dwFilter, + TensorDesc pwFilterDesc, + const void *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const void *dwBias, + TensorDesc pwBiasDesc, + const void *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch); + +EE depthwise_convolution_transform_filter_arm(TensorDesc filterDesc, + const void *filter, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed); + +EE depthwise_convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *bytes); + +EE depthwise_convolution_arm(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec depthwiseActivationParamSpec, + Arch arch); + +EE eltwise_arm(DataType dataType, + std::vector<void *> input, + std::vector<U32> inputSize, + U32 num, + U32 len, + void *output, + EltwiseMode eltwiseMode); + +EE rnncell_arm(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *currentH, + Arch arch); + +EE layer_normalization_arm( + TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output); + +EE pooling_arm(TensorDesc inputDesc, + const void *input, + PoolingParamSpec poolingParamSpec, + void *scale, + TensorDesc outputDesc, + void *output); + +EE pooling_bp_arm(TensorDesc inputDesc, + const void *input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + void *output); + +EE reshape_arm(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output); + +EE scale_arm(TensorDesc inputDesc, + void *input, + void *alpha, + void *beta, + ScaleParamSpec p, + TensorDesc outputDesc, + void *output); + +EE softmax_arm( + TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output); + +EE quantize_tensor_arm( + TensorDesc dDesc, const void *data, TensorDesc *qDesc, void *qData, void *scale); + +EE check_arm(TensorDesc inputDescA, + const void *inputA, + TensorDesc inputDescB, + const void *inputB, + CheckParamSpec p, + TensorDesc outputDesc, + void *output); + +EE attention_mask_arm(TensorDesc inputDesc, + const void *input, + AttentionMaskParamSpec p, + TensorDesc outputDesc, + void *output); + +EE prelu_arm(TensorDesc inputDesc, + void *input, + void *weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + void *output); +#endif diff --git a/compute/tensor/src/cpu/arm/transform_functions.h b/compute/tensor/src/cpu/arm/transform_functions.h new file mode
100644 index 00000000..98ea6b85 --- /dev/null +++ b/compute/tensor/src/cpu/arm/transform_functions.h @@ -0,0 +1,148 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "types.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/convolution_winograd_transform.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/convolution_winograd_transform.h" +#endif + +template <typename T, U32 N> +inline EE transformCNHWToNHWCNx( + TensorDesc inputDesc, const T *input, TensorDesc outputDesc, T *output) +{ + if (input == NULL || output == NULL) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(inputDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + U32 oc = fc / N; + U32 hwMax = fh * fw - 1; + + for (U32 o = 0; o < oc; o++) { + for (U32 hw = 0; hw < fh * fw; hw++) { + for (U32 c = 0; c < fn; c++) { + for (U32 ox = 0; ox < N; ox++) { + output[o * fh * fw * fn * N + hw * fn * N + c * N + ox] = + input[c * fc * fh * fw + (o * N + ox) * fh * fw + hwMax - hw]; + } + } + } + } + if ((fc != oc * N) && (N == 16)) { + for (U32 hw = 0; hw < fh * fw; hw++) { + for (U32 c = 0; c < fn; c++) { + for (U32 o8 = 0; o8 < 8; o8++) { + output[(oc * 16) * fh * fw * fn + hw * fn * 8 + c * 8 + o8] = + input[c * fc * fh * fw + (oc * 16 + o8) * fh * fw + hwMax - hw]; + } + } + } + } + return SUCCESS; +} + +template <typename T> +inline EE transformCNHWToNCHWC8( + TensorDesc inputDesc, const T *input, TensorDesc outputDesc, T *output) +{ + if (input == NULL || output == NULL) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(inputDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_REQUIREMENT(1 == fn); + U32 ic = fc / 8; + U32 hwMax = fh * fw - 1; + for (U32 c = 0; c < ic; c++) { + for (U32 hw = 0; hw < fh * fw; hw++) { + for (U32 c8 = 0; c8 < 8; c8++) { + output[c * fh * fw * 8 + hw * 8 + c8] = input[(c * 8 + c8) * fh * fw + hwMax - hw]; + } + } + } + return SUCCESS; +} + +template <typename T, U32 N> +inline EE transformCNHWToHWNCNx( + TensorDesc inputDesc, const T *input, TensorDesc outputDesc, T *output) +{ + if (input == NULL || output == NULL) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(inputDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + const U32 hwMax = 8; + for (U32 o = 0; o < fc / N; o++) { + for (U32 c = 0; c < fn; c++) { + U32 f_off_0 = c * fc * fh * fw + (o * N) * fh * fw; + U32 f_off_1 = c * fc * fh * fw + (o
* N + N / 2) * fh * fw; + U32 ftm_off_0 = o * 36 * fn * N + c * N; + U32 ftm_off_1 = o * 36 * fn * N + c * N + N / 2; + T F[9][N / 2]; + T *F_ptr[9]; + T *Fw[36]; + + for (U32 hw = 0; hw < 9; hw++) { + for (U32 oo = 0; oo < N / 2; oo++) { + F[hw][oo] = input[f_off_0 + hwMax - hw + oo * fh * fw]; + } + F_ptr[hw] = F[hw]; + } + for (U32 hw = 0; hw < 36; hw++) { + Fw[hw] = output + ftm_off_0 + hw * fn * N; + } + trans_W_4x4_3x3(Fw, F_ptr); + for (U32 hw = 0; hw < 9; hw++) { + for (U32 oo = 0; oo < N / 2; oo++) { + F[hw][oo] = input[f_off_1 + hwMax - hw + oo * fh * fw]; + } + F_ptr[hw] = F[hw]; + } + for (U32 hw = 0; hw < 36; hw++) { + Fw[hw] = output + ftm_off_1 + hw * fn * N; + } + trans_W_4x4_3x3(Fw, F_ptr); + } + } + U32 oc = (fc / 16) * 16; + if ((oc != fc) && (N == 16)) { + for (U32 c = 0; c < fn; c++) { + U32 f_off_0 = c * fc * fh * fw + oc * fh * fw; + U32 ftm_off_0 = oc * 36 * fn + c * 8; + T F[9][8]; + T *F_ptr[9]; + T *Fw[36]; + for (U32 hw = 0; hw < 9; hw++) { + for (U32 oo = 0; oo < 8; oo++) { + F[hw][oo] = input[f_off_0 + hwMax - hw + oo * fh * fw]; + } + F_ptr[hw] = F[hw]; + } + for (U32 hw = 0; hw < 36; hw++) { + Fw[hw] = output + ftm_off_0 + hw * fn * 8; + } + trans_W_4x4_3x3(Fw, F_ptr); + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/clip.cpp b/compute/tensor/src/cpu/clip.cpp new file mode 100644 index 00000000..b882e7fc --- /dev/null +++ b/compute/tensor/src/cpu/clip.cpp @@ -0,0 +1,43 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
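Note: transformCNHWToNHWCNx above regroups filters so that each block of N output channels is contiguous in memory (and walks the kernel window backwards via hwMax - hw). The toy program below applies the same o/c/N index mapping to a 1x1 kernel so the resulting order can be checked by hand; it is an illustration, not part of the patch:

#include <cstdio>

int main()
{
    const int C = 2, O = 8, N = 4;  // input channels, output channels, pack width
    int src[C * O], dst[C * O];
    for (int i = 0; i < C * O; i++) {
        src[i] = i;  // src is [c][o]: value for input channel c, output channel o
    }
    // Same mapping as transformCNHWToNHWCNx with fh = fw = 1:
    // dst[(o / N) * C * N + c * N + (o % N)] = src[c * O + o]
    for (int o = 0; o < O; o++) {
        for (int c = 0; c < C; c++) {
            dst[(o / N) * C * N + c * N + (o % N)] = src[c * O + o];
        }
    }
    for (int i = 0; i < C * O; i++) {
        printf("%d ", dst[i]);
    }
    printf("\n");
    return 0;
}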
+ +#include "cpu/tensor_computing_cpu.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif + +EE clip_cpu( + TensorDesc inputDesc, void *input, ClipParamSpec p, TensorDesc outputDesc, void *output, Arch arch) +{ + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = clip_general(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = clip_x86(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = clip_arm(inputDesc, input, p, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/cpu/concat.cpp b/compute/tensor/src/cpu/concat.cpp new file mode 100644 index 00000000..b9704e59 --- /dev/null +++ b/compute/tensor/src/cpu/concat.cpp @@ -0,0 +1,108 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
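clip_cpu above shows the dispatch idiom that every file in this directory follows: each backend is compiled in behind an #ifdef, one is selected at runtime with the IS_GENERAL/IS_X86_AVX2/IS_ARM predicates, and NOT_SUPPORTED falls through when no branch matches. A minimal self-contained sketch of the same structure (simplified stand-ins for bolt's Arch and EE types, reference loop in place of the SIMD kernels):

#include <cstdio>

enum EE { SUCCESS, NOT_SUPPORTED };
enum Arch { CPU_GENERAL, ARM_A76, X86_AVX2 };

#define IS_GENERAL(a) ((a) == CPU_GENERAL)
#define IS_ARM(a) ((a) == ARM_A76)
#define IS_X86_AVX2(a) ((a) == X86_AVX2)

static EE clip_general(const float *in, float *out, int len, float lo, float hi)
{
    for (int i = 0; i < len; i++) {
        out[i] = in[i] < lo ? lo : (in[i] > hi ? hi : in[i]);
    }
    return SUCCESS;
}

EE clip_cpu_sketch(const float *in, float *out, int len, float lo, float hi, Arch arch)
{
    EE ret = NOT_SUPPORTED;  // stays NOT_SUPPORTED if no backend matches
    if (IS_GENERAL(arch)) {
        ret = clip_general(in, out, len, lo, hi);
    } else if (IS_X86_AVX2(arch) || IS_ARM(arch)) {
        ret = clip_general(in, out, len, lo, hi);  // stand-in for the SIMD kernels
    }
    return ret;
}

int main()
{
    float in[4] = {-2, 0.5f, 3, 9}, out[4];
    clip_cpu_sketch(in, out, 4, 0.f, 6.f, ARM_A76);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 0 0.5 3 6
    return 0;
}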
+
+#include <string.h>
+#include "cpu/tensor_computing_cpu.h"
+#if defined(_USE_NEON) && defined(_USE_INT8)
+#include "cpu/arm/int8/tensor_computing_int8.h"
+#endif
+
+static EE concat(std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    int axis,
+    TensorDesc outputDesc,
+    void *output,
+    void *tmp)
+{
+    if (nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    U32 num = inputDesc.size();
+    if (num < 1) {
+        return NOT_MATCH;
+    }
+
+    int dim = outputDesc.nDims;
+    axis = (axis + dim) % dim;
+    axis = dim - 1 - axis;
+    U32 tileSize = bytesOf(outputDesc.dt);
+    for (I32 i = 0; i < axis; i++) {
+        tileSize *= outputDesc.dims[i];
+    }
+    U32 loops = 1;
+    for (I32 i = axis + 1; i < dim; i++) {
+        loops *= outputDesc.dims[i];
+    }
+
+    if (outputDesc.df == DF_NCHWC8) {
+        if (axis < 2) {
+            tileSize *= 8;
+            loops /= 8;
+        }
+    }
+
+    bool isC8 = DF_NCHWC8 == outputDesc.df;
+
+    U8 *ptr = (U8 *)output;
+    U8 *tmpPtr = (U8 *)tmp;
+    for (U32 i = 0; i < loops; i++) {
+        for (U32 j = 0; j < num; j++) {
+            U8 *inPtr = (U8 *)((input)[j]);
+            if (nullptr == input[j] || tensorNumElements(inputDesc[j]) == 0) {
+                continue;
+            }
+
+            if ((4 != inputDesc[j].nDims) || (1 != inputDesc[j].dims[1]) ||
+                (1 != inputDesc[j].dims[0])) {
+                if (isC8 && (DF_NCHW == inputDesc[j].df)) {
+                    TensorDesc tmpDesc = inputDesc[j];
+                    tmpDesc.df = DF_NCHWC8;
+                    transformNCHWToNCHWC8(inputDesc[j], inPtr, tmpDesc, tmpPtr);
+                    inPtr = tmpPtr;
+                } else if (!isC8 && (DF_NCHWC8 == inputDesc[j].df)) {
+                    TensorDesc tmpDesc = inputDesc[j];
+                    tmpDesc.df = DF_NCHW;
+                    transformToNCHW(inputDesc[j], inPtr, tmpDesc, tmpPtr);
+                    inPtr = tmpPtr;
+                }
+            }
+            U32 blockSize = inputDesc[j].dims[axis] * tileSize;
+            U8 *srcPtr = inPtr + i * blockSize;
+            memcpy(ptr, srcPtr, blockSize);
+            ptr += blockSize;
+            tmpPtr += tensorNumBytes(inputDesc[j]);
+        }
+    }
+    return SUCCESS;
+}
+
+EE concat_cpu(std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    void *inputScale,
+    ConcatParamSpec p,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    void *outputScale)
+{
+    EE ret = NOT_SUPPORTED;
+    if (outputDesc.dt == DT_I8) {
+#if defined(_USE_NEON) && defined(_USE_INT8)
+        ret = concat_int8(
+            inputDesc, input, (F32 *)inputScale, p.axis, outputDesc, output, (F32 *)outputScale);
+#endif
+    } else {
+        ret = concat(inputDesc, input, p.axis, outputDesc, output, tmp);
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/cpu/convolution.cpp b/compute/tensor/src/cpu/convolution.cpp
new file mode 100644
index 00000000..e115eb9e
--- /dev/null
+++ b/compute/tensor/src/cpu/convolution.cpp
@@ -0,0 +1,62 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif +#include "cpu/tensor_computing_cpu.h" + +EE convolution_cpu(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc scaleDesc, + const void *scale, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = convolution_general(inputDesc, input, filterDesc, filter, convParamSpec, scaleDesc, + scale, biasDesc, bias, outputDesc, output, activationDesc); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = convolution_x86(inputDesc, input, filterDesc, filter, convParamSpec, algorithm, + scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc, + arch); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = convolution_arm(inputDesc, input, filterDesc, filter, convParamSpec, algorithm, + scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc, + arch); +#endif + } + return ret; +} \ No newline at end of file diff --git a/compute/tensor/src/cpu/cpu_functions.h b/compute/tensor/src/cpu/cpu_functions.h new file mode 100644 index 00000000..0aefae95 --- /dev/null +++ b/compute/tensor/src/cpu/cpu_functions.h @@ -0,0 +1,231 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
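convolution_cpu, like the other forward kernels in this directory, never allocates: the caller first queries a *_infer_forward_tmp_bytes_* routine, then passes the scratch buffer in through tmp/tmpBytes. A toy sketch of that two-phase contract (hypothetical simplified signatures, not the diff's real API):

#include <cstdlib>

typedef unsigned U32;

// Phase 1: the op reports how much scratch memory it needs for these shapes.
U32 op_infer_tmp_bytes(U32 len)
{
    return len * sizeof(float);  // e.g. room for one padded copy of the input
}

// Phase 2: the caller owns the allocation; the op only uses what it is given.
void op_forward(const float *in, float *out, U32 len, void *tmp, U32 tmpBytes)
{
    float *scratch = (float *)tmp;
    for (U32 i = 0; i < len && (i + 1) * sizeof(float) <= tmpBytes; i++) {
        scratch[i] = in[i];  // stand-in for padding / im2col staging
        out[i] = scratch[i];
    }
}

int main()
{
    const U32 len = 8;
    float in[len] = {0}, out[len];
    U32 tmpBytes = op_infer_tmp_bytes(len);
    void *tmp = malloc(tmpBytes);
    op_forward(in, out, len, tmp, tmpBytes);
    free(tmp);
    return 0;
}

Keeping allocation on the caller's side is what lets the engine reuse one workspace across the whole graph instead of allocating per layer.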
+ +#ifndef _H_CPU_FUNCTIONS +#define _H_CPU_FUNCTIONS + +#ifdef _USE_GENERAL +#include "cpu/general/general_functions.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/arm_functions.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/x86_functions.h" +#endif + +typedef void (*ArrayScaleFunction)( + DataType dt, const void *input, void *output, I32 len, F32 alpha, F32 beta); +typedef void (*ArrayAddFunction)( + DataType dt, const void *inputA, const void *inputB, void *output, I32 len); +typedef F32 (*ArraySumFunction)(DataType dt, const void *data, I32 len); +typedef F32 (*ArrayMeanFunction)(DataType dt, const void *data, I32 len); +typedef F32 (*ArrayVarFunction)(DataType dt, const void *data, I32 len, F32 mean); +typedef void (*ArrayPowerFunction)(DataType dt, void *input, void *output, I32 len, F32 power); +typedef void (*ArraySquareAndAddFunction)( + DataType dt, const void *inputA, const void *inputB, void *output, I32 len); +typedef EE (*ArrayActivationFunction)( + DataType dt, void *input, U32 len, ActivationParamSpec activationDesc, void *output); + +inline ArrayScaleFunction get_array_scale_function(Arch arch) +{ + ArrayScaleFunction func; + bool find = false; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + func = array_scale_general; + find = true; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + func = array_scale_arm; + find = true; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + func = array_scale_x86; + find = true; +#endif + } + CHECK_REQUIREMENT(find); + return func; +} + +inline ArrayAddFunction get_array_add_function(Arch arch) +{ + ArrayAddFunction func; + bool find = false; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + func = array_add_general; + find = true; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + func = array_add_arm; + find = true; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + func = array_add_x86; + find = true; +#endif + } + CHECK_REQUIREMENT(find); + return func; +} + +inline ArrayMeanFunction get_array_mean_function(Arch arch) +{ + ArrayMeanFunction func; + bool find = false; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + func = array_mean_general; + find = true; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + func = array_mean_arm; + find = true; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + func = array_mean_x86; + find = true; +#endif + } + CHECK_REQUIREMENT(find); + return func; +} + +inline ArrayVarFunction get_array_var_function(Arch arch) +{ + ArrayVarFunction func; + bool find = false; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + func = array_var_general; + find = true; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + func = array_var_arm; + find = true; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + func = array_var_x86; + find = true; +#endif + } + CHECK_REQUIREMENT(find); + return func; +} + +inline ArrayPowerFunction get_array_power_function(Arch arch) +{ + ArrayPowerFunction func; + bool find = false; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + func = array_power_general; + find = true; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + func = array_power_arm; + find = true; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + func = array_power_x86; + find = true; +#endif + } + CHECK_REQUIREMENT(find); + return func; +} + +inline ArraySumFunction get_array_sum_function(Arch arch) +{ + ArraySumFunction func; + bool find = false; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + func = array_sum_general; + find = true; +#endif 
+#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + func = array_sum_arm; + find = true; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + func = array_sum_x86; + find = true; +#endif + } + CHECK_REQUIREMENT(find); + return func; +} + +inline ArraySquareAndAddFunction get_array_square_and_add_function(Arch arch) +{ + ArraySquareAndAddFunction func; + bool find = false; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + func = array_square_and_add_general; + find = true; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + func = array_square_and_add_arm; + find = true; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + func = array_square_and_add_x86; + find = true; +#endif + } + CHECK_REQUIREMENT(find); + return func; +} + +inline ArrayActivationFunction get_array_activation_function(Arch arch) +{ + ArrayActivationFunction func; + bool find = false; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + func = array_activation_general; + find = true; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + func = array_activation_arm; + find = true; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + func = array_activation_x86; + find = true; +#endif + } + CHECK_REQUIREMENT(find); + return func; +} +#endif diff --git a/compute/tensor/src/cpu/cpu_functions_template.h b/compute/tensor/src/cpu/cpu_functions_template.h new file mode 100644 index 00000000..e53260b1 --- /dev/null +++ b/compute/tensor/src/cpu/cpu_functions_template.h @@ -0,0 +1,215 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
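The getters in cpu_functions.h above let a caller resolve the best compiled-in kernel once and then reuse the raw function pointer in hot loops, instead of re-running the arch checks on every call. A simplified usage sketch (stand-in types, with a reference loop where a real build would return the NEON/AVX2 kernel):

#include <cstdio>

typedef void (*ArrayScaleFn)(const float *in, float *out, int len, float alpha, float beta);

static void array_scale_ref(const float *in, float *out, int len, float alpha, float beta)
{
    for (int i = 0; i < len; i++) {
        out[i] = alpha * in[i] + beta;
    }
}

static ArrayScaleFn get_array_scale_fn(bool haveSimd)
{
    // Mirrors get_array_scale_function: pick the best backend compiled in.
    if (haveSimd) {
        return array_scale_ref;  // stand-in for the SIMD implementation
    }
    return array_scale_ref;
}

int main()
{
    float in[3] = {1, 2, 3}, out[3];
    ArrayScaleFn scale = get_array_scale_fn(true);  // resolved once
    scale(in, out, 3, 2.0f, 0.5f);                  // then reused everywhere
    printf("%g %g %g\n", out[0], out[1], out[2]);   // 2.5 4.5 6.5
    return 0;
}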
+
+#ifndef _H_CPU_FUNCTIONS_TEMPLATE
+#define _H_CPU_FUNCTIONS_TEMPLATE
+
+#include <math.h>
+#include <string.h>
+#include "types.h"
+
+// copy input[index]~input[index+length] to output buffer
+template <typename T>
+void get_vector(T *input, int lda, T **output, int ldb, int index, int length, T *buffer)
+{
+    UNUSED(ldb);
+    int local = index % lda;
+    if (length == 1) {
+        *output = buffer;
+        (*output)[0] = input[local];
+    } else if (lda == 1) {
+        *output = input;
+    } else {
+        int remain = lda - local;
+        if (remain >= length) {
+            *output = input + local;
+        } else {
+            *output = buffer;
+            memcpy(*output, input + local, sizeof(T) * remain);
+            for (int i = 0; i < length - remain; i++) {
+                (*output)[remain + i] = input[i % lda];
+            }
+        }
+    }
+}
+
+template <typename T>
+inline void array_scale_template(const T *input, T *output, I32 len, F32 alpha, F32 beta)
+{
+    for (I32 i = 0; i < len; i++) {
+        output[i] = alpha * input[i] + beta;
+    }
+}
+
+template <typename T>
+inline void array_power_template(T *input, T *output, I32 len, F32 power)
+{
+    for (I32 i = 0; i < len; i++) {
+        output[i] = powf(input[i], power);
+    }
+}
+
+template <typename T>
+EE activation_template(ActivationParamSpec activationDesc, F32 input, T *output)
+{
+    F32 value, result = 0;
+    EE ret = SUCCESS;
+    switch (activationDesc.mode) {
+        case ACTIVATION_NULL: {
+            result = input;
+            break;
+        }
+        case ACTIVATION_RELU: {
+            value = input;
+            F32 tmp = activationDesc.value[0] * value;
+            if (value < tmp) {
+                value = tmp;
+            }
+            result = value;
+            break;
+        }
+        case ACTIVATION_RELU6: {
+            value = input;
+            if (value < 0) {
+                value = 0;
+            }
+            if (value > 6) {
+                value = 6;
+            }
+            result = value;
+            break;
+        }
+        case ACTIVATION_H_SIGMOID: {
+            value = input + 3;
+            if (value < 0) {
+                value = 0;
+            }
+            if (value > 6) {
+                value = 6;
+            }
+            result = value / 6;
+            break;
+        }
+        case ACTIVATION_H_SWISH: {
+            value = input + 3;
+            if (value < 0) {
+                value = 0;
+            }
+            if (value > 6) {
+                value = 6;
+            }
+            result = input * (value / 6);
+            break;
+        }
+        case ACTIVATION_GELU: {
+            value = input;
+            F32 two_div_PI_sqrt = sqrt(2 / 3.14159265358979323846);
+            value = two_div_PI_sqrt * (value + 0.044715 * powf(value, 3));
+            value = 1.0 - 2.0 / (exp(2.0 * value) + 1.0);
+            value = 0.5 * (1.0 + value);
+            value = input * value;
+            result = value;
+            break;
+        }
+        case ACTIVATION_TANH: {
+            value = 1.0 - 2.0 / (exp(2.0 * input) + 1.0);
+            result = value;
+            break;
+        }
+        case ACTIVATION_SIGMOID: {
+            value = 1.0 / (1.0 + exp(-1.0 * input));
+            result = value;
+            break;
+        }
+        case ACTIVATION_MISH: {
+            value = input;
+            F32 mish_threshold = 20;
+            if (value < -mish_threshold) {
+                value = exp(value);
+            } else if (!(value > mish_threshold || value < -mish_threshold)) {
+                value = log(exp(value) + 1.0);
+            }
+            value = input * tanh(value);
+            result = value;
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    *output = result;
+    return ret;
+}
+
+template <typename T>
+F32 array_sum_template(const T *array, U32 length)
+{
+    F32 sum = 0;
+    for (U32 i = 0; i < length; i++) {
+        sum += array[i];
+    }
+    return sum;
+}
+
+// array mean
+template <typename T>
+F32 array_mean_template(const T *data, I32 len)
+{
+    if (len <= 0) {
+        return 0;
+    }
+    return array_sum_template(data, len) / len;
+}
+
+template <typename T>
+F32 array_var_template(const T *data, I32 len, F32 mean)
+{
+    F32 sum_s = 0;
+    for (I32 i = 0; i < len; i++) {
+        F32 in = data[i];
+        F32 tmp = in - mean;
+        sum_s += tmp * tmp;
+    }
+    return sum_s / len;
+}
+
+template <typename T>
+inline void array_add_template(const T *inputA, const T *inputB, T *output, I32 len)
+{
+    for (I32 i = 0; i < len; i++) {
+        output[i] = inputA[i] + inputB[i];
+    }
+}
+
+template <typename T>
+inline F32 array_sum_template(const T *data, I32 len)
+{
+    if (len <= 0) {
+        return 0;
+    }
+
+    F32 sum_s = 0;
+    for (I32 i = 0; i < len; i++) {
+        sum_s += data[i];
+    }
+    return sum_s;
+}
+
+template <typename T>
+inline void array_square_and_add_template(const T *inputA, const T *inputB, T *output, I32 len)
+{
+    for (I32 i = 0; i < len; i++) {
+        output[i] = inputA[i] + inputB[i] * inputB[i];
+    }
+}
+#endif
diff --git a/compute/tensor/src/cpu/deconvolution.cpp b/compute/tensor/src/cpu/deconvolution.cpp
new file mode 100644
index 00000000..6a30e66c
--- /dev/null
+++ b/compute/tensor/src/cpu/deconvolution.cpp
@@ -0,0 +1,681 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifdef _USE_OPENMP
+#include <omp.h>
+#endif
+#include "thread_affinity.h"
+#include "cpu/tensor_computing_cpu.h"
+#include "cpu/cpu_functions.h"
+#ifdef _USE_NEON
+#include "cpu/arm/tensor_computing_arm.h"
+#endif
+#ifdef _USE_X86
+#include "cpu/x86/tensor_computing_x86.h"
+#endif
+#include "blas_enhance.h"
+
+#if defined(_USE_X86) || defined(_USE_NEON)
+
+EE deconvolution_infer_forward_algorithm_cpu(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionPolicy policy,
+    ConvolutionForwardAlgorithm *algorithm,
+    DataType targetDataType,
+    Arch arch)
+{
+    if (nullptr == algorithm) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType idt, fdt;
+    DataFormat idf, fdf;
+    U32 in, ic, ih, iw;
+    U32 fn, fc, fh, fw;
+    CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+    CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw));
+
+    if (1 == fn && ic != fn) {
+        *algorithm = CONVOLUTION_ALGORITHM_GROUP_DECONV;
+        return SUCCESS;
+    }
+
+    U32 strideH = convParamSpec.stride_h;
+    U32 strideW = convParamSpec.stride_w;
+    U32 paddingT = convParamSpec.padding_top;
+    U32 paddingB = convParamSpec.padding_bottom;
+    U32 paddingL = convParamSpec.padding_left;
+    U32 paddingR = convParamSpec.padding_right;
+    if ((strideH > 1 || strideW > 1) && fh % strideH == 0 && fw % strideW == 0) {
+        *algorithm = CONVOLUTION_ALGORITHM_IM2COL_GEMM;
+        return SUCCESS;
+    }
+
+    ConvolutionParamSpec transposedCD = convParamSpec;
+    transposedCD.stride_h = 1;
+    transposedCD.stride_w = 1;
+    transposedCD.padding_top = 1;
+    transposedCD.padding_bottom = 1;
+    transposedCD.padding_left = 1;
+    transposedCD.padding_right = 1;
+    transposedCD.dilatedRate_h = 1;
+    transposedCD.dilatedRate_w = 1;
+
+    U32 tPadding = (fh - 1 - paddingT) - 1;
// Leave out padding of length 1 to activate Winograd + U32 bPadding = (fh - 1 - paddingB) - 1; + U32 lPadding = (fw - 1 - paddingL) - 1; + U32 rPadding = (fw - 1 - paddingR) - 1; + + ih = ih + (ih - 1) * (strideH - 1) + tPadding + bPadding; + iw = iw + (iw - 1) * (strideW - 1) + lPadding + rPadding; + + TensorDesc inPaddedDesc = tensor4df(idt, idf, in, ic, ih, iw); + + // Swap fn and fc + filterDesc.dims[2] = filterDesc.dims[3]; + filterDesc.dims[3] = ic; + EE ret = NOT_SUPPORTED; + if (IS_ARM(arch)) { +#ifdef _USE_NEON + ret = convolution_infer_forward_algorithm_arm( + inPaddedDesc, filterDesc, outputDesc, transposedCD, policy, algorithm, targetDataType); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = convolution_infer_forward_algorithm_x86( + inPaddedDesc, filterDesc, outputDesc, transposedCD, policy, algorithm, targetDataType); +#endif + } + return ret; +} + +EE deconvolution_transform_filter_bytes_cpu(TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes, + Arch arch) +{ + EE ret = NOT_SUPPORTED; + if (algorithm == CONVOLUTION_ALGORITHM_IM2COL_GEMM) { + *bytes = tensorNumBytes(filterDesc); + ret = SUCCESS; + } else if (algorithm == CONVOLUTION_ALGORITHM_GROUP_DECONV) { + ret = depthwise_convolution_transform_filter_bytes_cpu( + filterDesc, DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, bytes); + } else { + if (IS_ARM(arch)) { +#ifdef _USE_NEON + ret = + convolution_transform_filter_bytes_arm(filterDesc, convParamSpec, algorithm, bytes); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = + convolution_transform_filter_bytes_x86(filterDesc, convParamSpec, algorithm, bytes); +#endif + } + } + return ret; +} + +static EE deconvolution_transform_filter_im2col_gemm_cpu(TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + TensorDesc *ftmDesc, + void *filterTransformed) +{ + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + if (convParamSpec.stride_h == convParamSpec.kernel_h && + convParamSpec.stride_w == convParamSpec.kernel_w) { + U32 filterDims[5] = {fw, fh, 8, fc / 8, fn}; + U32 ftmDims[5] = {8, fw, fh, fc / 8, fn}; + U32 filterTransformDims[5] = {0, 1, 3, 4, 2}; + CHECK_STATUS(array_transpose( + fdt, filterDims, filter, ftmDims, filterTransformed, filterTransformDims, 5)); + } else { + U32 elementSize = bytesOf(filterDesc.dt); + U32 fnAlignSize = fn / 8; + U8 *ptr = (U8 *)filterTransformed; + for (U32 i = 0; i < convParamSpec.stride_h; i++) { + for (U32 j = 0; j < convParamSpec.stride_w; j++) { + U32 fhStart = (fh - 1 - i - convParamSpec.padding_top) % convParamSpec.stride_h; + U32 fwStart = (fw - 1 - j - convParamSpec.padding_left) % convParamSpec.stride_w; + for (U32 ic = 0; ic < fnAlignSize; ic++) { + for (U32 h = fhStart; h < convParamSpec.kernel_h; h += convParamSpec.stride_h) { + for (U32 w = fwStart; w < convParamSpec.kernel_w; + w += convParamSpec.stride_w) { + for (U32 c8 = 0; c8 < 8; c8++) { + for (U32 oc = 0; oc < fc; oc++, ptr += elementSize) { + U32 srcIndex = + ((((ic * 8 + c8) * fc + oc) * fh + (fh - 1 - h)) * fw + + (fw - 1 - w)) * + elementSize; + const U8 *src = (const U8 *)filter + srcIndex; + memcpy(ptr, src, elementSize); + } + } + } + } + } + } + } + } + *ftmDesc = tensor2df(filterDesc.dt, DF_NORMAL, fn, fc * fh * fw); + return SUCCESS; +} + +EE deconvolution_transform_filter_cpu(TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + 
ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed, + Arch arch) +{ + if (algorithm == CONVOLUTION_ALGORITHM_IM2COL_GEMM) { + return deconvolution_transform_filter_im2col_gemm_cpu( + filterDesc, filter, convParamSpec, ftmDesc, filterTransformed); + } + EE ret = NOT_SUPPORTED; + if (IS_ARM(arch)) { +#ifdef _USE_NEON + ret = deconvolution_transform_filter_arm( + filterDesc, filter, algorithm, ftmDesc, filterTransformed); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = deconvolution_transform_filter_x86( + filterDesc, filter, algorithm, ftmDesc, filterTransformed); +#endif + } + return ret; +} + +EE deconvolution_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes, + Arch arch) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw, fn, fc, fh, fw, on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + if (algorithm == CONVOLUTION_ALGORITHM_IM2COL_GEMM) { + U32 ihNum = ih + convParamSpec.kernel_h * 2 - convParamSpec.padding_top - + convParamSpec.padding_bottom; + U32 iwNum = iw + convParamSpec.kernel_w * 2 - convParamSpec.padding_left - + convParamSpec.padding_right; + U32 fhNum = (U32)ceil((float)convParamSpec.kernel_h / convParamSpec.stride_h); + U32 fwNum = (U32)ceil((float)convParamSpec.kernel_w / convParamSpec.stride_w); + TensorDesc matrixADesc = tensor2df(idt, DF_NORMAL, in * ihNum * iwNum, ic * fhNum * fwNum); + TensorDesc matrixBDesc = tensor2df(filterDesc.dt, DF_NORMAL, ic * fhNum * fwNum, + oc * convParamSpec.stride_h * convParamSpec.stride_w); + CHECK_STATUS(matrix_matrix_multiply_tmp_bytes(matrixADesc, matrixBDesc, bytes, arch)); + *bytes *= OMP_NUM_THREADS; + *bytes += tensorNumBytes(matrixADesc) + tensorNumBytes(outputDesc); + return SUCCESS; + } + + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + U32 tPadding = fh - 1 - paddingT; + U32 bPadding = fh - 1 - paddingB; + U32 lPadding = fw - 1 - paddingL; + U32 rPadding = fw - 1 - paddingR; + + ConvolutionParamSpec transposedCD = convParamSpec; + transposedCD.stride_h = 1; + transposedCD.stride_w = 1; + transposedCD.padding_top = 0; + transposedCD.padding_bottom = 0; + transposedCD.padding_left = 0; + transposedCD.padding_right = 0; + transposedCD.dilatedRate_h = 1; + transposedCD.dilatedRate_w = 1; + + ih = ih + (ih - 1) * (strideH - 1) + tPadding + bPadding; + iw = iw + (iw - 1) * (strideW - 1) + lPadding + rPadding; + TensorDesc inPaddedDesc = tensor4df(idt, idf, in, ic, ih, iw); + if (CONVOLUTION_ALGORITHM_GROUP_DECONV == algorithm) { + *bytes = tensorNumBytes(inPaddedDesc) * 2 + 32; + return SUCCESS; + } + if (DF_NCHW == filterDesc.df) { + // Swap fn and fc + filterDesc.dims[2] = filterDesc.dims[3]; + filterDesc.dims[3] = ic; + } + U32 convolution_tmp_bytes = 0; + EE ret = NOT_SUPPORTED; + if (IS_ARM(arch)) { +#ifdef _USE_NEON + ret = convolution_infer_forward_tmp_bytes_arm( + inPaddedDesc, filterDesc, outputDesc, transposedCD, algorithm, 
&convolution_tmp_bytes); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = convolution_infer_forward_tmp_bytes_x86( + inPaddedDesc, filterDesc, outputDesc, transposedCD, algorithm, &convolution_tmp_bytes); +#endif + } + *bytes = tensorNumBytes(inPaddedDesc) + convolution_tmp_bytes; + return ret; +} + +static EE deconvolution_stride_greater_one_and_kernel_divide_stride_cpu(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw, on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U8 alignSize = 8; + U32 icAlignSize = ic / alignSize; + U32 inputTileSize = bytesOf(idt) * alignSize; +#ifndef _USE_OPENMP + U32 ocAlignSize = oc / alignSize; + U32 outputTileSize = bytesOf(odt) * alignSize; + ArrayAddFunction add_func = get_array_add_function(arch); + ArrayActivationFunction activation_func = get_array_activation_function(arch); +#endif + U32 iPaddingT = + (convParamSpec.kernel_h - 1 - convParamSpec.padding_top) / convParamSpec.stride_h; + U32 iPaddingB = + (convParamSpec.kernel_h - 1 - convParamSpec.padding_bottom) / convParamSpec.stride_h; + U32 iPaddingL = + (convParamSpec.kernel_w - 1 - convParamSpec.padding_left) / convParamSpec.stride_w; + U32 iPaddingR = + (convParamSpec.kernel_w - 1 - convParamSpec.padding_right) / convParamSpec.stride_w; + U32 iKernelH = convParamSpec.kernel_h / convParamSpec.stride_h; + U32 iKernelW = convParamSpec.kernel_w / convParamSpec.stride_w; + U8 *tmpInput = (U8 *)tmp; + U32 iStrideT = (convParamSpec.kernel_h - 1 - convParamSpec.padding_top) % convParamSpec.stride_h; + U32 iStrideL = + (convParamSpec.kernel_w - 1 - convParamSpec.padding_left) % convParamSpec.stride_w; + U32 iDumpH = 1; + if (iStrideT == convParamSpec.stride_h - 1) { + iDumpH = 0; + } + U32 iDumpW = 1; + if (iStrideL == convParamSpec.stride_w - 1) { + iDumpW = 0; + } + U32 ihNum = iPaddingT + ih + iPaddingB; + U32 iwNum = iPaddingL + iw + iPaddingR; + U32 mNum = 0; + for (U32 n = 0; n < in; n++) { + for (U32 hStart = 0; hStart <= ihNum - iKernelH; hStart++) { + for (U32 wStart = 0; wStart <= iwNum - iKernelW; wStart++, mNum++) { + for (U32 c = 0, k = 0; c < icAlignSize; c++) { + for (U32 i = 0; i < iKernelH; i++) { + for (U32 j = 0; j < iKernelW; j++, tmpInput += inputTileSize, k += 8) { + U32 h = hStart + i; + U32 w = wStart + j; + if (h < iPaddingT || h >= iPaddingT + ih || w < iPaddingL || + w >= iPaddingL + iw) { + memset(tmpInput, 0, inputTileSize); + } else { + U32 srcIndex = (((n * icAlignSize + c) * ih + (h - iPaddingT)) * iw + + (w - iPaddingL)) * + inputTileSize; + memcpy(tmpInput, (const U8 *)input + srcIndex, inputTileSize); + } + } + } + } + } + } + } + U32 kNum = ic * iKernelH * iKernelW; + U32 nNum = oc; + TensorDesc tmpInputDesc = tensor2df(idt, DF_NORMAL, mNum, kNum); + TensorDesc tmpFilterDesc = tensor2df(filterDesc.dt, DF_NORMAL, kNum, nNum); + TensorDesc tmpOutputDesc = tensor2df(odt, DF_NORMAL, mNum, nNum); + tmpInput = (U8 *)tmp; + U32 bufferSize = + (tmpBytes - tensorNumBytes(tmpInputDesc) - tensorNumBytes(tmpOutputDesc) * OMP_NUM_THREADS) / + OMP_NUM_THREADS; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 
index = 0; index < convParamSpec.stride_h * convParamSpec.stride_w; index++) { + U32 i = index / convParamSpec.stride_w; + U32 j = index % convParamSpec.stride_w; +#ifdef _USE_OPENMP + // For NDK on ARMv7, OpenMP loop cannot reference more than 14 outside variables + ArrayAddFunction add_func = get_array_add_function(arch); + ArrayActivationFunction activation_func = get_array_activation_function(arch); + U32 ocAlignSize = outputDesc.dims[2] / 8; + U32 outputTileSize = bytesOf(outputDesc.dt) * 8; + U32 iPaddingT = + (convParamSpec.kernel_h - 1 - convParamSpec.padding_top) / convParamSpec.stride_h; + U32 iPaddingB = + (convParamSpec.kernel_h - 1 - convParamSpec.padding_bottom) / convParamSpec.stride_h; + U32 iPaddingL = + (convParamSpec.kernel_w - 1 - convParamSpec.padding_left) / convParamSpec.stride_w; + U32 iPaddingR = + (convParamSpec.kernel_w - 1 - convParamSpec.padding_right) / convParamSpec.stride_w; + U32 iKernelH = convParamSpec.kernel_h / convParamSpec.stride_h; + U32 iKernelW = convParamSpec.kernel_w / convParamSpec.stride_w; + U32 iStrideT = + (convParamSpec.kernel_h - 1 - convParamSpec.padding_top) % convParamSpec.stride_h; + U32 iStrideL = + (convParamSpec.kernel_w - 1 - convParamSpec.padding_left) % convParamSpec.stride_w; + U32 ihNum = iPaddingT + inputDesc.dims[1] + iPaddingB; + U32 iwNum = iPaddingL + inputDesc.dims[0] + iPaddingR; + U32 iDumpH = (iStrideT == convParamSpec.stride_h - 1) ? 0 : 1; + U32 iDumpW = (iStrideL == convParamSpec.stride_w - 1) ? 0 : 1; + U32 threadId = omp_get_thread_num(); +#else + U32 threadId = 0; +#endif + U8 *tmpOutput = (U8 *)tmpInput + tensorNumBytes(tmpInputDesc) + + (tensorNumBytes(tmpOutputDesc) + bufferSize) * threadId; + U8 *buffer = (U8 *)tmpOutput + tensorNumBytes(tmpOutputDesc); + memset(tmpOutput, 0, tensorNumBytes(tmpOutputDesc)); + const U8 *tmpFilter = (const U8 *)filter + tensorNumBytes(tmpFilterDesc) * index; + CHECK_STATUS(matrix_matrix_multiply(tmpInputDesc, tmpInput, tmpFilterDesc, tmpFilter, + bufferSize, buffer, tmpOutputDesc, tmpOutput, arch)); + U32 ihStart = 0; + U32 ihEnd = iPaddingT + inputDesc.dims[1] + iPaddingB - iKernelH - iDumpH; + U32 iwStart = 0; + U32 iwEnd = iPaddingL + inputDesc.dims[0] + iPaddingR - iKernelW - iDumpW; + if (i > iStrideT) { + ihStart += iDumpH; + ihEnd += iDumpH; + } + if (j > iStrideL) { + iwStart += iDumpW; + iwEnd += iDumpW; + } + for (U32 n = 0; n < in; n++) { + for (U32 hStart = ihStart, h = 0; hStart <= ihEnd; hStart++, h++) { + for (U32 wStart = iwStart, w = 0; wStart <= iwEnd; wStart++, w++) { + U32 srcIndex = + (((n * (ihNum - iKernelH + 1) + hStart) * (iwNum - iKernelW + 1) + wStart) * + ocAlignSize) * + outputTileSize; + add_func(outputDesc.dt, (U8 *)tmpOutput + srcIndex, bias, + (U8 *)tmpOutput + srcIndex, outputDesc.dims[2]); + CHECK_STATUS(activation_func(outputDesc.dt, (U8 *)tmpOutput + srcIndex, + outputDesc.dims[2], activationDesc, (U8 *)tmpOutput + srcIndex)); + for (U32 c = 0; c < ocAlignSize; c++) { + U32 srcIndex = + (((n * (ihNum - iKernelH + 1) + hStart) * (iwNum - iKernelW + 1) + + wStart) * + ocAlignSize + + c) * + outputTileSize; + U32 dstIndex = (((n * ocAlignSize + c) * outputDesc.dims[1] + + h * convParamSpec.stride_h + i) * + outputDesc.dims[0] + + w * convParamSpec.stride_w + j) * + outputTileSize; + memcpy((U8 *)output + dstIndex, (U8 *)tmpOutput + srcIndex, outputTileSize); + } + } + } + } + } + return SUCCESS; +} + +static EE deconvolution_stride_greater_one_and_kernel_equal_stride_cpu(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void 
*filter, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + ArrayActivationFunction activation_func = get_array_activation_function(arch); + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw, on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 ihNum = ih + convParamSpec.padding_top + convParamSpec.padding_bottom; + U32 iwNum = iw + convParamSpec.padding_left + convParamSpec.padding_right; + U32 fh = convParamSpec.kernel_h; + U32 fw = convParamSpec.kernel_w; + U32 fhNum = fh / convParamSpec.stride_h; + U32 fwNum = fw / convParamSpec.stride_w; + + TensorDesc tmpInputDesc = tensor5df(idt, DF_NCHW, in, ic / 8, ih, iw, 8); + TensorDesc finalInputDesc = tensor5df(idt, DF_NCHW, in, ih, iw, ic / 8, 8); + U32 inputTransformDims[5] = {0, 2, 3, 1, 4}; + void *tmpInput = tmp; + tmp = (U8 *)tmp + tensorNumBytes(finalInputDesc); + tmpBytes -= tensorNumBytes(finalInputDesc); + CHECK_STATUS(array_transpose(tmpInputDesc.dt, tmpInputDesc.dims, input, finalInputDesc.dims, + tmpInput, inputTransformDims, tmpInputDesc.nDims)); + + TensorDesc matrixADesc = tensor2df(idt, DF_NORMAL, in * ihNum * iwNum, ic * fhNum * fwNum); + TensorDesc matrixCDesc = tensor2df(odt, DF_NORMAL, in * ihNum * iwNum, oc * fh * fw); + void *tmpOutput = tmp; + tmp = (U8 *)tmp + tensorNumBytes(matrixCDesc); + tmpBytes -= tensorNumBytes(matrixCDesc); + U32 biasTileSize = bytesOf(biasDesc.dt) * 8; + U8 *tmpOutputPtr = (U8 *)tmpOutput; + for (U32 n = 0; n < on * ih * iw; n++) { + U8 *biasPtr = (U8 *)bias; + for (U32 c = 0; c < oc / 8; c++, biasPtr += biasTileSize) { + for (U32 i = 0; i < oh * ow / (ih * iw); i++, tmpOutputPtr += biasTileSize) { + memcpy(tmpOutputPtr, biasPtr, biasTileSize); + } + } + } + CHECK_STATUS(matrix_matrix_multiply( + matrixADesc, tmpInput, filterDesc, filter, tmpBytes, tmp, matrixCDesc, tmpOutput, arch)); + + U32 tmpOutputDims[7] = {8, ow / iw, oh / ih, oc / 8, iw, ih, on}; + U32 finalOutputDims[7] = {8, ow / iw, iw, oh / ih, ih, oc / 8, on}; + U32 outputTransformDims[7] = {0, 3, 1, 4, 2, 5, 6}; + CHECK_STATUS(array_transpose( + odt, tmpOutputDims, tmpOutput, finalOutputDims, output, outputTransformDims, 7)); + CHECK_STATUS( + activation_func(odt, output, tensorNumElements(outputDesc), activationDesc, output)); + return SUCCESS; +} + +EE deconvolution_cpu(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc scaleDesc, + const void *scale, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + UNUSED(scaleDesc); + UNUSED(scale); + + if (algorithm == CONVOLUTION_ALGORITHM_IM2COL_GEMM) { + if (convParamSpec.stride_h == convParamSpec.kernel_h && + convParamSpec.stride_w == convParamSpec.kernel_w) { + return deconvolution_stride_greater_one_and_kernel_equal_stride_cpu(inputDesc, input, + filterDesc, filter, convParamSpec, biasDesc, bias, tmpBytes, tmp, outputDesc, + output, activationDesc, arch); + } else { + return deconvolution_stride_greater_one_and_kernel_divide_stride_cpu(inputDesc, input, + filterDesc, filter, convParamSpec, biasDesc, bias, tmpBytes, tmp, outputDesc, + output, activationDesc, arch); + } 
+ } + + if (nullptr == input || nullptr == filter || nullptr == output || nullptr == bias || + nullptr == tmp) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (!(idf == DF_NCHWC8 && odf == DF_NCHWC8)) { + CHECK_STATUS(NOT_MATCH); + } + + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + ConvolutionParamSpec transposedCD = convParamSpec; + transposedCD.stride_h = 1; + transposedCD.stride_w = 1; + transposedCD.padding_top = 0; + transposedCD.padding_bottom = 0; + transposedCD.padding_left = 0; + transposedCD.padding_right = 0; + transposedCD.dilatedRate_h = 1; + transposedCD.dilatedRate_w = 1; + + if (CONVOLUTION_ALGORITHM_WINOGRAD == algorithm) { + fh = 3; + fw = 3; + } + + U32 tPadding = fh - 1 - paddingT; + U32 bPadding = fh - 1 - paddingB; + U32 lPadding = fw - 1 - paddingL; + U32 rPadding = fw - 1 - paddingR; + + U32 stuffH = strideH - 1; + U32 stuffW = strideW - 1; + U32 ihPadded = ih + (ih - 1) * stuffH + tPadding + bPadding; + U32 iwPadded = iw + (iw - 1) * stuffW + lPadding + rPadding; + TensorDesc inPaddedDesc = tensor4df(idt, idf, in, ic, ihPadded, iwPadded); + + U8 *inPad = (U8 *)tmp; + U8 *inPadMov = inPad; + U8 *inputMov = (U8 *)input; + U32 memUnit = 8 * bytesOf(idt); + + ic /= 8; + + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < tPadding; h++) { + memset(inPadMov, 0, iwPadded * memUnit); + inPadMov += iwPadded * memUnit; + } + for (U32 h = 0; h < ih - 1; h++) { + memset(inPadMov, 0, lPadding * memUnit); + inPadMov += lPadding * memUnit; + for (U32 w = 0; w < iw - 1; w++) { + memcpy(inPadMov, inputMov, memUnit); + inPadMov += memUnit; + inputMov += memUnit; + memset(inPadMov, 0, stuffW * memUnit); + inPadMov += stuffW * memUnit; + } + memcpy(inPadMov, inputMov, memUnit); + inPadMov += memUnit; + inputMov += memUnit; + memset(inPadMov, 0, rPadding * memUnit); + inPadMov += rPadding * memUnit; + + // stuffH + memset(inPadMov, 0, iwPadded * stuffH * memUnit); + inPadMov += iwPadded * stuffH * memUnit; + } + memset(inPadMov, 0, lPadding * memUnit); + inPadMov += lPadding * memUnit; + for (U32 w = 0; w < iw - 1; w++) { + memcpy(inPadMov, inputMov, memUnit); + inPadMov += memUnit; + inputMov += memUnit; + memset(inPadMov, 0, stuffW * memUnit); + inPadMov += stuffW * memUnit; + } + memcpy(inPadMov, inputMov, memUnit); + inPadMov += memUnit; + inputMov += memUnit; + memset(inPadMov, 0, rPadding * memUnit); + inPadMov += rPadding * memUnit; + + for (U32 h = ihPadded - bPadding; h < ihPadded; h++) { + memset(inPadMov, 0, iwPadded * memUnit); + inPadMov += iwPadded * memUnit; + } + } + + EE ret = NOT_SUPPORTED; + if (algorithm == CONVOLUTION_ALGORITHM_GROUP_DECONV) { + TensorDesc blankTensorDesc; + ActivationParamSpec blankActivationParamSpec; + ret = depthwise_pointwise_convolution_cpu(inPaddedDesc, inPad, filterDesc, filter, + blankTensorDesc, nullptr, transposedCD, + DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT, biasDesc, bias, blankTensorDesc, + nullptr, tmpBytes - tensorNumBytes(inPaddedDesc), inPad + tensorNumBytes(inPaddedDesc), + 
outputDesc, output, activationDesc, blankActivationParamSpec, arch); + } else { + ret = convolution_cpu(inPaddedDesc, inPad, filterDesc, filter, transposedCD, algorithm, + scaleDesc, scale, biasDesc, bias, tmpBytes - tensorNumBytes(inPaddedDesc), + inPad + tensorNumBytes(inPaddedDesc), outputDesc, output, activationDesc, arch); + } + + return ret; +} + +#endif diff --git a/compute/tensor/src/cpu/depthwise_convolution.cpp b/compute/tensor/src/cpu/depthwise_convolution.cpp new file mode 100644 index 00000000..24caa9b5 --- /dev/null +++ b/compute/tensor/src/cpu/depthwise_convolution.cpp @@ -0,0 +1,31 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/tensor_computing_cpu.h" + +EE depthwise_convolution_transform_filter_bytes_cpu( + TensorDesc filterDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32 *bytes) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + *bytes = tensorNumBytes(filterDesc); + break; + default: + return NOT_SUPPORTED; + } + *bytes += 32; + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/depthwise_pointwise_convolution.cpp new file mode 100644 index 00000000..14eea321 --- /dev/null +++ b/compute/tensor/src/cpu/depthwise_pointwise_convolution.cpp @@ -0,0 +1,68 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif +#include "cpu/tensor_computing_cpu.h" + +EE depthwise_pointwise_convolution_cpu(TensorDesc inputDesc, + void *input, + TensorDesc dwFilterDesc, + const void *dwFilter, + TensorDesc pwFilterDesc, + const void *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const void *dwBias, + TensorDesc pwBiasDesc, + const void *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = depthwise_pointwise_convolution_general(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, tmpBytes, + tmp, outputDesc, output, depthwiseActivationParamSpec, pointwiseActivationParamSpec); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = depthwise_pointwise_convolution_x86(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, algorithm, dwBiasDesc, dwBias, pwBiasDesc, + pwBias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = depthwise_pointwise_convolution_arm(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, algorithm, dwBiasDesc, dwBias, pwBiasDesc, + pwBias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); +#endif + } + return ret; +} \ No newline at end of file diff --git a/compute/tensor/src/cpu/detectionoutput.cpp b/compute/tensor/src/cpu/detectionoutput.cpp new file mode 100644 index 00000000..9695c638 --- /dev/null +++ b/compute/tensor/src/cpu/detectionoutput.cpp @@ -0,0 +1,252 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
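The detectionoutput.cpp below decodes SSD-style prior boxes: predicted offsets are scaled by per-coordinate variances, the center is shifted relative to the prior box, and the size is exponentiated. A standalone sketch of just the decode step with plain floats (illustrative values, not from the patch):

#include <stdio.h>
#include <math.h>

int main()
{
    float loc[4] = {0.1f, 0.2f, 0.05f, -0.05f};  // predicted offsets (cx, cy, w, h)
    float pb[4] = {0.2f, 0.2f, 0.6f, 0.6f};      // prior box corners (xmin, ymin, xmax, ymax)
    float var[4] = {0.1f, 0.1f, 0.2f, 0.2f};     // encoding variances

    float pw = pb[2] - pb[0], ph = pb[3] - pb[1];
    float pcx = (pb[0] + pb[2]) * 0.5f, pcy = (pb[1] + pb[3]) * 0.5f;

    float cx = var[0] * loc[0] * pw + pcx;       // decoded center
    float cy = var[1] * loc[1] * ph + pcy;
    float w = expf(var[2] * loc[2]) * pw;        // decoded size
    float h = expf(var[3] * loc[3]) * ph;

    // Back to corner form, as detectionoutput_kernel stores into boxes[i].
    printf("xmin=%g ymin=%g xmax=%g ymax=%g\n",
        cx - w * 0.5f, cy - h * 0.5f, cx + w * 0.5f, cy + h * 0.5f);
    return 0;
}

After decoding, the kernel filters by confidence, sorts per class, applies NMS, and writes the survivors into the output with the detection count in output[0].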
+
+#include "error.h"
+#include "cpu/tensor_computing_cpu.h"
+
+inline EE qsort_descent(std::vector<BoxRect> &boxes, std::vector<F32> &scores, int left, int right)
+{
+    if (boxes.empty() || scores.empty()) {
+        return NOT_SUPPORTED;
+    }
+
+    int i = left;
+    int j = right;
+    F32 temp = scores[(left + right) / 2];
+
+    while (i <= j) {
+        while (scores[i] > temp) {
+            i++;
+        }
+        while (scores[j] < temp) {
+            j--;
+        }
+        if (i <= j) {
+            std::swap(boxes[i], boxes[j]);
+            std::swap(scores[i], scores[j]);
+            i++;
+            j--;
+        }
+    }
+
+    if (left < j) {
+        qsort_descent(boxes, scores, left, j);
+    }
+    if (i < right) {
+        qsort_descent(boxes, scores, i, right);
+    }
+
+    return SUCCESS;
+}
+
+inline F32 intersectionarea(BoxRect a, BoxRect b)
+{
+    if (a.xmin > b.xmax || a.xmax < b.xmin || a.ymin > b.ymax || a.ymax < b.ymin) {
+        return 0.f;
+    }
+    F32 inter_width = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin);
+    F32 inter_height = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin);
+
+    return inter_width * inter_height;
+}
+
+inline EE nms_pickedboxes(std::vector<BoxRect> boxes, std::vector<I64> &picked, F32 nms_threshold)
+{
+    I64 n = boxes.size();
+
+    std::vector<F32> areas(n);
+    for (I64 i = 0; i < n; i++) {
+        BoxRect box = boxes[i];
+
+        F32 width = box.xmax - box.xmin;
+        F32 height = box.ymax - box.ymin;
+
+        areas[i] = width * height;
+    }
+    for (I64 i = 0; i < n; i++) {
+        BoxRect a = boxes[i];
+        int keep = 1;
+        for (int j = 0; j < (int)picked.size(); j++) {
+            BoxRect b = boxes[picked[j]];
+            F32 inter_area = intersectionarea(a, b);
+            F32 union_area = areas[i] + areas[picked[j]] - inter_area;
+
+            if (inter_area / union_area > nms_threshold) {
+                keep = 0;
+            }
+        }
+        if (keep) {
+            picked.push_back(i);
+        }
+    }
+    return SUCCESS;
+}
+
+template <typename T>
+EE detectionoutput_kernel(std::vector<void *> input,
+    T *output,
+    U32 priorbox_width,
+    U32 num_class,
+    F32 nms_threshold,
+    U32 nms_top_k,
+    U32 keep_top_k,
+    F32 confidence_threshold)
+{
+    T *location = (T *)input[0];
+    T *confidence = (T *)input[1];
+    T *priorbox = (T *)input[2];
+
+    U32 num_total_priorbox = priorbox_width / 4;
+    U32 numclass = num_class;
+
+    std::vector<std::vector<F32>> boxes;
+    boxes.resize(num_total_priorbox);
+    T *variance = priorbox + priorbox_width;
+    // decode priorbox
+    for (U32 i = 0; i < num_total_priorbox; i++) {
+        T *loc = location + i * 4;
+        T *pb = priorbox + i * 4;
+        T *var = variance + i * 4;
+
+        F32 pb_w = pb[2] - pb[0];
+        F32 pb_h = pb[3] - pb[1];
+        F32 pb_cx = (pb[0] + pb[2]) * 0.5f;
+        F32 pb_cy = (pb[1] + pb[3]) * 0.5f;
+
+        F32 box_cx = var[0] * loc[0] * pb_w + pb_cx;
+        F32 box_cy = var[1] * loc[1] * pb_h + pb_cy;
+        F32 box_w = static_cast<F32>(exp(var[2] * loc[2]) * pb_w);
+        F32 box_h = static_cast<F32>(exp(var[3] * loc[3]) * pb_h);
+
+        std::vector<F32> box;
+        box.resize(4);
+        box[0] = box_cx - box_w * 0.5f;
+        box[1] = box_cy - box_h * 0.5f;
+        box[2] = box_cx + box_w * 0.5f;
+        box[3] = box_cy + box_h * 0.5f;
+        // give box to boxes
+        boxes[i].assign(box.begin(), box.end());
+    }
+
+    std::vector<std::vector<BoxRect>> allclass_boxrects;
+    std::vector<std::vector<F32>> allclass_boxscores;
+    allclass_boxrects.resize(numclass);
+    allclass_boxscores.resize(numclass);
+
+    for (U32 i = 1; i < numclass; i++) {
+        std::vector<BoxRect> class_boxrects;
+        std::vector<F32> class_boxscores;
+        for (U32 j = 0; j < num_total_priorbox; j++) {
+            F32 score = confidence[j * numclass + i];
+
+            if (score > confidence_threshold) {
+                std::vector<F32> inbox;
+                inbox.assign(boxes[j].begin(), boxes[j].end());
+                BoxRect b = {inbox[0], inbox[1], inbox[2], inbox[3], i};
+                class_boxrects.push_back(b);
+                class_boxscores.push_back(score);
+            }
+        }
+        // sort the boxes with scores
+        qsort_descent(
+            class_boxrects, class_boxscores, 0, static_cast<int>(class_boxscores.size() - 1));
+
+        if (nms_top_k < (U32)class_boxrects.size()) {
+            class_boxrects.resize(nms_top_k);
+            class_boxscores.resize(nms_top_k);
+        }
+        // apply nms
+        std::vector<I64> picked;
+        nms_pickedboxes(class_boxrects, picked, nms_threshold);
+
+        for (I64 j = 0; j < (I64)picked.size(); j++) {
+            I64 picked_box = picked[j];
+            allclass_boxrects[i].push_back(class_boxrects[picked_box]);
+            allclass_boxscores[i].push_back(class_boxscores[picked_box]);
+        }
+    }
+
+    std::vector<BoxRect> boxrects;
+    std::vector<F32> boxscores;
+
+    for (U32 i = 1; i < numclass; i++) {
+        boxrects.insert(boxrects.end(), allclass_boxrects[i].begin(), allclass_boxrects[i].end());
+        boxscores.insert(
+            boxscores.end(), allclass_boxscores[i].begin(), allclass_boxscores[i].end());
+    }
+
+    qsort_descent(boxrects, boxscores, 0, static_cast<int>(boxscores.size() - 1));
+
+    if (keep_top_k < (U32)boxrects.size()) {
+        boxrects.resize(keep_top_k);
+        boxscores.resize(keep_top_k);
+    }
+
+    U32 num_detected = static_cast<U32>(boxrects.size());
+    // the first box contains the number of available boxes in the first element.
+    output[0] = num_detected;
+    output[1] = output[2] = output[3] = output[4] = output[5] = 0;
+
+    for (U32 i = 0; i < num_detected; i++) {
+        BoxRect b = boxrects[i];
+        F32 score = boxscores[i];
+
+        output[(i + 1) * 6] = b.label;
+        output[(i + 1) * 6 + 1] = score;
+        output[(i + 1) * 6 + 2] = b.xmin;
+        output[(i + 1) * 6 + 3] = b.ymin;
+        output[(i + 1) * 6 + 4] = b.xmax;
+        output[(i + 1) * 6 + 5] = b.ymax;
+    }
+    return SUCCESS;
+}
+
+EE detectionoutput_cpu(std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    DetectionOutputParamSpec detectionOutputParamSpec,
+    TensorDesc outputDesc,
+    void *output)
+{
+    UNUSED(outputDesc);
+    if (nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    if (inputDesc.size() != 3) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+    DataType idt0 = inputDesc[0].dt;
+    U32 ilens2 = inputDesc[2].dims[0];
+    U32 numclass = detectionOutputParamSpec.num_class;
+    F32 nmsthreshold = detectionOutputParamSpec.nms_threshold;
+    U32 nmstopk = detectionOutputParamSpec.nms_top_k;
+    U32 keeptopk = detectionOutputParamSpec.keep_top_k;
+    F32 confidencethreshold = detectionOutputParamSpec.confidence_threshold;
+    EE ret = SUCCESS;
+    switch (idt0) {
+#ifdef _USE_FP32
+        case DT_F32:
+            detectionoutput_kernel<F32>(input, (F32 *)output, ilens2, numclass, nmsthreshold,
+                nmstopk, keeptopk, confidencethreshold);
+            break;
+#endif
+#ifdef _USE_FP16
+        case DT_F16:
+            detectionoutput_kernel<F16>(input, (F16 *)output, ilens2, numclass, nmsthreshold,
+                nmstopk, keeptopk, confidencethreshold);
+            break;
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/cpu/eltwise.cpp b/compute/tensor/src/cpu/eltwise.cpp
new file mode 100644
index 00000000..def5b37a
--- /dev/null
+++ b/compute/tensor/src/cpu/eltwise.cpp
@@ -0,0 +1,169 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <vector>
+#include "cpu/tensor_computing_cpu.h"
+#ifdef _USE_GENERAL
+#include "cpu/general/tensor_computing_general.h"
+#endif
+#ifdef _USE_NEON
+#include "cpu/arm/tensor_computing_arm.h"
+#endif
+#ifdef _USE_X86
+#include "cpu/x86/tensor_computing_x86.h"
+#endif
+
+static std::vector<U32> calculateRelativeLocalIndex_cpu(U32 *indexes, U32 *dims, U32 nDims)
+{
+    std::vector<U32> relativeIndexes(nDims);
+    for (U32 i = 0; i < nDims; i++) {
+        relativeIndexes[i] = indexes[i] % dims[i];
+    }
+    return relativeIndexes;
+}
+
+// [1, 10, 10] + [1, 10, 10] = [1, 10, 10]
+// [1, 10, 1] + [1, 1, 10] = [1, 10, 10]
+// [1, 20, 10] + [10] = [1, 20, 10] + [1, 1, 10] = [1, 20, 10]
+EE eltwise_cpu(std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input_,
+    EltwiseParamSpec eltwiseDesc,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    Arch arch)
+{
+    U32 num = inputDesc.size();
+    if (num <= 1 || outputDesc.nDims < 1) {
+        return NOT_MATCH;
+    }
+    std::vector<void *> input = input_;
+    U32 nchwc8Count = 0;
+    U32 minDims = inputDesc[0].nDims;
+    for (U32 i = 0; i < num; i++) {
+        if (inputDesc[i].df == DF_NCHWC8) {
+            nchwc8Count++;
+        }
+        if (inputDesc[i].nDims < minDims) {
+            minDims = inputDesc[i].nDims;
+        }
+    }
+    U8 *ptr = (U8 *)tmp;
+    if (nchwc8Count > 0 && nchwc8Count != num) {
+        for (U32 i = 0; i < num; i++) {
+            if (inputDesc[i].df == DF_NCHWC8) {
+                TensorDesc tmpDesc = inputDesc[i];
+                tmpDesc.df = DF_NCHW;
+                transformToNCHW(inputDesc[i], input[i], tmpDesc, ptr);
+                inputDesc[i] = tmpDesc;
+                input[i] = ptr;
+                ptr += tensorNumBytes(inputDesc[i]);
+                // Output from 1D-conv + 3D tensors
+                if (inputDesc[i].dims[0] == 1 && minDims == 3) {
+                    inputDesc[i] = tensor3df(inputDesc[i].dt, DF_NCHW,
+                        inputDesc[i].dims[3], inputDesc[i].dims[2], inputDesc[i].dims[1]);
+                }
+            }
+        }
+    }
+
+    I32 oneCount = 0;
+    for (int i = 0; i < ((int)outputDesc.nDims) - 1; i++) {
+        if (outputDesc.dims[i] == 1) {
+            oneCount++;
+        } else {
+            break;
+        }
+    }
+    TensorDesc newOutputDesc = outputDesc;
+    for (int i = 0; i < (int)outputDesc.nDims - oneCount; i++) {
+        newOutputDesc.dims[i] = outputDesc.dims[oneCount + i];
+    }
+    newOutputDesc.nDims = outputDesc.nDims - oneCount;
+
+    std::vector<TensorDesc> newInputDesc(num);
+    for (U32 i = 0; i < num; i++) {
+        newInputDesc[i] = inputDesc[i];
+        for (int j = 0; j < (int)inputDesc[i].nDims - oneCount; j++) {
+            newInputDesc[i].dims[j] = inputDesc[i].dims[oneCount + j];
+        }
+        newInputDesc[i].nDims = inputDesc[i].nDims - oneCount;
+        for (U32 j = newInputDesc[i].nDims; j < newOutputDesc.nDims; j++) {
+            newInputDesc[i].dims[j] = 1;
+        }
+        newInputDesc[i].nDims = newOutputDesc.nDims;
+    }
+    U32 size = tensorNumElements(newOutputDesc);
+    int lastDimSize = newOutputDesc.dims[0];
+    std::vector<U32> lastDimSizes(num);
+    for (U32 i = 0; i < num; i++) {
+        lastDimSizes[i] = newInputDesc[i].dims[0];
+        if (lastDimSizes[i] != (U32)lastDimSize && newInputDesc[0].df == DF_NCHWC8) {
+            UNI_ERROR_LOG("For NCHWc8, eltwise can only handle inputs with matching widths\n");
+        }
+    }
+    for (U32 i = 1; i < newOutputDesc.nDims; i++) {
+        bool sameDim = true;
+        for (U32 j = 0; j < num; j++) {
+            if (newInputDesc[j].dims[i] != newOutputDesc.dims[i]) {
+                sameDim = false;
+                break;
+            }
+        }
+        if (sameDim) {
+            lastDimSize *= newOutputDesc.dims[i];
+            for (U32 j = 0; j < num; j++) {
+                lastDimSizes[j] *= newInputDesc[j].dims[i];
+            }
+        } else {
+            break;
+        }
+    }
+
+    std::vector<void *> newInput(num);
+    EE ret = NOT_SUPPORTED;
+    for (U32 i = 0; i < size; i += lastDimSize) {
+        std::vector<U32> index = calculateLocalIndex(i, newOutputDesc.dims, newOutputDesc.nDims);
+        for (U32 j = 0; j < num; j++) {
+            std::vector<U32> relativeIndex = calculateRelativeLocalIndex_cpu(
+                index.data(), newInputDesc[j].dims, newInputDesc[j].nDims);
+            U32 globalIndex = calculateGlobalIndex(
+                relativeIndex.data(), newInputDesc[j].dims, newInputDesc[j].nDims);
+            newInput[j] = (U8 *)(input[j]) + globalIndex * bytesOf(newInputDesc[j].dt);
+        }
+        U8 *newOutput = (U8 *)output + i * bytesOf(newOutputDesc.dt);
+        if (IS_GENERAL(arch)) {
+#ifdef _USE_GENERAL
+            ret = eltwise_general(newOutputDesc.dt, newInput, lastDimSizes, num, lastDimSize,
+                newOutput, eltwiseDesc.elt_mode);
+#endif
+#ifdef _USE_NEON
+        } else if (IS_ARM(arch)) {
+            ret = eltwise_arm(newOutputDesc.dt, newInput, lastDimSizes, num, lastDimSize, newOutput,
+                eltwiseDesc.elt_mode);
+#endif
+#ifdef _USE_X86
+        } else if (IS_X86_AVX2(arch)) {
+            ret = eltwise_x86(newOutputDesc.dt, newInput, lastDimSizes, num, lastDimSize, newOutput,
+                eltwiseDesc.elt_mode);
+#endif
+        }
+    }
+    if (ret == SUCCESS && eltwiseDesc.activation_type != ACTIVATION_NULL) {
+        ActivationParamSpec p;
+        p.mode = eltwiseDesc.activation_type;
+        ret = activation_cpu(outputDesc, output, p, outputDesc, output, arch);
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/cpu/embedding.cpp b/compute/tensor/src/cpu/embedding.cpp
new file mode 100644
index 00000000..6698b5a4
--- /dev/null
+++ b/compute/tensor/src/cpu/embedding.cpp
@@ -0,0 +1,66 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
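
Editor's note: the embedding kernel that follows is a row gather from the weight matrix, with an optional transposed (column-major) weight layout. A minimal sketch in plain C++ of the same indexing (names and types here are illustrative, not part of the patch):

#include <vector>

// Gather one embedding vector per token index from a weight matrix that is
// either [input_dim x num_output] row-major, or, when `transpose` is set,
// [num_output x input_dim] so each output element strides by input_dim.
std::vector<float> lookup(const std::vector<float> &weight,
    const std::vector<int> &tokens, int input_dim, int num_output, bool transpose)
{
    std::vector<float> out(tokens.size() * (size_t)num_output);
    for (size_t i = 0; i < tokens.size(); i++) {
        for (int j = 0; j < num_output; j++) {
            size_t src = transpose ? (size_t)j * input_dim + tokens[i]
                                   : (size_t)tokens[i] * num_output + j;
            out[i * num_output + j] = weight[src];
        }
    }
    return out;
}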
+ +#include "cpu/tensor_computing_cpu.h" + +EE embedding_cpu(TensorDesc inputDesc, + void *input, + void *weight, + EmbedParamSpec p, + TensorDesc outputDesc, + void *output) +{ + U8 *weightPtr = (U8 *)weight; + U8 *outputPtr = (U8 *)output; + U32 len = tensorNumElements(inputDesc); + U32 elementBytes = bytesOf(outputDesc.dt); + U32 wordEmbeddingCPUBytes = elementBytes * p.num_output; + U32 transposeStride = elementBytes * p.input_dim; + EE ret = SUCCESS; + for (U32 i = 0; i < len; i++) { + U32 wordIndex = 0; + switch (inputDesc.dt) { + case DT_U32: + wordIndex = ((U32 *)input)[i]; + break; + case DT_I32: + wordIndex = ((I32 *)input)[i]; + break; + case DT_F32: + wordIndex = ((F32 *)input)[i]; + break; +#ifdef _USE_FP16 + case DT_F16: + wordIndex = ((F16 *)input)[i]; + break; +#endif + default: + ret = NOT_SUPPORTED; + break; + } + U8 *dest = outputPtr; + if (p.transpose) { + U8 *src = weightPtr + wordIndex * elementBytes; + for (U32 j = 0; j < p.num_output; j++) { + memcpy(dest, src, elementBytes); + src += transposeStride; + dest += elementBytes; + } + } else { + U8 *src = weightPtr + wordIndex * wordEmbeddingCPUBytes; + memcpy(dest, src, wordEmbeddingCPUBytes); + } + outputPtr += wordEmbeddingCPUBytes; + } + return ret; +} diff --git a/compute/tensor/src/cpu/general/attention.cpp b/compute/tensor/src/cpu/general/attention.cpp new file mode 100644 index 00000000..dc12c890 --- /dev/null +++ b/compute/tensor/src/cpu/general/attention.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/general/tensor_computing_general.h" +#include "cpu/general/general_functions.h" + +template +EE attention( + U32 batch, U32 numHeads, U32 fromSequenceLength, U32 toSequenceLength, const T *input, T *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + T minValue = -10000.0; + U32 count = array_sum_template(input, toSequenceLength); + U32 valid = UNI_MIN(count, fromSequenceLength); + for (U32 n = 0; n < batch; n++) { + for (U32 i = 0; i < numHeads; i++) { + for (U32 j = 0; j < valid; j++) { + for (U32 k = 0; k < toSequenceLength; k++) { + T value = input[n * toSequenceLength + k]; + U32 index = + (((n * numHeads + i) * fromSequenceLength + j) * toSequenceLength + k); + output[index] = (1 - value) * minValue; + } + } + for (U32 j = valid; j < fromSequenceLength; j++) { + for (U32 k = 0; k < toSequenceLength; k++) { + U32 index = + (((n * numHeads + i) * fromSequenceLength + j) * toSequenceLength + k); + output[index] = minValue; + } + } + } + } + return SUCCESS; +} + +EE attention_general(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output) +{ + DataType dt; + DataFormat df; + U32 batch, numHeads, fromSequenceLength, toSequenceLength; + CHECK_REQUIREMENT(tensorIs2d(inputDesc)); + CHECK_REQUIREMENT(tensorIs4d(outputDesc)); + CHECK_STATUS(tensor4dGet( + outputDesc, &dt, &df, &batch, &numHeads, &fromSequenceLength, &toSequenceLength)); + + EE ret = SUCCESS; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: { + ret = attention(batch, numHeads, fromSequenceLength, toSequenceLength, + (const F16 *)input, (F16 *)output); + break; + } +#endif +#ifdef _USE_FP32 + case DT_F32: { + ret = attention(batch, numHeads, fromSequenceLength, toSequenceLength, + (const F32 *)input, (F32 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/general/attention_mask.cpp b/compute/tensor/src/cpu/general/attention_mask.cpp new file mode 100644 index 00000000..90a45c78 --- /dev/null +++ b/compute/tensor/src/cpu/general/attention_mask.cpp @@ -0,0 +1,102 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+
+#include <string.h>
+#include "cpu/general/tensor_computing_general.h"
+
+template <typename T>
+static EE attention_mask(TensorDesc inputDesc,
+    const T *input,
+    I32 attentionLength,
+    bool sameLength,
+    float maskValue,
+    TensorDesc outputDesc,
+    T *output)
+{
+    UNUSED(outputDesc);
+    if (nullptr == input || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    int qlen = inputDesc.dims[1];
+    int klen = inputDesc.dims[0];
+    int mlen = klen - qlen;
+    std::vector<std::vector<T>> mask;
+    if (attentionLength < 0) {
+        mask = std::vector<std::vector<T>>(qlen, std::vector<T>(klen, 0));
+    } else {
+        mask = std::vector<std::vector<T>>(qlen, std::vector<T>(klen, 1));
+        for (int i = 0; i < qlen; i++) {
+            int start, loops;
+            if (attentionLength > 0) {
+                int end = mlen + i;
+                start = UNI_MAX(end - attentionLength, 0);
+                loops = end - start + 1;
+            } else {
+                if (sameLength) {
+                    start = i;
+                    loops = qlen + 1;
+                } else {
+                    start = 0;
+                    loops = i + qlen + 1;
+                }
+            }
+            loops = UNI_MAX(loops, 0);
+            start = UNI_MIN(start, klen);
+            if (start + loops > klen) {
+                loops = UNI_MAX(klen - start, 0);
+            }
+            memset(&mask[i][start], 0, sizeof(T) * loops);
+        }
+    }
+    I32 loops = tensorNumElements(inputDesc) / qlen / klen;
+    for (int i = 0, index = 0; i < loops; i++) {
+        for (int j = 0; j < qlen; j++) {
+            for (int k = 0; k < klen; k++) {
+                output[index] = input[index] * (1 - mask[j][k]) - maskValue * mask[j][k];
+                index++;
+            }
+        }
+    }
+    return SUCCESS;
+}
+
+EE attention_mask_general(TensorDesc inputDesc,
+    const void *input,
+    AttentionMaskParamSpec p,
+    TensorDesc outputDesc,
+    void *output)
+{
+    DataType idt = inputDesc.dt;
+    EE ret = SUCCESS;
+    switch (idt) {
+#ifdef _USE_FP32
+        case DT_F32: {
+            ret = attention_mask(inputDesc, (const F32 *)input, p.attention_length,
+                p.same_length, p.mask, outputDesc, (F32 *)output);
+            break;
+        }
+#endif
+#ifdef _USE_FP16
+        case DT_F16: {
+            ret = attention_mask(inputDesc, (const F16 *)input, p.attention_length,
+                p.same_length, p.mask, outputDesc, (F16 *)output);
+            break;
+        }
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+
+    return ret;
+}
diff --git a/compute/tensor/src/cpu/general/check.cpp b/compute/tensor/src/cpu/general/check.cpp
new file mode 100644
index 00000000..50ac9a82
--- /dev/null
+++ b/compute/tensor/src/cpu/general/check.cpp
@@ -0,0 +1,115 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
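
Editor's note: the check op below collapses the innermost axis with a comparison, so each outer slice yields 1 only when every inner element passes. An equality-mode sketch with plain types (illustrative, not the library API):

#include <vector>

std::vector<int> check_all_equal(const std::vector<float> &a,
    const std::vector<float> &b, int outer, int inner)
{
    std::vector<int> out(outer);
    for (int i = 0; i < outer; i++) {
        int count = 0;
        for (int j = 0; j < inner; j++) {
            count += (a[(size_t)i * inner + j] == b[(size_t)i * inner + j]);
        }
        out[i] = (count == inner);  // 1 iff the whole slice satisfies the predicate
    }
    return out;
}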
+ +#include "cpu/general/tensor_computing_general.h" + +template +static EE check(TensorDesc inputDescA, + const T *inputA, + TensorDesc inputDescB, + const T *inputB, + CheckMode checkMode, + TensorDesc outputDesc, + I32 *output) +{ + UNUSED(inputDescB); + UNUSED(outputDesc); + + if (nullptr == inputA || nullptr == inputB || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + U32 size = tensorNumElements(inputDescA); + U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; + U32 loopInner = size / loopOuter; + + for (U32 i = 0; i < loopOuter; i++) { + U32 count = 0; + for (U32 j = 0; j < loopInner; j++) { + U32 index = i * loopInner + j; + switch (checkMode) { + case CHECK_EQUAL: { + if (inputA[index] == inputB[index]) { + count++; + } + break; + } + case CHECK_GREATEQUAL: { + if (inputA[index] >= inputB[index]) { + count++; + } + break; + } + case CHECK_GREAT: { + if (inputA[index] > inputB[index]) { + count++; + } + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + } + + if (count == loopInner) { + output[i] = 1; + } else { + output[i] = 0; + } + } + return SUCCESS; +} + +EE check_general(TensorDesc inputDescA, + const void *inputA, + TensorDesc inputDescB, + const void *inputB, + CheckParamSpec p, + TensorDesc outputDesc, + void *output) +{ + DataType idt = inputDescA.dt; + EE ret = SUCCESS; + switch (idt) { +#ifdef _USE_FP16 + case DT_F16: { + ret = check(inputDescA, (const F16 *)inputA, inputDescB, (const F16 *)inputB, + p.check_mode, outputDesc, (I32 *)output); + break; + } +#endif +#ifdef _USE_FP32 + case DT_F32: { + ret = check(inputDescA, (const F32 *)inputA, inputDescB, (const F32 *)inputB, + p.check_mode, outputDesc, (I32 *)output); + break; + } +#endif + case DT_U32: { + ret = check(inputDescA, (const U32 *)inputA, inputDescB, (const U32 *)inputB, + p.check_mode, outputDesc, (I32 *)output); + break; + } + case DT_I32: { + ret = check(inputDescA, (const I32 *)inputA, inputDescB, (const I32 *)inputB, + p.check_mode, outputDesc, (I32 *)output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + + return ret; +} diff --git a/compute/tensor/src/cpu/general/clip.cpp b/compute/tensor/src/cpu/general/clip.cpp new file mode 100644 index 00000000..a627a24e --- /dev/null +++ b/compute/tensor/src/cpu/general/clip.cpp @@ -0,0 +1,55 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/general/tensor_computing_general.h" + +template +static EE clip(T *input, T *output, U32 len, F32 min_value, F32 max_value) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + for (U32 i = 0; i < len; i++) { + F32 value = input[i]; + value = (value > min_value) ? value : min_value; + value = (value < max_value) ? value : max_value; + output[i] = value; + } + return SUCCESS; +} + +EE clip_general( + TensorDesc inputDesc, void *input, ClipParamSpec p, TensorDesc outputDesc, void *output) +{ + UNUSED(outputDesc); + EE ret = SUCCESS; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = clip((F32 *)input, (F32 *)output, tensorNumElements(inputDesc), p.min, p.max); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = clip((F16 *)input, (F16 *)output, tensorNumElements(inputDesc), p.min, p.max); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/general/convolution.cpp b/compute/tensor/src/cpu/general/convolution.cpp new file mode 100644 index 00000000..9179307e --- /dev/null +++ b/compute/tensor/src/cpu/general/convolution.cpp @@ -0,0 +1,209 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include +#include +#include "types.h" +#include "cpu/general/tensor_computing_general.h" +#include "cpu/general/general_functions.h" + +template +inline EE convolution(TensorDesc inputDesc, + T1 *inArray, + TensorDesc filterDesc, + const T2 *filterArray, + ConvolutionParamSpec convParamSpec, + const T3 *biasArray, + const T4 *scaleArray, + TensorDesc outputDesc, + T4 *outArray, + ActivationParamSpec activationDesc, + T1 paddingValue = 0) +{ + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 group = convParamSpec.group; + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingL = convParamSpec.padding_left; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + U32 ocGroupSize = oc / group; + CHECK_REQUIREMENT(fdf == DF_NCHW); + + // For BNN, accumulated values are always 0 or 1, which may lead to error if buf is floating point. + U32 ic8 = ic / 8; + U32 oc8 = oc / 8; + for (U32 n = 0; n < in; n++) { + for (U32 o = 0; o < oc; o++) { + for (U32 h = 0; h < oh; h++) { + for (U32 w = 0; w < ow; w++) { + T3 value = 0; + U32 groupId = o / ocGroupSize; + U32 icStart = groupId * fc; + U32 icEnd = (groupId + 1) * fc; + for (U32 c = icStart, f_off = o * fc * fh * fw; c < icEnd; c++) { + for (I32 fh_idx = 0; fh_idx < (I32)fh; fh_idx++) { + for (I32 fw_idx = 0; fw_idx < (I32)fw; fw_idx++, f_off++) { + I32 ih_idx = h * strideH - paddingT + fh_idx * dilateH; + I32 iw_idx = w * strideW - paddingL + fw_idx * dilateW; + if (ih_idx >= 0 && ih_idx < (I32)ih && iw_idx >= 0 && + iw_idx < (I32)iw) { + U32 i_off; + if (idf == DF_NCHW) { + i_off = ((n * ic + c) * ih + ih_idx) * iw + iw_idx; + } else { + i_off = + (((n * ic8 + (c / 8)) * ih + ih_idx) * iw + iw_idx) * 8 + + c % 8; + } + value += inArray[i_off] * filterArray[f_off]; + } else { + value += paddingValue * filterArray[f_off]; + } + } + } + } + U32 o_off; + if (odf == DF_NCHW) { + o_off = ((n * oc + o) * oh + h) * ow + w; + } else { + o_off = (((n * oc8 + (o / 8)) * oh + h) * ow + w) * 8 + o % 8; + } + + T4 scale = 1; + if (scaleArray != nullptr) { + scale = scaleArray[o]; + } + outArray[o_off] = scale * value + biasArray[o]; + CHECK_STATUS( + activation_template(activationDesc, outArray[o_off], &outArray[o_off])); + } + } + } + } + return SUCCESS; +} + +#ifdef _USE_FP16 +void bnn_input_process(TensorDesc inputDesc, F16 *input, DataType fdt, short *output) +{ + F16 centerValue = 0.0; + if (fdt == DT_BIN01) { + centerValue = 0.5; + } + short zeroValue = 0; + if (fdt == DT_BIN11) { + zeroValue = -1; + } + U32 len = tensorNumElements(inputDesc); + for (U32 i = 0; i < len; i++) { + if (input[i] >= centerValue) { + output[i] = 1; + } else { + output[i] = zeroValue; + } + } +} + +void bnn_filter_process(TensorDesc filterDesc, BIN8 *filter, short *filterTransformed) +{ + short zeroValue = 0; + if (filterDesc.dt == DT_BIN11) { + zeroValue = -1; + } + U32 len = tensorNumElements(filterDesc); + for (U32 i = 0; i < len; i++) { + U32 bitSlot = i / 8; + U32 bitNo = 7 - (i % 8); + std::bitset<8> Q(filter[bitSlot]); + if (Q.test(bitNo)) { + filterTransformed[i] = 1; + } else { + filterTransformed[i] = zeroValue; + } + } +} +#endif + +EE 
convolution_general(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const void *scale, + TensorDesc biasDesc, + const void *bias, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc) +{ + UNUSED(scaleDesc); + UNUSED(biasDesc); + + EE ret = SUCCESS; + switch (filterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: + ret = convolution(inputDesc, (F32 *)input, filterDesc, (F32 *)filter, + convParamSpec, (F32 *)bias, (F32 *)scale, outputDesc, (F32 *)output, activationDesc); + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + ret = convolution(inputDesc, (F16 *)input, filterDesc, (F16 *)filter, + convParamSpec, (F16 *)bias, (F16 *)scale, outputDesc, (F16 *)output, activationDesc); + break; +#endif +#ifdef _USE_INT8 + case DT_I8: + ret = convolution(inputDesc, (INT8 *)input, filterDesc, + (F16 *)filter, convParamSpec, (F16 *)bias, (F16 *)scale, outputDesc, (F16 *)output, + activationDesc); + break; +#endif +#ifdef _USE_FP16 + case DT_BIN01: { + std::vector inputTransformed(tensorNumElements(inputDesc)); + std::vector filterTransformed(tensorNumElements(filterDesc)); + bnn_input_process(inputDesc, (F16 *)input, filterDesc.dt, inputTransformed.data()); + bnn_filter_process(filterDesc, (BIN8 *)filter, filterTransformed.data()); + ret = convolution(inputDesc, inputTransformed.data(), + filterDesc, filterTransformed.data(), convParamSpec, (F16 *)bias, (F16 *)scale, + outputDesc, (F16 *)output, activationDesc, 0); + break; + } + case DT_BIN11: { + std::vector inputTransformed(tensorNumElements(inputDesc)); + std::vector filterTransformed(tensorNumElements(filterDesc)); + bnn_input_process(inputDesc, (F16 *)input, filterDesc.dt, inputTransformed.data()); + bnn_filter_process(filterDesc, (BIN8 *)filter, filterTransformed.data()); + ret = convolution(inputDesc, inputTransformed.data(), + filterDesc, filterTransformed.data(), convParamSpec, (F16 *)bias, (F16 *)scale, + outputDesc, (F16 *)output, activationDesc, -1); + break; + } +#endif + default: + return NOT_SUPPORTED; + } + return ret; +} diff --git a/compute/tensor/src/cpu/general/deconvolution.cpp b/compute/tensor/src/cpu/general/deconvolution.cpp new file mode 100644 index 00000000..7ff796fe --- /dev/null +++ b/compute/tensor/src/cpu/general/deconvolution.cpp @@ -0,0 +1,150 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
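
Editor's note: the deconvolution accumulation that follows is easiest to read as a scatter: each input pixel stamps a stride-offset copy of the kernel into the output, which is exactly the order of the loops below. A single channel-pair sketch (illustrative; `out` must be zero-initialized and sized oh*ow):

void deconv_scatter_ref(const float *in, const float *flt, float *out,
    int ih, int iw, int oh, int ow, int fh, int fw,
    int strideH, int strideW, int padT, int padL)
{
    for (int h = 0; h < ih; h++) {
        for (int w = 0; w < iw; w++) {
            for (int i = 0; i < fh; i++) {
                for (int j = 0; j < fw; j++) {
                    int y = h * strideH - padT + i;  // output row receiving this tap
                    int x = w * strideW - padL + j;  // output column receiving this tap
                    if (y >= 0 && y < oh && x >= 0 && x < ow) {
                        out[y * ow + x] += in[h * iw + w] * flt[i * fw + j];
                    }
                }
            }
        }
    }
}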
+ +#include "types.h" +#include "cpu/general/tensor_computing_general.h" +#include "cpu/general/general_functions.h" + +template +inline EE deconvolution(TensorDesc inputDesc, + T *inArray, + TensorDesc filterDesc, + const T *filterArray, + ConvolutionParamSpec convParamSpec, + const T *biasArray, + TensorDesc outputDesc, + T *outArray, + ActivationParamSpec activationDesc) +{ + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 group = convParamSpec.group; + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingL = convParamSpec.padding_left; + U32 ocGroupSize = oc / group; + + // initialize outputs to 0 + memset(outArray, 0, tensorNumBytes(outputDesc)); + U32 ic8 = ic / 8; + U32 oc8 = oc / 8; + for (U32 n = 0; n < in; n++) { + for (U32 o = 0; o < oc; o++) { + U32 groupId = o / ocGroupSize; + U32 icStart = groupId * fn; + U32 icEnd = (groupId + 1) * fn; + for (U32 c = icStart; c < icEnd; c++) { + for (U32 h = 0; h < ih; h++) { + for (U32 w = 0; w < iw; w++) { + U32 i_off; + if (idf == DF_NCHW) { + i_off = ((n * ic + c) * ih + h) * iw + w; + } else { + i_off = (((n * ic8 + (c / 8)) * ih + h) * iw + w) * 8 + c % 8; + } + for (I32 fh_idx = 0; fh_idx < (I32)fh; fh_idx++) { + for (I32 fw_idx = 0; fw_idx < (I32)fw; fw_idx++) { + I32 oh_idx = fh_idx + strideH * h - paddingT; + I32 ow_idx = fw_idx + strideW * w - paddingL; + if (oh_idx >= 0 && oh_idx < (I32)oh && ow_idx >= 0 && + ow_idx < (I32)ow) { + U32 o_off; + if (odf == DF_NCHW) { + o_off = ((n * oc + o) * oh + oh_idx) * ow + ow_idx; + } else { + o_off = + (((n * oc8 + (o / 8)) * oh + oh_idx) * ow + ow_idx) * 8 + + o % 8; + } + U32 f_off = + (((c - icStart) * fc + o) * fh + fh_idx) * fw + fw_idx; + outArray[o_off] += inArray[i_off] * filterArray[f_off]; + } + } + } + } + } + } + } + } + // bias + U32 ohow = oh * ow; + for (U32 i = 0; i < tensorNumElements(outputDesc); i++) { + U32 o; + if (odf == DF_NCHW) { + o = (i / ohow) % oc; + } else { + o = (i / (ohow * 8)) % oc8 * 8 + i % 8; + } + outArray[i] += biasArray[o]; + switch (activationDesc.mode) { + case ACTIVATION_NULL: { + break; + } + case ACTIVATION_RELU: { + F32 tmp = activationDesc.value[0] * outArray[i]; + if (outArray[i] < tmp) { + outArray[i] = tmp; + } + break; + } + default: + return NOT_SUPPORTED; + } + } + return SUCCESS; +} + +EE deconvolution_general(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const void *scale, + TensorDesc biasDesc, + const void *bias, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc) +{ + UNUSED(scaleDesc); + UNUSED(scale); + UNUSED(biasDesc); + + EE ret = SUCCESS; + switch (inputDesc.dt) { +#ifdef _USE_FP16 + case DT_F16: + ret = deconvolution(inputDesc, (F16 *)input, filterDesc, (F16 *)filter, + convParamSpec, (F16 *)bias, outputDesc, (F16 *)output, activationDesc); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + ret = deconvolution(inputDesc, (F32 *)input, filterDesc, (F32 *)filter, + convParamSpec, (F32 *)bias, outputDesc, (F32 *)output, activationDesc); + break; +#endif + default: + return NOT_SUPPORTED; + } + return ret; +} diff --git 
a/compute/tensor/src/cpu/general/depthwise_convolution.cpp b/compute/tensor/src/cpu/general/depthwise_convolution.cpp new file mode 100644 index 00000000..e787ed44 --- /dev/null +++ b/compute/tensor/src/cpu/general/depthwise_convolution.cpp @@ -0,0 +1,34 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/general/tensor_computing_general.h" + +EE depthwise_convolution_general(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec depthwiseActivationParamSpec) +{ + TensorDesc blankTensorDesc; + ActivationParamSpec blankActivationParamSpec; + return depthwise_pointwise_convolution_general(inputDesc, input, filterDesc, filter, + blankTensorDesc, nullptr, convParamSpec, blankTensorDesc, bias, biasDesc, nullptr, tmpBytes, + tmp, outputDesc, output, depthwiseActivationParamSpec, blankActivationParamSpec); +} diff --git a/compute/tensor/src/cpu/general/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/general/depthwise_pointwise_convolution.cpp new file mode 100644 index 00000000..4fcddbcd --- /dev/null +++ b/compute/tensor/src/cpu/general/depthwise_pointwise_convolution.cpp @@ -0,0 +1,191 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
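
Editor's note: both the depthwise wrapper above and the fused kernel below implement a depthwise-separable convolution: a per-channel spatial filter, then a 1x1 convolution that mixes channels. A stride-1, unpadded, single-image sketch in plain C++ (illustrative names; `out` must be sized oc*oh*ow):

#include <vector>

void dw_pw_ref(const std::vector<float> &in, const std::vector<float> &dwf,
    const std::vector<float> &pwf, std::vector<float> &out,
    int ic, int ih, int iw, int oc, int fh, int fw)
{
    int oh = ih - fh + 1, ow = iw - fw + 1;
    std::vector<float> mid((size_t)ic * oh * ow, 0.f);
    // depthwise stage: one fh x fw filter per input channel
    for (int c = 0; c < ic; c++) {
        for (int h = 0; h < oh; h++) {
            for (int w = 0; w < ow; w++) {
                float acc = 0.f;
                for (int i = 0; i < fh; i++) {
                    for (int j = 0; j < fw; j++) {
                        acc += in[((size_t)c * ih + h + i) * iw + w + j] *
                            dwf[((size_t)c * fh + i) * fw + j];
                    }
                }
                mid[((size_t)c * oh + h) * ow + w] = acc;
            }
        }
    }
    // pointwise stage: 1x1 convolution mixing channels
    for (int o = 0; o < oc; o++) {
        for (int hw = 0; hw < oh * ow; hw++) {
            float acc = 0.f;
            for (int c = 0; c < ic; c++) {
                acc += mid[(size_t)c * oh * ow + hw] * pwf[(size_t)o * ic + c];
            }
            out[(size_t)o * oh * ow + hw] = acc;
        }
    }
}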
+ +#include "types.h" +#include "cpu/general/tensor_computing_general.h" +#include "cpu/general/general_functions.h" + +EE depthwise_pointwise_convolution_infer_forward_tmp_bytes_general(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *bytes) +{ + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + U32 elementSize = bytesOf(fdt); + if (fdt == DT_I8) { + elementSize = bytesOf(DT_I32); + } + *bytes = ic * oh * ow * elementSize; + return SUCCESS; +} + +template +inline EE depthwise_pointwise_convolution(TensorDesc inputDesc, + T1 *inArray, + TensorDesc dwFilterDesc, + const T2 *dwFilterArray, + TensorDesc pwFilterDesc, + const T2 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + const T3 *dwBiasArray, + const T3 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + T3 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) +{ + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingL = convParamSpec.padding_left; + bool fuseDepthwisePointwise = (pwFilterArray == nullptr) ? 
false : true; + + T3 *pwArray; + if (fuseDepthwisePointwise) { + CHECK_REQUIREMENT(tmpBytes >= ic * oh * ow * sizeof(T3)); + pwArray = (T3 *)tmp; + } else { + pwArray = outArray; + } + U32 ic8 = ic / 8; + U32 oc8 = oc / 8; + for (U32 n = 0; n < in; n++) { + // dw conv + for (U32 c = 0, pw_off = 0; c < ic; c++) { + for (U32 h = 0; h < oh; h++) { + for (U32 w = 0; w < ow; w++, pw_off++) { + T3 value = dwBiasArray[c]; + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + I32 ih_idx = h * strideH - paddingT + fh_idx; + I32 iw_idx = w * strideW - paddingL + fw_idx; + if (ih_idx >= 0 && ih_idx < (I32)ih && iw_idx >= 0 && iw_idx < (I32)iw) { + U32 i_off; + if (idf == DF_NCHW) { + i_off = ((n * ic + c) * ih + ih_idx) * iw + iw_idx; + } else { + i_off = (((n * ic8 + (c / 8)) * ih + ih_idx) * iw + iw_idx) * 8 + + c % 8; + } + value += inArray[i_off] * + dwFilterArray[c * fh * fw + fh_idx * fw + fw_idx]; + } + } + } + CHECK_STATUS( + activation_template(depthwiseActivationParamSpec, value, &value)); + + if (fuseDepthwisePointwise || odf == DF_NCHW) { + pwArray[pw_off] = value; + } else { + pwArray[(((n * ic8 + (c / 8)) * oh + h) * ow + w) * 8 + c % 8] = value; + } + } + } + } + if (fuseDepthwisePointwise) { + // pw conv + for (U32 o = 0; o < oc; o++) { + for (U32 hw = 0; hw < oh * ow; hw++) { + T3 value = pwBiasArray[o]; + for (U32 c = 0; c < ic; c++) { + U32 pw_off = c * oh * ow + hw; + value += pwArray[pw_off] * pwFilterArray[o * ic + c]; + } + CHECK_STATUS( + activation_template(pointwiseActivationParamSpec, value, &value)); + U32 o_off; + if (odf == DF_NCHW) { + o_off = (n * oc + o) * oh * ow + hw; + } else { + o_off = ((n * oc8 + (o / 8)) * oh * ow + hw) * 8 + o % 8; + } + outArray[o_off] = value; + } + } + } + } + return SUCCESS; +} + +EE depthwise_pointwise_convolution_general(TensorDesc inputDesc, + void *input, + TensorDesc dwFilterDesc, + const void *dwFilter, + TensorDesc pwFilterDesc, + const void *pwFilter, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const void *dwBias, + TensorDesc pwBiasDesc, + const void *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { +#ifdef _USE_FP16 + case DT_F16: + ret = depthwise_pointwise_convolution(inputDesc, (F16 *)input, + dwFilterDesc, (F16 *)dwFilter, pwFilterDesc, (F16 *)pwFilter, convParamSpec, + (F16 *)dwBias, (F16 *)pwBias, tmpBytes, tmp, outputDesc, (F16 *)output, + depthwiseActivationParamSpec, pointwiseActivationParamSpec); + break; +#endif +#ifdef _USE_INT8 + case DT_I8: + ret = depthwise_pointwise_convolution(inputDesc, (INT8 *)input, + dwFilterDesc, (INT8 *)dwFilter, pwFilterDesc, (INT8 *)pwFilter, convParamSpec, + (I32 *)dwBias, (I32 *)pwBias, tmpBytes, tmp, outputDesc, (I32 *)output, + depthwiseActivationParamSpec, pointwiseActivationParamSpec); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + ret = depthwise_pointwise_convolution(inputDesc, (F32 *)input, + dwFilterDesc, (F32 *)dwFilter, pwFilterDesc, (F32 *)pwFilter, convParamSpec, + (F32 *)dwBias, (F32 *)pwBias, tmpBytes, tmp, outputDesc, (F32 *)output, + depthwiseActivationParamSpec, pointwiseActivationParamSpec); + break; +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/general/eltwise.cpp b/compute/tensor/src/cpu/general/eltwise.cpp new file mode 100644 index 00000000..a62a7da2 --- 
/dev/null
+++ b/compute/tensor/src/cpu/general/eltwise.cpp
@@ -0,0 +1,88 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "cpu/general/tensor_computing_general.h"
+
+template <typename T>
+T getFloatScalar(void *input, int inputSize, int index)
+{
+    int local = index % inputSize;
+    return ((T *)input)[local];
+}
+
+template <typename T>
+EE eltwise_general_kernel(std::vector<void *> input,
+    std::vector<U32> inputSize,
+    U32 num,
+    U32 len,
+    void *output,
+    EltwiseMode eltwiseMode)
+{
+    T *output_ptr = (T *)output;
+    for (U32 i = 0; i < len; i++) {
+        F32 tmp_s = getFloatScalar<T>(input[0], inputSize[0], i);
+        for (U32 j = 1; j < num; j++) {
+            F32 value_s = getFloatScalar<T>(input[j], inputSize[j], i);
+            switch (eltwiseMode) {
+                case ELTWISE_SUM:
+                    tmp_s = value_s + tmp_s;
+                    break;
+                case ELTWISE_MAX:
+                    tmp_s = (value_s > tmp_s) ? value_s : tmp_s;
+                    break;
+                case ELTWISE_PROD:
+                    tmp_s *= value_s;
+                    break;
+                case ELTWISE_SUB:
+                    tmp_s -= value_s;
+                    break;
+                case ELTWISE_DIV:
+                    tmp_s /= value_s;
+                    break;
+                default:
+                    return NOT_SUPPORTED;
+            }
+        }
+        output_ptr[i] = tmp_s;
+    }
+    return SUCCESS;
+}
+
+EE eltwise_general(DataType dataType,
+    std::vector<void *> input,
+    std::vector<U32> inputSize,
+    U32 num,
+    U32 len,
+    void *output,
+    EltwiseMode eltwiseMode)
+{
+    EE ret = SUCCESS;
+    switch (dataType) {
+#ifdef _USE_FP32
+        case DT_F32: {
+            ret = eltwise_general_kernel<F32>(input, inputSize, num, len, output, eltwiseMode);
+            break;
+        }
+#endif
+#ifdef _USE_FP16
+        case DT_F16: {
+            ret = eltwise_general_kernel<F16>(input, inputSize, num, len, output, eltwiseMode);
+            break;
+        }
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/cpu/general/general_functions.h b/compute/tensor/src/cpu/general/general_functions.h
new file mode 100644
index 00000000..2b886db3
--- /dev/null
+++ b/compute/tensor/src/cpu/general/general_functions.h
@@ -0,0 +1,274 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_GENERAL_FUNCTIONS +#define _H_GENERAL_FUNCTIONS + +#include "cpu/cpu_functions_template.h" + +template +inline EE from_nchwc8_to_nchw(TensorDesc *desc, T *data) +{ + if (desc == nullptr || data == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + CHECK_STATUS(tensor4dGet(*desc, &idt, &idf, &in, &ic, &ih, &iw)); + if (idf != DF_NCHWC8) { + CHECK_STATUS(NOT_MATCH); + } + + *desc = tensor4df(idt, DF_NCHW, in, ic, ih, iw); + + T *tmp = (T *)malloc(tensorNumBytes(*desc)); + ic /= 8; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 hw = 0; hw < ih * iw; hw++) { + for (U32 c8 = 0; c8 < 8; c8++) { + tmp[n * ic * 8 * ih * iw + (c * 8 + c8) * ih * iw + hw] = + data[n * ic * ih * iw * 8 + c * ih * iw * 8 + hw * 8 + c8]; + } + } + } + } + memcpy(data, tmp, tensorNumBytes(*desc)); + free(tmp); + return SUCCESS; +} + +template +inline EE from_nchw_to_nchwc8(TensorDesc *desc, T *data) +{ + if (desc == nullptr || data == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + CHECK_STATUS(tensor4dGet(*desc, &idt, &idf, &in, &ic, &ih, &iw)); + if (idf != DF_NCHW) { + CHECK_STATUS(NOT_MATCH); + } + + *desc = tensor4df(idt, DF_NCHWC8, in, ic, ih, iw); + + T *tmp = (T *)malloc(tensorNumBytes(*desc)); + ic /= 8; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 hw = 0; hw < ih * iw; hw++) { + for (U32 c8 = 0; c8 < 8; c8++) { + tmp[n * ic * ih * iw * 8 + c * ih * iw * 8 + hw * 8 + c8] = + data[n * ic * 8 * ih * iw + (c * 8 + c8) * ih * iw + hw]; + } + } + } + } + memcpy(data, tmp, tensorNumBytes(*desc)); + free(tmp); + return SUCCESS; +} + +inline F32 array_mean_general(DataType dt, const void *data, I32 len) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + result = array_mean_template((const F16 *)data, len); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + result = array_mean_template((const F32 *)data, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +inline F32 array_var_general(DataType dt, const void *data, I32 len, F32 mean) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + result = array_var_template((const F16 *)data, len, mean); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + result = array_var_template((const F32 *)data, len, mean); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); 
+ break; + } + return result; +} + +inline void array_power_general(DataType dt, void *input, void *output, I32 len, F32 power) +{ + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + array_power_template((F16 *)input, (F16 *)output, len, power); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + array_power_template((F32 *)input, (F32 *)output, len, power); + break; +#endif + case DT_I32: + array_power_template((I32 *)input, (I32 *)output, len, power); + break; + case DT_U32: + array_power_template((U32 *)input, (U32 *)output, len, power); + break; + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +inline void array_add_general( + DataType dt, const void *inputA, const void *inputB, void *output, I32 len) +{ + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + array_add_template((const F16 *)inputA, (const F16 *)inputB, (F16 *)output, len); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + array_add_template((const F32 *)inputA, (const F32 *)inputB, (F32 *)output, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +inline void array_scale_general( + DataType dt, const void *input, void *output, I32 len, F32 alpha, F32 beta) +{ + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + array_scale_template((const F16 *)input, (F16 *)output, len, alpha, beta); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + array_scale_template((const F32 *)input, (F32 *)output, len, alpha, beta); + break; +#endif + case DT_I32: + array_scale_template((const I32 *)input, (I32 *)output, len, alpha, beta); + break; + case DT_U32: + array_scale_template((const U32 *)input, (U32 *)output, len, alpha, beta); + break; + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +inline F32 array_sum_general(DataType dt, const void *data, I32 len) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + result = array_sum_template((const F16 *)data, len); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + result = array_sum_template((const F32 *)data, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +inline void array_square_and_add_general( + DataType dt, const void *inputA, const void *inputB, void *output, I32 len) +{ + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + array_square_and_add_template( + (const F16 *)inputA, (const F16 *)inputB, (F16 *)output, len); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + array_square_and_add_template( + (const F32 *)inputA, (const F32 *)inputB, (F32 *)output, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +inline EE array_activation_general( + DataType dt, void *input, U32 len, ActivationParamSpec activationDesc, void *output) +{ + EE ret = SUCCESS; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: { + F16 *inPtr = (F16 *)input; + F16 *outPtr = (F16 *)output; + for (U32 i = 0; i < len; i++) { + activation_template(activationDesc, inPtr[i], &outPtr[i]); + } + break; + } +#endif +#ifdef _USE_FP32 + case DT_F32: { + F32 *inPtr = (F32 *)input; + F32 *outPtr = (F32 *)output; + for (U32 i = 0; i < len; i++) { + activation_template(activationDesc, inPtr[i], &outPtr[i]); + } + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} +#endif diff --git a/compute/tensor/src/cpu/general/normalization.cpp b/compute/tensor/src/cpu/general/normalization.cpp new file mode 100644 index 00000000..793ebd7b --- /dev/null +++ b/compute/tensor/src/cpu/general/normalization.cpp @@ -0,0 +1,83 @@ +// Copyright (C) 2019. 
Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include + +#include "cpu/general/general_functions.h" +#include "cpu/general/tensor_computing_general.h" + +template +inline EE array_norm_scale_template( + T *input, T *output, I32 len, F32 mean, F32 var, T *alpha, T *beta) +{ + F32 eps = 1e-6; + F32 std_value = sqrt(var + eps); + for (I32 i = 0; i < len; i++) { + output[i] = alpha[i] * (input[i] - mean) / std_value + beta[i]; + } + return SUCCESS; +} + +template +inline EE layer_normalization_template( + TensorDesc inputDesc, T *input, T *alpha, T *beta, TensorDesc outputDesc, T *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + if (inputDesc.dt != outputDesc.dt || inputDesc.df != outputDesc.df) { + CHECK_STATUS(NOT_MATCH); + } + + U32 size = tensorNumElements(inputDesc); + I32 size_inner = inputDesc.dims[0]; + I32 size_outer = size / size_inner; + for (I32 i = 0; i < size_outer; i++) { + T *current_input = input + i * size_inner; + T *current_output = output + i * size_inner; + F32 mean = array_mean_template(current_input, size_inner); + F32 var = array_var_template(current_input, size_inner, mean); + + array_norm_scale_template( + current_input, current_output, size_inner, mean, var, alpha, beta); + } + + return SUCCESS; +} + +EE layer_normalization_general( + TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output) +{ + DataType idt = inputDesc.dt; + EE ret = SUCCESS; + switch (idt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = layer_normalization_template( + inputDesc, (F32 *)input, (F32 *)alpha, (F32 *)beta, outputDesc, (F32 *)output); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = layer_normalization_template( + inputDesc, (F16 *)input, (F16 *)alpha, (F16 *)beta, outputDesc, (F16 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/general/padding.cpp b/compute/tensor/src/cpu/general/padding.cpp new file mode 100644 index 00000000..aad8b036 --- /dev/null +++ b/compute/tensor/src/cpu/general/padding.cpp @@ -0,0 +1,126 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
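
Editor's note: the four pad modes handled in this file differ only in which source element each padded cell copies. A 1-D picture and the left-edge index mapping the kernel uses (illustrative sketch):

// For an input row [a b c d] and pad_left = 2:
//   constant : 0 0 a b c d
//   edge     : a a a b c d
//   reflect  : c b a b c d   (mirror about the first element, edge not repeated)
//   symmetric: b a a b c d   (mirror about the boundary, edge repeated)
enum PadMode { Constant, Edge, Reflect, Symmetric };

int left_pad_src(int w, int pad_left, PadMode mode)
{
    switch (mode) {
        case Reflect:
            return pad_left - w;      // skips the edge element
        case Symmetric:
            return pad_left - w - 1;  // repeats the edge element
        default:
            return 0;                 // Edge clamps to element 0; Constant writes zeros instead
    }
}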
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "types.h" +#include "cpu/general/tensor_computing_general.h" +#include + +EE padding_general(TensorDesc inputDesc, + const void *input, + PadParamSpec padParamSpec, + TensorDesc outputDesc, + void *output) +{ + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + CHECK_REQUIREMENT(in == on); + CHECK_REQUIREMENT(ic == oc); + U32 alignSize = 1; + if (idf == DF_NCHWC8) { + alignSize = 8; + } + ic /= alignSize; + oc /= alignSize; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < ih; h++) { + const U8 *inPtr = + (const U8 *)input + (((n * ic + c) * ih + h) * iw) * alignSize * bytesOf(idt); + U8 *outPtr = (U8 *)output + + (((n * oc + c) * oh + (padParamSpec.top + h)) * ow) * alignSize * bytesOf(odt); + if (padParamSpec.pad_mode == Pad_Constant) { + memset(outPtr, 0, padParamSpec.left * alignSize * bytesOf(odt)); + outPtr += padParamSpec.left * alignSize * bytesOf(odt); + memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + outPtr += iw * alignSize * bytesOf(odt); + memset(outPtr, 0, padParamSpec.right * alignSize * bytesOf(odt)); + } else { + for (U32 w = 0; w < padParamSpec.left; w++) { + U32 index = 0; + if (padParamSpec.pad_mode == Pad_Reflect) { + index = (padParamSpec.left - w) * alignSize * bytesOf(idt); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + index = (padParamSpec.left - w - 1) * alignSize * bytesOf(idt); + } + memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + outPtr += alignSize * bytesOf(idt); + } + memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + outPtr += iw * alignSize * bytesOf(odt); + for (U32 w = 0; w < padParamSpec.right; w++) { + U32 index = (iw - 1) * alignSize * bytesOf(idt); + if (padParamSpec.pad_mode == Pad_Reflect) { + index = (iw - w - 2) * alignSize * bytesOf(idt); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + index = (iw - w - 1) * alignSize * bytesOf(idt); + } + memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + outPtr += alignSize * bytesOf(idt); + } + } + } + U8 *outPtr = (U8 *)output + (((n * oc + c) * oh) * ow) * alignSize * bytesOf(odt); + for (U32 h = 0; h < padParamSpec.top; h++) { + U32 index = h * ow * alignSize * bytesOf(odt); + if (padParamSpec.pad_mode == Pad_Constant) { + memset(outPtr + index, 0, ow * alignSize 
* bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Edge) { + memcpy(outPtr + index, + outPtr + (padParamSpec.top * ow * alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Reflect) { + memcpy(outPtr + index, + outPtr + + ((padParamSpec.top + padParamSpec.top - h) * ow * alignSize * + bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + memcpy(outPtr + index, + outPtr + + ((padParamSpec.top + padParamSpec.top - h - 1) * ow * alignSize * + bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else { + return NOT_SUPPORTED; + } + } + for (U32 h = 0; h < padParamSpec.bottom; h++) { + U32 index = (padParamSpec.top + ih + h) * ow * alignSize * bytesOf(odt); + if (padParamSpec.pad_mode == Pad_Constant) { + memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Edge) { + memcpy(outPtr + index, + outPtr + ((padParamSpec.top + ih - 1) * ow * alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Reflect) { + // memcpy(outPtr+index, outPtr+((padParamSpec.top+ih-2-h)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); + memcpy(outPtr + index, + outPtr + + ((padParamSpec.top + ih - 1 - padParamSpec.bottom + h) * ow * + alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + memcpy(outPtr + index, + outPtr + ((padParamSpec.top + ih - 1 - h) * ow * alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else { + return NOT_SUPPORTED; + } + } + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/general/pooling.cpp b/compute/tensor/src/cpu/general/pooling.cpp new file mode 100644 index 00000000..b5aa6c4f --- /dev/null +++ b/compute/tensor/src/cpu/general/pooling.cpp @@ -0,0 +1,164 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
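Aside (sketch, not part of the patch): the pooling kernel that follows derives each output extent as round((in + padBegin + padEnd - kernel) / stride) + 1, with round = ceil or floor depending on RoundMode. A standalone C++ check of that formula, using hypothetical sizes:

#include <cmath>
#include <cstdio>

// Output extent for one spatial axis, mirroring the CEIL/FLOOR logic in pooling() below.
static unsigned poolOutDim(
    unsigned in, unsigned padBegin, unsigned padEnd, unsigned kernel, unsigned stride, bool ceilMode)
{
    double span = double(in + padBegin + padEnd - kernel) / stride;
    return (unsigned)(ceilMode ? std::ceil(span) : std::floor(span)) + 1;
}

int main()
{
    // 112-wide input, 3x3 kernel, stride 2, pad 1 each side -> 57 (CEIL) vs 56 (FLOOR)
    std::printf("%u %u\n", poolOutDim(112, 1, 1, 3, 2, true), poolOutDim(112, 1, 1, 3, 2, false));
    return 0;
}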
+
+#include <math.h>   // restored: ceil/floor (the header names were lost in this diff)
+#include <float.h>  // restored: FLT_MAX
+#include "error.h"
+#include "types.h"
+#include "cpu/general/tensor_computing_general.h"
+
+template <typename T>
+EE pooling(T *input,
+    T *output,
+    U32 in,
+    U32 ic,
+    U32 ih,
+    U32 iw,
+    U32 strideH,
+    U32 strideW,
+    U32 paddingT,
+    U32 paddingB,
+    U32 paddingL,
+    U32 paddingR,
+    U32 kernelH,
+    U32 kernelW,
+    PoolingMode pm,
+    RoundMode rm,
+    U32 alignSize,
+    F32 minValue)
+{
+    U32 oh = 0, ow = 0;
+    if (rm == CEIL) {
+        oh = (U32)(ceil((double(ih + paddingT + paddingB - kernelH) / strideH))) + 1;
+        ow = (U32)(ceil((double(iw + paddingL + paddingR - kernelW) / strideW))) + 1;
+    }
+    if (rm == FLOOR) {
+        oh = (U32)(floor((double(ih + paddingT + paddingB - kernelH) / strideH))) + 1;
+        ow = (U32)(floor((double(iw + paddingL + paddingR - kernelW) / strideW))) + 1;
+    }
+
+    CHECK_REQUIREMENT(ic % alignSize == 0);
+    ic = ic / alignSize;
+
+    for (U32 n = 0; n < in; n++) {
+        for (U32 c = 0; c < ic; c++) {
+            for (U32 j = 0; j < alignSize; j++) {
+                for (I32 h = 0; h < (I32)oh; h++) {
+                    for (I32 w = 0; w < (I32)ow; w++) {
+                        int hstart = int(h * strideH - paddingT);
+                        int wstart = int(w * strideW - paddingL);
+                        int hend = hstart + kernelH;
+                        int wend = wstart + kernelW;
+                        hstart = (hstart < 0) ? 0 : hstart;
+                        wstart = (wstart < 0) ? 0 : wstart;
+                        hend = (hend > (int)ih) ? ih : hend;
+                        wend = (wend > (int)iw) ? iw : wend;
+                        float poolSize = (hend - hstart) * (wend - wstart);
+
+                        F32 value;
+                        switch (pm) {
+                            case POOLING_MAX:
+                                value = minValue;
+                                break;
+                            case POOLING_MEAN:
+                                value = 0;
+                                break;
+                            default:
+                                return NOT_SUPPORTED;
+                        }
+                        for (int x = hstart; x < hend; x++) {
+                            for (int y = wstart; y < wend; y++) {
+                                U32 in_off = ((((n * ic + c) * ih) + x) * iw + y) * alignSize + j;
+                                switch (pm) {
+                                    case POOLING_MAX:
+                                        value = (value > input[in_off]) ? value : input[in_off];
+                                        break;
+                                    case POOLING_MEAN:
+                                        value += input[in_off];
+                                        break;
+                                    default:
+                                        return NOT_SUPPORTED;
+                                }
+                            }
+                        }
+                        switch (pm) {
+                            case POOLING_MAX:
+                                break;
+                            case POOLING_MEAN:
+                                value = value / poolSize;
+                                break;
+                            default:
+                                return NOT_SUPPORTED;
+                        }
+
+                        U32 out_off = ((((n * ic + c) * oh) + h) * ow + w) * alignSize + j;
+                        output[out_off] = value;
+                    }
+                }
+            }
+        }
+    }
+    return SUCCESS;
+}
+
+EE pooling_general(TensorDesc inputDesc,
+    const void *input,
+    PoolingParamSpec poolingParamSpec,
+    TensorDesc outputDesc,
+    void *output)
+{
+    if (nullptr == input || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType idt, odt;
+    DataFormat idf, odf;
+    U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0;
+    CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+    CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
+
+    if (in != on || ic != oc) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+    if (idf != DF_NCHWC8 || odf != idf) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+
+    U32 strideH = poolingParamSpec.stride_h;
+    U32 strideW = poolingParamSpec.stride_w;
+    U32 paddingT = poolingParamSpec.padding_top;
+    U32 paddingB = poolingParamSpec.padding_bottom;
+    U32 paddingL = poolingParamSpec.padding_left;
+    U32 paddingR = poolingParamSpec.padding_right;
+    U32 kernelSizeH = poolingParamSpec.kernel_h;
+    U32 kernelSizeW = poolingParamSpec.kernel_w;
+
+    EE ret = SUCCESS;
+    switch (idt) {
+#ifdef _USE_FP32
+        case DT_F32:
+            ret = pooling<F32>((F32 *)input, (F32 *)output, in, ic, ih, iw, strideH, strideW,
+                paddingT, paddingB, paddingL, paddingR, kernelSizeH, kernelSizeW,
+                poolingParamSpec.mode, poolingParamSpec.rm, 8, -FLT_MAX);
+            break;
+#endif
+#ifdef _USE_FP16
+        case DT_F16:
+            ret = pooling<F16>((F16 *)input, (F16 *)output, in, ic, ih, iw, strideH, strideW,
+                paddingT, paddingB, paddingL, paddingR, kernelSizeH, kernelSizeW,
+                poolingParamSpec.mode, poolingParamSpec.rm, 8, -UNI_F16_MAX);
+            break;
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/cpu/general/pooling_bp.cpp b/compute/tensor/src/cpu/general/pooling_bp.cpp
new file mode 100644
index 00000000..ac6a6ea4
--- /dev/null
+++ b/compute/tensor/src/cpu/general/pooling_bp.cpp
@@ -0,0 +1,111 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <math.h>  // restored; the header name was lost in this diff
+#include "error.h"
+#include "types.h"
+#include "cpu/general/tensor_computing_general.h"
+
+template <typename T>
+EE pooling_bp(T *input,
+    T *output,
+    U32 in,
+    U32 ic,
+    U32 ih,
+    U32 iw,
+    U32 strideH,
+    U32 strideW,
+    U32 paddingT,
+    U32 paddingL,
+    U32 kernelH,
+    U32 kernelW,
+    PoolingMode pm,
+    U32 oh,
+    U32 ow,
+    U32 alignSize)
+{
+    UNUSED(pm);
+    CHECK_REQUIREMENT(ic % alignSize == 0);
+    ic = ic / alignSize;
+
+    for (U32 n = 0; n < in; n++) {
+        for (U32 c = 0; c < ic; c++) {
+            for (U32 j = 0; j < alignSize; j++) {
+                for (I32 h = 0; h < (I32)ih; h++) {
+                    for (I32 w = 0; w < (I32)iw; w++) {
+                        int hstart = int(h * strideH - paddingT);
+                        int wstart = int(w * strideW - paddingL);
+                        int hend = hstart + kernelH;
+                        int wend = wstart + kernelW;
+                        hstart = (hstart < 0) ? 0 : hstart;
+                        wstart = (wstart < 0) ? 0 : wstart;
+                        hend = (hend > (int)oh) ? oh : hend;
+                        wend = (wend > (int)ow) ? ow : wend;
+                        float poolSize = (hend - hstart) * (wend - wstart);
+                        for (int x = hstart; x < hend; x++) {
+                            for (int y = wstart; y < wend; y++) {
+                                U32 in_off = ((((n * ic + c) * ih) + h) * iw + w) * alignSize + j;
+                                U32 out_off = ((((n * ic + c) * oh) + x) * ow + y) * alignSize + j;
+                                output[out_off] += input[in_off] / poolSize;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return SUCCESS;
+}
+
+EE pooling_bp_general(TensorDesc inputDesc,
+    const void *input,
+    PoolingParamSpec poolingParamSpec,
+    TensorDesc outputDesc,
+    void *output)
+{
+    if (nullptr == input || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType idt, odt;
+    DataFormat idf, odf;
+    U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0;
+    CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+    CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
+
+    if (in != on || ic != oc) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+    if (idf != DF_NCHWC8 || odf != idf) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+
+    U32 strideH = poolingParamSpec.stride_h;
+    U32 strideW = poolingParamSpec.stride_w;
+    U32 paddingT = poolingParamSpec.padding_top;
+    U32 paddingL = poolingParamSpec.padding_left;
+    U32 kernelSizeH = poolingParamSpec.kernel_h;
+    U32 kernelSizeW = poolingParamSpec.kernel_w;
+
+    EE ret = SUCCESS;
+    switch (idt) {
+#ifdef _USE_FP32
+        case DT_F32:
+            ret = pooling_bp<F32>((F32 *)input, (F32 *)output, in, ic, ih, iw, strideH, strideW,
+                paddingT, paddingL, kernelSizeH, kernelSizeW, poolingParamSpec.mode, oh, ow, 8);
+            break;
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+    }
+    return ret;
+}
\ No newline at end of file
diff --git a/compute/tensor/src/cpu/general/prelu.cpp b/compute/tensor/src/cpu/general/prelu.cpp
new file mode 100644
index 00000000..1f6cca19
--- /dev/null
+++ b/compute/tensor/src/cpu/general/prelu.cpp
@@ -0,0 +1,85 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "error.h"
+#include "types.h"
+#include "cpu/general/tensor_computing_general.h"
+
+template <typename T>
+static EE prelu(
+    T *input, T *output, T *weight, PReLUParamSpec preluDesc, U32 in, U32 ic, U32 ih, U32 iw)
+{
+    ic /= 8;
+    T slope;
+    for (U32 n = 0; n < in; n++) {
+        for (U32 c = 0; c < ic; c++) {
+            for (U32 hw = 0; hw < ih * iw; hw++) {
+                for (U32 c8 = 0; c8 < 8; c8++) {
+                    slope = preluDesc.propagate_down ? weight[0] : weight[c * 8 + c8];
+                    U32 off = n * ic * ih * iw * 8 + c * ih * iw * 8 + hw * 8 + c8;
+                    if (input[off] > 0) {
+                        output[off] = input[off];
+                    } else {
+                        output[off] = input[off] * slope;
+                    }
+                }
+            }
+        }
+    }
+    return SUCCESS;
+}
+
+EE prelu_general(TensorDesc inputDesc,
+    void *input,
+    void *weight,
+    PReLUParamSpec preluDesc,
+    TensorDesc outputDesc,
+    void *output)
+{
+    if (nullptr == input || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType idt, odt;
+    DataFormat idf, odf;
+    U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0;
+    if (tensorIs4d(inputDesc) && tensorIs4d(outputDesc)) {
+        CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+        CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
+    } else {
+        return NOT_SUPPORTED;
+    }
+    if (idf != DF_NCHWC8) {
+        return NOT_SUPPORTED;
+    }
+    CHECK_REQUIREMENT(in == on && ic == oc && ih == oh && iw == ow);
+    EE ret = SUCCESS;
+    switch (idt) {
+#ifdef _USE_FP32
+        case DT_F32: {
+            ret = prelu<F32>((F32 *)input, (F32 *)output, (F32 *)weight, preluDesc, in, ic, ih, iw);
+            break;
+        }
+#endif
+#ifdef _USE_FP16
+        case DT_F16: {
+            ret = prelu<F16>((F16 *)input, (F16 *)output, (F16 *)weight, preluDesc, in, ic, ih, iw);
+            break;
+        }
+#endif
+        default: {
+            ret = NOT_SUPPORTED;
+            break;
+        }
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/cpu/general/rnn.cpp b/compute/tensor/src/cpu/general/rnn.cpp
new file mode 100644
index 00000000..36e398c0
--- /dev/null
+++ b/compute/tensor/src/cpu/general/rnn.cpp
@@ -0,0 +1,202 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
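Aside (sketch, not part of the patch): the rnncell kernel in this file evaluates a fused LSTM step. One matrix-vector product produces the pre-activations of the i, g, f, o gates, then c' = f*c + i*g and h = o*tanh(c'), with the forget bias added before the sigmoid. A minimal scalar restatement of that gate math, with illustrative constants only:

#include <cmath>
#include <cstdio>

static float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

// One LSTM element update, matching the per-column loop in rnncell below.
static void lstmStep(
    float preI, float preG, float preF, float preO, float forgetBias, float &c, float &h)
{
    float i = sigmoid(preI);
    float g = std::tanh(preG);
    float f = sigmoid(preF + forgetBias);  // forget bias is added pre-sigmoid
    float o = sigmoid(preO);
    c = c * f + i * g;
    h = o * std::tanh(c);
}

int main()
{
    float c = 0.5f, h = 0.0f;
    lstmStep(0.1f, -0.2f, 0.3f, 0.4f, 1.0f, c, h);
    std::printf("c=%f h=%f\n", c, h);
    return 0;
}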
+
+#include <math.h>    // restored: exp/tanh (the header names were lost in this diff)
+#include <string.h>  // restored: memcpy/memset
+
+#include "cpu/general/tensor_computing_general.h"
+#include "cpu/general/general_functions.h"
+
+template <typename T>
+static void mvm_nkn32_template(U32 fn, U32 fk, const T *filterArray, T *input, T *output)
+{
+    for (U32 i = 0; i < fn; i++) {
+        for (U32 j = 0; j < 32; j++) {
+            U32 n = i * 32 + j;
+            F32 value = 0;
+            for (U32 k = 0; k < fk; k++) {
+                value += input[k] * filterArray[(i * fk + k) * 32 + j];
+            }
+            output[n] += value;
+        }
+    }
+}
+
+template <typename T>
+static EE rnncell(TensorDesc xDesc,
+    const void *currentX,
+    const TensorDesc *filterDesc,
+    const void **filter,
+    const TensorDesc *biasDesc,
+    const void **bias,
+    void *state,
+    U32 tmpBytes,
+    void *tmp,
+    RNNParamSpec rnnParamSpec,
+    U32 batchStrideX,
+    U32 batchStrideH,
+    TensorDesc hDesc,
+    void *output)
+{
+    UNUSED(biasDesc);
+    UNUSED(tmpBytes);
+    if (nullptr == currentX || nullptr == filter || nullptr == bias || nullptr == state ||
+        nullptr == tmp || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+
+    DataType idt, fdt, odt;
+    DataFormat idf, fdf, odf;
+    U32 in, ix;
+    U32 on, oh;
+    U32 fk, fn;
+    CHECK_STATUS(tensor2dGet(xDesc, &idt, &idf, &in, &ix));
+    CHECK_STATUS(tensor2dGet(filterDesc[0], &fdt, &fdf, &fn, &fk));
+    CHECK_STATUS(tensor2dGet(hDesc, &odt, &odf, &on, &oh));
+    if (fdf != DF_NKN32) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+
+    U32 batch = in;
+    U32 xDim = ix;
+    U32 hDim = rnnParamSpec.numOutput;
+    I32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection
+                                                  : rnnParamSpec.numOutput;
+    F32 forgetBias = rnnParamSpec.forgetBias;
+    ActivationMode activationMode = rnnParamSpec.activationMode;
+    if (activationMode != ACTIVATION_TANH) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+
+    if (!(idt == fdt && idt == odt)) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+
+    const T *currentXArray = (const T *)currentX;
+    T *lastStateArray = (T *)state;
+    T *lastHArray = lastStateArray + column;
+    T *tmpArray = (T *)tmp;
+    T *currentStateArray = (T *)state;
+    T *currentHArray = currentStateArray + column;
+    T *outputArray = (T *)output;
+    T *xhArray = tmpArray;
+    T *intermediateH = xhArray + (xDim + hDim);
+    U32 lastStateStride = column + hDim;
+    U32 lastHStride = column + hDim;
+    U32 currentStateStride = column + hDim;
+    U32 currentHStride = column + hDim;
+    for (U32 m = 0; m < batch; m++) {
+        T *lastBatchH = lastHArray + m * lastHStride;
+        memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(T));
+        memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(T));
+
+        // MVM
+        memcpy(intermediateH, bias[0], column * 4 * sizeof(T));
+        mvm_nkn32_template(fn / 32, fk, (const T *)filter[0], xhArray, intermediateH);
+
+        T *out_i = intermediateH;
+        T *out_g = out_i + column;
+        T *out_f = out_i + column * 2;
+        T *out_o = out_i + column * 3;
+        T *lastBatchState = lastStateArray + m * lastStateStride;
+        T *currentBatchState = currentStateArray + m * currentStateStride;
+        T *currentBatchH = currentHArray + m * currentHStride;
+        T *currentOutput = outputArray + m * batchStrideH;
+        T *tmpState, *tmpHH, *tmpH;
+        if (rnnParamSpec.zoneoutCell == 0) {
+            tmpState = currentBatchState;
+        } else {
+            tmpState = out_i;
+        }
+        if (rnnParamSpec.numProjection > 0) {
+            tmpHH = out_g;
+            tmpH = currentOutput;
+        } else {
+            tmpHH = currentOutput;
+            tmpH = out_g;
+        }
+
+        for (I32 h = 0; h < column; h++) {
+            F32 C_s = lastBatchState[h];
+            F32 I_s = 1.0 / (1.0 + exp(-out_i[h]));
+            F32 G_s = tanh(out_g[h]);
+            F32 F_s = 1.0 / (1.0 + exp(-(out_f[h] + forgetBias)));
+            F32 O_s = 1.0 / (1.0 + exp(-out_o[h]));
+            C_s = C_s * F_s + I_s * G_s;
+            F32 value = O_s * tanh(C_s);
+            tmpState[h] = C_s;
+            tmpHH[h] = value;
+        }
+
+        if (rnnParamSpec.zoneoutCell != 0) {
+            array_scale_template(tmpState, tmpState, column, 1 - rnnParamSpec.zoneoutCell, 0);
+            array_scale_template(
+                lastBatchState, lastBatchState, column, rnnParamSpec.zoneoutCell, 0);
+            array_add_template(tmpState, lastBatchState, currentBatchState, column);
+        }
+
+        if (rnnParamSpec.numProjection > 0) {
+            memset(tmpH, 0, sizeof(T) * hDim);
+            mvm_nkn32_template(
+                hDim / 32, rnnParamSpec.numProjection, (const T *)filter[1], tmpHH, tmpH);
+        }
+        if (rnnParamSpec.zoneoutOutput != 0) {
+            if (rnnParamSpec.numProjection > 0) {
+                array_scale_template(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0);
+            } else {
+                array_scale_template(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0);
+            }
+            array_scale_template(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneoutOutput, 0);
+            array_add_template(out_f, lastBatchH, currentBatchH, hDim);
+        } else {
+            memcpy(currentBatchH, currentOutput, sizeof(T) * hDim);
+        }
+    }
+    return SUCCESS;
+}
+
+EE rnncell_general(TensorDesc xDesc,
+    const void *currentX,
+    const TensorDesc *filterDesc,
+    const void **filter,
+    const TensorDesc *biasDesc,
+    const void **bias,
+    void *state,
+    U32 tmpBytes,
+    void *tmp,
+    RNNParamSpec rnnParamSpec,
+    U32 batchStrideX,
+    U32 batchStrideH,
+    TensorDesc hDesc,
+    void *output)
+{
+    EE ret = SUCCESS;
+    switch (xDesc.dt) {
+#ifdef _USE_FP16
+        case DT_F16:
+            ret = rnncell<F16>(xDesc, currentX, filterDesc, filter, biasDesc, bias, state,
+                tmpBytes, tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, output);
+            break;
+#endif
+#ifdef _USE_FP32
+        case DT_F32:
+            ret = rnncell<F32>(xDesc, currentX, filterDesc, filter, biasDesc, bias, state,
+                tmpBytes, tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, output);
+            break;
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/cpu/general/scale.cpp b/compute/tensor/src/cpu/general/scale.cpp
new file mode 100644
index 00000000..b8f7ddd4
--- /dev/null
+++ b/compute/tensor/src/cpu/general/scale.cpp
@@ -0,0 +1,120 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
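Aside (sketch, not part of the patch): scale.cpp below applies y = alpha[c] * x + beta[c] per channel, walking the buffer in NCHW(C8) order when the scale axis is 0/1 and in NHWC order when it is the last dimension. A tiny NCHW restatement with made-up shapes:

#include <cstdio>

// Per-channel affine transform over an NCHW buffer (align_size folded away).
static void scaleNCHW(
    const float *x, const float *alpha, const float *beta, int n, int c, int hw, float *y)
{
    for (int b = 0; b < n; b++)
        for (int ch = 0; ch < c; ch++)
            for (int i = 0; i < hw; i++) {
                int idx = (b * c + ch) * hw + i;
                y[idx] = alpha[ch] * x[idx] + beta[ch];
            }
}

int main()
{
    float x[4] = {1, 2, 3, 4}, a[2] = {2, 0.5f}, bt[2] = {0, 1}, y[4];
    scaleNCHW(x, a, bt, 1, 2, 2, y);  // y = {2, 4, 2.5, 3}
    std::printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]);
    return 0;
}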
+ +#include "cpu/general/tensor_computing_general.h" + +template +static EE scale_nchw( + T *input, T *alpha, T *beta, U32 in, U32 ic, U32 elements_per_channel, U32 align_size, T *output) +{ + ic = ic / align_size; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 i = 0; i < elements_per_channel; i++) { + for (U32 k = 0; k < align_size; k++) { + T alphaValue = (nullptr == alpha) ? 1 : alpha[c * align_size + k]; + T betaValue = (nullptr == beta) ? 0 : beta[c * align_size + k]; + U32 index = ((n * ic + c) * elements_per_channel + i) * align_size + k; + output[index] = alphaValue * input[index] + betaValue; + } + } + } + } + return SUCCESS; +} + +template +static EE scale_nhwc( + T *input, T *alpha, T *beta, U32 in, U32 ic, U32 elements_per_channel, T *output) +{ + for (U32 n = 0; n < in; n++) { + for (U32 i = 0; i < elements_per_channel; i++) { + for (U32 c = 0; c < ic; c++) { + T alphaValue = (nullptr == alpha) ? 1 : alpha[c]; + T betaValue = (nullptr == beta) ? 0 : beta[c]; + U32 index = ((n * elements_per_channel) + i) * ic + c; + output[index] = alphaValue * input[index] + betaValue; + } + } + } + return SUCCESS; +} + +template +static EE scale(T *input, + I32 axis, + I32 nDims, + T *alpha, + T *beta, + U32 in, + U32 ic, + U32 elements_per_channel, + U32 align_size, + T *output) +{ + EE ret = SUCCESS; + if (axis == 1 || axis == 0 || ic == 1) { + ret = scale_nchw(input, alpha, beta, in, ic, elements_per_channel, align_size, output); + } else if (axis == nDims - 1) { + ret = scale_nhwc(input, alpha, beta, in, ic, elements_per_channel, output); + } else { + ret = NOT_SUPPORTED; + } + return ret; +} + +EE scale_general(TensorDesc inputDesc, + void *input, + void *alpha, + void *beta, + ScaleParamSpec p, + TensorDesc outputDesc, + void *output) +{ + UNUSED(outputDesc); + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + U32 length = tensorNumElements(inputDesc); + int axis = (p.axis + inputDesc.nDims) % inputDesc.nDims; + I32 in = inputDesc.dims[inputDesc.nDims - 1]; + I32 ic = inputDesc.dims[inputDesc.nDims - 1 - axis]; + I32 elements_per_channel = length / (in * ic); + I32 align_size = 1; + if (inputDesc.df == DF_NCHWC8) { + align_size = 8; + } + EE ret = SUCCESS; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = scale((F32 *)input, axis, inputDesc.nDims, (F32 *)alpha, (F32 *)beta, in, ic, + elements_per_channel, align_size, (F32 *)output); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = scale((F16 *)input, axis, inputDesc.nDims, (F16 *)alpha, (F16 *)beta, in, ic, + elements_per_channel, align_size, (F16 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + + return ret; +} diff --git a/compute/tensor/src/cpu/general/softmax.cpp b/compute/tensor/src/cpu/general/softmax.cpp new file mode 100644 index 00000000..2992c9c5 --- /dev/null +++ b/compute/tensor/src/cpu/general/softmax.cpp @@ -0,0 +1,112 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <math.h>  // restored: exp (the header name was lost in this diff)
+#include "cpu/general/tensor_computing_general.h"
+
+template <typename T>
+static F32 array_max(const T *input, U32 len, U32 stride)
+{
+    F32 tmp = input[0];
+    for (U32 i = 1; i < len; i++) {
+        if (input[i * stride] > tmp) {
+            tmp = input[i * stride];
+        }
+    }
+    return tmp;
+}
+
+template <typename T>
+static EE softmax(TensorDesc inputDesc, const T *input, int axis, TensorDesc outputDesc, T *output)
+{
+    UNUSED(outputDesc);
+    if (nullptr == input || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+
+    U32 size = tensorNumElements(inputDesc);
+    axis = (axis + inputDesc.nDims) % inputDesc.nDims;
+    axis = inputDesc.nDims - 1 - axis;
+    std::vector<T> buffer;
+    if (inputDesc.df == DF_NCHWC8) {
+        if (axis == 2) {
+            if (inputDesc.dims[0] != 1 || inputDesc.dims[1] != 1) {
+                buffer = std::vector<T>(size);
+                TensorDesc tmpInputDesc = inputDesc;
+                tmpInputDesc.df = DF_NCHW;
+                transformToNCHW(inputDesc, input, tmpInputDesc, buffer.data());
+                input = (const T *)(buffer.data());
+            }
+        } else {
+            for (I32 i = (int)inputDesc.nDims; i > 0; i--) {
+                inputDesc.dims[i] = inputDesc.dims[i - 1];
+            }
+            inputDesc.dims[inputDesc.nDims - 1] /= 8;
+            inputDesc.dims[0] = 8;
+            inputDesc.nDims += 1;
+            axis += 1;
+        }
+    }
+    U32 loops = inputDesc.dims[axis];
+
+    U32 loop_inner = 1;
+    for (int i = 0; i < axis; i++) {
+        loop_inner *= inputDesc.dims[i];
+    }
+    U32 loop_outer = size / loops / loop_inner;
+
+    for (U32 i = 0; i < loop_outer; i++) {
+        for (U32 j = 0; j < loop_inner; j++) {
+            const T *in = input + i * loops * loop_inner + j;
+            T *out = output + i * loops * loop_inner + j;
+            F32 max_value = array_max(in, loops, loop_inner);
+            F32 sum = 0;
+            for (U32 k = 0; k < loops; k++) {
+                F32 tmp = exp(in[k * loop_inner] - max_value);
+                sum += tmp;
+                out[k * loop_inner] = tmp;
+            }
+            sum = 1 / sum;
+            for (U32 k = 0; k < loops; k++) {
+                out[k * loop_inner] *= sum;
+            }
+        }
+    }
+    return SUCCESS;
+}
+
+EE softmax_general(
+    TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output)
+{
+    DataType idt = inputDesc.dt;
+    EE ret = SUCCESS;
+    switch (idt) {
+#ifdef _USE_FP16
+        case DT_F16: {
+            ret = softmax<F16>(inputDesc, (const F16 *)input, p.axis, outputDesc, (F16 *)output);
+            break;
+        }
+#endif
+#ifdef _USE_FP32
+        case DT_F32: {
+            ret = softmax<F32>(inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output);
+            break;
+        }
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+
+    return ret;
+}
diff --git a/compute/tensor/src/cpu/general/tensor_computing_general.h b/compute/tensor/src/cpu/general/tensor_computing_general.h
new file mode 100644
index 00000000..fda8c48b
--- /dev/null
+++ b/compute/tensor/src/cpu/general/tensor_computing_general.h
@@ -0,0 +1,165 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_TENSOR_COMPUTING_GENERAL
+#define _H_TENSOR_COMPUTING_GENERAL
+
+#include <vector>  // restored: std::vector below (the header name was lost in this diff)
+
+#include "error.h"
+#include "sys.h"
+#include "types.h"
+
+EE convolution_general(TensorDesc inputDesc,
+    void *input,
+    TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc scaleDesc,
+    const void *scale,
+    TensorDesc biasDesc,
+    const void *bias,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec activationDesc);
+
+EE deconvolution_general(TensorDesc inputDesc,
+    void *input,
+    TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc scaleDesc,
+    const void *scale,
+    TensorDesc biasDesc,
+    const void *bias,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec activationDesc);
+
+EE depthwise_pointwise_convolution_infer_forward_tmp_bytes_general(TensorDesc inputDesc,
+    TensorDesc dwFilterDesc,
+    TensorDesc pwFilterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    U32 *bytes);
+
+EE depthwise_pointwise_convolution_general(TensorDesc inputDesc,
+    void *input,
+    TensorDesc dwFilterDesc,
+    const void *dwFilter,
+    TensorDesc pwFilterDesc,
+    const void *pwFilter,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc dwBiasDesc,
+    const void *dwBias,
+    TensorDesc pwBiasDesc,
+    const void *pwBias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec depthwiseActivationParamSpec,
+    ActivationParamSpec pointwiseActivationParamSpec);
+
+EE depthwise_convolution_general(TensorDesc inputDesc,
+    void *input,
+    TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const void *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec depthwiseActivationParamSpec);
+
+EE pooling_general(TensorDesc inputDesc,
+    const void *input,
+    PoolingParamSpec poolingParamSpec,
+    TensorDesc outputDesc,
+    void *output);
+
+EE pooling_bp_general(TensorDesc inputDesc,
+    const void *input,
+    PoolingParamSpec poolingParamSpec,
+    TensorDesc outputDesc,
+    void *output);
+
+EE attention_general(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output);
+
+EE clip_general(
+    TensorDesc inputDesc, void *input, ClipParamSpec p, TensorDesc outputDesc, void *output);
+
+EE eltwise_general(DataType dataType,
+    std::vector<void *> input,
+    std::vector<U32> inputSize,
+    U32 num,
+    U32 len,
+    void *output,
+    EltwiseMode eltwiseMode);
+
+EE rnncell_general(TensorDesc xDesc,
+    const void *currentX,
+    const TensorDesc *filterDesc,
+    const void **filter,
+    const TensorDesc *biasDesc,
+    const void **bias,
+    void *state,
+    U32 tmpBytes,
+    void *tmp,
+    RNNParamSpec rnnParamSpec,
+    U32 batchStrideX,
+    U32 batchStrideH,
+    TensorDesc hDesc,
+    void *currentH);
+
+EE transpose_general(
+    TensorDesc inputDesc, const void *input, U32 *dim, TensorDesc outputDesc, void *output);
+
+EE scale_general(TensorDesc inputDesc,
+    void *input,
+    void *alpha,
+    void *beta,
+    ScaleParamSpec p,
+    TensorDesc outputDesc,
+    void *output);
+
+EE softmax_general(
+    TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output);
+
+EE check_general(TensorDesc inputDescA,
+    const void *inputA,
+    TensorDesc inputDescB,
+    const void *inputB,
+    CheckParamSpec p,
+    TensorDesc outputDesc,
+    void *output);
+
+EE layer_normalization_general(
+    TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output);
+
+EE attention_mask_general(TensorDesc inputDesc,
+    const void *input,
+    AttentionMaskParamSpec p,
+    TensorDesc outputDesc,
+    void *output);
+
+EE prelu_general(TensorDesc inputDesc,
+    void *input,
+    void *weight,
+    PReLUParamSpec preluDesc,
+    TensorDesc outputDesc,
+    void *output);
+#endif
diff --git a/compute/tensor/src/cpu/general/transpose.cpp b/compute/tensor/src/cpu/general/transpose.cpp
new file mode 100644
index 00000000..3ab8115c
--- /dev/null
+++ b/compute/tensor/src/cpu/general/transpose.cpp
@@ -0,0 +1,51 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
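Aside (sketch, not part of the patch): transpose_general in the file below maps every output element back to its input offset by decomposing the flat output index digit by digit against the output dims, scattering those digits into input coordinates via the permutation, then re-flattening. A condensed restatement of that index arithmetic; note this toy stores dims outermost-first, unlike the reversed dims[] layout in the kernel:

#include <cstdio>
#include <vector>

// Flat-index remapping for an arbitrary permutation; dims are outermost-first here.
static void transposeToy(const float *in, const std::vector<int> &inDims,
    const std::vector<int> &perm, float *out)
{
    int nd = (int)inDims.size(), total = 1;
    std::vector<int> outDims(nd);
    for (int i = 0; i < nd; i++) {
        outDims[i] = inDims[perm[i]];
        total *= inDims[i];
    }
    for (int o = 0; o < total; o++) {
        std::vector<int> coord(nd);  // input coordinates recovered from the output index
        int rem = o;
        for (int i = nd - 1; i >= 0; i--) {
            coord[perm[i]] = rem % outDims[i];
            rem /= outDims[i];
        }
        int iIdx = 0;
        for (int i = 0; i < nd; i++) {
            iIdx = iIdx * inDims[i] + coord[i];
        }
        out[o] = in[iIdx];
    }
}

int main()
{
    float in[6] = {0, 1, 2, 3, 4, 5}, out[6];  // 2x3 -> 3x2
    transposeToy(in, {2, 3}, {1, 0}, out);     // out = {0, 3, 1, 4, 2, 5}
    std::printf("%g %g %g %g %g %g\n", out[0], out[1], out[2], out[3], out[4], out[5]);
    return 0;
}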
+
+#include <string.h>  // restored: memcpy (the header name was lost in this diff)
+
+#include "cpu/general/tensor_computing_general.h"
+
+EE transpose_general(
+    TensorDesc inputDesc, const void *input, U32 *dim, TensorDesc outputDesc, void *output)
+{
+    if (nullptr == input || nullptr == output || nullptr == dim) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+
+    U32 inputDim = inputDesc.nDims;
+    U32 outputDim = outputDesc.nDims;
+    CHECK_REQUIREMENT(inputDim == outputDim);
+
+    U32 outputSize = tensorNumElements(outputDesc);
+    std::vector<U32> inputLocalIndex(inputDim);
+    U8 *input_ptr = (U8 *)input;
+    U8 *output_ptr = (U8 *)output;
+    for (U32 i = 0; i < outputSize; i++) {
+        U32 outputIndex = i;
+        for (U32 j = 0; j < outputDim; j++) {
+            U32 value = outputIndex % outputDesc.dims[j];
+            outputIndex /= outputDesc.dims[j];
+            inputLocalIndex[inputDim - 1 - dim[outputDim - 1 - j]] = value;
+        }
+        U32 inputIndex = 0;
+        for (U32 j = inputDim - 1; j > 0; j--) {
+            inputIndex = (inputIndex + inputLocalIndex[j]) * inputDesc.dims[j - 1];
+        }
+        inputIndex += inputLocalIndex[0];
+        memcpy(output_ptr + i * bytesOf(outputDesc.dt),
+            input_ptr + inputIndex * bytesOf(inputDesc.dt), bytesOf(inputDesc.dt));
+    }
+
+    return SUCCESS;
+}
diff --git a/compute/tensor/src/cpu/l2normalization.cpp b/compute/tensor/src/cpu/l2normalization.cpp
new file mode 100644
index 00000000..032bfbb3
--- /dev/null
+++ b/compute/tensor/src/cpu/l2normalization.cpp
@@ -0,0 +1,57 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
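Aside (sketch, not part of the patch): l2normalization_cpu below computes x / sqrt(sum(x^2)) per row, reusing the variance primitive with mean 0, since var(x, mean=0) * len equals the sum of squares. A scalar restatement:

#include <cmath>
#include <cstdio>

// Row-wise L2 normalization: y = x / sqrt(sum(x^2)).
static void l2normRow(const float *x, int len, float *y)
{
    float ss = 0;
    for (int i = 0; i < len; i++) {
        ss += x[i] * x[i];
    }
    float inv = 1.0f / std::sqrt(ss);
    for (int i = 0; i < len; i++) {
        y[i] = x[i] * inv;
    }
}

int main()
{
    float x[2] = {3, 4}, y[2];
    l2normRow(x, 2, y);  // {0.6, 0.8}
    std::printf("%g %g\n", y[0], y[1]);
    return 0;
}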
+ +#include "cpu/tensor_computing_cpu.h" +#include "cpu/cpu_functions.h" + +EE l2normalization_cpu( + TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output, Arch arch) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + ArrayVarFunction var_func = get_array_var_function(arch); + ArrayScaleFunction scale_func = get_array_scale_function(arch); + DataType idt, odt; + DataFormat idf, odf; + U32 ic = 0, ih = 0, iw = 0, oh = 0, ow = 0; + if (tensorIs2d(inputDesc)) { + CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &ih, &iw)); + ic = 1; + CHECK_STATUS(tensor2dGet(outputDesc, &odt, &odf, &oh, &ow)); + } else if (tensorIs3d(inputDesc)) { + U32 oc = 0; + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &ic, &ih, &iw)); + CHECK_STATUS(tensor3dGet(outputDesc, &odt, &odf, &oc, &oh, &ow)); + CHECK_REQUIREMENT(ic == oc); + } else if (tensorIs4d(inputDesc)) { + idt = inputDesc.dt; + ic = inputDesc.dims[0]; + ih = inputDesc.dims[1]; + iw = inputDesc.dims[2]; + } else { + CHECK_STATUS(NOT_MATCH); + } + + // l2norm -> x / sqrt(sum(x^2)) + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < ih; h++) { + U32 index_off = (c * ih + h) * iw * bytesOf(idt); + const U8 *input_ptr = (const U8 *)input + index_off; + U8 *output_ptr = (U8 *)output + index_off; + F32 sum_row = var_func(idt, input_ptr, (I32)iw, 0.f) * static_cast(iw); + scale_func(idt, input_ptr, output_ptr, iw, 1.0 / sqrt(sum_row), 0); + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/non_max_suppression.cpp b/compute/tensor/src/cpu/non_max_suppression.cpp new file mode 100644 index 00000000..23118306 --- /dev/null +++ b/compute/tensor/src/cpu/non_max_suppression.cpp @@ -0,0 +1,222 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/tensor_computing_cpu.h" + +inline EE qsort_descent(std::vector &boxes, + std::vector &boxindex, + std::vector &scores, + int left, + int right) +{ + if (boxes.empty() || scores.empty()) { + return NOT_SUPPORTED; + } + + int i = left; + int j = right; + F32 temp = scores[(left + right) / 2]; + + while (i <= j) { + while (scores[i] > temp) { + i++; + } + while (scores[j] < temp) { + j--; + } + if (i <= j) { + std::swap(boxes[i], boxes[j]); + std::swap(scores[i], scores[j]); + std::swap(boxindex[i], boxindex[j]); + i++; + j--; + } + } + + if (left < j) { + qsort_descent(boxes, boxindex, scores, left, j); + } + if (i < right) { + qsort_descent(boxes, boxindex, scores, i, right); + } + + return SUCCESS; +} + +inline F32 intersectionarea(BoxRect a, BoxRect b) +{ + if (a.xmin > b.xmax || a.xmax < b.xmin || a.ymin > b.ymax || a.ymax < b.ymin) { + return 0.f; + } + F32 inter_width = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin); + F32 inter_height = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin); + + return inter_width * inter_height; +} + +inline EE nms_pickedboxes(std::vector boxes, std::vector &picked, F32 nms_threshold) +{ + I64 n = boxes.size(); + + std::vector areas(n); + for (I64 i = 0; i < n; i++) { + BoxRect box = boxes[i]; + + F32 width = box.xmax - box.xmin; + F32 height = box.ymax - box.ymin; + + areas[i] = width * height; + } + for (I64 i = 0; i < n; i++) { + BoxRect a = boxes[i]; + int keep = 1; + for (int j = 0; j < (int)picked.size(); j++) { + BoxRect b = boxes[picked[j]]; + F32 inter_area = intersectionarea(a, b); + F32 union_area = areas[i] + areas[picked[j]] - inter_area; + + if (inter_area / union_area > nms_threshold) { + keep = 0; + } + } + if (keep) { + picked.push_back(i); + } + } + return SUCCESS; +} + +template +EE non_max_suppression_kernel(std::vector input, + T *output, + U32 spatial_dim, + U32 num_class, + U32 max_output_boxes_per_class, + F32 iou_threshold, + F32 score_threshold) +{ + T *box = (T *)input[0]; + T *score = (T *)input[1]; + // decode box + std::vector> boxes; + boxes.resize(spatial_dim); + for (U32 i = 0; i < spatial_dim; i++) { + F32 ymin = std::min(box[i * 4], box[i * 4 + 2]); + F32 xmin = std::min(box[i * 4 + 1], box[i * 4 + 3]); + F32 ymax = std::max(box[i * 4], box[i * 4 + 2]); + F32 xmax = std::max(box[i * 4 + 1], box[i * 4 + 3]); + std::vector box_pixel; + box_pixel.resize(4); + box_pixel[0] = xmin; + box_pixel[1] = ymin; + box_pixel[2] = xmax; + box_pixel[3] = ymax; + boxes[i].assign(box_pixel.begin(), box_pixel.end()); + } + + std::vector all_boxinfo; + for (U32 i = 0; i < num_class; i++) { + std::vector class_boxrects; + std::vector class_boxscores; + std::vector class_boxindex; + for (U32 j = 0; j < spatial_dim; j++) { + F32 score_pixel = score[i * spatial_dim + j]; + if (score_pixel > score_threshold) { + std::vector inbox; + inbox.assign(boxes[j].begin(), boxes[j].end()); + BoxRect b = {inbox[0], inbox[1], inbox[2], inbox[3], i}; + class_boxrects.push_back(b); + class_boxindex.push_back(j); + class_boxscores.push_back(score_pixel); + } + } + // sort boxes and box index + qsort_descent(class_boxrects, class_boxindex, class_boxscores, 0, + static_cast(class_boxscores.size() - 1)); + std::vector picked; + // apply nms + nms_pickedboxes(class_boxrects, picked, iou_threshold); + std::vector boxindex; + for (I64 p = 0; p < (I64)picked.size(); p++) { + I64 picked_box = picked[p]; + boxindex.push_back(class_boxindex[picked_box]); + } + if (max_output_boxes_per_class < (U32)boxindex.size()) { + 
+            boxindex.resize(max_output_boxes_per_class);
+        }
+        for (I64 j = 0; j < (I64)boxindex.size(); j++) {
+            BoxInfo bi;
+            bi.box_index = boxindex[j];
+            bi.label = i;
+            all_boxinfo.push_back(bi);
+        }
+    }
+    U32 num_detected = all_boxinfo.size();
+    // the first box contains the number of available boxes in the first element.
+    output[0] = num_detected;
+    output[1] = output[2] = 0;
+    for (U32 i = 0; i < num_detected; i++) {
+        BoxInfo bi = all_boxinfo[i];
+        // batch_index = 0
+        output[(i + 1) * 3] = 0;
+        // class_index
+        output[(i + 1) * 3 + 1] = bi.label;
+        // box_index
+        output[(i + 1) * 3 + 2] = bi.box_index;
+    }
+    return SUCCESS;
+}
+
+EE non_max_suppression_cpu(std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    NonMaxSuppressionParamSpec nonMaxSuppressionParamSpec,
+    TensorDesc outputDesc,
+    void *output)
+{
+    UNUSED(outputDesc);
+    if (nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType idt0, idt1;
+    DataFormat idf0, idf1;
+    U32 in0, ic0, ilens1;
+    U32 in1, ic1, ilens2;
+    // boxes
+    CHECK_STATUS(tensor3dGet(inputDesc[0], &idt0, &idf0, &in0, &ic0, &ilens1));
+    // scores
+    CHECK_STATUS(tensor3dGet(inputDesc[1], &idt1, &idf1, &in1, &ic1, &ilens2));
+    U32 spatial_dim = ic0;
+    U32 num_class = ic1;
+    CHECK_REQUIREMENT(spatial_dim == ilens2);
+    U32 max_output_boxes_per_class = nonMaxSuppressionParamSpec.max_output_boxes_per_class;
+    F32 iou_threshold = nonMaxSuppressionParamSpec.iou_threshold;
+    F32 score_threshold = nonMaxSuppressionParamSpec.score_threshold;
+    EE ret = SUCCESS;
+    switch (idt0) {
+#ifdef _USE_FP32
+        case DT_F32:
+            ret = non_max_suppression_kernel<F32>(input, (F32 *)output, spatial_dim, num_class,
+                max_output_boxes_per_class, iou_threshold, score_threshold);
+            break;
+#endif
+#ifdef _USE_FP16
+        case DT_F16:
+            ret = non_max_suppression_kernel<F16>(input, (F16 *)output, spatial_dim, num_class,
+                max_output_boxes_per_class, iou_threshold, score_threshold);
+            break;
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/cpu/padding.cpp b/compute/tensor/src/cpu/padding.cpp
new file mode 100644
index 00000000..a2f449b0
--- /dev/null
+++ b/compute/tensor/src/cpu/padding.cpp
@@ -0,0 +1,163 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
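Aside (sketch, not part of the patch): padding_cpu below supports four modes. On a 1-D row [a b c] with left = right = 2 they behave as: Constant -> 0 0 a b c 0 0; Edge -> a a a b c c c; Reflect -> c b a b c b a; Symmetric -> b a a b c c b. A restatement of the left-pad source-index choice used in the kernel:

#include <cstdio>

enum PadMode { Constant, Edge, Reflect, Symmetric };

// Source column for left-pad position w (0 = outermost pad), mirroring padding_cpu.
static int leftPadSrc(PadMode m, int left, int w)
{
    switch (m) {
        case Reflect:
            return left - w;      // mirrors without repeating the border element
        case Symmetric:
            return left - w - 1;  // mirrors including the border element
        default:
            return 0;             // Constant writes zeros; Edge copies column 0
    }
}

int main()
{
    for (int w = 0; w < 2; w++) {
        std::printf("reflect:%d symmetric:%d\n",
            leftPadSrc(Reflect, 2, w), leftPadSrc(Symmetric, 2, w));  // 2,1 and 1,0
    }
    return 0;
}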
+ +#include "types.h" +#include "cpu/tensor_computing_cpu.h" +#include + +EE padding_infer_output_size_cpu( + TensorDesc inputDesc, PadParamSpec padParamSpec, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt = DT_F32; + DataFormat idf = DF_NCHW; + U32 in = 0, ic = 0, ih = 0, iw = 0; + if (tensorIs3d(inputDesc)) { + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ic, &ih)); + iw = 1; + } else if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + } else { + return NOT_SUPPORTED; + } + int out_n = in; + int out_c = ic; + int out_h = ih + padParamSpec.top + padParamSpec.bottom; + int out_w = iw + padParamSpec.left + padParamSpec.right; + if (tensorIs3d(inputDesc)) { + *outputDesc = tensor3df(idt, idf, out_n, out_c, out_h); + } else if (tensorIs4d(inputDesc)) { + *outputDesc = tensor4df(idt, idf, out_n, out_c, out_h, out_w); + } + return SUCCESS; +} + +EE padding_cpu(TensorDesc inputDesc, + const void *input, + PadParamSpec padParamSpec, + TensorDesc outputDesc, + void *output) +{ + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + if (tensorIs3d(inputDesc)) { + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ic, &ih)); + CHECK_STATUS(tensor3dGet(outputDesc, &odt, &odf, &on, &oc, &oh)); + iw = ow = 1; + } else if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + } else { + return NOT_SUPPORTED; + } + CHECK_REQUIREMENT(in == on); + CHECK_REQUIREMENT(ic == oc); + U32 alignSize = 1; + if (idf == DF_NCHWC8) { + alignSize = 8; + } + ic /= alignSize; + oc /= alignSize; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < ih; h++) { + const U8 *inPtr = + (const U8 *)input + (((n * ic + c) * ih + h) * iw) * alignSize * bytesOf(idt); + U8 *outPtr = (U8 *)output + + (((n * oc + c) * oh + (padParamSpec.top + h)) * ow) * alignSize * bytesOf(odt); + if (padParamSpec.pad_mode == Pad_Constant) { + memset(outPtr, 0, padParamSpec.left * alignSize * bytesOf(odt)); + outPtr += padParamSpec.left * alignSize * bytesOf(odt); + memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + outPtr += iw * alignSize * bytesOf(odt); + memset(outPtr, 0, padParamSpec.right * alignSize * bytesOf(odt)); + } else { + for (U32 w = 0; w < padParamSpec.left; w++) { + U32 index = 0; + if (padParamSpec.pad_mode == Pad_Reflect) { + index = (padParamSpec.left - w) * alignSize * bytesOf(idt); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + index = (padParamSpec.left - w - 1) * alignSize * bytesOf(idt); + } + memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + outPtr += alignSize * bytesOf(idt); + } + memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + outPtr += iw * alignSize * bytesOf(odt); + for (U32 w = 0; w < padParamSpec.right; w++) { + U32 index = (iw - 1) * alignSize * bytesOf(idt); + if (padParamSpec.pad_mode == Pad_Reflect) { + index = (iw - w - 2) * alignSize * bytesOf(idt); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + index = (iw - w - 1) * alignSize * bytesOf(idt); + } + memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + outPtr += alignSize * bytesOf(idt); + } + } + } + U8 *outPtr = (U8 *)output + (((n * oc + c) * oh) * ow) * alignSize * bytesOf(odt); + for (U32 h = 0; h < padParamSpec.top; h++) { + U32 index = h * ow * alignSize * bytesOf(odt); + if 
(padParamSpec.pad_mode == Pad_Constant) { + memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Edge) { + memcpy(outPtr + index, + outPtr + (padParamSpec.top * ow * alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Reflect) { + memcpy(outPtr + index, + outPtr + + ((padParamSpec.top + padParamSpec.top - h) * ow * alignSize * + bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + memcpy(outPtr + index, + outPtr + + ((padParamSpec.top + padParamSpec.top - h - 1) * ow * alignSize * + bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else { + return NOT_SUPPORTED; + } + } + for (U32 h = 0; h < padParamSpec.bottom; h++) { + U32 index = (padParamSpec.top + ih + h) * ow * alignSize * bytesOf(odt); + if (padParamSpec.pad_mode == Pad_Constant) { + memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Edge) { + memcpy(outPtr + index, + outPtr + ((padParamSpec.top + ih - 1) * ow * alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Reflect) { + // memcpy(outPtr+index, outPtr+((padParamSpec.top+ih-2-h)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); + memcpy(outPtr + index, + outPtr + + ((padParamSpec.top + ih - 1 - padParamSpec.bottom + h) * ow * + alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + memcpy(outPtr + index, + outPtr + ((padParamSpec.top + ih - 1 - h) * ow * alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else { + return NOT_SUPPORTED; + } + } + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/power.cpp b/compute/tensor/src/cpu/power.cpp new file mode 100644 index 00000000..cf08407e --- /dev/null +++ b/compute/tensor/src/cpu/power.cpp @@ -0,0 +1,30 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
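Aside (sketch, not part of the patch): power_cpu in the file below chains two array primitives — first y = scale * x + shift, then y = y^power. An equivalent scalar restatement:

#include <cmath>
#include <cstdio>

// y = (scale * x + shift) ^ power, the composition applied by power_cpu.
static void powerOp(const float *x, int len, float scale, float shift, float power, float *y)
{
    for (int i = 0; i < len; i++) {
        y[i] = std::pow(scale * x[i] + shift, power);
    }
}

int main()
{
    float x[3] = {1, 2, 3}, y[3];
    powerOp(x, 3, 2.0f, 1.0f, 2.0f, y);  // (2x+1)^2 = {9, 25, 49}
    std::printf("%g %g %g\n", y[0], y[1], y[2]);
    return 0;
}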
+ +#include "cpu/tensor_computing_cpu.h" +#include "cpu/cpu_functions.h" + +EE power_cpu( + TensorDesc inputDesc, void *input, PowerParamSpec p, TensorDesc outputDesc, void *output, Arch arch) +{ + UNUSED(outputDesc); + ArrayScaleFunction scale_func = get_array_scale_function(arch); + ArrayPowerFunction power_func = get_array_power_function(arch); + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + scale_func(inputDesc.dt, input, output, tensorNumElements(inputDesc), p.scale, p.shift); + power_func(outputDesc.dt, output, output, tensorNumElements(inputDesc), p.power); + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/priorbox.cpp b/compute/tensor/src/cpu/priorbox.cpp new file mode 100644 index 00000000..f5f31780 --- /dev/null +++ b/compute/tensor/src/cpu/priorbox.cpp @@ -0,0 +1,206 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
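Aside (sketch, not part of the patch): priorbox_cpu below emits, per feature-map cell, (num_ars * (flip ? 2 : 1) + 1) * num_min + num_max boxes of four coordinates, followed by a variance block of the same size. Worked count as a standalone check:

#include <cstdio>

// Priors per cell: (num_ars * (flip ? 2 : 1) + 1) * num_min + num_max.
static unsigned priorsPerCell(unsigned numAspectRatios, bool flip, unsigned numMin, unsigned numMax)
{
    return (numAspectRatios * (flip ? 2u : 1u) + 1u) * numMin + numMax;
}

int main()
{
    unsigned k = priorsPerCell(2, true, 1, 1);                     // (2*2 + 1)*1 + 1 = 6
    std::printf("%u priors, %u floats\n", k, 19u * 19u * k * 4u);  // a 19x19 map -> 8664 floats
    return 0;
}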
+
+#include <math.h>  // added: sqrt/ceil below (assumed missing from the mangled diff)
+#include "cpu/tensor_computing_cpu.h"
+
+template <typename T>
+static EE priorbox_kernel(DataType idt0,
+    T *output,
+    U32 ih_layer,
+    U32 iw_layer,
+    U32 ih_img,
+    U32 iw_img,
+    std::vector<F32> minsizes,
+    std::vector<F32> maxsizes,
+    std::vector<F32> ars,
+    U32 flip,
+    U32 clip,
+    F32 *vars,
+    I32 imageW,
+    I32 imageH,
+    F32 stepW,
+    F32 stepH,
+    F32 offset,
+    Arch arch)
+{
+    U32 layer_w = iw_layer;
+    U32 layer_h = ih_layer;
+
+    int img_w, img_h;
+    if (imageH == 0 || imageW == 0) {
+        img_w = iw_img;
+        img_h = ih_img;
+    } else {
+        img_w = imageW;
+        img_h = imageH;
+    }
+    F32 stp_h, stp_w;
+    if (stepW == 0 || stepH == 0) {
+        // divide in floating point so the ceil has an effect
+        stp_w = static_cast<F32>(ceil(static_cast<F32>(img_w) / layer_w));
+        stp_h = static_cast<F32>(ceil(static_cast<F32>(img_h) / layer_h));
+    } else {
+        stp_w = stepW;
+        stp_h = stepH;
+    }
+
+    U32 num_priorboxs = ars.size();
+    if (flip) {
+        num_priorboxs = num_priorboxs * 2;
+    }
+    U32 num_minsize = minsizes.size();
+    num_priorboxs = (num_priorboxs + 1) * num_minsize;
+    if (!maxsizes.empty()) {
+        U32 num_maxsize = maxsizes.size();
+        num_priorboxs = num_priorboxs + num_maxsize;
+    }
+    int dim = layer_h * layer_w * num_priorboxs * 4;
+    int idx = 0;
+    for (U32 h = 0; h < layer_h; h++) {
+        for (U32 w = 0; w < layer_w; w++) {
+            F32 center_x = (w + offset) * stp_w;
+            F32 center_y = (h + offset) * stp_h;
+            F32 box_w, box_h;
+            for (int n = 0; n < (int)minsizes.size(); n++) {
+                F32 minsize = minsizes[n];
+                box_w = box_h = minsize;
+                output[idx++] = (center_x - box_w / 2) / img_w;
+                output[idx++] = (center_y - box_h / 2) / img_h;
+                output[idx++] = (center_x + box_w / 2) / img_w;
+                output[idx++] = (center_y + box_h / 2) / img_h;
+
+                if ((int)maxsizes.size() > 0) {
+                    F32 maxsize = maxsizes[n];
+                    box_w = box_h = sqrt(minsize * maxsize);
+                    output[idx++] = (center_x - box_w / 2) / img_w;
+                    output[idx++] = (center_y - box_h / 2) / img_h;
+                    output[idx++] = (center_x + box_w / 2) / img_w;
+                    output[idx++] = (center_y + box_h / 2) / img_h;
+                }
+
+                for (int a = 0; a < (int)ars.size(); a++) {
+                    F32 ar = ars[a];
+                    box_w = minsize * sqrt(ar);
+                    box_h = minsize / sqrt(ar);
+                    output[idx++] = (center_x - box_w / 2) / img_w;
+                    output[idx++] = (center_y - box_h / 2) / img_h;
+                    output[idx++] = (center_x + box_w / 2) / img_w;
+                    output[idx++] = (center_y + box_h / 2) / img_h;
+                    if (flip) {
+                        output[idx++] = (center_x - box_h / 2) / img_w;
+                        output[idx++] = (center_y - box_w / 2) / img_h;
+                        output[idx++] = (center_x + box_h / 2) / img_w;
+                        output[idx++] = (center_y + box_w / 2) / img_h;
+                    }
+                }
+            }
+        }
+    }
+    EE ret = SUCCESS;
+    if (clip) {
+        ClipParamSpec p;
+        p.min = 0;
+        p.max = 1;
+        TensorDesc desc = tensor1d(idt0, dim);
+        ret = clip_cpu(desc, output, p, desc, output, arch);
+    }
+
+    for (int i = 0; i < dim / 4; i++) {
+        output[idx++] = vars[0];
+        output[idx++] = vars[1];
+        output[idx++] = vars[2];
+        output[idx++] = vars[3];
+    }
+    return ret;
+}
+
+EE priorbox_cpu(std::vector<TensorDesc> inputDesc,
+    PriorBoxParamSpec priorBoxParamSpec,
+    TensorDesc outputDesc,
+    void *output,
+    Arch arch)
+{
+    UNUSED(outputDesc);
+    if (nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    U32 num = inputDesc.size();
+    if (num != 2) {
+        return NOT_MATCH;
+    }
+    DataType idt0, idt1;
+    DataFormat idf0, idf1;
+    U32 in0 = 0, ic0 = 0, ih0 = 0, iw0 = 0;
+    U32 in1 = 0, ic1 = 0, ih1 = 0, iw1 = 0;
+    CHECK_STATUS(tensor4dGet(inputDesc[0], &idt0, &idf0, &in0, &ic0, &ih0, &iw0));
+    CHECK_STATUS(tensor4dGet(inputDesc[1], &idt1, &idf1, &in1, &ic1, &ih1, &iw1));
+
+    std::vector<F32> minsizes;
+    for (int i = 0; i < 2; i++) {
+        if (priorBoxParamSpec.min_sizes[i] == 0) {
+            break;
+        }
minsizes.push_back(priorBoxParamSpec.min_sizes[i]); + } + std::vector maxsizes; + for (int i = 0; i < 2; i++) { + if (priorBoxParamSpec.max_sizes[i] == 0) { + break; + } + maxsizes.push_back(priorBoxParamSpec.max_sizes[i]); + } + std::vector ars; + for (int i = 0; i < 2; i++) { + if (priorBoxParamSpec.aspect_ratios[i] == 0) { + break; + } + ars.push_back(priorBoxParamSpec.aspect_ratios[i]); + } + U32 flip = priorBoxParamSpec.flip; + U32 clip = priorBoxParamSpec.clip; + F32 vars[4]; + for (int i = 0; i < 4; i++) { + vars[i] = priorBoxParamSpec.variances[i]; + } + U32 imageH = priorBoxParamSpec.image_h; + U32 imageW = priorBoxParamSpec.image_w; + F32 stepH = priorBoxParamSpec.step_h; + F32 stepW = priorBoxParamSpec.step_w; + F32 offset = priorBoxParamSpec.offset; + + EE ret = SUCCESS; + switch (idt0) { +#ifdef _USE_FP32 + case DT_F32: + ret = priorbox_kernel(idt0, (F32 *)output, ih0, iw0, ih1, iw1, minsizes, maxsizes, + ars, flip, clip, vars, imageW, imageH, stepW, stepH, offset, arch); + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + ret = priorbox_kernel(idt0, (F16 *)output, ih0, iw0, ih1, iw1, minsizes, maxsizes, + ars, flip, clip, vars, imageW, imageH, stepW, stepH, offset, arch); + break; +#endif +#ifdef _USE_INT8 + case DT_I8: { + ret = priorbox_kernel(idt0, (F16 *)output, ih0, iw0, ih1, iw1, minsizes, maxsizes, + ars, flip, clip, vars, imageW, imageH, stepW, stepH, offset, arch); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/reduction.cpp b/compute/tensor/src/cpu/reduction.cpp new file mode 100644 index 00000000..5271d9fa --- /dev/null +++ b/compute/tensor/src/cpu/reduction.cpp @@ -0,0 +1,198 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
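+// Editorial note (illustrative, not part of the upstream patch): the reduction
+// kernel views the tensor as loopOuter x len x loopInner, where len is the size
+// of the reduced axis. When loopInner == 1 the reduced data is contiguous, so
+// the vectorized array functions (sum/mean/var) apply directly; otherwise the
+// kernel accumulates slice by slice into the output. Worked shapes, assuming a
+// 2x3x4 input reduced over axis 1 with REDUCTION_MEAN:
+//     loopOuter = 2, len = 3, loopInner = 4  ->  output holds 2x4 means.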
+ +#include +#include "cpu/tensor_computing_cpu.h" +#include "cpu/cpu_functions.h" + +template +static EE reduction_kernel(TensorDesc inputDesc, + const T *input, + TensorDesc maskDesc, + const float *mask, + I32 axis, + ReductionMode reductionMode, + TensorDesc outputDesc, + T *output, + Arch arch) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + ArraySumFunction sum_func = get_array_sum_function(arch); + ArrayMeanFunction mean_func = get_array_mean_function(arch); + ArrayVarFunction var_func = get_array_var_function(arch); + ArrayAddFunction add_func = get_array_add_function(arch); + ArraySquareAndAddFunction square_and_add_func = get_array_square_and_add_function(arch); + ArrayScaleFunction scale_func = get_array_scale_function(arch); + + if (axis < 0) { + axis = inputDesc.nDims + axis; + } + axis = inputDesc.nDims - 1 - axis; + U32 loopInner = 1; + for (int i = 0; i < axis; i++) { + loopInner *= inputDesc.dims[i]; + } + U32 loopOuter = 1; + for (U32 i = axis + 1; i < inputDesc.nDims; i++) { + loopOuter *= inputDesc.dims[i]; + } + U32 len = inputDesc.dims[axis]; + U32 maskLen = tensorNumElements(maskDesc); + maskLen = (maskLen > 0) ? maskLen : len; + U32 axisDim = maskLen / len; + for (U32 i = 0; i < loopOuter; i++) { + if (loopInner == 1) { + if (mask != nullptr) { + return NOT_SUPPORTED; + } + const T *array = input + i * len; + F32 tmpValue = 0; + switch (reductionMode) { + case REDUCTION_SUM: + output[i] = sum_func(inputDesc.dt, array, len); + break; + case REDUCTION_MEAN: + output[i] = mean_func(inputDesc.dt, array, len); + break; + case REDUCTION_STD_DEVIATION: + tmpValue = mean_func(inputDesc.dt, array, len); + tmpValue = var_func(inputDesc.dt, array, len, tmpValue); + output[i] = sqrt(tmpValue); + break; + case REDUCTION_SCALAR_PRODUCT: + tmpValue = var_func(inputDesc.dt, array, len, 0); + break; + default: + return NOT_SUPPORTED; + } + } else { + CHECK_REQUIREMENT(REDUCTION_STD_DEVIATION != reductionMode); + for (U32 j = 0; j < maskLen; j += len) { + U32 axisIndex = j / len; + U32 outputIndex = (i * axisDim + axisIndex) * loopInner; + if (reductionMode == REDUCTION_SUM || reductionMode == REDUCTION_MEAN || + reductionMode == REDUCTION_SCALAR_PRODUCT) { + memset(output + outputIndex, 0, loopInner * bytesOf(inputDesc.dt)); + } else { + return NOT_SUPPORTED; + } + U32 count = 0; + for (U32 k = 0; k < len; k++) { + if (mask == nullptr || (mask != nullptr && mask[j + k] == 1)) { + if (reductionMode == REDUCTION_SUM || reductionMode == REDUCTION_MEAN) { + add_func(inputDesc.dt, output + outputIndex, + &input[(i * len + k) * loopInner], output + outputIndex, loopInner); + count++; + } else if (reductionMode == REDUCTION_SCALAR_PRODUCT) { + square_and_add_func(inputDesc.dt, output + outputIndex, + &input[(i * len + k) * loopInner], output + outputIndex, loopInner); + } else { + return NOT_SUPPORTED; + } + } + } + if (reductionMode == REDUCTION_MEAN) { + scale_func(inputDesc.dt, output + outputIndex, output + outputIndex, loopInner, + 1.0 / count, 0); + } + } + } + } + return SUCCESS; +} + +EE reduction_cpu(TensorDesc inputDesc, + const void *input, + TensorDesc maskDesc, + const void *mask, + ReductionParamSpec p, + int tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + Arch arch) +{ + EE ret = SUCCESS; + ArrayScaleFunction scale_func = get_array_scale_function(arch); + int start = 0; + TensorDesc tmpDesc = inputDesc; + if (inputDesc.df == DF_NCHWC8) { + for (int i = 0; i < p.axes_num; i++) { + // channel dimension + if (p.axes[i] 
== 1 || p.axes[i] == -3) { + start = -1; + break; + } + } + for (int i = (int)inputDesc.nDims - 1; i >= 0; i--) { + tmpDesc.dims[i + 1] = tmpDesc.dims[i]; + } + tmpDesc.dims[3] /= 8; + tmpDesc.dims[0] = 8; + tmpDesc.nDims += 1; + } + const void *tmp1 = input; + void *tmp2 = nullptr; + for (int i = start; i < p.axes_num; i++) { + if (p.axes_num - start == 1) { + tmp2 = output; + } else { + tmp2 = (char *)tmp + (i - start) % 2 * (tmpBytes / 2); + } + int axis; + if (i == -1) { + axis = 4; + } else { + axis = p.axes[i]; + } + + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = reduction_kernel(tmpDesc, (const F32 *)tmp1, maskDesc, + (const float *)mask, axis, p.reduction_mode, outputDesc, (F32 *)tmp2, arch); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = reduction_kernel(tmpDesc, (const F16 *)tmp1, maskDesc, + (const float *)mask, axis, p.reduction_mode, outputDesc, (F16 *)tmp2, arch); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + tmp1 = tmp2; + if (axis < 0) { + axis = tmpDesc.nDims + axis; + } + axis = tmpDesc.nDims - 1 - axis; + tmpDesc.dims[axis] = 1; + } + + if (tmp2 != output) { + memcpy(output, tmp2, tensorNumBytes(outputDesc)); + } + + if (p.coeff != 1) { + scale_func(outputDesc.dt, output, output, tensorNumElements(outputDesc), p.coeff, 0); + } + + return ret; +} diff --git a/compute/tensor/src/cpu/reshape.cpp b/compute/tensor/src/cpu/reshape.cpp new file mode 100644 index 00000000..58c2641c --- /dev/null +++ b/compute/tensor/src/cpu/reshape.cpp @@ -0,0 +1,120 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "cpu/tensor_computing_cpu.h" + +EE reshape_infer_output_size_cpu(TensorDesc inputDesc, ReshapeParamSpec p, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + return NULL_POINTER; + } + I32 *shape = p.shape_dims; + I32 shape_size = p.shape_size; + int inputElementNum = tensorNumElements(inputDesc); + int outputElementNum = 1; + for (int i = 0; i < shape_size; i++) { + outputElementNum *= shape[i]; + } + int index_range = ((int)inputDesc.nDims > shape_size) ? 
shape_size : inputDesc.nDims; + if (inputElementNum > 0 && outputElementNum > 0 && inputElementNum != outputElementNum) { + for (int i = 0; i < index_range; i++) { + if ((inputElementNum / (int)inputDesc.dims[inputDesc.nDims - 1 - i]) == + (outputElementNum / shape[i])) { + shape[i] = inputDesc.dims[inputDesc.nDims - 1 - i]; + break; + } + } + } + + *outputDesc = inputDesc; + (*outputDesc).nDims = shape_size; + if (shape_size == 2) { + (*outputDesc).df = DF_NORMAL; + } + if (shape_size >= 4) { + (*outputDesc).df = DF_NCHW; + } + + U32 factor = 1; + I32 count = 0; + for (I32 i = 0; i < shape_size; i++) { + I32 value = shape[i]; + if (value == 0) { + value = inputDesc.dims[inputDesc.nDims - 1 - i]; + } + if (value == -1) { + value = 0; + count++; + } else { + factor *= value; + } + + (*outputDesc).dims[shape_size - 1 - i] = value; + } + if (count > 1) { + return NOT_SUPPORTED; + } + + for (I32 i = 0; i < shape_size; i++) { + if ((*outputDesc).dims[i] == 0) { + (*outputDesc).dims[i] = tensorNumElements(inputDesc) / factor; + } + } + + return SUCCESS; +} + +EE reshape_cpu(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + if (tensorNumElements(inputDesc) != tensorNumElements(outputDesc)) { + // Only allow the removal of padded convolution channels + CHECK_REQUIREMENT(DF_NCHWC8 == inputDesc.df); + CHECK_REQUIREMENT(tensorNumElements(inputDesc) >= tensorNumElements(outputDesc)); + inputDesc.df = DF_NCHW; + } + if (DF_NCHWC8 != inputDesc.df) { + if (output != input) { + memcpy(output, input, tensorNumBytes(outputDesc)); + } + } else { + CHECK_REQUIREMENT(input != output); + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + + U32 elementBytes = bytesOf(idt); + ic /= 8; + U8 *inPtr = (U8 *)input; + U8 *outPtr = (U8 *)output; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 hw = 0; hw < ih * iw; hw++) { + for (U32 c8 = 0; c8 < 8; c8++) { + memcpy(outPtr + + elementBytes * (n * ic * 8 * ih * iw + (c * 8 + c8) * ih * iw + hw), + inPtr + + elementBytes * (n * ic * ih * iw * 8 + c * ih * iw * 8 + hw * 8 + c8), + elementBytes); + } + } + } + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/rnn.cpp b/compute/tensor/src/cpu/rnn.cpp new file mode 100644 index 00000000..3a7d9465 --- /dev/null +++ b/compute/tensor/src/cpu/rnn.cpp @@ -0,0 +1,273 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "cpu/tensor_computing_cpu.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#include "blas_enhance.h" + +template +static EE rnn_transform_filter(TensorDesc filterDesc, + const T *filterArray, + RNNParamSpec rnnParamSpec, + TensorDesc *ftmDesc, + T *ftmArray) +{ + if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt; + DataFormat fdf; + U32 fn, fk, ftm_n, ftm_k; + CHECK_STATUS(tensor2dGet(filterDesc, &fdt, &fdf, &fn, &fk)); + U32 alignSize = 32; + EE ret = SUCCESS; + switch (fdf) { + case DF_NKN32: { + ftm_n = fn; + ftm_k = fk; + break; + } + case DF_NK: { + // NK => NKN32 + if (fn % alignSize != 0) { + return NOT_MATCH; + } + ftm_n = fn / alignSize; + ftm_k = fk; + for (U32 n = 0; n < ftm_n; n++) { + for (U32 k = 0; k < ftm_k; k++) { + for (U32 n32 = 0; n32 < alignSize; n32++) { + ftmArray[n * ftm_k * alignSize + k * alignSize + n32] = + filterArray[(n * alignSize + n32) * ftm_k + k]; + } + } + } + break; + } + default: + ret = NOT_MATCH; + break; + } + *ftmDesc = tensor2df(fdt, DF_NKN32, fn, fk); + return ret; +} + +static EE rnn_transform_filter_cpu_kernel(TensorDesc filterDesc, + const void *filterArray, + RNNParamSpec rnnParamSpec, + TensorDesc *ftmDesc, + void *ftmArray) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = rnn_transform_filter( + filterDesc, (const F32 *)filterArray, rnnParamSpec, ftmDesc, (F32 *)ftmArray); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = rnn_transform_filter( + filterDesc, (const F16 *)filterArray, rnnParamSpec, ftmDesc, (F16 *)ftmArray); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE rnn_transform_filter_cpu(const TensorDesc *filterDesc, + const void **filterArray, + RNNParamSpec rnnParamSpec, + TensorDesc *ftmDesc, + void **ftmArray) +{ + int num1 = rnnParamSpec.biDirection ? 2 : 1; + int num2 = rnnParamSpec.numProjection > 0 ? 2 : 1; + EE ret = SUCCESS; + for (int i = 0; i < num1 * num2; i++) { + ret = rnn_transform_filter_cpu_kernel( + filterDesc[i], filterArray[i], rnnParamSpec, &ftmDesc[i], ftmArray[i]); + } + return ret; +} + +EE rnn_transform_filter_bytes_cpu( + const TensorDesc *filterDesc, RNNParamSpec rnnParamSpec, U32 *bytes) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + int num1 = rnnParamSpec.biDirection ? 2 : 1; + int num2 = rnnParamSpec.numProjection > 0 ? 2 : 1; + for (int i = 0; i < num1 * num2; i++) { + bytes[i] = tensorNumBytes(filterDesc[i]); + } + return SUCCESS; +} + +EE rnncell_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + RNNParamSpec rnnParamSpec, + U32 *bytes, + Arch arch) +{ + UNUSED(outputDesc); + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt; + DataFormat idf; + U32 batch, xDim; + CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &batch, &xDim)); + U32 hDim = rnnParamSpec.numOutput; + U32 column = (rnnParamSpec.numProjection > 0) ? 
rnnParamSpec.numProjection + : rnnParamSpec.numOutput; + *bytes = (hDim + xDim + column * 4) * bytesOf(idt); + return SUCCESS; +} + +EE rnn_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + RNNParamSpec rnnParamSpec, + U32 *bytes, + Arch arch) +{ + UNUSED(filterDesc); + UNUSED(outputDesc); + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt; + DataFormat idf; + U32 batch, step, xDim; + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &batch, &step, &xDim)); + U32 hDim = rnnParamSpec.numOutput; + TensorDesc xDesc = tensor2df(idt, DF_NORMAL, batch, xDim); + CHECK_STATUS(rnncell_infer_forward_tmp_bytes_cpu( + xDesc, filterDesc, outputDesc, rnnParamSpec, bytes, arch)); + U32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection + : rnnParamSpec.numOutput; + *bytes += batch * (column + hDim) * bytesOf(idt); + return SUCCESS; +} + +EE rnncell_cpu(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + U32 tmpBytes, + void *tmp, + TensorDesc hDesc, + void *currentH, + Arch arch) +{ + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = rnncell_general(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, tmpBytes, + tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, currentH); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = rnncell_x86(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, tmpBytes, tmp, + rnnParamSpec, batchStrideX, batchStrideH, hDesc, currentH, arch); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = rnncell_arm(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, tmpBytes, tmp, + rnnParamSpec, batchStrideX, batchStrideH, hDesc, currentH, arch); +#endif + } + return ret; +} + +EE rnn_cpu(TensorDesc inputDesc, + const void *input, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + RNNParamSpec rnnParamSpec, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + Arch arch) +{ + UNUSED(outputDesc); + + if (nullptr == input || nullptr == filter || nullptr == bias || nullptr == tmp || + nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt; + DataFormat idf; + U32 batch, step, xDim; + int num1 = rnnParamSpec.biDirection ? 2 : 1; + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &batch, &step, &xDim)); + U32 hDim = rnnParamSpec.numOutput; + U32 column = (rnnParamSpec.numProjection > 0) ? 
rnnParamSpec.numProjection + : rnnParamSpec.numOutput; + + U8 *cellState = (U8 *)tmp; + U8 *tmpArray = cellState + batch * (column + hDim) * bytesOf(idt); + U32 batchStrideX = step * xDim; + U32 batchStrideH = step * hDim * num1; + TensorDesc xDesc = tensor2df(idt, DF_NORMAL, batch, xDim); + TensorDesc hDesc = tensor2df(idt, DF_NORMAL, batch, hDim); + + memset(cellState, 0, batch * (column + hDim) * bytesOf(idt)); + for (U32 t = 0; t < step; t++) { + const U8 *currentX = (const U8 *)input + t * xDim * bytesOf(idt); + U8 *currentH = (U8 *)output + t * hDim * num1 * bytesOf(idt); + CHECK_STATUS(rnncell_cpu(xDesc, currentX, filterDesc, filter, biasDesc, bias, cellState, + rnnParamSpec, batchStrideX, batchStrideH, tmpBytes, tmpArray, hDesc, currentH, arch)); + } + + if (rnnParamSpec.biDirection) { + memset(cellState, 0, batch * (column + hDim) * bytesOf(idt)); + int num2 = (rnnParamSpec.numProjection > 0) ? 2 : 1; + for (I32 t = step - 1; t >= 0; t--) { + const U8 *currentX = (const U8 *)input + t * xDim * bytesOf(idt); + U8 *currentH = (U8 *)output + t * hDim * num1 * bytesOf(idt) + hDim * bytesOf(idt); + CHECK_STATUS(rnncell_cpu(xDesc, currentX, &filterDesc[num2], &filter[num2], + &biasDesc[num2], &bias[num2], cellState, rnnParamSpec, batchStrideX, batchStrideH, + tmpBytes, tmpArray, hDesc, currentH, arch)); + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/roialign.cpp b/compute/tensor/src/cpu/roialign.cpp new file mode 100644 index 00000000..42969206 --- /dev/null +++ b/compute/tensor/src/cpu/roialign.cpp @@ -0,0 +1,170 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
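+// Editorial note (illustrative, not part of the upstream patch): ROI Align
+// samples each output bin at bin_grid_h x bin_grid_w points, reads every point
+// with bilinear interpolation, and averages the samples, avoiding the
+// quantization error of ROI Pooling. spatial_scale maps ROI coordinates from
+// image space into feature-map space. Worked numbers, assuming a 32x32 ROI on
+// a stride-16 feature map (spatial_scale = 1/16) with output_w = output_h = 2
+// and sampling_ratio = 0: the ROI becomes 2x2 on the feature map, each bin is
+// 1x1, and the adaptive grid is ceil(2 / 2) = 1 sample per bin.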
+ +#include "tensor_computing_type.h" +#include "cpu/tensor_computing_cpu.h" + +template +static F32 bilinear_interpolate(T *data, U32 w, U32 h, F32 x, F32 y) +{ + if (y < -1.0 || y > h || x < -1.0 || x > w) { + return 0; + } + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + + U32 x0 = x; + U32 x1 = x0 + 1; + U32 y0 = y; + U32 y1 = y0 + 1; + + F32 hx = x1 - x; + F32 lx = x - x0; + F32 hy = y1 - y; + F32 ly = y - y0; + + if (x1 >= w) { + x1 = w - 1; + hx = 1.f; + lx = 0.f; + } + if (y1 >= h) { + y1 = h - 1; + hy = 1.f; + ly = 0.f; + } + + F32 r0 = data[y0 * w + x0] * hx + data[y0 * w + x1] * lx; + F32 r1 = data[y1 * w + x0] * hx + data[y1 * w + x1] * lx; + + F32 val = r0 * hy + r1 * ly; + return val; +} + +template +static EE roialign_kernel(std::vector input, + T *output, + std::vector inputDesc, + U32 output_h, + U32 output_w, + U32 sampling_ratio, + F32 spatial_scale) +{ + DataType idt0, idt1; + DataFormat idf0, idf1; + U32 in0, ic0, ih0, iw0; + U32 ih1, iw1; + CHECK_STATUS(tensor4dGet(inputDesc[0], &idt0, &idf0, &in0, &ic0, &ih0, &iw0)); + CHECK_STATUS(tensor2dGet(inputDesc[1], &idt1, &idf1, &ih1, &iw1)); + T *feature_map = (T *)input[0]; + T *rois = (T *)input[1]; + CHECK_REQUIREMENT(idf0 == DF_NCHWC8 || idf0 == DF_NCHW); + if (inputDesc[0].df == DF_NCHWC8) { + T *tmp = (T *)malloc(tensorNumBytes(inputDesc[0])); + memcpy(tmp, feature_map, tensorNumBytes(inputDesc[0])); + CHECK_STATUS(transformToNCHW(inputDesc[0], tmp, inputDesc[0], feature_map)); + free(tmp); + } + + U32 channel = ic0; + U32 feature_w = iw0; + U32 feature_h = ih0; + U32 num_rois = ih1; + for (U32 n = 0; n < num_rois; n++) { + U32 idx_n = n * channel * output_w * output_h; + F32 roi_start_x1 = static_cast(rois[n * 4]) * spatial_scale; + F32 roi_start_y1 = static_cast(rois[n * 4 + 1]) * spatial_scale; + F32 roi_end_x2 = static_cast(rois[n * 4 + 2]) * spatial_scale; + F32 roi_end_y2 = static_cast(rois[n * 4 + 3]) * spatial_scale; + + F32 roi_w = std::max(roi_end_x2 - roi_start_x1, 1.f); + F32 roi_h = std::max(roi_end_y2 - roi_start_y1, 1.f); + + F32 bin_size_w = roi_w / static_cast(output_w); + F32 bin_size_h = roi_h / static_cast(output_h); + + U32 bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_w / output_w); + U32 bin_grid_h = (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_h / output_h); + + F32 count = bin_grid_h * bin_grid_w; + for (U32 c = 0; c < channel; c++) { + U32 idx_nc = idx_n + c * output_h * output_w; + T *feature_map_offset = feature_map + c * feature_h * feature_w; + for (U32 ph = 0; ph < output_h; ph++) { + for (U32 pw = 0; pw < output_w; pw++) { + U32 idx = idx_nc + ph * output_w + pw; + F32 output_val = 0; + F32 start_x = roi_start_x1 + pw * bin_size_w; + F32 start_y = roi_start_y1 + ph * bin_size_h; + start_x = std::min(std::max(start_x, 0.f), (F32)feature_w); + start_y = std::min(std::max(start_y, 0.f), (F32)feature_h); + for (U32 by = 0; by < bin_grid_h; by++) { + F32 y = start_y + + static_cast(by + 0.5f) * bin_size_h / static_cast(bin_grid_h); + for (U32 bx = 0; bx < bin_grid_w; bx++) { + F32 x = start_x + + static_cast(bx + 0.5f) * bin_size_w / + static_cast(bin_grid_w); + F32 val = bilinear_interpolate( + (T *)feature_map_offset, feature_w, feature_h, x, y); + output_val += val; + } + } + output_val /= count; + output[idx] = output_val; + } + } + } + } + + return SUCCESS; +} + +EE roialign_cpu(std::vector inputDesc, + std::vector input, + RoiAlignParamSpec roiAlignParamSpec, + TensorDesc outputDesc, + void *output) +{ + UNUSED(outputDesc); + if (nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + U32 output_h = roiAlignParamSpec.output_h; + U32 output_w = roiAlignParamSpec.output_w; + U32 sampling_ratio = roiAlignParamSpec.sampling_ratio; + F32 spatial_scale = roiAlignParamSpec.spatial_scale; + EE ret = SUCCESS; + switch (inputDesc[0].dt) { +#ifdef _USE_FP32 + case DT_F32: + ret = roialign_kernel( + input, (F32 *)output, inputDesc, output_h, output_w, sampling_ratio, spatial_scale); + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + ret = roialign_kernel( + input, (F16 *)output, inputDesc, output_h, output_w, sampling_ratio, spatial_scale); + break; +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/slice.cpp b/compute/tensor/src/cpu/slice.cpp new file mode 100644 index 00000000..72b59cef --- /dev/null +++ b/compute/tensor/src/cpu/slice.cpp @@ -0,0 +1,64 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
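+// Editorial note (illustrative, not part of the upstream patch): slice copies
+// interleaved blocks. tileSize is the byte size of everything below the slice
+// axis, loops counts everything above it, and each output j receives
+// outputDesc[j].dims[axis] tiles per loop iteration. For NCHWC8 input sliced
+// on n or c, the folded 8-channel tile is absorbed into tileSize so the copy
+// stays layout-correct. Worked example, assuming an NCHW fp32 tensor 1x6x2x2
+// sliced on the channel axis into 2 + 4 channels:
+//     tileSize = 2 * 2 * 4 bytes, loops = 1;
+//     output 0 gets 2 tiles per iteration, output 1 gets 4.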
+
+#include <string.h>
+#include <vector>
+#include "cpu/tensor_computing_cpu.h"
+
+EE slice_cpu(TensorDesc inputDesc,
+    void *input,
+    SliceParamSpec p,
+    std::vector<TensorDesc> outputDesc,
+    std::vector<void *> *output)
+{
+    if (nullptr == input || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    U32 num = outputDesc.size();
+    if (num < 1) {
+        return NOT_MATCH;
+    }
+
+    int dim = inputDesc.nDims;
+    int axis = (p.axis + dim) % dim;
+    axis = dim - 1 - axis;
+    U32 tileSize = bytesOf(inputDesc.dt);
+    for (I32 i = 0; i < axis; i++) {
+        tileSize *= inputDesc.dims[i];
+    }
+    U32 loops = 1;
+    for (I32 i = axis + 1; i < dim; i++) {
+        loops *= inputDesc.dims[i];
+    }
+
+    if (inputDesc.df == DF_NCHWC8) {
+        if (axis < 2) {
+            tileSize *= 8;
+            loops /= 8;
+        }
+    }
+
+    U8 *ptr = (U8 *)input;
+    for (U32 i = 0; i < loops; i++) {
+        for (U32 j = 0; j < num; j++) {
+            U32 blockSize = outputDesc[j].dims[axis] * tileSize;
+            if (blockSize > 0 && nullptr == (*output)[j]) {
+                CHECK_STATUS(NULL_POINTER);
+            }
+            U8 *dstPtr = (U8 *)((*output)[j]) + i * blockSize;
+            memcpy(dstPtr, ptr, blockSize);
+            ptr += blockSize;
+        }
+    }
+    return SUCCESS;
+}
diff --git a/compute/tensor/src/cpu/split.cpp b/compute/tensor/src/cpu/split.cpp
new file mode 100644
index 00000000..38d25cb5
--- /dev/null
+++ b/compute/tensor/src/cpu/split.cpp
@@ -0,0 +1,39 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <string.h>
+#include <vector>
+
+#include "cpu/tensor_computing_cpu.h"
+
+EE split_cpu(TensorDesc inputDesc,
+    void *input,
+    std::vector<TensorDesc> outputDesc,
+    std::vector<void *> *output)
+{
+    UNUSED(inputDesc);
+    if (nullptr == input || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    if (outputDesc.size() <= 1) {
+        return NOT_MATCH;
+    }
+
+    for (U32 i = 0; i < (*output).size(); i++) {
+        if (nullptr == (*output)[i]) {
+            CHECK_STATUS(NULL_POINTER);
+        }
+        memcpy((*output)[i], input, tensorNumBytes(outputDesc[i]));
+    }
+    return SUCCESS;
+}
diff --git a/compute/tensor/src/cpu/tensor_computing_cpu.h b/compute/tensor/src/cpu/tensor_computing_cpu.h
new file mode 100644
index 00000000..4ce621d4
--- /dev/null
+++ b/compute/tensor/src/cpu/tensor_computing_cpu.h
@@ -0,0 +1,286 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_TENSOR_COMPUTING_CPU +#define _H_TENSOR_COMPUTING_CPU + +#include "sys.h" +#include "types.h" + +EE rnn_transform_filter_cpu(const TensorDesc *filterDescs, + const void **filterArray, + RNNParamSpec rnnParamSpec, + TensorDesc *ftmDesc, + void **ftmArray); + +EE rnn_transform_filter_bytes_cpu( + const TensorDesc *filterDesc, RNNParamSpec rnnParamSpec, U32 *bytes); + +EE rnncell_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + RNNParamSpec rnnParamSpec, + U32 *bytes, + Arch arch); + +EE rnn_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + RNNParamSpec rnnParamSpec, + U32 *bytes, + Arch arch); + +EE rnncell_cpu(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + U32 tmpBytes, + void *tmp, + TensorDesc hDesc, + void *currentH, + Arch arch); + +EE rnn_cpu(TensorDesc inputDesc, + const void *input, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + RNNParamSpec rnnParamSpec, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + Arch arch); + +EE embedding_cpu(TensorDesc inputDesc, + void *input, + void *weight, + EmbedParamSpec p, + TensorDesc outputDesc, + void *output); + +EE tfslice_infer_output_size_cpu(TensorDesc inputDesc, TfSliceParamSpec p, TensorDesc *outputDesc); + +EE tfslice_cpu( + TensorDesc inputDesc, void *input, TfSliceParamSpec p, TensorDesc outputDesc, void *output); + +EE padding_infer_output_size_cpu( + TensorDesc inputDesc, PadParamSpec padParamSpec, TensorDesc *outputDesc); + +EE padding_cpu(TensorDesc inputDesc, + const void *input, + PadParamSpec padParamSpec, + TensorDesc outputDesc, + void *output); + +EE reshape_infer_output_size_cpu(TensorDesc inputDesc, ReshapeParamSpec p, TensorDesc *outputDesc); + +EE reshape_cpu(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output); + +EE depthwise_convolution_transform_filter_bytes_cpu( + TensorDesc filterDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32 *bytes); + +EE eltwise_cpu(std::vector inputDesc, + std::vector input, + EltwiseParamSpec eltwiseDesc, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + Arch arch); + +EE roialign_cpu(std::vector inputDesc, + std::vector 
input, + RoiAlignParamSpec roiAlignParamSpec, + TensorDesc outputDesc, + void *output); + +EE split_cpu(TensorDesc inputDesc, + void *input, + std::vector outputDesc, + std::vector *output); + +EE transpose_cpu( + TensorDesc inputDesc, const void *input, U32 *dim, TensorDesc outputDesc, void *output); + +EE reduction_cpu(TensorDesc inputDesc, + const void *input, + TensorDesc maskDesc, + const void *mask, + ReductionParamSpec p, + int tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + Arch arch); + +EE non_max_suppression_cpu(std::vector inputDesc, + std::vector input, + NonMaxSuppressionParamSpec nonMaxSuppressionParamSpec, + TensorDesc outputDesc, + void *output); + +EE concat_cpu(std::vector inputDesc, + std::vector input, + void *inputScale, + ConcatParamSpec p, + void *tmp, + TensorDesc outputDesc, + void *output, + void *outputScale); + +EE l2normalization_cpu( + TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output, Arch arch); + +EE power_cpu(TensorDesc inputDesc, + void *input, + PowerParamSpec p, + TensorDesc outputDesc, + void *output, + Arch arch); + +EE slice_cpu(TensorDesc inputDesc, + void *input, + SliceParamSpec p, + std::vector outputDesc, + std::vector *output); + +EE priorbox_cpu(std::vector inputDesc, + PriorBoxParamSpec priorBoxParamSpec, + TensorDesc outputDesc, + void *output, + Arch arch); + +EE clip_cpu(TensorDesc inputDesc, + void *input, + ClipParamSpec p, + TensorDesc outputDesc, + void *output, + Arch arch); + +EE detectionoutput_cpu(std::vector inputDesc, + std::vector input, + DetectionOutputParamSpec detectionOutputParamSpec, + TensorDesc outputDesc, + void *output); + +EE deconvolution_infer_forward_algorithm_cpu(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + ConvolutionForwardAlgorithm *algorithm, + DataType targetDataType, + Arch arch); + +EE deconvolution_transform_filter_bytes_cpu(TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes, + Arch arch); + +EE deconvolution_transform_filter_cpu(TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed, + Arch arch); + +EE deconvolution_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes, + Arch arch); + +EE deconvolution_cpu(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc scaleDesc, + const void *scale, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc, + Arch arch); + +EE convolution_cpu(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc scaleDesc, + const void *scale, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc, + Arch arch); + +EE depthwise_pointwise_convolution_cpu(TensorDesc inputDesc, + void *input, + TensorDesc dwFilterDesc, + const void *dwFilter, + TensorDesc pwFilterDesc, + const void *pwFilter, + 
ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const void *dwBias, + TensorDesc pwBiasDesc, + const void *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch); + +EE activation_cpu(TensorDesc inputDesc, + void *input, + ActivationParamSpec activationDesc, + TensorDesc outputDesc, + void *output, + Arch arch); + +EE yolov3detectionoutput_cpu(std::vector inputDesc, + std::vector input, + Yolov3DetectionOutputParamSpec yolov3DetectionOutputParamSpec, + TensorDesc outputDesc, + void *output, + Arch arch); + +EE argmax_cpu( + TensorDesc inputDesc, const void *input, ArgMaxParamSpec p, TensorDesc outputDesc, void *output); + +#endif diff --git a/compute/tensor/src/cpu/tfslice.cpp b/compute/tensor/src/cpu/tfslice.cpp new file mode 100644 index 00000000..b72cc230 --- /dev/null +++ b/compute/tensor/src/cpu/tfslice.cpp @@ -0,0 +1,131 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/tensor_computing_cpu.h" + +EE tfslice_infer_output_size_cpu(TensorDesc inputDesc, TfSliceParamSpec p, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + int *begin = p.begin; + int *end = p.end; + int *strides = p.strides; + char *beginMask = p.begin_mask; + char *endMask = p.end_mask; + U32 dimSize = p.dim_size; + + CHECK_REQUIREMENT(dimSize == inputDesc.nDims); + *outputDesc = inputDesc; + for (U32 i = 0; i < dimSize; i++) { + int axis = dimSize - 1 - i; + int axisBegin = (beginMask[i] == 1) ? 0 : begin[i]; + int axisEnd = (endMask[i] == 1) ? 
inputDesc.dims[axis] : end[i]; + int num = (axisEnd - axisBegin) / strides[i]; + outputDesc->dims[axis] = num; + begin[i] = axisBegin; + end[i] = axisEnd; + } + if (inputDesc.df == DF_NCHWC8) { + int channelAxis = 1; + if (begin[channelAxis] % 8 != 0 || strides[channelAxis] != 1 || + (end[channelAxis] - begin[channelAxis]) / strides[channelAxis] % 8 != 0) { + outputDesc->df = DF_NCHW; + } + } + return SUCCESS; +} + +EE tfslice_cpu( + TensorDesc inputDesc, void *input, TfSliceParamSpec p, TensorDesc outputDesc, void *output) +{ + int *begin = p.begin; + int *end = p.end; + int *strides = p.strides; + char *beginMask = p.begin_mask; + char *endMask = p.end_mask; + U32 dimSize = p.dim_size; + for (U32 i = 0; i < dimSize; i++) { + int axis = dimSize - 1 - i; + int axisBegin = (beginMask[i] == 1) ? 0 : begin[i]; + int axisEnd = (endMask[i] == 1) ? inputDesc.dims[axis] : end[i]; + begin[i] = axisBegin; + end[i] = axisEnd; + } + + U32 num = tensorNumElements(outputDesc); + U8 *dst = (U8 *)output; + U32 elementSize = bytesOf(inputDesc.dt); + int channelAxis = inputDesc.nDims - 2; + if (inputDesc.df == outputDesc.df) { + std::vector tmpInputDims(inputDesc.nDims), tmpOutputDims(outputDesc.nDims); + memcpy(tmpInputDims.data(), inputDesc.dims, inputDesc.nDims * sizeof(U32)); + memcpy(tmpOutputDims.data(), outputDesc.dims, outputDesc.nDims * sizeof(U32)); + int startAxis = 0; + int elementNum = 1; + if (inputDesc.df == DF_NCHWC8) { + elementNum *= 8; + begin[1] /= 8; + tmpInputDims[channelAxis] /= 8; + tmpOutputDims[channelAxis] /= 8; + tmpInputDims.insert(tmpInputDims.begin(), 8); + tmpOutputDims.insert(tmpOutputDims.begin(), 8); + startAxis = 1; + } + for (int i = dimSize - 1; i >= 0; i--) { + int reverseAxis = dimSize - 1 - i; + if (begin[i] == 0 && end[i] == (int)inputDesc.dims[reverseAxis] && strides[i] == 1) { + elementNum *= (end[i] - begin[i]); + } else { + break; + } + } + U32 tileSize = elementSize * elementNum; + for (U32 i = 0; i < num; i += elementNum, dst += tileSize) { + std::vector localIndex = + calculateLocalIndex(i, tmpOutputDims.data(), tmpOutputDims.size()); + for (U32 j = 0; j < dimSize; j++) { + int reverseAxis = dimSize - 1 - j; + localIndex[startAxis + j] = + localIndex[startAxis + j] * strides[reverseAxis] + begin[reverseAxis]; + } + U32 srcIndex = + calculateGlobalIndex(localIndex.data(), tmpInputDims.data(), tmpInputDims.size()); + U8 *src = (U8 *)input + srcIndex * elementSize; + memcpy(dst, src, tileSize); + } + if (inputDesc.df == DF_NCHWC8) { + begin[1] *= 8; + } + } else { + CHECK_REQUIREMENT(inputDesc.df == DF_NCHWC8); + U32 tmpNDims = inputDesc.nDims + 1; + std::vector tmpDims(tmpNDims); + tmpDims[0] = 8; + memcpy(&(tmpDims[1]), inputDesc.dims, inputDesc.nDims * sizeof(U32)); + for (U32 i = 0; i < num; i++, dst += elementSize) { + std::vector localIndex = calculateLocalIndex(i, outputDesc.dims, outputDesc.nDims); + for (U32 j = 0; j < dimSize; j++) { + int reverseAxis = dimSize - 1 - j; + localIndex[j] = localIndex[j] * strides[reverseAxis] + begin[reverseAxis]; + } + int c8 = localIndex[channelAxis] % 8; + localIndex[channelAxis] /= 8; + localIndex.insert(localIndex.begin(), c8); + U32 index = calculateGlobalIndex(localIndex.data(), tmpDims.data(), tmpNDims); + U8 *src = (U8 *)input + index * elementSize; + memcpy(dst, src, elementSize); + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/transpose.cpp b/compute/tensor/src/cpu/transpose.cpp new file mode 100644 index 00000000..38007d61 --- /dev/null +++ b/compute/tensor/src/cpu/transpose.cpp @@ -0,0 
+1,24 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/tensor_computing_cpu.h" + +EE transpose_cpu( + TensorDesc inputDesc, const void *input, U32 *dim, TensorDesc outputDesc, void *output) +{ + if (nullptr == input || nullptr == output || nullptr == dim) { + CHECK_STATUS(NULL_POINTER); + } + return array_transpose( + inputDesc.dt, inputDesc.dims, input, outputDesc.dims, output, dim, inputDesc.nDims); +} diff --git a/compute/tensor/src/cpu/x86/attention_mask.cpp b/compute/tensor/src/cpu/x86/attention_mask.cpp new file mode 100644 index 00000000..52d0a85e --- /dev/null +++ b/compute/tensor/src/cpu/x86/attention_mask.cpp @@ -0,0 +1,40 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
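+// Editorial note (illustrative, not part of the upstream patch):
+// attention_mask_x86 below is a thin data-type dispatcher; the masking itself
+// runs in the precision-specific kernel (only the fp32 path is compiled in
+// here, behind _USE_FP32), and any other data type returns NOT_SUPPORTED.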
+ +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif + +EE attention_mask_x86(TensorDesc inputDesc, + const void *input, + AttentionMaskParamSpec p, + TensorDesc outputDesc, + void *output) +{ + DataType idt = inputDesc.dt; + EE ret = SUCCESS; + switch (idt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = attention_mask_fp32(inputDesc, (const F32 *)input, p, outputDesc, (F32 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + + return ret; +} diff --git a/compute/tensor/src/cpu/x86/check.cpp b/compute/tensor/src/cpu/x86/check.cpp new file mode 100644 index 00000000..0cf0c5ba --- /dev/null +++ b/compute/tensor/src/cpu/x86/check.cpp @@ -0,0 +1,105 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/x86/tensor_computing_x86.h" +#include "x86_avx2_expand.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif + +static EE check_u32(TensorDesc inputDescA, + const U32 *inputA, + TensorDesc inputDescB, + const U32 *inputB, + CheckMode checkMode, + TensorDesc outputDesc, + I32 *output) +{ + if (nullptr == inputA || nullptr == inputB || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) { + CHECK_STATUS(NOT_MATCH); + } + + U32 size = tensorNumElements(inputDescA); + U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; + if (tensorNumElements(outputDesc) != loopOuter) { + CHECK_STATUS(NOT_MATCH); + } + I32 length = size / loopOuter; + for (U32 j = 0; j < loopOuter; j++) { + const U32 *arrayA = inputA + j * length; + const U32 *arrayB = inputB + j * length; + switch (checkMode) { + case CHECK_EQUAL: { + __m256i count_v = _mm256_set1_epi32(0); + I32 i = 0; + for (; i < length - 7; i += 8) { + __m256i a = _mm256_loadu_si256((__m256i *)arrayA + i); + __m256i b = _mm256_loadu_si256((__m256i *)arrayA + i); + count_v = _mm256_add_epi32(count_v, _mm256_cmpeq_epi32(a, b)); + } + I32 count = _mm256_hadd_u32(count_v); + for (; i < length; i++) { + if (arrayA[i] == arrayB[i]) { + count++; + } + } + output[j] = (count == length); + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + } + return SUCCESS; +} + +EE check_x86(TensorDesc inputDescA, + const void *inputA, + TensorDesc inputDescB, + const void *inputB, + CheckParamSpec p, + TensorDesc outputDesc, + void *output) +{ + DataType idt = inputDescA.dt; + EE ret = SUCCESS; + switch (idt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = check_fp32(inputDescA, (const F32 *)inputA, inputDescB, (const F32 *)inputB, + p.check_mode, outputDesc, (I32 *)output); + break; + } +#endif + case DT_U32: { + ret = check_u32(inputDescA, (const U32 *)inputA, inputDescB, (const U32 *)inputB, + p.check_mode, outputDesc, (I32 *)output); + break; + } + case DT_I32: { + ret = check_u32(inputDescA, (const U32 *)inputA, inputDescB, (const U32 *)inputB, + p.check_mode, outputDesc, (I32 *)output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + + return ret; +} diff --git a/compute/tensor/src/cpu/x86/clip.cpp b/compute/tensor/src/cpu/x86/clip.cpp new file mode 100644 index 00000000..fae34a1b --- /dev/null +++ b/compute/tensor/src/cpu/x86/clip.cpp @@ -0,0 +1,35 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif + +EE clip_x86(TensorDesc inputDesc, void *input, ClipParamSpec p, TensorDesc outputDesc, void *output) +{ + UNUSED(outputDesc); + EE ret = SUCCESS; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = clip_fp32((F32 *)input, (F32 *)output, tensorNumElements(inputDesc), p.min, p.max); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/convolution.cpp b/compute/tensor/src/cpu/x86/convolution.cpp new file mode 100644 index 00000000..4692257a --- /dev/null +++ b/compute/tensor/src/cpu/x86/convolution.cpp @@ -0,0 +1,228 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
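+// Editorial note (illustrative, not part of the upstream patch): algorithm
+// selection below works as follows. Inputs that are not NCHWC8, or whose
+// per-group channel count is not a multiple of 8, take the GEMM_ICNCHW path;
+// 1x1 filters with stride 1 take the POINTWISE path; everything else falls
+// back to DIRECT. Worked outcome, assuming a 3x3 stride-1 convolution on
+// NCHWC8 input with group = 1 and ic = 64: neither special case applies, so
+// CONVOLUTION_ALGORITHM_DIRECT is chosen.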
+ +#include +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif +#include "ut_util.h" + +EE convolution_infer_forward_algorithm_x86(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + ConvolutionForwardAlgorithm *algorithm, + DataType targetDataType) +{ + UNUSED(filterDesc); + UNUSED(outputDesc); + UNUSED(convParamSpec); + UNUSED(policy); + UNUSED(targetDataType); + if (nullptr == algorithm) { + CHECK_STATUS(NULL_POINTER); + } + if (*algorithm != CONVOLUTION_ALGORITHM_NULL) { + return SUCCESS; + } + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 group = convParamSpec.group; + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if ((idf != DF_NCHWC8) || (ic / group % 8 != 0)) { + *algorithm = CONVOLUTION_ALGORITHM_GEMM_ICNCHW; + return SUCCESS; + } + + if ((strideH == 1) && (strideW == 1) && (fh == 1) && (fw == 1)) { + *algorithm = CONVOLUTION_ALGORITHM_POINTWISE; + return SUCCESS; + } + + *algorithm = CONVOLUTION_ALGORITHM_DIRECT; + return SUCCESS; +} + +EE convolution_transform_filter_bytes_x86(TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = SUCCESS; + + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + U32 fnAlignSize = 8; + U32 fnGroupSize = fn / convParamSpec.group; + U32 fnPadding = (fnGroupSize / fnAlignSize + ((fnGroupSize % fnAlignSize) == 0 ? 0 : 1)) * + fnAlignSize * convParamSpec.group; + U32 fcPadding = (fc / fnAlignSize + ((fc % fnAlignSize) == 0 ? 
0 : 1)) * fnAlignSize; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + *bytes = fnPadding * fcPadding * fh * fw; + break; + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: + *bytes = fnPadding * fc * fh * fw; + break; + case CONVOLUTION_ALGORITHM_POINTWISE: + *bytes = fnPadding * fcPadding; + break; + default: + return NOT_SUPPORTED; + } + *bytes *= bytesOf(fdt); + *bytes += 32; + return ret; +} + +EE convolution_transform_filter_x86(TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = convolution_transform_filter_fp32(filterDesc, (F32 *)filter, convParamSpec, + algorithm, ftmDesc, (F32 *)filterTransformed); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE convolution_infer_forward_tmp_bytes_x86(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = convolution_infer_forward_tmp_bytes_fp32( + inputDesc, filterDesc, outputDesc, convParamSpec, algorithm, bytes); + break; + } +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + + return ret; +} + +EE convolution_x86(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc scaleDesc, + const void *scale, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + UNUSED(scaleDesc); + UNUSED(scale); + U32 group = convParamSpec.group; + U32 batchAxis = inputDesc.nDims - 1; + U32 dataChannelAxis = inputDesc.nDims - 2; + U32 filterChannelAxis = filterDesc.nDims - 1; + U32 biasChannelAxis = 0; + CHECK_REQUIREMENT(inputDesc.dims[batchAxis] == 1); + U32 icGroupSize = inputDesc.dims[dataChannelAxis] / group; + + void *inputTransform; + if (inputDesc.df == DF_NCHWC8 && icGroupSize % 8 != 0) { + TensorDesc tmpInputDesc = inputDesc; + tmpInputDesc.df = DF_NCHW; + transformToNCHW(inputDesc, input, tmpInputDesc, tmp); + inputTransform = tmp; + tmp = (U8 *)tmp + tensorNumBytes(tmpInputDesc); + tmpBytes -= tensorNumBytes(tmpInputDesc); + inputDesc.df = DF_NCHW; + } else { + inputTransform = input; + } + + TensorDesc tmpInputDesc = inputDesc; + tmpInputDesc.dims[dataChannelAxis] /= group; + TensorDesc tmpOutputDesc = outputDesc; + tmpOutputDesc.dims[dataChannelAxis] /= group; + TensorDesc tmpFilterDesc = filterDesc; + tmpFilterDesc.dims[filterChannelAxis] /= group; + TensorDesc tmpBiasDesc = biasDesc; + tmpBiasDesc.dims[biasChannelAxis] /= group; + + TensorDesc paddingFilterDesc = tmpFilterDesc; + paddingFilterDesc.dims[filterChannelAxis] = (tmpFilterDesc.dims[filterChannelAxis] + 7) / 8 * 8; + + EE ret = SUCCESS; + for (U32 g = 0; g < group; g++) { + void *tmpInput = (U8 *)inputTransform + g * tensorNumBytes(tmpInputDesc); + const void *tmpFilter = (U8 *)filter + g * tensorNumBytes(paddingFilterDesc); + const void *tmpBias = (U8 *)bias + g * tensorNumBytes(tmpBiasDesc); + void *tmpOutput = (U8 *)output + g * tensorNumBytes(tmpOutputDesc); + switch (filterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = convolution_fp32(tmpInputDesc, (F32 *)tmpInput, 
tmpFilterDesc, + (F32 *)tmpFilter, convParamSpec, algorithm, tmpBiasDesc, (F32 *)tmpBias, + tmpBytes, tmp, tmpOutputDesc, (F32 *)tmpOutput, activationDesc, arch); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/deconvolution.cpp b/compute/tensor/src/cpu/x86/deconvolution.cpp new file mode 100644 index 00000000..b144224b --- /dev/null +++ b/compute/tensor/src/cpu/x86/deconvolution.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/x86/fp16/tensor_computing_fp16.h" +#endif + +EE deconvolution_transform_filter_x86(TensorDesc filterDesc, + const void *filter, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = deconvolution_transform_filter_fp32( + filterDesc, (F32 *)filter, algorithm, ftmDesc, (F32 *)filterTransformed); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/depthwise_convolution.cpp b/compute/tensor/src/cpu/x86/depthwise_convolution.cpp new file mode 100644 index 00000000..2423173a --- /dev/null +++ b/compute/tensor/src/cpu/x86/depthwise_convolution.cpp @@ -0,0 +1,102 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif + +EE depthwise_convolution_transform_filter_x86(TensorDesc filterDesc, + const void *filter, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = depthwise_convolution_transform_filter_fp32( + filterDesc, (F32 *)filter, algorithm, ftmDesc, (F32 *)filterTransformed); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_convolution_infer_forward_tmp_bytes_x86(TensorDesc inputDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *bytes) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + EE ret = SUCCESS; + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + *bytes = ic * ih_pad * iw_pad; + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: + *bytes = ic * ih_pad * iw_pad + ic * oh * ow; + break; + default: { + ret = NOT_MATCH; + *bytes = 0; + break; + } + } + *bytes *= bytesOf(idt); + *bytes += 32; + return ret; +} + +EE depthwise_convolution_x86(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec depthwiseActivationParamSpec, + Arch arch) +{ + TensorDesc blankTensorDesc; + ActivationParamSpec blankActivationParamSpec; + return depthwise_pointwise_convolution_x86(inputDesc, input, filterDesc, filter, blankTensorDesc, + nullptr, convParamSpec, algorithm, blankTensorDesc, bias, biasDesc, nullptr, tmpBytes, tmp, + outputDesc, output, depthwiseActivationParamSpec, blankActivationParamSpec, arch); +} diff --git a/compute/tensor/src/cpu/x86/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/x86/depthwise_pointwise_convolution.cpp new file mode 100644 index 00000000..02cada77 --- /dev/null +++ b/compute/tensor/src/cpu/x86/depthwise_pointwise_convolution.cpp @@ -0,0 +1,83 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif + +EE depthwise_pointwise_convolution_transform_filter_x86(TensorDesc dwFilterDesc, + const void *dwFilter, + TensorDesc pwFilterDesc, + const void *pwFilter, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc *dwFtmDesc, + void *dwFilterTransformed, + TensorDesc *pwFtmDesc, + void *pwFilterTransformed) +{ + EE ret = SUCCESS; + switch (dwFilterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = depthwise_pointwise_convolution_transform_filter_fp32(dwFilterDesc, + (F32 *)dwFilter, pwFilterDesc, (F32 *)pwFilter, algorithm, dwFtmDesc, + (F32 *)dwFilterTransformed, pwFtmDesc, (F32 *)pwFilterTransformed); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_pointwise_convolution_x86(TensorDesc inputDesc, + void *input, + TensorDesc dwFilterDesc, + const void *dwFilter, + TensorDesc pwFilterDesc, + const void *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const void *dwBias, + TensorDesc pwBiasDesc, + const void *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + EE ret = SUCCESS; + switch (dwFilterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = depthwise_pointwise_convolution_fp32(inputDesc, (F32 *)input, dwFilterDesc, + (const F32 *)dwFilter, pwFilterDesc, (const F32 *)pwFilter, convParamSpec, + algorithm, dwBiasDesc, (const F32 *)dwBias, pwBiasDesc, (const F32 *)pwBias, + tmpBytes, tmp, outputDesc, (F32 *)output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/eltwise.cpp b/compute/tensor/src/cpu/x86/eltwise.cpp new file mode 100644 index 00000000..8ead9916 --- /dev/null +++ b/compute/tensor/src/cpu/x86/eltwise.cpp @@ -0,0 +1,41 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <vector>
+#include "cpu/x86/tensor_computing_x86.h"
+#ifdef _USE_FP32
+#include "cpu/x86/fp32/tensor_computing_fp32.h"
+#endif
+
+EE eltwise_x86(DataType dataType,
+    std::vector<void *> input,
+    std::vector<U32> inputSize,
+    U32 num,
+    U32 len,
+    void *output,
+    EltwiseMode eltwiseMode)
+{
+    EE ret = SUCCESS;
+    switch (dataType) {
+#ifdef _USE_FP32
+        case DT_F32: {
+            ret = eltwise_fp32(input, inputSize, num, len, output, eltwiseMode);
+            break;
+        }
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/cpu/x86/fp32/attention_mask.cpp b/compute/tensor/src/cpu/x86/fp32/attention_mask.cpp
new file mode 100644
index 00000000..9d683bca
--- /dev/null
+++ b/compute/tensor/src/cpu/x86/fp32/attention_mask.cpp
@@ -0,0 +1,82 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
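+
+// Builds a qlen x klen visibility mask for the attention scores and applies it as
+// output = input * (1 - mask) - maskValue * mask: visible positions (mask = 0) keep
+// their score, hidden positions (mask = 1) are pushed to -maskValue. A negative
+// attention_length leaves everything visible; a positive one restricts each query
+// row to a trailing window of keys. A small worked example (sizes chosen here for
+// illustration only): with qlen = klen = 3, attention_length = 2 and mlen = 0, row 0
+// gets the mask [0, 1, 1], so query 0 may only attend to key 0.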
+ +#include +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +EE attention_mask_fp32(TensorDesc inputDesc, + const F32 *input, + AttentionMaskParamSpec p, + TensorDesc outputDesc, + F32 *output) +{ + UNUSED(outputDesc); + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + I32 attentionLength = p.attention_length; + bool sameLength = p.same_length; + float maskValue = p.mask; + int qlen = inputDesc.dims[1]; + int klen = inputDesc.dims[0]; + int mlen = klen - qlen; + I32 length = qlen * klen; + std::vector mask; + if (attentionLength < 0) { + mask = std::vector(length, 0); + } else { + mask = std::vector(length, 1); + for (int i = 0; i < qlen; i++) { + int start, loops; + if (attentionLength > 0) { + int end = mlen + i; + start = UNI_MAX(end - attentionLength, 0); + loops = end - start + 1; + } else { + if (sameLength) { + start = i; + loops = qlen + 1; + } else { + start = 0; + loops = i + qlen + 1; + } + } + loops = UNI_MAX(loops, 0); + start = UNI_MIN(start, klen); + if (start + loops > klen) { + loops = UNI_MAX(klen - start, 0); + } + memset(&mask[i * klen + start], 0, sizeof(F32) * loops); + } + } + I32 loops = tensorNumElements(inputDesc) / length; + __m256 one_v = _mm256_set1_ps(1.0f); + __m256 mask_value_v = _mm256_set1_ps(maskValue); + for (int i = 0, index = 0; i < loops; i++) { + int j = 0; + for (; j < length - 7; j += 8) { + __m256 in = _mm256_loadu_ps(input + index); + __m256 mask_v = _mm256_loadu_ps(&mask[j]); + __m256 tmp_v = _mm256_sub_ps(one_v, mask_v); + mask_v = _mm256_mul_ps(mask_value_v, mask_v); + tmp_v = _mm256_fmsub_ps(in, tmp_v, mask_v); + _mm256_storeu_ps(output + index, tmp_v); + index += 8; + } + for (; j < length; j++) { + output[index] = input[index] * (1 - mask[j]) - maskValue * mask[j]; + index++; + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/check.cpp b/compute/tensor/src/cpu/x86/fp32/check.cpp new file mode 100644 index 00000000..9140fe00 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/check.cpp @@ -0,0 +1,103 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
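+
+// Element-wise tensor comparison, reduced along the outermost axis: for each
+// slice j, output[j] becomes 1 only when every element pair of inputA and inputB
+// satisfies the chosen predicate (>, >=, ==), and 0 otherwise. The AVX2 loops
+// count matching elements eight lanes at a time, and a scalar tail finishes
+// lengths that are not a multiple of 8.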
+ +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "x86_avx2_expand.h" + +EE check_fp32(TensorDesc inputDescA, + const F32 *inputA, + TensorDesc inputDescB, + const F32 *inputB, + CheckMode checkMode, + TensorDesc outputDesc, + I32 *output) +{ + if (nullptr == inputA || nullptr == inputB || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) { + CHECK_STATUS(NOT_MATCH); + } + + U32 size = tensorNumElements(inputDescA); + U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; + I32 length = size / loopOuter; + if (tensorNumElements(outputDesc) != loopOuter) { + CHECK_STATUS(NOT_MATCH); + } + for (U32 j = 0; j < loopOuter; j++) { + const F32 *arrayA = inputA + j * length; + const F32 *arrayB = inputB + j * length; + switch (checkMode) { + case CHECK_GREAT: { + __m256i count_v = _mm256_set1_epi32(0); + I32 i = 0; + for (; i < length - 7; i += 8) { + __m256 a = _mm256_loadu_ps(arrayA + i); + __m256 b = _mm256_loadu_ps(arrayA + i); + count_v = _mm256_add_epi32( + count_v, _mm256_cvtps_epi32(_mm256_cmp_ps(a, b, _CMP_GT_OS))); + } + I32 count = _mm256_hadd_u32(count_v); + for (; i < length; i++) { + if (arrayA[i] > arrayB[i]) { + count++; + } + } + output[j] = (count == length); + break; + } + case CHECK_GREATEQUAL: { + __m256i count_v = _mm256_set1_epi32(0); + I32 i = 0; + for (; i < length - 7; i += 8) { + __m256 a = _mm256_loadu_ps(arrayA + i); + __m256 b = _mm256_loadu_ps(arrayA + i); + count_v = _mm256_add_epi32( + count_v, _mm256_cvtps_epi32(_mm256_cmp_ps(a, b, _CMP_GE_OS))); + } + I32 count = _mm256_hadd_u32(count_v); + for (; i < length; i++) { + if (arrayA[i] >= arrayB[i]) { + count++; + } + } + output[j] = (count == length); + break; + } + case CHECK_EQUAL: { + __m256i count_v = _mm256_set1_epi32(0); + I32 i = 0; + for (; i < length - 7; i += 8) { + __m256 a = _mm256_loadu_ps(arrayA + i); + __m256 b = _mm256_loadu_ps(arrayA + i); + count_v = _mm256_add_epi32( + count_v, _mm256_cvtps_epi32(_mm256_cmp_ps(a, b, _CMP_EQ_OS))); + } + I32 count = _mm256_hadd_u32(count_v); + for (; i < length; i++) { + if (arrayA[i] == arrayB[i]) { + count++; + } + } + output[j] = (count == length); + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/clip.cpp b/compute/tensor/src/cpu/x86/fp32/clip.cpp new file mode 100644 index 00000000..cfa53653 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/clip.cpp @@ -0,0 +1,38 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +EE clip_fp32(F32 *input, F32 *output, I32 len, F32 minValue, F32 maxValue) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + __m256 min_v = _mm256_set1_ps(minValue); + __m256 max_v = _mm256_set1_ps(maxValue); + + I32 i = 0; + for (i = 0; i < len - 7; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 tmp_v = _mm256_min_ps(max_v, _mm256_max_ps(min_v, in)); + _mm256_storeu_ps(output + i, tmp_v); + } + for (; i < len; i++) { + F32 value = input[i]; + value = (value > minValue) ? value : minValue; + value = (value < maxValue) ? value : maxValue; + output[i] = value; + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/convolution.cpp b/compute/tensor/src/cpu/x86/fp32/convolution.cpp new file mode 100644 index 00000000..f9fb09aa --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/convolution.cpp @@ -0,0 +1,134 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
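+
+// Scratch-space sizing mirrors the algorithm dispatch in convolution_fp32 below:
+// DIRECT needs the whole input tile padded to NCHWC8 (channels rounded up to 8),
+// POINTWISE only needs an oc-sized row, and GEMM_ICNCHW runs without extra
+// scratch. When a grouped NCHWC8 input has a per-group channel count that is not
+// a multiple of 8, one extra input-sized region is reserved for the NCHW
+// re-layout performed by the caller before the per-group loop.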
+ +#include "sys.h" +#include "error.h" +#include "types.h" + +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +EE convolution_infer_forward_tmp_bytes_fp32(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + + U32 icAlignSize = 8; + U32 icPadding = (ic + icAlignSize - 1) / icAlignSize * icAlignSize; + + EE ret = SUCCESS; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + *bytes = icPadding * ih_pad * iw_pad; + break; + case CONVOLUTION_ALGORITHM_POINTWISE: + *bytes = oc; + break; + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: + *bytes = 0; + break; + default: + ret = NOT_MATCH; + break; + } + + // pre data processing space for not complete NCHWC8 group convolution input + U32 icGroupSize = ic / convParamSpec.group; + if (idf == DF_NCHWC8 && icGroupSize % 8 != 0) { + *bytes += tensorNumBytes(inputDesc); + } + + *bytes *= bytesOf(idt); + *bytes += 32; + return ret; +} + +EE convolution_fp32(TensorDesc inputDesc, + F32 *input, + TensorDesc filterDesc, + const F32 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const F32 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + UNUSED(arch); + if (nullptr == input || nullptr == filter || nullptr == output || nullptr == bias || + nullptr == tmp) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(odf == DF_NCHWC8)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(ic == fc && oc == fn)) { + CHECK_STATUS(NOT_MATCH); + } + + EE ret = SUCCESS; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = convolution_direct(inputDesc, input, filterDesc, filter, convParamSpec, biasDesc, + bias, tmpBytes, tmp, outputDesc, output, activationDesc); + break; + case CONVOLUTION_ALGORITHM_POINTWISE: + ret = convolution_1x1_direct(inputDesc, input, filterDesc, filter, convParamSpec, + bias, tmpBytes, tmp, outputDesc, output, activationDesc); + break; + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: + ret = convolution_direct_nchw(inputDesc, input, filterDesc, filter, convParamSpec, + biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_1x1_direct.cpp 
b/compute/tensor/src/cpu/x86/fp32/convolution_1x1_direct.cpp new file mode 100644 index 00000000..3c3173de --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/convolution_1x1_direct.cpp @@ -0,0 +1,1749 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" + +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +#define UNROLL_HW 4 +#define SIMDW 8 +#define UNROLL_OC 24 +#define UNROLL_IC_BLOCK_DIM 8 +#define BLOCK_IC_DIM 128 +#define BLOCK_OC_DIM 96 +#define BLOCK_HW_DIM 128 +#define align_addr(addr, unit) (((uintptr_t)addr + unit - 1) / unit * unit) + +typedef void (*kernel_func)( + F32 *curI, const F32 *curW, F32 *curO, const F32 *curB, U32 oStep, U32 store, U32 ic, U32 fStep); + +inline void avx2_pointwise_kernel_3x32( + F32 *curI, const F32 *curW, F32 *curO, const F32 *curB, U32 oStep, U32 store, U32 ic, U32 fStep) +{ + __asm__ __volatile__("shr $3, %%ecx \n\t" + "mov %5, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups (%3), %%ymm1 \n\t" + "vmovups (%3), %%ymm2 \n\t" + "vmovups 0x20(%3), %%ymm3 \n\t" + "vmovups 0x20(%3), %%ymm4 \n\t" + "vmovups 0x20(%3), %%ymm5 \n\t" + "vmovups 0x40(%3), %%ymm6 \n\t" + "vmovups 0x40(%3), %%ymm7 \n\t" + "vmovups 0x40(%3), %%ymm8 \n\t" + "vmovups 0x60(%3), %%ymm9 \n\t" + "vmovups 0x60(%3), %%ymm10 \n\t" + "vmovups 0x60(%3), %%ymm11 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%rax \n\t" + "vmovups (%%rax), %%ymm0 \n\t" + "vmovups 0x20(%%rax), %%ymm1 \n\t" + "vmovups 0x40(%%rax), %%ymm2 \n\t" + "add %4, %%rax \n\t" + "vmovups (%%rax), %%ymm3 \n\t" + "vmovups 0x20(%%rax), %%ymm4 \n\t" + "vmovups 0x40(%%rax), %%ymm5 \n\t" + "add %4, %%rax \n\t" + "vmovups (%%rax), %%ymm6 \n\t" + "vmovups 0x20(%%rax), %%ymm7 \n\t" + "vmovups 0x40(%%rax), %%ymm8 \n\t" + "add %4, %%rax \n\t" + "vmovups (%%rax), %%ymm9 \n\t" + "vmovups 0x20(%%rax), %%ymm10 \n\t" + "vmovups 0x40(%%rax), %%ymm11 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vbroadcastss (%1), %%ymm12 \n\t" + "vbroadcastss 0x20(%1), %%ymm13 \n\t" + "vbroadcastss 0x40(%1), %%ymm14 \n\t" + "vmovaps (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovaps 0x20(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovaps 
0x40(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovaps 0x60(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x4(%1), %%ymm12 \n\t" + "vbroadcastss 0x24(%1), %%ymm13 \n\t" + "vbroadcastss 0x44(%1), %%ymm14 \n\t" + "vmovaps 0x80(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovaps 0xA0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovaps 0xC0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovaps 0xE0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x8(%1), %%ymm12 \n\t" + "vbroadcastss 0x28(%1), %%ymm13 \n\t" + "vbroadcastss 0x48(%1), %%ymm14 \n\t" + "vmovaps 0x100(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovaps 0x120(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovaps 0x140(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovaps 0x160(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0xC(%1), %%ymm12 \n\t" + "vbroadcastss 0x2C(%1), %%ymm13 \n\t" + "vbroadcastss 0x4C(%1), %%ymm14 \n\t" + "vmovaps 0x180(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovaps 0x1A0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovaps 0x1C0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovaps 0x1E0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x10(%1), %%ymm12 \n\t" + "vbroadcastss 0x30(%1), %%ymm13 \n\t" + "vbroadcastss 0x50(%1), %%ymm14 \n\t" + "vmovaps 0x200(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovaps 0x220(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovaps 0x240(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovaps 0x260(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, 
%%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x14(%1), %%ymm12 \n\t" + "vbroadcastss 0x34(%1), %%ymm13 \n\t" + "vbroadcastss 0x54(%1), %%ymm14 \n\t" + "vmovaps 0x280(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovaps 0x2A0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovaps 0x2C0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovaps 0x2E0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x18(%1), %%ymm12 \n\t" + "vbroadcastss 0x38(%1), %%ymm13 \n\t" + "vbroadcastss 0x58(%1), %%ymm14 \n\t" + "vmovaps 0x300(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovaps 0x320(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovaps 0x340(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovaps 0x360(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x1C(%1), %%ymm12 \n\t" + "vbroadcastss 0x3C(%1), %%ymm13 \n\t" + "vbroadcastss 0x5C(%1), %%ymm14 \n\t" + "vmovaps 0x380(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovaps 0x3A0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovaps 0x3C0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovaps 0x3E0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add $0x400, %0 \n\t" + "add %7, %1 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %5 \n\t" + "je 2f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm15, %%ymm11, %%ymm11 \n\t" + + // relu6 + "and $0x4, %5 \n\t" + "je 2f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, 
%%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + "vminps %%ymm12, %%ymm11, %%ymm11 \n\t" + + "2: \n\t" + "vmovups %%ymm0, (%2) \n\t" + "vmovups %%ymm1, 0x20(%2) \n\t" + "vmovups %%ymm2, 0x40(%2) \n\t" + "add %4, %2 \n\t" + "vmovups %%ymm3, (%2) \n\t" + "vmovups %%ymm4, 0x20(%2) \n\t" + "vmovups %%ymm5, 0x40(%2) \n\t" + "add %4, %2 \n\t" + "vmovups %%ymm6, (%2) \n\t" + "vmovups %%ymm7, 0x20(%2) \n\t" + "vmovups %%ymm8, 0x40(%2) \n\t" + "add %4, %2 \n\t" + "vmovups %%ymm9, (%2) \n\t" + "vmovups %%ymm10, 0x20(%2) \n\t" + "vmovups %%ymm11, 0x40(%2) \n\t" + : + : "r"(curW), "r"(curI), "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store), + "c"(ic), "r"(I64(fStep)) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +inline void avx2_pointwise_kernel_4x24( + F32 *curI, const F32 *curW, F32 *curO, const F32 *curB, U32 oStep, U32 store, U32 ic, U32 fStep) +{ + __asm__ __volatile__("shr $3, %%ecx \n\t" + "mov %5, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups (%3), %%ymm1 \n\t" + "vmovups (%3), %%ymm2 \n\t" + "vmovups (%3), %%ymm3 \n\t" + "vmovups 0x20(%3), %%ymm4 \n\t" + "vmovups 0x20(%3), %%ymm5 \n\t" + "vmovups 0x20(%3), %%ymm6 \n\t" + "vmovups 0x20(%3), %%ymm7 \n\t" + "vmovups 0x40(%3), %%ymm8 \n\t" + "vmovups 0x40(%3), %%ymm9 \n\t" + "vmovups 0x40(%3), %%ymm10 \n\t" + "vmovups 0x40(%3), %%ymm11 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%2), %%ymm0 \n\t" + "vmovups 0x20(%2), %%ymm1 \n\t" + "vmovups 0x40(%2), %%ymm2 \n\t" + "vmovups 0x60(%2), %%ymm3 \n\t" + "vmovups (%2, %4), %%ymm4 \n\t" + "vmovups 0x20(%2, %4), %%ymm5 \n\t" + "vmovups 0x40(%2, %4), %%ymm6 \n\t" + "vmovups 0x60(%2, %4), %%ymm7 \n\t" + "vmovups (%2, %4, 2), %%ymm8 \n\t" + "vmovups 0x20(%2, %4, 2), %%ymm9 \n\t" + "vmovups 0x40(%2, %4, 2), %%ymm10 \n\t" + "vmovups 0x60(%2, %4, 2), %%ymm11 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%0), %%ymm12 \n\t" + "vmovaps 0x20(%0), %%ymm13 \n\t" + "vmovaps 0x40(%0), %%ymm14 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x20(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss 0x40(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss 0x60(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vmovaps 0x60(%0), %%ymm12 \n\t" + "vmovaps 0x80(%0), %%ymm13 \n\t" + "vmovaps 0xA0(%0), %%ymm14 \n\t" + "vbroadcastss 0x4(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x24(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss 0x44(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, 
%%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss 0x64(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vmovaps 0xC0(%0), %%ymm12 \n\t" + "vmovaps 0xE0(%0), %%ymm13 \n\t" + "vmovaps 0x100(%0), %%ymm14 \n\t" + "vbroadcastss 0x8(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x28(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss 0x48(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss 0x68(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vmovaps 0x120(%0), %%ymm12 \n\t" + "vmovaps 0x140(%0), %%ymm13 \n\t" + "vmovaps 0x160(%0), %%ymm14 \n\t" + "vbroadcastss 0xC(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x2C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss 0x4C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss 0x6C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vmovaps 0x180(%0), %%ymm12 \n\t" + "vmovaps 0x1A0(%0), %%ymm13 \n\t" + "vmovaps 0x1C0(%0), %%ymm14 \n\t" + "vbroadcastss 0x10(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x30(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss 0x50(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss 0x70(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vmovaps 0x1E0(%0), %%ymm12 \n\t" + "vmovaps 0x200(%0), %%ymm13 \n\t" + "vmovaps 0x220(%0), %%ymm14 \n\t" + "vbroadcastss 0x14(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x34(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss 0x54(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss 0x74(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, 
%%ymm14, %%ymm11 \n\t" + + "vmovaps 0x240(%0), %%ymm12 \n\t" + "vmovaps 0x260(%0), %%ymm13 \n\t" + "vmovaps 0x280(%0), %%ymm14 \n\t" + "vbroadcastss 0x18(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x38(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss 0x58(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss 0x78(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vmovaps 0x2A0(%0), %%ymm12 \n\t" + "vmovaps 0x2C0(%0), %%ymm13 \n\t" + "vmovaps 0x2E0(%0), %%ymm14 \n\t" + "vbroadcastss 0x1C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x3C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss 0x5C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss 0x7C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add $0x300, %0 \n\t" + "add %7, %1 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %5 \n\t" + "je 2f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm15, %%ymm11, %%ymm11 \n\t" + + // relu6 + "and $0x4, %5 \n\t" + "je 2f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + "vminps %%ymm12, %%ymm11, %%ymm11 \n\t" + + "2: \n\t" + "vmovups %%ymm0, (%2) \n\t" + "vmovups %%ymm1, 0x20(%2) \n\t" + "vmovups %%ymm2, 0x40(%2) \n\t" + "vmovups %%ymm3, 0x60(%2) \n\t" + "vmovups %%ymm4, (%2, %4) \n\t" + "vmovups %%ymm5, 0x20(%2, %4) \n\t" + "vmovups %%ymm6, 0x40(%2, %4) \n\t" + "vmovups %%ymm7, 0x60(%2, %4) \n\t" + "vmovups %%ymm8, (%2, %4, 2) \n\t" + "vmovups %%ymm9, 0x20(%2, %4, 2) \n\t" + "vmovups %%ymm10, 0x40(%2, %4, 2) \n\t" + "vmovups %%ymm11, 0x60(%2, %4, 2) \n\t" + : + : "r"(curW), "r"(curI), "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store), + "c"(ic), "r"(I64(fStep)) + : "%eax", "%ymm0", 
"%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", + "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +inline void avx2_pointwise_kernel_6x16( + F32 *curI, const F32 *curW, F32 *curO, const F32 *curB, U32 oStep, U32 store, U32 ic, U32 fStep) +{ + __asm__ __volatile__("shr $3, %%ecx \n\t" + "mov %5, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups (%3), %%ymm1 \n\t" + "vmovups (%3), %%ymm2 \n\t" + "vmovups (%3), %%ymm3 \n\t" + "vmovups (%3), %%ymm4 \n\t" + "vmovups (%3), %%ymm5 \n\t" + "vmovups 0x20(%3), %%ymm6 \n\t" + "vmovups 0x20(%3), %%ymm7 \n\t" + "vmovups 0x20(%3), %%ymm8 \n\t" + "vmovups 0x20(%3), %%ymm9 \n\t" + "vmovups 0x20(%3), %%ymm10 \n\t" + "vmovups 0x20(%3), %%ymm11 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%2), %%ymm0 \n\t" + "vmovups 0x20(%2), %%ymm1 \n\t" + "vmovups 0x40(%2), %%ymm2 \n\t" + "vmovups 0x60(%2), %%ymm3 \n\t" + "vmovups 0x80(%2), %%ymm4 \n\t" + "vmovups 0xA0(%2), %%ymm5 \n\t" + "vmovups (%2, %4), %%ymm6 \n\t" + "vmovups 0x20(%2, %4), %%ymm7 \n\t" + "vmovups 0x40(%2, %4), %%ymm8 \n\t" + "vmovups 0x60(%2, %4), %%ymm9 \n\t" + "vmovups 0x80(%2, %4), %%ymm10 \n\t" + "vmovups 0xA0(%2, %4), %%ymm11 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%0), %%ymm12 \n\t" + "vmovaps 0x20(%0), %%ymm13 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vbroadcastss 0x20(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm7 \n\t" + "vbroadcastss 0x40(%1), %%ymm15 \n\t" + "vbroadcastss 0x60(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm8 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm9 \n\t" + "vbroadcastss 0x80(%1), %%ymm15 \n\t" + "vbroadcastss 0xA0(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm5 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm11 \n\t" + + "vmovaps 0x40(%0), %%ymm12 \n\t" + "vmovaps 0x60(%0), %%ymm13 \n\t" + "vbroadcastss 0x4(%1), %%ymm15 \n\t" + "vbroadcastss 0x24(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm7 \n\t" + "vbroadcastss 0x44(%1), %%ymm15 \n\t" + "vbroadcastss 0x64(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm8 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm9 \n\t" + "vbroadcastss 0x84(%1), %%ymm15 \n\t" + "vbroadcastss 0xA4(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm5 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm11 \n\t" + + "vmovaps 0x80(%0), %%ymm12 \n\t" + "vmovaps 0xA0(%0), %%ymm13 \n\t" + "vbroadcastss 0x8(%1), %%ymm15 \n\t" + "vbroadcastss 0x28(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm7 \n\t" + "vbroadcastss 0x48(%1), %%ymm15 \n\t" + "vbroadcastss 0x68(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm8 \n\t" + 
"vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm9 \n\t" + "vbroadcastss 0x88(%1), %%ymm15 \n\t" + "vbroadcastss 0xA8(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm5 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm11 \n\t" + + "vmovaps 0xC0(%0), %%ymm12 \n\t" + "vmovaps 0xE0(%0), %%ymm13 \n\t" + "vbroadcastss 0xC(%1), %%ymm15 \n\t" + "vbroadcastss 0x2C(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm7 \n\t" + "vbroadcastss 0x4C(%1), %%ymm15 \n\t" + "vbroadcastss 0x6C(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm8 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm9 \n\t" + "vbroadcastss 0x8C(%1), %%ymm15 \n\t" + "vbroadcastss 0xAC(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm5 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm11 \n\t" + + "vmovaps 0x100(%0), %%ymm12 \n\t" + "vmovaps 0x120(%0), %%ymm13 \n\t" + "vbroadcastss 0x10(%1), %%ymm15 \n\t" + "vbroadcastss 0x30(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm7 \n\t" + "vbroadcastss 0x50(%1), %%ymm15 \n\t" + "vbroadcastss 0x70(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm8 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm9 \n\t" + "vbroadcastss 0x90(%1), %%ymm15 \n\t" + "vbroadcastss 0xB0(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm5 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm11 \n\t" + + "vmovaps 0x140(%0), %%ymm12 \n\t" + "vmovaps 0x160(%0), %%ymm13 \n\t" + "vbroadcastss 0x14(%1), %%ymm15 \n\t" + "vbroadcastss 0x34(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm7 \n\t" + "vbroadcastss 0x54(%1), %%ymm15 \n\t" + "vbroadcastss 0x74(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm8 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm9 \n\t" + "vbroadcastss 0x94(%1), %%ymm15 \n\t" + "vbroadcastss 0xB4(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm5 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm11 \n\t" + + "vmovaps 0x180(%0), %%ymm12 \n\t" + "vmovaps 0x1A0(%0), %%ymm13 \n\t" + "vbroadcastss 0x18(%1), %%ymm15 \n\t" + "vbroadcastss 0x38(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm7 \n\t" + "vbroadcastss 0x58(%1), %%ymm15 \n\t" + "vbroadcastss 0x78(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm8 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm9 
\n\t" + "vbroadcastss 0x98(%1), %%ymm15 \n\t" + "vbroadcastss 0xB8(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm5 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm11 \n\t" + + "vmovaps 0x1C0(%0), %%ymm12 \n\t" + "vmovaps 0x1E0(%0), %%ymm13 \n\t" + "vbroadcastss 0x1C(%1), %%ymm15 \n\t" + "vbroadcastss 0x3C(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm7 \n\t" + "vbroadcastss 0x5C(%1), %%ymm15 \n\t" + "vbroadcastss 0x7C(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm8 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm9 \n\t" + "vbroadcastss 0x9C(%1), %%ymm15 \n\t" + "vbroadcastss 0xBC(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm5 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm11 \n\t" + + "add $0x200, %0 \n\t" + "add %7, %1 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %5 \n\t" + "je 2f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm15, %%ymm11, %%ymm11 \n\t" + + // relu6 + "and $0x4, %5 \n\t" + "je 2f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + "vminps %%ymm12, %%ymm11, %%ymm11 \n\t" + + "2: \n\t" + "vmovups %%ymm0, (%2) \n\t" + "vmovups %%ymm1, 0x20(%2) \n\t" + "vmovups %%ymm2, 0x40(%2) \n\t" + "vmovups %%ymm3, 0x60(%2) \n\t" + "vmovups %%ymm4, 0x80(%2) \n\t" + "vmovups %%ymm5, 0xA0(%2) \n\t" + "vmovups %%ymm6, (%2, %4) \n\t" + "vmovups %%ymm7, 0x20(%2, %4) \n\t" + "vmovups %%ymm8, 0x40(%2, %4) \n\t" + "vmovups %%ymm9, 0x60(%2, %4) \n\t" + "vmovups %%ymm10, 0x80(%2, %4) \n\t" + "vmovups %%ymm11, 0xA0(%2, %4) \n\t" + : + : "r"(curW), "r"(curI), "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store), + "c"(ic), "r"(I64(fStep)) + : "%eax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", + "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +inline void avx2_pointwise_kernel_12x8( + F32 *curI, const F32 *curW, F32 *curO, const F32 *curB, U32 oStep, U32 store, U32 ic, U32 fStep) +{ + __asm__ __volatile__( + "shr $3, %%ecx \n\t" + "mov %4, %%ebx \n\t" + "and $0x1, %%ebx \n\t" + "jne 0f \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups (%3), %%ymm1 \n\t" + "vmovups (%3), %%ymm2 \n\t" + "vmovups (%3), %%ymm3 \n\t" + "vmovups 
(%3), %%ymm4 \n\t" + "vmovups (%3), %%ymm5 \n\t" + "vmovups (%3), %%ymm6 \n\t" + "vmovups (%3), %%ymm7 \n\t" + "vmovups (%3), %%ymm8 \n\t" + "vmovups (%3), %%ymm9 \n\t" + "vmovups (%3), %%ymm10 \n\t" + "vmovups (%3), %%ymm11 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%2), %%ymm0 \n\t" + "vmovups 0x20(%2), %%ymm1 \n\t" + "vmovups 0x40(%2), %%ymm2 \n\t" + "vmovups 0x60(%2), %%ymm3 \n\t" + "vmovups 0x80(%2), %%ymm4 \n\t" + "vmovups 0xA0(%2), %%ymm5 \n\t" + "vmovups 0xC0(%2), %%ymm6 \n\t" + "vmovups 0xE0(%2), %%ymm7 \n\t" + "vmovups 0x100(%2), %%ymm8 \n\t" + "vmovups 0x120(%2), %%ymm9 \n\t" + "vmovups 0x140(%2), %%ymm10 \n\t" + "vmovups 0x160(%2), %%ymm11 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%0), %%ymm12 \n\t" + "vbroadcastss (%1), %%ymm13 \n\t" + "vbroadcastss 0x20(%1), %%ymm14 \n\t" + "vbroadcastss 0x40(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vbroadcastss 0x60(%1), %%ymm13 \n\t" + "vbroadcastss 0x80(%1), %%ymm14 \n\t" + "vbroadcastss 0xA0(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm5 \n\t" + "vbroadcastss 0xC0(%1), %%ymm13 \n\t" + "vbroadcastss 0xE0(%1), %%ymm14 \n\t" + "vbroadcastss 0x100(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm8 \n\t" + "vbroadcastss 0x120(%1), %%ymm13 \n\t" + "vbroadcastss 0x140(%1), %%ymm14 \n\t" + "vbroadcastss 0x160(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm11 \n\t" + + "vmovaps 0x20(%0), %%ymm12 \n\t" + "vbroadcastss 0x4(%1), %%ymm13 \n\t" + "vbroadcastss 0x24(%1), %%ymm14 \n\t" + "vbroadcastss 0x44(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vbroadcastss 0x64(%1), %%ymm13 \n\t" + "vbroadcastss 0x84(%1), %%ymm14 \n\t" + "vbroadcastss 0xA4(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm5 \n\t" + "vbroadcastss 0xC4(%1), %%ymm13 \n\t" + "vbroadcastss 0xE4(%1), %%ymm14 \n\t" + "vbroadcastss 0x104(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm8 \n\t" + "vbroadcastss 0x124(%1), %%ymm13 \n\t" + "vbroadcastss 0x144(%1), %%ymm14 \n\t" + "vbroadcastss 0x164(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm11 \n\t" + + "vmovaps 0x40(%0), %%ymm12 \n\t" + "vbroadcastss 0x8(%1), %%ymm13 \n\t" + "vbroadcastss 0x28(%1), %%ymm14 \n\t" + "vbroadcastss 0x48(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vbroadcastss 0x68(%1), %%ymm13 \n\t" + "vbroadcastss 0x88(%1), %%ymm14 \n\t" + "vbroadcastss 0xA8(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm5 \n\t" + "vbroadcastss 0xC8(%1), %%ymm13 \n\t" + "vbroadcastss 0xE8(%1), %%ymm14 \n\t" + "vbroadcastss 0x108(%1), %%ymm15 \n\t" + "vfmadd231ps 
%%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm8 \n\t" + "vbroadcastss 0x128(%1), %%ymm13 \n\t" + "vbroadcastss 0x148(%1), %%ymm14 \n\t" + "vbroadcastss 0x168(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm11 \n\t" + + "vmovaps 0x60(%0), %%ymm12 \n\t" + "vbroadcastss 0xC(%1), %%ymm13 \n\t" + "vbroadcastss 0x2C(%1), %%ymm14 \n\t" + "vbroadcastss 0x4C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vbroadcastss 0x6C(%1), %%ymm13 \n\t" + "vbroadcastss 0x8C(%1), %%ymm14 \n\t" + "vbroadcastss 0xAC(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm5 \n\t" + "vbroadcastss 0xCC(%1), %%ymm13 \n\t" + "vbroadcastss 0xEC(%1), %%ymm14 \n\t" + "vbroadcastss 0x10C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm8 \n\t" + "vbroadcastss 0x12C(%1), %%ymm13 \n\t" + "vbroadcastss 0x14C(%1), %%ymm14 \n\t" + "vbroadcastss 0x16C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm11 \n\t" + + "vmovaps 0x80(%0), %%ymm12 \n\t" + "vbroadcastss 0x10(%1), %%ymm13 \n\t" + "vbroadcastss 0x30(%1), %%ymm14 \n\t" + "vbroadcastss 0x50(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vbroadcastss 0x70(%1), %%ymm13 \n\t" + "vbroadcastss 0x90(%1), %%ymm14 \n\t" + "vbroadcastss 0xB0(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm5 \n\t" + "vbroadcastss 0xD0(%1), %%ymm13 \n\t" + "vbroadcastss 0xF0(%1), %%ymm14 \n\t" + "vbroadcastss 0x110(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm8 \n\t" + "vbroadcastss 0x130(%1), %%ymm13 \n\t" + "vbroadcastss 0x150(%1), %%ymm14 \n\t" + "vbroadcastss 0x170(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm11 \n\t" + + "vmovaps 0xA0(%0), %%ymm12 \n\t" + "vbroadcastss 0x14(%1), %%ymm13 \n\t" + "vbroadcastss 0x34(%1), %%ymm14 \n\t" + "vbroadcastss 0x54(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vbroadcastss 0x74(%1), %%ymm13 \n\t" + "vbroadcastss 0x94(%1), %%ymm14 \n\t" + "vbroadcastss 0xB4(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm5 \n\t" + "vbroadcastss 0xD4(%1), %%ymm13 \n\t" + "vbroadcastss 0xF4(%1), %%ymm14 \n\t" + "vbroadcastss 0x114(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm8 \n\t" + "vbroadcastss 0x134(%1), %%ymm13 \n\t" + "vbroadcastss 0x154(%1), %%ymm14 \n\t" + "vbroadcastss 0x174(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, 
%%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm11 \n\t" + + "vmovaps 0xC0(%0), %%ymm12 \n\t" + "vbroadcastss 0x18(%1), %%ymm13 \n\t" + "vbroadcastss 0x38(%1), %%ymm14 \n\t" + "vbroadcastss 0x58(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vbroadcastss 0x78(%1), %%ymm13 \n\t" + "vbroadcastss 0x98(%1), %%ymm14 \n\t" + "vbroadcastss 0xB8(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm5 \n\t" + "vbroadcastss 0xD8(%1), %%ymm13 \n\t" + "vbroadcastss 0xF8(%1), %%ymm14 \n\t" + "vbroadcastss 0x118(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm8 \n\t" + "vbroadcastss 0x138(%1), %%ymm13 \n\t" + "vbroadcastss 0x158(%1), %%ymm14 \n\t" + "vbroadcastss 0x178(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm11 \n\t" + + "vmovaps 0xE0(%0), %%ymm12 \n\t" + "vbroadcastss 0x1C(%1), %%ymm13 \n\t" + "vbroadcastss 0x3C(%1), %%ymm14 \n\t" + "vbroadcastss 0x5C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vbroadcastss 0x7C(%1), %%ymm13 \n\t" + "vbroadcastss 0x9C(%1), %%ymm14 \n\t" + "vbroadcastss 0xBC(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm5 \n\t" + "vbroadcastss 0xDC(%1), %%ymm13 \n\t" + "vbroadcastss 0xFC(%1), %%ymm14 \n\t" + "vbroadcastss 0x11C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm8 \n\t" + "vbroadcastss 0x13C(%1), %%ymm13 \n\t" + "vbroadcastss 0x15C(%1), %%ymm14 \n\t" + "vbroadcastss 0x17C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm11 \n\t" + + "add $0x100, %0 \n\t" + "add %6, %1 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %4 \n\t" + "je 2f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm15, %%ymm11, %%ymm11 \n\t" + + // relu6 + "and $0x4, %4 \n\t" + "je 2f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + "vminps %%ymm12, %%ymm11, %%ymm11 \n\t" + + "2: \n\t" + "vmovups %%ymm0, (%2) \n\t" + 
"vmovups %%ymm1, 0x20(%2) \n\t" + "vmovups %%ymm2, 0x40(%2) \n\t" + "vmovups %%ymm3, 0x60(%2) \n\t" + "vmovups %%ymm4, 0x80(%2) \n\t" + "vmovups %%ymm5, 0xA0(%2) \n\t" + "vmovups %%ymm6, 0xC0(%2) \n\t" + "vmovups %%ymm7, 0xE0(%2) \n\t" + "vmovups %%ymm8, 0x100(%2) \n\t" + "vmovups %%ymm9, 0x120(%2) \n\t" + "vmovups %%ymm10, 0x140(%2) \n\t" + "vmovups %%ymm11, 0x160(%2) \n\t" + : + : "r"(curW), "r"(curI), "r"(curO), "r"(curB), "r"(store), "c"(ic), "r"(I64(fStep)) + : "%eax", "%rax", "%ebx", "%r9", "%r10", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", + "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +inline void avx2_pointwise_kernel_1x32( + F32 *curI, const F32 *curW, F32 *curO, const F32 *curB, U32 oStep, U32 store, U32 ic, U32 fStep) +{ + __asm__ __volatile__("shr $3, %%ecx \n\t" + "mov %5, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm3 \n\t" + "vmovups 0x40(%3), %%ymm6 \n\t" + "vmovups 0x60(%3), %%ymm9 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%rax \n\t" + "vmovups (%%rax), %%ymm0 \n\t" + "add %4, %%rax \n\t" + "vmovups (%%rax), %%ymm3 \n\t" + "add %4, %%rax \n\t" + "vmovups (%%rax), %%ymm6 \n\t" + "add %4, %%rax \n\t" + "vmovups (%%rax), %%ymm9 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vbroadcastss (%1), %%ymm12 \n\t" + "vmovaps (%0), %%ymm13 \n\t" + "vmovaps 0x20(%0), %%ymm14 \n\t" + "vmovaps 0x40(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovaps 0x60(%0), %%ymm14 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x4(%1), %%ymm12 \n\t" + "vmovaps 0x80(%0), %%ymm13 \n\t" + "vmovaps 0xA0(%0), %%ymm14 \n\t" + "vmovaps 0xC0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovaps 0xE0(%0), %%ymm14 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x8(%1), %%ymm12 \n\t" + "vmovaps 0x100(%0), %%ymm13 \n\t" + "vmovaps 0x120(%0), %%ymm14 \n\t" + "vmovaps 0x140(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovaps 0x160(%0), %%ymm14 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0xC(%1), %%ymm12 \n\t" + "vmovaps 0x180(%0), %%ymm13 \n\t" + "vmovaps 0x1A0(%0), %%ymm14 \n\t" + "vmovaps 0x1C0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovaps 0x1E0(%0), %%ymm14 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x10(%1), %%ymm12 \n\t" + "vmovaps 0x200(%0), %%ymm13 \n\t" + "vmovaps 0x220(%0), %%ymm14 \n\t" + "vmovaps 0x240(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovaps 0x260(%0), %%ymm14 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x14(%1), %%ymm12 \n\t" + "vmovaps 0x280(%0), %%ymm13 \n\t" + "vmovaps 0x2A0(%0), %%ymm14 \n\t" + "vmovaps 0x2C0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovaps 0x2E0(%0), %%ymm14 \n\t" + "vfmadd231ps %%ymm14, 
%%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x18(%1), %%ymm12 \n\t" + "vmovaps 0x300(%0), %%ymm13 \n\t" + "vmovaps 0x320(%0), %%ymm14 \n\t" + "vmovaps 0x340(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovaps 0x360(%0), %%ymm14 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x1C(%1), %%ymm12 \n\t" + "vmovaps 0x380(%0), %%ymm13 \n\t" + "vmovaps 0x3A0(%0), %%ymm14 \n\t" + "vmovaps 0x3C0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovaps 0x3E0(%0), %%ymm13 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm9 \n\t" + + "add $0x400, %0 \n\t" + "add %7, %1 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %5 \n\t" + "je 2f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + + // relu6 + "and $0x4, %5 \n\t" + "je 2f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + + "2: \n\t" + "vmovups %%ymm0, (%2) \n\t" + "add %4, %2 \n\t" + "vmovups %%ymm3, (%2) \n\t" + "add %4, %2 \n\t" + "vmovups %%ymm6, (%2) \n\t" + "add %4, %2 \n\t" + "vmovups %%ymm9, (%2) \n\t" + : + : "r"(curW), "r"(curI), "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store), + "c"(ic), "r"(I64(fStep)) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +inline void avx2_pointwise_kernel_1x24( + F32 *curI, const F32 *curW, F32 *curO, const F32 *curB, U32 oStep, U32 store, U32 ic, U32 fStep) +{ + __asm__ __volatile__("shr $3, %%ecx \n\t" + "mov %5, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm4 \n\t" + "vmovups 0x40(%3), %%ymm8 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%2), %%ymm0 \n\t" + "vmovups (%2, %4), %%ymm4 \n\t" + "vmovups (%2, %4, 2), %%ymm8 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%0), %%ymm12 \n\t" + "vmovaps 0x20(%0), %%ymm13 \n\t" + "vmovaps 0x40(%0), %%ymm14 \n\t" + "vbroadcastss 0x0(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + + "vmovaps 0x60(%0), %%ymm12 \n\t" + "vmovaps 0x80(%0), %%ymm13 \n\t" + "vmovaps 0xA0(%0), %%ymm14 \n\t" + "vbroadcastss 0x4(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + + "vmovaps 0xC0(%0), %%ymm12 \n\t" + "vmovaps 0xE0(%0), %%ymm13 \n\t" + "vmovaps 0x100(%0), %%ymm14 \n\t" + "vbroadcastss 0x8(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + + "vmovaps 0x120(%0), %%ymm12 \n\t" + "vmovaps 0x140(%0), %%ymm13 \n\t" + "vmovaps 0x160(%0), %%ymm14 \n\t" + "vbroadcastss 0xC(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, 
%%ymm14, %%ymm8 \n\t" + + "vmovaps 0x180(%0), %%ymm12 \n\t" + "vmovaps 0x1A0(%0), %%ymm13 \n\t" + "vmovaps 0x1C0(%0), %%ymm14 \n\t" + "vbroadcastss 0x10(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + + "vmovaps 0x1E0(%0), %%ymm12 \n\t" + "vmovaps 0x200(%0), %%ymm13 \n\t" + "vmovaps 0x220(%0), %%ymm14 \n\t" + "vbroadcastss 0x14(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + + "vmovaps 0x240(%0), %%ymm12 \n\t" + "vmovaps 0x260(%0), %%ymm13 \n\t" + "vmovaps 0x280(%0), %%ymm14 \n\t" + "vbroadcastss 0x18(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + + "vmovaps 0x2A0(%0), %%ymm12 \n\t" + "vmovaps 0x2C0(%0), %%ymm13 \n\t" + "vmovaps 0x2E0(%0), %%ymm14 \n\t" + "vbroadcastss 0x1C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + + "add $0x300, %0 \n\t" + "add %7, %1 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %5 \n\t" + "je 2f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + + // relu6 + "and $0x4, %5 \n\t" + "je 2f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + + "2: \n\t" + "vmovups %%ymm0, (%2) \n\t" + "vmovups %%ymm4, (%2, %4) \n\t" + "vmovups %%ymm8, (%2, %4, 2) \n\t" + : + : "r"(curW), "r"(curI), "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store), + "c"(ic), "r"(I64(fStep)) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +inline void avx2_pointwise_kernel_1x16( + F32 *curI, const F32 *curW, F32 *curO, const F32 *curB, U32 oStep, U32 store, U32 ic, U32 fStep) +{ + __asm__ __volatile__("shr $3, %%ecx \n\t" + "mov %5, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm4 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%2), %%ymm0 \n\t" + "vmovups (%2, %4), %%ymm4 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%0), %%ymm12 \n\t" + "vmovaps 0x20(%0), %%ymm13 \n\t" + "vbroadcastss 0x0(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vmovaps 0x40(%0), %%ymm12 \n\t" + "vmovaps 0x60(%0), %%ymm13 \n\t" + "vbroadcastss 0x4(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vmovaps 0x80(%0), %%ymm12 \n\t" + "vmovaps 0xA0(%0), %%ymm13 \n\t" + "vbroadcastss 0x8(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vmovaps 0xC0(%0), %%ymm12 \n\t" + "vmovaps 0xE0(%0), %%ymm13 \n\t" + "vbroadcastss 0xC(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vmovaps 0x100(%0), %%ymm12 \n\t" + "vmovaps 0x120(%0), %%ymm13 \n\t" + "vbroadcastss 0x10(%1), %%ymm15 \n\t" + 
"vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vmovaps 0x140(%0), %%ymm12 \n\t" + "vmovaps 0x160(%0), %%ymm13 \n\t" + "vbroadcastss 0x14(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vmovaps 0x180(%0), %%ymm12 \n\t" + "vmovaps 0x1A0(%0), %%ymm13 \n\t" + "vbroadcastss 0x18(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vmovaps 0x1C0(%0), %%ymm12 \n\t" + "vmovaps 0x1E0(%0), %%ymm13 \n\t" + "vbroadcastss 0x1C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "add $0x200, %0 \n\t" + "add %7, %1 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %5 \n\t" + "je 2f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + + // relu6 + "and $0x4, %5 \n\t" + "je 2f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + + "2: \n\t" + "vmovups %%ymm0, (%2) \n\t" + "vmovups %%ymm4, (%2, %4) \n\t" + : + : "r"(curW), "r"(curI), "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store), + "c"(ic), "r"(I64(fStep)) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +inline void avx2_pointwise_kernel_1x8( + F32 *curI, const F32 *curW, F32 *curO, const F32 *curB, U32 oStep, U32 store, U32 ic, U32 fStep) +{ + __asm__ __volatile__( + "shr $3, %%ecx \n\t" + "mov %4, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%3), %%ymm0 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%2), %%ymm0 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%0), %%ymm12 \n\t" + "vbroadcastss 0x0(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovaps 0x20(%0), %%ymm12 \n\t" + "vbroadcastss 0x4(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovaps 0x40(%0), %%ymm12 \n\t" + "vbroadcastss 0x8(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovaps 0x60(%0), %%ymm12 \n\t" + "vbroadcastss 0xC(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovaps 0x80(%0), %%ymm12 \n\t" + "vbroadcastss 0x10(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovaps 0xA0(%0), %%ymm12 \n\t" + "vbroadcastss 0x14(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovaps 0xC0(%0), %%ymm12 \n\t" + "vbroadcastss 0x18(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovaps 0xE0(%0), %%ymm12 \n\t" + "vbroadcastss 0x1C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "add $0x100, %0 \n\t" + "add %6, %1 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %4 \n\t" + "je 2f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + + // relu6 + "and $0x4, %4 \n\t" + "je 2f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + + "2: \n\t" + "vmovups %%ymm0, (%2) \n\t" + : + : "r"(curW), "r"(curI), "r"(curO), "r"(curB), "r"(store), "c"(ic), "r"(I64(fStep)) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", 
"%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", + "cc"); +} + +EE convolution_1x1_direct(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(tmpBytes); + DataType idt, odt, fdt; + DataFormat idf, odf, fdf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (((fdf != DF_NCHWCxN24) && (fdf != DF_NCHWCxN32)) || (idf != DF_NCHWC8)) { + CHECK_STATUS(NOT_MATCH); + } + + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + F32 *ftmp = inArray; + F32 *btmp = (F32 *)align_addr(tmp, 32); + filterArray = (F32 *)align_addr(filterArray, 32); + + U32 oStep = oh * ow * SIMDW * 4; + U32 fStep = ih * iw * SIMDW * 4; + U32 store = 0, icSize = 0, ocBlocking = 0; + U32 ohow = oh * ow; + U32 icPadding = (ic + 8 - 1) / 8 * 8; + kernel_func kernel[2][4] = {{avx2_pointwise_kernel_1x8, avx2_pointwise_kernel_1x16, + avx2_pointwise_kernel_1x24, avx2_pointwise_kernel_1x32}, + {avx2_pointwise_kernel_12x8, avx2_pointwise_kernel_6x16, avx2_pointwise_kernel_4x24, + avx2_pointwise_kernel_3x32}}; + + U32 unroll_oc_array[4] = {8, 16, 24, 32}; + U32 unroll_hw_array[4] = {12, 6, 4, 3}; + U32 unroll_oc = 24, unroll_hw = 4; + + if ((oc % 24 != 0) && (oc % 32 == 0)) { + unroll_oc = 32; + unroll_hw = 3; + } + +#ifdef _USE_OPENMP + U32 alpha = (ohow + OMP_NUM_THREADS * BLOCK_HW_DIM - 1) / (OMP_NUM_THREADS * BLOCK_HW_DIM); + U32 block_hw_dim = (ohow + OMP_NUM_THREADS * alpha - 1 ) / (OMP_NUM_THREADS * alpha); +#else + U32 block_hw_dim = BLOCK_HW_DIM; +#endif + + U32 hwBlockNums = (ohow + block_hw_dim - 1 ) / block_hw_dim; + + if ((paddingT != 0) || (paddingB != 0) || (paddingL != 0) || (paddingR != 0)) { + __m256 zero = _mm256_set1_ps(0.); + switch (activationDesc.mode) { + case ACTIVATION_NULL: { + for (U32 ocb = 0; ocb < oc; ocb += 8) { + _mm256_store_ps(btmp + ocb, _mm256_loadu_ps(biasArray + ocb)); + } + break; + } + case ACTIVATION_RELU: { + for (U32 ocb = 0; ocb < oc; ocb += 8) { + _mm256_store_ps(btmp + ocb, _mm256_max_ps(zero, _mm256_loadu_ps(biasArray + ocb))); + } + break; + } + case ACTIVATION_RELU6: { + __m256 six = _mm256_set1_ps(6.); + for (U32 ocb = 0; ocb < oc; ocb += 8) { + _mm256_store_ps(btmp + ocb, _mm256_min_ps(six, _mm256_max_ps(zero, _mm256_loadu_ps(biasArray + ocb)))); + } + break; + } + default: + return NOT_SUPPORTED; + } + } + + for (U32 n = 0; n < in; ++n) { + for (U32 ocbb = 0; ocbb < oc; ocbb += ocBlocking) { + store = 0; + ocBlocking = UNI_MIN(oc - ocbb, BLOCK_OC_DIM); + for (U32 icb = 0; icb < icPadding; icb += icSize) { + icSize = UNI_MIN(icPadding - icb, BLOCK_IC_DIM); + store |= (icb > 0); + if (icb == icPadding - icSize) { + store |= U32(activationDesc.mode) << 1; + } + F32 *curI = ftmp + icb * ih * iw; + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 hwIdx = 0; hwIdx < hwBlockNums; ++hwIdx) { + U32 hw = hwIdx * block_hw_dim; + U32 
hwSize = UNI_MIN(block_hw_dim, ohow - hw); + U32 ocSize = 0, ihwSize = 0; + for (U32 ocb = ocbb; ocb < ocbb + ocBlocking; ocb += ocSize) { + ocSize = UNI_MIN(ocbb + ocBlocking - ocb, unroll_oc); + ocSize = unroll_oc_array[(ocSize>>3)-1]; + U32 unroll_hw = unroll_hw_array[(ocSize>>3)-1]; + const F32 *curB = biasArray + ocb; + const F32 *curW = filterArray + ocb * icPadding + icb * ocSize; + F32 *curO = outArray + ocb * oh * ow; + for (U32 ihw = hw; ihw < hw + hwSize; ihw += ihwSize) { + if ((hw + hwSize - ihw) >= unroll_hw) { + ihwSize = unroll_hw; + } else { + ihwSize = 1; + } + F32 *calI = curI + ihw * SIMDW; + F32 *calO = curO + ihw * SIMDW; + kernel[ihwSize>1][(ocSize>>3)-1](calI, curW, calO, curB, oStep, store, icSize, fStep); + } + } + } + } else { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (I32 h = 0; h < oh; ++h) { + U32 ocSize = 0, ihwSize = 0; + for (U32 ocb = ocbb; ocb < ocbb + ocBlocking; ocb += ocSize) { + ocSize = UNI_MIN(ocbb + ocBlocking - ocb, unroll_oc); + ocSize = unroll_oc_array[(ocSize>>3)-1]; + U32 unroll_hw = unroll_hw_array[(ocSize>>3)-1]; + const F32 *curB = biasArray + ocb; + const F32 *curW = filterArray + ocb * icPadding + icb * ocSize; + F32 *curO = outArray + ocb * oh * ow; + for (U32 w = 0; w < ow; w += ihwSize) { + F32 *calI = curI + ((h - paddingT) * iw + w - paddingL) * SIMDW; + F32 *calO = curO + (h * ow + w) * SIMDW; + ihwSize = 1; + if ((h < paddingT) || (h >= ih + paddingT) || (w < paddingL) || (w >= paddingL + iw)) { + for (U32 oci = 0; oci < ocSize; oci += SIMDW) { + _mm256_storeu_ps(calO + ohow * oci, _mm256_load_ps(btmp + oci + ocb)); + } + continue; + } + if ((iw - (w - paddingL)) >= unroll_hw) { + ihwSize = unroll_hw; + } + kernel[ihwSize>1][(ocSize>>3)-1](calI, curW, calO, curB, oStep, store, icSize, fStep); + } + } + } + } + } + } + inArray += ic * ih * iw; + outArray += oc * oh * ow; + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_2x2_direct.cpp b/compute/tensor/src/cpu/x86/fp32/convolution_2x2_direct.cpp new file mode 100644 index 00000000..55d24ad7 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/convolution_2x2_direct.cpp @@ -0,0 +1,1769 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
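+// Note on the kernels below (same conventions as the 1x1 kernels above): bit 0 of the
+// "store" flag selects accumulation into the existing output instead of bias initialization,
+// and bits 1-2 carry the activation mode shifted left by one (assuming ACTIVATION_RELU == 1
+// and ACTIVATION_RELU6 == 2 in the ActivationMode enum), so "and $0x6" tests for any
+// activation while "and $0x4" tests for relu6; the relu6 clamp constant 0x40C00000 is the
+// IEEE-754 bit pattern of 6.0f.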
+ +#include "sys.h" +#include "error.h" +#include "types.h" + +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/x86/fp32/transform_functions_fp32.h" + +#define UNROLL_W 3 +#define UNROLL_OC_DIM 8 +#define BLOCK_OC_DIM 32 +#define BLOCK_IC_DIM 32 +#define UNROLL_IC_BLOCK_DIM 8 +#define BLOCK_HW_DIM 768 +#define align_addr(addr, unit) (((uintptr_t)addr + unit - 1) / unit * unit) + +typedef void (*kernel_func)(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep); + +void avx2_conv_kernel_3x32(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep) +{ + __asm__ __volatile__("mov %3, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups (%1), %%ymm1 \n\t" + "vmovups (%1), %%ymm2 \n\t" + "vmovups 0x20(%1), %%ymm3 \n\t" + "vmovups 0x20(%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vmovups 0x40(%1), %%ymm6 \n\t" + "vmovups 0x40(%1), %%ymm7 \n\t" + "vmovups 0x40(%1), %%ymm8 \n\t" + "vmovups 0x60(%1), %%ymm9 \n\t" + "vmovups 0x60(%1), %%ymm10 \n\t" + "vmovups 0x60(%1), %%ymm11 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "mov %0, %%rax \n\t" + "vmovups (%%rax), %%ymm0 \n\t" + "vmovups 0x20(%%rax), %%ymm1 \n\t" + "vmovups 0x40(%%rax), %%ymm2 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm3 \n\t" + "vmovups 0x20(%%rax), %%ymm4 \n\t" + "vmovups 0x40(%%rax), %%ymm5 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm6 \n\t" + "vmovups 0x20(%%rax), %%ymm7 \n\t" + "vmovups 0x40(%%rax), %%ymm8 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm9 \n\t" + "vmovups 0x20(%%rax), %%ymm10 \n\t" + "vmovups 0x40(%%rax), %%ymm11 \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store) + : "%ecx", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "memory", "cc"); + + __asm__ __volatile__(".align 16 \n\t" + "2: \n\t" + + "mov %5, %%ebx \n\t" + ".align 16 \n\t" + "3: \n\t" + + "mov %4, %%ecx \n\t" + ".align 16 \n\t" + "4: \n\t" + + "vbroadcastss (%0), %%ymm12 \n\t" + "vbroadcastss (%1), %%ymm13 \n\t" + "vbroadcastss (%2), %%ymm14 \n\t" + "vmovups 0x0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x20(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x40(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x60(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x4(%0), %%ymm12 \n\t" + "vbroadcastss 0x4(%1), %%ymm13 \n\t" + "vbroadcastss 0x4(%2), %%ymm14 \n\t" + "vmovups 0x80(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0xA0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + 
"vmovups 0xC0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0xE0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x8(%0), %%ymm12 \n\t" + "vbroadcastss 0x8(%1), %%ymm13 \n\t" + "vbroadcastss 0x8(%2), %%ymm14 \n\t" + "vmovups 0x100(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x120(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x140(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x160(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0xC(%0), %%ymm12 \n\t" + "vbroadcastss 0xC(%1), %%ymm13 \n\t" + "vbroadcastss 0xC(%2), %%ymm14 \n\t" + "vmovups 0x180(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x1A0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x1C0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x1E0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x10(%0), %%ymm12 \n\t" + "vbroadcastss 0x10(%1), %%ymm13 \n\t" + "vbroadcastss 0x10(%2), %%ymm14 \n\t" + "vmovups 0x200(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x220(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x240(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x260(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x14(%0), %%ymm12 \n\t" + "vbroadcastss 0x14(%1), %%ymm13 \n\t" + "vbroadcastss 0x14(%2), %%ymm14 \n\t" + "vmovups 0x280(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x2A0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x2C0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x2E0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, 
%%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x18(%0), %%ymm12 \n\t" + "vbroadcastss 0x18(%1), %%ymm13 \n\t" + "vbroadcastss 0x18(%2), %%ymm14 \n\t" + "vmovups 0x300(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x320(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x340(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x360(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x1C(%0), %%ymm12 \n\t" + "vbroadcastss 0x1C(%1), %%ymm13 \n\t" + "vbroadcastss 0x1C(%2), %%ymm14 \n\t" + "vmovups 0x380(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x3A0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x3C0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x3E0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %7, %0 \n\t" + "add %7, %1 \n\t" + "add %7, %2 \n\t" + "add $0x400, %3 \n\t" + "dec %%ecx \n\t" + "jg 4b \n\t" + + "add %6, %0 \n\t" + "add %6, %1 \n\t" + "add %6, %2 \n\t" + "dec %%ebx \n\t" + "jg 3b \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "dec %%eax \n\t" + "jg 2b \n\t" + + : + : "r"(curI), "r"(in_1), "r"(in_2), "r"(curW), "r"(fw), "r"(fh), + "r"(I64(iStep)), "r"(I64(dw)), "a"(ic / 8), "r"(I64(fStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); + + __asm__ __volatile__( + // relu + "and $0x6, %2 \n\t" + "je 5f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm15, %%ymm11, %%ymm11 \n\t" + + // relu6 + "and $0x4, %2 \n\t" + "je 5f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + "vminps 
%%ymm12, %%ymm11, %%ymm11 \n\t" + + "5: \n\t" + "vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm1, 0x20(%0) \n\t" + "vmovups %%ymm2, 0x40(%0) \n\t" + "add %1, %0 \n\t" + "vmovups %%ymm3, (%0) \n\t" + "vmovups %%ymm4, 0x20(%0) \n\t" + "vmovups %%ymm5, 0x40(%0) \n\t" + "add %1, %0 \n\t" + "vmovups %%ymm6, (%0) \n\t" + "vmovups %%ymm7, 0x20(%0) \n\t" + "vmovups %%ymm8, 0x40(%0) \n\t" + "add %1, %0 \n\t" + "vmovups %%ymm9, (%0) \n\t" + "vmovups %%ymm10, 0x20(%0) \n\t" + "vmovups %%ymm11, 0x40(%0) \n\t" + : "+r"(curO) + : "r"(I64(oStep)), "r"(store) + : "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "memory", "cc"); +} + +void avx2_conv_kernel_2x32(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep) +{ + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups (%8), %%ymm1 \n\t" + "vmovups 0x20(%8), %%ymm3 \n\t" + "vmovups 0x20(%8), %%ymm4 \n\t" + "vmovups 0x40(%8), %%ymm6 \n\t" + "vmovups 0x40(%8), %%ymm7 \n\t" + "vmovups 0x60(%8), %%ymm9 \n\t" + "vmovups 0x60(%8), %%ymm10 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "mov %1, %8 \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups 0x20(%8), %%ymm1 \n\t" + "add %4, %8 \n\t" + "vmovups (%8), %%ymm3 \n\t" + "vmovups 0x20(%8), %%ymm4 \n\t" + "add %4, %8 \n\t" + "vmovups (%8), %%ymm6 \n\t" + "vmovups 0x20(%8), %%ymm7 \n\t" + "add %4, %8 \n\t" + "vmovups (%8), %%ymm9 \n\t" + "vmovups 0x20(%8), %%ymm10 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "mov %5, %%ebx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "3: \n\t" + + "vbroadcastss (%0), %%ymm12 \n\t" + "vbroadcastss (%10), %%ymm13 \n\t" + "vmovups 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x20(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x40(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x4(%0), %%ymm12 \n\t" + "vbroadcastss 0x4(%10), %%ymm13 \n\t" + "vmovups 0x80(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0xA0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0xC0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vmovups 0xE0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x8(%0), %%ymm12 \n\t" + "vbroadcastss 0x8(%10), %%ymm13 \n\t" + "vmovups 0x100(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x120(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x140(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vmovups 0x160(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + 
+ "vbroadcastss 0xC(%0), %%ymm12 \n\t" + "vbroadcastss 0xC(%10), %%ymm13 \n\t" + "vmovups 0x180(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x1A0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x1C0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vmovups 0x1E0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x10(%0), %%ymm12 \n\t" + "vbroadcastss 0x10(%10), %%ymm13 \n\t" + "vmovups 0x200(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x220(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x240(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vmovups 0x260(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x14(%0), %%ymm12 \n\t" + "vbroadcastss 0x14(%10), %%ymm13 \n\t" + "vmovups 0x280(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x2A0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x2C0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vmovups 0x2E0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x18(%0), %%ymm12 \n\t" + "vbroadcastss 0x18(%10), %%ymm13 \n\t" + "vmovups 0x300(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x320(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x340(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vmovups 0x360(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x1C(%0), %%ymm12 \n\t" + "vbroadcastss 0x1C(%10), %%ymm13 \n\t" + "vmovups 0x380(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x3A0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x3C0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vmovups 0x3E0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "add %9, %0 \n\t" + "add %9, %10 \n\t" + "add $0x400, %2 \n\t" + "dec %%ecx \n\t" + "jg 3b \n\t" + + "add %6, %0 \n\t" + "add %6, %10 \n\t" + "dec %%ebx \n\t" + "jg 2b \n\t" + + "add %12, %0 \n\t" + "add %12, %10 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %7 \n\t" + "je 4f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, 
%%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + + // relu6 + "and $0x4, %7 \n\t" + "je 4f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + + "4: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm3, (%1) \n\t" + "vmovups %%ymm4, 0x20(%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm6, (%1) \n\t" + "vmovups %%ymm7, 0x20(%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm9, (%1) \n\t" + "vmovups %%ymm10, 0x20(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(curW), "r"(fw), "r"(I64(oStep)), "r"(fh), + "r"(I64(iStep)), "r"(store), "r"(curB), "r"(I64(dw)), "r"(in_1), + "a"(ic / 8), "r"(I64(fStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm1", "%ymm3", "%ymm4", "%ymm6", "%ymm7", + "%ymm9", "%ymm10", "%ymm12", "%ymm13", "%ymm15", "memory"); +} + +void avx2_conv_kernel_1x32(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep) +{ + __asm__ __volatile__( + "mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups 0x20(%8), %%ymm3 \n\t" + "vmovups 0x40(%8), %%ymm6 \n\t" + "vmovups 0x60(%8), %%ymm9 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "mov %1, %8 \n\t" + "vmovups (%8), %%ymm0 \n\t" + "add %4, %8 \n\t" + "vmovups (%8), %%ymm3 \n\t" + "add %4, %8 \n\t" + "vmovups (%8), %%ymm6 \n\t" + "add %4, %8 \n\t" + "vmovups (%8), %%ymm9 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "mov %5, %%ebx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "3: \n\t" + + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovups 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x20(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vmovups 0x40(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x4(%0), %%ymm12 \n\t" + "vmovups 0x80(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0xA0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vmovups 0xC0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovups 0xE0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x8(%0), %%ymm12 \n\t" + "vmovups 0x100(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x120(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vmovups 0x140(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovups 0x160(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0xC(%0), %%ymm12 \n\t" + "vmovups 0x180(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x1A0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vmovups 0x1C0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovups 0x1E0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x10(%0), %%ymm12 \n\t" + 
"vmovups 0x200(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x220(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vmovups 0x240(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovups 0x260(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x14(%0), %%ymm12 \n\t" + "vmovups 0x280(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x2A0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vmovups 0x2C0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovups 0x2E0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x18(%0), %%ymm12 \n\t" + "vmovups 0x300(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x320(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vmovups 0x340(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovups 0x360(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x1C(%0), %%ymm12 \n\t" + "vmovups 0x380(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x3A0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vmovups 0x3C0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovups 0x3E0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + + "add %9, %0 \n\t" + "add $0x400, %2 \n\t" + "dec %%ecx \n\t" + "jg 3b \n\t" + + "add %6, %0 \n\t" + "sub $1, %%ebx \n\t" + "jg 2b \n\t" + + "add %11, %0 \n\t" + "sub $1, %%eax \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %7 \n\t" + "je 4f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + + // relu6 + "and $0x4, %7 \n\t" + "je 4f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + + "4: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm3, (%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm6, (%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm9, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(curW), "r"(fw), "r"(I64(oStep)), "r"(fh), "r"(I64(iStep)), + "r"(store), "r"(curB), "r"(I64(dw)), "a"(ic / 8), "r"(I64(fStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm3", "%ymm6", "%ymm9", "%ymm12", "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_3x16(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep) +{ + __asm__ __volatile__("mov %3, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups (%1), %%ymm1 \n\t" + "vmovups (%1), %%ymm2 \n\t" + "vmovups 0x20(%1), %%ymm3 \n\t" + "vmovups 0x20(%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "mov %0, %%rax \n\t" + "vmovups (%%rax), %%ymm0 \n\t" + "vmovups 0x20(%%rax), %%ymm1 \n\t" + "vmovups 0x40(%%rax), %%ymm2 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm3 \n\t" + "vmovups 0x20(%%rax), %%ymm4 \n\t" + "vmovups 0x40(%%rax), %%ymm5 \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store) + : 
"%ecx", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "memory", "cc"); + + __asm__ __volatile__(".align 16 \n\t" + "2: \n\t" + + "mov %5, %%ebx \n\t" + ".align 16 \n\t" + "3: \n\t" + + "mov %4, %%ecx \n\t" + ".align 16 \n\t" + "4: \n\t" + + "vbroadcastss (%0), %%ymm12 \n\t" + "vbroadcastss (%1), %%ymm13 \n\t" + "vbroadcastss (%2), %%ymm14 \n\t" + "vmovups 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x20(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x4(%0), %%ymm12 \n\t" + "vbroadcastss 0x4(%1), %%ymm13 \n\t" + "vbroadcastss 0x4(%2), %%ymm14 \n\t" + "vmovups 0x40(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x60(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x8(%0), %%ymm12 \n\t" + "vbroadcastss 0x8(%1), %%ymm13 \n\t" + "vbroadcastss 0x8(%2), %%ymm14 \n\t" + "vmovups 0x80(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0xA0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0xC(%0), %%ymm12 \n\t" + "vbroadcastss 0xC(%1), %%ymm13 \n\t" + "vbroadcastss 0xC(%2), %%ymm14 \n\t" + "vmovups 0xC0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0xE0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x10(%0), %%ymm12 \n\t" + "vbroadcastss 0x10(%1), %%ymm13 \n\t" + "vbroadcastss 0x10(%2), %%ymm14 \n\t" + "vmovups 0x100(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x120(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x14(%0), %%ymm12 \n\t" + "vbroadcastss 0x14(%1), %%ymm13 \n\t" + "vbroadcastss 0x14(%2), %%ymm14 \n\t" + "vmovups 0x140(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x160(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x18(%0), %%ymm12 \n\t" + "vbroadcastss 0x18(%1), %%ymm13 \n\t" + "vbroadcastss 0x18(%2), %%ymm14 \n\t" + "vmovups 0x180(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x1A0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, 
%%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x1C(%0), %%ymm12 \n\t" + "vbroadcastss 0x1C(%1), %%ymm13 \n\t" + "vbroadcastss 0x1C(%2), %%ymm14 \n\t" + "vmovups 0x1C0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x1E0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "add %7, %0 \n\t" + "add %7, %1 \n\t" + "add %7, %2 \n\t" + "add $0x200, %3 \n\t" + "dec %%ecx \n\t" + "jg 4b \n\t" + + "add %6, %0 \n\t" + "add %6, %1 \n\t" + "add %6, %2 \n\t" + "dec %%ebx \n\t" + "jg 3b \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "dec %%eax \n\t" + "jg 2b \n\t" + + : + : "r"(curI), "r"(in_1), "r"(in_2), "r"(curW), "r"(fw), "r"(fh), + "r"(I64(iStep)), "r"(I64(dw)), "a"(ic / 8), "r"(I64(fStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); + + __asm__ __volatile__( + // relu + "and $0x6, %2 \n\t" + "je 5f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + + // relu6 + "and $0x4, %2 \n\t" + "je 5f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + + "5: \n\t" + "vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm1, 0x20(%0) \n\t" + "vmovups %%ymm2, 0x40(%0) \n\t" + "add %1, %0 \n\t" + "vmovups %%ymm3, (%0) \n\t" + "vmovups %%ymm4, 0x20(%0) \n\t" + "vmovups %%ymm5, 0x40(%0) \n\t" + : "+r"(curO) + : "r"(I64(oStep)), "r"(store) + : "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "memory", "cc"); +} + +void avx2_conv_kernel_2x16(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep) +{ + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups (%8), %%ymm1 \n\t" + "vmovups 0x20(%8), %%ymm3 \n\t" + "vmovups 0x20(%8), %%ymm4 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "mov %1, %8 \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups 0x20(%8), %%ymm1 \n\t" + "add %4, %8 \n\t" + "vmovups (%8), %%ymm3 \n\t" + "vmovups 0x20(%8), %%ymm4 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "mov %5, %%ebx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "3: \n\t" + + "vbroadcastss (%0), %%ymm12 \n\t" + "vbroadcastss (%10), %%ymm13 \n\t" + "vmovups 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x20(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x4(%0), %%ymm12 \n\t" + "vbroadcastss 0x4(%10), %%ymm13 \n\t" + "vmovups 0x40(%2), %%ymm15 \n\t" + "vfmadd231ps 
%%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x8(%0), %%ymm12 \n\t" + "vbroadcastss 0x8(%10), %%ymm13 \n\t" + "vmovups 0x80(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0xA0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0xC(%0), %%ymm12 \n\t" + "vbroadcastss 0xC(%10), %%ymm13 \n\t" + "vmovups 0xC0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0xE0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x10(%0), %%ymm12 \n\t" + "vbroadcastss 0x10(%10), %%ymm13 \n\t" + "vmovups 0x100(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x120(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x14(%0), %%ymm12 \n\t" + "vbroadcastss 0x14(%10), %%ymm13 \n\t" + "vmovups 0x140(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x160(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x18(%0), %%ymm12 \n\t" + "vbroadcastss 0x18(%10), %%ymm13 \n\t" + "vmovups 0x180(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x1A0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x1C(%0), %%ymm12 \n\t" + "vbroadcastss 0x1C(%10), %%ymm13 \n\t" + "vmovups 0x1C0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x1E0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "add %9, %0 \n\t" + "add %9, %10 \n\t" + "add $0x200, %2 \n\t" + "dec %%ecx \n\t" + "jg 3b \n\t" + + "add %6, %0 \n\t" + "add %6, %10 \n\t" + "dec %%ebx \n\t" + "jg 2b \n\t" + + "add %12, %0 \n\t" + "add %12, %10 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %7 \n\t" + "je 4f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + + // relu6 + "and $0x4, %7 \n\t" + "je 4f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + + "4: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm3, (%1) \n\t" + "vmovups %%ymm4, 0x20(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(curW), "r"(fw), "r"(I64(oStep)), "r"(fh), + "r"(I64(iStep)), "r"(store), "r"(curB), "r"(I64(dw)), "r"(in_1), + "a"(ic / 8), "r"(I64(fStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm1", "%ymm3", "%ymm4", "%ymm6", "%ymm7", + "%ymm9", "%ymm10", "%ymm12", "%ymm13", "%ymm15", "memory"); +} + +void 
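+// Note on the relu6 constant used above: 0x40C00000 is the IEEE-754 bit
+// pattern of 6.0f (sign 0, exponent 129, mantissa 0x400000). After vmovd
+// places it in lane 0, `vpermps %%ymm12, %%ymm15, %%ymm12` broadcasts it:
+// %%ymm15 still holds the all-zero vector left by the preceding relu block
+// and acts as the permutation index, so every lane selects lane 0 of the
+// source. The bit pattern can be checked with a few lines of portable C++
+// (a sketch):
+//   #include <cstdint>
+//   #include <cstring>
+//   uint32_t bits = 0x40C00000u;
+//   float six;
+//   std::memcpy(&six, &bits, sizeof(six));  // six == 6.0f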
avx2_conv_kernel_1x16(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep) +{ + __asm__ __volatile__( + "mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups 0x20(%8), %%ymm3 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "mov %1, %8 \n\t" + "vmovups (%8), %%ymm0 \n\t" + "add %4, %8 \n\t" + "vmovups (%8), %%ymm3 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "mov %5, %%ebx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "3: \n\t" + + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovups 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x20(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x4(%0), %%ymm12 \n\t" + "vmovups 0x40(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x8(%0), %%ymm12 \n\t" + "vmovups 0x80(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0xA0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0xC(%0), %%ymm12 \n\t" + "vmovups 0xC0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0xE0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x10(%0), %%ymm12 \n\t" + "vmovups 0x100(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x120(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x14(%0), %%ymm12 \n\t" + "vmovups 0x140(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x160(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x18(%0), %%ymm12 \n\t" + "vmovups 0x180(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x1A0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x1C(%0), %%ymm12 \n\t" + "vmovups 0x1C0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x1E0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "add %9, %0 \n\t" + "add $0x200, %2 \n\t" + "dec %%ecx \n\t" + "jg 3b \n\t" + + "add %6, %0 \n\t" + "sub $1, %%ebx \n\t" + "jg 2b \n\t" + + "add %11, %0 \n\t" + "sub $1, %%eax \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %7 \n\t" + "je 4f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + + // relu6 + "and $0x4, %7 \n\t" + "je 4f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + + "4: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm3, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(curW), "r"(fw), "r"(I64(oStep)), "r"(fh), "r"(I64(iStep)), + "r"(store), "r"(curB), "r"(I64(dw)), "a"(ic / 8), "r"(I64(fStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm3", "%ymm6", "%ymm9", "%ymm12", "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_3x8(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep) +{ + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), 
%%ymm0 \n\t" + "vmovups (%8), %%ymm1 \n\t" + "vmovups (%8), %%ymm2 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups 0x20(%1), %%ymm1 \n\t" + "vmovups 0x40(%1), %%ymm2 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "mov %5, %%ebx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "3: \n\t" + + "vbroadcastss (%0), %%ymm12 \n\t" + "vbroadcastss (%10), %%ymm13 \n\t" + "vbroadcastss (%11), %%ymm14 \n\t" + "vmovups 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x4(%0), %%ymm12 \n\t" + "vbroadcastss 0x4(%10), %%ymm13 \n\t" + "vbroadcastss 0x4(%11), %%ymm14 \n\t" + "vmovups 0x20(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x8(%0), %%ymm12 \n\t" + "vbroadcastss 0x8(%10), %%ymm13 \n\t" + "vbroadcastss 0x8(%11), %%ymm14 \n\t" + "vmovups 0x40(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0xC(%0), %%ymm12 \n\t" + "vbroadcastss 0xC(%10), %%ymm13 \n\t" + "vbroadcastss 0xC(%11), %%ymm14 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x10(%0), %%ymm12 \n\t" + "vbroadcastss 0x10(%10), %%ymm13 \n\t" + "vbroadcastss 0x10(%11), %%ymm14 \n\t" + "vmovups 0x80(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x14(%0), %%ymm12 \n\t" + "vbroadcastss 0x14(%10), %%ymm13 \n\t" + "vbroadcastss 0x14(%11), %%ymm14 \n\t" + "vmovups 0xA0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x18(%0), %%ymm12 \n\t" + "vbroadcastss 0x18(%10), %%ymm13 \n\t" + "vbroadcastss 0x18(%11), %%ymm14 \n\t" + "vmovups 0xC0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x1C(%0), %%ymm12 \n\t" + "vbroadcastss 0x1C(%10), %%ymm13 \n\t" + "vbroadcastss 0x1C(%11), %%ymm14 \n\t" + "vmovups 0xE0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "add %9, %0 \n\t" + "add %9, %10 \n\t" + "add %9, %11 \n\t" + "add $0x100, %2 \n\t" + "dec %%ecx \n\t" + "jg 3b \n\t" + + "add %6, %0 \n\t" + "add %6, %10 \n\t" + "add %6, %11 \n\t" + "dec %%ebx \n\t" + "jg 2b \n\t" + + "add %12, %0 \n\t" + "add %12, %10 \n\t" + "add %12, %11 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %7 \n\t" + "je 4f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + + // relu6 + "and $0x4, %7 \n\t" + "je 4f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + + "4: \n\t" + "vmovups 
%%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "vmovups %%ymm2, 0x40(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(curW), "r"(fw), "a"(ic / 8), "r"(fh), + "r"(I64(iStep)), "r"(store), "r"(curB), "r"(I64(dw)), "r"(in_1), "r"(in_2), + "r"(I64(fStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm1", "%ymm2", "%ymm12", "%ymm13", "%ymm14", + "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_2x8(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep) +{ + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups (%8), %%ymm1 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups 0x20(%1), %%ymm1 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + ".align 16 \n\t" + "2: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "3: \n\t" + + "vbroadcastss (%0), %%ymm12 \n\t" + "vbroadcastss (%10), %%ymm13 \n\t" + "vmovups 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x4(%0), %%ymm12 \n\t" + "vbroadcastss 0x4(%10), %%ymm13 \n\t" + "vmovups 0x20(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x8(%0), %%ymm12 \n\t" + "vbroadcastss 0x8(%10), %%ymm13 \n\t" + "vmovups 0x40(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0xC(%0), %%ymm12 \n\t" + "vbroadcastss 0xC(%10), %%ymm13 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x10(%0), %%ymm12 \n\t" + "vbroadcastss 0x10(%10), %%ymm13 \n\t" + "vmovups 0x80(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x14(%0), %%ymm12 \n\t" + "vbroadcastss 0x14(%10), %%ymm13 \n\t" + "vmovups 0xA0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x18(%0), %%ymm12 \n\t" + "vbroadcastss 0x18(%10), %%ymm13 \n\t" + "vmovups 0xC0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x1C(%0), %%ymm12 \n\t" + "vbroadcastss 0x1C(%10), %%ymm13 \n\t" + "vmovups 0xE0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "add %9, %0 \n\t" + "add %9, %10 \n\t" + "add $0x100, %2 \n\t" + "dec %%ecx \n\t" + "jg 3b \n\t" + + "add %6, %0 \n\t" + "add %6, %10 \n\t" + "dec %%ebx \n\t" + "jg 2b \n\t" + + "add %11, %0 \n\t" + "add %11, %10 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %7 \n\t" + "je 4f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + + // relu6 + "and $0x4, %7 \n\t" + "je 4f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + + "4: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(curW), "r"(fw), "a"(ic / 8), "b"(fh), + "r"(I64(iStep)), "r"(store), "r"(curB), "r"(I64(dw)), "r"(in_1), + "r"(I64(fStep)) + : "%ecx", "%ymm0", "%ymm1", 
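+// Register conventions worth spelling out for the kernels in this file: an
+// "a"(ic / 8) input constraint pre-loads the outer channel-block counter
+// into %eax, and a "b"(fh) constraint (as in this kernel) arrives with the
+// filter-height counter already in %ebx, which is why some loop heads have
+// no explicit `mov ..., %%ebx`; fw is reloaded into %ecx for every filter
+// row.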
"%ymm12", "%ymm13", "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_1x8(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep) +{ + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%1), %%ymm0 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + ".align 16 \n\t" + "2: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "3: \n\t" + + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovups 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x4(%0), %%ymm12 \n\t" + "vmovups 0x20(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x8(%0), %%ymm12 \n\t" + "vmovups 0x40(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0xC(%0), %%ymm12 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x10(%0), %%ymm12 \n\t" + "vmovups 0x80(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x14(%0), %%ymm12 \n\t" + "vmovups 0xA0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x18(%0), %%ymm12 \n\t" + "vmovups 0xC0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x1C(%0), %%ymm12 \n\t" + "vmovups 0xE0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "add %9, %0 \n\t" + "add $0x100, %2 \n\t" + "dec %%ecx \n\t" + "jg 3b \n\t" + + "add %6, %0 \n\t" + "dec %%ebx \n\t" + "jg 2b \n\t" + + "add %11, %0 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %7 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + + // relu6 + "and $0x4, %7 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + + "3: \n\t" + "vmovups %%ymm0, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(curW), "r"(fw), "r"(I64(oStep)), "b"(fh), + "r"(I64(iStep)), "r"(store), "r"(curB), "r"(I64(dw)), "a"(ic / 8), + "r"(I64(fStep)) + : "%ecx", "%ymm0", "%ymm12", "%ymm15", "memory", "cc"); +} + +EE convolution_2x2_direct(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if ((fdf != DF_NCHWCxN32) || (idf != DF_NCHWC8) || (ic % 8 != 0)) { + CHECK_STATUS(NOT_MATCH); + } + + F32 *curI, *curO, *calI, *calO; + const F32 *curW, *curB, 
*calW; + F32 *ftmp = (F32 *)align_addr(tmp, 32); + filterArray = (F32 *)align_addr(filterArray, 32); + + U32 icAlignSize = 8; + U32 icPadding = (ic + icAlignSize - 1) / icAlignSize * icAlignSize; + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + + U32 oStep = oh * ow * UNROLL_OC_DIM * 4; + U32 iStep = (iw_pad - fw * dilateW + (dilateH - 1) * iw_pad) * UNROLL_IC_BLOCK_DIM * 4; + U32 fStep = ((ih_pad - fh) * iw_pad) * UNROLL_IC_BLOCK_DIM * 4; + U32 sw = strideW * UNROLL_IC_BLOCK_DIM * 4; + U32 dw = dilateW * UNROLL_IC_BLOCK_DIM * 4; + U32 wSize = 0, store = 0, ocSize = 0, icSize = 0, hwSize = 0; + I32 ih_idx = 0; + kernel_func kernel[3][3] = {{avx2_conv_kernel_1x8, avx2_conv_kernel_2x8, avx2_conv_kernel_3x8}, + {avx2_conv_kernel_1x16, avx2_conv_kernel_2x16, avx2_conv_kernel_3x16}, + {avx2_conv_kernel_1x32, avx2_conv_kernel_2x32, avx2_conv_kernel_3x32}}; + U32 ocblocks[3] = {8, 16, 32}; + + I32 ohow = oh * ow; + + for (U32 n = 0; n < in; ++n) { + if (idf == DF_NCHWC8 && paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + ftmp = inArray; + } else { + PaddingNCHWC8(inArray, ftmp, inputDesc, convParamSpec); + } + store = 0; + for (U32 icbb = 0; icbb < icPadding; icbb += icSize) { + icSize = UNI_MIN(BLOCK_IC_DIM, icPadding - icbb); + store |= (icbb > 0); + if (icbb == icPadding - icSize) { + store |= U32(activationDesc.mode) << 1; + } + for (I32 hw = 0; hw < ohow; hw += hwSize) { + hwSize = UNI_MIN(ohow - hw, BLOCK_HW_DIM); + for (U32 ocb = 0; ocb < oc; ocb += ocSize) { + curB = biasArray + ocb; + ocSize = UNI_MIN(BLOCK_OC_DIM, oc - ocb); + ocSize = ocblocks[ocSize >> 4]; + calW = filterArray + ocb * icPadding * fh * fw + ocSize * icbb * fh * fw; + curI = ftmp + icbb * ih_pad * iw_pad; + + for (I32 ihw = hw; ihw < hw + hwSize; ihw += wSize) { + wSize = UNI_MIN(hw + hwSize - ihw, UNROLL_W); + U32 in_h_0 = ihw / ow * strideH; + U32 in_w_0 = ihw % ow * strideW; + U32 in_h_1 = (ihw + 1) / ow * strideH; + U32 in_w_1 = (ihw + 1) % ow * strideW; + U32 in_h_2 = (ihw + 2) / ow * strideH; + U32 in_w_2 = (ihw + 2) % ow * strideW; + F32 *out_ptr = outArray + (n * oc + ocb) * ohow + ihw * 8; + F32 *in_0 = curI + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F32 *in_1 = curI + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F32 *in_2 = curI + in_h_2 * iw_pad * 8 + in_w_2 * 8; + + kernel[ocSize >> 4][wSize - 1](in_0, calW, out_ptr, fw, fh, oStep, iStep, + store, curB, dw, in_1, in_2, icSize, fStep); + } + } + } + } + inArray += ic * ih * iw; + outArray += oc * oh * ow; + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_direct.cpp b/compute/tensor/src/cpu/x86/fp32/convolution_direct.cpp new file mode 100644 index 00000000..ef2e949a --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/convolution_direct.cpp @@ -0,0 +1,720 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
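+// (File note) This unit implements the generic NCHWC8 direct convolution.
+// Each avx2_conv_kernel_{M}x{N}c8 below computes M output pixels across N
+// output channels for a single 8-deep input-channel slice; the loops in
+// convolution_direct own the channel traversal, which is why these
+// kernels, unlike the 2x2 variants above, take no ic or fStep arguments.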
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" + +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/x86/fp32/transform_functions_fp32.h" + +#define UNROLL_W 3 +#define UNROLL_OC_DIM 8 +#define BLOCK_OC_DIM 32 +#define BLOCK_IC_DIM 32 +#define BLOCK_HW_DIM 128 +#define UNROLL_IC_BLOCK_DIM 8 +#define align_addr(addr, unit) (((uintptr_t)addr + unit - 1) / unit * unit) + +// clang-format off +#define kernel4x3(m0, r0, r1, r2, r3, m1, m2, m3, m4) \ + "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \ + "vbroadcastss "#m0"("#r1"), %%ymm13 \n\t" \ + "vbroadcastss "#m0"("#r2"), %%ymm14 \n\t" \ + "vmovups "#m1"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" \ + "vmovups "#m2"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" \ + "vmovups "#m3"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" \ + "vmovups "#m4"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + +#define kernel4x2(m0, r0, r1, r2, r3, m1, m2, m3, m4) \ + "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \ + "vbroadcastss "#m0"("#r1"), %%ymm13 \n\t" \ + "vmovups "#m1"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" \ + "vmovups "#m2"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" \ + "vmovups "#m3"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" \ + "vmovups "#m4"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + +#define kernel4x1(m0, r0, r1, r2, r3, m1, m2, m3, m4) \ + "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \ + "vmovups "#m1"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \ + "vmovups "#m2"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" \ + "vmovups "#m3"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" \ + "vmovups "#m4"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" \ + +#define kernel2x3(m0, r0, r1, r2, r3, m1, m2, m3, m4) \ + "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \ + "vbroadcastss "#m0"("#r1"), %%ymm13 \n\t" \ + "vbroadcastss "#m0"("#r2"), %%ymm14 \n\t" \ + "vmovups "#m1"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" \ + "vmovups "#m2"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" 
\ + +#define kernel2x2(m0, r0, r1, r2, r3, m1, m2, m3, m4) \ + "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \ + "vbroadcastss "#m0"("#r1"), %%ymm13 \n\t" \ + "vmovups "#m1"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" \ + "vmovups "#m2"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" \ + +#define kernel2x1(m0, r0, r1, r2, r3, m1, m2, m3, m4) \ + "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \ + "vmovups "#m1"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \ + "vmovups "#m2"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" \ + +#define kernel1x3(m0, r0, r1, r2, r3, m1, m2, m3, m4) \ + "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \ + "vbroadcastss "#m0"("#r1"), %%ymm13 \n\t" \ + "vbroadcastss "#m0"("#r2"), %%ymm14 \n\t" \ + "vmovups "#m1"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" \ + +#define kernel1x2(m0, r0, r1, r2, r3, m1, m2, m3, m4) \ + "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \ + "vbroadcastss "#m0"("#r1"), %%ymm13 \n\t" \ + "vmovups "#m1"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" \ + +#define kernel1x1(m0, r0, r1, r2, r3, m1, m2, m3, m4) \ + "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \ + "vmovups "#m1"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \ + +#define kernel4c8(r, r0, r1, r2, r3) \ + kernel4x##r(0x0, r0, r1, r2, r3, 0x0, 0x20, 0x40, 0x60) \ + kernel4x##r(0x4, r0, r1, r2, r3, 0x80, 0xA0, 0xC0, 0xE0) \ + kernel4x##r(0x8, r0, r1, r2, r3, 0x100, 0x120, 0x140, 0x160) \ + kernel4x##r(0xC, r0, r1, r2, r3, 0x180, 0x1A0, 0x1C0, 0x1E0) \ + kernel4x##r(0x10, r0, r1, r2, r3, 0x200, 0x220, 0x240, 0x260) \ + kernel4x##r(0x14, r0, r1, r2, r3, 0x280, 0x2A0, 0x2C0, 0x2E0) \ + kernel4x##r(0x18, r0, r1, r2, r3, 0x300, 0x320, 0x340, 0x360) \ + kernel4x##r(0x1C, r0, r1, r2, r3, 0x380, 0x3A0, 0x3C0, 0x3E0) + +#define kernel2c8(r, r0, r1, r2, r3) \ + kernel4x##r(0x0, r0, r1, r2, r3, 0x0, 0x20, 0, 0) \ + kernel4x##r(0x4, r0, r1, r2, r3, 0x40, 0x60, 0, 0) \ + kernel4x##r(0x8, r0, r1, r2, r3, 0x80, 0xA0, 0, 0) \ + kernel4x##r(0xC, r0, r1, r2, r3, 0xC0, 0xE0, 0, 0) \ + kernel4x##r(0x10, r0, r1, r2, r3, 0x100, 0x120, 0, 0) \ + kernel4x##r(0x14, r0, r1, r2, r3, 0x140, 0x160, 0, 0) \ + kernel4x##r(0x18, r0, r1, r2, r3, 0x180, 0x1A0, 0, 0) \ + kernel4x##r(0x1C, r0, r1, r2, r3, 0x1C0, 0x1E0, 0, 0) + +#define kernel1c8(r, r0, r1, r2, r3) \ + kernel4x##r(0x0, r0, r1, r2, r3, 0x0, 0, 0, 0) \ + kernel4x##r(0x4, r0, r1, r2, r3, 0x20, 0, 0, 0) \ + kernel4x##r(0x8, r0, r1, r2, r3, 0x40, 0, 0, 0) \ + kernel4x##r(0xC, r0, r1, r2, r3, 0x60, 0, 0, 0) \ + kernel4x##r(0x10, r0, r1, r2, r3, 0x80, 0, 0, 0) \ + kernel4x##r(0x14, r0, r1, r2, r3, 0xA0, 0, 0, 0) \ + kernel4x##r(0x18, r0, r1, r2, r3, 0xC0, 0, 0, 0) \ + kernel4x##r(0x1C, r0, r1, r2, r3, 0xE0, 0, 0, 0) + +typedef void (*kernel_func)(F32 *curI, const F32 *curW, F32 *curO, U32 fw, U32 fh, U32 oStep, U32 iStep, U32 store, const F32 *curB, U32 dw, F32 *in_1, F32 *in_2); + +void avx2_conv_kernel_3x32c8(F32 *curI, const F32 *curW, F32 *curO, U32 fw, U32 fh, U32 oStep, U32 iStep, U32 store, const F32 *curB, U32 dw, F32 *in_1, F32 *in_2) { + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups (%8), %%ymm1 
\n\t" + "vmovups (%8), %%ymm2 \n\t" + "vmovups 0x20(%8), %%ymm3 \n\t" + "vmovups 0x20(%8), %%ymm4 \n\t" + "vmovups 0x20(%8), %%ymm5 \n\t" + "vmovups 0x40(%8), %%ymm6 \n\t" + "vmovups 0x40(%8), %%ymm7 \n\t" + "vmovups 0x40(%8), %%ymm8 \n\t" + "vmovups 0x60(%8), %%ymm9 \n\t" + "vmovups 0x60(%8), %%ymm10 \n\t" + "vmovups 0x60(%8), %%ymm11 \n\t" + "jmp 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %1, %%r9 \n\t" + "vmovups (%%r9), %%ymm0 \n\t" + "vmovups 0x20(%%r9), %%ymm1 \n\t" + "vmovups 0x40(%%r9), %%ymm2 \n\t" + "add %4, %%r9 \n\t" + "vmovups (%%r9), %%ymm3 \n\t" + "vmovups 0x20(%%r9), %%ymm4 \n\t" + "vmovups 0x40(%%r9), %%ymm5 \n\t" + "add %4, %%r9 \n\t" + "vmovups (%%r9), %%ymm6 \n\t" + "vmovups 0x20(%%r9), %%ymm7 \n\t" + "vmovups 0x40(%%r9), %%ymm8 \n\t" + "add %4, %%r9 \n\t" + "vmovups (%%r9), %%ymm9 \n\t" + "vmovups 0x20(%%r9), %%ymm10 \n\t" + "vmovups 0x40(%%r9), %%ymm11 \n\t" + ".align 16 \n\t" + "1: \n\t" + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + kernel4c8(3, %0, %10, %11, %2) + "add %9, %0 \n\t" + "add %9, %10 \n\t" + "add %9, %11 \n\t" + "add $0x400, %2 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + "add %6, %0 \n\t" + "add %6, %10 \n\t" + "add %6, %11 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + // relu + "and $0x6, %7 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm15, %%ymm11, %%ymm11 \n\t" + // relu6 + "and $0x4, %7 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + "vminps %%ymm12, %%ymm11, %%ymm11 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "vmovups %%ymm2, 0x40(%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm3, (%1) \n\t" + "vmovups %%ymm4, 0x20(%1) \n\t" + "vmovups %%ymm5, 0x40(%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm6, (%1) \n\t" + "vmovups %%ymm7, 0x20(%1) \n\t" + "vmovups %%ymm8, 0x40(%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm9, (%1) \n\t" + "vmovups %%ymm10, 0x20(%1) \n\t" + "vmovups %%ymm11, 0x40(%1) \n\t" + : + : "r" (curI), "r" (curO), "r" (curW), "r" (fw), + "r" (I64(oStep)), "b" (fh), "r" (I64(iStep)), "r" (store), + "r" (curB), "r" (I64(dw)), "r" (in_1), "r" (in_2) + : "%ecx", "%r9", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", + "%ymm4", "%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", + "memory", "cc"); +} + +void avx2_conv_kernel_1x32c8(F32 *curI, const F32 *curW, F32 *curO, U32 fw, U32 fh, U32 oStep, U32 iStep, U32 store, const F32 *curB, U32 dw, F32 *in_1, F32 *in_2) { + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 
\n\t" + "vmovups 0x20(%8), %%ymm3 \n\t" + "vmovups 0x40(%8), %%ymm6 \n\t" + "vmovups 0x60(%8), %%ymm9 \n\t" + "jmp 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %1, %%r9 \n\t" + "vmovups (%%r9), %%ymm0 \n\t" + "add %4, %%r9 \n\t" + "vmovups (%%r9), %%ymm3 \n\t" + "add %4, %%r9 \n\t" + "vmovups (%%r9), %%ymm6 \n\t" + "add %4, %%r9 \n\t" + "vmovups (%%r9), %%ymm9 \n\t" + ".align 16 \n\t" + "1: \n\t" + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + kernel4c8(1, %0, 0, 0, %2) + "add %9, %0 \n\t" + "add $0x400, %2 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + "add %6, %0 \n\t" + "sub $1, %%ebx \n\t" + "jg 1b \n\t" + // relu + "and $0x6, %7 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + // relu6 + "and $0x4, %7 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm3, (%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm6, (%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm9, (%1) \n\t" + : + : "r" (curI), "r" (curO), "r" (curW), "r" (fw), + "r" (I64(oStep)), "b" (fh), "r" (I64(iStep)), "r" (store), + "r" (curB), "r" (I64(dw)) + : "%ecx", "%r9", + "%ymm0", "%ymm3", "%ymm6", "%ymm9", "%ymm12", "%ymm15", + "memory", "cc"); +} + +void avx2_conv_kernel_3x16c8(F32 *curI, const F32 *curW, F32 *curO, U32 fw, U32 fh, U32 oStep, U32 iStep, U32 store, const F32 *curB, U32 dw, F32 *in_1, F32 *in_2) { + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups (%8), %%ymm1 \n\t" + "vmovups (%8), %%ymm2 \n\t" + "vmovups 0x20(%8), %%ymm3 \n\t" + "vmovups 0x20(%8), %%ymm4 \n\t" + "vmovups 0x20(%8), %%ymm5 \n\t" + "jmp 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups 0x20(%1), %%ymm1 \n\t" + "vmovups 0x40(%1), %%ymm2 \n\t" + "vmovups (%1, %4), %%ymm3 \n\t" + "vmovups 0x20(%1, %4), %%ymm4 \n\t" + "vmovups 0x40(%1, %4), %%ymm5 \n\t" + ".align 16 \n\t" + "1: \n\t" + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + kernel2c8(3, %0, %10, %11, %2) + "add %9, %0 \n\t" + "add %9, %10 \n\t" + "add %9, %11 \n\t" + "add $0x200, %2 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + "add %6, %0 \n\t" + "add %6, %10 \n\t" + "add %6, %11 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + // relu + "and $0x6, %7 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + // relu6 + "and $0x4, %7 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "vmovups %%ymm2, 0x40(%1) \n\t" + "vmovups %%ymm3, (%1, %4) \n\t" + "vmovups %%ymm4, 0x20(%1, %4) 
\n\t" + "vmovups %%ymm5, 0x40(%1, %4) \n\t" + : + : "r" (curI), "r" (curO), "r" (curW), "r" (fw), + "r" (I64(oStep)), "b" (fh), "r" (I64(iStep)), "r" (store), + "r" (curB), "r" (I64(dw)), "r" (in_1), "r" (in_2) + : "%ecx", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", + "%ymm4", "%ymm5", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", + "memory", "cc"); +} + +void avx2_conv_kernel_1x16c8(F32 *curI, const F32 *curW, F32 *curO, U32 fw, U32 fh, U32 oStep, U32 iStep, U32 store, const F32 *curB, U32 dw, F32 *in_1, F32 *in_2) { + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups 0x20(%8), %%ymm3 \n\t" + "jmp 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups (%1, %4), %%ymm3 \n\t" + ".align 16 \n\t" + "1: \n\t" + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + kernel2c8(1, %0, 0, 0, %2) + "add %9, %0 \n\t" + "add $0x200, %2 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + "add %6, %0 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + // relu + "and $0x6, %7 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + // relu6 + "and $0x4, %7 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm3, (%1, %4) \n\t" + : + : "r" (curI), "r" (curO), "r" (curW), "r" (fw), + "r" (I64(oStep)), "b" (fh), "r" (I64(iStep)), "r" (store), + "r" (curB), "r" (I64(dw)) + : "%ecx", + "%ymm0", "%ymm3", "%ymm12", "%ymm15", + "memory", "cc"); +} + +void avx2_conv_kernel_3x8c8(F32 *curI, const F32 *curW, F32 *curO, U32 fw, U32 fh, U32 oStep, U32 iStep, U32 store, const F32 *curB, U32 dw, F32 *in_1, F32 *in_2) { + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups (%8), %%ymm1 \n\t" + "vmovups (%8), %%ymm2 \n\t" + "jmp 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups 0x20(%1), %%ymm1 \n\t" + "vmovups 0x40(%1), %%ymm2 \n\t" + ".align 16 \n\t" + "1: \n\t" + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + kernel1c8(3, %0, %10, %11, %2) + "add %9, %0 \n\t" + "add %9, %10 \n\t" + "add %9, %11 \n\t" + "add $0x100, %2 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + "add %6, %0 \n\t" + "add %6, %10 \n\t" + "add %6, %11 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + // relu + "and $0x6, %7 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + // relu6 + "and $0x4, %7 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "vmovups %%ymm2, 0x40(%1) \n\t" + : + : "r" (curI), "r" (curO), "r" (curW), "r" (fw), + "r" (I64(oStep)), "b" (fh), "r" (I64(iStep)), "r" (store), + "r" (curB), "r" (I64(dw)), "r" (in_1), "r" (in_2) + : "%ecx", + "%ymm0", "%ymm1", "%ymm2", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", + "memory", "cc"); +} + +void avx2_conv_kernel_1x8c8(F32 *curI, const F32 *curW, F32 *curO, U32 fw, U32 fh, U32 oStep, U32 iStep, U32 store, const F32 *curB, U32 dw, F32 
*in_1, F32 *in_2) { + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "jmp 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "vmovups (%1), %%ymm0 \n\t" + ".align 16 \n\t" + "1: \n\t" + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + kernel1c8(1, %0, 0, 0, %2) + "add %9, %0 \n\t" + "add $0x100, %2 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + "add %6, %0 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + // relu + "and $0x6, %7 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + // relu6 + "and $0x4, %7 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%1) \n\t" + : + : "r" (curI), "r" (curO), "r" (curW), "r" (fw), + "r" (I64(oStep)), "b" (fh), "r" (I64(iStep)), "r" (store), + "r" (curB), "r" (I64(dw)) + : "%ecx", + "%ymm0", "%ymm12", "%ymm15", + "memory", "cc"); +} + +EE convolution_direct(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if ((2 == fh) && (2 == fw)) { + return convolution_2x2_direct(inputDesc, inArray, filterDesc, filterArray, convParamSpec, + biasDesc, biasArray, tmpBytes, tmp, outputDesc, outArray, activationDesc); + } + + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if ((fdf != DF_NCHWCxN32) || (idf != DF_NCHWC8) || (ic % 8 != 0)) { + CHECK_STATUS(NOT_MATCH); + } + + F32 *ftmp = (F32 *)align_addr(tmp, 32); + filterArray = (F32 *)align_addr(filterArray, 32); + + U32 icAlignSize = 8; + U32 icPadding = (ic + icAlignSize - 1) / icAlignSize * icAlignSize; + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + I32 ohow = oh * ow; + + U32 oStep = oh * ow * UNROLL_OC_DIM * 4; + U32 iStep = (iw_pad - fw * dilateW + (dilateH - 1) * iw_pad) * UNROLL_IC_BLOCK_DIM * 4; + U32 sw = strideW * UNROLL_IC_BLOCK_DIM * 4; + U32 dw = dilateW * UNROLL_IC_BLOCK_DIM * 4; + kernel_func kernel[3][2] = {{avx2_conv_kernel_1x8c8, avx2_conv_kernel_3x8c8}, + {avx2_conv_kernel_1x16c8, avx2_conv_kernel_3x16c8}, + {avx2_conv_kernel_1x32c8, avx2_conv_kernel_3x32c8}}; + U32 ocblocks[3] = {8, 16, 32}; + +#ifdef _USE_OPENMP + U32 alpha = (ohow + OMP_NUM_THREADS * BLOCK_HW_DIM - 1) / (OMP_NUM_THREADS * BLOCK_HW_DIM); + U32 block_hw_dim = (ohow + OMP_NUM_THREADS * alpha - 1 ) / (OMP_NUM_THREADS * alpha); +#else + U32 block_hw_dim = BLOCK_HW_DIM; +#endif + + U32 icSize = 0; + U32 hwBlockNums = (ohow + block_hw_dim - 1 ) / block_hw_dim; + U32 ocBlockNums = oc / BLOCK_OC_DIM; + U32 ocbArray[4] = {0}; + U32 oc_remain = oc % 
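+// How block_hw_dim is sized under OpenMP (computed just above): alpha is
+// the number of BLOCK_HW_DIM-sized rounds needed to cover ohow with
+// OMP_NUM_THREADS workers, and the spatial extent is then re-divided
+// evenly over OMP_NUM_THREADS * alpha chunks. A worked example (purely
+// illustrative): ohow = 10000, 4 threads, BLOCK_HW_DIM = 128 gives
+//   alpha        = ceil(10000 / (4 * 128)) = 20
+//   block_hw_dim = ceil(10000 / (4 * 20)) = 125
+// i.e. 80 equal chunks of 125 pixels instead of 78 chunks of 128 plus a
+// 16-pixel tail. The remainder handling below then appends the sub-32
+// output-channel tail blocks (16- and/or 8-wide) to ocbArray so that hw
+// and oc blocks flatten into one parallel loop of
+// hwBlockNums * ocBlockNums work items.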
BLOCK_OC_DIM; + for (U32 i = 0, j = 0; i < oc_remain; i += icSize, ++j) { + icSize = ocblocks[(oc_remain - i)>>4]; + ocbArray[j + 1] = icSize + ocbArray[j]; + ++ocBlockNums; + } + U32 hwocBlockNums = hwBlockNums * ocBlockNums; + + for (U32 n = 0; n < in; ++n) { + if ((paddingT == 0) && (paddingB == 0) && (paddingL == 0) && (paddingR == 0)) { + ftmp = inArray; + } else { + PaddingNCHWC8(inArray, ftmp, inputDesc, convParamSpec); + } +#ifdef _USE_OPENMP +#pragma omp parallel num_threads(OMP_NUM_THREADS) + { +#endif + U32 private_icSize = icSize; + for (U32 icbb = 0; icbb < ic; icbb += private_icSize) { + private_icSize = UNI_MIN(BLOCK_IC_DIM, ic - icbb); +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (I32 bIdx = 0; bIdx < hwocBlockNums; ++bIdx) { + U32 hw = (bIdx / ocBlockNums) * block_hw_dim; + U32 hwSize = UNI_MIN(block_hw_dim, ohow - hw); + U32 ocIdx = bIdx % ocBlockNums; + U32 ocb = ocIdx * BLOCK_OC_DIM; + if (ocIdx > oc / BLOCK_OC_DIM) { + ocb += ocbArray[ocIdx - oc / BLOCK_OC_DIM]; + } + U32 ocSize = UNI_MIN(BLOCK_OC_DIM, oc - ocb); + ocSize = ocblocks[ocSize >> 4]; + const F32 *curB = biasArray + ocb; + U32 store = 0, icbSize = 0; + for (U32 icb = icbb; icb < icbb + private_icSize; icb += icbSize) { + icbSize = UNI_MIN(icbb + private_icSize - icb, UNROLL_IC_BLOCK_DIM); + const F32 *calW = filterArray + ocb * icPadding * fh * fw + ocSize * icb * fh * fw; + F32 *curI = ftmp + icb * ih_pad * iw_pad; + + store |= (icb > 0); + if (icb == ic - icbSize) { + store |= U32(activationDesc.mode) << 1; + } + U32 wSize = 0; + for (I32 ihw = hw; ihw < hw + hwSize; ihw += wSize) { + wSize = UNI_MIN(hw + hwSize - ihw, UNROLL_W); + if (wSize < 3) { + wSize = 1; + } + U32 in_h_0 = ihw / ow * strideH; + U32 in_w_0 = ihw % ow * strideW; + U32 in_h_1 = (ihw + 1) / ow * strideH; + U32 in_w_1 = (ihw + 1) % ow * strideW; + U32 in_h_2 = (ihw + 2) / ow * strideH; + U32 in_w_2 = (ihw + 2) % ow * strideW; + F32 *out_ptr = outArray + (n * oc + ocb) * ohow + ihw * 8; + F32 *in_0 = curI + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F32 *in_1 = curI + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F32 *in_2 = curI + in_h_2 * iw_pad * 8 + in_w_2 * 8; + + kernel[ocSize>>4][wSize>>1](in_0, calW, out_ptr, fw, fh, oStep, iStep, + store, curB, dw, in_1, in_2); + } + } + } + } +#ifdef _USE_OPENMP + } +#endif + inArray += ic * ih * iw; + outArray += oc * oh * ow; + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_direct_nchw.cpp b/compute/tensor/src/cpu/x86/fp32/convolution_direct_nchw.cpp new file mode 100644 index 00000000..7e0009ae --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/convolution_direct_nchw.cpp @@ -0,0 +1,1861 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" + +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/x86/fp32/transform_functions_fp32.h" + +#define UNROLL_W 4 +#define UNROLL_OC_DIM 8 +#define BLOCK_OC_DIM 24 +#define BLOCK_IC_DIM 32 +#define BLOCK_HW_DIM 768 +#define UNROLL_IC_BLOCK_DIM 8 +#define align_addr(addr, unit) (((uintptr_t)addr + unit - 1) / unit * unit) + +#define kernel4x3(m0, r0, r1, r2, r3, m1, m2, m3, m4) \ + "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \ + "vbroadcastss "#m0"("#r1"), %%ymm13 \n\t" \ + "vbroadcastss "#m0"("#r2"), %%ymm14 \n\t" \ + "vmovups "#m1"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" \ + "vmovups "#m2"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" \ + "vmovups "#m3"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" \ + "vmovups "#m4"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + +typedef void (*kernel_func)(F32 *in_0, F32 *in_1, F32 *in_2, F32 *in_3, const F32 *curW, F32 *curO, const F32 *curB, + U32 fw, U32 fh, U32 oStep, U32 hStep, U32 store, U32 dw, U32 ic, U32 iStep, U32 fwStep, U32 fhStep); + +void avx2_conv_kernel_3x32(F32 *in_0, F32 *in_1, F32 *in_2, F32 *in_3, const F32 *curW, F32 *curO, const F32 *curB, + U32 fw, U32 fh, U32 oStep, U32 hStep, U32 store, U32 dw, U32 ic, U32 iStep, U32 fwStep, U32 fhStep) { + __asm__ __volatile__("mov %3, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups (%1), %%ymm1 \n\t" + "vmovups (%1), %%ymm2 \n\t" + "vmovups 0x20(%1), %%ymm3 \n\t" + "vmovups 0x20(%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vmovups 0x40(%1), %%ymm6 \n\t" + "vmovups 0x40(%1), %%ymm7 \n\t" + "vmovups 0x40(%1), %%ymm8 \n\t" + "vmovups 0x60(%1), %%ymm9 \n\t" + "vmovups 0x60(%1), %%ymm10 \n\t" + "vmovups 0x60(%1), %%ymm11 \n\t" + "jmp 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %0, %%rax \n\t" + "vmovups (%%rax), %%ymm0 \n\t" + "vmovups 0x20(%%rax), %%ymm1 \n\t" + "vmovups 0x40(%%rax), %%ymm2 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm3 \n\t" + "vmovups 0x20(%%rax), %%ymm4 \n\t" + "vmovups 0x40(%%rax), %%ymm5 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm6 \n\t" + "vmovups 0x20(%%rax), %%ymm7 \n\t" + "vmovups 0x40(%%rax), %%ymm8 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm9 \n\t" + "vmovups 0x20(%%rax), %%ymm10 \n\t" + "vmovups 0x40(%%rax), %%ymm11 \n\t" + ".align 16 \n\t" + "1: \n\t" + : + : "r" (curO), "r" (curB), "r" (I64(oStep)), "r" (store) + : "%eax", "%rax", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", + "%ymm4", "%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "memory", "cc"); + + if ((fw == 7) && (fh > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + "mov %4, %%ebx \n\t" + ".align 16 \n\t" + "1: \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x0, 0x20, 0x40, 0x60) + "add %8, %0 
\n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x80, 0xA0, 0xC0, 0xE0) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x100, 0x120, 0x140, 0x160) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x180, 0x1A0, 0x1C0, 0x1E0) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x200, 0x220, 0x240, 0x260) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x280, 0x2A0, 0x2C0, 0x2E0) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x300, 0x320, 0x340, 0x360) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + "add $0x380, %3 \n\t" + "add %7, %0 \n\t" + "add %7, %1 \n\t" + "add %7, %2 \n\t" + "add %10, %3 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %11, %3 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r" (in_0), "r" (in_1), "r" (in_2), + "r" (curW), "r" (fh), "r" (fw), "a" (ic), + "r" (I64(hStep)), "r" (I64(dw)), "r" (I64(iStep)), "r" (I64(fwStep)), "r" (I64(fhStep)) + : "%ecx", "%ebx", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", + "%ymm4", "%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", + "memory", "cc"); + } else if ((fw == 5) && (fh > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + "mov %4, %%ebx \n\t" + ".align 16 \n\t" + "1: \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x0, 0x20, 0x40, 0x60) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x80, 0xA0, 0xC0, 0xE0) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x100, 0x120, 0x140, 0x160) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x180, 0x1A0, 0x1C0, 0x1E0) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x200, 0x220, 0x240, 0x260) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + "add $0x280, %3 \n\t" + "add %7, %0 \n\t" + "add %7, %1 \n\t" + "add %7, %2 \n\t" + "add %10, %3 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %11, %3 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r" (in_0), "r" (in_1), "r" (in_2), + "r" (curW), "r" (fh), "r" (fw), "a" (ic), + "r" (I64(hStep)), "r" (I64(dw)), "r" (I64(iStep)), "r" (I64(fwStep)), "r" (I64(fhStep)) + : "%ecx", "%ebx", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", + "%ymm4", "%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", + "memory", "cc"); + } else if ((fw == 3) && (fh == 3)) { + __asm__ __volatile__("add %8, %7 \n\t" + ".align 16 \n\t" + "0: \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x0, 0x20, 0x40, 0x60) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x80, 0xA0, 0xC0, 0xE0) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x100, 0x120, 0x140, 0x160) + "add %7, %0 \n\t" + "add %7, %1 \n\t" + "add %7, %2 \n\t" + "add %10, %3 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x180, 0x1A0, 0x1C0, 0x1E0) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x200, 0x220, 0x240, 0x260) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x280, 0x2A0, 0x2C0, 0x2E0) 
+ "add %7, %0 \n\t" + "add %7, %1 \n\t" + "add %7, %2 \n\t" + "add %10, %3 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x300, 0x320, 0x340, 0x360) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x380, 0x3A0, 0x3C0, 0x3E0) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x400, 0x420, 0x440, 0x460) + "add %7, %0 \n\t" + "add %7, %1 \n\t" + "add %7, %2 \n\t" + "add %10, %3 \n\t" + "add $0x480, %3 \n\t" + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %11, %3 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r" (in_0), "r" (in_1), "r" (in_2), + "r" (curW), "r" (fh), "r" (fw), "a" (ic), + "r" (I64(hStep)), "r" (I64(dw)), "r" (I64(iStep)), "r" (I64(fwStep)), "r" (I64(fhStep)) + : "%ecx", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", + "%ymm4", "%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", + "memory", "cc"); + } else if ((fh > 0) && (fw > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + "mov %4, %%ebx \n\t" + ".align 16 \n\t" + "1: \n\t" + "mov %5, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x0, 0x20, 0x40, 0x60) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + "add $0x80, %3 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + "add %7, %0 \n\t" + "add %7, %1 \n\t" + "add %7, %2 \n\t" + "add %10, %3 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %11, %3 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r" (in_0), "r" (in_1), "r" (in_2), + "r" (curW), "r" (fh), "r" (fw), "a" (ic), + "r" (I64(hStep)), "r" (I64(dw)), "r" (I64(iStep)), "r" (I64(fwStep)), "r" (I64(fhStep)) + : "%ecx", "%ebx", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", + "%ymm4", "%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", + "memory", "cc"); + } + + __asm__ __volatile__( + // relu + "and $0x6, %2 \n\t" + "je 0f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm15, %%ymm11, %%ymm11 \n\t" + + // relu6 + "and $0x4, %2 \n\t" + "je 0f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + "vminps %%ymm12, %%ymm11, %%ymm11 \n\t" + + "0: \n\t" + "vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm1, 0x20(%0) \n\t" + "vmovups %%ymm2, 0x40(%0) \n\t" + "add %1, %0 \n\t" + "vmovups %%ymm3, (%0) \n\t" + "vmovups %%ymm4, 0x20(%0) \n\t" + "vmovups %%ymm5, 0x40(%0) \n\t" + "add %1, %0 \n\t" + "vmovups %%ymm6, (%0) \n\t" + "vmovups %%ymm7, 0x20(%0) \n\t" + "vmovups %%ymm8, 0x40(%0) \n\t" + "add %1, %0 
\n\t" + "vmovups %%ymm9, (%0) \n\t" + "vmovups %%ymm10, 0x20(%0) \n\t" + "vmovups %%ymm11, 0x40(%0) \n\t" + : + : "r"(curO), "r"(I64(oStep)), "r"(store) + : "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_1x32(F32 *in_0, + F32 *in_1, + F32 *in_2, + F32 *in_3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 hStep, + U32 store, + U32 dw, + U32 ic, + U32 iStep, + U32 fwStep, + U32 fhStep) +{ + __asm__ __volatile__("mov %3, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups 0x20(%1), %%ymm3 \n\t" + "vmovups 0x40(%1), %%ymm6 \n\t" + "vmovups 0x60(%1), %%ymm9 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "mov %0, %%rax \n\t" + "vmovups (%%rax), %%ymm0 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm3 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm6 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm9 \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store) + : "%eax", "%rax", "%ymm0", "%ymm3", "%ymm6", "%ymm9", "memory", "cc"); + + if ((fh == 3) && (fw == 3)) { + __asm__ __volatile__("add %8, %7 \n\t" + ".align 16 \n\t" + "0: \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps (%3), %%ymm11 \n\t" + "vmovaps 0x20(%3), %%ymm13 \n\t" + "vmovaps 0x40(%3), %%ymm14 \n\t" + "vmovaps 0x60(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %8, %0 \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps 0x80(%3), %%ymm11 \n\t" + "vmovaps 0xA0(%3), %%ymm13 \n\t" + "vmovaps 0xC0(%3), %%ymm14 \n\t" + "vmovaps 0xE0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %8, %0 \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps 0x100(%3), %%ymm11 \n\t" + "vmovaps 0x120(%3), %%ymm13 \n\t" + "vmovaps 0x140(%3), %%ymm14 \n\t" + "vmovaps 0x160(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %7, %0 \n\t" + "add %10, %3 \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps 0x180(%3), %%ymm11 \n\t" + "vmovaps 0x1A0(%3), %%ymm13 \n\t" + "vmovaps 0x1C0(%3), %%ymm14 \n\t" + "vmovaps 0x1E0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %8, %0 \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps 0x200(%3), %%ymm11 \n\t" + "vmovaps 0x220(%3), %%ymm13 \n\t" + "vmovaps 0x240(%3), %%ymm14 \n\t" + "vmovaps 0x260(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %8, %0 \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps 0x280(%3), %%ymm11 \n\t" + "vmovaps 0x2A0(%3), %%ymm13 \n\t" + "vmovaps 0x2C0(%3), %%ymm14 \n\t" + "vmovaps 0x2E0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps 
%%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %7, %0 \n\t" + "add %10, %3 \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps 0x300(%3), %%ymm11 \n\t" + "vmovaps 0x320(%3), %%ymm13 \n\t" + "vmovaps 0x340(%3), %%ymm14 \n\t" + "vmovaps 0x360(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %8, %0 \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps 0x380(%3), %%ymm11 \n\t" + "vmovaps 0x3A0(%3), %%ymm13 \n\t" + "vmovaps 0x3C0(%3), %%ymm14 \n\t" + "vmovaps 0x3E0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %8, %0 \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps 0x400(%3), %%ymm11 \n\t" + "vmovaps 0x420(%3), %%ymm13 \n\t" + "vmovaps 0x440(%3), %%ymm14 \n\t" + "vmovaps 0x460(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %7, %0 \n\t" + "add %10, %3 \n\t" + "add $0x480, %3 \n\t" + "add %9, %0 \n\t" + "add %11, %3 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r" (in_0), "r" (in_1), "r" (in_2), + "r" (curW), "r" (fh), "r" (fw), "a" (ic), + "r" (I64(hStep)), "r" (I64(dw)), "r" (I64(iStep)), "r" (I64(fwStep)), "r" (I64(fhStep)) + : + "%ymm0", "%ymm3", "%ymm6", "%ymm9", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", + "memory", "cc"); + } else if ((fh > 0) && (fw > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + "mov %4, %%ebx \n\t" + ".align 16 \n\t" + "1: \n\t" + "mov %5, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps (%3), %%ymm11 \n\t" + "vmovaps 0x20(%3), %%ymm13 \n\t" + "vmovaps 0x40(%3), %%ymm14 \n\t" + "vmovaps 0x60(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %8, %0 \n\t" + "add $0x80, %3 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + "add %7, %0 \n\t" + "add %10, %3 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + "add %9, %0 \n\t" + "add %11, %3 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r" (in_0), "r" (in_1), "r" (in_2), + "r" (curW), "r" (fh), "r" (fw), "a" (ic), + "r" (I64(hStep)), "r" (I64(dw)), "r" (I64(iStep)), "r" (I64(fwStep)), "r" (I64(fhStep)) + : "%ecx", "%ebx", + "%ymm0", "%ymm3", "%ymm6", "%ymm9", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", + "memory", "cc"); + } + + __asm__ __volatile__( + // relu + "and $0x6, %2 \n\t" + "je 0f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + + // relu6 + "and $0x4, %2 \n\t" + "je 0f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + + "0: \n\t" + "vmovups %%ymm0, (%0) \n\t" + "add %1, %0 \n\t" + "vmovups %%ymm3, (%0) \n\t" + "add %1, %0 \n\t" + "vmovups %%ymm6, (%0) \n\t" + "add %1, %0 \n\t" + "vmovups %%ymm9, (%0) 
\n\t" + : + : "r"(curO), "r"(I64(oStep)), "r"(store) + : "%ecx", "%ymm0", "%ymm3", "%ymm6", "%ymm9", "%ymm12", "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_4x24(F32 *in_0, + F32 *in_1, + F32 *in_2, + F32 *in_3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 hStep, + U32 store, + U32 dw, + U32 ic, + U32 iStep, + U32 fwStep, + U32 fhStep) +{ + __asm__ __volatile__("mov %3, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups (%1), %%ymm1 \n\t" + "vmovups (%1), %%ymm2 \n\t" + "vmovups (%1), %%ymm3 \n\t" + "vmovups 0x20(%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vmovups 0x20(%1), %%ymm6 \n\t" + "vmovups 0x20(%1), %%ymm7 \n\t" + "vmovups 0x40(%1), %%ymm8 \n\t" + "vmovups 0x40(%1), %%ymm9 \n\t" + "vmovups 0x40(%1), %%ymm10 \n\t" + "vmovups 0x40(%1), %%ymm11 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%0), %%ymm0 \n\t" + "vmovups 0x20(%0), %%ymm1 \n\t" + "vmovups 0x40(%0), %%ymm2 \n\t" + "vmovups 0x60(%0), %%ymm3 \n\t" + "vmovups (%0, %2), %%ymm4 \n\t" + "vmovups 0x20(%0, %2), %%ymm5 \n\t" + "vmovups 0x40(%0, %2), %%ymm6 \n\t" + "vmovups 0x60(%0, %2), %%ymm7 \n\t" + "vmovups (%0, %2, 2), %%ymm8 \n\t" + "vmovups 0x20(%0, %2, 2), %%ymm9 \n\t" + "vmovups 0x40(%0, %2, 2), %%ymm10 \n\t" + "vmovups 0x60(%0, %2, 2), %%ymm11 \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store) + : "%eax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", + "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "memory", "cc"); + + if ((fw == 3) && (fh == 3)) { + __asm__ __volatile__("mov %7, %%eax \n\t" + ".align 16 \n\t" + "0: \n\t" + + "vmovaps (%4), %%ymm12 \n\t" + "vmovaps 0x20(%4), %%ymm13 \n\t" + "vmovaps 0x40(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + + "vmovaps 0x60(%4), %%ymm12 \n\t" + "vmovaps 0x80(%4), %%ymm13 \n\t" + "vmovaps 0xA0(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + + "vmovaps 0xC0(%4), %%ymm12 \n\t" + "vmovaps 0xE0(%4), %%ymm13 \n\t" + 
"vmovaps 0x100(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + "add %8, %3 \n\t" + "add %11, %4 \n\t" + + "vmovaps 0x120(%4), %%ymm12 \n\t" + "vmovaps 0x140(%4), %%ymm13 \n\t" + "vmovaps 0x160(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + + "vmovaps 0x180(%4), %%ymm12 \n\t" + "vmovaps 0x1A0(%4), %%ymm13 \n\t" + "vmovaps 0x1C0(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + + "vmovaps 0x1E0(%4), %%ymm12 \n\t" + "vmovaps 0x200(%4), %%ymm13 \n\t" + "vmovaps 0x220(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add 
%9, %2 \n\t" + "add %9, %3 \n\t" + + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + "add %8, %3 \n\t" + "add %11, %4 \n\t" + + "vmovaps 0x240(%4), %%ymm12 \n\t" + "vmovaps 0x260(%4), %%ymm13 \n\t" + "vmovaps 0x280(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + + "vmovaps 0x2A0(%4), %%ymm12 \n\t" + "vmovaps 0x2C0(%4), %%ymm13 \n\t" + "vmovaps 0x2E0(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + + "vmovaps 0x300(%4), %%ymm12 \n\t" + "vmovaps 0x320(%4), %%ymm13 \n\t" + "vmovaps 0x340(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + "add %8, %3 \n\t" + + "add $0x360, %4 \n\t" + "add %10, %0 \n\t" + "add %10, %1 \n\t" + "add %10, %2 \n\t" + "add %10, %3 \n\t" + "add %12, %4 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r"(in_0), "r"(in_1), "r"(in_2), "r"(in_3), "r"(curW), "r"(fh), + "r"(fw), "r"(ic), "r"(I64(hStep)), "r"(I64(dw)), "r"(I64(iStep)), + "r"(I64(fwStep)), "r"(I64(fhStep)) + : "%ecx", "%ebx", "%eax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", + "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); + + } else if ((fh > 0) && (fw > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + + "mov %5, %%ebx \n\t" + ".align 
16 \n\t" + "1: \n\t" + + "mov %6, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "vmovaps (%4), %%ymm12 \n\t" + "vmovaps 0x20(%4), %%ymm13 \n\t" + "vmovaps 0x40(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + "add $0x60, %4 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + "add %8, %3 \n\t" + "add %11, %4 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + + "add %10, %0 \n\t" + "add %10, %1 \n\t" + "add %10, %2 \n\t" + "add %10, %3 \n\t" + "add %12, %4 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r"(in_0), "r"(in_1), "r"(in_2), "r"(in_3), "r"(curW), "r"(fh), + "r"(fw), "a"(ic), "r"(I64(hStep)), "r"(I64(dw)), "r"(I64(iStep)), + "r"(I64(fwStep)), "r"(I64(fhStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", + "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); + } + + __asm__ __volatile__( + // relu + "and $0x6, %2 \n\t" + "je 0f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm15, %%ymm11, %%ymm11 \n\t" + + // relu6 + "and $0x4, %2 \n\t" + "je 0f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + "vminps %%ymm12, %%ymm11, %%ymm11 \n\t" + + "0: \n\t" + "vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm1, 0x20(%0) \n\t" + "vmovups %%ymm2, 0x40(%0) \n\t" + "vmovups %%ymm3, 0x60(%0) \n\t" + "vmovups %%ymm4, (%0, %1) \n\t" + "vmovups %%ymm5, 0x20(%0, %1) \n\t" + "vmovups %%ymm6, 0x40(%0, %1) \n\t" + "vmovups %%ymm7, 0x60(%0, %1) \n\t" + "vmovups %%ymm8, (%0, %1, 2) \n\t" + "vmovups %%ymm9, 0x20(%0, %1, 2) \n\t" + "vmovups %%ymm10, 0x40(%0, %1, 2) \n\t" + "vmovups %%ymm11, 0x60(%0, %1, 2) \n\t" + : + : "r"(curO), "r"(I64(oStep)), "r"(store) + : "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm15", 
"memory", "cc"); +} + +void avx2_conv_kernel_1x24(F32 *in_0, + F32 *in_1, + F32 *in_2, + F32 *in_3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 hStep, + U32 store, + U32 dw, + U32 ic, + U32 iStep, + U32 fwStep, + U32 fhStep) +{ + __asm__ __volatile__("mov %3, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups 0x20(%1), %%ymm4 \n\t" + "vmovups 0x40(%1), %%ymm8 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%0), %%ymm0 \n\t" + "vmovups (%0, %2), %%ymm4 \n\t" + "vmovups (%0, %2, 2), %%ymm8 \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store) + : "%eax", "%ymm0", "%ymm4", "%ymm8", "memory", "cc"); + + if ((fh > 0) && (fw > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ebx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "vmovaps (%1), %%ymm12 \n\t" + "vmovaps 0x20(%1), %%ymm13 \n\t" + "vmovaps 0x40(%1), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + + "add %6, %0 \n\t" + "add $0x60, %1 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + + "add %5, %0 \n\t" + "add %8, %1 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + + "add %7, %0 \n\t" + "add %9, %1 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r"(in_0), "r"(curW), "r"(fh), "r"(fw), "a"(ic), "r"(I64(hStep)), + "r"(I64(dw)), "r"(I64(iStep)), "r"(I64(fwStep)), "r"(I64(fhStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm4", "%ymm8", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); + } + + __asm__ __volatile__( + // relu + "and $0x6, %2 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + + // relu6 + "and $0x4, %2 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + + "3: \n\t" + "vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm4, (%0, %1) \n\t" + "vmovups %%ymm8, (%0, %1, 2) \n\t" + : + : "r"(curO), "r"(I64(oStep)), "r"(store) + : "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_4x16(F32 *in_0, + F32 *in_1, + F32 *in_2, + F32 *in_3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 hStep, + U32 store, + U32 dw, + U32 ic, + U32 iStep, + U32 fwStep, + U32 fhStep) +{ + __asm__ __volatile__("mov %3, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups (%1), %%ymm1 \n\t" + "vmovups (%1), %%ymm2 \n\t" + "vmovups (%1), %%ymm3 \n\t" + "vmovups 0x20(%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vmovups 0x20(%1), %%ymm6 \n\t" + "vmovups 0x20(%1), %%ymm7 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%0), %%ymm0 \n\t" + "vmovups 0x20(%0), %%ymm1 \n\t" + "vmovups 0x40(%0), %%ymm2 \n\t" + "vmovups 0x60(%0), %%ymm3 \n\t" + "vmovups (%0, %2), %%ymm4 \n\t" + "vmovups 0x20(%0, %2), %%ymm5 \n\t" + "vmovups 0x40(%0, %2), %%ymm6 \n\t" + "vmovups 0x60(%0, %2), %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"(curB), "r"(I64(oStep)), 
"r"(store) + : "%eax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", + "%ymm7", "memory", "cc"); + + if ((fh > 0) && (fw > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + + "mov %5, %%ebx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "mov %6, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "vmovaps (%4), %%ymm12 \n\t" + "vmovaps 0x20(%4), %%ymm13 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + "add $0x40, %4 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + "add %8, %3 \n\t" + "add %11, %4 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + + "add %10, %0 \n\t" + "add %10, %1 \n\t" + "add %10, %2 \n\t" + "add %10, %3 \n\t" + "add %12, %4 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r"(in_0), "r"(in_1), "r"(in_2), "r"(in_3), "r"(curW), "r"(fh), + "r"(fw), "a"(ic), "r"(I64(hStep)), "r"(I64(dw)), "r"(I64(iStep)), + "r"(I64(fwStep)), "r"(I64(fhStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm12", "%ymm13", "%ymm15", "memory", "cc"); + } + + __asm__ __volatile__( + // relu + "and $0x6, %2 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + + // relu6 + "and $0x4, %2 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + + "3: \n\t" + "vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm1, 0x20(%0) \n\t" + "vmovups %%ymm2, 0x40(%0) \n\t" + "vmovups %%ymm3, 0x60(%0) \n\t" + "vmovups %%ymm4, (%0, %1) \n\t" + "vmovups %%ymm5, 0x20(%0, %1) \n\t" + "vmovups %%ymm6, 0x40(%0, %1) \n\t" + "vmovups %%ymm7, 0x60(%0, %1) \n\t" + : + : "r"(curO), "r"(I64(oStep)), "r"(store) + : "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm12", + "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_1x16(F32 *in_0, + F32 *in_1, + F32 *in_2, + F32 *in_3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 hStep, + U32 store, + U32 dw, + U32 ic, + U32 iStep, + U32 fwStep, + U32 fhStep) +{ + __asm__ __volatile__("mov %3, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups 0x20(%1), %%ymm4 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%0), %%ymm0 \n\t" + "vmovups (%0, %2), %%ymm4 \n\t" + + ".align 16 \n\t" + "1: \n\t" 
+ : + : "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store) + : "%eax", "%ymm0", "%ymm4", "memory", "cc"); + + if ((fh > 0) && (fw > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ebx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "vmovaps (%1), %%ymm12 \n\t" + "vmovaps 0x20(%1), %%ymm13 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "add %6, %0 \n\t" + "add $0x40, %1 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + + "add %5, %0 \n\t" + "add %8, %1 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + + "add %7, %0 \n\t" + "add %9, %1 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r"(in_0), "r"(curW), "r"(fh), "r"(fw), "a"(ic), "r"(I64(hStep)), + "r"(I64(dw)), "r"(I64(iStep)), "r"(I64(fwStep)), "r"(I64(fhStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm4", "%ymm8", "%ymm12", "%ymm13", + "%ymm15", "memory", "cc"); + } + + __asm__ __volatile__( + // relu + "and $0x6, %2 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + + // relu6 + "and $0x4, %2 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + + "3: \n\t" + "vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm4, (%0, %1) \n\t" + : + : "r"(curO), "r"(I64(oStep)), "r"(store) + : "%ecx", "%ymm0", "%ymm4", "%ymm12", "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_4x8(F32 *in_0, + F32 *in_1, + F32 *in_2, + F32 *in_3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 hStep, + U32 store, + U32 dw, + U32 ic, + U32 iStep, + U32 fwStep, + U32 fhStep) +{ + __asm__ __volatile__("mov %2, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups (%1), %%ymm1 \n\t" + "vmovups (%1), %%ymm2 \n\t" + "vmovups (%1), %%ymm3 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%0), %%ymm0 \n\t" + "vmovups 0x20(%0), %%ymm1 \n\t" + "vmovups 0x40(%0), %%ymm2 \n\t" + "vmovups 0x60(%0), %%ymm3 \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"(curB), "r"(store) + : "%eax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory", "cc"); + + if ((fh > 0) && (fw > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + + "mov %5, %%ebx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "mov %6, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "vmovaps (%4), %%ymm12 \n\t" + "vbroadcastss (%0), %%ymm11 \n\t" + "vbroadcastss (%1), %%ymm13 \n\t" + "vbroadcastss (%2), %%ymm14 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + "add $0x20, %4 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + "add %8, %3 \n\t" + "add %11, %4 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + + "add %10, %0 \n\t" + "add %10, %1 \n\t" + "add %10, %2 \n\t" + "add %10, %3 \n\t" + "add %12, %4 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r"(in_0), "r"(in_1), "r"(in_2), "r"(in_3), "r"(curW), "r"(fh), + "r"(fw), "a"(ic), "r"(I64(hStep)), "r"(I64(dw)), "r"(I64(iStep)), + "r"(I64(fwStep)), "r"(I64(fhStep)) + : "%ecx", "%ebx", "%ymm0", 
"%ymm1", "%ymm2", "%ymm3", "%ymm11", + "%ymm14", "%ymm12", "%ymm13", "%ymm15", "memory", "cc"); + } + + __asm__ __volatile__( + // relu + "and $0x6, %1 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + + // relu6 + "and $0x4, %1 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + + "3: \n\t" + "vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm1, 0x20(%0) \n\t" + "vmovups %%ymm2, 0x40(%0) \n\t" + "vmovups %%ymm3, 0x60(%0) \n\t" + : + : "r"(curO), "r"(store) + : "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm12", "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_1x8(F32 *in_0, + F32 *in_1, + F32 *in_2, + F32 *in_3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 hStep, + U32 store, + U32 dw, + U32 ic, + U32 iStep, + U32 fwStep, + U32 fhStep) +{ + __asm__ __volatile__("mov %2, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%0), %%ymm0 \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"(curB), "r"(store) + : "%eax", "%ymm0", "memory", "cc"); + + if ((fh > 0) && (fw > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ebx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "vmovaps (%1), %%ymm12 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "add %6, %0 \n\t" + "add $0x20, %1 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + + "add %5, %0 \n\t" + "add %8, %1 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + + "add %7, %0 \n\t" + "add %9, %1 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r"(in_0), "r"(curW), "r"(fh), "r"(fw), "a"(ic), "r"(I64(hStep)), + "r"(I64(dw)), "r"(I64(iStep)), "r"(I64(fwStep)), "r"(I64(fhStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm12", "%ymm15", "memory", "cc"); + } + + __asm__ __volatile__( + // relu + "and $0x6, %1 \n\t" + "je 0f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + + // relu6 + "and $0x4, %1 \n\t" + "je 0f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + + "0: \n\t" + "vmovups %%ymm0, (%0) \n\t" + : + : "r"(curO), "r"(store) + : "%ecx", "%ymm0", "%ymm12", "%ymm15", "memory", "cc"); +} + +EE convolution_direct_nchw(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + I32 strideH = convParamSpec.stride_h; + I32 strideW = convParamSpec.stride_w; + I32 paddingT = 
convParamSpec.padding_top; + I32 paddingB = convParamSpec.padding_bottom; + I32 paddingL = convParamSpec.padding_left; + I32 paddingR = convParamSpec.padding_right; + I32 dilateH = convParamSpec.dilatedRate_h; + I32 dilateW = convParamSpec.dilatedRate_w; + + if (((fdf != DF_NCHWCxN24) && (fdf != DF_NCHWCxN32)) || (idf != DF_NCHW)) { + CHECK_STATUS(NOT_MATCH); + } + + F32 *curI, *curO, *calI, *calO; + const F32 *curW, *curB, *calW; + F32 *ftmp = inArray; + filterArray = (F32 *)align_addr(filterArray, 32); + + U32 oStep = oh * ow * UNROLL_OC_DIM * 4; + U32 iStep = ((ih - fh) * iw) * 4; + U32 hStep = (iw - fw * dilateW + (dilateH - 1) * iw) * 4; + U32 dw = dilateW * 4; + U32 wSize = 0, store = 0, ocSize = 0, icSize = 0, hwSize = 0, icbSize = 0; + I32 ih_idx = 0; + kernel_func kernel[4][2] = {{avx2_conv_kernel_1x8, avx2_conv_kernel_4x8}, + {avx2_conv_kernel_1x16, avx2_conv_kernel_4x16}, + {avx2_conv_kernel_1x24, avx2_conv_kernel_4x24}, + {avx2_conv_kernel_1x32, avx2_conv_kernel_3x32}}; + U32 ocblocks[4] = {8, 16, 24, 32}; + U32 wblocks[4] = {4, 4, 4, 3}; + U32 unroll_w = UNROLL_W, unroll_oc = BLOCK_OC_DIM; + I32 ohow = oh * ow; + + U32 k24 = (oc + 23) / 24 * (ohow + 3) / 4; + U32 k32 = (oc + 31) / 32 * (ohow + 2) / 3; + if (k32 < k24) { + unroll_oc = 32; + } + + I32 oh_padding_t = 0; + I32 oh_padding_b = 0; + + for (U32 n = 0; n < in; ++n) { + store = 0; + for (U32 icbb = 0; icbb < ic; icbb += icSize) { + icSize = UNI_MIN(BLOCK_IC_DIM, ic - icbb); + store |= (icbb > 0); + if (icbb == ic - icSize) { + store |= U32(activationDesc.mode) << 1; + } + if ((paddingL == 0) && (paddingR == 0) && (paddingT != 0 || paddingB != 0)) { + oh_padding_t = UNI_MIN((paddingT - 1) / strideH + 1, oh); + oh_padding_b = UNI_MIN((paddingB - 1) / strideH + 1, oh - oh_padding_t); + if (((ih + paddingT - fh) / strideH + 1) >= oh) { + oh_padding_b = 0; + } + for (U32 ocb = 0; ocb < oc; ocb += ocSize) { + ocSize = UNI_MIN(unroll_oc, oc - ocb); + ocSize = ocblocks[(ocSize >> 3) - 1]; + unroll_w = wblocks[(ocSize >> 3) - 1]; + curW = filterArray + ocb * ic * fh * fw + ocSize * icbb * fh * fw; + curB = biasArray + ocb; + curI = ftmp + icbb * ih * iw; + for (I32 h = 0; h < oh_padding_t; ++h) { + I32 in_h_0 = h * strideH - paddingT; + U32 tfh = UNI_MIN(fh + in_h_0, ih); + iStep = ((ih - tfh) * iw) * 4; + for (I32 w = 0; w < ow; w += wSize) { + wSize = UNI_MIN(ow - w, unroll_w); + if (wSize < unroll_w) { + wSize = 1; + } + U32 in_w_0 = w * strideW; + U32 in_w_1 = (w + 1) * strideW; + U32 in_w_2 = (w + 2) * strideW; + U32 in_w_3 = (w + 3) * strideW; + F32 *out_ptr = outArray + (n * oc + ocb) * ohow + (h * ow + w) * 8; + F32 *in_0 = curI + in_w_0; + F32 *in_1 = curI + in_w_1; + F32 *in_2 = curI + in_w_2; + F32 *in_3 = curI + in_w_3; + kernel[(ocSize >> 3) - 1][wSize > 1](in_0, in_1, in_2, in_3, curW + (fh - tfh) * fw * ocSize, + out_ptr, curB, fw, tfh, oStep, hStep, store, dw, icSize, iStep, 0, fw * (fh - tfh) * ocSize * 4); + } + } + } + } + if ((paddingL == 0) && (paddingR == 0)) { + iStep = ((ih - fh) * iw) * 4; + for (I32 hw = oh_padding_t * ow; hw < ohow - oh_padding_b * ow; hw += hwSize) { + hwSize = UNI_MIN(BLOCK_HW_DIM, ohow - oh_padding_b * ow - hw); + for (U32 ocb = 0; ocb < oc; ocb += ocSize) { + ocSize = UNI_MIN(unroll_oc, oc - ocb); + ocSize = ocblocks[(ocSize >> 3) - 1]; + unroll_w = wblocks[(ocSize >> 3) - 1]; + curW = filterArray + ocb * ic * fh * fw + ocSize * icbb * fh * fw; + curB = biasArray + ocb; + curI = ftmp + icbb * ih * iw; + for (I32 ihw = hw; ihw < hw + hwSize; ihw += wSize) { + wSize = UNI_MIN(hw + 
hwSize - ihw, unroll_w); + if (wSize < unroll_w) { + wSize = 1; + } + U32 in_h_0 = ihw / ow * strideH - paddingT; + U32 in_w_0 = ihw % ow * strideW; + U32 in_h_1 = (ihw + 1) / ow * strideH - paddingT; + U32 in_w_1 = (ihw + 1) % ow * strideW; + U32 in_h_2 = (ihw + 2) / ow * strideH - paddingT; + U32 in_w_2 = (ihw + 2) % ow * strideW; + U32 in_h_3 = (ihw + 3) / ow * strideH - paddingT; + U32 in_w_3 = (ihw + 3) % ow * strideW; + F32 *out_ptr = outArray + (n * oc + ocb) * ohow + ihw * 8; + F32 *in_0 = curI + in_h_0 * iw + in_w_0; + F32 *in_1 = curI + in_h_1 * iw + in_w_1; + F32 *in_2 = curI + in_h_2 * iw + in_w_2; + F32 *in_3 = curI + in_h_3 * iw + in_w_3; + kernel[(ocSize >> 3) - 1][wSize > 1](in_0, in_1, in_2, in_3, curW, + out_ptr, curB, fw, fh, oStep, hStep, store, dw, icSize, iStep, 0, 0); + } + } + } + } + if ((paddingL == 0) && (paddingR == 0) && (paddingT != 0 || paddingB != 0)) { + for (U32 ocb = 0; ocb < oc; ocb += ocSize) { + ocSize = UNI_MIN(unroll_oc, oc - ocb); + ocSize = ocblocks[(ocSize >> 3) - 1]; + unroll_w = wblocks[(ocSize >> 3) - 1]; + curW = filterArray + ocb * ic * fh * fw + ocSize * icbb * fh * fw; + curB = biasArray + ocb; + curI = ftmp + icbb * ih * iw; + for (I32 h = oh - oh_padding_b; h < oh; ++h) { + I32 in_h_0 = h * strideH - paddingT; + U32 tfh = ih - in_h_0; + iStep = ((ih - tfh) * iw) * 4; + for (I32 w = 0; w < ow; w += wSize) { + wSize = UNI_MIN(ow - w, unroll_w); + if (wSize < unroll_w) { + wSize = 1; + } + U32 in_w_0 = w * strideW; + U32 in_w_1 = (w + 1) * strideW; + U32 in_w_2 = (w + 2) * strideW; + U32 in_w_3 = (w + 3) * strideW; + F32 *out_ptr = outArray + (n * oc + ocb) * ohow + (h * ow + w) * 8; + F32 *in_0 = curI + in_h_0 * iw + in_w_0; + F32 *in_1 = curI + in_h_0 * iw + in_w_1; + F32 *in_2 = curI + in_h_0 * iw + in_w_2; + F32 *in_3 = curI + in_h_0 * iw + in_w_3; + kernel[(ocSize >> 3) - 1][wSize > 1](in_0, in_1, in_2, in_3, curW, + out_ptr, curB, fw, tfh, oStep, hStep, store, dw, icSize, iStep, 0, fw * (fh - tfh) * ocSize * 4); + } + } + } + } + if ((paddingL != 0) || (paddingR != 0)) { + I32 tfw = fw, tfh = fh, wh = 0; + I32 in_h = 0, in_w = 0; + I32 ow_padding_l = UNI_MIN((paddingL - 1) / strideW + 1, ow); + I32 ow_padding_r = UNI_MIN((paddingR - 1) / strideW + 1, ow - ow_padding_l); + if (((iw + paddingL - fw) / strideW + 1) >= ow) { + ow_padding_r = 0; + } + for (I32 h = 0; h < oh; ++h) { + tfh = fh; + in_h = h * strideH - paddingT; + calW = curW; + wh = 0; + if (in_h < 0) { + tfh = UNI_MIN(fh + in_h, ih); + in_h = 0; + wh = fh - tfh; + } else if (in_h + fh >= ih) { + tfh = ih - in_h; + curW = filterArray; + } + iStep = ((ih - tfh) * iw) * 4; + for (U32 ocb = 0; ocb < oc; ocb += ocSize) { + ocSize = UNI_MIN(unroll_oc, oc - ocb); + ocSize = ocblocks[(ocSize >> 3) - 1]; + unroll_w = wblocks[(ocSize >> 3) - 1]; + curW = filterArray + ocb * ic * fh * fw + ocSize * icbb * fh * fw + + wh * fw * ocSize; + curB = biasArray + ocb; + curI = ftmp + icbb * ih * iw + in_h * iw; + curO = outArray + ocb * ohow + h * ow * 8; + I32 w = 0; + for (; w < ow_padding_l; ++w) { + I32 in_w = w * strideW - paddingL; + tfw = UNI_MIN(fw + in_w, iw); + const F32 *useW = curW + (fw - tfw) * ocSize; + hStep = (iw - tfw * dilateW + (dilateH - 1) * iw) * 4; + calO = curO + w * 8; + kernel[(ocSize >> 3) - 1][0](curI, nullptr, nullptr, nullptr, useW, + calO, curB, tfw, tfh, oStep, hStep, store, dw, icSize, iStep, + (fw - tfw) * ocSize * 4, fw * (fh - tfh) * ocSize * 4); + } + for (; w < ow - ow_padding_r; w += wSize) { + hStep = (iw - fw * dilateW + (dilateH - 1) * iw) * 4; + 
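+ // hStep is the byte step from just past the last tap of one filter row to the first tap of the next: (iw - fw * dilateW) columns plus (dilateH - 1) skipped rows, times 4 bytes per F32; the left/right edge loops around this one recompute it with the clipped width tfw instead of fw.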
wSize = UNI_MIN(ow - ow_padding_r - w, unroll_w); + if (wSize < unroll_w) { + wSize = 1; + } + F32 *in_0 = curI + w * strideW - paddingL; + F32 *in_1 = curI + (w + 1) * strideW - paddingL; + F32 *in_2 = curI + (w + 2) * strideW - paddingL; + F32 *in_3 = curI + (w + 3) * strideW - paddingL; + calO = curO + w * 8; + kernel[(ocSize >> 3) - 1][wSize > 1](in_0, in_1, in_2, in_3, curW, calO, + curB, fw, tfh, oStep, hStep, store, dw, icSize, iStep, 0, + fw * (fh - tfh) * ocSize * 4); + } + for (; w < ow; ++w) { + I32 in_w = w * strideW - paddingL; + tfw = iw - in_w; + hStep = (iw - tfw * dilateW + (dilateH - 1) * iw) * 4; + F32 *in_0 = curI + in_w; + calO = curO + w * 8; + kernel[(ocSize >> 3) - 1][0](in_0, nullptr, nullptr, nullptr, curW, + calO, curB, tfw, tfh, oStep, hStep, store, dw, icSize, iStep, + (fw - tfw) * ocSize * 4, fw * (fh - tfh) * ocSize * 4); + } + } + } + } + } + inArray += ic * ih * iw; + outArray += oc * oh * ow; + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_transform.cpp b/compute/tensor/src/cpu/x86/fp32/convolution_transform.cpp new file mode 100644 index 00000000..09601378 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/convolution_transform.cpp @@ -0,0 +1,153 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
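+
+// Layout sketch for the DF_NCHWCxNx filter formats produced below, inferred by
+// analogy with the CNHW kernel in deconvolution_transform.cpp (the conv transform
+// itself lives in transform_functions_fp32.h): Nx (24 or 32) output channels form
+// the contiguous innermost dimension, with input channels blocked in groups of cx
+// above them. Assuming fn is already padded to a multiple of N, one scalar mapping
+// consistent with that layout is:
+//   dst[((n / N) * fc * fh * fw + (c / cx) * fh * fw * cx + hw * cx + c % cx) * N + n % N]
+//       = src[(n * fc + c) * fh * fw + hw];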
+ +#include <string.h> +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/x86/fp32/transform_functions_fp32.h" + +// N is 32/24 +template <U32 N> +inline EE transformNCHWToNCHWCxNxWrapper( + TensorDesc filterDesc, const F32 *filterArray, TensorDesc ftmDesc, F32 *ftmArray, U32 cx) +{ + EE ret = NOT_SUPPORTED; + switch (cx) { + case 128: + ret = transformNCHWToNCHWCxNx<128, N>(filterDesc, filterArray, ftmDesc, ftmArray); + break; + case 8: + ret = transformNCHWToNCHWCxNx<8, N>(filterDesc, filterArray, ftmDesc, ftmArray); + break; + case 1: + ret = transformNCHWToNCHWCxNx<1, N>(filterDesc, filterArray, ftmDesc, ftmArray); + break; + default: + break; + } + return ret; +} + +inline EE convolution_transform_filter_kernel_fp32(TensorDesc filterDesc, + const F32 *filterArray, + TensorDesc *ftmDesc, + F32 *ftmArray, + DataFormat ftmDataFormat, + U32 cx) +{ + if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + if (fdf == ftmDataFormat) { + *ftmDesc = filterDesc; + memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + return SUCCESS; + } + if (fdf != DF_NCHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + EE ret = SUCCESS; + switch (ftmDataFormat) { + case DF_NCHWCxN32: { + /* + * NCHW => NCHWCxN32 + */ + *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); + transformNCHWToNCHWCxNxWrapper<32>(filterDesc, filterArray, *ftmDesc, ftmArray, cx); + break; + } + case DF_NCHWCxN24: { + *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); + transformNCHWToNCHWCxNxWrapper<24>(filterDesc, filterArray, *ftmDesc, ftmArray, cx); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE convolution_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed) +{ + U32 cx = 0; + DataFormat ftmDataFormat; + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: { + ftmDataFormat = DF_NCHWCxN32; + cx = 8; + break; + } + case CONVOLUTION_ALGORITHM_POINTWISE: { + if ((fn % 24 != 0) && (fn % 32 == 0)) { + ftmDataFormat = DF_NCHWCxN32; + } else { + ftmDataFormat = DF_NCHWCxN24; + } + cx = 128; + break; + } + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: { + fn = (fn + 7) / 8 * 8 / convParamSpec.group; + if ((fn % 24 == 0) && (fn % 32 != 0)) { + ftmDataFormat = DF_NCHWCxN24; + } else { + ftmDataFormat = DF_NCHWCxN32; + } + cx = 1; + break; + } + default: + return NOT_MATCH; + } + + // align to 32 bytes + filterTransformed = (F32 *)(((uintptr_t)filterTransformed + 32 - 1) / 32 * 32); + + if (algorithm == CONVOLUTION_ALGORITHM_POINTWISE) { + EE ret = convolution_transform_filter_kernel_fp32( + filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat, cx); + CHECK_STATUS(ret); + return ret; + } + + U32 channelAxis = filterDesc.nDims - 1; + TensorDesc tmpFilterDesc = filterDesc; + tmpFilterDesc.dims[channelAxis] /= convParamSpec.group; + U32 fnPadding = tmpFilterDesc.dims[channelAxis]; + if (fnPadding % 8 != 0) { + fnPadding = (fnPadding / 8 + 1) * 8; + } + U32 originalTileSize = tensorNumElements(tmpFilterDesc); + for (U32 g = 0; g < convParamSpec.group; g++) { + CHECK_STATUS(convolution_transform_filter_kernel_fp32( + tmpFilterDesc, filter, 
ftmDesc, filterTransformed, ftmDataFormat, cx)); + U32 newTileSize = tensorNumElements(*ftmDesc) / tmpFilterDesc.dims[channelAxis] * fnPadding; + filter += originalTileSize; + filterTransformed += newTileSize; + } + ftmDesc->dims[channelAxis] = filterDesc.dims[channelAxis]; + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/deconvolution_transform.cpp b/compute/tensor/src/cpu/x86/fp32/deconvolution_transform.cpp new file mode 100644 index 00000000..9a256507 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/deconvolution_transform.cpp @@ -0,0 +1,185 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +template <U32 C, U32 N> +inline void transformCNHW2NCHWCxNxKernel( + U32 fc, U32 fn, U32 fh, U32 fw, U32 fnPadding, const F32 *input, F32 *output) +{ + F32 *dest; + const F32 *src; + U32 cSize = 0, cSizePadding = 0; + U32 lstep = fh * fw; + U32 hwMax = fh * fw - 1; + __m256i vindex = _mm256_set_epi32( + lstep * 7, lstep * 6, lstep * 5, lstep * 4, lstep * 3, lstep * 2, lstep, 0); + for (U32 n = 0; n < fn; n += cSize) { + cSize = UNI_MIN(fn - n, C); + cSizePadding = UNI_MIN(fnPadding - n, C); + for (U32 hw = 0; hw < fh * fw; ++hw) { + for (U32 c8 = 0; c8 < cSize; ++c8) { + src = input + (n + c8) * fc * fh * fw + hwMax - hw; + dest = output + n * fh * fw * N + hw * cSizePadding * N + c8 * N; + if (N >= 8) { + _mm256_storeu_ps(dest, _mm256_i32gather_ps(src, vindex, 4)); + } + if (N >= 16) { + _mm256_storeu_ps(dest + 8, _mm256_i32gather_ps(src + 8 * lstep, vindex, 4)); + } + if (N >= 24) { + _mm256_storeu_ps(dest + 16, _mm256_i32gather_ps(src + 16 * lstep, vindex, 4)); + } + if (N == 32) { + _mm256_storeu_ps(dest + 24, _mm256_i32gather_ps(src + 24 * lstep, vindex, 4)); + } + } + memset(dest + N, 0, ((cSizePadding - cSize) * N * 4)); + } + } +} + +// N is 32/24 +template <U32 C, U32 N> +inline EE transformCNHW2NCHWCxNx( + TensorDesc inputDesc, const F32 *input, TensorDesc outputDesc, F32 *output) +{ + if (input == NULL || output == NULL) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt, odt; + DataFormat fdf, odf; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + U32 tail = fc % N; + U32 remain = fc - tail; + + for (U32 c = 0; c < remain; c += N) { + transformCNHW2NCHWCxNxKernel<C, N>(fc, fn, fh, fw, on, input, output); + input += fh * fw * N; + output += on * fh * fw * N; + } + if (tail >= 16) { + transformCNHW2NCHWCxNxKernel<C, 16>(fc, fn, fh, fw, on, input, output); + input += fh * fw * 16; + output += on * fh * fw * 16; + tail -= 16; + } + if (tail >= 8) { + transformCNHW2NCHWCxNxKernel<C, 8>(fc, fn, fh, fw, on, input, output); + input += fh * fw * 8; + output += on * fh * fw * 8; + tail -= 8; + } + if (tail > 0) { + F32 *dest; + const F32 *src; + U32 cSize = 0, cSizePadding = 0; + U32 hwMax = fh * fw - 1; + F32 m[8] = {0.0f}; + for (U32 i = 0; i < tail; ++i) { + m[i] = -1.0f; + } + __m256 mask = _mm256_set_ps(m[7], m[6], m[5], m[4], m[3], m[2], m[1], m[0]); + U32 lstep = fh * fw; + __m256i vindex = _mm256_set_epi32( + lstep * 7, lstep * 6, lstep * 5, lstep * 4, lstep * 3, lstep * 2, lstep, 0); + __m256 src256 = _mm256_setzero_ps(); + + for (U32 n = 0; n < fn; n += cSize) { + cSize = UNI_MIN(fn - n, C); + cSizePadding = UNI_MIN(on - n, C); + for (U32 hw = 0; hw < fh * fw; ++hw) { + for (U32 c8 = 0; c8 < cSize; ++c8) { + src = input + (n + c8) * fc * fh * fw + hwMax - hw; + dest = output + n * fh * fw * 8 + hw * cSizePadding * 8 + c8 * 8; + _mm256_storeu_ps(dest, _mm256_mask_i32gather_ps(src256, src, vindex, mask, 4)); + } + memset(dest + 8, 0, ((cSizePadding - cSize) * 32)); + } + } + } + return SUCCESS; +} + +inline EE deconvolution_transform_filter_kernel_fp32(TensorDesc filterDesc, + const F32 *filterArray, + TensorDesc *ftmDesc, + F32 *ftmArray, + DataFormat ftmDataFormat) +{ + if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + ftmArray = (F32 *)(((uintptr_t)ftmArray + 32 - 1) / 32 * 32); + if (fdf == ftmDataFormat) { + *ftmDesc = filterDesc; + memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + return SUCCESS; + } + if (fdf != DF_NCHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + EE ret = SUCCESS; + switch (ftmDataFormat) { + case DF_NCHWCxN32: { + U32 fnAlignSize = 8; + U32 fnPadding = (fn + fnAlignSize - 1) / fnAlignSize * fnAlignSize; + *ftmDesc = tensor4df(fdt, ftmDataFormat, fnPadding, fc, fh, fw); + transformCNHW2NCHWCxNx<8, 32>(filterDesc, filterArray, *ftmDesc, ftmArray); + *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); + break; + } + case DF_NCHWC24: { + filterDesc = tensor4df(fdt, fdf, 1, fc, fh, fw); + *ftmDesc = tensor4df(fdt, ftmDataFormat, 1, fc, fh, fw); + transformCNHW2NCHWCxNx<1, 24>(filterDesc, filterArray, *ftmDesc, ftmArray); + *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE deconvolution_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed) +{ + DataFormat ftmDataFormat; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ftmDataFormat = DF_NCHWCxN32; + break; + case CONVOLUTION_ALGORITHM_GROUP_DECONV: + ftmDataFormat = DF_NCHWC24; + break; + default: + return NOT_MATCH; + } + EE ret = deconvolution_transform_filter_kernel_fp32( + filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat); + CHECK_STATUS(ret); + return ret; +} diff --git a/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_direct.cpp b/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_direct.cpp new file mode 100644 index 00000000..cf506a78 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_direct.cpp @@ -0,0 +1,834 @@ +// 
Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing.h" + +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +#define UNROLL_W 4 +#define SIMD_W 8 +#define UNROLL_OC_BLOCK_DIM 24 +#define align_addr(addr, unit) (((uintptr_t)addr + unit - 1) / unit * unit) + +typedef void (*kernel_func)(F32 *in0, + F32 *in1, + F32 *in2, + F32 *in3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 hStep, + U32 store, + U32 dw, + U32 wStep); + +void avx2_dw_kernel_4x24(F32 *in0, + F32 *in1, + F32 *in2, + F32 *in3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 hStep, + U32 store, + U32 dw, + U32 wStep) +{ + __asm__ __volatile__("vmovups (%5), %%ymm0 \n\t" + "vmovups (%5), %%ymm1 \n\t" + "vmovups (%5), %%ymm2 \n\t" + "vmovups (%5), %%ymm3 \n\t" + "vmovups 0x20(%5), %%ymm4 \n\t" + "vmovups 0x20(%5), %%ymm5 \n\t" + "vmovups 0x20(%5), %%ymm6 \n\t" + "vmovups 0x20(%5), %%ymm7 \n\t" + "vmovups 0x40(%5), %%ymm8 \n\t" + "vmovups 0x40(%5), %%ymm9 \n\t" + "vmovups 0x40(%5), %%ymm10 \n\t" + "vmovups 0x40(%5), %%ymm11 \n\t" + + "cmp $0, %%ecx \n\t" + "je 3f \n\t" + "cmp $0, %6 \n\t" + "je 3f \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %6, %%eax \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%4), %%ymm12 \n\t" + "vmovups (%0), %%ymm13 \n\t" + "vmovups (%1), %%ymm14 \n\t" + "vmovups (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vmovups (%3), %%ymm13 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + + "vmovaps 0x20(%4), %%ymm14 \n\t" + "vmovups (%0, %8), %%ymm15 \n\t" + "vmovups (%1, %8), %%ymm13 \n\t" + "vmovups (%2, %8), %%ymm12 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm4 \n\t" + "vfmadd231ps %%ymm13, %%ymm14, %%ymm5 \n\t" + "vmovups (%3, %8), %%ymm15 \n\t" + "vfmadd231ps %%ymm12, %%ymm14, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm7 \n\t" + + "vmovaps 0x40(%4), %%ymm13 \n\t" + "vmovups (%0, %8, 2), %%ymm12 \n\t" + "vmovups (%1, %8, 2), %%ymm15 \n\t" + "vmovups (%2, %8, 2), %%ymm14 \n\t" + "vfmadd231ps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm9 \n\t" + "vmovups (%3, %8, 2), %%ymm12 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm12, %%ymm13, %%ymm11 \n\t" + + "add %12, %0 \n\t" + "add 
%12, %1 \n\t" + "add %12, %2 \n\t" + "add %12, %3 \n\t" + "add $0x60, %4 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + "add %10, %4 \n\t" + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + "dec %%ecx \n\t" + "jg 0b \n\t" + + // relu + "mov %11, %%eax \n\t" + "and $0x6, %%eax \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm15, %%ymm11, %%ymm11 \n\t" + + // relu6 + "and $0x4, %%eax \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + "vminps %%ymm12, %%ymm11, %%ymm11 \n\t" + + ".align 16 \n\t" + "3: \n\t" + : + : "r"(in0), "r"(in1), "r"(in2), "r"(in3), "r"(curW), "r"(curB), "r"(fw), + "c"(fh), "r"((I64)iStep), "r"((I64)hStep), "r"((I64)wStep), "r"(store), + "r"((I64)dw) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); + + __asm__ __volatile__("vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm1, 0x20(%0) \n\t" + "vmovups %%ymm2, 0x40(%0) \n\t" + "vmovups %%ymm3, 0x60(%0) \n\t" + "vmovups %%ymm4, (%0, %1) \n\t" + "vmovups %%ymm5, 0x20(%0, %1) \n\t" + "vmovups %%ymm6, 0x40(%0, %1) \n\t" + "vmovups %%ymm7, 0x60(%0, %1) \n\t" + "vmovups %%ymm8, (%0, %1, 2) \n\t" + "vmovups %%ymm9, 0x20(%0, %1, 2) \n\t" + "vmovups %%ymm10, 0x40(%0, %1, 2) \n\t" + "vmovups %%ymm11, 0x60(%0, %1, 2) \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"((I64)oStep) + : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", + "%ymm15", "memory", "cc"); +} + +void avx2_dw_kernel_4x16(F32 *in0, + F32 *in1, + F32 *in2, + F32 *in3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 hStep, + U32 store, + U32 dw, + U32 wStep) +{ + __asm__ __volatile__("vmovups (%5), %%ymm0 \n\t" + "vmovups (%5), %%ymm1 \n\t" + "vmovups (%5), %%ymm2 \n\t" + "vmovups (%5), %%ymm3 \n\t" + "vmovups 0x20(%5), %%ymm4 \n\t" + "vmovups 0x20(%5), %%ymm5 \n\t" + "vmovups 0x20(%5), %%ymm6 \n\t" + "vmovups 0x20(%5), %%ymm7 \n\t" + + "cmp $0, %%ecx \n\t" + "je 3f \n\t" + "cmp $0, %6 \n\t" + "je 3f \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %6, %%eax \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%4), %%ymm12 \n\t" + "vmovups (%0), %%ymm13 \n\t" + "vmovups (%1), %%ymm14 \n\t" + "vmovups (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vmovups (%3), %%ymm13 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 
\n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + + "vmovaps 0x20(%4), %%ymm14 \n\t" + "vmovups (%0, %8), %%ymm15 \n\t" + "vmovups (%1, %8), %%ymm13 \n\t" + "vmovups (%2, %8), %%ymm12 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm4 \n\t" + "vfmadd231ps %%ymm13, %%ymm14, %%ymm5 \n\t" + "vmovups (%3, %8), %%ymm15 \n\t" + "vfmadd231ps %%ymm12, %%ymm14, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm7 \n\t" + + "add %12, %0 \n\t" + "add %12, %1 \n\t" + "add %12, %2 \n\t" + "add %12, %3 \n\t" + "add $0x40, %4 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + "add %10, %4 \n\t" + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + "dec %%ecx \n\t" + "jg 0b \n\t" + + // relu + "mov %11, %%eax \n\t" + "and $0x6, %%eax \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + + // relu6 + "and $0x4, %%eax \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "3: \n\t" + : + : "r"(in0), "r"(in1), "r"(in2), "r"(in3), "r"(curW), "r"(curB), "r"(fw), + "c"(fh), "r"((I64)iStep), "r"((I64)hStep), "r"((I64)wStep), "r"(store), + "r"((I64)dw) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); + + __asm__ __volatile__("vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm1, 0x20(%0) \n\t" + "vmovups %%ymm2, 0x40(%0) \n\t" + "vmovups %%ymm3, 0x60(%0) \n\t" + "vmovups %%ymm4, (%0, %1) \n\t" + "vmovups %%ymm5, 0x20(%0, %1) \n\t" + "vmovups %%ymm6, 0x40(%0, %1) \n\t" + "vmovups %%ymm7, 0x60(%0, %1) \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"((I64)oStep) + : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", + "%ymm15", "memory", "cc"); +} + +void avx2_dw_kernel_4x8(F32 *in0, + F32 *in1, + F32 *in2, + F32 *in3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 hStep, + U32 store, + U32 dw, + U32 wStep) +{ + __asm__ __volatile__("vmovups (%6), %%ymm0 \n\t" + "vmovups (%6), %%ymm1 \n\t" + "vmovups (%6), %%ymm2 \n\t" + "vmovups (%6), %%ymm3 \n\t" + + "cmp $0, %%ecx \n\t" + "je 3f \n\t" + "cmp $0, %6 \n\t" + "je 3f \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %7, %%eax \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%4), %%ymm12 \n\t" + "vmovups (%0), %%ymm13 \n\t" + "vmovups (%1), %%ymm14 \n\t" + "vmovups (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vmovups (%3), %%ymm13 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + + "add %13, %0 \n\t" + "add %13, %1 \n\t" + "add %13, %2 \n\t" + "add %13, %3 \n\t" + "add $0x20, %4 \n\t" + "dec %%eax \n\t" + "jg 1b 
\n\t" + + "add %9, %4 \n\t" + "add %11, %0 \n\t" + "add %11, %1 \n\t" + "add %11, %2 \n\t" + "add %11, %3 \n\t" + "dec %%ecx \n\t" + "jg 0b \n\t" + + // relu + "mov %12, %%eax \n\t" + "and $0x6, %%eax \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + + // relu6 + "and $0x4, %%eax \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%5) \n\t" + "vmovups %%ymm1, 0x20(%5) \n\t" + "vmovups %%ymm2, 0x40(%5) \n\t" + "vmovups %%ymm3, 0x60(%5) \n\t" + : + : "r"(in0), "r"(in1), "r"(in2), "r"(in3), "r"(curW), "r"(curO), "r"(curB), + "r"(fw), "c"(fh), "r"((I64)wStep), "r"((I64)oStep), "r"((I64)hStep), + "r"(store), "r"((I64)dw) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +void avx2_dw_kernel_1x24(F32 *in0, + F32 *in1, + F32 *in2, + F32 *in3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 hStep, + U32 store, + U32 dw, + U32 wStep) +{ + __asm__ __volatile__("vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm4 \n\t" + "vmovups 0x40(%3), %%ymm8 \n\t" + + "cmp $0, %%ecx \n\t" + "je 3f \n\t" + "cmp $0, %4 \n\t" + "je 3f \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %4, %%eax \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%1), %%ymm12 \n\t" + "vmovups (%0), %%ymm13 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vmovaps 0x20(%1), %%ymm14 \n\t" + "vmovups (%0, %8), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm4 \n\t" + + "vmovaps 0x40(%1), %%ymm13 \n\t" + "vmovups (%0, %8, 2), %%ymm12 \n\t" + "vfmadd231ps %%ymm12, %%ymm13, %%ymm8 \n\t" + + "add %11, %0 \n\t" + "add $0x60, %1 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + "add %6, %1 \n\t" + "add %9, %0 \n\t" + "dec %%ecx \n\t" + "jg 0b \n\t" + + // relu + "mov %10, %%eax \n\t" + "and $0x6, %%eax \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + + // relu6 + "and $0x4, %%eax \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%2) \n\t" + "vmovups %%ymm4, (%2, %7) \n\t" + "vmovups %%ymm8, (%2, %7, 2) \n\t" + : + : "r"(in0), "r"(curW), "r"(curO), "r"(curB), "r"(fw), "c"(fh), + "r"((I64)wStep), "r"((I64)oStep), "r"((I64)iStep), "r"((I64)hStep), + "r"(store), "r"((I64)dw) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +void avx2_dw_kernel_1x16(F32 *in0, + F32 *in1, + F32 *in2, + F32 *in3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 hStep, + U32 store, + U32 dw, + U32 wStep) +{ + __asm__ __volatile__("vmovups 
(%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm4 \n\t" + + "cmp $0, %%ecx \n\t" + "je 3f \n\t" + "cmp $0, %4 \n\t" + "je 3f \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %4, %%eax \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%1), %%ymm12 \n\t" + "vmovups (%0), %%ymm13 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vmovaps 0x20(%1), %%ymm14 \n\t" + "vmovups (%0, %8), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm4 \n\t" + + "add %11, %0 \n\t" + "add $0x40, %1 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + "add %6, %1 \n\t" + "add %9, %0 \n\t" + "dec %%ecx \n\t" + "jg 0b \n\t" + + // relu + "mov %10, %%eax \n\t" + "and $0x6, %%eax \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + + // relu6 + "and $0x4, %%eax \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%2) \n\t" + "vmovups %%ymm4, (%2, %7) \n\t" + : + : "r"(in0), "r"(curW), "r"(curO), "r"(curB), "r"(fw), "c"(fh), + "r"((I64)wStep), "r"((I64)oStep), "r"((I64)iStep), "r"((I64)hStep), + "r"(store), "r"((I64)dw) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +void avx2_dw_kernel_1x8(F32 *in0, + F32 *in1, + F32 *in2, + F32 *in3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 hStep, + U32 store, + U32 dw, + U32 wStep) +{ + __asm__ __volatile__("vmovups (%3), %%ymm0 \n\t" + + "cmp $0, %%ecx \n\t" + "je 3f \n\t" + "cmp $0, %4 \n\t" + "je 3f \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %4, %%eax \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%1), %%ymm12 \n\t" + "vmovups (%0), %%ymm13 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + + "add %11, %0 \n\t" + "add $0x20, %1 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + "add %6, %1 \n\t" + "add %9, %0 \n\t" + "dec %%ecx \n\t" + "jg 0b \n\t" + + // relu + "mov %10, %%eax \n\t" + "and $0x6, %%eax \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + + // relu6 + "and $0x4, %%eax \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%2) \n\t" + : + : "r"(in0), "r"(curW), "r"(curO), "r"(curB), "r"(fw), "c"(fh), + "r"((I64)wStep), "r"((I64)oStep), "r"((I64)iStep), "r"((I64)hStep), + "r"(store), "r"((I64)dw) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +EE depthwise_convolution_direct(TensorDesc inputDesc, + F32 *inArray, + TensorDesc dwFilterDesc, + const F32 *dwFilterArray, + TensorDesc pwFilterDesc, + const F32 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F32 *dwBiasArray, + TensorDesc pwBiasDesc, + const F32 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) +{ + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + 
U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + I32 strideH = convParamSpec.stride_h; + I32 strideW = convParamSpec.stride_w; + I32 paddingT = convParamSpec.padding_top; + I32 paddingB = convParamSpec.padding_bottom; + I32 paddingL = convParamSpec.padding_left; + I32 paddingR = convParamSpec.padding_right; + I32 dilateH = convParamSpec.dilatedRate_h; + I32 dilateW = convParamSpec.dilatedRate_w; + + if (fdf != DF_NCHWC24 || idf != DF_NCHWC8) { + CHECK_STATUS(NOT_MATCH); + } + + F32 *curI, *curO, *calI, *calO; + const F32 *curW, *curB, *calW; + F32 *ftmp = inArray; + dwFilterArray = (F32 *)align_addr(dwFilterArray, 32); + + U32 icAlignSize = 8; + U32 icPadding = (ic + icAlignSize - 1) / icAlignSize * icAlignSize; + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + F32 *useOutArray = (F32 *)align_addr(tmp, 32); + if (pwFilterArray == nullptr) { + useOutArray = outArray; + } + + U32 oStep = oh * ow * SIMD_W * 4; + U32 ocblocking = 0; + U32 iStep = ih * iw * SIMD_W * 4; + U32 hStep = (iw - fw * dilateW + (dilateH - 1) * iw) * SIMD_W * 4; + U32 sw = strideW * SIMD_W * 4; + U32 dw = dilateW * SIMD_W * 4; + U32 wSize = 0, store = 0, ocSize = 0; + U32 ocblocks[3] = {8, 16, 24}; + + U32 ohow = oh * ow; + + F32 *curIn[4]; + U32 in_h = 0, in_w = 0, oc_idx = 0; + + kernel_func kernel[2][3] = {{avx2_dw_kernel_1x8, avx2_dw_kernel_1x16, avx2_dw_kernel_1x24}, + {avx2_dw_kernel_4x8, avx2_dw_kernel_4x16, avx2_dw_kernel_4x24}}; + + store |= U32(depthwiseActivationParamSpec.mode) << 1; + for (U32 n = 0; n < in; ++n) { + for (U32 ocb = 0; ocb < icPadding; ocb += ocSize) { + curW = dwFilterArray + ocb * fh * fw; + curB = dwBiasArray + ocb; + curI = ftmp + ocb * ih * iw; + curO = useOutArray + (n * icPadding + ocb) * oh * ow; + ocSize = UNI_MIN(UNROLL_OC_BLOCK_DIM, icPadding - ocb); + oc_idx = (ocSize >> 3) - 1; + ocSize = ocblocks[oc_idx]; + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + for (U32 hw = 0; hw < ohow; hw += wSize) { + wSize = UNI_MIN(ohow - hw, UNROLL_W); + if (wSize < 4) { + wSize = 1; + } + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + F32 *in_0 = curI + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F32 *in_1 = curI + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F32 *in_2 = curI + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F32 *in_3 = curI + in_h_3 * iw_pad * 8 + in_w_3 * 8; + calO = curO + hw * 8; + + kernel[wSize >> 2][oc_idx](in_0, in_1, in_2, in_3, curW, calO, curB, fw, fh, + oStep, iStep, hStep, store, dw, 0); + } + } else { + I32 tfw = fw, tfh = fh; + I32 in_h = 0, in_w = 0; + I32 ow_padding_l = UNI_MIN((paddingL - 1) / strideW + 1, ow); + I32 ow_padding_r = UNI_MIN((paddingR - 1) / strideW + 1, ow - ow_padding_l); + if (((iw + paddingL - fw) / strideW + 1) >= ow) { + ow_padding_r = 0; + } + for (I32 h = 0; h < oh; ++h) { + tfh = fh; + in_h = h * strideH - paddingT; + calW = curW; + if (in_h < 0) { + tfh = UNI_MIN(fh + in_h, ih); + calW = curW + (fh - tfh) * fw * ocSize; + in_h = 0; + } else if (in_h + fh >= ih) { + tfh = ih - in_h; + } + I32 w = 0; + for (; w < ow_padding_l; ++w) { + I32 
in_w = w * strideW - paddingL; + tfw = UNI_MIN(fw + in_w, iw); + const F32 *useW = calW + (fw - tfw) * ocSize; + hStep = (iw - tfw * dilateW + (dilateH - 1) * iw) * SIMD_W * 4; + F32 *in_0 = curI + in_h * iw * 8; + calO = curO + (h * ow + w) * 8; + kernel[0][oc_idx](in_0, nullptr, nullptr, nullptr, useW, calO, curB, tfw, + tfh, oStep, iStep, hStep, store, dw, (fw - tfw) * ocSize * 4); + } + for (; w < ow - ow_padding_r; w += wSize) { + hStep = (iw - fw * dilateW + (dilateH - 1) * iw) * SIMD_W * 4; + wSize = UNI_MIN(ow - ow_padding_r - w, UNROLL_W); + if (wSize < 4) { + wSize = 1; + } + U32 in_w_0 = w * strideW - paddingL; + U32 in_w_1 = (w + 1) * strideW - paddingL; + U32 in_w_2 = (w + 2) * strideW - paddingL; + U32 in_w_3 = (w + 3) * strideW - paddingL; + F32 *in_0 = curI + in_h * iw * 8 + in_w_0 * 8; + F32 *in_1 = curI + in_h * iw * 8 + in_w_1 * 8; + F32 *in_2 = curI + in_h * iw * 8 + in_w_2 * 8; + F32 *in_3 = curI + in_h * iw * 8 + in_w_3 * 8; + calO = curO + (h * ow + w) * 8; + + kernel[wSize >> 2][oc_idx](in_0, in_1, in_2, in_3, calW, calO, curB, fw, + tfh, oStep, iStep, hStep, store, dw, 0); + } + for (; w < ow; ++w) { + I32 in_w = w * strideW - paddingL; + tfw = iw - in_w; + hStep = (iw - tfw * dilateW + (dilateH - 1) * iw) * SIMD_W * 4; + F32 *in_0 = curI + in_h * iw * 8 + in_w * 8; + calO = curO + (h * ow + w) * 8; + kernel[0][oc_idx](in_0, nullptr, nullptr, nullptr, calW, calO, curB, tfw, + tfh, oStep, iStep, hStep, store, dw, (fw - tfw) * ocSize * 4); + } + } + } + } + } + + if (pwFilterArray != nullptr) { + TensorDesc pwInputDesc = tensor4df(odt, DF_NCHWC8, 1, ic, oh, ow); + tmpBytes -= oh * ic *oh * ow + 32; + tmp = (void *)((F32 *)tmp + oh * ic *oh * ow + 32); + ConvolutionParamSpec p = createConvolutionParamSpec( + 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, fn, Convolution_Pointwise); + convolution_1x1_direct(pwInputDesc, useOutArray, pwFilterDesc, pwFilterArray, p, + pwBiasArray, tmpBytes, tmp, outputDesc, outArray, pointwiseActivationParamSpec); + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_transform.cpp b/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_transform.cpp new file mode 100644 index 00000000..ff2d97dd --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_transform.cpp @@ -0,0 +1,71 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
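The depthwise kernels fold the activation into the `store` word rather than branching per element: depthwise_convolution_direct above sets `store |= U32(mode) << 1` once, and each kernel tests the patterns 0x6 and 0x4 after its accumulation loop. A scalar sketch of that tail follows (the enum values ACTIVATION_RELU == 1 / ACTIVATION_RELU6 == 2 are inferred from the bit tests, not taken from the source):

static inline float apply_store_activation(float x, unsigned int store)
{
    // bit 1 set => some activation requested; bit 2 additionally => relu6
    if (store & 0x6) {
        x = (x > 0.0f) ? x : 0.0f;      // relu: vmaxps against a zeroed ymm
        if (store & 0x4) {
            x = (x < 6.0f) ? x : 6.0f;  // relu6: vminps against broadcast 6.0f (0x40C00000)
        }
    }
    return x;
}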
+ +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/x86/fp32/transform_functions_fp32.h" + +inline EE depthwise_convolution_transform_filter_kernel_fp32(TensorDesc filterDesc, + const F32 *filterArray, + TensorDesc *ftmDesc, + F32 *ftmArray, + DataFormat ftmDataFormat) +{ + if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + ftmArray = (F32 *)(((uintptr_t)ftmArray + 32 - 1) / 32 * 32); + if (fdf == ftmDataFormat) { + *ftmDesc = filterDesc; + memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + return SUCCESS; + } + if (fdf != DF_NCHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + switch (ftmDataFormat) { + case DF_NCHWC24: { + filterDesc = tensor4df(fdt, fdf, fc, 1, fh, fw); + *ftmDesc = tensor4df(fdt, ftmDataFormat, fc, 1, fh, fw); + transformNCHWToNCHWCxNx<1, 24>(filterDesc, filterArray, *ftmDesc, ftmArray); + *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); + break; + } + default: + return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE depthwise_convolution_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed) +{ + DataFormat ftmDataFormat; + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + ftmDataFormat = DF_NCHWC24; + break; + default: + return NOT_MATCH; + } + EE ret = depthwise_convolution_transform_filter_kernel_fp32( + filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat); + CHECK_STATUS(ret); + return ret; +} diff --git a/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution.cpp new file mode 100644 index 00000000..84eb23e3 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution.cpp @@ -0,0 +1,69 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
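depthwise_convolution_transform_filter_kernel_fp32 above repacks the depthwise filter so that all channel lanes of one filter tap are contiguous, which is what lets the direct kernels feed vmovaps straight from the weight pointer. A simplified single-block reference (a sketch only; the real transformNCHWToNCHWCxNx<1, 24> additionally splits fc into 24/16/8-wide blocks to match the ocblocks dispatch in the direct kernel):

void repack_dw_filter_block_ref(
    const float *src, float *dst, int cStart, int lanes, int fh, int fw)
{
    for (int hw = 0; hw < fh * fw; ++hw) {
        for (int lane = 0; lane < lanes; ++lane) {
            // src is a plain NCHW depthwise filter: fc planes of fh*fw taps;
            // dst stores the `lanes` channel weights of each tap contiguously
            dst[hw * lanes + lane] = src[(cStart + lane) * fh * fw + hw];
        }
    }
}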
+ +#include "types.h" +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +EE depthwise_pointwise_convolution_fp32(TensorDesc inputDesc, + F32 *input, + TensorDesc dwFilterDesc, + const F32 *dwFilter, + TensorDesc pwFilterDesc, + const F32 *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const F32 *dwBias, + TensorDesc pwBiasDesc, + const F32 *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + UNUSED(arch); + if (nullptr == input || nullptr == dwFilter || nullptr == output || nullptr == dwBias || + nullptr == tmp) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(idf == DF_NCHWC8 && odf == DF_NCHWC8)) { + CHECK_STATUS(NOT_MATCH); + } + if (ic != fc) { + CHECK_STATUS(NOT_MATCH); + } + + EE ret = NOT_MATCH; + if (algorithm == DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT || + algorithm == DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT) { + ret = depthwise_convolution_direct(inputDesc, input, dwFilterDesc, dwFilter, pwFilterDesc, + pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, tmpBytes, tmp, + outputDesc, output, depthwiseActivationParamSpec, pointwiseActivationParamSpec); + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution_transform.cpp b/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution_transform.cpp new file mode 100644 index 00000000..5f5e7958 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution_transform.cpp @@ -0,0 +1,38 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
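depthwise_pointwise_convolution_fp32 above insists on DF_NCHWC8 for both input and output. For reference, this is the addressing that layout implies, with the 8 channel lanes innermost to match the AVX2 fp32 vector width (a sketch; the helper name is ours, not the library's):

static inline size_t nchwc8_index(
    size_t n, size_t c, size_t h, size_t w, size_t cTotal, size_t height, size_t width)
{
    size_t cBlocks = cTotal / 8;  // cTotal is assumed already padded to a multiple of 8
    return (((n * cBlocks + c / 8) * height + h) * width + w) * 8 + (c % 8);
}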
+ +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +EE depthwise_pointwise_convolution_transform_filter_fp32(TensorDesc dwFilterDesc, + const F32 *dwFilter, + TensorDesc pwFilterDesc, + const F32 *pwFilter, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc *dwFtmDesc, + F32 *dwFilterTransformed, + TensorDesc *pwFtmDesc, + F32 *pwFilterTransformed) +{ + EE ret = depthwise_convolution_transform_filter_fp32(dwFilterDesc, dwFilter, + DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, dwFtmDesc, dwFilterTransformed); + CHECK_STATUS(ret); + if (pwFilter == nullptr) { + return ret; + } + + ConvolutionParamSpec nullSpec; + ret = convolution_transform_filter_fp32(pwFilterDesc, pwFilter, nullSpec, + CONVOLUTION_ALGORITHM_POINTWISE, pwFtmDesc, pwFilterTransformed); + CHECK_STATUS(ret); + return ret; +} diff --git a/compute/tensor/src/cpu/x86/fp32/eltwise.cpp b/compute/tensor/src/cpu/x86/fp32/eltwise.cpp new file mode 100644 index 00000000..5a18f884 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/eltwise.cpp @@ -0,0 +1,82 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
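A typical call sequence for depthwise_pointwise_convolution_transform_filter_fp32 above, run once at model load so the packed buffers can be reused for every forward call (a hypothetical sketch: dwPacked/pwPacked and the surrounding descriptors are illustrative, and are assumed to be sized with headroom for the internal 32-byte alignment):

TensorDesc dwFtmDesc, pwFtmDesc;
CHECK_STATUS(depthwise_pointwise_convolution_transform_filter_fp32(
    dwFilterDesc, dwFilter, pwFilterDesc, pwFilter,
    DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT,
    &dwFtmDesc, dwPacked, &pwFtmDesc, pwPacked));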
+ +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/cpu_functions.h" + +EE eltwise_fp32(std::vector<void *> input, + std::vector<U32> inputSize, + U32 num, + U32 len, + void *output, + EltwiseMode eltwiseMode) +{ + F32 buffer[8]; + U32 len_tail = len % 8; + U32 len_main = len - len_tail; + + F32 *tmp = buffer; + F32 *output_ptr = (F32 *)output; + for (U32 i = 0; i < len_main; i += 8) { + get_vector((F32 *)input[0], inputSize[0], &tmp, 8, i, 8, buffer); + __m256 tmp_v = _mm256_loadu_ps(tmp); + for (U32 j = 1; j < num; j++) { + get_vector((F32 *)input[j], inputSize[j], &tmp, 8, i, 8, buffer); + __m256 value_v = _mm256_loadu_ps(tmp); + switch (eltwiseMode) { + case ELTWISE_SUM: + tmp_v = _mm256_add_ps(value_v, tmp_v); + break; + case ELTWISE_MAX: + tmp_v = _mm256_max_ps(value_v, tmp_v); + break; + case ELTWISE_PROD: + tmp_v = _mm256_mul_ps(value_v, tmp_v); + break; + case ELTWISE_SUB: + tmp_v = _mm256_sub_ps(tmp_v, value_v); + break; + case ELTWISE_DIV: + tmp_v = _mm256_div_ps(tmp_v, value_v); + break; + default: + return NOT_SUPPORTED; + } + } + _mm256_storeu_ps(output_ptr + i, tmp_v); + } + + for (U32 i = len_main; i < len; i++) { + get_vector((F32 *)input[0], inputSize[0], &tmp, 8, i, 1, buffer); + F32 tmp_s = tmp[0]; + for (U32 j = 1; j < num; j++) { + get_vector((F32 *)input[j], inputSize[j], &tmp, 8, i, 1, buffer); + F32 value_s = tmp[0]; + switch (eltwiseMode) { + case ELTWISE_SUM: + tmp_s = value_s + tmp_s; + break; + case ELTWISE_MAX: + tmp_s = (value_s > tmp_s) ? value_s : tmp_s; + break; + case ELTWISE_PROD: + tmp_s *= value_s; + break; + case ELTWISE_SUB: + tmp_s = tmp_s - value_s; + break; + case ELTWISE_DIV: + tmp_s = tmp_s / value_s; + break; + default: + return NOT_SUPPORTED; + } + } + output_ptr[i] = tmp_s; + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/l2normalization.cpp b/compute/tensor/src/cpu/x86/fp32/l2normalization.cpp new file mode 100644 index 00000000..0ba99ca5 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/l2normalization.cpp @@ -0,0 +1,64 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
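A scalar reference for the 8-wide loop in eltwise_fp32 above, covering the same five ops (a sketch for testing; it assumes every input carries the full `len` elements, i.e. no broadcasting through get_vector):

F32 eltwise_scalar_ref(const F32 *const *input, U32 num, U32 i, EltwiseMode eltwiseMode)
{
    F32 v = input[0][i];
    for (U32 j = 1; j < num; j++) {
        F32 x = input[j][i];
        switch (eltwiseMode) {
            case ELTWISE_SUM:
                v += x;
                break;
            case ELTWISE_MAX:
                v = (x > v) ? x : v;
                break;
            case ELTWISE_PROD:
                v *= x;
                break;
            case ELTWISE_SUB:
                v -= x;  // accumulator minus operand, matching _mm256_sub_ps(tmp_v, value_v)
                break;
            case ELTWISE_DIV:
                v /= x;  // accumulator over operand, matching _mm256_div_ps(tmp_v, value_v)
                break;
            default:
                break;
        }
    }
    return v;
}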
+ +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/x86/tensor_computing_x86.h" +#include "cpu/x86/fp32/x86_functions_fp32.h" + +EE l2normalization_fp32(TensorDesc inputDesc, const F32 *input, TensorDesc outputDesc, F32 *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 ic = 0, ih = 0, iw = 0, oh = 0, ow = 0; + if (tensorIs2d(inputDesc)) { + CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &ih, &iw)); + ic = 1; + CHECK_STATUS(tensor2dGet(outputDesc, &odt, &odf, &oh, &ow)); + } else if (tensorIs3d(inputDesc)) { + U32 oc = 0; + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &ic, &ih, &iw)); + CHECK_STATUS(tensor3dGet(outputDesc, &odt, &odf, &oc, &oh, &ow)); + CHECK_REQUIREMENT(ic == oc); + } else if (tensorIs4d(inputDesc)) { + idt = inputDesc.dt; + ic = inputDesc.dims[0]; + ih = inputDesc.dims[1]; + iw = inputDesc.dims[2]; + } else { + CHECK_STATUS(NOT_MATCH); + } + + // l2norm -> x / sqrt(sum(x^2)) + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < ih; h++) { + U32 index_off = (c * ih + h) * iw; + F32 sum_row = array_var_f32(input + index_off, (I32)iw, 0.f) * static_cast<F32>(iw); + F32 sqrt_sum_row = sqrt(sum_row); + __m256 sqrt_sum_row_4 = _mm256_set1_ps(sqrt_sum_row); + __m256 in, out; + U32 w = 0; + for (w = 0; w < iw - 7; w += 8) { + in = _mm256_loadu_ps(input + index_off + w); + out = _mm256_div_ps(in, sqrt_sum_row_4); + _mm256_storeu_ps(output + index_off + w, out); + } + for (; w < iw; w++) { + output[index_off + w] = input[index_off + w] / sqrt_sum_row; + } + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/lstm.cpp b/compute/tensor/src/cpu/x86/fp32/lstm.cpp new file mode 100644 index 00000000..3ede5f14 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/lstm.cpp @@ -0,0 +1,318 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
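Note the trick in the row loop of l2normalization_fp32 above: calling array_var_f32 with the mean fixed at 0 yields sum(x^2)/len, so multiplying by len recovers the squared L2 norm of the row. In scalar form the whole row computation is (a sketch, matching the kernel in applying no epsilon):

void l2norm_row_ref(const F32 *in, F32 *out, I32 len)
{
    F32 squaredSum = 0.0f;
    for (I32 i = 0; i < len; i++) {
        squaredSum += in[i] * in[i];  // what array_var_f32(..., 0.f) * len amounts to
    }
    F32 norm = sqrtf(squaredSum);
    for (I32 i = 0; i < len; i++) {
        out[i] = in[i] / norm;
    }
}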
+ +#include +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +void mvm_nkn32_with_bias(U32 fn, U32 fk, const F32 *filterArray, const F32 *input, F32 *output, const F32 *bias) +{ +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 n = 0; n < fn; ++n) { + const F32 *f = filterArray + n * fk * 32; + F32 *out = output + n * 32; + const F32 *b = bias + n * 32; + if (bias == nullptr) { + __asm__ __volatile__("vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + : + : + :"%ymm0", "%ymm1", "%ymm2", "%ymm3"); + } else { + __asm__ __volatile__("vmovups (%0), %%ymm0 \n\t" + "vmovups 0x20(%0), %%ymm1 \n\t" + "vmovups 0x40(%0), %%ymm2 \n\t" + "vmovups 0x60(%0), %%ymm3 \n\t" + : + :"r"(b) + :"%ymm0", "%ymm1", "%ymm2", "%ymm3"); + } + __asm__ __volatile__("mov %1, %%rax \n\t" + "mov %3, %%ecx \n\t" + "shr $3, %%ecx \n\t" + "je 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + + "vmovups (%0), %%ymm4 \n\t" + "vmovups 0x20(%0), %%ymm5 \n\t" + "vmovups 0x40(%0), %%ymm6 \n\t" + "vmovups 0x60(%0), %%ymm7 \n\t" + "vbroadcastss 0x0(%%rax), %%ymm8 \n\t" + "vmovups 0x80(%0), %%ymm9 \n\t" + "vmovups 0xA0(%0), %%ymm10 \n\t" + "vmovups 0xC0(%0), %%ymm11 \n\t" + "vmovups 0xE0(%0), %%ymm12 \n\t" + "vbroadcastss 0x4(%%rax), %%ymm13 \n\t" + "vfmadd231ps %%ymm8, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm8, %%ymm5, %%ymm1 \n\t" + "vfmadd231ps %%ymm8, %%ymm6, %%ymm2 \n\t" + "vfmadd231ps %%ymm8, %%ymm7, %%ymm3 \n\t" + + "vmovups 0x100(%0), %%ymm4 \n\t" + "vmovups 0x120(%0), %%ymm5 \n\t" + "vmovups 0x140(%0), %%ymm6 \n\t" + "vmovups 0x160(%0), %%ymm7 \n\t" + "vbroadcastss 0x8(%%rax), %%ymm8 \n\t" + "vfmadd231ps %%ymm13, %%ymm9, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm10, %%ymm1 \n\t" + "vfmadd231ps %%ymm13, %%ymm11, %%ymm2 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + + "vmovups 0x180(%0), %%ymm9 \n\t" + "vmovups 0x1A0(%0), %%ymm10 \n\t" + "vmovups 0x1C0(%0), %%ymm11 \n\t" + "vmovups 0x1E0(%0), %%ymm12 \n\t" + "vbroadcastss 0xC(%%rax), %%ymm13 \n\t" + "vfmadd231ps %%ymm8, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm8, %%ymm5, %%ymm1 \n\t" + "vfmadd231ps %%ymm8, %%ymm6, %%ymm2 \n\t" + "vfmadd231ps %%ymm8, %%ymm7, %%ymm3 \n\t" + + "vmovups 0x200(%0), %%ymm4 \n\t" + "vmovups 0x220(%0), %%ymm5 \n\t" + "vmovups 0x240(%0), %%ymm6 \n\t" + "vmovups 0x260(%0), %%ymm7 \n\t" + "vbroadcastss 0x10(%%rax), %%ymm8 \n\t" + "vfmadd231ps %%ymm13, %%ymm9 , %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm10, %%ymm1 \n\t" + "vfmadd231ps %%ymm13, %%ymm11, %%ymm2 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + + "vmovups 0x280(%0), %%ymm9 \n\t" + "vmovups 0x2A0(%0), %%ymm10 \n\t" + "vmovups 0x2C0(%0), %%ymm11 \n\t" + "vmovups 0x2E0(%0), %%ymm12 \n\t" + "vbroadcastss 0x14(%%rax), %%ymm13 \n\t" + "vfmadd231ps %%ymm8, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm8, %%ymm5, %%ymm1 \n\t" + "vfmadd231ps %%ymm8, %%ymm6, %%ymm2 \n\t" + "vfmadd231ps %%ymm8, %%ymm7, %%ymm3 \n\t" + + "vmovups 0x300(%0), %%ymm4 \n\t" + "vmovups 0x320(%0), %%ymm5 \n\t" + "vmovups 0x340(%0), %%ymm6 \n\t" + "vmovups 0x360(%0), %%ymm7 \n\t" + "vbroadcastss 0x18(%%rax), %%ymm8 \n\t" + "vfmadd231ps %%ymm13, %%ymm9 , %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm10, %%ymm1 \n\t" + "vfmadd231ps %%ymm13, %%ymm11, %%ymm2 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + + "vmovups 0x380(%0), %%ymm9 \n\t" + "vmovups 0x3A0(%0), %%ymm10 \n\t" + "vmovups 0x3C0(%0), %%ymm11 \n\t" + "vmovups 0x3E0(%0), %%ymm12 \n\t" + "vbroadcastss 0x1C(%%rax), %%ymm13 
\n\t" + "vfmadd231ps %%ymm8, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm8, %%ymm5, %%ymm1 \n\t" + "vfmadd231ps %%ymm8, %%ymm6, %%ymm2 \n\t" + "vfmadd231ps %%ymm8, %%ymm7, %%ymm3 \n\t" + + "vfmadd231ps %%ymm13, %%ymm9 , %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm10, %%ymm1 \n\t" + "vfmadd231ps %%ymm13, %%ymm11, %%ymm2 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + + "add $0x400, %0 \n\t" + "add $0x20, %%rax \n\t" + + "sub $1, %%ecx \n\t" + "jg 0b \n\t" + ".align 16 \n\t" + "1: \n\t" + + "mov %3, %%ecx \n\t" + "and $7, %%ecx \n\t" + "je 3f \n\t" + "2: \n\t" + "vmovups (%0), %%ymm4 \n\t" + "vmovups 0x20(%0), %%ymm5 \n\t" + "vmovups 0x40(%0), %%ymm6 \n\t" + "vmovups 0x60(%0), %%ymm7 \n\t" + "vbroadcastss (%%rax), %%ymm8 \n\t" + "vfmadd231ps %%ymm8, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm8, %%ymm5, %%ymm1 \n\t" + "vfmadd231ps %%ymm8, %%ymm6, %%ymm2 \n\t" + "vfmadd231ps %%ymm8, %%ymm7, %%ymm3 \n\t" + "add $0x80, %0 \n\t" + "add $0x4, %%rax \n\t" + "sub $1, %%ecx \n\t" + "jg 2b \n\t" + + "3: \n\t" + "vmovups %%ymm0, (%2) \n\t" + "vmovups %%ymm1, 0x20(%2) \n\t" + "vmovups %%ymm2, 0x40(%2) \n\t" + "vmovups %%ymm3, 0x60(%2) \n\t" + + : "+r"(f) + : "r"(input), "r"(out), "r"(fk) + : "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "memory"); + } +} + +EE rnncell_fp32(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(arch); + if (nullptr == currentX || nullptr == filter || nullptr == bias || nullptr == state || + nullptr == tmp || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ix; + U32 on, oh; + U32 fk, fn; + CHECK_STATUS(tensor2dGet(xDesc, &idt, &idf, &in, &ix)); + CHECK_STATUS(tensor2dGet(filterDesc[0], &fdt, &fdf, &fn, &fk)); + CHECK_STATUS(tensor2dGet(hDesc, &odt, &odf, &on, &oh)); + if (fdf != DF_NKN32) { + CHECK_STATUS(NOT_MATCH); + } + fn /= 32; + + U32 batch = in; + I32 xDim = ix; + I32 hDim = rnnParamSpec.numOutput; + I32 column = (rnnParamSpec.numProjection > 0) ? 
rnnParamSpec.numProjection + : rnnParamSpec.numOutput; + if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(4 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { + CHECK_STATUS(NOT_MATCH); + } + F32 forgetBias = rnnParamSpec.forgetBias; + ActivationMode activationMode = rnnParamSpec.activationMode; + if (activationMode != ACTIVATION_TANH) { + CHECK_STATUS(NOT_SUPPORTED); + } + + const F32 *currentXArray = (const F32 *)currentX; + F32 *lastStateArray = (F32 *)state; + F32 *lastHArray = lastStateArray + column; + F32 *tmpArray = (F32 *)tmp; + F32 *currentStateArray = (F32 *)state; + F32 *currentHArray = currentStateArray + column; + F32 *outputArray = (F32 *)output; + F32 *xhArray = tmpArray; + F32 *intermediateH = xhArray + (xDim + hDim); + U32 lastStateStride = column + hDim; + U32 lastHStride = column + hDim; + U32 currentStateStride = column + hDim; + U32 currentHStride = column + hDim; + __m256 forgetBiasVector = _mm256_set1_ps(forgetBias); + for (U32 m = 0; m < batch; m++) { + F32 *lastBatchH = lastHArray + m * lastHStride; + memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); + memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); + mvm_nkn32_with_bias(fn, fk, (const F32 *)filter[0], xhArray, intermediateH, (const F32 *)bias[0]); + + F32 *out_i = intermediateH; + F32 *out_g = out_i + column; + F32 *out_f = out_i + column * 2; + F32 *out_o = out_i + column * 3; + + F32 *lastBatchState = lastStateArray + m * lastStateStride; + F32 *currentBatchState = currentStateArray + m * currentStateStride; + F32 *currentBatchH = currentHArray + m * currentHStride; + F32 *currentOutput = outputArray + m * batchStrideH; + + F32 *tmpState, *tmpHH, *tmpH; + if (rnnParamSpec.zoneoutCell == 0) { + tmpState = currentBatchState; + } else { + tmpState = out_i; + } + if (rnnParamSpec.numProjection > 0) { + tmpHH = out_g; + tmpH = currentOutput; + } else { + tmpHH = currentOutput; + tmpH = out_g; + } + + I32 h = 0; + for (; h < column - 7; h += 8) { + __m256 out_i_v = _mm256_loadu_ps(out_i + h); + __m256 out_g_v = _mm256_loadu_ps(out_g + h); + __m256 out_f_v = _mm256_loadu_ps(out_f + h); + __m256 out_o_v = _mm256_loadu_ps(out_o + h); + __m256 C_v = _mm256_loadu_ps(lastBatchState + h); + __m256 I_v = _mm256_sigmod_ps(out_i_v); + __m256 F_v = _mm256_sigmod_ps(_mm256_add_ps(out_f_v, forgetBiasVector)); + __m256 O_v = _mm256_sigmod_ps(out_o_v); + __m256 G_v = _mm256_tanh_ps(out_g_v); + C_v = _mm256_add_ps(_mm256_mul_ps(C_v, F_v), _mm256_mul_ps(I_v, G_v)); + __m256 out_hidden_v = _mm256_mul_ps(O_v, _mm256_tanh_ps(C_v)); + _mm256_storeu_ps(tmpState + h, C_v); + _mm256_storeu_ps(tmpHH + h, out_hidden_v); + } + for (; h < column; h++) { + F32 C_s = lastBatchState[h]; + F32 I_s = 1.0 / (1.0 + exp(-out_i[h])); + F32 F_s = 1.0 / (1.0 + exp(-(out_f[h] + forgetBias))); + F32 O_s = 1.0 / (1.0 + exp(-out_o[h])); + F32 G_s = tanh(out_g[h]); + C_s = C_s * F_s + I_s * G_s; + F32 value = O_s * tanh(C_s); + tmpState[h] = C_s; + tmpHH[h] = value; + } + if (rnnParamSpec.zoneoutCell != 0) { + array_scale_f32(tmpState, tmpState, column, 1 - rnnParamSpec.zoneoutCell, 0); + array_scale_f32(lastBatchState, lastBatchState, column, rnnParamSpec.zoneoutCell, 0); + array_add_f32(tmpState, lastBatchState, currentBatchState, column); + } + + if (rnnParamSpec.numProjection > 0) { + mvm_nkn32_with_bias(hDim / 32, rnnParamSpec.numProjection, (const F32 *)filter[1], tmpHH, tmpH, nullptr); + } + + if (rnnParamSpec.zoneoutOutput != 0) { + if (rnnParamSpec.numProjection > 0) 
{ + array_scale_f32(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + } else { + array_scale_f32(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + } + array_scale_f32(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneoutOutput, 0); + array_add_f32(out_f, lastBatchH, currentBatchH, hDim); + } else { + memcpy(currentBatchH, currentOutput, sizeof(F32) * hDim); + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/normalization.cpp b/compute/tensor/src/cpu/x86/fp32/normalization.cpp new file mode 100644 index 00000000..46b08655 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/normalization.cpp @@ -0,0 +1,62 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
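The zoneout branches in rnncell_fp32 above compose array_scale_f32 and array_add_f32 into a convex blend of the freshly computed state and the previous step's state (note that lastBatchState/lastBatchH are scaled in place in the process). The blend itself, as a standalone sketch:

static inline void zoneout_blend_ref(
    const F32 *fresh, const F32 *prev, F32 *out, I32 len, F32 z)
{
    // z == zoneoutCell (or zoneoutOutput): keep a z-weighted share of the old state
    for (I32 i = 0; i < len; i++) {
        out[i] = (1.0f - z) * fresh[i] + z * prev[i];
    }
}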
+ +#include <math.h> +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +inline void array_norm_scale_fp32( + F32 *input, F32 *output, I32 len, F32 mean, F32 var, F32 *alpha, F32 *beta) +{ + F32 eps = 1e-6; + F32 std_value = sqrt(var + eps); + __m256 mean_v = _mm256_set1_ps(mean); + __m256 std_v = _mm256_set1_ps(std_value); + + I32 i = 0; + for (i = 0; i < len - 7; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 alpha_v = _mm256_loadu_ps(alpha + i); + __m256 beta_v = _mm256_loadu_ps(beta + i); + + __m256 tmp_v = _mm256_sub_ps(in, mean_v); + tmp_v = _mm256_div_ps(tmp_v, std_v); + tmp_v = _mm256_fmadd_ps(alpha_v, tmp_v, beta_v); + _mm256_storeu_ps(output + i, tmp_v); + } + for (; i < len; i++) { + output[i] = alpha[i] * (input[i] - mean) / std_value + beta[i]; + } +} + +EE layer_normalization_fp32( + TensorDesc inputDesc, F32 *input, F32 *alpha, F32 *beta, TensorDesc outputDesc, F32 *output) +{ + UNUSED(outputDesc); + if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + U32 size = tensorNumElements(inputDesc); + I32 size_inner = inputDesc.dims[0]; + I32 size_outer = size / size_inner; + for (I32 i = 0; i < size_outer; i++) { + F32 *current_input = input + i * size_inner; + F32 *current_output = output + i * size_inner; + F32 mean = array_mean_f32(current_input, size_inner); + F32 var = array_var_f32(current_input, size_inner, mean); + + array_norm_scale_fp32(current_input, current_output, size_inner, mean, var, alpha, beta); + } + + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/pooling.cpp b/compute/tensor/src/cpu/x86/fp32/pooling.cpp new file mode 100644 index 00000000..0a22e198 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/pooling.cpp @@ -0,0 +1,394 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
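layer_normalization_fp32 above leans on array_mean_f32/array_var_f32 from the shared x86 function headers. A scalar sketch of what they are assumed to return (population statistics; eps = 1e-6 is added before the sqrt inside array_norm_scale_fp32):

void mean_var_ref(const F32 *x, I32 len, F32 *mean, F32 *var)
{
    F32 s = 0.0f;
    for (I32 i = 0; i < len; i++) {
        s += x[i];
    }
    *mean = s / len;
    F32 v = 0.0f;
    for (I32 i = 0; i < len; i++) {
        F32 d = x[i] - *mean;
        v += d * d;  // squared deviation from the row mean
    }
    *var = v / len;  // population variance (divide by len, not len - 1)
}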
+ +#include +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +#define UNROLL_W 4 + +typedef void (*pooling_max_func)(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride); +typedef void (*pooling_mean_func)( + const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 padSize); + +void pooling_max_w4(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__("mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %%eax, %%eax \n\t" + "mov %5, %%eax \n\t" + "mov %%rax, %%r9 \n\t" + "add %%r9, %%r9 \n\t" + "mov %%rax, %%r10 \n\t" + "add %%r9, %%r10 \n\t" + "add %0, %%rax \n\t" + "add %0, %%r9 \n\t" + "add %0, %%r10 \n\t" + + "vmovups (%0), %%ymm0 \n\t" + "vmovups (%%rax), %%ymm1 \n\t" + "vmovups (%%r9), %%ymm2 \n\t" + "vmovups (%%r10), %%ymm3 \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%0), %%ymm4 \n\t" + "vmovups (%%rax), %%ymm5 \n\t" + "vmovups (%%r9), %%ymm6 \n\t" + "vmovups (%%r10), %%ymm7 \n\t" + + "vmaxps %%ymm0, %%ymm4, %%ymm0 \n\t" + "vmaxps %%ymm1, %%ymm5, %%ymm1 \n\t" + "vmaxps %%ymm2, %%ymm6, %%ymm2 \n\t" + "vmaxps %%ymm3, %%ymm7, %%ymm3 \n\t" + + "add $0x20, %0 \n\t" + "add $0x20, %%rax \n\t" + "add $0x20, %%r9 \n\t" + "add $0x20, %%r10 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "add %%rdi, %%r9 \n\t" + "add %%rdi, %%r10 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "vmovups %%ymm2, 0x40(%1) \n\t" + "vmovups %%ymm3, 0x60(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%ymm0", "%ymm1", "%ymm2", + "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "memory", "cc"); +} + +void pooling_max_w2(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %%eax, %%eax \n\t" + "mov %5, %%eax \n\t" + "add %0, %%rax \n\t" + + "vmovups (%0), %%ymm0 \n\t" + "vmovups (%%rax), %%ymm1 \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%0), %%ymm4 \n\t" + "vmovups (%%rax), %%ymm5 \n\t" + + "vmaxps %%ymm0, %%ymm4, %%ymm0 \n\t" + "vmaxps %%ymm1, %%ymm5, %%ymm1 \n\t" + + "add $0x20, %0 \n\t" + "add $0x20, %%rax \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm1", "%ymm4", "%ymm5", "memory", "cc"); +} + +void pooling_max_w1(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__("mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + + "vmovups (%0), %%ymm0 \n\t" + ".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%0), %%ymm4 \n\t" + "vmaxps %%ymm0, %%ymm4, %%ymm0 \n\t" + + "add $0x20, %0 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + "add %%rdi, %0 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + + "vmovups %%ymm0, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm4", "memory", "cc"); +} + +void pooling_mean_w4(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 
iStep, U32 stride, F32 padSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %5, %%eax \n\t" + "mov %%rax, %%r9 \n\t" + "add %%r9, %%r9 \n\t" + "mov %%rax, %%r10 \n\t" + "add %%r9, %%r10 \n\t" + "add %0, %%rax \n\t" + "add %0, %%r9 \n\t" + "add %0, %%r10 \n\t" + + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%0), %%ymm4 \n\t" + "vmovups (%%rax), %%ymm5 \n\t" + "vmovups (%%r9), %%ymm6 \n\t" + "vmovups (%%r10), %%ymm7 \n\t" + + "vaddps %%ymm0, %%ymm4, %%ymm0 \n\t" + "vaddps %%ymm1, %%ymm5, %%ymm1 \n\t" + "vaddps %%ymm2, %%ymm6, %%ymm2 \n\t" + "vaddps %%ymm3, %%ymm7, %%ymm3 \n\t" + + "add $0x20, %0 \n\t" + "add $0x20, %%rax \n\t" + "add $0x20, %%r9 \n\t" + "add $0x20, %%r10 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "add %%rdi, %%r9 \n\t" + "add %%rdi, %%r10 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + + "vbroadcastss (%6), %%ymm4 \n\t" + "vdivps %%ymm4, %%ymm0, %%ymm0 \n\t" + "vdivps %%ymm4, %%ymm1, %%ymm1 \n\t" + "vdivps %%ymm4, %%ymm2, %%ymm2 \n\t" + "vdivps %%ymm4, %%ymm3, %%ymm3 \n\t" + + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "vmovups %%ymm2, 0x40(%1) \n\t" + "vmovups %%ymm3, 0x60(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&padSize) + : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%ymm0", "%ymm1", "%ymm2", "%ymm3", + "%ymm4", "%ymm5", "%ymm6", "%ymm7", "memory", "cc"); +} + +void pooling_mean_w2(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 padSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %5, %%eax \n\t" + "add %0, %%rax \n\t" + + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%0), %%ymm4 \n\t" + "vmovups (%%rax), %%ymm5 \n\t" + + "vaddps %%ymm0, %%ymm4, %%ymm0 \n\t" + "vaddps %%ymm1, %%ymm5, %%ymm1 \n\t" + + "add $0x20, %0 \n\t" + "add $0x20, %%rax \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + + "vbroadcastss (%6), %%ymm4 \n\t" + "vdivps %%ymm4, %%ymm0, %%ymm0 \n\t" + "vdivps %%ymm4, %%ymm1, %%ymm1 \n\t" + + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&padSize) + : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm1", "%ymm4", "%ymm5", "memory", "cc"); +} + +void pooling_mean_w1(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 padSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%0), %%ymm4 \n\t" + + "vaddps %%ymm0, %%ymm4, %%ymm0 \n\t" + + "add $0x20, %0 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + "add %%rdi, %0 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + + "vbroadcastss (%6), %%ymm4 \n\t" + "vdivps %%ymm4, %%ymm0, %%ymm0 \n\t" + + "vmovups %%ymm0, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&padSize) + : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", 
"%ymm4", "memory", "cc"); +} + +EE pooling_fp32(TensorDesc inputDesc, + const F32 *input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + F32 *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (idt != odt || idt != DT_F32) { + CHECK_STATUS(NOT_MATCH); + } + if (in != on || ic != oc) { + CHECK_STATUS(NOT_MATCH); + } + if (idf != DF_NCHWC8 || odf != idf) { + CHECK_STATUS(NOT_MATCH); + } + + PoolingMode pm = poolingParamSpec.mode; + U32 strideH = poolingParamSpec.stride_h; + U32 strideW = poolingParamSpec.stride_w; + U32 paddingT = poolingParamSpec.padding_top; + U32 paddingL = poolingParamSpec.padding_left; + U32 kernelSizeH = poolingParamSpec.kernel_h; + U32 kernelSizeW = poolingParamSpec.kernel_w; + U32 wSize, kh, kw, iStep; + F32 padSize, *curO; + const F32 *curI; + if (paddingT >= kernelSizeH || paddingL >= kernelSizeW) { + CHECK_STATUS(NOT_SUPPORTED); + } + + ic /= 8; + U32 wSizes[3] = {1, 2, 4}; + pooling_max_func pooling_max[3] = {pooling_max_w1, pooling_max_w2, pooling_max_w4}; + pooling_mean_func pooling_mean[3] = {pooling_mean_w1, pooling_mean_w2, pooling_mean_w4}; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < oh; h++) { + for (U32 w = 0; w < ow; w += wSize) { + wSize = UNI_MIN(ow - w - paddingL / strideW, UNROLL_W); + wSize = wSizes[wSize >> 1]; + int hstart = (int)h * (int)strideH - (int)paddingT; + int wstart = (int)w * (int)strideW - (int)paddingL; + int hend = UNI_MIN(hstart + kernelSizeH, ih); + int wend = UNI_MIN(wstart + kernelSizeW, iw); + hstart = UNI_MAX(hstart, 0); + wstart = UNI_MAX(wstart, 0); + + curI = input + (hstart * iw + wstart) * 8; + curO = output + (h * ow + w) * 8; + kh = hend - hstart; + kw = wend - wstart; + iStep = (iw - kw) * 32; + padSize = kw * kh * 1.0f; + if (kw < kernelSizeW) { + wSize = 1; + } + + switch (pm) { + case POOLING_MAX: { + pooling_max[wSize >> 1](curI, curO, kw, kh, iStep, strideW * 32); + break; + } + case POOLING_MEAN: { + pooling_mean[wSize >> 1]( + curI, curO, kw, kh, iStep, strideW * 32, padSize); + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + } + } + } + input += ih * iw * 8; + output += oh * ow * 8; + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/scale.cpp b/compute/tensor/src/cpu/x86/fp32/scale.cpp new file mode 100644 index 00000000..673ff9d8 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/scale.cpp @@ -0,0 +1,119 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "cpu/x86/fp32/tensor_computing_fp32.h"
+
+EE scale_nchwc8_fp32(
+    F32 *input, F32 *alpha, F32 *beta, I32 in, I32 ic, I32 elements_per_channel, F32 *output)
+{
+    __m256 in_vec, out_vec;
+    __m256 one = _mm256_set1_ps(1.);
+    __m256 zero = _mm256_set1_ps(0.);
+    U32 index = 0;
+    for (I32 n = 0; n < in; n++) {
+        for (I32 c = 0; c < ic; c += 8) {
+            __m256 alpha_vec = (alpha == nullptr) ? one : _mm256_loadu_ps(alpha + c);
+            __m256 beta_vec = (beta == nullptr) ? zero : _mm256_loadu_ps(beta + c);
+            for (I32 i = 0; i < elements_per_channel; i++) {
+                in_vec = _mm256_loadu_ps(input + index);
+                out_vec = _mm256_fmadd_ps(alpha_vec, in_vec, beta_vec);
+                _mm256_storeu_ps(output + index, out_vec);
+                index += 8;
+            }
+        }
+    }
+    return SUCCESS;
+}
+
+EE scale_nchw_fp32(
+    F32 *input, F32 *alpha, F32 *beta, I32 in, I32 ic, I32 elements_per_channel, F32 *output)
+{
+    __m256 one = _mm256_set1_ps(1.);
+    __m256 zero = _mm256_set1_ps(0.);
+    U32 index = 0;
+    for (I32 n = 0; n < in; n++) {
+        for (I32 c = 0; c < ic; c++) {
+            __m256 alpha_vec = (alpha == nullptr) ? one : _mm256_set1_ps(alpha[c]);
+            __m256 beta_vec = (beta == nullptr) ? zero : _mm256_set1_ps(beta[c]);
+            I32 i = 0;
+            for (; i < elements_per_channel - 7; i += 8) {
+                __m256 in_vec = _mm256_loadu_ps(input + index);
+                __m256 out_vec = _mm256_fmadd_ps(alpha_vec, in_vec, beta_vec);
+                _mm256_storeu_ps(output + index, out_vec);
+                index += 8;
+            }
+            for (; i < elements_per_channel; i++) {
+                float alpha_s = (alpha == nullptr) ? 1 : alpha[c];
+                float beta_s = (beta == nullptr) ? 0 : beta[c];
+                output[index] = alpha_s * input[index] + beta_s;
+                index++;
+            }
+        }
+    }
+    return SUCCESS;
+}
+
+EE scale_nhwc_fp32(
+    F32 *input, F32 *alpha, F32 *beta, I32 in, I32 ic, I32 elements_per_channel, F32 *output)
+{
+    __m256 one = _mm256_set1_ps(1.);
+    __m256 zero = _mm256_set1_ps(0.);
+    U32 index = 0;
+    for (I32 n = 0; n < in; n++) {
+        for (I32 i = 0; i < elements_per_channel; i++) {
+            I32 c = 0;
+            for (; c < ic - 7; c += 8) {
+                __m256 alpha_vec = (alpha == nullptr) ? one : _mm256_loadu_ps(alpha + c);
+                __m256 beta_vec = (beta == nullptr) ? zero : _mm256_loadu_ps(beta + c);
+                __m256 in_vec = _mm256_loadu_ps(input + index);
+                __m256 out_vec = _mm256_fmadd_ps(alpha_vec, in_vec, beta_vec);
+                _mm256_storeu_ps(output + index, out_vec);
+                index += 8;
+            }
+            for (; c < ic; c++) {
+                float alpha_s = (alpha == nullptr) ? 1 : alpha[c];
+                float beta_s = (beta == nullptr) ? 0 : beta[c];
+                output[index] = alpha_s * input[index] + beta_s;
+                index++;
+            }
+        }
+    }
+    return SUCCESS;
+}
+
+EE scale_fp32(F32 *input,
+    I32 axis,
+    I32 nDims,
+    F32 *alpha,
+    F32 *beta,
+    I32 in,
+    I32 ic,
+    I32 elements_per_channel,
+    F32 *output)
+{
+    if (nullptr == input || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    EE ret = SUCCESS;
+    // If ic is 1, the alpha/beta vectors hold a single parameter, so fall back to the NCHW path.
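+    // Axis dispatch below mirrors the normalization done in scale_x86: axis 0/1
+    // (or a single-param alpha/beta) takes the NCHW path with one scalar broadcast
+    // per channel; axis == nDims - 1 means channels are innermost (NHWC), so
+    // alpha/beta are loaded eight channels at a time; axis == nDims is the
+    // sentinel the caller sets for the DF_NCHWC8 tiled layout.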
+ if (axis == 1 || axis == 0 || ic == 1) { + ret = scale_nchw_fp32(input, alpha, beta, in, ic, elements_per_channel, output); + } else if (axis == nDims - 1) { + ret = scale_nhwc_fp32(input, alpha, beta, in, ic, elements_per_channel, output); + } else if (axis == nDims) { + ret = scale_nchwc8_fp32(input, alpha, beta, in, ic, elements_per_channel, output); + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/fp32/softmax.cpp b/compute/tensor/src/cpu/x86/fp32/softmax.cpp new file mode 100644 index 00000000..1ff108db --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/softmax.cpp @@ -0,0 +1,139 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
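+
+// Numerically stable softmax: both kernels below compute
+// y_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x)),
+// subtracting the row maximum before exponentiation so exp() cannot overflow.
+// softmax_lastAxis_fp32 handles a contiguous (innermost) reduction axis in one
+// pass; softmax_anyAxis_fp32 keeps running max/sum buffers of size loopInner
+// for strided axes and normalizes in a final sweep.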
+
+#include <math.h>
+#include <string.h>
+#include <vector>
+#include "cpu/x86/fp32/tensor_computing_fp32.h"
+
+void softmax_lastAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, F32 *output)
+{
+    for (I32 i = 0; i < loopOuter; i++) {
+        const F32 *inputPtr = input + i * loops;
+        F32 *outputPtr = output + i * loops;
+
+        __m256 max_v, sub_v, sum_v, tmp_v;
+        F32 max_s, tmp_s;
+        max_s = array_max_f32(inputPtr, loops);
+        max_v = _mm256_set1_ps(max_s);
+        sum_v = _mm256_set1_ps(0.f);
+
+        I32 j = 0;
+        F32 sum_s = 0;
+        for (j = 0; j < loops - 7; j += 8) {
+            __m256 in = _mm256_loadu_ps(inputPtr + j);
+            sub_v = _mm256_sub_ps(in, max_v);
+            tmp_v = _mm256_exp_ps(sub_v);
+            sum_v = _mm256_add_ps(sum_v, tmp_v);
+            _mm256_storeu_ps(outputPtr + j, tmp_v);
+        }
+        sum_s += _mm256_sum_ps(sum_v);
+        for (; j < loops; j++) {
+            tmp_s = exp(inputPtr[j] - max_s);
+            outputPtr[j] = tmp_s;
+            sum_s += tmp_s;
+        }
+        array_scale_f32(outputPtr, outputPtr, loops, 1.0 / sum_s, 0);
+    }
+}
+
+void softmax_anyAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, I32 loopInner, F32 *output)
+{
+    std::vector<F32> buffer(loopInner * 2);
+    F32 *maxBuffer = &buffer[0];
+    F32 *sumBuffer = &buffer[loopInner];
+    I32 k = 0;
+    for (I32 i = 0; i < loopOuter; i++) {
+        const F32 *inputPtrBase = input + i * loops * loopInner;
+        F32 *outputPtrBase = output + i * loops * loopInner;
+
+        memcpy(maxBuffer, inputPtrBase, loopInner * sizeof(F32));
+        memset(sumBuffer, 0, loopInner * sizeof(F32));
+        for (I32 j = 1; j < loops; j++) {
+            const F32 *inputPtr = inputPtrBase + j * loopInner;
+            for (k = 0; k < loopInner - 7; k += 8) {
+                __m256 in_v = _mm256_loadu_ps(inputPtr + k);
+                __m256 out_v = _mm256_loadu_ps(maxBuffer + k);
+                __m256 max_v = _mm256_max_ps(in_v, out_v);
+                _mm256_storeu_ps(maxBuffer + k, max_v);
+            }
+            for (; k < loopInner; k++) {
+                maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]);
+            }
+        }
+        for (I32 j = 0; j < loops; j++) {
+            const F32 *inputPtr = inputPtrBase + j * loopInner;
+            F32 *outputPtr = outputPtrBase + j * loopInner;
+            for (k = 0; k < loopInner - 7; k += 8) {
+                __m256 in_v = _mm256_loadu_ps(inputPtr + k);
+                __m256 max_v = _mm256_loadu_ps(maxBuffer + k);
+                __m256 sub_v = _mm256_sub_ps(in_v, max_v);
+                __m256 exp_v = _mm256_exp_ps(sub_v);
+                __m256 sum_v = _mm256_loadu_ps(sumBuffer + k);
+                sum_v = _mm256_add_ps(sum_v, exp_v);
+                _mm256_storeu_ps(sumBuffer + k, sum_v);
+                _mm256_storeu_ps(outputPtr + k, exp_v);
+            }
+            for (; k < loopInner; k++) {
+                outputPtr[k] = exp(inputPtr[k] - maxBuffer[k]);
+                sumBuffer[k] += outputPtr[k];
+            }
+        }
+        for (I32 j = 0; j < loops; j++) {
+            F32 *outputPtr = outputPtrBase + j * loopInner;
+            for (k = 0; k < loopInner - 7; k += 8) {
+                __m256 out_v = _mm256_loadu_ps(outputPtr + k);
+                __m256 sum_v = _mm256_loadu_ps(sumBuffer + k);
+                out_v = _mm256_div_ps(out_v, sum_v);
+                _mm256_storeu_ps(outputPtr + k, out_v);
+            }
+            for (; k < loopInner; k++) {
+                outputPtr[k] /= sumBuffer[k];
+            }
+        }
+    }
+}
+
+EE softmax_fp32(TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output)
+{
+    UNUSED(outputDesc);
+    if (nullptr == input || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+
+    U32 size = tensorNumElements(inputDesc);
+    axis = (axis + inputDesc.nDims) % inputDesc.nDims;
+    axis = inputDesc.nDims - 1 - axis;
+    I32 loops = inputDesc.dims[axis];
+
+    I32 loopInner = 1;
+    for (int i = 0; i < axis; i++) {
+        loopInner *= inputDesc.dims[i];
+    }
+    U32 loopOuter = size / loops / loopInner;
+
+    if (loopInner == 1) {
+        if (DF_NCHWC8 == inputDesc.df && 4 == inputDesc.nDims &&
+            (inputDesc.dims[1] != 1 || inputDesc.dims[0] != 1)) {
+            CHECK_REQUIREMENT(2 != axis);
+            loopInner *= 8;
+            loopOuter /= 8;
+            softmax_anyAxis_fp32(input, loopOuter, loops, loopInner, output);
+        } else {
+            softmax_lastAxis_fp32(input, loopOuter, loops, output);
+        }
+    } else {
+        CHECK_REQUIREMENT(DF_NCHWC8 != inputDesc.df);
+        softmax_anyAxis_fp32(input, loopOuter, loops, loopInner, output);
+    }
+    return SUCCESS;
+}
diff --git a/compute/tensor/src/cpu/x86/fp32/tensor_computing_fp32.h b/compute/tensor/src/cpu/x86/fp32/tensor_computing_fp32.h
new file mode 100644
index 00000000..6ff33c5d
--- /dev/null
+++ b/compute/tensor/src/cpu/x86/fp32/tensor_computing_fp32.h
@@ -0,0 +1,259 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef CHEETAH_TENSOR_COMPUTING_FP32_H
+#define CHEETAH_TENSOR_COMPUTING_FP32_H
+
+#include <vector>
+#include "sys.h"
+#include "error.h"
+#include "types.h"
+#include "thread_affinity.h"
+#include "x86_functions_fp32.h"
+
+EE attention_mask_fp32(TensorDesc inputDesc,
+    const F32 *input,
+    AttentionMaskParamSpec p,
+    TensorDesc outputDesc,
+    F32 *output);
+
+EE convolution_transform_filter_fp32(TensorDesc filterDesc,
+    const F32 *filter,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    TensorDesc *ftmDesc,
+    F32 *filterTransformed);
+
+EE convolution_infer_forward_tmp_bytes_fp32(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    U32 *bytes);
+
+EE convolution_fp32(TensorDesc inputDesc,
+    F32 *input,
+    TensorDesc filterDesc,
+    const F32 *filter,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    TensorDesc biasDesc,
+    const F32 *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F32 *output,
+    ActivationParamSpec activationDesc,
+    Arch arch);
+
+EE convolution_direct(TensorDesc inputDesc,
+    F32 *inArray,
+    TensorDesc filterDesc,
+    const F32 *filterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const F32 *biasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F32 *outArray,
+    ActivationParamSpec activationDesc);
+
+EE convolution_1x1_direct(TensorDesc inputDesc,
+    F32 *inArray,
+    TensorDesc filterDesc,
+    const F32 *filterArray,
+    ConvolutionParamSpec convParamSpec,
+    const F32 *biasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F32 *outArray,
+    ActivationParamSpec activationDesc);
+
+EE convolution_2x2_direct(TensorDesc inputDesc,
+    F32 *inArray,
+    TensorDesc filterDesc,
+    const F32 *filterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const F32 *biasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F32 *outArray,
+    ActivationParamSpec activationDesc);
+
+EE convolution_direct_nchw(TensorDesc inputDesc,
+    F32 *inArray,
+    TensorDesc filterDesc,
+    const F32 *filterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const F32 *biasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F32 *outArray,
+    ActivationParamSpec activationDesc);
+
+EE check_fp32(TensorDesc inputDescA,
+    const F32 *inputA,
+    TensorDesc inputDescB,
+    const F32 *inputB,
+    CheckMode checkMode,
+    TensorDesc outputDesc,
+    I32 *output);
+
+EE clip_fp32(F32 *input, F32 *output, I32 len, F32 minValue, F32 maxValue);
+
+EE depthwise_pointwise_convolution_transform_filter_fp32(TensorDesc dwFilterDesc,
+    const F32 *dwFilter,
+    TensorDesc pwFilterDesc,
+    const F32 *pwFilter,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc *dwFtmDesc,
+    F32 *dwFilterTransformed,
+    TensorDesc *pwFtmDesc,
+    F32 *pwFilterTransformed);
+
+EE depthwise_pointwise_convolution_fp32(TensorDesc inputDesc,
+    F32 *input,
+    TensorDesc dwFilterDesc,
+    const F32 *dwFilter,
+    TensorDesc pwFilterDesc,
+    const F32 *pwFilter,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc dwBiasDesc,
+    const F32 *dwBias,
+    TensorDesc pwBiasDesc,
+    const F32 *pwBias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F32 *output,
+    ActivationParamSpec depthwiseActivationParamSpec,
+    ActivationParamSpec pointwiseActivationParamSpec,
+    Arch arch);
+
+EE depthwise_convolution_infer_forward_algorithm_fp32(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionPolicy policy,
+    DepthwiseConvolutionForwardAlgorithm *algorithm,
+    DataType targetDataType);
+
+EE depthwise_convolution_transform_filter_bytes_fp32(
+    TensorDesc filterDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32 *bytes);
+
+EE depthwise_convolution_transform_filter_fp32(TensorDesc filterDesc,
+    const F32 *filter,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc *ftmDesc,
+    F32 *filterTransformed);
+
+EE depthwise_convolution_fp32(TensorDesc inputDesc,
+    F32 *input,
+    TensorDesc filterDesc,
+    const F32 *filter,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc biasDesc,
+    const F32 *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F32 *output,
+    ActivationParamSpec depthwiseActivationParamSpec,
+    Arch arch);
+
+EE depthwise_convolution_direct(TensorDesc inputDesc,
+    F32 *inArray,
+    TensorDesc dwFilterDesc,
+    const F32 *dwFilterArray,
+    TensorDesc pwFilterDesc,
+    const F32 *pwFilterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc dwBiasDesc,
+    const F32 *dwBiasArray,
+    TensorDesc pwBiasDesc,
+    const F32 *pwBiasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F32 *outArray,
+    ActivationParamSpec depthwiseActivationParamSpec,
+    ActivationParamSpec pointwiseActivationParamSpec);
+
+EE eltwise_fp32(std::vector<void *> input,
+    std::vector<U32> inputSize,
+    U32 num,
+    U32 len,
+    void *output,
+    EltwiseMode eltwiseMode);
+
+EE layer_normalization_fp32(
+    TensorDesc inputDesc, F32 *input, F32
*alpha, F32 *beta, TensorDesc outputDesc, F32 *output); + +EE l2normalization_fp32(TensorDesc inputDesc, const F32 *input, TensorDesc outputDesc, F32 *output); + +EE rnncell_fp32(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch); + +EE pooling_fp32(TensorDesc inputDesc, + const F32 *input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + F32 *output); + +EE scale_fp32(F32 *input, + I32 axis, + I32 nDims, + F32 *alpha, + F32 *beta, + I32 in, + I32 ic, + I32 elements_per_channel, + F32 *output); + +EE softmax_fp32( + TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output); + +EE deconvolution_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed); + +#endif //CHEETAH_TENSOR_COMPUTING_FP32_H diff --git a/compute/tensor/src/cpu/x86/fp32/transform_functions_fp32.h b/compute/tensor/src/cpu/x86/fp32/transform_functions_fp32.h new file mode 100644 index 00000000..4e69369b --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/transform_functions_fp32.h @@ -0,0 +1,149 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
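+
+// Packing helpers for the direct convolution kernels. transformNCHWToNCHWCxNx
+// repacks an NCHW filter into C-channel x N-output-channel tiles so each inner
+// kernel can stream N consecutive output channels per input channel with one
+// AVX2 gather. A hedged usage sketch (the concrete <N, C> instantiation is
+// chosen by the convolution transform code, not fixed here):
+//     transformNCHWToNCHWCxNx<32, 8>(filterDesc, filter, ftmDesc, transformed);
+// Full N-row blocks are emitted first, then 16- and 8-row blocks, and a masked
+// gather handles the remaining (< 8) output channels.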
+
+#ifndef CHEETAH_X86_TRANSFORM_FUNCTIONS_FP32_H
+#define CHEETAH_X86_TRANSFORM_FUNCTIONS_FP32_H
+
+#include "types.h"
+
+template <U32 N, U32 C>
+inline void transformNCHWCxNx(U32 fc, U32 fh, U32 fw, U32 oc, const F32 *input, F32 *output)
+{
+    F32 *dest;
+    const F32 *src;
+    U32 cSize = 0, cSizePadding = 0;
+    U32 lstep = fc * fh * fw;
+    __m256i vindex = _mm256_set_epi32(
+        lstep * 7, lstep * 6, lstep * 5, lstep * 4, lstep * 3, lstep * 2, lstep, 0);
+    for (U32 c = 0; c < fc; c += cSize) {
+        cSize = UNI_MIN(fc - c, C);
+        cSizePadding = UNI_MIN(oc - c, C);
+        for (U32 hw = 0; hw < fh * fw; ++hw) {
+            for (U32 c8 = 0; c8 < cSize; ++c8) {
+                src = input + (c + c8) * fh * fw + hw;
+                dest = output + c * fh * fw * N + hw * cSizePadding * N + c8 * N;
+                if (N >= 8) {
+                    _mm256_storeu_ps(dest, _mm256_i32gather_ps(src, vindex, 4));
+                }
+                if (N >= 16) {
+                    _mm256_storeu_ps(dest + 8, _mm256_i32gather_ps(src + 8 * lstep, vindex, 4));
+                }
+                if (N >= 24) {
+                    _mm256_storeu_ps(dest + 16, _mm256_i32gather_ps(src + 16 * lstep, vindex, 4));
+                }
+                if (N == 32) {
+                    _mm256_storeu_ps(dest + 24, _mm256_i32gather_ps(src + 24 * lstep, vindex, 4));
+                }
+            }
+            memset(dest + N, 0, ((cSizePadding - cSize) * N * 4));
+        }
+    }
+}
+
+// N is 32/24
+template <U32 N, U32 C>
+inline EE transformNCHWToNCHWCxNx(
+    TensorDesc inputDesc, const F32 *input, TensorDesc outputDesc, F32 *output)
+{
+    if (input == NULL || output == NULL) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType fdt, odt;
+    DataFormat fdf, odf;
+    U32 fn, fc, fh, fw;
+    U32 on, oc, oh, ow;
+    CHECK_STATUS(tensor4dGet(inputDesc, &fdt, &fdf, &fn, &fc, &fh, &fw));
+    CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
+
+    U32 remain = fn % N;
+    fn -= remain;
+
+    for (U32 n = 0; n < fn; n += N) {
+        transformNCHWCxNx<N, C>(fc, fh, fw, oc, input, output);
+        input += fc * fh * fw * N;
+        output += oc * fh * fw * N;
+    }
+    if (remain >= 16) {
+        transformNCHWCxNx<16, C>(fc, fh, fw, oc, input, output);
+        input += fc * fh * fw * 16;
+        output += oc * fh * fw * 16;
+        remain -= 16;
+    }
+    if (remain >= 8) {
+        transformNCHWCxNx<8, C>(fc, fh, fw, oc, input, output);
+        input += fc * fh * fw * 8;
+        output += oc * fh * fw * 8;
+        remain -= 8;
+    }
+    if (remain > 0) {
+        F32 *dest;
+        const F32 *src;
+        U32 cSize = 0, cSizePadding = 0;
+        F32 m[8] = {0.0f};
+        for (U32 i = 0; i < remain; ++i) {
+            m[i] = -1.0f;
+        }
+        __m256 mask = _mm256_set_ps(m[7], m[6], m[5], m[4], m[3], m[2], m[1], m[0]);
+        U32 lstep = fc * fh * fw;
+        __m256i vindex = _mm256_set_epi32(
+            lstep * 7, lstep * 6, lstep * 5, lstep * 4, lstep * 3, lstep * 2, lstep, 0);
+        __m256 src256 = _mm256_setzero_ps();
+        for (U32 c = 0; c < fc; c += cSize) {
+            cSize = UNI_MIN(fc - c, C);
+            cSizePadding = UNI_MIN(oc - c, C);
+            for (U32 hw = 0; hw < fh * fw; ++hw) {
+                for (U32 c8 = 0; c8 < cSize; ++c8) {
+                    src = input + (c + c8) * fh * fw + hw;
+                    dest = output + c * fh * fw * 8 + hw * cSizePadding * 8 + c8 * 8;
+                    _mm256_storeu_ps(dest, _mm256_mask_i32gather_ps(src256, src, vindex, mask, 4));
+                }
+                memset(dest + 8, 0, ((cSizePadding - cSize) * 32));
+            }
+        }
+        fn += remain;
+    }
+    return SUCCESS;
+}
+
+inline void PaddingNCHWC8(
+    F32 *data, F32 *tmp, TensorDesc inputDesc, ConvolutionParamSpec convParamSpec)
+{
+    // NCHWC8
+    DataType idt;
+    DataFormat idf;
+    U32 in, ic, ih, iw;
+    CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+    U32 paddingT = convParamSpec.padding_top;
+    U32 paddingB = convParamSpec.padding_bottom;
+    U32 paddingL = convParamSpec.padding_left;
+    U32 paddingR = convParamSpec.padding_right;
+
+    U32 padih = paddingT + paddingB + ih;
+    U32 padiw = paddingL + paddingR + iw;
+    U32 coff, hoff;
+
+    CHECK_REQUIREMENT((idf == DF_NCHWC8) && (ic % 8 == 0));
+    for (U32 c = 0; c < ic; c += 8) {
+        coff = c * padih * padiw;
+        memset(tmp + coff, 0, padiw * paddingT * 8 * bytesOf(idt));
+        for (U32 h = 0; h < ih; ++h) {
+            hoff = (h + paddingT) * padiw;
+            memset(tmp + coff + hoff * 8, 0, paddingL * 8 * bytesOf(idt));
+            memcpy(tmp + coff + (hoff + paddingL) * 8, data + c * ih * iw + h * iw * 8, iw * 8 * bytesOf(idt));
+            memset(tmp + coff + (hoff + (paddingL + iw)) * 8, 0, paddingR * 8 * bytesOf(idt));
+        }
+        memset(tmp + coff + (ih + paddingT) * padiw * 8, 0, padiw * paddingB * 8 * bytesOf(idt));
+    }
+}
+
+#endif
diff --git a/compute/tensor/src/cpu/x86/fp32/x86_functions_fp32.h b/compute/tensor/src/cpu/x86/fp32/x86_functions_fp32.h
new file mode 100644
index 00000000..2897bdb6
--- /dev/null
+++ b/compute/tensor/src/cpu/x86/fp32/x86_functions_fp32.h
@@ -0,0 +1,361 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef CHEETAH_X86_FUNCTIONS_FP32_H
+#define CHEETAH_X86_FUNCTIONS_FP32_H
+#include <math.h>
+#include <string.h>
+#include "x86_avx2_expand.h"
+#include "types.h"
+
+inline EE activation_fp32(F32 *input, U32 len, ActivationParamSpec activationDesc, F32 *output)
+{
+    __m256 in, out;
+    __m256 zero = _mm256_set1_ps(0.);
+    __m256 one = _mm256_set1_ps(1.);
+    __m256 three = _mm256_set1_ps(3.);
+    __m256 six = _mm256_set1_ps(6.);
+    U32 len_main = len / 8;
+    U32 len_tail = len % 8;
+
+    F32 value;
+    switch (activationDesc.mode) {
+        case ACTIVATION_NULL: {
+            break;
+        }
+        case ACTIVATION_RELU: {
+            if (activationDesc.value[0] == 0) {
+                for (U32 i = 0; i < len_main; i++) {
+                    in = _mm256_loadu_ps(input);
+                    out = _mm256_max_ps(zero, in);
+                    _mm256_storeu_ps(output, out);
+                    input += 8;
+                    output += 8;
+                }
+                for (U32 i = 0; i < len_tail; i++) {
+                    output[i] = (input[i] < 0) ? 0 : input[i];
+                }
+            } else {
+                __m256 scale = _mm256_set1_ps(activationDesc.value[0]);
+                for (U32 i = 0; i < len_main; i++) {
+                    in = _mm256_loadu_ps(input);
+                    __m256 tmp = _mm256_mul_ps(in, scale);
+                    out = _mm256_max_ps(tmp, in);
+                    _mm256_storeu_ps(output, out);
+                    input += 8;
+                    output += 8;
+                }
+                for (U32 i = 0; i < len_tail; i++) {
+                    float tmp = activationDesc.value[0] * input[i];
+                    output[i] = (input[i] < tmp) ?
tmp : input[i]; + } + } + break; + } + case ACTIVATION_RELU6: { + for (U32 i = 0; i < len_main; i++) { + in = _mm256_loadu_ps(input); + out = _mm256_max_ps(zero, in); + out = _mm256_min_ps(six, out); + _mm256_storeu_ps(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = (input[i] < 0) ? 0 : input[i]; + if (value > 6) { + value = 6; + } + output[i] = value; + } + break; + } + case ACTIVATION_H_SIGMOID: { + for (U32 i = 0; i < len_main; i++) { + in = _mm256_loadu_ps(input); + out = _mm256_add_ps(in, three); + out = _mm256_max_ps(out, zero); + out = _mm256_min_ps(out, six); + out = _mm256_div_ps(out, six); + _mm256_storeu_ps(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = input[i] + 3; + value = (value < 0) ? 0 : value; + value = (value > 6) ? 6 : value; + value = value / 6; + output[i] = value; + } + break; + } + case ACTIVATION_H_SWISH: { + for (U32 i = 0; i < len_main; i++) { + in = _mm256_loadu_ps(input); + out = _mm256_add_ps(in, three); + out = _mm256_max_ps(out, zero); + out = _mm256_min_ps(out, six); + out = _mm256_div_ps(out, six); + out = _mm256_mul_ps(out, in); + _mm256_storeu_ps(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = input[i] + 3; + value = (value < 0) ? 0 : value; + value = (value > 6) ? 6 : value; + value = input[i] * value; + value = value / 6; + output[i] = value; + } + break; + } + case ACTIVATION_GELU: { + F32 two_div_PI_sqrt = sqrt(2 / 3.14159265358979323846); + __m256 vec0 = _mm256_set1_ps(two_div_PI_sqrt); + __m256 vec1 = _mm256_set1_ps(0.044715); + __m256 vec2 = _mm256_set1_ps(0.5); + for (U32 i = 0; i < len_main; i++) { + in = _mm256_loadu_ps(input); + out = _mm256_mul_ps(in, in); + out = _mm256_mul_ps(out, in); + out = _mm256_fmadd_ps(vec1, out, in); + out = _mm256_mul_ps(vec0, out); + out = _mm256_tanh_ps(out); + out = _mm256_add_ps(one, out); + out = _mm256_mul_ps(vec2, out); + out = _mm256_mul_ps(in, out); + _mm256_storeu_ps(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = input[i]; + value = two_div_PI_sqrt * (value + 0.044715 * powf(value, 3)); + value = 1.0 - 2.0 / (exp(2.0 * value) + 1.0); + value = 0.5 * (1.0 + value); + value = input[i] * value; + output[i] = value; + } + break; + } + case ACTIVATION_TANH: { + for (U32 i = 0; i < len_main; i++) { + in = _mm256_loadu_ps(input); + out = _mm256_tanh_ps(in); + _mm256_storeu_ps(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = 1.0 - 2.0 / (exp(2.0 * input[i]) + 1.0); + output[i] = value; + } + break; + } + case ACTIVATION_SIGMOID: { + for (U32 i = 0; i < len_main; i++) { + in = _mm256_loadu_ps(input); + out = _mm256_sigmod_ps(in); + _mm256_storeu_ps(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = 1.0 / (1.0 + exp(-1.0 * input[i])); + output[i] = value; + } + break; + } + case ACTIVATION_MISH: { + for (U32 i = 0; i < len_main; i++) { + in = _mm256_loadu_ps(input); + out = _mm256_mul_ps( + in, _mm256_tanh_ps(_mm256_log_ps(_mm256_add_ps(_mm256_exp_ps(in), one)))); + _mm256_storeu_ps(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = input[i] * tanh(log(exp(input[i]) + 1.0)); + output[i] = value; + } + break; + } + default: + return NOT_SUPPORTED; + } + + return SUCCESS; +} + +inline void array_scale_f32(const F32 *input, F32 *output, I32 len, F32 alpha, F32 beta) +{ + __m256 alpha_v = 
_mm256_set1_ps(alpha); + __m256 beta_v = _mm256_set1_ps(beta); + I32 i = 0; + for (i = 0; i < len - 7; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 tmp_v = _mm256_add_ps(beta_v, _mm256_mul_ps(alpha_v, in)); + _mm256_storeu_ps(output + i, tmp_v); + } + for (; i < len; i++) { + output[i] = alpha * input[i] + beta; + } +} + +inline void array_power_f32(F32 *input, F32 *output, I32 len, F32 power) +{ + I32 i = 0; + if (power == -1) { + __m256 one_v = _mm256_set1_ps(1); + for (i = 0; i < len - 7; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 tmp_v = _mm256_div_ps(one_v, in); + _mm256_storeu_ps(output + i, tmp_v); + } + } else if (power == 0.5) { + for (i = 0; i < len - 7; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 tmp_v = _mm256_sqrt_ps(in); + _mm256_storeu_ps(output + i, tmp_v); + } + } else if (power == 1) { + if (input != output) { + memcpy(output, input, len * sizeof(F32)); + } + i = len; + } else if (power == 2) { + for (i = 0; i < len - 7; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 tmp_v = _mm256_mul_ps(in, in); + _mm256_storeu_ps(output + i, tmp_v); + } + } + for (; i < len; i++) { + output[i] = powf(input[i], power); + } +} + +inline F32 array_max_f32(const F32 *data, I32 len) +{ + F32 max_s = data[0]; + I32 i = 0; + if (len >= 16) { + __m256 max_v, tmp_v; + F32 max_nums[8]; + max_v = _mm256_loadu_ps(data); + for (i = 8; i < len - 7; i += 8) { + tmp_v = _mm256_loadu_ps(data + i); + max_v = _mm256_max_ps(tmp_v, max_v); + } + _mm256_storeu_ps(max_nums, max_v); + max_s = _mm256_hmax_ps(max_v); + } + + for (; i < len; i++) { + if (data[i] > max_s) { + max_s = data[i]; + } + } + + return max_s; +} + +// array var +inline F32 array_var_f32(const F32 *data, I32 len, F32 mean) +{ + if (len <= 0) { + return 0; + } + + I32 i = 0; + F32 sum_s = 0; + __m256 mean_v = _mm256_set1_ps(mean); + for (i = 0; i < len - 7; i += 8) { + __m256 in = _mm256_loadu_ps(data + i); + __m256 tmp_v = _mm256_sub_ps(in, mean_v); + __m256 sum_v = _mm256_mul_ps(tmp_v, tmp_v); + sum_s += _mm256_sum_ps(sum_v); + } + for (; i < len; i++) { + F32 in = data[i]; + F32 tmp = in - mean; + sum_s += tmp * tmp; + } + return sum_s / len; +} + +// array sum +inline F32 array_sum_f32(const F32 *data, I32 len) +{ + if (len <= 0) { + return 0; + } + + I32 i = 0; + F32 sum_s = 0; + __m256 sum_v = _mm256_set1_ps(0); + for (i = 0; i < len - 7; i += 8) { + __m256 in = _mm256_loadu_ps(data + i); + sum_v = _mm256_add_ps(sum_v, in); + } + sum_s += _mm256_sum_ps(sum_v); + for (; i < len; i++) { + sum_s += data[i]; + } + return sum_s; +} + +// array mean +inline F32 array_mean_f32(const F32 *data, I32 len) +{ + if (len <= 0) { + return 0; + } + return array_sum_f32(data, len) / len; +} + +inline void array_add_f32(const F32 *inputA, const F32 *inputB, F32 *output, I32 len) +{ + I32 i = 0; + for (i = 0; i < len - 7; i += 8) { + __m256 a = _mm256_loadu_ps(inputA + i); + __m256 b = _mm256_loadu_ps(inputB + i); + __m256 c = _mm256_add_ps(a, b); + _mm256_storeu_ps(output + i, c); + } + + for (; i < len; i++) { + output[i] = inputA[i] + inputB[i]; + } +} + +inline void array_square_and_add_f32(const F32 *inputA, const F32 *inputB, F32 *output, I32 len) +{ + I32 i = 0; + for (i = 0; i < len - 7; i += 8) { + __m256 a = _mm256_loadu_ps(inputA + i); + __m256 b = _mm256_loadu_ps(inputB + i); + b = _mm256_mul_ps(b, b); + __m256 c = _mm256_add_ps(a, b); + _mm256_storeu_ps(output + i, c); + } + + for (; i < len; i++) { + output[i] = inputA[i] + inputB[i] * inputB[i]; + } +} + +#endif 
//CHEETAH_X86_FUNCTIONS_FP32_H
diff --git a/compute/tensor/src/cpu/x86/normalization.cpp b/compute/tensor/src/cpu/x86/normalization.cpp
new file mode 100644
index 00000000..aaf9f160
--- /dev/null
+++ b/compute/tensor/src/cpu/x86/normalization.cpp
@@ -0,0 +1,37 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "cpu/x86/tensor_computing_x86.h"
+#ifdef _USE_FP32
+#include "cpu/x86/fp32/tensor_computing_fp32.h"
+#endif
+
+EE layer_normalization_x86(
+    TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output)
+{
+    DataType idt = inputDesc.dt;
+    EE ret = SUCCESS;
+    switch (idt) {
+#ifdef _USE_FP32
+        case DT_F32: {
+            ret = layer_normalization_fp32(
+                inputDesc, (F32 *)input, (F32 *)alpha, (F32 *)beta, outputDesc, (F32 *)output);
+            break;
+        }
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/cpu/x86/pooling.cpp b/compute/tensor/src/cpu/x86/pooling.cpp
new file mode 100644
index 00000000..82e5b73e
--- /dev/null
+++ b/compute/tensor/src/cpu/x86/pooling.cpp
@@ -0,0 +1,41 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
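+
+// Data-type dispatcher: pooling_x86 switches on inputDesc.dt and forwards to
+// the FP32 kernel; any other type returns NOT_SUPPORTED. A minimal call sketch
+// (descriptors assumed to come from the usual output-size inference path):
+//     CHECK_STATUS(pooling_x86(inDesc, inPtr, p, nullptr, outDesc, outPtr));
+// The scale argument is ignored for FP32 and is only meaningful for quantized
+// data types.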
+ +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif + +EE pooling_x86(TensorDesc inputDesc, + const void *input, + PoolingParamSpec poolingParamSpec, + const void *scale, + TensorDesc outputDesc, + void *output) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + UNUSED(scale); + ret = pooling_fp32( + inputDesc, (const F32 *)input, poolingParamSpec, outputDesc, (F32 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/rnn.cpp b/compute/tensor/src/cpu/x86/rnn.cpp new file mode 100644 index 00000000..ccc97141 --- /dev/null +++ b/compute/tensor/src/cpu/x86/rnn.cpp @@ -0,0 +1,49 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif + +EE rnncell_x86(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch) +{ + EE ret = SUCCESS; + switch (xDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = rnncell_fp32(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, tmpBytes, + tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, output, arch); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/scale.cpp b/compute/tensor/src/cpu/x86/scale.cpp new file mode 100644 index 00000000..0043ea9a --- /dev/null +++ b/compute/tensor/src/cpu/x86/scale.cpp @@ -0,0 +1,51 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif + +EE scale_x86(TensorDesc inputDesc, + void *input, + void *alpha, + void *beta, + ScaleParamSpec p, + TensorDesc outputDesc, + void *output) +{ + UNUSED(outputDesc); + U32 length = tensorNumElements(inputDesc); + int axis = (p.axis + inputDesc.nDims) % inputDesc.nDims; + I32 in = inputDesc.dims[inputDesc.nDims - 1]; + I32 ic = inputDesc.dims[inputDesc.nDims - 1 - axis]; + I32 elements_per_channel = length / (in * ic); + if (inputDesc.df == DF_NCHWC8) { + axis = inputDesc.nDims; + } + EE ret = SUCCESS; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = scale_fp32((F32 *)input, axis, inputDesc.nDims, (F32 *)alpha, (F32 *)beta, in, ic, + elements_per_channel, (F32 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + + return ret; +} diff --git a/compute/tensor/src/cpu/x86/softmax.cpp b/compute/tensor/src/cpu/x86/softmax.cpp new file mode 100644 index 00000000..9c2a37f0 --- /dev/null +++ b/compute/tensor/src/cpu/x86/softmax.cpp @@ -0,0 +1,37 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif + +EE softmax_x86( + TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output) +{ + DataType idt = inputDesc.dt; + EE ret = SUCCESS; + switch (idt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = softmax_fp32(inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + + return ret; +} diff --git a/compute/tensor/src/cpu/x86/tensor_computing_x86.h b/compute/tensor/src/cpu/x86/tensor_computing_x86.h new file mode 100644 index 00000000..d65fbc60 --- /dev/null +++ b/compute/tensor/src/cpu/x86/tensor_computing_x86.h @@ -0,0 +1,212 @@ +// Copyright (C) 2019. 
Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef CHEETAH_TENSOR_COMPUTING_X86_H
+#define CHEETAH_TENSOR_COMPUTING_X86_H
+
+#include <vector>
+#include "error.h"
+#include "sys.h"
+#include "types.h"
+
+EE attention_mask_x86(TensorDesc inputDesc,
+    const void *input,
+    AttentionMaskParamSpec p,
+    TensorDesc outputDesc,
+    void *output);
+
+EE check_x86(TensorDesc inputDescA,
+    const void *inputA,
+    TensorDesc inputDescB,
+    const void *inputB,
+    CheckParamSpec p,
+    TensorDesc outputDesc,
+    void *output);
+
+EE clip_x86(TensorDesc inputDesc, void *input, ClipParamSpec p, TensorDesc outputDesc, void *output);
+
+EE convolution_infer_forward_algorithm_x86(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionPolicy policy,
+    ConvolutionForwardAlgorithm *algorithm,
+    DataType targetDataType);
+
+EE convolution_transform_filter_bytes_x86(TensorDesc filterDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    U32 *bytes);
+
+EE convolution_transform_filter_x86(TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    TensorDesc *ftmDesc,
+    void *filterTransformed);
+
+EE convolution_infer_forward_tmp_bytes_x86(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    U32 *bytes);
+
+EE convolution_x86(TensorDesc inputDesc,
+    void *input,
+    TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    TensorDesc scaleDesc,
+    const void *scale,
+    TensorDesc biasDesc,
+    const void *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec activationDesc,
+    Arch arch);
+
+EE depthwise_pointwise_convolution_transform_filter_x86(TensorDesc dwFilterDesc,
+    const void *dwFilter,
+    TensorDesc pwFilterDesc,
+    const void *pwFilter,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc *dwFtmDesc,
+    void *dwFilterTransformed,
+    TensorDesc *pwFtmDesc,
+    void *pwFilterTransformed);
+
+EE depthwise_pointwise_convolution_x86(TensorDesc inputDesc,
+    void *input,
+    TensorDesc dwFilterDesc,
+    const void *dwFilter,
+    TensorDesc pwFilterDesc,
+    const void *pwFilter,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc dwBiasDesc,
+    const void *dwBias,
+    TensorDesc pwBiasDesc,
+    const void *pwBias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec depthwiseActivationParamSpec,
+    ActivationParamSpec pointwiseActivationParamSpec,
+    Arch arch);
+
+EE depthwise_convolution_transform_filter_bytes_x86(
+    TensorDesc filterDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32 *bytes);
+
+EE depthwise_convolution_transform_filter_x86(TensorDesc filterDesc,
+    const void *filter,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc *ftmDesc,
+    void *filterTransformed);
+
+EE depthwise_convolution_infer_forward_tmp_bytes_x86(TensorDesc inputDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    U32 *bytes);
+
+EE depthwise_convolution_x86(TensorDesc inputDesc,
+    void *input,
+    TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc biasDesc,
+    const void *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec depthwiseActivationParamSpec,
+    Arch arch);
+
+EE eltwise_x86(DataType dataType,
+    std::vector<void *> input,
+    std::vector<U32> inputSize,
+    U32 num,
+    U32 len,
+    void *output,
+    EltwiseMode eltwiseMode);
+
+EE layer_normalization_x86(
+    TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output);
+
+EE rnncell_x86(TensorDesc xDesc,
+    const void *currentX,
+    const TensorDesc *filterDesc,
+    const void **filter,
+    const TensorDesc *biasDesc,
+    const void **bias,
+    void *state,
+    U32 tmpBytes,
+    void *tmp,
+    RNNParamSpec rnnParamSpec,
+    U32 batchStrideX,
+    U32 batchStrideH,
+    TensorDesc hDesc,
+    void *currentH,
+    Arch arch);
+
+EE scale_x86(TensorDesc inputDesc,
+    void *input,
+    void *alpha,
+    void *beta,
+    ScaleParamSpec p,
+    TensorDesc outputDesc,
+    void *output);
+
+EE pooling_x86(TensorDesc inputDesc,
+    const void *input,
+    PoolingParamSpec poolingParamSpec,
+    const void *scale,
+    TensorDesc outputDesc,
+    void *output);
+
+EE reshape_x86(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output);
+
+EE softmax_x86(
+    TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output);
+
+EE deconvolution_transform_filter_x86(TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionForwardAlgorithm algorithm,
+    TensorDesc *ftmDesc,
+    void *filterTransformed);
+
+EE deconvolution_x86(TensorDesc inputDesc,
+    void *input,
+    TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    TensorDesc scaleDesc,
+    const void *scale,
+    TensorDesc biasDesc,
+    const void *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec activationDesc,
+    Arch arch);
+
+#endif //CHEETAH_TENSOR_COMPUTING_X86_H
diff --git a/compute/tensor/src/cpu/x86/x86_functions.h b/compute/tensor/src/cpu/x86/x86_functions.h
new file mode 100644
index 00000000..1af44b7b
--- /dev/null
+++ b/compute/tensor/src/cpu/x86/x86_functions.h
@@ -0,0 +1,158 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_X86_FUNCTIONS +#define _H_X86_FUNCTIONS +#include "cpu/cpu_functions_template.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/x86_functions_fp32.h" +#endif + +inline void array_add_x86(DataType dt, const void *inputA, const void *inputB, void *output, I32 len) +{ + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + array_add_f32((const F32 *)inputA, (const F32 *)inputB, (F32 *)output, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +inline void array_square_and_add_x86( + DataType dt, const void *inputA, const void *inputB, void *output, I32 len) +{ + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + array_square_and_add_f32((const F32 *)inputA, (const F32 *)inputB, (F32 *)output, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +// array mean +inline F32 array_mean_x86(DataType dt, const void *data, I32 len) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + result = array_mean_f32((const F32 *)data, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +inline void array_power_x86(DataType dt, void *input, void *output, I32 len, F32 power) +{ + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + array_power_f32((F32 *)input, (F32 *)output, len, power); + break; +#endif + case DT_I32: + array_power_template((I32 *)input, (I32 *)output, len, power); + break; + case DT_U32: + array_power_template((U32 *)input, (U32 *)output, len, power); + break; + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +inline F32 array_sum_x86(DataType dt, const void *data, I32 len) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + result = array_sum_f32((const F32 *)data, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +inline void array_scale_x86( + DataType dt, const void *input, void *output, I32 len, F32 alpha, F32 beta) +{ + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + array_scale_f32((const F32 *)input, (F32 *)output, len, alpha, beta); + break; +#endif + case DT_I32: + array_scale_template((const I32 *)input, (I32 *)output, len, alpha, beta); + break; + case DT_U32: + array_scale_template((const U32 *)input, (U32 *)output, len, alpha, beta); + break; + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +// array var +inline F32 array_var_x86(DataType dt, const void *data, I32 len, F32 mean) +{ + F32 result = 0; + switch (dt) { +#ifdef 
_USE_FP32 + case DT_F32: + result = array_var_f32((const F32 *)data, len, mean); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +inline EE array_activation_x86( + DataType dt, void *input, U32 len, ActivationParamSpec activationDesc, void *output) +{ + EE result = SUCCESS; + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + result = activation_fp32((F32 *)input, len, activationDesc, (F32 *)output); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +#endif // _H_X86_FUNCTIONS diff --git a/compute/tensor/src/cpu/yolov3detectionoutput.cpp b/compute/tensor/src/cpu/yolov3detectionoutput.cpp new file mode 100644 index 00000000..6310f3cd --- /dev/null +++ b/compute/tensor/src/cpu/yolov3detectionoutput.cpp @@ -0,0 +1,274 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
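+
+// Detection-output post-processing runs in three stages per image:
+//   1. decode every grid cell / anchor into a BoxRect with a class score
+//      (sigmoid applied to the box and confidence channels),
+//   2. order the surviving candidates by score with the descending quicksort
+//      below (qsort_descent),
+//   3. greedy non-maximum suppression (nms_pickedboxes): a box is kept only if
+//      its IoU with every previously kept box is at most nms_threshold, where
+//      IoU = inter_area / (areaA + areaB - inter_area).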
+ +#include "error.h" +#include "cpu/tensor_computing_cpu.h" + +inline EE qsort_descent(std::vector &boxes, std::vector &scores, int left, int right) +{ + if (boxes.empty() || scores.empty()) { + return NOT_SUPPORTED; + } + + int i = left; + int j = right; + F32 temp = scores[(left + right) / 2]; + + while (i <= j) { + while (scores[i] > temp) { + i++; + } + while (scores[j] < temp) { + j--; + } + if (i <= j) { + std::swap(boxes[i], boxes[j]); + std::swap(scores[i], scores[j]); + i++; + j--; + } + } + + if (left < j) { + qsort_descent(boxes, scores, left, j); + } + if (i < right) { + qsort_descent(boxes, scores, i, right); + } + + return SUCCESS; +} + +inline F32 intersectionarea(BoxRect a, BoxRect b) +{ + if (a.xmin > b.xmax || a.xmax < b.xmin || a.ymin > b.ymax || a.ymax < b.ymin) { + return 0.f; + } + F32 inter_width = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin); + F32 inter_height = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin); + + return inter_width * inter_height; +} + +inline EE nms_pickedboxes(std::vector boxes, std::vector &picked, F32 nms_threshold) +{ + I64 n = boxes.size(); + + std::vector areas(n); + for (I64 i = 0; i < n; i++) { + BoxRect box = boxes[i]; + + F32 width = box.xmax - box.xmin; + F32 height = box.ymax - box.ymin; + + areas[i] = width * height; + } + for (I64 i = 0; i < n; i++) { + BoxRect a = boxes[i]; + int keep = 1; + for (int j = 0; j < (int)picked.size(); j++) { + BoxRect b = boxes[picked[j]]; + F32 inter_area = intersectionarea(a, b); + F32 union_area = areas[i] + areas[picked[j]] - inter_area; + + if (inter_area / union_area > nms_threshold) { + keep = 0; + } + } + if (keep) { + picked.push_back(i); + } + } + return SUCCESS; +} + +template +EE yolov3detectionoutput(std::vector input, + T *output, + std::vector inputDesc, + Yolov3DetectionOutputParamSpec yolov3DetectionOutputParamSpec, + Arch arch) +{ + U32 num_class = yolov3DetectionOutputParamSpec.num_class; + U32 num_box = yolov3DetectionOutputParamSpec.num_box; + F32 confidence_threshold = yolov3DetectionOutputParamSpec.confidence_threshold; + F32 nms_threshold = yolov3DetectionOutputParamSpec.nms_threshold; + std::vector biases; + for (int i = 0; i < 18; i++) { + if (yolov3DetectionOutputParamSpec.biases[i] == 0) { + break; + } + biases.push_back(yolov3DetectionOutputParamSpec.biases[i]); + } + std::vector anchors_scale; + for (int i = 0; i < 3; i++) { + if (yolov3DetectionOutputParamSpec.anchors_scale[i] == 0) { + break; + } + anchors_scale.push_back(yolov3DetectionOutputParamSpec.anchors_scale[i]); + } + std::vector mask; + for (int i = 0; i < (int)(yolov3DetectionOutputParamSpec.mask_group_num * 3); i++) { + mask.push_back(yolov3DetectionOutputParamSpec.mask[i]); + } + + std::vector all_boxrects; + std::vector all_boxscores; + I64 input_size = inputDesc.size(); + U32 info_per_box = 4 + 1 + num_class; + ActivationParamSpec activationdesc_sigmoid; + activationdesc_sigmoid.mode = ACTIVATION_SIGMOID; + TensorDesc tmpDesc = tensor1d(inputDesc[0].dt, 1); + for (I64 i = 0; i < input_size; i++) { + T *in = (T *)input[i]; + CHECK_REQUIREMENT(inputDesc[i].df == DF_NCHWC8 || inputDesc[i].df == DF_NCHW); + if (inputDesc[i].df == DF_NCHWC8) { + T *tmp = (T *)malloc(tensorNumBytes(inputDesc[0])); + memcpy(tmp, in, tensorNumBytes(inputDesc[0])); + CHECK_STATUS(transformToNCHW(inputDesc[0], tmp, inputDesc[0], in)); + free(tmp); + } + std::vector> allbox_boxrects; + std::vector> allbox_boxscores; + allbox_boxrects.resize(num_box); + allbox_boxscores.resize(num_box); + + U32 w = inputDesc[i].dims[0]; + 
+
+        U32 w = inputDesc[i].dims[0];
+        U32 h = inputDesc[i].dims[1];
+        U32 net_w = (U32)(anchors_scale[i] * w);
+        U32 net_h = (U32)(anchors_scale[i] * h);
+        I64 mask_offset = i * num_box;
+        U32 hw_stride = w * h;
+        U32 idx = 0;
+
+        for (U32 b = 0; b < num_box; b++) {
+            U32 biases_index = mask[b + mask_offset];
+            F32 bias_w = biases[biases_index * 2];
+            F32 bias_h = biases[biases_index * 2 + 1];
+            idx = hw_stride * b * info_per_box;
+            for (U32 nh = 0; nh < h; nh++) {
+                for (U32 nw = 0; nw < w; nw++) {
+                    T box_score = 0;
+                    CHECK_STATUS(activation_cpu(tmpDesc, &in[idx + 4 * hw_stride],
+                        activationdesc_sigmoid, tmpDesc, &box_score, arch));
+                    U32 label = 0;
+                    T class_score_max = in[idx + 5 * hw_stride];
+                    T class_score = 0;
+                    for (U32 c = 1; c < num_class; c++) {
+                        class_score = in[idx + (5 + c) * hw_stride];
+                        if (class_score > class_score_max) {
+                            label = c;
+                            class_score_max = class_score;
+                        }
+                    }
+                    CHECK_STATUS(activation_cpu(tmpDesc, &class_score_max, activationdesc_sigmoid,
+                        tmpDesc, &class_score, arch));
+                    F32 score_conf = static_cast<F32>(box_score * class_score);
+                    T cx, cy;
+                    cx = cy = 0;
+                    if (score_conf >= confidence_threshold) {
+                        CHECK_STATUS(activation_cpu(
+                            tmpDesc, &in[idx], activationdesc_sigmoid, tmpDesc, &cx, arch));
+                        F32 box_cx = static_cast<F32>((nw + cx) / w);
+                        CHECK_STATUS(activation_cpu(tmpDesc, &in[idx + 1 * hw_stride],
+                            activationdesc_sigmoid, tmpDesc, &cy, arch));
+                        F32 box_cy = static_cast<F32>((nh + cy) / h);
+                        F32 box_w = static_cast<F32>(exp(in[idx + 2 * hw_stride]) * bias_w / net_w);
+                        F32 box_h = static_cast<F32>(exp(in[idx + 3 * hw_stride]) * bias_h / net_h);
+
+                        F32 box_xmin = box_cx - box_w * 0.5;
+                        F32 box_ymin = box_cy - box_h * 0.5;
+                        F32 box_xmax = box_cx + box_w * 0.5;
+                        F32 box_ymax = box_cy + box_h * 0.5;
+                        BoxRect box = {box_xmin, box_ymin, box_xmax, box_ymax, label};
+                        allbox_boxrects[b].push_back(box);
+                        allbox_boxscores[b].push_back(score_conf);
+                    }
+                    idx++;
+                }
+            }
+        }
+
+        for (U32 b = 0; b < num_box; b++) {
+            all_boxrects.insert(
+                all_boxrects.end(), allbox_boxrects[b].begin(), allbox_boxrects[b].end());
+            all_boxscores.insert(
+                all_boxscores.end(), allbox_boxscores[b].begin(), allbox_boxscores[b].end());
+        }
+    }
+    // sort boxes
+    qsort_descent(all_boxrects, all_boxscores, 0, static_cast<int>(all_boxscores.size() - 1));
+    // apply nms
+    std::vector<I64> picked;
+    nms_pickedboxes(all_boxrects, picked, nms_threshold);
+
+    std::vector<BoxRect> boxrects;
+    std::vector<F32> boxscores;
+    for (I64 p = 0; p < (I64)picked.size(); p++) {
+        I64 picked_box = picked[p];
+        boxrects.push_back(all_boxrects[picked_box]);
+        boxscores.push_back(all_boxscores[picked_box]);
+    }
+
+    U32 num_detected = static_cast<U32>(boxrects.size());
+    // the first box contains the number of available boxes
+    output[0] = num_detected;
+    output[1] = output[2] = output[3] = output[4] = output[5] = 0;
+    for (U32 i = 0; i < num_detected; i++) {
+        BoxRect b = boxrects[i];
+        F32 score = boxscores[i];
+
+        output[(i + 1) * 6] = b.label + 1;
+        output[(i + 1) * 6 + 1] = score;
+        output[(i + 1) * 6 + 2] = b.xmin;
+        output[(i + 1) * 6 + 3] = b.ymin;
+        output[(i + 1) * 6 + 4] = b.xmax;
+        output[(i + 1) * 6 + 5] = b.ymax;
+    }
+    return SUCCESS;
+}
+
+EE yolov3detectionoutput_cpu(std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    Yolov3DetectionOutputParamSpec yolov3DetectionOutputParamSpec,
+    TensorDesc outputDesc,
+    void *output,
+    Arch arch)
+{
+    UNUSED(outputDesc);
+    if (nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    EE ret = SUCCESS;
+    switch (inputDesc[0].dt) {
+#ifdef _USE_FP32
+        case DT_F32: {
+            ret = yolov3detectionoutput(
+                input, (F32 *)output, inputDesc,
yolov3DetectionOutputParamSpec, arch); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = yolov3detectionoutput( + input, (F16 *)output, inputDesc, yolov3DetectionOutputParamSpec, arch); + break; + } +#endif + default: { + ret = NOT_SUPPORTED; + break; + } + } + return ret; +} diff --git a/compute/tensor/src/deconvolution.cpp b/compute/tensor/src/deconvolution.cpp new file mode 100644 index 00000000..d011de45 --- /dev/null +++ b/compute/tensor/src/deconvolution.cpp @@ -0,0 +1,278 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif +#include "cpu/tensor_computing_cpu.h" + +inline EE deconvolution_infer_output_size_cpu(TensorDesc inputDesc, + TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc *outputDesc, + DataType targetDataType) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt; + DataFormat idf, fdf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + + CHECK_REQUIREMENT(1 == fn || ic == fn); + + if (fc % 8 != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + + if (fh < 1 || fw < 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + oh = fh + strideH * (ih - 1) - paddingT - paddingB; + ow = fw + strideW * (iw - 1) - paddingL - paddingR; + + *outputDesc = tensor4df(targetDataType, DF_NCHWC8, in, fc, oh, ow); + return SUCCESS; +} + +EE deconvolution_infer_output_size(Tensor *inputTensor, + Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + Tensor *outputTensor, + DataType targetDataType, + ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef 
_USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = deconvolution_infer_output_size_mali( + inputDesc, filterDesc, convParamSpec, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = deconvolution_infer_output_size_cpu( + inputDesc, filterDesc, convParamSpec, &outputDesc, targetDataType); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE deconvolution_infer_forward_algorithm(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + ConvolutionForwardAlgorithm *algorithm, + DataType targetDataType, + ActivationParamSpec activationDesc, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = SUCCESS; +#endif +#if defined(_USE_NEON) || defined(_USE_X86) + } else if (IS_X86_AVX2(arch) || IS_ARM(arch)) { + ret = deconvolution_infer_forward_algorithm_cpu(inputDesc, filterDesc, outputDesc, + convParamSpec, policy, algorithm, targetDataType, arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = deconvolution_infer_forward_algorithm_mali(((MaliPara_t)(archInfo->archPara))->handle, + inputDesc, filterDesc, convParamSpec, outputDesc, policy, activationDesc.mode, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } + return ret; +} + +EE deconvolution_transform_filter_bytes(Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc filterDesc = filterTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + *bytes = tensorNumBytes(filterDesc); + ret = SUCCESS; +#endif +#if defined(_USE_NEON) || defined(_USE_X86) + } else if (IS_X86_AVX2(arch) || IS_ARM(arch)) { + ret = deconvolution_transform_filter_bytes_cpu( + filterDesc, convParamSpec, algorithm, bytes, arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = deconvolution_transform_filter_bytes_mali(filterDesc, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, + ((MaliPara_t)(archInfo->archPara))->gclmemFilterDesc, bytes); +#endif + } + return ret; +} + +EE deconvolution_transform_filter(Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + Tensor tmpTensor, + Tensor *ftmTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc filterDesc = filterTensor.get_desc(); + void *filter = get_ptr_from_tensor(filterTensor, arch); + TensorDesc ftmDesc = ftmTensor->get_desc(); + void *filterTransformed = get_ptr_from_tensor(*ftmTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + UNI_memcpy(filterTransformed, filter, tensorNumBytes(filterDesc)); + ftmDesc = filterDesc; + ret = SUCCESS; +#endif +#if defined(_USE_NEON) || defined(_USE_X86) + } else if (IS_X86_AVX2(arch) || IS_ARM(arch)) { + ret = deconvolution_transform_filter_cpu( + filterDesc, filter, convParamSpec, algorithm, &ftmDesc, filterTransformed, arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + void *tmp = get_ptr_from_tensor(tmpTensor, 
arch); + ret = deconvolution_transform_filter_mali(((MaliPara_t)(archInfo->archPara))->handle, + filterDesc, (GCLMem_t)filter, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, + (GCLMem_t)tmp, &ftmDesc, (GCLMem_t)filterTransformed); +#endif + } + ftmTensor->resize(ftmDesc); + return ret; +} + +EE deconvolution_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = SUCCESS; +#endif +#if defined(_USE_NEON) || defined(_USE_X86) + } else if (IS_X86_AVX2(arch) || IS_ARM(arch)) { + ret = deconvolution_infer_forward_tmp_bytes_cpu( + inputDesc, filterDesc, outputDesc, convParamSpec, algorithm, bytes, archInfo->arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = deconvolution_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, outputDesc, + convParamSpec, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, bytes); +#endif + } + return ret; +} + +EE deconvolution(Tensor inputTensor, + Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + void *scale, + Tensor biasTensor, + Tensor tmpTensor, + Tensor outputTensor, + ActivationParamSpec activationDesc, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc filterDesc = filterTensor.get_desc(); + void *filter = get_ptr_from_tensor(filterTensor, arch); + TensorDesc biasDesc = biasTensor.get_desc(); + void *bias = get_ptr_from_tensor(biasTensor, arch); + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + TensorDesc scaleDesc = filterDesc; + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = deconvolution_general(inputDesc, input, filterDesc, filter, convParamSpec, scaleDesc, + scale, biasDesc, bias, outputDesc, output, activationDesc); +#endif +#if defined(_USE_NEON) || defined(_USE_X86) + } else if (IS_X86_AVX2(arch) || IS_ARM(arch)) { + ret = deconvolution_cpu(inputDesc, input, filterDesc, filter, convParamSpec, algorithm, + scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc, + archInfo->arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = deconvolution_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, filterDesc, (GCLMem_t)filter, convParamSpec, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, scaleDesc, (GCLMem_t)scale, + biasDesc, (GCLMem_t)bias, tmpBytes, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output, + activationDesc.mode); +#endif + } + return ret; +} diff --git a/compute/tensor/src/depth2space.cpp b/compute/tensor/src/depth2space.cpp new file mode 100644 index 00000000..a11bbdb9 --- /dev/null +++ b/compute/tensor/src/depth2space.cpp @@ -0,0 +1,79 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE depth2space_infer_output_size( + Tensor *inputTensor, Depth2SpaceParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + TensorDesc inputDesc = inputTensor->get_desc(); + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = depth2space_infer_output_size_mali( + inputDesc, p, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } + outputTensor->resize(outputDesc); + return ret; +} + +EE depth2space_infer_forward_tmp_bytes( + Tensor inputTensor, Depth2SpaceParamSpec p, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) +{ + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + ret = depth2space_infer_tmpBuf_size_mali(inputDesc, p, outputDesc, bytes); +#endif + } + return ret; +} + +EE depth2space(Tensor inputTensor, + Depth2SpaceParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + ret = depth2space_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, p, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/depthwise_convolution.cpp b/compute/tensor/src/depthwise_convolution.cpp new file mode 100644 index 00000000..60213b13 --- /dev/null +++ b/compute/tensor/src/depthwise_convolution.cpp @@ -0,0 +1,306 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#include "cpu/tensor_computing_cpu.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif + +inline EE depthwise_convolution_infer_output_size_cpu(TensorDesc inputDesc, + TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc *outputDesc, + DataType targetDataType) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt; + DataFormat idf, fdf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + if (fh < 1 || fw < 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + U32 fhDilated = (fh - 1) * dilateH + 1; + U32 fwDilated = (fw - 1) * dilateW + 1; + + CHECK_REQUIREMENT(fdf == DF_NCHW || fdf == DF_NCHWC8); + oh = (ih + paddingT + paddingB - fhDilated) / strideH + 1; + ow = (iw + paddingL + paddingR - fwDilated) / strideW + 1; + + if (ic % 8 != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + + *outputDesc = tensor4df(targetDataType, DF_NCHWC8, in, ic, oh, ow); + return SUCCESS; +} + +EE depthwise_convolution_infer_output_size(Tensor *inputTensor, + Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + Tensor *outputTensor, + DataType targetDataType, + ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = depthwise_convolution_infer_output_size_mali( + inputDesc, filterDesc, convParamSpec, 
&outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = depthwise_convolution_infer_output_size_cpu( + inputDesc, filterDesc, convParamSpec, &outputDesc, targetDataType); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE depthwise_convolution_infer_forward_algorithm(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + DepthwiseConvolutionForwardAlgorithm *algorithm, + DataType targetDataType, + ActivationParamSpec depthwiseActivationParamSpec, + ArchInfo_t archInfo) +{ + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + *algorithm = DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT; + ret = SUCCESS; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + *algorithm = DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT; + ret = SUCCESS; +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + ret = depthwise_convolution_infer_forward_algorithm_mali( + ((MaliPara_t)(archInfo->archPara))->handle, inputDesc, filterDesc, outputDesc, + convParamSpec, policy, depthwiseActivationParamSpec.mode, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } + return ret; +} + +EE depthwise_convolution_transform_filter_bytes(Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc filterDesc = filterTensor.get_desc(); + + UNUSED(convParamSpec); + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + *bytes = tensorNumBytes(filterDesc); + ret = SUCCESS; +#endif +#if defined(_USE_X86) || defined(_USE_NEON) + } else if (IS_CPU(arch)) { + ret = depthwise_convolution_transform_filter_bytes_cpu(filterDesc, algorithm, bytes); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = depthwise_convolution_transform_filter_bytes_mali(filterDesc, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, + ((MaliPara_t)(archInfo->archPara))->gclmemFilterDesc, bytes); +#endif + } + return ret; +} + +EE depthwise_convolution_transform_filter(Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + Tensor *ftmTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc filterDesc = filterTensor.get_desc(); + void *filter = get_ptr_from_tensor(filterTensor, arch); + TensorDesc ftmDesc = ftmTensor->get_desc(); + void *filterTransformed = get_ptr_from_tensor(*ftmTensor, arch); + + UNUSED(convParamSpec); + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + UNI_memcpy(filterTransformed, filter, tensorNumBytes(filterDesc)); + ftmDesc = filterDesc; + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = depthwise_convolution_transform_filter_x86( + filterDesc, filter, algorithm, &ftmDesc, filterTransformed); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = depthwise_convolution_transform_filter_arm( + filterDesc, filter, algorithm, &ftmDesc, filterTransformed); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = 
depthwise_convolution_transform_filter_mali(((MaliPara_t)(archInfo->archPara))->handle, + filterDesc, (GCLMem_t)filter, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, + &ftmDesc, (GCLMem_t)filterTransformed); +#endif + } + ftmTensor->resize(ftmDesc); + return ret; +} + +EE depthwise_convolution_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + *bytes = 0; + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = depthwise_convolution_infer_forward_tmp_bytes_x86( + inputDesc, outputDesc, convParamSpec, algorithm, bytes); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = depthwise_convolution_infer_forward_tmp_bytes_arm( + inputDesc, filterDesc, outputDesc, convParamSpec, algorithm, bytes); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = depthwise_convolution_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, outputDesc, + convParamSpec, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, bytes); + ret = SUCCESS; +#endif + } + return ret; +} + +EE depthwise_convolution(Tensor inputTensor, + Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + Tensor biasTensor, + Tensor tmpTensor, + Tensor outputTensor, + ActivationParamSpec depthwiseActivationParamSpec, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc filterDesc = filterTensor.get_desc(); + void *filter = get_ptr_from_tensor(filterTensor, arch); + TensorDesc biasDesc = biasTensor.get_desc(); + void *bias = get_ptr_from_tensor(biasTensor, arch); + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = depthwise_convolution_general(inputDesc, input, filterDesc, filter, convParamSpec, + biasDesc, bias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = depthwise_convolution_x86(inputDesc, input, filterDesc, filter, convParamSpec, + algorithm, biasDesc, bias, tmpBytes, tmp, outputDesc, output, + depthwiseActivationParamSpec, archInfo->arch); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = depthwise_convolution_arm(inputDesc, input, filterDesc, filter, convParamSpec, + algorithm, biasDesc, bias, tmpBytes, tmp, outputDesc, output, + depthwiseActivationParamSpec, archInfo->arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = depthwise_convolution_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, filterDesc, (GCLMem_t)filter, convParamSpec, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, biasDesc, (GCLMem_t)bias, tmpBytes, + (GCLMem_t)tmp, outputDesc, (GCLMem_t)output, depthwiseActivationParamSpec.mode); +#endif + } + return ret; +} diff --git a/compute/tensor/src/depthwise_pointwise_convolution.cpp 
b/compute/tensor/src/depthwise_pointwise_convolution.cpp new file mode 100644 index 00000000..83617cc8 --- /dev/null +++ b/compute/tensor/src/depthwise_pointwise_convolution.cpp @@ -0,0 +1,355 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif + +inline EE depthwise_pointwise_convolution_infer_output_size_cpu(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc *outputDesc, + DataType targetDataType) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, fdt2; + DataFormat idf, fdf, fdf2; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 fn2, fc2, fh2, fw2; + U32 oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(pwFilterDesc, &fdt2, &fdf2, &fn2, &fc2, &fh2, &fw2)); + if (fh < 1 || fw < 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + U32 fhDilated = (fh - 1) * dilateH + 1; + U32 fwDilated = (fw - 1) * dilateW + 1; + + oh = (ih + paddingT + paddingB - fhDilated) / strideH + 1; + ow = (iw + paddingL + paddingR - fwDilated) / strideW + 1; + + if (fn2 % 8 != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + + *outputDesc = tensor4df(targetDataType, DF_NCHWC8, in, fn2, oh, ow); + return SUCCESS; +} + +EE depthwise_pointwise_convolution_infer_output_size(Tensor *inputTensor, + Tensor dwFilterTensor, + Tensor pwFilterTensor, + ConvolutionParamSpec convParamSpec, + Tensor *outputTensor, + DataType targetDataType, + ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = 
outputTensor->get_desc(); + TensorDesc dwFilterDesc = dwFilterTensor.get_desc(); + TensorDesc pwFilterDesc = pwFilterTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = depthwise_pointwise_convolution_infer_output_size_mali(inputDesc, dwFilterDesc, + pwFilterDesc, convParamSpec, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = depthwise_pointwise_convolution_infer_output_size_cpu( + inputDesc, dwFilterDesc, pwFilterDesc, convParamSpec, &outputDesc, targetDataType); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE depthwise_pointwise_convolution_infer_forward_algorithm(Tensor inputTensor, + Tensor dwFilterTensor, + Tensor pwFilterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + DepthwiseConvolutionForwardAlgorithm *algorithm, + DataType targetDataType, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + TensorDesc dwFilterDesc = dwFilterTensor.get_desc(); + TensorDesc pwFilterDesc = pwFilterTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + *algorithm = DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT; + ret = SUCCESS; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = depthwise_pointwise_convolution_infer_forward_algorithm_arm(inputDesc, dwFilterDesc, + pwFilterDesc, outputDesc, convParamSpec, policy, algorithm, targetDataType); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = depthwise_pointwise_convolution_infer_forward_algorithm_mali( + ((MaliPara_t)(archInfo->archPara))->handle, inputDesc, dwFilterDesc, pwFilterDesc, + outputDesc, convParamSpec, policy, depthwiseActivationParamSpec.mode, + pointwiseActivationParamSpec.mode, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } + return ret; +} + +EE depthwise_pointwise_convolution_transform_filter_bytes(Tensor dwFilterTensor, + Tensor pwFilterTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *dwBytes, + U32 *pwBytes, + ArchInfo_t archInfo) +{ + TensorDesc dwFilterDesc = dwFilterTensor.get_desc(); + TensorDesc pwFilterDesc = pwFilterTensor.get_desc(); + UNUSED(convParamSpec); + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + *dwBytes = tensorNumBytes(dwFilterDesc); + *pwBytes = tensorNumBytes(pwFilterDesc); + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + *dwBytes = tensorNumBytes(dwFilterDesc) + 32; + *pwBytes = tensorNumBytes(pwFilterDesc) + 32; + ret = SUCCESS; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + *dwBytes = tensorNumBytes(dwFilterDesc) + 32; + *pwBytes = tensorNumBytes(pwFilterDesc) + 32; + ret = SUCCESS; +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + GCLMemDesc_t gclmemFilterDesc = ((MaliPara_t)(archInfo->archPara))->gclmemFilterDesc; + GCLMemDesc_t gclmemDwFilterDesc = &gclmemFilterDesc[0]; + GCLMemDesc_t gclmemPwFilterDesc = 
&gclmemFilterDesc[1]; + ret = depthwise_pointwise_convolution_transform_filter_bytes_mali(dwFilterDesc, + pwFilterDesc, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, gclmemDwFilterDesc, + gclmemPwFilterDesc, dwBytes); + *pwBytes = 0; +#endif + } + return ret; +} + +EE depthwise_pointwise_convolution_transform_filter(Tensor dwFilterTensor, + Tensor pwFilterTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + Tensor *dwFtm, + Tensor *pwFtm, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc dwFilterDesc = dwFilterTensor.get_desc(); + void *dwFilter = get_ptr_from_tensor(dwFilterTensor, arch); + TensorDesc pwFilterDesc = pwFilterTensor.get_desc(); + void *pwFilter = get_ptr_from_tensor(pwFilterTensor, arch); + TensorDesc dwFtmDesc = dwFtm->get_desc(); + void *dwFilterTransformed = get_ptr_from_tensor(*dwFtm, arch); + TensorDesc pwFtmDesc = pwFtm->get_desc(); + void *pwFilterTransformed = get_ptr_from_tensor(*pwFtm, arch); + UNUSED(convParamSpec); + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + UNI_memcpy(dwFilterTransformed, dwFilter, tensorNumBytes(dwFilterDesc)); + dwFtmDesc = dwFilterDesc; + UNI_memcpy(pwFilterTransformed, pwFilter, tensorNumBytes(pwFilterDesc)); + pwFtmDesc = pwFilterDesc; + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = depthwise_pointwise_convolution_transform_filter_x86(dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, algorithm, &dwFtmDesc, dwFilterTransformed, &pwFtmDesc, + pwFilterTransformed); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = depthwise_pointwise_convolution_transform_filter_arm(dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, algorithm, &dwFtmDesc, dwFilterTransformed, + &pwFtmDesc, pwFilterTransformed); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = depthwise_pointwise_convolution_transform_filter_mali( + ((MaliPara_t)(archInfo->archPara))->handle, dwFilterDesc, pwFilterDesc, + (GCLMem_t)dwFilter, (GCLMem_t)pwFilter, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, &dwFtmDesc, &pwFtmDesc, + (GCLMem_t)dwFilterTransformed, (GCLMem_t)pwFilterTransformed); +#endif + } + dwFtm->resize(dwFtmDesc); + pwFtm->resize(pwFtmDesc); + return ret; +} + +EE depthwise_pointwise_convolution_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor dwFilterTensor, + Tensor pwFilterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + TensorDesc dwFilterDesc = dwFilterTensor.get_desc(); + TensorDesc pwFilterDesc = pwFilterTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = depthwise_pointwise_convolution_infer_forward_tmp_bytes_general( + inputDesc, dwFilterDesc, pwFilterDesc, outputDesc, convParamSpec, algorithm, bytes); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = depthwise_convolution_infer_forward_tmp_bytes_x86( + inputDesc, outputDesc, convParamSpec, algorithm, bytes); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = depthwise_pointwise_convolution_infer_forward_tmp_bytes_arm( + inputDesc, dwFilterDesc, pwFilterDesc, outputDesc, convParamSpec, algorithm, bytes); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = 
depthwise_pointwise_convolution_infer_forward_tmp_bytes_mali(inputDesc, dwFilterDesc, + pwFilterDesc, outputDesc, convParamSpec, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, bytes); +#endif + } + return ret; +} + +EE depthwise_pointwise_convolution(Tensor inputTensor, + Tensor dwFilterTensor, + Tensor pwFilterTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + Tensor dwBiasTensor, + Tensor pwBiasTensor, + Tensor tmpTensor, + Tensor outputTensor, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + TensorDesc dwFilterDesc = dwFilterTensor.get_desc(); + void *dwFilter = get_ptr_from_tensor(dwFilterTensor, arch); + TensorDesc pwFilterDesc = pwFilterTensor.get_desc(); + void *pwFilter = get_ptr_from_tensor(pwFilterTensor, arch); + TensorDesc dwBiasDesc = dwBiasTensor.get_desc(); + void *dwBias = get_ptr_from_tensor(dwBiasTensor, arch); + TensorDesc pwBiasDesc = pwBiasTensor.get_desc(); + void *pwBias = get_ptr_from_tensor(pwBiasTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = depthwise_pointwise_convolution_general(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, tmpBytes, + tmp, outputDesc, output, depthwiseActivationParamSpec, pointwiseActivationParamSpec); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = depthwise_pointwise_convolution_x86(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, algorithm, dwBiasDesc, dwBias, pwBiasDesc, + pwBias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, archInfo->arch); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = depthwise_pointwise_convolution_arm(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, algorithm, dwBiasDesc, dwBias, pwBiasDesc, + pwBias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, archInfo->arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = depthwise_pointwise_convolution_mali(((MaliPara_t)(archInfo->archPara))->handle, + inputDesc, (GCLMem_t)input, dwFilterDesc, pwFilterDesc, (GCLMem_t)dwFilter, + (GCLMem_t)pwFilter, convParamSpec, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, + dwBiasDesc, pwBiasDesc, (GCLMem_t)dwBias, (GCLMem_t)pwBias, tmpBytes, (GCLMem_t)tmp, + outputDesc, (GCLMem_t)output, depthwiseActivationParamSpec.mode, + pointwiseActivationParamSpec.mode); +#endif + } + return ret; +} diff --git a/compute/tensor/src/detectionoutput.cpp b/compute/tensor/src/detectionoutput.cpp new file mode 100644 index 00000000..c20344ba --- /dev/null +++ b/compute/tensor/src/detectionoutput.cpp @@ -0,0 +1,85 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "tensor_computing.h"
+#ifdef _USE_CPU
+#include "cpu/tensor_computing_cpu.h"
+#endif
+
+inline EE detectionoutput_infer_output_size_cpu(std::vector<TensorDesc> inputDesc,
+    DetectionOutputParamSpec detectionOutputParamSpec,
+    TensorDesc *outputDesc)
+{
+    if (inputDesc.size() != 3) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+    if (nullptr == outputDesc) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType idt0, idt2;
+    DataFormat idf0, idf2;
+    U32 ih0, iw0;
+    U32 in2, ic2, ilens2;
+    // loc
+    CHECK_STATUS(tensor2dGet(inputDesc[0], &idt0, &idf0, &ih0, &iw0));
+    // priorbox
+    CHECK_STATUS(tensor3dGet(inputDesc[2], &idt2, &idf2, &in2, &ic2, &ilens2));
+    CHECK_REQUIREMENT(iw0 == ilens2);
+    // output size
+    U32 oh, ow;
+    // oh = the first row for saving the number of available boxes (1) + the maximum number of detected boxes (keep_top_k)
+    U32 num_detected_max = detectionOutputParamSpec.keep_top_k;
+    oh = 1 + num_detected_max;
+    // Each row is a 6-dimensional vector, which stores [label, confidence, xmin, ymin, xmax, ymax] -> 6
+    // The first row is [ number of available boxes, 0, 0, 0, 0, 0 ]
+    ow = 6;
+    *outputDesc = tensor2df(idt0, idf2, oh, ow);
+    return SUCCESS;
+}
+
+EE detectionoutput_infer_output_size(std::vector<Tensor *> inputTensor,
+    DetectionOutputParamSpec detectionOutputParamSpec,
+    Tensor *outputTensor,
+    ArchInfo_t archInfo)
+{
+    UNUSED(archInfo);
+    if (outputTensor == nullptr) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    std::vector<TensorDesc> inputDesc = get_desc_from_tensor_ptrs(inputTensor);
+    TensorDesc outputDesc = outputTensor->get_desc();
+    CHECK_STATUS(
+        detectionoutput_infer_output_size_cpu(inputDesc, detectionOutputParamSpec, &outputDesc));
+    outputTensor->resize(outputDesc);
+    return SUCCESS;
+}
+
+EE detectionoutput(std::vector<Tensor> inputTensor,
+    DetectionOutputParamSpec detectionOutputParamSpec,
+    Tensor outputTensor,
+    ArchInfo_t archInfo)
+{
+    auto arch = archInfo->arch;
+    std::vector<TensorDesc> inputDesc = get_desc_from_tensors(inputTensor);
+    std::vector<void *> input = get_data_from_tensors(inputTensor, arch);
+    TensorDesc outputDesc = outputTensor.get_desc();
+    void *output = get_ptr_from_tensor(outputTensor, arch);
+
+    EE ret = NOT_SUPPORTED;
+    if (IS_CPU(arch)) {
+#ifdef _USE_CPU
+        ret = detectionoutput_cpu(inputDesc, input, detectionOutputParamSpec, outputDesc, output);
+#endif
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/eltwise.cpp b/compute/tensor/src/eltwise.cpp
new file mode 100644
index 00000000..329781e4
--- /dev/null
+++ b/compute/tensor/src/eltwise.cpp
@@ -0,0 +1,176 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "tensor_computing.h"
+#if defined(_USE_GENERAL) || defined(_USE_X86) || defined(_USE_NEON)
+#include "cpu/tensor_computing_cpu.h"
+#endif
+#ifdef _USE_MALI
+#include "gpu/mali/tensor_computing_mali.h"
+#endif
+
+// [1, 10, 10] + [1, 10, 10] = [1, 10, 10]
+// [1, 10, 1] + [1, 1, 10] = [1, 10, 10]
+// [1, 20, 10] + [10] = [1, 20, 10]
+inline EE eltwise_infer_output_size_cpu(std::vector<TensorDesc> inputDesc, TensorDesc *outputDesc)
+{
+    if (nullptr == outputDesc) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    U32 num = inputDesc.size();
+    if (num <= 0) {
+        return NOT_MATCH;
+    }
+
+    if (num == 1) {
+        *outputDesc = inputDesc[0];
+        return SUCCESS;
+    }
+
+    U32 arrayDimMax = 0;
+    U32 minDims = inputDesc[0].nDims;
+    for (U32 i = 1; i < num; i++) {
+        if (inputDesc[i].nDims > inputDesc[arrayDimMax].nDims) {
+            arrayDimMax = i;
+        }
+        if (inputDesc[i].nDims < minDims) {
+            minDims = inputDesc[i].nDims;
+        }
+    }
+    U32 nchwc8Count = 0;
+    for (U32 i = 0; i < num; i++) {
+        if (inputDesc[i].df == DF_NCHWC8) {
+            nchwc8Count++;
+            // Output from 1D-conv + 3D tensors
+            if (inputDesc[i].dims[0] == 1 && minDims == 3) {
+                inputDesc[i] = tensor3df(inputDesc[i].dt, DF_NCHW,
+                    inputDesc[i].dims[3], inputDesc[i].dims[2], inputDesc[i].dims[1]);
+            }
+        }
+    }
+
+    U32 dim = inputDesc[arrayDimMax].nDims;
+    *outputDesc = inputDesc[arrayDimMax];
+
+    if (nchwc8Count > 0 && nchwc8Count != num) {
+        outputDesc->df = DF_NCHW;
+    }
+
+    for (U32 i = 0; i < dim; i++) {
+        for (U32 j = 0; j < num; j++) {
+            if (inputDesc[j].nDims > i) {
+                outputDesc->dims[i] = UNI_MAX(outputDesc->dims[i], inputDesc[j].dims[i]);
+            }
+        }
+    }
+    return SUCCESS;
+}
+
+EE eltwise_infer_output_size(
+    std::vector<Tensor *> inputTensor, Tensor *outputTensor, ArchInfo_t archInfo)
+{
+    if (outputTensor == nullptr) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    std::vector<TensorDesc> inputDesc = get_desc_from_tensor_ptrs(inputTensor);
+    TensorDesc outputDesc = outputTensor->get_desc();
+    EE ret = NOT_SUPPORTED;
+    if (IS_MALI_GPU(archInfo->arch)) {
+#ifdef _USE_MALI
+        std::vector<GCLMemDesc> gclmemInputDescs;
+        for (auto p : inputTensor) {
+            gclmemInputDescs.push_back(ocl_get_desc(*p));
+        }
+        GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor);
+        ret = eltwise_infer_output_size_mali(
+            inputDesc, &outputDesc, gclmemInputDescs.data(), &gclmemOutputDesc);
+        for (U32 i = 0; i < inputTensor.size(); i++) {
+            ocl_set_desc(inputTensor[i], gclmemInputDescs[i]);
+        }
+        ocl_set_desc(outputTensor, gclmemOutputDesc);
+#endif
+    } else {
+        ret = eltwise_infer_output_size_cpu(inputDesc, &outputDesc);
+    }
+    outputTensor->resize(outputDesc);
+    return ret;
+}
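+
+// Scratch sizing for eltwise: temporary space is needed only when the inputs mix NCHWC8
+// and NCHW layouts, in which case each NCHWC8 input is first unpacked into the tmp buffer;
+// if all inputs share one layout, no scratch memory is required.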
+
+EE eltwise_infer_forward_tmp_bytes(
+    std::vector<Tensor> inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo)
+{
+    std::vector<TensorDesc> inputDesc = get_desc_from_tensors(inputTensor);
+    UNUSED(outputTensor);
+
+    *bytes = 0;
+    U32 nchwc8Count = 0;
+    for (U32 i = 0; i < inputDesc.size(); i++) {
+        if (inputDesc[i].df == DF_NCHWC8) {
+            nchwc8Count++;
+            *bytes += tensorNumBytes(inputDesc[i]);
+        }
+    }
+    if (nchwc8Count == inputDesc.size() || nchwc8Count == 0) {
+        *bytes = 0;
+    }
+    return SUCCESS;
+}
+
+#ifdef _USE_INT8
+inline void eltwise_process_int8(F32 scale, U8 **tmp, TensorDesc *desc, U8 **input)
+{
+    INT8 *inQ = (INT8 *)(*input);
+    dequantize_int8_to_fp16(tensorNumElements(*desc), inQ, scale, (F16 *)*tmp);
+    desc->dt = DT_F16;
+    *input = *tmp;
+    *tmp += tensorNumElements(*desc);
+}
+#endif
+
+EE eltwise(std::vector<Tensor> inputTensor,
+    EltwiseParamSpec eltwiseDesc,
+    Tensor tmpTensor,
+    Tensor outputTensor,
+    ArchInfo_t archInfo)
+{
+    auto arch = archInfo->arch;
+    std::vector<TensorDesc> inputDesc = get_desc_from_tensors(inputTensor);
+    std::vector<void *> input = get_data_from_tensors(inputTensor, arch);
+    U32 tmpBytes = tmpTensor.bytes();
+    void *tmp = get_ptr_from_tensor(tmpTensor, arch);
+    TensorDesc outputDesc = outputTensor.get_desc();
+    void *output = get_ptr_from_tensor(outputTensor, arch);
+#ifdef _USE_INT8
+    if (!IS_MALI_GPU(arch)) {
+        for (U32 i = 0; i < inputTensor.size(); i++) {
+            if (inputDesc[i].dt == DT_I8) {
+                F32 scale = inputTensor[i].get_scale();
+                eltwise_process_int8(scale, (U8 **)&tmp, &inputDesc[i], (U8 **)&input[i]);
+            }
+        }
+    }
+#endif
+
+    EE ret = NOT_SUPPORTED;
+    if (IS_CPU(arch)) {
+#ifdef _USE_CPU
+        ret = eltwise_cpu(inputDesc, input, eltwiseDesc, tmpBytes, tmp, outputDesc, output, arch);
+#endif
+#ifdef _USE_MALI
+    } else if (IS_MALI_GPU(arch)) {
+        ret = eltwise_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, input,
+            eltwiseDesc, outputDesc, (GCLMem_t)output);
+#endif
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/embedding.cpp b/compute/tensor/src/embedding.cpp
new file mode 100644
index 00000000..b81a89cb
--- /dev/null
+++ b/compute/tensor/src/embedding.cpp
@@ -0,0 +1,97 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
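+
+// Embedding table lookup: the input holds token indices with shape (batch, step)
+// (a 1-D input is treated as a single sequence), and each index selects one row of
+// length p.num_output from the weight matrix, producing a (batch, step, num_output) output.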
+ +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE embedding_infer_output_size(Tensor *inputTensor, + EmbedParamSpec p, + DataType outputDt, + Tensor *outputTensor, + ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = embedding_infer_output_size_mali( + inputDesc, p, outputDt, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif +#ifdef _USE_CPU + } else { + DataType dt; + DataFormat df; + U32 batch, step; + bool inputOneDim = false; + if (inputDesc.nDims == 1) { + inputOneDim = true; + inputDesc.nDims = 2; + inputDesc.dims[1] = 1; + } + CHECK_REQUIREMENT(tensorIs2d(inputDesc)); + CHECK_STATUS(tensor2dGet(inputDesc, &dt, &df, &batch, &step)); + outputDesc = tensor3df(outputDt, DF_MTK, batch, step, p.num_output); + if (inputOneDim) { + outputDesc.nDims = 2; + outputDesc.df = DF_NORMAL; + } + ret = SUCCESS; +#endif + } + outputTensor->resize(outputDesc); + return ret; +} + +EE embedding(Tensor inputTensor, + Tensor weightTensor, + EmbedParamSpec p, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + void *weight = get_ptr_from_tensor(weightTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + TensorDesc weightDesc = weightTensor.get_desc(); + ret = embedding_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, + weightDesc, (GCLMem_t)weight, p, outputDesc, (GCLMem_t)output); +#endif +#ifdef _USE_CPU + } else { + ret = embedding_cpu(inputDesc, input, weight, p, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/fully_connected.cpp b/compute/tensor/src/fully_connected.cpp new file mode 100644 index 00000000..0d8e0848 --- /dev/null +++ b/compute/tensor/src/fully_connected.cpp @@ -0,0 +1,451 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include + +#include "tensor_computing.h" +#include "blas_enhance.h" +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +// input format: NCHW|NCHWC8|NORMAL +// weight(filter) format: NORMAL +// result format: NORMAL + +inline EE fully_connected_infer_output_size_cpu( + TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc *outputDesc) +{ + if (outputDesc == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt, fdt; + DataFormat idf, fdf; + U32 in, ic, ih, iw; + U32 fh, fw; + if (tensorIs2d(inputDesc)) { + CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &in, &iw)); + ic = 1; + ih = 1; + } else if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + if (idf != DF_NCHW && idf != DF_NCHWC8) { + CHECK_STATUS(NOT_MATCH); + } + } else { + return NOT_MATCH; + } + + CHECK_REQUIREMENT(tensorIs2d(filterDesc)); + CHECK_STATUS(tensor2dGet(filterDesc, &fdt, &fdf, &fh, &fw)); + if (fdf != DF_TRANSPOSE) { + CHECK_STATUS(NOT_MATCH); + } + + if (fw != ic * ih * iw) { + CHECK_STATUS(NOT_MATCH); + } + + *outputDesc = tensor2df(idt, DF_NORMAL, in, fh); + return SUCCESS; +} + +EE fully_connected_infer_output_size( + Tensor *inputTensor, Tensor filterTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = fully_connected_infer_output_size_mali( + inputDesc, filterDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = fully_connected_infer_output_size_cpu(inputDesc, filterDesc, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE fully_connected_infer_forward_algorithm( + Tensor inputTensor, Tensor filterTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + std::vector outputDescs; + outputDescs.push_back(outputDesc); + ret = fully_connected_infer_forward_algorithm_mali( + ((MaliPara_t)(archInfo->archPara))->handle, inputDesc, filterDesc, outputDescs, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } else { + UNUSED(inputTensor); + UNUSED(filterTensor); + UNUSED(outputTensor); + } + return ret; +} +EE fully_connected_infer_forward_tmp_bytes( + Tensor inputTensor, Tensor filterTensor, U32 *bytes, ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + // Match dt in int8 inference + inputDesc.dt = filterDesc.dt; + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + ret = 
fully_connected_infer_forward_tmp_bytes_mali( + inputDesc, filterDesc, bytes, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } else { + if (bytes == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + if (tensorIs2d(inputDesc)) { + CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &in, &iw)); + ic = ih = 1; + } else if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + } else { + return NOT_MATCH; + } + + if (in != 1) { + // call gemm + TensorDesc in_desc = tensor2df(idt, DF_NORMAL, in, ic * ih * iw); + ret = matrix_matrix_multiply_tmp_bytes(in_desc, filterDesc, bytes, archInfo->arch); + } else { + // call gemv + TensorDesc in_desc = tensor1d(idt, ic * ih * iw); + ret = matrix_vector_multiply_tmp_bytes(filterDesc, in_desc, bytes, archInfo->arch); + } + if (DT_I8 == filterDesc.dt) { + if (DT_F16 == inputTensor.get_desc().dt) { + *bytes += tensorNumBytes(inputDesc); + } + *bytes += filterDesc.dims[0] * bytesOf(DT_I32); // Bias + *bytes += in * filterDesc.dims[1] * bytesOf(DT_I32); // Results before quantization + } + } + return ret; +} + +EE fully_connected_transform_filter_bytes(Tensor filterTensor, U32 *bytes, ArchInfo_t archInfo) +{ + TensorDesc filterDesc = filterTensor.get_desc(); + + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + CHECK_STATUS(fully_connected_transform_filter_bytes_mali(filterDesc, + ((MaliPara_t)(archInfo->archPara))->gclmemFilterDesc, bytes, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo)); +#endif + } else { + if (bytes == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + *bytes = tensorNumBytes(filterDesc) + 32; + } + return SUCCESS; +} + +template +EE fully_connected_transform_filter_kernel(TensorDesc inputDesc, + TensorDesc filterDesc, + const void *filter, + TensorDesc *ftmDesc, + void *filterTransformed) +{ + if (filter == nullptr || ftmDesc == nullptr || filterTransformed == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt, fdt; + DataFormat idf, fdf; + U32 in, ic, ih, iw; + U32 fh, fw; + if (tensorIs2d(inputDesc)) { + CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &in, &iw)); + ic = ih = 1; + } else if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + } else { + return NOT_MATCH; + } + CHECK_STATUS(tensor2dGet(filterDesc, &fdt, &fdf, &fh, &fw)); + + if (fw != ic * ih * iw) { + CHECK_STATUS(NOT_MATCH); + } + bool need_transpose = false; + if (in > 1) { + need_transpose = true; + } + + if (idf == DF_NCHW || idf == DF_NORMAL) { + if (need_transpose) { + T *f_ptr = (T *)filter; + T *ftm_ptr = (T *)filterTransformed; + for (U32 h = 0; h < fh; h++) { + for (U32 w = 0; w < fw; w++) { + U32 f_index = h * fw + w; + U32 ftm_index = w * fh + h; + ftm_ptr[ftm_index] = f_ptr[f_index]; + } + } + } else { + memcpy(filterTransformed, filter, tensorNumBytes(filterDesc)); + } + } else if (idf == DF_NCHWC8) { + U32 align = 8; + U32 ic_new = ic / align; + T *f_ptr = (T *)filter; + T *ftm_ptr = (T *)filterTransformed; + for (U32 h = 0; h < fh; h++) { + for (U32 w = 0; w < fw; w++) { + U32 i_n = w / (ic * ih * iw); + U32 remain = w % (ic * ih * iw); + U32 i_c = remain / (ih * iw); + remain = remain % (ih * iw); + U32 i_h = remain / iw; + U32 i_w = remain % iw; + U32 i_c_outer = i_c / align; + U32 i_c_inner = i_c % align; + U32 h_new = h; + U32 w_new = (((i_n * ic_new + i_c_outer) * ih + i_h) * iw + i_w) * align + i_c_inner; + U32 ld = fw; + if (need_transpose) { + U32 tmp = h_new; + h_new = 
w_new; + w_new = tmp; + ld = fh; + } + U32 f_index = h * fw + w; + U32 ftm_index = h_new * ld + w_new; + ftm_ptr[ftm_index] = f_ptr[f_index]; + } + } + } else { + return NOT_MATCH; + } + + U32 fh_after = fh; + U32 fw_after = fw; + if (need_transpose) { + fh_after = fw; + fw_after = fh; + } + *ftmDesc = tensor2df(fdt, DF_NORMAL, fh_after, fw_after); + return SUCCESS; +} + +EE fully_connected_transform_filter( + Tensor inputTensor, Tensor filterTensor, Tensor *ftmTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + void *filter = get_ptr_from_tensor(filterTensor, arch); + TensorDesc ftmDesc = ftmTensor->get_desc(); + void *filterTransformed = get_ptr_from_tensor(*ftmTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + std::vector filterTransVec; + filterTransVec.push_back((GCLMem_t)filterTransformed); + ret = fully_connected_transform_filter_mali(((MaliPara_t)(archInfo->archPara))->handle, + filterDesc, (GCLMem_t)filter, &ftmDesc, filterTransVec, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } else { + switch (filterDesc.dt) { +#ifdef _USE_FP16 + case DT_F16: { + ret = fully_connected_transform_filter_kernel( + inputDesc, filterDesc, filter, &ftmDesc, filterTransformed); + break; + } +#endif +#ifdef _USE_FP32 + case DT_F32: { + ret = fully_connected_transform_filter_kernel( + inputDesc, filterDesc, filter, &ftmDesc, filterTransformed); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + } + ftmTensor->resize(ftmDesc); + return ret; +} + +EE fully_connected(Tensor inputTensor, + Tensor filterTensor, + Tensor biasTensor, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc filterDesc = filterTensor.get_desc(); + void *filter = get_ptr_from_tensor(filterTensor, arch); + TensorDesc biasDesc = biasTensor.get_desc(); + void *bias = get_ptr_from_tensor(biasTensor, arch); + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + std::vector filterVec; + std::vector biasVec; + std::vector outputVec; + filterVec.push_back((GCLMem_t)filter); + biasVec.push_back((GCLMem_t)bias); + outputVec.push_back((GCLMem_t)output); + ret = fully_connected_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, filterDesc, &filterVec, biasDesc, &biasVec, tmpBytes, (GCLMem_t)tmp, + outputDesc, &outputVec, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } else { + if (input == nullptr || filter == nullptr || output == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + +#ifdef _USE_INT8 + F32 scaleI = inputTensor.get_scale(); + if (DT_I8 == filterDesc.dt) { + if (DT_F16 == inputDesc.dt) { + F16 *inD = (F16 *)input; + INT8 *inQ = (INT8 *)tmp; + F16 scale = scaleI; + quantize_tensor(inputDesc, inD, &inputDesc, inQ, &scale); + scaleI = scale; + input = (U8 *)tmp; + tmp = (U8 *)tmp + tensorNumBytes(inputDesc); + } + if (nullptr != bias) { + if (DT_F16 == outputDesc.dt) { // dequantize and then add bias + bias = nullptr; + } else { + CHECK_REQUIREMENT(DT_I8 == outputDesc.dt); + biasDesc.dt = DT_I32; + F16 *biasF = (F16 *)bias; + I32 *biasI 
= (I32 *)tmp; + F32 scale = scaleI * filterTensor.get_scale(); + for (U32 i = 0; i < tensorNumElements(biasDesc); i++) { + biasI[i] = round(scale * biasF[i]); + } + bias = tmp; + tmp = (U8 *)tmp + tensorNumBytes(biasDesc); + } + } + outputDesc.dt = DT_I32; + output = tmp; + tmp = (U8 *)tmp + tensorNumBytes(outputDesc); + } +#endif + + U32 in, ic, ih, iw; + U32 oh, ow; + U32 fh, fw, bw; + DataType idt, fdt, odt, bdt; + DataFormat idf, fdf, odf, bdf; + if (tensorIs2d(inputDesc)) { + CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &in, &iw)); + ic = ih = 1; + } else if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + } else { + CHECK_STATUS(NOT_MATCH); + } + + CHECK_REQUIREMENT(tensorIs2d(filterDesc)); + CHECK_STATUS(tensor2dGet(filterDesc, &fdt, &fdf, &fh, &fw)); + CHECK_STATUS(tensor2dGet(outputDesc, &odt, &odf, &oh, &ow)); + + if (bias != nullptr) { + CHECK_STATUS(tensor1dGet(biasDesc, &bdt, &bdf, &bw)); + + if (bw != ow) { + CHECK_STATUS(NOT_MATCH); + } else { + U8 *outArray = (U8 *)output; + U32 size = tensorNumBytes(biasDesc); + for (U32 i = 0; i < in; i++) { + memcpy(outArray + i * size, bias, size); + } + } + } else { + memset(output, 0, tensorNumBytes(outputDesc)); + } + if (in == 1 && + fdf != targetFormat4MatrixB(fdt)) { // If weight is transformed for mmm, don't run as mvm + TensorDesc vectorDesc = tensor1d(idt, ic * ih * iw); + TensorDesc resultDesc = tensor1d(odt, ow); + ret = matrix_vector_multiply(filterDesc, filter, vectorDesc, input, tmpBytes, tmp, + resultDesc, output, archInfo->arch); + } else { + TensorDesc in_desc = tensor2df(idt, DF_NORMAL, in, ic * ih * iw); + ret = matrix_matrix_multiply(in_desc, input, filterDesc, filter, tmpBytes, tmp, + outputDesc, output, archInfo->arch); + } +#ifdef _USE_INT8 + F32 scale = scaleI * filterTensor.get_scale(); + if (DT_I8 == filterDesc.dt) { + if (DT_I8 == outputTensor.get_desc().dt) { + CHECK_STATUS(quantize_tensor(outputDesc, output, &outputDesc, + get_ptr_from_tensor(outputTensor, arch), &scale)); + outputTensor.set_scale(scale); + } else { + CHECK_REQUIREMENT(DT_F16 == outputTensor.get_desc().dt); + F16 *biasF = (F16 *)get_ptr_from_tensor(biasTensor, arch); + U32 biasLen = nullptr == biasF ? 0 : tensorNumElements(biasDesc); + dequantize_int32_to_fp16(tensorNumElements(outputDesc), (I32 *)output, scale, + (F16 *)get_ptr_from_tensor(outputTensor, arch), biasLen, biasF); + } + } +#endif + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/activation.cpp b/compute/tensor/src/gpu/mali/activation.cpp new file mode 100644 index 00000000..0aeaea05 --- /dev/null +++ b/compute/tensor/src/gpu/mali/activation.cpp @@ -0,0 +1,96 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/activation_mali_fp16.h" + +EE activation_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + if (outputDesc) { + *outputDesc = inputDesc; + } + + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + if (gclmemInputDesc && gclmemOutputDesc) { + *gclmemOutputDesc = *gclmemInputDesc; + } + return SUCCESS; +} + +inline EE activation_checkpara_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + if (inputDesc.df != outputDesc.df) { + return NOT_SUPPORTED; + } + if (input->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + if (output->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + if (activationMode != ACTIVATION_NULL && activationMode != ACTIVATION_RELU && + activationMode != ACTIVATION_RELU6 && activationMode != ACTIVATION_H_SIGMOID && + activationMode != ACTIVATION_H_SWISH && activationMode != ACTIVATION_GELU && + activationMode != ACTIVATION_TANH && activationMode != ACTIVATION_SIGMOID) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE activation_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + EE ret = SUCCESS; + CHECK_STATUS( + activation_checkpara_mali(handle, inputDesc, input, outputDesc, output, activationMode)); + switch (inputDesc.dt) { + case DT_F16: { + ret = activation_mali_fp16(handle, inputDesc, input, outputDesc, output, activationMode); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/argmax.cpp b/compute/tensor/src/gpu/mali/argmax.cpp new file mode 100644 index 00000000..7d8b6096 --- /dev/null +++ b/compute/tensor/src/gpu/mali/argmax.cpp @@ -0,0 +1,117 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "sys.h"
+#include "types.h"
+#include "tensor_desc.h"
+#include "error.h"
+#include "gpu/mali/tensor_computing_mali.h"
+#include "gpu/mali/fp16/argmax_mali_fp16.h"
+
+EE argmax_infer_output_size_mali(TensorDesc inputDesc,
+    ArgMaxParamSpec p,
+    TensorDesc *outputDesc,
+    GCLMemDesc_t gclmemInputDesc,
+    GCLMemDesc_t gclmemOutputDesc)
+{
+    /* tensorDesc records the CPU original data format info */
+    /* gclmemDesc records the GPU transformed data format info */
+    int axis = p.axis;
+    TensorDesc desc = inputDesc;
+    if (axis < 0) {
+        axis += inputDesc.nDims;
+    }
+    axis = inputDesc.nDims - 1 - axis;
+    for (int i = axis; i < (I32)(inputDesc.nDims) - 1; i++) {
+        desc.dims[i] = desc.dims[i + 1];
+    }
+    desc.nDims = inputDesc.nDims - 1;
+    desc.dt = DT_U32;
+    if (outputDesc) {
+        *outputDesc = desc;
+    }
+
+    if (gclmemInputDesc || gclmemOutputDesc) {
+        U32 iw, ih, ic;
+        U32 ow, oh, oc;
+        U32 inDims = inputDesc.nDims;
+        U32 onDims = desc.nDims;
+        DataType idt = inputDesc.dt;
+        DataType odt = desc.dt;
+        iw = inputDesc.dims[0];
+        ih = (inDims > 1) ? inputDesc.dims[1] : 1;
+        ic = (inDims > 2) ? inputDesc.dims[2] : 1;
+        ow = desc.dims[0];
+        oh = (onDims > 1) ? desc.dims[1] : 1;
+        oc = (onDims > 2) ? desc.dims[2] : 1;
+        U32 iw_align = (axis == 0) ? (iw + 7) / 8 * 8 : iw;
+        U32 ih_align = (axis == 1) ? (ih + 7) / 8 * 8 : ih;
+        U32 ic_align = (axis == 2) ? (ic + 7) / 8 * 8 : ic;
+        bool need_pad = false;
+        if (iw_align != iw || ih_align != ih || ic_align != ic) {
+            need_pad = true;
+        }
+        CHECK_STATUS(infer_gclmem_desc_nchw(iw_align, ih_align, ic_align, 0, 0, ow, oh, oc, idt,
+            odt, gclmemInputDesc, gclmemOutputDesc, need_pad));
+    }
+    return SUCCESS;
+}
+
+inline EE argmax_checkpara_mali(GCLHandle_t handle, GCLMem_t input, GCLMem_t tmpbuf, GCLMem_t output)
+{
+    if (handle == nullptr || input == nullptr || output == nullptr || tmpbuf == nullptr) {
+        return NULL_POINTER;
+    }
+    if (input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCHW) {
+        return NOT_SUPPORTED;
+    }
+    return SUCCESS;
+}
+
+EE argmax_infer_forward_tmp_bytes_mali(
+    TensorDesc inputDesc, ArgMaxParamSpec p, TensorDesc outputDesc, U32 *bytes)
+{
+    EE ret = SUCCESS;
+    switch (inputDesc.dt) {
+        case DT_F16: {
+            ret = argmax_infer_forward_tmp_bytes_mali_fp16(inputDesc, p.axis, outputDesc, bytes);
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
+
+EE argmax_mali(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    GCLMem_t input,
+    ArgMaxParamSpec p,
+    GCLMem_t tmpbuf,
+    TensorDesc outputDesc,
+    GCLMem_t output)
+{
+    EE ret = SUCCESS;
+    CHECK_STATUS(argmax_checkpara_mali(handle, input, tmpbuf, output));
+    switch (inputDesc.dt) {
+        case DT_F16: {
+            ret = argmax_mali_fp16(handle, inputDesc, input, p.axis, tmpbuf, outputDesc, output);
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/gpu/mali/bilateral_slice_apply.cpp b/compute/tensor/src/gpu/mali/bilateral_slice_apply.cpp
new file mode 100644
index 00000000..95b414a3
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/bilateral_slice_apply.cpp
@@ -0,0 +1,214 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
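+
+// Shape contract enforced below, with illustrative sizes (the numbers are
+// assumptions, not taken from any model): input and guide must agree
+// spatially, the grid is a low-resolution bilateral grid, and the output
+// keeps the input's channel count:
+//   input (1, 3, 512, 512) + guide (1, 1, 512, 512) + grid (1, 96, 32, 32)
+//       -> output (1, 3, 512, 512)
+// With has_offset == true, coefficient_len must equal ic * (ic + 1) =
+// 3 * 4 = 12 (a 3x3 color transform plus a 3-vector offset per grid cell),
+// and the grid's channel extent must be a multiple of it (96 = 12
+// coefficients times 8 depth bins in this example).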
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h" +#include "gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h" + +inline EE bilateral_slice_apply_checkpara_mali_common(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc guideDesc, + const GCLMem_t guide, + TensorDesc gridDesc, + const GCLMem_t grid, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (nullptr == handle || nullptr == input || nullptr == grid || nullptr == output) { + return NULL_POINTER; + } + if (bilateralSliceApplyParamSpec.mode == BSliceApply_NULL && nullptr == guide) { + return NULL_POINTER; + } + if (inputDesc.df != guideDesc.df || inputDesc.df != gridDesc.df) { + return NOT_SUPPORTED; + } + if (inputDesc.df != outputDesc.df || inputDesc.df != DF_NHWC) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[0] != guideDesc.dims[0] || inputDesc.dims[1] != guideDesc.dims[1]) { + return NOT_MATCH; + } + if (inputDesc.dims[0] != outputDesc.dims[0] || inputDesc.dims[1] != outputDesc.dims[1]) { + return NOT_MATCH; + } + if (inputDesc.dims[2] != outputDesc.dims[2]) { + return NOT_MATCH; + } + if ((gridDesc.dims[2] % bilateralSliceApplyParamSpec.coefficient_len) != 0) { + return NOT_MATCH; + } + if (bilateralSliceApplyParamSpec.has_offset == true) { + if (bilateralSliceApplyParamSpec.coefficient_len != + inputDesc.dims[2] * (inputDesc.dims[2] + 1)) { + return NOT_MATCH; + } + if (bilateralSliceApplyParamSpec.coefficient_len != 12) { + return NOT_SUPPORTED; + } + } else { + return NOT_SUPPORTED; + // if(bilateralSliceApplyParamSpec.coefficient_len != inputDesc.dims[2] * inputDesc.dims[2]) return NOT_MATCH; + // if(bilateralSliceApplyParamSpec.coefficient_len != 9) return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE bilateral_slice_apply_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc guideDesc, + TensorDesc gridDesc, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemGuideDesc, + GCLMemDesc_t gclmemGridDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + UNUSED(bilateralSliceApplyParamSpec); + DataType idt, gdt, guide_dt; + DataFormat idf, gdf; + U32 guide_w, guide_h, guide_c, guide_n; + U32 iw, ih, ic, in; + U32 ow, oh, oc, on; + U32 gw, gh, gc, gn; + + if (inputDesc.df != DF_NHWC || guideDesc.df != DF_NHWC) { + return NOT_MATCH; + } + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + tensorSelectGet(guideDesc, &guide_dt, &gdf, &guide_n, &guide_c, &guide_h, &guide_w); + tensorSelectGet(gridDesc, &gdt, &gdf, &gn, &gc, &gh, &gw); + ow = guide_w; + oh = guide_h; + oc = ic; + on = guide_n; + if (outputDesc) { + *outputDesc = tensor4df(idt, idf, on, oc, oh, ow); + } + CHECK_STATUS(infer_gclmem_desc_nhwc( + iw, ih, ic, 0, 0, ow, oh, oc, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + + if (gclmemGridDesc && gclmemGuideDesc) { + U32 s0, s1, s2; + U32 num, byteSize; + s0 = gc; + s1 = gw; + s2 = gh; + num = s0 * s1 * s2; + byteSize = s0 * s1 * s2 * bytesOf(gdt); + gclmemGridDesc->stride[0] = s0; + gclmemGridDesc->stride[1] = s1; + gclmemGridDesc->stride[2] = s2; + gclmemGridDesc->offset[0] = 0; + gclmemGridDesc->offset[1] = 0; + gclmemGridDesc->offset[2] = 0; + gclmemGridDesc->num = num; + gclmemGridDesc->byteSize = byteSize; + gclmemGridDesc->memType = GCL_MEM_BUF; + gclmemGridDesc->memFormat = DF_NHWC; + 
gclmemGridDesc->flags = CL_MEM_READ_WRITE; + gclmemGridDesc->host_ptr = NULL; + gclmemGridDesc->need_pad = false; + + if (bilateralSliceApplyParamSpec.mode == BSliceApply_NULL) { + s0 = guide_c; + s1 = guide_w; + s2 = guide_h; + num = s0 * s1 * s2; + byteSize = s0 * s1 * s2 * bytesOf(guide_dt); + gclmemGuideDesc->stride[0] = s0; + gclmemGuideDesc->stride[1] = s1; + gclmemGuideDesc->stride[2] = s2; + gclmemGuideDesc->offset[0] = 0; + gclmemGuideDesc->offset[1] = 0; + gclmemGuideDesc->offset[2] = 0; + gclmemGuideDesc->num = num; + gclmemGuideDesc->byteSize = byteSize; + gclmemGuideDesc->memType = GCL_MEM_BUF; + gclmemGuideDesc->memFormat = DF_NHWC; + gclmemGuideDesc->flags = CL_MEM_READ_WRITE; + gclmemGuideDesc->host_ptr = NULL; + gclmemGuideDesc->need_pad = false; + } + } + return SUCCESS; +} + +EE bilateral_slice_apply_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc guideDesc, + TensorDesc gridDesc, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + UNUSED(inputDesc); + UNUSED(guideDesc); + UNUSED(gridDesc); + UNUSED(bilateralSliceApplyParamSpec); + UNUSED(forwardRunInfo); + + DataType dt; + U32 gc, gw; + U32 ih; + tensorSelectGet(gridDesc, &dt, NULL, NULL, &gc, NULL, &gw); + tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, NULL); + *bytes = gc * gw * ih * bytesOf(dt); + return SUCCESS; +} + +EE bilateral_slice_apply_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc guideDesc, + const GCLMem_t guide, + TensorDesc gridDesc, + const GCLMem_t grid, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(bilateral_slice_apply_checkpara_mali_common(handle, inputDesc, input, guideDesc, + guide, gridDesc, grid, bilateralSliceApplyParamSpec, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = bilateral_slice_apply_mali_fp16(handle, inputDesc, input, guideDesc, guide, + gridDesc, grid, bilateralSliceApplyParamSpec, forwardRunInfo, tmpBytes, tmpBuf, + outputDesc, output); + break; + } + case DT_U8: { + ret = bilateral_slice_apply_mali_uchar(handle, inputDesc, input, guideDesc, guide, + gridDesc, grid, bilateralSliceApplyParamSpec, forwardRunInfo, tmpBytes, tmpBuf, + outputDesc, output); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/channel_resize.cpp b/compute/tensor/src/gpu/mali/channel_resize.cpp new file mode 100644 index 00000000..2fa5f14f --- /dev/null +++ b/compute/tensor/src/gpu/mali/channel_resize.cpp @@ -0,0 +1,84 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/channel_resize_mali_fp16.h" + +EE channel_resize_infer_output_size_mali(TensorDesc inputDesc, + ChannelResizeParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + if (outputDesc == nullptr || gclmemInputDesc == nullptr || gclmemOutputDesc == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_REQUIREMENT(((int)ic == p.channel_before)); + if (p.group != 1) { + return NOT_SUPPORTED; + } + + *outputDesc = tensor4df(idt, idf, in, p.channel_after, ih, iw); + if (gclmemInputDesc->memFormat == DF_NCHW || gclmemInputDesc->byteSize == 0) { + CHECK_STATUS( + infer_gclmem_desc_nchw(iw, ih, ic, 0, 0, 0, 0, 0, idt, idt, gclmemInputDesc, NULL)); + } else { + CHECK_STATUS( + infer_gclmem_desc_ncwhc4(iw, ih, ic, 0, 0, 0, 0, 0, idt, idt, gclmemInputDesc, NULL)); + } + CHECK_STATUS(infer_gclmem_desc_nchw( + 0, 0, 0, 0, 0, iw, ih, p.channel_after, idt, idt, NULL, gclmemOutputDesc)); + return SUCCESS; +} + +inline EE channel_resize_checkpara_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ChannelResizeParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (handle == nullptr || input == nullptr || output == nullptr) { + return NULL_POINTER; + } + return SUCCESS; +} + +EE channel_resize_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ChannelResizeParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(channel_resize_checkpara_mali(handle, inputDesc, input, p, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = channel_resize_mali_fp16(handle, inputDesc, input, p, outputDesc, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/check.cpp b/compute/tensor/src/gpu/mali/check.cpp new file mode 100644 index 00000000..83710dfa --- /dev/null +++ b/compute/tensor/src/gpu/mali/check.cpp @@ -0,0 +1,153 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" + +EE check_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputADesc, + GCLMemDesc_t gclmemInputBDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + if (outputDesc) { + (*outputDesc).dt = DT_I32; + (*outputDesc).nDims = 1; + (*outputDesc).dims[0] = inputDesc.dims[inputDesc.nDims - 1]; + } + DataType idt = inputDesc.dt; + U32 ndims = inputDesc.nDims; + U32 iw = inputDesc.dims[0]; + U32 ih = (ndims > 1) ? inputDesc.dims[1] : 1; + U32 ic = (ndims > 2) ? inputDesc.dims[2] : 1; + U32 in = (ndims > 3) ? inputDesc.dims[3] : 1; + if (in > 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + CHECK_STATUS(infer_gclmem_desc_nchw( + iw, ih, ic, 0, 0, 1, 1, 1, idt, DT_I32, gclmemInputADesc, gclmemOutputDesc)); + CHECK_STATUS( + infer_gclmem_desc_nchw(iw, ih, ic, 0, 0, 0, 0, 0, idt, idt, gclmemInputBDesc, NULL)); + return SUCCESS; +} + +inline EE check_checkpara_mali(GCLHandle_t handle, + TensorDesc inputDescA, + GCLMem_t inputA, + TensorDesc inputDescB, + GCLMem_t inputB, + CheckParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (handle == nullptr || inputA == nullptr || inputB == nullptr || output == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (inputA->desc.memFormat != output->desc.memFormat || + inputB->desc.memFormat != output->desc.memFormat || inputA->desc.memFormat != DF_NCHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (inputDescA.dt == DT_I32 || inputDescA.dt == DT_U32) { + if (inputDescB.dt != DT_I32 && inputDescB.dt != DT_U32) { + CHECK_STATUS(NOT_MATCH); + } + } + if (outputDesc.dt != DT_I32) { + CHECK_STATUS(NOT_MATCH); + } + if (p.check_mode != CHECK_EQUAL) { + CHECK_STATUS(NOT_SUPPORTED); + } + return SUCCESS; +} + +inline EE check_core_mali(GCLHandle_t handle, + TensorDesc inputDescA, + GCLMem_t inputA, + TensorDesc inputDescB, + GCLMem_t inputB, + CheckParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + U32 ndims = inputDescA.nDims; + U32 iw = inputDescA.dims[0]; + U32 ih = (ndims > 1) ? inputDescA.dims[1] : 1; + U32 ic = (ndims > 2) ? 
inputDescA.dims[2] : 1; + if (iw == 1 && ih == 1 && ic == 1) { + U32 aw_str, ah_str, aw_off, ah_off; + U32 bw_str, bh_str, bw_off, bh_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(inputA->desc, &aw_str, &ah_str, NULL, &aw_off, &ah_off); + get_gclmem_dim(inputB->desc, &bw_str, &bh_str, NULL, &bw_off, &bh_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + U32 gs = 1; + U32 ls = 0; + U32 dim = 1; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "check_int_spe", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs( + kernel, aw_off, bw_off, ow_off, gs, inputA->mem, inputB->mem, output->mem)); + gcl_set_kernelVec(handle, kernel, dim, &gs, &ls, "check_int_spe"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs, &ls, "check_int_spe")); + CHECK_STATUS(gcl_print_memory(handle, inputA, "clip_inputA")); + CHECK_STATUS(gcl_print_memory(handle, inputB, "clip_inputB")); + CHECK_STATUS(gcl_print_memory(handle, output, "clip_output")); +#endif + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE check_mali(GCLHandle_t handle, + TensorDesc inputDescA, + GCLMem_t inputA, + TensorDesc inputDescB, + GCLMem_t inputB, + CheckParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS( + check_checkpara_mali(handle, inputDescA, inputA, inputDescB, inputB, p, outputDesc, output)); + DataType dt = inputDescA.dt; + if (dt == DT_U32) { + dt = DT_I32; + } + switch (dt) { + case DT_F16: { + ret = NOT_SUPPORTED; + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + case DT_I32: { + ret = check_core_mali( + handle, inputDescA, inputA, inputDescB, inputB, p, outputDesc, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/cl/activation.cl b/compute/tensor/src/gpu/mali/cl/activation.cl new file mode 100644 index 00000000..856c7fb2 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/activation.cl @@ -0,0 +1,58 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
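+
+// Indexing sketch for the NCWHC4 activation kernel below (illustrative
+// commentary, not part of the original sources): each work-item handles one
+// T4 vector of 4 consecutive channels at (h = idx, w = idy, c/4 = idz),
+// loaded from offset (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off.
+// For tanh/sigmoid/h_sigmoid/gelu, lanes of the last channel group at or
+// beyond ce4 (the count of valid channels in that group) are reset to zero,
+// keeping the padded channels zero after activation.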
+ +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, AC, H) base##AC##H +#define MANGLE_NAME(base, AC, H) MANGLE_NAME_IMPL(base, AC, H) +__kernel void MANGLE_NAME(activation_, AC, H)(const int h, + const int w, + const int cd4, + const int ce4, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + __global T *input, + __global T *output) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + if (idx >= h || idy >= w) { + return; + } + + T4 val; + int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; + val = vload4(in_off, input); + ACTIVATION_V4(val); +#if defined(USE_TANH) || defined(USE_SIGMOID) || defined(USE_HSIGMOID) || defined(USE_GELU) + if (idz == cd4 - 1) { + if (ce4 < 2) { + val.y = 0; + } + if (ce4 < 3) { + val.z = 0; + } + if (ce4 < 4) { + val.w = 0; + } + } +#endif + int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; + vstore4(val, out_off, output); +} diff --git a/compute/tensor/src/gpu/mali/cl/argmax_x.cl b/compute/tensor/src/gpu/mali/cl/argmax_x.cl new file mode 100644 index 00000000..7e3fe903 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/argmax_x.cl @@ -0,0 +1,136 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#define get_max(val, dim) \ + { \ + dim.s0 = 0; \ + dim.s1 = 1; \ + dim.s2 = 2; \ + dim.s3 = 3; \ + if (val.s4 > val.s0) { \ + val.s0 = val.s4; \ + dim.s0 = 4; \ + } \ + if (val.s5 > val.s1) { \ + val.s1 = val.s5; \ + dim.s1 = 5; \ + } \ + if (val.s6 > val.s2) { \ + val.s2 = val.s6; \ + dim.s2 = 6; \ + } \ + if (val.s7 > val.s3) { \ + val.s3 = val.s7; \ + dim.s3 = 7; \ + } \ + if (val.s2 > val.s0) { \ + val.s0 = val.s2; \ + dim.s0 = dim.s2; \ + } \ + if (val.s3 > val.s1) { \ + val.s1 = val.s3; \ + dim.s1 = dim.s3; \ + } \ + if (val.s1 > val.s0) { \ + val.s0 = val.s1; \ + dim.s0 = dim.s1; \ + } \ + } + +#if defined(USE_INDEX) +__kernel void argmax_x_index +#else +__kernel void argmax_x +#endif + (const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int len, + const int bx, + const int by, + __global const T *in, + __global const uint *ini, + __global T *outv, + __global uint *outi) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + int bn = len >> 3; + int en = len & 7; + T8 val; + uchar4 dim; + T maxval = -65504; + uint maxIndex = 1; + const int in_off = (idz * ih_str + idy + ih_off) * iw_str + iw_off; + for (int i = idx; i < bn; i += bx) { + val = vload8(i, in + in_off); + get_max(val, dim); + if (val.s0 > maxval) { + maxval = val.s0; + maxIndex = (i << 3) + dim.s0; + } + } + + if (en != 0 && idx == bx - 1) { + int be = len - 8; + int rx = 0; + if (be < 0) { + be = 0; + rx = -be; + } + val = vload8(0, in + in_off + be); + if (rx > 0) { + val.s7 = -65504; + if (rx > 1) { + val.s6 = -65504; + } + if (rx > 2) { + val.s5 = -65504; + } + if (rx > 3) { + val.s4 = -65504; + } + if (rx > 4) { + val.s3 = -65504; + } + if (rx > 5) { + val.s2 = -65504; + } + if (rx > 6) { + val.s1 = -65504; + } + } + get_max(val, dim); + if (val.s0 > maxval) { + maxval = val.s0; + maxIndex = be + dim.s0; + } + } + int out_off = (idz * oh_str + idy + oh_off) * ow_str + idx + ow_off; +#if defined(USE_INDEX) + maxIndex = ini[maxIndex]; +#endif + if (bx > 1) { + outv[out_off] = maxval; + } + outi[out_off] = maxIndex; +} diff --git a/compute/tensor/src/gpu/mali/cl/bilateral_slice_apply_c12.cl b/compute/tensor/src/gpu/mali/cl/bilateral_slice_apply_c12.cl new file mode 100644 index 00000000..1f4028ea --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/bilateral_slice_apply_c12.cl @@ -0,0 +1,195 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#if defined(USE_HALF)
+#define READ_IMAGE(image, sampler, coord) read_imageh(image, sampler, coord)
+#define WRITE_IMAGE(image, coord, data) write_imageh(image, coord, data)
+#else
+#define READ_IMAGE(image, sampler, coord) read_imagef(image, sampler, coord)
+#define WRITE_IMAGE(image, coord, data) write_imagef(image, coord, data)
+#endif
+__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+/* These parameters implement the matrix multiply/add and the conv step. */
+/* They were extracted from the HDR model. */
+/* They may change for a different model. */
+#define guide_cal(v, g)                                                                    \
+    {                                                                                      \
+        T3 tmp;                                                                            \
+        tmp.x = v.x * (T)0.900616 - v.y * (T)0.1006 - v.z * (T)0.058384 + (T)0.072721;     \
+        tmp.y = -v.x * (T)0.079311 + v.y * (T)0.91976 - v.z * (T)0.037624 + (T)0.124359;   \
+        tmp.z = -v.x * (T)0.068347 - v.y * (T)0.069032 + v.z * (T)0.975032 + (T)0.129721;  \
+        tmp.x = (tmp.x < 0) ? 0 : tmp.x;                                                   \
+        tmp.y = (tmp.y < 0) ? 0 : tmp.y;                                                   \
+        tmp.z = (tmp.z < 0) ? 0 : tmp.z;                                                   \
+        tmp.x = tmp.x * (T)0.003211 * 16;                                                  \
+        tmp.y = tmp.y * (T)0.007948 * 16;                                                  \
+        tmp.z = tmp.z * (T)0.046259 * 16;                                                  \
+        g = tmp.x * (T)0.249512 + tmp.y * (T)0.274577 + tmp.z * (T)0.324276 + (T)0.078941; \
+    }
+
+#if defined(CONV)
+#if defined(UCHAR)
+__kernel void bilateral_slice_apply_c12_conv_uchar
+#else
+__kernel void bilateral_slice_apply_c12_conv
+#endif
+#else
+#if defined(UCHAR)
+__kernel void bilateral_slice_apply_c12_uchar
+#else
+__kernel void bilateral_slice_apply_c12
+#endif
+#endif
+    (const int w,
+        const int wh,
+        const int gc,
+        const int gw,
+        const int gh,
+        const int gcw,
+        const int gd,
+        const int coe,
+        const int bx,
+        const int by,
+        const float scale_x,
+        const float scale_y,
+        global const T *guide,
+        global const T *grid,
+#if defined(UCHAR)
+        global const uchar *input,
+        global uchar *out)
+{
+#else
+        global const T *input,
+        global T *out)
+{
+#endif
+
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    if (x >= bx || y >= by) {
+        return;
+    }
+    int in_off = y * w + x;
+    T3 in_val;
+#if defined(UCHAR)
+    uchar3 tmp = vload3(0, input + in_off * 3);
+    in_val.x = tmp.x / 256.0;
+    in_val.y = tmp.y / 256.0;
+    in_val.z = tmp.z / 256.0;
+#else
+    in_val = vload3(0, input + in_off * 3);
+#endif
+
+    T gx = (x + (T)0.5) * (T)scale_x;
+    T gz;
+#if defined(CONV)
+    guide_cal(in_val, gz);
+#else
+    gz = guide[in_off];
+#endif
+    gz = gz * gd;
+    char fx = (char)floor(gx - (T)0.5);
+    char fz = (char)floor(gz - (T)0.5);
+
+    char i = 0;
+    char k = 0;
+    char x_ = fx;
+    char z_ = fz;
+    if (fx < 0) {
+        x_ = 0;
+        i = 1;
+    }
+    if (fz < 0) {
+        z_ = 0;
+        k = 1;
+    }
+    if (fx == gw - 1) {
+        i = 1;
+    }
+    if (fz == gd - 1) {
+        k = 1;
+    }
+
+    T8 g_val[3];
+    T4 p;
+    T4 sum[3];
+    T2 wx, wz;
+    sum[0] = (T4)0;
+    sum[1] = (T4)0;
+    sum[2] = (T4)0;
+
+    wx.s0 = (T)1 - fabs(fx + (T)0.5 - gx);
+    wx.s1 = (T)1 - fabs(fx + (T)1.5 - gx);
+    wz.s0 = (T)1 - fabs(fz + (T)0.5 - gz);
+    wz.s1 = (T)1 - fabs(fz + (T)1.5 - gz);
+
+    if (wx.s0 < 0) {
+        wx.s0 = 0;
+    }
+    if (wx.s1 < 0) {
+        wx.s1 = 0;
+    }
+    if (wz.s0 < 0) {
+        wz.s0 = 0;
+    }
+    if (wz.s1 < 0) {
+        wz.s1 = 0;
+    }
+
+    p.xy = wx.s0 * wz;
+    p.zw = wx.s1 * wz;
+
+    int grid_off = y * gcw + x_ * gc + z_ * coe;
+    g_val[0] = vload8(0, grid + grid_off);
+    g_val[1] = vload8(0, grid + grid_off + 8);
+    p.x = p.x +
(T)k * p.y + (T)i * (p.z + (T)k * p.w); + sum[0] += g_val[0].s0123 * p.x; + sum[1] += g_val[0].s4567 * p.x; + sum[2] += g_val[1].s0123 * p.x; + if (k == 0) { + p.y = p.y + (T)i * p.w; + g_val[2] = vload8(0, grid + grid_off + 16); + sum[0] += g_val[1].s4567 * p.y; + sum[1] += g_val[2].s0123 * p.y; + sum[2] += g_val[2].s4567 * p.y; + } + + if (i == 0) { + grid_off += gc; + p.z = p.z + (T)k * p.w; + g_val[0] = vload8(0, grid + grid_off); + g_val[1] = vload8(0, grid + grid_off + 8); + sum[0] += g_val[0].s0123 * p.z; + sum[1] += g_val[0].s4567 * p.z; + sum[2] += g_val[1].s0123 * p.z; + if (k == 0) { + g_val[2] = vload8(0, grid + grid_off + 16); + sum[0] += g_val[1].s4567 * p.w; + sum[1] += g_val[2].s0123 * p.w; + sum[2] += g_val[2].s4567 * p.w; + } + } + + sum[0].x = sum[0].x * in_val.x + sum[0].y * in_val.y + sum[0].z * in_val.z + sum[0].w; + sum[1].x = sum[1].x * in_val.x + sum[1].y * in_val.y + sum[1].z * in_val.z + sum[1].w; + sum[2].x = sum[2].x * in_val.x + sum[2].y * in_val.y + sum[2].z * in_val.z + sum[2].w; +#if defined(UCHAR) + tmp.x = (uchar)(sum[0].x * 256.0); + tmp.y = (uchar)(sum[1].x * 256.0); + tmp.z = (uchar)(sum[2].x * 256.0); + vstore3(tmp, 0, out + in_off * 3); +#else + vstore3((T3)(sum[0].x, sum[1].x, sum[2].x), 0, out + in_off * 3); +#endif +} diff --git a/compute/tensor/src/gpu/mali/cl/bilateral_slice_apply_pre.cl b/compute/tensor/src/gpu/mali/cl/bilateral_slice_apply_pre.cl new file mode 100644 index 00000000..5a9ee40d --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/bilateral_slice_apply_pre.cl @@ -0,0 +1,55 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__kernel void bilateral_slice_apply_pre(const int gh, + const int gc, + const int gcw, + const int bx, + const int bw, + const float scale_y, + global const T *grid, + global T *gridTran) +{ + const int idx = get_global_id(0); // dep * coe / 4 + const int idw = get_global_id(1); // gw + const int idh = get_global_id(2); // H + if (idx >= bx || idw >= bw) { + return; + } + char j = 1; + + T2 wy; + T gy = (idh + (T)0.5) * (T)scale_y; + char fy = floor(gy - (T)0.5); + char y_ = fy; + if (fy < 0) { + y_ = 0; + j = 0; + } + if (fy == gh - 1) { + j = 0; + } + wy.x = (T)1 - fabs(fy + (T)0.5 - gy); + wy.y = (T)1 - fabs(fy + (T)1.5 - gy); + + int grid_off = y_ * gcw + idw * gc + (idx << 2); + T4 val0; + T4 val1; + T4 res; + val0 = vload4(0, grid + grid_off); + val1 = (j == 0) ? 
val0 : vload4(0, grid + grid_off + gcw); + res = wy.x * val0 + wy.y * val1; + + int gridTran_off = idh * gcw + idw * gc + (idx << 2); + vstore4(res, 0, gridTran + gridTran_off); +} diff --git a/compute/tensor/src/gpu/mali/cl/channel_resize.cl b/compute/tensor/src/gpu/mali/cl/channel_resize.cl new file mode 100644 index 00000000..f206cb91 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/channel_resize.cl @@ -0,0 +1,230 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#if defined(USE_NCHW) +#else +#endif + +#if defined(INPUT_NCHW) && defined(OUTPUT_NCHW) +#define LOAD_VAL(ew, ec, idx, idy, idz, ih_str, iw_str, ih_off, iw_off, buf, val) \ + { \ + int off = (idz * ih_str + idy + ih_off) * iw_str + (idx << 2) + iw_off; \ + val = 0; \ + if (ew == 4) { \ + val = vload4(0, buf + off); \ + } else { \ + if (ew == 1) { \ + val.x = buf[off]; \ + } \ + if (ew == 2) { \ + val.xy = vload2(0, buf + off); \ + } \ + if (ew == 3) { \ + val.xyz = vload3(0, buf + off); \ + } \ + } \ + } +#define STORE_VAL(ew, ec, idx, idy, idz, oh_str, ow_str, oh_off, ow_off, buf, val) \ + { \ + int off = (idz * oh_str + idy + oh_off) * ow_str + (idx << 2) + ow_off; \ + if (ew == 4) { \ + vstore4(val, 0, buf + off); \ + } else { \ + if (ew == 1) { \ + buf[off] = val.x; \ + } \ + if (ew == 2) { \ + vstore2((T2)(val.x, val.y), 0, buf + off); \ + } \ + if (ew == 3) { \ + vstore3((T3)(val.x, val.y, val.z), 0, buf + off); \ + } \ + } \ + } +#elif defined(INPUT_NCHW) && defined(OUTPUT_NCWHC4) +#define LOAD_VAL(ew, ec, idx, idy, idz, ih_str, iw_str, ih_off, iw_off, buf, val) \ + { \ + int off = ((idz << 2) * ih_str + idy + ih_off) * iw_str + (idx << 2) + iw_off; \ + int str = iw_str * ih_str; \ + if (ew == 4) { \ + val[0] = vload4(0, buf + off); \ + if (ec > 1) \ + val[1] = vload4(0, buf + off + str); \ + if (ec > 2) \ + val[2] = vload4(0, buf + off + str * 2); \ + if (ec > 3) \ + val[3] = vload4(0, buf + off + str * 3); \ + } else { \ + if (ew == 1) { \ + val[0].x = buf[off]; \ + if (ec > 1) \ + val[1].x = buf[off + str]; \ + if (ec > 2) \ + val[2].x = buf[off + str * 2]; \ + if (ec > 3) \ + val[3].x = buf[off + str * 3]; \ + } \ + if (ew == 2) { \ + val[0].xy = vload2(0, buf + off); \ + if (ec > 1) \ + val[1].xy = vload2(0, buf + off + str); \ + if (ec > 2) \ + val[2].xy = vload2(0, buf + off + str * 2); \ + if (ec > 3) \ + val[3].xy = vload2(0, buf + off + str * 3); \ + } \ + if (ew == 3) { \ + val[0].xyz = vload3(0, buf + off); \ + if (ec > 1) \ + val[1].xyz 
= vload3(0, buf + off + str); \ + if (ec > 2) \ + val[2].xyz = vload3(0, buf + off + str * 2); \ + if (ec > 3) \ + val[3].xyz = vload3(0, buf + off + str * 3); \ + } \ + } \ + } +#define STORE_VAL(ew, ec, idx, idy, idz, oh_str, ow_str, oh_off, ow_off, buf, val) \ + { \ + int off = (idz * ow_str + (idx << 2) + ow_off) * oh_str + idy + oh_off; \ + vstore4((T4)(val[0].x, val[1].x, val[2].x, val[3].x), off, buf); \ + if (ew > 1) \ + vstore4((T4)(val[0].y, val[1].y, val[2].y, val[3].y), off + oh_str, buf); \ + if (ew > 2) \ + vstore4((T4)(val[0].z, val[1].z, val[2].z, val[3].z), off + oh_str * 2, buf); \ + if (ew > 3) \ + vstore4((T4)(val[0].w, val[1].w, val[2].w, val[3].w), off + oh_str * 3, buf); \ + } +#elif defined(INPUT_NCWHC4) && defined(OUTPUT_NCHW) +#define LOAD_VAL(ew, ec, idx, idy, idz, ih_str, iw_str, ih_off, iw_off, buf, val) \ + { \ + int off = (idz * iw_str + (idy << 2) + iw_off) * ih_str + idx + ih_off; \ + val[0] = vload4(off, buf); \ + if (ew > 1) \ + val[1] = vload4(off + ih_str, buf); \ + if (ew > 2) \ + val[2] = vload4(off + ih_str * 2, buf); \ + if (ew > 3) \ + val[3] = vload4(off + ih_str * 3, buf); \ + } +#define STORE_VAL(ew, ec, idx, idy, idz, oh_str, ow_str, oh_off, ow_off, buf, val) \ + { \ + int off = ((idz << 2) * oh_str + idx + oh_off) * ow_str + (idy << 2) + ow_off; \ + int str = ow_str * oh_str; \ + if (ew == 4) { \ + vstore4((T4)(val[0].x, val[1].x, val[2].x, val[3].x), 0, buf + off); \ + if (ec > 1) \ + vstore4((T4)(val[0].y, val[1].y, val[2].y, val[3].y), 0, buf + off + str); \ + if (ec > 2) \ + vstore4((T4)(val[0].z, val[1].z, val[2].z, val[3].z), 0, buf + off + str * 2); \ + if (ec > 3) \ + vstore4((T4)(val[0].w, val[1].w, val[2].w, val[3].w), 0, buf + off + str * 3); \ + } else { \ + if (ew == 1) { \ + buf[off] = val[0].x; \ + if (ec > 1) \ + buf[off + str] = val[0].y; \ + if (ec > 2) \ + buf[off + str * 2] = val[0].z; \ + if (ec > 3) \ + buf[off + str * 3] = val[0].w; \ + } \ + if (ew == 2) { \ + vstore2((T2)(val[0].x, val[1].x), 0, buf + off); \ + if (ec > 1) \ + vstore2((T2)(val[0].y, val[1].y), 0, buf + off + str); \ + if (ec > 2) \ + vstore2((T2)(val[0].z, val[1].z), 0, buf + off + str * 2); \ + if (ec > 3) \ + vstore2((T2)(val[0].w, val[1].w), 0, buf + off + str * 3); \ + } \ + if (ew == 3) { \ + vstore3((T3)(val[0].x, val[1].x, val[2].x), 0, buf + off); \ + if (ec > 1) \ + vstore3((T3)(val[0].y, val[1].y, val[2].y), 0, buf + off + str); \ + if (ec > 2) \ + vstore3((T3)(val[0].z, val[1].z, val[2].z), 0, buf + off + str * 2); \ + if (ec > 3) \ + vstore3((T3)(val[0].w, val[1].w, val[2].w), 0, buf + off + str * 3); \ + } \ + } \ + } +#else +#define LOAD_VAL(ew, ec, idx, idy, idz, ih_str, iw_str, ih_off, iw_off, buf, val) \ + { \ + int off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; \ + val = vload4(off, buf); \ + } +#define STORE_VAL(ew, ec, idx, idy, idz, oh_str, ow_str, oh_off, ow_off, buf, val) \ + { \ + int off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; \ + vstore4(val, off, buf); \ + } +#endif + +__kernel void +#if defined(INPUT_NCHW) && defined(OUTPUT_NCHW) +channel_resize_nchw +#elif defined(INPUT_NCHW) && defined(OUTPUT_NCWHC4) +channel_resize_nchw_ncwhc4 +#elif defined(INPUT_NCWHC4) && defined(OUTPUT_NCHW) +channel_resize_ncwhc4_nchw +#else +channel_resize +#endif + (const int ih_str, + const int iw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int in_c, + const int out_c, + const int w, + const int bx, + const 
int by, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + + char ew = 0; + char ec = 0; +#if defined(INPUT_NCHW) && defined(OUTPUT_NCHW) + T4 val = 0; + ew = ((idx << 2) + 4 <= w) ? 4 : (w & 3); +#elif defined(INPUT_NCHW) && defined(OUTPUT_NCWHC4) + T4 val[4] = {0}; + ew = ((idx << 2) + 4 <= w) ? 4 : (w & 3); + ec = ((idz << 2) + 4 <= in_c) ? 4 : (in_c & 3); +#elif defined(INPUT_NCWHC4) && defined(OUTPUT_NCHW) + T4 val[4] = {0}; + ew = ((idy << 2) + 4 <= w) ? 4 : (w & 3); + ec = ((idz << 2) + 4 <= out_c) ? 4 : (out_c & 3); +#else + T4 val = 0; +#endif + + if (idz < ic_str) { + LOAD_VAL(ew, ec, idx, idy, idz, ih_str, iw_str, ih_off, iw_off, in, val); + } + STORE_VAL(ew, ec, idx, idy, idz, oh_str, ow_str, oh_off, ow_off, out, val); +} diff --git a/compute/tensor/src/gpu/mali/cl/check_int_spe.cl b/compute/tensor/src/gpu/mali/cl/check_int_spe.cl new file mode 100644 index 00000000..52155ed0 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/check_int_spe.cl @@ -0,0 +1,34 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +// check_int_spe: writes output[i + ow_off] = (inputA[i + aw_off] == inputB[i + bw_off]) ? 1 : 0 for each of the bx compared elements. +__kernel void check_int_spe(const int aw_off, + const int bw_off, + const int ow_off, + const int bx, + __global const int *inputA, + __global const int *inputB, + __global int *output) +{ + int idx = get_global_id(0); + if (idx >= bx) { + return; + } + int va = inputA[idx + aw_off]; + int vb = inputB[idx + bw_off]; + int res = 0; + if (va == vb) { + res = 1; + } + output[idx + ow_off] = res; +} diff --git a/tensor_computing/src/gpu/mali/cl/clip.cl b/compute/tensor/src/gpu/mali/cl/clip.cl similarity index 83% rename from tensor_computing/src/gpu/mali/cl/clip.cl rename to compute/tensor/src/gpu/mali/cl/clip.cl index b9cc497d..426a4b44 100644 --- a/tensor_computing/src/gpu/mali/cl/clip.cl +++ b/compute/tensor/src/gpu/mali/cl/clip.cl @@ -11,17 +11,27 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- - - - -__kernel void clip(const int h, const int w, const int ih_str, const int iw_str, const int ih_off, const int iw_off, - const int oh_str, const int ow_str, const int oh_off, const int ow_off, const float min_value, const float max_value, __global T* input, __global T* output) { - +__kernel void clip(const int h, + const int w, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const float min_value, + const float max_value, + __global T *input, + __global T *output) +{ int idx = get_global_id(0); int idy = get_global_id(1); int idz = get_global_id(2); - if(idx >= h || idy >= w) return; + if (idx >= h || idy >= w) { + return; + } T4 val; int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; diff --git a/compute/tensor/src/gpu/mali/cl/col2im.cl b/compute/tensor/src/gpu/mali/cl/col2im.cl new file mode 100644 index 00000000..0ad2e729 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/col2im.cl @@ -0,0 +1,78 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__kernel void col2im(const int ih, + const int iw, + const int ic, + const int kw, + const int kh, + const int pw, + const int ph, + const int sw, + const int sh, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int oh, + const int ow, + const int bx, + __global const T *bias, + __global const T *in, + __global T *out) +{ + const int index = get_global_id(0); + if (index >= bx) { + return; + } + const int idx = index % oh; + const int idy = (index % (ow * oh)) / oh; + const int idz = index / (ow * oh); + + const int pidx = idx + ph; + const int pidy = idy + pw; + + int sidw_i = pidy / sw; + int sidw_j = pidy % sw; + int in_wx = (sidw_i < iw) ? sidw_i : (iw - 1); + int in_wy = (sidw_i < iw) ? sidw_j : ((sidw_i - iw + 1) * sw + sidw_j); + int in_wl = (kw - in_wy + sw - 1) / sw; + if (in_wl > in_wx + 1) { + in_wl = in_wx + 1; + } + + int sidh_i = pidx / sh; + int sidh_j = pidx % sh; + int in_hx = (sidh_i < ih) ? sidh_i : (ih - 1); + int in_hy = (sidh_i < ih) ? 
sidh_j : ((sidh_i - ih + 1) * sh + sidh_j); + int in_hl = (kh - in_hy + sh - 1) / sh; + if (in_hl > in_hx + 1) { + in_hl = in_hx + 1; + } + + int in_off_w = ih * (in_wx + iw * kh * (in_wy + idz * kw)); + int in_str_w = ih * (iw * kh * sh - 1); + int in_off_h = in_hx + in_hy * ih * iw; + int in_str_h = ih * iw * sh - 1; + T4 sum = vload4(idz, bias); + + for (int i = 0; i < in_wl; i++) { + for (int j = 0; j < in_hl; j++) { + sum += vload4(in_off_w + in_off_h + j * in_str_h, in); + } + in_off_w += in_str_w; + } + + int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; + vstore4(sum, out_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/concat.cl b/compute/tensor/src/gpu/mali/cl/concat.cl new file mode 100644 index 00000000..ec7bd923 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/concat.cl @@ -0,0 +1,186 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
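+
+// concat: joins up to four input tensors along the W, H, or C axis. N (the
+// input count) and the axis macro (AXIS_W / AXIS_H / AXIS_C, or NON_ALIGN_C
+// for channel concatenation when the channel counts are not multiples of 4)
+// are expected to arrive as build options, so MANGLE_NAME below yields kernel
+// names such as concat_c2 for -DN=2 -DAXIS_C. Each work-item resolves which
+// input its axis index falls into (id_axis / idn), loads one T4 vector from
+// it, and writes it to the output buffer at offset out_size + out_off.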
+ +#define MANGLE_NAME_IMPL(base, N) base##N +#define MANGLE_NAME(base, N) MANGLE_NAME_IMPL(base, N) + +#define LOAD_VAL(idx, idy, idz, h_str, w_str, h_off, w_off, val, buf) \ + { \ + int off = (idz * w_str + idy + w_off) * h_str + idx + h_off; \ + val = vload4(off, buf); \ + } + +__kernel void +#if defined(NON_ALIGN_C) + MANGLE_NAME(concat_nonalign_c_p1_, N) +#else +#if defined(AXIS_W) + MANGLE_NAME(concat_w, N) +#elif defined(AXIS_H) + MANGLE_NAME(concat_h, N) +#elif defined(AXIS_C) + MANGLE_NAME(concat_c, N) +#endif +#endif + (const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int axis_max, + const int nmax, + const int out_size, + const int bx, + const int by, + const int ih_str0, + const int iw_str0, + const int ih_off0, + const int iw_off0, + const int ic0, + __global const T *in0, +#if (N > 1) + const int ih_str1, + const int iw_str1, + const int ih_off1, + const int iw_off1, + const int ic1, + const int axis_len_0, + __global const T *in1, +#endif +#if (N > 2) + const int ih_str2, + const int iw_str2, + const int ih_off2, + const int iw_off2, + const int ic2, + const int axis_len_1, + __global const T *in2, +#endif +#if (N > 3) + const int ih_str3, + const int iw_str3, + const int ih_off3, + const int iw_off3, + const int ic3, + const int axis_len_2, + __global const T *in3, +#endif + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } +#if defined(AXIS_W) + int id_axis = idy - axis_max; +#elif defined(AXIS_H) + int id_axis = idx - axis_max; +#elif defined(AXIS_C) + int id_axis = idz - axis_max; +#endif + int idn = nmax; +#if (N > 3) + if (id_axis < 0) { + id_axis += axis_len_2; + idn = 2; + } +#endif +#if (N > 2) + if (id_axis < 0) { + id_axis += axis_len_1; + idn = 1; + } +#endif +#if (N > 1) + if (id_axis < 0) { + id_axis += axis_len_0; + idn = 0; + } +#endif + T4 val; + int in_idx = idx; + int in_idy = idy; + int in_idz = idz; + +#if defined(AXIS_W) + in_idy = id_axis; +#elif defined(AXIS_H) + in_idx = id_axis; +#elif defined(AXIS_C) + in_idz = id_axis; +#endif + +#if defined(NON_ALIGN_C) + char ec = 4; + int out_off = id_axis * ohw_str * 4 + idy * oh_str + idx; +#else + int out_off = idz * ohw_str + (idy + ow_off) * oh_str + idx + oh_off; +#endif + if (idn == 0) { + LOAD_VAL(in_idx, in_idy, in_idz, ih_str0, iw_str0, ih_off0, iw_off0, val, in0); +#if defined(NON_ALIGN_C) + if (id_axis * 4 + 4 > ic0) { + ec = ic0 & 3; + } +#endif + } +#if (N > 1) + if (idn == 1) { + LOAD_VAL(in_idx, in_idy, in_idz, ih_str1, iw_str1, ih_off1, iw_off1, val, in1); +#if defined(NON_ALIGN_C) + out_off += ic0 * ohw_str; + if (id_axis * 4 + 4 > ic1) { + ec = ic1 & 3; + } +#endif + } +#endif +#if (N > 2) + if (idn == 2) { + LOAD_VAL(in_idx, in_idy, in_idz, ih_str2, iw_str2, ih_off2, iw_off2, val, in2); +#if defined(NON_ALIGN_C) + out_off += (ic0 + ic1) * ohw_str; + if (id_axis * 4 + 4 > ic2) { + ec = ic2 & 3; + } +#endif + } +#endif +#if (N > 3) + if (idn == 3) { + LOAD_VAL(in_idx, in_idy, in_idz, ih_str3, iw_str3, ih_off3, iw_off3, val, in3); +#if defined(NON_ALIGN_C) + out_off += (ic0 + ic1 + ic2) * ohw_str; + if (id_axis * 4 + 4 > ic3) { + ec = ic3 & 3; + } +#endif + } +#endif + +#if defined(NON_ALIGN_C) + out[out_size + out_off] = val.x; + if (ec > 1) { + out[out_size + out_off + ohw_str] = val.y; + } + if (ec > 2) { + out[out_size + out_off + ohw_str * 2] = val.z; + } + if (ec > 3) { + out[out_size + out_off + ohw_str * 3] = val.w; + } +#else 
+ vstore4(val, out_off, out + out_size); +#endif +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_depthwise_s1.cl b/compute/tensor/src/gpu/mali/cl/conv_depthwise_s1.cl new file mode 100644 index 00000000..90fbe82e --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_depthwise_s1.cl @@ -0,0 +1,87 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, F, ON) base##F##ON +#define MANGLE_NAME(base, F, ON) MANGLE_NAME_IMPL(base, F, ON) + +#if defined(USE_NCWH) +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_depthwise_s1_relu_ncwh_, F, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_depthwise_s1_relu6_ncwh_, F, ON) +#else +__kernel void MANGLE_NAME(conv_depthwise_s1_ncwh_, F, ON) +#endif +#else +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_depthwise_s1_relu_, F, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_depthwise_s1_relu6_, F, ON) +#else +__kernel void MANGLE_NAME(conv_depthwise_s1_, F, ON) +#endif +#endif + (const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int ow, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + + T4 in_val[IN]; + T4 flt_val; + T4 out_val[ON]; + + LOADBIAS_IMAGE_ARRAY_V4(out_val, idz, bias); + int in_off = idz * ihw_str + (idy * ON + iw_off) * ih_str + idx + ih_off; + int flt_off = idz * Fsq; + + for (uchar i = 0; i < F; ++i) { + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + i, ih_str, in); + for (uchar j = 0; j < F; ++j) { +#if defined(BASICE_REG) + in_val[LN] = vload4(in_off + i + (LN + j) * ih_str, in); +#endif + flt_val = vload4(flt_off + j, flt); + DEPTHWISE_CAL_CORE_S1(in_val, flt_val, out_val); + UPDATE_REG(in_val); + } + flt_off += F; + } +#if defined(USE_NCWH) + int out_off = (idz << 2) * ohw_str + (idy * ON + ow_off) * oh_str + idx + oh_off; + STORE_OUTPUT_BUF_ARRAY_V4_NCWH(out_val, out_off, oh_str, ohw_str, idy * ON, ow, out); +#else + int out_off = (idz * ow_str + idy * ON + ow_off) * oh_str + idx + oh_off; + STORE_OUTPUT_BUF_ARRAY_V4(out_val, out_off, oh_str, idy * ON, ow, out); +#endif +} diff --git 
a/compute/tensor/src/gpu/mali/cl/conv_depthwise_s2.cl b/compute/tensor/src/gpu/mali/cl/conv_depthwise_s2.cl new file mode 100644 index 00000000..39ce9d9b --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_depthwise_s2.cl @@ -0,0 +1,100 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, F, ON) base##F##ON +#define MANGLE_NAME(base, F, ON) MANGLE_NAME_IMPL(base, F, ON) + +#if defined(USE_NCWH) +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_depthwise_s2_relu_ncwh_, F, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_depthwise_s2_relu6_ncwh_, F, ON) +#else +__kernel void MANGLE_NAME(conv_depthwise_s2_ncwh_, F, ON) +#endif +#else +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_depthwise_s2_relu_, F, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_depthwise_s2_relu6_, F, ON) +#else +__kernel void MANGLE_NAME(conv_depthwise_s2_, F, ON) +#endif +#endif + (const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int ow, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + + T4 in_val[IN]; + T4 flt_val; + T4 out_val[ON]; + + LOADBIAS_IMAGE_ARRAY_V4(out_val, idz, bias); + int in_off = idz * ihw_str + ((idy << 1) * ON + iw_off) * ih_str + (idx << 1) + ih_off; + int flt_off = idz * Fsq; + for (uchar i = 0; i < F; ++i) { +#if defined(BASIC_REG) + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + i, (ih_str << 1), in); + for (uchar j = 0; j < F; j += 2) { + flt_val = vload4(flt_off + j, flt); + in_val[LN] = vload4(in_off + i + ((LN << 1) + j) * ih_str, in); + DEPTHWISE_CAL_CORE_S1(in_val, flt_val, out_val); + UPDATE_REG(in_val); + } + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + i + ih_str, (ih_str << 1), in); + for (uchar j = 1; j < F; j += 2) { + flt_val = vload4(flt_off + j, flt); + in_val[LN] = vload4(in_off + i + ((LN << 1) + j) * ih_str, in); + DEPTHWISE_CAL_CORE_S1(in_val, flt_val, out_val) + UPDATE_REG(in_val); + } +#else + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + i, ih_str, in); + for (uchar j = 0; j < F; ++j) { + flt_val = vload4(flt_off + j, flt); + 
DEPTHWISE_CAL_CORE_S2(in_val, flt_val, out_val); + UPDATE_REG(in_val); + } +#endif + flt_off += F; + } +#if defined(USE_NCWH) + int out_off = (idz << 2) * ohw_str + (idy * ON + ow_off) * oh_str + idx + oh_off; + STORE_OUTPUT_BUF_ARRAY_V4_NCWH(out_val, out_off, oh_str, ohw_str, idy * ON, ow, out); +#else + int out_off = (idz * ow_str + idy * ON + ow_off) * oh_str + idx + oh_off; + STORE_OUTPUT_BUF_ARRAY_V4(out_val, out_off, oh_str, idy * ON, ow, out); +#endif +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_depthwise_trans_fltbuf.cl b/compute/tensor/src/gpu/mali/cl/conv_depthwise_trans_fltbuf.cl new file mode 100644 index 00000000..77c712bc --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_depthwise_trans_fltbuf.cl @@ -0,0 +1,89 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define MANGLE_NAME_IMPL(base, K) base##K +#define MANGLE_NAME(base, K) MANGLE_NAME_IMPL(base, K) +#if (K == 4) +#define loadFltval(off, str, flt, val) \ + { \ + val.x = flt[off]; \ + val.y = flt[off + str]; \ + val.z = flt[off + (str << 1)]; \ + val.w = flt[off + str * 3]; \ + } + +#define loadFltvalEdge(off, str, flt, val, edge) \ + { \ + val.x = flt[off]; \ + if (edge > 1) \ + val.y = flt[off + str]; \ + if (edge > 2) \ + val.z = flt[off + (str << 1)]; \ + } +#endif + +#if (K == 8) +#define loadFltval(off, str, flt, val) \ + { \ + val.s0 = flt[off]; \ + val.s1 = flt[off + str]; \ + val.s2 = flt[off + (str << 1)]; \ + val.s3 = flt[off + str * 3]; \ + val.s4 = flt[off + (str << 2)]; \ + val.s5 = flt[off + str * 5]; \ + val.s6 = flt[off + str * 6]; \ + val.s7 = flt[off + str * 7]; \ + } +#define loadFltvalEdge(off, str, flt, val, edge) \ + { \ + val.s0 = flt[off]; \ + if (edge > 1) \ + val.s1 = flt[off + str]; \ + if (edge > 2) \ + val.s2 = flt[off + (str << 1)]; \ + if (edge > 3) \ + val.s3 = flt[off + str * 3]; \ + if (edge > 4) \ + val.s4 = flt[off + (str << 2)]; \ + if (edge > 5) \ + val.s5 = flt[off + str * 5]; \ + if (edge > 6) \ + val.s6 = flt[off + str * 6]; \ + } +#endif + +// conv_depthwise_trans_fltbuf: gathers K consecutive channels of the depthwise filter at each spatial position into one vec-K element; loadFltvalEdge covers the ragged tail when fn is not a multiple of K. +__kernel void MANGLE_NAME(conv_depthwise_trans_fltbuf_, K)( + const int fwh, const int fn, __global const T *fltdata, __global T *fltbuf) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int flt_off = idy * K * fwh + idx; + int ek = ((idy + 1) * K <= fn) ?
K : (fn % K); +#if (K == 4) + T4 val = 0; +#elif (K == 8) + T8 val = 0; +#endif + if (ek == K) { + loadFltval(flt_off, fwh, fltdata, val); + } else { + loadFltvalEdge(flt_off, fwh, fltdata, val, ek); + } + const int out_off = idy * fwh + idx; +#if (K == 4) + vstore4(val, out_off, fltbuf); +#elif (K == 8) + vstore8(val, out_off, fltbuf); +#endif +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_direct_3d_sw1_nchw_to_ncwhc4.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_3d_sw1_nchw_to_ncwhc4.cl new file mode 100644 index 00000000..40c37948 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_3d_sw1_nchw_to_ncwhc4.cl @@ -0,0 +1,170 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
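+
+// conv_direct_3d_sw1_nchw_to_ncwhc4: direct 3-D convolution with stride 1
+// along w, reading NCHW input and writing NCWHC4 output. FWH (spatial filter
+// size: 1, 3, 5 or 7), FT (temporal filter size) and ON (outputs per
+// work-item) are build options; MANGLE_NAME produces names such as
+// conv_direct_3d_sw1_nchw_to_ncwhc4_336 for FWH=3, FT=3, ON=6. The calCore*
+// macros slide across the lanes of a single T8 row load so it is reused for
+// all ON neighbouring outputs.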
+ +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, FWH, FT, ON) base##FWH##FT##ON +#define MANGLE_NAME(base, FWH, FT, ON) MANGLE_NAME_IMPL(base, FWH, FT, ON) + +#if (FWH == 1) +#define calCore(A, B, C) \ + { \ + C[0] += A.s0 * B; \ + C[1] += A.s1 * B; \ + C[2] += A.s2 * B; \ + C[3] += A.s3 * B; \ + C[4] += A.s4 * B; \ + C[5] += A.s5 * B; \ + C[6] += A.s6 * B; \ + C[7] += A.s7 * B; \ + } +#elif (FWH == 3) +#define calCore(a0, a1, a2, a3, a4, a5, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + C[4] += a4 * B; \ + C[5] += a5 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s1, A.s2, A.s3, A.s4, A.s5, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s2, A.s3, A.s4, A.s5, A.s6, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s3, A.s4, A.s5, A.s6, A.s7, B, C) +#elif (FWH == 5) +#define calCore(a0, a1, a2, a3, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s1, A.s2, A.s3, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s2, A.s3, A.s4, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s3, A.s4, A.s5, B, C) +#define calCore3(A, B, C) calCore(A.s3, A.s4, A.s5, A.s6, B, C) +#define calCore4(A, B, C) calCore(A.s4, A.s5, A.s6, A.s7, B, C) +#elif (FWH == 7) +#define calCore(a0, a1, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s1, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s2, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s3, B, C) +#define calCore3(A, B, C) calCore(A.s3, A.s4, B, C) +#define calCore4(A, B, C) calCore(A.s4, A.s5, B, C) +#define calCore5(A, B, C) calCore(A.s5, A.s6, B, C) +#define calCore6(A, B, C) calCore(A.s6, A.s7, B, C) +#endif + +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_direct_3d_sw1_nchw_to_ncwhc4_relu_, FWH, FT, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_3d_sw1_nchw_to_ncwhc4_relu6_, FWH, FT, ON) +#else +__kernel void MANGLE_NAME(conv_direct_3d_sw1_nchw_to_ncwhc4_, FWH, FT, ON) +#endif + (const int iw_str, + const int iwh_str, + const int ic_str, + const int iw_off, + const int ih_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int ow, + const int ot, + const int it, + const int pt, + const int sh, + const int st, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + const int idt = idz % ot; + const int idk = idz / ot; + + if (idx >= bx || idy >= by) { + return; + } + + T8 in_val; + T4 flt_val; + T4 out_val[ON]; + + LOADBIAS_IMAGE_ARRAY_V4(out_val, idk, bias); + int in_off = (idy * sh + ih_off) * iw_str + idx * ON + iw_off; + int flt_off = idk * ic_str * FWHT; + + int t_be = idt * st - pt; + int t_end = t_be + FT; + if (t_be < 0) { + t_be = 0; + flt_off += pt * FWH * FWH; + } + if (t_end > it) { + t_end = it; + } + + for (int i = 0; i < ic_str; ++i) { + for (int tt = t_be; tt < t_end; ++tt) { +#if (FWH == 1) + flt_val = vload4(flt_off, flt); + in_val = vload8(0, in + in_off + tt * iwh_str); + calCore(in_val, flt_val, out_val); + flt_off++; +#else + for (uchar j = 0; j < FWH; ++j) { + in_val = vload8(0, in + in_off + tt * iwh_str + j * iw_str); + for (uchar k = 0; k < FWH; ++k) { + flt_val = vload4(flt_off + k, flt); + if (k == 0) { + calCore0(in_val, flt_val, out_val); + } else if (k == 1) { + 
calCore1(in_val, flt_val, out_val); + } else if (k == 2) { + calCore2(in_val, flt_val, out_val); +#if (FWH >= 5) + } else if (k == 3) { + calCore3(in_val, flt_val, out_val); + } else if (k == 4) { + calCore4(in_val, flt_val, out_val); +#endif +#if (FWH == 7) + } else if (k == 5) { + calCore5(in_val, flt_val, out_val); + } else if (k == 6) { + calCore6(in_val, flt_val, out_val); +#endif + } + } + flt_off += FWH; + } +#endif + } + in_off += iwh_str * it; + } + + int xn = idx * ON; + int out_off = (idz * ow_str + xn + ow_off) * oh_str + idy + oh_off; + STORE_OUTPUT_BUF_ARRAY_V4(out_val, out_off, oh_str, xn, ow, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_direct_3d_sw2_nchw_to_ncwhc4.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_3d_sw2_nchw_to_ncwhc4.cl new file mode 100644 index 00000000..59917a11 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_3d_sw2_nchw_to_ncwhc4.cl @@ -0,0 +1,176 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
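+
+// conv_direct_3d_sw2_nchw_to_ncwhc4: the stride-2 (along w) counterpart of
+// the sw1 kernel above. One T16 row load covers the doubled input window and
+// the calCore* macros pick every second lane (s0, s2, s4, ...), so a single
+// load still feeds all ON outputs of a work-item.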
+ +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, FWH, FT, ON) base##FWH##FT##ON +#define MANGLE_NAME(base, FWH, FT, ON) MANGLE_NAME_IMPL(base, FWH, FT, ON) + +#if (FWH == 1) +#define calCore(A, B, C) \ + { \ + C[0] += A.s0 * B; \ + C[1] += A.s2 * B; \ + C[2] += A.s4 * B; \ + C[3] += A.s6 * B; \ + C[4] += A.s8 * B; \ + C[5] += A.sa * B; \ + C[6] += A.sc * B; \ + C[7] += A.se * B; \ + } +#elif (FWH == 3) +#define calCore(a0, a1, a2, a3, a4, a5, a6, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + C[4] += a4 * B; \ + C[5] += a5 * B; \ + C[6] += a6 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s2, A.s4, A.s6, A.s8, A.sa, A.sc, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s3, A.s5, A.s7, A.s9, A.sb, A.sd, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s4, A.s6, A.s8, A.sa, A.sc, A.se, B, C) +#elif (FWH == 5) +#define calCore(a0, a1, a2, a3, a4, a5, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + C[4] += a4 * B; \ + C[5] += a5 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s2, A.s4, A.s6, A.s8, A.sa, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s3, A.s5, A.s7, A.s9, A.sb, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s4, A.s6, A.s8, A.sa, A.sc, B, C) +#define calCore3(A, B, C) calCore(A.s3, A.s5, A.s7, A.s9, A.sb, A.sd, B, C) +#define calCore4(A, B, C) calCore(A.s4, A.s6, A.s8, A.sa, A.sc, A.se, B, C) +#elif (FWH == 7) +#define calCore(a0, a1, a2, a3, a4, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + C[4] += a4 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s2, A.s4, A.s6, A.s8, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s3, A.s5, A.s7, A.s9, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s4, A.s6, A.s8, A.sa, B, C) +#define calCore3(A, B, C) calCore(A.s3, A.s5, A.s7, A.s9, A.sb, B, C) +#define calCore4(A, B, C) calCore(A.s4, A.s6, A.s8, A.sa, A.sc, B, C) +#define calCore5(A, B, C) calCore(A.s5, A.s7, A.s9, A.sb, A.sd, B, C) +#define calCore6(A, B, C) calCore(A.s6, A.s8, A.sa, A.sc, A.se, B, C) +#endif + +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_direct_3d_sw2_nchw_to_ncwhc4_relu_, FWH, FT, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_3d_sw2_nchw_to_ncwhc4_relu6_, FWH, FT, ON) +#else +__kernel void MANGLE_NAME(conv_direct_3d_sw2_nchw_to_ncwhc4_, FWH, FT, ON) +#endif + (const int iw_str, + const int iwh_str, + const int ic_str, + const int iw_off, + const int ih_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int ow, + const int ot, + const int it, + const int pt, + const int sh, + const int st, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + const int idt = idz % ot; + const int idk = idz / ot; + + if (idx >= bx || idy >= by) { + return; + } + + T16 in_val; + T4 flt_val; + T4 out_val[ON]; + + LOADBIAS_IMAGE_ARRAY_V4(out_val, idk, bias); + int in_off = (idy * sh + ih_off) * iw_str + (idx << 1) * ON + iw_off; + int flt_off = idk * ic_str * FWHT; + + int t_be = idt * st - pt; + int t_end = t_be + FT; + if (t_be < 0) { + t_be = 0; + flt_off += pt * FWH * FWH; + } + if (t_end > it) { + t_end = it; + } + + for (int i = 0; i < ic_str; ++i) { + for (int tt = t_be; tt < t_end; ++tt) { +#if (FWH == 1) + flt_val = vload4(flt_off, flt); + 
in_val = vload16(0, in + in_off + tt * iwh_str); + calCore(in_val, flt_val, out_val); + flt_off++; +#else + for (uchar j = 0; j < FWH; ++j) { + in_val = vload16(0, in + in_off + tt * iwh_str + j * iw_str); + for (uchar k = 0; k < FWH; ++k) { + flt_val = vload4(flt_off + k, flt); + if (k == 0) { + calCore0(in_val, flt_val, out_val); + } else if (k == 1) { + calCore1(in_val, flt_val, out_val); + } else if (k == 2) { + calCore2(in_val, flt_val, out_val); +#if (FWH >= 5) + } else if (k == 3) { + calCore3(in_val, flt_val, out_val); + } else if (k == 4) { + calCore4(in_val, flt_val, out_val); +#endif +#if (FWH == 7) + } else if (k == 5) { + calCore5(in_val, flt_val, out_val); + } else if (k == 6) { + calCore6(in_val, flt_val, out_val); +#endif + } + } + flt_off += FWH; + } +#endif + } + in_off += iwh_str * it; + } + + int xn = idx * ON; + int out_off = (idz * ow_str + xn + ow_off) * oh_str + idy + oh_off; + STORE_OUTPUT_BUF_ARRAY_V4(out_val, out_off, oh_str, xn, ow, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_direct_s1.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_s1.cl new file mode 100644 index 00000000..a8ba5bf7 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_s1.cl @@ -0,0 +1,345 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
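+
+// conv_direct_s1: direct convolution with stride 1. F (filter size), ON
+// (outputs per work-item) and KN (blocks of 4 output channels per work-item)
+// are build options. With REUSE_H the ON outputs are tiled along h and moved
+// as whole T8/T16 vectors per load and store; otherwise they are tiled along
+// w via the LOAD_INPUT_BUF_ARRAY_V4 / STORE_OUTPUT_BUF_ARRAY_V4 helpers from
+// kernel_def.h, and USE_ELTWISE_NCWHC4 optionally fuses an element-wise add
+// of eltVal into the store.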
+ +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, F, ON, KN) base##F##ON##KN +#define MANGLE_NAME(base, F, ON, KN) MANGLE_NAME_IMPL(base, F, ON, KN) + +#if defined(REUSE_H) +#if (ON == 2) +#define SET_BIAS_VAL(bv, ov) \ + { \ + ov.s0 = bv.x; \ + ov.s1 = bv.y; \ + ov.s2 = bv.z; \ + ov.s3 = bv.w; \ + ov.s4 = bv.x; \ + ov.s5 = bv.y; \ + ov.s6 = bv.z; \ + ov.s7 = bv.w; \ + } + +#define calCore(iv, fv, ov) \ + { \ + ov.s0 += iv.s0 * fv.s0 + iv.s1 * fv.s1 + iv.s2 * fv.s2 + iv.s3 * fv.s3; \ + ov.s1 += iv.s0 * fv.s4 + iv.s1 * fv.s5 + iv.s2 * fv.s6 + iv.s3 * fv.s7; \ + ov.s2 += iv.s0 * fv.s8 + iv.s1 * fv.s9 + iv.s2 * fv.sa + iv.s3 * fv.sb; \ + ov.s3 += iv.s0 * fv.sc + iv.s1 * fv.sd + iv.s2 * fv.se + iv.s3 * fv.sf; \ + ov.s4 += iv.s4 * fv.s0 + iv.s5 * fv.s1 + iv.s6 * fv.s2 + iv.s7 * fv.s3; \ + ov.s5 += iv.s4 * fv.s4 + iv.s5 * fv.s5 + iv.s6 * fv.s6 + iv.s7 * fv.s7; \ + ov.s6 += iv.s4 * fv.s8 + iv.s5 * fv.s9 + iv.s6 * fv.sa + iv.s7 * fv.sb; \ + ov.s7 += iv.s4 * fv.sc + iv.s5 * fv.sd + iv.s6 * fv.se + iv.s7 * fv.sf; \ + } + +#define VLOAD_VEC(off, buf) vload8(0, buf + off); +#define VSTORE_VEC(v, off, buf) \ + { \ + ACTIVATION_V8(v); \ + vstore8(v, 0, buf + off); \ + } +#elif (ON == 4) +#define SET_BIAS_VAL(bv, ov) \ + { \ + ov.s0 = bv.x; \ + ov.s1 = bv.y; \ + ov.s2 = bv.z; \ + ov.s3 = bv.w; \ + ov.s4 = bv.x; \ + ov.s5 = bv.y; \ + ov.s6 = bv.z; \ + ov.s7 = bv.w; \ + ov.s8 = bv.x; \ + ov.s9 = bv.y; \ + ov.sa = bv.z; \ + ov.sb = bv.w; \ + ov.sc = bv.x; \ + ov.sd = bv.y; \ + ov.se = bv.z; \ + ov.sf = bv.w; \ + } +#define calCore(iv, fv, ov) \ + { \ + ov.s0 += iv.s0 * fv.s0 + iv.s1 * fv.s1 + iv.s2 * fv.s2 + iv.s3 * fv.s3; \ + ov.s1 += iv.s0 * fv.s4 + iv.s1 * fv.s5 + iv.s2 * fv.s6 + iv.s3 * fv.s7; \ + ov.s2 += iv.s0 * fv.s8 + iv.s1 * fv.s9 + iv.s2 * fv.sa + iv.s3 * fv.sb; \ + ov.s3 += iv.s0 * fv.sc + iv.s1 * fv.sd + iv.s2 * fv.se + iv.s3 * fv.sf; \ + ov.s4 += iv.s4 * fv.s0 + iv.s5 * fv.s1 + iv.s6 * fv.s2 + iv.s7 * fv.s3; \ + ov.s5 += iv.s4 * fv.s4 + iv.s5 * fv.s5 + iv.s6 * fv.s6 + iv.s7 * fv.s7; \ + ov.s6 += iv.s4 * fv.s8 + iv.s5 * fv.s9 + iv.s6 * fv.sa + iv.s7 * fv.sb; \ + ov.s7 += iv.s4 * fv.sc + iv.s5 * fv.sd + iv.s6 * fv.se + iv.s7 * fv.sf; \ + ov.s8 += iv.s8 * fv.s0 + iv.s9 * fv.s1 + iv.sa * fv.s2 + iv.sb * fv.s3; \ + ov.s9 += iv.s8 * fv.s4 + iv.s9 * fv.s5 + iv.sa * fv.s6 + iv.sb * fv.s7; \ + ov.sa += iv.s8 * fv.s8 + iv.s9 * fv.s9 + iv.sa * fv.sa + iv.sb * fv.sb; \ + ov.sb += iv.s8 * fv.sc + iv.s9 * fv.sd + iv.sa * fv.se + iv.sb * fv.sf; \ + ov.sc += iv.sc * fv.s0 + iv.sd * fv.s1 + iv.se * fv.s2 + iv.sf * fv.s3; \ + ov.sd += iv.sc * fv.s4 + iv.sd * fv.s5 + iv.se * fv.s6 + iv.sf * fv.s7; \ + ov.se += iv.sc * fv.s8 + iv.sd * fv.s9 + iv.se * fv.sa + iv.sf * fv.sb; \ + ov.sf += iv.sc * fv.sc + iv.sd * fv.sd + iv.se * fv.se + iv.sf * fv.sf; \ + } + +#define VLOAD_VEC(off, buf) vload16(0, buf + off); +#define VSTORE_VEC(v, off, buf) \ + { \ + ACTIVATION_V16(v); \ + vstore16(v, 0, buf + off); \ + } +#endif +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_direct_s1_h_relu_, F, ON, KN) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_s1_h_relu6_, F, ON, KN) +#elif defined(USE_GELU) +__kernel void MANGLE_NAME(conv_direct_s1_h_gelu_, F, ON, KN) +#else +__kernel void MANGLE_NAME(conv_direct_s1_h_, F, ON, KN) +#endif + (const int ih_str, + int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + int ohw_str, + const int oh_off, + const int ow_off, + const int oh, + const int sw, + const int bx, + const int by, + __global const T *in, + __global 
const T *flt, + __read_only image1d_t bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } +#if (ON == 2) + T8 in_val; + T8 out_val[KN]; +#elif (ON == 4) + T16 in_val; + T16 out_val[KN]; +#endif + T16 flt_val; + T4 bias_val = read_imageh(bias, sampler, idz * KN); + SET_BIAS_VAL(bias_val, out_val[0]); +#if (KN > 1) + bias_val = read_imageh(bias, sampler, idz * KN + 1); + SET_BIAS_VAL(bias_val, out_val[1]); +#endif +#if (KN > 2) + bias_val = read_imageh(bias, sampler, idz * KN + 2); + SET_BIAS_VAL(bias_val, out_val[2]); + bias_val = read_imageh(bias, sampler, idz * KN + 3); + SET_BIAS_VAL(bias_val, out_val[3]); +#endif + int in_off = ((idy + iw_off) * ih_str + idx * ON + ih_off) << 2; + int flt_off = idz * ic_str * Fsq * KN; + ihw_str = ihw_str << 2; + + for (int i = 0; i < ic_str; ++i) { + in_val = VLOAD_VEC(in_off, in); +#if (KN == 1) + flt_val = vload16(flt_off, flt); + calCore(in_val, flt_val, out_val[0]); +#elif (KN == 2) + flt_val = vload16(flt_off, flt); + calCore(in_val, flt_val, out_val[0]); + flt_val = vload16(flt_off + 1, flt); + calCore(in_val, flt_val, out_val[1]); +#elif (KN == 4) + for (uchar j = 0; j < KN; ++j) { + flt_val = vload16(flt_off + j, flt); + if (j == 0) { + calCore(in_val, flt_val, out_val[0]); + } + if (j == 1) { + calCore(in_val, flt_val, out_val[1]); + } + if (j == 2) { + calCore(in_val, flt_val, out_val[2]); + } + if (j == 3) { + calCore(in_val, flt_val, out_val[3]); + } + } +#endif + flt_off += KN; + in_off += ihw_str; + } + int out_off = (idz * KN * ohw_str + (idy + ow_off) * oh_str + idx * ON + oh_off) << 2; + VSTORE_VEC(out_val[0], out_off, out); + +#if (KN > 1) + ohw_str = ohw_str << 2; + out_off += ohw_str; + VSTORE_VEC(out_val[1], out_off, out); +#endif + +#if (KN > 2) + out_off += ohw_str; + VSTORE_VEC(out_val[2], out_off, out); + out_off += ohw_str; + VSTORE_VEC(out_val[3], out_off, out); +#endif +} + +// // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // / +#else +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_direct_s1_relu_, F, ON, KN) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_s1_relu6_, F, ON, KN) +#elif defined(USE_GELU) +__kernel void MANGLE_NAME(conv_direct_s1_gelu_, F, ON, KN) +#elif defined(USE_ELTWISE_NCWHC4) +__kernel void MANGLE_NAME(conv_direct_s1_eltwise4_, F, ON, KN) +#else +__kernel void MANGLE_NAME(conv_direct_s1_, F, ON, KN) +#endif + (const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int ow, + const int sh, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out +#if defined(USE_ELTWISE_NCWHC4) + , + const int eh_str, + const int ehw_str, + const int eh_off, + const int ew_off, + __global const T *eltVal +#endif + ) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + T4 in_val[IN]; + T16 flt_val; + T4 out_val[KN][ON]; + LOADBIAS_IMAGE_ARRAY_V4(out_val[0], idz * KN, bias); +#if (KN > 1) + LOADBIAS_IMAGE_ARRAY_V4(out_val[1], idz * KN + 1, bias); +#endif +#if (KN > 2) + 
LOADBIAS_IMAGE_ARRAY_V4(out_val[2], idz * KN + 2, bias); + LOADBIAS_IMAGE_ARRAY_V4(out_val[3], idz * KN + 3, bias); +#endif + + int in_off = (idy * ON + iw_off) * ih_str + idx * sh + ih_off; + int flt_off = idz * ic_str * Fsq * KN; + + for (int i = 0; i < ic_str; ++i) { +#if (F == 1) + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off, ih_str, in); +#if (KN == 1) + flt_val = vload16(flt_off, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); +#elif (KN == 2) + flt_val = vload16(flt_off, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); + flt_val = vload16(flt_off + 1, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); +#elif (KN == 4) + for (uchar j = 0; j < KN; ++j) { + flt_val = vload16(flt_off + j, flt); + if (j == 0) { + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); + } + if (j == 1) { + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); + } + if (j == 2) { + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]); + } + if (j == 3) { + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[3]); + } + } +#endif + flt_off += KN; +#else + for (uchar j = 0; j < F; ++j) { + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + j, ih_str, in); + for (uchar k = 0; k < F; ++k) { +#if defined(BASICE_REG) + in_val[LN] = vload4(in_off + j + (LN + k) * ih_str, in); +#endif + flt_val = vload16(flt_off + k * KN, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); +#if (KN > 1) + flt_val = vload16(flt_off + k * KN + 1, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); +#endif +#if (KN > 2) + flt_val = vload16(flt_off + k * KN + 2, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]); + flt_val = vload16(flt_off + k * KN + 3, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[3]); +#endif + UPDATE_REG(in_val); + } + flt_off += F * KN; + } +#endif + in_off += ihw_str; + } + +#if defined(USE_ELTWISE_NCWHC4) + int elt_off = idz * KN * ehw_str + (idy * ON + ew_off) * eh_str + idx + eh_off; + ADD_ELTWISE_BUF_ARRAY_V4(out_val[0], elt_off, eh_str, eltVal); +#endif + int out_off = idz * KN * ohw_str + (idy * ON + ow_off) * oh_str + idx + oh_off; + STORE_OUTPUT_BUF_ARRAY_V4(out_val[0], out_off, oh_str, idy * ON, ow, out); +#if (KN > 1) +#if defined(USE_ELTWISE_NCWHC4) + elt_off += ehw_str; + ADD_ELTWISE_BUF_ARRAY_V4(out_val[1], elt_off, eh_str, eltVal); +#endif + out_off += ohw_str; + STORE_OUTPUT_BUF_ARRAY_V4(out_val[1], out_off, oh_str, idy * ON, ow, out); +#endif +#if (KN > 2) +#if defined(USE_ELTWISE_NCWHC4) + elt_off += ehw_str; + ADD_ELTWISE_BUF_ARRAY_V4(out_val[2], elt_off, eh_str, eltVal); + elt_off += ehw_str; + ADD_ELTWISE_BUF_ARRAY_V4(out_val[3], elt_off, eh_str, eltVal); +#endif + out_off += ohw_str; + STORE_OUTPUT_BUF_ARRAY_V4(out_val[2], out_off, oh_str, idy * ON, ow, out); + out_off += ohw_str; + STORE_OUTPUT_BUF_ARRAY_V4(out_val[3], out_off, oh_str, idy * ON, ow, out); +#endif +} +#endif diff --git a/compute/tensor/src/gpu/mali/cl/conv_direct_s1_fn_spe.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_s1_fn_spe.cl new file mode 100644 index 00000000..f54507ac --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_s1_fn_spe.cl @@ -0,0 +1,503 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, F, ON) base##F##ON +#define MANGLE_NAME(base, F, ON) MANGLE_NAME_IMPL(base, F, ON) + +#if (ON == 3) +#define LOAD_BIAS(ov, bias) \ + { \ + ov[0] = bias[0]; \ + ov[1] = ov[0]; \ + ov[2] = ov[0]; \ + } +#define CALCORE(iv, fv, ov) \ + { \ + ov[0] += iv[0].x * fv.x + iv[0].y * fv.y + iv[0].z * fv.z + iv[0].w * fv.w; \ + ov[1] += iv[1].x * fv.x + iv[1].y * fv.y + iv[1].z * fv.z + iv[1].w * fv.w; \ + ov[2] += iv[2].x * fv.x + iv[2].y * fv.y + iv[2].z * fv.z + iv[2].w * fv.w; \ + } +#endif + +#if (ON == 4) +#define LOAD_BIAS(ov, bias) \ + { \ + ov[0] = bias[0]; \ + ov[1] = ov[0]; \ + ov[2] = ov[0]; \ + ov[3] = ov[0]; \ + } +#define CALCORE(iv, fv, ov) \ + { \ + ov[0] += iv[0].x * fv.x + iv[0].y * fv.y + iv[0].z * fv.z + iv[0].w * fv.w; \ + ov[1] += iv[1].x * fv.x + iv[1].y * fv.y + iv[1].z * fv.z + iv[1].w * fv.w; \ + ov[2] += iv[2].x * fv.x + iv[2].y * fv.y + iv[2].z * fv.z + iv[2].w * fv.w; \ + ov[3] += iv[3].x * fv.x + iv[3].y * fv.y + iv[3].z * fv.z + iv[3].w * fv.w; \ + } +#endif + +#if (ON == 5) +#define LOAD_BIAS(ov, bias) \ + { \ + ov[0] = bias[0]; \ + ov[1] = ov[0]; \ + ov[2] = ov[0]; \ + ov[3] = ov[0]; \ + ov[4] = ov[0]; \ + } +#define CALCORE(iv, fv, ov) \ + { \ + ov[0] += iv[0].x * fv.x + iv[0].y * fv.y + iv[0].z * fv.z + iv[0].w * fv.w; \ + ov[1] += iv[1].x * fv.x + iv[1].y * fv.y + iv[1].z * fv.z + iv[1].w * fv.w; \ + ov[2] += iv[2].x * fv.x + iv[2].y * fv.y + iv[2].z * fv.z + iv[2].w * fv.w; \ + ov[3] += iv[3].x * fv.x + iv[3].y * fv.y + iv[3].z * fv.z + iv[3].w * fv.w; \ + ov[4] += iv[4].x * fv.x + iv[4].y * fv.y + iv[4].z * fv.z + iv[4].w * fv.w; \ + } +#endif + +#if (ON == 6) +#define LOAD_BIAS(ov, bias) \ + { \ + ov[0] = bias[0]; \ + ov[1] = ov[0]; \ + ov[2] = ov[0]; \ + ov[3] = ov[0]; \ + ov[4] = ov[0]; \ + ov[5] = ov[0]; \ + } +#define CALCORE(iv, fv, ov) \ + { \ + ov[0] += iv[0].x * fv.x + iv[0].y * fv.y + iv[0].z * fv.z + iv[0].w * fv.w; \ + ov[1] += iv[1].x * fv.x + iv[1].y * fv.y + iv[1].z * fv.z + iv[1].w * fv.w; \ + ov[2] += iv[2].x * fv.x + iv[2].y * fv.y + iv[2].z * fv.z + iv[2].w * fv.w; \ + ov[3] += iv[3].x * fv.x + iv[3].y * fv.y + iv[3].z * fv.z + iv[3].w * fv.w; \ + ov[4] += iv[4].x * fv.x + iv[4].y * fv.y + iv[4].z * fv.z + iv[4].w * fv.w; \ + ov[5] += iv[5].x * fv.x + iv[5].y * fv.y + iv[5].z * fv.z + iv[5].w * fv.w; \ + } +#endif + +#if (ON == 7) +#define LOAD_BIAS(ov, bias) \ + { \ + ov[0] = bias[0]; \ + ov[1] = ov[0]; \ + ov[2] = ov[0]; \ + ov[3] = ov[0]; \ + 
ov[4] = ov[0]; \ + ov[5] = ov[0]; \ + ov[6] = ov[0]; \ + } +#define CALCORE(iv, fv, ov) \ + { \ + ov[0] += iv[0].x * fv.x + iv[0].y * fv.y + iv[0].z * fv.z + iv[0].w * fv.w; \ + ov[1] += iv[1].x * fv.x + iv[1].y * fv.y + iv[1].z * fv.z + iv[1].w * fv.w; \ + ov[2] += iv[2].x * fv.x + iv[2].y * fv.y + iv[2].z * fv.z + iv[2].w * fv.w; \ + ov[3] += iv[3].x * fv.x + iv[3].y * fv.y + iv[3].z * fv.z + iv[3].w * fv.w; \ + ov[4] += iv[4].x * fv.x + iv[4].y * fv.y + iv[4].z * fv.z + iv[4].w * fv.w; \ + ov[5] += iv[5].x * fv.x + iv[5].y * fv.y + iv[5].z * fv.z + iv[5].w * fv.w; \ + ov[6] += iv[6].x * fv.x + iv[6].y * fv.y + iv[6].z * fv.z + iv[6].w * fv.w; \ + } +#endif + +#if (ON == 8) +#define LOAD_BIAS(ov, bias) \ + { \ + ov[0] = bias[0]; \ + ov[1] = ov[0]; \ + ov[2] = ov[0]; \ + ov[3] = ov[0]; \ + ov[4] = ov[0]; \ + ov[5] = ov[0]; \ + ov[6] = ov[0]; \ + ov[7] = ov[0]; \ + } +#define CALCORE(iv, fv, ov) \ + { \ + ov[0] += iv[0].x * fv.x + iv[0].y * fv.y + iv[0].z * fv.z + iv[0].w * fv.w; \ + ov[1] += iv[1].x * fv.x + iv[1].y * fv.y + iv[1].z * fv.z + iv[1].w * fv.w; \ + ov[2] += iv[2].x * fv.x + iv[2].y * fv.y + iv[2].z * fv.z + iv[2].w * fv.w; \ + ov[3] += iv[3].x * fv.x + iv[3].y * fv.y + iv[3].z * fv.z + iv[3].w * fv.w; \ + ov[4] += iv[4].x * fv.x + iv[4].y * fv.y + iv[4].z * fv.z + iv[4].w * fv.w; \ + ov[5] += iv[5].x * fv.x + iv[5].y * fv.y + iv[5].z * fv.z + iv[5].w * fv.w; \ + ov[6] += iv[6].x * fv.x + iv[6].y * fv.y + iv[6].z * fv.z + iv[6].w * fv.w; \ + ov[7] += iv[7].x * fv.x + iv[7].y * fv.y + iv[7].z * fv.z + iv[7].w * fv.w; \ + } +#endif + +#if defined(USE_NCHW) +#if (ON == 3) +#define STORE_OUT(ov, off, id, ow, buf) \ + { \ + ACTIVATION_ARRAY3(ov); \ + if (id + 3 < ow) { \ + STORE_BUF_ARRAY3(ov, off, buf); \ + } else { \ + buf[off] = ov[0]; \ + if (id + 1 < ow) \ + buf[off + 1] = ov[1]; \ + if (id + 2 < ow) \ + buf[off + 2] = ov[2]; \ + } \ + } +#endif +#if (ON == 4) +#define STORE_OUT(ov, off, id, ow, buf) \ + { \ + ACTIVATION_ARRAY4(ov); \ + if (id + 4 < ow) { \ + STORE_BUF_ARRAY4(ov, off, buf); \ + } else { \ + buf[off] = ov[0]; \ + if (id + 1 < ow) \ + buf[off + 1] = ov[1]; \ + if (id + 2 < ow) \ + buf[off + 2] = ov[2]; \ + if (id + 3 < ow) \ + buf[off + 3] = ov[3]; \ + } \ + } +#endif +#if (ON == 5) +#define STORE_OUT(ov, off, id, ow, buf) \ + { \ + ACTIVATION_ARRAY5(ov); \ + if (id + 5 < ow) { \ + STORE_BUF_ARRAY5(ov, off, buf); \ + } else { \ + buf[off] = ov[0]; \ + if (id + 1 < ow) \ + buf[off + 1] = ov[1]; \ + if (id + 2 < ow) \ + buf[off + 2] = ov[2]; \ + if (id + 3 < ow) \ + buf[off + 3] = ov[3]; \ + if (id + 4 < ow) \ + buf[off + 4] = ov[4]; \ + } \ + } +#endif +#if (ON == 6) +#define STORE_OUT(ov, off, id, ow, buf) \ + { \ + ACTIVATION_ARRAY6(ov); \ + if (id + 6 < ow) { \ + STORE_BUF_ARRAY6(ov, off, buf); \ + } else { \ + buf[off] = ov[0]; \ + if (id + 1 < ow) \ + buf[off + 1] = ov[1]; \ + if (id + 2 < ow) \ + buf[off + 2] = ov[2]; \ + if (id + 3 < ow) \ + buf[off + 3] = ov[3]; \ + if (id + 4 < ow) \ + buf[off + 4] = ov[4]; \ + if (id + 5 < ow) \ + buf[off + 5] = ov[5]; \ + } \ + } +#endif +#if (ON == 7) +#define STORE_OUT(ov, off, id, ow, buf) \ + { \ + ACTIVATION_ARRAY7(ov); \ + if (id + 7 < ow) { \ + STORE_BUF_ARRAY7(ov, off, buf); \ + } else { \ + buf[off] = ov[0]; \ + if (id + 1 < ow) \ + buf[off + 1] = ov[1]; \ + if (id + 2 < ow) \ + buf[off + 2] = ov[2]; \ + if (id + 3 < ow) \ + buf[off + 3] = ov[3]; \ + if (id + 4 < ow) \ + buf[off + 4] = ov[4]; \ + if (id + 5 < ow) \ + buf[off + 5] = ov[5]; \ + if (id + 6 < ow) \ + buf[off + 6] = ov[6]; \ + } \ + } +#endif +#if 
(ON == 8) +#define STORE_OUT(ov, off, id, ow, buf) \ + { \ + ACTIVATION_ARRAY8(ov); \ + if (id + 8 < ow) { \ + STORE_BUF_ARRAY8(ov, off, buf); \ + } else { \ + buf[off] = ov[0]; \ + if (id + 1 < ow) \ + buf[off + 1] = ov[1]; \ + if (id + 2 < ow) \ + buf[off + 2] = ov[2]; \ + if (id + 3 < ow) \ + buf[off + 3] = ov[3]; \ + if (id + 4 < ow) \ + buf[off + 4] = ov[4]; \ + if (id + 5 < ow) \ + buf[off + 5] = ov[5]; \ + if (id + 6 < ow) \ + buf[off + 6] = ov[6]; \ + if (id + 7 < ow) \ + buf[off + 7] = ov[7]; \ + } \ + } +#endif +#else +#if (ON == 3) +#define STORE_OUT(ov, off, str, id, ow, buf) \ + { \ + ACTIVATION_ARRAY3(ov); \ + T4 tmp = 0; \ + tmp.x = ov[0]; \ + vstore4(tmp, off, buf); \ + if (id + 1 < ow) { \ + tmp.x = ov[1]; \ + vstore4(tmp, off + str, buf); \ + } \ + if (id + 2 < ow) { \ + tmp.x = ov[2]; \ + vstore4(tmp, off + 2 * str, buf); \ + } \ + } +#endif +#if (ON == 4) +#define STORE_OUT(ov, off, str, id, ow, buf) \ + { \ + ACTIVATION_ARRAY4(ov); \ + T4 tmp = 0; \ + tmp.x = ov[0]; \ + vstore4(tmp, off, buf); \ + if (id + 1 < ow) { \ + tmp.x = ov[1]; \ + vstore4(tmp, off + str, buf); \ + } \ + if (id + 2 < ow) { \ + tmp.x = ov[2]; \ + vstore4(tmp, off + 2 * str, buf); \ + } \ + if (id + 3 < ow) { \ + tmp.x = ov[3]; \ + vstore4(tmp, off + 3 * str, buf); \ + } \ + } +#endif +#if (ON == 5) +#define STORE_OUT(ov, off, str, id, ow, buf) \ + { \ + ACTIVATION_ARRAY5(ov); \ + T4 tmp = 0; \ + tmp.x = ov[0]; \ + vstore4(tmp, off, buf); \ + if (id + 1 < ow) { \ + tmp.x = ov[1]; \ + vstore4(tmp, off + str, buf); \ + } \ + if (id + 2 < ow) { \ + tmp.x = ov[2]; \ + vstore4(tmp, off + 2 * str, buf); \ + } \ + if (id + 3 < ow) { \ + tmp.x = ov[3]; \ + vstore4(tmp, off + 3 * str, buf); \ + } \ + if (id + 4 < ow) { \ + tmp.x = ov[4]; \ + vstore4(tmp, off + 4 * str, buf); \ + } \ + } +#endif +#if (ON == 6) +#define STORE_OUT(ov, off, str, id, ow, buf) \ + { \ + ACTIVATION_ARRAY6(ov); \ + T4 tmp = 0; \ + tmp.x = ov[0]; \ + vstore4(tmp, off, buf); \ + if (id + 1 < ow) { \ + tmp.x = ov[1]; \ + vstore4(tmp, off + str, buf); \ + } \ + if (id + 2 < ow) { \ + tmp.x = ov[2]; \ + vstore4(tmp, off + 2 * str, buf); \ + } \ + if (id + 3 < ow) { \ + tmp.x = ov[3]; \ + vstore4(tmp, off + 3 * str, buf); \ + } \ + if (id + 4 < ow) { \ + tmp.x = ov[4]; \ + vstore4(tmp, off + 4 * str, buf); \ + } \ + if (id + 5 < ow) { \ + tmp.x = ov[5]; \ + vstore4(tmp, off + 5 * str, buf); \ + } \ + } +#endif +#if (ON == 7) +#define STORE_OUT(ov, off, str, id, ow, buf) \ + { \ + ACTIVATION_ARRAY7(ov); \ + T4 tmp = 0; \ + tmp.x = ov[0]; \ + vstore4(tmp, off, buf); \ + if (id + 1 < ow) { \ + tmp.x = ov[1]; \ + vstore4(tmp, off + str, buf); \ + } \ + if (id + 2 < ow) { \ + tmp.x = ov[2]; \ + vstore4(tmp, off + 2 * str, buf); \ + } \ + if (id + 3 < ow) { \ + tmp.x = ov[3]; \ + vstore4(tmp, off + 3 * str, buf); \ + } \ + if (id + 4 < ow) { \ + tmp.x = ov[4]; \ + vstore4(tmp, off + 4 * str, buf); \ + } \ + if (id + 5 < ow) { \ + tmp.x = ov[5]; \ + vstore4(tmp, off + 5 * str, buf); \ + } \ + if (id + 6 < ow) { \ + tmp.x = ov[6]; \ + vstore4(tmp, off + 6 * str, buf); \ + } \ + } +#endif +#if (ON == 8) +#define STORE_OUT(ov, off, str, id, ow, buf) \ + { \ + ACTIVATION_ARRAY8(ov); \ + T4 tmp = 0; \ + tmp.x = ov[0]; \ + vstore4(tmp, off, buf); \ + if (id + 1 < ow) { \ + tmp.x = ov[1]; \ + vstore4(tmp, off + str, buf); \ + } \ + if (id + 2 < ow) { \ + tmp.x = ov[2]; \ + vstore4(tmp, off + 2 * str, buf); \ + } \ + if (id + 3 < ow) { \ + tmp.x = ov[3]; \ + vstore4(tmp, off + 3 * str, buf); \ + } \ + if (id + 4 < ow) { \ + tmp.x = ov[4]; \ + 
vstore4(tmp, off + 4 * str, buf); \ + } \ + if (id + 5 < ow) { \ + tmp.x = ov[5]; \ + vstore4(tmp, off + 5 * str, buf); \ + } \ + if (id + 6 < ow) { \ + tmp.x = ov[6]; \ + vstore4(tmp, off + 6 * str, buf); \ + } \ + if (id + 7 < ow) { \ + tmp.x = ov[7]; \ + vstore4(tmp, off + 7 * str, buf); \ + } \ + } +#endif +#endif + +#if defined(USE_NCHW) +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_direct_s1_fn_spe_relu_nchw_, F, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_s1_fn_spe_relu6_nchw_, F, ON) +#else +__kernel void MANGLE_NAME(conv_direct_s1_fn_spe_nchw_, F, ON) +#endif +#else +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_direct_s1_fn_spe_relu_, F, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_s1_fn_spe_relu6_, F, ON) +#else +__kernel void MANGLE_NAME(conv_direct_s1_fn_spe_, F, ON) +#endif +#endif + (const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int ow, + const int sh, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __global const T *bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + if (idx >= bx || idy >= by) { + return; + } + T4 flt_val; + T4 in_val[IN]; + T out_val[ON]; + + LOAD_BIAS(out_val, bias); + int flt_off = 0; + int in_off = (idy * ON + iw_off) * ih_str + idx * sh + ih_off; + +#if (F == 1) + for (int i = 0; i < ic_str; ++i) { + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off, ih_str, in); + flt_val = vload4(flt_off, flt); + CALCORE(in_val, flt_val, out_val); + flt_off += 1; + in_off += ihw_str; + } +#else + for (int i = 0; i < ic_str; ++i) { + for (uchar j = 0; j < F; j++) { + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + j, ih_str, in); + for (uchar k = 0; k < F; k++) { + flt_val = vload4(flt_off + k, flt); + CALCORE(in_val, flt_val, out_val); + UPDATE_REG(in_val); + } + flt_off += F; + } + in_off += ihw_str; + } +#endif + +#if defined(USE_NCHW) + int out_off = (idx + oh_off) * ow_str + idy * ON + ow_off; + STORE_OUT(out_val, out_off, idy * ON, ow, out); +#else + int out_off = (idy * ON + ow_off) * oh_str + idx + oh_off; + STORE_OUT(out_val, out_off, oh_str, idy * ON, ow, out); +#endif +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_direct_s1_nchw_to_ncwhc4.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_s1_nchw_to_ncwhc4.cl new file mode 100644 index 00000000..55e20be3 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_s1_nchw_to_ncwhc4.cl @@ -0,0 +1,156 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, F, ON) base##F##ON +#define MANGLE_NAME(base, F, ON) MANGLE_NAME_IMPL(base, F, ON) + +#if (F == 1) +#define calCore(A, B, C) \ + { \ + C[0] += A.s0 * B; \ + C[1] += A.s1 * B; \ + C[2] += A.s2 * B; \ + C[3] += A.s3 * B; \ + C[4] += A.s4 * B; \ + C[5] += A.s5 * B; \ + C[6] += A.s6 * B; \ + C[7] += A.s7 * B; \ + } +#elif (F == 3) +#define calCore(a0, a1, a2, a3, a4, a5, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + C[4] += a4 * B; \ + C[5] += a5 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s1, A.s2, A.s3, A.s4, A.s5, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s2, A.s3, A.s4, A.s5, A.s6, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s3, A.s4, A.s5, A.s6, A.s7, B, C) +#elif (F == 5) +#define calCore(a0, a1, a2, a3, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s1, A.s2, A.s3, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s2, A.s3, A.s4, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s3, A.s4, A.s5, B, C) +#define calCore3(A, B, C) calCore(A.s3, A.s4, A.s5, A.s6, B, C) +#define calCore4(A, B, C) calCore(A.s4, A.s5, A.s6, A.s7, B, C) +#elif (F == 7) +#define calCore(a0, a1, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s1, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s2, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s3, B, C) +#define calCore3(A, B, C) calCore(A.s3, A.s4, B, C) +#define calCore4(A, B, C) calCore(A.s4, A.s5, B, C) +#define calCore5(A, B, C) calCore(A.s5, A.s6, B, C) +#define calCore6(A, B, C) calCore(A.s6, A.s7, B, C) +#endif + +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_direct_s1_nchw_to_ncwhc4_relu_, F, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_s1_nchw_to_ncwhc4_relu6_, F, ON) +#else +__kernel void MANGLE_NAME(conv_direct_s1_nchw_to_ncwhc4_, F, ON) +#endif + (const int iw_str, + const int iwh_str, + const int ic_str, + const int iw_off, + const int ih_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int ow, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + + T8 in_val; + T4 flt_val; + T4 out_val[ON]; + + LOADBIAS_IMAGE_ARRAY_V4(out_val, idz, bias); + int in_off = (idy + ih_off) * iw_str + idx * ON + iw_off; + int flt_off = idz * ic_str * Fsq; + + for (int i = 0; i < ic_str; ++i) { +#if (F == 1) + flt_val = vload4(flt_off, flt); + in_val = vload8(0, in + in_off); + calCore(in_val, flt_val, out_val); + flt_off++; +#else + for (uchar j = 0; j < F; ++j) { + in_val = vload8(0, in + in_off + j * iw_str); + for (uchar k = 0; k < F; ++k) { + flt_val = vload4(flt_off + k, flt); + if (k == 0) { + calCore0(in_val, flt_val, out_val); + } + if (k == 1) { + calCore1(in_val, flt_val, out_val); + } + if (k == 2) { + calCore2(in_val, flt_val, out_val); + } +#if (F > 3) + if (k == 3) { + calCore3(in_val, flt_val, out_val); + } + if (k 
== 4) { + calCore4(in_val, flt_val, out_val); + } +#endif +#if (F > 5) + if (k == 5) { + calCore5(in_val, flt_val, out_val); + } + if (k == 6) { + calCore6(in_val, flt_val, out_val); + } +#endif + } + flt_off += F; + } +#endif + in_off += iwh_str; + } + + int xn = idx * ON; + int out_off = (idz * ow_str + xn + ow_off) * oh_str + idy + oh_off; + STORE_OUTPUT_BUF_ARRAY_V4(out_val, out_off, oh_str, xn, ow, out); +} diff --git a/tensor_computing/src/gpu/mali/cl/conv_direct_s1_spe_f1c3k1.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_s1_spe_f1c3k1.cl similarity index 83% rename from tensor_computing/src/gpu/mali/cl/conv_direct_s1_spe_f1c3k1.cl rename to compute/tensor/src/gpu/mali/cl/conv_direct_s1_spe_f1c3k1.cl index cf1fbd39..f973a38e 100644 --- a/tensor_computing/src/gpu/mali/cl/conv_direct_s1_spe_f1c3k1.cl +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_s1_spe_f1c3k1.cl @@ -11,19 +11,25 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - -#define MANGLE_NAME_IMPL(base, EW) base ## EW +#define MANGLE_NAME_IMPL(base, EW) base##EW #define MANGLE_NAME(base, EW) MANGLE_NAME_IMPL(base, EW) -__kernel void MANGLE_NAME(conv_direct_s1_spe_f1c3k1_, EW)(const int iw_str, const int ow_str, const int ow_off, const int oh_off, const int ow_d2, const int bx, const int by, - __global const T* in, __global const T* flt, __global T* out){ - +__kernel void MANGLE_NAME(conv_direct_s1_spe_f1c3k1_, EW)(const int iw_str, + const int ow_str, + const int ow_off, + const int oh_off, + const int ow_d2, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __global T *out) +{ const int idx = get_global_id(0); const int idy = get_global_id(1); - if(idx >= bx || idy >= by) return; + if (idx >= bx || idy >= by) { + return; + } T4 flt_val; T8 in_val; T2 out_val; @@ -31,16 +37,16 @@ __kernel void MANGLE_NAME(conv_direct_s1_spe_f1c3k1_, EW)(const int iw_str, cons out_val.x = flt_val.w; out_val.y = flt_val.w; int in_off = (idy * iw_str + (idx << 1)) * 3; - + in_val = vload8(0, in + in_off); out_val.x += in_val.s0 * flt_val.x + in_val.s1 * flt_val.y + in_val.s2 * flt_val.z; out_val.y += in_val.s3 * flt_val.x + in_val.s4 * flt_val.y + in_val.s5 * flt_val.z; - + int out_off = (idy + oh_off) * ow_str + (idx << 1) + ow_off; -#if(EW == 0) +#if (EW == 0) vstore2(out_val, 0, out + out_off); -#elif(EW == 1) - if(idx < ow_d2){ +#elif (EW == 1) + if (idx < ow_d2) { vstore2(out_val, 0, out + out_off); } else { out[out_off] = out_val.x; diff --git a/tensor_computing/src/gpu/mali/cl/conv_direct_s2.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_s2.cl similarity index 80% rename from tensor_computing/src/gpu/mali/cl/conv_direct_s2.cl rename to compute/tensor/src/gpu/mali/cl/conv_direct_s2.cl index d8bfdc38..b1ab72fa 100644 --- a/tensor_computing/src/gpu/mali/cl/conv_direct_s2.cl +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_s2.cl @@ -11,55 +11,67 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
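+// Direct convolution at stride 2: each work-item keeps ON partial sums along one axis for KN +// consecutive 4-channel output blocks, loading the filter tap by tap as T16 (presumably 4 input +// x 4 output channels). The BASIC_REG variant walks even and odd taps in two passes so only a +// short sliding window of T4 input registers stays live (refreshed via UPDATE_REG).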
- - - - #include "kernel_def.h" -#define MANGLE_NAME_IMPL(base, F, ON, KN) base ## F ## ON ## KN +#define MANGLE_NAME_IMPL(base, F, ON, KN) base##F##ON##KN #define MANGLE_NAME(base, F, ON, KN) MANGLE_NAME_IMPL(base, F, ON, KN) - - #if defined(USE_RELU) __kernel void MANGLE_NAME(conv_direct_s2_relu_, F, ON, KN) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_s2_relu6_, F, ON, KN) #else __kernel void MANGLE_NAME(conv_direct_s2_, F, ON, KN) #endif -(const int ih_str, const int ihw_str, const int ic_str, const int ih_off, const int iw_off, const int oh_str, const int ohw_str, const int oh_off, const int ow_off, -const int ow, const int bx, const int by, __global const T* in, __global const T* flt, __read_only image1d_t bias, __global T* out) + (const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int ow, + const int sh, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) { - const int idx = get_global_id(0); const int idy = get_global_id(1); const int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; + if (idx >= bx || idy >= by) { + return; + } - T4 in_val[IN]; + T4 in_val[IN]; T16 flt_val; - T4 out_val[KN][ON]; + T4 out_val[KN][ON]; LOADBIAS_IMAGE_ARRAY_V4(out_val[0], idz * KN, bias); -#if(KN > 1) +#if (KN > 1) LOADBIAS_IMAGE_ARRAY_V4(out_val[1], idz * KN + 1, bias); #endif -#if(KN > 2) +#if (KN > 2) LOADBIAS_IMAGE_ARRAY_V4(out_val[2], idz * KN + 2, bias); LOADBIAS_IMAGE_ARRAY_V4(out_val[3], idz * KN + 3, bias); -#endif +#endif - int in_off = ((idy << 1) * ON + iw_off) * ih_str + (idx << 1) + ih_off; + int in_off = ((idy << 1) * ON + iw_off) * ih_str + idx * sh + ih_off; int flt_off = idz * ic_str * Fsq * KN; - for(int i = 0; i < ic_str; ++i) { -#if(F == 1) + for (int i = 0; i < ic_str; ++i) { +#if (F == 1) LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off, (ih_str << 1), in); flt_val = vload16(flt_off, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); -#if(KN > 1) +#if (KN > 1) flt_val = vload16(flt_off + 1, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); #endif -#if(KN > 2) +#if (KN > 2) flt_val = vload16(flt_off + 2, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]); flt_val = vload16(flt_off + 3, flt); @@ -67,18 +79,18 @@ const int ow, const int bx, const int by, __global const T* in, __global const T #endif flt_off += KN; #else - for(uchar j = 0; j < F; ++j) { + for (uchar j = 0; j < F; ++j) { #if defined(BASIC_REG) LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + j, (ih_str << 1), in); - for(uchar k = 0; k < F; k += 2) { - in_val[LN] = vload4(in_off + j + ((LN << 1) + k) * ih_str , in); + for (uchar k = 0; k < F; k += 2) { + in_val[LN] = vload4(in_off + j + ((LN << 1) + k) * ih_str, in); flt_val = vload16(flt_off + k * KN, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); -#if(KN > 1) +#if (KN > 1) flt_val = vload16(flt_off + k * KN + 1, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); -#endif -#if(KN > 2) +#endif +#if (KN > 2) flt_val = vload16(flt_off + k * KN + 2, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]); flt_val = vload16(flt_off + k * KN + 3, flt); @@ -87,15 +99,15 @@ const int ow, const int bx, const int by, __global const T* in, __global const T UPDATE_REG(in_val); } LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + j + ih_str, (ih_str << 1), in); - for(uchar k = 1; k < F; k += 2) { - in_val[LN] = 
vload4(in_off + j + ((LN << 1) + k) * ih_str , in); + for (uchar k = 1; k < F; k += 2) { + in_val[LN] = vload4(in_off + j + ((LN << 1) + k) * ih_str, in); flt_val = vload16(flt_off + k * KN, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]) -#if(KN > 1) +#if (KN > 1) flt_val = vload16(flt_off + k * KN + 1, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); -#endif -#if(KN > 2) +#endif +#if (KN > 2) flt_val = vload16(flt_off + k * KN + 2, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]); flt_val = vload16(flt_off + k * KN + 3, flt); @@ -105,14 +117,14 @@ const int ow, const int bx, const int by, __global const T* in, __global const T } #else LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + j, ih_str, in); - for(uchar k = 0; k < F; ++k) { + for (uchar k = 0; k < F; ++k) { flt_val = vload16(flt_off + k * KN, flt); DIRECT_CONV_CAL_CORE_S2(in_val, flt_val, out_val[0]); -#if(KN > 1) +#if (KN > 1) flt_val = vload16(flt_off + k * KN + 1, flt); DIRECT_CONV_CAL_CORE_S2(in_val, flt_val, out_val[1]); -#endif -#if(KN > 2) +#endif +#if (KN > 2) flt_val = vload16(flt_off + k * KN + 2, flt); DIRECT_CONV_CAL_CORE_S2(in_val, flt_val, out_val[2]); flt_val = vload16(flt_off + k * KN + 3, flt); @@ -120,7 +132,7 @@ const int ow, const int bx, const int by, __global const T* in, __global const T #endif UPDATE_REG(in_val); } -#endif +#endif flt_off += F * KN; } #endif @@ -129,11 +141,11 @@ const int ow, const int bx, const int by, __global const T* in, __global const T int out_off = idz * KN * ohw_str + (idy * ON + ow_off) * oh_str + idx + oh_off; STORE_OUTPUT_BUF_ARRAY_V4(out_val[0], out_off, oh_str, idy * ON, ow, out); -#if(KN > 1) +#if (KN > 1) out_off += ohw_str; STORE_OUTPUT_BUF_ARRAY_V4(out_val[1], out_off, oh_str, idy * ON, ow, out); #endif -#if(KN > 2) +#if (KN > 2) out_off += ohw_str; STORE_OUTPUT_BUF_ARRAY_V4(out_val[2], out_off, oh_str, idy * ON, ow, out); out_off += ohw_str; diff --git a/compute/tensor/src/gpu/mali/cl/conv_direct_s2_nchw_to_ncwhc4.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_s2_nchw_to_ncwhc4.cl new file mode 100644 index 00000000..dce60fdb --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_s2_nchw_to_ncwhc4.cl @@ -0,0 +1,168 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
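+// Stride-2 direct convolution that reads plain NCHW input and writes the blocked NCWHC4 layout. +// Each work-item produces ON outputs along the width axis for one 4-channel output block (idz); +// a single vload16 covers the 2*ON+F-2 input pixels the strided taps need, and calCore0..calCore6 +// pick the even/odd lanes (s0,s2,... vs s1,s3,...) for each tap.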
+ +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, F, ON) base##F##ON +#define MANGLE_NAME(base, F, ON) MANGLE_NAME_IMPL(base, F, ON) + +#if (F == 1) +#define calCore(A, B, C) \ + { \ + C[0] += A.s0 * B; \ + C[1] += A.s2 * B; \ + C[2] += A.s4 * B; \ + C[3] += A.s6 * B; \ + C[4] += A.s8 * B; \ + C[5] += A.sa * B; \ + C[6] += A.sc * B; \ + C[7] += A.se * B; \ + } +#elif (F == 3) +#define calCore(a0, a1, a2, a3, a4, a5, a6, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + C[4] += a4 * B; \ + C[5] += a5 * B; \ + C[6] += a6 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s2, A.s4, A.s6, A.s8, A.sa, A.sc, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s3, A.s5, A.s7, A.s9, A.sb, A.sd, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s4, A.s6, A.s8, A.sa, A.sc, A.se, B, C) +#elif (F == 5) +#define calCore(a0, a1, a2, a3, a4, a5, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + C[4] += a4 * B; \ + C[5] += a5 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s2, A.s4, A.s6, A.s8, A.sa, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s3, A.s5, A.s7, A.s9, A.sb, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s4, A.s6, A.s8, A.sa, A.sc, B, C) +#define calCore3(A, B, C) calCore(A.s3, A.s5, A.s7, A.s9, A.sb, A.sd, B, C) +#define calCore4(A, B, C) calCore(A.s4, A.s6, A.s8, A.sa, A.sc, A.se, B, C) +#elif (F == 7) +#define calCore(a0, a1, a2, a3, a4, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + C[4] += a4 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s2, A.s4, A.s6, A.s8, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s3, A.s5, A.s7, A.s9, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s4, A.s6, A.s8, A.sa, B, C) +#define calCore3(A, B, C) calCore(A.s3, A.s5, A.s7, A.s9, A.sb, B, C) +#define calCore4(A, B, C) calCore(A.s4, A.s6, A.s8, A.sa, A.sc, B, C) +#define calCore5(A, B, C) calCore(A.s5, A.s7, A.s9, A.sb, A.sd, B, C) +#define calCore6(A, B, C) calCore(A.s6, A.s8, A.sa, A.sc, A.se, B, C) +#endif + +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_direct_s2_nchw_to_ncwhc4_relu_, F, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_s2_nchw_to_ncwhc4_relu6_, F, ON) +#else +__kernel void MANGLE_NAME(conv_direct_s2_nchw_to_ncwhc4_, F, ON) +#endif + (const int iw_str, + const int iwh_str, + const int ic_str, + const int iw_off, + const int ih_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int ow, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + + T16 in_val; + T4 flt_val; + T4 out_val[ON]; + + LOADBIAS_IMAGE_ARRAY_V4(out_val, idz, bias); + int in_off = ((idy << 1) + ih_off) * iw_str + (idx << 1) * ON + iw_off; + int flt_off = idz * ic_str * Fsq; + + for (int i = 0; i < ic_str; ++i) { +#if (F == 1) + flt_val = vload4(flt_off, flt); + in_val = vload16(0, in + in_off); + calCore(in_val, flt_val, out_val); + flt_off++; +#else + for (uchar j = 0; j < F; ++j) { + in_val = vload16(0, in + in_off + j * iw_str); + for (uchar k = 0; k < F; ++k) { + flt_val = vload4(flt_off + k, flt); + if (k == 0) { + calCore0(in_val, flt_val, out_val); + } + if (k == 1) { + calCore1(in_val, flt_val, out_val); + } + if (k == 2) { + calCore2(in_val, 
flt_val, out_val); + } +#if (F == 5) + if (k == 3) { + calCore3(in_val, flt_val, out_val); + } + if (k == 4) { + calCore4(in_val, flt_val, out_val); + } +#endif +#if (F == 7) + if (k == 3) { + calCore3(in_val, flt_val, out_val); + } + if (k == 4) { + calCore4(in_val, flt_val, out_val); + } + if (k == 5) { + calCore5(in_val, flt_val, out_val); + } + if (k == 6) { + calCore6(in_val, flt_val, out_val); + } +#endif + } + flt_off += F; + } +#endif + in_off += iwh_str; + } + + int xn = idx * ON; + int out_off = (idz * ow_str + xn + ow_off) * oh_str + idy + oh_off; + STORE_OUTPUT_BUF_ARRAY_V4(out_val, out_off, oh_str, xn, ow, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_direct_spe_fwhs1.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_spe_fwhs1.cl new file mode 100644 index 00000000..21985ac8 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_spe_fwhs1.cl @@ -0,0 +1,134 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
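+// Special case: a 1x1 filter applied to a 1x1 feature map (fwhs1 presumably abbreviates filter +// w/h and spatial size 1), i.e. a fully-connected layer. Each work-item computes one output +// channel as a running dot product over ic_str input-channel chunks, with OC selecting the vector +// width (1/2/3/4/8/16) of every chunk; the scalar result is scattered into NCWHC4 form via +// out[out_off * 4 + (idx & 3)].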
+ +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, OC) base##OC +#define MANGLE_NAME(base, OC) MANGLE_NAME_IMPL(base, OC) + +#if (OC == 1) +#define calCore(ov, i_off, f_off, in, flt) \ + { \ + T iv = in[i_off]; \ + T fv = flt[f_off]; \ + ov += iv * fv; \ + } +#endif + +#if (OC == 2) +#define calCore(ov, i_off, f_off, in, flt) \ + { \ + T2 iv = vload2(i_off, in); \ + T2 fv = vload2(f_off, flt); \ + ov += iv.x * fv.x + iv.y * fv.y; \ + } +#endif + +#if (OC == 3) +#define calCore(ov, i_off, f_off, in, flt) \ + { \ + T3 iv = vload3(i_off, in); \ + T3 fv = vload3(f_off, flt); \ + ov += iv.x * fv.x + iv.y * fv.y + iv.z * fv.z; \ + } +#endif + +#if (OC == 4) +#define calCore(ov, i_off, f_off, in, flt) \ + { \ + T4 iv = vload4(i_off, in); \ + T4 fv = vload4(f_off, flt); \ + DOT_A4B4C1(iv, fv, ov); \ + } +#endif + +#if (OC == 8) +#define calCore(ov, i_off, f_off, in, flt) \ + { \ + T8 iv = vload8(i_off, in); \ + T8 fv = vload8(f_off, flt); \ + DOT_A4B4C1(iv.s0123, fv.s0123, ov); \ + DOT_A4B4C1(iv.s4567, fv.s4567, ov); \ + } +#endif + +#if (OC == 16) +#define calCore(ov, i_off, f_off, in, flt) \ + { \ + T16 iv = vload16(i_off, in); \ + T16 fv = vload16(f_off, flt); \ + DOT_A4B4C1(iv.s0123, fv.s0123, ov); \ + DOT_A4B4C1(iv.s4567, fv.s4567, ov); \ + DOT_A4B4C1(iv.s89ab, fv.s89ab, ov); \ + DOT_A4B4C1(iv.scdef, fv.scdef, ov); \ + } +#endif + +#if defined(USE_RELU) +#if defined(NO_BIAS) +__kernel void MANGLE_NAME(conv_direct_spe_fwhs1_nobias_relu_, OC) +#else +__kernel void MANGLE_NAME(conv_direct_spe_fwhs1_relu_, OC) +#endif +#elif defined(USE_RELU6) +#if defined(NO_BIAS) +__kernel void MANGLE_NAME(conv_direct_spe_fwhs1_nobias_relu6_, OC) +#else +__kernel void MANGLE_NAME(conv_direct_spe_fwhs1_relu6_, OC) +#endif +#else +#if defined(NO_BIAS) +__kernel void MANGLE_NAME(conv_direct_spe_fwhs1_nobias_, OC) +#else +__kernel void MANGLE_NAME(conv_direct_spe_fwhs1_, OC) +#endif +#endif + (const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int flt_str, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __global const T *bias, + __global T *out) +{ + const int idx = get_global_id(0); + if (idx >= bx) { + return; + } +#if defined(NO_BIAS) + T out_val = 0; +#else + T out_val = bias[idx]; +#endif + int in_off = iw_off * ih_str + ih_off; + int flt_off = idx; + for (int i = 0; i < ic_str; ++i) { + calCore(out_val, in_off, flt_off, in, flt); + in_off += ihw_str; + flt_off += flt_str; + } + + ACTIVATION_V1(out_val); + const int ox = idx >> 2; + const int oy = idx & 3; + int out_off = (ox * ow_str + ow_off) * oh_str + oh_off; + out[out_off * 4 + oy] = out_val; +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_direct_trans_fltbuf.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_trans_fltbuf.cl new file mode 100644 index 00000000..6a819e48 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_trans_fltbuf.cl @@ -0,0 +1,207 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define MANGLE_NAME_IMPL(base, C, K) base##C##K +#define MANGLE_NAME(base, C, K) MANGLE_NAME_IMPL(base, C, K) +#if (C == 1) +#define loadFltval(off, str, flt, val) \ + { \ + val = flt[off]; \ + } + +#define loadFltvalEdge(off, str, flt, val, edge) \ + {} +#endif + +#if (C == 2) +#define loadFltval(off, str, flt, val) \ + { \ + val.x = flt[off]; \ + val.y = flt[off + str]; \ + } + +#define loadFltvalEdge(off, str, flt, val, edge) \ + { \ + val.x = flt[off]; \ + } +#endif + +#if (C == 3) +#define loadFltval(off, str, flt, val) \ + { \ + val.x = flt[off]; \ + val.y = flt[off + str]; \ + val.z = flt[off + str * 2]; \ + } + +#define loadFltvalEdge(off, str, flt, val, edge) \ + { \ + val.x = flt[off]; \ + if (edge > 1) \ + val.y = flt[off + str]; \ + } +#endif + +#if (C == 4) +#define loadFltval(off, str, flt, val) \ + { \ + val.x = flt[off]; \ + val.y = flt[off + str]; \ + val.z = flt[off + str * 2]; \ + val.w = flt[off + str * 3]; \ + } + +#define loadFltvalEdge(off, str, flt, val, edge) \ + { \ + val.x = flt[off]; \ + if (edge > 1) \ + val.y = flt[off + str]; \ + if (edge > 2) \ + val.z = flt[off + str * 2]; \ + } +#endif + +#if (C == 8) +#define loadFltval(off, str, flt, val) \ + { \ + val.s0 = flt[off]; \ + val.s1 = flt[off + str]; \ + val.s2 = flt[off + str * 2]; \ + val.s3 = flt[off + str * 3]; \ + val.s4 = flt[off + str * 4]; \ + val.s5 = flt[off + str * 5]; \ + val.s6 = flt[off + str * 6]; \ + val.s7 = flt[off + str * 7]; \ + } +#define loadFltvalEdge(off, str, flt, val, edge) \ + { \ + val.s0 = flt[off]; \ + if (edge > 1) \ + val.s1 = flt[off + str]; \ + if (edge > 2) \ + val.s2 = flt[off + str * 2]; \ + if (edge > 3) \ + val.s3 = flt[off + str * 3]; \ + if (edge > 4) \ + val.s4 = flt[off + str * 4]; \ + if (edge > 5) \ + val.s5 = flt[off + str * 5]; \ + if (edge > 6) \ + val.s6 = flt[off + str * 6]; \ + } +#endif + +#if (C == 16) +#define loadFltval(off, str, flt, val) \ + { \ + val.s0 = flt[off]; \ + val.s1 = flt[off + str]; \ + val.s2 = flt[off + str * 2]; \ + val.s3 = flt[off + str * 3]; \ + val.s4 = flt[off + str * 4]; \ + val.s5 = flt[off + str * 5]; \ + val.s6 = flt[off + str * 6]; \ + val.s7 = flt[off + str * 7]; \ + val.s8 = flt[off + str * 8]; \ + val.s9 = flt[off + str * 9]; \ + val.sa = flt[off + str * 10]; \ + val.sb = flt[off + str * 11]; \ + val.sc = flt[off + str * 12]; \ + val.sd = flt[off + str * 13]; \ + val.se = flt[off + str * 14]; \ + val.sf = flt[off + str * 15]; \ + } +#define loadFltvalEdge(off, str, flt, val, edge) \ + 
{ \ + val.s0 = flt[off]; \ + if (edge > 1) \ + val.s1 = flt[off + str]; \ + if (edge > 2) \ + val.s2 = flt[off + str * 2]; \ + if (edge > 3) \ + val.s3 = flt[off + str * 3]; \ + if (edge > 4) \ + val.s4 = flt[off + str * 4]; \ + if (edge > 5) \ + val.s5 = flt[off + str * 5]; \ + if (edge > 6) \ + val.s6 = flt[off + str * 6]; \ + if (edge > 7) \ + val.s7 = flt[off + str * 7]; \ + if (edge > 8) \ + val.s8 = flt[off + str * 8]; \ + if (edge > 9) \ + val.s9 = flt[off + str * 9]; \ + if (edge > 10) \ + val.sa = flt[off + str * 10]; \ + if (edge > 11) \ + val.sb = flt[off + str * 11]; \ + if (edge > 12) \ + val.sc = flt[off + str * 12]; \ + if (edge > 13) \ + val.sd = flt[off + str * 13]; \ + if (edge > 14) \ + val.se = flt[off + str * 14]; \ + } +#endif + +__kernel void MANGLE_NAME(conv_direct_trans_fltbuf_, C, K)( + const int fwh, const int fc, const int fn, __global const T *fltdata, __global T *fltbuf) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + short ec = ((idy + 1) * C <= fc) ? C : (fc % C); + + const int flt_off = (idz * fc + idy * C) * fwh + idx; +#if (C == 1) + T val = 0; +#elif (C == 2) + T2 val = 0; +#elif (C == 3) + T3 val = 0; +#elif (C == 4) + T4 val = 0; +#elif (C == 8) + T8 val = 0; +#elif (C == 16) + T16 val = 0; +#endif + if (idz < fn) { + if (ec == C) { + loadFltval(flt_off, fwh, fltdata, val); + } else { + loadFltvalEdge(flt_off, fwh, fltdata, val, ec); + } + } + const int bc = (fc + C - 1) / C; + int out_off; +#if (K == 0) + out_off = (idy * fwh + idx) * fn + idz; +#else + out_off = (idz / K * bc + idy) * fwh * K + idx * K + (idz % K); +#endif +#if (C == 1) + fltbuf[out_off] = val; +#elif (C == 2) + vstore2(val, out_off, fltbuf); +#elif (C == 3) + vstore3(val, out_off, fltbuf); +#elif (C == 4) + vstore4(val, out_off, fltbuf); +#elif (C == 8) + vstore8(val, out_off, fltbuf); +#elif (C == 16) + vstore16(val, out_off, fltbuf); +#endif +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_direct_wh_s1.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_wh_s1.cl new file mode 100644 index 00000000..62df1ca7 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_wh_s1.cl @@ -0,0 +1,102 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
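+// Direct convolution with a rectangular W x H filter at stride 1. As in conv_direct_s2, each +// work-item accumulates ON outputs along one axis for KN 4-channel output blocks and slides the +// T4 input window across the W taps with UPDATE_REG; filters are consumed per tap as T16 values.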
+ +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, W, H, ON, KN) base##W##H##ON##KN +#define MANGLE_NAME(base, W, H, ON, KN) MANGLE_NAME_IMPL(base, W, H, ON, KN) + +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_direct_wh_s1_relu_, W, H, ON, KN) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_wh_s1_relu6_, W, H, ON, KN) +#else +__kernel void MANGLE_NAME(conv_direct_wh_s1_, W, H, ON, KN) +#endif + (const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int ow, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + + T4 in_val[IN]; + T16 flt_val; + T4 out_val[KN][ON]; + LOADBIAS_IMAGE_ARRAY_V4(out_val[0], idz * KN, bias); +#if (KN > 1) + LOADBIAS_IMAGE_ARRAY_V4(out_val[1], idz * KN + 1, bias); +#endif +#if (KN > 2) + LOADBIAS_IMAGE_ARRAY_V4(out_val[2], idz * KN + 2, bias); + LOADBIAS_IMAGE_ARRAY_V4(out_val[3], idz * KN + 3, bias); +#endif + + int in_off = (idy * ON + iw_off) * ih_str + idx + ih_off; + int flt_off = idz * ic_str * Fsq * KN; + + for (int i = 0; i < ic_str; ++i) { + for (uchar j = 0; j < H; ++j) { + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + j, ih_str, in); + for (uchar k = 0; k < W; ++k) { +#if defined(BASIC_REG) + in_val[LN] = vload4(in_off + j + (LN + k) * ih_str, in); +#endif + flt_val = vload16(flt_off + k * KN, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); +#if (KN > 1) + flt_val = vload16(flt_off + k * KN + 1, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); +#endif +#if (KN > 2) + flt_val = vload16(flt_off + k * KN + 2, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]); + flt_val = vload16(flt_off + k * KN + 3, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[3]); +#endif + UPDATE_REG(in_val); + } + flt_off += W * KN; + } + in_off += ihw_str; + } + + int out_off = idz * KN * ohw_str + (idy * ON + ow_off) * oh_str + idx + oh_off; + STORE_OUTPUT_BUF_ARRAY_V4(out_val[0], out_off, oh_str, idy * ON, ow, out); +#if (KN > 1) + out_off += ohw_str; + STORE_OUTPUT_BUF_ARRAY_V4(out_val[1], out_off, oh_str, idy * ON, ow, out); +#endif +#if (KN > 2) + out_off += ohw_str; + STORE_OUTPUT_BUF_ARRAY_V4(out_val[2], out_off, oh_str, idy * ON, ow, out); + out_off += ohw_str; + STORE_OUTPUT_BUF_ARRAY_V4(out_val[3], out_off, oh_str, idy * ON, ow, out); +#endif +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_direct_wh_s2.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_wh_s2.cl new file mode 100644 index 00000000..2da47d26 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_wh_s2.cl @@ -0,0 +1,136 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, W, H, ON, KN) base##W##H##ON##KN +#define MANGLE_NAME(base, W, H, ON, KN) MANGLE_NAME_IMPL(base, W, H, ON, KN) + +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_direct_wh_s2_relu_, W, H, ON, KN) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_wh_s2_relu6_, W, H, ON, KN) +#else +__kernel void MANGLE_NAME(conv_direct_wh_s2_, W, H, ON, KN) +#endif + (const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int ow, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + + T4 in_val[IN]; + T16 flt_val; + T4 out_val[KN][ON]; + LOADBIAS_IMAGE_ARRAY_V4(out_val[0], idz * KN, bias); +#if (KN > 1) + LOADBIAS_IMAGE_ARRAY_V4(out_val[1], idz * KN + 1, bias); +#endif +#if (KN > 2) + LOADBIAS_IMAGE_ARRAY_V4(out_val[2], idz * KN + 2, bias); + LOADBIAS_IMAGE_ARRAY_V4(out_val[3], idz * KN + 3, bias); +#endif + + int in_off = ((idy << 1) * ON + iw_off) * ih_str + (idx << 1) + ih_off; + int flt_off = idz * ic_str * Fsq * KN; + + for (int i = 0; i < ic_str; ++i) { + for (uchar j = 0; j < H; ++j) { +#if defined(BASIC_REG) + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + j, (ih_str << 1), in); + for (uchar k = 0; k < W; k += 2) { + in_val[LN] = vload4(in_off + j + ((LN << 1) + k) * ih_str, in); + flt_val = vload16(flt_off + k * KN, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); +#if (KN > 1) + flt_val = vload16(flt_off + k * KN + 1, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); +#endif +#if (KN > 2) + flt_val = vload16(flt_off + k * KN + 2, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]); + flt_val = vload16(flt_off + k * KN + 3, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[3]); +#endif + UPDATE_REG(in_val); + } + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + j + ih_str, (ih_str << 1), in); + for (uchar k = 1; k < W; k += 2) { + in_val[LN] = vload4(in_off + j + ((LN << 1) + k) * ih_str, in); + flt_val = vload16(flt_off + k * KN, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]) +#if (KN > 1) + flt_val = vload16(flt_off + k * KN + 1, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, 
out_val[1]); +#endif +#if (KN > 2) + flt_val = vload16(flt_off + k * KN + 2, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]); + flt_val = vload16(flt_off + k * KN + 3, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[3]); +#endif + UPDATE_REG(in_val); + } +#else + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + j, ih_str, in); + for (uchar k = 0; k < W; ++k) { + flt_val = vload16(flt_off + k * KN, flt); + DIRECT_CONV_CAL_CORE_S2(in_val, flt_val, out_val[0]); +#if (KN > 1) + flt_val = vload16(flt_off + k * KN + 1, flt); + DIRECT_CONV_CAL_CORE_S2(in_val, flt_val, out_val[1]); +#endif +#if (KN > 2) + flt_val = vload16(flt_off + k * KN + 2, flt); + DIRECT_CONV_CAL_CORE_S2(in_val, flt_val, out_val[2]); + flt_val = vload16(flt_off + k * KN + 3, flt); + DIRECT_CONV_CAL_CORE_S2(in_val, flt_val, out_val[3]); +#endif + UPDATE_REG(in_val); + } +#endif + flt_off += W * KN; + } + in_off += ihw_str; + } + + int out_off = idz * KN * ohw_str + (idy * ON + ow_off) * oh_str + idx + oh_off; + STORE_OUTPUT_BUF_ARRAY_V4(out_val[0], out_off, oh_str, idy * ON, ow, out); +#if (KN > 1) + out_off += ohw_str; + STORE_OUTPUT_BUF_ARRAY_V4(out_val[1], out_off, oh_str, idy * ON, ow, out); +#endif +#if (KN > 2) + out_off += ohw_str; + STORE_OUTPUT_BUF_ARRAY_V4(out_val[2], out_off, oh_str, idy * ON, ow, out); + out_off += ohw_str; + STORE_OUTPUT_BUF_ARRAY_V4(out_val[3], out_off, oh_str, idy * ON, ow, out); +#endif +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_wino_gemm36_tn.cl b/compute/tensor/src/gpu/mali/cl/conv_wino_gemm36_tn.cl new file mode 100644 index 00000000..aa24e28d --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_wino_gemm36_tn.cl @@ -0,0 +1,56 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
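+// Batched GEMM over the 36 tile matrices of Winograd F(4x4, 3x3) ("tn": A is read M-major, i.e. +// transposed). Each work-item computes an LM x LN block of C, with a_str/b_str/c_str selecting +// the tile; the closing GEMM_MUL_C by 0.1111111111 (= 1/9) presumably folds out the constant +// scaling baked into the filter- and input-transform coefficients.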
+ +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, LM, LN) base##LM##LN +#define MANGLE_NAME(base, LM, LN) MANGLE_NAME_IMPL(base, LM, LN) + +__kernel void MANGLE_NAME(conv_wino_gemm36_tn_, LM, LN)(int M, + int N, + int K, + int a_str, + int b_str, + int c_str, + const int bx, + const int by, + __global const T *A, + __global const T *B, + __global T *C) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + if (idx >= bx || idy >= by) { + return; + } + const int ix = idx * LN; + const int iy = idy * LM; + + T a[LM]; + T b[LN]; + T c[LM][LN]; + GEMM_SET_C_ZERO(c); + + int a_off = iy + a_str; + int b_off = ix + b_str; + for (int i = 0; i < K; i++) { + GEMM_LOAD_A(a, a_off, A); + GEMM_LOAD_B(b, b_off, B); + GEMM_CALCORE(a, b, c); + a_off += M; + b_off += N; + } + + int c_off = iy * N + ix + c_str; + GEMM_MUL_C((float)(0.1111111111), 0, c); + GEMM_STORE_C(c, c_off, N, C); +} diff --git a/tensor_computing/src/gpu/mali/cl/conv_wino_rotate_fltbuf.cl b/compute/tensor/src/gpu/mali/cl/conv_wino_rotate_fltbuf.cl similarity index 87% rename from tensor_computing/src/gpu/mali/cl/conv_wino_rotate_fltbuf.cl rename to compute/tensor/src/gpu/mali/cl/conv_wino_rotate_fltbuf.cl index 8149bd3b..68dc362b 100644 --- a/tensor_computing/src/gpu/mali/cl/conv_wino_rotate_fltbuf.cl +++ b/compute/tensor/src/gpu/mali/cl/conv_wino_rotate_fltbuf.cl @@ -11,23 +11,21 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - -#define MANGLE_NAME_IMPL(base, F) base ## F +#define MANGLE_NAME_IMPL(base, F) base##F #define MANGLE_NAME(base, F) MANGLE_NAME_IMPL(base, F) -__kernel void MANGLE_NAME(conv_wino_rotate_fltbuf_, F)(const int fwhc, const int fnc, const int fn, __global const T* fltdata, __global T* fltbuf) { +__kernel void MANGLE_NAME(conv_wino_rotate_fltbuf_, F)( + const int fwhc, const int fnc, const int fn, __global const T *fltdata, __global T *fltbuf) +{ const int idx = get_global_id(0); const int idy = get_global_id(1); T val = 0; - if(idy < fn) { + if (idy < fn) { const int in_off = idy * fwhc + idx; val = fltdata[in_off]; } - + const int ox = idy; const int oy = idx / Fsq; const int oz = idx % Fsq; diff --git a/compute/tensor/src/gpu/mali/cl/conv_wino_trans_fltbuf_3x3.cl b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_fltbuf_3x3.cl new file mode 100644 index 00000000..79ab36a8 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_fltbuf_3x3.cl @@ -0,0 +1,124 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define loadG(val, str, off, flt) \ + { \ + val[0] = flt[off]; \ + val[1] = flt[off + str]; \ + val[2] = flt[off + str * 2]; \ + } + +#define setReg6(reg0, reg1) \ + { \ + reg1[0] = reg0[0]; \ + reg1[1] = reg0[1]; \ + reg1[2] = reg0[2]; \ + reg1[3] = reg0[3]; \ + reg1[4] = reg0[4]; \ + reg1[5] = reg0[5]; \ + } + +#define addReg6(reg0, reg1) \ + { \ + reg1[0] += reg0[0]; \ + reg1[1] += reg0[1]; \ + reg1[2] += reg0[2]; \ + reg1[3] += reg0[3]; \ + reg1[4] += reg0[4]; \ + reg1[5] += reg0[5]; \ + } + +#define minReg6(reg0, reg1) \ + { \ + reg1[0] -= reg0[0]; \ + reg1[1] -= reg0[1]; \ + reg1[2] -= reg0[2]; \ + reg1[3] -= reg0[3]; \ + reg1[4] -= reg0[4]; \ + reg1[5] -= reg0[5]; \ + } + +#define mulReg6(s, reg0, reg1) \ + { \ + reg1[0] = s * reg0[0]; \ + reg1[1] = s * reg0[1]; \ + reg1[2] = s * reg0[2]; \ + reg1[3] = s * reg0[3]; \ + reg1[4] = s * reg0[4]; \ + reg1[5] = s * reg0[5]; \ + } + +#define calCore(g, t) \ + { \ + t[0] = (T)(0.75) * g[0]; \ + t[1] = (g[0] + g[1] + g[2]) * (T)(-0.5); \ + t[2] = (g[0] - g[1] + g[2]) * (T)(-0.5); \ + t[3] = ((T)(0.125) * g[0] + (T)(0.25) * g[1] + (T)(0.5) * g[2]); \ + t[4] = ((T)(0.125) * g[0] - (T)(0.25) * g[1] + (T)(0.5) * g[2]); \ + t[5] = (T)(3.0) * g[2]; \ + } + +#define storeReg6(reg, off, str, flt) \ + { \ + flt[off] = reg[0]; \ + flt[off + str] = reg[1]; \ + flt[off + str * 2] = reg[2]; \ + flt[off + str * 3] = reg[3]; \ + flt[off + str * 4] = reg[4]; \ + flt[off + str * 5] = reg[5]; \ + } + +__kernel void conv_wino_trans_fltbuf_3x3( + const int fn, const int fc, const int fnc, __global const T *fltbuf, __global T *flttran) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int in_off = idy * fn + idx; + + T g[3]; + T h0[6], h1[6], h2[6], h3[6], h4[6], h5[6], t[6], tmp[6]; + loadG(g, fnc, in_off, fltbuf); + calCore(g, tmp); + mulReg6((T)(0.75), tmp, h0); + mulReg6((T)(-0.5), tmp, t); + setReg6(t, h1); + setReg6(t, h2); + mulReg6((T)(0.125), tmp, t); + setReg6(t, h3); + setReg6(t, h4); + + loadG(g, fnc, in_off + 3 * fnc, fltbuf); + calCore(g, tmp); + mulReg6((T)(0.5), tmp, t); + minReg6(t, h1); + addReg6(t, h2); + mulReg6((T)(0.25), tmp, t); + addReg6(t, h3); + minReg6(t, h4); + + loadG(g, fnc, in_off + 6 * fnc, fltbuf); + calCore(g, tmp); + mulReg6((T)(0.5), tmp, t); + minReg6(t, h1); + minReg6(t, h2); + addReg6(t, h3); + addReg6(t, h4); + mulReg6((T)(3.0), tmp, h5); + + storeReg6(h0, in_off, fnc, flttran); + storeReg6(h1, in_off + 6 * fnc, fnc, flttran); + storeReg6(h2, in_off + 12 * fnc, fnc, flttran); + storeReg6(h3, in_off + 18 * fnc, fnc, flttran); + storeReg6(h4, in_off + 24 * fnc, fnc, flttran); + storeReg6(h5, in_off + 30 * fnc, fnc, flttran); +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_wino_trans_outbuf.cl b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_outbuf.cl new file mode 100644 index 00000000..ac5aa2cf --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_outbuf.cl @@ -0,0 +1,262 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "kernel_def.h" +#define loadR(val, str, off, in) \ + { \ + val[0] = in[off]; \ + val[1] = in[off + str]; \ + val[2] = in[off + str * 2]; \ + val[3] = in[off + str * 3]; \ + val[4] = in[off + str * 4]; \ + val[5] = in[off + str * 5]; \ + } + +#define calCore(s, t, tmp) \ + { \ + t.x = s[1] + s[2]; \ + t.y = s[3] + s[4]; \ + t.z = s[1] - s[2]; \ + t.w = s[3] - s[4]; \ + tmp[0] = s[0] + t.x + t.y; \ + tmp[1] = t.z + (T)(2.0) * t.w; \ + tmp[2] = t.x + (T)(4.0) * t.y; \ + tmp[3] = t.z + (T)(8.0) * t.w + s[5]; \ + } + +#if defined(ALIGN) +#if defined(USE_RELU) +__kernel void conv_wino_trans_outbuf_relu_align +#else +__kernel void conv_wino_trans_outbuf_align +#endif +#else +#if defined(USE_RELU) +__kernel void conv_wino_trans_outbuf_relu +#else +__kernel void conv_wino_trans_outbuf +#endif +#endif + (const int wino_h, + const int wino_w, + const int pw_str, + const int pwh_str, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int oh, + const int ow, + __read_only image1d_t bias, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= wino_h || idy >= wino_w) { + return; + } + + T4 r0, r1, r2, r3; + T4 r4, r5, r6, r7; + T4 r8, r9, ra, rb; + T4 rc, rd, re, rf; + T4 bias_v4 = READ_IMAGE(bias, sampler, idz); + + int in_off = (idz << 2) * pw_str + idy * wino_h + idx; + for (uchar ii = 0; ii < 4; ii++) { + r0 = r4; + r1 = r5; + r2 = r6; + r3 = r7; + + r4 = r8; + r5 = r9; + r6 = ra; + r7 = rb; + + r8 = rc; + r9 = rd; + ra = re; + rb = rf; + + T s[6]; + T4 t; + T bias_val; + if (ii == 0) { + bias_val = bias_v4.x; + } + if (ii == 1) { + bias_val = bias_v4.y; + } + if (ii == 2) { + bias_val = bias_v4.z; + } + if (ii == 3) { + bias_val = bias_v4.w; + } + + rd = (T4)bias_val; + re = (T4)bias_val; + for (uchar i = 0; i < 2; ++i) { + rc.x = rf.x; + rc.y = rf.y; + rc.z = rf.z; + rc.w = rf.w; + loadR(s, pwh_str, in_off + i * 30 * pwh_str, in); + for (uchar j = 0; j < 4; ++j) { + rf.x = rf.y; + rf.y = rf.z; + rf.z = rf.w; + rf.w = bias_val; + if (j == 0) { + rf.w += s[0] + s[1] + s[2] + s[3] + s[4]; + } + if (j == 1) { + rf.w += s[1] - s[2] + (T)2 * (s[3] - s[4]); + } + if (j == 2) { + rf.w += s[1] + s[2] + (T)4 * (s[3] + s[4]); + } + if (j == 3) { + rf.w += s[1] - s[2] + (T)8 * (s[3] - s[4]) + s[5]; + } + } + } + + for (uchar i = 0; i < 4; ++i) { + loadR(s, pwh_str, in_off + (i + 1) * 6 * pwh_str, in); + for 
(uchar j = 0; j < 4; ++j) { + t.x = t.y; + t.y = t.z; + t.z = t.w; + if (j == 0) { + t.w = s[0] + s[1] + s[2] + s[3] + s[4]; + } + if (j == 1) { + t.w = s[1] - s[2] + (T)2 * (s[3] - s[4]); + } + if (j == 2) { + t.w = s[1] + s[2] + (T)4 * (s[3] + s[4]); + } + if (j == 3) { + t.w = s[1] - s[2] + (T)8 * (s[3] - s[4]) + s[5]; + } + } + if (i == 0) { + rc += t; + rd += t; + re += t; + rf += t; + } + if (i == 1) { + rc += t; + rd -= t; + re += t; + rf -= t; + } + if (i == 2) { + rc += t; + rd += (T)2 * t; + re += (T)4 * t; + rf += (T)8 * t; + } + if (i == 3) { + rc += t; + rd -= (T)2 * t; + re += (T)4 * t; + rf -= (T)8 * t; + } + } + ACTIVATION_V4(rc); + ACTIVATION_V4(rd); + ACTIVATION_V4(re); + ACTIVATION_V4(rf); + in_off += pw_str; + } + + const int x_off = idx << 2; + const int y_off = idy << 2; + int out_off = (idz * ow_str + y_off + ow_off) * (oh_str << 2) + (x_off << 2) + (oh_off << 2); +#if defined(ALIGN) + vstore16((T16)(r0.x, r4.x, r8.x, rc.x, r1.x, r5.x, r9.x, rd.x, r2.x, r6.x, ra.x, re.x, r3.x, + r7.x, rb.x, rf.x), + 0, out + out_off); + out_off += (oh_str << 2); + vstore16((T16)(r0.y, r4.y, r8.y, rc.y, r1.y, r5.y, r9.y, rd.y, r2.y, r6.y, ra.y, re.y, r3.y, + r7.y, rb.y, rf.y), + 0, out + out_off); + out_off += (oh_str << 2); + vstore16((T16)(r0.z, r4.z, r8.z, rc.z, r1.z, r5.z, r9.z, rd.z, r2.z, r6.z, ra.z, re.z, r3.z, + r7.z, rb.z, rf.z), + 0, out + out_off); + out_off += (oh_str << 2); + vstore16((T16)(r0.w, r4.w, r8.w, rc.w, r1.w, r5.w, r9.w, rd.w, r2.w, r6.w, ra.w, re.w, r3.w, + r7.w, rb.w, rf.w), + 0, out + out_off); +#else + vstore4((T4)(r0.x, r4.x, r8.x, rc.x), 0, out + out_off); + if (x_off + 1 < oh) { + vstore4((T4)(r1.x, r5.x, r9.x, rd.x), 0, out + out_off + 4); + } + if (x_off + 2 < oh) { + vstore4((T4)(r2.x, r6.x, ra.x, re.x), 0, out + out_off + 8); + } + if (x_off + 3 < oh) { + vstore4((T4)(r3.x, r7.x, rb.x, rf.x), 0, out + out_off + 12); + } + + if (y_off + 1 < ow) { + out_off += (oh_str << 2); + vstore4((T4)(r0.y, r4.y, r8.y, rc.y), 0, out + out_off); + if (x_off + 1 < oh) { + vstore4((T4)(r1.y, r5.y, r9.y, rd.y), 0, out + out_off + 4); + } + if (x_off + 2 < oh) { + vstore4((T4)(r2.y, r6.y, ra.y, re.y), 0, out + out_off + 8); + } + if (x_off + 3 < oh) { + vstore4((T4)(r3.y, r7.y, rb.y, rf.y), 0, out + out_off + 12); + } + } + + if (y_off + 2 < ow) { + out_off += (oh_str << 2); + vstore4((T4)(r0.z, r4.z, r8.z, rc.z), 0, out + out_off); + if (x_off + 1 < oh) { + vstore4((T4)(r1.z, r5.z, r9.z, rd.z), 0, out + out_off + 4); + } + if (x_off + 2 < oh) { + vstore4((T4)(r2.z, r6.z, ra.z, re.z), 0, out + out_off + 8); + } + if (x_off + 3 < oh) { + vstore4((T4)(r3.z, r7.z, rb.z, rf.z), 0, out + out_off + 12); + } + } + + if (y_off + 3 < ow) { + out_off += (oh_str << 2); + vstore4((T4)(r0.w, r4.w, r8.w, rc.w), 0, out + out_off); + if (x_off + 1 < oh) { + vstore4((T4)(r1.w, r5.w, r9.w, rd.w), 0, out + out_off + 4); + } + if (x_off + 2 < oh) { + vstore4((T4)(r2.w, r6.w, ra.w, re.w), 0, out + out_off + 8); + } + if (x_off + 3 < oh) { + vstore4((T4)(r3.w, r7.w, rb.w, rf.w), 0, out + out_off + 12); + } + } +#endif +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_wino_trans_outbuf_right.cl b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_outbuf_right.cl new file mode 100644 index 00000000..5c975e25 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_outbuf_right.cl @@ -0,0 +1,75 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "kernel_def.h" +#define loadR(val, str, off, in) \ + { \ + val[0] = in[off]; \ + val[1] = in[off + str]; \ + val[2] = in[off + str * 2]; \ + val[3] = in[off + str * 3]; \ + val[4] = in[off + str * 4]; \ + val[5] = in[off + str * 5]; \ + } + +#define calCore(s, t, tmp) \ + { \ + t[0] = s[1] + s[2]; \ + t[1] = s[3] + s[4]; \ + t[2] = s[1] - s[2]; \ + t[3] = s[3] - s[4]; \ + tmp[0] = s[0] + t[0] + t[1]; \ + tmp[1] = t[2] + (T)(2.0) * t[3]; \ + tmp[2] = t[0] + (T)(4.0) * t[1]; \ + tmp[3] = t[2] + (T)(8.0) * t[3] + s[5]; \ + } + +__kernel void conv_wino_trans_outbuf_right(const int iw_str, + const int iwh_str, + const int wino_h, + const int wino_w, + const int wino_h6, + const int wino_hw, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= wino_hw) { + return; + } + + int in_off = idz * iwh_str * 6 + (idy << 2) * iw_str + idx; + T s[6]; + T4 res[4]; + for (int ii = 0; ii < 4; ++ii) { + loadR(s, iwh_str, in_off, in); + res[0] = res[1]; + res[1] = res[2]; + res[2] = res[3]; + res[3].x = s[0] + s[1] + s[2] + s[3] + s[4]; + res[3].y = s[1] - s[2] + (T)(2) * (s[3] - s[4]); + res[3].z = s[1] + s[2] + (T)(4) * (s[3] + s[4]); + res[3].w = s[1] - s[2] + (T)(8) * (s[3] - s[4]) + s[5]; + in_off += iw_str; + } + + const int idx_i = idx % wino_h; + const int idx_j = idx / wino_h; + const int out_off = (idy * 24 * wino_w + idx_j * 24 + idz) * wino_h + idx_i; + vstore4((T4)(res[0].x, res[1].x, res[2].x, res[3].x), out_off, out); + vstore4((T4)(res[0].y, res[1].y, res[2].y, res[3].y), out_off + wino_h6, out); + vstore4((T4)(res[0].z, res[1].z, res[2].z, res[3].z), out_off + wino_h6 * 2, out); + vstore4((T4)(res[0].w, res[1].w, res[2].w, res[3].w), out_off + wino_h6 * 3, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_wino_trans_picbuf.cl b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_picbuf.cl new file mode 100644 index 00000000..6493518c --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_picbuf.cl @@ -0,0 +1,137 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define loadH(val, off, pic) \ + { \ + val[0] = pic[off]; \ + val[1] = pic[off + 4]; \ + val[2] = pic[off + 8]; \ + val[3] = pic[off + 12]; \ + val[4] = pic[off + 16]; \ + val[5] = pic[off + 20]; \ + } + +__kernel void conv_wino_trans_picbuf(const int ih_str4, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh4, + const int pw_str, + const int pwh_str, + __global const T *in, + __global T *pictran) +{ + const int id = get_global_id(0); + const int idhc = id % oh4; + const int idx = idhc >> 2; + const int idc = idhc & 3; + const int idy = id / oh4; + const int idz = get_global_id(1); + + const int in_off = + (idz * iw_str + (idy << 2) + iw_off) * ih_str4 + (idx << 4) + idc + (ih_off << 2); + const int pictran_off = ((idz << 2) + idc) * pw_str + (id >> 2); + T tmp[16]; + T h0[6], h1[6], h2[6], h3[6], h4[6], h5[6]; + + loadH(h0, in_off, in); + loadH(h1, in_off + ih_str4, in); + loadH(h2, in_off + ih_str4 * 2, in); + loadH(h3, in_off + ih_str4 * 3, in); + loadH(h4, in_off + ih_str4 * 4, in); + loadH(h5, in_off + ih_str4 * 5, in); + + h1[0] = (T)(4.0) * h1[0] - (T)(5.0) * h1[2] + h1[4]; + h2[0] = (T)(4.0) * h2[0] - (T)(5.0) * h2[2] + h2[4]; + h3[0] = (T)(4.0) * h3[0] - (T)(5.0) * h3[2] + h3[4]; + h4[0] = (T)(4.0) * h4[0] - (T)(5.0) * h4[2] + h4[4]; + + tmp[0] = (T)(-4.0) * (h1[1] + h1[2]) + h1[3] + h1[4]; + tmp[1] = (T)(-4.0) * (h2[1] + h2[2]) + h2[3] + h2[4]; + tmp[2] = (T)(-4.0) * (h3[1] + h3[2]) + h3[3] + h3[4]; + tmp[3] = (T)(-4.0) * (h4[1] + h4[2]) + h4[3] + h4[4]; + + tmp[4] = (T)(4.0) * (h1[1] - h1[2]) - h1[3] + h1[4]; + tmp[5] = (T)(4.0) * (h2[1] - h2[2]) - h2[3] + h2[4]; + tmp[6] = (T)(4.0) * (h3[1] - h3[2]) - h3[3] + h3[4]; + tmp[7] = (T)(4.0) * (h4[1] - h4[2]) - h4[3] + h4[4]; + + tmp[8] = (T)(2.0) * (h1[3] - h1[1]) - h1[2] + h1[4]; + tmp[9] = (T)(2.0) * (h2[3] - h2[1]) - h2[2] + h2[4]; + tmp[10] = (T)(2.0) * (h3[3] - h3[1]) - h3[2] + h3[4]; + tmp[11] = (T)(2.0) * (h4[3] - h4[1]) - h4[2] + h4[4]; + + tmp[12] = (T)(2.0) * (h1[1] - h1[3]) - h1[2] + h1[4]; + tmp[13] = (T)(2.0) * (h2[1] - h2[3]) - h2[2] + h2[4]; + tmp[14] = (T)(2.0) * (h3[1] - h3[3]) - h3[2] + h3[4]; + tmp[15] = (T)(2.0) * (h4[1] - h4[3]) - h4[2] + h4[4]; + + h1[5] = (T)(4.0) * h1[1] - (T)(5.0) * h1[3] + h1[5]; + h2[5] = (T)(4.0) * h2[1] - (T)(5.0) * h2[3] + h2[5]; + h3[5] = (T)(4.0) * h3[1] - (T)(5.0) * h3[3] + h3[5]; + h4[5] = (T)(4.0) * h4[1] - (T)(5.0) * h4[3] + h4[5]; + + pictran[pictran_off] = + (T)(16.0) * h0[0] - (T)(20.0) * h0[2] + (T)(4.0) * 
h0[4] - (T)(5.0) * h2[0] + h4[0]; + pictran[pictran_off + pwh_str] = (T)(-4.0) * (h1[0] + h2[0]) + h3[0] + h4[0]; + pictran[pictran_off + pwh_str * 2] = (T)(4.0) * (h1[0] - h2[0]) - h3[0] + h4[0]; + pictran[pictran_off + pwh_str * 3] = (T)(2.0) * (h3[0] - h1[0]) - h2[0] + h4[0]; + pictran[pictran_off + pwh_str * 4] = (T)(2.0) * (h1[0] - h3[0]) - h2[0] + h4[0]; + pictran[pictran_off + pwh_str * 5] = + (T)(4.0) * (h1[0] + h5[0]) - (T)(5.0) * (h3[0] + h5[2]) + h5[4]; + + pictran[pictran_off + pwh_str * 6] = + (T)(-16.0) * (h0[1] + h0[2]) + (T)(4.0) * (h0[3] + h0[4]) - (T)(5.0) * tmp[1] + tmp[3]; + pictran[pictran_off + pwh_str * 7] = (T)(-4.0) * (tmp[0] + tmp[1]) + tmp[2] + tmp[3]; + pictran[pictran_off + pwh_str * 8] = (T)(4.0) * (tmp[0] - tmp[1]) - tmp[2] + tmp[3]; + pictran[pictran_off + pwh_str * 9] = (T)(2.0) * (tmp[2] - tmp[0]) - tmp[1] + tmp[3]; + pictran[pictran_off + pwh_str * 10] = (T)(2.0) * (tmp[0] - tmp[2]) - tmp[1] + tmp[3]; + pictran[pictran_off + pwh_str * 11] = + (T)(4.0) * (tmp[0] - h5[1] - h5[2]) - (T)(5.0) * tmp[2] + h5[3] + h5[4]; + + pictran[pictran_off + pwh_str * 12] = + (T)(16.0) * (h0[1] - h0[2]) + (T)(4.0) * (h0[4] - h0[3]) - (T)(5.0) * tmp[5] + tmp[7]; + pictran[pictran_off + pwh_str * 13] = (T)(-4.0) * (tmp[4] + tmp[5]) + tmp[6] + tmp[7]; + pictran[pictran_off + pwh_str * 14] = (T)(4.0) * (tmp[4] - tmp[5]) - tmp[6] + tmp[7]; + pictran[pictran_off + pwh_str * 15] = (T)(2.0) * (tmp[6] - tmp[4]) - tmp[5] + tmp[7]; + pictran[pictran_off + pwh_str * 16] = (T)(2.0) * (tmp[4] - tmp[6]) - tmp[5] + tmp[7]; + pictran[pictran_off + pwh_str * 17] = + (T)(4.0) * (tmp[4] + h5[1] - h5[2]) - (T)(5.0) * tmp[6] - h5[3] + h5[4]; + + pictran[pictran_off + pwh_str * 18] = + (T)(8.0) * (h0[3] - h0[1]) + (T)(4.0) * (h0[4] - h0[2]) - (T)(5.0) * tmp[9] + tmp[11]; + pictran[pictran_off + pwh_str * 19] = (T)(-4.0) * (tmp[8] + tmp[9]) + tmp[10] + tmp[11]; + pictran[pictran_off + pwh_str * 20] = (T)(4.0) * (tmp[8] - tmp[9]) - tmp[10] + tmp[11]; + pictran[pictran_off + pwh_str * 21] = (T)(2.0) * (tmp[10] - tmp[8]) - tmp[9] + tmp[11]; + pictran[pictran_off + pwh_str * 22] = (T)(2.0) * (tmp[8] - tmp[10]) - tmp[9] + tmp[11]; + pictran[pictran_off + pwh_str * 23] = + (T)(4.0) * tmp[8] + (T)(2.0) * (h5[3] - h5[1]) - h5[2] - (T)(5.0) * tmp[10] + h5[4]; + + pictran[pictran_off + pwh_str * 24] = + (T)(8.0) * (h0[1] - h0[3]) + (T)(4.0) * (h0[4] - h0[2]) - (T)(5.0) * tmp[13] + tmp[15]; + pictran[pictran_off + pwh_str * 25] = (T)(-4.0) * (tmp[12] + tmp[13]) + tmp[14] + tmp[15]; + pictran[pictran_off + pwh_str * 26] = (T)(4.0) * (tmp[12] - tmp[13]) - tmp[14] + tmp[15]; + pictran[pictran_off + pwh_str * 27] = (T)(2.0) * (tmp[14] - tmp[12]) - tmp[13] + tmp[15]; + pictran[pictran_off + pwh_str * 28] = (T)(2.0) * (tmp[12] - tmp[14]) - tmp[13] + tmp[15]; + pictran[pictran_off + pwh_str * 29] = + (T)(4.0) * tmp[12] + (T)(2.0) * (h5[1] - h5[3]) - h5[2] - (T)(5.0) * tmp[14] + h5[4]; + + pictran[pictran_off + pwh_str * 30] = + (T)(16.0) * h0[1] - (T)(20.0) * h0[3] + (T)(4.0) * h0[5] - (T)(5.0) * h2[5] + h4[5]; + pictran[pictran_off + pwh_str * 31] = (T)(-4.0) * (h1[5] + h2[5]) + h3[5] + h4[5]; + pictran[pictran_off + pwh_str * 32] = (T)(4.0) * (h1[5] - h2[5]) - h3[5] + h4[5]; + pictran[pictran_off + pwh_str * 33] = (T)(2.0) * (h3[5] - h1[5]) - h2[5] + h4[5]; + pictran[pictran_off + pwh_str * 34] = (T)(2.0) * (h1[5] - h3[5]) - h2[5] + h4[5]; + pictran[pictran_off + pwh_str * 35] = + (T)(4.0) * (h1[5] + h5[1]) - (T)(5.0) * (h3[5] + h5[3]) + h5[5]; +} diff --git 
a/tensor_computing/src/gpu/mali/cl/conv_wino_trans_picbuf_left.cl b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_picbuf_left.cl similarity index 78% rename from tensor_computing/src/gpu/mali/cl/conv_wino_trans_picbuf_left.cl rename to compute/tensor/src/gpu/mali/cl/conv_wino_trans_picbuf_left.cl index 7f9e6299..7adc3603 100644 --- a/tensor_computing/src/gpu/mali/cl/conv_wino_trans_picbuf_left.cl +++ b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_picbuf_left.cl @@ -12,19 +12,32 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "kernel_def.h" -#define MANGLE_NAME_IMPL(base, ON) base ## ON +#define MANGLE_NAME_IMPL(base, ON) base##ON #define MANGLE_NAME(base, ON) MANGLE_NAME_IMPL(base, ON) - -__kernel void MANGLE_NAME(conv_wino_trans_picbuf_left_, ON) -(const int ih_str, const int iw_str, const int ic_str, const int oh_str, const int ow_str, const int ohw_str, const int ohwc_str, const int bx, const int by, __global const T* in, __global T* out) { +__kernel void MANGLE_NAME(conv_wino_trans_picbuf_left_, ON)(const int ih_str, + const int iw_str, + const int ic_str, + const int oh_str, + const int ow_str, + const int ohw_str, + const int ohwc_str, + const int bx, + const int by, + __global const T *in, + __global T *out) +{ const int idx = get_global_id(0); const int idy = get_global_id(1); const int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; + if (idx >= bx || idy >= by) { + return; + } const int idzx = idz % ic_str; const int idzy = idz / ic_str; - if(idx * ON >= oh_str) return; + if (idx * ON >= oh_str) { + return; + } T in_val[6]; T out_val0[ON]; T out_val1[ON]; @@ -38,7 +51,7 @@ __kernel void MANGLE_NAME(conv_wino_trans_picbuf_left_, ON) LOAD_BUF_ARRAY2(in_val, in_off, in); - for(uchar i = 0; i < ON; ++i) { + for (uchar i = 0; i < ON; ++i) { T4 tmp = vload4(0, in + in_off + 2); in_val[2] = tmp.x; in_val[3] = tmp.y; @@ -51,17 +64,17 @@ __kernel void MANGLE_NAME(conv_wino_trans_picbuf_left_, ON) UPDATE_REG(out_val4); UPDATE_REG(out_val5); p[0] = -4; - p[1] = 1; - for(uchar j = 0; j < 2; ++j) { + p[1] = 1; + for (uchar j = 0; j < 2; ++j) { out_val1[UN] = out_val2[UN]; out_val2[UN] = p[0] * in_val[1] - (T)(4.0) * in_val[2] + p[1] * in_val[3] + in_val[4]; - p[0] = -p[0]; - p[1] = -p[1]; + p[0] = -p[0]; + p[1] = -p[1]; } p[0] = -2; - p[1] = 2; - for(uchar j = 0; j < 2; ++j) { + p[1] = 2; + for (uchar j = 0; j < 2; ++j) { out_val3[UN] = out_val4[UN]; out_val4[UN] = p[0] * in_val[1] - in_val[2] + p[1] * in_val[3] + in_val[4]; p[0] = -p[0]; @@ -70,7 +83,7 @@ __kernel void MANGLE_NAME(conv_wino_trans_picbuf_left_, ON) p[0] = 4; p[1] = -5; - for(uchar j = 0; j < 2; j++) { + for (uchar j = 0; j < 2; j++) { out_val0[UN] = out_val5[UN]; out_val5[UN] = p[0] * in_val[0] + p[1] * in_val[2] + in_val[4]; in_val[0] = in_val[1]; @@ -83,10 +96,10 @@ __kernel void MANGLE_NAME(conv_wino_trans_picbuf_left_, ON) in_val[1] = in_val[3]; in_off += 4; } - - int out_off = idzy * ohwc_str + idzx * ohw_str + idy * oh_str + idx * ON; - STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val0, out_off, 1, out); - STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val1, out_off + 6 * ohwc_str, 1, out); + + int out_off = idzy * ohwc_str + idzx * ohw_str + idy * oh_str + idx * ON; + STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val0, out_off, 1, out); + STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val1, out_off + 6 * ohwc_str, 1, out); STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val2, out_off + 12 * ohwc_str, 1, out); STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val3, out_off + 18 * ohwc_str, 1, out); 
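+    // Planes are spaced 6 * ohwc_str apart here (0, 6, 12, 18, 24, 30): plane
+    // 6 * k + idzy holds left-transform component k of right-transform component
+    // idzy, filling all 6 x 6 = 36 Winograd positions for the batched GEMM
+    // stage that presumably follows.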
STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val4, out_off + 24 * ohwc_str, 1, out); diff --git a/tensor_computing/src/gpu/mali/cl/conv_wino_trans_picbuf_right.cl b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_picbuf_right.cl similarity index 77% rename from tensor_computing/src/gpu/mali/cl/conv_wino_trans_picbuf_right.cl rename to compute/tensor/src/gpu/mali/cl/conv_wino_trans_picbuf_right.cl index 65c56270..62894fb7 100644 --- a/tensor_computing/src/gpu/mali/cl/conv_wino_trans_picbuf_right.cl +++ b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_picbuf_right.cl @@ -12,14 +12,29 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "kernel_def.h" -__kernel void conv_wino_trans_picbuf_right -(const int ih_str4, const int iw_str, const int ih_off4, const int iw_off, const int oh_str, const int ow_str, const int ohwc_str, const int oh_off4, const int bx, const int by, __global const T* in, __global T* out) { +__kernel void conv_wino_trans_picbuf_right(const int ih_str4, + const int iw_str, + const int ih_off4, + const int iw_off, + const int oh_str, + const int ow_str, + const int ohwc_str, + const int oh_off4, + const int bx, + const int by, + __global const T *in, + __global T *out) +{ int idx = get_global_id(0); const int idy = get_global_id(1); const int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; + if (idx >= bx || idy >= by) { + return; + } idx = idx - oh_off4; - if((idy << 2) >= ow_str) return; + if ((idy << 2) >= ow_str) { + return; + } T out_val0[4]; T out_val1[4]; T out_val2[4]; @@ -34,12 +49,12 @@ __kernel void conv_wino_trans_picbuf_right SET_REG_ARRAY(0, out_val4); SET_REG_ARRAY(0, out_val5); - if(idx >= 0 && idx < ih_str4) { + if (idx >= 0 && idx < ih_str4) { int in_off = (idz * iw_str + (idy << 4) + iw_off) * ih_str4 + idx + ih_off4; - T in_val[6]; + T in_val[6]; in_val[0] = in[in_off]; in_val[1] = in[in_off + ih_str4]; - for(uchar i = 0; i < 4; ++i) { + for (uchar i = 0; i < 4; ++i) { in_val[2] = in[in_off + 2 * ih_str4]; in_val[3] = in[in_off + 3 * ih_str4]; in_val[4] = in[in_off + 4 * ih_str4]; @@ -53,17 +68,17 @@ __kernel void conv_wino_trans_picbuf_right UPDATE_REG(out_val5); p[0] = -4; - p[1] = 1; - for(uchar j = 0; j < 2; ++j) { + p[1] = 1; + for (uchar j = 0; j < 2; ++j) { out_val1[UN] = out_val2[UN]; out_val2[UN] = p[0] * in_val[1] - (T)(4) * in_val[2] + p[1] * in_val[3] + in_val[4]; - p[0] = -p[0]; - p[1] = -p[1]; + p[0] = -p[0]; + p[1] = -p[1]; } p[0] = -2; - p[1] = 2; - for(uchar j = 0; j < 2; ++j) { + p[1] = 2; + for (uchar j = 0; j < 2; ++j) { out_val3[UN] = out_val4[UN]; out_val4[UN] = p[0] * in_val[1] - in_val[2] + p[1] * in_val[3] + in_val[4]; p[0] = -p[0]; @@ -72,7 +87,7 @@ __kernel void conv_wino_trans_picbuf_right p[0] = 4; p[1] = -5; - for(uchar j = 0; j < 2; ++j) { + for (uchar j = 0; j < 2; ++j) { out_val0[UN] = out_val5[UN]; out_val5[UN] = p[0] * in_val[0] + p[1] * in_val[2] + in_val[4]; in_val[0] = in_val[1]; @@ -86,11 +101,11 @@ __kernel void conv_wino_trans_picbuf_right in_off += (ih_str4 << 2); } } - + idx += oh_off4; - int out_off = (((idz << 2) + (idx & 3)) * ow_str + (idy << 2)) * oh_str + (idx >> 2); - STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val0, out_off, oh_str, out); - STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val1, out_off + ohwc_str, oh_str, out); + int out_off = (((idz << 2) + (idx & 3)) * ow_str + (idy << 2)) * oh_str + (idx >> 2); + STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val0, out_off, oh_str, out); + STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val1, out_off + ohwc_str, oh_str, out);
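+    // The right-hand transform fills six consecutive ohwc_str planes (k = 0..5);
+    // conv_wino_trans_picbuf_left above appears to read these and fan each one
+    // out with stride 6 * ohwc_str into the full 36-plane buffer.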
STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val2, out_off + 2 * ohwc_str, oh_str, out); STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val3, out_off + 3 * ohwc_str, oh_str, out); STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val4, out_off + 4 * ohwc_str, oh_str, out); diff --git a/compute/tensor/src/gpu/mali/cl/copy.cl b/compute/tensor/src/gpu/mali/cl/copy.cl new file mode 100644 index 00000000..fdd65036 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/copy.cl @@ -0,0 +1,87 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define MANGLE_NAME_IMPL(base, DT) base##DT +#define MANGLE_NAME(base, DT) MANGLE_NAME_IMPL(base, DT) + +#if defined(USE_BLOCK_INDEX) +__kernel void MANGLE_NAME(copy_with_block_index_, DT)(const int s_len, + const int d_len, + const int s_off, + const int d_off, + const int s_str, + const int d_str, + const int bx, + __global const int *srcBlockIndex, + __global const int *dstBlockIndex, + __global const T *src, + __global T *dst) +{ +#else +__kernel void MANGLE_NAME(copy_, DT)(const int s_len, + const int d_len, + const int s_off, + const int d_off, + const int bx, + __global const T *src, + __global T *dst) +{ +#endif + int idx = get_global_id(0); + if (idx >= bx) { + return; + } + char s_ex = (((idx << 2) + 4) <= s_len) ? 4 : (s_len & 3); + char d_ex = (((idx << 2) + 4) <= d_len) ? 4 : (d_len & 3); + if ((idx << 2) >= s_len) { + s_ex = 0; + } + if ((idx << 2) >= d_len) { + d_ex = 0; + } +#if defined(USE_BLOCK_INDEX) + s_off = s_off + s_str * srcBlockIndex[0]; + d_off = d_off + d_str * dstBlockIndex[0]; +#endif + int src_off = s_off + (idx << 2); + int dst_off = d_off + (idx << 2); + + T4 val = 0; + if (s_ex == 4) { + val = vload4(0, src + src_off); + } else { + if (s_ex == 1) { + val.x = src[src_off]; + } + if (s_ex == 2) { + val.xy = vload2(0, src + src_off); + } + if (s_ex == 3) { + val.xyz = vload3(0, src + src_off); + } + } + + if (d_ex == 4) { + vstore4(val, 0, dst + dst_off); + } else { + if (d_ex == 1) { + dst[dst_off] = val.x; + } + if (d_ex == 2) { + vstore2(val.xy, 0, dst + dst_off); + } + if (d_ex == 3) { + vstore3(val.xyz, 0, dst + dst_off); + } + } +} diff --git a/compute/tensor/src/gpu/mali/cl/deconv_direct.cl b/compute/tensor/src/gpu/mali/cl/deconv_direct.cl new file mode 100644 index 00000000..039fa0d2 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/deconv_direct.cl @@ -0,0 +1,100 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + +__kernel void deconv_direct(__global const T *input, + __global const T *weights, + __global T *output, + __read_only image1d_t bias, + int iw, + int iw_str, + int iw_off, + int ih, + int ih_str, + int ih_off, + int kw, + int kh, + int kc, + int kn, + int sw, + int sh, + int pw, + int ph, + int ow, + int ow_str, + int ow_off, + int oh, + int oh_str, + int oh_off, + int ic, + int oc, + int align_h, + int align_w, + int in_channel_blocks, + int out_channel_blocks) +{ + const int oh_idx = get_global_id(0); + const int ow_idx = get_global_id(1); + const int oc_idx = get_global_id(2); + if (oh_idx >= oh || ow_idx >= ow || oc_idx >= oc) { + return; + } + + T4 out0 = read_imageh(bias, sampler, oc_idx); + + int kernel_start_x = max(0, (oh_idx + align_h) / sh); + int kernel_start_y = max(0, (ow_idx + align_w) / sw); + + int deal_kernel_width = kw - (kernel_start_y * sw + pw) + ow_idx - 1; + int deal_kernel_height = kh - (kernel_start_x * sh + ph) + oh_idx - 1; + + int kernel_0, kernel_1, kernel_2, kernel_3, kernel_y; + T4 in0; + T4 weights0, weights1, weights2, weights3; + int in_off, kernel_off; + for (int i = 0; i < in_channel_blocks; i++) { + kernel_0 = 0; + kernel_1 = kernel_0 + 1; + kernel_2 = kernel_0 + 2; + kernel_3 = kernel_0 + 3; + for (int k_y = deal_kernel_width, idx_w = kernel_start_y; k_y >= 0; k_y -= sw, idx_w++) { + int in_width0 = idx_w; + int in_height0 = kernel_start_x; + for (int k_x = deal_kernel_height; k_x >= 0; k_x -= sh) { + kernel_off = + (oc_idx * kw * kh * in_channel_blocks + i * kw * kh + k_x * kh + k_y) * 4; + weights0 = vload4(kernel_off + kernel_0, weights); + weights1 = vload4(kernel_off + kernel_1, weights); + weights2 = vload4(kernel_off + kernel_2, weights); + weights3 = vload4(kernel_off + kernel_3, weights); + + // in_off = i * ih * iw + ih * in_width0 + in_height0; + in_off = (i * iw_str + in_width0 + iw_off) * ih_str + ih_off + in_height0; + if (in_height0 < 0 || in_height0 >= ih || in_width0 < 0 || in_width0 >= iw) { + in0 = (T4)0; + } else { + in0 = vload4(in_off, input); + } + + out0 = mad(in0.x, weights0, out0); + out0 = mad(in0.y, weights1, out0); + out0 = mad(in0.z, weights2, out0); + out0 = mad(in0.w, weights3, out0); + in_height0++; + } + } + } + int out_off = (oc_idx * ow_str + ow_idx + ow_off) * oh_str + oh_idx + oh_off; + vstore4(out0, out_off, output); +} diff --git 
a/compute/tensor/src/gpu/mali/cl/deconv_direct_trans_fltbuf.cl b/compute/tensor/src/gpu/mali/cl/deconv_direct_trans_fltbuf.cl new file mode 100644 index 00000000..6f2f4b7c --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/deconv_direct_trans_fltbuf.cl @@ -0,0 +1,48 @@ +#define loadFltval(off, str, flt, val) \ + { \ + val.x = flt[off]; \ + val.y = flt[off + str]; \ + val.z = flt[off + str * 2]; \ + val.w = flt[off + str * 3]; \ + } +#define loadFltvalEdge(off, str, flt, val, edge) \ + { \ + val.x = flt[off]; \ + if (edge > 1) \ + val.y = flt[off + str]; \ + if (edge > 2) \ + val.z = flt[off + str * 2]; \ + } + +// conv filter gs[3] = {fwh, (fc+3)/4, (fn+3)/4*4}; +// deconv filter gs[3] = {fwh, (fc+3)/4*4, (fn+3)/4}; +// iohw -> nchwn4c4 + +__kernel void deconv_direct_trans_fltbuf( + const int fwh, const int fc, const int fn, __global const T *fltdata, __global T *fltbuf) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + + short ec = ((idy + 1) * 4 <= fc) ? 4 : (fc % 4); + + int flt_off = (idz * fc + idy * 4) * fwh + idx; + + T4 val = 0; + + int str = fwh; + if (idz < fn) { + if (ec == 4) { + loadFltval(flt_off, str, fltdata, val); + } else { + loadFltvalEdge(flt_off, str, fltdata, val, ec); + } + } + int bc = (fn + 4 - 1) / 4; + int out_off; + out_off = (idy * bc + idz / 4) * fwh * 4 + idx * 4 + (idz % 4); + // out_off = (idy / 4 * bc + idz) * fwh * 4 + idx * 4 + (idy % 4); + + vstore4(val, out_off, fltbuf); +} diff --git a/compute/tensor/src/gpu/mali/cl/deconv_gemm_f2s2.cl b/compute/tensor/src/gpu/mali/cl/deconv_gemm_f2s2.cl new file mode 100644 index 00000000..e9d95f59 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/deconv_gemm_f2s2.cl @@ -0,0 +1,339 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
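+
+// Why f2s2 reduces to plain GEMMs -- a minimal scalar sketch, not part of this
+// kernel's source: with a 2x2 filter and stride 2 there is no tap overlap, so
+// output pixel (2y + a, 2x + b) receives exactly one contribution,
+// in(y, x) * w(a, b), and the deconvolution splits into four independent
+// 1x1 convolutions, one per (a, b) sub-position, which is what the kernels
+// below exploit. The function name and the dense CHW / iohw layouts here are
+// illustrative assumptions only, and the block is kept under #if 0 so it can
+// never affect the build.
+#if 0
+void deconv_f2s2_reference(const float *in, const float *w, float *out,
+    int ih, int iw, int ic, int oc)
+{
+    // in: ic x ih x iw, w: ic x oc x 2 x 2 (iohw), out: oc x 2*ih x 2*iw
+    for (int o = 0; o < oc; ++o)
+        for (int y = 0; y < ih; ++y)
+            for (int x = 0; x < iw; ++x)
+                for (int a = 0; a < 2; ++a)
+                    for (int b = 0; b < 2; ++b) {
+                        float acc = 0;
+                        for (int c = 0; c < ic; ++c)
+                            acc += in[(c * ih + y) * iw + x] *
+                                w[((c * oc + o) * 2 + a) * 2 + b];
+                        out[(o * 2 * ih + (2 * y + a)) * 2 * iw + 2 * x + b] = acc;
+                    }
+}
+#endif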
+ +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, ON, KN) base##ON##KN +#define MANGLE_NAME(base, ON, KN) MANGLE_NAME_IMPL(base, ON, KN) + +#if defined(REUSE_H) +#if (ON == 2) +#define SET_BIAS_VAL(bv, ov) \ + { \ + ov.s0 = bv.x; \ + ov.s1 = bv.y; \ + ov.s2 = bv.z; \ + ov.s3 = bv.w; \ + ov.s4 = bv.x; \ + ov.s5 = bv.y; \ + ov.s6 = bv.z; \ + ov.s7 = bv.w; \ + } + +#define calCore(iv, fv, ov) \ + { \ + ov.s0 += iv.s0 * fv.s0 + iv.s1 * fv.s1 + iv.s2 * fv.s2 + iv.s3 * fv.s3; \ + ov.s1 += iv.s0 * fv.s4 + iv.s1 * fv.s5 + iv.s2 * fv.s6 + iv.s3 * fv.s7; \ + ov.s2 += iv.s0 * fv.s8 + iv.s1 * fv.s9 + iv.s2 * fv.sa + iv.s3 * fv.sb; \ + ov.s3 += iv.s0 * fv.sc + iv.s1 * fv.sd + iv.s2 * fv.se + iv.s3 * fv.sf; \ + ov.s4 += iv.s4 * fv.s0 + iv.s5 * fv.s1 + iv.s6 * fv.s2 + iv.s7 * fv.s3; \ + ov.s5 += iv.s4 * fv.s4 + iv.s5 * fv.s5 + iv.s6 * fv.s6 + iv.s7 * fv.s7; \ + ov.s6 += iv.s4 * fv.s8 + iv.s5 * fv.s9 + iv.s6 * fv.sa + iv.s7 * fv.sb; \ + ov.s7 += iv.s4 * fv.sc + iv.s5 * fv.sd + iv.s6 * fv.se + iv.s7 * fv.sf; \ + } +#define VLOAD_VEC(off, buf) vload8(0, buf + off); +#define VSTORE_VEC(v0, v1, off, buf) \ + { \ + ACTIVATION_V8(v0); \ + ACTIVATION_V8(v1); \ + vstore16((T16)(v0.s0, v0.s1, v0.s2, v0.s3, v1.s0, v1.s1, v1.s2, v1.s3, v0.s4, v0.s5, \ + v0.s6, v0.s7, v1.s4, v1.s5, v1.s6, v1.s7), \ + 0, buf + off); \ + } +#elif (ON == 4) +#define SET_BIAS_VAL(bv, ov) \ + { \ + ov.s0 = bv.x; \ + ov.s1 = bv.y; \ + ov.s2 = bv.z; \ + ov.s3 = bv.w; \ + ov.s4 = bv.x; \ + ov.s5 = bv.y; \ + ov.s6 = bv.z; \ + ov.s7 = bv.w; \ + ov.s8 = bv.x; \ + ov.s9 = bv.y; \ + ov.sa = bv.z; \ + ov.sb = bv.w; \ + ov.sc = bv.x; \ + ov.sd = bv.y; \ + ov.se = bv.z; \ + ov.sf = bv.w; \ + } +#define calCore(iv, fv, ov) \ + { \ + ov.s0 += iv.s0 * fv.s0 + iv.s1 * fv.s1 + iv.s2 * fv.s2 + iv.s3 * fv.s3; \ + ov.s1 += iv.s0 * fv.s4 + iv.s1 * fv.s5 + iv.s2 * fv.s6 + iv.s3 * fv.s7; \ + ov.s2 += iv.s0 * fv.s8 + iv.s1 * fv.s9 + iv.s2 * fv.sa + iv.s3 * fv.sb; \ + ov.s3 += iv.s0 * fv.sc + iv.s1 * fv.sd + iv.s2 * fv.se + iv.s3 * fv.sf; \ + ov.s4 += iv.s4 * fv.s0 + iv.s5 * fv.s1 + iv.s6 * fv.s2 + iv.s7 * fv.s3; \ + ov.s5 += iv.s4 * fv.s4 + iv.s5 * fv.s5 + iv.s6 * fv.s6 + iv.s7 * fv.s7; \ + ov.s6 += iv.s4 * fv.s8 + iv.s5 * fv.s9 + iv.s6 * fv.sa + iv.s7 * fv.sb; \ + ov.s7 += iv.s4 * fv.sc + iv.s5 * fv.sd + iv.s6 * fv.se + iv.s7 * fv.sf; \ + ov.s8 += iv.s8 * fv.s0 + iv.s9 * fv.s1 + iv.sa * fv.s2 + iv.sb * fv.s3; \ + ov.s9 += iv.s8 * fv.s4 + iv.s9 * fv.s5 + iv.sa * fv.s6 + iv.sb * fv.s7; \ + ov.sa += iv.s8 * fv.s8 + iv.s9 * fv.s9 + iv.sa * fv.sa + iv.sb * fv.sb; \ + ov.sb += iv.s8 * fv.sc + iv.s9 * fv.sd + iv.sa * fv.se + iv.sb * fv.sf; \ + ov.sc += iv.sc * fv.s0 + iv.sd * fv.s1 + iv.se * fv.s2 + iv.sf * fv.s3; \ + ov.sd += iv.sc * fv.s4 + iv.sd * fv.s5 + iv.se * fv.s6 + iv.sf * fv.s7; \ + ov.se += iv.sc * fv.s8 + iv.sd * fv.s9 + iv.se * fv.sa + iv.sf * fv.sb; \ + ov.sf += iv.sc * fv.sc + iv.sd * fv.sd + iv.se * fv.se + iv.sf * fv.sf; \ + } + +#define VLOAD_VEC(off, buf) vload16(0, buf + off); +#define VSTORE_VEC(v0, v1, off, buf) \ + { \ + ACTIVATION_V16(v0); \ + ACTIVATION_V16(v1); \ + vstore16((T16)(v0.s0, v0.s1, v0.s2, v0.s3, v1.s0, v1.s1, v1.s2, v1.s3, v0.s4, v0.s5, \ + v0.s6, v0.s7, v1.s4, v1.s5, v1.s6, v1.s7), \ + 0, buf + off); \ + vstore16((T16)(v0.s8, v0.s9, v0.sa, v0.sb, v1.s8, v1.s9, v1.sa, v1.sb, v0.sc, v0.sd, \ + v0.se, v0.sf, v1.sc, v1.sd, v1.se, v1.sf), \ + 0, buf + off + 16); \ + } +#endif + +#if defined(USE_RELU) +__kernel void MANGLE_NAME(deconv_gemm_f2s2_h_relu_, ON, KN) +#else +__kernel void MANGLE_NAME(deconv_gemm_f2s2_h_, ON, KN) 
+#endif + (const int ih_str, + int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + int ohw_str, + const int oh_off, + const int ow_off, + const int oh, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } +#if (ON == 2) + T8 in_val; + T8 out_val[KN]; +#elif (ON == 4) + T16 in_val; + T16 out_val[KN]; +#endif + T16 flt_val; + T4 bias_val; + +#if (KN == 2) + bias_val = read_imageh(bias, sampler, (idz >> 1)); + SET_BIAS_VAL(bias_val, out_val[0]); + SET_BIAS_VAL(bias_val, out_val[1]); +#elif (KN == 4) + bias_val = read_imageh(bias, sampler, idz); + SET_BIAS_VAL(bias_val, out_val[0]); + SET_BIAS_VAL(bias_val, out_val[1]); + SET_BIAS_VAL(bias_val, out_val[2]); + SET_BIAS_VAL(bias_val, out_val[3]); +#endif + + int in_off = ((idy + iw_off) * ih_str + idx * ON + ih_off) << 2; + int flt_off = idz * ic_str * KN; + ihw_str = ihw_str << 2; + + for (int i = 0; i < ic_str; ++i) { + in_val = VLOAD_VEC(in_off, in); +#if (KN == 2) + flt_val = vload16(flt_off, flt); + calCore(in_val, flt_val, out_val[0]); + flt_val = vload16(flt_off + 1, flt); + calCore(in_val, flt_val, out_val[1]); +#elif (KN == 4) + for (uchar j = 0; j < KN; ++j) { + flt_val = vload16(flt_off + j, flt); + if (j == 0) { + calCore(in_val, flt_val, out_val[0]); + } + if (j == 1) { + calCore(in_val, flt_val, out_val[1]); + } + if (j == 2) { + calCore(in_val, flt_val, out_val[2]); + } + if (j == 3) { + calCore(in_val, flt_val, out_val[3]); + } + } +#endif + flt_off += KN; + in_off += ihw_str; + } + +#if (KN == 2) + int out_off = (idx << 1) * ON + oh_off; + out_off += ((idy << 1) + ow_off + (idz & 1)) * oh_str; + out_off += (idz >> 1) * ohw_str; + out_off = (out_off << 2); + VSTORE_VEC(out_val[0], out_val[1], out_off, out); +#elif (KN == 4) + int out_off = (idx << 1) * ON + oh_off; + out_off += ((idy << 1) + ow_off) * oh_str; + out_off += idz * ohw_str; + out_off = (out_off << 2); + VSTORE_VEC(out_val[0], out_val[1], out_off, out); + VSTORE_VEC(out_val[2], out_val[3], out_off + oh_str * 4, out); +#endif +} + +// // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // / +#else + +#define VSTORE_VEC(v0, v1, off, buf) \ + { \ + ACTIVATION_V4(v0); \ + ACTIVATION_V4(v1); \ + vstore8((T8)(v0.s0, v0.s1, v0.s2, v0.s3, v1.s0, v1.s1, v1.s2, v1.s3), 0, buf + off); \ + } + +#if defined(USE_RELU) +__kernel void MANGLE_NAME(deconv_gemm_f2s2_relu_, ON, KN) +#else +__kernel void MANGLE_NAME(deconv_gemm_f2s2_, ON, KN) +#endif + (const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int ow, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + T4 in_val[IN]; + T16 flt_val; + T4 out_val[KN][ON]; + T4 bias_val; + +#if (KN == 2) + bias_val = read_imageh(bias, sampler, (idz >> 1)); + SET_REG_ARRAY(bias_val, out_val[0]); + SET_REG_ARRAY(bias_val, 
out_val[1]); +#elif (KN == 4) + bias_val = read_imageh(bias, sampler, idz); + SET_REG_ARRAY(bias_val, out_val[0]); + SET_REG_ARRAY(bias_val, out_val[1]); + SET_REG_ARRAY(bias_val, out_val[2]); + SET_REG_ARRAY(bias_val, out_val[3]); +#endif + + int in_off = (idy * ON + iw_off) * ih_str + idx + ih_off; + int flt_off = idz * ic_str * KN; + + for (int i = 0; i < ic_str; ++i) { + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off, ih_str, in); +#if (KN == 2) + flt_val = vload16(flt_off, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); + flt_val = vload16(flt_off + 1, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); +#elif (KN == 4) + for (uchar j = 0; j < KN; ++j) { + flt_val = vload16(flt_off + j, flt); + if (j == 0) { + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); + } + if (j == 1) { + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); + } + if (j == 2) { + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]); + } + if (j == 3) { + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[3]); + } + } +#endif + flt_off += KN; + in_off += ihw_str; + } +#if (KN == 2) + int index_y = (idy << 1) * ON + (idz & 1); + int out_off = (idx << 1) + oh_off; + out_off += (index_y + ow_off) * oh_str; + out_off += (idz >> 1) * ohw_str; + out_off = (out_off << 2); + VSTORE_VEC(out_val[0][0], out_val[1][0], out_off, out); +#if (ON > 1) + if (index_y + 2 < ow) { + VSTORE_VEC(out_val[0][1], out_val[1][1], out_off + oh_str * 8, out); + } +#endif +#if (ON > 2) + if (index_y + 4 < ow) { + VSTORE_VEC(out_val[0][2], out_val[1][2], out_off + oh_str * 16, out); + } +#endif +#if (ON > 3) + if (index_y + 6 < ow) { + VSTORE_VEC(out_val[0][3], out_val[1][3], out_off + oh_str * 24, out); + } +#endif +#elif (KN == 4) + int index_y = (idy << 1) * ON; + int out_off = (idx << 1) + oh_off; + out_off += (index_y + ow_off) * oh_str; + out_off += idz * ohw_str; + out_off = (out_off << 2); + VSTORE_VEC(out_val[0][0], out_val[1][0], out_off, out); + if (index_y + 1 < ow) { + VSTORE_VEC(out_val[2][0], out_val[3][0], out_off + oh_str * 4, out); + } +#if (ON > 1) + if (index_y + 2 < ow) { + VSTORE_VEC(out_val[0][1], out_val[1][1], out_off + oh_str * 8, out); + } + if (index_y + 3 < ow) { + VSTORE_VEC(out_val[2][1], out_val[3][1], out_off + oh_str * 12, out); + } +#endif +#if (ON > 2) + if (index_y + 4 < ow) { + VSTORE_VEC(out_val[0][2], out_val[1][2], out_off + oh_str * 16, out); + } + if (index_y + 5 < ow) { + VSTORE_VEC(out_val[2][2], out_val[3][2], out_off + oh_str * 20, out); + } +#endif +#endif +} +#endif diff --git a/compute/tensor/src/gpu/mali/cl/deconv_gemm_trans_fltbuf.cl b/compute/tensor/src/gpu/mali/cl/deconv_gemm_trans_fltbuf.cl new file mode 100644 index 00000000..ea29c034 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/deconv_gemm_trans_fltbuf.cl @@ -0,0 +1,92 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define MANGLE_NAME_IMPL(base, C, K) base##C##K +#define MANGLE_NAME(base, C, K) MANGLE_NAME_IMPL(base, C, K) +__kernel void MANGLE_NAME(deconv_gemm_trans_fltbuf_, C, K)(const int fw, + const int fwh, + const int fwhc, + const int fc, + const int fn, + __global const T *fltdata, + __global T *fltbuf) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); // (fn + 3) / 4; + const int idx_wh = idx % fwh; // fwh + const int idx_c = idx / fwh; // (fc + 3) / 4; + uchar ec = ((idx_c + 1) * 4 <= fc) ? 4 : (fc % 4); + uchar ek = ((idy + 1) * K <= fn) ? K : (fn % K); + + T16 val = 0; + int flt_off = idy * fwhc * 4 + idx_c * fwh * 4 + idx_wh; + val.s0 = fltdata[flt_off]; + if (ec > 1) { + val.s4 = fltdata[flt_off + fwh]; + } + if (ec > 2) { + val.s8 = fltdata[flt_off + fwh * 2]; + } + if (ec > 3) { + val.sc = fltdata[flt_off + fwh * 3]; + } + + if (ek > 1) { + flt_off += fwhc; + val.s1 = fltdata[flt_off]; + if (ec > 1) { + val.s5 = fltdata[flt_off + fwh]; + } + if (ec > 2) { + val.s9 = fltdata[flt_off + fwh * 2]; + } + if (ec > 3) { + val.sd = fltdata[flt_off + fwh * 3]; + } + } + + if (ek > 2) { + flt_off += fwhc; + val.s2 = fltdata[flt_off]; + if (ec > 1) { + val.s6 = fltdata[flt_off + fwh]; + } + if (ec > 2) { + val.sa = fltdata[flt_off + fwh * 2]; + } + if (ec > 3) { + val.se = fltdata[flt_off + fwh * 3]; + } + } + + if (ek > 3) { + flt_off += fwhc; + val.s3 = fltdata[flt_off]; + if (ec > 1) { + val.s7 = fltdata[flt_off + fwh]; + } + if (ec > 2) { + val.sb = fltdata[flt_off + fwh * 2]; + } + if (ec > 3) { + val.sf = fltdata[flt_off + fwh * 3]; + } + } + + /*C = 1 C = 2 C = 4*/ + const int idx_w = idx_wh % fw; + const int idx_h = idx_wh / fw; + const int idx_tran = idx_c * fwh + idx_w * fw + idx_h; + int out_off = (idx_tran / C) * ((fn + 3) >> 2) * C + idy * C + (idx_tran % C); + vstore16(val, out_off, fltbuf); +} diff --git a/compute/tensor/src/gpu/mali/cl/depth2space.cl b/compute/tensor/src/gpu/mali/cl/depth2space.cl new file mode 100644 index 00000000..1903add6 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/depth2space.cl @@ -0,0 +1,33 @@ +__kernel void depth2space(const int iw, + const int ih, + const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + __global const T *in, + __global uchar *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + if (idx >= ih || idy >= (iw << 2)) { + return; + } + const int ix = idx; + const int iy = idy % iw; + const int iz = idy / iw; + + const int in_off = (iz * iw_str + iy + iw_off) * ih_str + ix + ih_off; + T4 tmp = vload4(in_off, in); + uchar4 val; + val.x = tmp.x * 255.0; + val.y = tmp.y * 255.0; + val.z = tmp.z * 255.0; + val.w = tmp.w * 255.0; + + const int out_off = ((ix << 2) + iz + oh_off) * ow_str + (iy << 2) + ow_off; + vstore4(val, 0, out + out_off); +} diff --git a/compute/tensor/src/gpu/mali/cl/depth2space_nchw.cl b/compute/tensor/src/gpu/mali/cl/depth2space_nchw.cl new file mode 100644 index 00000000..3bd3dea2 --- 
/dev/null +++ b/compute/tensor/src/gpu/mali/cl/depth2space_nchw.cl @@ -0,0 +1,46 @@ +__kernel void depth2space_nchw(const int blockSize, + const int iw_str, + const int iwh_str, + const int iw_off, + const int ih_off, + const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int iw, + const int ih, + const int ic, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + if (idx >= iw || idy >= ih) { + return; + } + const int idz = get_global_id(2); + const int bs2 = blockSize * blockSize; + const int z_group = idz / bs2; + const int z_group_lane = idz % bs2; + const int z_group_lane_x = z_group_lane % blockSize; + const int z_group_lane_y = z_group_lane / blockSize; + + const int z_off = z_group * (bs2 << 2) + z_group_lane; + int in_off = z_off * iwh_str + (idy + ih_off) * iw_str + idx + iw_off; + T4 val = 0; + val.x = in[in_off]; + if (z_off + bs2 < ic) { + val.y = in[in_off + bs2 * iwh_str]; + } + if (z_off + bs2 * 2 < ic) { + val.z = in[in_off + bs2 * 2 * iwh_str]; + } + if (z_off + bs2 * 3 < ic) { + val.w = in[in_off + bs2 * 3 * iwh_str]; + } + + int out_off = idy * blockSize + z_group_lane_y + oh_off; + out_off += (idx * blockSize + z_group_lane_x + ow_off) * oh_str; + out_off += z_group * ohw_str; + vstore4(val, out_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/depth2space_ncwhc4_2x2.cl b/compute/tensor/src/gpu/mali/cl/depth2space_ncwhc4_2x2.cl new file mode 100644 index 00000000..cd0ff9d2 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/depth2space_ncwhc4_2x2.cl @@ -0,0 +1,62 @@ +__kernel void depth2space_ncwhc4_2x2(const int blockSize, + const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int ih, + const int iw, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + if (idx >= ih || idy >= iw) { + return; + } + const int idz = get_global_id(2); + const int in_off = idz * 4 * ihw_str + (idy + iw_off) * ih_str + idx + ih_off; + T4 val[4] = {0}; + T4 val_0, val_1, val_2, val_3; + + val[0] = vload4(in_off, in); + if (idz * 4 + 1 < ic_str) { + val[1] = vload4(in_off + ihw_str, in); + } + if (idz * 4 + 2 < ic_str) { + val[2] = vload4(in_off + ihw_str * 2, in); + } + if (idz * 4 + 3 < ic_str) { + val[3] = vload4(in_off + ihw_str * 3, in); + } + + val_0.x = val[0].x; + val_1.x = val[0].y; + val_2.x = val[0].z; + val_3.x = val[0].w; + + val_0.y = val[1].x; + val_1.y = val[1].y; + val_2.y = val[1].z; + val_3.y = val[1].w; + + val_0.z = val[2].x; + val_1.z = val[2].y; + val_2.z = val[2].z; + val_3.z = val[2].w; + + val_0.w = val[3].x; + val_1.w = val[3].y; + val_2.w = val[3].z; + val_3.w = val[3].w; + + const int out_off = idz * ohw_str + ((idy << 1) + ow_off) * oh_str + (idx << 1) + oh_off; + vstore4(val_0, out_off, out); + vstore4(val_2, out_off + 1, out); + vstore4(val_1, out_off + oh_str, out); + vstore4(val_3, out_off + oh_str + 1, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/eltwise.cl b/compute/tensor/src/gpu/mali/cl/eltwise.cl new file mode 100644 index 00000000..a4c418f7 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/eltwise.cl @@ -0,0 +1,175 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, TP, N) base##TP##N +#define MANGLE_NAME(base, TP, N) MANGLE_NAME_IMPL(base, TP, N) + +#if defined(USE_SUM) +#define calCore(v, res) \ + { \ + res.s0 += v.s0; \ + res.s1 += v.s1; \ + res.s2 += v.s2; \ + res.s3 += v.s3; \ + } +#endif + +#if defined(USE_MAX) +#define calCore(v, res) \ + { \ + res = fmax(res, v); \ + } +#endif + +#if defined(USE_PROD) +#define calCore(v, res) \ + { \ + res.s0 *= v.s0; \ + res.s1 *= v.s1; \ + res.s2 *= v.s2; \ + res.s3 *= v.s3; \ + } +#endif + +#if defined(USE_NCHW) +#define LOAD_VAL(ew, idx, idy, idz, ih_str, iw_str, ih_off, iw_off, buf, val) \ + { \ + int off = (idz * ih_str + idy + ih_off) * iw_str + (idx << 2) + iw_off; \ + val = 0; \ + if (ew == 4) { \ + val = vload4(0, buf + off); \ + } else { \ + if (ew == 1) \ + val.x = buf[off]; \ + if (ew == 2) { \ + T2 tmp = vload2(0, buf + off); \ + val.x = tmp.x; \ + val.y = tmp.y; \ + } \ + if (ew == 3) { \ + T3 tmp = vload3(0, buf + off); \ + val.x = tmp.x; \ + val.y = tmp.y; \ + val.z = tmp.z; \ + } \ + } \ + } +#define STORE_VAL(ew, idx, idy, idz, oh_str, ow_str, oh_off, ow_off, buf, val) \ + { \ + int off = (idz * oh_str + idy + oh_off) * ow_str + (idx << 2) + ow_off; \ + if (ew == 4) { \ + vstore4(val, 0, buf + off); \ + } else { \ + if (ew == 1) \ + buf[off] = val.x; \ + if (ew == 2) { \ + vstore2((T2)(val.x, val.y), 0, buf + off); \ + } \ + if (ew == 3) { \ + vstore3((T3)(val.x, val.y, val.z), 0, buf + off); \ + } \ + } \ + } +#else +#define LOAD_VAL(ew, idx, idy, idz, ih_str, iw_str, ih_off, iw_off, buf, val) \ + { \ + int off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; \ + val = vload4(off, buf); \ + } +#define STORE_VAL(ew, idx, idy, idz, oh_str, ow_str, oh_off, ow_off, buf, val) \ + { \ + int off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; \ + vstore4(val, off, buf); \ + } +#endif + +#if (USE_NCHW) +#if (USE_RELU) +__kernel void MANGLE_NAME(eltwise_nchw_relu_, TP, N) +#else +__kernel void MANGLE_NAME(eltwise_nchw_, TP, N) +#endif +#else +#if (USE_RELU) +__kernel void MANGLE_NAME(eltwise_relu_, TP, N) +#else +__kernel void MANGLE_NAME(eltwise_, TP, N) +#endif +#endif + (const int h, + const int w, + const int c, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int bx, + const int by, + const int ih0_str, + const int iw0_str, + const int ih0_off, + const int iw0_off, + __global const T *in0, +#if (N > 1) + const int ih1_str, + const int iw1_str, 
+ const int ih1_off, + const int iw1_off, + __global const T *in1, +#endif +#if (N > 2) + const int ih2_str, + const int iw2_str, + const int ih2_off, + const int iw2_off, + __global const T *in2, +#endif +#if (N > 3) + const int ih3_str, + const int iw3_str, + const int ih3_off, + const int iw3_off, + __global const T *in3, +#endif + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + char ew = 0; +#if defined(USE_NCHW) + ew = ((idx << 2) + 4 < w) ? 4 : (w & 3); +#endif + + T4 val; + T4 res; + LOAD_VAL(ew, idx, idy, idz, ih0_str, iw0_str, ih0_off, iw0_off, in0, res); +#if (N > 1) + LOAD_VAL(ew, idx, idy, idz, ih1_str, iw1_str, ih1_off, iw1_off, in1, val); + calCore(val, res); +#endif +#if (N > 2) + LOAD_VAL(ew, idx, idy, idz, ih2_str, iw2_str, ih2_off, iw2_off, in2, val); + calCore(val, res); +#endif +#if (N > 3) + LOAD_VAL(ew, idx, idy, idz, ih3_str, iw3_str, ih3_off, iw3_off, in3, val); + calCore(val, res); +#endif + ACTIVATION_V4(res); + STORE_VAL(ew, idx, idy, idz, oh_str, ow_str, oh_off, ow_off, out, res); +} diff --git a/compute/tensor/src/gpu/mali/cl/eltwise_broadcast.cl b/compute/tensor/src/gpu/mali/cl/eltwise_broadcast.cl new file mode 100644 index 00000000..993d2bb0 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/eltwise_broadcast.cl @@ -0,0 +1,89 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
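+
+// Broadcast variant: N encodes which dims of in1 collapse to 1 and are
+// broadcast over in0 (0: scalar, c = h = w = 1, still buggy per the note
+// below; 1: per-channel, h = w = 1; 2: h = 1; 3: w = 1). The matching offset
+// math is selected at compile time in the kernel body.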
+ +#define MANGLE_NAME_IMPL(base, TP, N) base##TP##N +#define MANGLE_NAME(base, TP, N) MANGLE_NAME_IMPL(base, TP, N) + +#if defined(USE_SUM) +#define calCore(in, off, v, res) \ + { \ + v = vload4(off, in); \ + res.s0 += v.s0; \ + res.s1 += v.s1; \ + res.s2 += v.s2; \ + res.s3 += v.s3; \ + } +#endif + +#if defined(USE_MAX) +#define calCore(in, off, v, res) \ + { \ + v = vload4(off, in); \ + res = fmax(res, v); \ + } +#endif + +#if defined(USE_PROD) +#define calCore(in, off, v, res) \ + { \ + v = vload4(off, in); \ + res.s0 *= v.s0; \ + res.s1 *= v.s1; \ + res.s2 *= v.s2; \ + res.s3 *= v.s3; \ + } +#endif + +__kernel void MANGLE_NAME(eltwise_broadcast_, TP, N)(const int h, + const int w, + const int c, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + __global const T *in0, + __global const T *in1, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= h || idy >= w) { + return; + } + + T4 val; + T4 res; + const int in_off_res = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; + // TODO: the c = h = w = 1 case (N == 0) still has bugs to fix +#if (N == 0) + const int in_off_val = 0; + // h = w = 1 +#elif (N == 1) + const int in_off_val = idz; + // h = 1 +#elif (N == 2) + const int in_off_val = idz * iw_str + idy + iw_str; + // w = 1 +#elif (N == 3) + const int in_off_val = idz * ih_str + idx + ih_str; +#endif + res = vload4(in_off_res, in0); + calCore(in1, in_off_val, val, res); + const int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; + vstore4(res, out_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/eltwise_spe_nchw_c.cl b/compute/tensor/src/gpu/mali/cl/eltwise_spe_nchw_c.cl new file mode 100644 index 00000000..0992daa7 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/eltwise_spe_nchw_c.cl @@ -0,0 +1,74 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
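+
+// Special case of eltwise: the second operand `ine` appears to hold one value
+// per channel in plain layout, fetched as a T4 per 4-channel block
+// (vload4(idz, ine)), so a per-channel vector can be combined with an ncwhc4
+// tensor without a separate layout conversion.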
+ +#define MANGLE_NAME_IMPL(base, TP) base##TP +#define MANGLE_NAME(base, TP) MANGLE_NAME_IMPL(base, TP) + +#if defined(USE_SUM) +#define calCore(v, res) \ + { \ + res.s0 += v.s0; \ + res.s1 += v.s1; \ + res.s2 += v.s2; \ + res.s3 += v.s3; \ + } +#endif + +#if defined(USE_MAX) +#define calCore(v, res) \ + { \ + res = fmax(res, v); \ + } +#endif + +#if defined(USE_PROD) +#define calCore(v, res) \ + { \ + res.s0 *= v.s0; \ + res.s1 *= v.s1; \ + res.s2 *= v.s2; \ + res.s3 *= v.s3; \ + } +#endif + +__kernel void MANGLE_NAME(eltwise_spe_nchw_c_, TP)(const int h, + const int w, + const int c, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + __global const T *in, + __global const T *ine, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= h || idy >= w) { + return; + } + + T4 val; + T4 res; + const int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; + res = vload4(in_off, in); + val = vload4(idz, ine); + calCore(val, res); + const int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; + vstore4(res, out_off, out); +} diff --git a/tensor_computing/src/gpu/mali/cl/embedding.cl b/compute/tensor/src/gpu/mali/cl/embedding.cl similarity index 77% rename from tensor_computing/src/gpu/mali/cl/embedding.cl rename to compute/tensor/src/gpu/mali/cl/embedding.cl index 8f302066..9ae2db32 100644 --- a/tensor_computing/src/gpu/mali/cl/embedding.cl +++ b/compute/tensor/src/gpu/mali/cl/embedding.cl @@ -11,35 +11,43 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - -__kernel void embedding(const int step, const int on, const int on_d4, const int oh_str, const int oh_off, const int ow_off, __global const unsigned int* input, __global const T* weight, __global T* output) { - +__kernel void embedding(const int step, + const int on, + const int on_d4, + const int oh_str, + const int oh_off, + const int ow_off, + __global const unsigned int *input, + __global const T *weight, + __global T *output) +{ int idx = get_global_id(0); int idy = get_global_id(1); - if(idx >= on_d4 || idy >= step) return; + if (idx >= on_d4 || idy >= step) { + return; + } T4 val = 0; unsigned int index = input[idy]; const int wei_off = index * on + (idx << 2); uchar rn = ((idx << 2) + 4 <= on) ? 0 : (on & 3); - if(rn == 0) { + if (rn == 0) { val = vload4(0, weight + wei_off); } else { - if(rn == 1) val.x = weight[wei_off]; - if(rn == 2) { + if (rn == 1) { + val.x = weight[wei_off]; + } + if (rn == 2) { T2 tmp = vload2(0, weight + wei_off); val.x = tmp.x; val.y = tmp.y; } - if(rn == 3) { + if (rn == 3) { T3 tmp = vload3(0, weight + wei_off); val.x = tmp.x; val.y = tmp.y; val.z = tmp.z; } } - const int out_off = (idx + ow_off) * oh_str + idy + oh_off; + const int out_off = (idx + ow_off) * oh_str + idy + oh_off; vstore4(val, out_off, output); } diff --git a/compute/tensor/src/gpu/mali/cl/fc_p1.cl b/compute/tensor/src/gpu/mali/cl/fc_p1.cl new file mode 100644 index 00000000..8b6cf12f --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/fc_p1.cl @@ -0,0 +1,66 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define calCore(iv, fv, res) \ + { \ + res.x += iv.x * fv.s0 + iv.y * fv.s1 + iv.z * fv.s2 + iv.w * fv.s3; \ + res.y += iv.x * fv.s4 + iv.y * fv.s5 + iv.z * fv.s6 + iv.w * fv.s7; \ + res.z += iv.x * fv.s8 + iv.y * fv.s9 + iv.z * fv.sa + iv.w * fv.sb; \ + res.w += iv.x * fv.sc + iv.y * fv.sd + iv.z * fv.se + iv.w * fv.sf; \ + } +__kernel void fc_p1(const int item_y, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int ihy_str, + const int ihw_str, + const int fh, + const int fw, + const int fc, + const int fn, + const int fhy_str, + const int fhw_str, + const int fwc_str, + __global const T *flt, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= fh || idy >= item_y) { + return; + } + + T4 in_val; + T16 flt_val; + T4 sum = 0; + int in_off = (idy + iw_off) * ih_str + idx + ih_off; + int flt_off = (idz * fwc_str + idy) * fh + idx; + + for (int i = 0; i < fc; i++) { + int k = 0; + for (int j = idy; j < fw; j += item_y) { + in_val = vload4(in_off + k * ihy_str, in); + flt_val = vload16(flt_off + k * fhy_str, flt); + calCore(in_val, flt_val, sum); + k++; + } + in_off += ihw_str; + flt_off += fhw_str; + } + + const int out_off = (idy * fh + idx) * fn + idz; + vstore4(sum, out_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/fc_p2.cl b/compute/tensor/src/gpu/mali/cl/fc_p2.cl new file mode 100644 index 00000000..e79209e7 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/fc_p2.cl @@ -0,0 +1,57 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#if defined(USE_HALF) +#define READ_IMAGE(image, sampler, coord) read_imageh(image, sampler, coord) +#define WRITE_IMAGE(image, coord, data) write_imageh(image, coord, data) +#else +#define READ_IMAGE(image, sampler, coord) read_imagef(image, sampler, coord) +#define WRITE_IMAGE(image, coord, data) write_imagef(image, coord, data) +#endif +__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + +#define calCore(iv, fv, res) \ + { \ + res.x += iv.x * fv.s0 + iv.y * fv.s1 + iv.z * fv.s2 + iv.w * fv.s3; \ + res.y += iv.x * fv.s4 + iv.y * fv.s5 + iv.z * fv.s6 + iv.w * fv.s7; \ + res.z += iv.x * fv.s8 + iv.y * fv.s9 + iv.z * fv.sa + iv.w * fv.sb; \ + res.w += iv.x * fv.sc + iv.y * fv.sd + iv.z * fv.se + iv.w * fv.sf; \ + } +__kernel void fc_p2(const int loop, + const int len, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + __global const T *in, + __global const T *bias, + __global T *out) +{ + const int idx = get_global_id(0); + if (idx >= len) { + return; + } + + T4 sum = vload4(idx, bias); + T4 val; + for (int i = 0; i < loop; i++) { + val = vload4(idx + i * len, in); + sum.x += val.x; + sum.y += val.y; + sum.z += val.z; + sum.w += val.w; + } + + const int out_off = (idx * ow_str + ow_off) * oh_str + oh_off; + vstore4(sum, out_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/fc_trans_fltbuf.cl b/compute/tensor/src/gpu/mali/cl/fc_trans_fltbuf.cl new file mode 100644 index 00000000..2d996aef --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/fc_trans_fltbuf.cl @@ -0,0 +1,100 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
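+
+// Weight repack for the fc kernels: the (fn, fc, fh, fw) filter is regrouped
+// into blocks of C input channels by K output channels so that fc_p1 can fetch
+// a whole T16 of weights per step; blocks that run past fc or fn are
+// zero-padded (see loadFltvalEdge and the idn < fn guard below).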
+ +#define MANGLE_NAME_IMPL(base, C, K) base##C##K +#define MANGLE_NAME(base, C, K) MANGLE_NAME_IMPL(base, C, K) +#if (C == 4) +#define loadFltval(off, str, flt, val) \ + { \ + val.x = flt[off]; \ + val.y = flt[off + str]; \ + val.z = flt[off + (str << 1)]; \ + val.w = flt[off + str * 3]; \ + } + +#define loadFltvalEdge(off, str, flt, val, edge) \ + { \ + val.x = flt[off]; \ + if (edge > 1) \ + val.y = flt[off + str]; \ + if (edge > 2) \ + val.z = flt[off + (str << 1)]; \ + } +#endif + +#if (C == 8) +#define loadFltval(off, str, flt, val) \ + { \ + val.s0 = flt[off]; \ + val.s1 = flt[off + str]; \ + val.s2 = flt[off + (str << 1)]; \ + val.s3 = flt[off + str * 3]; \ + val.s4 = flt[off + (str << 2)]; \ + val.s5 = flt[off + str * 5]; \ + val.s6 = flt[off + str * 6]; \ + val.s7 = flt[off + str * 7]; \ + } +#define loadFltvalEdge(off, str, flt, val, edge) \ + { \ + val.s0 = flt[off]; \ + if (edge > 1) \ + val.s1 = flt[off + str]; \ + if (edge > 2) \ + val.s2 = flt[off + (str << 1)]; \ + if (edge > 3) \ + val.s3 = flt[off + str * 3]; \ + if (edge > 4) \ + val.s4 = flt[off + (str << 2)]; \ + if (edge > 5) \ + val.s5 = flt[off + str * 5]; \ + if (edge > 6) \ + val.s6 = flt[off + str * 6]; \ + } +#endif + +__kernel void MANGLE_NAME(fc_trans_fltbuf_, C, K)(const int fw, + const int fh, + const int fwh, + const int fc, + const int fn, + __global const T *fltdata, + __global T *fltbuf) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + const int bc = (fc + C - 1) / C; + const int idc = idz % bc; + const int idn = idz / bc; + short ec = ((idc + 1) * C <= fc) ? C : (fc % C); + + const int flt_off = ((idn * fc + idc * C) * fh + idy) * fw + idx; +#if (C == 4) + T4 val = 0; +#elif (C == 8) + T8 val = 0; +#endif + if (idn < fn) { + if (ec == C) { + loadFltval(flt_off, fwh, fltdata, val); + } else { + loadFltvalEdge(flt_off, fwh, fltdata, val, ec); + } + } + const int out_off = ((idn / K * bc + idc) * fh + idx) * fw * K + idy * K + (idn % K); +#if (C == 4) + vstore4(val, out_off, fltbuf); +#elif (C == 8) + vstore8(val, out_off, fltbuf); +#endif +} diff --git a/compute/tensor/src/gpu/mali/cl/fill_memory_zero.cl b/compute/tensor/src/gpu/mali/cl/fill_memory_zero.cl new file mode 100644 index 00000000..1d032e00 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/fill_memory_zero.cl @@ -0,0 +1,23 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
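The fill_memory_zero kernel below (and the fill_memory_zero_vec4 variant in the next file) is name-mangled on the data-type token DT, so a single source file yields one entry point per precision. As a minimal host-side dispatch sketch — illustrative only, since no host code is part of this diff; the helper name is hypothetical and the entry-point name assumes the program was built with -DDT=f16 (plus -DT=half for the element type):

#include <CL/cl.h>

/* Hypothetical helper: zero `len` elements of `buffer`.
 * One work-item per element; the kernel itself guards idx >= len. */
static cl_int enqueue_fill_zero(
    cl_command_queue queue, cl_program program, cl_mem buffer, cl_int len)
{
    cl_int err;
    cl_kernel kernel = clCreateKernel(program, "fill_memory_zero_f16", &err);
    if (err != CL_SUCCESS) {
        return err;
    }
    clSetKernelArg(kernel, 0, sizeof(cl_int), &len);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &buffer);
    size_t gs = (size_t)len;
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &gs, NULL, 0, NULL, NULL);
    clReleaseKernel(kernel);
    return err;
}

The vec4 variant in the next file trades a four-times-smaller NDRange for wider vstore4 writes, mopping up the last 1–3 elements with scalar, vstore2, or vstore3 stores.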
+ +#define MANGLE_NAME_IMPL(base, DT) base##DT +#define MANGLE_NAME(base, DT) MANGLE_NAME_IMPL(base, DT) +__kernel void MANGLE_NAME(fill_memory_zero_, DT)(const int len, __global T *data) +{ + int idx = get_global_id(0); + if (idx >= len) { + return; + } + data[idx] = 0; +} diff --git a/compute/tensor/src/gpu/mali/cl/fill_memory_zero_vec4.cl b/compute/tensor/src/gpu/mali/cl/fill_memory_zero_vec4.cl new file mode 100644 index 00000000..e0f25587 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/fill_memory_zero_vec4.cl @@ -0,0 +1,38 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define MANGLE_NAME_IMPL(base, DT) base##DT +#define MANGLE_NAME(base, DT) MANGLE_NAME_IMPL(base, DT) +__kernel void MANGLE_NAME(fill_memory_zero_vec4_, DT)( + const int len, const int offset, const int bx, __global T *data) +{ + int idx = get_global_id(0); + if (idx >= bx) { + return; + } + char el = ((idx << 2) + 4 <= len) ? 4 : (len & 3); + const int off = offset + (idx << 2); + if (el == 4) { + vstore4((T4)0, 0, data + off); + } else { + if (el == 1) { + data[off] = 0; + } + if (el == 2) { + vstore2((T2)0, 0, data + off); + } + if (el == 3) { + vstore3((T3)0, 0, data + off); + } + } +} diff --git a/compute/tensor/src/gpu/mali/cl/gemm_nt.cl b/compute/tensor/src/gpu/mali/cl/gemm_nt.cl new file mode 100644 index 00000000..66b9964d --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/gemm_nt.cl @@ -0,0 +1,110 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "kernel_def.h" +#define MANGLE_NAME_LMPL(base, LM, LN, LK) base##LM##LN##LK +#define MANGLE_NAME(base, LM, LN, LK) MANGLE_NAME_LMPL(base, LM, LN, LK) + +#if defined(NO_BIAS) +__kernel void MANGLE_NAME(gemm_nt_nobias_, LM, LN, LK)(const int KA, + const int KB, + const int K, + const int ow_str, + const int A_str, + const int B_str, + const int C_str, + const int A_off, + const int B_off, + const int ow, + const int oh, + const int bx, + const int by, + __global const T *A, + __global const T *B, + __global T *C) +#else +__kernel void MANGLE_NAME(gemm_nt_, LM, LN, LK)(const int KA, + const int KB, + const int K, + const int ow_str, + const int A_str, + const int B_str, + const int C_str, + const int A_off, + const int B_off, + const int ow, + const int oh, + const int bx, + const int by, + __global const T *A, + __global const T *B, + __global const T *bias, + __global T *C) +#endif +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + const int ix = idx * LN; + const int iy = idy * LM; + const int L = K >> LK; + const int VN = 1 << LK; + + T c[LM][LN]; +#if (LK == 0) + T a[LM]; + T b[LN]; +#elif (LK == 1) + T2 a[LM]; + T2 b[LN]; +#elif (LK == 2) + T4 a[LM]; + T4 b[LN]; +#elif (LK == 3) + T8 a[LM]; + T8 b[LN]; +#elif (LK == 4) + T16 a[LM]; + T16 b[LN]; +#endif + +#if defined(NO_BIAS) + GEMM_SET_C_ZERO(c); +#else + GEMM_LOAD_A(a, iy, bias); + GEMM_SET_C_BIAS(a, c); +#endif + + int a_off = iy * KA + idz * A_str + A_off; + int b_off = ix * KB + idz * B_str + B_off; + for (int i = 0; i < L; ++i) { + GEMM_NT_LOAD_A(a, a_off, KA, A); + GEMM_NT_LOAD_B(b, b_off, KB, B); + GEMM_CALCORE(a, b, c); + a_off += VN; + b_off += VN; + } + int c_off = iy * ow_str + ix + idz * C_str; + int ex = ix + LN - ow; + int ey = iy + LM - oh; + if (ex > 0) { + GEMM_SET_C_EDGE_ZERO_W(c, ex); + } + if (ey > 0) { + GEMM_SET_C_EDGE_ZERO_H(c, ey); + } + GEMM_STORE_C(c, c_off, ow_str, C); +} diff --git a/compute/tensor/src/gpu/mali/cl/gemm_tn.cl b/compute/tensor/src/gpu/mali/cl/gemm_tn.cl new file mode 100644 index 00000000..60f5b2fa --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/gemm_tn.cl @@ -0,0 +1,269 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "kernel_def.h" +#define MANGLE_NAME_LMPL(base, LM, LN) base##LM##LN +#define MANGLE_NAME(base, LM, LN) MANGLE_NAME_LMPL(base, LM, LN) + +#if defined(USE_NCWHC4) +#if defined(USE_RELU) +__kernel void MANGLE_NAME(gemm_tn_relu_ncwhc4_, LM, LN) +#elif defined(USE_GELU) +__kernel void MANGLE_NAME(gemm_tn_gelu_ncwhc4_, LM, LN) +#elif defined(USE_ELTWISE_NCHW) +__kernel void MANGLE_NAME(gemm_tn_eltwise1_ncwhc4_, LM, LN) +#elif defined(USE_ELTWISE_NCWHC4) +__kernel void MANGLE_NAME(gemm_tn_eltwise4_ncwhc4_, LM, LN) +#else +__kernel void MANGLE_NAME(gemm_tn_ncwhc4_, LM, LN) +#endif + (const int M, + const int N, + const int K, + const int oh, + const int ow, + const int oc, + const int oh_str, + const int ow_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int bx, + const int by, + __global const T *A, + __global const T *B, + __global const T *bias, + __global T *C +#if defined(USE_ELTWISE_NCHW) + , + const int ew_str, + const int ew_off, + const int eh_off, + __global const T *eltVal +#endif +#if defined(USE_ELTWISE_NCWHC4) + , + const int eh_str, + const int ew_str, + const int ehw_str, + const int eh_off, + const int ew_off, + __global const T *eltVal +#endif + ) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + if (idx >= bx || idy >= by) { + return; + } + const int ix = idx * LN; + const int iy = idy * LM; + + T a[LM]; + T b[LN]; + T c[LM][LN]; + int a_off = iy; + int b_off = ix; + GEMM_LOAD_A(a, iy, bias); + GEMM_SET_C_BIAS(a, c); +#if defined(USE_ELTWISE_NCHW) + int c_off = (iy + eh_off) * ew_str + ix + ew_off; + ADD_ELTWISE_NCHW(c, c_off, ew_str, eltVal); +#endif + + for (int i = 0; i < K; ++i) { + GEMM_LOAD_A(a, a_off, A); + GEMM_LOAD_B(b, b_off, B); + GEMM_CALCORE(a, b, c); + a_off += M; + b_off += N; + } + + /*LM = 4 or LM = 8*/ + int c_base = (iy >> 2) * ohw_str; +#if defined(USE_ELTWISE_NCWHC4) + int e_base = (iy >> 2) * ehw_str; +#endif + for (uchar i = 0; i < LN; ++i) { + int oxh = (ix + i) % oh; + int oxw = (ix + i) / oh; + if (oxw >= ow) { + break; + } + int c_off = c_base + (oxw + ow_off) * oh_str + oxh + oh_off; + T4 tmp; +#if defined(USE_ELTWISE_NCWHC4) + int e_off = e_base + (oxw + ew_off) * eh_str + oxh + eh_off; + tmp = vload4(e_off, eltVal); + tmp.x += c[0][0]; + tmp.y += c[1][0]; + tmp.z += c[2][0]; + tmp.w += c[3][0]; +#else + tmp.x = c[0][0]; + tmp.y = c[1][0]; + tmp.z = c[2][0]; + tmp.w = c[3][0]; + ACTIVATION_V4(tmp); +#endif + vstore4(tmp, c_off, C); + UPDATE_REG(c[0]); + UPDATE_REG(c[1]); + UPDATE_REG(c[2]); + UPDATE_REG(c[3]); +#if (LM == 8) + if (iy + 4 >= oc) { + continue; + } + c_off += ohw_str; +#if defined(USE_ELTWISE_NCWHC4) + e_off += ohw_str; + tmp = vload4(e_off, eltVal); + tmp.x += c[4][0]; + tmp.y += c[5][0]; + tmp.z += c[6][0]; + tmp.w += c[7][0]; +#else + tmp.x = c[4][0]; + tmp.y = c[5][0]; + tmp.z = c[6][0]; + tmp.w = c[7][0]; + ACTIVATION_V4(tmp); +#endif + vstore4(tmp, c_off, C); + UPDATE_REG(c[4]); + UPDATE_REG(c[5]); + UPDATE_REG(c[6]); + UPDATE_REG(c[7]); +#endif + } +} + +#elif defined(NO_BIAS) +__kernel void MANGLE_NAME(gemm_tn_nobias_, LM, LN)(const int M, + const int N, + const int K, + const int ow_str, + const int A_str, + const int B_str, + const int C_str, + const int A_off, + const int B_off, + const int ow, + 
const int oh, + const int bx, + const int by, + float alp, + float bet, + __global const T *A, + __global const T *B, + __global T *C) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + const int ix = idx * LN; + const int iy = idy * LM; + + T a[LM]; + T b[LN]; + T c[LM][LN]; + int a_off = iy + A_off; + int b_off = ix + B_off; + a_off += idz * A_str; + b_off += idz * B_str; + GEMM_SET_C_ZERO(c); + + for (int i = 0; i < K; ++i) { + GEMM_LOAD_A(a, a_off, A); + GEMM_LOAD_B(b, b_off, B); + GEMM_CALCORE(a, b, c); + a_off += M; + b_off += N; + } + + int c_off = iy * ow_str + ix; + c_off += idz * C_str; + int ex = ix + LN - ow; + int ey = iy + LM - oh; + GEMM_MUL_C(alp, bet, c); + if (ex > 0) { + GEMM_SET_C_EDGE_ZERO_W(c, ex); + } + if (ey > 0) { + GEMM_SET_C_EDGE_ZERO_H(c, ey); + } + GEMM_STORE_C(c, c_off, ow_str, C); +} + +#else +#if defined(USE_RELU) +__kernel void MANGLE_NAME(gemm_tn_relu_, LM, LN) +#elif defined(USE_GELU) +__kernel void MANGLE_NAME(gemm_tn_gelu_, LM, LN) +#else +__kernel void MANGLE_NAME(gemm_tn_, LM, LN) +#endif + (const int M, + const int N, + const int K, + const int ow_str, + const int ow, + const int oh, + const int bx, + const int by, + __global const T *A, + __global const T *B, + __global const T *bias, + __global T *C) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + if (idx >= bx || idy >= by) { + return; + } + const int ix = idx * LN; + const int iy = idy * LM; + + T a[LM]; + T b[LN]; + T c[LM][LN]; + int a_off = iy; + int b_off = ix; + GEMM_LOAD_A(a, iy, bias); + GEMM_SET_C_BIAS(a, c); + + for (int i = 0; i < K; ++i) { + GEMM_LOAD_A(a, a_off, A); + GEMM_LOAD_B(b, b_off, B); + GEMM_CALCORE(a, b, c); + a_off += M; + b_off += N; + } + + int c_off = iy * ow_str + ix; + int ex = ix + LN - ow; + int ey = iy + LM - oh; + if (ex > 0) { + GEMM_SET_C_EDGE_ZERO_W(c, ex); + } + if (ey > 0) { + GEMM_SET_C_EDGE_ZERO_H(c, ey); + } + GEMM_STORE_C(c, c_off, ow_str, C); +} +#endif diff --git a/compute/tensor/src/gpu/mali/cl/kernel_def.h b/compute/tensor/src/gpu/mali/cl/kernel_def.h new file mode 100644 index 00000000..8d9ce80e --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/kernel_def.h @@ -0,0 +1,3424 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
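kernel_def.h, which follows, collects the width-generic building blocks the GEMM and FC kernels above are written in: READ_BUF resolves to a scalar load or a vload2/4/8/16 depending on which USE_Vn macro is set, the *_ARRAY macros unroll over a register tile, and exactly one ACTIVATION_* family is selected by USE_RELU / USE_RELU6 / USE_GELU / USE_HSIGMOID / USE_HSWISH / USE_TANH / USE_SIGMOID (defaulting to no-ops). Every kernel instance is therefore fully specialized at clBuildProgram time. A rough sketch of how one gemm_tn variant could be compiled — the option string is an assumption for illustration, not the one Bolt's kernel pipeline actually generates:

#include <CL/cl.h>

/* Hypothetical: build a 4x4-tile fp16 gemm_tn with fused ReLU.
 * LM/LN pick the register tile and also mangle the kernel name:
 * MANGLE_NAME(gemm_tn_relu_, 4, 4) -> gemm_tn_relu_44. */
static cl_kernel build_gemm_tn_relu_44(
    cl_program program, cl_device_id device, cl_int *err)
{
    const char *options =
        "-DT=half -DT2=half2 -DT3=half3 -DT4=half4 -DT8=half8 -DT16=half16 "
        "-DUSE_HALF -DUSE_RELU -DLM=4 -DLN=4";
    *err = clBuildProgram(program, 1, &device, options, NULL, NULL);
    if (*err != CL_SUCCESS) {
        return NULL;
    }
    return clCreateKernel(program, "gemm_tn_relu_44", err);
}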
+ +#ifndef _KERNEL_DEF +#define _KERNEL_DEF + +/* + * READ IMAGE + */ +#if defined(USE_HALF) +#define READ_IMAGE(image, sampler, coord) read_imageh(image, sampler, coord) +#define WRITE_IMAGE(image, coord, data) write_imageh(image, coord, data) +#else +#define READ_IMAGE(image, sampler, coord) read_imagef(image, sampler, coord) +#define WRITE_IMAGE(image, coord, data) write_imagef(image, coord, data) +#endif + +__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + +#if defined(USE_V1) +#define READ_BUF(v, off, buf) \ + { \ + v = buf[off]; \ + } +#elif defined(USE_V2) +#define READ_BUF(v, off, buf) \ + { \ + v = vload2(0, buf + off); \ + } +#elif defined(USE_V3) +#define READ_BUF(v, off, buf) \ + { \ + v = vload3(0, buf + off); \ + } +#elif defined(USE_V4) +#define READ_BUF(v, off, buf) \ + { \ + v = vload4(0, buf + off); \ + } +#elif defined(USE_V8) +#define READ_BUF(v, off, buf) \ + { \ + v = vload8(0, buf + off); \ + } +#elif defined(USE_V16) +#define READ_BUF(v, off, buf) \ + { \ + v = vload16(0, buf + off); \ + } +#endif + +/* + * load data from buffer to reg array + */ +#define LOAD_BUF_ARRAY1(v, off, buf) \ + { \ + v[0] = buf[off]; \ + } + +#define LOAD_BUF_ARRAY2(v, off, buf) \ + { \ + T2 tmp = vload2(0, buf + off); \ + v[0] = tmp.x; \ + v[1] = tmp.y; \ + } + +#define LOAD_BUF_ARRAY3(v, off, buf) \ + { \ + T3 tmp = vload3(0, buf + off); \ + v[0] = tmp.x; \ + v[1] = tmp.y; \ + v[2] = tmp.z; \ + } + +#define LOAD_BUF_ARRAY4(v, off, buf) \ + { \ + T4 tmp = vload4(0, buf + off); \ + v[0] = tmp.x; \ + v[1] = tmp.y; \ + v[2] = tmp.z; \ + v[3] = tmp.w; \ + } + +#define LOAD_BUF_ARRAY5(v, off, buf) \ + { \ + T8 tmp = vload8(0, buf + off); \ + v[0] = tmp.s0; \ + v[1] = tmp.s1; \ + v[2] = tmp.s2; \ + v[3] = tmp.s3; \ + v[4] = tmp.s4; \ + } + +#define LOAD_BUF_ARRAY6(v, off, buf) \ + { \ + T8 tmp = vload8(0, buf + off); \ + v[0] = tmp.s0; \ + v[1] = tmp.s1; \ + v[2] = tmp.s2; \ + v[3] = tmp.s3; \ + v[4] = tmp.s4; \ + v[5] = tmp.s5; \ + } + +#define LOAD_BUF_ARRAY7(v, off, buf) \ + { \ + T8 tmp = vload8(0, buf + off); \ + v[0] = tmp.s0; \ + v[1] = tmp.s1; \ + v[2] = tmp.s2; \ + v[3] = tmp.s3; \ + v[4] = tmp.s4; \ + v[5] = tmp.s5; \ + v[6] = tmp.s6; \ + } + +#define LOAD_BUF_ARRAY8(v, off, buf) \ + { \ + T8 tmp = vload8(0, buf + off); \ + v[0] = tmp.s0; \ + v[1] = tmp.s1; \ + v[2] = tmp.s2; \ + v[3] = tmp.s3; \ + v[4] = tmp.s4; \ + v[5] = tmp.s5; \ + v[6] = tmp.s6; \ + v[7] = tmp.s7; \ + } + +#define ADD_BUF_ARRAY1(v, off, buf) \ + { \ + v[0] += buf[off]; \ + } + +#define ADD_BUF_ARRAY2(v, off, buf) \ + { \ + T2 tmp = vload2(0, buf + off); \ + v[0] += tmp.x; \ + v[1] += tmp.y; \ + } + +#define ADD_BUF_ARRAY3(v, off, buf) \ + { \ + T3 tmp = vload3(0, buf + off); \ + v[0] += tmp.x; \ + v[1] += tmp.y; \ + v[2] += tmp.z; \ + } + +#define ADD_BUF_ARRAY4(v, off, buf) \ + { \ + T4 tmp = vload4(0, buf + off); \ + v[0] += tmp.x; \ + v[1] += tmp.y; \ + v[2] += tmp.z; \ + v[3] += tmp.w; \ + } + +#define ADD_BUF_ARRAY5(v, off, buf) \ + { \ + T8 tmp = vload8(0, buf + off); \ + v[0] += tmp.s0; \ + v[1] += tmp.s1; \ + v[2] += tmp.s2; \ + v[3] += tmp.s3; \ + v[4] += tmp.s4; \ + } + +#define ADD_BUF_ARRAY6(v, off, buf) \ + { \ + T8 tmp = vload8(0, buf + off); \ + v[0] += tmp.s0; \ + v[1] += tmp.s1; \ + v[2] += tmp.s2; \ + v[3] += tmp.s3; \ + v[4] += tmp.s4; \ + v[5] += tmp.s5; \ + } + +#define ADD_BUF_ARRAY7(v, off, buf) \ + { \ + T8 tmp = vload8(0, buf + off); \ + v[0] += tmp.s0; \ + v[1] += tmp.s1; \ + v[2] += tmp.s2; \ + v[3] += tmp.s3; \ + v[4] += tmp.s4; \ + 
v[5] += tmp.s5; \ + v[6] += tmp.s6; \ + } + +#define ADD_BUF_ARRAY8(v, off, buf) \ + { \ + T8 tmp = vload8(0, buf + off); \ + v[0] += tmp.s0; \ + v[1] += tmp.s1; \ + v[2] += tmp.s2; \ + v[3] += tmp.s3; \ + v[4] += tmp.s4; \ + v[5] += tmp.s5; \ + v[6] += tmp.s6; \ + v[7] += tmp.s7; \ + } +/* + * set reg array to normal val + */ +#define SET_REG_ARRAY1(v, reg) \ + { \ + reg[0] = v; \ + } + +#define SET_REG_ARRAY2(v, reg) \ + { \ + reg[0] = v; \ + reg[1] = v; \ + } + +#define SET_REG_ARRAY3(v, reg) \ + { \ + reg[0] = v; \ + reg[1] = v; \ + reg[2] = v; \ + } + +#define SET_REG_ARRAY4(v, reg) \ + { \ + reg[0] = v; \ + reg[1] = v; \ + reg[2] = v; \ + reg[3] = v; \ + } +#define SET_REG_ARRAY5(v, reg) \ + { \ + reg[0] = v; \ + reg[1] = v; \ + reg[2] = v; \ + reg[3] = v; \ + reg[4] = v; \ + } + +#define SET_REG_ARRAY6(v, reg) \ + { \ + reg[0] = v; \ + reg[1] = v; \ + reg[2] = v; \ + reg[3] = v; \ + reg[4] = v; \ + reg[5] = v; \ + } + +#define SET_REG_ARRAY7(v, reg) \ + { \ + reg[0] = v; \ + reg[1] = v; \ + reg[2] = v; \ + reg[3] = v; \ + reg[4] = v; \ + reg[5] = v; \ + reg[6] = v; \ + } + +#define SET_REG_ARRAY8(v, reg) \ + { \ + reg[0] = v; \ + reg[1] = v; \ + reg[2] = v; \ + reg[3] = v; \ + reg[4] = v; \ + reg[5] = v; \ + reg[6] = v; \ + reg[7] = v; \ + } + +#define MUL_REG_NORMAL_ARRAY1(a, b, reg) \ + { \ + reg[0] = a * reg[0] + b; \ + } + +#define MUL_REG_NORMAL_ARRAY2(a, b, reg) \ + { \ + reg[0] = a * reg[0] + b; \ + reg[1] = a * reg[1] + b; \ + } + +#define MUL_REG_NORMAL_ARRAY3(a, b, reg) \ + { \ + reg[0] = a * reg[0] + b; \ + reg[1] = a * reg[1] + b; \ + reg[2] = a * reg[2] + b; \ + } + +#define MUL_REG_NORMAL_ARRAY4(a, b, reg) \ + { \ + reg[0] = a * reg[0] + b; \ + reg[1] = a * reg[1] + b; \ + reg[2] = a * reg[2] + b; \ + reg[3] = a * reg[3] + b; \ + } + +#define MUL_REG_NORMAL_ARRAY5(a, b, reg) \ + { \ + reg[0] = a * reg[0] + b; \ + reg[1] = a * reg[1] + b; \ + reg[2] = a * reg[2] + b; \ + reg[3] = a * reg[3] + b; \ + reg[4] = a * reg[4] + b; \ + } + +#define MUL_REG_NORMAL_ARRAY6(a, b, reg) \ + { \ + reg[0] = a * reg[0] + b; \ + reg[1] = a * reg[1] + b; \ + reg[2] = a * reg[2] + b; \ + reg[3] = a * reg[3] + b; \ + reg[4] = a * reg[4] + b; \ + reg[5] = a * reg[5] + b; \ + } + +#define MUL_REG_NORMAL_ARRAY7(a, b, reg) \ + { \ + reg[0] = a * reg[0] + b; \ + reg[1] = a * reg[1] + b; \ + reg[2] = a * reg[2] + b; \ + reg[3] = a * reg[3] + b; \ + reg[4] = a * reg[4] + b; \ + reg[5] = a * reg[5] + b; \ + reg[6] = a * reg[6] + b; \ + } + +#define MUL_REG_NORMAL_ARRAY8(a, b, reg) \ + { \ + reg[0] = a * reg[0] + b; \ + reg[1] = a * reg[1] + b; \ + reg[2] = a * reg[2] + b; \ + reg[3] = a * reg[3] + b; \ + reg[4] = a * reg[4] + b; \ + reg[5] = a * reg[5] + b; \ + reg[6] = a * reg[6] + b; \ + reg[7] = a * reg[7] + b; \ + } + +#define ADD_REG_ARRAY4(reg0, reg1) \ + { \ + reg1[0] += reg0[0]; \ + reg1[1] += reg0[1]; \ + reg1[2] += reg0[2]; \ + reg1[3] += reg0[3]; \ + } + +#define MINUS_REG_ARRAY4(reg0, reg1) \ + { \ + reg1[0] -= reg0[0]; \ + reg1[1] -= reg0[1]; \ + reg1[2] -= reg0[2]; \ + reg1[3] -= reg0[3]; \ + } + +/* + * DOT + */ +#define DOT_A4B16C4(a, b, c) \ + { \ + c.x += (a.x * b.s0 + a.y * b.s1 + a.z * b.s2 + a.w * b.s3); \ + c.y += (a.x * b.s4 + a.y * b.s5 + a.z * b.s6 + a.w * b.s7); \ + c.z += (a.x * b.s8 + a.y * b.s9 + a.z * b.sa + a.w * b.sb); \ + c.w += (a.x * b.sc + a.y * b.sd + a.z * b.se + a.w * b.sf); \ + } + +#define DOT_A4B4C1(a, b, c) \ + { \ + c += (a.x * b.s0 + a.y * b.s1 + a.z * b.s2 + a.w * b.s3); \ + } + +#define DOT_A4B4C4(a, b, c) \ + { \ + c.x += a.x * b.x; \ + c.y += a.y * 
b.y; \
+        c.z += a.z * b.z; \
+        c.w += a.w * b.w; \
+    }
+
+#define DOT_A2B2C1(a, b, c) \
+    { \
+        c += (a.s0 * b.s0 + a.s1 * b.s1); \
+    }
+
+#define DOT_A8B8C1(a, b, c) \
+    { \
+        c += (a.s0 * b.s0 + a.s1 * b.s1 + a.s2 * b.s2 + a.s3 * b.s3); \
+        c += (a.s4 * b.s4 + a.s5 * b.s5 + a.s6 * b.s6 + a.s7 * b.s7); \
+    }
+
+// pairwise dot product of two 16-wide vectors accumulated into a scalar
+#define DOT_A16B16C1(a, b, c) \
+    { \
+        c += (a.s0 * b.s0 + a.s1 * b.s1 + a.s2 * b.s2 + a.s3 * b.s3); \
+        c += (a.s4 * b.s4 + a.s5 * b.s5 + a.s6 * b.s6 + a.s7 * b.s7); \
+        c += (a.s8 * b.s8 + a.s9 * b.s9 + a.sa * b.sa + a.sb * b.sb); \
+        c += (a.sc * b.sc + a.sd * b.sd + a.se * b.se + a.sf * b.sf); \
+    }
+
+#define DOT_A_NORMAL_B1C1_ARRAY(a, b, c) \
+    { \
+        c[0] += a * b[0]; \
+    }
+
+#define DOT_A_NORMAL_B2C2_ARRAY(a, b, c) \
+    { \
+        c[0] += a * b[0]; \
+        c[1] += a * b[1]; \
+    }
+
+#define DOT_A_NORMAL_B3C3_ARRAY(a, b, c) \
+    { \
+        c[0] += a * b[0]; \
+        c[1] += a * b[1]; \
+        c[2] += a * b[2]; \
+    }
+
+#define DOT_A_NORMAL_B4C4_ARRAY(a, b, c) \
+    { \
+        c[0] += a * b[0]; \
+        c[1] += a * b[1]; \
+        c[2] += a * b[2]; \
+        c[3] += a * b[3]; \
+    }
+
+#define DOT_A_NORMAL_B5C5_ARRAY(a, b, c) \
+    { \
+        c[0] += a * b[0]; \
+        c[1] += a * b[1]; \
+        c[2] += a * b[2]; \
+        c[3] += a * b[3]; \
+        c[4] += a * b[4]; \
+    }
+
+#define DOT_A_NORMAL_B6C6_ARRAY(a, b, c) \
+    { \
+        c[0] += a * b[0]; \
+        c[1] += a * b[1]; \
+        c[2] += a * b[2]; \
+        c[3] += a * b[3]; \
+        c[4] += a * b[4]; \
+        c[5] += a * b[5]; \
+    }
+
+#define DOT_A_NORMAL_B7C7_ARRAY(a, b, c) \
+    { \
+        c[0] += a * b[0]; \
+        c[1] += a * b[1]; \
+        c[2] += a * b[2]; \
+        c[3] += a * b[3]; \
+        c[4] += a * b[4]; \
+        c[5] += a * b[5]; \
+        c[6] += a * b[6]; \
+    }
+
+#define DOT_A_NORMAL_B8C8_ARRAY(a, b, c) \
+    { \
+        c[0] += a * b[0]; \
+        c[1] += a * b[1]; \
+        c[2] += a * b[2]; \
+        c[3] += a * b[3]; \
+        c[4] += a * b[4]; \
+        c[5] += a * b[5]; \
+        c[6] += a * b[6]; \
+        c[7] += a * b[7]; \
+    }
+
+#if defined(USE_V2)
+#define DOT_VEC(a, b, c) DOT_A2B2C1(a, b, c)
+#elif defined(USE_V4)
+#define DOT_VEC(a, b, c) DOT_A4B4C1(a, b, c)
+#elif defined(USE_V8)
+#define DOT_VEC(a, b, c) DOT_A8B8C1(a, b, c)
+#elif defined(USE_V16)
+#define DOT_VEC(a, b, c) DOT_A16B16C1(a, b, c)
+#else
+#define DOT_A_VEC_B1C1_ARRAY(a, b, c) DOT_A_NORMAL_B1C1_ARRAY(a, b, c)
+#define DOT_A_VEC_B2C2_ARRAY(a, b, c) DOT_A_NORMAL_B2C2_ARRAY(a, b, c)
+#define DOT_A_VEC_B3C3_ARRAY(a, b, c) DOT_A_NORMAL_B3C3_ARRAY(a, b, c)
+#define DOT_A_VEC_B4C4_ARRAY(a, b, c) DOT_A_NORMAL_B4C4_ARRAY(a, b, c)
+#define DOT_A_VEC_B5C5_ARRAY(a, b, c) DOT_A_NORMAL_B5C5_ARRAY(a, b, c)
+#define DOT_A_VEC_B6C6_ARRAY(a, b, c) DOT_A_NORMAL_B6C6_ARRAY(a, b, c)
+#define DOT_A_VEC_B7C7_ARRAY(a, b, c) DOT_A_NORMAL_B7C7_ARRAY(a, b, c)
+#define DOT_A_VEC_B8C8_ARRAY(a, b, c) DOT_A_NORMAL_B8C8_ARRAY(a, b, c)
+#endif
+
+#if defined(USE_V2) || defined(USE_V4) || defined(USE_V8) || defined(USE_V16)
+#define DOT_A_VEC_B1C1_ARRAY(a, b, c) \
+    { \
+        DOT_VEC(a, b[0], c[0]); \
+    }
+
+#define DOT_A_VEC_B2C2_ARRAY(a, b, c) \
+    { \
+        DOT_VEC(a, b[0], c[0]); \
+        DOT_VEC(a, b[1], c[1]); \
+    }
+
+#define DOT_A_VEC_B3C3_ARRAY(a, b, c) \
+    { \
+        DOT_VEC(a, b[0], c[0]); \
+        DOT_VEC(a, b[1], c[1]); \
+        DOT_VEC(a, b[2], c[2]); \
+    }
+
+#define DOT_A_VEC_B4C4_ARRAY(a, b, c) \
+    { \
+        DOT_VEC(a, b[0], c[0]); \
+        DOT_VEC(a, b[1], c[1]); \
+        DOT_VEC(a, b[2], c[2]); \
+        DOT_VEC(a, b[3], c[3]); \
+    }
+
+#define DOT_A_VEC_B5C5_ARRAY(a, b, c) \
+    { \
+        DOT_VEC(a, b[0], c[0]); \
+        DOT_VEC(a, b[1], c[1]); \
+        DOT_VEC(a, b[2], c[2]); \
+        DOT_VEC(a, b[3], c[3]); \
+        DOT_VEC(a, b[4], c[4]); \
+    }
+
+#define DOT_A_VEC_B6C6_ARRAY(a, b, c) \
+    { \
+        DOT_VEC(a, b[0], c[0]); \
+ DOT_VEC(a, b[1], c[1]); \ + DOT_VEC(a, b[2], c[2]); \ + DOT_VEC(a, b[3], c[3]); \ + DOT_VEC(a, b[4], c[4]); \ + DOT_VEC(a, b[5], c[5]); \ + } + +#define DOT_A_VEC_B7C7_ARRAY(a, b, c) \ + { \ + DOT_VEC(a, b[0], c[0]); \ + DOT_VEC(a, b[1], c[1]); \ + DOT_VEC(a, b[2], c[2]); \ + DOT_VEC(a, b[3], c[3]); \ + DOT_VEC(a, b[4], c[4]); \ + DOT_VEC(a, b[5], c[5]); \ + DOT_VEC(a, b[6], c[6]); \ + } + +#define DOT_A_VEC_B8C8_ARRAY(a, b, c) \ + { \ + DOT_VEC(a, b[0], c[0]); \ + DOT_VEC(a, b[1], c[1]); \ + DOT_VEC(a, b[2], c[2]); \ + DOT_VEC(a, b[3], c[3]); \ + DOT_VEC(a, b[4], c[4]); \ + DOT_VEC(a, b[5], c[5]); \ + DOT_VEC(a, b[6], c[6]); \ + DOT_VEC(a, b[7], c[7]); \ + } +#endif +/* + * ACTIVATION + */ +#if defined(USE_RELU) +#define ACTIVATION_V4(v) \ + { \ + v.x = fmax(v.x, 0); \ + v.y = fmax(v.y, 0); \ + v.z = fmax(v.z, 0); \ + v.w = fmax(v.w, 0); \ + } + +#define ACTIVATION_V8(v) \ + { \ + v.s0 = fmax(v.s0, 0); \ + v.s1 = fmax(v.s1, 0); \ + v.s2 = fmax(v.s2, 0); \ + v.s3 = fmax(v.s3, 0); \ + v.s4 = fmax(v.s4, 0); \ + v.s5 = fmax(v.s5, 0); \ + v.s6 = fmax(v.s6, 0); \ + v.s7 = fmax(v.s7, 0); \ + } + +#define ACTIVATION_V16(v) \ + { \ + v.s0 = fmax(v.s0, 0); \ + v.s1 = fmax(v.s1, 0); \ + v.s2 = fmax(v.s2, 0); \ + v.s3 = fmax(v.s3, 0); \ + v.s4 = fmax(v.s4, 0); \ + v.s5 = fmax(v.s5, 0); \ + v.s6 = fmax(v.s6, 0); \ + v.s7 = fmax(v.s7, 0); \ + v.s8 = fmax(v.s8, 0); \ + v.s9 = fmax(v.s9, 0); \ + v.sa = fmax(v.sa, 0); \ + v.sb = fmax(v.sb, 0); \ + v.sc = fmax(v.sc, 0); \ + v.sd = fmax(v.sd, 0); \ + v.se = fmax(v.se, 0); \ + v.sf = fmax(v.sf, 0); \ + } + +#define ACTIVATION_V1(v) \ + { \ + v = fmax(v, 0); \ + } + +#define ACTIVATION_ARRAY1(v) \ + { \ + v[0] = fmax(v[0], 0); \ + } + +#define ACTIVATION_ARRAY2(v) \ + { \ + v[0] = fmax(v[0], 0); \ + v[1] = fmax(v[1], 0); \ + } + +#define ACTIVATION_ARRAY3(v) \ + { \ + v[0] = fmax(v[0], 0); \ + v[1] = fmax(v[1], 0); \ + v[2] = fmax(v[2], 0); \ + } + +#define ACTIVATION_ARRAY4(v) \ + { \ + v[0] = fmax(v[0], 0); \ + v[1] = fmax(v[1], 0); \ + v[2] = fmax(v[2], 0); \ + v[3] = fmax(v[3], 0); \ + } + +#define ACTIVATION_ARRAY5(v) \ + { \ + v[0] = fmax(v[0], 0); \ + v[1] = fmax(v[1], 0); \ + v[2] = fmax(v[2], 0); \ + v[3] = fmax(v[3], 0); \ + v[4] = fmax(v[4], 0); \ + } + +#define ACTIVATION_ARRAY6(v) \ + { \ + v[0] = fmax(v[0], 0); \ + v[1] = fmax(v[1], 0); \ + v[2] = fmax(v[2], 0); \ + v[3] = fmax(v[3], 0); \ + v[4] = fmax(v[4], 0); \ + v[5] = fmax(v[5], 0); \ + } + +#define ACTIVATION_ARRAY7(v) \ + { \ + v[0] = fmax(v[0], 0); \ + v[1] = fmax(v[1], 0); \ + v[2] = fmax(v[2], 0); \ + v[3] = fmax(v[3], 0); \ + v[4] = fmax(v[4], 0); \ + v[5] = fmax(v[5], 0); \ + v[6] = fmax(v[6], 0); \ + } + +#define ACTIVATION_ARRAY8(v) \ + { \ + v[0] = fmax(v[0], 0); \ + v[1] = fmax(v[1], 0); \ + v[2] = fmax(v[2], 0); \ + v[3] = fmax(v[3], 0); \ + v[4] = fmax(v[4], 0); \ + v[5] = fmax(v[5], 0); \ + v[6] = fmax(v[6], 0); \ + v[7] = fmax(v[7], 0); \ + } +#elif defined(USE_RELU6) +#define ACTIVATION_V4(v) \ + { \ + v.x = clamp(v.x, (T)0, (T)6); \ + v.y = clamp(v.y, (T)0, (T)6); \ + v.z = clamp(v.z, (T)0, (T)6); \ + v.w = clamp(v.w, (T)0, (T)6); \ + } + +#define ACTIVATION_V8(v) \ + { \ + v.s0 = clamp(v.s0, (T)0, (T)6); \ + v.s1 = clamp(v.s1, (T)0, (T)6); \ + v.s2 = clamp(v.s2, (T)0, (T)6); \ + v.s3 = clamp(v.s3, (T)0, (T)6); \ + v.s4 = clamp(v.s4, (T)0, (T)6); \ + v.s5 = clamp(v.s5, (T)0, (T)6); \ + v.s6 = clamp(v.s6, (T)0, (T)6); \ + v.s7 = clamp(v.s7, (T)0, (T)6); \ + } + +#define ACTIVATION_V16(v) \ + { \ + v.s0 = clamp(v.s0, (T)0, (T)6); \ + v.s1 = clamp(v.s1, (T)0, (T)6); \ + 
v.s2 = clamp(v.s2, (T)0, (T)6); \
+        v.s3 = clamp(v.s3, (T)0, (T)6); \
+        v.s4 = clamp(v.s4, (T)0, (T)6); \
+        v.s5 = clamp(v.s5, (T)0, (T)6); \
+        v.s6 = clamp(v.s6, (T)0, (T)6); \
+        v.s7 = clamp(v.s7, (T)0, (T)6); \
+        v.s8 = clamp(v.s8, (T)0, (T)6); \
+        v.s9 = clamp(v.s9, (T)0, (T)6); \
+        v.sa = clamp(v.sa, (T)0, (T)6); \
+        v.sb = clamp(v.sb, (T)0, (T)6); \
+        v.sc = clamp(v.sc, (T)0, (T)6); \
+        v.sd = clamp(v.sd, (T)0, (T)6); \
+        v.se = clamp(v.se, (T)0, (T)6); \
+        v.sf = clamp(v.sf, (T)0, (T)6); \
+    }
+
+#define ACTIVATION_V1(v) \
+    { \
+        v = clamp(v, (T)0, (T)6); \
+    }
+
+#define ACTIVATION_ARRAY1(v) \
+    { \
+        v[0] = clamp(v[0], (T)0, (T)6); \
+    }
+
+#define ACTIVATION_ARRAY2(v) \
+    { \
+        v[0] = clamp(v[0], (T)0, (T)6); \
+        v[1] = clamp(v[1], (T)0, (T)6); \
+    }
+
+#define ACTIVATION_ARRAY3(v) \
+    { \
+        v[0] = clamp(v[0], (T)0, (T)6); \
+        v[1] = clamp(v[1], (T)0, (T)6); \
+        v[2] = clamp(v[2], (T)0, (T)6); \
+    }
+
+#define ACTIVATION_ARRAY4(v) \
+    { \
+        v[0] = clamp(v[0], (T)0, (T)6); \
+        v[1] = clamp(v[1], (T)0, (T)6); \
+        v[2] = clamp(v[2], (T)0, (T)6); \
+        v[3] = clamp(v[3], (T)0, (T)6); \
+    }
+
+#define ACTIVATION_ARRAY5(v) \
+    { \
+        v[0] = clamp(v[0], (T)0, (T)6); \
+        v[1] = clamp(v[1], (T)0, (T)6); \
+        v[2] = clamp(v[2], (T)0, (T)6); \
+        v[3] = clamp(v[3], (T)0, (T)6); \
+        v[4] = clamp(v[4], (T)0, (T)6); \
+    }
+
+#define ACTIVATION_ARRAY6(v) \
+    { \
+        v[0] = clamp(v[0], (T)0, (T)6); \
+        v[1] = clamp(v[1], (T)0, (T)6); \
+        v[2] = clamp(v[2], (T)0, (T)6); \
+        v[3] = clamp(v[3], (T)0, (T)6); \
+        v[4] = clamp(v[4], (T)0, (T)6); \
+        v[5] = clamp(v[5], (T)0, (T)6); \
+    }
+
+#define ACTIVATION_ARRAY7(v) \
+    { \
+        v[0] = clamp(v[0], (T)0, (T)6); \
+        v[1] = clamp(v[1], (T)0, (T)6); \
+        v[2] = clamp(v[2], (T)0, (T)6); \
+        v[3] = clamp(v[3], (T)0, (T)6); \
+        v[4] = clamp(v[4], (T)0, (T)6); \
+        v[5] = clamp(v[5], (T)0, (T)6); \
+        v[6] = clamp(v[6], (T)0, (T)6); \
+    }
+
+#define ACTIVATION_ARRAY8(v) \
+    { \
+        v[0] = clamp(v[0], (T)0, (T)6); \
+        v[1] = clamp(v[1], (T)0, (T)6); \
+        v[2] = clamp(v[2], (T)0, (T)6); \
+        v[3] = clamp(v[3], (T)0, (T)6); \
+        v[4] = clamp(v[4], (T)0, (T)6); \
+        v[5] = clamp(v[5], (T)0, (T)6); \
+        v[6] = clamp(v[6], (T)0, (T)6); \
+        v[7] = clamp(v[7], (T)0, (T)6); \
+    }
+#elif defined(USE_GELU)
+// tanh-approximation GELU: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+#define ACTIVATION_V4(v) \
+    { \
+        T4 tmp = v; \
+        v.s0 = 0.797885 * (v.s0 + 0.044715 * pown(v.s0, 3)); \
+        v.s1 = 0.797885 * (v.s1 + 0.044715 * pown(v.s1, 3)); \
+        v.s2 = 0.797885 * (v.s2 + 0.044715 * pown(v.s2, 3)); \
+        v.s3 = 0.797885 * (v.s3 + 0.044715 * pown(v.s3, 3)); \
+        v.s0 = 1.0 - 2.0 / (exp(2.0 * v.s0) + 1.0); \
+        v.s1 = 1.0 - 2.0 / (exp(2.0 * v.s1) + 1.0); \
+        v.s2 = 1.0 - 2.0 / (exp(2.0 * v.s2) + 1.0); \
+        v.s3 = 1.0 - 2.0 / (exp(2.0 * v.s3) + 1.0); \
+        v.s0 = (v.s0 + (T)1.0) * (T)0.5; \
+        v.s1 = (v.s1 + (T)1.0) * (T)0.5; \
+        v.s2 = (v.s2 + (T)1.0) * (T)0.5; \
+        v.s3 = (v.s3 + (T)1.0) * (T)0.5; \
+        v.s0 = v.s0 * tmp.s0; \
+        v.s1 = v.s1 * tmp.s1; \
+        v.s2 = v.s2 * tmp.s2; \
+        v.s3 = v.s3 * tmp.s3; \
+    }
+
+#define ACTIVATION_V8(v) \
+    { \
+        T8 tmp = v; \
+        v.s0 = 0.797885 * (v.s0 + 0.044715 * pown(v.s0, 3)); \
+        v.s1 = 0.797885 * (v.s1 + 0.044715 * pown(v.s1, 3)); \
+        v.s2 = 0.797885 * (v.s2 + 0.044715 * pown(v.s2, 3)); \
+        v.s3 = 0.797885 * (v.s3 + 0.044715 * pown(v.s3, 3)); \
+        v.s4 = 0.797885 * (v.s4 + 0.044715 * pown(v.s4, 3)); \
+        v.s5 = 0.797885 * (v.s5 + 0.044715 * pown(v.s5, 3)); \
+        v.s6 = 0.797885 * (v.s6 + 0.044715 * pown(v.s6, 3)); \
+        v.s7 = 0.797885 * (v.s7 + 0.044715 * pown(v.s7, 3)); \
+        v.s0 = 1.0 - 2.0 / (exp(2.0 * v.s0) + 1.0); \
+        v.s1 = 1.0 - 2.0 /
(exp(2.0 * v.s1) + 1.0); \ + v.s2 = 1.0 - 2.0 / (exp(2.0 * v.s2) + 1.0); \ + v.s3 = 1.0 - 2.0 / (exp(2.0 * v.s3) + 1.0); \ + v.s4 = 1.0 - 2.0 / (exp(2.0 * v.s4) + 1.0); \ + v.s5 = 1.0 - 2.0 / (exp(2.0 * v.s5) + 1.0); \ + v.s6 = 1.0 - 2.0 / (exp(2.0 * v.s6) + 1.0); \ + v.s7 = 1.0 - 2.0 / (exp(2.0 * v.s7) + 1.0); \ + v.s0 = (v.s0 + (T)1.0) * (T)0.5; \ + v.s1 = (v.s1 + (T)1.0) * (T)0.5; \ + v.s2 = (v.s2 + (T)1.0) * (T)0.5; \ + v.s3 = (v.s3 + (T)1.0) * (T)0.5; \ + v.s4 = (v.s4 + (T)1.0) * (T)0.5; \ + v.s5 = (v.s5 + (T)1.0) * (T)0.5; \ + v.s6 = (v.s6 + (T)1.0) * (T)0.5; \ + v.s7 = (v.s7 + (T)1.0) * (T)0.5; \ + v.s0 = v.s0 * tmp.s0; \ + v.s1 = v.s1 * tmp.s1; \ + v.s2 = v.s2 * tmp.s2; \ + v.s3 = v.s3 * tmp.s3; \ + v.s4 = v.s4 * tmp.s4; \ + v.s5 = v.s5 * tmp.s5; \ + v.s6 = v.s6 * tmp.s6; \ + v.s7 = v.s7 * tmp.s7; \ + } + +#define ACTIVATION_V16(v) \ + { \ + T16 tmp = v; \ + v.s0 = 0.797885 * (v.s0 + 0.044715 * pown(v.s0, 3)); \ + v.s1 = 0.797885 * (v.s1 + 0.044715 * pown(v.s1, 3)); \ + v.s2 = 0.797885 * (v.s2 + 0.044715 * pown(v.s2, 3)); \ + v.s3 = 0.797885 * (v.s3 + 0.044715 * pown(v.s3, 3)); \ + v.s4 = 0.797885 * (v.s4 + 0.044715 * pown(v.s4, 3)); \ + v.s5 = 0.797885 * (v.s5 + 0.044715 * pown(v.s5, 3)); \ + v.s6 = 0.797885 * (v.s6 + 0.044715 * pown(v.s6, 3)); \ + v.s7 = 0.797885 * (v.s7 + 0.044715 * pown(v.s7, 3)); \ + v.s8 = 0.797885 * (v.s8 + 0.044715 * pown(v.s8, 3)); \ + v.s9 = 0.797885 * (v.s9 + 0.044715 * pown(v.s9, 3)); \ + v.sa = 0.797885 * (v.sa + 0.044715 * pown(v.sa, 3)); \ + v.sb = 0.797885 * (v.sb + 0.044715 * pown(v.sb, 3)); \ + v.sc = 0.797885 * (v.sc + 0.044715 * pown(v.sc, 3)); \ + v.sd = 0.797885 * (v.sd + 0.044715 * pown(v.sd, 3)); \ + v.se = 0.797885 * (v.se + 0.044715 * pown(v.se, 3)); \ + v.sf = 0.797885 * (v.sf + 0.044715 * pown(v.sf, 3)); \ + v.s0 = 1.0 - 2.0 / (exp(2.0 * v.s0) + 1.0); \ + v.s1 = 1.0 - 2.0 / (exp(2.0 * v.s1) + 1.0); \ + v.s2 = 1.0 - 2.0 / (exp(2.0 * v.s2) + 1.0); \ + v.s3 = 1.0 - 2.0 / (exp(2.0 * v.s3) + 1.0); \ + v.s4 = 1.0 - 2.0 / (exp(2.0 * v.s4) + 1.0); \ + v.s5 = 1.0 - 2.0 / (exp(2.0 * v.s5) + 1.0); \ + v.s6 = 1.0 - 2.0 / (exp(2.0 * v.s6) + 1.0); \ + v.s7 = 1.0 - 2.0 / (exp(2.0 * v.s7) + 1.0); \ + v.s8 = 1.0 - 2.0 / (exp(2.0 * v.s8) + 1.0); \ + v.s9 = 1.0 - 2.0 / (exp(2.0 * v.s9) + 1.0); \ + v.sa = 1.0 - 2.0 / (exp(2.0 * v.sa) + 1.0); \ + v.sb = 1.0 - 2.0 / (exp(2.0 * v.sb) + 1.0); \ + v.sc = 1.0 - 2.0 / (exp(2.0 * v.sc) + 1.0); \ + v.sd = 1.0 - 2.0 / (exp(2.0 * v.sd) + 1.0); \ + v.se = 1.0 - 2.0 / (exp(2.0 * v.se) + 1.0); \ + v.sf = 1.0 - 2.0 / (exp(2.0 * v.sf) + 1.0); \ + v.s0 = (v.s0 + (T)1.0) * (T)0.5; \ + v.s1 = (v.s1 + (T)1.0) * (T)0.5; \ + v.s2 = (v.s2 + (T)1.0) * (T)0.5; \ + v.s3 = (v.s3 + (T)1.0) * (T)0.5; \ + v.s4 = (v.s4 + (T)1.0) * (T)0.5; \ + v.s5 = (v.s5 + (T)1.0) * (T)0.5; \ + v.s6 = (v.s6 + (T)1.0) * (T)0.5; \ + v.s7 = (v.s7 + (T)1.0) * (T)0.5; \ + v.s8 = (v.s8 + (T)1.0) * (T)0.5; \ + v.s9 = (v.s9 + (T)1.0) * (T)0.5; \ + v.sa = (v.sa + (T)1.0) * (T)0.5; \ + v.sb = (v.sb + (T)1.0) * (T)0.5; \ + v.sc = (v.sc + (T)1.0) * (T)0.5; \ + v.sd = (v.sd + (T)1.0) * (T)0.5; \ + v.se = (v.se + (T)1.0) * (T)0.5; \ + v.sf = (v.sf + (T)1.0) * (T)0.5; \ + v.s0 = v.s0 * tmp.s0; \ + v.s1 = v.s1 * tmp.s1; \ + v.s2 = v.s2 * tmp.s2; \ + v.s3 = v.s3 * tmp.s3; \ + v.s4 = v.s4 * tmp.s4; \ + v.s5 = v.s5 * tmp.s5; \ + v.s6 = v.s6 * tmp.s6; \ + v.s7 = v.s7 * tmp.s7; \ + v.s8 = v.s8 * tmp.s8; \ + v.s9 = v.s9 * tmp.s9; \ + v.sa = v.sa * tmp.sa; \ + v.sb = v.sb * tmp.sb; \ + v.sc = v.sc * tmp.sc; \ + v.sd = v.sd * tmp.sd; \ + v.se = v.se * tmp.se; \ + v.sf = v.sf * 
tmp.sf; \ + } + +#define ACTIVATION_ARRAY1(v) \ + { \ + T tmp = v[0]; \ + v[0] = 0.797885 * (v[0] + 0.044715 * pown(v[0], 3)); \ + v[0] = 1.0 - 2.0 / (exp(2.0 * v[0]) + 1.0); \ + v[0] = (v[0] + (T)1.0) * (T)0.5; \ + v[0] = v[0] * tmp; \ + } + +#define ACTIVATION_ARRAY2(v) \ + { \ + T tmp[2]; \ + tmp[0] = v[0]; \ + tmp[1] = v[1]; \ + v[0] = 0.797885 * (v[0] + 0.044715 * pown(v[0], 3)); \ + v[1] = 0.797885 * (v[1] + 0.044715 * pown(v[1], 3)); \ + v[0] = 1.0 - 2.0 / (exp(2.0 * v[0]) + 1.0); \ + v[1] = 1.0 - 2.0 / (exp(2.0 * v[1]) + 1.0); \ + v[0] = (v[0] + (T)1.0) * (T)0.5; \ + v[1] = (v[1] + (T)1.0) * (T)0.5; \ + v[0] = v[0] * tmp[0]; \ + v[1] = v[1] * tmp[1]; \ + } + +#define ACTIVATION_ARRAY3(v) \ + { \ + T tmp[3]; \ + tmp[0] = v[0]; \ + tmp[1] = v[1]; \ + tmp[2] = v[2]; \ + v[0] = 0.797885 * (v[0] + 0.044715 * pown(v[0], 3)); \ + v[1] = 0.797885 * (v[1] + 0.044715 * pown(v[1], 3)); \ + v[2] = 0.797885 * (v[2] + 0.044715 * pown(v[2], 3)); \ + v[0] = 1.0 - 2.0 / (exp(2.0 * v[0]) + 1.0); \ + v[1] = 1.0 - 2.0 / (exp(2.0 * v[1]) + 1.0); \ + v[2] = 1.0 - 2.0 / (exp(2.0 * v[2]) + 1.0); \ + v[0] = (v[0] + (T)1.0) * (T)0.5; \ + v[1] = (v[1] + (T)1.0) * (T)0.5; \ + v[2] = (v[2] + (T)1.0) * (T)0.5; \ + v[0] = v[0] * tmp[0]; \ + v[1] = v[1] * tmp[1]; \ + v[2] = v[2] * tmp[2]; \ + } + +#define ACTIVATION_ARRAY4(v) \ + { \ + T tmp[4]; \ + tmp[0] = v[0]; \ + tmp[1] = v[1]; \ + tmp[2] = v[2]; \ + tmp[3] = v[3]; \ + v[0] = 0.797885 * (v[0] + 0.044715 * pown(v[0], 3)); \ + v[1] = 0.797885 * (v[1] + 0.044715 * pown(v[1], 3)); \ + v[2] = 0.797885 * (v[2] + 0.044715 * pown(v[2], 3)); \ + v[3] = 0.797885 * (v[3] + 0.044715 * pown(v[3], 3)); \ + v[0] = 1.0 - 2.0 / (exp(2.0 * v[0]) + 1.0); \ + v[1] = 1.0 - 2.0 / (exp(2.0 * v[1]) + 1.0); \ + v[2] = 1.0 - 2.0 / (exp(2.0 * v[2]) + 1.0); \ + v[3] = 1.0 - 2.0 / (exp(2.0 * v[3]) + 1.0); \ + v[0] = (v[0] + (T)1.0) * (T)0.5; \ + v[1] = (v[1] + (T)1.0) * (T)0.5; \ + v[2] = (v[2] + (T)1.0) * (T)0.5; \ + v[3] = (v[3] + (T)1.0) * (T)0.5; \ + v[0] = v[0] * tmp[0]; \ + v[1] = v[1] * tmp[1]; \ + v[2] = v[2] * tmp[2]; \ + v[3] = v[3] * tmp[3]; \ + } + +#define ACTIVATION_ARRAY5(v) \ + { \ + T tmp[5]; \ + tmp[0] = v[0]; \ + tmp[1] = v[1]; \ + tmp[2] = v[2]; \ + tmp[3] = v[3]; \ + tmp[4] = v[4]; \ + v[0] = 0.797885 * (v[0] + 0.044715 * pown(v[0], 3)); \ + v[1] = 0.797885 * (v[1] + 0.044715 * pown(v[1], 3)); \ + v[2] = 0.797885 * (v[2] + 0.044715 * pown(v[2], 3)); \ + v[3] = 0.797885 * (v[3] + 0.044715 * pown(v[3], 3)); \ + v[4] = 0.797885 * (v[4] + 0.044715 * pown(v[4], 3)); \ + v[0] = 1.0 - 2.0 / (exp(2.0 * v[0]) + 1.0); \ + v[1] = 1.0 - 2.0 / (exp(2.0 * v[1]) + 1.0); \ + v[2] = 1.0 - 2.0 / (exp(2.0 * v[2]) + 1.0); \ + v[3] = 1.0 - 2.0 / (exp(2.0 * v[3]) + 1.0); \ + v[4] = 1.0 - 2.0 / (exp(2.0 * v[4]) + 1.0); \ + v[0] = (v[0] + (T)1.0) * (T)0.5; \ + v[1] = (v[1] + (T)1.0) * (T)0.5; \ + v[2] = (v[2] + (T)1.0) * (T)0.5; \ + v[3] = (v[3] + (T)1.0) * (T)0.5; \ + v[4] = (v[4] + (T)1.0) * (T)0.5; \ + v[0] = v[0] * tmp[0]; \ + v[1] = v[1] * tmp[1]; \ + v[2] = v[2] * tmp[2]; \ + v[3] = v[3] * tmp[3]; \ + v[4] = v[4] * tmp[4]; \ + } + +#define ACTIVATION_ARRAY6(v) \ + { \ + T tmp[6]; \ + tmp[0] = v[0]; \ + tmp[1] = v[1]; \ + tmp[2] = v[2]; \ + tmp[3] = v[3]; \ + tmp[4] = v[4]; \ + tmp[5] = v[5]; \ + v[0] = 0.797885 * (v[0] + 0.044715 * pown(v[0], 3)); \ + v[1] = 0.797885 * (v[1] + 0.044715 * pown(v[1], 3)); \ + v[2] = 0.797885 * (v[2] + 0.044715 * pown(v[2], 3)); \ + v[3] = 0.797885 * (v[3] + 0.044715 * pown(v[3], 3)); \ + v[4] = 0.797885 * (v[4] + 0.044715 * pown(v[4], 3)); \ + v[5] = 
0.797885 * (v[5] + 0.044715 * pown(v[5], 3)); \ + v[0] = 1.0 - 2.0 / (exp(2.0 * v[0]) + 1.0); \ + v[1] = 1.0 - 2.0 / (exp(2.0 * v[1]) + 1.0); \ + v[2] = 1.0 - 2.0 / (exp(2.0 * v[2]) + 1.0); \ + v[3] = 1.0 - 2.0 / (exp(2.0 * v[3]) + 1.0); \ + v[4] = 1.0 - 2.0 / (exp(2.0 * v[4]) + 1.0); \ + v[5] = 1.0 - 2.0 / (exp(2.0 * v[5]) + 1.0); \ + v[0] = (v[0] + (T)1.0) * (T)0.5; \ + v[1] = (v[1] + (T)1.0) * (T)0.5; \ + v[2] = (v[2] + (T)1.0) * (T)0.5; \ + v[3] = (v[3] + (T)1.0) * (T)0.5; \ + v[4] = (v[4] + (T)1.0) * (T)0.5; \ + v[5] = (v[5] + (T)1.0) * (T)0.5; \ + v[0] = v[0] * tmp[0]; \ + v[1] = v[1] * tmp[1]; \ + v[2] = v[2] * tmp[2]; \ + v[3] = v[3] * tmp[3]; \ + v[4] = v[4] * tmp[4]; \ + v[5] = v[5] * tmp[5]; \ + } + +#define ACTIVATION_ARRAY7(v) \ + { \ + T tmp[7]; \ + tmp[0] = v[0]; \ + tmp[1] = v[1]; \ + tmp[2] = v[2]; \ + tmp[3] = v[3]; \ + tmp[4] = v[4]; \ + tmp[5] = v[5]; \ + tmp[6] = v[6]; \ + v[0] = 0.797885 * (v[0] + 0.044715 * pown(v[0], 3)); \ + v[1] = 0.797885 * (v[1] + 0.044715 * pown(v[1], 3)); \ + v[2] = 0.797885 * (v[2] + 0.044715 * pown(v[2], 3)); \ + v[3] = 0.797885 * (v[3] + 0.044715 * pown(v[3], 3)); \ + v[4] = 0.797885 * (v[4] + 0.044715 * pown(v[4], 3)); \ + v[5] = 0.797885 * (v[5] + 0.044715 * pown(v[5], 3)); \ + v[6] = 0.797885 * (v[6] + 0.044715 * pown(v[6], 3)); \ + v[0] = 1.0 - 2.0 / (exp(2.0 * v[0]) + 1.0); \ + v[1] = 1.0 - 2.0 / (exp(2.0 * v[1]) + 1.0); \ + v[2] = 1.0 - 2.0 / (exp(2.0 * v[2]) + 1.0); \ + v[3] = 1.0 - 2.0 / (exp(2.0 * v[3]) + 1.0); \ + v[4] = 1.0 - 2.0 / (exp(2.0 * v[4]) + 1.0); \ + v[5] = 1.0 - 2.0 / (exp(2.0 * v[5]) + 1.0); \ + v[6] = 1.0 - 2.0 / (exp(2.0 * v[6]) + 1.0); \ + v[0] = (v[0] + (T)1.0) * (T)0.5; \ + v[1] = (v[1] + (T)1.0) * (T)0.5; \ + v[2] = (v[2] + (T)1.0) * (T)0.5; \ + v[3] = (v[3] + (T)1.0) * (T)0.5; \ + v[4] = (v[4] + (T)1.0) * (T)0.5; \ + v[5] = (v[5] + (T)1.0) * (T)0.5; \ + v[6] = (v[6] + (T)1.0) * (T)0.5; \ + v[0] = v[0] * tmp[0]; \ + v[1] = v[1] * tmp[1]; \ + v[2] = v[2] * tmp[2]; \ + v[3] = v[3] * tmp[3]; \ + v[4] = v[4] * tmp[4]; \ + v[5] = v[5] * tmp[5]; \ + v[6] = v[6] * tmp[6]; \ + } + +#define ACTIVATION_ARRAY8(v) \ + { \ + T tmp[8]; \ + tmp[0] = v[0]; \ + tmp[1] = v[1]; \ + tmp[2] = v[2]; \ + tmp[3] = v[3]; \ + tmp[4] = v[4]; \ + tmp[5] = v[5]; \ + tmp[6] = v[6]; \ + tmp[7] = v[7]; \ + v[0] = 0.797885 * (v[0] + 0.044715 * pown(v[0], 3)); \ + v[1] = 0.797885 * (v[1] + 0.044715 * pown(v[1], 3)); \ + v[2] = 0.797885 * (v[2] + 0.044715 * pown(v[2], 3)); \ + v[3] = 0.797885 * (v[3] + 0.044715 * pown(v[3], 3)); \ + v[4] = 0.797885 * (v[4] + 0.044715 * pown(v[4], 3)); \ + v[5] = 0.797885 * (v[5] + 0.044715 * pown(v[5], 3)); \ + v[6] = 0.797885 * (v[6] + 0.044715 * pown(v[6], 3)); \ + v[7] = 0.797885 * (v[7] + 0.044715 * pown(v[7], 3)); \ + v[0] = 1.0 - 2.0 / (exp(2.0 * v[0]) + 1.0); \ + v[1] = 1.0 - 2.0 / (exp(2.0 * v[1]) + 1.0); \ + v[2] = 1.0 - 2.0 / (exp(2.0 * v[2]) + 1.0); \ + v[3] = 1.0 - 2.0 / (exp(2.0 * v[3]) + 1.0); \ + v[4] = 1.0 - 2.0 / (exp(2.0 * v[4]) + 1.0); \ + v[5] = 1.0 - 2.0 / (exp(2.0 * v[5]) + 1.0); \ + v[6] = 1.0 - 2.0 / (exp(2.0 * v[6]) + 1.0); \ + v[7] = 1.0 - 2.0 / (exp(2.0 * v[7]) + 1.0); \ + v[0] = (v[0] + (T)1.0) * (T)0.5; \ + v[1] = (v[1] + (T)1.0) * (T)0.5; \ + v[2] = (v[2] + (T)1.0) * (T)0.5; \ + v[3] = (v[3] + (T)1.0) * (T)0.5; \ + v[4] = (v[4] + (T)1.0) * (T)0.5; \ + v[5] = (v[5] + (T)1.0) * (T)0.5; \ + v[6] = (v[6] + (T)1.0) * (T)0.5; \ + v[7] = (v[7] + (T)1.0) * (T)0.5; \ + v[0] = v[0] * tmp[0]; \ + v[1] = v[1] * tmp[1]; \ + v[2] = v[2] * tmp[2]; \ + v[3] = v[3] * tmp[3]; \ + v[4] = v[4] * tmp[4]; 
\ + v[5] = v[5] * tmp[5]; \ + v[6] = v[6] * tmp[6]; \ + v[7] = v[7] * tmp[7]; \ + } +#elif defined(USE_HSIGMOID) +#define ACTIVATION_V4(v) \ + { \ + v.s0 = v.s0 + (T)3.0; \ + v.s1 = v.s1 + (T)3.0; \ + v.s2 = v.s2 + (T)3.0; \ + v.s3 = v.s3 + (T)3.0; \ + v.s0 = clamp(v.s0, (T)0, (T)6.0); \ + v.s1 = clamp(v.s1, (T)0, (T)6.0); \ + v.s2 = clamp(v.s2, (T)0, (T)6.0); \ + v.s3 = clamp(v.s3, (T)0, (T)6.0); \ + v.s0 = v.s0 * 0.166667; \ + v.s1 = v.s1 * 0.166667; \ + v.s2 = v.s2 * 0.166667; \ + v.s3 = v.s3 * 0.166667; \ + } +#elif defined(USE_HSWISH) +#define ACTIVATION_V4(v) \ + { \ + T4 tmp = v; \ + v.s0 = v.s0 + (T)3.0; \ + v.s1 = v.s1 + (T)3.0; \ + v.s2 = v.s2 + (T)3.0; \ + v.s3 = v.s3 + (T)3.0; \ + v.s0 = clamp(v.s0, (T)0, (T)6.0); \ + v.s1 = clamp(v.s1, (T)0, (T)6.0); \ + v.s2 = clamp(v.s2, (T)0, (T)6.0); \ + v.s3 = clamp(v.s3, (T)0, (T)6.0); \ + v.s0 = tmp.s0 * (v.s0 * 0.166667); \ + v.s1 = tmp.s1 * (v.s1 * 0.166667); \ + v.s2 = tmp.s2 * (v.s2 * 0.166667); \ + v.s3 = tmp.s3 * (v.s3 * 0.166667); \ + } +#elif defined(USE_TANH) +#define ACTIVATION_V4(v) \ + { \ + v.s0 = 1.0 - 2.0 / (exp(2.0 * v.s0) + 1.0); \ + v.s1 = 1.0 - 2.0 / (exp(2.0 * v.s1) + 1.0); \ + v.s2 = 1.0 - 2.0 / (exp(2.0 * v.s2) + 1.0); \ + v.s3 = 1.0 - 2.0 / (exp(2.0 * v.s3) + 1.0); \ + } +#elif defined(USE_SIGMOID) +#define ACTIVATION_V4(v) \ + { \ + v.s0 = 1.0 / (1.0 + exp(-1.0 * v.s0)); \ + v.s1 = 1.0 / (1.0 + exp(-1.0 * v.s1)); \ + v.s2 = 1.0 / (1.0 + exp(-1.0 * v.s2)); \ + v.s3 = 1.0 / (1.0 + exp(-1.0 * v.s3)); \ + } +#else +#define ACTIVATION_V1(v) \ + {} + +#define ACTIVATION_V4(v) \ + {} + +#define ACTIVATION_V8(v) \ + {} + +#define ACTIVATION_V16(v) \ + {} + +#define ACTIVATION_ARRAY1(v) \ + {} + +#define ACTIVATION_ARRAY2(v) \ + {} + +#define ACTIVATION_ARRAY3(v) \ + {} + +#define ACTIVATION_ARRAY4(v) \ + {} + +#define ACTIVATION_ARRAY5(v) \ + {} + +#define ACTIVATION_ARRAY6(v) \ + {} + +#define ACTIVATION_ARRAY7(v) \ + {} + +#define ACTIVATION_ARRAY8(v) \ + {} +#endif + +/* + * store data reg array to buffer + */ +#define STORE_BUF_ARRAY1(v, off, buf) \ + { \ + ACTIVATION_ARRAY1(v); \ + buf[off] = v[0]; \ + } + +#define STORE_BUF_ARRAY2(v, off, buf) \ + { \ + ACTIVATION_ARRAY2(v); \ + vstore2((T2)(v[0], v[1]), 0, buf + off); \ + } + +#define STORE_BUF_ARRAY3(v, off, buf) \ + { \ + ACTIVATION_ARRAY3(v); \ + vstore3((T3)(v[0], v[1], v[2]), 0, buf + off); \ + } + +#define STORE_BUF_ARRAY4(v, off, buf) \ + { \ + ACTIVATION_ARRAY4(v); \ + vstore4((T4)(v[0], v[1], v[2], v[3]), 0, buf + off); \ + } + +#define STORE_BUF_ARRAY5(v, off, buf) \ + { \ + ACTIVATION_ARRAY5(v); \ + vstore4((T4)(v[0], v[1], v[2], v[3]), 0, buf + off); \ + buf[off + 4] = v[4]; \ + } + +#define STORE_BUF_ARRAY6(v, off, buf) \ + { \ + ACTIVATION_ARRAY6(v); \ + vstore3((T3)(v[0], v[1], v[2]), 0, buf + off); \ + vstore3((T3)(v[3], v[4], v[5]), 0, buf + off + 3); \ + } + +#define STORE_BUF_ARRAY7(v, off, buf) \ + { \ + ACTIVATION_ARRAY7(v); \ + vstore4((T4)(v[0], v[1], v[2], v[3]), 0, buf + off); \ + vstore3((T3)(v[4], v[5], v[6]), 0, buf + off + 4); \ + } + +#define STORE_BUF_ARRAY8(v, off, buf) \ + { \ + ACTIVATION_ARRAY8(v); \ + vstore8((T8)(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]), 0, buf + off); \ + } +/* + * LOAD BIAS + * Load bias from image 1D based on out number + * ON is out number + */ + +#if (ON == 1) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + } +#elif (ON == 2) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + } +#elif (ON == 
3) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + } +#elif (ON == 4) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + V[3] = V[0]; \ + } +#elif (ON == 5) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + V[3] = V[0]; \ + V[4] = V[0]; \ + } +#elif (ON == 6) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + V[3] = V[0]; \ + V[4] = V[0]; \ + V[5] = V[0]; \ + } +#elif (ON == 7) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + V[3] = V[0]; \ + V[4] = V[0]; \ + V[5] = V[0]; \ + V[6] = V[0]; \ + } +#elif (ON == 8) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + V[3] = V[0]; \ + V[4] = V[0]; \ + V[5] = V[0]; \ + V[6] = V[0]; \ + V[7] = V[0]; \ + } +#elif (ON == 9) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + V[3] = V[0]; \ + V[4] = V[0]; \ + V[5] = V[0]; \ + V[6] = V[0]; \ + V[7] = V[0]; \ + V[8] = V[0]; \ + } +#elif (ON == 10) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + V[3] = V[0]; \ + V[4] = V[0]; \ + V[5] = V[0]; \ + V[6] = V[0]; \ + V[7] = V[0]; \ + V[8] = V[0]; \ + V[9] = V[0]; \ + } +#elif (ON == 11) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + V[3] = V[0]; \ + V[4] = V[0]; \ + V[5] = V[0]; \ + V[6] = V[0]; \ + V[7] = V[0]; \ + V[8] = V[0]; \ + V[9] = V[0]; \ + V[10] = V[0]; \ + } +#elif (ON == 12) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + V[3] = V[0]; \ + V[4] = V[0]; \ + V[5] = V[0]; \ + V[6] = V[0]; \ + V[7] = V[0]; \ + V[8] = V[0]; \ + V[9] = V[0]; \ + V[10] = V[0]; \ + V[11] = V[0]; \ + } +#endif + +/* + * LOAD INPUT + * load input from buffer based on len of array vector 4 + * len = N; + * N is usually associated with number W + * + * + * GEMM TN A x B = C + * Matrix A has been transposed + * Operator define for Matrix B and Matrix C + */ +#if (LN == 0) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + {} +#elif (LN == 1) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + } + +#define GEMM_SET_C_BIAS_X(v, reg) \ + { \ + SET_REG_ARRAY1(v, reg); \ + } + +#define GEMM_SET_C_EDGE_ZERO_X(reg, ex) \ + {} + +#define ADD_ELTWISE_NCHW_X(v, off, buf) \ + { \ + ADD_BUF_ARRAY1(v, off, buf); \ + } + +#define GEMM_LOAD_B(v, off, buf) \ + { \ + LOAD_BUF_ARRAY1(v, off, buf); \ + } + +#define GEMM_CALCORE_X(a, b, c) \ + { \ + DOT_A_VEC_B1C1_ARRAY(a, b, c); \ + } + +#define GEMM_MUL_C_X(a, b, reg) \ + { \ + MUL_REG_NORMAL_ARRAY1(a, b, reg); \ + } + +#define GEMM_STORE_C_X(v, off, buf) \ + { \ + STORE_BUF_ARRAY1(v, off, buf); \ + } + +#define GEMM_NT_LOAD_B(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + } +#elif (LN == 2) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + } + +#define GEMM_SET_C_BIAS_X(v, reg) \ + { \ + SET_REG_ARRAY2(v, reg); \ + } + +#define 
GEMM_SET_C_EDGE_ZERO_X(reg, ex) \ + { \ + reg[1] = 0; \ + } + +#define ADD_ELTWISE_NCHW_X(v, off, buf) \ + { \ + ADD_BUF_ARRAY2(v, off, buf); \ + } + +#define GEMM_LOAD_B(v, off, buf) \ + { \ + LOAD_BUF_ARRAY2(v, off, buf); \ + } + +#define GEMM_CALCORE_X(a, b, c) \ + { \ + DOT_A_VEC_B2C2_ARRAY(a, b, c); \ + } + +#define GEMM_MUL_C_X(a, b, reg) \ + { \ + MUL_REG_NORMAL_ARRAY2(a, b, reg); \ + } + +#define GEMM_STORE_C_X(v, off, buf) \ + { \ + STORE_BUF_ARRAY2(v, off, buf); \ + } + +#define GEMM_NT_LOAD_B(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + } +#elif (LN == 3) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + } + +#define GEMM_SET_C_BIAS_X(v, reg) \ + { \ + SET_REG_ARRAY3(v, reg); \ + } + +#define GEMM_SET_C_EDGE_ZERO_X(reg, ex) \ + { \ + if (ex > 1) \ + reg[1] = 0; \ + reg[2] = 0; \ + } + +#define ADD_ELTWISE_NCHW_X(v, off, buf) \ + { \ + ADD_BUF_ARRAY3(v, off, buf); \ + } + +#define GEMM_LOAD_B(v, off, buf) \ + { \ + LOAD_BUF_ARRAY3(v, off, buf); \ + } + +#define GEMM_CALCORE_X(a, b, c) \ + { \ + DOT_A_VEC_B3C3_ARRAY(a, b, c); \ + } + +#define GEMM_MUL_C_X(a, b, reg) \ + { \ + MUL_REG_NORMAL_ARRAY3(a, b, reg); \ + } + +#define GEMM_STORE_C_X(v, off, buf) \ + { \ + STORE_BUF_ARRAY3(v, off, buf); \ + } + +#define GEMM_NT_LOAD_B(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + } +#elif (LN == 4) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + V[3] = vload4(off + str * 3, buf); \ + } + +#define GEMM_SET_C_BIAS_X(v, reg) \ + { \ + SET_REG_ARRAY4(v, reg); \ + } + +#define GEMM_SET_C_EDGE_ZERO_X(reg, ex) \ + { \ + if (ex > 2) \ + reg[1] = 0; \ + if (ex > 1) \ + reg[2] = 0; \ + reg[3] = 0; \ + } + +#define ADD_ELTWISE_NCHW_X(v, off, buf) \ + { \ + ADD_BUF_ARRAY4(v, off, buf); \ + } + +#define GEMM_LOAD_B(v, off, buf) \ + { \ + LOAD_BUF_ARRAY4(v, off, buf); \ + } + +#define GEMM_CALCORE_X(a, b, c) \ + { \ + DOT_A_VEC_B4C4_ARRAY(a, b, c); \ + } + +#define GEMM_MUL_C_X(a, b, reg) \ + { \ + MUL_REG_NORMAL_ARRAY4(a, b, reg); \ + } + +#define GEMM_STORE_C_X(v, off, buf) \ + { \ + STORE_BUF_ARRAY4(v, off, buf); \ + } + +#define GEMM_NT_LOAD_B(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + } +#elif (LN == 5) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + V[3] = vload4(off + str * 3, buf); \ + V[4] = vload4(off + str * 4, buf); \ + } + +#define GEMM_SET_C_BIAS_X(v, reg) \ + { \ + SET_REG_ARRAY5(v, reg); \ + } + +#define GEMM_SET_C_EDGE_ZERO_X(reg, ex) \ + { \ + if (ex > 3) \ + reg[1] = 0; \ + if (ex > 2) \ + reg[2] = 0; \ + if (ex > 1) \ + reg[3] = 0; \ + reg[4] = 0; \ + } + +#define ADD_ELTWISE_NCHW_X(v, off, buf) \ + { \ + ADD_BUF_ARRAY5(v, off, buf); \ + } + +#define GEMM_LOAD_B(v, off, buf) \ + { \ + LOAD_BUF_ARRAY5(v, off, buf); \ + } + +#define GEMM_CALCORE_X(a, b, c) \ + { \ + DOT_A_VEC_B5C5_ARRAY(a, b, c); \ + } + +#define GEMM_MUL_C_X(a, b, reg) \ + { \ + MUL_REG_NORMAL_ARRAY5(a, b, reg); \ + } + +#define GEMM_STORE_C_X(v, off, buf) \ + { \ + STORE_BUF_ARRAY5(v, off, buf); \ + } + +#define 
GEMM_NT_LOAD_B(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + READ_BUF(v[4], off + str * 4, buf); \ + } +#elif (LN == 6) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + V[3] = vload4(off + str * 3, buf); \ + V[4] = vload4(off + str * 4, buf); \ + V[5] = vload4(off + str * 5, buf); \ + } + +#define GEMM_SET_C_BIAS_X(v, reg) \ + { \ + SET_REG_ARRAY6(v, reg); \ + } + +#define GEMM_SET_C_EDGE_ZERO_X(reg, ex) \ + { \ + if (ex > 4) \ + reg[1] = 0; \ + if (ex > 3) \ + reg[2] = 0; \ + if (ex > 2) \ + reg[3] = 0; \ + if (ex > 1) \ + reg[4] = 0; \ + reg[5] = 0; \ + } + +#define ADD_ELTWISE_NCHW_X(v, off, buf) \ + { \ + ADD_BUF_ARRAY6(v, off, buf); \ + } + +#define GEMM_LOAD_B(v, off, buf) \ + { \ + LOAD_BUF_ARRAY6(v, off, buf); \ + } + +#define GEMM_CALCORE_X(a, b, c) \ + { \ + DOT_A_VEC_B6C6_ARRAY(a, b, c); \ + } + +#define GEMM_MUL_C_X(a, b, reg) \ + { \ + MUL_REG_NORMAL_ARRAY6(a, b, reg); \ + } + +#define GEMM_STORE_C_X(v, off, buf) \ + { \ + STORE_BUF_ARRAY6(v, off, buf); \ + } + +#define GEMM_NT_LOAD_B(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + READ_BUF(v[4], off + str * 4, buf); \ + READ_BUF(v[5], off + str * 5, buf); \ + } +#elif (LN == 7) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + V[3] = vload4(off + str * 3, buf); \ + V[4] = vload4(off + str * 4, buf); \ + V[5] = vload4(off + str * 5, buf); \ + V[6] = vload4(off + str * 6, buf); \ + } + +#define GEMM_SET_C_BIAS_X(v, reg) \ + { \ + SET_REG_ARRAY7(v, reg); \ + } + +#define GEMM_SET_C_EDGE_ZERO_X(reg, ex) \ + { \ + if (ex > 5) \ + reg[1] = 0; \ + if (ex > 4) \ + reg[2] = 0; \ + if (ex > 3) \ + reg[3] = 0; \ + if (ex > 2) \ + reg[4] = 0; \ + if (ex > 1) \ + reg[5] = 0; \ + reg[6] = 0; \ + } + +#define ADD_ELTWISE_NCHW_X(v, off, buf) \ + { \ + ADD_BUF_ARRAY7(v, off, buf); \ + } + +#define GEMM_LOAD_B(v, off, buf) \ + { \ + LOAD_BUF_ARRAY7(v, off, buf); \ + } + +#define GEMM_CALCORE_X(a, b, c) \ + { \ + DOT_A_VEC_B7C7_ARRAY(a, b, c); \ + } + +#define GEMM_MUL_C_X(a, b, reg) \ + { \ + MUL_REG_NORMAL_ARRAY7(a, b, reg); \ + } + +#define GEMM_STORE_C_X(v, off, buf) \ + { \ + STORE_BUF_ARRAY7(v, off, buf); \ + } + +#define GEMM_NT_LOAD_B(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + READ_BUF(v[4], off + str * 4, buf); \ + READ_BUF(v[5], off + str * 5, buf); \ + READ_BUF(v[6], off + str * 6, buf); \ + } +#elif (LN == 8) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + V[3] = vload4(off + str * 3, buf); \ + V[4] = vload4(off + str * 4, buf); \ + V[5] = vload4(off + str * 5, buf); \ + V[6] = vload4(off + str * 6, buf); \ + V[7] = vload4(off + str * 7, buf); \ + } + +#define GEMM_SET_C_BIAS_X(v, reg) \ + { \ + SET_REG_ARRAY8(v, reg); \ + } + +#define GEMM_SET_C_EDGE_ZERO_X(reg, ex) \ + { \ + if (ex > 6) \ + reg[1] = 0; \ + if (ex > 5) \ + reg[2] = 0; \ + if (ex > 4) \ + reg[3] = 0; \ + if (ex > 3) \ + reg[4] = 0; \ + if (ex > 2) \ 
+ reg[5] = 0; \ + if (ex > 1) \ + reg[6] = 0; \ + reg[7] = 0; \ + } + +#define ADD_ELTWISE_NCHW_X(v, off, buf) \ + { \ + ADD_BUF_ARRAY8(v, off, buf); \ + } + +#define GEMM_LOAD_B(v, off, buf) \ + { \ + LOAD_BUF_ARRAY8(v, off, buf); \ + } + +#define GEMM_CALCORE_X(a, b, c) \ + { \ + DOT_A_VEC_B8C8_ARRAY(a, b, c); \ + } + +#define GEMM_MUL_C_X(a, b, reg) \ + { \ + MUL_REG_NORMAL_ARRAY8(a, b, reg); \ + } + +#define GEMM_STORE_C_X(v, off, buf) \ + { \ + STORE_BUF_ARRAY8(v, off, buf); \ + } + +#define GEMM_NT_LOAD_B(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + READ_BUF(v[4], off + str * 4, buf); \ + READ_BUF(v[5], off + str * 5, buf); \ + READ_BUF(v[6], off + str * 6, buf); \ + READ_BUF(v[7], off + str * 7, buf); \ + } +#elif (LN == 9) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + V[3] = vload4(off + str * 3, buf); \ + V[4] = vload4(off + str * 4, buf); \ + V[5] = vload4(off + str * 5, buf); \ + V[6] = vload4(off + str * 6, buf); \ + V[7] = vload4(off + str * 7, buf); \ + V[8] = vload4(off + str * 8, buf); \ + } +#elif (LN == 10) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + V[3] = vload4(off + str * 3, buf); \ + V[4] = vload4(off + str * 4, buf); \ + V[5] = vload4(off + str * 5, buf); \ + V[6] = vload4(off + str * 6, buf); \ + V[7] = vload4(off + str * 7, buf); \ + V[8] = vload4(off + str * 8, buf); \ + V[9] = vload4(off + str * 9, buf); \ + } +#elif (LN == 11) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + V[3] = vload4(off + str * 3, buf); \ + V[4] = vload4(off + str * 4, buf); \ + V[5] = vload4(off + str * 5, buf); \ + V[6] = vload4(off + str * 6, buf); \ + V[7] = vload4(off + str * 7, buf); \ + V[8] = vload4(off + str * 8, buf); \ + V[9] = vload4(off + str * 9, buf); \ + V[10] = vload4(off + str * 10, buf); \ + } +#elif (LN == 12) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + V[3] = vload4(off + str * 3, buf); \ + V[4] = vload4(off + str * 4, buf); \ + V[5] = vload4(off + str * 5, buf); \ + V[6] = vload4(off + str * 6, buf); \ + V[7] = vload4(off + str * 7, buf); \ + V[8] = vload4(off + str * 8, buf); \ + V[9] = vload4(off + str * 9, buf); \ + V[10] = vload4(off + str * 10, buf); \ + V[11] = vload4(off + str * 11, buf); \ + } +#endif + +/* + * GEMM A x B = C + */ +#if (LM == 1) +#define GEMM_LOAD_A(v, off, buf) \ + { \ + LOAD_BUF_ARRAY1(v, off, buf); \ + } + +#define GEMM_SET_C_BIAS(v, reg) \ + { \ + GEMM_SET_C_BIAS_X(v[0], reg[0]); \ + } + +#define GEMM_SET_C_ZERO(reg) \ + { \ + GEMM_SET_C_BIAS_X(0, reg[0]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_H(reg, ey) \ + {} + +#define GEMM_SET_C_EDGE_ZERO_W(reg, ex) \ + { \ + GEMM_SET_C_EDGE_ZERO_X(reg[0], ex); \ + } + +#define GEMM_CALCORE(a, b, c) \ + { \ + GEMM_CALCORE_X(a[0], b, c[0]); \ + } + +#define GEMM_MUL_C(a, b, reg) \ + { \ + GEMM_MUL_C_X(a, b, reg[0]); \ + } + +#define GEMM_STORE_C(v, off, str, buf) \ + { \ + GEMM_STORE_C_X(v[0], off, buf); \ + } + +#define GEMM_NT_LOAD_A(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + } +#elif (LM == 2) 
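+// Each (LM == n) branch below unrolls the same GEMM_* helper family for an n-row C tile; apart from which *_ARRAYn primitives they call, the branches differ only in unroll depth.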
+#define GEMM_LOAD_A(v, off, buf) \ + { \ + LOAD_BUF_ARRAY2(v, off, buf); \ + } + +#define GEMM_SET_C_BIAS(v, reg) \ + { \ + GEMM_SET_C_BIAS_X(v[0], reg[0]); \ + GEMM_SET_C_BIAS_X(v[1], reg[1]); \ + } + +#define GEMM_SET_C_ZERO(reg) \ + { \ + GEMM_SET_C_BIAS_X(0, reg[0]); \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_H(reg, ey) \ + { \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_W(reg, ex) \ + { \ + GEMM_SET_C_EDGE_ZERO_X(reg[0], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[1], ex); \ + } + +#define GEMM_CALCORE(a, b, c) \ + { \ + GEMM_CALCORE_X(a[0], b, c[0]); \ + GEMM_CALCORE_X(a[1], b, c[1]); \ + } + +#define GEMM_MUL_C(a, b, reg) \ + { \ + GEMM_MUL_C_X(a, b, reg[0]); \ + GEMM_MUL_C_X(a, b, reg[1]); \ + } + +#define GEMM_STORE_C(v, off, str, buf) \ + { \ + GEMM_STORE_C_X(v[0], off, buf); \ + GEMM_STORE_C_X(v[1], off + str, buf); \ + } + +#define GEMM_NT_LOAD_A(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + } +#elif (LM == 3) +#define GEMM_LOAD_A(v, off, buf) \ + { \ + LOAD_BUF_ARRAY3(v, off, buf); \ + } + +#define GEMM_SET_C_BIAS(v, reg) \ + { \ + GEMM_SET_C_BIAS_X(v[0], reg[0]); \ + GEMM_SET_C_BIAS_X(v[1], reg[1]); \ + GEMM_SET_C_BIAS_X(v[2], reg[2]); \ + } + +#define GEMM_SET_C_ZERO(reg) \ + { \ + GEMM_SET_C_BIAS_X(0, reg[0]); \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_H(reg, ey) \ + { \ + if (ey > 1) \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_W(reg, ex) \ + { \ + GEMM_SET_C_EDGE_ZERO_X(reg[0], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[1], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[2], ex); \ + } + +#define GEMM_CALCORE(a, b, c) \ + { \ + GEMM_CALCORE_X(a[0], b, c[0]); \ + GEMM_CALCORE_X(a[1], b, c[1]); \ + GEMM_CALCORE_X(a[2], b, c[2]); \ + } + +#define GEMM_MUL_C(a, b, reg) \ + { \ + GEMM_MUL_C_X(a, b, reg[0]); \ + GEMM_MUL_C_X(a, b, reg[1]); \ + GEMM_MUL_C_X(a, b, reg[2]); \ + } + +#define GEMM_STORE_C(v, off, str, buf) \ + { \ + GEMM_STORE_C_X(v[0], off, buf); \ + GEMM_STORE_C_X(v[1], off + str, buf); \ + GEMM_STORE_C_X(v[2], off + str * 2, buf); \ + } + +#define GEMM_NT_LOAD_A(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + } +#elif (LM == 4) +#define GEMM_LOAD_A(v, off, buf) \ + { \ + LOAD_BUF_ARRAY4(v, off, buf); \ + } + +#define GEMM_SET_C_BIAS(v, reg) \ + { \ + GEMM_SET_C_BIAS_X(v[0], reg[0]); \ + GEMM_SET_C_BIAS_X(v[1], reg[1]); \ + GEMM_SET_C_BIAS_X(v[2], reg[2]); \ + GEMM_SET_C_BIAS_X(v[3], reg[3]); \ + } + +#define GEMM_SET_C_ZERO(reg) \ + { \ + GEMM_SET_C_BIAS_X(0, reg[0]); \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_H(reg, ey) \ + { \ + if (ey > 2) \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + if (ey > 1) \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_W(reg, ex) \ + { \ + GEMM_SET_C_EDGE_ZERO_X(reg[0], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[1], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[2], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[3], ex); \ + } + +#define ADD_ELTWISE_NCHW(v, off, str, buf) \ + { \ + ADD_ELTWISE_NCHW_X(v[0], off, buf); \ + ADD_ELTWISE_NCHW_X(v[1], off + str, buf); \ + ADD_ELTWISE_NCHW_X(v[2], off + str * 2, buf); \ +
ADD_ELTWISE_NCHW_X(v[3], off + str * 3, buf); \ + } + +#define GEMM_CALCORE(a, b, c) \ + { \ + GEMM_CALCORE_X(a[0], b, c[0]); \ + GEMM_CALCORE_X(a[1], b, c[1]); \ + GEMM_CALCORE_X(a[2], b, c[2]); \ + GEMM_CALCORE_X(a[3], b, c[3]); \ + } + +#define GEMM_MUL_C(a, b, reg) \ + { \ + GEMM_MUL_C_X(a, b, reg[0]); \ + GEMM_MUL_C_X(a, b, reg[1]); \ + GEMM_MUL_C_X(a, b, reg[2]); \ + GEMM_MUL_C_X(a, b, reg[3]); \ + } + +#define GEMM_STORE_C(v, off, str, buf) \ + { \ + GEMM_STORE_C_X(v[0], off, buf); \ + GEMM_STORE_C_X(v[1], off + str, buf); \ + GEMM_STORE_C_X(v[2], off + str * 2, buf); \ + GEMM_STORE_C_X(v[3], off + str * 3, buf); \ + } + +#define GEMM_NT_LOAD_A(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + } +#elif (LM == 5) +#define GEMM_LOAD_A(v, off, buf) \ + { \ + LOAD_BUF_ARRAY5(v, off, buf); \ + } + +#define GEMM_SET_C_BIAS(v, reg) \ + { \ + GEMM_SET_C_BIAS_X(v[0], reg[0]); \ + GEMM_SET_C_BIAS_X(v[1], reg[1]); \ + GEMM_SET_C_BIAS_X(v[2], reg[2]); \ + GEMM_SET_C_BIAS_X(v[3], reg[3]); \ + GEMM_SET_C_BIAS_X(v[4], reg[4]); \ + } + +#define GEMM_SET_C_ZERO(reg) \ + { \ + GEMM_SET_C_BIAS_X(0, reg[0]); \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + GEMM_SET_C_BIAS_X(0, reg[4]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_H(reg, ey) \ + { \ + if (ey > 3) \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + if (ey > 2) \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + if (ey > 1) \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + GEMM_SET_C_BIAS_X(0, reg[4]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_W(reg, ex) \ + { \ + GEMM_SET_C_EDGE_ZERO_X(reg[0], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[1], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[2], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[3], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[4], ex); \ + } + +#define GEMM_CALCORE(a, b, c) \ + { \ + GEMM_CALCORE_X(a[0], b, c[0]); \ + GEMM_CALCORE_X(a[1], b, c[1]); \ + GEMM_CALCORE_X(a[2], b, c[2]); \ + GEMM_CALCORE_X(a[3], b, c[3]); \ + GEMM_CALCORE_X(a[4], b, c[4]); \ + } + +#define GEMM_MUL_C(a, b, reg) \ + { \ + GEMM_MUL_C_X(a, b, reg[0]); \ + GEMM_MUL_C_X(a, b, reg[1]); \ + GEMM_MUL_C_X(a, b, reg[2]); \ + GEMM_MUL_C_X(a, b, reg[3]); \ + GEMM_MUL_C_X(a, b, reg[4]); \ + } + +#define GEMM_STORE_C(v, off, str, buf) \ + { \ + GEMM_STORE_C_X(v[0], off, buf); \ + GEMM_STORE_C_X(v[1], off + str, buf); \ + GEMM_STORE_C_X(v[2], off + str * 2, buf); \ + GEMM_STORE_C_X(v[3], off + str * 3, buf); \ + GEMM_STORE_C_X(v[4], off + str * 4, buf); \ + } + +#define GEMM_NT_LOAD_A(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + READ_BUF(v[4], off + str * 4, buf); \ + } +#elif (LM == 6) +#define GEMM_LOAD_A(v, off, buf) \ + { \ + LOAD_BUF_ARRAY6(v, off, buf); \ + } + +#define GEMM_SET_C_BIAS(v, reg) \ + { \ + GEMM_SET_C_BIAS_X(v[0], reg[0]); \ + GEMM_SET_C_BIAS_X(v[1], reg[1]); \ + GEMM_SET_C_BIAS_X(v[2], reg[2]); \ + GEMM_SET_C_BIAS_X(v[3], reg[3]); \ + GEMM_SET_C_BIAS_X(v[4], reg[4]); \ + GEMM_SET_C_BIAS_X(v[5], reg[5]); \ + } + +#define GEMM_SET_C_ZERO(reg) \ + { \ + GEMM_SET_C_BIAS_X(0, reg[0]); \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + GEMM_SET_C_BIAS_X(0, reg[4]); \ + GEMM_SET_C_BIAS_X(0, reg[5]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_H(reg, ey) \ + { \ + if (ey > 4) \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + if (ey > 3) \ + 
GEMM_SET_C_BIAS_X(0, reg[2]); \ + if (ey > 2) \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + if (ey > 1) \ + GEMM_SET_C_BIAS_X(0, reg[4]); \ + GEMM_SET_C_BIAS_X(0, reg[5]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_W(reg, ex) \ + { \ + GEMM_SET_C_EDGE_ZERO_X(reg[0], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[1], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[2], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[3], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[4], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[5], ex); \ + } + +#define GEMM_CALCORE(a, b, c) \ + { \ + GEMM_CALCORE_X(a[0], b, c[0]); \ + GEMM_CALCORE_X(a[1], b, c[1]); \ + GEMM_CALCORE_X(a[2], b, c[2]); \ + GEMM_CALCORE_X(a[3], b, c[3]); \ + GEMM_CALCORE_X(a[4], b, c[4]); \ + GEMM_CALCORE_X(a[5], b, c[5]); \ + } + +#define GEMM_MUL_C(a, b, reg) \ + { \ + GEMM_MUL_C_X(a, b, reg[0]); \ + GEMM_MUL_C_X(a, b, reg[1]); \ + GEMM_MUL_C_X(a, b, reg[2]); \ + GEMM_MUL_C_X(a, b, reg[3]); \ + GEMM_MUL_C_X(a, b, reg[4]); \ + GEMM_MUL_C_X(a, b, reg[5]); \ + } + +#define GEMM_STORE_C(v, off, str, buf) \ + { \ + GEMM_STORE_C_X(v[0], off, buf); \ + GEMM_STORE_C_X(v[1], off + str, buf); \ + GEMM_STORE_C_X(v[2], off + str * 2, buf); \ + GEMM_STORE_C_X(v[3], off + str * 3, buf); \ + GEMM_STORE_C_X(v[4], off + str * 4, buf); \ + GEMM_STORE_C_X(v[5], off + str * 5, buf); \ + } + +#define GEMM_NT_LOAD_A(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + READ_BUF(v[4], off + str * 4, buf); \ + READ_BUF(v[5], off + str * 5, buf); \ + } +#elif (LM == 7) +#define GEMM_LOAD_A(v, off, buf) \ + { \ + LOAD_BUF_ARRAY7(v, off, buf); \ + } + +#define GEMM_SET_C_BIAS(v, reg) \ + { \ + GEMM_SET_C_BIAS_X(v[0], reg[0]); \ + GEMM_SET_C_BIAS_X(v[1], reg[1]); \ + GEMM_SET_C_BIAS_X(v[2], reg[2]); \ + GEMM_SET_C_BIAS_X(v[3], reg[3]); \ + GEMM_SET_C_BIAS_X(v[4], reg[4]); \ + GEMM_SET_C_BIAS_X(v[5], reg[5]); \ + GEMM_SET_C_BIAS_X(v[6], reg[6]); \ + } + +#define GEMM_SET_C_ZERO(reg) \ + { \ + GEMM_SET_C_BIAS_X(0, reg[0]); \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + GEMM_SET_C_BIAS_X(0, reg[4]); \ + GEMM_SET_C_BIAS_X(0, reg[5]); \ + GEMM_SET_C_BIAS_X(0, reg[6]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_H(reg, ey) \ + { \ + if (ey > 5) \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + if (ey > 4) \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + if (ey > 3) \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + if (ey > 2) \ + GEMM_SET_C_BIAS_X(0, reg[4]); \ + if (ey > 1) \ + GEMM_SET_C_BIAS_X(0, reg[5]); \ + GEMM_SET_C_BIAS_X(0, reg[6]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_W(reg, ex) \ + { \ + GEMM_SET_C_EDGE_ZERO_X(reg[0], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[1], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[2], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[3], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[4], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[5], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[6], ex); \ + } + +#define GEMM_CALCORE(a, b, c) \ + { \ + GEMM_CALCORE_X(a[0], b, c[0]); \ + GEMM_CALCORE_X(a[1], b, c[1]); \ + GEMM_CALCORE_X(a[2], b, c[2]); \ + GEMM_CALCORE_X(a[3], b, c[3]); \ + GEMM_CALCORE_X(a[4], b, c[4]); \ + GEMM_CALCORE_X(a[5], b, c[5]); \ + GEMM_CALCORE_X(a[6], b, c[6]); \ + } + +#define GEMM_MUL_C(a, b, reg) \ + { \ + GEMM_MUL_C_X(a, b, reg[0]); \ + GEMM_MUL_C_X(a, b, reg[1]); \ + GEMM_MUL_C_X(a, b, reg[2]); \ + GEMM_MUL_C_X(a, b, reg[3]); \ + GEMM_MUL_C_X(a, b, reg[4]); \ + GEMM_MUL_C_X(a, b, reg[5]); \ + GEMM_MUL_C_X(a, b, reg[6]); \ + } + +#define GEMM_STORE_C(v, off, str, buf) \ + { \ + GEMM_STORE_C_X(v[0], off, 
buf); \ + GEMM_STORE_C_X(v[1], off + str, buf); \ + GEMM_STORE_C_X(v[2], off + str * 2, buf); \ + GEMM_STORE_C_X(v[3], off + str * 3, buf); \ + GEMM_STORE_C_X(v[4], off + str * 4, buf); \ + GEMM_STORE_C_X(v[5], off + str * 5, buf); \ + GEMM_STORE_C_X(v[6], off + str * 6, buf); \ + } + +#define GEMM_NT_LOAD_A(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + READ_BUF(v[4], off + str * 4, buf); \ + READ_BUF(v[5], off + str * 5, buf); \ + READ_BUF(v[6], off + str * 6, buf); \ + } +#elif (LM == 8) +#define GEMM_LOAD_A(v, off, buf) \ + { \ + LOAD_BUF_ARRAY8(v, off, buf); \ + } + +#define GEMM_SET_C_BIAS(v, reg) \ + { \ + GEMM_SET_C_BIAS_X(v[0], reg[0]); \ + GEMM_SET_C_BIAS_X(v[1], reg[1]); \ + GEMM_SET_C_BIAS_X(v[2], reg[2]); \ + GEMM_SET_C_BIAS_X(v[3], reg[3]); \ + GEMM_SET_C_BIAS_X(v[4], reg[4]); \ + GEMM_SET_C_BIAS_X(v[5], reg[5]); \ + GEMM_SET_C_BIAS_X(v[6], reg[6]); \ + GEMM_SET_C_BIAS_X(v[7], reg[7]); \ + } + +#define GEMM_SET_C_ZERO(reg) \ + { \ + GEMM_SET_C_BIAS_X(0, reg[0]); \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + GEMM_SET_C_BIAS_X(0, reg[4]); \ + GEMM_SET_C_BIAS_X(0, reg[5]); \ + GEMM_SET_C_BIAS_X(0, reg[6]); \ + GEMM_SET_C_BIAS_X(0, reg[7]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_H(reg, ey) \ + { \ + if (ey > 6) \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + if (ey > 5) \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + if (ey > 4) \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + if (ey > 3) \ + GEMM_SET_C_BIAS_X(0, reg[4]); \ + if (ey > 2) \ + GEMM_SET_C_BIAS_X(0, reg[5]); \ + if (ey > 1) \ + GEMM_SET_C_BIAS_X(0, reg[6]); \ + GEMM_SET_C_BIAS_X(0, reg[7]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_W(reg, ex) \ + { \ + GEMM_SET_C_EDGE_ZERO_X(reg[0], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[1], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[2], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[3], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[4], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[5], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[6], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[7], ex); \ + } + +#define ADD_ELTWISE_NCHW(v, off, str, buf) \ + { \ + ADD_ELTWISE_NCHW_X(v[0], off, buf); \ + ADD_ELTWISE_NCHW_X(v[1], off + str, buf); \ + ADD_ELTWISE_NCHW_X(v[2], off + str * 2, buf); \ + ADD_ELTWISE_NCHW_X(v[3], off + str * 3, buf); \ + ADD_ELTWISE_NCHW_X(v[4], off + str * 4, buf); \ + ADD_ELTWISE_NCHW_X(v[5], off + str * 5, buf); \ + ADD_ELTWISE_NCHW_X(v[6], off + str * 6, buf); \ + ADD_ELTWISE_NCHW_X(v[7], off + str * 7, buf); \ + } + +#define GEMM_CALCORE(a, b, c) \ + { \ + GEMM_CALCORE_X(a[0], b, c[0]); \ + GEMM_CALCORE_X(a[1], b, c[1]); \ + GEMM_CALCORE_X(a[2], b, c[2]); \ + GEMM_CALCORE_X(a[3], b, c[3]); \ + GEMM_CALCORE_X(a[4], b, c[4]); \ + GEMM_CALCORE_X(a[5], b, c[5]); \ + GEMM_CALCORE_X(a[6], b, c[6]); \ + GEMM_CALCORE_X(a[7], b, c[7]); \ + } + +#define GEMM_MUL_C(a, b, reg) \ + { \ + GEMM_MUL_C_X(a, b, reg[0]); \ + GEMM_MUL_C_X(a, b, reg[1]); \ + GEMM_MUL_C_X(a, b, reg[2]); \ + GEMM_MUL_C_X(a, b, reg[3]); \ + GEMM_MUL_C_X(a, b, reg[4]); \ + GEMM_MUL_C_X(a, b, reg[5]); \ + GEMM_MUL_C_X(a, b, reg[6]); \ + GEMM_MUL_C_X(a, b, reg[7]); \ + } + +#define GEMM_STORE_C(v, off, str, buf) \ + { \ + GEMM_STORE_C_X(v[0], off, buf); \ + GEMM_STORE_C_X(v[1], off + str, buf); \ + GEMM_STORE_C_X(v[2], off + str * 2, buf); \ + GEMM_STORE_C_X(v[3], off + str * 3, buf); \ + GEMM_STORE_C_X(v[4], off + str * 4, buf); \ + GEMM_STORE_C_X(v[5], off + str * 5, buf); \ + GEMM_STORE_C_X(v[6], off + str * 6, buf); \ 
+ GEMM_STORE_C_X(v[7], off + str * 7, buf); \ + } + +#define GEMM_NT_LOAD_A(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + READ_BUF(v[4], off + str * 4, buf); \ + READ_BUF(v[5], off + str * 5, buf); \ + READ_BUF(v[6], off + str * 6, buf); \ + READ_BUF(v[7], off + str * 7, buf); \ + } +#endif + +/* + * UPDATE VALUE OF REG + */ +#if (UN == 0) +#define UPDATE_REG(A) \ + {} +#elif (UN == 1) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + } +#elif (UN == 2) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + } +#elif (UN == 3) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + } +#elif (UN == 4) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + A[3] = A[4]; \ + } +#elif (UN == 5) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + A[3] = A[4]; \ + A[4] = A[5]; \ + } +#elif (UN == 6) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + A[3] = A[4]; \ + A[4] = A[5]; \ + A[5] = A[6]; \ + } +#elif (UN == 7) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + A[3] = A[4]; \ + A[4] = A[5]; \ + A[5] = A[6]; \ + A[6] = A[7]; \ + } +#elif (UN == 8) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + A[3] = A[4]; \ + A[4] = A[5]; \ + A[5] = A[6]; \ + A[6] = A[7]; \ + A[7] = A[8]; \ + } +#elif (UN == 9) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + A[3] = A[4]; \ + A[4] = A[5]; \ + A[5] = A[6]; \ + A[6] = A[7]; \ + A[7] = A[8]; \ + A[8] = A[9]; \ + } +#elif (UN == 10) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + A[3] = A[4]; \ + A[4] = A[5]; \ + A[5] = A[6]; \ + A[6] = A[7]; \ + A[7] = A[8]; \ + A[8] = A[9]; \ + A[9] = A[10]; \ + } +#elif (UN == 11) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + A[3] = A[4]; \ + A[4] = A[5]; \ + A[5] = A[6]; \ + A[6] = A[7]; \ + A[7] = A[8]; \ + A[8] = A[9]; \ + A[9] = A[10]; \ + A[10] = A[11]; \ + } +#elif (UN == 12) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + A[3] = A[4]; \ + A[4] = A[5]; \ + A[5] = A[6]; \ + A[6] = A[7]; \ + A[7] = A[8]; \ + A[8] = A[9]; \ + A[9] = A[10]; \ + A[10] = A[11]; \ + A[11] = A[12]; \ + } +#endif + +/* + * Direct convolution calculate core + * Depthwise calculate core + */ +#if (ON == 1) +#define DIRECT_CONV_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B16C4(A[0], B, C[0]); \ + } +#define DIRECT_CONV_CAL_CORE_S2(A, B, C) \ + { \ + DOT_A4B16C4(A[0], B, C[0]); \ + } +#define DEPTHWISE_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B4C4(A[0], B, C[0]); \ + } +#define DEPTHWISE_CAL_CORE_S2(A, B, C) \ + { \ + DOT_A4B4C4(A[0], B, C[0]); \ + } +#elif (ON == 2) +#define DIRECT_CONV_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B16C4(A[0], B, C[0]); \ + DOT_A4B16C4(A[1], B, C[1]); \ + } +#define DIRECT_CONV_CAL_CORE_S2(A, B, C) \ + { \ + DOT_A4B16C4(A[0], B, C[0]); \ + DOT_A4B16C4(A[2], B, C[1]); \ + } +#define DEPTHWISE_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B4C4(A[0], B, C[0]); \ + DOT_A4B4C4(A[1], B, C[1]); \ + } +#define DEPTHWISE_CAL_CORE_S2(A, B, C) \ + { \ + DOT_A4B4C4(A[0], B, C[0]); \ + DOT_A4B4C4(A[2], B, C[1]); \ + } +#elif (ON == 3) +#define DIRECT_CONV_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B16C4(A[0], B, C[0]); \ + DOT_A4B16C4(A[1], B, C[1]); \ + DOT_A4B16C4(A[2], B, C[2]); \ + } 
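+// The S1/S2 suffixes encode the convolution stride: S1 consumes consecutive input registers A[0..ON-1], while S2 consumes every other register (A[0], A[2], ...) so each of the ON outputs advances two input columns; both accumulate into C[0..ON-1].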
+#define DIRECT_CONV_CAL_CORE_S2(A, B, C) \ + { \ + DOT_A4B16C4(A[0], B, C[0]); \ + DOT_A4B16C4(A[2], B, C[1]); \ + DOT_A4B16C4(A[4], B, C[2]); \ + } +#define DEPTHWISE_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B4C4(A[0], B, C[0]); \ + DOT_A4B4C4(A[1], B, C[1]); \ + DOT_A4B4C4(A[2], B, C[2]); \ + } +#define DEPTHWISE_CAL_CORE_S2(A, B, C) \ + { \ + DOT_A4B4C4(A[0], B, C[0]); \ + DOT_A4B4C4(A[2], B, C[1]); \ + DOT_A4B4C4(A[4], B, C[2]); \ + } +#elif (ON == 4) +#define DIRECT_CONV_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B16C4(A[0], B, C[0]); \ + DOT_A4B16C4(A[1], B, C[1]); \ + DOT_A4B16C4(A[2], B, C[2]); \ + DOT_A4B16C4(A[3], B, C[3]); \ + } +#define DIRECT_CONV_CAL_CORE_S2(A, B, C) \ + { \ + DOT_A4B16C4(A[0], B, C[0]); \ + DOT_A4B16C4(A[2], B, C[1]); \ + DOT_A4B16C4(A[4], B, C[2]); \ + DOT_A4B16C4(A[6], B, C[3]); \ + } +#define DEPTHWISE_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B4C4(A[0], B, C[0]); \ + DOT_A4B4C4(A[1], B, C[1]); \ + DOT_A4B4C4(A[2], B, C[2]); \ + DOT_A4B4C4(A[3], B, C[3]); \ + } +#define DEPTHWISE_CAL_CORE_S2(A, B, C) \ + { \ + DOT_A4B4C4(A[0], B, C[0]); \ + DOT_A4B4C4(A[2], B, C[1]); \ + DOT_A4B4C4(A[4], B, C[2]); \ + DOT_A4B4C4(A[6], B, C[3]); \ + } +#elif (ON == 5) +#define DIRECT_CONV_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B16C4(A[0], B, C[0]); \ + DOT_A4B16C4(A[1], B, C[1]); \ + DOT_A4B16C4(A[2], B, C[2]); \ + DOT_A4B16C4(A[3], B, C[3]); \ + DOT_A4B16C4(A[4], B, C[4]); \ + } +#define DEPTHWISE_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B4C4(A[0], B, C[0]); \ + DOT_A4B4C4(A[1], B, C[1]); \ + DOT_A4B4C4(A[2], B, C[2]); \ + DOT_A4B4C4(A[3], B, C[3]); \ + DOT_A4B4C4(A[4], B, C[4]); \ + } +#elif (ON == 6) +#define DIRECT_CONV_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B16C4(A[0], B, C[0]); \ + DOT_A4B16C4(A[1], B, C[1]); \ + DOT_A4B16C4(A[2], B, C[2]); \ + DOT_A4B16C4(A[3], B, C[3]); \ + DOT_A4B16C4(A[4], B, C[4]); \ + DOT_A4B16C4(A[5], B, C[5]); \ + } +#define DEPTHWISE_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B4C4(A[0], B, C[0]); \ + DOT_A4B4C4(A[1], B, C[1]); \ + DOT_A4B4C4(A[2], B, C[2]); \ + DOT_A4B4C4(A[3], B, C[3]); \ + DOT_A4B4C4(A[4], B, C[4]); \ + DOT_A4B4C4(A[5], B, C[5]); \ + } +#elif (ON == 7) +#define DIRECT_CONV_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B16C4(A[0], B, C[0]); \ + DOT_A4B16C4(A[1], B, C[1]); \ + DOT_A4B16C4(A[2], B, C[2]); \ + DOT_A4B16C4(A[3], B, C[3]); \ + DOT_A4B16C4(A[4], B, C[4]); \ + DOT_A4B16C4(A[5], B, C[5]); \ + DOT_A4B16C4(A[6], B, C[6]); \ + } +#define DEPTHWISE_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B4C4(A[0], B, C[0]); \ + DOT_A4B4C4(A[1], B, C[1]); \ + DOT_A4B4C4(A[2], B, C[2]); \ + DOT_A4B4C4(A[3], B, C[3]); \ + DOT_A4B4C4(A[4], B, C[4]); \ + DOT_A4B4C4(A[5], B, C[5]); \ + DOT_A4B4C4(A[6], B, C[6]); \ + } +#elif (ON == 8) +#define DIRECT_CONV_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B16C4(A[0], B, C[0]); \ + DOT_A4B16C4(A[1], B, C[1]); \ + DOT_A4B16C4(A[2], B, C[2]); \ + DOT_A4B16C4(A[3], B, C[3]); \ + DOT_A4B16C4(A[4], B, C[4]); \ + DOT_A4B16C4(A[5], B, C[5]); \ + DOT_A4B16C4(A[6], B, C[6]); \ + DOT_A4B16C4(A[7], B, C[7]); \ + } +#define DEPTHWISE_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B4C4(A[0], B, C[0]); \ + DOT_A4B4C4(A[1], B, C[1]); \ + DOT_A4B4C4(A[2], B, C[2]); \ + DOT_A4B4C4(A[3], B, C[3]); \ + DOT_A4B4C4(A[4], B, C[4]); \ + DOT_A4B4C4(A[5], B, C[5]); \ + DOT_A4B4C4(A[6], B, C[6]); \ + DOT_A4B4C4(A[7], B, C[7]); \ + } +#endif + +/* + * STORE_OUTPUT_BUF_ARRAY_V4 WITH ACTIVATION + */ +#if (ON == 1) +#define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + vstore4(V[0], off, buf); \ + } + +#define ADD_ELTWISE_BUF_ARRAY_V4(V, off,
str, buf) \ + { \ + V[0] += vload4(off, buf); \ + } + +#define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + buf[off] = V[0].x; \ + buf[off + str_hw] = V[0].y; \ + buf[off + str_hw * 2] = V[0].z; \ + buf[off + str_hw * 3] = V[0].w; \ + } + +#define STORE_OUTPUT_BUF_ARRAY_ALIGN(val, off, str, out) \ + { \ + out[off] = val[0]; \ + } + +#define SET_REG_ARRAY(v, reg) \ + { \ + SET_REG_ARRAY1(v, reg); \ + } +#elif (ON == 2) +#define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + vstore4(V[0], off, buf); \ + if (id + 1 < bd) \ + vstore4(V[1], off + str, buf); \ + } + +#define ADD_ELTWISE_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] += vload4(off, buf); \ + V[1] += vload4(off + str, buf); \ + } + +#define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + buf[off] = V[0].x; \ + buf[off + str_hw] = V[0].y; \ + buf[off + str_hw * 2] = V[0].z; \ + buf[off + str_hw * 3] = V[0].w; \ + if (id + 1 < bd) { \ + buf[off + str_h] = V[1].x; \ + buf[off + str_h + str_hw] = V[1].y; \ + buf[off + str_h + str_hw * 2] = V[1].z; \ + buf[off + str_h + str_hw * 3] = V[1].w; \ + } \ + } + +#define STORE_OUTPUT_BUF_ARRAY_ALIGN(val, off, str, out) \ + { \ + out[off] = val[0]; \ + out[off + str] = val[1]; \ + } + +#define SET_REG_ARRAY(v, reg) \ + { \ + SET_REG_ARRAY2(v, reg); \ + } +#elif (ON == 3) +#define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + vstore4(V[0], off, buf); \ + if (id + 1 < bd) \ + vstore4(V[1], off + str, buf); \ + if (id + 2 < bd) \ + vstore4(V[2], off + str * 2, buf); \ + } + +#define ADD_ELTWISE_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] += vload4(off, buf); \ + V[1] += vload4(off + str, buf); \ + V[2] += vload4(off + str * 2, buf); \ + } + +#define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + buf[off] = V[0].x; \ + buf[off + str_hw] = V[0].y; \ + buf[off + str_hw * 2] = V[0].z; \ + buf[off + str_hw * 3] = V[0].w; \ + if (id + 1 < bd) { \ + buf[off + str_h] = V[1].x; \ + buf[off + str_h + str_hw] = V[1].y; \ + buf[off + str_h + str_hw * 2] = V[1].z; \ + buf[off + str_h + str_hw * 3] = V[1].w; \ + } \ + if (id + 2 < bd) { \ + buf[off + str_h * 2] = V[2].x; \ + buf[off + str_h * 2 + str_hw] = V[2].y; \ + buf[off + str_h * 2 + str_hw * 2] = V[2].z; \ + buf[off + str_h * 2 + str_hw * 3] = V[2].w; \ + } \ + } + +#define STORE_OUTPUT_BUF_ARRAY_ALIGN(val, off, str, out) \ + { \ + out[off] = val[0]; \ + out[off + str] = val[1]; \ + out[off + str * 2] = val[2]; \ + } + +#define SET_REG_ARRAY(v, reg) \ + { \ + SET_REG_ARRAY3(v, reg); \ + } +#elif (ON == 4) +#define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + vstore4(V[0], off, buf); \ + if (id + 1 < bd) \ + vstore4(V[1], off + str, buf); \ + if (id + 2 < bd) \ + vstore4(V[2], off + str * 2, buf); \ + if (id + 3 < bd) \ + vstore4(V[3], off + str * 3, buf); \ + } + +#define ADD_ELTWISE_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] += vload4(off, buf); \ + V[1] += vload4(off + str, buf); \ + V[2] += vload4(off + str * 2, buf); \ + V[3] += vload4(off + str * 3, buf); \ + } + +#define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) \ 
+ { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + buf[off] = V[0].x; \ + buf[off + str_hw] = V[0].y; \ + buf[off + str_hw * 2] = V[0].z; \ + buf[off + str_hw * 3] = V[0].w; \ + if (id + 1 < bd) { \ + buf[off + str_h] = V[1].x; \ + buf[off + str_h + str_hw] = V[1].y; \ + buf[off + str_h + str_hw * 2] = V[1].z; \ + buf[off + str_h + str_hw * 3] = V[1].w; \ + } \ + if (id + 2 < bd) { \ + buf[off + str_h * 2] = V[2].x; \ + buf[off + str_h * 2 + str_hw] = V[2].y; \ + buf[off + str_h * 2 + str_hw * 2] = V[2].z; \ + buf[off + str_h * 2 + str_hw * 3] = V[2].w; \ + } \ + if (id + 3 < bd) { \ + buf[off + str_h * 3] = V[3].x; \ + buf[off + str_h * 3 + str_hw] = V[3].y; \ + buf[off + str_h * 3 + str_hw * 2] = V[3].z; \ + buf[off + str_h * 3 + str_hw * 3] = V[3].w; \ + } \ + } + +#define STORE_OUTPUT_BUF_ARRAY_ALIGN(val, off, str, out) \ + { \ + out[off] = val[0]; \ + out[off + str] = val[1]; \ + out[off + str * 2] = val[2]; \ + out[off + str * 3] = val[3]; \ + } + +#define SET_REG_ARRAY(v, reg) \ + { \ + SET_REG_ARRAY4(v, reg); \ + } +#elif (ON == 5) +#define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + ACTIVATION_V4(V[4]); \ + vstore4(V[0], off, buf); \ + if (id + 1 < bd) \ + vstore4(V[1], off + str, buf); \ + if (id + 2 < bd) \ + vstore4(V[2], off + str * 2, buf); \ + if (id + 3 < bd) \ + vstore4(V[3], off + str * 3, buf); \ + if (id + 4 < bd) \ + vstore4(V[4], off + str * 4, buf); \ + } + +#define ADD_ELTWISE_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] += vload4(off, buf); \ + V[1] += vload4(off + str, buf); \ + V[2] += vload4(off + str * 2, buf); \ + V[3] += vload4(off + str * 3, buf); \ + V[4] += vload4(off + str * 4, buf); \ + } + +#define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + ACTIVATION_V4(V[4]); \ + buf[off] = V[0].x; \ + buf[off + str_hw] = V[0].y; \ + buf[off + str_hw * 2] = V[0].z; \ + buf[off + str_hw * 3] = V[0].w; \ + if (id + 1 < bd) { \ + buf[off + str_h] = V[1].x; \ + buf[off + str_h + str_hw] = V[1].y; \ + buf[off + str_h + str_hw * 2] = V[1].z; \ + buf[off + str_h + str_hw * 3] = V[1].w; \ + } \ + if (id + 2 < bd) { \ + buf[off + str_h * 2] = V[2].x; \ + buf[off + str_h * 2 + str_hw] = V[2].y; \ + buf[off + str_h * 2 + str_hw * 2] = V[2].z; \ + buf[off + str_h * 2 + str_hw * 3] = V[2].w; \ + } \ + if (id + 3 < bd) { \ + buf[off + str_h * 3] = V[3].x; \ + buf[off + str_h * 3 + str_hw] = V[3].y; \ + buf[off + str_h * 3 + str_hw * 2] = V[3].z; \ + buf[off + str_h * 3 + str_hw * 3] = V[3].w; \ + } \ + if (id + 4 < bd) { \ + buf[off + str_h * 4] = V[4].x; \ + buf[off + str_h * 4 + str_hw] = V[4].y; \ + buf[off + str_h * 4 + str_hw * 2] = V[4].z; \ + buf[off + str_h * 4 + str_hw * 3] = V[4].w; \ + } \ + } + +#define SET_REG_ARRAY(v, reg) \ + { \ + SET_REG_ARRAY5(v, reg); \ + } +#elif (ON == 6) +#define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + ACTIVATION_V4(V[4]); \ + ACTIVATION_V4(V[5]); \ + vstore4(V[0], off, buf); \ + if (id + 1 < bd) \ + vstore4(V[1], off + str, buf); \ + if (id + 2 < bd) \ + vstore4(V[2], off + str * 2, buf); \ + if (id + 3 < bd) \ + vstore4(V[3], off + str * 3, buf); \ + if (id + 4 < bd) \ + vstore4(V[4], off + str * 4, buf); \ + if (id + 
5 < bd) \ + vstore4(V[5], off + str * 5, buf); \ + } + +#define ADD_ELTWISE_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] += vload4(off, buf); \ + V[1] += vload4(off + str, buf); \ + V[2] += vload4(off + str * 2, buf); \ + V[3] += vload4(off + str * 3, buf); \ + V[4] += vload4(off + str * 4, buf); \ + V[5] += vload4(off + str * 5, buf); \ + } + +#define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + ACTIVATION_V4(V[4]); \ + ACTIVATION_V4(V[5]); \ + buf[off] = V[0].x; \ + buf[off + str_hw] = V[0].y; \ + buf[off + str_hw * 2] = V[0].z; \ + buf[off + str_hw * 3] = V[0].w; \ + if (id + 1 < bd) { \ + buf[off + str_h] = V[1].x; \ + buf[off + str_h + str_hw] = V[1].y; \ + buf[off + str_h + str_hw * 2] = V[1].z; \ + buf[off + str_h + str_hw * 3] = V[1].w; \ + } \ + if (id + 2 < bd) { \ + buf[off + str_h * 2] = V[2].x; \ + buf[off + str_h * 2 + str_hw] = V[2].y; \ + buf[off + str_h * 2 + str_hw * 2] = V[2].z; \ + buf[off + str_h * 2 + str_hw * 3] = V[2].w; \ + } \ + if (id + 3 < bd) { \ + buf[off + str_h * 3] = V[3].x; \ + buf[off + str_h * 3 + str_hw] = V[3].y; \ + buf[off + str_h * 3 + str_hw * 2] = V[3].z; \ + buf[off + str_h * 3 + str_hw * 3] = V[3].w; \ + } \ + if (id + 4 < bd) { \ + buf[off + str_h * 4] = V[4].x; \ + buf[off + str_h * 4 + str_hw] = V[4].y; \ + buf[off + str_h * 4 + str_hw * 2] = V[4].z; \ + buf[off + str_h * 4 + str_hw * 3] = V[4].w; \ + } \ + if (id + 5 < bd) { \ + buf[off + str_h * 5] = V[5].x; \ + buf[off + str_h * 5 + str_hw] = V[5].y; \ + buf[off + str_h * 5 + str_hw * 2] = V[5].z; \ + buf[off + str_h * 5 + str_hw * 3] = V[5].w; \ + } \ + } + +#define SET_REG_ARRAY(v, reg) \ + { \ + SET_REG_ARRAY6(v, reg); \ + } +#elif (ON == 7) +#define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + ACTIVATION_V4(V[4]); \ + ACTIVATION_V4(V[5]); \ + ACTIVATION_V4(V[6]); \ + vstore4(V[0], off, buf); \ + if (id + 1 < bd) \ + vstore4(V[1], off + str, buf); \ + if (id + 2 < bd) \ + vstore4(V[2], off + str * 2, buf); \ + if (id + 3 < bd) \ + vstore4(V[3], off + str * 3, buf); \ + if (id + 4 < bd) \ + vstore4(V[4], off + str * 4, buf); \ + if (id + 5 < bd) \ + vstore4(V[5], off + str * 5, buf); \ + if (id + 6 < bd) \ + vstore4(V[6], off + str * 6, buf); \ + } + +#define ADD_ELTWISE_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] += vload4(off, buf); \ + V[1] += vload4(off + str, buf); \ + V[2] += vload4(off + str * 2, buf); \ + V[3] += vload4(off + str * 3, buf); \ + V[4] += vload4(off + str * 4, buf); \ + V[5] += vload4(off + str * 5, buf); \ + V[6] += vload4(off + str * 6, buf); \ + } + +#define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + ACTIVATION_V4(V[4]); \ + ACTIVATION_V4(V[5]); \ + ACTIVATION_V4(V[6]); \ + buf[off] = V[0].x; \ + buf[off + str_hw] = V[0].y; \ + buf[off + str_hw * 2] = V[0].z; \ + buf[off + str_hw * 3] = V[0].w; \ + if (id + 1 < bd) { \ + buf[off + str_h] = V[1].x; \ + buf[off + str_h + str_hw] = V[1].y; \ + buf[off + str_h + str_hw * 2] = V[1].z; \ + buf[off + str_h + str_hw * 3] = V[1].w; \ + } \ + if (id + 2 < bd) { \ + buf[off + str_h * 2] = V[2].x; \ + buf[off + str_h * 2 + str_hw] = V[2].y; \ + buf[off + str_h * 2 + str_hw * 2] = V[2].z; \ + buf[off + str_h * 2 + str_hw * 3] = V[2].w; \ + } \ + 
if (id + 3 < bd) { \ + buf[off + str_h * 3] = V[3].x; \ + buf[off + str_h * 3 + str_hw] = V[3].y; \ + buf[off + str_h * 3 + str_hw * 2] = V[3].z; \ + buf[off + str_h * 3 + str_hw * 3] = V[3].w; \ + } \ + if (id + 4 < bd) { \ + buf[off + str_h * 4] = V[4].x; \ + buf[off + str_h * 4 + str_hw] = V[4].y; \ + buf[off + str_h * 4 + str_hw * 2] = V[4].z; \ + buf[off + str_h * 4 + str_hw * 3] = V[4].w; \ + } \ + if (id + 5 < bd) { \ + buf[off + str_h * 5] = V[5].x; \ + buf[off + str_h * 5 + str_hw] = V[5].y; \ + buf[off + str_h * 5 + str_hw * 2] = V[5].z; \ + buf[off + str_h * 5 + str_hw * 3] = V[5].w; \ + } \ + if (id + 6 < bd) { \ + buf[off + str_h * 6] = V[6].x; \ + buf[off + str_h * 6 + str_hw] = V[6].y; \ + buf[off + str_h * 6 + str_hw * 2] = V[6].z; \ + buf[off + str_h * 6 + str_hw * 3] = V[6].w; \ + } \ + } + +#define SET_REG_ARRAY(v, reg) \ + { \ + SET_REG_ARRAY7(v, reg); \ + } +#elif (ON == 8) +#define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + ACTIVATION_V4(V[4]); \ + ACTIVATION_V4(V[5]); \ + ACTIVATION_V4(V[6]); \ + ACTIVATION_V4(V[7]); \ + vstore4(V[0], off, buf); \ + if (id + 1 < bd) \ + vstore4(V[1], off + str, buf); \ + if (id + 2 < bd) \ + vstore4(V[2], off + str * 2, buf); \ + if (id + 3 < bd) \ + vstore4(V[3], off + str * 3, buf); \ + if (id + 4 < bd) \ + vstore4(V[4], off + str * 4, buf); \ + if (id + 5 < bd) \ + vstore4(V[5], off + str * 5, buf); \ + if (id + 6 < bd) \ + vstore4(V[6], off + str * 6, buf); \ + if (id + 7 < bd) \ + vstore4(V[7], off + str * 7, buf); \ + } + +#define ADD_ELTWISE_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] += vload4(off, buf); \ + V[1] += vload4(off + str, buf); \ + V[2] += vload4(off + str * 2, buf); \ + V[3] += vload4(off + str * 3, buf); \ + V[4] += vload4(off + str * 4, buf); \ + V[5] += vload4(off + str * 5, buf); \ + V[6] += vload4(off + str * 6, buf); \ + V[7] += vload4(off + str * 7, buf); \ + } + +#define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + ACTIVATION_V4(V[4]); \ + ACTIVATION_V4(V[5]); \ + ACTIVATION_V4(V[6]); \ + ACTIVATION_V4(V[7]); \ + buf[off] = V[0].x; \ + buf[off + str_hw] = V[0].y; \ + buf[off + str_hw * 2] = V[0].z; \ + buf[off + str_hw * 3] = V[0].w; \ + if (id + 1 < bd) { \ + buf[off + str_h] = V[1].x; \ + buf[off + str_h + str_hw] = V[1].y; \ + buf[off + str_h + str_hw * 2] = V[1].z; \ + buf[off + str_h + str_hw * 3] = V[1].w; \ + } \ + if (id + 2 < bd) { \ + buf[off + str_h * 2] = V[2].x; \ + buf[off + str_h * 2 + str_hw] = V[2].y; \ + buf[off + str_h * 2 + str_hw * 2] = V[2].z; \ + buf[off + str_h * 2 + str_hw * 3] = V[2].w; \ + } \ + if (id + 3 < bd) { \ + buf[off + str_h * 3] = V[3].x; \ + buf[off + str_h * 3 + str_hw] = V[3].y; \ + buf[off + str_h * 3 + str_hw * 2] = V[3].z; \ + buf[off + str_h * 3 + str_hw * 3] = V[3].w; \ + } \ + if (id + 4 < bd) { \ + buf[off + str_h * 4] = V[4].x; \ + buf[off + str_h * 4 + str_hw] = V[4].y; \ + buf[off + str_h * 4 + str_hw * 2] = V[4].z; \ + buf[off + str_h * 4 + str_hw * 3] = V[4].w; \ + } \ + if (id + 5 < bd) { \ + buf[off + str_h * 5] = V[5].x; \ + buf[off + str_h * 5 + str_hw] = V[5].y; \ + buf[off + str_h * 5 + str_hw * 2] = V[5].z; \ + buf[off + str_h * 5 + str_hw * 3] = V[5].w; \ + } \ + if (id + 6 < bd) { \ + buf[off + str_h * 6] = V[6].x; \ + buf[off + str_h * 6 + str_hw] = V[6].y; \ + buf[off + str_h * 6 + str_hw * 2] = 
V[6].z; \ + buf[off + str_h * 6 + str_hw * 3] = V[6].w; \ + } \ + if (id + 7 < bd) { \ + buf[off + str_h * 7] = V[7].x; \ + buf[off + str_h * 7 + str_hw] = V[7].y; \ + buf[off + str_h * 7 + str_hw * 2] = V[7].z; \ + buf[off + str_h * 7 + str_hw * 3] = V[7].w; \ + } \ + } + +#define SET_REG_ARRAY(v, reg) \ + { \ + SET_REG_ARRAY8(v, reg); \ + } +#endif +#endif diff --git a/compute/tensor/src/gpu/mali/cl/mem_trans_3d_ncwhc4_to_nchw.cl b/compute/tensor/src/gpu/mali/cl/mem_trans_3d_ncwhc4_to_nchw.cl new file mode 100644 index 00000000..3a313cf7 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/mem_trans_3d_ncwhc4_to_nchw.cl @@ -0,0 +1,123 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__kernel void mem_trans_3d_ncwhc4_to_nchw(const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int iw, + const int ih, + const int ic, + const int it, + const int ow, + const int oh, + const int oc, + const int ot, + const int offset_in, + const int offset_out, + __global T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + const int idt = idz % it; + const int idc = idz / it; + + if (idx >= oh || idy >= (ow + 3) >> 2 || idt >= ot) { + return; + } + int in_off = (idz * iw_str + (idy << 2) + iw_off) * ih_str + idx + ih_off; + + int out_off = + (((idc << 2) * ot + idt) * oh_str + idx + oh_off) * ow_str + (idy << 2) + ow_off + offset_out; + char iex = ((idy << 2) + 4 <= iw) ? 4 : (iw & 3); + char oex = ((idy << 2) + 4 <= ow) ? 4 : (ow & 3); + if (idx >= ih || (idy << 2) >= iw || idz >= ((ic + 3) >> 2) * it) { + iex = 0; + } + char oec = ((idc << 2) + 4 <= oc) ?
4 : (oc & 3); + T4 val[4]; + val[0] = 0; + val[1] = 0; + val[2] = 0; + val[3] = 0; + + if (iex > 0) { + val[0] = vload4(in_off, in + offset_in); + } + if (iex > 1) { + val[1] = vload4(in_off + ih_str, in + offset_in); + } + if (iex > 2) { + val[2] = vload4(in_off + (ih_str << 1), in + offset_in); + } + if (iex > 3) { + val[3] = vload4(in_off + ih_str * 3, in + offset_in); + } + + int owh_str = ow_str * oh_str * ot; + if (oex == 4) { + vstore4((T4)(val[0].x, val[1].x, val[2].x, val[3].x), 0, out + out_off); + if (oec > 1) { + vstore4((T4)(val[0].y, val[1].y, val[2].y, val[3].y), 0, out + out_off + owh_str); + } + if (oec > 2) { + vstore4((T4)(val[0].z, val[1].z, val[2].z, val[3].z), 0, out + out_off + (owh_str << 1)); + } + if (oec > 3) { + vstore4((T4)(val[0].w, val[1].w, val[2].w, val[3].w), 0, out + out_off + owh_str * 3); + } + } else { + if (oex == 1) { + out[out_off] = val[0].x; + if (oec > 1) { + out[out_off + owh_str] = val[0].y; + } + if (oec > 2) { + out[out_off + (owh_str << 1)] = val[0].z; + } + if (oec > 3) { + out[out_off + owh_str * 3] = val[0].w; + } + } + if (oex == 2) { + vstore2((T2)(val[0].x, val[1].x), 0, out + out_off); + if (oec > 1) { + vstore2((T2)(val[0].y, val[1].y), 0, out + out_off + owh_str); + } + if (oec > 2) { + vstore2((T2)(val[0].z, val[1].z), 0, out + out_off + (owh_str << 1)); + } + if (oec > 3) { + vstore2((T2)(val[0].w, val[1].w), 0, out + out_off + owh_str * 3); + } + } + if (oex == 3) { + vstore3((T3)(val[0].x, val[1].x, val[2].x), 0, out + out_off); + if (oec > 1) { + vstore3((T3)(val[0].y, val[1].y, val[2].y), 0, out + out_off + owh_str); + } + if (oec > 2) { + vstore3((T3)(val[0].z, val[1].z, val[2].z), 0, out + out_off + (owh_str << 1)); + } + if (oec > 3) { + vstore3((T3)(val[0].w, val[1].w, val[2].w), 0, out + out_off + owh_str * 3); + } + } + } +} diff --git a/compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_nchw.cl b/compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_nchw.cl new file mode 100644 index 00000000..551af4a3 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_nchw.cl @@ -0,0 +1,79 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
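+// Copies an NCHW tensor between two buffers whose padded W/H strides and offsets may differ (e.g. adding or stripping alignment padding). Each work-item moves a 4-wide vector along W; ie/oe count the valid input/output elements of that vector (4 in the interior, iw & 3 or ow & 3 at the right edge), and positions outside the input read back as zero.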
+ +__kernel void mem_trans_nchw_to_nchw(const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int iw, + const int ih, + const int ic, + const int ow, + const int oh, + const int oc, + const int offset_in, + const int offset_out, + const __global T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= ((ow + 3) >> 2) || idy >= oh) { + return; + } + char ie = (((idx << 2) + 4) <= iw) ? 4 : (iw & 3); + char oe = (((idx << 2) + 4) <= ow) ? 4 : (ow & 3); + if (idx >= ((iw + 3) >> 2) || idy >= ih || idz >= ic) { + ie = 0; + } + + T4 val = 0; + const int in_off = (idz * ih_str + idy + ih_off) * iw_str + (idx << 2) + iw_off + offset_in; + if (ie == 4) { + val = vload4(0, in + in_off); + } else { + if (ie == 1) { + val.x = in[in_off]; + } + if (ie == 2) { + T2 tmp = vload2(0, in + in_off); + val.x = tmp.x; + val.y = tmp.y; + } + if (ie == 3) { + T3 tmp = vload3(0, in + in_off); + val.x = tmp.x; + val.y = tmp.y; + val.z = tmp.z; + } + } + const int out_off = (idz * oh_str + idy + oh_off) * ow_str + (idx << 2) + ow_off + offset_out; + if (oe == 4) { + vstore4(val, 0, out + out_off); + } else { + if (oe == 1) { + out[out_off] = val.x; + } + if (oe == 2) { + vstore2(val.xy, 0, out + out_off); + } + if (oe == 3) { + vstore3(val.xyz, 0, out + out_off); + } + } +} diff --git a/compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_ncwhc4.cl b/compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_ncwhc4.cl new file mode 100644 index 00000000..b863dfc9 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_ncwhc4.cl @@ -0,0 +1,153 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
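+// NCWHC4 packs channels four at a time into T4 vectors: lane c % 4 of the vec4 at element offset ((n * ((C + 3) / 4) + c / 4) * W_str + w + w_off) * H_str + h + h_off holds element (n, c, h, w). The _input_tran/_output_tran variants additionally exchange the H and W axes while repacking.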
+ +__kernel void +#if defined(INPUT_TRAN) +mem_trans_nchw_to_ncwhc4_input_tran +#elif defined(OUTPUT_TRAN) +mem_trans_nchw_to_ncwhc4_output_tran +#else +mem_trans_nchw_to_ncwhc4 +#endif + (const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int iw, + const int ih, + const int ic, + const int ow, + const int oh, + const int oc, + const int offset_in, + const int offset_out, + const __global T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + int ocd4 = (oc + 3) >> 2; + const int idc = idz % ocd4; + const int idn = idz / ocd4; + const int iz_off = (idn * ic + (idc << 2)) * iw_str * ih_str; + +#if defined(INPUT_TRAN) + if (idx >= (oh + 3) >> 2 || idy >= ow) { + return; + } + int in_off = iz_off + (idy + iw_off) * ih_str + (idx << 2) + ih_off + offset_in; + int out_off = (idz * ow_str + idy + ow_off) * oh_str + (idx << 2) + oh_off; + char iex = ((idx << 2) + 4 <= ih) ? 4 : (ih & 3); + char oex = ((idx << 2) + 4 <= oh) ? 4 : (oh & 3); + if ((idx << 2) >= ih || idy >= iw || idc >= ((ic + 3) >> 2)) { + iex = 0; + } + int out_str = 1; +#else +#if defined(OUTPUT_TRAN) + if (idx >= (oh + 3) >> 2 || idy >= ow) { + return; + } + int out_off = (idz * ow_str + idy + ow_off) * oh_str + (idx << 2) + oh_off; + int out_str = 1; + char oex = ((idx << 2) + 4 <= oh) ? 4 : (oh & 3); +#else + if (idx >= (ow + 3) >> 2 || idy >= oh) { + return; + } + int out_off = (idz * ow_str + (idx << 2) + ow_off) * oh_str + idy + oh_off; + int out_str = oh_str; + char oex = ((idx << 2) + 4 <= ow) ? 4 : (ow & 3); +#endif + int in_off = iz_off + (idy + ih_off) * iw_str + (idx << 2) + iw_off + offset_in; + char iex = ((idx << 2) + 4 <= iw) ? 4 : (iw & 3); + if ((idx << 2) >= iw || idy >= ih || idc >= ((ic + 3) >> 2)) { + iex = 0; + } +#endif + char iec = ((idc << 2) + 4 <= ic) ? 
4 : (ic & 3); + T4 val[4]; + val[0] = 0; + val[1] = 0; + val[2] = 0; + val[3] = 0; + + int iwh_str = iw_str * ih_str; + if (iex == 4) { + val[0] = vload4(0, in + in_off); + if (iec > 1) { + val[1] = vload4(0, in + in_off + iwh_str); + } + if (iec > 2) { + val[2] = vload4(0, in + in_off + (iwh_str << 1)); + } + if (iec > 3) { + val[3] = vload4(0, in + in_off + iwh_str * 3); + } + } else { + if (iex == 1) { + val[0].x = in[in_off]; + if (iec > 1) { + val[1].x = in[in_off + iwh_str]; + } + if (iec > 2) { + val[2].x = in[in_off + (iwh_str << 1)]; + } + if (iec > 3) { + val[3].x = in[in_off + iwh_str * 3]; + } + } + if (iex == 2) { + val[0].xy = vload2(0, in + in_off); + if (iec > 1) { + val[1].xy = vload2(0, in + in_off + iwh_str); + } + if (iec > 2) { + val[2].xy = vload2(0, in + in_off + (iwh_str << 1)); + } + if (iec > 3) { + val[3].xy = vload2(0, in + in_off + iwh_str * 3); + } + } + if (iex == 3) { + val[0].xyz = vload3(0, in + in_off); + if (iec > 1) { + val[1].xyz = vload3(0, in + in_off + iwh_str); + } + if (iec > 2) { + val[2].xyz = vload3(0, in + in_off + (iwh_str << 1)); + } + if (iec > 3) { + val[3].xyz = vload3(0, in + in_off + iwh_str * 3); + } + } + } + + vstore4((T4)(val[0].x, val[1].x, val[2].x, val[3].x), out_off, out + offset_out); + if (oex > 1) { + vstore4((T4)(val[0].y, val[1].y, val[2].y, val[3].y), out_off + out_str, out + offset_out); + } + if (oex > 2) { + vstore4((T4)(val[0].z, val[1].z, val[2].z, val[3].z), out_off + (out_str << 1), + out + offset_out); + } + if (oex > 3) { + vstore4( + (T4)(val[0].w, val[1].w, val[2].w, val[3].w), out_off + out_str * 3, out + offset_out); + } +} diff --git a/compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_ncwhc4_iw_equal_oh.cl b/compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_ncwhc4_iw_equal_oh.cl new file mode 100644 index 00000000..f1ae3cfc --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_ncwhc4_iw_equal_oh.cl @@ -0,0 +1,61 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
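+// Fast path taken when the input width coincides with the output H axis: the (c, h) input coordinates are flattened to k = c * ih + h, each work-item gathers four consecutive k values with div/mod arithmetic, and writes them out as a single T4 channel group.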
+ +__kernel void mem_trans_nchw_to_ncwhc4_iw_equal_oh(const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int ih, + const int ic, + const int bx, + const int by, + __global const T *in, + __global T *out) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + T4 val = 0; + int k = ih * ic; + int idk = (idz * by + idy) << 2; + int ix = idx; + int4 iy; + int4 iz; + iy.s0 = idk % ih; + iy.s1 = (idk + 1) % ih; + iy.s2 = (idk + 2) % ih; + iy.s3 = (idk + 3) % ih; + iz.s0 = idk / ih; + iz.s1 = (idk + 1) / ih; + iz.s2 = (idk + 2) / ih; + iz.s3 = (idk + 3) / ih; + val.x = in[(iz.s0 * ih_str + iy.s0 + ih_off) * iw_str + ix + iw_off]; + if (idk + 1 < k) { + val.y = in[(iz.s1 * ih_str + iy.s1 + ih_off) * iw_str + ix + iw_off]; + } + if (idk + 2 < k) { + val.z = in[(iz.s2 * ih_str + iy.s2 + ih_off) * iw_str + ix + iw_off]; + } + if (idk + 3 < k) { + val.w = in[(iz.s3 * ih_str + iy.s3 + ih_off) * iw_str + ix + iw_off]; + } + const int out_off = (idy * ow_str + ow_off) * oh_str + idx + oh_off; + vstore4(val, out_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_mtk.cl b/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_mtk.cl new file mode 100644 index 00000000..2d0939cc --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_mtk.cl @@ -0,0 +1,47 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__kernel void mem_trans_ncwhc4_to_mtk(const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int k, + const int offset, + const int bx, + const int by, + __global T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + if (idx >= bx || idy >= by) { + return; + } + uchar ek = ((idy << 2) + 4 <= k) ?
4 : (k & 3); + const int in_off = (idy * iw_str + iw_off) * ih_str + idx + ih_off; + T4 val = vload4(in_off, in); + const int out_off = idx * k + (idy << 2) + offset; + if (ek == 4) { + vstore4(val, 0, out + out_off); + } else { + if (ek == 1) { + out[out_off] = val.x; + } + if (ek == 2) { + vstore2((T2)(val.x, val.y), 0, out + out_off); + } + if (ek == 3) { + vstore3((T3)(val.x, val.y, val.z), 0, out + out_off); + } + } +} diff --git a/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_nchw.cl b/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_nchw.cl new file mode 100644 index 00000000..5207e701 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_nchw.cl @@ -0,0 +1,137 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__kernel void +#if defined(OUTPUT_TRAN) +mem_trans_ncwhc4_to_nchw_output_tran +#else +mem_trans_ncwhc4_to_nchw +#endif + (const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int iw, + const int ih, + const int ic, + const int ow, + const int oh, + const int oc, + const int offset_in, + const int offset_out, + __global T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); +#if defined(OUTPUT_TRAN) + if (idx >= (ow + 3) >> 2 || idy >= oh) { + return; + } + int in_off = (idz * iw_str + idy + iw_off) * ih_str + (idx << 2) + ih_off; + int out_off = ((idz << 2) * oh_str + idy + oh_off) * ow_str + (idx << 2) + ow_off + offset_out; + char iex = ((idx << 2) + 4 <= ih) ? 4 : (ih & 3); + char oex = ((idx << 2) + 4 <= ow) ? 4 : (ow & 3); + if ((idx << 2) >= ih || idy >= iw || idz >= (ic + 3) >> 2) { + iex = 0; + } + const int in_str = 1; +#else + if (idx >= oh || idy >= (ow + 3) >> 2) { + return; + } + int in_off = (idz * iw_str + (idy << 2) + iw_off) * ih_str + idx + ih_off; + int out_off = ((idz << 2) * oh_str + idx + oh_off) * ow_str + (idy << 2) + ow_off + offset_out; + char iex = ((idy << 2) + 4 <= iw) ? 4 : (iw & 3); + char oex = ((idy << 2) + 4 <= ow) ? 4 : (ow & 3); + if (idx >= ih || (idy << 2) >= iw || idz >= (ic + 3) >> 2) { + iex = 0; + } + const int in_str = ih_str; +#endif + short oec = ((idz << 2) + 4 <= oc) ? 
4 : (oc & 3); + T4 val[4]; + val[0] = 0; + val[1] = 0; + val[2] = 0; + val[3] = 0; + + if (iex > 0) { + val[0] = vload4(in_off, in + offset_in); + } + if (iex > 1) { + val[1] = vload4(in_off + in_str, in + offset_in); + } + if (iex > 2) { + val[2] = vload4(in_off + (in_str << 1), in + offset_in); + } + if (iex > 3) { + val[3] = vload4(in_off + in_str * 3, in + offset_in); + } + + int owh_str = ow_str * oh_str; + if (oex == 4) { + vstore4((T4)(val[0].x, val[1].x, val[2].x, val[3].x), 0, out + out_off); + if (oec > 1) { + vstore4((T4)(val[0].y, val[1].y, val[2].y, val[3].y), 0, out + out_off + owh_str); + } + if (oec > 2) { + vstore4((T4)(val[0].z, val[1].z, val[2].z, val[3].z), 0, out + out_off + (owh_str << 1)); + } + if (oec > 3) { + vstore4((T4)(val[0].w, val[1].w, val[2].w, val[3].w), 0, out + out_off + owh_str * 3); + } + } else { + if (oex == 1) { + out[out_off] = val[0].x; + if (oec > 1) { + out[out_off + owh_str] = val[0].y; + } + if (oec > 2) { + out[out_off + (owh_str << 1)] = val[0].z; + } + if (oec > 3) { + out[out_off + owh_str * 3] = val[0].w; + } + } + if (oex == 2) { + vstore2((T2)(val[0].x, val[1].x), 0, out + out_off); + if (oec > 1) { + vstore2((T2)(val[0].y, val[1].y), 0, out + out_off + owh_str); + } + if (oec > 2) { + vstore2((T2)(val[0].z, val[1].z), 0, out + out_off + (owh_str << 1)); + } + if (oec > 3) { + vstore2((T2)(val[0].w, val[1].w), 0, out + out_off + owh_str * 3); + } + } + if (oex == 3) { + vstore3((T3)(val[0].x, val[1].x, val[2].x), 0, out + out_off); + if (oec > 1) { + vstore3((T3)(val[0].y, val[1].y, val[2].y), 0, out + out_off + owh_str); + } + if (oec > 2) { + vstore3((T3)(val[0].z, val[1].z, val[2].z), 0, out + out_off + (owh_str << 1)); + } + if (oec > 3) { + vstore3((T3)(val[0].w, val[1].w, val[2].w), 0, out + out_off + owh_str * 3); + } + } + } +} diff --git a/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_nchw_ih_equal_ow.cl b/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_nchw_ih_equal_ow.cl new file mode 100644 index 00000000..d15f2fcf --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_nchw_ih_equal_ow.cl @@ -0,0 +1,61 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
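+// Fast path for NCWHC4 -> NCHW when the input height equals the output width.
+// Each T4 load carries four packed channel lanes; idk flattens (channel, row),
+// so oy = idk % oh and oz = idk / oh recover the scalar output positions.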
+ +__kernel void mem_trans_ncwhc4_to_nchw_ih_equal_ow(const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int oh, + const int oc, + const int bx, + const int by, + __global const T *in, + __global T *out) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + T4 val; + const int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; + val = vload4(in_off, in); + + int idk = (idz * by + idy) << 2; + int ox = idx; + int4 oy; + int4 oz; + oy.s0 = idk % oh; + oy.s1 = (idk + 1) % oh; + oy.s2 = (idk + 2) % oh; + oy.s3 = (idk + 3) % oh; + oz.s0 = idk / oh; + oz.s1 = (idk + 1) / oh; + oz.s2 = (idk + 2) / oh; + oz.s3 = (idk + 3) / oh; + out[(oz.s0 * oh_str + oy.s0 + oh_off) * ow_str + ox + ow_off] = val.x; + if (oz.s1 < oc) { + out[(oz.s1 * oh_str + oy.s1 + oh_off) * ow_str + ox + ow_off] = val.y; + } + if (oz.s2 < oc) { + out[(oz.s2 * oh_str + oy.s2 + oh_off) * ow_str + ox + ow_off] = val.z; + } + if (oz.s3 < oc) { + out[(oz.s3 * oh_str + oy.s3 + oh_off) * ow_str + ox + ow_off] = val.w; + } +} diff --git a/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_ncwhc4.cl b/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_ncwhc4.cl new file mode 100644 index 00000000..a24987ca --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_ncwhc4.cl @@ -0,0 +1,61 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
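+// Strided copy between two padded NCWHC4 buffers. Positions outside the valid
+// input region are written as zeros, and OUTPUT_TRAN swaps the h/w axes of the
+// output indexing.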
+ +__kernel void +#if defined(OUTPUT_TRAN) +mem_trans_ncwhc4_to_ncwhc4_output_tran +#else +mem_trans_ncwhc4_to_ncwhc4 +#endif + (const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int iw, + const int ih, + const int ic, + const int ow, + const int oh, + const int oc, + const int offset_in, + const int offset_out, + __global T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + +#if defined(OUTPUT_TRAN) + if (idx >= ow || idy >= oh) { + return; + } + const int out_off = (idz * ow_str + idx + ow_off) * oh_str + idy + oh_off; +#else + if (idx >= oh || idy >= ow) { + return; + } + const int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; +#endif + + T4 val = 0; + const int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; + if (idx < ih && idy < iw && idz < ((ic + 3) >> 2)) { + val = vload4(in_off, in + offset_in); + } + vstore4(val, out_off, out + offset_out); +} diff --git a/compute/tensor/src/gpu/mali/cl/normalization.cl b/compute/tensor/src/gpu/mali/cl/normalization.cl new file mode 100644 index 00000000..6dbe61e2 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/normalization.cl @@ -0,0 +1,89 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
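+// Layer normalization over the packed channel dimension: two read passes
+// accumulate mean and variance in float (para presumably holds the reciprocal
+// of the reduced element count), then a third pass applies alpha/beta, with a
+// 1e-6 epsilon inside the square root.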
+
+#if defined(USE_C1)
+__kernel void normalization_c1
+#else
+__kernel void normalization
+#endif
+    (const int len,
+    const int ih_str,
+    const int ic_str,
+    const int ih_off,
+    const int iw_off,
+    const int oh_str,
+    const int oh_off,
+    const int ow_off,
+    const float para,
+    __global const T *alpha,
+    __global const T *beta,
+    __global const T *in,
+    __global T *out)
+{
+    int idx = get_global_id(0);
+    if (idx >= len) {
+        return;
+    }
+
+    float mean = 0;
+    float var = 0;
+
+    // pass 1: accumulate the mean in float
+    int in_off = iw_off * ih_str + idx + ih_off;
+    for (int i = 0; i < ic_str; ++i) {
+        T4 tmp = vload4(in_off + i * ih_str, in);
+        float4 tmpf;
+        tmpf.x = tmp.x;
+        tmpf.y = tmp.y;
+        tmpf.z = tmp.z;
+        tmpf.w = tmp.w;
+        mean += (float)(tmpf.x + tmpf.y + tmpf.z + tmpf.w);
+    }
+    mean = mean * para;
+
+    // pass 2: accumulate the variance around the mean
+    for (int i = 0; i < ic_str; ++i) {
+        T4 tmp = vload4(in_off + i * ih_str, in);
+        float4 tmpf;
+        tmpf.x = tmp.x;
+        tmpf.y = tmp.y;
+        tmpf.z = tmp.z;
+        tmpf.w = tmp.w;
+        tmpf.x = tmpf.x - mean;
+        tmpf.y = tmpf.y - mean;
+        tmpf.z = tmpf.z - mean;
+        tmpf.w = tmpf.w - mean;
+        var += tmpf.x * tmpf.x + tmpf.y * tmpf.y + tmpf.z * tmpf.z + tmpf.w * tmpf.w;
+    }
+    var = var * para;
+
+    // pass 3: normalize and apply the affine scale/shift
+    float std_val = sqrt(var + 1e-6);
+    std_val = 1.0 / std_val;
+    int out_off = ow_off * oh_str + idx + oh_off;
+    for (int i = 0; i < ic_str; ++i) {
+        T4 out_val = vload4(in_off + i * ih_str, in);
+        T4 alp = vload4(i, alpha);
+        T4 bet = vload4(i, beta);
+        out_val.x = alp.x * (out_val.x - mean) * std_val + bet.x;
+        out_val.y = alp.y * (out_val.y - mean) * std_val + bet.y;
+        out_val.z = alp.z * (out_val.z - mean) * std_val + bet.z;
+        out_val.w = alp.w * (out_val.w - mean) * std_val + bet.w;
+#if defined(USE_C1)
+        out[out_off] = out_val.x;
+        out[out_off + oh_str] = out_val.y;
+        out[out_off + oh_str * 2] = out_val.z;
+        out[out_off + oh_str * 3] = out_val.w;
+        out_off += (oh_str << 2);
+#else
+        vstore4(out_val, out_off + i * oh_str, out);
+#endif
+    }
+}
diff --git a/compute/tensor/src/gpu/mali/cl/padding_constant.cl b/compute/tensor/src/gpu/mali/cl/padding_constant.cl
new file mode 100644
index 00000000..58ba79f5
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/cl/padding_constant.cl
@@ -0,0 +1,51 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
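+// Constant padding: threads covering the padded border return without writing,
+// which presumably relies on the output buffer having been pre-filled with the
+// pad value (e.g. zeros) before this kernel runs.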
+ +__kernel void padding_constant(const int ih, + const int iw, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh, + const int ow, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int ph, + const int pb, + const int pw, + const int pr, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= oh || idy >= ow) { + return; + } + if (idx < ph || idx >= ph + ih) { + return; + } + if (idy < pw || idy >= pw + iw) { + return; + } + + int in_off = (idz * iw_str + idy - pw + iw_off) * ih_str + ih_off + idx - ph; + int out_off = (idz * ow_str + idy + ow_off) * oh_str + oh_off + idx; + T4 val; + val = vload4(in_off, in); + vstore4(val, out_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/padding_edge.cl b/compute/tensor/src/gpu/mali/cl/padding_edge.cl new file mode 100644 index 00000000..b6c1f428 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/padding_edge.cl @@ -0,0 +1,72 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
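+// Edge (replicate) padding: each border pixel is redirected to the nearest
+// valid input pixel by clamping the row/column offset in every branch below.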
+
+__kernel void padding_edge(const int ih,
+    const int iw,
+    const int ih_str,
+    const int iw_str,
+    const int ih_off,
+    const int iw_off,
+    const int oh,
+    const int ow,
+    const int oh_str,
+    const int ow_str,
+    const int oh_off,
+    const int ow_off,
+    const int ph,
+    const int pb,
+    const int pw,
+    const int pr,
+    __global const T *in,
+    __global T *out)
+{
+    const int idx = get_global_id(0);
+    const int idy = get_global_id(1);
+    const int idz = get_global_id(2);
+    if (idx >= oh || idy >= ow) {
+        return;
+    }
+
+    int in_off = idz * iw_str * ih_str;
+    if (idx < ph) {
+        if (idy < pw) {
+            in_off = in_off + iw_off * ih_str + ih_off;
+        } else if (idy >= pw + iw) {
+            in_off = in_off + (iw_off + iw - 1) * ih_str + ih_off;
+        } else {
+            in_off = in_off + (idy + iw_off - pw) * ih_str + ih_off;
+        }
+    } else if (idx >= ph + ih) {
+        in_off = in_off + iw_off * ih_str + ih_off + ih - 1;
+        if (idy < pw) {
+            // left border: the first input column is already the source
+        } else if (idy >= pw + iw) {
+            in_off = in_off + (iw - 1) * ih_str;
+        } else {
+            in_off = in_off + (idy - pw) * ih_str;
+        }
+    } else {
+        in_off = in_off + iw_off * ih_str + ih_off;
+        if (idy < pw) {
+            in_off = in_off + idx - ph;
+        } else if (idy >= pw + iw) {
+            in_off = in_off + idx - ph + (iw - 1) * ih_str;
+        } else {
+            in_off = in_off + (idy - pw) * ih_str + idx - ph;
+        }
+    }
+    T4 val;
+    val = vload4(in_off, in);
+    int out_off = (idz * ow_str + idy + ow_off) * oh_str + oh_off + idx;
+    vstore4(val, out_off, out);
+}
diff --git a/compute/tensor/src/gpu/mali/cl/padding_input_gclmem.cl b/compute/tensor/src/gpu/mali/cl/padding_input_gclmem.cl
new file mode 100644
index 00000000..287b8acf
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/cl/padding_input_gclmem.cl
@@ -0,0 +1,66 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
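+// Zero-pads a plain NCHW tensor into a larger NCHW buffer. Each thread covers
+// four consecutive output columns, using vector loads/stores when the whole T4
+// lies inside the valid region and per-element accesses at the borders.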
+
+__kernel void padding_input_gclmem(const int iw,
+    const int ih,
+    const int pw,
+    const int ph,
+    const int ow,
+    const int oh,
+    __global const T *in,
+    __global T *out)
+{
+    int idx = get_global_id(0) << 2;
+    int idy = get_global_id(1);
+    int idz = get_global_id(2);
+    if (idx >= ow || idy >= oh) {
+        return;
+    }
+
+    int in_y = idy - ph;
+    int be_x = idx - pw;
+    int en_x = be_x + 4;
+    T4 val = 0;
+    if (in_y >= 0 && in_y < ih) {
+        int in_off = (idz * ih + in_y) * iw;
+        if (be_x >= 0 && en_x < iw) {
+            val = vload4(0, in + in_off + be_x);
+        } else {
+            if (be_x >= 0 && be_x < iw) {
+                val.x = in[in_off + be_x];
+            }
+            if (be_x + 1 >= 0 && be_x + 1 < iw) {
+                val.y = in[in_off + be_x + 1];
+            }
+            if (be_x + 2 >= 0 && be_x + 2 < iw) {
+                val.z = in[in_off + be_x + 2];
+            }
+            if (be_x + 3 >= 0 && be_x + 3 < iw) {
+                val.w = in[in_off + be_x + 3];
+            }
+        }
+    }
+
+    int out_off = (idz * oh + idy) * ow + idx;
+    if (idx + 3 >= ow) {
+        out[out_off] = val.x;
+        if (idx + 1 < ow) {
+            out[out_off + 1] = val.y;
+        }
+        if (idx + 2 < ow) {
+            out[out_off + 2] = val.z;
+        }
+    } else {
+        vstore4(val, 0, out + out_off);
+    }
+}
diff --git a/compute/tensor/src/gpu/mali/cl/padding_reflect.cl b/compute/tensor/src/gpu/mali/cl/padding_reflect.cl
new file mode 100644
index 00000000..984c82d8
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/cl/padding_reflect.cl
@@ -0,0 +1,75 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
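+// Reflect padding (edge pixel not repeated): a pad distance of d maps back to
+// input index d, so the row just above the image mirrors input row 1.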
+
+__kernel void padding_reflect(const int ih,
+    const int iw,
+    const int ih_str,
+    const int iw_str,
+    const int ih_off,
+    const int iw_off,
+    const int oh,
+    const int ow,
+    const int oh_str,
+    const int ow_str,
+    const int oh_off,
+    const int ow_off,
+    const int ph,
+    const int pb,
+    const int pw,
+    const int pr,
+    __global const T *in,
+    __global T *out)
+{
+    const int idx = get_global_id(0);
+    const int idy = get_global_id(1);
+    const int idz = get_global_id(2);
+    if (idx >= oh || idy >= ow) {
+        return;
+    }
+
+    int in_off = idz * iw_str * ih_str;
+    if (idx < ph) {
+        in_off = in_off + iw_off * ih_str + ih_off;
+        if (idy < pw) {
+            in_off = in_off + (pw - idy) * ih_str + ph - idx;
+        } else if (idy >= pw + iw) {
+            in_off = in_off + (iw - 1) * ih_str;
+            in_off = in_off - (idy + 1 - pw - iw) * ih_str + ph - idx;
+        } else {
+            in_off = in_off + (idy - pw) * ih_str + ph - idx;
+        }
+    } else if (idx >= ph + ih) {
+        in_off = in_off + iw_off * ih_str + ih_off + ih - 1;
+        if (idy < pw) {
+            in_off = in_off + (pw - idy) * ih_str - (idx + 1 - ph - ih);
+        } else if (idy >= pw + iw) {
+            in_off = in_off + (iw - 1) * ih_str;
+            in_off = in_off - (idy + 1 - pw - iw) * ih_str - (idx + 1 - ph - ih);
+        } else {
+            in_off = in_off + (idy - pw) * ih_str - (idx + 1 - ih - ph);
+        }
+    } else {
+        in_off = in_off + iw_off * ih_str + ih_off;
+        if (idy < pw) {
+            in_off = in_off + (pw - idy) * ih_str + idx - ph;
+        } else if (idy >= pw + iw) {
+            in_off = in_off + (iw - 1) * ih_str - (idy + 1 - iw - pw) * ih_str + idx - ph;
+        } else {
+            in_off = in_off + (idy - pw) * ih_str + idx - ph;
+        }
+    }
+    T4 val;
+    val = vload4(in_off, in);
+    int out_off = (idz * ow_str + idy + ow_off) * oh_str + oh_off + idx;
+    vstore4(val, out_off, out);
+}
diff --git a/compute/tensor/src/gpu/mali/cl/padding_symmetric.cl b/compute/tensor/src/gpu/mali/cl/padding_symmetric.cl
new file mode 100644
index 00000000..3dd5ebcc
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/cl/padding_symmetric.cl
@@ -0,0 +1,75 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
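+// Symmetric padding (edge pixel repeated): a pad distance of d maps back to
+// input index d - 1, so the row just above the image repeats input row 0.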
+
+__kernel void padding_symmetric(const int ih,
+    const int iw,
+    const int ih_str,
+    const int iw_str,
+    const int ih_off,
+    const int iw_off,
+    const int oh,
+    const int ow,
+    const int oh_str,
+    const int ow_str,
+    const int oh_off,
+    const int ow_off,
+    const int ph,
+    const int pb,
+    const int pw,
+    const int pr,
+    __global const T *in,
+    __global T *out)
+{
+    const int idx = get_global_id(0);
+    const int idy = get_global_id(1);
+    const int idz = get_global_id(2);
+    if (idx >= oh || idy >= ow) {
+        return;
+    }
+
+    int in_off = idz * iw_str * ih_str;
+    if (idx < ph) {
+        in_off = in_off + iw_off * ih_str + ih_off;
+        if (idy < pw) {
+            in_off = in_off + (pw - 1 - idy) * ih_str + ph - 1 - idx;
+        } else if (idy >= pw + iw) {
+            in_off = in_off + (iw - 1) * ih_str;
+            in_off = in_off - (idy - pw - iw) * ih_str + ph - 1 - idx;
+        } else {
+            in_off = in_off + (idy - pw) * ih_str + ph - 1 - idx;
+        }
+    } else if (idx >= ph + ih) {
+        in_off = in_off + iw_off * ih_str + ih_off + ih - 1;
+        if (idy < pw) {
+            in_off = in_off + (pw - 1 - idy) * ih_str - (idx - ph - ih);
+        } else if (idy >= pw + iw) {
+            in_off = in_off + (iw - 1) * ih_str;
+            in_off = in_off - (idy - pw - iw) * ih_str - (idx - ph - ih);
+        } else {
+            in_off = in_off + (idy - pw) * ih_str - (idx - ih - ph);
+        }
+    } else {
+        in_off = in_off + iw_off * ih_str + ih_off;
+        if (idy < pw) {
+            in_off = in_off + (pw - 1 - idy) * ih_str + idx - ph;
+        } else if (idy >= pw + iw) {
+            in_off = in_off + (iw - 1) * ih_str - (idy - iw - pw) * ih_str + idx - ph;
+        } else {
+            in_off = in_off + (idy - pw) * ih_str + idx - ph;
+        }
+    }
+    T4 val;
+    val = vload4(in_off, in);
+    int out_off = (idz * ow_str + idy + ow_off) * oh_str + oh_off + idx;
+    vstore4(val, out_off, out);
+}
diff --git a/compute/tensor/src/gpu/mali/cl/pooling_global_mean_h.cl b/compute/tensor/src/gpu/mali/cl/pooling_global_mean_h.cl
new file mode 100644
index 00000000..637c0513
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/cl/pooling_global_mean_h.cl
@@ -0,0 +1,46 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
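+// Global average pooling along the h axis of an NCWHC4 tensor; partial sums
+// are accumulated in float4 so half-precision inputs keep their accuracy
+// across long reductions.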
+ +#define sumvec4(x, y) \ + { \ + x.s0 += (float)y.s0; \ + x.s1 += (float)y.s1; \ + x.s2 += (float)y.s2; \ + x.s3 += (float)y.s3; \ + } + +__kernel void pooling_global_mean_h(const int ih, + const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int bx, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + if (idx >= bx) { + return; + } + const int in_off = idx * ih; + + T4 val; + float4 sum = 0; + for (int i = 0; i < ih; ++i) { + val = vload4(in_off + i, in); + sumvec4(sum, val); + } + sum = sum / ((float)(ih)); + int out_off = idx * ohw_str + ow_off * oh_str + oh_off; + vstore4((T4)(sum.x, sum.y, sum.z, sum.w), out_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/pooling_global_mean_w.cl b/compute/tensor/src/gpu/mali/cl/pooling_global_mean_w.cl new file mode 100644 index 00000000..08af8310 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/pooling_global_mean_w.cl @@ -0,0 +1,50 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define sumvec4(x, y) \ + { \ + x.s0 += (float)y.s0; \ + x.s1 += (float)y.s1; \ + x.s2 += (float)y.s2; \ + x.s3 += (float)y.s3; \ + } + +__kernel void pooling_global_mean_w(const int ih_str, + const int ihw_str, + const int ih_off, + const int iw_off, + const int ih, + const int iw, + const int bx, + const int by, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + if (idx >= bx || idy >= by) { + return; + } + + int in_off = idy * ihw_str + iw_off * ih_str + idx + ih_off; + + T4 val; + float4 sum = 0; + for (int i = 0; i < iw; ++i) { + val = vload4(in_off + ih_str * i, in); + sumvec4(sum, val); + } + sum = sum / (float)(iw); + int out_off = (idy * ih) + idx; + vstore4((T4)(sum.x, sum.y, sum.z, sum.w), out_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/pooling_max.cl b/compute/tensor/src/gpu/mali/cl/pooling_max.cl new file mode 100644 index 00000000..3abb101f --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/pooling_max.cl @@ -0,0 +1,76 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define maxvec4(x, y) \ + { \ + x.s0 = (x.s0 > y.s0) ? x.s0 : y.s0; \ + x.s1 = (x.s1 > y.s1) ? x.s1 : y.s1; \ + x.s2 = (x.s2 > y.s2) ? x.s2 : y.s2; \ + x.s3 = (x.s3 > y.s3) ? x.s3 : y.s3; \ + } + +__kernel void pooling_max(const int ih, + const int iw, + const int ih_off, + const int iw_off, + const int ih_str, + const int iw_str, + const int oh, + const int ow, + const int oh_off, + const int ow_off, + const int oh_str, + const int ow_str, + const int sh, + const int sw, + const int ph, + const int pw, + const int kh, + const int kw, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= oh || idy >= ow) { + return; + } + + int bh = idx * sh - ph; + int bw = idy * sw - pw; + int eh = bh + kh; + int ew = bw + kw; + bh = (bh < 0) ? 0 : bh; + bw = (bw < 0) ? 0 : bw; + eh = (eh < ih) ? eh : ih; + ew = (ew < iw) ? ew : iw; + + bh += ih_off; + bw += iw_off; + eh += ih_off; + ew += iw_off; + int in_off = (idz * iw_str + bw) * ih_str; + + T4 val = -FLT_MAX; + T4 maxval = -FLT_MAX; + for (int i = bw; i < ew; ++i) { + for (int j = bh; j < eh; ++j) { + val = vload4(in_off + j, in); + maxvec4(maxval, val); + } + in_off += ih_str; + } + int out_off = (idz * ow_str + ow_off + idy) * oh_str + oh_off + idx; + vstore4(maxval, out_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/pooling_mean.cl b/compute/tensor/src/gpu/mali/cl/pooling_mean.cl new file mode 100644 index 00000000..6e040e56 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/pooling_mean.cl @@ -0,0 +1,78 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define sumvec4(x, y) \ + { \ + x.s0 += (float)y.s0; \ + x.s1 += (float)y.s1; \ + x.s2 += (float)y.s2; \ + x.s3 += (float)y.s3; \ + } + +__kernel void pooling_mean(const int ih, + const int iw, + const int ih_off, + const int iw_off, + const int ih_str, + const int iw_str, + const int oh, + const int ow, + const int oh_off, + const int ow_off, + const int oh_str, + const int ow_str, + const int sh, + const int sw, + const int ph, + const int pw, + const int kh, + const int kw, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= oh || idy >= ow) { + return; + } + + int bh = idx * sh - ph; + int bw = idy * sw - pw; + int eh = bh + kh; + int ew = bw + kw; + bh = (bh < 0) ? 0 : bh; + bw = (bw < 0) ? 0 : bw; + eh = (eh < ih) ? eh : ih; + ew = (ew < iw) ? ew : iw; + float psize = (eh - bh) * (ew - bw); + + bh += ih_off; + bw += iw_off; + eh += ih_off; + ew += iw_off; + int in_off = (idz * iw_str + bw) * ih_str; + + T4 val; + float4 sum = 0; + for (int i = bw; i < ew; ++i) { + for (int j = bh; j < eh; ++j) { + val = vload4(in_off + j, in); + sumvec4(sum, val); + } + in_off += ih_str; + } + sum = sum / psize; + int out_off = (idz * ow_str + ow_off + idy) * oh_str + oh_off + idx; + vstore4((T4)(sum.x, sum.y, sum.z, sum.w), out_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/power.cl b/compute/tensor/src/gpu/mali/cl/power.cl new file mode 100644 index 00000000..d09c718d --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/power.cl @@ -0,0 +1,93 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
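+// Elementwise power: y = (x * alp + bet) ^ power on a 4-wide vectorized grid
+// with 1/2/3-element tails; the pow() call is skipped when has_power is zero.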
+ +#define MANGLE_NAME_IMPL(base, DT) base##DT +#define MANGLE_NAME(base, DT) MANGLE_NAME_IMPL(base, DT) +__kernel void MANGLE_NAME(power_, DT)(const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int w, + const int bx, + const int by, + const int has_power, + const float alp, + const float bet, + float power, + __global T *input, + __global T *output) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + char ew = (((idx << 2) + 4) <= w) ? 4 : (w & 3); + + int in_off = (idz * ih_str + idy + ih_off) * iw_str + (idx << 2) + iw_off; + int out_off = (idz * oh_str + idy + oh_off) * ow_str + (idx << 2) + ow_off; + if (ew == 4) { + T4 val; + val = vload4(0, input + in_off); + val.x = (T)(((float)val.x) * alp + bet); + val.y = (T)(((float)val.y) * alp + bet); + val.z = (T)(((float)val.z) * alp + bet); + val.w = (T)(((float)val.w) * alp + bet); + if (has_power) { + val.x = pow((float)val.x, power); + val.y = pow((float)val.y, power); + val.z = pow((float)val.z, power); + val.w = pow((float)val.w, power); + } + vstore4(val, 0, output + out_off); + } else { + if (ew == 1) { + T val; + val = input[in_off]; + val = ((float)val) * alp + bet; + if (has_power) { + val = pow((float)val, power); + } + output[out_off] = (T)val; + } + if (ew == 2) { + T2 val; + val = vload2(0, input + in_off); + val.x = (T)(((float)val.x) * alp + bet); + val.y = (T)(((float)val.y) * alp + bet); + if (has_power) { + val.x = pow((float)val.x, power); + val.y = pow((float)val.y, power); + } + vstore2(val, 0, output + out_off); + } + if (ew == 3) { + T3 val; + val = vload3(0, input + in_off); + val.x = (T)(((float)val.x) * alp + bet); + val.y = (T)(((float)val.y) * alp + bet); + val.z = (T)(((float)val.z) * alp + bet); + if (has_power) { + val.x = pow((float)val.x, power); + val.y = pow((float)val.y, power); + val.z = pow((float)val.z, power); + } + vstore3(val, 0, output + out_off); + } + } +} diff --git a/compute/tensor/src/gpu/mali/cl/prelu.cl b/compute/tensor/src/gpu/mali/cl/prelu.cl new file mode 100644 index 00000000..7708b0cc --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/prelu.cl @@ -0,0 +1,60 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
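+// PReLU: with USE_SAME a single slope from weight[0] is broadcast to all four
+// lanes, otherwise each c4 group loads its own four per-channel slopes.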
+ +#define MANGLE_NAME_IMPL(base, MD) base##MD +#define MANGLE_NAME(base, MD) MANGLE_NAME_IMPL(base, MD) + +__kernel void MANGLE_NAME(prelu_, MD)(const int ih, + const int iw, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh, + const int ow, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + __global const T *weight, + __global T *input, + __global T *output) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + if (idx >= oh || idy >= ow) { + return; + } + +#if defined(USE_SAME) + T4 wei = vload4(0, weight); + wei.y = wei.x; + wei.z = wei.x; + wei.w = wei.x; +#else + T4 wei = vload4(idz, weight); +#endif + + T4 val; + int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; + val = vload4(in_off, input); + + val.s0 = val.s0 > 0 ? val.s0 : wei.x * val.s0; + val.s1 = val.s1 > 0 ? val.s1 : wei.y * val.s1; + val.s2 = val.s2 > 0 ? val.s2 : wei.z * val.s2; + val.s3 = val.s3 > 0 ? val.s3 : wei.w * val.s3; + + int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; + vstore4(val, out_off, output); +} diff --git a/tensor_computing/src/gpu/mali/cl/reshape.cl b/compute/tensor/src/gpu/mali/cl/reshape.cl similarity index 79% rename from tensor_computing/src/gpu/mali/cl/reshape.cl rename to compute/tensor/src/gpu/mali/cl/reshape.cl index 2edc4d28..f2ee1395 100644 --- a/tensor_computing/src/gpu/mali/cl/reshape.cl +++ b/compute/tensor/src/gpu/mali/cl/reshape.cl @@ -11,17 +11,28 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - -//TODO -__kernel void reshape(const int h, const int ih_str, const int iw_str, const int ih_off, const int iw_off, const int oh_str, const int ow_str, - const int oh_off, const int ow_off, const int bx, const int by, __global const T* in, __global T* out) { +// TODO +__kernel void reshape(const int h, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int bx, + const int by, + __global const T *in, + __global T *out) +{ int idx = get_global_id(0); int idy = get_global_id(1); int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; - T4 val; + if (idx >= bx || idy >= by) { + return; + } + T4 val; int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; val = vload4(in_off, in); int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; diff --git a/compute/tensor/src/gpu/mali/cl/rnncell_build_xh.cl b/compute/tensor/src/gpu/mali/cl/rnncell_build_xh.cl new file mode 100644 index 00000000..1553c2bd --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/rnncell_build_xh.cl @@ -0,0 +1,33 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+// Concatenates the input vector x and the previous hidden state into a single
+// xh buffer so that one matrix multiply can consume both.
+__kernel void rnncell_build_xh(const int xDim,
+    const int xhDim,
+    const int s_off,
+    const int bx,
+    __global const T *xmem,
+    __global const T *smem,
+    __global T *xhmem)
+{
+    int idx = get_global_id(0);
+    if (idx >= bx) {
+        return;
+    }
+    T val = 0;
+    if (idx < xDim) {
+        val = xmem[idx];
+    } else if (idx < xhDim) {
+        val = smem[idx + s_off - xDim];
+    }
+    xhmem[idx] = val;
+}
diff --git a/compute/tensor/src/gpu/mali/cl/rnncell_update_project_res.cl b/compute/tensor/src/gpu/mali/cl/rnncell_update_project_res.cl
new file mode 100644
index 00000000..117fa1ee
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/cl/rnncell_update_project_res.cl
@@ -0,0 +1,118 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
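+// The macros below widen T vectors to float for the state arithmetic and
+// narrow them back on store, so the zoneout blending runs in full precision.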
+
+#define load_float4(off, val, buf)      \
+    {                                   \
+        T4 tmp;                         \
+        tmp = vload4(0, buf + off);     \
+        val.x = tmp.x;                  \
+        val.y = tmp.y;                  \
+        val.z = tmp.z;                  \
+        val.w = tmp.w;                  \
+    }
+
+#define load_float3(off, val, buf)      \
+    {                                   \
+        T3 tmp;                         \
+        tmp = vload3(0, buf + off);     \
+        val.x = tmp.x;                  \
+        val.y = tmp.y;                  \
+        val.z = tmp.z;                  \
+    }
+
+#define load_float2(off, val, buf)      \
+    {                                   \
+        T2 tmp;                         \
+        tmp = vload2(0, buf + off);     \
+        val.x = tmp.x;                  \
+        val.y = tmp.y;                  \
+    }
+
+#define store_float4(off, val, buf)     \
+    {                                   \
+        T4 tmp;                         \
+        tmp.x = (T)val.x;               \
+        tmp.y = (T)val.y;               \
+        tmp.z = (T)val.z;               \
+        tmp.w = (T)val.w;               \
+        vstore4(tmp, 0, buf + off);     \
+    }
+
+#define store_float3(off, val, buf)     \
+    {                                   \
+        T3 tmp;                         \
+        tmp.x = (T)val.x;               \
+        tmp.y = (T)val.y;               \
+        tmp.z = (T)val.z;               \
+        vstore3(tmp, 0, buf + off);     \
+    }
+
+#define store_float2(off, val, buf)     \
+    {                                   \
+        T2 tmp;                         \
+        tmp.x = (T)val.x;               \
+        tmp.y = (T)val.y;               \
+        vstore2(tmp, 0, buf + off);     \
+    }
+
+__kernel void rnncell_update_project_state(
+    const int hDim, const int col, const int bx, float zoneout, __global T *out, __global T *smem)
+{
+    int idx = get_global_id(0);
+    if (idx >= bx) {
+        return;
+    }
+    char eh = ((idx << 2) + 4 <= hDim) ? 4 : (hDim & 3);
+    float4 res;
+    float4 hres;
+    int off = idx << 2;
+    if (eh == 4) {
+        load_float4(off, res, out);
+    }
+    if (eh == 3) {
+        load_float3(off, res, out);
+    }
+    if (eh == 2) {
+        load_float2(off, res, out);
+    }
+    if (eh == 1) {
+        res.x = out[off];
+    }
+    hres = res;
+
+    if (zoneout != 0) {
+        if (eh == 4) {
+            load_float4(off + col, hres, smem);
+        }
+        hres.x = res.x * (1 - zoneout) + hres.x * zoneout;
+        hres.y = res.y * (1 - zoneout) + hres.y * zoneout;
+        hres.z = res.z * (1 - zoneout) + hres.z * zoneout;
+        hres.w = res.w * (1 - zoneout) + hres.w * zoneout;
+    }
+
+    if (eh == 4) {
+        store_float4(off + col, hres, smem);
+        return;
+    }
+    if (eh == 3) {
+        store_float3(off + col, hres, smem);
+        return;
+    }
+    if (eh == 2) {
+        store_float2(off + col, hres, smem);
+        return;
+    }
+    if (eh == 1) {
+        smem[off + col] = hres.x;
+    }
+}
diff --git a/compute/tensor/src/gpu/mali/cl/rnncell_update_res.cl b/compute/tensor/src/gpu/mali/cl/rnncell_update_res.cl
new file mode 100644
index 00000000..707da037
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/cl/rnncell_update_res.cl
@@ -0,0 +1,166 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
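+// LSTM cell update: i/f/o gates use the logistic function (the forget gate is
+// shifted by fbias), g uses tanh; then c' = f * c + i * g and h = o * tanh(c').
+// zonecell/zoneout optionally blend the new cell/hidden states with the old.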
+ +#define load_float4(off, val, buf) \ + { \ + T4 tmp; \ + tmp = vload4(0, buf + off); \ + val.x = tmp.x; \ + val.y = tmp.y; \ + val.z = tmp.z; \ + val.w = tmp.w; \ + } + +#define load_float3(off, val, buf) \ + { \ + T3 tmp; \ + tmp = vload3(0, buf + off); \ + val.x = tmp.x; \ + val.y = tmp.y; \ + val.z = tmp.z; \ + } + +#define load_float2(off, val, buf) \ + { \ + T2 tmp; \ + tmp = vload2(0, buf + off); \ + val.x = tmp.x; \ + val.y = tmp.y; \ + } + +#define store_float4(off, val, buf) \ + { \ + T4 tmp; \ + tmp.x = (T)val.x; \ + tmp.y = (T)val.y; \ + tmp.z = (T)val.z; \ + tmp.w = (T)val.w; \ + vstore4(tmp, 0, buf + off); \ + } +#define store_float3(off, val, buf) \ + { \ + T3 tmp; \ + tmp.x = (T)val.x; \ + tmp.y = (T)val.y; \ + tmp.z = (T)val.z; \ + vstore3(tmp, 0, buf + off); \ + } +#define store_float2(off, val, buf) \ + { \ + T2 tmp; \ + tmp.x = (T)val.x; \ + tmp.y = (T)val.y; \ + vstore2(tmp, 0, buf + off); \ + } + +__kernel void rnncell_update_res(const int col, + const uchar noproject, + const int bx, + float fbias, + float zonecell, + float zoneout, + __global T *smem, + __global T *imem, + __global T *out) +{ + int idx = get_global_id(0); + if (idx >= bx) { + return; + } + char ec = ((idx << 2) + 4 <= col) ? 4 : (col & 3); + float4 cval; + float4 lcval; + float4 ival; + float4 gval; + float4 fval; + float4 oval; + float4 res; + float4 hres; + int off = idx << 2; + load_float4(off, cval, smem); + load_float4(off, ival, imem); + load_float4(off + col, gval, imem); + load_float4(off + col * 2, fval, imem); + load_float4(off + col * 3, oval, imem); + ival.x = 1.0 / (1.0 + exp(-ival.x)); + ival.y = 1.0 / (1.0 + exp(-ival.y)); + ival.z = 1.0 / (1.0 + exp(-ival.z)); + ival.w = 1.0 / (1.0 + exp(-ival.w)); + gval.x = tanh(gval.x); + gval.y = tanh(gval.y); + gval.z = tanh(gval.z); + gval.w = tanh(gval.w); + fval.x = 1.0 / (1.0 + exp(-(fval.x + fbias))); + fval.y = 1.0 / (1.0 + exp(-(fval.y + fbias))); + fval.z = 1.0 / (1.0 + exp(-(fval.z + fbias))); + fval.w = 1.0 / (1.0 + exp(-(fval.w + fbias))); + oval.x = 1.0 / (1.0 + exp(-oval.x)); + oval.y = 1.0 / (1.0 + exp(-oval.y)); + oval.z = 1.0 / (1.0 + exp(-oval.z)); + oval.w = 1.0 / (1.0 + exp(-oval.w)); + lcval = cval; + cval.x = cval.x * fval.x + ival.x * gval.x; + cval.y = cval.y * fval.y + ival.y * gval.y; + cval.z = cval.z * fval.z + ival.z * gval.z; + cval.w = cval.w * fval.w + ival.w * gval.w; + res.x = oval.x * tanh(cval.x); + res.y = oval.y * tanh(cval.y); + res.z = oval.z * tanh(cval.z); + res.w = oval.w * tanh(cval.w); + hres = res; + + if (zonecell != 0) { + cval.x = cval.x * (1 - zonecell) + lcval.x * zonecell; + cval.y = cval.y * (1 - zonecell) + lcval.y * zonecell; + cval.z = cval.z * (1 - zonecell) + lcval.z * zonecell; + cval.w = cval.w * (1 - zonecell) + lcval.w * zonecell; + } + + if (zoneout != 0 && noproject) { + load_float4(off + col, hres, smem); + hres.x = res.x * (1 - zoneout) + hres.x * zoneout; + hres.y = res.y * (1 - zoneout) + hres.y * zoneout; + hres.z = res.z * (1 - zoneout) + hres.z * zoneout; + hres.w = res.w * (1 - zoneout) + hres.w * zoneout; + } + + if (ec == 4) { + store_float4(off, cval, smem); + store_float4(off, res, out); + if (noproject) { + store_float4(off + col, hres, smem); + } + } else { + if (ec == 1) { + smem[off] = (T)cval.x; + out[off] = (T)res.x; + if (noproject) { + smem[off + col] = (T)hres.x; + } + } + if (ec == 2) { + store_float2(off, cval, smem); + store_float2(off, res, out); + if (noproject) { + store_float2(off + col, hres, smem); + } + } + if (ec == 3) { + store_float3(off, cval, 
smem); + store_float3(off, res, out); + if (noproject) { + store_float3(off + col, hres, smem); + } + } + } +} diff --git a/compute/tensor/src/gpu/mali/cl/scale.cl b/compute/tensor/src/gpu/mali/cl/scale.cl new file mode 100644 index 00000000..2549a4b8 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/scale.cl @@ -0,0 +1,75 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define MANGLE_NAME_IMPL(base, MD) base##MD +#define MANGLE_NAME(base, MD) MANGLE_NAME_IMPL(base, MD) + +#if defined(USE_SAME) +__kernel void MANGLE_NAME(scale1_, MD) +#else +__kernel void MANGLE_NAME(scale_, MD) +#endif + (const int h, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int bx, + const int by, + __global const T *alpha, + __global const T *beta, + __global T *input, + __global T *output) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + +#if defined(USE_SAME) + T4 alp = vload4(0, alpha); + alp.y = alp.x; + alp.z = alp.x; + alp.w = alp.x; + T4 bet = 0; +#if defined(USE_BETA) + bet = vload4(0, beta); + bet.y = bet.x; + bet.z = bet.x; + bet.w = bet.x; +#endif +#else + T4 alp = vload4(idz, alpha); + T4 bet = 0; +#if defined(USE_BETA) + bet = vload4(idz, beta); +#endif +#endif + T4 val; + int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; + val = vload4(in_off, input); + + val.s0 = val.s0 * alp.x + bet.x; + val.s1 = val.s1 * alp.y + bet.y; + val.s2 = val.s2 * alp.z + bet.z; + val.s3 = val.s3 * alp.w + bet.w; + + int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; + vstore4(val, out_off, output); +} diff --git a/compute/tensor/src/gpu/mali/cl/slice_h.cl b/compute/tensor/src/gpu/mali/cl/slice_h.cl new file mode 100644 index 00000000..2124b446 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/slice_h.cl @@ -0,0 +1,57 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define MANGLE_NAME_IMPL(base, N) base##N +#define MANGLE_NAME(base, N) MANGLE_NAME_IMPL(base, N) + +__kernel void MANGLE_NAME(slice_h_, N)(const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int bx, + const int by, + __global T *input, + const int oh_str0, + const int ow_str0, + const int oh_off0, + const int ow_off0, + const int slice_end0, + __global T *output0, + const int oh_str1, + const int ow_str1, + const int oh_off1, + const int ow_off1, + const int slice_end1, + __global T *output1) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + + T4 val; + int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; + val = vload4(in_off, input); + if (idx < slice_end0) { + int out_off = (idz * ow_str0 + idy + ow_off0) * oh_str0 + idx + oh_off0; + vstore4(val, out_off, output0); + return; + } + if (idx < slice_end1) { + int out_off = (idz * ow_str1 + idy + ow_off1) * oh_str1 + idx + oh_off1; + vstore4(val, out_off, output1); + return; + } +} diff --git a/compute/tensor/src/gpu/mali/cl/softmax.cl b/compute/tensor/src/gpu/mali/cl/softmax.cl new file mode 100644 index 00000000..c135577e --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/softmax.cl @@ -0,0 +1,115 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
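+// Numerically stable softmax along the packed channel dimension: pass one
+// finds the maximum, pass two accumulates sum(exp(x - max)), pass three writes
+// exp(x - max) / sum. cd4 is the number of c4 groups; ce4 counts the valid
+// lanes in the last group.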
+ +__kernel void softmax(const int cd4, + const int ce4, + const int ih_str, + const int ihw_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int bx, + const int by, + __global T *in, + __global T *out) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + if (idx >= bx || idy >= by) { + return; + } + + float4 maxval = (float4)(-FLT_MAX); + float4 tmp; + float4 lval; + T4 val; + int index = (idy + iw_off) * ih_str + idx + ih_off; + for (int i = 0; i < cd4 - 1; i++) { + val = vload4(index + i * ihw_str, in); + tmp.x = (float)val.x; + tmp.y = (float)val.y; + tmp.z = (float)val.z; + tmp.w = (float)val.w; + maxval = fmax(maxval, tmp); + } + val = vload4(index + (cd4 - 1) * ihw_str, in); + lval.x = (float)val.x; + lval.y = (float)val.y; + lval.z = (float)val.z; + lval.w = (float)val.w; + if (maxval.x < maxval.y) { + maxval.x = maxval.y; + } + if (maxval.x < maxval.z) { + maxval.x = maxval.z; + } + if (maxval.x < maxval.w) { + maxval.x = maxval.w; + } + if (maxval.x < lval.x) { + maxval.x = lval.x; + } + if (ce4 > 1 && maxval.x < lval.y) { + maxval.x = lval.y; + } + if (ce4 > 2 && maxval.x < lval.z) { + maxval.x = lval.z; + } + if (ce4 > 3 && maxval.x < lval.w) { + maxval.x = lval.w; + } + + float sumexp = 0; + for (int i = 0; i < cd4 - 1; i++) { + val = vload4(index + i * ihw_str, in); + sumexp += exp((float)val.x - maxval.x); + sumexp += exp((float)val.y - maxval.x); + sumexp += exp((float)val.z - maxval.x); + sumexp += exp((float)val.w - maxval.x); + } + sumexp += exp(lval.x - maxval.x); + if (ce4 > 1) { + sumexp += exp(lval.y - maxval.x); + } + if (ce4 > 2) { + sumexp += exp(lval.z - maxval.x); + } + if (ce4 > 3) { + sumexp += exp(lval.w - maxval.x); + } + + sumexp = 1.0 / sumexp; + int out_off = (idy + ow_off) * oh_str + idx + oh_off; + for (int i = 0; i < cd4 - 1; i++) { + val = vload4(index + i * ihw_str, in); + val.x = exp((float)val.x - maxval.x) * sumexp; + val.y = exp((float)val.y - maxval.x) * sumexp; + val.z = exp((float)val.z - maxval.x) * sumexp; + val.w = exp((float)val.w - maxval.x) * sumexp; + vstore4(val, out_off + i * ohw_str, out); + } + val.x = exp(lval.x - maxval.x) * sumexp; + if (ce4 > 1) { + val.y = exp(lval.y - maxval.x) * sumexp; + } + if (ce4 > 2) { + val.z = exp(lval.z - maxval.x) * sumexp; + } + if (ce4 > 3) { + val.w = exp(lval.w - maxval.x) * sumexp; + } + vstore4(val, out_off + (cd4 - 1) * ohw_str, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/softmax_h1w1_max_all.cl b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_max_all.cl new file mode 100644 index 00000000..2e162b16 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_max_all.cl @@ -0,0 +1,45 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__kernel void softmax_h1w1_max_all(const int kn, __global T *in) +{ + int idx = get_global_id(0); + if (idx >= 1) { + return; + } + + float4 maxval = (float4)(-FLT_MAX); + float4 tmp; + T4 val; + int kn4 = kn >> 2; + for (int i = 0; i < kn4; ++i) { + val = vload4(i, in); + tmp.x = (float)val.x; + tmp.y = (float)val.y; + tmp.z = (float)val.z; + tmp.w = (float)val.w; + maxval = fmax(maxval, tmp); + } + + if (maxval.x < maxval.y) { + maxval.x = maxval.y; + } + if (maxval.x < maxval.z) { + maxval.x = maxval.z; + } + if (maxval.x < maxval.w) { + maxval.x = maxval.w; + } + + in[kn + 1] = maxval.x; +} \ No newline at end of file diff --git a/compute/tensor/src/gpu/mali/cl/softmax_h1w1_max_part.cl b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_max_part.cl new file mode 100644 index 00000000..ead93c75 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_max_part.cl @@ -0,0 +1,60 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
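+// First stage of a two-pass max reduction for the h = 1, w = 1 softmax path:
+// each of the kn work-items strides over the cd4 channel vectors and writes a
+// partial maximum to out[idx]; softmax_h1w1_max_all above then folds the kn
+// partials into a single scalar. The temporary buffer layout appears to be:
+// [0, kn) partial results, [kn] the reciprocal sum, [kn + 1] the maximum.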
+ +__kernel void softmax_h1w1_max_part( + const int cd4, const int ce4, const int kn, __global const T *in, __global T *out) +{ + int idx = get_global_id(0); + if (idx >= kn) { + return; + } + + float4 maxval = (float4)(-FLT_MAX); + float4 tmp; + T4 val; + + for (int i = idx; i < cd4 - 1; i = i + kn) { + val = vload4(i, in); + tmp.x = (float)val.x; + tmp.y = (float)val.y; + tmp.z = (float)val.z; + tmp.w = (float)val.w; + maxval = fmax(maxval, tmp); + } + + if (maxval.x < maxval.y) { + maxval.x = maxval.y; + } + if (maxval.x < maxval.z) { + maxval.x = maxval.z; + } + if (maxval.x < maxval.w) { + maxval.x = maxval.w; + } + + if (idx == kn - 1) { + val = vload4(cd4 - 1, in); + maxval.x = fmax((float)val.x, maxval.x); + if (ce4 >= 2) { + maxval.x = fmax((float)val.y, maxval.x); + } + if (ce4 >= 3) { + maxval.x = fmax((float)val.z, maxval.x); + } + if (ce4 >= 4) { + maxval.x = fmax((float)val.w, maxval.x); + } + } + + out[idx] = (T)maxval.x; +} diff --git a/compute/tensor/src/gpu/mali/cl/softmax_h1w1_output.cl b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_output.cl new file mode 100644 index 00000000..049d43e4 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_output.cl @@ -0,0 +1,37 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__kernel void softmax_h1w1_output(const int cd4, + const int ce4, + const int kn, + __global const T *in, + __global const T *tmp, + __global T *out) +{ + int idx = get_global_id(0); + if (idx >= cd4) { + return; + } + T4 val; + + val = vload4(idx, in); + float maxv = (float)(tmp[kn + 1]); + float sumexp = (float)(tmp[kn]); + + val.x = (T)(exp((float)val.x - maxv) * sumexp); + val.y = (T)(exp((float)val.y - maxv) * sumexp); + val.z = (T)(exp((float)val.z - maxv) * sumexp); + val.w = (T)(exp((float)val.w - maxv) * sumexp); + + vstore4(val, idx, out); +} \ No newline at end of file diff --git a/compute/tensor/src/gpu/mali/cl/softmax_h1w1_sum_all.cl b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_sum_all.cl new file mode 100644 index 00000000..08014343 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_sum_all.cl @@ -0,0 +1,33 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__kernel void softmax_h1w1_sum_all(const int kn, __global T *in) +{ + int idx = get_global_id(0); + if (idx >= 1) { + return; + } + + T4 val; + float sumexp = 0; + int kn4 = kn >> 2; + for (int i = 0; i < kn4; ++i) { + val = vload4(i, in); + sumexp += (float)val.x; + sumexp += (float)val.y; + sumexp += (float)val.z; + sumexp += (float)val.w; + } + + in[kn] = (T)(1.0f / sumexp); +} diff --git a/compute/tensor/src/gpu/mali/cl/softmax_h1w1_sum_part.cl b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_sum_part.cl new file mode 100644 index 00000000..aae3784c --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_sum_part.cl @@ -0,0 +1,49 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
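+// Second stage of the h1w1 reduction: every work-item accumulates
+// exp(x - max) over a strided slice of the cd4 channel vectors, reading the
+// global maximum that the max pass stored at out[kn + 1]; the kn partial sums
+// are then folded by softmax_h1w1_sum_all above, which writes 1 / sumexp at
+// index kn for softmax_h1w1_output to consume.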
+ +__kernel void softmax_h1w1_sum_part( + const int cd4, const int ce4, const int kn, __global const T *in, __global T *out) +{ + int idx = get_global_id(0); + if (idx >= kn) { + return; + } + + T4 val; + float maxval = (float)(out[kn + 1]); + float sumexp = 0.0f; + for (int i = idx; i < cd4 - 1; i = i + kn) { + val = vload4(i, in); + + sumexp += exp((float)val.x - maxval); + sumexp += exp((float)val.y - maxval); + sumexp += exp((float)val.z - maxval); + sumexp += exp((float)val.w - maxval); + } + + if (idx == kn - 1) { + val = vload4(cd4 - 1, in); + sumexp += exp((float)val.x - maxval); + if (ce4 >= 2) { + sumexp += exp((float)val.y - maxval); + } + if (ce4 >= 3) { + sumexp += exp((float)val.z - maxval); + } + if (ce4 >= 4) { + sumexp += exp((float)val.w - maxval); + } + } + + out[idx] = (T)sumexp; +} \ No newline at end of file diff --git a/compute/tensor/src/gpu/mali/cl/softmax_nchw_c.cl b/compute/tensor/src/gpu/mali/cl/softmax_nchw_c.cl new file mode 100644 index 00000000..5825115a --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/softmax_nchw_c.cl @@ -0,0 +1,88 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__kernel void softmax_nchw_c(const int c, + const int iw_str, + const int ihw_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int ohw_str, + const int ow_off, + const int oh_off, + const int ow, + const int bx, + const int by, + __global T *in, + __global T *out) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + if (idx >= bx || idy >= by) { + return; + } + int ew = ((idx << 2) + 4 <= ow) ? 
4 : (ow & 3); + + float4 maxval = (float4)(-FLT_MAX); + float4 tmp; + T4 val; + int index = (idy + ih_off) * iw_str + (idx << 2) + iw_off; + for (int i = 0; i < c; i++) { + val = vload4(0, in + index + i * ihw_str); + tmp.x = (float)val.x; + tmp.y = (float)val.y; + tmp.z = (float)val.z; + tmp.w = (float)val.w; + maxval = fmax(maxval, tmp); + } + + float4 sumexp = 0; + for (int i = 0; i < c; i++) { + val = vload4(0, in + index + i * ihw_str); + sumexp.x += exp((float)val.x - maxval.x); + sumexp.y += exp((float)val.y - maxval.y); + sumexp.z += exp((float)val.z - maxval.z); + sumexp.w += exp((float)val.w - maxval.w); + } + + sumexp.x = 1.0 / sumexp.x; + sumexp.y = 1.0 / sumexp.y; + sumexp.z = 1.0 / sumexp.z; + sumexp.w = 1.0 / sumexp.w; + int out_off = (idy + oh_off) * ow_str + (idx << 2) + ow_off; + if (ew == 4) { + for (int i = 0; i < c; i++) { + val = vload4(0, in + index + i * ihw_str); + val.x = exp((float)val.x - maxval.x) * sumexp.x; + val.y = exp((float)val.y - maxval.y) * sumexp.y; + val.z = exp((float)val.z - maxval.z) * sumexp.z; + val.w = exp((float)val.w - maxval.w) * sumexp.w; + vstore4(val, 0, out + out_off + i * ohw_str); + } + } else { + for (int i = 0; i < c; i++) { + val = vload4(0, in + index + i * ihw_str); + val.x = exp((float)val.x - maxval.x) * sumexp.x; + val.y = exp((float)val.y - maxval.y) * sumexp.y; + val.z = exp((float)val.z - maxval.z) * sumexp.z; + if (ew < 2) { + val.y = 0; + } + if (ew < 3) { + val.z = 0; + } + val.w = 0; + vstore4(val, 0, out + out_off + i * ohw_str); + } + } +} diff --git a/compute/tensor/src/gpu/mali/cl/softmax_nchw_w.cl b/compute/tensor/src/gpu/mali/cl/softmax_nchw_w.cl new file mode 100644 index 00000000..90f1e079 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/softmax_nchw_w.cl @@ -0,0 +1,116 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
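+// Softmax along the W axis of plain NCHW data. wd4 and we4 play the same
+// roles as cd4 and ce4 in the channel-direction kernel: vload4(i, in + index)
+// walks four consecutive w positions of a single row, so no ihw_str stride is
+// needed between loads, and we4 masks the lanes of the trailing vector.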
+ +__kernel void softmax_nchw_w(const int wd4, + const int we4, + const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int bx, + const int by, + __global T *in, + __global T *out) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + if (idx >= bx || idy >= by) { + return; + } + + float4 maxval = (float4)(-FLT_MAX); + float4 tmp; + float4 lval; + T4 val; + + int index = (idy * ih_str + idx + ih_off) * iw_str + iw_off; + for (int i = 0; i < wd4 - 1; i++) { + val = vload4(i, in + index); + tmp.x = (float)val.x; + tmp.y = (float)val.y; + tmp.z = (float)val.z; + tmp.w = (float)val.w; + maxval = fmax(maxval, tmp); + } + val = vload4(wd4 - 1, in + index); + lval.x = (float)val.x; + lval.y = (float)val.y; + lval.z = (float)val.z; + lval.w = (float)val.w; + if (maxval.x < maxval.y) { + maxval.x = maxval.y; + } + if (maxval.x < maxval.z) { + maxval.x = maxval.z; + } + if (maxval.x < maxval.w) { + maxval.x = maxval.w; + } + if (maxval.x < lval.x) { + maxval.x = lval.x; + } + if (we4 > 1 && maxval.x < lval.y) { + maxval.x = lval.y; + } + if (we4 > 2 && maxval.x < lval.z) { + maxval.x = lval.z; + } + if (we4 > 3 && maxval.x < lval.w) { + maxval.x = lval.w; + } + + float sumexp = 0; + for (int i = 0; i < wd4 - 1; i++) { + val = vload4(i, in + index); + sumexp += exp((float)val.x - maxval.x); + sumexp += exp((float)val.y - maxval.x); + sumexp += exp((float)val.z - maxval.x); + sumexp += exp((float)val.w - maxval.x); + } + sumexp += exp(lval.x - maxval.x); + if (we4 > 1) { + sumexp += exp(lval.y - maxval.x); + } + if (we4 > 2) { + sumexp += exp(lval.z - maxval.x); + } + if (we4 > 3) { + sumexp += exp(lval.w - maxval.x); + } + + sumexp = 1.0 / sumexp; + int out_off = (idy * oh_str + idx + oh_off) * ow_str + ow_off; + for (int i = 0; i < wd4 - 1; i++) { + val = vload4(i, in + index); + val.x = exp((float)val.x - maxval.x) * sumexp; + val.y = exp((float)val.y - maxval.x) * sumexp; + val.z = exp((float)val.z - maxval.x) * sumexp; + val.w = exp((float)val.w - maxval.x) * sumexp; + vstore4(val, i, out + out_off); + } + val.x = exp(lval.x - maxval.x) * sumexp; + if (we4 > 1) { + val.y = exp(lval.y - maxval.x) * sumexp; + } + if (we4 > 2) { + val.z = exp(lval.z - maxval.x) * sumexp; + } + if (we4 > 3) { + val.w = exp(lval.w - maxval.x) * sumexp; + } + vstore4(val, wd4 - 1, out + out_off); +} diff --git a/compute/tensor/src/gpu/mali/cl/space2depth.cl b/compute/tensor/src/gpu/mali/cl/space2depth.cl new file mode 100644 index 00000000..b145410f --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/space2depth.cl @@ -0,0 +1,48 @@ +__kernel void space2depth(const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int oh_str, + const int ohw_str, + const int ow_off, + const int oh_off, + const int bx, + const int by, + __global const uchar *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + if (idx >= bx || idy >= by) { + return; + } + + const int in_off = ((idx << 2) + ih_off) * iw_str + (idy << 2) + iw_off; + uchar4 tmp0 = vload4(0, in + in_off); + uchar4 tmp1 = vload4(0, in + in_off + iw_str); + uchar4 tmp2 = vload4(0, in + in_off + (iw_str << 1)); + uchar4 tmp3 = vload4(0, in + in_off + iw_str * 3); + T4 val0, val1, val2, val3; + val0.x = tmp0.x / (T)(255); + val0.y = tmp0.y / (T)(255); + val0.z = tmp0.z / (T)(255); + val0.w = tmp0.w / (T)(255); + val1.x = tmp1.x / (T)(255); + val1.y = tmp1.y / (T)(255); + val1.z = 
tmp1.z / (T)(255); + val1.w = tmp1.w / (T)(255); + val2.x = tmp2.x / (T)(255); + val2.y = tmp2.y / (T)(255); + val2.z = tmp2.z / (T)(255); + val2.w = tmp2.w / (T)(255); + val3.x = tmp3.x / (T)(255); + val3.y = tmp3.y / (T)(255); + val3.z = tmp3.z / (T)(255); + val3.w = tmp3.w / (T)(255); + + const int out_off = (idy + ow_off) * oh_str + idx + oh_off; + vstore4(val0, out_off, out); + vstore4(val1, out_off + ohw_str, out); + vstore4(val2, out_off + ohw_str * 2, out); + vstore4(val3, out_off + ohw_str * 3, out); +} diff --git a/tensor_computing/src/gpu/mali/cl/squeeze.cl b/compute/tensor/src/gpu/mali/cl/squeeze.cl similarity index 80% rename from tensor_computing/src/gpu/mali/cl/squeeze.cl rename to compute/tensor/src/gpu/mali/cl/squeeze.cl index 51717f11..e44aa6fb 100644 --- a/tensor_computing/src/gpu/mali/cl/squeeze.cl +++ b/compute/tensor/src/gpu/mali/cl/squeeze.cl @@ -11,17 +11,26 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - -__kernel void squeeze(const int h, const int w, const int ih_str, const int iw_str, const int ih_off, const int iw_off, - const int oh_str, const int ow_str, const int oh_off, const int ow_off, __global const T* in, __global T* out) { +__kernel void squeeze(const int h, + const int w, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + __global const T *in, + __global T *out) +{ int idx = get_global_id(0); int idy = get_global_id(1); - if(idx >= h || idy >= w) return; + if (idx >= h || idy >= w) { + return; + } int idz = get_global_id(2); - T4 val; + T4 val; int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; val = vload4(in_off, in); int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; diff --git a/compute/tensor/src/gpu/mali/cl/transpose_3d_nchw.cl b/compute/tensor/src/gpu/mali/cl/transpose_3d_nchw.cl new file mode 100644 index 00000000..dd9c513c --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/transpose_3d_nchw.cl @@ -0,0 +1,119 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
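+// 4-D (w, h, t, c) permutation. dimX apparently names the input axis that
+// feeds output axis X, with axes numbered 0 = w, 1 = h, 2 = t, 3 = c, and
+// out_str is the output stride of whichever axis receives the vectorized
+// input-w coordinate, so the four lanes of each T4 can be scattered with that
+// stride. For example, swapping w and h (dim0 = 1, dim1 = 0, dim2 = 2,
+// dim3 = 3) stores the four lanes down a column with out_str = ow_str.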
+
+__kernel void transpose_3d_nchw(const int iw_str,
+    const int ih_str,
+    const int iw_off,
+    const int ih_off,
+    const int ow_str,
+    const int oh_str,
+    const int ow_off,
+    const int oh_off,
+    const int dim0,
+    const int dim1,
+    const int dim2,
+    const int dim3,
+    const int iw,
+    const int it,
+    const int ot,
+    const int bx,
+    const int by,
+    __global const T *in,
+    __global T *out)
+{
+    int idx = get_global_id(0);
+    int idy = get_global_id(1);
+    int idz = get_global_id(2);
+    int idt = idz % it;
+    int idc = idz / it;
+    if (idx >= bx || idy >= by) {
+        return;
+    }
+    char ew = (((idx << 2) + 4) <= iw) ? 4 : (iw & 3);
+    T4 val = 0;
+    const int in_off = (idz * ih_str + idy + ih_off) * iw_str + (idx << 2) + iw_off;
+    if (ew == 4) {
+        val = vload4(0, in + in_off);
+    } else {
+        if (ew == 1) {
+            val.x = in[in_off];
+        }
+        if (ew == 2) {
+            val.xy = vload2(0, in + in_off);
+        }
+        if (ew == 3) {
+            val.xyz = vload3(0, in + in_off);
+        }
+    }
+    int idox = idx << 2;
+    int idoy = idy;
+    int idot = idt;
+    int idoc = idc;
+
+    int out_str = 1;
+
+    if (dim0 == 1) {
+        idox = idy;
+    }
+    if (dim0 == 2) {
+        idox = idt;
+    }
+    if (dim0 == 3) {
+        idox = idc;
+    }
+
+    if (dim1 == 0) {
+        idoy = idx << 2;
+        out_str = ow_str;
+    }
+    if (dim1 == 2) {
+        idoy = idt;
+    }
+    if (dim1 == 3) {
+        idoy = idc;
+    }
+
+    if (dim2 == 0) {
+        idot = idx << 2;
+        out_str = ow_str * oh_str;
+    }
+    if (dim2 == 1) {
+        idot = idy;
+    }
+    if (dim2 == 3) {
+        idot = idc;
+    }
+
+    if (dim3 == 0) {
+        idoc = idx << 2;
+        out_str = ow_str * oh_str * ot;
+    }
+    if (dim3 == 1) {
+        idoc = idy;
+    }
+    if (dim3 == 2) {
+        idoc = idt;
+    }
+
+    int out_off = ((idoc * ot + idot) * oh_str + idoy + oh_off) * ow_str + idox + ow_off;
+    out[out_off] = val.x;
+    if (ew > 1) {
+        out[out_off + out_str] = val.y;
+    }
+    if (ew > 2) {
+        out[out_off + out_str * 2] = val.z;
+    }
+    if (ew > 3) {
+        out[out_off + out_str * 3] = val.w;
+    }
+}
diff --git a/compute/tensor/src/gpu/mali/cl/transpose_nchw.cl b/compute/tensor/src/gpu/mali/cl/transpose_nchw.cl
new file mode 100644
index 00000000..b78112b3
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/cl/transpose_nchw.cl
@@ -0,0 +1,92 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
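+// Three-axis (w, h, c) specialization of the 3-D transpose above: the same
+// per-lane scatter, minus the t dimension. An NCHW h/w swap would presumably
+// use dim0 = 1, dim1 = 0, dim2 = 2, giving out_str = ow_str and ew masking
+// when iw is not a multiple of 4.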
+ +__kernel void transpose_nchw(const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int dim0, + const int dim1, + const int dim2, + const int iw, + const int bx, + const int by, + __global const T *in, + __global T *out) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + char ew = (((idx << 2) + 4) <= iw) ? 4 : (iw & 3); + T4 val = 0; + const int in_off = (idz * ih_str + idy + ih_off) * iw_str + (idx << 2) + iw_off; + if (ew == 4) { + val = vload4(0, in + in_off); + } else { + if (ew == 1) { + val.x = in[in_off]; + } + if (ew == 2) { + val.xy = vload2(0, in + in_off); + } + if (ew == 3) { + val.xyz = vload3(0, in + in_off); + } + } + int ox = idx << 2; + int oy = idy; + int oz = idz; + int out_str = 1; + + if (dim0 == 1) { + ox = idy; + } + if (dim0 == 2) { + ox = idz; + } + + if (dim1 == 0) { + oy = idx << 2; + out_str = ow_str; + } + if (dim1 == 2) { + oy = idz; + } + + if (dim2 == 0) { + oz = idx << 2; + out_str = ow_str * oh_str; + } + if (dim2 == 1) { + oz = idy; + } + + int out_off = (oz * oh_str + oy + oh_off) * ow_str + ox + ow_off; + out[out_off] = val.x; + if (ew > 1) { + out[out_off + out_str] = val.y; + } + if (ew > 2) { + out[out_off + out_str * 2] = val.z; + } + if (ew > 3) { + out[out_off + out_str * 3] = val.w; + } +} diff --git a/compute/tensor/src/gpu/mali/clip.cpp b/compute/tensor/src/gpu/mali/clip.cpp new file mode 100644 index 00000000..93878bfa --- /dev/null +++ b/compute/tensor/src/gpu/mali/clip.cpp @@ -0,0 +1,85 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
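+// Host-side wrapper for the GPU clip kernel. The pattern here is shared by
+// the other operators in this directory: *_infer_output_size_mali derives the
+// output TensorDesc and fills the GCLMem descriptors (the TensorDesc keeps
+// the CPU-side NCHW view while GCLMemDesc records the NCWHC4 layout used on
+// the GPU), *_checkpara_mali validates pointers and formats, and *_mali
+// dispatches on DataType to the fp16 implementation.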
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/clip_mali_fp16.h" + +EE clip_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + if (outputDesc) { + *outputDesc = inputDesc; + } + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + + if (idf == DF_NCHW) { + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + if (gclmemInputDesc && gclmemOutputDesc) { + *gclmemOutputDesc = *gclmemInputDesc; // the input and output mem maybe the same + } + return SUCCESS; + } + return NOT_SUPPORTED; +} + +inline EE clip_checkpara_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + if (inputDesc.df != outputDesc.df || inputDesc.df != DF_NCHW) { + return NOT_SUPPORTED; + } + if (input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE clip_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ClipParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(clip_checkpara_mali(handle, inputDesc, input, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = clip_mali_fp16(handle, inputDesc, input, p, outputDesc, output); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/concat.cpp b/compute/tensor/src/gpu/mali/concat.cpp new file mode 100644 index 00000000..5e5fff4d --- /dev/null +++ b/compute/tensor/src/gpu/mali/concat.cpp @@ -0,0 +1,167 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/concat_mali_fp16.h" + +EE concat_infer_output_size_mali(std::vector inputDesc, + ConcatParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + if (outputDesc) { + *outputDesc = inputDesc[0]; + } + U32 sumDimSize = 0; + I32 dim = inputDesc[0].nDims; + int concatDim = p.axis; + concatDim = (concatDim + dim) % dim; + concatDim = dim - 1 - concatDim; + for (auto p : inputDesc) { + if (inputDesc[0].df != p.df) { + CHECK_STATUS(NOT_SUPPORTED); + } + } + if (inputDesc[0].df == DF_MKT) { + concatDim = 1 - concatDim; + } + for (U32 i = 0; i < inputDesc.size(); i++) { + sumDimSize += inputDesc[i].dims[concatDim]; + } + + if (outputDesc) { + *outputDesc = inputDesc[0]; + (*outputDesc).dims[concatDim] = sumDimSize; + } + + if (gclmemInputDesc && gclmemOutputDesc) { + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + for (U32 i = 0; i < inputDesc.size(); i++) { + tensorSelectGet(inputDesc[i], &idt, &idf, &in, &ic, &ih, &iw); + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, &gclmemInputDesc[i], gclmemOutputDesc)); + } + U32 s0 = gclmemOutputDesc->stride[0]; + U32 s1 = gclmemOutputDesc->stride[1]; + U32 s2 = gclmemOutputDesc->stride[2]; + if (inputDesc[0].df == DF_NCHW) { + if (concatDim == 0) { + s1 = sumDimSize; + } else if (concatDim == 1) { + s0 = sumDimSize; + } else if (concatDim == 2) { + s2 = (sumDimSize + 3) / 4; + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + } + if (inputDesc[0].df == DF_MKT || inputDesc[0].df == DF_MTK) { + if (concatDim == 0) { + s2 = (sumDimSize + 3) / 4; + } else if (concatDim == 1) { + s0 = sumDimSize; + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + } + gclmemOutputDesc->stride[0] = s0; + gclmemOutputDesc->stride[1] = s1; + gclmemOutputDesc->stride[2] = s2; + gclmemOutputDesc->num = s0 * s1 * s2 * 4; + gclmemOutputDesc->byteSize = s0 * s1 * s2 * 4 * bytesOf(idt); + } + return SUCCESS; +} + +inline EE concat_checkpara_mali(GCLHandle_t handle, + std::vector inputDesc, + std::vector input, + ConcatParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (handle == nullptr || nullptr == output) { + return NULL_POINTER; + } + if (input.size() < 1) { + return NOT_MATCH; + } + for (auto it : inputDesc) { + if (it.df != outputDesc.df) { + return NOT_MATCH; + } + } + if (outputDesc.df != DF_NCHW && outputDesc.df != DF_MKT && outputDesc.df != DF_MTK) { + return NOT_SUPPORTED; + } + for (auto it : input) { + GCLMem_t ptr = (GCLMem_t)it; + if (ptr == nullptr) { + return NULL_POINTER; + } + if (ptr->desc.memFormat != output->desc.memFormat) { + return NOT_SUPPORTED; + } + } + return SUCCESS; +} + +EE concat_infer_forward_tmp_bytes_mali(std::vector inputDesc, U32 *bytes) +{ + EE ret = SUCCESS; + switch (inputDesc[0].dt) { + case DT_F16: { + ret = concat_infer_forward_tmp_bytes_mali_fp16(inputDesc, bytes); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE concat_mali(GCLHandle_t handle, + std::vector inputDesc, + std::vector input, + GCLMem_t inputScale, + ConcatParamSpec p, + GCLMem_t tmpbuf, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t outputScale) +{ + UNUSED(inputScale); + UNUSED(outputScale); + EE ret = SUCCESS; + CHECK_STATUS(concat_checkpara_mali(handle, inputDesc, input, p, outputDesc, 
+    switch (inputDesc[0].dt) {
+        case DT_F16: {
+            ret = concat_mali_fp16(handle, inputDesc, input, outputDesc, output, tmpbuf, p.axis);
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/gpu/mali/convolution.cpp b/compute/tensor/src/gpu/mali/convolution.cpp
new file mode 100644
index 00000000..c9510349
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/convolution.cpp
@@ -0,0 +1,606 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <float.h>
+#include "sys.h"
+#include "types.h"
+#include "tensor_desc.h"
+#include "error.h"
+#include "gpu/mali/tensor_computing_mali.h"
+#include "gpu/mali/fp16/convolution_mali_fp16.h"
+
+inline void convolution_produce_algos_paras(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    ConvolutionParamSpec convParamSpec,
+    DataFormat inputGclmemFormat,
+    std::vector<ConvolutionForwardAlgorithm> *convolutionAlgorithms,
+    std::vector<U32> *algoNumIndex,
+    std::vector<U32> *vecW,
+    std::vector<U32> *vecC,
+    std::vector<U32> *vecK)
+{
+    DataFormat idf;
+    U32 ic, ih, iw, fn, fh, fw, sh, sw;
+    tensorSelectGet(inputDesc, NULL, &idf, NULL, &ic, &ih, &iw);
+    tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, &fh, &fw);
+    sh = convParamSpec.stride_h;
+    sw = convParamSpec.stride_w;
+    U32 configInfo[3][128];
+    U32 configNums[2];
+    ConvolutionForwardAlgorithm algo[2];
+    U32 algoNum = 1;
+    algo[0] = CONVOLUTION_ALGORITHM_DIRECT;
+    if (inputGclmemFormat == DF_NCHW && (ih != 1 || iw != 1 || fw != 1 || fh != 1)) {
+        configInfo[0][0] = (8 * sw - (fw - 1)) / sw;
+        configInfo[1][0] = 1;
+        configInfo[2][0] = 4;
+        configNums[0] = 1;
+    } else if (fn == 1 && sw == 1 && (fw == fh) && (fw == 1 || fw == 3 || fw == 5 || fw == 7)) {
+        configInfo[0][0] = (fw == 7) ?
6 : 8; + configInfo[1][0] = 4; + configInfo[2][0] = 1; + configNums[0] = 1; + } else { + if (fw == 3 && fh == 3 && sw == 1 && sh == 1) { + algo[1] = CONVOLUTION_ALGORITHM_WINOGRAD; + algoNum = 2; + } + U32 configNum = 0; + for (U32 ii = 0; ii < algoNum; ii++) { + if (algo[ii] == CONVOLUTION_ALGORITHM_DIRECT) { + if (ih == 1 && iw == 1 && fh == 1 && fw == 1) { + U32 j = 8; + for (U32 i = 0; i < 3; i++) { + configInfo[0][configNum] = 1; + configInfo[1][configNum] = 1 << (2 + i); + configInfo[2][configNum] = 0; + configNum++; + if (ic % j != 0) { + break; + } + j = j << 1; + } + } else { + U32 k = 4; + U32 nj = 8; + for (U32 i = 0; i < 2; i++) { + for (U32 j = 0; j < nj; j++) { + configInfo[0][configNum] = j + 1; + configInfo[1][configNum] = 4; + configInfo[2][configNum] = k; + configNum++; + } + k = k << 1; + if (fn % k != 0) { + break; + } + nj = 4; + } + if (fw == 1 && fh == 1 && sw == 1 && sh == 1) { + U32 k = 4; + U32 nj = 2; + for (U32 i = 0; i < 3; i++) { + U32 w = 2; + if (i == 2) { + nj = 1; + } + for (U32 j = 0; j < nj; j++) { + if (ih % w != 0) { + continue; + } + configInfo[0][configNum] = w << 8; + configInfo[1][configNum] = 4; + configInfo[2][configNum] = k; + configNum += 1; + w = w << 1; + } + k = k << 1; + if (fn % k != 0) { + break; + } + } + if (fn % 16 == 0) { + for (U32 i = 0; i < 3; i++) { + configInfo[0][configNum] = i + 1; + configInfo[1][configNum] = 4; + configInfo[2][configNum] = 16; + configNum++; + } + } + } + } + } + + if (algo[ii] == CONVOLUTION_ALGORITHM_WINOGRAD) { + for (U32 i = 1; i <= 8; i++) { + for (U32 j = 4; j <= 8; j += 4) { + if (i * j <= 2) { + continue; + } + configInfo[0][configNum] = i; + configInfo[1][configNum] = 1; + configInfo[2][configNum] = j; + configNum++; + } + } + } + configNums[ii] = configNum; + } + } + + for (U32 i = 0; i < algoNum; i++) { + (*convolutionAlgorithms).push_back(algo[i]); + (*algoNumIndex).push_back(configNums[i]); + U32 be = (i == 0) ? 
0 : configNums[i - 1];
+        U32 end = configNums[i];
+        for (U32 j = be; j < end; j++) {
+            if (vecW) {
+                (*vecW).push_back(configInfo[0][j]);
+            }
+            if (vecC) {
+                (*vecC).push_back(configInfo[1][j]);
+            }
+            if (vecK) {
+                (*vecK).push_back(configInfo[2][j]);
+            }
+        }
+    }
+}
+
+EE convolution_infer_output_size_mali(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc *outputDesc,
+    GCLMemDesc_t gclmemInputDesc,
+    GCLMemDesc_t gclmemOutputDesc)
+{
+    if (outputDesc == nullptr || gclmemInputDesc == nullptr || gclmemOutputDesc == nullptr) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType idt, fdt;
+    DataFormat idf, fdf;
+    U32 iw, ih, ic, in, it;
+    U32 fw, fh, fc, fn, ft;
+    U32 ow, oh, ot;
+    U32 sw, sh, st, dw, dh, fdw, fdh;
+    U32 pl, pr, pt, pb, pt_b, pt_a;
+    tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw, &it);
+    tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw, &ft);
+    pl = convParamSpec.padding_left;
+    pr = convParamSpec.padding_right;
+    pt = convParamSpec.padding_top;
+    pb = convParamSpec.padding_bottom;
+    sw = convParamSpec.stride_w;
+    sh = convParamSpec.stride_h;
+    dw = convParamSpec.dilatedRate_w;
+    dh = convParamSpec.dilatedRate_h;
+    pt_b = convParamSpec.padding_before;
+    pt_a = convParamSpec.padding_after;
+    st = convParamSpec.stride_t;
+
+    if (fw > 7 || fw == 6) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (in != 1) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (fw < 1 || fh < 1) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (dw != 1 || dh != 1) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (sw != 1 && sw != 2) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (sh != 1 && sh != 2) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    fdw = (fw - 1) * dw + 1;
+    fdh = (fh - 1) * dh + 1;
+    ow = (iw + pl + pr - fdw) / sw + 1;
+    oh = (ih + pt + pb - fdh) / sh + 1;
+    ot = (inputDesc.df == DF_NCTHW) ? (it + pt_b + pt_a - ft) / st + 1 : 1;
+
+    U32 iw_align, ih_align, item_w, ext_w, ext_h;
+    bool need_pad = false;
+
+    if (inputDesc.df == DF_NCTHW) {
+        *outputDesc = tensor5df(idt, idf, in, fn, ot, oh, ow);
+    } else {
+        *outputDesc = tensor4df(idt, idf, in, fn, oh, ow);
+    }
+    ext_w = (fw / 2 < pl) ? pl : fw / 2;  // if fw / 2 < pl, use pl as offset
+    ext_h = pt;
+
+    std::vector<ConvolutionForwardAlgorithm> convolutionAlgorithms;
+    std::vector<U32> algoNumIndex;
+    std::vector<U32> vecW;
+    DataFormat inputGclmemFormat = DF_NCWHC4;
+    if (gclmemInputDesc->byteSize == 0) {
+        inputGclmemFormat = DF_NCHW;
+    }
+    convolution_produce_algos_paras(inputDesc, filterDesc, convParamSpec, inputGclmemFormat,
+        &convolutionAlgorithms, &algoNumIndex, &vecW, NULL, NULL);
+    iw_align = ow;
+    for (auto p : convolutionAlgorithms) {
+        U32 tmp_align = 0;
+        if (p == CONVOLUTION_ALGORITHM_WINOGRAD) {
+            tmp_align = ALIGN(ow, 16);
+        } else {
+            for (U32 i = 0; i < algoNumIndex[0]; i++) {
+                item_w = vecW[i];
+                item_w = ((item_w >> 8) > 0) ? 1 : item_w;
+                U32 j = ALIGN(ow, item_w);
+                tmp_align = (tmp_align < j) ? j : tmp_align;
+            }
+        }
+        iw_align = (iw_align < tmp_align) ? tmp_align : iw_align;
+    }
+    iw_align = iw_align * sw;
+    ih_align = ih + pt + pb;
+    ih_align = ih_align - ext_h * 2;
+
+    if (pl < ext_w) {  // if fw / 2 > pl, use pl as offset, and pad (ext_w - pl) * 2 in the end
+        iw_align = iw_align + 2 * (ext_w - pl);
+        ext_w = pl;
+    }
+    if (iw_align != iw || ih_align != ih) {
+        need_pad = true;
+    }
+    if (ext_w != 0 || ext_h != 0) {
+        need_pad = true;
+    }
+
+    if (fw == 1 && fh == 1 && ft == 1 && iw == 1 && ih == 1 && it == 1) {
+        CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw_align, ih_align, ic, ext_w, ext_h, ow, oh, fn, idt,
+            idt, gclmemInputDesc, gclmemOutputDesc, need_pad));
+        return SUCCESS;
+    }
+
+    if (inputGclmemFormat == DF_NCHW) {
+        if (fw == fh && (fw == 1 || fw == 3 || fw == 5 || fw == 7)) {
+            CHECK_STATUS(infer_gclmem_desc_nchw(iw_align, ih_align, ic * it, ext_w, ext_h, 0, 0, 0,
+                idt, idt, gclmemInputDesc, NULL, need_pad));
+        } else {
+            ic = ALIGN(ic, 4);
+            CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw_align, ih_align, ic * it, ext_w, ext_h, 0, 0,
+                0, idt, idt, gclmemInputDesc, NULL, need_pad));
+        }
+        fn = ALIGN(fn, 4);
+        CHECK_STATUS(infer_gclmem_desc_ncwhc4(
+            0, 0, 0, 0, 0, ow, oh, fn * ot, idt, idt, NULL, gclmemOutputDesc));
+        return SUCCESS;
+    }
+
+    ic = ALIGN(ic, 4);
+    CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw_align, ih_align, ic * it, ext_w, ext_h, 0, 0, 0, idt,
+        idt, gclmemInputDesc, NULL, need_pad));
+    if (fn == 1 && sw == 1 && (fw == fh) && ft == 1 && (fw == 1 || fw == 3 || fw == 5 || fw == 7)) {
+        CHECK_STATUS(infer_gclmem_desc_nchw(
+            0, 0, 0, 0, 0, ow, oh, fn * ot, idt, idt, NULL, gclmemOutputDesc));
+    } else {
+        fn = ALIGN(fn, 4);
+        CHECK_STATUS(infer_gclmem_desc_ncwhc4(
+            0, 0, 0, 0, 0, ow, oh, fn * ot, idt, idt, NULL, gclmemOutputDesc));
+    }
+    return SUCCESS;
+}
+
+EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc outputDesc,
+    GCLMemDesc inputMemDesc,
+    GCLMemDesc outputMemDesc,
+    ConvolutionPolicy policy,
+    ActivationMode activationMode,
+    ForwardRunInfoMali_t forwardRunInfo)
+{
+    if (forwardRunInfo == nullptr) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm);
+    if (algorithm != CONVOLUTION_ALGORITHM_NULL) {
+        return SUCCESS;
+    }
+    if (policy == CONVOLUTION_LIBRARY_SEARCH) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (policy == CONVOLUTION_FASTEST) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    DataType dt;
+    U32 ih, iw, fn, fh, fw;
+    tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw);
+    tensorSelectGet(filterDesc, &dt, NULL, &fn, NULL, &fh, &fw);
+
+    std::vector<ConvolutionForwardAlgorithm> convolutionAlgorithms;
+    std::vector<U32> algoNumIndex;
+    std::vector<U32> vecW;
+    std::vector<U32> vecC;
+    std::vector<U32> vecK;
+    DataFormat inputGclmemFormat = inputMemDesc.memFormat;
+    convolution_produce_algos_paras(inputDesc, filterDesc, convParamSpec, inputGclmemFormat,
+        &convolutionAlgorithms, &algoNumIndex, &vecW, &vecC, &vecK);
+    if (vecW.size() == 1) {
+        forwardRunInfo->best_w[0] = vecW[0];
+        forwardRunInfo->best_k[0] = vecK[0];
+        forwardRunInfo->best_c[0] = vecC[0];
+        forwardRunInfo->algorithm = convolutionAlgorithms[0];
+        return SUCCESS;
+    }
+
+    if (policy == CONVOLUTION_TUNNING) {
+        CHECK_STATUS(gcl_clean_kernelVec(handle));
+        CHECK_STATUS(gcl_enable_queue_profiling(handle));
+        GCLMem_t input = gcl_create_gclmem();
+        GCLMem_t filter = gcl_create_gclmem();
+        GCLMem_t output = gcl_create_gclmem();
+        GCLMem_t bias = gcl_create_gclmem();
+        GCLMem_t tmpbuf = gcl_create_gclmem();
+        U32 maxFilterSize = 0;
+        U32 maxBytes = 0;
+        U32 algosNum = 0;
+        std::vector<ForwardRunInfoMali> runInfos;
+        U32 stride[3] = {0, 0, 0};
+        U32 offset[3] = {0, 0, 0};
+        std::vector<GCLMemDesc> filterMemDescs;
+        for (U32 i = 0; i < algoNumIndex.size(); i++) {
+            U32 bytes = 0;
+            ForwardRunInfoMali runInfo;
+            U32 be = (i == 0) ? 0 : algoNumIndex[i - 1];
+            U32 end = algoNumIndex[i];
+            runInfo.algorithm = convolutionAlgorithms[i];
+            for (U32 j = be; j < end; j++) {
+                GCLMemDesc filterMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+                runInfo.best_w[0] = vecW[j];
+                runInfo.best_c[0] = vecC[j];
+                runInfo.best_k[0] = vecK[j];
+                if (convolution_transform_filter_bytes_mali(
+                        filterDesc, &runInfo, &filterMemDesc, &bytes) != SUCCESS) {
+                    continue;
+                }
+                maxBytes = (maxBytes < bytes) ? bytes : maxBytes;
+                if (convolution_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, outputDesc,
+                        convParamSpec, &runInfo, &bytes) != SUCCESS) {
+                    continue;
+                }
+                maxBytes = (maxBytes < bytes) ? bytes : maxBytes;
+                maxFilterSize = (maxFilterSize < filterMemDesc.byteSize) ? filterMemDesc.byteSize
+                                                                         : maxFilterSize;
+                filterMemDescs.push_back(filterMemDesc);
+                runInfos.push_back(runInfo);
+            }
+        }
+
+        if (ih == 1 && iw == 1 && fh == 1 && fw == 1) {
+            U32 stride[3] = {fn, 1, 1};
+            U32 offset[3] = {0, 0, 0};
+            CHECK_STATUS(gclmem_set_desc_padding(
+                &bias->desc, stride, offset, dt, DF_NHWC, GCL_MEM_BUF, CL_MEM_READ_WRITE));
+        } else {
+            U32 stride[3] = {(fn + 3) / 4, 1, 1};
+            U32 offset[3] = {0, 0, 0};
+            CHECK_STATUS(gclmem_set_desc_padding(
+                &bias->desc, stride, offset, dt, DF_NHWC, GCL_MEM_IMG_1D, CL_MEM_READ_WRITE));
+        }
+        algosNum = runInfos.size();
+        if (algosNum == 0) {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+        TensorDesc scaleDesc = tensor1d(DT_F32, 0);
+        TensorDesc biasDesc = tensor1d(dt, fn);
+        filterMemDescs[0].byteSize = maxFilterSize;
+        outputMemDesc.need_pad = false;
+        input->desc = inputMemDesc;
+        output->desc = outputMemDesc;
+        filter->desc = filterMemDescs[0];
+        tmpbuf->desc.byteSize = maxBytes;
+        gcl_create_memory(handle, input);
+        gcl_create_memory(handle, output);
+        gcl_create_memory(handle, filter);
+        gcl_create_memory(handle, bias);
+        if (maxBytes) {
+            gcl_create_memory(handle, tmpbuf);
+        }
+
+        double minTimeDirect = DBL_MAX;
+        double minTimeWinograd = DBL_MAX;
+        double minTime = DBL_MAX;
+        double winogradPicTranTime = DBL_MAX;
+        double winogradOutTranTime = DBL_MAX;
+        U32 runKernelBe = 0;
+        U32 runKernelEnd = 0;
+        ForwardRunInfoMali bestRunInfo;
+        ForwardRunInfoMali bestRunInfoDirect;
+        ForwardRunInfoMali bestRunInfoWinograd;
+        for (U32 i = 0; i < algosNum; i++) {
+            filter->desc = filterMemDescs[i];
+            if (convolution_mali(handle, inputDesc, input, filterDesc, filter, convParamSpec,
+                    &runInfos[i], scaleDesc, NULL, biasDesc, bias, maxBytes, tmpbuf, outputDesc,
+                    output, activationMode) == SUCCESS) {
+                if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_DIRECT) {
+                    runKernelEnd = handle->kernelVec->size();
+                    gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd);
+                    runKernelBe = runKernelEnd;
+                    if (minTimeDirect > handle->t_execute) {
+                        minTimeDirect = handle->t_execute;
+                        bestRunInfoDirect = runInfos[i];
+                    }
+                }
+
+                if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_WINOGRAD) {
+                    if (winogradPicTranTime == DBL_MAX) {
+                        runKernelEnd = runKernelBe + 2;
+                        gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd);
+                        winogradPicTranTime = handle->t_execute;
+                    }
+                    runKernelBe += 2;
+                    runKernelEnd = runKernelBe + 1;
+                    gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd);
+                    if (minTimeWinograd > handle->t_execute) {
minTimeWinograd = handle->t_execute; + bestRunInfoWinograd = runInfos[i]; + } + runKernelBe += 36; + if (winogradOutTranTime == DBL_MAX) { + runKernelEnd = runKernelBe + 1; + gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd); + winogradOutTranTime = handle->t_execute; + } + runKernelBe = handle->kernelVec->size(); + } + } + } + + if (minTimeWinograd != DBL_MAX) { + minTimeWinograd = 36 * minTimeWinograd + winogradPicTranTime + winogradOutTranTime; + } + minTime = minTimeDirect; + bestRunInfo = bestRunInfoDirect; + if (minTimeWinograd < minTime) { + minTime = minTimeWinograd; + bestRunInfo = bestRunInfoWinograd; + } + if (minTime == DBL_MAX) { + CHECK_STATUS(NOT_SUPPORTED); + } + *forwardRunInfo = bestRunInfo; + CHECK_STATUS(gcl_finish(handle)); + gcl_destroy_gclmem(input); + gcl_destroy_gclmem(filter); + gcl_destroy_gclmem(output); + gcl_destroy_gclmem(bias); + gcl_destroy_gclmem(tmpbuf); + convolutionAlgorithms.clear(); + runInfos.clear(); + filterMemDescs.clear(); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_off_queue_profiling(handle)); + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE convolution_transform_filter_bytes_mali(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = convolution_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE convolution_transform_filter_mali(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + GCLMem_t tmp, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = convolution_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem, tmp); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE convolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = convolution_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} +EE convolution_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc scaleDesc, + const GCLMem_t scale, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(scaleDesc); + UNUSED(scale); + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = convolution_mali_fp16(handle, inputDesc, input, filterDesc, filter, convParamSpec, + forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/copy.cpp 
b/compute/tensor/src/gpu/mali/copy.cpp
new file mode 100644
index 00000000..f6ae5097
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/copy.cpp
@@ -0,0 +1,177 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "sys.h"
+#include "types.h"
+#include "tensor_desc.h"
+#include "error.h"
+#include "gpu/mali/tensor_computing_mali.h"
+
+inline void check_tensordesc_dims(
+    U32 sn, U32 sc, U32 sh, U32 sw, U32 dn, U32 dc, U32 dh, U32 dw, U32 srcOffset, U32 dstOffset, U32 length)
+{
+    U32 srcElementNum = sw * sh * sc * sn;
+    U32 dstElementNum = dw * dh * dc * dn;
+    if (sn > 1 || dn > 1) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (length + srcOffset > srcElementNum) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+    if (length + dstOffset > dstElementNum) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+}
+
+inline EE copy_checkpara_mali(GCLHandle_t handle,
+    std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    U32 srcOffset,
+    U32 dstOffset,
+    U32 length)
+{
+    if (handle == nullptr) {
+        return NULL_POINTER;
+    }
+    if (inputDesc.size() != 2) {
+        return NOT_SUPPORTED;
+    }
+    if (input.size() != 2 && input.size() != 4) {
+        return NOT_SUPPORTED;
+    }
+    if (input[0] == nullptr || input[1] == nullptr) {
+        return NOT_SUPPORTED;
+    }
+    GCLMem_t srcMem = (GCLMem_t)input[0];
+    GCLMem_t dstMem = (GCLMem_t)input[1];
+    U32 sn, sc, sh, sw, sw_off, sh_off;
+    U32 dn, dc, dh, dw, dw_off, dh_off;
+    sn = 1;
+    dn = 1;
+    get_gclmem_dim(srcMem->desc, &sw, &sh, &sc, &sw_off, &sh_off);
+    get_gclmem_dim(dstMem->desc, &dw, &dh, &dc, &dw_off, &dh_off);
+    if (sw_off != 0 || sh_off != 0 || dw_off != 0 || dh_off != 0) {
+        return NOT_SUPPORTED;
+    }
+    check_tensordesc_dims(sn, sc, sh, sw, dn, dc, dh, dw, srcOffset, dstOffset, length);
+    return SUCCESS;
+}
+
+inline EE copy_core_mali_fp16(GCLHandle_t handle,
+    std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    U32 srcOffset,
+    U32 dstOffset,
+    U32 srcStride,
+    U32 dstStride,
+    U32 length)
+{
+    DataType sdt = inputDesc[0].dt;
+    DataType ddt = inputDesc[1].dt;
+    if (sdt == DT_U32 && ddt == DT_I32) {
+        sdt = DT_I32;
+    }
+    cl_mem srcbuf = ((GCLMem_t)(input[0]))->mem;
+    cl_mem dstbuf = ((GCLMem_t)(input[1]))->mem;
+    cl_mem srcBlockIndex = NULL;
+    cl_mem dstBlockIndex = NULL;
+    bool useBlockIndex = false;
+    if (input.size() == 4) {
+        srcBlockIndex = ((GCLMem_t)(input[2]))->mem;
+        dstBlockIndex = ((GCLMem_t)(input[3]))->mem;
+        useBlockIndex = true;
+    }
+    U32 gs = (length + 3) / 4;
+    U32 ls = 0;
+    U32 dim = 1;
+    Kernel kernel;
+    char dataType[16];
+    if (sdt == DT_I32) {
DT_I32) { + strcpy(dataType, "i32"); + } + if (sdt == DT_U32) { + strcpy(dataType, "u32"); + } + if (sdt == DT_F16) { + strcpy(dataType, "f16"); + } + char kernelName[128]; + if (!useBlockIndex) { + sprintf(kernelName, "copy_%s", dataType); + } else { + sprintf(kernelName, "copy_with_block_index_%s", dataType); + } + + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + if (!useBlockIndex) { + CHECK_STATUS( + gcl_set_kernelArgs(kernel, length, length, srcOffset, dstOffset, gs, srcbuf, dstbuf)); + } else { + CHECK_STATUS(gcl_set_kernelArgs(kernel, length, length, srcOffset, dstOffset, srcStride, + dstStride, gs, srcBlockIndex, dstBlockIndex, srcbuf, dstbuf)); + } + gcl_set_kernelVec(handle, kernel, dim, &gs, &ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs, &ls, kernelName)); + CHECK_STATUS(gcl_print_memory(handle, (GCLMem_t)input[0], "copy_srcbuf")); + CHECK_STATUS(gcl_print_memory(handle, (GCLMem_t)input[1], "copy_dstbuf")); +#endif + return SUCCESS; +} + +EE copy_infer_output_size_mali(std::vector inputDesc, GCLMemDesc_t gclmemInputDesc) +{ + if (inputDesc.size() != 2) { + CHECK_STATUS(NOT_SUPPORTED); + } + DataType sdt, ddt; + U32 sw, sh, sc, sn; + U32 dw, dh, dc, dn; + TensorDesc srcDesc = inputDesc[0]; + TensorDesc dstDesc = inputDesc[1]; + tensorSelectGet(srcDesc, &sdt, NULL, &sn, &sc, &sh, &sw); + tensorSelectGet(dstDesc, &ddt, NULL, &dn, &dc, &dh, &dw); + if (sdt == DT_U32 && ddt == DT_I32) { + sdt = DT_I32; + } + if (sdt != DT_F16 && sdt != DT_I32 && sdt != DT_U32) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (sdt != ddt) { + CHECK_STATUS(NOT_MATCH); + } + if (gclmemInputDesc) { + CHECK_STATUS( + infer_gclmem_desc_nchw(sw, sh, sc, 0, 0, 0, 0, 0, sdt, sdt, &gclmemInputDesc[0], NULL)); + CHECK_STATUS( + infer_gclmem_desc_nchw(dw, dh, dc, 0, 0, 0, 0, 0, ddt, ddt, &gclmemInputDesc[1], NULL)); + } + return SUCCESS; +} + +EE copy_mali(GCLHandle_t handle, + std::vector inputDesc, + std::vector input, + U32 srcOffset, + U32 dstOffset, + U32 srcStride, + U32 dstStride, + U32 length) +{ + EE ret = SUCCESS; + CHECK_STATUS(copy_checkpara_mali(handle, inputDesc, input, srcOffset, dstOffset, length)); + CHECK_STATUS(fill_output_zero(handle, (GCLMem_t)input[1], inputDesc[1])); + CHECK_STATUS(copy_core_mali_fp16( + handle, inputDesc, input, srcOffset, dstOffset, srcStride, dstStride, length)); + return ret; +} diff --git a/compute/tensor/src/gpu/mali/deconvolution.cpp b/compute/tensor/src/gpu/mali/deconvolution.cpp new file mode 100644 index 00000000..66abf3b8 --- /dev/null +++ b/compute/tensor/src/gpu/mali/deconvolution.cpp @@ -0,0 +1,433 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
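The deconvolution code that follows sizes its output with the transposed-convolution relation oh = fh + sh * (ih - 1) - pt - pb (and analogously for width). A minimal standalone sketch of that arithmetic; deconvOutExtent is a hypothetical helper for illustration, not part of the Bolt API:

#include <cstdio>

// Transposed-convolution output extent along one axis.
// f: filter size, s: stride, i: input extent, pBegin/pEnd: padding.
static int deconvOutExtent(int f, int s, int i, int pBegin, int pEnd)
{
    return f + s * (i - 1) - pBegin - pEnd;
}

int main()
{
    // A 2x2 kernel with stride 2 and no padding exactly doubles each axis,
    // which is why the code below special-cases fw == fh == sw == sh == 2
    // and routes it to the GEMM algorithm.
    printf("%d\n", deconvOutExtent(2, 2, 16, 0, 0));  // prints 32
    return 0;
}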
diff --git a/compute/tensor/src/gpu/mali/deconvolution.cpp b/compute/tensor/src/gpu/mali/deconvolution.cpp
new file mode 100644
index 00000000..66abf3b8
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/deconvolution.cpp
@@ -0,0 +1,433 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <float.h>
+#include "sys.h"
+#include "types.h"
+#include "tensor_desc.h"
+#include "error.h"
+#include "gpu/mali/tensor_computing_mali.h"
+#include "gpu/mali/fp16/deconvolution_mali_fp16.h"
+
+inline void deconvolution_produce_algos_paras(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    ConvolutionParamSpec convParamSpec,
+    std::vector<ConvolutionForwardAlgorithm> *deconvAlgorithms,
+    std::vector<U32> *algoNumIndex,
+    std::vector<U32> *vecW,
+    std::vector<U32> *vecC,
+    std::vector<U32> *vecK)
+{
+    DataFormat idf;
+    U32 ic, ih, iw, fn, fc, fh, fw, sh, sw;
+    tensorSelectGet(inputDesc, NULL, &idf, NULL, &ic, &ih, &iw);
+    tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw);
+    sh = convParamSpec.stride_h;
+    sw = convParamSpec.stride_w;
+    U32 configInfo[3][128];
+    U32 configNums[2];
+    ConvolutionForwardAlgorithm algo[2];
+    U32 algoNum = 1;
+    algo[0] = CONVOLUTION_ALGORITHM_DIRECT;
+    if (fw != 2 || fh != 2 || sw != 2 || sh != 2) {
+        configInfo[0][0] = 1;
+        configInfo[1][0] = 4;
+        configInfo[2][0] = 4;
+        configNums[0] = 1;
+    } else {
+        algo[0] = CONVOLUTION_ALGORITHM_GEMM;
+        U32 configNum = 0;
+        U32 c = 8;
+        U32 ni = 4;
+        for (U32 ii = 0; ii < 2; ii++) {
+            for (U32 i = 0; i < ni; i++) {
+                configInfo[0][configNum] = i + 1;
+                configInfo[1][configNum] = c;
+                configInfo[2][configNum] = 4;
+                configNum++;
+            }
+            c = c << 1;
+            ni = 3;
+        }
+
+        ni = 2;
+        U32 w = 2;
+        for (U32 ii = 0; ii < 2; ii++) {
+            c = 8;
+            if (ih % w == 0) {
+                for (U32 i = 0; i < ni; i++) {
+                    configInfo[0][configNum] = w << 8;
+                    configInfo[1][configNum] = c;
+                    configInfo[2][configNum] = 4;
+                    configNum++;
+                    c = c << 1;
+                }
+            }
+            w = w << 1;
+            ni = 1;
+        }
+        configNums[0] = configNum;
+    }
+
+    for (U32 i = 0; i < algoNum; i++) {
+        (*deconvAlgorithms).push_back(algo[i]);
+        (*algoNumIndex).push_back(configNums[i]);
+        U32 be = (i == 0) ? 0 : configNums[i - 1];
+        U32 end = configNums[i];
+        for (U32 j = be; j < end; j++) {
+            if (vecW) {
+                (*vecW).push_back(configInfo[0][j]);
+            }
+            if (vecC) {
+                (*vecC).push_back(configInfo[1][j]);
+            }
+            if (vecK) {
+                (*vecK).push_back(configInfo[2][j]);
+            }
+        }
+    }
+}
+
+EE deconvolution_infer_output_size_mali(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc *outputDesc,
+    GCLMemDesc_t gclmemInputDesc,
+    GCLMemDesc_t gclmemOutputDesc)
+{
+    DataType idt, fdt;
+    DataFormat idf, fdf;
+    U32 iw, ih, ic, in;
+    U32 fw, fh, fc, fn;
+    U32 ow, oh;
+    U32 sw, sh, dw, dh;
+    U32 pt, pb, pl, pr;
+    tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw);
+    tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw);
+    sw = convParamSpec.stride_w;
+    sh = convParamSpec.stride_h;
+    dw = convParamSpec.dilatedRate_w;
+    dh = convParamSpec.dilatedRate_h;
+    if (in != 1) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (fw < 1 || fh < 1) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (dw != 1 || dh != 1) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+
+    sw = convParamSpec.stride_w;
+    sh = convParamSpec.stride_h;
+    pt = convParamSpec.padding_top;
+    pb = convParamSpec.padding_bottom;
+    pl = convParamSpec.padding_left;
+    pr = convParamSpec.padding_right;
+
+    oh = fh + sh * (ih - 1) - pt - pb;
+    ow = fw + sw * (iw - 1) - pl - pr;
+
+    bool need_pad = false;
+    std::vector<ConvolutionForwardAlgorithm> deconvAlgorithms;
+    std::vector<U32> algoNumIndex;
+    std::vector<U32> vecW;
+    deconvolution_produce_algos_paras(
+        inputDesc, filterDesc, convParamSpec, &deconvAlgorithms, &algoNumIndex, &vecW, NULL, NULL);
+
+    if (idf == DF_NCHW) {
+        if (outputDesc) {
+            *outputDesc = tensor4df(idt, DF_NCHW, in, fc, oh, ow);
+        }
+        if (fw == 2 && fh == 2 && sw == 2 && sh == 2) {
+            U32 iw_align, item_w;
+            iw_align = ow;
+            U32 tmp_align = 0;
+            for (U32 i = 0; i < algoNumIndex[0]; i++) {
+                item_w = vecW[i];
+                item_w = ((item_w >> 8) > 0) ? 1 : item_w;
+                U32 j = ALIGN(ow, item_w);
+                tmp_align = (tmp_align < j) ? j : tmp_align;
+            }
+            iw_align = (iw_align < tmp_align) ? tmp_align : iw_align;
+            if (iw_align != iw) {
+                need_pad = true;
+            }
+            CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw_align, ih, ic, 0, 0, ow, oh, fc, idt, idt,
+                gclmemInputDesc, gclmemOutputDesc, need_pad));
+        } else {
+            CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw, ih, ic, 0, 0, ow, oh, fc, idt, idt,
+                gclmemInputDesc, gclmemOutputDesc, need_pad));
+        }
+        return SUCCESS;
+    }
+
+    return NOT_SUPPORTED;
+}
+
+EE deconvolution_infer_forward_algorithm_mali(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc outputDesc,
+    ConvolutionPolicy policy,
+    ActivationMode activationMode,
+    ForwardRunInfoMali_t forwardRunInfo)
+{
+    if (forwardRunInfo == nullptr) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm);
+    if (algorithm != CONVOLUTION_ALGORITHM_NULL) {
+        return SUCCESS;
+    }
+    DataType dt;
+    U32 ih, iw, fc, fh, fw;
+    tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw);
+    tensorSelectGet(filterDesc, &dt, NULL, NULL, &fc, &fh, &fw);
+    std::vector<ConvolutionForwardAlgorithm> deconvAlgorithms;
+    std::vector<U32> algoNumIndex;
+    std::vector<U32> vecW;
+    std::vector<U32> vecC;
+    std::vector<U32> vecK;
+    deconvolution_produce_algos_paras(inputDesc, filterDesc, convParamSpec, &deconvAlgorithms,
+        &algoNumIndex, &vecW, &vecC, &vecK);
+    if (vecW.size() == 1) {
+        forwardRunInfo->best_w[0] = vecW[0];
+        forwardRunInfo->best_k[0] = vecK[0];
+        forwardRunInfo->best_c[0] = vecC[0];
+        forwardRunInfo->algorithm = deconvAlgorithms[0];
+        return SUCCESS;
+    }
+
+    if (policy == CONVOLUTION_TUNNING) {
+        CHECK_STATUS(gcl_clean_kernelVec(handle));
+        CHECK_STATUS(gcl_enable_queue_profiling(handle));
+        GCLMem_t input = gcl_create_gclmem();
+        GCLMem_t filter = gcl_create_gclmem();
+        GCLMem_t output = gcl_create_gclmem();
+        GCLMem_t bias = gcl_create_gclmem();
+        GCLMem_t tmpbuf = gcl_create_gclmem();
+        U32 maxFilterSize = 0;
+        U32 maxBytes = 0;
+        U32 algosNum = 0;
+        std::vector<ForwardRunInfoMali> runInfos;
+        U32 stride[3] = {0, 0, 0};
+        U32 offset[3] = {0, 0, 0};
+        GCLMemDesc inputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+        GCLMemDesc outputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+        CHECK_STATUS(deconvolution_infer_output_size_mali(
+            inputDesc, filterDesc, convParamSpec, NULL, &inputMemDesc, &outputMemDesc));
+        std::vector<GCLMemDesc> filterMemDescs;
+        for (U32 i = 0; i < algoNumIndex.size(); i++) {
+            U32 bytes = 0;
+            ForwardRunInfoMali runInfo;
+            U32 be = (i == 0) ? 0 : algoNumIndex[i - 1];
+            U32 end = algoNumIndex[i];
+            runInfo.algorithm = deconvAlgorithms[i];
+            for (U32 j = be; j < end; j++) {
+                GCLMemDesc filterMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+                runInfo.best_w[0] = vecW[j];
+                runInfo.best_c[0] = vecC[j];
+                runInfo.best_k[0] = vecK[j];
+                if (deconvolution_transform_filter_bytes_mali(
+                        filterDesc, &runInfo, &filterMemDesc, &bytes) != SUCCESS) {
+                    continue;
+                }
+                maxBytes = (maxBytes < bytes) ? bytes : maxBytes;
+                if (deconvolution_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, outputDesc,
+                        convParamSpec, &runInfo, &bytes) != SUCCESS) {
+                    continue;
+                }
+                maxBytes = (maxBytes < bytes) ? bytes : maxBytes;
+                maxFilterSize = (maxFilterSize < filterMemDesc.byteSize) ?
filterMemDesc.byteSize + : maxFilterSize; + filterMemDescs.push_back(filterMemDesc); + runInfos.push_back(runInfo); + } + } + + algosNum = runInfos.size(); + TensorDesc biasDesc = tensor1d(dt, fc); + filterMemDescs[0].byteSize = maxFilterSize; + input->desc = inputMemDesc; + output->desc = outputMemDesc; + filter->desc = filterMemDescs[0]; + stride[0] = (fc + 3) / 4; + stride[1] = 1; + stride[2] = 1; + MemFlags flags = CL_MEM_READ_WRITE; + CHECK_STATUS(gclmem_set_desc_padding( + &bias->desc, stride, offset, dt, DF_NHWC, GCL_MEM_IMG_1D, flags)); + tmpbuf->desc.byteSize = maxBytes; + gcl_create_memory(handle, input); + gcl_create_memory(handle, output); + gcl_create_memory(handle, filter); + gcl_create_memory(handle, bias); + if (maxBytes) { + gcl_create_memory(handle, tmpbuf); + } + + double minTimeGemm = DBL_MAX; + double minTime = DBL_MAX; + U32 runKernelBe = 0; + U32 runKernelEnd = 0; + ForwardRunInfoMali bestRunInfo; + ForwardRunInfoMali bestRunInfoGemm; + for (U32 i = 0; i < algosNum; i++) { + filter->desc = filterMemDescs[i]; + if (deconvolution_mali(handle, inputDesc, input, filterDesc, filter, convParamSpec, + &runInfos[i], biasDesc, NULL, biasDesc, bias, maxBytes, tmpbuf, outputDesc, + output, activationMode) == SUCCESS) { + if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_GEMM) { + runKernelEnd = handle->kernelVec->size(); + gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd); + runKernelBe = runKernelEnd; + if (minTimeGemm > handle->t_execute) { + minTimeGemm = handle->t_execute; + bestRunInfoGemm = runInfos[i]; + } + } + } + } + minTime = minTimeGemm; + bestRunInfo = bestRunInfoGemm; + if (minTime == DBL_MAX) { + CHECK_STATUS(NOT_SUPPORTED); + } + *forwardRunInfo = bestRunInfo; + CHECK_STATUS(gcl_finish(handle)); + gcl_destroy_gclmem(input); + gcl_destroy_gclmem(filter); + gcl_destroy_gclmem(output); + gcl_destroy_gclmem(bias); + gcl_destroy_gclmem(tmpbuf); + deconvAlgorithms.clear(); + runInfos.clear(); + filterMemDescs.clear(); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_off_queue_profiling(handle)); + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE deconvolution_transform_filter_bytes_mali(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = deconvolution_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE deconvolution_transform_filter_mali(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + GCLMem_t tmp, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = deconvolution_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem, tmp); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE deconvolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = deconvolution_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + } + case 
DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} +EE deconvolution_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc scaleDesc, + const GCLMem_t scale, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(scaleDesc); + UNUSED(scale); + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = deconvolution_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/depth2space.cpp b/compute/tensor/src/gpu/mali/depth2space.cpp new file mode 100644 index 00000000..2efe7598 --- /dev/null +++ b/compute/tensor/src/gpu/mali/depth2space.cpp @@ -0,0 +1,110 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
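The depth2space operator below rearranges blocks of channels into spatial blocks: channels shrink by blockSize squared while height and width grow by blockSize, which is why the size inference rejects any ic not divisible by blockSize squared. A minimal sketch of that shape rule; depth2spaceShape is a hypothetical helper, not the Bolt API:

#include <cstdio>

// Depth-to-space shape rule: channels shrink by b*b, height/width grow by b.
static bool depth2spaceShape(int ic, int ih, int iw, int b, int *oc, int *oh, int *ow)
{
    if (ic % (b * b) != 0) {
        return false;  // mirrors the NOT_MATCH case in depth2space_infer_output_size_mali
    }
    *oc = ic / (b * b);
    *oh = ih * b;
    *ow = iw * b;
    return true;
}

int main()
{
    int oc, oh, ow;
    if (depth2spaceShape(16, 8, 8, 2, &oc, &oh, &ow)) {
        printf("%d %d %d\n", oc, oh, ow);  // prints 4 16 16
    }
    return 0;
}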
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/depth2space_mali_fp16.h" + +inline EE depth2space_checkpara_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + Depth2SpaceParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + return SUCCESS; +} + +EE depth2space_infer_output_size_mali(TensorDesc inputDesc, + Depth2SpaceParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + if (outputDesc == nullptr || gclmemInputDesc == nullptr || gclmemOutputDesc == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + U32 ow, oh, oc, on; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + on = in; + oc = ic / (p.blockSize * p.blockSize); + oh = ih * p.blockSize; + ow = iw * p.blockSize; + if (ic % (p.blockSize * p.blockSize) != 0) { + return NOT_MATCH; + } + + *outputDesc = tensor4df(idt, idf, on, oc, oh, ow); + if (gclmemInputDesc->byteSize == 0) { + CHECK_STATUS(infer_gclmem_desc_nchw( + iw, ih, ic, 0, 0, 0, 0, 0, DT_F16, DT_F16, gclmemInputDesc, NULL)); + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + 0, 0, 0, 0, 0, ow, oh, oc, DT_F16, DT_F16, NULL, gclmemOutputDesc)); + return SUCCESS; + } + + if (idf == DF_NCHW) { + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, ow, oh, oc, DT_F16, DT_F16, gclmemInputDesc, gclmemOutputDesc)); + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE depth2space_infer_tmpBuf_size_mali( + TensorDesc inputDesc, Depth2SpaceParamSpec p, TensorDesc outputDesc, U32 *bytes) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = depth2space_infer_tmpBuf_size_mali_fp16(inputDesc, p, outputDesc, bytes); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depth2space_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + Depth2SpaceParamSpec p, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(depth2space_checkpara_mali(handle, inputDesc, input, p, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = depth2space_mali_fp16(handle, inputDesc, input, p, tmpBuf, outputDesc, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/depthwise_convolution.cpp b/compute/tensor/src/gpu/mali/depthwise_convolution.cpp new file mode 100644 index 00000000..ca09f531 --- /dev/null +++ b/compute/tensor/src/gpu/mali/depthwise_convolution.cpp @@ -0,0 +1,359 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "sys.h"
+#include "types.h"
+#include "tensor_desc.h"
+#include "error.h"
+#include "gpu/mali/tensor_computing_mali.h"
+#include "gpu/mali/fp16/depthwise_convolution_mali_fp16.h"
+
+inline void depthwise_convolution_produce_algos_paras(
+    std::vector<DepthwiseConvolutionForwardAlgorithm> *depthwiseConvAlgorithms,
+    std::vector<U32> *algoNumIndex,
+    std::vector<U32> *vecW,
+    std::vector<U32> *vecC,
+    std::vector<U32> *vecK)
+{
+    U32 configNum = 8;
+    for (U32 i = 0; i < configNum; i++) {
+        if (vecW) {
+            (*vecW).push_back(i + 1);
+        }
+        if (vecC) {
+            (*vecC).push_back(1);
+        }
+        if (vecK) {
+            (*vecK).push_back(4);
+        }
+    }
+    if (depthwiseConvAlgorithms) {
+        (*depthwiseConvAlgorithms).push_back(DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT);
+    }
+    if (algoNumIndex) {
+        (*algoNumIndex).push_back(configNum);
+    }
+}
+
+EE depthwise_convolution_infer_output_size_mali(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc *outputDesc,
+    GCLMemDesc_t gclmemInputDesc,
+    GCLMemDesc_t gclmemOutputDesc)
+{
+    DataType idt;
+    DataFormat idf;
+    U32 iw, ih, ic, in;
+    U32 fw, fh;
+    U32 ow, oh;
+    U32 sw, sh, pl, pt, dw, dh, pr, pb;
+    tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw);
+    tensorSelectGet(filterDesc, NULL, NULL, NULL, NULL, &fh, &fw);
+    pl = convParamSpec.padding_left;
+    pr = convParamSpec.padding_right;
+    pt = convParamSpec.padding_top;
+    pb = convParamSpec.padding_bottom;
+    sw = convParamSpec.stride_w;
+    sh = convParamSpec.stride_h;
+    dw = convParamSpec.dilatedRate_w;
+    dh = convParamSpec.dilatedRate_h;
+    if (fw < 1 || fh < 1) {
+        return NOT_SUPPORTED;
+    }
+    if (dw != 1 || dh != 1) {
+        return NOT_SUPPORTED;
+    }
+    if (sw != sh) {
+        return NOT_SUPPORTED;
+    }
+    if ((ic & 3) != 0) {
+        return NOT_SUPPORTED;
+    }
+    ow = (iw + pl + pr - fw) / sw + 1;
+    oh = (ih + pt + pb - fh) / sh + 1;
+    if (outputDesc) {
+        *outputDesc = tensor4df(idt, idf, in, ic, oh, ow);
+    }
+    bool need_pad = false;
+
+    std::vector<U32> vecW;
+    depthwise_convolution_produce_algos_paras(NULL, NULL, &vecW, NULL, NULL);
+    U32 iw_align = ow;
+    for (auto item_w : vecW) {
+        U32 i = ALIGN(ow, item_w);
+        iw_align = (iw_align < i) ? i : iw_align;
+    }
+    U32 ext_w = (fw / 2 < pl) ? pl : fw / 2;
+    iw_align = iw_align * sw;
+    if (pl < ext_w) {
+        iw_align = iw_align + 2 * (ext_w - pl);
+        ext_w = pl;
+    }
+    if (iw_align != iw) {
+        need_pad = true;
+    }
+    if (ext_w != 0 || pt != 0) {
+        need_pad = true;
+    }
+    CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw_align, ih, ic, ext_w, pt, ow, oh, ic, idt, idt,
+        gclmemInputDesc, gclmemOutputDesc, need_pad));
+    return SUCCESS;
+}
+
+EE depthwise_convolution_infer_forward_algorithm_mali(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionPolicy policy,
+    ActivationMode depthwiseActivationMode,
+    ForwardRunInfoMali_t forwardRunInfo)
+{
+    if (forwardRunInfo == nullptr) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DepthwiseConvolutionForwardAlgorithm algorithm =
+        (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm);
+    if (algorithm != DEPTHWISE_CONVOLUTION_ALGORITHM_NULL) {
+        return SUCCESS;
+    }
+    if (policy == CONVOLUTION_LIBRARY_SEARCH) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (policy == CONVOLUTION_FASTEST) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    std::vector<DepthwiseConvolutionForwardAlgorithm> depthwiseConvAlgorithms;
+    std::vector<U32> algoNumIndex;
+    std::vector<U32> vecW;
+    std::vector<U32> vecC;
+    std::vector<U32> vecK;
+    depthwise_convolution_produce_algos_paras(
+        &depthwiseConvAlgorithms, &algoNumIndex, &vecW, &vecC, &vecK);
+
+    if (policy == CONVOLUTION_TUNNING) {
+        CHECK_STATUS(gcl_clean_kernelVec(handle));
+        CHECK_STATUS(gcl_enable_queue_profiling(handle));
+        GCLMem_t input = gcl_create_gclmem();
+        GCLMem_t filter = gcl_create_gclmem();
+        GCLMem_t output = gcl_create_gclmem();
+        GCLMem_t bias = gcl_create_gclmem();
+        GCLMem_t tmpbuf = gcl_create_gclmem();
+        U32 maxFilterSize = 0;
+        U32 maxBytes = 0;
+        U32 algosNum = 0;
+        std::vector<ForwardRunInfoMali> runInfos;
+        U32 stride[3] = {0, 0, 0};
+        U32 offset[3] = {0, 0, 0};
+        GCLMemDesc inputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+        GCLMemDesc outputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+        CHECK_STATUS(depthwise_convolution_infer_output_size_mali(
+            inputDesc, filterDesc, convParamSpec, NULL, &inputMemDesc, &outputMemDesc));
+        std::vector<GCLMemDesc> filterMemDescs;
+        U32 ic;
+        DataType dt;
+        tensorSelectGet(inputDesc, &dt, NULL, NULL, &ic, NULL, NULL);
+        for (U32 i = 0; i < algoNumIndex.size(); i++) {
+            U32 bytes = 0;
+            ForwardRunInfoMali runInfo;
+            U32 be = (i == 0) ? 0 : algoNumIndex[i - 1];
+            U32 end = algoNumIndex[i];
+            runInfo.algorithm = depthwiseConvAlgorithms[i];
+            for (U32 j = be; j < end; j++) {
+                GCLMemDesc filterMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+                runInfo.best_w[0] = vecW[j];
+                runInfo.best_c[0] = vecC[j];
+                runInfo.best_k[0] = vecK[j];
+                if (depthwise_convolution_transform_filter_bytes_mali(
+                        filterDesc, &runInfo, &filterMemDesc, &bytes) != SUCCESS) {
+                    continue;
+                }
+                maxBytes = (maxBytes < bytes) ? bytes : maxBytes;
+                if (depthwise_convolution_infer_forward_tmp_bytes_mali(inputDesc, filterDesc,
+                        outputDesc, convParamSpec, &runInfo, &bytes) != SUCCESS) {
+                    continue;
+                }
+                maxBytes = (maxBytes < bytes) ? bytes : maxBytes;
+                maxFilterSize = (maxFilterSize < filterMemDesc.byteSize) ?
filterMemDesc.byteSize + : maxFilterSize; + filterMemDescs.push_back(filterMemDesc); + runInfos.push_back(runInfo); + } + } + + TensorDesc biasDesc = tensor1d(dt, ic); + stride[0] = (ic + 3) / 4; + CHECK_STATUS(gclmem_set_desc_padding( + &bias->desc, stride, offset, dt, DF_NHWC, GCL_MEM_IMG_1D, CL_MEM_READ_WRITE)); + algosNum = runInfos.size(); + if (algosNum == 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + filterMemDescs[0].byteSize = maxFilterSize; + input->desc = inputMemDesc; + output->desc = outputMemDesc; + filter->desc = filterMemDescs[0]; + tmpbuf->desc.byteSize = maxBytes; + gcl_create_memory(handle, input); + gcl_create_memory(handle, output); + gcl_create_memory(handle, filter); + gcl_create_memory(handle, bias); + if (maxBytes) { + gcl_create_memory(handle, tmpbuf); + } + + double minTime = DBL_MAX; + U32 runKernelBe = 0; + U32 runKernelEnd = 0; + ForwardRunInfoMali bestRunInfo; + for (U32 i = 0; i < algosNum; i++) { + filter->desc = filterMemDescs[i]; + if (depthwise_convolution_mali(handle, inputDesc, input, filterDesc, filter, + convParamSpec, &runInfos[i], biasDesc, bias, maxBytes, tmpbuf, outputDesc, + output, depthwiseActivationMode) == SUCCESS) { + runKernelEnd = handle->kernelVec->size(); + gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd); + if (minTime > handle->t_execute) { + minTime = handle->t_execute; + bestRunInfo = runInfos[i]; + } + runKernelBe = runKernelEnd; + } + } + if (minTime == DBL_MAX) { + CHECK_STATUS(NOT_SUPPORTED); + } + *forwardRunInfo = bestRunInfo; + CHECK_STATUS(gcl_finish(handle)); + gcl_destroy_gclmem(input); + gcl_destroy_gclmem(filter); + gcl_destroy_gclmem(output); + gcl_destroy_gclmem(bias); + gcl_destroy_gclmem(tmpbuf); + depthwiseConvAlgorithms.clear(); + runInfos.clear(); + filterMemDescs.clear(); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_off_queue_profiling(handle)); + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE depthwise_convolution_transform_filter_bytes_mali(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = depthwise_convolution_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_convolution_transform_filter_mali(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = depthwise_convolution_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_convolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = depthwise_convolution_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_convolution_mali(GCLHandle_t handle, + TensorDesc 
inputDesc,
+    const GCLMem_t input,
+    TensorDesc filterDesc,
+    const GCLMem_t filter,
+    ConvolutionParamSpec convParamSpec,
+    ForwardRunInfoMali_t forwardRunInfo,
+    TensorDesc biasDesc,
+    const GCLMem_t bias,
+    U32 tmpBytes,
+    GCLMem_t tmpBuf,
+    TensorDesc outputDesc,
+    GCLMem_t output,
+    ActivationMode depthwiseActivationMode)
+{
+    EE ret = SUCCESS;
+    switch (inputDesc.dt) {
+        case DT_F16: {
+            ret = depthwise_convolution_mali_fp16(handle, inputDesc, input, filterDesc, filter,
+                convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output,
+                depthwiseActivationMode);
+            break;
+        }
+        case DT_I8: {
+            ret = NOT_SUPPORTED;
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/gpu/mali/depthwise_pointwise_convolution.cpp b/compute/tensor/src/gpu/mali/depthwise_pointwise_convolution.cpp
new file mode 100644
index 00000000..f7498c2b
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/depthwise_pointwise_convolution.cpp
@@ -0,0 +1,522 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "sys.h"
+#include "types.h"
+#include "error.h"
+#include "gpu/mali/tensor_computing_mali.h"
+#include "gpu/mali/fp16/depthwise_pointwise_convolution_mali_fp16.h"
+inline void depthwise_pointwise_convolution_produce_algos_paras(U32 pointwiseFilterNum,
+    std::vector<DepthwiseConvolutionForwardAlgorithm> *depthwisePointwiseConvAlgorithms,
+    std::vector<U32> *algoNumIndexD,
+    std::vector<U32> *vecWD,
+    std::vector<U32> *vecCD,
+    std::vector<U32> *vecKD,
+    std::vector<U32> *algoNumIndexP,
+    std::vector<U32> *vecWP,
+    std::vector<U32> *vecCP,
+    std::vector<U32> *vecKP)
+{
+    U32 algoNum = 2;
+    DepthwiseConvolutionForwardAlgorithm algo[2];
+    algo[0] = DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT;
+    algo[1] = DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM;
+    U32 configNumsD[2];
+    U32 configNumsP[2];
+    U32 configNumD = 0;
+    U32 configNumP = 0;
+    U32 configInfo[3][128];
+    for (U32 ii = 0; ii < algoNum; ii++) {
+        for (U32 i = 0; i < 8; i++) {
+            if (vecWD) {
+                (*vecWD).push_back(i + 1);
+            }
+            if (vecCD) {
+                (*vecCD).push_back(1);
+            }
+            if (vecKD) {
+                (*vecKD).push_back(4);
+            }
+            configNumD++;
+        }
+        configNumsD[ii] = configNumD;
+        U32 c = (algo[ii] == DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT) ? 4 : 1;
+        U32 k = 4;
+        U32 nj = 8;
+        for (U32 i = 0; i < 2; i++) {
+            for (U32 j = 0; j < nj; j++) {
+                configInfo[0][configNumP] = j + 1;
+                configInfo[1][configNumP] = c;
+                configInfo[2][configNumP] = k;
+                configNumP++;
+            }
+            k = k << 1;
+            if (pointwiseFilterNum % k != 0) {
+                break;
+            }
+            if (algo[ii] == DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT) {
+                nj = 4;
+            }
+        }
+        configNumsP[ii] = configNumP;
+    }
+
+    for (U32 i = 0; i < algoNum; i++) {
+        if (depthwisePointwiseConvAlgorithms) {
+            (*depthwisePointwiseConvAlgorithms).push_back(algo[i]);
+        }
+        if (algoNumIndexD) {
+            (*algoNumIndexD).push_back(configNumsD[i]);
+        }
+        if (algoNumIndexP) {
+            (*algoNumIndexP).push_back(configNumsP[i]);
+        }
+    }
+    for (U32 i = 0; i < configNumP; i++) {
+        if (vecWP) {
+            (*vecWP).push_back(configInfo[0][i]);
+        }
+        if (vecCP) {
+            (*vecCP).push_back(configInfo[1][i]);
+        }
+        if (vecKP) {
+            (*vecKP).push_back(configInfo[2][i]);
+        }
+    }
+}
+
+EE depthwise_pointwise_convolution_infer_output_size_mali(TensorDesc inputDesc,
+    TensorDesc dwFilterDesc,
+    TensorDesc pwFilterDesc,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc *outputDesc,
+    GCLMemDesc_t gclmemInputDesc,
+    GCLMemDesc_t gclmemOutputDesc)
+{
+    DataType idt;
+    DataFormat idf;
+    U32 iw, ih, ic, in;
+    U32 dfw, dfh;
+    U32 pfn;
+    U32 ow, oh;
+    U32 sw, sh, pl, pt, dw, dh, pr, pb;
+    tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw);
+    tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, NULL, &dfh, &dfw);
+    tensorSelectGet(pwFilterDesc, NULL, NULL, &pfn, NULL, NULL, NULL);
+    pl = convParamSpec.padding_left;
+    pr = convParamSpec.padding_right;
+    pt = convParamSpec.padding_top;
+    pb = convParamSpec.padding_bottom;
+    sw = convParamSpec.stride_w;
+    sh = convParamSpec.stride_h;
+    dw = convParamSpec.dilatedRate_w;
+    dh = convParamSpec.dilatedRate_h;
+    if (dfw < 1 || dfh < 1) {
+        return NOT_SUPPORTED;
+    }
+    if (dw != 1 || dh != 1) {
+        return NOT_SUPPORTED;
+    }
+    if (sw != sh) {
+        return NOT_SUPPORTED;
+    }
+    if ((pfn & 3) != 0) {
+        return NOT_SUPPORTED;
+    }
+    ow = (iw + pl + pr - dfw) / sw + 1;
+    oh = (ih + pt + pb - dfh) / sh + 1;
+    if (outputDesc) {
+        *outputDesc = tensor4df(idt, idf, in, pfn, oh, ow);
+    }
+    bool need_pad = false;
+
+    std::vector<U32> vecW;
+    depthwise_pointwise_convolution_produce_algos_paras(
+        pfn, NULL, NULL, &vecW, NULL, NULL, NULL, NULL, NULL, NULL);
+    U32 iw_align = ow;
+    for (auto item_w : vecW) {
+        U32 i = ALIGN(ow, item_w);
+        iw_align = (iw_align < i) ? i : iw_align;
+    }
+    U32 ext_w = (dfw / 2 < pl) ? pl : dfw / 2;
+    iw_align = iw_align * sw;
+    if (pl < ext_w) {
+        iw_align = iw_align + 2 * (ext_w - pl);
+        ext_w = pl;
+    }
+    if (iw_align != iw) {
+        need_pad = true;
+    }
+    if (ext_w != 0 || pt != 0) {
+        need_pad = true;
+    }
+    CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw_align, ih, ic, ext_w, pt, ow, oh, pfn, idt, idt,
+        gclmemInputDesc, gclmemOutputDesc, need_pad));
+    return SUCCESS;
+}
+
+EE depthwise_pointwise_convolution_infer_forward_algorithm_mali(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    TensorDesc dwFilterDesc,
+    TensorDesc pwFilterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionPolicy policy,
+    ActivationMode depthwiseActivationMode,
+    ActivationMode pointwiseActivationMode,
+    ForwardRunInfoMali_t forwardRunInfo)
+{
+    if (forwardRunInfo == nullptr) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DepthwiseConvolutionForwardAlgorithm algorithm =
+        (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm);
+    if (algorithm != DEPTHWISE_CONVOLUTION_ALGORITHM_NULL) {
+        return SUCCESS;
+    }
+    if (policy == CONVOLUTION_LIBRARY_SEARCH) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (policy == CONVOLUTION_FASTEST) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    std::vector<DepthwiseConvolutionForwardAlgorithm> depthwisePointwiseConvAlgorithms;
+    std::vector<U32> algoNumIndexD;
+    std::vector<U32> vecWD;
+    std::vector<U32> vecCD;
+    std::vector<U32> vecKD;
+    std::vector<U32> algoNumIndexP;
+    std::vector<U32> vecWP;
+    std::vector<U32> vecCP;
+    std::vector<U32> vecKP;
+    DataType dt;
+    U32 pfn;
+    tensorSelectGet(pwFilterDesc, &dt, NULL, &pfn, NULL, NULL, NULL);
+    depthwise_pointwise_convolution_produce_algos_paras(pfn, &depthwisePointwiseConvAlgorithms,
+        &algoNumIndexD, &vecWD, &vecCD, &vecKD, &algoNumIndexP, &vecWP, &vecCP, &vecKP);
+
+    if (policy == CONVOLUTION_TUNNING) {
+        CHECK_STATUS(gcl_clean_kernelVec(handle));
+        CHECK_STATUS(gcl_enable_queue_profiling(handle));
+        GCLMem_t input = gcl_create_gclmem();
+        GCLMem_t dwFilter = gcl_create_gclmem();
+        GCLMem_t pwFilter = gcl_create_gclmem();
+        GCLMem_t output = gcl_create_gclmem();
+        GCLMem_t tmpbuf = gcl_create_gclmem();
+        GCLMem_t dwBias = gcl_create_gclmem();
+        GCLMem_t pwBiasBuf = gcl_create_gclmem();
+        GCLMem_t pwBiasImg = gcl_create_gclmem();
+        U32 maxDwFilterSize = 0;
+        U32 maxPwFilterSize = 0;
+        U32 maxBytes = 0;
+        U32 algosNum = 0;
+        std::vector<ForwardRunInfoMali> runInfos;
+        U32 stride[3] = {0, 0, 0};
+        U32 offset[3] = {0, 0, 0};
+        GCLMemDesc inputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+        GCLMemDesc outputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+        CHECK_STATUS(depthwise_pointwise_convolution_infer_output_size_mali(inputDesc, dwFilterDesc,
+            pwFilterDesc, convParamSpec, NULL, &inputMemDesc, &outputMemDesc));
+        std::vector<GCLMemDesc> dwFilterMemDescs;
+        std::vector<GCLMemDesc> pwFilterMemDescs;
+        U32 ic, pfn;
+        tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, NULL, NULL);
+        tensorSelectGet(pwFilterDesc, NULL, NULL, &pfn, NULL, NULL, NULL);
+        if (algoNumIndexD.size() != algoNumIndexP.size()) {
+            CHECK_STATUS(NOT_MATCH);
+        }
+
+        U32 runInfoBe[2][2];
+        U32 runInfoEnd[2][2];
+        U32 runInfoCount = 0;
+        for (U32 i = 0; i < algoNumIndexD.size(); i++) {
+            U32 bytes = 0;
+            ForwardRunInfoMali runInfo;
+            U32 be = (i == 0) ?
0 : algoNumIndexD[i - 1]; + U32 end = algoNumIndexD[i]; + runInfo.algorithm = depthwisePointwiseConvAlgorithms[i]; + for (U32 j = 0; j < 2; j++) { + runInfoBe[i][j] = runInfoCount; + U32 depthwiseIndex = 0; + U32 pointwiseIndex = 0; + for (U32 k = be; k < end; k++) { + GCLMemDesc dwFilterMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + GCLMemDesc pwFilterMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + if (j == 0) { + depthwiseIndex = k; + } + if (j == 1) { + pointwiseIndex = k; + } + runInfo.best_w[0] = vecWD[depthwiseIndex]; + runInfo.best_c[0] = vecCD[depthwiseIndex]; + runInfo.best_k[0] = vecKD[depthwiseIndex]; + runInfo.best_w[1] = vecWP[pointwiseIndex]; + runInfo.best_c[1] = vecCP[pointwiseIndex]; + runInfo.best_k[1] = vecKP[pointwiseIndex]; + runInfoCount++; + if (depthwise_pointwise_convolution_transform_filter_bytes_mali(dwFilterDesc, + pwFilterDesc, &runInfo, &dwFilterMemDesc, &pwFilterMemDesc, + &bytes) != SUCCESS) { + continue; + } + maxBytes = (maxBytes < bytes) ? bytes : maxBytes; + if (depthwise_pointwise_convolution_infer_forward_tmp_bytes_mali(inputDesc, + dwFilterDesc, pwFilterDesc, outputDesc, convParamSpec, &runInfo, + &bytes) != SUCCESS) { + continue; + } + maxBytes = (maxBytes < bytes) ? bytes : maxBytes; + maxDwFilterSize = (maxDwFilterSize < dwFilterMemDesc.byteSize) + ? dwFilterMemDesc.byteSize + : maxDwFilterSize; + maxPwFilterSize = (maxPwFilterSize < pwFilterMemDesc.byteSize) + ? pwFilterMemDesc.byteSize + : maxPwFilterSize; + dwFilterMemDescs.push_back(dwFilterMemDesc); + pwFilterMemDescs.push_back(pwFilterMemDesc); + runInfos.push_back(runInfo); + } + runInfoEnd[i][j] = runInfoCount; + be = (i == 0) ? 0 : algoNumIndexP[i - 1]; + end = algoNumIndexP[i]; + } + } + + TensorDesc dwBiasDesc = tensor1d(dt, ic); + TensorDesc pwBiasDesc = tensor1d(dt, pfn); + U32 dwStride[3] = {(ic + 3) / 4, 1, 1}; + CHECK_STATUS(gclmem_set_desc_padding( + &dwBias->desc, dwStride, offset, dt, DF_NHWC, GCL_MEM_IMG_1D, CL_MEM_READ_WRITE)); + U32 pwStride[3] = {(pfn + 3) / 4, 1, 1}; + CHECK_STATUS(gclmem_set_desc_padding( + &pwBiasImg->desc, pwStride, offset, dt, DF_NHWC, GCL_MEM_IMG_1D, CL_MEM_READ_WRITE)); + pwStride[0] = (pfn + 7) / 8 * 8; + CHECK_STATUS(gclmem_set_desc_padding( + &pwBiasBuf->desc, pwStride, offset, dt, DF_NHWC, GCL_MEM_BUF, CL_MEM_READ_WRITE)); + + algosNum = runInfos.size(); + if (algosNum == 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + dwFilterMemDescs[0].byteSize = maxDwFilterSize; + pwFilterMemDescs[0].byteSize = maxPwFilterSize; + input->desc = inputMemDesc; + output->desc = outputMemDesc; + dwFilter->desc = dwFilterMemDescs[0]; + pwFilter->desc = pwFilterMemDescs[0]; + tmpbuf->desc.byteSize = maxBytes; + gcl_create_memory(handle, input); + gcl_create_memory(handle, output); + gcl_create_memory(handle, dwFilter); + gcl_create_memory(handle, pwFilter); + gcl_create_memory(handle, dwBias); + gcl_create_memory(handle, pwBiasImg); + gcl_create_memory(handle, pwBiasBuf); + if (maxBytes) { + gcl_create_memory(handle, tmpbuf); + } + + double minTimeDepthwise[2] = {DBL_MAX, DBL_MAX}; + double minTimePointwise[2] = {DBL_MAX, DBL_MAX}; + ForwardRunInfoMali bestRunInfo[2]; + U32 runKernelBe = 0; + U32 runKernelEnd = 0; + + for (U32 i = 0; i < 2; i++) { + U32 depthwiseBe = runInfoBe[i][0]; + U32 depthwiseEnd = runInfoEnd[i][0]; + U32 pointwiseBe = runInfoBe[i][1]; + U32 pointwiseEnd = runInfoEnd[i][1]; + GCLMem_t pwBias = (i == 0) ? 
pwBiasImg : pwBiasBuf; + for (U32 j = depthwiseBe; j < depthwiseEnd; j++) { + if (depthwise_pointwise_convolution_mali(handle, inputDesc, input, dwFilterDesc, + pwFilterDesc, dwFilter, pwFilter, convParamSpec, &runInfos[j], dwBiasDesc, + pwBiasDesc, dwBias, pwBias, maxBytes, tmpbuf, outputDesc, output, + depthwiseActivationMode, pointwiseActivationMode) == SUCCESS) { + runKernelEnd = handle->kernelVec->size(); + gcl_run_kernelVec_timing(handle, runKernelBe, runKernelBe + 1); + if (minTimeDepthwise[i] > handle->t_execute) { + minTimeDepthwise[i] = handle->t_execute; + bestRunInfo[i].algorithm = runInfos[j].algorithm; + bestRunInfo[i].best_w[0] = runInfos[j].best_w[0]; + bestRunInfo[i].best_c[0] = runInfos[j].best_c[0]; + bestRunInfo[i].best_k[0] = runInfos[j].best_k[0]; + } + runKernelBe = runKernelEnd; + } + } + for (U32 j = pointwiseBe; j < pointwiseEnd; j++) { + if (depthwise_pointwise_convolution_mali(handle, inputDesc, input, dwFilterDesc, + pwFilterDesc, dwFilter, pwFilter, convParamSpec, &runInfos[j], dwBiasDesc, + pwBiasDesc, dwBias, pwBias, maxBytes, tmpbuf, outputDesc, output, + depthwiseActivationMode, pointwiseActivationMode) == SUCCESS) { + runKernelEnd = handle->kernelVec->size(); + gcl_run_kernelVec_timing(handle, runKernelEnd - 1, runKernelEnd); + if (minTimePointwise[i] > handle->t_execute) { + minTimePointwise[i] = handle->t_execute; + bestRunInfo[i].algorithm = runInfos[j].algorithm; + bestRunInfo[i].best_w[1] = runInfos[j].best_w[1]; + bestRunInfo[i].best_c[1] = runInfos[j].best_c[1]; + bestRunInfo[i].best_k[1] = runInfos[j].best_k[1]; + } + runKernelBe = runKernelEnd; + } + } + } + + double minTimeDirect = minTimeDepthwise[0] + minTimePointwise[0]; + double minTimeGemm = minTimeDepthwise[1] + minTimePointwise[1]; + if (minTimeDirect == DBL_MAX && minTimeGemm == DBL_MAX) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (minTimeDirect > minTimeGemm) { + bestRunInfo[0] = bestRunInfo[1]; + } + + *forwardRunInfo = bestRunInfo[0]; + CHECK_STATUS(gcl_finish(handle)); + gcl_destroy_gclmem(input); + gcl_destroy_gclmem(dwFilter); + gcl_destroy_gclmem(pwFilter); + gcl_destroy_gclmem(output); + gcl_destroy_gclmem(dwBias); + gcl_destroy_gclmem(pwBiasImg); + gcl_destroy_gclmem(pwBiasBuf); + gcl_destroy_gclmem(tmpbuf); + runInfos.clear(); + dwFilterMemDescs.clear(); + pwFilterMemDescs.clear(); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_off_queue_profiling(handle)); + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE depthwise_pointwise_convolution_transform_filter_bytes_mali(TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemDwFilterDesc, + GCLMemDesc_t gclmemPwFilterDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (dwFilterDesc.dt) { + case DT_F16: { + ret = depthwise_pointwise_convolution_transform_filter_bytes_mali_fp16(dwFilterDesc, + pwFilterDesc, forwardRunInfo, gclmemDwFilterDesc, gclmemPwFilterDesc, bytes); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_pointwise_convolution_transform_filter_mali(GCLHandle_t handle, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + GCLMem_t dwFilter, + GCLMem_t pwFilter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *dwfltmemDesc, + TensorDesc *pwfltmemDesc, + GCLMem_t dwfltmem, + GCLMem_t pwfltmem) +{ + EE ret = SUCCESS; + switch (dwFilterDesc.dt) { + case DT_F16: { + ret = depthwise_pointwise_convolution_transform_filter_mali_fp16(handle, dwFilterDesc, + 
pwFilterDesc, dwFilter, pwFilter, forwardRunInfo, dwfltmemDesc, pwfltmemDesc, + dwfltmem, pwfltmem); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_pointwise_convolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = depthwise_pointwise_convolution_infer_forward_tmp_bytes_mali_fp16(inputDesc, + dwFilterDesc, pwFilterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_pointwise_convolution_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = depthwise_pointwise_convolution_mali_fp16(handle, inputDesc, input, dwFilterDesc, + pwFilterDesc, dwFilter, pwFilter, convParamSpec, forwardRunInfo, dwBiasDesc, + pwBiasDesc, dwBias, pwBias, tmpBytes, tmpBuf, outputDesc, output, + depthwiseActivationMode, pointwiseActivationMode); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/eltwise.cpp b/compute/tensor/src/gpu/mali/eltwise.cpp new file mode 100644 index 00000000..fd5211ea --- /dev/null +++ b/compute/tensor/src/gpu/mali/eltwise.cpp @@ -0,0 +1,202 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
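The eltwise implementation that follows has to walk several operands with one kernel, so its gcl_mem_desc_align helper pads every input descriptor up to the elementwise maximum stride and offset. A simplified sketch of that idea, using a stand-in Desc struct rather than the real GCLMemDesc:

#include <algorithm>
#include <vector>

// Illustrative only: pad all descriptors up to a common, widest layout so one
// set of strides is valid for every eltwise operand.
struct Desc {
    unsigned stride[3];
    unsigned offset[3];
};

static void alignDescs(std::vector<Desc> &descs)
{
    if (descs.empty()) {
        return;
    }
    Desc m = descs[0];
    for (const Desc &d : descs) {
        for (int i = 0; i < 3; i++) {
            m.stride[i] = std::max(m.stride[i], d.stride[i]);
            m.offset[i] = std::max(m.offset[i], d.offset[i]);
        }
    }
    for (Desc &d : descs) {
        d = m;  // every operand now shares the widest padded layout
    }
}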
+
+#include "sys.h"
+#include "types.h"
+#include "tensor_desc.h"
+#include "error.h"
+#include "gpu/mali/tensor_computing_mali.h"
+#include "gpu/mali/fp16/eltwise_mali_fp16.h"
+
+inline void gcl_mem_desc_align(U32 size, DataType dt, GCLMemDesc_t desc)
+{
+    U32 s0 = desc[0].stride[0];
+    U32 s1 = desc[0].stride[1];
+    U32 s2 = desc[0].stride[2];
+    U32 off0 = desc[0].offset[0];
+    U32 off1 = desc[0].offset[1];
+    U32 off2 = desc[0].offset[2];
+    for (U32 i = 1; i < size; i++) {
+        s0 = (s0 >= desc[i].stride[0]) ? s0 : desc[i].stride[0];
+        s1 = (s1 >= desc[i].stride[1]) ? s1 : desc[i].stride[1];
+        s2 = (s2 >= desc[i].stride[2]) ? s2 : desc[i].stride[2];
+        off0 = (off0 >= desc[i].offset[0]) ? off0 : desc[i].offset[0];
+        off1 = (off1 >= desc[i].offset[1]) ? off1 : desc[i].offset[1];
+        off2 = (off2 >= desc[i].offset[2]) ? off2 : desc[i].offset[2];
+    }
+    U32 num = s0 * s1 * s2 * 4;
+    U32 byteSize = num * bytesOf(dt);
+    for (U32 i = 0; i < size; i++) {
+        desc[i].stride[0] = s0;
+        desc[i].stride[1] = s1;
+        desc[i].stride[2] = s2;
+        desc[i].offset[0] = off0;
+        desc[i].offset[1] = off1;
+        desc[i].offset[2] = off2;
+        desc[i].num = num;
+        desc[i].byteSize = byteSize;
+    }
+}
+EE eltwise_infer_output_size_mali(std::vector<TensorDesc> inputDesc,
+    TensorDesc *outputDesc,
+    GCLMemDesc_t gclmemInputDesc,
+    GCLMemDesc_t gclmemOutputDesc)
+{
+    /*tensorDesc record cpu org data format info*/
+    /*gclmemDesc record gpu trans data format info*/
+    U32 arrayDimMax = 0;
+    bool sameDesc = eltwise_same_desc(inputDesc, &arrayDimMax);
+    if (outputDesc) {
+        *outputDesc = inputDesc[arrayDimMax];
+    }
+
+    DataType idt;
+    DataFormat idf;
+    U32 iw, ih, ic, in;
+    tensorSelectGet(inputDesc[arrayDimMax], &idt, &idf, &in, &ic, &ih, &iw);
+
+    if (sameDesc) {
+        U32 size = inputDesc.size();
+        if (gclmemInputDesc) {
+            bool inputIsModelInput = true;
+            bool inputIsAllNCHW = true;
+
+            for (U32 i = 0; i < size; i++) {
+                if (gclmemInputDesc[i].byteSize > 0) {
+                    inputIsModelInput = false;
+                }
+                if (gclmemInputDesc[i].memFormat != DF_NCHW) {
+                    inputIsAllNCHW = false;
+                }
+            }
+
+            if (inputIsAllNCHW && !inputIsModelInput) {
+                CHECK_STATUS(infer_gclmem_desc_nchw(
+                    iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc));
+                for (U32 i = 0; i < size; i++) {
+                    DataType tdt;
+                    U32 tw, th, tc;
+                    tensorSelectGet(inputDesc[i], &tdt, NULL, NULL, &tc, &th, &tw);
+                    CHECK_STATUS(infer_gclmem_desc_nchw(
+                        tw, th, tc, 0, 0, tw, th, tc, tdt, tdt, &gclmemInputDesc[i], NULL));
+                }
+            } else {
+                for (U32 i = 0; i < size; i++) {
+                    DataType tdt;
+                    U32 tw, th, tc;
+                    tensorSelectGet(inputDesc[i], &tdt, NULL, NULL, &tc, &th, &tw);
+                    CHECK_STATUS(infer_gclmem_desc_ncwhc4(
+                        tw, th, tc, 0, 0, tw, th, tc, tdt, tdt, &gclmemInputDesc[i], NULL));
+                }
+                CHECK_STATUS(infer_gclmem_desc_ncwhc4(
+                    iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, NULL, gclmemOutputDesc));
+            }
+            gcl_mem_desc_align(size, idt, gclmemInputDesc);
+        }
+        return SUCCESS;
+    } else {
+        if (inputDesc.size() > 2) {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+        if (gclmemInputDesc) {
+            CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw, ih, ic, 0, 0, iw, ih, ic, idt, idt,
+                &gclmemInputDesc[arrayDimMax], gclmemOutputDesc));
+            tensorSelectGet(inputDesc[1 - arrayDimMax], &idt, NULL, &in, &ic, &ih, &iw);
+            if (gclmemInputDesc[1 - arrayDimMax].byteSize == 0 ||
+                gclmemInputDesc[1 - arrayDimMax].memFormat == DF_NCHW) {
+                CHECK_STATUS(infer_gclmem_desc_nchw(iw, ih, ic, 0, 0, iw, ih, ic, idt, idt,
+                    &gclmemInputDesc[1 - arrayDimMax], NULL));
+            } else {
+                CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw, ih, ic, 0, 0, iw, ih, ic, idt, idt,
+                    &gclmemInputDesc[1 - arrayDimMax], NULL));
+            }
+        }
+        return SUCCESS;
+    }
+    return NOT_SUPPORTED;
+}
+
+inline EE eltwise_checkpara_mali(GCLHandle_t handle,
+    std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    EltwiseParamSpec eltwiseDesc,
+    TensorDesc outputDesc,
+    GCLMem_t output)
+{
+    if (handle == nullptr || nullptr == output) {
+        return NULL_POINTER;
+    }
+    for (auto it : input) {
+        GCLMem_t ptr = (GCLMem_t)it;
+        if (ptr == nullptr) {
+            return NULL_POINTER;
+        }
+    }
+    EltwiseMode eltwiseMode = eltwiseDesc.elt_mode;
+    U32 arrayDimMax = 0;
+    bool sameDesc = eltwise_same_desc(inputDesc, &arrayDimMax);
+    if (sameDesc) {
+        for (auto it : input) {
+            if (((GCLMem_t)(it))->desc.memFormat != output->desc.memFormat) {
+                return NOT_SUPPORTED;
+            }
+        }
+        for (auto it : inputDesc) {
+            // if(it.df != outputDesc.df) return NOT_SUPPORTED;
+            if (it.dims[0] != outputDesc.dims[0]) {
+                return NOT_SUPPORTED;
+            }
+            if (it.dims[1] != outputDesc.dims[1]) {
+                return NOT_SUPPORTED;
+            }
+            if (it.dims[2] != outputDesc.dims[2]) {
+                return NOT_SUPPORTED;
+            }
+            if (it.dims[3] != outputDesc.dims[3]) {
+                return NOT_SUPPORTED;
+            }
+        }
+    } else {
+        if (inputDesc.size() > 2) {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+    }
+    if (outputDesc.df != DF_NCHW && outputDesc.df != DF_MKT) {
+        return NOT_SUPPORTED;
+    }
+    if (eltwiseMode != ELTWISE_SUM && eltwiseMode != ELTWISE_MAX && eltwiseMode != ELTWISE_PROD) {
+        return NOT_SUPPORTED;
+    }
+    return SUCCESS;
+}
+
+EE eltwise_mali(GCLHandle_t handle,
+    std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    EltwiseParamSpec eltwiseDesc,
+    TensorDesc outputDesc,
+    GCLMem_t output)
+{
+    EE ret = SUCCESS;
+    CHECK_STATUS(eltwise_checkpara_mali(handle, inputDesc, input, eltwiseDesc, outputDesc, output));
+    switch (inputDesc[0].dt) {
+        case DT_F16: {
+            ret = eltwise_mali_fp16(handle, inputDesc, input, outputDesc, output, eltwiseDesc);
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/gpu/mali/embedding.cpp b/compute/tensor/src/gpu/mali/embedding.cpp
new file mode 100644
index 00000000..62f0a063
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/embedding.cpp
@@ -0,0 +1,86 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
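The embedding operator below selects one weight row per token id. A CPU reference sketch of the same semantics; embedLookup is illustrative only, since the real lookup runs in an OpenCL kernel over NCWHC4-packed memory:

#include <cstddef>
#include <vector>

// Output row t is the weight row selected by token id ids[t];
// numOutput is the embedding width (p.num_output in the code below).
static std::vector<float> embedLookup(
    const std::vector<unsigned> &ids, const std::vector<float> &weight, unsigned numOutput)
{
    std::vector<float> out(ids.size() * numOutput);
    for (std::size_t t = 0; t < ids.size(); t++) {
        for (unsigned k = 0; k < numOutput; k++) {
            out[t * numOutput + k] = weight[ids[t] * numOutput + k];
        }
    }
    return out;
}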
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/embedding_mali_fp16.h" + +EE embedding_infer_output_size_mali(TensorDesc inputDesc, + EmbedParamSpec p, + DataType dt, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + DataType idt; + DataFormat df; + U32 batch, step; + CHECK_REQUIREMENT(tensorIs2d(inputDesc)); + CHECK_STATUS(tensor2dGet(inputDesc, &idt, &df, &batch, &step)); + if (outputDesc) { + *outputDesc = tensor3df(dt, DF_MKT, batch, p.num_output, step); + } + + if (df == DF_NORMAL) { + U32 iw = step; + U32 ih = batch; + U32 ic = 1; + CHECK_STATUS( + infer_gclmem_desc_nchw(iw, ih, ic, 0, 0, 0, 0, 0, idt, dt, gclmemInputDesc, NULL)); + + U32 m = 1; + U32 ow, oh, oc; + map_nlp_mkt_to_ncwhc4(m, p.num_output, step, &ow, &oh, &oc); + /*oc has been divided 4 in map_nlp_xxx, need to mul 4 for infer_xxx_ncwhc4*/ + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + 0, 0, 0, 0, 0, ow, oh, oc * 4, idt, dt, NULL, gclmemOutputDesc)); + return SUCCESS; + } + return NOT_SUPPORTED; +} + +inline EE embedding_checkpara_mali( + GCLHandle_t handle, GCLMem_t input, GCLMem_t weight, GCLMem_t output) +{ + if (nullptr == handle || nullptr == input || nullptr == weight || nullptr == output) { + return NULL_POINTER; + } + return SUCCESS; +} + +EE embedding_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc weightDesc, + GCLMem_t weight, + EmbedParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(embedding_checkpara_mali(handle, input, weight, output)); + switch (outputDesc.dt) { + case DT_F16: { + ret = embedding_mali_fp16( + handle, inputDesc, input, weightDesc, weight, p, outputDesc, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.cpp new file mode 100644 index 00000000..1c80e604 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.cpp @@ -0,0 +1,106 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
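The activation dispatcher below only assembles a kernel name (for example "activation_relu61" or "activation_hswish1" once the H suffix is appended) and forwards strides; the per-element math lives in the OpenCL kernels. Scalar references for two of the less common modes, assuming the usual h-swish/h-sigmoid definitions from MobileNetV3 (illustrative, not the kernel source):

#include <algorithm>

static float relu6(float x)
{
    return std::min(std::max(x, 0.0f), 6.0f);
}

static float hswish(float x)
{
    return x * relu6(x + 3.0f) / 6.0f;  // h-swish
}

static float hsigmoid(float x)
{
    return relu6(x + 3.0f) / 6.0f;  // h-sigmoid
}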
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/activation_mali_fp16.h" + +inline EE activation_checkpara_mali_fp16(TensorDesc inputDesc) +{ + if (inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE activation_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(inputDesc); + U32 ow, oh, oc, on; + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + U32 iw_str, ih_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + cl_mem inbuf, outbuf; + inbuf = input->mem; + outbuf = output->mem; + + char modeName[16]; + switch (activationMode) { + case ACTIVATION_NULL: + return SUCCESS; + case ACTIVATION_RELU: + strcpy(modeName, "relu"); + break; + case ACTIVATION_RELU6: + strcpy(modeName, "relu6"); + break; + case ACTIVATION_H_SIGMOID: + strcpy(modeName, "hsigmoid"); + break; + case ACTIVATION_H_SWISH: + strcpy(modeName, "hswish"); + break; + case ACTIVATION_GELU: + strcpy(modeName, "gelu"); + break; + case ACTIVATION_TANH: + strcpy(modeName, "tanh"); + break; + case ACTIVATION_SIGMOID: + strcpy(modeName, "sigmoid"); + break; + default: + return NOT_SUPPORTED; + } + char kernelName[128]; + U32 H = 1; + sprintf(kernelName, "activation_%s%d", modeName, H); + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + U32 cd4 = (oc + 3) / 4; + U32 ce4 = (oc & 3) == 0 ? 4 : (oc & 3); + CHECK_STATUS(gcl_set_kernelArgs(kernel, oh, ow, cd4, ce4, ih_str, iw_str, ih_off, iw_off, + oh_str, ow_str, oh_off, ow_off, inbuf, outbuf)); + U32 gs[3] = {oh, ow, (oc + 3) / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif + return SUCCESS; +} + +EE activation_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + CHECK_STATUS(activation_checkpara_mali_fp16(inputDesc)); + if (input->mem != output->mem) { + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + } + CHECK_STATUS( + activation_core_mali_fp16(handle, inputDesc, input, outputDesc, output, activationMode)); + return SUCCESS; +} diff --git a/tensor_computing/src/gpu/mali/fp16/activation_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.h similarity index 78% rename from tensor_computing/src/gpu/mali/fp16/activation_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.h index 8bc9fdb6..ca8af3f0 100644 --- a/tensor_computing/src/gpu/mali/fp16/activation_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.h @@ -14,15 +14,14 @@ #ifndef _ACTIVATION_MALI_FP16 #define _ACTIVATION_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" +#include "types.h" #include "tensor_computing_type.h" -EE activation_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode activationMode); +EE activation_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); #endif diff --git 
a/compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.cpp new file mode 100644 index 00000000..ceef38be --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.cpp @@ -0,0 +1,204 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/argmax_mali_fp16.h" +#define get_thread_num(len, maxThreadNum, threadNum) \ + { \ + threadNum = ((len + 7) / 8 < maxThreadNum) ? (len + 7) / 8 : maxThreadNum; \ + } + +inline EE argmax_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + if (outputDesc.dt != DT_U32) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE argmax_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + I32 axis, + GCLMem_t tmpbuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(outputDesc); + if (axis < 0) { + axis += inputDesc.nDims; + } + axis = inputDesc.nDims - 1 - axis; + if (axis == 0) { + DataType dt = inputDesc.dt; + U32 iw, ih, ic; + U32 inDims = inputDesc.nDims; + iw = inputDesc.dims[0]; + ih = (inDims > 1) ? inputDesc.dims[1] : 1; + ic = (inDims > 2) ? 
inputDesc.dims[2] : 1; + U32 iw_str, ih_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, &iw_off, &ih_off); + U32 threadNum; + Kernel kernel; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Mem inv1024 = input->mem; + Mem ini1024 = input->mem; + Mem inv128 = input->mem; + Mem ini128 = input->mem; + Mem inv1 = input->mem; + Mem ini1 = input->mem; + Mem outv1024, outi1024, outv128, outi128; + char kernelName[128]; + char kernelNameIndex[128]; + sprintf(kernelName, "argmax_x"); + sprintf(kernelNameIndex, "argmax_x_index"); + bool use_index = false; + U32 offset = 0; + U32 len = iw; + get_thread_num(len, 1024, threadNum); + if (threadNum > 128) { + U32 outNum = 1024 * ih * ic; + U32 outvSize = outNum * bytesOf(dt); + U32 outiSize = outNum * bytesOf(DT_U32); + ow_str = threadNum; + oh_str = ih; + ow_off = 0; + oh_off = 0; + CHECK_STATUS(gcl_create_sub_buffer(outvSize, &offset, tmpbuf, &outv1024)); + CHECK_STATUS(gcl_create_sub_buffer(outiSize, &offset, tmpbuf, &outi1024)); + gs[0] = threadNum; + gs[1] = ih; + gs[2] = ic; + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, ow_str, oh_str, + ow_off, oh_off, len, gs[0], gs[1], inv1024, ini1024, outv1024, outi1024)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); + inv128 = outv1024; + ini128 = outi1024; + iw_str = ow_str; + ih_str = oh_str; + iw_off = ow_off; + ih_off = oh_off; + use_index = true; + len = threadNum; +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + CHECK_STATUS(gcl_print_buffer(handle, inv1024, input->desc.num, "argmax_input")); + CHECK_STATUS(gcl_print_buffer(handle, outv1024, outNum, "argmax_output_value")); + CHECK_STATUS(gcl_print_buffer(handle, outi1024, outNum, "argmax_output_value")); +#endif + } + + get_thread_num(len, 128, threadNum); + if (threadNum > 1) { + U32 outNum = 128 * ih * ic; + U32 outvSize = outNum * bytesOf(dt); + U32 outiSize = outNum * bytesOf(DT_U32); + ow_str = threadNum; + oh_str = ih; + ow_off = 0; + oh_off = 0; + CHECK_STATUS(gcl_create_sub_buffer(outvSize, &offset, tmpbuf, &outv128)); + CHECK_STATUS(gcl_create_sub_buffer(outiSize, &offset, tmpbuf, &outi128)); + gs[0] = threadNum; + gs[1] = ih; + gs[2] = ic; + if (use_index) { + CHECK_STATUS(gcl_create_kernel(handle, kernelNameIndex, &kernel)); + } else { + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + } + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, ow_str, oh_str, + ow_off, oh_off, len, gs[0], gs[1], inv128, ini128, outv128, outi128)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); + inv1 = outv128; + ini1 = outi128; + iw_str = ow_str; + ih_str = oh_str; + iw_off = ow_off; + ih_off = oh_off; + use_index = true; + len = threadNum; +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + CHECK_STATUS(gcl_print_buffer(handle, outv128, outNum, "argmax_output_index")); + CHECK_STATUS(gcl_print_buffer(handle, outi128, outNum, "argmax_output_value")); +#endif + } + + gs[0] = 1; + gs[1] = ih; + gs[2] = ic; + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + if (use_index) { + CHECK_STATUS(gcl_create_kernel(handle, kernelNameIndex, &kernel)); + } else { + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + } + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, ow_str, oh_str, + ow_off, oh_off, len, 
gs[0], gs[1], inv1, ini1, output->mem, output->mem)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + if (use_index) { + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelNameIndex)); + } else { + CHECK_STATUS(gcl_print_memory(handle, input, "argmax_input")); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + } + CHECK_STATUS(gcl_print_memory(handle, output, "argmax_output")); +#endif + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE argmax_infer_forward_tmp_bytes_mali_fp16( + TensorDesc inputDesc, I32 axis, TensorDesc outputDesc, U32 *bytes) +{ + UNUSED(axis); + UNUSED(outputDesc); + DataType dt = inputDesc.dt; + U32 iw, ih, ic; + U32 inDims = inputDesc.nDims; + iw = inputDesc.dims[0]; + ih = (inDims > 1) ? inputDesc.dims[1] : 1; + ic = (inDims > 2) ? inputDesc.dims[2] : 1; + U32 size = 1024 * ih * ic * bytesOf(dt); + size += 1024 * ih * ic * bytesOf(DT_U32); + size += (128 * ih * ic * bytesOf(dt) + 1023) / 1024 * 1024; + size += (128 * ih * ic * bytesOf(DT_U32) + 1023) / 1024 * 1024; + *bytes = size; + return SUCCESS; +} + +EE argmax_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + I32 axis, + GCLMem_t tmpbuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + CHECK_STATUS(argmax_checkpara_mali_fp16(inputDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(argmax_core_mali_fp16(handle, inputDesc, input, axis, tmpbuf, outputDesc, output)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.h new file mode 100644 index 00000000..76e52ab3 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.h @@ -0,0 +1,31 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
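[Editor's note] The scratch sizing in argmax_infer_forward_tmp_bytes_mali_fp16 above reserves value and index buffers for both reduction stages: up to 1024 partial results per (h, c) slice in the first pass, up to 128 in the second, with the second-stage sub-buffers rounded up to 1 KB alignment. The same arithmetic restated as standalone C++ (assuming 2-byte fp16 values and 4-byte u32 indices):

#include <cstdint>

// Scratch bytes for the two-pass argmax reduction: stage-1 value/index
// partials (1024 per slice), then stage-2 partials (128 per slice) padded
// to 1024-byte sub-buffer boundaries, matching the code above.
static uint32_t argmax_tmp_bytes(uint32_t ih, uint32_t ic)
{
    const uint32_t f16 = 2, u32 = 4;              // element sizes in bytes
    uint32_t size = 1024 * ih * ic * f16;         // stage-1 values
    size += 1024 * ih * ic * u32;                 // stage-1 indices
    size += (128 * ih * ic * f16 + 1023) / 1024 * 1024;  // stage-2 values, aligned
    size += (128 * ih * ic * u32 + 1023) / 1024 * 1024;  // stage-2 indices, aligned
    return size;
}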
+ +#ifndef _ARGMAX_MALI_FP16 +#define _ARGMAX_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE argmax_infer_forward_tmp_bytes_mali_fp16( + TensorDesc inputDesc, I32 axis, TensorDesc outputDesc, U32 *bytes); + +EE argmax_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + I32 axis, + GCLMem_t tmpbuf, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.cpp new file mode 100644 index 00000000..99a2400d --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.cpp @@ -0,0 +1,132 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
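[Editor's note] The bilateral-slice path that follows samples a low-resolution affine grid at full image resolution; the host first derives the x/y scale ratios and the grid depth per coefficient slot. A small sketch of that derivation (plain C++ mirroring the fields read below; struct and function names are illustrative):

#include <cstdint>

struct GridRatios {
    float scale_x;   // grid cells per image pixel in x (gw / iw)
    float scale_y;   // grid cells per image pixel in y (gh / ih)
    uint32_t depth;  // grid depth per coefficient slot (gc / coefficient_len)
};

static GridRatios grid_ratios(
    uint32_t iw, uint32_t ih, uint32_t gw, uint32_t gh, uint32_t gc, uint32_t coefficient_len)
{
    GridRatios r;
    r.scale_x = (float)gw / iw;
    r.scale_y = (float)gh / ih;
    r.depth = gc / coefficient_len;  // assumes gc is a multiple of coefficient_len
    return r;
}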
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h" + +inline EE bilateral_slice_apply_checkpara_mali_fp16( + TensorDesc inputDesc, TensorDesc guideDesc, TensorDesc gridDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != guideDesc.dt || inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + if (inputDesc.dt != gridDesc.dt || inputDesc.dt != outputDesc.dt) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE bilateral_slice_apply_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc guideDesc, + const GCLMem_t guide, + TensorDesc gridDesc, + const GCLMem_t grid, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(guideDesc); + UNUSED(forwardRunInfo); + U32 iw, ih, ic, in; + U32 gw, gh, gc, gn; + U32 ow, oh, oc, on; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + tensorSelectGet(gridDesc, NULL, NULL, &gn, &gc, &gh, &gw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 coe = bilateralSliceApplyParamSpec.coefficient_len; + BilateralSliceApplyMode mode = bilateralSliceApplyParamSpec.mode; + // bool has_offset = bilateralSliceApplyParamSpec.has_offset; + U32 dep = gc / coe; + U32 gcw = gc * gw; + U32 wh = iw * ih; + F32 scale_x = (F32)gw / iw; + F32 scale_y = (F32)gh / ih; + Mem inbuf, gridbuf, guidebuf, outbuf, gridTran; + inbuf = input->mem; + gridbuf = grid->mem; + outbuf = output->mem; + gridTran = tmpBuf->mem; + if (mode == BSliceApply_NULL) { + guidebuf = guide->mem; + } else { + guidebuf = inbuf; + } + + U32 gs0[3] = {gc / 4, gw, ih}; + U32 ls0[3] = {0, 0, 0}; + U32 dim0 = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "bilateral_slice_apply_pre", &kernel)); + CHECK_STATUS( + gcl_set_kernelArgs(kernel, gh, gc, gcw, gs0[0], gs0[1], scale_y, gridbuf, gridTran)); + gcl_set_kernelVec(handle, kernel, dim0, gs0, ls0, "bilateral_slice_apply_pre"); + +#ifdef _DEBUG + CHECK_STATUS( + gcl_run_kernel_profiling(handle, kernel, dim0, gs0, ls0, "bilateral_slice_apply_pre")); + CHECK_STATUS(gcl_print_memory(handle, grid, "bilateral_slice_apply_grid")); +#endif + char kernelname[128]; + if (mode == BSliceApply_CONV) { + sprintf(kernelname, "bilateral_slice_apply_c12_conv"); + } else { + sprintf(kernelname, "bilateral_slice_apply_c12"); + } + U32 gs[2] = {ow, oh}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw, wh, gc, gw, gh, gcw, dep, coe, gs[0], gs[1], + scale_x, scale_y, guidebuf, gridTran, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel_profiling(handle, kernel, dim, gs, ls, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, input, "bilateral_slice_apply_input")); + CHECK_STATUS(gcl_print_memory(handle, output, "bilateral_slice_apply_output")); + if (mode == BSliceApply_NULL) { + CHECK_STATUS(gcl_print_memory(handle, guide, "bilateral_slice_apply_guide")); + } +#endif + return SUCCESS; +} + +EE bilateral_slice_apply_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc guideDesc, + const GCLMem_t guide, + TensorDesc gridDesc, + const GCLMem_t grid, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc 
outputDesc, + GCLMem_t output) +{ + UNUSED(tmpBytes); + CHECK_STATUS( + bilateral_slice_apply_checkpara_mali_fp16(inputDesc, guideDesc, gridDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(bilateral_slice_apply_core_mali_fp16(handle, inputDesc, input, guideDesc, guide, + gridDesc, grid, bilateralSliceApplyParamSpec, forwardRunInfo, tmpBuf, outputDesc, output)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h new file mode 100644 index 00000000..538b9790 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h @@ -0,0 +1,34 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _BILATERAL_SLICE_APPLY_MALI_FP16 +#define _BILATERAL_SLICE_APPLY_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE bilateral_slice_apply_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc guideDesc, + const GCLMem_t guide, + TensorDesc gridDesc, + const GCLMem_t grid, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/channel_resize_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/channel_resize_mali_fp16.cpp new file mode 100644 index 00000000..d9c4c29d --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/channel_resize_mali_fp16.cpp @@ -0,0 +1,92 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/channel_resize_mali_fp16.h" + +inline EE channel_resize_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE channel_resize_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ChannelResizeParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + U32 iw, ih, ic, in, oc; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + tensorSelectGet(outputDesc, NULL, NULL, NULL, &oc, NULL, NULL); + U32 iw_str, ih_str, ic_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + cl_mem inbuf, outbuf; + inbuf = input->mem; + outbuf = output->mem; + DataFormat imf = input->desc.memFormat; + DataFormat omf = output->desc.memFormat; + U32 dim = 3; + Kernel kernel; + char kernelName[128]; + U32 gs[3] = {ih, iw, (U32)(p.channel_after + 3) / 4}; + U32 ls[3] = {0, 0, 0}; + sprintf(kernelName, "channel_resize"); + if (imf == DF_NCHW && omf == DF_NCHW) { + gs[0] = (iw + 3) / 4; + gs[1] = ih; + gs[2] = p.channel_after; + sprintf(kernelName, "channel_resize_nchw"); + } + + if (imf == DF_NCHW && omf == DF_NCWHC4) { + gs[0] = (iw + 3) / 4; + gs[1] = ih; + gs[2] = (p.channel_after + 3) / 4; + sprintf(kernelName, "channel_resize_nchw_ncwhc4"); + } + + if (imf == DF_NCWHC4 && omf == DF_NCHW) { + gs[0] = ih; + gs[1] = (iw + 3) / 4; + gs[2] = (p.channel_after + 3) / 4; + sprintf(kernelName, "channel_resize_ncwhc4_nchw"); + } + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iw_str, ic_str, ih_off, iw_off, oh_str, ow_str, + oh_off, ow_off, p.channel_before, p.channel_after, iw, gs[0], gs[1], inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif + return SUCCESS; +} + +EE channel_resize_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ChannelResizeParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + CHECK_STATUS(channel_resize_checkpara_mali_fp16(inputDesc, outputDesc)); + CHECK_STATUS(channel_resize_core_mali_fp16(handle, inputDesc, input, p, outputDesc, output)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/channel_resize_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/channel_resize_mali_fp16.h new file mode 100644 index 00000000..c020a482 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/channel_resize_mali_fp16.h @@ -0,0 +1,26 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _CHANNEL_RESIZE_MALI_FP16 +#define _CHANNEL_RESIZE_MALI_FP16 +#include "sys.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE channel_resize_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ChannelResizeParamSpec p, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/clip_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/clip_mali_fp16.cpp new file mode 100644 index 00000000..7c94b353 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/clip_mali_fp16.cpp @@ -0,0 +1,80 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
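[Editor's note] The clip operator implemented next clamps every element to [p.min, p.max]. A CPU reference for what the "clip" kernel computes per element (a sketch for verification, not the GPU code):

#include <algorithm>
#include <cstddef>
#include <vector>

// y[i] = min(max(x[i], lo), hi), with lo/hi playing the role of the
// p.min/p.max arguments handed to the "clip" kernel below.
static std::vector<float> clip_reference(const std::vector<float> &x, float lo, float hi)
{
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        y[i] = std::min(std::max(x[i], lo), hi);
    }
    return y;
}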
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/clip_mali_fp16.h" + +inline EE clip_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE clip_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ClipParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(outputDesc); + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + U32 iw_str, ih_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + ih_str = input->desc.stride[0]; + iw_str = input->desc.stride[1]; + ih_off = input->desc.offset[0]; + iw_off = input->desc.offset[1]; + oh_str = output->desc.stride[0]; + ow_str = output->desc.stride[1]; + oh_off = output->desc.offset[0]; + ow_off = output->desc.offset[1]; + cl_mem inbuf, outbuf; + inbuf = input->mem; + outbuf = output->mem; + + U32 gs[3] = {ih, iw, (ic + 3) / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "clip", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, + oh_off, ow_off, p.min, p.max, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "clip"); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, input, "clip_input")); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "clip")); + CHECK_STATUS(gcl_print_memory(handle, output, "clip_output")); +#endif + return SUCCESS; +} + +EE clip_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ClipParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + CHECK_STATUS(clip_checkpara_mali_fp16(inputDesc, outputDesc)); + if (input->mem != output->mem) { + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + } + CHECK_STATUS(clip_core_mali_fp16(handle, inputDesc, input, p, outputDesc, output)); + return SUCCESS; +} diff --git a/tensor_computing/src/gpu/mali/fp16/clip_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/clip_mali_fp16.h similarity index 81% rename from tensor_computing/src/gpu/mali/fp16/clip_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/clip_mali_fp16.h index ccec6caf..fecb9e65 100644 --- a/tensor_computing/src/gpu/mali/fp16/clip_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/clip_mali_fp16.h @@ -11,20 +11,17 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _CLIP_MALI_FP16 #define _CLIP_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" +#include "types.h" #include "tensor_computing_type.h" EE clip_mali_fp16(GCLHandle_t handle, - void* min_value, - void* max_value, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output); + TensorDesc inputDesc, + GCLMem_t input, + ClipParamSpec p, + TensorDesc outputDesc, + GCLMem_t output); #endif diff --git a/compute/tensor/src/gpu/mali/fp16/concat_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/concat_mali_fp16.cpp new file mode 100644 index 00000000..57a07e7d --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/concat_mali_fp16.cpp @@ -0,0 +1,248 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/concat_mali_fp16.h" + +inline EE concat_checkpara_mali_fp16(std::vector<TensorDesc> inputDesc, TensorDesc outputDesc) +{ + for (auto it : inputDesc) { + if (it.dt != outputDesc.dt) { + return NOT_SUPPORTED; + } + } + if (outputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE concat_core_mali_fp16(GCLHandle_t handle, + std::vector<TensorDesc> inputDesc, + std::vector<void *> input, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t tmpbuf, + I32 concatDim) +{ + U32 ow, oh, oc; + tensorSelectGet(outputDesc, NULL, NULL, NULL, &oc, &oh, &ow); + U32 ow_str, oh_str, oc_str, ow_off, oh_off; + get_gclmem_dim(output->desc, &ow_str, &oh_str, &oc_str, &ow_off, &oh_off); + U32 num = input.size(); + GCLMem_t inputMem[4]; + cl_mem inbuf[4]; + I32 dim = outputDesc.nDims; + concatDim = (concatDim + dim) % dim; + concatDim = dim - 1 - concatDim; + char kernelName[128]; + char dimName[128]; + U32 axis; + if (inputDesc[0].df == DF_NCHW) { + switch (concatDim) { + case 0: + strcpy(dimName, "w"); + axis = 1; + break; + case 1: + strcpy(dimName, "h"); + axis = 0; + break; + case 2: + strcpy(dimName, "c"); + axis = 2; + break; + default: + CHECK_STATUS(NOT_SUPPORTED); + } + } + if (inputDesc[0].df == DF_MKT) { + concatDim = 1 - concatDim; + } + if (inputDesc[0].df == DF_MKT || inputDesc[0].df == DF_MTK) { + switch (concatDim) { + case 0: + strcpy(dimName, "c"); + axis = 2; + break; + case 1: + strcpy(dimName, "h"); + axis = 0; + break; + default: + CHECK_STATUS(NOT_SUPPORTED); + } + } + bool concatDimCAlign = true; + if (axis == 2) { + for (auto p : inputDesc) { + U32 tc; + tensorSelectGet(p, NULL, NULL, NULL, &tc, NULL, NULL); + if (tc % 4 != 0) { + concatDimCAlign = false; + break; + } + } + } + U32 ic[4]; + U32 axis_len[4]; + U32 bn = (num + 3) / 4; + U32 en, nmax, axis_max; + U32 out_size = 0; + U32 ih_str[4]; + U32 iw_str[4]; + U32 ih_off[4]; + U32 iw_off[4]; + U32 oh_val = oh_str; + U32 ohw_val = oh_str * ow_str; + U32 oh_off_val = oh_off; + U32 ow_off_val = ow_off; + cl_mem outbuf = output->mem; + if (!concatDimCAlign) { + oh_val = oh; + ohw_val = oh * ow; + oh_off_val = 0; + ow_off_val = 0; + outbuf = tmpbuf->mem; + } + for (U32 i = 0; i < bn; i++) { + en = (i * 4 + 4 <= num) ?
4 : (num & 3); + axis_max = 0; + nmax = en - 1; + for (U32 j = 0; j < en; ++j) { + inputMem[j] = (GCLMem_t)input[i * 4 + j]; + inbuf[j] = inputMem[j]->mem; + get_gclmem_dim(inputMem[j]->desc, &iw_str[j], &ih_str[j], NULL, &iw_off[j], &ih_off[j]); + } + for (U32 j = 0; j < en; ++j) { + axis_len[j] = inputDesc[i * 4 + j].dims[concatDim]; + ic[j] = 0; + if (axis == 2) { + ic[j] = axis_len[j]; + axis_len[j] = (axis_len[j] + 3) / 4; + } + axis_max += axis_len[j]; + } + U32 gs[3] = {oh, ow, (oc + 3) / 4}; + gs[axis] = axis_max; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + axis_max -= axis_len[nmax]; + if (!concatDimCAlign) { + sprintf(kernelName, "concat_nonalign_c_p1_%d", en); + } else { + sprintf(kernelName, "concat_%s%d", dimName, en); + } + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + switch (en) { + case 1: + CHECK_STATUS(gcl_set_kernelArgs(kernel, oh_val, ohw_val, oh_off_val, ow_off_val, + axis_max, nmax, out_size, gs[0], gs[1], ih_str[0], iw_str[0], ih_off[0], + iw_off[0], ic[0], inbuf[0], outbuf)); + break; + case 2: + CHECK_STATUS(gcl_set_kernelArgs(kernel, oh_val, ohw_val, oh_off_val, ow_off_val, + axis_max, nmax, out_size, gs[0], gs[1], ih_str[0], iw_str[0], ih_off[0], + iw_off[0], ic[0], inbuf[0], ih_str[1], iw_str[1], ih_off[1], iw_off[1], ic[1], + axis_len[0], inbuf[1], outbuf)); + break; + case 3: + CHECK_STATUS(gcl_set_kernelArgs(kernel, oh_val, ohw_val, oh_off_val, ow_off_val, + axis_max, nmax, out_size, gs[0], gs[1], ih_str[0], iw_str[0], ih_off[0], + iw_off[0], ic[0], inbuf[0], ih_str[1], iw_str[1], ih_off[1], iw_off[1], ic[1], + axis_len[0], inbuf[1], ih_str[2], iw_str[2], ih_off[2], iw_off[2], ic[2], + axis_len[1], inbuf[2], outbuf)); + break; + case 4: + CHECK_STATUS(gcl_set_kernelArgs(kernel, oh_val, ohw_val, oh_off_val, ow_off_val, + axis_max, nmax, out_size, gs[0], gs[1], ih_str[0], iw_str[0], ih_off[0], + iw_off[0], ic[0], inbuf[0], ih_str[1], iw_str[1], ih_off[1], iw_off[1], ic[1], + axis_len[0], inbuf[1], ih_str[2], iw_str[2], ih_off[2], iw_off[2], ic[2], + axis_len[1], inbuf[2], ih_str[3], iw_str[3], ih_off[3], iw_off[3], ic[3], + axis_len[2], inbuf[3], outbuf)); + break; + default: + return NOT_SUPPORTED; + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); + if (!concatDimCAlign) { + out_size += oh * ow * (ic[0] + ic[1] + ic[2] + ic[3]); + } else { + if (axis == 0) { + out_size += gs[0] * 4; + } + if (axis == 1) { + out_size += oh_str * gs[1] * 4; + } + if (axis == 2) { + out_size += oh_str * ow_str * gs[2] * 4; + } + } +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif + } + if (!concatDimCAlign) { + U32 gs[3] = {(oh + 3) / 4, ow, (oc + 3) / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "mem_trans_nchw_to_ncwhc4_input_tran", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ow, oh, 0, 0, ow_str, oh_str, ow_off, oh_off, ow, + oh, oc, ow, oh, oc, 0, 0, tmpbuf->mem, output->mem)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_ncwhc4_input_tran"); +#ifdef _DEBUG + CHECK_STATUS( + gcl_run_kernel(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_ncwhc4_input_tran")); +#endif + } + return SUCCESS; +} + +EE concat_infer_forward_tmp_bytes_mali_fp16(std::vector<TensorDesc> inputDesc, U32 *bytes) +{ + *bytes = 0; + bool concatDimCAlign = true; + for (auto p : inputDesc) { + U32 tc; + tensorSelectGet(p, NULL, NULL, NULL, &tc, NULL, NULL); + if (tc % 4 != 0) { + concatDimCAlign = false; + break; + } + } + if
(!concatDimCAlign) { + for (auto p : inputDesc) { + *bytes += tensorNumBytes(p); + } + } + return SUCCESS; +} + +EE concat_mali_fp16(GCLHandle_t handle, + std::vector<TensorDesc> inputDesc, + std::vector<void *> input, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t tmpbuf, + I32 concatDim) +{ + CHECK_STATUS(concat_checkpara_mali_fp16(inputDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS( + concat_core_mali_fp16(handle, inputDesc, input, outputDesc, output, tmpbuf, concatDim)); + return SUCCESS; +} diff --git a/tensor_computing/src/gpu/mali/fp16/concat_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/concat_mali_fp16.h similarity index 77% rename from tensor_computing/src/gpu/mali/fp16/concat_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/concat_mali_fp16.h index baa13080..7b5e9757 100644 --- a/tensor_computing/src/gpu/mali/fp16/concat_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/concat_mali_fp16.h @@ -11,19 +11,20 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _CONCAT_MALI_FP16 #define _CONCAT_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" +#include "types.h" #include "tensor_computing_type.h" -EE concat_mali_fp16(GCLHandle_t handle, - std::vector<TensorDesc> inputDesc, - std::vector<void *> input, - TensorDesc outputDesc, - GCLMem_t output, - U32 concatDim); +EE concat_infer_forward_tmp_bytes_mali_fp16(std::vector<TensorDesc> inputDesc, U32 *bytes); + +EE concat_mali_fp16(GCLHandle_t handle, + std::vector<TensorDesc> inputDesc, + std::vector<void *> input, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t tmpbuf, + I32 concatDim); #endif diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.cpp new file mode 100644 index 00000000..3f7588c6 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.cpp @@ -0,0 +1,453 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
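[Editor's note] The concat implementation above consumes at most four inputs per kernel launch: bn = (num + 3) / 4 launches in total, with the final group holding the remainder. That batching arithmetic in isolation (host-side sketch only, assuming num > 0):

#include <cstdint>

// Number of concat kernel launches for 'num' inputs, four per launch;
// *lastGroupSize receives the size of the final (possibly partial) group.
static uint32_t concat_launches(uint32_t num, uint32_t *lastGroupSize)
{
    uint32_t bn = (num + 3) / 4;
    *lastGroupSize = (num % 4 == 0) ? 4 : (num % 4);
    return bn;
}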
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/convolution_mali_fp16.h" +#include "gpu/mali/fp16/convolution_direct_mali_fp16.h" + +inline EE direct_core_nchw_to_ncwhc4_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(tmpBuf); + cl_mem inbuf, biasmem, outbuf, fltbuf; + inbuf = input->mem; + fltbuf = filter->mem; + biasmem = bias->mem; + outbuf = output->mem; + DataFormat df; + U32 iw, ih, it; + U32 fw, fh, fn, ft, sw, sh, st, pw, ph, pt; + U32 ow, oh, oc, on, ot; + sw = convParamSpec.stride_w; + sh = convParamSpec.stride_h; + st = convParamSpec.stride_t; + ph = convParamSpec.padding_top; + pw = convParamSpec.padding_left; + pt = convParamSpec.padding_before; + + tensorSelectGet(inputDesc, NULL, &df, NULL, NULL, &ih, &iw, &it); + tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, &fh, &fw, &ft); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow, &ot); + U32 iw_str, ih_str, iwh_str, ic_str, iw_off, ih_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + iw_off -= pw; + ih_off -= ph; + iwh_str = iw_str * ih_str; + ic_str = ic_str / it; + + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + + U32 item_w = forwardRunInfo->best_w[0]; + char kernelname[128]; + char modeName[16]; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim; + Kernel kernel; + switch (activationMode) { + case ACTIVATION_RELU: + strcpy(modeName, "relu_"); + break; + case ACTIVATION_RELU6: + strcpy(modeName, "relu6_"); + break; + case ACTIVATION_NULL: + strcpy(modeName, ""); + break; + default: + return NOT_SUPPORTED; + } + + gs[0] = (ow + item_w - 1) / item_w; + gs[1] = oh; + gs[2] = (oc + 3) / 4 * on * ot; + dim = 3; + if (df == DF_NCTHW) { + sprintf(kernelname, "conv_direct_3d_sw%d_nchw_to_ncwhc4_%s%d%d%d", sw, modeName, fw, ft, + item_w); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, iwh_str, ic_str, iw_off, ih_off, oh_str, + ow_str, oh_off, ow_off, ow, ot, it, pt, sh, st, gs[0], gs[1], inbuf, fltbuf, biasmem, + outbuf)); + } else { + sprintf(kernelname, "conv_direct_s%d_nchw_to_ncwhc4_%s%d%d", sw, modeName, fw, item_w); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, iwh_str, ic_str, iw_off, ih_off, oh_str, + ow_str, oh_off, ow_off, ow, gs[0], gs[1], inbuf, fltbuf, biasmem, outbuf)); + } + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, iwh_str, ic_str, iw_off, ih_off, oh_str, ow_str, + oh_off, ow_off, ow, gs[0], gs[1], inbuf, fltbuf, biasmem, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + +inline EE direct_core_fn_spe(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t 
bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(tmpBuf); + cl_mem inbuf, biasmem, outbuf, fltbuf; + inbuf = input->mem; + fltbuf = filter->mem; + biasmem = bias->mem; + outbuf = output->mem; + U32 iw, ih; + U32 fw, fh, fn, sw, sh, pw, ph; + U32 ow, oh, oc, on; + sw = convParamSpec.stride_w; + sh = convParamSpec.stride_h; + ph = convParamSpec.padding_top; + pw = convParamSpec.padding_left; + tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); + tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, &fh, &fw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 iw_str, ih_str, ihw_str, ic_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off, ohw_str; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + iw_off -= pw; + ih_off -= ph; + ihw_str = iw_str * ih_str; + ohw_str = ow_str * oh_str; + + U32 item_w = forwardRunInfo->best_w[0]; + char kernelname[128]; + char modeName[16]; + char outFormat[16] = ""; + if (output->desc.memFormat == DF_NCHW) { + strcpy(outFormat, "nchw_"); + } + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim; + Kernel kernel; + switch (activationMode) { + case ACTIVATION_RELU: + strcpy(modeName, "relu_"); + break; + case ACTIVATION_RELU6: + strcpy(modeName, "relu6_"); + break; + case ACTIVATION_NULL: + strcpy(modeName, ""); + break; + default: + return NOT_SUPPORTED; + } + sprintf(kernelname, "conv_direct_s%d_fn_spe_%s%s%d%d", sw, modeName, outFormat, fw, item_w); + gs[0] = oh; + gs[1] = (ow + item_w - 1) / item_w; + dim = 2; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, oh_str, ow_str, + ohw_str, oh_off, ow_off, ow, sh, gs[0], gs[1], inbuf, fltbuf, biasmem, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} +inline EE direct_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(tmpBuf); + cl_mem inbuf, biasmem, outbuf, fltbuf; + inbuf = input->mem; + fltbuf = filter->mem; + biasmem = bias->mem; + outbuf = output->mem; + U32 iw, ih; + U32 fw, fh, fn, sw, sh, pw, ph; + U32 ow, oh, oc, on; + sw = convParamSpec.stride_w; + sh = convParamSpec.stride_h; + ph = convParamSpec.padding_top; + pw = convParamSpec.padding_left; + tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); + tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, &fh, &fw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 iw_str, ih_str, ihw_str, ic_str, iw_off, ih_off; + U32 ow_str, oh_str, ohw_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + ih_off -= ph; + iw_off -= pw; + ihw_str = ih_str * iw_str; + ohw_str = oh_str * ow_str; + + U32 item_w = forwardRunInfo->best_w[0]; + U32 
item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + char kernelname[128]; + char modeName[16]; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim; + Kernel kernel; + switch (activationMode) { + case ACTIVATION_RELU: + strcpy(modeName, "relu_"); + break; + case ACTIVATION_RELU6: + strcpy(modeName, "relu6_"); + break; + case ACTIVATION_NULL: + strcpy(modeName, ""); + break; + default: + return NOT_SUPPORTED; + } + if (item_k == 0) { + if ((ih_str > 1 || iw_str > 1) && (item_c != 4)) { + CHECK_STATUS(NOT_SUPPORTED); + } + sprintf(kernelname, "conv_direct_spe_fwhs1_%s%d", modeName, item_c); + ic_str = filter->desc.stride[1]; + ow = fn; + gs[0] = fn; + gs[1] = 1; + gs[2] = 1; + dim = 1; + } else if ((item_w >> 8) > 0) { + U32 item_h = item_w >> 8; + item_k = item_k >> 2; + sprintf(kernelname, "conv_direct_s%d_h_%s%d%d%d", sw, modeName, fw, item_h, item_k); + gs[0] = (oh + item_h - 1) / item_h; + gs[1] = ow; + gs[2] = (oc + 3) / 4 * on / item_k; + dim = 3; + } else { + item_k = item_k >> 2; + sprintf(kernelname, "conv_direct_s%d_%s%d%d%d", sw, modeName, fw, item_w, item_k); + if (fw != fh) { + sprintf( + kernelname, "conv_direct_wh_s%d_%s%d%d%d%d", sw, modeName, fw, fh, item_w, item_k); + } + gs[0] = oh; + gs[1] = (ow + item_w - 1) / item_w; + gs[2] = (oc + 3) / 4 * on / item_k; + dim = 3; + } + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + if (item_k == 0 || fw != fh) { + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, oh_str, + ohw_str, oh_off, ow_off, ow, gs[0], gs[1], inbuf, fltbuf, biasmem, outbuf)); + } else { + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, oh_str, + ohw_str, oh_off, ow_off, ow, sh, gs[0], gs[1], inbuf, fltbuf, biasmem, outbuf)); + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + +EE convolution_direct_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + U32 s0 = 0; + U32 s1 = 0; + U32 s2 = 0; + U32 num = 0; + U32 byteSize; + if (item_k == 0) { + s0 = fn; + s1 = (fc + item_c - 1) / item_c; + s2 = 1; + DataFormat df = DF_CHWNC4; + if (item_c == 8) { + df = DF_CHWNC8; + } + if (item_c == 16) { + df = DF_CHWNC16; + } + gclmemFilterDesc->memFormat = df; + num = s0 * s1 * s2 * item_c; + } else if (item_c == 4) { + U32 item_kd4 = (item_k == 1) ? 
1 : (item_k >> 2); + s0 = fw * fh * item_kd4; + s1 = (fc + item_c - 1) / item_c; + s2 = (fn + item_k - 1) / item_k; + gclmemFilterDesc->memFormat = DF_NCHWN4C4; + num = s0 * s1 * s2 * item_c * item_k / item_kd4; + } else if (item_c == 1) { + s0 = fw * fh; + s1 = fc; + s2 = (fn + item_k - 1) / item_k; + gclmemFilterDesc->memFormat = DF_NCHWN4; + num = s0 * s1 * s2 * item_k; + } + byteSize = num * bytesOf(DT_F16); + gclmemFilterDesc->stride[0] = s0; + gclmemFilterDesc->stride[1] = s1; + gclmemFilterDesc->stride[2] = s2; + gclmemFilterDesc->offset[0] = 0; + gclmemFilterDesc->offset[1] = 0; + gclmemFilterDesc->offset[2] = 0; + gclmemFilterDesc->num = num; + gclmemFilterDesc->byteSize = byteSize; + gclmemFilterDesc->memType = GCL_MEM_BUF; + gclmemFilterDesc->flags = CL_MEM_READ_WRITE; + gclmemFilterDesc->host_ptr = NULL; + *bytes = 0; + return SUCCESS; +} + +EE convolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + DataType fdt; + DataFormat fdf; + U32 fw, fh, fc, fn, ft; + tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw, &ft); + U32 fwh = fw * fh * ft; + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + U32 nk = item_k; + if (item_k == 0) { + item_k = fn; + } + char kernelname[128]; + Kernel kernel; + sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d", item_c, nk); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn, filter->mem, fltmem->mem)); + U32 gs[3] = {fwh, (fc + item_c - 1) / item_c, (fn + item_k - 1) / item_k * item_k}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + *fltmemDesc = tensor4df(fdt, fdf, fn, fc, fh, fw); + return SUCCESS; +} + +EE convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + UNUSED(inputDesc); + UNUSED(filterDesc); + UNUSED(outputDesc); + UNUSED(convParamSpec); + UNUSED(forwardRunInfo); + *bytes = 0; + return SUCCESS; +} + +EE convolution_direct_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + U32 fw, fh, fn, ih, iw, sw; + tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, &fh, &fw); + tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); + sw = convParamSpec.stride_w; + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + if ((fw == 1 && fh == 1 && ih == 1 && iw == 1) || input->desc.memFormat == DF_NCWHC4) { + if (fn == 1 && sw == 1 && (fw == fh) && (fw == 1 || fw == 3 || fw == 5 || fw == 7)) { + CHECK_STATUS(direct_core_fn_spe(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode)); + } else { + CHECK_STATUS(direct_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode)); + } + } else if (input->desc.memFormat == DF_NCHW) {
CHECK_STATUS(direct_core_nchw_to_ncwhc4_mali_fp16(handle, inputDesc, input, filterDesc, + filter, convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, + output, activationMode)); + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.h new file mode 100644 index 00000000..067222d4 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.h @@ -0,0 +1,55 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_CONVOLUTION_DIRECT_MALI_FP16 +#define _H_CONVOLUTION_DIRECT_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE convolution_direct_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE convolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE convolution_direct_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); + +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.cpp new file mode 100644 index 00000000..595f3b67 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.cpp @@ -0,0 +1,188 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/convolution_mali_fp16.h" +#include "gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.h" + +inline EE direct_spe_ck_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(inputDesc); + UNUSED(forwardRunInfo); + UNUSED(biasDesc); + UNUSED(bias); + UNUSED(tmpBytes); + UNUSED(tmpBuf); + UNUSED(activationMode); + + cl_mem inbuf, outbuf, fltbuf; + inbuf = input->mem; + fltbuf = filter->mem; + outbuf = output->mem; + U32 fn, fc, fw, sw; + U32 ow, oh, oc, on; + sw = convParamSpec.stride_w; + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, NULL, &fw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + Kernel kernel; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim; + char kernelname[128]; + + if (fn == 1 && fc == 4 && fw == 1) { // fc = orgfc + fn + U32 iw_str, ih_str; + iw_str = input->desc.stride[0]; + ih_str = input->desc.stride[1]; + U32 ow_str, oh_str, ow_off, oh_off; + ow_str = output->desc.stride[0]; + oh_str = output->desc.stride[1]; + ow_off = output->desc.offset[0]; + oh_off = output->desc.offset[1]; + if (output->desc.memFormat != DF_NCHW) { + return NOT_SUPPORTED; + } + U32 item_w = 2; + U32 item_h = 1; + U32 ew = ow % item_w; + gs[0] = (ow + item_w - 1) / item_w; + gs[1] = (oh + item_h - 1) / item_h; + dim = 2; + sprintf(kernelname, "conv_direct_s%d_spe_f1c3k1_%d", sw, ew); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ow_str, ow_off, oh_off, ow >> 1, gs[0], + gs[1], inbuf, fltbuf, outbuf)); // c = 3 k = 1, bias val has been set in fltbuf + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); + } else { + return NOT_SUPPORTED; + } + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, input, "conv_direct_spe_ck_input")); + CHECK_STATUS(gcl_print_memory(handle, filter, "conv_direct_spe_ck_filter")); + CHECK_STATUS(gcl_print_memory(handle, output, "conv_direct_spe_ck_output")); +#endif + return SUCCESS; +} + +EE convolution_direct_spe_ck_transform_filter_bytes_mali_fp16(TensorDesc 
filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + UNUSED(forwardRunInfo); + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + U32 s0, s1, s2; + U32 num, byteSize; + if (fn == 1 && fc == 3 && fw == 1) { + s0 = fw * fh; + s1 = fc + fn; // set bias val in flt + s2 = fn; + gclmemFilterDesc->memFormat = DF_NCHW; + } else { + return NOT_SUPPORTED; + } + num = s0 * s1 * s2; + byteSize = num * bytesOf(DT_F16); + gclmemFilterDesc->stride[0] = s0; + gclmemFilterDesc->stride[1] = s1; + gclmemFilterDesc->stride[2] = s2; + gclmemFilterDesc->offset[0] = 0; + gclmemFilterDesc->offset[1] = 0; + gclmemFilterDesc->offset[2] = 0; + gclmemFilterDesc->num = num; + gclmemFilterDesc->byteSize = byteSize; + gclmemFilterDesc->memType = GCL_MEM_BUF; + gclmemFilterDesc->flags = CL_MEM_READ_ONLY; + gclmemFilterDesc->host_ptr = NULL; + *bytes = 0; + return SUCCESS; +} + +EE convolution_direct_spe_ck_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + UNUSED(forwardRunInfo); + DataType fdt; + DataFormat fdf; + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); + *fltmemDesc = tensor4df(fdt, fdf, fn, fc + fn, fh, fw); // set bias val in flt + U32 size = tensorNumBytes(*fltmemDesc); + CHECK_STATUS(gcl_trans_memory(handle, filter, fltmem, &size, DEVICE_BUF_TO_BUF, CL_FALSE)); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, filter, "conv_direct_spe_ck_filter_org")); + CHECK_STATUS(gcl_print_memory(handle, fltmem, "conv_direct_spe_ck_filter_tran")); +#endif + return SUCCESS; +} + +EE convolution_direct_spe_ck_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + UNUSED(inputDesc); + UNUSED(filterDesc); + UNUSED(outputDesc); + UNUSED(convParamSpec); + UNUSED(forwardRunInfo); + *bytes = 0; + return SUCCESS; +} + +EE convolution_direct_spe_ck_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS( + direct_spe_ck_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, convParamSpec, + forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, activationMode)); + + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.h new file mode 100644 index 00000000..877c39cb --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.h @@ -0,0 +1,55 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_CONVOLUTION_DIRECT_SPE_CK_MALI_FP16 +#define _H_CONVOLUTION_DIRECT_SPE_CK_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE convolution_direct_spe_ck_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE convolution_direct_spe_ck_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE convolution_direct_spe_ck_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE convolution_direct_spe_ck_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); + +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.cpp new file mode 100644 index 00000000..f37147ea --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.cpp @@ -0,0 +1,201 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" + +#include "gpu/mali/fp16/convolution_mali_fp16.h" +#include "gpu/mali/fp16/convolution_direct_mali_fp16.h" +#include "gpu/mali/fp16/convolution_wino_mali_fp16.h" +#include "gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.h" + +inline EE convolution_checkpara_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + const GCLMem_t bias, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (nullptr == handle || nullptr == input || nullptr == filter || nullptr == output || + nullptr == bias) { + return NULL_POINTER; + } + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != filterDesc.dt || inputDesc.dt != DT_F16) { + return NOT_MATCH; + } + + U32 ic, fc, fn, fh, fw, oc; + CHECK_STATUS(tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, NULL, NULL)); + CHECK_STATUS(tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensorSelectGet(outputDesc, NULL, NULL, NULL, &oc, NULL, NULL)); + + if (input->desc.memFormat == DF_NCWHC4) { + if (output->desc.memFormat == DF_NCHW) { + if (fn != 1) { + return NOT_SUPPORTED; + } + } else if (output->desc.memFormat != DF_NCWHC4) { + return NOT_MATCH; + } + } + if (fn != oc) { + return NOT_MATCH; + } + if (ic != fc) { + return NOT_MATCH; + } + return SUCCESS; +} + +EE convolution_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = convolution_direct_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); + break; + case CONVOLUTION_ALGORITHM_DIRECT_SPE_CK: + ret = convolution_direct_spe_ck_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = NOT_SUPPORTED; + break; + case CONVOLUTION_ALGORITHM_WINOGRAD: + ret = convolution_wino_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE convolution_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem, + GCLMem_t tmp) +{ + EE ret = SUCCESS; + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = convolution_direct_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); + break; + case CONVOLUTION_ALGORITHM_DIRECT_SPE_CK: + ret = convolution_direct_spe_ck_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = NOT_SUPPORTED; + break; + case CONVOLUTION_ALGORITHM_WINOGRAD: + ret = convolution_wino_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem, tmp); + break; + default: + ret = NOT_SUPPORTED; + 
break; + } + return ret; +} + +EE convolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + EE ret = SUCCESS; + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = convolution_direct_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + case CONVOLUTION_ALGORITHM_DIRECT_SPE_CK: + ret = convolution_direct_spe_ck_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = NOT_SUPPORTED; + break; + case CONVOLUTION_ALGORITHM_WINOGRAD: + ret = convolution_wino_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE convolution_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + CHECK_STATUS(convolution_checkpara_mali_fp16( + handle, inputDesc, input, filterDesc, filter, bias, outputDesc, output)); + EE ret = SUCCESS; + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = convolution_direct_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode); + break; + case CONVOLUTION_ALGORITHM_DIRECT_SPE_CK: + ret = convolution_direct_spe_ck_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = NOT_SUPPORTED; + break; + case CONVOLUTION_ALGORITHM_WINOGRAD: + ret = convolution_wino_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.h new file mode 100644 index 00000000..af361da8 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.h @@ -0,0 +1,55 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _CONVOLUTION_MALI_FP16 +#define _CONVOLUTION_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE convolution_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE convolution_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem, + GCLMem_t tmp); + +EE convolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE convolution_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.cpp new file mode 100644 index 00000000..5dba19ef --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.cpp @@ -0,0 +1,420 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/convolution_mali_fp16.h" +#include "gpu/mali/fp16/convolution_wino_mali_fp16.h" + +#define calPicTranRDesc( \ + wino_h, wino_w, wino_num, ic, fh, ph, dt, prh, prw, prc, prn, prh_off, prw_off, prhwc, prsize) \ + { \ + U32 ext_h = (fh / 2 < ph) ? 
ph : fh / 2; \ + prh = wino_h * 4 + 2 * ext_h; \ + prw = ((wino_w + 3) / 4 * 4); \ + prc = ic; \ + prn = wino_num; \ + prhwc = prh * prw * prc; \ + prsize = prhwc * prn * bytesOf(dt); \ + prh_off = ph; \ + prw_off = 0; \ + } + +#define calPtrTranRLDesc( \ + wino_h, wino_w, wino_num, ic, item_n, dt, prlh, prlw, prlc, prln, prlhw, prlhwc, prlsize) \ + { \ + prlh = wino_h; \ + prlw = wino_w; \ + prlc = ic; \ + prln = wino_num * wino_num; \ + prlhw = (wino_h * wino_w + item_n - 1) / item_n * item_n; \ + prlhwc = prlhw * ic; \ + prlsize = prlhwc * prln * bytesOf(dt); \ + } + +#define calGemmOutDesc(wino_num, fn, phw, ic, item_m, dt, M, N, C, MC, NC, MN, gSize) \ + { \ + M = (fn + item_m - 1) / item_m * item_m; \ + N = prlhw_str; \ + C = ic; \ + MC = M * C; \ + NC = N * C; \ + MN = M * N; \ + gSize = MN * wino_num * wino_num * bytesOf(dt); \ + } +inline EE wino_trans_pic(GCLHandle_t handle, + U32 ih_str, + U32 iw_str, + U32 ih_off, + U32 iw_off, + U32 ic_str, + U32 prh_str, + U32 prw_str, + U32 prc_str, + U32 prhwc_str, + U32 prh_off, + U32 prw_off, + U32 prlh_str, + U32 prlw_str, + U32 prlc_str, + U32 prlhw_str, + U32 prlhwc_str, + Mem pic, + Mem picTranR, + Mem picTranRL) + +{ + UNUSED(prw_str); + UNUSED(prw_off); + Kernel kernel; + char kernelname[128]; + U32 ih_str4 = ih_str * 4; + U32 ih_off4 = ih_off * 4; + U32 prh_off4 = prh_off * 4; + U32 gs[3] = {prh_str * 4, (prw_str / 4 + 3) / 4 * 4, ic_str}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + sprintf(kernelname, "conv_wino_trans_picbuf_right"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str4, iw_str, ih_off4, iw_off, prh_str, prw_str, + prhwc_str, prh_off4, gs[0], gs[1], pic, picTranR)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + U32 item_h = 1; + if (prlh_str % 2 == 0) { + item_h = 2; + } + if (prlh_str % 3 == 0) { + item_h = 3; + } + if (prlh_str % 4 == 0) { + item_h = 4; + } + gs[0] = (prlh_str / item_h + 3) / 4 * 4; + gs[1] = prlw_str; + gs[2] = prlc_str * 6; + sprintf(kernelname, "conv_wino_trans_picbuf_left_%d", item_h); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, prh_str, prw_str, prc_str, prlh_str, prlw_str, + prlhw_str, prlhwc_str, gs[0], gs[1], picTranR, picTranRL)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + +inline EE wino_gemm(GCLHandle_t handle, + U32 M, + U32 N, + U32 C, + U32 item_m, + U32 item_n, + U32 flttran_str, + U32 pictran_str, + U32 out_str, + U32 wino_num, + Mem flttran, + Mem pictran, + Mem out) +{ + Kernel kernel; + wino_num = wino_num * wino_num; + char kernelname[128]; + sprintf(kernelname, "conv_wino_gemm%d_tn_%d%d", wino_num, item_m, item_n); + U32 gs[2] = {(N + item_n - 1) / item_n, (M + item_m - 1) / item_m}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + for (U32 i = 0; i < wino_num; i++) { + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, C, i * flttran_str, i * pictran_str, + i * out_str, gs[0], gs[1], flttran, pictran, out)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += 
handle->t_execute; +#endif + } + return SUCCESS; +} + +inline EE wino_trans_out(GCLHandle_t handle, + U32 wino_h, + U32 wino_w, + U32 pw_str, + U32 pwh_str, + U32 oh_str, + U32 ow_str, + U32 oh_off, + U32 ow_off, + U32 oh, + U32 ow, + U32 oc, + ActivationMode activationMode, + Mem bias, + Mem gemm_out, + Mem output) +{ + Kernel kernel; + char kernelname[128]; + char modeName[16]; + switch (activationMode) { + case ACTIVATION_RELU: + strcpy(modeName, "_relu"); + break; + case ACTIVATION_NULL: + strcpy(modeName, ""); + break; + default: + return NOT_SUPPORTED; + } + sprintf(kernelname, "conv_wino_trans_outbuf%s", modeName); + if ((oh & 3) == 0 && (ow & 3) == 0) { + sprintf(kernelname, "conv_wino_trans_outbuf%s_align", modeName); + } + U32 gs[3] = {(wino_h + 3) / 4 * 4, (wino_w + 3) / 4 * 4, oc / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, wino_h, wino_w, pw_str, pwh_str, oh_str, ow_str, oh_off, + ow_off, oh, ow, bias, gemm_out, output)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + +EE convolution_wino_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + U32 item_k = forwardRunInfo->best_k[0]; + U32 fw, fh, fc, fn; + U32 winoTransNum = 36; + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + U32 s0 = (fn + item_k - 1) / item_k * item_k; + U32 s1 = fc; + U32 s2 = winoTransNum; + U32 num = s0 * s1 * s2; + U32 byteSize = num * bytesOf(DT_F16); + gclmemFilterDesc->stride[0] = s0; + gclmemFilterDesc->stride[1] = s1; + gclmemFilterDesc->stride[2] = s2; + gclmemFilterDesc->offset[0] = 0; + gclmemFilterDesc->offset[1] = 0; + gclmemFilterDesc->offset[2] = 0; + gclmemFilterDesc->num = num; + gclmemFilterDesc->byteSize = byteSize; + gclmemFilterDesc->memType = GCL_MEM_BUF; + gclmemFilterDesc->flags = CL_MEM_READ_WRITE; + gclmemFilterDesc->memFormat = DF_HWCN; + gclmemFilterDesc->host_ptr = NULL; + *bytes = fn * fc * fh * fw * bytesOf(DT_F16); + return SUCCESS; +} + +EE convolution_wino_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem, + GCLMem_t tmp) +{ + DataType fdt; + DataFormat fdf; + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); + U32 item_k = forwardRunInfo->best_k[0]; + U32 fn_align = (fn + item_k - 1) / item_k * item_k; + U32 fwhc = fw * fh * fc; + U32 fnc = fn_align * fc; + + char kernelname[128]; + Kernel kernel; + sprintf(kernelname, "conv_wino_rotate_fltbuf_%d", fw); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, fwhc, fnc, fn, filter->mem, tmp->mem)); + U32 gs[2] = {fwhc, fn_align}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, filter, "conv_wino_filter_org")); + CHECK_STATUS( + gcl_print_buffer(handle, tmp->mem, fn_align * fc * fw * fh, "conv_wino_filter_tmp")); +#endif + sprintf(kernelname, "conv_wino_trans_fltbuf_3x3"); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, fn_align, 
fc, fnc, fnc, tmp->mem, fltmem->mem)); + gs[0] = fn_align; + gs[1] = fc; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, fltmem, "conv_wino_filter_tran")); +#endif + *fltmemDesc = tensor4df(fdt, fdf, fn, fc, fh, fw); + return SUCCESS; +} + +EE convolution_wino_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + UNUSED(inputDesc); + DataType fdt; + DataFormat fdf; + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); + U32 item_k = forwardRunInfo->best_k[0]; + U32 fn_align = (fn + item_k - 1) / item_k * item_k; + U32 tempBufNum = fn_align * fc * fw * fh; + U32 fltTempBufSize = tempBufNum * bytesOf(fdt); + + DataType odt; + U32 ow, oh, oc, on; + tensorSelectGet(outputDesc, &odt, NULL, &on, &oc, &oh, &ow); + U32 ph = convParamSpec.padding_top; + U32 wino_num = 6; + U32 wino_h = (oh + 3) / 4; + U32 wino_w = (ow + 3) / 4; + U32 prh_str, prw_str, prc_str, prn_str, prh_off, prw_off, prhwc_str, prSize; + calPicTranRDesc(wino_h, wino_w, wino_num, fc, fh, ph, odt, prh_str, prw_str, prc_str, prn_str, + prh_off, prw_off, prhwc_str, prSize); + + U32 item_n = forwardRunInfo->best_w[0]; + U32 item_m = forwardRunInfo->best_k[0]; + U32 prlh_str, prlw_str, prlc_str, prln_str, prlhw_str, prlhwc_str, prlSize; + calPtrTranRLDesc(wino_h, wino_w, wino_num, fc, item_n, odt, prlh_str, prlw_str, prlc_str, + prln_str, prlhw_str, prlhwc_str, prlSize); + + U32 M, N, C, MC, NC, MN, gemmOutSize; + calGemmOutDesc(wino_num, fn, prlhw_str, fc, item_m, odt, M, N, C, MC, NC, MN, gemmOutSize); + + U32 tempBufSize = (prSize + 1023) / 1024 * 1024; + tempBufSize += (prlSize + 1023) / 1024 * 1024; + tempBufSize += gemmOutSize; + if (tempBufSize < fltTempBufSize) { + tempBufSize = fltTempBufSize; + } + *bytes = tempBufSize; + return SUCCESS; +} + +EE convolution_wino_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + U32 wino_num = 6; + DataType idt; + U32 iw, ih, ic; + U32 fw, fh, fc, fn, pw, ph; + U32 ow, oh, oc, on; + ph = convParamSpec.padding_top; + pw = convParamSpec.padding_left; + tensorSelectGet(inputDesc, &idt, NULL, NULL, &ic, &ih, &iw); + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 iw_str, ih_str, ic_str, iw_off, ih_off; + ih_str = input->desc.stride[0]; + iw_str = input->desc.stride[1]; + ic_str = input->desc.stride[2]; + ih_off = input->desc.offset[0]; // the input has no padding along the h axis + iw_off = input->desc.offset[1] - pw; + + U32 ow_str, oh_str, ow_off, oh_off; + oh_str = output->desc.stride[0]; + ow_str = output->desc.stride[1]; + oh_off = output->desc.offset[0]; + ow_off = output->desc.offset[1]; + + Mem pic = input->mem; + Mem picTranR, picTranRL, gemmOut; + U32 wino_h = (oh + 3) / 4; + U32 wino_w = (ow + 3) / 4; + U32 offset = 0; + U32 prh_str, prw_str, prc_str, prn_str, prh_off, 
prw_off, prhwc_str, prSize; + calPicTranRDesc(wino_h, wino_w, wino_num, ic, fh, ph, idt, prh_str, prw_str, prc_str, prn_str, + prh_off, prw_off, prhwc_str, prSize); + CHECK_STATUS(gcl_create_sub_buffer(prSize, &offset, tmpBuf, &picTranR)); + + U32 item_n = forwardRunInfo->best_w[0]; + U32 item_m = forwardRunInfo->best_k[0]; + U32 prlh_str, prlw_str, prlc_str, prln_str, prlhw_str, prlhwc_str, prlSize; + calPtrTranRLDesc(wino_h, wino_w, wino_num, ic, item_n, idt, prlh_str, prlw_str, prlc_str, + prln_str, prlhw_str, prlhwc_str, prlSize); + CHECK_STATUS(gcl_create_sub_buffer(prlSize, &offset, tmpBuf, &picTranRL)); + + U32 M, N, C, MC, NC, MN, gemmOutSize; + calGemmOutDesc(wino_num, fn, prlhw_str, ic, item_m, idt, M, N, C, MC, NC, MN, gemmOutSize); + CHECK_STATUS(gcl_create_sub_buffer(gemmOutSize, &offset, tmpBuf, &gemmOut)); + + CHECK_STATUS(wino_trans_pic(handle, ih_str, iw_str, ih_off, iw_off, ic_str, prh_str, prw_str, + prc_str, prhwc_str, prh_off, prw_off, prlh_str, prlw_str, prlc_str, prlhw_str, prlhwc_str, + pic, picTranR, picTranRL)); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, input, "conv_wino_input")); + CHECK_STATUS( + gcl_print_buffer(handle, picTranR, prSize / bytesOf(idt), "conv_wino_pictran_right")); + CHECK_STATUS( + gcl_print_buffer(handle, picTranRL, prlSize / bytesOf(idt), "conv_wino_pictran_left")); +#endif + + Mem fltTran = filter->mem; + CHECK_STATUS(wino_gemm( + handle, M, N, C, item_m, item_n, MC, NC, MN, wino_num, fltTran, picTranRL, gemmOut)); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, filter, "conv_wino_flttran")); + CHECK_STATUS( + gcl_print_buffer(handle, gemmOut, gemmOutSize / bytesOf(idt), "conv_wino_gemm_out")); +#endif + + Mem biasbuf = bias->mem; + Mem outbuf = output->mem; + CHECK_STATUS(wino_trans_out(handle, wino_h, wino_w, N, MN, oh_str, ow_str, oh_off, ow_off, oh, + ow, oc, activationMode, biasbuf, gemmOut, outbuf)); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, output, "conv_wino_output")); +#endif + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.h new file mode 100644 index 00000000..918985c7 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.h @@ -0,0 +1,55 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef _H_CONVOLUTION_WINO_MALI_FP16 +#define _H_CONVOLUTION_WINO_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE convolution_wino_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE convolution_wino_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem, + GCLMem_t tmp); + +EE convolution_wino_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE convolution_wino_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/deconvolution_direct_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/deconvolution_direct_mali_fp16.cpp new file mode 100644 index 00000000..f7043def --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/deconvolution_direct_mali_fp16.cpp @@ -0,0 +1,246 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
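+ +// Direct deconvolution path: the filter is pre-packed into 4x4 blocks (DF_NCHWN4C4) and a single "deconv_direct" kernel computes the transposed convolution.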
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/deconvolution_mali_fp16.h" +#include "gpu/mali/fp16/deconvolution_direct_mali_fp16.h" + +inline EE deconv_direct_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(tmpBuf); + cl_mem inbuf, biasmem, outbuf, fltbuf; + inbuf = input->mem; + fltbuf = filter->mem; + biasmem = bias->mem; + outbuf = output->mem; + U32 iw, ih, ic; + U32 fn, fw, fh, fc, sw, sh, pw, ph; + U32 ow, oh, oc, on; + sw = convParamSpec.stride_w; + sh = convParamSpec.stride_h; + ph = convParamSpec.padding_top; + pw = convParamSpec.padding_left; + tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, &ih, &iw); + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 iw_str, ih_str, ihw_str, ic_str, iw_off, ih_off; + ih_str = input->desc.stride[0]; + iw_str = input->desc.stride[1]; + ic_str = input->desc.stride[2]; + ih_off = input->desc.offset[0]; + iw_off = input->desc.offset[1]; + ihw_str = ih_str * iw_str; + + U32 ow_str, oh_str, ohw_str, ow_off, oh_off; + oh_str = output->desc.stride[0]; + ow_str = output->desc.stride[1]; + oh_off = output->desc.offset[0]; + ow_off = output->desc.offset[1]; + ohw_str = oh_str * ow_str; + + char kernelname[128]; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim; + Kernel kernel; + // switch(activationMode) { + // case ACTIVATION_RELU: + // strcpy(modeName, "relu_"); + // break; + // case ACTIVATION_RELU6: + // strcpy(modeName, "relu6_"); + // break; + // case ACTIVATION_NULL: + // strcpy(modeName, ""); + // break; + // default: + // return NOT_SUPPORTED; + // } + // if(item_k == 0) { + // if((ih_str > 1 || iw_str > 1) && (item_c != 4)) CHECK_STATUS(NOT_SUPPORTED); + // sprintf(kernelname, "conv_direct_spe_fwhs1_%s%d", modeName, item_c); + // ic_str = filter->desc.stride[1]; + // ow = fn; + // gs[0] = fn; + // gs[1] = 1; + // gs[2] = 1; + // dim = 1; + // } else { + // item_k = item_k >> 2; + // sprintf(kernelname, "conv_direct_s%d_%s%d%d%d",sw, modeName, fw, item_w, item_k); + sprintf(kernelname, "deconv_direct"); + gs[0] = oh; + gs[1] = ow; + gs[2] = (oc + 3) / 4; + dim = 3; + // } + U32 in_channel_blocks = (ic + 3) / 4; + U32 out_channel_blocks = gs[2]; + + pw = fw - pw - 1; + ph = fh - ph - 1; + U32 align_h = sh - 1 - ph; + U32 align_w = sw - 1 - pw; + + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, inbuf, fltbuf, outbuf, biasmem, iw, iw_str, iw_off, ih, + ih_str, ih_off, fw, fh, fc, fn, sw, sh, pw, ph, ow, ow_str, ow_off, oh, oh_str, oh_off, ic, + oc, align_h, align_w, in_channel_blocks, out_channel_blocks)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, input, "deconv_direct_input")); + CHECK_STATUS(gcl_print_memory(handle, filter, "deconv_direct_filter")); + CHECK_STATUS(gcl_print_memory(handle, bias, "deconv_direct_bias")); + CHECK_STATUS(gcl_print_memory(handle, output, "deconv_direct_output")); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + 
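// Filter setup for the direct path: fc and fn are padded to multiples of 4 (item_c/item_k) and packed into the blocked DF_NCHWN4C4 layout consumed by the deconv_direct kernel.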
+EE deconvolution_direct_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + U32 s0 = 0; + U32 s1 = 0; + U32 s2 = 0; + U32 num = 0; + U32 byteSize; + if (item_c == 4) { + s0 = fw * fh; + s1 = (fc + item_c - 1) / item_c; + s2 = (fn + item_k - 1) / item_k; + gclmemFilterDesc->memFormat = DF_NCHWN4C4; + num = s0 * s1 * s2 * item_c * item_k; + } else { + CHECK_STATUS(NOT_MATCH); + } + byteSize = num * bytesOf(DT_F16); + gclmemFilterDesc->stride[0] = s0; + gclmemFilterDesc->stride[1] = s1; + gclmemFilterDesc->stride[2] = s2; + gclmemFilterDesc->offset[0] = 0; + gclmemFilterDesc->offset[1] = 0; + gclmemFilterDesc->offset[2] = 0; + gclmemFilterDesc->num = num; + gclmemFilterDesc->byteSize = byteSize; + gclmemFilterDesc->memType = GCL_MEM_BUF; + gclmemFilterDesc->flags = CL_MEM_READ_WRITE; + gclmemFilterDesc->host_ptr = NULL; + *bytes = 0; + return SUCCESS; +} + +EE deconvolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + DataType fdt; + DataFormat fdf; + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); + U32 fwh = fw * fh; + U32 item_k = forwardRunInfo->best_k[0]; + if (item_k != 4) { + CHECK_STATUS(NOT_MATCH); + } + // if(item_k == 0) item_k = fn; + char kernelname[128]; + Kernel kernel; + sprintf(kernelname, "deconv_direct_trans_fltbuf"); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn, filter->mem, fltmem->mem)); + U32 gs[3] = {fwh, (fc + 3) / 4, (fn + 3) / 4 * 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + *fltmemDesc = tensor4df(fdt, fdf, fn, fc, fh, fw); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, filter, "deconv_direct_filter_org")); + CHECK_STATUS(gcl_print_memory(handle, fltmem, "deconv_direct_filter_tran")); +#endif + return SUCCESS; +} + +EE deconvolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + UNUSED(inputDesc); + UNUSED(filterDesc); + UNUSED(outputDesc); + UNUSED(convParamSpec); + UNUSED(forwardRunInfo); + *bytes = 0; + return SUCCESS; +} + +EE deconvolution_direct_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + U32 fw, fh, ih, iw; + tensorSelectGet(filterDesc, NULL, NULL, NULL, NULL, &fh, &fw); + tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + if (inputDesc.df == DF_NCHW || (fw == 1 && fh == 1 && ih == 1 && iw == 1)) { + CHECK_STATUS(deconv_direct_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode)); + } else { + 
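// The direct kernel only handles NCHW inputs or the fully 1x1 case; anything else is rejected. + 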
CHECK_STATUS(NOT_SUPPORTED); + } + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/deconvolution_direct_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/deconvolution_direct_mali_fp16.h new file mode 100644 index 00000000..8fc35c66 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/deconvolution_direct_mali_fp16.h @@ -0,0 +1,54 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_DECONVOLUTION_DIRECT_MALI_FP16 +#define _H_DECONVOLUTION_DIRECT_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE deconvolution_direct_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE deconvolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE deconvolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE deconvolution_direct_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.cpp new file mode 100644 index 00000000..bf862ac1 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.cpp @@ -0,0 +1,261 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/deconvolution_mali_fp16.h" +#include "gpu/mali/fp16/deconvolution_gemm_mali_fp16.h" + +inline EE deconv_gemm_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + cl_mem inbuf, biasmem, outbuf, fltbuf, tmp; + inbuf = input->mem; + fltbuf = filter->mem; + biasmem = bias->mem; + outbuf = output->mem; + tmp = tmpBuf->mem; + U32 iw, ih, ic; + U32 fn, fw, fh, fc, sw, sh, pw, ph; + U32 ow, oh, oc, on; + sw = convParamSpec.stride_w; + sh = convParamSpec.stride_h; + ph = convParamSpec.padding_top; + pw = convParamSpec.padding_left; + tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, &ih, &iw); + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 iw_str, ih_str, ic_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + U32 ihw_str = ih_str * iw_str; + U32 ohw_str = oh_str * ow_str; + + U32 item_w = forwardRunInfo->best_w[0]; + U32 item_c = forwardRunInfo->best_c[0]; + item_c = item_c >> 2; + char kernelname[128]; + char modeName[16]; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + switch (activationMode) { + case ACTIVATION_RELU: + strcpy(modeName, "relu_"); + break; + case ACTIVATION_NULL: + strcpy(modeName, ""); + break; + default: + return NOT_SUPPORTED; + } + + if (fw == 2 && fh == 2 && sw == 2 && sh == 2) { + if ((item_w >> 8) > 0) { + U32 item_h = item_w >> 8; + sprintf(kernelname, "deconv_gemm_f2s2_h_%s%d%d", modeName, item_h, item_c); + gs[0] = ((oh + 1) / 2 + item_h - 1) / item_h; + gs[1] = (ow + 1) / 2; + gs[2] = (fc * fw * fh + 3) / 4 / item_c; + } else { + sprintf(kernelname, "deconv_gemm_f2s2_%s%d%d", modeName, item_w, item_c); + gs[0] = (oh + 1) / 2; + gs[1] = ((ow + 1) / 2 + item_w - 1) / item_w; + gs[2] = (fc * fw * fh + 3) / 4 / item_c; + } + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, 
ihw_str, ic_str, ih_off, iw_off, oh_str, + ohw_str, oh_off, ow_off, ow, gs[0], gs[1], inbuf, fltbuf, biasmem, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + } else { + return NOT_SUPPORTED; // NOTE: the im2col + col2im fallback below is currently disabled and unreachable + U32 th_str = ih; + U32 tw_str = iw; + U32 th_off = 0; + U32 tw_off = 0; + U32 th = ih; + U32 tw = iw; + U32 tc = fw * fh * fc; + U32 thw_str = th_str * tw_str; + if ((item_w >> 8) > 0) { + U32 item_h = item_w >> 8; + sprintf(kernelname, "conv_direct_s1_h_1%d%d", item_w, item_c); + gs[0] = (th + item_h - 1) / item_h; + gs[1] = tw; + gs[2] = (tc + 3) / 4 / item_c; + } else { + sprintf(kernelname, "conv_direct_s1_1%d%d", item_w, item_c); + gs[0] = th; + gs[1] = (tw + item_w - 1) / item_w; + gs[2] = (tc + 3) / 4 / item_c; + } + + bool has_bias = false; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, th_str, + thw_str, th_off, tw_off, tw, 1, gs[0], gs[1], has_bias, inbuf, fltbuf, biasmem, tmp)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, input, "deconv_gemm_input")); + CHECK_STATUS(gcl_print_memory(handle, filter, "deconv_gemm_filter")); + handle->t_total += handle->t_execute; +#endif + + gs[0] = oh * ow * (oc + 3) / 4; + ls[0] = 0; + dim = 1; + sprintf(kernelname, "col2im"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, th, tw, tc, fw, fh, pw, ph, sw, sh, oh_str, ow_str, + oh_off, ow_off, oh, ow, gs[0], biasmem, tmp, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, bias, "deconv_col2im_bias")); + CHECK_STATUS(gcl_print_memory(handle, output, "deconv_col2im_output")); + handle->t_total += handle->t_execute; +#endif + } + return SUCCESS; +} + +EE deconvolution_gemm_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + U32 s0 = 0; + U32 s1 = 0; + U32 s2 = 0; + U32 num = 0; + U32 byteSize; + s0 = item_c >> 2; + s1 = (fn + item_k - 1) / item_k; + s2 = (fc * fw * fh + item_c - 1) / item_c; + gclmemFilterDesc->memFormat = DF_NCHWN4C4; + num = s0 * s1 * s2 * item_c * item_k / (item_c >> 2); + byteSize = num * bytesOf(DT_F16); + gclmemFilterDesc->stride[0] = s0; + gclmemFilterDesc->stride[1] = s1; + gclmemFilterDesc->stride[2] = s2; + gclmemFilterDesc->offset[0] = 0; + gclmemFilterDesc->offset[1] = 0; + gclmemFilterDesc->offset[2] = 0; + gclmemFilterDesc->num = num; + gclmemFilterDesc->byteSize = byteSize; + gclmemFilterDesc->memType = GCL_MEM_BUF; + gclmemFilterDesc->flags = CL_MEM_READ_WRITE; + gclmemFilterDesc->host_ptr = NULL; + *bytes = 0; + return SUCCESS; +} + +EE deconvolution_gemm_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + DataType fdt; + DataFormat fdf; + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, 
&fdt, &fdf, &fn, &fc, &fh, &fw); + U32 fwh = fw * fh; + U32 fwhc = fwh * fc; + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + char kernelname[128]; + Kernel kernel; + sprintf(kernelname, "deconv_gemm_trans_fltbuf_%d%d", (item_c >> 2), item_k); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + + CHECK_STATUS(gcl_set_kernelArgs(kernel, fw, fwh, fwhc, fc, fn, filter->mem, fltmem->mem)); + U32 gs[2] = {fwh * ((fc + 3) / 4), (fn + 3) / 4}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + *fltmemDesc = tensor4df(fdt, fdf, fn, fc, fh, fw); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, filter, "deconv_gemm_filter_org")); + CHECK_STATUS(gcl_print_memory(handle, fltmem, "deconv_gemm_filter_tran")); +#endif + return SUCCESS; +} + +EE deconvolution_gemm_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + UNUSED(outputDesc); + UNUSED(convParamSpec); + UNUSED(forwardRunInfo); + U32 iw, ih; + U32 fw, fh, fc; + tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); + tensorSelectGet(filterDesc, NULL, NULL, NULL, &fc, &fh, &fw); + *bytes = iw * ih * fw * fh * fc * bytesOf(inputDesc.dt); + return SUCCESS; +} + +EE deconvolution_gemm_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS( + deconv_gemm_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, convParamSpec, + forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, activationMode)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.h new file mode 100644 index 00000000..3cab694d --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.h @@ -0,0 +1,54 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
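+ +// Declarations for the GEMM-style deconvolution path; currently only 2x2 stride-2 filters are handled, via the fused deconv_gemm_f2s2 kernels.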
+ +#ifndef _H_DECONVOLUTION_GEMM_MALI_FP16 +#define _H_DECONVOLUTION_GEMM_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE deconvolution_gemm_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE deconvolution_gemm_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE deconvolution_gemm_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE deconvolution_gemm_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/deconvolution_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/deconvolution_mali_fp16.cpp new file mode 100644 index 00000000..31a6e82c --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/deconvolution_mali_fp16.cpp @@ -0,0 +1,166 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
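The header above fixes the usual four-stage contract: query the transformed-filter size, reorder the filter once, query the scratch size, then enqueue. A hypothetical caller sequence, assuming the *_t names are the pointer typedefs used elsewhere in this codebase and that fltmem and tmpBuf are allocated by the caller between the size queries and the calls that consume them:

static EE run_deconv_gemm_once(GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input,
    TensorDesc filterDesc, GCLMem_t filter, GCLMem_t fltmem, GCLMem_t tmpBuf,
    TensorDesc biasDesc, GCLMem_t bias, TensorDesc outputDesc, GCLMem_t output,
    ConvolutionParamSpec convParamSpec, ForwardRunInfoMali_t runInfo)
{
    U32 fltBytes = 0, tmpBytes = 0;
    GCLMemDesc fltGclDesc;
    TensorDesc fltmemDesc;
    CHECK_STATUS(deconvolution_gemm_transform_filter_bytes_mali_fp16(
        filterDesc, runInfo, &fltGclDesc, &fltBytes));
    // fltmem is assumed preallocated from fltGclDesc; the reorder runs once at load time.
    CHECK_STATUS(deconvolution_gemm_transform_filter_mali_fp16(
        handle, filterDesc, filter, runInfo, &fltmemDesc, fltmem));
    CHECK_STATUS(deconvolution_gemm_infer_forward_tmp_bytes_mali_fp16(
        inputDesc, filterDesc, outputDesc, convParamSpec, runInfo, &tmpBytes));
    // tmpBuf is assumed preallocated to tmpBytes; this enqueue runs per inference.
    return deconvolution_gemm_mali_fp16(handle, inputDesc, input, filterDesc, fltmem,
        convParamSpec, runInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output,
        ACTIVATION_NULL);
}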
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/deconvolution_mali_fp16.h" +#include "gpu/mali/fp16/deconvolution_direct_mali_fp16.h" +#include "gpu/mali/fp16/deconvolution_gemm_mali_fp16.h" + +inline EE deconvolution_checkpara_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + const GCLMem_t bias, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (nullptr == handle || nullptr == input || nullptr == filter || nullptr == output || + nullptr == bias) { + return NULL_POINTER; + } + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != filterDesc.dt || inputDesc.dt != DT_F16) { + return NOT_MATCH; + } + + U32 ic, fc, fn, fh, fw, oc; + CHECK_STATUS(tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, NULL, NULL)); + CHECK_STATUS(tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensorSelectGet(outputDesc, NULL, NULL, NULL, &oc, NULL, NULL)); + if (input->desc.memFormat != DF_NCWHC4) { + return NOT_MATCH; + } + if (output->desc.memFormat != DF_NCWHC4) { + return NOT_MATCH; + } + if (fc != oc) { + return NOT_MATCH; + } + if (ic != fn) { + return NOT_MATCH; + } + return SUCCESS; +} + +EE deconvolution_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = deconvolution_direct_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = deconvolution_gemm_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE deconvolution_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem, + GCLMem_t tmp) +{ + EE ret = SUCCESS; + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = deconvolution_direct_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = deconvolution_gemm_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE deconvolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + EE ret = SUCCESS; + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = deconvolution_direct_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = deconvolution_gemm_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE deconvolution_mali_fp16(GCLHandle_t handle, + TensorDesc 
inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + CHECK_STATUS(deconvolution_checkpara_mali_fp16( + handle, inputDesc, input, filterDesc, filter, bias, outputDesc, output)); + EE ret = SUCCESS; + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = deconvolution_direct_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = deconvolution_gemm_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/fp16/deconvolution_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/deconvolution_mali_fp16.h new file mode 100644 index 00000000..ac168e81 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/deconvolution_mali_fp16.h @@ -0,0 +1,55 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
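The parameter checks in deconvolution_checkpara_mali_fp16 above (ic == fn, fc == oc) reflect the transposed-convolution filter layout, which stores input channels in the N position and output channels in the C position. For reference, the spatial shapes these kernels relate follow the standard transposed-conv formula; a sketch, assuming zero output padding:

struct DeconvShape {
    int oh, ow;
};

// Output size of a transposed convolution (sketch): inverse of the usual conv relation.
static DeconvShape deconv_output_size(int ih, int iw, int fh, int fw,
    int sh, int sw, int ph, int pw)
{
    DeconvShape s;
    s.oh = (ih - 1) * sh + fh - 2 * ph;
    s.ow = (iw - 1) * sw + fw - 2 * pw;
    return s;
}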
+
+#ifndef _DECONVOLUTION_MALI_FP16
+#define _DECONVOLUTION_MALI_FP16
+#include "sys.h"
+#include "error.h"
+#include "types.h"
+#include "tensor_computing_type.h"
+
+EE deconvolution_transform_filter_bytes_mali_fp16(TensorDesc filterDesc,
+    ForwardRunInfoMali_t forwardRunInfo,
+    GCLMemDesc_t gclmemFilterDesc,
+    U32 *bytes);
+
+EE deconvolution_transform_filter_mali_fp16(GCLHandle_t handle,
+    TensorDesc filterDesc,
+    GCLMem_t filter,
+    ForwardRunInfoMali_t forwardRunInfo,
+    TensorDesc *fltmemDesc,
+    GCLMem_t fltmem,
+    GCLMem_t tmp);
+
+EE deconvolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    ForwardRunInfoMali_t forwardRunInfo,
+    U32 *bytes);
+
+EE deconvolution_mali_fp16(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    const GCLMem_t input,
+    TensorDesc filterDesc,
+    const GCLMem_t filter,
+    ConvolutionParamSpec convParamSpec,
+    ForwardRunInfoMali_t forwardRunInfo,
+    TensorDesc biasDesc,
+    const GCLMem_t bias,
+    U32 tmpBytes,
+    GCLMem_t tmpBuf,
+    TensorDesc outputDesc,
+    GCLMem_t output,
+    ActivationMode activationMode);
+#endif
diff --git a/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.cpp
new file mode 100644
index 00000000..db03ecd2
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.cpp
@@ -0,0 +1,128 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
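The file that follows picks a fast path for NCWHC4 inputs with blockSize == 2 and otherwise transposes to NCHW in tmp (via mem_trans_ncwhc4_to_nchw) before running depth2space_nchw. The underlying mapping, as a scalar NCHW sketch (the block-in-channel ordering -- ONNX's DCR vs CRD -- is a convention, and CRD is assumed here):

// Depth-to-space with block size bs: channel index folds into a bs x bs spatial block.
static void depth2space_ref(const float *in, float *out, int ic, int ih, int iw, int bs)
{
    int oc = ic / (bs * bs);
    int oh = ih * bs, ow = iw * bs;
    for (int c = 0; c < oc; ++c) {
        for (int i = 0; i < bs; ++i) {
            for (int j = 0; j < bs; ++j) {
                int cin = (c * bs + i) * bs + j;  // CRD: block bits are the low bits
                for (int h = 0; h < ih; ++h) {
                    for (int w = 0; w < iw; ++w) {
                        out[(c * oh + (h * bs + i)) * ow + (w * bs + j)] =
                            in[(cin * ih + h) * iw + w];
                    }
                }
            }
        }
    }
}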
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/depth2space_mali_fp16.h" + +inline EE depth2space_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + if (outputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE depth2space_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + Depth2SpaceParamSpec p, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(outputDesc); + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + U32 iw_str, ih_str, iw_off, ih_off, iwh_str, ic_str; + U32 ow_str, oh_str, ow_off, oh_off, owh_str; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + iwh_str = iw_str * ih_str; + owh_str = ow_str * oh_str; + cl_mem inbuf, outbuf, tmp; + inbuf = input->mem; + outbuf = output->mem; + tmp = tmpBuf->mem; + DataFormat memFormat = input->desc.memFormat; + + if (memFormat == DF_NCWHC4 && p.blockSize == 2) { + U32 gs[3] = {ih, iw, (ic_str + 3) / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "depth2space_ncwhc4_2x2", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, p.blockSize, ih_str, iwh_str, ic_str, ih_off, + iw_off, oh_str, owh_str, oh_off, ow_off, ih, iw, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "depth2space_ncwhc4_2x2"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "depth2space_ncwhc4_2x2")); +#endif + return SUCCESS; + } else if (memFormat == DF_NCHW || memFormat == DF_NCWHC4) { + if (memFormat == DF_NCWHC4) { + U32 gs0[3] = {ih, (iw + 3) / 4, (ic + 3) / 4}; + U32 ls0[3] = {0, 0, 0}; + U32 dim0 = 3; + Kernel kernel0; + CHECK_STATUS(gcl_create_kernel(handle, "mem_trans_ncwhc4_to_nchw", &kernel0)); + CHECK_STATUS(gcl_set_kernelArgs(kernel0, iw_str, ih_str, iw_off, ih_off, iw, ih, 0, 0, + iw, ih, ic, iw, ih, ic, 0, 0, inbuf, tmp)); + gcl_set_kernelVec(handle, kernel0, dim0, gs0, ls0, "mem_trans_ncwhc4_to_nchw"); +#ifdef _DEBUG + CHECK_STATUS( + gcl_run_kernel(handle, kernel0, dim0, gs0, ls0, "mem_trans_ncwhc4_to_nchw")); +#endif + inbuf = tmp; + } + U32 gs[3] = { + iw, ih, (ic / (p.blockSize * p.blockSize) + 3) / 4 * (p.blockSize * p.blockSize)}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "depth2space_nchw", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, p.blockSize, iw_str, iwh_str, iw_off, ih_off, + oh_str, owh_str, oh_off, ow_off, iw, ih, ic, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "depth2space_nchw"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "depth2space_nchw")); +#endif + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE depth2space_infer_tmpBuf_size_mali_fp16( + TensorDesc inputDesc, Depth2SpaceParamSpec p, TensorDesc outputDesc, U32 *bytes) +{ + UNUSED(outputDesc); + DataFormat idf; + DataType idt; + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + *bytes = 0; + if (idf == DF_NCHW && p.blockSize != 2) { + *bytes = in * ic * ih * iw * bytesOf(idt); + } + return SUCCESS; +} + +EE depth2space_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + Depth2SpaceParamSpec p, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + 
CHECK_STATUS(depth2space_checkpara_mali_fp16(inputDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS( + depth2space_core_mali_fp16(handle, inputDesc, input, p, tmpBuf, outputDesc, output)); + return ret; +} diff --git a/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.h new file mode 100644 index 00000000..90d1efe8 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.h @@ -0,0 +1,31 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _DEPTH2SPACE_MALI_FP16 +#define _DEPTH2SPACE_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE depth2space_infer_tmpBuf_size_mali_fp16( + TensorDesc inputDesc, Depth2SpaceParamSpec p, TensorDesc outputDesc, U32 *bytes); + +EE depth2space_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + Depth2SpaceParamSpec p, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.cpp new file mode 100644 index 00000000..1b764e92 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.cpp @@ -0,0 +1,184 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.h" + +inline EE depthwise_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode) +{ + UNUSED(inputDesc); + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(tmpBuf); + + cl_mem inbuf, biasimg, outbuf, fltbuf; + inbuf = input->mem; + fltbuf = filter->mem; + biasimg = bias->mem; + outbuf = output->mem; + U32 fw, sw, pw, ph; + U32 ow, oh, oc, on; + sw = convParamSpec.stride_w; + ph = convParamSpec.padding_top; + pw = convParamSpec.padding_left; + tensorSelectGet(filterDesc, NULL, NULL, NULL, NULL, NULL, &fw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 iw_str, ih_str, ihw_str, ic_str, ih_off, iw_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + iw_off -= pw; + ih_off -= ph; + ihw_str = iw_str * ih_str; + + U32 ow_str, oh_str, ow_off, oh_off, ohw_str; + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + ohw_str = oh_str * ow_str; + + U32 item_w = forwardRunInfo->best_w[0]; + U32 gs[3] = {oh, (ow + item_w - 1) / item_w, (oc + 3) / 4 * on}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + char kernelname[128]; + Kernel kernel; + if (depthwiseActivationMode == ACTIVATION_NULL) { + sprintf(kernelname, "conv_depthwise_s%d_%d%d", sw, fw, item_w); + } else if (depthwiseActivationMode == ACTIVATION_RELU) { + sprintf(kernelname, "conv_depthwise_s%d_relu_%d%d", sw, fw, item_w); + } else if (depthwiseActivationMode == ACTIVATION_RELU6) { + sprintf(kernelname, "conv_depthwise_s%d_relu6_%d%d", sw, fw, item_w); + } else { + UNI_ERROR_LOG("xxx %d \n", (int)depthwiseActivationMode); + CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; + } + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, oh_str, ow_str, + ohw_str, oh_off, ow_off, ow, gs[0], gs[1], inbuf, fltbuf, biasimg, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + +EE depthwise_convolution_direct_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + U32 fw, fh, fc; + tensorSelectGet(filterDesc, NULL, NULL, NULL, &fc, &fh, &fw); + U32 item_k = forwardRunInfo->best_k[0]; + U32 s0, s1, s2; + U32 num, byteSize; + s0 = fw * fh; + s1 = (fc + item_k - 1) / item_k; + s2 = 1; + num = s0 * s1 * s2 * item_k; + byteSize = num * bytesOf(DT_F16); + gclmemFilterDesc->stride[0] = s0; + gclmemFilterDesc->stride[1] = s1; + gclmemFilterDesc->stride[2] = s2; + gclmemFilterDesc->offset[0] = 0; + gclmemFilterDesc->offset[1] = 0; + gclmemFilterDesc->offset[2] = 0; + gclmemFilterDesc->num = num; + gclmemFilterDesc->byteSize = byteSize; + 
gclmemFilterDesc->memType = GCL_MEM_BUF; + gclmemFilterDesc->memFormat = DF_NHWCN4; + gclmemFilterDesc->flags = CL_MEM_READ_WRITE; + gclmemFilterDesc->host_ptr = NULL; + *bytes = 0; + return SUCCESS; +} + +EE depthwise_convolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + DataType fdt; + DataFormat fdf; + U32 fw, fh, fc; + tensorSelectGet(filterDesc, &fdt, &fdf, NULL, &fc, &fh, &fw); + U32 fwh = fw * fh; + U32 item_k = forwardRunInfo->best_k[0]; + char kernelname[128]; + Kernel kernel; + sprintf(kernelname, "conv_depthwise_trans_fltbuf_%d", item_k); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, filter->mem, fltmem->mem)); + U32 gs[3] = {fwh, (fc + item_k - 1) / item_k}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 2; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + *fltmemDesc = tensor4df(fdt, fdf, 1, fc, fh, fw); + return SUCCESS; +} + +EE depthwise_convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + UNUSED(inputDesc); + UNUSED(filterDesc); + UNUSED(outputDesc); + UNUSED(convParamSpec); + UNUSED(forwardRunInfo); + *bytes = 0; + return SUCCESS; +} + +EE depthwise_convolution_direct_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode) +{ + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(depthwise_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + depthwiseActivationMode)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.h new file mode 100644 index 00000000..84c06e38 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.h @@ -0,0 +1,54 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _DEPTHWISE_CONVOLUTION_DIRECT_MALI_FP16 +#define _DEPTHWISE_CONVOLUTION_DIRECT_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE depthwise_convolution_direct_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE depthwise_convolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE depthwise_convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE depthwise_convolution_direct_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.cpp new file mode 100644 index 00000000..a546c75b --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.cpp @@ -0,0 +1,157 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
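For orientation before the dispatcher below: depthwise convolution filters each channel independently with its own fh x fw slice, which is why the parameter checks insist fc == oc and why one filter plane per channel suffices. A scalar NCHW reference, as a sketch (zero padding assumed, matching the halo the GPU path pre-fills):

static void depthwise_ref(const float *in, const float *flt, const float *bias, float *out,
    int c, int ih, int iw, int fh, int fw, int oh, int ow, int sh, int sw, int ph, int pw)
{
    for (int k = 0; k < c; ++k) {  // each channel uses only its own filter slice
        for (int y = 0; y < oh; ++y) {
            for (int x = 0; x < ow; ++x) {
                float acc = bias[k];
                for (int r = 0; r < fh; ++r) {
                    for (int s = 0; s < fw; ++s) {
                        int sy = y * sh + r - ph;
                        int sx = x * sw + s - pw;
                        if (sy < 0 || sy >= ih || sx < 0 || sx >= iw) {
                            continue;  // zero padding
                        }
                        acc += in[(k * ih + sy) * iw + sx] * flt[(k * fh + r) * fw + s];
                    }
                }
                out[(k * oh + y) * ow + x] = acc;
            }
        }
    }
}

Note also how the relu/relu6 variants are handled: the activation is fused by selecting a specialized kernel name at enqueue time rather than by a runtime branch inside the kernel.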
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/depthwise_convolution_mali_fp16.h" +#include "gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.h" + +inline EE depthwise_convolution_checkpara_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + const GCLMem_t bias, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (nullptr == handle || nullptr == input || nullptr == filter || nullptr == output || + nullptr == bias) { + return NULL_POINTER; + } + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != filterDesc.dt || inputDesc.dt != DT_F16) { + return NOT_MATCH; + } + + DataFormat fdf; + U32 ic, fc, fh, fw, oc; + CHECK_STATUS(tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, NULL, NULL)); + CHECK_STATUS(tensorSelectGet(filterDesc, NULL, &fdf, NULL, &fc, &fh, &fw)); + CHECK_STATUS(tensorSelectGet(outputDesc, NULL, NULL, NULL, &oc, NULL, NULL)); + if (input->desc.memFormat == DF_NCWHC4) { + if (filter->desc.memFormat != DF_NHWCN4) { + return NOT_MATCH; + } + if (output->desc.memFormat != DF_NCWHC4) { + return NOT_MATCH; + } + } + if (fw != 3 && fw != 5 && fw != 7) { + return NOT_MATCH; + } + if (fdf == DF_NCHW && ic != fc) { + return NOT_MATCH; + } + if (fc != oc) { + return NOT_MATCH; + } + return SUCCESS; +} + +EE depthwise_convolution_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + DepthwiseConvolutionForwardAlgorithm algorithm = + (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_convolution_direct_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_convolution_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + EE ret = SUCCESS; + DepthwiseConvolutionForwardAlgorithm algorithm = + (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_convolution_direct_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_convolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + EE ret = SUCCESS; + DepthwiseConvolutionForwardAlgorithm algorithm = + (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_convolution_direct_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_convolution_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + 
GCLMem_t output, + ActivationMode depthwiseActivationMode) +{ + EE ret = SUCCESS; + CHECK_STATUS(depthwise_convolution_checkpara_mali_fp16( + handle, inputDesc, input, filterDesc, filter, bias, outputDesc, output)); + DepthwiseConvolutionForwardAlgorithm algorithm = + (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_convolution_direct_mali_fp16(handle, inputDesc, input, filterDesc, + filter, convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, + output, depthwiseActivationMode); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.h new file mode 100644 index 00000000..5a876c9d --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.h @@ -0,0 +1,54 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
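A note on the iw_off -= pw; ih_off -= ph adjustments in the *_core functions above: GCLMem allocations carry a halo, and the stored offsets point at the first valid pixel, so subtracting the padding re-aims the base address into the halo and lets the kernels read full fh x fw windows without per-pixel bounds checks. The implied NCWHC4 addressing, sketched under the assumption that w is the outer spatial axis, h the inner one, and four channels are packed per slot (matching the ih_str/ihw_str kernel arguments):

// Returns the half4 slot index; multiply by 4 for the scalar element offset.
static inline int gclmem_slot(int iw_str, int ih_str, int iw_off, int ih_off,
    int c_blk, int x, int y)
{
    // c_blk selects a 4-channel slice; (x, y) may go "negative" after the padding
    // subtraction and then land inside the allocated halo rather than out of bounds.
    return (c_blk * iw_str + (x + iw_off)) * ih_str + (y + ih_off);
}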
+ +#ifndef _DEPTHWISE_CONVOLUTION_MALI_FP16 +#define _DEPTHWISE_CONVOLUTION_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE depthwise_convolution_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE depthwise_convolution_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE depthwise_convolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE depthwise_convolution_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.cpp new file mode 100644 index 00000000..551ea9e2 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.cpp @@ -0,0 +1,277 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.h" + +inline EE depthwise_pointwise_direct_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode) +{ + UNUSED(inputDesc); + UNUSED(dwBiasDesc); + UNUSED(pwBiasDesc); + UNUSED(tmpBytes); + + cl_mem inbuf, dwBiasimg, pwBiasimg, outbuf, dwFltbuf, pwFltbuf, tmp; + inbuf = input->mem; + dwFltbuf = dwFilter->mem; + pwFltbuf = pwFilter->mem; + dwBiasimg = dwBias->mem; + pwBiasimg = pwBias->mem; + outbuf = output->mem; + tmp = tmpBuf->mem; + U32 fw, sw, pw, ph, fc; + U32 ow, oh, oc, on; + sw = convParamSpec.stride_w; + ph = convParamSpec.padding_top; + pw = convParamSpec.padding_left; + tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, &fc, NULL, &fw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 iw_str, ih_str, ihw_str, ic_str, ih_off, iw_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + iw_off -= pw; + ih_off -= ph; + ihw_str = iw_str * ih_str; + + U32 th_str, tw_str, th_off, tw_off, thw_str; + U32 w_align, item_wd, item_wp; + item_wd = forwardRunInfo->best_w[0]; + item_wp = forwardRunInfo->best_w[1]; + w_align = (ow + item_wp - 1) / item_wp * item_wp; + th_str = oh; + tw_str = w_align; + th_off = 0; + tw_off = 0; + thw_str = th_str * tw_str; + + U32 ow_str, oh_str, ow_off, oh_off, ohw_str; + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + ohw_str = oh_str * ow_str; + + U32 gs[3] = {oh, (ow + item_wd - 1) / item_wd, (fc + 3) / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + char kernelname[128]; + Kernel kernel; + if (depthwiseActivationMode == ACTIVATION_NULL) { + sprintf(kernelname, "conv_depthwise_s%d_%d%d", sw, fw, item_wd); + } else if (depthwiseActivationMode == ACTIVATION_RELU) { + sprintf(kernelname, "conv_depthwise_s%d_relu_%d%d", sw, fw, item_wd); + } else if (depthwiseActivationMode == ACTIVATION_RELU6) { + sprintf(kernelname, "conv_depthwise_s%d_relu6_%d%d", sw, fw, item_wd); + } else { + CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; + } + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, th_str, tw_str, + thw_str, th_off, tw_off, ow, gs[0], gs[1], inbuf, dwFltbuf, dwBiasimg, tmp)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + fw = 1; + sw = 1; + U32 item_kp = forwardRunInfo->best_k[1]; + item_kp = item_kp >> 2; + if (pointwiseActivationMode == ACTIVATION_NULL) { + sprintf(kernelname, "conv_direct_s%d_%d%d%d", sw, fw, item_wp, item_kp); + } else if (pointwiseActivationMode == ACTIVATION_RELU) { + sprintf(kernelname, "conv_direct_s%d_relu_%d%d%d", sw, fw, item_wp, item_kp); + } else { + CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; + } + + U32 gsp[3] = {oh, (ow + item_wp - 1) / item_wp, (oc + 3) / 4 * on / item_kp}; + U32 lsp[3] = {0, 0, 0}; + 
U32 dimp = 3; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, th_str, thw_str, ic_str, th_off, tw_off, oh_str, + ohw_str, oh_off, ow_off, ow, 1, gsp[0], gsp[1], tmp, pwFltbuf, pwBiasimg, outbuf)); + gcl_set_kernelVec(handle, kernel, dimp, gsp, lsp, kernelname); + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dimp, gsp, lsp, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + +EE depthwise_pointwise_convolution_direct_transform_filter_bytes_mali_fp16(TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemDwFilterDesc, + GCLMemDesc_t gclmemPwFilterDesc, + U32 *bytes) +{ + U32 fw, fh, fc, fn; + tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, &fc, &fh, &fw); + tensorSelectGet(pwFilterDesc, NULL, NULL, &fn, NULL, NULL, NULL); + U32 item_kd = forwardRunInfo->best_k[0]; + U32 item_kp = forwardRunInfo->best_k[1]; + U32 item_c = forwardRunInfo->best_c[1]; + U32 s0, s1, s2; + U32 num, byteSize; + s0 = fw * fh; + s1 = (fc + item_kd - 1) / item_kd; + s2 = 1; + num = s0 * s1 * s2 * item_kd; + byteSize = num * bytesOf(DT_F16); + gclmemDwFilterDesc->stride[0] = s0; + gclmemDwFilterDesc->stride[1] = s1; + gclmemDwFilterDesc->stride[2] = s2; + gclmemDwFilterDesc->offset[0] = 0; + gclmemDwFilterDesc->offset[1] = 0; + gclmemDwFilterDesc->offset[2] = 0; + gclmemDwFilterDesc->num = num; + gclmemDwFilterDesc->byteSize = byteSize; + gclmemDwFilterDesc->memType = GCL_MEM_BUF; + gclmemDwFilterDesc->memFormat = DF_NHWCN4; + gclmemDwFilterDesc->flags = CL_MEM_READ_WRITE; + gclmemDwFilterDesc->host_ptr = NULL; + + s0 = item_kp >> 2; + s1 = (fc + item_c - 1) / item_c; + s2 = (fn + item_kp - 1) / item_kp; + num = s0 * s1 * s2 * item_c * item_kp / (item_kp >> 2); + byteSize = num * bytesOf(DT_F16); + gclmemPwFilterDesc->stride[0] = s0; + gclmemPwFilterDesc->stride[1] = s1; + gclmemPwFilterDesc->stride[2] = s2; + gclmemPwFilterDesc->offset[0] = 0; + gclmemPwFilterDesc->offset[1] = 0; + gclmemPwFilterDesc->offset[2] = 0; + gclmemPwFilterDesc->num = num; + gclmemPwFilterDesc->byteSize = byteSize; + gclmemPwFilterDesc->memType = GCL_MEM_BUF; + gclmemPwFilterDesc->memFormat = DF_NCHWN4C4; + gclmemPwFilterDesc->flags = CL_MEM_READ_WRITE; + gclmemPwFilterDesc->host_ptr = NULL; + *bytes = 0; + return SUCCESS; +} + +EE depthwise_pointwise_convolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + GCLMem_t dwFilter, + GCLMem_t pwFilter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *dwFltmemDesc, + TensorDesc *pwFltmemDesc, + GCLMem_t dwFltmem, + GCLMem_t pwFltmem) +{ + U32 dfw, dfh, dfc; + U32 pfc, pfn; + tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, &dfc, &dfh, &dfw); + tensorSelectGet(pwFilterDesc, NULL, NULL, &pfn, &pfc, NULL, NULL); + U32 dfwh = dfw * dfh; + U32 item_kd = forwardRunInfo->best_k[0]; + U32 item_kp = forwardRunInfo->best_k[1]; + U32 item_c = forwardRunInfo->best_c[1]; + char kernelname[128]; + Kernel kernel; + sprintf(kernelname, "conv_depthwise_trans_fltbuf_%d", item_kd); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, dfwh, dfc, dwFilter->mem, dwFltmem->mem)); + U32 gs[2] = {dfwh, (dfc + item_kd - 1) / item_kd}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + *dwFltmemDesc = dwFilterDesc; + + sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d", item_c, 
item_kp); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, 1, pfc, pfn, pwFilter->mem, pwFltmem->mem)); + U32 gsc[3] = {1, (pfc + item_c - 1) / item_c, (pfn + item_kp - 1) / item_kp * item_kp}; + U32 lsc[3] = {0, 0, 0}; + U32 dimc = 3; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dimc, gsc, lsc, kernelname)); + *pwFltmemDesc = pwFilterDesc; + return SUCCESS; +} + +EE depthwise_pointwise_convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + UNUSED(inputDesc); + UNUSED(pwFilterDesc); + UNUSED(convParamSpec); + DataType odt; + U32 oh, ow, fc; + tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, &fc, NULL, NULL); + tensorSelectGet(outputDesc, &odt, NULL, NULL, NULL, &oh, &ow); + + U32 w_align; + U32 item_w = forwardRunInfo->best_w[1]; + w_align = (ow + item_w - 1) / item_w * item_w; + *bytes = oh * w_align * ((fc + 3) / 4) * 4 * bytesOf(odt); + return SUCCESS; +} + +EE depthwise_pointwise_convolution_direct_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode) +{ + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(depthwise_pointwise_direct_core_mali_fp16(handle, inputDesc, input, dwFilterDesc, + pwFilterDesc, dwFilter, pwFilter, convParamSpec, forwardRunInfo, dwBiasDesc, pwBiasDesc, + dwBias, pwBias, tmpBytes, tmpBuf, outputDesc, output, depthwiseActivationMode, + pointwiseActivationMode)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.h new file mode 100644 index 00000000..72a682d8 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.h @@ -0,0 +1,66 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT_MALI_FP16 +#define _DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE depthwise_pointwise_convolution_direct_transform_filter_bytes_mali_fp16(TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemDwFilterDesc, + GCLMemDesc_t gclmemPwFilterDesc, + U32 *bytes); + +EE depthwise_pointwise_convolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + GCLMem_t dwFilter, + GCLMem_t pwFilter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *dwFltmemDesc, + TensorDesc *pwFltmemDesc, + GCLMem_t dwFltmem, + GCLMem_t pwFltmem); + +EE depthwise_pointwise_convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE depthwise_pointwise_convolution_direct_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.cpp new file mode 100644 index 00000000..b08de440 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.cpp @@ -0,0 +1,278 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
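This variant replaces the direct pointwise kernel with a TN GEMM: M = ALIGN(oc, item_kp), N = ALIGN(oh * ow, item_whp), K = fc, and the 1x1 filter is kept K x M (DF_HWCN, i.e. already transposed) so both operands stream contiguously along K. A scalar sketch of the product the gemm_tn_ncwhc4 kernels compute (per-output-channel bias assumed; the real kernel additionally rewrites the result into the NCWHC4 output layout):

// C = A^T * B with A stored K x M and B stored K x N.
static void gemm_tn_ref(const float *A, const float *B, const float *bias, float *C,
    int M, int N, int K)
{
    for (int m = 0; m < M; ++m) {
        for (int n = 0; n < N; ++n) {
            float acc = bias[m];
            for (int k = 0; k < K; ++k) {
                acc += A[k * M + m] * B[k * N + n];
            }
            C[m * N + n] = acc;
        }
    }
}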
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.h" + +inline EE depthwise_pointwise_gemm_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode) +{ + UNUSED(inputDesc); + UNUSED(dwBiasDesc); + UNUSED(pwBiasDesc); + UNUSED(tmpBytes); + + cl_mem inbuf, dwBiasimg, pwBiasbuf, outbuf, dwFltbuf, pwFltbuf, tmp; + inbuf = input->mem; + dwFltbuf = dwFilter->mem; + pwFltbuf = pwFilter->mem; + dwBiasimg = dwBias->mem; + pwBiasbuf = pwBias->mem; + outbuf = output->mem; + tmp = tmpBuf->mem; + U32 fw, sw, pw, ph, fc; + U32 ow, oh, oc, on; + sw = convParamSpec.stride_w; + ph = convParamSpec.padding_top; + pw = convParamSpec.padding_left; + tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, &fc, NULL, &fw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 iw_str, ih_str, ihw_str, ic_str, ih_off, iw_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + iw_off -= pw; + ih_off -= ph; + ihw_str = iw_str * ih_str; + + U32 th_str, tw_str, th_off, tw_off, thw_str; + U32 item_wd, item_whp, item_kp; + item_wd = forwardRunInfo->best_w[0]; + item_whp = forwardRunInfo->best_w[1]; + item_kp = forwardRunInfo->best_k[1]; + th_str = oh; + tw_str = ow; + th_off = 0; + tw_off = 0; + thw_str = ALIGN(th_str * tw_str, item_whp); + + U32 ow_str, oh_str, ow_off, oh_off, ohw_str; + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + ohw_str = oh_str * ow_str; + + U32 gs[3] = {oh, ALIGN(ow, item_wd) / item_wd, ALIGN(fc, 4) / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + char kernelname[128]; + Kernel kernel; + if (depthwiseActivationMode == ACTIVATION_NULL) { + sprintf(kernelname, "conv_depthwise_s%d_ncwh_%d%d", sw, fw, item_wd); + } else if (depthwiseActivationMode == ACTIVATION_RELU) { + sprintf(kernelname, "conv_depthwise_s%d_relu_ncwh_%d%d", sw, fw, item_wd); + } else if (depthwiseActivationMode == ACTIVATION_RELU6) { + sprintf(kernelname, "conv_depthwise_s%d_relu6_ncwh_%d%d", sw, fw, item_wd); + } else { + CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; + } + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, th_str, tw_str, + thw_str, th_off, tw_off, ow, gs[0], gs[1], inbuf, dwFltbuf, dwBiasimg, tmp)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + if (pointwiseActivationMode == ACTIVATION_NULL) { + sprintf(kernelname, "gemm_tn_ncwhc4_%d%d", item_kp, item_whp); + } else if (pointwiseActivationMode == ACTIVATION_RELU) { + sprintf(kernelname, "gemm_tn_relu_ncwhc4_%d%d", item_kp, item_whp); + } else { + CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; + } + + U32 M, N, K; + M = ALIGN(oc, item_kp); + N = thw_str; + K = fc; + U32 gsp[3] = {N / item_whp, M / item_kp}; + U32 lsp[3] = {0, 0}; + U32 dimp = 2; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, 
&kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, K, oh, ow, oc, oh_str, ow_str, ohw_str, oh_off, + ow_off, gsp[0], gsp[1], pwFltbuf, tmp, pwBiasbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dimp, gsp, lsp, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dimp, gsp, lsp, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + +EE depthwise_pointwise_convolution_gemm_transform_filter_bytes_mali_fp16(TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemDwFilterDesc, + GCLMemDesc_t gclmemPwFilterDesc, + U32 *bytes) +{ + U32 fw, fh, fc, fn; + tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, &fc, &fh, &fw); + tensorSelectGet(pwFilterDesc, NULL, NULL, &fn, NULL, NULL, NULL); + U32 item_kd = forwardRunInfo->best_k[0]; + U32 item_kp = forwardRunInfo->best_k[1]; + U32 item_c = forwardRunInfo->best_c[1]; + U32 s0, s1, s2; + U32 num, byteSize; + s0 = fw * fh; + s1 = ALIGN(fc, item_kd) / item_kd; + s2 = 1; + num = s0 * s1 * s2 * item_kd; + byteSize = num * bytesOf(DT_F16); + gclmemDwFilterDesc->stride[0] = s0; + gclmemDwFilterDesc->stride[1] = s1; + gclmemDwFilterDesc->stride[2] = s2; + gclmemDwFilterDesc->offset[0] = 0; + gclmemDwFilterDesc->offset[1] = 0; + gclmemDwFilterDesc->offset[2] = 0; + gclmemDwFilterDesc->num = num; + gclmemDwFilterDesc->byteSize = byteSize; + gclmemDwFilterDesc->memType = GCL_MEM_BUF; + gclmemDwFilterDesc->memFormat = DF_NHWCN4; + gclmemDwFilterDesc->flags = CL_MEM_READ_WRITE; + gclmemDwFilterDesc->host_ptr = NULL; + + s0 = ALIGN(fn, item_kp); + s1 = ALIGN(fc, item_c); + s2 = 1; + num = s0 * s1 * s2; + byteSize = num * bytesOf(DT_F16); + gclmemPwFilterDesc->stride[0] = s0; + gclmemPwFilterDesc->stride[1] = s1; + gclmemPwFilterDesc->stride[2] = s2; + gclmemPwFilterDesc->offset[0] = 0; + gclmemPwFilterDesc->offset[1] = 0; + gclmemPwFilterDesc->offset[2] = 0; + gclmemPwFilterDesc->num = num; + gclmemPwFilterDesc->byteSize = byteSize; + gclmemPwFilterDesc->memType = GCL_MEM_BUF; + gclmemPwFilterDesc->memFormat = DF_HWCN; + gclmemPwFilterDesc->flags = CL_MEM_READ_WRITE; + gclmemPwFilterDesc->host_ptr = NULL; + *bytes = 0; + return SUCCESS; +} + +EE depthwise_pointwise_convolution_gemm_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + GCLMem_t dwFilter, + GCLMem_t pwFilter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *dwFltmemDesc, + TensorDesc *pwFltmemDesc, + GCLMem_t dwFltmem, + GCLMem_t pwFltmem) +{ + U32 dfw, dfh, dfc; + U32 pfc, pfn; + tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, &dfc, &dfh, &dfw); + tensorSelectGet(pwFilterDesc, NULL, NULL, &pfn, &pfc, NULL, NULL); + U32 dfwh = dfw * dfh; + U32 item_kd = forwardRunInfo->best_k[0]; + U32 item_kp = forwardRunInfo->best_k[1]; + U32 item_c = forwardRunInfo->best_c[1]; + char kernelname[128]; + Kernel kernel; + sprintf(kernelname, "conv_depthwise_trans_fltbuf_%d", item_kd); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, dfwh, dfc, dwFilter->mem, dwFltmem->mem)); + U32 gs[2] = {dfwh, (dfc + item_kd - 1) / item_kd}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + *dwFltmemDesc = dwFilterDesc; + + U32 fn_align = ALIGN(pfn, item_kp); + sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d", item_c, 0); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, 1, pfc, 
fn_align, pwFilter->mem, pwFltmem->mem)); + U32 gsc[3] = {1, ALIGN(pfc, item_c) / item_c, fn_align}; + U32 lsc[3] = {0, 0, 0}; + U32 dimc = 3; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dimc, gsc, lsc, kernelname)); + *pwFltmemDesc = pwFilterDesc; + return SUCCESS; +} + +EE depthwise_pointwise_convolution_gemm_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + UNUSED(inputDesc); + UNUSED(pwFilterDesc); + UNUSED(convParamSpec); + DataType odt; + U32 oh, ow, fc; + tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, &fc, NULL, NULL); + tensorSelectGet(outputDesc, &odt, NULL, NULL, NULL, &oh, &ow); + + U32 N; + U32 item_wh = forwardRunInfo->best_w[1]; + N = ALIGN(oh * ow, item_wh); + *bytes = N * ALIGN(fc, 4) * bytesOf(odt); + return SUCCESS; +} + +EE depthwise_pointwise_convolution_gemm_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode) +{ + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(depthwise_pointwise_gemm_core_mali_fp16(handle, inputDesc, input, dwFilterDesc, + pwFilterDesc, dwFilter, pwFilter, convParamSpec, forwardRunInfo, dwBiasDesc, pwBiasDesc, + dwBias, pwBias, tmpBytes, tmpBuf, outputDesc, output, depthwiseActivationMode, + pointwiseActivationMode)); + + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.h new file mode 100644 index 00000000..72fdf75b --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.h @@ -0,0 +1,66 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
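// [Editor's sketch - not part of the patch] The GEMM path above relies on the
// ALIGN(x, n) round-up so that every launched work-item owns a full tile: the
// depthwise stage writes its result with thw_str = ALIGN(oh * ow, item_whp)
// rows of scratch, and the pointwise GEMM launches gsp = {N / item_whp,
// M / item_kp} work-items with M = ALIGN(oc, item_kp), N = thw_str. Assuming
// ALIGN is the usual round-up-to-multiple macro, it is equivalent to:
#include <cstdint>
static inline uint32_t align_up(uint32_t x, uint32_t n)
{
    return (x + n - 1) / n * n;  // smallest multiple of n that is >= x
}
// Example: oh * ow = 100 outputs with item_whp = 8 gives align_up(100, 8) = 104,
// so the padded global size divides evenly into 13 tiles of 8 work-items each.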
+ +#ifndef _DEPTHWISE_POINTWISE_CONVOLUTION_GEMM_MALI_FP16 +#define _DEPTHWISE_POINTWISE_CONVOLUTION_GEMM_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE depthwise_pointwise_convolution_gemm_transform_filter_bytes_mali_fp16(TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemDwFilterDesc, + GCLMemDesc_t gclmemPwFilterDesc, + U32 *bytes); + +EE depthwise_pointwise_convolution_gemm_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + GCLMem_t dwFilter, + GCLMem_t pwFilter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *dwFltmemDesc, + TensorDesc *pwFltmemDesc, + GCLMem_t dwFltmem, + GCLMem_t pwFltmem); + +EE depthwise_pointwise_convolution_gemm_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE depthwise_pointwise_convolution_gemm_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_mali_fp16.cpp new file mode 100644 index 00000000..28b32d67 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_mali_fp16.cpp @@ -0,0 +1,190 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
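// [Editor's sketch - not part of the patch] The dispatcher in the file below
// routes a depthwise-separable convolution to either the direct or the GEMM
// implementation, after its checkpara helper validates the separable shape
// contract: the depthwise filter carries one plane per input channel, and the
// pointwise 1x1 filter consumes those channels and defines the output
// channels. A standalone restatement of that contract (names hypothetical):
#include <cassert>
#include <cstdint>
static void check_dw_pw_shapes(
    uint32_t ic, uint32_t dfc, uint32_t pfc, uint32_t fn, uint32_t oc, uint32_t fw)
{
    assert(fw == 3 || fw == 5 || fw == 7);  // the only depthwise widths with tuned kernels
    assert(ic == dfc && ic == pfc);         // depthwise and pointwise both act per input channel
    assert(fn == oc);                       // pointwise filter count defines the output channels
}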
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/depthwise_pointwise_convolution_mali_fp16.h" +#include "gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.h" +#include "gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.h" + +inline EE depthwise_pointwise_convolution_checkpara_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (nullptr == handle || nullptr == input || nullptr == dwFilter || nullptr == pwFilter || + nullptr == output || nullptr == dwBias || nullptr == pwBias || nullptr == tmpBuf) { + return NULL_POINTER; + } + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != dwFilterDesc.dt || + inputDesc.dt != pwFilterDesc.dt || inputDesc.dt != DT_F16) { + return NOT_MATCH; + } + + U32 ic, fn, fh, fw, oc; + U32 dfc, pfc; + CHECK_STATUS(tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, NULL, NULL)); + CHECK_STATUS(tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, &dfc, &fh, &fw)); + CHECK_STATUS(tensorSelectGet(pwFilterDesc, NULL, NULL, &fn, &pfc, NULL, NULL)); + CHECK_STATUS(tensorSelectGet(outputDesc, NULL, NULL, NULL, &oc, NULL, NULL)); + if (fw != 3 && fw != 5 && fw != 7) { + return NOT_MATCH; + } + if (ic != dfc || ic != pfc) { + return NOT_MATCH; + } + if (fn != oc) { + return NOT_MATCH; + } + return SUCCESS; +} + +EE depthwise_pointwise_convolution_transform_filter_bytes_mali_fp16(TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemDwFilterDesc, + GCLMemDesc_t gclmemPwFilterDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + DepthwiseConvolutionForwardAlgorithm algorithm = + (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_pointwise_convolution_direct_transform_filter_bytes_mali_fp16( + dwFilterDesc, pwFilterDesc, forwardRunInfo, gclmemDwFilterDesc, gclmemPwFilterDesc, + bytes); + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM: + ret = depthwise_pointwise_convolution_gemm_transform_filter_bytes_mali_fp16(dwFilterDesc, + pwFilterDesc, forwardRunInfo, gclmemDwFilterDesc, gclmemPwFilterDesc, bytes); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_pointwise_convolution_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + GCLMem_t dwFilter, + GCLMem_t pwFilter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *dwFltmemDesc, + TensorDesc *pwFltmemDesc, + GCLMem_t dwFltmem, + GCLMem_t pwFltmem) +{ + EE ret = SUCCESS; + DepthwiseConvolutionForwardAlgorithm algorithm = + (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_pointwise_convolution_direct_transform_filter_mali_fp16(handle, + dwFilterDesc, pwFilterDesc, dwFilter, pwFilter, forwardRunInfo, dwFltmemDesc, + pwFltmemDesc, dwFltmem, pwFltmem); + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM: + ret = depthwise_pointwise_convolution_gemm_transform_filter_mali_fp16(handle, + dwFilterDesc, pwFilterDesc, dwFilter, pwFilter, forwardRunInfo, dwFltmemDesc, + pwFltmemDesc, dwFltmem, pwFltmem); + break; + default: + ret = NOT_SUPPORTED; + 
break; + } + return ret; +} + +EE depthwise_pointwise_convolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + EE ret = SUCCESS; + DepthwiseConvolutionForwardAlgorithm algorithm = + (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_pointwise_convolution_direct_infer_forward_tmp_bytes_mali_fp16(inputDesc, + dwFilterDesc, pwFilterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM: + ret = depthwise_pointwise_convolution_gemm_infer_forward_tmp_bytes_mali_fp16(inputDesc, + dwFilterDesc, pwFilterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_pointwise_convolution_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode) +{ + EE ret = SUCCESS; + CHECK_STATUS(depthwise_pointwise_convolution_checkpara_mali_fp16(handle, inputDesc, input, + dwFilterDesc, pwFilterDesc, dwFilter, pwFilter, dwBias, pwBias, tmpBuf, outputDesc, output)); + DepthwiseConvolutionForwardAlgorithm algorithm = + (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_pointwise_convolution_direct_mali_fp16(handle, inputDesc, input, + dwFilterDesc, pwFilterDesc, dwFilter, pwFilter, convParamSpec, forwardRunInfo, + dwBiasDesc, pwBiasDesc, dwBias, pwBias, tmpBytes, tmpBuf, outputDesc, output, + depthwiseActivationMode, pointwiseActivationMode); + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM: + ret = depthwise_pointwise_convolution_gemm_mali_fp16(handle, inputDesc, input, + dwFilterDesc, pwFilterDesc, dwFilter, pwFilter, convParamSpec, forwardRunInfo, + dwBiasDesc, pwBiasDesc, dwBias, pwBias, tmpBytes, tmpBuf, outputDesc, output, + depthwiseActivationMode, pointwiseActivationMode); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_mali_fp16.h new file mode 100644 index 00000000..1468f365 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_mali_fp16.h @@ -0,0 +1,66 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _DEPTHWISE_POINTWISE_CONVOLUTION_MALI_FP16 +#define _DEPTHWISE_POINTWISE_CONVOLUTION_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE depthwise_pointwise_convolution_transform_filter_bytes_mali_fp16(TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemDwFilterDesc, + GCLMemDesc_t gclmemPwFilterDesc, + U32 *bytes); + +EE depthwise_pointwise_convolution_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + GCLMem_t dwFilter, + GCLMem_t pwFilter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *dwFltmemDesc, + TensorDesc *pwFltmemDesc, + GCLMem_t dwFltmem, + GCLMem_t pwFltmem); + +EE depthwise_pointwise_convolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE depthwise_pointwise_convolution_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.cpp new file mode 100644 index 00000000..4062bc06 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.cpp @@ -0,0 +1,290 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#include <algorithm>
+#include <vector>
+
+#include "sys.h"
+#include "error.h"
+#include "types.h"
+#include "gpu/mali/fp16/eltwise_mali_fp16.h"
+
+bool eltwise_same_desc(std::vector<TensorDesc> inputDesc, U32 *arrayDimMax)
+{
+    U32 size = inputDesc.size();
+    U32 dimMax = 0;
+    for (U32 i = 1; i < size; i++) {
+        if (inputDesc[i].nDims > inputDesc[dimMax].nDims) {
+            dimMax = i;
+        } else if (inputDesc[i].nDims == inputDesc[dimMax].nDims) {
+            U32 nDims = inputDesc[dimMax].nDims;
+            U32 sign[8];
+            if (nDims > 8) {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+            for (U32 j = 0; j < nDims; j++) {
+                if (inputDesc[i].dims[j] > inputDesc[dimMax].dims[j]) {
+                    sign[j] = 2;
+                } else if (inputDesc[i].dims[j] == inputDesc[dimMax].dims[j]) {
+                    sign[j] = 1;
+                } else {
+                    sign[j] = 0;
+                }
+            }
+            if (*std::max_element(sign, sign + nDims) == 2 &&
+                *std::min_element(sign, sign + nDims) == 1) {
+                dimMax = i;
+            }
+            if (*std::max_element(sign, sign + nDims) == 2 &&
+                *std::min_element(sign, sign + nDims) == 0) {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+        }
+    }
+
+    bool sameDesc = true;
+    DataFormat idf;
+    U32 in, ic, ih, iw;
+    tensorSelectGet(inputDesc[0], NULL, &idf, &in, &ic, &ih, &iw);
+    for (U32 i = 1; i < size; i++) {
+        DataFormat tdf;
+        U32 tn, tc, th, tw;
+        tensorSelectGet(inputDesc[i], NULL, &tdf, &tn, &tc, &th, &tw);
+        if (tdf != idf || in != tn || ic != tc || ih != th || iw != tw) {
+            sameDesc = false;
+            break;
+        }
+    }
+    *arrayDimMax = dimMax;
+    return sameDesc;
+}
+
+inline EE eltwise_checkpara_mali_fp16(
+    std::vector<TensorDesc> inputDesc, std::vector<void *> input, TensorDesc outputDesc)
+{
+    for (auto it : inputDesc) {
+        if (it.dt != outputDesc.dt) {
+            return NOT_SUPPORTED;
+        }
+    }
+    U32 num = input.size();
+    if (num > 8) {
+        return NOT_SUPPORTED;
+    }
+    if (outputDesc.dt != DT_F16) {
+        return NOT_SUPPORTED;
+    }
+    return SUCCESS;
+}
+
+inline EE eltwise_core_mali_fp16(GCLHandle_t handle,
+    std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    TensorDesc outputDesc,
+    GCLMem_t output,
+    EltwiseParamSpec eltwiseDesc)
+{
+    UNUSED(outputDesc);
+    U32 iw, ih, ic, in;
+    U32 arrayDimMax;
+    bool sameDesc = eltwise_same_desc(inputDesc, &arrayDimMax);
+    tensorSelectGet(inputDesc[arrayDimMax], NULL, NULL, &in, &ic, &ih, &iw);
+
+    U32 num = input.size();
+    GCLMem_t inputMem[8];
+    for (U32 i = 0; i < num; ++i) {
+        inputMem[i] = (GCLMem_t)input[i];
+    }
+    cl_mem outbuf;
+    outbuf = output->mem;
+
+    U32 ow_str, oh_str, oc_str, ow_off, oh_off;
+    U32 iw_str[8];
+    U32 ih_str[8];
+    U32 iw_off[8];
+    U32 ih_off[8];
+    for (U32 i = 0; i < num; ++i) {
+
get_gclmem_dim(inputMem[i]->desc, &iw_str[i], &ih_str[i], NULL, &iw_off[i], &ih_off[i]); + } + get_gclmem_dim(output->desc, &ow_str, &oh_str, &oc_str, &ow_off, &oh_off); + + char modeName[16]; + char activeName[16]; + char kernelName[128]; + EltwiseMode eltwiseMode = eltwiseDesc.elt_mode; + ActivationMode activeMode = eltwiseDesc.activation_type; + + Kernel kernel; + if (eltwiseMode == ELTWISE_MAX) { + strcpy(modeName, "max"); + } + if (eltwiseMode == ELTWISE_SUM) { + strcpy(modeName, "sum"); + } + if (eltwiseMode == ELTWISE_PROD) { + strcpy(modeName, "prod"); + } + switch (activeMode) { + case ACTIVATION_RELU: + strcpy(activeName, "relu_"); + break; + case ACTIVATION_NULL: + strcpy(activeName, ""); + break; + default: + return NOT_SUPPORTED; + } + U32 gs[3] = {ih, iw, (ic + 3) / 4 * in}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + if (activeMode != ACTIVATION_NULL && !sameDesc) { + CHECK_STATUS(NOT_SUPPORTED); + } + + if (sameDesc) { + char formatName[16] = ""; + if (inputMem[0]->desc.memFormat == DF_NCHW) { + strcpy(formatName, "nchw_"); + gs[0] = (iw + 3) / 4; + gs[1] = ih; + gs[2] = ic; + if (output->desc.memFormat == DF_NCWHC4) { + CHECK_STATUS(NOT_SUPPORTED); + } + } + sprintf(kernelName, "eltwise_%s%s%s%d", formatName, activeName, modeName, num); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + switch (num) { + case 1: + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ic, oh_str, ow_str, oh_off, ow_off, + gs[0], gs[1], ih_str[0], iw_str[0], ih_off[0], iw_off[0], inputMem[0]->mem, + outbuf)); + break; + case 2: + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ic, oh_str, ow_str, oh_off, ow_off, + gs[0], gs[1], ih_str[0], iw_str[0], ih_off[0], iw_off[0], inputMem[0]->mem, + ih_str[1], iw_str[1], ih_off[1], iw_off[1], inputMem[1]->mem, outbuf)); + break; + case 3: + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ic, oh_str, ow_str, oh_off, ow_off, + gs[0], gs[1], ih_str[0], iw_str[0], ih_off[0], iw_off[0], inputMem[0]->mem, + ih_str[1], iw_str[1], ih_off[1], iw_off[1], inputMem[1]->mem, ih_str[2], + iw_str[2], ih_off[2], iw_off[2], inputMem[2]->mem, outbuf)); + break; + case 4: + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ic, oh_str, ow_str, oh_off, ow_off, + gs[0], gs[1], ih_str[0], iw_str[0], ih_off[0], iw_off[0], inputMem[0]->mem, + ih_str[1], iw_str[1], ih_off[1], iw_off[1], inputMem[1]->mem, ih_str[2], + iw_str[2], ih_off[2], iw_off[2], inputMem[2]->mem, ih_str[3], iw_str[3], + ih_off[3], iw_off[3], inputMem[3]->mem, outbuf)); + break; + default: + return NOT_SUPPORTED; + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; + } else { + if (num > 2) { + CHECK_STATUS(NOT_SUPPORTED) + } + DataFormat mf[2]; + mf[0] = inputMem[arrayDimMax]->desc.memFormat; + mf[1] = inputMem[1 - arrayDimMax]->desc.memFormat; + if (mf[0] == DF_NCWHC4 && mf[1] == DF_NCWHC4) { + U32 w_str, h_str, c_str, w_off, h_off; + get_gclmem_dim(inputMem[1 - arrayDimMax]->desc, &w_str, &h_str, &c_str, &w_off, &h_off); + if (w_str == 1 && h_str == 1 && c_str == 1) { + sprintf(kernelName, "eltwise_broadcast_%s%d", modeName, 0); + } else if (w_str == 1 && h_str == 1) { + sprintf(kernelName, "eltwise_broadcast_%s%d", modeName, 1); + } else if (w_str != 1 && h_str == 1) { + sprintf(kernelName, "eltwise_broadcast_%s%d", modeName, 2); + } else if (w_str == 1 && h_str != 1) { + sprintf(kernelName, "eltwise_broadcast_%s%d", modeName, 3); + } 
else {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+            CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel));
+            CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ic, ih_str[arrayDimMax],
+                iw_str[arrayDimMax], ih_off[arrayDimMax], iw_off[arrayDimMax], oh_str, ow_str,
+                oh_off, ow_off, inputMem[arrayDimMax]->mem, inputMem[1 - arrayDimMax]->mem, outbuf));
+            gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName);
+#ifdef _DEBUG
+            CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName));
+            handle->t_total += handle->t_execute;
+#endif
+            return SUCCESS;
+        } else if (mf[0] == DF_NCWHC4 && mf[1] == DF_NCHW) {
+            U32 axis_a[3];
+            U32 axis_b[3];
+            tensorSelectGet(
+                inputDesc[arrayDimMax], NULL, NULL, NULL, &axis_a[2], &axis_a[1], &axis_a[0]);
+            tensorSelectGet(
+                inputDesc[1 - arrayDimMax], NULL, NULL, NULL, &axis_b[2], &axis_b[1], &axis_b[0]);
+            U32 matchAxis[2];
+            for (U32 i = 0; i < 3; ++i) {
+                for (U32 j = 0; j < 3; ++j) {
+                    if (axis_a[i] == axis_b[j] && axis_b[j] != 1) {
+                        matchAxis[0] = i;
+                        matchAxis[1] = j;
+                        break;
+                    }
+                }
+            }
+            if (matchAxis[0] == 2) {
+                for (U32 i = 0; i < 3; ++i) {
+                    if (i != matchAxis[1]) {
+                        if (axis_b[i] != 1) {
+                            CHECK_STATUS(NOT_SUPPORTED);
+                        }
+                        if (inputMem[1 - arrayDimMax]->desc.stride[i] != 1) {
+                            CHECK_STATUS(NOT_SUPPORTED);
+                        }
+                        if (inputMem[1 - arrayDimMax]->desc.offset[i] != 0) {
+                            CHECK_STATUS(NOT_SUPPORTED);
+                        }
+                    }
+                }
+                sprintf(kernelName, "eltwise_spe_nchw_c_%s", modeName);
+                CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel));
+                CHECK_STATUS(
+                    gcl_set_kernelArgs(kernel, ih, iw, ic, ih_str[arrayDimMax], iw_str[arrayDimMax],
+                        ih_off[arrayDimMax], iw_off[arrayDimMax], oh_str, ow_str, oh_off, ow_off,
+                        inputMem[arrayDimMax]->mem, inputMem[1 - arrayDimMax]->mem, outbuf));
+                gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName);
+#ifdef _DEBUG
+                CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName));
+                handle->t_total += handle->t_execute;
+#endif
+                return SUCCESS;
+            }
+        }
+    }
+    return NOT_SUPPORTED;
+}
+
+EE eltwise_mali_fp16(GCLHandle_t handle,
+    std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    TensorDesc outputDesc,
+    GCLMem_t output,
+    EltwiseParamSpec eltwiseDesc)
+{
+    CHECK_STATUS(eltwise_checkpara_mali_fp16(inputDesc, input, outputDesc));
+    CHECK_STATUS(fill_output_zero(handle, output, outputDesc));
+    CHECK_STATUS(eltwise_core_mali_fp16(handle, inputDesc, input, outputDesc, output, eltwiseDesc));
+    return SUCCESS;
+}
diff --git a/tensor_computing/src/gpu/mali/fp16/eltwise_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.h
similarity index 75%
rename from tensor_computing/src/gpu/mali/fp16/eltwise_mali_fp16.h
rename to compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.h
index 9f724162..ddc17912 100644
--- a/tensor_computing/src/gpu/mali/fp16/eltwise_mali_fp16.h
+++ b/compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.h
@@ -11,20 +11,19 @@
 // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
 #ifndef _H_ELTWISE_MALI_FP16
 #define _H_ELTWISE_MALI_FP16
 #include "sys.h"
-#include "type.h"
-#include "tensor_desc.h"
 #include "error.h"
+#include "types.h"
 #include "tensor_computing_type.h"
 
-EE eltwise_mali_fp16(GCLHandle_t handle,
-    std::vector<TensorDesc> inputDesc,
-    std::vector<void *> input,
-    TensorDesc outputDesc,
-    GCLMem_t output,
-    EltwiseMode eltwiseMode);
-#endif
+bool eltwise_same_desc(std::vector<TensorDesc> inputDesc, U32 *arrayDimMax);
+EE eltwise_mali_fp16(GCLHandle_t handle,
+    std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    TensorDesc outputDesc,
+    GCLMem_t output,
+    EltwiseParamSpec eltwiseDesc);
+#endif
diff --git a/compute/tensor/src/gpu/mali/fp16/embedding_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/embedding_mali_fp16.cpp
new file mode 100644
index 00000000..f4532f33
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/fp16/embedding_mali_fp16.cpp
@@ -0,0 +1,86 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
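// [Editor's sketch - not part of the patch] The "embedding" kernel in the file
// below is a row gather: for each of `step` token ids in the input, one
// `num_output`-wide row of the weight matrix is copied to the output. A plain
// C++ reference of that mapping (row-major weight layout assumed):
#include <cstddef>
#include <cstdint>
#include <vector>
static std::vector<float> embedding_ref(
    const std::vector<uint32_t> &ids, const std::vector<float> &weight, uint32_t on)
{
    std::vector<float> out(ids.size() * on);
    for (std::size_t t = 0; t < ids.size(); ++t) {
        for (uint32_t c = 0; c < on; ++c) {
            out[t * on + c] = weight[ids[t] * on + c];  // gather one weight row per token id
        }
    }
    return out;
}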
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/embedding_mali_fp16.h" + +inline EE embedding_checkpara_mali_fp16(TensorDesc weightDesc, TensorDesc outputDesc) +{ + if (weightDesc.dt != outputDesc.dt || weightDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE embedding_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc weightDesc, + GCLMem_t weight, + EmbedParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(weightDesc); + UNUSED(outputDesc); + U32 step = inputDesc.dims[0]; + U32 on = p.num_output; + U32 oh_str = output->desc.stride[0]; + U32 ow_str = output->desc.stride[1]; + U32 oc_str = output->desc.stride[2]; + U32 oh_off = output->desc.offset[0]; + U32 ow_off = output->desc.offset[1]; + if (ow_str != 1 || oh_off != 0 || ow_off != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + cl_mem inbuf, weibuf, outbuf; + inbuf = input->mem; + weibuf = weight->mem; + outbuf = output->mem; + + if (!p.transpose) { + U32 gs[2] = {oc_str, step}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "embedding", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs( + kernel, step, on, oc_str, oh_str, oh_off, ow_off, inbuf, weibuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "embedding"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "embedding")); + CHECK_STATUS(gcl_print_memory(handle, output, "embedding_output")); +#endif + return SUCCESS; + } else { + return NOT_SUPPORTED; + } +} + +EE embedding_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc weightDesc, + GCLMem_t weight, + EmbedParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + CHECK_STATUS(embedding_checkpara_mali_fp16(weightDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(embedding_core_mali_fp16( + handle, inputDesc, input, weightDesc, weight, p, outputDesc, output)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/embedding_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/embedding_mali_fp16.h new file mode 100644 index 00000000..309402b2 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/embedding_mali_fp16.h @@ -0,0 +1,29 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef _EMBEDDING_MALI_FP16 +#define _EMBEDDING_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE embedding_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc weightDesc, + GCLMem_t weight, + EmbedParamSpec p, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/fully_connected_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/fully_connected_mali_fp16.cpp new file mode 100644 index 00000000..dd828ad1 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/fully_connected_mali_fp16.cpp @@ -0,0 +1,353 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
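// [Editor's sketch - not part of the patch] For a 1x1 filter the code below
// reuses the direct-convolution kernels; otherwise it splits the dot products
// into two passes, fc_p1 (partial sums into the tmp buffer) and fc_p2
// (reduction plus bias). A scalar analogue of that two-pass split, assuming
// x.size() is divisible by `slices`:
#include <cstddef>
#include <numeric>
#include <vector>
static float fc_two_pass(
    const std::vector<float> &x, const std::vector<float> &w, float bias, std::size_t slices)
{
    std::vector<float> partial(slices, 0.0f);  // plays the role of the tmp buffer
    const std::size_t len = x.size() / slices;
    for (std::size_t s = 0; s < slices; ++s) {  // pass 1: independent partial sums
        for (std::size_t i = 0; i < len; ++i) {
            partial[s] += x[s * len + i] * w[s * len + i];
        }
    }
    return std::accumulate(partial.begin(), partial.end(), bias);  // pass 2: reduce + bias
}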
+
+#include "sys.h"
+#include "error.h"
+#include "types.h"
+#include "gpu/mali/fp16/fully_connected_mali_fp16.h"
+
+inline EE fully_connected_checkpara_mali_fp16(
+    TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc)
+{
+    if (inputDesc.dt != outputDesc.dt || inputDesc.dt != filterDesc.dt || inputDesc.dt != DT_F16) {
+        return NOT_MATCH;
+    }
+    return SUCCESS;
+}
+
+inline EE fully_connected_core_mali_fp16(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    const GCLMem_t input,
+    TensorDesc filterDesc,
+    std::vector<GCLMem_t> filter,
+    TensorDesc biasDesc,
+    std::vector<GCLMem_t> bias,
+    U32 tmpBytes,
+    GCLMem_t tmpBuf,
+    TensorDesc outputDesc,
+    std::vector<GCLMem_t> output,
+    ForwardRunInfoMali_t forwardRunInfo)
+{
+    UNUSED(biasDesc);
+    UNUSED(tmpBytes);
+    UNUSED(outputDesc);
+
+    U32 ih_str, iw_str, ih_off, iw_off, ihw_str;
+    U32 oh_str, ow_str, oh_off, ow_off;
+    U32 fw, fh, fc, fn;
+    cl_mem inbuf, fltbuf, biasmem, outbuf, tmp;
+    inbuf = input->mem;
+    fltbuf = filter[0]->mem;
+    biasmem = bias[0]->mem;
+    outbuf = output[0]->mem;
+    tmp = tmpBuf->mem;
+
+    tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw);
+    ih_str = input->desc.stride[0];
+    iw_str = input->desc.stride[1];
+    ih_off = input->desc.offset[0];
+    iw_off = input->desc.offset[1];
+    oh_str = output[0]->desc.stride[0];
+    ow_str = output[0]->desc.stride[1];
+    oh_off = output[0]->desc.offset[0];
+    ow_off = output[0]->desc.offset[1];
+    ihw_str = ih_str * iw_str;
+    char kernelname[128];
+    Kernel kernel;
+    U32 gs[3];
+    U32 ls[3] = {0, 0, 0};
+    U32 dim = 3;
+    U32 item_w = forwardRunInfo->best_w[0];
+    U32 item_c = forwardRunInfo->best_c[0];
+    U32 item_k = forwardRunInfo->best_k[0];
+
+    if (fw == 1 && fh == 1) {
+        if (inputDesc.df == DF_NCHW || inputDesc.df == DF_NORMAL) {
+            U32 ic_str;
+            ic_str = filter[0]->desc.stride[1];
+            if (ih_str > 1 || iw_str > 1) {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+            sprintf(kernelname, "conv_direct_spe_fwhs1_%d", item_c);
+            gs[0] = fn;
+            gs[1] = 1;
+            gs[2] = 1;
+            dim = 1;
+            CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel));
+            CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, oh_str,
+                ow_str, oh_off, ow_off, fn, gs[0], gs[1], inbuf, fltbuf, biasmem, outbuf));
+            gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname);
+#ifdef _DEBUG
+            CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname));
+            CHECK_STATUS(gcl_print_memory(handle, input, "fc_wh1_input"));
+            CHECK_STATUS(gcl_print_memory(handle, filter[0], "fc_wh1_filter"));
+            CHECK_STATUS(gcl_print_memory(handle, bias[0], "fc_wh1_bias"));
+            CHECK_STATUS(gcl_print_memory(handle, output[0], "fc_wh1_output"));
+            handle->t_total += handle->t_execute;
+#endif
+        }
+        if (inputDesc.df == DF_MKT) {
+            item_k = item_k >> 2;
+            U32 ic_str = input->desc.stride[2];
+            U32 ohw_str;
+            U32 step = inputDesc.dims[0];
+            sprintf(kernelname, "conv_direct_s%d_%d%d%d", 1, 1, item_w, item_k);
+            for (U32 i = 0; i < filter.size(); ++i) {
+                fltbuf = filter[i]->mem;
+                biasmem = bias[i]->mem;
+                outbuf = output[i]->mem;
+                iw_str = input->desc.stride[0];
+                ih_str = input->desc.stride[1];
+                iw_off = input->desc.offset[0];
+                ih_off = input->desc.offset[1];
+                ow_str = output[i]->desc.stride[0];
+                oh_str = output[i]->desc.stride[1];
+                ow_off = output[i]->desc.offset[0];
+                oh_off = output[i]->desc.offset[1];
+                ohw_str = oh_str * ow_str;
+                if (ih_str != 1 || ih_off != 0) {
+                    CHECK_STATUS(NOT_SUPPORTED);
+                }
+                gs[0] = 1;
+                gs[1] = (step + item_w - 1) / item_w;
+                gs[2] = output[i]->desc.stride[2] / item_k;
+                CHECK_STATUS(gcl_create_kernel(handle, kernelname, 
&kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, + oh_str, ohw_str, oh_off, ow_off, step, 1, gs[0], gs[1], inbuf, fltbuf, biasmem, + outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, input, "conv_direct_input")); + CHECK_STATUS(gcl_print_memory(handle, filter[i], "conv_direct_filter")); + CHECK_STATUS(gcl_print_memory(handle, bias[i], "conv_direct_bias")); + CHECK_STATUS(gcl_print_memory(handle, output[i], "conv_direct_output")); + handle->t_total += handle->t_execute; +#endif + } + } + } else { + U32 ihy_str, fhy_str, fhw_str, fwc_str; + ihy_str = ih_str * item_w; + fc = (fc + item_c - 1) / item_c; + fn = (fn + item_k - 1) / item_k; + fhy_str = fh * item_w; + fhw_str = fh * fw; + fwc_str = fw * fc; + CHECK_STATUS(gcl_create_kernel(handle, "fc_p1", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, item_w, ih_str, iw_str, ih_off, iw_off, ihy_str, + ihw_str, fh, fw, fc, fn, fhy_str, fhw_str, fwc_str, fltbuf, inbuf, tmp)); + gs[0] = fh; + gs[1] = item_w; + gs[2] = fn; + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "fc_p1"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "fc_p1")); + CHECK_STATUS(gcl_print_memory(handle, input, "fc_p1_input")); + CHECK_STATUS(gcl_print_memory(handle, filter[0], "fc_p1_filter")); + CHECK_STATUS(gcl_print_buffer(handle, tmp, fh * item_w * fn * item_k, "fc_p1_output")); + handle->t_total += handle->t_execute; +#endif + CHECK_STATUS(gcl_create_kernel(handle, "fc_p2", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs( + kernel, fh * item_w, fn, oh_str, ow_str, oh_off, ow_off, tmp, biasmem, outbuf)); + U32 gs2 = fn; + U32 ls2 = 0; + dim = 1; + gcl_set_kernelVec(handle, kernel, dim, &gs2, &ls2, "fc_p2"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs2, &ls2, "fc_p2")); + CHECK_STATUS(gcl_print_memory(handle, bias[0], "fc_p2_bias")); + CHECK_STATUS(gcl_print_memory(handle, output[0], "fc_p2_output")); + handle->t_total += handle->t_execute; +#endif + } + return SUCCESS; +} + +EE fully_connected_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + U32 s0 = 0; + U32 s1 = 0; + U32 s2 = 0; + U32 num = 0; + U32 byteSize; + + if (item_k == 0) { + s0 = fn; + s1 = (fc + item_c - 1) / item_c; + s2 = 1; + DataFormat df = DF_CHWNC4; + if (item_c == 8) { + df = DF_CHWNC8; + } + if (item_c == 16) { + df = DF_CHWNC16; + } + gclmemFilterDesc->memFormat = df; + num = s0 * s1 * s2 * item_c; + } else if (fw == 1 && fh == 1) { + s0 = item_k >> 2; + s1 = (fc + item_c - 1) / item_c; + s2 = (fn + item_k - 1) / item_k; + gclmemFilterDesc->memFormat = DF_NCHWN4C4; + num = s0 * s1 * s2 * item_c * item_k / (item_k >> 2); + } else { + s0 = fh; + s1 = fw; + s2 = ((fc + item_c - 1) / item_c) * ((fn + item_k - 1) / item_k); + num = s0 * s1 * s2 * item_c * item_k; + gclmemFilterDesc->memFormat = DF_NCWHN4C4; + } + byteSize = num * bytesOf(DT_F16); + gclmemFilterDesc->stride[0] = s0; + gclmemFilterDesc->stride[1] = s1; + gclmemFilterDesc->stride[2] = s2; + gclmemFilterDesc->offset[0] = 0; + gclmemFilterDesc->offset[1] = 0; + gclmemFilterDesc->offset[2] = 0; + gclmemFilterDesc->num = num; + 
gclmemFilterDesc->byteSize = byteSize;
+    gclmemFilterDesc->memType = GCL_MEM_BUF;
+    gclmemFilterDesc->flags = CL_MEM_READ_WRITE;
+    gclmemFilterDesc->host_ptr = NULL;
+    *bytes = 0;
+    return SUCCESS;
+}
+
+EE fully_connected_transform_filter_mali_fp16(GCLHandle_t handle,
+    TensorDesc filterDesc,
+    GCLMem_t filter,
+    TensorDesc *fltmemDesc,
+    std::vector<GCLMem_t> fltmem,
+    ForwardRunInfoMali_t forwardRunInfo)
+{
+    DataType fdt;
+    DataFormat fdf;
+    U32 fw, fh, fc, fn;
+    tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw);
+    char kernelname[128];
+    Kernel kernel;
+    U32 gs[3];
+    U32 ls[3] = {0, 0, 0};
+    U32 dim = 3;
+    U32 fwh = fw * fh;
+    U32 item_c = forwardRunInfo->best_c[0];
+    U32 item_k = forwardRunInfo->best_k[0];
+    if (fw == 1 && fh == 1) {
+        if (item_k == 0) {
+            sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d", item_c, item_k);
+            CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel));
+            CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn, filter->mem, fltmem[0]->mem));
+            gs[0] = fwh;
+            gs[1] = (fc + item_c - 1) / item_c;
+            gs[2] = fn;
+            CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname));
+        } else {
+            sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d", item_c, item_k);
+            CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel));
+            if (fltmem.size() == 1) {
+                CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn, filter->mem, fltmem[0]->mem));
+                gs[0] = fwh;
+                gs[1] = (fc + item_c - 1) / item_c;
+                gs[2] = (fn + item_k - 1) / item_k * item_k;
+                CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname));
+            } else {
+                GCLMem_t tmp = gcl_create_gclmem();
+                tmp->desc.byteSize = 0;
+                for (U32 i = 0; i < fltmem.size(); ++i) {
+                    tmp->desc.byteSize += fltmem[i]->desc.byteSize;
+                }
+                tmp->desc.memType = GCL_MEM_BUF;
+                tmp->desc.flags = CL_MEM_READ_WRITE;
+                CHECK_STATUS(gcl_create_memory(handle, tmp));
+                CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn, filter->mem, tmp->mem));
+                gs[0] = fwh;
+                gs[1] = (fc + item_c - 1) / item_c;
+                gs[2] = (fn + item_k - 1) / item_k * item_k;
+                CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname));
+                U32 offset[2] = {0, 0};
+                for (U32 i = 0; i < fltmem.size(); i++) {
+                    U32 size = fltmem[i]->desc.byteSize;
+                    CHECK_STATUS(gcl_trans_memory(
+                        handle, tmp, fltmem[i], &size, DEVICE_BUF_TO_BUF, CL_TRUE, offset));
+                    offset[0] += size;
+                }
+                gcl_destroy_gclmem(tmp);
+            }
+        }
+    } else {
+        sprintf(kernelname, "fc_trans_fltbuf_%d%d", item_c, item_k);
+        CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel));
+        CHECK_STATUS(gcl_set_kernelArgs(kernel, fw, fh, fwh, fc, fn, filter->mem, fltmem[0]->mem));
+        gs[0] = fw;
+        gs[1] = fh;
+        gs[2] = (fc + item_c - 1) / item_c * ((fn + item_k - 1) / item_k) * item_k;
+        CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname));
+    }
+    *fltmemDesc = tensor4df(fdt, fdf, fn, fc, fh, fw);
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_print_memory(handle, filter, "fc_filter_org"));
+    for (U32 i = 0; i < fltmem.size(); ++i) {
+        CHECK_STATUS(gcl_print_memory(handle, fltmem[i], "fc_filter_tran"));
+    }
+#endif
+    return SUCCESS;
+}
+
+EE fully_connected_infer_forward_tmp_bytes_mali_fp16(
+    TensorDesc inputDesc, TensorDesc filterDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo)
+{
+    U32 fn, fw, fh;
+    tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, &fh, &fw);
+    if (fh == 1 && fw == 1) {
+        *bytes = 0;
+    } else {
+        DataType dt;
+        U32 ic, ih, iw;
+        tensorSelectGet(inputDesc, &dt, NULL, NULL, &ic, &ih, &iw);
+        U32 item_w = forwardRunInfo->best_w[0];
+        U32 item_k = 
forwardRunInfo->best_k[0];
+        *bytes = ih * item_w * ((fn + item_k - 1) / item_k * item_k) * bytesOf(dt);
+    }
+    return SUCCESS;
+}
+
+EE fully_connected_mali_fp16(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    const GCLMem_t input,
+    TensorDesc filterDesc,
+    std::vector<GCLMem_t> filter,
+    TensorDesc biasDesc,
+    std::vector<GCLMem_t> bias,
+    U32 tmpBytes,
+    GCLMem_t tmpBuf,
+    TensorDesc outputDesc,
+    std::vector<GCLMem_t> output,
+    ForwardRunInfoMali_t forwardRunInfo)
+{
+    CHECK_STATUS(fully_connected_checkpara_mali_fp16(inputDesc, filterDesc, outputDesc));
+    for (U32 i = 0; i < output.size(); i++) {
+        CHECK_STATUS(fill_output_zero(handle, output[i], outputDesc));
+    }
+    CHECK_STATUS(fully_connected_core_mali_fp16(handle, inputDesc, input, filterDesc, filter,
+        biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, forwardRunInfo));
+    return SUCCESS;
+}
diff --git a/compute/tensor/src/gpu/mali/fp16/fully_connected_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/fully_connected_mali_fp16.h
new file mode 100644
index 00000000..6af33a0d
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/fp16/fully_connected_mali_fp16.h
@@ -0,0 +1,48 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _FC_MALI_FP16
+#define _FC_MALI_FP16
+#include "sys.h"
+#include "error.h"
+#include "types.h"
+#include "tensor_computing_type.h"
+
+EE fully_connected_transform_filter_bytes_mali_fp16(TensorDesc filterDesc,
+    GCLMemDesc_t gclmemFilterDesc,
+    U32 *bytes,
+    ForwardRunInfoMali_t forwardRunInfo);
+
+EE fully_connected_transform_filter_mali_fp16(GCLHandle_t handle,
+    TensorDesc filterDesc,
+    GCLMem_t filter,
+    TensorDesc *fltmemDesc,
+    std::vector<GCLMem_t> fltmem,
+    ForwardRunInfoMali_t forwardRunInfo);
+
+EE fully_connected_infer_forward_tmp_bytes_mali_fp16(
+    TensorDesc inputDesc, TensorDesc filterDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo);
+
+EE fully_connected_mali_fp16(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    const GCLMem_t input,
+    TensorDesc filterDesc,
+    std::vector<GCLMem_t> filter,
+    TensorDesc biasDesc,
+    std::vector<GCLMem_t> bias,
+    U32 tmpBytes,
+    GCLMem_t tmpBuf,
+    TensorDesc outputDesc,
+    std::vector<GCLMem_t> output,
+    ForwardRunInfoMali_t forwardRunInfo);
+#endif
diff --git a/compute/tensor/src/gpu/mali/fp16/matmul_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/matmul_mali_fp16.cpp
new file mode 100644
index 00000000..e9b46aae
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/fp16/matmul_mali_fp16.cpp
@@ -0,0 +1,197 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. 
All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/matmul_mali_fp16.h" + +inline EE matmul_checkpara_mali_fp16( + TensorDesc matrixADesc, TensorDesc matrixBDesc, TensorDesc matrixCDesc) +{ + if (matrixADesc.dt != matrixBDesc.dt || matrixADesc.dt != matrixCDesc.dt || + matrixADesc.dt != DT_F16) { + return NOT_MATCH; + } + return SUCCESS; +} + +inline EE matmul_core_mali_fp16(GCLHandle_t handle, + TensorDesc matrixADesc, + bool transposeA, + const GCLMem_t matrixA, + TensorDesc matrixBDesc, + bool transposeB, + const GCLMem_t matrixB, + GCLMem_t tmp, + TensorDesc matrixCDesc, + GCLMem_t matrixC, + ForwardRunInfoMali_t forwardRunInfo) +{ + UNUSED(tmp); + UNUSED(matrixCDesc); + U32 adims = matrixADesc.nDims; + U32 ac = (adims > 2) ? 
matrixADesc.dims[2] : 1; + U32 ah = matrixADesc.dims[1]; + U32 aw = matrixADesc.dims[0]; + U32 bh = matrixBDesc.dims[1]; + U32 bw = matrixBDesc.dims[0]; + + U32 item_w = forwardRunInfo->best_w[0]; + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + cl_mem A, B, C; + A = matrixA->mem; + B = matrixB->mem; + C = matrixC->mem; + char kernelname[128]; + Kernel kernel; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + if (matrixA->desc.offset[0] != 0 || matrixA->desc.offset[1] != 0 || + matrixB->desc.offset[0] != 0 || matrixB->desc.offset[1] != 0 || + matrixC->desc.offset[0] != 0 || matrixC->desc.offset[1] != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (transposeA && !transposeB) { + U32 M = matrixA->desc.stride[0]; + U32 N = matrixB->desc.stride[0]; + U32 K = ah; + U32 ow_str = matrixC->desc.stride[0]; + U32 A_str = M * matrixA->desc.stride[1]; + U32 B_str = N * matrixB->desc.stride[1]; + U32 C_str = ow_str * matrixC->desc.stride[1]; + U32 batch = ac; + gs[0] = (bw + item_w - 1) / item_w; + gs[1] = (aw + item_k - 1) / item_k; + gs[2] = batch; + sprintf(kernelname, "gemm_tn_nobias_%d%d", item_k, item_w); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, K, ow_str, A_str, B_str, C_str, 0, 0, bw, aw, + gs[0], gs[1], 0, 0, A, B, C)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, matrixA, "gemm_tn_a")); + CHECK_STATUS(gcl_print_memory(handle, matrixB, "gemm_tn_b")); + CHECK_STATUS(gcl_print_memory(handle, matrixC, "gemm_tn_c")); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; + } + + if (!transposeA && transposeB) { + U32 KA = matrixA->desc.stride[0]; + U32 KB = matrixB->desc.stride[0]; + U32 K = (aw + item_c - 1) / item_c * item_c; + U32 ow_str = matrixC->desc.stride[0]; + U32 A_str = KA * matrixA->desc.stride[1]; + U32 B_str = KB * matrixB->desc.stride[1]; + U32 C_str = ow_str * matrixC->desc.stride[1]; + U32 batch = ac; + gs[0] = (bh + item_w - 1) / item_w; + gs[1] = (ah + item_k - 1) / item_k; + gs[2] = batch; + sprintf(kernelname, "gemm_nt_nobias_%d%d%d", item_k, item_w, (item_c >> 1)); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs( + kernel, KA, KB, K, ow_str, A_str, B_str, C_str, 0, 0, bh, ah, gs[0], gs[1], A, B, C)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, matrixA, "gemm_nt_a")); + CHECK_STATUS(gcl_print_memory(handle, matrixB, "gemm_nt_b")); + CHECK_STATUS(gcl_print_memory(handle, matrixC, "gemm_nt_c")); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; + } + + if (transposeA && transposeB) { + if (matrixADesc.df != DF_MKT) { + CHECK_STATUS(NOT_SUPPORTED); + } + U32 m, k, t; + get_nlp_mkt_val(matrixADesc, NULL, &m, &k, &t); + if (t != 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (m != 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (aw != 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (ah != k) { + CHECK_STATUS(NOT_MATCH); + } + U32 KA = matrixA->desc.stride[2] * 4; + U32 KB = matrixB->desc.stride[0]; + U32 K = (ah + item_c - 1) / item_c * item_c; + U32 ow_str = matrixC->desc.stride[0]; + U32 batch = 1; + gs[0] = (bh + item_w - 1) / item_w; + gs[1] = 1; + gs[2] = batch; + sprintf(kernelname, "gemm_nt_nobias_%d%d%d", 
item_k, item_w, (item_c >> 1)); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs( + kernel, KA, KB, K, ow_str, 0, 0, 0, 0, 0, bh, 1, gs[0], gs[1], A, B, C)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, matrixA, "gemm_nt_a")); + CHECK_STATUS(gcl_print_memory(handle, matrixB, "gemm_nt_b")); + CHECK_STATUS(gcl_print_memory(handle, matrixC, "gemm_nt_c")); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE matmul_infer_forward_tmp_bytes_mali_fp16(TensorDesc matrixADesc, + bool transposeA, + TensorDesc matrixBDesc, + bool transposeB, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + UNUSED(matrixADesc); + UNUSED(transposeA); + UNUSED(matrixBDesc); + UNUSED(transposeB); + UNUSED(forwardRunInfo); + *bytes = 0; + return SUCCESS; +} + +EE matmul_mali_fp16(GCLHandle_t handle, + TensorDesc matrixADesc, + bool transposeA, + const GCLMem_t matrixA, + TensorDesc matrixBDesc, + bool transposeB, + const GCLMem_t matrixB, + GCLMem_t tmp, + TensorDesc matrixCDesc, + GCLMem_t matrixC, + ForwardRunInfoMali_t forwardRunInfo) +{ + CHECK_STATUS(matmul_checkpara_mali_fp16(matrixADesc, matrixBDesc, matrixCDesc)); + CHECK_STATUS(fill_output_zero(handle, matrixC, matrixCDesc)); + CHECK_STATUS(matmul_core_mali_fp16(handle, matrixADesc, transposeA, matrixA, matrixBDesc, + transposeB, matrixB, tmp, matrixCDesc, matrixC, forwardRunInfo)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/matmul_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/matmul_mali_fp16.h new file mode 100644 index 00000000..3d717463 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/matmul_mali_fp16.h @@ -0,0 +1,39 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
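// [Editor's sketch - not part of the patch] matmul_core_mali_fp16 above picks
// a kernel family from the transpose flags: gemm_tn_nobias for A^T * B,
// gemm_nt_nobias for A * B^T, and the transposed/transposed case reuses the NT
// kernel with batch 1 for MKT vectors. A scalar reference of the TN case,
// where both operands are stored K-major:
static void gemm_tn_ref(const float *A, const float *B, float *C, int M, int N, int K)
{
    // A is K x M (column m of the logical matrix lives at A[k * M + m]),
    // B is K x N, and C is the plain M x N row-major product.
    for (int m = 0; m < M; ++m) {
        for (int n = 0; n < N; ++n) {
            float acc = 0.0f;
            for (int k = 0; k < K; ++k) {
                acc += A[k * M + m] * B[k * N + n];
            }
            C[m * N + n] = acc;
        }
    }
}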
+ +#ifndef _MATMUL_MALI_FP16 +#define _MATMUL_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE matmul_infer_forward_tmp_bytes_mali_fp16(TensorDesc matrixADesc, + bool transposeA, + TensorDesc matrixBDesc, + bool transposeB, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE matmul_mali_fp16(GCLHandle_t handle, + TensorDesc matrixADesc, + bool transposeA, + const GCLMem_t matrixA, + TensorDesc matrixBDesc, + bool transposeB, + const GCLMem_t matrixB, + GCLMem_t tmp, + TensorDesc matrixCDesc, + GCLMem_t matrixC, + ForwardRunInfoMali_t forwardRunInfo); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/multihead_attention_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/multihead_attention_mali_fp16.cpp new file mode 100644 index 00000000..4a4e38c6 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/multihead_attention_mali_fp16.cpp @@ -0,0 +1,907 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
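// [Editor's sketch - not part of the patch] The set_mem_flag / get_subbuf_size
// macros below assign every intermediate of the attention block (layer-norm,
// FC, TN- and NT-GEMM outputs) to one of three scratch sub-buffers, apparently
// chosen so a producer and its consumer never share a slot, and size each slot
// as the maximum over its tenants. The sizing rule, restated as a helper:
#include <algorithm>
#include <cstddef>
#include <initializer_list>
static std::size_t subbuf_bytes(std::initializer_list<std::size_t> tenant_sizes)
{
    std::size_t bytes = 0;
    for (std::size_t t : tenant_sizes) {
        bytes = std::max(bytes, t);  // one slot must hold its largest tenant
    }
    return bytes;
}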
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/multihead_attention_mali_fp16.h" + +#define set_best_wkc(fc_bw, fc_bk, tn_bw, tn_bk, nt_bw, nt_bk, nt_bc, runInfo) \ + { \ + U32 *best_w = runInfo->best_w; \ + U32 *best_k = runInfo->best_k; \ + U32 *best_c = runInfo->best_c; \ + fc_bw[0] = best_w[0]; \ + fc_bw[1] = best_w[3]; \ + fc_bw[2] = best_w[4]; \ + fc_bw[3] = best_w[5]; \ + fc_bk[0] = best_k[0]; \ + fc_bk[1] = best_k[3]; \ + fc_bk[2] = best_k[4]; \ + fc_bk[3] = best_k[5]; \ + tn_bw = best_w[1]; \ + tn_bk = best_k[1]; \ + nt_bw = best_w[2]; \ + nt_bk = best_k[2]; \ + nt_bc = best_c[2]; \ + } + +#define set_mem_flag(ln_out_flag, fc_out_flag, tn_out_flag, nt_out_flag) \ + { \ + ln_out_flag[0] = 2; \ + fc_out_flag[0] = 0; \ + tn_out_flag = 1; \ + nt_out_flag = 0; \ + fc_out_flag[1] = 1; \ + ln_out_flag[1] = 0; \ + fc_out_flag[2] = 2; \ + } + +#define get_subbuf_size(dt, ln_out_w, ln_out_h, ln_out_flag, fc_out_w, fc_out_h, fc_out_flag, \ + tn_out_w, tn_out_h, tn_out_c, tn_out_flag, nt_out_w, nt_out_h, nt_out_c, nt_out_flag, sub_size) \ + { \ + U32 size; \ + for (U32 i = 0; i < 3; i++) \ + sub_size[i] = 0; \ + for (U32 i = 0; i < 2; i++) { \ + size = ln_out_w[i] * ln_out_h[i] * bytesOf(dt); \ + if (size > sub_size[ln_out_flag[i]]) \ + sub_size[ln_out_flag[i]] = size; \ + } \ + for (U32 i = 0; i < 3; i++) { \ + size = fc_out_w[i] * fc_out_h[i] * bytesOf(dt); \ + if (size > sub_size[fc_out_flag[i]]) \ + sub_size[fc_out_flag[i]] = size; \ + } \ + size = tn_out_w * tn_out_h * tn_out_c * bytesOf(dt); \ + if (size > sub_size[tn_out_flag]) \ + sub_size[tn_out_flag] = size; \ + size = nt_out_w * nt_out_h * nt_out_c * bytesOf(dt); \ + if (size > sub_size[nt_out_flag]) \ + sub_size[nt_out_flag] = size; \ + } + +#define get_ln0_out_wh(t, k, fc_bw, ow, oh, useEltIn) \ + { \ + ow = ALIGN(t, fc_bw[0]); \ + if (!useEltIn[0]) \ + ow = (ow > ALIGN(t, fc_bw[1])) ? ow : ALIGN(t, fc_bw[1]); \ + oh = ALIGN(k, 4); \ + } + +#define get_fc0_out_wh(t, k, fc_bw, fc_bk, tn_bw, tn_bk, nt_bc, ow, oh) \ + { \ + ow = ALIGN(t, fc_bw[0]); \ + oh = ALIGN(k, fc_bk[0]); \ + ow = (ow > ALIGN(t, tn_bw)) ? ow : ALIGN(t, tn_bw); \ + ow = (ow > ALIGN(t, tn_bk)) ? ow : ALIGN(t, tn_bk); \ + ow = (ow > ALIGN(t, nt_bc)) ? ow : ALIGN(t, nt_bc); \ + } + +#define get_tn_sf_out_whc(Aw, Bw, t, k, sliceLen, nt_bw, nt_bc, ow, oh, oc) \ + { \ + ow = Bw; \ + oh = Aw; \ + oc = k / sliceLen; \ + ow = (ow > ALIGN(t, 4)) ? ow : ALIGN(t, 4); \ + ow = (ow > ALIGN(t, nt_bc)) ? ow : ALIGN(t, nt_bc); \ + oh = (oh > ALIGN(t, nt_bw)) ? oh : ALIGN(t, nt_bw); \ + } + +#define get_nt_out_whc(Ah, Bh, t, k, sliceLen, fc_bw, ow, oh, oc) \ + { \ + ow = Bh; \ + oh = Ah; \ + oc = k / sliceLen; \ + ow = (ow > ALIGN(t, fc_bw[1])) ? ow : ALIGN(t, fc_bw[1]); \ + if (sliceLen != oh) \ + CHECK_STATUS(NOT_MATCH); \ + } + +#define get_fc1_out_wh(Bw, t, k, fc_bw, fc_bk, ow, oh) \ + { \ + ow = Bw; \ + oh = ALIGN(k, fc_bk[1]); \ + ow = (ow > ALIGN(t, fc_bw[2])) ? ow : ALIGN(t, fc_bw[2]); \ + ow = (ow > ALIGN(t, fc_bw[3])) ? ow : ALIGN(t, fc_bw[3]); \ + } + +#define get_fc2_out_wh(Bw, t, k, fc_bw, fc_bk, ow, oh) \ + { \ + ow = Bw; \ + oh = ALIGN(k, fc_bk[2]); \ + ow = (ow > ALIGN(t, fc_bw[3])) ? 
ow : ALIGN(t, fc_bw[3]); \ + } + +inline void fill_zero_nchw(GCLHandle_t handle, U32 len, U32 offset, Mem buf) +{ + char kernelName[128]; + Kernel kernel; + sprintf(kernelName, "fill_memory_zero_vec4_f16"); + U32 gs = (len + 3) / 4; + U32 ls = 0; + U32 dim = 1; + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, len, offset, gs, buf)); + gcl_set_kernelVec(handle, kernel, dim, &gs, &ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs, &ls, kernelName)); +#endif +} + +inline void layer_norm(GCLHandle_t handle, + U32 len, + U32 on, + U32 ih_str, + U32 ic_str, + U32 ih_off, + U32 iw_off, + U32 oh_str, + Mem alpbuf, + Mem betbuf, + Mem in, + Mem out, + bool USE_C1 = false) +{ + U32 gs = len; + U32 ls = 0; + U32 dim = 1; + float para = 1.0 / on; + Kernel kernel; + if (USE_C1) { + CHECK_STATUS(gcl_create_kernel(handle, "normalization_c1", &kernel)); + } else { + CHECK_STATUS(gcl_create_kernel(handle, "normalization", &kernel)); + } + CHECK_STATUS(gcl_set_kernelArgs( + kernel, len, ih_str, ic_str, ih_off, iw_off, oh_str, 0, 0, para, alpbuf, betbuf, in, out)); + gcl_set_kernelVec(handle, kernel, dim, &gs, &ls, "normalization_c1"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs, &ls, "normalization_c1")); +#endif +} + +inline void inner_product_c1(GCLHandle_t handle, + U32 M, + U32 N, + U32 K, + U32 ow_str, + U32 ow, + U32 oh, + U32 item_w, + U32 item_k, + Mem A, + Mem B, + Mem bias, + Mem C) +{ + /*output is c1*/ + U32 ow_align = ALIGN(ow, item_w); + U32 oh_align = ALIGN(oh, item_k); + U32 gs[2] = {ow_align / item_w, oh_align / item_k}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + Kernel kernel; + char kernelName[128]; + sprintf(kernelName, "gemm_tn_%d%d", item_k, item_w); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, K, ow_str, ow, oh, gs[0], gs[1], A, B, bias, C)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif +} + +inline void inner_product_with_eltwise_c4(GCLHandle_t handle, + U32 M, + U32 N, + U32 K, + U32 ow_str, + U32 ow, + U32 oh, + U32 item_w, + U32 item_k, + bool useLayerNormIn, + U32 ew_str, + Mem A, + Mem B, + Mem bias, + Mem C, + Mem elt) +{ + /*output is c4*/ + U32 ow_align = ALIGN(ow, item_w); + U32 oh_align = ALIGN(oh, item_k); + U32 gs[2] = {ow_align / item_w, oh_align / item_k}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + Kernel kernel; + char kernelName[128]; + if (useLayerNormIn) { + sprintf(kernelName, "gemm_tn_eltwise4_ncwhc4_%d%d", item_k, item_w); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, K, ow, 1, oh, ow_str, 1, ow_str, 0, 0, gs[0], + gs[1], A, B, bias, C, ew_str, 1, ew_str, 0, 0, elt)); + } else { + sprintf(kernelName, "gemm_tn_eltwise1_ncwhc4_%d%d", item_k, item_w); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, K, ow, 1, oh, ow_str, 1, ow_str, 0, 0, gs[0], + gs[1], A, B, bias, C, ew_str, 0, 0, elt)); + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif +} + +inline void inner_product_ncwhc4(GCLHandle_t handle, + U32 iw_str, + U32 ic_str, + U32 fn, + U32 ow_str, + U32 oh_off, + U32 ow_off, + U32 ow, + U32 item_w, + U32 item_k, + ActivationMode activation, + 
bool useEltwise, + Mem in, + Mem flt, + Mem bias, + Mem out, + U32 ew_str, + Mem elt) +{ + U32 ow_align = ALIGN(ow, item_w); + U32 gs[3] = {1, ow_align / item_w, fn / item_k}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + char kernelName[128]; + char modeName[128]; + if (useEltwise) { + strcpy(modeName, "eltwise4_"); + } else { + switch (activation) { + case ACTIVATION_RELU: + strcpy(modeName, "relu_"); + break; + case ACTIVATION_GELU: + strcpy(modeName, "gelu_"); + break; + case ACTIVATION_NULL: + strcpy(modeName, ""); + break; + default: + CHECK_STATUS(NOT_SUPPORTED); + } + } + item_k = item_k >> 2; + sprintf(kernelName, "conv_direct_s%d_%s%d%d%d", 1, modeName, 1, item_w, item_k); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + if (useEltwise) { + CHECK_STATUS(gcl_set_kernelArgs(kernel, 1, iw_str, ic_str, 0, 0, 1, ow_str, oh_off, ow_off, + ow, 1, gs[0], gs[1], in, flt, bias, out, 1, ew_str, 0, 0, elt)); + } else { + CHECK_STATUS(gcl_set_kernelArgs(kernel, 1, iw_str, ic_str, 0, 0, 1, ow_str, oh_off, ow_off, + ow, 1, gs[0], gs[1], in, flt, bias, out)); + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif +} + +inline void matmul_tn_c1(GCLHandle_t handle, + U32 M, + U32 N, + U32 K, + U32 ow_str, + U32 A_str, + U32 B_str, + U32 C_str, + U32 A_off, + U32 B_off, + U32 ow, + U32 oh, + U32 item_w, + U32 item_k, + U32 batch, + float alp, + float bet, + Mem A, + Mem B, + Mem C) +{ + /*output is c1*/ + U32 ow_align = ALIGN(ow, item_w); + U32 oh_align = ALIGN(oh, item_k); + U32 gs[3] = {ow_align / item_w, oh_align / item_k, batch}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + char kernelName[128]; + sprintf(kernelName, "gemm_tn_nobias_%d%d", item_k, item_w); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, K, ow_str, A_str, B_str, C_str, A_off, B_off, ow, + oh, gs[0], gs[1], alp, bet, A, B, C)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif +} + +inline void matmul_nt_c1(GCLHandle_t handle, + U32 KA, + U32 KB, + U32 K, + U32 ow_str, + U32 A_str, + U32 B_str, + U32 C_str, + U32 A_off, + U32 B_off, + U32 ow, + U32 oh, + U32 item_w, + U32 item_k, + U32 item_c, + U32 batch, + Mem A, + Mem B, + Mem C) +{ + /*output is c1*/ + U32 ow_align = ALIGN(ow, item_w); + U32 oh_align = ALIGN(oh, item_k); + U32 gs[3] = {ow_align / item_w, oh_align / item_k, batch}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + char kernelName[128]; + sprintf(kernelName, "gemm_nt_nobias_%d%d%d", item_k, item_w, (item_c >> 1)); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, KA, KB, K, ow_str, A_str, B_str, C_str, A_off, B_off, + ow, oh, gs[0], gs[1], A, B, C)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif +} + +inline void softmax_w(GCLHandle_t handle, + U32 iw, + U32 ih, + U32 ic, + U32 iw_str, + U32 ih_str, + U32 iw_off, + U32 ih_off, + U32 ow_str, + U32 oh_str, + U32 ow_off, + U32 oh_off, + Mem in, + Mem out) +{ + U32 gs[2] = {ih, ic}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + U32 iwd4 = (iw + 3) >> 2; + U32 iwe4 = ((iw & 3) == 0) ? 
4 : (iw & 3); + Kernel kernel; + char kernelName[128]; + sprintf(kernelName, "softmax_nchw_w"); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iwd4, iwe4, iw_str, ih_str, iw_off, ih_off, ow_str, + oh_str, ow_off, oh_off, gs[0], gs[1], in, out)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif +} + +inline EE multihead_attention_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + std::vector<TensorDesc> filterDesc, + std::vector<void *> filter, + std::vector<TensorDesc> biasDesc, + std::vector<void *> bias, + std::vector<void *> layerNormAlpha, + std::vector<void *> layerNormBeta, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector<bool> eltwiseWithLayerNormIn, + ActivationMode activation, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ForwardRunInfoMali_t forwardRunInfo) +{ + DataType dt; + U32 m, k, t; + get_nlp_mkt_val(inputDesc, &dt, &m, &k, &t); + U32 ih_str, ic_str, ih_off, iw_off; + get_gclmem_dim(input->desc, NULL, &ih_str, &ic_str, &iw_off, &ih_off); + + U32 oh_str, oc_str, oh_off, ow_off; + get_gclmem_dim(output->desc, NULL, &oh_str, &oc_str, &ow_off, &oh_off); + U32 fn[4]; + for (U32 i = 0; i < filterDesc.size(); i++) { + tensorSelectGet(filterDesc[i], NULL, NULL, &fn[i], NULL, NULL, NULL); + } + + U32 fc_bw[4]; + U32 fc_bk[4]; + U32 tn_bw, tn_bk; + U32 nt_bw, nt_bk, nt_bc; + set_best_wkc(fc_bw, fc_bk, tn_bw, tn_bk, nt_bw, nt_bk, nt_bc, forwardRunInfo); + + U32 ln_out_flag[2]; + U32 fc_out_flag[3]; + U32 tn_out_flag, nt_out_flag; + set_mem_flag(ln_out_flag, fc_out_flag, tn_out_flag, nt_out_flag); + + U32 ln_out_w[2]; + U32 ln_out_h[2]; + U32 fc_out_w[3]; + U32 fc_out_h[3]; + U32 tn_out_w, tn_out_h, tn_out_c; + U32 nt_out_w, nt_out_h, nt_out_c; + + get_ln0_out_wh(t, k, fc_bw, ln_out_w[0], ln_out_h[0], eltwiseWithLayerNormIn); + get_fc0_out_wh(t, fn[0], fc_bw, fc_bk, tn_bw, tn_bk, nt_bc, fc_out_w[0], fc_out_h[0]); + U32 Aw = ALIGN(t, tn_bk); + U32 Bw = ALIGN(t, tn_bw); + get_tn_sf_out_whc( + Aw, Bw, t, firstFCSliceNum[0], matmulSliceLen, nt_bw, nt_bc, tn_out_w, tn_out_h, tn_out_c); + + U32 Ah = ALIGN(matmulSliceLen, nt_bk); + U32 Bh = ALIGN(t, nt_bw); + get_nt_out_whc( + Ah, Bh, t, firstFCSliceNum[2], matmulSliceLen, fc_bw, nt_out_w, nt_out_h, nt_out_c); + + Bw = ALIGN(t, fc_bw[1]); + get_fc1_out_wh(Bw, t, fn[1], fc_bw, fc_bk, fc_out_w[1], fc_out_h[1]); + + ln_out_w[1] = fc_out_w[1]; + ln_out_h[1] = fc_out_h[1]; + + Bw = ALIGN(t, fc_bw[2]); + get_fc2_out_wh(Bw, t, fn[2], fc_bw, fc_bk, fc_out_w[2], fc_out_h[2]); + + U32 offset = 0; + U32 sub_size[3]; + get_subbuf_size(dt, ln_out_w, ln_out_h, ln_out_flag, fc_out_w, fc_out_h, fc_out_flag, tn_out_w, + tn_out_h, tn_out_c, tn_out_flag, nt_out_w, nt_out_h, nt_out_c, nt_out_flag, sub_size); + + Mem ln_out_mem[2]; + Mem fc_out_mem[3]; + Mem tn_out_mem, nt_out_mem; + Mem subBuf[3]; + CHECK_STATUS(gcl_create_sub_buffer(sub_size[0], &offset, tmpBuf, &subBuf[0])); + CHECK_STATUS(gcl_create_sub_buffer(sub_size[1], &offset, tmpBuf, &subBuf[1])); + CHECK_STATUS(gcl_create_sub_buffer(sub_size[2], &offset, tmpBuf, &subBuf[2])); + + for (U32 i = 0; i < 2; i++) { + ln_out_mem[i] = subBuf[ln_out_flag[i]]; + } + for (U32 i = 0; i < 3; i++) { + fc_out_mem[i] = subBuf[fc_out_flag[i]]; + } + tn_out_mem = subBuf[tn_out_flag]; + nt_out_mem = subBuf[nt_out_flag]; + + /* STAGE0: layerNorm + * INPUT (X, 78) C4 + * OUTPUT (X, 312) C1 --> X
align to best_w[0] + */ + Mem stage0LNIn = input->mem; + Mem stage0LNAlp = ((GCLMem_t)(layerNormAlpha[0]))->mem; + Mem stage0LNBet = ((GCLMem_t)(layerNormBeta[0]))->mem; + + layer_norm(handle, t, k, ih_str, ic_str, ih_off, iw_off, ln_out_w[0], stage0LNAlp, stage0LNBet, + stage0LNIn, ln_out_mem[0], true); + + /* STAGE1: InnerProduct + * TN GEMM + * weight(T) (932, 312) * stage0LNOut(N) (X, 312) + * GPU: + * weight W : 932 -> 312 * 3 + * weight H: 312 + * OUTPUT: + * mat_q: (X, 312) --> (Xq, 26, 12) + * mat_k: (X, 312) --> (Xk, 26, 12) + * mat_v: (X, 312) --> (Xv, 26, 12) + * Xq = Xk = Xv + + * mat_q * mat_k(TN) --->mat_qk(Xk, Xq, 12) + * mat_q --> Xq X align to best_k[1] + * mat_k --> Xk X align to best_w[1] + * mat_qk --> Xqk_w X align to best_c[0] + * mat_qk --> Xqk_h X align to best_w[2] + + * mat_v * mat_qk(NT) -->mat_vqk(Xq, 7, 12) + * mat_v --> Xv X align to best_c[0]; + * mat_v --> 26 26 align to best_k[2](require 26 % best_k[2] = 0); + + * Stage1: + * OUTPUT + * dim0: max(Xq align best_k[1], Xk align best_w[1], Xv align to best_c[0]) + * dim1: 312 + 312 + 312 + * INPUT: + * A(dim1 align to best_k[0], 312) B(X align to best_w[0]) + */ + + U32 M = ((GCLMem_t)(filter[0]))->desc.stride[0]; + U32 K = ((GCLMem_t)(filter[0]))->desc.stride[1]; + U32 N = ln_out_w[0]; + Mem stage1MatA = ((GCLMem_t)(filter[0]))->mem; + Mem stage1MatB = ln_out_mem[0]; + Mem stage1Bias = ((GCLMem_t)(bias[0]))->mem; + if (N < ALIGN(t, nt_bc)) { + U32 off = (firstFCSliceNum[0] + firstFCSliceNum[1]) * fc_out_w[0]; + U32 len = fc_out_w[0] * fc_out_h[0] - off; + fill_zero_nchw(handle, len, off, fc_out_mem[0]); + } + inner_product_c1(handle, M, N, K, fc_out_w[0], t, M, fc_bw[0], fc_bk[0], stage1MatA, stage1MatB, + stage1Bias, fc_out_mem[0]); + + /* Stage2: Matmul mat_q * mat_k + * TN GEMM + * INPUT: mat_q(Xq, 26, 12) mat_k (Xk, 26, 12); + * Xq X align to best_k[1] + * Xk X align to best_w[1] + * Use stride Xmax + * Output: mat_qk(Xqk_w, Xqk_h, 12) + * Xqk_w X align to best_c[0](Xk) + * Xqk_h X align to best_w[2](Xq) + */ + + M = fc_out_w[0]; + N = fc_out_w[0]; + K = matmulSliceLen; + Mem stage2MatA = fc_out_mem[0]; + Mem stage2MatB = fc_out_mem[0]; + Aw = ALIGN(t, tn_bk); + Bw = ALIGN(t, tn_bw); + if (tn_out_w > Aw || tn_out_h > Bw) { + U32 len = tn_out_w * tn_out_h * tn_out_c; + fill_zero_nchw(handle, len, 0, tn_out_mem); + } + U32 A_str = matmulSliceLen * M; + U32 B_str = matmulSliceLen * N; + U32 C_str = tn_out_w * tn_out_h; + U32 A_off = 0; + U32 B_off = firstFCSliceNum[0] * fc_out_w[0]; + float *mulAlp = (float *)multiplyAlpha; + float *mulBet = (float *)multiplyBeta; + matmul_tn_c1(handle, M, N, K, tn_out_w, A_str, B_str, C_str, A_off, B_off, t, t, tn_bw, tn_bk, + tn_out_c, *mulAlp, *mulBet, stage2MatA, stage2MatB, tn_out_mem); + + /* STAGE3: Softmax on w for mat_qk */ + softmax_w(handle, t, t, tn_out_c, tn_out_w, tn_out_h, 0, 0, tn_out_w, tn_out_h, 0, 0, + tn_out_mem, tn_out_mem); + + /* STAGE4: Matmul mat_v * mat_qk + * NT GEMM + * INPUT: mat_v(Xv, 26, 12) mat_qk(Xqk_w, Xqk_h, 12) + * Xv X align to best_c[0] + * 26 align to best_k[2] + * Xqk_w align to best_c[0] + * Xqk_h align to best_w[2] + * OUTPUT: mat_vqk(Xvqk, 26, 12) + * Xvqk X align to best_w[3] + * set 26 divided by best_k[2], for next step + */ + U32 KA = fc_out_w[0]; + U32 KB = tn_out_w; + Mem stage4MatA = fc_out_mem[0]; + Mem stage4MatB = tn_out_mem; + K = ALIGN(t, nt_bc); + A_str = KA * matmulSliceLen; + B_str = tn_out_w * tn_out_h; + C_str = nt_out_w * nt_out_h; + A_off = (firstFCSliceNum[0] + firstFCSliceNum[1]) * KA; + B_off = 0; + 
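+    // NT GEMM: A is read from the V slice of the fused QKV buffer (A_off skips
+    // past the Q and K rows), and B is the softmaxed QK matrix from stages 2-3.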
matmul_nt_c1(handle, KA, KB, K, nt_out_w, A_str, B_str, C_str, A_off, B_off, t, matmulSliceLen, + nt_bw, nt_bk, nt_bc, nt_out_c, stage4MatA, stage4MatB, nt_out_mem); + + /* STAGE5: Innerproduct + * TN GEMM + * weight(T) (312, 312) stage4MatC(Xvqk, 312) + * weight w 312 align to best_k[3] + * Xvqk align to best_w[3], use stride Xvqk_max_w + * Output: stage5MatC + * use ncwhc4 for layer normal + * (Xi5, 312) + * Xi5, X align to best_w[4] + */ + + M = ((GCLMem_t)filter[1])->desc.stride[0]; + K = ((GCLMem_t)filter[1])->desc.stride[1]; + N = nt_out_w; + Mem stage5MatA = ((GCLMem_t)filter[1])->mem; + Mem stage5MatB = nt_out_mem; + Mem stage5Bias = ((GCLMem_t)bias[1])->mem; + U32 ew_str = (eltwiseWithLayerNormIn[0]) ? ih_str : ln_out_w[0]; + Mem elt = (eltwiseWithLayerNormIn[0]) ? stage0LNIn : ln_out_mem[0]; + inner_product_with_eltwise_c4(handle, M, N, K, fc_out_w[1], t, fn[1], fc_bw[1], fc_bk[1], + eltwiseWithLayerNormIn[0], ew_str, stage5MatA, stage5MatB, stage5Bias, fc_out_mem[1], elt); + + /* STAGE6: LayerNorm + */ + Mem stage6LNAlp = ((GCLMem_t)(layerNormAlpha[1]))->mem; + Mem stage6LNBet = ((GCLMem_t)(layerNormBeta[1]))->mem; + layer_norm(handle, t, fn[1], fc_out_w[1], (fn[1] + 3) / 4, 0, 0, ln_out_w[1], stage6LNAlp, + stage6LNBet, fc_out_mem[1], ln_out_mem[1]); + + /* STAGE7: Innerproduct with relu + */ + Mem stage7Flt = ((GCLMem_t)filter[2])->mem; + Mem stage7In = ln_out_mem[1]; + Mem stage7Bias = ((GCLMem_t)bias[2])->mem; + inner_product_ncwhc4(handle, ln_out_w[1], (fn[1] + 3) / 4, fn[2], fc_out_w[2], 0, 0, t, fc_bw[2], + fc_bk[2], activation, false, stage7In, stage7Flt, stage7Bias, fc_out_mem[2], 0, NULL); + + /*STAGE8: Innerproduct with eltwise + */ + M = ((GCLMem_t)(filter[3]))->desc.stride[0]; + K = ((GCLMem_t)(filter[3]))->desc.stride[1]; + N = fc_out_w[2]; + Mem stage8Flt = ((GCLMem_t)filter[3])->mem; + Mem stage8In = fc_out_mem[2]; + Mem stage8Bias = ((GCLMem_t)bias[3])->mem; + ew_str = (eltwiseWithLayerNormIn[1]) ? fc_out_w[1] : ln_out_w[1]; + Mem elt2 = (eltwiseWithLayerNormIn[1]) ? 
fc_out_mem[1] : ln_out_mem[1]; + inner_product_ncwhc4(handle, fc_out_w[2], (fn[2] + 3) / 4, fn[3], oh_str, oh_off, ow_off, t, + fc_bw[3], fc_bk[3], ACTIVATION_NULL, true, stage8In, stage8Flt, stage8Bias, output->mem, + ew_str, elt2); + return SUCCESS; +} + +inline EE multihead_attention_checkpara_mali_fp16( + TensorDesc inputDesc, std::vector<TensorDesc> filterDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { + return NOT_MATCH; + } + for (U32 i = 0; i < filterDesc.size(); i++) { + if (filterDesc[i].dt != DT_F16) { + return NOT_SUPPORTED; + } + } + return SUCCESS; +} + +EE multihead_attention_transform_filter_bytes_mali_fp16(std::vector<TensorDesc> filterDesc, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + U32 fc_bk[4]; + U32 fc_bc[4]; + fc_bk[0] = forwardRunInfo->best_k[0]; + fc_bk[1] = forwardRunInfo->best_k[3]; + fc_bk[2] = forwardRunInfo->best_k[4]; + fc_bk[3] = forwardRunInfo->best_k[5]; + fc_bc[0] = forwardRunInfo->best_c[0]; + fc_bc[1] = forwardRunInfo->best_c[3]; + fc_bc[2] = forwardRunInfo->best_c[4]; + fc_bc[3] = forwardRunInfo->best_c[5]; + for (U32 i = 0; i < 2; i++) { + U32 fn, fc, fh, fw; + U32 s0, s1, s2; + U32 num; + DataType dt = filterDesc[i].dt; + tensorSelectGet(filterDesc[i], NULL, NULL, &fn, &fc, &fh, &fw); + if (fh != 1 || fw != 1) { + CHECK_STATUS(NOT_MATCH); + } + s0 = ALIGN(fn, fc_bk[i]); + s1 = ALIGN(fc, 4); + s2 = 1; + num = s0 * s1 * s2; + gclmemFilterDesc[i].stride[0] = s0; + gclmemFilterDesc[i].stride[1] = s1; + gclmemFilterDesc[i].stride[2] = s2; + gclmemFilterDesc[i].offset[0] = 0; + gclmemFilterDesc[i].offset[1] = 0; + gclmemFilterDesc[i].offset[2] = 0; + gclmemFilterDesc[i].num = num; + gclmemFilterDesc[i].memFormat = DF_NCHW; + gclmemFilterDesc[i].byteSize = num * bytesOf(dt); + gclmemFilterDesc[i].memType = GCL_MEM_BUF; + gclmemFilterDesc[i].flags = CL_MEM_READ_WRITE; + gclmemFilterDesc[i].host_ptr = NULL; + } + for (U32 i = 2; i < filterDesc.size(); i++) { + U32 fn, fc, fh, fw; + U32 s0, s1, s2; + U32 num; + DataType dt = filterDesc[i].dt; + tensorSelectGet(filterDesc[i], NULL, NULL, &fn, &fc, &fh, &fw); + if (fh != 1 || fw != 1) { + CHECK_STATUS(NOT_MATCH); + } + s0 = fc_bk[i] >> 2; + s1 = (fc + fc_bc[i] - 1) / fc_bc[i]; + s2 = (fn + fc_bk[i] - 1) / fc_bk[i]; + num = s0 * s1 * s2 * fc_bc[i] * fc_bk[i] / (fc_bk[i] >> 2); + gclmemFilterDesc[i].stride[0] = s0; + gclmemFilterDesc[i].stride[1] = s1; + gclmemFilterDesc[i].stride[2] = s2; + gclmemFilterDesc[i].offset[0] = 0; + gclmemFilterDesc[i].offset[1] = 0; + gclmemFilterDesc[i].offset[2] = 0; + gclmemFilterDesc[i].num = num; + gclmemFilterDesc[i].memFormat = DF_NCWHN4C4; + gclmemFilterDesc[i].byteSize = num * bytesOf(dt); + gclmemFilterDesc[i].memType = GCL_MEM_BUF; + gclmemFilterDesc[i].flags = CL_MEM_READ_WRITE; + gclmemFilterDesc[i].host_ptr = NULL; + } + return SUCCESS; +} + +EE multihead_attention_transform_filter_mali_fp16(GCLHandle_t handle, + std::vector<TensorDesc> filterDesc, + std::vector<void *> filter, + std::vector<TensorDesc> *fltmemDesc, + std::vector<void *> fltmem, + ForwardRunInfoMali_t forwardRunInfo) +{ + U32 fc_bk[4]; + U32 fc_bc[4]; + + fc_bk[0] = forwardRunInfo->best_k[0]; + fc_bk[1] = forwardRunInfo->best_k[3]; + fc_bk[2] = forwardRunInfo->best_k[4]; + fc_bk[3] = forwardRunInfo->best_k[5]; + fc_bc[0] = forwardRunInfo->best_c[0]; + fc_bc[1] = forwardRunInfo->best_c[3]; + fc_bc[2] = forwardRunInfo->best_c[4]; + fc_bc[3] = forwardRunInfo->best_c[5]; + char kernelname[128]; + Kernel kernel; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + U32
filterNum = filterDesc.size(); + if (filterNum != filter.size() || filterNum != fltmemDesc->size() || filterNum != fltmem.size()) { + CHECK_STATUS(NOT_MATCH); + } + for (auto p : filterDesc) { + fltmemDesc->push_back(p); + } + U32 fwh = 1; + for (U32 i = 0; i < 2; i++) { + sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d", 1, 0); + U32 fc, fn; + Mem flt_org = ((GCLMem_t)filter[i])->mem; + Mem flt_tra = ((GCLMem_t)fltmem[i])->mem; + tensorSelectGet(filterDesc[i], NULL, NULL, &fn, &fc, NULL, NULL); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn, flt_org, flt_tra)); + gs[0] = fwh; + gs[1] = ALIGN(fc, 4); + gs[2] = ALIGN(fn, fc_bk[i]); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + } + + for (U32 i = 2; i < filterDesc.size(); i++) { + sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d", fc_bc[i], fc_bk[i]); + U32 fc, fn; + Mem flt_org = ((GCLMem_t)filter[i])->mem; + Mem flt_tra = ((GCLMem_t)fltmem[i])->mem; + tensorSelectGet(filterDesc[i], NULL, NULL, &fn, &fc, NULL, NULL); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn, flt_org, flt_tra)); + gs[0] = fwh; + gs[1] = (fc + fc_bc[i] - 1) / fc_bc[i]; + gs[2] = ALIGN(fn, fc_bk[i]); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + } + return SUCCESS; +} + +EE multihead_attention_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + std::vector<TensorDesc> filterDesc, + std::vector<bool> eltwiseWithLayerNormIn, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + U32 fn[4]; + for (U32 i = 0; i < filterDesc.size(); i++) { + tensorSelectGet(filterDesc[i], NULL, NULL, &fn[i], NULL, NULL, NULL); + } + DataType dt; + U32 m, k, t; + get_nlp_mkt_val(inputDesc, &dt, &m, &k, &t); + U32 fc_bw[4]; + U32 fc_bk[4]; + U32 tn_bw, tn_bk; + U32 nt_bw, nt_bk, nt_bc; + set_best_wkc(fc_bw, fc_bk, tn_bw, tn_bk, nt_bw, nt_bk, nt_bc, forwardRunInfo); + + U32 ln_out_flag[2]; + U32 fc_out_flag[3]; + U32 tn_out_flag, nt_out_flag; + set_mem_flag(ln_out_flag, fc_out_flag, tn_out_flag, nt_out_flag); + + U32 ln_out_w[2]; + U32 ln_out_h[2]; + U32 fc_out_w[3]; + U32 fc_out_h[3]; + U32 tn_out_w, tn_out_h, tn_out_c; + U32 nt_out_w, nt_out_h, nt_out_c; + get_ln0_out_wh(t, k, fc_bw, ln_out_w[0], ln_out_h[0], eltwiseWithLayerNormIn); + get_fc0_out_wh(t, fn[0], fc_bw, fc_bk, tn_bw, tn_bk, nt_bc, fc_out_w[0], fc_out_h[0]); + U32 Aw = ALIGN(t, tn_bk); + U32 Bw = ALIGN(t, tn_bw); + get_tn_sf_out_whc( + Aw, Bw, t, firstFCSliceNum[0], matmulSliceLen, nt_bw, nt_bc, tn_out_w, tn_out_h, tn_out_c); + U32 Ah = ALIGN(matmulSliceLen, nt_bk); + U32 Bh = ALIGN(t, nt_bw); + get_nt_out_whc( + Ah, Bh, t, firstFCSliceNum[2], matmulSliceLen, fc_bw, nt_out_w, nt_out_h, nt_out_c); + Bw = ALIGN(t, fc_bw[1]); + get_fc1_out_wh(Bw, t, fn[1], fc_bw, fc_bk, fc_out_w[1], fc_out_h[1]); + ln_out_w[1] = fc_out_w[1]; + ln_out_h[1] = fc_out_h[1]; + Bw = ALIGN(t, fc_bw[2]); + get_fc2_out_wh(Bw, t, fn[2], fc_bw, fc_bk, fc_out_w[2], fc_out_h[2]); + + U32 sub_size[3]; + get_subbuf_size(dt, ln_out_w, ln_out_h, ln_out_flag, fc_out_w, fc_out_h, fc_out_flag, tn_out_w, + tn_out_h, tn_out_c, tn_out_flag, nt_out_w, nt_out_h, nt_out_c, nt_out_flag, sub_size); + *bytes = ALIGN(sub_size[0], 1024) + ALIGN(sub_size[1], 1024) + sub_size[2]; + return SUCCESS; +} + +EE multihead_attention_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + std::vector<TensorDesc> filterDesc, +
std::vector<void *> filter, + std::vector<TensorDesc> biasDesc, + std::vector<void *> bias, + std::vector<void *> layerNormAlpha, + std::vector<void *> layerNormBeta, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector<bool> eltwiseWithLayerNormIn, + ActivationMode activation, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ForwardRunInfoMali_t forwardRunInfo) +{ + CHECK_STATUS(multihead_attention_checkpara_mali_fp16(inputDesc, filterDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(multihead_attention_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, + biasDesc, bias, layerNormAlpha, layerNormBeta, multiplyAlpha, multiplyBeta, firstFCSliceNum, + matmulSliceLen, eltwiseWithLayerNormIn, activation, tmpBytes, tmpBuf, outputDesc, output, + forwardRunInfo)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/multihead_attention_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/multihead_attention_mali_fp16.h new file mode 100644 index 00000000..d53b17b3 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/multihead_attention_mali_fp16.h @@ -0,0 +1,61 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
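One contract in the header below deserves spelling out: the caller sizes the scratch buffer with the tmp-bytes query before invoking the operator. A hedged sketch (the setup names inDesc, runInfo, and so on are illustrative, not from this patch):

    U32 tmpBytes = 0;
    CHECK_STATUS(multihead_attention_infer_forward_tmp_bytes_mali_fp16(inDesc,
        filterDesc, eltwiseWithLayerNormIn, firstFCSliceNum, matmulSliceLen,
        &tmpBytes, runInfo));
    // tmpBytes = ALIGN(sub_size[0], 1024) + ALIGN(sub_size[1], 1024) + sub_size[2]:
    // the core carves three sub-buffers out of this one allocation, and the
    // 1024-byte rounding appears to keep the offsets passed to
    // gcl_create_sub_buffer aligned (inferred from the matching ALIGN calls).

set_mem_flag ping-pongs the intermediates across the three sub-buffer regions, which keeps the peak scratch size at the maximum of each region's tenants rather than the sum of all intermediates.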
+ +#ifndef _MULTIHEAD_ATTENTION_MALI_FP16 +#define _MULTIHEAD_ATTENTION_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE multihead_attention_transform_filter_bytes_mali_fp16(std::vector<TensorDesc> filterDesc, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE multihead_attention_transform_filter_mali_fp16(GCLHandle_t handle, + std::vector<TensorDesc> filterDesc, + std::vector<void *> filter, + std::vector<TensorDesc> *fltmemDesc, + std::vector<void *> fltmem, + ForwardRunInfoMali_t forwardRunInfo); + +EE multihead_attention_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + std::vector<TensorDesc> filterDesc, + std::vector<bool> eltwiseWithLayerNormIn, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE multihead_attention_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + std::vector<TensorDesc> filterDesc, + std::vector<void *> filter, + std::vector<TensorDesc> biasDesc, + std::vector<void *> bias, + std::vector<void *> layerNormAlpha, + std::vector<void *> layerNormBeta, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector<bool> eltwiseWithLayerNormIn, + ActivationMode activation, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ForwardRunInfoMali_t forwardRunInfo); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/normalization_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/normalization_mali_fp16.cpp new file mode 100644 index 00000000..d35df88e --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/normalization_mali_fp16.cpp @@ -0,0 +1,87 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
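For reference, with para = 1.0 / numOutput the kernel invoked below implements standard layer normalization along the hidden axis: per position, mean = para * sum_i x_i and var = para * sum_i (x_i - mean)^2, followed by y_i = alpha_i * (x_i - mean) / sqrt(var + eps) + beta_i. The eps term is presumed to live in the normalization .cl kernel source, which this patch does not include.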
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/normalization_mali_fp16.h" + +inline EE normalization_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE normalization_core_mali_fp16(GCLHandle_t handle, + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(outputDesc); + U32 step = inputDesc.dims[0]; + U32 numOutput = inputDesc.dims[1]; + U32 iw_str, ih_str, ic_str, iw_off, ih_off; + U32 oh_str, ow_off, oh_off; + ih_str = input->desc.stride[0]; + iw_str = input->desc.stride[1]; + ic_str = input->desc.stride[2]; + ih_off = input->desc.offset[0]; + iw_off = input->desc.offset[1]; + oh_str = output->desc.stride[0]; + oh_off = output->desc.offset[0]; + ow_off = output->desc.offset[1]; + if (iw_str != 1 || ih_off != 0 || iw_off != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + cl_mem alpbuf, betbuf, inbuf, outbuf; + alpbuf = alpha->mem; + betbuf = beta->mem; + inbuf = input->mem; + outbuf = output->mem; + + U32 gs = step; + U32 ls = 0; + U32 dim = 1; + float para = 1.0 / numOutput; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "normalization", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, step, ih_str, ic_str, ih_off, iw_off, oh_str, oh_off, + ow_off, para, alpbuf, betbuf, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, &gs, &ls, "normalization"); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, input, "normalization_input")); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs, &ls, "normalization")); + CHECK_STATUS(gcl_print_memory(handle, output, "normalization_output")); +#endif + return SUCCESS; +} + +EE normalization_mali_fp16(GCLHandle_t handle, + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output) +{ + CHECK_STATUS(normalization_checkpara_mali_fp16(inputDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS( + normalization_core_mali_fp16(handle, alpha, beta, inputDesc, input, outputDesc, output)); + return SUCCESS; +} diff --git a/tensor_computing/src/gpu/mali/fp16/normalization_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/normalization_mali_fp16.h similarity index 78% rename from tensor_computing/src/gpu/mali/fp16/normalization_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/normalization_mali_fp16.h index 9d903e72..77ccd7a0 100644 --- a/tensor_computing/src/gpu/mali/fp16/normalization_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/normalization_mali_fp16.h @@ -14,17 +14,15 @@ #ifndef _NORMALIZATION_MALI_FP16 #define _NORMALIZATION_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" +#include "types.h" #include "tensor_computing_type.h" EE normalization_mali_fp16(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output); -#endif - + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.cpp new file mode 100644 index 00000000..60ea7859 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.cpp @@ -0,0 +1,168 @@ +// Copyright (C) 2019. 
Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/padding_mali_fp16.h" + +inline EE padding_checkpara_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PadParamSpec padParamSpec, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + if (inputDesc.df != outputDesc.df || inputDesc.df != DF_NCHW) { + return NOT_SUPPORTED; + } + if (input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + if (padParamSpec.pad_mode == Pad_Reflect && + (padParamSpec.top >= inputDesc.dims[1] || padParamSpec.bottom >= inputDesc.dims[1])) { + return NOT_SUPPORTED; + } + if (padParamSpec.pad_mode == Pad_Symmetric && + (padParamSpec.left > inputDesc.dims[0] || padParamSpec.right > inputDesc.dims[0])) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE padding_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PadParamSpec padParamSpec, + TensorDesc outputDesc, + GCLMem_t output) +{ + U32 iw, ih, ic, in; + U32 ow, oh, oc, on; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + cl_mem inbuf, outbuf; + inbuf = input->mem; + outbuf = output->mem; + + U32 iw_str, ih_str, iw_off, ih_off; + ih_str = input->desc.stride[0]; + iw_str = input->desc.stride[1]; + ih_off = input->desc.offset[0]; + iw_off = input->desc.offset[1]; + + U32 ow_str, oh_str, ow_off, oh_off; + oh_str = output->desc.stride[0]; + ow_str = output->desc.stride[1]; + oh_off = output->desc.offset[0]; + ow_off = output->desc.offset[1]; + + U32 pw, ph, pr, pb; + pw = padParamSpec.left; + pr = padParamSpec.right; + ph = padParamSpec.top; + pb = padParamSpec.bottom; + + Kernel kernel; + switch (padParamSpec.pad_mode) { + case Pad_Constant: { + CHECK_STATUS(gcl_create_kernel(handle, "padding_constant", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, oh, ow, + oh_str, ow_str, oh_off, ow_off, ph, pb, pw, pr, inbuf, outbuf)); + + U32 gs[3] = {oh, ow, (oc + 3) / 4 * on}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "padding_constant"); +#ifdef _DEBUG + 
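+            // _DEBUG builds run the kernel eagerly here and dump the buffers;
+            // release builds only enqueue it via gcl_set_kernelVec above.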
CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "padding_constant")); + CHECK_STATUS(gcl_print_memory(handle, input, "padding_constant_input")); + CHECK_STATUS(gcl_print_memory(handle, output, "padding_constant_output")); +#endif + break; + } + case Pad_Reflect: { + CHECK_STATUS(gcl_create_kernel(handle, "padding_reflect", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, oh, ow, + oh_str, ow_str, oh_off, ow_off, ph, pb, pw, pr, inbuf, outbuf)); + + U32 gs[3] = {oh, ow, (oc + 3) / 4 * on}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "padding_reflect"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "padding_reflect")); + CHECK_STATUS(gcl_print_memory(handle, input, "padding_reflect_input")); + CHECK_STATUS(gcl_print_memory(handle, output, "padding_reflect_output")); +#endif + break; + } + case Pad_Edge: { + CHECK_STATUS(gcl_create_kernel(handle, "padding_edge", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, oh, ow, + oh_str, ow_str, oh_off, ow_off, ph, pb, pw, pr, inbuf, outbuf)); + + U32 gs[3] = {oh, ow, (oc + 3) / 4 * on}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "padding_edge"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "padding_edge")); + CHECK_STATUS(gcl_print_memory(handle, input, "padding_edge_input")); + CHECK_STATUS(gcl_print_memory(handle, output, "padding_edge_output")); +#endif + break; + } + case Pad_Symmetric: { + CHECK_STATUS(gcl_create_kernel(handle, "padding_symmetric", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, oh, ow, + oh_str, ow_str, oh_off, ow_off, ph, pb, pw, pr, inbuf, outbuf)); + + U32 gs[3] = {oh, ow, (oc + 3) / 4 * on}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "padding_symmetric"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "padding_symmetric")); + CHECK_STATUS(gcl_print_memory(handle, input, "padding_symmetric_input")); + CHECK_STATUS(gcl_print_memory(handle, output, "padding_symmetric_output")); +#endif + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + return SUCCESS; +} + +EE padding_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PadParamSpec padParamSpec, + TensorDesc outputDesc, + GCLMem_t output) +{ + CHECK_STATUS( + padding_checkpara_mali_fp16(handle, inputDesc, input, padParamSpec, outputDesc, output)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(padding_core_mali_fp16(handle, inputDesc, input, padParamSpec, outputDesc, output)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.h new file mode 100644 index 00000000..26b78ed6 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.h @@ -0,0 +1,27 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_PADDING_MALI_FP16 +#define _H_PADDING_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE padding_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PadParamSpec padParamSpec, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.cpp new file mode 100644 index 00000000..2bdb49ec --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.cpp @@ -0,0 +1,188 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
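A note on the mean-pooling fast path in the file below: when the output is a single element per channel (oh == 1 && ow == 1) and iw > 7, the reduction is split into two passes — pooling_global_mean_w collapses the width axis into a scratch buffer, then pooling_global_mean_h collapses the height axis into the output. That is why pooling_infer_forward_tmp_bytes_mali_fp16 reserves ih * ((ic + 3) / 4 * 4) * bytesOf(idt) bytes: one fp16 value per row for every channel, with channels rounded up to the 4-wide NCWHC4 slices. Worked example with assumed sizes ih = 7, ic = 64: 7 * 64 * 2 = 896 bytes.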
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/pooling_mali_fp16.h" + +inline EE pooling_checkpara_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + if (inputDesc.df != outputDesc.df) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[2] != outputDesc.dims[2] || inputDesc.dims[3] != outputDesc.dims[3]) { + return NOT_SUPPORTED; + } + if (poolingParamSpec.padding_top >= poolingParamSpec.kernel_h) { + return NOT_SUPPORTED; + } + if (poolingParamSpec.padding_bottom >= poolingParamSpec.kernel_w) { + return NOT_SUPPORTED; + } + if (input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE pooling_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t temp) +{ + DataFormat df; + U32 iw, ih, ic, in, it; + U32 ow, oh, oc, on, ot; + tensorSelectGet(inputDesc, NULL, &df, &in, &ic, &ih, &iw, &it); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow, &ot); + + cl_mem inbuf, outbuf, tmpbuf; + inbuf = input->mem; + outbuf = output->mem; + tmpbuf = temp->mem; + U32 iw_str, ih_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + + U32 sw, sh, st, pw, ph, pt, kw, kh, kt; + sw = poolingParamSpec.stride_w; + sh = poolingParamSpec.stride_h; + st = poolingParamSpec.stride_t; + pw = poolingParamSpec.padding_left; + ph = poolingParamSpec.padding_top; + pt = poolingParamSpec.padding_before; + kw = poolingParamSpec.kernel_w; + kh = poolingParamSpec.kernel_h; + kt = poolingParamSpec.kernel_t; + + if (df == DF_NCHW) { + st = 1; + pt = 0; + kt = 1; + } + Kernel kernel; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + char kernelname[128]; + switch (poolingParamSpec.mode) { + case POOLING_MAX: { + gs[0] = oh; + gs[1] = ow; + gs[2] = (oc + 3) / 4 * ot * on; + if (st == 1 && pt == 0 && kt == 1) { + sprintf(kernelname, "pooling_max"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_off, iw_off, ih_str, iw_str, oh, + ow, oh_off, ow_off, oh_str, ow_str, sh, sw, ph, pw, kh, kw, inbuf, outbuf)); + } else { + return NOT_SUPPORTED; + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + break; + } + case POOLING_MEAN: { + if (oh == 1 && ow == 1 && iw > 7) { + sprintf(kernelname, "pooling_global_mean_w"); + gs[0] = ih; + gs[1] = (oc + 3) / 4 * on; + dim = 2; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ih_str * iw_str, ih_off, iw_off, ih, + iw, gs[0], gs[1], inbuf, tmpbuf)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname)); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + sprintf(kernelname, "pooling_global_mean_h"); + gs[0] 
= (oc + 3) / 4 * on; + dim = 1; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs( + kernel, ih, oh_str, oh_str * ow_str, oh_off, ow_off, gs[0], tmpbuf, outbuf)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname)); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); +#endif + } else { + sprintf(kernelname, "pooling_mean"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_off, iw_off, ih_str, iw_str, oh, + ow, oh_off, ow_off, oh_str, ow_str, sh, sw, ph, pw, kh, kw, inbuf, outbuf)); + + gs[0] = oh; + gs[1] = ow; + gs[2] = (oc + 3) / 4 * on; + dim = 3; + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + } + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + return SUCCESS; +} + +EE pooling_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t temp) +{ + CHECK_STATUS( + pooling_checkpara_mali_fp16(handle, inputDesc, input, poolingParamSpec, outputDesc, output)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(pooling_core_mali_fp16( + handle, inputDesc, input, poolingParamSpec, outputDesc, output, temp)); + return SUCCESS; +} + +EE pooling_infer_forward_tmp_bytes_mali_fp16( + TensorDesc inputDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo) +{ + UNUSED(forwardRunInfo); + DataType idt; + U32 in, ic, ih, iw; + tensorSelectGet(inputDesc, &idt, NULL, &in, &ic, &ih, &iw); + *bytes = ih * ((ic + 3) / 4 * 4) * bytesOf(idt); + return SUCCESS; +} diff --git a/tensor_computing/src/gpu/mali/fp16/pooling_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.h similarity index 78% rename from tensor_computing/src/gpu/mali/fp16/pooling_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.h index bda21f3a..6ae6f310 100644 --- a/tensor_computing/src/gpu/mali/fp16/pooling_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.h @@ -11,20 +11,20 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_POOLING_MALI_FP16 #define _H_POOLING_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" +#include "types.h" #include "tensor_computing_type.h" -EE pooling_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - PoolingDesc poolingDesc, - TensorDesc outputDesc, - GCLMem_t output); +EE pooling_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t temp); +EE pooling_infer_forward_tmp_bytes_mali_fp16( + TensorDesc inputDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo); #endif - diff --git a/compute/tensor/src/gpu/mali/fp16/power_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/power_mali_fp16.cpp new file mode 100644 index 00000000..bf22eb72 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/power_mali_fp16.cpp @@ -0,0 +1,98 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "types.h" +#include "gpu/mali/fp16/power_mali_fp16.h" + +inline EE power_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + EE ret = SUCCESS; + if (inputDesc.dt != outputDesc.dt || (inputDesc.dt != DT_F16 && inputDesc.dt != DT_I32)) { + ret = NOT_SUPPORTED; + } + return ret; +} + +inline EE power_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + PowerParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(outputDesc); + U32 iw, ih, ic, in; + DataType dt; + if (inputDesc.df == DF_NCHW || inputDesc.df == DF_NORMAL) { + tensorSelectGet(inputDesc, &dt, NULL, &in, &ic, &ih, &iw); + } else if (inputDesc.df == DF_MKT) { + get_nlp_mkt_val(inputDesc, &dt, &in, &ic, &ih); + iw = 1; + } else { + return NOT_SUPPORTED; + } + U32 iw_str, ih_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + cl_mem inbuf, outbuf; + inbuf = input->mem; + outbuf = output->mem; + Kernel kernel; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + char kernelname[128]; + sprintf(kernelname, "power_f16"); + if (dt == DT_I32) { + sprintf(kernelname, "power_i32"); + } + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + U32 has_power = (p.power == (F32)1.0) ? 
0 : 1; + if (input->desc.memFormat == DF_NCHW) { + gs[0] = (iw + 3) / 4; + gs[1] = ih; + gs[2] = ic; + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, + oh_off, ow_off, iw, gs[0], gs[1], has_power, p.scale, p.shift, p.power, inbuf, outbuf)); + } + if (input->desc.memFormat == DF_NCWHC4) { + gs[0] = ih; + gs[1] = iw; + gs[2] = (ic + 3) / 4; + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str * 4, iw_off, ih_off * 4, ow_str, + oh_str * 4, ow_off, oh_off * 4, ih * 4, gs[0], gs[1], has_power, p.scale, p.shift, + p.power, inbuf, outbuf)); + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + +EE power_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + PowerParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + CHECK_STATUS(power_checkpara_mali_fp16(inputDesc, outputDesc)); + if (input->mem != output->mem) { + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + } + CHECK_STATUS(power_core_mali_fp16(handle, inputDesc, input, p, outputDesc, output)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/power_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/power_mali_fp16.h new file mode 100644 index 00000000..ca25d102 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/power_mali_fp16.h @@ -0,0 +1,25 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _POWER_MALI_FP16 +#define _POWER_MALI_FP16 +#include "types.h" +#include "tensor_computing_type.h" + +EE power_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + PowerParamSpec p, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/prelu_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/prelu_mali_fp16.cpp new file mode 100644 index 00000000..5ef92949 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/prelu_mali_fp16.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/prelu_mali_fp16.h" + +inline EE prelu_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE prelu_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(outputDesc); + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + U32 iw_str, ih_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + ih_str = input->desc.stride[0]; + iw_str = input->desc.stride[1]; + ih_off = input->desc.offset[0]; + iw_off = input->desc.offset[1]; + oh_str = output->desc.stride[0]; + ow_str = output->desc.stride[1]; + oh_off = output->desc.offset[0]; + ow_off = output->desc.offset[1]; + cl_mem inbuf, outbuf, webuf; + inbuf = input->mem; + outbuf = output->mem; + webuf = weight->mem; + + char modeName[16]; + char kernelName[128]; + if (preluDesc.propagate_down) { + strcpy(modeName, "prop"); + } else { + strcpy(modeName, "noprop"); + } + sprintf(kernelName, "prelu_%s", modeName); + + U32 gs[3] = {ih, iw, (ic + 3) / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, ih, iw, oh_str, + ow_str, oh_off, ow_off, webuf, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, input, "prelu_input")); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + CHECK_STATUS(gcl_print_memory(handle, weight, "prelu_weight")); +#endif + return SUCCESS; +} + +EE prelu_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + GCLMem_t output) +{ + CHECK_STATUS(prelu_checkpara_mali_fp16(inputDesc, outputDesc)); + if (input->mem != output->mem) { + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + } + CHECK_STATUS( + prelu_core_mali_fp16(handle, inputDesc, input, weight, preluDesc, outputDesc, output)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/prelu_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/prelu_mali_fp16.h new file mode 100644 index 
00000000..51c1143e --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/prelu_mali_fp16.h @@ -0,0 +1,28 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _PRELU_MALI_FP16 +#define _PRELU_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE prelu_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/reshape_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/reshape_mali_fp16.cpp new file mode 100644 index 00000000..463b116f --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/reshape_mali_fp16.cpp @@ -0,0 +1,310 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
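+
+// In outline, reshape_core_mali_fp16 below picks the cheapest path available for the
+// given GPU memory layouts:
+//   1. if either side is a dense, unpadded NCHW buffer, the reshape degenerates into a
+//      flat element copy (the copy_f16 kernel), or into a no-op when input and output
+//      share the same cl_mem;
+//   2. if both sides share a memFormat with compatible strides and offsets, a single
+//      mem_trans_* kernel moves the data directly;
+//   3. otherwise the input is first flattened to dense NCHW in the tmp buffer and then
+//      transformed into the output layout, which is why
+//      reshape_infer_forward_tmp_bytes_mali_fp16 sizes tmp as the maximum of the tensor
+//      and GCL buffer byte sizes.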
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/reshape_mali_fp16.h" + +inline EE reshape_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt) { + return NOT_SUPPORTED; + } + if (outputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE reshape_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t tmpbuf) +{ + DataFormat idf, odf; + U32 iw, ih, ic, in, it; + U32 ow, oh, oc, on, ot; + tensorSelectGet(inputDesc, NULL, &idf, &in, &ic, &ih, &iw, &it); + tensorSelectGet(outputDesc, NULL, &odf, &on, &oc, &oh, &ow, &ot); + U32 iw_str, ih_str, ic_str, iw_off, ih_off; + U32 ow_str, oh_str, oc_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, &oc_str, &ow_off, &oh_off); + DataFormat imf = input->desc.memFormat; + DataFormat omf = output->desc.memFormat; + cl_mem inbuf = input->mem; + cl_mem outbuf = output->mem; + cl_mem tmp = tmpbuf->mem; + bool dataCopy = false; + U32 copy_len_in = iw * ih * ic * in * it; + U32 copy_len_out = ow * oh * oc * on * ot; + + if ((iw_str == 1 && ih_str == 1 && omf == DF_NCHW && ow_off == 0 && oh_off == 0) || + (ow_str == 1 && oh_str == 1 && imf == DF_NCHW && iw_off == 0 && ih_off == 0)) { + if (inbuf == outbuf) { + return SUCCESS; + } else { + dataCopy = true; + goto DATACOPY; + } + } + + if (imf == omf) { + if (imf == DF_NCHW) { + if ((iw_off == 0 && ih_off == 0 && ow_off == 0 && oh_off == 0) || + (iw_str == ow_str && ih_str == oh_str && iw_off == ow_off && ih_off == oh_off && + iw == ow && ih == oh)) { + if (inbuf == outbuf) { + return SUCCESS; + } else { + dataCopy = true; + goto DATACOPY; + } + } + } + + if (imf == DF_NCWHC4) { + if (iw_str == ow_str && ih_str == oh_str && iw_off == ow_off && ih_off == oh_off && + iw == ow && ih == oh) { + if (it == ot) { + if (inbuf == outbuf) { + return SUCCESS; + } else { + dataCopy = true; + goto DATACOPY; + } + } else { + goto DATACOPY; + } + } + } + + if (iw == ow && ih == oh) { + if (inbuf == outbuf) { + outbuf = tmp; + dataCopy = true; + copy_len_in = copy_len_out; + } + char kernelName[128]; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + if (imf == DF_NCHW) { + sprintf(kernelName, "mem_trans_nchw_to_nchw"); + gs[0] = (ow + 3) / 4; + gs[1] = oh; + gs[2] = oc * ot * on; + } else { + if (it != ot) { + dataCopy = false; + goto DATACOPY; + } + sprintf(kernelName, "mem_trans_ncwhc4_to_ncwhc4"); + gs[0] = oh; + gs[1] = ow; + gs[2] = (oc + 3) / 4 * ot * on; + ic = ALIGN(ic, 4); + } + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, ow_str, oh_str, + ow_off, oh_off, iw, ih, ic * it * in, ow, oh, oc * ot * on, 0, 0, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif + if (dataCopy) { + inbuf = tmp; + goto DATACOPY; + } else { + return SUCCESS; + } + } + } + + if (imf != omf && it == 1 && ot == 1) { + if ((imf == DF_NCWHC4 && ih == ow && iw == 1) || (omf == DF_NCWHC4 && iw == oh && ow == 1)) { + if (inbuf == outbuf) { + outbuf = tmp; + dataCopy = true; + copy_len_in = copy_len_out; + } + char kernelName[128]; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + U32 h_val, c_val; + if (imf == DF_NCWHC4) { + 
sprintf(kernelName, "mem_trans_ncwhc4_to_nchw_ih_equal_ow"); + gs[0] = ih; + gs[1] = iw; + gs[2] = (ic + 3) / 4; + h_val = oh; + c_val = oc; + } else { + sprintf(kernelName, "mem_trans_nchw_to_ncwhc4_iw_equal_oh"); + gs[0] = oh; + gs[1] = ow; + gs[2] = (oc + 3) / 4; + h_val = ih; + c_val = ic; + } + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, ow_str, oh_str, + ow_off, oh_off, h_val, c_val, gs[0], gs[1], inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif + if (dataCopy) { + inbuf = tmp; + } else { + return SUCCESS; + } + } + } + +DATACOPY: + if (dataCopy) { + U32 gs = (copy_len_out + 3) / 4; + U32 ls = 0; + U32 dim = 1; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "copy_f16", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, copy_len_in, copy_len_out, 0, 0, gs, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, &gs, &ls, "copy_f16"); + inbuf = tmp; +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs, &ls, "copy_f16")); +#endif + return SUCCESS; + } + + bool noNeedOutTrans = false; + if (ow_str == 1 && oh_str == 1) { + noNeedOutTrans = true; + tmp = outbuf; + } + + if (imf == DF_NCHW && (iw_off > 0 || ih_off > 0)) { + U32 gs[3] = {(iw + 3) / 4, ih, ic * it}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "mem_trans_nchw_to_nchw", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, iw, ih, 0, 0, iw, + ih, ic * it, iw, ih, ic * it, 0, 0, inbuf, tmp)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_nchw"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_nchw")); +#endif + if (noNeedOutTrans) { + return SUCCESS; + } else { + inbuf = tmp; + } + } + + if (imf == DF_NCWHC4) { + U32 gs[3] = {ih, (iw + 3) / 4, (ic + 3) / 4 * it}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + char kernelName[128]; + if (idf == DF_NCTHW) { + sprintf(kernelName, "mem_trans_3d_ncwhc4_to_nchw"); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, iw, ih, 0, 0, + iw, ih, ic, it, iw, ih, ic, it, 0, 0, inbuf, tmp)); + } else { + sprintf(kernelName, "mem_trans_ncwhc4_to_nchw"); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, iw, ih, 0, 0, + iw, ih, ic, iw, ih, ic, 0, 0, inbuf, tmp)); + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif + if (noNeedOutTrans) { + return SUCCESS; + } else { + inbuf = tmp; + } + } + + if (omf == DF_NCHW) { + U32 gs[3] = {(ow + 3) / 4, oh, oc * ot}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "mem_trans_nchw_to_nchw", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ow, oh, 0, 0, ow_str, oh_str, ow_off, oh_off, ow, + oh, oc * ot, ow, oh, oc * ot, 0, 0, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_nchw"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_nchw")); +#endif + return SUCCESS; + } + + if (omf == DF_NCWHC4) { + U32 gs[3] = {(ow + 3) / 4, oh, 
(oc + 3) / 4 * on}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "mem_trans_nchw_to_ncwhc4", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ow, oh, 0, 0, ow_str, oh_str, ow_off, oh_off, ow, + oh, oc, ow, oh, oc, 0, 0, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_ncwhc4"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_ncwhc4")); +#endif + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE reshape_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + U32 *bytes) +{ + U32 maxSize = tensorNumBytes(inputDesc); + U32 tmpSize = tensorNumBytes(outputDesc); + maxSize = (maxSize > tmpSize) ? maxSize : tmpSize; + tmpSize = gclmemInputDesc->byteSize; + maxSize = (maxSize > tmpSize) ? maxSize : tmpSize; + tmpSize = gclmemOutputDesc->byteSize; + maxSize = (maxSize > tmpSize) ? maxSize : tmpSize; + *bytes = maxSize; + return SUCCESS; +} + +EE reshape_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t tmpbuf) +{ + CHECK_STATUS(reshape_checkpara_mali_fp16(inputDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(reshape_core_mali_fp16(handle, inputDesc, input, outputDesc, output, tmpbuf)); + return SUCCESS; +} diff --git a/tensor_computing/src/gpu/mali/fp16/reshape_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/reshape_mali_fp16.h similarity index 80% rename from tensor_computing/src/gpu/mali/fp16/reshape_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/reshape_mali_fp16.h index 4156e1f2..f0959222 100644 --- a/tensor_computing/src/gpu/mali/fp16/reshape_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/reshape_mali_fp16.h @@ -11,18 +11,23 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _RESHAPE_MALI_FP16 #define _RESHAPE_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" +#include "types.h" #include "tensor_computing_type.h" +EE reshape_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + U32 *bytes); + EE reshape_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output); + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t tmpbuf); #endif diff --git a/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.cpp new file mode 100644 index 00000000..ab8048f9 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.cpp @@ -0,0 +1,192 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/rnn_mali_fp16.h" + +inline EE rnn_checkpara_mali_fp16( + TensorDesc xDesc, TensorDesc filterDesc, TensorDesc biasDesc, TensorDesc hDesc) +{ + if (xDesc.dt != filterDesc.dt || xDesc.dt != biasDesc.dt || xDesc.dt != hDesc.dt || + xDesc.dt != DT_F16) { + return NOT_MATCH; + } + return SUCCESS; +} + +inline EE rnn_core_mali_fp16(GCLHandle_t handle, + TensorDesc xDesc, + const GCLMem_t currentX, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc biasDesc, + GCLMem_t bias, + GCLMem_t state, + U32 tmpBytes, + GCLMem_t tmpBuf, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + GCLMem_t currentH, + ForwardRunInfoMali_t forwardRunInfo) +{ + UNUSED(handle); + UNUSED(xDesc); + UNUSED(currentX); + UNUSED(filterDesc); + UNUSED(filter); + UNUSED(biasDesc); + UNUSED(bias); + UNUSED(state); + UNUSED(tmpBytes); + UNUSED(tmpBuf); + UNUSED(rnnParamSpec); + UNUSED(batchStrideX); + UNUSED(batchStrideH); + UNUSED(hDesc); + UNUSED(currentH); + UNUSED(forwardRunInfo); + return NOT_SUPPORTED; +} + +EE rnn_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + RNNParamSpec rnnParamSpec, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + U32 filterRow, filterCol; + tensorSelectGet(filterDesc, NULL, NULL, NULL, NULL, &filterRow, &filterCol); + U32 s0, s1, s2, num, byteSize, item_c; + U32 filterNum = (rnnParamSpec.numProjection > 0) ? 
2 : 1; + for (U32 i = 0; i < filterNum; ++i) { + item_c = forwardRunInfo->best_c[i]; + if (i == 0) { + s0 = filterRow; + s1 = (filterCol + item_c - 1) / item_c; + } else { + s0 = rnnParamSpec.numOutput; + s1 = (rnnParamSpec.numProjection + item_c - 1) / item_c; + } + s2 = 1; + num = s0 * s1 * s2 * item_c; + byteSize = num * bytesOf(DT_F16); + gclmemFilterDesc[i].stride[0] = s0; + gclmemFilterDesc[i].stride[1] = s1; + gclmemFilterDesc[i].stride[2] = s2; + gclmemFilterDesc[i].offset[0] = 0; + gclmemFilterDesc[i].offset[1] = 0; + gclmemFilterDesc[i].offset[2] = 0; + gclmemFilterDesc[i].num = num; + gclmemFilterDesc[i].byteSize = byteSize; + gclmemFilterDesc[i].memType = GCL_MEM_BUF; + gclmemFilterDesc[i].flags = CL_MEM_READ_WRITE; + gclmemFilterDesc[i].memFormat = DF_CHWNC4; + if (item_c == 8) { + gclmemFilterDesc[i].memFormat = DF_CHWNC8; + } + if (item_c == 16) { + gclmemFilterDesc[i].memFormat = DF_CHWNC16; + } + gclmemFilterDesc[i].host_ptr = NULL; + } + *bytes = 0; + return SUCCESS; +} + +EE rnn_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + RNNParamSpec rnnParamSpec, + TensorDesc *fltmemDesc, + GCLMem_t fltmem, + ForwardRunInfoMali_t forwardRunInfo) +{ + DataType fdt; + U32 filterRow, filterCol; + tensorSelectGet(filterDesc, &fdt, NULL, NULL, NULL, &filterRow, &filterCol); + U32 filterNum = (rnnParamSpec.numProjection > 0) ? 2 : 1; + U32 item_c, item_k; + + char kernelname[128]; + Kernel kernel; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + U32 fwh = 1; + for (U32 i = 0; i < filterNum; i++) { + item_c = forwardRunInfo->best_c[i]; + item_k = forwardRunInfo->best_k[i]; + sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d", item_c, item_k); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + if (i == 1) { + filterCol = rnnParamSpec.numProjection; + filterRow = rnnParamSpec.numOutput; + } + CHECK_STATUS( + gcl_set_kernelArgs(kernel, fwh, filterCol, filterRow, filter[i].mem, fltmem[i].mem)); + gs[0] = fwh; + gs[1] = (filterCol + item_c - 1) / item_c; + gs[2] = filterRow; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, filter, "fc_filter_org")); + CHECK_STATUS(gcl_print_memory(handle, fltmem, "fc_filter_tran")); +#endif + fltmemDesc[i] = tensor2df(fdt, DF_NORMAL, filterRow, filterCol); + } + return SUCCESS; +} + +EE rnn_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + RNNParamSpec rnnParamSpec, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + UNUSED(inputDesc); + UNUSED(filterDesc); + UNUSED(outputDesc); + UNUSED(rnnParamSpec); + UNUSED(bytes); + UNUSED(forwardRunInfo); + return SUCCESS; +} + +EE rnn_mali_fp16(GCLHandle_t handle, + TensorDesc xDesc, + const GCLMem_t currentX, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc biasDesc, + GCLMem_t bias, + GCLMem_t state, + U32 tmpBytes, + GCLMem_t tmpBuf, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + GCLMem_t currentH, + ForwardRunInfoMali_t forwardRunInfo) +{ + CHECK_STATUS(rnn_checkpara_mali_fp16(xDesc, filterDesc, biasDesc, hDesc)); + CHECK_STATUS(rnn_core_mali_fp16(handle, xDesc, currentX, filterDesc, filter, biasDesc, bias, + state, tmpBytes, tmpBuf, rnnParamSpec, batchStrideX, batchStrideH, hDesc, currentH, + forwardRunInfo)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.h new file 
mode 100644 index 00000000..e66ba6bc --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.h @@ -0,0 +1,58 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _RNN_MALI_FP16 +#define _RNN_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE rnn_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + RNNParamSpec rnnParamSpec, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE rnn_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + RNNParamSpec rnnParamSpec, + TensorDesc *fltmemDesc, + GCLMem_t fltmem, + ForwardRunInfoMali_t forwardRunInfo); + +EE rnn_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + RNNParamSpec rnnParamSpec, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE rnn_mali_fp16(GCLHandle_t handle, + TensorDesc xDesc, + const GCLMem_t currentX, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc biasDesc, + GCLMem_t bias, + GCLMem_t state, + U32 tmpBytes, + GCLMem_t tmpBuf, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + GCLMem_t currentH, + ForwardRunInfoMali_t forwardRunInfo); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.cpp new file mode 100644 index 00000000..3a55cd2c --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.cpp @@ -0,0 +1,233 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/rnncell_mali_fp16.h" +#define get_xDim(xDesc, xDim) \ + { \ + if (xDesc.nDims == 2 || xDesc.df == DF_MTK) \ + xDim = xDesc.dims[0]; \ + if (xDesc.df == DF_MKT) \ + xDim = xDesc.dims[1]; \ + } + +inline EE rnncell_checkpara_mali_fp16( + TensorDesc xDesc, TensorDesc filterDesc, TensorDesc biasDesc, TensorDesc hDesc) +{ + if (xDesc.dt != filterDesc.dt || xDesc.dt != biasDesc.dt || xDesc.dt != hDesc.dt || + xDesc.dt != DT_F16) { + return NOT_MATCH; + } + return SUCCESS; +} + +inline EE rnncell_core_mali_fp16(GCLHandle_t handle, + TensorDesc xDesc, + const GCLMem_t currentX, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc biasDesc, + GCLMem_t bias, + GCLMem_t state, + U32 tmpBytes, + GCLMem_t tmpBuf, + RNNParamSpec rnncellDesc, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + GCLMem_t output, + ForwardRunInfoMali_t forwardRunInfo) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(batchStrideX); + UNUSED(batchStrideH); + UNUSED(hDesc); + U32 item_c = forwardRunInfo->best_c[0]; + U32 hDim = rnncellDesc.numOutput; + U32 col = (rnncellDesc.numProjection > 0) ? rnncellDesc.numProjection : hDim; + bool project = (rnncellDesc.numProjection > 0) ? true : false; + float fbias = rnncellDesc.forgetBias; + float zonecell = rnncellDesc.zoneoutCell; + float zoneout = rnncellDesc.zoneoutOutput; + + DataType dt = xDesc.dt; + U32 xDim; + get_xDim(xDesc, xDim); + Mem xMem = currentX->mem; + Mem sMem = state->mem; + Mem xhMem; + U32 offset = 0; + U32 xhNum, xhSize; + xhNum = (xDim + hDim + item_c - 1) / item_c * item_c; + xhSize = xhNum * bytesOf(dt); + CHECK_STATUS(gcl_create_sub_buffer(xhSize, &offset, tmpBuf, &xhMem)); + + Mem interMem; + U32 interNum, interSize; + U32 filterRow, filterCol; + tensorSelectGet(filterDesc, NULL, NULL, NULL, NULL, &filterRow, &filterCol); + interNum = filterRow + 4; + interSize = interNum * bytesOf(dt); + CHECK_STATUS(gcl_create_sub_buffer(interSize, &offset, tmpBuf, &interMem)); + + Mem tmpOut; + Mem outbuf = output->mem; + if (project) { + U32 item_cp = forwardRunInfo->best_c[1]; + U32 tmpOutNum = (col + item_cp - 1) / item_cp * item_cp; + U32 tmpOutSize = tmpOutNum * bytesOf(dt); + CHECK_STATUS(gcl_create_sub_buffer(tmpOutSize, &offset, tmpBuf, &tmpOut)); + outbuf = tmpOut; + } + + U32 xh_str, xw_str, xh_off, xw_off; + get_gclmem_dim(currentX->desc, &xw_str, &xh_str, NULL, &xw_off, &xh_off); + if (xw_str != 1 || xh_str != 1 || xw_off != 0 || xh_off != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + U32 gs1 = xhNum; + U32 ls1 = 0; + U32 dim = 1; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "rnncell_build_xh", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, xDim, xDim + hDim, col, gs1, xMem, sMem, xhMem)); + gcl_set_kernelVec(handle, kernel, dim, &gs1, &ls1, "rnncell_build_xh"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs1, &ls1, "rnncell_build_xh")); + CHECK_STATUS(gcl_print_memory(handle, currentX, "currentX")); + CHECK_STATUS(gcl_print_memory(handle, state, "state")); + CHECK_STATUS(gcl_print_buffer(handle, xhMem, xhNum, "xhMem")); + handle->t_total += handle->t_execute; +#endif + + Mem fltbuf = filter[0].mem; + Mem biasMem = bias->mem; + char kernelname[128]; + U32 
ic_str = filter[0].desc.stride[1]; + sprintf(kernelname, "conv_direct_spe_fwhs1_%d", item_c); + gs1 = filterRow; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, 1, 1, ic_str, 0, 0, 1, 1, 0, 0, filterRow, gs1, 1, + xhMem, fltbuf, biasMem, interMem)); + gcl_set_kernelVec(handle, kernel, dim, &gs1, &ls1, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs1, &ls1, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, &filter[0], "filter")); + CHECK_STATUS(gcl_print_memory(handle, bias, "bias")); + CHECK_STATUS(gcl_print_buffer(handle, interMem, interNum, "interMem")); + handle->t_total += handle->t_execute; +#endif + + U8 noproject = (project) ? 0 : 1; + gs1 = (col + 3) / 4; + CHECK_STATUS(gcl_create_kernel(handle, "rnncell_update_res", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs( + kernel, col, noproject, gs1, fbias, zonecell, zoneout, sMem, interMem, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, &gs1, &ls1, "rnncell_update_res"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs1, &ls1, "rnncell_update_res")); + CHECK_STATUS(gcl_print_buffer(handle, sMem, col + hDim, "sMem")); + CHECK_STATUS(gcl_print_buffer(handle, interMem, 4 * col, "interMem")); + CHECK_STATUS(gcl_print_buffer(handle, outbuf, col, "outbuf")); + handle->t_total += handle->t_execute; +#endif + + if (project) { + item_c = forwardRunInfo->best_c[1]; + filterRow = rnncellDesc.numOutput; + ic_str = filter[1].desc.stride[1]; + Mem fltbuf = filter[1].mem; + sprintf(kernelname, "conv_direct_spe_fwhs1_nobias_%d", item_c); + gs1 = filterRow; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, 1, 1, ic_str, 0, 0, 1, 1, 0, 0, filterRow, gs1, 1, + outbuf, fltbuf, biasMem, output->mem)); + gcl_set_kernelVec(handle, kernel, dim, &gs1, &ls1, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs1, &ls1, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, &filter[1], "filter")); + CHECK_STATUS(gcl_print_memory(handle, output, "output")); + handle->t_total += handle->t_execute; +#endif + + gs1 = (hDim + 3) / 4; + CHECK_STATUS(gcl_create_kernel(handle, "rnncell_update_project_state", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, hDim, col, gs1, zoneout, output->mem, sMem)); + gcl_set_kernelVec(handle, kernel, dim, &gs1, &ls1, "rnncell_update_project_state"); +#ifdef _DEBUG + CHECK_STATUS( + gcl_run_kernel(handle, kernel, dim, &gs1, &ls1, "rnncell_update_project_state")); + CHECK_STATUS(gcl_print_buffer(handle, sMem, col + hDim, "sMem")); + handle->t_total += handle->t_execute; +#endif + } + return SUCCESS; +} + +EE rnncell_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + RNNParamSpec rnncellDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + UNUSED(outputDesc); + U32 item_c = forwardRunInfo->best_c[0]; + DataType dt = inputDesc.dt; + U32 xDim; + get_xDim(inputDesc, xDim); + U32 hDim = rnncellDesc.numOutput; + U32 xhNum = (xDim + hDim + item_c - 1) / item_c * item_c; + U32 xhSize = (xhNum * bytesOf(dt) + 1023) / 1024 * 1024; + + U32 filterRow; + tensorSelectGet(filterDesc, NULL, NULL, NULL, NULL, &filterRow, NULL); + U32 interNum = filterRow + 4; + U32 interSize = (interNum * bytesOf(dt) + 1023) / 1024 * 1024; + + U32 tmpOutSize = 0; + if (rnncellDesc.numProjection > 0) { + U32 tmpOutNum = rnncellDesc.numProjection; + tmpOutSize = (tmpOutNum * bytesOf(dt) + 1023) / 1024 
* 1024; + } + *bytes = xhSize + interSize + tmpOutSize; + return SUCCESS; +} + +EE rnncell_mali_fp16(GCLHandle_t handle, + TensorDesc xDesc, + const GCLMem_t currentX, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc biasDesc, + GCLMem_t bias, + GCLMem_t state, + U32 tmpBytes, + GCLMem_t tmpBuf, + RNNParamSpec rnncellDesc, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + GCLMem_t output, + ForwardRunInfoMali_t forwardRunInfo) +{ + CHECK_STATUS(rnncell_checkpara_mali_fp16(xDesc, filterDesc, biasDesc, hDesc)); + CHECK_STATUS(fill_output_zero(handle, output, hDesc)); + CHECK_STATUS(rnncell_core_mali_fp16(handle, xDesc, currentX, filterDesc, filter, biasDesc, bias, + state, tmpBytes, tmpBuf, rnncellDesc, batchStrideX, batchStrideH, hDesc, output, + forwardRunInfo)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.h new file mode 100644 index 00000000..7cb9fccf --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.h @@ -0,0 +1,44 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _RNNCELL_MALI_FP16 +#define _RNNCELL_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE rnncell_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + RNNParamSpec rnncellDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE rnncell_mali_fp16(GCLHandle_t handle, + TensorDesc xDesc, + const GCLMem_t currentX, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc biasDesc, + GCLMem_t bias, + GCLMem_t state, + U32 tmpBytes, + GCLMem_t tmpBuf, + RNNParamSpec rnncellDesc, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + GCLMem_t output, + ForwardRunInfoMali_t forwardRunInfo); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/scale_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/scale_mali_fp16.cpp new file mode 100644 index 00000000..a9ba9c5c --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/scale_mali_fp16.cpp @@ -0,0 +1,107 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/scale_mali_fp16.h" + +inline EE scale_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE scale_core_mali_fp16(GCLHandle_t handle, + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(outputDesc); + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + U32 iw_str, ih_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + ih_str = input->desc.stride[0]; + iw_str = input->desc.stride[1]; + ih_off = input->desc.offset[0]; + iw_off = input->desc.offset[1]; + oh_str = output->desc.stride[0]; + ow_str = output->desc.stride[1]; + oh_off = output->desc.offset[0]; + ow_off = output->desc.offset[1]; + cl_mem inbuf, outbuf, albuf, bebuf; + inbuf = input->mem; + outbuf = output->mem; + albuf = alpha->mem; + bebuf = (beta) ? 
beta->mem : albuf; + + char modeName[16]; + char kernelName[128]; + if (beta) { + strcpy(modeName, "beta"); + if (alpha->desc.stride[0] == 1 && beta->desc.stride[0] == 1 && alpha->desc.stride[1] == 1 && + beta->desc.stride[1] == 1 && alpha->desc.stride[2] == 1 && beta->desc.stride[2] == 1) { + sprintf(kernelName, "scale1_%s", modeName); + } else { + sprintf(kernelName, "scale_%s", modeName); + } + } else { + strcpy(modeName, "nobeta"); + if (alpha->desc.stride[0] == 1 && alpha->desc.stride[1] == 1 && alpha->desc.stride[2] == 1) { + sprintf(kernelName, "scale1_%s", modeName); + } else { + sprintf(kernelName, "scale_%s", modeName); + } + } + + U32 gs[3] = {ih, iw, (ic + 3) / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, + oh_off, ow_off, gs[0], gs[1], albuf, bebuf, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, input, "scale_input")); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + CHECK_STATUS(gcl_print_memory(handle, alpha, "scale_alpha")); + if (beta) { + CHECK_STATUS(gcl_print_memory(handle, beta, "scale_beta")); + } + CHECK_STATUS(gcl_print_memory(handle, output, "scale_output")); +#endif + return SUCCESS; +} + +EE scale_mali_fp16(GCLHandle_t handle, + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output) +{ + CHECK_STATUS(scale_checkpara_mali_fp16(inputDesc, outputDesc)); + if (input->mem != output->mem) { + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + } + CHECK_STATUS(scale_core_mali_fp16(handle, alpha, beta, inputDesc, input, outputDesc, output)); + return SUCCESS; +} diff --git a/tensor_computing/src/gpu/mali/fp16/scale_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/scale_mali_fp16.h similarity index 82% rename from tensor_computing/src/gpu/mali/fp16/scale_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/scale_mali_fp16.h index b5f1bb9b..f135907d 100644 --- a/tensor_computing/src/gpu/mali/fp16/scale_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/scale_mali_fp16.h @@ -14,16 +14,15 @@ #ifndef _SCALE_MALI_FP16 #define _SCALE_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" +#include "types.h" #include "error.h" #include "tensor_computing_type.h" EE scale_mali_fp16(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output); + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output); #endif diff --git a/compute/tensor/src/gpu/mali/fp16/slice_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/slice_mali_fp16.cpp new file mode 100644 index 00000000..268eb884 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/slice_mali_fp16.cpp @@ -0,0 +1,120 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "sys.h"
+#include "types.h"
+#include "error.h"
+#include "gpu/mali/fp16/slice_mali_fp16.h"
+#define MAX_SLICE_NUM 2
+
+inline EE slice_checkpara_mali_fp16(TensorDesc inputDesc, std::vector<TensorDesc> outputDesc)
+{
+    if (inputDesc.dt != DT_F16) {
+        return NOT_SUPPORTED;
+    }
+    for (auto p : outputDesc) {
+        if (p.dt != DT_F16) {
+            return NOT_SUPPORTED;
+        }
+    }
+    return SUCCESS;
+}
+
+inline EE slice_core_mali_fp16(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    GCLMem_t input,
+    SliceParamSpec p,
+    std::vector<TensorDesc> outputDesc,
+    std::vector<void *> *output)
+{
+    if (inputDesc.df == DF_MKT) {
+        U32 m, k, t;
+        U32 gw, gh, gc;
+        get_nlp_mkt_val(inputDesc, NULL, &m, &k, &t);
+        map_nlp_mkt_to_ncwhc4(m, k, t, &gw, &gh, &gc);
+        if (p.axis == 2) {
+            U32 iw_str, ih_str, iw_off, ih_off;
+            ih_str = input->desc.stride[0];
+            iw_str = input->desc.stride[1];
+            ih_off = input->desc.offset[0];
+            iw_off = input->desc.offset[1];
+            U32 ow_str[MAX_SLICE_NUM];
+            U32 oh_str[MAX_SLICE_NUM];
+            U32 ow_off[MAX_SLICE_NUM];
+            U32 oh_off[MAX_SLICE_NUM];
+            cl_mem outbuf[MAX_SLICE_NUM];
+            U32 sliceEnd[MAX_SLICE_NUM];
+            U32 sliceNum = (*output).size();
+            if (sliceNum > MAX_SLICE_NUM) {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+            U32 j = 0;
+            std::vector<void *> outputArray = *output;
+            for (U32 i = 0; i < sliceNum; ++i) {
+                oh_str[i] = ((GCLMem_t)outputArray[i])->desc.stride[0];
+                ow_str[i] = ((GCLMem_t)outputArray[i])->desc.stride[1];
+                oh_off[i] = ((GCLMem_t)outputArray[i])->desc.offset[0];
+                ow_off[i] = ((GCLMem_t)outputArray[i])->desc.offset[1];
+                outbuf[i] = ((GCLMem_t)outputArray[i])->mem;
+                get_nlp_mkt_val(outputDesc[i], NULL, NULL, NULL, &t);
+                j += t;
+                sliceEnd[i] = j;
+            }
+            char kernelName[128];
+            sprintf(kernelName, "slice_h_%d", sliceNum);
+            Kernel kernel;
+            CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel));
+            U32 gs[3] = {gh, gw, gc};
+            U32 ls[3] = {0, 0, 0};
+            U32 dim = 3;
+            switch (sliceNum) {
+                case 2:
+                    CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iw_str, ih_off, iw_off, gs[0],
+                        gs[1], input->mem, oh_str[0], ow_str[0], oh_off[0], ow_off[0], sliceEnd[0],
+                        outbuf[0], oh_str[1], ow_str[1], oh_off[1], ow_off[1], sliceEnd[1],
+                        outbuf[1]));
+                    break;
+                default:
+                    return NOT_SUPPORTED;
+            }
+            gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName);
+#ifdef _DEBUG
+            CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName));
+            CHECK_STATUS(gcl_print_memory(handle, input, "slice_input"));
+            for (U32 i = 0; i < sliceNum; ++i) {
+                CHECK_STATUS(
+                    gcl_print_memory(handle, (GCLMem_t)(outputArray[i]), "slice_output"));
+            }
+#endif
+            return SUCCESS;
+        }
+        return NOT_SUPPORTED;
+    }
+    return NOT_SUPPORTED;
+}
+
+EE slice_mali_fp16(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    GCLMem_t input,
+    SliceParamSpec p,
+    std::vector<TensorDesc> outputDesc,
+    std::vector<void *> *output)
+{
+    std::vector<void *> outputArray = *output;
+    CHECK_STATUS(slice_checkpara_mali_fp16(inputDesc, outputDesc));
+    for (U32 i = 0; i < outputArray.size(); i++) {
+        CHECK_STATUS(fill_output_zero(handle, (GCLMem_t)(outputArray[i]), outputDesc[i]));
+    }
+    CHECK_STATUS(slice_core_mali_fp16(handle, inputDesc, input, p, outputDesc, output));
+    return SUCCESS;
+}
diff --git a/tensor_computing/src/gpu/mali/fp16/slice_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/slice_mali_fp16.h
similarity index 76%
rename from tensor_computing/src/gpu/mali/fp16/slice_mali_fp16.h
rename to compute/tensor/src/gpu/mali/fp16/slice_mali_fp16.h
index e75ea731..c238a2b0 100644
--- a/tensor_computing/src/gpu/mali/fp16/slice_mali_fp16.h
+++ b/compute/tensor/src/gpu/mali/fp16/slice_mali_fp16.h
@@ -11,19 +11,16 @@
 // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-
 #ifndef _SLICE_MALI_FP16
 #define _SLICE_MALI_FP16
 #include "sys.h"
-#include "type.h"
-#include "tensor_desc.h"
-#include "error.h"
+#include "types.h"
 #include "tensor_computing_type.h"
 
-EE slice_mali_fp16(GCLHandle_t handle,
-    TensorDesc inputDesc,
-    GCLMem_t input,
-    I32 axis,
-    std::vector<TensorDesc> outputDesc,
-    std::vector<void *>* output);
+EE slice_mali_fp16(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    GCLMem_t input,
+    SliceParamSpec p,
+    std::vector<TensorDesc> outputDesc,
+    std::vector<void *> *output);
 #endif
diff --git a/compute/tensor/src/gpu/mali/fp16/softmax_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/softmax_mali_fp16.cpp
new file mode 100644
index 00000000..4c151c86
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/fp16/softmax_mali_fp16.cpp
@@ -0,0 +1,207 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
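+
+// In outline: when the tensor is already reduced to a single vector (h == w == n == 1),
+// softmax_core_mali_fp16 below runs a five-kernel pipeline: softmax_h1w1_max_part writes
+// SOFTMAX_KERNEL_ITEM_NUM partial maxima into the tmp buffer, softmax_h1w1_max_all reduces
+// them to one value, softmax_h1w1_sum_part/_sum_all do the same for the exponential sums,
+// and softmax_h1w1_output normalizes. The tmp buffer thus holds the partial results plus,
+// presumably, the two reduced scalars (hence SOFTMAX_KERNEL_TMPBUF_EXPAND). Every other
+// shape is handled by a single softmax kernel chosen from the memory format and the
+// transformed axis.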
+ +#include "sys.h" +#include "types.h" +#include "error.h" +#include "gpu/mali/fp16/softmax_mali_fp16.h" +namespace { +constexpr int SOFTMAX_KERNEL_ITEM_NUM = 16; +constexpr int SOFTMAX_KERNEL_TMPBUF_EXPAND = 2; +} // namespace + +inline EE softmax_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt) { + return NOT_SUPPORTED; + } + if (outputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE softmax_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t tmp, + int axis, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(outputDesc); + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + U32 iw_str, ih_str, ic_str, iw_off, ih_off, ihw_str; + U32 ow_str, oh_str, ow_off, oh_off, ohw_str; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + ihw_str = ih_str * iw_str; + ohw_str = oh_str * ow_str; + U32 nDims = inputDesc.nDims; + I32 axisTran = (axis + nDims) % nDims; + axisTran = nDims - 1 - axisTran; + cl_mem inbuf, outbuf; + inbuf = input->mem; + outbuf = output->mem; + Kernel kernel; + char kernelname[128]; + U32 gs[2]; + U32 ls[2] = {0, 0}; + U32 dim = 2; + if (iw_off == 0 && ih_off == 0) { + bool matchCase = false; + I32 icd4; + I32 ice4; + if (iw_str == 1 && ih_str == 1) { + icd4 = (ic + 3) >> 2; + ice4 = ((ic & 3) == 0) ? 4 : (ic & 3); + matchCase = true; + } + if (iw_str == 1 && ic_str == 1) { + icd4 = (ih + 3) >> 2; + ice4 = ((ih & 3) == 0) ? 4 : (ih & 3); + matchCase = true; + } + if (ih_str == 1 && ic_str == 1) { + icd4 = (iw + 3) >> 2; + ice4 = ((iw & 3) == 0) ? 4 : (iw & 3); + matchCase = true; + } + + if (matchCase) { + gs[0] = SOFTMAX_KERNEL_ITEM_NUM; + dim = 1; + Mem clTmpBuf = tmp->mem; + sprintf(kernelname, "softmax_h1w1_max_part"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS( + gcl_set_kernelArgs(kernel, icd4, ice4, SOFTMAX_KERNEL_ITEM_NUM, inbuf, clTmpBuf)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname)); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); +#endif + dim = 1; + gs[0] = 1; + sprintf(kernelname, "softmax_h1w1_max_all"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, SOFTMAX_KERNEL_ITEM_NUM, clTmpBuf)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname)); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); +#endif + dim = 1; + gs[0] = SOFTMAX_KERNEL_ITEM_NUM; + sprintf(kernelname, "softmax_h1w1_sum_part"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS( + gcl_set_kernelArgs(kernel, icd4, ice4, SOFTMAX_KERNEL_ITEM_NUM, inbuf, clTmpBuf)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname)); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); +#endif + dim = 1; + gs[0] = 1; + sprintf(kernelname, "softmax_h1w1_sum_all"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, SOFTMAX_KERNEL_ITEM_NUM, clTmpBuf)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname)); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); +#endif + dim = 1; + gs[0] = icd4; + sprintf(kernelname, "softmax_h1w1_output"); + 
CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel));
+            CHECK_STATUS(gcl_set_kernelArgs(
+                kernel, icd4, ice4, SOFTMAX_KERNEL_ITEM_NUM, inbuf, clTmpBuf, outbuf));
+            CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname));
+#ifdef _DEBUG
+            CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname));
+#endif
+            return SUCCESS;
+        }
+    }
+
+    if (input->desc.memFormat == DF_NCWHC4) {
+        if ((nDims == 4 && axisTran == 1) || (inputDesc.df == DF_MTK && axisTran == 0) ||
+            (inputDesc.df == DF_MKT && axisTran == 1)) {
+            gs[0] = ih;
+            gs[1] = iw;
+            I32 icd4 = (ic + 3) >> 2;
+            I32 ice4 = ((ic & 3) == 0) ? 4 : (ic & 3);
+            sprintf(kernelname, "softmax");
+            CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel));
+            CHECK_STATUS(gcl_set_kernelArgs(kernel, icd4, ice4, ih_str, ihw_str, ih_off, iw_off,
+                oh_str, ohw_str, oh_off, ow_off, gs[0], gs[1], inbuf, outbuf));
+        } else {
+            return NOT_SUPPORTED;
+        }
+    } else if (input->desc.memFormat == DF_NCHW) {
+        if (axisTran == 2) {  // on c axis
+            gs[0] = (iw + 3) / 4;
+            gs[1] = ih;
+            sprintf(kernelname, "softmax_nchw_c");
+            CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel));
+            CHECK_STATUS(gcl_set_kernelArgs(kernel, ic, iw_str, ihw_str, iw_off, ih_off, ow_str,
+                ohw_str, ow_off, oh_off, iw, gs[0], gs[1], inbuf, outbuf));
+        } else if (axisTran == 0) {  // on w axis
+            gs[0] = ih;
+            gs[1] = ic;
+            I32 iwd4 = (iw + 3) >> 2;
+            I32 iwe4 = ((iw & 3) == 0) ? 4 : (iw & 3);
+            sprintf(kernelname, "softmax_nchw_w");
+            CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel));
+            CHECK_STATUS(gcl_set_kernelArgs(kernel, iwd4, iwe4, iw_str, ih_str, iw_off, ih_off,
+                ow_str, oh_str, ow_off, oh_off, gs[0], gs[1], inbuf, outbuf));
+        } else {
+            return NOT_SUPPORTED;
+        }
+    } else {
+        return NOT_SUPPORTED;
+    }
+    gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname);
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname));
+#endif
+    return SUCCESS;
+}
+
+EE softmax_mali_fp16(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    GCLMem_t input,
+    GCLMem_t tmp,
+    int axis,
+    TensorDesc outputDesc,
+    GCLMem_t output)
+{
+    CHECK_STATUS(softmax_checkpara_mali_fp16(inputDesc, outputDesc));
+    CHECK_STATUS(fill_output_zero(handle, output, outputDesc));
+    CHECK_STATUS(softmax_core_mali_fp16(handle, inputDesc, input, tmp, axis, outputDesc, output));
+    return SUCCESS;
+}
+
+EE softmax_infer_forward_tmp_bytes_mali_fp16(
+    TensorDesc inputDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo)
+{
+    UNUSED(forwardRunInfo);
+    U32 in, ic, ih, iw;
+    tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw);
+    if (ih != 1 || iw != 1 || in != 1) {
+        *bytes = 0;
+    } else {
+        // The tmp buffer stores SOFTMAX_KERNEL_ITEM_NUM partial results plus the reduced
+        // scalars as F16 elements, so the element count must be scaled by the element size.
+        *bytes = (SOFTMAX_KERNEL_ITEM_NUM + SOFTMAX_KERNEL_TMPBUF_EXPAND) * bytesOf(DT_F16);
+    }
+
+    return SUCCESS;
+}
diff --git a/tensor_computing/src/gpu/mali/fp16/softmax_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/softmax_mali_fp16.h
similarity index 82%
rename from tensor_computing/src/gpu/mali/fp16/softmax_mali_fp16.h
rename to compute/tensor/src/gpu/mali/fp16/softmax_mali_fp16.h
index 758e83b3..5a01ff3b 100644
--- a/tensor_computing/src/gpu/mali/fp16/softmax_mali_fp16.h
+++ b/compute/tensor/src/gpu/mali/fp16/softmax_mali_fp16.h
@@ -11,21 +11,21 @@
 // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- #ifndef _H_SOFTMAX_MALI_FP16 #define _H_SOFTMAX_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" +#include "types.h" #include "error.h" #include "tensor_computing_type.h" -EE softmax_mali_fp16(GCLHandle_t handle, - TensorDesc inputdesc, - GCLMem_t input, - int axis, - TensorDesc outputDesc, - GCLMem_t output); -#endif - +EE softmax_infer_forward_tmp_bytes_mali_fp16( + TensorDesc inputDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo); +EE softmax_mali_fp16(GCLHandle_t handle, + TensorDesc inputdesc, + GCLMem_t input, + GCLMem_t tmp, + int axis, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/squeeze_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/squeeze_mali_fp16.cpp new file mode 100644 index 00000000..8757f85e --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/squeeze_mali_fp16.cpp @@ -0,0 +1,83 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
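+
+// The squeeze kernel below is a pure strided copy: one work-item per (h, w, c/4) position
+// of the c4-blocked GPU layout, so gs = {ih, iw, (ic + 3) / 4}. For an illustrative
+// 1 x 32 x 24 x 24 NCHW input that makes gs = {24, 24, 8}; only the stride and offset
+// bookkeeping differs between input and output, and the element order is untouched.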
+ +#include "sys.h" +#include "types.h" +#include "error.h" +#include "gpu/mali/fp16/squeeze_mali_fp16.h" + +inline EE squeeze_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt) { + return NOT_SUPPORTED; + } + if (outputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE squeeze_core_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + UNUSED(outputDesc); + U32 iw, ih, ic, in; + if (inputDesc.df == DF_NCHW) { + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + } else if (inputDesc.df == DF_MKT) { + get_nlp_mkt_val(inputDesc, NULL, &in, &ic, &ih); + iw = 1; + } else { + return NOT_SUPPORTED; + } + U32 iw_str, ih_str, iw_off, ih_off; + ih_str = input->desc.stride[0]; + iw_str = input->desc.stride[1]; + ih_off = input->desc.offset[0]; + iw_off = input->desc.offset[1]; + U32 ow_str, oh_str, ow_off, oh_off; + oh_str = output->desc.stride[0]; + ow_str = output->desc.stride[1]; + oh_off = output->desc.offset[0]; + ow_off = output->desc.offset[1]; + + cl_mem inbuf, outbuf; + inbuf = input->mem; + outbuf = output->mem; + + U32 gs[3] = {ih, iw, (ic + 3) / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "squeeze", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, + oh_off, ow_off, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "squeeze"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "squeeze")); + CHECK_STATUS(gcl_print_memory(handle, input, "squeeze_input")); + CHECK_STATUS(gcl_print_memory(handle, output, "squeeze_output")); +#endif + return SUCCESS; +} + +EE squeeze_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + CHECK_STATUS(squeeze_checkpara_mali_fp16(inputDesc, outputDesc)); + if (input->mem != output->mem) { + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + } + CHECK_STATUS(squeeze_core_mali_fp16(handle, inputDesc, input, outputDesc, output)); + return SUCCESS; +} diff --git a/tensor_computing/src/gpu/mali/fp16/squeeze_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/squeeze_mali_fp16.h similarity index 79% rename from tensor_computing/src/gpu/mali/fp16/squeeze_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/squeeze_mali_fp16.h index b8dcfaa6..f3e9cc94 100644 --- a/tensor_computing/src/gpu/mali/fp16/squeeze_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/squeeze_mali_fp16.h @@ -11,20 +11,13 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-
-#ifndef _ACTIVATION_MALI_FP16
-#define _ACTIVATION_MALI_FP16
+#ifndef _SQUEEZE_MALI_FP16
+#define _SQUEEZE_MALI_FP16
#include "sys.h"
-#include "type.h"
-#include "tensor_desc.h"
+#include "types.h"
#include "error.h"
#include "tensor_computing_type.h"
-
-EE squeeze_mali_fp16(GCLHandle_t handle,
-    TensorDesc inputDesc,
-    GCLMem_t input,
-    TensorDesc outputDesc,
-    GCLMem_t output);
+EE squeeze_mali_fp16(
+    GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output);
#endif
-
diff --git a/compute/tensor/src/gpu/mali/fp16/transpose_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/transpose_mali_fp16.cpp
new file mode 100644
index 00000000..27d42df8
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/fp16/transpose_mali_fp16.cpp
@@ -0,0 +1,229 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
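+
+// Transpose strategy: for 4D tensors whose permutation leaves the N and C
+// axes in place (a pure format conversion or an H/W swap), a dedicated
+// mem_trans_* kernel is picked from the input/output memory formats. All
+// other permutations take the generic route, schematically:
+//   NCWHC4 input --mem_trans_ncwhc4_to_nchw--> tmp (NCHW)
+//   tmp --transpose_nchw (or transpose_3d_nchw for DF_NCTHW)--> result
+//       (written to a sub-buffer of tmp when the output format is NCWHC4)
+//   result --mem_trans_nchw_to_ncwhc4--> output
+// transpose_infer_forward_tmp_bytes_mali_fp16 below sizes tmp for both
+// stages: the input bytes aligned up to 1024, plus the output bytes.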
+ +#include "sys.h" +#include "types.h" +#include "error.h" +#include "gpu/mali/fp16/transpose_mali_fp16.h" + +inline EE transpose_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE transpose_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t tmpbuf, + U32 *dims) +{ + DataFormat df; + U32 nDims; + U32 in, ic, ih, iw, it; + U32 on, oc, oh, ow, ot; + nDims = inputDesc.nDims; + tensorSelectGet(inputDesc, NULL, &df, &in, &ic, &ih, &iw, &it); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow, &ot); + DataFormat imf = input->desc.memFormat; + DataFormat omf = output->desc.memFormat; + U32 iw_str, ih_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + cl_mem inbuf = input->mem; + cl_mem outbuf = output->mem; + cl_mem tmp = tmpbuf->mem; + I32 dimTran[6] = {0, 1, 2, 3, 4, 5}; + for (U32 i = 0; i < nDims; i++) { + dimTran[nDims - 1 - i] = nDims - 1 - dims[i]; + } + char kernelName[128]; + Kernel kernel; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + if (dimTran[2] == 2 && dimTran[3] == 3 && nDims == 4) { + bool matchCase = false; + if (imf == DF_NCWHC4 && omf == DF_NCWHC4) { + if (dimTran[0] == 0 && dimTran[1] == 1) { + sprintf(kernelName, "mem_trans_ncwhc4_to_ncwhc4"); + gs[0] = oh; + gs[1] = ow; + gs[2] = (oc + 3) / 4; + matchCase = true; + } else if (dimTran[0] == 1 && dimTran[1] == 0) { + sprintf(kernelName, "mem_trans_ncwhc4_to_ncwhc4_output_tran"); + gs[0] = ow; + gs[1] = oh; + gs[2] = (oc + 3) / 4; + matchCase = true; + } else { + return NOT_SUPPORTED; + } + } + if (imf == DF_NCWHC4 && omf == DF_NCHW) { + if (dimTran[0] == 0 && dimTran[1] == 1) { + sprintf(kernelName, "mem_trans_ncwhc4_to_nchw"); + gs[0] = oh; + gs[1] = (ow + 3) / 4; + gs[2] = (oc + 3) / 4; + matchCase = true; + } else if (dimTran[0] == 1 && dimTran[1] == 0) { + sprintf(kernelName, "mem_trans_ncwhc4_to_nchw_output_tran"); + gs[0] = (ow + 3) / 4; + gs[1] = oh; + gs[2] = (oc + 3) / 4; + matchCase = true; + } else { + return NOT_SUPPORTED; + } + } + if (imf == DF_NCHW && omf == DF_NCWHC4) { + if (dimTran[0] == 0 && dimTran[1] == 1) { + sprintf(kernelName, "mem_trans_nchw_to_ncwhc4"); + gs[0] = (ow + 3) / 4; + gs[1] = oh; + gs[2] = (oc + 3) / 4; + matchCase = true; + } else if (dimTran[0] == 1 && dimTran[1] == 0) { + sprintf(kernelName, "mem_trans_nchw_to_ncwhc4_output_tran"); + gs[0] = (oh + 3) / 4; + gs[1] = ow; + gs[2] = (oc + 3) / 4; + matchCase = true; + } else { + return NOT_SUPPORTED; + } + } + if (matchCase) { + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, ow_str, oh_str, + ow_off, oh_off, iw, ih, ic, ow, oh, oc, 0, 0, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif + return SUCCESS; + } + } + + if (imf == DF_NCWHC4) { + gs[0] = ih; + gs[1] = (iw + 3) / 4; + gs[2] = (ic + 3) / 4 * it; + if (df == DF_NCTHW) { + sprintf(kernelName, "mem_trans_3d_ncwhc4_to_nchw"); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, iw, ih, 0, 0, 
+            iw, ih, ic, iw, ih, ic, 0, 0, inbuf, tmp));
+    } else {
+        sprintf(kernelName, "mem_trans_ncwhc4_to_nchw");
+        CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel));
+        CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, iw, ih, 0, 0,
+            iw, ih, ic, it, iw, ih, ic, it, 0, 0, inbuf, tmp));
+    }
+    gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName);
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName));
+#endif
+    inbuf = tmp;
+    }
+    U32 ow_str_val = ow_str;
+    U32 oh_str_val = oh_str;
+    U32 ow_off_val = ow_off;
+    U32 oh_off_val = oh_off;
+
+    if (omf == DF_NCWHC4) {
+        U32 offset = tensorNumBytes(inputDesc);
+        offset = ALIGN(offset, 1024);
+        U32 size = tensorNumBytes(outputDesc);
+        gcl_create_sub_buffer(size, &offset, tmpbuf, &outbuf);
+        ow_str_val = ow;
+        oh_str_val = oh;
+        ow_off_val = 0;
+        oh_off_val = 0;
+    }
+
+    gs[0] = (iw + 3) / 4;
+    gs[1] = ih;
+    gs[2] = ic * it;
+    if (df == DF_NCTHW) {
+        sprintf(kernelName, "transpose_3d_nchw");
+        CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel));
+        CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, ow_str_val,
+            oh_str_val, ow_off_val, oh_off_val, dimTran[0], dimTran[1], dimTran[2], dimTran[3], iw,
+            it, ot, gs[0], gs[1], inbuf, outbuf));
+    } else {
+        sprintf(kernelName, "transpose_nchw");
+        CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel));
+        CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, ow_str_val,
+            oh_str_val, ow_off_val, oh_off_val, dimTran[0], dimTran[1], dimTran[2], iw, gs[0],
+            gs[1], inbuf, outbuf));
+    }
+    gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName);
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName));
+#endif
+    if (omf == DF_NCWHC4) {
+        if (df == DF_NCTHW) {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+        sprintf(kernelName, "mem_trans_nchw_to_ncwhc4");
+        gs[0] = (ow + 3) / 4;
+        gs[1] = oh;
+        gs[2] = (oc + 3) / 4;
+        CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel));
+        CHECK_STATUS(gcl_set_kernelArgs(kernel, ow_str_val, oh_str_val, ow_off_val, oh_off_val,
+            ow_str, oh_str, ow_off, oh_off, ow, oh, oc, ow, oh, oc, 0, 0, outbuf, output->mem));
+        gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName);
+#ifdef _DEBUG
+        CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName));
+#endif
+    }
+    return SUCCESS;
+}
+
+EE transpose_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc,
+    TensorDesc outputDesc,
+    GCLMemDesc_t gclmemInputDesc,
+    GCLMemDesc_t gclmemOutputDesc,
+    U32 *bytes)
+{
+    UNUSED(inputDesc);
+    UNUSED(outputDesc);
+    U32 input_size = gclmemInputDesc->byteSize;
+    input_size = ALIGN(input_size, 1024);
+    U32 output_size = gclmemOutputDesc->byteSize;
+    *bytes = input_size + output_size;
+    return SUCCESS;
+}
+
+EE transpose_mali_fp16(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    const GCLMem_t input,
+    TensorDesc outputDesc,
+    GCLMem_t output,
+    GCLMem_t tmpbuf,
+    U32 *dim)
+{
+    CHECK_STATUS(transpose_checkpara_mali_fp16(inputDesc, outputDesc));
+    CHECK_STATUS(fill_output_zero(handle, output, outputDesc));
+    CHECK_STATUS(
+        transpose_core_mali_fp16(handle, inputDesc, input, outputDesc, output, tmpbuf, dim));
+    return SUCCESS;
+}
diff --git a/tensor_computing/src/gpu/mali/fp16/transpose_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/transpose_mali_fp16.h
similarity index 79%
rename from tensor_computing/src/gpu/mali/fp16/transpose_mali_fp16.h
rename to compute/tensor/src/gpu/mali/fp16/transpose_mali_fp16.h
index 5be446e1..5123b20b 100644
--- a/tensor_computing/src/gpu/mali/fp16/transpose_mali_fp16.h
+++ b/compute/tensor/src/gpu/mali/fp16/transpose_mali_fp16.h
@@ -14,15 +14,21 @@
#ifndef _TRANSPOSE_MALI_FP16
#define _TRANSPOSE_MALI_FP16
#include "sys.h"
-#include "type.h"
-#include "tensor_desc.h"
+#include "types.h"
#include "error.h"
#include "tensor_computing_type.h"
+EE transpose_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc,
+    TensorDesc outputDesc,
+    GCLMemDesc_t gclmemInputDesc,
+    GCLMemDesc_t gclmemOutputDesc,
+    U32 *bytes);
+
EE transpose_mali_fp16(GCLHandle_t handle,
-    TensorDesc inputDesc,
-    GCLMem_t input,
-    TensorDesc outputDesc,
-    GCLMem_t output,
-    U32* dim);
+    TensorDesc inputDesc,
+    GCLMem_t input,
+    TensorDesc outputDesc,
+    GCLMem_t output,
+    GCLMem_t tmpbuf,
+    U32 *dim);
#endif
diff --git a/compute/tensor/src/gpu/mali/fp16/unsqueeze_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/unsqueeze_mali_fp16.cpp
new file mode 100644
index 00000000..b2bb0f3e
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/fp16/unsqueeze_mali_fp16.cpp
@@ -0,0 +1,81 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
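+
+// Unsqueeze mirrors squeeze: inserting size-1 dimensions moves no data, so
+// this file reuses the same "squeeze" copy kernel; only the debug dump
+// labels differ.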
+ +#include "sys.h" +#include "types.h" +#include "error.h" +#include "gpu/mali/fp16/unsqueeze_mali_fp16.h" + +inline EE unsqueeze_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt) { + return NOT_SUPPORTED; + } + if (outputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE unsqueeze_core_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + UNUSED(outputDesc); + U32 iw, ih, ic, in; + if (inputDesc.df == DF_NCHW) { + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + } else if (inputDesc.df == DF_MKT) { + get_nlp_mkt_val(inputDesc, NULL, &in, &ic, &ih); + iw = 1; + } else { + return NOT_SUPPORTED; + } + U32 iw_str, ih_str, iw_off, ih_off; + ih_str = input->desc.stride[0]; + iw_str = input->desc.stride[1]; + ih_off = input->desc.offset[0]; + iw_off = input->desc.offset[1]; + U32 ow_str, oh_str, ow_off, oh_off; + oh_str = output->desc.stride[0]; + ow_str = output->desc.stride[1]; + oh_off = output->desc.offset[0]; + ow_off = output->desc.offset[1]; + + cl_mem inbuf, outbuf; + inbuf = input->mem; + outbuf = output->mem; + + U32 gs[3] = {ih, iw, (ic + 3) / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "squeeze", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, + oh_off, ow_off, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "squeeze"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "squeeze")); + CHECK_STATUS(gcl_print_memory(handle, input, "unsqueeze_input")); + CHECK_STATUS(gcl_print_memory(handle, output, "unsqueeze_output")); +#endif + return SUCCESS; +} + +EE unsqueeze_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + CHECK_STATUS(unsqueeze_checkpara_mali_fp16(inputDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(unsqueeze_core_mali_fp16(handle, inputDesc, input, outputDesc, output)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/unsqueeze_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/unsqueeze_mali_fp16.h new file mode 100644 index 00000000..cd2a2b06 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/unsqueeze_mali_fp16.h @@ -0,0 +1,23 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _UNSQUEEZE_MALI_FP16 +#define _UNSQUEEZE_MALI_FP16 +#include "sys.h" +#include "types.h" +#include "error.h" +#include "tensor_computing_type.h" + +EE unsqueeze_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/fully_connected.cpp b/compute/tensor/src/gpu/mali/fully_connected.cpp new file mode 100644 index 00000000..090daf11 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fully_connected.cpp @@ -0,0 +1,507 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/fully_connected_mali_fp16.h" +inline void fully_connected_produce_algos_paras(TensorDesc inputDesc, + TensorDesc filterDesc, + std::vector outputDescs, + std::vector *fcAlgorithms, + std::vector *algoNumIndex, + std::vector *vecW, + std::vector *vecC, + std::vector *vecK) +{ + DataType dt; + U32 iw, ih, ic, fw, fh, fn; + tensorSelectGet(filterDesc, &dt, NULL, &fn, NULL, &fh, &fw); + tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, &ih, &iw); + U32 configInfo[3][128]; + U32 configNums[2]; + ConvolutionForwardAlgorithm algo[2]; + U32 algoNum = 1; + algo[0] = CONVOLUTION_ALGORITHM_DIRECT; + if (inputDesc.df == DF_NCHW || inputDesc.df == DF_NORMAL) { + if (ih != 1 || iw != 1 || fh != 1 || fw != 1) { + U32 item_w = (64 + ih - 1) / ih; + item_w = (item_w > iw) ? 
iw : item_w; + configInfo[0][0] = item_w; + configInfo[1][0] = 4; + configInfo[2][0] = 4; + configNums[0] = 1; + } else { + U32 configNum = 0; + U32 j = 8; + for (U32 i = 0; i < 3; i++) { + configInfo[0][configNum] = 1; + configInfo[1][configNum] = 1 << (2 + i); + configInfo[2][configNum] = 0; + configNum++; + if (ic % j != 0) { + break; + } + j = j << 1; + } + configNums[0] = configNum; + } + } else if (inputDesc.df == DF_MKT) { + U32 configNum = 0; + U32 align8 = true; + U32 nj = 8; + U32 k = 4; + for (U32 i = 0; i < outputDescs.size(); i++) { + if (outputDescs[i].dims[1] % 8 != 0) { + align8 = false; + } + } + for (U32 i = 0; i < 2; i++) { + for (U32 j = 0; j < nj; j++) { + configInfo[0][configNum] = j + 1; + configInfo[1][configNum] = 4; + configInfo[2][configNum] = k; + configNum++; + } + if (!align8) { + break; + } + nj = 4; + k = 8; + } + configNums[0] = configNum; + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + + for (U32 i = 0; i < algoNum; i++) { + (*fcAlgorithms).push_back(algo[i]); + (*algoNumIndex).push_back(configNums[i]); + U32 be = (i == 0) ? 0 : configNums[i - 1]; + U32 end = configNums[i]; + for (U32 j = be; j < end; j++) { + if (vecW) { + (*vecW).push_back(configInfo[0][j]); + } + if (vecC) { + (*vecC).push_back(configInfo[1][j]); + } + if (vecK) { + (*vecK).push_back(configInfo[2][j]); + } + } + } +} +inline EE fully_connected_checkpara_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + std::vector *filter, + std::vector *bias, + TensorDesc outputDesc, + std::vector *output) +{ + if (nullptr == handle || nullptr == input || nullptr == filter || nullptr == output || + nullptr == bias) { + return NULL_POINTER; + } + if (filter->size() != output->size() || filter->size() != bias->size() || bias->size() == 0) { + return NOT_MATCH; + } + for (U32 i = 0; i < filter->size(); ++i) { + if (nullptr == (*filter)[i] || nullptr == (*output)[i] || nullptr == (*bias)[i]) { + return NULL_POINTER; + } + } + if (inputDesc.df == DF_NCHW || inputDesc.df == DF_NORMAL) { + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 oc; + CHECK_STATUS(tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensorSelectGet(outputDesc, NULL, NULL, NULL, &oc, NULL, NULL)); + if (filterDesc.df != DF_NCHW) { + return NOT_SUPPORTED; + } + if (input->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + if ((*filter)[0]->desc.memFormat != DF_NCWHN4C4) { + return NOT_SUPPORTED; + } + if ((*output)[0]->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + if (in > 1) { + return NOT_SUPPORTED; + } + if (filter->size() > 1) { + return NOT_SUPPORTED; + } + if (fw != iw) { + return NOT_MATCH; + } + if (fh != ih) { + return NOT_MATCH; + } + if (fc != ic) { + return NOT_MATCH; + } + if (fn != oc) { + return NOT_MATCH; + } + } + if (inputDesc.df == DF_MKT) { + U32 k; + U32 fw, fh, fc, fn; + k = inputDesc.dims[1]; + CHECK_STATUS(tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw)); + if (fh != 1 || fw != 1) { + return NOT_MATCH; + } + if (k != fc) { + return NOT_MATCH; + } + } + return SUCCESS; +} +EE fully_connected_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + U32 fn; + tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, NULL, NULL); + if (inputDesc.df == DF_NCHW || inputDesc.df == DF_NORMAL) { + DataType idt; + DataFormat idf; + U32 iw, 
ih, ic, in; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + if (outputDesc) { + *outputDesc = tensor4df(idt, idf, in, fn, 1, 1); + } + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, 1, 1, fn, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + return SUCCESS; + } else if (inputDesc.df == DF_MKT) { + bool need_pad = false; + DataType dt; + U32 m, k, t; + get_nlp_mkt_val(inputDesc, &dt, &m, &k, &t); + if (outputDesc) { + *outputDesc = inputDesc; + (*outputDesc).dims[1] = fn; + } + std::vector fcAlgorithms; + std::vector algoNumIndex; + std::vector vecW; + std::vector outputDescs; + outputDescs.push_back(*outputDesc); + fully_connected_produce_algos_paras( + inputDesc, filterDesc, outputDescs, &fcAlgorithms, &algoNumIndex, &vecW, NULL, NULL); + U32 igw, igh, igc; + U32 ogw, ogh, ogc; + U32 t_align = t; + for (U32 i = 0; i < algoNumIndex[0]; i++) { + U32 j = ALIGN(t, vecW[i]); + t_align = (t_align < j) ? j : t_align; + } + if (t_align != t) { + need_pad = true; + } + map_nlp_mkt_to_ncwhc4(m, k, t_align, &igw, &igh, &igc); + map_nlp_mkt_to_ncwhc4(m, fn, t, &ogw, &ogh, &ogc); + igc = igc * 4; + ogc = ogc * 4; + CHECK_STATUS(infer_gclmem_desc_ncwhc4(igw, igh, igc, 0, 0, ogw, ogh, ogc, dt, dt, + gclmemInputDesc, gclmemOutputDesc, need_pad)); + return SUCCESS; + } + CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; +} + +EE fully_connected_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc inputDesc, + TensorDesc filterDesc, + std::vector outputDescs, + ForwardRunInfoMali_t forwardRunInfo) +{ + if (forwardRunInfo == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + if (algorithm != CONVOLUTION_ALGORITHM_NULL) { + return SUCCESS; + } + DataType dt; + U32 fn; + tensorSelectGet(filterDesc, &dt, NULL, &fn, NULL, NULL, NULL); + std::vector fcAlgorithms; + std::vector algoNumIndex; + std::vector vecW; + std::vector vecC; + std::vector vecK; + fully_connected_produce_algos_paras( + inputDesc, filterDesc, outputDescs, &fcAlgorithms, &algoNumIndex, &vecW, &vecC, &vecK); + if (vecW.size() == 1) { + forwardRunInfo->best_w[0] = vecW[0]; + forwardRunInfo->best_k[0] = vecK[0]; + forwardRunInfo->best_c[0] = vecC[0]; + forwardRunInfo->algorithm = fcAlgorithms[0]; + return SUCCESS; + } + + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_enable_queue_profiling(handle)); + U32 sliceNum = outputDescs.size(); + GCLMem_t input = gcl_create_gclmem(); + GCLMem_t tmpbuf = gcl_create_gclmem(); + std::vector filter; + std::vector bias; + std::vector output; + for (U32 i = 0; i < sliceNum; ++i) { + GCLMem_t filterTmp = gcl_create_gclmem(); + GCLMem_t biasTmp = gcl_create_gclmem(); + GCLMem_t outTmp = gcl_create_gclmem(); + filter.push_back(filterTmp); + bias.push_back(biasTmp); + output.push_back(outTmp); + } + + std::vector runInfos; + U32 stride[3] = {0, 0, 0}; + U32 offset[3] = {0, 0, 0}; + GCLMemDesc inputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + GCLMemDesc outputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + CHECK_STATUS(fully_connected_infer_output_size_mali( + inputDesc, filterDesc, NULL, &inputMemDesc, &outputMemDesc)); + std::vector filterMemDescs; + U32 maxBytes = 0; + U32 maxFilterSize = 0; + for (U32 i = 0; i < algoNumIndex.size(); i++) { + U32 bytes = 0; + ForwardRunInfoMali runInfo; + runInfo.algorithm = fcAlgorithms[i]; + U32 be = (i == 0) ? 
0 : algoNumIndex[i - 1]; + U32 end = algoNumIndex[i]; + for (U32 j = be; j < end; j++) { + GCLMemDesc filterMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + runInfo.best_w[0] = vecW[j]; + runInfo.best_c[0] = vecC[j]; + runInfo.best_k[0] = vecK[j]; + if (fully_connected_transform_filter_bytes_mali( + filterDesc, &filterMemDesc, &bytes, &runInfo) != SUCCESS) { + continue; + } + maxBytes = (maxBytes < bytes) ? bytes : maxBytes; + if (fully_connected_infer_forward_tmp_bytes_mali( + inputDesc, filterDesc, &bytes, &runInfo) != SUCCESS) { + continue; + } + maxBytes = (maxBytes < bytes) ? bytes : maxBytes; + maxFilterSize = (maxFilterSize < filterMemDesc.byteSize) ? filterMemDesc.byteSize + : maxFilterSize; + filterMemDescs.push_back(filterMemDesc); + runInfos.push_back(runInfo); + } + } + + MemFlags flags = CL_MEM_READ_WRITE; + if (inputDesc.df == DF_MKT) { + U32 stride[3] = {(fn + 3) / 4, 1, 1}; + U32 offset[3] = {0, 0, 0}; + CHECK_STATUS(gclmem_set_desc_padding( + &bias[0]->desc, stride, offset, dt, DF_NHWC, GCL_MEM_IMG_1D, flags)); + } else { + U32 stride[3] = {fn, 1, 1}; + U32 offset[3] = {0, 0, 0}; + CHECK_STATUS(gclmem_set_desc_padding( + &bias[0]->desc, stride, offset, dt, DF_NHWC, GCL_MEM_BUF, flags)); + } + + U32 algosNum = runInfos.size(); + if (algosNum == 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + TensorDesc biasDesc = tensor1d(dt, fn); + filterMemDescs[0].byteSize = maxFilterSize; + input->desc = inputMemDesc; + output[0]->desc = outputMemDesc; + filter[0]->desc = filterMemDescs[0]; + tmpbuf->desc.byteSize = maxBytes; + gcl_create_memory(handle, input); + for (U32 i = 0; i < sliceNum; ++i) { + filter[i]->desc = filter[0]->desc; + bias[i]->desc = bias[0]->desc; + output[i]->desc = output[0]->desc; + gcl_create_memory(handle, filter[i]); + gcl_create_memory(handle, bias[i]); + gcl_create_memory(handle, output[i]); + } + if (maxBytes) { + gcl_create_memory(handle, tmpbuf); + } + + U32 runKernelBe = 0; + U32 runKernelEnd = 0; + double minTime = DBL_MAX; + ForwardRunInfoMali bestRunInfo; + for (U32 i = 0; i < algosNum; i++) { + filter[0]->desc = filterMemDescs[i]; + if (sliceNum > 1) { + U32 item_k = runInfos[i].best_k[0]; + for (U32 j = 0; j < sliceNum; j++) { + U32 fn = outputDescs[j].dims[1]; + output[j]->desc.stride[2] = (fn + 3) / 4; + filter[j]->desc.stride[2] = (fn + item_k - 1) / item_k; + bias[j]->desc.stride[0] = (inputDesc.df == DF_MKT) ? 
(fn + 3) / 4 : fn; + } + } + if (fully_connected_mali(handle, inputDesc, input, filterDesc, &filter, biasDesc, &bias, + maxBytes, tmpbuf, outputDescs[0], &output, &runInfos[i]) == SUCCESS) { + runKernelEnd = handle->kernelVec->size(); + gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd); + runKernelBe = runKernelEnd; + if (minTime > handle->t_execute) { + minTime = handle->t_execute; + bestRunInfo = runInfos[i]; + } + } + } + if (minTime == DBL_MAX) { + CHECK_STATUS(NOT_SUPPORTED); + } + *forwardRunInfo = bestRunInfo; + CHECK_STATUS(gcl_finish(handle)); + gcl_destroy_gclmem(input); + gcl_destroy_gclmem(tmpbuf); + for (auto p : filter) { + gcl_destroy_gclmem(p); + } + for (auto p : output) { + gcl_destroy_gclmem(p); + } + for (auto p : bias) { + gcl_destroy_gclmem(p); + } + runInfos.clear(); + filterMemDescs.clear(); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_off_queue_profiling(handle)); + return SUCCESS; +} +EE fully_connected_transform_filter_bytes_mali(TensorDesc filterDesc, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = fully_connected_transform_filter_bytes_mali_fp16( + filterDesc, gclmemFilterDesc, bytes, forwardRunInfo); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE fully_connected_transform_filter_mali(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc *fltmemDesc, + std::vector fltmem, + ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = fully_connected_transform_filter_mali_fp16( + handle, filterDesc, filter, fltmemDesc, fltmem, forwardRunInfo); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE fully_connected_infer_forward_tmp_bytes_mali( + TensorDesc inputDesc, TensorDesc filterDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = fully_connected_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, bytes, forwardRunInfo); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE fully_connected_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + std::vector *filter, + TensorDesc biasDesc, + std::vector *bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + std::vector *output, + ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + ret = fully_connected_checkpara_mali( + handle, inputDesc, input, filterDesc, filter, bias, outputDesc, output); + switch (inputDesc.dt) { + case DT_F16: { + ret = fully_connected_mali_fp16(handle, inputDesc, input, filterDesc, *filter, biasDesc, + *bias, tmpBytes, tmpBuf, outputDesc, *output, forwardRunInfo); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/matmul.cpp b/compute/tensor/src/gpu/mali/matmul.cpp new file mode 100644 index 00000000..752fcb54 --- /dev/null +++ b/compute/tensor/src/gpu/mali/matmul.cpp @@ -0,0 +1,476 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/matmul_mali_fp16.h" +inline void matmul_produce_algos_paras(bool transposeA, + TensorDesc matrixADesc, + bool transposeB, + TensorDesc matrixBDesc, + std::vector *matmulAlgorithms, + std::vector *vecW, + std::vector *vecC, + std::vector *vecK) +{ + U32 configInfo[3][192]; + U32 configNum = 0; + if (matmulAlgorithms) { + (*matmulAlgorithms).push_back(CONVOLUTION_ALGORITHM_GEMM); + } + if (transposeA && !transposeB) { //TN + for (U32 i = 1; i <= 8; ++i) { + for (U32 j = 1; j <= 8; ++j) { + if (i * j <= 2) { + continue; + } + configInfo[0][configNum] = j; // w + configInfo[1][configNum] = 1; // c + configInfo[2][configNum] = i; // k + configNum++; + } + } + } else if (!transposeA && transposeB) { + for (U32 ii = 1; ii <= 2; ++ii) { + for (U32 i = 1; i <= 8; ++i) { + for (U32 j = 1; j <= 8; ++j) { + if (i * j <= 2) { + continue; + } + if (i == 5 && j > 6 && ii == 2) { + continue; + } + if (i == 6) { + if ((j > 7 && ii == 1) || (j > 5 && ii == 2)) { + continue; + } + } + if (i == 7) { + if ((j > 6 && ii == 1) || (j > 4 && ii == 2)) { + continue; + } + } + if (i == 8) { + if ((j > 5 && ii == 1) || (j > 4 && ii == 2)) { + continue; + } + } + configInfo[0][configNum] = j; // w + configInfo[1][configNum] = 2 * ii; // c + configInfo[2][configNum] = i; // k + configNum++; + } + } + } + } else if (transposeA && transposeB) { + for (U32 ii = 1; ii <= 2; ++ii) { + for (U32 i = 1; i <= 8; ++i) { + if (i <= 2) { + continue; + } + configInfo[0][configNum] = i; // w + configInfo[1][configNum] = 2 * ii; // c + configInfo[2][configNum] = 1; // k + configNum++; + } + } + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + + for (U32 i = 0; i < configNum; i++) { + if (vecW) { + (*vecW).push_back(configInfo[0][i]); + } + if (vecC) { + (*vecC).push_back(configInfo[1][i]); + } + if (vecK) { + (*vecK).push_back(configInfo[2][i]); + } + } +} + +inline EE matmul_checkpara_mali(GCLHandle_t handle, + TensorDesc matrixADesc, + bool transposeA, + const GCLMem_t matrixA, + TensorDesc matrixBDesc, + bool transposeB, + const GCLMem_t matrixB, + TensorDesc matrixCDesc, + GCLMem_t matrixC) +{ + if (nullptr == handle || nullptr == matrixA || nullptr == matrixB || nullptr == matrixC) { + return NULL_POINTER; + } + if ((transposeA && !transposeB) || (!transposeA && transposeB)) { + if (matrixADesc.df != matrixBDesc.df || matrixADesc.df != 
matrixCDesc.df || + matrixADesc.df != DF_NCHW) { + return NOT_SUPPORTED; + } + if (matrixA->desc.memFormat != DF_NCHW || matrixB->desc.memFormat != DF_NCHW || + matrixC->desc.memFormat != DF_NCHW) { + return NOT_SUPPORTED; + } + } + if (!transposeA && !transposeB) { + return NOT_SUPPORTED; + } + if (matrixA->desc.stride[2] != matrixB->desc.stride[2]) { + return NOT_MATCH; + } + if (matrixA->desc.offset[0] != 0 || matrixA->desc.offset[1] != 0) { + return NOT_SUPPORTED; + } + if (matrixB->desc.offset[0] != 0 || matrixB->desc.offset[1] != 0) { + return NOT_SUPPORTED; + } + if (matrixC->desc.offset[0] != 0 || matrixC->desc.offset[1] != 0) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE matmul_infer_output_size_mali(TensorDesc matrixADesc, + bool transposeA, + TensorDesc matrixBDesc, + bool transposeB, + TensorDesc *matrixCDesc, + GCLMemDesc_t gclmemMatrixADesc, + GCLMemDesc_t gclmemMatrixBDesc, + GCLMemDesc_t gclmemMatrixCDesc) +{ + U32 adims = matrixADesc.nDims; + U32 bdims = matrixBDesc.nDims; + DataType adt = matrixADesc.dt; + DataType bdt = matrixBDesc.dt; + if (adims < 2 || bdims < 2) { + CHECK_STATUS(NOT_MATCH); + } + if (adt != bdt) { + CHECK_STATUS(NOT_MATCH); + } + U32 ac = (adims > 2) ? matrixADesc.dims[2] : 1; + U32 ah = matrixADesc.dims[1]; + U32 aw = matrixADesc.dims[0]; + U32 bc = (bdims > 2) ? matrixBDesc.dims[2] : 1; + U32 bh = matrixBDesc.dims[1]; + U32 bw = matrixBDesc.dims[0]; + bool need_pad_a = false; + bool need_pad_b = false; + if (ac != bc) { + CHECK_STATUS(NOT_SUPPORTED); + } + std::vector vecW; + std::vector vecC; + std::vector vecK; + matmul_produce_algos_paras( + transposeA, matrixADesc, transposeB, matrixBDesc, NULL, &vecW, NULL, &vecK); + + if (transposeA && !transposeB) { + /*TN*/ + if (ah != bh) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (matrixCDesc) { + *matrixCDesc = matrixADesc; + (*matrixCDesc).dims[0] = bw; + (*matrixCDesc).dims[1] = aw; + } + U32 aw_align = aw; + U32 bw_align = bw; + for (auto item_k : vecK) { + U32 i = ALIGN(aw, item_k); + aw_align = (aw_align < i) ? i : aw_align; + } + for (auto item_w : vecW) { + U32 i = ALIGN(bw, item_w); + bw_align = (bw_align < i) ? i : bw_align; + } + if (aw_align != aw) { + need_pad_a = true; + } + if (bw_align != bw) { + need_pad_b = true; + } + CHECK_STATUS(infer_gclmem_desc_nchw(aw_align, ah, ac, 0, 0, bw_align, aw_align, ac, adt, + adt, gclmemMatrixADesc, gclmemMatrixCDesc, need_pad_a)); + CHECK_STATUS(infer_gclmem_desc_nchw( + bw_align, bh, bc, 0, 0, 0, 0, 0, adt, adt, gclmemMatrixBDesc, NULL, need_pad_b)); + return SUCCESS; + } + if (!transposeA && transposeB) { + /*NT*/ + if (aw != bw) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (matrixCDesc) { + *matrixCDesc = matrixADesc; + (*matrixCDesc).dims[0] = bh; + (*matrixCDesc).dims[1] = ah; + } + U32 ah_align = ah; + U32 bh_align = bh; + U32 aw_align = aw; + for (auto item_k : vecK) { + U32 i = ALIGN(ah, item_k); + ah_align = (ah_align < i) ? i : ah_align; + } + for (auto item_w : vecW) { + U32 i = ALIGN(bh, item_w); + bh_align = (bh_align < i) ? i : bh_align; + } + for (auto item_c : vecC) { + U32 i = ALIGN(aw, item_c); + aw_align = (aw_align < i) ? 
+                i : aw_align;
+        }
+        if (aw_align != aw || ah_align != ah) {
+            need_pad_a = true;
+        }
+        if (aw_align != aw || bh_align != bh) {
+            need_pad_b = true;
+        }
+        CHECK_STATUS(infer_gclmem_desc_nchw(aw_align, ah_align, ac, 0, 0, bh_align, ah_align, ac,
+            adt, adt, gclmemMatrixADesc, gclmemMatrixCDesc, need_pad_a));
+        CHECK_STATUS(infer_gclmem_desc_nchw(
+            aw_align, bh_align, bc, 0, 0, 0, 0, 0, adt, adt, gclmemMatrixBDesc, NULL, need_pad_b));
+        return SUCCESS;
+    }
+
+    if (transposeA && transposeB) {
+        /*TT*/
+        if (ah != bw) {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+        if (matrixCDesc) {
+            *matrixCDesc = matrixADesc;
+            (*matrixCDesc).dims[0] = bh;
+            (*matrixCDesc).dims[1] = aw;
+        }
+        U32 aw_align = aw;
+        U32 ah_align = ah;
+        U32 bh_align = bh;
+        for (auto item_k : vecK) {
+            U32 i = ALIGN(aw, item_k);
+            aw_align = (aw_align < i) ? i : aw_align;
+        }
+        for (auto item_c : vecC) {
+            U32 i = ALIGN(ah, item_c);
+            ah_align = (ah_align < i) ? i : ah_align;
+        }
+        for (auto item_w : vecW) {
+            U32 i = ALIGN(bh, item_w);
+            bh_align = (bh_align < i) ? i : bh_align;
+        }
+        if (aw_align != aw || ah_align != ah) {
+            need_pad_a = true;
+        }
+        if (ah_align != ah || bh_align != bh) {
+            need_pad_b = true;
+        }
+        if (matrixADesc.df == DF_MKT) {
+            U32 m, k, t;
+            U32 gw, gh, gc;
+            get_nlp_mkt_val(matrixADesc, NULL, &m, &k, &t);
+            if (t != 1) {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+            if (ah != k) {
+                CHECK_STATUS(NOT_MATCH);
+            }
+            if (aw != t) {
+                CHECK_STATUS(NOT_MATCH);
+                t = aw_align;
+            }
+            k = ah_align;
+            map_nlp_mkt_to_ncwhc4(m, k, t, &gw, &gh, &gc);
+            CHECK_STATUS(infer_gclmem_desc_ncwhc4(
+                gw, gh, gc * 4, 0, 0, 0, 0, 0, adt, adt, gclmemMatrixADesc, NULL, need_pad_a));
+        } else {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+        CHECK_STATUS(infer_gclmem_desc_nchw(
+            ah_align, bh_align, bc, 0, 0, 0, 0, 0, adt, adt, gclmemMatrixBDesc, NULL, need_pad_b));
+        CHECK_STATUS(infer_gclmem_desc_nchw(
+            0, 0, 0, 0, 0, bh_align, aw_align, ac, adt, adt, NULL, gclmemMatrixCDesc));
+        return SUCCESS;
+    }
+    return NOT_SUPPORTED;
+}
+
+EE matmul_infer_forward_algorithm_mali(GCLHandle_t handle,
+    TensorDesc matrixADesc,
+    bool transposeA,
+    TensorDesc matrixBDesc,
+    bool transposeB,
+    TensorDesc matrixCDesc,
+    ForwardRunInfoMali_t forwardRunInfo)
+{
+    if (forwardRunInfo == nullptr) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm);
+    if (algorithm != CONVOLUTION_ALGORITHM_NULL) {
+        return SUCCESS;
+    }
+    std::vector<ConvolutionForwardAlgorithm> matmulAlgorithms;
+    std::vector<U32> vecW;
+    std::vector<U32> vecC;
+    std::vector<U32> vecK;
+    matmul_produce_algos_paras(
+        transposeA, matrixADesc, transposeB, matrixBDesc, &matmulAlgorithms, &vecW, &vecC, &vecK);
+    if (vecW.size() == 1) {
+        forwardRunInfo->best_w[0] = vecW[0];
+        forwardRunInfo->best_k[0] = vecK[0];
+        forwardRunInfo->best_c[0] = vecC[0];
+        forwardRunInfo->algorithm = matmulAlgorithms[0];
+        return SUCCESS;
+    }
+
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
+    CHECK_STATUS(gcl_enable_queue_profiling(handle));
+    GCLMem_t matrixA = gcl_create_gclmem();
+    GCLMem_t matrixB = gcl_create_gclmem();
+    GCLMem_t matrixC = gcl_create_gclmem();
+    GCLMem_t tmpbuf = gcl_create_gclmem();
+    std::vector<ForwardRunInfoMali> runInfos;
+    U32 stride[3] = {0, 0, 0};
+    U32 offset[3] = {0, 0, 0};
+    GCLMemDesc matrixAMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+    GCLMemDesc matrixBMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+    GCLMemDesc matrixCMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+    U32 bytes;
+    U32 maxBytes = 0;
+    ForwardRunInfoMali runInfo;
+    runInfo.algorithm =
matmulAlgorithms[0]; + CHECK_STATUS(matmul_infer_output_size_mali(matrixADesc, transposeA, matrixBDesc, transposeB, + NULL, &matrixAMemDesc, &matrixBMemDesc, &matrixCMemDesc)); + + for (U32 i = 0; i < vecW.size(); i++) { + runInfo.best_w[0] = vecW[i]; + runInfo.best_c[0] = vecC[i]; + runInfo.best_k[0] = vecK[i]; + if (matmul_infer_forward_tmp_bytes_mali( + matrixADesc, transposeA, matrixBDesc, transposeB, &bytes, &runInfo) != SUCCESS) { + continue; + } + maxBytes = (maxBytes < bytes) ? bytes : maxBytes; + runInfos.push_back(runInfo); + } + + U32 algosNum = runInfos.size(); + if (algosNum == 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + matrixA->desc = matrixAMemDesc; + matrixB->desc = matrixBMemDesc; + matrixC->desc = matrixCMemDesc; + tmpbuf->desc.byteSize = maxBytes; + gcl_create_memory(handle, matrixA); + gcl_create_memory(handle, matrixB); + gcl_create_memory(handle, matrixC); + if (maxBytes) { + gcl_create_memory(handle, tmpbuf); + } + + U32 runKernelBe = 0; + U32 runKernelEnd = 0; + double minTime = DBL_MAX; + ForwardRunInfoMali bestRunInfo; + for (U32 i = 0; i < algosNum; i++) { + if (matmul_mali(handle, matrixADesc, transposeA, matrixA, matrixBDesc, transposeB, matrixB, + tmpbuf, matrixCDesc, matrixC, &runInfos[i]) == SUCCESS) { + runKernelEnd = handle->kernelVec->size(); + gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd); + runKernelBe = runKernelEnd; + if (minTime > handle->t_execute) { + minTime = handle->t_execute; + bestRunInfo = runInfos[i]; + } + } + } + if (minTime == DBL_MAX) { + CHECK_STATUS(NOT_SUPPORTED); + } + *forwardRunInfo = bestRunInfo; + CHECK_STATUS(gcl_finish(handle)); + gcl_destroy_gclmem(matrixA); + gcl_destroy_gclmem(matrixB); + gcl_destroy_gclmem(matrixC); + gcl_destroy_gclmem(tmpbuf); + runInfos.clear(); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_off_queue_profiling(handle)); + return SUCCESS; +} + +EE matmul_infer_forward_tmp_bytes_mali(TensorDesc matrixADesc, + bool transposeA, + TensorDesc matrixBDesc, + bool transposeB, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + switch (matrixADesc.dt) { + case DT_F16: { + ret = matmul_infer_forward_tmp_bytes_mali_fp16( + matrixADesc, transposeA, matrixBDesc, transposeB, bytes, forwardRunInfo); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE matmul_mali(GCLHandle_t handle, + TensorDesc matrixADesc, + bool transposeA, + const GCLMem_t matrixA, + TensorDesc matrixBDesc, + bool transposeB, + const GCLMem_t matrixB, + GCLMem_t tmp, + TensorDesc matrixCDesc, + GCLMem_t matrixC, + ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + ret = matmul_checkpara_mali(handle, matrixADesc, transposeA, matrixA, matrixBDesc, transposeB, + matrixB, matrixCDesc, matrixC); + switch (matrixADesc.dt) { + case DT_F16: { + ret = matmul_mali_fp16(handle, matrixADesc, transposeA, matrixA, matrixBDesc, + transposeB, matrixB, tmp, matrixCDesc, matrixC, forwardRunInfo); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/multihead_attention.cpp b/compute/tensor/src/gpu/mali/multihead_attention.cpp new file mode 100644 index 00000000..8f7203ef --- /dev/null +++ b/compute/tensor/src/gpu/mali/multihead_attention.cpp @@ -0,0 +1,722 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/multihead_attention_mali_fp16.h" +#include "tensor_computing_type.h" + +inline bool find_vector(std::vector vec, U32 val) +{ + bool find = false; + for (auto p : vec) { + if (p == val) { + find = true; + break; + } + } + return find; +} + +EE multihead_attention_checkpara_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + std::vector filterDesc, + std::vector filter, + std::vector bias, + std::vector layerNormAlpha, + std::vector layerNormBeta, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector eltwiseWithLayerNormIn, + ActivationMode activation, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (nullptr == handle || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + U32 filterNum = filterDesc.size(); + U32 lnNum = layerNormAlpha.size(); + if (filterNum != filter.size() || filterNum != bias.size()) { + return NOT_MATCH; + } + if (lnNum != layerNormBeta.size()) { + return NOT_MATCH; + } + if (filterNum != 4 || lnNum != 2) { + return NOT_SUPPORTED; + } + for (U32 i = 0; i < filterNum; ++i) { + if (nullptr == filter[i] || nullptr == bias[i]) { + return NULL_POINTER; + } + } + for (U32 i = 0; i < lnNum; ++i) { + if (nullptr == layerNormAlpha[i] || nullptr == layerNormBeta[i]) { + return NULL_POINTER; + } + } + + if (inputDesc.df == DF_MKT || inputDesc.df == DF_MTK) { + U32 m, k; + U32 fw, fh, fc, fn; + get_nlp_mkt_val(inputDesc, NULL, &m, &k, NULL); + if (firstFCSliceNum[0] != firstFCSliceNum[1] || firstFCSliceNum[0] != firstFCSliceNum[2]) { + return NOT_SUPPORTED; + } + if (firstFCSliceNum[0] % matmulSliceLen != 0) { + return NOT_MATCH; + } + if (m != 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + CHECK_STATUS(tensorSelectGet(filterDesc[0], NULL, NULL, &fn, &fc, &fh, &fw)); + if (fh != 1 || fw != 1) { + return NOT_MATCH; + } + if (k != fc) { + return NOT_MATCH; + } + } + return SUCCESS; +} + +EE multihead_attention_infer_output_size_mali(TensorDesc inputDesc, + std::vector filterDesc, + TensorDesc *outputDesc, + U32 *firstFCSliceNum, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + ForwardRunInfoMali_t forwardRunInfo) +{ + if (inputDesc.df == DF_MTK || inputDesc.df == DF_MKT) { + DataType dt; + U32 m, k, t; + get_nlp_mkt_val(inputDesc, &dt, &m, &k, &t); + U32 filterNum = 
filterDesc.size(); + U32 fn; + tensorSelectGet(filterDesc[filterNum - 1], NULL, NULL, &fn, NULL, NULL, NULL); + if (filterNum == 1) { + fn = firstFCSliceNum[2]; + } + if (outputDesc) { + *outputDesc = inputDesc; + (*outputDesc).dims[1] = fn; + } + U32 igw, igh, igc; + U32 ogw, ogh, ogc; + map_nlp_mkt_to_ncwhc4(m, k, t, &igw, &igh, &igc); + map_nlp_mkt_to_ncwhc4(m, fn, t, &ogw, &ogh, &ogc); + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + igw, igh, igc * 4, 0, 0, ogw, ogh, ogc * 4, dt, dt, gclmemInputDesc, gclmemOutputDesc)); + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE multihead_attention_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc inputDesc, + std::vector filterDesc, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector eltwiseWithLayerNormIn, + ActivationMode activation, + TensorDesc outputDesc, + ForwardRunInfoMali_t forwardRunInfo) +{ + if (forwardRunInfo == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + if (algorithm != CONVOLUTION_ALGORITHM_NULL) { + return SUCCESS; + } + + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_enable_queue_profiling(handle)); + GCLMem_t input = gcl_create_gclmem(); + GCLMem_t tmpbuf = gcl_create_gclmem(); + GCLMem_t output = gcl_create_gclmem(); + std::vector filter; + std::vector bias; + std::vector layerNormAlpha; + std::vector layerNormBeta; + U32 fn[4]; + + for (U32 i = 0; i < filterDesc.size(); ++i) { + tensorSelectGet(filterDesc[i], NULL, NULL, &fn[i], NULL, NULL, NULL); + GCLMem_t filterTmp = gcl_create_gclmem(); + GCLMem_t biasTmp = gcl_create_gclmem(); + filter.push_back((void *)filterTmp); + bias.push_back((void *)biasTmp); + } + + for (U32 i = 0; i < 2; ++i) { + GCLMem_t alphaTmp = gcl_create_gclmem(); + GCLMem_t betaTmp = gcl_create_gclmem(); + layerNormAlpha.push_back((void *)alphaTmp); + layerNormBeta.push_back((void *)betaTmp); + } + + std::vector runInfos; + ForwardRunInfoMali runInfo; + runInfo.algorithm = (I32)CONVOLUTION_ALGORITHM_GEMM; + std::vector inputMemDescs; + std::vector outputMemDescs; + std::vector filterMemDescs0; + std::vector filterMemDescs1; + std::vector filterMemDescs2; + std::vector filterMemDescs3; + std::vector> filterMemDescs; + /*0: fc0 + * 1: tn + * 2: nt + * 3: fc1 + * 4: fc2 + * 5: fc3*/ + U32 configInfos[6][3][64]; + U32 configNum_fc0 = 0; + U32 configNum_fc1 = 0; + U32 configNum_fc2 = 0; + U32 configNum_fc3 = 0; + U32 configNum_tn = 0; + U32 configNum_nt = 0; + U32 bytes; + U32 maxBytes = 0; + U32 maxInputSize = 0; + U32 maxOutputSize = 0; + U32 maxFilterSize[4] = {0, 0, 0, 0}; + U32 stride[3] = {0, 0, 0}; + U32 offset[3] = {0, 0, 0}; + + for (U32 i = 1; i <= 8; ++i) { + for (U32 j = 1; j <= 8; ++j) { + if (i * j <= 2) { + continue; + } + configInfos[0][0][configNum_fc0] = j; + configInfos[0][1][configNum_fc0] = 1; + configInfos[0][2][configNum_fc0] = i; + configInfos[1][0][configNum_tn] = j; + configInfos[1][1][configNum_tn] = 1; + configInfos[1][2][configNum_tn] = i; + configNum_fc0++; + configNum_tn++; + } + } + + for (U32 i = 4; i <= 8; i += 4) { + for (U32 j = 1; j <= 8; ++j) { + configInfos[3][0][configNum_fc1] = j; + configInfos[3][1][configNum_fc1] = 1; + configInfos[3][2][configNum_fc1] = i; + configNum_fc1++; + } + } + + for (U32 j = 1; j <= 8; j++) { + configInfos[4][0][configNum_fc2] = j; + configInfos[4][1][configNum_fc2] = 4; + configInfos[4][2][configNum_fc2] = 4; + configInfos[5][0][configNum_fc3] = j; + 
configInfos[5][1][configNum_fc3] = 4; + configInfos[5][2][configNum_fc3] = 4; + configNum_fc2++; + configNum_fc3++; + } + + if (fn[2] % 8 == 0) { + for (U32 j = 1; j <= 4; j++) { + configInfos[4][0][configNum_fc2] = j; + configInfos[4][1][configNum_fc2] = 4; + configInfos[4][2][configNum_fc2] = 8; + configNum_fc2++; + } + } + + if (fn[3] % 8 == 0) { + for (U32 j = 1; j <= 4; j++) { + configInfos[5][0][configNum_fc3] = j; + configInfos[5][1][configNum_fc3] = 4; + configInfos[5][2][configNum_fc3] = 8; + configNum_fc3++; + } + } + + for (U32 i = 1; i <= 8; ++i) { + for (U32 j = 1; j <= 8; ++j) { + if (i * j <= 2) { + continue; + } + if (i == 6 && j > 7) { + continue; + } + if (i == 7 && j > 6) { + continue; + } + if (i == 8 && j > 5) { + continue; + } + if (matmulSliceLen % i != 0) { + continue; + } + configInfos[2][0][configNum_nt] = j; // w + configInfos[2][1][configNum_nt] = 2; // c + configInfos[2][2][configNum_nt] = i; // k + configNum_nt++; + } + } + + for (U32 i = 1; i <= 8; ++i) { + for (U32 j = 1; j <= 8; ++j) { + if (i * j <= 2) { + continue; + } + if (i == 5 && j > 6) { + continue; + } + if (i == 6 && j > 5) { + continue; + } + if (i == 7 && j > 4) { + continue; + } + if (i == 8 && j > 3) { + continue; + } + if (matmulSliceLen % i != 0) { + continue; + } + configInfos[2][0][configNum_nt] = j; // w + configInfos[2][1][configNum_nt] = 4; // c + configInfos[2][2][configNum_nt] = i; // k + configNum_nt++; + } + } + std::vector configNums; + configNums.push_back(configNum_fc0); + configNums.push_back(configNum_tn); + configNums.push_back(configNum_nt); + configNums.push_back(configNum_fc1); + configNums.push_back(configNum_fc2); + configNums.push_back(configNum_fc3); + + DataType dt; + U32 t, k; + get_nlp_mkt_val(inputDesc, &dt, NULL, &k, &t); + std::vector biasDesc; + for (U32 i = 0; i < 2; ++i) { + GCLMemDesc tmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + U32 biasNum = fn[i] + 8; + tmpDesc.stride[0] = biasNum; + tmpDesc.stride[1] = 1; + tmpDesc.stride[2] = 1; + tmpDesc.offset[0] = 0; + tmpDesc.offset[1] = 0; + tmpDesc.offset[2] = 0; + tmpDesc.num = biasNum; + tmpDesc.byteSize = biasNum * bytesOf(dt); + tmpDesc.flags = CL_MEM_READ_WRITE; + tmpDesc.memFormat = DF_NHWC; + tmpDesc.memType = GCL_MEM_BUF; + TensorDesc biasDescTmp = tensor1d(dt, fn[i]); + biasDesc.push_back(biasDescTmp); + ((GCLMem_t)bias[i])->desc = tmpDesc; + gcl_create_memory(handle, (GCLMem_t)bias[i]); + } + + for (U32 i = 2; i < filterDesc.size(); ++i) { + GCLMemDesc tmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + U32 biasNum = (fn[i] + 3) / 4; + tmpDesc.stride[0] = biasNum; + tmpDesc.stride[1] = 1; + tmpDesc.stride[2] = 1; + tmpDesc.offset[0] = 0; + tmpDesc.offset[1] = 0; + tmpDesc.offset[2] = 0; + tmpDesc.num = biasNum; + tmpDesc.byteSize = biasNum * 4 * bytesOf(dt); + tmpDesc.flags = CL_MEM_READ_WRITE; + tmpDesc.memFormat = DF_NHWC; + tmpDesc.memType = GCL_MEM_IMG_1D; + TensorDesc biasDescTmp = tensor1d(dt, fn[i]); + biasDesc.push_back(biasDescTmp); + ((GCLMem_t)bias[i])->desc = tmpDesc; + gcl_create_memory(handle, (GCLMem_t)bias[i]); + } + + for (U32 i = 0; i < 2; ++i) { + U32 layerNormNum = ALIGN(k, 4); + if (i == 1) { + tensorSelectGet(filterDesc[1], NULL, NULL, &layerNormNum, NULL, NULL, NULL); + layerNormNum = ALIGN(layerNormNum, 4); + } + GCLMemDesc tmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + tmpDesc.stride[0] = layerNormNum; + tmpDesc.stride[1] = 1; + tmpDesc.stride[2] = 1; + tmpDesc.offset[0] = 0; + tmpDesc.offset[1] = 0; + tmpDesc.offset[2] = 0; + tmpDesc.num = 
layerNormNum;
+        tmpDesc.byteSize = layerNormNum * bytesOf(dt);
+        tmpDesc.flags = CL_MEM_READ_WRITE;
+        tmpDesc.memFormat = DF_NHWC;
+        tmpDesc.memType = GCL_MEM_BUF;
+        ((GCLMem_t)layerNormAlpha[i])->desc = tmpDesc;
+        ((GCLMem_t)layerNormBeta[i])->desc = tmpDesc;
+        gcl_create_memory(handle, (GCLMem_t)layerNormAlpha[i]);
+        gcl_create_memory(handle, (GCLMem_t)layerNormBeta[i]);
+    }
+
+    U32 runKernelBe = 0;
+    U32 runKernelEnd = 0;
+    ForwardRunInfoMali bestRunInfo;
+    bestRunInfo.algorithm = (I32)CONVOLUTION_ALGORITHM_GEMM;
+    for (U32 i = 0; i < configNums.size(); ++i) {
+        bestRunInfo.best_w[i] = configInfos[i][0][0];
+        bestRunInfo.best_c[i] = configInfos[i][1][0];
+        bestRunInfo.best_k[i] = configInfos[i][2][0];
+    }
+    GCLMemDesc inputMemDesc;
+    GCLMemDesc outputMemDesc;
+    GCLMemDesc filterMemDesc[4];
+    for (U32 i = 0; i < configNums.size(); ++i) {
+        runInfo = bestRunInfo;
+        for (U32 j = 0; j < configNums[i]; ++j) {
+            runInfo.best_w[i] = configInfos[i][0][j];
+            runInfo.best_c[i] = configInfos[i][1][j];
+            runInfo.best_k[i] = configInfos[i][2][j];
+            inputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+            outputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+            for (U32 m = 0; m < filterDesc.size(); m++) {
+                filterMemDesc[m] = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+            }
+            if (multihead_attention_infer_output_size_mali(inputDesc, filterDesc, NULL,
+                    firstFCSliceNum, &inputMemDesc, &outputMemDesc, &runInfo) != SUCCESS) {
+                continue;
+            }
+            if (multihead_attention_transform_filter_bytes_mali(
+                    filterDesc, filterMemDesc, &bytes, &runInfo) != SUCCESS) {
+                continue;
+            }
+            if (maxBytes < bytes) {
+                maxBytes = bytes;
+            }
+            if (multihead_attention_infer_forward_tmp_bytes_mali(inputDesc, filterDesc,
+                    eltwiseWithLayerNormIn, firstFCSliceNum, matmulSliceLen, &bytes,
+                    &runInfo) != SUCCESS) {
+                continue;
+            }
+            if (maxBytes < bytes) {
+                maxBytes = bytes;
+            }
+            if (maxInputSize < inputMemDesc.byteSize) {
+                maxInputSize = inputMemDesc.byteSize;
+            }
+            if (maxOutputSize < outputMemDesc.byteSize) {
+                maxOutputSize = outputMemDesc.byteSize;
+            }
+            if (maxFilterSize[0] < filterMemDesc[0].byteSize) {
+                maxFilterSize[0] = filterMemDesc[0].byteSize;
+            }
+            if (maxFilterSize[1] < filterMemDesc[1].byteSize) {
+                maxFilterSize[1] = filterMemDesc[1].byteSize;
+            }
+            if (maxFilterSize[2] < filterMemDesc[2].byteSize) {
+                maxFilterSize[2] = filterMemDesc[2].byteSize;
+            }
+            if (maxFilterSize[3] < filterMemDesc[3].byteSize) {
+                maxFilterSize[3] = filterMemDesc[3].byteSize;
+            }
+            inputMemDescs.push_back(inputMemDesc);
+            outputMemDescs.push_back(outputMemDesc);
+            filterMemDescs0.push_back(filterMemDesc[0]);
+            filterMemDescs1.push_back(filterMemDesc[1]);
+            filterMemDescs2.push_back(filterMemDesc[2]);
+            filterMemDescs3.push_back(filterMemDesc[3]);
+            runInfos.push_back(runInfo);
+        }
+        U32 algosNum = runInfos.size();
+        if (algosNum == 0) {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+
+        if (maxInputSize > inputMemDescs[0].byteSize || i == 0) {
+            inputMemDescs[0].byteSize = maxInputSize;
+            if (i > 0) {
+                CHECK_STATUS(gcl_release_memory(input));
+            }
+            input->desc = inputMemDescs[0];
+            CHECK_STATUS(gcl_create_memory(handle, input));
+        }
+        if (maxOutputSize > outputMemDescs[0].byteSize || i == 0) {
+            outputMemDescs[0].byteSize = maxOutputSize;
+            if (i > 0) {
+                CHECK_STATUS(gcl_release_memory(output));
+            }
+            output->desc = outputMemDescs[0];
+            CHECK_STATUS(gcl_create_memory(handle, output));
+        }
+        filterMemDescs.push_back(filterMemDescs0);
+        filterMemDescs.push_back(filterMemDescs1);
+        filterMemDescs.push_back(filterMemDescs2);
+        filterMemDescs.push_back(filterMemDescs3);
+        for (U32 k = 0; k < filterDesc.size(); k++) {
+            if (maxFilterSize[k] > filterMemDescs[k][0].byteSize || i == 0) {
+                filterMemDescs[k][0].byteSize = maxFilterSize[k];
+                if (i > 0) {
+                    CHECK_STATUS(gcl_release_memory((GCLMem_t)filter[k]));
+                }
+                ((GCLMem_t)filter[k])->desc = filterMemDescs[k][0];
+                CHECK_STATUS(gcl_create_memory(handle, (GCLMem_t)filter[k]));
+            }
+        }
+        if (maxBytes > tmpbuf->desc.byteSize || i == 0) {
+            tmpbuf->desc.byteSize = maxBytes;
+            if (i > 0) {
+                CHECK_STATUS(gcl_release_subMem(tmpbuf));
+                CHECK_STATUS(gcl_release_memory(tmpbuf));
+            }
+            if (maxBytes) {
+                gcl_create_memory(handle, tmpbuf);
+            }
+        }
+
+        double minTime = DBL_MAX;
+        for (U32 ii = 0; ii < algosNum; ii++) {
+            input->desc = inputMemDescs[ii];
+            output->desc = outputMemDescs[ii];
+            ((GCLMem_t)filter[0])->desc = filterMemDescs0[ii];
+            ((GCLMem_t)filter[1])->desc = filterMemDescs1[ii];
+            ((GCLMem_t)filter[2])->desc = filterMemDescs2[ii];
+            ((GCLMem_t)filter[3])->desc = filterMemDescs3[ii];
+            U32 best_w = runInfos[ii].best_w[i];
+            U32 best_c = runInfos[ii].best_c[i];
+            U32 best_k = runInfos[ii].best_k[i];
+            runKernelBe = handle->kernelVec->size();
+            if (multihead_attention_mali(handle, inputDesc, input, filterDesc, filter, biasDesc,
+                    bias, layerNormAlpha, layerNormBeta, multiplyAlpha, multiplyBeta,
+                    firstFCSliceNum, matmulSliceLen, eltwiseWithLayerNormIn, activation, maxBytes,
+                    tmpbuf, outputDesc, output, &runInfos[ii]) == SUCCESS) {
+                runKernelEnd = handle->kernelVec->size();
+                runKernelBe = runKernelBe + 1;
+                auto kernelInfo = (*handle->kernelVec)[runKernelBe];
+                if (kernelInfo.name == "unknow_fill_memory_zero_vec4_f16") {
+                    runKernelBe = runKernelBe + 1;
+                }
+                if (i == 0) {
+                    goto R00;
+                }
+                runKernelBe = runKernelBe + 1;
+                kernelInfo = (*handle->kernelVec)[runKernelBe];
+                if (kernelInfo.name == "unknow_fill_memory_zero_vec4_f16") {
+                    runKernelBe = runKernelBe + 1;
+                }
+                if (i == 1) {
+                    goto R00;
+                }
+                runKernelBe = runKernelBe + 2;
+                if (i == 2) {
+                    goto R00;
+                }
+                runKernelBe = runKernelBe + 1;
+                if (i == 3) {
+                    goto R00;
+                }
+                runKernelBe = runKernelBe + 2;
+                if (i == 4) {
+                    goto R00;
+                }
+                runKernelBe = runKernelBe + 1;
+                if (runKernelBe >= runKernelEnd) {
+                    CHECK_STATUS(NOT_MATCH);
+                }
+            R00:
+                gcl_run_kernelVec_timing(handle, runKernelBe, runKernelBe + 1);
+                if (minTime > handle->t_execute) {
+                    minTime = handle->t_execute;
+                    bestRunInfo.best_w[i] = best_w;
+                    bestRunInfo.best_c[i] = best_c;
+                    bestRunInfo.best_k[i] = best_k;
+                }
+            }
+        }
+        inputMemDescs.clear();
+        outputMemDescs.clear();
+        filterMemDescs.clear();
+        filterMemDescs0.clear();
+        filterMemDescs1.clear();
+        filterMemDescs2.clear();
+        filterMemDescs3.clear();
+        runInfos.clear();
+        CHECK_STATUS(gcl_finish(handle));
+        CHECK_STATUS(gcl_clean_kernelVec(handle));
+        if (minTime == DBL_MAX) {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+    }
+    *forwardRunInfo = bestRunInfo;
+    CHECK_STATUS(gcl_finish(handle));
+    gcl_destroy_gclmem(input);
+    gcl_destroy_gclmem(output);
+    gcl_destroy_gclmem(tmpbuf);
+    for (auto p : filter) {
+        gcl_destroy_gclmem(GCLMem_t(p));
+    }
+    for (auto p : bias) {
+        gcl_destroy_gclmem(GCLMem_t(p));
+    }
+    for (auto p : layerNormAlpha) {
+        gcl_destroy_gclmem(GCLMem_t(p));
+    }
+    for (auto p : layerNormBeta) {
+        gcl_destroy_gclmem(GCLMem_t(p));
+    }
+    runInfos.clear();
+    inputMemDescs.clear();
+    outputMemDescs.clear();
+    filterMemDescs.clear();
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
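+    // Note on the search above: configInfos is indexed as
+    // configInfos[kernel][field][candidate], where kernel 0..5 maps to
+    // {fc0, tn, nt, fc1, fc2, fc3} and field 0..2 is a candidate's {w, c, k}
+    // tile shape. With queue profiling enabled, each surviving candidate for
+    // kernel i is executed once, gcl_run_kernelVec_timing reads back its device
+    // time, and the fastest tile shape is kept in bestRunInfo; the goto
+    // bookkeeping only locates kernel i's entry inside the recorded kernelVec.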
CHECK_STATUS(gcl_off_queue_profiling(handle));
+    return SUCCESS;
+}
+
+EE multihead_attention_transform_filter_bytes_mali(std::vector<TensorDesc> filterDesc,
+    GCLMemDesc_t gclmemFilterDesc,
+    U32 *bytes,
+    ForwardRunInfoMali_t forwardRunInfo)
+{
+    EE ret = SUCCESS;
+    switch (filterDesc[0].dt) {
+        case DT_F16: {
+            ret = multihead_attention_transform_filter_bytes_mali_fp16(
+                filterDesc, gclmemFilterDesc, bytes, forwardRunInfo);
+            break;
+        }
+        case DT_I8: {
+            ret = NOT_SUPPORTED;
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
+
+EE multihead_attention_transform_filter_mali(GCLHandle_t handle,
+    std::vector<TensorDesc> filterDesc,
+    std::vector<void *> filter,
+    std::vector<TensorDesc> *fltmemDesc,
+    std::vector<void *> fltmem,
+    ForwardRunInfoMali_t forwardRunInfo)
+{
+    EE ret = SUCCESS;
+    switch (filterDesc[0].dt) {
+        case DT_F16: {
+            ret = multihead_attention_transform_filter_mali_fp16(
+                handle, filterDesc, filter, fltmemDesc, fltmem, forwardRunInfo);
+            break;
+        }
+        case DT_I8: {
+            ret = NOT_SUPPORTED;
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
+
+EE multihead_attention_infer_forward_tmp_bytes_mali(TensorDesc inputDesc,
+    std::vector<TensorDesc> filterDesc,
+    std::vector<bool> eltwiseWithLayerNormIn,
+    U32 *firstFCSliceNum,
+    U32 matmulSliceLen,
+    U32 *bytes,
+    ForwardRunInfoMali_t forwardRunInfo)
+{
+    EE ret = SUCCESS;
+    switch (inputDesc.dt) {
+        case DT_F16: {
+            ret = multihead_attention_infer_forward_tmp_bytes_mali_fp16(inputDesc, filterDesc,
+                eltwiseWithLayerNormIn, firstFCSliceNum, matmulSliceLen, bytes, forwardRunInfo);
+            break;
+        }
+        case DT_I8: {
+            ret = NOT_SUPPORTED;
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
+
+EE multihead_attention_mali(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    const GCLMem_t input,
+    std::vector<TensorDesc> filterDesc,
+    std::vector<void *> filter,
+    std::vector<TensorDesc> biasDesc,
+    std::vector<void *> bias,
+    std::vector<void *> layerNormAlpha,
+    std::vector<void *> layerNormBeta,
+    void *multiplyAlpha,
+    void *multiplyBeta,
+    U32 *firstFCSliceNum,
+    U32 matmulSliceLen,
+    std::vector<bool> eltwiseWithLayerNormIn,
+    ActivationMode activation,
+    U32 tmpBytes,
+    GCLMem_t tmpBuf,
+    TensorDesc outputDesc,
+    GCLMem_t output,
+    ForwardRunInfoMali_t forwardRunInfo)
+{
+    CHECK_STATUS(multihead_attention_checkpara_mali(handle, inputDesc, input, filterDesc,
+        filter, bias, layerNormAlpha, layerNormBeta, multiplyAlpha, multiplyBeta,
+        firstFCSliceNum, matmulSliceLen, eltwiseWithLayerNormIn, activation, tmpBuf,
+        outputDesc, output));
+    EE ret = SUCCESS;
+    switch (inputDesc.dt) {
+        case DT_F16: {
+            ret = multihead_attention_mali_fp16(handle, inputDesc, input, filterDesc, filter,
+                biasDesc, bias, layerNormAlpha, layerNormBeta, multiplyAlpha, multiplyBeta,
+                firstFCSliceNum, matmulSliceLen, eltwiseWithLayerNormIn, activation, tmpBytes,
+                tmpBuf, outputDesc, output, forwardRunInfo);
+            break;
+        }
+        case DT_I8: {
+            ret = NOT_SUPPORTED;
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/gpu/mali/normalization.cpp b/compute/tensor/src/gpu/mali/normalization.cpp
new file mode 100644
index 00000000..aa9d58fb
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/normalization.cpp
@@ -0,0 +1,85 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/normalization_mali_fp16.h" + +EE normalization_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + if (outputDesc) { + *outputDesc = inputDesc; + } + if (inputDesc.df == DF_MKT) { + DataType dt; + U32 m, k, t; + U32 w, h, c; + get_nlp_mkt_val(inputDesc, &dt, &m, &k, &t); + map_nlp_mkt_to_ncwhc4(m, k, t, &w, &h, &c); + c = c * 4; + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + w, h, c, 0, 0, w, h, c, dt, dt, gclmemInputDesc, gclmemOutputDesc)); + return SUCCESS; + } + return NOT_SUPPORTED; +} + +inline EE normalization_checkpara_mali(GCLHandle_t handle, + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (nullptr == handle || nullptr == alpha || nullptr == beta || nullptr == input || + nullptr == output) { + return NULL_POINTER; + } + if (inputDesc.df != outputDesc.df || inputDesc.df != DF_MKT) { + return NOT_SUPPORTED; + } + if (input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE layer_normalization_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS( + normalization_checkpara_mali(handle, alpha, beta, inputDesc, input, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = normalization_mali_fp16(handle, alpha, beta, inputDesc, input, outputDesc, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/padding.cpp b/compute/tensor/src/gpu/mali/padding.cpp new file mode 100644 index 00000000..16e8f8c6 --- /dev/null +++ b/compute/tensor/src/gpu/mali/padding.cpp @@ -0,0 +1,72 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/padding_mali_fp16.h" + +EE padding_infer_output_size_mali(TensorDesc inputDesc, + PadParamSpec padParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + U32 ow, oh; + U32 pw, ph, pr, pb; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + pw = padParamSpec.left; + pr = padParamSpec.right; + ph = padParamSpec.top; + pb = padParamSpec.bottom; + // if (pw!=pr || ph != pb) CHECK_STATUS(NOT_SUPPORTED); + ow = iw + pw + pr; + oh = ih + ph + pb; + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, ow, oh, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + if (outputDesc) { + *outputDesc = tensor4df(idt, idf, in, ic, oh, ow); + } + return SUCCESS; +} + +EE padding_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PadParamSpec padParamSpec, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = padding_mali_fp16(handle, inputDesc, input, padParamSpec, outputDesc, output); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/pooling.cpp b/compute/tensor/src/gpu/mali/pooling.cpp new file mode 100644 index 00000000..7f14af45 --- /dev/null +++ b/compute/tensor/src/gpu/mali/pooling.cpp @@ -0,0 +1,124 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/pooling_mali_fp16.h" + +EE pooling_infer_output_size_mali(TensorDesc inputDesc, + PoolingParamSpec poolingParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in, it; + U32 ow, oh, ot; + U32 kw, kh, kt, sw, sh, st, pl, pt, pr, pb, pt_b, pt_a; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw, &it); + pl = poolingParamSpec.padding_left; + pr = poolingParamSpec.padding_right; + pt = poolingParamSpec.padding_top; + pb = poolingParamSpec.padding_bottom; + pt_b = poolingParamSpec.padding_before; + pt_a = poolingParamSpec.padding_after; + kw = poolingParamSpec.kernel_w; + kh = poolingParamSpec.kernel_h; + kt = poolingParamSpec.kernel_t; + sw = poolingParamSpec.stride_w; + sh = poolingParamSpec.stride_h; + st = poolingParamSpec.stride_t; + if (st == 0) { + st = 1; + } + switch (poolingParamSpec.rm) { + case CEIL: { + ow = (U32)(ceil((double(iw + pl + pr - kw) / sw))) + 1; + oh = (U32)(ceil((double(ih + pt + pb - kh) / sh))) + 1; + ot = (U32)(ceil((double(it + pt_b + pt_a - kt) / st))) + 1; + break; + } + case FLOOR: { + ow = (U32)(floor((double(iw + pl + pr - kw) / sw))) + 1; + oh = (U32)(floor((double(ih + pb + pt - kh) / sh))) + 1; + ot = (U32)(floor((double(it + pt_b + pt_a - kt) / st))) + 1; + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + U32 iw_align, ih_align; + ih_align = ih + pt + pb; + ih_align = ih_align - pt * 2; + iw_align = iw + pl + pr; + iw_align = iw_align - pl * 2; + + if (inputDesc.df == DF_NCTHW) { + *outputDesc = tensor5df(idt, idf, in, ic, ot, oh, ow); + } else { + *outputDesc = tensor4df(idt, idf, in, ic, oh, ow); + it = 1; + ot = 1; + } + + ic = ALIGN(ic, 4); + CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw_align, ih_align, ic * it, pl, pt, ow, oh, ic * ot, idt, + idt, gclmemInputDesc, gclmemOutputDesc)); + return SUCCESS; +} +EE pooling_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PoolingParamSpec poolingParamSpec, + const void *scale, + GCLMem_t temp, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(scale); + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = pooling_mali_fp16( + handle, inputDesc, input, poolingParamSpec, outputDesc, output, temp); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE pooling_infer_forward_tmp_bytes_mali( + TensorDesc inputDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = pooling_infer_forward_tmp_bytes_mali_fp16(inputDesc, bytes, forwardRunInfo); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/power.cpp b/compute/tensor/src/gpu/mali/power.cpp new file mode 100644 
index 00000000..16e343ab --- /dev/null +++ b/compute/tensor/src/gpu/mali/power.cpp @@ -0,0 +1,113 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/power_mali_fp16.h" + +EE power_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + if (outputDesc) { + *outputDesc = inputDesc; + } + DataType idt; + DataFormat idf = inputDesc.df; + U32 iw, ih, ic, in; + if (idf == DF_NCHW || idf == DF_NORMAL) { + if (gclmemInputDesc) { + tensorSelectGet(inputDesc, &idt, NULL, &in, &ic, &ih, &iw); + if (gclmemInputDesc->memFormat == DF_NCHW) { + CHECK_STATUS(infer_gclmem_desc_nchw( + iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + if (gclmemInputDesc && gclmemOutputDesc) { + *gclmemOutputDesc = *gclmemInputDesc; + } + } else if (gclmemInputDesc->memFormat == DF_NCWHC4) { + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + if (gclmemInputDesc && gclmemOutputDesc) { + *gclmemOutputDesc = *gclmemInputDesc; + } + } else { + return NOT_SUPPORTED; + } + } + return SUCCESS; + } + if (idf == DF_MKT) { + if (gclmemInputDesc) { + if (gclmemInputDesc->memFormat == DF_NCWHC4) { + get_nlp_mkt_val(inputDesc, &idt, &in, &ic, &ih); + iw = 1; + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + if (gclmemInputDesc && gclmemOutputDesc) { + *gclmemOutputDesc = *gclmemInputDesc; + } + } else { + return NOT_SUPPORTED; + } + } + return SUCCESS; + } + return NOT_SUPPORTED; +} + +inline EE power_checkpara_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + EE ret = SUCCESS; + if (handle == nullptr || nullptr == input || nullptr == output) { + ret = NULL_POINTER; + } + if (inputDesc.df != outputDesc.df) { + ret = NOT_SUPPORTED; + } + if (input->desc.memFormat != output->desc.memFormat) { + ret = NOT_SUPPORTED; + } + if (inputDesc.df != DF_NCHW && inputDesc.df != DF_MKT && inputDesc.df != DF_NORMAL) { + ret = NOT_SUPPORTED; + } + if (input->desc.memFormat != DF_NCHW && input->desc.memFormat != DF_NCWHC4) { + ret = NOT_SUPPORTED; + } + return ret; +} + +EE power_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + PowerParamSpec p, + TensorDesc outputDesc, + GCLMem_t 
output) +{ + EE ret = SUCCESS; + CHECK_STATUS(power_checkpara_mali(handle, inputDesc, input, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = power_mali_fp16(handle, inputDesc, input, p, outputDesc, output); + break; + } + case DT_I32: { + ret = power_mali_fp16(handle, inputDesc, input, p, outputDesc, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/preallocated_memory.cpp b/compute/tensor/src/gpu/mali/preallocated_memory.cpp new file mode 100644 index 00000000..b12b485f --- /dev/null +++ b/compute/tensor/src/gpu/mali/preallocated_memory.cpp @@ -0,0 +1,94 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" + +inline EE preallocated_memory_checkpara_mali( + GCLHandle_t handle, TensorDesc outputDesc, GCLMem_t output) +{ + if (handle == nullptr || nullptr == output) { + return NULL_POINTER; + } + if (output->desc.memFormat != DF_NCHW) { + return NOT_SUPPORTED; + } + if (outputDesc.dt != DT_F16 && outputDesc.dt != DT_I32) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE preallocated_memory_core_mali_fp16( + GCLHandle_t handle, TensorDesc outputDesc, GCLMem_t output) +{ + DataType dt = outputDesc.dt; + U32 numElements = output->desc.num; + cl_mem outbuf = output->mem; + U32 gs = numElements; + U32 ls = 0; + U32 dim = 1; + Kernel kernel; + char dataType[16]; + if (dt == DT_I32) { + strcpy(dataType, "i32"); + } + if (dt == DT_F16) { + strcpy(dataType, "f16"); + } + char kernelName[128]; + sprintf(kernelName, "fill_memory_zero_%s", dataType); + + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, numElements, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, &gs, &ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs, &ls, kernelName)); + CHECK_STATUS(gcl_print_memory(handle, output, "preallocated_memory_output")); +#endif + return SUCCESS; +} + +EE preallocated_memory_infer_output_size_mali(TensorDesc *outputDesc, GCLMemDesc_t gclmemOutputDesc) +{ + U32 w, h, c, n; + TensorDesc desc = *outputDesc; + U32 ndims = desc.nDims; + DataType dt = desc.dt; + if (ndims < 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + w = desc.dims[0]; + h = (ndims > 1) ? desc.dims[1] : 1; + c = (ndims > 2) ? 
desc.dims[2] : 1;
+    n = (ndims > 3) ? desc.dims[3] : 1;
+    if (dt != DT_F16 && dt != DT_I32) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (n != 1) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    CHECK_STATUS(infer_gclmem_desc_nchw(0, 0, 0, 0, 0, w, h, c, dt, dt, NULL, gclmemOutputDesc));
+    return SUCCESS;
+}
+
+EE preallocated_memory_mali(GCLHandle_t handle, TensorDesc outputDesc, GCLMem_t output)
+{
+    EE ret = SUCCESS;
+    CHECK_STATUS(preallocated_memory_checkpara_mali(handle, outputDesc, output));
+    CHECK_STATUS(preallocated_memory_core_mali_fp16(handle, outputDesc, output));
+    return ret;
+}
diff --git a/compute/tensor/src/gpu/mali/prelu.cpp b/compute/tensor/src/gpu/mali/prelu.cpp
new file mode 100644
index 00000000..d8e69f0f
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/prelu.cpp
@@ -0,0 +1,84 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
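The ops in this patch all share the three-part structure that preallocated_memory.cpp above shows: an `*_infer_output_size_mali` function that fills the CPU- and GPU-side descriptors, an inline `*_checkpara_mali` validator, and a public entry point that dispatches on the element type. A minimal sketch of that skeleton, with purely illustrative names (`example_op_*` is not a symbol in this patch):

    // Sketch only: the common checkpara + dtype-switch skeleton.
    inline EE example_op_checkpara_mali(GCLHandle_t handle, GCLMem_t input, GCLMem_t output)
    {
        if (handle == nullptr || input == nullptr || output == nullptr) {
            return NULL_POINTER;  // reject bad pointers before touching the GPU
        }
        return SUCCESS;
    }

    EE example_op_mali(GCLHandle_t handle, TensorDesc desc, GCLMem_t input, GCLMem_t output)
    {
        CHECK_STATUS(example_op_checkpara_mali(handle, input, output));
        switch (desc.dt) {
            case DT_F16:
                // only the fp16 path is implemented, as in the ops above
                return example_op_mali_fp16(handle, desc, input, output);
            default:
                return NOT_SUPPORTED;
        }
    }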
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/prelu_mali_fp16.h" + +EE prelu_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + + if (idf == DF_NCHW) { + if (outputDesc) { + *outputDesc = inputDesc; + } + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + if (gclmemInputDesc && gclmemOutputDesc) { + *gclmemOutputDesc = *gclmemInputDesc; // the input and output mem maybe the same + } + return SUCCESS; + } + return NOT_SUPPORTED; +} + +inline EE prelu_checkpara_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t weight, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (handle == nullptr || nullptr == weight || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + if (input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE prelu_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(prelu_checkpara_mali(handle, inputDesc, input, weight, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = prelu_mali_fp16(handle, inputDesc, input, weight, preluDesc, outputDesc, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/reshape.cpp b/compute/tensor/src/gpu/mali/reshape.cpp new file mode 100644 index 00000000..6c63b50c --- /dev/null +++ b/compute/tensor/src/gpu/mali/reshape.cpp @@ -0,0 +1,157 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
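prelu above, like most ops in this directory, only accepts activations in the DF_NCWHC4 memory format. Reading that name from outer to inner (n, c/4, w, h, c4) implies the index math sketched below; the helper is illustrative only, assumes zero padding offsets and batch 0, and is not part of the patch:

    // Offset of element (c, h, w) in a DF_NCWHC4 buffer of height ih and width iw.
    U32 ncwhc4_offset(U32 c, U32 h, U32 w, U32 ih, U32 iw)
    {
        U32 cGroup = c / 4;  // which packed group of 4 channels
        U32 cLane = c % 4;   // lane inside the group
        return ((cGroup * iw + w) * ih + h) * 4 + cLane;
    }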
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/reshape_mali_fp16.h" + +EE reshape_infer_output_size_mali(TensorDesc inputDesc, + ReshapeParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + if (outputDesc == nullptr || gclmemInputDesc == nullptr || gclmemOutputDesc == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + I32 *dims = p.shape_dims; + I32 shapeSize = p.shape_size; + int inputElementNum = tensorNumElements(inputDesc); + int outputElementNum = 1; + for (int i = 0; i < shapeSize; i++) { + outputElementNum *= dims[i]; + } + int index_range = ((int)inputDesc.nDims > shapeSize) ? shapeSize : inputDesc.nDims; + if (inputElementNum > 0 && outputElementNum > 0 && inputElementNum != outputElementNum) { + for (int i = 0; i < index_range; i++) { + if ((inputElementNum / (int)inputDesc.dims[inputDesc.nDims - 1 - i]) == + (outputElementNum / dims[i])) { + dims[i] = inputDesc.dims[inputDesc.nDims - 1 - i]; + break; + } + } + } + *outputDesc = inputDesc; + if (shapeSize == 2) { + (*outputDesc).df = DF_NORMAL; + } else if (shapeSize == 3) { + (*outputDesc).df = DF_MKT; + } else if (shapeSize == 4) { + (*outputDesc).df = DF_NCHW; + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + (*outputDesc).nDims = shapeSize; + + U32 factor = 1; + U32 count = 0; + for (I32 i = 0; i < shapeSize; i++) { + I32 value = dims[i]; + if (value == 0) { + value = inputDesc.dims[inputDesc.nDims - 1 - i]; + } + if (value == -1) { + value = 0; + count++; + } else { + factor *= value; + } + (*outputDesc).dims[shapeSize - 1 - i] = value; + } + + if (count > 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + + for (I32 i = 0; i < 4; i++) { + if (i < shapeSize) { + if ((*outputDesc).dims[i] == 0) { + (*outputDesc).dims[i] = tensorNumElements(inputDesc) / factor; + } + } else { + (*outputDesc).dims[i] = 1; + } + } + + DataType idt, odt; + U32 ic, ih, iw, it; + U32 oc, oh, ow, ot; + tensorSelectGet(inputDesc, &idt, NULL, NULL, &ic, &ih, &iw, &it); + tensorSelectGet((*outputDesc), &odt, NULL, NULL, &oc, &oh, &ow, &ot); + if (gclmemInputDesc->memFormat == DF_NCHW || gclmemInputDesc->byteSize == 0) { + CHECK_STATUS( + infer_gclmem_desc_nchw(iw, ih, ic * it, 0, 0, 0, 0, 0, idt, odt, gclmemInputDesc, NULL)); + } else { + ic = ALIGN(ic, 4); + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic * it, 0, 0, 0, 0, 0, idt, odt, gclmemInputDesc, NULL)); + } + CHECK_STATUS( + infer_gclmem_desc_nchw(0, 0, 0, 0, 0, ow, oh, oc * ot, idt, odt, NULL, gclmemOutputDesc)); + return SUCCESS; +} + +inline EE reshape_checkpara_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + return SUCCESS; +} + +EE reshape_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = reshape_infer_forward_tmp_bytes_mali_fp16( + inputDesc, outputDesc, gclmemInputDesc, gclmemOutputDesc, bytes); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE reshape_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t tmpbuf, + TensorDesc outputDesc, + GCLMem_t 
output) +{ + EE ret = SUCCESS; + CHECK_STATUS(reshape_checkpara_mali(handle, inputDesc, input, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = reshape_mali_fp16(handle, inputDesc, input, outputDesc, output, tmpbuf); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/rnncell.cpp b/compute/tensor/src/gpu/mali/rnncell.cpp new file mode 100644 index 00000000..fe19e09c --- /dev/null +++ b/compute/tensor/src/gpu/mali/rnncell.cpp @@ -0,0 +1,439 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/rnncell_mali_fp16.h" +#include "gpu/mali/fp16/rnn_mali_fp16.h" + +inline EE rnncell_checkpara_mali(GCLHandle_t handle, + TensorDesc xDesc, + GCLMem_t currentX, + TensorDesc filterDesc, + GCLMem_t filter, + GCLMem_t bias, + GCLMem_t state, + RNNParamSpec rnncellDesc, + GCLMem_t tmpBuf, + TensorDesc hDesc, + GCLMem_t output) +{ + if (nullptr == handle || nullptr == currentX || nullptr == filter || nullptr == output || + nullptr == state || nullptr == bias || nullptr == tmpBuf) { + return NULL_POINTER; + } + DataFormat df; + DataType dt; + U32 iB, iX; + if (xDesc.nDims == 2) { + CHECK_STATUS(tensor2dGet(xDesc, &dt, &df, &iB, &iX)); + } + if (xDesc.nDims == 3) { + if (xDesc.df != DF_MTK && xDesc.df != DF_MKT) { + CHECK_STATUS(NOT_SUPPORTED); + } + U32 m, k, t; + get_nlp_mkt_val(xDesc, &dt, &m, &k, &t); + iB = m; + iX = k; + if (t != 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + } + if (iB != 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + U32 hDim = rnncellDesc.numOutput; + U32 col = (rnncellDesc.numProjection > 0) ? 
rnncellDesc.numProjection : hDim; + U32 filterRow, filterCol; + tensorSelectGet(filterDesc, NULL, NULL, NULL, NULL, &filterRow, &filterCol); + if (filterCol != hDim + iX) { + CHECK_STATUS(NOT_MATCH); + } + if (filterRow != col * 4) { + CHECK_STATUS(NOT_MATCH); + } + if (hDesc.df != xDesc.df) { + CHECK_STATUS(NOT_MATCH); + } + if (hDesc.dims[0] != hDim && hDesc.dims[1] != hDim) { + CHECK_STATUS(NOT_MATCH); + } + return SUCCESS; +} + +EE rnncell_infer_output_size_mali(TensorDesc inputDesc, + RNNParamSpec rnncellDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemStateDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + DataType dt; + DataFormat df; + U32 iB, iX; + U32 hDim = rnncellDesc.numOutput; + if (inputDesc.nDims == 2) { + CHECK_STATUS(tensor2dGet(inputDesc, &dt, &df, &iB, &iX)); + } else if (inputDesc.nDims == 3) { + if (inputDesc.df != DF_MTK && inputDesc.df != DF_MKT) { + CHECK_STATUS(NOT_SUPPORTED); + } + U32 m, k, t; + get_nlp_mkt_val(inputDesc, &dt, &m, &k, &t); + iB = m; + iX = k; + } else { + return NOT_SUPPORTED; + } + + if (outputDesc) { + *outputDesc = inputDesc; + if (inputDesc.nDims == 2) { + (*outputDesc).dims[0] = hDim; + } + if (inputDesc.df == DF_MTK) { + (*outputDesc).dims[0] = hDim; + } + if (inputDesc.df == DF_MKT) { + (*outputDesc).dims[1] = hDim; + } + } + + // U32 item_c = forwardRunInfo->best_c[0]; + // U32 iX_align = (iX + item_c - 1) / item_c * item_c; + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + 1, 1, iX, 0, 0, 1, 1, hDim, dt, dt, gclmemInputDesc, gclmemOutputDesc)); + U32 hdim = rnncellDesc.numOutput; + U32 col = (rnncellDesc.numProjection > 0) ? rnncellDesc.numProjection : hDim; + U32 numState = col + (hdim + 3) / 4 * 4; + CHECK_STATUS( + infer_gclmem_desc_nchw(1, 1, numState, 0, 0, 0, 0, 0, dt, dt, gclmemStateDesc, NULL)); + return SUCCESS; +} + +EE rnncell_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc xDesc, + TensorDesc filterDesc, + TensorDesc biasDesc, + RNNParamSpec rnncellDesc, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + ForwardRunInfoMali_t forwardRunInfo) +{ + if (forwardRunInfo == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + if (algorithm != CONVOLUTION_ALGORITHM_NULL) { + return SUCCESS; + } + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_enable_queue_profiling(handle)); + GCLMem_t currentX = gcl_create_gclmem(); + GCLMem_t state = gcl_create_gclmem(); + GCLMem_t filter0 = gcl_create_gclmem(); + GCLMem_t filter1 = gcl_create_gclmem(); + GCLMem_t bias = gcl_create_gclmem(); + GCLMem_t tmpbuf = gcl_create_gclmem(); + GCLMem_t currentH = gcl_create_gclmem(); + + std::vector runInfos; + ForwardRunInfoMali runInfo; + runInfo.algorithm = (I32)CONVOLUTION_ALGORITHM_DIRECT; + std::vector currentXMemDescs; + std::vector stateMemDescs; + std::vector currentHMemDescs; + std::vector filterMemDescs; + std::vector filterMemProDescs; + U32 configInfo[3][64]; + U32 configNum = 3; + U32 bytes = 0; + U32 maxBytes = 0; + U32 maxCurrentXSize = 0; + U32 maxStateSize = 0; + U32 maxCurrentHSize = 0; + U32 maxFilterSize = 0; + U32 hDim = rnncellDesc.numOutput; + U32 col = (rnncellDesc.numProjection > 0) ? rnncellDesc.numProjection : hDim; + U32 biasNum = col * 4; + U32 stride[3] = {0, 0, 0}; + U32 offset[3] = {0, 0, 0}; + DataType dt = xDesc.dt; + bool useProject = (rnncellDesc.numProjection > 0) ? 
true : false;
+    for (U32 i = 0; i < configNum; ++i) {
+        configInfo[0][i] = 1;
+        configInfo[1][i] = 1 << (2 + i);
+        configInfo[2][i] = 0;
+        configInfo[0][i + configNum] = 1;
+        configInfo[1][i + configNum] = 1 << (2 + i);
+        configInfo[2][i + configNum] = 0;
+    }
+
+    for (U32 i = 0; i < configNum; ++i) {
+        GCLMemDesc currentXMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+        GCLMemDesc stateMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCHW);
+        GCLMemDesc currentHMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+        GCLMemDesc filterMemDesc[2];
+        filterMemDesc[0] = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+        filterMemDesc[1] = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+        runInfo.best_w[0] = configInfo[0][i];
+        runInfo.best_c[0] = configInfo[1][i];
+        runInfo.best_k[0] = configInfo[2][i];
+        runInfo.best_w[1] = configInfo[0][i + configNum];
+        runInfo.best_c[1] = configInfo[1][i + configNum];
+        runInfo.best_k[1] = configInfo[2][i + configNum];
+        if (rnncell_infer_output_size_mali(xDesc, rnncellDesc, NULL, &currentXMemDesc,
+                &stateMemDesc, &currentHMemDesc) != SUCCESS) {
+            continue;
+        }
+        if (rnn_transform_filter_bytes_mali(
+                filterDesc, rnncellDesc, filterMemDesc, &bytes, &runInfo) != SUCCESS) {
+            continue;
+        }
+        if (maxBytes < bytes) {
+            maxBytes = bytes;
+        }
+        if (rnncell_infer_forward_tmp_bytes_mali(
+                xDesc, filterDesc, hDesc, rnncellDesc, &bytes, &runInfo) != SUCCESS) {
+            continue;
+        }
+        if (maxBytes < bytes) {
+            maxBytes = bytes;
+        }
+        if (maxCurrentXSize < currentXMemDesc.byteSize) {
+            maxCurrentXSize = currentXMemDesc.byteSize;
+        }
+        if (maxStateSize < stateMemDesc.byteSize) {
+            maxStateSize = stateMemDesc.byteSize;
+        }
+        if (maxCurrentHSize < currentHMemDesc.byteSize) {
+            maxCurrentHSize = currentHMemDesc.byteSize;
+        }
+        if (maxFilterSize < filterMemDesc[0].byteSize) {
+            maxFilterSize = filterMemDesc[0].byteSize;
+        }
+        if (maxFilterSize < filterMemDesc[1].byteSize) {
+            maxFilterSize = filterMemDesc[1].byteSize;
+        }
+        currentXMemDescs.push_back(currentXMemDesc);
+        stateMemDescs.push_back(stateMemDesc);
+        currentHMemDescs.push_back(currentHMemDesc);
+        filterMemDescs.push_back(filterMemDesc[0]);
+        filterMemProDescs.push_back(filterMemDesc[1]);
+        runInfos.push_back(runInfo);
+    }
+
+    U32 algosNum = runInfos.size();
+    if (algosNum == 0) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    currentXMemDescs[0].byteSize = maxCurrentXSize;
+    stateMemDescs[0].byteSize = maxStateSize;
+    currentHMemDescs[0].byteSize = maxCurrentHSize;
+    filterMemDescs[0].byteSize = maxFilterSize;
+    filterMemProDescs[0].byteSize = maxFilterSize;
+
+    currentX->desc = currentXMemDescs[0];
+    state->desc = stateMemDescs[0];
+    currentH->desc = currentHMemDescs[0];
+    filter0->desc = filterMemDescs[0];
+    filter1->desc = filterMemProDescs[0];
+    bias->desc.stride[0] = biasNum;
+    bias->desc.stride[1] = 1;
+    bias->desc.stride[2] = 1;
+    bias->desc.offset[0] = 0;
+    bias->desc.offset[1] = 0;
+    bias->desc.offset[2] = 0;
+    bias->desc.num = biasNum;
+    bias->desc.memFormat = DF_NHWC;
+    bias->desc.byteSize = biasNum * bytesOf(dt);
+    bias->desc.memType = GCL_MEM_BUF;
+    tmpbuf->desc.byteSize = maxBytes;
+    gcl_create_memory(handle, currentX);
+    gcl_create_memory(handle, state);
+    gcl_create_memory(handle, currentH);
+    gcl_create_memory(handle, filter0);
+    gcl_create_memory(handle, filter1);
+    gcl_create_memory(handle, bias);
+    if (maxBytes) {
+        gcl_create_memory(handle, tmpbuf);
+    }
+
+    U32 runKernelBe = 0;
+    U32 runKernelEnd = 0;
+    double minTime = DBL_MAX;
+    double minTimePro = DBL_MAX;
+    ForwardRunInfoMali bestRunInfo;
+    for (U32 i = 0; i < algosNum; i++) {
+        currentX->desc = currentXMemDescs[i];
+        state->desc = stateMemDescs[i];
+        currentH->desc = currentHMemDescs[i];
+        filter0->desc = filterMemDescs[i];
+        filter1->desc = filterMemProDescs[i];
+        GCLMem filter[2];
+        filter[0] = *filter0;
+        filter[1] = *filter1;
+
+        runKernelBe = handle->kernelVec->size() + 1;
+        runKernelEnd = handle->kernelVec->size() + 2;
+        if (rnncell_mali(handle, xDesc, currentX, filterDesc, filter, biasDesc, bias, state,
+                rnncellDesc, batchStrideX, batchStrideH, maxBytes, tmpbuf, hDesc, currentH,
+                &runInfos[i]) == SUCCESS) {
+            gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd);
+            if (minTime > handle->t_execute) {
+                minTime = handle->t_execute;
+                bestRunInfo.algorithm = runInfos[i].algorithm;
+                bestRunInfo.best_w[0] = runInfos[i].best_w[0];
+                bestRunInfo.best_c[0] = runInfos[i].best_c[0];
+                bestRunInfo.best_k[0] = runInfos[i].best_k[0];
+            }
+            if (useProject) {
+                runKernelBe += 2;
+                runKernelEnd += 2;
+                gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd);
+                if (minTimePro > handle->t_execute) {
+                    minTimePro = handle->t_execute;
+                    bestRunInfo.algorithm = runInfos[i].algorithm;
+                    bestRunInfo.best_w[1] = runInfos[i].best_w[1];
+                    bestRunInfo.best_c[1] = runInfos[i].best_c[1];
+                    bestRunInfo.best_k[1] = runInfos[i].best_k[1];
+                }
+            }
+        }
+    }
+    if (minTime == DBL_MAX) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (useProject && minTimePro == DBL_MAX) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    *forwardRunInfo = bestRunInfo;
+    CHECK_STATUS(gcl_finish(handle));
+    gcl_destroy_gclmem(currentX);
+    gcl_destroy_gclmem(state);
+    gcl_destroy_gclmem(currentH);
+    gcl_destroy_gclmem(filter0);
+    gcl_destroy_gclmem(filter1);
+    gcl_destroy_gclmem(bias);
+    gcl_destroy_gclmem(tmpbuf);
+    runInfos.clear();
+    currentXMemDescs.clear();
+    stateMemDescs.clear();
+    currentHMemDescs.clear();
+    filterMemDescs.clear();
+    filterMemProDescs.clear();
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
+    CHECK_STATUS(gcl_off_queue_profiling(handle));
+    return SUCCESS;
+}
+
+EE rnn_transform_filter_bytes_mali(TensorDesc filterDesc,
+    RNNParamSpec rnnParamSpec,
+    GCLMemDesc_t gclmemFilterDesc,
+    U32 *bytes,
+    ForwardRunInfoMali_t forwardRunInfo)
+{
+    EE ret = SUCCESS;
+    switch (filterDesc.dt) {
+        case DT_F16: {
+            ret = rnn_transform_filter_bytes_mali_fp16(
+                filterDesc, rnnParamSpec, gclmemFilterDesc, bytes, forwardRunInfo);
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
+
+EE rnn_transform_filter_mali(GCLHandle_t handle,
+    TensorDesc filterDesc,
+    GCLMem_t filter,
+    RNNParamSpec rnnParamSpec,
+    TensorDesc *fltmemDesc,
+    GCLMem_t fltmem,
+    ForwardRunInfoMali_t forwardRunInfo)
+{
+    EE ret = SUCCESS;
+    switch (filterDesc.dt) {
+        case DT_F16: {
+            ret = rnn_transform_filter_mali_fp16(
+                handle, filterDesc, filter, rnnParamSpec, fltmemDesc, fltmem, forwardRunInfo);
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
+
+EE rnncell_infer_forward_tmp_bytes_mali(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    RNNParamSpec rnncellDesc,
+    U32 *bytes,
+    ForwardRunInfoMali_t forwardRunInfo)
+{
+    EE ret = SUCCESS;
+    switch (inputDesc.dt) {
+        case DT_F16: {
+            ret = rnncell_infer_forward_tmp_bytes_mali_fp16(
+                inputDesc, filterDesc, outputDesc, rnncellDesc, bytes, forwardRunInfo);
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
+
+EE rnncell_mali(GCLHandle_t handle,
+    TensorDesc xDesc,
+    const GCLMem_t currentX,
+    TensorDesc filterDesc,
+    GCLMem_t filter,
+    TensorDesc biasDesc,
+    GCLMem_t bias,
+    GCLMem_t state,
RNNParamSpec rnncellDesc, + U32 batchStrideX, + U32 batchStrideH, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc hDesc, + GCLMem_t output, + ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + ret = rnncell_checkpara_mali(handle, xDesc, currentX, filterDesc, filter, bias, state, + rnncellDesc, tmpBuf, hDesc, output); + switch (xDesc.dt) { + case DT_F16: { + ret = rnncell_mali_fp16(handle, xDesc, currentX, filterDesc, filter, biasDesc, bias, + state, tmpBytes, tmpBuf, rnncellDesc, batchStrideX, batchStrideH, hDesc, output, + forwardRunInfo); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/scale.cpp b/compute/tensor/src/gpu/mali/scale.cpp new file mode 100644 index 00000000..44cfc8b3 --- /dev/null +++ b/compute/tensor/src/gpu/mali/scale.cpp @@ -0,0 +1,90 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
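The shape checks in rnncell_checkpara_mali earlier in this file are what a fused LSTM gate matrix implies: the weights multiply the concatenated [x_t; h_(t-1)] vector and emit four stacked gates, each of width col (the projection size when numProjection > 0, otherwise numOutput). An illustrative restatement, where `spec` stands in for the RNNParamSpec argument:

    // Sketch only: the dimensions rnncell_checkpara_mali asserts.
    U32 col = (spec.numProjection > 0) ? spec.numProjection : spec.numOutput;
    U32 expectCol = iX + spec.numOutput;  // filterCol: [x_t ; h_(t-1)] concatenated
    U32 expectRow = 4 * col;              // filterRow: i, f, g, o gates stacked row-wise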
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/scale_mali_fp16.h" + +EE scale_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + + if (idf == DF_NCHW) { + if (outputDesc) { + *outputDesc = inputDesc; + } + U32 ih_align = (ih + 1) / 2 * 2; + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih_align, ic, 0, 0, iw, ih_align, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + if (gclmemInputDesc && gclmemOutputDesc) { + *gclmemOutputDesc = *gclmemInputDesc; // the input and output mem maybe the same + } + return SUCCESS; + } + return NOT_SUPPORTED; +} + +inline EE scale_checkpara_mali(GCLHandle_t handle, + GCLMem_t alpha, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (handle == nullptr || nullptr == alpha || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + // if(inputDesc.df != outputDesc.df || inputDesc.df != DF_NCHW) return NOT_SUPPORTED; + if (input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE scale_mali(GCLHandle_t handle, + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(scale_checkpara_mali(handle, alpha, inputDesc, input, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = scale_mali_fp16(handle, alpha, beta, inputDesc, input, outputDesc, output); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/slice.cpp b/compute/tensor/src/gpu/mali/slice.cpp new file mode 100644 index 00000000..3b903a63 --- /dev/null +++ b/compute/tensor/src/gpu/mali/slice.cpp @@ -0,0 +1,131 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
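For reference, the per-channel arithmetic that scale_mali above computes is y = alpha[c] * x + beta[c], as its per-channel alpha/beta buffers suggest. A CPU sketch of the same semantics for a single NCHW batch (illustrative only, not the GPU path):

    // beta is optional; a missing beta contributes nothing.
    for (U32 c = 0; c < ic; ++c) {
        for (U32 hw = 0; hw < ih * iw; ++hw) {
            U32 idx = c * ih * iw + hw;
            out[idx] = alpha[c] * in[idx] + ((beta != nullptr) ? beta[c] : 0.0f);
        }
    }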
+
+#include "sys.h"
+#include "types.h"
+#include "tensor_desc.h"
+#include "error.h"
+#include "gpu/mali/tensor_computing_mali.h"
+#include "gpu/mali/fp16/slice_mali_fp16.h"
+
+EE slice_infer_output_size_mali(TensorDesc inputDesc,
+    SliceParamSpec p,
+    std::vector<TensorDesc> *outputDesc,
+    GCLMemDesc_t gclmemInputDesc,
+    GCLMemDesc_t gclmemOutputDesc)
+{
+    if (outputDesc == NULL) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    int axis = p.axis;
+    int *slice_points = p.slice_points;
+    U32 num = outputDesc->size();
+    axis = (axis + inputDesc.nDims) % inputDesc.nDims;
+    I32 target_axis = inputDesc.nDims - 1 - axis;
+    for (U32 i = 0; i < num; i++) {
+        (*outputDesc)[i] = inputDesc;
+
+        I32 prev_point = 0;
+        if (i > 0) {
+            prev_point = slice_points[i - 1];
+        }
+        I32 next_point = inputDesc.dims[target_axis];
+        if (i < num - 1) {
+            next_point = slice_points[i];
+        }
+        if (prev_point < 0) {
+            prev_point = (prev_point + inputDesc.dims[target_axis]) % inputDesc.dims[target_axis];
+        }
+        if (next_point < 0) {
+            next_point = (next_point + inputDesc.dims[target_axis]) % inputDesc.dims[target_axis];
+        }
+        (*outputDesc)[i].dims[target_axis] = next_point - prev_point;
+    }
+    if (inputDesc.df == DF_MKT) {
+        if (axis == 2) {  // slice on T
+            DataType dt;
+            U32 m, k, t;
+            U32 gw, gh, gc;
+            get_nlp_mkt_val(inputDesc, &dt, &m, &k, &t);
+            map_nlp_mkt_to_ncwhc4(m, k, t, &gw, &gh, &gc);
+            CHECK_STATUS(infer_gclmem_desc_ncwhc4(
+                gw, gh, gc * 4, 0, 0, 0, 0, 0, dt, dt, gclmemInputDesc, NULL));
+            if (gclmemOutputDesc) {
+                for (U32 i = 0; i < num; ++i) {
+                    get_nlp_mkt_val((*outputDesc)[i], NULL, &m, &k, &t);
+                    map_nlp_mkt_to_ncwhc4(m, k, t, &gw, &gh, &gc);
+                    CHECK_STATUS(infer_gclmem_desc_ncwhc4(
+                        0, 0, 0, 0, 0, gw, gh, gc * 4, dt, dt, NULL, &gclmemOutputDesc[i]));
+                }
+            }
+        }
+        return SUCCESS;
+    }
+    return NOT_SUPPORTED;
+}
+
+inline EE slice_checkpara_mali(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    GCLMem_t input,
+    SliceParamSpec p,
+    std::vector<TensorDesc> outputDesc,
+    std::vector<void *> *output)
+{
+    if (handle == nullptr || input == nullptr) {
+        return NULL_POINTER;
+    }
+    if (input->desc.memFormat != DF_NCWHC4) {
+        return NOT_SUPPORTED;
+    }
+    for (auto mem : (*output)) {
+        if (mem == nullptr) {
+            return NULL_POINTER;
+        }
+        if (((GCLMem_t)mem)->desc.memFormat != input->desc.memFormat) {
+            return NOT_MATCH;
+        }
+    }
+    if (inputDesc.df != DF_MKT) {
+        return NOT_SUPPORTED;
+    }
+    if (inputDesc.df == DF_MKT && p.axis != 2) {
+        return NOT_SUPPORTED;
+    }
+    for (auto desc : outputDesc) {
+        if (desc.df != inputDesc.df) {
+            return NOT_MATCH;
+        }
+    }
+    return SUCCESS;
+}
+
+EE slice_mali(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    GCLMem_t input,
+    SliceParamSpec p,
+    std::vector<TensorDesc> outputDesc,
+    std::vector<void *> *output)
+{
+    EE ret = SUCCESS;
+    CHECK_STATUS(slice_checkpara_mali(handle, inputDesc, input, p, outputDesc, output));
+    switch (inputDesc.dt) {
+        case DT_F16: {
+            ret = slice_mali_fp16(handle, inputDesc, input, p, outputDesc, output);
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/gpu/mali/softmax.cpp b/compute/tensor/src/gpu/mali/softmax.cpp
new file mode 100644
index 00000000..4b521347
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/softmax.cpp
@@ -0,0 +1,131 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/softmax_mali_fp16.h" + +EE softmax_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + if (outputDesc) { + *outputDesc = inputDesc; + } + + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + if (gclmemInputDesc) { + if (gclmemInputDesc->memFormat == DF_NCHW) { + U32 iw_align = (iw + 3) / 4 * 4; + if ((iw == 1 && ic == 1) || (iw == 1 && ih == 1)) { + iw_align = 1; + } + bool need_pad = false; + if (iw_align != iw) { + need_pad = true; + } + CHECK_STATUS(infer_gclmem_desc_nchw(iw_align, ih, ic, 0, 0, iw_align, ih, ic, idt, idt, + gclmemInputDesc, gclmemOutputDesc, need_pad)); + } else if (gclmemInputDesc->memFormat == DF_NCWHC4) { + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + } + return SUCCESS; +} + +inline EE softmax_checkpara_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + SoftmaxParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + if (input->desc.memFormat != output->desc.memFormat) { + return NOT_SUPPORTED; + } + if (inputDesc.df != outputDesc.df) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[0] != outputDesc.dims[0]) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[1] != outputDesc.dims[1]) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[2] != outputDesc.dims[2]) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[3] != outputDesc.dims[3]) { + return NOT_SUPPORTED; + } + if (output->desc.memFormat != DF_NCWHC4 && output->desc.memFormat != DF_NCHW) { + return NOT_SUPPORTED; + } + if (p.axis != 1 && p.axis != 3 && p.axis != -1) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE softmax_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + SoftmaxParamSpec p, + GCLMem_t tmp, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(softmax_checkpara_mali(handle, inputDesc, input, p, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + 
ret = softmax_mali_fp16(handle, inputDesc, input, tmp, p.axis, outputDesc, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE softmax_infer_forward_tmp_bytes_mali( + TensorDesc inputDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = softmax_infer_forward_tmp_bytes_mali_fp16(inputDesc, bytes, forwardRunInfo); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/space2depth.cpp b/compute/tensor/src/gpu/mali/space2depth.cpp new file mode 100644 index 00000000..ab75ddff --- /dev/null +++ b/compute/tensor/src/gpu/mali/space2depth.cpp @@ -0,0 +1,142 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
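+
+// space2depth.cpp implements the GPU (Mali) space-to-depth operator. A uchar
+// NCHW input with a single channel is repacked so that each 4x4 spatial tile
+// becomes 16 fp16 output channels: (n, 1, ih, iw) -> (n, 16, ih / 4, iw / 4).
+// The uchar-to-fp16 value conversion and the exact tile-to-channel ordering
+// are defined inside the "space2depth" OpenCL kernel.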
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" + +inline EE space2depth_checkpara_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + if (input->desc.memFormat != DF_NCHW) { + return NOT_SUPPORTED; + } + if (output->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + if (inputDesc.df != DF_NCHW) { + return NOT_SUPPORTED; + } + if (outputDesc.df != DF_NCHW) { + return NOT_SUPPORTED; + } + if (inputDesc.dt != DT_U8) { + return NOT_SUPPORTED; + } + if (outputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[0] != outputDesc.dims[0] * 4) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[1] != outputDesc.dims[1] * 4) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[2] != outputDesc.dims[2] / 16) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[3] != outputDesc.dims[3]) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE space2depth_core_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + UNUSED(outputDesc); + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + U32 iw_str, ih_str, iw_off, ih_off; + iw_str = input->desc.stride[0]; + ih_str = input->desc.stride[1]; + iw_off = input->desc.offset[0]; + ih_off = input->desc.offset[1]; + U32 ow_str, oh_str, ow_off, oh_off, ohw_str; + oh_str = output->desc.stride[0]; + ow_str = output->desc.stride[1]; + oh_off = output->desc.offset[0]; + ow_off = output->desc.offset[1]; + ohw_str = oh_str * ow_str; + + cl_mem inbuf, outbuf; + inbuf = input->mem; + outbuf = output->mem; + + U32 gs[3] = {(ih + 3) / 4, (iw + 3) / 4}; + U32 ls[3] = {0, 0}; + U32 dim = 2; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "space2depth", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, oh_str, ohw_str, ow_off, + oh_off, gs[0], gs[1], inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "space2depth"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "space2depth")); + CHECK_STATUS(gcl_print_memory(handle, input, "space2depth_input")); + CHECK_STATUS(gcl_print_memory(handle, output, "space2depth_output")); +#endif + return SUCCESS; +} + +EE space2depth_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + *outputDesc = inputDesc; + + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + U32 ow, oh, oc, on; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + if (idt != DT_U8) { + return NOT_SUPPORTED; + } + if (ic != 1) { + return NOT_SUPPORTED; + } + on = in; + oc = ic * 16; + oh = ih / 4; + ow = iw / 4; + + if (idf == DF_NCHW) { + if (outputDesc) { + *outputDesc = tensor4df(DT_F16, idf, on, oc, oh, ow); + } + CHECK_STATUS( + infer_gclmem_desc_nchw(iw, ih, ic, 0, 0, 0, 0, 0, DT_U8, DT_U8, gclmemInputDesc, NULL)); + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + 0, 0, 0, 0, 0, ow, oh, oc, DT_F16, DT_F16, NULL, gclmemOutputDesc)); + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE space2depth_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + EE ret = SUCCESS; + 
CHECK_STATUS(space2depth_checkpara_mali(handle, inputDesc, input, outputDesc, output)); + CHECK_STATUS(space2depth_core_mali_fp16(handle, inputDesc, input, outputDesc, output)); + return ret; +} diff --git a/compute/tensor/src/gpu/mali/squeeze.cpp b/compute/tensor/src/gpu/mali/squeeze.cpp new file mode 100644 index 00000000..e4f980ef --- /dev/null +++ b/compute/tensor/src/gpu/mali/squeeze.cpp @@ -0,0 +1,96 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/squeeze_mali_fp16.h" + +EE squeeze_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + if (outputDesc) { + *outputDesc = inputDesc; + } + + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + if (gclmemInputDesc && gclmemOutputDesc) { + *gclmemOutputDesc = *gclmemInputDesc; + } + return SUCCESS; +} + +inline EE squeeze_checkpara_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + if (input->desc.memFormat != output->desc.memFormat) { + return NOT_SUPPORTED; + } + if (inputDesc.df != outputDesc.df) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[0] != outputDesc.dims[0]) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[1] != outputDesc.dims[1]) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[2] != outputDesc.dims[2]) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[3] != outputDesc.dims[3]) { + return NOT_SUPPORTED; + } + if (outputDesc.df != DF_NCHW) { + return NOT_SUPPORTED; + } + if (output->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE squeeze_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(squeeze_checkpara_mali(handle, inputDesc, input, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = squeeze_mali_fp16(handle, inputDesc, input, 
outputDesc, output); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/tensor_computing_mali.h b/compute/tensor/src/gpu/mali/tensor_computing_mali.h new file mode 100644 index 00000000..a750b321 --- /dev/null +++ b/compute/tensor/src/gpu/mali/tensor_computing_mali.h @@ -0,0 +1,785 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_TENSOR_COMPUTING_MALI +#define _H_TENSOR_COMPUTING_MALI +#include "types.h" +#include "tensor_computing_type.h" + +EE pooling_infer_forward_tmp_bytes_mali( + TensorDesc inputDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo); + +EE pooling_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PoolingParamSpec poolingParamSpec, + const void *scale, + GCLMem_t temp, + TensorDesc outputDesc, + GCLMem_t output); + +EE pooling_infer_output_size_mali(TensorDesc inputDesc, + PoolingParamSpec poolingParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE padding_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PadParamSpec padParamSpec, + TensorDesc outputDesc, + GCLMem_t output); + +EE padding_infer_output_size_mali(TensorDesc inputDesc, + PadParamSpec padParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE convolution_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc inputDesc, + TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc outputDesc, + GCLMemDesc inputMemDesc, + GCLMemDesc outputMemDesc, + ConvolutionPolicy policy, + ActivationMode activationMode, + ForwardRunInfoMali_t forwardRunInfo); + +EE convolution_transform_filter_bytes_mali(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE convolution_transform_filter_mali(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + GCLMem_t tmp, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE convolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + 
ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE convolution_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc scaleDesc, + const GCLMem_t scale, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); + +EE depthwise_pointwise_convolution_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE depthwise_pointwise_convolution_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode, + ForwardRunInfoMali_t forwardRunInfo); + +EE depthwise_pointwise_convolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE depthwise_pointwise_convolution_transform_filter_bytes_mali(TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemDwFilterDesc, + GCLMemDesc_t gclmemPwFilterDesc, + U32 *bytes); + +EE depthwise_pointwise_convolution_transform_filter_mali(GCLHandle_t handle, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + GCLMem_t dwFilter, + GCLMem_t pwFilter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *dwfltmemDesc, + TensorDesc *pwfltmemDesc, + GCLMem_t dwfltmem, + GCLMem_t pwfltmem); + +EE depthwise_pointwise_convolution_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode); + +EE depthwise_convolution_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE depthwise_convolution_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + ActivationMode depthwiseActivationMode, + ForwardRunInfoMali_t forwardRunInfo); + +EE depthwise_convolution_transform_filter_bytes_mali(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE depthwise_convolution_transform_filter_mali(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE depthwise_convolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc 
outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE depthwise_convolution_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode); + +EE deconvolution_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE deconvolution_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc inputDesc, + TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc outputDesc, + ConvolutionPolicy policy, + ActivationMode activationMode, + ForwardRunInfoMali_t forwardRunInfo); + +EE deconvolution_transform_filter_bytes_mali(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE deconvolution_transform_filter_mali(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + GCLMem_t tmp, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE deconvolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE deconvolution_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc scaleDesc, + const GCLMem_t scale, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); + +EE bilateral_slice_apply_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc guideDesc, + TensorDesc gridDesc, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemGuideDesc, + GCLMemDesc_t gclmemGridDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE bilateral_slice_apply_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc guideDesc, + TensorDesc gridDesc, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE bilateral_slice_apply_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc guideDesc, + const GCLMem_t guide, + TensorDesc gridDesc, + const GCLMem_t grid, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output); + +EE eltwise_infer_output_size_mali(std::vector inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE eltwise_mali(GCLHandle_t handle, + std::vector inputDesc, + std::vector input, + EltwiseParamSpec eltwiseDesc, + TensorDesc outputDesc, + GCLMem_t output); + +EE softmax_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE softmax_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + SoftmaxParamSpec p, + GCLMem_t tmp, + 
TensorDesc outputDesc, + GCLMem_t output); + +EE softmax_infer_forward_tmp_bytes_mali( + TensorDesc inputDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo); + +EE activation_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE activation_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); + +EE fully_connected_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE fully_connected_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc inputDesc, + TensorDesc filterDesc, + std::vector outputDescs, + ForwardRunInfoMali_t forwardRunInfo); + +EE fully_connected_transform_filter_bytes_mali(TensorDesc filterDesc, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE fully_connected_transform_filter_mali(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc *fltmemDesc, + std::vector fltmem, + ForwardRunInfoMali_t forwardRunInfo); + +EE fully_connected_infer_forward_tmp_bytes_mali( + TensorDesc inputDesc, TensorDesc filterDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo); + +EE fully_connected_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + std::vector *filter, + TensorDesc biasDesc, + std::vector *bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + std::vector *output, + ForwardRunInfoMali_t forwardRunInfo); + +EE scale_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE scale_mali(GCLHandle_t handle, + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output); + +EE prelu_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE prelu_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + GCLMem_t output); + +EE concat_infer_output_size_mali(std::vector inputDesc, + ConcatParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE concat_infer_forward_tmp_bytes_mali(std::vector inputDesc, U32 *bytes); + +EE concat_mali(GCLHandle_t handle, + std::vector inputDesc, + std::vector input, + GCLMem_t inputScale, + ConcatParamSpec p, + GCLMem_t tmpbuf, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t outputScale); + +EE clip_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE clip_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ClipParamSpec p, + TensorDesc outputDesc, + GCLMem_t output); + +EE squeeze_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE squeeze_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output); + +EE unsqueeze_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE unsqueeze_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, 
TensorDesc outputDesc, GCLMem_t output); + +EE reshape_infer_output_size_mali(TensorDesc inputDesc, + ReshapeParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE reshape_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + U32 *bytes); + +EE reshape_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t tmpbuf, + TensorDesc outputDesc, + GCLMem_t output); + +EE space2depth_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE space2depth_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output); + +EE depth2space_infer_output_size_mali(TensorDesc inputDesc, + Depth2SpaceParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE depth2space_infer_tmpBuf_size_mali( + TensorDesc inputDesc, Depth2SpaceParamSpec p, TensorDesc outputDesc, U32 *bytes); + +EE depth2space_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + Depth2SpaceParamSpec p, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output); + +EE embedding_infer_output_size_mali(TensorDesc inputDesc, + EmbedParamSpec p, + DataType dt, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE embedding_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc weightDesc, + GCLMem_t weight, + EmbedParamSpec p, + TensorDesc outputDesc, + GCLMem_t output); + +EE normalization_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE layer_normalization_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc outputDesc, + GCLMem_t output); + +EE matmul_infer_output_size_mali(TensorDesc matrixADesc, + bool transposeA, + TensorDesc matrixBDesc, + bool transposeB, + TensorDesc *matrixCDesc, + GCLMemDesc_t gclmemMatrixADesc, + GCLMemDesc_t gclmemMatrixBDesc, + GCLMemDesc_t gclmemMatrixCDesc); + +EE matmul_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc matrixADesc, + bool TransposeA, + TensorDesc matrixBDesc, + bool TransposeB, + TensorDesc matrixCDesc, + ForwardRunInfoMali_t forwardRunInfo); + +EE matmul_infer_forward_tmp_bytes_mali(TensorDesc matrixADesc, + bool transposeA, + TensorDesc matrixBDesc, + bool transposeB, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE matmul_mali(GCLHandle_t handle, + TensorDesc matrixADesc, + bool transposeA, + const GCLMem_t matrixA, + TensorDesc matrixBDesc, + bool transposeB, + const GCLMem_t matrixB, + GCLMem_t tmp, + TensorDesc matrixCDesc, + GCLMem_t matrixC, + ForwardRunInfoMali_t forwardRunInfo); + +EE power_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE power_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + PowerParamSpec p, + TensorDesc outputDesc, + GCLMem_t output); + +EE transpose_infer_output_size_mali(TensorDesc inputDesc, + TransposeParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE transpose_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t 
gclmemOutputDesc, + U32 *bytes); + +EE transpose_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TransposeParamSpec p, + GCLMem_t tmpbuf, + TensorDesc outputDesc, + GCLMem_t output); + +EE slice_infer_output_size_mali(TensorDesc inputDesc, + SliceParamSpec p, + std::vector *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE slice_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + SliceParamSpec p, + std::vector outputDesc, + std::vector *output); + +EE rnncell_infer_output_size_mali(TensorDesc inputDesc, + RNNParamSpec rnnParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemStateDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE rnncell_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc xDesc, + TensorDesc filterDesc, + TensorDesc biasDesc, + RNNParamSpec rnncellDesc, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + ForwardRunInfoMali_t forwardRunInfo); + +EE rnn_transform_filter_bytes_mali(TensorDesc filterDesc, + RNNParamSpec rnnParamSpec, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE rnn_transform_filter_mali(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + RNNParamSpec rnnParamSpec, + TensorDesc *fltmemDesc, + GCLMem_t fltmem, + ForwardRunInfoMali_t forwardRunInfo); + +EE rnncell_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + RNNParamSpec rnncellDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE rnncell_mali(GCLHandle_t handle, + TensorDesc xDesc, + const GCLMem_t currentX, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc biasDesc, + GCLMem_t bias, + GCLMem_t state, + RNNParamSpec rnncellDesc, + U32 batchStrideX, + U32 batchStrideH, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc hDesc, + GCLMem_t output, + ForwardRunInfoMali_t forwardRunInfo); + +EE argmax_infer_output_size_mali(TensorDesc inputDesc, + ArgMaxParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE argmax_infer_forward_tmp_bytes_mali( + TensorDesc inputDesc, ArgMaxParamSpec p, TensorDesc outputDesc, U32 *bytes); + +EE argmax_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ArgMaxParamSpec p, + GCLMem_t tmpbuf, + TensorDesc outputDesc, + GCLMem_t output); + +EE preallocated_memory_infer_output_size_mali(TensorDesc *outputDesc, GCLMemDesc_t gclmemOutputDesc); + +EE preallocated_memory_mali(GCLHandle_t handle, TensorDesc outputDesc, GCLMem_t output); + +EE copy_infer_output_size_mali(std::vector inputDesc, GCLMemDesc_t gclmemInputDesc); + +EE copy_mali(GCLHandle_t handle, + std::vector inputDesc, + std::vector input, + U32 srcOffset, + U32 dstOffset, + U32 srcStride, + U32 dstStride, + U32 length); + +EE check_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputADesc, + GCLMemDesc_t gclmemInputBDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE check_mali(GCLHandle_t handle, + TensorDesc inputDescA, + GCLMem_t inputA, + TensorDesc inputDescB, + GCLMem_t inputB, + CheckParamSpec p, + TensorDesc outputDesc, + GCLMem_t output); + +EE multihead_attention_infer_output_size_mali(TensorDesc inputDesc, + std::vector filterDesc, + TensorDesc *outputDesc, + U32 *firstFCSliceNum, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + ForwardRunInfoMali_t forwardRunInfo); + +EE multihead_attention_infer_forward_algorithm_mali(GCLHandle_t 
handle, + TensorDesc inputDesc, + std::vector filterDesc, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector eltwiseWithLayerNormIn, + ActivationMode activation, + TensorDesc outputDesc, + ForwardRunInfoMali_t forwardRunInfo); + +EE multihead_attention_transform_filter_bytes_mali(std::vector filterDesc, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE multihead_attention_transform_filter_mali(GCLHandle_t handle, + std::vector filterDesc, + std::vector filter, + std::vector *fltmemDesc, + std::vector fltmem, + ForwardRunInfoMali_t forwardRunInfo); + +EE multihead_attention_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + std::vector filterDesc, + std::vector eltwiseWithLayerNormIn, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE multihead_attention_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + std::vector filterDesc, + std::vector filter, + std::vector biasDesc, + std::vector bias, + std::vector layerNormAlpha, + std::vector layerNormBeta, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector eltwiseWithLayerNormIn, + ActivationMode activation, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ForwardRunInfoMali_t forwardRunInfo); + +EE channel_resize_infer_output_size_mali(TensorDesc inputDesc, + ChannelResizeParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE channel_resize_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ChannelResizeParamSpec p, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/transpose.cpp b/compute/tensor/src/gpu/mali/transpose.cpp new file mode 100644 index 00000000..568cddce --- /dev/null +++ b/compute/tensor/src/gpu/mali/transpose.cpp @@ -0,0 +1,114 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
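+
+// transpose.cpp implements the GPU (Mali) transpose operator. The output shape
+// is the input shape permuted by p.trans_dims; only fp16 data is supported, and
+// a temporary buffer sized by transpose_infer_forward_tmp_bytes_mali is passed
+// through to the fp16 kernel.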
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/transpose_mali_fp16.h" + +EE transpose_infer_output_size_mali(TensorDesc inputDesc, + TransposeParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + if (outputDesc == nullptr || gclmemInputDesc == nullptr || gclmemOutputDesc == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + U32 *dim = p.trans_dims; + U32 dimTran[6] = {1, 1, 1, 1, 1, 1}; + U32 nDims = inputDesc.nDims; + for (U32 i = 0; i < nDims; ++i) { + dimTran[nDims - 1 - i] = inputDesc.dims[nDims - 1 - dim[i]]; + } + *outputDesc = inputDesc; + for (U32 i = 0; i < nDims; ++i) { + (*outputDesc).dims[i] = dimTran[i]; + } + + DataType idt; + DataType odt; + U32 iw, ih, ic, in, it; + U32 ow, oh, oc, on, ot; + tensorSelectGet(inputDesc, &idt, NULL, &in, &ic, &ih, &iw, &it); + tensorSelectGet(*outputDesc, &odt, NULL, &on, &oc, &oh, &ow, &ot); + if (gclmemInputDesc->byteSize == 0 || gclmemInputDesc->memFormat == DF_NCHW) { + CHECK_STATUS( + infer_gclmem_desc_nchw(iw, ih, ic * it, 0, 0, 0, 0, 0, idt, odt, gclmemInputDesc, NULL)); + } else { + ic = ALIGN(ic, 4); + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic * it, 0, 0, 0, 0, 0, idt, odt, gclmemInputDesc, NULL)); + } + CHECK_STATUS( + infer_gclmem_desc_nchw(0, 0, 0, 0, 0, ow, oh, oc * ot, idt, odt, NULL, gclmemOutputDesc)); + return SUCCESS; +} + +inline EE transpose_checkpara_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + if (handle == nullptr || input == nullptr || output == nullptr) { + return NULL_POINTER; + } + + if (inputDesc.df != outputDesc.df) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE transpose_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = transpose_infer_forward_tmp_bytes_mali_fp16( + inputDesc, outputDesc, gclmemInputDesc, gclmemOutputDesc, bytes); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE transpose_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TransposeParamSpec p, + GCLMem_t tmpbuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(transpose_checkpara_mali(handle, inputDesc, input, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = transpose_mali_fp16( + handle, inputDesc, input, outputDesc, output, tmpbuf, p.trans_dims); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.cpp b/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.cpp new file mode 100644 index 00000000..e9c87b61 --- /dev/null +++ b/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.cpp @@ -0,0 +1,130 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "error.h" +#include "gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h" + +inline EE bilateral_slice_apply_checkpara_mali_uchar( + TensorDesc inputDesc, TensorDesc guideDesc, TensorDesc gridDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_U8) { + return NOT_SUPPORTED; + } + if (gridDesc.dt != guideDesc.dt || gridDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE bilateral_slice_apply_core_mali_uchar(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc guideDesc, + const GCLMem_t guide, + TensorDesc gridDesc, + const GCLMem_t grid, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(guideDesc); + UNUSED(forwardRunInfo); + U32 iw, ih, ic, in; + U32 gw, gh, gc, gn; + U32 ow, oh, oc, on; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + tensorSelectGet(gridDesc, NULL, NULL, &gn, &gc, &gh, &gw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 coe = bilateralSliceApplyParamSpec.coefficient_len; + BilateralSliceApplyMode mode = bilateralSliceApplyParamSpec.mode; + U32 dep = gc / coe; + U32 gcw = gc * gw; + U32 wh = iw * ih; + F32 scale_x = (F32)gw / iw; + F32 scale_y = (F32)gh / ih; + Mem inbuf, gridbuf, guidebuf, outbuf, gridTran; + inbuf = input->mem; + gridbuf = grid->mem; + outbuf = output->mem; + gridTran = tmpBuf->mem; + if (mode == BSliceApply_NULL) { + guidebuf = guide->mem; + } else { + guidebuf = inbuf; + } + + U32 gs0[3] = {gc / 4, gw, ih}; + U32 ls0[3] = {0, 0, 0}; + U32 dim0 = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "bilateral_slice_apply_pre", &kernel)); + CHECK_STATUS( + gcl_set_kernelArgs(kernel, gh, gc, gcw, gs0[0], gs0[1], scale_y, gridbuf, gridTran)); + gcl_set_kernelVec(handle, kernel, dim0, gs0, ls0, "bilateral_slice_apply_pre"); + +#ifdef _DEBUG + CHECK_STATUS( + gcl_run_kernel_profiling(handle, kernel, dim0, gs0, ls0, "bilateral_slice_apply_pre")); + CHECK_STATUS(gcl_print_memory(handle, grid, "bilateral_slice_apply_grid")); +#endif + U32 gs[2] = {ow, oh}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + char kernelname[128]; + if (mode == BSliceApply_CONV) { + sprintf(kernelname, "bilateral_slice_apply_c12_conv_uchar"); + } else { + sprintf(kernelname, "bilateral_slice_apply_c12_uchar"); + } + 
CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw, wh, gc, gw, gh, gcw, dep, coe, gs[0], gs[1], + scale_x, scale_y, guidebuf, gridTran, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel_profiling(handle, kernel, dim, gs, ls, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, input, "bilateral_slice_apply_input")); + CHECK_STATUS(gcl_print_memory(handle, output, "bilateral_slice_apply_output")); + if (mode == BSliceApply_NULL) { + CHECK_STATUS(gcl_print_memory(handle, guide, "bilateral_slice_apply_guide")); + } +#endif + return SUCCESS; +} + +EE bilateral_slice_apply_mali_uchar(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc guideDesc, + const GCLMem_t guide, + TensorDesc gridDesc, + const GCLMem_t grid, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(tmpBytes); + CHECK_STATUS( + bilateral_slice_apply_checkpara_mali_uchar(inputDesc, guideDesc, gridDesc, outputDesc)); + CHECK_STATUS(bilateral_slice_apply_core_mali_uchar(handle, inputDesc, input, guideDesc, guide, + gridDesc, grid, bilateralSliceApplyParamSpec, forwardRunInfo, tmpBuf, outputDesc, output)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h b/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h new file mode 100644 index 00000000..c854698b --- /dev/null +++ b/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h @@ -0,0 +1,34 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
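+
+// Declarations for the uchar (8-bit image) variant of the bilateral grid
+// slice-apply operator; the implementation lives in
+// bilateral_slice_apply_mali_uchar.cpp.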
+ +#ifndef _BILATERAL_SLICE_APPLY_MALI_UCHAR +#define _BILATERAL_SLICE_APPLY_MALI_UCHAR +#include "sys.h" +#include "types.h" +#include "error.h" +#include "tensor_computing_type.h" + +EE bilateral_slice_apply_mali_uchar(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc guideDesc, + const GCLMem_t guide, + TensorDesc gridDesc, + const GCLMem_t grid, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/unsqueeze.cpp b/compute/tensor/src/gpu/mali/unsqueeze.cpp new file mode 100644 index 00000000..ccdc6d9b --- /dev/null +++ b/compute/tensor/src/gpu/mali/unsqueeze.cpp @@ -0,0 +1,102 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
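+
+// unsqueeze.cpp implements the GPU (Mali) unsqueeze operator. NCHW and MKT
+// inputs are kept in the NCWHC4 GPU memory format, and only fp16 data reaches
+// the compute kernel.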
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/unsqueeze_mali_fp16.h" + +EE unsqueeze_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + if (outputDesc) { + *outputDesc = inputDesc; + } + + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + if (inputDesc.df == DF_NCHW) { + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + } else if (inputDesc.df == DF_MKT) { + U32 m, k, t; + get_nlp_mkt_val(inputDesc, &idt, &m, &k, &t); + map_nlp_mkt_to_ncwhc4(m, k, t, &iw, &ih, &ic); + ic = ic * 4; + in = 1; + idf = DF_MKT; + } else { + return NOT_SUPPORTED; + } + + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + return SUCCESS; +} + +inline EE unsqueeze_checkpara_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + if (input->desc.memFormat != output->desc.memFormat) { + return NOT_SUPPORTED; + } + if (inputDesc.df != outputDesc.df) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[0] != outputDesc.dims[0]) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[1] != outputDesc.dims[1]) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[2] != outputDesc.dims[2]) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[3] != outputDesc.dims[3]) { + return NOT_SUPPORTED; + } + if (output->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE unsqueeze_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(unsqueeze_checkpara_mali(handle, inputDesc, input, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = unsqueeze_mali_fp16(handle, inputDesc, input, outputDesc, output); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/l2normalization.cpp b/compute/tensor/src/l2normalization.cpp new file mode 100644 index 00000000..ec3668c1 --- /dev/null +++ b/compute/tensor/src/l2normalization.cpp @@ -0,0 +1,65 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif + +inline EE l2normalization_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + if (tensorIs2d(inputDesc) || tensorIs3d(inputDesc)) { + *outputDesc = inputDesc; + } else if (tensorIs4d(inputDesc) && inputDesc.dims[0] == 1 && inputDesc.dims[1] == 1) { + *outputDesc = inputDesc; + } else { + CHECK_STATUS(NOT_MATCH); + } + return SUCCESS; +} + +EE l2normalization_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + UNUSED(archInfo); + CHECK_STATUS(l2normalization_infer_output_size_cpu(inputDesc, &outputDesc)); + outputTensor->resize(outputDesc); + return SUCCESS; +} + +EE l2normalization(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = l2normalization_cpu(inputDesc, input, outputDesc, output, arch); +#endif + } + return ret; +} diff --git a/compute/tensor/src/matmul.cpp b/compute/tensor/src/matmul.cpp new file mode 100644 index 00000000..670b3f1e --- /dev/null +++ b/compute/tensor/src/matmul.cpp @@ -0,0 +1,384 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
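+
+// matmul.cpp dispatches batched matrix multiplication. The CPU path lowers each
+// batch to blas_enhance matrix-matrix or matrix-vector calls (with optional int8
+// quantization); the Mali path forwards to matmul_mali.
+//
+// Dims are stored innermost-first: for a 2D tensor, dims[0] is the column count
+// and dims[1] the row count. For example, A (M x K) times B (K x N) with
+// transposeA = transposeB = false requires A.dims[0] == B.dims[1] and yields C
+// with dims[0] = N and dims[1] = M, as checked in matmul_infer_output_size_cpu.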
+ +#include "tensor_computing.h" +#include "blas_enhance.h" +#include +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE matmul_infer_output_size_cpu(TensorDesc matrixADesc, + bool transposeA, + TensorDesc matrixBDesc, + bool transposeB, + TensorDesc *matrixCDesc) +{ + if (matrixCDesc == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + + if (DT_I8 == matrixADesc.dt || DT_I8 == matrixBDesc.dt) { + matrixADesc.dt = DT_I8; + matrixBDesc.dt = DT_I8; + } + + if (matrixADesc.dt != matrixBDesc.dt || matrixADesc.nDims < 2) { + CHECK_STATUS(NOT_MATCH); + } + + if (DF_NCHWC8 == matrixADesc.df && 4 == matrixADesc.nDims) { + CHECK_REQUIREMENT(1 == matrixADesc.dims[1] && 1 == matrixADesc.dims[0]); + } + + if (DF_NCHWC8 == matrixBDesc.df && 4 == matrixBDesc.nDims) { + CHECK_REQUIREMENT(1 == matrixBDesc.dims[1] && 1 == matrixBDesc.dims[0]); + } + + int i = 0; + int j = 0; + int dimA = matrixADesc.nDims; + int dimB = matrixBDesc.nDims; + while (i < dimA - 2 || j < dimB - 2) { + if (matrixADesc.dims[dimA - 1 - i] != matrixBDesc.dims[dimB - 1 - j]) { + if (matrixADesc.dims[dimA - 1 - i] == 1) { + i++; + continue; + } + if (matrixBDesc.dims[dimB - 1 - j] == 1) { + j++; + continue; + } + CHECK_STATUS(NOT_MATCH); + } else { + i++; + j++; + } + } + if (i != dimA - 2 || j != dimB - 2) { + CHECK_STATUS(NOT_MATCH); + } + + U32 kDimA, kDimB; + if (transposeA) { + kDimA = 1; + } else { + kDimA = 0; + } + if (transposeB) { + kDimB = 0; + } else { + kDimB = 1; + } + + if (matrixADesc.dims[kDimA] != matrixBDesc.dims[kDimB]) { + CHECK_STATUS(NOT_MATCH); + } + + *matrixCDesc = matrixADesc; + (*matrixCDesc).dims[kDimA] = matrixBDesc.dims[1 - kDimB]; + if (transposeA) { + U32 tmp = (*matrixCDesc).dims[0]; + (*matrixCDesc).dims[0] = (*matrixCDesc).dims[1]; + (*matrixCDesc).dims[1] = tmp; + } + return SUCCESS; +} + +EE matmul_infer_output_size(Tensor *matrixATensor, + bool transposeA, + Tensor *matrixBTensor, + bool transposeB, + Tensor *matrixCTensor, + ArchInfo_t archInfo) +{ + if (matrixATensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (matrixBTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (matrixCTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc matrixADesc = matrixATensor->get_desc(); + TensorDesc matrixBDesc = matrixBTensor->get_desc(); + TensorDesc matrixCDesc = matrixCTensor->get_desc(); + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemMatrixADesc = ocl_get_desc(*matrixATensor); + GCLMemDesc gclmemMatrixBDesc = ocl_get_desc(*matrixBTensor); + GCLMemDesc gclmemMatrixCDesc = ocl_get_desc(*matrixCTensor); + CHECK_STATUS(matmul_infer_output_size_mali(matrixADesc, transposeA, matrixBDesc, transposeB, + &matrixCDesc, &gclmemMatrixADesc, &gclmemMatrixBDesc, &gclmemMatrixCDesc)); + ocl_set_desc(matrixATensor, gclmemMatrixADesc); + ocl_set_desc(matrixBTensor, gclmemMatrixBDesc); + ocl_set_desc(matrixCTensor, gclmemMatrixCDesc); +#endif + } else { + CHECK_STATUS(matmul_infer_output_size_cpu( + matrixADesc, transposeA, matrixBDesc, transposeB, &matrixCDesc)); + } + matrixCTensor->resize(matrixCDesc); + return SUCCESS; +} + +EE matmul_infer_forward_algorithm(Tensor matrixATensor, + bool transposeA, + Tensor matrixBTensor, + bool transposeB, + Tensor matrixCTensor, + ArchInfo_t archInfo) +{ +#ifdef _USE_MALI + if (IS_MALI_GPU(archInfo->arch)) { + TensorDesc matrixADesc = matrixATensor.get_desc(); + TensorDesc matrixBDesc = matrixBTensor.get_desc(); + TensorDesc matrixCDesc = matrixCTensor.get_desc(); + 
CHECK_STATUS(matmul_infer_forward_algorithm_mali(((MaliPara_t)(archInfo->archPara))->handle, + matrixADesc, transposeA, matrixBDesc, transposeB, matrixCDesc, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo)); + } else { +#endif + return NOT_SUPPORTED; +#ifdef _USE_MALI + } +#endif + return SUCCESS; +} + +EE matmul_infer_forward_tmp_bytes(Tensor matrixATensor, + bool transposeA, + Tensor matrixBTensor, + bool transposeB, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc matrixADesc = matrixATensor.get_desc(); + TensorDesc matrixBDesc = matrixBTensor.get_desc(); + + if (bytes == nullptr) { + CHECK_STATUS(NULL_POINTER); + } +#ifdef _USE_MALI + if (IS_MALI_GPU(archInfo->arch)) { + CHECK_STATUS(matmul_infer_forward_tmp_bytes_mali(matrixADesc, transposeA, matrixBDesc, + transposeB, bytes, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo)); + return SUCCESS; + } +#endif + bool quantA = false; + bool quantB = false; + if (DT_I8 == matrixADesc.dt || DT_I8 == matrixBDesc.dt) { + if (DT_F16 == matrixADesc.dt) { + quantA = true; + matrixADesc.dt = DT_I8; + } + + if (DT_F16 == matrixBDesc.dt) { + quantB = true; + matrixBDesc.dt = DT_I8; + } + } + + EE ret = SUCCESS; + U32 kDimA, kDimB; + DataFormat dataFormatA, dataFormatB; + if (transposeA) { + kDimA = 1; + dataFormatA = DF_TRANSPOSE; + } else { + kDimA = 0; + dataFormatA = DF_NORMAL; + } + if (transposeB) { + kDimB = 0; + dataFormatB = DF_TRANSPOSE; + } else { + kDimB = 1; + dataFormatB = DF_NORMAL; + } + if (matrixADesc.dims[1 - kDimA] == 1 || matrixBDesc.dims[1 - kDimB] == 1) { + TensorDesc matrixDesc, vectorDesc; + if (matrixADesc.dims[1 - kDimA] == 1) { + matrixDesc = + tensor2df(matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); + vectorDesc = tensor1d(matrixADesc.dt, matrixADesc.dims[kDimA]); + } else { + matrixDesc = + tensor2df(matrixADesc.dt, dataFormatA, matrixADesc.dims[1], matrixADesc.dims[0]); + vectorDesc = tensor1d(matrixBDesc.dt, matrixBDesc.dims[kDimB]); + } + ret = matrix_vector_multiply_tmp_bytes(matrixDesc, vectorDesc, bytes, archInfo->arch); + } else { + TensorDesc matrixA2DDesc = + tensor2df(matrixADesc.dt, dataFormatA, matrixADesc.dims[1], matrixADesc.dims[0]); + TensorDesc matrixB2Ddesc = + tensor2df(matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); + ret = matrix_matrix_multiply_tmp_bytes(matrixA2DDesc, matrixB2Ddesc, bytes, archInfo->arch); + } + + if (quantA) { + *bytes += tensorNumBytes(matrixADesc); + } + if (quantB) { + *bytes += tensorNumBytes(matrixBDesc); + } + return ret; +} + +EE matmul(Tensor matrixATensor, + bool transposeA, + Tensor matrixBTensor, + bool transposeB, + Tensor tmpTensor, + Tensor matrixCTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc matrixADesc = matrixATensor.get_desc(); + void *matrixA = get_ptr_from_tensor(matrixATensor, arch); + TensorDesc matrixBDesc = matrixBTensor.get_desc(); + void *matrixB = get_ptr_from_tensor(matrixBTensor, arch); + TensorDesc matrixCDesc = matrixCTensor.get_desc(); + void *matrixC = get_ptr_from_tensor(matrixCTensor, arch); + + if (matrixA == nullptr || matrixB == nullptr || matrixC == nullptr) { + CHECK_STATUS(NULL_POINTER); + } +#ifdef _USE_MALI + if (IS_MALI_GPU(arch)) { + CHECK_STATUS(matmul_mali(((MaliPara_t)(archInfo->archPara))->handle, matrixADesc, + transposeA, (GCLMem_t)matrixA, matrixBDesc, transposeB, (GCLMem_t)matrixB, (GCLMem_t)tmp, + matrixCDesc, (GCLMem_t)matrixC, 
((MaliPara_t)(archInfo->archPara))->forwardRunInfo)); + return SUCCESS; + } +#endif + +#ifdef _USE_INT8 + F32 scaleO = 1; + if (DT_I8 == matrixADesc.dt || DT_I8 == matrixBDesc.dt) { + if (DT_F16 == matrixADesc.dt) { + F16 *inD = (F16 *)matrixA; + INT8 *inQ = (INT8 *)tmp; + F16 scale = matrixATensor.get_scale(); + quantize_tensor(matrixADesc, inD, &matrixADesc, inQ, &scale); + scaleO *= scale; + matrixA = (U8 *)tmp; + tmp = (U8 *)tmp + tensorNumBytes(matrixADesc); + } else { + scaleO *= matrixATensor.get_scale(); + } + if (DT_F16 == matrixBDesc.dt) { + F16 *inD = (F16 *)matrixB; + INT8 *inQ = (INT8 *)tmp; + F16 scale = matrixBTensor.get_scale(); + quantize_tensor(matrixBDesc, inD, &matrixBDesc, inQ, &scale); + scaleO *= scale; + matrixB = (U8 *)tmp; + tmp = (U8 *)tmp + tensorNumBytes(matrixBDesc); + } else { + scaleO *= matrixBTensor.get_scale(); + } + matrixCDesc.dt = DT_I32; + matrixC = tmp; + tmp = (U8 *)tmp + tensorNumBytes(matrixCDesc); + } +#endif + + U32 sizeA = tensorNumElements(matrixADesc); + U32 loops = sizeA / (matrixADesc.dims[1] * matrixADesc.dims[0]); + U32 kDimA, kDimB; + if (transposeA) { + kDimA = 1; + } else { + kDimA = 0; + } + if (transposeB) { + kDimB = 0; + } else { + kDimB = 1; + } + + U32 matrixA2DBytes = (matrixADesc.dims[1] * matrixADesc.dims[0]) * bytesOf(matrixADesc.dt); + U32 matrixB2DBytes = (matrixBDesc.dims[1] * matrixBDesc.dims[0]) * bytesOf(matrixBDesc.dt); + U32 matrixC2DBytes = (matrixCDesc.dims[1] * matrixCDesc.dims[0]) * bytesOf(matrixCDesc.dt); + U8 *matrixAPtr = (U8 *)matrixA; + U8 *matrixBPtr = (U8 *)matrixB; + U8 *matrixCPtr = (U8 *)matrixC; + memset(matrixC, 0, tensorNumBytes(matrixCDesc)); + for (U32 i = 0; i < loops; i++) { + if (matrixADesc.dims[1 - kDimA] == 1) { + TensorDesc matrixA1DDesc = tensor1d(matrixADesc.dt, matrixADesc.dims[kDimA]); + TensorDesc matrixB2DDesc = tensor2df(matrixBDesc.dt, + transposeB ? 
DF_NORMAL : DF_TRANSPOSE, matrixBDesc.dims[1], matrixBDesc.dims[0]); + TensorDesc matrixC1DDesc = tensor1d(matrixCDesc.dt, matrixCDesc.dims[0]); + CHECK_STATUS(matrix_vector_multiply(matrixB2DDesc, matrixBPtr, matrixA1DDesc, + matrixAPtr, tmpBytes, tmp, matrixC1DDesc, matrixCPtr, archInfo->arch)); + } else { + if (matrixBDesc.dims[1 - kDimB] == 1) { + TensorDesc matrixA2DDesc; + if (transposeA) { + matrixA2DDesc = tensor2df( + matrixADesc.dt, DF_TRANSPOSE, matrixADesc.dims[1], matrixADesc.dims[0]); + } else { + matrixA2DDesc = tensor2df( + matrixADesc.dt, DF_NORMAL, matrixADesc.dims[1], matrixADesc.dims[0]); + } + TensorDesc matrixB1DDesc = tensor1d(matrixBDesc.dt, matrixBDesc.dims[kDimB]); + TensorDesc matrixC1DDesc = tensor1d(matrixCDesc.dt, matrixCDesc.dims[1]); + CHECK_STATUS(matrix_vector_multiply(matrixA2DDesc, matrixAPtr, matrixB1DDesc, + matrixBPtr, tmpBytes, tmp, matrixC1DDesc, matrixCPtr, archInfo->arch)); + } else { + DataFormat dataFormatA, dataFormatB; + if (transposeA) { + dataFormatA = DF_TRANSPOSE; + } else { + dataFormatA = DF_NORMAL; + } + if (transposeB) { + dataFormatB = DF_TRANSPOSE; + } else { + dataFormatB = DF_NORMAL; + } + TensorDesc matrixA2DDesc = tensor2df( + matrixADesc.dt, dataFormatA, matrixADesc.dims[1], matrixADesc.dims[0]); + TensorDesc matrixB2DDesc = tensor2df( + matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); + TensorDesc matrixC2DDesc = + tensor2df(matrixCDesc.dt, DF_NORMAL, matrixCDesc.dims[1], matrixCDesc.dims[0]); + CHECK_STATUS(matrix_matrix_multiply(matrixA2DDesc, matrixAPtr, matrixB2DDesc, + matrixBPtr, tmpBytes, tmp, matrixC2DDesc, matrixCPtr, archInfo->arch)); + } + } + matrixAPtr += matrixA2DBytes; + matrixBPtr += matrixB2DBytes; + matrixCPtr += matrixC2DBytes; + } +#ifdef _USE_INT8 + if (DT_I8 == matrixADesc.dt || DT_I8 == matrixBDesc.dt) { + if (DT_I8 == matrixCTensor.get_desc().dt) { + CHECK_STATUS(quantize_tensor(matrixCDesc, matrixC, &matrixCDesc, + get_ptr_from_tensor(matrixCTensor, arch), &scaleO)); + matrixCTensor.set_scale(scaleO); + } else { + CHECK_REQUIREMENT(DT_F16 == matrixCTensor.get_desc().dt); + F16 *output = (F16 *)get_ptr_from_tensor(matrixCTensor, arch); + dequantize_int32_to_fp16(tensorNumElements(matrixCDesc), (I32 *)matrixC, scaleO, output); + } + } +#endif + return SUCCESS; +} diff --git a/compute/tensor/src/multihead_attention.cpp b/compute/tensor/src/multihead_attention.cpp new file mode 100644 index 00000000..c05f496a --- /dev/null +++ b/compute/tensor/src/multihead_attention.cpp @@ -0,0 +1,227 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <vector> + +#include "tensor_computing.h" +#include "blas_enhance.h" +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE multihead_attention_infer_output_size(Tensor *inputTensor, + std::vector<Tensor> filterTensor, + Tensor *outputTensor, + U32 *firstFCSliceNum, + ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + std::vector<TensorDesc> filterDesc = get_desc_from_tensors(filterTensor); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = multihead_attention_infer_output_size_mali(inputDesc, filterDesc, &outputDesc, + firstFCSliceNum, &gclmemInputDesc, &gclmemOutputDesc, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + UNUSED(inputDesc); + UNUSED(filterDesc); + UNUSED(outputDesc); + UNUSED(firstFCSliceNum); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE multihead_attention_infer_forward_algorithm(Tensor inputTensor, + std::vector<Tensor> filterTensor, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector<bool> eltwiseWithLayerNormIn, + ActivationMode activation, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + std::vector<TensorDesc> filterDesc = get_desc_from_tensors(filterTensor); + TensorDesc outputDesc = outputTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + ret = multihead_attention_infer_forward_algorithm_mali( + ((MaliPara_t)(archInfo->archPara))->handle, inputDesc, filterDesc, multiplyAlpha, + multiplyBeta, firstFCSliceNum, matmulSliceLen, eltwiseWithLayerNormIn, activation, + outputDesc, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } else { + UNUSED(inputTensor); + UNUSED(filterTensor); + UNUSED(multiplyAlpha); + UNUSED(multiplyBeta); + UNUSED(firstFCSliceNum); + UNUSED(matmulSliceLen); + UNUSED(eltwiseWithLayerNormIn); + UNUSED(activation); + UNUSED(outputTensor); + UNUSED(archInfo); + } + return ret; +} + +EE multihead_attention_infer_forward_tmp_bytes(Tensor inputTensor, + std::vector<Tensor> filterTensor, + std::vector<bool> eltwiseWithLayerNormIn, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + std::vector<TensorDesc> filterDesc = get_desc_from_tensors(filterTensor); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + ret = multihead_attention_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, + eltwiseWithLayerNormIn, firstFCSliceNum, matmulSliceLen, bytes, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } else { + UNUSED(inputTensor); + UNUSED(filterTensor); + UNUSED(eltwiseWithLayerNormIn); + UNUSED(firstFCSliceNum); + UNUSED(matmulSliceLen); + UNUSED(bytes); + UNUSED(archInfo); + } + return ret; +} + +EE multihead_attention_transform_filter_bytes( + 
std::vector<Tensor> filterTensor, U32 *bytes, ArchInfo_t archInfo) +{ + std::vector<TensorDesc> filterDesc = get_desc_from_tensors(filterTensor); + + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + CHECK_STATUS(multihead_attention_transform_filter_bytes_mali(filterDesc, + ((MaliPara_t)(archInfo->archPara))->gclmemFilterDesc, bytes, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo)); +#endif + } else { + UNUSED(filterTensor); + UNUSED(bytes); + UNUSED(archInfo); + } + return SUCCESS; +} + +EE multihead_attention_transform_filter( + std::vector<Tensor> filterTensor, std::vector<Tensor *> ftmTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + std::vector<TensorDesc> filterDesc = get_desc_from_tensors(filterTensor); + std::vector<void *> filter = get_data_from_tensors(filterTensor, arch); + std::vector<TensorDesc> ftmDesc = get_desc_from_tensor_ptrs(ftmTensor); + std::vector<void *> filterTransformed = get_data_from_tensor_ptrs(ftmTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + ret = multihead_attention_transform_filter_mali(((MaliPara_t)(archInfo->archPara))->handle, + filterDesc, filter, &ftmDesc, filterTransformed, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } else { + UNUSED(filterTensor); + UNUSED(ftmTensor); + UNUSED(archInfo); + } + for (U32 i = 0; i < ftmTensor.size(); i++) { + ftmTensor[i]->resize(ftmDesc[i]); + } + return ret; +} + +EE multihead_attention(Tensor inputTensor, + std::vector<Tensor> filterTensor, + std::vector<Tensor> biasTensor, + std::vector<Tensor> layerNormAlphaTensor, + std::vector<Tensor> layerNormBetaTensor, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector<bool> eltwiseWithLayerNormIn, + ActivationMode activation, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + std::vector<TensorDesc> filterDesc = get_desc_from_tensors(filterTensor); + std::vector<void *> filter = get_data_from_tensors(filterTensor, arch); + std::vector<void *> layerNormAlpha = get_data_from_tensors(layerNormAlphaTensor, arch); + std::vector<void *> layerNormBeta = get_data_from_tensors(layerNormBetaTensor, arch); + std::vector<TensorDesc> biasDesc = get_desc_from_tensors(biasTensor); + std::vector<void *> bias = get_data_from_tensors(biasTensor, arch); + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + ret = multihead_attention_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, filterDesc, filter, biasDesc, bias, layerNormAlpha, layerNormBeta, + multiplyAlpha, multiplyBeta, firstFCSliceNum, matmulSliceLen, eltwiseWithLayerNormIn, + activation, tmpBytes, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } else { + UNUSED(inputDesc); + UNUSED(filterTensor); + UNUSED(biasTensor); + UNUSED(layerNormAlpha); + UNUSED(layerNormBeta); + UNUSED(multiplyAlpha); + UNUSED(multiplyBeta); + UNUSED(firstFCSliceNum); + UNUSED(matmulSliceLen); + UNUSED(eltwiseWithLayerNormIn); + UNUSED(activation); + UNUSED(tmpTensor); + UNUSED(outputTensor); + } + return ret; +} diff --git a/compute/tensor/src/non_max_suppression.cpp b/compute/tensor/src/non_max_suppression.cpp new file mode 100644 index 00000000..cf04825f --- /dev/null +++ b/compute/tensor/src/non_max_suppression.cpp @@ -0,0 
+1,84 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif + +inline EE non_max_suppression_infer_output_size_cpu( + std::vector<TensorDesc> inputDesc, NonMaxSuppressionParamSpec p, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt0, idt1; + DataFormat idf0, idf1; + U32 in0, ic0, ilens0; + U32 in1, ic1, ilens1; + // boxes + CHECK_STATUS(tensor3dGet(inputDesc[0], &idt0, &idf0, &in0, &ic0, &ilens0)); + // scores + CHECK_STATUS(tensor3dGet(inputDesc[1], &idt1, &idf1, &in1, &ic1, &ilens1)); + CHECK_REQUIREMENT(ilens0 == 4); + CHECK_REQUIREMENT(ic0 == ilens1); + CHECK_REQUIREMENT(p.max_output_boxes_per_class != 0); + // output size + U32 oh, ow; + // oh = the first box for saving the number of available boxes(1) + the maximum number of detected boxes(max_output_boxes_per_class * num_class) + U32 max_output_boxes_per_class = p.max_output_boxes_per_class; + U32 num_class = ic1; + U32 num_detected_max = max_output_boxes_per_class * num_class; + oh = num_detected_max + 1; + // Each row is a 3-dimensional vector, which stores [batch_index, class_index, box_index] -> 3 + // The first box is [ number of available boxes, 0, 0 ] + ow = 3; + *outputDesc = tensor2d(idt0, oh, ow); + return SUCCESS; +} + +EE non_max_suppression_infer_output_size(std::vector<Tensor *> inputTensor, + NonMaxSuppressionParamSpec p, + Tensor *outputTensor, + ArchInfo_t archInfo) +{ + UNUSED(archInfo); + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + std::vector<TensorDesc> inputDesc = get_desc_from_tensor_ptrs(inputTensor); + TensorDesc outputDesc = outputTensor->get_desc(); + CHECK_STATUS(non_max_suppression_infer_output_size_cpu(inputDesc, p, &outputDesc)); + outputTensor->resize(outputDesc); + return SUCCESS; +} + +EE non_max_suppression(std::vector<Tensor> inputTensor, + NonMaxSuppressionParamSpec p, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + std::vector<TensorDesc> inputDesc = get_desc_from_tensors(inputTensor); + std::vector<void *> input = get_data_from_tensors(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = non_max_suppression_cpu(inputDesc, input, p, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/normalization.cpp 
b/compute/tensor/src/normalization.cpp new file mode 100644 index 00000000..8c033065 --- /dev/null +++ b/compute/tensor/src/normalization.cpp @@ -0,0 +1,88 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE layer_normalization(Tensor inputTensor, + Tensor alphaTensor, + Tensor betaTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + void *alpha = get_ptr_from_tensor(alphaTensor, arch); + void *beta = get_ptr_from_tensor(betaTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = layer_normalization_general(inputDesc, input, alpha, beta, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = layer_normalization_x86(inputDesc, input, alpha, beta, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = layer_normalization_arm(inputDesc, input, alpha, beta, outputDesc, output); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = layer_normalization_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, (GCLMem_t)alpha, (GCLMem_t)beta, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} + +EE normalization_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + CHECK_STATUS(normalization_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc)); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else 
{ + outputDesc = inputDesc; + } + outputTensor->resize(outputDesc); + return SUCCESS; +} diff --git a/compute/tensor/src/padding.cpp b/compute/tensor/src/padding.cpp new file mode 100644 index 00000000..b8ec5192 --- /dev/null +++ b/compute/tensor/src/padding.cpp @@ -0,0 +1,72 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE padding_infer_output_size( + Tensor *inputTensor, PadParamSpec padParamSpec, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = padding_infer_output_size_mali( + inputDesc, padParamSpec, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif +#ifdef _USE_CPU + } else { + ret = padding_infer_output_size_cpu(inputDesc, padParamSpec, &outputDesc); +#endif + } + outputTensor->resize(outputDesc); + return ret; +} + +EE padding(Tensor inputTensor, PadParamSpec padParamSpec, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + ret = padding_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (const GCLMem_t)input, padParamSpec, outputDesc, (GCLMem_t)output); +#endif +#ifdef _USE_CPU + } else { + ret = padding_cpu(inputDesc, input, padParamSpec, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/pooling.cpp b/compute/tensor/src/pooling.cpp new file mode 100644 index 00000000..d08e133e --- /dev/null +++ b/compute/tensor/src/pooling.cpp @@ -0,0 +1,185 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif + +inline EE pooling_infer_output_size_cpu( + TensorDesc inputDesc, PoolingParamSpec poolingParamSpec, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + U32 strideH = poolingParamSpec.stride_h; + U32 strideW = poolingParamSpec.stride_w; + U32 paddingT = poolingParamSpec.padding_top; + U32 paddingB = poolingParamSpec.padding_bottom; + U32 paddingL = poolingParamSpec.padding_left; + U32 paddingR = poolingParamSpec.padding_right; + U32 kernelSizeH = poolingParamSpec.kernel_h; + U32 kernelSizeW = poolingParamSpec.kernel_w; + RoundMode rm = poolingParamSpec.rm; + U32 oh = 0, ow = 0; + switch (rm) { + case CEIL: { + oh = (U32)(ceil((double(ih + paddingT + paddingB - kernelSizeH) / strideH))) + 1; + ow = (U32)(ceil((double(iw + paddingL + paddingR - kernelSizeW) / strideW))) + 1; + break; + } + case FLOOR: { + oh = (U32)(floor((double(ih + paddingT + paddingB - kernelSizeH) / strideH))) + 1; + ow = (U32)(floor((double(iw + paddingL + paddingR - kernelSizeW) / strideW))) + 1; + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + *outputDesc = tensor4df(idt, idf, in, ic, oh, ow); + return SUCCESS; +} + +EE pooling_infer_output_size( + Tensor *inputTensor, PoolingParamSpec poolingParamSpec, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (0 == poolingParamSpec.kernel_h && 0 == poolingParamSpec.kernel_w) { // Global pooling + CHECK_REQUIREMENT(4 == inputDesc.nDims); + poolingParamSpec.kernel_h = inputDesc.dims[1]; + poolingParamSpec.kernel_w = inputDesc.dims[0]; + } + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = pooling_infer_output_size_mali( + inputDesc, poolingParamSpec, 
&outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = pooling_infer_output_size_cpu(inputDesc, poolingParamSpec, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE pooling(Tensor inputTensor, + PoolingParamSpec poolingParamSpec, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + F32 scale[2] = {inputTensor.get_scale(), -1}; + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + + EE ret = NOT_SUPPORTED; + if (0 == poolingParamSpec.kernel_h && 0 == poolingParamSpec.kernel_w) { // Global pooling + CHECK_REQUIREMENT(4 == inputDesc.nDims); + poolingParamSpec.kernel_h = inputDesc.dims[1]; + poolingParamSpec.kernel_w = inputDesc.dims[0]; + } + TensorDesc inDescCPU = inputDesc; + U8 *inputCPU = (U8 *)input; + TensorDesc outDescCPU = outputDesc; + U8 *outputCPU = (U8 *)output; + if (DF_NCHWC8 != inputDesc.df && !IS_MALI_GPU(arch)) { + U32 paddedC = (inputDesc.dims[2] + 7) / 8 * 8; + inDescCPU.dims[2] = paddedC; + inDescCPU.df = DF_NCHWC8; + outDescCPU.dims[2] = paddedC; + outDescCPU.df = DF_NCHWC8; + inputCPU = (U8 *)tmp; + outputCPU = inputCPU + tensorNumBytes(inDescCPU); + transformNCHWToNCHWC8(inputDesc, input, inDescCPU, inputCPU); + } + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = pooling_general(inDescCPU, inputCPU, poolingParamSpec, outDescCPU, outputCPU); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = pooling_x86(inDescCPU, inputCPU, poolingParamSpec, scale, outDescCPU, outputCPU); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = pooling_arm(inDescCPU, inputCPU, poolingParamSpec, scale, outDescCPU, outputCPU); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = pooling_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (const GCLMem_t)input, poolingParamSpec, scale, (GCLMem_t)tmp, outputDesc, + (GCLMem_t)output); +#endif + } + if (DF_NCHWC8 != outputDesc.df && !IS_MALI_GPU(arch)) { + transformToNCHW(outDescCPU, outputCPU, outputDesc, output); + } + outputTensor.set_scale(scale[1]); + return ret; +} + +EE pooling_infer_forward_tmp_bytes( + Tensor inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) +{ + if (bytes == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor.get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + ret = pooling_infer_forward_tmp_bytes_mali( + inputDesc, bytes, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } else { + *bytes = 0; + if (DF_NCHW == inputDesc.df) { + U32 paddedC = (inputDesc.dims[2] + 7) / 8 * 8; + TensorDesc outputDesc = outputTensor.get_desc(); + inputDesc.dims[2] = paddedC; + outputDesc.dims[2] = paddedC; + *bytes = tensorNumBytes(inputDesc) + tensorNumBytes(outputDesc); + } + ret = SUCCESS; + } + return ret; +} diff --git a/compute/tensor/src/pooling_bp.cpp b/compute/tensor/src/pooling_bp.cpp new file mode 100644 index 00000000..746e9fca --- /dev/null +++ b/compute/tensor/src/pooling_bp.cpp @@ -0,0 +1,48 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +// only support average pooling now +EE pooling_bp( + Tensor inputTensor, PoolingParamSpec poolingParamSpec, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + CHECK_REQUIREMENT(POOLING_MEAN == poolingParamSpec.mode); + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = pooling_bp_general(inputDesc, input, poolingParamSpec, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + UNI_WARNING_LOG("The x86 pooling_bp operator is not optimized now.\n"); + ret = pooling_bp_general(inputDesc, input, poolingParamSpec, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = pooling_bp_arm(inputDesc, input, poolingParamSpec, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/power.cpp b/compute/tensor/src/power.cpp new file mode 100644 index 00000000..7681b5e2 --- /dev/null +++ b/compute/tensor/src/power.cpp @@ -0,0 +1,78 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +inline EE power_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + *outputDesc = inputDesc; + return SUCCESS; +} + +EE power_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = power_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = power_infer_output_size_cpu(inputDesc, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE power(Tensor inputTensor, PowerParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = power_cpu(inputDesc, input, p, outputDesc, output, arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = power_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, p, + outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/preallocated_memory.cpp b/compute/tensor/src/preallocated_memory.cpp new file mode 100644 index 00000000..0771e118 --- /dev/null +++ b/compute/tensor/src/preallocated_memory.cpp @@ -0,0 +1,56 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
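The power operator above is elementwise. A minimal scalar sketch, assuming Caffe-style Power semantics y = (scale * x + shift)^power; the parameter names scale, shift and power are assumptions of this sketch, and power_cpu / power_mali are the authoritative kernels:

    #include <cmath>
    #include <cstddef>

    // Hypothetical scalar reference for an elementwise power layer.
    void power_reference(const float *x, float *y, size_t len,
        float scale, float shift, float power)
    {
        for (size_t i = 0; i < len; ++i) {
            y[i] = std::pow(scale * x[i] + shift, power);  // assumed semantics
        }
    }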
+ +#include "tensor_computing.h" +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE preallocated_memory_infer_output_size(Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = preallocated_memory_infer_output_size_mali(&outputDesc, &gclmemOutputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } + outputTensor->resize(outputDesc); + return ret; +} + +EE preallocated_memory(Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + ret = preallocated_memory_mali( + ((MaliPara_t)(archInfo->archPara))->handle, outputDesc, (GCLMem_t)output); +#endif +#ifdef _USE_CPU + } else { + memset(output, 0, tensorNumBytes(outputDesc)); + ret = SUCCESS; +#endif + } + return ret; +} diff --git a/compute/tensor/src/prelu.cpp b/compute/tensor/src/prelu.cpp new file mode 100644 index 00000000..11d53a55 --- /dev/null +++ b/compute/tensor/src/prelu.cpp @@ -0,0 +1,98 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +inline EE prelu_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + if (inputDesc.df != DF_NCHWC8) { + CHECK_STATUS(NOT_SUPPORTED); + } + *outputDesc = inputDesc; + return SUCCESS; +} + +EE prelu_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = prelu_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = prelu_infer_output_size_cpu(inputDesc, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE prelu(Tensor inputTensor, + Tensor weightTensor, + PReLUParamSpec preluDesc, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + void *weight = get_ptr_from_tensor(weightTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = prelu_general(inputDesc, input, weight, preluDesc, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + UNI_WARNING_LOG("The x86 prelu operator is not optimized now.\n"); + ret = prelu_general(inputDesc, input, weight, preluDesc, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = prelu_arm(inputDesc, input, weight, preluDesc, outputDesc, output); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = prelu_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, + (GCLMem_t)weight, preluDesc, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/priorbox.cpp b/compute/tensor/src/priorbox.cpp new file mode 100644 index 00000000..d525a3d4 --- /dev/null +++ b/compute/tensor/src/priorbox.cpp @@ -0,0 +1,104 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif + +inline EE priorbox_infer_output_size_cpu( + std::vector<TensorDesc> inputDesc, PriorBoxParamSpec priorBoxParamSpec, TensorDesc *outputDesc) +{ + std::vector<F32> minsizes; + for (int i = 0; i < 2; i++) { + if (priorBoxParamSpec.min_sizes[i] == 0) { + break; + } + minsizes.push_back(priorBoxParamSpec.min_sizes[i]); + } + std::vector<F32> maxsizes; + for (int i = 0; i < 2; i++) { + if (priorBoxParamSpec.max_sizes[i] == 0) { + break; + } + maxsizes.push_back(priorBoxParamSpec.max_sizes[i]); + } + std::vector<F32> ars; + for (int i = 0; i < 2; i++) { + if (priorBoxParamSpec.aspect_ratios[i] == 0) { + break; + } + ars.push_back(priorBoxParamSpec.aspect_ratios[i]); + } + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + CHECK_STATUS(tensor4dGet(inputDesc[0], &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_REQUIREMENT(!ars.empty()); + U32 num_priorboxs = ars.size(); + if (priorBoxParamSpec.flip) { + num_priorboxs = num_priorboxs * 2; + } + CHECK_REQUIREMENT(!minsizes.empty()); + U32 num_minsize = minsizes.size(); + num_priorboxs = num_priorboxs * num_minsize + num_minsize; + if (!maxsizes.empty()) { + U32 num_maxsize = maxsizes.size(); + CHECK_REQUIREMENT(num_minsize == num_maxsize); + num_priorboxs = num_priorboxs + num_maxsize; + } + UNI_DEBUG_LOG("Number of priorboxes per pixel: %u\n", num_priorboxs); + // on = 1, oc = 2, ol = 4*num_priorboxs*ih*iw + if (DT_I8 == idt) { + idt = DT_F16; + } + *outputDesc = tensor3d(idt, 1, 2, 4 * num_priorboxs * ih * iw); + return SUCCESS; +} + +EE priorbox_infer_output_size(std::vector<Tensor *> inputTensor, + PriorBoxParamSpec priorBoxParamSpec, + Tensor *outputTensor, + ArchInfo_t archInfo) +{ + UNUSED(archInfo); + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + std::vector<TensorDesc> inputDesc = get_desc_from_tensor_ptrs(inputTensor); + TensorDesc outputDesc = outputTensor->get_desc(); + CHECK_STATUS(priorbox_infer_output_size_cpu(inputDesc, priorBoxParamSpec, &outputDesc)); + outputTensor->resize(outputDesc); + return SUCCESS; +} + +EE priorbox(std::vector<Tensor> inputTensor, + PriorBoxParamSpec priorBoxParamSpec, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + std::vector<TensorDesc> inputDesc = get_desc_from_tensors(inputTensor); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = priorbox_cpu(inputDesc, priorBoxParamSpec, outputDesc, output, arch); +#endif + } + return ret; +} diff --git a/compute/tensor/src/quantize.cpp b/compute/tensor/src/quantize.cpp new file mode 100644 index 00000000..913eaefd --- /dev/null +++ b/compute/tensor/src/quantize.cpp @@ -0,0 +1,287 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP16 +#include "cpu/arm/fp16/arm_functions_fp16.h" +#endif +#ifdef _USE_FP32 +#include "cpu/arm/fp32/arm_functions_fp32.h" +#endif +#endif + +EE quantize_tensor(TensorDesc dDesc, const void *data, TensorDesc *qDesc, void *qData, void *scale) +{ + EE ret = NOT_SUPPORTED; +#ifdef _USE_NEON + ret = quantize_tensor_arm(dDesc, data, qDesc, qData, scale); +#endif + return ret; +} + +#if defined(_USE_NEON) && defined(_USE_INT8) +void dequantize_int8_to_fp16(U32 len, INT8 *q, F32 scale, F16 *d) +{ + F16 factor = 1 / scale; + int i = 0; + for (; i < ((int)len) - 15; i += 16) { + int8x8_t in0 = vld1_s8(q + i); + int8x8_t in1 = vld1_s8(q + i + 8); + int16x8_t s0 = vmovl_s8(in0); + int16x8_t s1 = vmovl_s8(in1); + float16x8_t f0 = vcvtq_f16_s16(s0); + float16x8_t f1 = vcvtq_f16_s16(s1); + f0 = vmulq_n_f16(f0, factor); + f1 = vmulq_n_f16(f1, factor); + vst1q_f16(d + i, f0); + vst1q_f16(d + i + 8, f1); + } + + for (; i < (int)len; i++) { + d[i] = q[i] * factor; + } +} + +void dequantize_int32_to_fp16(U32 len, I32 *q, F32 scale, F16 *d, U32 biasLen, F16 *biasPtr) +{ + if (0 != biasLen) { + CHECK_REQUIREMENT(nullptr != biasPtr); + CHECK_REQUIREMENT(len % biasLen == 0); + } + float16x4_t bias[4]; + + F32 factor = 1 / scale; + if (biasLen % 4 == 0) { + int i = 0; + for (; i < ((int)len) - 15; i += 16) { + int32x4_t in0 = vld1q_s32(q + i); + int32x4_t in1 = vld1q_s32(q + i + 4); + int32x4_t in2 = vld1q_s32(q + i + 8); + int32x4_t in3 = vld1q_s32(q + i + 12); + if (0 != biasLen) { + U32 offset = i % biasLen; + for (U32 j = 0; j < 4; j++) { + bias[j] = vld1_f16(biasPtr + offset); + offset += 4; + if (offset >= biasLen) { + offset = 0; + } + } + } + float32x4_t f0 = vcvtq_f32_s32(in0); + float32x4_t f1 = vcvtq_f32_s32(in1); + float32x4_t f2 = vcvtq_f32_s32(in2); + float32x4_t f3 = vcvtq_f32_s32(in3); + f0 = vmulq_n_f32(f0, factor); + f1 = vmulq_n_f32(f1, factor); + f2 = vmulq_n_f32(f2, factor); + f3 = vmulq_n_f32(f3, factor); + float16x4_t h0 = vcvt_f16_f32(f0); + float16x4_t h1 = vcvt_f16_f32(f1); + float16x4_t h2 = vcvt_f16_f32(f2); + float16x4_t h3 = vcvt_f16_f32(f3); + if (0 != biasLen) { + h0 = vadd_f16(h0, bias[0]); + h1 = vadd_f16(h1, bias[1]); + h2 = vadd_f16(h2, bias[2]); + h3 = vadd_f16(h3, bias[3]); + } + vst1_f16(d + i, h0); + vst1_f16(d + i + 4, h1); + vst1_f16(d + i + 8, h2); + vst1_f16(d + i + 12, h3); + } + + for (; i < (int)len; i++) { + d[i] = q[i] * 
factor; + if (0 != biasLen) { + d[i] += biasPtr[i % biasLen]; + } + } + } else { + for (int i = 0; i < ((int)len); i += biasLen) { + int j = 0; + for (; j < ((int)biasLen) - 3; j += 4) { + int32x4_t in0 = vld1q_s32(q + i + j); + bias[0] = vld1_f16(biasPtr + j); + float32x4_t f0 = vcvtq_f32_s32(in0); + f0 = vmulq_n_f32(f0, factor); + float16x4_t h0 = vcvt_f16_f32(f0); + h0 = vadd_f16(h0, bias[0]); + vst1_f16(d + i + j, h0); + } + for (; j < (int)biasLen; j++) { + d[i + j] = q[i + j] * factor + biasPtr[j]; + } + } + } +} + +void update_histogram(U32 len, const F16 *data, int numBins, F32 interval, F32 *histo) +{ + for (U32 i = 0; i < len; i++) { + F32 tmp = data[i]; + int index = floor(fabs(tmp) / interval); + if (index >= numBins) { + index = numBins - 1; + } + histo[index] += 1; + } +} + +std::vector<F32> compress_histogram(std::vector<F32> &histogram, F32 numPerBin, F32 last_max) +{ + std::vector<F32> newhistogram(2048, 0); + for (U32 q = 0; q < ceil(2048 / numPerBin); q++) { + F32 start = q * numPerBin; + F32 end = start + numPerBin; + int left = ceil(start); + if (left > start) { + newhistogram[q] += ((F32)left - start) * histogram[left - 1]; + } + if (end <= last_max) { + int right = floor(end); + if (right < end) { + newhistogram[q] += (end - (F32)right) * histogram[right]; + } + + for (int k = left; k < right; k++) { + newhistogram[q] += histogram[k]; + } + } else { + for (int k = left; k < 2048; k++) { + newhistogram[q] += histogram[k]; + } + } + } + histogram.assign(newhistogram.begin(), newhistogram.end()); + return histogram; +} + +F32 compute_KLD(U32 len, const F32 *p, const F32 *q) +{ + F32 kld = 0; + + for (U32 i = 0; i < len; i++) { + if (0 != p[i]) { + if (0 == q[i]) { + kld += 1; + } else { + kld += p[i] * log(p[i] / q[i]); + } + } + } + + return kld; +} +#endif + +std::vector<F32> compute_scale_with_KL(std::vector<F32> &histogram, F32 interval) +{ + std::vector<F32> scale; +#ifdef _USE_INT8 + const int BINS = 2048; + F32 histoSum = array_sum_f32(histogram.data(), BINS); + array_scale_f32(histogram.data(), histogram.data(), BINS, 1 / histoSum, 0); + + F32 minKLD = 2048; + int bestThreshold = 128; + F32 sumBin = array_sum_f32(histogram.data(), 128); + UNI_DEBUG_LOG("First 128 bins contain %f of values", sumBin); + F32 sumOver = 1 - sumBin; + + for (U32 i = 128; i < 2048; i++) { + std::vector<F32> clipDist(histogram.begin(), histogram.begin() + i); + clipDist[i - 1] += sumOver; + sumOver -= histogram[i]; // Prepare for next round + + std::vector<F32> quantDist(128, 0); + + F32 numPerBin = (F32)i / 128.0; + + for (U32 j = 0; j < 128; j++) { + F32 start = j * numPerBin; + F32 end = start + numPerBin; + + int left = ceil(start); + if (left > start) { + quantDist[j] += ((F32)left - start) * histogram[left - 1]; + } + + int right = floor(end); + if (right < end) { + quantDist[j] += (end - (F32)right) * histogram[right]; + } + + for (int k = left; k < right; k++) { + quantDist[j] += histogram[k]; + } + } + + std::vector<F32> qExpand(i, 0); + + for (U32 j = 0; j < 128; j++) { + F32 start = j * numPerBin; + F32 end = start + numPerBin; + + F32 count = 0; + + int left = ceil(start); + if (left > start && 0 != histogram[left - 1]) { + count += (F32)left - start; + } + + int right = floor(end); + if (right < end && 0 != histogram[right]) { + count += end - (F32)right; + } + + for (int k = left; k < right; k++) { + if (0 != histogram[k]) { + count += 1; + } + } + + F32 expandVal = quantDist[j] / count; + + if (left > start && 0 != histogram[left - 1]) { + qExpand[left - 1] += expandVal * ((F32)left - start); + } + + if (right < end 
&& 0 != histogram[right]) { + qExpand[right] += expandVal * (end - (F32)right); + } + + for (int k = left; k < right; k++) { + if (0 != histogram[k]) { + qExpand[k] += expandVal; + } + } + } + + F32 kld = compute_KLD(i, clipDist.data(), qExpand.data()); + + if (kld < minKLD) { + minKLD = kld; + bestThreshold = i; + } + } + UNI_DEBUG_LOG(" %d/2048\n", bestThreshold); + F32 threshold = (F32)bestThreshold * interval; + F32 quantScale = 127.99 / threshold; + scale.push_back(quantScale); +#endif + return scale; +} diff --git a/compute/tensor/src/reduction.cpp b/compute/tensor/src/reduction.cpp new file mode 100644 index 00000000..98e7ada7 --- /dev/null +++ b/compute/tensor/src/reduction.cpp @@ -0,0 +1,144 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif + +EE reduction(Tensor inputTensor, + Tensor maskTensor, + ReductionParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc maskDesc = maskTensor.get_desc(); + void *mask = get_ptr_from_tensor(maskTensor, arch); + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = reduction_cpu( + inputDesc, input, maskDesc, mask, p, tmpBytes, tmp, outputDesc, output, arch); +#endif + } + return ret; +} + +EE reduction_infer_forward_tmp_bytes( + Tensor inputTensor, ReductionParamSpec p, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + int factor = 0; + if (p.axes_num > 1) { + factor = 2; + } + if (inputDesc.df == DF_NCHWC8) { + for (int i = 0; i < p.axes_num; i++) { + // channel dimension + if (p.axes[i] == 1 || p.axes[i] == -3) { + factor = 2; + break; + } + } + } + *bytes = UNI_MAX(inputTensor.bytes(), outputTensor.bytes()) * factor; + return SUCCESS; +} + +EE reduction_infer_output_size( + Tensor *inputTensor, Tensor maskTensor, ReductionParamSpec p, Tensor *outputTensor) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc maskDesc = maskTensor.get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + int start = 0; + TensorDesc tmpDesc = inputDesc; + if (inputDesc.df == DF_NCHWC8) { + for (int i = 0; i < p.axes_num; i++) { + // channel dimension + if (p.axes[i] == 1 || p.axes[i] == -3) { + start = -1; + break; + } + } + for (int i = (int)tmpDesc.nDims - 1; i >= 0; i--) { + tmpDesc.dims[i + 1] = tmpDesc.dims[i]; + } + tmpDesc.dims[3] /= 8; + tmpDesc.dims[0] = 8; + tmpDesc.nDims += 1; + } + outputDesc = tmpDesc; + for (int i = start; i < p.axes_num; i++) { + int axis; + if (i == -1) { + axis = 4; + } else { + axis = p.axes[i]; + } + if (axis < 0) { + axis = tmpDesc.nDims + axis; + } + axis = tmpDesc.nDims - 1 - axis; + if (tensorNumElements(maskDesc) == 0) { + outputDesc.dims[axis] = 0; + } else { + int num = maskDesc.dims[1] > 1 ? 
maskDesc.dims[1] : 0; + outputDesc.dims[axis] = num; + } + } + if (p.keep_dim) { + for (U32 i = 0; i < tmpDesc.nDims; i++) { + if (outputDesc.dims[i] == 0) { + outputDesc.dims[i] = 1; + } + } + outputDesc.nDims = tmpDesc.nDims; + } else { + int index = 0; + for (U32 i = 0; i < tmpDesc.nDims; i++) { + if (outputDesc.dims[i] != 0) { + outputDesc.dims[index++] = outputDesc.dims[i]; + } + } + outputDesc.nDims = index; + } + outputDesc.df = getTensorDefaultDataFormat(outputDesc.nDims); + if (inputDesc.df == DF_NCHWC8) { + if (start == 0) { + outputDesc.df = DF_NCHWC8; + for (int i = 0; i < (int)outputDesc.nDims - 1; i++) { + outputDesc.dims[i] = outputDesc.dims[i + 1]; + } + outputDesc.nDims -= 1; + outputDesc.dims[outputDesc.nDims - 2] *= 8; + } + } + outputTensor->resize(outputDesc); + return SUCCESS; +} diff --git a/compute/tensor/src/reshape.cpp b/compute/tensor/src/reshape.cpp new file mode 100644 index 00000000..24a4f685 --- /dev/null +++ b/compute/tensor/src/reshape.cpp @@ -0,0 +1,157 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +inline EE reshape_infer_output_size_cpu( + TensorDesc inputDesc, ReshapeParamSpec p, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + return NULL_POINTER; + } + + I32 *shape = p.shape_dims; + I32 shape_size = p.shape_size; + + int inputElementNum = tensorNumElements(inputDesc); + int outputElementNum = 1; + for (int i = 0; i < shape_size; i++) { + outputElementNum *= shape[i]; + } + int index_range = ((int)inputDesc.nDims > shape_size) ? 
shape_size : inputDesc.nDims; + if (inputElementNum > 0 && outputElementNum > 0 && inputElementNum != outputElementNum) { + for (int i = 0; i < index_range; i++) { + if ((inputElementNum / (int)inputDesc.dims[inputDesc.nDims - 1 - i]) == + (outputElementNum / shape[i])) { + shape[i] = inputDesc.dims[inputDesc.nDims - 1 - i]; + break; + } + } + } + + *outputDesc = inputDesc; + (*outputDesc).nDims = shape_size; + if (shape_size == 2) { + (*outputDesc).df = DF_NORMAL; + } + if (shape_size >= 4) { + (*outputDesc).df = DF_NCHW; + } + + U32 factor = 1; + I32 count = 0; + for (I32 i = 0; i < shape_size; i++) { + I32 value = shape[i]; + if (value == 0) { + value = inputDesc.dims[inputDesc.nDims - 1 - i]; + } + if (value == -1) { + value = 0; + count++; + } else { + factor *= value; + } + + (*outputDesc).dims[shape_size - 1 - i] = value; + } + if (count > 1) { + return NOT_SUPPORTED; + } + + for (I32 i = 0; i < shape_size; i++) { + if ((*outputDesc).dims[i] == 0) { + (*outputDesc).dims[i] = tensorNumElements(inputDesc) / factor; + } + } + + return SUCCESS; +} + +EE reshape_infer_output_size( + Tensor *inputTensor, ReshapeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = reshape_infer_output_size_mali( + inputDesc, p, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif +#ifdef _USE_CPU + } else { + ret = reshape_infer_output_size_cpu(inputDesc, p, &outputDesc); +#endif + } + outputTensor->resize(outputDesc); + return ret; +} + +EE reshape_infer_forward_tmp_bytes( + Tensor inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) +{ + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + GCLMemDesc gclmemInputDesc = ocl_get_desc(inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(outputTensor); + ret = reshape_infer_forward_tmp_bytes_mali( + inputDesc, outputDesc, &gclmemInputDesc, &gclmemOutputDesc, bytes); +#endif + } else { + *bytes = UNI_MAX(inputTensor.bytes(), outputTensor.bytes()); + ret = SUCCESS; + } + return ret; +} + +EE reshape(Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + ret = reshape_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, + (GCLMem_t)tmp, outputDesc, (GCLMem_t)output); +#endif +#ifdef _USE_CPU + } else { + ret = reshape_cpu(inputDesc, input, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/rnn.cpp b/compute/tensor/src/rnn.cpp new file mode 100644 index 00000000..91e53294 --- /dev/null +++ b/compute/tensor/src/rnn.cpp @@ -0,0 +1,298 
@@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE rnn_transform_filter(std::vector filterTensors, + RNNParamSpec rnnParamSpec, + std::vector ftmTensors, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + std::vector filterDescs = get_desc_from_tensors(filterTensors); + std::vector filters = get_data_from_tensors(filterTensors, arch); + std::vector ftmDescs(ftmTensors.size()); + std::vector ftms = get_data_from_tensor_ptrs(ftmTensors, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = rnn_transform_filter_cpu(filterDescs.data(), (const void **)filters.data(), + rnnParamSpec, ftmDescs.data(), ftms.data()); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + GCLMem filterArray[2]; + GCLMem filterTranArray[2]; + filterArray[0] = *((GCLMem_t)filters[0]); + filterTranArray[0] = *((GCLMem_t)ftms[0]); + if (rnnParamSpec.numProjection > 0) { + filterArray[1] = *((GCLMem_t)filters[1]); + filterTranArray[1] = *((GCLMem_t)ftms[1]); + } + ret = rnn_transform_filter_mali(((MaliPara_t)(archInfo->archPara))->handle, filterDescs[0], + filterArray, rnnParamSpec, ftmDescs.data(), filterTranArray, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } + for (U32 i = 0; i < ftmTensors.size(); i++) { + ftmTensors[i]->resize(ftmDescs[i]); + } + return ret; +} + +EE rnn_transform_filter_bytes( + std::vector filterTensors, RNNParamSpec rnnParamSpec, U32 *bytes, ArchInfo_t archInfo) +{ + std::vector filterDescs = get_desc_from_tensors(filterTensors); + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = rnn_transform_filter_bytes_cpu(filterDescs.data(), rnnParamSpec, bytes); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = rnn_transform_filter_bytes_mali(filterDescs[0], rnnParamSpec, + ((MaliPara_t)(archInfo->archPara))->gclmemFilterDesc, bytes, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } + return ret; +} + +EE rnn_infer_output_size( + Tensor *inputTensor, RNNParamSpec rnnParamSpec, Tensor *outputTensor, ArchInfo_t archInfo) +{ + UNUSED(archInfo); + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + 
TensorDesc outputDesc = outputTensor->get_desc(); + DataType idt; + DataFormat idf; + U32 batch, step, xDim; + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &batch, &step, &xDim)); + U32 num = (rnnParamSpec.biDirection) ? 2 : 1; + U32 hDim = num * rnnParamSpec.numOutput; + outputDesc = tensor3df(idt, idf, batch, step, hDim); + outputTensor->resize(outputDesc); + return SUCCESS; +} + +EE rnn_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + RNNParamSpec rnnParamSpec, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + + EE ret = NOT_SUPPORTED; +#ifdef _USE_CPU + if (IS_CPU(archInfo->arch)) { + ret = rnn_infer_forward_tmp_bytes_cpu( + inputDesc, filterDesc, outputDesc, rnnParamSpec, bytes, archInfo->arch); + } +#endif + return ret; +} + +EE rnn(Tensor inputTensor, + std::vector filterTensors, + std::vector biasTensors, + RNNParamSpec rnnParamSpec, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + std::vector filterDescs = get_desc_from_tensors(filterTensors); + std::vector filters = get_data_from_tensors(filterTensors, arch); + std::vector biasDescs = get_desc_from_tensors(biasTensors); + std::vector biases = get_data_from_tensors(biasTensors, arch); + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = rnn_cpu(inputDesc, input, filterDescs.data(), (const void **)filters.data(), + biasDescs.data(), (const void **)biases.data(), rnnParamSpec, tmpBytes, tmp, outputDesc, + output, arch); +#endif + } + return ret; +} + +EE rnncell_infer_output_size(std::vector inputTensor, + RNNParamSpec rnnParamSpec, + Tensor *outputTensor, + ArchInfo_t archInfo) +{ + if (inputTensor[0] == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (inputTensor[1] == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor[0]->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + DataType idt; + DataFormat idf; + U32 batch, xDim; + CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &batch, &xDim)); + U32 hDim = rnnParamSpec.numOutput; + outputDesc = tensor2df(idt, idf, batch, hDim); + ret = SUCCESS; +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor[0]); + GCLMemDesc gclmemStateDesc = ocl_get_desc(*inputTensor[1]); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = rnncell_infer_output_size_mali(inputDesc, rnnParamSpec, &outputDesc, &gclmemInputDesc, + &gclmemStateDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor[0], gclmemInputDesc); + ocl_set_desc(inputTensor[1], gclmemStateDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } + outputTensor->resize(outputDesc); + return ret; +} + +EE rnncell_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + RNNParamSpec rnnParamSpec, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = 
inputTensor.get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = rnncell_infer_forward_tmp_bytes_cpu( + inputDesc, filterDesc, outputDesc, rnnParamSpec, bytes, archInfo->arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = rnncell_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, outputDesc, rnnParamSpec, + bytes, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } + return ret; +} + +EE rnncell_infer_forward_algorithm(Tensor xTensor, + Tensor filterTensor, + Tensor biasTensor, + RNNParamSpec rnncellDesc, + U32 batchStrideX, + U32 batchStrideH, + Tensor hTensor, + ArchInfo_t archInfo) +{ + EE ret = NOT_SUPPORTED; +#ifdef _USE_MALI + if (IS_MALI_GPU(archInfo->arch)) { + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc biasDesc = biasTensor.get_desc(); + TensorDesc xDesc = xTensor.get_desc(); + TensorDesc hDesc = hTensor.get_desc(); + ret = rnncell_infer_forward_algorithm_mali(((MaliPara_t)(archInfo->archPara))->handle, + xDesc, filterDesc, biasDesc, rnncellDesc, batchStrideX, batchStrideH, hDesc, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); + } +#endif + return ret; +} + +EE rnncell(Tensor xTensor, + std::vector filterTensors, + std::vector biasTensors, + Tensor stateTensor, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + U32 tmpOffset, + Tensor tmpTensor, + Tensor hTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc xDesc = xTensor.get_desc(); + void *currentX = get_ptr_from_tensor(xTensor, arch); + std::vector filterDescs = get_desc_from_tensors(filterTensors); + std::vector filters = get_data_from_tensors(filterTensors, arch); + std::vector biasDescs = get_desc_from_tensors(biasTensors); + std::vector biases = get_data_from_tensors(biasTensors, arch); + void *state = get_ptr_from_tensor(stateTensor, arch); + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc hDesc = hTensor.get_desc(); + void *currentH = get_ptr_from_tensor(hTensor, arch); + if (!IS_MALI_GPU(arch)) { + tmp = (U8 *)tmp + tmpOffset; + } + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = rnncell_cpu(xDesc, currentX, filterDescs.data(), (const void **)filters.data(), + biasDescs.data(), (const void **)biases.data(), state, rnnParamSpec, batchStrideX, + batchStrideH, tmpBytes, tmp, hDesc, currentH, archInfo->arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + GCLMem filterArray[2]; + filterArray[0] = *((GCLMem_t)filters[0]); + if (rnnParamSpec.numProjection > 0) { + filterArray[1] = *((GCLMem_t)filters[1]); + } + ret = rnncell_mali(((MaliPara_t)(archInfo->archPara))->handle, xDesc, (GCLMem_t)currentX, + filterDescs[0], filterArray, biasDescs[0], (GCLMem_t)biases[0], (GCLMem_t)state, + rnnParamSpec, batchStrideX, batchStrideH, tmpBytes, (GCLMem_t)tmp, hDesc, + (GCLMem_t)currentH, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } + return ret; +} diff --git a/compute/tensor/src/roialign.cpp b/compute/tensor/src/roialign.cpp new file mode 100644 index 00000000..08069c3f --- /dev/null +++ b/compute/tensor/src/roialign.cpp @@ -0,0 +1,80 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif + +inline EE roialign_infer_output_size_cpu( + std::vector<TensorDesc> inputDesc, RoiAlignParamSpec p, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + CHECK_REQUIREMENT(inputDesc.size() == 3); + DataType idt0, idt1, idt2; + DataFormat idf0, idf1, idf2; + U32 in0, ic0, ih0, iw0; + U32 ih1, iw1; + U32 ilens2; + // feature map + CHECK_STATUS(tensor4dGet(inputDesc[0], &idt0, &idf0, &in0, &ic0, &ih0, &iw0)); + // rois + CHECK_STATUS(tensor2dGet(inputDesc[1], &idt1, &idf1, &ih1, &iw1)); + // batch indices + CHECK_STATUS(tensor1dGet(inputDesc[2], &idt2, &idf2, &ilens2)); + CHECK_REQUIREMENT(ih1 == ilens2); + CHECK_REQUIREMENT(iw1 == 4); + // output size + U32 on, oc, oh, ow; + // on = num_rois, oc = ic, oh = output_h, ow = output_w + on = ih1; + oc = ic0; + oh = p.output_h; + ow = p.output_w; + *outputDesc = tensor4d(idt0, on, oc, oh, ow); + return SUCCESS; +} + +EE roialign_infer_output_size( + std::vector<Tensor *> inputTensor, RoiAlignParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) +{ + UNUSED(archInfo); + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + std::vector<TensorDesc> inputDesc = get_desc_from_tensor_ptrs(inputTensor); + TensorDesc outputDesc = outputTensor->get_desc(); + + CHECK_STATUS(roialign_infer_output_size_cpu(inputDesc, p, &outputDesc)); + outputTensor->resize(outputDesc); + return SUCCESS; +} + +EE roialign( + std::vector<Tensor> inputTensor, RoiAlignParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + std::vector<TensorDesc> inputDesc = get_desc_from_tensors(inputTensor); + std::vector<void *> input = get_data_from_tensors<void *>(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = roialign_cpu(inputDesc, input, p, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/scale.cpp b/compute/tensor/src/scale.cpp new file mode 100644 index 00000000..fea48db9 --- /dev/null +++ b/compute/tensor/src/scale.cpp @@ -0,0 +1,98 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +inline EE scale_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + + *outputDesc = inputDesc; + return SUCCESS; +} + +EE scale_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = scale_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = scale_infer_output_size_cpu(inputDesc, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE scale(Tensor inputTensor, + void *alpha, + void *beta, + ScaleParamSpec p, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = scale_general(inputDesc, input, alpha, beta, p, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = scale_x86(inputDesc, input, alpha, beta, p, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = scale_arm(inputDesc, input, alpha, beta, p, outputDesc, output); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = scale_mali(((MaliPara_t)(archInfo->archPara))->handle, (GCLMem_t)alpha, + (GCLMem_t)beta, inputDesc, (GCLMem_t)input, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/slice.cpp 
b/compute/tensor/src/slice.cpp new file mode 100644 index 00000000..6be69a12 --- /dev/null +++ b/compute/tensor/src/slice.cpp @@ -0,0 +1,128 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +inline EE slice_infer_output_size_cpu( + TensorDesc inputDesc, SliceParamSpec p, std::vector<TensorDesc> *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + U32 num = (*outputDesc).size(); + int axis = (p.axis + inputDesc.nDims) % inputDesc.nDims; + I32 *slice_points = p.slice_points; + + bool splitEqual = true; + for (U32 i = 0; i < num; i++) { + if (0 != slice_points[i]) { + splitEqual = false; + break; + } + } + I32 target_axis = inputDesc.nDims - 1 - axis; + if (splitEqual) { + CHECK_REQUIREMENT(0 == inputDesc.dims[target_axis] % num); + inputDesc.dims[target_axis] /= num; + } + for (U32 i = 0; i < num; i++) { + (*outputDesc)[i] = inputDesc; + if (splitEqual) { + continue; + } + + I32 prev_point = 0; + if (i > 0) { + prev_point = slice_points[i - 1]; + } + I32 next_point = inputDesc.dims[target_axis]; + if (i < num - 1) { + next_point = slice_points[i]; + } + if (i == 0 && num == 1 && p.slice_size == 1) { // Could happen in onnx + next_point = slice_points[0]; + } + if (prev_point < 0) { + prev_point = prev_point + inputDesc.dims[target_axis]; + if (prev_point < 0) { + prev_point = 0; + } + } + if (next_point < 0) { + next_point = next_point + inputDesc.dims[target_axis]; + if (next_point < 0) { + next_point = 0; + } + } + (*outputDesc)[i].dims[target_axis] = next_point - prev_point; + } + return SUCCESS; +} + +EE slice_infer_output_size( + Tensor *inputTensor, SliceParamSpec p, std::vector<Tensor *> outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + std::vector<TensorDesc> outputDesc = get_desc_from_tensor_ptrs(outputTensor); + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + std::vector<GCLMemDesc> gclmemOutputDescs; + for (auto t : outputTensor) { + gclmemOutputDescs.push_back(ocl_get_desc(*t)); + } + CHECK_STATUS(slice_infer_output_size_mali( + inputDesc, p, &outputDesc, &gclmemInputDesc, gclmemOutputDescs.data())); + ocl_set_desc(inputTensor, gclmemInputDesc); + for (U32 i = 0; i < outputTensor.size(); i++) { + ocl_set_desc(outputTensor[i], gclmemOutputDescs[i]); + } +#endif + } else { + CHECK_STATUS(slice_infer_output_size_cpu(inputDesc, p, &outputDesc)); + } + for (U32 i = 0; i < outputTensor.size(); i++) { + outputTensor[i]->resize(outputDesc[i]); + } + return SUCCESS; +} + +EE slice(Tensor inputTensor, SliceParamSpec p, std::vector<Tensor> outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + std::vector<TensorDesc> outputDesc = get_desc_from_tensors(outputTensor); + std::vector<void *> output = get_data_from_tensors<void *>(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = slice_cpu(inputDesc, input, p, outputDesc, &output); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = slice_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, p, + outputDesc, &output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/softmax.cpp b/compute/tensor/src/softmax.cpp new file mode 100644 index 00000000..c025d364 --- /dev/null +++ b/compute/tensor/src/softmax.cpp @@ -0,0 +1,115 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
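The slice-point bookkeeping in slice_infer_output_size_cpu above is easiest to verify on a concrete case. A sketch (tensor4df mirrors the tensor2df/tensor3df descriptor helpers used elsewhere in this diff; only the SliceParamSpec fields referenced by that code are set):

// Cut a [1, 8, 16, 16] NCHW tensor on the channel axis at points {2, 6}.
TensorDesc in = tensor4df(DT_F16, DF_NCHW, 1, 8, 16, 16);
SliceParamSpec p;
p.axis = 1;             // channel axis
p.slice_size = 2;
p.slice_points[0] = 2;  // first output ends at channel 2
p.slice_points[1] = 6;  // second output ends at channel 6
std::vector<TensorDesc> outs(3);
// slice_infer_output_size_cpu(in, p, &outs) leaves channel sizes 2, 4 and 2:
// [1, 2, 16, 16], [1, 4, 16, 16], [1, 2, 16, 16].
// With all slice_points zero the axis is split evenly instead, which here
// would require the output count to divide 8.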
+ +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE softmax( + Tensor inputTensor, SoftmaxParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = softmax_general(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = softmax_x86(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = softmax_arm(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + ret = softmax_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, + p, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} + +inline EE softmax_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + *outputDesc = inputDesc; + if (DF_NCHWC8 == (*outputDesc).df) { + (*outputDesc).df = DF_NCHW; + } + return SUCCESS; +} + +EE softmax_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = softmax_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = softmax_infer_output_size_cpu(inputDesc, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE softmax_infer_forward_tmp_bytes(Tensor inputTensor, U32 *bytes, ArchInfo_t archInfo) +{ + if (bytes == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + TensorDesc inputDesc = inputTensor.get_desc(); + ret = softmax_infer_forward_tmp_bytes_mali( + inputDesc, bytes, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } else { + *bytes = 0; + ret = SUCCESS; + } + return ret; +} diff --git a/compute/tensor/src/space2depth.cpp b/compute/tensor/src/space2depth.cpp new file mode 100644 index 00000000..85b5c5b8 --- /dev/null +++ b/compute/tensor/src/space2depth.cpp @@ -0,0 +1,60 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE space2depth_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = space2depth_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } + outputTensor->resize(outputDesc); + return ret; +} + +EE space2depth(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + ret = space2depth_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/split.cpp b/compute/tensor/src/split.cpp new file mode 100644 index 00000000..672337b4 --- /dev/null +++ b/compute/tensor/src/split.cpp @@ -0,0 +1,49 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <vector> +#include "tensor_computing.h" +#if defined(_USE_GENERAL) || defined(_USE_NEON) || defined(_USE_X86) +#include "cpu/tensor_computing_cpu.h" +#endif + +EE split_infer_output_size(Tensor *inputTensor, std::vector<Tensor *> output) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + for (auto p : output) { + if (p == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + p->resize(inputDesc); + } + return SUCCESS; +} + +EE split(Tensor inputTensor, std::vector<Tensor> outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + std::vector<TensorDesc> outputDesc = get_desc_from_tensors(outputTensor); + std::vector<void *> output = get_data_from_tensors<void *>(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#if defined(_USE_GENERAL) || defined(_USE_NEON) || defined(_USE_X86) + ret = split_cpu(inputDesc, input, outputDesc, &output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/squeeze.cpp b/compute/tensor/src/squeeze.cpp new file mode 100644 index 00000000..066dc8b1 --- /dev/null +++ b/compute/tensor/src/squeeze.cpp @@ -0,0 +1,101 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
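split, defined above, is a pure fan-out: every output descriptor is resized to the input's, and split_cpu then copies the payload once per output. A usage sketch, assuming in, out0, out1 and archInfo (an ArchInfo_t pointing at a CPU arch) are prepared and allocated by the caller:

// Fan one tensor out to two downstream consumers.
std::vector<Tensor *> outPtrs = {&out0, &out1};
CHECK_STATUS(split_infer_output_size(&in, outPtrs));  // both outputs inherit in's descriptor
std::vector<Tensor> outs = {out0, out1};              // storage allocated beforehand
CHECK_STATUS(split(in, outs, archInfo));              // one copy of in per output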
+ +#include "tensor_computing.h" +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif +#include + +EE squeeze(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + TensorDesc outputDesc = outputTensor.get_desc(); + ret = squeeze_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, + outputDesc, (GCLMem_t)output); +#endif +#ifdef _USE_CPU + } else { + if (output != input) { + memcpy(output, input, tensorNumBytes(inputDesc)); + } + ret = SUCCESS; +#endif + } + return ret; +} + +#ifdef _USE_CPU +EE squeeze_infer_output_size_cpu( + TensorDesc inputDesc, int *axes, int axesNum, TensorDesc *outputDesc) +{ + outputDesc->dt = inputDesc.dt; + for (U32 i = 0; i < inputDesc.nDims; i++) { + outputDesc->dims[i] = inputDesc.dims[i]; + } + for (int i = 0; i < axesNum; i++) { + int axis = axes[i]; + if (axis < 0) { + axis += inputDesc.nDims; + } + outputDesc->dims[inputDesc.nDims - 1 - axis] = 0; + } + U32 index = 0; + for (U32 i = 0; i < inputDesc.nDims; i++) { + if (outputDesc->dims[i] != 0) { + outputDesc->dims[index++] = outputDesc->dims[i]; + } + } + CHECK_REQUIREMENT(index + axesNum == inputDesc.nDims); + outputDesc->nDims = index; + outputDesc->df = getTensorDefaultDataFormat(outputDesc->nDims); + return SUCCESS; +} +#endif + +EE squeeze_infer_output_size( + Tensor *inputTensor, SqueezeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = squeeze_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif +#ifdef _USE_CPU + } else { + ret = squeeze_infer_output_size_cpu(inputDesc, p.axes, p.axes_num, &outputDesc); +#endif + } + outputTensor->resize(outputDesc); + return ret; +} diff --git a/compute/tensor/src/tensor_computing_type.cpp b/compute/tensor/src/tensor_computing_type.cpp new file mode 100644 index 00000000..5e1b6d13 --- /dev/null +++ b/compute/tensor/src/tensor_computing_type.cpp @@ -0,0 +1,174 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "types.h" +#include "tensor_computing_type.h" + +ConvolutionParamSpec createConvolutionParamSpec(U32 group, + U32 kernelH, + U32 kernelW, + U32 strideH, + U32 strideW, + U32 paddingT, + U32 paddingB, + U32 paddingL, + U32 paddingR, + U32 dilateH, + U32 dilateW, + U32 num_outputs, + ConvolutionMode convMode) +{ + ConvolutionParamSpec p; + p.group = group; + p.kernel_h = kernelH; + p.kernel_w = kernelW; + p.stride_h = strideH; + p.stride_w = strideW; + p.padding_top = paddingT; + p.padding_bottom = paddingB; + p.padding_left = paddingL; + p.padding_right = paddingR; + p.dilatedRate_h = dilateH; + p.dilatedRate_w = dilateW; + p.num_outputs = num_outputs; + p.convolution_type = convMode; + return p; +} + +FullyConnectedParamSpec createFullyConnectedParamSpec( + U32 num_outputs, U32 num_slices, I32 *slice_point) +{ + FullyConnectedParamSpec p; + p.num_outputs = num_outputs; + p.num_slices = num_slices; + if (num_slices > 1 && slice_point != nullptr) { + for (int i = 0; i < (int)num_slices; i++) { + p.slice_point[i] = slice_point[i]; + } + } + return p; +} + +PoolingParamSpec createPoolingParamSpec(PoolingMode pm, + U32 ksH, + U32 ksW, + U32 strideH, + U32 strideW, + U32 paddingT, + U32 paddingB, + U32 paddingL, + U32 paddingR, + RoundMode rm) +{ + PoolingParamSpec p; + p.mode = pm; + p.kernel_h = ksH; + p.kernel_w = ksW; + p.stride_h = strideH; + p.stride_w = strideW; + p.padding_top = paddingT; + p.padding_bottom = paddingB; + p.padding_left = paddingL; + p.padding_right = paddingR; + p.rm = rm; + return p; +} + +ReshapeParamSpec createReshapeParamSpec(I32 *shape_dims, I32 shape_size, I32 axis, I32 num_axes) +{ + ReshapeParamSpec p; + p.shape_size = shape_size; + p.axis = axis; + p.num_axes = num_axes; + if (shape_dims != nullptr && shape_size != 0) { + for (int i = 0; i < shape_size; i++) { + p.shape_dims[i] = shape_dims[i]; + } + } + return p; +} + +ClipParamSpec createClipParamSpec(float min, float max) +{ + ClipParamSpec p; + p.min = min; + p.max = max; + return p; +} + +SqueezeParamSpec createSqueezeParamSpec(int *axes, int axes_num) +{ + SqueezeParamSpec p; + p.axes_num = axes_num; + if (axes != nullptr && axes_num != 0) { + for (int i = 0; i < axes_num; i++) { + p.axes[i] = axes[i]; + } + } + return p; +} + +std::vector<TensorDesc> get_desc_from_tensors(std::vector<Tensor> tensors) +{ + int size = tensors.size(); + std::vector<TensorDesc> result(size); + for (int i = 0; i < size; i++) { + result[i] = tensors[i].get_desc(); + } + return result; +} + +std::vector<TensorDesc> get_desc_from_tensor_ptrs(std::vector<Tensor *> tensors) +{ + int size = tensors.size(); + std::vector<TensorDesc> result(size); + for (int i = 0; i < size; i++) { + result[i] = tensors[i]->get_desc(); + } + return result; +} + +std::vector<F32> get_scale_from_tensors(std::vector<Tensor> tensors) +{ + int size = tensors.size(); + std::vector<F32> result(size); + for (int i = 0; i < size; i++) { + result[i] = tensors[i].get_scale(); + } + return result; +} + +template <typename T> +std::vector<T> get_data_from_tensors(std::vector<Tensor> tensors, Arch arch) +{ + int size = tensors.size(); + std::vector<T> result(size); + for (int i = 0; i < size; i++) { + result[i] = (T)get_ptr_from_tensor(tensors[i], arch); + } + return result; +} + +template <typename T> +std::vector<T> get_data_from_tensor_ptrs(std::vector<Tensor *> tensors, Arch arch) +{ + int size = tensors.size(); + std::vector<T> result(size); + for (int i = 0; i < size; i++) { + result[i] = (T)get_ptr_from_tensor(*tensors[i], arch); + } + return result; +} + +template std::vector<void *> get_data_from_tensors<void *>(std::vector<Tensor> tensors, Arch arch); +template std::vector<void *> get_data_from_tensor_ptrs<void *>(std::vector<Tensor *> tensors, Arch arch); diff --git a/compute/tensor/src/tfslice.cpp b/compute/tensor/src/tfslice.cpp new file mode 100644 index 00000000..e1e49774 --- /dev/null +++ b/compute/tensor/src/tfslice.cpp @@ -0,0 +1,55 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif + +EE tfslice_infer_output_size( + Tensor *inputTensor, TfSliceParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_CPU(archInfo->arch)) { +#ifdef _USE_CPU + ret = tfslice_infer_output_size_cpu(inputDesc, p, &outputDesc); +#endif + } + outputTensor->resize(outputDesc); + return ret; +} + +EE tfslice(Tensor inputTensor, TfSliceParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = tfslice_cpu(inputDesc, input, p, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/tile.cpp b/compute/tensor/src/tile.cpp new file mode 100644 index 00000000..a958687e --- /dev/null +++ b/compute/tensor/src/tile.cpp @@ -0,0 +1,80 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +#include "tensor_computing.h" + +EE tile_infer_output_size( + Tensor *inputTensor, TileParamSpec tileParamSpec, Tensor *outputTensor, ArchInfo_t archInfo) +{ + auto inDim = inputTensor->get_desc(); + auto outDim = inDim; + if ((int)inDim.nDims == tileParamSpec.dimsSize) { + for (int i = 0; i < tileParamSpec.dimsSize; i++) { + outDim.dims[tileParamSpec.dimsSize - 1 - i] = + inDim.dims[tileParamSpec.dimsSize - 1 - i] * tileParamSpec.repeatsInfo[i]; + } + } else { + if (tileParamSpec.axis == -1) { + tileParamSpec.axis = 0; + } + outDim.dims[tileParamSpec.axis] = + outDim.dims[tileParamSpec.axis] * tileParamSpec.repeatsInfo[0]; + } + outputTensor->resize(outDim); + return SUCCESS; +} + +EE tile(Tensor inputTensor, TileParamSpec tileParamSpec, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + U8 *ptr = (U8 *)output; + int element_size = bytesOf(inputDesc.dt); + if (tileParamSpec.dimsSize == (int)inputDesc.nDims) { //onnx model support + ret = NOT_SUPPORTED; + } else { //caffe model support + int axis = tileParamSpec.axis; + if (axis == -1) { + axis = 0; + } + int length = 1; + for (U32 i = 0; i < inputDesc.nDims; i++) { + length = length * inputDesc.dims[i]; + } + if (axis == (int)inputDesc.nDims - 1) { + for (int i = 0; i < tileParamSpec.repeatsInfo[0]; i++) { + U8 *srcPtr = (U8 *)input; + U8 *desPtr = ptr + element_size * length * i; + memcpy(desPtr, srcPtr, element_size * length); + } + ret = SUCCESS; + } else if (axis == 0) { + int count = length / inputDesc.dims[axis]; + for (int i = 0; i < count; i++) { + for (int j = 0; j < tileParamSpec.repeatsInfo[0]; j++) { + U8 *srcPtr = (U8 *)input + element_size * inputDesc.dims[axis] * i; + U8 *desPtr = ptr + + element_size * inputDesc.dims[axis] * (tileParamSpec.repeatsInfo[0] * i + j); + memcpy(desPtr, srcPtr, element_size * inputDesc.dims[axis]); + } + } + ret = SUCCESS; + } else { + ret = NOT_SUPPORTED; + } + } + return ret; +} diff --git a/compute/tensor/src/transpose.cpp b/compute/tensor/src/transpose.cpp new file mode 100644 index 00000000..3542e8e3 --- /dev/null +++ b/compute/tensor/src/transpose.cpp @@ -0,0 +1,166 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#if defined(_USE_X86) || defined(_USE_NEON) +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE transpose(Tensor inputTensor, + TransposeParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + std::vector tmpDims(p.trans_dims, p.trans_dims + p.trans_size); + if (IS_CPU(arch)) { + // Keep transDims unchanged so that input resize does not lead to error + if (DF_NCHWC8 == inputDesc.df) { + if (4 == p.trans_size) { + auto ptr = std::find(tmpDims.begin(), tmpDims.end(), 1); + tmpDims.insert(ptr + 1, 4); + } + inputDesc.nDims = 5; + for (int i = 3; i >= 0; i--) { + inputDesc.dims[i + 1] = inputDesc.dims[i]; + } + inputDesc.dims[3] /= 8; + inputDesc.dims[0] = 8; + + TensorDesc desc = outputDesc; + desc.nDims = 5; + U32 idx = 4; + for (int i = 3; i >= 0; i--) { + if (1 == tmpDims[3 - i]) { // C + desc.dims[idx] = outputDesc.dims[i] / 8; + idx--; + desc.dims[idx] = 8; + idx--; + } else { + desc.dims[idx] = outputDesc.dims[i]; + idx--; + } + } + outputDesc = desc; + } + } + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = transpose_general(inputDesc, input, tmpDims.data(), outputDesc, output); +#endif +#if defined(_USE_X86) || defined(_USE_NEON) + } else if (IS_CPU(arch)) { + ret = transpose_cpu(inputDesc, input, tmpDims.data(), outputDesc, output); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + ret = transpose_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (const GCLMem_t)input, p, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} + +inline EE transpose_infer_output_size_cpu( + TensorDesc inputDesc, TransposeParamSpec p, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + + U32 *dim = p.trans_dims; + *outputDesc = inputDesc; + U32 inputDim = inputDesc.nDims; + if (4 == inputDim) { + (*outputDesc).df = DF_NCHW; + } + U32 outputDim = (*outputDesc).nDims; + for (U32 i = 0; i < inputDim; i++) { + CHECK_REQUIREMENT(dim[i] < 
inputDim); + // NOTE: TensorDesc.dims array is in [W H C N] order. + // so if you want to transpose [N C H W] format data, we use (dims - 1 - *) + // [5 6 7 8] + [0 3 2 1] = [5 8 7 6] + // [8 7 6 5] + [0 3 2 1] = [6 7 8 5] + (*outputDesc).dims[outputDim - 1 - i] = inputDesc.dims[inputDim - 1 - dim[i]]; + } + if ((*outputDesc).nDims >= 4) { + (*outputDesc).df = DF_NCHW; + } + if ((*outputDesc).nDims == 4 && p.trans_size == 3 && (*outputDesc).dims[0] == 1) { + (*outputDesc) = tensor3df(inputDesc.dt, DF_NCHW, (*outputDesc).dims[3], (*outputDesc).dims[2], (*outputDesc).dims[1]); + } + return SUCCESS; +} + +EE transpose_infer_output_size( + Tensor *inputTensor, TransposeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = transpose_infer_output_size_mali( + inputDesc, p, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = transpose_infer_output_size_cpu(inputDesc, p, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE transpose_infer_forward_tmp_bytes( + Tensor inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) +{ + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(outputTensor); + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + ret = transpose_infer_forward_tmp_bytes_mali( + inputDesc, outputDesc, &gclmemInputDesc, &gclmemOutputDesc, bytes); +#endif + } else { + *bytes = 0; + ret = SUCCESS; + } + return ret; +} diff --git a/compute/tensor/src/unsqueeze.cpp b/compute/tensor/src/unsqueeze.cpp new file mode 100644 index 00000000..299c8c1e --- /dev/null +++ b/compute/tensor/src/unsqueeze.cpp @@ -0,0 +1,102 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
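The NOTE inside `transpose_infer_output_size_cpu` above is easy to trip over: `TensorDesc.dims` stores a logical [N C H W] shape reversed, as [W H C N], so the logical permutation `dim[i]` has to be applied through `(nDims - 1 - i)` indexing on both sides. A minimal standalone sketch of that index arithmetic; the helper name and the plain `unsigned` types are illustrative, not library API:

```cpp
#include <stdio.h>

// Sketch of the reversed-dims permutation used by transpose_infer_output_size_cpu:
// a logical [N C H W] shape is stored as dims = [W H C N] (innermost first),
// so logical axis i lives at storage index nDims - 1 - i.
static void permute_reversed_dims(
    const unsigned *inDims, const unsigned *perm, unsigned nDims, unsigned *outDims)
{
    for (unsigned i = 0; i < nDims; i++) {
        // output logical axis i takes input logical axis perm[i]
        outDims[nDims - 1 - i] = inDims[nDims - 1 - perm[i]];
    }
}

int main()
{
    // Logical shape [5 6 7 8] is stored reversed as [8 7 6 5]; permutation
    // [0 3 2 1] yields logical [5 8 7 6], stored as [6 7 8 5], which matches
    // the worked example in the source comment above.
    unsigned inDims[4] = {8, 7, 6, 5};
    unsigned perm[4] = {0, 3, 2, 1};
    unsigned outDims[4];
    permute_reversed_dims(inDims, perm, 4, outDims);
    printf("%u %u %u %u\n", outDims[0], outDims[1], outDims[2], outDims[3]);  // 6 7 8 5
    return 0;
}
```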
+ +#include "tensor_computing.h" +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif +#include + +EE unsqueeze(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + TensorDesc outputDesc = outputTensor.get_desc(); + ret = unsqueeze_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, + outputDesc, (GCLMem_t)output); +#endif +#ifdef _USE_CPU + } else { + if (output != input) { + memcpy(output, input, tensorNumBytes(inputDesc)); + } + ret = SUCCESS; +#endif + } + return ret; +} + +#ifdef _USE_CPU +EE unsqueeze_infer_output_size_cpu( + TensorDesc inputDesc, int *axes, int axesNum, TensorDesc *outputDesc) +{ + outputDesc->dt = inputDesc.dt; + outputDesc->nDims = inputDesc.nDims + axesNum; + outputDesc->df = getTensorDefaultDataFormat(outputDesc->nDims); + for (U32 i = 0; i < outputDesc->nDims; i++) { + outputDesc->dims[i] = 0; + } + for (int i = 0; i < axesNum; i++) { + int axis = axes[i]; + if (axis < 0) { + axis += outputDesc->nDims; + } + outputDesc->dims[outputDesc->nDims - 1 - axis] = 1; + } + U32 index = 0; + for (U32 i = 0; i < outputDesc->nDims; i++) { + if (outputDesc->dims[i] == 0) { + outputDesc->dims[i] = inputDesc.dims[index++]; + } + } + CHECK_REQUIREMENT(index == inputDesc.nDims); + return SUCCESS; +} +#endif + +EE unsqueeze_infer_output_size( + Tensor *inputTensor, UnsqueezeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = unsqueeze_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif +#ifdef _USE_CPU + } else { + ret = unsqueeze_infer_output_size_cpu(inputDesc, p.axes, p.axes_num, &outputDesc); +#endif + } + outputTensor->resize(outputDesc); + return ret; +} diff --git a/compute/tensor/src/yolov3detectionoutput.cpp b/compute/tensor/src/yolov3detectionoutput.cpp new file mode 100644 index 00000000..32affba6 --- /dev/null +++ b/compute/tensor/src/yolov3detectionoutput.cpp @@ -0,0 +1,77 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "tensor_computing.h"
+#ifdef _USE_CPU
+#include "cpu/tensor_computing_cpu.h"
+#endif
+
+#define NUM_DETECTED_MAX 200
+
+inline EE yolov3detectionoutput_infer_output_size_cpu(std::vector<TensorDesc> inputDesc,
+    Yolov3DetectionOutputParamSpec yolov3DetectionOutputParamSpec,
+    TensorDesc *outputDesc)
+{
+    UNUSED(yolov3DetectionOutputParamSpec);
+    if (nullptr == outputDesc) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType idt0 = inputDesc[0].dt;
+    // output size
+    U32 oh, ow;
+    // oh = one row holding the number of available boxes (1) + the maximum number of detected boxes (NUM_DETECTED_MAX = 200)
+    oh = 1 + NUM_DETECTED_MAX;
+    // Each row is a 6-dimensional vector storing [label, confidence, xmin, ymin, xmax, ymax] -> 6
+    // The first row is [ number of available boxes, 0, 0, 0, 0, 0 ]
+    ow = 6;
+    *outputDesc = tensor2d(idt0, oh, ow);
+    return SUCCESS;
+}
+
+EE yolov3detectionoutput_infer_output_size(std::vector<Tensor *> inputTensor,
+    Yolov3DetectionOutputParamSpec yolov3DetectionOutputParamSpec,
+    Tensor *outputTensor,
+    ArchInfo_t archInfo)
+{
+    UNUSED(archInfo);
+    if (outputTensor == nullptr) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    std::vector<TensorDesc> inputDesc = get_desc_from_tensor_ptrs(inputTensor);
+    TensorDesc outputDesc = outputTensor->get_desc();
+    CHECK_STATUS(yolov3detectionoutput_infer_output_size_cpu(
+        inputDesc, yolov3DetectionOutputParamSpec, &outputDesc));
+    outputTensor->resize(outputDesc);
+    return SUCCESS;
+}
+
+EE yolov3detectionoutput(std::vector<Tensor> inputTensor,
+    Yolov3DetectionOutputParamSpec yolov3DetectionOutputParamSpec,
+    Tensor outputTensor,
+    ArchInfo_t archInfo)
+{
+    auto arch = archInfo->arch;
+    std::vector<TensorDesc> inputDesc = get_desc_from_tensors(inputTensor);
+    std::vector<void *> input = get_data_from_tensors(inputTensor, arch);
+    TensorDesc outputDesc = outputTensor.get_desc();
+    void *output = get_ptr_from_tensor(outputTensor, arch);
+    EE ret = NOT_SUPPORTED;
+    if (IS_CPU(arch)) {
+#ifdef _USE_CPU
+        ret = yolov3detectionoutput_cpu(
+            inputDesc, input, yolov3DetectionOutputParamSpec, outputDesc, output, arch);
+#endif
+    }
+    return ret;
+}
diff --git a/compute/tensor/tests/CMakeLists.txt b/compute/tensor/tests/CMakeLists.txt
new file mode 100644
index 00000000..63622e8e
--- /dev/null
+++ b/compute/tensor/tests/CMakeLists.txt
@@ -0,0 +1,67 @@
+function(tensor_test name)
+    add_executable(${name} ${name}.cpp)
+    link_tensor(${name})
+    install(TARGETS ${name}
+            RUNTIME DESTINATION tests)
+endfunction()
+
+set_test_c_cxx_flags()
+
+tensor_test(test_activation)
+tensor_test(test_argmax)
+tensor_test(test_attention)
+tensor_test(test_check)
+tensor_test(test_clip)
+tensor_test(test_concat)
+tensor_test(test_convolution)
+tensor_test(test_deconvolution)
+tensor_test(test_depthwise_convolution)
+tensor_test(test_dilated_convolution)
+tensor_test(test_detectionoutput)
+tensor_test(test_eltwise)
+tensor_test(test_fully_connected)
+tensor_test(test_rnn)
+tensor_test(test_power)
+tensor_test(test_reduction)
+tensor_test(test_pooling)
+tensor_test(test_pooling_bp)
+tensor_test(test_padding)
+tensor_test(test_priorbox) +tensor_test(test_reshape) +tensor_test(test_softmax) +tensor_test(test_split) +tensor_test(test_slice) +tensor_test(test_scale) +tensor_test(test_transpose) +tensor_test(test_non_max_suppression) +tensor_test(test_roialign) +tensor_test(test_l2normalization) +tensor_test(test_prelu) +tensor_test(test_normalization) +tensor_test(test_tile) + +tensor_test(test_convolution_int8) +tensor_test(test_depthwise_convolution_int8) +tensor_test(test_concat_int8) +tensor_test(test_pooling_int8) +tensor_test(test_convolution_bnn) + +if (USE_MALI) + if (USE_FP16) + tensor_test(test_convolution_ocl test_convolution_ocl.cpp) + tensor_test(test_deconvolution_ocl test_deconvolution_ocl.cpp) + tensor_test(test_channel_resize_ocl test_channel_resize_ocl.cpp) + tensor_test(test_depthwise_convolution_ocl test_depthwise_convolution_ocl.cpp) + tensor_test(test_depthwise_pointwise_convolution_ocl test_depthwise_pointwise_convolution_ocl.cpp) + tensor_test(test_fully_connected_ocl test_fully_connected_ocl.cpp) + tensor_test(test_multihead_attention_ocl test_multihead_attention_ocl.cpp) + tensor_test(test_padding_ocl test_padding_ocl.cpp) + tensor_test(test_prelu_ocl test_prelu_ocl.cpp) + tensor_test(test_pooling_ocl test_pooling_ocl.cpp) + tensor_test(test_softmax_h1w1_ocl test_softmax_h1w1_ocl.cpp) + tensor_test(test_power_ocl test_power_ocl.cpp) + tensor_test(test_transpose_ocl test_transpose_ocl.cpp) + tensor_test(test_concat_ocl test_concat_ocl.cpp) + tensor_test(test_reshape_ocl test_reshape_ocl.cpp) + endif (USE_FP16) +endif (USE_MALI) diff --git a/compute/tensor/tests/test_activation.cpp b/compute/tensor/tests/test_activation.cpp new file mode 100644 index 00000000..177206fb --- /dev/null +++ b/compute/tensor/tests/test_activation.cpp @@ -0,0 +1,124 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
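`unsqueeze_infer_output_size_cpu` earlier computes the output shape in two passes: zero every slot, write 1 into each unsqueezed axis (again through the reversed-dims indexing), then back-fill the remaining zeros from the input shape in order. A standalone sketch of the same scheme, with hypothetical names and plain integer types:

```cpp
#include <stdio.h>

// Sketch of the axis-marking scheme in unsqueeze_infer_output_size_cpu,
// using the library's reversed dims storage: mark each new axis with 1,
// then back-fill every remaining slot from the input shape in order.
static void unsqueeze_dims(const unsigned *inDims, unsigned inNDims,
    const int *axes, int axesNum, unsigned *outDims)
{
    unsigned outNDims = inNDims + axesNum;
    for (unsigned i = 0; i < outNDims; i++) {
        outDims[i] = 0;
    }
    for (int i = 0; i < axesNum; i++) {
        int axis = axes[i];
        if (axis < 0) {
            axis += outNDims;  // negative axes count from the end
        }
        outDims[outNDims - 1 - axis] = 1;  // reversed storage
    }
    unsigned index = 0;
    for (unsigned i = 0; i < outNDims; i++) {
        if (outDims[i] == 0) {
            outDims[i] = inDims[index++];
        }
    }
}

int main()
{
    // Logical [2 3 4] is stored reversed as [4 3 2]; unsqueezing axes {0, 3}
    // gives logical [1 2 3 1 4], stored as [4 1 3 2 1].
    unsigned inDims[3] = {4, 3, 2};
    int axes[2] = {0, 3};
    unsigned outDims[5];
    unsqueeze_dims(inDims, 3, axes, 2, outDims);
    for (int i = 4; i >= 0; i--) {
        printf("%u ", outDims[i]);  // prints the logical order: 1 2 3 1 4
    }
    printf("\n");
    return 0;
}
```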
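The output layout documented in yolov3detectionoutput.cpp above is a (1 + NUM_DETECTED_MAX) x 6 tensor: row 0 carries the valid-box count in its first element, and each following row is [label, confidence, xmin, ymin, xmax, ymax]. A consumer could read it as sketched below, assuming an FP32 output buffer; the helper name and the fake data are illustrative only:

```cpp
#include <stdio.h>

// Hypothetical reader for the (1 + NUM_DETECTED_MAX) x 6 detection-output
// layout described above: row 0 is [num_boxes, 0, 0, 0, 0, 0], each later
// row is [label, confidence, xmin, ymin, xmax, ymax].
static void print_detections(const float *out)
{
    int numBoxes = (int)out[0];
    for (int i = 1; i <= numBoxes; i++) {
        const float *box = out + i * 6;
        printf("label=%d conf=%.3f box=(%.3f, %.3f, %.3f, %.3f)\n",
            (int)box[0], box[1], box[2], box[3], box[4], box[5]);
    }
}

int main()
{
    // Two fabricated detections, for illustration only.
    float out[3 * 6] = {
        2, 0, 0, 0, 0, 0,  // 2 valid boxes
        0, 0.9f, 0.1f, 0.1f, 0.5f, 0.5f,
        1, 0.8f, 0.4f, 0.4f, 0.9f, 0.9f,
    };
    print_detections(out);
    return 0;
}
```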
+
+#include <string.h>
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int activationFunctionTest(U32 in,
+    U32 ic,
+    U32 ih,
+    U32 iw,
+    DataType dt,
+    ActivationParamSpec activationDesc,
+    const char *activationType)
+{
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+
+    DataFormat df = DF_NCHWC8;
+    memset(activationDesc.value, 0, sizeof(activationDesc.value));
+
+    TensorDesc dataDesc = tensor4df(dt, df, in, ic, ih, iw);
+    U32 len = tensorNumElements(dataDesc);
+
+    U8 *data = ut_input_v(len, dt, UT_INIT_RANDOM);
+
+    Tensor dataTensor = Tensor::alloc_sized(dataDesc);
+    Tensor dataTensorRef = Tensor::alloc_sized(dataDesc);
+    memcpy(get_ptr_from_tensor(dataTensor, UT_ARCH), data, tensorNumBytes(dataDesc));
+    memcpy(get_ptr_from_tensor(dataTensorRef, UT_ARCH), data, tensorNumBytes(dataDesc));
+
+    if (UT_CHECK) {
+        // check
+        CHECK_STATUS(activation(dataTensor, activationDesc, dataTensor, &archInfo));
+
+        // naive implement
+        CHECK_STATUS(activation(dataTensorRef, activationDesc, dataTensorRef, &archInfo_org));
+
+        // check
+        ut_check_v(get_ptr_from_tensor(dataTensor, UT_ARCH),
+            get_ptr_from_tensor(dataTensorRef, UT_ARCH), dataTensor.length(), dt, 0.01, __FILE__,
+            __LINE__);
+    }
+
+    // benchmark
+    double time_start = ut_time_ms();
+    for (int iter = 0; iter < UT_LOOPS; iter++) {
+        CHECK_STATUS(activation(dataTensor, activationDesc, dataTensor, &archInfo));
+    }
+    double time_end = ut_time_ms();
+    double time = (time_end - time_start) / UT_LOOPS;
+
+    // log performance data
+    char buffer[150];
+    char params[120];
+    sprintf(params, "(%u %u %u %u)=(%u %u %u %u)", in, ic, ih, iw, in, ic, ih, iw);
+    sprintf(buffer, "%20s, %80s", activationType, params);
+    double ops = 1.0 * in * ic * ih * iw;
+    ut_log(dt, buffer, ops, time / UT_LOOPS);
+
+    free(data);
+
+    return 0;
+}
+
+int activationTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc == 5);
+    U32 in = atoi(argv[1]);
+    U32 ic = atoi(argv[2]);
+    U32 ih = atoi(argv[3]);
+    U32 iw = atoi(argv[4]);
+
+    ActivationParamSpec activationDesc;
+    // test relu
+    activationDesc.mode = ACTIVATION_RELU;
+    activationFunctionTest(in, ic, ih, iw, dt, activationDesc, "Activation Relu");
+    // test relu6
+    activationDesc.mode = ACTIVATION_RELU6;
+    activationFunctionTest(in, ic, ih, iw, dt, activationDesc, "Activation Relu6");
+    // test h_swish
+    activationDesc.mode = ACTIVATION_H_SWISH;
+    activationFunctionTest(in, ic, ih, iw, dt, activationDesc, "Activation h_swish");
+    // test h_sigmoid
+    activationDesc.mode = ACTIVATION_H_SIGMOID;
+    activationFunctionTest(in, ic, ih, iw, dt, activationDesc, "Activation h_sigmoid");
+    // test tanh
+    activationDesc.mode = ACTIVATION_TANH;
+    activationFunctionTest(in, ic, ih, iw, dt, activationDesc, "Activation tanh");
+    // test gelu
+    activationDesc.mode = ACTIVATION_GELU;
+    activationFunctionTest(in, ic, ih, iw, dt, activationDesc, "Activation gelu");
+    // test mish
+    activationDesc.mode = ACTIVATION_MISH;
+    activationFunctionTest(in, ic, ih, iw, dt, activationDesc, "Activation mish");
+    // test sigmoid
+    activationDesc.mode = ACTIVATION_SIGMOID;
+    activationFunctionTest(in, ic, ih, iw, dt, activationDesc, "Activation sigmoid");
+
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    activationTest(argc, argv, DT_F16);
+#endif
+#ifdef _USE_FP32
+    activationTest(argc, argv, DT_F32);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_argmax.cpp b/compute/tensor/tests/test_argmax.cpp
new file mode 100644
index 00000000..49dc7b18
--- /dev/null
+++
b/compute/tensor/tests/test_argmax.cpp @@ -0,0 +1,91 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "tensor_computing.h" +#include "ut_util.h" + +int argmaxTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 6); + ArgMaxParamSpec p; + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + p.axis = atoi(argv[5]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + DataFormat df = DF_NCHW; + TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw); + U8 *input = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM); + Tensor inputTensor; + inputTensor.resize(inDesc); + inputTensor.alloc(); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inDesc)); + + Tensor outputTensor; + Tensor outputTensorRef; + CHECK_STATUS(argmax_infer_output_size(&inputTensor, p, &outputTensor, &archInfo)); + outputTensor.alloc(); + outputTensorRef.resize(outputTensor.get_desc()); + outputTensorRef.alloc(); + + Tensor nullTensor; + + if (UT_CHECK) { + CHECK_STATUS(argmax(inputTensor, p, nullTensor, outputTensor, &archInfo)); + + // naive implement + CHECK_STATUS(argmax(inputTensor, p, nullTensor, outputTensorRef, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), DT_U32, 0, + __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(argmax(inputTensor, p, nullTensor, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + U32 on, oh, ow; + CHECK_STATUS(tensor3dGet(outputTensor.get_desc(), &dt, &df, &on, &oh, &ow)); + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u) %d =(%u %u %u)", in, ic, ih, iw, p.axis, on, oh, ow); + sprintf(buffer, "%20s, %80s", "Argmax", params); + double ops = 1.0 * in * ic * ih * iw; + ut_log(dt, buffer, ops, time / UT_LOOPS); + + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + argmaxTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + argmaxTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_attention.cpp b/compute/tensor/tests/test_attention.cpp new file mode 100644 index 00000000..7316b50f 
--- /dev/null +++ b/compute/tensor/tests/test_attention.cpp @@ -0,0 +1,109 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "tensor_computing.h" +#include "ut_util.h" + +int attentionTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 5); + AttentionParamSpec p; + U32 batch = atoi(argv[1]); + p.num_heads = atoi(argv[2]); + p.from_sequence_length = atoi(argv[3]); + p.to_sequence_length = atoi(argv[4]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + DataFormat df = DF_NORMAL; + TensorDesc inDesc = tensor2df(dt, df, batch, p.to_sequence_length); + U32 inputLength = tensorNumElements(inDesc); + U8 *input = ut_input_v(inputLength, dt, UT_INIT_ZERO); + Tensor inputTensor = Tensor::alloc_sized(inDesc); + + Tensor outputTensor; + CHECK_STATUS(attention_infer_output_size(&inputTensor, p, &outputTensor)); + outputTensor.alloc(); + Tensor outputTensorRef = Tensor::alloc_sized(outputTensor.get_desc()); + ; + U32 outputLength = outputTensor.length(); + for (U32 i = 0; i < batch; i++) { + U32 threshold = p.to_sequence_length / 2 + i; + for (U32 j = 0; j < p.to_sequence_length; j++) { + if (j < threshold) { + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + ((F32 *)input)[i * p.to_sequence_length + j] = 1; + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + ((F16 *)input)[i * p.to_sequence_length + j] = 1; + break; +#endif + default: + break; + } + } + } + } + + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inDesc)); + + if (UT_CHECK) { + CHECK_STATUS(attention(inputTensor, outputTensor, &archInfo)); + + // naive implement + CHECK_STATUS(attention(inputTensor, outputTensorRef, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputLength, dt, 0, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(attention(inputTensor, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u)=(%u %u %u %u)", batch, p.from_sequence_length, batch, p.num_heads, + p.from_sequence_length, p.to_sequence_length); + sprintf(buffer, "%20s, %80s", "Attention", params); + double ops = 3.0 * outputLength; + 
ut_log(dt, buffer, ops, time / UT_LOOPS); + + free(input); + + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + attentionTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + attentionTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_axpby.cpp b/compute/tensor/tests/test_axpby.cpp new file mode 100644 index 00000000..70b2f351 --- /dev/null +++ b/compute/tensor/tests/test_axpby.cpp @@ -0,0 +1,75 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "blas_enhance.h" +#include "ut_util.h" + +int axpbyTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 4); + U32 len = atoi(argv[1]); + F32 a = atof(argv[2]); + F32 b = atof(argv[3]); + + TensorDesc xDesc = tensor1d(dt, len); + TensorDesc yDesc = tensor1d(dt, len); + + U8 *x = ut_input_v(len, dt, UT_INIT_RANDOM); + U8 *y = ut_input_v(len, dt, UT_INIT_RANDOM); + U8 *y_ref = ut_input_v(len, dt, UT_INIT_ZERO); + + memcpy(y_ref, y, tensorNumBytes(yDesc)); + // check + if (UT_CHECK) { + CHECK_STATUS(vector_vector_axpby(a, xDesc, x, b, yDesc, y, UT_ARCH)); + + // naive implement + CHECK_STATUS(vector_vector_axpby(a, xDesc, x, b, yDesc, y_ref, CPU_GENERAL)); + + ut_check_v(y, y_ref, len, dt, 0.01, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + vector_vector_axpby(a, xDesc, x, b, yDesc, y, UT_ARCH); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%.2f * %u) + (%.2f * %u) = (%u)", a, len, b, len, len); + sprintf(buffer, "%20s, %80s", "VectorVectoraXpbY", params); + double ops = 3.0 * len; + ut_log(dt, buffer, ops, time); + + free(x); + free(y); + free(y_ref); + + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + axpbyTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + axpbyTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_channel_resize_ocl.cpp b/compute/tensor/tests/test_channel_resize_ocl.cpp new file mode 100644 index 00000000..6ba31933 --- /dev/null +++ b/compute/tensor/tests/test_channel_resize_ocl.cpp @@ -0,0 +1,166 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "types.h" +#include +#include "tensor_computing.h" +#include "ut_util.h" +#include "gcl.h" +#include "libkernelsource.h" +#include "iostream" + +#ifdef _USE_FP16 +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} + +int channelresizeTest(int argc, char *argv[], DataType dt) +{ + CHECK_REQUIREMENT(argc == 8); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + + ChannelResizeParamSpec p; + p.channel_before = atoi(argv[5]); + p.channel_after = atoi(argv[6]); + p.group = atoi(argv[7]); + // output + U32 on = in; + U32 oc = p.channel_after; + U32 oh = ih; + U32 ow = iw; + + CHECK_REQUIREMENT(in == 1 && on == 1); + CHECK_REQUIREMENT(p.channel_before == (int)ic); + + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + TensorDesc inputDesc_cpu, inputDesc_gpu, outputDesc; + inputDesc_cpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + inputDesc_gpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + outputDesc = tensor4df(dt, DF_NCHW, in, oc, oh, ow); + + // setup input + U8 *input_cpu = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + U8 *output_cpu = ut_input_v(on * oc * oh * ow, dt, UT_INIT_RANDOM); + U8 *output_gpu = NULL; + F16 *in_val = (F16 *)input_cpu; + U32 len_in = tensorNumElements(inputDesc_cpu); + for (U32 i = 0; i < len_in; i++) { + in_val[i] = i; + } + + U32 len = tensorNumElements(outputDesc); + F16 *out_val = (F16 *)output_cpu; + for (U32 i = 0; i < len; i++) { + out_val[i] = in_val[i]; + } + + std::shared_ptr handleSharedPtr = OCLContext::getInstance().handle; + GCLHandle_t handle = handleSharedPtr.get(); + std::vector kernelVec; + handle->kernelVec = &kernelVec; + Tensor inputTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + inputTensor.resize(inputDesc_gpu); + + MaliPara maliPara; + maliPara.handle = handle; + archInfo.archPara = &maliPara; + + 
CHECK_STATUS(channel_resize_infer_output_size(&inputTensor, p, &outputTensor, &archInfo)); + outputDesc = outputTensor.get_desc(); + CHECK_REQUIREMENT(tensorNumElements(outputDesc) == on * oc * oh * ow); + + GCLMem_t output = alloc_map(outputTensor); + GCLMem_t input = alloc(inputTensor); + CHECK_STATUS(gcl_fill_memory_zero(handle, input)); + + U32 maxBytes = 0; + U32 tmpBytes = 0; + tmpBytes = tensorNumBytes(inputDesc_gpu); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes); + + CHECK_STATUS(ocl_set_input(handle, input, inputDesc_gpu, input_cpu, tmpbuf, true)); + CHECK_STATUS(channel_resize(inputTensor, p, outputTensor, &archInfo)); + /*warp up*/ + UNI_INFO_LOG("Warp up gpu:\n") + for (U32 i = 0; i < 2; i++) { + CHECK_STATUS(gcl_run_kernelVec(handle)); + } + + UNI_INFO_LOG("Run:\n") +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size())); + double time = handle->t_execute * 0.001; +#else + CHECK_STATUS(gcl_run_kernelVec(handle)); +#endif + CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true)); + output_gpu = output->mapPtrArray.back(); + + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)->(%u %u %u %u)", in, ic, ih, iw, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "ChannelResize", params); +#ifdef _DEBUG + double ops = 1.0 * on * oc * oh * ow; + ut_log(dt, buffer, ops, time); +#endif + + ut_check_a(output_gpu, output_cpu, on * oc * ow * oh, dt); + CHECK_STATUS(gcl_finish(handle)); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + free(input_cpu); + free(output_cpu); + return 0; +} +#endif + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + channelresizeTest(argc, argv, DT_F16); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_check.cpp b/compute/tensor/tests/test_check.cpp new file mode 100644 index 00000000..93dff6c0 --- /dev/null +++ b/compute/tensor/tests/test_check.cpp @@ -0,0 +1,95 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
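test_channel_resize_ocl.cpp above shows the lifecycle every *_ocl test in this directory follows: fetch the shared GCL handle, point `handle->kernelVec` at a fresh vector, infer sizes and allocate device memory (`alloc`, `alloc_map`, `alloc_bytes`), upload the input, call the operator once so its kernels are recorded, then warm up, run (timed under `_DEBUG`), and map the output back. The run phase can be distilled as below; a sketch that reuses only calls appearing in these tests and assumes the Bolt headers are available:

```cpp
#include "tensor_computing.h"
#include "gcl.h"

// Hedged sketch of the run phase shared by the *_ocl tests: once an operator
// call has recorded its kernels into handle->kernelVec, the tests warm up,
// execute, and map the output back to the host. The helper name is
// hypothetical; error handling is left to CHECK_STATUS.
inline void run_recorded_kernels(GCLHandle_t handle, GCLMem_t output, TensorDesc outputDesc)
{
    for (U32 i = 0; i < 2; i++) {  // warm up: run the recorded kernels twice
        CHECK_STATUS(gcl_run_kernelVec(handle));
    }
    CHECK_STATUS(gcl_run_kernelVec(handle));  // measured run
    CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true));
    // output->mapPtrArray.back() now points at the host-visible result
    CHECK_STATUS(gcl_finish(handle));
    CHECK_STATUS(gcl_clean_kernelVec(handle));
}
```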
+ +#include +#include "tensor_computing.h" +#include "ut_util.h" + +int checkTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 5); + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + DataFormat df = DF_NCHW; + CheckParamSpec p; + p.check_mode = CHECK_EQUAL; + + TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw); + U8 *inputA = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM); + U8 *inputB = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM); + + Tensor inputTensorA; + Tensor inputTensorB; + inputTensorA.resize(inDesc); + inputTensorB.resize(inDesc); + inputTensorA.alloc(); + inputTensorB.alloc(); + memcpy(get_ptr_from_tensor(inputTensorA, UT_ARCH), inputA, tensorNumBytes(inDesc)); + memcpy(get_ptr_from_tensor(inputTensorB, UT_ARCH), inputB, tensorNumBytes(inDesc)); + + Tensor outputTensor; + Tensor outputTensorRef; + CHECK_STATUS(check_infer_output_size({&inputTensorA, &inputTensorB}, &outputTensor, &archInfo)); + outputTensor.alloc(); + outputTensorRef.resize(outputTensor.get_desc()); + outputTensorRef.alloc(); + + if (UT_CHECK) { + CHECK_STATUS(check(inputTensorA, inputTensorB, p, outputTensor, &archInfo)); + + // naive implement + CHECK_STATUS(check(inputTensorA, inputTensorB, p, outputTensorRef, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), DT_I32, 0, + __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(check(inputTensorA, inputTensorB, p, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)=(%u)", in, ic, ih, iw, in); + sprintf(buffer, "%20s, %80s", "Check", params); + double ops = 1.0 * in * ic * ih * iw; + ut_log(dt, buffer, ops, time / UT_LOOPS); + + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + checkTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + checkTest(argc, argv, DT_F32); +#endif + checkTest(argc, argv, DT_U32); + return 0; +} diff --git a/compute/tensor/tests/test_clip.cpp b/compute/tensor/tests/test_clip.cpp new file mode 100644 index 00000000..32b4d251 --- /dev/null +++ b/compute/tensor/tests/test_clip.cpp @@ -0,0 +1,85 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#include "ut_util.h" + +int clipTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 4); + U32 len = atoi(argv[1]); + ClipParamSpec p; + p.min = atof(argv[2]); + p.max = atof(argv[3]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + TensorDesc inDesc = tensor1d(dt, len); + U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM); + Tensor inputTensor; + inputTensor.resize(inDesc); + inputTensor.alloc(); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inDesc)); + + Tensor outputTensor; + Tensor outputTensorRef; + CHECK_STATUS(clip_infer_output_size(&inputTensor, &outputTensor, &archInfo)); + outputTensor.alloc(); + outputTensorRef.resize(outputTensor.get_desc()); + outputTensorRef.alloc(); + + if (UT_CHECK) { + CHECK_STATUS(clip(inputTensor, p, outputTensor, &archInfo)); + + // naive implement + CHECK_STATUS(clip(inputTensor, p, outputTensorRef, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), dt, 0, __FILE__, + __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(clip(inputTensor, p, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u)=(%u)", len, len); + sprintf(buffer, "%20s, %80s", "Clip", params); + double ops = 2.0 * len; + ut_log(dt, buffer, ops, time / UT_LOOPS); + + free(input); + + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + clipTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + clipTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_concat.cpp b/compute/tensor/tests/test_concat.cpp new file mode 100644 index 00000000..9f5b0202 --- /dev/null +++ b/compute/tensor/tests/test_concat.cpp @@ -0,0 +1,135 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
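test_check and test_clip above repeat the harness that almost every CPU test here uses: run the optimized implementation (`UT_ARCH`) and the naive `CPU_GENERAL` reference on identical data, compare with `ut_check_v`, then time `UT_LOOPS` iterations of the optimized path. Distilled into one hypothetical helper (a sketch, not part of `ut_util.h`; the operator is passed as a lambda returning `EE`):

```cpp
#include "tensor_computing.h"
#include "ut_util.h"

// Hypothetical distillation of the shared test pattern: verify the optimized
// kernel against the CPU_GENERAL reference, then return ms per iteration.
// Call as: run_and_verify([&](Tensor &out, ArchInfo_t a) {
//     return clip(inputTensor, p, out, a); }, outputTensor, outputTensorRef, dt, 0);
template <typename RunOp>
double run_and_verify(
    RunOp op, Tensor &outputTensor, Tensor &outputTensorRef, DataType dt, F32 threshold)
{
    ArchInfo archInfo, archInfo_org;
    archInfo.arch = UT_ARCH;          // optimized implementation under test
    archInfo_org.arch = CPU_GENERAL;  // naive reference implementation

    if (UT_CHECK) {
        CHECK_STATUS(op(outputTensor, &archInfo));
        CHECK_STATUS(op(outputTensorRef, &archInfo_org));
        ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH),
            get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), dt, threshold,
            __FILE__, __LINE__);
    }

    double time_start = ut_time_ms();
    for (int iter = 0; iter < UT_LOOPS; iter++) {
        CHECK_STATUS(op(outputTensor, &archInfo));
    }
    return (ut_time_ms() - time_start) / UT_LOOPS;  // ms per iteration
}
```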
+ +#include + +#include "tensor_computing.h" +#include "ut_util.h" + +int concatTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc > 2); + ConcatParamSpec p; + int num = atoi(argv[1]); + p.axis = atoi(argv[2]); + CHECK_REQUIREMENT(p.axis == 0 || p.axis == 1); + CHECK_REQUIREMENT(argc == 1 + 2 + (num + 1) * 4); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + + std::vector inTensors(num); + std::vector inTensorPtr(num); + Tensor outTensor; + + for (int i = 0; i < num; i++) { + std::vector in_dim(4); + in_dim[0] = atoi(argv[3 + i * 4]); + in_dim[1] = atoi(argv[3 + i * 4 + 1]); + in_dim[2] = atoi(argv[3 + i * 4 + 2]); + in_dim[3] = atoi(argv[3 + i * 4 + 3]); + TensorDesc inDesc; + if (in_dim[1] % 8 == 0) { + inDesc = tensor4df(dt, DF_NCHWC8, in_dim[0], in_dim[1], in_dim[2], in_dim[3]); + } else { + inDesc = tensor4df(dt, DF_NCHW, in_dim[0], in_dim[1], in_dim[2], in_dim[3]); + } + inTensors[i].resize(inDesc); + inTensorPtr[i] = &inTensors[i]; + } + U32 on = atoi(argv[3 + num * 4]); + U32 oc = atoi(argv[3 + num * 4 + 1]); + U32 oh = atoi(argv[3 + num * 4 + 2]); + U32 ow = atoi(argv[3 + num * 4 + 3]); + + CHECK_STATUS(concat_infer_output_size(inTensorPtr, p, &outTensor, &archInfo)); + + U32 in_len = 0; + for (int i = 0; i < num; i++) { + in_len += inTensors[i].length(); + } + U32 out_len = outTensor.length(); + CHECK_REQUIREMENT(in_len == out_len && out_len == on * oc * oh * ow); + + // setup tmp + U32 tmpBytes; + CHECK_STATUS(concat_infer_forward_tmp_bytes(inTensors, &tmpBytes, &archInfo)); + Tensor tmpTensor; + tmpTensor.resize(tensor1d(DT_U8, tmpBytes)); + tmpTensor.alloc(); + + U8 *outputRef = ut_input_v(in_len, dt, UT_INIT_RANDOM); + U8 *tmp = ut_input_v(tmpBytes, dt, UT_INIT_RANDOM); + U8 *tmpPtr = tmp; + + U32 count = 0; + TensorDesc outDesc = outTensor.get_desc(); + for (int i = 0; i < num; i++) { + inTensors[i].alloc(); + TensorDesc inputDesc = inTensors[i].get_desc(); + U32 bytes = tensorNumBytes(inputDesc); + TensorDesc tmpDesc = inputDesc; + tmpDesc.df = outDesc.df; + U8 *srcPtr = (U8 *)get_ptr_from_tensor(inTensors[i], UT_ARCH); + if (inputDesc.df == DF_NCHW && outDesc.df == DF_NCHWC8) { + transformNCHWToNCHWC8(inputDesc, srcPtr, tmpDesc, tmpPtr); + srcPtr = tmpPtr; + } else if (inputDesc.df == DF_NCHWC8 && outDesc.df == DF_NCHW) { + transformToNCHW(inputDesc, srcPtr, tmpDesc, tmpPtr); + srcPtr = tmpPtr; + } + memcpy(outputRef + count, srcPtr, bytes); + count += bytes; + tmpPtr += bytes; + } + outTensor.alloc(); + + if (UT_CHECK) { + CHECK_STATUS(concat(inTensors, p, tmpTensor, outTensor, &archInfo)); + + // check + ut_check_v( + get_ptr_from_tensor(outTensor, UT_ARCH), outputRef, in_len, dt, 0, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(concat(inTensors, p, tmpTensor, outTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "%d (*)/%u=(%u %u %u %u)", num, p.axis, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Concat", params); + double ops = 1.0 * out_len; + ut_log(dt, buffer, ops, time); + + free(tmp); + free(outputRef); + + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + concatTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + concatTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_concat_int8.cpp b/compute/tensor/tests/test_concat_int8.cpp new file mode 
100644 index 00000000..0c1e7351 --- /dev/null +++ b/compute/tensor/tests/test_concat_int8.cpp @@ -0,0 +1,145 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include + +#include "tensor_computing.h" +#include "ut_util.h" + +#ifdef _USE_INT8 +int int8ConcatTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc > 2); + ConcatParamSpec p; + int num = atoi(argv[1]); + p.axis = atoi(argv[2]); + CHECK_REQUIREMENT(p.axis == 0 || p.axis == 1); + CHECK_REQUIREMENT(argc == 1 + 2 + (num + 1) * 4); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + + std::vector inTensors(num); + std::vector inTensorsRef(num); + std::vector inTensorPtr(num); + Tensor outTensor; + + for (int i = 0; i < num; i++) { + std::vector in_dim(4); + in_dim[0] = atoi(argv[3 + i * 4]); + in_dim[1] = atoi(argv[3 + i * 4 + 1]); + in_dim[2] = atoi(argv[3 + i * 4 + 2]); + in_dim[3] = atoi(argv[3 + i * 4 + 3]); + TensorDesc inDesc; + if (in_dim[1] % 8 == 0) { + inDesc = tensor4df(DT_I8, DF_NCHWC8, in_dim[0], in_dim[1], in_dim[2], in_dim[3]); + } else { + inDesc = tensor4df(DT_I8, DF_NCHW, in_dim[0], in_dim[1], in_dim[2], in_dim[3]); + } + inTensors[i].resize(inDesc); + inDesc.dt = dt; + inTensorsRef[i].resize(inDesc); + inTensorPtr[i] = &inTensors[i]; + } + U32 on = atoi(argv[3 + num * 4]); + U32 oc = atoi(argv[3 + num * 4 + 1]); + U32 oh = atoi(argv[3 + num * 4 + 2]); + U32 ow = atoi(argv[3 + num * 4 + 3]); + + CHECK_STATUS(concat_infer_output_size(inTensorPtr, p, &outTensor, &archInfo)); + + U32 in_len = 0; + for (int i = 0; i < num; i++) { + in_len += inTensors[i].length(); + } + U32 out_len = outTensor.length(); + CHECK_REQUIREMENT(in_len == out_len && out_len == on * oc * oh * ow); + + U8 *tmp = ut_input_v(in_len, dt, UT_INIT_RANDOM); + //INT8 *quant = (INT8 *)ut_input_v(in_len, DT_I8, UT_INIT_ZERO); + + U32 count = 0; + for (int i = 0; i < num; i++) { + //input_ref[i] = (void *)(tmp + count * bytesOf(dt)); + inTensorsRef[i].alloc(); + U32 floatBytes = inTensorsRef[i].bytes(); + memcpy(get_ptr_from_tensor(inTensorsRef[i], UT_ARCH), tmp + count, floatBytes); + + inTensors[i].alloc(); + TensorDesc dummy; + F16 scale = -1; + quantize_tensor(inTensorsRef[i].get_desc(), tmp + count, &dummy, + get_ptr_from_tensor(inTensors[i], UT_ARCH), &scale); + inTensors[i].set_scale(scale); + count += floatBytes; + } + + outTensor.alloc(); + U8 *out_d = ut_input_v(out_len, dt, UT_INIT_ZERO); + + Tensor tmpTensor; + + if (UT_CHECK) { + CHECK_STATUS(concat(inTensors, 
p, tmpTensor, outTensor, &archInfo)); + F32 scale_o = outTensor.get_scale(); + INT8 *output = (INT8 *)get_ptr_from_tensor(outTensor, UT_ARCH); + + for (U32 i = 0; i < out_len; i++) { + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + ((F16 *)out_d)[i] = output[i] / scale_o; + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + ((F32 *)out_d)[i] = output[i] / scale_o; + break; +#endif + default: + break; + } + } + + // check + ut_check_v(out_d, tmp, in_len, dt, 0.05, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(concat(inTensors, p, tmpTensor, outTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "%d (*)/%u=(%u %u %u %u)", num, p.axis, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Concat", params); + double ops = 1.0 * out_len; + ut_log(DT_I8, buffer, ops, time); + + free(tmp); + free(out_d); + return 0; +} +#endif + +int main(int argc, char **argv) +{ +#ifdef _USE_INT8 + int8ConcatTest(argc, argv, DT_F16); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_concat_ocl.cpp b/compute/tensor/tests/test_concat_ocl.cpp new file mode 100644 index 00000000..ad3a75c5 --- /dev/null +++ b/compute/tensor/tests/test_concat_ocl.cpp @@ -0,0 +1,187 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "tensor_computing.h" +#include "ut_util.h" +#include "libkernelsource.h" +#include +#include "gcl.h" +#include + +#ifdef _USE_FP16 +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} + +int concatTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc > 2); + ConcatParamSpec p; + int num = atoi(argv[1]); + p.axis = atoi(argv[2]); + CHECK_REQUIREMENT(argc == 1 + 2 + (num + 1) * 4); + std::vector inputDesc(num); + std::vector inputTensorCpu; + std::vector inputTensor; + for (int i = 0; i < num; i++) { + U32 n, c, h, w; + n = atoi(argv[3 + i * 4]); + c = atoi(argv[3 + i * 4 + 1]); + h = atoi(argv[3 + i * 4 + 2]); + w = atoi(argv[3 + i * 4 + 3]); + inputDesc[i] = tensor4df(dt, DF_NCHW, n, c, h, w); + std::shared_ptr tensorCpu(new Tensor()); + std::shared_ptr tensor(new Tensor(OCLMem)); + tensorCpu->resize(inputDesc[i]); + tensor->resize(inputDesc[i]); + inputTensorCpu.push_back(*tensorCpu.get()); + inputTensor.push_back(*tensor.get()); + } + U32 on = atoi(argv[3 + num * 4]); + U32 oc = atoi(argv[3 + num * 4 + 1]); + U32 oh = atoi(argv[3 + num * 4 + 2]); + U32 ow = atoi(argv[3 + num * 4 + 3]); + + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + std::vector inputTensorCpuPtr; + std::vector inputTensorPtr; + for (int i = 0; i < num; i++) { + inputTensorCpuPtr.push_back(&inputTensorCpu[i]); + } + for (int i = 0; i < num; i++) { + inputTensorPtr.push_back(&inputTensor[i]); + } + + std::shared_ptr handleSharedPtr = OCLContext::getInstance().handle; + GCLHandle_t handle = handleSharedPtr.get(); + std::vector kernelVec; + handle->kernelVec = &kernelVec; + Tensor outputTensor = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + + U32 in_len = 0; + for (int i = 0; i < num; i++) { + in_len += tensorNumElements(inputDesc[i]); + } + std::vector input_cpu(num); + U8 *tmp = ut_input_v(in_len, dt, UT_INIT_RANDOM); + U32 count = 0; + for (int i = 0; i < num; i++) { + input_cpu[i] = (void *)(tmp + count * bytesOf(dt)); + count += tensorNumElements(inputDesc[i]); + } + + MaliPara maliPara; + maliPara.handle = handle; + archInfo.archPara = &maliPara; + + CHECK_STATUS(concat_infer_output_size(inputTensorPtr, p, &outputTensor, &archInfo)); + TensorDesc outputDesc = outputTensor.get_desc(); + + U32 maxBytes = 0; + U32 tmpBytes = 0; + CHECK_STATUS(concat_infer_forward_tmp_bytes(inputTensor, &tmpBytes, &archInfo)); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + + GCLMem_t output = alloc_map(outputTensor); + for (int i = 0; i < num; i++) { + tmpBytes = tensorNumBytes(inputTensor[i].get_desc()); + maxBytes = (tmpBytes > maxBytes) ? 
tmpBytes : maxBytes; + } + GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes); + + for (int i = 0; i < num; i++) { + GCLMem_t input = alloc(inputTensor[i]); + CHECK_STATUS(gcl_fill_memory_zero(handle, input)); + CHECK_STATUS(ocl_set_input(handle, input, inputDesc[i], (U8 *)input_cpu[i], tmpbuf, true)); + } + + CHECK_STATUS(concat(inputTensor, p, tmpTensor, outputTensor, &archInfo)); + + /*warp up*/ + UNI_INFO_LOG("Warp up gpu:\n") + for (U32 i = 0; i < 2; i++) { + CHECK_STATUS(gcl_run_kernelVec(handle)); + } + + UNI_INFO_LOG("Run:\n") +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size())); + double time = handle->t_execute * 0.001; +#else + CHECK_STATUS(gcl_run_kernelVec(handle)); +#endif + CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true)); + U8 *output_gpu_val = output->mapPtrArray.back(); + + char buffer[150]; + char params[120]; + sprintf(params, "%d (*)/%u=(%u %u %u %u)", num, p.axis, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Concat", params); +#ifdef _DEBUG + double ops = 1.0 * on * oc * oh * ow; + ut_log(dt, buffer, ops, time); +#endif + for (int i = 0; i < num; i++) { + inputTensorCpu[i].alloc(); + memcpy(get_ptr_from_tensor(inputTensorCpu[i], UT_ARCH), input_cpu[i], + tensorNumBytes(inputDesc[i])); + } + + Tensor outputTensorCpu; + CHECK_STATUS(concat_infer_output_size(inputTensorCpuPtr, p, &outputTensorCpu, &archInfo_org)); + outputTensorCpu.alloc(); + + Tensor tmpTensorCpu; + CHECK_STATUS(concat(inputTensorCpu, p, tmpTensorCpu, outputTensorCpu, &archInfo_org)); + ut_check_a(output_gpu_val, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), on * oc * ow * oh, dt); + + CHECK_STATUS(gcl_finish(handle)); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + free(tmp); + return 0; +} +#endif +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + concatTest(argc, argv, DT_F16); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_convolution.cpp b/compute/tensor/tests/test_convolution.cpp new file mode 100644 index 00000000..7cedf58d --- /dev/null +++ b/compute/tensor/tests/test_convolution.cpp @@ -0,0 +1,172 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
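Several tests above (and the convolution test that follows) switch to DF_NCHWC8 whenever the channel count is a multiple of 8: the layout is logically (N, C/8, H, W, 8), so the 8 channel values of one pixel sit contiguously for SIMD, which is also why the transpose code earlier expands an NCHWC8 descriptor to five dims. A standalone sketch of the packing that `transformNCHWToNCHWC8` performs in the CPU concat test; the helper is hypothetical, with float data for illustration:

```cpp
#include <stdio.h>

// Sketch of NCHW -> NCHWC8 packing: channels are split into blocks of 8 and
// the block becomes the innermost dimension, i.e. (N, C/8, H, W, 8).
// Assumes c % 8 == 0. Hypothetical helper, not the library implementation.
static void nchw_to_nchwc8(const float *src, float *dst, int n, int c, int h, int w)
{
    for (int ni = 0; ni < n; ni++) {
        for (int ci = 0; ci < c; ci++) {
            for (int hi = 0; hi < h; hi++) {
                for (int wi = 0; wi < w; wi++) {
                    int srcIdx = ((ni * c + ci) * h + hi) * w + wi;
                    int dstIdx =
                        (((ni * (c / 8) + ci / 8) * h + hi) * w + wi) * 8 + ci % 8;
                    dst[dstIdx] = src[srcIdx];
                }
            }
        }
    }
}

int main()
{
    // 1 x 8 x 1 x 2 example: after packing, the 8 channel values of pixel w0
    // come first, then those of w1.
    float src[16], dst[16];
    for (int i = 0; i < 16; i++) {
        src[i] = (float)i;
    }
    nchw_to_nchwc8(src, dst, 1, 8, 1, 2);
    for (int i = 0; i < 16; i++) {
        printf("%g ", dst[i]);  // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
    }
    printf("\n");
    return 0;
}
```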
+
+#include <string.h>  // for memcpy
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int convolutionTest(int argc, char *argv[], DataType dt)
+{
+    CHECK_REQUIREMENT(argc == 16);
+    // in data
+    U32 in = atoi(argv[1]);
+    U32 ic = atoi(argv[2]);
+    U32 ih = atoi(argv[3]);
+    U32 iw = atoi(argv[4]);
+    // weight
+    U32 fn = atoi(argv[5]);
+    U32 fc = atoi(argv[6]);
+    U32 fh = atoi(argv[7]);
+    U32 fw = atoi(argv[8]);
+    U32 group = atoi(argv[9]);
+    // stride & padding
+    U32 stride = atoi(argv[10]);
+    U32 padding = atoi(argv[11]);
+    // output
+    U32 on = atoi(argv[12]);
+    U32 oc = atoi(argv[13]);
+    U32 oh = atoi(argv[14]);
+    U32 ow = atoi(argv[15]);
+    CHECK_REQUIREMENT(in == 1 && on == 1);
+
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+    ActivationParamSpec activationDesc;
+    activationDesc.mode = ACTIVATION_RELU;
+    activationDesc.value[0] = 0;
+
+    TensorDesc inputDesc, outputDesc;
+    if (ic % 8 != 0) {
+        inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw);
+    } else {
+        inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw);
+    }
+    TensorDesc filterDesc = tensor4df(dt, DF_NCHW, fn, fc, fh, fw);
+    TensorDesc biasDesc = tensor1d(dt, oc);
+    ConvolutionParamSpec p = createConvolutionParamSpec(group, fh, fw, stride, stride, padding,
+        padding, padding, padding, 1, 1, fn, Convolution_Depthwise_Pointwise);
+
+    // setup input, filter, bias
+    U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM);
+    U8 *filter = ut_input_v(fn * fc * fh * fw, dt, UT_INIT_RANDOM);
+    U8 *bias = ut_input_v(oc, dt, UT_INIT_RANDOM);
+    Tensor inputTensor;
+    Tensor inputTensorRef;
+    Tensor filterTensor;
+    Tensor filterTensorRef;
+    Tensor outputTensor;
+    Tensor outputTensorRef;
+    Tensor biasTensor;
+
+    inputTensor.resize(inputDesc);
+    inputTensorRef.resize(inputDesc);
+    filterTensor.resize(filterDesc);
+    filterTensorRef.resize(filterDesc);
+    biasTensor.resize(biasDesc);
+
+    inputTensor.alloc();
+    inputTensorRef.alloc();
+    filterTensor.alloc();
+    filterTensorRef.alloc();
+    biasTensor.alloc();
+    memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw);
+    memcpy(get_ptr_from_tensor(inputTensorRef, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw);
+    memcpy(get_ptr_from_tensor(filterTensor, UT_ARCH), filter, bytesOf(dt) * fn * fc * fh * fw);
+    memcpy(get_ptr_from_tensor(filterTensorRef, UT_ARCH), filter, bytesOf(dt) * fn * fc * fh * fw);
+    memcpy(get_ptr_from_tensor(biasTensor, UT_ARCH), bias, bytesOf(dt) * oc);
+
+    // setup output, bias
+    CHECK_STATUS(
+        convolution_infer_output_size(&inputTensor, filterTensor, p, &outputTensor, dt, &archInfo));
+    outputDesc = outputTensor.get_desc();
+
+    outputTensor.alloc();
+    outputTensorRef.resize(outputTensor.get_desc());
+    outputTensorRef.alloc();
+
+    // setup alg
+    ConvolutionPolicy policy = CONVOLUTION_FASTEST;
+    ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL;
+    CHECK_STATUS(convolution_infer_forward_algorithm(
+        inputTensor, filterTensor, outputTensor, p, policy, &alg, dt, activationDesc, &archInfo));
+
+    // setup tmp
+    U32 tmpBytes;
+    CHECK_STATUS(convolution_infer_forward_tmp_bytes(
+        inputTensor, filterTensor, outputTensor, p, alg, &tmpBytes, &archInfo));
+    Tensor tmpTensor;
+    tmpTensor.resize(tensor1d(DT_U8, tmpBytes));
+    tmpTensor.alloc();
+
+    // setup filter trans
+    U32 ftmBytes;
+    CHECK_STATUS(convolution_transform_filter_bytes(filterTensor, p, alg, &ftmBytes, &archInfo));
+    // trans filter
+    Tensor ftmTensor;
+    ftmTensor.resize(tensor1d(DT_U8, ftmBytes));
+    ftmTensor.alloc();
+    CHECK_STATUS(
convolution_transform_filter(filterTensor, p, alg, tmpTensor, &ftmTensor, &archInfo)); + + if (UT_CHECK) { + CHECK_STATUS(convolution(inputTensor, ftmTensor, p, alg, nullptr, biasTensor, tmpTensor, + outputTensor, activationDesc, &archInfo)); + + // naive implement + CHECK_STATUS(convolution(inputTensorRef, filterTensorRef, p, alg, nullptr, biasTensor, + tmpTensor, outputTensorRef, activationDesc, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), dt, 5, __FILE__, + __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(convolution(inputTensor, ftmTensor, p, alg, nullptr, biasTensor, tmpTensor, + outputTensor, activationDesc, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + DataFormat df; + CHECK_STATUS(tensor4dGet(outputDesc, &dt, &df, &on, &oc, &oh, &ow)); + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, + fh, fw, group, stride, padding, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Convolution", params); + double ops = (1.0 * on * oc * oh * ow) * (2.0 * ic / group * fh * fw + 1); + ut_log(dt, buffer, ops, time); + + free(input); + free(filter); + free(bias); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + convolutionTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + convolutionTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_convolution_bnn.cpp b/compute/tensor/tests/test_convolution_bnn.cpp new file mode 100644 index 00000000..4ed4658f --- /dev/null +++ b/compute/tensor/tests/test_convolution_bnn.cpp @@ -0,0 +1,179 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
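+
+// Usage sketch (same 15-argument layout as test_convolution, inferred from the parsing below).
+// The filter is binary (DT_BIN11 by default), so the test assumes fn * fc * fh * fw is divisible
+// by 8 when packing the weights into BIN8 words.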
+
+#include <string.h>  // for memcpy
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int bnnConvolutionTest(int argc, char *argv[], DataType dt)
+{
+    CHECK_REQUIREMENT(argc == 16);
+    // in data
+    U32 in = atoi(argv[1]);
+    U32 ic = atoi(argv[2]);
+    U32 ih = atoi(argv[3]);
+    U32 iw = atoi(argv[4]);
+    // weight
+    U32 fn = atoi(argv[5]);
+    U32 fc = atoi(argv[6]);
+    U32 fh = atoi(argv[7]);
+    U32 fw = atoi(argv[8]);
+    U32 group = atoi(argv[9]);
+    // stride & padding
+    U32 stride = atoi(argv[10]);
+    U32 padding = atoi(argv[11]);
+    // output
+    U32 on = atoi(argv[12]);
+    U32 oc = atoi(argv[13]);
+    U32 oh = atoi(argv[14]);
+    U32 ow = atoi(argv[15]);
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+
+    CHECK_REQUIREMENT(in == 1 && on == 1);
+
+    DataType fdt = DT_BIN11;  // Use dt to distinguish DoReFa and XNOR
+    ActivationParamSpec activationDesc;
+    activationDesc.mode = ACTIVATION_NULL;
+
+    TensorDesc inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw);
+    TensorDesc filterDesc = tensor4df(fdt, DF_NCHW, oc, ic, fh, fw);
+    TensorDesc biasDesc = tensor1d(dt, oc * 2);  // including scale and bias
+    ConvolutionParamSpec p = createConvolutionParamSpec(group, fh, fw, stride, stride, padding,
+        padding, padding, padding, 1, 1, oc, Convolution_Depthwise_Pointwise);
+
+    // setup input, filter, bias
+    U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM);
+    if (fdt == DT_BIN01) {
+        for (U32 i = 0; i < in * ic * ih * iw; i++) {
+            switch (dt) {
+#ifdef _USE_FP16
+                case DT_F16:
+                    ((F16 *)input)[i] += 0.5;
+                    break;
+#endif
+#ifdef _USE_FP32
+                case DT_F32:
+                    ((F32 *)input)[i] += 0.5;
+                    break;
+#endif
+                default:
+                    break;
+            }
+        }
+    }
+
+    BIN8 *filter = (BIN8 *)ut_input_v(fn * fc * fh * fw / 8, fdt, UT_INIT_POS);
+    U8 *bias = ut_input_v(oc * 2, dt, UT_INIT_RANDOM);
+    Tensor inputTensor;
+    Tensor inputTensorRef;
+    Tensor filterTensor;
+    Tensor filterTensorRef;
+    Tensor outputTensor;
+    Tensor outputTensorRef;
+    Tensor biasTensor;
+
+    inputTensor.resize(inputDesc);
+    inputTensorRef.resize(inputDesc);
+    filterTensor.resize(filterDesc);
+    filterTensorRef.resize(filterDesc);
+    biasTensor.resize(biasDesc);
+
+    inputTensor.alloc();
+    inputTensorRef.alloc();
+    filterTensor.alloc();
+    filterTensorRef.alloc();
+    biasTensor.alloc();
+    memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw);
+    memcpy(get_ptr_from_tensor(inputTensorRef, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw);
+    memcpy(get_ptr_from_tensor(filterTensor, UT_ARCH), filter, tensorNumBytes(filterDesc));
+    memcpy(get_ptr_from_tensor(filterTensorRef, UT_ARCH), filter, tensorNumBytes(filterDesc));
+    memcpy(get_ptr_from_tensor(biasTensor, UT_ARCH), bias, tensorNumBytes(biasDesc));
+
+    // setup output, bias
+    CHECK_STATUS(
+        convolution_infer_output_size(&inputTensor, filterTensor, p, &outputTensor, dt, &archInfo));
+
+    outputTensor.alloc();
+    outputTensorRef.resize(outputTensor.get_desc());
+    outputTensorRef.alloc();
+
+    // setup alg
+    ConvolutionPolicy policy = CONVOLUTION_FASTEST;
+    ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL;
+    CHECK_STATUS(convolution_infer_forward_algorithm(
+        inputTensor, filterTensor, outputTensor, p, policy, &alg, fdt, activationDesc, &archInfo));
+
+    // setup tmp
+    U32 tmpBytes;
+    CHECK_STATUS(convolution_infer_forward_tmp_bytes(
+        inputTensor, filterTensor, outputTensor, p, alg, &tmpBytes, &archInfo));
+    Tensor tmpTensor;
+    tmpTensor.resize(tensor1d(DT_U8, tmpBytes));
+    tmpTensor.alloc();
+    // setup filter trans
+    U32 ftmBytes;
+
CHECK_STATUS(convolution_transform_filter_bytes(filterTensor, p, alg, &ftmBytes, &archInfo)); + // trans filter + Tensor ftmTensor; + ftmTensor.resize(tensor1d(DT_U8, ftmBytes)); + ftmTensor.alloc(); + + CHECK_STATUS( + convolution_transform_filter(filterTensor, p, alg, tmpTensor, &ftmTensor, &archInfo)); + + if (UT_CHECK) { + CHECK_STATUS(convolution(inputTensor, ftmTensor, p, alg, nullptr, biasTensor, tmpTensor, + outputTensor, activationDesc, &archInfo)); + // naive implement + CHECK_STATUS(convolution(inputTensorRef, filterTensorRef, p, alg, nullptr, biasTensor, + tmpTensor, outputTensorRef, activationDesc, &archInfo_org)); + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), dt, 1, __FILE__, + __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(convolution(inputTensor, ftmTensor, p, alg, nullptr, biasTensor, tmpTensor, + outputTensor, activationDesc, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh, + fw, stride, padding, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "BNN Convolution", params); + double ops = (1.0 * on * oc * oh * ow) * (2.0 * ic * fh * fw + 1); + ut_log(DT_I8, buffer, ops, time); + + free(input); + free(filter); + free(bias); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + bnnConvolutionTest(argc, argv, DT_F16); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_convolution_int8.cpp b/compute/tensor/tests/test_convolution_int8.cpp new file mode 100644 index 00000000..45ef38ce --- /dev/null +++ b/compute/tensor/tests/test_convolution_int8.cpp @@ -0,0 +1,249 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
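+
+// Usage sketch (same 15-argument layout as test_convolution, inferred from the parsing below).
+// The input is quantized to INT8 on the fly, so ic must be a multiple of 8; otherwise the test
+// prints a warning and exits early without running anything.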
+
+#include <string.h>  // for memcpy
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+#ifdef _USE_INT8
+int int8ConvolutionTest(int argc, char *argv[], DataType dt, DataType filterDataType)
+{
+    CHECK_REQUIREMENT(argc == 16);
+    // in data
+    U32 in = atoi(argv[1]);
+    U32 ic = atoi(argv[2]);
+    U32 ih = atoi(argv[3]);
+    U32 iw = atoi(argv[4]);
+    // weight
+    U32 fn = atoi(argv[5]);
+    U32 fc = atoi(argv[6]);
+    U32 fh = atoi(argv[7]);
+    U32 fw = atoi(argv[8]);
+    U32 group = atoi(argv[9]);
+    // stride & padding
+    U32 stride = atoi(argv[10]);
+    U32 padding = atoi(argv[11]);
+    // output
+    U32 on = atoi(argv[12]);
+    U32 oc = atoi(argv[13]);
+    U32 oh = atoi(argv[14]);
+    U32 ow = atoi(argv[15]);
+    CHECK_REQUIREMENT(in == 1 && on == 1);
+
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+    ActivationParamSpec activationDesc;
+    activationDesc.mode = ACTIVATION_RELU;
+    activationDesc.value[0] = 0;
+
+    TensorDesc inputDesc, filterDesc, outputDesc, biasDesc;
+    ConvolutionParamSpec p = createConvolutionParamSpec(group, fh, fw, stride, stride, padding,
+        padding, padding, padding, 1, 1, fn, Convolution_Depthwise_Pointwise);
+
+    if (ic % 8 != 0) {
+        printf("[WARN] cannot quantize the first layer\n");
+        return 0;
+    } else {
+        DataType qdt = DT_I8;
+        TensorDesc inputDesc_ref = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw);
+        filterDesc = tensor4df(dt, DF_NCHW, oc, ic, fh, fw);
+        biasDesc = tensor1d(dt, oc);
+
+        // setup input, filter, bias
+        U8 *input_ref = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM);
+        U8 *filter = ut_input_v(fn * fc * fh * fw, dt, UT_INIT_RANDOM);
+        U8 *bias = ut_input_v(oc, dt, UT_INIT_RANDOM);
+
+        INT8 *input = (INT8 *)ut_input_v(in * ic * ih * iw, DT_I8, UT_INIT_ZERO);
+        F16 scale_i = -1;
+        quantize_tensor(inputDesc_ref, input_ref, &inputDesc, input, &scale_i);
+
+        Tensor inputTensor;
+        inputTensor.resize(inputDesc);
+        inputTensor.alloc();
+        memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc));
+
+        Tensor inputTensorRef;
+        inputTensorRef.resize(inputDesc_ref);
+        inputTensorRef.alloc();
+        memcpy(
+            get_ptr_from_tensor(inputTensorRef, UT_ARCH), input_ref, tensorNumBytes(inputDesc_ref));
+
+        Tensor filterTensor;
+        filterTensor.resize(filterDesc);
+        filterTensor.alloc();
+        memcpy(get_ptr_from_tensor(filterTensor, UT_ARCH), filter, tensorNumBytes(filterDesc));
+
+        Tensor filterTensorRef;
+        filterTensorRef.resize(filterDesc);
+        filterTensorRef.alloc();
+        memcpy(get_ptr_from_tensor(filterTensorRef, UT_ARCH), filter, tensorNumBytes(filterDesc));
+
+        Tensor biasTensor;
+        biasTensor.resize(biasDesc);
+        biasTensor.alloc();
+        memcpy(get_ptr_from_tensor(biasTensor, UT_ARCH), bias, tensorNumBytes(biasDesc));
+
+        Tensor outputTensor, outputTensorRef;
+
+        // setup output, bias
+        CHECK_STATUS(convolution_infer_output_size(
+            &inputTensor, filterTensor, p, &outputTensor, qdt, &archInfo));
+        outputTensor.alloc();
+
+        outputDesc = outputTensor.get_desc();
+        TensorDesc outputDesc_ref = outputTensor.get_desc();
+        outputDesc_ref.dt = dt;
+        outputTensorRef.resize(outputDesc_ref);
+        outputTensorRef.alloc();
+
+        // setup alg
+        ConvolutionPolicy policy = CONVOLUTION_FASTEST;
+        ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL;
+        CHECK_STATUS(convolution_infer_forward_algorithm(inputTensor, filterTensor, outputTensor, p,
+            policy, &alg, qdt, activationDesc, &archInfo));
+
+        F16 *scales;
+
+        // setup filter trans
+        U32 ftBytes;
+        Tensor ftmTensor, tmpTensor;
+
+        switch (alg) {
+            case CONVOLUTION_ALGORITHM_WINOGRAD: {
+                CHECK_STATUS(
convolution_transform_filter_bytes(filterTensor, p, alg, &ftBytes, &archInfo)); + + Tensor tFilter; + tFilter.resize(tensor1d(DT_U8, ftBytes)); + tFilter.alloc(); + + filterDesc.dt = filterDataType; // To label as int8 + filterTensor.resize(filterDesc); + CHECK_STATUS(convolution_transform_filter( + filterTensor, p, alg, tmpTensor, &tFilter, &archInfo)); + + TensorDesc ftmDesc = tFilter.get_desc(); + ftmDesc.dt = DT_I8; + ftmTensor.resize(ftmDesc); + ftmTensor.alloc(); + + scales = (F16 *)ut_input_v( + 38, DT_F16, UT_INIT_ZERO); // 1 for input, 1 for output and 36 for filter + CHECK_STATUS( + quantize_tensor(tFilter.get_desc(), get_ptr_from_tensor(tFilter, UT_ARCH), + &ftmDesc, get_ptr_from_tensor(ftmTensor, UT_ARCH), scales + 2)); + break; + } + default: { + Tensor qFilter; + TensorDesc qDesc = filterDesc; + qDesc.dt = DT_I8; + qFilter.resize(qDesc); + qFilter.alloc(); + scales = (F16 *)ut_input_v(3, DT_F16, UT_INIT_ZERO); + CHECK_STATUS(quantize_tensor( + filterDesc, filter, &qDesc, get_ptr_from_tensor(qFilter, UT_ARCH), scales + 2)); + + CHECK_STATUS( + convolution_transform_filter_bytes(qFilter, p, alg, &ftBytes, &archInfo)); + + ftmTensor.resize(tensor1d(DT_U8, ftBytes)); + ftmTensor.alloc(); + // trans filter + CHECK_STATUS( + convolution_transform_filter(qFilter, p, alg, tmpTensor, &ftmTensor, &archInfo)); + break; + } + } + + scales[0] = scale_i; + + // setup tmp + U32 tmpBytes; + CHECK_STATUS(convolution_infer_forward_tmp_bytes( + inputTensor, ftmTensor, outputTensor, p, alg, &tmpBytes, &archInfo)); + tmpTensor.resize(tensor1d(DT_U8, tmpBytes)); + tmpTensor.alloc(); + + if (UT_CHECK) { + CHECK_STATUS(convolution(inputTensor, ftmTensor, p, alg, scales, biasTensor, tmpTensor, + outputTensor, activationDesc, &archInfo)); + + // naive implement + CHECK_STATUS(convolution(inputTensorRef, filterTensorRef, p, alg, nullptr, biasTensor, + tmpTensor, outputTensorRef, activationDesc, &archInfo_org)); + + U32 output_size = outputTensor.length(); + U8 *out_d = ut_input_v(output_size, dt, UT_INIT_ZERO); + INT8 *output = (INT8 *)get_ptr_from_tensor(outputTensor, UT_ARCH); + for (U32 i = 0; i < output_size; i++) { + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + ((F32 *)out_d)[i] = output[i] / scales[1]; + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + ((F16 *)out_d)[i] = output[i] / scales[1]; + break; +#endif + default: + break; + } + } + ut_check_v(out_d, get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_size, dt, 8, + __FILE__, __LINE__); + free(out_d); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(convolution(inputTensor, ftmTensor, p, alg, scales, biasTensor, tmpTensor, + outputTensor, activationDesc, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + DataFormat df; + CHECK_STATUS(tensor4dGet(outputDesc, &dt, &df, &on, &oc, &oh, &ow)); + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, + fc, fh, fw, group, stride, padding, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Convolution", params); + double ops = (1.0 * on * oc * oh * ow) * (2.0 * ic * fh * fw / group + 1); + ut_log(DT_I8, buffer, ops, time); + + free(input); + free(filter); + free(bias); + free(input_ref); + free(scales); + } + return 0; +} +#endif + +int main(int argc, char **argv) +{ +#ifdef _USE_INT8 + int8ConvolutionTest(argc, argv, DT_F16, DT_F16_8Q); +#endif + return 0; +} diff 
--git a/compute/tensor/tests/test_convolution_ocl.cpp b/compute/tensor/tests/test_convolution_ocl.cpp
new file mode 100644
index 00000000..03b9a87a
--- /dev/null
+++ b/compute/tensor/tests/test_convolution_ocl.cpp
@@ -0,0 +1,296 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <string.h>  // for memcpy
+#include "tensor_computing.h"
+#include "ut_util.h"
+#include "gcl.h"
+#include "libkernelsource.h"
+inline GCLMem_t alloc(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_map(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->mapped_alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_bytes(Tensor tensor, U32 size)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    GCLMem_t ptr = NULL;
+    if (size > 0) {
+        mem->resize(tensor1d(DT_U8, size));
+        mem->alloc();
+        ptr = (GCLMem_t)mem->get_ptr();
+    }
+    return ptr;
+}
+
+inline GCLMem_t alloc_desc(Tensor tensor, GCLMemDesc desc)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->padding(desc);
+    mem->alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+int convolutionTest(int argc, char *argv[], DataType dt)
+{
+    U32 biasNum;
+    ArchInfo archInfo;
+    archInfo.arch = MALI;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+    U32 in = 1;
+    U32 ic = 4;
+    U32 ih = 4;
+    U32 iw = 4;
+    U32 fn = 4;
+    U32 fh = 3;
+    U32 fw = 3;
+    U32 group = 1;
+    U32 strideW = 1;
+    U32 strideH = 1;
+    U32 paddingT = 1;
+    U32 paddingB = 1;
+    U32 paddingL = 1;
+    U32 paddingR = 1;
+    if (argc == 9) {
+        ic = atoi(argv[1]);
+        ih = atoi(argv[2]);
+        iw = atoi(argv[3]);
+        fn = atoi(argv[4]);
+        fh = atoi(argv[5]);
+        fw = atoi(argv[6]);
+        strideH = atoi(argv[7]);
+        strideW = atoi(argv[7]);
+        paddingT = atoi(argv[8]);
+        paddingB = atoi(argv[8]);
+        paddingL = atoi(argv[8]);
+        paddingR = atoi(argv[8]);
+    }
+    if (argc == 13) {
+        ic = atoi(argv[1]);
+        ih = atoi(argv[2]);
+        iw = atoi(argv[3]);
+        fn = atoi(argv[4]);
+        fh = atoi(argv[5]);
+        fw = atoi(argv[6]);
+        strideH = atoi(argv[7]);
+        strideW = atoi(argv[8]);
+        paddingT = atoi(argv[9]);
+        paddingB = atoi(argv[10]);
+        paddingL = atoi(argv[11]);
+        paddingR = atoi(argv[12]);
+    }
+    U32 fc = ic;
+    U32 on = 1;
+    U32 oc = fn;
+    U32 oh = (ih + paddingT + paddingB - fh) / strideH + 1;
+    U32 ow = (iw + paddingL + paddingR - fw) / strideW + 1;
+    ActivationParamSpec activationDesc;
+    activationDesc.mode = ACTIVATION_NULL;
+    ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, fh, fw, strideH,
+        strideW, paddingT, paddingB, paddingL, paddingR, 1, 1, fn, Convolution_Depthwise_Pointwise);
+
+    TensorDesc inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw);
+    TensorDesc filterDesc = tensor4df(dt, DF_NCHW, fn, fc, fh, fw);
+    TensorDesc biasDesc = tensor1d(dt, oc);
+    U8 *input_cpu = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM);
+    U8 *filter_cpu = ut_input_v(fn * fc * fh * fw, dt, UT_INIT_RANDOM);
+    U8 *bias_cpu = ut_input_v(oc, dt, UT_INIT_RANDOM);
+    TensorDesc inputDesc_gpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw);
+
+    std::shared_ptr<GCLHandle> handleSharedPtr = OCLContext::getInstance().handle;
+    GCLHandle_t handle = handleSharedPtr.get();
+    std::vector<GCLKernelInfo> kernelVec;  // element type assumed to match handle->kernelVec
+    handle->kernelVec = &kernelVec;
+    Tensor inputTensor = Tensor(OCLMem);
+    Tensor outputTensor = Tensor(OCLMem);
+    Tensor filterTensorOrg = Tensor(OCLMem);
+    Tensor filterTensor = Tensor(OCLMem);
+    Tensor biasTensor = Tensor(OCLMem);
+    Tensor tmpTensor = Tensor(OCLMem);
+    inputTensor.resize(inputDesc_gpu);
+    filterTensor.resize(filterDesc);
+    filterTensorOrg.resize(filterDesc);
+    biasTensor.resize(biasDesc);
+    U32 str[3] = {1, 1, 1};
+    U32 off[3] = {0, 0, 0};
+    GCLMemDesc inputMemDesc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4);
+    ocl_set_desc(&inputTensor, inputMemDesc);
+
+    MaliPara maliPara;
+    ForwardRunInfoMali runInfo;
+    runInfo.algorithm = (I32)(CONVOLUTION_ALGORITHM_NULL);
+    maliPara.handle = handle;
+    maliPara.forwardRunInfo = &runInfo;
+    archInfo.archPara = &maliPara;
+
+    CHECK_STATUS(convolution_infer_output_size(
+        &inputTensor, filterTensor, convParamSpec, &outputTensor, dt, &archInfo));
+
+    ConvolutionPolicy policy = CONVOLUTION_TUNNING;
+    ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL;
+    CHECK_STATUS(convolution_infer_forward_algorithm(inputTensor, filterTensor, outputTensor,
+        convParamSpec, policy, &alg, dt, activationDesc, &archInfo));
+
+    U32 maxBytes = 0;
+    U32 tmpBytes;
+    CHECK_STATUS(convolution_infer_forward_tmp_bytes(
+        inputTensor, filterTensor, outputTensor, convParamSpec, alg, &tmpBytes, &archInfo));
+    maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes;
+
+    GCLMemDesc filterMemDesc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4);
+    maliPara.gclmemFilterDesc = &filterMemDesc;
+    U32 ftmBytes;
+    CHECK_STATUS(
+        convolution_transform_filter_bytes(filterTensor, convParamSpec, alg, &ftmBytes, &archInfo));
+
+    GCLMem_t output = alloc_map(outputTensor);
+    GCLMem_t input = alloc(inputTensor);
+    CHECK_STATUS(gcl_fill_memory_zero(handle, input));
+
+    GCLMemDesc desc = gclmem_build_desc();
+    if ((fh == 1 && fw == 1 && ih == 1 && iw == 1) || fn == 1) {
+        biasNum = oc;
+        desc.memType = GCL_MEM_BUF;
+        desc.byteSize = biasNum * bytesOf(dt);
+    } else {
+        biasNum = (oc + 3) / 4;
+        desc.memType = GCL_MEM_IMG_1D;
+        desc.byteSize = biasNum * 4 * bytesOf(dt);
+    }
+    desc.stride[0] = biasNum;
+    desc.stride[1] = 1;
+    desc.stride[2] = 1;
+    desc.offset[0] = 0;
+    desc.offset[1] = 0;
+    desc.offset[2] = 0;
+    desc.num = biasNum;
+    desc.memFormat = DF_NHWC;
+    desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
+    U8 *bias_cpu_align = NULL;
+    if ((oc & 3) != 0) {
+        // assign (not redeclare) so the aligned buffer is visible for the free() at the end
+        bias_cpu_align = ut_input_v((oc + 3) / 4 * 4, dt, UT_INIT_ZERO);
+        // copy only the oc valid values; the zero-initialized padding supplies the rest
+        memcpy(bias_cpu_align, bias_cpu, oc * bytesOf(dt));
+        desc.host_ptr = bias_cpu_align;
+    } else {
+        desc.host_ptr = bias_cpu;
+    }
+    alloc_desc(biasTensor, desc);
+
+    desc = filterMemDesc;
+    alloc_desc(filterTensor, desc);
+    desc.stride[0] = fw * fh;
+    desc.stride[1] = fc;
+    desc.stride[2] = fn;
+    desc.offset[0] = 0;
+    desc.offset[1] = 0;
+    desc.offset[2] = 0;
+    desc.byteSize = fw * fh * fc * fn * bytesOf(dt);
+    desc.num = fw * fh * fc * fn;
+    desc.memType = GCL_MEM_BUF;
+    desc.memFormat = DF_NCHW;
+    desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
+    desc.host_ptr = filter_cpu;
+    alloc_desc(filterTensorOrg, desc);
+
+    tmpBytes = tensorNumBytes(inputDesc_gpu);
+    maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes;
+    GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes);
+
+    CHECK_STATUS(convolution_transform_filter(
+        filterTensorOrg, convParamSpec, alg, tmpTensor, &filterTensor, &archInfo));
+
+    CHECK_STATUS(ocl_set_input(handle, input, inputDesc_gpu, input_cpu, tmpbuf, true));
+
+    CHECK_STATUS(convolution(inputTensor, filterTensor, convParamSpec, alg, nullptr, biasTensor,
+        tmpTensor, outputTensor, activationDesc, &archInfo));
+    /* warm up */
+    UNI_INFO_LOG("Warm up gpu:\n");
+    for (U32 i = 0; i < 2; i++) {
+        CHECK_STATUS(gcl_run_kernelVec(handle));
+    }
+
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size()));
+    double time = handle->t_execute * 0.001;
+#else
+    CHECK_STATUS(gcl_run_kernelVec(handle));
+#endif
+    TensorDesc outputDesc = outputTensor.get_desc();
+    CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true));
+    void *output_gpu = output->mapPtrArray.back();
+
+    char buffer[150];
+    char params[120];
+    sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u %u %u %u %u %u)=(%u %u %u %u)", in, ic, ih,
+        iw, fn, fc, fh, fw, group, strideH, strideW, paddingT, paddingB, paddingL, paddingR, on, oc,
+        oh, ow);
+    sprintf(buffer, "%20s, %80s", "Convolution", params);
+#ifdef _DEBUG
+    double ops = (1.0 * on * oc * oh * ow) * (2.0 * ic * fh * fw / group + 1);
+    ut_log(dt, buffer, ops, time);
+#endif
+    Tensor inputTensorCpu;
+    inputTensorCpu.resize(inputDesc);
+    inputTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc));
+
+    Tensor filterTensorCpu;
+    filterTensorCpu.resize(filterDesc);
+    filterTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(filterTensorCpu, UT_ARCH), filter_cpu, tensorNumBytes(filterDesc));
+
+    Tensor biasTensorCpu;
+    biasTensorCpu.resize(biasDesc);
+    biasTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(biasTensorCpu, UT_ARCH), bias_cpu, tensorNumBytes(biasDesc));
+
+    Tensor outputTensorCpu;
+    outputTensorCpu.resize(outputDesc);
+    outputTensorCpu.alloc();
+
+    Tensor tmpTensorCpu;
+    CHECK_STATUS(
+        convolution(inputTensorCpu, filterTensorCpu, convParamSpec, CONVOLUTION_ALGORITHM_GEMM,
+            nullptr, biasTensorCpu, tmpTensorCpu, outputTensorCpu, activationDesc, &archInfo_org));
+    ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), on * oc * ow * oh, dt);
+
+    CHECK_STATUS(gcl_finish(handle));
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
+    free(input_cpu);
+    free(filter_cpu);
+    free(bias_cpu);
+    if (bias_cpu_align) {
+        free(bias_cpu_align);
+    }
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    convolutionTest(argc, argv, DT_F16);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_deconvolution.cpp b/compute/tensor/tests/test_deconvolution.cpp
new file mode 100644
index 00000000..cd1b93aa
--- /dev/null
+++ b/compute/tensor/tests/test_deconvolution.cpp
@@ -0,0 +1,168 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <string.h>  // for memcpy
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int deconvolutionTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc == 16);
+    // in data
+    U32 in = atoi(argv[1]);
+    U32 ic = atoi(argv[2]);
+    U32 ih = atoi(argv[3]);
+    U32 iw = atoi(argv[4]);
+    // weight
+    U32 fn = atoi(argv[5]);
+    U32 fc = atoi(argv[6]);
+    U32 fh = atoi(argv[7]);
+    U32 fw = atoi(argv[8]);
+    U32 group = atoi(argv[9]);
+    // stride & padding
+    U32 stride = atoi(argv[10]);
+    U32 padding = atoi(argv[11]);
+    // output
+    U32 on = atoi(argv[12]);
+    U32 oc = atoi(argv[13]);
+    U32 oh = atoi(argv[14]);
+    U32 ow = atoi(argv[15]);
+    CHECK_REQUIREMENT(in == 1 && on == 1);
+    CHECK_REQUIREMENT(ic % 8 == 0 && oc % 8 == 0);
+
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+    ActivationParamSpec activationDesc;
+    activationDesc.mode = ACTIVATION_NULL;
+    ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, fh, fw, stride, stride,
+        padding, padding, padding, padding, 1, 1, fn, Convolution_Deconvolution);
+
+    TensorDesc outputDesc;
+    TensorDesc inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw);
+    TensorDesc filterDesc = tensor4df(dt, DF_NCHW, fn, fc, fh, fw);
+    TensorDesc biasDesc = tensor1d(dt, oc);
+
+    // setup input, filter, bias
+    U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM);
+    U8 *filter = ut_input_v(fn * fc * fh * fw, dt, UT_INIT_RANDOM);
+    U8 *bias = ut_input_v(oc, dt, UT_INIT_RANDOM);
+
+    Tensor inputTensor;
+    Tensor inputTensorRef;
+    Tensor filterTensor;
+    Tensor filterTensorRef;
+    Tensor outputTensor;
+    Tensor outputTensorRef;
+    Tensor biasTensor;
+
+    inputTensor.resize(inputDesc);
+    inputTensorRef.resize(inputDesc);
+    filterTensor.resize(filterDesc);
+    filterTensorRef.resize(filterDesc);
+    biasTensor.resize(biasDesc);
+
+    inputTensor.alloc();
+    inputTensorRef.alloc();
+    filterTensor.alloc();
+    filterTensorRef.alloc();
+    biasTensor.alloc();
+    memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw);
+    memcpy(get_ptr_from_tensor(inputTensorRef, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw);
+    memcpy(get_ptr_from_tensor(filterTensor, UT_ARCH), filter, tensorNumBytes(filterDesc));
+    memcpy(get_ptr_from_tensor(filterTensorRef, UT_ARCH), filter, tensorNumBytes(filterDesc));
+    memcpy(get_ptr_from_tensor(biasTensor, UT_ARCH), bias, tensorNumBytes(biasDesc));
+
+    // setup output, bias
+    CHECK_STATUS(deconvolution_infer_output_size(
+        &inputTensor, filterTensor, convParamSpec, &outputTensor, dt, &archInfo));
+    U32 output_size = outputTensor.length();
+
+    outputTensor.alloc();
+    outputTensorRef.resize(outputTensor.get_desc());
+    outputTensorRef.alloc();
+
+    // setup alg
+    ConvolutionPolicy policy = CONVOLUTION_FASTEST;
+    ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL;
+    CHECK_STATUS(deconvolution_infer_forward_algorithm(inputTensor, filterTensor, outputTensor,
+        convParamSpec, policy, &alg, dt, activationDesc, &archInfo));
+
+    // setup tmp
+    U32 tmpBytes;
+
CHECK_STATUS(deconvolution_infer_forward_tmp_bytes( + inputTensor, filterTensor, outputTensor, convParamSpec, alg, &tmpBytes, &archInfo)); + Tensor tmpTensor; + tmpTensor.resize(tensor1d(DT_U8, tmpBytes)); + tmpTensor.alloc(); + + // setup filter trans + U32 ftmBytes; + CHECK_STATUS(deconvolution_transform_filter_bytes( + filterTensor, convParamSpec, alg, &ftmBytes, &archInfo)); + Tensor ftmTensor; + ftmTensor.resize(tensor1d(DT_U8, ftmBytes)); + ftmTensor.alloc(); + + // trans filter + CHECK_STATUS(deconvolution_transform_filter( + filterTensor, convParamSpec, alg, tmpTensor, &ftmTensor, &archInfo)); + + if (UT_CHECK) { + CHECK_STATUS(deconvolution(inputTensor, ftmTensor, convParamSpec, alg, nullptr, biasTensor, + tmpTensor, outputTensor, activationDesc, &archInfo)); + + // naive implement + CHECK_STATUS(deconvolution(inputTensorRef, filterTensorRef, convParamSpec, alg, nullptr, + biasTensor, tmpTensor, outputTensorRef, activationDesc, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_size, dt, 1, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(deconvolution(inputTensor, ftmTensor, convParamSpec, alg, nullptr, biasTensor, + tmpTensor, outputTensor, activationDesc, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh, + fw, stride, padding, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Deconvolution", params); + double ops = (1.0 * on * oc * ih * iw) * (2.0 * ic * fh * fw + fh * fw); + ut_log(dt, buffer, ops, time); + + free(input); + free(filter); + free(bias); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + deconvolutionTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + deconvolutionTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_deconvolution_ocl.cpp b/compute/tensor/tests/test_deconvolution_ocl.cpp new file mode 100644 index 00000000..dd1d38c6 --- /dev/null +++ b/compute/tensor/tests/test_deconvolution_ocl.cpp @@ -0,0 +1,268 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
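+
+// Usage sketch, inferred from the 8-argument parsing below (defaults are used for any other argc):
+//   ./test_deconvolution_ocl ic ih iw fc fh fw stride padding
+// Output sizes follow oh = fh + stride * (ih - 1) - 2 * padding, and likewise for ow.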
+
+#include <string.h>  // for memcpy
+#include "tensor_computing.h"
+#include "ut_util.h"
+#include "gcl.h"
+#include "libkernelsource.h"
+#include <memory>  // for std::shared_ptr
+inline GCLMem_t alloc(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_map(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->mapped_alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_bytes(Tensor tensor, U32 size)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    GCLMem_t ptr = NULL;
+    if (size > 0) {
+        mem->resize(tensor1d(DT_U8, size));
+        mem->alloc();
+        ptr = (GCLMem_t)mem->get_ptr();
+    }
+    return ptr;
+}
+
+inline GCLMem_t alloc_desc(Tensor tensor, GCLMemDesc desc)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->padding(desc);
+    mem->alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+int deconvolutionTest(int argc, char *argv[], DataType dt)
+{
+    U32 biasNum;
+    ArchInfo archInfo;
+    archInfo.arch = MALI;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+    U32 in = 1;
+    U32 ic = 4;
+    U32 ih = 2;
+    U32 iw = 2;
+    U32 fn = 4;
+    U32 fh = 2;
+    U32 fw = 2;
+    U32 fc = 4;
+    U32 stride = 2;
+    U32 padding = 0;
+    U32 group = 1;
+    if (argc == 9) {
+        ic = atoi(argv[1]);
+        ih = atoi(argv[2]);
+        iw = atoi(argv[3]);
+        fc = atoi(argv[4]);
+        fh = atoi(argv[5]);
+        fw = atoi(argv[6]);
+        stride = atoi(argv[7]);
+        padding = atoi(argv[8]);
+        fn = ic;
+    }
+    U32 on = 1;
+    U32 oc = fc;
+    U32 oh = fh + stride * (ih - 1) - padding - padding;
+    U32 ow = fw + stride * (iw - 1) - padding - padding;
+
+    ActivationParamSpec activationDesc;
+    activationDesc.mode = ACTIVATION_NULL;
+    ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, fh, fw, stride, stride,
+        padding, padding, padding, padding, 1, 1, fn, Convolution_Deconvolution);
+
+    TensorDesc inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw);
+    TensorDesc filterDesc = tensor4df(dt, DF_NCHW, fn, fc, fh, fw);
+    TensorDesc biasDesc = tensor1d(dt, oc);
+    U8 *input_cpu = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM);
+    U8 *filter_cpu = ut_input_v(fn * fc * fh * fw, dt, UT_INIT_RANDOM);
+    U8 *bias_cpu = ut_input_v(oc, dt, UT_INIT_RANDOM);
+    U8 *output_gpu = NULL;
+    TensorDesc inputDesc_gpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw);
+
+    std::shared_ptr<GCLHandle> handleSharedPtr = OCLContext::getInstance().handle;
+    GCLHandle_t handle = handleSharedPtr.get();
+    std::vector<GCLKernelInfo> kernelVec;  // element type assumed to match handle->kernelVec
+    handle->kernelVec = &kernelVec;
+    Tensor inputTensor = Tensor(OCLMem);
+    Tensor outputTensor = Tensor(OCLMem);
+    Tensor filterTensorOrg = Tensor(OCLMem);
+    Tensor filterTensor = Tensor(OCLMem);
+    Tensor biasTensor = Tensor(OCLMem);
+    Tensor tmpTensor = Tensor(OCLMem);
+    inputTensor.resize(inputDesc_gpu);
+    filterTensor.resize(filterDesc);
+    filterTensorOrg.resize(filterDesc);
+    biasTensor.resize(biasDesc);
+
+    MaliPara maliPara;
+    ForwardRunInfoMali runInfo;
+    runInfo.algorithm = (I32)(CONVOLUTION_ALGORITHM_NULL);
+    maliPara.handle = handle;
+    maliPara.forwardRunInfo = &runInfo;
+    archInfo.archPara = &maliPara;
+
+    CHECK_STATUS(deconvolution_infer_output_size(
+        &inputTensor, filterTensor, convParamSpec, &outputTensor, dt, &archInfo));
+
+    ConvolutionPolicy policy = CONVOLUTION_TUNNING;
+    ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL;
+    CHECK_STATUS(deconvolution_infer_forward_algorithm(inputTensor, filterTensor, outputTensor,
+        convParamSpec, policy, &alg, dt, activationDesc, &archInfo));
+
+    U32 maxBytes = 0;
+    U32 tmpBytes;
+    CHECK_STATUS(deconvolution_infer_forward_tmp_bytes(
+        inputTensor, filterTensor, outputTensor, convParamSpec, alg, &tmpBytes, &archInfo));
+    maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes;
+
+    U32 ftmBytes;
+    U32 str[3] = {0, 0, 0};
+    U32 off[3] = {0, 0, 0};
+    GCLMemDesc filterMemDesc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4);
+    maliPara.gclmemFilterDesc = &filterMemDesc;
+    CHECK_STATUS(deconvolution_transform_filter_bytes(
+        filterTensor, convParamSpec, alg, &ftmBytes, &archInfo));
+
+    GCLMem_t output = alloc_map(outputTensor);
+    GCLMem_t input = alloc(inputTensor);
+    CHECK_STATUS(gcl_fill_memory_zero(handle, input));
+
+    GCLMemDesc desc = gclmem_build_desc();
+    biasNum = (oc + 3) / 4;
+    desc.memType = GCL_MEM_IMG_1D;
+    desc.byteSize = biasNum * 4 * bytesOf(dt);
+    desc.stride[0] = biasNum;
+    desc.stride[1] = 1;
+    desc.stride[2] = 1;
+    desc.offset[0] = 0;
+    desc.offset[1] = 0;
+    desc.offset[2] = 0;
+    desc.num = biasNum;
+    desc.memFormat = DF_NHWC;
+    desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
+    U8 *bias_cpu_align = NULL;
+    if ((oc & 3) != 0) {
+        // assign (not redeclare) so the aligned buffer is visible for the free() at the end
+        bias_cpu_align = ut_input_v((oc + 3) / 4 * 4, dt, UT_INIT_ZERO);
+        // copy only the oc valid values; the zero-initialized padding supplies the rest
+        memcpy(bias_cpu_align, bias_cpu, oc * bytesOf(dt));
+        desc.host_ptr = bias_cpu_align;
+    } else {
+        desc.host_ptr = bias_cpu;
+    }
+
+    alloc_desc(biasTensor, desc);
+
+    desc = filterMemDesc;
+    alloc_desc(filterTensor, desc);
+
+    desc.stride[0] = fw * fh;
+    desc.stride[1] = fc;
+    desc.stride[2] = fn;
+    desc.offset[0] = 0;
+    desc.offset[1] = 0;
+    desc.offset[2] = 0;
+    desc.byteSize = fw * fh * fc * fn * bytesOf(dt);
+    desc.num = fw * fh * fc * fn;
+    desc.memType = GCL_MEM_BUF;
+    desc.memFormat = DF_NCHW;
+    desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
+    desc.host_ptr = filter_cpu;
+    alloc_desc(filterTensorOrg, desc);
+
+    tmpBytes = tensorNumBytes(inputDesc_gpu);
+    maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes;
+    GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes);
+
+    CHECK_STATUS(deconvolution_transform_filter(
+        filterTensorOrg, convParamSpec, alg, tmpTensor, &filterTensor, &archInfo));
+    CHECK_STATUS(ocl_set_input(handle, input, inputDesc_gpu, input_cpu, tmpbuf, true));
+
+    CHECK_STATUS(deconvolution(inputTensor, filterTensor, convParamSpec, alg, nullptr, biasTensor,
+        tmpTensor, outputTensor, activationDesc, &archInfo));
+    /* warm up */
+    UNI_INFO_LOG("Warm up gpu:\n");
+    for (U32 i = 0; i < 2; i++) {
+        CHECK_STATUS(gcl_run_kernelVec(handle));
+    }
+
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size()));
+    double time = handle->t_execute * 0.001;
+#else
+    CHECK_STATUS(gcl_run_kernelVec(handle));
+#endif
+    TensorDesc outputDesc = outputTensor.get_desc();
+    CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true));
+    output_gpu = output->mapPtrArray.back();
+    char buffer[150];
+    char params[120];
+    sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh,
+        fw, stride, padding, on, oc, oh, ow);
+    sprintf(buffer, "%20s, %80s", "Deconvolution", params);
+#ifdef _DEBUG
+    double ops = (1.0 * on * oc * ih * iw) * (2.0 * ic * fh * fw + fh * fw);
+    ut_log(dt, buffer, ops, time);
+#endif
+    Tensor inputTensorCpu;
+    inputTensorCpu.resize(inputDesc);
+    inputTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc));
+
+    Tensor filterTensorCpu;
+    filterTensorCpu.resize(filterDesc);
+    filterTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(filterTensorCpu, UT_ARCH), filter_cpu, tensorNumBytes(filterDesc));
+
+    Tensor biasTensorCpu;
+    biasTensorCpu.resize(biasDesc);
+    biasTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(biasTensorCpu, UT_ARCH), bias_cpu, tensorNumBytes(biasDesc));
+
+    Tensor outputTensorCpu;
+    outputTensorCpu.resize(outputDesc);
+    outputTensorCpu.alloc();
+
+    Tensor tmpTensorCpu;
+    CHECK_STATUS(
+        deconvolution(inputTensorCpu, filterTensorCpu, convParamSpec, CONVOLUTION_ALGORITHM_GEMM,
+            nullptr, biasTensorCpu, tmpTensorCpu, outputTensorCpu, activationDesc, &archInfo_org));
+    ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), on * oc * ow * oh, dt);
+
+    CHECK_STATUS(gcl_finish(handle));
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
+    free(input_cpu);
+    free(filter_cpu);
+    free(bias_cpu);
+    if (bias_cpu_align) {
+        free(bias_cpu_align);
+    }
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    deconvolutionTest(argc, argv, DT_F16);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_depthwise_convolution.cpp b/compute/tensor/tests/test_depthwise_convolution.cpp
new file mode 100644
index 00000000..6b14c492
--- /dev/null
+++ b/compute/tensor/tests/test_depthwise_convolution.cpp
@@ -0,0 +1,257 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <string.h>  // for memcpy
+
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int depthwiseConvolutionTest(int argc, char *argv[], bool isFusedWithPw, DataType dt)
+{
+    CHECK_REQUIREMENT(argc == 16);
+    // in data
+    U32 in = atoi(argv[1]);
+    U32 ic = atoi(argv[2]);
+    U32 ih = atoi(argv[3]);
+    U32 iw = atoi(argv[4]);
+    // weight
+    U32 fn = atoi(argv[5]);
+    U32 fc = atoi(argv[6]);
+    U32 fh = atoi(argv[7]);
+    U32 fw = atoi(argv[8]);
+    U32 group = atoi(argv[9]);
+    // stride & padding
+    U32 stride = atoi(argv[10]);
+    U32 padding = atoi(argv[11]);
+    // output
+    U32 on = atoi(argv[12]);
+    U32 oc = atoi(argv[13]);
+    U32 oh = atoi(argv[14]);
+    U32 ow = atoi(argv[15]);
+    CHECK_REQUIREMENT(in == 1 && on == 1);
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+    ActivationParamSpec dwActivationParamSpec;
+    ActivationParamSpec pwActivationParamSpec;
+    dwActivationParamSpec.mode = ACTIVATION_NULL;
+    pwActivationParamSpec.mode = ACTIVATION_NULL;
+
+    TensorDesc inputDesc, dwFilterDesc, pwFilterDesc, outputDesc, dwBiasDesc, pwBiasDesc;
+    inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw);
+    dwFilterDesc = tensor4df(dt, DF_NCHW, 1, ic, fh, fw);
+    dwBiasDesc = tensor1d(dt, ic);
+    if (isFusedWithPw) {
+        pwFilterDesc = tensor4df(dt, DF_NCHW, oc, ic, 1, 1);
+        pwBiasDesc = tensor1d(dt, oc);
+    }
+    ConvolutionParamSpec p = createConvolutionParamSpec(group, fh, fw, stride, stride, padding,
+        padding, padding, padding, 1, 1, fn, Convolution_Depthwise);
+
+    // setup input, filter, bias
+    U8 *dwFilter = nullptr;
+    U8 *dwBias = nullptr;
+    U8 *pwFilter = nullptr;
+    U8 *pwBias = nullptr;
+
+    U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM);
+    dwFilter = ut_input_v(tensorNumElements(dwFilterDesc), dt, UT_INIT_RANDOM);
+    dwBias = ut_input_v(tensorNumElements(dwBiasDesc), dt, UT_INIT_RANDOM);
+    Tensor inputTensor;
+    Tensor inputTensorRef;
+    Tensor dwFilterTensor;
+    Tensor dwFilterTensorRef;
+    Tensor outputTensor;
+    Tensor outputTensorRef;
+    Tensor dwBiasTensor;
+
+    inputTensor.resize(inputDesc);
+    inputTensorRef.resize(inputDesc);
+    dwFilterTensor.resize(dwFilterDesc);
+    dwFilterTensorRef.resize(dwFilterDesc);
+    dwBiasTensor.resize(dwBiasDesc);
+
+    inputTensor.alloc();
+    inputTensorRef.alloc();
+    dwFilterTensor.alloc();
+    dwFilterTensorRef.alloc();
+    dwBiasTensor.alloc();
+    memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw);
+    memcpy(get_ptr_from_tensor(inputTensorRef, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw);
+    memcpy(get_ptr_from_tensor(dwFilterTensor, UT_ARCH), dwFilter, bytesOf(dt) * 1 * ic * fh * fw);
+    memcpy(
+        get_ptr_from_tensor(dwFilterTensorRef, UT_ARCH), dwFilter, bytesOf(dt) * 1 * ic * fh * fw);
+    memcpy(get_ptr_from_tensor(dwBiasTensor, UT_ARCH), dwBias, bytesOf(dt) * ic);
+    Tensor pwFilterTensor;
+    Tensor pwFilterTensorRef;
+    Tensor pwBiasTensor;
+    if (isFusedWithPw) {
+        pwFilter = ut_input_v(tensorNumElements(pwFilterDesc), dt, UT_INIT_RANDOM);
+        pwBias = ut_input_v(tensorNumElements(pwBiasDesc), dt, UT_INIT_RANDOM);
+
pwFilterTensor.resize(pwFilterDesc); + pwFilterTensorRef.resize(pwFilterDesc); + pwBiasTensor.resize(pwBiasDesc); + pwFilterTensor.alloc(); + pwFilterTensorRef.alloc(); + pwBiasTensor.alloc(); + memcpy( + get_ptr_from_tensor(pwFilterTensor, UT_ARCH), pwFilter, bytesOf(dt) * oc * ic * 1 * 1); + memcpy(get_ptr_from_tensor(pwFilterTensorRef, UT_ARCH), pwFilter, + bytesOf(dt) * oc * ic * 1 * 1); + memcpy(get_ptr_from_tensor(pwBiasTensor, UT_ARCH), pwBias, bytesOf(dt) * oc); + } + + // setup output, bias + if (isFusedWithPw) { + CHECK_STATUS(depthwise_pointwise_convolution_infer_output_size( + &inputTensor, dwFilterTensor, pwFilterTensor, p, &outputTensor, dt, &archInfo)); + } else { + CHECK_STATUS(depthwise_convolution_infer_output_size( + &inputTensor, dwFilterTensor, p, &outputTensor, dt, &archInfo)); + } + + outputTensor.alloc(); + outputTensorRef.resize(outputTensor.get_desc()); + outputTensorRef.alloc(); + + // setup alg + ConvolutionPolicy policy = CONVOLUTION_FASTEST; + DepthwiseConvolutionForwardAlgorithm alg = DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; + if (isFusedWithPw) { + CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_algorithm(inputTensor, + dwFilterTensor, pwFilterTensor, outputTensor, p, policy, &alg, dt, + dwActivationParamSpec, pwActivationParamSpec, &archInfo)); + } else { + CHECK_STATUS(depthwise_convolution_infer_forward_algorithm(inputTensor, dwFilterTensor, + outputTensor, p, policy, &alg, dt, dwActivationParamSpec, &archInfo)); + } + + // setup tmp + U32 tmpBytes; + if (isFusedWithPw) { + CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_tmp_bytes(inputTensor, + dwFilterTensor, pwFilterTensor, outputTensor, p, alg, &tmpBytes, &archInfo)); + } else { + CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes( + inputTensor, dwFilterTensor, outputTensor, p, alg, &tmpBytes, &archInfo)); + } + Tensor tmpTensor; + tmpTensor.resize(tensor1d(DT_U8, tmpBytes)); + tmpTensor.alloc(); + + // setup filter trans + U32 dwBytes, pwBytes; + if (isFusedWithPw) { + CHECK_STATUS(depthwise_pointwise_convolution_transform_filter_bytes( + dwFilterTensor, pwFilterTensor, p, alg, &dwBytes, &pwBytes, &archInfo)); + } else { + CHECK_STATUS(depthwise_convolution_transform_filter_bytes( + dwFilterTensor, p, alg, &dwBytes, &archInfo)); + } + Tensor dwFtmTensor; + dwFtmTensor.resize(tensor1d(DT_U8, dwBytes)); + dwFtmTensor.alloc(); + Tensor pwFtmTensor; + if (isFusedWithPw) { + pwFtmTensor.resize(tensor1d(DT_U8, pwBytes)); + pwFtmTensor.alloc(); + } + + // trans filter + if (isFusedWithPw) { + CHECK_STATUS(depthwise_pointwise_convolution_transform_filter( + dwFilterTensor, pwFilterTensor, p, alg, &dwFtmTensor, &pwFtmTensor, &archInfo)); + } else { + CHECK_STATUS( + depthwise_convolution_transform_filter(dwFilterTensor, p, alg, &dwFtmTensor, &archInfo)); + } + + if (UT_CHECK) { + if (isFusedWithPw) { + CHECK_STATUS(depthwise_pointwise_convolution(inputTensor, dwFtmTensor, pwFtmTensor, p, + alg, dwBiasTensor, pwBiasTensor, tmpTensor, outputTensor, dwActivationParamSpec, + pwActivationParamSpec, &archInfo)); + + // naive implement + CHECK_STATUS(depthwise_pointwise_convolution(inputTensorRef, dwFilterTensorRef, + pwFilterTensorRef, p, alg, dwBiasTensor, pwBiasTensor, tmpTensor, outputTensorRef, + dwActivationParamSpec, pwActivationParamSpec, &archInfo_org)); + } else { + CHECK_STATUS(depthwise_convolution(inputTensor, dwFtmTensor, p, alg, dwBiasTensor, + tmpTensor, outputTensor, dwActivationParamSpec, &archInfo)); + + // naive implement + 
CHECK_STATUS(depthwise_convolution(inputTensorRef, dwFilterTensorRef, p, alg, + dwBiasTensor, tmpTensor, outputTensorRef, dwActivationParamSpec, &archInfo_org)); + } + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), dt, 0.1, __FILE__, + __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + if (isFusedWithPw) { + CHECK_STATUS(depthwise_pointwise_convolution(inputTensor, dwFtmTensor, pwFtmTensor, p, + alg, dwBiasTensor, pwBiasTensor, tmpTensor, outputTensor, dwActivationParamSpec, + pwActivationParamSpec, &archInfo)); + } else { + CHECK_STATUS(depthwise_convolution(inputTensor, dwFtmTensor, p, alg, dwBiasTensor, + tmpTensor, outputTensor, dwActivationParamSpec, &archInfo)); + } + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh, + fw, stride, padding, on, oc, oh, ow); + double ops = 0; + if (isFusedWithPw) { + sprintf(buffer, "%20s, %80s", "DepthwisePointwise", params); + ops = 2.0 * in * ic * ih * iw * fh * fw + in * ic * oh * ow + 2.0 * on * oc * oh * ow * ic + + on * oc * oh * ow; + } else { + sprintf(buffer, "%20s, %80s", "DepthwiseConvolution", params); + ops = 2.0 * in * ic * ih * iw * fh * fw + in * ic * oh * ow; + } + ut_log(dt, buffer, ops, time); + + free(input); + free(dwFilter); + free(dwBias); + if (isFusedWithPw) { + free(pwFilter); + free(pwBias); + } + return 0; +} + +int main(int argc, char *argv[]) +{ +#ifdef _USE_FP16 + depthwiseConvolutionTest(argc, argv, true, DT_F16); + depthwiseConvolutionTest(argc, argv, false, DT_F16); +#endif +#ifdef _USE_FP32 + depthwiseConvolutionTest(argc, argv, true, DT_F32); + depthwiseConvolutionTest(argc, argv, false, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_depthwise_convolution_int8.cpp b/compute/tensor/tests/test_depthwise_convolution_int8.cpp new file mode 100644 index 00000000..f1dfcd56 --- /dev/null +++ b/compute/tensor/tests/test_depthwise_convolution_int8.cpp @@ -0,0 +1,190 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
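+
+// Usage sketch (same 15-argument layout as test_depthwise_convolution, inferred from the parsing
+// below). This test only exercises the fused depthwise + pointwise INT8 path; results are compared
+// against the CPU_GENERAL reference in I32.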
+ +#include + +#include "tensor_computing.h" +#include "ut_util.h" + +int main(int argc, char *argv[]) +{ +#ifdef _USE_INT8 + CHECK_REQUIREMENT(argc == 16); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // weight + U32 fn = atoi(argv[5]); + U32 fc = atoi(argv[6]); + U32 fh = atoi(argv[7]); + U32 fw = atoi(argv[8]); + U32 group = atoi(argv[9]); + // stride & padding + U32 stride = atoi(argv[10]); + U32 padding = atoi(argv[11]); + // output + U32 on = atoi(argv[12]); + U32 oc = atoi(argv[13]); + U32 oh = atoi(argv[14]); + U32 ow = atoi(argv[15]); + CHECK_REQUIREMENT(in == 1 && on == 1); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + DataType dt = DT_I8; + DataType odt = DT_I32; + ActivationParamSpec dwActivationParamSpec; + ActivationParamSpec pwActivationParamSpec; + dwActivationParamSpec.mode = ACTIVATION_RELU6; + pwActivationParamSpec.mode = ACTIVATION_RELU6; + + TensorDesc inputDesc, dwFilterDesc, pwFilterDesc, outputDesc, dwBiasDesc, pwBiasDesc; + inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + dwFilterDesc = tensor4df(dt, DF_NCHW, 1, ic, fh, fw); + pwFilterDesc = tensor4df(dt, DF_NCHW, oc, ic, 1, 1); + dwBiasDesc = tensor1d(odt, ic); + pwBiasDesc = tensor1d(odt, oc); + ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, fh, fw, stride, stride, + padding, padding, padding, padding, 1, 1, fn, Convolution_Depthwise); + + // setup input, filter, bias + INT8 *input = (INT8 *)ut_input_v(in * ic * ih * iw, DT_I8, UT_INIT_RANDOM); + INT8 *dwFilter = (INT8 *)ut_input_v(tensorNumElements(dwFilterDesc), DT_I8, UT_INIT_RANDOM); + INT8 *pwFilter = (INT8 *)ut_input_v(tensorNumElements(pwFilterDesc), DT_I8, UT_INIT_RANDOM); + I32 *dwBias = (I32 *)ut_input_v(ic, DT_I32, UT_INIT_RANDOM); + I32 *pwBias = (I32 *)ut_input_v(oc, DT_I32, UT_INIT_RANDOM); + + Tensor inputTensor; + Tensor inputTensorRef; + Tensor dwFilterTensor; + Tensor dwFilterTensorRef; + Tensor outputTensor; + Tensor outputTensorRef; + Tensor dwBiasTensor; + + inputTensor.resize(inputDesc); + inputTensorRef.resize(inputDesc); + dwFilterTensor.resize(dwFilterDesc); + dwFilterTensorRef.resize(dwFilterDesc); + dwBiasTensor.resize(dwBiasDesc); + + inputTensor.alloc(); + inputTensorRef.alloc(); + dwFilterTensor.alloc(); + dwFilterTensorRef.alloc(); + dwBiasTensor.alloc(); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw); + memcpy(get_ptr_from_tensor(inputTensorRef, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw); + memcpy(get_ptr_from_tensor(dwFilterTensor, UT_ARCH), dwFilter, bytesOf(dt) * 1 * ic * fh * fw); + memcpy( + get_ptr_from_tensor(dwFilterTensorRef, UT_ARCH), dwFilter, bytesOf(dt) * 1 * ic * fh * fw); + memcpy(get_ptr_from_tensor(dwBiasTensor, UT_ARCH), dwBias, bytesOf(dt) * ic); + + Tensor pwFilterTensor; + Tensor pwFilterTensorRef; + Tensor pwBiasTensor; + pwFilterTensor.resize(pwFilterDesc); + pwFilterTensorRef.resize(pwFilterDesc); + pwBiasTensor.resize(pwBiasDesc); + pwFilterTensor.alloc(); + pwFilterTensorRef.alloc(); + pwBiasTensor.alloc(); + memcpy(get_ptr_from_tensor(pwFilterTensor, UT_ARCH), pwFilter, bytesOf(dt) * oc * ic * 1 * 1); + memcpy(get_ptr_from_tensor(pwFilterTensorRef, UT_ARCH), pwFilter, bytesOf(dt) * oc * ic * 1 * 1); + memcpy(get_ptr_from_tensor(pwBiasTensor, UT_ARCH), pwBias, bytesOf(dt) * oc); + + // setup output, bias + CHECK_STATUS(depthwise_pointwise_convolution_infer_output_size(&inputTensor, 
dwFilterTensor, + pwFilterTensor, convParamSpec, &outputTensor, odt, &archInfo)); + outputTensor.alloc(); + outputTensorRef.resize(outputTensor.get_desc()); + outputTensorRef.alloc(); + + // setup alg + ConvolutionPolicy policy = CONVOLUTION_FASTEST; + DepthwiseConvolutionForwardAlgorithm alg = DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; + CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_algorithm(inputTensor, + dwFilterTensor, pwFilterTensor, outputTensor, convParamSpec, policy, &alg, dt, + dwActivationParamSpec, pwActivationParamSpec, &archInfo)); + + // setup tmp + U32 tmpBytes; + CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_tmp_bytes(inputTensor, + dwFilterTensor, pwFilterTensor, outputTensor, convParamSpec, alg, &tmpBytes, &archInfo)); + Tensor tmpTensor; + tmpTensor.resize(tensor1d(DT_U8, tmpBytes)); + tmpTensor.alloc(); + + // setup filter trans + U32 dwBytes, pwBytes; + CHECK_STATUS(depthwise_pointwise_convolution_transform_filter_bytes( + dwFilterTensor, pwFilterTensor, convParamSpec, alg, &dwBytes, &pwBytes, &archInfo)); + Tensor dwFtmTensor; + dwFtmTensor.resize(tensor1d(DT_U8, dwBytes)); + dwFtmTensor.alloc(); + Tensor pwFtmTensor; + pwFtmTensor.resize(tensor1d(DT_U8, pwBytes)); + pwFtmTensor.alloc(); + // trans filter + CHECK_STATUS(depthwise_pointwise_convolution_transform_filter( + dwFilterTensor, pwFilterTensor, convParamSpec, alg, &dwFtmTensor, &pwFtmTensor, &archInfo)); + + if (UT_CHECK) { + CHECK_STATUS(depthwise_pointwise_convolution(inputTensor, dwFtmTensor, pwFtmTensor, + convParamSpec, alg, dwBiasTensor, pwBiasTensor, tmpTensor, outputTensor, + dwActivationParamSpec, pwActivationParamSpec, &archInfo)); + + // naive implement + CHECK_STATUS(depthwise_pointwise_convolution(inputTensorRef, dwFilterTensorRef, + pwFilterTensorRef, convParamSpec, alg, dwBiasTensor, pwBiasTensor, tmpTensor, + outputTensorRef, dwActivationParamSpec, pwActivationParamSpec, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), DT_I32, 1, + __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(depthwise_pointwise_convolution(inputTensor, dwFtmTensor, pwFtmTensor, + convParamSpec, alg, dwBiasTensor, pwBiasTensor, tmpTensor, outputTensor, + dwActivationParamSpec, pwActivationParamSpec, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh, + fw, stride, padding, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "DepthwiseConvolution", params); + double ops = 2.0 * in * ic * ih * iw * fh * fw + in * ic * oh * ow + + 2.0 * on * oc * oh * ow * ic + on * oc * oh * ow; + ut_log(DT_I8, buffer, ops, time); + + free(input); + free(dwFilter); + free(pwFilter); + free(dwBias); + free(pwBias); +#endif + + return 0; +} diff --git a/compute/tensor/tests/test_depthwise_convolution_ocl.cpp b/compute/tensor/tests/test_depthwise_convolution_ocl.cpp new file mode 100644 index 00000000..d135ff50 --- /dev/null +++ b/compute/tensor/tests/test_depthwise_convolution_ocl.cpp @@ -0,0 +1,270 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <string.h>
+#include "tensor_computing.h"
+#include "ut_util.h"
+#include "gcl.h"
+#include "libkernelsource.h"
+
+#ifdef _USE_FP16
+inline GCLMem_t alloc(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_map(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->mapped_alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_bytes(Tensor tensor, U32 size)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    GCLMem_t ptr = NULL;
+    if (size > 0) {
+        mem->resize(tensor1d(DT_U8, size));
+        mem->alloc();
+        ptr = (GCLMem_t)mem->get_ptr();
+    }
+    return ptr;
+}
+
+inline GCLMem_t alloc_desc(Tensor tensor, GCLMemDesc desc)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->padding(desc);
+    mem->alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+int depthwiseConvolutionTest(int argc, char *argv[], DataFormat filterDataFormat, DataType dt)
+{
+    U32 in, ic, ih, iw;
+    U32 fn, fc, fh, fw;
+    U32 group, stride, padding;
+    U32 on, oc, oh, ow;
+    U32 biasNum;
+    ArchInfo archInfo;
+    archInfo.arch = MALI;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+
+    in = 1;
+    ic = 8;
+    ih = 4;
+    iw = 4;
+    fn = 1;
+    fc = 8;
+    fh = 3;
+    fw = 3;
+    group = 1;
+    stride = 1;
+    padding = 1;
+
+    if (argc == 9) {
+        ic = atoi(argv[1]);
+        ih = atoi(argv[2]);
+        iw = atoi(argv[3]);
+        fc = atoi(argv[4]);
+        fh = atoi(argv[5]);
+        fw = atoi(argv[6]);
+        stride = atoi(argv[7]);
+        padding = atoi(argv[8]);
+    }
+
+    on = 1;
+    oc = fc;
+    oh = (ih + padding * 2 - fh) / stride + 1;
+    ow = (iw + padding * 2 - fw) / stride + 1;
+    ActivationParamSpec dwActivationParamSpec;
+    dwActivationParamSpec.mode = ACTIVATION_NULL;
+    ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, fh, fw, stride, stride,
+        padding, padding, padding, padding, 1, 1, fn, Convolution_Depthwise);
+
+    U32 filterLen = fn * fc * fh * fw;
+    U32 biasLen = oc;
+    TensorDesc inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw);
+    TensorDesc filterDesc = tensor4df(dt, filterDataFormat, fn, fc, fh, fw);
+    TensorDesc biasDesc = tensor1d(dt, biasLen);
+    U8 *input_cpu = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM);
+    U8 *filter_cpu = ut_input_v(filterLen, dt, UT_INIT_RANDOM);
+    U8 *bias_cpu = ut_input_v(biasLen, dt, UT_INIT_RANDOM);
+
+    std::shared_ptr<GCLHandle> handleSharedPtr = OCLContext::getInstance().handle;
+    GCLHandle_t handle = handleSharedPtr.get();
+    std::vector<GCLKernelInfo> kernelVec;
+
handle->kernelVec = &kernelVec; + Tensor inputTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + Tensor filterTensorOrg = Tensor(OCLMem); + Tensor filterTensor = Tensor(OCLMem); + Tensor biasTensor = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + inputTensor.resize(inputDesc); + filterTensor.resize(filterDesc); + filterTensorOrg.resize(filterDesc); + biasTensor.resize(biasDesc); + + MaliPara maliPara; + ForwardRunInfoMali runInfo; + runInfo.algorithm = (I32)(DEPTHWISE_CONVOLUTION_ALGORITHM_NULL); + maliPara.handle = handle; + maliPara.forwardRunInfo = &runInfo; + archInfo.archPara = &maliPara; + + CHECK_STATUS(depthwise_convolution_infer_output_size( + &inputTensor, filterTensor, convParamSpec, &outputTensor, dt, &archInfo)); + ConvolutionPolicy policy = CONVOLUTION_TUNNING; + DepthwiseConvolutionForwardAlgorithm alg = DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; + CHECK_STATUS(depthwise_convolution_infer_forward_algorithm(inputTensor, filterTensor, + outputTensor, convParamSpec, policy, &alg, dt, dwActivationParamSpec, &archInfo)); + + U32 maxBytes = 0; + U32 tmpBytes; + CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes( + inputTensor, filterTensor, outputTensor, convParamSpec, alg, &tmpBytes, &archInfo)); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + + U32 str[3] = {0, 0, 0}; + U32 off[3] = {0, 0, 0}; + GCLMemDesc filterMemDesc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); + maliPara.gclmemFilterDesc = &filterMemDesc; + U32 ftmBytes; + CHECK_STATUS(depthwise_convolution_transform_filter_bytes( + filterTensor, convParamSpec, alg, &ftmBytes, &archInfo)); + + GCLMem_t output = alloc_map(outputTensor); + GCLMem_t input = alloc(inputTensor); + CHECK_STATUS(gcl_fill_memory_zero(handle, input)); + + GCLMemDesc desc = gclmem_build_desc(); + biasNum = (oc + 3) / 4; + desc.memType = GCL_MEM_IMG_1D; + desc.byteSize = biasNum * 4 * bytesOf(dt); + desc.stride[0] = biasNum; + desc.stride[1] = 1; + desc.stride[2] = 1; + desc.offset[0] = 0; + desc.offset[1] = 0; + desc.offset[2] = 0; + desc.num = biasNum; + desc.memFormat = DF_NHWC; + desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + desc.host_ptr = bias_cpu; + alloc_desc(biasTensor, desc); + + desc = filterMemDesc; + alloc_desc(filterTensor, desc); + + desc.stride[0] = fw * fh; + desc.stride[1] = fc; + desc.stride[2] = fn; + desc.offset[0] = 0; + desc.offset[1] = 0; + desc.offset[2] = 0; + desc.byteSize = fw * fh * fc * fn * bytesOf(dt); + desc.num = fw * fh * fc * fn; + desc.memType = GCL_MEM_BUF; + desc.memFormat = DF_NCHW; + desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + desc.host_ptr = filter_cpu; + alloc_desc(filterTensorOrg, desc); + + tmpBytes = tensorNumBytes(inputDesc); + maxBytes = (tmpBytes > maxBytes) ? 
tmpBytes : maxBytes;
+    GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes);
+
+    CHECK_STATUS(depthwise_convolution_transform_filter(
+        filterTensorOrg, convParamSpec, alg, &filterTensor, &archInfo));
+
+    CHECK_STATUS(ocl_set_input(handle, input, inputDesc, input_cpu, tmpbuf, true));
+
+    CHECK_STATUS(depthwise_convolution(inputTensor, filterTensor, convParamSpec, alg, biasTensor,
+        tmpTensor, outputTensor, dwActivationParamSpec, &archInfo));
+
+    /*warm up*/
+    UNI_INFO_LOG("Warm up gpu:\n")
+    for (U32 i = 0; i < 2; i++) {
+        CHECK_STATUS(gcl_run_kernelVec(handle));
+    }
+
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size()));
+    double time = handle->t_execute * 0.001;
+#else
+    CHECK_STATUS(gcl_run_kernelVec(handle));
+#endif
+    TensorDesc outputDesc = outputTensor.get_desc();
+    CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true));
+    void *output_gpu = output->mapPtrArray.back();
+    char buffer[150];
+    char params[120];
+    sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh,
+        fw, stride, padding, on, oc, oh, ow);
+    sprintf(buffer, "%20s, %80s", "DepthwiseConvolution", params);
+#ifdef _DEBUG
+    double ops = 2.0 * in * ic * ih * iw * fh * fw + in * ic * oh * ow;
+    ut_log(dt, buffer, ops, time);
+#endif
+    Tensor inputTensorCpu;
+    inputTensorCpu.resize(inputDesc);
+    inputTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc));
+
+    Tensor filterTensorCpu;
+    filterTensorCpu.resize(filterDesc);
+    filterTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(filterTensorCpu, UT_ARCH), filter_cpu, tensorNumBytes(filterDesc));
+
+    Tensor biasTensorCpu;
+    biasTensorCpu.resize(biasDesc);
+    biasTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(biasTensorCpu, UT_ARCH), bias_cpu, tensorNumBytes(biasDesc));
+
+    Tensor outputTensorCpu;
+    outputTensorCpu.resize(outputDesc);
+    outputTensorCpu.alloc();
+
+    Tensor tmpTensorCpu;
+    // setup tmp
+    CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes(inputTensorCpu, filterTensorCpu,
+        outputTensorCpu, convParamSpec, alg, &tmpBytes, &archInfo));
+    tmpTensorCpu.resize(tensor1d(DT_F16, tmpBytes / bytesOf(DT_F16)));
+    tmpTensorCpu.alloc();
+
+    CHECK_STATUS(depthwise_convolution(inputTensorCpu, filterTensorCpu, convParamSpec,
+        DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, biasTensorCpu, tmpTensorCpu, outputTensorCpu,
+        dwActivationParamSpec, &archInfo_org));
+    ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), on * oc * ow * oh, dt);
+
+    CHECK_STATUS(gcl_finish(handle));
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
+    free(input_cpu);
+    free(filter_cpu);
+    free(bias_cpu);
+    return 0;
+}
+#endif
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    depthwiseConvolutionTest(argc, argv, DF_NCHW, DT_F16);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_depthwise_pointwise_convolution_ocl.cpp b/compute/tensor/tests/test_depthwise_pointwise_convolution_ocl.cpp
new file mode 100644
index 00000000..b87b7126
--- /dev/null
+++ b/compute/tensor/tests/test_depthwise_pointwise_convolution_ocl.cpp
@@ -0,0 +1,361 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "tensor_computing.h" +#include "ut_util.h" +#include "gcl.h" +#include "libkernelsource.h" + +#ifdef _USE_FP16 +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} + +inline GCLMem_t alloc_desc(Tensor tensor, GCLMemDesc desc) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->padding(desc); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +int depthwisePointwiseConvolutionTest( + int argc, char *argv[], DataFormat filterDataFormat, DataType dt) +{ + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 group, stride, padding; + U32 on, oc, oh, ow; + U32 biasNum; + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + in = 1; + ic = 8; + ih = 4; + iw = 4; + fn = 8; + fh = 3; + fw = 3; + group = 1; + stride = 1; + padding = 1; + + if (argc == 9) { + ic = atoi(argv[1]); + ih = atoi(argv[2]); + iw = atoi(argv[3]); + fn = atoi(argv[4]); + fh = atoi(argv[5]); + fw = atoi(argv[6]); + stride = atoi(argv[7]); + padding = atoi(argv[8]); + } + fc = ic; + on = 1; + oc = fn; + oh = (ih + padding * 2 - fh) / stride + 1; + ow = (iw + padding * 2 - fw) / stride + 1; + ActivationParamSpec dwActivationParamSpec; + ActivationParamSpec pwActivationParamSpec; + dwActivationParamSpec.mode = ACTIVATION_NULL; + pwActivationParamSpec.mode = ACTIVATION_NULL; + ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, fh, fw, stride, stride, + padding, padding, padding, padding, 1, 1, fn, Convolution_Depthwise_Pointwise); + + U32 dwFilterLen = 1 * fc * fh * fw; + U32 pwFilterLen = fn * fc * 1 * 1; + U32 dwBiasLen = fc; + U32 pwBiasLen = fn; + + TensorDesc inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + TensorDesc dwFilterDesc = tensor4df(dt, filterDataFormat, 1, fc, fh, fw); + TensorDesc pwFilterDesc = tensor4df(dt, filterDataFormat, fn, fc, 1, 1); + TensorDesc dwBiasDesc = tensor1d(dt, dwBiasLen); + TensorDesc pwBiasDesc = tensor1d(dt, pwBiasLen); + + U8 *input_cpu = ut_input_v(in * ic * 
ih * iw, dt, UT_INIT_RANDOM);
+    U8 *dw_filter_cpu = ut_input_v(dwFilterLen, dt, UT_INIT_RANDOM);
+    U8 *pw_filter_cpu = ut_input_v(pwFilterLen, dt, UT_INIT_RANDOM);
+    U8 *dw_bias_cpu = ut_input_v(dwBiasLen, dt, UT_INIT_RANDOM);
+    U8 *pw_bias_cpu = ut_input_v(pwBiasLen, dt, UT_INIT_RANDOM);
+
+    std::shared_ptr<GCLHandle> handleSharedPtr = OCLContext::getInstance().handle;
+    GCLHandle_t handle = handleSharedPtr.get();
+    std::vector<GCLKernelInfo> kernelVec;
+    handle->kernelVec = &kernelVec;
+    Tensor inputTensor = Tensor(OCLMem);
+    Tensor outputTensor = Tensor(OCLMem);
+    Tensor dwFilterTensorOrg = Tensor(OCLMem);
+    Tensor dwFilterTensor = Tensor(OCLMem);
+    Tensor pwFilterTensorOrg = Tensor(OCLMem);
+    Tensor pwFilterTensor = Tensor(OCLMem);
+    Tensor dwBiasTensor = Tensor(OCLMem);
+    Tensor pwBiasTensor = Tensor(OCLMem);
+    Tensor pwBiasTensorBuf = Tensor(OCLMem);
+    Tensor pwBiasTensorImg = Tensor(OCLMem);
+    Tensor tmpTensor = Tensor(OCLMem);
+    inputTensor.resize(inputDesc);
+    dwFilterTensor.resize(dwFilterDesc);
+    dwFilterTensorOrg.resize(dwFilterDesc);
+    pwFilterTensor.resize(pwFilterDesc);
+    pwFilterTensorOrg.resize(pwFilterDesc);
+    dwBiasTensor.resize(dwBiasDesc);
+    pwBiasTensor.resize(pwBiasDesc);
+    pwBiasTensorBuf.resize(pwBiasDesc);
+    pwBiasTensorImg.resize(pwBiasDesc);
+
+    MaliPara maliPara;
+    ForwardRunInfoMali runInfo;
+    runInfo.algorithm = (I32)(DEPTHWISE_CONVOLUTION_ALGORITHM_NULL);
+    maliPara.handle = handle;
+    maliPara.forwardRunInfo = &runInfo;
+    archInfo.archPara = &maliPara;
+
+    CHECK_STATUS(depthwise_pointwise_convolution_infer_output_size(
+        &inputTensor, dwFilterTensor, pwFilterTensor, convParamSpec, &outputTensor, dt, &archInfo));
+    ConvolutionPolicy policy = CONVOLUTION_TUNNING;
+    DepthwiseConvolutionForwardAlgorithm alg = DEPTHWISE_CONVOLUTION_ALGORITHM_NULL;
+    CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_algorithm(inputTensor,
+        dwFilterTensor, pwFilterTensor, outputTensor, convParamSpec, policy, &alg, DT_F16,
+        dwActivationParamSpec, pwActivationParamSpec, &archInfo));
+
+    U32 maxBytes = 0;
+    U32 tmpBytes;
+    CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_tmp_bytes(inputTensor,
+        dwFilterTensor, pwFilterTensor, outputTensor, convParamSpec, alg, &tmpBytes, &archInfo));
+    maxBytes = (tmpBytes > maxBytes) ?
tmpBytes : maxBytes; + + U32 dwBytes; + U32 pwBytes; + U32 str[3] = {0, 0, 0}; + U32 off[3] = {0, 0, 0}; + GCLMemDesc filterMemDesc[2]; + filterMemDesc[0] = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); + filterMemDesc[1] = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); + maliPara.gclmemFilterDesc = filterMemDesc; + CHECK_STATUS(depthwise_pointwise_convolution_transform_filter_bytes( + dwFilterTensor, pwFilterTensor, convParamSpec, alg, &dwBytes, &pwBytes, &archInfo)); + + GCLMem_t output = alloc_map(outputTensor); + GCLMem_t input = alloc(inputTensor); + CHECK_STATUS(gcl_fill_memory_zero(handle, input)); + + GCLMemDesc desc = gclmem_build_desc(); + biasNum = (oc + 3) / 4; + desc.memType = GCL_MEM_IMG_1D; + desc.byteSize = biasNum * 4 * bytesOf(dt); + desc.stride[0] = biasNum; + desc.stride[1] = 1; + desc.stride[2] = 1; + desc.offset[0] = 0; + desc.offset[1] = 0; + desc.offset[2] = 0; + desc.num = biasNum; + desc.memFormat = DF_NHWC; + desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + desc.host_ptr = pw_bias_cpu; + alloc_desc(pwBiasTensorImg, desc); + + biasNum = (oc + 7) / 8 * 8; + desc.memType = GCL_MEM_BUF; + desc.byteSize = biasNum * bytesOf(dt); + desc.stride[0] = biasNum; + desc.stride[1] = 1; + desc.stride[2] = 1; + desc.offset[0] = 0; + desc.offset[1] = 0; + desc.offset[2] = 0; + desc.num = biasNum; + desc.memFormat = DF_NHWC; + desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + desc.host_ptr = pw_bias_cpu; + alloc_desc(pwBiasTensorBuf, desc); + + biasNum = (ic + 3) / 4; + desc.memType = GCL_MEM_IMG_1D; + desc.byteSize = biasNum * 4 * bytesOf(dt); + desc.stride[0] = biasNum; + desc.stride[1] = 1; + desc.stride[2] = 1; + desc.offset[0] = 0; + desc.offset[1] = 0; + desc.offset[2] = 0; + desc.num = biasNum; + desc.memFormat = DF_NHWC; + desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + desc.host_ptr = dw_bias_cpu; + alloc_desc(dwBiasTensor, desc); + + desc = filterMemDesc[0]; + alloc_desc(dwFilterTensor, desc); + desc = filterMemDesc[1]; + alloc_desc(pwFilterTensor, desc); + + desc.stride[0] = fw * fh; + desc.stride[1] = fc; + desc.stride[2] = 1; + desc.offset[0] = 0; + desc.offset[1] = 0; + desc.offset[2] = 0; + desc.byteSize = fw * fh * fc * bytesOf(dt); + desc.num = fw * fh * fc; + desc.memType = GCL_MEM_BUF; + desc.memFormat = DF_NCHW; + desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + desc.host_ptr = dw_filter_cpu; + alloc_desc(dwFilterTensorOrg, desc); + + desc.stride[0] = 1; + desc.stride[1] = fc; + desc.stride[2] = fn; + desc.offset[0] = 0; + desc.offset[1] = 0; + desc.offset[2] = 0; + desc.byteSize = fn * fc * bytesOf(dt); + desc.num = fn * fc; + desc.memType = GCL_MEM_BUF; + desc.memFormat = DF_NCHW; + desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + desc.host_ptr = pw_filter_cpu; + alloc_desc(pwFilterTensorOrg, desc); + + tmpBytes = tensorNumBytes(inputDesc); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes); + + CHECK_STATUS(depthwise_pointwise_convolution_transform_filter(dwFilterTensorOrg, + pwFilterTensorOrg, convParamSpec, alg, &dwFilterTensor, &pwFilterTensor, &archInfo)); + pwBiasTensor = (runInfo.algorithm == (I32)(DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM)) + ? 
pwBiasTensorBuf
+        : pwBiasTensorImg;
+
+    CHECK_STATUS(ocl_set_input(handle, input, inputDesc, input_cpu, tmpbuf, true));
+
+    CHECK_STATUS(depthwise_pointwise_convolution(inputTensor, dwFilterTensor, pwFilterTensor,
+        convParamSpec, alg, dwBiasTensor, pwBiasTensor, tmpTensor, outputTensor,
+        dwActivationParamSpec, pwActivationParamSpec, &archInfo));
+    /*warm up*/
+    UNI_INFO_LOG("Warm up gpu:\n")
+    for (U32 i = 0; i < 2; i++) {
+        CHECK_STATUS(gcl_run_kernelVec(handle));
+    }
+
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size()));
+    double time = handle->t_execute * 0.001;
+#else
+    CHECK_STATUS(gcl_run_kernelVec(handle));
+#endif
+    TensorDesc outputDesc = outputTensor.get_desc();
+    CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true));
+    void *output_gpu = output->mapPtrArray.back();
+    char buffer[150];
+    char params[120];
+    sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh,
+        fw, stride, padding, on, oc, oh, ow);
+    sprintf(buffer, "%20s, %80s", "DepthwisePointwise", params);
+#ifdef _DEBUG
+    double ops = 2.0 * in * ic * ih * iw * fh * fw + in * ic * oh * ow +
+        2.0 * on * oc * oh * ow * ic + on * oc * oh * ow;
+    ut_log(dt, buffer, ops, time);
+#endif
+    Tensor inputTensorCpu;
+    inputTensorCpu.resize(inputDesc);
+    inputTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc));
+
+    Tensor dwFilterTensorCpu;
+    dwFilterTensorCpu.resize(dwFilterDesc);
+    dwFilterTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(dwFilterTensorCpu, UT_ARCH), dw_filter_cpu,
+        tensorNumBytes(dwFilterDesc));
+
+    Tensor pwFilterTensorCpu;
+    pwFilterTensorCpu.resize(pwFilterDesc);
+    pwFilterTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(pwFilterTensorCpu, UT_ARCH), pw_filter_cpu,
+        tensorNumBytes(pwFilterDesc));
+
+    Tensor dwBiasTensorCpu;
+    dwBiasTensorCpu.resize(dwBiasDesc);
+    dwBiasTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(dwBiasTensorCpu, UT_ARCH), dw_bias_cpu, tensorNumBytes(dwBiasDesc));
+
+    Tensor pwBiasTensorCpu;
+    pwBiasTensorCpu.resize(pwBiasDesc);
+    pwBiasTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(pwBiasTensorCpu, UT_ARCH), pw_bias_cpu, tensorNumBytes(pwBiasDesc));
+
+    Tensor outputTensorCpu;
+    outputTensorCpu.resize(outputDesc);
+    outputTensorCpu.alloc();
+
+    Tensor tmpTensorCpu;
+    // setup tmp
+    CHECK_STATUS(
+        depthwise_pointwise_convolution_infer_forward_tmp_bytes(inputTensorCpu, dwFilterTensorCpu,
+            pwFilterTensorCpu, outputTensorCpu, convParamSpec, alg, &tmpBytes, &archInfo));
+    tmpTensorCpu.resize(tensor1d(DT_F16, tmpBytes / bytesOf(DT_F16)));
+    tmpTensorCpu.alloc();
+
+    CHECK_STATUS(depthwise_pointwise_convolution(inputTensorCpu, dwFilterTensorCpu,
+        pwFilterTensorCpu, convParamSpec, DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, dwBiasTensorCpu,
+        pwBiasTensorCpu, tmpTensorCpu, outputTensorCpu, dwActivationParamSpec,
+        pwActivationParamSpec, &archInfo_org));
+    ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), on * oc * ow * oh, dt);
+
+    CHECK_STATUS(gcl_finish(handle));
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
+    free(input_cpu);
+    free(dw_filter_cpu);
+    free(pw_filter_cpu);
+    free(dw_bias_cpu);
+    free(pw_bias_cpu);
+    return 0;
+}
+#endif
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    depthwisePointwiseConvolutionTest(argc, argv, DF_NCHW, DT_F16);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_detectionoutput.cpp b/compute/tensor/tests/test_detectionoutput.cpp
new file mode 100644
index 00000000..4ad46012
--- /dev/null
+++ b/compute/tensor/tests/test_detectionoutput.cpp
@@ -0,0 +1,145 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int detectionoutputTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc == 11);
+    // in0 loc
+    U32 ih0 = atoi(argv[1]);
+    U32 iw0 = atoi(argv[2]);
+    // in1 conf
+    U32 ih1 = atoi(argv[3]);
+    U32 iw1 = atoi(argv[4]);
+    // in2 priorbox
+    U32 in2 = atoi(argv[5]);
+    U32 ic2 = atoi(argv[6]);
+    U32 ilens2 = atoi(argv[7]);
+    // output
+    U32 oh = atoi(argv[8]);
+    U32 ow = atoi(argv[9]);
+    U32 num_class = atoi(argv[10]);
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+
+    DetectionOutputParamSpec detectionoutput_desc;
+    detectionoutput_desc.num_class = num_class;
+    detectionoutput_desc.nms_top_k = 400;
+    detectionoutput_desc.nms_threshold = 0.449999988079;
+    detectionoutput_desc.keep_top_k = 200;
+    detectionoutput_desc.confidence_threshold = 0.00999999977648;
+
+    std::vector<Tensor> inputTensors(3);
+    std::vector<Tensor *> inputTensorPtrs(3);
+    Tensor inputTensor_loc, inputTensor_conf, inputTensor_priorbox;
+    TensorDesc inputDesc_loc = tensor2d(dt, ih0, iw0);
+    TensorDesc inputDesc_conf = tensor2d(dt, ih1, iw1);
+    TensorDesc inputDesc_priorbox = tensor3d(dt, in2, ic2, ilens2);
+    inputTensor_loc.resize(inputDesc_loc);
+    inputTensor_conf.resize(inputDesc_conf);
+    inputTensor_priorbox.resize(inputDesc_priorbox);
+    inputTensor_loc.alloc();
+    inputTensor_conf.alloc();
+    inputTensor_priorbox.alloc();
+    U32 input_len_loc = tensorNumElements(inputDesc_loc);
+    U32 input_len_conf = tensorNumElements(inputDesc_conf);
+    U32 input_len_priorbox = tensorNumElements(inputDesc_priorbox);
+    U8 *input_loc = ut_input_v(input_len_loc, dt, UT_INIT_RANDOM);
+    U8 *input_conf = ut_input_v(input_len_conf, dt, UT_INIT_RANDOM);
+    U8 *input_priorbox = ut_input_v(input_len_priorbox, dt, UT_INIT_RANDOM);
+    memcpy(get_ptr_from_tensor(inputTensor_loc, UT_ARCH), input_loc, tensorNumBytes(inputDesc_loc));
+    memcpy(
+        get_ptr_from_tensor(inputTensor_conf, UT_ARCH), input_conf, tensorNumBytes(inputDesc_conf));
+    memcpy(get_ptr_from_tensor(inputTensor_priorbox, UT_ARCH), input_priorbox,
+        tensorNumBytes(inputDesc_priorbox));
+    inputTensors[0] = inputTensor_loc;
+    inputTensors[1] = inputTensor_conf;
+    inputTensors[2] = inputTensor_priorbox;
+    inputTensorPtrs[0] = &inputTensors[0];
+    inputTensorPtrs[1] = &inputTensors[1];
+
inputTensorPtrs[2] = &inputTensors[2]; + // set output + Tensor outputTensor, outputTensorRef; + CHECK_STATUS(detectionoutput_infer_output_size( + inputTensorPtrs, detectionoutput_desc, &outputTensor, &archInfo)); + outputTensor.alloc(); + TensorDesc outputDesc_ref = outputTensor.get_desc(); + outputTensorRef.resize(outputDesc_ref); + outputTensorRef.alloc(); + U32 output_len = outputTensor.length(); + CHECK_REQUIREMENT(input_len_loc == ih0 * iw0 && input_len_conf == ih1 * iw1 && + input_len_priorbox == in2 * ic2 * ilens2 && output_len == oh * ow); + if (UT_CHECK) { + CHECK_STATUS(detectionoutput(inputTensors, detectionoutput_desc, outputTensor, &archInfo)); + CHECK_STATUS( + detectionoutput(inputTensors, detectionoutput_desc, outputTensorRef, &archInfo_org)); + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.05, __FILE__, __LINE__); + } + U32 num_detected_max = detectionoutput_desc.keep_top_k; +#ifdef _USE_FP16 + if (dt == DT_F16) { + F16 *output_f16 = (F16 *)get_ptr_from_tensor(outputTensor, UT_ARCH); + int idx = 0; + for (U32 i = 0; i < 1 + num_detected_max; i++) { + if (i >= 1 && output_f16[idx] == 0) { + break; + } + std::cout << " 1 : " << output_f16[idx] << " 2 : " << output_f16[idx + 1] + << " 3 : " << output_f16[idx + 2] << " 4 : " << output_f16[idx + 3] + << " 5 : " << output_f16[idx + 4] << " 6 : " << output_f16[idx + 5] + << std::endl; + idx = idx + 6; + } + } +#endif + if (dt == DT_F32) { + F32 *output_f32 = (F32 *)get_ptr_from_tensor(outputTensorRef, UT_ARCH); + int idx = 0; + for (U32 i = 0; i < 1 + num_detected_max; i++) { + if (i >= 1 && output_f32[idx] == 0) { + break; + } + std::cout << " 1 : " << output_f32[idx] << " 2 : " << output_f32[idx + 1] + << " 3 : " << output_f32[idx + 2] << " 4 : " << output_f32[idx + 3] + << " 5 : " << output_f32[idx + 4] << " 6 : " << output_f32[idx + 5] + << std::endl; + idx = idx + 6; + } + } + + free(input_loc); + free(input_conf); + free(input_priorbox); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + std::cout << "----- Testing FP16 Detectionoutput -----" << std::endl; + detectionoutputTest(argc, argv, DT_F16); + std::cout << "----- Finished FP16 Detectionoutput -----" << std::endl; +#endif +#ifdef _USE_FP32 + std::cout << "----- Testing FP32 Detectionoutput -----" << std::endl; + detectionoutputTest(argc, argv, DT_F32); + std::cout << "----- Finished FP32 Detectionoutput -----" << std::endl; +#endif + return 0; +} diff --git a/compute/tensor/tests/test_dilated_convolution.cpp b/compute/tensor/tests/test_dilated_convolution.cpp new file mode 100644 index 00000000..8aff5695 --- /dev/null +++ b/compute/tensor/tests/test_dilated_convolution.cpp @@ -0,0 +1,169 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "tensor_computing.h" +#include "ut_util.h" + +int dilatedConvolutionTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 17); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // weight + U32 fn = atoi(argv[5]); + U32 fc = atoi(argv[6]); + U32 fh = atoi(argv[7]); + U32 fw = atoi(argv[8]); + U32 group = atoi(argv[9]); + // stride & padding + U32 stride = atoi(argv[10]); + U32 padding = atoi(argv[11]); + + // dilation rate + U32 rate = atoi(argv[12]); + + // output + U32 on = atoi(argv[13]); + U32 oc = atoi(argv[14]); + U32 oh = atoi(argv[15]); + U32 ow = atoi(argv[16]); + CHECK_REQUIREMENT(in == 1 && on == 1); + + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_RELU; + activationDesc.value[0] = 0; + TensorDesc outputDesc; + TensorDesc inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + TensorDesc filterDesc = tensor4df(dt, DF_NCHW, oc, ic, fh, fw); + TensorDesc biasDesc = tensor1d(dt, oc); + ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, fh, fw, stride, stride, + padding, padding, padding, padding, rate, rate, fn, Convolution_Dilation); + + // setup input, filter, bias + U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + U8 *filter = ut_input_v(fn * fc * fh * fw, dt, UT_INIT_RANDOM); + U8 *bias = ut_input_v(oc, dt, UT_INIT_RANDOM); + + Tensor inputTensor; + Tensor inputTensorRef; + Tensor filterTensor; + Tensor filterTensorRef; + Tensor outputTensor; + Tensor outputTensorRef; + Tensor biasTensor; + + inputTensor.resize(inputDesc); + inputTensorRef.resize(inputDesc); + filterTensor.resize(filterDesc); + filterTensorRef.resize(filterDesc); + biasTensor.resize(biasDesc); + + inputTensor.alloc(); + inputTensorRef.alloc(); + filterTensor.alloc(); + filterTensorRef.alloc(); + biasTensor.alloc(); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw); + memcpy(get_ptr_from_tensor(inputTensorRef, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw); + memcpy(get_ptr_from_tensor(filterTensor, UT_ARCH), filter, tensorNumBytes(filterDesc)); + memcpy(get_ptr_from_tensor(filterTensorRef, UT_ARCH), filter, tensorNumBytes(filterDesc)); + memcpy(get_ptr_from_tensor(biasTensor, UT_ARCH), bias, tensorNumBytes(biasDesc)); + + // setup output, bias + CHECK_STATUS(convolution_infer_output_size( + &inputTensor, filterTensor, convParamSpec, &outputTensor, dt, &archInfo)); + outputTensor.alloc(); + outputTensorRef.resize(outputTensor.get_desc()); + outputTensorRef.alloc(); + + // setup alg + ConvolutionPolicy policy = CONVOLUTION_FASTEST; + ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL; + CHECK_STATUS(convolution_infer_forward_algorithm(inputTensor, filterTensor, outputTensor, + convParamSpec, policy, &alg, dt, activationDesc, &archInfo)); + + // setup tmp + U32 tmpBytes; + 
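+    // tmpBytes is filled in by the library for the algorithm selected above;
+    // the scratch tensor allocated below is reused by every convolution() call.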
CHECK_STATUS(convolution_infer_forward_tmp_bytes( + inputTensor, filterTensor, outputTensor, convParamSpec, alg, &tmpBytes, &archInfo)); + Tensor tmpTensor; + tmpTensor.resize(tensor1d(DT_U8, tmpBytes)); + tmpTensor.alloc(); + + // setup filter trans + U32 ftmBytes; + CHECK_STATUS( + convolution_transform_filter_bytes(filterTensor, convParamSpec, alg, &ftmBytes, &archInfo)); + Tensor ftmTensor; + ftmTensor.resize(tensor1d(DT_U8, ftmBytes)); + ftmTensor.alloc(); + // trans filter + CHECK_STATUS(convolution_transform_filter( + filterTensor, convParamSpec, alg, tmpTensor, &ftmTensor, &archInfo)); + + if (UT_CHECK) { + CHECK_STATUS(convolution(inputTensor, ftmTensor, convParamSpec, alg, nullptr, biasTensor, + tmpTensor, outputTensor, activationDesc, &archInfo)); + + // naive implement + CHECK_STATUS(convolution(inputTensorRef, filterTensorRef, convParamSpec, alg, nullptr, + biasTensor, tmpTensor, outputTensorRef, activationDesc, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), dt, 1, __FILE__, + __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(convolution(inputTensor, ftmTensor, convParamSpec, alg, nullptr, biasTensor, + tmpTensor, outputTensor, activationDesc, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, + fh, fw, stride, padding, rate, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "DilatedConvolution", params); + double ops = (1.0 * on * oc * oh * ow) * (2.0 * ic * fh * fw + 1); + ut_log(dt, buffer, ops, time); + + free(input); + free(filter); + free(bias); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + dilatedConvolutionTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + dilatedConvolutionTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_eltwise.cpp b/compute/tensor/tests/test_eltwise.cpp new file mode 100644 index 00000000..7452c24a --- /dev/null +++ b/compute/tensor/tests/test_eltwise.cpp @@ -0,0 +1,107 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
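+
+// Usage sketch (argument names are descriptive only): the test expects
+// num in ic ih iw, builds num identical NCHWc8 inputs, applies ELTWISE_MAX,
+// and verifies the optimized kernel against the CPU_GENERAL reference.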
+
+#include <string.h>
+#include <vector>
+
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int eltwiseTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc == 6);
+    U32 num = atoi(argv[1]);
+    U32 in = atoi(argv[2]);
+    U32 ic = atoi(argv[3]);
+    U32 ih = atoi(argv[4]);
+    U32 iw = atoi(argv[5]);
+
+    U32 len = in * ic * ih * iw;
+    EltwiseMode eltwiseMode = ELTWISE_MAX;
+    EltwiseParamSpec eltwiseDesc;
+    eltwiseDesc.elt_mode = eltwiseMode;
+    eltwiseDesc.activation_type = ACTIVATION_NULL;
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+
+    std::vector<void *> input(num);
+    std::vector<Tensor> inTensors(num);
+    std::vector<Tensor *> inTensorPtr(num);
+    TensorDesc inDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw);
+    Tensor outTensor;
+    for (U32 i = 0; i < num; i++) {
+        input[i] = (void *)ut_input_v(len, dt, UT_INIT_RANDOM);
+        inTensors[i].resize(inDesc);
+        inTensors[i].alloc();
+        memcpy(get_ptr_from_tensor(inTensors[i], UT_ARCH), input[i], tensorNumBytes(inDesc));
+        inTensorPtr[i] = &inTensors[i];
+    }
+
+    CHECK_STATUS(eltwise_infer_output_size(inTensorPtr, &outTensor, &archInfo));
+    CHECK_REQUIREMENT(len == outTensor.length());
+    outTensor.alloc();
+    Tensor outTensorRef;
+    outTensorRef.resize(outTensor.get_desc());
+    outTensorRef.alloc();
+
+    U32 tmpBytes;
+    CHECK_STATUS(eltwise_infer_forward_tmp_bytes(inTensors, outTensor, &tmpBytes, &archInfo));
+    Tensor tmpTensor;
+    tmpTensor.resize(tensor1d(DT_U8, tmpBytes));
+    tmpTensor.alloc();
+
+    if (UT_CHECK) {
+        CHECK_STATUS(eltwise(inTensors, eltwiseDesc, tmpTensor, outTensor, &archInfo));
+
+        CHECK_STATUS(eltwise(inTensors, eltwiseDesc, tmpTensor, outTensorRef, &archInfo_org));
+
+        // check
+        ut_check_v(get_ptr_from_tensor(outTensor, UT_ARCH),
+            get_ptr_from_tensor(outTensorRef, UT_ARCH), len, dt, 1, __FILE__, __LINE__);
+    }
+
+    // benchmark
+    double time_start = ut_time_ms();
+    for (int iter = 0; iter < UT_LOOPS; iter++) {
+        CHECK_STATUS(eltwise(inTensors, eltwiseDesc, tmpTensor, outTensor, &archInfo));
+    }
+    double time_end = ut_time_ms();
+    double time = (time_end - time_start) / UT_LOOPS;
+
+    // log performance data
+    char buffer[150];
+    char params[120];
+    sprintf(params, "%u (%u %u %u %u)=(%u %u %u %u)", num, in, ic, ih, iw, in, ic, ih, iw);
+    sprintf(buffer, "%20s, %80s", "Eltwise", params);
+    double ops = 1.0 * num * in * ic * ih * iw;
+    ut_log(dt, buffer, ops, time);
+
+    for (U32 i = 0; i < num; i++) {
+        free(input[i]);
+    }
+
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    eltwiseTest(argc, argv, DT_F16);
+#endif
+#ifdef _USE_FP32
+    eltwiseTest(argc, argv, DT_F32);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_fully_connected.cpp b/compute/tensor/tests/test_fully_connected.cpp
new file mode 100644
index 00000000..f42a1ff8
--- /dev/null
+++ b/compute/tensor/tests/test_fully_connected.cpp
@@ -0,0 +1,119 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#include "ut_util.h" + +int fullyConnectedTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 4); + U32 m = atoi(argv[1]); + U32 k = atoi(argv[2]); + U32 n = atoi(argv[3]); + + DataFormat df = DF_TRANSPOSE; + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + TensorDesc inputDesc = tensor4df(dt, DF_NCHW, m, 1, 1, k); + TensorDesc filterDesc = tensor2df(dt, df, n, k); + TensorDesc biasDesc = tensor1d(dt, n); + + Tensor inputTensor, filterTensor, biasTensor; + inputTensor.resize(inputDesc); + inputTensor.alloc(); + U8 *input = ut_input_v(m * k, dt, UT_INIT_RANDOM); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc)); + + filterTensor.resize(filterDesc); + filterTensor.alloc(); + U8 *filter = ut_input_v(k * n, dt, UT_INIT_RANDOM); + memcpy(get_ptr_from_tensor(filterTensor, UT_ARCH), filter, tensorNumBytes(filterDesc)); + + biasTensor.resize(biasDesc); + biasTensor.alloc(); + U8 *bias = ut_input_v(n, dt, UT_INIT_RANDOM); + memcpy(get_ptr_from_tensor(biasTensor, UT_ARCH), bias, tensorNumBytes(biasDesc)); + // set output + Tensor outputTensor, outputTensorRef; + CHECK_STATUS( + fully_connected_infer_output_size(&inputTensor, filterTensor, &outputTensor, &archInfo)); + outputTensor.alloc(); + TensorDesc outputDesc_ref = outputTensor.get_desc(); + outputTensorRef.resize(outputDesc_ref); + outputTensorRef.alloc(); + // setup tmp + Tensor tmpTensor; + U32 tmpBytes; + CHECK_STATUS( + fully_connected_infer_forward_tmp_bytes(inputTensor, filterTensor, &tmpBytes, &archInfo)); + tmpTensor.resize(tensor1d(DT_U8, tmpBytes)); + tmpTensor.alloc(); + // setup filter trans + U32 ftmBytes; + CHECK_STATUS(fully_connected_transform_filter_bytes(filterTensor, &ftmBytes, &archInfo)); + // trans filter + Tensor ftmTensor; + ftmTensor.resize(tensor1d(DT_U8, ftmBytes)); + ftmTensor.alloc(); + CHECK_STATUS(fully_connected_transform_filter(inputTensor, filterTensor, &ftmTensor, &archInfo)); + + //U32 bytes = 0; + if (UT_CHECK) { + CHECK_STATUS( + fully_connected(inputTensor, ftmTensor, biasTensor, tmpTensor, outputTensor, &archInfo)); + + // naive implement + CHECK_STATUS(fully_connected( + inputTensor, ftmTensor, biasTensor, tmpTensor, outputTensorRef, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), m * n, dt, 1, __FILE__, __LINE__); + } + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS( + fully_connected(inputTensor, ftmTensor, biasTensor, tmpTensor, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u)+(%u %u)=(%u %u)", m, k, k, n, m, n); + sprintf(buffer, "%20s, %80s", "InnerProduct", params); + double ops = 2.0 * m * n * k + 1.0 * m * n; + ut_log(dt, buffer, ops, time); + + free(input); + 
free(filter); + free(bias); + + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + fullyConnectedTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + fullyConnectedTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_fully_connected_ocl.cpp b/compute/tensor/tests/test_fully_connected_ocl.cpp new file mode 100644 index 00000000..5fc43c4b --- /dev/null +++ b/compute/tensor/tests/test_fully_connected_ocl.cpp @@ -0,0 +1,270 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "tensor_computing.h" +#include "ut_util.h" +#include "gcl.h" +#include "libkernelsource.h" + +#ifdef _USE_FP16 +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} + +inline GCLMem_t alloc_desc(Tensor tensor, GCLMemDesc desc) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->padding(desc); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} +int fullyConnectedTest(int argc, char *argv[], DataType dt) +{ + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + U32 biasNum; + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + in = 1; + ic = 4; + ih = 4; + iw = 4; + fn = 4; + + if (argc == 5) { + ic = atoi(argv[1]); + ih = atoi(argv[2]); + iw = atoi(argv[3]); + fn = atoi(argv[4]); + } + fc = ic; + fh = ih; + fw = iw; + + on = 1; + oc = fn; + oh = 1; + ow = 1; + + TensorDesc inputDesc, filterDesc, outputDesc, biasDesc; + TensorDesc filterDesc_cpu, outputDesc_cpu; + + inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + filterDesc = tensor4df(dt, DF_NCHW, fn, fc, fh, fw); + filterDesc_cpu = tensor2df(dt, DF_NORMAL, fn, fc * fh * fw); + outputDesc_cpu = tensor2df(dt, DF_NORMAL, 1, fn); + biasDesc = tensor1d(dt, oc); + + U8 *input_cpu = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + U8 *filter_cpu = ut_input_v(fn * fc * fh * fw, dt, UT_INIT_RANDOM); + U8 *bias_cpu = ut_input_v(oc, dt, UT_INIT_RANDOM); + U8 *output_gpu 
= NULL;
+
+    std::shared_ptr<GCLHandle> handleSharedPtr = OCLContext::getInstance().handle;
+    GCLHandle_t handle = handleSharedPtr.get();
+    std::vector<GCLKernelInfo> kernelVec;
+    handle->kernelVec = &kernelVec;
+    Tensor inputTensor = Tensor(OCLMem);
+    Tensor outputTensor = Tensor(OCLMem);
+    Tensor filterTensorOrg = Tensor(OCLMem);
+    Tensor filterTensor = Tensor(OCLMem);
+    Tensor biasTensor = Tensor(OCLMem);
+    Tensor tmpTensor = Tensor(OCLMem);
+    inputTensor.resize(inputDesc);
+    filterTensor.resize(filterDesc);
+    filterTensorOrg.resize(filterDesc);
+    biasTensor.resize(biasDesc);
+
+    MaliPara maliPara;
+    ForwardRunInfoMali runInfo;
+    runInfo.algorithm = (I32)(CONVOLUTION_ALGORITHM_NULL);
+    runInfo.best_w[0] = 1;
+    runInfo.best_c[0] = 1;
+    runInfo.best_k[0] = 1;
+    maliPara.handle = handle;
+    maliPara.gclmemInputDesc = NULL;
+    maliPara.gclmemOutputDesc = NULL;
+    maliPara.gclmemFilterDesc = NULL;
+    maliPara.forwardRunInfo = &runInfo;
+    archInfo.archPara = &maliPara;
+
+    CHECK_STATUS(
+        fully_connected_infer_output_size(&inputTensor, filterTensor, &outputTensor, &archInfo));
+    CHECK_STATUS(
+        fully_connected_infer_forward_algorithm(inputTensor, filterTensor, outputTensor, &archInfo));
+
+    U32 maxBytes = 0;
+    U32 tmpBytes;
+    CHECK_STATUS(
+        fully_connected_infer_forward_tmp_bytes(inputTensor, filterTensor, &tmpBytes, &archInfo));
+    maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes;
+
+    U32 ftmBytes;
+    U32 str[3] = {0, 0, 0};
+    U32 off[3] = {0, 0, 0};
+    GCLMemDesc filterMemDesc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4);
+    maliPara.gclmemFilterDesc = &filterMemDesc;
+    CHECK_STATUS(fully_connected_transform_filter_bytes(filterTensor, &ftmBytes, &archInfo));
+
+    GCLMem_t output = alloc_map(outputTensor);
+    GCLMem_t input = alloc(inputTensor);
+    CHECK_STATUS(gcl_fill_memory_zero(handle, input));
+
+    GCLMemDesc desc = gclmem_build_desc();
+    biasNum = oc;
+    desc.memType = GCL_MEM_BUF;
+    desc.byteSize = biasNum * bytesOf(dt);
+    desc.stride[0] = biasNum;
+    desc.stride[1] = 1;
+    desc.stride[2] = 1;
+    desc.offset[0] = 0;
+    desc.offset[1] = 0;
+    desc.offset[2] = 0;
+    desc.num = biasNum;
+    desc.memFormat = DF_NHWC;
+    desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
+    desc.host_ptr = bias_cpu;
+    GCLMem_t bias = alloc_desc(biasTensor, desc);
+
+    desc = filterMemDesc;
+    GCLMem_t filter = alloc_desc(filterTensor, desc);
+
+    desc.stride[0] = fw * fh * fc;
+    desc.stride[1] = fn;
+    desc.stride[2] = 1;
+    desc.offset[0] = 0;
+    desc.offset[1] = 0;
+    desc.offset[2] = 0;
+    desc.byteSize = fw * fh * fc * fn * bytesOf(dt);
+    desc.num = fw * fh * fc * fn;
+    desc.memType = GCL_MEM_BUF;
+    desc.memFormat = DF_NCHW;
+    desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
+    desc.host_ptr = filter_cpu;
+    alloc_desc(filterTensorOrg, desc);
+
+    tmpBytes = tensorNumBytes(inputDesc);
+    maxBytes = (tmpBytes > maxBytes) ?
tmpBytes : maxBytes;
+    GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes);
+    TensorDesc filterDescTran;
+    std::vector<GCLMem_t> filterArray;
+    std::vector<GCLMem_t> outputArray;
+    std::vector<GCLMem_t> biasArray;
+    filterArray.push_back(filter);
+    outputArray.push_back(output);
+    biasArray.push_back(bias);
+
+    CHECK_STATUS(
+        fully_connected_transform_filter(inputTensor, filterTensorOrg, &filterTensor, &archInfo));
+
+    CHECK_STATUS(ocl_set_input(handle, input, inputDesc, input_cpu, tmpbuf, true));
+
+    CHECK_STATUS(
+        fully_connected(inputTensor, filterTensor, biasTensor, tmpTensor, outputTensor, &archInfo));
+    /*warm up*/
+    UNI_INFO_LOG("Warm up gpu:\n")
+    for (U32 i = 0; i < 2; i++) {
+        CHECK_STATUS(gcl_run_kernelVec(handle));
+    }
+
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size()));
+    double time = handle->t_execute * 0.001;
+#else
+    CHECK_STATUS(gcl_run_kernelVec(handle));
+#endif
+    outputDesc = outputTensor.get_desc();
+    CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true));
+    output_gpu = output->mapPtrArray.back();
+
+    char buffer[150];
+    char params[120];
+    sprintf(params, "(%u %u %u %u)+(%u %u %u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh, fw, on,
+        oc, oh, ow);
+    sprintf(buffer, "%20s, %80s", "InnerProduct", params);
+#ifdef _DEBUG
+    double ops = 2.0 * fn * fc * fh * fw + 1.0 * fn;
+    ut_log(dt, buffer, ops, time);
+#endif
+    Tensor inputTensorCpu;
+    inputTensorCpu.resize(inputDesc);
+    inputTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc));
+
+    Tensor filterTensorCpu;
+    filterTensorCpu.resize(filterDesc_cpu);
+    filterTensorCpu.alloc();
+    memcpy(
+        get_ptr_from_tensor(filterTensorCpu, UT_ARCH), filter_cpu, tensorNumBytes(filterDesc_cpu));
+
+    Tensor biasTensorCpu;
+    biasTensorCpu.resize(biasDesc);
+    biasTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(biasTensorCpu, UT_ARCH), bias_cpu, tensorNumBytes(biasDesc));
+
+    Tensor outputTensorCpu;
+    outputTensorCpu.resize(outputDesc_cpu);
+    outputTensorCpu.alloc();
+
+    Tensor tmpTensorCpu;
+    CHECK_STATUS(fully_connected_infer_forward_tmp_bytes(
+        inputTensorCpu, filterTensorCpu, &tmpBytes, &archInfo_org));
+    tmpTensorCpu.resize(tensor1d(DT_F16, tmpBytes / bytesOf(DT_F16)));
+    tmpTensorCpu.alloc();
+
+    CHECK_STATUS(fully_connected(inputTensorCpu, filterTensorCpu, biasTensorCpu, tmpTensorCpu,
+        outputTensorCpu, &archInfo_org));
+    ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), on * oc * ow * oh, dt);
+
+    free(input_cpu);
+    free(filter_cpu);
+    free(bias_cpu);
+    return 0;
+}
+#endif
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    fullyConnectedTest(argc, argv, DT_F16);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_l2normalization.cpp b/compute/tensor/tests/test_l2normalization.cpp
new file mode 100644
index 00000000..1a987616
--- /dev/null
+++ b/compute/tensor/tests/test_l2normalization.cpp
@@ -0,0 +1,101 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#include "ut_util.h" + +int l2normalizationTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 4); + U32 ic = atoi(argv[1]); + U32 ih = atoi(argv[2]); + U32 iw = atoi(argv[3]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + DataFormat df = DF_MTK; + TensorDesc inputDesc = tensor3df(dt, df, ic, ih, iw); + U32 input_len = tensorNumElements(inputDesc); + U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM); + + Tensor inputTensor; + inputTensor.resize(inputDesc); + inputTensor.alloc(); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc)); + + // set output + Tensor outputTensor, outputTensorRef; + CHECK_STATUS(l2normalization_infer_output_size(&inputTensor, &outputTensor, &archInfo)); + outputTensor.alloc(); + TensorDesc outputDesc_ref = outputTensor.get_desc(); + outputTensorRef.resize(outputDesc_ref); + outputTensorRef.alloc(); + + U32 output_len = outputTensor.length(); + CHECK_REQUIREMENT(input_len == ic * ih * iw && output_len == ic * ih * iw); + + if (UT_CHECK) { + CHECK_STATUS(l2normalization(inputTensor, outputTensor, &archInfo)); + + // naive implement + CHECK_STATUS(l2normalization(inputTensor, outputTensorRef, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.05, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(l2normalization(inputTensor, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + //general benchmark + time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(l2normalization(inputTensor, outputTensorRef, &archInfo_org)); + } + time_end = ut_time_ms(); + double general_implement_time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char general_buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u)=(%u %u %u)", ic, ih, iw, ic, ih, iw); + sprintf(buffer, "%20s, %80s", "L2Normalization", params); + sprintf(general_buffer, "%20s, %80s", "General L2Normalization", params); + double ops = input_len; + ut_log(dt, buffer, ops, time); + ut_log(dt, general_buffer, ops, general_implement_time); + + free(input); + + return 0; +} + +int main(int argc, char 
**argv) +{ +#ifdef _USE_FP16 + l2normalizationTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + l2normalizationTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_multihead_attention_ocl.cpp b/compute/tensor/tests/test_multihead_attention_ocl.cpp new file mode 100644 index 00000000..eac6128e --- /dev/null +++ b/compute/tensor/tests/test_multihead_attention_ocl.cpp @@ -0,0 +1,393 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "tensor_computing.h" +#include "ut_util.h" +#include "gcl.h" +#include "libkernelsource.h" + +#ifdef _USE_FP16 +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} + +inline GCLMem_t alloc_desc(Tensor tensor, GCLMemDesc desc) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->padding(desc); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} +int multiheadAttentionTest(int argc, char *argv[], DataType dt) +{ + U32 in, ic, ih, iw; + U32 fn[4]; + U32 fc[4]; + U32 on, oc, oh, ow; + U32 firstFCSliceNum[3]; + U32 matmulSliceLen; + float multiplyAlpha; + float multiplyBeta; + std::vector<bool> eltwiseWithLayerNormIn; + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + in = 1; + ic = 312; + ih = 9; + iw = 1; + + fn[0] = 936; + fc[0] = 312; + + fn[1] = 312; + fc[1] = 312; + + fn[2] = 1200; + fc[2] = 312; + + fn[3] = 312; + fc[3] = 1200; + + firstFCSliceNum[0] = 312; + firstFCSliceNum[1] = 312; + firstFCSliceNum[2] = 312; + + matmulSliceLen = 26; + multiplyAlpha = 0.196116134524; + multiplyBeta = 0; + U32 filterNum = 4; + U32 lnNum = 2; + for (U32 i = 0; i < lnNum; ++i) { + eltwiseWithLayerNormIn.push_back(false); + } + + if (argc == 20) { + in = atoi(argv[1]); + ic = atoi(argv[2]); + ih = atoi(argv[3]); + iw = atoi(argv[4]); + fn[0] = atoi(argv[5]); + fc[0] = atoi(argv[6]); + fn[1] = atoi(argv[7]); + fc[1] = atoi(argv[8]); + fn[2] = atoi(argv[9]); + fc[2] = atoi(argv[10]); + fn[3] = atoi(argv[11]);
+ fc[3] = atoi(argv[12]); + firstFCSliceNum[0] = atoi(argv[13]); + firstFCSliceNum[1] = atoi(argv[14]); + firstFCSliceNum[2] = atoi(argv[15]); + matmulSliceLen = atoi(argv[16]); + multiplyAlpha = atof(argv[17]); + multiplyBeta = atof(argv[18]); + eltwiseWithLayerNormIn[0] = atoi(argv[19]); + eltwiseWithLayerNormIn[1] = atoi(argv[19]); + } + on = 1; + oc = fn[3]; + oh = ih; + ow = 1; + + TensorDesc inputDesc, outputDesc; + std::vector<TensorDesc> filterDesc; + std::vector<TensorDesc> biasDesc; + std::vector<TensorDesc> lnAlphaDesc; + std::vector<TensorDesc> lnBetaDesc; + + inputDesc = tensor3df(dt, DF_MKT, in, ic, ih); + for (U32 i = 0; i < filterNum; ++i) { + TensorDesc tmpFilterDesc = tensor4df(dt, DF_NCHW, fn[i], fc[i], 1, 1); + TensorDesc tmpBiasDesc = tensor1d(dt, fn[i] + 8); + filterDesc.push_back(tmpFilterDesc); + biasDesc.push_back(tmpBiasDesc); + } + + for (U32 i = 0; i < lnNum; ++i) { + TensorDesc tmpDesc = tensor1d(dt, (ic + 3) / 4 * 4); + if (i == 1) { + tmpDesc = tensor1d(dt, (fn[1] + 3) / 4 * 4); + } + lnAlphaDesc.push_back(tmpDesc); + lnBetaDesc.push_back(tmpDesc); + } + + std::vector<U8 *> filter_cpu; + std::vector<U8 *> bias_cpu; + std::vector<U8 *> lnAlpha_cpu; + std::vector<U8 *> lnBeta_cpu; + + U8 *input_cpu = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + + for (U32 i = 0; i < filterNum; i++) { + U8 *fltval = ut_input_v(tensorNumElements(filterDesc[i]), dt, UT_INIT_RANDOM); + U8 *biasval = ut_input_v(tensorNumElements(biasDesc[i]), dt, UT_INIT_RANDOM); + filter_cpu.push_back(fltval); + bias_cpu.push_back(biasval); + } + + for (U32 i = 0; i < lnNum; i++) { + U8 *alphaVal = ut_input_v(tensorNumElements(lnAlphaDesc[i]), dt, UT_INIT_RANDOM); + U8 *betaVal = ut_input_v(tensorNumElements(lnBetaDesc[i]), dt, UT_INIT_RANDOM); + lnAlpha_cpu.push_back(alphaVal); + lnBeta_cpu.push_back(betaVal); + } + + U8 *output_gpu = NULL; + + std::shared_ptr<GCLHandle> handleSharedPtr = OCLContext::getInstance().handle; + GCLHandle_t handle = handleSharedPtr.get(); + std::vector<GCLKernelInfo> kernelVec; + handle->kernelVec = &kernelVec; + Tensor inputTensor = Tensor(OCLMem); + inputTensor.resize(inputDesc); + + std::vector<Tensor> filterTensorOrg; + std::vector<Tensor> filterTensor; + std::vector<Tensor> biasTensor; + for (U32 i = 0; i < filterNum; i++) { + Tensor tensor = Tensor(OCLMem); + tensor.resize(filterDesc[i]); + filterTensor.push_back(tensor); + filterTensorOrg.push_back(tensor); + tensor.resize(biasDesc[i]); + biasTensor.push_back(tensor); + } + + std::vector<Tensor> lnAlphaTensor; + std::vector<Tensor> lnBetaTensor; + for (U32 i = 0; i < lnNum; i++) { + Tensor tensor = Tensor(OCLMem); + tensor.resize(lnAlphaDesc[i]); + lnAlphaTensor.push_back(tensor); + tensor.resize(lnBetaDesc[i]); + lnBetaTensor.push_back(tensor); + } + Tensor tmpTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + + MaliPara maliPara; + ForwardRunInfoMali runInfo; + runInfo.algorithm = (I32)(CONVOLUTION_ALGORITHM_NULL); + for (U32 i = 0; i < 6; ++i) { + runInfo.best_w[i] = 1; + runInfo.best_c[i] = 1; + runInfo.best_k[i] = 1; + } + maliPara.handle = handle; + maliPara.forwardRunInfo = &runInfo; + archInfo.archPara = &maliPara; + ActivationMode activation = ACTIVATION_GELU; + CHECK_STATUS(multihead_attention_infer_output_size( + &inputTensor, filterTensor, &outputTensor, firstFCSliceNum, &archInfo)); + + CHECK_STATUS(multihead_attention_infer_forward_algorithm(inputTensor, filterTensor, + &multiplyAlpha, &multiplyBeta, firstFCSliceNum, matmulSliceLen, eltwiseWithLayerNormIn, + activation, outputTensor, &archInfo)); + U32 maxBytes = 0; + U32 tmpBytes; + CHECK_STATUS(multihead_attention_infer_forward_tmp_bytes(inputTensor,
filterTensor, + eltwiseWithLayerNormIn, firstFCSliceNum, matmulSliceLen, &tmpBytes, &archInfo)); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + + U32 ftmBytes = 0; + GCLMemDesc filterMemDesc[4]; + U32 stride[3] = {0, 0, 0}; + U32 offset[3] = {0, 0, 0}; + for (U32 i = 0; i < filterNum; i++) { + filterMemDesc[i] = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + } + maliPara.gclmemFilterDesc = filterMemDesc; + CHECK_STATUS(multihead_attention_transform_filter_bytes(filterTensor, &ftmBytes, &archInfo)); + + GCLMem_t output = alloc_map(outputTensor); + + for (U32 i = 0; i < 2; ++i) { + U32 biasNum = fn[i] + 8; + GCLMemDesc tmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + tmpDesc.stride[0] = biasNum; + tmpDesc.stride[1] = 1; + tmpDesc.stride[2] = 1; + tmpDesc.offset[0] = 0; + tmpDesc.offset[1] = 0; + tmpDesc.offset[2] = 0; + tmpDesc.num = biasNum; + tmpDesc.byteSize = biasNum * bytesOf(dt); + tmpDesc.memFormat = DF_NHWC; + tmpDesc.memType = GCL_MEM_BUF; + tmpDesc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + tmpDesc.host_ptr = bias_cpu[i]; + alloc_desc(biasTensor[i], tmpDesc); + } + for (U32 i = 2; i < filterNum; ++i) { + U32 biasNum = (fn[i] + 3) / 4; + GCLMemDesc tmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + tmpDesc.stride[0] = biasNum; + tmpDesc.stride[1] = 1; + tmpDesc.stride[2] = 1; + tmpDesc.offset[0] = 0; + tmpDesc.offset[1] = 0; + tmpDesc.offset[2] = 0; + tmpDesc.num = biasNum; + tmpDesc.byteSize = biasNum * bytesOf(dt) * 4; + tmpDesc.memFormat = DF_NHWC; + tmpDesc.memType = GCL_MEM_IMG_1D; + tmpDesc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + tmpDesc.host_ptr = bias_cpu[i]; + alloc_desc(biasTensor[i], tmpDesc); + } + + for (U32 i = 0; i < lnNum; ++i) { + U32 layerNormNum = (ic + 3) / 4 * 4; + if (i == 1) { + layerNormNum = (fn[1] + 3) / 4 * 4; + } + GCLMemDesc tmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + tmpDesc.stride[0] = layerNormNum; + tmpDesc.stride[1] = 1; + tmpDesc.stride[2] = 1; + tmpDesc.offset[0] = 0; + tmpDesc.offset[1] = 0; + tmpDesc.offset[2] = 0; + tmpDesc.num = layerNormNum; + tmpDesc.byteSize = layerNormNum * bytesOf(dt); + tmpDesc.memFormat = DF_NHWC; + tmpDesc.memType = GCL_MEM_BUF; + tmpDesc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + tmpDesc.host_ptr = lnAlpha_cpu[i]; + alloc_desc(lnAlphaTensor[i], tmpDesc); + + tmpDesc.stride[0] = layerNormNum; + tmpDesc.stride[1] = 1; + tmpDesc.stride[2] = 1; + tmpDesc.offset[0] = 0; + tmpDesc.offset[1] = 0; + tmpDesc.offset[2] = 0; + tmpDesc.num = layerNormNum; + tmpDesc.byteSize = layerNormNum * bytesOf(dt); + tmpDesc.memFormat = DF_NHWC; + tmpDesc.memType = GCL_MEM_BUF; + tmpDesc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + tmpDesc.host_ptr = lnBeta_cpu[i]; + alloc_desc(lnBetaTensor[i], tmpDesc); + } + for (U32 i = 0; i < filterNum; ++i) { + GCLMemDesc desc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + desc.stride[0] = fc[i]; + desc.stride[1] = fn[i]; + desc.stride[2] = 1; + desc.offset[0] = 0; + desc.offset[1] = 0; + desc.offset[2] = 0; + desc.byteSize = fc[i] * fn[i] * bytesOf(dt); + desc.num = fc[i] * fn[i]; + desc.memType = GCL_MEM_BUF; + desc.memFormat = DF_NCHW; + desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + desc.host_ptr = filter_cpu[i]; + alloc_desc(filterTensorOrg[i], desc); + } + + for (U32 i = 0; i < filterNum; ++i) { + GCLMemDesc desc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + desc = filterMemDesc[i]; + alloc_desc(filterTensor[i], desc); + } + + auto inputMem = (OclMemory *)inputTensor.get_memory(); + 
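// Note (illustrative comment, not in the original sources): combining + // CL_MEM_COPY_HOST_PTR with a non-null host_ptr asks the OpenCL runtime to copy + // the given CPU buffer into the device allocation when it is created; every + // bias, layer-norm and filter tensor above was seeded this way, and the input + // below reuses the same path, so no separate upload call is needed. +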
GCLMemDesc inputGclDesc = inputMem->get_desc(); + inputGclDesc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + inputGclDesc.host_ptr = input_cpu; + alloc_desc(inputTensor, inputGclDesc); + + tmpBytes = tensorNumBytes(inputDesc); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + alloc_bytes(tmpTensor, maxBytes); + + std::vector<Tensor *> filterTensorPtr; + for (U32 i = 0; i < filterNum; i++) { + filterTensorPtr.push_back(&filterTensor[i]); + } + CHECK_STATUS(multihead_attention_transform_filter(filterTensorOrg, filterTensorPtr, &archInfo)); + CHECK_STATUS(multihead_attention(inputTensor, filterTensor, biasTensor, lnAlphaTensor, + lnBetaTensor, &multiplyAlpha, &multiplyBeta, firstFCSliceNum, matmulSliceLen, + eltwiseWithLayerNormIn, activation, tmpTensor, outputTensor, &archInfo)); + /*warm up*/ + UNI_INFO_LOG("Warm up gpu:\n") + for (U32 i = 0; i < 2; i++) { + CHECK_STATUS(gcl_run_kernelVec(handle)); + } + UNI_INFO_LOG("Run:\n") +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size())); +// double time = handle->t_execute * 0.001; +#else + CHECK_STATUS(gcl_run_kernelVec(handle)); +#endif + outputDesc = outputTensor.get_desc(); + CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true)); + output_gpu = output->mapPtrArray.back(); + + CHECK_STATUS(gcl_finish(handle)); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + free(input_cpu); + for (auto p : filter_cpu) { + free((U8 *)p); + } + for (auto p : bias_cpu) { + free((U8 *)p); + } + for (auto p : lnAlpha_cpu) { + free((U8 *)p); + } + for (auto p : lnBeta_cpu) { + free((U8 *)p); + } + return 0; +} +#endif + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + multiheadAttentionTest(argc, argv, DT_F16); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_non_max_suppression.cpp b/compute/tensor/tests/test_non_max_suppression.cpp new file mode 100644 index 00000000..b31e66d3 --- /dev/null +++ b/compute/tensor/tests/test_non_max_suppression.cpp @@ -0,0 +1,139 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
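+ +// A minimal sketch (not part of the original test) of the intersection-over-union +// measure that non_max_suppression thresholds against; it assumes the corner box +// encoding {y1, x1, y2, x2} used by the example data in the comment further down. +static inline float ut_box_iou(const float *a, const float *b) +{ + // corners of the overlap window + float y1 = (a[0] > b[0]) ? a[0] : b[0]; + float x1 = (a[1] > b[1]) ? a[1] : b[1]; + float y2 = (a[2] < b[2]) ? a[2] : b[2]; + float x2 = (a[3] < b[3]) ? a[3] : b[3]; + float inter = ((y2 > y1) ? (y2 - y1) : 0) * ((x2 > x1) ? (x2 - x1) : 0); + float areaA = (a[2] - a[0]) * (a[3] - a[1]); + float areaB = (b[2] - b[0]) * (b[3] - b[1]); + float uni = areaA + areaB - inter; + return (uni > 0) ? inter / uni : 0; +}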
+ +#include "tensor_computing.h" +#include "ut_util.h" + +int nonmaxsuppressionTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 12); + // in0 boxes + U32 in0 = atoi(argv[1]); + U32 ic0 = atoi(argv[2]); + U32 ilens0 = atoi(argv[3]); + // in1 scores + U32 in1 = atoi(argv[4]); + U32 ic1 = atoi(argv[5]); + U32 ilens1 = atoi(argv[6]); + // output + U32 oh = atoi(argv[7]); + U32 ow = atoi(argv[8]); + // nonMaxSuppressionParamSpec + U32 max_output_boxes_per_class = atoi(argv[9]); + F32 iou_threshold = (F32)atof(argv[10]); + F32 score_threshold = (F32)atof(argv[11]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + NonMaxSuppressionParamSpec nonMaxSuppressionParamSpec; + nonMaxSuppressionParamSpec.max_output_boxes_per_class = max_output_boxes_per_class; + nonMaxSuppressionParamSpec.iou_threshold = iou_threshold; + nonMaxSuppressionParamSpec.score_threshold = score_threshold; + + std::vector<Tensor> inputTensors(2); + TensorDesc input_desc_boxes = tensor3d(dt, in0, ic0, ilens0); + TensorDesc input_desc_scores = tensor3d(dt, in1, ic1, ilens1); + inputTensors[0] = Tensor::alloc_sized<CPUMem>(input_desc_boxes); + inputTensors[1] = Tensor::alloc_sized<CPUMem>(input_desc_scores); + U32 input_len_boxes = tensorNumElements(input_desc_boxes); + U8 *input_boxes = ut_input_v(input_len_boxes, dt, UT_INIT_RANDOM); + memcpy(get_ptr_from_tensor(inputTensors[0], UT_ARCH), input_boxes, + tensorNumBytes(input_desc_boxes)); + U32 input_len_scores = tensorNumElements(input_desc_scores); + U8 *input_scores = ut_input_v(input_len_scores, dt, UT_INIT_RANDOM); + memcpy(get_ptr_from_tensor(inputTensors[1], UT_ARCH), input_scores, + tensorNumBytes(input_desc_scores)); + std::vector<Tensor *> inputTensorsPtr(2); + inputTensorsPtr[0] = &inputTensors[0]; + inputTensorsPtr[1] = &inputTensors[1]; + //set output + Tensor outputTensor; + CHECK_STATUS(non_max_suppression_infer_output_size( + inputTensorsPtr, nonMaxSuppressionParamSpec, &outputTensor, &archInfo)); + outputTensor.alloc(); + Tensor outputTensorRef = Tensor::alloc_sized<CPUMem>(outputTensor.get_desc()); + U32 output_len = outputTensor.length(); + CHECK_REQUIREMENT(input_len_boxes == in0 * ic0 * ilens0 && + input_len_scores == in1 * ic1 * ilens1 && output_len == oh * ow); + /* + You can also modify the code and use the data in the following example.
+ Command: ./test_non_max_suppression 1 6 4 1 2 6 7 3 3 0.5 0 + example: + input_box[24] = { 0.0, 0.0, 1.0, 1.0, + 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, + 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, + 0.0, 100.0, 1.0, 101.0 }; + input_score[12] = { 0.75, 0.9, 0.6, 0.95, 0.5, 0.3, 0.75, 0.9, 0.6, 0.95, 0.5, 0.3 }; + output_expect: + { 6, 0, 0, + 0, 0, 3, + 0, 0, 1, + 0, 0, 5, + 0, 1, 3, + 0, 1, 1, + 0, 1, 5 }; + */ + if (UT_CHECK) { + CHECK_STATUS( + non_max_suppression(inputTensors, nonMaxSuppressionParamSpec, outputTensor, &archInfo)); + CHECK_STATUS(non_max_suppression( + inputTensors, nonMaxSuppressionParamSpec, outputTensorRef, &archInfo_org)); + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.05, __FILE__, __LINE__); + } + + U32 num_detected_max = max_output_boxes_per_class * ic1; + if (dt == DT_F32) { + F32 *output_f32 = (F32 *)get_ptr_from_tensor(outputTensor, UT_ARCH); + int idx = 0; + for (U32 i = 0; i < 1 + num_detected_max; i++) { + std::cout << " 1 : " << output_f32[idx] << " 2 : " << output_f32[idx + 1] + << " 3 : " << output_f32[idx + 2] << std::endl; + idx = idx + 3; + } + } +#ifdef _USE_FP16 + if (dt == DT_F16) { + F16 *output_f16 = (F16 *)get_ptr_from_tensor(outputTensorRef, UT_ARCH); + int idx = 0; + for (U32 i = 0; i < 1 + num_detected_max; i++) { + std::cout << " 1 : " << output_f16[idx] << " 2 : " << output_f16[idx + 1] + << " 3 : " << output_f16[idx + 2] << std::endl; + idx = idx + 3; + } + } +#endif + free(input_boxes); + free(input_scores); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + std::cout << "----- Testing FP16 Nonmaxsuppression -----" << std::endl; + nonmaxsuppressionTest(argc, argv, DT_F16); + std::cout << "----- Finished FP16 Nonmaxsuppression -----" << std::endl; +#endif +#ifdef _USE_FP32 + std::cout << "----- Testing FP32 Nonmaxsuppression -----" << std::endl; + nonmaxsuppressionTest(argc, argv, DT_F32); + std::cout << "----- Finished FP32 Nonmaxsuppression -----" << std::endl; +#endif + return 0; +} diff --git a/compute/tensor/tests/test_normalization.cpp b/compute/tensor/tests/test_normalization.cpp new file mode 100644 index 00000000..63457173 --- /dev/null +++ b/compute/tensor/tests/test_normalization.cpp @@ -0,0 +1,111 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
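+ +#include <math.h> + +// For reference (a sketch, not from the original sources): along each normalized +// vector the layer_normalization call below is expected to compute +// y[i] = alpha * (x[i] - mean) / sqrt(var + eps) + beta; +// the eps value here is an assumption for illustration only. +static inline void ut_layer_norm_ref(const float *x, float alpha, float beta, float *y, int n) +{ + float mean = 0, var = 0; + for (int i = 0; i < n; i++) { + mean += x[i]; + } + mean /= n; + for (int i = 0; i < n; i++) { + var += (x[i] - mean) * (x[i] - mean); + } + var /= n; + float inv = 1.0f / sqrtf(var + 1e-6f); + for (int i = 0; i < n; i++) { + y[i] = alpha * (x[i] - mean) * inv + beta; + } +}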
+ +#include "tensor_computing.h" +#include "ut_util.h" + +int normalizationTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 6); + U32 alpha = atoi(argv[1]); + U32 beta = atoi(argv[2]); + U32 ic = atoi(argv[3]); + U32 ih = atoi(argv[4]); + U32 iw = atoi(argv[5]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + DataFormat df = DF_MTK; + Tensor inputTensor; + TensorDesc inputDesc = tensor3df(dt, df, ic, ih, iw); + inputTensor.resize(inputDesc); + inputTensor.alloc(); + U32 input_len = tensorNumElements(inputDesc); + U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc)); + + // set output + Tensor outputTensor, outputTensorRef; + CHECK_STATUS(normalization_infer_output_size(&inputTensor, &outputTensor, &archInfo)); + outputTensor.alloc(); + TensorDesc outputDesc_ref = outputTensor.get_desc(); + outputTensorRef.resize(outputDesc_ref); + outputTensorRef.alloc(); + U32 output_len = outputTensor.length(); + CHECK_REQUIREMENT(input_len == ic * ih * iw && output_len == ic * ih * iw); + + U32 alpha_list[ic]; + U32 beta_list[ic]; + for (int i = 0; i < (int)ic; i++) { + alpha_list[i] = alpha; + beta_list[i] = beta; + } + Tensor alphaTensor, betaTensor; + TensorDesc alphaDesc, betaDesc; + alphaDesc = tensor1d(dt, ic); + betaDesc = tensor1d(dt, ic); + alphaTensor.resize(alphaDesc); + betaTensor.resize(betaDesc); + alphaTensor.alloc(); + betaTensor.alloc(); + memcpy(get_ptr_from_tensor(alphaTensor, UT_ARCH), alpha_list, tensorNumBytes(alphaDesc)); + memcpy(get_ptr_from_tensor(betaTensor, UT_ARCH), beta_list, tensorNumBytes(betaDesc)); + + if (UT_CHECK) { + CHECK_STATUS( + layer_normalization(inputTensor, alphaTensor, betaTensor, outputTensor, &archInfo)); + + // naive implement + CHECK_STATUS(layer_normalization( + inputTensor, alphaTensor, betaTensor, outputTensorRef, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.000001, __FILE__, + __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS( + layer_normalization(inputTensor, alphaTensor, betaTensor, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u)=(%u %u %u)", ic, ih, iw, ic, ih, iw); + sprintf(buffer, "%20s, %80s", "Normalization", params); + double ops = input_len; + ut_log(dt, buffer, ops, time); + + free(input); + + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + normalizationTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + normalizationTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_padding.cpp b/compute/tensor/tests/test_padding.cpp new file mode 100644 index 00000000..8d2540b7 --- /dev/null +++ b/compute/tensor/tests/test_padding.cpp @@ -0,0 +1,115 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#include "ut_util.h" + +int paddingTest(int argc, char **argv, DataType dt) +{ + // input dim + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + + // padding info + U32 n_fir = atoi(argv[5]); + U32 c_fir = atoi(argv[6]); + U32 h_fir = atoi(argv[7]); + U32 w_fir = atoi(argv[8]); + U32 n_sec = atoi(argv[9]); + U32 c_sec = atoi(argv[10]); + U32 h_sec = atoi(argv[11]); + U32 w_sec = atoi(argv[12]); + U32 mode = atoi(argv[13]); + CHECK_REQUIREMENT(n_fir == 0 && n_sec == 0 && c_fir == 0 && c_sec == 0); + + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + PadParamSpec padParamSpec; + + padParamSpec.top = h_fir; + padParamSpec.bottom = h_sec; + padParamSpec.left = w_fir; + padParamSpec.right = w_sec; + padParamSpec.constant_value = 0.0; + switch (mode) { + case 0: { + padParamSpec.pad_mode = Pad_Constant; + break; + } + case 1: { + padParamSpec.pad_mode = Pad_Edge; + break; + } + case 2: { + // limitation: for reflect padding, h_fir and h_sec should be smaller than ih + padParamSpec.pad_mode = Pad_Reflect; + break; + } + case 3: { + padParamSpec.pad_mode = Pad_Symmetric; + break; + } + default: { + UNI_ERROR_LOG("unknown padding mode %d\n", mode); + break; + } + } + + Tensor inputTensor; + TensorDesc inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + inputTensor.resize(inputDesc); + inputTensor.alloc(); + U32 input_len = tensorNumElements(inputDesc); + U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc)); + + // set output + Tensor outputTensor, outputTensorRef; + CHECK_STATUS( + padding_infer_output_size(&inputTensor, padParamSpec, &outputTensor, &archInfo_org)); + outputTensor.alloc(); + TensorDesc outputDesc_ref = outputTensor.get_desc(); + outputTensorRef.resize(outputDesc_ref); + outputTensorRef.alloc(); + U32 output_len = outputTensor.length(); + + if (UT_CHECK) { + CHECK_STATUS(padding(inputTensor, padParamSpec, outputTensor, &archInfo)); + + CHECK_STATUS(padding(inputTensor, padParamSpec, outputTensorRef, &archInfo_org)); + + // check + ut_check_a(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt); + } + + free(input); + return 0; +} + +int
main(int argc, char **argv) +{ +#ifdef _USE_FP16 + paddingTest(argc, argv, DT_F16); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_padding_ocl.cpp b/compute/tensor/tests/test_padding_ocl.cpp new file mode 100644 index 00000000..9c0efc47 --- /dev/null +++ b/compute/tensor/tests/test_padding_ocl.cpp @@ -0,0 +1,191 @@ + +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#include "ut_util.h" +#include "gcl.h" +#include "libkernelsource.h" + +#ifdef _USE_FP16 +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} +int paddingTest(int argc, char **argv, DataType dt) +{ + // input dim + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + + // padding info + U32 h_fir = atoi(argv[7]); + U32 w_fir = atoi(argv[8]); + U32 h_sec = atoi(argv[11]); + U32 w_sec = atoi(argv[12]); + U32 mode = atoi(argv[13]); + + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + PadParamSpec padParamSpec; + + padParamSpec.top = h_fir; + padParamSpec.bottom = h_sec; + padParamSpec.left = w_fir; + padParamSpec.right = w_sec; + padParamSpec.constant_value = 0.0; + switch (mode) { + case 0: { + padParamSpec.pad_mode = Pad_Constant; + break; + } + case 1: { + padParamSpec.pad_mode = Pad_Edge; + break; + } + case 2: { + // limitation: for reflect padding, h_fir and h_sec should be smaller than ih + padParamSpec.pad_mode = Pad_Reflect; + break; + } + case 3: { + padParamSpec.pad_mode = Pad_Symmetric; + break; + } + default: { + UNI_ERROR_LOG("unknown padding mode %d\n", mode); + break; + } + } + + TensorDesc inputDescCPU, inputDescGPU, outputDescCPU, outputDescGPU; + inputDescCPU = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + inputDescGPU = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + + U32 input_len = tensorNumElements(inputDescCPU); + U8 *inputCPU = ut_input_v(input_len, dt, UT_INIT_RANDOM); + U8 *outputGPU = NULL; + + std::shared_ptr<GCLHandle>
handleSharedPtr = OCLContext::getInstance().handle; + GCLHandle_t handle = handleSharedPtr.get(); + std::vector<GCLKernelInfo> kernelVec; + handle->kernelVec = &kernelVec; + Tensor inputTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + inputTensor.resize(inputDescGPU); + + MaliPara maliPara; + maliPara.handle = handle; + archInfo.archPara = &maliPara; + + CHECK_STATUS(padding_infer_output_size(&inputTensor, padParamSpec, &outputTensor, &archInfo)); + + GCLMem_t output = alloc_map(outputTensor); + GCLMem_t input = alloc(inputTensor); + CHECK_STATUS(gcl_fill_memory_zero(handle, input)); + + U32 maxBytes = 0; + U32 tmpBytes = 0; + tmpBytes = tensorNumBytes(inputDescGPU); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes); + + CHECK_STATUS(ocl_set_input(handle, input, inputDescGPU, inputCPU, tmpbuf, true)); + CHECK_STATUS(padding(inputTensor, padParamSpec, outputTensor, &archInfo)); + /*warm up*/ + UNI_INFO_LOG("Warm up gpu:\n") + for (U32 i = 0; i < 2; i++) { + CHECK_STATUS(gcl_run_kernelVec(handle)); + } +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size())); + double time = handle->t_execute * 0.001; +#else + CHECK_STATUS(gcl_run_kernelVec(handle)); +#endif + outputDescGPU = outputTensor.get_desc(); + CHECK_STATUS(ocl_get_output(handle, output, outputDescGPU, true)); + outputGPU = output->mapPtrArray.back(); + + char buffer[150]; + char params[120]; + U32 on, oc, oh, ow; + on = outputDescGPU.dims[3]; + oc = outputDescGPU.dims[2]; + oh = outputDescGPU.dims[1]; + ow = outputDescGPU.dims[0]; + sprintf(params, "(%u %u %u %u)->(%u %u %u %u)", in, ic, ih, iw, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "padding", params); +#ifdef _DEBUG + double ops = on * oc * oh * ow * 4; // TO DO + ut_log(dt, buffer, ops, time); +#endif + Tensor inputTensorCpu; + inputTensorCpu.resize(inputDescCPU); + inputTensorCpu.alloc(); + memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), inputCPU, tensorNumBytes(inputDescCPU)); + + Tensor outputTensorCpu; + CHECK_STATUS( + padding_infer_output_size(&inputTensorCpu, padParamSpec, &outputTensorCpu, &archInfo_org)); + outputTensorCpu.alloc(); + + if (UT_CHECK) { + CHECK_STATUS(padding(inputTensorCpu, padParamSpec, outputTensorCpu, &archInfo_org)); + } + TensorDesc desc = outputTensorCpu.get_desc(); + ut_check_a( + outputGPU, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), tensorNumElements(desc), dt); + + CHECK_STATUS(gcl_finish(handle)); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + free(inputCPU); + return 0; +} +#endif + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + paddingTest(argc, argv, DT_F16); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_pooling.cpp b/compute/tensor/tests/test_pooling.cpp new file mode 100644 index 00000000..66b69173 --- /dev/null +++ b/compute/tensor/tests/test_pooling.cpp @@ -0,0 +1,117 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#include "ut_util.h" + +int poolingTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 15); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // weight + U32 fn = atoi(argv[5]); + U32 fc = atoi(argv[6]); + U32 fh = atoi(argv[7]); + U32 fw = atoi(argv[8]); + // stride & padding + U32 stride = atoi(argv[9]); + U32 padding = atoi(argv[10]); + // output + U32 on = atoi(argv[11]); + U32 oc = atoi(argv[12]); + U32 oh = atoi(argv[13]); + U32 ow = atoi(argv[14]); + CHECK_REQUIREMENT(in == 1 && fn == 1 && fc == 1); + CHECK_REQUIREMENT(ic == oc && ic % 8 == 0); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + PoolingParamSpec poolingParamSpec; + poolingParamSpec.mode = POOLING_MAX; + + poolingParamSpec.stride_h = stride; + poolingParamSpec.stride_w = stride; + poolingParamSpec.padding_top = padding; + poolingParamSpec.padding_bottom = padding; + poolingParamSpec.padding_left = padding; + poolingParamSpec.padding_right = padding; + poolingParamSpec.kernel_h = fh; + poolingParamSpec.kernel_w = fw; + poolingParamSpec.rm = CEIL; + + Tensor inputTensor; + TensorDesc inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + inputTensor.resize(inputDesc); + inputTensor.alloc(); + U32 input_len = tensorNumElements(inputDesc); + U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc)); + + // set output + Tensor outputTensor, outputTensorRef; + CHECK_STATUS( + pooling_infer_output_size(&inputTensor, poolingParamSpec, &outputTensor, &archInfo)); + outputTensor.alloc(); + TensorDesc outputDesc_ref = outputTensor.get_desc(); + outputTensorRef.resize(outputDesc_ref); + outputTensorRef.alloc(); + U32 output_len = outputTensor.length(); + CHECK_REQUIREMENT(input_len == in * ic * ih * iw && output_len == on * oc * oh * ow); + Tensor tmpTensor; + if (UT_CHECK) { + CHECK_STATUS(pooling(inputTensor, poolingParamSpec, tmpTensor, outputTensor, &archInfo)); + + CHECK_STATUS( + pooling(inputTensor, poolingParamSpec, tmpTensor, outputTensorRef, &archInfo_org)); + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.05, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { 
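+ // each iteration re-runs the full pooling forward pass; the per-iteration + // average computed below is what ut_log reports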
+ CHECK_STATUS(pooling(inputTensor, poolingParamSpec, tmpTensor, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh, + fw, stride, padding, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Pooling", params); + double ops = 1.0 * on * oc * oh * ow * fh * fw; + ut_log(dt, buffer, ops, time); + + free(input); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + poolingTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + poolingTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_pooling_bp.cpp b/compute/tensor/tests/test_pooling_bp.cpp new file mode 100644 index 00000000..eb8c3943 --- /dev/null +++ b/compute/tensor/tests/test_pooling_bp.cpp @@ -0,0 +1,111 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
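+ +// Illustration (not part of the original file): for mean pooling the backward pass +// tested below spreads each incoming gradient evenly over its pooling window, so +// every input cell covered by a kh x kw window receives g / (kh * kw). A +// single-channel sketch assuming zero padding: +static inline void ut_mean_pool_bp_ref( + const float *gy, int oh, int ow, float *gx, int ih, int iw, int kh, int kw, int stride) +{ + for (int i = 0; i < ih * iw; i++) { + gx[i] = 0; + } + for (int y = 0; y < oh; y++) { + for (int x = 0; x < ow; x++) { + float g = gy[y * ow + x] / (kh * kw); + for (int r = 0; r < kh; r++) { + for (int c = 0; c < kw; c++) { + gx[(y * stride + r) * iw + (x * stride + c)] += g; + } + } + } + } +}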
+ +#include "tensor_computing.h" +#include "ut_util.h" + +int poolingbpTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 15); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // weight + U32 fn = atoi(argv[5]); + U32 fc = atoi(argv[6]); + U32 fh = atoi(argv[7]); + U32 fw = atoi(argv[8]); + // stride & padding + U32 stride = atoi(argv[9]); + U32 padding = atoi(argv[10]); + // output + U32 on = atoi(argv[11]); + U32 oc = atoi(argv[12]); + U32 oh = atoi(argv[13]); + U32 ow = atoi(argv[14]); + CHECK_REQUIREMENT(fn == 1 && fc == 1); + CHECK_REQUIREMENT(ic == oc && ic % 8 == 0); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + PoolingParamSpec poolingParamSpec; + poolingParamSpec.mode = POOLING_MEAN; + poolingParamSpec.stride_h = stride; + poolingParamSpec.stride_w = stride; + poolingParamSpec.padding_top = padding; + poolingParamSpec.padding_bottom = padding; + poolingParamSpec.padding_left = padding; + poolingParamSpec.padding_right = padding; + poolingParamSpec.kernel_h = fh; + poolingParamSpec.kernel_w = fw; + + Tensor inputTensor; + TensorDesc inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + inputTensor.resize(inputDesc); + inputTensor.alloc(); + U32 input_len = tensorNumElements(inputDesc); + U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc)); + + // set output + Tensor outputTensor, outputTensorRef; + TensorDesc outputDesc = tensor4df(dt, DF_NCHWC8, on, oc, oh, ow); + U32 output_len = tensorNumElements(outputDesc); + outputTensor.resize(outputDesc); + outputTensorRef.resize(outputDesc); + outputTensor.alloc(); + outputTensorRef.alloc(); + CHECK_REQUIREMENT(input_len == in * ic * ih * iw && output_len == on * oc * oh * ow); + + if (UT_CHECK) { + CHECK_STATUS(pooling_bp(inputTensor, poolingParamSpec, outputTensor, &archInfo)); + + CHECK_STATUS(pooling_bp(inputTensor, poolingParamSpec, outputTensorRef, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.05, __FILE__, __LINE__); + } + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(pooling_bp(inputTensor, poolingParamSpec, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)*(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh, + fw, stride, padding, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Pooling_bp", params); + double ops = 1.0 * on * oc * oh * ow * fh * fw; + ut_log(dt, buffer, ops, time); + + free(input); + return 0; +} + +int main(int argc, char **argv) +{ +// only support average pooling now +#ifdef _USE_FP32 + poolingbpTest(argc, argv, DT_F32); +#endif + return 0; +} \ No newline at end of file diff --git a/compute/tensor/tests/test_pooling_int8.cpp b/compute/tensor/tests/test_pooling_int8.cpp new file mode 100644 index 00000000..bc2784ec --- /dev/null +++ b/compute/tensor/tests/test_pooling_int8.cpp @@ -0,0 +1,150 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#include "ut_util.h" + +#ifdef _USE_INT8 +int int8PoolingTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 15); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // weight + U32 fn = atoi(argv[5]); + U32 fc = atoi(argv[6]); + U32 fh = atoi(argv[7]); + U32 fw = atoi(argv[8]); + // stride & padding + U32 stride = atoi(argv[9]); + U32 padding = atoi(argv[10]); + // output + U32 on = atoi(argv[11]); + U32 oc = atoi(argv[12]); + U32 oh = atoi(argv[13]); + U32 ow = atoi(argv[14]); + CHECK_REQUIREMENT(in == 1 && fn == 1 && fc == 1); + CHECK_REQUIREMENT(ic == oc && ic % 8 == 0); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + PoolingParamSpec poolingParamSpec; + poolingParamSpec.mode = POOLING_MEAN; + + poolingParamSpec.stride_h = stride; + poolingParamSpec.stride_w = stride; + poolingParamSpec.padding_top = padding; + poolingParamSpec.padding_bottom = padding; + poolingParamSpec.padding_left = padding; + poolingParamSpec.padding_right = padding; + poolingParamSpec.kernel_h = fh; + poolingParamSpec.kernel_w = fw; + poolingParamSpec.rm = CEIL; + + TensorDesc input_desc = tensor4df(DT_I8, DF_NCHWC8, in, ic, ih, iw); + TensorDesc in_desc_ref = input_desc; + in_desc_ref.dt = dt; + + Tensor inputTensor, outputTensor; + inputTensor.resize(input_desc); + + //TensorDesc output_desc; + CHECK_STATUS( + pooling_infer_output_size(&inputTensor, poolingParamSpec, &outputTensor, &archInfo)); + U32 input_len = tensorNumElements(input_desc); + U32 output_len = outputTensor.length(); + CHECK_REQUIREMENT(input_len == in * ic * ih * iw && output_len == on * oc * oh * ow); + + U8 *input_ref = ut_input_v(input_len, dt, UT_INIT_RANDOM); + Tensor inputTensorRef = Tensor::alloc_sized(in_desc_ref); + memcpy(get_ptr_from_tensor(inputTensorRef, UT_ARCH), input_ref, tensorNumBytes(in_desc_ref)); + + inputTensor.alloc(); + F16 inputScale = -1; + quantize_tensor(in_desc_ref, input_ref, &input_desc, get_ptr_from_tensor(inputTensor, UT_ARCH), + &inputScale); + inputTensor.set_scale(inputScale); + + outputTensor.alloc(); + INT8 *output = (INT8 *)get_ptr_from_tensor(outputTensor, UT_ARCH); + U8 *out_d = ut_input_v(output_len, dt, UT_INIT_ZERO); + + TensorDesc outDescRef = outputTensor.get_desc(); + outDescRef.dt = dt; + Tensor outputTensorRef = Tensor::alloc_sized(outDescRef); + + Tensor tmpTensor; + if 
(UT_CHECK) { + CHECK_STATUS(pooling(inputTensor, poolingParamSpec, tmpTensor, outputTensor, &archInfo)); + F32 outputScale = outputTensor.get_scale(); + for (U32 i = 0; i < output_len; i++) { + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + ((F32 *)out_d)[i] = output[i] / outputScale; + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + ((F16 *)out_d)[i] = output[i] / outputScale; + break; +#endif + default: + break; + } + } + + CHECK_STATUS( + pooling(inputTensorRef, poolingParamSpec, tmpTensor, outputTensorRef, &archInfo_org)); + + // check + ut_check_v(out_d, get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.05, + __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(pooling(inputTensor, poolingParamSpec, tmpTensor, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh, + fw, stride, padding, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Pooling", params); + double ops = 1.0 * on * oc * oh * ow * fh * fw; + ut_log(DT_I8, buffer, ops, time); + + free(input_ref); + free(out_d); + + return 0; +} +#endif + +int main(int argc, char **argv) +{ +#ifdef _USE_INT8 + int8PoolingTest(argc, argv, DT_F16); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_pooling_ocl.cpp b/compute/tensor/tests/test_pooling_ocl.cpp new file mode 100644 index 00000000..d507efa9 --- /dev/null +++ b/compute/tensor/tests/test_pooling_ocl.cpp @@ -0,0 +1,210 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
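+ +// Layout note (illustrative): in the blocked NCHWC8 format used on the CPU side, +// channels are packed in groups of 8, so element (c, h, w) of one image lives at +// ((c / 8) * ih * iw + h * iw + w) * 8 + (c % 8); +// the NCHWC8_to_NCHW helper below unpacks such a buffer into plain NCHW for +// comparison against the GPU output.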
+ +#include "tensor_computing.h" +#include "ut_util.h" +#include "libkernelsource.h" +#include +#include "gcl.h" +#include + +#ifdef _USE_FP16 +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} +void NCHWC8_to_NCHW(F16 *input_cpu, F16 *input_cpu_nchw, U32 ih, U32 iw, U32 ic) +{ + int index_c = 0; + int index_hw = 0; + int channel_k = 0; + for (int i = 0; i < (int)(ic * ih * iw);) { + index_c = i % (ih * iw); + index_hw = i / (ih * iw); + for (int k = 0; k < 8; k++) { + if (index_hw % 8 == 0) { + channel_k = index_hw * (ih * iw); + } + if (index_c == 0) { + for (int j = 0; j < (int)(ih * iw); j++) { + input_cpu_nchw[i++] = input_cpu[channel_k + k + j * 8]; + } + } + } + } +} + +int poolingTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 15); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // weight + U32 fn = atoi(argv[5]); + U32 fc = atoi(argv[6]); + U32 fh = atoi(argv[7]); + U32 fw = atoi(argv[8]); + // stride & padding + U32 stride = atoi(argv[9]); + U32 padding = atoi(argv[10]); + // output + U32 on = atoi(argv[11]); + U32 oc = atoi(argv[12]); + U32 oh = atoi(argv[13]); + U32 ow = atoi(argv[14]); + CHECK_REQUIREMENT(in == 1 && fn == 1 && fc == 1); + CHECK_REQUIREMENT(ic == oc && ic % 8 == 0); + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + PoolingParamSpec poolingParamSpec; + poolingParamSpec.mode = POOLING_MEAN; + poolingParamSpec.stride_h = stride; + poolingParamSpec.stride_w = stride; + poolingParamSpec.padding_top = padding; + poolingParamSpec.padding_bottom = padding; + poolingParamSpec.padding_left = padding; + poolingParamSpec.padding_right = padding; + poolingParamSpec.kernel_h = fh; + poolingParamSpec.kernel_w = fw; + poolingParamSpec.rm = CEIL; + + TensorDesc input_desc_cpu = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + TensorDesc input_desc_gpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + TensorDesc output_desc_cpu, output_desc_gpu; + U32 input_len = tensorNumElements(input_desc_cpu); + U8 *input_cpu_nchwc8 = ut_input_v(input_len, dt, UT_INIT_RANDOM); + U8 *input_cpu_nchw = ut_input_v(input_len, dt, UT_INIT_ZERO); + NCHWC8_to_NCHW((F16 *)input_cpu_nchwc8, (F16 *)input_cpu_nchw, ih, iw, ic); + Tensor inputTensorCpu; + inputTensorCpu.resize(input_desc_cpu); + inputTensorCpu.alloc(); + memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu_nchwc8, + tensorNumBytes(input_desc_cpu)); + + Tensor outputTensorCpu; + Tensor tmpTensorCpu; + CHECK_STATUS(pooling_infer_output_size( + &inputTensorCpu, poolingParamSpec, &outputTensorCpu, &archInfo_org)); + + outputTensorCpu.alloc(); + CHECK_STATUS( + pooling(inputTensorCpu, poolingParamSpec, tmpTensorCpu, outputTensorCpu, &archInfo_org)); + + U32 output_len = outputTensorCpu.length(); + U8 *output_cpu_nchw = ut_input_v(output_len, dt, UT_INIT_ZERO); + NCHWC8_to_NCHW( + (F16 *)get_ptr_from_tensor(outputTensorCpu, UT_ARCH), (F16 *)output_cpu_nchw, oh, ow, oc); + + std::shared_ptr<GCLHandle>
handleSharedPtr = OCLContext::getInstance().handle; + GCLHandle_t handle = handleSharedPtr.get(); + std::vector<GCLKernelInfo> kernelVec; + handle->kernelVec = &kernelVec; + Tensor inputTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + inputTensor.resize(input_desc_gpu); + + MaliPara maliPara; + maliPara.handle = handle; + archInfo.archPara = &maliPara; + + CHECK_STATUS( + pooling_infer_output_size(&inputTensor, poolingParamSpec, &outputTensor, &archInfo)); + U32 maxBytes = 0; + U32 tmpBytes = 0; + CHECK_STATUS(pooling_infer_forward_tmp_bytes(inputTensor, outputTensor, &tmpBytes, &archInfo)); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + + GCLMem_t output = alloc_map(outputTensor); + GCLMem_t input = alloc(inputTensor); + CHECK_STATUS(gcl_fill_memory_zero(handle, input)); + tmpBytes = tensorNumBytes(input_desc_gpu); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes); + + CHECK_STATUS(ocl_set_input(handle, input, input_desc_gpu, input_cpu_nchw, tmpbuf, true)); + CHECK_STATUS(pooling(inputTensor, poolingParamSpec, tmpTensor, outputTensor, &archInfo)); + + /*warm up*/ + UNI_INFO_LOG("Warm up gpu:\n") + for (U32 i = 0; i < 2; i++) { + CHECK_STATUS(gcl_run_kernelVec(handle)); + } + + UNI_INFO_LOG("Run:\n") +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size())); + double time = handle->t_execute * 0.001; +#else + CHECK_STATUS(gcl_run_kernelVec(handle)); +#endif + + TensorDesc outputDesc = outputTensor.get_desc(); + CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true)); + void *output_gpu_val = output->mapPtrArray.back(); + + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh, + fw, stride, padding, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Pooling", params); +#ifdef _DEBUG + double ops = 1.0 * on * oc * oh * ow * fh * fw; + ut_log(dt, buffer, ops, time); +#endif + + ut_check_a(output_gpu_val, output_cpu_nchw, on * oc * ow * oh, dt); + free(input_cpu_nchwc8); + free(input_cpu_nchw); + free(output_cpu_nchw); + CHECK_STATUS(gcl_finish(handle)); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + return 0; +} +#endif +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + poolingTest(argc, argv, DT_F16); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_power.cpp b/compute/tensor/tests/test_power.cpp new file mode 100644 index 00000000..8e91f4c9 --- /dev/null +++ b/compute/tensor/tests/test_power.cpp @@ -0,0 +1,85 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int powerTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc == 5);
+    U32 len = atoi(argv[1]);
+    PowerParamSpec p;
+    p.scale = atof(argv[2]);
+    p.shift = atof(argv[3]);
+    p.power = atof(argv[4]);
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+
+    Tensor inputTensor;
+    TensorDesc inputDesc = tensor1d(dt, len);
+    inputTensor.resize(inputDesc);
+    inputTensor.alloc();
+    U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM);
+    memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc));
+    // set output
+    Tensor outputTensor, outputTensorRef;
+    CHECK_STATUS(power_infer_output_size(&inputTensor, &outputTensor, &archInfo));
+    outputTensor.alloc();
+    TensorDesc outputDesc_ref = outputTensor.get_desc();
+    outputTensorRef.resize(outputDesc_ref);
+    outputTensorRef.alloc();
+
+    if (UT_CHECK) {
+        CHECK_STATUS(power(inputTensor, p, outputTensor, &archInfo));
+
+        // naive implement
+        CHECK_STATUS(power(inputTensor, p, outputTensorRef, &archInfo_org));
+
+        // check
+        ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH),
+            get_ptr_from_tensor(outputTensorRef, UT_ARCH), len, dt, 0.1, __FILE__, __LINE__);
+    }
+
+    // benchmark
+    double time_start = ut_time_ms();
+    for (int iter = 0; iter < UT_LOOPS; iter++) {
+        CHECK_STATUS(power(inputTensor, p, outputTensor, &archInfo));
+    }
+    double time_end = ut_time_ms();
+    double time = (time_end - time_start) / UT_LOOPS;
+
+    // log performance data
+    char buffer[150];
+    char params[120];
+    sprintf(params, "(%u)=(%u)", len, len);
+    sprintf(buffer, "%20s, %80s", "Power", params);
+    double ops = 2.0 * len;
+    // time is already the per-iteration average, so pass it directly
+    ut_log(dt, buffer, ops, time);
+
+    free(input);
+
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    powerTest(argc, argv, DT_F16);
+#endif
+#ifdef _USE_FP32
+    powerTest(argc, argv, DT_F32);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_power_ocl.cpp b/compute/tensor/tests/test_power_ocl.cpp
new file mode 100644
index 00000000..99a210a3
--- /dev/null
+++ b/compute/tensor/tests/test_power_ocl.cpp
@@ -0,0 +1,157 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifdef _USE_FP16
+#include "tensor_computing.h"
+#include "ut_util.h"
+#include "gcl.h"
+#include "libkernelsource.h"
+inline GCLMem_t alloc(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_map(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->mapped_alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_bytes(Tensor tensor, U32 size)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    GCLMem_t ptr = NULL;
+    if (size > 0) {
+        mem->resize(tensor1d(DT_U8, size));
+        mem->alloc();
+        ptr = (GCLMem_t)mem->get_ptr();
+    }
+    return ptr;
+}
+
+int powerTest(int argc, char **argv, DataType dt)
+{
+    U32 in = 1;
+    U32 ic = 4;
+    U32 ih = 4;
+    U32 iw = 4;
+    PowerParamSpec p;
+    p.scale = 0.5;
+    p.shift = 0.5;
+    p.power = 2;
+    if (argc == 8) {
+        in = atoi(argv[1]);
+        ic = atoi(argv[2]);
+        ih = atoi(argv[3]);
+        iw = atoi(argv[4]);
+        p.scale = atof(argv[5]);
+        p.shift = atof(argv[6]);
+        p.power = atof(argv[7]);
+    }
+    U32 on = in;
+    U32 oc = ic;
+    U32 oh = ih;
+    U32 ow = iw;
+
+    ArchInfo archInfo;
+    archInfo.arch = MALI;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+    U32 len = in * ic * ih * iw;
+
+    TensorDesc input_desc_cpu = tensor1d(dt, len);
+    TensorDesc output_desc_cpu = tensor1d(dt, len);
+    TensorDesc input_desc_gpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw);
+    TensorDesc output_desc_gpu;
+
+    U8 *input_cpu = ut_input_v(len, dt, UT_INIT_RANDOM);
+    U8 *output_gpu = NULL;
+
+    std::shared_ptr<GCLHandle> handleSharedPtr = OCLContext::getInstance().handle;
+    GCLHandle_t handle = handleSharedPtr.get();
+    std::vector<Kernel> kernelVec;
+    handle->kernelVec = &kernelVec;
+    Tensor inputTensor = Tensor(OCLMem);
+    Tensor outputTensor = Tensor(OCLMem);
+    Tensor tmpTensor = Tensor(OCLMem);
+    inputTensor.resize(input_desc_gpu);
+
+    MaliPara maliPara;
+    maliPara.handle = handle;
+    archInfo.archPara = &maliPara;
+    CHECK_STATUS(power_infer_output_size(&inputTensor, &outputTensor, &archInfo));
+
+    GCLMem_t output = alloc_map(outputTensor);
+    GCLMem_t input = alloc(inputTensor);
+    CHECK_STATUS(gcl_fill_memory_zero(handle, input));
+
+    U32 maxBytes = 0;
+    U32 tmpBytes = 0;
+    tmpBytes = tensorNumBytes(input_desc_gpu);
+    maxBytes = (tmpBytes > maxBytes) ?
tmpBytes : maxBytes;
+    GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes);
+
+    CHECK_STATUS(ocl_set_input(handle, input, input_desc_gpu, input_cpu, tmpbuf, true));
+    CHECK_STATUS(power(inputTensor, p, outputTensor, &archInfo));
+    /* warm up */
+    UNI_INFO_LOG("Warm up gpu:\n")
+    for (U32 i = 0; i < 2; i++) {
+        CHECK_STATUS(gcl_run_kernelVec(handle));
+    }
+
+    UNI_INFO_LOG("Run:\n")
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size()));
+    double time = handle->t_execute * 0.001;
+#else
+    CHECK_STATUS(gcl_run_kernelVec(handle));
+#endif
+
+    output_desc_gpu = outputTensor.get_desc();
+    CHECK_STATUS(ocl_get_output(handle, output, output_desc_gpu, true));
+    output_gpu = output->mapPtrArray.back();
+    char buffer[150];
+    char params[120];
+    sprintf(params, "(%u %u %u %u) = (%u %u %u %u)", in, ic, ih, iw, on, oc, oh, ow);
+    sprintf(buffer, "%20s, %80s", "Power", params);
+#ifdef _DEBUG
+    double ops = (2.0 * on * oc * oh * ow);
+    ut_log(dt, buffer, ops, time);
+#endif
+    Tensor inputTensorCpu;
+    inputTensorCpu.resize(input_desc_cpu);
+    inputTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(input_desc_cpu));
+
+    Tensor outputTensorCpu;
+    outputTensorCpu.resize(output_desc_cpu);
+    outputTensorCpu.alloc();
+
+    CHECK_STATUS(power(inputTensorCpu, p, outputTensorCpu, &archInfo_org));
+    ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), on * oc * ow * oh, dt);
+
+    CHECK_STATUS(gcl_finish(handle));
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
+    free(input_cpu);
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+    powerTest(argc, argv, DT_F16);
+    return 0;
+}
+#endif
diff --git a/compute/tensor/tests/test_prelu.cpp b/compute/tensor/tests/test_prelu.cpp
new file mode 100644
index 00000000..097db305
--- /dev/null
+++ b/compute/tensor/tests/test_prelu.cpp
@@ -0,0 +1,92 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
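+
+// test_prelu compares the optimized CPU PReLU kernel against the naive
+// CPU_GENERAL reference and reports GFLOPS over UT_LOOPS runs.
+// Illustrative invocation (arguments are n c h w; c must be a multiple of 8):
+//     ./test_prelu 1 8 16 16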
+ +#include "tensor_computing.h" +#include "ut_util.h" + +int preluTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 5); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + + CHECK_REQUIREMENT(ic % 8 == 0); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + PReLUParamSpec prelu_desc; + prelu_desc.propagate_down = 0; + TensorDesc inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + TensorDesc weightDesc = tensor1d(dt, ic); + U32 input_len = tensorNumElements(inputDesc); + U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM); + U8 *weight = ut_input_v(ic, dt, UT_INIT_RANDOM); + + Tensor inputTensor = Tensor::alloc_sized(inputDesc); + Tensor weightTensor = Tensor::alloc_sized(weightDesc); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc)); + memcpy(get_ptr_from_tensor(weightTensor, UT_ARCH), weight, tensorNumBytes(weightDesc)); + + // set output + Tensor outputTensor; + CHECK_STATUS(prelu_infer_output_size(&inputTensor, &outputTensor, &archInfo)); + outputTensor.alloc(); + Tensor outputTensorRef = Tensor::alloc_sized(outputTensor.get_desc()); + U32 output_len = outputTensor.length(); + CHECK_REQUIREMENT(input_len == in * ic * ih * iw && output_len == in * ic * ih * iw); + + if (UT_CHECK) { + CHECK_STATUS(prelu(inputTensor, weightTensor, prelu_desc, outputTensor, &archInfo)); + + CHECK_STATUS(prelu(inputTensor, weightTensor, prelu_desc, outputTensorRef, &archInfo_org)); + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.05, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(prelu(inputTensor, weightTensor, prelu_desc, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)*(%u)=(%u %u %u %u)", in, ic, ih, iw, ic, in, ic, ih, iw); + sprintf(buffer, "%20s, %80s", "Prelu", params); + double ops = 2.0 * in * ic * ih * iw + 1.0 * in; + ut_log(dt, buffer, ops, time); + + free(input); + free(weight); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + preluTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + preluTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_prelu_ocl.cpp b/compute/tensor/tests/test_prelu_ocl.cpp new file mode 100644 index 00000000..a61ec0b6 --- /dev/null +++ b/compute/tensor/tests/test_prelu_ocl.cpp @@ -0,0 +1,173 @@ + +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifdef _USE_FP16
+#include "tensor_computing.h"
+#include "ut_util.h"
+#include "gcl.h"
+#include "libkernelsource.h"
+inline GCLMem_t alloc(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_map(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->mapped_alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_bytes(Tensor tensor, U32 size)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    GCLMem_t ptr = NULL;
+    if (size > 0) {
+        mem->resize(tensor1d(DT_U8, size));
+        mem->alloc();
+        ptr = (GCLMem_t)mem->get_ptr();
+    }
+    return ptr;
+}
+inline GCLMem_t alloc_desc(Tensor tensor, GCLMemDesc desc)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->padding(desc);
+    mem->alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+int preluTest(int argc, char **argv, DataType dt)
+{
+    // input dim
+    U32 in = atoi(argv[1]);
+    U32 ic = atoi(argv[2]);
+    U32 ih = atoi(argv[3]);
+    U32 iw = atoi(argv[4]);
+    U32 prop = atoi(argv[5]);
+    U32 weightNum;
+
+    ArchInfo archInfo;
+    archInfo.arch = MALI;
+
+    TensorDesc inputDescGPU, outputDescGPU, weightDescGPU;
+    inputDescGPU = tensor4df(dt, DF_NCHW, in, ic, ih, iw);
+
+    U32 input_len = tensorNumElements(inputDescGPU);
+    U8 *inputCPU = ut_input_v(input_len, dt, UT_INIT_RANDOM);
+    U8 *weightCPU = NULL;
+    U8 *outputGPU = NULL;
+    PReLUParamSpec preluDesc;
+    if (prop) {
+        preluDesc.propagate_down = true;
+        weightCPU = ut_input_v(1, dt, UT_INIT_RANDOM);
+        weightDescGPU = tensor1d(dt, 1);
+    } else {
+        preluDesc.propagate_down = false;
+        weightCPU = ut_input_v(ic, dt, UT_INIT_RANDOM);
+        weightDescGPU = tensor1d(dt, ic);
+    }
+    std::shared_ptr<GCLHandle> handleSharedPtr = OCLContext::getInstance().handle;
+    GCLHandle_t handle = handleSharedPtr.get();
+    std::vector<Kernel> kernelVec;
+    handle->kernelVec = &kernelVec;
+    Tensor inputTensor = Tensor(OCLMem);
+    Tensor outputTensor = Tensor(OCLMem);
+    Tensor tmpTensor = Tensor(OCLMem);
+    Tensor weightTensor = Tensor(OCLMem);
+    inputTensor.resize(inputDescGPU);
+    weightTensor.resize(weightDescGPU);
+
+    MaliPara maliPara;
+    maliPara.handle = handle;
+    archInfo.archPara = &maliPara;
+
+    CHECK_STATUS(prelu_infer_output_size(&inputTensor, &outputTensor, &archInfo));
+
+    GCLMem_t output = alloc_map(outputTensor);
+    GCLMem_t input = alloc(inputTensor);
+    CHECK_STATUS(gcl_fill_memory_zero(handle, input));
+
+    GCLMemDesc desc = gclmem_build_desc();
+    if (preluDesc.propagate_down) {
+        weightNum = 1;
+        desc.byteSize = weightNum * bytesOf(dt);
+    } else {
+        weightNum = (ic + 3) / 4 * 4;
+        desc.byteSize = weightNum * bytesOf(dt);
+    }
+    desc.stride[0] = weightNum;
+    desc.stride[1] = 1;
+    desc.stride[2] = 1;
+    desc.offset[0] = 0;
+    desc.offset[1] = 0;
+    desc.offset[2] = 0;
+    desc.memType = GCL_MEM_BUF;
+    desc.num = weightNum;
+    desc.memFormat = DF_NHWC;
+    desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
+    if (preluDesc.propagate_down) {
+        desc.host_ptr = weightCPU;
+    } else {
+        // pad the per-channel weights to a multiple of 4; weightCPU holds exactly
+        // ic values, so copy only ic elements into the zero-initialized buffer
+        U8 *weight_align = ut_input_v((ic + 3) / 4 * 4, dt, UT_INIT_ZERO);
+        memcpy(weight_align, weightCPU, ic * bytesOf(dt));
+        desc.host_ptr = weight_align;
+    }
+    alloc_desc(weightTensor, desc);
+
+    U32 tmpBytes;
+    U32 maxBytes = 0;
+    tmpBytes = tensorNumBytes(inputDescGPU);
+    maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes;
+    GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes);
+
+    CHECK_STATUS(ocl_set_input(handle, input, inputDescGPU, inputCPU, tmpbuf, true));
+    CHECK_STATUS(prelu(inputTensor, weightTensor, preluDesc, outputTensor, &archInfo));
+    /* warm up */
+    UNI_INFO_LOG("Warm up gpu:\n")
+    for (U32 i = 0; i < 2; i++) {
+        CHECK_STATUS(gcl_run_kernelVec(handle));
+    }
+    UNI_INFO_LOG("Run gpu:\n")
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size()));
+// double time = handle->t_execute * 0.001;
+#else
+    CHECK_STATUS(gcl_run_kernelVec(handle));
+#endif
+    outputDescGPU = outputTensor.get_desc();
+    CHECK_STATUS(ocl_get_output(handle, output, outputDescGPU, true));
+    outputGPU = output->mapPtrArray.back();
+
+    char buffer[150];
+    char params[120];
+    sprintf(params, "(%u %u %u %u)->(%u %u %u %u)", in, ic, ih, iw, in, ic, ih, iw);
+    sprintf(buffer, "%20s, %80s", "prelu", params);
+    CHECK_STATUS(gcl_finish(handle));
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
+    free(inputCPU);
+    free(weightCPU);
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+    preluTest(argc, argv, DT_F16);
+    return 0;
+}
+#endif
diff --git a/compute/tensor/tests/test_priorbox.cpp b/compute/tensor/tests/test_priorbox.cpp
new file mode 100644
index 00000000..29dea659
--- /dev/null
+++ b/compute/tensor/tests/test_priorbox.cpp
@@ -0,0 +1,172 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
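+
+// test_priorbox feeds an NCHWC8 feature map plus image dimensions through the
+// priorbox operator and checks the optimized CPU path against the CPU_GENERAL
+// reference. It takes 17 to 20 arguments: the two input shapes, min/max box
+// sizes, flip, clip, step, the expected output shape, and one or two aspect
+// ratios (optionally a second min/max size pair).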
+ +#include "tensor_computing.h" +#include "ut_util.h" +#include + +int priorboxTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 18 || argc == 19 || argc == 20 || argc == 21); + // in0 feature map + U32 in0 = atoi(argv[1]); + U32 ic0 = atoi(argv[2]); + U32 ih0 = atoi(argv[3]); + U32 iw0 = atoi(argv[4]); + // in1 data + U32 in1 = atoi(argv[5]); + U32 ic1 = atoi(argv[6]); + U32 ih1 = atoi(argv[7]); + U32 iw1 = atoi(argv[8]); + // param priorbox + F32 min_size = (F32)atof(argv[9]); + F32 max_size = (F32)atof(argv[10]); + U32 flip = atoi(argv[11]); + U32 clip = atoi(argv[12]); + F32 step = (F32)atof(argv[13]); + // output + U32 on = atoi(argv[14]); + U32 oc = atoi(argv[15]); + U32 olens = atoi(argv[16]); + // multi param priorbox + F32 ar1 = (F32)atof(argv[17]); + F32 ar2 = 0; + F32 min_size1 = 0; + F32 max_size1 = 0; + if (argc == 19 || argc == 21) { + ar2 = (F32)atof(argv[18]); + if (argc == 21) { + min_size1 = (F32)atof(argv[19]); + max_size1 = (F32)atof(argv[20]); + } + } + if (argc == 20) { + min_size1 = (F32)atof(argv[18]); + max_size1 = (F32)atof(argv[19]); + } + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + CHECK_REQUIREMENT(in0 == 1 && in1 == 1 && on == 1 && oc == 2); + + PriorBoxParamSpec priorbox_desc; + int min_sizes_len = 1; + int max_sizes_len = 1; + int aspect_ratios_len = 1; + priorbox_desc.min_sizes[0] = min_size; + priorbox_desc.max_sizes[0] = max_size; + priorbox_desc.aspect_ratios[0] = ar1; + priorbox_desc.min_sizes[1] = min_size1; + priorbox_desc.max_sizes[1] = max_size1; + priorbox_desc.aspect_ratios[1] = ar2; + if (argc == 19 || argc == 21) { + aspect_ratios_len++; + if (argc == 21) { + min_sizes_len++; + max_sizes_len++; + } + } + if (argc == 20) { + min_sizes_len++; + max_sizes_len++; + } + priorbox_desc.flip = flip; + priorbox_desc.clip = clip; + priorbox_desc.image_h = ih1; + priorbox_desc.image_w = iw1; + priorbox_desc.step_h = step; + priorbox_desc.step_w = step; + priorbox_desc.variances[0] = 0.10000000149; + priorbox_desc.variances[1] = 0.10000000149; + priorbox_desc.variances[2] = 0.20000000298; + priorbox_desc.variances[3] = 0.20000000298; + priorbox_desc.offset = 0.5; + + std::vector inputTensors(2); + std::vector inputTensorPtrs(2); + Tensor inputTensor_fm, inputTensor_data; + TensorDesc inputDesc_fm = tensor4df(dt, DF_NCHWC8, in0, ic0, ih0, iw0); + TensorDesc inputDesc_data = tensor4df(dt, DF_NCHWC8, in1, ic1, ih1, iw1); + inputTensor_fm.resize(inputDesc_fm); + inputTensor_data.resize(inputDesc_data); + U32 input_len_fm = tensorNumElements(inputDesc_fm); + U32 input_len_data = tensorNumElements(inputDesc_data); + inputTensors[0] = inputTensor_fm; + inputTensors[1] = inputTensor_data; + inputTensorPtrs[0] = &inputTensors[0]; + inputTensorPtrs[1] = &inputTensors[1]; + // set output + Tensor outputTensor, outputTensorRef; + CHECK_STATUS( + priorbox_infer_output_size(inputTensorPtrs, priorbox_desc, &outputTensor, &archInfo)); + outputTensor.alloc(); + TensorDesc outputDesc_ref = outputTensor.get_desc(); + outputTensorRef.resize(outputDesc_ref); + outputTensorRef.alloc(); + U32 output_len = outputTensor.length(); + CHECK_REQUIREMENT(input_len_fm == in0 * ic0 * ih0 * iw0 && + input_len_data == in1 * ic1 * ih1 * iw1 && output_len == on * oc * olens); + + if (UT_CHECK) { + CHECK_STATUS(priorbox(inputTensors, priorbox_desc, outputTensor, &archInfo)); + + CHECK_STATUS(priorbox(inputTensors, priorbox_desc, outputTensorRef, &archInfo_org)); + // check + 
ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.05, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(priorbox(inputTensors, priorbox_desc, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + U32 num_priorboxs = aspect_ratios_len; + if (priorbox_desc.flip) { + num_priorboxs = num_priorboxs * 2; + } + U32 num_minsize = min_sizes_len; + num_priorboxs = (num_priorboxs + 1) * num_minsize; + if (max_sizes_len != 0) { + U32 num_maxsize = max_sizes_len; + num_priorboxs = num_priorboxs + num_maxsize; + } + U32 ochannel = 2; + U32 numperbox = 4; + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u) * (%u %u %u) = (%u %u %u)", in0, ic0, ih0, iw0, ochannel, + numperbox, num_priorboxs, on, oc, olens); + sprintf(buffer, "%20s, %80s", "Priorbox", params); + double ops = 1.0 * output_len; + ut_log(dt, buffer, ops, time); + + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + priorboxTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + priorboxTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_reduction.cpp b/compute/tensor/tests/test_reduction.cpp new file mode 100644 index 00000000..f815f543 --- /dev/null +++ b/compute/tensor/tests/test_reduction.cpp @@ -0,0 +1,107 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
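+
+// test_reduction runs a mean reduction over the listed axes and verifies the
+// optimized CPU result against the CPU_GENERAL reference. Illustrative
+// invocation (n c h w axes_num axes...): ./test_reduction 1 8 16 16 1 1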
+
+#include <string.h>  // memcpy
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int reductionTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc >= 6);
+    U32 in = atoi(argv[1]);
+    U32 ic = atoi(argv[2]);
+    U32 ih = atoi(argv[3]);
+    U32 iw = atoi(argv[4]);
+    ReductionParamSpec p;
+    p.axes_num = atoi(argv[5]);
+    for (int i = 0; i < p.axes_num; i++) {
+        p.axes[i] = atoi(argv[6 + i]);
+    }
+    p.reduction_mode = REDUCTION_MEAN;
+    p.coeff = 1.0;
+    p.keep_dim = true;
+    DataFormat df = DF_NCHW;
+    TensorDesc maskDesc;
+    maskDesc.nDims = 0;
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+
+    TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw);
+    U8 *input = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM);
+    Tensor inputTensor;
+    inputTensor.resize(inDesc);
+    inputTensor.alloc();
+    memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inDesc));
+
+    Tensor maskTensor;
+    maskTensor.resize(maskDesc);
+    Tensor outputTensor;
+    Tensor outputTensorRef;
+    CHECK_STATUS(reduction_infer_output_size(&inputTensor, maskTensor, p, &outputTensor));
+    outputTensor.alloc();
+    outputTensorRef.resize(outputTensor.get_desc());
+    outputTensorRef.alloc();
+
+    U32 tmpBytes;
+    CHECK_STATUS(
+        reduction_infer_forward_tmp_bytes(inputTensor, p, outputTensor, &tmpBytes, &archInfo));
+    Tensor tmpTensor;
+    tmpTensor.resize(tensor1d(dt, tmpBytes));
+    tmpTensor.alloc();
+
+    if (UT_CHECK) {
+        CHECK_STATUS(reduction(inputTensor, maskTensor, p, tmpTensor, outputTensor, &archInfo));
+
+        // naive implement
+        CHECK_STATUS(
+            reduction(inputTensor, maskTensor, p, tmpTensor, outputTensorRef, &archInfo_org));
+
+        // check
+        ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH),
+            get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), dt, 1, __FILE__,
+            __LINE__);
+    }
+
+    // benchmark
+    double time_start = ut_time_ms();
+    for (int iter = 0; iter < UT_LOOPS; iter++) {
+        CHECK_STATUS(reduction(inputTensor, maskTensor, p, tmpTensor, outputTensor, &archInfo));
+    }
+    double time_end = ut_time_ms();
+    double time = (time_end - time_start) / UT_LOOPS;
+
+    // log performance data
+    U32 on = 1, oc = 1, oh = 1, ow = 1;
+    CHECK_STATUS(tensor4dGet(outputTensor.get_desc(), &dt, &df, &on, &oc, &oh, &ow));
+    char buffer[150];
+    char params[120];
+    sprintf(params, "(%u %u %u %u) %d =(%u %u %u %u)", in, ic, ih, iw, p.axes_num, on, oc, oh, ow);
+    sprintf(buffer, "%20s, %80s", "Reduction", params);
+    double ops = 1.0 * in * ic * ih * iw;
+    // time is already the per-iteration average, so pass it directly
+    ut_log(dt, buffer, ops, time);
+
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    reductionTest(argc, argv, DT_F16);
+#endif
+#ifdef _USE_FP32
+    reductionTest(argc, argv, DT_F32);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_reshape.cpp b/compute/tensor/tests/test_reshape.cpp
new file mode 100644
index 00000000..563b1c62
--- /dev/null
+++ b/compute/tensor/tests/test_reshape.cpp
@@ -0,0 +1,100 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <stdlib.h>  // atoi
+#include <string.h>  // memcpy, memset
+
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int reshapeTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc > 4);
+    U32 in = atoi(argv[1]);
+    U32 ic = atoi(argv[2]);
+    U32 ih = atoi(argv[3]);
+    U32 iw = atoi(argv[4]);
+    ReshapeParamSpec p;
+    p.shape_size = atoi(argv[5]);
+    CHECK_REQUIREMENT(argc == 6 + p.shape_size);
+    for (I32 i = 0; i < p.shape_size; i++) {
+        p.shape_dims[i] = atoi(argv[6 + i]);
+    }
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+
+    DataFormat df = DF_NCHW;
+    TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw);
+    U32 len = tensorNumElements(inDesc);
+    U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM);
+    Tensor inputTensor;
+    inputTensor.resize(inDesc);
+    inputTensor.alloc();
+    memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inDesc));
+
+    Tensor outputTensor;
+    CHECK_STATUS(reshape_infer_output_size(&inputTensor, p, &outputTensor, &archInfo));
+    outputTensor.alloc();
+    TensorDesc outDesc = outputTensor.get_desc();
+    Tensor nullTensor;
+
+    if (UT_CHECK) {
+        CHECK_STATUS(reshape(inputTensor, nullTensor, outputTensor, &archInfo));
+
+        CHECK_REQUIREMENT(tensorNumElements(outDesc) == len);
+    }
+
+    double time_start = ut_time_ms();
+    for (int iter = 0; iter < UT_LOOPS; iter++) {
+        CHECK_STATUS(reshape(inputTensor, nullTensor, outputTensor, &archInfo));
+    }
+    double time_end = ut_time_ms();
+    double time = (time_end - time_start) / UT_LOOPS;
+
+    // log performance data
+    char buffer[150];
+    char params[120];
+    memset(params, 0, 120);
+    sprintf(params, "(%u %u %u %u)=(", in, ic, ih, iw);
+    for (I32 i = 0; i < p.shape_size; i++) {
+        I32 index = 0;
+        for (; index < 120; index++) {
+            if (params[index] == '\0') {
+                break;
+            }
+        }
+        if (i != p.shape_size - 1) {
+            sprintf(params + index, "%d ", outDesc.dims[outDesc.nDims - 1 - i]);
+        } else {
+            sprintf(params + index, "%d)", outDesc.dims[outDesc.nDims - 1 - i]);
+        }
+    }
+    sprintf(buffer, "%20s, %80s", "Reshape", params);
+    double ops = len;
+    ut_log(dt, buffer, ops, time);
+
+    free(input);
+
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    reshapeTest(argc, argv, DT_F16);
+#endif
+#ifdef _USE_FP32
+    reshapeTest(argc, argv, DT_F32);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_reshape_ocl.cpp b/compute/tensor/tests/test_reshape_ocl.cpp
new file mode 100644
index 00000000..694cb4d1
--- /dev/null
+++ b/compute/tensor/tests/test_reshape_ocl.cpp
@@ -0,0 +1,162 @@
+// Copyright (C) 2019.
Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifdef _USE_FP16
+#include "tensor_computing.h"
+#include "ut_util.h"
+#include "gcl.h"
+#include "libkernelsource.h"
+inline GCLMem_t alloc(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_map(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->mapped_alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_bytes(Tensor tensor, U32 size)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    GCLMem_t ptr = NULL;
+    if (size > 0) {
+        mem->resize(tensor1d(DT_U8, size));
+        mem->alloc();
+        ptr = (GCLMem_t)mem->get_ptr();
+    }
+    return ptr;
+}
+
+int reshapeTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc > 4);
+    U32 in = atoi(argv[1]);
+    U32 ic = atoi(argv[2]);
+    U32 ih = atoi(argv[3]);
+    U32 iw = atoi(argv[4]);
+    ReshapeParamSpec p;
+    p.shape_size = atoi(argv[5]);
+    CHECK_REQUIREMENT(argc == 6 + p.shape_size);
+    for (I32 i = 0; i < p.shape_size; i++) {
+        p.shape_dims[i] = atoi(argv[6 + i]);
+    }
+
+    ArchInfo archInfo;
+    archInfo.arch = MALI;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+
+    DataFormat df = DF_NCHW;
+    TensorDesc inputDesc = tensor4df(dt, df, in, ic, ih, iw);
+    U32 len = tensorNumElements(inputDesc);
+    U8 *input_cpu = ut_input_v(len, dt, UT_INIT_RANDOM);
+
+    Tensor inputTensorCpu;
+    inputTensorCpu.resize(inputDesc);
+    inputTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc));
+
+    Tensor outputTensorCpu;
+    Tensor tmpTensorCpu;
+    CHECK_STATUS(reshape_infer_output_size(&inputTensorCpu, p, &outputTensorCpu, &archInfo_org));
+    outputTensorCpu.alloc();
+    CHECK_STATUS(reshape(inputTensorCpu, tmpTensorCpu, outputTensorCpu, &archInfo_org));
+
+    U8 *output_gpu = NULL;
+    std::shared_ptr<GCLHandle> handleSharedPtr = OCLContext::getInstance().handle;
+    GCLHandle_t handle = handleSharedPtr.get();
+    std::vector<Kernel> kernelVec;
+    handle->kernelVec = &kernelVec;
+    Tensor inputTensor = Tensor(OCLMem);
+    Tensor outputTensor = Tensor(OCLMem);
+    Tensor tmpTensor = Tensor(OCLMem);
+    inputTensor.resize(inputDesc);
+
+    MaliPara maliPara;
+    maliPara.handle = handle;
+    archInfo.archPara = &maliPara;
+    CHECK_STATUS(reshape_infer_output_size(&inputTensor, p, &outputTensor, &archInfo));
+    TensorDesc outputDesc = outputTensor.get_desc();
+    U32 on, oc, oh, ow;
+    tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh,
&ow);
+
+    GCLMem_t output = alloc_map(outputTensor);
+    GCLMem_t input = alloc(inputTensor);
+    CHECK_STATUS(gcl_fill_memory_zero(handle, input));
+
+    U32 maxBytes = 0;
+    U32 tmpBytes = 0;
+    tmpBytes = tensorNumBytes(inputDesc);
+    maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes;
+    GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes);
+
+    CHECK_STATUS(ocl_set_input(handle, input, inputDesc, input_cpu, tmpbuf, true));
+    CHECK_STATUS(reshape(inputTensor, tmpTensor, outputTensor, &archInfo));
+
+    /* warm up */
+    UNI_INFO_LOG("Warm up gpu:\n")
+    for (U32 i = 0; i < 2; i++) {
+        CHECK_STATUS(gcl_run_kernelVec(handle));
+    }
+
+    UNI_INFO_LOG("Run:\n")
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size()));
+    double time = handle->t_execute * 0.001;
+#else
+    CHECK_STATUS(gcl_run_kernelVec(handle));
+#endif
+    outputDesc = outputTensor.get_desc();
+    CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true));
+    output_gpu = output->mapPtrArray.back();
+
+    char buffer[150];
+    char params[120];
+    memset(params, 0, 120);
+    sprintf(params, "(%u %u %u %u)=(", in, ic, ih, iw);
+    for (I32 i = 0; i < p.shape_size; i++) {
+        I32 index = 0;
+        for (; index < 120; index++) {
+            if (params[index] == '\0') {
+                break;
+            }
+        }
+        if (i != p.shape_size - 1) {
+            sprintf(params + index, "%d ", outputDesc.dims[outputDesc.nDims - 1 - i]);
+        } else {
+            sprintf(params + index, "%d)", outputDesc.dims[outputDesc.nDims - 1 - i]);
+        }
+    }
+    sprintf(buffer, "%20s, %80s", "Reshape", params);
+#ifdef _DEBUG
+    double ops = len;
+    ut_log(dt, buffer, ops, time);
+#endif
+    ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), len, dt);
+    CHECK_STATUS(gcl_finish(handle));
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
+    free(input_cpu);
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+    reshapeTest(argc, argv, DT_F16);
+    return 0;
+}
+#endif
diff --git a/compute/tensor/tests/test_rnn.cpp b/compute/tensor/tests/test_rnn.cpp
new file mode 100644
index 00000000..b2b14238
--- /dev/null
+++ b/compute/tensor/tests/test_rnn.cpp
@@ -0,0 +1,151 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
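+
+// test_rnn builds a uni-directional LSTM (projection size 1024) and checks the
+// optimized CPU implementation against the CPU_GENERAL reference.
+// Illustrative invocation (batch step xDim hDim): ./test_rnn 1 32 64 64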
+
+#include <string.h>  // memcpy
+
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int rnnTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc == 5);
+    U32 batch = atoi(argv[1]);
+    U32 step = atoi(argv[2]);
+    U32 xDim = atoi(argv[3]);
+    U32 hDim = atoi(argv[4]);
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+
+    RNNParamSpec rnnParamSpec;
+    rnnParamSpec.mode = RNN_LSTM;
+    rnnParamSpec.biDirection = false;
+    rnnParamSpec.numOutput = hDim;
+    rnnParamSpec.numProjection = 1024;
+    rnnParamSpec.forgetBias = 1.0;
+    rnnParamSpec.activationMode = ACTIVATION_TANH;
+    rnnParamSpec.zoneoutCell = 0;
+    rnnParamSpec.zoneoutOutput = 0;
+    F32 threshold = 10;
+    if (rnnParamSpec.numProjection > 0) {
+        threshold = 40;
+    }
+
+    U32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection
+                                                  : rnnParamSpec.numOutput;
+    U32 num2 = (rnnParamSpec.numProjection > 0) ? 2 : 1;
+    TensorDesc inputDesc = tensor3df(dt, DF_MTK, batch, step, xDim);
+    Tensor inputTensor;
+    inputTensor.resize(inputDesc);
+    inputTensor.alloc();
+    U32 inputLength = batch * step * xDim;
+    U8 *input = ut_input_v(inputLength, dt, UT_INIT_RANDOM);
+    memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc));
+
+    U32 tmpBytes;
+    std::vector<TensorDesc> filterDesc(2), biasDesc(2);
+    filterDesc[0] = tensor2df(dt, DF_NK, 4 * column, xDim + hDim);
+    filterDesc[1] = tensor2df(dt, DF_NK, rnnParamSpec.numOutput, rnnParamSpec.numProjection);
+    biasDesc[0] = tensor1d(dt, column * 4);
+    biasDesc[1] = tensor1d(dt, rnnParamSpec.numOutput);
+    std::vector<Tensor> filterTensor(num2), biasTensor(num2);
+    for (U32 i = 0; i < num2; i++) {
+        filterTensor[i].resize(filterDesc[i]);
+        filterTensor[i].alloc();
+        U8 *filter = ut_input_v(tensorNumBytes(filterDesc[i]) / bytesOf(dt), dt, UT_INIT_RANDOM);
+        memcpy(get_ptr_from_tensor(filterTensor[i], UT_ARCH), filter, tensorNumBytes(filterDesc[i]));
+        free(filter);
+
+        biasTensor[i].resize(biasDesc[i]);
+        biasTensor[i].alloc();
+        U8 *bias = ut_input_v(tensorNumBytes(biasDesc[i]) / bytesOf(dt), dt, UT_INIT_RANDOM);
+        memcpy(get_ptr_from_tensor(biasTensor[i], UT_ARCH), bias, tensorNumBytes(biasDesc[i]));
+        free(bias);
+    }
+
+    // set output
+    Tensor outputTensor, outputTensorRef;
+    CHECK_STATUS(rnn_infer_output_size(&inputTensor, rnnParamSpec, &outputTensor, &archInfo));
+    outputTensor.alloc();
+    U32 outputLength = outputTensor.length();
+
+    TensorDesc outputDesc_ref = outputTensor.get_desc();
+    outputTensorRef.resize(outputDesc_ref);
+    outputTensorRef.alloc();
+
+    CHECK_STATUS(rnn_infer_forward_tmp_bytes(
+        inputTensor, filterTensor[0], outputTensor, rnnParamSpec, &tmpBytes, &archInfo));
+    std::vector<U32> ftmBytes(num2);
+    CHECK_STATUS(rnn_transform_filter_bytes(filterTensor, rnnParamSpec, ftmBytes.data(), &archInfo));
+    std::vector<Tensor> ftmTensor(num2);
+    std::vector<Tensor *> ftmTensorPtr(num2);
+    for (U32 i = 0; i < num2; i++) {
+        ftmTensor[i].resize(tensor1d(DT_U8, ftmBytes[i]));
+        ftmTensor[i].alloc();
+        ftmTensorPtr[i] = &ftmTensor[i];
+    }
+
+    Tensor tmpTensor;
+    tmpTensor.resize(tensor1d(DT_U8, tmpBytes));
+    tmpTensor.alloc();
+
+    CHECK_STATUS(rnn_transform_filter(filterTensor, rnnParamSpec, ftmTensorPtr, &archInfo));
+
+    if (UT_CHECK) {
+        CHECK_STATUS(rnn(
+            inputTensor, ftmTensor, biasTensor, rnnParamSpec, tmpTensor, outputTensor, &archInfo));
+
+        // naive implement
+        CHECK_STATUS(rnn(inputTensor, ftmTensor, biasTensor, rnnParamSpec, tmpTensor,
+            outputTensorRef, &archInfo_org));
+
+        // check
+        ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH),
+
get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputLength, dt, threshold, __FILE__, + __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(rnn( + inputTensor, ftmTensor, biasTensor, rnnParamSpec, tmpTensor, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "%u (%u %u %u)=(%u %u)", batch, step, xDim, hDim, batch, hDim); + sprintf(buffer, "%20s, %80s", "RNN", params); + double hxDim = hDim + xDim; + double ops = 1.0 * batch * step * + (2.0 * hxDim * column * 4 + column * 4 + rnnParamSpec.numProjection * rnnParamSpec.numOutput); + ut_log(dt, buffer, ops, time); + + free(input); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + rnnTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + rnnTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_roialign.cpp b/compute/tensor/tests/test_roialign.cpp new file mode 100644 index 00000000..931f5e7a --- /dev/null +++ b/compute/tensor/tests/test_roialign.cpp @@ -0,0 +1,131 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
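+
+// test_roialign checks ROIAlign against the CPU_GENERAL reference. The 15
+// arguments are: feature map n c h w, roi rows/cols, batch-index count, the
+// expected output n c h w, pooled h/w, sampling ratio and spatial scale.
+// Illustrative invocation: ./test_roialign 1 8 16 16 4 4 4 4 8 7 7 7 7 2 0.25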
+ +#include "tensor_computing.h" +#include "ut_util.h" + +int roialignTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 16); + // in0 feature map + U32 in0 = atoi(argv[1]); + U32 ic0 = atoi(argv[2]); + U32 ih0 = atoi(argv[3]); + U32 iw0 = atoi(argv[4]); + // in1 rois + U32 ih1 = atoi(argv[5]); + U32 iw1 = atoi(argv[6]); + // in2 batch_indices + U32 ilens2 = atoi(argv[7]); + // output + U32 on0 = atoi(argv[8]); + U32 oc0 = atoi(argv[9]); + U32 oh0 = atoi(argv[10]); + U32 ow0 = atoi(argv[11]); + // p + U32 output_h = atoi(argv[12]); + U32 output_w = atoi(argv[13]); + U32 sampling_ratio = atoi(argv[14]); + F32 spatial_scale = (F32)atof(argv[15]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + RoiAlignParamSpec p; + p.output_h = output_h; + p.output_w = output_w; + p.sampling_ratio = sampling_ratio; + p.spatial_scale = spatial_scale; + + std::vector inputTensors(3); + std::vector inputTensorPtrs(3); + TensorDesc inputDesc_feat = tensor4d(dt, in0, ic0, ih0, iw0); + TensorDesc inputDesc_rois = tensor2d(dt, ih1, iw1); + TensorDesc inputDesc_batch = tensor1d(dt, ilens2); + Tensor inputTensor_feat = Tensor::alloc_sized(inputDesc_feat); + Tensor inputTensor_rois = Tensor::alloc_sized(inputDesc_rois); + Tensor inputTensor_batch = Tensor::alloc_sized(inputDesc_batch); + U32 input_len_feat = tensorNumElements(inputDesc_feat); + U32 input_len_rois = tensorNumElements(inputDesc_rois); + U32 input_len_batch = tensorNumElements(inputDesc_batch); + U8 *input_feat = ut_input_v(input_len_feat, dt, UT_INIT_RANDOM); + U8 *input_rois = ut_input_v(input_len_rois, dt, UT_INIT_RANDOM); + U8 *input_batch = ut_input_v(input_len_batch, dt, UT_INIT_ZERO); + memcpy( + get_ptr_from_tensor(inputTensor_feat, UT_ARCH), input_feat, tensorNumBytes(inputDesc_feat)); + memcpy( + get_ptr_from_tensor(inputTensor_rois, UT_ARCH), input_rois, tensorNumBytes(inputDesc_rois)); + memcpy(get_ptr_from_tensor(inputTensor_batch, UT_ARCH), input_batch, + tensorNumBytes(inputDesc_batch)); + inputTensors[0] = inputTensor_feat; + inputTensors[1] = inputTensor_rois; + inputTensors[2] = inputTensor_batch; + inputTensorPtrs[0] = &inputTensors[0]; + inputTensorPtrs[1] = &inputTensors[1]; + inputTensorPtrs[2] = &inputTensors[2]; + + // set output + Tensor outputTensor, outputTensorRef; + CHECK_STATUS(roialign_infer_output_size(inputTensorPtrs, p, &outputTensor, &archInfo)); + outputTensor.alloc(); + TensorDesc outputDesc_ref = outputTensor.get_desc(); + outputTensorRef.resize(outputDesc_ref); + outputTensorRef.alloc(); + U32 output_len = outputTensor.length(); + CHECK_REQUIREMENT(ih1 == on0 && ic0 == oc0 && output_h == oh0 && output_w == ow0); + CHECK_REQUIREMENT(input_len_feat == in0 * ic0 * ih0 * iw0 && input_len_rois == ih1 * iw1 && + input_len_batch == ilens2 && output_len == on0 * oc0 * oh0 * ow0); + + if (UT_CHECK) { + CHECK_STATUS(roialign(inputTensors, p, outputTensor, &archInfo)); + CHECK_STATUS(roialign(inputTensors, p, outputTensorRef, &archInfo_org)); + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.05, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(roialign(inputTensors, p, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, 
"(%u %u %u %u) * (%u %u) * (%u) * (%u %u) = (%u %u %u %u)", in0, ic0, ih0, iw0, + ih1, iw1, ilens2, output_h, output_w, on0, oc0, oh0, ow0); + sprintf(buffer, "%20s, %80s", "Roialign", params); + double ops = 1.0 * output_len; + ut_log(dt, buffer, ops, time); + + free(input_feat); + free(input_rois); + free(input_batch); + + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + roialignTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + roialignTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_scale.cpp b/compute/tensor/tests/test_scale.cpp new file mode 100644 index 00000000..0eb788c2 --- /dev/null +++ b/compute/tensor/tests/test_scale.cpp @@ -0,0 +1,90 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+
+#include <string.h>  // memcpy
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int scaleTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc == 5);
+    U32 in = atoi(argv[1]);
+    U32 ic = atoi(argv[2]);
+    U32 ih = atoi(argv[3]);
+    U32 iw = atoi(argv[4]);
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+
+    ScaleParamSpec p;
+    p.axis = 1;
+    DataFormat df = DF_NCHWC8;
+    TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw);
+    U32 len = tensorNumElements(inDesc);
+    U8 *data = ut_input_v(len, dt, UT_INIT_RANDOM);
+
+    Tensor dataTensor;
+    Tensor dataTensorRef;
+    dataTensor.resize(inDesc);
+    dataTensorRef.resize(inDesc);
+    dataTensor.alloc();
+    dataTensorRef.alloc();
+    memcpy(get_ptr_from_tensor(dataTensor, UT_ARCH), data, tensorNumBytes(inDesc));
+    memcpy(get_ptr_from_tensor(dataTensorRef, UT_ARCH), data, tensorNumBytes(inDesc));
+
+    U8 *alpha = ut_input_v(ic, dt, UT_INIT_RANDOM);
+    U8 *beta = ut_input_v(ic, dt, UT_INIT_RANDOM);
+
+    if (UT_CHECK) {
+        CHECK_STATUS(scale(dataTensor, alpha, beta, p, dataTensor, &archInfo));
+
+        // naive implement
+        CHECK_STATUS(scale(dataTensorRef, alpha, beta, p, dataTensorRef, &archInfo_org));
+
+        // check
+        ut_check_v(get_ptr_from_tensor(dataTensor, UT_ARCH),
+            get_ptr_from_tensor(dataTensorRef, UT_ARCH), len, dt, 1.0, __FILE__, __LINE__);
+    }
+
+    // benchmark
+    double time_start = ut_time_ms();
+    for (int iter = 0; iter < UT_LOOPS; iter++) {
+        CHECK_STATUS(scale(dataTensor, alpha, beta, p, dataTensor, &archInfo));
+    }
+    double time_end = ut_time_ms();
+    double time = (time_end - time_start) / UT_LOOPS;
+
+    // log performance data
+    char buffer[150];
+    char params[120];
+    sprintf(params, "(%u %u %u %u)=(%u %u %u %u)", in, ic, ih, iw, in, ic, ih, iw);
+    sprintf(buffer, "%20s, %80s", "Scale", params);
+    double ops = 2.0 * in * ic * ih * iw;
+    // time is already the per-iteration average, so pass it directly
+    ut_log(dt, buffer, ops, time);
+
+    free(data);
+    // release the scale parameters as well
+    free(alpha);
+    free(beta);
+
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    scaleTest(argc, argv, DT_F16);
+#endif
+#ifdef _USE_FP32
+    scaleTest(argc, argv, DT_F32);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_slice.cpp b/compute/tensor/tests/test_slice.cpp
new file mode 100644
index 00000000..e5f1b996
--- /dev/null
+++ b/compute/tensor/tests/test_slice.cpp
@@ -0,0 +1,95 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
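+
+// test_slice splits one NCHW tensor into `num` outputs along `axis` and checks
+// that the slices cover the whole input. Illustrative invocation
+// (num n c h w axis points...): ./test_slice 2 1 8 16 16 1 4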
+
+#include <string.h>  // memcpy
+
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int sliceTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc > 2);
+    I32 num = atoi(argv[1]);
+    CHECK_REQUIREMENT(argc == 2 + 4 + 1 + num - 1);
+    U32 in = atoi(argv[2]);
+    U32 ic = atoi(argv[3]);
+    U32 ih = atoi(argv[4]);
+    U32 iw = atoi(argv[5]);
+    SliceParamSpec p;
+    p.axis = atoi(argv[6]);
+    p.slice_size = num - 1;
+    for (U32 i = 0; i < p.slice_size; i++) {
+        p.slice_points[i] = atoi(argv[7 + i]);
+    }
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+
+    DataFormat df = DF_NCHW;
+    TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw);
+    U32 len = tensorNumElements(inDesc);
+    U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM);
+    Tensor inputTensor;
+    inputTensor.resize(inDesc);
+    inputTensor.alloc();
+    memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inDesc));
+
+    std::vector<Tensor> outputTensors(num);
+    std::vector<Tensor *> outputTensorsPtr(num);
+    for (I32 i = 0; i < num; i++) {
+        outputTensorsPtr[i] = &outputTensors[i];
+    }
+    CHECK_STATUS(slice_infer_output_size(&inputTensor, p, outputTensorsPtr, &archInfo));
+    for (I32 i = 0; i < num; i++) {
+        outputTensors[i].alloc();
+    }
+
+    if (UT_CHECK) {
+        CHECK_STATUS(slice(inputTensor, p, outputTensors, &archInfo));
+
+        U32 tmp = 0;
+        for (I32 i = 0; i < num; i++) {
+            tmp += outputTensors[i].length();
+        }
+        CHECK_REQUIREMENT(tmp == len);
+    }
+
+    double time_start = ut_time_ms();
+    for (int iter = 0; iter < UT_LOOPS; iter++) {
+        CHECK_STATUS(slice(inputTensor, p, outputTensors, &archInfo));
+    }
+    double time_end = ut_time_ms();
+    double time = (time_end - time_start) / UT_LOOPS;
+
+    // log performance data
+    char buffer[150];
+    char params[120];
+    sprintf(params, "(%u %u %u %u)=(%u %u %u %u)/%u", in, ic, ih, iw, in, ic, ih, iw, num);
+    sprintf(buffer, "%20s, %80s", "Slice", params);
+    double ops = num * len;
+    ut_log(dt, buffer, ops, time);
+
+    free(input);
+
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    sliceTest(argc, argv, DT_F16);
+#endif
+#ifdef _USE_FP32
+    sliceTest(argc, argv, DT_F32);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_softmax.cpp b/compute/tensor/tests/test_softmax.cpp
new file mode 100644
index 00000000..8bed13af
--- /dev/null
+++ b/compute/tensor/tests/test_softmax.cpp
@@ -0,0 +1,82 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
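+
+// test_softmax runs softmax over a 1 x len vector and compares the optimized
+// CPU kernel with the CPU_GENERAL reference. Illustrative invocation:
+//     ./test_softmax 1000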
+ +#include "tensor_computing.h" +#include "ut_util.h" + +int softmaxTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 2); + SoftmaxParamSpec p; + U32 len = atoi(argv[1]); + p.axis = 1; + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + TensorDesc inDesc = tensor2df(dt, DF_NORMAL, 1, len); + U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM); + Tensor inputTensor = Tensor::alloc_sized(inDesc); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inDesc)); + + Tensor outputTensor; + CHECK_STATUS(softmax_infer_output_size(&inputTensor, &outputTensor, &archInfo)); + outputTensor.alloc(); + Tensor outputTensorRef = Tensor::alloc_sized(outputTensor.get_desc()); + + Tensor blankTensor; + + if (UT_CHECK) { + CHECK_STATUS(softmax(inputTensor, p, blankTensor, outputTensor, &archInfo)); + + // naive implement + CHECK_STATUS(softmax(inputTensor, p, blankTensor, outputTensorRef, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), dt, 0.1, __FILE__, + __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(softmax(inputTensor, p, blankTensor, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u)=(%u)", len, len); + sprintf(buffer, "%20s, %80s", "Softmax", params); + double ops = 4.0 * len; + ut_log(dt, buffer, ops, time); + + free(input); + + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + softmaxTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + softmaxTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_softmax_h1w1_ocl.cpp b/compute/tensor/tests/test_softmax_h1w1_ocl.cpp new file mode 100644 index 00000000..bf938a45 --- /dev/null +++ b/compute/tensor/tests/test_softmax_h1w1_ocl.cpp @@ -0,0 +1,150 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+
+#include <string.h>
+#include "tensor_computing.h"
+#include "ut_util.h"
+#include "gcl.h"
+#include "libkernelsource.h"
+inline GCLMem_t alloc(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_map(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->mapped_alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_bytes(Tensor tensor, U32 size)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    GCLMem_t ptr = NULL;
+    if (size > 0) {
+        mem->resize(tensor1d(DT_U8, size));
+        mem->alloc();
+        ptr = (GCLMem_t)mem->get_ptr();
+    }
+    return ptr;
+}
+
+int softmaxTest(int argc, char **argv, DataType dt)
+{
+    U32 in, ic, ih, iw;
+    in = 1;
+    ic = 4;
+    ih = 1;
+    iw = 1;
+
+    if (argc == 2) {
+        ic = atoi(argv[1]);
+    }
+
+    SoftmaxParamSpec p;
+    p.axis = 1;
+
+    ArchInfo archInfo;
+    archInfo.arch = MALI;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+
+    TensorDesc in_desc, in_desc_gpu, out_desc;
+    in_desc = tensor4df(dt, DF_NCHW, in, ic, ih, iw);
+
+    U8 *input_cpu = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM);
+    U8 *output_gpu = NULL;
+    in_desc_gpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw);
+
+    std::shared_ptr<GCLHandle> handleSharedPtr = OCLContext::getInstance().handle;
+    GCLHandle_t handle = handleSharedPtr.get();
+    std::vector<GCLKernelInfo> kernelVec;
+    handle->kernelVec = &kernelVec;
+    Tensor inputTensor = Tensor(OCLMem);
+    Tensor outputTensor = Tensor(OCLMem);
+    Tensor tmpTensor = Tensor(OCLMem);
+    inputTensor.resize(in_desc_gpu);
+
+    MaliPara maliPara;
+    maliPara.handle = handle;
+    archInfo.archPara = &maliPara;
+    CHECK_STATUS(softmax_infer_output_size(&inputTensor, &outputTensor, &archInfo));
+    U32 maxBytes = 0;
+    U32 tmpBytes;
+    CHECK_STATUS(softmax_infer_forward_tmp_bytes(inputTensor, &tmpBytes, &archInfo));
+    maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes;
+
+    GCLMem_t output = alloc_map(outputTensor);
+    GCLMem_t input = alloc(inputTensor);
+    CHECK_STATUS(gcl_fill_memory_zero(handle, input));
+    tmpBytes = tensorNumBytes(in_desc_gpu);
+    maxBytes = (tmpBytes > maxBytes) ?
tmpBytes : maxBytes;
+    GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes);
+
+    CHECK_STATUS(ocl_set_input(handle, input, in_desc_gpu, input_cpu, tmpbuf, true));
+    CHECK_STATUS(softmax(inputTensor, p, tmpTensor, outputTensor, &archInfo));
+    /* warm up */
+    UNI_INFO_LOG("Warm up gpu:\n")
+    for (U32 i = 0; i < 2; i++) {
+        CHECK_STATUS(gcl_run_kernelVec(handle));
+    }
+
+    UNI_INFO_LOG("Run:\n")
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size()));
+    double time = handle->t_execute * 0.001;
+#else
+    CHECK_STATUS(gcl_run_kernelVec(handle));
+#endif
+    out_desc = outputTensor.get_desc();
+    CHECK_STATUS(ocl_get_output(handle, output, out_desc, true));
+    output_gpu = output->mapPtrArray.back();
+
+    char buffer[150];
+    char params[120];
+    sprintf(params, "(%u %u %u %u)", in, ic, ih, iw);
+    sprintf(buffer, "%20s, %80s", "softmax_h1w1", params);
+#ifdef _DEBUG
+    double ops = 1;
+    ut_log(dt, buffer, ops, time);
+#endif
+    Tensor inputTensorCpu;
+    inputTensorCpu.resize(in_desc);
+    inputTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(in_desc));
+
+    Tensor outputTensorCpu;
+    outputTensorCpu.resize(out_desc);
+    outputTensorCpu.alloc();
+
+    Tensor tmpTensorCpu;
+    CHECK_STATUS(softmax(inputTensorCpu, p, tmpTensorCpu, outputTensorCpu, &archInfo_org));
+
+    ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), in * ih * iw * ic, dt);
+    CHECK_STATUS(gcl_finish(handle));
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
+
+    free(input_cpu);
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    softmaxTest(argc, argv, DT_F16);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_split.cpp b/compute/tensor/tests/test_split.cpp
new file mode 100644
index 00000000..a75f6431
--- /dev/null
+++ b/compute/tensor/tests/test_split.cpp
@@ -0,0 +1,87 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
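+
+// Test outline: split copies one DF_NCHWC8 input into num output tensors;
+// every output must match the input exactly, so ut_check_v is called with a
+// tolerance of 0.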
+
+#include <string.h>
+
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int splitTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc == 6);
+    I32 num = atoi(argv[1]);
+    U32 in = atoi(argv[2]);
+    U32 ic = atoi(argv[3]);
+    U32 ih = atoi(argv[4]);
+    U32 iw = atoi(argv[5]);
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+
+    DataFormat df = DF_NCHWC8;
+    TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw);
+    U32 len = tensorNumElements(inDesc);
+    U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM);
+    Tensor inputTensor;
+    inputTensor.resize(inDesc);
+    inputTensor.alloc();
+    memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inDesc));
+
+    std::vector<Tensor> outputTensors(num);
+    std::vector<Tensor *> outputTensorsPtr(num);
+    for (I32 i = 0; i < num; i++) {
+        outputTensorsPtr[i] = &outputTensors[i];
+    }
+    CHECK_STATUS(split_infer_output_size(&inputTensor, outputTensorsPtr));
+    for (I32 i = 0; i < num; i++) {
+        outputTensors[i].alloc();
+    }
+
+    if (UT_CHECK) {
+        CHECK_STATUS(split(inputTensor, outputTensors, &archInfo));
+
+        for (I32 i = 0; i < num; i++) {
+            ut_check_v(get_ptr_from_tensor(outputTensors[i], UT_ARCH), input, len, dt, 0, __FILE__,
+                __LINE__);
+        }
+    }
+
+    double time_start = ut_time_ms();
+    for (int iter = 0; iter < UT_LOOPS; iter++) {
+        CHECK_STATUS(split(inputTensor, outputTensors, &archInfo));
+    }
+    double time_end = ut_time_ms();
+    double time = (time_end - time_start) / UT_LOOPS;
+
+    // log performance data
+    char buffer[150];
+    char params[120];
+    sprintf(params, "(%u %u %u %u)=(%u %u %u %u)*%u", in, ic, ih, iw, in, ic, ih, iw, num);
+    sprintf(buffer, "%20s, %80s", "Split", params);
+    double ops = num * len;
+    ut_log(dt, buffer, ops, time);
+
+    free(input);
+
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    splitTest(argc, argv, DT_F16);
+#endif
+#ifdef _USE_FP32
+    splitTest(argc, argv, DT_F32);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_tile.cpp b/compute/tensor/tests/test_tile.cpp
new file mode 100644
index 00000000..657e6b3b
--- /dev/null
+++ b/compute/tensor/tests/test_tile.cpp
@@ -0,0 +1,61 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int tileTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc == 7);
+    // input dim
+    U32 in = atoi(argv[1]);
+    U32 ic = atoi(argv[2]);
+    U32 ih = atoi(argv[3]);
+    U32 iw = atoi(argv[4]);
+    // input axis and tiles
+    TileParamSpec tileParamSpec;
+    tileParamSpec.axis = atoi(argv[5]);
+    tileParamSpec.dimsSize = 0;
+    tileParamSpec.repeatsInfo[0] = atoi(argv[6]);
+
+    // set input
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+    DataFormat df = DF_NCHW;
+    TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw);
+    U32 len = tensorNumElements(inDesc);
+    U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM);
+    Tensor inputTensor = Tensor::alloc_sized<CpuMemory>(inDesc);
+    memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, inputTensor.bytes());
+
+    // set output
+    Tensor outputTensor;
+    CHECK_STATUS(tile_infer_output_size(&inputTensor, tileParamSpec, &outputTensor, &archInfo));
+    outputTensor.alloc();
+    if (UT_CHECK) {
+        CHECK_STATUS(tile(inputTensor, tileParamSpec, outputTensor, &archInfo));
+
+        CHECK_REQUIREMENT(outputTensor.length() == (len * tileParamSpec.repeatsInfo[0]));
+    }
+
+    free(input);
+
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    tileTest(argc, argv, DT_F16);
+#endif
+#ifdef _USE_FP32
+    tileTest(argc, argv, DT_F32);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_transpose.cpp b/compute/tensor/tests/test_transpose.cpp
new file mode 100644
index 00000000..48f9decc
--- /dev/null
+++ b/compute/tensor/tests/test_transpose.cpp
@@ -0,0 +1,98 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
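+
+// Test outline: the permutation p comes from argv, and its inverse p_inv is
+// built by p_inv.trans_dims[p.trans_dims[i]] = i. For example, p = (0 2 3 1)
+// gives p_inv = (0 3 1 2); applying p and then p_inv must reproduce the
+// input, which ut_check_v verifies with a 0.0001 tolerance.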
+
+#include <string.h>
+
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int transposeTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc == 9);
+    U32 in = atoi(argv[1]);
+    U32 ic = atoi(argv[2]);
+    U32 ih = atoi(argv[3]);
+    U32 iw = atoi(argv[4]);
+    TransposeParamSpec p, p_inv;
+    p.trans_size = 4;
+    p_inv.trans_size = 4;
+    for (int i = 0; i < 4; i++) {
+        I32 value = atoi(argv[5 + i]);
+        p.trans_dims[i] = value;
+        p_inv.trans_dims[value] = i;
+    }
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+
+    DataFormat df = DF_NCHW;
+    TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw);
+    U32 len = tensorNumElements(inDesc);
+    U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM);
+    Tensor inputTensor;
+    inputTensor.resize(inDesc);
+    inputTensor.alloc();
+    memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inDesc));
+
+    Tensor outputTensor1;
+    Tensor outputTensor2;
+    CHECK_STATUS(transpose_infer_output_size(&inputTensor, p, &outputTensor1, &archInfo));
+    CHECK_STATUS(transpose_infer_output_size(&outputTensor1, p_inv, &outputTensor2, &archInfo));
+    outputTensor1.alloc();
+    outputTensor2.alloc();
+    Tensor blankTensor;
+
+    if (UT_CHECK) {
+        CHECK_STATUS(transpose(inputTensor, p, blankTensor, outputTensor1, &archInfo));
+
+        CHECK_STATUS(transpose(outputTensor1, p_inv, blankTensor, outputTensor2, &archInfo));
+
+        // check
+        ut_check_v(input, get_ptr_from_tensor(outputTensor2, UT_ARCH), len, dt, 0.0001, __FILE__,
+            __LINE__);
+    }
+
+    double time_start = ut_time_ms();
+    for (int iter = 0; iter < UT_LOOPS; iter++) {
+        CHECK_STATUS(transpose(inputTensor, p, blankTensor, outputTensor1, &archInfo));
+    }
+    double time_end = ut_time_ms();
+    double time = (time_end - time_start) / UT_LOOPS;
+
+    U32 on = 0;
+    U32 oc = 0;
+    U32 oh = 0;
+    U32 ow = 0;
+    CHECK_STATUS(tensor4dGet(outputTensor1.get_desc(), &dt, &df, &on, &oc, &oh, &ow));
+    // log performance data
+    char buffer[150];
+    char params[120];
+    sprintf(params, "(%u %u %u %u)=(%u %u %u %u)", in, ic, ih, iw, on, oc, oh, ow);
+    sprintf(buffer, "%20s, %80s", "Transpose", params);
+    double ops = len;
+    ut_log(dt, buffer, ops, time);
+
+    free(input);
+
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    transposeTest(argc, argv, DT_F16);
+#endif
+#ifdef _USE_FP32
+    transposeTest(argc, argv, DT_F32);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_transpose_ocl.cpp b/compute/tensor/tests/test_transpose_ocl.cpp
new file mode 100644
index 00000000..0f1368fc
--- /dev/null
+++ b/compute/tensor/tests/test_transpose_ocl.cpp
@@ -0,0 +1,158 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <string.h>
+#include <stdlib.h>
+#include "tensor_computing.h"
+#include "ut_util.h"
+#include "gcl.h"
+#include "libkernelsource.h"
+inline GCLMem_t alloc(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_map(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->mapped_alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_bytes(Tensor tensor, U32 size)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    GCLMem_t ptr = NULL;
+    if (size > 0) {
+        mem->resize(tensor1d(DT_U8, size));
+        mem->alloc();
+        ptr = (GCLMem_t)mem->get_ptr();
+    }
+    return ptr;
+}
+
+int transposeTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc == 9);
+    U32 in = atoi(argv[1]);
+    U32 ic = atoi(argv[2]);
+    U32 ih = atoi(argv[3]);
+    U32 iw = atoi(argv[4]);
+    TransposeParamSpec p;
+    p.trans_size = 4;
+    for (int i = 0; i < 4; i++) {
+        I32 value = atoi(argv[5 + i]);
+        p.trans_dims[i] = value;
+    }
+
+    ArchInfo archInfo;
+    ArchInfo archInfo_org;
+    archInfo.arch = MALI;
+    archInfo_org.arch = CPU_GENERAL;
+
+    TensorDesc inputDesc_cpu, inputDesc_gpu, outputDesc;
+    inputDesc_cpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw);
+    inputDesc_gpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw);
+
+    U32 len = tensorNumElements(inputDesc_cpu);
+    U8 *input_cpu = ut_input_v(len, dt, UT_INIT_RANDOM);
+
+    Tensor inputTensorCpu;
+    inputTensorCpu.resize(inputDesc_cpu);
+    inputTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc_cpu));
+    Tensor outputTensorCpu;
+    Tensor tmpTensorCpu;
+    // run on cpu
+    CHECK_STATUS(transpose_infer_output_size(&inputTensorCpu, p, &outputTensorCpu, &archInfo_org));
+    outputTensorCpu.alloc();
+    CHECK_STATUS(transpose(inputTensorCpu, p, tmpTensorCpu, outputTensorCpu, &archInfo_org));
+    // run on gpu
+    std::shared_ptr<GCLHandle> handleSharedPtr = OCLContext::getInstance().handle;
+    GCLHandle_t handle = handleSharedPtr.get();
+    std::vector<GCLKernelInfo> kernelVec;
+    handle->kernelVec = &kernelVec;
+    Tensor inputTensor = Tensor(OCLMem);
+    Tensor outputTensor = Tensor(OCLMem);
+    Tensor tmpTensor = Tensor(OCLMem);
+    inputTensor.resize(inputDesc_gpu);
+    U8 *output_gpu = NULL;
+    MaliPara maliPara;
+    maliPara.handle = handle;
+    archInfo.archPara = &maliPara;
+
+    CHECK_STATUS(transpose_infer_output_size(&inputTensor, p, &outputTensor, &archInfo));
+
+    U32 maxBytes = 0;
+    U32 tmpBytes;
+    CHECK_STATUS(transpose_infer_forward_tmp_bytes(inputTensor, outputTensor, &tmpBytes, &archInfo));
+    maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes;
+
+    GCLMem_t output = alloc_map(outputTensor);
+    GCLMem_t input = alloc(inputTensor);
+    CHECK_STATUS(gcl_fill_memory_zero(handle, input));
+
+    tmpBytes = tensorNumBytes(inputDesc_gpu);
+    maxBytes = (tmpBytes > maxBytes) ?
tmpBytes : maxBytes;
+    GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes);
+
+    CHECK_STATUS(ocl_set_input(handle, input, inputDesc_gpu, input_cpu, tmpbuf, true));
+    CHECK_STATUS(transpose(inputTensor, p, tmpTensor, outputTensor, &archInfo));
+    /* warm up */
+    UNI_INFO_LOG("Warm up gpu:\n")
+    for (U32 i = 0; i < 2; i++) {
+        CHECK_STATUS(gcl_run_kernelVec(handle));
+    }
+
+    UNI_INFO_LOG("Run:\n")
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size()));
+    double time = handle->t_execute * 0.001;
+#else
+    CHECK_STATUS(gcl_run_kernelVec(handle));
+#endif
+    outputDesc = outputTensor.get_desc();
+    CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true));
+    output_gpu = output->mapPtrArray.back();
+
+    char buffer[150];
+    char params[120];
+    U32 on = outputDesc.dims[3];
+    U32 oc = outputDesc.dims[2];
+    U32 oh = outputDesc.dims[1];
+    U32 ow = outputDesc.dims[0];
+    sprintf(params, "(%u %u %u %u)=(%u %u %u %u)", in, ic, ih, iw, on, oc, oh, ow);
+    sprintf(buffer, "%20s, %80s", "Transpose", params);
+#ifdef _DEBUG
+    double ops = len;
+    ut_log(dt, buffer, ops, time);
+#endif
+    ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), len, dt);
+
+    CHECK_STATUS(gcl_finish(handle));
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
+    free(input_cpu);
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    transposeTest(argc, argv, DT_F16);
+#endif
+    return 0;
+}
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
new file mode 100644
index 00000000..1cb2adb1
--- /dev/null
+++ b/docs/ARCHITECTURE.md
@@ -0,0 +1,21 @@
+# Architecture
+
+![bolt_framework](images/Framework.PNG)
+
+- [common](../common)
+  - [uni](../common/uni) hosts the common headers used across bolt. The model representation of bolt is [ModelSpec](../uni/include/type), which defines the rigorous model format used by bolt.
+  - [gcl](../common/gcl) hosts the setup of the MALI GPU environment.
+  - [memory](../common/memory) hosts the memory data structures that bolt needs.
+- [model_tools](../model_tools)
+  - [X2bolt](../model_tools/tools/X2bolt) : a general converter that transforms models from different deep learning frameworks into bolt models.
+  - [model_optimizer](../model_tools/include/model_optimizer.hpp) : a static computation graph optimizer that fuses operators and simplifies the calculation graph.
+- [compute](../compute)
+  - [blas_enhance](../compute/blas_enhance) hosts fast implementations of matrix-matrix and matrix-vector multiplication for FP32, FP16 and INT8. It is referenced by some of the operators in [tensor](../compute/tensor).
+  - [tensor](../compute/tensor) hosts the implementations of all operators defined by bolt.
+  - [image](../compute/image) hosts common preprocessing routines for image inputs (e.g. bilinear interpolation).
+- [inference](../inference)
+  - [engine](../inference/engine) hosts the inference engine for neural networks.
+  - [flow](../inference/flow) hosts the multi-backend (CPU+GPU) heterogeneous device scheduling for time-series data.
+  - [examples](../inference/examples) gives some application examples (Network Benchmark, ImageNet classification).
+
+For API, Flow and operator development, please refer to [DEVELOPER.md](DEVELOPER.md).
\ No newline at end of file diff --git a/docs/BENCHMARK.md b/docs/BENCHMARK.md index ac877563..2d488639 100644 --- a/docs/BENCHMARK.md +++ b/docs/BENCHMARK.md @@ -1,165 +1,340 @@ # Benchmark Report -We have tested kinds of neural network models with Bolt(v0.2.0) on HUAWEI 810 mobile phone and HUAWEI 990 mobile phone respectively. The benchmark data are given under different operating systems, different computing cores, and different inference accuracy. For more detailed evaluation data, please refer to the following table. +We have tested kinds of neural network models with Bolt(v1.0.0) on HUAWEI 810 mobile phone and HUAWEI 990 mobile phone respectively. The benchmark data are given under different operating systems, different computing cores, and different inference accuracy. For more detailed evaluation data, please refer to the following table. -| Model | Framework | Os | Compiler | Kirin Soc Version | Core | Precision | Cpu or Gpu | Thread | Input Size | Performance1 | Performance2 | Performance3 | Accuracy1 | Accuracy2 | -| -------------------- | --------- | ----------- | -------- | ----------------- | ------------- | --------- | ---------- | ------ | ----------- | --------------------------- | ------------------------ | ------------------------ | ------------- | ------------- | -| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 32+32+32 | avg_time:17.5701ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 32+32+32 | avg_time:9.3479ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 32+32+32 | avg_time:4.48315ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 32+32+32 | avg_time:2.6499ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 32+32+32 | avg_time:16.9138ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 32+32+32 | avg_time:8.96313ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 32+32+32 | avg_time:3.69189ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 32+32+32 | avg_time:2.2041ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 32+32+32 | avg_time:16.96ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 32+32+32 | avg_time:8.88281ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 32+32+32 | avg_time:4.39697ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 32+32+32 | avg_time:2.84619ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 32+32+32 | avg_time:15.3071ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 32+32+32 | avg_time:8.33203ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 32+32+32 | avg_time:3.84497ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 32+32+32 | avg_time:2.3479ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 32+32+32 | avg_time:875.392ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 32+32+32 | avg_time:497.91ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 
32+32+32 | avg_time:295.943ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 32+32+32 | avg_time:156.86ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 32+32+32 | avg_time:975.889ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 32+32+32 | avg_time:492.725ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 32+32+32 | avg_time:285.426ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 32+32+32 | avg_time:136.338ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 32+32+32 | avg_time:874.251ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 32+32+32 | avg_time:457.736ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 32+32+32 | avg_time:300.887ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 32+32+32 | avg_time:160.95ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 32+32+32 | avg_time:854.466ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 32+32+32 | avg_time:455.878ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 32+32+32 | avg_time:246.937ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 32+32+32 | avg_time:128.898ms/sequence | | | | | -| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:69.29ms/image | min_time:58.073ms/image | avg_time:59.0912ms/image | top5:0.973684 | top1:0.861842 | -| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:34.3389ms/image | min_time:27.093ms/image | avg_time:29.1388ms/image | top5:0.973684 | top1:0.875 | -| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:20.813ms/image | min_time:18.3352ms/image | avg_time:18.4675ms/image | top5:0.973684 | top1:0.861842 | -| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:9.94312ms/image | min_time:8.79199ms/image | avg_time:8.85982ms/image | top5:0.973684 | top1:0.875 | -| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:74.1409ms/image | min_time:56.3591ms/image | avg_time:59.8203ms/image | top5:0.973684 | top1:0.861842 | -| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:36.093ms/image | min_time:25.9141ms/image | avg_time:28.1741ms/image | top5:0.973684 | top1:0.875 | -| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:13.7258ms/image | min_time:13.3469ms/image | avg_time:13.4824ms/image | top5:0.973684 | top1:0.861842 | -| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:7.14282ms/image | min_time:6.92798ms/image | avg_time:7.01458ms/image | top5:0.973684 | top1:0.875 | -| ghostnet | onnx | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:74.3708ms/image | min_time:66.72ms/image | avg_time:67.9283ms/image | top5:0.973684 | top1:0.861842 | -| ghostnet | onnx | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:36.603ms/image | min_time:32.616ms/image | avg_time:33.2657ms/image | top5:0.973684 | top1:0.868421 | -| ghostnet | 
onnx | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:27.4438ms/image | min_time:18.6541ms/image | avg_time:18.8378ms/image | top5:0.973684 | top1:0.861842 | -| ghostnet | onnx | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:15.4668ms/image | min_time:9.32104ms/image | avg_time:9.45095ms/image | top5:0.973684 | top1:0.868421 | -| ghostnet | onnx | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:70.2061ms/image | min_time:55.4148ms/image | avg_time:61.4079ms/image | top5:0.973684 | top1:0.861842 | -| ghostnet | onnx | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:37.0811ms/image | min_time:27.947ms/image | avg_time:29.9928ms/image | top5:0.973684 | top1:0.868421 | -| ghostnet | onnx | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:15.6489ms/image | min_time:13.8831ms/image | avg_time:14.0345ms/image | top5:0.973684 | top1:0.861842 | -| ghostnet | onnx | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:8.07397ms/image | min_time:7.29102ms/image | avg_time:7.38781ms/image | top5:0.973684 | top1:0.868421 | -| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:190.538ms/image | min_time:160.101ms/image | avg_time:168.335ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:71.125ms/image | min_time:63.7568ms/image | avg_time:67.3027ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:50.1008ms/image | min_time:40.907ms/image | avg_time:41.3074ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:24.7939ms/image | min_time:19.5669ms/image | avg_time:19.6895ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:174.117ms/image | min_time:151.17ms/image | avg_time:154.917ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:74.842ms/image | min_time:59.8052ms/image | avg_time:62.1738ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:34.542ms/image | min_time:33.5129ms/image | avg_time:33.7169ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:16.0791ms/image | min_time:15.8879ms/image | avg_time:15.9935ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:178.423ms/image | min_time:167.58ms/image | avg_time:169.656ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:77.3501ms/image | min_time:66.8999ms/image | avg_time:68.3253ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:52.7412ms/image | min_time:41.7322ms/image | avg_time:42.0848ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:26.1299ms/image | 
min_time:19.927ms/image | avg_time:20.0323ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:173.724ms/image | min_time:151.815ms/image | avg_time:154.701ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:74.7651ms/image | min_time:60.425ms/image | avg_time:62.6472ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:36.054ms/image | min_time:33.9338ms/image | avg_time:34.2018ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:17.3879ms/image | min_time:16.575ms/image | avg_time:16.7297ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:154.487ms/image | min_time:141.754ms/image | avg_time:145.152ms/image | top5:0.940789 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:64.2239ms/image | min_time:58.8081ms/image | avg_time:59.9808ms/image | top5:0.934211 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:49.458ms/image | min_time:38.323ms/image | avg_time:38.5056ms/image | top5:0.940789 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:23.1702ms/image | min_time:17.5068ms/image | avg_time:17.6611ms/image | top5:0.934211 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:150.025ms/image | min_time:128.345ms/image | avg_time:132.484ms/image | top5:0.940789 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:68.334ms/image | min_time:53.7939ms/image | avg_time:55.3328ms/image | top5:0.934211 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:30.573ms/image | min_time:29.303ms/image | avg_time:29.476ms/image | top5:0.940789 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:14.3711ms/image | min_time:13.9141ms/image | avg_time:14.0406ms/image | top5:0.934211 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:165.334ms/image | min_time:142.623ms/image | avg_time:145.197ms/image | top5:0.940789 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:70.1091ms/image | min_time:63.4839ms/image | avg_time:65.2859ms/image | top5:0.934211 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:52.321ms/image | min_time:39.3108ms/image | avg_time:39.6574ms/image | top5:0.940789 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:24.9519ms/image | min_time:17.696ms/image | avg_time:18.0272ms/image | top5:0.934211 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:154.559ms/image | min_time:121.211ms/image | avg_time:130.884ms/image | top5:0.940789 | top1:0.756579 | -| 
mobilenet_v2 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:77.2429ms/image | min_time:52.5979ms/image | avg_time:57.1493ms/image | top5:0.934211 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:34.0449ms/image | min_time:29.9241ms/image | avg_time:30.6575ms/image | top5:0.940789 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:15.9011ms/image | min_time:14.261ms/image | avg_time:14.4966ms/image | top5:0.934211 | top1:0.756579 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:112.09ms/image | min_time:83.571ms/image | avg_time:93.3668ms/image | top5:0.875 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:53.822ms/image | min_time:35.8259ms/image | avg_time:41.434ms/image | top5:0.881579 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | int8 | cpu | 1 | 1/3/224/224 | max_time:44.7251ms/image | min_time:31.103ms/image | avg_time:35.5491ms/image | top5:0.875 | top1:0.592105 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:31.9202ms/image | min_time:22.28ms/image | avg_time:22.4415ms/image | top5:0.875 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:16.2849ms/image | min_time:11.3989ms/image | avg_time:11.5547ms/image | top5:0.881579 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | int8 | cpu | 1 | 1/3/224/224 | max_time:13.2561ms/image | min_time:9.74902ms/image | avg_time:9.84788ms/image | top5:0.875 | top1:0.592105 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:99.656ms/image | min_time:81.1941ms/image | avg_time:83.3297ms/image | top5:0.875 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:47.0339ms/image | min_time:34.7532ms/image | avg_time:37.4053ms/image | top5:0.881579 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | int8 | cpu | 1 | 1/3/224/224 | max_time:38.6331ms/image | min_time:30.9619ms/image | avg_time:31.3992ms/image | top5:0.875 | top1:0.592105 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:18.9709ms/image | min_time:17.4961ms/image | avg_time:17.6799ms/image | top5:0.875 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:10.2009ms/image | min_time:9.35303ms/image | avg_time:9.4497ms/image | top5:0.881579 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | int8 | cpu | 1 | 1/3/224/224 | max_time:8.61816ms/image | min_time:7.65796ms/image | avg_time:7.70149ms/image | top5:0.875 | top1:0.592105 | -| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:107.665ms/image | min_time:89.8459ms/image | avg_time:91.3227ms/image | top5:0.875 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:52.929ms/image | min_time:41.8279ms/image | avg_time:42.9839ms/image | top5:0.868421 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A55 | int8 | cpu | 1 | 1/3/224/224 | max_time:39.2102ms/image | min_time:35.2351ms/image | 
avg_time:36.3468ms/image | top5:0.868421 | top1:0.664474 | -| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:30.657ms/image | min_time:22.5791ms/image | avg_time:22.897ms/image | top5:0.875 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:13.2539ms/image | min_time:11.6641ms/image | avg_time:12.1377ms/image | top5:0.868421 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A76 | int8 | cpu | 1 | 1/3/224/224 | max_time:12.3049ms/image | min_time:9.78296ms/image | avg_time:10.041ms/image | top5:0.868421 | top1:0.664474 | -| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:98.7239ms/image | min_time:81.147ms/image | avg_time:83.2495ms/image | top5:0.875 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:45.7671ms/image | min_time:33.8918ms/image | avg_time:36.3651ms/image | top5:0.868421 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A55 | int8 | cpu | 1 | 1/3/224/224 | max_time:40.23ms/image | min_time:32.166ms/image | avg_time:33.0669ms/image | top5:0.868421 | top1:0.664474 | -| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:19.312ms/image | min_time:17.6091ms/image | avg_time:17.8038ms/image | top5:0.875 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:10.54ms/image | min_time:9.58398ms/image | avg_time:9.73297ms/image | top5:0.868421 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A76 | int8 | cpu | 1 | 1/3/224/224 | max_time:8.69287ms/image | min_time:7.7019ms/image | avg_time:7.80628ms/image | top5:0.868421 | top1:0.664474 | -| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:39.6121ms/image | min_time:30.645ms/image | avg_time:31.2452ms/image | top5:0.809211 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:20.4729ms/image | min_time:15.2979ms/image | avg_time:15.8074ms/image | top5:0.822368 | top1:0.546053 | -| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:9.58984ms/image | min_time:7.26196ms/image | avg_time:7.37196ms/image | top5:0.809211 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:6.87085ms/image | min_time:4.26001ms/image | avg_time:4.38828ms/image | top5:0.822368 | top1:0.546053 | -| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:36.8108ms/image | min_time:26.8101ms/image | avg_time:28.4539ms/image | top5:0.809211 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:15.5039ms/image | min_time:13.3491ms/image | avg_time:14.5589ms/image | top5:0.822368 | top1:0.546053 | -| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:5.90894ms/image | min_time:5.72192ms/image | avg_time:5.80782ms/image | top5:0.809211 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:3.52197ms/image | min_time:3.30493ms/image | avg_time:3.34958ms/image | top5:0.822368 | top1:0.546053 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | 
cpu | 1 | 1/3/224/224 | max_time:36.0649ms/image | min_time:31.5652ms/image | avg_time:32.6065ms/image | top5:0.809211 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:19.0042ms/image | min_time:16.8479ms/image | avg_time:17.7286ms/image | top5:0.815789 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:15.012ms/image | min_time:7.82007ms/image | avg_time:7.97568ms/image | top5:0.809211 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:6.84717ms/image | min_time:4.59595ms/image | avg_time:4.69553ms/image | top5:0.815789 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:38.3628ms/image | min_time:27.134ms/image | avg_time:29.9821ms/image | top5:0.809211 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:20.6179ms/image | min_time:15.635ms/image | avg_time:16.1516ms/image | top5:0.815789 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:6.75513ms/image | min_time:6.01196ms/image | avg_time:6.08447ms/image | top5:0.809211 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:4.0332ms/image | min_time:3.573ms/image | avg_time:3.6405ms/image | top5:0.815789 | top1:0.539474 | -| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | UNKNOWN | avg_time:6.20508ms/image | | | | | -| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | UNKNOWN | avg_time:3.82397ms/image | | | | | -| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | UNKNOWN | avg_time:6.42603ms/image | | | | | -| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | UNKNOWN | avg_time:3.63501ms/image | | | | | -| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | UNKNOWN | avg_time:5.70605ms/image | | | | | -| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | UNKNOWN | avg_time:3.38989ms/image | | | | | -| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | UNKNOWN | avg_time:5.6731ms/image | | | | | -| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | UNKNOWN | avg_time:3.521ms/image | | | | | -| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:769.38ms/image | min_time:726.458ms/image | avg_time:738.653ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:333.544ms/image | min_time:298.813ms/image | avg_time:311.864ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:194.501ms/image | min_time:185.442ms/image | avg_time:186.798ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:97.8091ms/image | min_time:92.2888ms/image | avg_time:93.0512ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:716.164ms/image | min_time:620.755ms/image | 
avg_time:676.802ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:298.58ms/image | min_time:252.569ms/image | avg_time:274.926ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:153.967ms/image | min_time:152.562ms/image | avg_time:153.01ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:76.0752ms/image | min_time:75.5671ms/image | avg_time:75.844ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:956.05ms/image | min_time:669.802ms/image | avg_time:692.174ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:308.14ms/image | min_time:283.158ms/image | avg_time:287.35ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:157.702ms/image | min_time:154.775ms/image | avg_time:155.15ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:79.03ms/image | min_time:77.3899ms/image | avg_time:77.7673ms/image | top5:0.934211 | top1:0.730263 | -| birealnet18 | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | binary | cpu | 1 | 1/3/224/224 | max_time:103.304ms/image | min_time:86.573ms/image | avg_time:88.4649ms/image | top5:0.782895 | top1:0.460526 | -| birealnet18 | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | binary | cpu | 1 | 1/3/224/224 | max_time:32.9131ms/image | min_time:30.6541ms/image | avg_time:32.1737ms/image | top5:0.782895 | top1:0.460526 | -| birealnet18 | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | binary | cpu | 1 | 1/3/224/224 | max_time:99.9858ms/image | min_time:82.6409ms/image | avg_time:85.7396ms/image | top5:0.782895 | top1:0.460526 | -| birealnet18 | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | binary | cpu | 1 | 1/3/224/224 | max_time:25.5642ms/image | min_time:23.6038ms/image | avg_time:24.7936ms/image | top5:0.782895 | top1:0.460526 | -| birealnet18 | onnx | ubuntu16_04 | llvm | 810 | A55 | binary | cpu | 1 | 1/3/224/224 | max_time:89.7971ms/image | min_time:77.2161ms/image | avg_time:78.3607ms/image | top5:0.782895 | top1:0.473684 | -| birealnet18 | onnx | ubuntu16_04 | llvm | 810 | A76 | binary | cpu | 1 | 1/3/224/224 | max_time:27.375ms/image | min_time:25.1851ms/image | avg_time:25.892ms/image | top5:0.782895 | top1:0.473684 | -| birealnet18 | onnx | ubuntu16_04 | llvm | 990 | A55 | binary | cpu | 1 | 1/3/224/224 | max_time:86.949ms/image | min_time:73.6069ms/image | avg_time:75.5283ms/image | top5:0.782895 | top1:0.473684 | -| birealnet18 | onnx | ubuntu16_04 | llvm | 990 | A76 | binary | cpu | 1 | 1/3/224/224 | max_time:20.3091ms/image | min_time:19.636ms/image | avg_time:19.7468ms/image | top5:0.782895 | top1:0.473684 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 990 | MALI-G76-MP16 | fp16 | gpu | 1 | 1/3/224/224 | max_time:11.2231ms/image | min_time:7.74097ms/image | avg_time:8.5096ms/image | top5:0.894737 | top1:0.644737 | -| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 990 | MALI-G76-MP16 | fp16 | gpu | 1 | 1/3/224/224 | max_time:13.521ms/image | min_time:8.36597ms/image | avg_time:9.10277ms/image | top5:0.940789 | top1:0.756579 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 990 | 
MALI-G76-MP16 | fp16 | gpu | 1 | 1/3/224/224 | max_time:12.2771ms/image | min_time:5.12305ms/image | avg_time:7.50856ms/image | top5:0.671053 | top1:0.342105 | -| squeezenet | caffe | ubuntu16_04 | llvm | 990 | MALI-G76-MP16 | fp16 | gpu | 1 | 1/3/224/224 | max_time:11.2141ms/image | min_time:6.36304ms/image | avg_time:7.38169ms/image | top5:0.875 | top1:0.618421 | -| ghostnet | onnx | ubuntu16_04 | llvm | 990 | MALI-G76-MP16 | fp16 | gpu | 1 | 1/3/224/224 | max_time:15.1619ms/image | min_time:8.94604ms/image | avg_time:11.566ms/image | top5:0.973684 | top1:0.868421 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 810 | MALI-G52-MP6 | fp16 | gpu | 1 | 1/3/224/224 | max_time:16.575ms/image | min_time:11.979ms/image | avg_time:13.0283ms/image | top5:0.894737 | top1:0.644737 | -| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 810 | MALI-G52-MP6 | fp16 | gpu | 1 | 1/3/224/224 | max_time:17.498ms/image | min_time:13.28ms/image | avg_time:14.2388ms/image | top5:0.940789 | top1:0.756579 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 810 | MALI-G52-MP6 | fp16 | gpu | 1 | 1/3/224/224 | max_time:12.8831ms/image | min_time:7.37183ms/image | avg_time:9.25686ms/image | top5:0.671053 | top1:0.342105 | -| squeezenet | caffe | ubuntu16_04 | llvm | 810 | MALI-G52-MP6 | fp16 | gpu | 1 | 1/3/224/224 | max_time:14.5859ms/image | min_time:9.90796ms/image | avg_time:10.5907ms/image | top5:0.875 | top1:0.618421 | -| ghostnet | onnx | ubuntu16_04 | llvm | 810 | MALI-G52-MP6 | fp16 | gpu | 1 | 1/3/224/224 | max_time:15.1641ms/image | min_time:12.0188ms/image | avg_time:13.2528ms/image | top5:0.973684 | top1:0.868421 | +| Model | Framework | Os | Compiler | Kirin Soc Version | Core | Precision | Cpu or Gpu | Thread | Input Size | avg_time | +| ------------ | --------- | ----------- | -------- | ----------------- | ---- | --------- | ---------- | ------ | ----------- | --------------------------- | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 32+32+32 | 16.605225 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 32+32+32 | 9.114014 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | int8 | cpu | 1 | 32+32+32 | 7.108154 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 32+32+32 | 4.708984 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 32+32+32 | 3.208984 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | int8 | cpu | 1 | 32+32+32 | 2.205811 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 32+32+32 | 17.630127 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 32+32+32 | 9.800049 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | int8 | cpu | 1 | 32+32+32 | 7.642090 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 32+32+32 | 4.029053 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 32+32+32 | 2.835938 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | int8 | cpu | 1 | 32+32+32 | 1.875977 | +| tinybert | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 32+32+32 | 16.586914 | +| tinybert | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 32+32+32 | 9.926025 | +| tinybert | caffe | ubuntu16_04 | llvm | 810 | A55 | int8 | cpu | 1 | 32+32+32 | 8.780029 | +| tinybert | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 32+32+32 | 4.817139 | +| tinybert | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 32+32+32 | 3.411133 | +| tinybert | caffe | 
ubuntu16_04 | llvm | 810 | A76 | int8 | cpu | 1 | 32+32+32 | 2.603027 | +| tinybert | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 32+32+32 | 17.857910 | +| tinybert | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 32+32+32 | 10.166016 | +| tinybert | caffe | ubuntu16_04 | llvm | 990 | A55 | int8 | cpu | 1 | 32+32+32 | 9.524902 | +| tinybert | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 32+32+32 | 4.157959 | +| tinybert | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 32+32+32 | 2.849854 | +| tinybert | caffe | ubuntu16_04 | llvm | 990 | A76 | int8 | cpu | 1 | 32+32+32 | 2.161865 | +| nmt | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 32+32+32 | 642.424072 | +| nmt | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 32+32+32 | 356.841064 | +| nmt | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 32+32+32 | 232.143066 | +| nmt | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 32+32+32 | 116.806885 | +| nmt | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 32+32+32 | 710.650879 | +| nmt | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 32+32+32 | 373.492920 | +| nmt | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 32+32+32 | 188.436035 | +| nmt | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 32+32+32 | 95.020996 | +| nmt | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 32+32+32 | 656.012939 | +| nmt | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 32+32+32 | 355.530029 | +| nmt | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 32+32+32 | 233.974854 | +| nmt | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 32+32+32 | 120.966797 | +| nmt | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 32+32+32 | 694.150146 | +| nmt | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 32+32+32 | 370.792969 | +| nmt | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 32+32+32 | 191.239014 | +| nmt | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 32+32+32 | 96.389893 | +| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 59.0912 | +| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 29.1388 | +| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 18.4675 | +| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 8.85982 | +| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 59.8203 | +| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 28.1741 | +| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 13.4824 | +| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 7.01458 | +| ghostnet | onnx | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 67.9283 | +| ghostnet | onnx | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 33.2657 | +| ghostnet | onnx | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 18.8378 | +| ghostnet | onnx | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 9.45095 | +| ghostnet | onnx | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 61.4079 | +| ghostnet | onnx | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 29.9928 | +| ghostnet | onnx | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 14.0345 | +| 
ghostnet | onnx | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 7.38781 | +| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 157.002292 | +| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 58.171620 | +| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 39.825202 | +| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 19.578837 | +| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 162.878805 | +| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 60.626796 | +| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 34.055518 | +| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 16.632785 | +| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 146.566687 | +| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 56.444002 | +| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 41.474450 | +| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 19.824521 | +| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 154.871895 | +| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 58.995204 | +| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 35.317029 | +| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 17.012493 | +| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 128.224905 | +| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 52.524629 | +| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 37.341060 | +| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 17.874094 | +| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 133.358493 | +| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 54.601522 | +| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 30.195150 | +| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 14.732285 | +| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 128.164528 | +| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 52.918264 | +| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 39.283749 | +| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 18.219124 | +| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 135.029091 | +| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 53.469070 | +| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 31.188000 | +| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 14.823740 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 81.188454 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 
| fp16 | cpu | 1 | 1/3/224/224 | 41.068093 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | int8 | cpu | 1 | 1/3/224/224 | 40.926114 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 22.351720 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 12.302643 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | int8 | cpu | 1 | 1/3/224/224 | 12.866001 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 83.545513 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 38.876539 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | int8 | cpu | 1 | 1/3/224/224 | 42.012501 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 18.350247 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 10.457777 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | int8 | cpu | 1 | 1/3/224/224 | 10.791769 | +| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 87.881422 | +| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 41.555842 | +| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A55 | int8 | cpu | 1 | 1/3/224/224 | 42.674965 | +| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 23.652636 | +| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 12.840966 | +| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 83.131129 | +| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 40.010485 | +| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A55 | int8 | cpu | 1 | 1/3/224/224 | 42.526015 | +| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 18.508339 | +| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 10.512915 | +| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A76 | int8 | cpu | 1 | 1/3/224/224 | 10.664495 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 671.152633 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 285.339836 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | int8 | cpu | 1 | 1/3/224/224 | 392.863447 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | int8 | cpu | 1 | 1/3/224/224 | 238.454521 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 704.473506 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 305.267536 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | int8 | cpu | 1 | 1/3/224/224 | 415.956299 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 163.979900 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 84.558313 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | int8 | cpu | 1 | 1/3/224/224 | 107.986291 | +| resnet50 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 659.161729 | +| resnet50 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 291.172460 | +| resnet50 | caffe | ubuntu16_04 | llvm | 810 | A55 | int8 | cpu | 1 | 1/3/224/224 | 364.732873 | +| resnet50 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 
| 192.280511 | +| resnet50 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 104.545466 | +| resnet50 | caffe | ubuntu16_04 | llvm | 810 | A76 | int8 | cpu | 1 | 1/3/224/224 | 106.067036 | +| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 696.839442 | +| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 305.819625 | +| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A55 | int8 | cpu | 1 | 1/3/224/224 | 384.351028 | +| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 162.355430 | +| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 85.032211 | +| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A76 | int8 | cpu | 1 | 1/3/224/224 | 87.458490 | +| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 26.322640 | +| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 13.424403 | +| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 6.937206 | +| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 4.120585 | +| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 27.295058 | +| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 14.080605 | +| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 5.702913 | +| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 3.402296 | +| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 27.603806 | +| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 13.785805 | +| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 7.579590 | +| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 4.360988 | +| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 29.233688 | +| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 14.552715 | +| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 6.072735 | +| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 3.545582 | +| asr_rnnt | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | / | 1394.038574 | +| asr_rnnt | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 647.142578 | +| asr_rnnt | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 491.428833 | +| asr_rnnt | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 253.110596 | +| asr_rnnt | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 1431.238037 | +| asr_rnnt | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 717.116455 | +| asr_rnnt | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 406.063965 | +| asr_rnnt | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 201.946533 | +| asr_rnnt | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | / | 1373.467163 | +| asr_rnnt | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 645.445068 | +| asr_rnnt | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | / | 513.895874 | +| asr_rnnt | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | / | 258.136353 | +| asr_rnnt | caffe | 
ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | / | 1390.146973 | +| asr_rnnt | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 689.771851 | +| asr_rnnt | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | / | 407.601929 | +| asr_rnnt | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 204.034424 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | / | 73.672119 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 38.642822 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 20.154053 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 11.269043 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 80.286865 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 43.187988 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 17.701904 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 9.718018 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | / | 70.837158 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 38.571045 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | / | 20.748047 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | / | 11.482910 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | / | 78.658203 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 41.130127 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | / | 17.866943 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 9.725830 | +| asr_convolution_transformer_joint_net | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | / | 1.088867 | +| asr_convolution_transformer_joint_net | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 0.776123 | +| asr_convolution_transformer_joint_net | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 0.474121 | +| asr_convolution_transformer_joint_net | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 0.277832 | +| asr_convolution_transformer_joint_net | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 1.187012 | +| asr_convolution_transformer_joint_net | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 0.599854 | +| asr_convolution_transformer_joint_net | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 0.366211 | +| asr_convolution_transformer_joint_net | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 0.197021 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | / | 12.655029 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 5.782227 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 3.854980 | +| asr_convolution_transformer_prediction_net | 
caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 2.052002 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 12.645996 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 6.840088 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 3.276123 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 1.738037 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | / | 11.480957 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 6.652100 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | / | 3.854004 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | / | 2.174072 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | / | 12.539062 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 6.432129 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | / | 3.377930 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 1.842041 | +| birealnet18 | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 68.122700 | +| birealnet18 | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 27.371033 | +| birealnet18 | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 72.439991| +| birealnet18 | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 22.747310 | +| birealnet18 | onnx | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 64.079100 | +| birealnet18 | onnx | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | / | 23.102961| +| birealnet18 | onnx | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 68.148505 | +| birealnet18 | onnx | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 19.146468 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 |/ | 10.235107 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 |/ | 5.970947 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 |/ | 2.722900 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 |/ | 1.936035 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 |/ | 10.719971 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 |/ | 6.263916 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 |/ | 2.169189 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 |/ | 1.608887 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 |/ | 9.856934 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 |/ | 6.613037 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 |/ | 2.770996 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 |/ | 1.895996 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 |/ | 9.671875 
| +| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 |/ | 6.834961 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 |/ | 2.219971 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 |/ | 1.579102 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | / | 22.581055 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 11.834961 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | int8 | cpu | 1 | / | 9.236084 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 6.566895 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 4.331055 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | int8 | cpu | 1 | / | 2.608154 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 25.033936 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 12.445801 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | int8 | cpu | 1 | / | 8.791992 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 5.834229 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 4.145020 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | int8 | cpu | 1 | / | 2.248047 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | / | 23.499023 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 11.804932 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 810 | A55 | int8 | cpu | 1 | / | 10.481934 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | / | 6.758057 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | / | 4.487061 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 810 | A76 | int8 | cpu | 1 | / | 2.980957 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | / | 24.321045 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 11.989014 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 990 | A55 | int8 | cpu | 1 | / | 11.645020 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | / | 5.864990 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 3.717041 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 990 | A76 | int8 | cpu | 1 | / | 2.541992 | +| tts_postnet | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | / | 190.645020 | +| tts_postnet | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 76.032959 | +| tts_postnet | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 50.892822 | +| tts_postnet | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 26.031006 | +| tts_postnet | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 196.128906 | +| tts_postnet | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 78.461182 | +| tts_postnet | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 45.419922 | +| tts_postnet | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 23.227051 | +| tts_postnet | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | / | 177.791016 | +| tts_postnet | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 76.929932 | +| tts_postnet | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | / | 50.316162 | +| tts_postnet | caffe | ubuntu16_04 | llvm 
| 810 | A76 | fp16 | cpu | 1 | / | 26.363037 | +| tts_postnet | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | / | 187.880859 | +| tts_postnet | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 77.622070 | +| tts_postnet | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | / | 45.151123 | +| tts_postnet | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 23.705078 | +| tinybert_onnx | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | / | 16.797119 | +| tinybert_onnx | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 9.298096 | +| tinybert_onnx | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 4.791992 | +| tinybert_onnx | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 3.198975 | +| tinybert_onnx | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 18.088135 | +| tinybert_onnx | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 9.281006 | +| tinybert_onnx | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 4.185059 | +| tinybert_onnx | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 2.751953 | +| tinybert_onnx | onnx | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | / | 17.781006 | +| tinybert_onnx | onnx | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 10.011963 | +| tinybert_onnx | onnx | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | / | 5.114014 | +| tinybert_onnx | onnx | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | / | 3.356934 | +| tinybert_onnx | onnx | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | / | 19.456055 | +| tinybert_onnx | onnx | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 10.182861 | +| tinybert_onnx | onnx | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | / | 4.295898 | +| tinybert_onnx | onnx | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 2.818848 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | / | 1678.862793 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 776.404053 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 525.206055 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 263.708008 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 1764.489014 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 813.339844 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 417.965088 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 215.749023 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | / | 1681.593994 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 745.455078 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | / | 493.489014 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | / | 261.107910 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | / | 1768.612061 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 784.126953 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | / | 410.212891 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 211.166016 | +| tts_encoder_decoder | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | 
cpu | 1 | / | 420.667969 | +| tts_encoder_decoder | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 216.756104 | +| tts_encoder_decoder | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 150.831055 | +| tts_encoder_decoder | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 76.963867 | +| tts_encoder_decoder | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 452.342041 | +| tts_encoder_decoder | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 227.301025 | +| tts_encoder_decoder | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 126.700195 | +| tts_encoder_decoder | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 64.525879 | +| tts_encoder_decoder | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | / | 418.912109 | +| tts_encoder_decoder | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 210.114990| +| tts_encoder_decoder | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | / | 150.854004 | +| tts_encoder_decoder | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | / | 76.160156| +| tts_encoder_decoder | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | / | 449.791016 | +| tts_encoder_decoder | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 222.086914 | +| tts_encoder_decoder | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | / | 124.972168 | +| tts_encoder_decoder | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 65.025146 | +| vad | tflite | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | / | 10.746826 | +| vad | tflite | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 8.893066 | +| vad | tflite | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 2.525146 | +| vad | tflite | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 2.598145 | +| vad | tflite | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 10.649902 | +| vad | tflite | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 8.578857 | +| vad | tflite | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 2.264893 | +| vad | tflite | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 2.222168 | +| vad | tflite | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | / | 11.018066 | +| vad | tflite | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 11.024170 | +| vad | tflite | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | / | 3.061035 | +| vad | tflite | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | / | 2.907959 | +| vad | tflite | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | / | 11.968994 | +| vad | tflite | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 11.719971 | +| vad | tflite | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | / | 2.645020 | +| vad | tflite | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 2.597900 | diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 79ef083b..ff974e3d 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -7,6 +7,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](). +## [1.0.0] - 2020-11-20 + +### Added + +- Support fp32 on X86 AVX2 CPU +- Support partial fp32 operator(convolution, lstm) multi-threads parallel +- Support Tensorflow model +- Support more networks(Pointnet, ...) 
+- Support int8 inference for more networks (TinyBert, NMT, ASR)
+- Support time-series data acceleration
+- Support Apple iOS phones
+
 ## [0.3.0] - 2020-06-01
diff --git a/docs/DEVELOPER.md b/docs/DEVELOPER.md
index 8eabde05..3df69f1e 100644
--- a/docs/DEVELOPER.md
+++ b/docs/DEVELOPER.md
@@ -1,613 +1,418 @@
-# Customize Models
+Before reading this developer guide of bolt, we strongly recommend that you first read the [code architecture](ARCHITECTURE.md) document. It gives you a deep understanding of the overall design of bolt, which helps you develop bolt more efficiently.
-
-- ### model-tools customization
+If you want to verify your model quickly, you can use the out-of-the-box C API or Java API to infer your model and check the inference result. If your model runs on time-series data, you can use "Flow" to accelerate the inference. What's more, if you encounter unsupported operators during the conversion or inference of your model, you can add support for them step by step, as described in detail in this document.
-
-  In model-tools, you can define any operator for model conversion.
+[Use out-of-the-box API to infer your model](#use-out-of-the-box-api-to-infer-your-model)
+    [C API](#c-api)
+    [Java API](#java-api)
+[Accelerate time series model by Flow](#accelerate-time-series-model-by-flow)
+[Customize models with unsupported operators step by step](#customize-models-with-unsupported-operators-step-by-step)
+    [model conversion customization](#model-conversion-customization)
+    [tensor computing customization](#tensor-computing-customization)
+    [inference's engine customization](#inference's-engine-customization)
+[How to contribute](#how-to-contribute)
+    [submit issue](#submit-issue)
+    [pull request](#pull-request)
-
- 1. Switch to code of the specific framework (caffe/onnx/tflite) you are working on;
- 2. Judge the op whether it is a weight-op or non-weight-op;
- 3. Define the Operator parameter format;
- 4. Extract the meta information of the op;
- 5. Extract the weight data if the op is a weight-op, otherwise skip this step.
-
+# Use out-of-the-box API to infer your model
-
+## C API
-
- - [ ] caffe converter
+Bolt provides a C API document generated by doxygen to help you use the [C API](../inference/engine/api/c/bolt.h), together with a detailed [example](../inference/examples/c_api/test_api_c.c). You can compile it and link the *libbolt.so* library with your C/C++ project; a hedged usage sketch is given right after this overview.
-
-   - add `pooling` in caffe converter
+## Java API
-
-     (1) Switch to bolt/model-tools/src/caffe, which is the caffe converter for bolt;
+Bolt provides a Java API document generated by doxygen to help you use the [Java API](../inference/engine/api/java), together with a detailed [example](../inference/examples/java_api/test_api_java.java). You can compile bolt and load *libBoltModel.so* through the Java Native Interface (JNI) in your Java project.
-
-     (2) Pooling is non-weight-op.
+# Accelerate time series model by Flow
-
-     (3) Define `pooling` parameter format.
+Flow provides an API document generated by doxygen to help you use the [Flow C++ header](../inference/flow/include), as well as examples ([tinybert](../inference/examples/bert/flow_tinybert.cpp), [faceSR](../inference/examples/facesr/flow_facesr.cpp), [ASR](../inference/examples/automatic_speech_recognition/flow_asr.cpp)).
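Before moving on to the Flow walkthrough below, here is the usage sketch promised in the C API section above. It is a minimal, hedged illustration only: every identifier in it (`CreateModel`, `PrepareModel`, `AllocAllResultHandle`, `RunModel`, `FreeResultHandle`, `DestroyModel`, the `CPU_HIGH_PERFORMANCE`, `FP_16` and `NCHW` enum values) is an assumption modeled on [test_api_c.c](../inference/examples/c_api/test_api_c.c); always check [bolt.h](../inference/engine/api/c/bolt.h) for the exact names and signatures before copying anything.

```c
/* Hedged sketch, not verbatim from bolt.h: the calls below are assumptions
 * based on the c_api example; consult bolt.h for the real signatures. */
#include <stdlib.h>
#include "bolt.h"

int main()
{
    /* Load a converted .bolt model with a CPU affinity setting (assumed enum). */
    ModelHandle model = CreateModel("mobilenet_v1_f16.bolt", CPU_HIGH_PERFORMANCE, NULL);

    /* Describe the single 1/3/224/224 image input used in the benchmark tables. */
    const char *names[1] = {"data"};
    int n[1] = {1}, c[1] = {3}, h[1] = {224}, w[1] = {224};
    DATA_TYPE dts[1] = {FP_16};   /* assumed enum value */
    DATA_FORMAT dfs[1] = {NCHW};  /* assumed enum value */
    PrepareModel(model, 1, names, n, c, h, w, dts, dfs);

    /* Bind input memory and run; the result handle collects the output tensors. */
    ResultHandle result = AllocAllResultHandle(model);
    void *inputs[1] = {malloc(1 * 3 * 224 * 224 * 2 /* fp16 bytes */)};
    RunModel(model, result, 1, names, inputs);

    /* Read outputs here via the result-handle getters shown in test_api_c.c. */

    FreeResultHandle(result);
    DestroyModel(model);
    free(inputs[0]);
    return 0;
}
```

The shape of the flow is the important part: create, prepare once, then run as many times as needed before destroying the handles.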
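Likewise, as a preview of the Flow steps that follow, here is a compact sketch of the enqueue/dequeue pattern they describe. Again hedged: the `Flow`, `Task`, `init`, `enqueue` (spelled *enque* in parts of the text below) and `dequeue` signatures are assumptions based on [flow_facesr.cpp](../inference/examples/facesr/flow_facesr.cpp); treat that example as authoritative.

```cpp
// Hedged sketch of the Flow usage pattern (assumed signatures; see flow_facesr.cpp).
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "flow.h"
#include "task.h"

int main()
{
    std::string graphPath = "flow_facesr.prototxt";  // graph described per flow.proto

    Flow flow;
    // Assumed init(graphs, precision, affinity, CPU worker count, use-GPU flag).
    flow.init({graphPath}, DT_F32, AFFINITY_CPU_HIGH_PERFORMANCE, 1, false);

    // A Task names the graph it targets and carries its named input tensors.
    std::map<std::string, std::shared_ptr<Tensor>> inputs;  // fill input tensors here
    Task task(graphPath, inputs);
    flow.enqueue(task);  // hand the task to the heterogeneous executor

    // A blocking dequeue returns finished tasks in FIFO order; flow.size()
    // would report how many tasks are still unfinished.
    std::vector<Task> finished = flow.dequeue(true);
    return 0;
}
```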
- Add `pooling` definition of bolt in bolt/model-tools/include/model_tools.h
+- ## Generally, it includes the following steps:
-
-   ```
-   // Addition begin
-   typedef struct {
-       U32 kernel_size_h;
-       U32 kernel_size_w;
-       U32 stride_h;
-       U32 stride_w;
-       U32 padding_top;
-       U32 padding_bottom;
-       U32 padding_left;
-       U32 padding_right;
-       RoundMode rm;
-       PoolingMode mode;
-   } PoolingParamSpec;
-   // Addition end
-   ```
+  - ### Use [predefined flow protobuf standard](../inference/flow/src/flow.proto) to define a graph
- Modified "OperatorType" data structure in bolt/uni/include/op_type.h
+    Here is an example graph file for the CV application faceSR: [flow_facesr.prototxt](../inference/examples/facesr/flow_facesr.prototxt). This graph has one input, one input node, one inference node and one output. Input nodes need to be marked as *Input*, and inference nodes need to be marked as *Inference*. Each node can have multiple input or output tensors, and each node type has its own typical fields.
-   ```
-   typedef enum {
-       ...
-       OT_Pooling, // Addition
-       ...
-   } OperatorType
-   ```
+  - ### Add output tensor size infer function for each node, and register function to Flow function manager (optional)
- Modified "inline const char* const* OperatorTypeName()" function in bolt/uni/include/op_type.h
+    Because facesr has no post-processing function, the node's output tensor can directly use the model inference result tensor.
-   ```
-   inline const char* const* OperatorTypeName() {
-       static const char* const names[] = {
-           ...
-           "OT_Pooling", // Addition, please corresponds to the OperatorType
-           ...
-       }
-   }
-   ```
-
-     (4) Extract the meta information of `pooling` operator in caffe.
-
-     Modified the function named "OperatorType convert_caffe_type(std::string inputType)" in bolt/model-tools/caffe/caffe_adaptee.h .
-
-     Add the caffe type mapping code as following:
-
-   ```
-   OperatorType convert_caffe_type(std::string inputType) {
-       // Addition begin
-       if (inputType == "Pooling") {
-           return OT_Pooling;
-       } // Addition end
-       else if (inputType == "Convolution") {
-       ...
-       }
-   }
-   ```
-
-     Extract the meta information of pooling operator from caffe model, add the function "ParameterSpec adapt_Pooling() override" in bolt/model-tools/caffe/caffe_adaptee.h
-
-   ```
-   // Addition begin
-   ParameterSpec adapt_Pooling() override {
-       ParameterSpec curPs;
-       PoolingParamSpec pps;
-       if (layer.pooling_param().has_kernel_w() && layer.pooling_param().has_kernel_h()) {
-           pps.kernel_size_w = layer.pooling_param().kernel_w();
-           pps.kernel_size_h = layer.pooling_param().kernel_h();
-       } else {
-           pps.kernel_size_h = layer.pooling_param().kernel_size();
-           pps.kernel_size_w = pps.kernel_size_h;
-       }
-       if (layer.pooling_param().has_stride_w() && layer.pooling_param().has_stride_h()) {
-           pps.stride_w = layer.pooling_param().stride_w();
-           pps.stride_h = layer.pooling_param().stride_h();
-       } else {
-           pps.stride_h = layer.pooling_param().stride();
-           pps.stride_w = pps.stride_h;
-       }
-       bool global_pooling = layer.pooling_param().global_pooling();
-       if (global_pooling) {
-           pps.kernel_size_h = 0;
-           pps.kernel_size_w = 0;
-           pps.stride_h = 1;
-           pps.stride_w = 1;
-       }else {
-           CHECK_REQUIREMENT(pps.kernel_size_h > 0);
-       }
-       if (layer.pooling_param().has_pad_w() && layer.pooling_param().has_pad_h()) {
-           pps.padding_left = layer.pooling_param().pad_w();
-           pps.padding_right = pps.padding_left;
-           pps.padding_top = layer.pooling_param().pad_h();
-           pps.padding_bottom = pps.padding_top;
-       } else {
-           pps.padding_top = layer.pooling_param().has_pad() ?
layer.pooling_param().pad() : 0; - pps.padding_bottom = pps.padding_top; - pps.padding_left = pps.padding_top; - pps.padding_right = pps.padding_top; - } - - if (layer.pooling_param().has_round_mode() && layer.pooling_param().round_mode() == 1) { - pps.rm = FLOOR; - }else{ - pps.rm = CEIL; - } - switch (layer.pooling_param().pool()) { - case caffe::PoolingParameter_PoolMethod_MAX: { - pps.mode = POOLING_MAX; - break; - } - case caffe::PoolingParameter_PoolMethod_AVE: { - pps.mode = POOLING_MEAN; - break; - } - default: { - std::cerr << "[ERROR] encounter unsupported Pooling method " << layer.pooling_param().pool() << std::endl; - break; - } - } - curPs.pooling_spec = pps; - return curPs; - } - // Addition end - ``` - - (5) Pooling is non-weight op, skip this step. - - - - - [ ] onnx converter - - - add `pooling` in onnx converter - - (1) Switch to bolt/model-tools/src/onnx, which is the onnx converter for bolt; - - (2) Pooling is non-weight-op; - - (3) Define `pooling` parameter format. - - Note: Definition actions same with add pooling in caffe converter step(3) . Please refer the former content. - - (4) Extract the meta information of `pooling` operator in onnx. - - Modified the function named "OperatorType convert_onnx_type(std::string inputType)" in bolt/model-tools/onnx/onnx_adaptee.h . - - Add the onnx type mapping code as following: - - ``` - OperatorType convert_onnx_type(std::string inputType) { - // Addition begin - if (inputType == "AveragePool" || inputType == "MaxPool") { - return OT_Pooling; - } // Addition end - else if (inputType == "Conv") { - ... - } - } - ``` - - Extract the meta information of pooling operator from onnx model, add the function "ParameterSpec adapt_Pooling() override" in bolt/model-tools/onnx/onnx_adaptee.h - - ``` - // Addition begin - ParameterSpec adapt_Pooling() override - { - ParameterSpec curPs; - PoolingParamSpec pps; - std::string autoPad = get_node_str_attribute_by_name(node, "auto_pad"); - std::vector kernelShape = get_node_vector_ints_attribute_by_name(node, "kernel_shape"); - std::vector strides = get_node_vector_ints_attribute_by_name(node, "strides"); - std::vector pads = get_node_vector_ints_attribute_by_name(node, "pads"); - - if (op == "AveragePool" || op == "ReduceMean") { - pps.mode = POOLING_MEAN; - } else { - pps.mode = POOLING_MAX; - } - - if (autoPad == "SAME_UPPER") { - pps.rm = CEIL; - } else { - pps.rm = FLOOR; - } - - if (kernelShape.size() == 2) { - pps.kernel_size_h = kernelShape[0]; - pps.kernel_size_w = kernelShape[1]; - } else { - pps.kernel_size_h = 0; - pps.kernel_size_w = 0; - std::cerr << "[Info] pooling: kernel_size unknown. This could be global pooling." << std::endl; - } - - if (strides.size() == 2) { - pps.stride_h = strides[0]; - pps.stride_w = strides[1]; - } else { - pps.stride_h = 0; - pps.stride_w = 0; - std::cerr << "[Info] pooling: stride unknown. This could be global pooling." << std::endl; - } - - if (pads.size() == 4) { - pps.padding_top = pads[0]; - pps.padding_bottom = pads[2]; - pps.padding_left = pads[1]; - pps.padding_right = pads[3]; - } else { - pps.padding_top = 0; - pps.padding_bottom = 0; - pps.padding_left = 0; - pps.padding_right = 0; - } - curPs.pooling_spec = pps; - return curPs; - } - // Addition end - ``` - - (5) Pooling is non-weight op, skip this step. - - - - - [ ] tflite converter - - - add `pooling` in tflite converter - - (1) Switch to bolt/model-tools/src/onnx, which is the onnx converter for bolt; - - (2) Pooling is non-weight-op; - - (3) Define `pooling` parameter format. 
- - Note: Definition actions same with add pooling in caffe converter step(3) . Please refer the former content.
-
-     (4) Extract the meta information of `pooling` operator in tflite.
-
-     Modified the function named "OperatorType convert_tflite_type(std::string inputType)" in bolt/model-tools/tflite/tflite_adaptee.h .
-
-     Add the tflite type mapping code as following:
-
-   ```
-   OperatorType convert_tflite_type(tflite::BuiltinOperator tfliteType) {
-       // Addition begin
-       if (tfliteType == tflite::BuiltinOperator_MAX_POOL_2D) {
-           return OT_Pooling;
-       } // Addition end
-       else if (tfliteType == tflite::BuiltinOperator_CONCATENATION) {
-       ...
-       }
-   }
-   ```
-
-     Extract the meta information of pooling operator from tflite model, add the function "ParameterSpec adapt_Pooling() override" in bolt/model-tools/tflite/tflite_adaptee.h
-
-   ```
-   // Addition begin
-   ParameterSpec adapt_Pooling() override
-   {
-       ParameterSpec curPs;
-       const auto& tflitePoolOption = ops[curIndex]->builtin_options.AsPool2DOptions();
-       PoolingParamSpec poolingPs;
-       poolingPs.kernel_size_h = tflitePoolOption->filter_height;
-       poolingPs.kernel_size_w = tflitePoolOption->filter_width;
-       poolingPs.stride_h = tflitePoolOption->stride_h;
-       poolingPs.stride_w = tflitePoolOption->stride_w;
-       poolingPs.padding_top = 0;
-       poolingPs.padding_bottom = 0;
-       poolingPs.padding_left = 0;
-       poolingPs.padding_right = 0;
-       poolingPs.rm = CEIL;
-       if (opCode == tflite::BuiltinOperator_MAX_POOL_2D) {
-           poolingPs.mode = POOLING_MAX;
-       } else if (opCode == tflite::BuiltinOperator_AVERAGE_POOL_2D) {
-           poolingPs.mode = POOLING_MEAN;
-       }
-       curPs.pooling_spec = poolingPs;
-       return curPs;
-   }
-   // Addition end
-   ```
-
-     (5) Pooling is non-weight op, skip this step.
+    If you have a post-processing function, you can refer to [flow_tinybert](../inference/examples/bert/flow_tinybert.cpp), which defines the *tinybertInferOutputSize* function and registers it by using the *flowRegisterFunction* API.
+
+  - ### Add input tensor preprocess function for each node, and register function to Flow function manager (optional)
+
+    *(same as output tensor size infer function)*
+
+  - ### Add output tensor postprocess function for each node, and register function to Flow function manager (optional)
+
+    *(same as output tensor size infer function)*
+
+  - ### Define a *Flow* object, and add task
+
+    Declare a *Flow* object, then set the number of CPU cores to use and whether to use the GPU. Use the *Task* format to describe a task, and use the *enque* API to add the task into the Flow heterogeneous executor.
+
+  - ### Get Flow process result
+
+    Use the *dequeue* API to get the results of tasks that have already finished, in FIFO order. You can choose the blocking mode to get the results of all enqueued tasks. The *size* function can be used to query the number of unfinished tasks.
+
+# Customize models with unsupported operators step by step
+
+## model conversion customization
+
+In [model_tools](../model_tools), you can define any operator for model conversion.
+
+1. Switch to code of the specific framework (caffe/onnx/tflite) you are working on;
+2. Judge whether the op is a weight-op or a non-weight-op;
+3. Define the Operator parameter format;
+4. Extract the meta information of the op;
+5. Extract the weight data if the op is a weight-op, otherwise skip this step.
+
+- Example: support `pooling` in caffe converter
+  1. Switch to [model_tools/src/caffe](../model_tools/src/caffe), which is the caffe converter for bolt;
+
+  2. Judgment: pooling is non-weight-op.
+
+  3. Define `pooling` parameter format.
+ + 3.1 Add `pooling` definition of bolt in [model_tools/include/model_tools.h](../common/uni/include/types.h) + + ``` + // Addition ======> + typedef struct { + U32 kernel_h; + U32 kernel_w; + U32 stride_h; + U32 stride_w; + U32 padding_top; + U32 padding_bottom; + U32 padding_left; + U32 padding_right; + RoundMode rm; + PoolingMode mode; + } PoolingParamSpec; + // <====== Addition end + ``` + + 3.2 Modify "OperatorType" data structure in [common/uni/include/op_type.h](../common/uni/include/op_type.h) + + ``` + typedef enum { + ... + OT_Pooling, // Addition + ... + } OperatorType + ``` + + 3.3 Modify "inline const char* const* OperatorTypeName()" function in [common/uni/include/op_type.h](../common/uni/include/op_type.h) + + ``` + inline const char* const* OperatorTypeName() { + static const char* const names[] = { + ... + "OT_Pooling", // Addition, please corresponds to the OperatorType + ... + } + } + ``` + + 3.4 Modify "int get_operator_parameter_size(OperatorType operatorType)" function in [model_tools/src/model_deserialize.cpp](../model_tools/src/model_deserialize.cpp) + + ``` + std::map operatorParameterSizeMap = { + ... + {OT_Pooling, sizeof(PoolingParamSpec)}}; + ``` + + 4. Extract the meta information of `pooling` operator in caffe. + + 4.1 Modify the function named "OperatorType convert_caffe_type(std::string inputType)" in [model_tools/caffe/caffe_adaptee.h](../model_tools/src/caffe/caffe_adaptee.h). + + Add the caffe type mapping code as following: + + ``` + OperatorType convert_caffe_type(std::string inputType) { + // Addition ======> + if (inputType == "Pooling") { + return OT_Pooling; + } // <====== Addition + else if (inputType == "Convolution") { + ... + } + } + ``` + + 4.2 Extract the meta information of pooling operator from caffe model, add the function "ParameterSpec adapt_Pooling() override" in [model_tools/caffe/caffe_adaptee.h](../model_tools/src/caffe/caffe_adaptee.h). + + ``` + // Addition ======> + ParameterSpec adapt_Pooling() override { + ParameterSpec curPs; + PoolingParamSpec pps; + if (layer.pooling_param().has_kernel_w() && layer.pooling_param().has_kernel_h()) { + pps.kernel_w = layer.pooling_param().kernel_w(); + pps.kernel_h = layer.pooling_param().kernel_h(); + } else { + pps.kernel_h = layer.pooling_param().kernel_size(); + pps.kernel_w = pps.kernel_h; + } + if (layer.pooling_param().has_stride_w() && layer.pooling_param().has_stride_h()) { + pps.stride_w = layer.pooling_param().stride_w(); + pps.stride_h = layer.pooling_param().stride_h(); + } else { + pps.stride_h = layer.pooling_param().stride(); + pps.stride_w = pps.stride_h; + } + bool global_pooling = layer.pooling_param().global_pooling(); + if (global_pooling) { + pps.kernel_h = 0; + pps.kernel_w = 0; + pps.stride_h = 1; + pps.stride_w = 1; + }else { + CHECK_REQUIREMENT(pps.kernel_h > 0); + } + if (layer.pooling_param().has_pad_w() && layer.pooling_param().has_pad_h()) { + pps.padding_left = layer.pooling_param().pad_w(); + pps.padding_right = pps.padding_left; + pps.padding_top = layer.pooling_param().pad_h(); + pps.padding_bottom = pps.padding_top; + } else { + pps.padding_top = layer.pooling_param().has_pad() ? 
layer.pooling_param().pad() : 0; + pps.padding_bottom = pps.padding_top; + pps.padding_left = pps.padding_top; + pps.padding_right = pps.padding_top; + } + + if (layer.pooling_param().has_round_mode() && layer.pooling_param().round_mode() == 1) { + pps.rm = FLOOR; + }else{ + pps.rm = CEIL; + } + switch (layer.pooling_param().pool()) { + case caffe::PoolingParameter_PoolMethod_MAX: { + pps.mode = POOLING_MAX; + break; + } + case caffe::PoolingParameter_PoolMethod_AVE: { + pps.mode = POOLING_MEAN; + break; + } + default: { + std::cerr << "[ERROR] encounter unsupported Pooling method " << layer.pooling_param().pool() << std::endl; + break; + } + } + curPs.pooling_spec = pps; + return curPs; + } + // <====== Addition + ``` + + 5. Pooling is non-weight op, skip this step. + +- Example: support `pooling` in onnx converter + + 1. Switch to [model_tools/src/onnx](../model_tools/src/onnx), which is the onnx converter for bolt; + + 2. Judgment: pooling is non-weight-op; + + 3. Define `pooling` parameter format. + + Note: Definition actions same with add pooling in caffe converter step 3 . Please refer the former content. + + 4. Extract the meta information of `pooling` operator in onnx. + + 4.1 Modify the function named "OperatorType convert_onnx_type(std::string inputType)" in [model_tools/onnx/onnx_adaptee.h](../model_tools/src/onnx/onnx_adaptee.h). + + Add the onnx type mapping code as following: + + ``` + OperatorType convert_onnx_type(std::string inputType) { + // Addition ======> + if (inputType == "AveragePool" || inputType == "MaxPool") { + return OT_Pooling; + } // <====== Addition + else if (inputType == "Conv") { + ... + } + } + ``` + + 4.2 Extract the meta information of pooling operator from onnx model, add the function "ParameterSpec adapt_Pooling() override" in [model_tools/onnx/onnx_adaptee.h](../model_tools/src/onnx/onnx_adaptee.h). + + ``` + // Addition ======> + ParameterSpec adapt_Pooling() override + { + ParameterSpec curPs; + PoolingParamSpec pps; + std::string autoPad = get_node_str_attribute_by_name(node, "auto_pad"); + std::vector kernelShape = get_node_vector_ints_attribute_by_name(node, "kernel_shape"); + std::vector strides = get_node_vector_ints_attribute_by_name(node, "strides"); + std::vector pads = get_node_vector_ints_attribute_by_name(node, "pads"); + + if (op == "AveragePool" || op == "ReduceMean") { + pps.mode = POOLING_MEAN; + } else { + pps.mode = POOLING_MAX; + } + if (autoPad == "SAME_UPPER") { + pps.rm = CEIL; + } else { + pps.rm = FLOOR; + } + + if (kernelShape.size() == 2) { + pps.kernel_h = kernelShape[0]; + pps.kernel_w = kernelShape[1]; + } else { + pps.kernel_h = 0; + pps.kernel_w = 0; + std::cerr << "[Info] pooling: kernel_size unknown. This could be global pooling." << std::endl; + } + + if (strides.size() == 2) { + pps.stride_h = strides[0]; + pps.stride_w = strides[1]; + } else { + pps.stride_h = 0; + pps.stride_w = 0; + std::cerr << "[Info] pooling: stride unknown. This could be global pooling." << std::endl; + } + + if (pads.size() == 4) { + pps.padding_top = pads[0]; + pps.padding_bottom = pads[2]; + pps.padding_left = pads[1]; + pps.padding_right = pads[3]; + } else { + pps.padding_top = 0; + pps.padding_bottom = 0; + pps.padding_left = 0; + pps.padding_right = 0; + } + curPs.pooling_spec = pps; + return curPs; + } + // <======= Addition + ``` + + 5. Pooling is non-weight op, skip this step. + +- Example: support `pooling` in tflite converter + + 1. 
Switch to [model_tools/src/tflite](../model_tools/src/tflite), which is the tflite converter for bolt; + + 2. Judgment: pooling is non-weight-op; + + 3. Define `pooling` parameter format; + + Note: Definition actions same with add pooling in caffe converter step(3) . Please refer the former content. + + 4. Extract the meta information of `pooling` operator in tflite. + + 4.1 Modify the function named "OperatorType convert_tflite_type(std::string inputType)" in [model_tools/tflite/tflite_adaptee.h](../model_tools/src/tflite/tflite_adaptee.h). + + Add the tflite type mapping code as following: + + ``` + OperatorType convert_tflite_type(tflite::BuiltinOperator tfliteType) { + // Addition ======> + if (tfliteType == tflite::BuiltinOperator_MAX_POOL_2D) { + return OT_Pooling; + } // Addition + else if (tfliteType == tflite::BuiltinOperator_CONCATENATION) { + ... + } + } + ``` + + 4.2 Extract the meta information of pooling operator from tflite model, add the function "ParameterSpec adapt_Pooling() override" in [model_tools/tflite/tflite_adaptee.h](../model_tools/src/tflite/tflite_adaptee.h). + + ``` + // Addition ======> + ParameterSpec adapt_Pooling() override + { + ParameterSpec curPs; + const auto& tflitePoolOption = ops[curIndex]->builtin_options.AsPool2DOptions(); + PoolingParamSpec poolingPs; + poolingPs.kernel_h = tflitePoolOption->filter_height; + poolingPs.kernel_w = tflitePoolOption->filter_width; + poolingPs.stride_h = tflitePoolOption->stride_h; + poolingPs.stride_w = tflitePoolOption->stride_w; + poolingPs.padding_top = 0; + poolingPs.padding_bottom = 0; + poolingPs.padding_left = 0; + poolingPs.padding_right = 0; + poolingPs.rm = CEIL; + if (opCode == tflite::BuiltinOperator_MAX_POOL_2D) { + poolingPs.mode = POOLING_MAX; + } else if (opCode == tflite::BuiltinOperator_AVERAGE_POOL_2D) { + poolingPs.mode = POOLING_MEAN; + } + curPs.pooling_spec = poolingPs; + return curPs; + } + // <====== Addition + ``` + + 5. Pooling is non-weight op, skip this step. + +## tensor computing customization -- ### tensor_computing customization - - In tensor_computing, you can define any operator for operator computing process. - -- ### inference customization - - In inference, you can define any operator for the inference of your model. - - 1. Add the definition of the specific operator in bolt/inference/include; - 2. If the specific operator implement in CPU is different from its implement in GPU, implement should be divided into CPU and GPU version. If the specific operator implement in CPU is same with its implement in GPU, skip this step. - - - - - [ ] Example: add `pooling` operator in bolt/inference - - 1. 
Create `pooling.hpp` in bolt/inference/include , add the definition of `pooling` operator; - - ``` - // Addition begin - #ifndef _POOLING_H - #define _POOLING_H - #include "operator.hpp" - #include "tensor_computing.h" - - class Pooling: public Operator { - public: - Pooling(PoolingMode mode, U32 ksH, U32 ksW, U32 strideH, U32 strideW, - U32 paddingT, U32 paddingB, U32 paddingL, U32 paddingR, RoundMode rm) - { - this->mode = mode; - this->kernelSizeH = ksH; - this->kernelSizeW = ksW; - this->strideH = strideH; - this->strideW = strideW; - this->paddingT = paddingT; - this->paddingB = paddingB; - this->paddingL = paddingL; - this->paddingR = paddingR; - this->rm = rm; - } - - PoolingDesc create_PoolingDesc(PoolingMode pm, U32 ksH, U32 ksW, U32 strideH, U32 strideW, - U32 paddingT, U32 paddingB, U32 paddingL, U32 paddingR, RoundMode rm) - { - PoolingDesc poolingDesc; - poolingDesc.pm = pm; - poolingDesc.kernelSize_h = ksH; - poolingDesc.kernelSize_w = ksW; - poolingDesc.stride_h = strideH; - poolingDesc.stride_w = strideW; - poolingDesc.padding_top = paddingT; - poolingDesc.padding_bottom = paddingB; - poolingDesc.padding_left = paddingL; - poolingDesc.padding_right = paddingR; - poolingDesc.rm = rm; - return poolingDesc; - } - - void set_kernelSize(U32 globalKernelSizeH, U32 globalKernelSizeW) { - this->kernelSizeH = globalKernelSizeH; - this->kernelSizeW = globalKernelSizeW; - } - - void set_stride(U32 globalStrideH, U32 globalStrideW) { - this->strideH = globalStrideH; - this->strideW = globalStrideW; - } - - virtual void run() = 0; - virtual EE infer_output_tensors_size(Vec, Vec*) = 0; - #ifdef _USE_MALI - virtual EE infer_output_tensors_size(Vec, Vec*, Vec*, Vec*){return NOT_SUPPORTED;} - #endif - - protected: - PoolingMode mode; - RoundMode rm; - U32 kernelSizeH; - U32 kernelSizeW; - U32 strideH; - U32 strideW; - U32 paddingT; - U32 paddingB; - U32 paddingL; - U32 paddingR; - }; - - #endif //_POOLING_H - // Addition end - ``` - - 2. `pooling` operator implement in CPU is different from its implement in GPU. 
So `pooling` implement should be two version: CPU and GPU - - Create `pooling_cpu.hpp` and add `pooling` CPU implement in bolt/inference/include/cpu : - - ``` - #ifndef _POOLING_CPU_H - #define _POOLING_CPU_H - #include - #include "operator.hpp" - #include "tensor_computing.h" - #include "tensor_desc.h" - #include "model_tools.h" - #include "pooling.hpp" - - class PoolingCPU: public Pooling { - public: - - /** - * @param mode - * @param ks - * @param stride - * @param padding - * @param name - */ - PoolingCPU(PoolingMode mode, U32 ksH, U32 ksW, U32 strideH, U32 strideW, U32 paddingT, U32 paddingB, U32 paddingL, U32 paddingR, RoundMode rm): - Pooling(mode, ksH, ksW, strideH, strideW, paddingT, paddingB, paddingL, paddingR, rm){} - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - PoolingDesc poolingDesc = Pooling::create_PoolingDesc(this->mode, this->kernelSizeH, this->kernelSizeW, this->strideH, this->strideW, - this->paddingT, this->paddingB, this->paddingL, this->paddingR, this->rm); - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - F16 scales[2]; - if (DT_I8 == inputDesc.dt) { - scales[0] = inputTensor.get_scale(); - } - CHECK_STATUS(pooling(inputDesc, inputTensor.get_val(), - poolingDesc, scales, - outputDesc, outputTensor.get_val(), - this->schedule)); - if (DT_I8 == inputDesc.dt) { - outputTensor.set_scale(scales[1]); - } - - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - - virtual EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - auto inDim = inDims[0]; - DataType dt; - DataFormat df; - U32 width ; - U32 height; - U32 numChannels; - U32 num; - CHECK_STATUS(tensor4dGet(inDim, &dt, &df, &num, &numChannels, &height, &width)); - - TensorDesc inputDesc = tensor4df(dt, df, num, numChannels, height, width); - if (this->kernelSizeH == 0 && this->kernelSizeW == 0) { - Pooling::set_kernelSize(height, width); - Pooling::set_stride(1, 1); - } - PoolingDesc poolingDesc = Pooling::create_PoolingDesc(this->mode, this->kernelSizeH, this->kernelSizeW, this->strideH, this->strideW, - this->paddingT, this->paddingB, this->paddingL, this->paddingR, this->rm); - CHECK_STATUS(pooling_infer_output_size(inputDesc, poolingDesc, &((*outDims)[0]), this->schedule)); - return SUCCESS; - } - - }; - - #endif //_POOLINGCPU_H - ``` - - Create `pooling_ocl.hpp` and add `pooling` GPU implement in bolt/inference/include/ocl : - - ``` - #ifndef _POOLING_OCL_H - #define _POOLING_OCL_H - #include - #include "operator.hpp" - #include "tensor_computing.h" - #include "tensor_desc.h" - #include "model_tools.h" - #include "pooling.hpp" - - class PoolingOCL: public Pooling { - public: - - /** - * @param mode - * @param ks - * @param stride - * @param padding - * @param name - */ - PoolingOCL(PoolingMode mode, U32 ksH, U32 ksW, U32 strideH, U32 strideW, U32 paddingT, U32 paddingB, U32 paddingL, U32 paddingR, RoundMode rm): - Pooling(mode, ksH, ksW, strideH, strideW, paddingT, paddingB, paddingL, paddingR, rm){} - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - PoolingDesc poolingDesc = Pooling::create_PoolingDesc(this->mode, this->kernelSizeH, this->kernelSizeW, this->strideH, this->strideW, - this->paddingT, this->paddingB, this->paddingL, this->paddingR, this->rm); - Tensor outputTensor = this->outputTensors[0]; - TensorDesc 
outputDesc = outputTensor.get_desc(); - F16 scales[2]; - if (DT_I8 == inputDesc.dt) { - scales[0] = inputTensor.get_scale(); - } - CHECK_STATUS(pooling(inputDesc, inputTensor.get_val(), - poolingDesc, scales, - outputDesc, outputTensor.get_val(), - this->schedule, &this->oclExtInfo)); - if (DT_I8 == inputDesc.dt) { - outputTensor.set_scale(scales[1]); - } - - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - auto inDim = inDims[0]; - DataType dt; - DataFormat df; - U32 width ; - U32 height; - U32 numChannels; - U32 num; - CHECK_STATUS(tensor4dGet(inDim, &dt, &df, &num, &numChannels, &height, &width)); - - TensorDesc inputDesc = tensor4df(dt, df, num, numChannels, height, width); - if (this->kernelSizeH == 0 && this->kernelSizeW == 0) { - Pooling::set_kernelSize(height, width); - Pooling::set_stride(1, 1); - } - PoolingDesc poolingDesc = Pooling::create_PoolingDesc(this->mode, this->kernelSizeH, this->kernelSizeW, this->strideH, this->strideW, - this->paddingT, this->paddingB, this->paddingL, this->paddingR, this->rm); - CHECK_STATUS(pooling_infer_output_size(inputDesc, poolingDesc, &((*outDims)[0]), this->schedule)); - return SUCCESS; - } - - virtual EE infer_output_tensors_size(Vec inDims, Vec* outDims, Vec* gclmemInputDesc, Vec* gclmemOutputDesc) override - { - auto inDim = inDims[0]; - DataType dt; - DataFormat df; - U32 width ; - U32 height; - U32 numChannels; - U32 num; - CHECK_STATUS(tensor4dGet(inDim, &dt, &df, &num, &numChannels, &height, &width)); - - this->oclExtInfo.maliInfo.gclmemInputDesc = &((*gclmemInputDesc)[0]); - this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]); - TensorDesc inputDesc = tensor4df(dt, df, num, numChannels, height, width); - if (this->kernelSizeH == 0 && this->kernelSizeW == 0) { - Pooling::set_kernelSize(height, width); - Pooling::set_stride(1, 1); - } - - PoolingDesc poolingDesc = Pooling::create_PoolingDesc(this->mode, this->kernelSizeH, this->kernelSizeW, this->strideH, this->strideW, - this->paddingT, this->paddingB, this->paddingL, this->paddingR, this->rm); - CHECK_STATUS(pooling_infer_output_size(inputDesc, poolingDesc, &((*outDims)[0]), this->schedule, &this->oclExtInfo)); - return SUCCESS; - } - - - private: - }; - - #endif //_POOLING_OCL_H - ``` +In [tensor](../compute/tensor), you can define any operator for operator computing process. + +1. Create a new operator file in [compute/tensor/src](../compute/tensor/src); +2. The implementation of the operator is related to the backends(x86 CPU, ARM CPU, GPU), for a specific backend, you need to add the corresponding operator implementation to the specific folder in [compute/tensor/src](../compute/tensor/src). + +- Example: add `pooling` operator in [tensor](../compute/tensor) + + 1. Create `pooling.cpp` in [compute/tensor/src](../compute/tensor/src), the complete implementation refers to [compute/tensor/src/pooling.cpp](../compute/tensor/src/pooling.cpp) + + 2. For ARM CPU, create `pooling.cpp` in [compute/tensor/src/arm/pooling.cpp](../compute/tensor/src/arm/pooling.cpp), and dispatch to implementations of different data type(bnn/fp16/fp32/int8). + + 3. 
For ARM GPU, create `pooling.cpp` in [compute/tensor/src/gpu/mali/pooling.cpp](../compute/tensor/src/gpu/mali/pooling.cpp); only fp16 is supported now, see [compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.cpp](../compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.cpp). Put your cl file in [compute/tensor/src/gpu/mali/cl/pooling_max.cpp](../compute/tensor/src/gpu/mali/cl/pooling_max.cpp); the cl file name must be the same as the kernel name. If your kernel has compile options, create a .sh file in [common/gcl/tools/kernel_lib_compile/sh/compile](../common/gcl/tools/kernel_lib_compile/sh/compile); the sh file name must also be the same as the kernel name.
+
+## inference's engine customization
+
+In [engine](../inference/engine), you can define any operator for the inference of your model.
+
+1. Add the definition of the specific operator in [inference/engine/include](../inference/engine/include);
+2. If the operator's CPU implementation differs from its GPU implementation, the implementation should be split into a CPU version and a GPU version. If the CPU and GPU implementations are the same, skip this step.
+
+- Example: add `pooling` operator in [inference/engine](../inference/engine)
+
+  1. Create `pooling.hpp` in [inference/engine/include](../inference/engine/include) and add the definition of the `pooling` operator; the complete implementation is in [inference/engine/include/pooling.hpp](../inference/engine/include/pooling.hpp)
+
+  2. The `pooling` operator's CPU implementation is different from its GPU implementation, so `pooling` needs two versions: CPU and GPU
+
+     (1) Create `pooling_cpu.hpp` and add the `pooling` CPU implementation in [inference/engine/include/cpu](../inference/engine/include/cpu); the complete implementation is in [inference/engine/include/cpu/pooling_cpu.hpp](../inference/engine/include/cpu/pooling_cpu.hpp)
+
+     (2) Create `pooling_ocl.hpp` and add the `pooling` GPU implementation in [inference/engine/include/ocl](../inference/engine/include/ocl); the complete implementation is in [inference/engine/include/ocl/pooling_ocl.hpp](../inference/engine/include/ocl/pooling_ocl.hpp)
 # How to contribute
-
-- ### submit issue
+## submit issue
-
-  - [ ] question
+- question
-
-    Submit any question you have encountered when you use Bolt. You can give feedback to us through committing issues. Refer to https://github.com/huawei-noah/bolt/issues, create your new issue and submit it. The issue can be a bug in Bolt, a suggestion for Bolt, or anything you don't understand about Bolt.
+  Submit any question you have encountered when using Bolt. You can give feedback to us through committing issues. Refer to https://github.com/huawei-noah/bolt/issues, create your new issue and submit it. The issue can be a bug in Bolt, a suggestion for Bolt, or anything you don't understand about Bolt.
-
+- feature request
-
-  - [ ] feature request
+  Submit any feature that you want but that has not been implemented in Bolt. We have created a [special issue](https://github.com/huawei-noah/bolt/issues/5) and you can leave a comment under this issue. We will seriously consider the needs of all users and continue to enrich the functions of Bolt.
-
-    Submit any feature that you want but it has not been implemented in Bolt. We have created a [special issue](https://github.com/huawei-noah/bolt/issues/5) and you can leave a commit under this issue . We will seriously consider the needs of all users and continue to enrich the functions of Bolt.
+## pull request

-- ### pull request
+- add a license

-  - [ ] add a license
-
-    Add the license at the head of your source codes indicating your codes will be open to all.
-
-  - [ ] provide an executable unit test
-
-    Fork [Bolt](https://github.com/huawei-noah/bolt) on your github account. Modify your code and make sure your code pass all testing cases. Commit the code and initiate a pull request on github.
-
-
-
+  Add the license at the head of your source files, indicating your code will be open to all.
+
+- provide an executable unit test
+
+  Fork [Bolt](https://github.com/huawei-noah/bolt) on your github account. Modify your code and make sure your code passes all testing cases. Commit the code and initiate a pull request on github.
\ No newline at end of file
diff --git a/docs/FAQ.md b/docs/FAQ.md
new file mode 100644
index 00000000..df27e835
--- /dev/null
+++ b/docs/FAQ.md
@@ -0,0 +1,25 @@
+# FAQ on Bolt
+
+1. Why does configuring bolt.cmake not take effect?
+
+   The [install.sh](install.sh) serves as an example of compilation setup, and it overwrites some settings in [bolt.cmake](common/cmakes/bolt.cmake). Please check install.sh first.
+
+2. More details about dependency libraries for cross-compilation?
+
+   The major dependency is Protobuf. The protoc executable should match your build platform, but the protobuf library should be the ARM version.
+
+3. Restrictions for 1-bit BNN?
+
+   For BNN convolution layers, the number of input channels must be divisible by 32, and the output channels must be divisible by 16.
+
+4. Restrictions on quantization (int8)?
+
+   For the time being, Bolt only supports post-training int8 quantization. The quantization method is symmetrical for both activation and weight. We have added a calibration tool for image CNN pipelines. Please feel free to report cases of usage failure.
+
+5. Requirements for float16 and int8?
+
+   Only ARMv8.2 CPUs support float16 and int8 dotprod instructions.
+
+6. Restrictions for ARM Mali GPU?
+
+   Only *arm_llvm* compilation supports ARM Mali computing.
\ No newline at end of file
diff --git a/docs/INSTALL.md b/docs/INSTALL.md
index 6b27b0cf..4842bedc 100644
--- a/docs/INSTALL.md
+++ b/docs/INSTALL.md
@@ -1,172 +1,249 @@
-# Prerequisites
-
-- CMake
+This document will help you compile and install bolt on your server. Generally, you only need to be concerned with two parts. The first part, how to build bolt, is described in the "Download and Build Bolt" section. For the second part, if you fail to build bolt, check the "Prerequisites" section, which lists what bolt requires.

-  We use [cmake v3.15.1](https://cmake.org/files/v3.15/cmake-3.15.1-Linux-x86_64.tar.gz) to build Bolt. After installing the cmake, you need to set shell environment **PATH** to find it. You can use this simple test to confirm you have installed it successfully.
-
-  ```shell
-  cmake -version
-  ```
+[Download and Build Bolt](#download-and-build-bolt)
+    [Options](#options)
+    [Environment variables](#environment-variables)
+[Prerequisites](#prerequisites)
+    [Compilation Tools](#compilation-tools)
+    [Android Tools](#android-tools)
+    [Other Dependency Libraries](#other-dependency-libraries)
+    [Optional Software](#optional-software)

-- GNU make
-
-  We use [GNU make v3.81](http://ftp.gnu.org/gnu/make/make-3.81.tar.gz) to build Bolt. After installing the make, you also need to set shell environment **PATH**.
Simple test:
-
-  ```shell
-  make -version
-  ```
-
-- Cross compiler
-
-  If you plan to directly compile Bolt on ARM platform and run on ARM, you can use gcc and skip this section.
-
-  NDK compiler uses Android NDK toolchains to build Bolt for Java APIs required by Android applications and ARM MALI GPU Bolt. GNU compiler uses gcc to build Bolt for simple ARM CPU tests. Please choose according to your scenario.
-
-  - Android NDK compiler
-
-    We use Android NDK [android-ndk-r20](https://dl.google.com/android/repository/android-ndk-r20b-linux-x86_64.zip?hl=zh-cn) to build Bolt. After installing the Android NDK, you need to set shell environment **PATH** to find *aarch64-linux-android21-clang++*. Simple test:
-
-    ```shell
-    aarch64-linux-android21-clang++ --version
-    ```
-
-  - GNU compiler
-
-    We use GNU compiler [gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu](https://developer.arm.com/-/media/Files/downloads/gnu-a/8.3-2019.03/binrel/gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz?revision=2e88a73f-d233-4f96-b1f4-d8b36e9bb0b9&la=en&hash=167687FADA00B73D20EED2A67D0939A197504ACD) to build Bolt. You need to set shell environment **PATH** to find *aarch64-linux-gnu-g++*. Simple test:
-
-    ```shell
-    aarch64-linux-android21-clang++ -version
-    ```
-
-- ADB
-
-  We use [ADB](https://developer.android.com/studio/command-line/adb.html) tool to transfer the executables to android mobile phones and run the program. You also need to set shell environment **PATH**. Simple test:
-
-  ```shell
-  # this will list all available android devices
-  adb devives
-  ```
-
-- Optional
-  - Java SDK
-
-    If you want to compile Java programs, you need to download and install [Java SE SDK](https://www.oracle.com/java/technologies/oracle-java-archive-downloads.html). After installing the SDK, you need to set shell environment **PATH** to find it. Simple test:
-    ```shell
-    java --version
-    ```
-
-  - Android dx
-
-    If you want to directly run *jar* file on Android device, you can use [Android dx tool](https://developer.android.com/studio/releases/build-tools). We currently use Android *v28.0.3* build tools. After installing the *dx* tool, you also need to set shell environment **PATH**. Simple test:
-    ```shell
-    dx --version
-    ```
-
-- Third party library
-
-  We provide a simple [install shell script](../third_party/install.sh) to install third party libraries(*protoc, protobuf, flatbuffers, tensorflow-lite, jpeg, ARM GPU OpenCL*) to the [third_party](third_party) directory and generate a shell script to set up compilation environment. You can choose between LLVM and GCC. Here is an example of installation for LLVM.
-
-  ```shell
-  ./third_party/install.sh -c llvm -t 33
-  ```

 # Download and Build Bolt

-We provide a simple shell script [install.sh](../install.sh) to build and install the Bolt library, and you can modify it according to your scenario and environment. Please refer to the options section of [bolt.cmake](../bolt.cmake) and configure accordingly. Here we give an example of building Bolt with LLVM.
+A simple shell script [install.sh](../install.sh) is provided to build and install the Bolt library; you should modify it according to your scenario and environment. Set all the options correctly in [bolt.cmake](../common/cmakes/bolt.cmake). Check the help message for more useful information.
+
+NOTE: Some build options set by default in [install.sh](../install.sh) overwrite the settings in [bolt.cmake](../common/cmakes/bolt.cmake). You should check these two files meticulously before installation.
-NOTE: Some build options are turned on or off by default in the given install.sh, which overwrites the settings in bolt.cmake. Be sure to check install.sh first. +Here are the examples of installation on different platforms(*arm_gnu*, *arm_llvm*, *arm_ndkv7* and *x86_gnu*). ```shell git clone https://github.com/huawei-noah/bolt.git cd bolt -./install.sh -c llvm -t 33 + +# build for ARM V8+ GNU CPU platform +./install.sh -c arm_gnu -t 33 + +# build for Android ARM V8+ CPU platform +./install.sh -c arm_llvm -t 33 -g OFF + +# build for Android ARM V8+ CPU + MALI GPU platform +./install.sh -c arm_llvm -t 33 -g ON + +# build for X86 GNU CPU platform +./install.sh -c x86_gnu -t 33 + +# build for Android ARM V7 CPU platform +./install.sh -c arm_ndkv7 -t 33 ``` -We will install Bolt to *install_llvm* directory, you will find these subdirectories in it. +We will install Bolt to *install_* directory, you will find these subdirectories in it. + +- include + - [C API](inference/engine/api/c) header file + - [Java API](inference/engine/api/java) class file +- lib + - libBoltModel.so: build for Java application + - libbolt.so: build for C/C++ application + - libflow.so: flow sub project library + - libinference.so: inference sub project library + - libtensor.so: tensor computing sub project library + - libimage.so: image sub project library + - libblas_enhance.so: blas_enhance sub project library + - libmodel_tools.so: model_tools sub project library + - libuni.so: uni sub project library +- tools + - *X2bolt* for generally converting deep learning(caffe/onnx/tflite) model to bolt model + - *tensorflow2caffe* for converting tensorflow model to caffe model + - *pytorch2caffe* for converting pytorch model to caffe model + - *tensor_computing_library_search* for performance tuning of the operator library -- kits +- examples + - *benchmark* for measuring any model(.bolt) inference performance - *tinybert* for intention identification - - *nmt* for machine translation - - - *classification* for computer vision classification task - + - *classification* for imagenet classification task - *asr_rnnt* for automatic speech recognition task RNNT model - - *asr_convolution_transformer* for automatic speech recognition task Convolution+Transformer model - - *tts* for text to speech - - - *super_resolution* for super resolution task - - - *hdr* for high dynamic range task - -- include - - C API - - Java API - -- lib: all static and shared library -- tools - - *caffe2bolt* for converting caffe model to bolt model - - - *onnx2bolt* for converting onnx model to bolt model - - - *tflite2bolt* for converting tflite model to bolt model - - - *tensorflow2caffe* for converting tensorflow model to caffe model +- docs + - API/html: doxygen html document for C/Java/Flow API - - *pytorch2caffe* for converting pytorch model to caffe model - - - *tensor_computing_library_search* for performance tuning of the operator library - -If you want to build operator and API tests, please turn on the *BUILD_TEST* option and rebuild Bolt. These programs will be installed to *tests/bin* directory. +If you want to build [operator uni tests](compute/tensor/tests) and [C](inference/examples/c_api/test_api.c)/[Java](inference/examples/java_api/test_api_java.java)/Flow API tests, please turn on the *BUILD_TEST* option and rebuild Bolt. These executables will be installed to ***install_/tests*** directory. ## Options -Here we list all options in [bolt.cmake](../bolt.cmake). +Here are all options in [bolt.cmake](../common/cmakes/bolt.cmake). 
| options | default | note |
| --------------------- | ------- | --------------------------------------------- |
-| USE_CROSS_COMPILE | OFF | use cross compile or not |
+| USE_CROSS_COMPILE | ON | use cross compile or not |
| USE_GNU_GCC | OFF | use GNU gcc compiler or not |
| USE_LLVM_CLANG | OFF | use LLVM clang compiler or not |
-| USE_DEBUG | OFF | use debug information or not |
+| USE_IOS_CLANG | OFF | use ios compiler or not |
| USE_DYNAMIC_LIBRARY | OFF | use dynamic library or not |
+| USE_MINSIZEREL | OFF | use cmake library storage size optimization |
+| USE_ANDROID_LOG | OFF | use Android log or not |
+| USE_DEBUG | OFF | use debug information or not |
+| USE_PROFILE | OFF | print each layer performance information or not |
+| USE_PROFILE_STATISTICS | OFF | print performance statistics information or not |
+| USE_THREAD_SAFE | OFF | use thread safe function or not |
| USE_CAFFE | ON | use caffe model as input or not |
| USE_ONNX | ON | use onnx model as input or not |
| USE_TFLITE | ON | use tflite model as input or not |
-| USE_NEON | ON | use ARM NEON instruction or not |
+| USE_TENSORFLOW | ON | use tensorflow model as input or not |
+| USE_GENERAL | ON | use serial CPU code for debug or not |
+| USE_X86 | OFF | use X86 AVX2 instruction or not |
+| USE_NEON | OFF | use ARM NEON instruction or not |
+| USE_ARMV7 | OFF | use ARMv7 CPU or not |
+| USE_ARMV8 | ON | use ARMv8 CPU or not |
+| USE_MALI | ON | use MALI GPU for parallel or not |
| USE_FP32 | OFF | use FP32 implementation or not |
| USE_FP16 | ON | use FP16 implementation or not |
| USE_F16_MIX_PRECISION | ON | use ARM NEON mixed-precision (F16/F32) or not |
| USE_INT8 | ON | use INT8 implementation or not |
-| BUILD_TEST | OFF | build unit test or not |
-| USE_MALI | ON | use MALI GPU for parallel or not |
-| USE_ARMV7 | OFF | use ARMv7 CPU or not |
-| USE_ARMV8 | ON | use ARMv8 CPU or not |
-| USE_GENERAL | ON | use serial CPU code for debug or not |
+| USE_OPENMP | OFF | use OpenMP to run operators multi-threaded or not; currently only supports some float32 operators |
+| USE_LIBRARY_TUNING | ON | use algorithm tuning or not |
+| USE_FLOW | ON | use flow or not |
+| USE_TEST | OFF | build unit test or not |

 ## Environment variables

-We reserve some shell environment variable for Bolt.
+Some Linux shell environment variables are reserved for Bolt.

-- *Bolt_ROOT*: Bolt project home directory, set by user or Bolt.
-- *BOLT_MEMORY_REUSE_OPTIMIZATION*: whether to use memory reuse optimization(default is ON), you can set it to *OFF* to disable memory reuse optimization.
+- *BOLT_ROOT*: Bolt project home directory, set by user or Bolt.
+- *BOLT_MEMORY_REUSE_OPTIMIZATION*: whether to use memory reuse optimization. The default value is ON and you can set it to *OFF* to disable memory reuse optimization.
- *Bolt_TensorComputing_LibraryAlgoritmMap*: a path on the target device set by user to save tensor_computing library performance tuning result.
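+For instance (a minimal sketch, assuming a Linux shell on the build host or an adb shell on the target device; the path is just an example):
+
+```
+# disable memory reuse optimization for debugging
+export BOLT_MEMORY_REUSE_OPTIMIZATION=OFF
+# a writable path for saving tensor_computing tuning results
+export Bolt_TensorComputing_LibraryAlgoritmMap=/data/local/tmp/bolt/algo_map
+```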
-## How to build Bolt MALI GPU
-For compile bolt MALI GPU,
-- Ensure your ADB works well, and connected with your target device with mali gpu.
-  NOTE: Bolt need to precompile all GPU kernels to bins on your target device, and they will be packaged to libkernelbin.a/.so
-  If you change your target device, these kernel bins may be not adaptive, you should recompile them.
-  Bolt support mult devices precompiling for GPU Kernels, you can connect all the target devices you need with ADB, and the kernel bins for them will be built and packged together.
-- LLVM Compiler must be used and version of andriod NDK is more than r19.
-- OpenCL headfiles and lib are provided in "/cheetah/third_party/llvm/opencl", if the OpenCL lib we provided are not matching with your target device, you can replace it with the Opencl lib on your device.
-- When you compile bolt MALI GPU, please set these options ON:
-  USE_CROSS_COMPILE
-  USE_LLVM_CLANG
-  USE_FP16
-  USE_MALI
-  They can be set in install.sh, options of compiler_arch llvm.
-- After open these options, run "./install.sh -c llvm -t 33" to build bolt MALI GPU

+# Prerequisites
+
+## Compilation Tools
+
+- ### Cmake
+
+  <1> Installation
+
+  ```
+  sudo apt-get install cmake
+  ```
+
+  <2> Verification
+
+  ```
+  cmake -version
+  ```
+
+  If cmake has been installed successfully, you can see the cmake version number (for example: 3.15.1). If you fail to see the version number or the number you see is lower than 3.2.0, please reinstall cmake on your server. You can refer to the [cmake official docs](https://cmake.org/install/) for the installation of cmake, and set the environment **PATH** to find it.
+
+- ### GNU make
+
+  <1> Installation
+
+  ```
+  sudo apt-get install make
+  ```
+
+  <2> Verification
+
+  ```
+  make -version
+  ```
+
+  If GNU make has been installed successfully, you can see the GNU make version number (for example: 4.1). If you fail to see the version number or the number you see is lower than 3.81, please reinstall GNU make on your server. You can refer to the [GNU make installation example](https://stackoverflow.com/questions/35568016/install-make-3-75-on-ubuntu-15-10) for the installation of GNU make, and set the environment **PATH** to find it.
+
+- ### Cross compiler
+
+  The NDK compiler uses the Android NDK toolchain to build Bolt for the Java APIs required by Android applications and for ARM MALI GPU Bolt. The ARM GNU compiler uses gcc to build Bolt for simple ARM CPU tests. Choose **one of them** according to your scenario; see the quick check after this section.
+
+  - Android NDK compiler
+
+    <1> Installation
+
+    Refer to the [NDK installation example](https://askubuntu.com/questions/837847/how-to-install-android-ndk) to install [android-ndk-r20](https://dl.google.com/android/repository/android-ndk-r20b-linux-x86_64.zip?hl=zh-cn) and set the environment **PATH** to find *aarch64-linux-android21-clang++*.
+
+    <2> Verification
+
+    ```
+    aarch64-linux-android21-clang++ --version
+    ```
+
+    If the Android NDK has been installed successfully, you can see InstalledDir, which represents the storage path of the NDK compilers. If you fail to see InstalledDir, please reinstall the NDK and set the environment **PATH** to find it.
+
+  - ARM GNU compiler
+
+    <1> Installation
+
+    ```
+    sudo apt-get install gcc-arm-linux-gnueabi
+    ```
+
+    <2> Verification
+
+    Install [gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu](https://developer.arm.com/-/media/Files/downloads/gnu-a/8.3-2019.03/binrel/gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz?revision=2e88a73f-d233-4f96-b1f4-d8b36e9bb0b9&la=en&hash=167687FADA00B73D20EED2A67D0939A197504ACD) and set the Linux shell environment **PATH** to find *aarch64-linux-gnu-g++*. Simple test:
+
+    ```
+    aarch64-linux-gnu-g++ --version
+    ```
+
+    If the GNU compiler has been installed successfully, you can see the GNU compiler version number (for example: 8.3.0). If you fail to see the version number or the number you see is lower than 8.3.0, please reinstall the ARM GNU compiler on your server. You can refer to the [GNU compiler installation example](https://askubuntu.com/questions/472219/how-to-install-gcc-4-7-arm-linux-gnueabihf-on-ubuntu-12-04) for the installation of the GNU compiler, and set the environment **PATH** to find it.
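+To quickly see which of the two cross compilers is already visible on your **PATH** (a convenience check, not an official setup step):
+
+```
+which aarch64-linux-android21-clang++ aarch64-linux-gnu-g++
+```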
+## Android Tools
+
+- ### ADB
+
+  <1> Installation
+
+  Refer to the [ADB installation example](https://unix.stackexchange.com/questions/378041/how-to-install-adb-on-ubuntu-from-download) to install the [ADB](https://developer.android.com/studio/command-line/adb.html) tool, which helps you transfer executables to android mobile phones.
+
+  ```
+  unzip platform-tools-latest-linux.zip
+  cd platform-tools
+  mv adb /usr/bin/adb
+  ```
+
+  <2> Verification
+
+  ```
+  # list all available android devices
+  adb devices
+  ```
+
+  If ADB has been installed successfully, you can see all the android devices connected to your server.
+
+## Other Dependency Libraries
+
+Use the [install script](../third_party/install.sh) to install the dependency libraries (*protoc, protobuf, flatbuffers, tensorflow-lite, jpeg, ARM GPU OpenCL*) to the [third_party](../third_party) directory and generate a shell script to set up the compilation environment. You can choose between ARM and X86, and between LLVM and GCC. Here is an example of installation for an ARM NDK LLVM build.
+
+```
+./third_party/install.sh -c arm_llvm -t 33
+```
+
+## Optional Software
+
+- ### JDK
+
+If you want to compile Java programs, you need to download and install the [Java SE Development Kit](https://www.oracle.com/java/technologies/oracle-java-archive-downloads.html) and set the Linux shell environment **PATH**. Run the command "java -version" to verify whether the JDK has been installed.
+
+You can see the JDK version number (for example: 1.8.0_265). If you fail to see the version number or the number you see is lower than 1.8.0_265, please reinstall the JDK on your server. You can refer to the [JDK installation example](https://stackoverflow.com/questions/14788345/how-to-install-the-jdk-on-ubuntu-linux?page=2&tab=Votes) for the installation of the JDK, and set the environment **PATH** to find it.
+
+- ### Android dx
+
+If you want to directly run a *jar* file on an Android device, you can use the [Android dx tool](https://developer.android.com/studio/releases/build-tools). Install the Android *v28.0.3* build tools and set the Linux shell environment **PATH**. Run the command "dx --version" to verify the dx tool version.
+
+You can see the dx version number (for example: 1.16). If you fail to see the version number or the number you see is lower than 1.16, please reinstall the dx tool on your server.
\ No newline at end of file
diff --git a/docs/IOS_USAGE.md b/docs/IOS_USAGE.md
new file mode 100644
index 00000000..94827ce2
--- /dev/null
+++ b/docs/IOS_USAGE.md
@@ -0,0 +1,78 @@
+# How to Use Bolt on iOS Devices
+
+## Overview
+
+Bolt can be used on iOS, and you can use the option <-c arm_ios> in our shell script [install.sh](../install.sh) to finish compilation on a Linux platform. Before using the script, you need to make an ARM-iOS cross compiler toolchain first. A tutorial is given below.
+
+After compilation, you will find libbolt.a and libbolt.dylib in the install_arm_ios/lib directory. We have tested the development using the Objective-C language, in which you can directly use our [C API](DEVELOPER.md). You can also try our C++ API as in the [examples](../inference/examples). For the time being you need to include more headers than when using the C API, and some compilation flags need to be managed.
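+For orientation only, linking a C-API program might look roughly like this (a sketch, not a tested recipe: *arm-apple-darwin-clang* comes from the toolchain built in the steps below, and the source file name, include path and flags are placeholders you must adapt):
+
+```
+arm-apple-darwin-clang my_app.c -I install_arm_ios/include -L install_arm_ios/lib -lbolt -o my_app
+```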
+ +You can also find demo projects as described in [KIT.md](KIT.md). Note that before compiling bolt the demo cannot be directly used, because the built libraries and headers will be installed dynamically in [install.sh](../install.sh). The demo is in the experimental stage, based on our new feature [Flow](DEVELOPER.md). + +## Call for Contribution + +- Xcode simulator support. So far we haven't supported compilation for MAC-x86, so bolt can only be tested on real devices. +- Swift/Objective-C API based on our C API. +- Compilation on platforms other than Linux. + +## Related links + +In addition to our tutorial, you can also refer to the following two links. + +- https://heroims.github.io/2017/09/10/Linux%20%E6%9E%84%E5%BB%BA:%E7%BC%96%E8%AF%91IOS:Mac%E7%A8%8B%E5%BA%8F/ +- https://medium.com/@fyf786452470/%E5%9C%A8linux%E7%9A%84%E7%9A%84%E7%9A%84%E4%B8%8B%E4%BA%A4%E5%8F%89%E7%BC%96%E8%AF%91%E7%94%9F%E6%88%90%E7%9A%84ios%E7%89%88%E7%9A%84%E5%B7%A5%E5%85%B7%E9%93%BE%E7%9A%84%E6%8C%87%E5%AF%BC%E6%89%8B%E5%86%8C-b87b472cbe14 + +## Preparations + +- llvm clang: You can download and install llvm clang from the [llvm website](https://releases.llvm.org/). +- openssl: Generally this tool is installed by default. **Note that if you want to copy your created iOS cross compiler toolchain to another computer for use, you need to confirm that the versions of openssl on these two machines are the same, otherwise your created toolchain can not be used.** +- iPhoneOSSDK: If you don't have your own iPhoneOS SDK, you can download and choose one iPhoneOS SDK from [iPhoneOSSDK](https://github.com/okanon/iPhoneOS.sdk), which contains iPhoneOS SDKs from the version 8.4 to 13.2. +- cctools : This open-source tool can help us make the ARM-iOS cross compiler toolchain and you can clone the tool from [cctools-port](https://github.com/tpoechtrager/cctools-port). + +### Our versions of these tools : + +- llvm clang : 3.9.1 +- openssl : 1.0.2g +- iPhoneOsSDK: 10.0 +- cctools : 949.0.1, ld64: 530 (the latest version on github) + +## Step by Step + +**1.** First, make sure that you have available tools including llvm clang and openssl. + +**2.** Clone iPhoneOS SDK from [iPhoneOSSDK](https://github.com/okanon/iPhoneOS.sdk), and then place the archive **in the user home directory ~/**. For example we place it in */data/home/test*. We tried to put it in other directories, but it failed for us. + +**3.** Clone cctools-port from [cctools-port](https://github.com/tpoechtrager/cctools-port). + +``` +test@ubuntu:~$ pwd +/data/home/test +test@ubuntu:~$ mkdir ioscompile +test@ubuntu:~$ cd ioscompile +test@ubuntu:~/ioscompile$ git clone https://github.com/tpoechtrager/cctools-port.git +test@ubuntu:~/ioscompile$ ls +cctools-port-master +test@ubuntu:~$ cd .. +``` + +**4.** Use the shell script build.sh of cctools-port in the directory *cctools-port-master/usage_examples/ios_toolchain/* to make aarch64/arm64-ios cross compiler toolchain. The commands are : +``` +test@ubuntu:~$ cd ioscompile/cctools-port-master/ +test@ubuntu:~$ ./usage_examples/ios_toolchain/build.sh /data/home/test/iPhoneOS10.0.sdk.tar.gz arm64 +``` +After a while, a folder **target** is created in the directory cctools-port-master/usage_examples/ios_toolchain/ and this folder **target** is the created aarch64-ios cross compiler toolchain. Now you have successfully made an ARM-IOS cross compiler toolchain on Linux. 
In this folder, the sub-folder */bin* contains cross compilers and related tools like *arm-apple-darwin-clang/clang++*, and the sub-folder */lib* contains the dependent libraries. By the way, if you want to make an armv7-ios cross compiler toolchain, you can change these commands like : +``` +test@ubuntu:~$ cd ioscompile/cctools-port-master/ +test@ubuntu:~$ ./usage_examples/ios_toolchain/build.sh /data/home/test/iPhoneOS10.0.sdk.tar.gz armv7 +``` + +**5.** When you use your created ARM-IOS cross compiler toolchain to build bolt, you need to configure the toolchain in your environment with the following commands or you can configure the toolchain permanently in your environment. +``` +test@ubuntu:~$ export PATH=/data/home/test/ioscompile/cctools-port-master/usage_examples/ios_toolchain/target/bin:$PATH +test@ubuntu:~$ export LD_LIBRARY_PATH=/data/home/test/ioscompile/cctools-port-master/usage_examples/ios_toolchain/target/lib:$LD_LIBRARY_PATH +``` + +**6.** Simply go to the root directory for bolt, and run: + +``` +test@ubuntu:~/bolt$ ./install.sh -c arm_ios +``` diff --git a/docs/KIT.md b/docs/KIT.md new file mode 100644 index 00000000..09980d29 --- /dev/null +++ b/docs/KIT.md @@ -0,0 +1,50 @@ +# Kit + +Kit is an experimental feature based on [Flow](DEVELOPER.md), which aims to simplify the integration of bolt into applications. At this stage we are still rapidly exploring different designs. In the long run we want to provide symmetrical APIs for different platforms including iOS, Android, etc. + +In the [kit](../kit) directory, you can find the available demo project. In order to use the demo, bolt should be compiled first and some [headers and libraries](../kit/iOS/setup_lib_iOS.sh) need to be installed into the project, which is also taken care of in [install.sh](../install.sh). Currently we have uploaded an iOS project for image classification. + +## iOS Overview + +Our demo is using the Objective-C Language and the C++ API of Flow. Mainbody of the codes is in [ViewController.mm](../kit/iOS/image_classification/ImageClassificationDemo/ViewController.mm). There are some notes regarding iOS kits: + +- Compilation flags. The C++ API of Flow requires quite a few headers, and some compilation flags need to be set. For convenience, you can include [kit_flags.h](../kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/kit_flags.h) before including flow.h. +- Model path in flow prototxt. Flow reads the model paths in prototxt in order to locate the models. On iOS, however, the exact storage path for model files is dynamically determined. [ViewController.mm](../kit/iOS/image_classification/ImageClassificationDemo/ViewController.mm) demonstrates how to update prototxt with the new model path. + +### Image Classification + +The demo takes video input from camera, and uses GhostNet model trained on ImageNet. You can easily switch to other models trained on other datasets, following the steps below. As a tutorial, we will show how to change the model to the FP16 GhostNet that is also included in the project. You can try other models if your device is older than iPhone X and thus not in ARMv8.2 architecture. + +**0.** In [image_classification.prototxt](../kit/iOS/image_classification/ImageClassificationDemo/libbolt/image_classification.prototxt), you can see that the Inference node includes a path to ghostnet_f32.bolt. Actually, it is not necessary to change this path to ghostnet_f16.bolt, because this path will be dynamically overwritten as explained above. 
We will show how to switch to FP16 in Step 1.
+
+In the following steps, if the file name is not specified, please check [ViewController.mm](../kit/iOS/image_classification/ImageClassificationDemo/ViewController.mm).
+
+**1.** Switch to the FP16 model. Change Line 78 to:
+
+```
+NSString *boltPath=[[NSBundle mainBundle]pathForResource:@"ghostnet_f16" ofType:@"bolt"];
+```
+
+Please also change the variable inferencePrecision to DT_F16.
+
+**2.** Adjust the pixelProcess function, which is registered as the preprocessing function for the Inference node. For FP16 inference, the actual input to the model should be in FP16:
+
+```
+F16 *oneArr = (F16 *)((CpuMemory *)outputs["input:0"]->get_memory())->get_ptr();
+```
+
+If you are using your own model, change "input:0" to the name of your model's input tensor.
+
+The provided Ghostnet requires input pixels organized as BGRBGRBGR... Adjust accordingly if your model is trained with different preprocessing (e.g. normalizing each channel).
+
+**3.** Adjust the postProcess function, which is registered as the postprocessing function for the Inference node. For FP16 inference, the output score is also in FP16:
+
+```
+F16 *score1000 = (F16 *)((CpuMemory *)inputs[boltModelOutputName]->get_memory())->get_ptr();
+```
+
+If necessary, change boltModelOutputName to the name of your model's output tensor. If your model is not trained on ImageNet, there may not be 1000 scores. You may also change the topK variable.
+
+**4.** If necessary, replace imagenet_classes.txt. Add code to handle the class index numbers that Flow outputs.
+
+**5.** On Android devices, please run the program under the path "/data/local/tmp" to ensure it has full permissions.
diff --git a/docs/QUANTIZATION.md b/docs/QUANTIZATION.md
new file mode 100644
index 00000000..4d2ec3ab
--- /dev/null
+++ b/docs/QUANTIZATION.md
@@ -0,0 +1,48 @@
+# Quantization Toolchain
+
+So far bolt supports various modes of post-training quantization, including quantized storage, dynamic quantization inference, calibration, etc. In the future, we will also provide quantization training tools.
+
+## post_training_quantization
+
+Please refer to [model_tools/tools/quantization/post_training_quantization.cpp](../model_tools/tools/quantization/post_training_quantization.cpp). All post-training quantization utilities are covered in this tool, except the calibration, which will also be merged into this tool in the future.
+
+Before using this tool, you need to first produce the input model with X2bolt using the "-i PTQ" option. Later, you can use the tool:
+
+```
+./post_training_quantization -p model_ptq_input.bolt
+```
+
+Different options of the tool are explained below. The default setting will produce model_int8_q.bolt, which will be executed with dynamic int8 quantization.
+
+Here is the list of covered utilities:
+
+1. **Quantized Storage**: If you would like to compress your model, use the -q option. Choose from {FP16, INT8, MIX}. INT8 storage could lead to an accuracy drop, so we provide the MIX mode, which will try to avoid accuracy-critical layers. Note that this option is independent from the -i option, which sets the inference precision.
+2. **Global Clipping of GEMM Inputs**: In some cases of quantization-aware training, GEMM inputs will be clipped so that they can be better quantized symmetrically. Please use the -c option if necessary.
+3. **Ad-Hoc Clipping of Feature Maps**: In some other cases, the clip value is a trainable parameter for individual layers. Please use the -s option. The parameter **scaleFileDirectory** is the directory of your scale table file (.txt). Note that the text format of the file is as follows, where **clipvalue** is the clip value of each feature map in your model. In our tool, we will calculate the true scale of each tensor as clipvalue/127.0 and store it into the created int8 model.
+```
+tensor_name_0 clipvalue
+tensor_name_1 clipvalue
+tensor_name_2 clipvalue
+```
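+For example, a hypothetical invocation that, in addition to the default dynamic int8 quantization, compresses storage with the MIX mode (the model file name is the placeholder used above):
+
+```
+./post_training_quantization -p model_ptq_input.bolt -q MIX
+```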
+## Calibration tool
+
+The post-training quantization calibration tool is in [inference/engine/tools/ptq_calibration/ptq_calibration.cpp](../inference/engine/tools/ptq_calibration/ptq_calibration.cpp). The command to use this tool is:
+```
+./ptq_calibration modelPath dataDirectory dataFormat scaleValue affinityPolicyName algorithmMapPath
+```
+The parameters are:
+
+1. **modelPath**: the path of your int8 Bolt model. Make sure that you get your int8 Bolt model with our converter tool X2bolt; then you can use this post-training quantization calibration tool with your own calibration datasets.
+2. **dataDirectory**: the directory of your calibration datasets. Note that the structure of the folder is:
+```
+HWSEA:/data/local/tmp/test # cd calibration_dataset
+HWSEA:/data/local/tmp/test # ls
+XXXXX.JPEG XXXXX.JPEG XXXXX.JPEG XXXXX.JPEG XXXXX.JPEG
+```
+3. **dataFormat**: the image format: BGR/RGB/RGB_SC/BGR_SC_RAW/BGR_SC_R
+4. **scaleValue**: the scale value for image classification; the default value is 1
+5. **affinityPolicyName**: the running mode: CPU_AFFINITY_HIGH_PERFORMANCE/CPU_AFFINITY_LOW_POWER/GPU; the default value is CPU_AFFINITY_HIGH_PERFORMANCE.
+6. **algorithmMapPath**: the file path to read or write the algorithm auto-tuning result
+
+After running this post-training quantization calibration tool, you will get an int8-KL Bolt model whose name ends with **_int8_q_KL.bolt** in the folder that stores your original int8 model.
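+For example, a hypothetical run over a folder of JPEG images, following the parameter order above (paths and values are placeholders):
+
+```
+./ptq_calibration model_int8_q.bolt ./calibration_dataset BGR 1 CPU_AFFINITY_HIGH_PERFORMANCE ./algo_map
+```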
diff --git a/docs/REDUCE_GPU_PREPARE_TIME.md b/docs/REDUCE_GPU_PREPARE_TIME.md
new file mode 100644
index 00000000..b0dd34fd
--- /dev/null
+++ b/docs/REDUCE_GPU_PREPARE_TIME.md
@@ -0,0 +1,62 @@
+# How to reduce gpu initial time
+
+Bolt supports ARM Mali GPU; a large additional prepare time is incurred due to algorithm selection and building kernels from source code.
+
+- ## Build extra resources for reducing prepare time on GPU
+
+  Bolt provides the offline tool [preprocess_ocl](../inference/engine/tools/preprocess_ocl/build_preprocess_ocl.sh) to reduce GPU prepare time. We have tested mobilenet_v1 on a MALI G76 GPU: prepare time can be reduced from 2-3 s to 60 ms after building the algorithm file and the OpenCL kernel binary. Here we give an example:
+
+  - ### Step By Step
+
+    <1> Connect the target device by Android ADB;
+
+    <2> Convert your models to .bolt with X2bolt;
+
+    <3> Make a readable/writable folder on the target device and copy all your needed .bolt models into it, e.g.:
+
+    ```
+    adb shell "mkdir /data/local/tmp/preprocess_bolt_models"
+    adb shell "cp ${boltModelDir}/*.bolt /data/local/tmp/preprocess_bolt_models"
+    ```
+
+    <4> Set the essential variables in the file *tools/preprocess_ocl/build_preprocess_ocl.sh*:
+
+    - dNum: device serial number, which can be acquired by using the command
+
+      ```
+      adb devices
+      ```
+
+    - device_bolt_models: the folder created in step <3>;
+
+    - device_work_local: the "preprocess_ocl" work path on the target device; /data/local/tmp/preprocess is suggested;
+
+    <5> Run *build_preprocess_ocl.sh* on the host;
+
+    After running build_preprocess_ocl.sh successfully, these extra resources will be produced:
+    - algorithm file: records the best algorithm for your model on the target device, such as: *${BOLT_ROOT}/tools/preprocess_ocl/algoFiles/algorithmInfo_Mali_G76p_GPUSR_p-16-1-p-8-1-p_1input_2_4*
+
+    - OpenCL kernel binary dynamic library: all kernels needed by your model have been compiled from source to binaries and packaged into a .so, such as: *${BOLT_ROOT}/tools/preprocess_ocl/lib/libMali_G76p_map.so*
+
+- ## Use the algorithm file and kernel binary dynamic library to reduce gpu prepare time for your model
+
+  - ### Reduce Imagenet classification prepare time
+    ```
+    adb shell "mkdir /data/local/tmp/kits"
+    adb push install_arm_llvm/kits/classification /data/local/tmp/kits
+    adb push tools/preprocess_ocl/algoFiles/algorithmInfo_Mali_G76p_GPUSR_p-16-1-p-8-1-p_1input_2_4 /data/local/tmp/kits
+    adb push tools/preprocess_ocl/lib/libMali_G76p_map.so /data/local/tmp/kits
+    adb shell "cd /data/local/tmp/kits && export LD_LIBRARY_PATH=./ && ./classification -m ./mobilenet_v1_f16.bolt -a GPU -p ./"
+    ```
+
+  - ### Reduce C project prepare time
+
+    - The argument *algoPath* of the C API *ModelHandle CreateModel(const char \*modelPath, AFFINITY_TYPE affinity, const char \*algoPath)* is used to set your algorithm file;
+    - The argument *algoFileStream* of the C API *ModelHandle CreateModelWithFileStream(const char \*modelFileStream, AFFINITY_TYPE affinity, const char \*algoFileStream)* is used to set your algorithm file as a file stream;
+    - Package the kernel binary dynamic library into your project;
+
+- ## Note
+  - Algorithm files are bound to a specific model and target device;
+  - The kernel binary dynamic library is bound to a specific GPU type;
+  - The tool *preprocess_ocl* can produce algoFiles for several bolt models at once, and package all the kernels they need into one single kernel binary dynamic library;
+  - On Android devices, please run the program under the path "/data/local/tmp" to ensure it has full permissions;
diff --git a/docs/THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.md b/docs/THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.md
new file mode 100644
index 00000000..b57d845d
--- /dev/null
+++ b/docs/THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.md
@@ -0,0 +1,208 @@
+Please note we provide an open source software notice for the third party open source software along with this software and/or this software component contributed by Huawei (in the following just “this SOFTWARE”). The open source software licenses are granted by the respective right holders.
+
+Warranty Disclaimer
+
+THE OPEN SOURCE SOFTWARE IN THIS SOFTWARE IS DISTRIBUTED IN THE HOPE THAT IT WILL BE USEFUL, BUT WITHOUT ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
SEE THE APPLICABLE LICENSES FOR MORE DETAILS. + + + +Copyright Notice and License Texts + +Software: caffe 1.0 () + +Copyright notice: + +All contributions by the University of California: + +Copyright (c) 2014-2017 The Regents of the University of California(Regents) + +All right reserved. + +All other contributions: + +Copyright (c) 2014-2017, the respective contributors + +All rights reserved. + +Caffe uses a shared copyright model: each contributor holds copyright over their contributions to Caffe. The project versioning records all such contribution and copyright details. If a contributor want to further mark their specific copyright on a particular contribution, they should indicate their copyright solely in commit message of the change when it is committed. + +License: + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the followingdisclaimer in the documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +Copyright Notice and License Texts + +Software: onnx 1.6.0 () + +Copyright notice: + +Copyright (c) 2017 ONNX Project Contributors + +All rights reserved. + +License: + +Permission is hereby granted, free of charge, to any person obtaining a copy + +of this software and associated documentation files (the "Software"), to deal + +in the Software without restriction, including without limitation the rights + +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + +copies of the Software, and to permit persons to whom the Software is + +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all + +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + +SOFTWARE. + + + +Copyright Notice and License Texts + +Software: protobuf 2.7.0 () + +Copyright (c) 2008 Google Inc. + +All rights reserved. 
+ +License: + +Redistribution and use in source and binary forms, with or without + +modification, are permitted provided that the following conditions are + +met: + + \* Redistributions of source code must retain the above copyright + +notice, this list of conditions and the following disclaimer. + + \* Redistributions in binary form must reproduce the above + +copyright notice, this list of conditions and the following disclaimer + +in the documentation and/or other materials provided with the + +distribution. + + \* Neither the name of Google Inc. nor the names of its + +contributors may be used to endorse or promote products derived from + +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Code generated by the Protocol Buffer compiler is owned by the owner + +of the input file used when generating it. This code is not + +standalone and requires a support library to be linked with it. This + +support library is itself covered by the above license. + + + +Copyright Notice and License Texts + +Software: tensorflow 1.15.0 () + +Copyright (c) 2019 The TensorFlow Authors. + +All rights reserved. + +License: + +Licensed under the Apache License, Version 2.0 (the "License"); + +you may not use this file except in compliance with the License. + +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software + +distributed under the License is distributed on an "AS IS" BASIS, + +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and + +limitations under the License. + + + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files(the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + + + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + + + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
\ No newline at end of file
diff --git a/docs/USER_HANDBOOK.md b/docs/USER_HANDBOOK.md
index 34b227f6..51b5a791 100644
--- a/docs/USER_HANDBOOK.md
+++ b/docs/USER_HANDBOOK.md
@@ -1,338 +1,322 @@
 Before you try any step described in this document, please make sure you have installed Bolt correctly. You can refer to [INSTALL.md](INSTALL.md) for more details.
-
+[Basic Usage](#basic-usage)
+    [Model Conversion](#model-conversion)
+    [Model Inference](#model-inference)
+    [API](#api)
+    [Performance Profiling](#performance-profiling)
+[Advanced Features](#advanced-features)
+    [INT8 Post Training Quantization](#int8-post-traning-quantization)
+    [BNN Network Support](#bnn-network-support)
+    [Algorithm Tuning for Key Layers](#algorithm-tuning-for-key-layers)
+    [Time-Series Data Acceleration](#time-series-data-acceleration)
+[Feedback](#feedback)

 # Basic Usage

-### Model Conversion
-
-
+It's quite easy for users to get started with bolt by learning the following two steps: "Model Conversion" and "Model Inference". After successfully running bolt with your model, you can further explore the "API" section to customize your application.

+## Model Conversion

-1. **Caffe model to Bolt model**
+![ModelConversion](images/ModelConversion.PNG)

-   <1> Push the `caffe2bolt` executable file to the phone;
+[X2bolt](model_tools/tools/X2bolt/X2bolt.cpp) is a general converter, which focuses on converting different deep learning models to bolt models. Currently, X2bolt supports caffe/onnx/tflite/tensorflow model conversion. Here we list two typical model conversion examples for the ARM backend; for the X86 backend the ADB tool is not required. Note that bolt X86 only supports FP32 precision inference now.

-   <2> Push the caffe model to the phone;
+### resnet50(caffe) Model Conversion

-   <3> Use `caffe2bolt` to transform model of caffe format to model of bolt format
+The resnet50 (caffe) model contains two model files: *resnet50.prototxt* and *resnet50.caffemodel*. Prepare these two model files in */home/resnet50/* in advance.

-   Parameters: caffe_model_path caffe_model_name precision
+<1> Push your model to the phone;

-   ​	Note: Your should make sure the .prototxt file and the .caffemodel file have the same model name
-
-Example: Transform mobilenet_v1 of caffe format into bolt format
-
-```shell
-<1> adb push /home/bolt/install_llvm/tools/caffe2bolt /data/local/bolt/tools/caffe2bolt
-<2> adb push /home/bolt/models/caffe/mobilenet_v1/ /data/local/bolt_model/caffe/mobilenet_v1
-<3> adb shell "./data/local/bolt/tools/caffe2bolt ./data/local/bolt_model/caffe/mobilenet_v1/ mobilenet_v1 FP16"
-```
+```
+adb push /home/resnet50/ /data/local/tmp/models/resnet50
-
-   After running, you can see the mobilenet_v1_f16.bolt file in the same directory with the original caffe model.
-
-   The suffix "_f16" indicates that the bolt model is saved in FP16 representations, and will be run with FP16 operations (ARMv8.2) by default.
-
-   If you want to deploy the model as FP32, please set the last parameter to "FP32" for caffe2bolt. You will then get mobilenet_v1_f32.bolt.
-
-   This precision setting also applies to onnx2bolt and tflite2bolt.
+
+adb shell "ls /data/local/tmp/models/resnet50"
+# command output$ resnet50.caffemodel resnet50.prototxt
+```
+
+<2> Push ***X2bolt*** to the phone and get the help information of ***X2bolt***;
+```
+adb push /home/bolt/install_arm_gnu/tools/X2bolt /data/local/tmp/bolt/tools/X2bolt
+
+adb shell "ls /data/local/tmp/bolt/tools/"
+# command output$ X2bolt
+
+adb shell "./X2bolt --help"
+```
+<3> Execute ***X2bolt*** to convert the caffe model to a bolt model. Here is an example of float16 model conversion.
+
+```
+adb shell "/data/local/tmp/bolt/tools/X2bolt -d /data/local/tmp/models/resnet50/ -m resnet50 -i FP16"
+
+adb shell "ls /data/local/tmp/models/resnet50"
+# command output$ resnet50_fp16.bolt
+```
+
+Note: the model conversion procedure for onnx and tflite is similar to caffe's.
+
+### mobilenet_v1(tensorflow) Model Conversion
+
+Save your mobilenet_v1 model as a frozen .pb file. Preprocess your model using [tf2json](model_tools/tools/tensorflow2json/tf2json.py), which converts the .pb to .json. Then use ***X2bolt*** to convert the .json to a .bolt model.
+
+Here is the example of mobilenet_v1_frozen.pb converted to mobilenet_v1.bolt.
+
+<1> Prepare the mobilenet_v1 model (frozen .pb) on the server;
+
+```
+file /home/mobilenet_v1/mobilenet_v1_frozen.pb
+```
+
+<2> Convert mobilenet_v1_frozen.pb to mobilenet_v1.json;
+
+```
+python3 model_tools/tools/tensorflow2json/tf2json.py /home/mobilenet_v1/mobilenet_v1_frozen.pb /home/mobilenet_v1/mobilenet_v1.json
+
+ls /home/mobilenet_v1
+# command output$ mobilenet_v1.json
+```
+
+<3> Push the mobilenet_v1.json to the phone;
+
+```
+adb push /home/mobilenet_v1/mobilenet_v1.json /data/local/tmp/models/mobilenet_v1/mobilenet_v1.json
+
+adb shell "ls /data/local/tmp/models/mobilenet_v1"
+# command output$ mobilenet_v1_frozen.pb mobilenet_v1.json
+```
+
+<4> Push ***X2bolt*** to the phone and get the help information of ***X2bolt***;
+```
+adb push /home/bolt/install_arm_gnu/tools/X2bolt /data/local/tmp/bolt/tools/X2bolt
+
+adb shell "ls /data/local/tmp/bolt/tools/"
+# command output$ X2bolt
+
+adb shell "./X2bolt --help"
+```
+<5> Execute ***X2bolt*** to convert the model from .json (converted from .pb) to a bolt model. Here is an example of float32 model conversion.
+
+```
+adb shell "/data/local/tmp/bolt/tools/X2bolt -d /data/local/tmp/models/mobilenet_v1/ -m mobilenet_v1 -i FP32"
+
+adb shell "ls /data/local/tmp/models/mobilenet_v1"
+# command output$ mobilenet_v1.json mobilenet_v1_f32.bolt
+```
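+If you intend to use the post-training quantization tools described in [QUANTIZATION.md](QUANTIZATION.md), the same command can also produce the quantization input model — a sketch reusing the paths above:
+
+```
+adb shell "/data/local/tmp/bolt/tools/X2bolt -d /data/local/tmp/models/mobilenet_v1/ -m mobilenet_v1 -i PTQ"
+```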
-4. **PyTorch model to Bolt model** +``` +adb shell "/data/local/tmp/bolt/tools/X2bolt -d /data/local/tmp/models/mobilenet_v1/ -m mobilenet_v1 -i FP32" - PyTorch should have native support for onnx format. For your own convenience, you can try that first. +adb shell "ls /data/local/tmp/models/mobilenet_v1" +# command output$ mobilenet_v1.json mobilenet_v1_f32.bolt +``` - The process flow is: PyTorch model to Caffe model, and then to Bolt model +## Model Inference - <1> PyTorch model to Caffe model +### General Benchmark - Refer to the [pytorch2caffe README.md](../model-tools/tools/pytorch2caffe/README.md) for more details on transforming Pytorch model to Caffe model. +[*benchmark*](../inference/examples/benchmark/benchmark.cpp) is a general tool for measuring any .bolt model inference performace. - <2> Caffe model to Bolt model +<1> Push the ***benchmark*** to the phone and check its usage; - Refer to the former steps in "Caffe model to Bolt model" section in this chapter. +``` +adb push /home/bolt/install_arm_gnu/kits/benchmark /data/local/tmp/bolt/bin/benchmark +adb shell "./benchmark --help" +``` +<2> Execute ***benchmark*** for your model inference performace. -### Model Inference +``` +# running with fake data +adb shell "./data/local/tmp/bolt/bin/benchmark -m /data/local/tmp/bolt_model/caffe/resnet/resnet_f16.bolt" -We provide several demo programs, and here we will explain the usage of two typical programs: image classification and tinybert. +# running with real data +adb shell "./data/local/tmp/bolt/bin/benchmark -m /data/local/tmp/bolt_model/caffe/resnet/resnet_f16.bolt -i /data/local/tmp/data/1_3_224_224_fp16.bin" +``` +### Imagenet classification +Example: Run mobilenet_v1 for image classification with CPU -1. **Classification** +<1> Push classification to the phone; - <1> Push classification to the phone; - - <2> Push the testing image data to the phone; - - <3> Run classification and get the result. - - Parameters: bolt_model image_directory image_format scale_value TopK correct_label archInfo algorithmMapPath +``` +adb push /home/bolt/install_arm_gnu/kits/classification /data/local/tmp/bolt/bin/classification +``` -Example: Run mobilenet_v1 for image classification +<2> Push the testing image data to the phone; -```shell -<1> adb push /home/bolt/install_llvm/kits/classification /data/local/bolt/bin/classification -<2> adb push /home/bolt/data/ILSVRC/n02085620/ /data/local/bolt_data/cv/ILSVRC/n02085620 -<3> adb shell "./data/local/bolt/bin/classification /data/local/bolt_model/caffe/mobilenet_v1/mobilenet_v1_f16.bolt /data/local/bolt_data/cv/ILSVRC/n02085620 BGR 0.017 5 151 CPU_AFFINITY_HIGH_PERFORMANCE" +``` +adb push /home/bolt/data/ILSVRC/n02085620/ /data/local/tmp/bolt_data/cv/ILSVRC/n02085620 ``` - After running, you should be able to see the TopK labels for each image calculated according to the model, the Top1 and TopK accuracy, and the execution time. +<3> Run CPU classification and get the result. - Here we explain a little more for some of the parameters. +``` +adb shell "/data/local/tmp/bolt/bin/classification -m /data/local/tmp/bolt_model/caffe/mobilenet_v1/mobilenet_v1_f16.bolt -i /data/local/tmp/bolt_data/cv/ILSVRC/n02085620 -f BGR -s 0.017 -t 5 -c 151 -a CPU_AFFINITY_HIGH_PERFORMANCE -p ./" +``` - - image_format: The image format requested by the model. For example, caffe models usually require BGR format. You can refer to [image_processing.cpp](../image/src/image_processing.cpp) for more details. 
-  - scale_value: The scale value requested in the input preprocessing. This value is also used in [image_processing.cpp](../image/src/image_processing.cpp). If your network required normalized inputs, the typical scale value is 0.017.
-  - TopK: The number of predictions that you are interested in for each image. Typical choice is 5.
-  - correct_label: The correct label number for the whole image directory.
-  - archInfo:
-    -- CPU_AFFINITY_HIGH_PERFORMANCE, Bolt will look for a high-frequency core and bind to it.
-    -- CPU_AFFINITY_LOW_POWER, Bolt will look for a low-frequency core.
-    -- GPU, Bolt will run the model on MALI GPU.
-    If the parameter is missing, the default value is "CPU_AFFINITY_HIGH_PERFORMANCE".
-  - algorithmMapPath The file path to save algorithm selection result info, it is strongly recommended to be set when use GPU.
-
-  More Details for using GPU on classification nets:
-  Example for running with bolt GPU:
-  /*bolt_model*/ /*image_directory*/ /*image_format*/ /*scale*/ /*TopK*/ /*correct_lable*/ /*archInfo*/ /*algorithMapPath*/
-  ./classification /data/local/tmp/model/mobilenet_v1_f16.bolt /data/local/tmp/data BGR 0.017 5 151 GPU /data/local/tmp
-
-  When you first running program, GPU will take lots of time to do algorithm selected and save the results to the algorithmMapPath you set.
-  After algorithm selected results been saved successfully, this step will be skipped.
-  If you want to get the best performance, please set the algorithmMapPath, and running your model after algorithm selected results been produced.
-  NOTE:
-  -- The file name of algorithm selected results are constitute with "modelname + archInfo + dataType", such as "algorithmInfo_MOBILENET_2_4".
-  -- If you modified your model, please delete the old algorithm selected results and run it again, or it may cause unpredicted errors.
-
-2. **Tinybert**
-
-   <1> Push tinybert to the phone;
+- -f/--imageFormat: The image format requested by the model. For example, caffe models usually require BGR format. You can refer to [image_processing.cpp](../compute/image/src/image_processing.cpp) for more details.
-
-   <2> Push the testing sequence data to the phone;
+- -s/--scaleValue: The scale value requested in the input preprocessing. This value is also used in [image_processing.cpp](../compute/image/src/image_processing.cpp). If your network requires normalized inputs, the typical scale value is 0.017.
-
-   <3> Run tinybert and get the result.
+- -t/--topK: The number of predictions that you are interested in for each image. The typical choice is 5.
-
-   Parameters: bolt_model sequence_directory thread_affinity
+- -c/--correctLabels: The correct label number for the whole image directory.
-
-Example:
+- -a/--archinfo:
+
+  The default value is "CPU_AFFINITY_HIGH_PERFORMANCE".
-
-```shell
-<1> adb push /home/bolt/install_llvm/kits/tinybert /data/local/bolt/bin/tinybert
-<2> adb mkdir /data/local/bolt_data/nlp/tinybert/data
+
+  -- CPU_AFFINITY_HIGH_PERFORMANCE, Bolt will look for a high-frequency core and bind to it.
+
+  -- CPU_AFFINITY_LOW_POWER, Bolt will look for a low-frequency core.
+
+  -- GPU, Bolt will run the model on MALI GPU.
+
+- -p/--algoPath: The file path to save algorithm selection result info; it is strongly recommended to set this when using GPU.
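+For example, a hypothetical low-power CPU run of the same model, combining the flags documented above:
+
+```
+adb shell "/data/local/tmp/bolt/bin/classification -m /data/local/tmp/bolt_model/caffe/mobilenet_v1/mobilenet_v1_f16.bolt -i /data/local/tmp/bolt_data/cv/ILSVRC/n02085620 -f BGR -s 0.017 -t 5 -c 151 -a CPU_AFFINITY_LOW_POWER -p ./"
+```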
-    - thread_affinity: When it is set to be CPU_AFFINITY_HIGH_PERFORMANCE, Bolt will look for a high-frequency core and bind to it. When it is set to be CPU_AFFINITY_LOW_POWER, Bolt will look for a low-frequency core. If the parameter is missing, the default value is "CPU_AFFINITY_HIGH_PERFORMANCE".
+<4> Run GPU classification and get the result.
+```
+adb shell "/data/local/tmp/bolt/bin/classification -m /data/local/tmp/bolt_model/caffe/mobilenet_v1/mobilenet_v1_f16.bolt -i /data/local/tmp/bolt_data/cv/ILSVRC/n02085620 -f BGR -s 0.017 -t 5 -c 151 -a GPU -p /data/local/tmp/tmp"
+```
-3. **Neural Machine Translation**
+When you run the program for the first time, the GPU needs a long time to select the best algorithms and to save the selection results to the path you set with *-p/--algoPath*. Once the algorithm selection results have been saved successfully, this step is skipped on later runs.
-    <1> Push nmt to the phone;
+If you want to get the best performance, please set *-p/--algoPath* and run your model again after the algorithm selection results have been produced.
-    <2> Push the testing sequence data to the phone;
+NOTE:
-    <3> Run nmt and get the result.
+- The file name of the algorithm selection results is composed of "modelName + archInfo + dataType", such as "algorithmInfo_MOBILENET_2_4".
+- If you modify your model, please delete the old algorithm selection results and run the selection again; otherwise it may cause unpredictable errors.
-    Parameters: bolt_model sequence_directory thread_affinity
+### Tinybert
-Example:
+<1> Push tinybert to the phone;
-```shell
-<1> adb push /home/bolt/install_llvm/kits/nmt /data/local/bolt/bin/nmt
-<2> adb mkdir /data/local/bolt_data/nlp/machine_translation/data
-<3> adb mkdir /data/local/bolt_data/nlp/machine_translation/data/input
-<4> adb mkdir /data/local/bolt_data/nlp/machine_translation/data/result
-<5> adb push /home/bolt/model-tools/tools/tensorflow2caffe/nmt/0.seq /data/local/bolt_data/nlp/machine_translation/data/input/0.seq
-<6> adb shell "./data/local/bolt/bin/nmt /data/local/bolt_model/caffe/nmt/nmt_f16.bolt /data/local/bolt_data/nlp/machine_translation/data CPU_AFFINITY_HIGH_PERFORMANCE"
+```
+adb push /home/bolt/install_arm_gnu/kits/tinybert /data/local/tmp/bolt/bin/tinybert
+```
-```
-    After running, you should be able to see the machine translation result, and the execution time.
-
-    Here we explain a little more for some of the parameters.
+<2> Push the testing sequence data to the phone;
-    - thread_affinity: When it is set to be CPU_AFFINITY_HIGH_PERFORMANCE, Bolt will look for a high-frequency core and bind to it. When it is set to be CPU_AFFINITY_LOW_POWER, Bolt will look for a low-frequency core. If the parameter is missing, the default value is "CPU_AFFINITY_HIGH_PERFORMANCE".
+```
+adb shell mkdir -p /data/local/tmp/bolt_data/nlp/tinybert/data
+adb shell mkdir -p /data/local/tmp/bolt_data/nlp/tinybert/data/input
+adb shell mkdir -p /data/local/tmp/bolt_data/nlp/tinybert/data/result
+adb push /home/bolt/model_tools/tools/tensorflow2caffe/tinybert/sequence.seq /data/local/tmp/bolt_data/nlp/tinybert/data/input/0.seq
+```
+<3> Run tinybert and get the result.
-4. **Automatic Speech Recognition RNNT**
+```
+adb shell "/data/local/tmp/bolt/bin/tinybert -m /data/local/tmp/bolt_model/caffe/tinybert/tinybert_f16.bolt -i /data/local/tmp/bolt_data/nlp/tinybert/data -a CPU_AFFINITY_HIGH_PERFORMANCE"
+```
-    <1> Push asr_rnnt to the phone;
+After running, you should be able to see the labels for each sequence calculated according to the model, and the execution time.
-    <2> Push the testing sequence data to the phone;
+### Neural machine translation (NMT)
-    <3> Run asr_rnnt and get the result.
+<1> Push nmt to the phone;
-    Parameters: bolt_model sequence_directory thread_affinity
+```
+adb push /home/bolt/install_llvm/kits/nmt /data/local/tmp/bolt/bin/nmt
+```
-Example:
+<2> Push the testing sequence data to the phone;
-```shell
-<1> adb push /home/bolt/install_llvm/kits/asr_rnnt /data/local/bolt/bin/asr_rnnt
-<2> adb mkdir /data/local/bolt_data/nlp/asr/asr_rnnt/data
-<3> adb mkdir /data/local/bolt_data/nlp/asr/asr_rnnt/data/input
-<4> adb mkdir /data/local/bolt_data/nlp/asr/asr_rnnt/data/result
-<5> adb push /home/bolt/model-tools/tools/tensorflow2caffe/asr/asr_rnnt.seq /data/local/bolt_data/nlp/asr/asr_rnnt/data/input/0.seq
-<6> adb shell "./data/local/bolt/bin/asr_rnnt /data/local/bolt_model/caffe/asr_rnnt/asr_rnnt_f16.bolt /data/local/bolt_data/nlp/asr/asr_rnnt/data CPU_AFFINITY_HIGH_PERFORMANCE"
+```
+adb shell mkdir -p /data/local/tmp/bolt_data/nlp/machine_translation/data
+adb shell mkdir -p /data/local/tmp/bolt_data/nlp/machine_translation/data/input
+adb shell mkdir -p /data/local/tmp/bolt_data/nlp/machine_translation/data/result
+adb push /home/bolt/model_tools/tools/tensorflow2caffe/nmt/0.seq /data/local/tmp/bolt_data/nlp/machine_translation/data/input/0.seq
+```
-```
-    After running, you should be able to see the speech recognition result, and the execution time.
-
-    Here we explain a little more for some of the parameters.
+<3> Run nmt and get the result.
-    - thread_affinity: When it is set to be CPU_AFFINITY_HIGH_PERFORMANCE, Bolt will look for a high-frequency core and bind to it. When it is set to be CPU_AFFINITY_LOW_POWER, Bolt will look for a low-frequency core. If the parameter is missing, the default value is "CPU_AFFINITY_HIGH_PERFORMANCE".
+```
+adb shell "/data/local/tmp/bolt/bin/nmt -m /data/local/tmp/bolt_model/caffe/nmt/nmt_f16.bolt -i /data/local/tmp/bolt_data/nlp/machine_translation/data -a CPU_AFFINITY_HIGH_PERFORMANCE"
+```
+After running, you should be able to see the machine translation result, and the execution time.
-5. **Automatic Speech Recognition Convolution+Transformer**
+## API
-    <1> Push asr_convolution_transformer to the phone;
+Please refer to [DEVELOPER.md](DEVELOPER.md#api-usage) for more details.
-    <2> Push the testing sequence data to the phone;
+## Performance Profiling
-    <3> Run asr_convolution_transformer and get the result.
+Bolt provides a program performance visualization interface to help users identify performance bottlenecks.
-    Parameters: bolt_model sequence_directory thread_affinity
+### Visualize an inference program's performance
-Example:
+<1> Edit the [common/cmakes/bolt.cmake](../common/cmakes/bolt.cmake) file to turn on the performance profiling switch *USE_PROFILE*, and recompile the bolt library.
-```shell
-<1> adb push /home/bolt/install_llvm/kits/asr_convolution_transformer /data/local/bolt/bin/asr_convolution_transformer
-<2> adb mkdir /data/local/bolt_data/nlp/asr/asr_convolution_transformer/data
-<3> adb push /home/bolt/model-tools/tools/tensorflow2caffe/asr /data/local/bolt_data/nlp/asr/asr_rnnt/data
-<4> adb shell "./data/local/bolt/bin/asr_convolution_transformer /data/local/bolt_model/caffe/asr_rnnt/asr_convolution_transformer_encoder_f16.bolt /data/local/bolt_data/nlp/asr/asr_convolution_transformer/data CPU_AFFINITY_HIGH_PERFORMANCE"
-<5> adb shell "./data/local/bolt/bin/asr_convolution_transformer /data/local/bolt_model/caffe/asr_rnnt/asr_convolution_transformer_prediction_net.f16.bolt /data/local/bolt_data/nlp/asr/asr_convolution_transformer/data CPU_AFFINITY_HIGH_PERFORMANCE"
-<6> adb shell "./data/local/bolt/bin/asr_convolution_transformer /data/local/bolt_model/caffe/asr_rnnt/asr_convolution_transformer_joint_net_f16.bolt /data/local/bolt_data/nlp/asr/asr_convolution_transformer/data CPU_AFFINITY_HIGH_PERFORMANCE"
+<2> Use the newly generated executable program or library to do inference. Bolt prints performance logs to the command line window or the Android log. Collect the log lines that start with *[PROFILE]*. Here is an example.
+```
+[PROFILE] thread 7738 {"name": "deserialize_model_from_file", "cat": "prepare", "ph": "X", "pid": "0", "tid": "7738", "ts": 1605748035860637, "dur": 9018},
+[PROFILE] thread 7738 {"name": "ready", "cat": "prepare", "ph": "X", "pid": "0", "tid": "7738", "ts": 1605748035889436, "dur": 8460},
+[PROFILE] thread 7738 {"name": "conv1", "cat": "OT_Conv::run", "ph": "X", "pid": "0", "tid": "7738", "ts": 1605748035898106, "dur": 764},
+[PROFILE] thread 7738 {"name": "conv2_1/dw", "cat": "OT_Conv::run", "ph": "X", "pid": "0", "tid": "7738", "ts": 1605748035898876, "dur": 2516},
+```
-```
-    After running, you should be able to see the result of each sub network(encoder, prediction net, joint net), and the execution time.
-
-    Here we explain a little more for some of the parameters.
-
-    - thread_affinity: When it is set to be CPU_AFFINITY_HIGH_PERFORMANCE, Bolt will look for a high-frequency core and bind to it. When it is set to be CPU_AFFINITY_LOW_POWER, Bolt will look for a low-frequency core. If the parameter is missing, the default value is "CPU_AFFINITY_HIGH_PERFORMANCE".
-
-
-### API
-
-Currently, we provide C and Java API. After installation, you can find the API documents docs/API/html/index.html.
-
+<3> Remove the thread-private prefix *[PROFILE] thread 7738* and the comma at the end of each log line, add *[* at the beginning of the file and *]* at the end, and save it as a JSON file. Here is a JSON file example.
+```
+[
+    {"name": "deserialize_model_from_file", "cat": "prepare", "ph": "X", "pid": "0", "tid": "7738", "ts": 1605748035860637, "dur": 9018},
+    {"name": "ready", "cat": "prepare", "ph": "X", "pid": "0", "tid": "7738", "ts": 1605748035889436, "dur": 8460},
+    {"name": "conv1", "cat": "OT_Conv::run", "ph": "X", "pid": "0", "tid": "7738", "ts": 1605748035898106, "dur": 764},
+    {"name": "conv2_1/dw", "cat": "OT_Conv::run", "ph": "X", "pid": "0", "tid": "7738", "ts": 1605748035898876, "dur": 2516}
+]
+```
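+This clean-up is mechanical, so it can be scripted. Below is a minimal sketch (the file names engine.log and profile.json are placeholders):
+```
+# keep only [PROFILE] lines, strip the "thread <tid>" prefix and the
+# trailing comma, then wrap all records in a JSON array
+{
+    echo '['
+    grep '^\[PROFILE\]' engine.log \
+        | sed -e 's/^\[PROFILE\] thread [0-9]* //' -e 's/,[[:space:]]*$//' \
+        | sed '$!s/$/,/'
+    echo ']'
+} > profile.json
+```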
+<4> Open the chrome://tracing extension in the Google Chrome browser and load the JSON file. You can see the program execution time.
+![](/images/PerformanceProfiling.PNG)
 # Advanced Features
-### Graph Optimization
-
-    By default, all graph optimizers that we have implemented are activated during model conversion. In the converters (caffe2bolt, onnx2bolt), you can find a function call:
-    ```c++
-    ms_optimizer.suggest();
-    ```
-    If you wish to turn them off, you can adjust the suggest() function, or simply call:
-    ```c++
-    ms_optimizer.empty();
-    ```
-    However, some of the optimizers are essential, which will be marked with * below.
-
-    - *DeprecatedOpOptimizer: This optimizer removes the deprecated layers from the model
-    - *ConvBNOptimizer: This optimizer folds BN parameters into the weight and bias of convolution.
-    - *BNScaleOptimizer: When a BN layer is not precedented by a convolution layer, we will fold it into the following scale layer.
-    - *ConvScaleOptimizer: This optimizer folds scale parameters into the weight and bias of convolution.
-    - InPlaceOptimizer: If the input and output of a layer are identical in dimensions, they might share the same tensor name. Typical layers include the Activation Layer.
-    - ConvActivationOptimizer: This optimizer fuses convolution and activation layers
-    - *ChannelPaddingOptimizer: This optimizer will pad the output channels to a multiple of 8 for convolution layers. This increases the model compatibility.
-    - DepthwisePointwiseOptimizer: This optimizers fuses depthwise conv and pointwise conv for computation efficiency.
-    - TransposeMulToScaleOptimizer: This is useful for some NLP models.
-    - *MemoryReuseOptimizer: When a feature map tensor is no longer needed as input or output, the storage that it occupies can be reused by other feature maps. This saves on average **two-thirds** of feature map storage for networks that we have tested.
-
-
-
-### INT8 Post-Training Quantization
-
-    If quantization is activated, the second convolution layer will quantize the tensors to 8-bit integers. For now, int8 operators include Convolution, Pooling and Concatenation (end-to-end support for Squeezenet). If your network includes other operators, you may need to add type casting in the front of those operators. The quantization method is symmetrical for both activation and weight.
+## INT8 Post Training Quantization
-    If you want to activate the quantization, pass "INT8_Q" as the precision parameter to caffe2bolt or onnx2bolt during model conversion.
+Operations are smartly quantized, avoiding layers that are critical to accuracy. When possible, GEMM layers (e.g. conv, FC) will directly output int8 tensors so as to save dequantization time. The quantization method is symmetrical for both activation and weight. Please refer to [QUANTIZATION.md](QUANTIZATION.md) for more details.
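+Quantization is selected at model conversion time. The earlier converters took "INT8_Q" as their precision parameter; the sketch below assumes X2bolt accepts the same value through its *-i* flag, and the _int8 output suffix is an assumption mirroring the _f32/_f16 naming:
+```
+adb shell "/data/local/tmp/bolt/tools/X2bolt -d /data/local/tmp/models/mobilenet_v1/ -m mobilenet_v1 -i INT8_Q"
+# expected artifact (assumed naming): mobilenet_v1_int8.bolt
+```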
+## BNN Network Support
+Bolt supports both XNOR-style and DoReFa-style BNN networks. Just save the binary weights as FP32 in an Onnx model, and X2bolt will automatically convert the storage to 1-bit representations. So far, the floating-point portion of the BNN network can only be FP16 operations, so pass "FP16" as the precision parameter to X2bolt. The number of output channels for BNN convolution layers should be divisible by 32.
-### BNN Network Support
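+The conversion call itself is an ordinary X2bolt invocation (a minimal sketch; bnn_model is a placeholder name for an ONNX model whose binary weights are stored as FP32):
+```
+adb shell "/data/local/tmp/bolt/tools/X2bolt -d /data/local/tmp/models/bnn_model/ -m bnn_model -i FP16"
+```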
+## Algorithm Tuning for Key Layers
-    Bolt supports both XNOR-style and DoReFa-style BNN networks. Just save the binary weights as FP32 in an Onnx model, and onnx2bolt will automatically convert the storage to 1-bit representations. So far, the floating-point portion of the BNN network can only be FP16 operations, so pass "FP16" as the precision parameter to onnx2bolt. The number of output channels for BNN convolution layers should be divisible by 32.
-
-
-
-### Layer Performance Benchmark
-
-    If you target device is an Android phone connected to your compilation server, you can call "make test" to run a quick verification test, which runs the [quick_benchmark.sh](../quick_benchmark.sh). For more details, please refer to the individual unit test programs under [tests](../tests).
+Bolt provides the tensor_computing_library_search program for performance tuning of the operator library. Bolt currently supports convolution layer algorithm tuning.
+<1> Push tensor_computing_library_search to the phone;
+```
+adb push /home/bolt/install_arm_gnu/tools/tensor_computing_library_search /data/local/tmp/bolt/tools/tensor_computing_library_search
+```
-### Algorithm Tuning for Key Layers
+<2> Set the Bolt_TensorComputing_LibraryAlgoritmMap shell environment variable;
-    Bolt provides tensor_computing_library_search program for performance tuning of the operator library. Bolt currently supports convolution layer algorithm tuning.
+<3> Run the library tuning program;
-    <1> Push tensor_computing_library_search to the phone;
+```
+adb shell "export Bolt_TensorComputing_LibraryAlgoritmMap=/data/local/tmp/bolt/tensor_computing_library_algorithm_map.txt && /data/local/tmp/bolt/tools/tensor_computing_library_search"
+```
-    <2> Set Bolt_TensorComputing_LibraryAlgoritmMap shell environment variable
+After running, you should be able to get the algorithm map file on the device.
-    <3> Run library tuning program.
-
-    <4> Use *CONVOLUTION_LIBRARY_SEARCH* convolution policy during model inference.
+<4> Use the *CONVOLUTION_LIBRARY_SEARCH* convolution policy during model inference.
-Example:
+Modify the convolution algorithm search policy in [inference/engine/include/cpu/convolution_cpu.hpp](../inference/engine/include/cpu/convolution_cpu.hpp).
-```shell
-<1> adb push /home/bolt/inference/tools/tensor_computing_library_search /data/local/bolt/tools/tensor_computing_library_search
-<2> adb shell "export Bolt_TensorComputing_LibraryAlgoritmMap=/data/local/bolt/tensor_computing_library_algorithm_map.txt && ./data/local/bolt/tools/tensor_computing_library_search"
-```
+## Time-Series Data Acceleration
-    After running, you should be able to get algorithm map file on device.
+Flow is the time-series data acceleration module for Bolt. Flow simplifies the application development process. Flow uses a graph as an abstraction of application deployment, and each stage (function) is viewed as a node. A node can do data preprocessing, deep learning inference or result postprocessing. Separate feature extraction can also be abstracted as a node. The bridging entity between functions is data (a tensor), which can be represented as an edge.
+Flow provides flexible CPU multi-core parallelism and heterogeneous scheduling (CPU + GPU). Users don't need to pay excessive attention to heterogeneous management or write lots of non-reusable code to implement a heterogeneous application, and they can get the best end-to-end performance with the help of Flow. Flow supports data parallelism and subgraph parallelism, with a simple API.
+More usage information can be found in [DEVELOPER.md](docs/DEVELOPER.md#time-series-data-acceleration-by-using-flow).
 # Feedback
-    If you have encountered any difficulty, feel free to reach out to us by summitting issues. You are also encouraged to contribute your implementations. Please refer to [DEVELOPER.md](DEVELOPER.md).
+If you have encountered any difficulty, feel free to reach out to us by submitting issues. You are also encouraged to contribute your implementations. Please refer to [DEVELOPER.md](DEVELOPER.md).
diff --git a/docs/images/ADB.PNG b/docs/images/ADB.PNG
new file mode 100644
index 00000000..a7a07786
Binary files /dev/null and b/docs/images/ADB.PNG differ
diff --git a/docs/images/Framework.PNG b/docs/images/Framework.PNG
new file mode 100644
index 00000000..f51360fa
Binary files /dev/null and b/docs/images/Framework.PNG differ
diff --git a/docs/images/GNU.PNG b/docs/images/GNU.PNG
new file mode 100644
index 00000000..5729d380
Binary files /dev/null and b/docs/images/GNU.PNG differ
diff --git a/docs/images/JDK.PNG b/docs/images/JDK.PNG
new file mode 100644
index 00000000..17543fc8
Binary files /dev/null and b/docs/images/JDK.PNG differ
diff --git a/docs/images/ModelConversion.PNG b/docs/images/ModelConversion.PNG
new file mode 100644
index 00000000..26e96bcc
Binary files /dev/null and b/docs/images/ModelConversion.PNG differ
diff --git a/docs/images/NDK.PNG b/docs/images/NDK.PNG
new file mode 100644
index 00000000..22fd45df
Binary files /dev/null and b/docs/images/NDK.PNG differ
diff --git a/docs/images/PerformanceProfiling.PNG b/docs/images/PerformanceProfiling.PNG
new file mode 100644
index 00000000..60a178f2
Binary files /dev/null and b/docs/images/PerformanceProfiling.PNG differ
diff --git a/docs/images/QuickStart.PNG b/docs/images/QuickStart.PNG
new file mode 100644
index 00000000..5bd4bfc0
Binary files /dev/null and b/docs/images/QuickStart.PNG differ
diff --git a/docs/images/cmake.PNG b/docs/images/cmake.PNG
new file mode 100644
index 00000000..079e603a
Binary files /dev/null and b/docs/images/cmake.PNG differ
diff --git a/docs/images/dx.PNG b/docs/images/dx.PNG
new file mode 100644
index 00000000..de0410ff
Binary files /dev/null and b/docs/images/dx.PNG differ
diff --git a/docs/images/make.PNG b/docs/images/make.PNG
new file mode 100644
index 00000000..29a68c42
Binary files /dev/null and b/docs/images/make.PNG differ
diff --git a/gcl/include/context.h b/gcl/include/context.h
deleted file mode 100644
index 91d3bdcb..00000000
--- a/gcl/include/context.h
+++ /dev/null
@@ -1,167 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - -#ifndef _H_CONTEXT -#define _H_CONTEXT - -#ifdef __cplusplus -extern "C" { -#endif - - /** - * @brief create OpenCL Context based on platform - * - * @param platform input, context will be created on this platform - * @param num_devices input, context will be created on num_devices Device - * @param devices input, context created contains devices - * @param context output, return context created - * - * @return - * - */ - inline EE create_context(Platform platform, - U32 num_devices, Device *devices, - Context *context) { - if(NULL == context) return NULL_POINTER; - - I32 ret; - cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0}; - *context = clCreateContext(properties, num_devices, devices, NULL, NULL, &ret); - map_cl_error_2_ee(ret); - } - - - /** - * @brief get context information - * - * @warning please free the memory allocate by this function - **/ - inline EE get_context_info(Context context, cl_context_info info, - void** value, U32 *len) { - if(NULL == value) return NULL_POINTER; - - size_t size; - I32 ret = clGetContextInfo(context, info, 0, NULL, &size); - if(CL_SUCCESS == ret) { - if(NULL == len) *len = size; - void* data = malloc(size); - if(NULL == data) return ALLOC_FAILED; - ret = clGetContextInfo(context, info, size, data, NULL); - if(CL_SUCCESS == ret) { *value = data; } else { free(data); } - } - - map_cl_error_2_ee(ret); - } - - inline EE retain_context(Context context) { - I32 ret = clRetainContext(context); - map_cl_error_2_ee(ret); - } - - inline EE release_context(Context context) { - I32 ret = clReleaseContext(context); - map_cl_error_2_ee(ret); - } - - inline EE create_command_queue_properties(Context context, Device device, - cl_queue_properties* properties, CommandQueue* queue) { - if(NULL == queue) return NULL_POINTER; - I32 ret; - *queue = clCreateCommandQueueWithProperties(context, device, properties, &ret); - map_cl_error_2_ee(ret); - } -/* - inline EE create_command_queue(Context context, Device device, - cl_command_queue_properties properties, CommandQueue* queue) { - if(NULL == queue) return NULL_POINTER; - I32 ret; - *queue = clCreateCommandQueue(context, device, properties, &ret); - map_cl_error_2_ee(ret); - } -*/ - /** - * @brief get information of command queue - * - * @warning please free memory associated with value - * - **/ - inline EE get_command_queue_info(CommandQueue queue, - cl_command_queue_info info, - void** value, size_t *len) { - if(NULL == value) 
return NULL_POINTER; - - size_t size; - I32 ret = clGetCommandQueueInfo(queue, info, 0, NULL, &size); - if(CL_SUCCESS == ret) { - if(NULL != len) *len = size; - void* data = malloc(size); - if(NULL == data) return ALLOC_FAILED; - ret = clGetCommandQueueInfo(queue, info, size, data, NULL); - if(CL_SUCCESS == ret) { *value = data; } else { free(data); } - } - - map_cl_error_2_ee(ret); - } - - /** - * @brief get context of command queue - * - **/ - inline EE command_queue_get_context(CommandQueue queue, Context *context) { - if(NULL == context) return NULL_POINTER; - I32 ret = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(Context), context, NULL); - map_cl_error_2_ee(ret); - } - - /** - * @brief get device of command queue - * - **/ - inline EE command_queue_get_device(CommandQueue queue, Device *device) { - if(NULL == device) return NULL_POINTER; - I32 ret = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(Device), device, NULL); - map_cl_error_2_ee(ret); - } - - inline EE retain_command_queue(CommandQueue queue) { - I32 ret = clRetainCommandQueue(queue); - map_cl_error_2_ee(ret); - } - - inline EE release_command_queue(CommandQueue queue) { - I32 ret = clReleaseCommandQueue(queue); - map_cl_error_2_ee(ret); - } - - /** - * @brief flush command queue, issue all command to execuate - **/ - inline EE flush(CommandQueue queue) { - I32 ret = clFlush(queue); - map_cl_error_2_ee(ret); - } - - /** - * @brief wait all commands finish - **/ - inline EE finish (CommandQueue queue) { - I32 ret = clFinish(queue); - map_cl_error_2_ee(ret); - } - -#ifdef __cplusplus -} -#endif -#endif diff --git a/gcl/include/event.h b/gcl/include/event.h deleted file mode 100644 index b9931755..00000000 --- a/gcl/include/event.h +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - - -#ifndef EVENT_H_ -#define EVENT_H_ - -#ifdef __cplusplus -extern "C" { -#endif - - /** - * @brief wait for event to complete - **/ - inline EE wait_events(U32 num_events, const Event *event_list) { - I32 ret = clWaitForEvents(num_events, event_list); - map_cl_error_2_ee(ret); - } - - /** - * @brief get informaiton about event - * - * @warning please free memory associated with value - **/ - inline EE get_event_info(cl_event event, cl_event_info info, void* *value, size_t *size) { - size_t len; - I32 ret = clGetEventInfo(event, info, 0, NULL, &len); - if(CL_SUCCESS == ret){ - if(NULL != size) *size = len; - void* data = malloc(len); - if(NULL == data) return ALLOC_FAILED; - ret = clGetEventInfo(event, info, len, data, NULL); - if(CL_SUCCESS == ret) { *value = data; } else { free(data); } - } - - map_cl_error_2_ee(ret); - } - - - /** - * @brief increase reference count of event - **/ - inline EE retain_event(Event event) { - I32 ret = clRetainEvent(event); - map_cl_error_2_ee(ret); - } - - inline EE release_event(Event event) { - I32 ret = clReleaseEvent(event); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_barrier_wait_lists(CommandQueue queue, - U32 num_wait_events, - const Event *wait_events, Event *event) { - I32 ret = clEnqueueBarrierWithWaitList(queue, num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - - inline EE event_counting_time(Event* event, double* t_queue, double* t_submit, double* t_start, double* t_end, double* t_execute){ - cl_ulong queued, submit, start, end; - CHECK_STATUS(wait_events(1, event)); - I32 ret; - ret = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &queued, NULL); - if(ret) map_cl_error_2_ee(ret); - ret = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit, NULL); - if(ret) map_cl_error_2_ee(ret); - ret = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); - if(ret) map_cl_error_2_ee(ret); - ret = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); - if(ret) map_cl_error_2_ee(ret); - - double t0, t1, t2, t3, t4; - t0 = (double)(queued) * 1e-03; - t1 = (double)(submit) * 1e-03; - t2 = (double)(start) * 1e-03; - t3 = (double)(end) * 1e-03; - t4 = ((double)(end) - (double)(start)) * 1e-03; - - if(t_queue) *t_queue = t0; - if(t_submit) *t_submit = t1; - if(t_start) *t_start = t2; - if(t_end) *t_end = t3; - if(t_execute) *t_execute = t4; - return SUCCESS; - } - /** - * @brief get profiling information - **/ - inline EE event_get_profiling_info(Event event, cl_profiling_info info, - void* *value, size_t *size) { - size_t len; - I32 ret = clGetEventProfilingInfo(event, info, 0, NULL, &len); - if(CL_SUCCESS == ret) { - if(NULL != size) *size = len; - void* data = malloc(len); - if(NULL == data) return ALLOC_FAILED; - ret = clGetEventProfilingInfo(event, info, len, data, NULL); - if(CL_SUCCESS == ret) { *value = data; } else { free(data); } - } - - map_cl_error_2_ee(ret); - } - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/gcl/include/gcl_common.h b/gcl/include/gcl_common.h deleted file mode 100644 index d95035ad..00000000 --- a/gcl/include/gcl_common.h +++ /dev/null @@ -1,257 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - -#ifndef H_GCL_COMMON -#define H_GCL_COMMON -#define CL_TARGET_OPENCL_VERSION 200 - -#include "type.h" -#include "error.h" -#include "tensor_desc.h" -#include "CL/cl.h" -#include -#include -#include -#include -#include -/** - * @file - */ -#define ERROR_CASE(x) case x: return(#x) - -#ifdef __cplusplus -extern "C" { -#endif - - typedef cl_platform_id Platform; - typedef cl_device_id Device; - typedef cl_context Context; - typedef cl_command_queue CommandQueue; - typedef cl_program Program; - typedef cl_mem Mem; - typedef cl_sampler Sampler; - typedef cl_kernel Kernel; - typedef cl_event Event; - typedef cl_mem_flags MemFlags; - typedef cl_image_format ImgFormat; - - inline CI8* map_cl_error_2_string(cl_int err){ - switch(err) - { - ERROR_CASE(CL_SUCCESS ); - ERROR_CASE(CL_DEVICE_NOT_FOUND ); - ERROR_CASE(CL_DEVICE_NOT_AVAILABLE ); - ERROR_CASE(CL_COMPILER_NOT_AVAILABLE ); - ERROR_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE ); - ERROR_CASE(CL_OUT_OF_RESOURCES ); - ERROR_CASE(CL_OUT_OF_HOST_MEMORY ); - ERROR_CASE(CL_PROFILING_INFO_NOT_AVAILABLE ); - ERROR_CASE(CL_MEM_COPY_OVERLAP ); - ERROR_CASE(CL_IMAGE_FORMAT_MISMATCH ); - ERROR_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED ); - ERROR_CASE(CL_BUILD_PROGRAM_FAILURE ); - ERROR_CASE(CL_MAP_FAILURE ); -#ifdef CL_VERSION_1_1 - ERROR_CASE(CL_MISALIGNED_SUB_BUFFER_OFFSET ); - ERROR_CASE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); -#endif -#ifdef CL_VERSION_1_2 - ERROR_CASE(CL_COMPILE_PROGRAM_FAILURE ); - ERROR_CASE(CL_LINKER_NOT_AVAILABLE ); - ERROR_CASE(CL_LINK_PROGRAM_FAILURE ); - ERROR_CASE(CL_DEVICE_PARTITION_FAILED ); - ERROR_CASE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE ); -#endif - ERROR_CASE(CL_INVALID_VALUE ); - ERROR_CASE(CL_INVALID_DEVICE_TYPE ); - ERROR_CASE(CL_INVALID_PLATFORM ); - ERROR_CASE(CL_INVALID_DEVICE ); - ERROR_CASE(CL_INVALID_CONTEXT ); - ERROR_CASE(CL_INVALID_QUEUE_PROPERTIES ); - ERROR_CASE(CL_INVALID_COMMAND_QUEUE ); - ERROR_CASE(CL_INVALID_HOST_PTR ); - ERROR_CASE(CL_INVALID_MEM_OBJECT ); - ERROR_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR ); - ERROR_CASE(CL_INVALID_IMAGE_SIZE ); - ERROR_CASE(CL_INVALID_SAMPLER ); - ERROR_CASE(CL_INVALID_BINARY ); - ERROR_CASE(CL_INVALID_BUILD_OPTIONS ); - ERROR_CASE(CL_INVALID_PROGRAM ); - ERROR_CASE(CL_INVALID_PROGRAM_EXECUTABLE ); - ERROR_CASE(CL_INVALID_KERNEL_NAME ); - ERROR_CASE(CL_INVALID_KERNEL_DEFINITION ); - ERROR_CASE(CL_INVALID_KERNEL ); - ERROR_CASE(CL_INVALID_ARG_INDEX ); - ERROR_CASE(CL_INVALID_ARG_VALUE ); - ERROR_CASE(CL_INVALID_ARG_SIZE ); - 
ERROR_CASE(CL_INVALID_KERNEL_ARGS ); - ERROR_CASE(CL_INVALID_WORK_DIMENSION ); - ERROR_CASE(CL_INVALID_WORK_GROUP_SIZE ); - ERROR_CASE(CL_INVALID_WORK_ITEM_SIZE ); - ERROR_CASE(CL_INVALID_GLOBAL_OFFSET ); - ERROR_CASE(CL_INVALID_EVENT_WAIT_LIST ); - ERROR_CASE(CL_INVALID_EVENT ); - ERROR_CASE(CL_INVALID_OPERATION ); - ERROR_CASE(CL_INVALID_GL_OBJECT ); - ERROR_CASE(CL_INVALID_BUFFER_SIZE ); - ERROR_CASE(CL_INVALID_MIP_LEVEL ); - ERROR_CASE(CL_INVALID_GLOBAL_WORK_SIZE ); -#ifdef CL_VERSION_1_1 - ERROR_CASE(CL_INVALID_PROPERTY ); -#endif -#ifdef CL_VERSION_1_2 - ERROR_CASE(CL_INVALID_IMAGE_DESCRIPTOR ); - ERROR_CASE(CL_INVALID_COMPILER_OPTIONS ); - ERROR_CASE(CL_INVALID_LINKER_OPTIONS ); - ERROR_CASE(CL_INVALID_DEVICE_PARTITION_COUNT ); -#endif -#ifdef CL_VERSION_2_0 - ERROR_CASE(CL_INVALID_PIPE_SIZE ); - ERROR_CASE(CL_INVALID_DEVICE_QUEUE ); -#endif -#ifdef CL_VERSION_2_2 - ERROR_CASE(CL_INVALID_SPEC_ID ); - ERROR_CASE(CL_MAX_SIZE_RESTRICTION_EXCEEDED ); -#endif - - default: - return"CL_UNKNOW_ERROR"; - } - } - -#define map_cl_error_2_ee(err)\ - {\ - if(err == 0) return SUCCESS;\ - std::cout << "GCLAPI error in: " << std::endl;\ - std::cout << "File: " << __FILE__ << std::endl;\ - std::cout << "Line: " << __LINE__ << std::endl;\ - std::cout << "Func name is: " << __func__ << std::endl;\ - std::cout << "GCLERROR = " << map_cl_error_2_string(err) << std::endl;\ - return GCL_ERROR;\ - } - - inline EE has_dedicated_local(Device device, I32 *b) { - void* value; - I32 ret = clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(void*), &value, nullptr); - if(CL_SUCCESS == ret) *b = (*((cl_device_local_mem_type*)value) == CL_LOCAL); - free(value); - map_cl_error_2_ee(ret); - } - - -/** -*@ enum define -**/ -typedef enum{ - GCL_MEM_BUF = 0, - GCL_MEM_IMG_1D = 1, - GCL_MEM_IMG_2D = 2, - GCL_MEM_IMG_3D = 3 -}GCLMemType; - -typedef enum{ - HOST_TO_DEVICE_BUF = 0, - HOST_TO_DEVICE_IMG = 1, - DEVICE_BUF_TO_HOST = 2, - DEVICE_IMG_TO_HOST = 3, - DEVICE_BUF_TO_BUF = 4, - DEVICE_BUF_TO_IMG = 5, - DEVICE_IMG_TO_BUF = 6, - DEVICE_IMG_TO_IMG = 7 -}GCLMemTransType; -/** -*@ struct define -**/ -struct GCLKernelInfo{ - Kernel kernel = NULL; - U32 dim = 0; - U32 gs[3] = {0}; - U32 ls[3] = {0}; - std::string name; -}; - -struct GCLKernelBin{ - CU8* data; - CU32 len; -}; - -struct GCLHandle{ - Platform* platforms; - U32 numPlatform; - U32 platformId; - - Device* devices; - U32 numDevice; - U32 deviceId; - cl_device_type deviceType; - - Context context; - CommandQueue queue; - CommandQueue queue_profiling; - - cl_command_queue_properties queueProperties; - Event eventObj; - Event* eventPtr; - U32 numWaitEvents; - Event* waitEvents; - double t_execute; - double t_total; - - std::string deviceBinmapName; - std::unordered_map* binMapPtr; - std::map kernelMap; - std::vector kernelVec; - std::string curOpName; -}; - -typedef struct GCLHandle* GCLHandle_t; - -struct GCLHandleConfig{ - CI8* deviceBinmapName; -}; - -typedef GCLHandleConfig* GCLHandleConfig_t; - -struct GCLMemDesc{ - U32 stride[3]; - U32 offset[3]; - GCLMemType memType; - DataFormat memFormat; - U32 byteSize; - U32 num; - MemFlags flags; - ImgFormat imgFormat; - void* host_ptr; - U8* map_ptr; - bool use_map; - bool has_alloc; -}; -typedef struct GCLMemDesc* GCLMemDesc_t; -struct GCLMem{ - Mem mem; - GCLMemDesc desc; - std::vector subMem; - std::vector mapPtrArray; -}; -typedef struct GCLMem* GCLMem_t; - - - - -#ifdef __cplusplus -} -#endif -#endif diff --git a/gcl/include/gcl_func.h b/gcl/include/gcl_func.h deleted file mode 100644 index 90e9f579..00000000 
--- a/gcl/include/gcl_func.h +++ /dev/null @@ -1,1232 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - -#ifndef H_GCL_FUNC -#define H_GCL_FUNC - -#include -#include "gcl_common.h" -#include "platform.h" -#include "context.h" -#include "program.h" -#include "memory.h" -#include "kernel.h" -#include "event.h" -#include "gcl_kernel_binmap.h" -#include -#ifdef __cplusplus -extern "C" { -#endif - inline EE gcl_get_device_name(GCLHandle_t handle) { - cl_device_id device = handle->devices[handle->deviceId]; - U32 len; - I8* data; - CHECK_STATUS(get_device_info(device, CL_DEVICE_NAME, (void**)&data, &len)); - I8 devName[64]; - for(U32 i = 0; i < len - 1; i++) { - if(data[i] == '-') { - data[i] = '_'; - } - devName[i] = data[i]; - } - U32 version_len; - free(data); - CHECK_STATUS(get_device_info(device, CL_DEVICE_VERSION, (void**)&data, &version_len)); - std::string deviceV = std::string(data); - U32 be = deviceV.find("r"); - U32 end = deviceV.find("p", be + 1); - std::string numV = deviceV.substr(be + 1, end - be - 1); - U32 i = atoi(numV.c_str()); - if(i >= 14) { - devName[len - 1] = 'p'; - devName[len] = '\0'; - } else { - devName[len - 1] = '\0'; - } - free(data); - handle->deviceBinmapName = devName; - return SUCCESS; - } - - inline EE gcl_create_handle(GCLHandle_t* handlePtr) { - - if(handlePtr == NULL) { - printf("the handlePtr set to gcl_create_handle is NULL\n"); - return NULL_POINTER; - } - GCLHandle_t handle= new GCLHandle(); - handle->platformId = 0; - handle->deviceId = 0; - handle->deviceType = CL_DEVICE_TYPE_GPU; - handle->eventPtr = NULL; - handle->numWaitEvents = 0; - handle->waitEvents = NULL; - handle->t_execute = 0; - handle->t_total = 0; - handle->curOpName = "unknow"; - U32 platformId = handle->platformId; - U32 deviceId = handle->deviceId; - CHECK_STATUS(get_platforms(&handle->numPlatform, &handle->platforms)); - CHECK_STATUS(platform_get_devices(handle->platforms[platformId], - handle->deviceType, - &handle->numDevice, - &handle->devices)); - CHECK_STATUS(create_context(handle->platforms[platformId], - handle->numDevice, - handle->devices, - &handle->context)); - cl_queue_properties props[]={CL_QUEUE_PROPERTIES, 0, 0}; -#ifdef _DEBUG - handle->queueProperties = CL_QUEUE_PROFILING_ENABLE; - handle->eventPtr = &handle->eventObj; - props[1] = props[1] | CL_QUEUE_PROFILING_ENABLE; -#endif - CHECK_STATUS(create_command_queue_properties(handle->context, - handle->devices[deviceId], - props, - 
&handle->queue)); - CHECK_STATUS(gcl_get_device_name(handle)); - *handlePtr = handle; - return SUCCESS; - } - - inline EE gcl_create_handle_profiling(GCLHandle_t* handlePtr) { - - if(handlePtr == NULL) { - printf("the handlePtr set to gcl_create_handle is NULL\n"); - return NULL_POINTER; - } - GCLHandle_t handle= new GCLHandle(); - handle->platformId = 0; - handle->deviceId = 0; - handle->deviceType = CL_DEVICE_TYPE_GPU; - handle->eventPtr = NULL; - handle->numWaitEvents = 0; - handle->t_execute = 0; - handle->t_total = 0; - handle->curOpName = "unknow"; - U32 platformId = handle->platformId; - U32 deviceId = handle->deviceId; - CHECK_STATUS(get_platforms(&handle->numPlatform, &handle->platforms)); - CHECK_STATUS(platform_get_devices(handle->platforms[platformId], - handle->deviceType, - &handle->numDevice, - &handle->devices)); - CHECK_STATUS(create_context(handle->platforms[platformId], - handle->numDevice, - handle->devices, - &handle->context)); - cl_queue_properties props[]={CL_QUEUE_PROPERTIES, 0, 0}; - handle->queueProperties = CL_QUEUE_PROFILING_ENABLE; - handle->eventPtr = &handle->eventObj; - props[1] = props[1] | CL_QUEUE_PROFILING_ENABLE; - CHECK_STATUS(create_command_queue_properties(handle->context, - handle->devices[deviceId], - props, - &handle->queue)); - CHECK_STATUS(gcl_get_device_name(handle)); - *handlePtr = handle; - return SUCCESS; - } - - inline void gcl_destroy_handle(GCLHandle_t handle) { - U32 deviceId = handle->deviceId; - CHECK_STATUS(finish(handle->queue)); - for(auto k : handle->kernelMap) CHECK_STATUS(release_kernel(k.second)); - for(auto k : handle->kernelVec) CHECK_STATUS(release_kernel(k.kernel)); - handle->kernelMap.clear(); - handle->kernelVec.clear(); - CHECK_STATUS(release_command_queue(handle->queue)); - CHECK_STATUS(release_context(handle->context)); - CHECK_STATUS(release_device(handle->devices[deviceId])); - free(handle->devices); - free(handle->platforms); - delete handle; - } - - inline EE gcl_create_queue_profiling(GCLHandle_t handle) { - - cl_queue_properties props[]={CL_QUEUE_PROPERTIES, 0, 0}; - handle->eventPtr = &handle->eventObj; - props[1] = props[1] | CL_QUEUE_PROFILING_ENABLE; - CHECK_STATUS(create_command_queue_properties(handle->context, - handle->devices[handle->deviceId], - props, - &handle->queue_profiling)); - return SUCCESS; - } - - inline EE gcl_destroy_queue_profiling(GCLHandle_t handle) { - CHECK_STATUS(finish(handle->queue_profiling)); - CHECK_STATUS(release_command_queue(handle->queue_profiling)); - handle->eventPtr = NULL; - return SUCCESS; - } - - inline EE gcl_regist_binMap(GCLHandle_t handle){ - gcl_kernel_binmap_factory::instance()->create_gcl_kernel_binmap(handle->deviceBinmapName); - gcl_kernel_binmap* kernel_binmap; - U32 EE = gcl_kernel_binmap_container::instance()->get(handle->deviceBinmapName, &kernel_binmap); - if(EE == NULL_POINTER) { - DEBUG_info("warning: the kernel binmap is not found"); - } else { - handle->binMapPtr = &kernel_binmap->binMap(); - } - return SUCCESS; - } - - inline GCLMemDesc gcl_mem_desc(U32 stride[], U32 offset[], DataType dt, DataFormat memFormat){ - GCLMemDesc desc; - U32 s0, s1, s2; - s0 = stride[0]; - s1 = stride[1]; - s2 = stride[2]; - desc.stride[0] = s0; - desc.stride[1] = s1; - desc.stride[2] = s2; - desc.offset[0] = offset[0]; - desc.offset[1] = offset[1]; - desc.offset[2] = offset[2]; - desc.memFormat = memFormat; - desc.memType = GCL_MEM_BUF; - desc.num = s0 * s1 * s2; - desc.byteSize = s0 * s1 * s2 * bytesOf(dt); - desc.flags = CL_MEM_READ_WRITE; - desc.host_ptr = NULL; - 
desc.imgFormat.image_channel_order = CL_RGBA; - desc.imgFormat.image_channel_data_type = CL_HALF_FLOAT; - desc.use_map = false; - desc.map_ptr = NULL; - desc.has_alloc = false; - return desc; - } - - - inline GCLMem_t gcl_create_gclmem(){ - GCLMem_t ret = new GCLMem; - ret->mem = NULL; - U32 str[3] = {0, 0, 0}; - U32 off[3] = {0, 0, 0}; - ret->desc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); - return ret; - } - - inline EE gcl_release_memory(GCLMem_t gclMem) { - if(gclMem->mem) { - if(gclMem->subMem.size()) { - for(auto p: gclMem->subMem) CHECK_STATUS(release_memory(p)); - gclMem->subMem.clear(); - } - CHECK_STATUS(release_memory(gclMem->mem)); - gclMem->mem = NULL; - gclMem->desc.has_alloc = false; - } - return SUCCESS; - } - - inline void gcl_destroy_gclmem(GCLMem_t mem){ - CHECK_STATUS(gcl_release_memory(mem)); - delete mem; - } - - inline EE gcl_finish(GCLHandle_t handle) { - CHECK_STATUS(finish(handle->queue)); - return SUCCESS; - } - - - inline EE gcl_unmap_memory(GCLHandle_t handle, GCLMem_t gclMem) - { - for(auto p : gclMem->mapPtrArray) { - CHECK_STATUS(enqueue_unmap_memory(handle->queue, gclMem->mem, (void*)p, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); -#ifdef _DEBUG - DEBUG_info_s("DATAUNMAP>>> enqueue_unmap_memory runInfo:"); - double executeTime = 0; - CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); - CHECK_STATUS(release_event(handle->eventObj)); - DEBUG_info("executeTime = " << executeTime << " us"); - CHECK_STATUS(gcl_finish(handle)); -#endif - } - gclMem->mapPtrArray.clear(); - return SUCCESS; - - } - - inline EE gcl_produce_program_kernel_with_source(GCLHandle_t handle, - U32* len, - CI8* src, - CI8* option, - Program* program, - U32 numKernel, - Kernel* kernels) { - U32 deviceId = handle->deviceId; - CHECK_STATUS(create_build_program_from_source(handle->context, len, src, handle->devices[deviceId], option, program)); - CHECK_STATUS(create_kernels_in_program(*program, numKernel, kernels)); - return SUCCESS; - } - - inline EE gcl_get_program_info(Program program, - U8** binary, - U32* len) { - CHECK_STATUS(get_program_binary(program, binary, len)); - return SUCCESS; - } - - inline EE gcl_kernelmap_put(GCLHandle_t handle, - std::string kernelName, - Kernel kernel) { - handle->kernelMap.insert(std::pair(kernelName, kernel)); - return SUCCESS; - } - - inline Kernel gcl_kernelmap_get(GCLHandle_t handle, - std::string kernelName) { - auto it = handle->kernelMap.find(std::string(kernelName)); - if(it == handle->kernelMap.end()) CHECK_STATUS(NOT_MATCH); - return it->second; - } - - inline EE gcl_create_kernel_binary(GCLHandle_t handle, - CI8* kernelName, - Kernel* kernel) { - - std::string binmapname = handle->deviceBinmapName; - std::string binmap_kernelname = binmapname + "_" + std::string(kernelName); - auto binMapPtr = handle->binMapPtr; - auto it = binMapPtr->find(binmap_kernelname); - if(it == binMapPtr->end()) { - DEBUG_info("get kernel " << kernelName << " failed"); - return NULL_POINTER; - } - - U32 length = it->second.len; - CU8* data = it->second.data; - I32 binsta; - Program program; - CI8* options = ""; - Device device = handle->devices[handle->deviceId]; - CHECK_STATUS(create_program_from_binary(handle->context, device, &length, &data, &binsta, &program)); - CHECK_STATUS(build_program(program, device, options)); - CHECK_STATUS(create_kernel(program, kernelName, kernel)); - CHECK_STATUS(release_program(program)); - return SUCCESS; - } - - inline EE gcl_get_kernel_from_map(GCLHandle_t handle, - CI8* 
kernelName, - Kernel* kernel) { - std::string binmapname = handle->deviceBinmapName; - std::string binmap_kernelname = binmapname + "_" + std::string(kernelName); - if(handle->kernelMap.find(binmap_kernelname) == handle->kernelMap.end()) { - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelName, kernel)) - CHECK_STATUS(gcl_kernelmap_put(handle, binmap_kernelname, *kernel)); - } else { - *kernel = gcl_kernelmap_get(handle, binmap_kernelname); - } - return SUCCESS; - } - - - inline EE gcl_set_kernelVec(GCLHandle_t handle, - Kernel kernel, - U32 work_dim, - U32 global_work_size[], - U32 local_work_size[], - CI8* kernelName = NULL) { - GCLKernelInfo kernelInfo; - kernelInfo.kernel = kernel; - kernelInfo.dim = work_dim; - kernelInfo.name = handle->curOpName + "_" + std::string(kernelName); - switch(work_dim) { - case 1: { - kernelInfo.gs[0] = global_work_size[0]; - kernelInfo.gs[1] = 1; - kernelInfo.gs[2] = 1; - kernelInfo.ls[0] = local_work_size[0]; - kernelInfo.ls[1] = 0; - kernelInfo.ls[2] = 0; - break;} - case 2: { - kernelInfo.gs[0] = global_work_size[0]; - kernelInfo.gs[1] = global_work_size[1]; - kernelInfo.gs[2] = 1; - kernelInfo.ls[0] = local_work_size[0]; - kernelInfo.ls[1] = local_work_size[1]; - kernelInfo.ls[2] = 0; - break;} - case 3: { - kernelInfo.gs[0] = global_work_size[0]; - kernelInfo.gs[1] = global_work_size[1]; - kernelInfo.gs[2] = global_work_size[2]; - kernelInfo.ls[0] = local_work_size[0]; - kernelInfo.ls[1] = local_work_size[1]; - kernelInfo.ls[2] = local_work_size[2]; - break;} - default: - return NOT_SUPPORTED; - } - handle->kernelVec.push_back(kernelInfo); - return SUCCESS; - } - - inline EE gcl_run_kernelVec(GCLHandle_t handle) { - U32 len = handle->kernelVec.size(); - CommandQueue queue = handle->queue; - U32 numWaitEvents = handle->numWaitEvents; - Event* waitEvents = handle->waitEvents; - Event* eventPtr = handle->eventPtr; - for(U32 i = 0 ; i < len; ++i) { - auto kernelInfo = handle->kernelVec[i]; - CHECK_STATUS(enqueue_ndrange_kernel(queue, kernelInfo.kernel, kernelInfo.dim, NULL, - kernelInfo.gs, kernelInfo.ls, numWaitEvents, waitEvents, eventPtr)); -#ifdef _DEBUG - DEBUG_info_s("KERNEL>>> " << kernelInfo.name << " runInfo:"); - double executeTime = 0; - CHECK_STATUS(event_counting_time(eventPtr, NULL, NULL, NULL, NULL, &executeTime)); - CHECK_STATUS(release_event(*eventPtr)); - handle->t_execute = executeTime; - DEBUG_info("executeTime = " << executeTime << " us"); - CHECK_STATUS(gcl_finish(handle)); -#endif - } - return SUCCESS; - } - - inline EE gcl_run_kernelVec_timing(GCLHandle_t handle, U32 be, U32 end, std::vector *kernelArrayTime = NULL) { - if(handle->queueProperties & CL_QUEUE_PROFILING_ENABLE) { - double executeTime = 0; - double totalTime = 0; - CommandQueue queue = handle->queue; - U32 numWaitEvents = handle->numWaitEvents; - Event* waitEvents = handle->waitEvents; - Event* eventPtr = handle->eventPtr; - for(U32 i = be ; i < end; ++i) { - auto kernelInfo = handle->kernelVec[i]; - CHECK_STATUS(enqueue_ndrange_kernel(queue, kernelInfo.kernel, kernelInfo.dim, NULL, - kernelInfo.gs, kernelInfo.ls, numWaitEvents, waitEvents, eventPtr)); - CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); - CHECK_STATUS(release_event(handle->eventObj)); - totalTime += executeTime; - if(kernelArrayTime) (*kernelArrayTime).push_back(executeTime); - } - handle->t_execute = totalTime; - return SUCCESS; - } - return NOT_SUPPORTED; - } - - inline EE gcl_clean_kernelVec(GCLHandle_t handle) { - for(auto k : handle->kernelVec) 
CHECK_STATUS(release_kernel(k.kernel)); - handle->kernelVec.clear(); - return SUCCESS; - } - - inline EE gcl_run_kernel(GCLHandle_t handle, Kernel kernel, U32 work_dim, U32* gs, U32* ls, CI8* kernelName = NULL) { -#ifdef _DEBUG - std::string name = "unknown kernel"; - if(kernelName) name = handle->curOpName + "_" + std::string(kernelName); - DEBUG_info_s("KERNEL>>> " << name.c_str() << " runInfo:"); -#endif - CHECK_STATUS(enqueue_ndrange_kernel(handle->queue, kernel, work_dim, - NULL, gs, ls, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); -#ifdef _DEBUG - double executeTime = 0; - CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); - CHECK_STATUS(release_event(handle->eventObj)); - handle->t_execute = executeTime; - DEBUG_info("executeTime = " << executeTime << " us"); - CHECK_STATUS(gcl_finish(handle)); -#else - UNUSED(kernelName); -#endif - return SUCCESS; - } - - inline U32 get_next_ls_size(U32 ls_size) { - return (ls_size << 1); - } - inline EE gcl_run_kernel_select_ls(GCLHandle_t handle, GCLKernelInfo* kernelInfo) { - auto kernel = kernelInfo->kernel; - auto work_dim = kernelInfo->dim; - auto gs = kernelInfo->gs; - double minTime = DBL_MAX; - double time; - U32 test_ls[3]; - U32 best_ls[3]; - U32 test_gs[3]; - U32 maxSize = 384; - U32 gs_x = 256; - U32 gs_y = (work_dim > 1) ? 256 : 1; - U32 gs_z = (work_dim > 2) ? gs[2] : 1; - for(U32 z = 1; z <= gs_z; z = get_next_ls_size(z)) { - if(0 != gs_z % z) continue; - for(U32 y = 1; y <= gs_y; y = get_next_ls_size(y)) { - if(0 != gs_y % y) continue; - for(U32 x = 1; x <= gs_x; x = get_next_ls_size(x)) { - if(0 != gs_x % x) continue; - U32 total = x * y * z; - if(total <= maxSize) { - test_gs[0] = (gs[0] + x - 1) / x * x; - test_gs[1] = (gs[1] + y - 1) / y * y; - test_gs[2] = (gs[2] + z - 1) / z * z; - test_ls[0] = x; - test_ls[1] = y; - test_ls[2] = z; - CHECK_STATUS(enqueue_ndrange_kernel(handle->queue_profiling, kernel, work_dim, NULL, test_gs, test_ls, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &time)); - if(minTime > time){ - minTime = time; - best_ls[0] = test_ls[0]; - best_ls[1] = test_ls[1]; - best_ls[2] = test_ls[2]; - } - CHECK_STATUS(release_event(handle->eventObj)); - } - } - } - } - test_ls[0] = 0; - test_ls[1] = 0; - test_ls[2] = 0; - CHECK_STATUS(enqueue_ndrange_kernel(handle->queue_profiling, kernel, work_dim, NULL, gs, test_ls, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &time)); - if(minTime > time){ - minTime = time; - best_ls[0] = test_ls[0]; - best_ls[1] = test_ls[1]; - best_ls[2] = test_ls[2]; - } - CHECK_STATUS(release_event(handle->eventObj)); - if(best_ls[0] != 0 && best_ls[1] != 0 && best_ls[2] != 0) { - kernelInfo->gs[0] = (gs[0] + best_ls[0] - 1) / best_ls[0] * best_ls[0]; - kernelInfo->gs[1] = (gs[1] + best_ls[1] - 1) / best_ls[1] * best_ls[1]; - kernelInfo->gs[2] = (gs[2] + best_ls[2] - 1) / best_ls[2] * best_ls[2]; - } - kernelInfo->ls[0] = best_ls[0]; - kernelInfo->ls[1] = best_ls[1]; - kernelInfo->ls[2] = best_ls[2]; - handle->t_execute = minTime; -#ifdef _DEBUG - DEBUG_info_s("SELECT LS KERNEL>>> " << kernelInfo->name.c_str() << " runInfo:"); - DEBUG_info_s("best ls = " << best_ls[0] << " " << best_ls[1] << " " << best_ls[2] << " "); - DEBUG_info(" executeTime = " << minTime << " us"); -#endif - return SUCCESS; - } - - inline EE 
gcl_run_kernelVec_select_ls(GCLHandle_t handle, std::vector kernelIndex) { - if(kernelIndex.size() == 0) return SUCCESS; - CHECK_STATUS(gcl_create_queue_profiling(handle)); - for(auto index : kernelIndex) { - auto kernelInfo = handle->kernelVec[index]; - CHECK_STATUS(gcl_run_kernel_select_ls(handle, &kernelInfo)); - handle->kernelVec[index].gs[0] = kernelInfo.gs[0]; - handle->kernelVec[index].gs[1] = kernelInfo.gs[1]; - handle->kernelVec[index].gs[2] = kernelInfo.gs[2]; - handle->kernelVec[index].ls[0] = kernelInfo.ls[0]; - handle->kernelVec[index].ls[1] = kernelInfo.ls[1]; - handle->kernelVec[index].ls[2] = kernelInfo.ls[2]; - } - CHECK_STATUS(gcl_destroy_queue_profiling(handle)); - return SUCCESS; - } - -#ifdef _DEBUG - inline EE gcl_run_kernel_profiling(GCLHandle_t handle, Kernel kernel, U32 work_dim, U32* gs, U32* ls, CI8* kernelName = NULL) { - std::string name = "unknown kernel"; - if(kernelName) name = kernelName; - DEBUG_info_s("KERNEL>>> " << name.c_str() << " runInfo:"); - double totalTime = 0; - double executeTime = 0; - U32 loop = 10; - for(U32 i = 0; i < loop; i++) { - double t; - CHECK_STATUS(enqueue_ndrange_kernel(handle->queue, kernel, work_dim, - NULL, gs, ls, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &t)); - CHECK_STATUS(release_event(handle->eventObj)); - DEBUG_info("loop " << i << " executeTime = " << t << " us"); - totalTime += t; - } - executeTime = totalTime / loop; - DEBUG_info("executeTime = " << executeTime << " us for " << loop << " times average"); - CHECK_STATUS(gcl_finish(handle)); - return SUCCESS; - } -#endif - - inline EE gcl_create_memory(GCLHandle_t handle, GCLMem_t gclMem) { - GCLMemDesc_t desc = &gclMem->desc; - if(!desc->has_alloc){ - switch(desc->memType) { - case GCL_MEM_BUF: { - CHECK_STATUS(create_buffer(handle->context, desc->flags, desc->byteSize, desc->host_ptr, &gclMem->mem)); - desc->has_alloc = true; - break; - } - case GCL_MEM_IMG_1D: { - CHECK_STATUS(create_image1D(handle->context, desc->flags, &desc->imgFormat, desc->stride[0], 0, desc->host_ptr, &gclMem->mem)); - desc->has_alloc = true; - break; - } - case GCL_MEM_IMG_2D: { - CHECK_STATUS(create_image2D(handle->context, desc->flags, &desc->imgFormat, desc->stride[0], desc->stride[1], 0, desc->host_ptr, &gclMem->mem)); - desc->has_alloc = true; - break; - } - case GCL_MEM_IMG_3D: { - CHECK_STATUS(create_image3D(handle->context, desc->flags, &desc->imgFormat, desc->stride[0], desc->stride[1], desc->stride[2], 0, 0, desc->host_ptr, &gclMem->mem)); - desc->has_alloc = true; - break; - } - default: return NOT_SUPPORTED; - } - } else { - //std::cout << "warning try to alloc the same gpu mem again without release" << std::endl; - } - return SUCCESS; - } - - inline EE gcl_trans_memory(GCLHandle_t handle, void* src, void* dst, U32* size, GCLMemTransType type, cl_bool blocking, U32* offset = NULL) - { - DEBUG_info_s("DATATRANS>>>"); - switch(type) { - case HOST_TO_DEVICE_BUF: { - U8* hostPtr = (U8*)src; - GCLMem_t gclMem = (GCLMem_t)dst; - U32 dstOff = (offset) ? 
offset[0] : 0; - CHECK_STATUS(enqueue_write_buffer(handle->queue, gclMem->mem, blocking, dstOff, *size, hostPtr, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - DEBUG_info_s("enqueue_write_buffer runInfo: "); - break; - } - case HOST_TO_DEVICE_IMG: { - U8* hostPtr = (U8*)src; - GCLMem_t gclMem = (GCLMem_t)dst; - U32 origin[3] = {0, 0, 0}; - if(offset) { - origin[0] = offset[0]; - origin[1] = offset[1]; - origin[2] = offset[2]; - } - CHECK_STATUS(enqueue_write_image(handle->queue, gclMem->mem, blocking, origin, size, 0, 0, hostPtr, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - DEBUG_info_s("enqueue_write_image runInfo: "); - break; - } - case DEVICE_BUF_TO_HOST: { - U8* hostPtr = (U8*)dst; - GCLMem_t gclMem = (GCLMem_t)src; - U32 srcOff = (offset) ? offset[0] : 0; - CHECK_STATUS(enqueue_read_buffer(handle->queue, gclMem->mem, blocking, srcOff, *size, hostPtr, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - DEBUG_info_s("enqueue_read_buffer runInfo: "); - break; - } - case DEVICE_IMG_TO_HOST: { - U8* hostPtr = (U8*)dst; - GCLMem_t gclMem = (GCLMem_t)src; - U32 origin[3] = {0, 0, 0}; - if(offset) { - origin[0] = offset[0]; - origin[1] = offset[1]; - origin[2] = offset[2]; - } - CHECK_STATUS(enqueue_read_image(handle->queue, gclMem->mem, blocking, origin, size, 0, 0, hostPtr, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - DEBUG_info_s("enqueue_read_image runInfo: "); - break; - } - case DEVICE_BUF_TO_BUF: { - GCLMem_t srcBuf = (GCLMem_t)src; - GCLMem_t dstBuf = (GCLMem_t)dst; - U32 srcOff = 0; - U32 dstOff = 0; - if(offset) { - srcOff = offset[0]; - dstOff = offset[1]; - } - CHECK_STATUS(enqueue_copy_buffer(handle->queue, srcBuf->mem, dstBuf->mem, srcOff, dstOff, *size, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - DEBUG_info_s("enqueue_copy_buffer runInfo: "); - break; - } - case DEVICE_BUF_TO_IMG: { - GCLMem_t srcBuf = (GCLMem_t)src; - GCLMem_t dstImg = (GCLMem_t)dst; - U32 origin[3] = {0, 0, 0}; - U32 srcOff = 0; - if(offset) { - srcOff = offset[0]; - origin[0] = offset[1]; - origin[1] = offset[2]; - origin[2] = offset[3]; - } - CHECK_STATUS(enqueue_copy_buffer_to_image(handle->queue, srcBuf->mem, dstImg->mem, srcOff, origin, size, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)) - DEBUG_info_s("enqueue_copy_buffer_to_image runInfo: "); - break; - } - case DEVICE_IMG_TO_BUF: { - GCLMem_t srcImg = (GCLMem_t)src; - GCLMem_t dstBuf = (GCLMem_t)dst; - U32 origin[3] = {0, 0, 0}; - U32 dstOff = 0; - if(offset) { - origin[0] = offset[0]; - origin[1] = offset[1]; - origin[2] = offset[2]; - dstOff = offset[3]; - } - CHECK_STATUS(enqueue_copy_image_to_buffer(handle->queue, srcImg->mem, dstBuf->mem, origin, size, dstOff, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)) - DEBUG_info_s("enqueue_copy_image_to_buffer runInfo: "); - break; - } - case DEVICE_IMG_TO_IMG: { - return NOT_SUPPORTED; - break; - } - default: return NOT_SUPPORTED; - } -#ifdef _DEBUG - double executeTime = 0; - CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); - CHECK_STATUS(release_event(handle->eventObj)); - DEBUG_info("executeTime = " << executeTime << " us"); - CHECK_STATUS(gcl_finish(handle)); -#endif - return SUCCESS; - } - - inline EE gcl_trans_buffer_rect(GCLHandle_t handle, void* src, void* dst, U32* host_org, U32* buf_org, U32* region, U32 host_row_pitch, U32 host_slice_pitch, - U32 buf_row_pitch, U32 buf_slice_pitch, GCLMemTransType type, cl_bool 
blocking) { - DEBUG_info_s("DATATRANS>>>"); - switch(type) { - case HOST_TO_DEVICE_BUF: { - GCLMem_t dstBuf = (GCLMem_t)dst; - CHECK_STATUS(enqueue_write_buffer_rect(handle->queue, dstBuf->mem, blocking, buf_org, host_org, region, buf_row_pitch, buf_slice_pitch, - host_row_pitch, host_slice_pitch, src, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - DEBUG_info_s("enqueue_write_buffer_rect runInfo: "); - break; - } - case DEVICE_BUF_TO_HOST: { - return NOT_SUPPORTED; - break; - } - default: return NOT_SUPPORTED; - } -#ifdef _DEBUG - double executeTime = 0; - CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); - CHECK_STATUS(release_event(handle->eventObj)); - DEBUG_info("executeTime = " << executeTime << " us"); - CHECK_STATUS(gcl_finish(handle)); -#endif - return SUCCESS; - } - - inline EE gcl_map_memory(GCLHandle_t handle, GCLMem_t gclMem, U32*offset, U32* size, cl_map_flags flags, cl_bool blocking) - { - DEBUG_info_s("DATAMAP>>> enqueue_map_buffer runInfo:"); - GCLMemDesc_t desc = &gclMem->desc; - if (gclMem->desc.memType == GCL_MEM_BUF) { - CHECK_STATUS(enqueue_map_buffer(handle->queue, gclMem->mem, blocking, flags, *offset, *size, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr, (void**)&desc->map_ptr)); - gclMem->mapPtrArray.push_back(desc->map_ptr); - } else { - return NOT_SUPPORTED; - } -#ifdef _DEBUG - double executeTime = 0; - CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); - CHECK_STATUS(release_event(handle->eventObj)); - DEBUG_info("executeTime = " << executeTime << " us"); - CHECK_STATUS(gcl_finish(handle)); -#endif - return SUCCESS; - } - - - inline EE gcl_fill_memory_zero(GCLHandle_t handle, GCLMem_t gclMem) { - if(gclMem->desc.memType == GCL_MEM_BUF) { - DEBUG_info_s("FILLMEM>>> enqueue_fill_buffer runInfo:"); - U8 pat_val = 0; - CHECK_STATUS(enqueue_fill_buffer(handle->queue, gclMem->mem, &pat_val, sizeof(pat_val), 0, gclMem->desc.byteSize, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - } else { - DEBUG_info_s("FILLMEM>>> enqueue_fill_image runInfo:"); - F32 color[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - U32 origin[3] = {0, 0, 0}; - U32 region[3]; - region[0] = gclMem->desc.stride[0]; - region[1] = gclMem->desc.stride[1]; - region[2] = gclMem->desc.stride[2]; - CHECK_STATUS(enqueue_fill_image(handle->queue, gclMem->mem, color, origin, region, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - } -#ifdef _DEBUG - double executeTime = 0; - CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); - CHECK_STATUS(release_event(handle->eventObj)); - DEBUG_info("executeTime = " << executeTime << " us"); - CHECK_STATUS(gcl_finish(handle)); -#endif - return SUCCESS; - } - - inline EE gcl_get_mem_size(GCLMem_t gclMem, U32* size) { - CHECK_STATUS(get_memory_size(gclMem->mem, size)); - return SUCCESS; - } - - inline EE gcl_create_sub_buffer(U32 size, U32* offset, GCLMem_t src, Mem* subbuf){ - CHECK_STATUS(create_sub_buffer(src->mem, CL_MEM_READ_WRITE, *offset, size, subbuf)); - src->subMem.push_back(*subbuf); - *offset += (size + 1023) / 1024 * 1024; - return SUCCESS; - } - #ifdef __cplusplus - } - #endif - template<typename Tuple, U32 N> - struct DummpyWrapper{ - static void set_kernel_arg_wrapper(Kernel kernel, const Tuple& t) { - DummpyWrapper<Tuple, N-1>::set_kernel_arg_wrapper(kernel, t); - auto arg = std::get<N-1>(t); - set_kernel_arg(kernel, N-1, sizeof(arg), (void*)&arg); - } - }; - - template<typename Tuple> - struct DummpyWrapper<Tuple, 0>{ - static void set_kernel_arg_wrapper(Kernel kernel, const Tuple& t) { - UNUSED(kernel); - UNUSED(t); - } - }; - - template<typename ... Args> - inline EE gcl_set_kernelArgs(Kernel kernel, Args ... args) { - std::tuple<Args...> t = std::make_tuple(args...); - DummpyWrapper<std::tuple<Args...>, sizeof...(Args)>::set_kernel_arg_wrapper(kernel, t); - return SUCCESS; - } - - inline std::string gclMemDesc2Str(GCLMemDesc desc) { - char buff[128]; - snprintf(buff, sizeof(buff), "memFormat: %d, ", desc.memFormat); - std::string descStr = buff; - descStr += "stride("; - for(U32 i = 0; i < 3; i++) { - descStr += std::to_string(desc.stride[i]); - if(i < 2) descStr += ","; - } - descStr += "), "; - descStr += "offset("; - for(U32 i = 0; i < 3; i++) { - descStr += std::to_string(desc.offset[i]); - if(i < 2) descStr += ","; - } - descStr += ")"; - return descStr; - } -#ifdef _DEBUG - template<typename T> - inline EE gcl_print_memory(GCLHandle_t handle, GCLMem_t gclMem, CI8* gclMemName = NULL) { - UNUSED(handle); - UNUSED(gclMem); - UNUSED(gclMemName); -/* GCLMemDesc_t desc = &gclMem->desc; - if(gclMemName) std::cout << "MEMORY>>>"<< gclMemName << " info:"<<std::endl; - else std::cout << "MEMORY>>> unknown info:" << std::endl; - U8* hostPtr = nullptr; - U32 s0 = desc->stride[0]; - U32 s1 = desc->stride[1]; - U32 s2 = desc->stride[2]; - switch(desc->memType) { - case GCL_MEM_BUF: { - U32 size = desc->byteSize; - hostPtr = new U8[(size_t)size]; - gcl_trans_memory(handle, (void*)gclMem, (void*)hostPtr, &size, DEVICE_BUF_TO_HOST, CL_TRUE); - break; - } - case GCL_MEM_IMG_1D: { - U32 dim[3]; - dim[0] = s0; - dim[1] = s1; - dim[2] = s2; - U32 size = desc->byteSize; - hostPtr = new U8[(size_t)size]; - gcl_trans_memory(handle, (void*)gclMem, (void*)hostPtr, dim, DEVICE_IMG_TO_HOST, CL_TRUE); - s0 = s0 * 4; - break; - } - case GCL_MEM_IMG_2D: { - break; - } - case GCL_MEM_IMG_3D: { - break; - } - default: return NOT_SUPPORTED; - } - - T* data = (T*)hostPtr; - if(desc->memFormat == DF_NCHW) { - std::cout << "Format: NCHW" << std::endl; - std::cout << "s0 = " << s0 << std::endl; - std::cout << "s1 = " << s1 << std::endl; - std::cout << "s2 = " << s2 << std::endl; - U32 num = 0; - for(U32 i = 0; i < s2; i++) { - U32 ii = i * s1 * s0; - for(U32 j = 0; j < s1; j++) { - U32 jj = j * s0; - for(U32 k = 0; k < s0; k++) { - std::cout << 0.0 + data[ii + jj + k] << " "; - if(num >= 63) {std::cout << std::endl; goto end;} - num++; - } - std::cout << std::endl; - } - std::cout << std::endl; - } - } - - if(desc->memFormat == DF_NCWHC4) { - std::cout << "Format: NCWHC4" << std::endl; - std::cout << "s0 * 4 = " << s0 * 4 << std::endl; - std::cout << "s1 = " << s1 << std::endl; - std::cout << "s2 = " << s2 << std::endl; - U32 num = 0; - for(U32 i = 0; i < s2; i++) { - U32 ii = i * s1 * s0 * 4; - for(U32 j = 0; j < s1; j++) { - U32 jj = j * s0 * 4; - for(U32 k = 0; k < s0 * 4; k++) { - std::cout << 0.0 + data[ii + jj + k] << " "; - if(num >= 63) {std::cout << std::endl; goto end;} - num++; - } - std::cout << std::endl; - } - std::cout << std::endl; - } - } - - if(desc->memFormat == DF_NHWC || desc->memFormat == DF_HWCN) { - if(desc->memFormat == DF_NHWC) std::cout << "Format: NHWC" << std::endl; - if(desc->memFormat == DF_HWCN) std::cout << "Format: HWCN" << std::endl; - std::cout << "s0 = " << s0 << std::endl; - std::cout << "s1 = " << s1 << std::endl; - std::cout << "s2 = " << s2 << std::endl; - U32 num = 0; - for(U32 i = 0; i < s2; i++) { - U32 ii = i * s1 * s0; - for(U32 j = 0; j < s1; j++) { - U32 jj = j * s0; - for(U32 k = 0; k < s0; k++) { - std::cout << 0.0 + data[ii + jj + k] << " "; - if(num >= 63) {std::cout << std::endl; goto end;} - num++; - } - std::cout << std::endl; - } - std::cout << std::endl; - } - } - - 
if(desc->memFormat == DF_NCHWN4C4) { - std::cout << "Format: NCHWN4C4" << std::endl; - std::cout << "s0 * 16 = " << s0 * 16 << std::endl; - std::cout << "s1 = " << s1 << std::endl; - std::cout << "s2 = " << s2 << std::endl; - U32 num = 0; - for(U32 i = 0; i < s2; i++) { - U32 ii = i * s1 * s0 * 16; - for(U32 j = 0; j < s1; j++) { - U32 jj = j * s0 * 16; - for(U32 k = 0; k < s0 * 16; k++) { - std::cout << 0.0 + data[ii + jj + k] << " "; - if(num >= 63) {std::cout << std::endl; goto end;} - num++; - } - std::cout << std::endl; - } - std::cout << std::endl; - } - } - - if(desc->memFormat == DF_NCWHN4C4) { - std::cout << "Format: NCWHN4C4" << std::endl; - std::cout << "s0 * 16 = " << s0 * 16 << std::endl; - std::cout << "s1 = " << s1 << std::endl; - std::cout << "s2 = " << s2 << std::endl; - U32 num = 0; - for(U32 i = 0; i < s2; i++) { - U32 ii = i * s1 * s0 * 16; - for(U32 j = 0; j < s1; j++) { - U32 jj = j * s0 * 16; - for(U32 k = 0; k < s0 * 16; k++) { - std::cout << 0.0 + data[ii + jj + k] << " "; - if(num >= 63) {std::cout << std::endl; goto end;} - num++; - } - std::cout << std::endl; - } - std::cout << std::endl; - } - } - - if(desc->memFormat == DF_NCHWN4C4) { - std::cout << "Format: NCHWN4C4" << std::endl; - std::cout << "s0 * 4 = " << s0 * 4 << std::endl; - std::cout << "s1 = " << s1 << std::endl; - std::cout << "s2 = " << s2 << std::endl; - U32 num = 0; - for(U32 i = 0; i < s2; i++) { - U32 ii = i * s1 * s0 * 4; - for(U32 j = 0; j < s1; j++) { - U32 jj = j * s0 * 4; - for(U32 k = 0; k < s0 * 4; k++) { - std::cout << 0.0 + data[ii + jj + k] << " "; - if(num >= 63) {std::cout << std::endl; goto end;} - num++; - } - std::cout << std::endl; - } - std::cout << std::endl; - } - } - if(desc->memFormat == DF_NHWCN4) { - std::cout << "Format: NHWCN4" << std::endl; - std::cout << "s0 * 4 = " << s0 * 4 << std::endl; - std::cout << "s1 = " << s1 << std::endl; - std::cout << "s2 = " << s2 << std::endl; - U32 num = 0; - for(U32 i = 0; i < s2; i++) { - U32 ii = i * s1 * s0 * 4; - for(U32 j = 0; j < s1; j++) { - U32 jj = j * s0 * 4; - for(U32 k = 0; k < s0 * 4; k++) { - std::cout << 0.0 + data[ii + jj + k] << " "; - if(num >= 63) {std::cout << std::endl; goto end;} - num++; - } - std::cout << std::endl; - } - std::cout << std::endl; - } - } -end: - delete[] hostPtr;*/ - return SUCCESS; - } - - template<typename T> - inline EE gcl_print_buffer(GCLHandle_t handle, Mem mem, U32 num, CI8* bufferName = NULL) { - UNUSED(handle); - UNUSED(mem); - UNUSED(num); - UNUSED(bufferName); -/* if(bufferName) std::cout << "BUFFER>>> "<< bufferName << " info:"<<std::endl; - else std::cout << "BUFFER>>> unknown info: " << std::endl; - std::cout << "Element number = " << num << std::endl; - U8* hostPtr = new U8[(size_t)num * sizeof(T)]; - CHECK_STATUS(enqueue_read_buffer(handle->queue, mem, CL_TRUE, 0, num * sizeof(T), (void*)hostPtr, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - T* val = (T*)hostPtr; - for(U32 i = 0; i < num; i++){ - std::cout << val[i] << " "; - if(i >= 63) break; - } - std::cout << std::endl; - delete[] hostPtr;*/ - return SUCCESS; - } - - template<typename T> - inline EE gcl_write_buf_to_bin(GCLHandle_t handle, Mem buf, U32 size, CI8* dataName) { - U32 num = size / sizeof(T); - U8* hostPtr = new U8[size]; - F32* hostPtrTran = new F32[num]; - CHECK_STATUS(enqueue_read_buffer(handle->queue, buf, CL_TRUE, 0, size, hostPtr, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - T* val = (T*)hostPtr; - for(U32 i = 0; i < num; i++) hostPtrTran[i] = (F32)val[i]; - - FILE* outfile; - std::string fileName = dataName; - 
replace(fileName.begin(), fileName.end(), '/', '_'); - replace(fileName.begin(), fileName.end(), '.', '_'); - replace(fileName.begin(), fileName.end(), ' ', '_'); - fileName += "_gpu"; - fileName +=".out"; - outfile = fopen(fileName.c_str(), "wb"); - if(outfile == NULL) { - DEBUG_info("warning fopen outfile " << fileName <<" failed"); - delete[] hostPtr; - delete[] hostPtrTran; - return SUCCESS; - } - fwrite(hostPtrTran, sizeof(float), num, outfile); - fclose(outfile); - delete[] hostPtr; - delete[] hostPtrTran; - return SUCCESS; - } - template<typename T> - inline EE gcl_write_data_to_bin(GCLHandle_t handle, TensorDesc tensorDesc, void* ptr, U32 ptrType, CI8* dataName = NULL) { - /*ptrType: - *GPU: 0 - *CPU: 1 - */ - DataFormat tdf; - DataType tdt; - U32 tn, tc, th, tw; - U32 dims; - tn = 1; tc = 1; th = 1; tw = 1; - dims = tensorDesc.nDims; - switch(dims) { - case 1: - tensor1dGet(tensorDesc, &tdt, &tw); - break; - case 2: - tensor2dfGet(tensorDesc, &tdt, &tdf, &th, &tw); - break; - case 3: - tensor3dGet(tensorDesc, &tdt, &tdf, &tc, &th, &tw); - break; - case 4: - tensor4dGet(tensorDesc, &tdt, &tdf, &tn, &tc, &th, &tw); - break; - default: CHECK_STATUS(NOT_SUPPORTED); - } - U32 num = tn * tc * th * tw; - F32* hostPtrTran = new F32[num]; - - if(ptrType == 0) { - GCLMem_t mem = (GCLMem_t)ptr; - GCLMemDesc desc = mem->desc; - GCLMemType type = desc.memType; - DataFormat df = desc.memFormat; - U8* hostPtr = nullptr; - U32 s0 = desc.stride[0]; - U32 s1 = desc.stride[1]; - U32 off0 = desc.offset[0]; - U32 off1 = desc.offset[1]; - U32 byteSize = desc.byteSize; - hostPtr = new U8[(size_t)byteSize]; - - GCLMemTransType tranType = DEVICE_BUF_TO_HOST; - U32 size[3] = {byteSize, 1, 1}; - if(type == GCL_MEM_IMG_1D) { - tranType = DEVICE_IMG_TO_HOST; - size[0] = s0; - } - gcl_trans_memory(handle, (void*)mem, (void*)hostPtr, size, tranType, CL_TRUE); - - T* val = (T*) hostPtr; - if(df == DF_NCWHC4) { - if(tdf == DF_NCHW) { - for(U32 i = 0; i < num; i++) { - U32 iw = i % tw; - U32 ih = (i / tw) % th; - U32 ic = i / (tw * th); - hostPtrTran[i] = (float)(val[((ic / 4) * s1 + iw + off1) * s0 * 4 + (ih + off0) * 4 + (ic & 3)]); - } - } - if(tdf == DF_MKT) { - for(U32 i = 0; i < num; i++) { - U32 ih = i % tw; - U32 ic = i / tw; - U32 in_off = ((ic / 4) * s1 + off1) * s0 * 4 + (ih + off0) * 4 + (ic & 3); - hostPtrTran[i] = (float)val[in_off]; - } - } - } else if(df == DF_NCHW || df == DF_NHWC) { - for(U32 i = 0; i < num; i++) { - U32 iw = i % tw; - U32 ih = (i / tw) % th; - U32 ic = i / (tw * th); - hostPtrTran[i] = (float)(val[(ic * s1 + ih + off1) * s0 + (iw + off0)]); - } - } else if(df == DF_NORMAL) { - for(U32 i = 0; i < num; i++) hostPtrTran[i] = (float)val[i]; - } else { - DEBUG_info("warning write GPU memory " << dataName <<" to bin, format not supported: " << df); - delete[] hostPtrTran; - delete[] hostPtr; - return SUCCESS; - } - - delete[] hostPtr; - } - - if(ptrType == 1) { - T* val = (T*) ptr; - if(tdf == DF_NCHWC8) { - for(U32 i = 0; i < num; i++) { - U32 iw = i % tw; - U32 ih = (i / tw) % th; - U32 ic = i / (tw * th); - hostPtrTran[i] = (float)(val[((ic / 8) * th + ih) * tw * 8 + iw * 8 + (ic & 7)]); - } - } else if(tdf == DF_NORMAL || tdf == DF_NCHW) { - for(U32 i = 0; i < num; i++) { - hostPtrTran[i] = (float)(val[i]); - } - } else if(tdf == DF_MTK) { - for(U32 i = 0; i < num; i++) { - U32 it = i % th; - U32 ik = i / th; - U32 in_off = it * tw + ik; - hostPtrTran[i] = (float)(val[in_off]); // write as MKT, to compare with gpu - } - } else { - DEBUG_info("warning write CPU memory " << dataName <<" to bin, 
format not supported: " << tdf); - delete[] hostPtrTran; - return SUCCESS; - } - } - - FILE* outfile; - std::string fileName = dataName; - replace(fileName.begin(), fileName.end(), '/', '_'); - replace(fileName.begin(), fileName.end(), '.', '_'); - replace(fileName.begin(), fileName.end(), ' ', '_'); - if(ptrType == 0) fileName += "_gpu"; - if(ptrType == 1) fileName += "_cpu"; - fileName +=".out"; - - outfile = fopen(fileName.c_str(), "wb"); - if(outfile == NULL) { - DEBUG_info("warning fopen outfile " << fileName <<" failed"); - delete[] hostPtrTran; - return SUCCESS; - } - fwrite(hostPtrTran, sizeof(float), num, outfile); - fclose(outfile); - delete[] hostPtrTran; - return SUCCESS; - } -#endif -#endif diff --git a/gcl/include/gcl_kernel_binmap.h b/gcl/include/gcl_kernel_binmap.h deleted file mode 100644 index a214cd06..00000000 --- a/gcl/include/gcl_kernel_binmap.h +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
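For context on the header deleted below: kernels compiled ahead of time were wrapped in a gcl_kernel_binmap subclass, registered once at static-initialization time via REGISTER_GCLKERNELMAP, and then resolved by name at run time. A minimal lookup sketch follows; the binmap name "MaliG76" and the kernel name "conv_direct_s1" are illustrative placeholders, not part of the API.

    // Sketch only: resolving one precompiled kernel blob through the registry
    // defined in the (deleted) header below. Assumes some binmap class was
    // compiled in and registered with REGISTER_GCLKERNELMAP(MaliG76).
    #include "gcl_kernel_binmap.h"

    inline EE load_precompiled_kernel(kernelBin** bin)
    {
        // Instantiate the registered binmap through its factory creator.
        CHECK_STATUS(gcl_kernel_binmap_factory::instance()->create_gcl_kernel_binmap("MaliG76"));

        // Fetch the instance back from the container...
        gcl_kernel_binmap* binmap = nullptr;
        CHECK_STATUS(gcl_kernel_binmap_container::instance()->get("MaliG76", &binmap));

        // ...and resolve one kernel binary by name.
        return binmap->get("conv_direct_s1", bin);
    }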
- - - -#ifndef GCL_KERNELMAP -#define GCL_KERNELMAP - -#include"gcl_common.h" -#include <unordered_map> -#include <mutex> -typedef GCLKernelBin kernelBin; - -class gcl_kernel_binmap{ - public: - gcl_kernel_binmap(){} - std::unordered_map<std::string, kernelBin>& binMap() {return binMap_;} - EE put(std::string kernelname, kernelBin kernelbin) { - std::lock_guard<std::mutex> lock(mtx_); - auto it = binMap_.find(kernelname); - if(it == binMap_.end()) binMap_.insert({kernelname, kernelbin}); - return SUCCESS; - } - EE get(std::string kernelname, kernelBin** kernelbin_ptr) { - std::lock_guard<std::mutex> lock(mtx_); - auto it = binMap_.find(kernelname); - if(it == binMap_.end()){ - printf("the kernel %s doesn't exist in binMap\n", kernelname.c_str()); - return NULL_POINTER; - } - *kernelbin_ptr = &it->second; - return SUCCESS; - } - private: - std::unordered_map<std::string, kernelBin> binMap_; - std::mutex mtx_; -}; - - -class gcl_kernel_binmap_container{ - public: - static gcl_kernel_binmap_container* instance(){ - static gcl_kernel_binmap_container sInst; - return &sInst; - } - EE put(std::string kernel_binmap_name, std::unique_ptr<gcl_kernel_binmap> kernel_binmap) { - std::lock_guard<std::mutex> lock(mtx_); - auto it = kernel_binmap_container_.find(kernel_binmap_name); - if(it == kernel_binmap_container_.end()) kernel_binmap_container_.insert(std::make_pair(kernel_binmap_name, std::move(kernel_binmap))); - return SUCCESS; - } - EE get(std::string kernel_binmap_name, gcl_kernel_binmap** kernel_binmap_ptr) { - std::lock_guard<std::mutex> lock(mtx_); - auto it = kernel_binmap_container_.find(kernel_binmap_name); - if(it == kernel_binmap_container_.end()){ - printf("the kernel_binmap %s doesn't exist in kernel_binmap container\n", kernel_binmap_name.c_str()); - return NULL_POINTER; - } - *kernel_binmap_ptr = it->second.get(); - return SUCCESS; - } - private: - gcl_kernel_binmap_container(){} - std::unordered_map<std::string, std::unique_ptr<gcl_kernel_binmap>> kernel_binmap_container_; - std::mutex mtx_; -}; - -class gcl_kernel_binmap_factory{ - public: - static gcl_kernel_binmap_factory* instance() { - static gcl_kernel_binmap_factory sInst; - return &sInst; - } - typedef gcl_kernel_binmap* (*PFN_GCLKERNELMAP_CREATOR)(); - EE register_gcl_kernel_binmap(const std::string& kernel_binmap_name, PFN_GCLKERNELMAP_CREATOR pfnCreator) { - std::lock_guard<std::mutex> lock(mtx_); - auto it = creators_.find(kernel_binmap_name); - if(it == creators_.end()) creators_.insert({kernel_binmap_name, pfnCreator}); - return SUCCESS; - } - EE create_gcl_kernel_binmap(const std::string& kernel_binmap_name) { - std::lock_guard<std::mutex> lock(mtx_); - auto it = creators_.find(kernel_binmap_name); - if(it == creators_.end()){ - printf("the kernel_binmap creator %s doesn't exist in kernel_binmap factory\n", kernel_binmap_name.c_str()); - return NULL_POINTER; - } - PFN_GCLKERNELMAP_CREATOR pfn = it->second; - gcl_kernel_binmap_container::instance()->put(kernel_binmap_name, std::unique_ptr<gcl_kernel_binmap>(pfn())); - return SUCCESS; - } - private: - gcl_kernel_binmap_factory(){} - std::unordered_map<std::string, PFN_GCLKERNELMAP_CREATOR> creators_; - std::mutex mtx_; -}; - -#define REGISTER_GCLKERNELMAP_CREATOR_IMPL(kernel_binmap_name)\ - namespace{\ - static gcl_kernel_binmap* kernel_binmap_name ## _gcl_kernel_binmap_pfn() {return new kernel_binmap_name();}\ - class kernel_binmap_name ## _gcl_kernel_binmap_loader{\ - public:\ - kernel_binmap_name ## _gcl_kernel_binmap_loader() {\ - gcl_kernel_binmap_factory::instance()->register_gcl_kernel_binmap(#kernel_binmap_name, kernel_binmap_name ## _gcl_kernel_binmap_pfn);\ - }\ - };\ - static kernel_binmap_name ## _gcl_kernel_binmap_loader kernel_binmap_name ## _sLoader;\ - } - -#define REGISTER_GCLKERNELMAP(kernel_binmap_name)
REGISTER_GCLKERNELMAP_CREATOR_IMPL(kernel_binmap_name) -#endif diff --git a/gcl/include/kernel.h b/gcl/include/kernel.h deleted file mode 100644 index adf29fb4..00000000 --- a/gcl/include/kernel.h +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - -#ifndef KERNEL_H_ -#define KERNEL_H_ - -#ifdef __cplusplus -extern "C" { -#endif - - /** - * @brief get information of kernel - * @warning please free memory associated with value - **/ - inline EE get_kernel_info(Kernel kernel, cl_kernel_info info, void* *value, size_t *size) { - if(NULL == value) return NULL_POINTER; - - size_t len; - cl_int ret = clGetKernelInfo(kernel, info, 0, NULL, &len); - if(CL_SUCCESS == ret) { - if(NULL != size) *size = len; - void* data = malloc(len); - if(NULL == data) return ALLOC_FAILED; - ret = clGetKernelInfo(kernel, info, len, data, NULL); - if(CL_SUCCESS == ret) { *value = data; } else { free(data); } - } - - map_cl_error_2_ee(ret); - } - - /** - * @brief get workgroup information of kernel - * @warning please free memory associated with value - **/ - inline EE get_kernel_workgroup_info(Kernel kernel, Device device, cl_kernel_work_group_info info, void* *value, size_t *size) { - size_t len; - cl_int ret = clGetKernelWorkGroupInfo(kernel, device, info, 0, NULL, &len); - if(CL_SUCCESS == ret) { - if(NULL != size) *size = len; - void* data = malloc(len); - if(NULL == data) return ALLOC_FAILED; - ret = clGetKernelWorkGroupInfo(kernel, device, info, len, data, NULL); - if(CL_SUCCESS == ret) { *value = data; } else { free(data); } - } - - map_cl_error_2_ee(ret); - } - - inline EE create_kernels_in_program(Program program, U32 num_kernel, Kernel* kernels){ - if(kernels == nullptr) return NULL_POINTER; - I32 ret = clCreateKernelsInProgram(program, num_kernel, kernels, NULL); - map_cl_error_2_ee(ret); - } - - inline EE create_kernel(Program program, CI8 *name, Kernel* kernel) { - if(kernel == nullptr) return NULL_POINTER; - I32 ret; - *kernel = clCreateKernel(program, name, &ret); - map_cl_error_2_ee(ret); - } - - inline EE retain_kernel(Kernel kernel) { - cl_int ret = clRetainKernel(kernel); - map_cl_error_2_ee(ret); - } - - inline EE release_kernel(Kernel kernel) { - cl_int ret = clReleaseKernel(kernel); - map_cl_error_2_ee(ret); - } - - inline EE set_kernel_arg(Kernel kernel, - U32 arg_index, U32 arg_size, - const void *arg_value) { - cl_int ret = clSetKernelArg(kernel, arg_index, arg_size, arg_value); - map_cl_error_2_ee(ret); - } -/* - inline EE clone_kernel(Kernel src_kernel, Kernel* dst_kernel) { - //TODO - I32 ret; - dst_kernel = 
clCloneKernel(src_kernel, &ret); - map_cl_error_2_ee(ret); - } -*/ - inline EE enqueue_ndrange_kernel(CommandQueue queue, Kernel kernel, U32 work_dim, CU32* global_work_offset, CU32* global_work_size, CU32* local_work_size, - U32 num_events_in_wait_list, const Event* event_in_wait_list, Event* event){ - I32 ret; - UNUSED(global_work_offset); - UNUSED(local_work_size); - switch(work_dim){ - case 1:{ - size_t gs = global_work_size[0]; - size_t ls = local_work_size[0]; - size_t* ls_ptr = (ls == 0) ? NULL : &ls; - ret = clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, &gs, ls_ptr, num_events_in_wait_list, event_in_wait_list, event); - break;} - case 2:{ - size_t gs[2] = {global_work_size[0], global_work_size[1]}; - size_t ls[2] = {local_work_size[0], local_work_size[1]}; - size_t* ls_ptr = (ls[0] == 0 || ls[1] == 0) ? NULL : ls; - ret = clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, gs, ls_ptr, num_events_in_wait_list, event_in_wait_list, event); - break;} - case 3:{ - size_t gs[3] = {global_work_size[0], global_work_size[1], global_work_size[2]}; - size_t ls[3] = {local_work_size[0], local_work_size[1], local_work_size[2]}; - size_t* ls_ptr = (ls[0] == 0 || ls[1] == 0 || ls[2] == 0) ? NULL : ls; - ret = clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, gs, ls_ptr, num_events_in_wait_list, event_in_wait_list, event); - break;} - default: - return NOT_SUPPORTED; - } - map_cl_error_2_ee(ret); - } - -#ifdef __cplusplus -} -#endif -#endif diff --git a/gcl/include/memory.h b/gcl/include/memory.h deleted file mode 100644 index a4003074..00000000 --- a/gcl/include/memory.h +++ /dev/null @@ -1,487 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
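Taken together, the kernel.h wrappers above cover the whole create/set-args/launch/release cycle in a handful of calls. A minimal sketch of that sequence, assuming a built program, a command queue and a buffer already exist; "my_kernel" is an illustrative kernel name, and a zero local work size makes the wrapper pass NULL so the driver picks one:

    // Sketch only: launch one 2-D kernel through the wrappers above.
    inline EE run_once(Program program, CommandQueue queue, Mem buf)
    {
        Kernel kernel;
        CHECK_STATUS(create_kernel(program, "my_kernel", &kernel));
        CHECK_STATUS(set_kernel_arg(kernel, 0, (U32)sizeof(buf), &buf));

        U32 gs[2] = {1024, 64};  // global work size per dimension
        U32 ls[2] = {0, 0};      // zeros: wrapper passes NULL, driver chooses
        CHECK_STATUS(enqueue_ndrange_kernel(queue, kernel, 2, NULL, gs, ls, 0, NULL, NULL));
        return release_kernel(kernel);
    }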
- - - -#ifndef _H_BUFFER -#define _H_BUFFER - -#include "event.h" - -#ifdef __cplusplus -extern "C" { -#endif - - /** - * @brief get memory information - * - **/ - inline EE get_memory_info(Mem mem, cl_mem_info info, void* *value, U32 *len) { - if(NULL == value) return NULL_POINTER; - - size_t size; - I32 ret = clGetMemObjectInfo(mem, info, 0, NULL, &size); - if(CL_SUCCESS == ret) { - if(NULL != len) *len = size; - void* data = malloc(size); - if(NULL == data) return NULL_POINTER; - ret = clGetMemObjectInfo(mem, info, size, data, NULL); - if(CL_SUCCESS == ret) *value = data; - } - - map_cl_error_2_ee(ret); - } - -#if defined(CL_VERSION_1_2) - - inline EE create_image1D(Context context, cl_mem_flags flags, const cl_image_format *format, U32 len, U32 pitch, void* host_ptr, Mem *image) { - cl_image_desc image_desc; - image_desc.image_type = CL_MEM_OBJECT_IMAGE1D; - image_desc.image_width = len; - image_desc.image_height = 1; - image_desc.image_depth = 1; - image_desc.image_array_size = 1; - image_desc.image_row_pitch = pitch; - image_desc.image_slice_pitch = 0; - image_desc.num_mip_levels = 0; - image_desc.num_samples = 0; - image_desc.buffer = NULL; - - I32 ret; - Mem temp = clCreateImage(context, flags, format, &image_desc, host_ptr, &ret); - *image = temp; - map_cl_error_2_ee(ret); - } - - /** - * @brief create 1d image buffer - * - **/ - inline EE create_image1D_buffer(Context context, cl_mem_flags flags, const cl_image_format *format, U32 len, const cl_mem buffer, Mem *image) { - cl_image_desc image_desc; - image_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; - image_desc.image_width = len; - image_desc.image_height = 1; - image_desc.image_depth = 1; - image_desc.image_array_size = 1; - image_desc.image_row_pitch = len; - image_desc.image_slice_pitch = len; - image_desc.num_mip_levels = 0; - image_desc.num_samples = 0; - image_desc.buffer = buffer; - - I32 ret; - Mem temp = clCreateImage(context, flags, format, &image_desc, NULL, &ret); - if(CL_SUCCESS == ret) *image = temp; - map_cl_error_2_ee(ret); - } -#endif - - /** - * @brief create 2d image object - * - **/ - inline EE create_image2D(Context cont, cl_mem_flags flags, cl_image_format *format, U32 width, U32 height, U32 pitch, void* host_ptr, Mem *mem) { - I32 ret; -#if defined(CL_VERSION_1_2) - cl_image_desc image_desc; - image_desc.image_type = CL_MEM_OBJECT_IMAGE2D; - image_desc.image_width = width; - image_desc.image_height = height; - image_desc.image_depth = 1; - image_desc.image_array_size = 1; - image_desc.image_row_pitch = pitch; - image_desc.image_slice_pitch = 0; - image_desc.num_mip_levels = 0; - image_desc.num_samples = 0; - image_desc.buffer = NULL; - - Mem temp = clCreateImage(cont, flags, format, &image_desc, host_ptr, &ret); -#else - Mem temp = clCreateImage2D(cont, flags, format, width, height, pitch, host_ptr, &ret); -#endif - if(CL_SUCCESS == ret) *mem = temp; - - map_cl_error_2_ee(ret); - } - -#if defined(CL_VERSION_1_2) - /** - * @brief create 2d image array object - * - **/ - inline EE create_image2D_array(Context cont, cl_mem_flags flags, cl_image_format *format, U32 width, U32 height, U32 pitch, U32 arraySize, void* host_ptr, Mem *mem) { - I32 ret; - cl_image_desc image_desc; - image_desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY; - image_desc.image_width = width; - image_desc.image_height = height; - image_desc.image_depth = 1; - image_desc.image_array_size = arraySize; - image_desc.image_row_pitch = pitch; - image_desc.image_slice_pitch = 0; - image_desc.num_mip_levels = 0; - image_desc.num_samples = 0; 
- image_desc.buffer = NULL; - - *mem = clCreateImage(cont, flags, format, &image_desc, host_ptr, &ret); - map_cl_error_2_ee(ret); - } -#endif - - /** - * @brief create 3d image object - * - **/ - inline EE create_image3D(Context cont, cl_mem_flags flags, cl_image_format *format, U32 width, U32 height, U32 depth, U32 rowPitch, U32 slicePitch, void* host_ptr, Mem *mem) { - I32 ret; -#if defined(CL_VERSION_1_2) - cl_image_desc image_desc; - image_desc.image_type = CL_MEM_OBJECT_IMAGE3D; - image_desc.image_width = width; - image_desc.image_height = height; - image_desc.image_depth = depth; - image_desc.image_array_size = 1; - image_desc.image_row_pitch = rowPitch; - image_desc.image_slice_pitch = slicePitch; - image_desc.num_mip_levels = 0; - image_desc.num_samples = 0; - image_desc.buffer = NULL; - - Mem temp = clCreateImage(cont, flags, format, &image_desc, host_ptr, &ret); -#else - Mem temp = clCreateImage3D(cont, flags, format, width, height, depth, rowPitch, slicePitch, host_ptr, &ret); -#endif - if(CL_SUCCESS == ret) *mem = temp; - - map_cl_error_2_ee(ret); - } - - /** - * @brief get image information - * - **/ - inline EE get_image_info(Mem mem, cl_mem_info info, void* *value, U32 *len) { - size_t size; - I32 ret = clGetImageInfo(mem, info, 0, NULL, &size); - if(CL_SUCCESS == ret) { - if(NULL != len) *len = size; - - void* data = malloc(size); - if(NULL == data) return NULL_POINTER; - ret = clGetImageInfo(mem, info, size, data, NULL); - if(CL_SUCCESS == ret) *value = data; - } - - map_cl_error_2_ee(ret); - } - - /** - * @brief get supported image format - * - * @warning please free memory associated with format - **/ - inline EE get_supported_image_formats(Context cont, cl_mem_flags flags, cl_mem_object_type type, cl_image_format **format, U32 *num) { - if(NULL == format) return NULL_POINTER; - - U32 len; - I32 ret = clGetSupportedImageFormats(cont, flags, type, 0, NULL, &len); - if(CL_SUCCESS == ret) { - if(NULL != num) *num = len; - cl_image_format *data = (cl_image_format*) malloc(len * sizeof(cl_image_format)); - if(NULL == data) return NULL_POINTER; - ret = clGetSupportedImageFormats(cont, flags, type, len, data, 0); - if(CL_SUCCESS == ret) *format = data; - } - - map_cl_error_2_ee(ret); - } - - inline EE retain_memory(Mem mem) { - I32 ret = clRetainMemObject(mem); - map_cl_error_2_ee(ret); - } - - inline EE release_memory(Mem mem) { - I32 ret = clReleaseMemObject(mem); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_unmap_memory(CommandQueue queue, Mem mem, void* mapped_ptr, - I32 num_wait_events, const Event *wait_events, Event *event) { - I32 ret = clEnqueueUnmapMemObject(queue, mem, mapped_ptr, - num_wait_events, wait_events, event); - - map_cl_error_2_ee(ret); - } - - inline EE create_buffer(Context context, cl_mem_flags flags, U32 size, - void* host_ptr, Mem* buffer) { - I32 ret; - size_t len = size; - *buffer = clCreateBuffer(context, flags, len, host_ptr, &ret); - map_cl_error_2_ee(ret); - } - - inline EE create_sub_buffer(Mem buffer, cl_mem_flags flags, - U32 offset, U32 size, Mem* sub) { - I32 ret; - cl_buffer_region region = { offset, size}; - *sub = clCreateSubBuffer(buffer, flags, CL_BUFFER_CREATE_TYPE_REGION, &region, &ret); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_read_buffer(CommandQueue queue, Mem buffer, cl_bool blocking, - U32 offset, U32 size, void* ptr, - U32 num_wait_events, const Event* wait_events, Event* event) { - I32 ret = clEnqueueReadBuffer(queue, buffer, blocking, - offset, size, ptr, num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - - /* - inline 
EE enqueue_read_buffer_rect(CommandQueue queue, Mem buffer, cl_bool blocking, - const U32 *buffer_origin, const U32 *host_origin, const U32 *region, - U32 buffer_row_pitch, U32 buffer_slice_pitch, U32 host_row_pitch, - U32 host_slice_pitch, void *ptr, U32 num_wait_events, - const Event *wait_events, Event *event) { - - I32 ret = clEnqueueReadBufferRect(queue, buffer, blocking, - buffer_origin, host_origin, region, - buffer_row_pitch, buffer_slice_pitch, host_row_pitch, - host_slice_pitch, ptr, num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } -*/ - inline EE enqueue_write_buffer(CommandQueue queue, Mem buffer, cl_bool blocking, - U32 offset, U32 size, const void *ptr, U32 num_wait_events, - const Event *wait_events, Event *event) { - - I32 ret = clEnqueueWriteBuffer(queue, buffer, blocking, - offset, size, ptr, num_wait_events, - wait_events, event); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_fill_buffer(CommandQueue queue, Mem buffer, const void *pattern, - U32 pattern_size, U32 offset, U32 size, U32 num_wait_events, - const Event *wait_events, Event *event) { - size_t pat_size = pattern_size; - size_t off = offset; - size_t si = size; - I32 ret = clEnqueueFillBuffer(queue, buffer, pattern, pat_size, off, si, num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_write_buffer_rect(CommandQueue queue, Mem buffer, cl_bool blocking_write, const U32 *buffer_origin, const U32 *host_origin, - const U32 *region, U32 buffer_row_pitch, U32 buffer_slice_pitch, U32 host_row_pitch, U32 host_slice_pitch, const void *ptr, - U32 num_wait_events, const Event *wait_events, Event *event) { - size_t b_ori[3]; - size_t h_ori[3]; - size_t reg[3]; - size_t b_rp = buffer_row_pitch; - size_t b_sp = buffer_slice_pitch; - size_t h_rp = host_row_pitch; - size_t h_sp = host_slice_pitch; - for(U32 i = 0; i < 3; i++) { - b_ori[i] = buffer_origin[i]; - h_ori[i] = host_origin[i]; - reg[i] = region[i]; - } - I32 ret = clEnqueueWriteBufferRect(queue, buffer, blocking_write, b_ori, h_ori, reg, b_rp, b_sp, h_rp, h_sp, - ptr, num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_copy_buffer(CommandQueue queue, Mem src_buffer, Mem dst_buffer, - U32 src_offset, U32 dst_offset, U32 size, U32 num_wait_events, - const Event *wait_events, Event *event){ - I32 ret = clEnqueueCopyBuffer(queue, src_buffer, dst_buffer, - src_offset, dst_offset, size, - num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - - /* - EE enqueue_copy_buffer_rect(CommandQueue queue, Mem src_buffer, Mem dst_buffer, - const U32 *src_origin, const U32 *dst_origin, const U32 *region, - U32 src_row_pitch, U32 src_slice_pitch, U32 dst_row_pitch, - U32 dst_slice_pitch, U32 num_wait_events, - const Event *wait_events, Event *event) { - I32 ret = clEnqueueCopyBufferRect(queue, src_buffer, dst_buffer, - const size_t *src_origin, const size_t *dst_origin, const size_t *region, - src_row_pitch, src_slice_pitch, dst_row_pitch, - dst_slice_pitch, num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - */ - - inline EE enqueue_map_buffer(CommandQueue queue, Mem buffer, cl_bool blocking_map, - cl_map_flags map_flags, U32 offset, U32 size, - U32 num_wait_events, const Event *wait_events, Event *event, - void* *ptr) { - I32 ret; - *ptr = clEnqueueMapBuffer(queue, buffer, blocking_map, map_flags, offset, size, - num_wait_events, wait_events, event, &ret); - map_cl_error_2_ee(ret); - } - - inline EE create_image(Context context, cl_mem_flags flags, 
const cl_image_format *image_format, - const cl_image_desc *image_desc, void *host_ptr, Mem* mem) { - I32 ret; - *mem = clCreateImage(context, flags, image_format, image_desc, host_ptr, &ret); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_read_image(CommandQueue queue, Mem image, cl_bool blocking_read, - const U32 *origin, const U32 *region, U32 row_pitch, U32 slice_pitch, - void *ptr, U32 num_wait_events, const Event *wait_events, - Event *event) { - size_t org [3]; - size_t reg [3]; - for(U32 i = 0; i < 3; ++i){ - org[i] = (size_t)origin[i]; - reg[i] = (size_t)region[i]; - } - I32 ret = clEnqueueReadImage(queue, image, blocking_read, org, reg, row_pitch, slice_pitch, - ptr, num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_write_image(CommandQueue queue, Mem image, cl_bool blocking_write, - const U32 *origin, const U32 *region, U32 input_row_pitch, - U32 input_slice_pitch, const void *ptr, U32 num_wait_events, - const Event *wait_events, Event *event) { - size_t org [3]; - size_t reg [3]; - for(U32 i = 0; i < 3; ++i){ - org[i] = (size_t)origin[i]; - reg[i] = (size_t)region[i]; - } - I32 ret = clEnqueueWriteImage(queue, image, blocking_write, org, reg, input_row_pitch, - input_slice_pitch, ptr, num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_fill_image(CommandQueue queue, Mem image, const void *fill_color, - const U32 *origin, const U32 *region,U32 num_wait_events, - const Event *wait_events, Event *event) { - size_t org [3]; - size_t reg [3]; - for(U32 i = 0; i < 3; ++i){ - org[i] = (size_t)origin[i]; - reg[i] = (size_t)region[i]; - } - I32 ret = clEnqueueFillImage(queue, image, fill_color, - org, reg, num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_copy_image_to_buffer(CommandQueue queue, Mem src_image, Mem dst_buffer, - const U32 *src_origin, const U32 *region, U32 dst_offset, - U32 num_wait_events, const cl_event *wait_events, cl_event *event) { - size_t org [3]; - size_t reg [3]; - for(U32 i = 0; i < 3; ++i){ - org[i] = (size_t)src_origin[i]; - reg[i] = (size_t)region[i]; - } - I32 ret = clEnqueueCopyImageToBuffer(queue, src_image, dst_buffer, org, reg, dst_offset, - num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_copy_buffer_to_image(CommandQueue queue, Mem src_buffer, Mem dst_image, - U32 src_offset, const U32 *dst_origin, const U32 *region, - U32 num_wait_events, const cl_event *wait_events, cl_event *event) { - size_t org [3]; - size_t reg [3]; - for(U32 i = 0; i < 3; ++i){ - org[i] = (size_t)dst_origin[i]; - reg[i] = (size_t)region[i]; - } - I32 ret = clEnqueueCopyBufferToImage(queue, src_buffer, dst_image, - src_offset, org, reg, num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } -/* - - EE enqueue_copy_image(CommandQueue queue, Mem src_image, Mem dst_image, - const U32 *src_origin, const U32 *dst_origin, const U32 *region, - U32 num_wait_events, const cl_event *wait_events, cl_event *event) { - I32 ret = clEnqueueCopyImage(queue, src_image, dst_image, - const size_t *src_origin, const size_t *dst_origin, const size_t *region, - num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - - - - EE enqueue_map_image(CommandQueue queue, Mem image, cl_bool blocking_map, - cl_map_flags map_flags, const U32 *origin, const U32 *region, - U32 *image_row_pitch, U32 *image_slice_pitch, U32 num_wait_events, - const cl_event *wait_events, cl_event *event, void* *ptr) { - I32 ret; - *ptr = 
clEnqueueMapImage(queue, image, blocking_map, - map_flags, const size_t *origin, const size_t *region, - size_t *image_row_pitch, size_t *image_slice_pitch, - num_wait_events, wait_events, event, &ret); - map_cl_error_2_ee(ret); - } -*/ - - inline EE create_sampler(Context context, const cl_sampler_properties* properties, Sampler *s) { - I32 ret; - *s = clCreateSamplerWithProperties(context, properties, &ret); - map_cl_error_2_ee(ret); - } - - inline EE retain_sampler(Sampler s) { - I32 ret = clRetainSampler(s); - map_cl_error_2_ee(ret); - } - - inline EE release_sampler(Sampler s) { - I32 ret = clReleaseSampler(s); - map_cl_error_2_ee(ret); - } - - inline EE get_sampler_info(Sampler s, - cl_sampler_info info, - void** value, size_t *len) { - if(NULL == value) return NULL_POINTER; - - size_t size; - I32 ret = clGetSamplerInfo(s, info, 0, NULL, &size); - if(CL_SUCCESS == ret) { - if(NULL != len) *len = size; - void* data = malloc(size); - if(NULL == data) return NULL_POINTER; - ret = clGetSamplerInfo(s, info, size, data, NULL); - if(CL_SUCCESS == ret) *value = data; - } - - map_cl_error_2_ee(ret); - } - - inline EE get_memory_size(Mem memory, U32* size){ - size_t len; - int ret = clGetMemObjectInfo(memory, CL_MEM_SIZE, sizeof(len), &len, NULL); - *size = len; - map_cl_error_2_ee(ret); - } -#ifdef __cplusplus -} -#endif - -#endif diff --git a/gcl/include/platform.h b/gcl/include/platform.h deleted file mode 100644 index d2726514..00000000 --- a/gcl/include/platform.h +++ /dev/null @@ -1,397 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
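The platform.h wrappers deleted below all follow OpenCL's two-call convention (query the size first, then fetch the data), so device discovery collapses to a few lines. A sketch, assuming an ARM GPU platform is present; error handling is reduced to CHECK_STATUS:

    // Sketch only: pick the ARM platform and enumerate its GPU devices
    // using the helpers defined in the (deleted) header below.
    inline EE list_arm_gpus()
    {
        Platform platform;
        CHECK_STATUS(select_platform(VENDOR_ARM, &platform));

        U32 num_devices = 0;
        Device* devices = NULL;
        CHECK_STATUS(platform_get_devices(platform, CL_DEVICE_TYPE_GPU, &num_devices, &devices));

        for (U32 i = 0; i < num_devices; i++) {
            CHECK_STATUS(list_device_info(devices[i]));  // dump the attribute table
        }
        free(devices);  // platform_get_devices mallocs the array; caller frees
        return SUCCESS;
    }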
- - - -#ifndef _H_PLATFORM -#define _H_PLATFORM - -#include <string.h> -#include <string> - -#if defined(__cplusplus) -extern "C" { -#endif - - typedef enum { - VENDOR_ARM = 0, - } PlatformVendor; - - inline EE get_platforms(U32 *numPlatforms, Platform** platforms){ - if(NULL == platforms || NULL == numPlatforms) return NULL_POINTER; - U32 num; - I32 ret = clGetPlatformIDs (0, NULL, &num); - if(SUCCESS == ret){ - *numPlatforms = num; - Platform *p = (Platform*)malloc(num * sizeof(Platform)); - if(NULL == p) return ALLOC_FAILED; - - ret = clGetPlatformIDs(num, p, NULL); - if(SUCCESS != ret) { free(p); } else { *platforms = p; } - } - - map_cl_error_2_ee(ret); - } - - static cl_bool stringContains(char* big, const char* s) { - for(unsigned int i = 0; i < strlen(big); i++) big[i] = tolower(big[i]); - std::string str(big); - return std::string::npos != str.find(s); - } - - /** - * @brief get information from platform - * - * @param value value associated with info; memory is allocated by this - * function - * @param len the length of value, returned by this function - * - **/ - - inline EE get_platform_info(Platform platform, - cl_platform_info info, - void** value, U32 *len){ - if(NULL == len || NULL == value) return NULL_POINTER; - size_t sizeRet; - I32 ret = clGetPlatformInfo(platform, info, 0, NULL, &sizeRet); - if(CL_SUCCESS == ret){ - if(len) *len = (U32)sizeRet; - void* data = malloc(sizeRet+1); - if(NULL == data){ return ALLOC_FAILED; } - - ret = clGetPlatformInfo(platform, info, sizeRet+1, data, NULL); - if(CL_SUCCESS != ret){ free(data); } else { *value = data; } - } - - map_cl_error_2_ee(ret); - } - - /** - * @brief select platform by vendor - * - * @param vendor the vendor of the platform we want - * @param platform output, the selected platform - * - **/ - inline EE select_platform(PlatformVendor vendor, Platform* platform) { - if(NULL == platform) return NULL_POINTER; - - const static char* key[] = {"arm", "qualcomm"}; - U32 num_platforms; - Platform* platforms; - EE ret = get_platforms(&num_platforms, &platforms); - if(SUCCESS == ret) { - const char* platform_vendor = key[vendor]; - for(U32 i = 0; i < num_platforms; i++) { - Platform p = platforms[i]; - U32 nameLen; - char *name; - ret = get_platform_info(p, CL_PLATFORM_NAME, (void**)&name, &nameLen); - if(SUCCESS == ret) { - if(stringContains(name, platform_vendor)) *platform = p; - free(name); - } - } - } - free(platforms); - - map_cl_error_2_ee(ret); - } - -#define CHAR_PLATFORM_INFO(info, str) {\ - EE ret = get_platform_info(p, info, &value, &len); \ - if(SUCCESS == ret){\ - char* tmp = (char*) value;\ - tmp[len] = '\0';\ - printf(str": %s\n", tmp);\ - free(value);\ - }else{ map_cl_error_2_ee(ret);}\ -} - -/** - * @brief list information about platform - * - */ -inline EE list_platform_info(Platform p){ - void* value; - U32 len; - - CHAR_PLATFORM_INFO(CL_PLATFORM_PROFILE, "\t Profile"); - CHAR_PLATFORM_INFO(CL_PLATFORM_VERSION, "\t Version "); - CHAR_PLATFORM_INFO(CL_PLATFORM_NAME, "\t Name "); - CHAR_PLATFORM_INFO(CL_PLATFORM_VENDOR, "\t Vendor "); - CHAR_PLATFORM_INFO(CL_PLATFORM_EXTENSIONS, "\t Extensions "); - - return SUCCESS; -} - -/** - * @brief get devices in platform, and allocate space for storing devices - * @warning please free space of devices allocated in this function - * - * @param p input, specify platform, device will be retrieved from this platform - * @param type input, specify device type - * @param num_devices output, return 
device number with type in platform p - * @param devices output, return devices - * - * @return - * 0 means success - * -1 means fail - * - */ -inline EE platform_get_devices(Platform platform, - cl_device_type type, U32 *num_devices, Device **devices){ - if(NULL == devices || NULL == num_devices) return NULL_POINTER; - - U32 num; - I32 ret = clGetDeviceIDs(platform, type, 0, NULL, &num); - if(CL_SUCCESS == ret) { - *num_devices = num; - - Device *did = (Device*) malloc(num*sizeof(Device)); - if(NULL == did) return ALLOC_FAILED; - - ret = clGetDeviceIDs(platform, type, num, did, NULL); - if(CL_SUCCESS != ret){ free(did);} else { *devices = did;} - } - map_cl_error_2_ee(ret); -} - -inline EE create_sub_device(Device device, - const cl_device_partition_property* properties, - U32* num_devices, Device** devices) { - U32 len; - I32 ret = clCreateSubDevices(device, properties, 0, NULL, &len); - if(CL_SUCCESS == ret) { - if(NULL != num_devices) *num_devices = len; - Device *d = (Device*) malloc(sizeof(Device)*len); - if(NULL == d) return ALLOC_FAILED; - ret = clCreateSubDevices(device, properties, len, d, NULL); - if(CL_SUCCESS == ret) { *devices = d; } else { free(d); } - } - map_cl_error_2_ee(ret); -} - -inline EE retain_device(Device device) { - I32 ret = clRetainDevice(device); - map_cl_error_2_ee(ret); -} - -inline EE release_device(Device device) { - I32 ret = clReleaseDevice(device); - map_cl_error_2_ee(ret); -} - -/** - * - *@brief get device information - * - * @warning please free memory space allocated for value - * - **/ - -inline EE get_device_info(Device device, cl_device_info info, - void** value, U32 *len) { - if(NULL == value) return NULL_POINTER; - - size_t size; - I32 ret = clGetDeviceInfo(device, info, 0, NULL, &size); - if(CL_SUCCESS == ret) { - if(NULL != len) *len = (U32)(size); - void* data = malloc(size); - if(NULL == data) return ALLOC_FAILED; - ret = clGetDeviceInfo(device, info, size, data, NULL); - if(CL_SUCCESS != ret) { free(data); } else { *value = data; } - } - - map_cl_error_2_ee(ret); -} - -#define V_Q_Info(device, info, type, str, modifier) {\ - type v;\ - I32 ret = clGetDeviceInfo(device, info, sizeof(type), &v, NULL);\ - if(CL_SUCCESS != ret){\ - map_cl_error_2_ee(ret);\ - }\ - \ - printf(str "%" modifier "\n", v);\ -} - -#define B_Q_Info(device, info, str) {\ - cl_bool v;\ - I32 ret = clGetDeviceInfo(device, info, sizeof(cl_bool), &v, NULL);\ - if(CL_SUCCESS != ret){\ - map_cl_error_2_ee(ret);\ - }\ - \ - printf(str "%s\n", v? 
"Yes" : "NO");\ -} - -#define STR_Q_Info(device, info, str) {\ - size_t len;\ - I32 ret = clGetDeviceInfo(device, info, 0, NULL, &len);\ - if(SUCCESS != ret){\ - map_cl_error_2_ee(ret);\ - }\ - \ - char* v = (char*) malloc(len+1);\ - ret = clGetDeviceInfo(device, info, len, v, NULL);\ - if(SUCCESS != ret){\ - map_cl_error_2_ee(ret);\ - }\ - \ - v[len] = '\0';\ - printf(str"%s\n", v);\ - free(v);\ - \ -} - - -/** - * @brief list all attributes of device - * - * @param device input - * - * @return - * 0 : success - * -1: error - */ -inline EE list_device_info(Device device){ - printf("..........Device Info..............\n"); - STR_Q_Info(device, CL_DEVICE_NAME, "Device name : "); - V_Q_Info(device, CL_DEVICE_ADDRESS_BITS, U32, "Address Bits : ", "u"); - B_Q_Info(device, CL_DEVICE_AVAILABLE, "Device Available : "); - B_Q_Info(device, CL_DEVICE_COMPILER_AVAILABLE, "Device Compiler Available : "); - B_Q_Info(device, CL_DEVICE_ENDIAN_LITTLE, "Device is little Endian : "); - B_Q_Info(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC Supported : "); - STR_Q_Info(device, CL_DEVICE_EXTENSIONS, "Device Extensions : "); - STR_Q_Info(device, CL_DEVICE_OPENCL_C_VERSION, "OpenCL C Version : "); - STR_Q_Info(device, CL_DEVICE_PROFILE, "Device Profile : "); - V_Q_Info(device, CL_DEVICE_PROFILING_TIMER_RESOLUTION, size_t, "Timer Resolution : ", "ld"); - { cl_device_fp_config v; - I32 ret = clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &v, NULL); - if(CL_SUCCESS != ret){ - map_cl_error_2_ee(ret); - } - - if(v & CL_FP_DENORM){printf("Device Support Denorm Single Float \n");} - if(v & CL_FP_INF_NAN){printf("Device Support Single Float INF NAN\n");} - if(v & CL_FP_ROUND_TO_NEAREST){printf("Device Support Single Float Round to Nearest\n");} - if(v & CL_FP_ROUND_TO_ZERO){printf("Device Support Single Float Round to Zero \n");} - if(v & CL_FP_ROUND_TO_INF){printf("Device Support Single Float Round to Inf\n");} - if(v & CL_FP_FMA){printf("Device Support Single Float FMA\n");} - if(v & CL_FP_SOFT_FLOAT){printf("Device does not Support Hardware Single Float\n");} - } - - STR_Q_Info(device, CL_DEVICE_VENDOR, "Device Vendor : "); - V_Q_Info(device, CL_DEVICE_VENDOR_ID, U32, "Device Vendor ID : ", "u"); - STR_Q_Info(device, CL_DEVICE_VERSION, "Device Version : "); - STR_Q_Info(device, CL_DRIVER_VERSION, "Driver Version : "); - B_Q_Info(device, CL_DEVICE_HOST_UNIFIED_MEMORY, "Unified Memory Supported : "); - V_Q_Info(device, CL_DEVICE_MAX_PARAMETER_SIZE, size_t, "Max Parameter Size : ", "ld"); - - printf("..............Global Memory Configuration.............\n"); - V_Q_Info(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong, "Max Memory Allocate Size : ", "lu"); - V_Q_Info(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, U32, "Max Base Address Align Size : ", "u"); - V_Q_Info(device, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, U32, "Min Data Type align Size :", "u"); - - V_Q_Info(device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong, "Global Memory Cache Size : ", "lu"); - { cl_device_mem_cache_type v; - I32 ret = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(cl_device_mem_cache_type), &v, NULL); - if(CL_SUCCESS != ret){ - map_cl_error_2_ee(ret); - } - switch(v) { - case CL_NONE: printf("Global Memory does not have Cache \n"); break; - case CL_READ_ONLY_CACHE : printf("Global Memory has Readonly Cache \n"); break; - case CL_READ_WRITE_CACHE : printf("Global Memory has Read Write Cache \n"); break; - } - } - - V_Q_Info(device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, U32, "Global Memory, Cacheline 
Size : ", "u"); - V_Q_Info(device, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong, "Global Memory Size : ", "lu"); - //CL_DEVICE_HALF_FP_CONFIG - - printf("..................Image Information...................\n"); - B_Q_Info(device, CL_DEVICE_IMAGE_SUPPORT, "Image Supported : "); - V_Q_Info(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, size_t, "2D Image Max Height : ", "ld"); - V_Q_Info(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, size_t, "2D Image Max Width : ", "ld"); - V_Q_Info(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, size_t, "3D Image Max Depth : ", "ld"); - V_Q_Info(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, size_t, "3D Image Max Height : ", "ld"); - V_Q_Info(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, size_t, "3D Image Max Width : ", "ld"); - V_Q_Info(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, U32, "Max Read Image Args : ", "u"); - V_Q_Info(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, U32, "Max Write Image Args : ", "u"); - V_Q_Info(device, CL_DEVICE_MAX_SAMPLERS, U32, "Max Samples : ", "u"); - - printf(".................Local Memory...............................\n"); - V_Q_Info(device, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong, "Local Memory Size : ", "lu"); - { cl_device_local_mem_type v; - I32 ret = clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(cl_device_local_mem_type), &v, NULL); - if(CL_SUCCESS != ret){ - map_cl_error_2_ee(ret); - } - switch(v) { - case CL_LOCAL: printf("Device has Dedicate Local Memory\n"); break; - case CL_GLOBAL : printf("Local Memory uses Global Memory\n"); break; - default: - printf("%d\n", __LINE__); - } - } - - printf("...................CU Information...........................\n"); - V_Q_Info(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, U32, "Max Clock Frequency : ", "u"); - V_Q_Info(device, CL_DEVICE_MAX_COMPUTE_UNITS, U32, "Max Compute Units : ", "u"); - - printf(".................Constant Memory Information.............\n"); - V_Q_Info(device, CL_DEVICE_MAX_CONSTANT_ARGS, U32, "Max Constant Args : ", "u"); - V_Q_Info(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong, "Max Constant Buffer Size : ", "lu"); - - printf("...................ND Range Information........................\n"); - V_Q_Info(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t, "Max Work Group Size : ", "ld"); - V_Q_Info(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, U32, "Work Item Dimensions : ", "u"); - - { size_t v[3]; - I32 ret = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, &v, NULL); - if(CL_SUCCESS != ret){ - map_cl_error_2_ee(ret); - } - printf("Max Work Item size : %ld %ld %ld\n", v[0], v[1], v[2]); - } - - printf(".....................Vector Information..................\n"); - V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, U32, "Native Vector Width Char : ", "u"); - V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, U32, "Native Vector Width Short : ", "u"); - V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, U32, "Native Vector Width Int : ", "u"); - V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, U32, "Native Vector Width Long : ", "u"); - V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, U32, "Native Vector Width Float : ", "u"); - V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, U32, "Native Vector Width Double : ", "u"); - V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, U32, "Native Vector Width Half : ", "u"); - - V_Q_Info(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, U32, "Preferred Vector Width Char : ", "u"); - V_Q_Info(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, U32, "Preferred Vector Width Short : ", "u"); - V_Q_Info(device, 
CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, U32, "Preferred Vector Width Int : ", "u"); - V_Q_Info(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, U32, "Preferred Vector Width Long : ", "u"); - V_Q_Info(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, U32, "Preferred Vector Width Float : ", "u"); - V_Q_Info(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, U32, "Preferred Vector Width Double : ", "u"); - V_Q_Info(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, U32, "Preferred Vector Width Half : ", "u"); - - return SUCCESS; - -} - -#if defined(__cplusplus) -} -#endif -#endif diff --git a/gcl/include/program.h b/gcl/include/program.h deleted file mode 100644 index 39ae2e22..00000000 --- a/gcl/include/program.h +++ /dev/null @@ -1,238 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
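[Editor's note] program.h below wraps clCreateProgramWithSource / clCreateProgramWithBinary / clBuildProgram into a compile-once, cache-the-binary workflow: build a program from source, extract the device binary with get_program_binary, and on later runs rebuild directly from that binary. A minimal sketch of the round trip, using only the helpers defined below (Context, Device, Program, EE, U8/U32/I32, CI8/CU8, and SUCCESS come from the surrounding GCL headers; error handling trimmed):

// Sketch: compile from source once, capture the device binary, then
// reload from the cached binary -- the flow the kernel_lib_compile tools rely on.
inline EE build_once_then_reload(Context ctx, Device dev, CI8 *src, U32 srcLen)
{
    Program fromSrc;
    EE ret = create_build_program_from_source(ctx, &srcLen, src, dev, "", &fromSrc);
    if (SUCCESS != ret) return ret;

    U8 *binary = NULL;
    U32 binLen = 0;
    ret = get_program_binary(fromSrc, &binary, &binLen);  // caller owns 'binary'
    if (SUCCESS != ret) return ret;

    I32 status;
    Program fromBin;
    ret = create_build_program_from_binary(ctx, dev, &binLen, (CU8 **)&binary, "", &status, &fromBin);
    free(binary);
    return ret;
}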
- - - -#ifndef PROGRAM_H_ -#define PROGRAM_H_ - -#ifdef __cplusplus -extern "C" { -#endif -#define check_build_program_error(ret, program, device) {\ - if(SUCCESS != ret){\ - void* buildLog; \ - U32 buildLogSize;\ - ret = get_program_build_info(program, device, CL_PROGRAM_BUILD_LOG, &buildLog, &buildLogSize);\ - if(SUCCESS == ret) { \ - printf("build log:\n%s\n", (char*)buildLog);\ - free(buildLog);\ - }\ - }\ -} - /** - * @brief get build information of a program - * @warning the caller must free the memory associated with value - **/ - - inline EE get_program_build_info(Program program, - Device device, - cl_program_build_info info, - void* *value, U32 *size) { - if(NULL == value) return NULL_POINTER; - - size_t len; - I32 ret = clGetProgramBuildInfo(program, device, info, 0, NULL, &len); - if(CL_SUCCESS == ret) { - if(NULL != size) *size = len; - void* data = malloc(len); - if(NULL == data) return ALLOC_FAILED; - ret = clGetProgramBuildInfo(program, device, info, len, data, NULL); - if(CL_SUCCESS == ret) { *value = data; } else { free(data); } - } - - map_cl_error_2_ee(ret); - } - - /** - * @brief create program from source code - * - * @param context input, the associated context - * @param len input, length of the source string - * @param str input, source code - * @param program output, created program - * - **/ - - inline EE create_program_from_source(Context context, U32* len, CI8* str, Program *program) { - I32 ret; - size_t length = (size_t)(*len); - *program = clCreateProgramWithSource(context, 1, &str, &length, &ret); - map_cl_error_2_ee(ret); - } - - /** - * @brief create program from binary code - * - * @param context input, the associated context - * @param device input, the device to load the binary for - * @param length input, byte length of the binary - * @param binary input, program binary - * @param binary_status output, load status for the device - * @param program output, created program - * - **/ - - inline EE create_program_from_binary(Context context, const Device device, - U32* length, CU8 **binary, I32 *binary_status, Program *program) { - I32 ret; - size_t len = *length; - *program = clCreateProgramWithBinary(context, 1, &device, &len, binary, binary_status, &ret); - map_cl_error_2_ee(ret); - } - - /** - * @brief build program - * - **/ - - inline EE build_program(Program program, Device device, CI8 *options) { - I32 ret = clBuildProgram(program, 1, &device, options, NULL, NULL); - if(CL_SUCCESS != ret) check_build_program_error(ret, program, device); - map_cl_error_2_ee(ret); - } - - /** - * @brief create program from source then build it - * - * @param context input, the associated context - * @param length input, length of the source string - * @param source input, source code - * @param device input, the device the source is built for - * @param options input, options for compiling the source - * @param program output, created and built program - * - */ - - inline EE create_build_program_from_source(Context context, U32* length, CI8* source, Device device, CI8* options, Program *program) { - if(NULL == program) return NULL_POINTER; - Program prog; - EE ret = create_program_from_source(context, length, source, &prog); - if(SUCCESS != ret) return ret; - ret = build_program(prog, device, options); - *program = prog; - return ret; - } - - /** - * @brief create program from binary then build it - * - **/ - - inline EE create_build_program_from_binary(Context context, Device device, U32* length, CU8** binary, CI8* options, I32 *binary_status, Program *program) { - if(NULL == program) return NULL_POINTER; - Program prog; - EE ret = create_program_from_binary(context, device, length, binary, binary_status, &prog); - if(SUCCESS != ret) return ret; - ret = build_program(prog, device, options); - *program = prog; - return ret; - } - - /** - * @brief get information of a program - * @warning the caller must free the memory associated with value - **/ - - inline EE get_program_info(Program program, cl_program_info info, void* *value, U32 *size) { - if(NULL == value) return NULL_POINTER; - size_t len; - I32 ret = clGetProgramInfo(program, info, 0, NULL, &len); - if(CL_SUCCESS == ret) { - if(NULL != size) *size = len; - void* data = malloc(len); - if(NULL == data) return ALLOC_FAILED; - ret = clGetProgramInfo(program, info, len, data, NULL); - if(CL_SUCCESS == ret) { *value = data;} else { free(data); } - } - map_cl_error_2_ee(ret); - } - - /** - * @brief get the binary of a program - * @warning the caller must free the returned binary - **/ - inline EE get_program_binary(Program program, U8* *binary, U32 *len) { - size_t size; - I32 ret = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); - if(CL_SUCCESS == ret){ - *len = (U32)(size); - void* data = malloc(size); - if(NULL == data) return ALLOC_FAILED; - ret = clGetProgramInfo(program, CL_PROGRAM_BINARIES, size, &data, NULL); // warning: CL_PROGRAM_BINARIES expects an array of buffer pointers, so pass &data - if(CL_SUCCESS == ret ){*binary = (U8*)(data);} - else{free(data);} - } - map_cl_error_2_ee(ret); - } - - /** - * @brief get the binary compiled from source code - * - * @warning the caller owns the returned binary and must free it - * - **/ - - inline EE get_program_binary_from_source(Context context, U32* length, CI8* str, Device device, CI8* options, U8* *binary, U32 *len) { - if(NULL == binary) return NULL_POINTER; - - Program program; - EE ret = create_build_program_from_source(context, length, str, device, options, &program); - if(SUCCESS == ret) { ret = get_program_binary(program, binary, len); } - return ret; - } - -/* -inline EE create_program_from_il(Context context, - const void *il, U32 length, Program *program) { -//TODO - I32 ret; - *program = clCreateProgramWithIL(context, il, length, &ret); - map_cl_error_2_ee(ret); -} -*/ - - inline EE release_program(Program program) { - map_cl_error_2_ee(clReleaseProgram(program)); - } - - inline EE compile_program(Program program, - const Device device, - CI8 *options, U32 num_input_headers, const Program *input_headers, - CI8 **header_include_names) { - I32 ret = clCompileProgram(program, 1, &device, - options, num_input_headers, input_headers, header_include_names, - NULL, NULL); - map_cl_error_2_ee(ret); - } - - inline EE link_program(Context context, - const Device device, - CI8* options, U32 num_input_programs, - const Program *input_programs, Program* program) { - I32 ret; - *program = clLinkProgram(context, 1, &device, options, num_input_programs, input_programs, - NULL, NULL, &ret); - map_cl_error_2_ee(ret); - } - - inline EE unload_platform_compiler(Platform p) { - I32 ret = clUnloadPlatformCompiler(p); - map_cl_error_2_ee(ret); - } - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/gcl/tools/device_info/CMakeLists.txt b/gcl/tools/device_info/CMakeLists.txt deleted file mode 100644 index e96015bc..00000000 --- a/gcl/tools/device_info/CMakeLists.txt +++ /dev/null @@ -1,30 +0,0 @@ -cmake_minimum_required(VERSION 3.2) - -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) -if (BOLT_CONFIGURE_FILE) - set(USE_LLVM_CLANG ON) - set(USE_GNU_GCC OFF) - set(USE_MALI ON) - set(USE_DYNAMIC_LIBRARY OFF) - include(${BOLT_CONFIGURE_FILE}) -else (BOLT_CONFIGURE_FILE) - 
message(FATAL_ERROR " -FATAL: can not find bolt.cmake in directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (BOLT_CONFIGURE_FILE) - -project(gclinfo) - -set_policy() - -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") -find_package(Gcl) -find_package(Uni) - -set_c_cxx_flags() - -set_test_c_cxx_flags() - -add_executable(gcl_info clinfo.cpp) -TARGET_LINK_LIBRARIES(gcl_info ${OPENCL_LIBRARIES}) diff --git a/gcl/tools/gcl_sample/CMakeLists.txt b/gcl/tools/gcl_sample/CMakeLists.txt deleted file mode 100644 index 625209b8..00000000 --- a/gcl/tools/gcl_sample/CMakeLists.txt +++ /dev/null @@ -1,30 +0,0 @@ -cmake_minimum_required(VERSION 3.4.1) - -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) -if (BOLT_CONFIGURE_FILE) - set(USE_LLVM_CLANG ON) - set(USE_GNU_GCC OFF) - set(USE_MALI ON) - set(USE_DYNAMIC_LIBRARY OFF) - include(${BOLT_CONFIGURE_FILE}) -else (BOLT_CONFIGURE_FILE) - message(FATAL_ERROR " -FATAL: can not find bolt.cmake in directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (BOLT_CONFIGURE_FILE) - -project(sample) - -set_policy() - -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") -find_package(Gcl) -find_package(Uni) - -set_c_cxx_flags() - -set_test_c_cxx_flags() - -add_executable(sample sample.cpp) -target_link_libraries(sample ${KERNELBIN_LIBRARIES} ${OPENCL_LIBRARIES}) diff --git a/gcl/tools/gcl_sample/sample.cpp b/gcl/tools/gcl_sample/sample.cpp deleted file mode 100644 index 2102b25f..00000000 --- a/gcl/tools/gcl_sample/sample.cpp +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - - -#include"gcl.h" -#include"libkernelbin.h" - - -void setMemDesc(GCLMem_t mem, DataType dt, DataFormat ft, GCLMemType mt, - U32 s0, U32 s1, U32 s2, U32 off0, U32 off1, U32 off2){ - mem->desc.stride[0] = s0 + 2 * off0; - mem->desc.stride[1] = s1 + 2 * off1; - mem->desc.stride[2] = s2; - mem->desc.offset[0] = off0; - mem->desc.offset[1] = off1; - mem->desc.offset[2] = off2; - mem->desc.num = s0 * s1 * s2; - mem->desc.byteSize = s0 * s1 * s2 * bytesOf(dt); - mem->desc.memFormat = ft; - mem->desc.memType = mt; -} - -int main(){ -while(1) { - GCLHandle_t handle; - CHECK_STATUS(gcl_create_handle(&handle)); - CHECK_STATUS(gcl_regist_binMap(handle)); - U32 iw, ih, ic, in; - U32 fw, fh, fc, fn; - U32 sv, pv; - U32 ow, oh, oc, on; - - iw = 4; - ih = 4; - ic = 4; - in = 1; - - fw = 3; - fh = 3; - fc = 4; - fn = 4; - - ow = iw; - oh = ih; - oc = fn; - on = in; - - sv = 1; - pv = 1; - - GCLMem_t input = gcl_create_gclmem(); - GCLMem_t flt = gcl_create_gclmem(); - GCLMem_t bias = gcl_create_gclmem(); - GCLMem_t output = gcl_create_gclmem(); - setMemDesc(input, DT_F16, DF_NCHW, GCL_MEM_BUF, iw, ih, ic, pv, pv, 0); - setMemDesc(flt, DT_F16, DF_NCHW, GCL_MEM_BUF, fw * fh, fc, fn, 0, 0, 0); - setMemDesc(bias, DT_F16, DF_NCHW, GCL_MEM_BUF, fn, 1, 1, 0, 0, 0); - setMemDesc(output,DT_F16, DF_NCHW, GCL_MEM_BUF, ow, oh, oc, 0, 0, 0); - CHECK_STATUS(gcl_create_memory(handle, input)); - CHECK_STATUS(gcl_create_memory(handle, flt)); - CHECK_STATUS(gcl_create_memory(handle, bias)); - CHECK_STATUS(gcl_create_memory(handle, output)); - - U8* iptr = new U8[input->desc.byteSize]; - U8* fptr = new U8[flt->desc.byteSize]; - U8* bptr = new U8[bias->desc.byteSize]; - - F16* ival = (F16*)iptr; - F16* fval = (F16*)fptr; - F16* bval = (F16*)bptr; - for(U32 i = 0; i < input->desc.num; i++){ - ival[i] = (rand() & 1023) / 1024.0 - 0.5; - U32 s0 = input->desc.stride[0]; - U32 s1 = input->desc.stride[1]; - U32 j = i % (s0 * s1); - if((j % s0) == 0 || (j % s0) == s0 - 1) ival[i] = 0; - if( j / s0 == 0 || j / s0 == s1 - 1) ival[i] = 0; - } - - for(U32 i = 0; i < flt->desc.num; i++){ - fval[i] = (rand() & 1023) / 1024.0 - 0.5; - } - - for(U32 i = 0; i < bias->desc.num; i++){ - bval[i] = (rand() & 1023) / 1024.0 - 0.5; - } - - CHECK_STATUS(gcl_trans_memory(handle, (void*)iptr, (void*)input, &input->desc.byteSize, HOST_TO_DEVICE_BUF, CL_TRUE)); - CHECK_STATUS(gcl_trans_memory(handle, (void*)fptr, (void*)flt, &flt->desc.byteSize, HOST_TO_DEVICE_BUF, CL_TRUE)); - CHECK_STATUS(gcl_trans_memory(handle, (void*)bptr, (void*)bias, &bias->desc.byteSize, HOST_TO_DEVICE_BUF, CL_TRUE)); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, input, "input")); - CHECK_STATUS(gcl_print_memory(handle, flt, "flt")); - CHECK_STATUS(gcl_print_memory(handle, bias, "bias")); -#endif - - - Kernel kernel; - char kernelname[128]; - for(int i = 0; i < 1; i++){ - sprintf(kernelname, "sample"); - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - U32 iw_str = input->desc.stride[0]; - U32 ih_str = input->desc.stride[1]; - U32 iwh_str = iw_str * ih_str; - - U32 fwh_str = flt->desc.stride[0]; - U32 fc_str = flt->desc.stride[1]; - U32 flt_str = fwh_str * fc_str; - - U32 ow_str = output->desc.stride[0]; - U32 oh_str = output->desc.stride[1]; - U32 oc_str = output->desc.stride[2]; - U32 gs[3]; - gs[0] = ow_str; - gs[1] = oh_str; - gs[2] = oc_str; - U32 dim = 3; - U32 ls[3] = {0, 0, 0}; - CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, iwh_str, fc_str, flt_str, ow_str, oh_str, gs[0], gs[1], input->mem, flt->mem, bias->mem, output->mem)); - 
CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, "sample")); - CHECK_STATUS(gcl_run_kernelVec(handle)); - - } - -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, output, "output")); -#endif - delete[] iptr; - delete[] fptr; - delete[] bptr; - gcl_destroy_gclmem(input); - gcl_destroy_gclmem(flt); - gcl_destroy_gclmem(bias); - gcl_destroy_gclmem(output); - gcl_destroy_handle(handle); - } -} - - - diff --git a/gcl/tools/kernel_lib_compile/CMakeLists.txt b/gcl/tools/kernel_lib_compile/CMakeLists.txt deleted file mode 100644 index 807c6128..00000000 --- a/gcl/tools/kernel_lib_compile/CMakeLists.txt +++ /dev/null @@ -1,36 +0,0 @@ -cmake_minimum_required(VERSION 3.2) - - -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) -if (BOLT_CONFIGURE_FILE) - include(${BOLT_CONFIGURE_FILE}) -else (BOLT_CONFIGURE_FILE) - message(FATAL_ERROR " -FATAL: can not find bolt.cmake in directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (BOLT_CONFIGURE_FILE) - -project(KERNELBIN) - -set_policy() - -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") -find_package(Gcl) -find_package(Uni) -include_directories(${PROJECT_SOURCE_DIR}/include) -set_project_install_directory() - -set_c_cxx_flags() - -execute_process( - COMMAND bash buildKernelLib.sh - WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" -) - -file(GLOB_RECURSE kernel_src_list "src/*.cpp") -ADD_LIBRARY(kernelbin SHARED ${kernel_src_list}) -ADD_LIBRARY(kernelbin_static STATIC ${kernel_src_list}) -SET_TARGET_PROPERTIES(kernelbin_static PROPERTIES OUTPUT_NAME "kernelbin") -SET_TARGET_PROPERTIES(kernelbin PROPERTIES CLEAN_DIRECT_OUTPUT 1) -SET_TARGET_PROPERTIES(kernelbin_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) diff --git a/gcl/tools/kernel_lib_compile/device_name/CMakeLists.txt b/gcl/tools/kernel_lib_compile/device_name/CMakeLists.txt deleted file mode 100644 index b709b930..00000000 --- a/gcl/tools/kernel_lib_compile/device_name/CMakeLists.txt +++ /dev/null @@ -1,26 +0,0 @@ -cmake_minimum_required(VERSION 3.2) - -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) -if (BOLT_CONFIGURE_FILE) - include(${BOLT_CONFIGURE_FILE}) -else (BOLT_CONFIGURE_FILE) - message(FATAL_ERROR " -FATAL: can not find bolt.cmake in directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (BOLT_CONFIGURE_FILE) - -project(deviceName) - -set_policy() - -set_c_cxx_flags() - -set_test_c_cxx_flags() - -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") -find_package(Gcl) -find_package(Uni) - -add_executable(gcl_device_name device_name.cpp) -TARGET_LINK_LIBRARIES(gcl_device_name ${OPENCL_LIBRARIES} -Wl,-allow-shlib-undefined, -static-libstdc++) diff --git a/gcl/tools/kernel_lib_compile/device_name/device_name.cpp b/gcl/tools/kernel_lib_compile/device_name/device_name.cpp deleted file mode 100644 index ea7e17b5..00000000 --- a/gcl/tools/kernel_lib_compile/device_name/device_name.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - -#include"gcl.h" -#include <stdio.h> - -int main(){ - GCLHandle_t handle; - CHECK_STATUS(gcl_create_handle(&handle)); - FILE* fp = fopen("deviceBinmapNameFile", "w"); - if(NULL == fp) return -1; - fwrite(handle->deviceBinmapName.c_str(), handle->deviceBinmapName.length(), 1, fp); - fclose(fp); - gcl_destroy_handle(handle); - return 0; -} - - - diff --git a/gcl/tools/kernel_lib_compile/kernel_bin/CMakeLists.txt b/gcl/tools/kernel_lib_compile/kernel_bin/CMakeLists.txt deleted file mode 100644 index d55ee6ca..00000000 --- a/gcl/tools/kernel_lib_compile/kernel_bin/CMakeLists.txt +++ /dev/null @@ -1,26 +0,0 @@ -cmake_minimum_required(VERSION 3.2) - -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) -if (BOLT_CONFIGURE_FILE) - include(${BOLT_CONFIGURE_FILE}) -else (BOLT_CONFIGURE_FILE) - message(FATAL_ERROR " -FATAL: can not find bolt.cmake in directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (BOLT_CONFIGURE_FILE) - -project(gclBinary) - -set_policy() - -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") -find_package(Gcl) -find_package(Uni) - -set_c_cxx_flags() - -set_test_c_cxx_flags() - -add_executable(gcl_binary clbinary.cpp) -TARGET_LINK_LIBRARIES(gcl_binary ${OPENCL_LIBRARIES} -Wl,-allow-shlib-undefined, -static-libstdc++) diff --git a/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp b/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp deleted file mode 100644 index 028c03a4..00000000 --- a/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp +++ /dev/null @@ -1,175 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - -#include"gcl.h" -#include <getopt.h> - -const char *imagesource = "#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable\n"; - -const char *half16source = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; - -void printHelp() { - printf("Please follow the usual Linux command-line conventions.\n"); - printf("The program only supports compiling OpenCL kernels for now.\n"); - printf("-i or --input to specify the OpenCL input cl source file name\n"); - printf("-o or --output to specify the OpenCL output binary file name\n"); - printf("-O or --options to specify the OpenCL compiling options\n"); -} - -bool GetFileLength(CI8* filename, U32* len) { - if((NULL == filename) || (0 == strlen(filename))) return false; - FILE *fp = fopen(filename, "rb"); - if(NULL == fp) return false; - rewind(fp); - if(0 != fseek(fp, 0, SEEK_END)) { - fclose(fp); - return false; - } - *len = ftell(fp); - fclose(fp); - return true; -} - -bool LoadBinFile(CI8* filename, I8* str, U32 len) { - if((NULL == filename) || (0 == strlen(filename))) return false; - FILE *fp = fopen(filename, "rb"); - if(NULL == fp) return false; - rewind(fp); - if(len != fread(str, sizeof(char), len, fp)) { - fclose(fp); - return false; - } - fclose(fp); - return true; -} - -bool StoreToBinFile(CI8* filename, U32 length, CU8* s) { - if((NULL == s) || (NULL == filename) || (0 == strlen(filename))) return false; - FILE *fp = fopen(filename, "wb"); - if(NULL == fp) return false; - if(length != fwrite(s, sizeof(char), length, fp)) { - fclose(fp); - return false; - } - fclose(fp); - return true; -} - - -void parseCommandLine(I32 argc, I8* argv[], I8** inputFilename, I8** outputFilename, I8** options){ - const struct option long_options[] = { - {"input", 1, nullptr, 'i'}, - {"output", 1, nullptr, 'o'}, - {"options", 1, nullptr, 'O'}, - {nullptr, 0, nullptr, 0}}; - bool setInput = false; - bool setOutput = false; - bool setOptions = false; - int optionIndex = 0; - int ch; - while((ch = getopt_long(argc, argv, "i:o:O:", long_options, &optionIndex)) != -1){ - switch(ch) { - case 'i': - printf("input file name is %s\n", optarg); - *inputFilename = optarg; - if(setInput) { - printf("input file name specified twice, program will exit\n"); - exit(0); - } - setInput = true; - break; - case 'o': - printf("output file name is %s\n", optarg); - *outputFilename = optarg; - if(setOutput) { - printf("output file name specified twice, program will exit\n"); - exit(0); - } - setOutput = true; - break; - case 'O': - printf("options are %s\n", optarg); - *options = optarg; - if(setOptions) { - printf("compiling options specified twice, program will exit\n"); - exit(0); - } - setOptions = true; - break; - default: - printf("unsupported option: %c\n", ch); - } - } - if(!setInput) { - printf("no input cl file name specified, program will exit\n"); - exit(0); - } - if(!setOutput) { - printf("no output file name specified, program will exit\n"); - exit(0); - } - if(!setOptions) { - printf("no compiling options specified, using empty options by default\n"); - *options=(char*)""; - } - -} - -int main(I32 argc , I8* argv[]){ - if(1 == argc){ - printHelp(); - return 0; - } - - I8* FLAGS_inputFilename; - I8* FLAGS_outputFilename; - I8* FLAGS_options; - parseCommandLine(argc, argv,
&FLAGS_inputFilename, &FLAGS_outputFilename, &FLAGS_options); - - GCLHandle_t handle; - CHECK_STATUS(gcl_create_handle(&handle)); - U32 imageLen = 0; -#ifdef CL_VERSION_1_2 - imageLen = strlen(imagesource); -#endif - U32 half16Len = strlen(half16source); - U32 clcodeLen = 0; - bool FileStatus = GetFileLength(FLAGS_inputFilename, &clcodeLen); - if(!FileStatus) {printf("get file length failed\n");return 0;} - U32 srcLen = imageLen + half16Len + clcodeLen; - I8* source = new I8[srcLen]; -#ifdef CL_VERSION_1_2 - memcpy(source, imagesource, imageLen); -#endif - memcpy(source + imageLen, half16source, half16Len); - FileStatus = LoadBinFile(FLAGS_inputFilename, source + imageLen + half16Len, clcodeLen); - if(!FileStatus) {printf("load bin file failed\n");delete[] source; return 0;} - - Program program; - U32 numKernel = 1; - Kernel kernel; - U32 size = 0; - U8* binary; - - CHECK_STATUS(gcl_produce_program_kernel_with_source(handle, &srcLen, source, FLAGS_options, &program, numKernel, &kernel)); - CHECK_STATUS(gcl_get_program_info(program, &binary, &size)); - FileStatus = StoreToBinFile(FLAGS_outputFilename, size, binary); - if(!FileStatus) {printf("store bin file failed\n");} - free(binary); - delete[] source; - CHECK_STATUS(release_program(program)); - CHECK_STATUS(release_kernel(kernel)); - gcl_destroy_handle(handle); -} - - - diff --git a/gcl/tools/kernel_lib_compile/sh/adbDeviceNum.sh b/gcl/tools/kernel_lib_compile/sh/adbDeviceNum.sh deleted file mode 100644 index 0522b27e..00000000 --- a/gcl/tools/kernel_lib_compile/sh/adbDeviceNum.sh +++ /dev/null @@ -1,11 +0,0 @@ -adbDeviceNum=($(adb devices | grep ".device$")) -i=0 -length=${#adbDeviceNum[@]} -while [ "$i" -lt "$length" ];do - if - ((i%2!=0)) - then - unset adbDeviceNum[i] - fi - ((i++)) -done diff --git a/gcl/tools/kernel_lib_compile/sh/buildKernelBin.sh b/gcl/tools/kernel_lib_compile/sh/buildKernelBin.sh deleted file mode 100644 index 31349475..00000000 --- a/gcl/tools/kernel_lib_compile/sh/buildKernelBin.sh +++ /dev/null @@ -1,41 +0,0 @@ -#build kernel bin on device# -#if devices name are the same, the build will only execute once# -index=1 -for dNum in "${adbDeviceNum[@]}"; do - adb -s ${dNum} shell "rm -rf ${kernelBuildPath}" - adb -s ${dNum} shell "mkdir ${kernelBuildPath}" - adb -s ${dNum} push gcl_device_name ${kernelBuildPath} - adb -s ${dNum} shell "cd ${kernelBuildPath} && chmod +x gcl_device_name && ./gcl_device_name" - adb -s ${dNum} shell "cd ${kernelBuildPath} && cat ${deviceNameFile} >> ${dNum}.dn" - adb -s ${dNum} pull ${kernelBuildPath}/${dNum}.dn ${namePath} - dname=$(awk '{print $1}' ${namePath}/${dNum}.dn) - deviceNamesAll[$index]="${dname}" - dnameS=0 - for((j=1;j tmp.sh && chmod +x tmp.sh && ./tmp.sh" - done - adb -s ${dNum} shell "cd ${kernelBuildPath} && mkdir bin" - adb -s ${dNum} shell "cd ${kernelBuildPath} && cp *.bin ${kernelBuildPath}/bin" - adb -s ${dNum} pull ${kernelBuildPath}/bin/ ${binPath}/${dname} - adb -s ${dNum} shell "rm -rf ${kernelBuildPath}" - echo ${dname} >> ${dNameFile} - fi - index=`expr $index + 1` -done diff --git a/gcl/tools/kernel_lib_compile/sh/compile/common.sh b/gcl/tools/kernel_lib_compile/sh/compile/common.sh deleted file mode 100644 index 3fe7ab58..00000000 --- a/gcl/tools/kernel_lib_compile/sh/compile/common.sh +++ /dev/null @@ -1,22 +0,0 @@ -for file in * - do - if [ "${file##*.}"x = "cl"x ];then - clFileName=${file%.*} - speConfig=0 - for filesh in `ls sh` - do - if [ "${filesh##*.}"x = "sh"x ];then - shFileName=${filesh%.*} - if [ "$clFileName" = "$shFileName" ];then - 
speConfig=1; - fi - fi - done - if [ $speConfig -eq 0 ]; then - echo ./gcl_binary --input=$file --output=${file%.*}.bin --options=\"${copt}\" - fi - fi - done - - - diff --git a/gcl/tools/kernel_lib_compile/sh/compile/concat.sh b/gcl/tools/kernel_lib_compile/sh/compile/concat.sh deleted file mode 100644 index 3e701a10..00000000 --- a/gcl/tools/kernel_lib_compile/sh/compile/concat.sh +++ /dev/null @@ -1,18 +0,0 @@ -for file in * - do - if [ "${file##*.}"x = "cl"x ];then - if [[ "${file}" == "concat.cl" ]];then - echo ./gcl_binary --input=$file --output=${file%.*}_11.bin --options=\"${copt} -D A=1 -D N=1\" - echo ./gcl_binary --input=$file --output=${file%.*}_12.bin --options=\"${copt} -D A=1 -D N=2\" - echo ./gcl_binary --input=$file --output=${file%.*}_13.bin --options=\"${copt} -D A=1 -D N=3\" - echo ./gcl_binary --input=$file --output=${file%.*}_14.bin --options=\"${copt} -D A=1 -D N=4\" - echo ./gcl_binary --input=$file --output=${file%.*}_15.bin --options=\"${copt} -D A=1 -D N=5\" - echo ./gcl_binary --input=$file --output=${file%.*}_16.bin --options=\"${copt} -D A=1 -D N=6\" - echo ./gcl_binary --input=$file --output=${file%.*}_17.bin --options=\"${copt} -D A=1 -D N=7\" - echo ./gcl_binary --input=$file --output=${file%.*}_18.bin --options=\"${copt} -D A=1 -D N=8\" - fi - fi - done - - - diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s1.sh b/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s1.sh deleted file mode 100644 index ae04ae43..00000000 --- a/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s1.sh +++ /dev/null @@ -1,81 +0,0 @@ -for file in * - do - if [ "${file##*.}"x = "cl"x ];then - if [[ "${file}" == "conv_depthwise_s1.cl" ]];then - echo ./gcl_binary --input=$file --output=${file%.*}_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_HALF -D BASICE_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_relu_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file 
--output=${file%.*}_relu_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_HALF -D BASICE_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_relu_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\" - echo ./gcl_binary --input=$file 
--output=${file%.*}_ncwh_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -D BASICE_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -DUSE_NCWH -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -D 
BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -D BASICE_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - fi - fi - done - - - diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s2.sh b/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s2.sh deleted file mode 100644 index 410d92fe..00000000 --- a/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s2.sh +++ /dev/null @@ -1,81 +0,0 @@ -for file in * - do - if [ "${file##*.}"x = "cl"x ];then - if [[ "${file}" == "conv_depthwise_s2.cl" ]];then - echo ./gcl_binary --input=$file --output=${file%.*}_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_HALF -DBASIC_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_relu_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D 
UN=4 -D Fsq=9 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_HALF -DBASIC_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_relu_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_58.bin 
--options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_NCWH 
-DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - fi - fi - done - - - diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1.sh b/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1.sh deleted file mode 100644 index e9554c64..00000000 --- a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1.sh +++ /dev/null @@ -1,102 +0,0 @@ -for file in * - do - if [ "${file##*.}"x = "cl"x ];then - if [[ "${file}" == "conv_direct_s1.cl" ]];then - echo ./gcl_binary --input=$file --output=${file%.*}_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_171.bin --options=\"${copt} 
-D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" - - echo ./gcl_binary --input=$file --output=${file%.*}_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" - -# echo ./gcl_binary --input=$file --output=${file%.*}_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\" - - echo ./gcl_binary --input=$file --output=${file%.*}_311.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_321.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 
-D LN=4 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_331.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_341.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_351.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_361.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_371.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_381.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_311.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_321.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_331.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_341.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_351.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_361.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_371.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_381.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_332.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_332.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary 
--input=$file --output=${file%.*}_relu_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" - -# echo ./gcl_binary --input=$file --output=${file%.*}_314.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_324.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_314.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=4 -DUSE_HALF -DUSE_RELU\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_324.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=4 -DUSE_HALF -DUSE_RELU\" - - echo ./gcl_binary --input=$file --output=${file%.*}_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_531.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_541.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_551.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU \" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU \" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_531.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_541.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_551.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - - echo ./gcl_binary 
--input=$file --output=${file%.*}_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_542.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_542.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - -# echo ./gcl_binary --input=$file --output=${file%.*}_514.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_524.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_514.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=4 -DUSE_HALF -DUSE_RELU\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_524.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=4 -DUSE_HALF -DUSE_RELU\" - fi - fi - done - - - diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_nchw_to_ncwhc4.sh b/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_nchw_to_ncwhc4.sh deleted file mode 100644 index 2798b4df..00000000 --- a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_nchw_to_ncwhc4.sh +++ /dev/null @@ -1,16 +0,0 @@ -for file in * - do - if [ "${file##*.}"x = "cl"x ];then - if [[ "${file}" == "conv_direct_s1_nchw_to_ncwhc4.cl" ]];then - echo ./gcl_binary --input=$file --output=${file%.*}_18.bin --options=\"${copt} -D F=1 -D ON=8 -D Fsq=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_36.bin --options=\"${copt} -D F=3 -D ON=6 -D Fsq=9 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_54.bin --options=\"${copt} -D F=5 -D ON=4 -D Fsq=25 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_18.bin --options=\"${copt} -D F=1 -D ON=8 -D Fsq=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_36.bin --options=\"${copt} -D F=3 -D ON=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_54.bin --options=\"${copt} -D F=5 -D ON=4 -D Fsq=25 -DUSE_HALF -DUSE_RELU\" - fi - fi - done - - -
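A note on the deleted compile scripts in this section: each one pre-compiles a single .cl kernel into many specializations, and the digits in every .bin name encode the -D macro values. For example, conv_direct_s2_341.bin is the F=3 (filter size), ON=4 (outputs per work item), KN=1 (output-channel blocking) build, and a relu infix marks the -DUSE_RELU variants; the echoed gcl_binary command lines are evidently collected and executed by a parent build script that supplies ${copt}. As a purely hypothetical sketch, not Bolt's actual selection API, a host-side helper mapping a requested configuration back onto this naming scheme could look like:

    #include <sstream>
    #include <string>

    // Reconstruct the .bin name of one conv_direct specialization; the digit
    // order F, ON, KN mirrors the output names emitted by the scripts above.
    std::string convDirectBinName(int stride, int F, int ON, int KN, bool fusedRelu)
    {
        std::ostringstream name;
        name << "conv_direct_s" << stride;  // e.g. conv_direct_s2
        if (fusedRelu) {
            name << "_relu";                // variants built with -DUSE_RELU
        }
        name << "_" << F << ON << KN << ".bin";  // e.g. conv_direct_s2_341.bin
        return name.str();
    }

The IN/LN/UN macros and the BASIC_REG define (spelled BASICE_REG in the stride-1 script) appear to select a register-light code path for the larger ON values, since IN and LN shrink whenever it is set; the diff alone does not confirm that reading.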
diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2.sh b/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2.sh deleted file mode 100644 index 2ff47db7..00000000 --- a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2.sh +++ /dev/null @@ -1,102 +0,0 @@ -for file in * - do - if [ "${file##*.}"x = "cl"x ];then - if [[ "${file}" == "conv_direct_s2.cl" ]];then - echo ./gcl_binary --input=$file --output=${file%.*}_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" - - echo ./gcl_binary --input=$file --output=${file%.*}_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_142.bin
--options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" - -# echo ./gcl_binary --input=$file --output=${file%.*}_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\" - - echo ./gcl_binary --input=$file --output=${file%.*}_311.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_321.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_331.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_341.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_351.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_361.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_371.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_381.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_311.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_321.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_331.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_341.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_351.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_361.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_371.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_381.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D 
KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_332.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_332.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - -# echo ./gcl_binary --input=$file --output=${file%.*}_314.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_324.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_314.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=4 -DUSE_HALF -DUSE_RELU\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_324.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=4 -DUSE_HALF -DUSE_RELU\" - - echo ./gcl_binary --input=$file --output=${file%.*}_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_531.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_541.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_551.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_531.bin 
--options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_541.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_551.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=2 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_542.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_542.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - -# echo ./gcl_binary --input=$file --output=${file%.*}_514.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_524.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_514.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=4 -DUSE_HALF -DUSE_RELU\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_524.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=4 -DUSE_HALF -DUSE_RELU\" - fi - fi - done - - - diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2_nchw_to_ncwhc4.sh b/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2_nchw_to_ncwhc4.sh deleted file mode 100644 index c3cf03c5..00000000 --- a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2_nchw_to_ncwhc4.sh +++ /dev/null @@ -1,16 +0,0 @@ -for file in * - do - if [ "${file##*.}"x = "cl"x ];then - if [[ "${file}" == "conv_direct_s2_nchw_to_ncwhc4.cl" ]];then - echo ./gcl_binary --input=$file --output=${file%.*}_18.bin --options=\"${copt} -D F=1 -D ON=8 -D 
Fsq=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_37.bin --options=\"${copt} -D F=3 -D ON=7 -D Fsq=9 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_56.bin --options=\"${copt} -D F=5 -D ON=6 -D Fsq=25 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_18.bin --options=\"${copt} -D F=1 -D ON=8 -D Fsq=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_37.bin --options=\"${copt} -D F=3 -D ON=7 -D Fsq=9 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_56.bin --options=\"${copt} -D F=5 -D ON=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU\" - fi - fi - done - - - diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_spe_fwhs1.sh b/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_spe_fwhs1.sh deleted file mode 100644 index 67663d6e..00000000 --- a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_spe_fwhs1.sh +++ /dev/null @@ -1,19 +0,0 @@ -for file in * - do - if [ "${file##*.}"x = "cl"x ];then - if [[ "${file}" == "conv_direct_spe_fwhs1.cl" ]];then - echo ./gcl_binary --input=$file --output=${file%.*}_1.bin --options=\"${copt} -D OC=1\" - echo ./gcl_binary --input=$file --output=${file%.*}_2.bin --options=\"${copt} -D OC=2\" - echo ./gcl_binary --input=$file --output=${file%.*}_3.bin --options=\"${copt} -D OC=3\" - echo ./gcl_binary --input=$file --output=${file%.*}_4.bin --options=\"${copt} -D OC=4\" - echo ./gcl_binary --input=$file --output=${file%.*}_8.bin --options=\"${copt} -D OC=8\" - echo ./gcl_binary --input=$file --output=${file%.*}_16.bin --options=\"${copt} -D OC=16\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_4.bin --options=\"${copt} -D OC=4 -D USE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_8.bin --options=\"${copt} -D OC=8 -D USE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_16.bin --options=\"${copt} -D OC=16 -D USE_RELU\" - fi - fi - done - - - diff --git a/gcl/tools/kernel_lib_compile/sh/compile/eltwise.sh b/gcl/tools/kernel_lib_compile/sh/compile/eltwise.sh deleted file mode 100644 index 4b5da516..00000000 --- a/gcl/tools/kernel_lib_compile/sh/compile/eltwise.sh +++ /dev/null @@ -1,36 +0,0 @@ -for file in * - do - if [ "${file##*.}"x = "cl"x ];then - if [[ "${file}" == "eltwise.cl" ]];then - echo ./gcl_binary --input=$file --output=${file%.*}_max1.bin --options=\"${copt} -D N=1 -D TP=max -DUSE_MAX\" - echo ./gcl_binary --input=$file --output=${file%.*}_max2.bin --options=\"${copt} -D N=2 -D TP=max -DUSE_MAX\" - echo ./gcl_binary --input=$file --output=${file%.*}_max3.bin --options=\"${copt} -D N=3 -D TP=max -DUSE_MAX\" - echo ./gcl_binary --input=$file --output=${file%.*}_max4.bin --options=\"${copt} -D N=4 -D TP=max -DUSE_MAX\" - echo ./gcl_binary --input=$file --output=${file%.*}_max5.bin --options=\"${copt} -D N=5 -D TP=max -DUSE_MAX\" - echo ./gcl_binary --input=$file --output=${file%.*}_max6.bin --options=\"${copt} -D N=6 -D TP=max -DUSE_MAX\" - echo ./gcl_binary --input=$file --output=${file%.*}_max7.bin --options=\"${copt} -D N=7 -D TP=max -DUSE_MAX\" - echo ./gcl_binary --input=$file --output=${file%.*}_max8.bin --options=\"${copt} -D N=8 -D TP=max -DUSE_MAX\" - - echo ./gcl_binary --input=$file --output=${file%.*}_sum1.bin --options=\"${copt} -D N=1 -D TP=sum -DUSE_SUM\" - echo ./gcl_binary --input=$file --output=${file%.*}_sum2.bin --options=\"${copt} -D N=2 -D TP=sum -DUSE_SUM\" - echo ./gcl_binary --input=$file --output=${file%.*}_sum3.bin --options=\"${copt} -D N=3 -D 
TP=sum -DUSE_SUM\" - echo ./gcl_binary --input=$file --output=${file%.*}_sum4.bin --options=\"${copt} -D N=4 -D TP=sum -DUSE_SUM\" - echo ./gcl_binary --input=$file --output=${file%.*}_sum5.bin --options=\"${copt} -D N=5 -D TP=sum -DUSE_SUM\" - echo ./gcl_binary --input=$file --output=${file%.*}_sum6.bin --options=\"${copt} -D N=6 -D TP=sum -DUSE_SUM\" - echo ./gcl_binary --input=$file --output=${file%.*}_sum7.bin --options=\"${copt} -D N=7 -D TP=sum -DUSE_SUM\" - echo ./gcl_binary --input=$file --output=${file%.*}_sum8.bin --options=\"${copt} -D N=8 -D TP=sum -DUSE_SUM\" - - echo ./gcl_binary --input=$file --output=${file%.*}_prod1.bin --options=\"${copt} -D N=1 -D TP=prod -DUSE_PROD\" - echo ./gcl_binary --input=$file --output=${file%.*}_prod2.bin --options=\"${copt} -D N=2 -D TP=prod -DUSE_PROD\" - echo ./gcl_binary --input=$file --output=${file%.*}_prod3.bin --options=\"${copt} -D N=3 -D TP=prod -DUSE_PROD\" - echo ./gcl_binary --input=$file --output=${file%.*}_prod4.bin --options=\"${copt} -D N=4 -D TP=prod -DUSE_PROD\" - echo ./gcl_binary --input=$file --output=${file%.*}_prod5.bin --options=\"${copt} -D N=5 -D TP=prod -DUSE_PROD\" - echo ./gcl_binary --input=$file --output=${file%.*}_prod6.bin --options=\"${copt} -D N=6 -D TP=prod -DUSE_PROD\" - echo ./gcl_binary --input=$file --output=${file%.*}_prod7.bin --options=\"${copt} -D N=7 -D TP=prod -DUSE_PROD\" - echo ./gcl_binary --input=$file --output=${file%.*}_prod8.bin --options=\"${copt} -D N=8 -D TP=prod -DUSE_PROD\" - fi - fi - done - - - diff --git a/gcl/tools/kernel_lib_compile/sh/compile/gemm_tn.sh b/gcl/tools/kernel_lib_compile/sh/compile/gemm_tn.sh deleted file mode 100644 index 762c9eff..00000000 --- a/gcl/tools/kernel_lib_compile/sh/compile/gemm_tn.sh +++ /dev/null @@ -1,114 +0,0 @@ -for file in * - do - if [ "${file##*.}"x = "cl"x ];then - if [[ "${file}" == gemm_tn.cl ]];then - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D UN=0 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D UN=1 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D UN=2 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D UN=3 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D UN=4 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D UN=5 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D UN=6 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D UN=7 -DUSE_NCWHC4\" - - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D UN=0 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D UN=1 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D UN=2 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D UN=3 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D UN=4 -DUSE_NCWHC4\" - echo ./gcl_binary 
--input=$file --output=${file%.*}_ncwhc4_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D UN=5 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D UN=6 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D UN=7 -DUSE_NCWHC4\" - - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D UN=1 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_RELU\" - - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D UN=1 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_RELU\" - - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_13.bin --options=\"${copt} -D LM=1 -D LN=3 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_14.bin --options=\"${copt} -D LM=1 -D LN=4 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_15.bin --options=\"${copt} -D LM=1 -D LN=5 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_16.bin --options=\"${copt} -D LM=1 -D LN=6 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_17.bin --options=\"${copt} -D LM=1 -D LN=7 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_18.bin --options=\"${copt} -D LM=1 -D LN=8 -D NO_BIAS\" - - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_22.bin --options=\"${copt} -D LM=2 -D LN=2 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_23.bin 
--options=\"${copt} -D LM=2 -D LN=3 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_24.bin --options=\"${copt} -D LM=2 -D LN=4 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_25.bin --options=\"${copt} -D LM=2 -D LN=5 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_26.bin --options=\"${copt} -D LM=2 -D LN=6 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_27.bin --options=\"${copt} -D LM=2 -D LN=7 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_28.bin --options=\"${copt} -D LM=2 -D LN=8 -D NO_BIAS\" - - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_31.bin --options=\"${copt} -D LM=3 -D LN=1 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_32.bin --options=\"${copt} -D LM=3 -D LN=2 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_33.bin --options=\"${copt} -D LM=3 -D LN=3 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_34.bin --options=\"${copt} -D LM=3 -D LN=4 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_35.bin --options=\"${copt} -D LM=3 -D LN=5 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_36.bin --options=\"${copt} -D LM=3 -D LN=6 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_37.bin --options=\"${copt} -D LM=3 -D LN=7 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_38.bin --options=\"${copt} -D LM=3 -D LN=8 -D NO_BIAS\" - - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D NO_BIAS\" - - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_51.bin --options=\"${copt} -D LM=5 -D LN=1 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_52.bin --options=\"${copt} -D LM=5 -D LN=2 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_53.bin --options=\"${copt} -D LM=5 -D LN=3 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_54.bin --options=\"${copt} -D LM=5 -D LN=4 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_55.bin --options=\"${copt} -D LM=5 -D LN=5 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_56.bin --options=\"${copt} -D LM=5 -D LN=6 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_57.bin --options=\"${copt} -D LM=5 -D LN=7 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_58.bin --options=\"${copt} -D LM=5 -D LN=8 -D NO_BIAS\" - - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_61.bin --options=\"${copt} -D LM=6 -D LN=1 
-D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_62.bin --options=\"${copt} -D LM=6 -D LN=2 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_63.bin --options=\"${copt} -D LM=6 -D LN=3 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_64.bin --options=\"${copt} -D LM=6 -D LN=4 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_65.bin --options=\"${copt} -D LM=6 -D LN=5 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_66.bin --options=\"${copt} -D LM=6 -D LN=6 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_67.bin --options=\"${copt} -D LM=6 -D LN=7 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_68.bin --options=\"${copt} -D LM=6 -D LN=8 -D NO_BIAS\" - - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_71.bin --options=\"${copt} -D LM=7 -D LN=1 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_72.bin --options=\"${copt} -D LM=7 -D LN=2 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_73.bin --options=\"${copt} -D LM=7 -D LN=3 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_74.bin --options=\"${copt} -D LM=7 -D LN=4 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_75.bin --options=\"${copt} -D LM=7 -D LN=5 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_76.bin --options=\"${copt} -D LM=7 -D LN=6 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_77.bin --options=\"${copt} -D LM=7 -D LN=7 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_78.bin --options=\"${copt} -D LM=7 -D LN=8 -D NO_BIAS\" - - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D NO_BIAS\" - fi - fi - done - - - diff --git a/gcl/tools/kernel_lib_compile/sh/compile/scale.sh b/gcl/tools/kernel_lib_compile/sh/compile/scale.sh deleted file mode 100644 index f8cdd99f..00000000 --- a/gcl/tools/kernel_lib_compile/sh/compile/scale.sh +++ /dev/null @@ -1,12 +0,0 @@ -for file in * - do - if [ "${file##*.}"x = "cl"x ];then - if [[ "${file}" == "scale.cl" ]];then - echo ./gcl_binary --input=$file --output=${file%.*}_nobeta.bin --options=\"${copt} -D MD=nobeta \" - echo ./gcl_binary --input=$file --output=${file%.*}_beta.bin --options=\"${copt} -D MD=beta -DUSE_BETA\" - fi - fi - done - - - diff --git a/image/CMakeLists.txt b/image/CMakeLists.txt deleted file mode 100644 index d1266406..00000000 --- a/image/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -cmake_minimum_required(VERSION 3.2) - -file(GLOB BOLT_CONFIGURE_FILE 
diff --git a/gcl/tools/kernel_lib_compile/sh/compile/scale.sh b/gcl/tools/kernel_lib_compile/sh/compile/scale.sh deleted file mode 100644 index f8cdd99f..00000000 --- a/gcl/tools/kernel_lib_compile/sh/compile/scale.sh +++ /dev/null @@ -1,12 +0,0 @@ -for file in * - do - if [ "${file##*.}"x = "cl"x ];then - if [[ "${file}" == "scale.cl" ]];then - echo ./gcl_binary --input=$file --output=${file%.*}_nobeta.bin --options=\"${copt} -D MD=nobeta \" - echo ./gcl_binary --input=$file --output=${file%.*}_beta.bin --options=\"${copt} -D MD=beta -DUSE_BETA\" - fi - fi - done - - - diff --git a/image/CMakeLists.txt b/image/CMakeLists.txt deleted file mode 100644 index d1266406..00000000 --- a/image/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -cmake_minimum_required(VERSION 3.2) - -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) -if (BOLT_CONFIGURE_FILE) - include(${BOLT_CONFIGURE_FILE}) -else (BOLT_CONFIGURE_FILE) - message(FATAL_ERROR " -FATAL: can not find bolt.cmake in directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (BOLT_CONFIGURE_FILE) - -project(image) - -set_policy() - -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") -find_package(Uni) -find_package(Image) - -set_project_install_directory() - -set_c_cxx_flags() - -add_subdirectory(src) diff --git a/image/include/image.h b/image/include/image.h deleted file mode 100644 index 02b1052a..00000000 --- a/image/include/image.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_IMAGE -#define _H_IMAGE - -#include -#include "tensor_desc.h" -#include "type.h" -#include "sys.h" - -#ifdef __cplusplus -extern "C" { -#endif - - typedef struct { - DataType paramDT; - } ResizeDesc; - - EE resize_infer_output_size(TensorDesc inputDesc, ResizeDesc resizeDesc, void* params, - TensorDesc* outputDesc, U32* outputBytes); - - EE resize(TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output, - Arch arch); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/image/src/CMakeLists.txt b/image/src/CMakeLists.txt deleted file mode 100644 index bc70c095..00000000 --- a/image/src/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -if (USE_GENERAL) - file(GLOB general_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/general/*.cpp) -endif (USE_GENERAL) - -if (USE_NEON) - file(GLOB arm_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/*.cpp) -endif (USE_NEON) - -file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) -set(srcs "${srcs};${general_srcs};${arm_srcs}") - -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) - -# shared library -ADD_LIBRARY(${PROJECT_NAME} SHARED ${srcs}) - -# static library -ADD_LIBRARY(${PROJECT_NAME}_static STATIC ${srcs}) - -SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") -SET_TARGET_PROPERTIES(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) -SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) diff --git a/image/src/cpu/arm/resize_bilinear.cpp b/image/src/cpu/arm/resize_bilinear.cpp deleted file mode 100644 index 8de01a58..00000000 --- a/image/src/cpu/arm/resize_bilinear.cpp +++ /dev/null @@ -1,237 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "image.h" -#include "cpu/arm/image_arm.h" - -#ifdef _USE_FP16 -EE resize_bilinear_fp16(TensorDesc inputDesc, F16* inArray, - TensorDesc outputDesc, F16* outArray) -{ - DataType idt, odt; - DataFormat idf, odf; - U32 in, ic, ih, iw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - - if (idf != DF_NCHWC8 || odf != DF_NCHWC8) { - CHECK_STATUS(NOT_MATCH); - } - - F32 strideH = (F32)(ih - 1) / (F32)(oh - 1); - F32 strideW = (F32)(iw - 1) / (F32)(ow - 1); - - oc /= 8; - - for (U32 n = 0; n < on; n++) { - for (U32 c = 0; c < oc; c++) { - I32 outBase = n*oc*oh*ow + c*oh*ow*8; - I32 inBase = n*oc*ih*iw + c*ih*iw*8; - for (U32 h = 0; h < oh; h++) { - for (U32 w = 0; w < ow; w++) { - if (h == 0 && w == 0) { - memcpy(outArray + outBase, inArray + inBase, 8*bytesOf(DT_F16)); - continue; - } - if (h == 0 && w == ow - 1) { - memcpy(outArray + outBase + w*8, inArray + inBase + (iw-1)*8, 8*bytesOf(DT_F16)); - continue; - } - if (h == oh - 1 && w == 0) { - memcpy(outArray + outBase + h*ow*8, inArray + inBase + (ih-1)*iw*8, 8*bytesOf(DT_F16)); - continue; - } - if (h == oh - 1 && w == ow - 1) { - memcpy(outArray + outBase + h*ow*8 + w*8, inArray + inBase + (ih-1)*iw*8 + (iw-1)*8, 8*bytesOf(DT_F16)); - continue; - } - - F32 hC = strideH * h; - F32 wC = strideW * w; - - I32 hT = floor(hC); - I32 hB = ceil(hC); - I32 wL = floor(wC); - I32 wR = ceil(wC); - - if (hT == hB && wL == wR) { - memcpy(outArray + outBase + h*ow*8 + w*8, inArray + inBase + hT*iw*8 + wL*8, 8*bytesOf(DT_F16)); - } else if (hT == hB) { - float16x8_t res = {0}; - float16x8_t vecL = vld1q_f16(inArray + inBase + hT*iw*8 + wL*8); - float16x8_t vecR = vld1q_f16(inArray + inBase + hT*iw*8 + wR*8); - res = vfmaq_n_f16(res, vecL, wR - wC); - res = vfmaq_n_f16(res, vecR, wC - wL); - vst1q_f16(outArray + outBase + h*ow*8 + w*8, res); - } else if (wL == wR) { - float16x8_t res = {0}; - float16x8_t vecT = vld1q_f16(inArray + inBase + hT*iw*8 + wL*8); - float16x8_t vecB = vld1q_f16(inArray + inBase + hB*iw*8 + wL*8); - res = vfmaq_n_f16(res, vecT, hB - hC); - res = vfmaq_n_f16(res, vecB, hC - hT); - vst1q_f16(outArray + outBase + h*ow*8 + w*8, res); - } else { - float16x8_t res = {0}; - float16x8_t vecTL = vld1q_f16(inArray + inBase + hT*iw*8 + wL*8); - float16x8_t vecTR = vld1q_f16(inArray + inBase 
+ hT*iw*8 + wR*8); - float16x8_t vecBL = vld1q_f16(inArray + inBase + hB*iw*8 + wL*8); - float16x8_t vecBR = vld1q_f16(inArray + inBase + hB*iw*8 + wR*8); - res = vfmaq_n_f16(res, vecTL, (hB - hC) * (wR - wC)); - res = vfmaq_n_f16(res, vecTR, (hB - hC) * (wC - wL)); - res = vfmaq_n_f16(res, vecBL, (hC - hT) * (wR - wC)); - res = vfmaq_n_f16(res, vecBR, (hC - hT) * (wC - wL)); - vst1q_f16(outArray + outBase + h*ow*8 + w*8, res); - } - } - } - } - } - return SUCCESS; -} -#endif - -#ifdef _USE_FP32 -EE resize_bilinear_fp32(TensorDesc inputDesc, F32* inArray, - TensorDesc outputDesc, F32* outArray) -{ - DataType idt, odt; - DataFormat idf, odf; - U32 in, ic, ih, iw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - - if (idf != DF_NCHWC8 || odf != DF_NCHWC8) { - CHECK_STATUS(NOT_MATCH); - } - - F32 strideH = (F32)(ih - 1) / (F32)(oh - 1); - F32 strideW = (F32)(iw - 1) / (F32)(ow - 1); - - oc /= 8; - - for (U32 n = 0; n < on; n++) { - for (U32 c = 0; c < oc; c++) { - I32 outBase = n*oc*oh*ow + c*oh*ow*8; - I32 inBase = n*oc*ih*iw + c*ih*iw*8; - for (U32 h = 0; h < oh; h++) { - for (U32 w = 0; w < ow; w++) { - if (h == 0 && w == 0) { - memcpy(outArray + outBase, inArray + inBase, 8*bytesOf(DT_F32)); - continue; - } - if (h == 0 && w == ow - 1) { - memcpy(outArray + outBase + w*8, inArray + inBase + (iw-1)*8, 8*bytesOf(DT_F32)); - continue; - } - if (h == oh - 1 && w == 0) { - memcpy(outArray + outBase + h*ow*8, inArray + inBase + (ih-1)*iw*8, 8*bytesOf(DT_F32)); - continue; - } - if (h == oh - 1 && w == ow - 1) { - memcpy(outArray + outBase + h*ow*8 + w*8, inArray + inBase + (ih-1)*iw*8 + (iw-1)*8, 8*bytesOf(DT_F32)); - continue; - } - - F32 hC = strideH * h; - F32 wC = strideW * w; - - I32 hT = floor(hC); - I32 hB = ceil(hC); - I32 wL = floor(wC); - I32 wR = ceil(wC); - - if (hT == hB && wL == wR) { - memcpy(outArray + outBase + h*ow*8 + w*8, inArray + inBase + hT*iw*8 + wL*8, 8*bytesOf(DT_F32)); - } else if (hT == hB) { - float32x4_t res[2] = {0}; - float32x4_t vecL = vld1q_f32(inArray + inBase + hT*iw*8 + wL*8); - float32x4_t vecL1 = vld1q_f32(inArray + inBase + hT*iw*8 + wL*8 + 4); - float32x4_t vecR = vld1q_f32(inArray + inBase + hT*iw*8 + wR*8); - float32x4_t vecR1 = vld1q_f32(inArray + inBase + hT*iw*8 + wR*8 + 4); - res[0] = vfmaq_n_f32(res[0], vecL, wR - wC); - res[1] = vfmaq_n_f32(res[1], vecL1, wR - wC); - res[0] = vfmaq_n_f32(res[0], vecR, wC - wL); - res[1] = vfmaq_n_f32(res[1], vecR1, wC - wL); - vst1q_f32(outArray + outBase + h*ow*8 + w*8, res[0]); - vst1q_f32(outArray + outBase + h*ow*8 + w*8 + 4, res[1]); - } else if (wL == wR) { - float32x4_t res[2] = {0}; - float32x4_t vecT = vld1q_f32(inArray + inBase + hT*iw*8 + wL*8); - float32x4_t vecT1 = vld1q_f32(inArray + inBase + hT*iw*8 + wL*8 + 4); - float32x4_t vecB = vld1q_f32(inArray + inBase + hB*iw*8 + wL*8); - float32x4_t vecB1 = vld1q_f32(inArray + inBase + hB*iw*8 + wL*8 + 4); - res[0] = vfmaq_n_f32(res[0], vecT, hB - hC); - res[1] = vfmaq_n_f32(res[1], vecT1, hB - hC); - res[0] = vfmaq_n_f32(res[0], vecB, hC - hT); - res[1] = vfmaq_n_f32(res[1], vecB1, hC - hT); - vst1q_f32(outArray + outBase + h*ow*8 + w*8, res[0]); - vst1q_f32(outArray + outBase + h*ow*8 + w*8 + 4, res[1]); - } else { - float32x4_t res[2] = {0}; - float32x4_t vecTL = vld1q_f32(inArray + inBase + hT*iw*8 + wL*8); - float32x4_t vecTL1 = vld1q_f32(inArray + inBase + hT*iw*8 + wL*8 + 4); - float32x4_t vecTR = vld1q_f32(inArray + inBase + 
hT*iw*8 + wR*8); - float32x4_t vecTR1 = vld1q_f32(inArray + inBase + hT*iw*8 + wR*8 + 4); - float32x4_t vecBL = vld1q_f32(inArray + inBase + hB*iw*8 + wL*8); - float32x4_t vecBL1 = vld1q_f32(inArray + inBase + hB*iw*8 + wL*8 + 4); - float32x4_t vecBR = vld1q_f32(inArray + inBase + hB*iw*8 + wR*8); - float32x4_t vecBR1 = vld1q_f32(inArray + inBase + hB*iw*8 + wR*8 + 4); - res[0] = vfmaq_n_f32(res[0], vecTL, (hB - hC) * (wR - wC)); - res[1] = vfmaq_n_f32(res[1], vecTL1, (hB - hC) * (wR - wC)); - res[0] = vfmaq_n_f32(res[0], vecTR, (hB - hC) * (wC - wL)); - res[1] = vfmaq_n_f32(res[1], vecTR1, (hB - hC) * (wC - wL)); - res[0] = vfmaq_n_f32(res[0], vecBL, (hC - hT) * (wR - wC)); - res[1] = vfmaq_n_f32(res[1], vecBL1, (hC - hT) * (wR - wC)); - res[0] = vfmaq_n_f32(res[0], vecBR, (hC - hT) * (wC - wL)); - res[1] = vfmaq_n_f32(res[1], vecBR1, (hC - hT) * (wC - wL)); - vst1q_f32(outArray + outBase + h*ow*8 + w*8, res[0]); - vst1q_f32(outArray + outBase + h*ow*8 + w*8 + 4, res[1]); - } - } - } - } - } - return SUCCESS; -} -#endif - -EE resize_bilinear_arm(TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output) -{ - EE ret = SUCCESS; - switch (inputDesc.dt) { -#ifdef _USE_FP16 - case DT_F16: - ret = resize_bilinear_fp16(inputDesc, (F16*)input, - outputDesc, (F16*)output); - break; -#endif -#ifdef _USE_FP32 - case DT_F32: - ret = resize_bilinear_fp32(inputDesc, (F32*)input, - outputDesc, (F32*)output); - break; -#endif - default: - return NOT_SUPPORTED; - } - return ret; -} diff --git a/image/src/cpu/general/image_general.h b/image/src/cpu/general/image_general.h deleted file mode 100644 index 5ff00261..00000000 --- a/image/src/cpu/general/image_general.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
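Both NEON kernels in the deleted arm/resize_bilinear.cpp above implement textbook bilinear interpolation over NCHWC8 data: the innermost 8-channel block lets a single vld1q/vfmaq_n pair blend all eight channels of one pixel at once. The corner and integer-coordinate special cases exist because the kernels derive their weights from floor/ceil pairs, and on exact grid points floor(x) == ceil(x) would zero every weight. A standalone scalar restatement of the same weighting (an illustrative sketch, not Bolt code) is easier to check:

    #include <cmath>

    // One channel of one output pixel; hC and wC are the fractional source
    // coordinates, e.g. hC = h * (ih - 1) / (oh - 1) as in the kernels above.
    float bilinearAt(const float *in, int iw, float hC, float wC)
    {
        int hT = (int)std::floor(hC);
        int wL = (int)std::floor(wC);
        float fh = hC - hT;               // vertical blend factor in [0, 1)
        float fw = wC - wL;               // horizontal blend factor in [0, 1)
        int hB = (fh > 0) ? hT + 1 : hT;  // stays in bounds on exact grid rows
        int wR = (fw > 0) ? wL + 1 : wL;  // stays in bounds on exact grid cols
        return in[hT * iw + wL] * (1 - fh) * (1 - fw)
             + in[hT * iw + wR] * (1 - fh) * fw
             + in[hB * iw + wL] * fh * (1 - fw)
             + in[hB * iw + wR] * fh * fw;
    }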
- - -#ifndef _H_IMAGE_GENERAL -#define _H_IMAGE_GENERAL - -#include "error.h" -#include "sys.h" -#include "tensor_desc.h" -#include "image.h" - -EE resize_bilinear_general(TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output); - -template -inline EE from_nchwc8_to_nchw(TensorDesc *desc, T *data) { - if (desc == nullptr || data == nullptr) - CHECK_STATUS(NULL_POINTER); - - DataType idt; - DataFormat idf; - U32 in, ic, ih, iw; - CHECK_STATUS(tensor4dGet(*desc, &idt, &idf, &in, &ic, &ih, &iw)); - if (idf != DF_NCHWC8) - CHECK_STATUS(NOT_MATCH); - - *desc = tensor4df(idt, DF_NCHW, in, ic, ih, iw); - - T *tmp = (T *)malloc(tensorNumBytes(*desc)); - ic /= 8; - for (U32 n = 0; n < in; n++) { - for (U32 c = 0; c < ic; c++) { - for (U32 hw = 0; hw < ih*iw; hw++) { - for (U32 c8 = 0; c8 < 8; c8++) { - tmp[n*ic*8*ih*iw + (c*8 + c8)*ih*iw + hw] = data[n*ic*ih*iw*8 + c*ih*iw*8 + hw*8 + c8]; - } - } - } - } - memcpy(data, tmp, tensorNumBytes(*desc)); - free(tmp); - return SUCCESS; -} - -template -inline EE from_nchw_to_nchwc8(TensorDesc *desc, T *data) { - if (desc == nullptr || data == nullptr) - CHECK_STATUS(NULL_POINTER); - - DataType idt; - DataFormat idf; - U32 in, ic, ih, iw; - CHECK_STATUS(tensor4dGet(*desc, &idt, &idf, &in, &ic, &ih, &iw)); - if (idf != DF_NCHW) - CHECK_STATUS(NOT_MATCH); - - *desc = tensor4df(idt, DF_NCHWC8, in, ic, ih, iw); - - T *tmp = (T *)malloc(tensorNumBytes(*desc)); - ic /= 8; - for (U32 n = 0; n < in; n++) { - for (U32 c = 0; c < ic; c++) { - for (U32 hw = 0; hw < ih*iw; hw++) { - for (U32 c8 = 0; c8 < 8; c8++) { - tmp[n*ic*ih*iw*8 + c*ih*iw*8 + hw*8 + c8] = data[n*ic*8*ih*iw + (c*8 + c8)*ih*iw + hw]; - } - } - } - } - memcpy(data, tmp, tensorNumBytes(*desc)); - free(tmp); - return SUCCESS; -} -#endif diff --git a/image/src/cpu/general/resize_bilinear.cpp b/image/src/cpu/general/resize_bilinear.cpp deleted file mode 100644 index 7806a64a..00000000 --- a/image/src/cpu/general/resize_bilinear.cpp +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
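The from_nchwc8_to_nchw and from_nchw_to_nchwc8 templates above are straightforward gather/scatter loops; the only subtlety is the element addressing of the blocked layout, spelled out here for a single element (a standalone illustration, not Bolt code):

    #include <cstddef>

    // Offset of element (n, c, hw) in an NCHWC8 tensor holding ic channels and
    // ihw = ih * iw pixels: channels form ic/8 blocks of 8, and the 8-wide
    // block is the innermost, contiguous axis.
    size_t nchwc8Offset(size_t n, size_t c, size_t hw, size_t ic, size_t ihw)
    {
        size_t cBlock = c / 8;           // which 8-channel block
        size_t c8 = c % 8;               // lane inside the block
        return n * (ic / 8) * ihw * 8    // earlier images
             + cBlock * ihw * 8          // earlier channel blocks
             + hw * 8                    // earlier pixels, 8 channels apiece
             + c8;
    }

Keeping the eight channels of a pixel adjacent is what lets the fp16 kernel above fetch them with a single vld1q_f16.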
- - -#include -#include -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "image.h" -#include "cpu/general/image_general.h" - -template -EE resize_bilinear(TensorDesc inputDesc, IT* inArray, - TensorDesc outputDesc, OT* outArray) -{ - DataType idt, odt; - DataFormat idf, odf; - U32 in, ic, ih, iw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - - if (idf == DF_NCHWC8) { - CHECK_STATUS(from_nchwc8_to_nchw(&inputDesc, inArray)); - } - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - if (idf != DF_NCHW && idf != DF_RGB) { - CHECK_STATUS(NOT_MATCH); - } - - F32 strideH = (F32)(ih - 1) / (F32)(oh - 1); - F32 strideW = (F32)(iw - 1) / (F32)(ow - 1); - - for (U32 n = 0; n < on; n++) { - for (U32 c = 0; c < oc; c++) { - I32 outBase = n*oc*oh*ow + c*oh*ow; - I32 inBase = n*oc*ih*iw + c*ih*iw; - for (U32 h = 0; h < oh; h++) { - for (U32 w = 0; w < ow; w++) { - if (h == 0 && w == 0) { - outArray[outBase] = inArray[inBase]; - continue; - } - if (h == 0 && w == ow - 1) { - outArray[outBase + w] = inArray[inBase + iw - 1]; - continue; - } - if (h == oh - 1 && w == 0) { - outArray[outBase + h * ow] = inArray[inBase + (ih - 1) * iw]; - continue; - } - if (h == oh - 1 && w == ow - 1) { - outArray[outBase + h * ow + w] = inArray[inBase + (ih - 1) * iw + iw - 1]; - continue; - } - - F32 hC = strideH * h; - F32 wC = strideW * w; - - I32 hT = floor(hC); - I32 hB = ceil(hC); - I32 wL = floor(wC); - I32 wR = ceil(wC); - - if (hT == hB && wL == wR) { - outArray[outBase + h * ow + w] = inArray[inBase + hT * iw + wL]; - } else if (hT == hB) { - outArray[outBase + h * ow + w] = inArray[inBase + hT * iw + wL] * (wR - wC) + inArray[inBase + hT * iw + wR] * (wC - wL); - } else if (wL == wR) { - outArray[outBase + h * ow + w] = inArray[inBase + hT * iw + wL] * (hB - hC) + inArray[inBase + hB * iw + wL] * (hC - hT); - } else { - F32 factorTL = (hB - hC) * (wR - wC); - F32 factorTR = (hB - hC) * (wC - wL); - F32 factorBL = (hC - hT) * (wR - wC); - F32 factorBR = (hC - hT) * (wC - wL); - - outArray[outBase + h * ow + w] = inArray[inBase + hT * iw + wL] * factorTL; - outArray[outBase + h * ow + w] += inArray[inBase + hT * iw + wR] * factorTR; - outArray[outBase + h * ow + w] += inArray[inBase + hB * iw + wL] * factorBL; - outArray[outBase + h * ow + w] += inArray[inBase + hB * iw + wR] * factorBR; - } - } - } - } - } - - if (odf == DF_NCHWC8) { - outputDesc.df = DF_NCHW; - CHECK_STATUS(from_nchw_to_nchwc8(&outputDesc, outArray)); - } - return SUCCESS; -} - -EE resize_bilinear_general(TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output) -{ - EE ret = SUCCESS; - switch (inputDesc.dt) { -#ifdef __aarch64__ - case DT_F16: { - ret = resize_bilinear(inputDesc, (F16*)input, - outputDesc, (F16*)output); - break; - } -#endif -#ifdef _USE_FP32 - case DT_F32: { - ret = resize_bilinear(inputDesc, (F32*)input, - outputDesc, (F32*)output); - break; - } -#endif - case DT_U8: { -#ifdef __aarch64__ - if (DT_F16 == outputDesc.dt) { - ret = resize_bilinear(inputDesc, (U8*)input, - outputDesc, (F16*)output); - } -#endif -#ifdef _USE_FP32 - if (DT_F32 == outputDesc.dt) { - ret = resize_bilinear(inputDesc, (U8*)input, - outputDesc, (F32*)output); - } -#endif - break; - } - default: - return NOT_SUPPORTED; - } - return ret; -} diff --git a/image/src/image_processing.cpp b/image/src/image_processing.cpp deleted file mode 100644 index 60283413..00000000 --- 
a/image/src/image_processing.cpp +++ /dev/null @@ -1,251 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include -#include -#include "image.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" - - -template -std::shared_ptr get_resize_image(TensorDesc rgbDesc, void* rgb, TensorDesc imageDesc, ImageFormat targetImageFormat, float scaleValue) -{ - DataType rgbDt = DT_F16, imageDt = DT_F16; - DataFormat rgbDf = DF_RGB, imageDf = DF_RGB; - U32 rgbNum = 0, rgbChannel = 0, rgbHeight = 0, rgbWidth = 0; - U32 imageNum = 0, imageChannel = 0, imageHeight = 0, imageWidth = 0; - CHECK_STATUS(tensor4dGet(rgbDesc, &rgbDt, &rgbDf, &rgbNum, &rgbChannel, &rgbHeight, &rgbWidth)); - CHECK_REQUIREMENT(rgbDf == DF_RGB); - CHECK_REQUIREMENT(rgbChannel == 3); - CHECK_REQUIREMENT(rgbNum == 1); - - CHECK_STATUS(tensor4dGet(imageDesc, &imageDt, &imageDf, &imageNum, &imageChannel, &imageHeight, &imageWidth)); - CHECK_REQUIREMENT(imageDf == DF_NCHW); - CHECK_REQUIREMENT(imageNum == 1); - - U32 height = rgbHeight; - U32 width = rgbWidth; - - U32 totalBytes = tensorNumBytes(imageDesc); - T *transferSpacePtr = (T *)operator new(totalBytes); - T *transferSpacePtrMov = transferSpacePtr; - - // magic number - float meanRGB[3] = {122.6789143406786, 116.66876761696767, 104.0069879317889}; - float meanRGBSC[3] = {0.485, 0.456, 0.406}; - float stdRGBSC[3] = {0.229, 0.224, 0.225}; - - U32 transform[3]; - switch (targetImageFormat) { - case RGB: - transform[0] = 0; - transform[1] = 1; - transform[2] = 2; - break; - case BGR: - transform[0] = 2; - transform[1] = 1; - transform[2] = 0; - break; - case BGR_SC_RAW: - transform[0] = 2; - transform[1] = 1; - transform[2] = 0; - break; - case RGB_SC: - transform[0] = 0; - transform[1] = 1; - transform[2] = 2; - break; - case RGB_RAW: - transform[0] = 0; - transform[1] = 1; - transform[2] = 2; - break; - case RGB_SC_RAW: - transform[0] = 0; - transform[1] = 1; - transform[2] = 2; - break; - default: - std::cerr << "[ERROR] unsupported image format" << std::endl; - exit(1); - return nullptr; - } - - // consider the dataformat - if (targetImageFormat == RGB_SC) { // Specific for Birealnet18, scale short edge to 224 first - F32 scale = 224.0 / UNI_MIN(height, width); - if (height < width) { - height = 224; - width = (U32)(scale * width + 0.5); - } else { - height = (U32)(scale * height + 0.5); - width = 224; - } - TensorDesc scaledDesc = tensor4df(imageDt, imageDf, imageNum, 
imageChannel, height, width); - T *scaled = (T*)malloc(tensorNumBytes(scaledDesc)); - resize(rgbDesc, rgb, scaledDesc, scaled, CPU_GENERAL); - - U32 h0 = (U32)((height - 224) * 0.5); - U32 w0 = (U32)((width - 224) * 0.5); - - for (U32 c : transform) { - for (U32 h = h0; h < h0 + imageHeight; h++) { - for (U32 w = w0; w < w0 + imageWidth; w++) { - T value = (scaled[c*height*width + h*width + w] / 255 - meanRGBSC[c]) / stdRGBSC[c]; - CHECK_REQUIREMENT(!UNI_ISNAN(value)); - *transferSpacePtrMov = value; - transferSpacePtrMov++; - } - } - } - free(scaled); - } else if (targetImageFormat == RGB_RAW) { - resize(rgbDesc, rgb, imageDesc, transferSpacePtr, CPU_GENERAL); - } else if (targetImageFormat == RGB_SC_RAW || targetImageFormat == BGR_SC_RAW) { - F32 scale = 256.0 / UNI_MIN(height, width); - if (height < width) { - height = 256; - width = (U32)(scale * (F32)width + 0.5); - } else { - height = (U32)(scale * (F32)height + 0.5); - width = 256; - } - TensorDesc scaledDesc = tensor4df(imageDt, imageDf, imageNum, imageChannel, height, width); - T *scaled = (T*)malloc(tensorNumBytes(scaledDesc)); - resize(rgbDesc, rgb, scaledDesc, scaled, CPU_GENERAL); - - U32 h0 = (U32)((height - 224) * 0.5); - U32 w0 = (U32)((width - 224) * 0.5); - - for (U32 c : transform) { - for (U32 h = h0; h < h0 + 224; h++) { - memcpy(transferSpacePtrMov, scaled + c*height*width + h*width + w0, 224*bytesOf(imageDt)); - transferSpacePtrMov += 224; - } - } - free(scaled); - } else { - T *resized = (T*)malloc(tensorNumBytes(imageDesc)); - resize(rgbDesc, rgb, imageDesc, resized, CPU_GENERAL); - - for (U32 c : transform) { - for (U32 h = 0; h < imageHeight; h++) { - for (U32 w = 0; w < imageWidth; w++) { - T value = (resized[c*imageHeight*imageWidth + h*imageWidth + w] - 1.0*meanRGB[c]) * scaleValue; - CHECK_REQUIREMENT(!UNI_ISNAN(value)); - *transferSpacePtrMov = value; - transferSpacePtrMov++; - } - } - } - free(resized); - } - - std::shared_ptr val((U8*)transferSpacePtr); - return val; -} - -// CImg load image to save in RGB format -// OpenCV load image to save in BGR format -// PIL load image to save in BGR format -// scikit-image load image to save in RGB format -// If you want to use other format, please set targetImageFormat -// numpy use OpenCV to load image - -// Assume most networks require 224*224 inputs -std::shared_ptr load_resize_image(TensorDesc rgbDesc, void* rgb, TensorDesc imageDesc, ImageFormat targetImageFormat, float scaleValue) -{ - DataType imageDt = DT_F32; - DataFormat imageDf; - U32 imageNum, imageChannel, imageHeight, imageWidth; - - CHECK_STATUS(tensor4dGet(imageDesc, &imageDt, &imageDf, &imageNum, &imageChannel, &imageHeight, &imageWidth)); - - switch (imageDt) { -#ifdef __aarch64__ - case DT_F16: { - return get_resize_image(rgbDesc, rgb, imageDesc, targetImageFormat, scaleValue); - } -#endif -#ifdef _USE_FP32 - case DT_F32: { - return get_resize_image(rgbDesc, rgb, imageDesc, targetImageFormat, scaleValue); - } -#endif - default: { - CHECK_STATUS(NOT_SUPPORTED); - return nullptr; - } - } -} - -template -std::shared_ptr gen_fake_image(TensorDesc inputDesc) -{ - DataType dt; - DataFormat df; - U32 in = 0, ic = 0, ih = 0, iw = 0; - CHECK_STATUS(tensor4dGet(inputDesc, &dt, &df, &in, &ic, &ih, &iw)); - CHECK_REQUIREMENT(df == DF_NCHW); - CHECK_REQUIREMENT(in == 1); - - U32 totalBytes = tensorNumBytes(inputDesc); - - // upon on the data type, to malloc the corresponding space - T *transferSpacePtr = (T *)operator new(totalBytes); - T *transferSpacePtrMov = transferSpacePtr; - - // consider the dataformat 
- for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < ih; h++) { - for (U32 w = 0; w < iw; w++) { - *transferSpacePtrMov = 1; - transferSpacePtrMov++; - } - } - } - - std::shared_ptr val((U8*)transferSpacePtr); - return val; -} - -std::shared_ptr load_fake_image(TensorDesc inputDesc) -{ - DataType dt = DT_F32; - DataFormat df; - U32 in, ic, ih, iw; - CHECK_STATUS(tensor4dGet(inputDesc, &dt, &df, &in, &ic, &ih, &iw)); - - switch (dt) { -#ifdef __aarch64__ - case DT_F16: { - return gen_fake_image(inputDesc); - } -#endif -#ifdef _USE_FP32 - case DT_F32: { - return gen_fake_image(inputDesc); - } -#endif - default: { - CHECK_STATUS(NOT_SUPPORTED); - return nullptr; - } - } -} diff --git a/image/src/resize.cpp b/image/src/resize.cpp deleted file mode 100644 index 290c1821..00000000 --- a/image/src/resize.cpp +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
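[Editor's note] To summarize the preprocessing variants in the deleted image_processing.cpp: the `*_SC` paths scale the short edge first (224 for `RGB_SC`, 256 for `RGB_SC_RAW`/`BGR_SC_RAW`) and center-crop to 224x224, but only `RGB_SC` then normalizes; the plain `RGB`/`BGR` paths resize directly and subtract the hard-coded channel means. Per channel c:

```latex
v' = \frac{v/255 - \mu_c}{\sigma_c}
\quad (\texttt{RGB\_SC}:\ \mu = \{0.485, 0.456, 0.406\},\ \sigma = \{0.229, 0.224, 0.225\})
\qquad
v' = (v - m_c)\cdot \mathrm{scaleValue}
\quad (\texttt{RGB}/\texttt{BGR}:\ m \approx \{122.68, 116.67, 104.01\})
```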
- - -#include "image.h" -#ifdef _USE_GENERAL -#include "cpu/general/image_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/image_arm.h" -#endif -#include - -// params is a pointer to either the target size or the resize ratios -// When resizeDesc specifies DT_U32, params should point to target sizes (height and width) -// When resizeDesc specifies DT_F32, params should point to resize ratios -EE resize_infer_output_size(TensorDesc inputDesc, ResizeDesc resizeDesc, void* params, - TensorDesc* outputDesc, U32* outputBytes) -{ - if (nullptr == outputDesc || nullptr == outputBytes) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt; - DataFormat idf; - U32 in, ic, ih, iw; - U32 oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - - switch(resizeDesc.paramDT) { - case DT_F32: { - F32 *scales = (F32*)params; - oh = ih * scales[0]; - ow = iw * scales[1]; - break; - } - case DT_U32: { - U32 *len = (U32*)params; - oh = len[0]; - ow = len[1]; - break; - } - default: { - return NOT_SUPPORTED; - } - } - - *outputDesc = tensor4df(idt, idf, in, ic, oh, ow); - *outputBytes = tensorNumBytes(*outputDesc); - return SUCCESS; -} - -EE resize(TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output, - Arch arch) -{ - DataType idt, odt; - DataFormat idf, odf; - U32 in, ic, ih, iw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - - CHECK_REQUIREMENT(in == on && ic == oc); - - if (ih == oh && iw == ow) { - memcpy(output, input, tensorNumBytes(inputDesc)); - return SUCCESS; - } - - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = resize_bilinear_general(inputDesc, input, - outputDesc, output); -#endif -#ifdef _USE_NEON - } else { - ret = resize_bilinear_arm(inputDesc, input, - outputDesc, output); -#endif - } - return ret; -} diff --git a/inference/CMakeLists.txt b/inference/CMakeLists.txt index 9599b2f6..b1e7ea08 100644 --- a/inference/CMakeLists.txt +++ b/inference/CMakeLists.txt @@ -1,33 +1,19 @@ cmake_minimum_required(VERSION 3.2) -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) if (BOLT_CONFIGURE_FILE) include(${BOLT_CONFIGURE_FILE}) else (BOLT_CONFIGURE_FILE) message(FATAL_ERROR " -FATAL: can not find bolt.cmake in directory, +FATAL: can not find bolt.cmake in /common/cmakes directory, please set shell or cmake environment variable BOLT_ROOT. 
") endif (BOLT_CONFIGURE_FILE) project(inference) -set_policy() - -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") -find_package(Uni) -find_package(ModelTools) -find_package(Image) -find_package(TensorComputing) -if(USE_MALI) - find_package(Gcl) -endif(USE_MALI) -if(BUILD_TEST) - find_package(jpeg) -endif(BUILD_TEST) - -set_project_install_directory() - -set_c_cxx_flags() - -add_subdirectory(src) +add_subdirectory(engine) +if (USE_FLOW) + add_subdirectory(flow) +endif (USE_FLOW) +add_subdirectory(examples) diff --git a/inference/engine/CMakeLists.txt b/inference/engine/CMakeLists.txt new file mode 100644 index 00000000..ac1be1a5 --- /dev/null +++ b/inference/engine/CMakeLists.txt @@ -0,0 +1,28 @@ +cmake_minimum_required(VERSION 3.2) + +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) +if (BOLT_CONFIGURE_FILE) + include(${BOLT_CONFIGURE_FILE}) +else (BOLT_CONFIGURE_FILE) + message(FATAL_ERROR " +FATAL: can not find bolt.cmake in directory, + please set shell or cmake environment variable BOLT_ROOT. + ") +endif (BOLT_CONFIGURE_FILE) + +project(engine) + +if (BUILD_TEST) + find_package(jpeg) +endif (BUILD_TEST) + +set_c_cxx_flags() + +include_engine() + +add_subdirectory(src) +add_subdirectory(tools) + +install(DIRECTORY api/java + api/c + DESTINATION include) diff --git a/inference/engine/api/c/bolt.h b/inference/engine/api/c/bolt.h new file mode 100644 index 00000000..4e4428bc --- /dev/null +++ b/inference/engine/api/c/bolt.h @@ -0,0 +1,341 @@ +/** + * @file + * @brief C API Document + * + * @copyright + * @code + * Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE + * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * @endcode
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** inference pipeline handle */
+typedef void *ModelHandle;
+
+/** result data memory handle */
+typedef void *ResultHandle;
+
+/** CPU affinity policy */
+typedef enum {
+    CPU_HIGH_PERFORMANCE = 0,  ///< performance is high priority (use big cores)
+    CPU_LOW_POWER = 1,         ///< power is high priority (use small cores)
+    GPU = 2                    ///< use GPU
+} AFFINITY_TYPE;
+
+/** heterogeneous device type */
+typedef enum {
+    CPU_SERIAL = 0,    ///< CPU serial
+    CPU_ARM_V7 = 1,    ///< ARMv7 CPU
+    CPU_ARM_V8 = 2,    ///< ARMv8 CPU
+    CPU_ARM_A55 = 3,   ///< ARM A55 CPU
+    CPU_ARM_A76 = 4,   ///< ARM A76 CPU
+    CPU_X86_AVX2 = 5,  ///< X86_64 AVX2 CPU
+    GPU_MALI = 10      ///< ARM MALI GPU
+} DEVICE_TYPE;
+
+/** data precision */
+typedef enum {
+    FP_32 = 0,   ///< 32 bit float
+    FP_16 = 1,   ///< 16 bit float
+    INT_32 = 2,  ///< 32 bit integer
+    UINT_32 = 3  ///< 32 bit unsigned integer
+} DATA_TYPE;
+
+/** multi-dimension data format */
+typedef enum {
+    NCHW = 0,    ///< batch->channel->height->width data order
+    NHWC = 1,    ///< batch->height->width->channel data order
+    NCHWC8 = 2,  ///< batch->channel/8->height->width->8 data order
+    MTK = 3,     ///< batch->time->unit data order
+    NORMAL = 4   ///< batch->unit data order
+} DATA_FORMAT;
+
+/**
+ * @brief create model from file
+ * @param modelPath model file path
+ * @param affinity CPU affinity setting
+ * @param algoPath the file path to save and load algorithm selection info
+ *
+ * @return inference pipeline handle
+ *
+ * @note destroy the model when the pipeline ends
+ * @code
+ * ModelHandle handle = CreateModel(...);
+ * ...
+ * DestroyModel(handle);
+ * @endcode
+ * A valid algoPath can speed up PrepareModel significantly:
+ * the algorithm selection step, which is usually time consuming, then only needs to run once.
+ * Its result is saved to the file path you set and loaded the next time you run,
+ * so the selection is not repeated. Setting a valid algoPath is strongly
+ * recommended, especially when running on the GPU.
+ * @note
+ * if your input size changes, please delete the previously saved algorithm file
+ * if your model changes, please delete the previously saved algorithm file
+ * if any unexpected error happens, you can try deleting the algorithm file and running again
+ */
+ModelHandle CreateModel(const char *modelPath, AFFINITY_TYPE affinity, const char *algoPath);
+
+/**
+ * @brief create model from file stream
+ * Other information is the same as CreateModel
+ **/
+ModelHandle CreateModelWithFileStream(
+    const char *modelFileStream, AFFINITY_TYPE affinity, const char *algoFileStream);
+
+/**
+ * @brief get the number of model inputs from ModelHandle
+ * @param ih inference pipeline handle
+ *
+ * @return the number of inputs
+ */
+int GetNumInputsFromModel(ModelHandle ih);
+
+/**
+ * @brief get input data info set in the model handle, which is read from .bolt
+ * @param ih inference pipeline handle
+ * @param number_inputs the number of inputs
+ * @param inputNames the array of all input data's names
+ * @param n the array of all input data's n dimension
+ * @param c the array of all input data's c dimension
+ * @param h the array of all input data's h dimension
+ * @param w the array of all input data's w dimension
+ * @param dt the array of all input data's data type
+ * @param df the array of all input data's data format
+ *
+ * @return
+ * @note
+ * the inputNames/n/c/h/w pointers must be managed by the user; each must provide
+ * space of at least number_inputs * bytesOf(dataType)
+ */
+void
GetInputDataInfoFromModel(ModelHandle ih, + const int number_inputs, + char **inputNames, + int *n, + int *c, + int *h, + int *w, + DATA_TYPE *dt, + DATA_FORMAT *df); + +/** + * @brief complete model inference engine prepare + * @param ih model inference handle + * @param num_input the number of input data + * @param name the array of all input data's name in string format + * @param n the array of all input data's n dimension + * @param c the array of all input data's c dimension + * @param h the array of all input data's h dimension + * @param w the array of all input data's w dimension + * @param dt_input the array of all input data's data type + * @param df_input the array of all input data's data format + * + * @return + */ +void PrepareModel(ModelHandle ih, + const int num_input, + char **name, + const int *n, + const int *c, + const int *h, + const int *w, + const DATA_TYPE *dt_input, + const DATA_FORMAT *df_input); + +/** + * @brief clone model from a model + * @param ih a inference pipeline handle pointer of a model + * + * @return inference pipeline handle + **/ +ModelHandle CloneModel(ModelHandle ih); + +/** + * @brief resize model input size + * @param ih model inference handle + * @param num_input the number of input data + * @param n the array of all input data's n dimension + * @param c the array of all input data's c dimension + * @param h the array of all input data's h dimension + * @param w the array of all input data's w dimension + * @param name the array of all input data's name in string format + * @param dt_input the array of all input data's data type + * @param df_input the array of all input data's data format + * + * @return + * + * @code + * // model_resize must behind PrepareModel; + * PrepareModel(...); + * ResizeModelInput(...); + * RunModel(...); + * @endcode + */ +void ResizeModelInput(ModelHandle ih, + const int num_input, + char **name, + const int *n, + const int *c, + const int *h, + const int *w, + const DATA_TYPE *dt_input, + const DATA_FORMAT *df_input); + +/** + * @brief malloc result data memory + * @param ih inference pipeline handle + * + * @return result data memory handle + */ +ResultHandle AllocAllResultHandle(ModelHandle ih); + +/** + * @brief malloc result data memory according to user specification + * @param ih inference pipeline handle + * @param num_outputs the number of tensor that needed + * @param outputNames the array of tesor name that needed + * + * @return result data memory handle + */ +ResultHandle AllocSpecificResultHandle(ModelHandle ih, const int num_outputs, char **outputNames); + +/** + * @brief clone result handle + * @param ir a result data handle + * + * @return result data memory handle + **/ +ResultHandle CloneResultHandle(ResultHandle ir); + +/** + * @brief set process to run on specified CPU core + * @param ih inference pipeline handle + * @param cpu_id cpu core id(0, 1, 2...) 
+ * @param device cpu core architecture(ARM_A76) + * + * @return + */ +void SetRuntimeDevice(ModelHandle ih, int cpu_id, DEVICE_TYPE device); + +/** + * @brief set process cpu affinity according cpu average occupy + * @param ih inference pipeline handle + * + * @return + */ +void SetRuntimeDeviceDynamic(ModelHandle ih); + +/** + * @brief inference result from input + * @param ih inference pipeline handle + * @param ir result data memory handle + * @param num_input the number of input data + * @param inputNames the array of all input data's name in string format + * @param mem the array of all input data + * + * @return + */ +void RunModel(ModelHandle ih, ResultHandle ir, const int num_input, char **inputNames, void **mem); + +/** + * @brief get the number of model output from ResultHandle + * @param ir result data memory handle + * + * @return the number of output + */ +int GetNumOutputsFromResultHandle(ResultHandle ir); + +/** + * @brief get output Data info from ResultHandle + * @param ir result data memory handle + * @param num_outputs the number of output data + * @param outputNames the array of all output data's name + * @param n the array of all output data's n dimension + * @param c the array of all output data's c dimension + * @param h the array of all output data's h dimension + * @param w the array of all output data's w dimension + * @param dt_output the array of all output data's data type + * @param df_output the array of all output data's data format + * + * @return + * @note + * ptr of outputNames/n/c/h/w/ need be managed by user, the space must be larger than num_outputs * Bytesof(dataType) + */ +void GetOutputDataInfoFromResultHandle(ResultHandle ir, + int num_outputs, + char **outputNames, + int *n, + int *c, + int *h, + int *w, + DATA_TYPE *dt_output, + DATA_FORMAT *df_output); +/** + * @brief get data from ResultHandle, default to pass value of output ptr, + * if need copy data to your own ptr, please use CopyOutputsFromResultHandle + * @param ir result data memory handle + * @param num_outputs the number of output data + * @param outputNames the array of all output data's name + * @param data the array of all output data's content + * @param n the array of all output data's n dimension + * @param c the array of all output data's c dimension + * @param h the array of all output data's h dimension + * @param w the array of all output data's w dimension + * @param dt_output the array of all output data's data type + * @param df_output the array of all output data's data format + * + * @return + */ +void GetPtrFromResultHandle(ResultHandle ir, + int num_outputs, + char **outputNames, + void **data, + int *n, + int *c, + int *h, + int *w, + DATA_TYPE *dt_output, + DATA_FORMAT *df_output); + +/** + * @brief get data ptr from ResultHandle with memcpy + * @param ir result data memory handle + * @param num_outputs the number of output data + * @param size the array of size of output + * @param data the array of all output data's content + * + * @return + * ptr of data need be managed by user, the space must be >= size + */ +void CopyOutputsFromResultHandle(ResultHandle ir, int num_outputs, const int *size, void **data); + +/** + * @brief free result data memory + * @param ir result data memory handle + * + * @return + */ +void FreeResultHandle(ResultHandle ir); + +/** + * @brief destroy model + * @param ih inference pipeline handle + * + * @return + */ +void DestroyModel(ModelHandle ih); +#ifdef __cplusplus +} +#endif diff --git a/inference/engine/api/dllite/Bolt.h 
b/inference/engine/api/dllite/Bolt.h new file mode 100644 index 00000000..e212fc9e --- /dev/null +++ b/inference/engine/api/dllite/Bolt.h @@ -0,0 +1,101 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef DLLITE_BOLT_H +#define DLLITE_BOLT_H + +#include +#include + +namespace bolt { + +/** inference pipeline handle */ +using ModelHandle = void *; + +/** result data memory handle */ +using ResultHandle = void *; + +/** CPU affinity policy */ +enum class AffinityType { + CPU_HIGH_PERFORMANCE = 0, ///< performance is high priority(use big core) + CPU_LOW_POWER = 1, ///< power is high priority(use small core) + GPU = 2 ///< use GPU +}; + +/** data precision */ +enum class TensorType { + FP32 = 0, ///< 32 bit float + FP16 = 1, ///< 16 bit float + INT32 = 2, ///< 32 bit integer + UINT32 = 3 ///< 32 bit unsigned integer +}; + +/** multi-dimension data format */ +enum class TensorLayout { + NCHW = 0, ///< batch->channel->height->width data order + NHWC = 1, ///< batch->height->width->channel data order + NCHWC8 = 2, ///< batch->channel/8->height->width->8 data order + ROW_MAJOR = 3, ///< batch->unit data order + RNN_MTK = 4 ///< batch->time->unit data order +}; + +// IOTensor +struct IOTensor { + std::string name; + TensorType type; + TensorLayout layout; + std::vector shape; + std::pair buffer; // +}; + +// For model and algo config, either both use stream (default) or both use path +struct ModelConfig { + AffinityType affinity; + std::pair modelStream; + std::pair algoStream; + std::string modelPath; + std::string algoPath; +}; + +// Return status +enum class ReturnStatus { + SUCCESS = 0, ///< SUCCESS + FAIL = -1, ///< FAIL + NULLPTR = -2 ///< NULLPTR +}; + +ModelHandle CreateModel(const ModelConfig &modelConfig); + +ReturnStatus GetIOFormats( + ModelHandle modelHandle, std::vector &inputs, std::vector &outputs); + +ReturnStatus PrepareModel(ModelHandle modelHandle, const std::vector &inputs); + +ReturnStatus GetInputTensors(ModelHandle modelHandle, std::vector &inputs); + +ReturnStatus ResizeInput(ModelHandle modelHandle, const std::vector &inputs); + +ResultHandle AllocResult(ModelHandle modelHandle, const std::vector &outputs); + +ReturnStatus RunModel( + ModelHandle modelHandle, ResultHandle resultHandle, const std::vector &inputs); + +ReturnStatus GetOutputTensors(ResultHandle resultHandle, std::vector &outputs); + +ReturnStatus FreeResult(ResultHandle resultHandle); + +ReturnStatus DestroyModel(ModelHandle 
modelHandle); + +} // namespace bolt + +#endif // DLLITE_BOLT_H diff --git a/inference/engine/api/java/BoltModel.java b/inference/engine/api/java/BoltModel.java new file mode 100644 index 00000000..bd043611 --- /dev/null +++ b/inference/engine/api/java/BoltModel.java @@ -0,0 +1,440 @@ +/** + * @file + * @brief Java BoltModel Class Document + * + * @copyright + * @code + * Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE + * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * @endcode + */ + +import java.io.File; +import java.io.FileNotFoundException; + +/** affinity policy */ +enum AffinityType { + CPU_HIGH_PERFORMANCE, ///< performance is high priority(use CPU big core) + CPU_LOW_POWER, ///< power is high priority(use CPU small core) + GPU ///< use ARM MALI GPU +} + +/** heterogeneous device type */ +enum DeviceType { + CPU_ARM_V7, ///< ARMv7 CPU + CPU_ARM_V8, ///< ARMv8 CPU + CPU_ARM_A55, ///< ARM A55 CPU + CPU_ARM_A76, ///< ARM A76 CPU + GPU_MALI, ///< ARM MALI GPU + CPU_X86_AVX2, ///< X86_64 AVX2 CPU + CPU_SERIAL ///< CPU serial +} + +/** data precision */ +enum DataType { + FP32, ///< 32 bit float + FP16, ///< 16 bit float + INT32, ///< 32 bit integer + UINT32 ///< 32 bit unsigned char +} + +/** multi-dimensions data format */ +enum DataFormat { + NCHW, ///< batch->channel->high->width data order + NHWC, ///< batch->high->width->channel data order + MTK, ///< batch->time->unit data order + NORMAL ///< vectorize input of row major +} + +public final class BoltModel implements Cloneable { + private static void loadLibrary(String libraryAbsolutePath, boolean optional) + { + File file = new File(libraryAbsolutePath); + if (file.exists()) { + System.load(libraryAbsolutePath); + } else { + if (!optional) { + System.err.println("[ERROR] unable to load " + libraryAbsolutePath); + } + } + } + + static + { + String dir = System.getProperty("user.dir"); + loadLibrary(dir + "/libc++_shared.so", true); + loadLibrary("/system/lib64/libOpenCL.so", true); + loadLibrary(dir + "/libkernelsource.so", true); + loadLibrary(dir + "/libBoltModel.so", false); + } + + private long modelAddr; + + private long IResult; + + private native long createModel(String modelPath, String affinity); + + private native void prepareModel(long modelAddr, + int inputNum, + String[] inputName, + int[] inputN, + int[] inputC, + int[] inputH, + int[] inputW, + String[] inputDataType, + String[] inputDataFormat); + + private native long cloneModel(long modelAddr); + + private native long cloneResult(long IResult); 
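[Editor's note] Stepping back to the DLLite facade declared in Bolt.h above: it wraps the same create/prepare/run/fetch lifecycle behind value types. A minimal caller sketch, assuming `std::vector<IOTensor>` parameters where extraction dropped the template arguments; the model path, algo path, and buffer handling are invented for illustration, and error handling is mostly elided:

```cpp
#include <string>
#include <vector>
// #include "Bolt.h"  // DLLite header added by this patch

int main()
{
    // Path-based configuration; the stream pair members are left empty.
    bolt::ModelConfig config;
    config.affinity = bolt::AffinityType::CPU_HIGH_PERFORMANCE;
    config.modelPath = "./mobilenet_v1_f32.bolt";  // assumption
    config.algoPath = "./algo_cache";              // assumption

    bolt::ModelHandle model = bolt::CreateModel(config);

    std::vector<bolt::IOTensor> inputs, outputs;
    if (bolt::GetIOFormats(model, inputs, outputs) != bolt::ReturnStatus::SUCCESS) {
        return -1;
    }
    bolt::PrepareModel(model, inputs);
    bolt::GetInputTensors(model, inputs);
    // ... copy input data into the buffers exposed by inputs here ...

    bolt::ResultHandle result = bolt::AllocResult(model, outputs);
    bolt::RunModel(model, result, inputs);
    bolt::GetOutputTensors(result, outputs);  // output buffers now valid

    bolt::FreeResult(result);
    bolt::DestroyModel(model);
    return 0;
}
```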
+ + private native void resizeModelInput(long modelAddr, + int inputNum, + String[] inputName, + int[] inputN, + int[] inputC, + int[] inputH, + int[] inputW, + String[] inputDataType, + String[] inputDataFormat); + + private native long allocAllResultHandle(long modelAddr); + + private native long allocSpecificResultHandle(long modelAddr, int outputNum, String[] outputName); + + private native void setRuntimeDeviceJNI(int cpuId, String device); + + private native void setRuntimeDeviceDynamicJNI(); + + private native void runModel( + long modelAddr, long IResult, int inputNum, String[] inputName, float[][] inputData); + + private native BoltResult getOutput(long IResult); + + private native void freeResultHandle(long IResult); + + private native void destroyModel(long modelAddr); + + public String affinityMapping(AffinityType affinity) + { + String ret = "CPU_AFFINITY_HIGH_PERFORMANCE"; + if (affinity == AffinityType.CPU_HIGH_PERFORMANCE) { + ret = "CPU_AFFINITY_HIGH_PERFORMANCE"; + } else if (affinity == AffinityType.CPU_LOW_POWER) { + ret = "CPU_AFFINITY_LOW_POWER"; + } else if (affinity == AffinityType.GPU) { + ret = "GPU"; + } else { + System.err.println("[ERROR] unsupported CPU affinity in " + this.getClass().getName()); + } + return ret; + } + + public String deviceMapping(DeviceType device) + { + String ret = "CPU_ARM_V8"; + if (device == DeviceType.CPU_ARM_V7) { + ret = "CPU_ARM_V7"; + } else if (device == DeviceType.CPU_ARM_V8) { + ret = "CPU_ARM_V8"; + } else if (device == DeviceType.CPU_ARM_A55) { + ret = "CPU_ARM_A55"; + } else if (device == DeviceType.CPU_ARM_A76) { + ret = "CPU_ARM_A76"; + } else if (device == DeviceType.GPU_MALI) { + ret = "GPU_MALI"; + } else if (device == DeviceType.CPU_X86_AVX2) { + ret = "CPU_X86_AVX2"; + } else if (device == DeviceType.CPU_SERIAL) { + ret = "CPU_SERIAL"; + } else { + System.err.println("[ERROR] unsupported device in " + this.getClass().getName()); + } + return ret; + } + + public String dataTypeMapping(DataType data_type) + { + String ret = "FP32"; + if (data_type == DataType.FP32) { + ret = "FP32"; + } else if (data_type == DataType.FP16) { + ret = "FP16"; + } else if (data_type == DataType.INT32) { + ret = "INT32"; + } else if (data_type == DataType.UINT32) { + ret = "UINT32"; + } else { + System.err.println("[ERROR] unsupported data type in " + this.getClass().getName()); + } + return ret; + } + + private String dataFormatMapping(DataFormat data_format) + { + String ret = "NCHW"; + if (data_format == DataFormat.NCHW) { + ret = "NCHW"; + } else if (data_format == DataFormat.NHWC) { + ret = "NHWC"; + } else if (data_format == DataFormat.MTK) { + ret = "MTK"; + } else if (data_format == DataFormat.NORMAL) { + ret = "NORMAL"; + } else { + System.err.println("[ERROR] unsupported data format in " + this.getClass().getName()); + } + return ret; + } + + BoltModel() + { + this.modelAddr = 0; + this.IResult = -1; + } + + /** + * @brief initial model and alloc memory + * @param modelPath model file path of String type + * @param affinity CPU affinity setting of AffinityType(enum) type + * @param device heterogeneous device setting of DeviceType(enum) type + * @param inputNum the number of input data of int type + * @param inputName the array of all input data's name of string type + * @param inputN the array of all input data's n dimension of int type + * @param inputC the array of all input data's c dimension of int type + * @param inputH the array of all input data's h dimension of int type + * @param inputW the array of all input data's w 
dimension of int type + * @param inputDataType the array of all input data's data type of DataType(enum) type + * @param inputDataFormat the array of all input data's data format of DataFormat(enum) type + * + * @return + * + * @note destroy model when pipeline end + * @code + * BoltModel example = BoltModel(...); + * ... + * example.estructor(); + * @endcode + */ + BoltModel(String modelPath, + AffinityType affinity, + int inputNum, + String[] inputName, + int[] inputN, + int[] inputC, + int[] inputH, + int[] inputW, + DataType[] inputDataType, + DataFormat[] inputDataFormat) + { + String affinityString = affinityMapping(affinity); + String[] inputDataTypeString = new String[inputNum]; + String[] inputDataFormatString = new String[inputNum]; + for (int i = 0; i < inputNum; i++) { + inputDataTypeString[i] = dataTypeMapping(inputDataType[i]); + inputDataFormatString[i] = dataFormatMapping(inputDataFormat[i]); + } + + this.modelAddr = createModel(modelPath, affinityString); + if (0 != this.modelAddr) { + prepareModel(this.modelAddr, inputNum, inputName, inputN, inputC, inputH, inputW, + inputDataTypeString, inputDataFormatString); + this.IResult = allocAllResultHandle(this.modelAddr); + } else { + this.IResult = -1; + System.err.println("[ERROR] model cannot be created in " + this.getClass().getName()); + } + } + + /** + * @brief initial model and alloc memory, and the output is decided by user + * @param modelPath model file path of String type + * @param affinity CPU affinity setting of AffinityType(enum) type + * @param device heterogeneous device setting of DeviceType(enum) type + * @param inputNum the number of input data of int type + * @param inputName the array of all input data's name of string type + * @param inputN the array of all input data's n dimension of int type + * @param inputC the array of all input data's c dimension of int type + * @param inputH the array of all input data's h dimension of int type + * @param inputW the array of all input data's w dimension of int type + * @param inputDataType the array of all input data's data type of DataType(enum) type + * @param inputDataFormat the array of all input data's data format of DataFormat(enum) type + * @param outputNum the number of output data of int type + * @param outputName the array of all output data's name of string type + * + * @return + * + * @note destroy model when pipeline end + * @code + * BoltModel example = BoltModel(...); + * ... 
+ * example.estructor(); + * @endcode + */ + BoltModel(String modelPath, + AffinityType affinity, + int inputNum, + String[] inputName, + int[] inputN, + int[] inputC, + int[] inputH, + int[] inputW, + DataType[] inputDataType, + DataFormat[] inputDataFormat, + int outputNum, + String[] outputName) + { + String affinityString = affinityMapping(affinity); + String[] inputDataTypeString = new String[inputNum]; + String[] inputDataFormatString = new String[inputNum]; + for (int i = 0; i < inputNum; i++) { + inputDataTypeString[i] = dataTypeMapping(inputDataType[i]); + inputDataFormatString[i] = dataFormatMapping(inputDataFormat[i]); + } + + this.modelAddr = createModel(modelPath, affinityString); + if (0 != this.modelAddr) { + prepareModel(this.modelAddr, inputNum, inputName, inputN, inputC, inputH, inputW, + inputDataTypeString, inputDataFormatString); + this.IResult = allocSpecificResultHandle(this.modelAddr, outputNum, outputName); + } else { + this.IResult = -1; + System.err.println("[ERROR] model cannot be created in " + this.getClass().getName()); + } + } + + /** + * @brief clone BoltModel + * + * @return cloneModel: shared weight with original model but has different tensor space + */ + protected Object clone() { + BoltModel cloneModel = new BoltModel(); + if (0 != this.modelAddr) { + cloneModel.modelAddr = cloneModel(this.modelAddr); + } else { + cloneModel.modelAddr = 0; + } + if (-1 != this.IResult) { + cloneModel.IResult = cloneResult(this.IResult); + } else { + cloneModel.IResult = -1; + } + return cloneModel; + } + + /** + * @brief set process to run on specified CPU core + * @param cpuId cpu core id(0, 1, 2...) + * @param device cpu core architecture(ARM_A76) + * + * @return + */ + public void setRuntimeDevice(int cpuId, DeviceType device) throws FileNotFoundException + { + if (0 == this.modelAddr) { + throw new FileNotFoundException(); + } + String deviceString = deviceMapping(device); + setRuntimeDeviceJNI(cpuId, deviceString); + } + + /** + * @brief set process cpu affinity according cpu average occupy + * + * @return + */ + public void setRuntimeDeviceDynamic() throws FileNotFoundException + { + if (0 == this.modelAddr) { + throw new FileNotFoundException(); + } + setRuntimeDeviceDynamicJNI(); + } + + /** + * @brief inference result from input + * @param inputNum the number of input data of int type + * @param inputName the array of all input data's name of string type + * @param inputData the 2D array of all input data of float type + * + * @return BoltResult : the result class of bolt model after inference + */ + public BoltResult run(int inputNum, String[] inputName, float[][] inputData) + { + if (0 == this.modelAddr) { + return null; + } + runModel(this.modelAddr, this.IResult, inputNum, inputName, inputData); + BoltResult boltResult = getOutput(this.IResult); + return boltResult; + } + + /** + * @brief inference result from resized input + * @param inputNum the number of input data of int type + * @param inputName the array of all input data's name of String type + * @param inputN the array of all input data's n dimension of int type + * @param inputC the array of all input data's c dimension of int type + * @param inputH the array of all input data's h dimension of int type + * @param inputW the array of all input data's w dimension of int type + * @param inputDataType the array of all input data's data type of DataType(enum) type + * @param inputDataFormat the array of all input data's data format of DataFormat(enum) type + * @param inputData the 2D array of all input 
data of float type + * + * @return BoltResult : the result class of bolt model after inference + */ + public BoltResult run(int inputNum, + String[] inputName, + int[] inputN, + int[] inputC, + int[] inputH, + int[] inputW, + DataType[] inputDataType, + DataFormat[] inputDataFormat, + float[][] inputData) + { + if (0 == this.modelAddr) { + return null; + } + String[] inputDataTypeString = new String[inputNum]; + String[] inputDataFormatString = new String[inputNum]; + for (int i = 0; i < inputNum; i++) { + inputDataTypeString[i] = dataTypeMapping(inputDataType[i]); + inputDataFormatString[i] = dataFormatMapping(inputDataFormat[i]); + } + + resizeModelInput(this.modelAddr, inputNum, inputName, inputN, inputC, inputH, inputW, + inputDataTypeString, inputDataFormatString); + runModel(this.modelAddr, this.IResult, inputNum, inputName, inputData); + BoltResult boltResult = getOutput(this.IResult); + return boltResult; + } + + /** + * @brief recycle memory and destroy model + * + * @return + */ + public void destructor() + { + if (-1 != this.IResult) { + freeResultHandle(this.IResult); + this.IResult = -1; + } + if (0 != this.modelAddr) { + destroyModel(this.modelAddr); + this.modelAddr = 0; + } + } +} diff --git a/inference/engine/api/java/BoltResult.java b/inference/engine/api/java/BoltResult.java new file mode 100644 index 00000000..e0e6a80e --- /dev/null +++ b/inference/engine/api/java/BoltResult.java @@ -0,0 +1,127 @@ +/** + * @file + * @brief Java BoltResult Class Document + * + * @copyright + * @code + * Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE + * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
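[Editor's note] For comparison with the Java wrapper, the raw C API from bolt.h drives the same sequence directly. A minimal hypothetical C++ caller, assuming a single FP32 NCHW input; the model path, input name, and shape are assumptions:

```cpp
#include <cstdio>
#include <vector>
// #include "bolt.h"  // C API header added by this patch

int main()
{
    // nullptr algoPath skips the algorithm cache (a valid path is recommended).
    ModelHandle model = CreateModel("./model_f32.bolt", CPU_HIGH_PERFORMANCE, nullptr);

    char name[] = "input";  // hypothetical input tensor name
    char *names[] = {name};
    int n[] = {1}, c[] = {3}, h[] = {224}, w[] = {224};
    DATA_TYPE dt[] = {FP_32};
    DATA_FORMAT df[] = {NCHW};
    PrepareModel(model, 1, names, n, c, h, w, dt, df);

    ResultHandle result = AllocAllResultHandle(model);

    std::vector<float> image(1 * 3 * 224 * 224, 0.0f);  // fill with real pixels
    void *inputs[] = {image.data()};
    RunModel(model, result, 1, names, inputs);

    printf("outputs: %d\n", GetNumOutputsFromResultHandle(result));

    FreeResultHandle(result);
    DestroyModel(model);
    return 0;
}
```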
+ * @endcode + */ + +public class BoltResult { + /** 2d float array of output data in the inference result, the length of value is output size */ + private float[][] value; + + /** 2d int array of output dimension info in the inference result, the length of dimension is output size */ + private int[][] dimension; + + /** String array of output names info in the inference result, the length of name is output size */ + private String[] name; + + /** String array of output data info in the inference result, the length of dataFormat is output size */ + private String[] dataFormat; + + /** calculate product and skip 0 */ + public static int calculateLength(int[] array) + { + int num = array.length; + int length = 0; + for (int j = 0; j < num; j++) { + if (array[j] == 0) { + break; + } else { + if (length == 0) { + length = array[j]; + } else { + length *= array[j]; + } + } + } + return length; + } + + public BoltResult(float[][] value, int[][] dimension, String[] name, String[] dataFormat) + { + this.value = value; + this.dimension = dimension; + this.name = name; + this.dataFormat = dataFormat; + } + + /** + * @brief get result data name from BoltResult object + * + * @return 1d String array of output data in the inference result + */ + public String[] getResultName() + { + return this.name; + } + + /** + * @brief get result data format from BoltResult object + * + * @return 1d String array of output data in the inference result + */ + public String[] getResultDataFormat() + { + return this.dataFormat; + } + + /** + * @brief get result data dimension information from BoltResult object + * + * @return 2d int array of output data in the inference result + */ + public int[][] getResultDimension() + { + return this.dimension; + } + + /** + * @brief get result data array from BoltResult object + * + * @return 2d float array of output data in the inference result + */ + public float[][] getResultData() + { + return this.value; + } + + /** + * @brief print BoltResult object info + * @param num the number of the result you want + * + * @return + */ + public void print(int num) + { + for (int i = 0; i < name.length; i++) { + System.out.println("[INFO] output name: " + name[i]); + System.out.println(" data format: " + dataFormat[i]); + int len = calculateLength(this.dimension[i]); + System.out.println(" data number: " + len); + if (num >= 0) { + if (num < len) { + len = num; + } + } + + for (int j = 0; j < len; j++) { + System.out.print(value[i][j] + " "); + } + System.out.println(); + } + } +} diff --git a/inference/engine/include/BoltModel.h b/inference/engine/include/BoltModel.h new file mode 100644 index 00000000..163fc68c --- /dev/null +++ b/inference/engine/include/BoltModel.h @@ -0,0 +1,145 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include + +#ifndef _Included_BoltModel +#define _Included_BoltModel +#ifdef __cplusplus +extern "C" { +#endif +// there is no need to add '/' before com +// #define BOLT_JNI_PATH_PREFIX "com/huawei/noah/" +// #define BOLT_JNI_PREFIX_(X) Java_com_huawei_noah_##X +#define BOLT_JNI_PATH_PREFIX "" +#define BOLT_JNI_PREFIX_(X) Java_##X +#define BOLT_JNI_PREFIX(X) BOLT_JNI_PREFIX_(X) +/* + * Class: BoltModel + * Method: createModel + * Signature: (Ljava/lang/String;Ljava/lang/String;)J + */ +JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_createModel)(JNIEnv *, jobject, jstring, jstring); + +/* + * Class: BoltModel + * Method: prepareModel + * Signature: (JI[Ljava/lang/String;[I[I[I[I[Ljava/lang/String;[Ljava/lang/String;)V + */ +JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_prepareModel)(JNIEnv *, + jobject, + jlong, + jint, + jobjectArray, + jintArray, + jintArray, + jintArray, + jintArray, + jobjectArray, + jobjectArray); + +/* + * Class: BoltModel + * Method: cloneModel + * Signature: (J)J + */ +JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_cloneModel)(JNIEnv *, jobject, jlong); + +/* + * Class: BoltModel + * Method: resizeModelInput + * Signature: (JI[Ljava/lang/String;[I[I[I[I[Ljava/lang/String;[Ljava/lang/String;)V + */ +JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_resizeModelInput)(JNIEnv *, + jobject, + jlong, + jint, + jobjectArray, + jintArray, + jintArray, + jintArray, + jintArray, + jobjectArray, + jobjectArray); + +/* + * Class: BoltModel + * Method: allocAllResultHandle + * Signature: (J)J + */ +JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_allocAllResultHandle)(JNIEnv *, jobject, jlong); + +/* + * Class: BoltModel + * Method: allocSpecificResultHandle + * Signature: (JI[Ljava/lang/String;)J + */ +JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_allocSpecificResultHandle)( + JNIEnv *, jobject, jlong, jint, jobjectArray); + +/* + * Class: BoltModel + * Method: cloneResult + * Signature: (J)J + */ +JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_cloneResult)(JNIEnv *, jobject, jlong); + +/* + * Class: BoltModel + * Method: setRuntimeDeviceJNI + * Signature: (ILjava/lang/String;)V + */ +JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_setRuntimeDeviceJNI)( + JNIEnv *, jobject, jlong, jint, jstring); + +/* + * Class: BoltModel + * Method: setRuntimeDeviceDynamicJNI + * Signature: (V)V + */ +JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_setRuntimeDeviceDynamicJNI)( + JNIEnv *, jobject, jlong); + +/* + * Class: BoltModel + * Method: runModel + * Signature: (JJI[Ljava/lang/String;[[F)V + */ +JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_runModel)( + JNIEnv *, jobject, jlong, jlong, jint, jobjectArray, jobjectArray); + +/* + * Class: BoltModel + * Method: getOutput + * Signature: (J)LBoltResult; + */ +JNIEXPORT jobject JNICALL BOLT_JNI_PREFIX(BoltModel_getOutput)(JNIEnv *, jobject, jlong); + +/* + * Class: BoltModel + * Method: freeResultHandle + * Signature: (J)V + */ +JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_freeResultHandle)(JNIEnv *, jobject, jlong); + +/* + * Class: BoltModel 
+ * Method: destroyModel + * Signature: (J)V + */ +JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_destroyModel)(JNIEnv *, jobject, jlong); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/inference/include/activation.hpp b/inference/engine/include/activation.hpp similarity index 77% rename from inference/include/activation.hpp rename to inference/engine/include/activation.hpp index f1a5404a..38e3054a 100644 --- a/inference/include/activation.hpp +++ b/inference/engine/include/activation.hpp @@ -1,34 +1,27 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
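[Editor's note] One detail worth spelling out from the JNI header above: BOLT_JNI_PREFIX splices the Java package path into the exported symbol name, and the path prefix is currently empty, so BoltModel must live in the default package. A sketch of the expansion:

```cpp
#define BOLT_JNI_PREFIX_(X) Java_##X
#define BOLT_JNI_PREFIX(X) BOLT_JNI_PREFIX_(X)

// BOLT_JNI_PREFIX(BoltModel_destroyModel) expands to Java_BoltModel_destroyModel,
// the JNI symbol for BoltModel.destroyModel() in the default package. With the
// commented-out com/huawei/noah/ variant, the macro would instead have to produce
// Java_com_huawei_noah_BoltModel_destroyModel.
void BOLT_JNI_PREFIX(BoltModel_destroyModel)(void);
```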
- #ifndef _ACTIVATION_H #define _ACTIVATION_H #include "operator.hpp" -#include "tensor_computing.h" -#include "tensor_desc.h" -class Activation: public Operator -{ +class Activation : public Operator { public: - /** - @param mode - */ - Activation(ActivationDesc activationDesc) + Activation(ActivationParamSpec activationDesc) { this->activationDesc = activationDesc; - switch(activationDesc.mode) { + switch (activationDesc.mode) { case ACTIVATION_RELU: { this->opt = OT_Relu; break; @@ -57,6 +50,14 @@ class Activation: public Operator this->opt = OT_TanH; break; } + case ACTIVATION_MISH: { + this->opt = OT_Mish; + break; + } + case ACTIVATION_GREATER: { + this->opt = OT_Greater; + break; + } default: { CHECK_STATUS(NOT_SUPPORTED); } @@ -64,20 +65,19 @@ class Activation: public Operator this->lenOfTemp = 0; } - OperatorType get_op_type() override + OperatorType get_type() override { return this->opt; } - bool can_input_output_the_same() override { return true; } protected: - ActivationDesc activationDesc; + ActivationParamSpec activationDesc; OperatorType opt; }; -#endif //_ACTIVATION_H +#endif // _ACTIVATION_H diff --git a/inference/engine/include/argmax.hpp b/inference/engine/include/argmax.hpp new file mode 100644 index 00000000..50d98fdb --- /dev/null +++ b/inference/engine/include/argmax.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _ARGMAX_H +#define _ARGMAX_H + +#include "operator.hpp" + +class ArgMax : public Operator { +public: + ArgMax(DataType dt, ArgMaxParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_ArgMax; + } + +protected: + ArgMaxParamSpec p; +}; + +#endif // _ARGMAX_H diff --git a/inference/engine/include/attention.hpp b/inference/engine/include/attention.hpp new file mode 100644 index 00000000..a992be7f --- /dev/null +++ b/inference/engine/include/attention.hpp @@ -0,0 +1,65 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _ATTENTION_H
+#define _ATTENTION_H
+
+#include "operator.hpp"
+
+class Attention : public Operator {
+public:
+    Attention(DataType dt, AttentionParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+    }
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new Attention(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Attention;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        auto inDesc = inputTensor.get_desc();
+        inDesc.dt = this->dt;
+        inputTensor.resize(inDesc);
+        CHECK_STATUS(attention(inputTensor, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        auto inTensor = *inTensors[0];
+        auto inDesc = inTensor.get_desc();
+        inDesc.dt = this->dt;
+        inTensor.resize(inDesc);
+        CHECK_STATUS(attention_infer_output_size(&inTensor, this->p, outTensors[0]));
+        return SUCCESS;
+    }
+
+private:
+    AttentionParamSpec p;
+};
+
+#endif // _ATTENTION_H
diff --git a/inference/engine/include/attention_mask.hpp b/inference/engine/include/attention_mask.hpp
new file mode 100644
index 00000000..91561b33
--- /dev/null
+++ b/inference/engine/include/attention_mask.hpp
@@ -0,0 +1,57 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _ATTENTION_MASK_H
+#define _ATTENTION_MASK_H
+
+#include "operator.hpp"
+
+class AttentionMask : public Operator {
+public:
+    AttentionMask(DataType dt, AttentionMaskParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+    }
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new AttentionMask(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_AttentionMask;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(attention_mask(inputTensor, this->p, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        return attention_mask_infer_output_size(inTensors[0], outTensors[0]);
+    }
+
+private:
+    AttentionMaskParamSpec p;
+};
+
+#endif
diff --git a/inference/engine/include/bilateral_slice_apply.hpp b/inference/engine/include/bilateral_slice_apply.hpp
new file mode 100644
index 00000000..bcfc6f18
--- /dev/null
+++ b/inference/engine/include/bilateral_slice_apply.hpp
@@ -0,0 +1,37 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _BILATERAL_SLICE_APPLY_H
+#define _BILATERAL_SLICE_APPLY_H
+
+#include "operator.hpp"
+
+class BilateralSliceApply : public Operator {
+public:
+    BilateralSliceApply(BilateralSliceApplyParamSpec p)
+    {
+        this->p = p;
+    }
+    virtual ~BilateralSliceApply()
+    {}
+
+    OperatorType get_type() override
+    {
+        return OT_BilateralSliceApply;
+    }
+
+protected:
+    BilateralSliceApplyParamSpec p;
+};
+
+#endif // _BILATERAL_SLICE_APPLY_H
diff --git a/inference/engine/include/channel_resize.hpp b/inference/engine/include/channel_resize.hpp
new file mode 100644
index 00000000..837f45a0
--- /dev/null
+++ b/inference/engine/include/channel_resize.hpp
@@ -0,0 +1,42 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _CHANNEL_RESIZE_H +#define _CHANNEL_RESIZE_H + +#include "operator.hpp" + +class ChannelResize : public Operator { +public: + ChannelResize(DataType dt, ChannelResizeParamSpec p) + { + this->dt = dt; + this->valid = true; + this->rearrange = true; + this->p = p; + } + + OperatorType get_type() override + { + return OT_ChannelResize; + } + +protected: + bool valid; + // whether to rearrange cut data to specific format(NCHWC8) + bool rearrange; + + ChannelResizeParamSpec p; +}; + +#endif // _CHANNEL_RESIZE_H diff --git a/inference/engine/include/check.hpp b/inference/engine/include/check.hpp new file mode 100644 index 00000000..2c8ce0b0 --- /dev/null +++ b/inference/engine/include/check.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _CHECK_H +#define _CHECK_H + +#include "operator.hpp" + +class Check : public Operator { +public: + Check(DataType dt, CheckParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_Check; + } + +protected: + CheckParamSpec p; +}; + +#endif // _CHECK_H diff --git a/inference/engine/include/clip.hpp b/inference/engine/include/clip.hpp new file mode 100644 index 00000000..bdbda939 --- /dev/null +++ b/inference/engine/include/clip.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _CLIP_H +#define _CLIP_H + +#include "operator.hpp" + +class Clip : public Operator { +public: + Clip(DataType dt, ClipParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_Clip; + } + + bool can_input_output_the_same() override + { + return true; + } + +protected: + ClipParamSpec p; +}; + +#endif // _CLIP_H diff --git a/inference/engine/include/cnn.h b/inference/engine/include/cnn.h new file mode 100644 index 00000000..54b197c9 --- /dev/null +++ b/inference/engine/include/cnn.h @@ -0,0 +1,114 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+
+#ifndef _CNN_H
+#define _CNN_H
+
+#include <string>
+#include <map>
+#include "model.hpp"
+#include "memory_tracker.hpp"
+#ifdef _USE_MALI
+#include "gcl_common.h"
+#endif
+
+class CNN : public Model {
+public:
+    CNN()
+    {}
+
+    explicit CNN(AffinityPolicy affinityPolicy, DataType dt, std::string name)
+        : Model(affinityPolicy, dt, name)
+    {}
+
+    virtual ~CNN() = default;
+
+    CNN clone();
+
+    void sort_operators_sequential(const ModelSpec *ms);
+
+    void initialize_ops(const ModelSpec *ms);
+
+    void ready(std::map<std::string, TensorDesc> inputDescMap) override;
+
+    void reready(std::map<std::string, TensorDesc> inputDescMap);
+
+    EE mark_input_output();
+
+    void copy_to_named_input(std::string inputName, const U8 *data);
+
+    void set_input_tensors_value(std::map<std::string, std::shared_ptr<U8>> modelTensorsInput);
+
+    std::map<std::string, std::shared_ptr<Tensor>> get_inputs();
+
+    std::map<std::string, std::shared_ptr<Tensor>> get_outputs();
+
+    Tensor get_tensor_by_name(std::string tensorName);
+
+    TensorDesc get_tensor_desc_by_name(std::string tensorName);
+
+    std::vector<std::string> get_model_input_tensor_names();
+
+    std::vector<TensorDesc> get_model_input_tensor_descs();
+
+    std::vector<std::string> get_model_output_tensor_names();
+
+    EE infer_output_tensors_size(std::map<std::string, TensorDesc> inputDescMap) override;
+
+    void assign_output_tensor() override;
+
+    void addOutputTensorNames(std::vector<std::string> outputTensorNames);
+
+    void run() override;
+
+#ifdef _USE_MALI
+    void mali_prepare(bool reset);
+#endif
+private:
+    std::shared_ptr<Tensor> allocate_tensor(U32 size = 0);
+
+    void add(std::shared_ptr<Operator> op,
+        std::vector<std::string> inputTensorsName,
+        std::vector<std::string> outputTensorsName);
+
+    void infer_layout_desc();
+
+    void update_op_tensors();
+
+    void set_input_tensors_desc(std::map<std::string, TensorDesc> inputDescMap);
+
+    void infer_tmp_memory_size() override;
+
+    void assign_tmp_tensor() override;
+
+    void check_memory_reuse_ratio();
+
+private:
+    std::map<std::string, std::shared_ptr<Tensor>> tensorMap;
+    std::map<std::string, std::shared_ptr<Operator>> operatorMap;
+    std::map<std::string, std::vector<std::vector<std::string>>> operatorTensorMap;
+
+    std::set<std::string> weightOpOutputNames;
+    std::map<std::string, std::shared_ptr<Tensor>> inputTensors;
+    std::map<std::string, std::shared_ptr<Tensor>> outputTensors;
+    std::vector<std::shared_ptr<Tensor>> storageMemory;
+    Tensor tmpTensor;
+
+    std::vector<std::string> sortedOps;
+
+    std::vector<std::string> modelInputTensorNames;
+    std::vector<TensorDesc> modelInputTensorDescs;
+    std::vector<std::string> modelOutputTensorNames;
+    MemoryTracker memoryTracker;
+};
+#endif
diff --git a/inference/engine/include/concat.hpp b/inference/engine/include/concat.hpp
new file mode 100644
index 00000000..a055619e
--- /dev/null
+++ b/inference/engine/include/concat.hpp
@@ -0,0 +1,35 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
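// [Editor's note] A hedged usage sketch for the CNN engine class declared above, under
// the assumption that a ModelSpec "ms" has already been deserialized, that
// AFFINITY_CPU_HIGH_PERFORMANCE is the CPU affinity enum used elsewhere in the project,
// and that the calls are made in the declaration's implied order (sort, initialize,
// ready, mark, run); "imageData" is a placeholder input buffer and error handling is
// elided:
//
//     CNN net(AFFINITY_CPU_HIGH_PERFORMANCE, DT_F32, "demo_net");
//     net.sort_operators_sequential(&ms);
//     net.initialize_ops(&ms);
//     std::map<std::string, TensorDesc> inputDescMap;
//     inputDescMap["input"] = tensor4df(DT_F32, DF_NCHW, 1, 3, 224, 224);
//     net.ready(inputDescMap);
//     net.mark_input_output();
//     net.copy_to_named_input("input", (const U8 *)imageData);
//     net.run();
//     std::map<std::string, std::shared_ptr<Tensor>> outputs = net.get_outputs();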
+
+#ifndef _CONCAT_H
+#define _CONCAT_H
+
+#include "operator.hpp"
+
+class Concat : public Operator {
+public:
+    Concat(ConcatParamSpec p)
+    {
+        this->p = p;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Concat;
+    }
+
+protected:
+    ConcatParamSpec p;
+};
+
+#endif // _CONCAT_H
diff --git a/inference/engine/include/constant.hpp b/inference/engine/include/constant.hpp
new file mode 100644
index 00000000..93c3e344
--- /dev/null
+++ b/inference/engine/include/constant.hpp
@@ -0,0 +1,57 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _CONSTANT_H
+#define _CONSTANT_H
+#include "operator.hpp"
+
+class Constant : public Operator {
+public:
+    Constant(TensorDesc constDesc, void *data)
+    {
+        this->constDesc = constDesc;
+        this->data = data;
+    }
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new Constant(this->constDesc, this->data));
+        *mem = *this;
+        return mem;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Constant;
+    }
+
+    void run() override
+    {
+        Tensor outputTensor = this->outputTensors[0];
+        auto outputPtr = ((CpuMemory *)outputTensor.get_memory())->get_ptr();
+        memcpy(outputPtr, data, tensorNumBytes(constDesc));
+    }
+
+    EE infer_output_tensors_size(std::vector<TensorDesc> *outDims) override
+    {
+        (*outDims)[0] = constDesc;
+        return SUCCESS;
+    }
+
+private:
+    TensorDesc constDesc;
+    void *data;
+};
+
+#endif // _CONSTANT_H
diff --git a/inference/engine/include/convolution.hpp b/inference/engine/include/convolution.hpp
new file mode 100644
index 00000000..a4f7975f
--- /dev/null
+++ b/inference/engine/include/convolution.hpp
@@ -0,0 +1,69 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _CONVOLUTION_H
+#define _CONVOLUTION_H
+
+#include "weight_operator.hpp"
+
+class Convolution : public WeightOperator {
+public:
+    Convolution(DataType dt,
+        ConvolutionParamSpec p,
+        ActivationParamSpec dwActivationParamSpec,
+        ActivationParamSpec pwActivationParamSpec)
+    {
+        this->dt = dt;
+        this->p = p;
+        this->dwActivationParamSpec = dwActivationParamSpec;
+        this->pwActivationParamSpec = pwActivationParamSpec;
+        this->hasBias = false;
+        this->pwAlg = CONVOLUTION_ALGORITHM_NULL;
+        this->dwAlg = DEPTHWISE_CONVOLUTION_ALGORITHM_NULL;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Conv;
+    }
+
+    TensorDesc desc_process(TensorDesc inputDesc)
+    {
+        TensorDesc resultDesc;
+        if (tensorIs3d(inputDesc)) {
+            DataType idt;
+            DataFormat idf;
+            U32 in, ic, ih;
+            CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ic, &ih));
+            resultDesc = tensor4df(idt, idf, in, ic, ih, 1);
+        } else {
+            resultDesc = inputDesc;
+        }
+        return resultDesc;
+    }
+
+public:
+    U32 numChannels;
+
+    ConvolutionParamSpec p;
+    ActivationParamSpec dwActivationParamSpec;
+    ActivationParamSpec pwActivationParamSpec;
+
+    ConvolutionForwardAlgorithm pwAlg;
+    DepthwiseConvolutionForwardAlgorithm dwAlg;
+#ifdef _USE_FP16
+    std::shared_ptr<F16> scales;
+#endif
+};
+
+#endif // _CONVOLUTION_H
diff --git a/inference/engine/include/copy.hpp b/inference/engine/include/copy.hpp
new file mode 100644
index 00000000..60649b0e
--- /dev/null
+++ b/inference/engine/include/copy.hpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
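// [Editor's note] Convolution::desc_process() above normalizes 3-D inputs to 4-D by
// appending a unit width, so the convolution kernels only ever see 4-D descriptors.
// The same promotion spelled out as a standalone sketch (tensor3df/tensor4df and
// tensor3dGet are the framework's descriptor helpers; the shape values are illustrative):
//
//     TensorDesc in3d = tensor3df(DT_F16, DF_NCHW, 1, 64, 100);  // N=1, C=64, H=100
//     DataType dt; DataFormat df; U32 n, c, h;
//     CHECK_STATUS(tensor3dGet(in3d, &dt, &df, &n, &c, &h));
//     TensorDesc in4d = tensor4df(dt, df, n, c, h, 1);           // W fixed to 1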
+
+#ifndef _COPY_H
+#define _COPY_H
+
+#include "operator.hpp"
+
+class Copy : public Operator {
+public:
+    Copy(DataType dt, CopyParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Copy;
+    }
+
+protected:
+    CopyParamSpec p;
+};
+
+#endif // _COPY_H
diff --git a/inference/engine/include/cpu/activation_cpu.hpp b/inference/engine/include/cpu/activation_cpu.hpp
new file mode 100644
index 00000000..c598c142
--- /dev/null
+++ b/inference/engine/include/cpu/activation_cpu.hpp
@@ -0,0 +1,48 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _ACTIVATION_CPU_H
+#define _ACTIVATION_CPU_H
+
+#include "activation.hpp"
+
+class ActivationCPU : public Activation {
+public:
+    ActivationCPU(ActivationParamSpec activationDesc) : Activation(activationDesc)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new ActivationCPU(this->activationDesc));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(activation(inputTensor, this->activationDesc, outputTensor, &this->archInfo));
+        outputTensor.set_scale(inputTensor.get_scale());
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(activation_infer_output_size(inTensors[0], outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+};
+
+#endif // _ACTIVATION_CPU_H
diff --git a/inference/engine/include/cpu/argmax_cpu.hpp b/inference/engine/include/cpu/argmax_cpu.hpp
new file mode 100644
index 00000000..5c3b643b
--- /dev/null
+++ b/inference/engine/include/cpu/argmax_cpu.hpp
@@ -0,0 +1,48 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _ARGMAX_CPU_H
+#define _ARGMAX_CPU_H
+
+#include "argmax.hpp"
+
+class ArgMaxCPU : public ArgMax {
+public:
+    ArgMaxCPU(DataType dt, ArgMaxParamSpec p) : ArgMax(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new ArgMaxCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(argmax(inputTensor, this->p, this->temp, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(
+            argmax_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+};
+
+#endif // _ARGMAX_CPU_H
diff --git a/inference/engine/include/cpu/channel_resize_cpu.hpp b/inference/engine/include/cpu/channel_resize_cpu.hpp
new file mode 100644
index 00000000..cf5d6c5f
--- /dev/null
+++ b/inference/engine/include/cpu/channel_resize_cpu.hpp
@@ -0,0 +1,121 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
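// [Editor's note] In the ChannelResizeCPU implementation below, when p.group == 0 the
// operator derives its own target channel count: channels are rounded up to the next
// multiple of 8 so the tensor can be laid out as NCHWC8. The rounding it performs is
// equivalent to this standalone helper:
//
//     static inline int padChannelsTo8(int c)
//     {
//         return (c / 8 + ((c % 8 == 0) ? 0 : 1)) * 8;  // 3 -> 8, 8 -> 8, 17 -> 24
//     }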
+
+#ifndef _CHANNEL_RESIZE_CPU_H
+#define _CHANNEL_RESIZE_CPU_H
+
+#include "channel_resize.hpp"
+
+class ChannelResizeCPU : public ChannelResize {
+public:
+    ChannelResizeCPU(DataType dt, ChannelResizeParamSpec p) : ChannelResize(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new ChannelResizeCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        TensorDesc inputDesc = inputTensor.get_desc();
+        U32 inputSize = tensorNumBytes(inputDesc);
+        U8 *inputPtr = (U8 *)((CpuMemory *)(inputTensor.get_memory()))->get_ptr();
+        Tensor outputTensor = this->outputTensors[0];
+        TensorDesc outputDesc = outputTensor.get_desc();
+        U32 outputSize = tensorNumBytes(outputDesc);
+        U8 *outputPtr = (U8 *)((CpuMemory *)(outputTensor.get_memory()))->get_ptr();
+        // don't need to span or cut
+        if (!this->valid) {
+            if (inputPtr != outputPtr) {
+                CHECK_REQUIREMENT(inputSize == outputSize);
+                memcpy(outputPtr, inputPtr, inputSize);
+            }
+        } else if (this->rearrange && DF_NCHWC8 == inputDesc.df && DF_NCHWC8 == outputDesc.df) {
+            transformNCHWC8ToNCHWC8ByGroup(
+                inputDesc, inputPtr, this->p.group, outputDesc, outputPtr);
+        } else {
+            U32 batch = inputDesc.dims[inputDesc.nDims - 1];
+            U32 inputChannelGroupSize = this->p.channel_before / this->p.group;
+            U32 inputTileSize = inputSize / (batch * this->p.group);
+            U32 outputChannelGroupSize = this->p.channel_after / this->p.group;
+            U32 outputTileSize = outputSize / (batch * this->p.group);
+            int channelAxis = inputDesc.nDims - 2;
+            TensorDesc tmpInputDesc = inputDesc;
+            tmpInputDesc.dims[channelAxis] = inputChannelGroupSize;
+            TensorDesc tmpOutputDesc = outputDesc;
+            tmpOutputDesc.dims[channelAxis] = outputChannelGroupSize;
+            for (int g = 0; g < this->p.group; g++) {
+                if (this->p.channel_after > this->p.channel_before) {
+                    transformNCHWToNCHWC8(tmpInputDesc, inputPtr, tmpOutputDesc, outputPtr);
+                } else {
+                    transformToNCHW(tmpInputDesc, inputPtr, tmpOutputDesc, outputPtr);
+                }
+                inputPtr += inputTileSize;
+                outputPtr += outputTileSize;
+            }
+        }
+#ifdef _USE_INT8
+        outputTensor.set_scale(inputTensor.get_scale());
+#endif
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_REQUIREMENT(inTensors.size() > 0);
+        auto inDesc = inTensors[0]->get_desc();
+        CHECK_REQUIREMENT(inDesc.nDims > 0);
+        // don't need to span
+        if (this->p.channel_after > this->p.channel_before && inDesc.df == DF_NCHWC8) {
+            this->valid = false;
+        }
+        // don't need to cut
+        if (this->p.channel_after < this->p.channel_before && inDesc.df == DF_NCHW) {
+            this->valid = false;
+        }
+        if (!this->valid) {
+            outTensors[0]->resize(inDesc);
+            return SUCCESS;
+        }
+
+        int channelAxis = inDesc.nDims - 2;
+        // channel span or cut for OT_Resize
+        if (this->p.group == 0) {
+            this->p.group = 1;
+            this->p.channel_before = (int)inDesc.dims[channelAxis];
+            this->p.channel_after =
+                (this->p.channel_before / 8 + ((this->p.channel_before % 8 == 0) ? 0 : 1)) * 8;
+        } else {
+            CHECK_REQUIREMENT((int)inDesc.dims[channelAxis] == this->p.channel_before);
+        }
+
+        inDesc.dims[channelAxis] = this->p.channel_after;
+        DataFormat dataFormat;
+        if (this->p.channel_after > this->p.channel_before ||
+            (this->rearrange && this->p.channel_after % 8 == 0)) {
+            dataFormat = DF_NCHWC8;
+        } else {
+            dataFormat = DF_NCHW;
+        }
+        inDesc.df = dataFormat;
+        outTensors[0]->resize(inDesc);
+        return SUCCESS;
+    }
+};
+
+#endif // _CHANNEL_RESIZE_CPU_H
diff --git a/inference/engine/include/cpu/check_cpu.hpp b/inference/engine/include/cpu/check_cpu.hpp
new file mode 100644
index 00000000..464721f9
--- /dev/null
+++ b/inference/engine/include/cpu/check_cpu.hpp
@@ -0,0 +1,46 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _CHECK_CPU_H
+#define _CHECK_CPU_H
+
+#include "check.hpp"
+
+class CheckCPU : public Check {
+public:
+    CheckCPU(DataType dt, CheckParamSpec p) : Check(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem = std::shared_ptr<Operator>(new CheckCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputATensor = this->inputTensors[0];
+        Tensor inputBTensor = this->inputTensors[1];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(check(inputATensor, inputBTensor, this->p, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        return check_infer_output_size(inTensors, outTensors[0], &this->archInfo);
+    }
+};
+
+#endif // _CHECK_CPU_H
diff --git a/inference/engine/include/cpu/clip_cpu.hpp b/inference/engine/include/cpu/clip_cpu.hpp
new file mode 100644
index 00000000..4d656ee7
--- /dev/null
+++ b/inference/engine/include/cpu/clip_cpu.hpp
@@ -0,0 +1,46 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _CLIP_CPU_H
+#define _CLIP_CPU_H
+
+#include "clip.hpp"
+
+class ClipCPU : public Clip {
+public:
+    ClipCPU(DataType dt, ClipParamSpec p) : Clip(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem = std::shared_ptr<Operator>(new ClipCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(clip(inputTensor, this->p, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(clip_infer_output_size(inTensors[0], outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+};
+
+#endif // _CLIP_CPU_H
diff --git a/inference/engine/include/cpu/concat_cpu.hpp b/inference/engine/include/cpu/concat_cpu.hpp
new file mode 100644
index 00000000..b758c63d
--- /dev/null
+++ b/inference/engine/include/cpu/concat_cpu.hpp
@@ -0,0 +1,52 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
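// [Editor's note] ConcatCPU below is a compact example of the engine's operator
// contract: infer_output_tensors_size() shapes the outputs, infer_tmp_memory_size()
// reports the scratch bytes run() will need, and run() consumes this->temp. A hedged
// sketch of the calling sequence an engine might follow (set_tmp_memory is an assumed
// setter name, standing in for however the engine hands the scratch tensor over):
//
//     op->infer_output_tensors_size(inPtrs, outPtrs);
//     U32 bytes = op->infer_tmp_memory_size();
//     Tensor scratch;
//     scratch.resize(tensor1d(DT_U8, bytes));
//     scratch.alloc();
//     op->set_tmp_memory(scratch);   // assumed; the diff only shows this->temp being used
//     op->run();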
+
+#ifndef _CONCAT_CPU_H
+#define _CONCAT_CPU_H
+
+#include "concat.hpp"
+
+class ConcatCPU : public Concat {
+public:
+    ConcatCPU(ConcatParamSpec p) : Concat(p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem = std::shared_ptr<Operator>(new ConcatCPU(this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        CHECK_STATUS(
+            concat(this->inputTensors, this->p, this->temp, outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(concat_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(concat_infer_forward_tmp_bytes(this->inputTensors, &bytes, &this->archInfo));
+        return bytes;
+    }
+};
+
+#endif // _CONCAT_CPU_H
diff --git a/inference/engine/include/cpu/convolution_cpu.hpp b/inference/engine/include/cpu/convolution_cpu.hpp
new file mode 100644
index 00000000..f2a405c6
--- /dev/null
+++ b/inference/engine/include/cpu/convolution_cpu.hpp
@@ -0,0 +1,539 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _CONVELTWISEPOOLING_CPU_H
+#define _CONVELTWISEPOOLING_CPU_H
+
+#include "convolution.hpp"
+
+class ConvolutionCPU : public Convolution {
+public:
+    ConvolutionCPU(DataType dt,
+        ConvolutionParamSpec p,
+        ActivationParamSpec dwActivationParamSpec,
+        ActivationParamSpec pwActivationParamSpec)
+        : Convolution(dt, p, dwActivationParamSpec, pwActivationParamSpec)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem = std::shared_ptr<Operator>(new ConvolutionCPU(
+            this->dt, this->p, this->dwActivationParamSpec, this->pwActivationParamSpec));
+        *mem = *this;
+        return mem;
+    }
+
+    EE init_weight_bias_from_model(std::shared_ptr<U8> *modelPtrShared) override
+    {
+        U8 *modelPtr = nullptr;
+        if (modelPtrShared != nullptr) {
+            modelPtr = (*modelPtrShared).get();
+        }
+        auto curOpWs = this->get_weightspec();
+        DataType filterDt = curOpWs.mdt;  // weight data type may not be the same as input and output
+        if (modelPtr != nullptr) {
+            filterDt = this->dt;
+        }
+        DataType dtNoQ = (this->dt == DT_F16_8Q) ? DT_F16 : this->dt;
+        U32 isBNN = 0;
+        if (filterDt == DT_BIN01 || filterDt == DT_BIN11) {
+            isBNN = 1;
+        }
+
+        int weight_num = 1;
+        std::vector<TensorDesc> weight_desc(2), bias_desc(2);
+        switch (this->p.convolution_type) {
+            case Convolution_Pointwise: {
+                U32 vectorLen = this->p.num_outputs;  // bias length
+                if (isBNN == 1) {
+                    this->dt = dtNoQ;  // BNN convolution should not be quantized further
+                    vectorLen *= 2;  // Scale has the same vector length as bias, so double the length
+                }
+                weight_desc[0] = tensor4df(filterDt, DF_NCHW, this->p.num_outputs,
+                    this->numChannels, this->p.kernel_h, this->p.kernel_w);
+                bias_desc[0] = tensor1d(dtNoQ, vectorLen);
+                break;
+            }
+            case Convolution_Depthwise: {
+                weight_desc[0] = tensor4df(
+                    filterDt, DF_NCHW, 1, this->p.num_outputs, this->p.kernel_h, this->p.kernel_w);
+                bias_desc[0] = tensor1d(dtNoQ, this->p.num_outputs);
+                break;
+            }
+            case Convolution_Depthwise_Pointwise: {
+                weight_desc[0] = tensor4df(
+                    filterDt, DF_NCHW, 1, this->numChannels, this->p.kernel_h, this->p.kernel_w);
+                bias_desc[0] = tensor1d(dtNoQ, this->numChannels);
+                weight_desc[1] =
+                    tensor4df(filterDt, DF_NCHW, this->p.num_outputs, this->numChannels, 1, 1);
+                bias_desc[1] = tensor1d(dtNoQ, this->p.num_outputs);
+                weight_num = 2;
+                break;
+            }
+            case Convolution_Dilation: {
+                weight_desc[0] = tensor4df(filterDt, DF_NCHW, this->p.num_outputs,
+                    this->numChannels, this->p.kernel_h, this->p.kernel_w);
+                bias_desc[0] = tensor1d(dtNoQ, this->p.num_outputs);
+                break;
+            }
+            default:
+                return NOT_SUPPORTED;
+        }
+
+        std::shared_ptr<U8> weight_ptr(curOpWs.weight);
+        std::shared_ptr<U8> bias_ptr(curOpWs.vec);
+        U32 weight_offset = 0;
+        U32 bias_offset = 0;
+        for (int j = 0; j < weight_num; j++) {
+            Tensor weight_tensor, bias_tensor;
+            weight_tensor.resize(weight_desc[j]);
+            bias_tensor.resize(bias_desc[j]);
+            U32 weight_bytes = weight_tensor.bytes();
+            U32 bias_bytes = bias_tensor.bytes();
+            U32 offset_bytes = 0;
+            if (modelPtr != nullptr) {
+                weight_tensor.alloc();
+                memcpy(
+                    ((CpuMemory *)(weight_tensor.get_memory()))->get_ptr(), modelPtr, weight_bytes);
+                offset_bytes += weight_bytes;
+                if (this->hasBias) {
+                    bias_tensor.alloc();
+                    memcpy(((CpuMemory *)(bias_tensor.get_memory()))->get_ptr(),
+                        modelPtr + offset_bytes, bias_bytes);
+                    offset_bytes += bias_bytes;
+                }
+                *modelPtrShared = std::shared_ptr<U8>(*modelPtrShared, modelPtr + offset_bytes);
+            } else {
+                ((CpuMemory *)(weight_tensor.get_memory()))
+                    ->set_shared_ptr(
+                        std::shared_ptr<U8>(weight_ptr, weight_ptr.get() + weight_offset));
+                weight_offset += weight_bytes;
+                if (this->hasBias) {
+                    ((CpuMemory *)(bias_tensor.get_memory()))
+                        ->set_shared_ptr(
+                            std::shared_ptr<U8>(bias_ptr, bias_ptr.get() + bias_offset));
+                    bias_offset += bias_bytes;
+                }
+            }
+            if (!this->hasBias) {
+                bias_tensor.alloc();
+                if (isBNN == 1) {
+#ifdef _USE_FP16
+                    U8 *ptr = (U8 *)((CpuMemory *)(bias_tensor.get_memory()))->get_ptr();
+                    UNI_init(p.num_outputs, DT_F16, 1.0, ptr);
+                    ptr += bias_bytes / 2;
+                    memset(ptr, 0, bias_bytes / 2);  // second half is bias
+#endif
+                } else {
+                    memset(((CpuMemory *)(bias_tensor.get_memory()))->get_ptr(), 0, bias_bytes);
+                }
+            }
+            this->weightTensors.push_back(weight_tensor);
+            this->biasTensors.push_back(bias_tensor);
+        }
+        return SUCCESS;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor filterTensor = this->weightTensors[0];
+        U8 *scalePtr = nullptr;
+        Tensor biasTensor = this->biasTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+
+        switch (this->p.convolution_type) {
+            case Convolution_Pointwise: {
+                if (DT_F16_8Q == this->dt) {
+#ifdef _USE_INT8
+                    F16 *ptr = this->scales.get();
+                    scalePtr = (U8 *)ptr;
+                    auto inputDesc = inputTensor.get_desc();
+
+                    ptr[0] = inputTensor.get_scale();
+                    if (featureScale.size() > 0 && featureScale[0][0] > 0) {
+                        ptr[0] = featureScale[0][0];
+                    } else if (DT_F16 == inputDesc.dt) {
+                        ptr[0] = -1;
+                    }
+
+                    if (featureScale.size() > 0 && (featureScale.back())[0] != -2) {
+                        ptr[1] = (featureScale.back())[0];
+                    } else {
+                        ptr[1] = -1;
+                    }
+#endif
+                }
+                CHECK_STATUS(
+                    convolution(inputTensor, filterTensor, p, this->pwAlg, scalePtr, biasTensor,
+                        this->temp, outputTensor, this->pwActivationParamSpec, &this->archInfo));
+#ifdef _USE_INT8
+                auto outputDesc = outputTensor.get_desc();
+                if (DT_I8 == outputDesc.dt) {
+                    F16 *ptr = (F16 *)scalePtr;
+                    outputTensor.set_scale(ptr[1]);
+                }
+#endif
+                break;
+            }
+            case Convolution_Depthwise: {
+                CHECK_STATUS(
+                    depthwise_convolution(inputTensor, filterTensor, p, this->dwAlg, biasTensor,
+                        this->temp, outputTensor, this->dwActivationParamSpec, &this->archInfo));
+                break;
+            }
+            case Convolution_Depthwise_Pointwise: {
+                CHECK_STATUS(
+                    depthwise_pointwise_convolution(inputTensor, filterTensor, weightTensors[1], p,
+                        this->dwAlg, biasTensor, biasTensors[1], this->temp, outputTensor,
+                        this->dwActivationParamSpec, this->pwActivationParamSpec, &this->archInfo));
+                break;
+            }
+            case Convolution_Dilation: {
+                CHECK_STATUS(
+                    convolution(inputTensor, filterTensor, p, this->pwAlg, scalePtr, biasTensor,
+                        this->temp, outputTensor, this->pwActivationParamSpec, &this->archInfo));
+                break;
+            }
+            default: {
+                UNI_ERROR_LOG("unsupported convolution type %d\n", this->p.convolution_type);
+            }
+        }
+    }
+
+    EE infer_forward_algorithm(std::shared_ptr<AlgorithmMap> algorithmMap) override
+    {
+        auto inputTensor = this->inputTensors[0];
+        auto filterTensor = this->weightTensors[0];
+        auto outputTensor = this->outputTensors[0];
+        TensorDesc inputDesc = this->desc_process(inputTensor.get_desc());
+        inputTensor.resize(inputDesc);
+        TensorDesc filterDesc = filterTensor.get_desc();
+
+        ConvolutionPolicy policy = CONVOLUTION_FASTEST;
+        DataType targetType = filterDesc.dt;
+        I32 algo;
+        switch (this->p.convolution_type) {
+            case Convolution_Pointwise: {
+                if (this->dt == DT_F16_8Q) {
+                    targetType = DT_I8;
+                }
+                if (algorithmMap->getAlgorithmInfoFromMap(this->name, &algo, 1)) {
+                    this->pwAlg = (ConvolutionForwardAlgorithm)algo;
+                } else if (algorithmMap->getCommonAlgoInfoFromMap(OT_Conv, this->dt,
+                    inputDesc.dims[2], inputDesc.dims[1], inputDesc.dims[0],
+                    filterDesc.dims[3], filterDesc.dims[1], filterDesc.dims[0],
+                    this->p.stride_h, this->p.stride_w, &algo, 1)) {
+                    this->pwAlg = (ConvolutionForwardAlgorithm)algo;
+                } else {
+                    CHECK_STATUS(convolution_infer_forward_algorithm(inputTensor, filterTensor,
+                        outputTensor, p, policy, &(this->pwAlg), targetType,
+                        this->pwActivationParamSpec, &this->archInfo));
+                    algo = this->pwAlg;
+                    algorithmMap->setAlgorithmInfoToMap(this->name, &algo, 1);
+                }
+                break;
+            }
+            case Convolution_Depthwise: {
+                if (algorithmMap->getAlgorithmInfoFromMap(this->name, &algo, 1)) {
+                    this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo;
+                } else {
+                    CHECK_STATUS(depthwise_convolution_infer_forward_algorithm(inputTensor,
+                        filterTensor, outputTensor, p, policy, &(this->dwAlg), targetType,
+                        this->dwActivationParamSpec, &this->archInfo));
+                    algo = this->dwAlg;
+                    algorithmMap->setAlgorithmInfoToMap(this->name, &algo, 1);
+                }
+                break;
+            }
+            case Convolution_Depthwise_Pointwise: {
+                if (algorithmMap->getAlgorithmInfoFromMap(this->name, &algo, 1)) {
+                    this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo;
+                } else {
+                    CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_algorithm(
+                        inputTensor, filterTensor, this->weightTensors[1], outputTensor, p, policy,
+                        &(this->dwAlg), targetType, this->dwActivationParamSpec,
+                        this->pwActivationParamSpec, &this->archInfo));
+                    algo = this->dwAlg;
+                    algorithmMap->setAlgorithmInfoToMap(this->name, &algo, 1);
+                }
+                break;
+            }
+            case Convolution_Dilation: {
+                if (algorithmMap->getAlgorithmInfoFromMap(this->name, &algo, 1)) {
+                    this->pwAlg = (ConvolutionForwardAlgorithm)algo;
+                } else {
+                    CHECK_STATUS(convolution_infer_forward_algorithm(inputTensor, filterTensor,
+                        outputTensor, p, policy, &(this->pwAlg), targetType,
+                        this->pwActivationParamSpec, &this->archInfo));
+                    algo = this->pwAlg;
+                    algorithmMap->setAlgorithmInfoToMap(this->name, &algo, 1);
+                }
+                break;
+            }
+            default:
+                CHECK_STATUS(NOT_SUPPORTED);
+        }
+        return SUCCESS;
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        TensorDesc inDim = this->desc_process(inTensors[0]->get_desc());
+        Tensor tmpTensor;
+        tmpTensor.resize(inDim);
+        auto inputTensor = &tmpTensor;
+        auto outputTensor = outTensors[0];
+
+        DataType idt;
+        DataFormat idf;
+        U32 in, ic, ih, iw;
+        CHECK_STATUS(tensor4dGet(inDim, &idt, &idf, &in, &ic, &ih, &iw));
+        if (DF_NCHW == idf && DT_F16_8Q == this->dt && DT_F16 == idt) {
+            this->dt = DT_F16;
+        }
+        this->numChannels = ic;
+        if (this->p.convolution_type == Convolution_Dilation ||
+            this->p.convolution_type == Convolution_Pointwise) {
+            this->numChannels /= this->p.group;
+        }
+
+        Tensor filterTensor;
+        TensorDesc filterDim = tensor4df(this->dt, DF_NCHW, this->p.num_outputs, this->numChannels,
+            this->p.kernel_h, this->p.kernel_w);
+        filterTensor.resize(filterDim);
+
+        DataType targetType = this->dt;
+        if (DT_F16_8Q == this->dt && Convolution_Pointwise == this->p.convolution_type) {
+            targetType = DT_I8;
+        }
+
+        switch (this->p.convolution_type) {
+            case Convolution_Pointwise: {
+                CHECK_STATUS(convolution_infer_output_size(
+                    inputTensor, filterTensor, p, outputTensor, targetType, &this->archInfo));
+                break;
+            }
+            case Convolution_Depthwise: {
+                filterDim.dims[3] = 1;
+                CHECK_STATUS(depthwise_convolution_infer_output_size(
+                    inputTensor, filterTensor, p, outputTensor, targetType, &this->archInfo));
+                break;
+            }
+            case Convolution_Depthwise_Pointwise: {
+                TensorDesc dwFilterDesc = tensor4df(
+                    this->dt, DF_NCHW, 1, this->numChannels, this->p.kernel_h, this->p.kernel_w);
+                TensorDesc pwFilterDesc =
+                    tensor4df(this->dt, DF_NCHW, this->p.num_outputs, this->numChannels, 1, 1);
+                Tensor dwFilterTensor;
+                Tensor pwFilterTensor;
+                dwFilterTensor.resize(dwFilterDesc);
+                pwFilterTensor.resize(pwFilterDesc);
+                CHECK_STATUS(depthwise_pointwise_convolution_infer_output_size(inputTensor,
+                    dwFilterTensor, pwFilterTensor, p, outputTensor, targetType, &this->archInfo));
+                break;
+            }
+            case Convolution_Dilation: {
+                CHECK_STATUS(convolution_infer_output_size(
+                    inputTensor, filterTensor, p, outputTensor, targetType, &this->archInfo));
+                break;
+            }
+            default:
+                CHECK_STATUS(NOT_SUPPORTED);
+        }
+        if (DT_F16_8Q == this->dt && featureScale.size() > 0 && -2 == (featureScale.back())[0]) {
+            TensorDesc outputDesc = outputTensor->get_desc();
+            outputDesc.dt = DT_F16;
+            outputTensor->resize(outputDesc);
+        }
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        auto inputTensor = this->inputTensors[0];
+        TensorDesc inDim = this->desc_process(inputTensor.get_desc());
+        inputTensor.resize(inDim);
+        auto filterTensor = this->weightTensors[0];
+        TensorDesc filterDesc = filterTensor.get_desc();
+        if (DT_F16_8Q == filterDesc.dt) {
+            filterDesc.dt = DT_I8;
+            filterTensor.resize(filterDesc);
+        }
+        auto outputTensor = this->outputTensors[0];
+
+        U32 bytes = 0;
+        switch (this->p.convolution_type) {
+            case Convolution_Pointwise: {
+                CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputTensor, filterTensor,
+                    outputTensor, p, this->pwAlg, &bytes, &this->archInfo));
+                break;
+            }
+            case Convolution_Depthwise: {
+                CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes(inputTensor,
+                    filterTensor, outputTensor, p, this->dwAlg, &bytes, &this->archInfo));
+                break;
+            }
+            case Convolution_Depthwise_Pointwise: {
+                CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_tmp_bytes(inputTensor,
+                    filterTensor, this->weightTensors[1], outputTensor, p, this->dwAlg, &bytes,
+                    &this->archInfo));
+                break;
+            }
+            case Convolution_Dilation: {
+                CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputTensor, filterTensor,
+                    outputTensor, p, this->pwAlg, &bytes, &this->archInfo));
+                break;
+            }
+            default:
+                CHECK_STATUS(NOT_SUPPORTED);
+        }
+        return bytes;
+    }
+
+    U32 infer_filter_transform_bytes(U32 *bytesExtra)
+    {
+        auto filterTensor = this->weightTensors[0];
+        U32 bytes = 0;
+        switch (this->p.convolution_type) {
+            case Convolution_Pointwise: {
+                CHECK_STATUS(convolution_transform_filter_bytes(
+                    filterTensor, this->p, this->pwAlg, &bytes, &this->archInfo));
+                break;
+            }
+            case Convolution_Depthwise: {
+                CHECK_STATUS(depthwise_convolution_transform_filter_bytes(
+                    filterTensor, this->p, this->dwAlg, &bytes, &this->archInfo));
+                break;
+            }
+            case Convolution_Depthwise_Pointwise: {
+                CHECK_STATUS(depthwise_pointwise_convolution_transform_filter_bytes(filterTensor,
+                    weightTensors[1], this->p, this->dwAlg, &bytes, bytesExtra, &this->archInfo));
+                break;
+            }
+            case Convolution_Dilation: {
+                CHECK_STATUS(convolution_transform_filter_bytes(
+                    filterTensor, this->p, this->pwAlg, &bytes, &this->archInfo));
+                break;
+            }
+            default:
+                CHECK_STATUS(NOT_SUPPORTED);
+        }
+        return bytes;
+    }
+
+    EE transform_filter() override
+    {
+        Tensor filterTensor = this->weightTensors[0];
+        this->wtm = std::shared_ptr<Tensor>(new Tensor());
+
+        TensorDesc wtmDesc;
+        if (DT_F16_8Q == this->dt && Convolution_Pointwise == this->p.convolution_type &&
+            CONVOLUTION_ALGORITHM_WINOGRAD == this->pwAlg) {  // int8 winograd
#ifdef _USE_INT8
+            U32 ftBytes;
+            CHECK_STATUS(convolution_transform_filter_bytes(
+                filterTensor, this->p, this->pwAlg, &ftBytes, &this->archInfo));
+
+            Tensor tFilter;
+            tFilter.resize(tensor1d(DT_U8, ftBytes));
+            tFilter.alloc();
+
+            // To label as int8
+            TensorDesc filterDesc = filterTensor.get_desc();
+            filterDesc.dt = DT_F16_8Q;
+
+            filterTensor.resize(filterDesc);
+            CHECK_STATUS(convolution_transform_filter(
+                filterTensor, this->p, this->pwAlg, this->temp, &tFilter, &this->archInfo));
+
+            U32 ftmBytes = ftBytes / bytesOf(DT_F16);
+            wtm->resize(tensor1d(DT_U8, ftmBytes));
+            wtm->alloc();
+
+            std::shared_ptr<F16> fsp((F16 *)operator new(38 * bytesOf(DT_F16)));
+            this->scales = fsp;
+            TensorDesc wtmDesc;
+            CHECK_STATUS(quantize_tensor(tFilter.get_desc(),
+                ((CpuMemory *)(tFilter.get_memory()))->get_ptr(), &wtmDesc,
+                ((CpuMemory *)(wtm->get_memory()))->get_ptr(), this->scales.get() + 2));
+            wtm->resize(wtmDesc);
+        } else if (DT_F16_8Q == this->dt &&
+            Convolution_Pointwise == this->p.convolution_type) {  // int8 tilegemm
+            Tensor qFilterTensor;
+            TensorDesc qDesc = filterTensor.get_desc();
+            qDesc.dt = DT_I8;
+            qFilterTensor.resize(qDesc);
+            qFilterTensor.alloc();
+            std::shared_ptr<F16> fsp((F16 *)operator new(3 * bytesOf(DT_F16)));
+            this->scales = fsp;
+            this->scales.get()[2] = -1;
+            CHECK_STATUS(quantize_tensor(filterTensor.get_desc(),
+                ((CpuMemory *)(filterTensor.get_memory()))->get_ptr(), &qDesc,
+                ((CpuMemory *)(qFilterTensor.get_memory()))->get_ptr(), this->scales.get() + 2));
+
+            U32 ftmBytes;
+            qFilterTensor.resize(qDesc);
+            CHECK_STATUS(convolution_transform_filter_bytes(
+                qFilterTensor, this->p, this->pwAlg, &ftmBytes, &this->archInfo));
+
+            wtm->resize(tensor1d(DT_U8, ftmBytes));
+            wtm->alloc();
+
+            // trans filter
+            CHECK_STATUS(convolution_transform_filter(
+                qFilterTensor, this->p, this->pwAlg, this->temp, this->wtm.get(), &this->archInfo));
#endif
+        } else {  // All other cases
+            U32 bytesExtra;
+            auto wtmBytes = this->infer_filter_transform_bytes(&bytesExtra);
+            wtm->resize(tensor1d(DT_U8, wtmBytes));
+            wtm->alloc();
+
+            switch (this->p.convolution_type) {
+                case Convolution_Pointwise: {
+                    CHECK_STATUS(convolution_transform_filter(filterTensor, this->p, this->pwAlg,
+                        this->temp, this->wtm.get(), &this->archInfo));
+                    break;
+                }
+                case Convolution_Depthwise: {
+                    CHECK_STATUS(depthwise_convolution_transform_filter(
+                        filterTensor, this->p, this->dwAlg, this->wtm.get(), &this->archInfo));
+                    break;
+                }
+                case Convolution_Depthwise_Pointwise: {
+                    Tensor pwTensor;
+                    pwTensor.resize(tensor1d(DT_U8, bytesExtra));
+                    pwTensor.alloc();
+                    CHECK_STATUS(depthwise_pointwise_convolution_transform_filter(filterTensor,
+                        weightTensors[1], this->p, this->dwAlg, this->wtm.get(), &pwTensor,
+                        &this->archInfo));
+                    weightTensors[1] = pwTensor;
+                    break;
+                }
+                case Convolution_Dilation: {
+                    CHECK_STATUS(convolution_transform_filter(filterTensor, this->p, this->pwAlg,
+                        this->temp, this->wtm.get(), &this->archInfo));
+                    break;
+                }
+                default:
+                    CHECK_STATUS(NOT_SUPPORTED);
+            }
+        }
+        this->weightTensors[0] = *this->get_wtm();
+        return SUCCESS;
+    }
+};
+
+#endif // _CONVELTWISEPOOLING_H
diff --git a/inference/engine/include/cpu/copy_cpu.hpp b/inference/engine/include/cpu/copy_cpu.hpp
new file mode 100644
index 00000000..83f482c3
--- /dev/null
+++ b/inference/engine/include/cpu/copy_cpu.hpp
@@ -0,0 +1,83 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
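// [Editor's note] CopyCPU::run() below computes one flat element offset per batch item:
// index = i * batchStride + blockIndex * stride + dims[2], where blockIndex can come
// from an optional index tensor (inputs 2 and 3). Concretely, with illustrative values:
//
//     U32 batchStride = 128, stride = 16, offset = 4;
//     U32 i = 2, blockIndex = 3;   // batch item, block read from the index tensor
//     U32 index = i * batchStride + blockIndex * stride + offset;  // 256 + 48 + 4 = 308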
diff --git a/inference/engine/include/cpu/deconvolution_cpu.hpp b/inference/engine/include/cpu/deconvolution_cpu.hpp
new file mode 100644
index 00000000..1fc71357
--- /dev/null
+++ b/inference/engine/include/cpu/deconvolution_cpu.hpp
@@ -0,0 +1,150 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _DECONVOLUTION_CPU_H
+#define _DECONVOLUTION_CPU_H
+
+#include "deconvolution.hpp"
+
+class DeconvolutionCPU : public Deconvolution {
+public:
+    DeconvolutionCPU(DataType dt, ConvolutionParamSpec p, ActivationParamSpec activationDesc)
+        : Deconvolution(dt, p, activationDesc)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<DeconvolutionCPU> mem = std::shared_ptr<DeconvolutionCPU>(
+            new DeconvolutionCPU(this->dt, this->p, this->activationDesc));
+        *mem = *this;
+        return mem;
+    }
+
+    EE infer_weight_desc() override
+    {
+        auto curOpWs = this->get_weightspec();
+        DataType filterDt = curOpWs.mdt;  // weight data type may not be the same as input and output
+        if (curOpWs.weight == nullptr) {
+            filterDt = this->dt;
+        }
+        DataType dtNoQ = (this->dt == DT_F16_8Q) ? DT_F16 : this->dt;
+        CHECK_REQUIREMENT(filterDt != DT_BIN01 && filterDt != DT_BIN11);
+        DataFormat filterDf = DF_NCHW;
+        TensorDesc filterTensorDesc = tensor4df(filterDt, filterDf, this->numInputs,
+            this->p.num_outputs, this->p.kernel_h, this->p.kernel_w);
+        // bias length
+        U32 vectorLen = this->numInputs * this->p.group;
+        // bias data type should be the same as input and output
+        TensorDesc vectorTensorDesc = tensor1d(dtNoQ, vectorLen);
+
+        this->weightTensors = std::vector<Tensor>(1);
+        this->weightTensors[0].resize(filterTensorDesc);
+        this->biasTensors = std::vector<Tensor>(1);
+        this->biasTensors[0].resize(vectorTensorDesc);
+        return SUCCESS;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor filterTensor = this->weightTensors[0];
+        U8 *scalePtr = nullptr;
+        Tensor biasTensor = this->biasTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        auto filterDesc = filterTensor.get_desc();
+        if (filterDesc.dt == DT_BIN01 || filterDesc.dt == DT_BIN11) {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+        CHECK_STATUS(deconvolution(inputTensor, filterTensor, p, this->alg, scalePtr, biasTensor,
+            this->temp, outputTensor, this->activationDesc, &this->archInfo));
+    }
+
+    EE infer_forward_algorithm(std::shared_ptr<AlgorithmMap> algorithmMap) override
+    {
+        ConvolutionPolicy policy = CONVOLUTION_FASTEST;
+        auto filterDesc = this->weightTensors[0].get_desc();
+        DataType targetType = filterDesc.dt;
+        I32 algo;
+        if (algorithmMap->getAlgorithmInfoFromMap(this->name, &algo, 1)) {
+            this->alg = (ConvolutionForwardAlgorithm)algo;
+        } else {
+            CHECK_STATUS(deconvolution_infer_forward_algorithm(this->inputTensors[0],
+                this->weightTensors[0], this->outputTensors[0], p, policy, &(this->alg), targetType,
+                this->activationDesc, &this->archInfo));
+            algo = this->alg;
+            algorithmMap->setAlgorithmInfoToMap(this->name, &algo, 1);
+        }
+        return SUCCESS;
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        auto inputTensor = inTensors[0];
+        TensorDesc inDim = inputTensor->get_desc();
+        DataType idt;
+        DataFormat idf;
+        U32 in, ic, ih, iw;
+        CHECK_STATUS(tensor4dGet(inDim, &idt, &idf, &in, &ic, &ih, &iw));
+        this->numInputs = ic / this->p.group;
+
+        Tensor filterTensor;
+        TensorDesc filterDim = tensor4df(this->dt, DF_NCHW, this->numInputs, this->p.num_outputs,
+            this->p.kernel_h, this->p.kernel_w);
+        filterTensor.resize(filterDim);
+
+        this->p = createConvolutionParamSpec(this->p.group, this->p.kernel_h, this->p.kernel_w,
+            this->p.stride_h, this->p.stride_w, this->p.padding_top, this->p.padding_bottom,
+            this->p.padding_left, this->p.padding_right, this->p.dilatedRate_h,
+            this->p.dilatedRate_w, this->p.num_outputs, this->p.convolution_type);
+
+        DataType targetType = this->dt;
+        if (DT_F16_8Q == this->dt) {
+            targetType = DT_I8;
+        }
+
+        CHECK_STATUS(deconvolution_infer_output_size(
+            inputTensor, filterTensor, p, outTensors[0], targetType, &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(deconvolution_infer_forward_tmp_bytes(this->inputTensors[0],
+            this->weightTensors[0], this->outputTensors[0], p, this->alg, &bytes, &this->archInfo));
+        return bytes;
+    }
+
+    U32 infer_wtm_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(deconvolution_transform_filter_bytes(
+            this->weightTensors[0], this->p, this->alg, &bytes, &this->archInfo));
+        return bytes;
+    }
+
+    EE transform_filter() override
+    {
+        this->wtm = std::shared_ptr<Tensor>(new Tensor());
+        Tensor filterTensor = this->weightTensors[0];
+        auto wtmBytes = this->infer_wtm_memory_size();
+        Tensor wtm = Tensor::alloc_sized<CPUMem>(tensor1d(DT_U8, wtmBytes));
+        CHECK_STATUS(deconvolution_transform_filter(
+            filterTensor, this->p, this->alg, this->temp, &wtm, &this->archInfo));
+        this->weightTensors[0] = wtm;
+        return SUCCESS;
+    }
+};
+
+#endif  // _DECONVOLUTION_CPU_H
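infer_forward_algorithm() above implements a benchmark-once cache: the winning algorithm id is stored under the operator's name and reused on later runs. An illustrative standalone version of the same pattern (not Bolt's AlgorithmMap):

```cpp
#include <map>
#include <string>

struct AlgoCache {
    std::map<std::string, int> chosen;

    // Return the cached algorithm id, or run `search()` once and remember it.
    template <typename SearchFn>
    int get_or_search(const std::string &opName, SearchFn search)
    {
        auto it = chosen.find(opName);
        if (it != chosen.end()) {
            return it->second;
        }
        int algo = search();  // e.g. time each candidate and keep the fastest
        chosen[opName] = algo;
        return algo;
    }
};
```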
diff --git a/inference/engine/include/cpu/eltwise_cpu.hpp b/inference/engine/include/cpu/eltwise_cpu.hpp
new file mode 100644
index 00000000..e538eea1
--- /dev/null
+++ b/inference/engine/include/cpu/eltwise_cpu.hpp
@@ -0,0 +1,81 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _ELTWISE_CPU_H
+#define _ELTWISE_CPU_H
+
+#include "eltwise.hpp"
+
+class EltwiseCPU : public Eltwise {
+public:
+    EltwiseCPU(EltwiseParamSpec eltwiseDesc) : Eltwise(eltwiseDesc)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<EltwiseCPU> mem =
+            std::shared_ptr<EltwiseCPU>(new EltwiseCPU(this->eltwiseDesc));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        std::vector<TensorDesc> inputDesc;
+        for (auto p : this->inputTensors) {
+            inputDesc.push_back(p.get_desc());
+        }
+        if (this->eltwiseDesc.elt_mode == ELTWISE_PROD && inputDesc.size() == 2 &&
+            (inputDesc[1].nDims == 2 || (inputDesc[1].nDims == 3 && inputDesc[1].dims[0] == 1) ||
+                (inputDesc[1].nDims == 4 && inputDesc[1].dims[0] == 1 &&
+                    inputDesc[1].dims[1] == 1)) &&
+            tensorNumElements(inputDesc[0]) != tensorNumElements(inputDesc[1])) {
+            Tensor inTensor = this->inputTensors[1];
+            U8 *alpha = (U8 *)((CpuMemory *)(inTensor.get_memory()))->get_ptr();
+            ScaleParamSpec scaleParam;
+            scaleParam.axis = 1;
+            CHECK_STATUS(scale(this->inputTensors[0], alpha, nullptr, scaleParam,
+                this->outputTensors[0], &this->archInfo));
+        } else {
+            CHECK_STATUS(eltwise(this->inputTensors, this->eltwiseDesc, this->temp,
+                this->outputTensors[0], &this->archInfo));
+        }
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        std::vector<TensorDesc> inDims;
+        for (auto p : inTensors) {
+            inDims.push_back(p->get_desc());
+        }
+        if (this->eltwiseDesc.elt_mode == ELTWISE_PROD && inDims.size() == 2 &&
+            (inDims[1].nDims == 2 || (inDims[1].nDims == 3 && inDims[1].dims[0] == 1) ||
+                (inDims[1].nDims == 4 && inDims[1].dims[0] == 1 && inDims[1].dims[1] == 1)) &&
+            tensorNumElements(inDims[0]) != tensorNumElements(inDims[1])) {
+            CHECK_STATUS(scale_infer_output_size(inTensors[0], outTensors[0], &this->archInfo));
+        } else {
+            CHECK_STATUS(eltwise_infer_output_size(inTensors, outTensors[0], &this->archInfo));
+        }
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(eltwise_infer_forward_tmp_bytes(
+            this->inputTensors, this->outputTensors[0], &bytes, &this->archInfo));
+        return UNI_MAX(bytes, this->lenOfTemp);
+    }
+};
+
+#endif  // _ELTWISE_CPU_H
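The long condition duplicated in run() and infer_output_tensors_size() above can be read as one predicate: "the second PROD operand is effectively a per-channel vector, so the multiply is cheaper as a Scale". A hypothetical standalone check mirroring it (Bolt stores dims innermost-first, i.e. dims[0] is W):

```cpp
#include <cstdint>

struct Desc {
    uint32_t nDims;
    uint32_t dims[6];   // dims[0] is the innermost dimension
    uint64_t numElems;
};

static bool prod_is_channel_scale(const Desc &a, const Desc &b)
{
    bool channelVector = (b.nDims == 2) ||
        (b.nDims == 3 && b.dims[0] == 1) ||
        (b.nDims == 4 && b.dims[0] == 1 && b.dims[1] == 1);
    return channelVector && a.numElems != b.numElems;
}
```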
diff --git a/inference/engine/include/cpu/embedding_cpu.hpp b/inference/engine/include/cpu/embedding_cpu.hpp
new file mode 100644
index 00000000..9caa733f
--- /dev/null
+++ b/inference/engine/include/cpu/embedding_cpu.hpp
@@ -0,0 +1,87 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _EMBEDDING_CPU_H
+#define _EMBEDDING_CPU_H
+
+#include "embedding.hpp"
+
+class EmbeddingCPU : public Embedding {
+public:
+    EmbeddingCPU(DataType dt, EmbedParamSpec p) : Embedding(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<EmbeddingCPU> mem =
+            std::shared_ptr<EmbeddingCPU>(new EmbeddingCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor weightTensor =
+            (this->weightTensors.size()) ? this->weightTensors[0] : this->inputTensors[1];
+        CHECK_STATUS(embedding(
+            this->inputTensors[0], weightTensor, this->p, this->outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(embedding_infer_output_size(
+            inTensors[0], this->p, this->dt, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    EE init_weight_bias_from_model(std::shared_ptr<U8> *modelPtrShared) override
+    {
+        U8 *modelPtr = nullptr;
+        if (modelPtrShared != nullptr) {
+            modelPtr = (*modelPtrShared).get();
+        }
+        TensorDesc weightDesc;
+        if (this->p.transpose) {
+            weightDesc = tensor2df(this->dt, DF_TRANSPOSE, this->p.num_output, this->p.input_dim);
+        } else {
+            weightDesc = tensor2df(this->dt, DF_NORMAL, this->p.input_dim, this->p.num_output);
+        }
+        U32 weightBytes = tensorNumBytes(weightDesc);
+
+        std::shared_ptr<Tensor> modelWeightTensor(new Tensor());
+        modelWeightTensor->resize(weightDesc);
+
+        bool set_ptr = false;
+        if (modelPtr != nullptr) {
+            modelWeightTensor->alloc();
+            memcpy(
+                ((CpuMemory *)(modelWeightTensor->get_memory()))->get_ptr(), modelPtr, weightBytes);
+            *modelPtrShared = std::shared_ptr<U8>(*modelPtrShared, modelPtr + weightBytes);
+            set_ptr = true;
+        } else {
+            auto curOpWs = this->get_weightspec();
+            if (curOpWs.weight != nullptr) {
+                ((CpuMemory *)(modelWeightTensor->get_memory()))
+                    ->set_shared_ptr(std::shared_ptr<U8>(curOpWs.weight));
+                set_ptr = true;
+            }
+        }
+        if (set_ptr) {
+            this->weightTensors.push_back(*modelWeightTensor.get());
+        }
+        return SUCCESS;
+    }
+};
+
+#endif  // _EMBEDDING_CPU_H
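init_weight_bias_from_model() advances the shared model pointer with the shared_ptr aliasing constructor: the new pointer shares ownership with the original buffer but points weightBytes further in. A minimal illustration of that constructor:

```cpp
#include <cstdint>
#include <memory>

int main()
{
    std::shared_ptr<uint8_t> blob(new uint8_t[16], std::default_delete<uint8_t[]>());
    // Aliasing constructor: same control block (the 16-byte buffer stays alive),
    // but get() now returns blob.get() + 4.
    std::shared_ptr<uint8_t> cursor(blob, blob.get() + 4);
    return (int)(cursor.get() - blob.get());  // 4
}
```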
diff --git a/inference/engine/include/cpu/factory_cpu.hpp b/inference/engine/include/cpu/factory_cpu.hpp
new file mode 100644
index 00000000..44a30ae7
--- /dev/null
+++ b/inference/engine/include/cpu/factory_cpu.hpp
@@ -0,0 +1,366 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _FACTORY_CPU_H
+#define _FACTORY_CPU_H
+
+#include "factory.hpp"
+#include "attention.hpp"
+#include "reduction.hpp"
+#include "jump.hpp"
+#include "cpu/resize_cpu.hpp"
+#include "cpu/pooling_cpu.hpp"
+#include "cpu/convolution_cpu.hpp"
+#include "cpu/deconvolution_cpu.hpp"
+#include "cpu/eltwise_cpu.hpp"
+#include "cpu/softmax_cpu.hpp"
+#include "cpu/activation_cpu.hpp"
+#include "cpu/fully_connected_cpu.hpp"
+#include "cpu/scale_cpu.hpp"
+#include "cpu/concat_cpu.hpp"
+#include "cpu/clip_cpu.hpp"
+#include "cpu/squeeze_cpu.hpp"
+#include "cpu/reshape_cpu.hpp"
+#include "cpu/embedding_cpu.hpp"
+#include "cpu/layer_norm_cpu.hpp"
+#include "cpu/matmul_cpu.hpp"
+#include "cpu/power_cpu.hpp"
+#include "cpu/transpose_cpu.hpp"
+#include "cpu/slice_cpu.hpp"
+#include "cpu/shared_weight_cpu.hpp"
+#include "cpu/repeat_cpu.hpp"
+#include "cpu/copy_cpu.hpp"
+#include "cpu/check_cpu.hpp"
+#include "cpu/preallocated_memory_cpu.hpp"
+#include "cpu/argmax_cpu.hpp"
+#include "cpu/unsqueeze_cpu.hpp"
+#include "cpu/rnncell_cpu.hpp"
+#include "cpu/rnn_cpu.hpp"
+#include "cpu/padding_cpu.hpp"
+#include "attention_mask.hpp"
+#include "relative_position_embedding.hpp"
+#include "relative_shift.hpp"
+#include "detection_output.hpp"
+#include "prior_box.hpp"
+#include "yolov3_detection_output.hpp"
+#include "cpu/channel_resize_cpu.hpp"
+#include "cpu/l2normalization_cpu.hpp"
+#include "cpu/tile_cpu.hpp"
+#include "cpu/prelu_cpu.hpp"
+#include "cpu/tfslice_cpu.hpp"
+#include "cpu/splice_cpu.hpp"
+#include "cpu/shape_cpu.hpp"
+
+class FactoryCPU : public Factory {
+public:
+    std::shared_ptr<Operator> createConvolution(DataType dt,
+        ConvolutionParamSpec p,
+        ActivationParamSpec dwActivationParamSpec,
+        ActivationParamSpec pwActivationParamSpec) override
+    {
+        auto cep =
+            (Convolution *)(new ConvolutionCPU(dt, p, dwActivationParamSpec, pwActivationParamSpec));
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createDeconvolution(
+        DataType dt, ConvolutionParamSpec p, ActivationParamSpec activationDesc) override
+    {
+        auto cep = new DeconvolutionCPU(dt, p, activationDesc);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createPooling(PoolingParamSpec p) override
+    {
+        auto cep = (Pooling *)(new PoolingCPU(p));
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createFullyConnected(
+        DataType dt, FullyConnectedParamSpec p, U32 numInput) override
+    {
+        auto cep = (FullyConnected *)(new FullyConnectedCPU(dt, p, numInput));
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createSoftmax(DataType dt, SoftmaxParamSpec p) override
+    {
+        auto cep = new SoftmaxCPU(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createConcat(ConcatParamSpec p) override
+    {
+        auto cep = (Concat *)(new ConcatCPU(p));
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createActivation(ActivationParamSpec activeDesc) override
+    {
+        auto cep = (Activation *)new ActivationCPU(activeDesc);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createEltwise(EltwiseParamSpec eltwiseDesc) override
+    {
+        auto cep = (Eltwise *)new EltwiseCPU(eltwiseDesc);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createScale(DataType dt, ScaleParamSpec p, int numChannels) override
+    {
+        auto cep = (Scale *)(new ScaleCPU(dt, p, numChannels));
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createRNN(DataType dt, RNNParamSpec p) override
+    {
+        auto cep = (RNNCell *)new RNNCPU(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createRNNCell(DataType dt, RNNParamSpec p) override
+    {
+        auto cep = (RNNCell *)new RNNCellCPU(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createEmbedding(DataType dt, EmbedParamSpec p) override
+    {
+        auto cep = (Embedding *)(new EmbeddingCPU(dt, p));
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createPower(DataType dt, PowerParamSpec p) override
+    {
+        auto cep = (Power *)(new PowerCPU(dt, p));
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createMatMul(DataType dt, MatMulParamSpec p) override
+    {
+        auto cep = (MatMul *)(new MatMulCPU(dt, p));
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createLayerNorm(DataType dt, U32 weightNum) override
+    {
+        auto cep = (LayerNorm *)(new LayerNormCPU(dt, weightNum));
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createReshape(DataType dt, ReshapeParamSpec p) override
+    {
+        auto cep = (Reshape *)(new ReshapeCPU(dt, p));
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createResize(DataType paramDT, ResizeParamSpec p) override
+    {
+        auto cep = (Resize *)(new ResizeCPU(paramDT, p));
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createSlice(DataType dt, SliceParamSpec p) override
+    {
+        auto cep = (Slice *)(new SliceCPU(dt, p));
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createTranspose(DataType dt, TransposeParamSpec p) override
+    {
+        auto cep = (Transpose *)new TransposeCPU(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createAttention(DataType dt, AttentionParamSpec p) override
+    {
+        auto cep = new Attention(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createClip(DataType dt, ClipParamSpec p) override
+    {
+        auto cep = (Clip *)(new ClipCPU(dt, p));
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createSqueeze(DataType dt, SqueezeParamSpec p) override
+    {
+        auto cep = (Squeeze *)(new SqueezeCPU(dt, p));
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createUnsqueeze(DataType dt, UnsqueezeParamSpec p) override
+    {
+        auto cep = (Unsqueeze *)new UnsqueezeCPU(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createReduction(DataType dt, ReductionParamSpec p) override
+    {
+        auto cep = new Reduction(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createArgMax(DataType dt, ArgMaxParamSpec p) override
+    {
+        auto cep = (ArgMax *)new ArgMaxCPU(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createCopy(DataType dt, CopyParamSpec p) override
+    {
+        auto cep = (Copy *)new CopyCPU(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createCheck(DataType dt, CheckParamSpec p) override
+    {
+        auto cep = (Check *)new CheckCPU(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createRepeat(
+        DataType dt, RepeatParamSpec p, I32 jumpOperatorIndex, I32 currentOperatorIndex) override
+    {
+        auto cep = (Repeat *)new RepeatCPU(dt, p, jumpOperatorIndex, currentOperatorIndex);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createBilateralSliceApply(BilateralSliceApplyParamSpec p) override
+    {
+        OP_UNSUP(1, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createPreAllocatedMemory(DataType dt, TensorDesc desc) override
+    {
+        auto cep = (PreAllocatedMemory *)new PreAllocatedMemoryCPU(dt, desc);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createSharedWeight(DataType dt,
+        TensorDesc desc,
+        std::string outputTensorName,
+        std::map<std::string, std::shared_ptr<Tensor>> *tensorMapPtr) override
+    {
+        auto cep = (SharedWeight *)new SharedWeightCPU(dt, desc, outputTensorName, tensorMapPtr);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createJump(
+        DataType dt, I32 jumpOperatorIndex, I32 currentOperatorIndex) override
+    {
+        auto cep = new Jump(dt, jumpOperatorIndex, currentOperatorIndex);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createSpace2Depth(DataType dt) override
+    {
+        OP_UNSUP(1, dt);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createDepth2Space(DataType dt, Depth2SpaceParamSpec p) override
+    {
+        OP_UNSUP(2, dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createPReLU(DataType dt) override
+    {
+        auto cep = new PReLUCPU(dt);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createAttentionMask(DataType dt, AttentionMaskParamSpec p) override
+    {
+        auto cep = new AttentionMask(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createRelativePositionEmbedding(DataType dt, EmbedParamSpec p) override
+    {
+        auto cep = new RelativePositionEmbedding(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createRelativeShift(DataType dt, RelativeShiftParamSpec p) override
+    {
+        auto cep = new RelativeShift(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createPadding(DataType dt, PadParamSpec p) override
+    {
+        auto cep = (Padding *)(new PaddingCPU(dt, p));
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createPriorBox(DataType dt, PriorBoxParamSpec p) override
+    {
+        auto cep = new PriorBox(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createDetectionOutput(DataType dt, DetectionOutputParamSpec p) override
+    {
+        auto cep = new DetectionOutput(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createYolov3DetectionOutput(
+        DataType dt, Yolov3DetectionOutputParamSpec p) override
+    {
+        auto cep = new Yolov3DetectionOutput(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createChannelResize(DataType dt, ChannelResizeParamSpec p) override
+    {
+        auto cep = new ChannelResizeCPU(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createL2Normalization(DataType dt) override
+    {
+        auto cep = new L2NormalizationCPU(dt);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createTile(DataType dt, TileParamSpec p) override
+    {
+        auto cep = new TileCPU(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createTfSlice(DataType dt, TfSliceParamSpec p) override
+    {
+        auto cep = new TfSliceCPU(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createSplice(DataType dt, SpliceParamSpec p) override
+    {
+        auto cep = new SpliceCPU(dt, p);
+        return std::shared_ptr<Operator>(cep);
+    }
+
+    std::shared_ptr<Operator> createShape() override
+    {
+        auto cep = new ShapeCPU();
+        return std::shared_ptr<Operator>(cep);
+    }
+};
+#endif  // _FACTORY_CPU_H
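FactoryCPU is a textbook abstract factory: one virtual create* entry point per operator type, each returning the common base interface so the graph builder never names a concrete backend class. A self-contained sketch of the same pattern, with illustrative names only:

```cpp
#include <memory>

struct Op {
    virtual ~Op() {}
    virtual void run() = 0;
};

struct ReluOp : Op {
    void run() override { /* elementwise max(0, x) over bound tensors */ }
};

struct OpFactory {
    virtual ~OpFactory() {}
    virtual std::shared_ptr<Op> createRelu() = 0;
};

// A CPU backend only has to override the creators; callers stay backend-agnostic.
struct CpuOpFactory : OpFactory {
    std::shared_ptr<Op> createRelu() override
    {
        return std::shared_ptr<Op>(new ReluOp());
    }
};
```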
diff --git a/inference/engine/include/cpu/fully_connected_cpu.hpp b/inference/engine/include/cpu/fully_connected_cpu.hpp
new file mode 100644
index 00000000..9d76e972
--- /dev/null
+++ b/inference/engine/include/cpu/fully_connected_cpu.hpp
@@ -0,0 +1,266 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _FULLY_CONNECTED_CPU_H
+#define _FULLY_CONNECTED_CPU_H
+
+#include "fully_connected.hpp"
+#include "blas_enhance.h"
+
+class FullyConnectedCPU : public FullyConnected {
+public:
+    FullyConnectedCPU(DataType dt, FullyConnectedParamSpec p, U32 numInput)
+        : FullyConnected(dt, p, numInput)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<FullyConnectedCPU> mem = std::shared_ptr<FullyConnectedCPU>(
+            new FullyConnectedCPU(this->dt, this->p, this->numInput));
+        *mem = *this;
+        return mem;
+    }
+
+    EE infer_weight_desc() override
+    {
+        DataType dtNoQ = (DT_F16_8Q == this->dt) ? DT_F16 : this->dt;
+        this->weightTensors = std::vector<Tensor>(1);
+        this->weightTensors[0].resize(
+            tensor2df(dtNoQ, DF_TRANSPOSE, this->p.num_outputs, this->numInput));
+        this->biasTensors = std::vector<Tensor>(1);
+        this->biasTensors[0].resize(tensor1d(dtNoQ, this->p.num_outputs));
+        return SUCCESS;
+    }
+
+    TensorDesc desc_process(TensorDesc inDim)
+    {
+        TensorDesc inputDesc;
+        DataType dt;
+        DataFormat df;
+        U32 in, ic, ih, iw;
+        switch (inDim.nDims) {
+            case 2: {
+                CHECK_STATUS(tensor2dGet(inDim, &dt, &df, &in, &(this->numInput)));
+                inputDesc = inDim;
+                break;
+            }
+            case 3: {
+                CHECK_STATUS(tensor3dGet(inDim, &dt, &df, &in, &ih, &iw));
+                this->numInput = iw;
+                inputDesc = tensor2df(dt, DF_NORMAL, in * ih, iw);
+                break;
+            }
+            case 4: {
+                CHECK_STATUS(tensor4dGet(inDim, &dt, &df, &in, &ic, &ih, &iw));
+                this->numInput = ic * ih * iw;
+                inputDesc = inDim;
+                break;
+            }
+            default:
+                break;
+        }
+        return inputDesc;
+    }
+
+    TensorDesc desc_process_reverse(TensorDesc inDim, TensorDesc outDim)
+    {
+        TensorDesc outDesc;
+        DataType dt;
+        DataFormat df;
+        U32 in, ih, iw;
+        switch (inDim.nDims) {
+            case 2: {
+                outDesc = outDim;
+                break;
+            }
+            case 3: {
+                CHECK_STATUS(tensor3dGet(inDim, &dt, &df, &in, &ih, &iw));
+                outDesc = tensor3df(dt, df, in, ih, this->p.num_outputs);
+                break;
+            }
+            case 4: {
+                outDesc = outDim;
+                break;
+            }
+            default:
+                break;
+        }
+        return outDesc;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        TensorDesc inputDesc = desc_process(inputTensor.get_desc());
+        inputTensor.resize(inputDesc);
+
+        Tensor outputTensor = this->outputTensors[0];
+        TensorDesc outputDesc = outputTensor.get_desc();
+        outputDesc.dims[0] = this->p.num_outputs;
+        outputDesc = desc_process(outputDesc);
+        outputTensor.resize(outputDesc);
+
+        if (featureScale.size() > 1 && featureScale[0][0] > 0 && DT_I8 != inputDesc.dt) {
+            inputTensor.set_scale(featureScale[0][0]);
+        }
+
+        CHECK_STATUS(fully_connected(inputTensor, weightTensors[0], biasTensors[0], this->temp,
+            outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->mvm = false;
+        TensorDesc inputDesc = desc_process(inTensors[0]->get_desc());
+        TensorDesc weightDesc =
+            tensor2df(inputDesc.dt, DF_TRANSPOSE, this->p.num_outputs, this->numInput);
+        TensorDesc outputDesc;
+
+        DataType idt;
+        DataFormat idf;
+        U32 in = 0, ic, ih, iw;
+        if (tensorIs2d(inputDesc)) {
+            CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &in, &iw));
+        } else if (tensorIs4d(inputDesc)) {
+            CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+        } else {
+            CHECK_STATUS(NOT_MATCH);
+        }
+        if (1 == in) {
+            this->mvm = true;
+        }
+
+        Tensor tmpInput = *inTensors[0];
+        tmpInput.resize(inputDesc);
+        Tensor tmpFilter;
+        tmpFilter.resize(weightDesc);
+        CHECK_STATUS(
+            fully_connected_infer_output_size(&tmpInput, tmpFilter, outTensors[0], &this->archInfo));
+        if (1 == this->p.num_slices) {
+            outputDesc = outTensors[0]->get_desc();
+            outputDesc = desc_process_reverse(inTensors[0]->get_desc(), outputDesc);
+            if (DT_F16_8Q == this->dt) {
+                if (featureScale.size() > 0 && -2 == (featureScale.back())[0]) {
+                    outputDesc.dt = DT_F16;
+                } else {
+                    outputDesc.dt = DT_I8;
+                }
+            }
+            outTensors[0]->resize(outputDesc);
+        } else {
+            UNI_ERROR_LOG("FC merge is deprecated\n");
+            outputDesc = desc_process_reverse(inTensors[0]->get_desc(), outputDesc);
+            for (U32 i = 0; i < this->p.num_slices; i++) {
+                outputDesc.dims[0] = this->p.slice_point[i];
+                if (DT_F16_8Q == this->dt) {
+                    if (featureScale.size() > 0 && -2 == (featureScale.back())[0]) {
+                        outputDesc.dt = DT_F16;
+                    } else {
+                        outputDesc.dt = DT_I8;
+                    }
+                }
+            }
+        }
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        TensorDesc inputDesc = desc_process((this->inputTensors[0]).get_desc());
+        U32 bytes = 0;
+
+        Tensor tmpInput, tmpFilter;
+        tmpInput.resize(inputDesc);
+
+        CHECK_STATUS(fully_connected_infer_forward_tmp_bytes(
+            tmpInput, weightTensors[0], &bytes, &this->archInfo));
+        return bytes;
+    }
+
+    U32 infer_wtm_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(
+            fully_connected_transform_filter_bytes(weightTensors[0], &bytes, &this->archInfo));
+        return bytes;
+    }
+
+    EE transform_filter() override
+    {
+        TensorDesc inputDesc = desc_process(this->inputTensors[0].get_desc());
+        Tensor weightTensor = this->weightTensors[0];
+        TensorDesc weightDesc = weightTensor.get_desc();
+        TensorDesc wtmDesc;
+        auto wtm_bytes = this->infer_wtm_memory_size();
+
+        TensorDesc tmpDesc;
+        Tensor tmpFilter;
+
+        Tensor tmpInput;
+        tmpInput.resize(inputDesc);
+        if (inputDesc.df == DF_NCHWC8) {
+            tmpFilter.resize(tensor1d(DT_U8, wtm_bytes));
+            tmpFilter.alloc();
+            CHECK_STATUS(fully_connected_transform_filter(
+                tmpInput, weightTensors[0], &tmpFilter, &this->archInfo));
+        } else {
+            tmpDesc = weightDesc;
+            if (this->mvm) {
+                tmpDesc.df = DF_NORMAL;
+            }
+            tmpFilter = weightTensor;
+            tmpFilter.resize(tmpDesc);
+        }
+
+#ifdef _USE_INT8
+        if (DT_F16_8Q == this->dt) {
+            std::shared_ptr<U8> qFilter = std::shared_ptr<U8>(
+                (U8 *)operator new(bytesOf(DT_I8) * tensorNumElements(tmpDesc)));
+
+            F16 scale = -1;
+            F16 *inD = (F16 *)((CpuMemory *)(tmpFilter.get_memory()))->get_ptr();
+            CHECK_STATUS(
+                quantize_tensor(tmpFilter.get_desc(), inD, &tmpDesc, qFilter.get(), &scale));
+            tmpFilter.resize(tmpDesc);
+            ((CpuMemory *)(tmpFilter.get_memory()))->set_shared_ptr(qFilter);
+            tmpFilter.set_scale(scale);
+        }
+#endif
+        this->wtm = std::shared_ptr<Tensor>(new Tensor());
+        wtm->resize(tensor1d(DT_U8, wtm_bytes));
+        wtm->alloc();
+        wtm->set_scale(tmpFilter.get_scale());
+        if (this->mvm) {
+            if (X86_AVX2 != this->archInfo.arch) {
+                CHECK_STATUS(matrix_vector_multiply_transform_weight(tmpFilter.get_desc(),
+                    ((CpuMemory *)(tmpFilter.get_memory()))->get_ptr(), &wtmDesc,
+                    ((CpuMemory *)(wtm->get_memory()))->get_ptr(), this->archInfo.arch));
+                wtm->resize(wtmDesc);
+            } else {
+                *wtm.get() = tmpFilter;
+            }
+        } else {
+            CHECK_STATUS(matrix_matrix_multiply_transform_rhs(tmpFilter.get_desc(),
+                ((CpuMemory *)(tmpFilter.get_memory()))->get_ptr(), &wtmDesc,
+                ((CpuMemory *)(wtm->get_memory()))->get_ptr(), this->archInfo.arch));
+            wtm->resize(wtmDesc);
+        }
+        this->weightTensors[0] = *this->get_wtm();
+        return SUCCESS;
+    }
+
+    bool mvm;
+};
+
+#endif  // _FULLY_CONNECTED_CPU_H
diff --git a/inference/engine/include/cpu/l2normalization_cpu.hpp b/inference/engine/include/cpu/l2normalization_cpu.hpp
new file mode 100644
index 00000000..9cdb463f
--- /dev/null
+++ b/inference/engine/include/cpu/l2normalization_cpu.hpp
@@ -0,0 +1,47 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _L2NORMALIZATION_CPU_H
+#define _L2NORMALIZATION_CPU_H
+
+#include "l2normalization.hpp"
+
+class L2NormalizationCPU : public L2Normalization {
+public:
+    L2NormalizationCPU(DataType dt) : L2Normalization(dt)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<L2NormalizationCPU> mem =
+            std::shared_ptr<L2NormalizationCPU>(new L2NormalizationCPU(this->dt));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        CHECK_STATUS(
+            l2normalization(this->inputTensors[0], this->outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(
+            l2normalization_infer_output_size(inTensors[0], outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+};
+
+#endif  // _L2NORMALIZATION_CPU_H
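For reference, the math l2normalization applies per vector is y = x / ||x||_2. A minimal scalar sketch; the epsilon guard is an assumption added here to avoid division by zero, not something this diff shows:

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

static void l2_normalize(std::vector<float> *v, float eps = 1e-12f)
{
    float sumSq = 0;
    for (float x : *v) {
        sumSq += x * x;
    }
    float invNorm = 1.0f / std::sqrt(std::max(sumSq, eps));
    for (float &x : *v) {
        x *= invNorm;
    }
}
```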
diff --git a/inference/engine/include/cpu/layer_norm_cpu.hpp b/inference/engine/include/cpu/layer_norm_cpu.hpp
new file mode 100644
index 00000000..3f34e1b3
--- /dev/null
+++ b/inference/engine/include/cpu/layer_norm_cpu.hpp
@@ -0,0 +1,67 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _LAYER_NORM_CPU_H
+#define _LAYER_NORM_CPU_H
+
+#include "layer_norm.hpp"
+
+class LayerNormCPU : public LayerNorm {
+public:
+    LayerNormCPU(DataType dt, U32 weightNum) : LayerNorm(dt, weightNum)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<LayerNormCPU> mem =
+            std::shared_ptr<LayerNormCPU>(new LayerNormCPU(this->dt, this->weightNum));
+        *mem = *this;
+        return mem;
+    }
+
+    EE infer_weight_desc() override
+    {
+        auto curOpWs = this->get_weightspec();
+        DataType dtNoQ = (DT_F16_8Q == this->dt) ? DT_F16 : this->dt;
+        if (0 != curOpWs.bytes_of_weight) {
+            this->weightNum = curOpWs.bytes_of_weight / bytesOf(curOpWs.mdt);
+        }
+        Tensor weightTensor;
+        weightTensor.resize(tensor1d(dtNoQ, this->weightNum));
+        this->weightTensors.push_back(weightTensor);
+        Tensor biasTensor;
+        biasTensor.resize(tensor1d(dtNoQ, this->weightNum));
+        this->biasTensors.push_back(biasTensor);
+        return SUCCESS;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor weightTensor = this->weightTensors[0];
+        Tensor biasTensor = this->biasTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+
+        CHECK_STATUS(layer_normalization(
+            inputTensor, weightTensor, biasTensor, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(normalization_infer_output_size(inTensors[0], outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+};
+
+#endif  // _LAYER_NORM_CPU_H
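The weight and bias tensors sized in infer_weight_desc() are the gain g and shift b of the standard layer-norm formula y = g * (x - mean) / sqrt(var + eps) + b. A scalar sketch over one feature vector; the eps value is illustrative:

```cpp
#include <cmath>
#include <cstddef>

static void layer_norm(const float *x, const float *g, const float *b, float *y,
    size_t n, float eps = 1e-6f)
{
    float mean = 0;
    for (size_t i = 0; i < n; i++) {
        mean += x[i];
    }
    mean /= n;
    float var = 0;
    for (size_t i = 0; i < n; i++) {
        var += (x[i] - mean) * (x[i] - mean);
    }
    var /= n;
    float inv = 1.0f / std::sqrt(var + eps);
    for (size_t i = 0; i < n; i++) {
        y[i] = g[i] * (x[i] - mean) * inv + b[i];
    }
}
```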
diff --git a/inference/engine/include/cpu/matmul_cpu.hpp b/inference/engine/include/cpu/matmul_cpu.hpp
new file mode 100644
index 00000000..4ea44af0
--- /dev/null
+++ b/inference/engine/include/cpu/matmul_cpu.hpp
@@ -0,0 +1,72 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _MATMUL_CPU_H
+#define _MATMUL_CPU_H
+
+#include "matmul.hpp"
+
+class MatMulCPU : public MatMul {
+public:
+    MatMulCPU(DataType dt, MatMulParamSpec p) : MatMul(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<MatMulCPU> mem =
+            std::shared_ptr<MatMulCPU>(new MatMulCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputTensorA = this->inputTensors[0];
+        TensorDesc inputDescA = inputTensorA.get_desc();
+        Tensor inputTensorB = this->inputTensors[1];
+        TensorDesc inputDescB = inputTensorB.get_desc();
+        Tensor outputTensor = this->outputTensors[0];
+        if (3 == featureScale.size() && featureScale[0][0] > 0 && DT_I8 != inputDescA.dt) {
+            inputTensorA.set_scale(featureScale[0][0]);
+        }
+        if (3 == featureScale.size() && featureScale[1][0] > 0 && DT_I8 != inputDescB.dt) {
+            inputTensorB.set_scale(featureScale[1][0]);
+        }
+
+        CHECK_STATUS(matmul(inputTensors[0], this->p.transpose_a, inputTensors[1],
+            this->p.transpose_b, this->temp, outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(matmul_infer_output_size(inTensors[0], this->p.transpose_a, inTensors[1],
+            this->p.transpose_b, outTensors[0], &this->archInfo));
+        if (DT_F16_8Q == this->dt && featureScale.size() > 0 && -2 == (featureScale.back())[0]) {
+            auto outDesc = outTensors[0]->get_desc();
+            outDesc.dt = DT_F16;
+            outTensors[0]->resize(outDesc);
+        }
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(matmul_infer_forward_tmp_bytes(inputTensors[0], this->p.transpose_a,
+            inputTensors[1], this->p.transpose_b, &bytes, &this->archInfo));
+        return bytes;
+    }
+};
+
+#endif  // _MATMUL_CPU_H
diff --git a/inference/engine/include/cpu/padding_cpu.hpp b/inference/engine/include/cpu/padding_cpu.hpp
new file mode 100644
index 00000000..ac4979db
--- /dev/null
+++ b/inference/engine/include/cpu/padding_cpu.hpp
@@ -0,0 +1,48 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _PADDING_CPU_H
+#define _PADDING_CPU_H
+
+#include "padding.hpp"
+
+class PaddingCPU : public Padding {
+public:
+    PaddingCPU(DataType dt, PadParamSpec p) : Padding(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<PaddingCPU> mem =
+            std::shared_ptr<PaddingCPU>(new PaddingCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(padding(inputTensor, this->p, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(
+            padding_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+};
+
+#endif  // _PADDING_CPU_H
diff --git a/inference/engine/include/cpu/pooling_cpu.hpp b/inference/engine/include/cpu/pooling_cpu.hpp
new file mode 100644
index 00000000..c2a058f5
--- /dev/null
+++ b/inference/engine/include/cpu/pooling_cpu.hpp
@@ -0,0 +1,57 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _POOLING_CPU_H
+#define _POOLING_CPU_H
+
+#include "pooling.hpp"
+
+class PoolingCPU : public Pooling {
+public:
+    PoolingCPU(PoolingParamSpec p) : Pooling(p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<PoolingCPU> mem = std::shared_ptr<PoolingCPU>(new PoolingCPU(this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        CHECK_STATUS(pooling(
+            this->inputTensors[0], this->p, this->temp, this->outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        if (this->p.kernel_h == 0 && this->p.kernel_w == 0) {
+            Pooling::set_stride(1, 1);
+        }
+        CHECK_STATUS(
+            pooling_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 size = 0;
+        CHECK_STATUS(pooling_infer_forward_tmp_bytes(
+            this->inputTensors[0], this->outputTensors[0], &size, &this->archInfo));
+        return size;
+    }
+};
+
+#endif  // _POOLING_CPU_H
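For intuition on what pooling_infer_output_size computes, the typical output-extent arithmetic for one spatial axis looks like the sketch below. This is the conventional floor/ceil formula, not necessarily Bolt's exact rule; it assumes kernel <= in + padBefore + padAfter:

```cpp
#include <cstdint>

static uint32_t pooled_extent(uint32_t in, uint32_t kernel, uint32_t stride,
    uint32_t padBefore, uint32_t padAfter, bool ceilMode)
{
    uint32_t span = in + padBefore + padAfter - kernel;
    // floor mode: span / stride + 1; ceil mode rounds the division up.
    return ceilMode ? (span + stride - 1) / stride + 1 : span / stride + 1;
}
```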
diff --git a/inference/engine/include/cpu/power_cpu.hpp b/inference/engine/include/cpu/power_cpu.hpp
new file mode 100644
index 00000000..aace8126
--- /dev/null
+++ b/inference/engine/include/cpu/power_cpu.hpp
@@ -0,0 +1,60 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _POWER_CPU_H
+#define _POWER_CPU_H
+
+#include "power.hpp"
+
+class PowerCPU : public Power {
+public:
+    PowerCPU(DataType dt, PowerParamSpec p) : Power(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<PowerCPU> mem = std::shared_ptr<PowerCPU>(new PowerCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        TensorDesc inputDesc = inputTensor.get_desc();
+        Tensor outputTensor = this->outputTensors[0];
+
+        if (DT_I8 == inputDesc.dt) {
+#ifdef _USE_INT8
+            CHECK_REQUIREMENT(0 == this->p.shift);
+            F32 scaleO = inputTensor.get_scale() / this->p.scale;
+            outputTensor.set_scale(scaleO);
+            auto inPtr = ((CpuMemory *)(inputTensor.get_memory()))->get_ptr();
+            auto outPtr = ((CpuMemory *)(outputTensor.get_memory()))->get_ptr();
+            if (inPtr != outPtr) {
+                memcpy(outPtr, inPtr, tensorNumBytes(inputDesc));
+            }
+#endif
+        } else {
+            CHECK_STATUS(power(inputTensor, this->p, outputTensor, &this->archInfo));
+        }
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        return power_infer_output_size(inTensors[0], outTensors[0], &this->archInfo);
+    }
+};
+
+#endif  // _POWER_CPU_H
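The int8 fast path above never touches the payload: multiplying a quantized tensor by p.scale is equivalent to dividing its dequantization scale, which is exactly the scaleO computation before the memcpy. A sketch of the idea with illustrative names:

```cpp
struct QuantizedView {
    const signed char *data;  // int8 payload, q ~= real * scale
    float scale;
};

// Multiply every element by k without rewriting data:
// q = (real * k) * (scale / k), so only the scale changes.
static QuantizedView scale_by(QuantizedView t, float k)
{
    QuantizedView out = t;
    out.scale = t.scale / k;
    return out;
}
```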
diff --git a/inference/engine/include/cpu/preallocated_memory_cpu.hpp b/inference/engine/include/cpu/preallocated_memory_cpu.hpp
new file mode 100644
index 00000000..92a0fab5
--- /dev/null
+++ b/inference/engine/include/cpu/preallocated_memory_cpu.hpp
@@ -0,0 +1,48 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _PREALLOCATED_MEMORY_CPU_H
+#define _PREALLOCATED_MEMORY_CPU_H
+
+#include "preallocated_memory.hpp"
+
+class PreAllocatedMemoryCPU : public PreAllocatedMemory {
+public:
+    PreAllocatedMemoryCPU(DataType dt, TensorDesc desc) : PreAllocatedMemory(dt, desc)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<PreAllocatedMemoryCPU> mem =
+            std::shared_ptr<PreAllocatedMemoryCPU>(new PreAllocatedMemoryCPU(this->dt, this->desc));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        CHECK_STATUS(preallocated_memory(this->outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        if (inTensors.size() > 0) {
+            CHECK_STATUS(NOT_MATCH);
+        }
+        outTensors[0]->resize(this->desc);
+        return SUCCESS;
+    }
+};
+
+#endif  // _PREALLOCATED_MEMORY_CPU_H
diff --git a/inference/engine/include/cpu/prelu_cpu.hpp b/inference/engine/include/cpu/prelu_cpu.hpp
new file mode 100644
index 00000000..5f81a573
--- /dev/null
+++ b/inference/engine/include/cpu/prelu_cpu.hpp
@@ -0,0 +1,65 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _PRELU_CPU_H
+#define _PRELU_CPU_H
+
+#include "prelu.hpp"
+
+class PReLUCPU : public PReLU {
+public:
+    PReLUCPU(DataType dt) : PReLU(dt)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<PReLUCPU> mem = std::shared_ptr<PReLUCPU>(new PReLUCPU(this->dt));
+        *mem = *this;
+        return mem;
+    }
+
+    EE infer_weight_desc() override
+    {
+        auto curOpWs = this->get_weightspec();
+        U32 weightNum = 0;
+        if (curOpWs.weight != nullptr) {
+            weightNum = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt));
+        }
+        if (weightNum == 0) {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+        if (weightNum == 1) {
+            this->preluDesc.propagate_down = true;
+        } else {
+            this->preluDesc.propagate_down = false;
+        }
+        Tensor weightTensor;
+        weightTensor.resize(tensor1d(this->dt, weightNum));
+        this->weightTensors.push_back(weightTensor);
+        return SUCCESS;
+    }
+
+    void run() override
+    {
+        CHECK_STATUS(prelu(this->inputTensors[0], this->weightTensors[0], this->preluDesc,
+            this->outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(prelu_infer_output_size(inTensors[0], outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+};
+#endif  // _PRELU_CPU_H
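infer_weight_desc() above distinguishes a single shared slope (propagate_down) from one slope per channel. The semantics both cases implement, sketched for an NCHW single-image layout (the layout assumption is mine for illustration):

```cpp
#include <cstddef>

static void prelu(const float *x, const float *slope, float *y, size_t n,
    size_t channels, bool sharedSlope)
{
    size_t perChannel = n / channels;  // contiguous elements per channel in NCHW
    for (size_t i = 0; i < n; i++) {
        float a = sharedSlope ? slope[0] : slope[i / perChannel];
        y[i] = (x[i] > 0) ? x[i] : a * x[i];
    }
}
```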
diff --git a/inference/engine/include/cpu/repeat_cpu.hpp b/inference/engine/include/cpu/repeat_cpu.hpp
new file mode 100644
index 00000000..46f8789c
--- /dev/null
+++ b/inference/engine/include/cpu/repeat_cpu.hpp
@@ -0,0 +1,86 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _REPEAT_CPU_H
+#define _REPEAT_CPU_H
+
+#include "repeat.hpp"
+
+class RepeatCPU : public Repeat {
+public:
+    RepeatCPU(DataType dt, RepeatParamSpec p, I32 jumpOperatorIndex, I32 currentOperatorIndex)
+        : Repeat(dt, p, jumpOperatorIndex, currentOperatorIndex)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<RepeatCPU> mem = std::shared_ptr<RepeatCPU>(
+            new RepeatCPU(this->dt, this->p, this->jumpOperatorIndex, this->nextOperatorIndex - 1));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {}
+
+    int get_next_operator_index() override
+    {
+        // check the optional status tensor: any nonzero element ends the loop
+        if (this->inputTensors.size() > 1) {
+            Tensor inputTensor = this->inputTensors[1];
+            TensorDesc inputDesc = inputTensor.get_desc();
+            I32 *ptr = (I32 *)(((CpuMemory *)(inputTensor.get_memory()))->get_ptr());
+            U32 length = tensorNumElements(inputDesc);
+            for (U32 i = 0; i < length; i++) {
+                if (ptr[i]) {
+                    this->iter = 0;
+                    return this->nextOperatorIndex;
+                }
+            }
+        }
+
+        // check the loop counter
+        if (this->iter < this->p.loops) {
+            this->iter++;
+            return this->jumpOperatorIndex;
+        } else {
+            this->iter = 0;
+            return this->nextOperatorIndex;
+        }
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->iter = 0;
+        if (this->p.axis >= 0) {
+            int axisIndex = 0;
+            if (inTensors.size() > 2) {
+                axisIndex = 2;
+            } else {
+                UNI_ERROR_LOG(
+                    "[ERROR] the axis feature of Repeat requires at least 3 input tensors\n");
+            }
+            TensorDesc desc = inTensors[axisIndex]->get_desc();
+            this->p.loops = desc.dims[desc.nDims - 1 - this->p.axis];
+        }
+        TensorDesc outDesc = outTensors[0]->get_desc();
+        outDesc.dt = this->dt;
+        outDesc.nDims = 0;
+        outTensors[0]->resize(outDesc);
+        return SUCCESS;
+    }
+};
+
+#endif  // _REPEAT_CPU_H
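get_next_operator_index() is a small state machine: a bounded loop with an optional early-exit flag. A standalone sketch of the same control flow, with illustrative names:

```cpp
struct LoopCtl {
    int iter = 0;
    int loops;      // corresponds to p.loops
    int bodyStart;  // corresponds to jumpOperatorIndex
    int loopEnd;    // corresponds to nextOperatorIndex

    // stopFlag mirrors the optional status tensor: true ends the loop early.
    int next(bool stopFlag)
    {
        if (stopFlag || iter >= loops) {
            iter = 0;          // reset so the loop can run again next inference
            return loopEnd;
        }
        iter++;
        return bodyStart;      // jump back to the first operator of the body
    }
};
```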
+
+#ifndef _RESHAPE_CPU_H
+#define _RESHAPE_CPU_H
+
+#include "reshape.hpp"
+
+class ReshapeCPU : public Reshape {
+public:
+    ReshapeCPU(DataType dt, ReshapeParamSpec p) : Reshape(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new ReshapeCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        Tensor tmpInputTensor = inputTensor;
+        Tensor tmpOutputTensor = outputTensor;
+        auto inputDesc = inputTensor.get_desc();
+        auto outputDesc = outputTensor.get_desc();
+        // axis == 8 marks a reshape converted from a TFLite (NHWC) model
+        if (this->p.axis == 8 && outputDesc.nDims == 4) {
+            auto tmpOutputDesc = outputTensor.get_desc();
+            tmpOutputDesc.df = DF_NHWC;
+            tmpOutputTensor = this->temp;
+            tmpOutputTensor.resize(tmpOutputDesc);
+        }
+
+        // NCHW/NCHWC8 -> NHWC
+        if (this->p.axis == 8 && inputDesc.nDims == 4) {
+            auto inputDesc = inputTensor.get_desc();
+            auto tmpInputDesc = inputDesc;
+            tmpInputDesc.df = DF_NHWC;
+            transformToNHWC(inputDesc, ((CpuMemory *)(inputTensor.get_memory()))->get_ptr(),
+                tmpInputDesc, ((CpuMemory *)(this->temp.get_memory()))->get_ptr());
+            tmpInputTensor = this->temp;
+            tmpInputTensor.resize(tmpInputDesc);
+        }
+
+        CHECK_STATUS(reshape(tmpInputTensor, this->temp, tmpOutputTensor, &this->archInfo));
+
+        // NHWC -> NCHW
+        if (this->p.axis == 8 && outputDesc.nDims == 4) {
+            auto outputDesc = outputTensor.get_desc();
+            auto tmpOutputDesc = tmpOutputTensor.get_desc();
+            void *tmpOutputPtr = ((CpuMemory *)(tmpOutputTensor.get_memory()))->get_ptr();
+            transformToNCHW(tmpOutputDesc, tmpOutputPtr, outputDesc,
+                ((CpuMemory *)(outputTensor.get_memory()))->get_ptr());
+        }
+        outputTensor.set_scale(inputTensor.get_scale());
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(
+            reshape_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(reshape_infer_forward_tmp_bytes(
+            this->inputTensors[0], this->outputTensors[0], &bytes, &this->archInfo));
+        return bytes;
+    }
+};
+
+#endif // _RESHAPE_CPU_H
diff --git a/inference/engine/include/cpu/resize_cpu.hpp b/inference/engine/include/cpu/resize_cpu.hpp
new file mode 100644
index 00000000..b8ee1bf6
--- /dev/null
+++ b/inference/engine/include/cpu/resize_cpu.hpp
@@ -0,0 +1,78 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _RESIZE_CPU_H
+#define _RESIZE_CPU_H
+
+#include "image.h"
+#include "resize.hpp"
+
+class ResizeCPU : public Resize {
+public:
+    ResizeCPU(DataType paramDT, ResizeParamSpec p) : Resize(paramDT, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new ResizeCPU(this->paramDT, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        CHECK_STATUS(resize(inputTensors[0], temp, outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        ResizeDesc resizeDesc;
+        resizeDesc.paramDT = this->paramDT;
+        U32 bytes;
+        switch (paramDT) {
+            case DT_F32: {
+                CHECK_REQUIREMENT(1 == this->p.scales[0] && 1 == this->p.scales[1]);
+                CHECK_STATUS(resize_infer_output_size(inTensors[0], resizeDesc,
+                    this->p.scales + 2, outTensors[0], &bytes, &this->archInfo));
+                break;
+            }
+            case DT_U32: {
+                CHECK_STATUS(resize_infer_output_size(inTensors[0], resizeDesc, this->p.sizes,
+                    outTensors[0], &bytes, &this->archInfo));
+                break;
+            }
+            default: {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+        }
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 size = 0;
+        TensorDesc inputDesc = inputTensors[0].get_desc();
+        if (DF_NCHW == inputDesc.df && IS_ARM(archInfo.arch)) {
+            // pad channels to a multiple of 8 for the ARM NCHWC8 kernels
+            U32 paddedC = (inputDesc.dims[2] + 7) / 8 * 8;
+            TensorDesc outputDesc = outputTensors[0].get_desc();
+            inputDesc.dims[2] = paddedC;
+            outputDesc.dims[2] = paddedC;
+            size = tensorNumBytes(inputDesc) + tensorNumBytes(outputDesc);
+        }
+        return size;
+    }
+};
+
+#endif // _RESIZE_CPU_H
diff --git a/inference/engine/include/cpu/rnn_cpu.hpp b/inference/engine/include/cpu/rnn_cpu.hpp
new file mode 100644
index 00000000..de5691db
--- /dev/null
+++ b/inference/engine/include/cpu/rnn_cpu.hpp
@@ -0,0 +1,62 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
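+
+// RNNCPU processes a whole sequence with one rnn() call, in contrast to its base class
+// RNNCellCPU, which advances a single time step. Inputs are 3-D (batch, time, feature);
+// xDim caches the feature size for weight-shape inference.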
+
+#ifndef _RNN_CPU_H
+#define _RNN_CPU_H
+
+#include "cpu/rnncell_cpu.hpp"
+
+class RNNCPU : public RNNCellCPU {
+public:
+    RNNCPU(DataType dt, RNNParamSpec p) : RNNCellCPU(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem = std::shared_ptr<Operator>(new RNNCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        // NOTE: the tmp and output buffers are used as-is; they are not zeroed here
+        CHECK_STATUS(rnn(inputTensor, this->weightTensors, this->biasTensors, this->p, this->temp,
+            outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        TensorDesc inDim = inTensors[0]->get_desc();
+        DataType dt;
+        DataFormat df;
+        U32 iB, inT, iX;
+        CHECK_STATUS(tensor3dGet(inDim, &dt, &df, &iB, &inT, &iX));
+        this->xDim = iX;
+        CHECK_STATUS(rnn_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(rnn_infer_forward_tmp_bytes(this->inputTensors[0], this->weightTensors[0],
+            this->outputTensors[0], this->p, &bytes, &this->archInfo));
+        return bytes;
+    }
+};
+
+#endif // _RNN_CPU_H
diff --git a/inference/engine/include/cpu/rnncell_cpu.hpp b/inference/engine/include/cpu/rnncell_cpu.hpp
new file mode 100644
index 00000000..f5fd5b58
--- /dev/null
+++ b/inference/engine/include/cpu/rnncell_cpu.hpp
@@ -0,0 +1,140 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
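+
+// RNNCellCPU computes one recurrent step: x plus the previous state in, new state h out.
+// For quantized graphs (featureScale.size() > 1) the input is first clipped to the
+// int8-representable range (clipParam.max = 127 / featureScale[0][0]), using the extra
+// temp space reserved in infer_tmp_memory_size().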
+
+#ifndef _RNNCELL_CPU_H
+#define _RNNCELL_CPU_H
+
+#include "rnncell.hpp"
+
+class RNNCellCPU : public RNNCell {
+public:
+    RNNCellCPU(DataType dt, RNNParamSpec p) : RNNCell(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new RNNCellCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor xTensor = this->inputTensors[0];
+        Tensor stateTensor = this->inputTensors[1];
+        Tensor hTensor = this->outputTensors[0];
+        Tensor tmpTensor = this->temp;
+        U32 tmpOffset = 0;
+        if (this->featureScale.size() > 1) {
+            tmpTensor.resize(xTensor.get_desc());
+            CHECK_STATUS(clip(xTensor, this->clipParam, tmpTensor, &this->archInfo));
+            xTensor = tmpTensor;
+            tmpOffset = xTensor.bytes();
+        }
+        CHECK_STATUS(rnncell(xTensor, this->weightTensors, this->biasTensors, stateTensor, this->p,
+            this->xDim, this->p.numOutput, tmpOffset, tmpTensor, hTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        TensorDesc inDim = inTensors[0]->get_desc();
+        DataType dt;
+        DataFormat df;
+        U32 iB, iX;
+        CHECK_STATUS(tensor2dGet(inDim, &dt, &df, &iB, &iX));
+        this->xDim = iX;
+        CHECK_STATUS(rnncell_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(rnncell_infer_forward_tmp_bytes(this->inputTensors[0], this->weightTensors[0],
+            this->outputTensors[0], this->p, &bytes, &this->archInfo));
+
+        if (featureScale.size() > 1) {
+            CHECK_REQUIREMENT(featureScale[0][0] > 0);
+            CHECK_REQUIREMENT(featureScale[0][0] == featureScale[1][0]);
+            this->clipParam.max = 127.0 / featureScale[0][0];
+            this->clipParam.min = -1 * this->clipParam.max;
+            bytes += this->inputTensors[0].bytes();
+        }
+        return bytes;
+    }
+
+    EE transform_filter() override
+    {
+        I32 filter_num = this->weightTensors.size();
+        std::vector<U32> bytes(filter_num);
+        CHECK_STATUS(
+            rnn_transform_filter_bytes(this->weightTensors, this->p, bytes.data(), &this->archInfo));
+        std::vector<Tensor> ftmTensors(filter_num);
+        std::vector<Tensor *> tmp(filter_num);
+        for (I32 i = 0; i < filter_num; i++) {
+            ftmTensors[i].resize(tensor1d(DT_U8, bytes[i]));
+            ftmTensors[i].alloc();
+            tmp[i] = &ftmTensors[i];
+        }
+        CHECK_STATUS(rnn_transform_filter(this->weightTensors, this->p, tmp, &this->archInfo));
+        this->weightTensors = ftmTensors;
+        return SUCCESS;
+    }
+
+    EE infer_weight_desc() override
+    {
+        // one weight/bias pair per direction (num1) and per projection stage (num2)
+        int num1 = (this->p.biDirection) ? 2 : 1;
+        int num2, column;
+        if (this->p.numProjection > 0) {
+            num2 = 2;
+            column = this->p.numProjection;
+        } else {
+            num2 = 1;
+            column = this->p.numOutput;
+        }
+        int factor = 0;
+        switch (this->p.mode) {
+            case RNN_LSTM:
+                factor = 4;
+                break;
+            case RNN_GRU:
+                factor = 3;
+                break;
+            case RNN_GRU_LBR:
+                factor = 3;
+                break;
+            default:
+                return NOT_SUPPORTED;
+        }
+        U32 filterRow = factor * column;
+        U32 filterCol = this->xDim + this->p.numOutput;
+        std::vector<TensorDesc> weight_desc(2), bias_desc(2);
+        weight_desc[0] = tensor2df(this->dt, DF_NK, filterRow, filterCol);
+        weight_desc[1] = tensor2df(this->dt, DF_NK, this->p.numOutput, this->p.numProjection);
+        bias_desc[0] = tensor1d(this->dt, filterRow);
+        bias_desc[1] = tensor1d(this->dt, this->p.numOutput);
+        this->weightTensors = std::vector<Tensor>(num1 * num2);
+        this->biasTensors = std::vector<Tensor>(num1 * num2);
+        for (int i = 0, id = 0; i < num1; i++) {
+            for (int j = 0; j < num2; j++, id++) {
+                this->weightTensors[id].resize(weight_desc[j]);
+                this->biasTensors[id].resize(bias_desc[j]);
+            }
+        }
+        return SUCCESS;
+    }
+};
+
+#endif // _RNNCELL_CPU_H
diff --git a/inference/engine/include/cpu/scale_cpu.hpp b/inference/engine/include/cpu/scale_cpu.hpp
new file mode 100644
index 00000000..3d4a7b0a
--- /dev/null
+++ b/inference/engine/include/cpu/scale_cpu.hpp
@@ -0,0 +1,109 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
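+
+// ScaleCPU applies a per-channel affine transform, roughly y = alpha * x + beta. With
+// model weights, alpha/beta come from weightTensors/biasTensors; with two inputs, the
+// non-data input serves as alpha and beta is unused. dataID records which input carries
+// the data when two tensors are fed.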
+
+#ifndef _SCALE_CPU_H
+#define _SCALE_CPU_H
+
+#include "scale.hpp"
+
+class ScaleCPU : public Scale {
+public:
+    ScaleCPU(DataType dt, ScaleParamSpec p, int numChannels) : Scale(dt, p, numChannels)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new ScaleCPU(this->dt, this->p, this->numChannels));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        int inputTensorNumber = this->inputTensors.size();
+        Tensor inputTensor = this->inputTensors[this->dataID];
+        Tensor outputTensor = this->outputTensors[0];
+        outputTensor.resize(inputTensor.get_desc());
+
+        void *alpha, *beta;
+        if (inputTensorNumber == 1) {
+            alpha = ((CpuMemory *)(this->weightTensors[0].get_memory()))->get_ptr();
+            beta = ((CpuMemory *)(this->biasTensors[0].get_memory()))->get_ptr();
+        } else {
+            alpha = ((CpuMemory *)(this->inputTensors[1 - this->dataID].get_memory()))->get_ptr();
+            beta = nullptr;
+        }
+        CHECK_STATUS(scale(inputTensor, alpha, beta, this->p, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        auto inDim = inTensors[0]->get_desc();
+        auto curOpWs = this->get_weightspec();
+        if (curOpWs.bytes_of_weight == bytesOf(curOpWs.mdt) ||
+            curOpWs.bytes_of_vec == bytesOf(curOpWs.mdt)) {
+            this->p.axis = 0;
+        }
+        I32 tmpAxis = (this->p.axis + inDim.nDims) % inDim.nDims;
+        tmpAxis = inDim.nDims - 1 - tmpAxis;
+        CHECK_REQUIREMENT(tmpAxis < (I32)inDim.nDims);
+        U32 ic = inDim.dims[tmpAxis];
+
+        if (0 != curOpWs.bytes_of_weight) {
+            this->numChannels = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt));
+        } else if (0 != curOpWs.bytes_of_vec) {
+            this->numChannels = curOpWs.bytes_of_vec / UNI_MAX(1, bytesOf(curOpWs.mdt));
+        } else {
+            this->numChannels = 0;
+        }
+
+        if (ic != numChannels && 0 != numChannels) {
+            UNI_ERROR_LOG("ScaleCPU input channel count mismatch: got %u channels but the "
+                          "weights expect %u. Channel padding may have been applied earlier.\n",
+                ic, numChannels);
+            CHECK_STATUS(NOT_SUPPORTED);
+            return NOT_SUPPORTED;
+        } else {
+            if (inTensors.size() > 1 &&
+                tensorNumElements(inTensors[1]->get_desc()) > tensorNumElements(inDim)) {
+                this->dataID = 1;
+            }
+        }
+
+        CHECK_STATUS(
+            scale_infer_output_size(inTensors[this->dataID], outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    EE infer_weight_desc() override
+    {
+        auto curOpWs = this->get_weightspec();
+        if (0 != curOpWs.bytes_of_weight) {
+            this->numChannels = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt));
+        } else if (0 != curOpWs.bytes_of_vec) {
+            this->numChannels = curOpWs.bytes_of_vec / UNI_MAX(1, bytesOf(curOpWs.mdt));
+        } else {
+            this->numChannels = 0;
+        }
+        this->weightTensors = std::vector<Tensor>(1);
+        this->weightTensors[0].resize(tensor1d(this->dt, numChannels));
+        this->biasTensors = std::vector<Tensor>(1);
+        this->biasTensors[0].resize(tensor1d(this->dt, numChannels));
+        return SUCCESS;
+    }
+};
+
+#endif // _SCALE_CPU_H
diff --git a/inference/engine/include/cpu/shape_cpu.hpp b/inference/engine/include/cpu/shape_cpu.hpp
new file mode 100644
index 00000000..be56eebb
--- /dev/null
+++ b/inference/engine/include/cpu/shape_cpu.hpp
@@ -0,0 +1,50 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _SHAPE_CPU_H
+#define _SHAPE_CPU_H
+
+#include "shape.hpp"
+
+class ShapeCPU : public Shape {
+public:
+    ShapeCPU() : Shape()
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem = std::shared_ptr<Operator>(new ShapeCPU());
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        TensorDesc inputDesc = inputTensor.get_desc();
+        Tensor outputTensor = this->outputTensors[0];
+        UNI_memcpy(((CpuMemory *)(outputTensor.get_memory()))->get_ptr(), inputDesc.dims,
+            inputDesc.nDims * sizeof(U32));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        TensorDesc inputDesc = inTensors[0]->get_desc();
+        TensorDesc outputDesc = tensor1d(DT_U32, inputDesc.nDims);
+        outTensors[0]->resize(outputDesc);
+        return SUCCESS;
+    }
+};
+
+#endif // _SHAPE_CPU_H
diff --git a/inference/engine/include/cpu/shared_weight_cpu.hpp b/inference/engine/include/cpu/shared_weight_cpu.hpp
new file mode 100644
index 00000000..26185ade
--- /dev/null
+++ b/inference/engine/include/cpu/shared_weight_cpu.hpp
@@ -0,0 +1,73 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
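+
+// SharedWeightCPU materializes a constant tensor as a graph output: the weight is either
+// copied out of the serialized model stream or shares the pointer from the weight spec,
+// and the named entry in tensorMapPtr is pointed at that storage via reuse().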
+
+#ifndef _SHARED_WEIGHT_CPU_H
+#define _SHARED_WEIGHT_CPU_H
+
+#include "shared_weight.hpp"
+
+class SharedWeightCPU : public SharedWeight {
+public:
+    SharedWeightCPU(DataType dt,
+        TensorDesc desc,
+        std::string outputTensorName,
+        std::map<std::string, std::shared_ptr<Tensor>> *tensorMapPtr)
+        : SharedWeight(dt, desc, outputTensorName, tensorMapPtr)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem = std::shared_ptr<Operator>(
+            new SharedWeightCPU(this->dt, this->desc, this->outputTensorName, this->tensorMapPtr));
+        *mem = *this;
+        return mem;
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        UNUSED(inTensors);
+        outTensors[0]->resize(this->desc);
+        return SUCCESS;
+    }
+
+    void run() override
+    {}
+
+    EE init_weight_bias_from_model(std::shared_ptr<U8> *modelPtrShared) override
+    {
+        U8 *modelPtr = nullptr;
+        if (modelPtrShared != nullptr) {
+            modelPtr = (*modelPtrShared).get();
+        }
+        TensorDesc weightDesc = this->desc;
+        Tensor modelWeightTensor;
+        modelWeightTensor.resize(weightDesc);
+        U32 weightBytes = modelWeightTensor.bytes();
+        if (modelPtr != nullptr) {
+            modelWeightTensor.alloc();
+            memcpy(
+                ((CpuMemory *)(modelWeightTensor.get_memory()))->get_ptr(), modelPtr, weightBytes);
+            *modelPtrShared = std::shared_ptr<U8>(*modelPtrShared, modelPtr + weightBytes);
+        } else {
+            auto curOpWs = this->get_weightspec();
+            ((CpuMemory *)(modelWeightTensor.get_memory()))
+                ->set_shared_ptr(std::shared_ptr<U8>(curOpWs.weight));
+        }
+        this->weightTensors.push_back(modelWeightTensor);
+        (*this->tensorMapPtr)[this->outputTensorName]->reuse(&(this->weightTensors[0]));
+        return SUCCESS;
+    }
+};
+
+#endif // _SHARED_WEIGHT_CPU_H
diff --git a/inference/engine/include/cpu/slice_cpu.hpp b/inference/engine/include/cpu/slice_cpu.hpp
new file mode 100644
index 00000000..e57da50f
--- /dev/null
+++ b/inference/engine/include/cpu/slice_cpu.hpp
@@ -0,0 +1,50 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
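+
+// SliceCPU splits one input into several outputs along an axis. After shape inference,
+// a degenerate 3-D output of shape (1, 1, n) is collapsed to a 2-D DF_NORMAL tensor.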
+
+#ifndef _SLICE_CPU_H
+#define _SLICE_CPU_H
+
+#include "slice.hpp"
+
+class SliceCPU : public Slice {
+public:
+    SliceCPU(DataType dt, SliceParamSpec p) : Slice(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem = std::shared_ptr<Operator>(new SliceCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        CHECK_STATUS(slice(inputTensors[0], this->p, outputTensors, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(slice_infer_output_size(inTensors[0], this->p, outTensors, &this->archInfo));
+        auto outDesc = outTensors[0]->get_desc();
+        if (outDesc.nDims == 3 && outDesc.dims[1] == 1 && outDesc.dims[2] == 1) {
+            outDesc.nDims = 2;
+            outDesc.df = DF_NORMAL;
+            outTensors[0]->resize(outDesc);
+        }
+        return SUCCESS;
+    }
+};
+
+#endif // _SLICE_CPU_H
diff --git a/inference/engine/include/cpu/softmax_cpu.hpp b/inference/engine/include/cpu/softmax_cpu.hpp
new file mode 100644
index 00000000..03bdf6fa
--- /dev/null
+++ b/inference/engine/include/cpu/softmax_cpu.hpp
@@ -0,0 +1,46 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _SOFTMAX_CPU_H
+#define _SOFTMAX_CPU_H
+
+#include "softmax.hpp"
+
+class SoftmaxCPU : public Softmax {
+public:
+    SoftmaxCPU(DataType dt, SoftmaxParamSpec p) : Softmax(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new SoftmaxCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        CHECK_STATUS(
+            softmax(inputTensors[0], this->p, this->temp, outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(softmax_infer_output_size(inTensors[0], outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+};
+
+#endif // _SOFTMAX_CPU_H
diff --git a/inference/engine/include/cpu/splice_cpu.hpp b/inference/engine/include/cpu/splice_cpu.hpp
new file mode 100644
index 00000000..2ef79256
--- /dev/null
+++ b/inference/engine/include/cpu/splice_cpu.hpp
@@ -0,0 +1,69 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _SPLICE_CPU_H
+#define _SPLICE_CPU_H
+
+#include "splice.hpp"
+
+class SpliceCPU : public Splice {
+public:
+    SpliceCPU(DataType dt, SpliceParamSpec p) : Splice(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new SpliceCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        TensorDesc inputDesc = inputTensor.get_desc();
+
+        // splice is realized as an embedding lookup over the input frames
+        EmbedParamSpec embedParamSpec;
+        embedParamSpec.input_dim = inputDesc.dims[1];
+        embedParamSpec.num_output = inputDesc.dims[0];
+        embedParamSpec.transpose = false;
+        CHECK_STATUS(embedding(
+            weightTensors[0], inputTensor, embedParamSpec, outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        auto inDim = inTensors[0]->get_desc();
+        CHECK_REQUIREMENT(this->p.outputDim % inDim.dims[0] == 0);
+        auto outDim = inDim;
+        outDim.dims[1] = this->p.numIndices / (this->p.outputDim / inDim.dims[0]);
+        outDim.dims[0] = this->p.outputDim;
+        outTensors[0]->resize(outDim);
+        return SUCCESS;
+    }
+
+    EE infer_weight_desc() override
+    {
+        auto curOpWs = this->get_weightspec();
+        if (curOpWs.weight != nullptr) {
+            Tensor weightTensor;
+            weightTensor.resize(tensor1d(DT_U32, this->p.numIndices));
+            this->weightTensors.push_back(weightTensor);
+        }
+        return SUCCESS;
+    }
+};
+
+#endif // _SPLICE_CPU_H
diff --git a/inference/engine/include/cpu/squeeze_cpu.hpp b/inference/engine/include/cpu/squeeze_cpu.hpp
new file mode 100644
index 00000000..28f14d72
--- /dev/null
+++ b/inference/engine/include/cpu/squeeze_cpu.hpp
@@ -0,0 +1,48 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _SQUEEZE_CPU_H
+#define _SQUEEZE_CPU_H
+
+#include "squeeze.hpp"
+
+class SqueezeCPU : public Squeeze {
+public:
+    SqueezeCPU(DataType dt, SqueezeParamSpec p) : Squeeze(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new SqueezeCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        // check the return status here as well, instead of silently dropping it
+        CHECK_STATUS(squeeze(inputTensor, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(
+            squeeze_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+};
+
+#endif // _SQUEEZE_CPU_H
diff --git a/inference/engine/include/cpu/tfslice_cpu.hpp b/inference/engine/include/cpu/tfslice_cpu.hpp
new file mode 100644
index 00000000..61828926
--- /dev/null
+++ b/inference/engine/include/cpu/tfslice_cpu.hpp
@@ -0,0 +1,48 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
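+
+// TfSliceCPU implements TensorFlow-style strided slicing (per-axis begin/end/stride),
+// delegating both shape inference and execution to the tfslice kernels.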
+
+#ifndef _TFSLICE_CPU_H
+#define _TFSLICE_CPU_H
+
+#include "tfslice.hpp"
+
+class TfSliceCPU : public TfSlice {
+public:
+    TfSliceCPU(DataType dt, TfSliceParamSpec p) : TfSlice(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new TfSliceCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(tfslice(inputTensor, this->p, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(
+            tfslice_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+};
+
+#endif // _TFSLICE_CPU_H
diff --git a/inference/engine/include/cpu/tile_cpu.hpp b/inference/engine/include/cpu/tile_cpu.hpp
new file mode 100644
index 00000000..63a887aa
--- /dev/null
+++ b/inference/engine/include/cpu/tile_cpu.hpp
@@ -0,0 +1,44 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _TILE_CPU_H
+#define _TILE_CPU_H
+
+#include "tile.hpp"
+
+class TileCPU : public Tile {
+public:
+    TileCPU(DataType dt, TileParamSpec p) : Tile(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem = std::shared_ptr<Operator>(new TileCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        CHECK_STATUS(tile(this->inputTensors[0], this->p, this->outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(tile_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+};
+
+#endif // _TILE_CPU_H
diff --git a/inference/engine/include/cpu/transpose_cpu.hpp b/inference/engine/include/cpu/transpose_cpu.hpp
new file mode 100644
index 00000000..77dfde43
--- /dev/null
+++ b/inference/engine/include/cpu/transpose_cpu.hpp
@@ -0,0 +1,49 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _TRANSPOSE_CPU_H
+#define _TRANSPOSE_CPU_H
+
+#include "transpose.hpp"
+
+class TransposeCPU : public Transpose {
+public:
+    TransposeCPU(DataType dt, TransposeParamSpec p) : Transpose(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new TransposeCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(transpose(inputTensor, this->p, this->temp, outputTensor, &this->archInfo));
+        outputTensor.set_scale(inputTensor.get_scale());
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(
+            transpose_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+};
+
+#endif // _TRANSPOSE_CPU_H
diff --git a/inference/engine/include/cpu/unsqueeze_cpu.hpp b/inference/engine/include/cpu/unsqueeze_cpu.hpp
new file mode 100644
index 00000000..39eeab2f
--- /dev/null
+++ b/inference/engine/include/cpu/unsqueeze_cpu.hpp
@@ -0,0 +1,49 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
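+
+// UnsqueezeCPU inserts size-1 dimensions at the axes listed in UnsqueezeParamSpec and,
+// like TransposeCPU above, forwards the input quantization scale to the output.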
+
+#ifndef _UNSQUEEZE_CPU_H
+#define _UNSQUEEZE_CPU_H
+
+#include "unsqueeze.hpp"
+
+class UnsqueezeCPU : public Unsqueeze {
+public:
+    UnsqueezeCPU(DataType dt, UnsqueezeParamSpec p) : Unsqueeze(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new UnsqueezeCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(unsqueeze(inputTensor, outputTensor, &this->archInfo));
+        outputTensor.set_scale(inputTensor.get_scale());
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(
+            unsqueeze_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+};
+
+#endif // _UNSQUEEZE_CPU_H
diff --git a/inference/engine/include/data_loader.hpp b/inference/engine/include/data_loader.hpp
new file mode 100644
index 00000000..ad1417d0
--- /dev/null
+++ b/inference/engine/include/data_loader.hpp
@@ -0,0 +1,43 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_DATA_LOADER
+#define _H_DATA_LOADER
+
+#include <string>
+#include "tensor_desc.h"
+#include "tensor.hpp"
+
+#ifdef _BUILD_TEST
+int string_end_with(std::string s, std::string sub);
+
+std::vector<Tensor> load_txt(std::string dataPath, std::vector<TensorDesc> dataDesc);
+
+std::vector<Tensor> load_bin(
+    std::string dataPath, std::vector<DataType> sourceDataType, std::vector<TensorDesc> dataDesc);
+
+std::vector<std::string> load_data(std::string directoryPath,
+    std::vector<TensorDesc> dataDesc,
+    std::vector<std::vector<Tensor>> *datas);
+
+std::vector<std::string> load_image_with_scale(std::string directoryPath,
+    std::vector<TensorDesc> dataDesc,
+    std::vector<std::vector<Tensor>> *datas,
+    ImageFormat ImageFormat,
+    F32 scaleValue);
+
+std::vector<std::string> load_bin_with_type(std::string directoryPath,
+    std::vector<TensorDesc> dataDesc,
+    std::vector<std::vector<Tensor>> *datas,
+    std::vector<DataType> sourceDataType);
+#endif
+#endif
diff --git a/inference/engine/include/deconvolution.hpp b/inference/engine/include/deconvolution.hpp
new file mode 100644
index 00000000..e2778310
--- /dev/null
+++ b/inference/engine/include/deconvolution.hpp
@@ -0,0 +1,45 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _DECONVOLUTION_H
+#define _DECONVOLUTION_H
+
+#include "weight_operator.hpp"
+
+class Deconvolution : public WeightOperator {
+public:
+    Deconvolution(DataType dt, ConvolutionParamSpec p, ActivationParamSpec activationDesc)
+    {
+        this->dt = dt;
+        this->p = p;
+        this->activationDesc = activationDesc;
+        this->hasBias = false;
+        this->alg = CONVOLUTION_ALGORITHM_NULL;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Deconvolution;
+    }
+
+public:
+    U32 numInputs;
+
+    ConvolutionParamSpec p;
+
+    ActivationParamSpec activationDesc;
+
+    ConvolutionForwardAlgorithm alg;
+};
+
+#endif // _DECONVOLUTION_H
diff --git a/inference/engine/include/depth2space.hpp b/inference/engine/include/depth2space.hpp
new file mode 100644
index 00000000..4ef72334
--- /dev/null
+++ b/inference/engine/include/depth2space.hpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
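+
+// Depth2Space stores only the parameter spec; device-specific subclasses implement
+// run(). Under the usual depth-to-space semantics, a (N, C*b*b, H, W) input becomes
+// (N, C, H*b, W*b) for block size b, e.g. (1, 16, 4, 4) -> (1, 4, 8, 8) with b = 2.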
+
+#ifndef _DEPTH2SPACE_H
+#define _DEPTH2SPACE_H
+
+#include "operator.hpp"
+
+class Depth2Space : public Operator {
+public:
+    Depth2Space(DataType dt, Depth2SpaceParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Depth2Space;
+    }
+
+protected:
+    Depth2SpaceParamSpec p;
+};
+
+#endif // _DEPTH2SPACE_H
diff --git a/inference/engine/include/detection_output.hpp b/inference/engine/include/detection_output.hpp
new file mode 100644
index 00000000..cdc92799
--- /dev/null
+++ b/inference/engine/include/detection_output.hpp
@@ -0,0 +1,57 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _DETECTION_OUTPUT_H
+#define _DETECTION_OUTPUT_H
+
+#include "operator.hpp"
+
+class DetectionOutput : public Operator {
+public:
+    DetectionOutput(DataType dt, DetectionOutputParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+    }
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new DetectionOutput(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_DetectionOutput;
+    }
+
+    void run() override
+    {
+        CHECK_STATUS(
+            detectionoutput(this->inputTensors, this->p, this->outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(
+            detectionoutput_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+protected:
+    DetectionOutputParamSpec p;
+};
+#endif // _DETECTION_OUTPUT_H
diff --git a/inference/engine/include/eltwise.hpp b/inference/engine/include/eltwise.hpp
new file mode 100644
index 00000000..3c3d5e34
--- /dev/null
+++ b/inference/engine/include/eltwise.hpp
@@ -0,0 +1,34 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _ELTWISE_H
+#define _ELTWISE_H
+
+#include "operator.hpp"
+
+class Eltwise : public Operator {
+public:
+    Eltwise(EltwiseParamSpec eltwiseDesc)
+    {
+        this->eltwiseDesc = eltwiseDesc;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Eltwise;
+    }
+
+protected:
+    EltwiseParamSpec eltwiseDesc;
+};
+#endif // _ELTWISE_H
diff --git a/inference/engine/include/embedding.hpp b/inference/engine/include/embedding.hpp
new file mode 100644
index 00000000..d2c657ae
--- /dev/null
+++ b/inference/engine/include/embedding.hpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _EMBEDDING_H
+#define _EMBEDDING_H
+
+#include "weight_operator.hpp"
+
+class Embedding : public WeightOperator {
+public:
+    Embedding(DataType dt, EmbedParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Embedding;
+    }
+
+protected:
+    EmbedParamSpec p;
+};
+
+#endif // _EMBEDDING_H
diff --git a/inference/engine/include/factory.hpp b/inference/engine/include/factory.hpp
new file mode 100644
index 00000000..504d24e5
--- /dev/null
+++ b/inference/engine/include/factory.hpp
@@ -0,0 +1,464 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _FACTORY_H
+#define _FACTORY_H
+
+#include "operator.hpp"
+
+#define NOT_SUPPORT \
+    Operator *cep = NULL; \
+    CHECK_STATUS(NOT_SUPPORTED);
+#define NOT_USE0()
+#define NOT_USE1(a1) \
+    { \
+        UNUSED(a1); \
+    }
+#define NOT_USE2(a1, a2) \
+    { \
+        NOT_USE1(a1) NOT_USE1(a2) \
+    }
+#define NOT_USE3(a1, a2, a3) \
+    { \
+        NOT_USE2(a1, a2) NOT_USE1(a3) \
+    }
+#define NOT_USE4(a1, a2, a3, a4) \
+    { \
+        NOT_USE2(a1, a2) NOT_USE2(a3, a4) \
+    }
+#define NOT_USE5(a1, a2, a3, a4, a5) \
+    { \
+        NOT_USE4(a1, a2, a3, a4) NOT_USE1(a5) \
+    }
+#define NOT_USE6(a1, a2, a3, a4, a5, a6) \
+    { \
+        NOT_USE4(a1, a2, a3, a4) NOT_USE2(a5, a6) \
+    }
+#define NOT_USE8(a1, a2, a3, a4, a5, a6, a7, a8) \
+    { \
+        NOT_USE4(a1, a2, a3, a4) NOT_USE4(a5, a6, a7, a8) \
+    }
+#define NOT_USE10(a1, a2, a3, a4, a5, a6, a7, a8, a9, aa) \
+    { \
+        NOT_USE8(a1, a2, a3, a4, a5, a6, a7, a8) NOT_USE2(a9, aa) \
+    }
+#define NOT_USE16(a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, ac, ad, ae, af, ag) \
+    { \
+        NOT_USE8(a1, a2, a3, a4, a5, a6, a7, a8) NOT_USE8(a9, aa, ab, ac, ad, ae, af, ag) \
+    }
+#define OP_UNSUP(num, ...) NOT_USE##num(__VA_ARGS__) NOT_SUPPORT
+
+class Factory {
+public:
+    virtual ~Factory()
+    {}
+
+    virtual std::shared_ptr<Operator> createConvolution(DataType dt,
+        ConvolutionParamSpec p,
+        ActivationParamSpec dwActivationParamSpec,
+        ActivationParamSpec pwActivationParamSpec) = 0;
+
+    virtual std::shared_ptr<Operator> createDeconvolution(
+        DataType dt, ConvolutionParamSpec p, ActivationParamSpec activationDesc) = 0;
+
+    virtual std::shared_ptr<Operator> createPooling(PoolingParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createFullyConnected(
+        DataType dt, FullyConnectedParamSpec p, U32 numInput) = 0;
+
+    virtual std::shared_ptr<Operator> createSoftmax(DataType dt, SoftmaxParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createConcat(ConcatParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createActivation(ActivationParamSpec activationDesc) = 0;
+
+    virtual std::shared_ptr<Operator> createEltwise(EltwiseParamSpec eltwiseDesc) = 0;
+
+    virtual std::shared_ptr<Operator> createScale(
+        DataType dt, ScaleParamSpec p, int numChannels) = 0;
+
+    virtual std::shared_ptr<Operator> createRNN(DataType dt, RNNParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createRNNCell(DataType dt, RNNParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createEmbedding(DataType dt, EmbedParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createPower(DataType dt, PowerParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createMatMul(DataType dt, MatMulParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createLayerNorm(DataType dt, U32 weightNum) = 0;
+
+    virtual std::shared_ptr<Operator> createReshape(DataType dt, ReshapeParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createResize(DataType paramDT, ResizeParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createSlice(DataType dt, SliceParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createTranspose(DataType dt, TransposeParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createAttention(DataType dt, AttentionParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createClip(DataType dt, ClipParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createSqueeze(DataType dt, SqueezeParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createUnsqueeze(DataType dt, UnsqueezeParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createReduction(DataType dt, ReductionParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createArgMax(DataType dt, ArgMaxParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createCopy(DataType dt, CopyParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createCheck(DataType dt, CheckParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createRepeat(
+        DataType dt, RepeatParamSpec p, I32 jumpOperatorIndex, I32 currentOperatorIndex) = 0;
+
+    virtual std::shared_ptr<Operator> createBilateralSliceApply(BilateralSliceApplyParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createPreAllocatedMemory(DataType dt, TensorDesc desc) = 0;
+
+    virtual std::shared_ptr<Operator> createSharedWeight(DataType dt,
+        TensorDesc desc,
+        std::string outputTensorName,
+        std::map<std::string, std::shared_ptr<Tensor>> *tensorMapPtr) = 0;
+
+    virtual std::shared_ptr<Operator> createJump(
+        DataType dt, I32 jumpOperatorIndex, I32 currentOperatorIndex) = 0;
+
+    virtual std::shared_ptr<Operator> createSpace2Depth(DataType dt) = 0;
+
+    virtual std::shared_ptr<Operator> createDepth2Space(DataType dt, Depth2SpaceParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createAttentionMask(DataType dt, AttentionMaskParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createRelativePositionEmbedding(
+        DataType dt, EmbedParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createRelativeShift(DataType dt, RelativeShiftParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createPadding(DataType dt, PadParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createPReLU(DataType dt) = 0;
+
+    virtual std::shared_ptr<Operator> createPriorBox(DataType dt, PriorBoxParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createDetectionOutput(
+        DataType dt, DetectionOutputParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createYolov3DetectionOutput(
+        DataType dt, Yolov3DetectionOutputParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createChannelResize(DataType dt, ChannelResizeParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createL2Normalization(DataType dt) = 0;
+
+    virtual std::shared_ptr<Operator> createTile(DataType dt, TileParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createTfSlice(DataType dt, TfSliceParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createSplice(DataType dt, SpliceParamSpec p) = 0;
+
+    virtual std::shared_ptr<Operator> createShape() = 0;
+
+    std::shared_ptr<Operator> createOperators(OperatorSpec curOps,
+        DataType dt,
+        std::map<std::string, U32> operatorIndexMap,
+        std::map<std::string, std::shared_ptr<Tensor>> *tensorMapPtr,
+        std::vector<std::string> inputTensorsName,
+        std::vector<std::string> outputTensorsName,
+        std::set<std::string> *weightOpOutputNames)
+    {
+        OperatorType opType = curOps.type;
+        DataType dtNoQ = (dt == DT_F16_8Q) ? DT_F16 : dt;
+        std::string opName = curOps.name;
+        std::shared_ptr<Operator> op;
+        auto curPs = curOps.ps;
+        switch (opType) {
+            case OT_Conv: {
+                ActivationParamSpec dwActiveDesc;
+                ActivationParamSpec pwActiveDesc;
+                dwActiveDesc.mode = curPs.conv_spec.dw_activation_type;
+                pwActiveDesc.mode = curPs.conv_spec.pw_activation_type;
+                dwActiveDesc.value[0] = 0;
+                pwActiveDesc.value[0] = 0;
+                op = createConvolution(dt, curPs.conv_spec, dwActiveDesc, pwActiveDesc);
+                break;
+            }
+            case OT_Deconvolution: {
+                ActivationParamSpec activeDesc;
+                activeDesc.mode = curPs.conv_spec.pw_activation_type;
+                activeDesc.value[0] = 0;
+                op = createDeconvolution(dtNoQ, curPs.conv_spec, activeDesc);
+                break;
+            }
+            case OT_FC: {
+                op = createFullyConnected(dt, curPs.fc_spec, 0);
+                break;
+            }
+            case OT_Pooling: {
+                op = createPooling(curPs.pooling_spec);
+                break;
+            }
+            case OT_Softmax: {
+                op = createSoftmax(dtNoQ, curPs.softmax_spec);
+                break;
+            }
+            case OT_Relu: {
+                ActivationParamSpec activationDesc;
+                activationDesc.mode = ACTIVATION_RELU;
+                activationDesc.value[0] = curOps.ps.relu_spec.neg_slope;
+                op = createActivation(activationDesc);
+                break;
+            }
+            case OT_Relu6: {
+                ActivationParamSpec activationDesc;
+                activationDesc.mode = ACTIVATION_RELU6;
+                op = createActivation(activationDesc);
+                break;
+            }
+            case OT_HSwish: {
+                ActivationParamSpec activationDesc;
+                activationDesc.mode = ACTIVATION_H_SWISH;
+                op = createActivation(activationDesc);
+                break;
+            }
+            case OT_Sigmoid: {
+                ActivationParamSpec activationDesc;
+                activationDesc.mode = ACTIVATION_SIGMOID;
+                op = createActivation(activationDesc);
+                break;
+            }
+            case OT_HSigmoid: {
+                ActivationParamSpec activationDesc;
+                activationDesc.mode = ACTIVATION_H_SIGMOID;
+                op = createActivation(activationDesc);
+                break;
+            }
+            case OT_Gelu: {
+                ActivationParamSpec activationDesc;
+                activationDesc.mode = ACTIVATION_GELU;
+                op = createActivation(activationDesc);
+                break;
+            }
+            case OT_TanH: {
+                ActivationParamSpec activationDesc;
+                activationDesc.mode = ACTIVATION_TANH;
+                op = createActivation(activationDesc);
+                break;
+            }
+            case OT_Mish: {
+                ActivationParamSpec activationDesc;
+                activationDesc.mode = ACTIVATION_MISH;
+                op = createActivation(activationDesc);
+                break;
+            }
+            case OT_Greater: {
+                ActivationParamSpec activationDesc;
+                activationDesc.mode = ACTIVATION_GREATER;
+                op = createActivation(activationDesc);
+                break;
+            }
+            case OT_Concat: {
+                op = createConcat(curPs.concat_spec);
+                break;
+            }
+            case OT_Eltwise: {
+                op = createEltwise(curOps.ps.eltwise_spec);
+                break;
+            }
+            case OT_Embedding: {
+                op = createEmbedding(dtNoQ, curPs.embed_spec);
+                break;
+            }
+            case OT_MatMul: {
+                op = createMatMul(dt, curPs.matmul_spec);
+                break;
+            }
+            case OT_Power: {
+                op = createPower(dt, curPs.power_spec);
+                break;
+            }
+            case OT_Scale: {
+                op = createScale(dtNoQ, curPs.scale_spec, 0);
+                break;
+            }
+            case OT_LayerNorm: {
+                op = createLayerNorm(dt, 0);
+                break;
+            }
+            case OT_Reshape: {
+                op = createReshape(dt, curPs.reshape_spec);
+                break;
+            }
+            case OT_Resize: {
+                if (curPs.resize_spec.num_sizes > 0) {
+                    op = createResize(DT_U32, curPs.resize_spec);
+                } else {
+                    CHECK_REQUIREMENT(curPs.resize_spec.num_scales == 4);
+                    op = createResize(DT_F32, curPs.resize_spec);
+                }
+                break;
+            }
+            case OT_Slice: {
+                op = createSlice(dt, curPs.slice_spec);
+                break;
+            }
+            case OT_Transpose: {
+                op = createTranspose(dt, curPs.transpose_spec);
+                break;
+            }
+            case OT_Attention: {
+                op = createAttention(dtNoQ, curPs.attention_spec);
+                break;
+            }
+            case OT_Clip: {
+                op = createClip(dtNoQ, curPs.clip_spec);
+                break;
+            }
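+            // rnn_spec.steps >= 0 selects the full-sequence RNN below, while a
+            // negative step count maps to a single-step RNNCell (typically driven
+            // by Repeat/Jump control flow).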
+ case OT_RNN: { + if (curPs.rnn_spec.steps >= 0) { + op = createRNN(dt, curPs.rnn_spec); + } else { + op = createRNNCell(dt, curPs.rnn_spec); + } + break; + } + case OT_Squeeze: { + op = createSqueeze(dtNoQ, curPs.squeeze_spec); + break; + } + case OT_Unsqueeze: { + op = createUnsqueeze(dtNoQ, curPs.unsqueeze_spec); + break; + } + case OT_Reduction: { + op = createReduction(dtNoQ, curPs.reduction_spec); + break; + } + case OT_ArgMax: { + op = createArgMax(dtNoQ, curPs.argmax_spec); + break; + } + case OT_PreAllocatedMemory: { + PreAllocatedMemoryParamSpec curPreAllocatedMemoryParamSpec = + curOps.ps.preallocated_memory_spec; + TensorDesc desc = curPreAllocatedMemoryParamSpec.desc; + op = createPreAllocatedMemory(dtNoQ, desc); + break; + } + case OT_SharedWeight: { + SharedWeightParamSpec curSharedWeightParamSpec = curOps.ps.shared_weight_spec; + TensorDesc desc = curSharedWeightParamSpec.desc; + op = createSharedWeight(dtNoQ, desc, outputTensorsName[0], tensorMapPtr); + weightOpOutputNames->insert(outputTensorsName[0]); + break; + } + case OT_Repeat: { + op = createRepeat(dtNoQ, curPs.repeat_spec, operatorIndexMap[inputTensorsName[0]], + operatorIndexMap[opName]); + break; + } + case OT_Check: { + op = createCheck(dtNoQ, curPs.check_spec); + break; + } + case OT_Copy: { + op = createCopy(dtNoQ, curPs.copy_spec); + break; + } + case OT_BilateralSliceApply: { + op = createBilateralSliceApply(curPs.bilateral_slice_apply_spec); + break; + } + case OT_Jump: { + op = createJump( + dtNoQ, operatorIndexMap[inputTensorsName[0]], operatorIndexMap[opName]); + break; + } + case OT_Space2Depth: { + op = createSpace2Depth(dt); + break; + } + case OT_Depth2Space: { + op = createDepth2Space(dt, curPs.depth2space_spec); + break; + } + case OT_AttentionMask: { + op = createAttentionMask(dt, curPs.attention_mask_spec); + break; + } + case OT_RelativePositionEmbedding: { + op = createRelativePositionEmbedding(dtNoQ, curPs.embed_spec); + break; + } + case OT_RelativeShift: { + op = createRelativeShift(dt, curPs.relative_shift_spec); + break; + } + case OT_Pad: { + op = createPadding(dt, curPs.pad_spec); + break; + } + case OT_PriorBox: { + op = createPriorBox(dt, curPs.prior_box_spec); + break; + } + case OT_DetectionOutput: { + op = createDetectionOutput(dt, curPs.detection_output_spec); + break; + } + case OT_Yolov3DetectionOutput: { + op = createYolov3DetectionOutput(dt, curPs.yolov3_detection_output_spec); + break; + } + case OT_ChannelResize: { + op = createChannelResize(dt, curPs.channel_resize_spec); + break; + } + case OT_L2Normalization: { + op = createL2Normalization(dt); + break; + } + case OT_PRelu: { + op = createPReLU(dt); + break; + } + case OT_Tile: { + op = createTile(dt, curPs.tile_spec); + break; + } + case OT_TfSlice: { + op = createTfSlice(dt, curPs.tfslice_spec); + break; + } + case OT_Splice: { + op = createSplice(dt, curPs.splice_spec); + break; + } + case OT_Shape: { + op = createShape(); + break; + } + default: { + UNI_ERROR_LOG("unsupported layer %s\n", OperatorTypeName()[opType]); + break; + } + } + return op; + } +}; + +#endif // _FACTORY_H diff --git a/inference/engine/include/fully_connected.hpp b/inference/engine/include/fully_connected.hpp new file mode 100644 index 00000000..1f563e69 --- /dev/null +++ b/inference/engine/include/fully_connected.hpp @@ -0,0 +1,40 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _FULLY_CONNECTED_H +#define _FULLY_CONNECTED_H + +#include "weight_operator.hpp" + +class FullyConnected : public WeightOperator { +public: + FullyConnected(DataType dt, FullyConnectedParamSpec p, U32 numInput) + { + this->dt = dt; + this->p = p; + this->numInput = numInput; + this->hasBias = false; + } + + OperatorType get_type() override + { + return OT_FC; + } + +public: + U32 numInput; + + FullyConnectedParamSpec p; +}; + +#endif // _FULLY_CONNECTED_H diff --git a/inference/engine/include/inference.hpp b/inference/engine/include/inference.hpp new file mode 100644 index 00000000..a5944c49 --- /dev/null +++ b/inference/engine/include/inference.hpp @@ -0,0 +1,84 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
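+
+// inference.hpp assembles an executable pipeline from a deserialized ModelSpec:
+// operators are sorted and instantiated, the algorithm map is loaded, and all
+// feature-map and weight memory is laid out before the CNN is handed back.
+// A minimal usage sketch (the model path here is an illustrative placeholder):
+//
+//     std::shared_ptr<CNN> pipeline =
+//         createPipeline("CPU_AFFINITY_HIGH_PERFORMANCE", "./model_f16.bolt");
+//     // bind input tensors, then:
+//     pipeline->run();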
+
+#ifndef _HPP_INFERENCE
+#define _HPP_INFERENCE
+
+#include "cnn.h"
+#ifdef _USE_MALI
+#include "gcl.h"
+#endif
+#ifdef _BUILD_TEST
+#include "sequential.hpp"
+#endif
+#include "thread_affinity.h"
+#include "op_type.h"
+#include "model_tools.h"
+#include "model_serialize_deserialize.hpp"
+
+inline std::map<std::string, TensorDesc> extractInputDims(const ModelSpec *ms)
+{
+    std::map<std::string, TensorDesc> inputDescMap;
+    int inputNum = ms->num_inputs;
+    for (int i = 0; i < inputNum; i++) {
+        inputDescMap[ms->input_names[i]] = ms->input_dims[i];
+    }
+    return inputDescMap;
+}
+
+inline std::shared_ptr<CNN> createPipelinefromMs(
+    const char *affinityPolicyName, ModelSpec *ms, const char *algorithmMapPath)
+{
+    AffinityPolicy affinityPolicy = thread_affinity_get_policy_by_name(affinityPolicyName);
+    CNN *cnn = new CNN(affinityPolicy, ms->dt, ms->model_name);
+
+    cnn->sort_operators_sequential(ms);
+
+    // create ops
+    cnn->initialize_ops(ms);
+
+    std::map<std::string, TensorDesc> inputDescMap = extractInputDims(ms);
+
+    cnn->loadAlgorithmMapFromText(algorithmMapPath);
+
+    // assign space for output, tmp, bias, and trans_weight
+    cnn->ready(inputDescMap);
+
+    CHECK_STATUS(cnn->mark_input_output());
+
+    return std::shared_ptr<CNN>(cnn);
+}
+
+inline std::shared_ptr<CNN> createPipeline(
+    const char *affinityPolicyName, const char *modelPath, const char *algorithmMapPath = "")
+{
+    // deserialize model from file
+    ModelSpec ms;
+    CHECK_STATUS(deserialize_model_from_file(modelPath, &ms));
+
+    std::shared_ptr<CNN> pipeline = createPipelinefromMs(affinityPolicyName, &ms, algorithmMapPath);
+
+    CHECK_STATUS(mt_destroy_model(&ms));
+    return pipeline;
+}
+
+#ifdef _BUILD_TEST
+inline Sequential createSequentialPipeline(
+    const char *affinityPolicyName, DataType dt, const char *modelName)
+{
+    AffinityPolicy affinityPolicy = thread_affinity_get_policy_by_name(affinityPolicyName);
+    auto sequential = Sequential(affinityPolicy, dt, modelName);
+    return sequential;
+}
+#endif
+#endif
diff --git a/inference/engine/include/jump.hpp b/inference/engine/include/jump.hpp
new file mode 100644
index 00000000..2932217b
--- /dev/null
+++ b/inference/engine/include/jump.hpp
@@ -0,0 +1,76 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
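+
+// Jump is the low-level goto of the engine: run() does nothing, and the scheduler
+// instead consults get_next_operator_index(). With a single input the jump is
+// unconditional; an optional second input serves as a condition tensor, and the
+// jump is taken when any of its elements is nonzero.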
+
+#ifndef _JUMP_H
+#define _JUMP_H
+
+#include "operator.hpp"
+
+class Jump : public Operator {
+public:
+    Jump(DataType dt, I32 jumpOperatorIndex, I32 currentOperatorIndex)
+    {
+        this->dt = dt;
+        this->jumpOperatorIndex = jumpOperatorIndex;
+        this->nextOperatorIndex = currentOperatorIndex + 1;
+    }
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem = std::shared_ptr<Operator>(
+            new Jump(this->dt, this->jumpOperatorIndex, this->nextOperatorIndex));
+        *mem = *this;
+        return mem;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Jump;
+    }
+
+    void run() override
+    {}
+
+    int get_next_operator_index() override
+    {
+        // check status
+        if (this->inputTensors.size() > 1) {
+            Tensor inputTensor = this->inputTensors[1];
+            I32 *ptr = (I32 *)((CpuMemory *)(inputTensor.get_memory()))->get_ptr();
+            U32 length = inputTensor.length();
+            for (U32 i = 0; i < length; i++) {
+                if (ptr[i]) {
+                    return this->jumpOperatorIndex;
+                }
+            }
+        }
+        return this->nextOperatorIndex;
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        UNUSED(inTensors);
+        auto outDim = outTensors[0]->get_desc();
+        outDim.dt = this->dt;
+        outDim.nDims = 0;
+        outTensors[0]->resize(outDim);
+        return SUCCESS;
+    }
+
+private:
+    int jumpOperatorIndex;
+    int nextOperatorIndex;
+};
+
+#endif // _JUMP_H
diff --git a/inference/engine/include/l2normalization.hpp b/inference/engine/include/l2normalization.hpp
new file mode 100644
index 00000000..9916cd87
--- /dev/null
+++ b/inference/engine/include/l2normalization.hpp
@@ -0,0 +1,32 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _L2NORMALIZATION_H
+#define _L2NORMALIZATION_H
+
+#include "operator.hpp"
+
+class L2Normalization : public Operator {
+public:
+    L2Normalization(DataType dt)
+    {
+        this->dt = dt;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_L2Normalization;
+    }
+};
+
+#endif // _L2NORMALIZATION_H
diff --git a/inference/engine/include/layer_norm.hpp b/inference/engine/include/layer_norm.hpp
new file mode 100644
index 00000000..f9e27ac0
--- /dev/null
+++ b/inference/engine/include/layer_norm.hpp
@@ -0,0 +1,37 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _LAYER_NORM_H +#define _LAYER_NORM_H + +#include "operator.hpp" + +class LayerNorm : public WeightOperator { +public: + LayerNorm(DataType dt, U32 weightNum) + { + this->dt = dt; + this->weightNum = weightNum; + this->hasBias = false; + } + + OperatorType get_type() override + { + return OT_LayerNorm; + } + +protected: + U32 weightNum; +}; + +#endif // _LAYER_NORM_H diff --git a/inference/engine/include/matmul.hpp b/inference/engine/include/matmul.hpp new file mode 100644 index 00000000..ee193d04 --- /dev/null +++ b/inference/engine/include/matmul.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _MATMUL_H +#define _MATMUL_H + +#include "operator.hpp" + +class MatMul : public Operator { +public: + MatMul(DataType dt, MatMulParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_MatMul; + } + +protected: + MatMulParamSpec p; +}; + +#endif // _MATMUL_H diff --git a/inference/engine/include/memory_tracker.hpp b/inference/engine/include/memory_tracker.hpp new file mode 100644 index 00000000..19f05a65 --- /dev/null +++ b/inference/engine/include/memory_tracker.hpp @@ -0,0 +1,115 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _MEMORY_TRACKER_H
+#define _MEMORY_TRACKER_H
+
+#include "tensor_desc.h"
+
+class MemoryTracker {
+public:
+    MemoryTracker()
+    {
+        this->storageSize.clear();
+        this->tensorStoragePosition.clear();
+        this->memoryNeedAssign = true;
+    }
+
+    void trackOpTensorSizes(std::shared_ptr<Operator> op, std::vector<std::string> tensorNames)
+    {
+        I32 *pos = op->get_tensor_positions().data();
+        auto inputTensors = op->get_input_tensors();
+        auto outputTensors = op->get_output_tensors();
+        size_t numInput = inputTensors.size();
+        size_t numOutput = outputTensors.size();
+        for (size_t i = 0; i < numInput; i++) {
+            U32 size = inputTensors[i].bytes();
+            I32 slot = pos[i];
+            this->tensorStoragePosition[tensorNames[i]] = slot;
+            if (-1 == slot) {
+                if (!memoryNeedAssign) {
+                    if (size > inputTensors[i].capacity()) {
+                        this->memoryNeedAssign = true;
+                    }
+                }
+                continue;
+            }
+            this->trackSlotSize(slot, size);
+        }
+        for (size_t i = 0; i < numOutput; i++) {
+            U32 size = outputTensors[i].bytes();
+            I32 slot = pos[numInput + i];
+            this->tensorStoragePosition[tensorNames[numInput + i]] = slot;
+            if (-1 == slot) {
+                if (!memoryNeedAssign) {
+                    if (size > outputTensors[i].capacity()) {
+                        this->memoryNeedAssign = true;
+                    }
+                }
+                continue;
+            }
+            this->trackSlotSize(slot, size);
+        }
+    }
+
+    I32 getSlotByTensorName(std::string name)
+    {
+        return tensorStoragePosition[name];
+    }
+
+    U32 getNumSlots()
+    {
+        return this->storageSize.size();
+    }
+
+    U32 getSizeSum()
+    {
+        U32 sum = 0;
+        for (U32 size : this->storageSize) {
+            sum += size;
+        }
+        return sum;
+    }
+
+    std::vector<U32> getStorageSize()
+    {
+        return this->storageSize;
+    }
+
+    void setMemoryAssigned()
+    {
+        this->memoryNeedAssign = false;
+    }
+
+    bool getMemoryNeedAssign()
+    {
+        return this->memoryNeedAssign;
+    }
+
+protected:
+    void trackSlotSize(I32 slot, U32 size)
+    {
+        if (slot >= (I32)this->storageSize.size()) {
+            this->storageSize.resize(slot + 1, 0);
+        }
+        if (size > this->storageSize[slot]) {
+            this->storageSize[slot] = size;
+            this->memoryNeedAssign = true;
+        }
+    }
+
+    std::vector<U32> storageSize;
+    std::map<std::string, I32> tensorStoragePosition;
+    bool memoryNeedAssign;
+};
+#endif
diff --git a/inference/engine/include/model.hpp b/inference/engine/include/model.hpp
new file mode 100644
index 00000000..6e00c69c
--- /dev/null
+++ b/inference/engine/include/model.hpp
@@ -0,0 +1,196 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _MODEL_H
+#define _MODEL_H
+
+#include "thread_affinity.h"
+#include "operator.hpp"
+#include "algorithm_map.h"
+#include "tensor_desc.h"
+#ifdef _USE_MALI
+#include "gcl.h"
+#endif
+#include "profiling.h"
+
+class Model {
+public:
+    Model()
+    {}
+    Model(AffinityPolicy affinityPolicy, DataType dt, std::string name)
+    {
+        this->set_device_info(affinityPolicy);
+        this->dt = dt;
+        this->name = name;
+        std::string deviceName = "";
+#ifdef _USE_MALI
+        if (this->deviceInfo.schedule == MALI) {
+            this->handle = OCLContext::getInstance().handle;
+            deviceName = this->handle->deviceName;
+        }
+#endif
+        algorithmMap = std::shared_ptr<AlgorithmMap>(
+            new AlgorithmMap(this->deviceInfo.schedule, name, deviceName, dt));
+    }
+
+    void set_device_info(AffinityPolicy affinityPolicy)
+    {
+#ifndef _USE_IOS
+        this->deviceInfo = get_cpu_info(affinityPolicy);
+        this->set_runtime_device_dynamic();
+#else
+        this->deviceInfo.affinityPolicy = affinityPolicy;
+        this->deviceInfo.schedule = ARM_A76;
+#endif
+    }
+
+    void set_runtime_device(int cpuId, int threadId = 0)
+    {
+        this->set_runtime_device(cpuId, this->deviceInfo.archs[cpuId], threadId);
+    }
+
+    void set_runtime_device(int cpuId, Arch arch, int threadId = 0)
+    {
+        this->deviceInfo.schedule = arch;
+        if (cpuId >= 0 && cpuId < this->deviceInfo.cpuNum) {
+            set_thread_affinity(threadId, &cpuId, 1);
+            for (auto op : ops) {
+                op->set_schedule(this->deviceInfo.schedule);
+            }
+        }
+    }
+
+    void set_runtime_device_dynamic(int threadId = 0)
+    {
+        set_cpu_dynamic(&this->deviceInfo, threadId);
+    }
+
+    Arch get_runtime_device()
+    {
+        return this->deviceInfo.schedule;
+    }
+
+    virtual EE infer_output_tensors_size(std::map<std::string, TensorDesc>) = 0;
+    virtual void assign_output_tensor() = 0;
+    virtual void infer_tmp_memory_size() = 0;
+    virtual void assign_tmp_tensor() = 0;
+
+    virtual void ready(std::map<std::string, TensorDesc> inputDescMap)
+    {
+        infer_output_tensors_size(inputDescMap);
+        assign_output_tensor();
+
+        infer_tmp_memory_size();
+        assign_tmp_tensor();
+    }
+
+    virtual void run() = 0;
+
+#ifdef _USE_INT8
+    virtual U32 find_next_dynamic_scale_op(std::vector<U32> calibratedOpIdx, U32 startIdx)
+    {
+        CHECK_REQUIREMENT(startIdx < this->ops.size());
+        for (U32 i = startIdx; i < this->ops.size();) {
+            auto op = this->ops[i];
+            if (op->is_dynamic_scale()) {
+                bool calibrated = false;
+                for (auto idx : calibratedOpIdx) {
+                    if (i == idx) {
+                        calibrated = true;
+                        break;
+                    }
+                }
+                if (!calibrated) {
+                    return i;
+                }
+            }
+
+            if (op->get_type() == OT_Repeat || op->get_type() == OT_Jump) {
+                i = op->get_next_operator_index();
+            } else {
+                i++;
+            }
+        }
+
+        return 0;  // The first layer should never be quantized
+    }
+
+    virtual std::shared_ptr<Operator> get_operator_by_index(U32 index)
+    {
+        return this->ops[index];
+    }
+
+    virtual void run_till_breakpoint(U32 opIdx)
+    {
+        CHECK_REQUIREMENT(MALI != this->deviceInfo.schedule);
+        for (U32 i = 0; i < this->ops.size();) {
+            auto op = this->ops[i];
+            if (op->get_type() == OT_Repeat || op->get_type() == OT_Jump) {
+                if (opIdx == i) {
+                    break;
+                }
+                i = op->get_next_operator_index();
+            } else {
+                op->run();
+                if (opIdx == i) {
+                    break;
+                }
+                i++;
+            }
+        }
+    }
+#endif
+
+    virtual bool checkOperator()
+    {
+        for (auto op : this->ops) {
+            if (!op->checkOperator()) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    std::string get_name()
+    {
+        return this->name;
+    }
+
+    void loadAlgorithmMapFromFileStream(const char *algoFileStream)
+    {
+        this->algorithmMap->loadAlgorithmMapFromFileStream(algoFileStream);
+    }
+
+    void loadAlgorithmMapFromText(std::string algorithmMapPath)
+    {
+        this->algorithmMap->loadAlgorithmMapFromText(algorithmMapPath);
+    }
+
+    void saveAlgorithmMapToText(std::string algorithmMapPath)
+    {
+        this->algorithmMap->saveAlgorithmMapToText(algorithmMapPath);
+    }
+
+protected:
+    std::vector<std::shared_ptr<Operator>> ops;
+    DeviceInfo deviceInfo;
+    DataType dt;
+#ifdef _USE_MALI
+    std::shared_ptr<GCLHandle> handle;
+#endif
+    std::shared_ptr<AlgorithmMap> algorithmMap;
+
+private:
+    std::string name;
+};
+#endif
diff --git a/inference/engine/include/ocl/activation_ocl.hpp b/inference/engine/include/ocl/activation_ocl.hpp
new file mode 100644
index 00000000..d7aaa5af
--- /dev/null
+++ b/inference/engine/include/ocl/activation_ocl.hpp
@@ -0,0 +1,55 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
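+
+// Each *_ocl.hpp wrapper below follows the same pattern: the constructor registers
+// the MALI arch info, infer_output_tensors_size() marks the kernel vector for
+// rebuild via needSetKernelVec, run_prepare() records the OpenCL kernels for the
+// operator, and REGISTER_OCL_OPERATOR_RUN supplies the shared run() plumbing.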
+
+#ifndef _ACTIVATION_OCL_H
+#define _ACTIVATION_OCL_H
+
+#include "activation.hpp"
+
+class ActivationOCL : public Activation {
+public:
+    ActivationOCL(ActivationParamSpec activationDesc) : Activation(activationDesc)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~ActivationOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new ActivationOCL(this->activationDesc));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(activation(inputTensor, this->activationDesc, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        CHECK_STATUS(activation_infer_output_size(inTensors[0], outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _ACTIVATION_OCL_H
diff --git a/inference/engine/include/ocl/argmax_ocl.hpp b/inference/engine/include/ocl/argmax_ocl.hpp
new file mode 100644
index 00000000..329fb93b
--- /dev/null
+++ b/inference/engine/include/ocl/argmax_ocl.hpp
@@ -0,0 +1,66 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
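+
+// ArgMaxOCL additionally overrides infer_tmp_memory_size(): the OpenCL argmax
+// kernel may reduce in several passes, so run_prepare() hands it this->temp as
+// scratch space.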
+
+#ifndef _ARGMAX_OCL_H
+#define _ARGMAX_OCL_H
+
+#include "argmax.hpp"
+
+class ArgMaxOCL : public ArgMax {
+public:
+    ArgMaxOCL(DataType dt, ArgMaxParamSpec p) : ArgMax(dt, p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~ArgMaxOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new ArgMaxOCL(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(argmax(inputTensor, this->p, this->temp, outputTensor, &this->archInfo));
+    }
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        CHECK_STATUS(
+            argmax_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        U32 bytes = 0;
+        CHECK_STATUS(argmax_infer_forward_tmp_bytes(
+            inputTensor, this->p, outputTensor, &bytes, &this->archInfo));
+        return bytes;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _ARGMAX_OCL_H
diff --git a/inference/engine/include/ocl/bilateral_slice_apply_ocl.hpp b/inference/engine/include/ocl/bilateral_slice_apply_ocl.hpp
new file mode 100644
index 00000000..b2d3150f
--- /dev/null
+++ b/inference/engine/include/ocl/bilateral_slice_apply_ocl.hpp
@@ -0,0 +1,87 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
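+
+// BilateralSliceApplyOCL consumes an input image, a bilateral grid, and a guide
+// map. In BSliceApply_NULL mode the guide arrives as a third input tensor;
+// otherwise infer_output_tensors_size() sizes an internal single-channel DT_F16
+// guide tensor to the input's spatial dimensions.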
+
+#ifndef _BILATERAL_SLICE_APPLY_OCL_H
+#define _BILATERAL_SLICE_APPLY_OCL_H
+
+#include "bilateral_slice_apply.hpp"
+
+class BilateralSliceApplyOCL : public BilateralSliceApply {
+public:
+    BilateralSliceApplyOCL(BilateralSliceApplyParamSpec p) : BilateralSliceApply(p)
+    {
+        this->guideTensor = Tensor(OCLMem);
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~BilateralSliceApplyOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new BilateralSliceApplyOCL(this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor gridTensor = this->inputTensors[1];
+        Tensor outputTensor = this->outputTensors[0];
+
+        if (this->p.mode == BSliceApply_NULL) {
+            this->guideTensor = this->inputTensors[2];
+        }
+        CHECK_STATUS(bilateral_slice_apply(
+            inputTensor, guideTensor, gridTensor, p, this->temp, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        auto inTensor = inTensors[0];
+        auto gridTensor = inTensors[1];
+        auto inDim = inTensor->get_desc();
+        DataType dt;
+        DataFormat df;
+        U32 width;
+        U32 height;
+        U32 numChannels;
+        U32 num;
+        CHECK_STATUS(tensor4dGet(inDim, &dt, &df, &num, &numChannels, &height, &width));
+        TensorDesc guideDesc = tensor4df(DT_F16, df, num, 1, height, width);
+        this->guideTensor.resize(guideDesc);
+
+        CHECK_STATUS(bilateral_slice_apply_infer_output_size(
+            inTensor, &guideTensor, gridTensor, p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(bilateral_slice_apply_infer_forward_tmp_bytes(this->inputTensors[0],
+            this->guideTensor, this->inputTensors[1], p, &bytes, &this->archInfo));
+        return bytes;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+private:
+    Tensor guideTensor;
+};
+
+#endif // _BILATERAL_SLICE_APPLY_OCL_H
diff --git a/inference/engine/include/ocl/channel_resize_ocl.hpp b/inference/engine/include/ocl/channel_resize_ocl.hpp
new file mode 100644
index 00000000..8a76e921
--- /dev/null
+++ b/inference/engine/include/ocl/channel_resize_ocl.hpp
@@ -0,0 +1,72 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
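+
+// ChannelResizeOCL normalizes its parameters against the real input descriptor:
+// channel_before is re-derived from the tensor, group == 0 degenerates to a plain
+// pass-through configuration, and grouped resizing (group != 1) is rejected with
+// NOT_SUPPORTED on this path.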
+
+#ifndef _CHANNEL_RESIZE_OCL_H
+#define _CHANNEL_RESIZE_OCL_H
+
+#include "channel_resize.hpp"
+
+class ChannelResizeOCL : public ChannelResize {
+public:
+    ChannelResizeOCL(DataType dt, ChannelResizeParamSpec p) : ChannelResize(dt, p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~ChannelResizeOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new ChannelResizeOCL(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+
+        CHECK_STATUS(channel_resize(inputTensor, this->p, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        TensorDesc inDesc = inTensors[0]->get_desc();
+        int channelAxis = inDesc.nDims - 2;
+        if ((int)inDesc.dims[channelAxis] != this->p.channel_before) {
+            this->p.channel_before = inDesc.dims[channelAxis];
+        }
+        if (this->p.group == 0) {
+            this->p.group = 1;
+            this->p.channel_before = (int)inDesc.dims[channelAxis];
+            this->p.channel_after = this->p.channel_before;
+        }
+        if (this->p.group != 1) {
+            return NOT_SUPPORTED;
+        }
+        CHECK_STATUS(channel_resize_infer_output_size(
+            inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _CHANNEL_RESIZE_OCL_H
diff --git a/inference/engine/include/ocl/check_ocl.hpp b/inference/engine/include/ocl/check_ocl.hpp
new file mode 100644
index 00000000..3e4c322c
--- /dev/null
+++ b/inference/engine/include/ocl/check_ocl.hpp
@@ -0,0 +1,54 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
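+
+// CheckOCL evaluates the element-wise comparison described by CheckParamSpec on
+// two input tensors; the resulting status tensor is what control-flow operators
+// like Jump read as their condition input (see jump.hpp above).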
+
+#ifndef _CHECK_OCL_H
+#define _CHECK_OCL_H
+
+#include "check.hpp"
+
+class CheckOCL : public Check {
+public:
+    CheckOCL(DataType dt, CheckParamSpec p) : Check(dt, p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~CheckOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem = std::shared_ptr<Operator>(new CheckOCL(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        Tensor inputATensor = this->inputTensors[0];
+        Tensor inputBTensor = this->inputTensors[1];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(check(inputATensor, inputBTensor, this->p, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        return check_infer_output_size(inTensors, outTensors[0], &this->archInfo);
+    }
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _CHECK_OCL_H
diff --git a/inference/engine/include/ocl/clip_ocl.hpp b/inference/engine/include/ocl/clip_ocl.hpp
new file mode 100644
index 00000000..8733027e
--- /dev/null
+++ b/inference/engine/include/ocl/clip_ocl.hpp
@@ -0,0 +1,54 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
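+
+// ClipOCL clamps each element into the [min, max] range carried by ClipParamSpec;
+// the output descriptor mirrors the input, so shape inference only forwards it.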
+
+#ifndef _CLIP_OCL_H
+#define _CLIP_OCL_H
+
+#include "clip.hpp"
+
+class ClipOCL : public Clip {
+public:
+    ClipOCL(DataType dt, ClipParamSpec p) : Clip(dt, p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~ClipOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem = std::shared_ptr<Operator>(new ClipOCL(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(clip(inputTensor, this->p, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        CHECK_STATUS(clip_infer_output_size(inTensors[0], outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _CLIP_OCL_H
diff --git a/inference/engine/include/ocl/concat_ocl.hpp b/inference/engine/include/ocl/concat_ocl.hpp
new file mode 100644
index 00000000..54ba4027
--- /dev/null
+++ b/inference/engine/include/ocl/concat_ocl.hpp
@@ -0,0 +1,60 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
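+
+// ConcatOCL may need a staging buffer when inputs with different paddings are
+// gathered into one tensor, so it queries concat_infer_forward_tmp_bytes() rather
+// than assuming zero scratch.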
+
+#ifndef _CONCAT_OCL_H
+#define _CONCAT_OCL_H
+
+#include "concat.hpp"
+
+class ConcatOCL : public Concat {
+public:
+    ConcatOCL(ConcatParamSpec p) : Concat(p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~ConcatOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem = std::shared_ptr<Operator>(new ConcatOCL(this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        auto outputTensor = this->outputTensors[0];
+        CHECK_STATUS(concat(this->inputTensors, this->p, this->temp, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        CHECK_STATUS(concat_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(concat_infer_forward_tmp_bytes(this->inputTensors, &bytes, &this->archInfo));
+        return bytes;
+    }
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _CONCAT_OCL_H
diff --git a/inference/engine/include/ocl/convolution_ocl.hpp b/inference/engine/include/ocl/convolution_ocl.hpp
new file mode 100644
index 00000000..be6e3400
--- /dev/null
+++ b/inference/engine/include/ocl/convolution_ocl.hpp
@@ -0,0 +1,534 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _CONVELTWISEPOOLING_OCL_H
+#define _CONVELTWISEPOOLING_OCL_H
+
+#include "convolution.hpp"
+
+#include "ocl_desc_trans.h"
+
+class ConvolutionOCL : public Convolution {
+public:
+    ConvolutionOCL(DataType dt,
+        ConvolutionParamSpec p,
+        ActivationParamSpec dwActivationParamSpec,
+        ActivationParamSpec pwActivationParamSpec)
+        : Convolution(dt, p, dwActivationParamSpec, pwActivationParamSpec)
+    {
+        setMALIArchInfo(&(this->archInfo), &(this->runInfo), &this->needSetKernelVec,
+            &this->needSelectKernelLS);
+    }
+
+    ~ConvolutionOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem = std::shared_ptr<Operator>(new ConvolutionOCL(
+            this->dt, this->p, this->dwActivationParamSpec, this->pwActivationParamSpec));
+        *mem = *this;
+        return mem;
+    }
+
+    EE infer_weight_desc() override
+    {
+        TensorDesc wDesc[2];
+        TensorDesc vDesc[2];
+        wDesc[0] = this->filterDesc;
+        U32 filterNum = 1;
+        DataType dtNoQ = (this->dt == DT_F16_8Q) ?
DT_F16 : this->dt; + switch (this->p.convolution_type) { + case Convolution_Pointwise: { + vDesc[0] = tensor1d(dtNoQ, + this->p.num_outputs); // bias data type should be the same as input and output + ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = + CONVOLUTION_ALGORITHM_NULL; + break; + } + case Convolution_Depthwise: { + vDesc[0] = tensor1d(dtNoQ, this->p.num_outputs); + ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = + DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; + break; + } + case Convolution_Depthwise_Pointwise: { + wDesc[1] = this->filterDescExt; + vDesc[0] = tensor1d(dtNoQ, this->numChannels); + vDesc[1] = tensor1d(dtNoQ, this->p.num_outputs); + filterNum = 2; + ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = + DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; + break; + } + case Convolution_Dilation: { + CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; + } + + for (U32 i = 0; i < filterNum; i++) { + Tensor modelWeightTensor = Tensor(OCLMem); + Tensor modelVectorTensor = Tensor(OCLMem); + auto weightMem = (OclMemory *)modelWeightTensor.get_memory(); + auto vectorMem = (OclMemory *)modelVectorTensor.get_memory(); + modelWeightTensor.resize(wDesc[i]); + modelVectorTensor.resize(vDesc[i]); + + U32 ww, wh, wc, wn; + DataFormat df; + DataType dt; + tensorSelectGet(wDesc[i], &dt, &df, &wn, &wc, &wh, &ww); + U32 stride[3] = {ww * wh, wc, wn}; + U32 offset[3] = {0, 0, 0}; + GCLMemType mt = GCL_MEM_BUF; + MemFlags flags = CL_MEM_READ_WRITE; + GCLMemDesc desc = gclmem_build_desc(); + CHECK_STATUS(gclmem_set_desc_padding(&desc, stride, offset, dt, df, mt, flags)); + weightMem->padding(desc); + + mt = GCL_MEM_IMG_1D; + U32 vecLen = vDesc[i].dims[0]; + U32 vecAlign = 4; + stride[0] = (vecLen + vecAlign - 1) / vecAlign; + if (i == 0) { + U32 iw, ih; + TensorDesc inputDesc = this->inputTensors[0].get_desc(); + tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); + if ((wn == 1 && this->p.convolution_type == Convolution_Pointwise) || + (ww == 1 && wh == 1 && iw == 1 && ih == 1)) { + mt = GCL_MEM_BUF; + vecAlign = 8; + stride[0] = (vecLen + vecAlign - 1) / vecAlign * vecAlign; + } + } + stride[1] = 1; + stride[2] = 1; + gclmem_set_desc_padding(&desc, stride, offset, dt, DF_NHWC, mt, flags); + vectorMem->padding(desc); + this->weightTensors.push_back(modelWeightTensor); + this->biasTensors.push_back(modelVectorTensor); + } + return SUCCESS; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensor = this->inputTensors[0]; + if (this->needTransInput) { + auto inputMem = (OclMemory *)inputTensor.get_memory(); + GCLMemDesc inputDesc = inputMem->get_desc(); + void *inputPtr = inputMem->get_ptr(); + TensorDesc inputDescCpu = inputTensor.get_desc(); + DataType dt; + DataFormat df; + U32 iw, ih, ic; + U32 iw_str, ih_str, ic_str, iw_off, ih_off; + tensorSelectGet(inputDescCpu, &dt, &df, NULL, &ic, &ih, &iw); + get_gclmem_dim(inputDesc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + if (inputDesc.memFormat == df && iw_str == iw && ih_str == ih && ic_str == ic && + iw_off == 0 && ih_off == 0) { + this->needTransInput = false; + } else { + auto tmpMem = (OclMemory *)this->temp.get_memory(); + void *tmpPtr = tmpMem->get_ptr(); + U32 stride[3] = {iw, ih, ic}; + U32 offset[3] = {0, 0, 0}; + GCLMemType mt = GCL_MEM_BUF; + MemFlags flags = CL_MEM_READ_WRITE; + GCLMemDesc initDesc = gclmem_build_desc(); + 
CHECK_STATUS(gclmem_set_desc_padding(&initDesc, stride, offset, dt, df, mt, flags));
+                CHECK_STATUS(ocl_trans_mem(OCLContext::getInstance().handle.get(),
+                    (GCLMem_t)inputPtr, initDesc, (GCLMem_t)tmpPtr, initDesc));
+                CHECK_STATUS(ocl_trans_mem(OCLContext::getInstance().handle.get(), (GCLMem_t)tmpPtr,
+                    initDesc, (GCLMem_t)inputPtr, inputDesc));
+            }
+        }
+        Tensor filterTensor = this->weightTensors[0];
+        filterTensor.resize(this->filterDesc);
+        U8 *scalePtr = nullptr;
+        Tensor biasTensor = this->biasTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        switch (this->p.convolution_type) {
+            case Convolution_Pointwise: {
+                CHECK_STATUS(
+                    convolution(inputTensor, filterTensor, p, this->pwAlg, scalePtr, biasTensor,
+                        this->temp, outputTensor, this->pwActivationParamSpec, &this->archInfo));
+                break;
+            }
+            case Convolution_Depthwise: {
+                CHECK_STATUS(
+                    depthwise_convolution(inputTensor, filterTensor, p, this->dwAlg, biasTensor,
+                        this->temp, outputTensor, this->dwActivationParamSpec, &this->archInfo));
+                break;
+            }
+            case Convolution_Depthwise_Pointwise: {
+                auto dwFilterTensor = filterTensor;
+                auto pwFilterTensor = this->weightTensors[1];
+                auto dwBiasTensor = biasTensor;
+                auto pwBiasTensor = this->biasTensors[1];
+                CHECK_STATUS(
+                    depthwise_pointwise_convolution(inputTensor, dwFilterTensor, pwFilterTensor, p,
+                        this->dwAlg, dwBiasTensor, pwBiasTensor, this->temp, outputTensor,
+                        this->dwActivationParamSpec, this->pwActivationParamSpec, &this->archInfo));
+                break;
+            }
+            case Convolution_Dilation: {
+                CHECK_STATUS(NOT_SUPPORTED);
+                break;
+            }
+            default: {
+                UNI_ERROR_LOG("[ERROR] unsupported convolution type %d\n", this->p.convolution_type);
+            }
+        }
+    }
+
+    EE infer_forward_algorithm(std::shared_ptr<AlgorithmMap> algorithmMap) override
+    {
+        OCLContext::getInstance().handle.get()->kernelVec = &this->opKernelVec;
+        auto inputTensor = this->inputTensors[0];
+        auto filterTensor = this->weightTensors[0];
+        auto outputTensor = this->outputTensors[0];
+        filterTensor.resize(this->filterDesc);
+        ConvolutionPolicy policy = CONVOLUTION_TUNNING;
+        DataType targetType = DT_F16;
+        I32 algo[7];
+        switch (this->p.convolution_type) {
+            case Convolution_Pointwise: {
+                if (this->dt == DT_F16_8Q) {
+                    targetType = DT_I8;
+                }
+                if (algorithmMap->getAlgorithmInfoFromMap(this->name, algo, 4)) {
+                    this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0];
+                    this->runInfo.best_w[0] = algo[1];
+                    this->runInfo.best_c[0] = algo[2];
+                    this->runInfo.best_k[0] = algo[3];
+                    this->pwAlg = (ConvolutionForwardAlgorithm)algo[0];
+                } else {
+                    CHECK_STATUS(convolution_infer_forward_algorithm(inputTensor, filterTensor,
+                        outputTensor, p, policy, &(this->pwAlg), targetType,
+                        this->pwActivationParamSpec, &this->archInfo));
+                    algo[0] = this->runInfo.algorithm;
+                    algo[1] = this->runInfo.best_w[0];
+                    algo[2] = this->runInfo.best_c[0];
+                    algo[3] = this->runInfo.best_k[0];
+                    this->pwAlg = (ConvolutionForwardAlgorithm)algo[0];
+                    algorithmMap->setAlgorithmInfoToMap(this->name, algo, 4);
+                }
+                break;
+            }
+            case Convolution_Depthwise: {
+                if (algorithmMap->getAlgorithmInfoFromMap(this->name, algo, 4)) {
+                    this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0];
+                    this->runInfo.best_w[0] = algo[1];
+                    this->runInfo.best_c[0] = algo[2];
+                    this->runInfo.best_k[0] = algo[3];
+                    this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo[0];
+                } else {
+                    CHECK_STATUS(depthwise_convolution_infer_forward_algorithm(inputTensor,
+                        filterTensor, outputTensor, p, policy, &(this->dwAlg), targetType,
+                        this->dwActivationParamSpec, &this->archInfo));
+                    algo[0] = this->runInfo.algorithm;
+                    algo[1] = this->runInfo.best_w[0];
+                    algo[2] = this->runInfo.best_c[0];
+                    algo[3] = this->runInfo.best_k[0];
+                    this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo[0];
+                    algorithmMap->setAlgorithmInfoToMap(this->name, algo, 4);
+                }
+                break;
+            }
+            case Convolution_Depthwise_Pointwise: {
+                if (algorithmMap->getAlgorithmInfoFromMap(this->name, algo, 7)) {
+                    this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0];
+                    this->runInfo.best_w[0] = algo[1];
+                    this->runInfo.best_c[0] = algo[2];
+                    this->runInfo.best_k[0] = algo[3];
+                    this->runInfo.best_w[1] = algo[4];
+                    this->runInfo.best_c[1] = algo[5];
+                    this->runInfo.best_k[1] = algo[6];
+                    this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo[0];
+                } else {
+                    auto dwFilterTensor = filterTensor;
+                    auto pwFilterTensor = this->weightTensors[1];
+                    CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_algorithm(
+                        inputTensor, dwFilterTensor, pwFilterTensor, outputTensor, p, policy,
+                        &(this->dwAlg), targetType, this->dwActivationParamSpec,
+                        this->pwActivationParamSpec, &this->archInfo));
+                    algo[0] = this->runInfo.algorithm;
+                    algo[1] = this->runInfo.best_w[0];
+                    algo[2] = this->runInfo.best_c[0];
+                    algo[3] = this->runInfo.best_k[0];
+                    algo[4] = this->runInfo.best_w[1];
+                    algo[5] = this->runInfo.best_c[1];
+                    algo[6] = this->runInfo.best_k[1];
+                    this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo[0];
+                    algorithmMap->setAlgorithmInfoToMap(this->name, algo, 7);
+                }
+                break;
+            }
+            case Convolution_Dilation: {
+                CHECK_STATUS(NOT_SUPPORTED);
+                break;
+            }
+            default:
+                CHECK_STATUS(NOT_SUPPORTED);
+        }
+        return SUCCESS;
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        auto inputTensor = inTensors[0];
+        Tensor filterTensor = Tensor(OCLMem);
+        TensorDesc inDim = inputTensor->get_desc();
+        DataType idt;
+        DataFormat idf;
+        U32 in, ic, ih, iw;
+        CHECK_STATUS(tensor4dGet(inDim, &idt, &idf, &in, &ic, &ih, &iw));
+        this->numChannels = ic;
+        U32 numFiltersOcl = this->p.num_outputs;
+        GCLMemDesc inputGclDesc = ocl_get_desc(*inputTensor);
+        if (this->p.num_outputs_origin == 1 && inputGclDesc.byteSize == 0) {
+            numFiltersOcl = this->p.num_outputs_origin;
+        }
+        DataType targetType = DT_F16;  // Default DT_F16
+
+        auto inputMem = (OclMemory *)inputTensor->get_memory();
+        GCLMemDesc gclDesc = inputMem->get_desc();
+        this->needTransInput = (gclDesc.byteSize == 0) ?
true : false; + switch (this->p.convolution_type) { + case Convolution_Pointwise: { + this->filterDesc = tensor4df(this->dt, DF_NCHW, numFiltersOcl, this->numChannels, + this->p.kernel_h, this->p.kernel_w); + filterTensor.resize(this->filterDesc); + CHECK_STATUS(convolution_infer_output_size( + inputTensor, filterTensor, p, outTensors[0], targetType, &this->archInfo)); + break; + } + case Convolution_Depthwise: { + this->filterDesc = tensor4df( + this->dt, DF_NCHW, 1, this->numChannels, this->p.kernel_h, this->p.kernel_w); + filterTensor.resize(this->filterDesc); + CHECK_STATUS(depthwise_convolution_infer_output_size( + inputTensor, filterTensor, p, outTensors[0], targetType, &this->archInfo)); + break; + } + case Convolution_Depthwise_Pointwise: { + this->filterDesc = tensor4df( + this->dt, DF_NCHW, 1, this->numChannels, this->p.kernel_h, this->p.kernel_w); + this->filterDescExt = + tensor4df(this->dt, DF_NCHW, this->p.num_outputs, this->numChannels, 1, 1); + filterTensor.resize(this->filterDesc); + Tensor filterTensorExt = Tensor(OCLMem); + filterTensorExt.resize(this->filterDescExt); + CHECK_STATUS(depthwise_pointwise_convolution_infer_output_size(inputTensor, + filterTensor, filterTensorExt, p, outTensors[0], targetType, &this->archInfo)); + break; + } + case Convolution_Dilation: { + return NOT_SUPPORTED; + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + } + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + auto inputTensor = this->inputTensors[0]; + auto filterTensor = this->weightTensors[0]; + auto outputTensor = this->outputTensors[0]; + + U32 bytes = 0; + switch (this->p.convolution_type) { + case Convolution_Pointwise: { + CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputTensor, filterTensor, + outputTensor, p, this->pwAlg, &bytes, &this->archInfo)); + break; + } + case Convolution_Depthwise: { + CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes(inputTensor, + filterTensor, outputTensor, p, this->dwAlg, &bytes, &this->archInfo)); + break; + } + case Convolution_Depthwise_Pointwise: { + CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_tmp_bytes(inputTensor, + filterTensor, this->weightTensors[1], outputTensor, p, this->dwAlg, &bytes, + &this->archInfo)); + break; + } + case Convolution_Dilation: { + CHECK_STATUS(NOT_SUPPORTED); + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + } + if (this->needTransInput) { + TensorDesc desc = inputTensor.get_desc(); + U32 size = tensorNumBytes(desc); + if (bytes < size) { + bytes = size; + } + } + return bytes; + } + + GCLMemDesc infer_wtm_memory_size_mali() override + { + auto filterTensor = this->weightTensors[0]; + filterTensor.resize(this->filterDesc); + U32 stride[3] = {0, 0, 0}; + U32 offset[3] = {0, 0, 0}; + GCLMemDesc tmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + GCLMemDesc gclmemWtmDesc[2]; + gclmemWtmDesc[0] = tmpDesc; + gclmemWtmDesc[1] = tmpDesc; + U32 bytes = 0; + ((MaliPara_t)(this->archInfo.archPara))->gclmemFilterDesc = gclmemWtmDesc; + bool needTransBiasImgToBuf = false; + U32 biasNum = 0; + switch (this->p.convolution_type) { + case Convolution_Pointwise: { + CHECK_STATUS(convolution_transform_filter_bytes( + filterTensor, this->p, this->pwAlg, &bytes, &this->archInfo)); + U32 best_c = this->runInfo.best_c[0]; + U32 best_k = this->runInfo.best_k[0]; + if (best_c == 4 && best_k == 1) { + needTransBiasImgToBuf = true; + } + break; + } + case Convolution_Depthwise: { + CHECK_STATUS(depthwise_convolution_transform_filter_bytes( + filterTensor, this->p, 
this->dwAlg, &bytes, &this->archInfo));
+                break;
+            }
+            case Convolution_Depthwise_Pointwise: {
+                U32 bytesExt = 0;
+                CHECK_STATUS(depthwise_pointwise_convolution_transform_filter_bytes(filterTensor,
+                    this->weightTensors[1], this->p, this->dwAlg, &bytes, &bytesExt,
+                    &this->archInfo));
+                wtm_dp = Tensor(OCLMem);
+                OclMemory *wtmMem = (OclMemory *)wtm_dp.get_memory();
+                wtmMem->padding(gclmemWtmDesc[1]);
+                wtmMem->alloc();
+                if (this->dwAlg == DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM) {
+                    needTransBiasImgToBuf = true;
+                    biasNum = 1;
+                }
+                break;
+            }
+            case Convolution_Dilation: {
+                CHECK_STATUS(NOT_SUPPORTED);
+                break;
+            }
+            default:
+                CHECK_STATUS(NOT_SUPPORTED);
+        }
+        if (needTransBiasImgToBuf) {
+            Tensor biasTensorBuf = Tensor(OCLMem);
+            auto biasMemImg = (OclMemory *)(this->biasTensors[biasNum].get_memory());
+            auto biasMemBuf = (OclMemory *)(biasTensorBuf.get_memory());
+            GCLMemDesc descImg = biasMemImg->get_desc();
+            TensorDesc desc = tensor4df(descImg.dt, descImg.df, descImg.dims[3], descImg.dims[2],
+                descImg.dims[1], descImg.dims[0]);
+            biasTensorBuf.resize(desc);
+            GCLMemDesc descBuf = gclmem_build_desc();
+            U32 stride[3] = {
+                (descImg.stride[0] * 4 + 7) / 8 * 8, descImg.stride[1], descImg.stride[2]};
+            U32 offset[3] = {0, 0, 0};
+            GCLMemType mt = GCL_MEM_BUF;
+            MemFlags flags = CL_MEM_READ_WRITE;
+            CHECK_STATUS(
+                gclmem_set_desc_padding(&descBuf, stride, offset, desc.dt, DF_NCHW, mt, flags));
+            biasMemBuf->padding(descBuf);
+            biasMemBuf->alloc();
+            void *bufPtr = biasMemBuf->get_ptr();
+            CHECK_STATUS(
+                gcl_fill_memory_zero(OCLContext::getInstance().handle.get(), (GCLMem_t)bufPtr));
+            biasMemBuf->copy_from((Memory *)biasMemImg);
+            this->biasTensors[biasNum] = biasTensorBuf;
+        }
+        return gclmemWtmDesc[0];
+    }
+
+    EE transform_filter() override
+    {
+        auto filterTensor = this->weightTensors[0];
+        filterTensor.resize(this->filterDesc);
+
+        if (DT_F16_8Q == this->dt && Convolution_Pointwise == this->p.convolution_type &&
+            CONVOLUTION_ALGORITHM_WINOGRAD == this->pwAlg) {  // int8 winograd
+            return NOT_SUPPORTED;
+        } else if (DT_F16_8Q == this->dt &&
+            Convolution_Pointwise == this->p.convolution_type) {  // int8 tilegemm
+            return NOT_SUPPORTED;
+        } else {  // All other cases
+            auto wtmDesc = this->infer_wtm_memory_size_mali();
+            this->wtm = std::shared_ptr<Tensor>(new Tensor(OCLMem));
+            OclMemory *wtmMem = (OclMemory *)this->wtm->get_memory();
+            wtmMem->padding(wtmDesc);
+            wtmMem->alloc();
+
+            switch (this->p.convolution_type) {
+                case Convolution_Pointwise: {
+                    CHECK_STATUS(convolution_transform_filter(filterTensor, this->p, this->pwAlg,
+                        this->temp, this->wtm.get(), &this->archInfo));
+                    break;
+                }
+                case Convolution_Depthwise: {
+                    CHECK_STATUS(depthwise_convolution_transform_filter(
+                        filterTensor, this->p, this->dwAlg, this->wtm.get(), &this->archInfo));
+                    break;
+                }
+                case Convolution_Depthwise_Pointwise: {
+                    CHECK_STATUS(depthwise_pointwise_convolution_transform_filter(filterTensor,
+                        this->weightTensors[1], this->p, this->dwAlg, this->wtm.get(),
+                        &this->wtm_dp, &this->archInfo));
+                    this->weightTensors[1] = wtm_dp;
+                    break;
+                }
+                case Convolution_Dilation: {
+                    CHECK_STATUS(NOT_SUPPORTED);
+                    break;
+                }
+                default:
+                    CHECK_STATUS(NOT_SUPPORTED);
+            }
+        }
+        this->weightTensors[0] = *this->get_wtm();
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+
+private:
+    Tensor wtm_dp;
+    TensorDesc filterDesc;
+    TensorDesc filterDescExt;
+    bool needTransInput;
+
+protected:
+    ForwardRunInfoMali runInfo;
+};
+
+#endif  // _CONVOLUTION_OCL_H
diff --git a/inference/engine/include/ocl/copy_ocl.hpp b/inference/engine/include/ocl/copy_ocl.hpp
new file mode 100644
index 00000000..64b4b54d
--- /dev/null
+++ b/inference/engine/include/ocl/copy_ocl.hpp
@@ -0,0 +1,72 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _COPY_OCL_H
+#define _COPY_OCL_H
+
+#include "copy.hpp"
+
+class CopyOCL : public Copy {
+public:
+    CopyOCL(DataType dt, CopyParamSpec p) : Copy(dt, p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~CopyOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem = std::shared_ptr<Operator>(new CopyOCL(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        TensorDesc srcDesc = this->inputTensors[0].get_desc();
+        TensorDesc dstDesc = this->inputTensors[1].get_desc();
+        U32 batch = srcDesc.dims[srcDesc.nDims - 1];
+        if (batch > 1) {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+        U32 copyLength = (this->p.length >= 0) ? this->p.length : tensorNumElements(srcDesc) / batch;
+        U32 srcStride = (this->p.src_dims[0] >= 0) ? this->p.src_dims[1]
+                                                   : tensorNumElements(srcDesc) / batch;
+        U32 dstStride = (this->p.dst_dims[0] >= 0) ? this->p.dst_dims[1]
+                                                   : tensorNumElements(dstDesc) / batch;
+        U32 srcIndex = this->p.src_dims[2];
+        U32 dstIndex = this->p.dst_dims[2];
+        CHECK_STATUS(copy(this->inputTensors, srcIndex, dstIndex, srcStride, dstStride, copyLength,
+            &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        CHECK_STATUS(copy_infer_output_size(inTensors, &this->archInfo));
+        auto desc = outTensors[0]->get_desc();
+        desc.dt = this->dt;
+        desc.df = getTensorDefaultDataFormat(0);
+        desc.nDims = 0;
+        outTensors[0]->resize(desc);
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif  // _COPY_OCL_H
diff --git a/inference/engine/include/ocl/deconvolution_ocl.hpp b/inference/engine/include/ocl/deconvolution_ocl.hpp
new file mode 100644
index 00000000..03bcde25
--- /dev/null
+++ b/inference/engine/include/ocl/deconvolution_ocl.hpp
@@ -0,0 +1,195 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _DECONVOLUTION_OCL_H +#define _DECONVOLUTION_OCL_H + +#include "deconvolution.hpp" + +class DeconvolutionOCL : public Deconvolution { +public: + DeconvolutionOCL(DataType dt, ConvolutionParamSpec p, ActivationParamSpec activationDesc) + : Deconvolution(dt, p, activationDesc) + { + setMALIArchInfo(&(this->archInfo), &(this->runInfo), &this->needSetKernelVec, + &this->needSelectKernelLS); + } + + ~DeconvolutionOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr( + new DeconvolutionOCL(this->dt, this->p, this->activationDesc)); + *mem = *this; + return mem; + } + + EE infer_weight_desc() override + { + auto curOpWs = this->get_weightspec(); + DataType dt = curOpWs.mdt; // weight data type may not be the same as input and output + if (curOpWs.weight == nullptr) { + dt = this->dt; + } + DataType dtNoQ = (this->dt == DT_F16_8Q) ? 
DT_F16 : this->dt; + DataFormat df = DF_NCHW; + U32 fh, fw, fc, fn; + fn = this->numInputs; + fc = this->p.num_outputs; + fh = this->p.kernel_h; + fw = this->p.kernel_w; + U32 vectorLen = fn; + ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = + CONVOLUTION_ALGORITHM_NULL; + TensorDesc filterTensorDesc = tensor4df(dtNoQ, df, fn, fc, fh, fw); + TensorDesc vectorTensorDesc = tensor1d(dtNoQ, vectorLen); + + Tensor modelWeightTensor = Tensor(OCLMem); + Tensor modelVectorTensor = Tensor(OCLMem); + auto weightMem = (OclMemory *)modelWeightTensor.get_memory(); + auto vectorMem = (OclMemory *)modelVectorTensor.get_memory(); + modelWeightTensor.resize(filterTensorDesc); + modelVectorTensor.resize(vectorTensorDesc); + U32 stride[3] = {fw * fh, fc, fn}; + U32 offset[3] = {0, 0, 0}; + GCLMemType mt = GCL_MEM_BUF; + MemFlags flags = CL_MEM_READ_WRITE; + GCLMemDesc desc = gclmem_build_desc(); + CHECK_STATUS(gclmem_set_desc_padding(&desc, stride, offset, dtNoQ, df, mt, flags)); + weightMem->padding(desc); + + mt = GCL_MEM_IMG_1D; + stride[0] = (vectorLen + 3) / 4; + stride[1] = 1; + stride[2] = 1; + gclmem_set_desc_padding(&desc, stride, offset, dtNoQ, DF_NHWC, mt, flags); + vectorMem->padding(desc); + this->weightTensors.push_back(modelWeightTensor); + this->biasTensors.push_back(modelVectorTensor); + return SUCCESS; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensor = this->inputTensors[0]; + Tensor filterTensor = this->weightTensors[0]; + U8 *scalePtr = nullptr; + Tensor biasTensor = this->biasTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + CHECK_STATUS(deconvolution(inputTensor, filterTensor, p, this->alg, scalePtr, biasTensor, + this->temp, outputTensor, this->activationDesc, &this->archInfo)); + } + + EE infer_forward_algorithm(std::shared_ptr algorithmMap) override + { + OCLContext::getInstance().handle.get()->kernelVec = &this->opKernelVec; + ConvolutionPolicy policy = CONVOLUTION_TUNNING; + ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = + CONVOLUTION_ALGORITHM_NULL; + DataType targetType = DT_F16; + + I32 algo[4]; + if (algorithmMap->getAlgorithmInfoFromMap(this->name, algo, 4)) { + this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; + this->runInfo.best_w[0] = algo[1]; + this->runInfo.best_c[0] = algo[2]; + this->runInfo.best_k[0] = algo[3]; + this->alg = (ConvolutionForwardAlgorithm)algo[0]; + } else { + CHECK_STATUS(deconvolution_infer_forward_algorithm(this->inputTensors[0], + this->weightTensors[0], this->outputTensors[0], p, policy, &(this->alg), targetType, + this->activationDesc, &this->archInfo)); + algo[0] = this->runInfo.algorithm; + algo[1] = this->runInfo.best_w[0]; + algo[2] = this->runInfo.best_c[0]; + algo[3] = this->runInfo.best_k[0]; + this->alg = (ConvolutionForwardAlgorithm)algo[0]; + algorithmMap->setAlgorithmInfoToMap(this->name, algo, 4); + } + return SUCCESS; + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + auto inputTensor = inTensors[0]; + TensorDesc inDim = inputTensor->get_desc(); + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + CHECK_STATUS(tensor4dGet(inDim, &idt, &idf, &in, &ic, &ih, &iw)); + this->numInputs = ic; + + TensorDesc filterDim = tensor4df(this->dt, DF_NCHW, this->numInputs, this->p.num_outputs, + this->p.kernel_h, this->p.kernel_w); + Tensor filterTensor = Tensor(OCLMem); + filterTensor.resize(filterDim); + this->p 
= createConvolutionParamSpec(this->p.group, this->p.kernel_h, this->p.kernel_w, + this->p.stride_h, this->p.stride_w, this->p.padding_top, this->p.padding_bottom, + this->p.padding_left, this->p.padding_right, this->p.dilatedRate_h, + this->p.dilatedRate_w, this->p.num_outputs, this->p.convolution_type); + + DataType targetType = this->dt; + CHECK_STATUS(deconvolution_infer_output_size( + inputTensor, filterTensor, p, outTensors[0], targetType, &this->archInfo)); + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + Tensor inputTensor = this->inputTensors[0]; + Tensor filterTensor = this->weightTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + + U32 bytes = 0; + CHECK_STATUS(deconvolution_infer_forward_tmp_bytes( + inputTensor, filterTensor, outputTensor, p, this->alg, &bytes, &this->archInfo)); + return bytes; + } + + GCLMemDesc infer_wtm_memory_size_mali() override + { + U32 stride[3] = {0, 0, 0}; + U32 offset[3] = {0, 0, 0}; + GCLMemDesc gclmemWtmDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + U32 bytes = 0; + ((MaliPara_t)(this->archInfo.archPara))->gclmemFilterDesc = &gclmemWtmDesc; + CHECK_STATUS(deconvolution_transform_filter_bytes( + this->weightTensors[0], this->p, this->alg, &bytes, &this->archInfo)); + return gclmemWtmDesc; + } + + EE transform_filter() override + { + Tensor filterTensor = this->weightTensors[0]; + auto wtmDesc = this->infer_wtm_memory_size_mali(); + Tensor wtm(OCLMem); + OclMemory *wtmMem = (OclMemory *)wtm.get_memory(); + wtmMem->padding(wtmDesc); + wtmMem->alloc(); + CHECK_STATUS(deconvolution_transform_filter( + filterTensor, this->p, this->alg, this->temp, &wtm, &this->archInfo)); + this->weightTensors[0] = wtm; + return SUCCESS; + } + + REGISTER_OCL_OPERATOR_RUN + +protected: + ForwardRunInfoMali runInfo; +}; + +#endif // _DECONVOLUTION_OCL_H diff --git a/inference/engine/include/ocl/depth2space_ocl.hpp b/inference/engine/include/ocl/depth2space_ocl.hpp new file mode 100644 index 00000000..d8f09678 --- /dev/null +++ b/inference/engine/include/ocl/depth2space_ocl.hpp @@ -0,0 +1,67 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
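+
+// Depth2SpaceOCL (below) rearranges channel blocks into spatial positions.
+// Note: judging from the needSetKernelVec/needSelectKernelLS flags, the OCL
+// wrappers in this directory record their OpenCL kernels for deferred
+// execution rather than running them inside run_prepare(); the shared
+// this->temp buffer backs any intermediate layout change.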
+
+#ifndef _DEPTH2SPACE_OCL_H
+#define _DEPTH2SPACE_OCL_H
+
+#include "depth2space.hpp"
+
+class Depth2SpaceOCL : public Depth2Space {
+public:
+    Depth2SpaceOCL(DataType dt, Depth2SpaceParamSpec p) : Depth2Space(dt, p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~Depth2SpaceOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new Depth2SpaceOCL(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(depth2space(inputTensor, this->p, this->temp, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        CHECK_STATUS(
+            depth2space_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        U32 bytes = 0;
+        CHECK_STATUS(depth2space_infer_forward_tmp_bytes(
+            inputTensor, this->p, outputTensor, &bytes, &this->archInfo));
+        return bytes;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif  // _DEPTH2SPACE_OCL_H
diff --git a/inference/engine/include/ocl/eltwise_ocl.hpp b/inference/engine/include/ocl/eltwise_ocl.hpp
new file mode 100644
index 00000000..6a1df477
--- /dev/null
+++ b/inference/engine/include/ocl/eltwise_ocl.hpp
@@ -0,0 +1,54 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
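+
+// EltwiseOCL applies one elementwise operation (sum, product, etc., per
+// eltwiseDesc) across all of inputTensors at once; it relies on the shared
+// this->temp buffer and defines no tmp-size hook of its own.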
+
+#ifndef _ELTWISE_OCL_H
+#define _ELTWISE_OCL_H
+
+#include "eltwise.hpp"
+
+class EltwiseOCL : public Eltwise {
+public:
+    EltwiseOCL(EltwiseParamSpec eltwiseDesc) : Eltwise(eltwiseDesc)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~EltwiseOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new EltwiseOCL(this->eltwiseDesc));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        CHECK_STATUS(eltwise(this->inputTensors, this->eltwiseDesc, this->temp,
+            this->outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        CHECK_STATUS(eltwise_infer_output_size(inTensors, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+#endif  // _ELTWISE_OCL_H
diff --git a/inference/engine/include/ocl/embedding_ocl.hpp b/inference/engine/include/ocl/embedding_ocl.hpp
new file mode 100644
index 00000000..ec52ecb6
--- /dev/null
+++ b/inference/engine/include/ocl/embedding_ocl.hpp
@@ -0,0 +1,104 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
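+
+// EmbeddingOCL gathers rows of the embedding table for the ids in
+// inputTensors[0]. The table is the model weight when one was loaded
+// (weightTensors[0]), otherwise it is taken from inputTensors[1]; see
+// run_prepare() below.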
+ +#ifndef _EMBEDDING_OCL_H +#define _EMBEDDING_OCL_H + +#include "embedding.hpp" + +class EmbeddingOCL : public Embedding { +public: + EmbeddingOCL(DataType dt, EmbedParamSpec p) : Embedding(dt, p) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~EmbeddingOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new EmbeddingOCL(this->dt, this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensor = this->inputTensors[0]; + Tensor weightTensor; + if (this->weightTensors.size() > 0) { + weightTensor = this->weightTensors[0]; + } else { + weightTensor = this->inputTensors[1]; + } + Tensor outputTensor = this->outputTensors[0]; + CHECK_STATUS(embedding(inputTensor, weightTensor, this->p, outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS(embedding_infer_output_size( + inTensors[0], this->p, this->dt, outTensors[0], &this->archInfo)); + return SUCCESS; + } + + EE init_weight_bias_from_model(std::shared_ptr *modelPtr) override + { + auto curOpWs = this->get_weightspec(); + if (modelPtr == nullptr && curOpWs.weight == nullptr) { + return SUCCESS; + } + TensorDesc weightDesc; + if (this->p.transpose) { + weightDesc = tensor2df(this->dt, DF_TRANSPOSE, this->p.num_output, this->p.input_dim); + } else { + weightDesc = tensor2df(this->dt, DF_NORMAL, this->p.input_dim, this->p.num_output); + } + Tensor modelWeightTensor = Tensor(OCLMem); + auto weightMem = (OclMemory *)modelWeightTensor.get_memory(); + modelWeightTensor.resize(weightDesc); + U32 stride[3] = {weightDesc.dims[0], weightDesc.dims[1], 1}; + U32 offset[3] = {0, 0, 0}; + GCLMemType mt = GCL_MEM_BUF; + MemFlags flags = CL_MEM_READ_WRITE; + GCLMemDesc desc = gclmem_build_desc(); + CHECK_STATUS(gclmem_set_desc_padding(&desc, stride, offset, this->dt, DF_NCHW, mt, flags)); + weightMem->padding(desc); + + CpuMemory weight_mem_src; + std::shared_ptr weight_ptr; + if (modelPtr) { + weight_ptr = *modelPtr; + } else { + weight_ptr = std::shared_ptr(curOpWs.weight); + } + weight_mem_src.resize(weightDesc); + weight_mem_src.set_shared_ptr(std::shared_ptr(weight_ptr)); + weightMem->copy_from((Memory *)&weight_mem_src); + this->weightTensors.push_back(modelWeightTensor); + if (modelPtr) { + *modelPtr = + std::shared_ptr(*modelPtr, (*modelPtr).get() + tensorNumBytes(weightDesc)); + } + return SUCCESS; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _EMBEDDING_OCL_H diff --git a/inference/engine/include/ocl/factory_ocl.hpp b/inference/engine/include/ocl/factory_ocl.hpp new file mode 100644 index 00000000..8ebb4ad5 --- /dev/null +++ b/inference/engine/include/ocl/factory_ocl.hpp @@ -0,0 +1,357 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _FACTORY_OCL_H +#define _FACTORY_OCL_H +#include "factory.hpp" +#include "ocl/resize_ocl.hpp" +#include "ocl/channel_resize_ocl.hpp" +#include "ocl/deconvolution_ocl.hpp" +#include "ocl/bilateral_slice_apply_ocl.hpp" +#include "ocl/pooling_ocl.hpp" +#include "ocl/convolution_ocl.hpp" +#include "ocl/eltwise_ocl.hpp" +#include "ocl/softmax_ocl.hpp" +#include "ocl/activation_ocl.hpp" +#include "ocl/fully_connected_ocl.hpp" +#include "ocl/scale_ocl.hpp" +#include "ocl/concat_ocl.hpp" +#include "ocl/clip_ocl.hpp" +#include "ocl/squeeze_ocl.hpp" +#include "ocl/reshape_ocl.hpp" +#include "ocl/space2depth_ocl.hpp" +#include "ocl/depth2space_ocl.hpp" +#include "ocl/embedding_ocl.hpp" +#include "ocl/layer_norm_ocl.hpp" +#include "ocl/matmul_ocl.hpp" +#include "ocl/power_ocl.hpp" +#include "ocl/transpose_ocl.hpp" +#include "ocl/slice_ocl.hpp" +#include "ocl/shared_weight_ocl.hpp" +#include "ocl/repeat_ocl.hpp" +#include "ocl/copy_ocl.hpp" +#include "ocl/check_ocl.hpp" +#include "ocl/preallocated_memory_ocl.hpp" +#include "ocl/argmax_ocl.hpp" +#include "ocl/unsqueeze_ocl.hpp" +#include "ocl/rnn_ocl.hpp" +#include "ocl/rnncell_ocl.hpp" +#include "ocl/padding_ocl.hpp" +#include "ocl/prelu_ocl.hpp" + +class FactoryOCL : public Factory { +public: + std::shared_ptr createConvolution(DataType dt, + ConvolutionParamSpec p, + ActivationParamSpec dwActivationParamSpec, + ActivationParamSpec pwActivationParamSpec) override + { + auto cep = + (Convolution *)(new ConvolutionOCL(dt, p, dwActivationParamSpec, pwActivationParamSpec)); + return std::shared_ptr(cep); + } + + std::shared_ptr createDeconvolution( + DataType dt, ConvolutionParamSpec p, ActivationParamSpec activationDesc) override + { + auto cep = new DeconvolutionOCL(dt, p, activationDesc); + return std::shared_ptr(cep); + } + + std::shared_ptr createPooling(PoolingParamSpec p) override + { + auto cep = (Pooling *)(new PoolingOCL(p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createFullyConnected( + DataType dt, FullyConnectedParamSpec p, U32 numInput) override + { + auto cep = (FullyConnectedOCL *)(new FullyConnectedOCL(dt, p, numInput)); + return std::shared_ptr(cep); + } + + std::shared_ptr createSoftmax(DataType dt, SoftmaxParamSpec p) override + { + auto cep = new SoftmaxOCL(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createConcat(ConcatParamSpec p) override + { + auto cep = (Concat *)(new ConcatOCL(p)); + return std::shared_ptr(cep); + } + + std::shared_ptr 
createActivation(ActivationParamSpec activeDesc) override + { + auto cep = (Activation *)new ActivationOCL(activeDesc); + return std::shared_ptr(cep); + } + + std::shared_ptr createEltwise(EltwiseParamSpec eltwiseDesc) override + { + auto cep = (Eltwise *)new EltwiseOCL(eltwiseDesc); + return std::shared_ptr(cep); + } + + std::shared_ptr createScale(DataType dt, ScaleParamSpec p, int numChannels) override + { + auto cep = (Scale *)(new ScaleOCL(dt, p, numChannels)); + return std::shared_ptr(cep); + } + + std::shared_ptr createPReLU(DataType dt) override + { + auto cep = (PReLU *)(new PReLUOCL(dt)); + return std::shared_ptr(cep); + } + + std::shared_ptr createRNN(DataType dt, RNNParamSpec p) override + { + auto cep = (RNNCell *)(new RNNOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createRNNCell(DataType dt, RNNParamSpec p) override + { + auto cep = (RNNCell *)(new RNNCellOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createEmbedding(DataType dt, EmbedParamSpec p) override + { + auto cep = (Embedding *)new EmbeddingOCL(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createPower(DataType dt, PowerParamSpec p) override + { + auto cep = (Power *)new PowerOCL(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createMatMul(DataType dt, MatMulParamSpec p) override + { + auto cep = (MatMul *)(new MatMulOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createLayerNorm(DataType dt, U32 weightNum) override + { + auto cep = (LayerNorm *)new LayerNormOCL(dt, weightNum); + return std::shared_ptr(cep); + } + + std::shared_ptr createReshape(DataType dt, ReshapeParamSpec p) override + { + auto cep = (Reshape *)(new ReshapeOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createResize(DataType paramDT, ResizeParamSpec p) override + { + // auto cep = new Resize(paramDT, paramPtr); + // OP_UNSUP(2, paramDT, paramPtr); + auto cep = (Resize *)(new ResizeOCL(paramDT, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createSlice(DataType dt, SliceParamSpec p) override + { + auto cep = (Slice *)(new SliceOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createTranspose(DataType dt, TransposeParamSpec p) override + { + auto cep = (Transpose *)(new TransposeOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createAttention(DataType dt, AttentionParamSpec p) override + { + // auto cep = new AttentionOCL(dt, numHeads, fromSequenceLength, toSequenceLength); + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createClip(DataType dt, ClipParamSpec p) override + { + auto cep = (Clip *)(new ClipOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createSqueeze(DataType dt, SqueezeParamSpec p) override + { + auto cep = (Squeeze *)(new SqueezeOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createUnsqueeze(DataType dt, UnsqueezeParamSpec p) override + { + auto cep = (Unsqueeze *)new UnsqueezeOCL(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createReduction(DataType dt, ReductionParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createArgMax(DataType dt, ArgMaxParamSpec p) override + { + auto cep = (ArgMax *)new ArgMaxOCL(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createCopy(DataType dt, CopyParamSpec p) override + { + auto cep = (Copy *)new CopyOCL(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createCheck(DataType 
dt, CheckParamSpec p) override + { + auto cep = (Check *)new CheckOCL(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createRepeat( + DataType dt, RepeatParamSpec p, I32 jumpOperatorIndex, I32 currentOperatorIndex) override + { + auto cep = (Repeat *)new RepeatOCL(dt, p, jumpOperatorIndex, currentOperatorIndex); + return std::shared_ptr(cep); + } + + std::shared_ptr createBilateralSliceApply(BilateralSliceApplyParamSpec p) override + { + auto cep = (BilateralSliceApply *)(new BilateralSliceApplyOCL(p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createPreAllocatedMemory(DataType dt, TensorDesc desc) override + { + auto cep = (PreAllocatedMemory *)new PreAllocatedMemoryOCL(dt, desc); + return std::shared_ptr(cep); + } + + std::shared_ptr createSharedWeight(DataType dt, + TensorDesc desc, + std::string outputTensorName, + std::map> *tensorMapPtr) override + { + auto cep = (SharedWeight *)new SharedWeightOCL(dt, desc, outputTensorName, tensorMapPtr); + return std::shared_ptr(cep); + } + + std::shared_ptr createJump( + DataType dt, I32 jumpOperatorIndex, I32 currentOperatorIndex) override + { + OP_UNSUP(3, dt, jumpOperatorIndex, currentOperatorIndex); + return std::shared_ptr(cep); + } + + std::shared_ptr createSpace2Depth(DataType dt) override + { + auto cep = (Space2Depth *)(new Space2DepthOCL(dt)); + return std::shared_ptr(cep); + } + + std::shared_ptr createDepth2Space(DataType dt, Depth2SpaceParamSpec p) override + { + auto cep = (Depth2Space *)(new Depth2SpaceOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createAttentionMask(DataType dt, AttentionMaskParamSpec p) override + { + OP_UNSUP(2, dt, p) + return std::shared_ptr(cep); + } + + std::shared_ptr createRelativePositionEmbedding(DataType dt, EmbedParamSpec p) override + { + OP_UNSUP(2, dt, p) + return std::shared_ptr(cep); + } + + std::shared_ptr createRelativeShift(DataType dt, RelativeShiftParamSpec p) override + { + OP_UNSUP(2, dt, p) + return std::shared_ptr(cep); + } + + std::shared_ptr createPadding(DataType dt, PadParamSpec p) override + { + auto cep = (Padding *)(new PaddingOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createPriorBox(DataType dt, PriorBoxParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createDetectionOutput(DataType dt, DetectionOutputParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createYolov3DetectionOutput( + DataType dt, Yolov3DetectionOutputParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createChannelResize(DataType dt, ChannelResizeParamSpec p) override + { + auto cep = new ChannelResizeOCL(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createL2Normalization(DataType dt) override + { + OP_UNSUP(1, dt); + return std::shared_ptr(cep); + } + + std::shared_ptr createTile(DataType dt, TileParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createTfSlice(DataType dt, TfSliceParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createSplice(DataType dt, SpliceParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createShape() override + { + OP_UNSUP(0); + return std::shared_ptr(cep); + } +}; +#endif // _FACTORY_OCL_H diff --git a/inference/engine/include/ocl/fully_connected_ocl.hpp 
b/inference/engine/include/ocl/fully_connected_ocl.hpp new file mode 100644 index 00000000..ffcfb29b --- /dev/null +++ b/inference/engine/include/ocl/fully_connected_ocl.hpp @@ -0,0 +1,207 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _FULLY_CONNECTED_OCL_H +#define _FULLY_CONNECTED_OCL_H + +#include "fully_connected.hpp" + +class FullyConnectedOCL : public FullyConnected { +public: + FullyConnectedOCL(DataType dt, FullyConnectedParamSpec p, U32 numInput) + : FullyConnected(dt, p, numInput) + { + setMALIArchInfo(&(this->archInfo), &(this->runInfo), &this->needSetKernelVec, + &this->needSelectKernelLS); + } + + ~FullyConnectedOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr( + new FullyConnectedOCL(this->dt, this->p, this->numInput)); + *mem = *this; + return mem; + } + + EE infer_weight_desc() override + { + TensorDesc weightDesc = tensor2df(this->dt, DF_NORMAL, this->p.num_outputs, this->numInput); + TensorDesc biasDesc = tensor1d(this->dt, this->p.num_outputs); + + Tensor modelWeightTensor = Tensor(OCLMem); + Tensor modelVectorTensor = Tensor(OCLMem); + auto weightMem = (OclMemory *)modelWeightTensor.get_memory(); + auto vectorMem = (OclMemory *)modelVectorTensor.get_memory(); + modelWeightTensor.resize(weightDesc); + modelVectorTensor.resize(biasDesc); + + U32 stride[3] = {this->p.num_outputs, this->numInput, 1}; + U32 offset[3] = {0, 0, 0}; + GCLMemType mt = GCL_MEM_BUF; + MemFlags flags = CL_MEM_READ_WRITE; + GCLMemDesc desc = gclmem_build_desc(); + CHECK_STATUS(gclmem_set_desc_padding(&desc, stride, offset, this->dt, DF_NCHW, mt, flags)); + weightMem->padding(desc); + + stride[0] = (this->p.num_outputs + 3) / 4 * 4; + stride[1] = 1; + stride[2] = 1; + gclmem_set_desc_padding(&desc, stride, offset, this->dt, DF_NHWC, mt, flags); + vectorMem->padding(desc); + this->weightTensors.push_back(modelWeightTensor); + this->biasTensors.push_back(modelVectorTensor); + return SUCCESS; + } + + EE infer_forward_algorithm(std::shared_ptr algorithmMap) override + { + OCLContext::getInstance().handle.get()->kernelVec = &this->opKernelVec; + Tensor inputTensor = this->inputTensors[0]; + Tensor filterTensor = Tensor(OCLMem); + Tensor outputTensor = this->outputTensors[0]; + filterTensor.resize(filterDesc4D); + ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = + CONVOLUTION_ALGORITHM_NULL; + I32 algo[4]; + if 
(algorithmMap->getAlgorithmInfoFromMap(this->name, algo, 4)) { + this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; + this->runInfo.best_w[0] = algo[1]; + this->runInfo.best_c[0] = algo[2]; + this->runInfo.best_k[0] = algo[3]; + } else { + CHECK_STATUS(fully_connected_infer_forward_algorithm( + inputTensor, filterTensor, outputTensor, &this->archInfo)); + algo[0] = this->runInfo.algorithm; + algo[1] = this->runInfo.best_w[0]; + algo[2] = this->runInfo.best_c[0]; + algo[3] = this->runInfo.best_k[0]; + algorithmMap->setAlgorithmInfoToMap(this->name, algo, 4); + } + return SUCCESS; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensor = this->inputTensors[0]; + Tensor weightTensor = this->weightTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + Tensor biasTensor = this->biasTensors[0]; + + CHECK_STATUS(fully_connected( + inputTensor, weightTensor, biasTensor, this->temp, outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + TensorDesc inputDesc = inTensors[0]->get_desc(); + U32 ic, ih, iw; + if (inputDesc.df == DF_NCHW) { + tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, &ih, &iw); + } + if (inputDesc.df == DF_MKT) { + iw = 1; + ih = 1; + ic = inputDesc.dims[1]; + } + filterDesc4D = tensor4df(this->dt, DF_NCHW, this->p.num_outputs, ic, ih, iw); + this->numInput = ic * ih * iw; + Tensor filterTensor = Tensor(OCLMem); + filterTensor.resize(filterDesc4D); + CHECK_STATUS(fully_connected_infer_output_size( + inTensors[0], filterTensor, outTensors[0], &this->archInfo)); + if (this->p.num_slices > 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + Tensor inputTensor = this->inputTensors[0]; + Tensor filterTensor = Tensor(OCLMem); + filterTensor.resize(filterDesc4D); + U32 bytes = 0; + CHECK_STATUS(fully_connected_infer_forward_tmp_bytes( + inputTensor, filterTensor, &bytes, &this->archInfo)); + return bytes; + } + + GCLMemDesc infer_wtm_memory_size_mali() override + { + U32 stride[3] = {0, 0, 0}; + U32 offset[3] = {0, 0, 0}; + GCLMemDesc gclmemWtmDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + U32 bytes = 0; + ((MaliPara_t)(this->archInfo.archPara))->gclmemFilterDesc = &gclmemWtmDesc; + Tensor filterTensor = Tensor(OCLMem); + filterTensor.resize(filterDesc4D); + CHECK_STATUS(fully_connected_transform_filter_bytes(filterTensor, &bytes, &this->archInfo)); + return gclmemWtmDesc; + } + + EE transform_filter() override + { + Tensor inputTensor = this->inputTensors[0]; + Tensor filterTensor = this->weightTensors[0]; + filterTensor.resize(this->filterDesc4D); + auto wtmDesc = this->infer_wtm_memory_size_mali(); + if (this->p.num_slices == 1) { + this->wtm = std::shared_ptr(new Tensor(OCLMem)); + OclMemory *wtmMem = (OclMemory *)this->wtm->get_memory(); + wtmMem->padding(wtmDesc); + wtmMem->alloc(); + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + CHECK_STATUS(fully_connected_transform_filter( + inputTensor, filterTensor, this->wtm.get(), &this->archInfo)); + this->weightTensors[0] = *this->get_wtm(); + auto inputDesc = this->inputTensors[0].get_desc(); + if (inputDesc.df == DF_MKT) { + Tensor biasTensorImg = Tensor(OCLMem); + auto biasMemBuf = (OclMemory *)(biasTensors[0].get_memory()); + auto biasMemImg = (OclMemory *)(biasTensorImg.get_memory()); + GCLMemDesc descBuf = biasMemBuf->get_desc(); + TensorDesc desc = 
tensor4df(descBuf.dt, descBuf.df, descBuf.dims[3], descBuf.dims[2],
+                descBuf.dims[1], descBuf.dims[0]);
+            biasTensorImg.resize(desc);
+            GCLMemDesc descImg = gclmem_build_desc();
+            U32 stride[3] = {(descBuf.stride[0] + 3) / 4, descBuf.stride[1], descBuf.stride[2]};
+            U32 offset[3] = {0, 0, 0};
+            GCLMemType mt = GCL_MEM_IMG_1D;
+            MemFlags flags = CL_MEM_READ_WRITE;
+            CHECK_STATUS(
+                gclmem_set_desc_padding(&descImg, stride, offset, desc.dt, DF_NCHW, mt, flags));
+            // pad the image-backed bias with the 1D-image descriptor built above
+            biasMemImg->padding(descImg);
+            biasMemImg->alloc();
+            biasMemImg->copy_from((Memory *)biasMemBuf);
+            biasTensors[0] = biasTensorImg;
+        }
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+
+private:
+    TensorDesc filterDesc4D;
+
+protected:
+    ForwardRunInfoMali runInfo;
+};
+
+#endif  // _FULLY_CONNECTED_OCL_H
diff --git a/inference/engine/include/ocl/layer_norm_ocl.hpp b/inference/engine/include/ocl/layer_norm_ocl.hpp
new file mode 100644
index 00000000..b6097018
--- /dev/null
+++ b/inference/engine/include/ocl/layer_norm_ocl.hpp
@@ -0,0 +1,87 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _LAYER_NORM_OCL_H
+#define _LAYER_NORM_OCL_H
+
+#include "layer_norm.hpp"
+
+class LayerNormOCL : public LayerNorm {
+public:
+    LayerNormOCL(DataType dt, U32 weightNum) : LayerNorm(dt, weightNum)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~LayerNormOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new LayerNormOCL(this->dt, this->weightNum));
+        *mem = *this;
+        return mem;
+    }
+
+    EE infer_weight_desc() override
+    {
+        auto curOpWs = this->get_weightspec();
+        if (0 != curOpWs.bytes_of_weight) {
+            this->weightNum = curOpWs.bytes_of_weight / bytesOf(curOpWs.mdt);
+        }
+        DataType dtNoQ = (DT_F16_8Q == this->dt) ?
DT_F16 : this->dt; + TensorDesc weightDesc = tensor1d(dtNoQ, this->weightNum); + TensorDesc biasDesc = tensor1d(dtNoQ, this->weightNum); + Tensor modelWeightTensor = Tensor(OCLMem); + Tensor modelBiasTensor = Tensor(OCLMem); + auto weightMem = (OclMemory *)modelWeightTensor.get_memory(); + auto vectorMem = (OclMemory *)modelBiasTensor.get_memory(); + modelWeightTensor.resize(weightDesc); + modelBiasTensor.resize(biasDesc); + U32 stride[3] = {(this->weightNum + 3) / 4 * 4, 1, 1}; + U32 offset[3] = {0, 0, 0}; + GCLMemType mt = GCL_MEM_BUF; + MemFlags flags = CL_MEM_READ_WRITE; + GCLMemDesc desc = gclmem_build_desc(); + CHECK_STATUS(gclmem_set_desc_padding(&desc, stride, offset, dtNoQ, DF_NCHW, mt, flags)); + weightMem->padding(desc); + vectorMem->padding(desc); + this->weightTensors.push_back(modelWeightTensor); + this->biasTensors.push_back(modelBiasTensor); + return SUCCESS; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensor = this->inputTensors[0]; + Tensor weightTensor = this->weightTensors[0]; + Tensor biasTensor = this->biasTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + CHECK_STATUS(layer_normalization( + inputTensor, weightTensor, biasTensor, outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS(normalization_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); + return SUCCESS; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _LAYER_NORM_OCL_H diff --git a/inference/engine/include/ocl/matmul_ocl.hpp b/inference/engine/include/ocl/matmul_ocl.hpp new file mode 100644 index 00000000..f92d499a --- /dev/null +++ b/inference/engine/include/ocl/matmul_ocl.hpp @@ -0,0 +1,97 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
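+
+// MatMulOCL reuses the algorithm-map scheme seen in the convolution wrapper:
+// a 4-entry record (algorithm, best_w, best_c, best_k) keyed by operator name
+// is written after the first tuning run and replayed on later runs.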
+ +#ifndef _MATMUL_OCL_H +#define _MATMUL_OCL_H + +#include "matmul.hpp" + +class MatMulOCL : public MatMul { +public: + MatMulOCL(DataType dt, MatMulParamSpec p) : MatMul(dt, p) + { + setMALIArchInfo(&(this->archInfo), &(this->runInfo), &this->needSetKernelVec, + &this->needSelectKernelLS); + } + + ~MatMulOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new MatMulOCL(this->dt, this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensorA = this->inputTensors[0]; + Tensor inputTensorB = this->inputTensors[1]; + Tensor outputTensor = this->outputTensors[0]; + + CHECK_STATUS(matmul(inputTensorA, this->p.transpose_a, inputTensorB, this->p.transpose_b, + this->temp, outputTensor, &this->archInfo)); + } + + EE infer_forward_algorithm(std::shared_ptr algorithmMap) override + { + OCLContext::getInstance().handle.get()->kernelVec = &this->opKernelVec; + Tensor matrixATensor = this->inputTensors[0]; + Tensor matrixBTensor = this->inputTensors[1]; + Tensor matrixCTensor = this->outputTensors[0]; + ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = + CONVOLUTION_ALGORITHM_NULL; + I32 algo[4]; + if (algorithmMap->getAlgorithmInfoFromMap(this->name, algo, 4)) { + this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; + this->runInfo.best_w[0] = algo[1]; + this->runInfo.best_c[0] = algo[2]; + this->runInfo.best_k[0] = algo[3]; + } else { + CHECK_STATUS(matmul_infer_forward_algorithm(matrixATensor, this->p.transpose_a, + matrixBTensor, this->p.transpose_b, matrixCTensor, &this->archInfo)); + algo[0] = this->runInfo.algorithm; + algo[1] = this->runInfo.best_w[0]; + algo[2] = this->runInfo.best_c[0]; + algo[3] = this->runInfo.best_k[0]; + algorithmMap->setAlgorithmInfoToMap(this->name, algo, 4); + } + return SUCCESS; + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS(matmul_infer_output_size(inTensors[0], this->p.transpose_a, inTensors[1], + this->p.transpose_b, outTensors[0], &this->archInfo)); + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + U32 bytes = 0; + CHECK_STATUS(matmul_infer_forward_tmp_bytes(this->inputTensors[0], this->p.transpose_a, + this->inputTensors[1], this->p.transpose_b, &bytes, &this->archInfo)); + return bytes; + } + + REGISTER_OCL_OPERATOR_RUN + +protected: + ForwardRunInfoMali runInfo; +}; + +#endif // _MATMUL_OCL_H diff --git a/inference/engine/include/ocl/padding_ocl.hpp b/inference/engine/include/ocl/padding_ocl.hpp new file mode 100644 index 00000000..13827052 --- /dev/null +++ b/inference/engine/include/ocl/padding_ocl.hpp @@ -0,0 +1,57 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _PADDING_OCL_H +#define _PADDING_OCL_H + +#include "padding.hpp" + +class PaddingOCL : public Padding { +public: + PaddingOCL(DataType dt, PadParamSpec p) : Padding(dt, p) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~PaddingOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new PaddingOCL(this->dt, this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensor = this->inputTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + CHECK_STATUS(padding(inputTensor, this->p, outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS( + padding_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _PADDING_OCL_H diff --git a/inference/engine/include/ocl/pooling_ocl.hpp b/inference/engine/include/ocl/pooling_ocl.hpp new file mode 100644 index 00000000..53e590db --- /dev/null +++ b/inference/engine/include/ocl/pooling_ocl.hpp @@ -0,0 +1,66 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
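+
+// PoolingOCL: a kernel size of 0x0 means the window spans the whole input
+// (presumably global pooling), so infer_output_tensors_size() first forces
+// the stride to 1x1 via Pooling::set_stride(1, 1).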
+
+#ifndef _POOLING_OCL_H
+#define _POOLING_OCL_H
+
+#include "pooling.hpp"
+
+class PoolingOCL : public Pooling {
+public:
+    PoolingOCL(PoolingParamSpec p) : Pooling(p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~PoolingOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<PoolingOCL> mem = std::shared_ptr<PoolingOCL>(new PoolingOCL(this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        CHECK_STATUS(pooling(
+            this->inputTensors[0], this->p, this->temp, this->outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        if (this->p.kernel_h == 0 && this->p.kernel_w == 0) {
+            Pooling::set_stride(1, 1);
+        }
+        CHECK_STATUS(
+            pooling_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(pooling_infer_forward_tmp_bytes(
+            this->inputTensors[0], this->outputTensors[0], &bytes, &this->archInfo));
+        return bytes;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _POOLING_OCL_H
diff --git a/inference/engine/include/ocl/power_ocl.hpp b/inference/engine/include/ocl/power_ocl.hpp
new file mode 100644
index 00000000..dea8229e
--- /dev/null
+++ b/inference/engine/include/ocl/power_ocl.hpp
@@ -0,0 +1,56 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
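+
+// Reading aid (an assumption from the parameter names, not a statement of the kernel's
+// exact contract): PowerParamSpec is commonly read as the elementwise map
+//
+//   y = (scale * x + shift) ^ power
+//
+// which the power() compute call below evaluates on the GPU.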
+
+#ifndef _POWER_OCL_H
+#define _POWER_OCL_H
+
+#include "power.hpp"
+
+class PowerOCL : public Power {
+public:
+    PowerOCL(DataType dt, PowerParamSpec p) : Power(dt, p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~PowerOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<PowerOCL> mem =
+            std::shared_ptr<PowerOCL>(new PowerOCL(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+
+        CHECK_STATUS(power(inputTensor, this->p, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        CHECK_STATUS(power_infer_output_size(inTensors[0], outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _POWER_OCL_H
diff --git a/inference/engine/include/ocl/preallocated_memory_ocl.hpp b/inference/engine/include/ocl/preallocated_memory_ocl.hpp
new file mode 100644
index 00000000..9b189417
--- /dev/null
+++ b/inference/engine/include/ocl/preallocated_memory_ocl.hpp
@@ -0,0 +1,58 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
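+
+// Behavior sketch (inferred from the implementation below): PreAllocatedMemory emits a
+// constant, engine-owned GPU buffer. It rejects any input tensor (NOT_MATCH) and only
+// resizes its single output to the stored TensorDesc, e.g. a hypothetical state buffer:
+//
+//   TensorDesc desc = tensor2df(DT_F16, DF_NORMAL, 1, 1024);  // illustrative shape
+//   PreAllocatedMemoryOCL op(DT_F16, desc);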
+
+#ifndef _PREALLOCATED_MEMORY_OCL_H
+#define _PREALLOCATED_MEMORY_OCL_H
+
+#include "preallocated_memory.hpp"
+
+class PreAllocatedMemoryOCL : public PreAllocatedMemory {
+public:
+    PreAllocatedMemoryOCL(DataType dt, TensorDesc desc) : PreAllocatedMemory(dt, desc)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~PreAllocatedMemoryOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<PreAllocatedMemoryOCL> mem = std::shared_ptr<PreAllocatedMemoryOCL>(
+            new PreAllocatedMemoryOCL(this->dt, this->desc));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        CHECK_STATUS(preallocated_memory(this->outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        if (inTensors.size() > 0) {
+            CHECK_STATUS(NOT_MATCH);
+        }
+        outTensors[0]->resize(this->desc);
+        CHECK_STATUS(preallocated_memory_infer_output_size(outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _PREALLOCATED_MEMORY_OCL_H
diff --git a/inference/engine/include/ocl/prelu_ocl.hpp b/inference/engine/include/ocl/prelu_ocl.hpp
new file mode 100644
index 00000000..3d784f5a
--- /dev/null
+++ b/inference/engine/include/ocl/prelu_ocl.hpp
@@ -0,0 +1,86 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
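+
+// Weight-layout note (inferred, not normative): a single slope value means the slope is
+// shared across channels (propagate_down == true); per-channel slopes are padded to a
+// multiple of 4 ((weightNum + 3) / 4 * 4) so the buffer lines up with the 4-element
+// vectorized layout the GPU kernels expect.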
+
+#ifndef _PRELU_OCL_H
+#define _PRELU_OCL_H
+
+#include "prelu.hpp"
+
+class PReLUOCL : public PReLU {
+public:
+    PReLUOCL(DataType dt) : PReLU(dt)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~PReLUOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<PReLUOCL> mem = std::shared_ptr<PReLUOCL>(new PReLUOCL(this->dt));
+        *mem = *this;
+        return mem;
+    }
+
+    EE infer_weight_desc() override
+    {
+        auto curOpWs = this->get_weightspec();
+        U32 weightNum = 0;
+        if (curOpWs.weight != nullptr) {
+            weightNum = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt));
+        }
+        if (weightNum == 0) {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+        if (weightNum == 1) {
+            this->preluDesc.propagate_down = true;
+        } else {
+            this->preluDesc.propagate_down = false;
+        }
+        Tensor modelWeightTensor = Tensor(OCLMem);
+        auto weightMem = (OclMemory *)modelWeightTensor.get_memory();
+        TensorDesc weightDesc = tensor1d(this->dt, weightNum);
+        modelWeightTensor.resize(weightDesc);
+
+        U32 stride[3] = {1, 1, 1};
+        U32 offset[3] = {0, 0, 0};
+        stride[0] = (weightNum > 1) ? (weightNum + 3) / 4 * 4 : 1;
+        GCLMemType mt = GCL_MEM_BUF;
+        MemFlags flags = CL_MEM_READ_WRITE;
+        GCLMemDesc desc = gclmem_build_desc();
+        CHECK_STATUS(gclmem_set_desc_padding(&desc, stride, offset, this->dt, DF_NCHW, mt, flags));
+        weightMem->padding(desc);
+        this->weightTensors.push_back(modelWeightTensor);
+        return SUCCESS;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        CHECK_STATUS(prelu(this->inputTensors[0], this->weightTensors[0], this->preluDesc,
+            this->outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        CHECK_STATUS(prelu_infer_output_size(inTensors[0], outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _PRELU_OCL_H
diff --git a/inference/engine/include/ocl/repeat_ocl.hpp b/inference/engine/include/ocl/repeat_ocl.hpp
new file mode 100644
index 00000000..e5d2892b
--- /dev/null
+++ b/inference/engine/include/ocl/repeat_ocl.hpp
@@ -0,0 +1,116 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
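+
+// Control-flow note (based on the implementation below): Repeat steers the engine's
+// scheduling instead of computing data. get_next_operator_index() either jumps back to
+// jumpOperatorIndex to run the body again, or falls through to nextOperatorIndex; an
+// optional second input tensor is copied back from the device and acts as an
+// early-stop flag.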
+
+#ifndef _REPEAT_OCL_H
+#define _REPEAT_OCL_H
+
+#include "repeat.hpp"
+
+class RepeatOCL : public Repeat {
+public:
+    RepeatOCL(DataType dt, RepeatParamSpec p, I32 jumpOperatorIndex, I32 currentOperatorIndex)
+        : Repeat(dt, p, jumpOperatorIndex, currentOperatorIndex)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~RepeatOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<RepeatOCL> mem = std::shared_ptr<RepeatOCL>(
+            new RepeatOCL(this->dt, this->p, this->jumpOperatorIndex, this->nextOperatorIndex - 1));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+    }
+
+    int get_next_operator_index() override
+    {
+        // check status
+        if (this->inputTensors.size() > 1) {
+            Tensor inputTensor = this->inputTensors[1];
+            TensorDesc inputDesc = inputTensor.get_desc();
+            GCLMem_t ptr = (GCLMem_t)(((OclMemory *)(inputTensor.get_memory()))->get_ptr());
+            U32 length = tensorNumElements(inputDesc);
+            DataFormat df = ptr->desc.memFormat;
+            if (df != DF_NCHW) {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+            U32 w_off, h_off;
+            w_off = ptr->desc.offset[0];
+            h_off = ptr->desc.offset[1];
+            if (w_off != 0 || h_off != 0) {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+            I32 *val = hostVal.get();
+            CHECK_STATUS(gcl_trans_memory(OCLContext::getInstance().handle.get(), ptr, val, &length,
+                DEVICE_BUF_TO_HOST, CL_TRUE));
+            for (U32 i = 0; i < length; i++) {
+                // end loop
+                if (val[i]) {
+                    this->iter = 0;
+                    return this->nextOperatorIndex;
+                }
+            }
+        }
+
+        // check loop
+        if (this->iter < this->p.loops) {
+            this->iter++;
+            return this->jumpOperatorIndex;
+        } else {
+            this->iter = 0;
+            return this->nextOperatorIndex;
+        }
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        this->iter = 0;
+        if (this->p.axis >= 0) {
+            int axisIndex = 0;
+            if (inTensors.size() > 2) {
+                axisIndex = 2;
+            } else {
+                UNI_ERROR_LOG("[ERROR] using the axis feature of Repeat requires at least 3 "
+                              "input tensors\n");
+            }
+            TensorDesc desc = inTensors[axisIndex]->get_desc();
+            this->p.loops = desc.dims[desc.nDims - 1 - this->p.axis];
+        }
+        TensorDesc outDesc = outTensors[0]->get_desc();
+        outDesc.dt = this->dt;
+        outDesc.nDims = 0;
+        outTensors[0]->resize(outDesc);
+        auto inTensor = inTensors[1];
+        TensorDesc inDesc = inTensor->get_desc();
+        U32 length = tensorNumElements(inDesc);
+        // allocate `length` host elements (not one I32 valued `length`) so that
+        // get_next_operator_index() can read the whole status tensor back from the device
+        hostVal = std::shared_ptr<I32>(new I32[length], std::default_delete<I32[]>());
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+
+private:
+    std::shared_ptr<I32> hostVal;
+};
+
+#endif // _REPEAT_OCL_H
diff --git a/inference/engine/include/ocl/reshape_ocl.hpp b/inference/engine/include/ocl/reshape_ocl.hpp
new file mode 100644
index 00000000..5c038bab
--- /dev/null
+++ b/inference/engine/include/ocl/reshape_ocl.hpp
@@ -0,0 +1,67 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _RESHAPE_OCL_H
+#define _RESHAPE_OCL_H
+
+#include "reshape.hpp"
+
+class ReshapeOCL : public Reshape {
+public:
+    ReshapeOCL(DataType dt, ReshapeParamSpec p) : Reshape(dt, p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~ReshapeOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<ReshapeOCL> mem =
+            std::shared_ptr<ReshapeOCL>(new ReshapeOCL(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(reshape(inputTensor, this->temp, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        CHECK_STATUS(
+            reshape_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(reshape_infer_forward_tmp_bytes(
+            this->inputTensors[0], this->outputTensors[0], &bytes, &this->archInfo));
+        return bytes;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _RESHAPE_OCL_H
diff --git a/inference/engine/include/ocl/resize_ocl.hpp b/inference/engine/include/ocl/resize_ocl.hpp
new file mode 100644
index 00000000..fa4ab2bb
--- /dev/null
+++ b/inference/engine/include/ocl/resize_ocl.hpp
@@ -0,0 +1,75 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+// IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _RESIZE_OCL_H
+#define _RESIZE_OCL_H
+
+#include "resize.hpp"
+#include "image.h"
+
+class ResizeOCL : public Resize {
+public:
+    ResizeOCL(DataType paramDT, ResizeParamSpec p) : Resize(paramDT, p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~ResizeOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<ResizeOCL> mem =
+            std::shared_ptr<ResizeOCL>(new ResizeOCL(this->paramDT, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(resize(inputTensor, this->temp, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        ResizeDesc resizeDesc;
+        resizeDesc.paramDT = this->paramDT;
+        U32 bytes;
+        switch (paramDT) {
+            case DT_F32: {
+                CHECK_REQUIREMENT(1 == this->p.scales[0] && 1 == this->p.scales[1]);
+                CHECK_STATUS(resize_infer_output_size(inTensors[0], resizeDesc, this->p.scales + 2,
+                    outTensors[0], &bytes, &this->archInfo));
+                break;
+            }
+            case DT_U32: {
+                CHECK_STATUS(resize_infer_output_size(inTensors[0], resizeDesc, this->p.sizes,
+                    outTensors[0], &bytes, &this->archInfo));
+                break;
+            }
+            default: {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+        }
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _RESIZE_OCL_H
diff --git a/inference/engine/include/ocl/rnn_ocl.hpp b/inference/engine/include/ocl/rnn_ocl.hpp
new file mode 100644
index 00000000..4ab6424d
--- /dev/null
+++ b/inference/engine/include/ocl/rnn_ocl.hpp
@@ -0,0 +1,71 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
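+
+// NOTE: the whole-sequence GPU RNN below is wired up but not enabled; its
+// infer_output_tensors_size() returns NOT_SUPPORTED before the shape inference code.
+// The supported per-timestep path is RNNCellOCL (rnncell_ocl.hpp), which this class
+// extends.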
+
+#ifndef _RNN_OCL_H
+#define _RNN_OCL_H
+
+#include "ocl/rnncell_ocl.hpp"
+
+class RNNOCL : public RNNCellOCL {
+public:
+    RNNOCL(DataType dt, RNNParamSpec p) : RNNCellOCL(dt, p)
+    {}
+
+    ~RNNOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<RNNOCL> mem = std::shared_ptr<RNNOCL>(new RNNOCL(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+
+        // NOTE: no clean tmp and output
+        CHECK_STATUS(rnn(inputTensor, this->weightTensors, this->biasTensors, this->p, this->temp,
+            outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        // the early return below intentionally disables this path until the
+        // whole-sequence GPU RNN is supported; the code after it is kept for reference
+        return NOT_SUPPORTED;
+        TensorDesc inDim = inTensors[0]->get_desc();
+
+        DataType dt;
+        DataFormat df;
+        U32 iB, inT, iX;
+        CHECK_STATUS(tensor3dGet(inDim, &dt, &df, &iB, &inT, &iX));
+        this->xDim = iX;
+        CHECK_STATUS(rnn_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(rnn_infer_forward_tmp_bytes(this->inputTensors[0], this->weightTensors[0],
+            this->outputTensors[0], this->p, &bytes, &this->archInfo));
+        return bytes;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _RNN_OCL_H
diff --git a/inference/engine/include/ocl/rnncell_ocl.hpp b/inference/engine/include/ocl/rnncell_ocl.hpp
new file mode 100644
index 00000000..8caf784f
--- /dev/null
+++ b/inference/engine/include/ocl/rnncell_ocl.hpp
@@ -0,0 +1,228 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
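+
+// Tuning-cache note (a reading of the code below, not a spec): the selected GPU
+// algorithm is persisted through AlgorithmMap as I32 records. Four entries
+// (algorithm, best_w, best_c, best_k) describe the main GEMM; when numProjection > 0 a
+// second w/c/k triple for the projection GEMM is appended, giving the 7-entry record.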
+
+#ifndef _RNNCELL_OCL_H
+#define _RNNCELL_OCL_H
+
+#include "rnncell.hpp"
+
+class RNNCellOCL : public RNNCell {
+public:
+    RNNCellOCL(DataType dt, RNNParamSpec p) : RNNCell(dt, p)
+    {
+        setMALIArchInfo(&(this->archInfo), &(this->runInfo), &this->needSetKernelVec,
+            &this->needSelectKernelLS);
+    }
+
+    ~RNNCellOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<RNNCellOCL> mem =
+            std::shared_ptr<RNNCellOCL>(new RNNCellOCL(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        Tensor xTensor = this->inputTensors[0];
+        Tensor stateTensor = this->inputTensors[1];
+        Tensor hTensor = this->outputTensors[0];
+
+        CHECK_STATUS(rnncell(xTensor, this->weightTensors, this->biasTensors, stateTensor, this->p,
+            this->xDim, this->p.numOutput, 0, this->temp, hTensor, &this->archInfo));
+    }
+
+    EE infer_forward_algorithm(std::shared_ptr<AlgorithmMap> algorithmMap) override
+    {
+        OCLContext::getInstance().handle.get()->kernelVec = &this->opKernelVec;
+        Tensor xTensor = this->inputTensors[0];
+        Tensor filterTensor = this->weightTensors[0];
+        Tensor biasTensor = this->biasTensors[0];
+        Tensor hTensor = this->outputTensors[0];
+        ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm =
+            CONVOLUTION_ALGORITHM_NULL;
+        I32 algo[7];
+        U32 algoNum = (this->p.numProjection > 0) ? 7 : 4;
+        if (algorithmMap->getAlgorithmInfoFromMap(this->name, algo, algoNum)) {
+            this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0];
+            this->runInfo.best_w[0] = algo[1];
+            this->runInfo.best_c[0] = algo[2];
+            this->runInfo.best_k[0] = algo[3];
+            if (algoNum == 7) {
+                // the second triple belongs to the projection matmul, so it must be
+                // restored into index 1, mirroring how it is saved below
+                this->runInfo.best_w[1] = algo[4];
+                this->runInfo.best_c[1] = algo[5];
+                this->runInfo.best_k[1] = algo[6];
+            }
+        } else {
+            CHECK_STATUS(rnncell_infer_forward_algorithm(xTensor, filterTensor, biasTensor,
+                this->p, this->xDim, this->p.numOutput, hTensor, &this->archInfo));
+            algo[0] = this->runInfo.algorithm;
+            algo[1] = this->runInfo.best_w[0];
+            algo[2] = this->runInfo.best_c[0];
+            algo[3] = this->runInfo.best_k[0];
+            if (algoNum == 7) {
+                algo[4] = this->runInfo.best_w[1];
+                algo[5] = this->runInfo.best_c[1];
+                algo[6] = this->runInfo.best_k[1];
+            }
+            algorithmMap->setAlgorithmInfoToMap(this->name, algo, algoNum);
+        }
+        return SUCCESS;
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        TensorDesc inDim = inTensors[0]->get_desc();
+        DataType dt;
+        DataFormat df;
+        U32 iB, iX;
+        if (inDim.nDims == 2) {
+            CHECK_STATUS(tensor2dGet(inDim, &dt, &df, &iB, &iX));
+        } else if (inDim.nDims == 3) {
+            dt = inDim.dt;
+            U32 m, k, t;
+            if (inDim.df == DF_MTK) {
+                m = inDim.dims[2];
+                t = inDim.dims[1];
+                k = inDim.dims[0];
+            } else if (inDim.df == DF_MKT) {
+                m = inDim.dims[2];
+                t = inDim.dims[0];
+                k = inDim.dims[1];
+            } else {
+                return NOT_SUPPORTED;
+            }
+            if (t != 1) {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+            iB = m;
+            iX = k;
+        } else {
+            return NOT_SUPPORTED;
+        }
+        this->xDim = iX;
+        CHECK_STATUS(rnncell_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(rnncell_infer_forward_tmp_bytes(this->inputTensors[0], this->weightTensors[0],
+            this->outputTensors[0], this->p, &bytes, &this->archInfo));
+        return bytes;
+    }
+
+    GCLMemDesc infer_wtm_memory_size_mali() override
+    {
+        U32 stride[3] = {0, 0, 0};
+        U32 offset[3] = {0, 0, 0};
+        GCLMemDesc tmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+        GCLMemDesc gclmemWtmDesc[2];
+        gclmemWtmDesc[0] = tmpDesc;
+        gclmemWtmDesc[1] = tmpDesc;
+        U32 bytes = 0;
+        ((MaliPara_t)(this->archInfo.archPara))->gclmemFilterDesc = gclmemWtmDesc;
+        CHECK_STATUS(
+            rnn_transform_filter_bytes(this->weightTensors, this->p, &bytes, &this->archInfo));
+        wtm_pro = std::shared_ptr<Tensor>(new Tensor(OCLMem));
+        OclMemory *wtmMem = (OclMemory *)wtm_pro->get_memory();
+        wtmMem->padding(gclmemWtmDesc[1]);
+        if (this->p.numProjection > 0) {
+            wtm_pro->alloc();
+        }
+        return gclmemWtmDesc[0];
+    }
+
+    EE transform_filter() override
+    {
+        auto wtmDesc = this->infer_wtm_memory_size_mali();
+        this->wtm = std::shared_ptr<Tensor>(new Tensor(OCLMem));
+        OclMemory *wtmMem = (OclMemory *)this->wtm->get_memory();
+        wtmMem->padding(wtmDesc);
+        this->wtm->alloc();
+        std::vector<Tensor> filterTensors;
+        std::vector<Tensor *> ftmTensors;
+        filterTensors.push_back(this->weightTensors[0]);
+        ftmTensors.push_back(this->wtm.get());
+        if (this->p.numProjection > 0) {
+            filterTensors.push_back(this->weightTensors[1]);
+            ftmTensors.push_back(this->wtm_pro.get());
+        }
+        CHECK_STATUS(rnn_transform_filter(filterTensors, this->p, ftmTensors, &this->archInfo));
+        this->weightTensors[0] = *this->get_wtm();
+        if (this->p.numProjection > 0) {
+            this->weightTensors[1] = *wtm_pro.get();
+        }
+        return SUCCESS;
+    }
+
+    EE infer_weight_desc() override
+    {
+        U32 row = this->xDim + this->p.numOutput;
+        U32 column = (this->p.numProjection > 0) ? this->p.numProjection : this->p.numOutput;
+        U32 filterRow = 4 * column;
+        U32 filterCol = this->p.numOutput + this->xDim;
+        TensorDesc weightDesc[2];
+        weightDesc[0] = tensor2df(this->dt, DF_NK, filterRow, filterCol);
+        TensorDesc biasDesc = tensor1d(this->dt, column * 4);
+        U32 weightNum = 1;
+        if (this->p.numProjection > 0) {
+            weightDesc[1] = tensor2df(this->dt, DF_NK, this->p.numOutput, this->p.numProjection);
+            weightNum = 2;
+        }
+
+        for (U32 i = 0; i < weightNum; i++) {
+            Tensor modelWeightTensor = Tensor(OCLMem);
+            modelWeightTensor.resize(weightDesc[i]);
+            auto weightMem = (OclMemory *)modelWeightTensor.get_memory();
+            U32 s0 = (i == 0) ? row : this->p.numProjection;
+            U32 s1 = (i == 0) ? column * 4 : this->p.numOutput;
+            U32 stride[3] = {s0, s1, 1};
+            U32 offset[3] = {0, 0, 0};
+            GCLMemType mt = GCL_MEM_BUF;
+            MemFlags flags = CL_MEM_READ_WRITE;
+            GCLMemDesc desc = gclmem_build_desc();
+            CHECK_STATUS(gclmem_set_desc_padding(&desc, stride, offset, dt, DF_NCHW, mt, flags));
+            weightMem->padding(desc);
+            this->weightTensors.push_back(modelWeightTensor);
+
+            if (i == 0) {
+                Tensor modelBiasTensor = Tensor(OCLMem);
+                auto vectorMem = (OclMemory *)modelBiasTensor.get_memory();
+                modelBiasTensor.resize(biasDesc);
+                stride[0] = column * 4;
+                stride[1] = 1;
+                CHECK_STATUS(gclmem_set_desc_padding(&desc, stride, offset, dt, DF_NCHW, mt, flags));
+                vectorMem->padding(desc);
+                this->biasTensors.push_back(modelBiasTensor);
+            }
+        }
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+
+private:
+    std::shared_ptr<Tensor> wtm_pro;
+
+protected:
+    ForwardRunInfoMali runInfo;
+};
+
+#endif // _RNNCELL_OCL_H
diff --git a/inference/engine/include/ocl/scale_ocl.hpp b/inference/engine/include/ocl/scale_ocl.hpp
new file mode 100644
index 00000000..6a37c588
--- /dev/null
+++ b/inference/engine/include/ocl/scale_ocl.hpp
@@ -0,0 +1,118 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _SCALE_GPU_H
+#define _SCALE_GPU_H
+
+#include "scale.hpp"
+
+class ScaleOCL : public Scale {
+public:
+    ScaleOCL(DataType dt, ScaleParamSpec p, int numChannels) : Scale(dt, p, numChannels)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~ScaleOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<ScaleOCL> mem =
+            std::shared_ptr<ScaleOCL>(new ScaleOCL(this->dt, this->p, this->numChannels));
+        *mem = *this;
+        return mem;
+    }
+
+    EE infer_weight_desc() override
+    {
+        auto curOpWs = this->get_weightspec();
+        if (0 != curOpWs.bytes_of_weight) {
+            this->numChannels = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt));
+        } else if (0 != curOpWs.bytes_of_vec) {
+            this->numChannels = curOpWs.bytes_of_vec / UNI_MAX(1, bytesOf(curOpWs.mdt));
+        } else {
+            this->numChannels = 0;
+        }
+        Tensor modelWeightTensor = Tensor(OCLMem);
+        Tensor modelBiasTensor = Tensor(OCLMem);
+        TensorDesc weightDesc = tensor1d(this->dt, this->numChannels);
+        TensorDesc biasDesc = weightDesc;
+        modelWeightTensor.resize(weightDesc);
+        modelBiasTensor.resize(biasDesc);
+        auto weightMem = (OclMemory *)modelWeightTensor.get_memory();
+        auto vectorMem = (OclMemory *)modelBiasTensor.get_memory();
+
+        U32 stride[3] = {(this->numChannels + 3) / 4 * 4, 1, 1};
+        U32 offset[3] = {0, 0, 0};
+        GCLMemType mt = GCL_MEM_BUF;
+        MemFlags flags = CL_MEM_READ_WRITE;
+        GCLMemDesc desc = gclmem_build_desc();
+        CHECK_STATUS(gclmem_set_desc_padding(&desc, stride, offset, this->dt, DF_NCHW, mt, flags));
+        weightMem->padding(desc);
+        vectorMem->padding(desc);
+        this->weightTensors.push_back(modelWeightTensor);
+        this->biasTensors.push_back(modelBiasTensor);
+        return SUCCESS;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        int inputNum = this->inputTensors.size();
+        Tensor inputTensor = this->inputTensors[this->dataID];
+        Tensor outputTensor = this->outputTensors[0];
+        if (inputNum == 1 && weightTensors.size() == 0) {
+            CHECK_STATUS(NOT_MATCH);
+        }
+
+        if (inputNum > 1) {
+            U32 cNum = this->inputTensors[0].get_desc().dims[2];
+            for (int i = 1; i < inputNum; i++) {
+                if (cNum != this->inputTensors[i].get_desc().dims[2]) {
+                    CHECK_STATUS(NOT_MATCH);
+                }
+            }
+        }
+
+        void *alpha, *beta;
+        if (inputNum == 1) {
+            alpha = ((OclMemory *)(this->weightTensors[0].get_memory()))->get_ptr();
+            beta = ((OclMemory *)(this->biasTensors[0].get_memory()))->get_ptr();
+        } else {
+            alpha = ((OclMemory *)(this->inputTensors[1 - this->dataID].get_memory()))->get_ptr();
+            beta = nullptr;
+        }
+        CHECK_STATUS(scale(inputTensor, alpha, beta, this->p, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        if (inTensors.size() > 1) {
+            U32 len0 = inTensors[0]->length();
+            U32 len1 = inTensors[1]->length();
+            if (len1 > len0) {
+                this->dataID = 1;
+            }
+        }
+        CHECK_STATUS(
+            scale_infer_output_size(inTensors[this->dataID], outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _SCALE_GPU_H
diff --git a/inference/engine/include/ocl/shared_weight_ocl.hpp b/inference/engine/include/ocl/shared_weight_ocl.hpp
new file mode 100644
index 00000000..1a0ce04c
--- /dev/null
+++ b/inference/engine/include/ocl/shared_weight_ocl.hpp
@@ -0,0 +1,134 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _SHARED_WEIGHT_OCL_H
+#define _SHARED_WEIGHT_OCL_H
+
+#include "shared_weight.hpp"
+
+#include "ocl_desc_trans.h"
+#include "ocl_data_trans.h"
+
+class SharedWeightOCL : public SharedWeight {
+public:
+    SharedWeightOCL(DataType dt,
+        TensorDesc desc,
+        std::string outputTensorName,
+        std::map<std::string, std::shared_ptr<Tensor>> *tensorMapPtr)
+        : SharedWeight(dt, desc, outputTensorName, tensorMapPtr)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~SharedWeightOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<SharedWeightOCL> mem = std::shared_ptr<SharedWeightOCL>(
+            new SharedWeightOCL(this->dt, this->desc, this->outputTensorName, tensorMapPtr));
+        *mem = *this;
+        return mem;
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        UNUSED(inTensors);
+        outTensors[0]->resize(this->desc);
+        U32 s0, s1, s2;
+        s0 = this->desc.dims[0];
+        s1 = (this->desc.nDims > 1) ? this->desc.dims[1] : 1;
+        s2 = (this->desc.nDims > 2) ? this->desc.dims[2] : 1;
+        U32 stride[3] = {s0, s1, s2};
+        U32 offset[3] = {0, 0, 0};
+        GCLMemType mt = GCL_MEM_BUF;
+        MemFlags flags = CL_MEM_READ_WRITE;
+        GCLMemDesc gclMemDesc = gclmem_build_desc();
+        CHECK_STATUS(gclmem_set_desc_padding(&gclMemDesc, stride, offset, dt, DF_NCHW, mt, flags));
+        ocl_set_desc(outTensors[0], gclMemDesc);
+        return SUCCESS;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+    }
+
+    EE init_weight_bias_from_model(std::shared_ptr<U8> *modelPtr) override
+    {
+        auto dstTensor = (*this->tensorMapPtr)[this->outputTensorName];
+        auto dstMem = (OclMemory *)(dstTensor->get_memory());
+        GCLMemDesc dstMemDesc = dstMem->get_desc();
+        std::shared_ptr<U8> weight_ptr;
+        auto curOpWs = this->get_weightspec();
+        if (modelPtr) {
+            weight_ptr = *modelPtr;
+        } else {
+            weight_ptr = std::shared_ptr<U8>(curOpWs.weight);
+        }
+        U32 s0, s1, s2;
+        s0 = this->desc.dims[0];
+        s1 = (this->desc.nDims > 1) ? this->desc.dims[1] : 1;
+        s2 = (this->desc.nDims > 2) ? this->desc.dims[2] : 1;
+        this->needTrans = false;
+        if (dstMemDesc.stride[0] == s0 && dstMemDesc.stride[1] == s1 && dstMemDesc.stride[2] == s2) {
+            CpuMemory weight_mem_src;
+            weight_mem_src.resize(this->desc);
+            weight_mem_src.set_shared_ptr(std::shared_ptr<U8>(weight_ptr));
+            dstMem->copy_from((Memory *)&weight_mem_src);
+        } else {
+            this->needTrans = true;
+            this->host_ptr = weight_ptr;
+        }
+        this->weightTensors.push_back(*dstTensor.get());
+        if (modelPtr) {
+            *modelPtr =
+                std::shared_ptr<U8>(*modelPtr, (*modelPtr).get() + tensorNumBytes(this->desc));
+        }
+        return SUCCESS;
+    }
+
+    EE transform_filter() override
+    {
+        if (needTrans) {
+            auto dstTensor = (*this->tensorMapPtr)[this->outputTensorName];
+            auto dstMem = (OclMemory *)(dstTensor->get_memory());
+            GCLMem_t dst = (GCLMem_t)dstMem->get_ptr();
+            auto tempMem = (OclMemory *)(this->temp.get_memory());
+            GCLMem_t temp = (GCLMem_t)tempMem->get_ptr();
+            CHECK_STATUS(ocl_set_input(OCLContext::getInstance().handle.get(), dst, this->desc,
+                host_ptr.get(), temp, true));
+            this->weightTensors[0] = *dstTensor.get();
+        }
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        if (needTrans) {
+            bytes = tensorNumBytes(this->desc);
+        }
+        return bytes;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+
+private:
+    std::shared_ptr<U8> host_ptr;
+    bool needTrans;
+};
+
+#endif // _SHARED_WEIGHT_OCL_H
diff --git a/inference/engine/include/ocl/slice_ocl.hpp b/inference/engine/include/ocl/slice_ocl.hpp
new file mode 100644
index 00000000..b825e6dc
--- /dev/null
+++ b/inference/engine/include/ocl/slice_ocl.hpp
@@ -0,0 +1,54 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+// IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _SLICE_OCL_H
+#define _SLICE_OCL_H
+
+#include "slice.hpp"
+
+class SliceOCL : public Slice {
+public:
+    SliceOCL(DataType dt, SliceParamSpec p) : Slice(dt, p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~SliceOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<SliceOCL> mem = std::shared_ptr<SliceOCL>(new SliceOCL(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        CHECK_STATUS(slice(this->inputTensors[0], this->p, this->outputTensors, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        CHECK_STATUS(slice_infer_output_size(inTensors[0], this->p, outTensors, &this->archInfo));
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _SLICE_OCL_H
diff --git a/inference/engine/include/ocl/softmax_ocl.hpp b/inference/engine/include/ocl/softmax_ocl.hpp
new file mode 100644
index 00000000..7afb03ac
--- /dev/null
+++ b/inference/engine/include/ocl/softmax_ocl.hpp
@@ -0,0 +1,63 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
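+
+// Usage note (inferred): unlike the purely descriptor-level operators in this directory
+// (Space2Depth, Squeeze, Unsqueeze), Softmax needs device scratch space for its
+// reduction, so the engine must honor infer_tmp_memory_size() and call set_tmp_memory()
+// before run().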
+
+#ifndef _SOFTMAX_OCL_H
+#define _SOFTMAX_OCL_H
+
+#include "softmax.hpp"
+
+class SoftmaxOCL : public Softmax {
+public:
+    SoftmaxOCL(DataType dt, SoftmaxParamSpec p) : Softmax(dt, p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~SoftmaxOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<SoftmaxOCL> mem =
+            std::shared_ptr<SoftmaxOCL>(new SoftmaxOCL(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        CHECK_STATUS(softmax(
+            this->inputTensors[0], this->p, this->temp, this->outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        CHECK_STATUS(softmax_infer_output_size(inTensors[0], outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(
+            softmax_infer_forward_tmp_bytes(this->inputTensors[0], &bytes, &this->archInfo));
+        return bytes;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _SOFTMAX_OCL_H
diff --git a/inference/engine/include/ocl/space2depth_ocl.hpp b/inference/engine/include/ocl/space2depth_ocl.hpp
new file mode 100644
index 00000000..ae76f01c
--- /dev/null
+++ b/inference/engine/include/ocl/space2depth_ocl.hpp
@@ -0,0 +1,54 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _SPACE2DEPTH_OCL_H
+#define _SPACE2DEPTH_OCL_H
+
+#include "space2depth.hpp"
+
+class Space2DepthOCL : public Space2Depth {
+public:
+    Space2DepthOCL(DataType dt) : Space2Depth(dt)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~Space2DepthOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Space2DepthOCL> mem =
+            std::shared_ptr<Space2DepthOCL>(new Space2DepthOCL(this->dt));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        CHECK_STATUS(space2depth(this->inputTensors[0], this->outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        CHECK_STATUS(space2depth_infer_output_size(inTensors[0], outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _SPACE2DEPTH_OCL_H
diff --git a/inference/engine/include/ocl/squeeze_ocl.hpp b/inference/engine/include/ocl/squeeze_ocl.hpp
new file mode 100644
index 00000000..7e29a191
--- /dev/null
+++ b/inference/engine/include/ocl/squeeze_ocl.hpp
@@ -0,0 +1,55 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _SQUEEZE_OCL_H
+#define _SQUEEZE_OCL_H
+
+#include "squeeze.hpp"
+
+class SqueezeOCL : public Squeeze {
+public:
+    SqueezeOCL(DataType dt, SqueezeParamSpec p) : Squeeze(dt, p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~SqueezeOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<SqueezeOCL> mem =
+            std::shared_ptr<SqueezeOCL>(new SqueezeOCL(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        CHECK_STATUS(squeeze(this->inputTensors[0], this->outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        CHECK_STATUS(
+            squeeze_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _SQUEEZE_OCL_H
diff --git a/inference/engine/include/ocl/transpose_ocl.hpp b/inference/engine/include/ocl/transpose_ocl.hpp
new file mode 100644
index 00000000..d404f7bc
--- /dev/null
+++ b/inference/engine/include/ocl/transpose_ocl.hpp
@@ -0,0 +1,64 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _TRANSPOSE_OCL_H
+#define _TRANSPOSE_OCL_H
+
+#include "transpose.hpp"
+
+class TransposeOCL : public Transpose {
+public:
+    TransposeOCL(DataType dt, TransposeParamSpec p) : Transpose(dt, p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~TransposeOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<TransposeOCL> mem =
+            std::shared_ptr<TransposeOCL>(new TransposeOCL(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        CHECK_STATUS(transpose(
+            this->inputTensors[0], this->p, this->temp, this->outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        CHECK_STATUS(
+            transpose_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(transpose_infer_forward_tmp_bytes(
+            this->inputTensors[0], this->outputTensors[0], &bytes, &this->archInfo));
+        return bytes;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _TRANSPOSE_OCL_H
diff --git a/inference/engine/include/ocl/unsqueeze_ocl.hpp b/inference/engine/include/ocl/unsqueeze_ocl.hpp
new file mode 100644
index 00000000..03aec120
--- /dev/null
+++ b/inference/engine/include/ocl/unsqueeze_ocl.hpp
@@ -0,0 +1,54 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _UNSQUEEZE_OCL_H
+#define _UNSQUEEZE_OCL_H
+
+#include "unsqueeze.hpp"
+
+class UnsqueezeOCL : public Unsqueeze {
+public:
+    UnsqueezeOCL(DataType dt, UnsqueezeParamSpec p) : Unsqueeze(dt, p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~UnsqueezeOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<UnsqueezeOCL> mem =
+            std::shared_ptr<UnsqueezeOCL>(new UnsqueezeOCL(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        CHECK_STATUS(unsqueeze(this->inputTensors[0], this->outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        CHECK_STATUS(
+            unsqueeze_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _UNSQUEEZE_OCL_H
diff --git a/inference/engine/include/operator.hpp b/inference/engine/include/operator.hpp
new file mode 100644
index 00000000..33b70642
--- /dev/null
+++ b/inference/engine/include/operator.hpp
@@ -0,0 +1,251 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
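+
+// Life-cycle sketch (illustrative; variable names are assumptions): every concrete
+// operator is driven through this base interface in roughly the following order.
+//
+//   std::shared_ptr<Operator> op = someFactory();  // hypothetical factory call
+//   op->set_schedule(MALI);                        // or a CPU Arch value
+//   op->infer_output_tensors_size(ins, outs);      // shape propagation
+//   Tensor tmp;                                    // >= op->infer_tmp_memory_size() bytes
+//   op->set_tmp_memory(tmp);
+//   op->set_input_output_tensors(inVec, outVec);
+//   op->run();                                     // provided via REGISTER_*_OPERATOR_RUN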
+ +#ifndef _OPERATOR_H +#define _OPERATOR_H + +#include +#include "sys.h" +#include "tensor.hpp" +#include "algorithm_map.h" +#include "tensor_computing.h" +#ifdef _USE_MALI +#include "gcl.h" +#include "gcl_engine.h" +#endif + +class Operator { +public: + Operator() + { + this->dt = DT_F32; + this->name = ""; + this->lenOfTemp = 0; + this->archInfo.archPara = nullptr; + } + + Operator(std::string name) + { + this->dt = DT_F32; + this->name = name; + this->lenOfTemp = 0; + this->archInfo.archPara = nullptr; + } + + virtual ~Operator() + { + if (this->archInfo.archPara != nullptr) { + free(this->archInfo.archPara); + this->archInfo.archPara = nullptr; + } + } + + virtual std::shared_ptr clone() = 0; + + virtual EE infer_output_tensors_size(std::vector, std::vector) = 0; + + virtual U32 infer_tmp_memory_size() + { + this->lenOfTemp = 0; + return 0; + } + + virtual void set_tmp_memory(Tensor temp) + { + this->lenOfTemp = temp.bytes(); + this->temp = temp; + } + + virtual void run() = 0; + + virtual void set_input_output_tensors(std::vector it, std::vector ot) + { + this->inputTensors = it; + this->outputTensors = ot; + } + + virtual void set_input_tensors(std::vector it) + { + this->inputTensors = it; + } + + virtual std::vector get_input_tensors() + { + return this->inputTensors; + } + + virtual void set_output_tensors(std::vector ot) + { + this->outputTensors = ot; + } + + virtual std::vector get_output_tensors() + { + return this->outputTensors; + } + + virtual bool can_input_output_the_same() + { + return false; + } + + virtual bool is_weight() + { + return false; + } + + virtual U32 get_len_of_temp() + { + return this->lenOfTemp; + } + + virtual Tensor get_tmp() + { + return this->temp; + } + + virtual void set_name(std::string opName) + { + this->name = opName; + } + + std::string get_name() + { + return this->name; + } + + virtual void set_schedule(Arch opSchedule) + { + this->archInfo.arch = opSchedule; + } + + virtual void set_tensor_positions(std::vector tensorPos) + { + this->tensorPos = tensorPos; + } + + virtual std::vector &get_tensor_positions() + { + return this->tensorPos; + } + + virtual int get_next_operator_index() + { + return -1; + } + + virtual void init_feature_scale(U32 num, QuantSpec *qs) + { +#ifdef _USE_INT8 + if (1 == num && 0 == qs[0].scale[0]) { // OP is labelled as no-quantization + if (DT_F16_8Q == this->dt) { + this->dt = DT_F16; + } + return; + } + featureScale.resize(num); + for (U32 i = 0; i < num; i++) { + featureScale[i].resize(qs[i].num_scale); + memcpy(featureScale[i].data(), qs[i].scale, qs[i].num_scale * bytesOf(DT_F32)); + } +#endif + } + +#ifdef _USE_INT8 + virtual void set_feature_scale(std::vector> fs) + { + this->featureScale = fs; + } + + virtual bool is_dynamic_scale() + { + OperatorType ot = this->get_type(); + if (OT_Conv != ot) { + return false; + } + + U32 numScale = featureScale.size(); + U32 numQuant = (DT_F16_8Q == this->dt) ? 
+
+        if (0 != numScale && 0 == featureScale[0][0]) {  // OP is labelled as no-quantization
+            return false;
+        }
+
+        if (0 != numScale && -2 == (featureScale.back())[0]) {  // OP is labelled as fp-output
+            numScale = 0;
+            numQuant += 1;
+        }
+
+        for (auto tensor : outputTensors) {
+            if (DT_I8 == tensor.get_desc().dt) {
+                numQuant++;
+            }
+        }
+        if (0 == numQuant) {
+            return false;
+        }
+
+        if (0 == numScale) {
+            return true;
+        }
+
+        CHECK_REQUIREMENT(numQuant == numScale);
+        return false;
+    }
+#endif
+
+    virtual bool checkOperator()
+    {
+        for (U32 i = 0; i < inputTensors.size(); i++) {
+            if (!tensorDescIsValid(inputTensors[i].get_desc())) {
+                return false;
+            }
+        }
+        for (U32 i = 0; i < outputTensors.size(); i++) {
+            if (!tensorDescIsValid(outputTensors[i].get_desc())) {
+                return false;
+            }
+        }
+        return true;
+    };
+
+    virtual OperatorType get_type() = 0;
+
+    virtual EE infer_forward_algorithm(std::shared_ptr<AlgorithmMap> algorithmMap)
+    {
+        UNUSED(algorithmMap);
+        return SUCCESS;
+    }
+
+    virtual void set_algorithm_map(std::shared_ptr<AlgorithmMap> algorithmMap)
+    {
+        this->algorithmMap = algorithmMap;
+    }
+
+protected:
+    ArchInfo archInfo;
+    DataType dt;
+
+    std::vector<Tensor> inputTensors;
+    std::vector<Tensor> outputTensors;
+    std::vector<I32> tensorPos;
+
+    U32 lenOfTemp;
+    Tensor temp;
+
+    std::string name;
+    std::vector<std::vector<F32>> featureScale;
+    std::shared_ptr<AlgorithmMap> algorithmMap;
+};
+
+#endif // _OPERATOR_H
diff --git a/inference/engine/include/padding.hpp b/inference/engine/include/padding.hpp
new file mode 100644
index 00000000..aebfa7b0
--- /dev/null
+++ b/inference/engine/include/padding.hpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _PADDING_H
+#define _PADDING_H
+
+#include "operator.hpp"
+
+class Padding : public Operator {
+public:
+    Padding(DataType dt, PadParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Pad;
+    }
+
+protected:
+    PadParamSpec p;
+};
+
+#endif // _PADDING_H
diff --git a/inference/engine/include/pooling.hpp b/inference/engine/include/pooling.hpp
new file mode 100644
index 00000000..1c0f8f7c
--- /dev/null
+++ b/inference/engine/include/pooling.hpp
@@ -0,0 +1,47 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _POOLING_H
+#define _POOLING_H
+
+#include "operator.hpp"
+
+class Pooling : public Operator {
+public:
+    Pooling(PoolingParamSpec p)
+    {
+        this->p = p;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Pooling;
+    }
+
+    void set_kernelSize(U32 globalKernelSizeH, U32 globalKernelSizeW)
+    {
+        this->p.kernel_h = globalKernelSizeH;
+        this->p.kernel_w = globalKernelSizeW;
+    }
+
+    void set_stride(U32 globalStrideH, U32 globalStrideW)
+    {
+        this->p.stride_h = globalStrideH;
+        this->p.stride_w = globalStrideW;
+    }
+
+protected:
+    PoolingParamSpec p;
+};
+
+#endif // _POOLING_H
diff --git a/inference/engine/include/power.hpp b/inference/engine/include/power.hpp
new file mode 100644
index 00000000..91a37389
--- /dev/null
+++ b/inference/engine/include/power.hpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
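One detail worth noting in the Pooling header above: set_kernelSize and set_stride exist so the engine can patch in concrete window sizes later, e.g. for global pooling where the kernel must match an input shape that is only known once shapes have been inferred. A hypothetical use (the kernel_h == 0 convention is an assumption, not shown in this patch):

    Pooling op(p);                // suppose p.kernel_h == 0 marks global pooling
    op.set_kernelSize(inH, inW);  // pool over the whole feature map
    op.set_stride(1, 1);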
+
+#ifndef _POWER_H
+#define _POWER_H
+
+#include "operator.hpp"
+
+class Power : public Operator {
+public:
+    Power(DataType dt, PowerParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Power;
+    }
+
+public:
+    PowerParamSpec p;
+};
+
+#endif // _POWER_H
diff --git a/inference/engine/include/preallocated_memory.hpp b/inference/engine/include/preallocated_memory.hpp
new file mode 100644
index 00000000..6a909c54
--- /dev/null
+++ b/inference/engine/include/preallocated_memory.hpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _PREALLOCATED_MEMORY_H
+#define _PREALLOCATED_MEMORY_H
+
+#include "operator.hpp"
+
+class PreAllocatedMemory : public Operator {
+public:
+    PreAllocatedMemory(DataType dt, TensorDesc desc)
+    {
+        this->dt = dt;
+        this->desc = desc;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_PreAllocatedMemory;
+    }
+
+protected:
+    TensorDesc desc;
+};
+
+#endif // _PREALLOCATED_MEMORY_H
diff --git a/inference/engine/include/prelu.hpp b/inference/engine/include/prelu.hpp
new file mode 100644
index 00000000..0a0e504c
--- /dev/null
+++ b/inference/engine/include/prelu.hpp
@@ -0,0 +1,35 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _PRELU_H
+#define _PRELU_H
+
+#include "weight_operator.hpp"
+
+class PReLU : public WeightOperator {
+public:
+    PReLU(DataType dt)
+    {
+        this->dt = dt;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_PRelu;
+    }
+
+protected:
+    PReLUParamSpec preluDesc;
+};
+
+#endif // _PRELU_H
diff --git a/inference/engine/include/prior_box.hpp b/inference/engine/include/prior_box.hpp
new file mode 100644
index 00000000..4ee39b74
--- /dev/null
+++ b/inference/engine/include/prior_box.hpp
@@ -0,0 +1,54 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _PRIOR_BOX_H
+#define _PRIOR_BOX_H
+
+#include "operator.hpp"
+
+class PriorBox : public Operator {
+public:
+    PriorBox(DataType dt, PriorBoxParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+    }
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem = std::shared_ptr<Operator>(new PriorBox(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_PriorBox;
+    }
+
+    void run() override
+    {
+        CHECK_STATUS(priorbox(this->inputTensors, this->p, this->outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(priorbox_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+protected:
+    PriorBoxParamSpec p;
+};
+#endif // _PRIOR_BOX_H
diff --git a/inference/engine/include/reduction.hpp b/inference/engine/include/reduction.hpp
new file mode 100644
index 00000000..4f3e776d
--- /dev/null
+++ b/inference/engine/include/reduction.hpp
@@ -0,0 +1,83 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _REDUCTION_H
+#define _REDUCTION_H
+
+#include "operator.hpp"
+
+class Reduction : public Operator {
+public:
+    Reduction(DataType dt, ReductionParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+    }
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new Reduction(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Reduction;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        Tensor maskTensor;
+        if (this->inputTensors.size() > 1) {
+            maskTensor = this->inputTensors[1];
+        } else {
+            TensorDesc maskDesc;
+            maskDesc.nDims = 0;
+            maskTensor.resize(maskDesc);
+        }
+
+        CHECK_STATUS(
+            reduction(inputTensor, maskTensor, this->p, this->temp, outputTensor, &this->archInfo));
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(reduction_infer_forward_tmp_bytes(
+            this->inputTensors[0], this->p, this->outputTensors[0], &bytes, &this->archInfo));
+        return bytes;
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        Tensor maskTensor;
+        if (inTensors.size() > 1) {
+            maskTensor = *(inTensors[1]);
+        } else {
+            TensorDesc maskDesc;
+            maskDesc.nDims = 0;
+            maskTensor.resize(maskDesc);
+        }
+        return reduction_infer_output_size(inTensors[0], maskTensor, this->p, outTensors[0]);
+    }
+
+private:
+    ReductionParamSpec p;
+};
+
+#endif
diff --git a/inference/engine/include/relative_position_embedding.hpp b/inference/engine/include/relative_position_embedding.hpp
new file mode 100644
index 00000000..3af4d378
--- /dev/null
+++ b/inference/engine/include/relative_position_embedding.hpp
@@ -0,0 +1,96 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
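The run() method of the RelativePositionEmbedding class below aligns an (input_dim x num_output) weight table with the requested sequence length: longer sequences are zero-padded at the front, shorter ones read only the tail of the table. A standalone illustration of that index arithmetic (values hypothetical):

    #include <cstdio>

    int main()
    {
        unsigned length = 8, input_dim = 6;                                 // sequence longer than the table
        unsigned zeroRows = (length > input_dim) ? length - input_dim : 0;  // rows zero-filled: 2
        unsigned start = (length < input_dim) ? input_dim - length : 0;     // table rows skipped: 0
        unsigned copyLength = (length < input_dim) ? length : input_dim;    // table rows copied: 6
        printf("pad %u rows, copy table rows [%u, %u)\n", zeroRows, start, start + copyLength);
        return 0;
    }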
+
+#ifndef _RELATIVE_POSITION_EMBEDDING_H
+#define _RELATIVE_POSITION_EMBEDDING_H
+
+#include "cpu/embedding_cpu.hpp"
+
+class RelativePositionEmbedding : public EmbeddingCPU {
+public:
+    RelativePositionEmbedding(DataType dt, EmbedParamSpec p) : EmbeddingCPU(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem = std::shared_ptr<Operator>(
+            new RelativePositionEmbedding(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_RelativePositionEmbedding;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor weightTensor;
+        if (this->weightTensors.size() > 0) {
+            weightTensor = this->weightTensors[0];
+        } else {
+            weightTensor = this->inputTensors[1];
+        }
+        Tensor outputTensor = this->outputTensors[0];
+
+        TensorDesc inputDesc = inputTensor.get_desc();
+        U8 *weightPtr = (U8 *)((CpuMemory *)weightTensor.get_memory())->get_ptr();
+        U8 *outputPtr = (U8 *)((CpuMemory *)outputTensor.get_memory())->get_ptr();
+
+        I32 tmpAxis = (this->p.axis + inputDesc.nDims) % inputDesc.nDims;
+        U32 batch = inputDesc.dims[inputDesc.nDims - 1];
+        U32 length = inputDesc.dims[inputDesc.nDims - 1 - tmpAxis];
+        for (U32 in = 0; in < batch; in++) {
+            U8 *ptr = outputPtr + in * length * this->p.num_output * bytesOf(this->dt);
+            if (length > this->p.input_dim) {
+                U32 size = (length - this->p.input_dim) * this->p.num_output * bytesOf(this->dt);
+                memset(ptr, 0, size);
+                ptr += size;
+            }
+            U32 start = 0;
+            U32 copyLength = this->p.input_dim;
+            if (length < this->p.input_dim) {
+                start = this->p.input_dim - length;
+                copyLength = length;
+            }
+            if (this->p.transpose) {
+                for (U32 i = 0; i < copyLength; i++) {
+                    for (U32 j = 0; j < this->p.num_output; j++) {
+                        memcpy(ptr,
+                            weightPtr + (j * this->p.input_dim + start + i) * bytesOf(this->dt),
+                            bytesOf(this->dt));
+                        ptr += bytesOf(this->dt);  // step to the next output element
+                    }
+                }
+            } else {
+                memcpy(ptr, weightPtr + start * this->p.num_output * bytesOf(this->dt),
+                    copyLength * this->p.num_output * bytesOf(this->dt));
+            }
+        }
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        TensorDesc inDim = inTensors[0]->get_desc();
+        I32 tmpAxis = (this->p.axis + inDim.nDims) % inDim.nDims;
+        U32 batch = inDim.dims[inDim.nDims - 1];
+        U32 length = inDim.dims[inDim.nDims - 1 - tmpAxis];
+        TensorDesc outDim = tensor3df(this->dt, DF_MTK, batch, length, this->p.num_output);
+        outTensors[0]->resize(outDim);
+        return SUCCESS;
+    }
+};
+
+#endif // _RELATIVE_POSITION_EMBEDDING_H
diff --git a/inference/engine/include/relative_shift.hpp b/inference/engine/include/relative_shift.hpp
new file mode 100644
index 00000000..7fd0c11f
--- /dev/null
+++ b/inference/engine/include/relative_shift.hpp
@@ -0,0 +1,98 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _RELATIVE_SHIFT_H
+#define _RELATIVE_SHIFT_H
+
+#include "operator.hpp"
+
+class RelativeShift : public Operator {
+public:
+    RelativeShift(DataType dt, RelativeShiftParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+    }
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new RelativeShift(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_RelativeShift;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        U8 *inputPtr = (U8 *)((CpuMemory *)(inputTensor.get_memory()))->get_ptr();
+        U8 *outputPtr = (U8 *)((CpuMemory *)(outputTensor.get_memory()))->get_ptr();
+
+        TensorDesc inputDesc = inputTensor.get_desc();
+        I32 tmpAxis = (this->p.axis + inputDesc.nDims) % inputDesc.nDims;
+        tmpAxis = (I32)inputDesc.nDims - 1 - tmpAxis;
+        U32 length = inputDesc.dims[tmpAxis];
+        if (tmpAxis + 1 >= (I32)inputDesc.nDims) {
+            U32 bytes = inputTensor.bytes();
+            memcpy(outputPtr, inputPtr, bytes);
+            return;
+        }
+        U32 loops = inputDesc.dims[tmpAxis + 1];
+        U32 innerLength = 1;
+        U32 outerLength = 1;
+        for (I32 i = 0; i < tmpAxis; i++) {
+            innerLength *= inputDesc.dims[i];
+        }
+        for (U32 i = tmpAxis + 2; i < inputDesc.nDims; i++) {
+            outerLength *= inputDesc.dims[i];
+        }
+        U32 tileSize = innerLength * bytesOf(inputDesc.dt);
+        U32 chunkSize = length * tileSize;
+        U8 *dstPtr = outputPtr;
+        for (U32 i = 0; i < outerLength; i++) {
+            U8 *srcPtr = inputPtr + i * loops * chunkSize;
+            U32 num =
+                loops * length - (loops - this->p.shift_length) * (this->p.shift_length + length);
+            U32 start = this->p.shift_length * length - num;
+            U32 srcIndex = start * tileSize;
+            memcpy(dstPtr, srcPtr + srcIndex, num * tileSize);
+            dstPtr += num * tileSize;
+            srcIndex += num * tileSize;
+            for (U32 j = this->p.shift_length; j < loops; j++) {
+                memset(dstPtr, 0, this->p.shift_length * tileSize);
+                dstPtr += this->p.shift_length * tileSize;
+                memcpy(dstPtr, srcPtr + srcIndex, chunkSize);
+                dstPtr += chunkSize;
+                srcIndex += chunkSize;
+            }
+        }
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        outTensors[0]->resize(inTensors[0]->get_desc());
+        return SUCCESS;
+    }
+
+private:
+    RelativeShiftParamSpec p;
+};
+
+#endif // _RELATIVE_SHIFT_H
diff --git a/inference/engine/include/repeat.hpp b/inference/engine/include/repeat.hpp
new file mode 100644
index 00000000..7d5e1471
--- /dev/null
+++ b/inference/engine/include/repeat.hpp
@@ -0,0 +1,42 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _REPEAT_H
+#define _REPEAT_H
+
+#include "operator.hpp"
+
+class Repeat : public Operator {
+public:
+    Repeat(DataType dt, RepeatParamSpec p, I32 jumpOperatorIndex, I32 currentOperatorIndex)
+    {
+        this->dt = dt;
+        this->p = p;
+        this->iter = 0;
+        this->jumpOperatorIndex = jumpOperatorIndex;
+        this->nextOperatorIndex = currentOperatorIndex + 1;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Repeat;
+    }
+
+protected:
+    RepeatParamSpec p;
+    int iter;
+    int jumpOperatorIndex;
+    int nextOperatorIndex;
+};
+
+#endif // _REPEAT_H
diff --git a/inference/engine/include/reshape.hpp b/inference/engine/include/reshape.hpp
new file mode 100644
index 00000000..0a8f9f7b
--- /dev/null
+++ b/inference/engine/include/reshape.hpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _RESHAPE_H
+#define _RESHAPE_H
+
+#include "operator.hpp"
+
+class Reshape : public Operator {
+public:
+    Reshape(DataType dt, ReshapeParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Reshape;
+    }
+
+protected:
+    ReshapeParamSpec p;
+};
+
+#endif // _RESHAPE_H
diff --git a/inference/engine/include/resize.hpp b/inference/engine/include/resize.hpp
new file mode 100644
index 00000000..981855b0
--- /dev/null
+++ b/inference/engine/include/resize.hpp
@@ -0,0 +1,41 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _RESIZE_H
+#define _RESIZE_H
+
+#include "operator.hpp"
+
+class Resize : public Operator {
+public:
+    Resize(DataType paramDT, ResizeParamSpec p)
+    {
+        if (paramDT == DT_F32 || paramDT == DT_U32) {
+            this->paramDT = paramDT;
+            this->p = p;
+        } else {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Resize;
+    }
+
+protected:
+    DataType paramDT;
+    ResizeParamSpec p;
+};
+
+#endif // _RESIZE_H
diff --git a/inference/include/result_format.hpp b/inference/engine/include/result_format.hpp
similarity index 79%
rename from inference/include/result_format.hpp
rename to inference/engine/include/result_format.hpp
index 41344e7d..411579cc 100644
--- a/inference/include/result_format.hpp
+++ b/inference/engine/include/result_format.hpp
@@ -1,23 +1,21 @@
 // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
 #ifndef _RESULT_FORMAT_H
 #define _RESULT_FORMAT_H
 
 #include "tensor.hpp"
 
-Vec topK_index(Tensor data, U32 topK);
-
+std::vector<int> topK_index(U8 *res, TensorDesc desc, U32 topK);
 
-#endif //_RESULT_FORMAT_H
+#endif // _RESULT_FORMAT_H
diff --git a/inference/engine/include/rnncell.hpp b/inference/engine/include/rnncell.hpp
new file mode 100644
index 00000000..c898e9a4
--- /dev/null
+++ b/inference/engine/include/rnncell.hpp
@@ -0,0 +1,39 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _RNNCELL_H
+#define _RNNCELL_H
+
+#include "weight_operator.hpp"
+
+class RNNCell : public WeightOperator {
+public:
+    RNNCell(DataType dt, RNNParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+        this->hasBias = false;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_RNN;
+    }
+
+public:
+    RNNParamSpec p;
+    U32 xDim;
+    ClipParamSpec clipParam;
+};
+
+#endif // _RNNCELL_H
diff --git a/inference/engine/include/scale.hpp b/inference/engine/include/scale.hpp
new file mode 100644
index 00000000..e438dfdc
--- /dev/null
+++ b/inference/engine/include/scale.hpp
@@ -0,0 +1,45 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _SCALE_H
+#define _SCALE_H
+
+#include "weight_operator.hpp"
+
+class Scale : public WeightOperator {
+public:
+    Scale(DataType dt, ScaleParamSpec p, int numChannels)
+    {
+        this->dt = dt;
+        this->p = p;
+        this->numChannels = numChannels;
+        this->dataID = 0;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Scale;
+    }
+
+    bool can_input_output_the_same() override
+    {
+        return true;
+    }
+
+protected:
+    ScaleParamSpec p;
+    U32 numChannels;
+    int dataID;
+};
+
+#endif // _SCALE_H
diff --git a/inference/engine/include/sequential.hpp b/inference/engine/include/sequential.hpp
new file mode 100644
index 00000000..28c7c67e
--- /dev/null
+++ b/inference/engine/include/sequential.hpp
@@ -0,0 +1,228 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
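The Sequential class below is the linear-pipeline counterpart of the graph engine, and its intended call order can be sketched as follows (illustrative only; PoolingCPU, the affinity value, and the POOLING_MAX field are assumptions outside this patch):

    std::shared_ptr<Sequential> model(
        new Sequential(AFFINITY_CPU_HIGH_PERFORMANCE, DT_F32, "demo"));
    PoolingParamSpec p;
    p.mode = POOLING_MAX;  // assumed field/enum
    p.kernel_h = p.kernel_w = 2;
    p.stride_h = p.stride_w = 2;
    model->add(std::shared_ptr<Operator>(new PoolingCPU(p)));  // assumed CPU subclass

    std::map<std::string, TensorDesc> inputDescMap;
    inputDescMap["data"] = tensor4df(DT_F32, DF_NCHW, 1, 8, 32, 32);
    model->ready(inputDescMap);  // shape inference + ping-pong output buffers
    model->run();                // inherited from CNN

Note the double buffering in assign_output_tensor: two arenas sized to the largest output are swapped between consecutive ops, so each op reads from one buffer and writes to the other.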
+
+#ifndef _SEQUENTIAL_HPP
+#define _SEQUENTIAL_HPP
+
+#include "sys.h"
+#include "error.h"
+#include "types.h"
+#include "tensor.hpp"
+#include "operator.hpp"
+#include "convolution.hpp"
+#include "fully_connected.hpp"
+#include "cnn.h"
+#include "op_type.h"
+#include "tensor_desc.h"
+#include "sequential.hpp"
+#include "cpu/rnn_cpu.hpp"
+
+class Sequential : public CNN {
+public:
+    Sequential(AffinityPolicy affinityPolicy, DataType dt, std::string name)
+        : CNN(affinityPolicy, dt, name)
+    {}
+
+    void initialize_weight(std::shared_ptr<U8> _modelPtr)
+    {
+        this->modelPtr = _modelPtr;
+    }
+
+    EE infer_output_tensors_size(std::map<std::string, TensorDesc> inputDescMap) override
+    {
+        if (inputDescMap.size() != 1) {
+            return NOT_SUPPORTED;
+        }
+        std::vector<Tensor> inputTensors;
+        std::vector<Tensor *> inputTensorsPtr;
+        std::vector<TensorDesc> inDims;
+        std::vector<Tensor> outputTensors;
+        std::vector<Tensor *> outputTensorsPtr(1);
+        U32 count = 0;
+        for (auto iter : inputDescMap) {
+            Tensor tensor;
+            tensor.resize(iter.second);
+            inputTensors.push_back(tensor);
+            inDims.push_back(iter.second);
+            inputTensorsPtr.push_back(&inputTensors[count]);
+            count++;
+        }
+        this->dimsOp = {inDims};
+        auto num = [](std::vector<TensorDesc> inDims) -> U32 {
+            U32 ret = 0;
+            for (auto d : inDims) {
+                ret += tensorNumElements(d);
+            }
+            return ret;
+        };
+        maxOutputElements = num(inDims);
+
+        count = 0;
+        for (auto op : this->ops) {
+            Tensor tensor;
+            outputTensors.push_back(tensor);
+            outputTensorsPtr[0] = &outputTensors[count];
+            CHECK_STATUS(op->infer_output_tensors_size(inputTensorsPtr, outputTensorsPtr));
+            auto outDesc = outputTensorsPtr[0]->get_desc();
+            std::vector<TensorDesc> outDescVec;
+            outDescVec.push_back(outDesc);
+            dimsOp.push_back(outDescVec);
+            U32 numElements = tensorNumElements(outDesc);
+            if (maxOutputElements < numElements) {
+                maxOutputElements = numElements;
+            }
+            inputTensorsPtr[0] = &outputTensors[count];
+            count++;
+        }
+        return SUCCESS;
+    }
+
+    void assign_output_tensor() override
+    {
+        auto firstPtr = (U8 *)operator new(bytesOf(this->dt) * maxOutputElements);
+        std::shared_ptr<U8> firstSharedPtr(firstPtr);
+        auto secondPtr = (U8 *)operator new(bytesOf(this->dt) * maxOutputElements);
+        std::shared_ptr<U8> secondSharedPtr(secondPtr);
+        for (U32 i = 0; i < this->ops.size(); i++) {
+            auto op = this->ops[i];
+            auto inDims = dimsOp[i];
+            auto outDims = dimsOp[i + 1];
+
+            std::vector<Tensor> inTensors;
+            U32 index = 0;
+            for (auto d : inDims) {
+                auto val =
+                    std::shared_ptr<U8>(firstSharedPtr, (U8 *)firstPtr + index * bytesOf(this->dt));
+                Tensor tensor;
+                tensor.resize(d);
+                ((CpuMemory *)tensor.get_memory())->set_shared_ptr(val);
+                inTensors.push_back(tensor);
+                index += tensorNumElements(d);
+            }
+
+            std::vector<Tensor> outTensors;
+            index = 0;
+            for (auto d : outDims) {
+                auto val = std::shared_ptr<U8>(
+                    secondSharedPtr, (U8 *)secondPtr + index * bytesOf(this->dt));
+                Tensor tensor;
+                tensor.resize(d);
+                ((CpuMemory *)tensor.get_memory())->set_shared_ptr(val);
+                outTensors.push_back(tensor);
+                index += tensorNumElements(d);
+            }
+
+            op->set_input_output_tensors(inTensors, outTensors);
+
+            std::swap(firstPtr, secondPtr);
+            std::swap(firstSharedPtr, secondSharedPtr);
+        }
+    }
+
+    EE ConvBiasAssignmentAndWeightTransform()
+    {
+        return SUCCESS;
+    }
+
+    EE FCBiasAssignmentAndWeight()
+    {
+        return SUCCESS;
+    }
+
+    void ready(std::map<std::string, TensorDesc> inputDescMap) override
+    {
+        for (auto op : this->ops) {
+            op->set_schedule(this->deviceInfo.schedule);
+        }
+        this->infer_output_tensors_size(inputDescMap);
+        this->assign_output_tensor();
+
+        for (auto op : this->ops) {
+            if (op->is_weight()) {
+                if (op->get_type() == OT_Conv) {
+                    auto convOpPtr = dynamic_cast<Convolution *>(op.get());
+                    CHECK_STATUS(convOpPtr->init_weight_bias_from_model(&modelPtr));
+                    CHECK_STATUS(convOpPtr->infer_forward_algorithm(this->algorithmMap));
+                    CHECK_STATUS(convOpPtr->transform_filter());
+                } else if (op->get_type() == OT_FC) {
+                    auto fcOpPtr = dynamic_cast<FullyConnected *>(op.get());
+                    CHECK_STATUS(fcOpPtr->init_weight_bias_from_model(&modelPtr));
+                    CHECK_STATUS(fcOpPtr->transform_filter());
+                } else if (op->get_type() == OT_RNN) {
+                    auto rnnOpPtr = dynamic_cast<RNNCell *>(op.get());
+                    CHECK_STATUS(rnnOpPtr->init_weight_bias_from_model(&modelPtr));
+                    CHECK_STATUS(rnnOpPtr->transform_filter());
+                }
+            }
+        }
+
+        this->infer_tmp_memory_size();
+        this->assign_tmp_tensor();
+    }
+
+    void infer_tmp_memory_size() override
+    {
+        tmpElements.clear();
+        maxTmpElements = 0;
+
+        for (auto op : this->ops) {
+            auto len = op->infer_tmp_memory_size();
+            tmpElements.push_back(len);
+            if (len > maxTmpElements) {
+                maxTmpElements = len;
+            }
+        }
+    }
+
+    void assign_tmp_tensor() override
+    {
+        temp.resize(tensor1d(DT_U8, maxTmpElements));
+        temp.alloc();
+        for (auto op : this->ops) {
+            op->set_tmp_memory(temp);
+        }
+    }
+
+    void add(std::shared_ptr<Operator> op)
+    {
+        this->ops.push_back(op);
+    }
+
+    std::vector<Tensor> get_inputTensors()
+    {
+        auto op = this->ops[0].get();
+        return op->get_input_tensors();
+    }
+
+    std::vector<Tensor> get_output_tensors()
+    {
+        auto len = this->ops.size();
+        auto op = this->ops[len - 1].get();
+        return op->get_output_tensors();
+    }
+
+    void set_input_tensors(std::vector<Tensor> inputTensors)
+    {
+        auto op = this->ops[0].get();
+        op->set_input_tensors(inputTensors);
+    }
+
+private:
+    std::shared_ptr<U8> modelPtr;
+    U32 maxOutputElements;
+    std::vector<std::vector<TensorDesc>> dimsOp;
+    U32 maxTmpElements;
+    std::vector<U32> tmpElements;
+    Tensor temp;
+};
+#endif
diff --git a/inference/engine/include/sequential_ocl.hpp b/inference/engine/include/sequential_ocl.hpp
new file mode 100644
index 00000000..d4843e06
--- /dev/null
+++ b/inference/engine/include/sequential_ocl.hpp
@@ -0,0 +1,230 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
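The SequentialOcl class that follows is the single-operator GPU variant; outputs are mapped back to host memory via mapped_alloc(). A sketch of the expected call order (the PoolingOCL constructor signature and the AFFINITY_GPU value are assumptions, not shown in this patch):

    std::shared_ptr<SequentialOcl> model(new SequentialOcl(AFFINITY_GPU, DT_F16, "demo"));
    PoolingParamSpec p;
    p.kernel_h = p.kernel_w = 2;
    p.stride_h = p.stride_w = 2;
    model->add(std::shared_ptr<Operator>(new PoolingOCL(p)));  // assumed OCL subclass

    std::vector<TensorDesc> inDims(1, tensor4df(DT_F16, DF_NCHW, 1, 4, 16, 16));
    std::shared_ptr<U8> noWeights;  // pooling carries no weights
    model->ready(inDims, noWeights, 1);  // allocates GPU buffers, maps 1 output to host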
+
+#ifdef _USE_MALI
+#ifndef _SEQUENTIAL_OCL_HPP
+#define _SEQUENTIAL_OCL_HPP
+
+#include <string>
+#include "sys.h"
+#include "error.h"
+#include "types.h"
+#include <vector>
+#include "tensor.hpp"
+#include "operator.hpp"
+#include "cnn.h"
+#include "op_type.h"
+#include "tensor_desc.h"
+#include "memory.hpp"
+#include "weight_operator.hpp"
+#include "pooling.hpp"
+#include "convolution.hpp"
+#include "bilateral_slice_apply.hpp"
+#include "ocl/pooling_ocl.hpp"
+#include "memory_ocl.hpp"
+#include "ocl/convolution_ocl.hpp"
+#include "ocl/bilateral_slice_apply_ocl.hpp"
+#include "ocl/fully_connected_ocl.hpp"
+#include "ocl/scale_ocl.hpp"
+
+class SequentialOcl : public CNN {
+public:
+    SequentialOcl(AffinityPolicy affinityPolicy, DataType dt, std::string name)
+        : CNN(affinityPolicy, dt, name)
+    {
+        input_output_same = false;
+    }
+    virtual ~SequentialOcl()
+    {}
+
+    EE ready(std::vector<TensorDesc> dims, std::shared_ptr<U8> modelPtr, U32 numOutput)
+    {
+        this->ops[0]->set_schedule(this->deviceInfo.schedule);
+        input_output_same = this->ops[0]->can_input_output_the_same();
+        CHECK_STATUS(this->infer_output_tensors_size(dims, numOutput));
+        std::vector<Tensor> inTensors;
+        std::vector<Tensor> outTensors;
+        for (U32 i = 0; i < inputTensors.size(); i++) {
+            inTensors.push_back(*inputTensors[i].get());
+        }
+        for (U32 i = 0; i < outputTensors.size(); i++) {
+            outTensors.push_back(*outputTensors[i].get());
+        }
+        this->ops[0]->set_input_output_tensors(inTensors, outTensors);
+        this->ops[0]->set_algorithm_map(this->algorithmMap);
+
+        if (this->ops[0]->is_weight()) {
+            if (this->ops[0]->get_type() == OT_Conv) {
+                auto convOpPtr = dynamic_cast<Convolution *>(this->ops[0].get());
+                auto weightOp = (WeightOperator *)convOpPtr;
+                weightOp->set_hasBias(true);
+                CHECK_STATUS(convOpPtr->init_weight_bias_from_model(&modelPtr));
+                CHECK_STATUS(convOpPtr->infer_forward_algorithm(this->algorithmMap));
+                CHECK_STATUS(convOpPtr->transform_filter());
+            }
+            if (this->ops[0]->get_type() == OT_FC) {
+                auto fcOpPtr = dynamic_cast<FullyConnected *>(this->ops[0].get());
+                auto weightOp = (WeightOperator *)fcOpPtr;
+                weightOp->set_hasBias(true);
+                CHECK_STATUS(fcOpPtr->init_weight_bias_from_model(&modelPtr));
+                CHECK_STATUS(fcOpPtr->transform_filter());
+            }
+            if (this->ops[0]->get_type() == OT_Scale) {
+                auto scaleOpPtr = dynamic_cast<Scale *>(this->ops[0].get());
+                auto weightOp = (WeightOperator *)scaleOpPtr;
+                weightOp->set_hasBias(true);
+                CHECK_STATUS(scaleOpPtr->init_weight_bias_from_model(&modelPtr));
+            }
+        }
+        this->infer_tmp_memory_size();
+        this->assign_tmp_tensor();
+        this->alloc_output_host_tensors(numOutput);
+        return SUCCESS;
+    }
+
+    EE infer_output_tensors_size(std::map<std::string, TensorDesc>) override
+    {
+        return NOT_SUPPORTED;
+    }
+
+    void assign_output_tensor() override
+    {}
+
+    EE infer_output_tensors_size(std::vector<TensorDesc> dims, U32 outputTensorNum)
+    {
+        std::vector<Tensor *> inTensors;
+        std::vector<Tensor *> outTensors;
+        for (U32 i = 0; i < dims.size(); ++i) {
+            std::shared_ptr<Tensor> tmpTensor(new Tensor(OCLMem));
+            tmpTensor->resize(dims[i]);
+            inputTensors.push_back(tmpTensor);
+            inTensors.push_back(inputTensors[i].get());
+        }
+        for (U32 i = 0; i < outputTensorNum; ++i) {
+            std::shared_ptr<Tensor> tmpTensor(new Tensor(OCLMem));
+            outputTensors.push_back(tmpTensor);
+            outTensors.push_back(outputTensors[i].get());
+        }
+
+        CHECK_STATUS(this->ops[0]->infer_output_tensors_size(inTensors, outTensors));
+        for (auto p : inTensors) {
+            p->alloc();
+        }
+        return SUCCESS;
+    }
+
+    EE infer_gclmem_descs(std::map<std::string, TensorDesc>)
+    {
+        return NOT_SUPPORTED;
+    }
+
+    void alloc_output_host_tensors(U32 outputTensorNum)
+    {
+        for (U32 i = 0; i < outputTensorNum; i++) {
+            auto mem = (OclMemory *)outputTensors[i]->get_memory();
+            mem->mapped_alloc();
+        }
+    }
+
+    void infer_tmp_memory_size() override
+    {
+        maxTmpElements = 0;
+        for (auto op : this->ops) {
+            auto len = op->infer_tmp_memory_size();
+            if (len > maxTmpElements) {
+                maxTmpElements = len;
+            }
+        }
+    }
+
+    void assign_tmp_tensor() override
+    {
+        this->temp = Tensor(OCLMem);
+        if (maxTmpElements) {
+            temp.resize(tensor1d(DT_U8, maxTmpElements));
+            temp.alloc();
+        }
+        for (auto op : this->ops) {
+            op->set_tmp_memory(temp);
+        }
+    }
+
+    void add(std::shared_ptr<Operator> op)
+    {
+        this->ops.push_back(op);
+    }
+
+    void mark_input_output()
+    {
+        if (this->deviceInfo.schedule == MALI) {
+            U32 tmpBufSize = 0;
+            for (U32 i = 0; i < inputTensors.size(); i++) {
+                Tensor *inputTensor = inputTensors[i].get();
+                TensorDesc desc = inputTensor->get_desc();
+                U32 size = tensorNumBytes(desc);
+                ArchInfo archInfo;
+                archInfo.arch = MALI;
+                tmpBufSize = (tmpBufSize < size) ? size : tmpBufSize;
+            }
+
+            if (tmpBufSize > maxTmpElements) {
+                maxTmpElements = tmpBufSize;
+            }
+            temp.resize(tensor1d(DT_U8, maxTmpElements));
+            temp.alloc();
+        }
+    }
+
+    void set_input_tensors(std::vector<Tensor> modelInputTensors)
+    {
+        for (U32 i = 0; i < modelInputTensors.size(); i++) {
+            auto hostMem = (CpuMemory *)modelInputTensors[i].get_memory();
+            U8 *hostPtr = (U8 *)hostMem->get_ptr();
+            TensorDesc hostDesc = modelInputTensors[i].get_desc();
+            auto *mem = (OclMemory *)inputTensors[i]->get_memory();
+            GCLMem_t input = (GCLMem_t)mem->get_ptr();
+            auto *tmpmem = (OclMemory *)temp.get_memory();
+            GCLMem_t tmp = (GCLMem_t)tmpmem->get_ptr();
+            CHECK_STATUS(ocl_set_input(this->handle.get(), input, hostDesc, hostPtr, tmp, true));
+        }
+        gcl_finish(this->handle.get());
+    }
+
+    std::vector<std::shared_ptr<Tensor>> get_output_tensors()
+    {
+        return this->outputTensors;
+    }
+
+#ifdef _USE_MALI
+#else
+    EE ConvBiasAssignmentAndWeightTransform()
+    {
+        return SUCCESS;
+    }
+
+    EE FCBiasAssignmentAndWeight()
+    {
+        return SUCCESS;
+    }
+#endif
+
+private:
+    using Model::ready;
+    U32 maxTmpElements;
+    Tensor temp;
+    std::vector<std::shared_ptr<Tensor>> inputTensors;
+    std::vector<std::shared_ptr<Tensor>> outputTensors;
+    bool input_output_same;
+};
+#endif
+#endif
diff --git a/inference/engine/include/shape.hpp b/inference/engine/include/shape.hpp
new file mode 100644
index 00000000..a67d30c0
--- /dev/null
+++ b/inference/engine/include/shape.hpp
@@ -0,0 +1,30 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _SHAPE_H
+#define _SHAPE_H
+
+#include "operator.hpp"
+
+class Shape : public Operator {
+public:
+    Shape()
+    {}
+
+    OperatorType get_type() override
+    {
+        return OT_Shape;
+    }
+};
+
+#endif // _SHAPE_H
diff --git a/inference/engine/include/shared_weight.hpp b/inference/engine/include/shared_weight.hpp
new file mode 100644
index 00000000..d1d0e4d7
--- /dev/null
+++ b/inference/engine/include/shared_weight.hpp
@@ -0,0 +1,43 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _SHARED_WEIGHT_H
+#define _SHARED_WEIGHT_H
+
+#include "weight_operator.hpp"
+
+class SharedWeight : public WeightOperator {
+public:
+    SharedWeight(DataType dt,
+        TensorDesc desc,
+        std::string outputTensorName,
+        std::map<std::string, std::shared_ptr<Tensor>> *tensorMapPtr)
+    {
+        this->dt = dt;
+        this->desc = desc;
+        this->outputTensorName = outputTensorName;
+        this->tensorMapPtr = tensorMapPtr;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_SharedWeight;
+    }
+
+protected:
+    TensorDesc desc;
+    std::string outputTensorName;
+    std::map<std::string, std::shared_ptr<Tensor>> *tensorMapPtr;
+};
+
+#endif // _SHARED_WEIGHT_H
diff --git a/inference/engine/include/slice.hpp b/inference/engine/include/slice.hpp
new file mode 100644
index 00000000..34568a91
--- /dev/null
+++ b/inference/engine/include/slice.hpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _SLICE_H
+#define _SLICE_H
+
+#include "operator.hpp"
+
+class Slice : public Operator {
+public:
+    Slice(DataType dt, SliceParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Slice;
+    }
+
+protected:
+    SliceParamSpec p;
+};
+
+#endif // _SLICE_H
diff --git a/inference/include/softmax.hpp b/inference/engine/include/softmax.hpp
similarity index 75%
rename from inference/include/softmax.hpp
rename to inference/engine/include/softmax.hpp
index cb92aa3b..60cda0fb 100644
--- a/inference/include/softmax.hpp
+++ b/inference/engine/include/softmax.hpp
@@ -1,37 +1,35 @@
 // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
 #ifndef _SOFTMAX_H
 #define _SOFTMAX_H
 
 #include "operator.hpp"
-#include "tensor_computing.h"
 
 class Softmax : public Operator {
 public:
-    explicit Softmax(DataType dt, int axis)
+    explicit Softmax(DataType dt, SoftmaxParamSpec p)
     {
         this->dt = dt;
-        this->axis = axis;
-        this->lenOfTemp = 0;
+        this->p = p;
     }
 
-    OperatorType get_op_type() override
+    OperatorType get_type() override
     {
         return OT_Softmax;
     }
 
+protected:
-    int axis;
+    SoftmaxParamSpec p;
 };
 
-#endif //_SOFTMAX_H
+#endif // _SOFTMAX_H
diff --git a/inference/include/space2depth.hpp b/inference/engine/include/space2depth.hpp
similarity index 78%
rename from inference/include/space2depth.hpp
rename to inference/engine/include/space2depth.hpp
index 5c7874a6..592c4903 100644
--- a/inference/include/space2depth.hpp
+++ b/inference/engine/include/space2depth.hpp
@@ -1,38 +1,32 @@
 // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
 #ifndef _SPACE2DEPTH_H
 #define _SPACE2DEPTH_H
 
 #include "operator.hpp"
 
-class Space2Depth: public Operator
-{
+class Space2Depth : public Operator {
 public:
-    /**
-    @param mode
-    */
     Space2Depth(DataType dt)
     {
         this->dt = dt;
     }
 
-    OperatorType get_op_type() override
+    OperatorType get_type() override
     {
         return OT_Space2Depth;
     }
-
 };
 
-#endif //_SPACE2DEPTH_H
+#endif // _SPACE2DEPTH_H
diff --git a/inference/engine/include/splice.hpp b/inference/engine/include/splice.hpp
new file mode 100644
index 00000000..31b024c3
--- /dev/null
+++ b/inference/engine/include/splice.hpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ +#ifndef _SPLICE_H +#define _SPLICE_H + +#include "weight_operator.hpp" + +class Splice : public WeightOperator { +public: + Splice(DataType dt, SpliceParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_Splice; + } + +protected: + SpliceParamSpec p; +}; + +#endif // _SPLICE_H diff --git a/inference/engine/include/squeeze.hpp b/inference/engine/include/squeeze.hpp new file mode 100644 index 00000000..978b7217 --- /dev/null +++ b/inference/engine/include/squeeze.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _SQUEEZE_H +#define _SQUEEZE_H + +#include "operator.hpp" + +class Squeeze : public Operator { +public: + Squeeze(DataType dt, SqueezeParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_Squeeze; + } + +protected: + SqueezeParamSpec p; +}; + +#endif // _SQUEEZE_H diff --git a/inference/engine/include/tfslice.hpp b/inference/engine/include/tfslice.hpp new file mode 100644 index 00000000..6deb2f1d --- /dev/null +++ b/inference/engine/include/tfslice.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
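The operator headers in this patch all reduce to the same shape: the constructor captures a DataType plus the operator's ParamSpec, and get_type() reports the OperatorType used for dispatch. A minimal sketch of how an engine-side factory can instantiate such wrappers from a parsed parameter spec follows; the createOperator helper and the squeeze_spec/splice_spec field names are illustrative assumptions, not part of this patch.

    // Sketch only: instantiating the thin ParamSpec-carrying operator wrappers.
    // createOperator and the ParameterSpec field names are assumed for illustration.
    std::shared_ptr<Operator> createOperator(OperatorType type, DataType dt, ParameterSpec ps)
    {
        std::shared_ptr<Operator> op;
        switch (type) {
            case OT_Squeeze:
                op = std::shared_ptr<Operator>(new Squeeze(dt, ps.squeeze_spec));
                break;
            case OT_Splice:
                op = std::shared_ptr<Operator>(new Splice(dt, ps.splice_spec));
                break;
            default:
                UNI_ERROR_LOG("unsupported operator type\n");
        }
        return op;
    }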
+ +#ifndef _TFSLICE_H +#define _TFSLICE_H + +#include "operator.hpp" + +class TfSlice : public Operator { +public: + TfSlice(DataType dt, TfSliceParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_TfSlice; + } + +protected: + TfSliceParamSpec p; +}; + +#endif // _TFSLICE_H diff --git a/inference/engine/include/tile.hpp b/inference/engine/include/tile.hpp new file mode 100644 index 00000000..c1cc34ed --- /dev/null +++ b/inference/engine/include/tile.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _TILE_H +#define _TILE_H + +#include "operator.hpp" + +class Tile : public Operator { +public: + Tile(DataType dt, TileParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_Tile; + } + +protected: + TileParamSpec p; +}; + +#endif // _TILE_H diff --git a/inference/engine/include/transpose.hpp b/inference/engine/include/transpose.hpp new file mode 100644 index 00000000..0de95a88 --- /dev/null +++ b/inference/engine/include/transpose.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef _TRANSPOSE_H +#define _TRANSPOSE_H + +#include "operator.hpp" + +class Transpose : public Operator { +public: + Transpose(DataType dt, TransposeParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_Transpose; + } + +protected: + TransposeParamSpec p; +}; + +#endif // _TRANSPOSE_H diff --git a/inference/engine/include/unsqueeze.hpp b/inference/engine/include/unsqueeze.hpp new file mode 100644 index 00000000..8f0e82c8 --- /dev/null +++ b/inference/engine/include/unsqueeze.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _UNSQUEEZE_H +#define _UNSQUEEZE_H + +#include "operator.hpp" + +class Unsqueeze : public Operator { +public: + Unsqueeze(DataType dt, UnsqueezeParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_Unsqueeze; + } + +protected: + UnsqueezeParamSpec p; +}; + +#endif // _UNSQUEEZE_H diff --git a/inference/engine/include/weight_operator.hpp b/inference/engine/include/weight_operator.hpp new file mode 100644 index 00000000..83e94f29 --- /dev/null +++ b/inference/engine/include/weight_operator.hpp @@ -0,0 +1,195 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
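The WeightOperator base class defined next centralizes weight and bias bookkeeping so that a layer with parameters only has to describe its weight tensors. A hypothetical subclass illustrating that contract (the MyScale name and single-vector weight layout are invented for the sketch; run() and output shape inference are omitted for brevity):

    // Hypothetical subclass: declare weight descriptors in infer_weight_desc();
    // the base class then maps the packed model bytes onto them.
    class MyScale : public WeightOperator {
    public:
        MyScale(DataType dt, U32 channels)
        {
            this->dt = dt;
            this->channels = channels;
        }

        OperatorType get_type() override
        {
            return OT_Scale;  // an existing type id, reused for the sketch
        }

        EE infer_weight_desc() override
        {
            Tensor weight;
            weight.resize(tensor1d(this->dt, this->channels));
            this->weightTensors = {weight};
            return SUCCESS;
        }

    protected:
        U32 channels;
    };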
+ +#ifndef _WEIGHTOPERATOR_H +#define _WEIGHTOPERATOR_H + +#include "operator.hpp" +#include "model_tools.h" + +class WeightOperator : public Operator { +public: + WeightOperator() + { + this->hasBias = false; + this->lenOfWtm = 0; + + this->ws.mdt = DT_U8; + this->ws.bytes_of_weight = 0; + this->ws.weight = nullptr; + this->ws.bytes_of_vec = 0; + this->ws.vec = nullptr; + } + + bool is_weight() override + { + return true; + } + + U32 get_weight_size() + { + U32 ret = 0; + for (auto tensor : this->weightTensors) { + TensorDesc desc = tensor.get_desc(); + ret += tensorNumBytes(desc); + } + return ret; + } + + virtual void set_weight_tensors(std::vector<Tensor> weightTensors) + { + this->weightTensors = weightTensors; + } + + virtual std::vector<Tensor> get_weight_tensors() + { + return this->weightTensors; + } + + virtual void set_bias_tensors(std::vector<Tensor> biasTensors) + { + this->biasTensors = biasTensors; + } + + virtual std::vector<Tensor> get_bias_tensors() + { + return this->biasTensors; + } + + virtual U32 infer_wtm_memory_size() + { + this->lenOfWtm = 0; + this->wtm = std::shared_ptr<Tensor>(); + return 0; + } + +#ifdef _USE_MALI + virtual GCLMemDesc infer_wtm_memory_size_mali() + { + this->lenOfWtm = 0; + this->wtm = std::shared_ptr<Tensor>(); + U32 stride[3] = {0, 0, 0}; + U32 offset[3] = {0, 0, 0}; + GCLMemDesc tmpdesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + return tmpdesc; + } +#endif + + virtual void set_wtm_memory(U32 len, Tensor wtm) + { + this->lenOfWtm = len; + this->temp = wtm; + } + + virtual U32 get_lenOfWtm() + { + return this->lenOfWtm; + } + + virtual Tensor *get_wtm() + { + return this->wtm.get(); + } + + virtual void set_weightspec_ptr(WeightSpec ws) + { + this->ws = ws; + } + + virtual WeightSpec get_weightspec() + { + return this->ws; + } + + virtual void set_hasBias(bool hasBiasOrNot) + { + this->hasBias = hasBiasOrNot; + } + + virtual EE infer_weight_desc() + { + return SUCCESS; + } + + virtual EE init_weight_bias_from_model(std::shared_ptr<U8> *modelPtr) + { + EE ret = this->infer_weight_desc(); + if (ret != SUCCESS) { + return ret; + } + auto curOpWs = this->get_weightspec(); + CpuMemory weight_mem_src, bias_mem_src; + std::shared_ptr<U8> weight_ptr, bias_ptr; + if (modelPtr != nullptr) { + weight_ptr = *modelPtr; + bias_ptr = *modelPtr; + } else { + weight_ptr = std::shared_ptr<U8>(curOpWs.weight); + bias_ptr = std::shared_ptr<U8>(curOpWs.vec); + } + + U32 weight_offset = 0; + for (auto weight_tensor : this->weightTensors) { + TensorDesc desc = weight_tensor.get_desc(); + auto weight_mem_dst = weight_tensor.get_memory(); + weight_mem_src.resize(desc); + weight_mem_src.set_shared_ptr( + std::shared_ptr<U8>(weight_ptr, weight_ptr.get() + weight_offset)); + weight_mem_dst->reuse(&weight_mem_src); + weight_offset += tensorNumBytes(desc); + } + + U32 bias_offset = (modelPtr != nullptr) ? 
weight_offset : 0; + if (this->hasBias) { + for (auto bias_tensor : this->biasTensors) { + TensorDesc desc = bias_tensor.get_desc(); + auto bias_mem_dst = bias_tensor.get_memory(); + bias_mem_src.resize(desc); + bias_mem_src.set_shared_ptr( + std::shared_ptr<U8>(bias_ptr, bias_ptr.get() + bias_offset)); + bias_mem_dst->reuse(&bias_mem_src); + bias_offset += tensorNumBytes(desc); + } + } else { + for (auto bias_tensor : this->biasTensors) { + TensorDesc desc = bias_tensor.get_desc(); + auto bias_mem_dst = bias_tensor.get_memory(); + bias_mem_src.resize(desc); + bias_mem_src.alloc(); + U8 *tmp = (U8 *)bias_mem_src.get_ptr(); + memset(tmp, 0, bias_mem_src.bytes()); + bias_mem_dst->reuse(&bias_mem_src); + } + } + if (modelPtr != nullptr) { + *modelPtr = std::shared_ptr<U8>(bias_ptr, bias_ptr.get() + bias_offset); + } + return SUCCESS; + } + + virtual EE transform_filter() + { + return SUCCESS; + } + +protected: + std::vector<Tensor> weightTensors; + std::vector<Tensor> biasTensors; + bool hasBias; + + U32 lenOfWtm; + std::shared_ptr<Tensor> wtm; + WeightSpec ws; +}; + +#endif // _WEIGHTOPERATOR_H diff --git a/inference/engine/include/yolov3_detection_output.hpp b/inference/engine/include/yolov3_detection_output.hpp new file mode 100644 index 00000000..1c4f6188 --- /dev/null +++ b/inference/engine/include/yolov3_detection_output.hpp @@ -0,0 +1,57 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
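The init_weight_bias_from_model() implementation above relies on std::shared_ptr's aliasing constructor: each tensor receives a pointer into the packed weight blob while sharing ownership of the whole allocation, so no bytes are copied and the blob lives as long as any slice of it. A self-contained illustration of that C++ idiom:

    #include <cstdio>
    #include <memory>

    int main()
    {
        // one packed allocation, like a model's weight blob
        std::shared_ptr<unsigned char> blob(
            new unsigned char[64], std::default_delete<unsigned char[]>());
        // aliasing constructor: points 16 bytes in, but shares ownership with blob
        std::shared_ptr<unsigned char> view(blob, blob.get() + 16);
        printf("use_count=%ld\n", (long)blob.use_count());  // prints 2
        return 0;
    }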
+ +#ifndef _YOLOV3_DETECTION_OUTPUT_H +#define _YOLOV3_DETECTION_OUTPUT_H + +#include "operator.hpp" + +class Yolov3DetectionOutput : public Operator { +public: + Yolov3DetectionOutput(DataType dt, Yolov3DetectionOutputParamSpec p) + { + this->dt = dt; + this->p = p; + } + + std::shared_ptr<Operator> clone() override + { + std::shared_ptr<Yolov3DetectionOutput> mem = + std::shared_ptr<Yolov3DetectionOutput>(new Yolov3DetectionOutput(this->dt, this->p)); + *mem = *this; + return mem; + } + + OperatorType get_type() override + { + return OT_Yolov3DetectionOutput; + } + + void run() override + { + CHECK_STATUS(yolov3detectionoutput( + this->inputTensors, this->p, this->outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override + { + CHECK_STATUS(yolov3detectionoutput_infer_output_size( + inTensors, this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } + +protected: + Yolov3DetectionOutputParamSpec p; +}; +#endif // _YOLOV3_DETECTION_OUTPUT_H diff --git a/inference/engine/src/BoltModel_Jni.cpp b/inference/engine/src/BoltModel_Jni.cpp new file mode 100644 index 00000000..143136a6 --- /dev/null +++ b/inference/engine/src/BoltModel_Jni.cpp @@ -0,0 +1,558 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ +#ifdef _USE_JNI +#include <jni.h> +#include "cnn.h" +#include "BoltModel.h" +#include "../api/c/bolt.h" + +struct ModelHandleInfo { + void *cnn; + DEVICE_TYPE deviceType; + void *algoPath; + bool useFileStream; +}; + +typedef struct { + U32 dims[4] = {0}; + char name[NAME_LEN] = {0}; + DataType dt; + DataFormat df; + void *dataPtr; +} DataDesc; + +typedef struct { + U32 num_outputs; + DataDesc *outputArr; + DEVICE_TYPE deviceType; +} ResultHandleInner; + +AFFINITY_TYPE str2AFFINITY_TYPE(std::string affinity_str) +{ + AFFINITY_TYPE ret = CPU_HIGH_PERFORMANCE; + if (affinity_str == "CPU_AFFINITY_HIGH_PERFORMANCE") { + ret = CPU_HIGH_PERFORMANCE; + } else if (affinity_str == "CPU_AFFINITY_LOW_POWER") { + ret = CPU_LOW_POWER; + } else if (affinity_str == "GPU") { + ret = GPU; + } else { + UNI_ERROR_LOG("unsupported JNI CPU affinity setting %s\n", affinity_str.c_str()); + } + return ret; +} + +DEVICE_TYPE str2DEVICE_TYPE(std::string device_str) +{ + DEVICE_TYPE ret = CPU_ARM_V8; + if (device_str == "CPU_ARM_V7") { + ret = CPU_ARM_V7; + } else if (device_str == "CPU_ARM_V8") { + ret = CPU_ARM_V8; + } else if (device_str == "CPU_ARM_A55") { + ret = CPU_ARM_A55; + } else if (device_str == "CPU_ARM_A76") { + ret = CPU_ARM_A76; + } else if (device_str == "GPU_MALI") { + ret = GPU_MALI; + } else if (device_str == "CPU_X86_AVX2") { + ret = CPU_X86_AVX2; + } else if (device_str == "CPU_SERIAL") { + ret = CPU_SERIAL; + } else { + UNI_ERROR_LOG("unsupported JNI device setting %s\n", device_str.c_str()); + } + return ret; +} + +DATA_TYPE str2DATA_TYPE(std::string data_type) +{ + DATA_TYPE ret = FP_32; + if (data_type == "FP32") { + ret = FP_32; +#ifdef __aarch64__ + } else if (data_type == "FP16") { + ret = FP_16; +#endif + } else if (data_type == "INT32") { + ret = INT_32; + } else if (data_type == "UINT32") { + ret = UINT_32; + } else { + UNI_ERROR_LOG("unsupported JNI data type setting %s\n", data_type.c_str()); + } + return ret; +} + +DATA_FORMAT str2DATA_FORMAT(std::string data_format) +{ + DATA_FORMAT ret = NCHW; + if (data_format == "NCHW") { + ret = NCHW; + } else if (data_format == "NHWC") { + ret = NHWC; + } else if (data_format == "MTK") { + ret = MTK; + } else if (data_format == "NORMAL") { + ret = NORMAL; + } else { + UNI_ERROR_LOG("unsupported JNI data format setting %s\n", data_format.c_str()); + } + return ret; +} + +std::string DataFormat2str(DataFormat data_format) +{ + std::string ret = "NCHW"; + switch (data_format) { + case DF_NCHW: + ret = "NCHW"; + break; + case DF_NCHWC8: + ret = "NCHWC8"; + break; + case DF_NHWC: + ret = "NHWC"; + break; + case DF_MTK: + ret = "MTK"; + break; + case DF_NORMAL: + ret = "NORMAL"; + break; + default: + UNI_ERROR_LOG("unsupported JNI data format setting %d\n", data_format); + } + return ret; +} + +extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_createModel)( + JNIEnv *env, jobject, jstring modelPath, jstring affinity) +{ + const char *modelPathPtr = env->GetStringUTFChars(modelPath, JNI_FALSE); + const char *affinityPtr = env->GetStringUTFChars(affinity, JNI_FALSE); + std::string affinity_str = (std::string)affinityPtr; + AFFINITY_TYPE affinity_cur = str2AFFINITY_TYPE(affinity_str); + long modelAddr = (long)CreateModel(modelPathPtr, affinity_cur, NULL); + ModelHandleInfo *ihInfo = (ModelHandleInfo *)modelAddr; + if (nullptr == ihInfo->cnn) { + UNI_ERROR_LOG("Bolt instance not created\n"); + modelAddr = 0; + } + env->ReleaseStringUTFChars(modelPath, modelPathPtr); + env->ReleaseStringUTFChars(affinity, affinityPtr); + return modelAddr; +} + 
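BoltModel_createModel above hands the native ModelHandleInfo pointer to Java as a jlong, and every later entry point casts that integer back. The whole JNI surface rests on the pointer surviving this round trip, which holds because jlong is a 64-bit type. A standalone check of the invariant, with int64_t standing in for jlong:

    #include <cassert>
    #include <cstdint>

    struct ModelHandleInfo;  // opaque on the Java side

    int main()
    {
        ModelHandleInfo *p = reinterpret_cast<ModelHandleInfo *>(0x1234);
        int64_t asJlong = reinterpret_cast<int64_t>(p);                       // native -> Java
        ModelHandleInfo *back = reinterpret_cast<ModelHandleInfo *>(asJlong); // Java -> native
        assert(back == p);  // holds whenever a 64-bit integer can hold a pointer
        return 0;
    }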
+extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_cloneModel)( + JNIEnv *env, jobject, jlong modelAddr) +{ + ModelHandle handle = (ModelHandle)modelAddr; + ModelHandle cloneHandle = CloneModel(handle); + long ret = (long)cloneHandle; + return ret; +} + +void getInputParameters(JNIEnv *env, + jint num, + jobjectArray input_names, + char ***data_name_ptr, + jintArray n, + int **data_n_ptr, + jintArray c, + int **data_c_ptr, + jintArray h, + int **data_h_ptr, + jintArray w, + int **data_w_ptr, + jobjectArray dt_input, + DATA_TYPE **data_dt_ptr, + jobjectArray df_input, + DATA_FORMAT **data_df_ptr) +{ + if (env->GetArrayLength(input_names) != num) { + UNI_ERROR_LOG("input name array length %d is not equal to input num %d\n", + env->GetArrayLength(input_names), num); + } + if (env->GetArrayLength(n) != num) { + UNI_ERROR_LOG( + "input N array length %d is not equal to input num %d\n", env->GetArrayLength(n), num); + } + if (env->GetArrayLength(c) != num) { + UNI_ERROR_LOG( + "input C array length %d is not equal to input num %d\n", env->GetArrayLength(c), num); + } + if (env->GetArrayLength(h) != num) { + UNI_ERROR_LOG( + "input H array length %d is not equal to input num %d\n", env->GetArrayLength(h), num); + } + if (env->GetArrayLength(w) != num) { + UNI_ERROR_LOG( + "input W array length %d is not equal to input num %d\n", env->GetArrayLength(w), num); + } + if (env->GetArrayLength(dt_input) != num) { + UNI_ERROR_LOG("input DataType array length %d is not equal to input num %d\n", + env->GetArrayLength(dt_input), num); + } + if (env->GetArrayLength(df_input) != num) { + UNI_ERROR_LOG("input DataFormat array length %d is not equal to input num %d\n", + env->GetArrayLength(df_input), num); + } + int *data_n = (int *)malloc(num * sizeof(int)); + int *data_c = (int *)malloc(num * sizeof(int)); + int *data_h = (int *)malloc(num * sizeof(int)); + int *data_w = (int *)malloc(num * sizeof(int)); + char **data_name = (char **)malloc(num * sizeof(char *)); + DATA_TYPE *data_dt = (DATA_TYPE *)malloc(num * sizeof(DATA_TYPE)); + DATA_FORMAT *data_df = (DATA_FORMAT *)malloc(num * sizeof(DATA_FORMAT)); + jint *curArray_n = env->GetIntArrayElements(n, 0); + jint *curArray_c = env->GetIntArrayElements(c, 0); + jint *curArray_h = env->GetIntArrayElements(h, 0); + jint *curArray_w = env->GetIntArrayElements(w, 0); + for (int i = 0; i < num; i++) { + data_n[i] = curArray_n[i]; + data_c[i] = curArray_c[i]; + data_h[i] = curArray_h[i]; + data_w[i] = curArray_w[i]; + + jstring cur_str = (jstring)(env->GetObjectArrayElement(input_names, i)); + const char *cur_str_ptr = env->GetStringUTFChars(cur_str, 0); + int length = strlen(cur_str_ptr); + data_name[i] = (char *)malloc(sizeof(char) * (length + 1)); + UNI_memcpy(data_name[i], cur_str_ptr, length); + data_name[i][length] = '\0'; + + jstring tmp_str_dt = (jstring)(env->GetObjectArrayElement(dt_input, i)); + const char *tmp_str_dt_ptr = env->GetStringUTFChars(tmp_str_dt, 0); + std::string cur_tmp_str_dt = tmp_str_dt_ptr; + data_dt[i] = str2DATA_TYPE(cur_tmp_str_dt); + + jstring tmp_str_df = (jstring)(env->GetObjectArrayElement(df_input, i)); + const char *tmp_str_df_ptr = env->GetStringUTFChars(tmp_str_df, 0); + std::string cur_tmp_str_df = tmp_str_df_ptr; + data_df[i] = str2DATA_FORMAT(cur_tmp_str_df); + + env->ReleaseStringUTFChars(cur_str, cur_str_ptr); + env->DeleteLocalRef(cur_str); + env->ReleaseStringUTFChars(tmp_str_dt, tmp_str_dt_ptr); + env->ReleaseStringUTFChars(tmp_str_df, tmp_str_df_ptr); + env->DeleteLocalRef(tmp_str_dt); + 
env->DeleteLocalRef(tmp_str_df); + } + env->ReleaseIntArrayElements(n, curArray_n, 0); + env->ReleaseIntArrayElements(c, curArray_c, 0); + env->ReleaseIntArrayElements(h, curArray_h, 0); + env->ReleaseIntArrayElements(w, curArray_w, 0); + *data_name_ptr = data_name; + *data_n_ptr = data_n; + *data_c_ptr = data_c; + *data_h_ptr = data_h; + *data_w_ptr = data_w; + *data_dt_ptr = data_dt; + *data_df_ptr = data_df; +} + +extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_prepareModel)(JNIEnv *env, + jobject, + jlong modelAddr, + jint num_input, + jobjectArray input_names, + jintArray n, + jintArray c, + jintArray h, + jintArray w, + jobjectArray dt_input, + jobjectArray df_input) +{ + ModelHandle ih = (ModelHandle)modelAddr; + char **data_name = nullptr; + int *data_n = nullptr; + int *data_c = nullptr; + int *data_h = nullptr; + int *data_w = nullptr; + DATA_TYPE *data_dt = nullptr; + DATA_FORMAT *data_df = nullptr; + getInputParameters(env, num_input, input_names, &data_name, n, &data_n, c, &data_c, h, &data_h, + w, &data_w, dt_input, &data_dt, df_input, &data_df); + + PrepareModel(ih, num_input, data_name, data_n, data_c, data_h, data_w, data_dt, data_df); + + free(data_n); + free(data_c); + free(data_h); + free(data_w); + for (int i = 0; i < num_input; i++) { + free(data_name[i]); + } + free(data_name); + free(data_dt); + free(data_df); +} + +extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_resizeModelInput)(JNIEnv *env, + jobject, + jlong modelAddr, + jint num_input, + jobjectArray input_names, + jintArray n, + jintArray c, + jintArray h, + jintArray w, + jobjectArray dt_input, + jobjectArray df_input) +{ + ModelHandle ih = (ModelHandle)modelAddr; + char **data_name = nullptr; + int *data_n = nullptr; + int *data_c = nullptr; + int *data_h = nullptr; + int *data_w = nullptr; + DATA_TYPE *data_dt = nullptr; + DATA_FORMAT *data_df = nullptr; + getInputParameters(env, num_input, input_names, &data_name, n, &data_n, c, &data_c, h, &data_h, + w, &data_w, dt_input, &data_dt, df_input, &data_df); + + ResizeModelInput(ih, num_input, data_name, data_n, data_c, data_h, data_w, data_dt, data_df); + + free(data_n); + free(data_c); + free(data_h); + free(data_w); + for (int i = 0; i < num_input; i++) { + free(data_name[i]); + } + free(data_name); + free(data_dt); + free(data_df); +} + +extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_allocAllResultHandle)( + JNIEnv *, jobject, jlong modelAddr) +{ + ModelHandle ih = (ModelHandle)modelAddr; + ResultHandle ir = AllocAllResultHandle(ih); + return (long)ir; +} + +extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_allocSpecificResultHandle)( + JNIEnv *env, jobject, jlong modelAddr, jint num_outputs, jobjectArray outputNames) +{ + if (env->GetArrayLength(outputNames) != num_outputs) { + UNI_ERROR_LOG("output name array length %d is not equal to output num %d\n", + env->GetArrayLength(outputNames), num_outputs); + } + ModelHandle ih = (ModelHandle)modelAddr; + char **output_names_ptr = (char **)malloc(sizeof(char *) * num_outputs); + for (int i = 0; i < num_outputs; i++) { + jstring cur_str = (jstring)(env->GetObjectArrayElement(outputNames, i)); + const char *cur_str_ptr = env->GetStringUTFChars(cur_str, 0); + int length = strlen(cur_str_ptr); + output_names_ptr[i] = (char *)malloc(sizeof(char) * (length + 1)); + UNI_memcpy(output_names_ptr[i], cur_str_ptr, length); + output_names_ptr[i][length] = '\0'; + + env->ReleaseStringUTFChars(cur_str, cur_str_ptr); + env->DeleteLocalRef(cur_str); + } + ResultHandle ir = 
AllocSpecificResultHandle(ih, num_outputs, output_names_ptr); + + for (int i = 0; i < num_outputs; i++) { + free(output_names_ptr[i]); + } + free(output_names_ptr); + return (long)ir; +} + +extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_setRuntimeDeviceJNI)( + JNIEnv *env, jobject, jlong modelAddr, jint cpu_id, jstring device) +{ + ModelHandle ih = (ModelHandle)modelAddr; + const char *devicePtr = env->GetStringUTFChars(device, JNI_FALSE); + std::string device_str = (std::string)devicePtr; + DEVICE_TYPE device_cur = str2DEVICE_TYPE(device_str); + SetRuntimeDevice(ih, cpu_id, device_cur); + env->ReleaseStringUTFChars(device, devicePtr); +} + +extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_setRuntimeDeviceDynamicJNI)( + JNIEnv *env, jobject, jlong modelAddr) +{ + ModelHandle ih = (ModelHandle)modelAddr; + SetRuntimeDeviceDynamic(ih); +} + +extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_runModel)(JNIEnv *env, + jobject, + jlong modelAddr, + jlong ResultHandleAddr, + jint num_input, + jobjectArray input_names, + jobjectArray inputData) +{ + if (env->GetArrayLength(input_names) != num_input) { + UNI_ERROR_LOG("input name array length %d is not equal to input num %d\n", + env->GetArrayLength(input_names), num_input); + } + if (env->GetArrayLength(inputData) != num_input) { + UNI_ERROR_LOG("input data array length %d is not equal to input num %d\n", + env->GetArrayLength(inputData), num_input); + } + ModelHandle ih = (ModelHandle)modelAddr; + ResultHandle ir = (ResultHandle)ResultHandleAddr; + + ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih; + CNN *cnn = (CNN *)ihInfo->cnn; + std::map<std::string, std::shared_ptr<Tensor>> inMap = cnn->get_inputs(); + + char **input_names_ptr = (char **)malloc(sizeof(char *) * num_input); + void **mem_ptr = (void **)malloc(sizeof(void *) * num_input); + for (int i = 0; i < num_input; i++) { + jstring cur_str = (jstring)(env->GetObjectArrayElement(input_names, i)); + const char *cur_str_ptr = env->GetStringUTFChars(cur_str, 0); + int length = strlen(cur_str_ptr); + input_names_ptr[i] = (char *)malloc(sizeof(char) * (length + 1)); + UNI_memcpy(input_names_ptr[i], cur_str_ptr, length); + input_names_ptr[i][length] = '\0'; + env->ReleaseStringUTFChars(cur_str, cur_str_ptr); + env->DeleteLocalRef(cur_str); + + jfloatArray curArray = static_cast<jfloatArray>(env->GetObjectArrayElement(inputData, i)); + jfloat *datas = env->GetFloatArrayElements(curArray, JNI_FALSE); + std::string curTensorName = input_names_ptr[i]; + std::shared_ptr<Tensor> cur_input_tensor = inMap[curTensorName]; + jint dataNum = env->GetArrayLength(curArray); + TensorDesc tensorDesc = cur_input_tensor->get_desc(); + mem_ptr[i] = ((CpuMemory *)(cur_input_tensor->get_memory()))->get_ptr(); + transformFromFloat(tensorDesc.dt, datas, mem_ptr[i], dataNum); + env->ReleaseFloatArrayElements(curArray, datas, 0); + env->DeleteLocalRef(curArray); + } + + RunModel(ih, ir, num_input, input_names_ptr, mem_ptr); + for (int i = 0; i < num_input; i++) { + free(input_names_ptr[i]); + } + free(input_names_ptr); + free(mem_ptr); +} + +int calculateLength(int *array, int num) +{ + int length = 0; + for (int j = 0; j < num; j++) { + if (array[j] == 0) { + break; + } else { + if (length == 0) { + length = array[j]; + } else { + length *= array[j]; + } + } + } + return length; +} + +extern "C" JNIEXPORT jobject JNICALL BOLT_JNI_PREFIX(BoltModel_getOutput)( + JNIEnv *env, jobject, jlong ResultHandleAddr) +{ + std::string boltResultClassPath = std::string(BOLT_JNI_PATH_PREFIX) + "BoltResult"; + jclass stucls = 
env->FindClass(boltResultClassPath.c_str()); + + jmethodID constrocMID = + env->GetMethodID(stucls, "<init>", "([[F[[I[Ljava/lang/String;[Ljava/lang/String;)V"); + + ResultHandleInner *ir_inner = (ResultHandleInner *)ResultHandleAddr; + DataDesc *outputArrPtr = (*ir_inner).outputArr; + int num_outputs = (*ir_inner).num_outputs; + + jobjectArray output_values; + jclass floatArrCls = env->FindClass("[F"); + output_values = env->NewObjectArray(num_outputs, floatArrCls, nullptr); + jobjectArray output_dimension; + jclass intArrCls = env->FindClass("[I"); + output_dimension = env->NewObjectArray(num_outputs, intArrCls, nullptr); + + jobjectArray output_names_arr; + output_names_arr = (jobjectArray)env->NewObjectArray( + num_outputs, env->FindClass("java/lang/String"), env->NewStringUTF("")); + + jobjectArray df_arr; + df_arr = (jobjectArray)env->NewObjectArray( + num_outputs, env->FindClass("java/lang/String"), env->NewStringUTF("")); + + for (int i = 0; i < num_outputs; i++) { + std::string cur_output_name = outputArrPtr[i].name; + env->SetObjectArrayElement(output_names_arr, i, env->NewStringUTF(cur_output_name.c_str())); + DataType cur_data_type = outputArrPtr[i].dt; + DataFormat cur_data_format = outputArrPtr[i].df; + std::string cur_data_format_str = DataFormat2str(cur_data_format); + env->SetObjectArrayElement(df_arr, i, env->NewStringUTF(cur_data_format_str.c_str())); + + void *cur_dataPtr = outputArrPtr[i].dataPtr; + int tensorNumber = calculateLength((int *)outputArrPtr[i].dims, 4); + jfloatArray floatArr = env->NewFloatArray(tensorNumber); + float *tmp_output_values = env->GetFloatArrayElements(floatArr, NULL); + + jint tmp_output_dimensions[4]; + jintArray intArr = env->NewIntArray(4); + + for (int j = 0; j < 4; j++) { + tmp_output_dimensions[j] = (int)(outputArrPtr[i].dims[j]); + } + + transformToFloat(cur_data_type, cur_dataPtr, tmp_output_values, tensorNumber); + env->SetFloatArrayRegion(floatArr, 0, tensorNumber, tmp_output_values); + env->SetObjectArrayElement(output_values, i, floatArr); + env->ReleaseFloatArrayElements(floatArr, tmp_output_values, 0); + + env->DeleteLocalRef(floatArr); + + env->SetIntArrayRegion(intArr, 0, 4, tmp_output_dimensions); + env->SetObjectArrayElement(output_dimension, i, intArr); + env->DeleteLocalRef(intArr); + } + + jobject bolt_result_obj = env->NewObject( + stucls, constrocMID, output_values, output_dimension, output_names_arr, df_arr); + env->DeleteLocalRef(stucls); + env->DeleteLocalRef(intArrCls); + env->DeleteLocalRef(output_values); + env->DeleteLocalRef(output_dimension); + env->DeleteLocalRef(output_names_arr); + env->DeleteLocalRef(df_arr); + return bolt_result_obj; +} + +extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_cloneResultHandle)( + JNIEnv *, jobject, jlong ResultHandleAddr) +{ + ResultHandle ir = (ResultHandle)ResultHandleAddr; + return (long)CloneResultHandle(ir); +} + +extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_freeResultHandle)( + JNIEnv *, jobject, jlong ResultHandleAddr) +{ + ResultHandle ir = (ResultHandle)ResultHandleAddr; + FreeResultHandle(ir); +} + +extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_destroyModel)( + JNIEnv *, jobject, jlong modelAddr) +{ + ModelHandle ih = (ModelHandle)modelAddr; + DestroyModel(ih); +} +#endif diff --git a/inference/engine/src/CMakeLists.txt b/inference/engine/src/CMakeLists.txt new file mode 100644 index 00000000..f3e3f5b7 --- /dev/null +++ b/inference/engine/src/CMakeLists.txt @@ -0,0 +1,20 @@ +file(GLOB_RECURSE srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
+ +# shared library +add_library(${PROJECT_NAME} SHARED ${srcs}) + +# static library +add_library(${PROJECT_NAME}_static STATIC ${srcs}) +if (USE_IOS_CLANG) + target_link_libraries(${PROJECT_NAME} LINK_PUBLIC tensor image model_tools) + if (BUILD_TEST) + target_link_libraries(${PROJECT_NAME} LINK_PUBLIC ${JPEG_LIBRARY}) + endif (BUILD_TEST) +endif (USE_IOS_CLANG) + +set_target_properties(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") +set_target_properties(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +install(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) diff --git a/inference/engine/src/bolt.cpp b/inference/engine/src/bolt.cpp new file mode 100644 index 00000000..7b23c8b5 --- /dev/null +++ b/inference/engine/src/bolt.cpp @@ -0,0 +1,689 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
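The bolt.cpp file that follows defines paired user-to-bolt enum mappers (dt_mapping_user2bolt/dt_mapping_bolt2user and the DataFormat, affinity, and device equivalents). For every supported value the two directions should compose to the identity; a sanity sketch of that property, assuming the helpers below are visible to it (no such test exists in this patch):

    #include <cassert>

    void check_mapping_roundtrip()
    {
        const DATA_TYPE dts[] = {FP_32, INT_32, UINT_32};
        for (DATA_TYPE dt : dts) {
            // user -> bolt -> user must be the identity for supported types
            assert(dt_mapping_bolt2user(dt_mapping_user2bolt(dt)) == dt);
        }
        const DATA_FORMAT dfs[] = {NCHW, NHWC, NCHWC8, MTK, NORMAL};
        for (DATA_FORMAT df : dfs) {
            assert(df_mapping_bolt2user(df_mapping_user2bolt(df)) == df);
        }
    }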
+ +#include "inference.hpp" +#include "../api/c/bolt.h" + +struct ModelHandleInfo { + void *cnn; + DEVICE_TYPE deviceType; + void *algoPath; + bool useFileStream; +}; + +typedef struct { + U32 dims[4] = {0}; + char name[NAME_LEN] = {0}; + DataType dt; + DataFormat df; + void *dataPtr; +} DataDesc; + +typedef struct { + U32 num_outputs; + DataDesc *outputArr; + DEVICE_TYPE deviceType; +} ResultHandleInner; + +DataType dt_mapping_user2bolt(DATA_TYPE dt_user) +{ + DataType ret = DT_F32; + switch (dt_user) { + case FP_32: + ret = DT_F32; + break; +#ifdef __aarch64__ + case FP_16: + ret = DT_F16; + break; +#endif + case INT_32: + ret = DT_I32; + break; + case UINT_32: + ret = DT_U32; + break; + default: + UNI_ERROR_LOG("unsupported user data type in API\n"); + } + return ret; +} + +DATA_TYPE dt_mapping_bolt2user(DataType dt_bolt) +{ + DATA_TYPE ret = FP_32; + switch (dt_bolt) { + case DT_F32: + ret = FP_32; + break; +#ifdef __aarch64__ + case DT_F16: + ret = FP_16; + break; +#endif + case DT_I32: + ret = INT_32; + break; + case DT_U32: + ret = UINT_32; + break; + default: + UNI_ERROR_LOG("unsupported bolt data type in API\n"); + } + return ret; +} + +DataFormat df_mapping_user2bolt(DATA_FORMAT df_user) +{ + DataFormat ret = DF_NCHW; + switch (df_user) { + case NCHW: + ret = DF_NCHW; + break; + case NHWC: + ret = DF_NHWC; + break; + case NCHWC8: + ret = DF_NCHWC8; + break; + case MTK: + ret = DF_MTK; + break; + case NORMAL: + ret = DF_NORMAL; + break; + default: { + UNI_ERROR_LOG("unsupported user data format in API\n"); + } + } + return ret; +} + +DATA_FORMAT df_mapping_bolt2user(DataFormat df_bolt) +{ + DATA_FORMAT ret = NCHW; + switch (df_bolt) { + case DF_NCHW: + ret = NCHW; + break; + case DF_NHWC: + ret = NHWC; + break; + case DF_NCHWC8: + ret = NCHWC8; + break; + case DF_MTK: + ret = MTK; + break; + case DF_NORMAL: + ret = NORMAL; + break; + default: { + UNI_ERROR_LOG("unsupported bolt data format in API\n"); + } + } + return ret; +} + +inline AffinityPolicy affinity_mapping_user2bolt(AFFINITY_TYPE affinity) +{ + AffinityPolicy ret = AFFINITY_CPU_HIGH_PERFORMANCE; + switch (affinity) { + case CPU_HIGH_PERFORMANCE: + ret = AFFINITY_CPU_HIGH_PERFORMANCE; + break; + case CPU_LOW_POWER: + ret = AFFINITY_CPU_LOW_POWER; + break; + case GPU: + ret = AFFINITY_GPU; + break; + default: { + UNI_ERROR_LOG("unsupported user affinity type in API\n"); + } + } + return ret; +} + +inline Arch device_mapping_user2bolt(DEVICE_TYPE device) +{ + Arch ret = ARM_V8; + switch (device) { + case CPU_ARM_V7: + ret = ARM_V7; + break; + case CPU_ARM_V8: + ret = ARM_V8; + break; + case CPU_ARM_A55: + ret = ARM_A55; + break; + case CPU_ARM_A76: + ret = ARM_A76; + break; + case GPU_MALI: + ret = MALI; + break; + case CPU_X86_AVX2: + ret = X86_AVX2; + break; + case CPU_SERIAL: + ret = CPU_GENERAL; + break; + default: { + UNI_ERROR_LOG("unsupported user device type %d in API\n", device); + break; + } + } + return ret; +} + +inline DEVICE_TYPE device_mapping_bolt2user(Arch arch) +{ + DEVICE_TYPE ret = CPU_ARM_V8; + switch (arch) { + case ARM_V7: + ret = CPU_ARM_V7; + break; + case ARM_V8: + ret = CPU_ARM_V8; + break; + case ARM_A55: + ret = CPU_ARM_A55; + break; + case ARM_A76: + ret = CPU_ARM_A76; + break; + case MALI: + ret = GPU_MALI; + break; + case X86_AVX2: + ret = CPU_X86_AVX2; + break; + case CPU_GENERAL: + ret = CPU_SERIAL; + break; + default: { + UNI_ERROR_LOG("unsupported bolt device type %d in API\n", arch); + break; + } + } + return ret; +} + +void copyTensorDescToDataDesc(TensorDesc srcDesc, DataDesc 
*dstDesc) +{ + dstDesc->dt = srcDesc.dt; + dstDesc->df = srcDesc.df; + if (srcDesc.nDims > 4) { + UNI_ERROR_LOG("user interface only support 4 dimensions, not %d\n", srcDesc.nDims); + } + for (U32 i = 0; i < srcDesc.nDims; i++) { + dstDesc->dims[i] = srcDesc.dims[srcDesc.nDims - 1 - i]; + } + for (int i = srcDesc.nDims; i < 4; i++) { + dstDesc->dims[i] = 1; + } +} + +ModelHandle CreateModel(const char *modelPath, AFFINITY_TYPE affinity, const char *algoPath) +{ + ModelHandleInfo *handle = new ModelHandleInfo(); + ModelSpec *ms = new ModelSpec(); + if (SUCCESS != deserialize_model_from_file(modelPath, ms)) { + UNI_ERROR_LOG("CreateModel failed\n"); + delete ms; + handle->cnn = nullptr; + return (ModelHandle)handle; + } + CNN *cnn = new CNN(affinity_mapping_user2bolt(affinity), ms->dt, ms->model_name); + cnn->sort_operators_sequential(ms); + cnn->initialize_ops(ms); + + handle->cnn = (void *)cnn; + handle->deviceType = device_mapping_bolt2user(cnn->get_runtime_device()); + handle->algoPath = (void *)algoPath; + handle->useFileStream = false; + CHECK_STATUS(mt_destroy_model(ms)); + delete ms; + return (ModelHandle)handle; +} + +ModelHandle CloneModel(ModelHandle ih) +{ + ModelHandleInfo *handle = (ModelHandleInfo *)ih; + ModelHandleInfo *cloneHandle = new ModelHandleInfo(); + *cloneHandle = *handle; + CNN *cloneCnn = new CNN(); + *cloneCnn = ((CNN *)handle->cnn)->clone(); + cloneHandle->cnn = cloneCnn; + return (ModelHandle)cloneHandle; +} + +ModelHandle CreateModelWithFileStream( + const char *modelFileStream, AFFINITY_TYPE affinity, const char *algoFileStream) +{ + ModelHandleInfo *handle = new ModelHandleInfo(); + ModelSpec *ms = new ModelSpec(); + if (SUCCESS != deserialize_model_from_file(modelFileStream, ms, true)) { + UNI_ERROR_LOG("CreateModelWithFileStream failed\n"); + delete ms; + handle->cnn = nullptr; + return (ModelHandle)handle; + } + CNN *cnn = new CNN(affinity_mapping_user2bolt(affinity), ms->dt, ms->model_name); + cnn->sort_operators_sequential(ms); + cnn->initialize_ops(ms); + + handle->cnn = (void *)cnn; + handle->deviceType = device_mapping_bolt2user(cnn->get_runtime_device()); + handle->algoPath = (void *)algoFileStream; + handle->useFileStream = true; + CHECK_STATUS(mt_destroy_model(ms)); + delete ms; + return (ModelHandle)handle; +} + +int GetNumInputsFromModel(ModelHandle ih) +{ + ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih; + if (ihInfo == NULL) { + UNI_ERROR_LOG("GetNumInputsFromModel: inference handle is nullptr\n"); + } + CNN *cnn = (CNN *)ihInfo->cnn; + return (cnn->get_model_input_tensor_names()).size(); +} + +void GetInputDataInfoFromModel(ModelHandle handle, + const int number_inputs, + char **inputNames, + int *n, + int *c, + int *h, + int *w, + DATA_TYPE *dt_input, + DATA_FORMAT *df_input) +{ + ModelHandleInfo *ihInfo = (ModelHandleInfo *)handle; + if (ihInfo == NULL) { + UNI_ERROR_LOG("GetInputDataInfoFromModel: inference handle is nullptr\n"); + } + CNN *cnn = (CNN *)ihInfo->cnn; + std::vector<TensorDesc> inputTensorDescs = cnn->get_model_input_tensor_descs(); + std::vector<std::string> inputTensorNames = cnn->get_model_input_tensor_names(); + if (number_inputs != (int)inputTensorDescs.size() || + number_inputs != (int)inputTensorNames.size()) { + UNI_ERROR_LOG("GetInputDataInfoFromModel: number of inputs is not match, " + "please use GetNumInputsFromModel to get the right value\n"); + } + DataType dt; + DataFormat df; + U32 in, ic, ih, iw; + for (int i = 0; i < number_inputs; i++) { + strcpy(inputNames[i], inputTensorNames[i].c_str()); + in = ic = ih = iw = 0; + if 
(tensorIs1d(inputTensorDescs[i])) { + CHECK_STATUS(tensor1dGet(inputTensorDescs[i], &dt, &df, &in)); + } else if (tensorIs2d(inputTensorDescs[i])) { + CHECK_STATUS(tensor2dGet(inputTensorDescs[i], &dt, &df, &in, &ic)); + } else if (tensorIs3d(inputTensorDescs[i])) { + CHECK_STATUS(tensor3dGet(inputTensorDescs[i], &dt, &df, &in, &ic, &ih)); + } else if (tensorIs4d(inputTensorDescs[i])) { + CHECK_STATUS(tensor4dGet(inputTensorDescs[i], &dt, &df, &in, &ic, &ih, &iw)); + } else { + UNI_ERROR_LOG("C API currently only support 1d,2d,3d,4d query\n"); + } + n[i] = in; + c[i] = ic; + h[i] = ih; + w[i] = iw; + dt_input[i] = dt_mapping_bolt2user(dt); + df_input[i] = df_mapping_bolt2user(df); + } +} + +std::map<std::string, TensorDesc> getInputDataFormatFromUser(ModelHandle ih, + const int num_input, + char **name, + const int *n, + const int *c, + const int *h, + const int *w, + const DATA_TYPE *dt_input, + const DATA_FORMAT *df_input) +{ + ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih; + if (ihInfo == NULL) { + UNI_ERROR_LOG("getInputDataFormatFromUser: ih is nullptr\n"); + } + CNN *cnn = (CNN *)ihInfo->cnn; + std::vector<std::string> inputTensorNames = cnn->get_model_input_tensor_names(); + U32 num = inputTensorNames.size(); + if (num != (U32)num_input) { + UNI_ERROR_LOG("getInputDataFormatFromUser: model has %d inputs, not %d\n", num, num_input); + } + if (n == NULL) { + UNI_ERROR_LOG("getInputDataFormatFromUser: n is nullptr\n"); + } + if (c == NULL) { + UNI_ERROR_LOG("getInputDataFormatFromUser: c is nullptr\n"); + } + if (h == NULL) { + UNI_ERROR_LOG("getInputDataFormatFromUser: h is nullptr\n"); + } + if (w == NULL) { + UNI_ERROR_LOG("getInputDataFormatFromUser: w is nullptr\n"); + } + if (name == NULL) { + UNI_ERROR_LOG("getInputDataFormatFromUser: name is nullptr\n"); + } + for (U32 i = 0; i < num; ++i) { + if (name[i] == NULL) { + UNI_ERROR_LOG("getInputDataFormatFromUser: input name %d is nullptr\n", i); + } + } + + std::map<std::string, TensorDesc> modelInputDims; + for (U32 i = 0; i < num; ++i) { + std::string inputName = name[i]; + bool findTensorName = false; + for (U32 j = 0; j < num; ++j) { + std::string modelName = inputTensorNames[j]; + if (modelName == inputName) { + DataType dt = (dt_input == NULL) ? DT_F32 : dt_mapping_user2bolt(dt_input[i]); + DataFormat df = (df_input == NULL) ? 
DF_NCHW : df_mapping_user2bolt(df_input[i]); + switch (df) { + case DF_NORMAL: + modelInputDims[inputName] = tensor2df(dt, df, n[i], c[i]); + break; + case DF_MTK: + modelInputDims[inputName] = tensor3df(dt, df, n[i], c[i], h[i]); + break; + case DF_NCHW: + modelInputDims[inputName] = tensor4df(dt, df, n[i], c[i], h[i], w[i]); + break; + default: + UNI_ERROR_LOG("unsupported data format in %s\n", __func__); + } + findTensorName = true; + break; + } + } + + if (!findTensorName) { + std::string errorLog = "("; + for (U32 j = 0; j < num; ++j) { + errorLog.append(inputTensorNames[j]); + if (j != num - 1) { + errorLog.append(", "); + } + } + errorLog.append(")"); + UNI_ERROR_LOG("input data %s is not a valid model input %s\n", inputName.c_str(), + errorLog.c_str()); + } + } + return modelInputDims; +} + +void PrepareModel(ModelHandle ih, + const int num_input, + char **name, + const int *n, + const int *c, + const int *h, + const int *w, + const DATA_TYPE *dt_input = NULL, + const DATA_FORMAT *df_input = NULL) +{ + ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih; + CNN *cnn = (CNN *)ihInfo->cnn; + + std::map<std::string, TensorDesc> modelInputDims = + getInputDataFormatFromUser(ih, num_input, name, n, c, h, w, dt_input, df_input); + if (ihInfo->algoPath) { + const char *algoPath = (const char *)ihInfo->algoPath; + if (ihInfo->useFileStream) { + cnn->loadAlgorithmMapFromFileStream(algoPath); + } else { + cnn->loadAlgorithmMapFromText(algoPath); + } + } + cnn->ready(modelInputDims); + cnn->mark_input_output(); + return; +} + +void ResizeModelInput(ModelHandle ih, + const int num_input, + char **name, + const int *n, + const int *c, + const int *h, + const int *w, + const DATA_TYPE *dt_input = NULL, + const DATA_FORMAT *df_input = NULL) +{ + ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih; + CNN *cnn = (CNN *)ihInfo->cnn; + + std::map<std::string, TensorDesc> modelInputDims = + getInputDataFormatFromUser(ih, num_input, name, n, c, h, w, dt_input, df_input); + cnn->reready(modelInputDims); +} + +ResultHandle AllocAllResultHandle(ModelHandle ih) +{ + ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih; + CNN *cnn = (CNN *)ihInfo->cnn; + DEVICE_TYPE device = ihInfo->deviceType; + + ResultHandleInner *model_result_ptr = (ResultHandleInner *)malloc(sizeof(ResultHandleInner)); + std::vector<std::string> modelOutputTensorNames = cnn->get_model_output_tensor_names(); + int model_num_outputs = modelOutputTensorNames.size(); + DataDesc *outputArrPtr = (DataDesc *)malloc(sizeof(DataDesc) * model_num_outputs); + for (int i = 0; i < model_num_outputs; ++i) { + std::string name = modelOutputTensorNames[i]; + U32 length = name.size(); + length = (length > NAME_LEN) ? 
NAME_LEN : length; + memcpy(outputArrPtr[i].name, name.c_str(), length); + if (length < NAME_LEN) { + outputArrPtr[i].name[length] = '\0'; + } + TensorDesc srcDesc = cnn->get_tensor_desc_by_name(name); + copyTensorDescToDataDesc(srcDesc, &outputArrPtr[i]); + } + model_result_ptr->num_outputs = model_num_outputs; + model_result_ptr->outputArr = outputArrPtr; + model_result_ptr->deviceType = device; + return (void *)model_result_ptr; +} + +ResultHandle AllocSpecificResultHandle(ModelHandle ih, const int num_outputs, char **outputNames) +{ + ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih; + CNN *cnn = (CNN *)ihInfo->cnn; + DEVICE_TYPE device = ihInfo->deviceType; + + ResultHandleInner *model_result_ptr = (ResultHandleInner *)malloc(sizeof(ResultHandleInner)); + int model_num_outputs = num_outputs; + DataDesc *outputArrPtr = (DataDesc *)malloc(sizeof(DataDesc) * model_num_outputs); + for (int i = 0; i < num_outputs; i++) { + U32 length = UNI_MIN(strlen(outputNames[i]), NAME_LEN - 1); + memcpy(outputArrPtr[i].name, outputNames[i], length); + if (length < NAME_LEN) { + outputArrPtr[i].name[length] = '\0'; + } + std::string name = outputNames[i]; + TensorDesc srcDesc = cnn->get_tensor_desc_by_name(name); + copyTensorDescToDataDesc(srcDesc, &outputArrPtr[i]); + } + model_result_ptr->num_outputs = model_num_outputs; + model_result_ptr->outputArr = outputArrPtr; + model_result_ptr->deviceType = device; + return (void *)model_result_ptr; +} + +void SetRuntimeDevice(ModelHandle ih, int cpu_id, DEVICE_TYPE device) +{ + ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih; + CNN *cnn = (CNN *)ihInfo->cnn; + cnn->set_runtime_device(cpu_id, device_mapping_user2bolt(device)); + ihInfo->deviceType = device; +} + +void SetRuntimeDeviceDynamic(ModelHandle ih) +{ + ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih; + CNN *cnn = (CNN *)ihInfo->cnn; + cnn->set_runtime_device_dynamic(); + ihInfo->deviceType = device_mapping_bolt2user(cnn->get_runtime_device()); +} + +void RunModel(ModelHandle ih, ResultHandle ir, const int num_input, char **inputNames, void **mem) +{ + ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih; + CNN *cnn = (CNN *)ihInfo->cnn; + DEVICE_TYPE device = ihInfo->deviceType; + ResultHandleInner *ir_inner = (ResultHandleInner *)ir; + + for (int index = 0; index < num_input; index++) { + std::string input_name(inputNames[index]); + cnn->copy_to_named_input(input_name, (U8 *)(mem[index])); + } + cnn->run(); + + DataDesc *outputArrPtr = ir_inner->outputArr; + for (U32 curIndex = 0; curIndex < ir_inner->num_outputs; curIndex++) { + Tensor output_tensor = cnn->get_tensor_by_name(outputArrPtr[curIndex].name); + copyTensorDescToDataDesc(output_tensor.get_desc(), &(outputArrPtr[curIndex])); + if (device == GPU_MALI) { +#ifdef _USE_MALI + auto mem = (OclMemory *)output_tensor.get_memory(); + outputArrPtr[curIndex].dataPtr = mem->get_mapped_ptr(); +#else + UNI_WARNING_LOG("this binary not support GPU, please recompile project with GPU " + "compile options\n"); +#endif + } else { + outputArrPtr[curIndex].dataPtr = ((CpuMemory *)(output_tensor.get_memory()))->get_ptr(); + } + } +} + +int GetNumOutputsFromResultHandle(ResultHandle ir) +{ + ResultHandleInner *ir_inner = (ResultHandleInner *)ir; + return (*ir_inner).num_outputs; +} + +void GetOutputDataInfoFromResultHandle(ResultHandle ir, + int num_outputs, + char **outputNames, + int *n, + int *c, + int *h, + int *w, + DATA_TYPE *dt_output, + DATA_FORMAT *df_output) +{ + ResultHandleInner *ir_inner = (ResultHandleInner *)ir; + DataDesc *outputArrPtr = 
(*ir_inner).outputArr; + for (int i = 0; i < num_outputs; i++) { + n[i] = outputArrPtr[i].dims[0]; + c[i] = outputArrPtr[i].dims[1]; + h[i] = outputArrPtr[i].dims[2]; + w[i] = outputArrPtr[i].dims[3]; + strcpy(outputNames[i], outputArrPtr[i].name); + DataType dt = outputArrPtr[i].dt; + dt_output[i] = dt_mapping_bolt2user(dt); + df_output[i] = df_mapping_bolt2user(outputArrPtr[i].df); + } +} + +void GetPtrFromResultHandle(ResultHandle ir, + int num_outputs, + char **outputNames, + void **data, + int *n, + int *c, + int *h, + int *w, + DATA_TYPE *dt_output, + DATA_FORMAT *df_output) +{ + ResultHandleInner *ir_inner = (ResultHandleInner *)ir; + DataDesc *outputArrPtr = (*ir_inner).outputArr; + for (int i = 0; i < num_outputs; i++) { + n[i] = outputArrPtr[i].dims[0]; + c[i] = outputArrPtr[i].dims[1]; + h[i] = outputArrPtr[i].dims[2]; + w[i] = outputArrPtr[i].dims[3]; + strcpy(outputNames[i], outputArrPtr[i].name); + DataType dt = outputArrPtr[i].dt; + dt_output[i] = dt_mapping_bolt2user(dt); + df_output[i] = df_mapping_bolt2user(outputArrPtr[i].df); + data[i] = outputArrPtr[i].dataPtr; + } +} + +void CopyOutputsFromResultHandle(ResultHandle ir, int num_outputs, const int *size, void **data) +{ + ResultHandleInner *ir_inner = (ResultHandleInner *)ir; + DataDesc *outputArrPtr = (*ir_inner).outputArr; + for (int i = 0; i < num_outputs; i++) { + U32 dataSize = size[i]; + memcpy((void *)data[i], (void *)outputArrPtr[i].dataPtr, dataSize); + } +} + +ResultHandle CloneResultHandle(ResultHandle ir) +{ + ResultHandleInner *irInner = (ResultHandleInner *)ir; + ResultHandleInner *cloneIrInner = new ResultHandleInner(); + *cloneIrInner = *irInner; + U32 size = sizeof(DataDesc) * cloneIrInner->num_outputs; + cloneIrInner->outputArr = (DataDesc *)malloc(size); + memcpy(cloneIrInner->outputArr, irInner->outputArr, size); + return (ResultHandle)cloneIrInner; +} + +void FreeResultHandle(ResultHandle ir) +{ + ResultHandleInner *ir_inner = (ResultHandleInner *)ir; + DataDesc *outputArrPtr = (*ir_inner).outputArr; + free(outputArrPtr); + (*ir_inner).outputArr = nullptr; + free(ir_inner); +} + +void DestroyModel(ModelHandle ih) +{ + ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih; + if (nullptr == ihInfo) { + UNI_WARNING_LOG("DestroyModel received null handle.\n"); + return; + } + CNN *cnn = (CNN *)ihInfo->cnn; + if (nullptr != ihInfo->algoPath) { + const char *algoPath = (const char *)ihInfo->algoPath; + UNI_THREAD_SAFE(cnn->saveAlgorithmMapToText(algoPath)); + } + if (nullptr == cnn) { + UNI_WARNING_LOG("nullptr in DestroyModel. Resource cleared.\n"); + } else { + delete cnn; + ihInfo->cnn = nullptr; + } + delete ihInfo; +} diff --git a/inference/engine/src/bolt_dllite.cpp b/inference/engine/src/bolt_dllite.cpp new file mode 100644 index 00000000..450d9080 --- /dev/null +++ b/inference/engine/src/bolt_dllite.cpp @@ -0,0 +1,489 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "../api/c/bolt.h" +#include "../api/dllite/Bolt.h" +#include "inference.hpp" +#include "tensor.hpp" + +struct ModelHandleInfo { + void *cnn; + DEVICE_TYPE deviceType; + void *algoPath; + bool useFileStream; +}; + +struct DLLiteInfo { + ModelHandle modelHandle; + bool isReady; +}; + +typedef struct { + U32 dims[4] = {0}; + char name[NAME_LEN] = {0}; + DataType dt; + DataFormat df; + void *dataPtr; +} DataDesc; + +typedef struct { + U32 num_outputs; + DataDesc *outputArr; + DEVICE_TYPE deviceType; +} ResultHandleInner; + +inline AFFINITY_TYPE AffinityMapDLLite2c(bolt::AffinityType affinity) +{ + AFFINITY_TYPE ret = CPU_HIGH_PERFORMANCE; + switch (affinity) { + case bolt::AffinityType::CPU_HIGH_PERFORMANCE: + ret = CPU_HIGH_PERFORMANCE; + break; + case bolt::AffinityType::CPU_LOW_POWER: + ret = CPU_LOW_POWER; + break; + case bolt::AffinityType::GPU: + ret = GPU; + break; + default: { + UNI_ERROR_LOG("Unsupported affinity type in dllite API\n"); + } + } + return ret; +} + +bolt::TensorType TypeMapBolt2DLLite(DataType dt) +{ + bolt::TensorType ret = bolt::TensorType::FP32; + switch (dt) { + case DT_F32: + ret = bolt::TensorType::FP32; + break; +#ifdef _USE_FP16 + case DT_F16: + ret = bolt::TensorType::FP16; + break; +#endif + case DT_I32: + ret = bolt::TensorType::INT32; + break; + case DT_U32: + ret = bolt::TensorType::UINT32; + break; + default: + UNI_ERROR_LOG("unsupported bolt data type in DLLite API\n"); + } + return ret; +} + +DataType TypeMapDLLite2bolt(bolt::TensorType dt) +{ + DataType ret = DT_F32; + switch (dt) { + case bolt::TensorType::FP32: + ret = DT_F32; + break; +#ifdef _USE_FP16 + case bolt::TensorType::FP16: + ret = DT_F16; + break; +#endif + case bolt::TensorType::INT32: + ret = DT_I32; + break; + case bolt::TensorType::UINT32: + ret = DT_U32; + break; + default: + UNI_ERROR_LOG("unsupported data type in DLLite API\n"); + } + return ret; +} + +bolt::TensorLayout LayoutMapBolt2DLLite(DataFormat df) +{ + bolt::TensorLayout ret = bolt::TensorLayout::NCHW; + switch (df) { + case DF_NCHW: + ret = bolt::TensorLayout::NCHW; + break; + case DF_NHWC: + ret = bolt::TensorLayout::NHWC; + break; + case DF_NCHWC8: + ret = bolt::TensorLayout::NCHWC8; + break; + case DF_MTK: + ret = bolt::TensorLayout::RNN_MTK; + break; + case DF_NORMAL: + ret = bolt::TensorLayout::ROW_MAJOR; + break; + default: { + UNI_ERROR_LOG("unsupported bolt data layout in DLLite API\n"); + } + } + return ret; +} + +DataFormat LayoutMapDLLite2bolt(bolt::TensorLayout df) +{ + DataFormat ret = DF_NCHW; + switch (df) { + case bolt::TensorLayout::NCHW: + ret = DF_NCHW; + break; + case bolt::TensorLayout::NHWC: + ret = DF_NHWC; + break; + case bolt::TensorLayout::NCHWC8: + ret = DF_NCHWC8; + break; + case bolt::TensorLayout::RNN_MTK: + ret = DF_MTK; + break; + case bolt::TensorLayout::ROW_MAJOR: + ret = DF_NORMAL; + break; + default: { + UNI_ERROR_LOG("unsupported data layout in DLLite API\n"); + } + } + return ret; +} + +std::map<std::string, TensorDesc> GetInputInfoFromDLLite( + bolt::ModelHandle ih, const std::vector<bolt::IOTensor> &inputs) +{ + DLLiteInfo *handle = 
(DLLiteInfo *)ih; + ModelHandleInfo *ihInfo = (ModelHandleInfo *)handle->modelHandle; + CNN *cnn = (CNN *)ihInfo->cnn; + std::vector inputTensorNames = cnn->get_model_input_tensor_names(); + int num = inputTensorNames.size(); + if (num != (int)inputs.size()) { + UNI_ERROR_LOG( + "GetInputInfoFromDLLite: model has %d inputs, not %d\n", num, (int)inputs.size()); + } + + std::map modelInputDims; + for (int i = 0; i < num; ++i) { + std::string inputName = inputs[i].name; + bool findTensorName = false; + for (int j = 0; j < num; ++j) { + std::string modelName = inputTensorNames[j]; + if (modelName == inputName) { + DataType dt = TypeMapDLLite2bolt(inputs[i].type); + DataFormat df = LayoutMapDLLite2bolt(inputs[i].layout); + switch (df) { + case DF_NORMAL: + modelInputDims[inputName] = + tensor2df(dt, df, inputs[i].shape[0], inputs[i].shape[1]); + break; + case DF_NCHW: + modelInputDims[inputName] = tensor4df(dt, df, inputs[i].shape[0], + inputs[i].shape[1], inputs[i].shape[2], inputs[i].shape[3]); + break; + case DF_MTK: + modelInputDims[inputName] = tensor3df( + dt, df, inputs[i].shape[0], inputs[i].shape[1], inputs[i].shape[2]); + break; + default: + UNI_ERROR_LOG("unsupported data format in %s\n", __func__); + } + findTensorName = true; + break; + } + } + + if (!findTensorName) { + std::string errorLog = "("; + for (int j = 0; j < num; ++j) { + errorLog.append(inputTensorNames[j]); + if (j != num - 1) { + errorLog.append(", "); + } + } + errorLog.append(")"); + UNI_ERROR_LOG("[ERROR] input data %s is not a valid model input %s\n", + inputName.c_str(), errorLog.c_str()); + } + } + return modelInputDims; +} + +void UpdateDataDesc(TensorDesc srcDesc, DataDesc *dstDesc) +{ + dstDesc->dt = srcDesc.dt; + dstDesc->df = srcDesc.df; + if (srcDesc.nDims > 4) { + UNI_ERROR_LOG("user interface only support 4 dimensions, not %d\n", srcDesc.nDims); + } + for (U32 i = 0; i < srcDesc.nDims; i++) { + dstDesc->dims[i] = srcDesc.dims[srcDesc.nDims - 1 - i]; + } + for (int i = srcDesc.nDims; i < 4; i++) { + dstDesc->dims[i] = 1; + } +} + +bolt::ModelHandle bolt::CreateModel(const bolt::ModelConfig &modelConfig) +{ + DLLiteInfo *handle = new DLLiteInfo(); + if (nullptr != modelConfig.modelStream.first && modelConfig.modelStream.second > 0) { + handle->modelHandle = CreateModelWithFileStream((char *)modelConfig.modelStream.first, + AffinityMapDLLite2c(modelConfig.affinity), + modelConfig.algoStream.second > 0 ? 
(char *)modelConfig.algoStream.first : nullptr); + } else if ("" != modelConfig.modelPath) { + handle->modelHandle = CreateModel(modelConfig.modelPath.c_str(), + AffinityMapDLLite2c(modelConfig.affinity), modelConfig.algoPath.c_str()); + } else { + handle->modelHandle = nullptr; + } + handle->isReady = false; + return (bolt::ModelHandle)handle; +} + +bolt::ReturnStatus bolt::GetIOFormats(bolt::ModelHandle modelHandle, + std::vector &inputs, + std::vector &outputs) +{ + DLLiteInfo *handle = (DLLiteInfo *)modelHandle; + ModelHandleInfo *ihInfo = (ModelHandleInfo *)handle->modelHandle; + if (nullptr == ihInfo) { + return bolt::ReturnStatus::NULLPTR; + } + CNN *cnn = (CNN *)ihInfo->cnn; + std::vector inputTensorNames = cnn->get_model_input_tensor_names(); + std::vector inputTensorDescs = cnn->get_model_input_tensor_descs(); + + std::map inputDescMap; + for (size_t i = 0; i < inputTensorNames.size(); i++) { + inputDescMap[inputTensorNames[i]] = inputTensorDescs[i]; + } + if (ihInfo->algoPath) { + const char *algoPath = (const char *)ihInfo->algoPath; + cnn->loadAlgorithmMapFromText(algoPath); + } + cnn->ready(inputDescMap); + cnn->mark_input_output(); + if (ihInfo->algoPath) { + const char *algoPath = (const char *)ihInfo->algoPath; + cnn->saveAlgorithmMapToText(algoPath); + } + handle->isReady = true; + + std::map> inMap = cnn->get_inputs(); + inputs.clear(); + + for (auto iter : inMap) { + bolt::IOTensor in; + in.name = iter.first; + TensorDesc inDesc = iter.second->get_desc(); + in.type = TypeMapBolt2DLLite(inDesc.dt); + in.shape.clear(); + for (U32 j = 0; j < inDesc.nDims; j++) { + in.shape.push_back(inDesc.dims[inDesc.nDims - 1 - j]); + } + in.layout = LayoutMapBolt2DLLite(inDesc.df); + inputs.push_back(in); + } + + std::map> outMap = cnn->get_outputs(); + outputs.clear(); + for (auto iter : outMap) { + IOTensor out; + out.name = iter.first; + TensorDesc outDesc = iter.second->get_desc(); + out.type = TypeMapBolt2DLLite(outDesc.dt); + out.shape.clear(); + for (U32 j = 0; j < outDesc.nDims; j++) { + out.shape.push_back(outDesc.dims[outDesc.nDims - 1 - j]); + } + out.layout = LayoutMapBolt2DLLite(outDesc.df); + outputs.push_back(out); + } + + return bolt::ReturnStatus::SUCCESS; +} + +bolt::ReturnStatus bolt::PrepareModel( + bolt::ModelHandle modelHandle, const std::vector &inputs) +{ + DLLiteInfo *handle = (DLLiteInfo *)modelHandle; + if (handle->isReady) { + return bolt::ReturnStatus::SUCCESS; + } + ModelHandleInfo *ihInfo = (ModelHandleInfo *)handle->modelHandle; + if (nullptr == ihInfo) { + return bolt::ReturnStatus::NULLPTR; + } + CNN *cnn = (CNN *)ihInfo->cnn; + + std::map modelInputDims = GetInputInfoFromDLLite(modelHandle, inputs); + if (ihInfo->algoPath) { + const char *algoPath = (const char *)ihInfo->algoPath; + cnn->loadAlgorithmMapFromText(algoPath); + } + cnn->ready(modelInputDims); + cnn->mark_input_output(); + if (ihInfo->algoPath) { + const char *algoPath = (const char *)ihInfo->algoPath; + cnn->saveAlgorithmMapToText(algoPath); + } + return bolt::ReturnStatus::SUCCESS; +} + +bolt::ReturnStatus bolt::GetInputTensors( + bolt::ModelHandle modelHandle, std::vector &inputs) +{ + DLLiteInfo *handle = (DLLiteInfo *)modelHandle; + ModelHandleInfo *ihInfo = (ModelHandleInfo *)handle->modelHandle; + if (nullptr == ihInfo) { + return bolt::ReturnStatus::NULLPTR; + } + CNN *cnn = (CNN *)ihInfo->cnn; + + std::map> inMap = cnn->get_inputs(); + + for (U32 i = 0; i < inputs.size(); i++) { + auto tensorPtr = inMap[inputs[i].name]; + if (nullptr == tensorPtr) { + return 
bolt::ReturnStatus::FAIL; + } + inputs[i].buffer.first = ((CpuMemory *)(tensorPtr->get_memory()))->get_ptr(); + inputs[i].buffer.second = tensorPtr->bytes(); + } + return bolt::ReturnStatus::SUCCESS; +} + +bolt::ReturnStatus bolt::ResizeInput( + bolt::ModelHandle modelHandle, const std::vector &inputs) +{ + DLLiteInfo *handle = (DLLiteInfo *)modelHandle; + ModelHandleInfo *ihInfo = (ModelHandleInfo *)handle->modelHandle; + if (nullptr == ihInfo) { + return bolt::ReturnStatus::NULLPTR; + } + CNN *cnn = (CNN *)ihInfo->cnn; + + std::map modelInputDims = GetInputInfoFromDLLite(modelHandle, inputs); + cnn->reready(modelInputDims); + return bolt::ReturnStatus::SUCCESS; +} + +bolt::ResultHandle bolt::AllocResult( + bolt::ModelHandle modelHandle, const std::vector &outputs) +{ + DLLiteInfo *handle = (DLLiteInfo *)modelHandle; + char **outputNames = (char **)malloc(outputs.size() * sizeof(char *)); + for (size_t i = 0; i < outputs.size(); i++) { + U32 length = outputs[i].name.length(); + outputNames[i] = (char *)malloc(length + 1); + memcpy(outputNames[i], outputs[i].name.c_str(), length); + outputNames[i][length] = '\0'; + } + bolt::ResultHandle rh = (bolt::ResultHandle)AllocSpecificResultHandle( + handle->modelHandle, outputs.size(), outputNames); + for (size_t i = 0; i < outputs.size(); i++) { + free(outputNames[i]); + } + free(outputNames); + return rh; +} + +bolt::ReturnStatus bolt::RunModel(bolt::ModelHandle modelHandle, + bolt::ResultHandle resultHandle, + const std::vector &inputs) +{ + DLLiteInfo *handle = (DLLiteInfo *)modelHandle; + ModelHandleInfo *ihInfo = (ModelHandleInfo *)handle->modelHandle; + if (nullptr == ihInfo) { + return bolt::ReturnStatus::NULLPTR; + } + CNN *cnn = (CNN *)ihInfo->cnn; + DEVICE_TYPE device = ihInfo->deviceType; + ResultHandleInner *ir_inner = (ResultHandleInner *)resultHandle; + + for (size_t index = 0; index < inputs.size(); index++) { + cnn->copy_to_named_input(inputs[index].name, (U8 *)(inputs[index].buffer.first)); + } + cnn->run(); + + DataDesc *outputArrPtr = ir_inner->outputArr; + for (U32 curIndex = 0; curIndex < ir_inner->num_outputs; curIndex++) { + Tensor output_tensor = cnn->get_tensor_by_name(outputArrPtr[curIndex].name); + UpdateDataDesc(output_tensor.get_desc(), &(outputArrPtr[curIndex])); + if (device == GPU_MALI) { +#ifdef _USE_MALI + auto mem = (OclMemory *)output_tensor.get_memory(); + outputArrPtr[curIndex].dataPtr = mem->get_mapped_ptr(); +#else + UNI_WARNING_LOG("this binary not support GPU, please recompile project with GPU " + "compile options\n"); +#endif + } else { + outputArrPtr[curIndex].dataPtr = ((CpuMemory *)(output_tensor.get_memory()))->get_ptr(); + } + } + return bolt::ReturnStatus::SUCCESS; +} + +bolt::ReturnStatus bolt::GetOutputTensors( + bolt::ResultHandle resultHandle, std::vector &outputs) +{ + ResultHandleInner *ir_inner = (ResultHandleInner *)resultHandle; + if (nullptr == ir_inner) { + return bolt::ReturnStatus::NULLPTR; + } + DataDesc *outputArrPtr = (*ir_inner).outputArr; + + for (size_t i = 0; i < outputs.size(); i++) { + U32 n = outputArrPtr[i].dims[0]; + U32 c = outputArrPtr[i].dims[1]; + U32 h = outputArrPtr[i].dims[2]; + U32 w = outputArrPtr[i].dims[3]; + DataType dt = outputArrPtr[i].dt; + U32 size = n * c * h * w * bytesOf(dt); + outputs[i].buffer = std::make_pair((void *)outputArrPtr[i].dataPtr, size); + } + return bolt::ReturnStatus::SUCCESS; +} + +bolt::ReturnStatus bolt::FreeResult(bolt::ResultHandle resultHandle) +{ + if (nullptr == resultHandle) { + return bolt::ReturnStatus::NULLPTR; + } + 
FreeResultHandle((ResultHandle)resultHandle); + return bolt::ReturnStatus::SUCCESS; +} + +bolt::ReturnStatus bolt::DestroyModel(bolt::ModelHandle modelHandle) +{ + DLLiteInfo *handle = (DLLiteInfo *)modelHandle; + ModelHandleInfo *ihInfo = (ModelHandleInfo *)handle->modelHandle; + + if (nullptr == ihInfo) { + UNI_ERROR_LOG("DestroyModel received null handle.\n"); + return bolt::ReturnStatus::NULLPTR; + } + CNN *cnn = (CNN *)ihInfo->cnn; + if (nullptr == cnn) { + UNI_WARNING_LOG("nullptr in DestroyModel. Resource cleared.\n"); + delete ihInfo; + return bolt::ReturnStatus::SUCCESS; + } + delete cnn; + delete ihInfo; + return bolt::ReturnStatus::SUCCESS; +} diff --git a/inference/engine/src/cnn.cpp b/inference/engine/src/cnn.cpp new file mode 100644 index 00000000..d96568f7 --- /dev/null +++ b/inference/engine/src/cnn.cpp @@ -0,0 +1,610 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
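Editorial note: taken together, the DLLite entry points implemented above compose into a short inference loop. The sketch below is not part of this diff; the model path, the zero-fill input, and the value-initialized config are illustrative assumptions, not Bolt requirements.

#include "Bolt.h"  // inference/engine/api/dllite
#include <cstring>
#include <vector>

int run_once()
{
    bolt::ModelConfig cfg{};  // value-initialize so modelStream.first starts as nullptr
    cfg.modelPath = "/data/local/tmp/model_f32.bolt";  // hypothetical model file
    cfg.affinity = bolt::AffinityType::CPU_HIGH_PERFORMANCE;
    bolt::ModelHandle mh = bolt::CreateModel(cfg);

    std::vector<bolt::IOTensor> inputs, outputs;
    if (bolt::GetIOFormats(mh, inputs, outputs) != bolt::ReturnStatus::SUCCESS) {
        return -1;
    }
    bolt::PrepareModel(mh, inputs);     // returns early: GetIOFormats already readied the model
    bolt::GetInputTensors(mh, inputs);  // inputs[i].buffer now points at internal storage
    for (auto &in : inputs) {
        std::memset(in.buffer.first, 0, in.buffer.second);  // stand-in for real input data
    }
    bolt::ResultHandle rh = bolt::AllocResult(mh, outputs);
    bolt::RunModel(mh, rh, inputs);
    bolt::GetOutputTensors(rh, outputs);  // outputs[i].buffer now points at result data

    bolt::FreeResult(rh);
    bolt::DestroyModel(mh);
    return 0;
}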
+ +#include "cnn.h" +#if defined(_USE_CPU) +#include "cpu/factory_cpu.hpp" +#endif +#ifdef _USE_MALI +#include "ocl/factory_ocl.hpp" +#endif + +bool is_same_tensor(Tensor a, Tensor b) +{ + auto ptr_a = ((CpuMemory *)a.get_memory())->get_ptr(); + auto ptr_b = ((CpuMemory *)b.get_memory())->get_ptr(); + bool ret; + if (ptr_a != nullptr && ptr_a == ptr_b) { + ret = true; + } else { + ret = false; + } + return ret; +} + +CNN CNN::clone() +{ + CNN cnn = *this; + for (U32 i = 0; i < cnn.ops.size(); i++) { + cnn.ops[i] = cnn.ops[i]->clone(); + cnn.operatorMap[cnn.ops[i]->get_name()] = cnn.ops[i]; + } + for (auto &tensor : cnn.tensorMap) { + std::shared_ptr cloneTensor = std::shared_ptr(new Tensor()); + *cloneTensor = tensor.second->clone(false); + tensor.second = cloneTensor; + } + cnn.assign_output_tensor(); + cnn.tmpTensor = this->tmpTensor.clone(); + for (auto &operatorTensor : cnn.operatorTensorMap) { + std::string operatorName = operatorTensor.first; + std::vector> tensors(operatorTensor.second.size()); + for (U32 i = 0; i < operatorTensor.second.size(); i++) { + for (U32 j = 0; j < operatorTensor.second[i].size(); j++) { + std::string tensorName = operatorTensor.second[i][j]; + if (cnn.weightOpOutputNames.find(tensorName) != cnn.weightOpOutputNames.end()) { + cnn.tensorMap[tensorName] = this->tensorMap[tensorName]; + } + tensors[i].push_back(*(cnn.tensorMap[tensorName].get())); + } + } + cnn.operatorMap[operatorName]->set_input_output_tensors(tensors[0], tensors[1]); + cnn.operatorMap[operatorName]->set_tmp_memory(cnn.tmpTensor); + } + for (auto &tensor : cnn.inputTensors) { + tensor.second = cnn.tensorMap[tensor.first]; + } + for (auto &tensor : cnn.outputTensors) { + tensor.second = cnn.tensorMap[tensor.first]; + } + + // check + CHECK_REQUIREMENT(!is_same_tensor(this->tmpTensor, cnn.tmpTensor)); + for (U32 i = 0; i < this->storageMemory.size(); i++) { + CHECK_REQUIREMENT( + !is_same_tensor(*(this->storageMemory[i].get()), *(cnn.storageMemory[i].get()))); + } + for (auto iter : this->tensorMap) { + if (cnn.weightOpOutputNames.find(iter.first) == cnn.weightOpOutputNames.end()) { + CHECK_REQUIREMENT( + !is_same_tensor(*(iter.second.get()), *(cnn.tensorMap[iter.first].get()))); + } + } + for (auto iter : this->inputTensors) { + CHECK_REQUIREMENT( + !is_same_tensor(*(iter.second.get()), *(cnn.inputTensors[iter.first].get()))); + } + for (auto iter : this->outputTensors) { + CHECK_REQUIREMENT( + !is_same_tensor(*(iter.second.get()), *(cnn.outputTensors[iter.first].get()))); + } + for (auto iter : this->operatorMap) { + std::shared_ptr op1 = iter.second; + std::shared_ptr op2 = cnn.operatorMap[iter.first]; + for (int i = 0; i < 2; i++) { + std::vector names = this->operatorTensorMap[iter.first][i]; + std::vector tensor1, tensor2; + if (i == 0) { + tensor1 = op1->get_input_tensors(); + tensor2 = op2->get_input_tensors(); + } else { + tensor1 = op1->get_output_tensors(); + tensor2 = op2->get_output_tensors(); + } + CHECK_REQUIREMENT(tensor1.size() == tensor2.size()); + for (U32 j = 0; j < tensor1.size(); j++) { + if (tensor1[j].bytes() != 0) { + CHECK_REQUIREMENT( + is_same_tensor(tensor1[j], *(this->tensorMap[names[j]].get()))); + CHECK_REQUIREMENT(is_same_tensor(tensor2[j], *(cnn.tensorMap[names[j]].get()))); + if (this->weightOpOutputNames.find(names[j]) == this->weightOpOutputNames.end()) { + CHECK_REQUIREMENT(!is_same_tensor(tensor1[j], tensor2[j])); + } + } + } + } + } + return cnn; +} + +void CNN::sort_operators_sequential(const ModelSpec *ms) +{ + int opNum = ms->num_operator_specs; + 
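+    // Assumes the model file already lists operators in executable order;
+    // "data" nodes are graph inputs, not runnable operators, so skip them.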
this->sortedOps.clear(); + for (int i = 0; i < opNum; i++) { + std::string opName = ms->ops[i].name; + if (opName.compare("data") == 0) { + continue; + } + this->sortedOps.push_back(opName); + } +} + +void CNN::initialize_ops(const ModelSpec *ms) +{ + int opNum = ms->num_operator_specs; + + for (int i = 0; i < ms->num_inputs; i++) { + this->modelInputTensorNames.push_back(ms->input_names[i]); + this->modelInputTensorDescs.push_back(ms->input_dims[i]); + } + for (int i = 0; i < ms->num_outputs; i++) { + this->modelOutputTensorNames.push_back(ms->output_names[i]); + } + + U32 operatorIndex = 0; + std::map operatorIndexMap; + for (int i = 0; i < opNum; i++) { + OperatorSpec curOps = ms->ops[i]; + std::string opName = curOps.name; + if (opName.compare("data") == 0) { + continue; + } + operatorIndexMap[opName] = operatorIndex++; + } + + for (int i = 0; i < opNum; i++) { + OperatorSpec curOps = ms->ops[i]; + std::string opName = curOps.name; + if (opName.compare("data") == 0) { + continue; + } + std::vector inputTensorsName; + std::vector outputTensorsName; + int inputTensorsNum = curOps.num_inputs; + for (int j = 0; j < inputTensorsNum; j++) { + inputTensorsName.push_back(curOps.input_tensors_name[j]); + } + + int outputTensorsNum = curOps.num_outputs; + for (int j = 0; j < outputTensorsNum; j++) { + outputTensorsName.push_back(curOps.output_tensors_name[j]); + } + + int numTensors = inputTensorsNum + outputTensorsNum; + std::vector tensorPositions(numTensors); + memcpy(tensorPositions.data(), curOps.tensor_positions, numTensors * bytesOf(DT_I32)); + // create op object + std::shared_ptr factory; + if (this->deviceInfo.schedule == MALI) { +#ifdef _USE_MALI + auto factory_ocl = (Factory *)(new FactoryOCL()); + factory = std::shared_ptr(factory_ocl); + for (int j = 0; j < outputTensorsNum; j++) { + auto curOutputTensorName = outputTensorsName[j]; + for (auto modelOutputTensorName : modelOutputTensorNames) { + if (modelOutputTensorName == curOutputTensorName) { + tensorPositions[j + inputTensorsNum] = -1; + } + } + } +#endif + } else { + auto factory_cpu = (Factory *)(new FactoryCPU()); + factory = std::shared_ptr(factory_cpu); + } + std::shared_ptr op = factory->createOperators(curOps, this->dt, operatorIndexMap, + &this->tensorMap, inputTensorsName, outputTensorsName, &weightOpOutputNames); + op->set_name(opName); + op->set_schedule(this->deviceInfo.schedule); + op->set_tensor_positions(tensorPositions); + op->init_feature_scale(curOps.num_quant_feature, curOps.feature_scale); + op->set_algorithm_map(this->algorithmMap); + this->ops.push_back(op); + + // setup operatorMap, tensorMap, operatorTensorMap + this->add(op, inputTensorsName, outputTensorsName); + } + + // setup WeightSpec ptr in WeightOperator + for (int i = 0; i < ms->num_weight_specs; i++) { + WeightSpec curOpWs = ms->ws[i]; + std::string opName = curOpWs.op_name; + auto op = this->operatorMap[opName]; + auto weightOp = dynamic_cast(op.get()); + weightOp->set_weightspec_ptr(curOpWs); + if (curOpWs.bytes_of_vec != 0) { + CHECK_REQUIREMENT(curOpWs.vec != nullptr); + weightOp->set_hasBias(true); + } + // These two pointers will be managed by engine via shared_ptr, so mt_destroy_model should not free them + ms->ws[i].weight = nullptr; + ms->ws[i].vec = nullptr; + } +} + +void CNN::ready(std::map inputDescMap) +{ + UNI_DEBUG_LOG("ready() schedule: %d\n", (int)(this->deviceInfo.schedule)); + UNI_PROFILE( + { + this->infer_output_tensors_size(inputDescMap); + // handle the weight ops + for (auto op : this->ops) { + UNI_DEBUG_LOG("ready() 
op: %s init weight and infer forward algorithm\n", + op->get_name().c_str()); + if (op->is_weight()) { + auto weightOpPtr = dynamic_cast(op.get()); + CHECK_STATUS(weightOpPtr->init_weight_bias_from_model(nullptr)); + } + CHECK_STATUS(op->infer_forward_algorithm(this->algorithmMap)); + } + + this->tmpTensor = *(this->allocate_tensor().get()); + this->infer_tmp_memory_size(); + this->assign_tmp_tensor(); + // transform filter + for (auto op : this->ops) { + UNI_DEBUG_LOG("ready() op: %s transform filter\n", op->get_name().c_str()); + if (op->is_weight()) { + auto weightOpPtr = dynamic_cast(op.get()); + CHECK_STATUS(weightOpPtr->transform_filter()); + } + } + this->infer_tmp_memory_size(); + this->tmpTensor.alloc(); + this->assign_output_tensor(); + }, + std::string("ready"), std::string("prepare")); +} + +void CNN::reready(std::map inputDescMap) +{ + this->infer_output_tensors_size(inputDescMap); + if (this->memoryTracker.getMemoryNeedAssign()) { + this->assign_output_tensor(); + } + this->infer_tmp_memory_size(); + this->tmpTensor.alloc(); +} + +EE CNN::mark_input_output() +{ + this->inputTensors.clear(); + for (U32 i = 0; i < this->modelInputTensorNames.size(); i++) { + std::string str = this->modelInputTensorNames[i]; + if (tensorMap.find(str) != tensorMap.end()) { + inputTensors[str] = tensorMap[str]; + } else { + return NOT_MATCH; + } + } + this->outputTensors.clear(); + for (U32 i = 0; i < this->modelOutputTensorNames.size(); i++) { + std::string str = this->modelOutputTensorNames[i]; + if (tensorMap.find(str) != tensorMap.end()) { + outputTensors[str] = tensorMap[str]; + } else { + return NOT_MATCH; + } + } + return SUCCESS; +} + +void CNN::copy_to_named_input(std::string inputName, const U8 *data) +{ + if (inputTensors.find(inputName) == inputTensors.end()) { + CHECK_STATUS(NOT_MATCH); + } + auto tensorPtr = this->inputTensors[inputName]; + Tensor input; + input.resize(tensorPtr->get_desc()); + std::shared_ptr shared_data((U8 *)data, [](U8 *ptr) {}); + ((CpuMemory *)(input.get_memory()))->set_shared_ptr(shared_data); + tensorPtr->copy_from(&input); +} + +void CNN::set_input_tensors_value(std::map> modelTensorsInput) +{ + for (auto &modelTensorInput : modelTensorsInput) { + std::string inputName = modelTensorInput.first; + std::shared_ptr data = modelTensorInput.second; + if (inputTensors.find(inputName) == inputTensors.end()) { + CHECK_STATUS(NOT_MATCH); + } + auto tensorPtr = this->inputTensors[inputName]; + Tensor input; + input.resize(tensorPtr->get_desc()); + ((CpuMemory *)(input.get_memory()))->set_shared_ptr(data); + tensorPtr->reuse(&input); + } +} + +std::map> CNN::get_inputs() +{ + std::map> ret; + if (this->deviceInfo.schedule == MALI) { +#ifdef _USE_MALI + for (U32 i = 0; i < modelInputTensorNames.size(); i++) { + std::shared_ptr tmpTensorCPU(new Tensor()); + tmpTensorCPU->resize(modelInputTensorDescs[i]); + tmpTensorCPU->alloc(); + auto p = std::pair>( + modelInputTensorNames[i], tmpTensorCPU); + ret.insert(p); + } +#endif + } else { + ret = this->inputTensors; + } + return ret; +} + +std::map> CNN::get_outputs() +{ + return this->outputTensors; +} + +Tensor CNN::get_tensor_by_name(std::string tensorName) +{ + if (this->tensorMap.find(tensorName) == this->tensorMap.end()) { + CHECK_STATUS(NOT_MATCH); + } + return *(this->tensorMap[tensorName].get()); +} + +TensorDesc CNN::get_tensor_desc_by_name(std::string tensorName) +{ + TensorDesc desc = tensor4d(DT_U8, 0, 0, 0, 0); + if (this->tensorMap.find(tensorName) != this->tensorMap.end()) { + desc = 
this->tensorMap[tensorName]->get_desc(); + } + return desc; +} + +std::vector CNN::get_model_input_tensor_names() +{ + return this->modelInputTensorNames; +} + +std::vector CNN::get_model_input_tensor_descs() +{ + return this->modelInputTensorDescs; +} + +std::vector CNN::get_model_output_tensor_names() +{ + return this->modelOutputTensorNames; +} + +EE CNN::infer_output_tensors_size(std::map inputDescMap) +{ + this->set_input_tensors_desc(inputDescMap); + for (auto iter : inputDescMap) { + UNI_DEBUG_LOG("infer_output_tensors_size() model input: %s desc %s\n", iter.first.c_str(), + tensorDesc2Str(iter.second).c_str()); + } + this->infer_layout_desc(); + this->update_op_tensors(); + return SUCCESS; +} + +void CNN::assign_output_tensor() +{ + this->storageMemory.clear(); + auto storageSize = this->memoryTracker.getStorageSize(); + for (U32 size : storageSize) { + auto tensor = this->allocate_tensor(size); + this->storageMemory.push_back(tensor); + } + + std::set input_set(modelInputTensorNames.begin(), modelInputTensorNames.end()); + std::set output_set(modelOutputTensorNames.begin(), modelOutputTensorNames.end()); + for (std::string opName : this->sortedOps) { + std::shared_ptr op = this->operatorMap[opName]; + std::vector tensorPositions = op->get_tensor_positions(); + std::vector> tensors(this->operatorTensorMap[opName].size()); + for (U32 i = 0, tensorIter = 0; i < this->operatorTensorMap[opName].size(); i++) { + std::vector &tensorNames = this->operatorTensorMap[opName][i]; + for (std::string &tensorName : tensorNames) { + UNI_DEBUG_LOG("assign_output_tensor() tensor %s slot %d\n", tensorName.c_str(), + tensorPositions[tensorIter]); + auto tensor = this->tensorMap[tensorName]; + if (i == 1 || input_set.find(tensorName) != input_set.end()) { + if (tensorPositions[tensorIter] != -1) { + auto mem = this->storageMemory[tensorPositions[tensorIter]].get(); + tensor->reuse(mem); + } else if (this->weightOpOutputNames.find(tensorName) == + this->weightOpOutputNames.end()) { + if (this->deviceInfo.schedule == MALI && + output_set.find(tensorName) != output_set.end()) { +#ifdef _USE_MALI + auto mem = (OclMemory *)tensor->get_memory(); + mem->mapped_alloc(); +#endif + } else { + tensor->alloc(); + } + } + } + tensorIter++; + tensors[i].push_back(*(tensor.get())); + } + } + op->set_input_output_tensors(tensors[0], tensors[1]); + } + this->memoryTracker.setMemoryAssigned(); +} + +void CNN::run() +{ + for (U32 opIndex = 0; opIndex < ops.size();) { + std::shared_ptr op = this->ops[opIndex]; + UNI_DEBUG_LOG( + "run() op: %s type: %s\n", op->get_name().c_str(), OperatorTypeName()[op->get_type()]); + if (op->get_type() == OT_Repeat || op->get_type() == OT_Jump) { + opIndex = op->get_next_operator_index(); + } else { + UNI_PROFILE(op->run(), op->get_name(), + std::string(OperatorTypeName()[op->get_type()]) + std::string("::run")); + opIndex++; + } +#ifdef _DEBUG + std::vector outputTensors = op->get_output_tensors(); + for (U32 i = 0; i < outputTensors.size(); i++) { + Tensor outputTensor = outputTensors[i]; + std::string line = outputTensor.string(32); + UNI_DEBUG_LOG(" output: %s\n", line.c_str()); + } +#endif + } +} + +std::shared_ptr CNN::allocate_tensor(U32 size) +{ + MemoryType type = CPUMem; + if (this->deviceInfo.schedule == MALI) { + type = OCLMem; + } + std::shared_ptr tensor = std::shared_ptr(new Tensor(type)); + tensor->resize(tensor1d(DT_U8, size)); + tensor->alloc(); + return tensor; +} + +void CNN::add(std::shared_ptr op, + std::vector inputTensorsName, + std::vector outputTensorsName) +{ + 
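+    // Register the operator and remember its input/output tensor names; any
+    // tensor name seen for the first time gets a placeholder Tensor that the
+    // later shape-inference and memory-assignment passes resize or reuse.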
std::string operatorName = op->get_name(); + this->operatorMap[operatorName] = op; + + if (this->operatorTensorMap.find(operatorName) == this->operatorTensorMap.end()) { + this->operatorTensorMap[operatorName] = {inputTensorsName, outputTensorsName}; + } else { + UNI_ERROR_LOG("duplicate tensor: %s\n", operatorName.c_str()); + } + + for (std::string &inputName : inputTensorsName) { + if (this->tensorMap.find(inputName) == this->tensorMap.end()) { + this->tensorMap[inputName] = this->allocate_tensor(); + } + } + + for (std::string &outputName : outputTensorsName) { + if (this->tensorMap.find(outputName) == this->tensorMap.end()) { + this->tensorMap[outputName] = this->allocate_tensor(); + } + } +} + +void CNN::infer_layout_desc() +{ + for (std::string opName : this->sortedOps) { + auto op = this->operatorMap[opName]; + UNI_DEBUG_LOG("op: %s type: %s\n", opName.c_str(), OperatorTypeName()[op->get_type()]); + std::vector curOpInputTensorName = this->operatorTensorMap[opName][0]; + std::vector curOpOutputTensorName = this->operatorTensorMap[opName][1]; + std::vector inputTensors; + std::vector outputTensors; + for (std::string inputTensorName : curOpInputTensorName) { + auto tensor = this->tensorMap[inputTensorName].get(); + inputTensors.push_back(tensor); + UNI_DEBUG_LOG(" input: %s desc %s\n", inputTensorName.c_str(), + tensorDesc2Str(tensor->get_desc()).c_str()); + } + for (std::string outputTensorName : curOpOutputTensorName) { + auto tensor = this->tensorMap[outputTensorName].get(); + outputTensors.push_back(tensor); + } + CHECK_STATUS(op->infer_output_tensors_size(inputTensors, outputTensors)); + for (std::string outputTensorName : curOpOutputTensorName) { + UNI_DEBUG_LOG(" output: %s desc %s\n", outputTensorName.c_str(), + tensorDesc2Str(this->tensorMap[outputTensorName]->get_desc()).c_str()); + } + } +} + +void CNN::update_op_tensors() +{ + for (auto opName : this->sortedOps) { + auto op = this->operatorMap[opName]; + UNI_DEBUG_LOG("update_op_tensors() op: %s type: %s\n", opName.c_str(), + OperatorTypeName()[op->get_type()]); + std::vector curOpInputTensorName = this->operatorTensorMap[opName][0]; + std::vector curOpOutputTensorName = this->operatorTensorMap[opName][1]; + std::vector inTensors, outTensors; + for (std::string inputTensorName : curOpInputTensorName) { + auto tensorTmp = this->tensorMap[inputTensorName]; + inTensors.push_back(*tensorTmp.get()); + } + + for (std::string outputTensorName : curOpOutputTensorName) { + auto tensorTmp = this->tensorMap[outputTensorName]; + outTensors.push_back(*tensorTmp.get()); + } + op->set_input_output_tensors(inTensors, outTensors); + + curOpInputTensorName.insert( + curOpInputTensorName.end(), curOpOutputTensorName.begin(), curOpOutputTensorName.end()); + memoryTracker.trackOpTensorSizes(op, curOpInputTensorName); + } + check_memory_reuse_ratio(); +} + +void CNN::set_input_tensors_desc(std::map inputDescMap) +{ + for (auto iter : inputDescMap) { + if (tensorMap.find(iter.first) == tensorMap.end()) { + UNI_WARNING_LOG("Unused model input node: %s\n", iter.first.c_str()); + continue; + } + TensorDesc desc = iter.second; + (this->tensorMap[iter.first].get())->resize(desc); + } +} + +void CNN::infer_tmp_memory_size() +{ + U32 tmpSize = this->tmpTensor.bytes(); + // input data format transform tmp buffer + if (this->deviceInfo.schedule == MALI) { + for (auto desc : modelInputTensorDescs) { + tmpSize = UNI_MAX(tmpSize, tensorNumBytes(desc)); + } + } + + // operator tmp buffer + for (auto &op : this->ops) { + auto len = 
op->infer_tmp_memory_size(); + tmpSize = UNI_MAX(tmpSize, len); + } + this->tmpTensor.resize(tensor1d(DT_U8, tmpSize)); +} + +void CNN::assign_tmp_tensor() +{ + this->tmpTensor.alloc(); + for (auto &op : this->ops) { + op->set_tmp_memory(this->tmpTensor); + } +} + +void CNN::check_memory_reuse_ratio() +{ + U32 originalSize = 0; + U32 standaloneSize = 0; + for (auto tensor : this->tensorMap) { + U32 tensorSize = tensor.second->bytes(); + originalSize += tensorSize; + if (weightOpOutputNames.find(tensor.first) != weightOpOutputNames.end()) { + standaloneSize += tensorSize; + } + } + UNI_DEBUG_LOG("tensor memory: originally %d tensors take %u bytes.\n", + (int)this->tensorMap.size(), originalSize); + UNI_DEBUG_LOG("tensor memory: now %u tensors take %u bytes, and %u bytes are reserved " + "for standalone tensors (e.g. loop topology). reuse rate: %f\n", + this->memoryTracker.getNumSlots(), this->memoryTracker.getSizeSum(), standaloneSize, + (F32)originalSize / (this->memoryTracker.getSizeSum() + standaloneSize)); +} diff --git a/inference/engine/src/data_loader.cpp b/inference/engine/src/data_loader.cpp new file mode 100644 index 00000000..8750162f --- /dev/null +++ b/inference/engine/src/data_loader.cpp @@ -0,0 +1,377 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
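Editorial sketch (not part of this diff): the slot-based reuse that CNN::assign_output_tensor() performs above, reduced to its core. All names below are illustrative; only the rule is Bolt's: a tensor position of -1 means a private buffer, anything else aliases a shared storage slot sized for the largest tensor ever mapped to it.

#include <memory>
#include <vector>

using Buffer = std::shared_ptr<char>;

static Buffer makeBuffer(size_t bytes)
{
    return Buffer(new char[bytes], std::default_delete<char[]>());
}

// slots[s] is pre-sized to max(bytes[i]) over all i with positions[i] == s,
// mirroring what memoryTracker.getStorageSize() provides in the code above.
static void assignBuffers(const std::vector<int> &positions,
    const std::vector<size_t> &bytes,
    std::vector<Buffer> &slots,
    std::vector<Buffer> &tensors)
{
    for (size_t i = 0; i < positions.size(); i++) {
        tensors[i] = (positions[i] == -1)
            ? makeBuffer(bytes[i])    // model I/O or standalone tensor
            : slots[positions[i]];    // lifetimes do not overlap, so share storage
    }
}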
+ +#ifdef _BUILD_TEST + +#include +#include +#include +#include +#include +#include +#include + +#include "image_processing.hpp" +#include "data_loader.hpp" + +template +void init_one(U8 *memory, U32 len) +{ + T *data = (T *)memory; + for (U32 i = 0; i < len; i++) { + data[i] = 1; + } +} + +template +void init_rand(U8 *memory, U32 len) +{ + T *data = (T *)memory; + for (U32 i = 0; i < len; i++) { + data[i] = (rand() % 1024) / (T)1024.0 - (T)0.5; + } +} + +void get_files(std::string directoryName, std::vector &files) +{ + if (directoryName.empty()) { + UNI_ERROR_LOG("null data\n"); + } + DIR *directory = opendir(directoryName.c_str()); + if (NULL == directory) { + UNI_ERROR_LOG("permission denied to access %s\n", directoryName.c_str()); + } + struct dirent *file; + while ((file = readdir(directory)) != NULL) { + if (strcmp(file->d_name, ".") == 0 || strcmp(file->d_name, "..") == 0) { + continue; + } + if (file->d_type == DT_DIR) { + continue; + } else { + files.push_back(directoryName + "/" + file->d_name); + } + } + closedir(directory); +} + +std::vector load_jpeg( + std::string dataPath, std::vector imageDesc, ImageFormat ImageFormat, F32 scaleValue) +{ + FILE *file = fopen(dataPath.c_str(), "rb"); + CHECK_REQUIREMENT(NULL != file); + + struct jpeg_decompress_struct info; + struct jpeg_error_mgr err; + + info.err = jpeg_std_error(&err); + jpeg_create_decompress(&info); + + jpeg_stdio_src(&info, file); + jpeg_read_header(&info, TRUE); + + jpeg_start_decompress(&info); + + U32 width = info.output_width; + U32 height = info.output_height; + U32 numChannels = info.output_components; + U32 dataSize = numChannels * height * width; + + UNI_DEBUG_LOG("%s: channels %u , out color space %d\n", dataPath.c_str(), numChannels, + info.out_color_space); + CHECK_REQUIREMENT(2 == info.out_color_space); // Support RGB for now + + U8 *data = (U8 *)malloc(dataSize); + JSAMPROW row_pointer[1]; + while (info.output_scanline < info.output_height) { + row_pointer[0] = data + info.output_scanline * width * numChannels; + int ret = jpeg_read_scanlines(&info, row_pointer, 1); + CHECK_REQUIREMENT(ret == 1); + } + + jpeg_finish_decompress(&info); + jpeg_destroy_decompress(&info); + fclose(file); + + TensorDesc rgbDesc = tensor4df(DT_U8, DF_RGB, 1, 3, height, width); + Tensor rgbTensor = Tensor::alloc_sized(rgbDesc); + U8 *rgb = (U8 *)((CpuMemory *)(rgbTensor.get_memory()))->get_ptr(); + U8 *r = rgb; + U8 *g = r + height * width; + U8 *b = g + height * width; + + U8 *dataMov = data; + for (U32 i = 0; i < height * width; i++) { + r[i] = dataMov[0]; + g[i] = dataMov[1]; + b[i] = dataMov[2]; + dataMov += numChannels; + } + free(data); + + std::shared_ptr imageTensor = + load_resize_image(rgbTensor, imageDesc[0], ImageFormat, scaleValue); + std::vector result; + imageTensor->resize(imageDesc[0]); + result.push_back(*imageTensor.get()); + return result; +} + +std::vector load_fake_data(std::vector dataDesc) +{ + std::vector result; + for (U32 index = 0; index < dataDesc.size(); index++) { + Tensor tensor = Tensor::alloc_sized(dataDesc[index]); + U8 *ptr = (U8 *)((CpuMemory *)(tensor.get_memory()))->get_ptr(); + switch (dataDesc[index].dt) { + case DT_F32: { + init_one(ptr, tensorNumElements(dataDesc[index])); + break; + } +#ifdef __aarch64__ + case DT_F16: { + init_one(ptr, tensorNumElements(dataDesc[index])); + break; + } +#endif + case DT_U32: { + init_one(ptr, tensorNumElements(dataDesc[index])); + break; + } + case DT_I32: { + init_one(ptr, tensorNumElements(dataDesc[index])); + break; + } + default: + 
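+                // No synthetic initializer for this element type; fail fast
+                // rather than feed uninitialized test data downstream.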
CHECK_STATUS(NOT_SUPPORTED); + break; + } + result.push_back(tensor); + } + return result; +} + +Tensor fscanfReadData(FILE *f, TensorDesc desc) +{ + Tensor tensor = Tensor::alloc_sized(desc); + U32 size = tensor.length(); + DataType dataType = desc.dt; + auto ptr = ((CpuMemory *)(tensor.get_memory()))->get_ptr(); + switch (dataType) { + case DT_F32: { + F32 *dataPtr = (F32 *)ptr; + for (U32 i = 0; i < size; i++) { + fscanf(f, "%f", dataPtr + i); + } + break; + } +#ifdef __aarch64__ + case DT_F16: { + F16 *dataPtr = (F16 *)ptr; + F32 value; + for (U32 i = 0; i < size; i++) { + fscanf(f, "%f", &value); + dataPtr[i] = (F16)value; + } + break; + } +#endif + case DT_U32: { + U32 *dataPtr = (U32 *)ptr; + for (U32 i = 0; i < size; i++) { + fscanf(f, "%u", dataPtr + i); + } + break; + } + case DT_I32: { + I32 *dataPtr = (I32 *)ptr; + for (U32 i = 0; i < size; i++) { + fscanf(f, "%d", dataPtr + i); + } + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return tensor; +} + +std::vector load_txt(std::string dataPath, std::vector dataDesc) +{ + std::vector result; + FILE *f = fopen(dataPath.c_str(), "r"); + CHECK_REQUIREMENT(f != nullptr); + for (U32 index = 0; index < dataDesc.size(); index++) { + result.push_back(fscanfReadData(f, dataDesc[index])); + } + fclose(f); + return result; +} + +std::vector load_seq(std::string dataPath, std::vector dataDesc) +{ + std::vector result; + FILE *f = fopen(dataPath.c_str(), "r"); + CHECK_REQUIREMENT(f != nullptr); + for (U32 index = 0; index < dataDesc.size(); index++) { + U32 sequenceLen = 0; + fscanf(f, "%u", &sequenceLen); + TensorDesc sequenceDesc = dataDesc[index]; + sequenceDesc.dims[0] = sequenceLen; + for (U32 j = 1; j < sequenceDesc.nDims; j++) { + sequenceDesc.dims[j] = 1; + } + + result.push_back(fscanfReadData(f, sequenceDesc)); + } + fclose(f); + return result; +} + +std::vector load_bin( + std::string dataPath, std::vector sourceDataType, std::vector dataDesc) +{ + std::vector result; + FILE *f = fopen(dataPath.c_str(), "r"); + if (nullptr == f) { + result = load_fake_data(dataDesc); + } else { + for (U32 index = 0; index < dataDesc.size(); index++) { + TensorDesc sourceDesc = dataDesc[index]; + sourceDesc.dt = sourceDataType[index]; + Tensor tensor = Tensor::alloc_sized(sourceDesc); + U32 len = tensor.length(); + auto ptr = ((CpuMemory *)(tensor.get_memory()))->get_ptr(); + U32 readLength = fread(ptr, bytesOf(sourceDataType[index]), len, f); + CHECK_REQUIREMENT(len == readLength); + if (sourceDataType[index] != dataDesc[index].dt) { + Tensor transform_tensor = Tensor::alloc_sized(dataDesc[index]); + if (0) { +#ifdef __aarch64__ + } else if (sourceDataType[index] == DT_F32 && dataDesc[index].dt == DT_F16) { + F32 *ptr1 = (F32 *)ptr; + F16 *ptr2 = (F16 *)((CpuMemory *)(transform_tensor.get_memory()))->get_ptr(); + for (U32 i = 0; i < len; i++) { + ptr2[i] = (F16)ptr1[i]; + } +#endif + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + result.push_back(transform_tensor); + } else { + result.push_back(tensor); + } + } + fclose(f); + } + return result; +} + +int string_end_with(std::string s, std::string sub) +{ + std::transform(s.begin(), s.end(), s.begin(), ::tolower); + std::transform(sub.begin(), sub.end(), sub.begin(), ::tolower); + return s.rfind(sub) == (s.length() - sub.length()) ? 
1 : 0; +} + +std::vector load_data(std::string directoryPath, + std::vector dataDesc, + std::vector> *datas) +{ + std::vector dataPaths; + if (directoryPath == "") { + std::vector data = load_fake_data(dataDesc); + (*datas).push_back(data); + dataPaths.push_back("fake data"); + return dataPaths; + } + + std::vector paths; + get_files(directoryPath, paths); + std::vector data; + for (U32 i = 0; i < paths.size(); i++) { + std::string dataPath = paths[i]; + if (string_end_with(dataPath, ".txt")) { + data = load_txt(dataPath, dataDesc); + } else if (string_end_with(dataPath, ".seq")) { + data = load_seq(dataPath, dataDesc); + } else { + UNI_ERROR_LOG("can not load data %s\n", dataPath.c_str()); + } + (*datas).push_back(data); + dataPaths.push_back(dataPath); + } + return dataPaths; +} + +std::vector load_image_with_scale(std::string directoryPath, + std::vector dataDesc, + std::vector> *datas, + ImageFormat ImageFormat, + F32 scaleValue) +{ + std::vector dataPaths; + if (directoryPath == "") { + std::vector data = load_fake_data(dataDesc); + (*datas).push_back(data); + dataPaths.push_back("fake data"); + return dataPaths; + } + + std::vector paths; + get_files(directoryPath, paths); + std::vector data; + for (U32 i = 0; i < paths.size(); i++) { + std::string dataPath = paths[i]; + if (string_end_with(dataPath, ".jpg") || string_end_with(dataPath, ".jpeg")) { + data = load_jpeg(dataPath, dataDesc, ImageFormat, scaleValue); + } else if (string_end_with(dataPath, ".txt")) { + data = load_txt(dataPath, dataDesc); + } else { + UNI_ERROR_LOG("can not load jpeg data %s\n", dataPath.c_str()); + } + (*datas).push_back(data); + dataPaths.push_back(dataPath); + } + return dataPaths; +} + +std::vector load_bin_with_type(std::string directoryPath, + std::vector dataDesc, + std::vector> *datas, + std::vector sourceDataType) +{ + std::vector dataPaths; + if (directoryPath == "") { + std::vector data = load_fake_data(dataDesc); + (*datas).push_back(data); + dataPaths.push_back("fake data"); + return dataPaths; + } + + std::vector paths; + get_files(directoryPath, paths); + std::vector data; + for (U32 i = 0; i < paths.size(); i++) { + std::string dataPath = paths[i]; + if (string_end_with(dataPath, ".bin")) { + data = load_bin(dataPath, sourceDataType, dataDesc); + (*datas).push_back(data); + dataPaths.push_back(dataPath); + } + } + return dataPaths; +} +#endif diff --git a/inference/engine/src/result_format.cpp b/inference/engine/src/result_format.cpp new file mode 100644 index 00000000..f214a6ad --- /dev/null +++ b/inference/engine/src/result_format.cpp @@ -0,0 +1,49 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "result_format.hpp" + +std::vector topK_index(U8 *res, TensorDesc desc, U32 topK) +{ + U32 len = tensorNumElements(desc); + std::vector index(len); + for (U32 i = 0; i < index.size(); i++) { + index[i] = i; + } + + switch (desc.dt) { +#ifdef __aarch64__ + case DT_F16: { + F16 *dataPtr = (F16 *)res; + sort(index.begin(), index.end(), + [&](const int &a, const int &b) { return (dataPtr[a] > dataPtr[b]); }); + break; + } +#endif + case DT_F32: { + F32 *dataPtr = (F32 *)res; + sort(index.begin(), index.end(), + [&](const int &a, const int &b) { return (dataPtr[a] > dataPtr[b]); }); + break; + } + default: + break; + } + + std::vector::const_iterator first = index.begin() + 0; + std::vector::const_iterator last = index.begin() + topK; + std::vector indexesTopK(first, last); + + return indexesTopK; +} diff --git a/inference/engine/tools/CMakeLists.txt b/inference/engine/tools/CMakeLists.txt new file mode 100644 index 00000000..c392e964 --- /dev/null +++ b/inference/engine/tools/CMakeLists.txt @@ -0,0 +1,25 @@ +cmake_minimum_required(VERSION 3.2) + +set_test_c_cxx_flags() + +if (BUILD_TEST) + engine_test(common_algo_search ./common_algo_search/common_algo_search.cpp) + install(TARGETS common_algo_search + RUNTIME DESTINATION tools) +endif (BUILD_TEST) +if (BUILD_TEST AND USE_INT8) + engine_test(ptq_calibration ./ptq_calibration/ptq_calibration.cpp) + install(TARGETS ptq_calibration + RUNTIME DESTINATION tools) +endif (BUILD_TEST AND USE_INT8) +if (USE_MALI) + engine_test(preprocess_ocl ./preprocess_ocl/preprocess_ocl.cpp) + install(TARGETS preprocess_ocl + RUNTIME DESTINATION tools) +endif (USE_MALI) +if (USE_TRAINING) + train_test(model_finetuner ./model_finetuner/model_finetuner.cpp) + TARGET_LINK_LIBRARIES(model_finetuner RaulLib) + install(TARGETS model_finetuner + RUNTIME DESTINATION tools) +endif (USE_TRAINING) diff --git a/inference/engine/tools/common_algo_search/common_algo_search.cpp b/inference/engine/tools/common_algo_search/common_algo_search.cpp new file mode 100644 index 00000000..50e1b01a --- /dev/null +++ b/inference/engine/tools/common_algo_search/common_algo_search.cpp @@ -0,0 +1,128 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "ut_util.h" +#include "tensor_computing.h" +#include "algorithm_map.h" +#include "parse_command.h" + +int convolutionCPUFloatAlgorithmSearch(Arch arch, DataType dt, std::string path) +{ + TensorDesc inputDesc, filterDesc, outputDesc; + ConvolutionPolicy policy = CONVOLUTION_TUNNING; + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_RELU; + activationDesc.value[0] = 0; + ConvolutionParamSpec convParamSpec; + convParamSpec.dilatedRate_h = 1; + convParamSpec.dilatedRate_w = 1; + U32 in = 1; + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + U32 ic_step, ihw_step, fn_step, ic_max, ihw_max, fn_max; + std::set fwh; + std::set stride; + std::string modelName = ""; + std::string deviceName = ""; + AlgorithmMap *algoMap = new AlgorithmMap(arch, modelName, deviceName, dt); + algoMap->getCommonAlgoMapPara( + &ic_step, &ihw_step, &fn_step, &ic_max, &ihw_max, &fn_max, &fwh, &stride); + for (auto sv : stride) { + for (auto fv : fwh) { + U32 pl = fv / 2; + U32 pr = (fv - 1) / 2; + U32 pt = fv / 2; + U32 pb = (fv - 1) / 2; + for (U32 fn = fn_step; fn <= fn_max; fn += fn_step) { + for (U32 ic = ic_step; ic <= ic_max; ic += ic_step) { + for (U32 ih = ihw_step; ih <= ihw_max; ih += ihw_step) { + for (U32 iw = ihw_step; iw <= ihw_max; iw += ihw_step) { + if (ic % 8 != 0) { + inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, ih); + } else { + inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, ih); + } + convParamSpec.stride_h = sv; + convParamSpec.stride_w = sv; + convParamSpec.padding_left = pl; + convParamSpec.padding_right = pr; + convParamSpec.padding_top = pt; + convParamSpec.padding_bottom = pb; + filterDesc = tensor4df(dt, DF_NCHW, fn, ic, fv, fv); + Tensor inputTensor; + Tensor outputTensor; + Tensor filterTensor; + inputTensor.resize(inputDesc); + outputTensor.resize(outputDesc); + filterTensor.resize(filterDesc); + CHECK_STATUS(convolution_infer_output_size(&inputTensor, filterTensor, + convParamSpec, &outputTensor, dt, &archInfo)); + ConvolutionForwardAlgorithm algorithm = CONVOLUTION_ALGORITHM_NULL; + CHECK_STATUS(convolution_infer_forward_algorithm(inputTensor, + filterTensor, outputTensor, convParamSpec, policy, &algorithm, dt, + activationDesc, &archInfo)); + algoMap->setCommonAlgoInfoToMap(OT_Conv, dt, ic, ih, iw, fn, fv, fv, sv, + sv, (I32 *)(&algorithm), 1); + } + } + } + } + } + } + algoMap->saveAlgorithmMapToText(path); + delete algoMap; + return 0; +} + +int main(int argc, char *argv[]) +{ + std::string affinityPolicyName = "CPU_AFFINITY_HIGH_PERFORMANCE"; + std::string algorithmMapPath = "./"; + ParseRes parse_res; + parseCommandLine(argc, argv, &parse_res, "examples"); + if (parse_res.archInfo.second) { + affinityPolicyName = parse_res.archInfo.first; + } + if (parse_res.algoPath.second) { + algorithmMapPath = parse_res.algoPath.first; + } + AffinityPolicy affinityPolicy = thread_affinity_get_policy_by_name(affinityPolicyName.c_str()); + + if (affinityPolicyName == "CPU_AFFINITY_HIGH_PERFORMANCE" || + affinityPolicyName == "CPU_AFFINITY_LOW_POWER") { + Arch arch; +#ifndef _USE_IOS + DeviceInfo deviceInfo = get_cpu_info(affinityPolicy); + set_cpu_dynamic(&deviceInfo, 0); + arch = deviceInfo.schedule; +#else + arch = ARM_A76; +#endif +#ifdef _USE_FP16 + convolutionCPUFloatAlgorithmSearch(arch, 
DT_F16, algorithmMapPath); + +#endif +#ifdef _USE_FP32 + convolutionCPUFloatAlgorithmSearch(arch, DT_F32, algorithmMapPath); +#endif + } else if (affinityPolicyName == "GPU") { + UNI_ERROR_LOG("Unsupport GPU now\n"); + exit(-1); + } else { + UNI_ERROR_LOG("Unknow archInfo %s, please use " + "CPU_AFFINITY_HIGH_PERFORMANCE/CPU_AFFINITY_LOW_POWER/GPU\n", + affinityPolicyName.c_str()); + exit(-1); + } + return 0; +} diff --git a/inference/engine/tools/preprocess_ocl/CMakeLists.txt b/inference/engine/tools/preprocess_ocl/CMakeLists.txt new file mode 100644 index 00000000..329d60b5 --- /dev/null +++ b/inference/engine/tools/preprocess_ocl/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.2) + +project(kernelbin) + +include_directories(${PROJECT_SOURCE_DIR}/include) +include_directories(${PROJECT_SOURCE_DIR}/extern) + +file(GLOB_RECURSE kernel_bin_src_list cpp/*.cpp) + +add_library(${PROJECT_NAME} SHARED ${kernel_bin_src_list}) diff --git a/inference/engine/tools/preprocess_ocl/build_preprocess_ocl.sh b/inference/engine/tools/preprocess_ocl/build_preprocess_ocl.sh new file mode 100644 index 00000000..290cb9d1 --- /dev/null +++ b/inference/engine/tools/preprocess_ocl/build_preprocess_ocl.sh @@ -0,0 +1,114 @@ +#Ensure your target device is connected with host by adb +#Set your target devices number here +device=GCL5T19822000030 + +#Set your preprocess_ocl program file location of host +preprocess_ocl=${BOLT_ROOT}/install_arm_llvm/tools/preprocess_ocl + +#Set your bolt models location on device, put all your bolt models need to preprocess here +device_bolt_models=/data/local/tmp/preprocess_bolt_models + +#Set your work location on device, make sure it is read-write avaliable, sh will build filefolds automatically +device_work_local=/data/local/tmp/preprocess +device_algo_files=${device_work_local}/algoFiles +device_include=${device_work_local}/include +device_cpp=${device_work_local}/cpp + +host_work_local=$(pwd) +host_algo_files=${host_work_local}/algoFiles +host_include=${host_work_local}/include +host_cpp=${host_work_local}/cpp +host_extern=${host_work_local}/extern +host_lib=${host_work_local}/lib +host_build=${host_work_local}/build +rm -rf ${host_algo_files} ${host_include} ${host_cpp} +mkdir ${host_algo_files} ${host_include} ${host_cpp} + + +adb -s ${device} shell "rm -rf ${device_work_local}" +adb -s ${device} shell "mkdir ${device_work_local}" +adb -s ${device} shell "mkdir ${device_work_local}/lib" +adb -s ${device} shell "mkdir ${device_algo_files}" +adb -s ${device} shell "mkdir ${device_include}" +adb -s ${device} shell "mkdir ${device_cpp}" + +adb -s ${device} push ${preprocess_ocl} ${device_work_local} > /dev/null || exit 1 +for file in `ls ${BOLT_ROOT}/install_arm_llvm/lib/*.so` + do + adb -s ${device} push ${file} ${device_work_local}/lib > /dev/null || exit 1 + done + +echo "Running GPU preprocess on device ${device}" +adb -s ${device} shell "cd ${device_work_local} && chmod +x preprocess_ocl && export LD_LIBRARY_PATH=./lib && ./preprocess_ocl ${device_bolt_models} ${device_algo_files} ${device_include} ${device_cpp}" +echo "Finish GPU preprocess on device ${device}" + +echo "Aquire algoFiles and kernelBins from device ${device}" +adb -s ${device} pull ${device_algo_files} ${host_algo_files} > /dev/null +adb -s ${device} pull ${device_include} ${host_include} > /dev/null +adb -s ${device} pull ${device_cpp} ${host_cpp} > /dev/null + +echo "build kernel bin .so on host" +if [ -d ${host_algo_files}/algoFiles ]; then + mv ${host_algo_files}/algoFiles/* 
${host_algo_files} + rm -rf ${host_algo_files}/algoFiles +fi + +if [ -d ${host_include}/include ]; then + mv ${host_include}/include/* ${host_include} + rm -rf ${host_include}/include +fi + +if [ -d ${host_cpp}/cpp ]; then + mv ${host_cpp}/cpp/* ${host_cpp} + rm -rf ${host_cpp}/cpp +fi + +rm -rf ${host_extern} +mkdir ${host_extern} +cp ${BOLT_ROOT}/common/gcl/include/gcl_kernel_type.h ${host_extern} +cp ${BOLT_ROOT}/common/gcl/include/gcl_kernel_binmap.h ${host_extern} + +cpp_files_name=$(ls ${host_cpp}) +lib_name=libkernelbin +for p in ${cpp_files_name[@]} +do + postfix=${p##*.} + if [ ${postfix} = "h" ]; then + lib_name=${p%.*} + lib_name=${lib_name#inline_} + fi +done + +lib_name=${lib_name%.*} + +rm -rf ${host_build} +mkdir ${host_build} +cd ${host_build} +cmake .. -DCMAKE_C_COMPILER=`which aarch64-linux-android21-clang` \ + -DCMAKE_CXX_COMPILER=`which aarch64-linux-android21-clang++` \ + -DCMAKE_STRIP=`which aarch64-linux-android-strip` +make -j33 + +cd ${host_work_local} +rm -rf ${host_lib} +mkdir ${host_lib} +#mv ${host_build}/libkernelbin.so ${host_lib}/lib${lib_name}_map.so + +allSrcs=`find ${host_build} -name "*.o" -printf "%P\n"` +for file in ${allSrcs} +do + sharedSrcs="${sharedSrcs} ${host_build}/${file}" +done +CXX=aarch64-linux-android21-clang++ +${CXX} -shared -o ${host_lib}/lib${lib_name}_map.so ${sharedSrcs} \ + -L${BOLT_ROOT}/third_party/arm_llvm/opencl/lib64 -lOpenCL -Wl,-soname,lib${lib_name}_map.so + +cd ${host_lib} +STRIP=aarch64-linux-android-strip +${STRIP} lib${lib_name}_map.so + +cd ${host_work_local} +rm -rf ${host_cpp} ${host_extern} ${host_build} ${host_include} +echo "Preprocess finish" +echo "Check algofiles in path ${host_algo_files}" +echo "Check lib${lib_name}_map.so in path ${host_lib}" diff --git a/inference/engine/tools/preprocess_ocl/preprocess_ocl.cpp b/inference/engine/tools/preprocess_ocl/preprocess_ocl.cpp new file mode 100644 index 00000000..c7a4b080 --- /dev/null +++ b/inference/engine/tools/preprocess_ocl/preprocess_ocl.cpp @@ -0,0 +1,252 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
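Editorial preview (not part of this diff): the preprocess_ocl tool below runs each bolt model on the target GPU, records every OpenCL kernel it compiles, and emits C++ sources that bake the kernel binaries into a standalone library loaded through create_<device>_kernelbin_map. The generated registration unit is shaped roughly like this sketch; the device name "mali_g76" and the kernel "eltwise" are hypothetical.

#include "gcl_kernel_binmap.h"

// Per-kernel symbols emitted into <device>_kernel_bin.cpp by buildKernelBinFiles().
extern const unsigned int mali_g76_eltwise_len;
extern const unsigned char mali_g76_eltwise[];

class mali_g76 : public gcl_kernel_binmap {
public:
    mali_g76()
    {
        loadKernelBin();
    }
    void loadKernelBin()
    {
        put("mali_g76_eltwise", {mali_g76_eltwise, mali_g76_eltwise_len});
    }
};

extern "C" gcl_kernel_binmap *create_mali_g76_kernelbin_map()
{
    return new mali_g76();
}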
+ +#include +#include "inference.hpp" +#include "tensor.hpp" +#include "result_format.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include "types.h" +#include "error.h" + +#ifdef _USE_FP16 +inline std::vector buildModelsNameArray(std::string path, std::string postfix) +{ + struct dirent *dirTp; + DIR *handle = opendir(path.c_str()); + std::vector names; + if (handle != NULL) { + while ((dirTp = readdir(handle)) != NULL) { + std::string modelName = dirTp->d_name; + U32 len = modelName.size(); + U32 postfix_len = postfix.size(); + if (len > postfix_len) { + if (modelName.substr(len - postfix_len) == postfix) { + modelName = path + modelName; + names.push_back(modelName); + } + } + } + } else { + UNI_ERROR_LOG("opendir %s failed\n", path.c_str()); + } + closedir(handle); + return names; +} + +inline void write_to_file(std::string str, std::string path, std::string name) +{ + std::string fileName = path + name; + std::ofstream file(fileName.c_str()); + if (file.is_open()) { + file << str.c_str(); + file.close(); + } else { + UNI_ERROR_LOG("fail to write file %s\n", fileName.c_str()); + } +} + +inline void runBoltModel(CI8 *modelPath, CI8 *algoPath, std::vector *kernelNames) +{ + if (!strstr(modelPath, "f16.bolt")) { + UNI_ERROR_LOG("Bolt gpu only support F16(_f16.bolt) now\n"); + UNI_ERROR_LOG("Ensure your model is xxxx_f16.bolt\n"); + exit(1); + } + + UNI_INFO_LOG("Building algofile and used kernelNames for %s\n", modelPath); + auto cnn = createPipeline("GPU", modelPath, algoPath); + std::vector inputDescs = cnn->get_model_input_tensor_descs(); + U8 **input_ptrs = new U8 *[inputDescs.size()]; + for (U32 i = 0; i < inputDescs.size(); i++) { + U32 size = tensorNumBytes(inputDescs[i]); + input_ptrs[i] = new U8[size]; + } + + std::vector inputNames = cnn->get_model_input_tensor_names(); + for (U32 i = 0; i < inputNames.size(); i++) { + cnn->copy_to_named_input(inputNames[i], input_ptrs[i]); + } + + std::map> outMap; + cnn->run(); + outMap = cnn->get_outputs(); + cnn->saveAlgorithmMapToText(algoPath); + GCLHandle_t handle = OCLContext::getInstance().handle.get(); + for (auto p : handle->kernelMap) { + std::string device_name = handle->deviceName; + std::string kernelName = p.first; + kernelName.erase(0, device_name.size() + 1); + if (find((*kernelNames).begin(), (*kernelNames).end(), kernelName) == (*kernelNames).end()) { + (*kernelNames).push_back(kernelName); + } + } + for (auto p : handle->programMap) { + std::string kernelName = p.first; + if (find((*kernelNames).begin(), (*kernelNames).end(), kernelName) == (*kernelNames).end()) { + (*kernelNames).push_back(kernelName); + } + } + + for (U32 i = 0; i < inputDescs.size(); i++) { + delete[] input_ptrs[i]; + } + delete[] input_ptrs; + CHECK_STATUS(gcl_finish(handle)); +} + +inline void buildKernelBinFiles( + std::vector kernelNames, std::string includePath, std::string cppPath) +{ + GCLHandle_t handle; + CHECK_STATUS(gcl_create_handle(&handle)); + std::string device_name = handle->deviceName; + std::string device_name_up = device_name; + std::transform(device_name_up.begin(), device_name_up.end(), device_name_up.begin(), ::toupper); + + std::string inline_kernel_bin_head; + std::string inline_kernel_bin_head_name; + inline_kernel_bin_head_name = "inline_" + device_name + ".h"; + inline_kernel_bin_head = "#ifndef _INLINE_" + device_name_up + "_H\n"; + inline_kernel_bin_head += "#define _INLINE_" + device_name_up + "_H\n"; + + std::string device_map_head; + std::string device_map_head_name; + 
+inline void buildKernelBinFiles(
+    std::vector<std::string> kernelNames, std::string includePath, std::string cppPath)
+{
+    GCLHandle_t handle;
+    CHECK_STATUS(gcl_create_handle(&handle));
+    std::string device_name = handle->deviceName;
+    std::string device_name_up = device_name;
+    std::transform(device_name_up.begin(), device_name_up.end(), device_name_up.begin(), ::toupper);
+
+    std::string inline_kernel_bin_head;
+    std::string inline_kernel_bin_head_name;
+    inline_kernel_bin_head_name = "inline_" + device_name + ".h";
+    inline_kernel_bin_head = "#ifndef _INLINE_" + device_name_up + "_H\n";
+    inline_kernel_bin_head += "#define _INLINE_" + device_name_up + "_H\n";
+
+    std::string device_map_head;
+    std::string device_map_head_name;
+    device_map_head_name = device_name + "_map.h";
+    device_map_head = "#ifndef " + device_name_up + "_MAP_H\n";
+    device_map_head += "#define " + device_name_up + "_MAP_H\n";
+    device_map_head += "extern \"C\" {\n";
+    device_map_head += "    gcl_kernel_binmap* create_" + device_name + "_kernelbin_map();\n";
+    device_map_head += "}\n";
+    device_map_head += "#endif";
+    write_to_file(device_map_head, includePath, device_map_head_name);
+
+    std::string device_map;
+    std::string device_map_name;
+    device_map_name = device_name + "_map.cpp";
+    device_map = "#include \"gcl_kernel_binmap.h\"\n";
+    device_map += "#include\"" + device_map_head_name + "\"\n";
+    device_map += "#include\"" + inline_kernel_bin_head_name + "\"\n";
+    device_map += "class " + device_name + " : public gcl_kernel_binmap {\n";
+    device_map += "public:\n";
+    device_map += "    " + device_name + "() {\n";
+    device_map += "        loadKernelBin();\n";
+    device_map += "    }\n";
+    device_map += "    void loadKernelBin();\n";
+    device_map += "};\n";
+    device_map += "void " + device_name + "::loadKernelBin() {\n";
+
+    std::string device_kernel_bin;
+    std::string device_kernel_bin_name;
+    device_kernel_bin_name = device_name + "_kernel_bin.cpp";
+    device_kernel_bin = "#include\"" + inline_kernel_bin_head_name + "\"\n";
+
+    for (auto p : kernelNames) {
+        Kernel kernel;
+        U8 *binary;
+        U32 len;
+        CHECK_STATUS(gcl_create_kernel(handle, p.c_str(), &kernel));
+        Program program = handle->programMap[p];
+        CHECK_STATUS(gcl_get_program_info(program, &binary, &len));
+        std::string func = device_name + "_" + p;
+        inline_kernel_bin_head += "extern const unsigned int " + func + "_len;\n";
+        inline_kernel_bin_head += "extern const unsigned char " + func + "[];\n";
+        device_map += "    put(\"" + func + "\", " + "{" + func + ", " + func + "_len});\n";
+        device_kernel_bin += "const unsigned int " + func + "_len = " + std::to_string(len) + ";\n";
+        device_kernel_bin += "const unsigned char " + func + "[] = " + "{";
+        for (U32 i = 0; i < len; i++) {
+            // "0xXX" plus the terminator needs 5 bytes; snprintf stays in bounds
+            char tempstr[8];
+            if (i % 20 == 0) {
+                device_kernel_bin += "\n";
+            }
+            snprintf(tempstr, sizeof(tempstr), "0x%02x", binary[i]);
+            device_kernel_bin += std::string(tempstr);
+            if (i != len - 1) {
+                device_kernel_bin += ", ";
+            } else {
+                device_kernel_bin += "};\n";
+            }
+        }
+        CHECK_STATUS(release_kernel(kernel));
+    }
+    inline_kernel_bin_head += "#endif";
+    device_map += "}\n";
+    device_map += "gcl_kernel_binmap* create_" + device_name + "_kernelbin_map(){\n";
+    device_map += "    " + device_name + "* kernelbin = new " + device_name + "();\n";
+    device_map += "    return (gcl_kernel_binmap*) kernelbin;\n";
+    device_map += "}";
+    write_to_file(inline_kernel_bin_head, cppPath, inline_kernel_bin_head_name);
+    write_to_file(device_map, cppPath, device_map_name);
+    write_to_file(device_kernel_bin, cppPath, device_kernel_bin_name);
+    gcl_destroy_handle(handle);
+}
+#endif
+
+int main(int argc, char *argv[])
+{
+#ifdef _USE_FP16
+    if (argc != 5) {
+        UNI_INFO_LOG("Please set your models path, and put your bolt models into it\n");
+        UNI_INFO_LOG("Please set your algosPath to save produced algo files, and ensure it is "
+                     "clean\n");
+        UNI_INFO_LOG("Please set your include path to save ocl kernelBin headFile, and ensure it "
+                     "is clean\n");
+        UNI_INFO_LOG("Please set your cpp path to save ocl kernelBin cpp, and ensure it is "
+                     "clean\n");
+        UNI_INFO_LOG("For example: ./preprocess_ocl ./boltModels/ ./algoFiles/ ./include/ "
+                     "./cpp/\n");
+        exit(1);
+    }
+    // Append a trailing '/' where needed; comparing the last character directly avoids
+    // the original strcmp on a single, non-NUL-terminated char.
+    std::string modelsPath = (CI8 *)argv[1];
+    if (modelsPath[modelsPath.length() - 1] != '/') {
+        modelsPath += "/";
+    }
+
+    std::string algoPath = (CI8 *)argv[2];
+    if (algoPath[algoPath.length() - 1] != '/') {
+        algoPath += "/";
+    }
+
+    std::string includePath = (CI8 *)argv[3];
+    if (includePath[includePath.length() - 1] != '/') {
+        includePath += "/";
+    }
+
+    std::string cppPath = (CI8 *)argv[4];
+    if (cppPath[cppPath.length() - 1] != '/') {
+        cppPath += "/";
+    }
+
+    std::vector<std::string> modelsNameArray;
+    modelsNameArray = buildModelsNameArray(modelsPath, ".bolt");
+    std::vector<std::string> kernelNames;
+    for (auto name : modelsNameArray) {
+        runBoltModel(name.c_str(), algoPath.c_str(), &kernelNames);
+    }
+
+    buildKernelBinFiles(kernelNames, includePath, cppPath);
+#endif
+    return 0;
+}
diff --git a/inference/engine/tools/ptq_calibration/ptq_calibration.cpp b/inference/engine/tools/ptq_calibration/ptq_calibration.cpp
new file mode 100644
index 00000000..da0fc546
--- /dev/null
+++ b/inference/engine/tools/ptq_calibration/ptq_calibration.cpp
@@ -0,0 +1,443 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <iostream>
+#include <cstring>
+#include <cstdlib>
+#include "inference.hpp"
+#include "tensor.hpp"
+#include "data_loader.hpp"
+#include "result_format.hpp"
+#include "profiling.h"
+#include "tensor_computing.h"
+#include "model_print.h"
+#ifdef _USE_FP16
+#include "../../../../compute/tensor/src/cpu/arm/fp16/arm_functions_fp16.h"
+#endif
+#ifdef _USE_FP32
+#include "../../../../compute/tensor/src/cpu/arm/fp32/arm_functions_fp32.h"
+#endif
+
+#define BINS 2048
+#define NUM_IMAGES_INPUT 100
+
+void print_help(char *argv[])
+{
+    std::cout << "usage: " << argv[0]
+              << " modelPath dataDirectory dataFormat scaleValue affinityPolicyName "
+                 "algorithmMapPath"
+              << std::endl;
+}
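+
+// Calibration flow: the same model is deserialized twice, once as an int8 pipeline and
+// once as an fp16 reference. For each op that needs a dynamic scale, activations are
+// accumulated into a BINS-bin histogram over batches of NUM_IMAGES_INPUT calibration
+// images, and a per-tensor scale is chosen with the KL-divergence method
+// (compute_scale_with_KL). The scales are stored into a copy of the model, which is
+// serialized as <model>_KL.bolt at the end of main().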
+
+int main(int argc, char *argv[])
+{
+#ifdef _USE_FP16
+    UNI_TIME_INIT
+
+    char *modelPath = (char *)"";
+    char *dataDir = (char *)"";
+    char *affinityPolicyName = (char *)"";
+    char *algorithmMapPath = (char *)"";
+    ImageFormat imageFormat = RGB;
+    F32 scaleValue = 1;
+    if (argc < 5) {
+        print_help(argv);
+        return 1;
+    }
+    modelPath = argv[1];
+    dataDir = argv[2];
+
+    imageFormat = (std::string(argv[3]) == std::string("BGR") ? BGR : RGB);
+    if (std::string(argv[3]) == std::string("RGB_SC")) {
+        imageFormat = RGB_SC;
+    } else if (std::string(argv[3]) == std::string("BGR_SC_RAW")) {
+        imageFormat = BGR_SC_RAW;
+    } else if (std::string(argv[3]) == std::string("RGB_SC_RAW")) {
+        imageFormat = RGB_SC_RAW;
+    }
+
+    scaleValue = atof(argv[4]);
+
+    if (argc > 5) {
+        affinityPolicyName = argv[5];
+    }
+
+    if (argc > 6) {
+        algorithmMapPath = argv[6];
+    }
+
+    ModelSpec int8Ms;
+    CHECK_STATUS(deserialize_model_from_file(modelPath, &int8Ms));
+    CHECK_REQUIREMENT(DT_F16_8Q == int8Ms.dt || DT_F16 == int8Ms.dt);
+    int8Ms.dt = DT_F16_8Q;
+
+    ModelSpec f16Ms;
+    CHECK_STATUS(deserialize_model_from_file(modelPath, &f16Ms));
+    f16Ms.dt = DT_F16;
+
+    ModelSpec resultMs;
+    CHECK_STATUS(deserialize_model_from_file(modelPath, &resultMs));
+    resultMs.dt = DT_F16_8Q;
+
+    auto relationNum = resultMs.num_op_tensor_entries;
+    auto relationPtr = resultMs.op_relationship_entries;
+    resultMs.num_op_tensor_entries = 0;
+    resultMs.op_relationship_entries = nullptr;
+
+    auto int8CNN = createPipelinefromMs(affinityPolicyName, &int8Ms, algorithmMapPath);
+    auto f16CNN = createPipelinefromMs(affinityPolicyName, &f16Ms, algorithmMapPath);
+
+    // load images
+    std::map<std::string, std::shared_ptr<Tensor>> inMap = int8CNN->get_inputs();
+    TensorDesc imageDesc = (*(inMap.begin()->second)).get_desc();
+    std::vector<TensorDesc> imageDescs;
+    imageDescs.push_back(imageDesc);
+    std::vector<std::vector<Tensor>> images;
+    std::vector<std::string> imagePaths =
+        load_image_with_scale(dataDir, imageDescs, &images, imageFormat, scaleValue);
+
+    std::cout << "[Calibration]:" << std::endl;
+
+    std::vector<U8> dBuf;
+    //std::vector<U8> qBuf;
+    std::vector<U32> calibratedOpIdx;
+
+    auto curModelInputTensorNames = int8CNN->get_model_input_tensor_names();
+    for (int index = 0; index < (int)curModelInputTensorNames.size(); index++) {
+        int8CNN->copy_to_named_input(curModelInputTensorNames[index],
+            (U8 *)((CpuMemory *)images[0][index].get_memory())->get_ptr());
+    }
+
+    U32 opIdx = int8CNN->find_next_dynamic_scale_op(calibratedOpIdx, 0);
+    std::map<std::string, std::vector<F32>> tensorScale;
+
+    while (0 != opIdx) {
+        auto op = int8CNN->get_operator_by_index(opIdx);
+        std::string opName = op->get_name();
+        std::cout << "Calibrating OP " << opIdx << ": " << opName << std::endl;
+        std::string opsName = int8Ms.ops[opIdx].name;
+        CHECK_REQUIREMENT(opName == opsName);
+
+        std::vector<std::vector<F32>> scales;
+        auto inputTensors = op->get_input_tensors();
+        auto outputTensors = op->get_output_tensors();
+        std::cout << "  Inputs:\n";
+
+        for (U32 i = 0; i < int8Ms.ops[opIdx].num_inputs; i++) {
+            std::string tensorName = int8Ms.ops[opIdx].input_tensors_name[i];
+            TensorDesc inDesc = inputTensors[i].get_desc();
+
+            auto it = tensorScale.find(tensorName);
+            if (it != tensorScale.end()) {
+                scales.push_back(tensorScale[tensorName]);
+                std::cout << "    InputTensor " << i << " " << tensorName << " inherits scale "
+                          << tensorScale[tensorName][0] << std::endl;
+                continue;
+            }
+
+            if (DT_I8 == inDesc.dt) {  // Gets scale from int8 pooling or concat. Label with -1
+                std::vector<F32> scale;
+                scale.push_back(-1);
+                scales.push_back(scale);
+                tensorScale[tensorName] = scale;
+                std::cout << "    InputTensor " << i << " " << tensorName
+                          << " inherits transformed scale " << std::endl;
+                continue;
+            }
+
+            U32 dBytes = tensorNumBytes(inDesc);
+            dBuf.resize(dBytes * NUM_IMAGES_INPUT);
+            U8 *d = dBuf.data();
+            std::vector<F32> histogram;
+            F32 last_max = 0;
+            F32 interval = 0;
+
+            for (U32 j = 0; j < images.size(); j++) {
+                for (int index = 0; index < (int)curModelInputTensorNames.size(); index++) {
+                    int8CNN->copy_to_named_input(curModelInputTensorNames[index],
+                        (U8 *)((CpuMemory *)images[j][index].get_memory())->get_ptr());
+                }
+
+                int8CNN->run_till_breakpoint(opIdx);
+                memcpy(d, ((CpuMemory *)(inputTensors[i].get_memory()))->get_ptr(), dBytes);
+                d += dBytes;
+
+                if ((j != images.size() - 1) && ((j + 1) % NUM_IMAGES_INPUT != 0)) {
+                    continue;
+                }
+
+                if (j == NUM_IMAGES_INPUT - 1 ||
+                    ((j == images.size() - 1) && (j < NUM_IMAGES_INPUT - 1))) {
+                    UNI_DEBUG_LOG("---------- start getting 1 - %u images input tensors "
+                                  "----------\n",
+                        j + 1);
+                    F16 *ptr_d = (F16 *)dBuf.data();
+                    F32 max = array_maxabs_f16(ptr_d, (I32)(tensorNumElements(inDesc) * (j + 1)));
+                    UNI_DEBUG_LOG("  %f is the maximum value\n", max);
+                    interval = max / BINS;
+                    histogram.resize(BINS, 0.00001f);
+                    //update histogram first time
+                    update_histogram(tensorNumElements(inDesc) * (j + 1), ptr_d, BINS, interval,
+                        histogram.data());
+                    last_max = max;
+                    d = dBuf.data();
+                    dBuf.clear();
+                    continue;
+                }
+
+                if ((j + 1) % NUM_IMAGES_INPUT == 0 && j != (NUM_IMAGES_INPUT - 1)) {
+                    UNI_DEBUG_LOG("---------- start getting %d - %u images input tensors "
+                                  "----------\n",
+                        j + 1 - NUM_IMAGES_INPUT, j + 1);
+                    F16 *ptr_d = (F16 *)dBuf.data();
+                    F32 max = array_maxabs_f16(
+                        ptr_d, (I32)(tensorNumElements(inDesc) * NUM_IMAGES_INPUT));
+                    if (max <= last_max) {
+                        UNI_DEBUG_LOG("  %f is the maximum value\n", last_max);
+                        interval = last_max / BINS;
+                        //update histogram if no new max
+                        update_histogram(tensorNumElements(inDesc) * NUM_IMAGES_INPUT, ptr_d, BINS,
+                            interval, histogram.data());
+                    } else {
+                        UNI_DEBUG_LOG("  %f is the maximum value\n", max);
+                        interval = max / BINS;
+                        F32 numPerBin = (F32)max / last_max;
+                        //last_max = max; -> may optimize accuracy
+                        //rebin the old histogram to the wider interval before adding new samples
+                        histogram = compress_histogram(histogram, numPerBin, last_max);
+                        last_max = max;
+                        update_histogram((tensorNumElements(inDesc) * NUM_IMAGES_INPUT), ptr_d,
+                            BINS, interval, histogram.data());
+                    }
+                    d = dBuf.data();
+                    dBuf.clear();
+                    continue;
+                }
+
+                if ((j == images.size() - 1) && ((j + 1) % NUM_IMAGES_INPUT != 0)) {
+                    UNI_DEBUG_LOG("---------- start getting %d - %u images input tensors "
+                                  "----------\n",
+                        j + 1 - ((j + 1) % NUM_IMAGES_INPUT), j + 1);
+                    dBuf.resize(dBytes * ((j + 1) % NUM_IMAGES_INPUT));
+                    F16 *ptr_d = (F16 *)dBuf.data();
+                    F32 max = array_maxabs_f16(
+                        ptr_d, (I32)(tensorNumElements(inDesc) * ((j + 1) % NUM_IMAGES_INPUT)));
+                    if (max <= last_max) {
+                        UNI_DEBUG_LOG("  %f is the maximum value\n", last_max);
+                        interval = last_max / BINS;
+                        //update histogram if no new max
+                        update_histogram(tensorNumElements(inDesc) * ((j + 1) % NUM_IMAGES_INPUT),
+                            ptr_d, BINS, interval, histogram.data());
+                    } else {
+                        UNI_DEBUG_LOG("  %f is the maximum value\n", max);
+                        interval = max / BINS;
+                        F32 numPerBin = (F32)max / last_max;
+                        //last_max = max; -> may optimize accuracy
+                        histogram = compress_histogram(histogram, numPerBin, last_max);
+                        last_max = max;
+                        //only the residual (j + 1) % NUM_IMAGES_INPUT samples are buffered here,
+                        //so update with that count (the output-tensor loop below already does so)
+                        update_histogram(tensorNumElements(inDesc) * ((j + 1) % NUM_IMAGES_INPUT),
+                            ptr_d, BINS, interval, histogram.data());
+                    }
+                    d = dBuf.data();
+                    dBuf.clear();
+                    continue;
+                }
+            }
+
+            UNI_DEBUG_LOG("---------- compute KL ----------\n");
+            std::vector<F32> scale = compute_scale_with_KL(histogram, interval);
+            UNI_DEBUG_LOG("---------- finish compute KL ---------\n");
+            scales.push_back(scale);
+            tensorScale[tensorName] = scale;
+            UNI_DEBUG_LOG("    InputTensor %u %s gets scale %f\n", i, tensorName.c_str(),
+                tensorScale[tensorName][0]);
+        }
+
+        op->set_feature_scale(scales);
+        UNI_DEBUG_LOG("  Outputs:\n");
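+
+        // Output tensors are read from the fp16 reference pipeline: on the int8 pipeline
+        // these outputs are already quantized, so the histogram has to be built from the
+        // pre-quantization fp16 values.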
+        for (U32 i = 0; i < int8Ms.ops[opIdx].num_outputs; i++) {
+            std::string tensorName = int8Ms.ops[opIdx].output_tensors_name[i];
+            TensorDesc desc = outputTensors[i].get_desc();
+
+            auto it = tensorScale.find(tensorName);
+            CHECK_REQUIREMENT(it == tensorScale.end());
+
+            if (DT_F16 == desc.dt) {
+                continue;
+            }
+
+            CHECK_REQUIREMENT(DT_I8 == desc.dt);
+
+            auto opF16 = f16CNN->get_operator_by_index(opIdx);
+            auto outputs = opF16->get_output_tensors();
+
+            TensorDesc outDesc = outputs[i].get_desc();
+            U32 dBytes = tensorNumBytes(outDesc);
+            dBuf.resize(dBytes * NUM_IMAGES_INPUT);
+            std::vector<F32> histogram;
+            F32 last_max = 0;
+            F32 interval = 0;
+
+            U8 *d = dBuf.data();
+
+            for (U32 j = 0; j < images.size(); j++) {
+                for (int index = 0; index < (int)curModelInputTensorNames.size(); index++) {
+                    f16CNN->copy_to_named_input(curModelInputTensorNames[index],
+                        (U8 *)((CpuMemory *)images[j][index].get_memory())->get_ptr());
+                }
+
+                f16CNN->run_till_breakpoint(opIdx);
+                memcpy(d, ((CpuMemory *)outputs[i].get_memory())->get_ptr(), dBytes);
+                d += dBytes;
+
+                if ((j != images.size() - 1) && ((j + 1) % NUM_IMAGES_INPUT != 0)) {
+                    continue;
+                }
+
+                if (j == NUM_IMAGES_INPUT - 1 ||
+                    ((j == images.size() - 1) && (j < NUM_IMAGES_INPUT - 1))) {
+                    UNI_DEBUG_LOG("---------- start getting 1 - %u images output tensors "
+                                  "----------\n",
+                        j + 1);
+
+                    F16 *ptr_d = (F16 *)dBuf.data();
+                    F32 max = array_maxabs_f16(ptr_d, (I32)(tensorNumElements(outDesc) * (j + 1)));
+                    UNI_DEBUG_LOG("  %f is the maximum value\n", max);
+                    interval = max / BINS;
+                    histogram.resize(BINS, 0.00001f);
+                    //update histogram first time
+                    update_histogram(tensorNumElements(outDesc) * (j + 1), ptr_d, BINS, interval,
+                        histogram.data());
+                    last_max = max;
+                    d = dBuf.data();
+                    dBuf.clear();
+                    continue;
+                }
+
+                if ((j + 1) % NUM_IMAGES_INPUT == 0 && j != (NUM_IMAGES_INPUT - 1)) {
+                    F16 *ptr_d = (F16 *)dBuf.data();
+                    F32 max =
+                        array_maxabs_f16(ptr_d, (I32)tensorNumElements(outDesc) * NUM_IMAGES_INPUT);
+
+                    UNI_DEBUG_LOG("---------- start getting %d - %u images output tensors "
+                                  "----------\n",
+                        j + 1 - NUM_IMAGES_INPUT, j + 1);
+
+                    if (max <= last_max) {
+                        UNI_DEBUG_LOG("  %f is the maximum value\n", last_max);
+                        interval = last_max / BINS;
+                        //update histogram if no new max
+                        update_histogram(tensorNumElements(outDesc) * NUM_IMAGES_INPUT, ptr_d, BINS,
+                            interval, histogram.data());
+                    } else {
+                        UNI_DEBUG_LOG("  %f is the maximum value\n", max);
+                        interval = max / BINS;
+                        F32 numPerBin = (F32)max / last_max;
+                        //last_max = max; -> may optimize accuracy
+                        histogram = compress_histogram(histogram, numPerBin, last_max);
+                        last_max = max;
+                        update_histogram(tensorNumElements(outDesc) * NUM_IMAGES_INPUT, ptr_d, BINS,
+                            interval, histogram.data());
+                    }
+                    d = dBuf.data();
+                    dBuf.clear();
+                    continue;
+                }
+
+                if ((j == images.size() - 1) && ((j + 1) % NUM_IMAGES_INPUT != 0)) {
+                    UNI_DEBUG_LOG("---------- start getting %d - %u images output tensors "
+                                  "----------\n",
+                        j + 1 - ((j + 1) % NUM_IMAGES_INPUT), j + 1);
+                    dBuf.resize(dBytes * ((j + 1) % NUM_IMAGES_INPUT));
+                    F16 *ptr_d = (F16 *)dBuf.data();
+                    F32 max = array_maxabs_f16(
+                        ptr_d, (I32)(tensorNumElements(outDesc) * ((j + 1) % NUM_IMAGES_INPUT)));
+                    if (max <= last_max) {
+                        UNI_DEBUG_LOG("  %f is the maximum value\n", last_max);
+                        interval = last_max / BINS;
+                        //update histogram if no new max
+                        update_histogram(tensorNumElements(outDesc) * ((j + 1) % NUM_IMAGES_INPUT),
+                            ptr_d, BINS, interval, histogram.data());
+                    } else {
+                        UNI_DEBUG_LOG("  %f is the maximum value\n", max);
+                        interval = max / BINS;
+                        F32 numPerBin = (F32)max / last_max;
+                        //last_max = max; -> may optimize accuracy
+                        histogram = compress_histogram(histogram, numPerBin, last_max);
+                        last_max = max;
+                        update_histogram(tensorNumElements(outDesc) * ((j + 1) % NUM_IMAGES_INPUT),
+                            ptr_d, BINS, interval, histogram.data());
+                    }
+                    d = dBuf.data();
+                    dBuf.clear();
+                    continue;
+                }
+            }
+            UNI_DEBUG_LOG("---------- compute KL ----------\n");
+            std::vector<F32> scale = compute_scale_with_KL(histogram, interval);
+            UNI_DEBUG_LOG("---------- finish compute KL ---------\n");
+            scales.push_back(scale);
+            tensorScale[tensorName] = scale;
+            UNI_DEBUG_LOG("    OutputTensor %u %s gets scale %f\n", i, tensorName.c_str(),
+                tensorScale[tensorName][0]);
+        }
+        if (int8Ms.ops[opIdx].num_quant_feature == 1 &&
+            -2 == int8Ms.ops[opIdx].feature_scale[0].scale[0]) {
+            std::vector<F32> outputScale;
+            outputScale.push_back(-2);
+            scales.push_back(outputScale);
+        }
+
+        op->set_feature_scale(scales);
+
+        // Store scales into result model
+        if (nullptr != resultMs.ops[opIdx].feature_scale) {  // Could be labelled with -2
+            for (U32 i = 0; i < resultMs.ops[opIdx].num_quant_feature; i++) {
+                if (nullptr != resultMs.ops[opIdx].feature_scale[i].scale) {
+                    delete[] resultMs.ops[opIdx].feature_scale[i].scale;
+                }
+            }
+            delete[] resultMs.ops[opIdx].feature_scale;
+        }
+
+        resultMs.ops[opIdx].num_quant_feature = scales.size();
+        resultMs.ops[opIdx].feature_scale =
+            (QuantSpec *)mt_new_storage(scales.size() * sizeof(QuantSpec));
+
+        for (U32 i = 0; i < scales.size(); i++) {
+            resultMs.ops[opIdx].feature_scale[i].num_scale = scales[i].size();
+            U32 scaleBytes = scales[i].size() * sizeof(F32);
+            resultMs.ops[opIdx].feature_scale[i].scale = (F32 *)mt_new_storage(scaleBytes);
+            memcpy(resultMs.ops[opIdx].feature_scale[i].scale, scales[i].data(), scaleBytes);
+        }
+
+        calibratedOpIdx.push_back(opIdx);
+        opIdx = int8CNN->find_next_dynamic_scale_op(calibratedOpIdx, opIdx);
+    }
+
+    print_ms(resultMs);
+
+    std::string modelStorePath = std::string(argv[1]);
+    auto suffixPos = modelStorePath.find(".bolt");
+    modelStorePath.erase(suffixPos, 5);
+    modelStorePath += "_KL.bolt";
+    CHECK_STATUS(serialize_model_to_file(&resultMs, modelStorePath.c_str()));
+
+    CHECK_STATUS(mt_destroy_model(&int8Ms));
+    CHECK_STATUS(mt_destroy_model(&f16Ms));
+    resultMs.num_op_tensor_entries = relationNum;
+    resultMs.op_relationship_entries = relationPtr;
+    CHECK_STATUS(mt_destroy_model(&resultMs));
+#endif
+    return 0;
+}
diff --git a/inference/examples/CMakeLists.txt b/inference/examples/CMakeLists.txt
new file mode 100644
index 00000000..13cad12c
--- /dev/null
+++ b/inference/examples/CMakeLists.txt
@@ -0,0 +1,63 @@
+cmake_minimum_required(VERSION 3.2)
+
+file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake)
+if (BOLT_CONFIGURE_FILE)
+    include(${BOLT_CONFIGURE_FILE})
+else (BOLT_CONFIGURE_FILE)
+    message(FATAL_ERROR "
+FATAL: can not find bolt.cmake in directory,
+       please set shell or cmake environment variable BOLT_ROOT.
+    ")
+endif (BOLT_CONFIGURE_FILE)
+
+project(examples)
+
+set_c_cxx_flags()
+set_test_c_cxx_flags()
+
+include_flow()
+
+if (BUILD_TEST)
+    engine_test(bert bert/bert.cpp)
+    engine_test(tinybert bert/tinybert.cpp)
+    engine_test(classification image_classification/classification.cpp)
+    engine_test(nmt machine_translation/nmt.cpp)
+    engine_test(nmt_tsc machine_translation/nmt_tsc.cpp)
+    engine_test(asr_rnnt automatic_speech_recognition/asr_rnnt.cpp)
+    engine_test(asr_convolution_transformer automatic_speech_recognition/asr_convolution_transformer.cpp)
+    engine_test(tts text_to_speech/tts.cpp)
+    engine_test(vad automatic_speech_recognition/vad.cpp)
+    engine_test(detection object_detection/detection.cpp)
+    engine_test(tinybert_onnx bert/tinybert_onnx.cpp)
+    engine_test(benchmark benchmark/benchmark.cpp)
+    engine_test(test_api_c c_api/test_api_c.c)
+    install(TARGETS classification
+                    benchmark
+                    tinybert
+                    tinybert_onnx
+                    nmt
+                    asr_rnnt
+                    asr_convolution_transformer
+                    tts
+                    vad
+                    test_api_c
+            RUNTIME DESTINATION examples)
+    if (USE_MALI AND USE_FP16)
+        engine_test(test_pipeline_ocl sequential/test_pipeline_ocl.cpp)
+        engine_test(hdr high_dynamic_range/hdr.cpp)
+        install(TARGETS hdr
+                RUNTIME DESTINATION examples)
+    endif (USE_MALI AND USE_FP16)
+
+    if (USE_FLOW)
+        flow_test(graph_tinybert bert/graph_tinybert.cpp)
+        flow_test(flow_tinybert bert/flow_tinybert.cpp)
+        flow_test(flow_asr "automatic_speech_recognition/flow_asr.cpp;automatic_speech_recognition/audio_feature.cpp")
+        flow_test(flow_dlaWOdcn dlaWOdcn/flow_dlaWOdcn.cpp)
+        flow_test(flow_facesr facesr/flow_facesr.cpp)
+        install(TARGETS flow_asr
+                        flow_dlaWOdcn
+                        flow_facesr
+                RUNTIME DESTINATION examples)
+    endif (USE_FLOW)
+endif (BUILD_TEST)
diff --git a/inference/examples/automatic_speech_recognition/asr_convolution_transformer.cpp b/inference/examples/automatic_speech_recognition/asr_convolution_transformer.cpp
new file mode 100644
index 00000000..f5c53193
--- /dev/null
+++ b/inference/examples/automatic_speech_recognition/asr_convolution_transformer.cpp
@@ -0,0 +1,218 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <iostream>
+#include <cmath>
+
+#include "inference.hpp"
+#include "tensor.hpp"
+#include "data_loader.hpp"
+#include "profiling.h"
+#include "parse_command.h"
+
+std::map<std::string, Tensor> prepareStates(
+    DataType dt, std::string sequenceDirectory, std::string shapeMapFileName)
+{
+    // The shape file lists, per tensor: its name, the number of dims, then each dim
+    // from outermost to innermost.
+    std::map<std::string, TensorDesc> shapeMap;
+    std::string filePath = sequenceDirectory + "/" + shapeMapFileName;
+    FILE *shapeMapFile = fopen(filePath.c_str(), "r");
+    char buffer[NAME_LEN];
+    while (fscanf(shapeMapFile, "%s", buffer) != EOF) {
+        TensorDesc desc;
+        fscanf(shapeMapFile, "%u", &(desc.nDims));
+        for (U32 i = 0; i < desc.nDims; i++) {
+            fscanf(shapeMapFile, "%u", &(desc.dims[desc.nDims - 1 - i]));
+        }
+        if (std::string(buffer) == std::string("label")) {
+            desc.dt = DT_U32;
+        } else {
+            desc.dt = dt;
+        }
+        std::string inputName(buffer);
+        if (inputName.find(std::string("layer1_mem")) != std::string::npos) {
+            desc.df = DF_NCHWC8;
+        } else {
+            desc.df = DF_NCHW;
+        }
+        shapeMap[inputName] = desc;
+    }
+    fclose(shapeMapFile);
+
+    std::map<std::string, Tensor> tensorMap;
+    for (auto iter : shapeMap) {
+        std::string filePath = sequenceDirectory + "/" + iter.first + ".txt";
+        TensorDesc desc = iter.second;
+        tensorMap[iter.first] = load_txt(filePath, std::vector<TensorDesc>{desc})[0];
+    }
+    return tensorMap;
+}
+
+void saveStates(std::shared_ptr<CNN> pipeline,
+    std::string sequenceDirectory,
+    std::string outputFileName,
+    std::string outputStatesFileName)
+{
+    char buffer[NAME_LEN];
+    std::string outputFilePath = sequenceDirectory + "/" + outputFileName;
+    std::string outputStatesFilePath = sequenceDirectory + "/" + outputStatesFileName;
+    FILE *outputFile = fopen(outputFilePath.c_str(), "r");
+    FILE *outputStatesFile = fopen(outputStatesFilePath.c_str(), "w");
+    // checking the fscanf return instead of feof avoids reprocessing the last name twice
+    while (fscanf(outputFile, "%s", buffer) != EOF) {
+        Tensor tensor = pipeline->get_tensor_by_name(buffer);
+        TensorDesc desc = tensor.get_desc();
+
+        // write states
+        fprintf(outputStatesFile, "%s\n", buffer);
+        fprintf(outputStatesFile, "%u\n", desc.nDims);
+        for (U32 i = 0; i < desc.nDims; i++) {
+            fprintf(outputStatesFile, "%u ", desc.dims[desc.nDims - 1 - i]);
+        }
+
+        // write data
+        U32 num = tensorNumElements(desc);
+        std::string outputDataPath = sequenceDirectory + "/" + std::string(buffer) + ".txt";
+        FILE *outputDataFile = fopen(outputDataPath.c_str(), "w");
+        for (U32 i = 0; i < num; i++) {
+            fprintf(outputDataFile, "%f ", tensor.element(i));
+            if (i % 10 == 9) {
+                fprintf(outputDataFile, "\n");
+            }
+        }
+        fclose(outputDataFile);
+    }
+    fclose(outputFile);
+    fclose(outputStatesFile);
+}
+
+int verify(Tensor tensor, std::string subNetworkName, std::map<std::string, TensorDesc> inputDescMap)
+{
+    U32 num = tensor.length();
+    F32 sum = 0;
+    for (U32 i = 0; i < num; i++) {
+        sum += tensor.element(i);
+    }
+    I32 result = 0;
+    // std::abs (from <cmath>) keeps the float comparison exact; the integer abs()
+    // would truncate the difference
+    if (subNetworkName == std::string("encoder")) {
+        if (inputDescMap["sounds"].dims[1] == 15) {
+            if (std::abs(sum - 44.4) >= 1) {
+                result = 1;
+            }
+        } else if (inputDescMap["sounds"].dims[1] == 8) {
+            if (std::abs(sum - 102.3) >= 1) {
+                result = 1;
+            }
+        } else {
+            result = 1;
+        }
+    } else if (subNetworkName == std::string("prediction_net")) {
+        if (std::abs(sum - 21.7) >= 1) {
+            result = 1;
+        }
+    } else if (subNetworkName == std::string("joint_net")) {
+        if (std::abs(sum - (-24.6)) >= 1) {
+            result = 1;
+        }
+    }
+    return result;
+}
+
+int main(int argc, char *argv[])
+{
+    UNI_TIME_INIT
+    ParseRes parse_res;
+    parseCommandLine(argc, argv, &parse_res, "examples");
+    char *modelPath = (char *)"";
+    char *sequenceDirectory = (char *)"";
+    std::string subNetworkName = std::string("encoder");
+    char *affinityPolicyName = (char *)"CPU_AFFINITY_HIGH_PERFORMANCE";
+
+    if (!parse_res.model.second) {
+        exit(-1);
+    }
+    if (parse_res.model.second) {
+        modelPath = parse_res.model.first;
+    }
+    if (parse_res.inputPath.second) {
+        sequenceDirectory = parse_res.inputPath.first;
+    }
+    if (parse_res.archInfo.second) {
+        affinityPolicyName = parse_res.archInfo.first;
+    }
+    if (parse_res.subNetworkName.second) {
+        subNetworkName = std::string(parse_res.subNetworkName.first);
+    }
+
+    std::string outputTensorName;
+    if (subNetworkName == std::string("encoder")) {
+        outputTensorName = "encoder_block3_transformer_ln";
+    } else if (subNetworkName == std::string("prediction_net")) {
+        outputTensorName = "prediction_net_ln";
+    } else if (subNetworkName == std::string("joint_net")) {
+        outputTensorName = "joint_output_fc";
+    } else {
+        UNI_ERROR_LOG("unrecognized sub network(encoder|prediction_net|joint_net) %s\n",
+            subNetworkName.c_str());
+    }
+
+    DataType dt;
+    std::string modelPathStr = std::string(modelPath);
+    // "_f[16|32].bolt"
+    std::string modelPathSuffix = modelPathStr.substr(modelPathStr.size() - 9);
+    if (modelPathSuffix == std::string("_f16.bolt")) {
+        dt = DT_F16;
+    } else if (modelPathSuffix == std::string("_f32.bolt")) {
+        dt = DT_F32;
+    } else if (modelPathSuffix == std::string("t8_q.bolt")) {
+        dt = DT_F16;
+    } else {
+        UNI_ERROR_LOG("unrecognized model file path suffix %s\n", modelPathSuffix.c_str());
+    }
+    auto pipeline = createPipeline(affinityPolicyName, modelPath);
+
+    double totalTime = 0;
+    int loops = 1;
+    U32 falseResult = 0;
+    for (int i = 0; i < loops; i++) {
+        std::map<std::string, Tensor> input =
+            prepareStates(dt, sequenceDirectory, "input_shape.txt");
+        std::map<std::string, TensorDesc> inputDescMap;
+        for (auto iter : input) {
+            inputDescMap[iter.first] = iter.second.get_desc();
+        }
+        pipeline->reready(inputDescMap);
+        for (auto iter : input) {
+            U8 *tensorPointer = (U8 *)((CpuMemory *)(iter.second.get_memory()))->get_ptr();
+            pipeline->copy_to_named_input(iter.first, tensorPointer);
+        }
+
+        double timeBegin = ut_time_ms();
+        pipeline->run();
+        double timeEnd = ut_time_ms();
+        totalTime += (timeEnd - timeBegin);
+        Tensor output = pipeline->get_tensor_by_name(outputTensorName);
+        falseResult += verify(output, subNetworkName, inputDescMap);
+        //saveStates(pipeline, sequenceDirectory, "output_name.txt", "output_shape.txt");
+    }
+    UNI_TIME_STATISTICS
+
+    std::cout << "[SUMMARY]:" << std::endl;
+    U32 validSequence = loops;
+    UNI_CI_LOG(
+        "speech recognition rate: %f %%\n", 100.0 * (validSequence - falseResult) /
validSequence); + UNI_CI_LOG("avg_time:%fms/sequence\n", 1.0 * totalTime / validSequence); + if (falseResult > 0) { + UNI_ERROR_LOG("verify failed\n"); + } + + return 0; +} diff --git a/inference/examples/automatic_speech_recognition/asr_labels.txt b/inference/examples/automatic_speech_recognition/asr_labels.txt new file mode 100644 index 00000000..9cfb1ac5 --- /dev/null +++ b/inference/examples/automatic_speech_recognition/asr_labels.txt @@ -0,0 +1,2 @@ +[NULL] [START] [SEP] [MASK] 的 我 你 了 是 不 一 么 有 好 在 个 人 这 要 天 什 到 大 来 上 吗 去 没 下 啊 就 电 点 怎 说 看 会 开 小 多 还 给 十 子 能 那 中 可 家 为 打 想 时 都 二 车 们 发 话 三 以 过 吧 吃 生 年 出 里 最 也 他 机 回 用 手 心 老 和 样 道 明 地 行 现 后 得 女 哪 国 今 五 公 妈 新 爱 事 自 学 呢 很 成 对 动 真 知 市 起 日 四 快 做 把 意 信 高 儿 面 方 美 爸 关 安 情 路 调 分 少 八 六 作 之 前 呀 哈 长 度 如 场 几 零 经 请 着 百 姐 歌 全 号 太 晚 九 业 放 第 问 月 乐 气 网 七 西 帮 工 加 法 觉 喜 东 别 定 睡 无 些 通 近 被 头 名 间 文 两 听 买 于 主 欢 空 然 友 又 再 玩 办 导 才 等 水 感 只 果 风 找 谁 理 查 同 让 宝 区 己 本 比 力 走 谢 海 叫 男 重 次 当 实 早 山 片 钱 外 州 南 她 啦 航 身 干 影 视 已 体 王 接 将 正 进 金 候 思 花 音 饭 语 记 提 部 哦 系 星 收 何 万 游 位 图 所 首 哥 见 张 站 从 班 应 者 息 该 门 合 式 城 马 北 服 火 节 相 因 英 完 字 周 世 白 活 口 带 民 刚 期 神 色 跟 种 照 孩 线 光 嗯 平 与 司 红 表 单 千 联 而 但 化 阳 交 物 保 特 题 结 婆 院 报 常 广 量 解 变 更 入 直 界 死 内 边 京 目 热 店 难 师 原 房 超 告 费 数 温 员 模 球 其 像 台 嘛 消 午 资 品 产 设 总 件 演 流 朋 错 利 江 价 先 码 代 呵 级 笑 福 需 战 短 亮 亲 速 建 眼 华 聊 强 清 务 转 教 向 声 离 商 写 元 龙 书 播 戏 改 此 传 考 卡 制 换 受 每 酒 包 微 远 认 充 拉 越 运 示 处 管 置 奶 醒 座 条 格 赛 县 展 林 停 试 黄 送 讲 园 喝 拿 医 排 警 衣 连 计 雨 算 望 性 始 婚 拍 求 春 票 半 校 准 幺 曲 黑 夜 剧 住 飞 灯 深 啥 词 整 附 装 穿 器 油 李 注 达 队 斯 省 选 诉 假 冷 功 德 政 复 米 备 科 客 程 画 双 证 持 局 洗 岁 便 未 底 梦 取 银 集 终 精 基 香 反 布 尔 妹 青 久 版 容 习 指 技 河 唱 士 任 案 您 云 推 使 句 钟 失 股 伤 蛋 非 义 至 低 读 款 检 步 巴 丽 坐 续 满 奇 病 况 频 食 驾 克 预 并 脑 休 掉 否 专 弟 祝 启 忙 刘 识 村 载 楼 鱼 它 助 汽 支 立 治 帅 菜 昨 言 标 药 故 牌 易 跑 切 湖 烦 阿 窗 确 显 创 组 型 闭 爷 景 古 决 雪 狗 必 鸡 养 户 却 街 石 各 铁 落 及 据 课 由 录 造 卖 武 肉 领 怕 牛 社 值 亚 投 术 兴 命 共 陈 除 差 待 屏 脸 拨 幸 统 答 贝 猪 宁 板 或 餐 随 观 约 修 季 床 麻 搜 货 毛 防 密 冰 足 累 害 份 希 企 盘 象 港 紧 较 懂 具 规 环 罗 营 旅 搞 团 忘 简 蓝 留 奥 绝 军 许 历 段 称 杨 获 够 皮 质 配 智 汉 镇 态 志 举 限 护 往 幕 牙 形 项 居 存 背 痛 源 论 升 庆 破 礼 按 陪 贵 康 味 轻 卫 翻 索 维 险 议 愿 农 评 料 增 健 类 参 永 慢 沙 招 彩 购 冬 漂 藏 博 粉 坏 初 秋 室 念 荣 爆 赶 众 苦 夫 响 育 艺 遇 极 血 顺 套 呼 急 嘿 询 职 驶 介 兰 暖 效 委 椅 官 竟 奖 魔 波 适 倒 属 怀 验 富 恩 角 恋 挺 势 销 济 减 雄 块 馆 软 乡 突 财 免 围 摩 讯 苏 降 即 左 控 静 继 童 范 断 救 登 优 退 练 根 宣 诗 秀 尽 母 疼 肥 舞 划 末 权 脚 谈 杀 苹 土 纪 党 羊 敢 韩 测 拜 豆 圈 跳 曾 右 普 烧 某 饿 肯 猫 际 胡 亿 似 击 依 娘 松 群 丝 络 颜 细 咋 木 岛 止 史 妇 绿 怪 哎 赵 闻 川 郑 灵 肚 冒 争 烟 址 哭 监 察 付 状 吴 雷 熊 闹 副 致 笔 章 草 胖 售 压 哇 临 补 仙 桥 吉 树 坚 迷 困 追 移 独 迎 列 吹 归 茶 惊 傻 秘 享 境 杰 朝 胜 骗 猜 耀 晓 率 娃 积 绍 典 夏 抱 引 则 田 府 靠 鞋 负 供 兄 择 露 研 额 圣 输 般 须 杯 令 严 孕 漫 厂 余 浪 姑 泰 笨 邮 佳 乱 弄 抓 素 欧 讨 激 采 涨 训 斗 箱 暴 冲 刻 尼 镜 甲 互 玉 腿 责 泡 施 究 叔 借 疑 透 曼 承 狂 庭 洋 勇 轮 厅 且 爽 瓜 鬼 酷 律 庄 担 冠 舒 洲 迪 宜 咯 页 豪 挂 野 堂 咱 攻 抢 峰 孙 毒 纸 译 父 唐 皇 甜 偷 赞 族 骑 叶 娱 脱 盟 略 呗 顾 圳 派 池 批 丰 舍 圆 恐 巨 益 距 刷 递 丹 净 威 另 锅 签 乖 糖 操 材 聚 晨 良 乎 暗 吸 凉 访 恶 融 晒 守 妻 荐 旧 概 耳 湾 兵 纳 抽 虎 仅 层 败 托 伦 睛 寻 汤 疗 嘴 媒 宋 酸 鲜 裤 印 齐 策 厉 善 曝 执 央 楚 宿 惠 萌 棒 腾 汇 昌 嘉 席 寒 届 途 屁 束 顶 租 鸟 辛 珠 编 召 骨 俩 折 毕 构 协 莫 熟 劳 骂 架 胎 税 例 巧 封 辣 诞 梅 疯 散 姓 哟 遭 娜 伙 键 泪 侠 贴 审 申 雅 虽 炒 阴 姨 菲 判 泉 币 异 颖 弃 革 震 努 凯 阅 宫 伟 仍 陆 帝 霸 辆 拼 误 拥 册 赢 征 赚 潮 锁 津 端 核 乌 档 缺 删 释 厌 悲 伴 织 尚 堵 订 摄 域 紫 谷 逃 犯 私 培 估 眠 默 违 桃 杭 仔 桌 账 屋 剑 挑 隐 刀 尾 缘 盛 稳 戴 滴 弹 萨 枪 鲁 蜜 糕 扣 盖 君 罪 佛 瘦 刺 剩 淘 澡 探 蒙 惜 绩 忍 淡 搭 逼 禁 库 宾 雾 徐 若 聪 析 贷 唉 瓶 序 厦 凤 趣 嘻 危 忆 滚 瑞 烤 洛 醉 狼 亦 旁 遍 偶 丁 晕 乘 钢 竞 泽 纯 兽 森 榜 闲 丑 恭 幼 伊 避 阵 铃 麦 盗 秒 篇 嫁 射 硬 欲 肤 丢 吐 慧 饮 塞 均 猎 奔 迟 鼻 滑 闪 锋 遗 船 甘 尊 唯 诺 饼 惯 敬 纷 宇 噢 虚 郭 昆 卷 润 炸 亡 塔 染 朵 混 逛 罚 俊 贫 针 沉 幻 旗 症 拒 纹 袋 袭 剪 替 炎 锦 胸 珍 梁 篮 植 厕 尸 鼠 扎 吓 跌 顿 滨 扬 诚 玲 孤 虑 烈 含 凡 延 占 燕 媳 映 臭 敏 姥 姆 乔 摇 吵 废 鹿 朱 杂 灰 扫 恒 障 返 祖 煮 弱 艳 暂 撞 摸 敌 刑 斤 劲 沈 辈 喊 兔 朗 怒 残 咖 扰 迹 妆 嫌 洞 摆 迅 邓 胃 偏 闷 井 龄 仪 督 莱 既 凌 貌 溪 妞 坑 湿 掌 览 触 综 惨 幅 腰 鼓 励 玛 呆 链 隆 患 柳 琴 摔 虫 沟 隔 鸭 灭 魂 宽 逆 饱 妖 柔 扶 宵 萝 郎 插 钓 寄 杜 洁 抗 舅 裙 辑 懒 烂 喂 徽 涛 戒 饰 浙 旦 桂 赏 怡 
恨 仁 伯 寂 欣 壁 述 详 擦 莲 盒 炉 液 彻 慕 涉 损 氛 碰 芳 宗 董 秦 宠 冻 墙 怜 僵 泳 悔 坛 搬 羽 缩 裁 拖 侧 奋 夺 妙 揭 筑 郁 抄 浮 狐 栏 喔 甚 尘 横 灾 献 添 寿 贸 腐 宅 岗 垃 忽 固 耶 夹 纠 幂 圾 措 哼 宏 渐 靓 尿 躺 撒 阶 艾 焦 悉 辉 忧 倍 啡 脏 穷 祥 炼 援 竹 著 碗 莉 奏 羡 澳 厚 铺 截 姿 徒 拳 污 赫 促 箭 欠 猴 胆 尝 怖 旋 洪 盆 飘 帽 赔 谓 爹 桩 括 坦 耐 描 勒 扩 撑 芝 吻 谱 惹 肖 逐 恢 膜 窝 跨 拆 氏 拔 颗 虹 蛮 衡 晴 盐 詹 寞 挥 柏 晶 痘 卓 碎 催 爬 佩 聘 岸 苍 炮 债 咬 磨 燃 遥 棋 饺 潜 挡 泥 毁 径 慰 券 畅 汗 岳 捷 辞 捕 霆 欺 孔 俗 籍 眉 扮 挖 莞 符 悠 肠 歉 咪 埋 亏 拾 赖 耍 喽 俄 握 霜 汪 贺 浦 璃 呜 瓦 荷 岩 缓 蛇 梯 绵 瑶 拟 夕 玻 黎 驰 陵 傅 棉 凭 绕 储 弯 愁 塘 翔 厨 夸 陷 悬 蔡 偿 仓 胶 幽 浩 猛 奈 携 予 岭 振 番 荒 逗 薄 繁 贤 迁 巡 兆 疆 堡 坊 卧 匙 丫 狠 鹏 虾 毫 壮 薇 忠 坡 乳 晗 娇 喷 倾 稀 驻 卜 仲 陌 晋 巾 霍 肃 悟 牵 荡 帐 熬 轨 勤 尖 允 旺 循 柜 矿 稍 潘 辽 丈 疾 寺 尬 滩 魅 涂 堪 钥 阻 苗 槽 诊 邀 湘 姻 炫 枝 抵 堆 凶 芭 咳 羞 钻 肿 傲 沿 侣 婴 铜 谅 瞬 遮 祸 伞 卢 呐 吊 填 尤 仿 浏 霞 浓 碧 鉴 悦 裂 尴 铠 咸 漏 腹 酱 粗 痴 串 煤 肌 挣 翼 狱 糊 贾 芦 葡 闺 绪 嗨 挤 锡 曹 摘 姜 彤 棍 骚 婷 寨 筹 琳 俺 陕 帘 惑 墨 蟹 狮 冯 授 粥 耗 蜂 亭 贯 桑 趋 馨 蓉 萄 玫 踢 谋 椒 淮 妃 兼 辅 诸 肩 舟 础 犹 桶 慈 捡 庙 唔 瞎 谜 诱 姚 诈 龟 壳 柯 踏 粮 凰 淋 叉 逸 贱 赌 蝶 寓 袜 齿 愉 烫 盈 屎 涯 刹 跃 瑰 浴 卸 鸣 丸 裸 盾 孝 拘 迫 洒 肝 橙 殊 萧 鹅 趟 娶 汁 沃 氧 尺 筋 踪 廉 伍 暑 捐 扇 歇 涵 滋 扔 浅 赴 崩 邻 谦 斑 蒸 披 粤 袁 贼 胞 唤 迈 躲 痒 锻 蕾 砸 妮 轩 赤 邪 犬 嫂 溜 峡 寸 抛 悄 琪 柴 薪 纱 渡 侵 哒 哲 虐 陶 霉 糟 牢 莎 蚊 梨 脾 樱 奚 翠 屈 恼 伏 叹 誉 扑 蒋 翅 昏 覆 刮 侦 逢 劫 醋 宴 仇 笼 绑 鹰 袖 蹈 丘 遵 衫 塑 俱 潭 芒 鹤 薛 捧 菊 驱 罩 哀 稿 碑 磊 潇 署 牧 邦 鸿 薯 泄 啤 彭 踩 闯 舌 勿 膀 罐 沧 漠 翰 谣 雕 膏 寝 柱 怨 兹 仰 孟 墓 杏 蒂 芬 熙 愤 囊 乃 伸 菇 矛 割 葫 魏 坤 埃 咨 蜡 纽 辩 莓 彼 癌 旬 韵 嫩 琦 撕 押 贪 韦 吾 旭 枣 贡 慌 崇 呦 吕 捉 葛 械 茫 敲 杆 肺 慎 辰 磅 嗽 蕉 茂 抬 躁 垫 跪 劝 渣 溃 役 缝 谎 苑 窃 巢 憾 愈 腊 纵 谐 剂 喉 宙 罢 渠 匆 侯 缴 肾 御 粒 伪 佑 抚 昂 痕 砍 扭 皆 坪 葱 渴 谊 淇 腻 雀 茄 瘾 抹 碍 瑟 颈 逊 敦 茅 疲 唇 阁 纲 衰 炖 脉 谭 穆 蔬 疏 厘 扯 愧 螺 鼎 撤 赠 蝴 怼 轿 斌 艰 哄 矮 嘟 浆 框 嘎 绒 渔 瓷 摊 盼 灿 荆 舰 鑫 煎 阜 酬 泛 呈 玄 弗 甩 盲 砖 莹 雯 佬 媚 匹 脂 奉 凑 葩 脖 骄 惧 奕 拐 筒 叠 朴 凝 践 萍 廊 奢 巷 戚 兜 胀 谨 丧 逝 枕 恰 帆 阔 乏 歪 芯 翁 脆 辨 硕 吨 豫 兑 郊 呃 飙 霾 抑 掘 铭 垂 塌 蓄 瞧 锤 吼 蠢 桐 拓 栋 珊 绘 诀 菌 绣 腔 斜 蛙 捞 碳 昕 抖 噶 涌 雁 勾 靖 毅 肇 誓 芙 扒 殿 磁 绳 拯 锐 疫 兮 妥 甄 嗓 掩 咒 蜀 臣 彦 昔 喻 崔 殖 吞 趁 帖 穴 逻 赋 削 棚 轰 驴 郸 枚 诶 璐 颁 玺 柿 募 邯 鸽 沪 捏 凳 忌 攀 僧 坠 挽 沫 挨 屯 嫣 乙 绮 沂 彬 缠 邵 暮 奸 裹 坝 馒 豹 颠 爪 渭 厢 帕 臂 粘 愚 爵 饶 汕 灌 慨 弥 乒 浑 廷 扁 芜 壶 喵 淑 楠 馅 钙 钮 岂 俪 嘞 黛 辱 拦 咙 妒 耻 杉 伐 佐 撩 芽 祭 晃 筝 仗 橘 茜 娟 淀 枫 漆 榴 蝎 蚁 遂 衬 澄 邢 瑜 冤 咧 蹭 罕 鞭 蛛 瓣 煌 惩 憋 倩 泊 烊 株 菠 冈 戈 掀 刊 腺 鲍 隧 碌 襄 赣 睁 妍 裕 吟 鞍 奴 屌 绎 碟 昭 骤 蘑 讼 咏 祈 蒲 卦 崛 荔 拌 墅 泸 玮 哑 叛 噜 绯 卑 棵 咩 巫 沦 敷 氓 嘲 趴 卵 螃 杠 恺 铅 坎 狸 缸 辖 麟 陀 稻 燥 藤 钉 揍 蜘 丛 耽 契 函 蚂 喇 湛 贩 婶 筷 盯 巅 饥 梳 蒜 媛 庞 屠 菱 橡 辟 撼 砂 溢 葬 衔 邱 嫉 毯 泼 汰 甸 惕 蹄 犀 牡 勉 蹲 蓬 琼 履 榆 搏 弊 氢 娄 堰 噩 禅 葵 咽 姬 颇 沁 酿 柠 弦 鄙 烛 戳 淄 檬 鲨 粽 逾 跤 坞 畀 沾 椎 枯 晰 斥 煲 涕 揉 狙 骏 渤 彰 宪 膝 渝 潍 韶 卿 衷 浇 勃 侄 贞 焰 巩 闸 婉 撸 掏 翘 馋 剥 辜 暨 栈 屉 硫 姗 斩 琐 稚 悍 勋 庐 溶 矫 鸦 瞌 牲 疤 叮 昧 屡 炭 伽 涩 漳 庸 啰 乾 讽 侈 诵 剁 昊 迦 澜 轴 袍 驼 乓 禾 挪 蹦 仑 妨 纤 靴 钩 瘤 叙 逮 宰 笛 蔚 匪 渊 刃 寡 赐 辐 俏 柚 憨 畏 烨 泣 嗦 耿 袄 坟 秩 疮 屿 淫 弘 猩 邑 邹 挫 胁 倪 樊 鄂 谍 讶 颂 佟 扛 蝉 凄 剖 烁 埔 劣 鳄 侨 耕 菏 栗 瘫 栽 歧 郡 睇 皱 猥 荧 倡 祷 夷 傍 匠 睿 孽 贿 咕 滥 梧 膊 胳 栖 亨 儒 梗 蔽 牺 钠 浒 腕 疹 窦 茨 闽 侃 驳 茹 竖 阱 娅 咔 蜗 崖 喘 弓 埠 擎 劈 粑 乞 萱 馈 啧 凸 俞 钰 吁 珂 滞 揽 魄 啃 舔 彪 顽 沸 艇 驹 骆 捣 窄 菩 啪 亩 絮 绽 颤 韭 淹 剃 炅 逍 朔 擅 霖 熄 垄 哗 粹 惫 瀑 俾 馍 铝 抠 皂 禽 氯 闰 蝠 篷 肪 舆 搅 仕 攒 拱 雇 蚕 汾 斋 浸 楔 殴 镖 枉 辫 茵 肆 畔 囧 棠 蚌 哩 倚 熏 漱 嵩 淳 卤 甫 倦 嚣 荟 硅 哨 榨 睫 黔 堕 勺 冥 蝙 沐 矩 窍 嘘 拽 俯 椰 旱 焊 懵 汝 氨 豚 呕 翡 纺 濮 尹 睹 莆 瞒 曦 锈 缅 祁 酥 磕 钞 陇 纬 沥 肢 叭 晖 搁 诛 捂 芹 娥 颐 噪 隋 岚 赁 魁 衍 鹦 搓 瞻 脊 钦 鲤 诠 聋 茉 笋 榄 臀 禹 郝 杞 祛 庚 廖 诡 鹉 仆 裔 畜 阖 艘 唠 斧 啸 挠 芋 狄 槛 冀 绥 嫖 洱 饲 泌 碱 哮 鲸 瘩 悴 疙 崎 摧 梭 觅 憔 汀 惬 痣 茎 苔 溺 梓 堤 镐 碾 冉 翩 腥 灶 暧 宛 潼 禺 芸 滤 哺 遛 遣 侍 镑 闫 秤 旷 殷 龚 糯 醇 晏 奎 陋 喧 舱 矣 冕 惟 峻 芷 匀 毙 咦 霄 纶 噻 蝇 嬛 竿 缉 眷 萎 禄 峨 佣 咚 恍 拂 掐 蔓 瞳 泻 丞 籽 穹 曰 枸 阎 怅 竭 桔 贬 樟 铲 藕 穗 缆 痰 稣 壤 妓 嗷 捶 啼 杖 娲 瞪 猿 擒 沛 赎 驿 瞅 芥 墩 焚 癫 鸳 琅 揪 稽 坨 珞 峪 挞 囚 敞 哉 霹 雳 挚 眨 眸 簿 屑 狭 旨 尧 嚏 麽 绅 蜕 丙 懿 肋 悚 婿 匿 崽 渺 锣 涡 澈 绸 膨 焉 缕 璧 皓 焕 瞄 璇 酵 滕 惭 垮 泗 窟 叨 垒 裴 凹 虞 矶 昼 疡 隶 硝 鹊 橄 鱿 朕 邂 叼 怂 柄 萤 逅 曙 铸 汶 棕 楂 颓 壹 髓 貂 呛 蕴 窑 脐 侮 沽 煞 窖 洽 猝 俐 秃 朦 眯 盔 炳 鸯 麒 湄 阑 鹭 抒 窒 伺 漯 滁 鸥 睐 芈 粪 窥 哆 拇 螂 豁 韬 捅 钧 妄 捆 拢 苟 羯 胧 丐 腌 岔 聂 喀 懈 鳞 嘱 悼 卉 籁 恳 楷 巍 牟 喱 娼 嗝 蒽 笙 惆 寥 驭 嗲 搂 掖 涟 榕 莺 疚 泵 咆 甥 忑 娴 阙 镯 掰 箫 焖 禧 忐 锯 祠 缇 郴 砺 辕 鞠 缔 痔 烽 灸 歹 嫦 玥 趾 筛 妾 墟 槐 孜 羁 栓 厄 冶 乍 锄 忻 徊 俭 唧 棱 黍 荫 赃 唿 诅 勘 拎 徘 垣 汹 灼 衢 圭 锂 裳 阀 
隙 跆 佰 聆 撇 琢 澎 闵 炬 啵 呱 绊 廿 凋 滔 藻 暇 鲫 雌 阮 槟 涧 蕊 孵 篱 昵 捎 坂 皋 坷 奠 顷 雍 剔 躯 砥 靡 腩 狡 擂 掠 竣 讪 窜 枢 钝 磷 惦 嘀 鬓 灏 忒 婊 涮 秧 癖 钾 殡 馄 沭 烹 獒 叽 喆 鳌 嚼 亳 刁 漓 岐 盏 跷 肛 婕 绚 帷 瞩 梵 棺 泾 糙 薰 缤 饵 橱 渗 蔷 辙 髦 蛤 雹 噗 僻 韧 烘 琉 酝 俑 蜓 粱 翟 阚 鹃 璨 褪 馀 锌 豌 撬 扳 肘 饨 蜻 吱 掷 黏 喃 亢 嗒 咐 痪 祺 缀 懦 羹 檐 扉 妩 躬 婧 蟒 雏 漾 锥 朽 珀 柬 敛 眶 炜 拧 愣 桦 揣 惚 嗑 抉 蟑 吏 猕 浊 蛀 迢 汛 熔 绰 琶 俘 谚 蚀 寇 煽 酪 陛 潢 帼 拙 犁 噬 黯 倔 谴 娆 猬 淤 涅 拭 衅 茸 桓 侏 嘶 晾 淌 孰 绞 粟 苇 痫 芮 缚 璀 弈 踹 酌 唬 遏 铛 釜 峥 莽 恕 瑾 廓 拣 弧 靳 尉 沌 嗜 耸 垦 箍 殃 褒 檀 掂 镶 勐 诫 帜 驯 伶 彝 姊 绷 瑚 嵌 迭 倘 湃 龈 襟 茧 烙 矢 溅 惶 坍 钗 溯 糜 讳 纫 醛 珑 仨 蔗 邨 荤 筐 崭 棘 腮 赘 秉 惰 咀 袱 衩 戎 榔 蚓 胭 蛾 侬 囤 辗 蚯 媲 飓 洼 瀚 睦 寐 溧 酶 霏 瓢 痞 胚 骁 恬 蹊 嬉 淅 凿 癣 羚 烯 垢 酋 狩 吒 琵 惋 亥 唾 炊 裆 敖 吩 缭 撰 壕 蚝 苯 圃 簧 皖 忏 秆 蛟 岷 拷 霓 缪 斐 藉 琊 犊 痿 玖 啄 夭 妲 崂 瑕 嗅 逞 呸 幢 膳 亵 氮 虏 猖 卒 秸 庇 捍 蝌 嚎 笠 蚪 泷 殇 麓 旌 涪 遁 沮 茗 昀 酣 舶 舜 榻 唏 嘤 怯 砌 颍 萦 酮 谬 陨 怠 樽 梢 阐 宥 饪 痹 馥 濒 鳅 栾 遐 绢 镁 迄 笈 苛 蔻 恤 晟 嗡 稼 侥 吭 骷 婺 滇 矜 嬷 渍 斓 颊 崴 茬 铐 糗 铂 碉 刨 慵 瑄 阆 睬 炙 妊 枭 颅 屹 畸 匡 犒 畴 髅 猾 裘 栀 摁 瓮 膺 浔 袅 毋 蟆 佘 鸠 羔 渎 礁 菁 泓 偕 筵 飒 栅 凛 陡 轲 癜 诏 陂 寅 铮 慷 涤 鳝 隍 莅 碘 樵 胰 跻 憬 烩 岑 剿 哽 婪 憧 蟾 庵 羿 腋 沼 蹬 苓 孚 淼 瞥 涿 洙 孑 轶 稠 锰 蛰 骇 昙 彷 锵 羌 琍 恙 匈 鸾 璋 簇 撮 喋 掺 吝 狈 杷 祎 枇 嚷 徨 浣 荨 睾 羲 娠 琛 腑 菅 蒿 厮 憎 蕙 蜈 磋 鸵 漩 寮 晤 镀 赂 祀 孢 莘 濠 轼 婵 宸 踝 酉 萃 筱 钛 蛳 娩 沱 皎 苞 隽 眺 嫡 迥 莒 琥 藩 挟 寰 氟 僚 戟 鳖 夯 薏 绛 炽 沅 蔑 窘 庶 咻 脓 吆 蛊 曳 簪 眩 褐 磐 嗟 阪 呻 庾 覃 蓓 瘟 匕 轧 峭 肮 蟀 疣 邋 挎 痊 肴 辘 砰 芊 滦 嗖 町 匣 攘 戮 蓟 峙 濑 笃 鹫 谤 骸 蜥 戛 蟋 伎 镍 竺 胺 桨 俨 偈 搀 悸 遢 饽 酚 蚣 扼 幌 谧 骅 辄 剌 辍 惘 啬 蓑 岱 鲈 鲶 杳 喳 珉 瑙 赡 痧 螳 囡 壑 簸 蹋 悯 瞰 歆 煳 谛 踊 膛 蔼 傀 咤 隅 渲 殉 衙 燎 涝 砚 荃 拄 飕 儡 骝 拴 晦 唷 臻 漪 擀 烷 惺 瓯 慑 蘸 蚤 漉 挝 耘 瑛 窿 嘈 昱 嵊 茱 粕 阡 钵 渚 疵 甭 拗 嘚 盎 撅 瘸 匝 蹿 瘪 淆 卯 馁 盹 臊 泞 婀 讧 拈 徂 讷 痤 叟 湍 抡 啷 孬 仄 餮 亘 吮 碜 尻 齁 冗 纂 尕 叵 怄 酩 囔 旯 擘 哏 蓿 谆 噌 扃 忾 骈 逡 鳔 逋 忖 瓤 疃 嘬 捯 颙 耨 蒯 虿 蓊 剋 曩 抔 谝 哕 镲 夼 谮 撙 哞 耪 裉 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ABOUT ACCEPT ACCESS ACER AOSS ACT ACTION ACTIVITY ADD ADDRESS ADOBE AFRAID AFTER AFTERNOON AGAIN AGE AGO AGREEMENT AIR AJAX ALEX ALL ALMOST ALONE ALPHA ALREADY ALSO ALTHOUGH ALWAYS AM AMAZING AMAZON AMERICAN AMONG AN AND ANDROID ANGEL ANGELA ANGLE ANGRY ANIMAL ANIMALS ANOTHER ANSWER ANY ANYONE ANYTHING ANYWAY APEC APP APPLE APPLICATION APPOINTMENT APRIL ARE ARM ARMANI AROUND ART AS ASIA ASK ASS ASSISTANT AT ATTACK AUDI AUGUST AUTO AUTUMN AWAY AWESOME BACK BAD BAG BALANCE BALL BANANA BAND BANG BANK BAR BASE BASS BATTLE BE BEACH BEAN BEAR BEAST BEAT BEATS BEAUTIFUL BEAUTY BECAUSE BECOME BED BEE BEEN BEFORE BEGIN BEHIND BEING BELIEVE BELL BEN BEST BETA BETTER BEYOND BIG BILL BIN BINGO BIOS BIRD BIRTHDAY BIT BITCH BLACK BLESS BLIZZARD BLOOD BLUE BOARD BODY BOOK BOOT BORN BOSS BOT BOTH BOX BOY BOYS BRA BRAND BRAVE BREAK BREAKING BREATHE BRIDGE BRIGHT BRING BROKEN BROTHER BROWN BUFF BUG BUILD BULL BULLSHIT BUS BUSINESS BUSY BUT BUTTON BUY BY BYE CAKE CALIFORNIA CALL CAMERA CAN CANADA CANDY CANNOT CANON CANT CAPTAIN CAR CARD CAT CATCH CAUSE CELL CENTER CHAIR CHAIRMAN CHAMPION CHANCE CHANEL CHANGE CHARLOTTE CHECK CHEER CHEERS CHEESE CHERISH CHERRY CHICKEN CHILD CHILDREN CHINA CHOCOLATE CHOICE CHOOSE CHRISTMAS CHROME CINDY CISCO CITY CLASSIC CLEAN CLEAR CLOSE CLOUD CLUB COACH COCK COCO CODE COFFEE COLD COLLECTION COLLEGE COLOR COLUMBIA COM COME COMES COMING COMMON COMMUNICATION COMPANY COMPLETE COMPUTER CONCERT CONSTANCE CONTINUE CONTROL COOK COOKIE COOL COPY COSPLAY COST COSTA COULD COUNTRY COURAGE COURSE COVER AZY EAM EATE EW OSS OWN YSTAL CULTURE CUP CURRENT CUT CUTE DAILY DANCE DANCER DANCING DARK DATA DATE DAUGHTER DAVID DAY DEAD DEAL DEAR DEATH DEBUG DECEMBER DECISION DEEP DEER DEFAULT DELL DELPHI DESTINY DEVICE DIAMOND DICK DID DIE DIFFERENCE DIFFERENT DIFFICULT DINNER DIOR DISABLE DISCUSS DISCUSSION DISK DISNEY DISPLAY DISS DO DOES DOG DOING DONALD DONE DOOR DOS DOUBLE DOWN DRAGON DREAM DREAMER DREAMS DRESS DRIVE DRIVER DRY EACH EARTH EAST 
EAT EBAY ECHO EDGE EIGHT EINSTEIN EITHER ELEVEN ELSE EMAIL ENABLE END ENDING ENERGY ENGLISH ENJOY ENOUGH ENTER EVEN EVENING EVENT EVER EVERY EVERYBODY EVERYDAY EVERYONE EVERYTHING EVIL EXCEL EXCITING EXIST EXPLAIN EXPRESS EYE EYES FACE FACT FADED FAIL FAILED FAIRY FAITH FALL FAMILY FAN FANCY FANS FANTASTIC FAR FARM FASHION FAST FAT FATHER FAVORITE FEAR FEBRUARY FEEL FEELING FEET FESTIVAL FEW FIELD FIFA FIGHT FIGHTING FILE FILES FILL FILM FINAL FINALLY FIND FINE FINGER FINISH FIRE FIREFOX FIRST FISH FIT FIVE FLASH FLOOR FLOWER FLY FLYING FOOD FOOL FOOT FORCE FORHEIGN FOREVER FORGET FORM FORWARD FOUND FOUR FOX FRANK FREE FREEDOM FRESH FRIDAY FRIEND FRIENDS FROM FUCK FUCKING FULL FUN FUNCTION FUNNY FUTURE GAIN GALAXY GAME GAP GARDEN GATE GAY GEAR GEEK GEFORCE GENERAL GEORGE GET GETTING GHOST GIFT GIRL GIT GIVE GIVING GLASS GLOBAL GLORIA GO GOAL GOD GOES GOING GOLD GOLDEN GOLF GONE GONNA GOOD GOODBYE GOODNIGHT GOOGLE GOT GRACE GRADE GRAND GREAT GREATEST GREEN GROUND GROUP GROW GUCCI GUESS GUN GUY HAD HAIR HALF HAND HANDS HANG HAPPEN HAPPINESS HARD HARRY HARVARD HAS HATE HAVE HAVING HEAD HEALTH HEALTHY HEAR HEAT HEAVEN HEAVY HELEN HELL HELLO HELP HER HERE HERO HEY HI HIGH HILL HIM HIS HISTORY HIT HOLD HOLIDAY HOLMES HOME HONDA HONEY HONOR HOPE HOSPITAL HOST HOT HOUR HOUSE HOW HOWEVER HUG HUMAN HUNDRED HUNTER HURT HUSBAND ICE ICY IDEA IDOL IF IMAGE IMAGINE IMAX IMPOSSIBLE IN INCLUDE INDEED INDEX INFINITE INFORMATION INPUT INSIDE INSIGHT INSTALL INT INTEL INTEREST INTERESTING INTERNATIONAL INTERVIEW INTO IPHONE IPOD IRON IS ISLAND ITEM ITS IVY JACK JACKY JAM JAMES JANUARY JAPAN JAPANESE JASON JAVA JAY JAZZ JEAN JEEP JERRY JIM JOB JOE JOEY JOHN JOKER JORDAN JOURNAL JOURNEY JOY JUICY JULY JUMP JUNE JUNIOR JUNK JUST JUSTIN KATE KEEP KEY KEYBOARD KICK KID KIDS KILL KIM KIND KINDLE KING KISS KIT KITTY KNOCKING KNOW KOBE KOP KOREA KOREAN LAB LADY LAN LAND LANGUAGE LARGE LARRY LAST LATE LATER LAUGH LAUGHING LAW LAY LAZY LEAD LEADER LEARN LEARNING LEAVE LEFT LEG LEGAL LEMON LET LETTER LEVEL LICENSE LIGHT LIKE LILY LINE LINK LINUX LIST LISTEN LITTLE LIVE LIVING LOCAL LOCATION LOFT LOG LOGO LONDON LONELY LONG LOOK LOOKING LOSE LOSER LOST LOT LOVE LOVED LOVELY LOVER LOVING LOW LUCK LUCKY LUCY LUNCH MAC MACHINE MADE MAGIC MAIL MAIN MAJOR MAKE MAKES MALL MAMA MAN MANAGER MANGO MANY MAP MARCH MARCO MARK MARKET MARKETING MARRY MARTIN MARY MASTER MATCH MATE MATLAB MATTER MAX MAY MAYA ME MEANS MEDIA MEDICAL MEET MEETING MEMORY MEN MERRY MESSAGE METHOD METRO MEXICO MICHAEL MIO MIOPHONE MIOSOFT MID MIDDLE MIGHT MIKE MILES MILLION MIND MINE MINI MINUTE MIRROR MISS MISSING MIX MOBILE MOD MODE MODERN MOM MOMENT MONDAY MONEY MONICA MONKEY MONSTER MONTH MOON MORE MORNING MOST MOTHER MOUNTAIN MOUTH MOVE MOVIE MR MRS MUCH MUSE MUSIC MUST MY MYSELF NAME NASA NATION NATURAL NATURE NEAR NEARLY NEED NET NETWORK NEVER NEW NEWS NEWSPAPER NEWTON NEXT NEXUS NICE NICK NIGHT NIKE NINE NO NOBODY NOKIA NONE NORTH NOT NOTE NOTHING NOVEMBER NOW NULL NUMBER OBJECT OCCUR OCEAN OCTOBER OF OFF OFFER OFFICE OFFICIAL OFTEN OH OK OKAY OLD OLIVIA ON ONCE ONE ONES ONLINE ONLY OPEN OPPO OPTION OR ORACLE ORDER OSCAR OTHER OTHERS OUR OUT OUTLOOK OVER OWEN OXFORD PAD PAGE PAIN PANDA PANDORA PAPA PAPER PARENT PARIS PARK PART PARTNER PARTY PASS PASSION PASSWORD PAST PAY PAYPAL PEACE PENNY PEOPLE PERFECT PERSONAL PET PHONE PHOTO PHOTOS PHOTOSHOP PIANO PICK PICTURE PIECE PIG PIN PINK PIZZA PLACE PLAN PLAY PLEASE POCKET POINT POLICE POLO POOR POP POPULAR POSE POSITIVE POSSIBLE POST POWER PRACTICE PRE PREFER PRESENT PRESIDENT PRESS PRETTY PRICE PRINCE 
PRINCESS PRIVATE PRO PROBLEM PROCESS PROFESSIONAL PROFESSOR PROGRAM PROJECT PROMISE PROTECT PUBLIC PULL PUNK PURE PURPLE PUSH PUT PYTHON QUEEN QUEENIE QUESTION QUITE RABBIT RADIO RAIN RAINBOW RAISE RAM RAN RANGE RAP RATE RATHER RAY READ READY REAL REALLY REASON RECENT RECORD RED REGRET RELEASE REMAIN REMEMBER REPORT REPOST REQUEST RESEARCH REST RESULT RETURN REVIEW RICE RIGHT RING RISE RISK RIVER ROCK ROCKY ROLLING ROM ROOM ROOT ROYAL RULE RUN RUNNING RUSH SAD SAFE SAID SALES SAM SAME SAMSUNG SANDY SATURDAY SAVE SAY SCHOOL SCIENCE SCIENTIST SCIENTISTS SEEN SIPT SEASON SECOND SEET SEETARY SECTION SECURITY SEE SEED SELECT SELF SELL SEND SENIOR SENSE SEOUL SEPTEMBER SERIOUS SERVE SERVER SERVICE SET SETUP SEVERAL SEX SEXUAL SEXY SHADOW SHARE SHARP SHE SHELDON SHELL SHERLOCK SHIFT SHINE SHINING SHIRT SHIT SHOCK SHOES SHOOT SHOP SHORT SHOT SHOULD SHOW SHUFFLE SHY SIDE SILENCE SILENT SILVER SIM SIMPLE SINA SINCE SING SIR SIRI SISTER SIT SITE SIX SIZE SKIN SKY SKYPE SLEEP SMALL SMART SMILE SNEAKER SO SOCIAL SOCKET SOFTWARE SOLDIER SOLO SOME SOMEBODY SOMEONE SOMETHING SOMETIME SOMETIMES SON SONG SONY SOON SORRY SOUND SOURCE SOUTH SPA SPACE SPEAK SPEAKER SPECIAL SPEED SPEND SPIDER SPORT SPRING STAFF STAGE STAND STANFORD STAR STARS START STATE STATEMENT STATIC STATION STAY STEAM STEVE STEVEN STILL STONE STOP STORE STORY STREET STRING STRONG STRONGER STUDENT STUDIO STUDY STUFF STYLE SUB SUBJECT SUCCESS SUCH SUCK SUE SUGAR SUMMER SUN SUNDAY SUNFLOWER SUNNY SUNSHINE SUPER SUPERMAN SUPPER SUPPORT SURFACE SURPRISE SWEET SWIFT SYSTEM TAB TABLE TAG TAKE TALE TALK TANK TASK TATTOO TAX TAXI TAYLOR TEA TEACH TEACHER TEAM TEAMWORK TEARS TECHNOLOGY TED TELEVISION TELL TEN TENCENT TERESA TEST TEXT THAN THANK THANKS THAT THE THEIR THEM THEN THEORY THERE THESE THEY THING THINGS THINK THINKING THIRD THOSE THOUGH THOUGHT THOUSAND THREE THROUGH THROW THURSDAY TIFFANY TIGER TIME TIMELINE TIMES TIMING TIPS TIRED TITLE TO TODAY TOGETHER TOKYO TOM TOMATO TOMORROW TONIGHT TOO TOP TOTAL TOUCH TOUR TOWARD TOWN TRADE TRAVEL TREAT TREE TRICK TRIP TROUBLE TRUE TRUMP TRUST TRUTH TRY TUESDAY TURBO TURN TWITTER TWO TYPE UBER UBUNTU UNCLE UNDER UNDERSTAND UNIQUE UNIT UNITED UNITY UNIVERSITY UNIX UNTIL UP UPDATE US USE USER USUALLY VALUE VAMPIRE VAN VERSION VERY VIA VICTOR VICTORIA VIDEO VIEW VISA VISION VISTA VISUAL VIVO VOGUE VOICE VOID WAIT WAITING WAKE WALK WALKER WALL WANNA WAR WARM WASHINGTON WATCH WATER WATSON WAY WE WEAR WEB WEDDING WEDNESDAY WEEK WEEKEND WEIGHT WELCOME WELL WERE WEST WESTERN WHAT WHATEVER WHATS WHEN WHERE WHICH WHILE WHITE WHO WHOLE WHOM WHOSE WHY WIFE WIFI WIN WIND WINDOW WINDOWS WING WINNER WINNIE WINTER WISH WITH WITHIN WOLF WOMAN WOMEN WON WONDER WONT WORD WORK WORLD WORRY WORTH WOULD WOW WRITE WRONG XBOX YAHOO YEAH YEAR YELLOW YES YESTERDAY YET YOGA YORK YOU YOUNG YOUR YOURS YOURSELF YOUTH YOUTUBE ZARA ZERO ZIP ZONE ACCOUNT ADIDAS AGREE AIRPORT AMERICA AREA ASUS BABY BETWEEN BLOG CAFE CARE CLASS DADDY DAYS DESIGN DOCTOR EASY ECLIPSE EGG ERROR EXCHANGE EXPLORER FACTOR FANTASY FATE FOCUS FOR GOSSIP GRAY HAPPY HE HEART HOLY HOTEL IMPORTANT INTERNET IT JOBS JOIN KINGDOM LESS LIE LION LOAD MACBOOK MAGAZINE MAKING MAYBE MEAN MILK MINOR MIRACLE MODEL MUMMY NOVA NVIDIA ORANGE PERSON PIRATE PLAYER PLUS POSITION PRINT RACE RAINY REQUIRE ROAD ROSE SAT SCHEDULE SCORE SEVEN SHOPPING SIGN SINGLE SNOW SOFT SOUL STEP STUPID SURE TEDDY THIS TIM TWINS USED VISIT WANT WAS WILL WINE WITHOUT WONDERFUL WORDS WORKER EXISTS ACFUN AKB ANGELABABY BEAUTYLEG BIEBER BIGBANG CHINAJOY CHRISTY COULDNT CSGO DEBIAN DIDNT DOESNT DONT 
DOTA MOJI FACEBOOK GANK GATES GIRLS GUYS HIPHOP HTML ICLOUD INS INSTAGRAM IPAD JQUERY LOLY LOVES MEMORIES OCLOCK OLAY PEPPA PRADA ROLEX SDCARD SQL STARBUCKS TFBOYS THINKPAD VISIO WASNT WEARE WECHAT WEIBO YEARS ESPACE LIFE CHINESE EXIT SEA EXPERIENCE LETUS CASE HDMI EMUI FOLLOW CARRY MIKEY WCDMA BILIBILI FOREIGN CAMARA AMBER KEIL OBAMA HERMES BURBERRY ASCII ARIES LETS SWATCH QQ YY AV ID WLAN KTV CF CP CS DNF MM MV PK PP PPT PS TXT VS AIRPLANE BLUETOOTH AI OFO IOS API AARON ABBY ABEL ABRAHAM ADA ADAM ADRIAN ADRIENNE AGNES AID AKON ALAN ALBERT ALBERTO ALEJANDRO ALEXANDER ALEXANDRA ALEXIS ALFONSO ALFRED ALFREDO ALICE ALICIA ALIN ALISON ALLAH ALLAN ALLEN ALLIN ALMA ALONZO ALTON ALVIN ALYSSA AMANDA AMANI AMELIA AMOS AMY ANDRE ANDREA ANDRES ANDREW ANDY ANGELINA ANGELO ANGIE ANITA ANN ANNA ANNE ANNETTE ANNIE ANNY ANTHONY ANTOINETTE ANTONIA ANTONIO ARA ARCHIE ARD ARENA ARLENE ARMANDO ARNOLD ARON ARTHUR ARTURO ASHLEY ATLAS AUBREY AUDREY AUSTIN AVRIL BAN BANDA BANDER BARBARA BARK BARRY BAT BEATRICE BECKY BELINDA BENJAMIN BENNIE BENNY BERNADETTE BERNARD BERNICE BERT BESSIE BETH BETHANY BETSY BETTY BEVERLY BEY BEYONCE BILLIE BILLY BING BLAKE BLANCA BOB BOBBIE BOBBY BOBO BODDY BONNIE BOOS BOYD BRAD BRADFORD BRADLEY BRANDI BRANDON BRENDA BRENDAN BRETT BREW BRIAN BRIDGET BRIGHTMAN BRITTANY BROOKE BRUCE BRYAN BRYANT BY2 BYRON CABLE CALEB CALVIN CAMERON CAMILLE CANDACE CANDICE CARL CARLA CARLOS CARLTON CARLY CARMEN CAROL CAROLE CAROLINE CAROLYN CARRIE CARROLL CARY CASEY CASSANDRA CATHERINE CATHY CECELIA CECIL CECILIA CEDRIC CELIA CELINE CESAR CHARLENE CHARLES CHARLIE CHELSEA CHERYL CHESTER CHRIS CHRISTIAN CHRISTIE CHRISTINA CHRISTINE CHRISTOPHER CLAIRE CLAN CLARA CLARE CLARENCE CLARK CLARKSON CLAUDE CLAUDIA CLAY CLAYTON CLEVER CLIFFORD CLIFTON CLINT CLINTON CLYDE COCOLEE CODY COLIN COLLAR COLLEEN CONNIE CONNOR CONRAD CONVERSE CORA COREY CORNELIUS CORTANA CORY COS COURTNEY AIG ISTINA CYNTHIA DADY DAISY DALE DAMON DANIEL DANIELLE DANNY DARIN DARLA DARLENE DARLING DARNELL DARREL DARRELL DARREN DARRIN DARRYL DARYL DASH DAVE DAWN DEANNA DEBBIE DEBORAH DEBRA DELBERT DELIA DELLA DELORES DENISE DENNIS DEREK DERRICK DESIREE DEVIN DIANA DIANE DIANNA DIANNE DINA DINE DION DIXIE DOLORES DOMINIC DOMINICK DONNA DONNIE DORA DOREEN DORIS DOROTHY DOT DOTT DOUG DOUGLAS DOYLE DRING DUANE DUSTIN DWAYNE DWIGHT EAGLES EARL EARNEST EASON EBONY ECONOMY EDDIE EDGAR EDITH EDMOND EDMUND EDNA EDUARDO EDWARD EDWIN EILEEN ELAINE ELBERT ELEANOR ELENA ELIJAH ELISA ELIZABETH ELLA ELLEN ELLIS ELMER ELOISE ELSA ELSIE ELVIRA EMANUEL EMILIO EMILY EMINEM EMMA EMMETT ENRIQUE ERICA ERICK ERIK ERIKA ERIN ERMA ERNESTINE ERNESTO ERVIN ESSIE ESTELLE ESTHER ETHEL EUGENE EUNICE EVA EVAN EVELYN EVERETT EXO FANNIE FANNY FAYE FELICIA FELIPE FELIX FERNANDO FIR FLORA FLORENCE FLOYD FORD FORREST FRANCES FRANCIS FRANCISCO FRANKIE FRANKLIN FRED FREDDIE FREDERICK FREDRICK FRIENDLY FRY GABRIEL GAGA GAIL GAILE GALA GARRETT GARRY GARY GAYLE GEE GEM GENEVIEVE GEOFFREY GEORGIA GERALD GERARD GERTRUDE GILBERT GILBERTO GINA GINGER GLAD GLADYS GLEN GLENDA GLENN GODEN GOMEZ GORDON GRADY GRANT GREG GREGG GREGORY GRETCHEN GREY GRIMES GUILLERMO GUL GUSTAVO GWEN GWENDOLYN HANNAH HAROLD HARRELL HARRIET HARVEY HATTIE HAYES HEAL HEATHER HEBE HECTOR HEIDI HELLE HENRIETTA HENRY HERBERT HERMAN HILDA HOLLE HOLLY HOMER HORACE HOWARD HUBERT HUGH HUGO IAN IGNACIO IKE INEZ IRA IRENE IRMA IRVIN IRVING ISAAC ISABEL ISIS ISMAEL IVAN JACKI JACKIE JACKSON JACOB JACQUELINE JACQUELYN JAIME JAKE JAMIE JANE JANET JANICE JANIE JANIS JARED JASMINE JAYDEN JEANETTE JEANNE JEANNETTE JEANNIE JEANS 
JEFF JEFFERY JEFFREY JENNA JENNIE JENNIFER JENNY JEPSEN JEREMY JERMAINE JEROME JESSE JESSICA JESSIE JESUS JILL JIMMIE JIMMY JOAN JOANN JOANNA JOANNE JODY JOEL JOHANNA JOHNATHAN JOHNNIE JOHNNY JOJO JOKE JOLIN JON JONATHAN JONATHON JOSEPH JOSEPHINE JOSH JOSHUA JOYCE JUANA JUDE JUDITH JUDY JULIA JULIAN JULIE JULIUS KAELA KARA KAREN KARL KARLA KATHERINE KATHLEEN KATHRYN KATHY KATIE KATRINA KATY KAY KAYLA KEITH KELLER KELLEY KELLI KENDRA KERRY KEVIN KIMBERLY KIMI KIRK KONE KRIS KRISTEN KRISTI KRISTINE KRISTOPHER KRISTY LANA LAURA LAUREN LAWRENCE LEAH LEE LELA LELAND LENKA LEON LEONA LESLIE LESTER LETICIA LEWIS LILLIAN LILLIE LINDA LINDSEY LIONEL LISA LOIS LOLA LOREN LORENA LORENZO LORETTA LORI LORRAINE LOUIS LOUISE LOVATO LUCAS LUCIA LUCILLE LUIS LUKE LULA LUTHER LYLE LYNDA LYNETTE LYNN LYNNE MABEL MAKIYO MANDY MANUEL MARCELLA MARCOS MARGARET MARGARITA MARGIE MARIANNE MARIE MARILYN MARIO MARION MARJORIE MARLON MARSHA MARTA MARTHA MARTY MARVIN MARX MARYANN MATTHEW MATTIE MAUREEN MAURICE MAXINE MEGAN MELANIE MELBA MELINDA MELISSA MELODY MELVIN MEREDITH MICHAELJACKSON MICHALE MICHEAL MICHELE MICHELLE MIDI MIGUEL MILDRED MILTON MIMI MINDY MIRANDA MIRIAM MISTY MITCHELL MOMO MONA MOR MORRIS MOSES MUMA MURIEL MYRA NADINE NAIM NANCY NAOMI NATALIE NATASHA NATHAN NATHANIEL NELLIE NETTIE NICHOLAS NICHOLE NICOLAS NICOLE NIMMO NINA NORA NORMAN NORWOOD OLIVER OLLIE OMAR ORA ORLANDO ORVILLE OTIS PAM PAMELA PATRICIA PATRICK PATTI PATY PAUL PAULA PEGGY PERCY PETER PHILIP PHILLIP PRESTON PRISCILLA QUEENA RACHAEL RACHEL RAFAEL RAMONA RANDAL RANDALL RANDOLPH RANDY RAYMOND REBECCA REGINALD RENE RENEE RHONDA RICARDO RICHARD RIHANNA RITA RITE ROBBIE ROBBIEWILLIAMS ROBERT ROBERTA ROBERTO ROBYN ROCHELLE RODNEY ROGELIO ROGER ROLAND ROLANDO RONALD RONNIE ROOSEVELT ROSA ROSALIE ROSIE ROSS ROXANNE RUBEN RUDOLPH RUDY RUFUS RUSSELL RUTH RYAN SABRINA SADIE SAMANTHA SAMMY SAMUEL SANDRA SARA SARAH SAUL SCOTT SEAN SELENA SELINA SERGIO SETH SHAKIRA SHANE SHANIA SHANICE SHANNON SHARI SHARON SHAUN SHAWN SHAWNA SHAYNE SHEILA SHELIA SHELLEY SHERI SHERMAN SHERRI SHERYL SHIRLEY SIDNEY SILVIA SIMON SMITH SODA SOFIA SOM SONIA SONYA SOPHIA SOPHIE STACEY STACY STADION STANLEY STEPHEN STEVIE SUSAN SUSIE SUZANNE TABITHA TAMARA TANNER TANYA TEMPO TERENCE TERRANCE TERRENCE TERRI TERRY THELMA THEODORE THERESA THOMAS TIMBALAND TIMMY TINA TOBY TOMAS TOMMIE TOMMY TONI TONYA TRACEY TRACI TRACY TRAVIS TREVOR TRICIA VALERIE VANESSA VELMA VERNA VERNON VICKI VICKIE VIOLET VIRGIL VIRGINIA VITAS VIVIAN WALLACE WANDA WARD WAYNE WENDELL WENDY WESLEY WILLIAM WELLS WILBERT WILBUR WILFRED WILLARD WILLIAMS WILLIE WILLIS WILMA WILSON WINIFRED WINSTON YOLANDA YVONNE ZICO STELLA KELLY BYRNE STEPHANIE SILIENT TODD ALBERTA KENNETH NELSON ERNEST MINNIE GDRAGON JOSE SYLVIA JEREMIAH FRIENDSHIP JUAN DEAN PHYLLIS KATYPERRY CARLYRAE RICKY JIMKELLER PATSY BEA JAVIER CURTIS ALANCHEN MARCUS DREW JORGE FREDA CHE WALTER JULIO JUANITA RAQUEL LLOYD WADE MARIA STEWART HAZEL NORMA DRAW DAVEY SALLY DON LELAN EARLY VINCENT ROY TAMMY JOSEFINA ERIC MARCIA CLAYDERMAN TONY BYTWO FATHERS HENE RUBY LEONARD DEWEY RAUL TIMOTHY GAMES CARPENTERS ALLISON AMN LADYGAGA WINDS PERCENT MAKER TIZZY SNH48 BACKSTREET AIMER JONY AKB48 VICTORY CONTRACTIONSHOULDNT CONTRACTIONWHERELL CONTRACTIONWOULDNT CONTRACTIONCOULDNT CONTRACTIONMIGHTNT CONTRACTIONTHEYRE CONTRACTIONTHEYLL CONTRACTIONTHEYVE CONTRACTIONTHATLL CONTRACTIONWHATRE CONTRACTIONWHATLL CONTRACTIONWHERES CONTRACTIONWHERED CONTRACTIONWHENLL CONTRACTIONWERENT CONTRACTIONHAVENT CONTRACTIONDOESNT CONTRACTIONMUSTNT CONTRACTIONOCLOCK CONTRACTIONYOURE 
CONTRACTIONYOULL CONTRACTIONYOUVE CONTRACTIONSHELL CONTRACTIONTHEYD CONTRACTIONTHATS CONTRACTIONTHATD CONTRACTIONWHOLL CONTRACTIONWHATS CONTRACTIONWHATD CONTRACTIONWHENS CONTRACTIONWHEND CONTRACTIONWHYLL CONTRACTIONHOWLL CONTRACTIONARENT CONTRACTIONWASNT CONTRACTIONHASNT CONTRACTIONHADNT CONTRACTIONDIDNT CONTRACTIONYOUD CONTRACTIONHELL CONTRACTIONSHES CONTRACTIONSHED CONTRACTIONITLL CONTRACTIONWERE CONTRACTIONWELL CONTRACTIONWEVE CONTRACTIONWHOS CONTRACTIONWHOD CONTRACTIONWHYS CONTRACTIONWHYD CONTRACTIONHOWS CONTRACTIONHOWD CONTRACTIONISNT CONTRACTIONWONT CONTRACTIONDONT CONTRACTIONCANT CONTRACTIONILL CONTRACTIONIVE CONTRACTIONHES CONTRACTIONHED CONTRACTIONITS CONTRACTIONITD CONTRACTIONWED CONTRACTIONIM CONTRACTIONID SHOUD MISTER FENDIMAN BABYFACE FOURTY LINKA BEATLESS STORIES KISKIS ANYMORE ODYSSEY EVERYWHERE DEVILSLINE IMAGES USAHANA ACCIDENTALLY INGRESS MAINTENANT DREAMERS OVERLORD RELOAD SAFARI ROBERTS SUNSET MAGGIE ROLL FUSE ABS UME ANIMA SHANGHAI ENCORE MANHAND LIN LOCKING SWITCH TRUCK WORKING BABYLOONZ DEMO YELL RUNFOX RERIDED FALLER WARRIORS IMAGINATION AUMPAIPORN BRAHMS JOYJOY REGENESIS LESLI ISAIAH MOFY YHBOYS ENDLESSWHITE MOUVEMENT SLOW AKI CUTIE HELLOKITTY REALIZE MUFFIN AEW VIVID KRAWITZ BREATH MACH DIY HEYKIDS GEORGEN EXTRA DETROIT HELENA SUNNEE COMIC RELIFE PIPI NEO BLOSSOM KAIJUDO LULLABY GENNEO IDOLISH YOUTHFUL RANGERS BABYJOYJOY BUTLERS BLAST AICO ABCSONG CCUP ZEROG GAI AINY HELLOKONGZI HUTOS SPARK HO KITTISAK HAPPYKISS REMIX MEGA HIP HOP JAE HRS FOURTH FIFTH AH FORK LIP MRRIGHT SAIL HOLLYWOOD PANAMA PIS DOC RESPECT SALALA VENUS MARS CAGE CHEMICAL CONGRATULATIONS KALA SOSO INSTRUMENT SICKNESS DISCO CYBER ACOUSTIC EDEN INFINITY BRAIN HIPOP DIMENSION ROSY CHACHA PASTY TA SHINNING VITAMIN FREESTYLE SIGNAL CEO VAE GENTLEMAN XMAS DROP ZOMBIE ADVENTURE SCAPE HOHO SPELL IQ NANA PRECIOUS ICHIBAN SUPERSTAR TRULY INTRO OS GUIDANCE JINGLE SHAKE SHAPE FRAGMENT ONER SENS ITTY BLOCK SHELLY PANTA HOLLOW NOISE PAT REVERSE OMEN RUSSIAN YOO PUPPY PRISM NONONO DEPARTURE CBA NBA CHAOS HORIZON INTRODUCTION GUARD GUARDIAN PHANTOM JEWEL INNOVATION SWING WALTZ WEIWEI PRIDE PLANET COMPLEX DOUBT CHANNEL ACHE ORZ KIKI LEO CC YOYO LINA HEDI GROWTH MOOSE CLASSICAL WEED SONIC TASTE WISE JOHNSON HA PAC 5IVE MEI TIC TRACK JUSTINE LIQUID DIARY MISSION SMOKE SMOKING NEIGHBOR LESSON FILTHY SMASH DRAMA CLIMAX FAKE LAKE DOLLAR SAN COMEDY CHAT CENTRAL LATELY LAWYER SKETCH FOSTER CHARACTER GAMER AWARD CHEF COMMERCIAL MACDONALD JAIL PAIR TITAN MAXIMUM STABLE UNSTABLE NORTON ULTRA REVOLUTION TICKET ANCIENT CAVE DEMAND NOBODIES UNTITLED MARVEL CINEMATIC CINEMA BEGAN DANGEROUS ASSASSIN KNIGHT DISCOVERY YO MA JUSTICE CARSON JAMIESON MORRISON PINSON DE SE VIP YANG MARRON GENERATION PUTH TROYE SIVAN ADELE LINKIN BRUNO DIRECTION ED SHEERAN OWL CNBLUE AMIGO WASH CHICK ABOVE ACID IDIOT ASH BACH SUITE BAMBOO BANDARI BARI IMPROV BEATRICH BLAH BLAME BODAK BON BONEY BOOM CLAP WILD BREACH CHINATWO CAUGHT CONSOUL TRAININ OOKED YANKEE DARKNESS BOWIE DECK HALL DESPICABLE DEVOTE DINERO DISCONNECT DIVE DJETHAN ORLEANS TONITE DRUNK ELECTRICITY ELSTEN TORRES EMPIRE EMPTY DRINK FADE FAMOUS RAINCOAT FIX UPPER FLOW FREAK FURTHER CHOP GOTTA ADDICT GROOVE COVERAGE MANE HALO AINT HOOD REMIND HYDE POLAND PRAYER ILL IMAGINARY IMMIGRANT IMMORTAL ORIGINALLY PERFORM INNOCENCE JONES SUIT PAID KANE KESHA KNOCK VALENTINE APART TEAR STRANGER STRIKE MACK MAD MULLET MARIAH CAREY PETRIE MATTEO MAXIMILIAN HECKER AGAINST MEGHAN TRAINOR MERCY RONSON INVINCIBLE PERRY MILLIONAIRE TEMPLE THEME LIBRARY MYSTERY NEVADE ALASKA OLLY MURS IM OPERA PAINT 
PARADISE CHAMPAGNE POLLY MALONE PRAY SWAN MARRAKESH ROCKET ROUND SNHFOURTY BOAT SCARDO SCARE SCARED SEVE SHALL SHAME SHEEP EFFECT SNITCH SOCAN SOFI TUKER LOVIN SPIRIT SPOON STEPPENWOLF STRAWBERRY CIGARETTE SUEDE GRILL SHOULDER SHOUT USA SURVIVE PROPHET SUTTER SWIFTY BEATLE CHAIN SMOKER MAGICAL SPECTRE TRAIN RAT GOIN CAME THOMSTON TOCH TREASURE PILOT TWENTY UNBELIEVABLE UNLESS UNPREDICTABLE UPTOWN FUNK UTWO VARIOUS ARTIST WICK WONDERLAND WINKY WRECKING DUMB BROKE YOURE AUDIO UNRAVEL ASHES WASTE WEATHER PARADE BURN BURNING @ [OOV] [BLANK] +[NULL] [START] [SEP] [MASK] a0 a1 ai1 ai2 ai3 ai4 an1 an3 an4 ang1 ang2 ang4 ao1 ao2 ao3 ao4 ba1 ba2 ba3 ba4 bai1 bai2 bai3 bai4 ban1 ban3 ban4 bang1 bang3 bang4 bao1 bao2 bao3 bao4 bei0 bei1 bei3 bei4 ben1 ben3 ben4 beng1 beng2 beng4 bi1 bi2 bi3 bi4 bian1 bian3 bian4 biao1 biao3 biao4 bie1 bie2 bie3 bie4 bin1 bin4 bing1 bing3 bing4 bo0 bo1 bo2 bo3 bo4 bu1 bu2 bu3 bu4 ca1 cai1 cai2 cai3 cai4 can1 can2 can3 can4 cang1 cang2 cao1 cao2 cao3 ce4 cen1 cen2 ceng1 ceng2 ceng4 cha1 cha2 cha3 cha4 chai1 chai2 chai4 chan1 chan2 chan3 chan4 chang1 chang2 chang3 chang4 chao1 chao2 chao3 che1 che3 che4 chen1 chen2 chen3 chen4 cheng1 cheng2 cheng3 cheng4 chi1 chi2 chi3 chi4 chong1 chong2 chong3 chong4 chou1 chou2 chou3 chou4 chu1 chu2 chu3 chu4 chuai1 chuai3 chuai4 chuan1 chuan2 chuan3 chuan4 chuang1 chuang2 chuang3 chuang4 chui1 chui2 chun1 chun2 chun3 chuo1 chuo4 ci1 ci2 ci3 ci4 cong1 cong2 cou4 cu1 cu2 cu4 cuan1 cuan2 cuan4 cui1 cui3 cui4 cun1 cun2 cun3 cun4 cuo1 cuo2 cuo4 da0 da1 da2 da3 da4 dai1 dai3 dai4 dan1 dan3 dan4 dang1 dang3 dang4 dao1 dao2 dao3 dao4 de0 de1 de2 de4 dei3 deng1 deng3 deng4 di1 di2 di3 di4 dia3 dian1 dian3 dian4 diao1 diao3 diao4 die1 die2 ding1 ding3 ding4 diu1 dong1 dong3 dong4 dou1 dou3 dou4 du1 du2 du3 du4 duan1 duan3 duan4 dui1 dui3 dui4 dun1 dun3 dun4 duo1 duo2 duo3 duo4 e1 e2 e3 e4 ei1 en1 en4 er0 er2 er3 er4 fa1 fa2 fa3 fa4 fan1 fan2 fan3 fan4 fang1 fang2 fang3 fang4 fei1 fei2 fei3 fei4 fen1 fen2 fen3 fen4 feng1 feng2 feng3 feng4 fo2 fou3 fu1 fu2 fu3 fu4 ga1 ga2 ga3 ga4 gai1 gai3 gai4 gan1 gan3 gan4 gang1 gang3 gang4 gao1 gao3 gao4 ge1 ge2 ge3 ge4 gei3 gen1 gen2 gen4 geng1 geng3 geng4 gong1 gong3 gong4 gou1 gou3 gou4 gu1 gu3 gu4 gua1 gua3 gua4 guai1 guai3 guai4 guan1 guan3 guan4 guang1 guang3 guang4 gui1 gui3 gui4 gun3 gun4 guo1 guo2 guo3 guo4 ha1 ha2 hai1 hai2 hai3 hai4 han1 han2 han3 han4 hang1 hang2 hang4 hao1 hao2 hao3 hao4 he1 he2 he4 hei1 hen2 hen3 hen4 heng1 heng2 heng4 hong1 hong2 hong3 hong4 hou1 hou2 hou3 hou4 hu1 hu2 hu3 hu4 hua1 hua2 hua4 huai2 huai4 huan1 huan2 huan3 huan4 huang1 huang2 huang3 huang4 hui1 hui2 hui3 hui4 hun1 hun2 hun4 huo1 huo2 huo3 huo4 ji1 ji2 ji3 ji4 jia1 jia2 jia3 jia4 jian1 jian3 jian4 jiang1 jiang3 jiang4 jiao1 jiao2 jiao3 jiao4 jie1 jie2 jie3 jie4 jin1 jin3 jin4 jing1 jing3 jing4 jiong1 jiong3 jiu1 jiu3 jiu4 ju1 ju2 ju3 ju4 juan1 juan3 juan4 jue1 jue2 jue4 jun1 jun4 ka1 ka3 kai1 kai3 kai4 kan1 kan3 kan4 kang1 kang2 kang4 kao1 kao3 kao4 ke1 ke2 ke3 ke4 kei1 ken3 ken4 keng1 kong1 kong3 kong4 kou1 kou3 kou4 ku1 ku3 ku4 kua1 kua3 kua4 kuai3 kuai4 kuan1 kuan3 kuang1 kuang2 kuang3 kuang4 kui1 kui2 kui3 kui4 kun1 kun3 kun4 kuo4 la1 la2 la3 la4 lai2 lai4 lan2 lan3 lan4 lang1 lang2 lang3 lang4 lao1 lao2 lao3 lao4 le0 le4 lei0 lei1 lei2 lei3 lei4 leng2 leng3 leng4 li0 li1 li2 li3 li4 lia3 lian2 lian3 lian4 liang2 liang3 liang4 liao2 liao3 liao4 lie1 lie3 lie4 lin1 lin2 lin3 lin4 ling2 ling3 ling4 liu1 liu2 liu3 liu4 lo0 long2 long3 lou1 lou2 lou3 lou4 lu1 lu2 lu3 lu4 luan2 luan3 luan4 lun1 lun2 lun4 luo1 luo2 
luo3 luo4 lv2 lv3 lv4 lve4 ma0 ma1 ma2 ma3 ma4 mai2 mai3 mai4 man1 man2 man3 man4 mang2 mang3 mao1 mao2 mao3 mao4 me0 mei2 mei3 mei4 men0 men1 men2 men4 meng1 meng2 meng3 meng4 mi1 mi2 mi3 mi4 mian2 mian3 mian4 miao1 miao2 miao3 miao4 mie1 mie4 min2 min3 ming2 ming3 ming4 miu4 mo1 mo2 mo3 mo4 mou1 mou2 mou3 mu2 mu3 mu4 na2 na3 na4 nai3 nai4 nan1 nan2 nan3 nan4 nang1 nang2 nang3 nao1 nao2 nao3 nao4 ne0 ne2 ne4 nei3 nei4 nen4 neng2 ng0 ni1 ni2 ni3 ni4 nian1 nian2 nian3 nian4 niang2 niang4 niao3 niao4 nie1 nie4 nin2 ning2 ning3 ning4 niu1 niu2 niu3 niu4 nong2 nong4 nou4 nu2 nu3 nu4 nuan3 nuo2 nuo4 nv3 nve4 o1 ou1 ou3 ou4 pa1 pa2 pa4 pai1 pai2 pai3 pai4 pan1 pan2 pan4 pang1 pang2 pang3 pang4 pao1 pao2 pao3 pao4 pei1 pei2 pei4 pen1 pen2 peng1 peng2 peng3 peng4 pi1 pi2 pi3 pi4 pian1 pian2 pian3 pian4 piao1 piao2 piao3 piao4 pie1 pie3 pin1 pin2 pin3 pin4 ping1 ping2 po1 po2 po3 po4 pou1 pou2 pu1 pu2 pu3 pu4 qi1 qi2 qi3 qi4 qia1 qia3 qia4 qian1 qian2 qian3 qian4 qiang1 qiang2 qiang3 qiang4 qiao1 qiao2 qiao3 qiao4 qie1 qie2 qie3 qie4 qin1 qin2 qin3 qin4 qing1 qing2 qing3 qing4 qiong2 qiu1 qiu2 qiu3 qu1 qu2 qu3 qu4 quan1 quan2 quan3 quan4 que1 que2 que4 qun1 qun2 ran2 ran3 rang2 rang3 rang4 rao2 rao3 rao4 re3 re4 ren2 ren3 ren4 reng1 reng2 ri4 rong2 rong3 rou2 rou4 ru2 ru3 ru4 ruan3 rui2 rui3 rui4 run4 ruo4 sa1 sa3 sa4 sai1 sai4 san1 san3 san4 sang1 sang3 sang4 sao1 sao3 sao4 se4 sen1 seng1 sha1 sha2 sha3 sha4 shai1 shai3 shai4 shan1 shan3 shan4 shang0 shang1 shang3 shang4 shao1 shao2 shao3 shao4 she1 she2 she3 she4 shei2 shen1 shen2 shen3 shen4 sheng1 sheng2 sheng3 sheng4 shi0 shi1 shi2 shi3 shi4 shou1 shou3 shou4 shu1 shu2 shu3 shu4 shua1 shua3 shuai1 shuai3 shuai4 shuan1 shuan4 shuang1 shuang3 shui3 shui4 shun3 shun4 shuo1 shuo4 si1 si3 si4 song1 song3 song4 sou1 sou3 sou4 su1 su2 su4 suan1 suan4 sui1 sui2 sui3 sui4 sun1 sun3 suo1 suo3 ta1 ta3 ta4 tai1 tai2 tai4 tan1 tan2 tan3 tan4 tang1 tang2 tang3 tang4 tao1 tao2 tao3 tao4 te4 teng2 ti1 ti2 ti3 ti4 tian1 tian2 tian3 tiao1 tiao2 tiao3 tiao4 tie1 tie3 tie4 ting1 ting2 ting3 tong1 tong2 tong3 tong4 tou1 tou2 tou4 tu1 tu2 tu3 tu4 tuan1 tuan2 tuan3 tui1 tui2 tui3 tui4 tun1 tun2 tuo1 tuo2 tuo3 tuo4 wa1 wa2 wa3 wa4 wai1 wai3 wai4 wan1 wan2 wan3 wan4 wang1 wang2 wang3 wang4 wei1 wei2 wei3 wei4 wen1 wen2 wen3 wen4 weng1 weng3 weng4 wo1 wo3 wo4 wu1 wu2 wu3 wu4 xi1 xi2 xi3 xi4 xia1 xia2 xia4 xian1 xian2 xian3 xian4 xiang1 xiang2 xiang3 xiang4 xiao1 xiao2 xiao3 xiao4 xie1 xie2 xie3 xie4 xin1 xin4 xing1 xing2 xing3 xing4 xiong1 xiong2 xiu1 xiu3 xiu4 xu0 xu1 xu2 xu3 xu4 xuan1 xuan2 xuan3 xuan4 xue1 xue2 xue3 xue4 xun1 xun2 xun4 ya0 ya1 ya2 ya3 ya4 yan1 yan2 yan3 yan4 yang1 yang2 yang3 yang4 yao1 yao2 yao3 yao4 ye1 ye2 ye3 ye4 yi1 yi2 yi3 yi4 yin1 yin2 yin3 yin4 ying1 ying2 ying3 ying4 yo1 yong1 yong2 yong3 yong4 you1 you2 you3 you4 yu1 yu2 yu3 yu4 yuan1 yuan2 yuan3 yuan4 yue1 yue3 yue4 yun1 yun2 yun3 yun4 za1 za2 za3 zai1 zai3 zai4 zan1 zan2 zan3 zan4 zang1 zang4 zao1 zao2 zao3 zao4 ze2 ze4 zei2 zen3 zen4 zeng1 zeng4 zha1 zha2 zha3 zha4 zhai1 zhai2 zhai3 zhai4 zhan1 zhan3 zhan4 zhang1 zhang3 zhang4 zhao1 zhao2 zhao3 zhao4 zhe0 zhe1 zhe2 zhe3 zhe4 zhen1 zhen3 zhen4 zheng1 zheng3 zheng4 zhi1 zhi2 zhi3 zhi4 zhong1 zhong3 zhong4 zhou1 zhou2 zhou3 zhou4 zhu1 zhu2 zhu3 zhu4 zhua1 zhua3 zhuai4 zhuan1 zhuan3 zhuan4 zhuang1 zhuang4 zhui1 zhui4 zhun1 zhun3 zhuo1 zhuo2 zi1 zi3 zi4 zong1 zong3 zong4 zou1 zou3 zou4 zu1 zu2 zu3 zuan1 zuan3 zuan4 zui3 zui4 zun1 zun3 zuo1 zuo2 zuo3 zuo4 a ai an ang ao ar b ba bai ban bang bao be bea bei ben beng ber bew bi bien bin bo bou 
bu cen d da dai dan dang de dea dei den di dia din diu do dong dou dra drai drhi drhing dro drong du e ea ei en er f fa fai fan fang fe fea fei fen feng few fi fing fo fou ft fu g ga gai gan ge gea gei gen gi gin go gong gou gu ha hai han hang hao he hea hei hew hi ho hong hou hu jei jew ji jia jiang jie jien jier jin jio jiong jiu jue juer jun k ka kai kan kang kea kei ken keng ker kew ki kin king kiou ko kon kou kre krhi krhin kro ks ksi kt ku kuai kwea kwi kwin la lai lan lang lao le lea lei len lew li lien lin ling lo long lou lu lun m ma mai man mang mao me mea mei men mi min ming miu mo mou mp ms mu mun na nai nan nao ne nea nei nen new ni ning no nou o ong ou p pa pai pan pang pe pea pei pen pew pi pia pien pin ping po pot pou ps pt pu q qai qew qi qia qiang qie qien qier qio quei rha rhai rhan rhang rhe rhea rhei rhen rhi rhin rhing rho rhong rhou ru ruo rza rzan rzao rzea rzei rzen rzer rzi rzin rzing rzong rzou s sa sai san sang sao se sea see sei sen sew sin sk so song sou sp st su sun t ta tai tan tang te tea tei ten tew ti tin ting to tong tou tra trai trhi trou tru ts tu twi twin v va vai van vea ven ver vew vi ving vo vou vrhi vs wa wai wan wang wao we wea wei wen wer wi win wo wong wu xew xi xia xiai xian xie xien xier xin xing xio xiu ya yang ye yen yer yew yi yier yin yo you zi [OOV] [BLANK] diff --git a/inference/examples/automatic_speech_recognition/asr_rnnt.cpp b/inference/examples/automatic_speech_recognition/asr_rnnt.cpp new file mode 100644 index 00000000..ae005d55 --- /dev/null +++ b/inference/examples/automatic_speech_recognition/asr_rnnt.cpp @@ -0,0 +1,110 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+
+#include <iostream>
+
+#include "inference.hpp"
+#include "tensor.hpp"
+#include "data_loader.hpp"
+#include "profiling.h"
+#include "parse_command.h"
+
+int main(int argc, char *argv[])
+{
+    UNI_TIME_INIT
+    ParseRes parse_res;
+    parseCommandLine(argc, argv, &parse_res, "examples");
+
+    char *modelPath = (char *)"";
+    char *sequenceDirectory = (char *)"";
+    char *affinityPolicyName = (char *)"";
+
+    if (!parse_res.model.second) {
+        exit(-1);
+    }
+    if (parse_res.model.second) {
+        modelPath = parse_res.model.first;
+    }
+    if (parse_res.inputPath.second) {
+        sequenceDirectory = parse_res.inputPath.first;
+    }
+    if (parse_res.archInfo.second) {
+        affinityPolicyName = parse_res.archInfo.first;
+    }
+
+    auto pipeline = createPipeline(affinityPolicyName, modelPath);
+
+    // load sequences
+    std::map<std::string, std::shared_ptr<Tensor>> inMap = pipeline->get_inputs();
+    std::vector<TensorDesc> sequenceDescs;
+    TensorDesc soundInputDesc = (*(inMap["sounds"])).get_desc();
+    sequenceDescs.push_back(soundInputDesc);
+
+    std::vector<std::vector<Tensor>> sequences, results;
+    std::vector<std::string> sequencePaths =
+        load_data(sequenceDirectory + std::string("/input"), sequenceDescs, &sequences);
+    std::vector<TensorDesc> resultDescs;
+    resultDescs.push_back(soundInputDesc);
+    std::vector<std::string> resultPaths =
+        load_data(sequenceDirectory + std::string("/result"), resultDescs, &results);
+
+    double totalTime = 0;
+    U32 sequenceIndex = 0;
+    U32 falseResult = 0;
+    std::cout << "[RESULT]:" << std::endl;
+    for (auto sequence : sequences) {
+        std::cout << sequencePaths[sequenceIndex] << ": " << std::endl;
+        TensorDesc desc = sequence[0].get_desc();
+        TensorDesc inputDesc = tensor3d(soundInputDesc.dt, 1,
+            tensorNumElements(desc) / soundInputDesc.dims[0], soundInputDesc.dims[0]);
+        std::map<std::string, TensorDesc> inputDescMap;
+        inputDescMap["sounds"] = inputDesc;
+        pipeline->reready(inputDescMap);
+
+        auto modelInputTensorNames = pipeline->get_model_input_tensor_names();
+        std::map<std::string, std::shared_ptr<Tensor>> model_tensors_input;
+        for (int index = 0; index < (int)modelInputTensorNames.size(); index++) {
+            U8 *tensorPointer = (U8 *)((CpuMemory *)(sequence[index].get_memory()))->get_ptr();
+            pipeline->copy_to_named_input(modelInputTensorNames[index], tensorPointer);
+        }
+
+        double timeBegin = ut_time_ms();
+        pipeline->run();
+        double timeEnd = ut_time_ms();
+        totalTime += (timeEnd - timeBegin);
+
+        Tensor output = pipeline->get_tensor_by_name("labels");
+        std::cout << output.string(32) << std::endl;
+        if (resultPaths.size() > sequenceIndex) {
+            F32 *result = (F32 *)((CpuMemory *)(results[sequenceIndex][0].get_memory()))->get_ptr();
+            U32 inferenceSize = output.length();
+            for (U32 i = 0; i < results[sequenceIndex][0].length(); i++) {
+                if (i >= inferenceSize || result[i] != output.element(i)) {
+                    falseResult++;
+                    break;
+                }
+            }
+        }
+
+        sequenceIndex++;
+    }
+
+    UNI_TIME_STATISTICS
+
+    std::cout << "[SUMMARY]:" << std::endl;
+    UNI_CI_LOG(
+        "speech recognition rate: %f %%\n", 100.0 * (sequenceIndex - falseResult) / sequenceIndex);
+    UNI_CI_LOG("avg_time:%fms/sequence\n", 1.0 * totalTime / sequenceIndex);
+
+    return 0;
+}
diff --git a/inference/examples/automatic_speech_recognition/audio_feature.cpp b/inference/examples/automatic_speech_recognition/audio_feature.cpp
new file mode 100644
index 00000000..becb6410
--- /dev/null
+++ b/inference/examples/automatic_speech_recognition/audio_feature.cpp
@@ -0,0 +1,2352 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <math.h>
+#include "audio_feature.h"
+#include "error.h"
+
+void AudioFeatureExtractor::PreEmphasis(
+    std::vector<short> &signal, short lastPoint, std::vector<float> &output)
+{
+    float PRE_EMPH = 0.97F;
+
+    output.push_back(static_cast<float>(signal[0] - PRE_EMPH * lastPoint));
+
+    for (int i = 1; i < (int)signal.size(); i++) {
+        output.push_back(static_cast<float>(signal[i] - PRE_EMPH * signal[i - 1]));
+    }
+}
+
+void AudioFeatureExtractor::SplitToFrames(
+    std::vector<float> &signal, std::vector<std::vector<float>> &output, int nFrames)
+{
+    auto itr = signal.begin();
+
+    for (int i = 0; i < nFrames; i++) {
+        if ((i * FRAME_STEP + W_LENGTH) < (int)signal.size()) {
+            std::copy(signal.begin() + i * FRAME_STEP, signal.begin() + i * FRAME_STEP + W_LENGTH,
+                output[i].begin());
+        } else {
+            std::copy(signal.begin() + i * FRAME_STEP, signal.end(), output[i].begin());
+        }
+        itr += FRAME_STEP;
+    }
+}
+
+void AudioFeatureExtractor::CentralPadding(std::vector<float> &signal, std::vector<float> &output)
+{
+    int padding_size = (N_FFT - W_LENGTH) / 2;
+    std::vector<float> begin_padding(padding_size, 0);
+    std::vector<float> end_padding(padding_size, 0);
+
+    std::copy(begin_padding.begin(), begin_padding.end(), output.begin());
+    std::copy(signal.begin(), signal.end(), output.begin() + padding_size);
+    std::copy(end_padding.begin(), end_padding.end(), output.begin() + padding_size + W_LENGTH);
+}
+
+std::vector<float> AudioFeatureExtractor::GetHammingWindow(bool periodic)
+{
+    int normSize = W_LENGTH - 1;
+    if (periodic) {
+        normSize = normSize - W_LENGTH % 2 + 1;
+    }
+
+    std::vector<float> factors(W_LENGTH);
+    for (int i = 0; i < W_LENGTH; i++) {
+        float pi = 3.14159f;
+        factors[i] = 0.54F - (0.46F * static_cast<float>(cos((2 * pi * i) / normSize)));
+    }
+
+    return factors;
+}
+
+void AudioFeatureExtractor::AddHammingWindow(std::vector<float> &data)
+{
+    std::vector<float> HAMMING_WINDOW;
+    HAMMING_WINDOW = GetHammingWindow(true);
+
+    for (int i = 0; i < (int)data.size(); i++) {
+        data[i] *= HAMMING_WINDOW[i];
+    }
+}
+
+std::vector<float> AudioFeatureExtractor::ComputePowerSpec(std::vector<float> fft)
+{
+    std::vector<float> powerSpec(N_DIM);
+    for (int i = 0; i < N_DIM; i++) {
+        auto fft_r = static_cast<float>(fft[2 * i]);
+        auto fft_i = static_cast<float>(fft[2 * i + 1]);
+        powerSpec[i] = (fft_r * fft_r + fft_i * fft_i) / N_FFT;
+    }
+    return powerSpec;
+}
+
+double AudioFeatureExtractor::HerzToMel(double herz)
+{
+    return _MEL_HIGH_FREQUENCY_Q * log(1.0 + (herz / _MEL_BREAK_FREQUENCY_HERTZ));
+}
+
+std::vector<double> AudioFeatureExtractor::HerzToMel(std::vector<double> herzVec)
+{
+    std::vector<double> melVec(herzVec.size());
+
+    for (int i = 0; i < (int)herzVec.size(); i++) {
+        melVec[i] = HerzToMel(herzVec[i]);
+    }
+
+    return melVec;
+}
+
+std::vector<double> AudioFeatureExtractor::LineSpace(double lower, double upper, int number)
+{
+    double interval = (upper - lower) / (number - 1);
+    std::vector<double> result(number);
+
+    result[0] = lower;
+    result[number - 1] = upper;
+
+    for (int i = 1; i < number - 1; i++) {
+        result[i] = lower + interval * i;
+    }
+
+    return result;
+}
+
+std::vector<std::vector<float>> AudioFeatureExtractor::GetLinearToMelMatrix()
+{
+    double nyquistHertz = SAMPLE_RATE / 2.0;
+
+    std::vector<double> linearFrequencies = LineSpace(0.0, nyquistHertz, N_DIM);
+    std::vector<double> spectrogramBinsMel = HerzToMel(linearFrequencies);
+    std::vector<double> bandEdgesMel =
+        LineSpace(HerzToMel(LOWER_HERZ_FREQ), HerzToMel(UPPER_HERZ_FREQ), N_FILTERS + 2);
+
+    double bandEdgesMelScale = bandEdgesMel[1] - bandEdgesMel[0];
+
+    std::vector<std::vector<float>> melWeightsMat(
+        spectrogramBinsMel.size(), std::vector<float>(N_FILTERS));
+
+    for (int i = 1; i < (int)spectrogramBinsMel.size(); i++) {
+        for (int j = 0; j < N_FILTERS; j++) {
+            double lowerSlope = spectrogramBinsMel[i] - bandEdgesMel[j];
+            double upperSlope = bandEdgesMel[j + 2] - spectrogramBinsMel[i];
+            double minSlope = fmin(lowerSlope, upperSlope);
+            if (minSlope > 0) {
+                melWeightsMat[i][j] = static_cast<float>(minSlope / bandEdgesMelScale);
+            }
+        }
+    }
+
+    return melWeightsMat;
+}
+
+std::vector<float> AudioFeatureExtractor::GetMelBankForSingleFrame(std::vector<float> frame)
+{
+    std::vector<float> powerSpec;
+    std::vector<float> framePadded(N_FFT);
+
+    AddHammingWindow(frame);
+    CentralPadding(frame, framePadded);
+
+    fftwf_complex *in = static_cast<fftwf_complex *>(fftwf_malloc(sizeof(fftwf_complex) * N_FFT));
+    fftwf_complex *out = static_cast<fftwf_complex *>(fftwf_malloc(sizeof(fftwf_complex) * N_FFT));
+    for (int i = 0; i < N_FFT; i++) {
+        in[i][0] = framePadded[i];
+        in[i][1] = 1.0f;
+    }
+
+    fftwf_plan p = fftwf_plan_dft_1d(N_FFT, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
+    fftwf_execute(p);
+
+    std::vector<float> specInput(N_FFT * 2);
+    for (int i = 0; i < N_FFT; i++) {
+        if (i == 0) {
+            specInput[2 * i] = out[i][0];
+            specInput[2 * i + 1] = 0.0;
+        } else {
+            specInput[2 * i] = out[i][0];
+            specInput[2 * i + 1] = out[i][1];
+        }
+    }
+    fftwf_destroy_plan(p);
+    fftwf_free(in);
+    fftwf_free(out);
+
+    powerSpec = ComputePowerSpec(specInput);
+
+    // std::vector<std::vector<float>> MEL_WEIGHTS = GetLinearToMelMatrix();
+    std::vector<std::vector<float>> MEL_WEIGHTS = {
+        {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
+        {0.028377542974119204, 0.01438900822987297, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.013988534744246243, 0.02877801645974593, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.04236607771836546, 0.00040047348562673776, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.027977069488492455, 0.014789481715499701, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.013588061258619515, 0.029178489945372667, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04196560423273872, 0.0008009469712534763, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02757659600286572, 0.015189955201126462, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.013187587772992768, 0.02957896343099936, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04156513074711206, + 0.0012014204568801378, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.027176122517239023, + 0.015590428686753173, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.012787114287366015, + 0.02997943691662618, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.041164657261485195, 0.0016018939425069506, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.026775649031612274, 0.015990902172379924, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.012386640801739273, 0.030379910402252925, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.040764183775858435, 0.002002367428133761, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.026375175545985473, 0.01639137565800666, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 
0.011986167316112515, 0.030780383887879643, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.04036371029023186, 0.0024028409137602843, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.025974702060358814, 0.016791849143633433, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01158569383048571, 0.03118085737350646, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03996323680460505, 0.0028033143993871023, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.025574228574731996, + 0.017192322629260206, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.011185220344859019, + 0.031581330859133123, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.039562763318978324, + 0.003203787885013913, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.025173755089105205, 0.017592796114886934, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.010784746859232256, 0.031981804344759966, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.03916228983335135, 0.003604261370640695, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.024773281603478636, 0.01799326960051355, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.010384273373605622, 0.03238227783038662, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.03876181634772464, 0.004004734856267508, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.024372808117851818, 0.01832063995035457, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.010202637843519902, 0.031611458930250136, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03855391092566718, 0.0016560068311672965, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.028360663514711087, 0.010670250536719897, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01995565672770752, + 0.017961163884671668, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.013171320975496788, + 0.02369179259268357, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.007852420705033428, + 0.028013154870290456, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.003855252056611007, + 0.031065024343655736, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0010468890270737834, 0.03297666563721646, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.032525795232788056, 0.0006469856915086729, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.03098233138702626, 0.0013817753283573875, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.030265615006260627, 0.001327906355091734, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.030283262261724658, 0.0005751233146929622, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0008535034173718123, 0.029303698353873656, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0029060864012274584, 0.02658198842031494, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.005464414489148291, 0.02338369067460831, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.008459612705604193, 0.01977563150017821, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.011828568565461231, 0.01581902179659536, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.015513532642574207, 0.01156984666288346, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01946174420293937, 0.007079230618957747, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02362508042224929, 0.002393779809957561, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0026274615629053957, 0.02289111627252, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.007950754161161056, + 0.01708814892773665, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01335899693463432, + 0.01121670581095207, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.018818283166961453, + 0.005309541497503488, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0006491445198316474, + 0.023045788191944288, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00698437812813675, + 0.01629731656779974, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.013272337673729569, 0.00960843461949587, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.019490308734984828, 0.0030009953187898905, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0037686404167832357, 0.018348084311577542, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.01063669900930533, 0.011118855305415653, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.017361958679361446, 0.0040415268100010915, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.003085905508775607, 0.017977443408692638, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.010326550845234005, 0.010409214612737486, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.017366704187608988, 0.003048616094710451, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.004403341139589429, 0.01570311829619988, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.011844562319580402, 0.007962638905889666, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 
0.0, 0.0, 0.01904049783284377, 0.0004729552180659903, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.007269122403884178, 0.011963869707503835, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.014769530008231939, 0.0041880517929891135, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0035523942877343393, 0.015137931707918249, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.011301830634324753, 0.007129837865952539, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0006128389755833674, 0.0175641571067387, 0.0, 0.0, 0.0, + 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.008560664476431551, 0.009372962620292765, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.016181126760858023, 0.0015116418151702193, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006474993151394213, 0.010986520260981823, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.014228235878880604, 0.003005647064580796, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0049791027868558045, 0.012034413337757007, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.012825904098576837, + 0.003972035328996462, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004012254256524645, + 0.012575694033629195, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.011917025434887662, + 0.004466317925452859, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0035183546875407045, + 0.012664884691514762, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01144887940563181, + 0.004539730732766297, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0034456490088684635, + 0.012352278780654393, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.011372839212756227, + 0.004239518924627132, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0037464303784879504, + 0.011684223437276832, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.011644097810997383, + 0.0036092042446276493, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.004376768392250799, 0.010703384650423508, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.012221410823378834, 0.002688835838931377, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.005296254038803357, 0.00944899610378322, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.01306685532137641, 0.001515225605278247, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.006467760423154104, 0.007957092627597761, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.014145603542796002, 0.00012216896312295877, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.007857218336451012, 0.006260729161023512, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.001568833130106022, 0.012399289358924065, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.009433405801511287, 0.004390186074597024, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.003441100930424836, 0.010239720250433047, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.01116775077277046, 0.002373161654500664, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.00545758568431269, 0.007947278217856237, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.013034146215745834, 0.00023495250500369148, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.007592836729846747, 0.005546619131420947, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0021515272439476597, 0.010858285757838202, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.00982366423969415, 0.003060196381725472, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.004638551644406247, 0.008121770180822758, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.012128978363063033, 0.0005084177982820857, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.007187999965218728, 0.005331674061446397, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.002247021567374424, 0.01015493032461071, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.009781299670107893, 0.0025059521691133246, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00507296073255986, 0.007102111763645642, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0003646217950118263, 0.01169827135817796, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.007915006851784577, 0.004040579742123949, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.003428353851945017, 0.008420335215566682, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.010758535703990588, 0.000984443037557675, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.006483130851456317, 0.005157983492935077, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002207725998922047, 0.009331523948312478, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.009516324471877342, 0.0019233806012160773, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0054422214258588635, 0.005900415401056275, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0013681183798403844, 0.009877450200896474, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.008634395297734325, + 0.0025171406115715206, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004752116027798543, + 0.006306921965855592, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0008698367578627612, + 0.010096703320139662, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.008075210008049085, + 0.0028022115135905353, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0043757227443425445, + 0.006413556004178512, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.000676235480636005, + 0.010024900494766488, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.007804118255427382, + 0.0028122664319902243, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004278816477084893, + 0.0062535755235970806, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0007535146987424039, + 0.009694884615203937, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00778916518370714, + 0.0025783508354261563, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004429847596563119, + 0.005857630420866068, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0010705300094190984, + 0.009136910006305978, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.008000910888885989, 0.002129058688011321, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.004799762345909029, 0.005253937713179549, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.001598613802932068, 0.008378816738347778, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.00841226098941319, 0.0014906977582096177, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.005361834309476819, 0.004468445970131726, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.002311407629540449, 0.007446194182053835, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.008998307657936223, 0.0006874447175282588, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.006091506300628975, 0.0035249895767657533, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0031847049433217267, 0.006362534436003247, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0002779035860144786, 0.009200079295240741, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.006966242059193332, 0.0024454334226279993, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.004196303615362464, 0.005149376219385745, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0014263651715315966, 0.007853319016143492, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.007965387183447241, 0.001249808578221988, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.005325867643068401, 0.0038264397896499358, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0026863481026895624, 0.006403071001077883, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 4.682856231072361e-05, 0.008979702212505832, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.006554797514716648, 0.0024117534129888644, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0040395562754902675, 0.004867067337405281, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.001524315036263888, 0.007322381261821698, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.007866108718963106, 0.0009217718857476596, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.005469294300761493, 0.0032614805878292195, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0030724798825598823, 0.005601189289910779, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0006756654643582706, 0.00794089799199234, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.006960354349857755, 0.001601034175999913, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.004676390781788259, 0.0038305807753812276, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0023924272137187635, 0.006060127374762542, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0001084636456492677, 0.008289673974143857, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.00632248402662231, 0.0020236771012191778, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0041460578750377435, 0.004148248435415844, 0.0, 0.0, 
0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0019696317234531766, 0.006272819769612511, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.007998465907218316, 0.00019236275053673336, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005924513914496727, 0.002216901441651177, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0038505619217751356, 0.0042414401327656205, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0017766099290535452, 0.006265978823880064, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + 
{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.007716917721831137, 0.00027659128794857755, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005740615027167201, 0.002205807241231245, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0037643123325032638, 0.004135023194513913, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0017880096378393273, 0.0060642391477965804, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.007630220726782063, 0.00017515254470366014, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005746969641586685, 0.0020135339057196664, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.003863718556391308, 0.003851915266735673, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00198046747119593, 0.005690296627751679, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.721638600055224e-05, 0.007528677988767685, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005921636521264824, 0.0016613917038077958, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
+ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004127055833060703, 0.003413215299699383, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002332475144856583, 0.005165038895590971, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0005378944566524627, 0.006916862491482558, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006244424435332735, + 0.001168985171637787, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004534339215261857, + 0.0028383264607739827, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0028242539951909788, + 0.004507667749910178, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0011141687751201007, + 0.006177009039046374, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006696773489474588, + 0.000554328893078138, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005067205379705533, + 0.0021450714462383793, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.003437637269936478, + 0.0037358139993986204, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0018080691601674225, + 0.005326556552558862, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00017850105039836746, + 0.006917299105719103, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005708801770471052, + 0.0013498006366808207, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0041559597289110885, + 0.002865645171829301, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002603117687351125, + 0.004381489706977781, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.001050275645791162, + 0.005897334242126261, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006443682645083771, + 0.0004674935156182875, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004963954125864155, + 0.0019119665072878423, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.003484225606644538, + 0.0033564394989573972, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0020044970874249215, + 0.004800912490626952, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0005247685682053051, + 0.006245385482296507, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005847654166832974, + 0.0008883156305972345, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0044375967160492505, + 0.002264777513911948, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0030275392652655262, + 0.0036412393972266614, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0016174818144818024, + 0.005017701280541375, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0002074243636980783, + 0.006394163163856089, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0054506368104109415, 0.0011187042639818625, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.004106970057566763, 0.002430357252403933, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0027633033047225848, 0.0037420102408260033, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.001419636551878406, 0.0050536632292480736, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 7.596979903422734e-05, 0.006365316217670143, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.005231466872755892, 0.0011792274825073057, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.003951064899279318, 0.0024291230174955107, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.002670662925802746, 0.003679018552483716, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0013902609523261731, 0.004928914087471921, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.00010985897884960065, 0.006178809622460126, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.005170621617823595, 0.001088853660626072, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.003950505684901677, 0.0022798995105370408, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0027303897519797603, 0.0034709453604480104, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0015102738190578432, 0.004661991210358979, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0002901578861359261, 0.005853037060269948, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.005250109542583016, 0.0008650585481435862, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.004087441161396524, 0.0020000255728906545, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0029247727802100315, 0.0031349925976377224, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0017621043990235388, 0.004269959622384791, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0005994360178370465, 0.0054049266471318585, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.005453367389203894, 0.0005239257536298926, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.004345441717441285, 0.001605454350836663, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0032375160456786754, 0.002686982948043433, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.002129590373916066, 0.003768511545250204, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0010216647021534563, 0.004850040142456974, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.005765163510226976, 8.024102741994223e-05, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.004709403059514468, 0.0011108472750655516, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0036536426088019584, 0.002141453522711161, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0025978821580894492, 0.0031720597703567706, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0015421217073769398, 0.00420266601800238, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0004863612566644305, 0.0052332722656479895, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.005165455855213152, 0.0005296622162899674, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0041594044948223075, 0.0015117437261357035, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0031533531344314636, 0.0024938252359814395, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.002147301774040619, 0.0034759067458271756, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0011412504136497745, 0.004457988255672912, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.00013519905325893013, 0.005440069765518648, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.004742198237154558, 0.0008100776535657531, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.003783515480037499, 0.0017459191489808859, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.002824832722920441, 0.002681760644396019, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0018661499658033825, 0.0036176021398111515, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0009074672086863242, 0.004553443635226284, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005390485695323663, 4.76413405109918e-05}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004476941253182504, 0.0009394199716741435}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.003563396811041345, 0.0018311986028372953}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0026498523689001867, 0.0027229772340004467}, + 
{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0017363079267590281, 0.0036147558651635986}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0008227634846178695, 0.004506534496326751}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005223188431655863}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004352657026379878}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.003482125621103892}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002611594215827906}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0017410628105519205}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0008705314052759349}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}}; + + std::vector melSpec(N_FILTERS); + for (int j = 0; j < N_FILTERS; j++) { + float sum = 0; + for (int i = 0; i < N_DIM; i++) { + sum += powerSpec[i] * MEL_WEIGHTS[i][j]; + } + melSpec[j] = sum > EPSILON ? 
static_cast<float>(log(sum)) : LOG_EPSILON;
+    }
+    return melSpec;
+}
+
+std::vector<std::vector<std::vector<float>>> AudioFeatureExtractor::getEncoderInput(
+    std::vector<short> signal, std::vector<short> lastPoints, bool padding)
+{
+    int numCategory = 3;
+    std::vector<std::vector<std::vector<float>>> featureByCategory(numCategory);
+
+    short lastPoint;
+    int nFrames;
+
+    lastPoint = lastPoints.back();
+    if (padding) {
+        nFrames = static_cast<int>((int)signal.size() - 1) / FRAME_STEP + 1;
+    } else {
+        nFrames = static_cast<int>((int)signal.size() - W_LENGTH) / FRAME_STEP + 1;
+    }
+
+    if (nFrames >= 1) {
+        std::vector<float> signalPreEnph;
+
+        // central padding's frames shape
+        std::vector<std::vector<float>> frames(nFrames, std::vector<float>(W_LENGTH));
+
+        PreEmphasis(signal, lastPoint, signalPreEnph);
+        SplitToFrames(signalPreEnph, frames, nFrames);
+
+        // calculate mel_bank_coefficients
+        std::vector<std::vector<float>> melFeatures(nFrames, std::vector<float>(N_DIM));
+        for (int i = 0; i < (int)frames.size(); i++) {
+            melFeatures[i] = GetMelBankForSingleFrame(frames[i]);
+        }
+        featureByCategory[0] = melFeatures;
+    } else {
+        std::vector<std::vector<float>> melFeatures(1, std::vector<float>(N_DIM));
+        featureByCategory[0] = melFeatures;
+    }
+    return featureByCategory;
+}
+int AudioFeatureExtractor::getWavHead(FILE *file)
+{
+    static char wavHead[100];
+    int headSize = 0;
+    int i = 0;
+
+    fseek(file, 0, SEEK_SET);
+    size_t readSize = fread(wavHead, 1, 100, file);
+    if (readSize != 100) {
+        return -1;
+    }
+
+    for (i = 0; i < 92; i++) {
+        if (wavHead[i] == 'd' && wavHead[i + 1] == 'a' && wavHead[i + 2] == 't' &&
+            wavHead[i + 3] == 'a') {
+            headSize = i + 8;
+        }
+    }
+    fseek(file, 0, SEEK_SET);
+    return headSize;
+}
+
+std::vector<short> AudioFeatureExtractor::readWav(const std::string &wavName)
+{
+    unsigned int wavSize = 0;
+    size_t readSize = 0;
+
+    FILE *fp = fopen(wavName.c_str(), "rb");
+    if (fp == NULL) {
+        UNI_ERROR_LOG("wav file %s does not exist\n", wavName.c_str());
+        std::vector<short> data;
+        return data;
+    }
+
+    int wavHeadSize = getWavHead(fp);
+    if (wavHeadSize > 4) {
+        int retSek = fseek(fp, wavHeadSize - 4, SEEK_SET);
+        CHECK_REQUIREMENT(retSek == 0);
+
+        readSize = fread(&wavSize, sizeof(int), 1, fp);
+        CHECK_REQUIREMENT(readSize == 1);
+    }
+    std::vector<short> data(wavSize / 2);
+    readSize = fread(data.data(), sizeof(short), wavSize / 2, fp);
+    CHECK_REQUIREMENT(readSize == data.size());
+
+    fclose(fp);
+    return data;
+}
+
+std::vector<std::vector<std::vector<float>>> AudioFeatureExtractor::getEncoderInputFromWav(
+    std::string wavFilePath)
+{
+    std::vector<short> audioRaw = readWav(wavFilePath);
+    std::vector<short> lastPoints(10);
+    std::vector<std::vector<std::vector<float>>> melFea =
+        getEncoderInput(audioRaw, lastPoints, false);
+    return melFea;
+}
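The filterbank projection that closes GetMelBankForSingleFrame above reduces a power spectrum of N_DIM bins to N_FILTERS log-mel coefficients, flooring the log at LOG_EPSILON. A minimal standalone sketch of the same computation (toyLogMel and its parameters are illustrative names, not part of this patch):

    #include <cmath>
    #include <vector>

    // Project a power spectrum through a mel weight matrix and take a floored log.
    std::vector<float> toyLogMel(const std::vector<float> &powerSpec,
        const std::vector<std::vector<float>> &melWeights, float epsilon, float logEpsilon)
    {
        size_t nFilters = melWeights[0].size();
        std::vector<float> melSpec(nFilters);
        for (size_t j = 0; j < nFilters; j++) {
            float sum = 0;
            for (size_t i = 0; i < powerSpec.size() && i < melWeights.size(); i++) {
                sum += powerSpec[i] * melWeights[i][j];
            }
            melSpec[j] = (sum > epsilon) ? static_cast<float>(log(sum)) : logEpsilon;
        }
        return melSpec;
    }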
diff --git a/inference/examples/automatic_speech_recognition/audio_feature.h b/inference/examples/automatic_speech_recognition/audio_feature.h
new file mode 100644
index 00000000..04fd48d2
--- /dev/null
+++ b/inference/examples/automatic_speech_recognition/audio_feature.h
@@ -0,0 +1,77 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef AUDIO_FEATURE_H_
+#define AUDIO_FEATURE_H_
+
+#include <complex>
+#include <vector>
+
+using cd = std::complex<double>;
+
+class AudioFeatureExtractor {
+public:
+    static std::vector<std::vector<std::vector<float>>> getEncoderInputFromWav(
+        std::string wavFilePath);
+
+    static std::vector<std::vector<std::vector<float>>> getEncoderInput(std::vector<short> signal,
+        std::vector<short> lastPoints,
+        bool padding);  // padding false
+
+private:
+    static constexpr int FRAME_STEP = 160;
+    static constexpr int W_LENGTH = 400;  // window length
+    static constexpr int N_FFT = 512;  // Num of FFT length
+    static constexpr int N_DIM = N_FFT / 2 + 1;
+    static constexpr int N_FILTERS = 128;  // N_FILTERS = 41;
+
+    static constexpr int SAMPLE_RATE = 16000;
+    static constexpr double LOWER_HERZ_FREQ = 0;
+    static constexpr double UPPER_HERZ_FREQ = 8000;
+    static constexpr float EPSILON = 2.2204460492503131e-16F;
+    static constexpr float LOG_EPSILON = -36.043653389F;
+
+    static constexpr float _MEL_BREAK_FREQUENCY_HERTZ = 700.0F;
+    static constexpr float _MEL_HIGH_FREQUENCY_Q = 1127.0F;
+
+    static void PreEmphasis(std::vector<short> &signal, short lastPoint, std::vector<float> &output);
+
+    static void SplitToFrames(
+        std::vector<float> &signal, std::vector<std::vector<float>> &output, int nFrames);
+
+    static void CentralPadding(std::vector<float> &signal, std::vector<float> &output);
+
+    static std::vector<float> GetMelBankForSingleFrame(std::vector<float> frame);
+
+    static void AddHammingWindow(std::vector<float> &data);
+
+    static void fft(std::vector<cd> &a, bool invert);
+
+    static std::vector<float> ComputePowerSpec(std::vector<cd> fft);
+
+    static std::vector<float> GetHammingWindow(bool periodic);
+
+    static std::vector<std::vector<float>> GetLinearToMelMatrix();
+
+    static std::vector<double> LineSpace(double lower, double upper, int number);
+
+    static std::vector<double> HerzToMel(std::vector<double> herzVec);
+
+    static double HerzToMel(double herz);
+
+    static int getWavHead(FILE *file);
+
+    static std::vector<short> readWav(const std::string &wavName);
+};
+
+#endif  // AUDIO_FEATURE_H_
diff --git a/inference/examples/automatic_speech_recognition/encoder_flow.prototxt b/inference/examples/automatic_speech_recognition/encoder_flow.prototxt
new file mode 100644
index 00000000..510a5bf4
--- /dev/null
+++ b/inference/examples/automatic_speech_recognition/encoder_flow.prototxt
@@ -0,0 +1,350 @@
+name: "encoder"
+input: "sounds"
+input: "encoder_block0_trunk0_layer0_mem"
+input: "encoder_block0_trunk0_layer1_mem"
+input: "encoder_block1_trunk1_layer0_kmem"
+input: "encoder_block1_trunk1_layer0_vmem"
+input: "encoder_block1_trunk1_layer1_kmem"
+input: "encoder_block1_trunk1_layer1_vmem"
+input: "encoder_block2_trunk0_layer0_mem"
+input: "encoder_block2_trunk0_layer1_mem"
+input: "encoder_block2_trunk1_layer0_kmem"
+input: "encoder_block2_trunk1_layer0_vmem"
+input: "encoder_block2_trunk1_layer1_kmem"
+input: "encoder_block2_trunk1_layer1_vmem"
+input: "encoder_block3_trunk0_layer0_mem"
+input: "encoder_block3_trunk0_layer1_mem"
+input: "encoder_block3_trunk1_layer0_kmem"
+input: "encoder_block3_trunk1_layer0_vmem"
+input: "encoder_block3_trunk1_layer1_kmem"
+input: "encoder_block3_trunk1_layer1_vmem"
+input: "encoder_block3_trunk1_layer2_kmem"
+input:
"encoder_block3_trunk1_layer2_vmem" +input: "encoder_block3_trunk1_layer3_kmem" +input: "encoder_block3_trunk1_layer3_vmem" +output: "encoder_block3_transformer_ln" +output: "encoder_block0_conv0_neg_slice" +output: "encoder_block0_conv1_neg_slice" +output: "encoder_block1_transformer_layer0_k_neg_slice" +output: "encoder_block1_transformer_layer0_v_neg_slice" +output: "encoder_block1_transformer_layer1_k_neg_slice" +output: "encoder_block1_transformer_layer1_v_neg_slice" +output: "encoder_block2_conv0_neg_slice" +output: "encoder_block2_conv1_neg_slice" +output: "encoder_block2_transformer_layer0_k_neg_slice" +output: "encoder_block2_transformer_layer0_v_neg_slice" +output: "encoder_block2_transformer_layer1_k_neg_slice" +output: "encoder_block2_transformer_layer1_v_neg_slice" +output: "encoder_block3_conv0_neg_slice" +output: "encoder_block3_conv1_neg_slice" +output: "encoder_block3_transformer_layer0_k_neg_slice" +output: "encoder_block3_transformer_layer0_v_neg_slice" +output: "encoder_block3_transformer_layer1_k_neg_slice" +output: "encoder_block3_transformer_layer1_v_neg_slice" +output: "encoder_block3_transformer_layer2_k_neg_slice" +output: "encoder_block3_transformer_layer2_v_neg_slice" +output: "encoder_block3_transformer_layer3_k_neg_slice" +output: "encoder_block3_transformer_layer3_v_neg_slice" +node { + name: "sounds" + type: "Input" + output: "sounds" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 15 + input_dim: 128 +} +node { + name: "encoder_block0_trunk0_layer0_mem" + type: "Input" + output: "encoder_block0_trunk0_layer0_mem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 2 + input_dim: 128 + input_dim: 1 +} +node { + name: "encoder_block0_trunk0_layer1_mem" + type: "Input" + output: "encoder_block0_trunk0_layer1_mem" + input_type: "FLOAT32" + input_format: "NCHWC8" + input_dim: 1 + input_dim: 32 + input_dim: 1 + input_dim: 64 +} +node { + name: "encoder_block1_trunk1_layer0_kmem" + type: "Input" + output: "encoder_block1_trunk1_layer0_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 5 + input_dim: 6 + input_dim: 64 +} +node { + name: "encoder_block1_trunk1_layer0_vmem" + type: "Input" + output: "encoder_block1_trunk1_layer0_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 5 + input_dim: 6 + input_dim: 64 +} +node { + name: "encoder_block1_trunk1_layer1_kmem" + type: "Input" + output: "encoder_block1_trunk1_layer1_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 6 + input_dim: 64 +} +node { + name: "encoder_block1_trunk1_layer1_vmem" + type: "Input" + output: "encoder_block1_trunk1_layer1_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 6 + input_dim: 64 +} +node { + name: "encoder_block2_trunk0_layer0_mem" + type: "Input" + output: "encoder_block2_trunk0_layer0_mem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 2 + input_dim: 384 +} +node { + name: "encoder_block2_trunk0_layer1_mem" + type: "Input" + output: "encoder_block2_trunk0_layer1_mem" + input_type: "FLOAT32" + input_format: "NCHWC8" + input_dim: 1 + input_dim: 1024 + input_dim: 1 + input_dim: 1 +} +node { + name: "encoder_block2_trunk1_layer0_kmem" + type: "Input" + output: "encoder_block2_trunk1_layer0_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block2_trunk1_layer0_vmem" + 
type: "Input" + output: "encoder_block2_trunk1_layer0_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block2_trunk1_layer1_kmem" + type: "Input" + output: "encoder_block2_trunk1_layer1_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block2_trunk1_layer1_vmem" + type: "Input" + output: "encoder_block2_trunk1_layer1_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk0_layer0_mem" + type: "Input" + output: "encoder_block3_trunk0_layer0_mem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 2 + input_dim: 512 +} +node { + name: "encoder_block3_trunk0_layer1_mem" + type: "Input" + output: "encoder_block3_trunk0_layer1_mem" + input_type: "FLOAT32" + input_format: "NCHWC8" + input_dim: 1 + input_dim: 1024 + input_dim: 1 + input_dim: 1 +} +node { + name: "encoder_block3_trunk1_layer0_kmem" + type: "Input" + output: "encoder_block3_trunk1_layer0_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer0_vmem" + type: "Input" + output: "encoder_block3_trunk1_layer0_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer1_kmem" + type: "Input" + output: "encoder_block3_trunk1_layer1_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 15 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer1_vmem" + type: "Input" + output: "encoder_block3_trunk1_layer1_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 15 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer2_kmem" + type: "Input" + output: "encoder_block3_trunk1_layer2_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 23 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer2_vmem" + type: "Input" + output: "encoder_block3_trunk1_layer2_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 23 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer3_kmem" + type: "Input" + output: "encoder_block3_trunk1_layer3_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 31 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer3_vmem" + type: "Input" + output: "encoder_block3_trunk1_layer3_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 31 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_inference" + type: "Inference" + input: "sounds" + input: "encoder_block0_trunk0_layer0_mem" + input: "encoder_block0_trunk0_layer1_mem" + input: "encoder_block1_trunk1_layer0_kmem" + input: "encoder_block1_trunk1_layer0_vmem" + input: "encoder_block1_trunk1_layer1_kmem" + input: "encoder_block1_trunk1_layer1_vmem" + input: "encoder_block2_trunk0_layer0_mem" + input: "encoder_block2_trunk0_layer1_mem" + input: "encoder_block2_trunk1_layer0_kmem" + input: "encoder_block2_trunk1_layer0_vmem" + input: "encoder_block2_trunk1_layer1_kmem" + input: "encoder_block2_trunk1_layer1_vmem" + input: "encoder_block3_trunk0_layer0_mem" + input: 
"encoder_block3_trunk0_layer1_mem" + input: "encoder_block3_trunk1_layer0_kmem" + input: "encoder_block3_trunk1_layer0_vmem" + input: "encoder_block3_trunk1_layer1_kmem" + input: "encoder_block3_trunk1_layer1_vmem" + input: "encoder_block3_trunk1_layer2_kmem" + input: "encoder_block3_trunk1_layer2_vmem" + input: "encoder_block3_trunk1_layer3_kmem" + input: "encoder_block3_trunk1_layer3_vmem" + output: "encoder_block3_transformer_ln" + output: "encoder_block0_conv0_neg_slice" + output: "encoder_block0_conv1_neg_slice" + output: "encoder_block1_transformer_layer0_k_neg_slice" + output: "encoder_block1_transformer_layer0_v_neg_slice" + output: "encoder_block1_transformer_layer1_k_neg_slice" + output: "encoder_block1_transformer_layer1_v_neg_slice" + output: "encoder_block2_conv0_neg_slice" + output: "encoder_block2_conv1_neg_slice" + output: "encoder_block2_transformer_layer0_k_neg_slice" + output: "encoder_block2_transformer_layer0_v_neg_slice" + output: "encoder_block2_transformer_layer1_k_neg_slice" + output: "encoder_block2_transformer_layer1_v_neg_slice" + output: "encoder_block3_conv0_neg_slice" + output: "encoder_block3_conv1_neg_slice" + output: "encoder_block3_transformer_layer0_k_neg_slice" + output: "encoder_block3_transformer_layer0_v_neg_slice" + output: "encoder_block3_transformer_layer1_k_neg_slice" + output: "encoder_block3_transformer_layer1_v_neg_slice" + output: "encoder_block3_transformer_layer2_k_neg_slice" + output: "encoder_block3_transformer_layer2_v_neg_slice" + output: "encoder_block3_transformer_layer3_k_neg_slice" + output: "encoder_block3_transformer_layer3_v_neg_slice" + infer_output_size_parameter: "encoderInferOutputSize" + preprocess_parameter: "encoderPreProcess" + inference_parameter: "/data/local/tmp/CI/model_zoo/caffe_models/asr_convolution_transformer_encoder/asr_convolution_transformer_encoder_f32.bolt" +} diff --git a/inference/examples/automatic_speech_recognition/example.wav b/inference/examples/automatic_speech_recognition/example.wav new file mode 100644 index 00000000..b6483ae8 Binary files /dev/null and b/inference/examples/automatic_speech_recognition/example.wav differ diff --git a/inference/examples/automatic_speech_recognition/flow_asr.cpp b/inference/examples/automatic_speech_recognition/flow_asr.cpp new file mode 100644 index 00000000..c4fc0e10 --- /dev/null +++ b/inference/examples/automatic_speech_recognition/flow_asr.cpp @@ -0,0 +1,884 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <atomic>
+#include <fstream>
+#include <set>
+#include "task.h"
+#include "flow.h"
+#include "audio_feature.h"
+
+DataType inferencePrecision = DT_F32;
+const int N_FILTERS = 128;
+
+// prediction & joint & pinyin2hanzi
+const int START_TOKEN = 0;
+const int BLANK_TOKEN = 1600;
+
+// pinyin2hanzi
+const int PINYIN_FEATURE_GAP = 2;
+const int PINYIN_BUFFER_SIZE = 32;
+const int PINYIN_BUFFER_VALID_SIZE = 16;
+std::shared_ptr<float> pinyinEmbeddingDict;
+std::atomic<int> pinyinEmbeddingFlag(0);
+
+EE encoderInferOutputSize(std::map<std::string, std::shared_ptr<Tensor>> &inputs,
+    std::shared_ptr<Tensor> &tmp,
+    std::map<std::string, std::shared_ptr<Tensor>> &outputs,
+    std::vector<std::string> parameter = std::vector<std::string>())
+{
+    TensorDesc inputDesc = inputs["sounds"]->get_desc();
+    TensorDesc desc = inputs["encoder_block0_trunk0_layer0_mem"]->get_desc();
+    desc.dims[2] = UNI_MAX((int)desc.dims[2] + 1, 2);
+    outputs["encoder_block0_conv0_neg_slice"]->resize(desc);
+    outputs["encoder_block0_conv1_neg_slice"]->resize(
+        inputs["encoder_block0_trunk0_layer1_mem"]->get_desc());
+
+    int block1[2] = {5, 7};
+    for (int i = 0; i < 2; i++) {
+        std::string inputPrefix =
+            std::string("encoder_block1_trunk1_layer") + std::to_string(i) + std::string("_");
+        std::string outputPrefix =
+            std::string("encoder_block1_transformer_layer") + std::to_string(i) + std::string("_");
+        TensorDesc desc = inputs[inputPrefix + "kmem"]->get_desc();
+        desc.dims[2] = UNI_MAX((int)desc.dims[2] + block1[i], block1[i]);
+        outputs[outputPrefix + "k_neg_slice"]->resize(desc);
+        outputs[outputPrefix + "v_neg_slice"]->resize(desc);
+    }
+
+    desc = inputs["encoder_block2_trunk0_layer0_mem"]->get_desc();
+    desc.dims[1] = UNI_MAX((int)desc.dims[1] + 1, 2);
+    outputs["encoder_block2_conv0_neg_slice"]->resize(desc);
+    outputs["encoder_block2_conv1_neg_slice"]->resize(
+        inputs["encoder_block2_trunk0_layer1_mem"]->get_desc());
+    int block2[2] = {7, 9};
+    for (int i = 0; i < 2; i++) {
+        std::string inputPrefix =
+            std::string("encoder_block2_trunk1_layer") + std::to_string(i) + std::string("_");
+        std::string outputPrefix =
+            std::string("encoder_block2_transformer_layer") + std::to_string(i) + std::string("_");
+        TensorDesc desc = inputs[inputPrefix + "kmem"]->get_desc();
+        int adder = 2;
+        if (inputDesc.dims[1] == 15) {
+            adder = 3;
+        } else {
+            if (inputDesc.dims[1] != 8) {
+                UNI_ERROR_LOG("unmatched encoder input\n");
+            }
+        }
+        desc.dims[2] = UNI_MAX((int)desc.dims[2] + adder, block2[i]);
+        outputs[outputPrefix + "k_neg_slice"]->resize(desc);
+        outputs[outputPrefix + "v_neg_slice"]->resize(desc);
+    }
+
+    desc = inputs["encoder_block3_trunk0_layer0_mem"]->get_desc();
+    desc.dims[1] = UNI_MAX((int)desc.dims[1] + 1, 2);
+    outputs["encoder_block3_conv0_neg_slice"]->resize(desc);
+    outputs["encoder_block3_conv1_neg_slice"]->resize(
+        inputs["encoder_block3_trunk0_layer1_mem"]->get_desc());
+    int block3[4] = {9, 15, 23, 31};
+    for (int i = 0; i < 4; i++) {
+        std::string inputPrefix =
+            std::string("encoder_block3_trunk1_layer") + std::to_string(i) + std::string("_");
+        std::string outputPrefix =
+            std::string("encoder_block3_transformer_layer") + std::to_string(i) + std::string("_");
+        TensorDesc desc = inputs[inputPrefix + "kmem"]->get_desc();
+        desc.dims[2] = UNI_MAX((int)desc.dims[2] + 1, block3[i]);
+        outputs[outputPrefix + "k_neg_slice"]->resize(desc);
+        outputs[outputPrefix + "v_neg_slice"]->resize(desc);
+    }
+    outputs["encoder_block3_transformer_ln"]->resize(
+        tensor2df(inferencePrecision, DF_NORMAL, 1, 512));
+    return SUCCESS;
+}
+
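encoderInferOutputSize computes output tensor shapes from the current input shapes before each run; the flow runtime resolves it by name, so it only takes effect once registered under the same string used in the graph file. A registration sketch matching the calls that appear in main() further below:

    // The registered names must match the infer_output_size_parameter /
    // preprocess_parameter strings in the .prototxt graph descriptions.
    flowRegisterFunction("encoderInferOutputSize", encoderInferOutputSize);
    flowRegisterFunction("encoderPreProcess", encoderPreProcess);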
"v_neg_slice"]->resize(desc); + } + outputs["encoder_block3_transformer_ln"]->resize( + tensor2df(inferencePrecision, DF_NORMAL, 1, 512)); + return SUCCESS; +} + +EE predictionInferOutputSize(std::map> &inputs, + std::shared_ptr &tmp, + std::map> &outputs, + std::vector parameter = std::vector()) +{ + int block3[4] = {3, 5, 7, 9}; + for (int i = 0; i < 4; i++) { + std::string inputPrefix = + std::string("prediction_net_layer") + std::to_string(i) + std::string("_"); + std::string outputPrefix = + std::string("prediction_net_layer") + std::to_string(i) + std::string("_"); + TensorDesc desc = inputs[inputPrefix + "kmem"]->get_desc(); + desc.dims[2] = UNI_MAX((int)desc.dims[2] + 1, block3[i]); + outputs[outputPrefix + "k_neg_slice"]->resize(desc); + outputs[outputPrefix + "v_neg_slice"]->resize(desc); + } + outputs["prediction_net_ln"]->resize(tensor2df(inferencePrecision, DF_NORMAL, 1, 512)); + return SUCCESS; +} + +EE jointInferOutputSize(std::map> &inputs, + std::shared_ptr &tmp, + std::map> &outputs, + std::vector parameter = std::vector()) +{ + // outputs["joint_output_fc"]->resize(tensor2df(inferencePrecision, DF_NORMAL, 1, 512)); + outputs["output_argmax"]->resize(tensor2df(DT_I32, DF_NORMAL, 1, 1)); + return SUCCESS; +} + +EE pinyin2hanziInferOutputSize(std::map> &inputs, + std::shared_ptr &tmp, + std::map> &outputs, + std::vector parameter = std::vector()) +{ + TensorDesc desc = inputs["pinyin"]->get_desc(); + outputs["hanzi_squeeze/Squeeze"]->resize( + tensor4df(inferencePrecision, DF_NCHW, 1, 1, desc.dims[0], 7160)); + return SUCCESS; +} + +EE encoderPreProcess(std::map> &inputs, + std::shared_ptr &tmp, + std::map> &outputs, + std::vector parameter = std::vector()) +{ + int featureLength = N_FILTERS; + // inputs and outputs can not be same one + CHECK_REQUIREMENT(inputs.size() > 0); + std::vector weightA = {0.26793470448235757, 0.2597546401553133, 0.25070439183132637, + 0.2389518634030468, 0.22591939536296402, 0.21842706422127695, 0.21073101672676822, + 0.19888634668966934, 0.1934352819534865, 0.19483272371655574, 0.19307169092034548, + 0.19794880602465662, 0.2041545140444457, 0.20548612384306975, 0.205089112033574, + 0.202463874511741, 0.1997057297551323, 0.1986376615816107, 0.1953351397506247, + 0.19526630343057141, 0.19707734328352133, 0.19871668436383344, 0.19880258511761903, + 0.20143541652121727, 0.2044134862423108, 0.20602641560137125, 0.20564694818486318, + 0.206515308314549, 0.2092981906166021, 0.2105148453821694, 0.209482433282912, + 0.21072670095339943, 0.21295487096308688, 0.21402032655941866, 0.21254455731621794, + 0.21365817460879144, 0.2163171444197802, 0.21766703064503207, 0.21640375119276742, + 0.2177893882181534, 0.2205046640925341, 0.2218610679573307, 0.22053006469571076, + 0.22162170408445966, 0.22370872632630542, 0.22537803061334274, 0.22641169891592502, + 0.2274135200959736, 0.22817822886370503, 0.22850555770692876, 0.22849091616908523, + 0.22942646398018746, 0.23089530924664364, 0.23176498740499615, 0.23372326568964216, + 0.23547995759926693, 0.2364584692820128, 0.23713210245263003, 0.2375549912435519, + 0.23761757113350296, 0.23757638746581106, 0.23820814260735781, 0.2385523824231173, + 0.23896144410382456, 0.2397607819892432, 0.24065938255474512, 0.2416691468977067, + 0.24337672078468509, 0.24427940599421233, 0.24517506765424793, 0.24579829824437913, + 0.24723941129617125, 0.24809058963717726, 0.24874810693293706, 0.248877475370626, + 0.24951549731479883, 0.24955122418541695, 0.2492060337981675, 0.24902471798206796, + 0.24888344336656584, 
0.24846182447195098, 0.24729274718749017, 0.24639018404388816, + 0.24659313647419556, 0.24630866444966484, 0.24585278398389177, 0.24605167118751672, + 0.24594061893719316, 0.24532106768133538, 0.24572437083432735, 0.2459548905112401, + 0.245982906631063, 0.24652363950502573, 0.24715790835692908, 0.2478608527450776, + 0.24889337178480928, 0.249329751248172, 0.24960285555075376, 0.24955584458875266, + 0.2497572027892517, 0.2499798759413889, 0.2500960262323433, 0.2506400682242264, + 0.2515477086314016, 0.25259227168784903, 0.25364113255322157, 0.25537851424540586, + 0.2573300627421209, 0.25956427589759357, 0.26117713995761727, 0.2624523374880242, + 0.2632993514075515, 0.26413640430134505, 0.26511896710476746, 0.2662951418810798, + 0.26744233631929915, 0.267688136864862, 0.2672668616086788, 0.26649503147446485, + 0.26594129076005935, 0.2659199727680806, 0.2664476518237045, 0.26695480256723025, + 0.2678133595844467, 0.2701192220836497, 0.2742489539853769, 0.2798973923783803, + 0.28540062392560295}; + std::vector weightB = {4.594726366770656, 4.192752172632116, 3.9776274929119557, + 3.4349833759246713, 3.0983192175590126, 2.8131751675954018, 2.674216353771496, + 2.299024401714484, 2.2277405730898843, 2.2079989172157086, 2.2080042633425534, + 2.239013527979191, 2.41471012643739, 2.405628743225133, 2.45394225056771, 2.3372751727216574, + 2.3356523900751234, 2.2857494554648192, 2.263597932542921, 2.199953784963237, + 2.283013730372439, 2.287507759169855, 2.3248724084010197, 2.3234718339153364, + 2.428010836779634, 2.4391312085381363, 2.4676774757702, 2.4445873870383834, + 2.5379614937156854, 2.541529720288643, 2.552965909269937, 2.528893119611279, + 2.609828446143808, 2.611520901760278, 2.6113588465301225, 2.5879040353367735, + 2.670180890126309, 2.6768002097714785, 2.6745482022603047, 2.6589252525406937, + 2.7405675184409484, 2.748250039256346, 2.7504889136399346, 2.7279897692691324, + 2.803509804647416, 2.8033767975633253, 2.81782662029014, 2.8398580132615985, + 2.8634585052804473, 2.8850252018322435, 2.8939588401492355, 2.9149064619044824, + 2.938446538597044, 2.9491789310074474, 2.9655894539521057, 2.9814448232043804, + 2.9946988873469187, 2.9974272291551625, 2.9982878146018908, 2.997330908879054, + 2.9987101107447867, 2.9833493242668405, 2.9875125168844545, 3.0194390288802575, + 3.028980829234581, 3.0057895811449447, 3.076450198087296, 3.0683058012421935, + 3.0938844769593064, 3.11508333263089, 3.121912904965018, 3.146879175832384, + 3.1768447540457245, 3.1598400327144147, 3.190448649847769, 3.1933782870894385, + 3.1789337132666655, 3.1801368920926776, 3.1702021059419705, 3.1585067337253734, + 3.145159095452153, 3.124279154413975, 3.1068527554445096, 3.103454244479969, + 3.096145034068362, 3.0888735929867055, 3.0728735019732527, 3.0772210570154477, + 3.0684300226295047, 3.0504857878230385, 3.068488307579292, 3.051638660693075, + 3.0726374420353735, 3.0707974307243466, 3.088892965875781, 3.103242655729246, + 3.1090877750810226, 3.112699742574199, 3.111884782449412, 3.1145576667173303, + 3.1185679471418215, 3.1242895827009405, 3.136642993753398, 3.15245492583083, + 3.185308230069337, 3.2015540228767803, 3.245292124114324, 3.2826235672398743, + 3.3220448193935534, 3.3566443133338755, 3.3843542201410166, 3.406417064746228, + 3.4294187840241075, 3.458963279130731, 3.4864911772857177, 3.508984664352243, + 3.525467921720016, 3.5317980631290027, 3.5339991083767575, 3.5397785467806564, + 3.5511168000118016, 3.5702997212991785, 3.6000097146634724, 3.6546755683682086, + 3.763185000352641, 
3.9092252627215855, 4.07891493530088, 4.22557473399065};
+    TensorDesc inputDesc = inputs["sounds"]->get_desc();
+    outputs = inputs;
+    outputs["sounds"] = std::shared_ptr<Tensor>(new Tensor());
+    outputs["sounds"]->resize(inputDesc);
+    outputs["sounds"]->alloc();
+    int num = tensorNumElements(inputDesc);
+    int loops = num / featureLength;
+    CHECK_REQUIREMENT(loops * featureLength == num);
+    switch (inferencePrecision) {
+        case DT_F32: {
+            F32 *inPtr = (F32 *)((CpuMemory *)(inputs["sounds"]->get_memory()))->get_ptr();
+            F32 *outPtr = (F32 *)((CpuMemory *)(outputs["sounds"]->get_memory()))->get_ptr();
+            for (int i = 0, index = 0; i < loops; i++) {
+                for (int j = 0; j < featureLength; j++, index++) {
+                    outPtr[index] = weightA[j] * inPtr[index] + weightB[j];
+                }
+            }
+            break;
+        }
+#ifdef _USE_FP16
+        case DT_F16: {
+            F16 *inPtr = (F16 *)((CpuMemory *)(inputs["sounds"]->get_memory()))->get_ptr();
+            F16 *outPtr = (F16 *)((CpuMemory *)(outputs["sounds"]->get_memory()))->get_ptr();
+            for (int i = 0, index = 0; i < loops; i++) {
+                for (int j = 0; j < featureLength; j++, index++) {
+                    outPtr[index] = weightA[j] * inPtr[index] + weightB[j];
+                }
+            }
+            break;
+        }
+#endif
+        default:
+            UNI_ERROR_LOG("unsupported precision type in asr encoder preprocess function\n");
+            break;
+    }
+    return SUCCESS;
+}
+
+void loadBinary(const std::string fileName, char *data, size_t size)
+{
+    std::ifstream ifs(fileName, std::ifstream::in | std::ifstream::binary | std::ifstream::ate);
+    if (!ifs.good()) {
+        UNI_ERROR_LOG("load binary data from %s failed\n", fileName.c_str());
+    }
+    size_t length = ifs.tellg();
+    ifs.seekg(0, std::ifstream::beg);
+    ifs.read(data, UNI_MIN(length, size));
+    if (length < size) {
+        memset(data + length, 0, size - length);
+    }
+    ifs.close();
+}
+
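loadBinary always fills exactly `size` bytes: it reads at most `size` bytes from the file and zero-pads the tail when the file is shorter. A hedged usage sketch (the file name is illustrative):

    std::vector<char> buf(1024);
    loadBinary("pinyin_embedding.bin", buf.data(), buf.size());  // zero-pads if the file is shorter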
+EE pinyin2hanziPreProcess(std::map<std::string, std::shared_ptr<Tensor>> &inputs,
+    std::shared_ptr<Tensor> &tmp,
+    std::map<std::string, std::shared_ptr<Tensor>> &outputs,
+    std::vector<std::string> parameter = std::vector<std::string>())
+{
+    int embeddingSize = std::stoi(parameter[3]);
+    if (!atomic_exchange(&pinyinEmbeddingFlag, 1)) {
+        std::string embeddingFile = parameter[1];
+        int classes = std::stoi(parameter[2]);
+        size_t size = sizeof(float) * classes * embeddingSize;
+        pinyinEmbeddingDict = std::shared_ptr<float>(reinterpret_cast<float *>(operator new(size)));
+        loadBinary(embeddingFile, reinterpret_cast<char *>(pinyinEmbeddingDict.get()), size);
+    }
+    TensorDesc inputDesc = inputs["pinyin"]->get_desc();
+    int batch = inputDesc.dims[inputDesc.nDims - 1];
+    int inputSize = tensorNumElements(inputDesc);
+    int inputSizePerBatch = inputSize / batch;
+    unsigned int *inputPtr =
+        (unsigned int *)((CpuMemory *)(inputs["pinyin"]->get_memory()))->get_ptr();
+    std::string name = "lm_in_deploy";
+    outputs[name] = std::shared_ptr<Tensor>(new Tensor());
+    outputs[name]->resize(
+        tensor4df(inferencePrecision, DF_NCHW, 1, embeddingSize, 1, inputDesc.dims[0]));
+    outputs[name]->alloc();
+    float *pinyinEmbeddingDictPtr = pinyinEmbeddingDict.get();
+    switch (inferencePrecision) {
+        case DT_F32: {
+            F32 *outputPtr = (F32 *)((CpuMemory *)(outputs[name]->get_memory()))->get_ptr();
+            for (int i = 0; i < batch; i++) {
+                for (int j = 0; j < inputSizePerBatch; j++) {
+                    int element = inputPtr[i * inputSizePerBatch + j];
+                    for (int k = 0; k < embeddingSize; k++) {
+                        outputPtr[(i * embeddingSize + k) * inputSizePerBatch + j] =
+                            pinyinEmbeddingDictPtr[element * embeddingSize + k];
+                    }
+                }
+            }
+            break;
+        }
+#ifdef _USE_FP16
+        case DT_F16: {
+            F16 *outputPtr = (F16 *)((CpuMemory *)(outputs[name]->get_memory()))->get_ptr();
+            for (int i = 0; i < batch; i++) {
+                for (int j = 0; j < inputSizePerBatch; j++) {
+                    int element = inputPtr[i * inputSizePerBatch + j];
+                    for (int k = 0; k < embeddingSize; k++) {
+                        outputPtr[(i * embeddingSize + k) * inputSizePerBatch + j] =
+                            pinyinEmbeddingDictPtr[element * embeddingSize + k];
+                    }
+                }
+            }
+            break;
+        }
+#endif
+        default:
+            UNI_ERROR_LOG("unsupported precision type in asr pinyin2hanzi preprocess function\n");
+            break;
+    }
+    return SUCCESS;
+}
+
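The gather in pinyin2hanziPreProcess stores the embedding transposed: token j becomes column j of an embeddingSize x sequence-length map. The same index arithmetic in isolation (seqLen, out, dict and token are illustrative names):

    // out has shape [embeddingSize][seqLen]; dict has shape [classes][embeddingSize].
    // Equivalent to: out[k][j] = dict[token[j]][k]
    for (int j = 0; j < seqLen; j++) {
        for (int k = 0; k < embeddingSize; k++) {
            out[k * seqLen + j] = dict[token[j] * embeddingSize + k];
        }
    }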
cache["encoder_block2_conv1_neg_slice"]; + for (int i = 0; i < 2; i++) { + std::string inputPrefix = + std::string("encoder_block2_trunk1_layer") + std::to_string(i) + std::string("_"); + std::string outputPrefix = std::string("encoder_block2_transformer_layer") + + std::to_string(i) + std::string("_"); + tensors[inputPrefix + "kmem"] = cache[outputPrefix + "k_neg_slice"]; + tensors[inputPrefix + "vmem"] = cache[outputPrefix + "v_neg_slice"]; + } + + tensors["encoder_block3_trunk0_layer0_mem"] = cache["encoder_block3_conv0_neg_slice"]; + tensors["encoder_block3_trunk0_layer1_mem"] = cache["encoder_block3_conv1_neg_slice"]; + for (int i = 0; i < 4; i++) { + std::string inputPrefix = + std::string("encoder_block3_trunk1_layer") + std::to_string(i) + std::string("_"); + std::string outputPrefix = std::string("encoder_block3_transformer_layer") + + std::to_string(i) + std::string("_"); + tensors[inputPrefix + "kmem"] = cache[outputPrefix + "k_neg_slice"]; + tensors[inputPrefix + "vmem"] = cache[outputPrefix + "v_neg_slice"]; + } + } else { + tensors["encoder_block0_trunk0_layer0_mem"] = std::shared_ptr(new Tensor()); + tensors["encoder_block0_trunk0_layer0_mem"]->resize( + tensor4df(inferencePrecision, DF_NCHW, 1, 1, 128, 1)); + tensors["encoder_block0_trunk0_layer0_mem"]->alloc(); + tensors["encoder_block0_trunk0_layer1_mem"] = std::shared_ptr(new Tensor()); + tensors["encoder_block0_trunk0_layer1_mem"]->resize( + tensor4df(inferencePrecision, DF_NCHWC8, 1, 32, 1, 64)); + tensors["encoder_block0_trunk0_layer1_mem"]->alloc(); + + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + std::string kv = std::string("k"); + if (j == 0) { + kv = std::string("v"); + } + std::string name = std::string("encoder_block1_trunk1_layer") + std::to_string(i) + + std::string("_") + kv + "mem"; + tensors[name] = std::shared_ptr(new Tensor()); + tensors[name]->resize(tensor4df(inferencePrecision, DF_NCHW, 1, 0, 6, 64)); + tensors[name]->alloc(); + } + } + + tensors["encoder_block2_trunk0_layer0_mem"] = std::shared_ptr(new Tensor()); + tensors["encoder_block2_trunk0_layer0_mem"]->resize( + tensor3df(inferencePrecision, DF_NCHW, 1, 1, 384)); + tensors["encoder_block2_trunk0_layer0_mem"]->alloc(); + tensors["encoder_block2_trunk0_layer1_mem"] = std::shared_ptr(new Tensor()); + tensors["encoder_block2_trunk0_layer1_mem"]->resize( + tensor4df(inferencePrecision, DF_NCHWC8, 1, 1024, 1, 1)); + tensors["encoder_block2_trunk0_layer1_mem"]->alloc(); + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + std::string kv = std::string("k"); + if (j == 0) { + kv = std::string("v"); + } + std::string name = std::string("encoder_block2_trunk1_layer") + std::to_string(i) + + std::string("_") + kv + "mem"; + tensors[name] = std::shared_ptr(new Tensor()); + tensors[name]->resize(tensor4df(inferencePrecision, DF_NCHW, 1, 0, 8, 64)); + tensors[name]->alloc(); + } + } + + tensors["encoder_block3_trunk0_layer0_mem"] = std::shared_ptr(new Tensor()); + tensors["encoder_block3_trunk0_layer0_mem"]->resize( + tensor3df(inferencePrecision, DF_NCHW, 1, 1, 512)); + tensors["encoder_block3_trunk0_layer0_mem"]->alloc(); + tensors["encoder_block3_trunk0_layer1_mem"] = std::shared_ptr(new Tensor()); + tensors["encoder_block3_trunk0_layer1_mem"]->resize( + tensor4df(inferencePrecision, DF_NCHWC8, 1, 1024, 1, 1)); + tensors["encoder_block3_trunk0_layer1_mem"]->alloc(); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 2; j++) { + std::string kv = std::string("k"); + if (j == 0) { + kv = std::string("v"); + } + std::string 
name = std::string("encoder_block3_trunk1_layer") + std::to_string(i) + + std::string("_") + kv + "mem"; + tensors[name] = std::shared_ptr(new Tensor()); + tensors[name]->resize(tensor4df(inferencePrecision, DF_NCHW, 1, 0, 8, 64)); + tensors[name]->alloc(); + } + } + for (auto iter : tensors) { + if (iter.first != std::string("sounds")) { + TensorDesc desc = iter.second->get_desc(); + U8 *ptr = (U8 *)((CpuMemory *)(iter.second->get_memory()))->get_ptr(); + memset(ptr, 0, tensorNumBytes(desc)); + } + } + } + std::shared_ptr tmp; + encoderInferOutputSize(tensors, tmp, tensors); + for (unsigned int i = 0; i < outputName.size(); i++) { + tensors[outputName[i]]->alloc(); + } + return tensors; +} + +std::map> getPredictionInputOutput( + std::map> jointResult, + std::map> cache) +{ + std::map> tensors; + if (jointResult.size() == 0) { + tensors["label"] = std::shared_ptr(new Tensor()); + tensors["label"]->resize(tensor2df(DT_U32, DF_NORMAL, 1, 1)); + tensors["label"]->alloc(); + U32 *ptr = (U32 *)(((CpuMemory *)(tensors["label"]->get_memory()))->get_ptr()); + *ptr = START_TOKEN; + } else { + tensors["label"] = jointResult["output_argmax"]; + } + if (cache.size() > 0) { + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 2; j++) { + std::string kv = std::string("k"); + if (j == 0) { + kv = std::string("v"); + } + std::string inputName = std::string("prediction_net_layer") + std::to_string(i) + + std::string("_") + kv + "mem"; + std::string outputName = std::string("prediction_net_layer") + std::to_string(i) + + std::string("_") + kv + "_neg_slice"; + tensors[inputName] = cache[outputName]; + } + } + } else { + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 2; j++) { + std::string kv = std::string("k"); + if (j == 0) { + kv = std::string("v"); + } + std::string name = std::string("prediction_net_layer") + std::to_string(i) + + std::string("_") + kv + "mem"; + tensors[name] = std::shared_ptr(new Tensor()); + tensors[name]->resize(tensor4df(inferencePrecision, DF_NCHW, 1, 0, 8, 64)); + tensors[name]->alloc(); + } + } + } + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 2; j++) { + std::string kv = std::string("k"); + if (j == 0) { + kv = std::string("v"); + } + std::string name = std::string("prediction_net_layer") + std::to_string(i) + + std::string("_") + kv + "_neg_slice"; + tensors[name] = std::shared_ptr(new Tensor()); + } + } + tensors["prediction_net_ln"] = std::shared_ptr(new Tensor()); + std::shared_ptr tmp; + predictionInferOutputSize(tensors, tmp, tensors); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 2; j++) { + std::string kv = std::string("k"); + if (j == 0) { + kv = std::string("v"); + } + std::string name = std::string("prediction_net_layer") + std::to_string(i) + + std::string("_") + kv + "_neg_slice"; + tensors[name]->alloc(); + } + } + tensors["prediction_net_ln"]->alloc(); + return tensors; +} + +std::map> getJointInputOutput( + std::map> encoder, + std::map> prediction_net) +{ + std::map> tensors; + tensors["encoder"] = encoder["encoder_block3_transformer_ln"]; + tensors["prediction_net"] = prediction_net["prediction_net_ln"]; + tensors["output_argmax"] = std::shared_ptr(new Tensor()); + std::shared_ptr tmp; + jointInferOutputSize(tensors, tmp, tensors); + tensors["output_argmax"]->alloc(); + return tensors; +} + +std::map> getPinYin2HanZiInputOutput(int frameId, + unsigned int *buffer, + int bufferLength, + int bufferValidSize, + std::map> joint) +{ + std::map> tensors; + tensors["pinyin"] = std::shared_ptr(new Tensor()); + 
tensors["pinyin"]->resize(tensor2df(DT_U32, DF_NORMAL, 1, bufferLength)); + tensors["pinyin"]->alloc(); + if (frameId == 0) { + memset(buffer, 0, sizeof(unsigned int) * bufferLength); + } + int pinyin = *((unsigned int *)((CpuMemory *)(joint["output_argmax"]->get_memory()))->get_ptr()) - + PINYIN_FEATURE_GAP; + CHECK_REQUIREMENT(pinyin >= 0); + if (frameId < bufferValidSize) { + buffer[frameId] = pinyin; + } else { + for (int i = 0; i < bufferValidSize - 1; i++) { + buffer[i] = buffer[i + 1]; + } + buffer[bufferValidSize - 1] = pinyin; + } + unsigned int *ptr = (unsigned int *)((CpuMemory *)(tensors["pinyin"]->get_memory()))->get_ptr(); + memcpy(ptr, buffer, sizeof(unsigned int) * bufferValidSize); + memset(ptr + bufferValidSize, 0, sizeof(unsigned int) * (bufferLength - bufferValidSize)); + + tensors["hanzi_squeeze/Squeeze"] = std::shared_ptr(new Tensor()); + std::shared_ptr tmp; + pinyin2hanziInferOutputSize(tensors, tmp, tensors); + tensors["hanzi_squeeze/Squeeze"]->alloc(); + return tensors; +} + +std::vector split(const std::string &str, const std::string &sep) +{ + std::vector vec; + if (str.empty()) { + return vec; + } + + size_t pos1; + size_t pos2; + pos2 = str.find(sep); + pos1 = 0; + while (std::string::npos != pos2) { + vec.push_back(str.substr(pos1, pos2 - pos1)); + + pos1 = pos2 + sep.size(); + pos2 = str.find(sep, pos1); + } + if (pos1 != str.length() * sizeof(typename std::string::value_type)) { + vec.push_back(str.substr(pos1)); + } + + return vec; +} + +std::map> loadLabels(std::string labelFilePath) +{ + std::map> labels; + std::ifstream infile; + infile.open(labelFilePath); + if (!infile.is_open()) { + return labels; + } + std::string s; + int index = 0; + while (getline(infile, s)) { + switch (index) { + case 0: + labels["hanzi"] = split(s, std::string(" ")); + break; + case 1: + labels["pinyin"] = split(s, std::string(" ")); + break; + default: + UNI_WARNING_LOG("unrecognized label file line %s\n", s.c_str()); + break; + } + index++; + } + infile.close(); + return labels; +} + +bool jointOutputIsBlank(std::map> jointResult) +{ + if (jointResult.find("output_argmax") == jointResult.end()) { + UNI_ERROR_LOG("unrecognized joint result"); + } + TensorDesc desc = jointResult["output_argmax"]->get_desc(); + if (tensorNumElements(desc) != 1) { + UNI_ERROR_LOG("unrecognized joint result(output_argmax) tensor"); + } + U32 *ptr = (U32 *)((CpuMemory *)(jointResult["output_argmax"]->get_memory()))->get_ptr(); + bool ret = false; + if (*ptr == BLANK_TOKEN) { + ret = true; + } + return ret; +} + +void freshPinYinResult(std::vector> &pinyinResult, + std::vector pinyinLabels, + std::map> joint, + int frameId) +{ + int pinyin = + *((unsigned int *)(((CpuMemory *)(joint["output_argmax"]->get_memory()))->get_ptr())); + pinyinResult.push_back(std::pair(frameId, pinyinLabels[pinyin])); +} + +void freshHanZiResult(std::vector> &hanziResult, + std::vector hanziLabels, + std::map> pinyin2hanzi, + int frameId) +{ + int pinyinBufferIndex = -1; + if (frameId < PINYIN_BUFFER_VALID_SIZE) { + pinyinBufferIndex = frameId; + } else { + pinyinBufferIndex = PINYIN_BUFFER_VALID_SIZE - 1; + } + int pinyin = + ((U32 *)(((CpuMemory *)(pinyin2hanzi["pinyin"]->get_memory()))->get_ptr()))[pinyinBufferIndex] + + PINYIN_FEATURE_GAP; + if (pinyin == BLANK_TOKEN) { + return; + } + hanziResult.push_back(std::pair(frameId, "init")); + std::shared_ptr hanziTensor = pinyin2hanzi["hanzi_squeeze/Squeeze"]; + TensorDesc hanziTensorDesc = hanziTensor->get_desc(); + int num = tensorNumElements(hanziTensorDesc); + int 
loops = hanziTensorDesc.dims[1]; + int slots = hanziTensorDesc.dims[0]; + int batch = num / loops / slots; + CHECK_REQUIREMENT(batch == 1); + CHECK_REQUIREMENT(loops == PINYIN_BUFFER_SIZE); + for (int i = hanziResult.size() - 1; i >= 0; i--) { + std::pair<int, std::string> element = hanziResult[i]; + int lastFrameId = element.first; + if (frameId - lastFrameId < PINYIN_BUFFER_VALID_SIZE) { + int lastPinyinBufferIndex = pinyinBufferIndex - (frameId - lastFrameId); + int offset = lastPinyinBufferIndex * slots; + int maxIndex = offset; + for (int j = 0, index = maxIndex; j < slots; j++, index++) { + if (hanziTensor->element(maxIndex) < hanziTensor->element(index)) { + maxIndex = index; + } + } + int hanziIndex = maxIndex - offset; + hanziResult[i] = std::pair<int, std::string>(lastFrameId, hanziLabels[hanziIndex]); + } else { + break; + } + } +} + +int main(int argc, char *argv[]) +{ + flowRegisterFunction("encoderInferOutputSize", encoderInferOutputSize); + flowRegisterFunction("encoderPreProcess", encoderPreProcess); + flowRegisterFunction("predictionInferOutputSize", predictionInferOutputSize); + flowRegisterFunction("jointInferOutputSize", jointInferOutputSize); + flowRegisterFunction("pinyin2hanziInferOutputSize", pinyin2hanziInferOutputSize); + flowRegisterFunction("pinyin2hanziPreProcess", pinyin2hanziPreProcess); + + std::string wavFilePath = argv[6]; + AudioFeatureExtractor audioFeatureExtractor; + std::vector<std::vector<std::vector<float>>> feature = + audioFeatureExtractor.getEncoderInputFromWav(wavFilePath); + + std::string encoderGraphPath = argv[1]; + std::string predictionGraphPath = argv[2]; + std::string jointGraphPath = argv[3]; + std::string pinyin2hanziGraphPath = argv[4]; + std::string labelFilePath = argv[5]; + std::map<std::string, std::vector<std::string>> labels = loadLabels(labelFilePath); + std::vector<std::string> graphPath = { + encoderGraphPath, predictionGraphPath, jointGraphPath, pinyin2hanziGraphPath}; + // TODO(some): beam search conflict + std::vector<unsigned int> pinyinBuffer(PINYIN_BUFFER_SIZE); + + int threads = 2; + + // TODO(some): beam search conflict + int frameId = 0; + Flow flowExample; + flowExample.init(graphPath, DT_F32, AFFINITY_CPU_HIGH_PERFORMANCE, threads, false); + sleep(5); + + std::map<std::string, std::shared_ptr<Tensor>> blankData; + std::map<std::string, std::shared_ptr<Tensor>> encoderData = + getEncoderInputOutput(feature, frameId, 15, blankData); + if (encoderData.size() == 0) { + return 0; + } + Task encoderTask(frameId, encoderGraphPath, encoderData); + std::map<std::string, std::shared_ptr<Tensor>> predictionData = + getPredictionInputOutput(blankData, blankData); + Task predictionTask(frameId, predictionGraphPath, predictionData); + double timeStart = ut_time_ms(); + flowExample.enqueue(encoderTask); + flowExample.enqueue(predictionTask); + frameId++; + + std::set<int> readyTaskId; + std::map<int, Task> encoderResults; + std::map<int, Task> predictionResults; + std::map<int, Task> jointResults; + std::vector<std::pair<int, std::string>> pinyinResult; + std::vector<std::pair<int, std::string>> hanziResult; + while (1) { + std::vector<Task> results = flowExample.dequeue(); + for (unsigned int i = 0; i < results.size(); i++) { + std::string graphPath = results[i].graphPath; + if (graphPath == encoderGraphPath) { + encoderResults[results[i].id] = results[i]; + readyTaskId.insert(results[i].id); + } else if (graphPath == predictionGraphPath) { + predictionResults[results[i].id] = results[i]; + readyTaskId.insert(results[i].id); + } else if (graphPath == jointGraphPath) { + jointResults[results[i].id] = results[i]; + // feeding blank outputs to the prediction net would hurt result accuracy, so blank frames reuse the previous prediction state instead + if (jointOutputIsBlank(results[i].data)) { + Task copyTask(&predictionResults[results[i].id]); + copyTask.id++; + predictionResults[copyTask.id] = copyTask; + readyTaskId.insert(copyTask.id); + } else { + 
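// Non-blank joint output: feed the emitted token back into the + // prediction net as its next input label (the RNN-T style autoregressive + // step) and record the pinyin for this frame below.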
std::map<std::string, std::shared_ptr<Tensor>> predictionData = + getPredictionInputOutput( + results[i].data, predictionResults[results[i].id].data); + Task predictionTask(results[i].id + 1, predictionGraphPath, predictionData); + flowExample.enqueue(predictionTask); + freshPinYinResult( + pinyinResult, labels["pinyin"], results[i].data, results[i].id); + } + + std::map<std::string, std::shared_ptr<Tensor>> pinyin2hanziData = + getPinYin2HanZiInputOutput(results[i].id, pinyinBuffer.data(), + PINYIN_BUFFER_SIZE, PINYIN_BUFFER_VALID_SIZE, results[i].data); + Task pinyin2hanziTask(results[i].id, pinyin2hanziGraphPath, pinyin2hanziData); + flowExample.enqueue(pinyin2hanziTask); + } else if (graphPath == pinyin2hanziGraphPath) { + freshHanZiResult(hanziResult, labels["hanzi"], results[i].data, results[i].id); + } + } + for (std::set<int>::iterator iter = readyTaskId.begin(); iter != readyTaskId.end();) { + int item = *iter; + if (encoderResults.find(item) != encoderResults.end() && + predictionResults.find(item) != predictionResults.end()) { + std::map<std::string, std::shared_ptr<Tensor>> jointData = + getJointInputOutput(encoderResults[item].data, predictionResults[item].data); + Task jointTask(item, jointGraphPath, jointData); + flowExample.enqueue(jointTask); + iter = readyTaskId.erase(iter); + } else { + iter++; + } + } + if (frameId < 1000 && encoderResults.find(frameId - 1) != encoderResults.end()) { + std::map<std::string, std::shared_ptr<Tensor>> encoderData = + getEncoderInputOutput(feature, frameId, 8, encoderResults[frameId - 1].data); + if (encoderData.size() > 0) { + Task encoderTask(frameId, encoderGraphPath, encoderData); + frameId++; + flowExample.enqueue(encoderTask); + } + } + + if (flowExample.size() == 0) { + break; + } + } + double timeEnd = ut_time_ms(); + std::string pinyinLine, hanziLine; + for (unsigned int i = 0; i < pinyinResult.size(); i++) { + pinyinLine += pinyinResult[i].second + " "; + hanziLine += hanziResult[i].second; + } + std::cout << "[PROFILE] flow asr time: " << timeEnd - timeStart << " ms" << std::endl; + std::cout << "[RESULT] length: " << pinyinResult.size() << std::endl; + std::cout << "[RESULT] pinyin: " << pinyinLine << std::endl; + std::cout << "[RESULT] hanzi: " << hanziLine << std::endl; + return 0; +} diff --git a/inference/examples/automatic_speech_recognition/joint_flow.prototxt b/inference/examples/automatic_speech_recognition/joint_flow.prototxt new file mode 100644 index 00000000..e13fe196 --- /dev/null +++ b/inference/examples/automatic_speech_recognition/joint_flow.prototxt @@ -0,0 +1,33 @@ +name: "joint_flow" +input: "encoder" +input: "prediction_net" +output: "output_argmax" +node { + name: "encoder" + type: "Input" + output: "encoder" + input_type: "FLOAT32" + input_format: "MTK" + input_dim: 1 + input_dim: 1 + input_dim: 512 +} +node { + name: "prediction_net" + type: "Input" + output: "prediction_net" + input_type: "FLOAT32" + input_format: "MTK" + input_dim: 1 + input_dim: 1 + input_dim: 512 +} +node { + name: "joint_inference" + type: "Inference" + input: "encoder" + input: "prediction_net" + output: "output_argmax" + infer_output_size_parameter: "jointInferOutputSize" + inference_parameter: "/data/local/tmp/CI/model_zoo/caffe_models/asr_convolution_transformer_joint_net/asr_convolution_transformer_joint_net_f32.bolt" +} diff --git a/inference/examples/automatic_speech_recognition/pinyin2hanzi_flow.prototxt b/inference/examples/automatic_speech_recognition/pinyin2hanzi_flow.prototxt new file mode 100644 index 00000000..7d8133d7 --- /dev/null +++ b/inference/examples/automatic_speech_recognition/pinyin2hanzi_flow.prototxt @@ -0,0 +1,24 @@ +name: "pinyin2hanzi_flow" +input: 
"pinyin" +output: "hanzi_squeeze/Squeeze" +node { + name: "pinyin" + type: "Input" + output: "pinyin" + input_type: "UINT32" + input_format: "NORMAL" + input_dim: 1 + input_dim: 32 +} +node { + name: "pinyin2hanzi_inference" + type: "Inference" + input: "pinyin" + output: "hanzi_squeeze/Squeeze" + infer_output_size_parameter: "pinyin2hanziInferOutputSize" + preprocess_parameter: "pinyin2hanziPreProcess" + preprocess_parameter: "/data/local/tmp/CI/test/pinyin_lm_embedding.bin" + preprocess_parameter: "1601" + preprocess_parameter: "512" + inference_parameter: "/data/local/tmp/CI/model_zoo/tflite_models/cnn_pinyin_lm_b7h512e4_cn_en_20200518_cloud_fp32/cnn_pinyin_lm_b7h512e4_cn_en_20200518_cloud_fp32_f32.bolt" +} diff --git a/inference/examples/automatic_speech_recognition/pinyin_lm_embedding.bin b/inference/examples/automatic_speech_recognition/pinyin_lm_embedding.bin new file mode 100644 index 00000000..9e1940b1 Binary files /dev/null and b/inference/examples/automatic_speech_recognition/pinyin_lm_embedding.bin differ diff --git a/inference/examples/automatic_speech_recognition/prediction_flow.prototxt b/inference/examples/automatic_speech_recognition/prediction_flow.prototxt new file mode 100644 index 00000000..70c6ab78 --- /dev/null +++ b/inference/examples/automatic_speech_recognition/prediction_flow.prototxt @@ -0,0 +1,139 @@ +name: "prediction" +input: "label" +input: "prediction_net_layer0_kmem" +input: "prediction_net_layer0_vmem" +input: "prediction_net_layer1_kmem" +input: "prediction_net_layer1_vmem" +input: "prediction_net_layer2_kmem" +input: "prediction_net_layer2_vmem" +input: "prediction_net_layer3_kmem" +input: "prediction_net_layer3_vmem" +output: "prediction_net_ln" +output: "prediction_net_layer0_k_neg_slice" +output: "prediction_net_layer0_v_neg_slice" +output: "prediction_net_layer1_k_neg_slice" +output: "prediction_net_layer1_v_neg_slice" +output: "prediction_net_layer2_k_neg_slice" +output: "prediction_net_layer2_v_neg_slice" +output: "prediction_net_layer3_k_neg_slice" +output: "prediction_net_layer3_v_neg_slice" +node { + name: "label" + type: "Input" + output: "label" + input_type: "UINT32" + input_format: "NORMAL" + input_dim: 1 + input_dim: 1 +} +node { + name: "prediction_net_layer0_kmem" + type: "Input" + output: "prediction_net_layer0_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 3 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer0_vmem" + type: "Input" + output: "prediction_net_layer0_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 3 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer1_kmem" + type: "Input" + output: "prediction_net_layer1_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 5 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer1_vmem" + type: "Input" + output: "prediction_net_layer1_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 5 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer2_kmem" + type: "Input" + output: "prediction_net_layer2_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer2_vmem" + type: "Input" + output: "prediction_net_layer2_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer3_kmem" + type: "Input" + output: 
"prediction_net_layer3_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer3_vmem" + type: "Input" + output: "prediction_net_layer3_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_inference" + input: "label" + input: "prediction_net_layer0_kmem" + input: "prediction_net_layer0_vmem" + input: "prediction_net_layer1_kmem" + input: "prediction_net_layer1_vmem" + input: "prediction_net_layer2_kmem" + input: "prediction_net_layer2_vmem" + input: "prediction_net_layer3_kmem" + input: "prediction_net_layer3_vmem" + output: "prediction_net_ln" + output: "prediction_net_layer0_k_neg_slice" + output: "prediction_net_layer0_v_neg_slice" + output: "prediction_net_layer1_k_neg_slice" + output: "prediction_net_layer1_v_neg_slice" + output: "prediction_net_layer2_k_neg_slice" + output: "prediction_net_layer2_v_neg_slice" + output: "prediction_net_layer3_k_neg_slice" + output: "prediction_net_layer3_v_neg_slice" + infer_output_size_parameter: "predictionInferOutputSize" + inference_parameter: "/data/local/tmp/CI/model_zoo/caffe_models/asr_convolution_transformer_prediction_net/asr_convolution_transformer_prediction_net_f32.bolt" +} diff --git a/inference/examples/automatic_speech_recognition/run.sh b/inference/examples/automatic_speech_recognition/run.sh new file mode 100644 index 00000000..e0fddedd --- /dev/null +++ b/inference/examples/automatic_speech_recognition/run.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +script_name=$0 +script_abs=$(readlink -f "$0") +script_dir=$(dirname $script_abs) +bolt_root=${script_dir}/../../.. + +device="" +arch="arm_gnu" +device_dir=/data/local/tmp/CI/test +model_zoo_dir=/data/local/tmp/CI/model_zoo + +print_help() { + cat < use to set device architecture(default: arm_gnu). + -p, --path device test directory. +EOF + exit 1; +} + +TEMP=`getopt -o d:a:p:m:h --long device:arch:path:model_zoo:help, \ + -n ${script_name} -- "$@"` +if [ $? != 0 ] ; then echo "[ERROR] terminating..." 
>&2 ; exit 1 ; fi +eval set -- "$TEMP" +while true ; do + case "$1" in + -d|--device) + device=$2 + echo "[INFO] run on '${device}'" ; + shift 2 ;; + -a|--arch) + arch=$2 + echo "[INFO] device architecture ${arch}" ; + shift 2 ;; + -p|--path) + device_dir=$2 + echo "[INFO] run in '${device_dir}'" ; + shift 2 ;; + -m|--model_zoo) + model_zoo_dir=$2 + echo "[INFO] use model_zoo ${model_zoo_dir}" ; + shift 2 ;; + -h|--help) + print_help ; + shift ;; + --) shift ; + break ;; + *) echo "[ERROR] unsupported option $1" ; exit 1 ;; + esac +done + +echo "[WARNING] Please make sure ${model_zoo_dir} is valid to find all inference models" +echo "[WARNING] Please make sure to use models in ${model_zoo_dir} in ${script_dir}/*.prototxt configure files" +echo "[WARNING] Please make sure you have modified ${script_dir}/pinyin2hanzi_flow.prototxt to find pinyin_lm_embedding.bin file" + +adb -s $device shell "mkdir ${device_dir}" +adb -s $device push ${script_dir}/encoder_flow.prototxt $device_dir > /dev/null +adb -s $device push ${script_dir}/prediction_flow.prototxt $device_dir > /dev/null +adb -s $device push ${script_dir}/joint_flow.prototxt $device_dir > /dev/null +adb -s $device push ${script_dir}/pinyin2hanzi_flow.prototxt $device_dir > /dev/null +adb -s $device push ${script_dir}/example.wav $device_dir > /dev/null +adb -s $device push ${script_dir}/asr_labels.txt $device_dir > /dev/null +adb -s $device push ${script_dir}/pinyin_lm_embedding.bin $device_dir > /dev/null +adb -s $device push ${bolt_root}/install_${arch}/examples/flow_asr $device_dir > /dev/null +adb -s $device push ${bolt_root}/install_${arch}/tools/X2bolt $device_dir > /dev/null +adb -s $device shell "mkdir ${device_dir}/lib" +for file in `ls ${bolt_root}/install_${arch}/lib/*.so` +do + adb -s ${device} push ${file} ${device_dir}/lib > /dev/null +done + +# prepare inference models +adb -s $device shell "export LD_LIBRARY_PATH=${device_dir}/lib && ${device_dir}/X2bolt -d ${model_zoo_dir}/caffe_models/asr_convolution_transformer_encoder -m asr_convolution_transformer_encoder -i FP32" +adb -s $device shell "export LD_LIBRARY_PATH=${device_dir}/lib && ${device_dir}/X2bolt -d ${model_zoo_dir}/caffe_models/asr_convolution_transformer_prediction_net -m asr_convolution_transformer_prediction_net -i FP32" +adb -s $device shell "export LD_LIBRARY_PATH=${device_dir}/lib && ${device_dir}/X2bolt -d ${model_zoo_dir}/caffe_models/asr_convolution_transformer_joint_net -m asr_convolution_transformer_joint_net -i FP32" +adb -s $device shell "export LD_LIBRARY_PATH=${device_dir}/lib && ${device_dir}/X2bolt -d ${model_zoo_dir}/tflite_models/cnn_pinyin_lm_b7h512e4_cn_en_20200518_cloud_fp32 -m cnn_pinyin_lm_b7h512e4_cn_en_20200518_cloud_fp32 -i FP32" + +# inference +adb -s $device shell "export LD_LIBRARY_PATH=${device_dir}/lib && cd ${device_dir} && ./flow_asr ./encoder_flow.prototxt ./prediction_flow.prototxt ./joint_flow.prototxt pinyin2hanzi_flow.prototxt asr_labels.txt example.wav" | tee tmp.txt + +# clean work directory +adb -s $device shell "rm -rf ${device_dir}" + +check=$(grep -I "\[RESULT\] hanzi: 打电话给杜娟" tmp.txt) +rm tmp.txt +if [[ ${#check} -lt 1 ]] +then + exit 1 +fi diff --git a/inference/examples/automatic_speech_recognition/vad.cpp b/inference/examples/automatic_speech_recognition/vad.cpp new file mode 100644 index 00000000..d6b7f048 --- /dev/null +++ b/inference/examples/automatic_speech_recognition/vad.cpp @@ -0,0 +1,107 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include + +#include "inference.hpp" +#include "tensor.hpp" +#include "data_loader.hpp" +#include "profiling.h" +#include "parse_command.h" + +int verify(Tensor vad, Tensor eoq) +{ + I32 result = 0; + U32 num = vad.length(); + CHECK_REQUIREMENT(2 == num); + if (abs(vad.element(0) - 0.999107) >= 0.0005) { + result = 1; + } + if (abs(vad.element(1) - 0.0009) >= 0.0005) { + result = 1; + } + + num = eoq.length(); + CHECK_REQUIREMENT(2 == num); + if (abs(eoq.element(0) - 1) >= 0.0005) { + result = 1; + } + if (abs(eoq.element(1) - 1.4e-8) >= 0.0005) { + result = 1; + } + return result; +} + +int main(int argc, char *argv[]) +{ + UNI_TIME_INIT + ParseRes parse_res; + parseCommandLine(argc, argv, &parse_res, "examples"); + + char *modelPath = (char *)""; + char *affinityPolicyName = (char *)""; + if (!parse_res.model.second) { + exit(-1); + } + if (parse_res.model.second) { + modelPath = parse_res.model.first; + } + if (parse_res.archInfo.second) { + affinityPolicyName = parse_res.archInfo.first; + } + + auto pipeline = createPipeline(affinityPolicyName, modelPath); + + std::map> inMap = pipeline->get_inputs(); + TensorDesc cacheDesc = (*(inMap["input_cache"])).get_desc(); + + std::map inputDescMap; + inputDescMap["input_fea"] = (*(inMap["input_fea"])).get_desc(); + inputDescMap["input_cache"] = cacheDesc; + pipeline->reready(inputDescMap); + + std::vector cache; + cache.resize(tensorNumBytes(cacheDesc), 0); + + double totalTime = 0; + int loops = 1; + U32 falseResult = 0; + for (int i = 0; i < loops; i++) { + pipeline->copy_to_named_input("input_cache", cache.data()); + pipeline->copy_to_named_input("input_fea", cache.data()); + + double timeBegin = ut_time_ms(); + pipeline->run(); + double timeEnd = ut_time_ms(); + totalTime += (timeEnd - timeBegin); + Tensor vad = pipeline->get_tensor_by_name("output_vad"); + std::cout << "output_vad: " << vad.element(0) << " " << vad.element(1) << std::endl; + Tensor eoq = pipeline->get_tensor_by_name("output_eoq"); + std::cout << "output_eoq: " << eoq.element(0) << " " << eoq.element(1) << std::endl; + falseResult += verify(vad, eoq); + Tensor outCache = pipeline->get_tensor_by_name("output_cache"); + memcpy(cache.data(), (U8 *)((CpuMemory *)(outCache.get_memory()))->get_ptr(), + tensorNumBytes(cacheDesc)); + } + UNI_TIME_STATISTICS + + std::cout << "[SUMMARY]:" << std::endl; + U32 validSequence = loops; + UNI_CI_LOG("vad rate: %f %%\n", 100.0 * (validSequence - falseResult) / validSequence); + 
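// avg_time below is reported per processed sequence; each loop iteration + // feeds output_cache back into input_cache, so the measurement covers the + // stateful streaming path rather than an independent single shot.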
UNI_CI_LOG("avg_time:%fms/sequence\n", 1.0 * totalTime / validSequence); + if (falseResult > 0) { + UNI_ERROR_LOG("verify failed\n"); + } + + return 0; +} diff --git a/inference/examples/benchmark/benchmark.cpp b/inference/examples/benchmark/benchmark.cpp new file mode 100644 index 00000000..9d443c5c --- /dev/null +++ b/inference/examples/benchmark/benchmark.cpp @@ -0,0 +1,188 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include +#include "inference.hpp" +#include "data_loader.hpp" + +char *modelPath = (char *)""; +std::string inputData = ""; +char *affinityPolicyName = (char *)"CPU_AFFINITY_HIGH_PERFORMANCE"; +char *algorithmMapPath = (char *)""; +int loopTime = 1; +int warmUp = 10; + +void print_benchmark_usage() +{ + std::cout << "benchmark usage: (<> must be filled in with exact value; [] is optional)\n" + "./benchmark -m -i [inputDataPath] -a [affinityPolicyName] -p " + "[algorithmMapPath] -l [loopTime]\n" + "\nParameter description:\n" + "1. -m : The path where .bolt is stored.\n" + "2. -i [inputDataPath]: The input data absolute path. If not input the option, " + "benchmark will run with fake data.\n" + "3. -a [affinityPolicyName]: The affinity policy. If not input the option, " + "affinityPolicyName is CPU_AFFINITY_HIGH_PERFORMANCE.Or you can only choose one " + "of {CPU_AFFINITY_HIGH_PERFORMANCE, CPU_AFFINITY_LOW_POWER, GPU}.\n" + "4. -p [algorithmMapPath]: The algorithm configration path.\n" + "5. -l [loopTime]: The running loopTimes.\n" + "6. -w [warmUp]: WarmUp times. 
The default value is 10.\n" + "Example: ./benchmark -m /local/models/resnet50_f16.bolt" + << std::endl; +} + +void parse_options(int argc, char *argv[]) +{ + std::cout << "\nRun './benchmark --help' to get more usage information.\n"; + std::vector<std::string> lineArgs(argv, argv + argc); + for (std::string arg : lineArgs) { + if (arg == "--help" || arg == "-help" || arg == "--h" || arg == "-h") { + print_benchmark_usage(); + exit(-1); + } + } + + int option; + const char *optionstring = "m:i:a:p:l:w:"; + while ((option = getopt(argc, argv, optionstring)) != -1) { + switch (option) { + case 'm': + std::cout << "option is -m <boltModelPath>, value is: " << optarg << std::endl; + modelPath = optarg; + break; + case 'i': + std::cout << "option is -i [inputDataPath], value is: " << optarg << std::endl; + inputData = std::string(optarg); + break; + case 'a': + std::cout << "option is -a [affinityPolicyName], value is: " << optarg << std::endl; + affinityPolicyName = optarg; + break; + case 'p': + std::cout << "option is -p [algorithmMapPath], value is: " << optarg << std::endl; + algorithmMapPath = optarg; + break; + case 'l': + std::cout << "option is -l [loopTime], value is: " << optarg << std::endl; + loopTime = atoi(optarg); + break; + case 'w': + std::cout << "option is -w [warmUp], value is: " << optarg << std::endl; + warmUp = atoi(optarg); + break; + default: + std::cout << "Unsupported option, please check the parameters.\n"; + print_benchmark_usage(); + exit(-1); + } + } +} + +std::map<std::string, std::shared_ptr<Tensor>> create_tensors_from_path( + std::string dataPath, std::shared_ptr<CNN> pipeline) +{ + std::vector<std::string> inputNames = pipeline->get_model_input_tensor_names(); + std::map<std::string, std::shared_ptr<Tensor>> inMap = pipeline->get_inputs(); + std::vector<DataType> sourceDataTypes; + std::vector<TensorDesc> inputDescs; + for (int i = 0; i < (int)(inputNames.size()); i++) { + std::string curName = inputNames[i]; + TensorDesc curDesc = (*(inMap[curName])).get_desc(); + std::cout << "Input Tensor Dimension: " << tensorDesc2Str(curDesc) << std::endl; + sourceDataTypes.push_back(curDesc.dt); + inputDescs.push_back(curDesc); + } + std::vector<Tensor> input; + if (string_end_with(inputData, ".txt")) { + input = load_txt(inputData, inputDescs); + } else { + input = load_bin(inputData, sourceDataTypes, inputDescs); + } + std::map<std::string, std::shared_ptr<Tensor>> model_tensors_input; + for (U32 index = 0; index < inputNames.size(); index++) { + model_tensors_input[inputNames[index]] = + ((CpuMemory *)input[index].get_memory())->get_shared_ptr(); + } + return model_tensors_input; +} + +void print_result(std::map<std::string, std::shared_ptr<Tensor>> outMap) +{ + std::cout << "\n\nBenchmark Result:\n"; + int outputIndex = 0; + for (auto iter : outMap) { + Tensor result = *(iter.second); + std::cout << "Output Tensor" << outputIndex++ << " : " << iter.first << "\n" + << result.string(8) << "\n\n"; + } +} + +std::map<std::string, std::shared_ptr<Tensor>> get_output( + std::shared_ptr<CNN> pipeline, std::string affinity) +{ + std::map<std::string, std::shared_ptr<Tensor>> outMap = pipeline->get_outputs(); + if (affinity == "GPU") { +#ifdef _USE_MALI + for (auto iter : outMap) { + Tensor result = *(iter.second); + auto mem = (OclMemory *)result.get_memory(); + mem->get_mapped_ptr(); + } +#else + UNI_WARNING_LOG("this binary does not support GPU, please recompile the project with GPU " + "compile options\n"); +#endif + } + return outMap; +} + +int main(int argc, char *argv[]) +{ + UNI_TIME_INIT + parse_options(argc, argv); + + // 1: set up the pipeline + auto pipeline = createPipeline(affinityPolicyName, modelPath, algorithmMapPath); + + // 2: create input data and feed the pipeline with it + auto model_tensors_input = 
create_tensors_from_path(inputData, pipeline); + + std::map> outMap; + + // 3: warm up and run + for (int i = 0; i < warmUp; i++) { + pipeline->set_input_tensors_value(model_tensors_input); + pipeline->run(); + outMap = get_output(pipeline, affinityPolicyName); + } + + double timeBegin = ut_time_ms(); + for (int i = 0; i < loopTime; i++) { + pipeline->set_input_tensors_value(model_tensors_input); + pipeline->run(); + outMap = get_output(pipeline, affinityPolicyName); + } + double timeEnd = ut_time_ms(); + double totalTime = (timeEnd - timeBegin); + + // 4: process result + print_result(outMap); + + UNI_TIME_STATISTICS + UNI_CI_LOG("total_time:%fms(loops=%d)\n", 1.0 * totalTime, loopTime); + UNI_CI_LOG("avg_time:%fms/data\n", 1.0 * totalTime / loopTime); + pipeline->saveAlgorithmMapToText(algorithmMapPath); + return 0; +} diff --git a/inference/examples/bert/bert.cpp b/inference/examples/bert/bert.cpp new file mode 100644 index 00000000..fbc811c6 --- /dev/null +++ b/inference/examples/bert/bert.cpp @@ -0,0 +1,112 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
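// A hypothetical invocation of this example (the flag names are assumptions + // based on the parseCommandLine() fields used below, and the paths are + // placeholders, not files shipped with the repository): + // ./bert -m /path/to/bert_f32.bolt -i /path/to/sequence_dir -a CPU_AFFINITY_HIGH_PERFORMANCE + // where the sequence directory contains an "input" subdirectory holding the + // word, position and token-type tensors that load_data() reads.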
+ +#include +#include "inference.hpp" +#include "tensor.hpp" +#include "data_loader.hpp" +#include "profiling.h" +#include "parse_command.h" + +int main(int argc, char *argv[]) +{ + UNI_TIME_INIT + ParseRes parse_res; + parseCommandLine(argc, argv, &parse_res, "examples"); + char *modelPath = (char *)""; + char *sequenceDirectory = (char *)""; + char *affinityPolicyName = (char *)"CPU_AFFINITY_HIGH_PERFORMANCE"; + + if (!parse_res.model.second) { + exit(-1); + } + if (parse_res.model.second) { + modelPath = parse_res.model.first; + } + if (parse_res.inputPath.second) { + sequenceDirectory = parse_res.inputPath.first; + } + if (parse_res.archInfo.second) { + affinityPolicyName = parse_res.archInfo.first; + } + + auto pipeline = createPipeline(affinityPolicyName, modelPath); + + // load sequences + std::map> inMap = pipeline->get_inputs(); + std::vector sequenceDescs; + TensorDesc wordInputDesc = (*(inMap["bert_words"])).get_desc(); + wordInputDesc.dt = DT_U32; + sequenceDescs.push_back(wordInputDesc); + TensorDesc positionInputDesc = (*(inMap["bert_positions"])).get_desc(); + positionInputDesc.dt = DT_U32; + sequenceDescs.push_back(positionInputDesc); + TensorDesc tokenTypeInputDesc = (*(inMap["bert_token_type"])).get_desc(); + tokenTypeInputDesc.dt = DT_U32; + sequenceDescs.push_back(tokenTypeInputDesc); + std::vector> sequences; + std::vector sequencePaths = + load_data(sequenceDirectory + std::string("/input"), sequenceDescs, &sequences); + + double totalTime = 0; + U32 sequenceIndex = 0; + std::cout << "[RESULT]:" << std::endl; + for (auto sequence : sequences) { + std::cout << sequencePaths[sequenceIndex] << std::endl; + std::map inputDescMap; + inputDescMap["bert_words"] = sequence[0].get_desc(); + inputDescMap["bert_positions"] = sequence[1].get_desc(); + inputDescMap["bert_token_type"] = sequence[2].get_desc(); + pipeline->reready(inputDescMap); + + auto modelInputTensorNames = pipeline->get_model_input_tensor_names(); + for (int index = 0; index < (int)modelInputTensorNames.size(); index++) { + U8 *tmp = (U8 *)((CpuMemory *)(sequence[index].get_memory()))->get_ptr(); + pipeline->copy_to_named_input(modelInputTensorNames[index], tmp); + } + + double timeBegin = ut_time_ms(); + pipeline->run(); + double timeEnd = ut_time_ms(); + totalTime += (timeEnd - timeBegin); + + // stage5: process result + std::map> outMap = pipeline->get_outputs(); + for (auto iter : outMap) { + std::string key = iter.first; + std::shared_ptr value = iter.second; + Tensor result = *value; + if (key == "other") { + continue; + } + U32 resultElementNum = tensorNumElements(result.get_desc()); + std::cout << " " << key << ": "; + std::cout << tensorDesc2Str(result.get_desc()); + std::cout << std::endl; + std::cout << " "; + for (U32 index = 0; index < resultElementNum; index++) { + std::cout << result.element(index) << " "; + } + std::cout << std::endl; + } + + sequenceIndex++; + } + + UNI_TIME_STATISTICS + + std::cout << "[SUMMARY]:" << std::endl; + UNI_CI_LOG("avg_time:%fms/sequence\n", 1.0 * totalTime / sequenceIndex); + + return 0; +} diff --git a/inference/examples/bert/flow_tinybert.cpp b/inference/examples/bert/flow_tinybert.cpp new file mode 100644 index 00000000..859afc94 --- /dev/null +++ b/inference/examples/bert/flow_tinybert.cpp @@ -0,0 +1,91 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "task.h" +#include "flow.h" + +EE tinybertInferOutputSize(std::map> &inputs, + std::shared_ptr &tmp, + std::map> &outputs, + std::vector parameter = std::vector()) +{ + TensorDesc inputDesc = inputs.begin()->second->get_desc(); + outputs["intent_softmax"]->resize(tensor3df(DT_F32, DF_MTK, 1, 1, 65)); + outputs["slot_softmax"]->resize(tensor3df(DT_F32, DF_MTK, 1, inputDesc.dims[1], 45)); + return SUCCESS; +} + +std::map> inputOutput() +{ + const int length = 9; + int words[length] = {101, 2224, 8224, 7341, 2000, 22149, 2000, 2899, 102}; + + std::map> tensors; + TensorDesc inputDesc = tensor2df(DT_U32, DF_NORMAL, 1, 9); + tensors["tinybert_words"] = std::shared_ptr(new Tensor()); + tensors["tinybert_words"]->resize(inputDesc); + tensors["tinybert_words"]->alloc(); + memcpy(((CpuMemory *)tensors["tinybert_words"]->get_memory())->get_ptr(), words, + tensorNumBytes(inputDesc)); + + tensors["tinybert_positions"] = std::shared_ptr(new Tensor()); + tensors["tinybert_positions"]->resize(inputDesc); + tensors["tinybert_positions"]->alloc(); + tensors["tinybert_token_type"] = std::shared_ptr(new Tensor()); + tensors["tinybert_token_type"]->resize(inputDesc); + tensors["tinybert_token_type"]->alloc(); + unsigned int *positionPtr = + (unsigned int *)((CpuMemory *)tensors["tinybert_positions"]->get_memory())->get_ptr(); + unsigned int *tokenTypePtr = + (unsigned int *)((CpuMemory *)tensors["tinybert_token_type"]->get_memory())->get_ptr(); + for (int i = 0; i < length; i++) { + positionPtr[i] = i; + tokenTypePtr[i] = 0; + } + + DataType dataType = DT_F32; + tensors["intent_softmax"] = std::shared_ptr(new Tensor()); + tensors["intent_softmax"]->resize(tensor3df(dataType, DF_MTK, 1, 1, 65)); + tensors["intent_softmax"]->alloc(); + + tensors["slot_softmax"] = std::shared_ptr(new Tensor()); + tensors["slot_softmax"]->resize(tensor3df(dataType, DF_MTK, 1, length, 45)); + tensors["slot_softmax"]->alloc(); + + return tensors; +} + +int main(int argc, char *argv[]) +{ + flowRegisterFunction("tinybertInferOutputSize", tinybertInferOutputSize); + + std::string tinybertGraphPath = argv[1]; + std::vector graphPath = {tinybertGraphPath}; + int threads = 1; + + Flow flowExample; + flowExample.init(graphPath, DT_F32, AFFINITY_CPU_HIGH_PERFORMANCE, threads, false); + + for (int i = 0; i < 10; i++) { + std::map> data = inputOutput(); + Task task(tinybertGraphPath, data); + flowExample.enqueue(task); + } + + std::vector results = flowExample.dequeue(true); + for 
(auto task : results) { + std::cout << task << std::endl; + } + return 0; +} diff --git a/inference/examples/bert/flow_tinybert.prototxt b/inference/examples/bert/flow_tinybert.prototxt new file mode 100644 index 00000000..a42e96a0 --- /dev/null +++ b/inference/examples/bert/flow_tinybert.prototxt @@ -0,0 +1,44 @@ +name: "tinybert_flow" +input: "tinybert_words" +input: "tinybert_positions" +input: "tinybert_token_type" +output: "intent_softmax" +output: "slot_softmax" +node { + name: "tinybert_words" + type: "Input" + output: "tinybert_words" + input_type: "UINT32" + input_format: "NORMAL" + input_dim: 1 + input_dim: 32 +} +node { + name: "tinybert_positions" + type: "Input" + output: "tinybert_positions" + input_type: "UINT32" + input_format: "NORMAL" + input_dim: 1 + input_dim: 32 +} +node { + name: "tinybert_token_type" + type: "Input" + output: "tinybert_token_type" + input_type: "UINT32" + input_format: "NORMAL" + input_dim: 1 + input_dim: 32 +} +node { + name: "tinybert_inference" + type: "Inference" + input: "tinybert_words" + input: "tinybert_positions" + input: "tinybert_token_type" + output: "intent_softmax" + output: "slot_softmax" + infer_output_size_parameter: "tinybertInferOutputSize" + inference_parameter: "/data/local/tmp/CI/model_zoo/caffe_models/tinybert/tinybert_f32.bolt" +} diff --git a/inference/examples/bert/graph_tinybert.cpp b/inference/examples/bert/graph_tinybert.cpp new file mode 100644 index 00000000..5eb77d5b --- /dev/null +++ b/inference/examples/bert/graph_tinybert.cpp @@ -0,0 +1,82 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
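// Unlike flow_tinybert.cpp, which enqueues Tasks into a Flow and dequeues + // results asynchronously, this example drives a single Graph synchronously: + // init() parses the prototxt, ready() binds precision and core affinity, + // and run() executes one map of input/output tensors.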
+ +#include + +#include "graph.h" +#include "tensor.hpp" +#include "node.h" +#include "flow.pb.h" + +EE tinybertInferOutputSize(std::map> &inputs, + std::shared_ptr &tmp, + std::map> &outputs, + std::vector parameter = std::vector()) +{ + TensorDesc inputDesc = inputs.begin()->second->get_desc(); + outputs["intent_softmax"]->resize(tensor3df(DT_F32, DF_MTK, 1, 1, 65)); + outputs["slot_softmax"]->resize(tensor3df(DT_F32, DF_MTK, 1, inputDesc.dims[1], 45)); + return SUCCESS; +} + +std::map> inputOutput() +{ + const int length = 9; + int words[length] = {101, 2224, 8224, 7341, 2000, 22149, 2000, 2899, 102}; + + std::map> tensors; + TensorDesc inputDesc = tensor2df(DT_U32, DF_NORMAL, 1, 9); + tensors["tinybert_words"] = std::shared_ptr(new Tensor()); + tensors["tinybert_words"]->resize(inputDesc); + tensors["tinybert_words"]->alloc(); + memcpy(((CpuMemory *)tensors["tinybert_words"]->get_memory())->get_ptr(), words, + tensorNumBytes(inputDesc)); + + tensors["tinybert_positions"] = std::shared_ptr(new Tensor()); + tensors["tinybert_positions"]->resize(inputDesc); + tensors["tinybert_positions"]->alloc(); + tensors["tinybert_token_type"] = std::shared_ptr(new Tensor()); + tensors["tinybert_token_type"]->resize(inputDesc); + tensors["tinybert_token_type"]->alloc(); + unsigned int *positionPtr = + (unsigned int *)((CpuMemory *)tensors["tinybert_positions"]->get_memory())->get_ptr(); + unsigned int *tokenTypePtr = + (unsigned int *)((CpuMemory *)tensors["tinybert_token_type"]->get_memory())->get_ptr(); + for (int i = 0; i < length; i++) { + positionPtr[i] = i; + tokenTypePtr[i] = 0; + } + + DataType dataType = DT_F32; + tensors["intent_softmax"] = std::shared_ptr(new Tensor()); + tensors["intent_softmax"]->resize(tensor3df(dataType, DF_MTK, 1, 1, 65)); + tensors["intent_softmax"]->alloc(); + + tensors["slot_softmax"] = std::shared_ptr(new Tensor()); + tensors["slot_softmax"]->resize(tensor3df(dataType, DF_MTK, 1, length, 45)); + tensors["slot_softmax"]->alloc(); + + return tensors; +} + +int main(int argc, char *argv[]) +{ + flowRegisterFunction("tinybertInferOutputSize", tinybertInferOutputSize); + std::string tinybertGraphPath = argv[1]; + std::map> data = inputOutput(); + Graph graph; + graph.init(tinybertGraphPath); + graph.ready(DT_F32, AFFINITY_CPU_HIGH_PERFORMANCE, -1); + graph.run(data); + return 0; +} diff --git a/inference/examples/bert/tinybert.cpp b/inference/examples/bert/tinybert.cpp new file mode 100644 index 00000000..97c4d7d9 --- /dev/null +++ b/inference/examples/bert/tinybert.cpp @@ -0,0 +1,28 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tinybert_test.h" + +int main(int argc, char *argv[]) +{ + const char *input_names[3] = {"tinybert_words", "tinybert_positions", "tinybert_token_type"}; + const char *output_names[2] = {"intent_softmax", "slot_softmax"}; + F32 intentRate, slotRate; + tinybertTest(argc, argv, input_names, output_names, &intentRate, &slotRate); + + if (intentRate != 100 || slotRate != 100) { + return 1; + } else { + return 0; + } +} diff --git a/inference/examples/bert/tinybert_onnx.cpp b/inference/examples/bert/tinybert_onnx.cpp new file mode 100644 index 00000000..cbd9eea0 --- /dev/null +++ b/inference/examples/bert/tinybert_onnx.cpp @@ -0,0 +1,28 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tinybert_test.h" + +int main(int argc, char *argv[]) +{ + const char *input_names[3] = {"input_ids", "position_ids", "segment_ids"}; + const char *output_names[2] = {"intent", "slots"}; + F32 intentRate, slotRate; + tinybertTest(argc, argv, input_names, output_names, &intentRate, &slotRate); + + if (intentRate != 100 || slotRate != 100) { + return 1; + } else { + return 0; + } +} diff --git a/inference/examples/bert/tinybert_test.h b/inference/examples/bert/tinybert_test.h new file mode 100644 index 00000000..a4bcdf58 --- /dev/null +++ b/inference/examples/bert/tinybert_test.h @@ -0,0 +1,202 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_TINYBERT_TEST +#define _H_TINYBERT_TEST + +#ifdef _USE_OPENMP +#include +#endif + +#include "inference.hpp" +#include "data_loader.hpp" +#include "profiling.h" +#include "parse_command.h" + +static std::string tinybertTestKernel(U32 sequenceIndex, + std::vector sequence, + std::shared_ptr pipeline, + std::vector> intents, + std::vector> slots, + int *falseIntent, + int *falseSlot, + const char **inputNames, + const char **outputNames) +{ + std::map inputDescMap; + inputDescMap[inputNames[0]] = sequence[0].get_desc(); + inputDescMap[inputNames[1]] = sequence[1].get_desc(); + inputDescMap[inputNames[2]] = sequence[2].get_desc(); + pipeline->reready(inputDescMap); + + std::map> inputs; + inputs[inputNames[0]] = ((CpuMemory *)sequence[0].get_memory())->get_shared_ptr(); + inputs[inputNames[1]] = ((CpuMemory *)sequence[1].get_memory())->get_shared_ptr(); + inputs[inputNames[2]] = ((CpuMemory *)sequence[2].get_memory())->get_shared_ptr(); + pipeline->set_input_tensors_value(inputs); + + pipeline->run(); + + Tensor intentSoftmax = pipeline->get_tensor_by_name(outputNames[0]); + U32 intentNum = intentSoftmax.length(); + U32 intentMaxIndex = 0; + for (U32 index = 1; index < intentNum; index++) { + if (intentSoftmax.element(index) > intentSoftmax.element(intentMaxIndex)) { + intentMaxIndex = index; + } + } + std::string log = std::string(" intent: ") + std::to_string(intentMaxIndex) + std::string(" ") + + std::to_string(intentSoftmax.element(intentMaxIndex)); + if (intents.size() > 0) { + F32 *intentResult = + (F32 *)((CpuMemory *)(intents[sequenceIndex][0].get_memory()))->get_ptr(); + if (intentMaxIndex != intentResult[0] || + abs(intentSoftmax.element(intentMaxIndex) - intentResult[1]) > 0.1) { + (*falseIntent)++; + } + } + Tensor slotSoftmax = pipeline->get_tensor_by_name(outputNames[1]); + auto slotDesc = slotSoftmax.get_desc(); + U32 slotNum = slotDesc.dims[1]; + U32 slotRange = slotDesc.dims[0]; + if (slotDesc.df == DF_MKT) { + slotNum = slotDesc.dims[0]; + slotRange = slotDesc.dims[1]; + } + std::vector slotSoftmaxResult; + log += std::string(" slot: "); + for (U32 i = 0; i < slotNum; i++) { + U32 slotMaxIndex = 0; + for (U32 index = 1; index < slotRange; index++) { + if (slotSoftmax.element(i * slotRange + index) > + slotSoftmax.element(i * slotRange + slotMaxIndex)) { + slotMaxIndex = index; + } + } + slotSoftmaxResult.push_back(slotMaxIndex); + log += std::to_string(slotMaxIndex) + std::string(" "); + } + if (slots.size() > sequenceIndex) { + U32 *slotResult = (U32 *)((CpuMemory *)(slots[sequenceIndex][0].get_memory()))->get_ptr(); + for (U32 i = 0; i < slotSoftmaxResult.size(); i++) { + if (slotSoftmaxResult.size() != slots[sequenceIndex][0].get_desc().dims[0] || + slotResult[i] != slotSoftmaxResult[i]) { + (*falseSlot)++; + break; + } + } + } + return log; +} + +inline void tinybertTest(int argc, + char **argv, + const char **inputNames, + const char **outputNames, + F32 *intentRate, + F32 *slotRate) +{ + UNI_TIME_INIT + ParseRes parse_res; + parseCommandLine(argc, argv, &parse_res, "examples"); + + char *modelPath = (char *)""; + char *sequenceDirectory = (char *)""; + char *affinityPolicyName = (char *)""; + char *algorithmMapPath = (char *)""; + + if (!parse_res.model.second) { + exit(-1); + } + if 
(parse_res.model.second) {
+        modelPath = parse_res.model.first;
+    }
+    if (parse_res.inputPath.second) {
+        sequenceDirectory = parse_res.inputPath.first;
+    }
+    if (parse_res.archInfo.second) {
+        affinityPolicyName = parse_res.archInfo.first;
+    }
+    if (parse_res.algoPath.second) {
+        algorithmMapPath = parse_res.algoPath.first;
+    }
+
+    std::shared_ptr<CNN> pipelineBase;
+    UNI_PROFILE(pipelineBase = createPipeline(affinityPolicyName, modelPath, algorithmMapPath),
+        std::string("bolt::prepare"), std::string("prepare"));
+
+    // load sequences
+    std::map<std::string, std::shared_ptr<Tensor>> inMap = pipelineBase->get_inputs();
+    std::vector<TensorDesc> sequenceDescs;
+    TensorDesc wordInputDesc = (*(inMap[inputNames[0]])).get_desc();
+    wordInputDesc.dt = DT_U32;
+    sequenceDescs.push_back(wordInputDesc);
+    TensorDesc positionInputDesc = (*(inMap[inputNames[1]])).get_desc();
+    positionInputDesc.dt = DT_U32;
+    sequenceDescs.push_back(positionInputDesc);
+    TensorDesc tokenTypeInputDesc = (*(inMap[inputNames[2]])).get_desc();
+    tokenTypeInputDesc.dt = DT_U32;
+    sequenceDescs.push_back(tokenTypeInputDesc);
+    std::vector<std::vector<Tensor>> sequences, intents, slots;
+    std::vector<std::string> sequencePaths =
+        load_data(sequenceDirectory + std::string("/input"), sequenceDescs, &sequences);
+
+    // load result
+    std::vector<TensorDesc> intentDescs;
+    TensorDesc intentDesc = tensor1d(DT_F32, 2);
+    intentDescs.push_back(intentDesc);
+    std::vector<std::string> intentPaths =
+        load_data(sequenceDirectory + std::string("/intent"), intentDescs, &intents);
+    std::vector<TensorDesc> slotDescs;
+    slotDescs.push_back(wordInputDesc);
+    std::vector<std::string> slotPaths =
+        load_data(sequenceDirectory + std::string("/slot"), slotDescs, &slots);
+
+    int falseIntent = 0;
+    int falseSlot = 0;
+    double timeBegin = ut_time_ms();
+#ifdef _USE_OPENMP
+#pragma omp parallel num_threads(OMP_NUM_THREADS)
+    {
+        std::shared_ptr<CNN> pipeline = std::shared_ptr<CNN>(new CNN());
+        int threadId = omp_get_thread_num();
+        UNI_PROFILE(*pipeline = pipelineBase->clone(),
+            std::string("bolt::clone-") + std::to_string(threadId), std::string("clone"));
+        pipeline->set_runtime_device(threadId, threadId);
+#pragma omp for
+        for (U32 sequenceIndex = 0; sequenceIndex < sequences.size(); sequenceIndex++) {
+            std::string log = sequencePaths[sequenceIndex] + ":" +
+                tinybertTestKernel(sequenceIndex, sequences[sequenceIndex], pipeline, intents,
+                    slots, &falseIntent, &falseSlot, inputNames, outputNames);
+            UNI_INFO_LOG("%s\n", log.c_str());
+        }
+    }
+#else
+    for (U32 sequenceIndex = 0; sequenceIndex < sequences.size(); sequenceIndex++) {
+        std::string log = sequencePaths[sequenceIndex] + ":" +
+            tinybertTestKernel(sequenceIndex, sequences[sequenceIndex], pipelineBase, intents,
+                slots, &falseIntent, &falseSlot, inputNames, outputNames);
+        UNI_INFO_LOG("%s\n", log.c_str());
+    }
+#endif
+    double timeEnd = ut_time_ms();
+    double totalTime = (timeEnd - timeBegin);
+    UNI_TIME_STATISTICS
+    U32 validSequence = UNI_MAX(1, sequences.size());
+    *intentRate = 100.0 * (validSequence - falseIntent) / validSequence;
+    *slotRate = 100.0 * (validSequence - falseSlot) / validSequence;
+    UNI_CI_LOG("intent correct rate: %f %%\n", *intentRate);
+    UNI_CI_LOG("slot correct rate: %f %%\n", *slotRate);
+    UNI_CI_LOG("avg_time:%fms/sequence\n", 1.0 * totalTime / validSequence);
+}
+#endif  // _H_TINYBERT_TEST
diff --git a/inference/examples/c_api/test_api_c.c b/inference/examples/c_api/test_api_c.c
new file mode 100644
index 00000000..91ca37ba
--- /dev/null
+++ b/inference/examples/c_api/test_api_c.c
@@ -0,0 +1,276 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include "../api/c/bolt.h"
+#include <sys/time.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <float.h>
+#ifdef _USE_FP16
+#include <arm_neon.h>
+typedef __fp16 F16;
+#endif
+
+double ut_time_ms()
+{
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    double time = tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
+    return time;
+}
+
+void print_help(char *argv[])
+{
+    printf("usage: %s modelPath\n", argv[0]);
+}
+
+void classification(const char *modelPath, AFFINITY_TYPE affinity, DATA_TYPE dt, const char *algoPath, bool useFileStream)
+{
+    DATA_TYPE precisionMode = dt;
+    ModelHandle model_address;
+    if (useFileStream) {
+        model_address = CreateModelWithFileStream(modelPath, affinity, algoPath);
+    } else {
+        model_address = CreateModel(modelPath, affinity, algoPath);
+    }
+
+    int num_input = GetNumInputsFromModel(model_address);
+    int *n = (int *)malloc(sizeof(int) * num_input);
+    int *c = (int *)malloc(sizeof(int) * num_input);
+    int *h = (int *)malloc(sizeof(int) * num_input);
+    int *w = (int *)malloc(sizeof(int) * num_input);
+    char **name = (char **)malloc(sizeof(char *) * num_input);
+    for (int i = 0; i < num_input; i++) {
+        name[i] = (char *)malloc(sizeof(char) * 1024);
+    }
+    DATA_TYPE *dt_input = (DATA_TYPE *)malloc(sizeof(DATA_TYPE) * num_input);
+    DATA_FORMAT *df_input = (DATA_FORMAT *)malloc(sizeof(DATA_FORMAT) * num_input);
+
+    GetInputDataInfoFromModel(model_address, num_input, name, n, c, h, w, dt_input, df_input);
+
+    unsigned char **input_ptr = (unsigned char **)malloc(sizeof(unsigned char *) * num_input);
+    for (int i = 0; i < num_input; i++) {
+        printf("input name = %s in = %d ic = %d ih = %d iw = %d\n", name[i], n[i], c[i], h[i], w[i]);
+        int length = n[i] * c[i] * h[i] * w[i];
+        switch (precisionMode) {
+#ifdef _USE_FP32
+            case FP_32: {
+                float *ptr = (float *)malloc(sizeof(float) * length);
+                for (int j = 0; j < length; j++) {
+                    ptr[j] = 1;
+                }
+                input_ptr[i] = (unsigned char *)ptr;
+                break;
+            }
+#endif
+#ifdef _USE_FP16
+            case FP_16: {
+                F16 *ptr = (F16 *)malloc(sizeof(F16) * length);
+                for (int j = 0; j < length; j++) {
+                    ptr[j] = 1;
+                }
+                input_ptr[i] = (unsigned char *)ptr;
+                break;
+            }
+#endif
+            default:
+                printf("[ERROR] unsupported data precision in C API test\n");
+                exit(1);
+        }
+    }
+
+    PrepareModel(model_address, num_input, name, n, c, h, w, dt_input, df_input);
+
+    ResultHandle model_result = AllocAllResultHandle(model_address);
+    int model_result_num = GetNumOutputsFromResultHandle(model_result);
+
+    int *output_n = (int *)malloc(sizeof(int) * model_result_num);
+    int *output_c = (int *)malloc(sizeof(int) * model_result_num);
+    int *output_h = (int *)malloc(sizeof(int) * model_result_num);
+    int *output_w = (int *)malloc(sizeof(int) * model_result_num);
+    char **outputNames = (char **)malloc(sizeof(char *) * model_result_num);
+    for (int i = 0; i < model_result_num; i++) {
+        outputNames[i] = (char *)malloc(sizeof(char) * 1024);
+    }
+    DATA_TYPE *dt_output = (DATA_TYPE *)malloc(sizeof(DATA_TYPE) * model_result_num);
+    DATA_FORMAT *df_output = (DATA_FORMAT *)malloc(sizeof(DATA_FORMAT) * model_result_num);
+
+    GetOutputDataInfoFromResultHandle(model_result, model_result_num, outputNames, output_n,
+        output_c, output_h, output_w, dt_output, df_output);
+
+    unsigned char **user_out_ptr =
+        (unsigned char **)malloc(sizeof(unsigned char *) * model_result_num);
+    for (int i = 0; i < model_result_num; i++) {
+        printf("output name = %s on = %d oc = %d oh = %d ow = %d\n", outputNames[i], output_n[i],
+            output_c[i], output_h[i], output_w[i]);
+        int length = output_n[i] * output_c[i] * output_h[i] * output_w[i];
+        switch (precisionMode) {
+#ifdef _USE_FP32
+            case FP_32: {
+                float *ptr = (float *)malloc(sizeof(float) * length);
+                user_out_ptr[i] = (unsigned char *)ptr;
+                break;
+            }
+#endif
+#ifdef _USE_FP16
+            case FP_16: {
+                F16 *ptr = (F16 *)malloc(sizeof(F16) * length);
+                user_out_ptr[i] = (unsigned char *)ptr;
+                break;
+            }
+#endif
+            default:
+                printf("[ERROR] unsupported data precision in C API test\n");
+                exit(1);
+        }
+    }
+
+    double totalTime = 0;
+    double max_time = -DBL_MAX;
+    double min_time = DBL_MAX;
+    int loop = 1;
+
+    /*warm up*/
+    for (int i = 0; i < 1; i++) {
+        RunModel(model_address, model_result, 1, name, (void **)input_ptr);
+    }
+
+    for (int i = 0; i < loop; i++) {
+        double timeBegin = ut_time_ms();
+        RunModel(model_address, model_result, 1, name, (void **)input_ptr);
+        double timeEnd = ut_time_ms();
+        double t = timeEnd - timeBegin;
+        totalTime += t;
+        if (t < min_time) {
+            min_time = t;
+        }
+        if (t > max_time) {
+            max_time = t;
+        }
+    }
+
+    unsigned char **bolt_out_ptr =
+        (unsigned char **)malloc(sizeof(unsigned char *) * model_result_num);
+    GetPtrFromResultHandle(model_result, model_result_num, outputNames, (void **)bolt_out_ptr,
+        output_n, output_c, output_h, output_w, dt_output, df_output);
+    for (int i = 0; i < model_result_num; i++) {
+        int length = output_n[i] * output_c[i] * output_h[i] * output_w[i];
+        switch (precisionMode) {
+#ifdef _USE_FP32
+            case FP_32: {
+                // copy the i-th output buffer, not the pointer table itself
+                memcpy(user_out_ptr[i], bolt_out_ptr[i], sizeof(float) * length);
+                break;
+            }
+#endif
+#ifdef _USE_FP16
+            case FP_16: {
+                memcpy(user_out_ptr[i], bolt_out_ptr[i], sizeof(F16) * length);
+                break;
+            }
+#endif
+            default:
+                printf("[ERROR] unsupported data precision in C API test\n");
+                exit(1);
+        }
+    }
+    FreeResultHandle(model_result);
+    DestroyModel(model_address);
+    free(n);
+    free(c);
+    free(h);
+    free(w);
+    free(dt_input);
+    free(df_input);
+    for (int i = 0; i < num_input; i++) {
+        free(name[i]);
+        free(input_ptr[i]);
+    }
+    free(name);
+    free(input_ptr);
+    free(output_n);
+    free(output_c);
+    free(output_h);
+    free(output_w);
+    free(dt_output);
+    free(df_output);
+    for (int i = 0; i < model_result_num; i++) {
+        free(outputNames[i]);
+        free(user_out_ptr[i]);
+    }
+    free(outputNames);
+    free(user_out_ptr);
+    free(bolt_out_ptr);
+
+    const char* modelName = (useFileStream) ?
"Use file stream" : modelPath; + if (affinity == GPU) { + printf("DeviceType = GPU, Model = %s\n", modelName); + } else { + printf("DeviceType = CPU, Model = %s\n", modelName); + } + printf("avg_time: %lf ms\n", 1.0 * totalTime / loop); + printf("max_time: %lf ms\n", 1.0 * max_time); + printf("min_time: %lf ms\n", 1.0 * min_time); + fflush(stdout); +} + +char* buildFileStream(const char* fileName) { + int fd; + int length; + struct stat ss; + fd = open(fileName, O_RDONLY); + if (-1 == fd) { + printf("Open file %s failed\n", fileName); + exit(-1); + } + if (-1 == fstat(fd, &ss)) { + printf("Can not get size from file %s\n", fileName); + exit(-1); + } + length = ss.st_size; + char* bytes = (char*)mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0); + if(MAP_FAILED == bytes) { + printf("Map file %s failed\n", fileName); + exit(-1); + } + char* res = malloc(length); + memcpy(res, bytes, length); + munmap(bytes, length); + if (-1 != fd) { + close(fd); + } + return res; +} + +int main() +{ + const char *mobilenet_v1_fp16_modelPath = "/data/local/tmp/xyf/model/mobilenet_v1_f16.bolt"; + const char *algoPath = "./"; + bool useFileStream = false; + classification(mobilenet_v1_fp16_modelPath, CPU_HIGH_PERFORMANCE, FP_16, algoPath, useFileStream); + classification(mobilenet_v1_fp16_modelPath, GPU, FP_16, algoPath, useFileStream); + + /*Test use filestream to read algoFile*/ + useFileStream = true; + const char* modelFileStream = buildFileStream(mobilenet_v1_fp16_modelPath); + const char* algoFileStream = buildFileStream("./algorithmInfo_Mali_G52p_MOBILENET_2_4"); + classification(modelFileStream, GPU, FP_16, algoFileStream, useFileStream); + free((void*)modelFileStream); + free((void*)algoFileStream); + return 0; +} diff --git a/inference/examples/dlaWOdcn/flow_dlaWOdcn.cpp b/inference/examples/dlaWOdcn/flow_dlaWOdcn.cpp new file mode 100644 index 00000000..83068678 --- /dev/null +++ b/inference/examples/dlaWOdcn/flow_dlaWOdcn.cpp @@ -0,0 +1,101 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "task.h" +#include "flow.h" + +DataType inferencePrecision = DT_F16; + +std::map> inputOutput() +{ + std::map> tensors; + TensorDesc inputDesc = tensor4df(inferencePrecision, DF_NCHW, 1, 3, 128, 128); + tensors["input"] = std::shared_ptr(new Tensor()); + tensors["input"]->resize(inputDesc); + tensors["input"]->alloc(); + + switch (inferencePrecision) { + case DT_F32: { + F32 *ptr = (F32 *)((CpuMemory *)tensors["input"]->get_memory())->get_ptr(); + for (U32 i = 0; i < tensorNumElements(inputDesc); i++) { + ptr[i] = 1; + } + break; + } +#ifdef _USE_FP16 + case DT_F16: { + F16 *ptr = (F16 *)((CpuMemory *)tensors["input"]->get_memory())->get_ptr(); + for (U32 i = 0; i < tensorNumElements(inputDesc); i++) { + ptr[i] = 1; + } + break; + } +#endif + default: + UNI_ERROR_LOG("currently not support to init this data type(%d) dlaWOdcn input data\n", + inferencePrecision); + break; + } + + tensors["594"] = std::shared_ptr(new Tensor()); + tensors["594"]->resize(tensor4df(inferencePrecision, DF_NCHW, 1, 13, 32, 32)); + tensors["594"]->alloc(); + tensors["nms_hm"] = std::shared_ptr(new Tensor()); + tensors["nms_hm"]->resize(tensor4df(inferencePrecision, DF_NCHW, 1, 13, 32, 32)); + tensors["nms_hm"]->alloc(); + tensors["598"] = std::shared_ptr(new Tensor()); + tensors["598"]->resize(tensor4df(inferencePrecision, DF_NCHW, 1, 62, 32, 32)); + tensors["598"]->alloc(); + tensors["nms_hm_kp"] = std::shared_ptr(new Tensor()); + tensors["nms_hm_kp"]->resize(tensor4df(inferencePrecision, DF_NCHW, 1, 62, 32, 32)); + tensors["nms_hm_kp"]->alloc(); + tensors["wh"] = std::shared_ptr(new Tensor()); + tensors["wh"]->resize(tensor4df(inferencePrecision, DF_NCHW, 1, 2, 32, 32)); + tensors["wh"]->alloc(); + tensors["kps"] = std::shared_ptr(new Tensor()); + tensors["kps"]->resize(tensor4df(inferencePrecision, DF_NCHW, 1, 124, 32, 32)); + tensors["kps"]->alloc(); + tensors["reg"] = std::shared_ptr(new Tensor()); + tensors["reg"]->resize(tensor4df(inferencePrecision, DF_NCHW, 1, 2, 32, 32)); + tensors["reg"]->alloc(); + tensors["kp_offset"] = std::shared_ptr(new Tensor()); + tensors["kp_offset"]->resize(tensor4df(inferencePrecision, DF_NCHW, 1, 2, 32, 32)); + tensors["kp_offset"]->alloc(); + return tensors; +} + +int main(int argc, char *argv[]) +{ + int num = 200; + std::string dlaWOdcnGraphPath = argv[1]; + std::vector graphPath = {dlaWOdcnGraphPath}; + int threads = atoi(argv[2]); + + Flow flowExample; + flowExample.init(graphPath, inferencePrecision, AFFINITY_CPU_HIGH_PERFORMANCE, threads, false); + sleep(10); + + for (int i = 0; i < num; i++) { + std::map> data = inputOutput(); + Task task(dlaWOdcnGraphPath, data); + flowExample.enqueue(task); + } + + std::vector results; + double start = ut_time_ms(); + UNI_PROFILE(results = flowExample.dequeue(true), std::string("flow_dlaWOdcn"), + std::string("flow_dlaWOdcn")); + double end = ut_time_ms(); + UNI_CI_LOG("avg_time:%fms/image\n", (end - start) / num); + return 0; +} diff --git a/inference/examples/dlaWOdcn/flow_dlaWOdcn.prototxt b/inference/examples/dlaWOdcn/flow_dlaWOdcn.prototxt new file mode 100644 index 00000000..8d1180e2 --- /dev/null +++ b/inference/examples/dlaWOdcn/flow_dlaWOdcn.prototxt @@ -0,0 +1,36 @@ +name: "facesr_dlaWOdcn" +input: "input" +output: "594" +output: "nms_hm" +output: "598" +output: "nms_hm_kp" +output: "wh" +output: "kps" +output: "reg" +output: "kp_offset" +node { + name: "input" + type: "Input" + output: "input" + input_type: "FLOAT16" + input_format: "NCHW" + input_dim: 1 + input_dim: 3 + input_dim: 128 + input_dim: 128 +} 
+node {
+    name: "dlaWOdcn_inference"
+    type: "Inference"
+    input: "input"
+    output: "594"
+    output: "nms_hm"
+    output: "598"
+    output: "nms_hm_kp"
+    output: "wh"
+    output: "kps"
+    output: "reg"
+    output: "kp_offset"
+    inference_parameter: "/data/local/tmp/CI/model_zoo/onnx_models/dlaWOdcn_34_128/dlaWOdcn_34_128_f16.bolt"
+    inference_parameter: "/data/local/tmp/CI/test"
+}
diff --git a/inference/examples/dlaWOdcn/run.sh b/inference/examples/dlaWOdcn/run.sh
new file mode 100644
index 00000000..c02a4eae
--- /dev/null
+++ b/inference/examples/dlaWOdcn/run.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+
+script_name=$0
+script_abs=$(readlink -f "$0")
+script_dir=$(dirname $script_abs)
+bolt_root=${script_dir}/../../..
+
+device=""
+arch="arm_gnu"
+device_dir=/data/local/tmp/CI/test
+model_zoo_dir=/data/local/tmp/CI/model_zoo
+
+print_help() {
+    cat <<EOF
+Usage: ${script_name} [OPTION]...
+  -h, --help                 display this help and exit.
+  -d, --device               which device to run on.
+  -a, --arch <arch>          use to set device architecture(default: arm_gnu).
+  -p, --path                 device test directory.
+  -m, --model_zoo            model zoo directory on the device.
+EOF
+    exit 1;
+}
+
+TEMP=`getopt -o d:a:p:m:h --long device:,arch:,path:,model_zoo:,help \
+     -n ${script_name} -- "$@"`
+if [ $? != 0 ] ; then echo "[ERROR] terminating..." >&2 ; exit 1 ; fi
+eval set -- "$TEMP"
+while true ; do
+    case "$1" in
+        -d|--device)
+            device=$2
+            echo "[INFO] run on '${device}'" ;
+            shift 2 ;;
+        -a|--arch)
+            arch=$2
+            echo "[INFO] device architecture ${arch}" ;
+            shift 2 ;;
+        -p|--path)
+            device_dir=$2
+            echo "[INFO] run in '${device_dir}'" ;
+            shift 2 ;;
+        -m|--model_zoo)
+            model_zoo_dir=$2
+            echo "[INFO] use model_zoo ${model_zoo_dir}" ;
+            shift 2 ;;
+        -h|--help)
+            print_help ;
+            shift ;;
+        --) shift ;
+            break ;;
+        *) echo "[ERROR]" ; exit 1 ;;
+    esac
+done
+
+echo "[WARNING] Please make sure ${model_zoo_dir} is valid to find all inference models"
+echo "[WARNING] Please make sure to use models in ${model_zoo_dir} in ${script_dir}/*.prototxt configure files"
+
+adb -s $device shell "mkdir ${device_dir}"
+adb -s $device push ${script_dir}/flow_dlaWOdcn.prototxt $device_dir > /dev/null
+adb -s $device push ${bolt_root}/install_${arch}/examples/flow_dlaWOdcn $device_dir > /dev/null
+adb -s $device push ${bolt_root}/install_${arch}/tools/X2bolt $device_dir > /dev/null
+adb -s $device shell "mkdir ${device_dir}/lib"
+for file in `ls ${bolt_root}/install_${arch}/lib/*.so`
+do
+    adb -s ${device} push ${file} ${device_dir}/lib > /dev/null
+done
+
+# prepare inference models
+adb -s $device shell "export LD_LIBRARY_PATH=${device_dir}/lib && ${device_dir}/X2bolt -d ${model_zoo_dir}/onnx_models/dlaWOdcn_34_128 -m dlaWOdcn_34_128 -i FP16"
+
+# inference
+adb -s $device shell "export LD_LIBRARY_PATH=${device_dir}/lib && cd ${device_dir} && ./flow_dlaWOdcn ./flow_dlaWOdcn.prototxt 4 || echo '[FAILURE]'" | tee status.txt
+
+# clean work directory
+adb -s $device shell "rm -rf ${device_dir}"
+
+cat status.txt || exit 1
+if [ `grep -c "\[FAILURE\]" status.txt` -ne '0' ] ; then
+    exit 1
+fi
+rm status.txt
diff --git a/inference/examples/facesr/flow_facesr.cpp b/inference/examples/facesr/flow_facesr.cpp
new file mode 100644
index 00000000..4bc49ada
--- /dev/null
+++ b/inference/examples/facesr/flow_facesr.cpp
@@ -0,0 +1,82 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <unistd.h>
+#include "task.h"
+#include "flow.h"
+
+DataType inferencePrecision = DT_F16;
+
+std::map<std::string, std::shared_ptr<Tensor>> inputOutput()
+{
+    std::map<std::string, std::shared_ptr<Tensor>> tensors;
+    TensorDesc inputDesc = tensor4df(inferencePrecision, DF_NCHW, 1, 64, 48, 48);
+    tensors["geninput"] = std::shared_ptr<Tensor>(new Tensor());
+    tensors["geninput"]->resize(inputDesc);
+    tensors["geninput"]->alloc();
+
+    switch (inferencePrecision) {
+        case DT_F32: {
+            F32 *ptr = (F32 *)((CpuMemory *)tensors["geninput"]->get_memory())->get_ptr();
+            for (U32 i = 0; i < tensorNumElements(inputDesc); i++) {
+                ptr[i] = 1;
+            }
+            break;
+        }
+#ifdef _USE_FP16
+        case DT_F16: {
+            F16 *ptr = (F16 *)((CpuMemory *)tensors["geninput"]->get_memory())->get_ptr();
+            for (U32 i = 0; i < tensorNumElements(inputDesc); i++) {
+                ptr[i] = 1;
+            }
+            break;
+        }
+#endif
+        default:
+            UNI_ERROR_LOG("data type(%d) is not supported to initialize facesr input data\n",
+                inferencePrecision);
+            break;
+    }
+
+    tensors["pixel_shuffle_final_out"] = std::shared_ptr<Tensor>(new Tensor());
+    tensors["pixel_shuffle_final_out"]->resize(
+        tensor4df(inferencePrecision, DF_NCHWC8, 1, 8, 384, 384));
+    tensors["pixel_shuffle_final_out"]->alloc();
+    return tensors;
+}
+
+int main(int argc, char *argv[])
+{
+    int num = 100;
+    std::string facesrGraphPath = argv[1];
+    std::vector<std::string> graphPath = {facesrGraphPath};
+    int threads = atoi(argv[2]);
+
+    Flow flowExample;
+    flowExample.init(graphPath, inferencePrecision, AFFINITY_CPU_HIGH_PERFORMANCE, threads, false);
+    sleep(10);
+
+    for (int i = 0; i < num; i++) {
+        std::map<std::string, std::shared_ptr<Tensor>> data = inputOutput();
+        Task task(facesrGraphPath, data);
+        flowExample.enqueue(task);
+    }
+
+    std::vector<Task> results;
+    double start = ut_time_ms();
+    UNI_PROFILE(results = flowExample.dequeue(true), std::string("flow_facesr"),
+        std::string("flow_facesr"));
+    double end = ut_time_ms();
+    UNI_CI_LOG("avg_time:%fms/image\n", (end - start) / num);
+    return 0;
+}
diff --git a/inference/examples/facesr/flow_facesr.prototxt b/inference/examples/facesr/flow_facesr.prototxt
new file mode 100644
index 00000000..2cc56991
--- /dev/null
+++ b/inference/examples/facesr/flow_facesr.prototxt
@@ -0,0 +1,22 @@
+name: "facesr_flow"
+input: "geninput"
+output: "pixel_shuffle_final_out"
+node {
+    name: "geninput"
+    type: "Input"
+    output: "geninput"
+    input_type: "FLOAT16"
+    input_format: "NCHW"
+    input_dim: 1
+    input_dim: 64
+    input_dim: 48
+    input_dim: 48
+}
+node {
+    name: "facesr_inference"
+    type: "Inference"
+    input: "geninput"
+    output: "pixel_shuffle_final_out"
+    inference_parameter: "/data/local/tmp/CI/model_zoo/caffe_models/facesr/facesr_f16.bolt"
+    inference_parameter: "/data/local/tmp/CI/test"
+}
diff --git a/inference/examples/facesr/run.sh b/inference/examples/facesr/run.sh
new file mode 100644
index 00000000..433b46b8
--- /dev/null
+++ b/inference/examples/facesr/run.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+
+script_name=$0
+script_abs=$(readlink -f "$0")
+script_dir=$(dirname $script_abs)
+bolt_root=${script_dir}/../../..
+
+device=""
+arch="arm_gnu"
+device_dir=/data/local/tmp/CI/test
+model_zoo_dir=/data/local/tmp/CI/model_zoo
+
+print_help() {
+    cat <<EOF
+Usage: ${script_name} [OPTION]...
+  -h, --help                 display this help and exit.
+  -d, --device               which device to run on.
+  -a, --arch <arch>          use to set device architecture(default: arm_gnu).
+  -p, --path                 device test directory.
+  -m, --model_zoo            model zoo directory on the device.
+EOF
+    exit 1;
+}
+
+TEMP=`getopt -o d:a:p:m:h --long device:,arch:,path:,model_zoo:,help \
+     -n ${script_name} -- "$@"`
+if [ $? != 0 ] ; then echo "[ERROR] terminating..." >&2 ; exit 1 ; fi
+eval set -- "$TEMP"
+while true ; do
+    case "$1" in
+        -d|--device)
+            device=$2
+            echo "[INFO] run on '${device}'" ;
+            shift 2 ;;
+        -a|--arch)
+            arch=$2
+            echo "[INFO] device architecture ${arch}" ;
+            shift 2 ;;
+        -p|--path)
+            device_dir=$2
+            echo "[INFO] run in '${device_dir}'" ;
+            shift 2 ;;
+        -m|--model_zoo)
+            model_zoo_dir=$2
+            echo "[INFO] use model_zoo ${model_zoo_dir}" ;
+            shift 2 ;;
+        -h|--help)
+            print_help ;
+            shift ;;
+        --) shift ;
+            break ;;
+        *) echo "[ERROR]" ; exit 1 ;;
+    esac
+done
+
+echo "[WARNING] Please make sure ${model_zoo_dir} is valid to find all inference models"
+echo "[WARNING] Please make sure to use models in ${model_zoo_dir} in ${script_dir}/*.prototxt configure files"
+
+adb -s $device shell "mkdir ${device_dir}"
+adb -s $device push ${script_dir}/flow_facesr.prototxt $device_dir > /dev/null
+adb -s $device push ${bolt_root}/install_${arch}/examples/flow_facesr $device_dir > /dev/null
+adb -s $device push ${bolt_root}/install_${arch}/tools/X2bolt $device_dir > /dev/null
+adb -s $device shell "mkdir ${device_dir}/lib"
+for file in `ls ${bolt_root}/install_${arch}/lib/*.so`
+do
+    adb -s ${device} push ${file} ${device_dir}/lib > /dev/null
+done
+
+# prepare inference models
+adb -s $device shell "export LD_LIBRARY_PATH=${device_dir}/lib && ${device_dir}/X2bolt -d ${model_zoo_dir}/caffe_models/facesr -m facesr -i FP16"
+
+# inference
+adb -s $device shell "export LD_LIBRARY_PATH=${device_dir}/lib && cd ${device_dir} && ./flow_facesr ./flow_facesr.prototxt 1 || echo '[FAILURE]'" | tee status.txt
+
+# clean work directory
+adb -s $device shell "rm -rf ${device_dir}"
+
+cat status.txt || exit 1
+if [ `grep -c "\[FAILURE\]" status.txt` -ne '0' ] ; then
+    exit 1
+fi
+rm status.txt
diff --git a/inference/examples/high_dynamic_range/hdr.cpp b/inference/examples/high_dynamic_range/hdr.cpp
new file mode 100644
index 00000000..b35b4f96
--- /dev/null
+++ b/inference/examples/high_dynamic_range/hdr.cpp
@@ -0,0 +1,455 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_FP16 +#include +#include +#include "ut_util.h" +#include "types.h" +#include "tensor_desc.h" +#include "sequential_ocl.hpp" +#include "factory.hpp" +#include "ocl/factory_ocl.hpp" +#include "tensor.hpp" +#include "data_loader.hpp" + +void print_help() +{ + std::cout << "please set argvs: " << std::endl; + std::cout << "usage: argv[1]: in" << std::endl; + std::cout << "usage: argv[2]: ic" << std::endl; + std::cout << "usage: argv[3]: ih" << std::endl; + std::cout << "usage: argv[4]: iw" << std::endl; + std::cout << "usage: argv[5]: dt" << std::endl; +} + +inline void calWeight(F16 *para) +{ + float ccm[3][3] = {{0.900616, -0.079311, -0.068347}, {-0.100600, 0.919760, -0.069032}, + {-0.058384, -0.037624, 0.975032}}; + float ccm_bias[3] = {0.036360, 0.062180, 0.064861}; + float shifts[3] = {-0.036361, -0.062179, -0.064860}; + float slopes[3] = {0.003211, 0.007948, 0.046259}; + float cmix[3] = {0.249512, 0.274577, 0.324276}; + float cmix_bias = 0.078941; + float x[3]; + for (int i = 0; i < 3; ++i) { + x[i] = (ccm_bias[i] - shifts[i]) * slopes[i] * 16; + } + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + ccm[i][j] = ccm[i][j] * slopes[i] * 16; + } + } + for (int i = 0; i < 3; i++) { + para[i] = (F16)(ccm[i][0] * cmix[0] + ccm[i][1] * cmix[1] + ccm[i][2] * cmix[2]); + } + para[3] = (F16)(x[0] * cmix[0] + x[1] * cmix[1] + x[2] * cmix[2] + cmix_bias); +} + +template +inline void calGuide( + const int w, const int h, const int c, F16 *para, T *input, F16 *guide, std::string DATA_DT) +{ + float in_val[3]; + for (int i = 0; i < h; i++) { + for (int j = 0; j < w; j++) { + if (DATA_DT == "UCHAR") { + in_val[0] = input[c * (j + w * i)] / 256.0; + in_val[1] = input[c * (j + w * i) + 1] / 256.0; + in_val[2] = input[c * (j + w * i) + 2] / 256.0; + } else { + in_val[0] = input[c * (j + w * i)]; + in_val[1] = input[c * (j + w * i) + 1]; + in_val[2] = input[c * (j + w * i) + 2]; + } + guide[j + w * i] = + in_val[0] * para[0] + in_val[1] * para[1] + in_val[2] * para[2] + para[3]; + } + } +} +template +inline void bilateralSliceApply(const int w, + const int h, + const int gw, + const int gh, + const int gd, + const int input_chans, + const int output_chans, + const bool has_offset, + F16 *grid, + F16 *guide, + T *input, + T *output, + std::string DATA_DT) +{ + int grid_chans = input_chans * output_chans; + int coeff_stride = input_chans; + if (has_offset) { + grid_chans += output_chans; + coeff_stride += 1; + } + int sz = grid_chans; + int sx = grid_chans * gd; + int sy = grid_chans * gd * gw; + + float in_val[3]; + float out_val[3]; + for (int y = 0; y < h; ++y) { + float gy = (y + 0.5f) * gh / (1.0f * h); + int fy = static_cast(floor(gy - 0.5)); + for (int x = 0; x < w; ++x) { + float gx = (x + 0.5f) * gw / (1.0f * w); + float gz = guide[x + w * y] * gd; + int fx = static_cast(floor(gx - 0.5f)); + int fz = static_cast(floor(gz - 0.5f)); + float coeff_sample[12] = {0.0f}; + for (int xx = fx; xx < fx + 2; ++xx) { + int x_ = UNI_MAX(UNI_MIN(xx, gw - 1), 0); + float wx = fmax(1.0f - fabs(xx + 0.5 - gx), 0.0f); + 
for (int yy = fy; yy < fy + 2; ++yy) { + int y_ = UNI_MAX(UNI_MIN(yy, gh - 1), 0); + float wy = fmax(1.0f - fabs(yy + 0.5 - gy), 0.0f); + for (int zz = fz; zz < fz + 2; ++zz) { + int z_ = UNI_MAX(UNI_MIN(zz, gd - 1), 0); + float wz = fmax(1.0f - fabs(zz + 0.5 - gz), 0.0f); + for (int in_c = 0; in_c < grid_chans; ++in_c) { + int grid_idx = in_c + sz * z_ + sx * x_ + sy * y_; + coeff_sample[in_c] += grid[grid_idx] * wx * wy * wz; + } + } + } + } + if (DATA_DT == "UCHAR") { + in_val[0] = input[input_chans * (x + w * y)] / 256.0; + in_val[1] = input[input_chans * (x + w * y) + 1] / 256.0; + in_val[2] = input[input_chans * (x + w * y) + 2] / 256.0; + } else { + in_val[0] = input[input_chans * (x + w * y)]; + in_val[1] = input[input_chans * (x + w * y) + 1]; + in_val[2] = input[input_chans * (x + w * y) + 2]; + } + + if (has_offset) { + out_val[0] = in_val[0] * coeff_sample[0] + in_val[1] * coeff_sample[1] + + in_val[2] * coeff_sample[2] + coeff_sample[3]; + out_val[1] = in_val[0] * coeff_sample[4] + in_val[1] * coeff_sample[5] + + in_val[2] * coeff_sample[6] + coeff_sample[7]; + out_val[2] = in_val[0] * coeff_sample[8] + in_val[1] * coeff_sample[9] + + in_val[2] * coeff_sample[10] + coeff_sample[11]; + } else { + out_val[0] = in_val[0] * coeff_sample[0] + in_val[1] * coeff_sample[1] + + in_val[2] * coeff_sample[2]; + out_val[1] = in_val[0] * coeff_sample[3] + in_val[1] * coeff_sample[4] + + in_val[2] * coeff_sample[5]; + out_val[2] = in_val[0] * coeff_sample[6] + in_val[1] * coeff_sample[7] + + in_val[2] * coeff_sample[8]; + } + + if (DATA_DT == "UCHAR") { + output[input_chans * (x + w * y)] = (U8)(out_val[0] * 256.0); + output[input_chans * (x + w * y) + 1] = (U8)(out_val[1] * 256.0); + output[input_chans * (x + w * y) + 2] = (U8)(out_val[2] * 256.0); + } else { + output[input_chans * (x + w * y)] = out_val[0]; + output[input_chans * (x + w * y) + 1] = out_val[1]; + output[input_chans * (x + w * y) + 2] = out_val[2]; + } + } + } +} + +template +void HDR_CPU(const int w, + const int h, + const int gw, + const int gh, + const int gd, + const int input_chans, + const int output_chans, + const bool has_offset, + F16 *grid, + T *input, + T *output, + std::string DATA_DT) +{ + U8 *guideptr = (U8 *)operator new(w *h *bytesOf(DT_F16)); + F16 *guide = (F16 *)guideptr; + F16 para[4]; + calWeight(para); + calGuide(w, h, input_chans, para, input, guide, DATA_DT); + bilateralSliceApply(w, h, gw, gh, gd, input_chans, output_chans, has_offset, grid, guide, + input, output, DATA_DT); +} + +template +void buildInputTensor(DataType dt, + DataFormat df, + U32 n, + U32 c, + U32 h, + U32 w, + std::vector *dims, + std::vector *inputTensors) +{ + TensorDesc inputDesc = tensor4df(dt, df, n, c, h, w); + U32 inputNum = tensorNumElements(inputDesc); + U32 inputSize = tensorNumBytes(inputDesc); + U8 *inputVal = new U8[inputSize]; + + T *data = (T *)inputVal; + if (dt == DT_F16) { + for (U32 i = 0; i < inputNum; i++) { + data[i] = (T)(rand() & 255) / (256.0); + } + } + if (dt == DT_U8) { + for (U32 i = 0; i < inputNum; i++) { + data[i] = (T)(rand() & 255); + } + } + std::shared_ptr inputTensor = std::shared_ptr(new Tensor()); + inputTensor->resize(inputDesc); + auto mem = (CpuMemory *)inputTensor->get_memory(); + mem->set_shared_ptr(std::shared_ptr(inputVal)); + + dims->push_back(inputDesc); + inputTensors->push_back(*inputTensor.get()); +} + +int main(int argc, char *argv[]) +{ + if (argc != 6 && argc != 5) { + printf("%d\n", argc); + print_help(); + return 0; + } + std::string INPUT_DT = "F16"; + U32 in = atoi(argv[1]); 
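+    // argv[1..4] give the input tensor shape (batch in, channel ic, height ih, width iw);
+    // an optional argv[5] of "UCHAR" switches the test to 8-bit image data.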
+ U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + if (argc == 6) { + INPUT_DT = argv[5]; + } + U32 gw = 16; + U32 gh = 16; + U32 gc = 96; + U32 gd = 8; + U32 coe = gc / gd; + bool has_offset = true; + + AffinityPolicy affinityPolicy = AFFINITY_GPU; + DataType dt = DT_F16; + auto model = new SequentialOcl(affinityPolicy, dt, "OT_BilateralSliceApply"); + std::shared_ptr model_ptr = std::shared_ptr(model); + + Factory *factory_ocl = (Factory *)(new FactoryOCL()); + std::shared_ptr factory; + factory = std::shared_ptr(factory_ocl); + + BilateralSliceApplyMode mode = BSliceApply_CONV; + BilateralSliceApplyParamSpec p; + p.coefficient_len = coe; + p.mode = mode; + p.has_offset = has_offset; + auto op = factory->createBilateralSliceApply(p); + model_ptr->add(op); + + std::vector dims; + std::vector inputTensors; + if (INPUT_DT == "UCHAR") { + buildInputTensor(DT_U8, DF_NHWC, in, ic, ih, iw, &dims, &inputTensors); + } else { + buildInputTensor(dt, DF_NHWC, in, ic, ih, iw, &dims, &inputTensors); + } + buildInputTensor(dt, DF_NHWC, 1, gc, gh, gw, &dims, &inputTensors); // grid + + F16 *grid_val = (F16 *)((CpuMemory *)(inputTensors[1].get_memory()))->get_ptr(); + for (U32 i = 0; i < tensorNumElements(dims[1]); i++) { + grid_val[i] = grid_val[i] / 8.0; + } + U8 *input = new U8[tensorNumBytes(dims[0])]; + U8 *grid = new U8[tensorNumBytes(dims[1])]; + memcpy((void *)input, ((CpuMemory *)(inputTensors[0].get_memory()))->get_ptr(), + tensorNumBytes(dims[0])); + memcpy((void *)grid, ((CpuMemory *)(inputTensors[1].get_memory()))->get_ptr(), + tensorNumBytes(dims[1])); + // model_ptr->loadAlgorithmMapFromText("./"); + model_ptr->ready(dims, NULL, 1); + model_ptr->mark_input_output(); + // model_ptr->saveAlgorithmMapToText("./"); + + double totalTime = 0; + double max_time = -DBL_MAX; + double min_time = DBL_MAX; + int loop = 10; + U8 *ocl_res = NULL; + std::vector> ocl_output; + for (int i = 0; i < loop; i++) { + double timeBegin = ut_time_ms(); + model_ptr->set_input_tensors(inputTensors); + model_ptr->run(); + ocl_output = model_ptr->get_output_tensors(); + auto oclMem = (OclMemory *)ocl_output[0]->get_memory(); + ocl_res = (U8 *)oclMem->get_mapped_ptr(); + double timeEnd = ut_time_ms(); + double t = timeEnd - timeBegin; + totalTime += t; + if (max_time < t) { + max_time = t; + } + if (min_time > t) { + min_time = t; + } + } + + int e0, e1, e2, e3, e4, e5, e6; + e0 = 0; + e1 = 0; + e2 = 0; + e3 = 0; + e4 = 0; + e5 = 0; + e6 = 0; + float maxrel = 0; + float maxabs = 0; + if (INPUT_DT == "UCHAR") { + U8 *output = new U8[iw * ih * ic * sizeof(U8)]; + HDR_CPU(iw, ih, gw, gh, gd, ic, ic, has_offset, (F16 *)grid, input, output, INPUT_DT); + for (U32 i = 0; i < ih; i++) { + for (U32 j = 0; j < iw; j++) { + U8 c, g; + int d; + int index = (i * iw + j) * 3; + for (int k = 0; k < 3; k++) { + c = output[index + k]; + g = ocl_res[index + k]; + d = c - g; + if (d < 0) { + d = -d; + } + maxabs = ((float)d > maxabs) ? (float)d : maxabs; + maxrel = ((float)d * 2 / (c + g + 0.000001) > maxrel) + ? 
(float)d * 2 / (c + g + 0.000001) + : maxrel; + if (d >= 30) { + e0++; + continue; + } + if (d >= 20) { + e1++; + continue; + } + if (d >= 10) { + e2++; + continue; + } + if (d >= 5) { + e3++; + continue; + } + if (d >= 2) { + e4++; + continue; + } + if (d >= 1) { + e5++; + continue; + } + e6++; + } + } + } + std::cout << " abs(diff) >=30 number = " << e0 << std::endl; + std::cout << "20 <= abs(diff) < 30 number = " << e1 << std::endl; + std::cout << "10 <= abs(diff) < 20 number = " << e2 << std::endl; + std::cout << "5 <= abs(diff) < 10 number = " << e3 << std::endl; + std::cout << "2 <= abs(diff) < 5 number = " << e4 << std::endl; + std::cout << "1 <= abs(diff) < 2 number = " << e5 << std::endl; + std::cout << "0 <= abs(diff) < 1 number = " << e6 << std::endl; + std::cout << "maxabs = " << maxabs << std::endl; + std::cout << "maxrel = " << maxrel << std::endl; + delete[] output; + } else { + U8 *output = new U8[iw * ih * ic * sizeof(F16)]; + HDR_CPU(iw, ih, gw, gh, gd, ic, ic, has_offset, (F16 *)grid, (F16 *)input, + (F16 *)output, INPUT_DT); + F16 *cpu_res = (F16 *)output; + F16 *gpu_res = (F16 *)ocl_res; + for (U32 i = 0; i < ih; i++) { + for (U32 j = 0; j < iw; j++) { + float c, g, d; + int index = (i * iw + j) * 3; + for (int k = 0; k < 3; k++) { + c = cpu_res[index + k]; + g = gpu_res[index + k]; + d = c - g; + if (d < 0) { + d = -d; + } + maxabs = ((float)d > maxabs) ? (float)d : maxabs; + maxrel = ((float)d * 2 / (c + g + 0.000001) > maxrel) + ? (float)d * 2 / (c + g + 0.000001) + : maxrel; + if (d >= 1) { + e0++; + continue; + } + if (d >= 0.1) { + e1++; + continue; + } + if (d >= 0.01) { + e2++; + continue; + } + if (d >= 0.001) { + e3++; + continue; + } + if (d >= 0.0001) { + e4++; + continue; + } + if (d >= 0.00001) { + e5++; + continue; + } + e6++; + } + } + } + std::cout << " abs(diff) >=1 number = " << e0 << std::endl; + std::cout << "0.1 <= abs(diff) < 1 number = " << e1 << std::endl; + std::cout << "0.01 <= abs(diff) < 0.1 number = " << e2 << std::endl; + std::cout << "0.001 <= abs(diff) < 0.01 number = " << e3 << std::endl; + std::cout << "0.0001 <= abs(diff) < 0.001 number = " << e4 << std::endl; + std::cout << "0.00001 <= abs(diff) < 0.0001 number = " << e5 << std::endl; + std::cout << "0 <= abs(diff) < 0.00001 number = " << e6 << std::endl; + std::cout << "maxabs = " << maxabs << std::endl; + std::cout << "maxrel = " << maxrel << std::endl; + + delete[] output; + } + + printf("avg_time: %lf ms\n", 1.0 * totalTime / loop); + printf("max_time: %lf ms\n", 1.0 * max_time); + printf("min_time: %lf ms\n", 1.0 * min_time); + UNI_TIME_STATISTICS + + delete[] input; + delete[] grid; + return 0; +} +#endif diff --git a/inference/examples/image_classification/classification.cpp b/inference/examples/image_classification/classification.cpp new file mode 100644 index 00000000..556c5c51 --- /dev/null +++ b/inference/examples/image_classification/classification.cpp @@ -0,0 +1,240 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include +#include +#include "inference.hpp" +#include "tensor.hpp" +#include "data_loader.hpp" +#include "result_format.hpp" +#include "profiling.h" +#include "parse_command.h" + +int main(int argc, char *argv[]) +{ + UNI_TIME_INIT + ParseRes parse_res; + parseCommandLine(argc, argv, &parse_res, "examples"); + + char *modelPath = (char *)""; + char *imageDir = (char *)""; + char *affinityPolicyName = (char *)"CPU_AFFINITY_HIGH_PERFORMANCE"; + char *algorithmMapPath = (char *)""; + ImageFormat imageFormat = RGB; + F32 scaleValue = 1; + int topK = 5; + int category = -1; + int loopTime = 1; + if (!parse_res.model.second) { + exit(-1); + } + if (parse_res.model.second) { + modelPath = parse_res.model.first; + } + if (parse_res.inputPath.second) { + imageDir = parse_res.inputPath.first; + } + if (parse_res.archInfo.second) { + affinityPolicyName = parse_res.archInfo.first; + } + if (parse_res.algoPath.second) { + algorithmMapPath = parse_res.algoPath.first; + } + if (parse_res.imageFormat.second) { + imageFormat = parse_res.imageFormat.first; + } + if (parse_res.scaleValue.second) { + scaleValue = parse_res.scaleValue.first; + } + if (parse_res.topK.second) { + topK = parse_res.topK.first; + } + if (parse_res.loopTime.second) { + loopTime = parse_res.loopTime.first; + } + if (parse_res.correctLable.second) { + category = parse_res.correctLable.first; + } + char *inputBinName = nullptr; + if (parse_res.readInputBinName.second) { + inputBinName = parse_res.readInputBinName.first; + } + char *outputBinName = nullptr; + if (parse_res.writeOutputBinName.second) { + outputBinName = parse_res.writeOutputBinName.first; + } + + double timeBegin = ut_time_ms(); + auto cnn = createPipeline(affinityPolicyName, modelPath, algorithmMapPath); + double timeEnd = ut_time_ms(); + std::cout << "Prepare time = " << timeEnd - timeBegin << "ms" << std::endl; + + // load images + std::vector> images; + std::vector imageDescs; + std::vector imagePaths; + U8 *inputBinPtr = nullptr; +#ifdef _USE_FP16 + if (parse_res.readInputBinName.second) { + U32 size = getBinFileSize(imageDir, inputBinName); + inputBinPtr = new U8[size]; + readF32BinToF16((F16 *)inputBinPtr, size / bytesOf(DT_F32), imageDir, inputBinName); + } else { +#endif + std::map> inMap = cnn->get_inputs(); + for (auto p : inMap) { + TensorDesc imageDesc = (*(p.second)).get_desc(); + imageDescs.push_back(imageDesc); + } + imagePaths = load_image_with_scale(imageDir, imageDescs, &images, imageFormat, 
scaleValue); +#ifdef _USE_FP16 + } +#endif + + std::map categoryNum; + double totalTime = 0; + double max_time = -DBL_MAX; + double min_time = DBL_MAX; + U32 imageIndex = 0; + std::cout << "[RESULT]:" << std::endl; + int top1Index = 0; + int top1Match = 0; + int topKMatch = 0; + U32 count = (images.size() > 0) ? images.size() : 1; + UNI_INFO_LOG("WARM UP\n"); + for (int i = 0; i < 2; i++) { + cnn->run(); + } + cnn->saveAlgorithmMapToText(algorithmMapPath); +#ifdef _USE_MALI + if (strcmp(affinityPolicyName, "GPU") == 0) { + gcl_finish(OCLContext::getInstance().handle.get()); + } +#endif + UNI_INFO_LOG("RUN\n"); + for (imageIndex = 0; imageIndex < count;) { + // stage3: set input + std::map> outMap; + double loop_max_time = -DBL_MAX; + double loop_min_time = DBL_MAX; + double loop_total_time = 0; + + U8 *res = nullptr; + TensorDesc resDesc; + for (int i = 0; i < loopTime; i++) { + timeBegin = ut_time_ms(); + auto curModelInputTensorNames = cnn->get_model_input_tensor_names(); + for (int index = 0; index < (int)curModelInputTensorNames.size(); index++) { + U8 *inputPtr; + if (inputBinPtr) { + inputPtr = inputBinPtr; + } else { + std::cout << imagePaths[imageIndex] << " : " << std::endl; + std::vector image = images[imageIndex]; + inputPtr = (U8 *)((CpuMemory *)image[index].get_memory())->get_ptr(); + } + cnn->copy_to_named_input(curModelInputTensorNames[index], inputPtr); + } + // stage4: run + cnn->run(); + + // stage5: process result + outMap = cnn->get_outputs(); + Tensor result = *(outMap.begin()->second); + auto mem = result.get_memory(); + if (mem->get_mem_type() == OCLMem) { +#ifdef _USE_MALI + res = (U8 *)((OclMemory *)mem)->get_mapped_ptr(); +#endif + } else { + res = (U8 *)((CpuMemory *)mem)->get_ptr(); + } + resDesc = result.get_desc(); + timeEnd = ut_time_ms(); + double time = (timeEnd - timeBegin); + loop_total_time += time; + if (time < loop_min_time) { + loop_min_time = time; + } + if (time > loop_max_time) { + loop_max_time = time; + } + } + totalTime += (loop_total_time) / loopTime; + if (loopTime > 1) { + UNI_CI_LOG("loop %d times for set_input + run + get_output\n", loopTime); + UNI_CI_LOG("avg_time:%fms/loop\n", loop_total_time / loopTime); + UNI_CI_LOG("max_time:%fms/loop\n", loop_max_time); + UNI_CI_LOG("min_time:%fms/loop\n", loop_min_time); + } + +#ifdef _USE_FP16 + if (parse_res.writeOutputBinName.second) { + U32 num = tensorNumElements(resDesc); + CI8 *dataName = outputBinName; + writeF16ToF32Bin((F16 *)res, num, imageDir, dataName); + } +#endif + + std::vector topKResult = topK_index(res, resDesc, topK); + top1Index = topKResult[0]; + if (category != -1) { + if (top1Index == category) { + top1Match++; + } + for (int i = 0; i < topK; i++) { + if (topKResult[i] == category) { + topKMatch++; + break; + } + } + for (int i = 0; i < topK; i++) { + std::cout << topKResult[i] << " "; + } + std::cout << std::endl; + } + + if ((timeEnd - timeBegin) >= max_time) { + max_time = (timeEnd - timeBegin); + } + + if ((timeEnd - timeBegin) <= min_time) { + min_time = (timeEnd - timeBegin); + } + if (categoryNum.count(top1Index) == 0) { + categoryNum[top1Index] = 1; + } else { + categoryNum[top1Index] = categoryNum[top1Index] + 1; + } + imageIndex++; + } + + UNI_TIME_STATISTICS + + std::cout << "[CATEGORY]:" << std::endl; + std::cout << "category\tnum" << std::endl; + for (auto elem : categoryNum) { + std::cout << elem.first << "\t" << elem.second << std::endl; + } + std::cout << "[SUMMARY]:" << std::endl; + UNI_CI_LOG("top1:%f\n", 1.0 * top1Match / imageIndex); + 
UNI_CI_LOG("top%d:%f\n", topK, 1.0 * topKMatch / imageIndex); + UNI_CI_LOG("avg_time:%fms/image\n", 1.0 * totalTime / imageIndex); + UNI_CI_LOG("max_time:%fms/image\n", 1.0 * max_time); + UNI_CI_LOG("min_time:%fms/image\n", 1.0 * min_time); + if (inputBinPtr) { + delete[] inputBinPtr; + } + return 0; +} diff --git a/inference/examples/java_api/test_api_java.java b/inference/examples/java_api/test_api_java.java new file mode 100644 index 00000000..e622f44c --- /dev/null +++ b/inference/examples/java_api/test_api_java.java @@ -0,0 +1,707 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; + +public final class test_api_java { + public static float[] readSequenceDataFromFile(String pathName, int lineNumber) + { + float[] array = {}; + try + (FileReader reader = new FileReader(pathName); + BufferedReader br = new BufferedReader(reader)) + { + String line; + int lineIndex = 0; + while ((line = br.readLine()) != null) { + if (lineIndex == lineNumber) { + String[] strArray = line.split(" "); + int arraySize = Integer.valueOf(strArray[0]); + array = new float[arraySize]; + for (int i = 0; i < arraySize; i++) { + array[i] = Float.valueOf(strArray[1 + i]); + } + } else { + lineIndex++; + } + } + } + catch (IOException e) { + e.printStackTrace(); + } + return array; + } + + public static double getMillisTime() + { + return System.nanoTime() / 1000.0 / 1000.0; + } + + public static void verify(int[] arrayA, int[] arrayB, int length) + { + for (int j = 0; j < length; j++) { + if (arrayA[j] != arrayB[j]) { + System.err.println("[ERROR] verify failed " + j + " @ " + arrayA[j] + " " + + arrayB[j] + ", in Java API test"); + System.exit(1); + } + } + } + + public static void verify(float[] arrayA, float[] arrayB, int length, float threshold) + { + for (int j = 0; j < arrayA.length; j++) { + if (Math.abs(arrayA[j] - arrayB[j]) > threshold) { + System.err.println("[ERROR] verify failed " + j + " @ " + arrayA[j] + " " + + arrayB[j] + ", in Java API test"); + System.exit(1); + } + } + } + + public static int verify(float[][] arrayA, float[][] arrayB, int[][] dimensions, float threshold) + { + if (arrayA.length != arrayB.length || arrayA.length != dimensions.length) { + System.err.println("[ERROR] unmatch data to verify, in Java API test"); + System.exit(1); + } + + int sum = 0; + for (int i = 0; i < dimensions.length; i++) { + int length = 
BoltResult.calculateLength(dimensions[i]); + verify(arrayA[i], arrayB[i], length, threshold); + sum += length; + } + return sum; + } + + public static int top1(float[] array, int offset, int length) + { + int maxIndex = offset; + for (int i = offset + 1; i < offset + length; i++) { + if (array[i] > array[maxIndex]) { + maxIndex = i; + } + } + return maxIndex; + } + + public static void tinybertIntentSlot(String outputPrefix, AffinityType affinity, String modelPath) + { + int inputNum = 3; + int outputNum = 2; + String[] inputName = {"tinybert_words", "tinybert_positions", "tinybert_token_type"}; + String[] outputName = {"intent_softmax", "slot_softmax"}; + int[] inputN = {1, 1, 1}; + int[] inputCMax = {32, 32, 32}; + int[] inputH = {1, 1, 1}; + int[] inputW = {1, 1, 1}; + DataType[] intputDataType = {DataType.UINT32, DataType.UINT32, DataType.UINT32}; + DataFormat[] intputDataFormat = {DataFormat.NORMAL, DataFormat.NORMAL, DataFormat.NORMAL}; + BoltModel boltModel = new BoltModel(modelPath, affinity, inputNum, inputName, inputN, + inputCMax, inputH, inputW, intputDataType, intputDataFormat, outputNum, outputName); + + int[] inputCActual = {9, 9, 9}; + float[][] inputData = {{101, 2224, 8224, 7341, 2000, 22149, 2000, 2899, 102}, + {0, 1, 2, 3, 4, 5, 6, 7, 8}, {0, 0, 0, 0, 0, 0, 0, 0, 0}}; + float[][] resultData = {{22, 0.999023f}, {44, 44, 1, 23, 44, 44, 44, 8, 44}}; + + double startTime = getMillisTime(); + BoltResult boltResult = boltModel.run(inputNum, inputName, inputN, inputCActual, inputH, + inputW, intputDataType, intputDataFormat, inputData); + if (null == boltResult) { + System.err.println("[ERROR] modelAddr is 0 in Java API test"); + boltModel.destructor(); + System.exit(1); + } + double endTime = getMillisTime(); + System.out.println(outputPrefix + boltModel.affinityMapping(affinity) + ", tinybert " + + String.format("%.3f", endTime - startTime) + " ms/sequence, model " + modelPath); + float[][] result = boltResult.getResultData(); + int[][] dimension = boltResult.getResultDimension(); + int intentIndex = top1(result[0], 0, result[0].length); + float[][] finalResult = new float[2][dimension[1][1]]; + finalResult[0][0] = intentIndex; + finalResult[0][1] = result[0][intentIndex]; + for (int i = 0; i < dimension[1][1]; i++) { + finalResult[1][i] = + top1(result[1], i * dimension[1][2], dimension[1][2]) - i * dimension[1][2]; + } + int[][] finalDimension = {{1, 2}, {1, dimension[1][1]}}; + int length = verify(resultData, finalResult, finalDimension, 0.1f); + if (length == 0) { + System.err.println("[ERROR] verify null data in tinybert, in Java API test"); + System.exit(1); + } + + // model destroy + boltModel.destructor(); + } + + public static void tinybertDisambiguate( + String outputPrefix, AffinityType affinity, String modelPath, DataType dt) + { + int inputNum = 5; + int outputNum = 1; + String[] inputName = {"tinybert_words", "tinybert_positions", "tinybert_token_type", + "tinybert_words_mask", "tinybert_dict_type"}; + String[] outputName = {"slot_softmax"}; + int[] inputN = {1, 1, 1, 1, 1}; + int[] inputCMax = {32, 32, 32, 511, 511}; + int[] inputH = {1, 1, 1, 32, 1}; + int[] inputW = {1, 1, 1, 1, 1}; + DataType[] intputDataType = { + DataType.UINT32, DataType.UINT32, DataType.UINT32, dt, DataType.UINT32}; + DataFormat[] intputDataFormat = {DataFormat.NORMAL, DataFormat.NORMAL, DataFormat.NORMAL, + DataFormat.MTK, DataFormat.NORMAL}; + BoltModel boltModel = new BoltModel(modelPath, affinity, inputNum, inputName, inputN, + inputCMax, inputH, inputW, intputDataType, 
intputDataFormat, outputNum, outputName); + + int[] inputCActual = {27, 27, 27, 1, 1}; + float[][] inputData = { + {101, 3017, 5164, 678, 5341, 5686, 5688, 4680, 5564, 6577, 1920, 1104, 2773, 5018, 671, + 2108, 2001, 3813, 3924, 2193, 4028, 3330, 3247, 712, 2898, 4638, 102}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0}, {5}}; + float[][] resultData = {{0.796903967857f, 0.203096017241f}}; + + double startTime = getMillisTime(); + BoltResult boltResult = boltModel.run(inputNum, inputName, inputN, inputCActual, inputH, + inputW, intputDataType, intputDataFormat, inputData); + if (null == boltResult) { + System.err.println("[ERROR] modelAddr is 0 in Java API test"); + boltModel.destructor(); + System.exit(1); + } + double endTime = getMillisTime(); + System.out.println(outputPrefix + boltModel.affinityMapping(affinity) + ", tinybert " + + String.format("%.3f", endTime - startTime) + " ms/sequence, model " + modelPath); + float[][] result = boltResult.getResultData(); + int[][] dimension = boltResult.getResultDimension(); + int length = verify(resultData, result, dimension, 0.1f); + if (length == 0) { + System.err.println("[ERROR] verify null data in tinybert, in Java API test"); + System.exit(1); + } + + // model destroy + boltModel.destructor(); + } + + public static void nmt(String outputPrefix, AffinityType affinity, String modelPath) + { + int inputNum = 2; + int outputNum = 1; + String[] inputName = {"nmt_words", "nmt_positions"}; + String[] outputName = {"decoder_output"}; + int[] inputN = {1, 1}; + int[] inputCMax = {128, 128}; + int[] inputH = {1, 1}; + int[] inputW = {1, 1}; + DataType[] intputDataType = {DataType.UINT32, DataType.UINT32, DataType.UINT32}; + DataFormat[] intputDataFormat = {DataFormat.NORMAL, DataFormat.NORMAL, DataFormat.NORMAL}; + BoltModel boltModel = new BoltModel(modelPath, affinity, inputNum, inputName, inputN, + inputCMax, inputH, inputW, intputDataType, intputDataFormat, outputNum, outputName); + + int[] inputCActual = {28, 28}; + float[][] inputData = { + {1977, 1788, 2061, 3911, 248, 734, 1330, 1111, 1307, 729, 411, 383, 101, 713, 5640, 627, + 1330, 37, 282, 352, 438, 94, 1111, 729, 1103, 72, 133, 2}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27}}; + float[][] resultData = {{7456, 40, 1788, 2061, 3911, 248, 734, 140, 4667, 1307, 5365, 411, + 383, 1244, 206, 2669, 5640, 627, 50, 236, 37, 63, 48, 352, 94, 4667, 53, 287, 1763, 72, + 133, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}; + + double startTime = getMillisTime(); + BoltResult boltResult = boltModel.run(inputNum, inputName, inputN, inputCActual, inputH, + inputW, intputDataType, intputDataFormat, inputData); + if (null == boltResult) { + System.err.println("[ERROR] modelAddr is 0 in Java API test"); + boltModel.destructor(); + System.exit(1); + } + double endTime = getMillisTime(); + System.out.println(outputPrefix + boltModel.affinityMapping(affinity) + + ", machine translation " + String.format("%.3f", endTime - startTime) + + " ms/sequence, model " + modelPath); + int 
length = + verify(resultData, boltResult.getResultData(), boltResult.getResultDimension(), 0); + if (length == 0) { + System.err.println("[ERROR] verify null data in machine translation, in Java API test"); + System.exit(1); + } + + // model destroy + boltModel.destructor(); + } + + public static void nmtTSC(String outputPrefix, + AffinityType affinity, + DataType dataType, + String encoderModelPath, + String decoderModelPath) + { + int encoderInputNum = 2; + String[] encoderInputNames = {"encoder_words", "encoder_positions"}; + int[] encoderNs = {1, 1}; + int[] encoderCMaxs = {128, 128}; + int[] encoderHs = {1, 1}; + int[] encoderWs = {1, 1}; + DataType[] encoderDataTypes = {DataType.UINT32, DataType.UINT32, DataType.UINT32}; + DataFormat[] encoderDataFormats = {DataFormat.NORMAL, DataFormat.NORMAL, DataFormat.NORMAL}; + BoltModel encoderModel = new BoltModel(encoderModelPath, affinity, encoderInputNum, + encoderInputNames, encoderNs, encoderCMaxs, encoderHs, encoderWs, encoderDataTypes, + encoderDataFormats); + + int[] encoderCActs = {4, 4}; + float[][] encoderInputData = {{13024, 1657, 35399, 0}, {0, 1, 2, 3}}; + int[] result = {6160, 3057, 113, 157, 0}; + + double startTime = getMillisTime(); + BoltResult encoderResult = encoderModel.run(encoderInputNum, encoderInputNames, encoderNs, + encoderCActs, encoderHs, encoderWs, encoderDataTypes, encoderDataFormats, + encoderInputData); + if (null == encoderResult) { + System.err.println("[ERROR] modelAddr is 0 in Java API test"); + encoderModel.destructor(); + System.exit(1); + } + double endTime = getMillisTime(); + double encoderTime = endTime - startTime; + + int decoderInputNum = 26; + int decoderOutputNum = 13; + int maxDecodeLength = 128; + String[] decoderInputNames = {"decoder_words", "decoder_positions", + "decoder_layer0_multihead_k", "decoder_layer0_multihead_v", "decoder_layer1_multihead_k", + "decoder_layer1_multihead_v", "decoder_layer2_multihead_k", "decoder_layer2_multihead_v", + "decoder_layer3_multihead_k", "decoder_layer3_multihead_v", "decoder_layer4_multihead_k", + "decoder_layer4_multihead_v", "decoder_layer5_multihead_k", "decoder_layer5_multihead_v", + "decoder_layer0_kmem", "decoder_layer0_vmem", "decoder_layer1_kmem", + "decoder_layer1_vmem", "decoder_layer2_kmem", "decoder_layer2_vmem", + "decoder_layer3_kmem", "decoder_layer3_vmem", "decoder_layer4_kmem", + "decoder_layer4_vmem", "decoder_layer5_kmem", "decoder_layer5_vmem"}; + String[] decoderOutputNames = { + "transformer_decoder_embedding_argmax", + "transformer_decoder_layer_0_self_attention_multihead_k_cache", + "transformer_decoder_layer_0_self_attention_multihead_v_cache", + "transformer_decoder_layer_1_self_attention_multihead_k_cache", + "transformer_decoder_layer_1_self_attention_multihead_v_cache", + "transformer_decoder_layer_2_self_attention_multihead_k_cache", + "transformer_decoder_layer_2_self_attention_multihead_v_cache", + "transformer_decoder_layer_3_self_attention_multihead_k_cache", + "transformer_decoder_layer_3_self_attention_multihead_v_cache", + "transformer_decoder_layer_4_self_attention_multihead_k_cache", + "transformer_decoder_layer_4_self_attention_multihead_v_cache", + "transformer_decoder_layer_5_self_attention_multihead_k_cache", + "transformer_decoder_layer_5_self_attention_multihead_v_cache", + }; + int[] decoderNs = new int[decoderInputNum]; + int[] decoderCMaxs = new int[decoderInputNum]; + int[] decoderHs = new int[decoderInputNum]; + int[] decoderWs = new int[decoderInputNum]; + DataType[] decoderDataTypes = new 
DataType[decoderInputNum]; + DataFormat[] decoderDataFormats = new DataFormat[decoderInputNum]; + double decoderTime = 0; + for (int i = 0; i < 2; i++) { + decoderNs[i] = 1; + decoderCMaxs[i] = 1; + decoderHs[i] = 1; + decoderWs[i] = 1; + decoderDataTypes[i] = DataType.UINT32; + decoderDataFormats[i] = DataFormat.NORMAL; + } + for (int i = 2; i < decoderInputNum; i++) { + decoderNs[i] = 1; + if (i - 2 < 12) { + decoderCMaxs[i] = 4; + } else { + decoderCMaxs[i] = maxDecodeLength - 1; + } + decoderHs[i] = 512; + decoderWs[i] = 1; + decoderDataTypes[i] = dataType; + decoderDataFormats[i] = DataFormat.MTK; + } + BoltModel decoderModel = new BoltModel(decoderModelPath, affinity, decoderInputNum, + decoderInputNames, decoderNs, decoderCMaxs, decoderHs, decoderWs, decoderDataTypes, + decoderDataFormats, decoderOutputNum, decoderOutputNames); + float[][] encoderResultData = encoderResult.getResultData(); + float[][] decoderStates = {{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}}; + int word = 0, i; + int[] words = new int[maxDecodeLength]; + for (i = 0; i < maxDecodeLength; i++) { + int[] decoderCActs = { + 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, i, i, i, i, i, i, i, i, i, i, i, i}; + float[][] decoderInputData = { + {word}, + {i}, + encoderResultData[0], + encoderResultData[1], + encoderResultData[2], + encoderResultData[3], + encoderResultData[4], + encoderResultData[5], + encoderResultData[6], + encoderResultData[7], + encoderResultData[8], + encoderResultData[9], + encoderResultData[10], + encoderResultData[11], + decoderStates[0], + decoderStates[1], + decoderStates[2], + decoderStates[3], + decoderStates[4], + decoderStates[5], + decoderStates[6], + decoderStates[7], + decoderStates[8], + decoderStates[9], + decoderStates[10], + decoderStates[11], + }; + startTime = getMillisTime(); + BoltResult decoderResult = decoderModel.run(decoderInputNum, decoderInputNames, + decoderNs, decoderCActs, decoderHs, decoderWs, decoderDataTypes, decoderDataFormats, + decoderInputData); + if (null == decoderResult) { + System.err.println("[ERROR] modelAddr is 0 in Java API test"); + decoderModel.destructor(); + encoderModel.destructor(); + System.exit(1); + } + endTime = getMillisTime(); + decoderTime += endTime - startTime; + float[][] decoderResultData = decoderResult.getResultData(); + for (int j = 0; j < 12; j++) { + decoderStates[j] = decoderResultData[j + 1]; + } + word = (int)decoderResultData[0][0]; + words[i] = word; + if (word == 0) { + break; + } + } + System.out.println(outputPrefix + encoderModel.affinityMapping(affinity) + + ", machine translation " + String.format("%.3f", encoderTime + decoderTime) + + " ms/sequence, encoder model " + encoderModelPath + ", decoder model " + + decoderModelPath); + verify(result, words, result.length); + + // model destroy + encoderModel.destructor(); + decoderModel.destructor(); + } + + public static void tts(String outputPrefix, + AffinityType affinity, + String encoderDecoderModelPath, + String postnetModelPath, + String melganModelPath, + DataType dataType) + { + int numMels = 80; + int maxResult = 2000 * 3; + int encoderDecoderInputNum = 2; + int encoderDecoderOutputNum = 2; + String[] encoderDecoderInputNames = {"tts_words", "tts_alignments"}; + String[] encoderDecoderOutputNames = {"decoder_position", "decoder_result"}; + int[] encoderDecoderNs = {1, 1}; + int[] encoderDecoderCMaxs = {128, 128}; + int[] encoderDecoderHs = {1, 1}; + int[] encoderDecoderWs = {1, 1}; + DataType[] encoderDecoderDataTypes = {DataType.UINT32, dataType}; + DataFormat[] 
encoderDecoderDataFormats = {DataFormat.NORMAL, DataFormat.NORMAL}; + BoltModel encoderDecoderModel = new BoltModel(encoderDecoderModelPath, affinity, + encoderDecoderInputNum, encoderDecoderInputNames, encoderDecoderNs, encoderDecoderCMaxs, + encoderDecoderHs, encoderDecoderWs, encoderDecoderDataTypes, encoderDecoderDataFormats, + encoderDecoderOutputNum, encoderDecoderOutputNames); + int[] encoderDecoderCActs = {50, 50}; + float[][] encoderDecoderInputData = { + {4, 25, 14, 33, 11, 20, 1, 9, 14, 33, 27, 2, 20, 35, 15, 1, 10, 37, 11, 2, 30, 34, 15, + 7, 21, 1, 25, 14, 35, 21, 27, 3, 25, 14, 34, 27, 1, 25, 14, 35, 27, 1, 17, 36, 7, + 20, 1, 37, 7, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}; + + int postnetInputNum = 1; + int postnetOutputNum = 1; + String[] postnetInputNames = {"tts_decoder"}; + String[] postnetOutputNames = {"mel"}; + int[] postnetNs = {1}; + int[] postnetCMaxs = {maxResult}; + int[] postnetHs = {numMels}; + int[] postnetWs = {1}; + DataType[] postnetDataTypes = {dataType}; + DataFormat[] postnetDataFormats = {DataFormat.MTK}; + BoltModel postnetModel = new BoltModel(postnetModelPath, affinity, postnetInputNum, + postnetInputNames, postnetNs, postnetCMaxs, postnetHs, postnetWs, postnetDataTypes, + postnetDataFormats, postnetOutputNum, postnetOutputNames); + + int melganInputNum = 1; + int melganOutputNum = 1; + String[] melganInputNames = {"input"}; + String[] melganOutputNames = {"output"}; + int[] melganNs = {1}; + int[] melganCs = {numMels}; + int[] melganHMaxs = {maxResult}; + int[] melganWs = {1}; + DataType[] melganDataTypes = {dataType}; + DataFormat[] melganDataFormats = {DataFormat.NCHW}; + BoltModel melganModel = new BoltModel(melganModelPath, affinity, melganInputNum, + melganInputNames, melganNs, melganCs, melganHMaxs, melganWs, melganDataTypes, + melganDataFormats, melganOutputNum, melganOutputNames); + + double startTime = getMillisTime(); + BoltResult encoderDecoderResult = encoderDecoderModel.run(encoderDecoderInputNum, + encoderDecoderInputNames, encoderDecoderNs, encoderDecoderCActs, encoderDecoderHs, + encoderDecoderWs, encoderDecoderDataTypes, encoderDecoderDataFormats, + encoderDecoderInputData); + float[][] encoderDecoderResultData = encoderDecoderResult.getResultData(); + + int frameNum = ((int)encoderDecoderResultData[0][0] + 1) * 3; + int[] postnetCActs = {frameNum}; + float[][] postnetInputData = {encoderDecoderResultData[1]}; + BoltResult postnetResult = postnetModel.run(postnetInputNum, postnetInputNames, postnetNs, + postnetCActs, postnetHs, postnetWs, postnetDataTypes, postnetDataFormats, + postnetInputData); + int[][] postnetResultDimension = postnetResult.getResultDimension(); + float[][] postnetResultData = postnetResult.getResultData(); + + if (postnetResultDimension[0][0] != 1 || postnetResultDimension[0][1] != numMels || + postnetResultDimension[0][2] != frameNum) { + System.out.println("[ERROR] unmatched dimension of postnet"); + System.exit(1); + } + int[] melganHActs = {frameNum}; + float[][] melganInputData = {postnetResultData[0]}; + BoltResult melganResult = melganModel.run(melganInputNum, melganInputNames, melganNs, + melganCs, melganHActs, melganWs, melganDataTypes, melganDataFormats, melganInputData); + int[][] melganResultDimension = melganResult.getResultDimension(); + float[][] melganResultData = melganResult.getResultData(); + int length = (int)melganResultDimension[0][2]; + float[] resultSum = {180.595139f}; 
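+ // the expected melgan output checksum is precision-dependent: FP16 accumulation drifts from the FP32 reference, so a separate reference sum is used below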
+ if (DataType.FP16 == dataType) { + resultSum[0] = 224.713181f; + } + float[] sum = {0}; + for (int i = 0; i < length; i++) { + sum[0] += melganResultData[0][i]; + } + double endTime = getMillisTime(); + System.out.println(outputPrefix + encoderDecoderModel.affinityMapping(affinity) + + ", text to speech " + String.format("%.3f", endTime - startTime) + + " ms/sequence, encoder decoder model " + encoderDecoderModelPath + ", postnet model " + + postnetModelPath + ", melgan vocoder model " + melganModelPath); + verify(sum, resultSum, 1, 1); + + // model destroy + encoderDecoderModel.destructor(); + postnetModel.destructor(); + melganModel.destructor(); + } + + public static void asr(String rootPath, String outputPrefix, AffinityType affinity, String modelPath, DataType dataType) + { + int inputNum = 1; + int outputNum = 1; + String[] inputName = {"sounds"}; + String[] outputName = {"labels"}; + int[] inputN = {1}; + int[] inputCMax = {128}; + int[] inputH = {240}; + int[] inputW = {1}; + DataType[] inputDataType = {dataType}; + DataFormat[] inputDataFormat = {DataFormat.NCHW}; + BoltModel boltModel = new BoltModel(modelPath, affinity, inputNum, inputName, inputN, + inputCMax, inputH, inputW, inputDataType, inputDataFormat, outputNum, outputName); + + String soundDataPath = rootPath + "/testing_data/nlp/asr/asr_rnnt/input/1.seq"; + String resultDataPath = rootPath + "/testing_data/nlp/asr/asr_rnnt/result/1.seq"; + float[] sound = readSequenceDataFromFile(soundDataPath, 0); + float[] result = readSequenceDataFromFile(resultDataPath, 0); + int[] inputCActual = {sound.length / inputH[0]}; + float[][] inputData = {sound}; + float[][] resultData = {result}; + + double startTime = getMillisTime(); + BoltResult boltResult = boltModel.run(inputNum, inputName, inputN, inputCActual, inputH, + inputW, inputDataType, inputDataFormat, inputData); + if (null == boltResult) { + System.err.println("[ERROR] modelAddr is 0 in Java API test"); + boltModel.destructor(); + System.exit(1); + } + double endTime = getMillisTime(); + System.out.println(outputPrefix + boltModel.affinityMapping(affinity) + + ", speech recognition " + String.format("%.3f", endTime - startTime) + + " ms/sequence, model " + modelPath); + int length = + verify(resultData, boltResult.getResultData(), boltResult.getResultDimension(), 0); + if (length == 0) { + System.err.println("[ERROR] verify null data in speech recognition, in Java API test"); + System.exit(1); + } + + // model destroy + boltModel.destructor(); + } + + public static void classification(String outputPrefix, + AffinityType affinity, + String modelPath, + String name, + DataType dataType, + int[] imageSize, + float initValue, + int topIndex) + { + int inputNum = 1; + String[] inputName = {name}; + int[] inputN = {1}; + int[] inputC = {imageSize[0]}; + int[] inputH = {imageSize[1]}; + int[] inputW = {imageSize[2]}; + DataType[] inputDataType = {dataType}; + DataFormat[] inputDataFormat = {DataFormat.NCHW}; + // constructor(modelCreate + ready) + BoltModel boltModel = new BoltModel(modelPath, affinity, inputNum, inputName, inputN, + inputC, inputH, inputW, inputDataType, inputDataFormat); + + int length = imageSize[0] * imageSize[1] * imageSize[2]; + float[][] inputData = new float[1][length]; + for (int i = 0; i < length; i++) { + inputData[0][i] = initValue; + } + // warm up + boltModel.run(inputNum, inputName, inputData); + + // model run + double startTime = getMillisTime(); + BoltResult boltResult = boltModel.run(inputNum, inputName, inputData); + if (null ==
boltResult) { + System.err.println("[ERROR] modelAddr is 0 in Java API test"); + boltModel.destructor(); + System.exit(1); + } + double endTime = getMillisTime(); + System.out.println(outputPrefix + boltModel.affinityMapping(affinity) + ", classification " + + String.format("%.3f", endTime - startTime) + " ms/image, model " + modelPath); + + float[][] result = boltResult.getResultData(); + int labelIndex = top1(result[0], 0, result[0].length); + if (labelIndex != topIndex) { + System.err.println("[ERROR] verify data classification label failed " + labelIndex + + " " + topIndex + ", in Java API test"); + System.exit(1); + } + + // model destroy + boltModel.destructor(); + } + + public static void testSuites0(String rootPath, String outputPrefix, AffinityType affinity, DataType dt) + { + String prefix = rootPath + "/model_zoo"; + String modelSuffix = ""; + if (dt == DataType.FP16) + modelSuffix = "_f16.bolt"; + else if (dt == DataType.FP32) + modelSuffix = "_f32.bolt"; + int[] image_3x224x224 = {3, 224, 224}; + int[] image_224x224x3 = {224, 224, 3}; + int[] image_2x188x188 = {2, 188, 188}; + classification(outputPrefix, affinity, + prefix + "/caffe_models/mobilenet_v1/mobilenet_v1" + modelSuffix, "data", dt, + image_3x224x224, 1, 499); + classification(outputPrefix, affinity, + prefix + "/caffe_models/mobilenet_v2/mobilenet_v2" + modelSuffix, "data", dt, + image_3x224x224, 1, 813); + classification(outputPrefix, affinity, + prefix + "/caffe_models/mobilenet_v3/mobilenet_v3" + modelSuffix, "data", dt, + image_3x224x224, 1, 892); + classification(outputPrefix, affinity, + prefix + "/caffe_models/squeezenet/squeezenet" + modelSuffix, "data", dt, + image_3x224x224, 255, 310); + classification(outputPrefix, affinity, prefix + "/caffe_models/resnet50/resnet50" + modelSuffix, + "data", dt, image_3x224x224, 255, 506); + classification(outputPrefix, affinity, + prefix + "/caffe_models/fingerprint_resnet18/fingerprint_resnet18" + modelSuffix, "Data", + dt, image_2x188x188, 1, 0); + classification(outputPrefix, affinity, prefix + "/onnx_models/ghostnet/ghostnet" + modelSuffix, + "input:0", dt, image_224x224x3, 255, 789); + if (affinity == AffinityType.GPU) { + return; + } + tinybertIntentSlot( + outputPrefix, affinity, prefix + "/caffe_models/tinybert384/tinybert384" + modelSuffix); + tinybertIntentSlot( + outputPrefix, affinity, prefix + "/caffe_models/tinybert/tinybert" + modelSuffix); + tinybertDisambiguate(outputPrefix, affinity, + prefix + "/caffe_models/tinybert_disambiguate/tinybert_disambiguate" + modelSuffix, dt); + nmt(outputPrefix, affinity, prefix + "/caffe_models/nmt/nmt" + modelSuffix); + nmtTSC(outputPrefix, affinity, dt, + prefix + "/caffe_models/nmt_tsc_encoder/nmt_tsc_encoder" + modelSuffix, + prefix + "/caffe_models/nmt_tsc_decoder/nmt_tsc_decoder" + modelSuffix); + asr(rootPath, outputPrefix, affinity, prefix + "/caffe_models/asr_rnnt/asr_rnnt" + modelSuffix, dt); + tts(outputPrefix, affinity, + prefix + "/caffe_models/tts_encoder_decoder/tts_encoder_decoder" + modelSuffix, + prefix + "/caffe_models/tts_postnet/tts_postnet" + modelSuffix, + prefix + "/onnx_models/tts_melgan_vocoder/tts_melgan_vocoder" + modelSuffix, dt); + } + + public static void testSuites1(String rootPath, String outputPrefix, AffinityType affinity) + { + String prefix = rootPath + "/model_zoo"; + int[] image_3x224x224 = {3, 224, 224}; + int[] image_224x224x3 = {224, 224, 3}; + int[] image_2x188x188 = {2, 188, 188}; + classification(outputPrefix, affinity, + prefix + 
"/onnx_models/birealnet18/birealnet18_f16.bolt", "0", DataType.FP16, + image_3x224x224, 255, 565); + classification(outputPrefix, affinity, + prefix + "/caffe_models/squeezenet/squeezenet_int8_q.bolt", "data", DataType.FP16, + image_3x224x224, 255, 310); + tinybertIntentSlot( + outputPrefix, affinity, prefix + "/caffe_models/tinybert384/tinybert384_int8_q.bolt"); + } + + public static void main(String[] args) + { + String outputPrefix = "[INFO] "; + if (args.length > 0) { + outputPrefix += args[0] + ", "; + } + String rootPath = args[1]; + if (args[0].equals("x86_HOST")) { + testSuites0(rootPath, outputPrefix, AffinityType.CPU_HIGH_PERFORMANCE, DataType.FP32); + } else { + testSuites0(rootPath, outputPrefix, AffinityType.CPU_HIGH_PERFORMANCE, DataType.FP32); + testSuites0(rootPath, outputPrefix, AffinityType.CPU_HIGH_PERFORMANCE, DataType.FP16); + testSuites1(rootPath, outputPrefix, AffinityType.CPU_HIGH_PERFORMANCE); + testSuites0(rootPath, outputPrefix, AffinityType.CPU_LOW_POWER, DataType.FP32); + testSuites0(rootPath, outputPrefix, AffinityType.CPU_LOW_POWER, DataType.FP16); + testSuites1(rootPath, outputPrefix, AffinityType.CPU_LOW_POWER); + testSuites0(rootPath, outputPrefix, AffinityType.GPU, DataType.FP16); + } + } +} diff --git a/inference/examples/machine_translation/nmt.cpp b/inference/examples/machine_translation/nmt.cpp new file mode 100644 index 00000000..d2662830 --- /dev/null +++ b/inference/examples/machine_translation/nmt.cpp @@ -0,0 +1,119 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include <iostream> + +#include "inference.hpp" +#include "tensor.hpp" +#include "data_loader.hpp" +#include "profiling.h" +#include "parse_command.h" + +int main(int argc, char *argv[]) +{ + UNI_TIME_INIT + ParseRes parse_res; + parseCommandLine(argc, argv, &parse_res, "examples"); + + char *modelPath = (char *)""; + char *sequenceDirectory = (char *)""; + char *affinityPolicyName = (char *)""; + char *algorithmMapPath = (char *)""; + + if (!parse_res.model.second) { + exit(-1); + } + if (parse_res.model.second) { + modelPath = parse_res.model.first; + } + if (parse_res.inputPath.second) { + sequenceDirectory = parse_res.inputPath.first; + } + if (parse_res.archInfo.second) { + affinityPolicyName = parse_res.archInfo.first; + } + if (parse_res.algoPath.second) { + algorithmMapPath = parse_res.algoPath.first; + } + + auto pipeline = createPipeline(affinityPolicyName, modelPath, algorithmMapPath); + + // load sequences + std::map<std::string, std::shared_ptr<Tensor>> inMap = pipeline->get_inputs(); + std::vector<TensorDesc> sequenceDescs; + TensorDesc wordInputDesc = (*(inMap["nmt_words"])).get_desc(); + wordInputDesc.dt = DT_U32; + sequenceDescs.push_back(wordInputDesc); + TensorDesc positionInputDesc = (*(inMap["nmt_positions"])).get_desc(); + positionInputDesc.dt = DT_U32; + sequenceDescs.push_back(positionInputDesc); + + std::vector<std::vector<Tensor>> sequences, results; + std::vector<std::string> sequencePaths = + load_data(sequenceDirectory + std::string("/input"), sequenceDescs, &sequences); + std::vector<TensorDesc> resultDescs; + resultDescs.push_back(wordInputDesc); + std::vector<std::string> resultPaths = + load_data(sequenceDirectory + std::string("/result"), resultDescs, &results); + + double totalTime = 0; + U32 sequenceIndex = 0; + U32 falseResult = 0; + std::cout << "[RESULT]:" << std::endl; + for (auto sequence : sequences) { + std::cout << sequencePaths[sequenceIndex] << ": " << std::endl; + std::map<std::string, TensorDesc> inputDescMap; + inputDescMap["nmt_words"] = sequence[0].get_desc(); + inputDescMap["nmt_positions"] = sequence[1].get_desc(); + pipeline->reready(inputDescMap); + + auto modelInputTensorNames = pipeline->get_model_input_tensor_names(); + std::map<std::string, std::shared_ptr<U8>> model_tensors_input; + for (int index = 0; index < (int)modelInputTensorNames.size(); index++) { + U8 *tensorPointer = (U8 *)((CpuMemory *)(sequence[index].get_memory()))->get_ptr(); + pipeline->copy_to_named_input(modelInputTensorNames[index], tensorPointer); + } + + double timeBegin = ut_time_ms(); + pipeline->run(); + double timeEnd = ut_time_ms(); + totalTime += (timeEnd - timeBegin); + + Tensor output = pipeline->get_tensor_by_name("decoder_output"); + std::cout << output.string(32) << std::endl; + if (resultPaths.size() > sequenceIndex) { + U32 *result = (U32 *)((CpuMemory *)(results[sequenceIndex][0].get_memory()))->get_ptr(); + U32 inferenceSize = output.length(); + for (U32 i = 0; i < results[sequenceIndex][0].length(); i++) { + if (i >= inferenceSize || result[i] != output.element(i)) { + falseResult++; + break; + } + } + } + + sequenceIndex++; + } + + UNI_TIME_STATISTICS + + std::cout << "[SUMMARY]:" << std::endl; + UNI_CI_LOG( + "translation correct rate: %f %%\n", 100.0 * (sequenceIndex - falseResult) / sequenceIndex); + UNI_CI_LOG("avg_time:%fms/sequence\n", 1.0 * totalTime / sequenceIndex); + if (falseResult > 0) { + UNI_ERROR_LOG("verify failed\n"); + } + + return 0; +} diff --git a/inference/examples/machine_translation/nmt_tsc.cpp b/inference/examples/machine_translation/nmt_tsc.cpp new file mode 100644 index 00000000..7d4bf45e --- /dev/null +++ b/inference/examples/machine_translation/nmt_tsc.cpp @@ -0,0 +1,175 @@ +// 
Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <iostream> + +#include "inference.hpp" +#include "tensor.hpp" +#include "data_loader.hpp" +#include "profiling.h" +#include "parse_command.h" + +std::map<std::string, Tensor> prepareStates( + DataType dt, std::string sequenceDirectory, std::string shapeMapFileName) +{ + std::map<std::string, TensorDesc> shapeMap; + std::string filePath = sequenceDirectory + "/" + shapeMapFileName; + FILE *shapeMapFile = fopen(filePath.c_str(), "r"); + char buffer[NAME_LEN]; + while (fscanf(shapeMapFile, "%s", buffer) != EOF) { + TensorDesc desc; + fscanf(shapeMapFile, "%u", &(desc.nDims)); + for (U32 i = 0; i < desc.nDims; i++) { + fscanf(shapeMapFile, "%u", &(desc.dims[desc.nDims - 1 - i])); + } + if (std::string(buffer) == std::string("encoder_words") || + std::string(buffer) == std::string("encoder_positions") || + std::string(buffer) == std::string("decoder_words") || + std::string(buffer) == std::string("decoder_positions")) { + desc.dt = DT_U32; + } else { + desc.dt = dt; + } + desc.df = DF_NCHW; + shapeMap[buffer] = desc; + } + fclose(shapeMapFile); + + std::map<std::string, Tensor> tensorMap; + for (auto iter : shapeMap) { + std::string filePath = sequenceDirectory + "/" + iter.first + ".txt"; + TensorDesc desc = iter.second; + tensorMap[iter.first] = load_txt(filePath, std::vector<TensorDesc>{desc})[0]; + } + return tensorMap; +} + +void saveStates(std::shared_ptr<CNN> pipeline, + std::string sequenceDirectory, + std::string outputFileName, + std::string outputStatesFileName) +{ + char buffer[NAME_LEN]; + std::string outputFilePath = sequenceDirectory + "/" + outputFileName; + std::string outputStatesFilePath = sequenceDirectory + "/" + outputStatesFileName; + FILE *outputFile = fopen(outputFilePath.c_str(), "r"); + FILE *outputStatesFile = fopen(outputStatesFilePath.c_str(), "w"); + while (!feof(outputFile)) { + fscanf(outputFile, "%s", buffer); + Tensor tensor = pipeline->get_tensor_by_name(buffer); + TensorDesc desc = tensor.get_desc(); + + // write states + fprintf(outputStatesFile, "%s\n", buffer); + fprintf(outputStatesFile, "%u\n", desc.nDims); + for (U32 i = 0; i < desc.nDims; i++) { + fprintf(outputStatesFile, "%u ", desc.dims[desc.nDims - 1 - i]); + } + + // write data + U32 num = tensorNumElements(desc); + std::string outputDataPath = sequenceDirectory + "/" + std::string(buffer) + ".txt"; + FILE *outputDataFile = fopen(outputDataPath.c_str(), "w"); + for (U32 i = 0; i < num; i++) { + fprintf(outputDataFile, "%f ", tensor.element(i)); + if (i % 10 == 9) { +
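// a newline after every 10th value keeps the dumped state files readable +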
fprintf(outputDataFile, "\n"); + } + } + fclose(outputDataFile); + } + fclose(outputFile); + fclose(outputStatesFile); +} + +int main(int argc, char *argv[]) +{ + UNI_TIME_INIT + ParseRes parse_res; + parseCommandLine(argc, argv, &parse_res, "examples"); + + char *modelPath = (char *)""; + char *sequenceDirectory = (char *)""; + char *affinityPolicyName = (char *)""; + std::string subNetworkName = std::string("encoder"); + + if (!parse_res.model.second) { + exit(-1); + } + if (parse_res.model.second) { + modelPath = parse_res.model.first; + } + if (parse_res.inputPath.second) { + sequenceDirectory = parse_res.inputPath.first; + } + if (parse_res.archInfo.second) { + affinityPolicyName = parse_res.archInfo.first; + } + if (parse_res.subNetworkName.second) { + subNetworkName = std::string(parse_res.subNetworkName.first); + } + + std::string outputTensorName; + if (subNetworkName == std::string("encoder")) { + outputTensorName = "transformer_decoder_layer5_multihead_v"; + } else if (subNetworkName == std::string("decoder")) { + outputTensorName = "transformer_decoder_embedding_argmax"; + } else { + UNI_ERROR_LOG("unrecognized sub network(encoder|decoder) %s\n", subNetworkName.c_str()); + } + + DataType dt; + std::string modelPathStr = std::string(modelPath); + // "_f[16|32].bolt" + std::string modelPathSuffix = modelPathStr.substr(modelPathStr.size() - 9); + if (modelPathSuffix == std::string("_f16.bolt")) { + dt = DT_F16; + } else if (modelPathSuffix == std::string("_f32.bolt")) { + dt = DT_F32; + } else { + UNI_ERROR_LOG("unrecognized model file path suffix %s\n", modelPathSuffix.c_str()); + } + auto pipeline = createPipeline(affinityPolicyName, modelPath); + + double totalTime = 0; + int loops = 60; + for (int i = 0; i < loops; i++) { + std::map<std::string, Tensor> input = + prepareStates(dt, sequenceDirectory, "input_shape.txt"); + std::map<std::string, TensorDesc> inputDescMap; + for (auto iter : input) { + inputDescMap[iter.first] = iter.second.get_desc(); + } + pipeline->infer_output_tensors_size(inputDescMap); + pipeline->assign_output_tensor(); + for (auto iter : input) { + U8 *tensorPointer = (U8 *)((CpuMemory *)(iter.second.get_memory()))->get_ptr(); + pipeline->copy_to_named_input(iter.first, tensorPointer); + } + + double timeBegin = ut_time_ms(); + pipeline->run(); + double timeEnd = ut_time_ms(); + totalTime += (timeEnd - timeBegin); + Tensor output = pipeline->get_tensor_by_name(outputTensorName); + std::cout << output.string(32) << std::endl; + saveStates(pipeline, sequenceDirectory, "output_name.txt", "output_shape.txt"); + } + UNI_TIME_STATISTICS + + std::cout << "[SUMMARY]:" << std::endl; + U32 validSequence = loops; + UNI_CI_LOG("avg_time:%fms/sequence\n", 1.0 * totalTime / validSequence); + + return 0; +} diff --git a/inference/examples/object_detection/detection.cpp b/inference/examples/object_detection/detection.cpp new file mode 100644 index 00000000..450d5afb --- /dev/null +++ b/inference/examples/object_detection/detection.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <iostream> +#include <limits.h> +#include <float.h> +#include "inference.hpp" +#include "tensor.hpp" +#include "data_loader.hpp" +#include "result_format.hpp" +#include "profiling.h" +#include "parse_command.h" + +int main(int argc, char *argv[]) +{ + UNI_TIME_INIT + ParseRes parse_res; + parseCommandLine(argc, argv, &parse_res, "examples"); + + char *modelPath = (char *)""; + char *imageDir = (char *)""; + char *affinityPolicyName = (char *)"CPU_AFFINITY_HIGH_PERFORMANCE"; + char *algorithmMapPath = (char *)""; + ImageFormat imageFormat = RGB; + F32 scaleValue = 1; + if (!parse_res.model.second) { + exit(-1); + } + if (parse_res.model.second) { + modelPath = parse_res.model.first; + } + if (parse_res.inputPath.second) { + imageDir = parse_res.inputPath.first; + } + if (parse_res.archInfo.second) { + affinityPolicyName = parse_res.archInfo.first; + } + if (parse_res.algoPath.second) { + algorithmMapPath = parse_res.algoPath.first; + } + if (parse_res.imageFormat.second) { + imageFormat = parse_res.imageFormat.first; + } + if (parse_res.scaleValue.second) { + scaleValue = parse_res.scaleValue.first; + } + + F32 confidenceThreshold = 0.4; + + auto cnn = createPipeline(affinityPolicyName, modelPath, algorithmMapPath); + + // load images + std::map<std::string, std::shared_ptr<Tensor>> inMap = cnn->get_inputs(); + TensorDesc imageDesc = (*(inMap.begin()->second)).get_desc(); + std::vector<TensorDesc> imageDescs; + imageDescs.push_back(imageDesc); + std::vector<std::vector<Tensor>> images; + std::vector<std::string> imagePaths = + load_image_with_scale(imageDir, imageDescs, &images, imageFormat, scaleValue); + + double totalTime = 0; + double max_time = -DBL_MAX; + double min_time = DBL_MAX; + U32 imageIndex = 0; + std::cout << "[RESULT]:" << std::endl; + + for (auto image : images) { + std::cout << imagePaths[imageIndex] << " : "; + // stage3: set input + double timeBegin = ut_time_ms(); + if (cnn->get_runtime_device() == MALI) { + auto curModelInputTensorNames = cnn->get_model_input_tensor_names(); + std::map<std::string, std::shared_ptr<U8>> modelInputTensors; + for (int index = 0; index < (int)curModelInputTensorNames.size(); index++) { + modelInputTensors[curModelInputTensorNames[index]] = + ((CpuMemory *)image[index].get_memory())->get_shared_ptr(); + } + cnn->set_input_tensors_value(modelInputTensors); + } else { + auto curModelInputTensorNames = cnn->get_model_input_tensor_names(); + for (int index = 0; index < (int)curModelInputTensorNames.size(); index++) { + cnn->copy_to_named_input(curModelInputTensorNames[index], + (U8 *)((CpuMemory *)(image[index].get_memory()))->get_ptr());
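+ // unlike the MALI branch above, which hands the pipeline a shared buffer pointer, this path copies each image into the pipeline's own input tensor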
+ } + } + // stage4: run + cnn->run(); + + // stage5: process result + std::map<std::string, std::shared_ptr<Tensor>> outMap = cnn->get_outputs(); + double timeEnd = ut_time_ms(); + totalTime += (timeEnd - timeBegin); + Tensor result = *(outMap.begin()->second); + F32 numBox = result.element(0); + std::cout << numBox << " boxes in total, including these ones with confidence over " + << confidenceThreshold << ":\n"; + + for (U32 i = 6; i < result.length(); i++) { + F32 confidence = result.element(i + 1); + if (confidence < confidenceThreshold) { + break; + } + F32 label = result.element(i); + F32 topLeftX = result.element(i + 2); + F32 topLeftY = result.element(i + 3); + F32 bottomRightX = result.element(i + 4); + F32 bottomRightY = result.element(i + 5); + std::cout << "\tClass " << label << " with " << confidence << " confidence, top left (" + << topLeftX << ", " << topLeftY << ")" + << ", bottom right (" << bottomRightX << ", " << bottomRightY << ")\n"; + } + + if ((timeEnd - timeBegin) >= max_time) { + max_time = (timeEnd - timeBegin); + } + + if ((timeEnd - timeBegin) <= min_time) { + min_time = (timeEnd - timeBegin); + } + imageIndex++; + } + + UNI_TIME_STATISTICS + + std::cout << "[SUMMARY]:" << std::endl; + UNI_CI_LOG("avg_time:%fms/image\n", 1.0 * totalTime / imageIndex); + UNI_CI_LOG("max_time:%fms/image\n", 1.0 * max_time); + UNI_CI_LOG("min_time:%fms/image\n", 1.0 * min_time); + return 0; +} diff --git a/inference/examples/sequential/test_pipeline_ocl.cpp b/inference/examples/sequential/test_pipeline_ocl.cpp new file mode 100644 index 00000000..6ad6a480 --- /dev/null +++ b/inference/examples/sequential/test_pipeline_ocl.cpp @@ -0,0 +1,434 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
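The box-parsing loop in detection.cpp above reads the output as a flat float array: element 0 carries the box count and, from offset 6 onward, the code pulls six values per record (label, confidence, and two corner points). The loop advances one element at a time, so if each record really is six floats wide, a stride-6 walk like the sketch below avoids re-reading overlapping windows; the record layout and the `Box`/`decodeBoxes` names are assumptions inferred from the example, not a documented format:

```cpp
#include <cstddef>
#include <vector>

// Assumed record layout, read off how detection.cpp indexes its output:
// [label, confidence, topLeftX, topLeftY, bottomRightX, bottomRightY]
struct Box {
    float label, confidence;
    float x0, y0, x1, y1;
};

std::vector<Box> decodeBoxes(const float *out, std::size_t length, float threshold)
{
    std::vector<Box> boxes;
    for (std::size_t i = 6; i + 5 < length; i += 6) {
        if (out[i + 1] < threshold) {
            break;  // records are assumed sorted by confidence, as the example's early break implies
        }
        boxes.push_back({out[i], out[i + 1], out[i + 2], out[i + 3], out[i + 4], out[i + 5]});
    }
    return boxes;
}
```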
+ +#ifdef _USE_FP16 +#include <iostream> +#include "types.h" +#include "tensor_desc.h" +#include "sequential_ocl.hpp" +#include "factory.hpp" +#include "ocl/factory_ocl.hpp" +#include "tensor.hpp" +#include "data_loader.hpp" + +void print_help() +{ + std::cout << "please set argv: " << std::endl; + std::cout << "usage: argv[1]: opName" << std::endl; + std::cout << "usage: argv[2]: in" << std::endl; + std::cout << "usage: argv[3]: ic" << std::endl; + std::cout << "usage: argv[4]: ih" << std::endl; + std::cout << "usage: argv[5]: iw" << std::endl; + std::cout << "usage: argv[6]: fn" << std::endl; + std::cout << "usage: argv[7]: fc" << std::endl; + std::cout << "usage: argv[8]: fh" << std::endl; + std::cout << "usage: argv[9]: fw" << std::endl; + std::cout << "usage: argv[10]: sw" << std::endl; + std::cout << "usage: argv[11]: sh" << std::endl; + std::cout << "usage: argv[12]: pl" << std::endl; + std::cout << "usage: argv[13]: pr" << std::endl; + std::cout << "usage: argv[14]: pt" << std::endl; + std::cout << "usage: argv[15]: pb" << std::endl; + std::cout << "usage: argv[16]: inputNum" << std::endl; + std::cout << "usage: argv[17]: pm" << std::endl; + std::cout << "usage: argv[18]: dt" << std::endl; + std::cout << "supported op: OT_Pooling" << std::endl; + std::cout << "supported op: OT_Conv" << std::endl; + std::cout << "supported op: OT_Eltwise" << std::endl; + std::cout << "supported op: OT_Softmax" << std::endl; + std::cout << "supported op: OT_Relu" << std::endl; + std::cout << "supported op: OT_Relu6" << std::endl; + std::cout << "supported op: OT_HSwish" << std::endl; + std::cout << "supported op: OT_HSigmoid" << std::endl; + std::cout << "supported op: OT_HGelu" << std::endl; + std::cout << "supported op: OT_TanH" << std::endl; + std::cout << "supported op: OT_FC" << std::endl; + std::cout << "supported op: OT_Scale" << std::endl; + std::cout << "supported op: OT_Concat" << std::endl; + std::cout << "supported op: OT_Clip" << std::endl; + std::cout << "supported op: OT_Squeeze" << std::endl; + std::cout << "supported op: OT_Reshape" << std::endl; + std::cout << "supported op: OT_Space2Depth" << std::endl; + std::cout << "supported op: OT_Depth2Space" << std::endl; +} + +template <typename T> +void buildInputTensor(DataType dt, + DataFormat df, + U32 n, + U32 c, + U32 h, + U32 w, + std::vector<TensorDesc> *dims, + std::vector<Tensor> *inputTensors) +{ + TensorDesc inputDesc = tensor4df(dt, df, n, c, h, w); + U32 inputNum = tensorNumElements(inputDesc); + U32 inputSize = tensorNumBytes(inputDesc); + U8 *inputVal = (U8 *)operator new(inputSize); + + T *data = (T *)inputVal; + if (dt == DT_F16) { + for (U32 i = 0; i < inputNum; i++) { + data[i] = (T)(rand() & 255) / 256.0 - 0.5; + } + // for(U32 i = 0; i < inputNum; i++) data[i] = (T)(i & 255) / 255.0; + } + if (dt == DT_U8) { + for (U32 i = 0; i < inputNum; i++) { + data[i] = (T)(i & 255); + } + } + std::shared_ptr<Tensor> inputTensor = std::shared_ptr<Tensor>(new Tensor()); + auto mem = (CpuMemory *)inputTensor->get_memory(); + mem->resize(inputDesc); + mem->set_shared_ptr(std::shared_ptr<U8>(inputVal)); + + dims->push_back(inputDesc); + inputTensors->push_back(*inputTensor.get()); +} + +int main(int argc, char *argv[]) +{ + if (argc != 16 && argc != 17 && argc != 18 && argc != 19) { + printf("%d\n", argc); + print_help(); + return 0; + } + + U32 inputNum = 1; + std::string pm = "NULL"; + std::string DT_NAME = "F16"; + std::string opName = argv[1]; + + U32 in = atoi(argv[2]); + U32 ic = atoi(argv[3]); + U32 ih = atoi(argv[4]); + U32 iw = atoi(argv[5]); + + U32 fn = atoi(argv[6]); + U32 fc =
atoi(argv[7]); + U32 fh = atoi(argv[8]); + U32 fw = atoi(argv[9]); + + U32 sw = atoi(argv[10]); + U32 sh = atoi(argv[11]); + U32 pl = atoi(argv[12]); + U32 pr = atoi(argv[13]); + U32 pt = atoi(argv[14]); + U32 pb = atoi(argv[15]); + if (argc == 17) { + inputNum = atoi(argv[16]); + } + if (argc == 18) { + pm = argv[17]; + } + if (argc == 19) { + DT_NAME = argv[18]; + } + + AffinityPolicy affinityPolicy = AFFINITY_GPU; + DataType dt = DT_F16; + auto model = new SequentialOcl(affinityPolicy, dt, opName); + std::shared_ptr<SequentialOcl> model_ptr = std::shared_ptr<SequentialOcl>(model); + + OperatorType OType; + if (opName == "OT_Pooling") { + OType = OT_Pooling; + } + if (opName == "OT_Conv") { + OType = OT_Conv; + } + if (opName == "OT_Eltwise") { + OType = OT_Eltwise; + } + if (opName == "OT_Softmax") { + OType = OT_Softmax; + } + if (opName == "OT_Relu") { + OType = OT_Relu; + } + if (opName == "OT_Relu6") { + OType = OT_Relu6; + } + if (opName == "OT_HSwish") { + OType = OT_HSwish; + } + if (opName == "OT_HSigmoid") { + OType = OT_HSigmoid; + } + if (opName == "OT_Gelu") { + OType = OT_Gelu; + } + if (opName == "OT_TanH") { + OType = OT_TanH; + } + if (opName == "OT_Sigmoid") { + OType = OT_Sigmoid; + } + if (opName == "OT_FC") { + OType = OT_FC; + } + if (opName == "OT_Scale") { + OType = OT_Scale; + } + if (opName == "OT_Concat") { + OType = OT_Concat; + } + if (opName == "OT_Clip") { + OType = OT_Clip; + } + if (opName == "OT_Squeeze") { + OType = OT_Squeeze; + } + if (opName == "OT_Reshape") { + OType = OT_Reshape; + } + if (opName == "OT_Space2Depth") { + OType = OT_Space2Depth; + } + if (opName == "OT_Depth2Space") { + OType = OT_Depth2Space; + } + Factory *factory_ocl = (Factory *)(new FactoryOCL()); + std::shared_ptr<Factory> factory; + factory = std::shared_ptr<Factory>(factory_ocl); + ConvolutionMode convMode; + // convMode = Convolution_Depthwise_Pointwise; + convMode = Convolution_Pointwise; + + switch (OType) { + case OT_Pooling: { + auto p = createPoolingParamSpec( + PoolingMode::POOLING_MAX, fh, fw, sh, sw, pt, pb, pl, pr, RoundMode::CEIL); + auto op = factory->createPooling(p); + model_ptr->add(op); + break; + } + case OT_Eltwise: { + EltwiseParamSpec eltwiseDesc; + eltwiseDesc.elt_mode = EltwiseMode::ELTWISE_SUM; + eltwiseDesc.activation_type = ACTIVATION_NULL; + auto op = factory->createEltwise(eltwiseDesc); + model_ptr->add(op); + break; + } + case OT_Softmax: { + SoftmaxParamSpec p; + p.axis = -1; + auto op = factory->createSoftmax(dt, p); + model_ptr->add(op); + break; + } + case OT_Conv: { + if (pm == "NULL") { + ActivationParamSpec dwActivationParamSpec, pwActivationParamSpec; + dwActivationParamSpec.mode = ACTIVATION_NULL; + pwActivationParamSpec.mode = ACTIVATION_NULL; + auto p = createConvolutionParamSpec( + 1, fh, fw, sh, sw, pt, pb, pl, pr, 1, 1, fn, convMode); + auto op = + factory->createConvolution(dt, p, dwActivationParamSpec, pwActivationParamSpec); + model_ptr->add(op); + } + + if (pm == "RELU") { + ActivationParamSpec dwActivationParamSpec, pwActivationParamSpec; + dwActivationParamSpec.mode = ACTIVATION_RELU; + dwActivationParamSpec.value[0] = 0; + pwActivationParamSpec.mode = ACTIVATION_NULL; + auto p = createConvolutionParamSpec( + 1, fh, fw, sh, sw, pt, pb, pl, pr, 1, 1, fn, convMode); + auto op = + factory->createConvolution(dt, p, dwActivationParamSpec, pwActivationParamSpec); + model_ptr->add(op); + } + break; + } + case OT_Relu: { + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_RELU; + activationDesc.value[0] = 0; + auto op = factory->createActivation(activationDesc);
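+ // value[0] appears to carry the activation's optional coefficient (a leaky slope for RELU); 0 keeps plain ReLU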
+ model_ptr->add(op); + break; + } + case OT_Relu6: { + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_RELU6; + auto op = factory->createActivation(activationDesc); + model_ptr->add(op); + break; + } + case OT_HSwish: { + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_H_SWISH; + auto op = factory->createActivation(activationDesc); + model_ptr->add(op); + break; + } + case OT_HSigmoid: { + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_H_SIGMOID; + auto op = factory->createActivation(activationDesc); + model_ptr->add(op); + break; + } + case OT_Gelu: { + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_GELU; + auto op = factory->createActivation(activationDesc); + model_ptr->add(op); + break; + } + case OT_TanH: { + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_TANH; + auto op = factory->createActivation(activationDesc); + model_ptr->add(op); + break; + } + case OT_Sigmoid: { + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_SIGMOID; + auto op = factory->createActivation(activationDesc); + model_ptr->add(op); + break; + } + case OT_FC: { + auto p = createFullyConnectedParamSpec(fn, 1, nullptr); + auto op = factory->createFullyConnected(dt, p, ih * iw * ic); + model_ptr->add(op); + break; + } + case OT_Scale: { + ScaleParamSpec p; + p.axis = 1; + p.num_concat = inputNum; + auto op = factory->createScale(dt, p, fc); + model_ptr->add(op); + break; + } + case OT_Concat: { + ConcatParamSpec p; + p.axis = 1; + auto op = factory->createConcat(p); + model_ptr->add(op); + break; + } + case OT_Clip: { + auto p = createClipParamSpec(0, 0.5); + auto op = factory->createClip(dt, p); + model_ptr->add(op); + break; + } + case OT_Squeeze: { + int dim[1] = {0}; + auto p = createSqueezeParamSpec(dim, 1); + auto op = factory->createSqueeze(dt, p); + model_ptr->add(op); + break; + } + case OT_Reshape: { + int dim[2] = {-1, 8}; + auto p = createReshapeParamSpec(dim, 2, 0, 0); + auto op = factory->createReshape(dt, p); + model_ptr->add(op); + break; + } + case OT_Space2Depth: { + auto op = factory->createSpace2Depth(dt); + model_ptr->add(op); + break; + } + case OT_Depth2Space: { + Depth2SpaceParamSpec p; + p.blockSize = 2; + auto op = factory->createDepth2Space(dt, p); + model_ptr->add(op); + break; + } + default: + std::cout << "unsupported op" << std::endl; + } + + std::vector<TensorDesc> dims; + std::vector<Tensor> inputTensors; + if (OType == OT_Space2Depth) { + for (U32 i = 0; i < inputNum; i++) { + buildInputTensor<U8>(DT_U8, DF_NCHW, in, ic, ih, iw, &dims, &inputTensors); + } + } else { + for (U32 i = 0; i < inputNum; i++) { + buildInputTensor<F16>(DT_F16, DF_NCHW, in, ic, ih, iw, &dims, &inputTensors); + } + } + + U8 *weightVal = NULL; + if (OType == OT_Conv) { + TensorDesc weightDesc = tensor4df(dt, DF_NCHW, fn, fc, fh, fw); + U32 weightNum = tensorNumElements(weightDesc); + U32 vectorNum = fn; + if (convMode == Convolution_Depthwise_Pointwise) { + vectorNum = fc + fn + fn * fc; + } + U32 weightSize = tensorNumBytes(weightDesc) + vectorNum * bytesOf(dt); + weightVal = (U8 *)operator new(weightSize); + F16 *weight = (F16 *)weightVal; + for (U32 i = 0; i < weightNum + vectorNum; i++) { + weight[i] = (F16)(rand() & 255) / 256.0; + } + } + + if (OType == OT_FC) { + U32 weightNum = iw * ih * ic * fn; + U32 biasNum = fn; + U32 weightSize = (weightNum + biasNum) * bytesOf(dt); + weightVal = (U8 *)operator new(weightSize); + F16 *weight = (F16 *)weightVal; + for (U32 i = 0; i < weightNum + biasNum;
i++) { + weight[i] = (F16)(rand() & 255) / 256.0; + } + } + + if (OType == OT_Scale) { + U32 weightNum = fc; + U32 biasNum = fc; + U32 weightSize = (weightNum + biasNum) * bytesOf(dt); + weightVal = (U8 *)operator new(weightSize); + F16 *weight = (F16 *)weightVal; + for (U32 i = 0; i < weightNum + biasNum; i++) { + weight[i] = (F16)(rand() & 255) / 256.0; + } + } + + if (weightVal) { + std::shared_ptr<U8> modelPtr(weightVal); + model_ptr->ready(dims, modelPtr, 1); + } else { + model_ptr->ready(dims, NULL, 1); + } + model_ptr->mark_input_output(); + model_ptr->set_input_tensors(inputTensors); + model_ptr->run(); + + auto output = model_ptr->get_output_tensors(); + auto oclMem = (OclMemory *)output[0]->get_memory(); + F16 *val = (F16 *)(oclMem->get_mapped_ptr()); + for (int i = 0; i < 64; i++) { + std::cout << val[i] << " "; + } + std::cout << std::endl; + return 0; +} +#endif diff --git a/inference/examples/text_to_speech/tts.cpp b/inference/examples/text_to_speech/tts.cpp new file mode 100644 index 00000000..c7607ffb --- /dev/null +++ b/inference/examples/text_to_speech/tts.cpp @@ -0,0 +1,228 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
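tts.cpp below runs one TTS sub-network at a time, selected by name, while the Java tts() test earlier chains all three. The glue between stages is small: the encoder/decoder's decoder_position output reports the last decoded step, and the mel frame count fed to the postnet and the melgan vocoder is derived as (position + 1) * 3. A sketch of that hand-off arithmetic (the helper name is illustrative; the factor 3, three mel frames per decoder step, is read off the example code, not a documented constant):

```cpp
#include <cassert>

// Mirror of the Java test's hand-off: frameNum = (decoderPosition + 1) * 3,
// where decoderPosition is the first element of the decoder_position output.
int melFrameCount(float decoderPosition)
{
    return (static_cast<int>(decoderPosition) + 1) * 3;
}

int main()
{
    assert(melFrameCount(99.0f) == 300);  // 100 decoder steps -> 300 mel frames
    return 0;
}
```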
+ +#include <iostream> + +#include "inference.hpp" +#include "tensor.hpp" +#include "data_loader.hpp" +#include "profiling.h" +#include "parse_command.h" + +std::map<std::string, Tensor> prepareStates( + DataType dt, std::string sequenceDirectory, std::string shapeMapFileName) +{ + std::map<std::string, TensorDesc> shapeMap; + std::string filePath = sequenceDirectory + "/" + shapeMapFileName; + FILE *shapeMapFile = fopen(filePath.c_str(), "r"); + char buffer[NAME_LEN]; + while (fscanf(shapeMapFile, "%s", buffer) != EOF) { + TensorDesc desc; + fscanf(shapeMapFile, "%u", &(desc.nDims)); + for (U32 i = 0; i < desc.nDims; i++) { + fscanf(shapeMapFile, "%u", &(desc.dims[desc.nDims - 1 - i])); + } + if (std::string(buffer) == std::string("tts_words") || + std::string(buffer) == std::string("tts_emotions") || + std::string(buffer) == std::string("tinybert_words") || + std::string(buffer) == std::string("tinybert_positions") || + std::string(buffer) == std::string("tinybert_token_type")) { + desc.dt = DT_U32; + } else { + desc.dt = dt; + } + if (std::string(buffer) == std::string("tts_words") || + std::string(buffer) == std::string("tts_alignments") || + std::string(buffer) == std::string("tts_emotions") || + std::string(buffer) == std::string("tinybert_words") || + std::string(buffer) == std::string("tinybert_positions") || + std::string(buffer) == std::string("tinybert_token_type")) { + desc.df = DF_NORMAL; + } else { + desc.df = DF_NCHW; + } + shapeMap[buffer] = desc; + } + fclose(shapeMapFile); + + std::map<std::string, Tensor> tensorMap; + for (auto iter : shapeMap) { + std::string filePath = sequenceDirectory + "/" + iter.first + ".txt"; + TensorDesc desc = iter.second; + tensorMap[iter.first] = load_txt(filePath, std::vector<TensorDesc>{desc})[0]; + } + return tensorMap; +} + +void saveStates(std::shared_ptr<CNN> pipeline, + std::string sequenceDirectory, + std::string outputFileName, + std::string outputStatesFileName) +{ + char buffer[NAME_LEN]; + std::string outputFilePath = sequenceDirectory + "/" + outputFileName; + std::string outputStatesFilePath = sequenceDirectory + "/" + outputStatesFileName; + FILE *outputFile = fopen(outputFilePath.c_str(), "r"); + FILE *outputStatesFile = fopen(outputStatesFilePath.c_str(), "w"); + while (!feof(outputFile)) { + fscanf(outputFile, "%s", buffer); + Tensor tensor = pipeline->get_tensor_by_name(buffer); + TensorDesc desc = tensor.get_desc(); + + // write states + fprintf(outputStatesFile, "%s\n", buffer); + fprintf(outputStatesFile, "%u\n", desc.nDims); + for (U32 i = 0; i < desc.nDims; i++) { + fprintf(outputStatesFile, "%u ", desc.dims[desc.nDims - 1 - i]); + } + + // write data + U32 num = tensorNumElements(desc); + std::string outputDataPath = sequenceDirectory + "/" + std::string(buffer) + ".txt"; + FILE *outputDataFile = fopen(outputDataPath.c_str(), "w"); + for (U32 i = 0; i < num; i++) { + fprintf(outputDataFile, "%f ", tensor.element(i)); + if (i % 10 == 9) { + fprintf(outputDataFile, "\n"); + } + } + fclose(outputDataFile); + } + fclose(outputFile); + fclose(outputStatesFile); +} + +int verify(Tensor tensor, std::string subNetworkName) +{ + U32 num = tensor.length(); + F32 sum = 0; + for (U32 i = 0; i < num; i++) { + sum += tensor.element(i); + } + I32 result = 0; + if (subNetworkName == std::string("encoder_decoder")) { + if (abs(sum - 6921) >= 1100) { + result = 1; + } + } else if (subNetworkName == std::string("postnet")) { + if (abs(sum - (-11987.7)) >= 1) { + result = 1; + } + } else if (subNetworkName == std::string("melgan_vocoder")) { + if (abs(sum - (-0.665192)) >= 0.7) { + result = 1; + } + } + return result; +} + +int
main(int argc, char *argv[]) +{ + UNI_TIME_INIT + ParseRes parse_res; + parseCommandLine(argc, argv, &parse_res, "examples"); + + char *modelPath = (char *)""; + char *sequenceDirectory = (char *)""; + char *affinityPolicyName = (char *)""; + std::string subNetworkName = std::string("encoder_decoder"); + + if (!parse_res.model.second) { + exit(-1); + } + if (parse_res.model.second) { + modelPath = parse_res.model.first; + } + if (parse_res.inputPath.second) { + sequenceDirectory = parse_res.inputPath.first; + } + if (parse_res.archInfo.second) { + affinityPolicyName = parse_res.archInfo.first; + } + if (parse_res.subNetworkName.second) { + subNetworkName = std::string(parse_res.subNetworkName.first); + } + + std::vector<std::string> outputTensorNames; + if (subNetworkName == std::string("encoder_decoder")) { + outputTensorNames.push_back("decoder_result"); + outputTensorNames.push_back("decoder_position"); + } else if (subNetworkName == std::string("postnet")) { + outputTensorNames.push_back("mel"); + } else if (subNetworkName == std::string("melgan_vocoder")) { + outputTensorNames.push_back("output"); + } else if (subNetworkName == std::string("tinybert")) { + outputTensorNames.push_back("dense_layer_argmax"); + outputTensorNames.push_back("dense_layer2_argmax"); + } else { + UNI_ERROR_LOG( + "unrecognized sub network(encoder_decoder|postnet|melgan_vocoder|tinybert) %s\n", subNetworkName.c_str()); + } + + DataType dt; + std::string modelPathStr = std::string(modelPath); + // "_f[16|32].bolt" + std::string modelPathSuffix = modelPathStr.substr(modelPathStr.size() - 9); + if (modelPathSuffix == std::string("_f16.bolt")) { + dt = DT_F16; + } else if (modelPathSuffix == std::string("_f32.bolt")) { + dt = DT_F32; + } else if (modelPathSuffix == std::string("t8_q.bolt")) { + dt = DT_F16; + } else { + UNI_ERROR_LOG("unrecognized model file path suffix %s\n", modelPathSuffix.c_str()); + } + auto pipeline = createPipeline(affinityPolicyName, modelPath); + + double totalTime = 0; + int loops = 1; + U32 falseResult = 0; + for (int i = 0; i < loops; i++) { + std::map<std::string, Tensor> input = + prepareStates(dt, sequenceDirectory, "input_shape.txt"); + std::map<std::string, TensorDesc> inputDescMap; + for (auto iter : input) { + inputDescMap[iter.first] = iter.second.get_desc(); + } + pipeline->reready(inputDescMap); + for (auto iter : input) { + U8 *tensorPointer = (U8 *)((CpuMemory *)(iter.second.get_memory()))->get_ptr(); + pipeline->copy_to_named_input(iter.first, tensorPointer); + } + + double timeBegin = ut_time_ms(); + pipeline->run(); + double timeEnd = ut_time_ms(); + totalTime += (timeEnd - timeBegin); + std::vector<Tensor> output; + for (auto outputTensorName : outputTensorNames) { + Tensor outputTensor = pipeline->get_tensor_by_name(outputTensorName); + output.push_back(outputTensor); + std::cout << outputTensor.string(32) << std::endl; + } + falseResult += verify(output[0], subNetworkName); + //saveStates(pipeline, sequenceDirectory, "output_name.txt", "output_shape.txt"); + } + UNI_TIME_STATISTICS + + std::cout << "[SUMMARY]:" << std::endl; + U32 validSequence = loops; + UNI_CI_LOG("text to speech correct rate: %f %%\n", + 100.0 * (validSequence - falseResult) / validSequence); + UNI_CI_LOG("avg_time:%fms/sequence\n", 1.0 * totalTime / validSequence); + if (falseResult > 0) { + UNI_ERROR_LOG("verify failed\n"); + } + + return 0; +} diff --git a/inference/exports/c/bolt.h b/inference/exports/c/bolt.h deleted file mode 100644 index 61e4f3a7..00000000 --- a/inference/exports/c/bolt.h +++ /dev/null @@ -1,227 +0,0 @@ -/** -* @file -* @brief C API Document -* -* @copyright -* 
@code -* Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -* WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -* @endcode -*/ -#ifdef __cplusplus -extern "C" { -#endif - - -/** inference pipeline handle */ -typedef void* ModelHandle; - -/** result data memory handle */ -typedef void* ResultHandle; - -/** CPU affinity policy */ -typedef enum { - HIGH_PERFORMANCE = 0, ///< performance is high priority(use big core) - LOW_POWER = 1 ///< power is high priority(use small core) -} AFFINITY_TYPE; - -/** heterogeneous device type */ -typedef enum { - CPU = 0, ///< CPU - GPU = 1 ///< GPU -} DEVICE_TYPE; - -/** data precision */ -typedef enum { - FP_32 = 0, ///< 32 bit float - FP_16 = 1, ///< 16 bit float - INT_32 = 2, ///< 32 bit integer - UINT_32 = 3 ///< 32 bit unsigned integer -} DATA_TYPE; - -/** multi-dimension data format */ -typedef enum { - NCHW = 0, ///< batch->channel->high->width data order - NHWC = 1, ///< batch->high->width->channel data order - NCHWC8 = 2, ///< batch->channel/8->high->width->channel four element data order - MTK = 3, ///< batch->time->unit data order - NORMAL = 4 ///< batch->unit data order -} DATA_FORMAT; - -/** -* @brief create model from file -* @param modelPath model file path -* @param affinity CPU affinity setting -* @param device heterogeneous device setting -* @param algoPath the file path to save and load algos info -* -* @return inference pipeline handle -* -* @note destroy model when pipeline end -* @code -* ModelHandle handle = CreateModel(...); -* ... 
-* DestroyModel(handle); -* @endcode -* valid algoPath can reduce PrepareModel significantly -* if you set a valid algoPath, algorithm selected only need to run once, which is usually time consuming -* the algorithm select result will be saved to the file path you set, and loaded when you run it next time, -* which avoid to do the algorithm selected again -* it is strongly suggest that set a valid algoPath, especiall for GPU running -* @note -* if your inputSize changed, please delete the old algorithm file be saved -* if your model changed, please delete the old algorithm file be saved -* if any unexpected error happen, you can try to delete algorithm file and run it again -*/ -ModelHandle CreateModel(const char* modelPath, AFFINITY_TYPE affinity, DEVICE_TYPE device, const char* algoPath); - -/** -* @brief complete model inference engine prepare -* @param ih model inference handle -* @param num_input the number of input data -* @param n the array of all input data's n dimension -* @param c the array of all input data's c dimension -* @param h the array of all input data's h dimension -* @param w the array of all input data's w dimension -* @param name the array of all input data's name in string format -* @param dt_input the array of all input data's data type -* @param df_input the array of all input data's data format -* -* @return -*/ -void PrepareModel(ModelHandle ih, const int num_input, - const int* n, const int* c, const int* h, const int* w, - char** name, - const DATA_TYPE* dt_input, const DATA_FORMAT* df_input); - -/** -* @brief resize model input size -* @param ih model inference handle -* @param num_input the number of input data -* @param n the array of all input data's n dimension -* @param c the array of all input data's c dimension -* @param h the array of all input data's h dimension -* @param w the array of all input data's w dimension -* @param name the array of all input data's name in string format -* @param dt_input the array of all input data's data type -* @param df_input the array of all input data's data format -* -* @return -* -* @code -* // model_resize must behind PrepareModel; -* PrepareModel(...); -* ResizeModelInput(...); -* RunModel(...); -* @endcode -*/ -void ResizeModelInput(ModelHandle ih, const int num_input, - const int* n, const int* c, const int* h, const int* w, - char** name, - const DATA_TYPE* dt_input, const DATA_FORMAT* df_input); - -/** -* @brief malloc result data memory -* @param ih inference pipeline handle -* -* @return result data memory handle -*/ -ResultHandle AllocAllResultHandle(ModelHandle ih); - -/** -* @brief malloc result data memory according to user specification -* @param ih inference pipeline handle -* @param num_outputs the number of tensor that needed -* @param outputNames the array of tesor name that needed -* -* @return result data memory handle -*/ -ResultHandle AllocSpecificResultHandle(ModelHandle ih, const int num_outputs, - char** outputNames); - -/** -* @brief inference result from input -* @param ih inference pipeline handle -* @param ir result data memory handle -* @param num_input the number of input data -* @param inputNames the array of all input data's name in string format -* @param mem the array of all input data -* -* @return -*/ -void RunModel(ModelHandle ih, ResultHandle ir, const int num_input, char** inputNames, void** mem); - -/** -* @brief get the number of model output from ResultHandle -* @param ir result data memory handle -* -* @return the number of output -*/ -int 
GetNumOutputsFromResultHandle(ResultHandle ir); - -/** -* @brief get data from ResultHandle, default to pass value of output ptr, -* if need copy data to your own ptr, please use CopyOutputsFromResultHandle -* @param ir result data memory handle -* @param num_outputs the number of output data -* @param outputNames the array of all output data's name in string format -* @param data the array of all output data's content -* @param n the array of all output data's n dimension -* @param c the array of all output data's c dimension -* @param h the array of all output data's h dimension -* @param w the array of all output data's w dimension -* @param dt_output the array of all output data's data type -* @param df_output the array of all output data's data format -* -* @return -*/ -void GetPtrFromResultHandle(ResultHandle ir, int num_outputs, char** outputNames, void** data, - int* n, int* c, int* h, int* w, - DATA_TYPE* dt_output, DATA_FORMAT* df_output); - -/** -* @brief get data ptr from ResultHandle with memcpy -* @param ir result data memory handle -* @param num_outputs the number of output data -* @param outputNames the array of all output data's name in string format -* @param data the array of all output data's content -* @param n the array of all output data's n dimension -* @param c the array of all output data's c dimension -* @param h the array of all output data's h dimension -* @param w the array of all output data's w dimension -* @param dt_output the array of all output data's data type -* @param df_output the array of all output data's data format -* -* @return -*/ -void CopyOutputsFromResultHandle(ResultHandle ir, int num_outputs, char** outputNames, void** data, - int* n, int* c, int* h, int* w, - DATA_TYPE* dt_output, DATA_FORMAT* df_output); -/** -* @brief free result data memory -* @param ir result data memory handle -* -* @return -*/ -void FreeResultHandle(ResultHandle ir); - -/** -* @brief destroy model -* @param ih inference pipeline handle -* -* @return -*/ -void DestroyModel(ModelHandle ih); -#ifdef __cplusplus -} -#endif diff --git a/inference/exports/java/BoltModel.java b/inference/exports/java/BoltModel.java deleted file mode 100644 index a69139e5..00000000 --- a/inference/exports/java/BoltModel.java +++ /dev/null @@ -1,302 +0,0 @@ -/** -* @file -* @brief Java BoltModel Class Document -* -* @copyright -* @code -* Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -* WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-* @endcode -*/ - -import java.io.File; - -/** CPU affinity policy */ -enum AffinityType { - HIGH_PERFORMANCE, ///< performance is high priority(use big core) - LOW_POWER ///< power is high priority(use small core) -} - -/** heterogeneous device type */ -enum DeviceType { - CPU, ///< CPU - GPU ///< GPU -} - -/** data precision */ -enum DataType { - FP32, ///< 32 bit float - FP16, ///< 16 bit float - INT32, ///< 32 bit integer - UINT32 ///< 32 bit unsigned char -} - -/** multi-dimensions data format */ -enum DataFormat { - NCHW, ///< batch->channel->high->width data order - NHWC, ///< batch->high->width->channel data order - MTK, ///< batch->time->unit data order - NORMAL ///< vectorize input of row major -} - -public final class BoltModel { - private static void loadLibrary(String libraryAbsolutePath, boolean optional) { - File file = new File(libraryAbsolutePath); - if (file.exists()) { - System.load(libraryAbsolutePath); - } - else { - if (!optional) { - System.err.println("[ERROR] unable to load " + libraryAbsolutePath); - System.exit(1); - } - } - } - - static { - loadLibrary("/data/local/tmp/CI/libc++_shared.so", true); - loadLibrary("/system/lib64/libOpenCL.so", true); - //loadLibrary("/data/local/tmp/CI/java/libOpenCL.so", true); - loadLibrary("/data/local/tmp/CI/java/libkernelbin.so", true); - loadLibrary("/data/local/tmp/CI/java/libBoltModel.so", false); - } - - private long modelAddr; - - private long IResult; - - private native long model_create(String modelPath, String affinity, String device); - - private native void model_ready(long modelAddr, int num_input, - String[] input_names, int[] n, int[] c, int[] h, int[] w, String[] dt_input, String[] df_input); - - private native void model_resize_input(long modelAddr, int num_input, - String[] input_names, int[] n, int[] c, int[] h, int[] w, String[] dt_input, String[] df_input); - - private native long IResult_malloc_all(long modelAddr); - - private native long IResult_malloc_part(long modelAddr, int num_outputs, String[] outputNames); - - private native void model_run(long modelAddr, long IResult, - int num_input, String[] input_names, float[][] inputData); - - private native BoltResult getOutput(long IResult); - - private native void IResult_free(long IResult); - - private native void destroyModel(long modelAddr); - - public String AffinityMapping(AffinityType affinity) { - String ret = "HIGH_PERFORMANCE"; - if (affinity == AffinityType.HIGH_PERFORMANCE) { - ret = "HIGH_PERFORMANCE"; - } else if (affinity == AffinityType.LOW_POWER) { - ret = "LOW_POWER"; - } else { - System.err.println("[ERROR] unsupported CPU affinity in " + this.getClass().getName()); - System.exit(1); - } - return ret; - } - - public String DeviceMapping(DeviceType device) { - String ret = "CPU"; - if (device == DeviceType.CPU) { - ret = "CPU"; - } else if (device == DeviceType.GPU) { - ret = "GPU"; - } else { - System.err.println("[ERROR] unsupported device in " + this.getClass().getName()); - System.exit(1); - } - return ret; - } - - public String DataTypeMapping(DataType data_type) { - String ret = "FP32"; - if (data_type == DataType.FP32) { - ret = "FP32"; - } else if (data_type == DataType.FP16) { - ret = "FP16"; - } else if (data_type == DataType.INT32) { - ret = "INT32"; - } else if (data_type == DataType.UINT32) { - ret = "UINT32"; - } else { - System.err.println("[ERROR] unsupported data type in " + this.getClass().getName()); - System.exit(1); - } - return ret; - } - - private String DataFormatMapping(DataFormat data_format) { - String ret = 
"NCHW"; - if (data_format == DataFormat.NCHW) { - ret = "NCHW"; - } else if (data_format == DataFormat.NHWC) { - ret = "NHWC"; - } else if (data_format == DataFormat.MTK) { - ret = "MTK"; - } else if (data_format == DataFormat.NORMAL) { - ret = "NORMAL"; - } else { - System.err.println("[ERROR] unsupported data format in " + this.getClass().getName()); - System.exit(1); - } - return ret; - } - - /** - * @brief initial model and alloc memory - * @param modelPath model file path of String type - * @param affinity CPU affinity setting of AffinityType(enum) type - * @param device heterogeneous device setting of DeviceType(enum) type - * @param num_input the number of input data of int type - * @param input_names the array of all input data's name of string type - * @param n the array of all input data's n dimension of int type - * @param c the array of all input data's c dimension of int type - * @param h the array of all input data's h dimension of int type - * @param w the array of all input data's w dimension of int type - * @param dts the array of all input data's data type of DataType(enum) type - * @param dfs the array of all input data's data format of DataFormat(enum) type - * - * @return - * - * @note destroy model when pipeline end - * @code - * BoltModel(...); - * ... - * Destructor(); - * @endcode - */ - BoltModel(String modelPath, AffinityType affinity, DeviceType device, - int num_input, String[] input_names, int[] n, int[] c, int[] h, int[] w, - DataType[] dts, DataFormat[] dfs) - { - String input_affinity = AffinityMapping(affinity); - String input_device = DeviceMapping(device); - String[] dts_str = new String[num_input]; - String[] dfs_str = new String[num_input]; - for (int i = 0; i < num_input; i++) { - dts_str[i] = DataTypeMapping(dts[i]); - dfs_str[i] = DataFormatMapping(dfs[i]); - } - - this.modelAddr = model_create(modelPath, input_affinity, input_device); - - model_ready(this.modelAddr, num_input, input_names, n, c, h, w, dts_str, dfs_str); - - this.IResult = IResult_malloc_all(this.modelAddr); - } - - /** - * @brief initial model and alloc memory, and the output is decided by user - * @param modelPath model file path of String type - * @param affinity CPU affinity setting of AffinityType(enum) type - * @param device heterogeneous device setting of DeviceType(enum) type - * @param num_input the number of input data of int type - * @param input_names the array of all input data's name of string type - * @param n the array of all input data's n dimension of int type - * @param c the array of all input data's c dimension of int type - * @param h the array of all input data's h dimension of int type - * @param w the array of all input data's w dimension of int type - * @param dts the array of all input data's data type of DataType(enum) type - * @param dfs the array of all input data's data format of DataFormat(enum) type - * @param num_output the number of output data of int type - * @param output_names the array of all output data's name of string type - * - * @return - * - * @note destroy model when pipeline end - * @code - * BoltModel(...); - * ... 
- * Destructor(); - * @endcode - */ - BoltModel(String modelPath, AffinityType affinity, DeviceType device, - int num_input, String[] input_names, - int[] n, int[] c, int[] h, int[] w, - DataType[] dts, DataFormat[] dfs, - int num_output, String[] output_names) - { - String input_affinity = AffinityMapping(affinity); - String input_device = DeviceMapping(device); - String[] dts_str = new String[num_input]; - String[] dfs_str = new String[num_input]; - for (int i = 0; i < num_input; i++) { - dts_str[i] = DataTypeMapping(dts[i]); - dfs_str[i] = DataFormatMapping(dfs[i]); - } - - this.modelAddr = model_create(modelPath, input_affinity, input_device); - - model_ready(this.modelAddr, num_input, input_names, n, c, h, w, dts_str, dfs_str); - - this.IResult = IResult_malloc_part(this.modelAddr, num_output, output_names); - } - - /** - * @brief inference result from input - * @param num_input the number of input data of int type - * @param input_names the array of all input data's name of string type - * @param inputData the 2D array of all input data of float type - * - * @return BoltResult : the result class of bolt model after inference - */ - public BoltResult Run(int num_input, String[] input_names, float[][] inputData) { - model_run(this.modelAddr, this.IResult, num_input, input_names, inputData); - BoltResult bolt_result = getOutput(this.IResult); - return bolt_result; - } - - /** - * @brief inference result from resized input - * @param num_input the number of input data of int type - * @param input_names the array of all input data's name of String type - * @param n the array of all input data's n dimension of int type - * @param c the array of all input data's c dimension of int type - * @param h the array of all input data's h dimension of int type - * @param w the array of all input data's w dimension of int type - * @param dts the array of all input data's data type of DataType(enum) type - * @param dfs the array of all input data's data format of DataFormat(enum) type - * @param inputData the 2D array of all input data of float type - * - * @return BoltResult : the result class of bolt model after inference - */ - public BoltResult Run( - int num_input, String[] input_names, int[] n, int[] c, int[] h, int[] w, - DataType[] dts, DataFormat[] dfs, - float[][] inputData) { - String[] dts_str = new String[num_input]; - String[] dfs_str = new String[num_input]; - for (int i = 0; i < num_input; i++) { - dts_str[i] = DataTypeMapping(dts[i]); - dfs_str[i] = DataFormatMapping(dfs[i]); - } - model_resize_input(this.modelAddr, num_input, input_names, n, c, h, w, dts_str, dfs_str); - model_run(this.modelAddr, this.IResult, num_input, input_names, inputData); - BoltResult bolt_result = getOutput(this.IResult); - return bolt_result; - } - - /** - * @brief recycle memory and destroy model - * - * @return - */ - public void Destructor() { - IResult_free(this.IResult); - destroyModel(this.modelAddr); - } -} diff --git a/inference/exports/java/BoltResult.java b/inference/exports/java/BoltResult.java deleted file mode 100644 index f9837b76..00000000 --- a/inference/exports/java/BoltResult.java +++ /dev/null @@ -1,121 +0,0 @@ -/** -* @file -* @brief Java BoltResult Class Document -* -* @copyright -* @code -* Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -* WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -* @endcode -*/ -public class BoltResult { - /** 2d float array of output data in the inference result, the length of output_values is output size */ - private float[][] output_values; - - /** 2d int array of output dimension info in the inference result, the length of output_dimensions is output size */ - private int[][] output_dimensions; - - /** String array of output names info in the inference result, the length of output_names is output size */ - private String[] output_names; - - /** String array of output data info in the inference result, the length of output_dataformat is output size */ - private String[] output_dataformat; - - /** calculate product and skip 0 */ - public static int calculateLength(int[] array) { - int num = array.length; - int length = 0; - for (int j = 0; j < num; j++) { - if (array[j] == 0) - break; - else { - if (length == 0) - length = array[j]; - else - length *= array[j]; - } - } - return length; - } - - public BoltResult(float[][] output_values, - int[][] output_dimensions, - String[] output_names, - String[] output_dataformat) - { - this.output_values = output_values; - this.output_dimensions = output_dimensions; - this.output_names = output_names; - this.output_dataformat = output_dataformat; - } - - /** - * @brief get result data name from BoltResult object - * - * @return 1d String array of output data in the inference result - */ - public String[] getResultName() { - return this.output_names; - } - - /** - * @brief get result data format from BoltResult object - * - * @return 1d String array of output data in the inference result - */ - public String[] getResultDataFormat() { - return this.output_dataformat; - } - - /** - * @brief get result data dimension information from BoltResult object - * - * @return 2d int array of output data in the inference result - */ - public int[][] getResultDimension() { - return this.output_dimensions; - } - - /** - * @brief get result data array from BoltResult object - * - * @return 2d float array of output data in the inference result - */ - public float[][] getResultData() { - return this.output_values; - } - - /** - * @brief print BoltResult object info - * @param num the number of the result you want - * - * @return - */ - public void print(int num) { - for (int i = 0; i < output_names.length; i++) { - System.out.println("[INFO] output name: " + output_names[i]); - System.out.println(" data format: " + output_dataformat[i]); - int len = calculateLength(this.output_dimensions[i]); 
-            System.out.println("    data number: " + len);
-            if (num >= 0) {
-                if (num < len)
-                    len = num;
-            }
-
-            for (int j = 0; j < len; j++) {
-                System.out.print(output_values[i][j] + " ");
-            }
-            System.out.println();
-        }
-    }
-}
diff --git a/inference/flow/CMakeLists.txt b/inference/flow/CMakeLists.txt
new file mode 100644
index 00000000..2334a518
--- /dev/null
+++ b/inference/flow/CMakeLists.txt
@@ -0,0 +1,21 @@
+cmake_minimum_required(VERSION 3.2)
+
+file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake)
+if (BOLT_CONFIGURE_FILE)
+    include(${BOLT_CONFIGURE_FILE})
+else (BOLT_CONFIGURE_FILE)
+    message(FATAL_ERROR "
+FATAL: cannot find bolt.cmake,
+    please set shell or cmake environment variable BOLT_ROOT.
+    ")
+endif (BOLT_CONFIGURE_FILE)
+
+project(flow)
+
+set_policy()
+
+set_c_cxx_flags()
+
+include_flow()
+
+add_subdirectory(src)
diff --git a/inference/flow/include/flow.h b/inference/flow/include/flow.h
new file mode 100644
index 00000000..3f8040f1
--- /dev/null
+++ b/inference/flow/include/flow.h
@@ -0,0 +1,85 @@
+/**
+ * @file
+ * @brief Flow API Document
+ *
+ * @copyright
+ * @code
+ * Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * @endcode
+ */
+
+#ifndef FLOW_INCLUDE_FLOW_H_
+#define FLOW_INCLUDE_FLOW_H_
+
+#include <memory>
+#include <queue>
+#include <string>
+#include <vector>
+#include "flow.pb.h"
+
+#include "node.h"
+#include "task.h"
+#include "schedule.h"
+#include "tensor.hpp"
+
+class Flow {
+public:
+    Flow();
+
+    ~Flow();
+
+    /**
+     * @brief initialize flow
+     * @param graphPaths predefined flow graph file path array
+     * @param precision data process precision
+     * @param affinityPolicy CPU affinity setting
+     * @param cpuThreads the number of CPU cores to use (default is 1)
+     * @param useGPU whether to use the ARM MALI GPU (default is true)
+     *
+     * @return
+     */
+    void init(std::vector<std::string> graphPaths,
+        DataType precision,
+        AffinityPolicy affinityPolicy = AFFINITY_CPU_HIGH_PERFORMANCE,
+        int cpuThreads = 1,
+        bool useGPU = true);
+
+    /**
+     * @brief add a task to the flow inference queue
+     * @param task predefined flow task
+     *
+     * @return
+     */
+    void enqueue(Task task);
+
+    /**
+     * @brief get already-finished tasks
+     * @param block whether to block until all enqueued tasks have finished (default is false)
+     *
+     * @return finishedTasks: array of already-finished tasks
+     */
+    std::vector<Task> dequeue(bool block = false);
+
+    /**
+     * @brief get the current number of unfinished tasks
+     *
+     * @return size : the number of unfinished tasks
+     */
+    unsigned int size();
+
+private:
+    Schedule schedule;
+    std::queue<std::shared_ptr<Task>> tasks;
+};
+#endif  // FLOW_INCLUDE_FLOW_H_
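The queue-style API above implies a simple produce/consume loop. A minimal sketch follows; task.h is not part of this diff, so the Task constructor (graph path plus a name-to-tensor map) is an assumption, while everything else uses only the Flow methods declared above.

```cpp
#include <map>
#include "flow.h"

int main()
{
    Flow flow;
    // one prototxt graph, FP32 precision, defaults for affinity/threads/GPU
    flow.init({"./graph.prototxt"}, DT_F32);

    std::map<std::string, std::shared_ptr<Tensor>> data;
    // ... fill data["input"] with a tensor matching the graph's input node ...

    Task task("./graph.prototxt", data);  // assumed constructor, see task.h
    flow.enqueue(task);

    std::vector<Task> finished = flow.dequeue(true);  // block until done
    return 0;
}
```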
diff --git a/inference/flow/include/flow_function_factory.h b/inference/flow/include/flow_function_factory.h
new file mode 100644
index 00000000..7aa6643d
--- /dev/null
+++ b/inference/flow/include/flow_function_factory.h
@@ -0,0 +1,35 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef FLOW_INCLUDE_FLOW_FUNCTION_FACTORY_H_
+#define FLOW_INCLUDE_FLOW_FUNCTION_FACTORY_H_
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "error.h"
+#include "tensor.hpp"
+
+typedef EE (*FlowFunction)(std::map<std::string, std::shared_ptr<Tensor>> &,
+    std::shared_ptr<Tensor> &,
+    std::map<std::string, std::shared_ptr<Tensor>> &,
+    std::vector<std::string>);
+
+void flowBuildFunctions();
+
+FlowFunction flowGetFunctionByName(std::string functionName);
+
+void flowRegisterFunction(std::string functionName, FlowFunction function);
+#endif  // FLOW_INCLUDE_FLOW_FUNCTION_FACTORY_H_
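A user-supplied pre/postprocess step is just a free function matching the FlowFunction typedef, registered under a name that graph files can reference. A sketch, with an illustrative (not real) function body:

```cpp
#include "flow_function_factory.h"

// signature mirrors the FlowFunction typedef above; the body is a placeholder
EE topLabelPostprocess(std::map<std::string, std::shared_ptr<Tensor>> &inputs,
    std::shared_ptr<Tensor> &tmp,
    std::map<std::string, std::shared_ptr<Tensor>> &outputs,
    std::vector<std::string> parameter)
{
    // parameter[0] is the function name itself; extra entries are user-defined
    // ... read inputs, write outputs ...
    return SUCCESS;
}

void registerCustomFunctions()
{
    // after this call, "topLabelPostprocess" can be named in a node's
    // postprocess_parameter list in the graph file
    flowRegisterFunction("topLabelPostprocess", topLabelPostprocess);
}
```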
diff --git a/inference/flow/include/node.h b/inference/flow/include/node.h
new file mode 100644
index 00000000..5a1d5d45
--- /dev/null
+++ b/inference/flow/include/node.h
@@ -0,0 +1,72 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef FLOW_INCLUDE_NODE_H_
+#define FLOW_INCLUDE_NODE_H_
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "flow.pb.h"
+#include "cnn.h"
+#include "flow_function_factory.h"
+
+class Node {
+public:
+    Node();
+
+    ~Node();
+
+    Node clone();
+
+    void setNodeParameter(flow::NodeParameter nodeParameter);
+
+    flow::NodeParameter getNodeParameter();
+
+    EE inferOutputSize();
+
+    void setPrecision(DataType precision);
+
+    void initInference(AffinityPolicy affinityPolicy);
+
+    unsigned int getTmpBufferSize();
+
+    void setTmpBuffer(std::shared_ptr<Tensor> tmpTensor);
+
+    EE ready();
+
+    void setInput(std::map<std::string, std::shared_ptr<Tensor>> inputs);
+
+    void setOutput(std::map<std::string, std::shared_ptr<Tensor>> outputs);
+
+    void setRuntime(int cpuId, Arch arch);
+
+    EE run();
+
+private:
+    DataType precision;
+    flow::NodeParameter nodeParameter;
+    std::map<std::string, std::shared_ptr<Tensor>> inputs;
+    std::shared_ptr<Tensor> tmpTensor;
+    std::map<std::string, std::shared_ptr<Tensor>> outputs;
+    FlowFunction inferOutputSizeFunction;
+    std::vector<std::string> inferOutputSizeParameter;
+    FlowFunction preprocessFunction;
+    std::vector<std::string> preprocessParameter;
+    CNN boltModel;
+    std::vector<std::string> inferenceParameter;
+    FlowFunction postprocessFunction;
+    std::vector<std::string> postprocessParameter;
+};
+#endif  // FLOW_INCLUDE_NODE_H_
diff --git a/inference/flow/src/CMakeLists.txt b/inference/flow/src/CMakeLists.txt
new file mode 100644
index 00000000..df667a31
--- /dev/null
+++ b/inference/flow/src/CMakeLists.txt
@@ -0,0 +1,32 @@
+file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
+
+include_directories(${Protobuf_INCLUDE_DIR})
+
+protobuf_generate_cpp(FLOW_PROTO_SRCS FLOW_PROTO_HDRS flow.proto)
+
+include_directories(${PROJECT_SOURCE_DIR}/include)
+add_custom_target(flow.pb.h ALL
+    DEPENDS ${FLOW_PROTO_HDRS}
+    COMMAND ${CMAKE_COMMAND} -E copy ${FLOW_PROTO_HDRS} ${PROJECT_SOURCE_DIR}/include)
+
+# shared library
+add_library(${PROJECT_NAME} SHARED ${srcs} ${FLOW_PROTO_HDRS} ${FLOW_PROTO_SRCS})
+# static library
+add_library(${PROJECT_NAME}_static STATIC ${srcs} ${FLOW_PROTO_HDRS} ${FLOW_PROTO_SRCS})
+if (USE_IOS_CLANG)
+    target_link_libraries(${PROJECT_NAME} LINK_PUBLIC tensor image model_tools engine)
+    if (BUILD_TEST)
+        target_link_libraries(${PROJECT_NAME} LINK_PUBLIC ${JPEG_LIBRARY})
+    endif (BUILD_TEST)
+    target_link_libraries(${PROJECT_NAME} LINK_PUBLIC ${Protobuf_LIBRARY})
+endif (USE_IOS_CLANG)
+add_dependencies(${PROJECT_NAME} flow.pb.h)
+add_dependencies(${PROJECT_NAME}_static flow.pb.h)
+
+set_target_properties(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}")
+set_target_properties(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1)
+set_target_properties(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1)
+install(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static
+    LIBRARY DESTINATION lib
+    ARCHIVE DESTINATION lib)
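The Node class above is driven by the Schedule, which is not part of this diff, so the call order in the following sketch is inferred from node.h and node.cpp rather than documented behavior; treat it as a plausible driver sequence, not the real scheduler code.

```cpp
#include "node.h"

void runNode(flow::NodeParameter param,
    std::map<std::string, std::shared_ptr<Tensor>> inputTensors)
{
    Node node;
    node.setNodeParameter(param);                       // one node from the graph file
    node.setPrecision(DT_F32);
    node.initInference(AFFINITY_CPU_HIGH_PERFORMANCE);  // loads the .bolt model, if any
    node.ready();                // resolves pre/postprocess functions by name
    node.setInput(inputTensors);
    node.inferOutputSize();
    // NodeParameter.tmp is an element count; getTmpBufferSize() scales it by precision.
    // This sketch assumes the caller sizes the tensor via getTmpBufferSize().
    std::shared_ptr<Tensor> tmp(new Tensor());
    node.setTmpBuffer(tmp);
    node.run();
}
```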
diff --git a/inference/flow/src/flow.cpp b/inference/flow/src/flow.cpp
new file mode 100644
index 00000000..d2efbd92
--- /dev/null
+++ b/inference/flow/src/flow.cpp
@@ -0,0 +1,77 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "flow.h"
+
+Flow::Flow()
+{}
+
+Flow::~Flow()
+{
+    this->schedule.end();
+}
+
+void Flow::init(std::vector<std::string> graphPath,
+    DataType precision,
+    AffinityPolicy affinityPolicy,
+    int cpuThreads,
+    bool useGPU)
+{
+    UNI_DEBUG_LOG("flow (schedule) init begin\n");
+    flowBuildFunctions();
+    this->schedule.init(graphPath, precision, affinityPolicy, cpuThreads, useGPU);
+    UNI_DEBUG_LOG("flow init end\n");
+}
+
+void Flow::enqueue(Task task)
+{
+    UNI_DEBUG_LOG("user enqueues task: begin\n");
+    if (task.status != TASK_READY) {
+        UNI_ERROR_LOG("task is not ready to be enqueued\n");
+    }
+    std::shared_ptr<Task> taskPtr = std::shared_ptr<Task>(new Task(&task));
+    this->tasks.emplace(taskPtr);
+    this->schedule.enqueue(taskPtr.get());
+    UNI_DEBUG_LOG("user enqueues task: end\n");
+}
+
+std::vector<Task> Flow::dequeue(bool block)
+{
+    std::vector<Task> outputs;
+    if (this->tasks.size() == 0) {
+        return outputs;
+    }
+    for (;;) {
+        if (this->tasks.size() == 0) {
+            break;
+        }
+        auto task = this->tasks.front();
+        if (task->status == TASK_END) {
+            outputs.push_back(*task.get());
+            this->tasks.pop();
+        } else {
+            if (!block) {
+                break;
+            }
+        }
+    }
+    if (outputs.size() > 0) {
+        UNI_DEBUG_LOG("user get result (num=%d) end\n", (int)outputs.size());
+    }
+    return outputs;
+}
+
+unsigned int Flow::size()
+{
+    return this->tasks.size();
+}
diff --git a/inference/flow/src/flow.proto b/inference/flow/src/flow.proto
new file mode 100644
index 00000000..b1d343bc
--- /dev/null
+++ b/inference/flow/src/flow.proto
@@ -0,0 +1,96 @@
+/**
+ * @file
+ * @brief Flow graph definition
+ *
+ * @copyright
+ * @code
+ * Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * @endcode
+ */
+
+syntax = "proto2";
+
+package flow;
+
+/** flow graph definition */
+message GraphParameter {
+    /** graph name */
+    optional string name = 1;
+    /** graph input names */
+    repeated string input = 2;
+    /** graph output names */
+    repeated string output = 3;
+    /** node parameter */
+    repeated NodeParameter node = 4;
+}
+
+/** flow node definition */
+message NodeParameter {
+    /** node name */
+    optional string name = 1;
+    /**
+     * node type
+     * type='Input' to mark a node as an input node
+     * type='Inference' to mark a node as an inference node
+     */
+    optional string type = 2;
+
+    /**
+     * node input data type; only input nodes use this information
+     * candidates are FLOAT32, FLOAT16, UINT32, INT8
+     */
+    optional string input_type = 3;
+    /**
+     * node input data format; only input nodes use this information
+     * candidates are NCHW, NCHWC8, MTK, NORMAL
+     */
+    optional string input_format = 4;
+    /** node input dimensions; only input nodes use this information */
+    repeated int32 input_dim = 5;
+
+    /** node input names */
+    repeated string input = 6;
+    /** node output names */
+    repeated string output = 7;
+
+    /** preprocess and postprocess tmp buffer size, in number of elements, not bytes */
+    optional uint32 tmp = 8 [default = 0];
+    /** node inference precision; the preferred processing precision is supported */
+    optional string precision = 9 [default = "FLOAT32"];
+
+    /**
+     * self-defined infer-node-output-size function parameter
+     * parameters are managed as an array: [function name, other parameters]
+     * the other parameters are optional
+     */
+    repeated string infer_output_size_parameter = 10;
+    /**
+     * self-defined preprocess function parameter
+     * parameters are managed as an array: [preprocess function name, other parameters]
+     * the other parameters are optional
+     */
+    repeated string preprocess_parameter = 11;
+    /**
+     * self-defined inference parameter
+     * parameters are managed as an array: [inference model path, model outputs]
+     * the model output parameters are optional when there is no postprocess function
+     */
+    repeated string inference_parameter = 12;
+    /**
+     * self-defined postprocess function parameter
+     * parameters are managed as an array: [postprocess function name, other parameters]
+     * the other parameters are optional
+     */
+    repeated string postprocess_parameter = 13;
+}
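For reference, a GraphParameter written in protobuf text format might look like the following. Every field name comes from the messages above; the graph layout, node names and model path are hypothetical.

```protobuf
# hypothetical two-node graph: an input node feeding one inference node
name: "demo_graph"
input: "image"
output: "score"
node {
    name: "image"
    type: "Input"
    input_type: "FLOAT32"
    input_format: "NCHW"
    input_dim: 1
    input_dim: 3
    input_dim: 224
    input_dim: 224
    output: "image"
}
node {
    name: "classifier"
    type: "Inference"
    input: "image"
    output: "score"
    inference_parameter: "/path/to/model_f32.bolt"
    inference_parameter: "score"
}
```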
diff --git a/inference/flow/src/flow_function_factory.cpp b/inference/flow/src/flow_function_factory.cpp
new file mode 100644
index 00000000..64be514e
--- /dev/null
+++ b/inference/flow/src/flow_function_factory.cpp
@@ -0,0 +1,43 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "flow_function_factory.h"
+
+std::map<std::string, FlowFunction> flowFunctions;
+
+EE test(std::map<std::string, std::shared_ptr<Tensor>> &inputs,
+    std::shared_ptr<Tensor> &tmp,
+    std::map<std::string, std::shared_ptr<Tensor>> &outputs,
+    std::vector<std::string> parameter = std::vector<std::string>())
+{
+    return SUCCESS;
+}
+
+void flowBuildFunctions()
+{
+    flowFunctions["test"] = test;
+}
+
+FlowFunction flowGetFunctionByName(std::string functionName)
+{
+    if (flowFunctions.find(functionName) != flowFunctions.end()) {
+        return flowFunctions[functionName];
+    } else {
+        return NULL;
+    }
+}
+
+void flowRegisterFunction(std::string functionName, FlowFunction function)
+{
+    flowFunctions[functionName] = function;
+}
diff --git a/inference/flow/src/node.cpp b/inference/flow/src/node.cpp
new file mode 100644
index 00000000..9ad497ad
--- /dev/null
+++ b/inference/flow/src/node.cpp
@@ -0,0 +1,242 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "node.h"
+#include "inference.hpp"
+
+Node::Node()
+{}
+
+Node::~Node()
+{}
+
+Node Node::clone()
+{
+    UNI_DEBUG_LOG("node %s clone start\n", this->nodeParameter.name().c_str());
+    Node node = *this;
+    node.boltModel = node.boltModel.clone();
+    UNI_DEBUG_LOG("node %s clone end\n", this->nodeParameter.name().c_str());
+    return node;
+}
+
+void Node::setNodeParameter(flow::NodeParameter nodeParameter)
+{
+    this->nodeParameter = nodeParameter;
+    for (int i = 0; i < this->nodeParameter.infer_output_size_parameter_size(); i++) {
+        this->inferOutputSizeParameter.push_back(this->nodeParameter.infer_output_size_parameter(i));
+    }
+    for (int i = 0; i < this->nodeParameter.preprocess_parameter_size(); i++) {
+        this->preprocessParameter.push_back(this->nodeParameter.preprocess_parameter(i));
+    }
+    for (int i = 0; i < this->nodeParameter.inference_parameter_size(); i++) {
+        this->inferenceParameter.push_back(this->nodeParameter.inference_parameter(i));
+    }
+    for (int i = 0; i < this->nodeParameter.postprocess_parameter_size(); i++) {
+        this->postprocessParameter.push_back(this->nodeParameter.postprocess_parameter(i));
+    }
+}
+
+flow::NodeParameter Node::getNodeParameter()
+{
+    return this->nodeParameter;
+}
+
+EE Node::inferOutputSize()
+{
+    std::string inferOutputSizeFunctionName =
+        (this->inferOutputSizeParameter.size() > 0) ? this->inferOutputSizeParameter[0] : "NULL";
+    UNI_DEBUG_LOG("node %s infer output size use %s begin\n", this->nodeParameter.name().c_str(),
+        inferOutputSizeFunctionName.c_str());
+    this->inferOutputSizeFunction = flowGetFunctionByName(inferOutputSizeFunctionName);
+    EE ret = SUCCESS;
+    if (this->inferOutputSizeFunction != NULL) {
+        ret = this->inferOutputSizeFunction(
+            this->inputs, this->tmpTensor, this->outputs, this->inferOutputSizeParameter);
+    } else {
+        std::vector<std::string> inferenceOutputNames =
+            this->boltModel.get_model_output_tensor_names();
+        for (std::string name : inferenceOutputNames) {
+            if (this->outputs.find(name) == this->outputs.end()) {
+                this->outputs[name] = std::shared_ptr<Tensor>(new Tensor());
+            }
+            this->outputs[name]->resize(this->boltModel.get_tensor_desc_by_name(name));
+        }
+    }
+    UNI_DEBUG_LOG("node %s infer output size end\n", this->nodeParameter.name().c_str());
+    return ret;
+}
+
+void Node::setPrecision(DataType precision)
+{
+    this->precision = precision;
+}
+
+unsigned int Node::getTmpBufferSize()
+{
+    return this->nodeParameter.tmp() * bytesOf(this->precision);
+}
+
+void Node::setTmpBuffer(std::shared_ptr<Tensor> tmpTensor)
+{
+    this->tmpTensor = tmpTensor;
+}
+
+void Node::initInference(AffinityPolicy affinityPolicy)
+{
+    if (this->inferenceParameter.size() == 0) {
+        UNI_DEBUG_LOG("node %s has no inference\n", this->nodeParameter.name().c_str());
+        return;
+    }
+    std::string modelPath = this->inferenceParameter[0];
+    const char *algorithmMapPath = "./";
+    if (this->inferenceParameter.size() > 1) {
+        algorithmMapPath = this->inferenceParameter[1].c_str();
+    }
+    UNI_DEBUG_LOG("node %s init inference engine (precision:%d affinity:%d algorithm:%s) from %s\n",
+        this->nodeParameter.name().c_str(), this->precision, affinityPolicy, algorithmMapPath,
+        modelPath.c_str());
+    ModelSpec ms;
+    CHECK_STATUS(deserialize_model_from_file(modelPath.c_str(), &ms));
+    CNN cnn(affinityPolicy, precision, ms.model_name);
+    cnn.sort_operators_sequential(&ms);
+    cnn.initialize_ops(&ms);
+    cnn.loadAlgorithmMapFromText(algorithmMapPath);
+    std::map<std::string, TensorDesc> inputDescMap = extractInputDims(&ms);
+    cnn.ready(inputDescMap);
+    CHECK_STATUS(cnn.mark_input_output());
+    cnn.saveAlgorithmMapToText(algorithmMapPath);
+    CHECK_STATUS(mt_destroy_model(&ms));
+    this->boltModel = cnn;
+    UNI_DEBUG_LOG("node %s init inference engine end\n", this->nodeParameter.name().c_str());
+}
+
+EE Node::ready()
+{
+    UNI_DEBUG_LOG("node %s ready begin\n", this->nodeParameter.name().c_str());
+    std::string preprocessFunctionName =
+        (this->preprocessParameter.size() > 0) ? this->preprocessParameter[0] : "NULL";
+    std::string postprocessFunctionName =
+        (this->postprocessParameter.size() > 0) ? this->postprocessParameter[0] : "NULL";
+    this->preprocessFunction = flowGetFunctionByName(preprocessFunctionName);
+    this->postprocessFunction = flowGetFunctionByName(postprocessFunctionName);
+    UNI_DEBUG_LOG("node %s ready end\n", this->nodeParameter.name().c_str());
+    return SUCCESS;
+}
+
+void Node::setInput(std::map<std::string, std::shared_ptr<Tensor>> inputs)
+{
+    this->inputs = inputs;
+}
+
+void Node::setOutput(std::map<std::string, std::shared_ptr<Tensor>> outputs)
+{
+    this->outputs = outputs;
+}
+
+void Node::setRuntime(int cpuId, Arch arch)
+{
+    UNI_DEBUG_LOG("node %s setRuntime (core:%d arch:%d) begin\n",
+        this->nodeParameter.name().c_str(), cpuId, arch);
+    if (this->inferenceParameter[0] != std::string("NULL") && cpuId >= 0) {
+        this->boltModel.set_runtime_device(cpuId, arch);
+    } else {
+        UNI_DEBUG_LOG("setRuntime is currently not supported for a node without inference\n");
+    }
+    UNI_DEBUG_LOG("node %s setRuntime end\n", this->nodeParameter.name().c_str());
+}
+
+EE Node::run()
+{
+    std::string preprocessFunctionName =
+        (this->preprocessParameter.size() > 0) ? this->preprocessParameter[0] : "NULL";
+    std::string postprocessFunctionName =
+        (this->postprocessParameter.size() > 0) ? this->postprocessParameter[0] : "NULL";
+    std::string modelPath = (this->inferenceParameter.size() > 0) ? this->inferenceParameter[0]
+                                                                  : "NULL";
+    UNI_DEBUG_LOG("node %s run begin, preprocess use %s begin\n",
+        this->nodeParameter.name().c_str(), preprocessFunctionName.c_str());
+    std::map<std::string, std::shared_ptr<Tensor>> preprocessOutputs, postprocessInputs;
+    if (postprocessFunction == NULL) {
+        for (auto iter : this->outputs) {
+            postprocessInputs[iter.first] = iter.second;
+        }
+    } else {
+        postprocessInputs = this->boltModel.get_outputs();
+    }
+
+    // preprocess part
+    if (preprocessFunction != NULL) {
+        preprocessOutputs = this->boltModel.get_inputs();
+        preprocessFunction(
+            this->inputs, this->tmpTensor, preprocessOutputs, this->preprocessParameter);
+    } else {
+        UNI_DEBUG_LOG("node %s use default preprocess function (output is set to input)\n",
+            this->nodeParameter.name().c_str());
+        preprocessOutputs = this->inputs;
+    }
+    UNI_DEBUG_LOG("node %s preprocess end, inference use %s begin\n",
+        this->nodeParameter.name().c_str(), this->inferenceParameter[0].c_str());
+
+    // inference part
+    if (modelPath != std::string("NULL")) {
+        std::map<std::string, TensorDesc> inputDescs;
+        for (auto iter : preprocessOutputs) {
+            inputDescs[iter.first] = iter.second->get_desc();
+        }
+        this->boltModel.reready(inputDescs);
+        std::map<std::string, std::shared_ptr<U8>> inputs;
+        for (auto iter : preprocessOutputs) {
+            inputs[iter.first] = ((CpuMemory *)iter.second->get_memory())->get_shared_ptr();
+        }
+        this->boltModel.set_input_tensors_value(inputs);
+        double timeStart = ut_time_ms();
+        this->boltModel.run();
+        double timeEnd = ut_time_ms();
+        UNI_PROFILE_INFO(this->nodeParameter.name().c_str(), "run", timeStart * 1000,
+            (timeEnd - timeStart) * 1000);
+        std::map<std::string, std::shared_ptr<Tensor>> inferenceResult =
+            this->boltModel.get_outputs();
+        for (auto iter : postprocessInputs) {
+            std::string name = iter.first;
+            if (inferenceResult.find(name) != inferenceResult.end()) {
+                TensorDesc desc = inferenceResult[name]->get_desc();
+                iter.second->resize(desc);
+                void *src = ((CpuMemory *)inferenceResult[name]->get_memory())->get_ptr();
+                void *dst = ((CpuMemory *)iter.second->get_memory())->get_ptr();
+                if (src != dst) {
+                    memcpy(dst, src, tensorNumBytes(desc));
+                }
+            } else {
+                UNI_ERROR_LOG("%s is not marked as graph %s output\n", name.c_str(),
+                    this->inferenceParameter[0].c_str());
+            }
+        }
+    } else {
+        UNI_DEBUG_LOG("node %s use default inference function (output is set to input)\n",
+            this->nodeParameter.name().c_str());
+        postprocessInputs = preprocessOutputs;
+    }
+    UNI_DEBUG_LOG("node %s inference end, postprocess use %s begin\n",
+        this->nodeParameter.name().c_str(), postprocessFunctionName.c_str());
+
+    // postprocess part
+    if (this->postprocessFunction != NULL) {
+        this->postprocessFunction(
+            postprocessInputs, this->tmpTensor, this->outputs, this->postprocessParameter);
+    } else {
+        UNI_DEBUG_LOG("node %s use default postprocess function (output is set to input)\n",
+            this->nodeParameter.name().c_str());
+    }
+    UNI_DEBUG_LOG("node %s postprocess end, run end\n", this->nodeParameter.name().c_str());
+    return SUCCESS;
+}
diff --git a/inference/include/BoltModel.h b/inference/include/BoltModel.h
deleted file mode 100644
index a133c1a4..00000000
--- a/inference/include/BoltModel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -/* Header for class BoltModel */ - -#ifndef _Included_BoltModel -#define _Included_BoltModel -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: BoltModel - * Method: model_create - * Signature: (Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;)J - */ -JNIEXPORT jlong JNICALL Java_BoltModel_model_1create - (JNIEnv *, jobject, jstring, jstring, jstring); - -/* - * Class: BoltModel - * Method: model_ready - * Signature: (JI[Ljava/lang/String;[I[I[I[I[Ljava/lang/String;[Ljava/lang/String;)V - */ -JNIEXPORT void JNICALL Java_BoltModel_model_1ready - (JNIEnv *, jobject, jlong, jint, jobjectArray, jintArray, jintArray, jintArray, jintArray, jobjectArray, jobjectArray); - -/* - * Class: BoltModel - * Method: model_ready - * Signature: (JI[Ljava/lang/String;[I[I[I[I[Ljava/lang/String;[Ljava/lang/String;)V - */ -JNIEXPORT void JNICALL Java_BoltModel_model_1resize_1input - (JNIEnv *, jobject, jlong, jint, jobjectArray, jintArray, jintArray, jintArray, jintArray, jobjectArray, jobjectArray); - -/* - * Class: BoltModel - * Method: IResult_malloc_all - * Signature: (J)J - */ -JNIEXPORT jlong JNICALL Java_BoltModel_IResult_1malloc_1all - (JNIEnv *, jobject, jlong); - -/* - * Class: BoltModel - * Method: IResult_malloc_part - * Signature: (JI[Ljava/lang/String;)J - */ -JNIEXPORT jlong JNICALL Java_BoltModel_IResult_1malloc_1part - (JNIEnv *, jobject, jlong, jint, jobjectArray); - -/* - * Class: BoltModel - * Method: model_run - * Signature: (JJI[Ljava/lang/String;[[F)V - */ -JNIEXPORT void JNICALL Java_BoltModel_model_1run - (JNIEnv *, jobject, jlong, jlong, jint, jobjectArray, jobjectArray); - -/* - * Class: BoltModel - * Method: getOutput - * Signature: (J)LBoltResult; - */ -JNIEXPORT jobject JNICALL Java_BoltModel_getOutput - (JNIEnv *, jobject, jlong); - -/* - * Class: BoltModel - * Method: IResult_free - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_BoltModel_IResult_1free - (JNIEnv *, jobject, jlong); - -/* - * Class: BoltModel - * Method: destroyModel - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_BoltModel_destroyModel - (JNIEnv *, jobject, jlong); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/inference/include/argmax.hpp b/inference/include/argmax.hpp deleted file mode 100644 index 6ef3d8ea..00000000 --- a/inference/include/argmax.hpp +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _ARGMAX_H -#define _ARGMAX_H - -#include "operator.hpp" -#include "tensor_computing.h" - -class ArgMax: public Operator -{ -public: - /** - @param mode - */ - ArgMax(DataType dt, I32 axis) - { - this->dt = dt; - this->axis = axis; - } - - OperatorType get_op_type() override - { - return OT_ArgMax; - } - - void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - - CHECK_STATUS(argmax(inputDesc, inputTensor.get_val(), - this->axis, - outputDesc, outputTensor.get_val(), this->schedule)); - - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - EE infer_output_tensors_size(VecinDims, Vec* outDims) override - { - CHECK_STATUS(argmax_infer_output_size(inDims[0], this->axis, &((*outDims)[0]))); - return SUCCESS; - } - -private: - I32 axis; -}; - -#endif //_ARGMAX_H diff --git a/inference/include/attention.hpp b/inference/include/attention.hpp deleted file mode 100644 index 4d4af59f..00000000 --- a/inference/include/attention.hpp +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#ifndef _ATTENTION_H -#define _ATTENTION_H - -#include "operator.hpp" -#include "tensor_computing.h" - -class Attention: public Operator -{ -public: - Attention(DataType dt, U32 numHeads, U32 fromSequenceLength, U32 toSequenceLength) - { - this->dt = dt; - this->numHeads = numHeads; - this->fromSequenceLength = fromSequenceLength; - this->toSequenceLength = toSequenceLength; - } - - OperatorType get_op_type() override - { - return OT_Attention; - } - - void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - - inputDesc.dt = this->dt; - CHECK_STATUS(attention(inputDesc, inputTensor.get_val(), - outputDesc, outputTensor.get_val(), this->schedule)); - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - EE infer_output_tensors_size(VecinDims, Vec* outDims) override - { - inDims[0].dt = this->dt; - CHECK_STATUS(attention_infer_output_size(inDims[0], - this->numHeads, this->fromSequenceLength, this->toSequenceLength, - &((*outDims)[0]))); - return SUCCESS; - } - -private: - U32 numHeads; - U32 fromSequenceLength; - U32 toSequenceLength; -}; - -#endif //_ATTENTION_H diff --git a/inference/include/attention_mask.hpp b/inference/include/attention_mask.hpp deleted file mode 100644 index 00dff935..00000000 --- a/inference/include/attention_mask.hpp +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#ifndef _ATTENTION_MASK_H -#define _ATTENTION_MASK_H - -#include "operator.hpp" -#include "tensor_computing.h" - -class AttentionMask: public Operator -{ -public: - /** - @param mode - */ - AttentionMask(DataType dt, I32 attentionLength, bool sameLength, float mask) - { - this->dt = dt; - this->attentionLength = attentionLength; - this->sameLength = sameLength; - this->mask = mask; - } - - OperatorType get_op_type() override - { - return OT_AttentionMask; - } - - void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - - CHECK_STATUS(attention_mask(inputDesc, inputTensor.get_val(), - this->attentionLength, this->sameLength, this->mask, - outputDesc, outputTensor.get_val(), this->schedule)); - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - EE infer_output_tensors_size(VecinDims, Vec* outDims) override - { - CHECK_STATUS(attention_mask_infer_output_size(inDims[0], &((*outDims)[0]))); - return SUCCESS; - } -private: - I32 attentionLength; - bool sameLength; - float mask; -}; - -#endif diff --git a/inference/include/bilateral_slice_apply.hpp b/inference/include/bilateral_slice_apply.hpp deleted file mode 100644 index 0a2278c5..00000000 --- a/inference/include/bilateral_slice_apply.hpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#ifndef _BILATERAL_SLICE_APPLY_H -#define _BILATERAL_SLICE_APPLY_H -#include -#include "operator.hpp" -#include "tensor_computing.h" -#include "tensor_desc.h" -#include "op_type.h" - -class BilateralSliceApply: public Operator{ -public: - -/** - * @param coefficient_len - * @param has_offset - */ - BilateralSliceApply(U32 coefficient_len, bool has_offset, BilateralSliceApplyMode mode) - { - this->coefficient_len = coefficient_len; - this->has_offset = has_offset; - this->mode = mode; - } - virtual ~BilateralSliceApply(){}; - - OperatorType get_op_type() override - { - return OT_BilateralSliceApply; - } - - BilateralSliceApplyDesc create_BilateralSliceApplyDesc(U32 coefficient_len, bool has_offset, BilateralSliceApplyMode mode) - { - BilateralSliceApplyDesc bilateralSliceApplyDesc; - bilateralSliceApplyDesc.coefficient_len = coefficient_len; - bilateralSliceApplyDesc.has_offset = has_offset; - bilateralSliceApplyDesc.mode = mode; - return bilateralSliceApplyDesc; - } - - void set_coefficient_len(U32 coefficient_len) { - this->coefficient_len = coefficient_len; - } - - void set_has_offset(bool has_offset) { - this->has_offset = has_offset; - } - - void set_mode(BilateralSliceApplyMode mode) { - this->mode = mode; - } - -protected: - U32 coefficient_len; - bool has_offset; - BilateralSliceApplyMode mode; -}; - -#endif //_BILATERAL_SLICE_APPLY_H diff --git a/inference/include/check.hpp b/inference/include/check.hpp deleted file mode 100644 index 0957ec11..00000000 --- a/inference/include/check.hpp +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#ifndef _CHECK_H -#define _CHECK_H - -#include "operator.hpp" -#include "tensor_computing.h" - -class Check: public Operator -{ -public: - /** - @param mode - */ - Check(DataType dt, CheckMode checkMode) - { - this->dt = dt; - this->checkMode = checkMode; - } - - OperatorType get_op_type() override - { - return OT_Check; - } - - void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor inputATensor = this->inputTensors[0]; - TensorDesc inputADesc = inputATensor.get_desc(); - Tensor inputBTensor = this->inputTensors[1]; - TensorDesc inputBDesc = inputBTensor.get_desc(); - - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - - CHECK_STATUS(check(inputADesc, inputATensor.get_val(), - inputBDesc, inputBTensor.get_val(), - this->checkMode, - outputDesc, outputTensor.get_val(), this->schedule)); - - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - EE infer_output_tensors_size(VecinDims, Vec* outDims) override - { - CHECK_STATUS(check_infer_output_size(inDims[0], &((*outDims)[0]))); - return SUCCESS; - } - -private: - CheckMode checkMode; -}; - -#endif //_CHECK_H diff --git a/inference/include/clip.hpp b/inference/include/clip.hpp deleted file mode 100644 index b35baa61..00000000 --- a/inference/include/clip.hpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _CLIP_H -#define _CLIP_H - -#include "operator.hpp" -#include "tensor_computing.h" - -class Clip: public Operator -{ -public: - /** - @param mode - */ - Clip(DataType dt, F32 clipMinScalar, F32 clipMaxScalar) - { - this->dt = dt; - this->clipMinScalar = clipMinScalar; - this->clipMaxScalar = clipMaxScalar; - } - - OperatorType get_op_type() override - { - return OT_Clip; - } - - bool can_input_output_the_same() override - { - return true; - } -protected: - F32 clipMinScalar; - F32 clipMaxScalar; -}; - -#endif //_CLIP_H diff --git a/inference/include/cnn.hpp b/inference/include/cnn.hpp deleted file mode 100644 index fafeea24..00000000 --- a/inference/include/cnn.hpp +++ /dev/null @@ -1,864 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _CNN_H
-#define _CNN_H
-
-#include
-#include
-#include
-#include
-#include "model.hpp"
-#include "model_tools.h"
-#include "tensor.hpp"
-#include "operator.hpp"
-#include "tensor_desc.h"
-#include "factory.hpp"
-#include "cpu/factory_cpu.hpp"
-#ifdef _USE_MALI
-#include "ocl/factory_ocl.hpp"
-#endif
-
-class CNN: public Model {
-public:
-
-    /**
-     * @param name
-     */
-    CNN() {}
-
-    explicit CNN(Arch arch, DataType dt, std::string name) : Model(arch, dt, name) { }
-    virtual ~CNN() = default;
-    /**
-     * @param op
-     * @param in
-     * @param out
-     */
-
-    void sort_operators_sequential(const ModelSpec* ms)
-    {
-        int opNum = ms->num_operator_specs;
-        for (int i = 0; i < opNum; i++) {
-            std::string opName = ms->ops[i].name;
-            if (opName.compare("data") == 0) {
-                continue;
-            }
-            this->sortedOps.push_back(opName);
-        }
-    }
-
-    void initialize_ops(const ModelSpec* ms)
-    {
-        int opNum = ms->num_operator_specs;
-
-        Vec<std::string> modelInputTensorNames;
-        for (int i = 0; i < ms->num_inputs; i++) {
-            modelInputTensorNames.push_back(ms->input_names[i]);
-        }
-        for (int i = 0; i < ms->num_outputs; i++) {
-            this->modelOutputTensorNames.push_back(ms->output_names[i]);
-        }
-        this->modelInputTensorNames = modelInputTensorNames;
-
-        U32 operatorIndex = 0;
-        HashMap<std::string, U32> operatorIndexMap;
-        for (int i = 0; i < opNum; i++) {
-            OperatorSpec curOps = ms->ops[i];
-            std::string opName = curOps.name;
-            if (opName.compare("data") == 0) {
-                continue;
-            }
-            operatorIndexMap[opName] = operatorIndex++;
-        }
-
-        for (int i = 0; i < opNum; i++) {
-            OperatorSpec curOps = ms->ops[i];
-            std::string opName = curOps.name;
-            if (opName.compare("data") == 0) {
-                continue;
-            }
-            Vec<std::string> inputTensorsName;
-            Vec<std::string> outputTensorsName;
-            int inputTensorsNum = curOps.num_inputs;
-            for (int j = 0; j < inputTensorsNum; j++) {
-                inputTensorsName.push_back(curOps.input_tensors_name[j]);
-            }
-
-            int outputTensorsNum = curOps.num_outputs;
-            for (int j = 0; j < outputTensorsNum; j++) {
-                outputTensorsName.push_back(curOps.output_tensors_name[j]);
-            }
-
-            int numTensors = inputTensorsNum + outputTensorsNum;
-            Vec<I32> tensorPositions(numTensors);
-            memcpy(tensorPositions.data(), curOps.tensor_positions, numTensors * bytesOf(DT_I32));
-
-            // create op object
-            std::shared_ptr<Factory> factory;
-#ifdef _USE_MALI
-            if(this->schedule == MALI) {
-                auto factory_ocl = (Factory*)(new FactoryOCL());
-                factory = std::shared_ptr<Factory>(factory_ocl);
-            } else {
-#endif
-                auto factory_cpu = (Factory*)(new FactoryCPU());
-                factory = std::shared_ptr<Factory>(factory_cpu);
-#ifdef _USE_MALI
-            }
-#endif
-            std::shared_ptr<Operator> op = factory->createOperators(curOps, this->dt, operatorIndexMap, inputTensorsName);
-            op->set_op_name(opName);
-#ifdef _USE_MALI
-            if(this->schedule == MALI) CHECK_STATUS(op->set_mali_handle(this->handle));
-#endif
-            op->set_op_schedule(this->schedule);
-            op->set_tensor_positions(tensorPositions);
-            op->init_feature_scale(curOps.num_quant_feature, curOps.feature_scale);
-            this->ops.push_back(op);
-
-            // setup operatorMap, tensorMap, operatorTensorMap
-            this->add(op, inputTensorsName, outputTensorsName);
-        }
-
-        // setup WeightSpec ptr in WeightOperator
-        for (int i = 0; i < ms->num_weight_specs; i++) {
-            WeightSpec curOpWs = ms->ws[i];
-            std::string opName = curOpWs.op_name;
-            auto op = this->operatorMap[opName];
-            auto weightOp = dynamic_cast<WeightOperator*>(op.get());
-            weightOp->set_weightspec_ptr(curOpWs);
-            if (curOpWs.bytes_of_vec != 0) {
-                CHECK_REQUIREMENT(curOpWs.vec != nullptr);
-                weightOp->set_hasBias(true);
-            }
-            // These two pointers will be managed by engine via shared_ptr, so mt_destroy_model should not free them
-            ms->ws[i].weight = nullptr;
-            ms->ws[i].vec = nullptr;
-        }
-    }
-
-    void ready(HashMap<std::string, TensorDesc> inputDescMap) override
-    {
-#ifdef _DEBUG
-        const char* funcStr = "[DEBUG] ready()";
-        std::cout << "[INFO] schedule: " << this->schedule << std::endl;
-        std::cout << funcStr << " Model input num: " << this->modelInputTensorNames.size() << std::endl;
-        for (auto item: this->modelInputTensorNames) {
-            std::cout << " input: " << item << std::endl;
-        }
-#endif
-
-        this->infer_output_tensors_size(inputDescMap);
-        // handle the weight ops
-        for (auto op : this->ops) {
-#ifdef _DEBUG
-            std::cout << funcStr << " op: " << op->get_name() << std::endl;
-#endif
-            if (op->is_weight()) {
-                if (op->get_op_type() == OT_Conv) {
-                    auto convOpPtr = dynamic_cast<Convolution*>(op.get());
-                    CHECK_STATUS(convOpPtr->init_weight_bias_from_model(nullptr));
-                    CHECK_STATUS(convOpPtr->infer_forward_algorithm(this->algorithmMap));
-                } else if (op->get_op_type() == OT_Deconvolution) {
-                    auto convOpPtr = dynamic_cast<Deconvolution*>(op.get());
-                    CHECK_STATUS(convOpPtr->init_weight_bias_from_model(nullptr));
-                    CHECK_STATUS(convOpPtr->infer_forward_algorithm());
-                } else if (op->get_op_type() == OT_FC) {
-                    auto fcOpPtr = dynamic_cast<FullyConnected*>(op.get());
-                    CHECK_STATUS(fcOpPtr->init_weight_bias_from_model(nullptr));
-                    CHECK_STATUS(fcOpPtr->infer_forward_algorithm(this->algorithmMap));
-                } else if (op->get_op_type() == OT_Embedding) {
-                    auto embeddingOpPtr = dynamic_cast<Embedding*>(op.get());
-                    CHECK_STATUS(embeddingOpPtr->init_weight_bias_from_model(nullptr));
-                } else if (op->get_op_type() == OT_LayerNorm) {
-                    auto layernormOpPtr = dynamic_cast<LayerNorm*>(op.get());
-                    CHECK_STATUS(layernormOpPtr->init_weight_bias_from_model(nullptr));
-                } else if (op->get_op_type() == OT_Scale) {
-                    auto scaleOpPtr = dynamic_cast<Scale*>(op.get());
-                    CHECK_STATUS(scaleOpPtr->init_weight_bias_from_model(nullptr));
-                } else if (op->get_op_type() == OT_LSTM) {
-                    auto lstmOpPtr = dynamic_cast<LSTM*>(op.get());
-                    CHECK_STATUS(lstmOpPtr->init_weight_bias_from_model(nullptr));
-                } else if (op->get_op_type() == OT_SharedWeight) {
-                    auto weightOpPtr = dynamic_cast<SharedWeight*>(op.get());
-                    CHECK_STATUS(weightOpPtr->init_weight_bias_from_model(nullptr));
-                    std::string weightOpOutputName = (std::get<1>(this->operatorTensorMap[op->get_name()]))[0];
-                    Tensor weightTensor = weightOpPtr->weightTensors[0];
-                    this->tensorMap[weightOpOutputName]->set_shared_ptr(weightTensor.get_shared_ptr());
-                    this->weightOpOutputNames.insert(weightOpOutputName);
-                }
-            }
-
if(op->get_op_type() == OT_MatMul) { - auto matmulOpPtr = dynamic_cast(op.get()); - CHECK_STATUS(matmulOpPtr->infer_forward_algorithm(this->algorithmMap)); - } - } - -#ifdef _USE_MALI - if(this->schedule == MALI) this->infer_gclmem_descs(inputDescMap); -#endif - this->infer_tmp_memory_size(); - this->assign_tmp_tensor(); - //transform filter - for (auto op : this->ops) { - if (op->is_weight()) { - if (op->get_op_type() == OT_Conv) { - auto convOpPtr = dynamic_cast(op.get()); - CHECK_STATUS(convOpPtr->transform_filter()); - } else if (op->get_op_type() == OT_Deconvolution) { - auto convOpPtr = dynamic_cast(op.get()); - CHECK_STATUS(convOpPtr->transform_filter()); - } else if (op->get_op_type() == OT_FC) { - auto fcOpPtr = dynamic_cast(op.get()); - CHECK_STATUS(fcOpPtr->transform_filter()); - } else if (op->get_op_type() == OT_LSTM) { - auto lstmOpPtr = dynamic_cast(op.get()); - CHECK_STATUS(lstmOpPtr->transform_filter()); - } - } - } - this->infer_tmp_memory_size(); - temp->alloc(this->maxTmpElements); - this->assign_output_tensor(); -#ifdef _USE_MALI - if(this->schedule == MALI) CHECK_STATUS(gcl_finish(handle.get())); -#endif - } - - void reready(HashMap inputDescMap) - { - this->infer_output_tensors_size(inputDescMap); -#ifdef _USE_MALI - if(this->schedule == MALI) this->infer_gclmem_descs(inputDescMap); -#endif - this->infer_tmp_memory_size(); - temp->alloc(this->maxTmpElements); -#ifdef _USE_MALI - if(this->schedule == MALI) { - for(auto it : outputTensors) { - auto outTensor = it.second; - std::shared_ptr oclMem = outTensor->get_shared_ptr(); - U32 orgSize = oclMem->desc.byteSize; - U32 size = orgSize * 2; - oclMem->desc.byteSize = size; - oclMem->desc.use_map = true; - oclMem->desc.flags = CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; - } - CHECK_STATUS(gcl_clean_kernelVec(this->handle.get())); - Model::run_mali_prepare(true); - CHECK_STATUS(gcl_finish(this->handle.get())); - } -#endif - } - - /** - * @param inputTensorsName - * @param outputTensorsName - */ - EE mark_input_output(const ModelSpec* ms) - { - inputTensors.clear(); - for (I32 i = 0; i < ms->num_inputs; i++) { - std::string str = ms->input_names[i]; - auto it = tensorMap.find(str); - if (tensorMap.end() != it) { -#ifdef _USE_MALI - if(this->schedule == MALI) { - it->second->alloc();//alloc ocl gpu memory for inputTensor - std::shared_ptr tmpTensorCPU(new Tensor()); - tmpTensorCPU->set_desc(ms->input_dims[i]); - tmpTensorCPU->alloc(); - auto p = std::pair>(str, tmpTensorCPU); - inputTensorsHost.insert(p); - } -#endif - (*(it->second)).set_desc(ms->input_dims[i]); - inputTensors.insert(*it); - } else { - return NOT_MATCH; - } - } - outputTensors.clear(); - for (I32 i = 0; i < ms->num_outputs; i++) { - std::string str = ms->output_names[i]; - auto it = tensorMap.find(str); - - if (tensorMap.end() != it) { -#ifdef _USE_MALI - if(this->schedule == MALI) {//resize outputTensors to map - auto outTensor = it->second; - std::shared_ptr oclMem = outTensor->get_shared_ptr(); - U32 orgSize = oclMem->desc.byteSize; - U32 size = orgSize * 2; - oclMem->desc.use_map = true; - oclMem->desc.flags = CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; - outTensor->get_memory()->alloc(size); - } -#endif - outputTensors.insert(*it); - } else { - return NOT_MATCH; - } - } - -#ifdef _USE_MALI - if(this->schedule == MALI) { - U32 tmpBufSize = 0; - for(auto it = inputTensors.begin(); it != inputTensors.end(); it++) { - Tensor* inputTensor = it->second.get(); - TensorDesc desc = inputTensor->get_desc(); - GCLMem_t mem = inputTensor->get_val(); - U32 size = 0; 
- tensor_computing_set_input_infer_tmpBuf_size(mem, desc, &size, MALI); - tmpBufSize = (tmpBufSize < size) ? size : tmpBufSize; - } - - if(tmpBufSize > maxTmpElements) { - maxTmpElements = tmpBufSize; - temp->alloc(maxTmpElements); - } - } -#endif - return SUCCESS; - } - -#ifdef _USE_MALI - void mali_prepare() { - Model::run_mali_prepare(false); - CHECK_STATUS(gcl_finish(this->handle.get())); - } -#endif - - void copy_to_named_input(std::string inputName, U8* data) { - if(inputTensors.find(inputName) == inputTensors.end()) CHECK_STATUS(NOT_MATCH); - auto tensorPtr = this->inputTensors[inputName]; - TensorDesc desc = tensorPtr->get_desc(); -#ifdef _USE_MALI - if(this->schedule == MALI) { - OclMemory* mem = (OclMemory*) tensorPtr->get_memory(); - auto tempMem = std::static_pointer_cast(temp->get_shared_ptr()); - mem->set_tmpBuf(tempMem); - } -#endif - tensorPtr->set_val_by_copy(desc, data); - } - - void set_input_tensors_value(HashMap> modelTensorsInput) { - for(auto &modelTensorInput : modelTensorsInput) { - std::string inputName = modelTensorInput.first; - std::shared_ptr data = modelTensorInput.second; - if(inputTensors.find(inputName) == inputTensors.end()) CHECK_STATUS(NOT_MATCH); - auto tensorPtr = this->inputTensors[inputName]; -#ifdef _USE_MALI - if(this->schedule == MALI) { - TensorDesc desc = tensorPtr->get_desc(); - auto mem = (OclMemory*) tensorPtr->get_memory(); - auto tempMem = std::static_pointer_cast(temp->get_shared_ptr()); - mem->set_tmpBuf(tempMem); - tensorPtr->set_val_by_copy(desc, data.get()); - } else { -#endif - tensorPtr->set_shared_ptr(data); -#ifdef _USE_MALI - } -#endif - } - } - - HashMap> get_inputs() { -#ifdef _USE_MALI - if(this->schedule == MALI) return this->inputTensorsHost; -#endif - return this->inputTensors; - } - - HashMap> get_outputs() - { -#ifdef _USE_MALI - if(this->schedule == MALI) { - for(auto it = outputTensors.begin(); it != outputTensors.end(); it++) { - auto outputTensor = it->second; - auto host_desc = outputTensor->get_desc(); - auto mem = (OclMemory*)outputTensor->get_memory(); - mem->get_val_to_hostptr(host_desc, NULL, CL_TRUE); - } - CHECK_STATUS(gcl_finish(handle.get())); - } -#endif - return this->outputTensors; - } - Tensor get_tensor_by_name(std::string tensorName) { - if (this->tensorMap.find(tensorName) != this->tensorMap.end()) { -#ifdef _USE_MALI - if(this->schedule == MALI) { - if(this->outputTensors.find(tensorName) != this->outputTensors.end()) { - auto outputTensor = outputTensors[tensorName]; - auto host_desc = outputTensor->get_desc(); - auto mem = (OclMemory*)outputTensor->get_memory(); - mem->get_val_to_hostptr(host_desc, NULL, CL_TRUE); - } else { - CHECK_STATUS(NOT_SUPPORTED); - } - CHECK_STATUS(gcl_finish(handle.get())); - } -#endif - return *(this->tensorMap[tensorName].get()); - } else { - std::shared_ptr tensor(new Tensor()); - TensorDesc desc; - desc.dt = this->dt; - desc.nDims = 0; - tensor->set_desc(desc); - return *tensor.get(); - } - } - - void set_modelInputTensorNames(Vec modelInputTensorNames) { - this->modelInputTensorNames = modelInputTensorNames; - } - - Vec get_model_input_tensor_names() { - return this->modelInputTensorNames; - } - - Vec get_model_output_tensor_names() { - return this->modelOutputTensorNames; - } - - EE infer_output_tensors_size(HashMap inputDescMap) override - { - bool reassignMemory = false; - this->set_input_tensors_desc(inputDescMap); -#ifdef _DEBUG - const char* funcStr = "[DEBUG] infer_output_tensors_size()"; - std::cout << funcStr << std::endl; - for (auto iter: inputDescMap) { - 
std::cout << funcStr << " input: " << iter.first << " " << tensorDesc2Str(iter.second) << std::endl; - } -#endif - int opsNum = this->sortedOps.size(); - for (int i = 0; i < opsNum; i++) { - std::string opName = sortedOps[i]; - - auto op = this->operatorMap[opName]; -#ifdef _DEBUG - std::cout << funcStr << " op: " << opName << " type: " << OperatorTypeName()[op->get_op_type()] << std::endl; -#endif - Vec curOpInputTensorName = std::get<0>(this->operatorTensorMap[opName]); - Vec curOpOutputTensorName = std::get<1>(this->operatorTensorMap[opName]); - int curOpInNum = curOpInputTensorName.size(); - int curOpOutNum = curOpOutputTensorName.size(); - Vec inTensorDescs; - Vec outTensorDescs; - - for (int j = 0; j < curOpOutNum; j++) { - TensorDesc dummyTensorDesc; - outTensorDescs.push_back(dummyTensorDesc); - } - - Vec inTensors, outTensors; - for (std::string inputTensorName: curOpInputTensorName) { -#ifdef _DEBUG - std::cout << " inputTensorName: " << inputTensorName << " "; -#endif - inTensorDescs.push_back(this->tensorMap[inputTensorName]->get_desc()); - -#ifdef _DEBUG - std::cout << tensorDesc2Str(this->tensorMap[inputTensorName]->get_desc()); - std::cout << std::endl; -#endif - } -#ifdef _USE_MALI - if(this->schedule != MALI) { -#endif - for (int k = 0; k < curOpInNum; k++) { - U32 size = tensorNumBytes(inTensorDescs[k]); - I32 slot = op->tensorPos[k]; - if (slot == -1) { //These tensors will be standalone - continue; - } - if (slot >= (I32)this->storageSizes.size()) { - this->storageSizes.resize(slot+1, 0); - } - if (size > this->storageSizes[slot]) { - this->storageSizes[slot] = size; - reassignMemory = this->memoryAssigned; - } - } -#ifdef _USE_MALI - } -#endif - CHECK_STATUS(op->infer_output_tensors_size(inTensorDescs, &outTensorDescs)); - for (std::string inputTensorName: curOpInputTensorName) inTensors.push_back(*this->tensorMap[inputTensorName].get()); - - for (int k = 0; k < curOpOutNum; k++) { - std::string outputTensorName = curOpOutputTensorName[k]; -#ifdef _DEBUG - std::cout << " outputTensorName: " << outputTensorName << " "; -#endif - TensorDesc outputTensorDesc = outTensorDescs[k]; -#ifdef _DEBUG - std::cout << tensorDesc2Str(outputTensorDesc); - std::cout << std::endl; -#endif -#ifdef _USE_MALI - if(this->schedule != MALI) { -#endif - U32 size = tensorNumBytes(outputTensorDesc); - I32 slot = op->tensorPos[curOpInNum + k]; - if (slot != -1) { - if (slot >= (I32)this->storageSizes.size()) { - this->storageSizes.resize(slot+1, 0); - } - if (size > this->storageSizes[slot]) { - this->storageSizes[slot] = size; - reassignMemory = this->memoryAssigned; - } - } else { - if (this->memoryAssigned && size > tensorNumBytes(this->tensorMap[outputTensorName]->get_desc())) - reassignMemory = true; - } -#ifdef _USE_MALI - } -#endif - this->tensorMap[outputTensorName]->set_desc(outputTensorDesc); - outTensors.push_back(*this->tensorMap[outputTensorName].get()); - } - op->set_input_output_tensors(inTensors, outTensors); - } -#ifdef _DEBUG - U32 originalSize = 0; - U32 standaloneSize = 0; - for (auto tensor : this->tensorMap) { - originalSize += tensorNumBytes(tensor.second->get_desc()); - if (weightOpOutputNames.find(tensor.first) != weightOpOutputNames.end()) { - standaloneSize += tensorNumBytes(tensor.second->get_desc()); - } - } - std::cout << "Originally " << this->tensorMap.size() << " tensors, taking " << originalSize << " bytes.\n"; - - std::cout << "Storage reduced to " << storageSizes.size() << " reuse slots: \n"; - U32 totalSize = 0; - for (U32 size : storageSizes) { - std::cout 
<< size << " bytes, "; - totalSize += size; - } - std::cout << "\nIn total " << totalSize << " bytes.\n"; - - if (0 != standaloneSize) { - std::cout << "Another " << standaloneSize << " bytes are reserved for standalone tensors (e.g. loop topology).\n"; - } - - std::cout << "Reuse ratio is " << (F32)originalSize / (totalSize+standaloneSize) << std::endl; -#endif - if (reassignMemory) { - this->assign_output_tensor(); - } - return SUCCESS; - } - -#ifdef _USE_MALI - EE infer_gclmem_descs(HashMap inputDescMap) override - { -#ifdef _DEBUG - const char* funcStr = "[DEBUG] infer_gclmem_desc()"; - std::cout << funcStr << std::endl; -#endif - - for (auto iter: inputDescMap) { - U32 stride[3] = {0, 0, 0}; - U32 offset[3] = {0, 0, 0}; - GCLMemDesc gclTmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); - auto mem = (OclMemory*)(this->tensorMap[iter.first]->get_memory()); - mem->set_mem_desc(gclTmpDesc); - } - - int opsNum = this->sortedOps.size(); - for (int i = 0; i < opsNum; i++) { - std::string opName = sortedOps[i]; - auto op = this->operatorMap[opName]; - Vec curOpInputTensorName = std::get<0>(this->operatorTensorMap[opName]); - Vec curOpOutputTensorName = std::get<1>(this->operatorTensorMap[opName]); - Vec inGCLMemDescs; - Vec outGCLMemDescs; - U32 j; - - for (j = 0; j < curOpOutputTensorName.size(); j++) { - U32 stride[3] = {0, 0, 0}; - U32 offset[3] = {0, 0, 0}; - GCLMemDesc gclTmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); - outGCLMemDescs.push_back(gclTmpDesc); - } - - for (std::string inputTensorName: curOpInputTensorName) { - auto mem =(OclMemory*)(this->tensorMap[inputTensorName]->get_memory()); - auto desc = mem->get_mem_desc(); - inGCLMemDescs.push_back(desc); - } - CHECK_STATUS(op->infer_gclmem_desc(&inGCLMemDescs, &outGCLMemDescs)); - - j = 0; - for (std::string inputTensorName: curOpInputTensorName) { - auto tensorTmp = this->tensorMap[inputTensorName]; - auto mem = (OclMemory*)(tensorTmp->get_memory()); - mem->set_mem_desc(inGCLMemDescs[j]); - j++; - } - - j = 0; - for (std::string outputTensorName: curOpOutputTensorName) { - auto tensorTmp = this->tensorMap[outputTensorName]; - auto mem = (OclMemory*)(tensorTmp->get_memory()); - mem->set_mem_desc(outGCLMemDescs[j]); - j++; - } - } - - for (int i = 0; i < opsNum; i++) { - std::string opName = sortedOps[i]; - auto op = this->operatorMap[opName]; -#ifdef _DEBUG - std::cout << funcStr << " op: " << opName << " type " << op->get_op_type() << std::endl; -#endif - Vec curOpInputTensorName = std::get<0>(this->operatorTensorMap[opName]); - Vec curOpOutputTensorName = std::get<1>(this->operatorTensorMap[opName]); - Vec inTensors, outTensors; - for (std::string inputTensorName: curOpInputTensorName) { - auto tensorTmp = this->tensorMap[inputTensorName]; - inTensors.push_back(*tensorTmp.get()); -#ifdef _DEBUG - auto mem = (OclMemory*)(tensorTmp->get_memory()); - auto desc = mem->get_mem_desc(); - std::cout << " inputTensorName: " << inputTensorName << " "; - std::cout << gclMemDesc2Str(desc) << std::endl; -#endif - } - - for (std::string outputTensorName: curOpOutputTensorName) { - auto tensorTmp = this->tensorMap[outputTensorName]; - outTensors.push_back(*tensorTmp.get()); -#ifdef _DEBUG - auto mem = (OclMemory*)(tensorTmp->get_memory()); - auto desc = mem->get_mem_desc(); - std::cout << " outputTensorName: " << outputTensorName << " "; - std::cout << gclMemDesc2Str(desc) << std::endl; -#endif - } - op->set_input_output_tensors(inTensors, outTensors); - } - return SUCCESS; - } -#endif - - void assign_output_tensor() override 
-    {
-#ifdef _DEBUG
-        const char* funcStr = "[DEBUG] assign_output_tensor()";
-        std::cout << funcStr << std::endl;
-#endif
-
-        Vec<std::shared_ptr<U8>> storages;
-#ifdef _USE_MALI
-        if(this->schedule != MALI) {
-#endif
-            for (U32 i = 0; i < storageSizes.size(); i++) {
-                storages.push_back(std::shared_ptr<U8>((U8*)operator new(storageSizes[i])));
-            }
-#ifdef _USE_MALI
-        }
-#endif
-
-        for (std::string opName: sortedOps) {
-#ifdef _DEBUG
-            std::cout << funcStr << " op: " << opName << "\n input tensor names: ";
-#endif
-            U32 tensorIter = 0;
-            std::shared_ptr<Operator> op = this->operatorMap[opName];
-            Vec<Tensor> inTensors, outTensors;
-            Vec<std::string> inTensorNames = std::get<0>(this->operatorTensorMap[opName]);
-            Vec<std::string> outTensorNames = std::get<1>(this->operatorTensorMap[opName]);
-            for (std::string inName: inTensorNames) {
-#ifdef _DEBUG
-                std::cout << inName << " to Slot " << op->tensorPos[tensorIter] << ", ";
-#endif
-#ifdef _USE_MALI
-                if(this->schedule != MALI) {
-#endif
-                    if (op->tensorPos[tensorIter] != -1) {
-                        this->tensorMap[inName].get()->set_shared_ptr(storages[op->tensorPos[tensorIter]]);
-                    } else {
-                        if (this->weightOpOutputNames.find(inName) == this->weightOpOutputNames.end()) {
-                            this->tensorMap[inName].get()->alloc();
-                        }
-                    }
-                    tensorIter++;
-#ifdef _USE_MALI
-                }
-#endif
-                inTensors.push_back(*(this->tensorMap[inName].get()));
-            }
-#ifdef _DEBUG
-            std::cout << "\n output tensor names: ";
-#endif
-
-            for (std::string outName: outTensorNames) {
-#ifdef _DEBUG
-                std::cout << outName << " to Slot " << op->tensorPos[tensorIter] << ", ";
-#endif
-#ifdef _USE_MALI
-                if(this->schedule == MALI) {
-                    this->tensorMap[outName].get()->alloc();
-                } else {
-#endif
-                    if (this->weightOpOutputNames.find(outName) == this->weightOpOutputNames.end()) {
-                        if (op->tensorPos[tensorIter] != -1) {
-                            this->tensorMap[outName].get()->set_shared_ptr(storages[op->tensorPos[tensorIter]]);
-                        } else {
-                            this->tensorMap[outName].get()->alloc();
-                        }
-                    }
-                    tensorIter++;
-#ifdef _USE_MALI
-                }
-#endif
-                outTensors.push_back(*(this->tensorMap[outName].get()));
-            }
-#ifdef _DEBUG
-            std::cout << std::endl;
-#endif
-            op->set_input_output_tensors(inTensors, outTensors);
-        }
-        this->memoryAssigned = true;
-    }
-
-private:
-    void add(std::shared_ptr<Operator> op, Vec<std::string> inputTensorsName, Vec<std::string> outputTensorsName)
-    {
-        std::string name = op->get_name();
-        this->operatorMap[name] = op;
-
-        std::tuple<Vec<std::string>, Vec<std::string>> in_outTensors(std::make_tuple(inputTensorsName, outputTensorsName));
-        if (this->operatorTensorMap.find(name) == this->operatorTensorMap.end()) {
-            this->operatorTensorMap[name] = in_outTensors;
-        }
-        else {
-            std::cout << "[ERROR] duplicate tensor name: " << name << std::endl;
-            exit(1);
-        }
-        this->operatorTensorMap[name] = in_outTensors;
-
-        for (std::string input : inputTensorsName) {
-            std::shared_ptr<Tensor> tmp;
-#ifdef _USE_MALI
-            if(this->schedule == MALI) {
-                tmp = std::shared_ptr<Tensor>(new Tensor(this->handle));
-            } else {
-#endif
-                tmp = std::shared_ptr<Tensor>(new Tensor());
-#ifdef _USE_MALI
-            }
-#endif
-            auto p = std::pair<std::string, std::shared_ptr<Tensor>>(input, tmp);
-            this->tensorMap.insert(p);
-        }
-
-        for (std::string output : outputTensorsName) {
-            std::shared_ptr<Tensor> tmp;
-#ifdef _USE_MALI
-            if(this->schedule == MALI) {
-                tmp = std::shared_ptr<Tensor>(new Tensor(this->handle));
-            } else {
-#endif
-                tmp = std::shared_ptr<Tensor>(new Tensor());
-#ifdef _USE_MALI
-            }
-#endif
-            auto p = std::pair<std::string, std::shared_ptr<Tensor>>(output, tmp);
-            this->tensorMap.insert(p);
-        }
-    }
-
-    void set_input_tensors_desc(HashMap<std::string, TensorDesc> inputDescMap) {
-        for (auto iter: inputDescMap) {
-            TensorDesc desc = iter.second;
-#ifdef _USE_MALI
-            if(this->schedule == MALI) {
-                if(desc.df == DF_NCHW) desc.df = DF_NCHW_ORG_MALI;
-            }
-#endif
-            (this->tensorMap[iter.first].get())->set_desc(desc);
-        }
-    }
-
-
-    void infer_tmp_memory_size() override
-    {
-        this->tmpElements.clear();
-        this->maxTmpElements = 0;
-
-        for (auto op: this->ops) {
-            auto len = op->infer_tmp_memory_size();
-            this->tmpElements.push_back(len);
-            if (len > (this->maxTmpElements)) {
-                this->maxTmpElements = len;
-            }
-        }
-    }
-
-    void assign_tmp_tensor() override
-    {
-        // design for serial , if parallel running should redesign
-#ifdef _USE_MALI
-        if(this->schedule == MALI) {
-            this->temp = std::shared_ptr<Memory_>(new OclMemory(this->handle));
-        } else {
-#endif
-            this->temp = std::shared_ptr<Memory_>(new CpuMemory());
-#ifdef _USE_MALI
-        }
-#endif
-        temp->alloc(this->maxTmpElements);
-        for (auto op: this->ops) {
-            op->set_tmp_memory(this->maxTmpElements, temp);
-        }
-    }
-
-
-private:
-    HashMap<std::string, std::shared_ptr<Tensor>> tensorMap;
-    HashMap<std::string, std::shared_ptr<Operator>> operatorMap;
-    HashMap<std::string, std::tuple<Vec<std::string>, Vec<std::string>>> operatorTensorMap;
-
-    std::set<std::string> weightOpOutputNames;
-
-    //input & output tensors
-    HashMap<std::string, std::shared_ptr<Tensor>> inputTensors;
-    HashMap<std::string, std::shared_ptr<Tensor>> outputTensors;
-#ifdef _USE_MALI
-    HashMap<std::string, std::shared_ptr<Tensor>> inputTensorsHost;
-#endif
-    Vec<U32> storageSizes;
-
-    Vec<std::string> sortedOps;
-
-    U32 maxTmpElements;
-    Vec<U32> tmpElements;
-    std::shared_ptr<Memory_> temp;
-
-    Vec<TensorDesc> modelInDims;
-
-    Vec<std::string> modelInputTensorNames;
-    Vec<std::string> modelOutputTensorNames;
-};
-#endif
diff --git a/inference/include/concat.hpp b/inference/include/concat.hpp
deleted file mode 100644
index 7a11c800..00000000
--- a/inference/include/concat.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _CONCAT_H
-#define _CONCAT_H
-
-#include "operator.hpp"
-#include "tensor_computing.h"
-
-class Concat: public Operator {
-public:
-    Concat(int axis)
-    {
-        this->axis = axis;
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_Concat;
-    }
-
-
-protected:
-    I32 axis;
-};
-
-#endif //_CONCAT_H
diff --git a/inference/include/constant.hpp b/inference/include/constant.hpp
deleted file mode 100644
index 5abb427c..00000000
--- a/inference/include/constant.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
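The tensorPos / storageSizes bookkeeping in the deleted CNN class above implements slot-based buffer reuse: every tensor is either standalone (slot -1) or mapped to a reuse slot, infer_output_tensors_size() grows each slot to the largest tensor ever assigned to it, and assign_output_tensor() then shares one allocation per slot among all of its tensors. A condensed sketch of that policy; SlotPlanner and its members are illustrative names, not the original API:

    #include <cstdint>
    #include <memory>
    #include <vector>

    // Grow each reuse slot to its largest requester, then allocate once per
    // slot; every tensor assigned to a slot shares that single buffer.
    struct SlotPlanner {
        std::vector<size_t> slotSizes;

        void request(int slot, size_t bytes) {
            if (slot < 0) return;  // slot -1: standalone tensor, allocated separately
            if (slot >= (int)slotSizes.size()) slotSizes.resize(slot + 1, 0);
            if (bytes > slotSizes[slot]) slotSizes[slot] = bytes;
        }

        std::vector<std::shared_ptr<uint8_t>> allocate() const {
            std::vector<std::shared_ptr<uint8_t>> storages;
            for (size_t bytes : slotSizes) {
                storages.emplace_back(new uint8_t[bytes], std::default_delete<uint8_t[]>());
            }
            return storages;
        }
    };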
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _CONSTANT_H
-#define _CONSTANT_H
-#include "operator.hpp"
-
-class Constant: public Operator {
-public:
-    Constant(TensorDesc constDesc, void* data)
-    {
-        this->constDesc = constDesc;
-        this->data = data;
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_Constant;
-    }
-
-    void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-        Tensor outputTensor = this->outputTensors[0];
-
-        U8* outputPtr = outputTensor.get_val().get();
-        memcpy(outputPtr, data, tensorNumBytes(constDesc));
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    EE infer_output_tensors_size(Vec<TensorDesc>* outDims) override
-    {
-        (*outDims)[0] = constDesc;
-        return SUCCESS;
-    }
-
-private:
-    TensorDesc constDesc;
-    void* data;
-};
-
-#endif //_CONSTANT__H
diff --git a/inference/include/convolution.hpp b/inference/include/convolution.hpp
deleted file mode 100644
index 986f77ee..00000000
--- a/inference/include/convolution.hpp
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
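The Convolution base class deleted below packs stride, padding and dilation into a ConvolutionDesc; the spatial extent that the later *_infer_output_size calls are expected to derive from those fields follows the standard dilated-convolution formula. A hedged reference for one axis (the library may round differently in edge cases):

    // Output extent of one spatial axis for a strided, dilated convolution.
    static inline U32 conv_out_dim(U32 in, U32 padBegin, U32 padEnd,
                                   U32 kernel, U32 stride, U32 dilate)
    {
        U32 effKernel = (kernel - 1) * dilate + 1;  // dilated kernel extent
        return (in + padBegin + padEnd - effKernel) / stride + 1;
    }
    // e.g. conv_out_dim(224, 1, 1, 3, 2, 1) == 112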
-
-
-#ifndef _CONVOLUTION_H
-#define _CONVOLUTION_H
-
-#include "weight_operator.hpp"
-#include "tensor_computing.h"
-#include "op_type.h"
-
-class Convolution: public WeightOperator {
-public:
-    Convolution(DataType dt, U32 nf,
-        U32 ksizeH, U32 ksizeW, U32 kstrideH, U32 kstrideW,
-        U32 kpaddingT, U32 kpaddingB, U32 kpaddingL, U32 kpaddingR,
-        ActivationDesc dwActivationDesc, ActivationDesc pwActivationDesc,
-        ConvolutionMode convolutionType, U32 group, U32 dilateH, U32 dilateW)
-    {
-        this->dt = dt;
-        this->numFilters = nf;
-        this->kernelSizeH = ksizeH;
-        this->kernelSizeW = ksizeW;
-        this->strideH = kstrideH;
-        this->strideW = kstrideW;
-        this->paddingT = kpaddingT;
-        this->paddingB = kpaddingB;
-        this->paddingL = kpaddingL;
-        this->paddingR = kpaddingR;
-        this->dwActivationDesc = dwActivationDesc;
-        this->pwActivationDesc = pwActivationDesc;
-        this->convolutionType = convolutionType;
-        this->group = group;
-        this->dilateH = dilateH;
-        this->dilateW = dilateW;
-        this->hasBias = false;
-        this->pwAlg = CONVOLUTION_ALGORITHM_NULL;
-        this->dwAlg = DEPTHWISE_CONVOLUTION_ALGORITHM_NULL;
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_Conv;
-    }
-
-    ConvolutionDesc create_convDesc(U32 strideH, U32 strideW, U32 paddingT, U32 paddingB, U32 paddingL, U32 paddingR, U32 dilateH, U32 dilateW)
-    {
-        ConvolutionDesc convDesc;
-        convDesc.stride_h = strideH;
-        convDesc.stride_w = strideW;
-        convDesc.padding_top = paddingT;
-        convDesc.padding_bottom = paddingB;
-        convDesc.padding_left = paddingL;
-        convDesc.padding_right = paddingR;
-        convDesc.dilatedRate_h = dilateH;
-        convDesc.dilatedRate_w = dilateW;
-        return convDesc;
-    }
-    virtual EE init_weight_bias_from_model(U8** modelPtr) = 0;
-    virtual EE infer_forward_algorithm(HashMap<std::string, std::string> &algorithmMap) = 0;
-    virtual EE transform_filter() = 0;
-public:
-    U32 numFilters;
-    U32 numChannels;
-    U32 kernelSizeH;
-    U32 kernelSizeW;
-    U32 strideH;
-    U32 strideW;
-    U32 paddingT;
-    U32 paddingB;
-    U32 paddingL;
-    U32 paddingR;
-    ConvolutionMode convolutionType;
-    U32 group;
-    U32 dilateH;
-    U32 dilateW;
-
-    ActivationDesc dwActivationDesc;
-    ActivationDesc pwActivationDesc;
-
-    ConvolutionForwardAlgorithm pwAlg;
-    DepthwiseConvolutionForwardAlgorithm dwAlg;
-#ifdef _USE_FP16
-    std::shared_ptr<F16> scales;
-#endif
-};
-
-#endif //_CONVOLUTION_H
diff --git a/inference/include/copy.hpp b/inference/include/copy.hpp
deleted file mode 100644
index 8c88e12e..00000000
--- a/inference/include/copy.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _COPY_H
-#define _COPY_H
-
-#include "operator.hpp"
-
-class Copy: public Operator
-{
-public:
-    /**
-    @param mode
-    */
-    Copy(DataType dt, I32 *srcDimsPtr, I32 *dstDimsPtr, I32 len)
-    {
-        this->dt = dt;
-        this->srcDims = Vec<I32>(3);
-        memcpy(this->srcDims.data(), srcDimsPtr, 3 * sizeof(I32));
-        this->dstDims = Vec<I32>(3);
-        memcpy(this->dstDims.data(), dstDimsPtr, 3 * sizeof(I32));
-        this->length = len;
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_Copy;
-    }
-
-    void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-        Tensor srcTensor = this->inputTensors[0];
-        TensorDesc srcDesc = srcTensor.get_desc();
-        Tensor dstTensor = this->inputTensors[1];
-        TensorDesc dstDesc = dstTensor.get_desc();
-
-        U32 batch = srcDesc.dims[srcDesc.nDims - 1];
-        U32 copyLength = (this->length >= 0) ? this->length : tensorNumElements(srcDesc) / batch;
-        U32 srcBatchStride = (this->srcDims[0] >= 0) ? this->srcDims[0] : tensorNumElements(srcDesc) / batch;
-        U32 srcStride = (this->srcDims[0] >= 0) ? this->srcDims[1] : tensorNumElements(srcDesc) / batch;
-        U32 dstBatchStride = (this->dstDims[0] >= 0) ? this->dstDims[0] : tensorNumElements(dstDesc) / batch;
-        U32 dstStride = (this->dstDims[0] >= 0) ? this->dstDims[1] : tensorNumElements(dstDesc) / batch;
-        for (U32 i = 0; i < batch; i++) {
-            U32 srcBlockIndex = 0;
-            if (this->inputTensors.size() > 2)
-                srcBlockIndex = ((U32 *)(this->inputTensors[2].get_val()))[i];
-            U32 dstBlockIndex = 0;
-            if (this->inputTensors.size() > 3)
-                dstBlockIndex = ((U32 *)(this->inputTensors[3].get_val()))[i];
-            U32 srcIndex = i * srcBatchStride + srcBlockIndex * srcStride + this->srcDims[2];
-            U32 dstIndex = i * dstBatchStride + dstBlockIndex * dstStride + this->dstDims[2];
-            memcpy((U8*)(dstTensor.get_val()) + bytesOf(srcDesc.dt) * dstIndex,
-                (U8*)(srcTensor.get_val()) + bytesOf(srcDesc.dt) * srcIndex,
-                copyLength * bytesOf(srcDesc.dt));
-        }
-
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        UNUSED(inDims);
-
-        (*outDims)[0].dt = this->dt;
-        (*outDims)[0].df = getTensorDefaultDataFormat(0);
-        (*outDims)[0].nDims = 0;
-        return SUCCESS;
-    }
-
-private:
-    Vec<I32> srcDims;
-    Vec<I32> dstDims;
-    I32 length;
-};
-
-#endif //_COPY_H
diff --git a/inference/include/cpu/activation_cpu.hpp b/inference/include/cpu/activation_cpu.hpp
deleted file mode 100644
index e9970041..00000000
--- a/inference/include/cpu/activation_cpu.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
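The address arithmetic in the deleted Copy::run() above reduces, for batch element i, to a fixed batch stride plus an optional per-batch block index (read from a third or fourth input tensor) times an inner stride, plus a constant offset. A worked instance with made-up numbers:

    // Assume srcDims = {srcBatchStride, srcStride, srcOffset} = {1024, 256, 8},
    // dstDims = {2048, 512, 0}, block indices 3 (src) and 1 (dst), batch i = 2:
    U32 srcIndex = 2 * 1024 + 3 * 256 + 8;  // = 2824 elements into the source
    U32 dstIndex = 2 * 2048 + 1 * 512 + 0;  // = 4608 elements into the destination
    // copyLength elements are then memcpy'd between those offsets, scaled by bytesOf(srcDesc.dt).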
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _ACTIVATION_CPU_H
-#define _ACTIVATION_CPU_H
-
-#include "operator.hpp"
-#include "tensor_computing.h"
-#include "tensor_desc.h"
-#include "activation.hpp"
-
-class ActivationCPU: public Activation
-{
-public:
-    /**
-    @param mode
-    */
-    ActivationCPU(ActivationDesc activationDesc): Activation(activationDesc) {}
-
-    virtual void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-
-        Tensor inputTensor = this->inputTensors[0];
-        TensorDesc inputDesc = inputTensor.get_desc();
-        Tensor outputTensor = this->outputTensors[0];
-        TensorDesc outputDesc = outputTensor.get_desc();
-        U8* inPtr = inputTensor.get_val();
-        U8* outPtr = outputTensor.get_val();
-
-        CHECK_STATUS(activation(inputDesc, inPtr, this->activationDesc, outputDesc, outPtr, this->schedule));
-        outputTensor.set_scale(inputTensor.get_scale());
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    virtual EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        CHECK_STATUS(activation_infer_output_size(inDims[0], &((*outDims)[0]), this->schedule));
-        return SUCCESS;
-    }
-};
-
-#endif //_ACTIVATION_CPU_H
diff --git a/inference/include/cpu/clip_cpu.hpp b/inference/include/cpu/clip_cpu.hpp
deleted file mode 100644
index 345f4b91..00000000
--- a/inference/include/cpu/clip_cpu.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
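One detail in the deleted ActivationCPU::run() above: the input's quantization scale is copied to the output unchanged. That pass-through is only sound for scale-preserving activations; ReLU is the canonical case, since with real = q / scale, max(real, 0) equals max(q, 0) / scale:

    // ReLU applied directly to a quantized value keeps the scale unchanged.
    static inline INT8 relu_q(INT8 q) { return q > 0 ? q : 0; }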
-
-
-#ifndef _CLIP_CPU_H
-#define _CLIP_CPU_H
-
-#include "operator.hpp"
-#include "tensor_computing.h"
-#include "clip.hpp"
-
-class ClipCPU: public Clip
-{
-public:
-    /**
-    @param mode
-    */
-    ClipCPU(DataType dt, F32 clipMinScalar, F32 clipMaxScalar) : Clip(dt, clipMinScalar, clipMaxScalar) { }
-
-    virtual void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-
-        Tensor inputTensor = this->inputTensors[0];
-        TensorDesc inputDesc = inputTensor.get_desc();
-        Tensor outputTensor = this->outputTensors[0];
-        TensorDesc outputDesc = outputTensor.get_desc();
-
-        CHECK_STATUS(clip(&(this->clipMinScalar), &(this->clipMaxScalar),
-            inputDesc, inputTensor.get_val(),
-            outputDesc, outputTensor.get_val(), this->schedule));
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    virtual EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        CHECK_STATUS(clip_infer_output_size(inDims[0], &((*outDims)[0]), this->schedule));
-        return SUCCESS;
-    }
-
-};
-
-#endif //_CLIP_CPU_H
diff --git a/inference/include/cpu/concat_cpu.hpp b/inference/include/cpu/concat_cpu.hpp
deleted file mode 100644
index 6a866519..00000000
--- a/inference/include/cpu/concat_cpu.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
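The deleted ClipCPU::run() above forwards both scalar bounds to the clip kernel; per element the operation is a plain clamp. A scalar reference, not the vectorized library code:

    // Per-element meaning of clip(&min, &max, ...): clamp x into [min, max].
    static inline F32 clip_scalar(F32 x, F32 lo, F32 hi)
    {
        return (x < lo) ? lo : ((x > hi) ? hi : x);
    }
    // clip_scalar(3.5f, 0.0f, 1.0f) == 1.0f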
-
-
-#ifndef _CONCAT_CPU_H
-#define _CONCAT_CPU_H
-
-#include "operator.hpp"
-#include "tensor_computing.h"
-#include "concat.hpp"
-
-class ConcatCPU: public Concat {
-public:
-    ConcatCPU(int axis) : Concat(axis) {}
-
-    virtual void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-
-        Vec<TensorDesc> inputDesc;
-        Vec<void*> inputPtr;
-        Vec<F32> inputScales;
-
-        for (Tensor tensorIn: this->inputTensors) {
-            inputDesc.push_back(tensorIn.get_desc());
-            inputPtr.push_back((void*)tensorIn.get_val());
-            inputScales.push_back(tensorIn.get_scale());
-        }
-        auto outputDesc = this->outputTensors[0].get_desc();
-        auto outputPtr = this->outputTensors[0].get_val();
-        F32 outputScale = 1.0;
-
-        CHECK_STATUS(concat(inputDesc, inputPtr, inputScales.data(), outputDesc, outputPtr, &outputScale, this->axis, this->schedule));
-
-        if (DT_I8 == outputDesc.dt) {
-            this->outputTensors[0].set_scale(outputScale);
-        }
-
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    virtual EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        CHECK_STATUS(concat_infer_output_size(inDims, &((*outDims)[0]), this->axis, this->schedule));
-        return SUCCESS;
-    }
-};
-
-#endif //_CONCAT_CPU_H
diff --git a/inference/include/cpu/convolution_cpu.hpp b/inference/include/cpu/convolution_cpu.hpp
deleted file mode 100644
index 1d44fbc4..00000000
--- a/inference/include/cpu/convolution_cpu.hpp
+++ /dev/null
@@ -1,527 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
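When the deleted ConcatCPU above runs on DT_I8 tensors, the concat kernel receives one scale per input and emits a single output scale, so inputs quantized at different scales must be re-expressed in the common output scale. A minimal sketch of that per-element requantization, assuming the q = real * scale convention implied by set_scale/get_scale (layout handling omitted):

    // Re-express an int8 value quantized at scaleIn in terms of scaleOut:
    // q_out = round(q_in * scaleOut / scaleIn), saturated to the int8 range.
    static inline INT8 requantize_int8(INT8 q, F32 scaleIn, F32 scaleOut)
    {
        F32 r = q * (scaleOut / scaleIn);
        I32 v = (I32)(r + (r >= 0 ? 0.5f : -0.5f));
        if (v > 127) v = 127;
        if (v < -128) v = -128;
        return (INT8)v;
    }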
- - -#ifndef _CONVELTWISEPOOLING_CPU_H -#define _CONVELTWISEPOOLING_CPU_H -#include "weight_operator.hpp" -#include "tensor_computing.h" -#include "tensor_desc.h" -#include "op_type.h" -#include "convolution.hpp" -#include "pooling.hpp" -#include "eltwise.hpp" - -class ConvolutionCPU: public Convolution { -public: - ConvolutionCPU(DataType dt, U32 nf, U32 ksizeH, U32 ksizeW, U32 kstrideH, U32 kstrideW, U32 kpaddingT, U32 kpaddingB, U32 kpaddingL, U32 kpaddingR, - ActivationDesc dwActivationDesc, ActivationDesc pwActivationDesc, ConvolutionMode convolutionType, U32 group, U32 dilateH, U32 dilateW) : - Convolution(dt, nf, ksizeH, ksizeW, kstrideH, kstrideW, kpaddingT, kpaddingB, kpaddingL, kpaddingR, - dwActivationDesc, pwActivationDesc, convolutionType, group, dilateH, dilateW) {} - - virtual EE init_weight_bias_from_model(U8** modelPtr)override - { - auto curOpWs = this->get_weightspec_ptr(); - DataType filterDt = curOpWs.mdt; // weight data type may not be the same as input and output - if (modelPtr != nullptr) { - filterDt = this->dt; - } - DataType dtNoQ = (this->dt == DT_F16_8Q) ? DT_F16 : this->dt; - U32 isBNN = 0; - if (filterDt == DT_BIN01 || filterDt == DT_BIN11) { - isBNN = 1; - } - DataFormat filterDf; - U32 vectorLen = 0; // Vector must contain bias. BNN has one more scale vector. - switch (this->convolutionType) { - case Convolution_Pointwise: { - filterDf = DF_NCHW; - vectorLen = this->numFilters; // bias length - if (isBNN == 1) { - this->dt = dtNoQ; // BNN convolution should not be quantized further - vectorLen *= 2; // Scale has the same vector length as bias, so double the length - } - break; - } - case Convolution_Depthwise: { - filterDf = DF_NCHW; - vectorLen = this->numFilters; - break; - } - case Convolution_Depthwise_Pointwise: { - filterDf = DF_CHW_NC; - vectorLen = this->numFilters + this->numChannels; - break; - } - case Convolution_Dilation: { - filterDf = DF_NCHW; - vectorLen = this->numFilters; - break; - } - default: - return NOT_SUPPORTED; - } - TensorDesc filterTensorDesc = tensor4df(filterDt, filterDf, - this->numFilters, this->numChannels, - this->kernelSizeH, this->kernelSizeW); - TensorDesc vectorTensorDesc = tensor1d(dtNoQ, vectorLen); // bias data type should be the same as input and output - - std::shared_ptr modelWeightTensor(new Tensor()); - std::shared_ptr modelVectorTensor(new Tensor()); - modelWeightTensor->set_desc(filterTensorDesc); - modelVectorTensor->set_desc(vectorTensorDesc); - - if (modelPtr != nullptr) { - modelWeightTensor->alloc(); - memcpy((U8*)modelWeightTensor->get_val(), *modelPtr, tensorNumBytes(filterTensorDesc)); - *modelPtr += tensorNumBytes(filterTensorDesc); - } else { - modelWeightTensor->set_shared_ptr(std::shared_ptr(curOpWs.weight)); - } - - U8* biasVal = NULL; - if(modelPtr != nullptr) { - if(this->hasBias){ - biasVal = *modelPtr; - *modelPtr += tensorNumBytes(vectorTensorDesc); - } - } else { - if(this->hasBias) biasVal = curOpWs.vec; - } - - if (biasVal) { - modelVectorTensor->set_shared_ptr(std::shared_ptr(biasVal)); - } else { - modelVectorTensor->alloc(); - if (isBNN == 1) { -#ifdef _USE_FP16 - F16 *vec = (F16*)modelVectorTensor->get_val(); - for (U32 i = 0; i < this->numFilters; i++) { // first half is scale - *vec = 1.0; - vec++; - } - memset(vec, 0, tensorNumBytes(vectorTensorDesc) / 2); // second half is bias -#endif - } else { - memset((U8*)modelVectorTensor->get_val(), 0, tensorNumBytes(vectorTensorDesc)); - } - } - - this->weightTensors.push_back(*modelWeightTensor.get()); - 
this->biasTensors.push_back(*modelVectorTensor.get()); - return SUCCESS; - } - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - - Tensor filterTensor = this->weightTensors[0]; - TensorDesc filterDesc = filterTensor.get_desc(); - - ConvolutionDesc convDesc = Convolution::create_convDesc(this->strideH, this->strideW, this->paddingT, this->paddingB, - this->paddingL, this->paddingR, this->dilateH, this->dilateW); - - TensorDesc scaleDesc = filterDesc; // Dummy initialization - U8 *scalePtr = nullptr; - - Tensor biasTensor = this->biasTensors[0]; - TensorDesc biasDesc = biasTensor.get_desc(); - U8 *biasPtr = biasTensor.get_val(); - - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - - switch (this->convolutionType) { - case Convolution_Pointwise: { - if (filterDesc.dt == DT_BIN01 || filterDesc.dt == DT_BIN11) { -#ifdef _USE_FP16 - U32 vecLen = tensorNumElements(biasDesc) / 2; - - scaleDesc = tensor1d(biasDesc.dt, vecLen); - biasDesc = tensor1d(biasDesc.dt, vecLen); - scalePtr = biasTensor.get_val(); - biasPtr = scalePtr + vecLen * bytesOf(DT_F16); -#endif - } else if (DT_F16_8Q == this->dt) { -#ifdef _USE_INT8 - F16 *ptr = this->scales.get(); - scalePtr = (U8*)ptr; - - ptr[0] = inputTensor.get_scale(); - if (featureScale.size() > 0 && featureScale[0][0] > 0) { - ptr[0] = featureScale[0][0]; - } else if (DT_F16 == inputDesc.dt) { - ptr[0] = -1; - } - - if (featureScale.size() > 0 && (featureScale.back())[0] != -2) { - ptr[1] = (featureScale.back())[0]; - } else { - ptr[1] = -1; - } -#endif - } - CHECK_STATUS(convolution(inputDesc, inputTensor.get_val(), - filterDesc, filterTensor.get_val(), - convDesc, this->pwAlg, - scaleDesc, scalePtr, - biasDesc, biasPtr, - this->lenOfTemp, this->temp->get_val(), - outputDesc, (void*)outputTensor.get_val(), - this->pwActivationDesc, this->schedule)); -#ifdef _USE_INT8 - if (DT_I8 == outputDesc.dt) { - F16 *ptr = (F16*)scalePtr; - outputTensor.set_scale(ptr[1]); - } -#endif - break; - } - case Convolution_Depthwise: { - CHECK_STATUS(depthwise_convolution(inputDesc, inputTensor.get_val(), - filterDesc, filterTensor.get_val(), - convDesc, this->dwAlg, - biasDesc, biasPtr, - this->lenOfTemp, this->temp->get_val(), - outputDesc, outputTensor.get_val(), - this->dwActivationDesc, this->pwActivationDesc, - this->schedule)); - break; - } - case Convolution_Depthwise_Pointwise: { - CHECK_STATUS(depthwise_convolution(inputDesc, inputTensor.get_val(), - filterDesc, filterTensor.get_val(), - convDesc, this->dwAlg, - biasDesc, biasPtr, - this->lenOfTemp, this->temp->get_val(), - outputDesc, outputTensor.get_val(), - this->dwActivationDesc, this->pwActivationDesc, - this->schedule)); - break; - } - case Convolution_Dilation: { - CHECK_STATUS(convolution(inputDesc, inputTensor.get_val(), - filterDesc, filterTensor.get_val(), - convDesc, this->pwAlg, - scaleDesc, scalePtr, - biasDesc, biasPtr, - this->lenOfTemp, this->temp->get_val(), - outputDesc, (void*)outputTensor.get_val(), - this->pwActivationDesc, this->schedule)); - break; - } - default: - std::cerr << "[ERROR] unsupported convolution type " << this->convolutionType << std::endl; - exit(1); - } - - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_forward_algorithm(HashMap &algorithmMap) override - { - TensorDesc inputDesc = (this->inputTensors[0]).get_desc(); - TensorDesc filterDesc = (this->weightTensors[0]).get_desc(); - - 
ConvolutionPolicy policy = CONVOLUTION_FASTEST; - ConvolutionDesc convDesc = Convolution::create_convDesc(this->strideH, this->strideW, this->paddingT, this->paddingB, - this->paddingL, this->paddingR, this->dilateH, this->dilateW); - - DataType targetType = filterDesc.dt; - switch (this->convolutionType) { - case Convolution_Pointwise: { - if (this->dt == DT_F16_8Q) { - targetType = DT_I8; - } - if (algorithmMap.find(this->name) != algorithmMap.end()) { - I32 algo; - Operator::getAlgorithmInfoFromMap(algorithmMap, this->name, &algo, 1); - this->pwAlg = (ConvolutionForwardAlgorithm)algo; - } else { - CHECK_STATUS(convolution_infer_forward_algorithm(inputDesc, filterDesc, - this->outputTensors[0].get_desc(), - convDesc, policy, &(this->pwAlg), targetType, this->pwActivationDesc, this->schedule)); - I32 algo = this->pwAlg; - Operator::setAlgorithmInfoToMap(algorithmMap, this->name, &algo, 1); - } - break; - } - case Convolution_Depthwise: { - if (algorithmMap.find(this->name) != algorithmMap.end()) { - I32 algo; - Operator::getAlgorithmInfoFromMap(algorithmMap, this->name, &algo, 1); - this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo; - } else { - CHECK_STATUS(depthwise_convolution_infer_forward_algorithm(inputDesc, filterDesc, - this->outputTensors[0].get_desc(), - convDesc, policy, &(this->dwAlg), - targetType, this->dwActivationDesc, this->pwActivationDesc, this->schedule)); - I32 algo = this->dwAlg; - Operator::setAlgorithmInfoToMap(algorithmMap, this->name, &algo, 1); - } - break; - } - case Convolution_Depthwise_Pointwise: { - if (algorithmMap.find(this->name) != algorithmMap.end()) { - I32 algo; - Operator::getAlgorithmInfoFromMap(algorithmMap, this->name, &algo, 1); - this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo; - } else { - CHECK_STATUS(depthwise_convolution_infer_forward_algorithm(inputDesc, filterDesc, - this->outputTensors[0].get_desc(), - convDesc, policy, &(this->dwAlg), - targetType, this->dwActivationDesc, this->pwActivationDesc, this->schedule)); - I32 algo = this->dwAlg; - Operator::setAlgorithmInfoToMap(algorithmMap, this->name, &algo, 1); - } - break; - } - case Convolution_Dilation: { - if (algorithmMap.find(this->name) != algorithmMap.end()) { - I32 algo; - Operator::getAlgorithmInfoFromMap(algorithmMap, this->name, &algo, 1); - this->pwAlg = (ConvolutionForwardAlgorithm)algo; - } else { - CHECK_STATUS(convolution_infer_forward_algorithm(inputDesc, filterDesc, - this->outputTensors[0].get_desc(), - convDesc, policy, &(this->pwAlg), targetType, this->pwActivationDesc, this->schedule)); - I32 algo = this->pwAlg; - Operator::setAlgorithmInfoToMap(algorithmMap, this->name, &algo, 1); - } - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - } - return SUCCESS; - } - - virtual EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - TensorDesc inDim = inDims[0]; - DataType idt; - DataFormat idf; - U32 in, ic, ih, iw; - CHECK_STATUS(tensor4dGet(inDim, &idt, &idf, &in, &ic, &ih, &iw)); - this->numChannels = ic; - - TensorDesc filterDim = tensor4df(this->dt, DF_NCHW, this->numFilters, this->numChannels, this->kernelSizeH, - this->kernelSizeW); - - if (Convolution_Depthwise_Pointwise == this->convolutionType) { - filterDim.df = DF_CHW_NC; - } - - ConvolutionDesc convDesc = Convolution::create_convDesc(this->strideH, this->strideW, this->paddingT, this->paddingB, - this->paddingL, this->paddingR, this->dilateH, this->dilateW); - - DataType targetType = this->dt; - if (DT_F16_8Q == this->dt && Convolution_Pointwise == this->convolutionType) { - 
targetType = DT_I8; - } - - U32 outBytes = 0; - switch (this->convolutionType) { - case Convolution_Pointwise: { - CHECK_STATUS(convolution_infer_output_size(inDim, filterDim, convDesc, &((*outDims)[0]), targetType, &outBytes, this->schedule)); - break; - } - case Convolution_Depthwise: { - CHECK_STATUS(depthwise_convolution_infer_output_size(inDim, filterDim, convDesc, &((*outDims)[0]), targetType, &outBytes, this->schedule)); - break; - } - case Convolution_Depthwise_Pointwise: { - CHECK_STATUS(depthwise_convolution_infer_output_size(inDim, filterDim, convDesc, &((*outDims)[0]), targetType, &outBytes, this->schedule)); - break; - } - case Convolution_Dilation: { - CHECK_STATUS(convolution_infer_output_size(inDim, filterDim, convDesc, &((*outDims)[0]), targetType, &outBytes, this->schedule)); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - } - if (DT_F16_8Q == this->dt && featureScale.size() > 0 && -2 == (featureScale.back())[0]) { - (*outDims)[0].dt = DT_F16; - } - return SUCCESS; - } - - virtual U32 infer_tmp_memory_size() override - { - TensorDesc inputDesc = (this->inputTensors[0]).get_desc(); - TensorDesc filterDesc = (this->weightTensors[0]).get_desc(); - if (DT_F16_8Q == filterDesc.dt) { - filterDesc.dt = DT_I8; - } - TensorDesc outputDesc = (this->outputTensors[0]).get_desc(); - ConvolutionDesc convDesc = Convolution::create_convDesc(this->strideH, this->strideW, this->paddingT, this->paddingB, - this->paddingL, this->paddingR, this->dilateH, this->dilateW); - - U32 bytes = 0; - switch (this->convolutionType) { - case Convolution_Pointwise: { - CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputDesc, filterDesc, outputDesc, convDesc, this->pwAlg, &bytes, this->schedule)); - break; - } - case Convolution_Depthwise: { - CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes(inputDesc, filterDesc, outputDesc, convDesc, this->dwAlg, &bytes, this->schedule)); - break; - } - case Convolution_Depthwise_Pointwise: { - CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes(inputDesc, filterDesc, outputDesc, convDesc, this->dwAlg, &bytes, this->schedule)); - break; - } - case Convolution_Dilation: { - CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputDesc, filterDesc, outputDesc, convDesc, this->pwAlg, &bytes, this->schedule)); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - } - return bytes; - } - - virtual U32 infer_wtm_memory_size() override - { - TensorDesc filterDesc = (this->weightTensors[0]).get_desc(); - U32 bytes = 0; - switch (this->convolutionType) { - case Convolution_Pointwise: { - CHECK_STATUS(convolution_transform_filter_bytes(filterDesc, this->pwAlg, &bytes, this->schedule)); - break; - } - case Convolution_Depthwise: { - CHECK_STATUS(depthwise_convolution_transform_filter_bytes(filterDesc, this->dwAlg, &bytes, this->schedule)); - break; - } - case Convolution_Depthwise_Pointwise: { - CHECK_STATUS(depthwise_convolution_transform_filter_bytes(filterDesc, this->dwAlg, &bytes, this->schedule)); - break; - } - case Convolution_Dilation: { - CHECK_STATUS(convolution_transform_filter_bytes(filterDesc, this->pwAlg, &bytes, this->schedule)); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - } - return bytes; - } - - virtual EE transform_filter() override - { - Tensor filterTensor = this->weightTensors[0]; - TensorDesc filterDesc = filterTensor.get_desc(); - U8* weightPtr = filterTensor.get_val(); - this->wtm = std::shared_ptr(new Tensor()); - - TensorDesc wtmDesc; - if (DT_F16_8Q == this->dt && Convolution_Pointwise == this->convolutionType && 
CONVOLUTION_ALGORITHM_WINOGRAD == this->pwAlg) { // int8 winograd -#ifdef _USE_INT8 - U32 ftBytes; - CHECK_STATUS(convolution_transform_filter_bytes(filterDesc, this->pwAlg, &ftBytes, this->schedule)); - - TensorDesc tFilterDesc; - F16 *tFilter = (F16*)malloc(ftBytes); - if (nullptr == tFilter) { - std::cerr << "[ERROR] allocation failed for filter transform in int8 winograd" << std::endl; - CHECK_STATUS(ALLOC_FAILED); - } - - filterDesc.dt = DT_F16_8Q; // To label as int8 - CHECK_STATUS(convolution_transform_filter(filterDesc, weightPtr, this->pwAlg, &tFilterDesc, tFilter, this->temp.get(), this->schedule)); - - U32 ftmBytes = ftBytes / bytesOf(DT_F16); - std::shared_ptr sPtr((U8*) operator new(ftmBytes)); - auto cpuMem = new CpuMemory(); - cpuMem->set_shared_ptr_caster(sPtr); - Memory_* mem = (Memory_*)(cpuMem); - std::shared_ptr memsPtr(mem); - this->set_wtm_memory(ftmBytes, memsPtr); - - std::shared_ptr fsp((F16*) operator new(38*bytesOf(DT_F16))); - this->scales = fsp; - CHECK_STATUS(quantize_tensor(tFilterDesc, tFilter, &wtmDesc, this->get_wtm()->get_val(), this->scales.get()+2)); - free(tFilter); - } else if (DT_F16_8Q == this->dt && Convolution_Pointwise == this->convolutionType) { // int8 tilegemm - TensorDesc qFilterDesc; - INT8 *qFilter = (INT8*)malloc(tensorNumElements(filterDesc) * bytesOf(DT_I8)); - if (nullptr == qFilter) { - std::cerr << "[ERROR] allocation failed for filter quantization" << std::endl; - CHECK_STATUS(ALLOC_FAILED); - } - std::shared_ptr fsp((F16*) operator new(3*bytesOf(DT_F16))); - this->scales = fsp; - this->scales.get()[2] = -1; - CHECK_STATUS(quantize_tensor(filterDesc, weightPtr, &qFilterDesc, qFilter, this->scales.get()+2)); - - U32 ftmBytes; - CHECK_STATUS(convolution_transform_filter_bytes(qFilterDesc, this->pwAlg, &ftmBytes, this->schedule)); - - std::shared_ptr sPtr((U8*) operator new(ftmBytes)); - auto cpuMem = new CpuMemory(); - cpuMem->set_shared_ptr_caster(sPtr); - Memory_* mem = (Memory_*)(cpuMem); - std::shared_ptr memsPtr(mem); - this->set_wtm_memory(ftmBytes, memsPtr); - - // trans filter - CHECK_STATUS(convolution_transform_filter(qFilterDesc, qFilter, this->pwAlg, - &wtmDesc, this->get_wtm()->get_val(), this->temp.get(), this->schedule)); - - free(qFilter); -#endif - } else { // All other cases - auto wtmBytes = this->infer_wtm_memory_size(); - std::shared_ptr sPtr((U8*) operator new(wtmBytes)); - auto cpuMem = new CpuMemory(); - cpuMem->set_shared_ptr_caster(sPtr); - Memory_* mem = (Memory_*)(cpuMem); - std::shared_ptr memsPtr(mem); - this->set_wtm_memory(wtmBytes, memsPtr); - - switch (this->convolutionType) { - case Convolution_Pointwise: { - CHECK_STATUS(convolution_transform_filter(filterDesc, weightPtr, this->pwAlg, &wtmDesc, this->get_wtm()->get_val(), this->temp.get(), this->schedule)); - break; - } - case Convolution_Depthwise: { - CHECK_STATUS(depthwise_convolution_transform_filter(filterDesc, weightPtr, this->dwAlg, &wtmDesc, this->get_wtm()->get_val(), this->schedule)); - break; - } - case Convolution_Depthwise_Pointwise: { - CHECK_STATUS(depthwise_convolution_transform_filter(filterDesc, weightPtr, this->dwAlg, &wtmDesc, this->get_wtm()->get_val(), this->schedule)); - break; - } - case Convolution_Dilation: { - CHECK_STATUS(convolution_transform_filter(filterDesc, weightPtr, this->pwAlg, &wtmDesc, this->get_wtm()->get_val(), this->temp.get(), this->schedule)); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - } - } - - this->get_wtm()->set_desc(wtmDesc); - this->weightTensors[0] = *this->get_wtm(); - return SUCCESS; - 
} -public: -}; - -#endif //_CONVELTWISEPOOLING_H diff --git a/inference/include/cpu/eltwise_cpu.hpp b/inference/include/cpu/eltwise_cpu.hpp deleted file mode 100644 index 53aa92d5..00000000 --- a/inference/include/cpu/eltwise_cpu.hpp +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -/** - * Project deploy - */ - - -#ifndef _ELTWISE_CPU_H -#define _ELTWISE_CPU_H - -#include "operator.hpp" -#include "eltwise.hpp" - -class EltwiseCPU: public Eltwise { -public: - EltwiseCPU(EltwiseMode eltMode, I32 coeffSize, F32* coeffValues):Eltwise(eltMode, coeffSize, coeffValues){} - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - - Vec inputDesc; - Vec inputPtr; -#ifdef _USE_INT8 - F16 *inD = (F16*)this->temp->get_val(); -#endif - for (Tensor tensorIn: this->inputTensors) { - TensorDesc desc = tensorIn.get_desc(); - U8 *ptr = tensorIn.get_val(); -#ifdef _USE_INT8 - if (DT_I8 == desc.dt) { - INT8 *inQ = (INT8*)ptr; - F32 inputScale = tensorIn.get_scale(); - dequantize_int8_to_fp16(tensorNumElements(desc), inQ, inputScale, inD); - desc.dt = DT_F16; - ptr = (U8*)inD; - inD += tensorNumElements(desc); - } -#endif - inputDesc.push_back(desc); - inputPtr.push_back((void*)ptr); - } - auto outputDesc = this->outputTensors[0].get_desc(); - auto outputPtr = this->outputTensors[0].get_val(); - - if (this->eltMode == ELTWISE_PROD && inputDesc.size() == 2 && - (inputDesc[1].nDims == 2 || (inputDesc[1].nDims == 4 && inputDesc[1].dims[0] == 1 && inputDesc[1].dims[1] == 1)) && - tensorNumElements(inputDesc[0]) != tensorNumElements(inputDesc[1]) ) { - CHECK_STATUS(scale(this->inputTensors[0].get_desc(), this->inputTensors[0].get_val(), - 1, this->inputTensors[1].get_val(), nullptr, - outputDesc, outputPtr, this->schedule)); - } else { - CHECK_STATUS(eltwise(inputDesc, inputPtr, outputDesc, outputPtr, this->eltMode, this->schedule)); - } - - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_output_tensors_size(VecinDims, Vec* outDims) override - { - CHECK_STATUS(eltwise_infer_output_size(inDims, &((*outDims)[0]), this->schedule)); - if (DT_I8 == (*outDims)[0].dt) { - (*outDims)[0].dt = DT_F16; - this->lenOfTemp = 0; - for (auto desc : inDims) { - if (DT_I8 == desc.dt) { - this->lenOfTemp += tensorNumElements(desc) * bytesOf(DT_F16); - } - } - } - return SUCCESS; - } - - U32 infer_tmp_memory_size() override - { - return this->lenOfTemp; - } -}; - -#endif //_ELTWISE_CPU_H diff --git 
a/inference/include/cpu/embedding_cpu.hpp b/inference/include/cpu/embedding_cpu.hpp deleted file mode 100644 index 834f1b09..00000000 --- a/inference/include/cpu/embedding_cpu.hpp +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _EMBEDDING_CPU_H -#define _EMBEDDING_CPU_H -#include "weight_operator.hpp" -#include "tensor_computing.h" -#include "embedding.hpp" - -class EmbeddingCPU: public Embedding { -public: - EmbeddingCPU(DataType dt, U32 inputDim, U32 numOutput, bool transpose) : - Embedding(dt, inputDim, numOutput, transpose) { } - - void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor inputTensor = this->inputTensors[0]; - Tensor weightTensor; - if (this->weightTensors.size() > 0) - weightTensor = this->weightTensors[0]; - else - weightTensor = this->inputTensors[1]; - Tensor outputTensor = this->outputTensors[0]; - - U32* inputPtr = (U32*)(inputTensor.get_val()); - U8* weightPtr = weightTensor.get_val(); - U8* outputPtr = outputTensor.get_val(); - - TensorDesc inputDesc = inputTensor.get_desc(); - U32 len = tensorNumElements(inputDesc); - U32 elementBytes = bytesOf(this->dt); - U32 wordEmbeddingCPUBytes = elementBytes * this->numOutput; - U32 transposeStride = elementBytes * this->inputDim; - for (U32 i = 0; i < len; i++) { - U32 wordIndex = inputPtr[i]; - U8* dest = outputPtr; - if (transpose) { - U8* src = weightPtr + wordIndex * elementBytes; - for (U32 j = 0; j < this->numOutput; j++) { - memcpy(dest, src, elementBytes); - src += transposeStride; - dest += elementBytes; - } - } else { - U8* src = weightPtr + wordIndex * wordEmbeddingCPUBytes; - memcpy(dest, src, wordEmbeddingCPUBytes); - } - outputPtr += wordEmbeddingCPUBytes; - } - - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - TensorDesc inDim = inDims[0]; - DataType dt; - DataFormat df; - U32 batch, step; - CHECK_REQUIREMENT(tensorIs2d(inDim)); - CHECK_STATUS(tensor2dfGet(inDim, &dt, &df, &batch, &step)); - - (*outDims)[0] = tensor3df(this->dt, DF_MTK, batch, step, this->numOutput); - return SUCCESS; - } - - EE init_weight_bias_from_model(U8** modelPtr) override - { - TensorDesc weightDesc; - if (transpose) - weightDesc = tensor2df(this->dt, DF_TRANSPOSE, this->numOutput, this->inputDim); - else - weightDesc = tensor2df(this->dt, DF_NORMAL, this->inputDim, this->numOutput); - U32 weightBytes = tensorNumBytes(weightDesc); - - 
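
The int8 paths above (the filter quantization inside transform_filter and the input dequantization in EltwiseCPU::run) all revolve around a single per-tensor symmetric scale. The body of quantize_tensor is not part of this diff, so the following is only a minimal standalone sketch of the convention as it appears from the call sites, with plain float standing in for F16 and all names hypothetical: a scale of -1 asks for calibration from the data, a positive scale (e.g. a pinned featureScale entry) is honored as-is, and real value ≈ quantized / scale.

```cpp
// Sketch of per-tensor symmetric int8 quantization as implied by the call
// sites above. quantizeSymmetric/dequantize are hypothetical names.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Pick a scale so the largest magnitude maps to 127, unless the caller
// pins a positive scale (mirroring the featureScale override).
static float quantizeSymmetric(const std::vector<float> &in,
                               std::vector<int8_t> &out, float pinnedScale)
{
    float maxAbs = 1e-6f;
    for (float v : in) {
        maxAbs = std::max(maxAbs, std::fabs(v));
    }
    float scale = (pinnedScale > 0) ? pinnedScale : 127.0f / maxAbs;
    out.resize(in.size());
    for (size_t i = 0; i < in.size(); i++) {
        float q = std::round(in[i] * scale);
        out[i] = (int8_t)std::min(127.0f, std::max(-127.0f, q));
    }
    return scale;  // stored alongside the tensor, like Tensor::set_scale
}

// Recover approximate real values by dividing the stored scale back out.
static void dequantize(const std::vector<int8_t> &in, float scale,
                       std::vector<float> &out)
{
    out.resize(in.size());
    for (size_t i = 0; i < in.size(); i++) {
        out[i] = in[i] / scale;
    }
}

int main()
{
    std::vector<float> x = {0.5f, -1.25f, 2.0f, -0.125f};
    std::vector<int8_t> q;
    float scale = quantizeSymmetric(x, q, -1);  // -1: calibrate from data
    std::vector<float> xr;
    dequantize(q, scale, xr);
    printf("scale=%f first=%f\n", scale, xr[0]);
    return 0;
}
```
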
std::shared_ptr modelWeightTensor(new Tensor()); - modelWeightTensor->set_desc(weightDesc); - - bool set_ptr = false; - if(modelPtr != nullptr){ - modelWeightTensor->alloc(); - memcpy((U8*)modelWeightTensor->get_val(), *modelPtr, weightBytes); - *modelPtr += weightBytes; - set_ptr = true; - } else { - auto curOpWs = this->get_weightspec_ptr(); - if (curOpWs.weight != nullptr) { - modelWeightTensor->set_shared_ptr(std::shared_ptr(curOpWs.weight)); - set_ptr = true; - } - } - if(set_ptr) this->weightTensors.push_back(*modelWeightTensor.get()); - return SUCCESS; - } -}; - -#endif //_EMBEDDING_CPU_H diff --git a/inference/include/cpu/factory_cpu.hpp b/inference/include/cpu/factory_cpu.hpp deleted file mode 100644 index 76b1f4c9..00000000 --- a/inference/include/cpu/factory_cpu.hpp +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
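
For reference, the gather that EmbeddingCPU::run performs above, reduced to a self-contained sketch: one row of numOutput values is emitted per token id, via a strided element-wise copy when the weight matrix is stored transposed and a single memcpy per row otherwise. embeddingGather and the float element type are illustrative assumptions; the real code copies raw bytes so any DataType works.

```cpp
// Minimal re-creation of the lookup loop in EmbeddingCPU::run().
#include <cstdio>
#include <cstring>
#include <vector>

static void embeddingGather(const std::vector<unsigned> &tokens,
                            const std::vector<float> &weight,
                            unsigned inputDim, unsigned numOutput,
                            bool transpose, std::vector<float> &out)
{
    out.resize(tokens.size() * numOutput);
    float *dst = out.data();
    for (unsigned idx : tokens) {
        if (transpose) {
            // Weight is numOutput x inputDim: a token's embedding is a
            // column, so copy with a stride of inputDim (transposeStride).
            for (unsigned j = 0; j < numOutput; j++) {
                dst[j] = weight[j * inputDim + idx];
            }
        } else {
            // Weight is inputDim x numOutput: rows are contiguous.
            memcpy(dst, &weight[idx * numOutput], numOutput * sizeof(float));
        }
        dst += numOutput;
    }
}

int main()
{
    std::vector<float> w = {0, 1, 2, 3, 4, 5};  // 3 tokens x 2 dims
    std::vector<unsigned> tokens = {2, 0};
    std::vector<float> out;
    embeddingGather(tokens, w, 3, 2, false, out);
    printf("%.0f %.0f %.0f %.0f\n", out[0], out[1], out[2], out[3]);  // 4 5 0 1
    return 0;
}
```
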
- - -#ifndef _FACTORY_CPU_H -#define _FACTORY_CPU_H - -#include "operator.hpp" -#include "deconvolution.hpp" -#include "lstm.hpp" -#include "lstmcell.hpp" -#include "resize.hpp" -#include "attention.hpp" -#include "unsqueeze.hpp" -#include "reduction.hpp" -#include "argmax.hpp" -#include "check.hpp" -#include "repeat.hpp" -#include "preallocated_memory.hpp" -#include "shared_weight.hpp" -#include "copy.hpp" -#include "jump.hpp" -#include "cpu/pooling_cpu.hpp" -#include "cpu/convolution_cpu.hpp" -#include "cpu/eltwise_cpu.hpp" -#include "cpu/softmax_cpu.hpp" -#include "cpu/activation_cpu.hpp" -#include "cpu/fully_connected_cpu.hpp" -#include "cpu/scale_cpu.hpp" -#include "cpu/concat_cpu.hpp" -#include "cpu/clip_cpu.hpp" -#include "cpu/squeeze_cpu.hpp" -#include "cpu/reshape_cpu.hpp" -#include "cpu/embedding_cpu.hpp" -#include "cpu/layer_norm_cpu.hpp" -#include "cpu/matmul_cpu.hpp" -#include "cpu/multiply_cpu.hpp" -#include "cpu/transpose_cpu.hpp" -#include "cpu/slice_cpu.hpp" -#include "attention_mask.hpp" -#include "relative_position_embedding.hpp" -#include "relative_shift.hpp" -#include "padding.hpp" -#include "detection_output.hpp" -#include "prior_box.hpp" - -class FactoryCPU: public Factory { -public: - virtual std::shared_ptr createConvolution(DataType dt, U32 nf, - U32 ksizeH, U32 ksizeW, U32 kstrideH, U32 kstrideW, - U32 kpaddingT, U32 kpaddingB, U32 kpaddingL, U32 kpaddingR, - ActivationDesc dwActivationDesc, ActivationDesc pwActivationDesc, - ConvolutionMode convolutionType, U32 group, U32 dilateH, U32 dilateW) override { - auto cep = (Convolution*)(new ConvolutionCPU(dt, nf, ksizeH, ksizeW, kstrideH, kstrideW, - kpaddingT, kpaddingB, kpaddingL, kpaddingR, - dwActivationDesc, pwActivationDesc, - convolutionType, group, dilateH, dilateW)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createDeconvolution(DataType dt, U32 nf, - U32 ksizeH, U32 ksizeW, U32 kstrideH, U32 kstrideW, - U32 kpaddingT, U32 kpaddingB, U32 kpaddingL, U32 kpaddingR, - ActivationDesc dwActivationDesc, ActivationDesc pwActivationDesc, - ConvolutionMode convolutionType, U32 group, U32 dilateH, U32 dilateW) override { - auto cep = new Deconvolution(dt, nf, ksizeH, ksizeW, kstrideH, kstrideW, - kpaddingT, kpaddingB, kpaddingL, kpaddingR, - dwActivationDesc, pwActivationDesc, - convolutionType, group, dilateH, dilateW); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createPooling(PoolingMode mode, U32 ksH, U32 ksW, U32 strideH, U32 strideW, - U32 paddingT, U32 paddingB, U32 paddingL, U32 paddingR, RoundMode rm) override { - auto cep = (Pooling*)(new PoolingCPU(mode, ksH, ksW, strideH, strideW, paddingT, paddingB, paddingL, paddingR, rm)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createFullyConnected(DataType dt, U32 numInput, U32 numOutput, - U32 numSlice, I32* slicePoint) override { - auto cep = (FullyConnected*)(new FullyConnectedCPU(dt, numInput, numOutput, numSlice, slicePoint)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createSoftmax(DataType dt, int axis) override { - auto cep = new SoftmaxCPU(dt, axis); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createConcat(int axis) override { - auto cep = (Concat*)(new ConcatCPU(axis)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createActivation(ActivationDesc activeDesc) override { - auto cep = (Activation*) new ActivationCPU(activeDesc); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createEltwise(EltwiseMode eltMode, I32 coeffSize, F32* 
coeffValues) override { - auto cep = (Eltwise*)new EltwiseCPU(eltMode, coeffSize, coeffValues); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createScale(DataType dt, int axis, int numChannels, int numSource) override { - auto cep = (Scale*)(new ScaleCPU(dt, axis, numChannels, numSource)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createLSTM(DataType dt, U32 numOutput, U32 numProjection, - F32 zoneoutCell, F32 zoneoutOutput, bool biDirection) override - { - auto cep = new LSTM(dt, numOutput, numProjection, zoneoutCell, zoneoutOutput, biDirection); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createLSTMCell(DataType dt, U32 numOutput, U32 numProjection, - F32 zoneoutCell, F32 zoneoutOutput, bool biDirection) override - { - auto cep = new LSTMCell(dt, numOutput, numProjection, zoneoutCell, zoneoutOutput, biDirection); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createLSTM(DataType dt, U32 numOutput, U32 numProjection, - F32 zoneoutCell, F32 zoneoutOutput, I32 steps) override - { - if (steps == -2) - return FactoryCPU::createLSTM(dt, numOutput, numProjection, zoneoutCell, zoneoutOutput, true); - if (steps >= 0) - return FactoryCPU::createLSTM(dt, numOutput, numProjection, zoneoutCell, zoneoutOutput, false); - else - return FactoryCPU::createLSTMCell(dt, numOutput, numProjection, zoneoutCell, zoneoutOutput, false); - } - - virtual std::shared_ptr createEmbedding(DataType dt, U32 inputDim, U32 numOutput, bool transpose) override { - auto cep = (Embedding*)(new EmbeddingCPU(dt, inputDim, numOutput, transpose)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createMultiply(DataType dt, F32 scale, F32 bias) override { - auto cep = (Multiply*)(new MultiplyCPU(dt, scale, bias)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createMatMul(DataType dt, bool transposeA, bool transposeB) override { - auto cep = (MatMul*)(new MatMulCPU(dt, transposeA, transposeB)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createLayerNorm(DataType dt, U32 weightNum) override { - auto cep = (LayerNorm*) (new LayerNormCPU(dt, weightNum)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createReshape(DataType dt, I32* shapeDims, - I32 shapeSize, I32 axis, I32 numAxes) override { - auto cep = (Reshape*)(new ReshapeCPU(dt, shapeDims, shapeSize, axis, numAxes)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createResize(DataType paramDT, void* paramPtr) override { - auto cep = new Resize(paramDT, paramPtr); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createSlice(DataType dt, I32 axis, I32* slicePoints, U32 sliceSize) override { - auto cep = (Slice*)(new SliceCPU(dt, axis, slicePoints, sliceSize)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createTranspose(DataType dt, U32* transDims, U32 transSize) override { - auto cep = (Transpose*)new TransposeCPU(dt, transDims, transSize); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createAttention(DataType dt, - U32 numHeads, U32 fromSequenceLength, U32 toSequenceLength) override { - auto cep = new Attention(dt, numHeads, fromSequenceLength, toSequenceLength); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createClip(DataType dt, F32 clipMinScalar, F32 clipMaxScalar) override { - auto cep = (Clip*)(new ClipCPU(dt, clipMinScalar, clipMaxScalar)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createSqueeze(DataType dt, I32 axis, I32 *dims, I32 
dimSize) override { - auto cep = (Squeeze*)(new SqueezeCPU(dt, axis, dims, dimSize)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createUnsqueeze(DataType dt, I32 axis, I32 *dims, I32 dimSize) override { - auto cep = new Unsqueeze(dt, axis, dims, dimSize); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createReduction(DataType dt, I32 axis, bool keepDim, ReductionMode reductionMode, float coeff) override { - auto cep = new Reduction(dt, axis, keepDim, reductionMode, coeff); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createArgMax(DataType dt, I32 axis) override { - auto cep = new ArgMax(dt, axis); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createCopy(DataType dt, I32 *srcDims, I32 *dstDims, I32 length) override { - auto cep = new Copy(dt, srcDims, dstDims, length); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createCheck(DataType dt, CheckMode checkMode) override { - auto cep = new Check(dt, checkMode); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createRepeat(DataType dt, I32 loops, I32 axis, - I32 jumpOperatorIndex, I32 currentOperatorIndex) override { - auto cep = new Repeat(dt, loops, axis, jumpOperatorIndex, currentOperatorIndex); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createBilateralSliceApply(U32 coefficiency_len, bool has_offset, - BilateralSliceApplyMode mode) override { - OP_UNSUP(3, coefficiency_len, has_offset, mode); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createPreAllocatedMemory(DataType dt, TensorDesc desc) override { - auto cep = new PreAllocatedMemory(dt, desc); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createSharedWeight(DataType dt, TensorDesc desc) override { - auto cep = new SharedWeight(dt, desc); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createJump(DataType dt, - I32 jumpOperatorIndex, I32 currentOperatorIndex) override { - auto cep = new Jump(dt, jumpOperatorIndex, currentOperatorIndex); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createSpace2Depth(DataType dt) override { - OP_UNSUP(1, dt); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createDepth2Space(DataType dt) override { - OP_UNSUP(1, dt); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createAttentionMask(DataType dt, I32 attentionLength, - bool sameLength, float mask) override { - auto cep = new AttentionMask(dt, attentionLength, sameLength, mask); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createRelativePositionEmbedding(DataType dt, U32 inputDim, - U32 numOutput, bool transpose, I32 axis) override { - auto cep = new RelativePositionEmbedding(dt, inputDim, numOutput, transpose, axis); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createRelativeShift(DataType dt, I32 axis, - I32 shiftLength) override { - auto cep = new RelativeShift(dt, axis, shiftLength); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createPadding(DataType dt, PadDesc padDesc) override { - auto cep = new Padding(dt, padDesc); - return std::shared_ptr(cep); - } - virtual std::shared_ptr createPriorBox(DataType dt, PriorBoxDesc priorboxDesc) override { - auto cep = new PriorBox(dt, priorboxDesc); - return std::shared_ptr(cep); - } - virtual std::shared_ptr createDetectionOutput(DataType dt, DetectionOutputDesc detectionoutputDesc) override { - auto cep = new DetectionOutput(dt, detectionoutputDesc); - return std::shared_ptr(cep); - } -}; 
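
FactoryCPU above is one concrete realization of the abstract Factory: graph-construction code only ever holds a Factory, so a GPU/OpenCL factory with the same virtual interface can be substituted without touching call sites. Note also the steps convention in the three-way createLSTM overload: -2 selects a bidirectional LSTM, >= 0 a whole-sequence LSTM, anything else an LSTMCell. Below is a toy version of the pattern with deliberately simplified stand-in types, not the real bolt classes.

```cpp
// Shape of the factory indirection used above, reduced to a toy example.
#include <cstdio>
#include <memory>

struct Operator {
    virtual ~Operator() = default;
    virtual void run() = 0;
};

struct Factory {
    virtual ~Factory() = default;
    virtual std::shared_ptr<Operator> createSoftmax(int axis) = 0;
};

struct SoftmaxCPU : Operator {
    explicit SoftmaxCPU(int axis) : axis(axis) {}
    void run() override { printf("softmax on CPU, axis %d\n", axis); }
    int axis;
};

struct FactoryCPU : Factory {
    std::shared_ptr<Operator> createSoftmax(int axis) override {
        return std::make_shared<SoftmaxCPU>(axis);
    }
};

int main()
{
    // Call sites see only Factory, so another backend's factory with the
    // same interface could be swapped in here.
    std::unique_ptr<Factory> factory(new FactoryCPU());
    auto op = factory->createSoftmax(-1);
    op->run();
    return 0;
}
```
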
-#endif //_FACTORY_CPU_H diff --git a/inference/include/cpu/fully_connected_cpu.hpp b/inference/include/cpu/fully_connected_cpu.hpp deleted file mode 100644 index 9037d9c1..00000000 --- a/inference/include/cpu/fully_connected_cpu.hpp +++ /dev/null @@ -1,481 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -/** - * Project deploy - */ -#ifndef _FCELTWISE_CPU_H -#define _FCELTWISE_CPU_H - -#include "weight_operator.hpp" -#include "tensor_computing.h" -#include "fully_connected.hpp" -#include "blas-enhance.h" -#include - -class FullyConnectedCPU: public FullyConnected { -public: - FullyConnectedCPU(DataType dt, U32 numInput, U32 numOutput, - U32 numSlice, I32* slicePoint): - FullyConnected(dt, numInput, numOutput, numSlice, slicePoint) { } - - virtual EE init_weight_bias_from_model(U8** modelPtr) override - { - DataType dtNoQ = (DT_F16_8Q == this->dt) ? 
DT_F16 : this->dt; - TensorDesc weightDesc = tensor2df(dtNoQ, DF_NORMAL, this->numOutput, this->numInput); - TensorDesc biasDesc = tensor1d(dtNoQ, this->numOutput); - - std::shared_ptr modelWeightTensor(new Tensor()); - std::shared_ptr modelBiasTensor(new Tensor()); - modelWeightTensor->set_desc(weightDesc); - modelBiasTensor->set_desc(biasDesc); - - auto curOpWs = this->get_weightspec_ptr(); - if(modelPtr != nullptr){ - modelWeightTensor->alloc(); - memcpy((U8*)modelWeightTensor->get_val(), *modelPtr, tensorNumBytes(weightDesc)); - *modelPtr += tensorNumBytes(weightDesc); - } else { - modelWeightTensor->set_shared_ptr(std::shared_ptr(curOpWs.weight)); - } - - U8* biasVal = nullptr; - if (modelPtr != nullptr) { - if (this->hasBias) { - biasVal = *modelPtr; - *modelPtr += tensorNumBytes(biasDesc); - } - } else if (this->hasBias) { - biasVal = curOpWs.vec; - } - - if (biasVal) { - modelBiasTensor->set_shared_ptr(std::shared_ptr(biasVal)); - } else { - modelBiasTensor->alloc(); - memset((U8*)modelBiasTensor->get_val(), 0, tensorNumBytes(biasDesc)); - } - - this->weightTensors.push_back(*modelWeightTensor.get()); - this->biasTensors.push_back(*modelBiasTensor.get()); - return SUCCESS; - } - - TensorDesc desc_process(TensorDesc inDim) - { - TensorDesc inputDesc; - DataType dt; - DataFormat df; - U32 in, ic, ih, iw; - switch (inDim.nDims) { - case 2: { - CHECK_STATUS(tensor2dGet(inDim, &dt, &in, &(this->numInput))); - inputDesc = inDim; - break; - } - case 3: { - CHECK_STATUS(tensor3dGet(inDim, &dt, &df, &in, &ih, &iw)); - this->numInput = iw; - inputDesc = tensor2df(dt, DF_NORMAL, in*ih, iw); - break; - } - case 4: { - CHECK_STATUS(tensor4dGet(inDim, &dt, &df, &in, &ic, &ih, &iw)); - this->numInput = ic*ih*iw; - inputDesc = inDim; - break; - } - default: - break; - } - return inputDesc; - } - - TensorDesc desc_process_reverse(TensorDesc inDim, TensorDesc outDim) - { - TensorDesc outDesc; - DataType dt; - DataFormat df; - U32 in, ih, iw; - switch (inDim.nDims) { - case 2: { - outDesc = outDim; - break; - } - case 3: { - CHECK_STATUS(tensor3dGet(inDim, &dt, &df, &in, &ih, &iw)); - outDesc = tensor3df(dt, df, in, ih, this->numOutput); - break; - } - case 4: { - outDesc = outDim; - break; - } - default: - break; - } - return outDesc; - } - - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = desc_process(inputTensor.get_desc()); - - Tensor weightTensor = this->weightTensors[0]; - TensorDesc weightDesc = weightTensor.get_desc(); - - Tensor biasTensor = this->biasTensors[0]; - TensorDesc biasDesc = biasTensor.get_desc(); - U8 *bias = biasTensor.get_val(); - - U8 *tmp = (U8*)this->temp->get_val(); - - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - outputDesc.dims[0] = this->numOutput; - U32 numRow = outputDesc.dims[1]; - outputDesc = desc_process(outputDesc); - U8 *fcOutput; - if (1 == numSlice) { - fcOutput = outputTensor.get_val(); - } else { - fcOutput = tmp; - if (DT_I8 == weightDesc.dt) { - tmp += tensorNumElements(outputDesc) * bytesOf(DT_I32); - } else { - tmp += tensorNumBytes(outputDesc); - } - } - - if (DT_I8 == weightDesc.dt) { -#ifdef _USE_INT8 - U8 *inputPtr = inputTensor.get_val(); - F32 scaleI = 1; - if (DT_F16 == inputDesc.dt) { - F16 *inD = (F16*)inputPtr; - INT8 *inQ = (INT8*)tmp; - F16 scale = -1; - if (featureScale.size() > 1 && featureScale[0][0] > 0) { - scale = featureScale[0][0]; - } - quantize_tensor(inputDesc, inD, &inputDesc, inQ, 
&scale); - scaleI = scale; - inputPtr = (U8*)tmp; - tmp += tensorNumBytes(inputDesc); - } else { - scaleI = inputTensor.get_scale(); - } - // The first portion of tmp is used for quantized bias and results before quantization - if (this->hasBias && DT_F16 != outputDesc.dt) { - biasDesc.dt = DT_I32; - bias = (U8*)biasScaled.data(); - } else { - bias = nullptr; - } - outputDesc.dt = DT_I32; - - I32 *result = (I32*)tmp; - U8 *tmpReal = tmp + tensorNumBytes(outputDesc); - - if (nullptr == bias) { - memset(result, 0, tensorNumBytes(outputDesc)); - } else { - F16 *biasF = (F16*)biasTensor.get_val(); - I32 *biasI = biasScaled.data(); - for (U32 i = 0; i < numSlice; i++) { - F32 scale = scaleI * weightScale[i]; - for (int j = 0; j < slicePoints[i]; j++) { - biasI[j] = round(scale * biasF[j]); - } - biasI += slicePoints[i]; - biasF += slicePoints[i]; - } - } - CHECK_STATUS(fully_connected(inputDesc, inputPtr, - weightDesc, weightTensor.get_val(), - tmpReal, this->lenOfTemp, - outputDesc, result, - biasDesc, bias, this->schedule)); - - if (1 == this->numSlice) { - F32 scale = scaleI * weightScale[0]; - if (DT_I8 == outputTensor.get_desc().dt) { - CHECK_STATUS(quantize_tensor(outputDesc, result, &outputDesc, fcOutput, &scale)); - this->outputTensors[0].set_scale(scale); - } else { - CHECK_REQUIREMENT(DT_F16 == outputTensor.get_desc().dt); - F16 *output = outputTensor.get_val(); - dequantize_int32_to_fp16(tensorNumElements(outputDesc), result, scale, output, tensorNumElements(biasDesc), (F16*)this->biasTensors[0].get_val()); - } - } else { - CHECK_REQUIREMENT(this->numSlice == this->outputTensors.size()); - Vec bufD(this->numSlice); - bufD[0] = fcOutput; - for (U32 i = 1; i < this->numSlice; i++) { - bufD[i] = bufD[i - 1] + tensorNumElements(this->outputTensors[i - 1].get_desc()) * bytesOf(DT_I32); - } - CHECK_REQUIREMENT(numRow * this->numOutput == tensorNumElements(outputDesc)); - for (U32 i = 0; i < numRow; i++) { - for (U32 j = 0; j < this->numSlice; j++) { - U32 sliceSize = this->slicePoints[j] * bytesOf(DT_I32); - memcpy(bufD[j], result, sliceSize); - bufD[j] += sliceSize; - result += this->slicePoints[j]; - } - } - F16 *biasPtr = (F16*)this->biasTensors[0].get_val(); - for (U32 i = 0; i < this->numSlice; i++) { - F32 scale = scaleI * weightScale[i]; - outputDesc.dims[0] = slicePoints[i]; - if (DT_I8 == outputTensor.get_desc().dt) { - CHECK_STATUS(quantize_tensor(outputDesc, fcOutput, &outputDesc, this->outputTensors[i].get_val(), &scale)); - this->outputTensors[i].set_scale(scale); - } else { - CHECK_REQUIREMENT(DT_F16 == outputTensor.get_desc().dt); - F16 *output = outputTensors[i].get_val(); - dequantize_int32_to_fp16(tensorNumElements(outputDesc), (I32*)fcOutput, scale, output, slicePoints[i], biasPtr); - biasPtr += slicePoints[i]; - } - - outputDesc.dt = DT_I32; - fcOutput += tensorNumBytes(outputDesc); - } - } -#endif - } else { - if (nullptr == bias) { - memset(fcOutput, 0, tensorNumBytes(outputDesc)); - } - CHECK_STATUS(fully_connected(inputDesc, inputTensor.get_val(), - weightDesc, weightTensor.get_val(), - tmp, this->lenOfTemp, - outputDesc, fcOutput, - biasDesc, bias, this->schedule)); - - if (1 != this->numSlice) { - CHECK_REQUIREMENT(this->numSlice == this->outputTensors.size()); - Vec outputPtr(this->numSlice); - for (U32 i = 0; i < this->numSlice; i++) { - outputPtr[i] = this->outputTensors[i].get_val(); - } - CHECK_REQUIREMENT(numRow * this->numOutput == tensorNumElements(outputDesc)); - for (U32 i = 0; i < numRow; i++) { - for (U32 j = 0; j < this->numSlice; j++) { - U32 
sliceSize = this->slicePoints[j] * bytesOf(outputDesc.dt); - memcpy(outputPtr[j], fcOutput, sliceSize); - outputPtr[j] += sliceSize; - fcOutput += sliceSize; - } - } - } - } - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - this->mvm = false; - TensorDesc inputDesc = desc_process(inDims[0]); - TensorDesc weightDesc = tensor2df(inputDesc.dt, DF_NORMAL, this->numOutput, this->numInput); - TensorDesc outputDesc; - - DataType idt; - DataFormat idf; - U32 in = 0, ic, ih, iw; - if (tensorIs2d(inputDesc)) { - CHECK_STATUS(tensor2dfGet(inputDesc, &idt, &idf, &in, &iw)); - } else if (tensorIs4d(inputDesc)) { - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - } else { - CHECK_STATUS(NOT_MATCH); - } - if (1 == in) { - this->mvm = true; - } - - CHECK_STATUS(fully_connected_infer_output_size(inputDesc, weightDesc, &outputDesc, this->schedule)); - if (1 == this->numSlice) { - (*outDims)[0] = desc_process_reverse(inDims[0], outputDesc); - if (DT_F16_8Q == this->dt) { - if (featureScale.size() > 0 && -2 == (featureScale.back())[0]) { - (*outDims)[0].dt = DT_F16; - } else { - (*outDims)[0].dt = DT_I8; - } - } - } else { - outputDesc = desc_process_reverse(inDims[0], outputDesc); - for (U32 i = 0; i < this->numSlice; i++) { - (*outDims)[i] = outputDesc; - (*outDims)[i].dims[0] = this->slicePoints[i]; - if (DT_F16_8Q == this->dt) { - if (featureScale.size() > 0 && -2 == (featureScale.back())[0]) { - (*outDims)[i].dt = DT_F16; - } else { - (*outDims)[i].dt = DT_I8; - } - } - } - } - return SUCCESS; - } - - virtual U32 infer_tmp_memory_size() override - { - TensorDesc inputDesc = desc_process((this->inputTensors[0]).get_desc()); - TensorDesc castDesc = inputDesc; - TensorDesc filterDesc = (this->weightTensors[0]).get_desc(); - TensorDesc outputDesc = this->outputTensors[0].get_desc(); - outputDesc.dims[0] = this->numOutput; - U32 bytes = 0; - - castDesc.dt = filterDesc.dt; - CHECK_STATUS(fully_connected_infer_forward_tmp_bytes(castDesc, filterDesc, &bytes, this->schedule)); - if (DT_I8 == filterDesc.dt) { - if (DT_F16 == inputDesc.dt) { - bytes += tensorNumElements(castDesc); - } - outputDesc.dt = DT_I32; - bytes += tensorNumBytes(outputDesc); // Results before quantization - } - if (1 != this->numSlice) { - bytes += tensorNumBytes(outputDesc); - } - return bytes; - } - - virtual U32 infer_wtm_memory_size() override - { - TensorDesc weightDesc = (this->weightTensors[0]).get_desc(); - U32 bytes = 0; - CHECK_STATUS(fully_connected_transform_filter_bytes(weightDesc, &bytes, this->schedule)); - return bytes; - } - - virtual EE transform_filter() override - { - this->wtm = std::shared_ptr(new Tensor()); - TensorDesc inputDesc = desc_process((this->inputTensors[0]).get_desc()); - - Tensor weightTensor = this->weightTensors[0]; - TensorDesc weightDesc = weightTensor.get_desc(); - U8* weightPtr = weightTensor.get_val(); - - TensorDesc wtmDesc; - auto wtm_bytes = this->infer_wtm_memory_size(); - - if (DT_F16_8Q == this->dt) { -#ifdef _USE_INT8 - TensorDesc tFilterDesc; - F16 *tFilter = (F16*)malloc(wtm_bytes + bytesOf(DT_I8) * tensorNumElements(weightDesc)); - if (nullptr == tFilter) { - std::cerr << "[ERROR] allocation failed for filter transform in int8 FC" << std::endl; - CHECK_STATUS(ALLOC_FAILED); - } - TensorDesc qFilterDesc; - INT8 *qFilter = (INT8*)(tFilter + wtm_bytes / bytesOf(DT_F16)); - - inputDesc.dt = DT_F16; - CHECK_STATUS(fully_connected_transform_filter(inputDesc, weightDesc, weightPtr, &tFilterDesc, tFilter, 
this->schedule)); - U32 ftm_bytes = wtm_bytes / bytesOf(DT_F16); - std::shared_ptr wtmPtr((U8*) operator new(ftm_bytes)); - auto cpuMem = new CpuMemory(); - cpuMem->set_shared_ptr_caster(wtmPtr); - Memory_* mem = (Memory_*)(cpuMem); - std::shared_ptr memWtmPtr(mem); - this->set_wtm_memory(wtm_bytes, memWtmPtr); - - F16 scale; - this->weightScale = Vec(numSlice); - if (this->mvm) { - F16 *inD = tFilter; - INT8 *inQ = this->get_wtm()->get_val(); - for (U32 i = 0; i < numSlice; i++) { - tFilterDesc.dims[1] = slicePoints[i]; - scale = -1; - CHECK_STATUS(quantize_tensor(tFilterDesc, inD, &qFilterDesc, inQ, &scale)); - weightScale[i] = scale; - inD += tensorNumElements(tFilterDesc); - inQ += tensorNumElements(qFilterDesc); - } - wtmDesc = qFilterDesc; - wtmDesc.dims[1] = numOutput; - } else if (featureScale.size() > 0 && featureScale[0][0] > 0) { - F16 *inD = tFilter; - INT8 *inQ = qFilter; - scale = -1; - CHECK_STATUS(quantize_tensor(tFilterDesc, inD, &qFilterDesc, inQ, &scale)); - - for (U32 i = 0; i < numSlice; i++) { - weightScale[i] = scale; - } - CHECK_STATUS(matrix_matrix_multiply_transform_rhs(qFilterDesc, qFilter, &wtmDesc, this->get_wtm()->get_val())); - } else { - F16 *inD = tFilter; - INT8 *inQ = qFilter; - for (U32 i = 0; i < numSlice; i++) { - tFilterDesc.dims[0] = slicePoints[i]; - scale = -1; - CHECK_STATUS(quantize_tensor(tFilterDesc, inD, &qFilterDesc, inQ, &scale)); - weightScale[i] = scale; - inD += tensorNumElements(tFilterDesc); - inQ += tensorNumElements(qFilterDesc); - } - qFilterDesc.dims[0] = numOutput; - CHECK_STATUS(matrix_matrix_multiply_transform_rhs(qFilterDesc, qFilter, &wtmDesc, this->get_wtm()->get_val())); - } - this->get_wtm()->set_scale(scale); - biasScaled.resize(this->numOutput); - - free(tFilter); -#endif - } else { - std::shared_ptr wtmPtr((U8*) operator new(wtm_bytes)); - auto cpuMem = new CpuMemory(); - cpuMem->set_shared_ptr_caster(wtmPtr); - Memory_* mem = (Memory_*)(cpuMem); - std::shared_ptr memWtmPtr(mem); - this->set_wtm_memory(wtm_bytes, memWtmPtr); - - if (this->mvm) { - CHECK_STATUS(fully_connected_transform_filter(inputDesc, weightDesc, weightPtr, &wtmDesc, this->get_wtm()->get_val(), this->schedule)); - } else { - TensorDesc tFilterDesc; - U8 *tFilter = (U8*)malloc(wtm_bytes); - if (nullptr == tFilter) { - std::cerr << "[ERROR] allocation failed for filter transform in FC" << std::endl; - CHECK_STATUS(ALLOC_FAILED); - } - CHECK_STATUS(fully_connected_transform_filter(inputDesc, weightDesc, weightPtr, &tFilterDesc, tFilter, this->schedule)); - CHECK_STATUS(matrix_matrix_multiply_transform_rhs(tFilterDesc, tFilter, &wtmDesc, this->get_wtm()->get_val())); - free(tFilter); - } - } - - this->get_wtm()->set_desc(wtmDesc); - this->weightTensors[0] = *this->get_wtm(); - return SUCCESS; - } - - bool mvm; - Vec weightScale; -#ifdef _USE_INT8 - Vec biasScaled; -#endif -}; - -#endif //_FCELTWISE_CPU_H diff --git a/inference/include/cpu/layer_norm_cpu.hpp b/inference/include/cpu/layer_norm_cpu.hpp deleted file mode 100644 index 6d2bba3a..00000000 --- a/inference/include/cpu/layer_norm_cpu.hpp +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _LAYER_NORM_CPU_H
-#define _LAYER_NORM_CPU_H
-
-#include "operator.hpp"
-#include "tensor_computing.h"
-#include "tensor_desc.h"
-#include "op_type.h"
-#include "layer_norm.hpp"
-
-class LayerNormCPU: public LayerNorm {
-public:
-    LayerNormCPU(DataType dt, U32 weightNum) : LayerNorm(dt, weightNum) {}
-
-    EE init_weight_bias_from_model(U8** modelPtr) override
-    {
-        auto curOpWs = this->get_weightspec_ptr();
-        if (modelPtr == nullptr) {
-            this->weightNum = curOpWs.bytes_of_weight / bytesOf(curOpWs.mdt);
-        }
-
-        DataType dtNoQ = (DT_F16_8Q == this->dt) ? DT_F16 : this->dt;
-        TensorDesc weightDesc = tensor1d(dtNoQ, this->weightNum);
-        TensorDesc biasDesc = tensor1d(dtNoQ, this->weightNum);
-        std::shared_ptr<Tensor> modelWeightTensor(new Tensor());
-        std::shared_ptr<Tensor> modelBiasTensor(new Tensor());
-        modelWeightTensor->set_desc(weightDesc);
-        modelBiasTensor->set_desc(biasDesc);
-        U32 weightBytes = tensorNumBytes(weightDesc);
-        if (modelPtr != nullptr) {
-            modelWeightTensor->alloc();
-            memcpy((U8*)modelWeightTensor->get_val(), *modelPtr, weightBytes);
-            *modelPtr += weightBytes;
-        } else {
-            modelWeightTensor->set_shared_ptr(std::shared_ptr<U8>(curOpWs.weight));
-        }
-
-        U8* biasVal = nullptr;
-        if (modelPtr != nullptr) {
-            if (this->hasBias) {
-                biasVal = *modelPtr;
-                *modelPtr += tensorNumBytes(biasDesc);
-            }
-        } else {
-            if (this->hasBias) biasVal = curOpWs.vec;
-        }
-
-        if (biasVal) {
-            modelBiasTensor->set_shared_ptr(std::shared_ptr<U8>(biasVal));
-        } else {
-            modelBiasTensor->alloc();
-            memset((U8*)modelBiasTensor->get_val(), 0, tensorNumBytes(biasDesc));
-        }
-
-        this->weightTensors.push_back(*modelWeightTensor.get());
-        this->biasTensors.push_back(*modelBiasTensor.get());
-        return SUCCESS;
-    }
-
-    void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-        Tensor inputTensor = this->inputTensors[0];
-        TensorDesc inputDesc = inputTensor.get_desc();
-        Tensor weightTensor = this->weightTensors[0];
-        Tensor biasTensor = this->biasTensors[0];
-
-        Tensor outputTensor = this->outputTensors[0];
-        TensorDesc outputDesc = outputTensor.get_desc();
-        CHECK_STATUS(layer_normalization(weightTensor.get_val(), biasTensor.get_val(),
-            inputDesc, inputTensor.get_val(),
-            outputDesc, outputTensor.get_val(), this->schedule));
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        TensorDesc in_dim = inDims[0];
-        CHECK_STATUS(normalization_infer_output_size(in_dim, &((*outDims)[0]), this->schedule));
-        return SUCCESS;
-    }
-};
-
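
layer_normalization is only called above; its kernel is not part of this diff. What it computes is presumably the standard per-row normalization y = gamma * (x - mean) / sqrt(var + eps) + beta with learned gamma/beta of length weightNum. The sketch below is an illustrative scalar float version of that formula, not the vectorized kernel.

```cpp
// Layer normalization spelled out for a rows x cols tensor.
#include <cmath>
#include <cstdio>
#include <vector>

static void layerNorm(const std::vector<float> &x, unsigned rows, unsigned cols,
                      const std::vector<float> &gamma, const std::vector<float> &beta,
                      std::vector<float> &y, float eps = 1e-6f)
{
    y.resize(x.size());
    for (unsigned r = 0; r < rows; r++) {
        const float *in = &x[r * cols];
        float *out = &y[r * cols];
        float mean = 0;
        for (unsigned c = 0; c < cols; c++) mean += in[c];
        mean /= cols;
        float var = 0;
        for (unsigned c = 0; c < cols; c++) var += (in[c] - mean) * (in[c] - mean);
        var /= cols;
        float inv = 1.0f / std::sqrt(var + eps);  // normalize, then affine
        for (unsigned c = 0; c < cols; c++) {
            out[c] = gamma[c] * (in[c] - mean) * inv + beta[c];
        }
    }
}

int main()
{
    std::vector<float> x = {1, 2, 3, 4}, g = {1, 1}, b = {0, 0}, y;
    layerNorm(x, 2, 2, g, b, y);
    printf("%f %f\n", y[0], y[1]);  // ~ -1 and +1 per row
    return 0;
}
```
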
-#endif //_LAYER_NORM_CPU_H diff --git a/inference/include/cpu/matmul_cpu.hpp b/inference/include/cpu/matmul_cpu.hpp deleted file mode 100644 index cda7f4f7..00000000 --- a/inference/include/cpu/matmul_cpu.hpp +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -/** - * Project deploy - */ -#ifndef _MATMUL_CPU_H -#define _MATMUL_CPU_H - -#include "operator.hpp" -#include "tensor_computing.h" -#include "matmul.hpp" - -class MatMulCPU: public MatMul { -public: - MatMulCPU(DataType dt, bool transposeA, bool transposeB) : MatMul(dt, transposeA, transposeB) {} - - void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor inputTensorA = this->inputTensors[0]; - TensorDesc inputDescA = inputTensorA.get_desc(); - Tensor inputTensorB = this->inputTensors[1]; - TensorDesc inputDescB = inputTensorB.get_desc(); - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - - U8 *inputA = inputTensorA.get_val(); - U8 *inputB = inputTensorB.get_val(); - U8 *tmp = (U8*)this->temp->get_val(); - - if (DT_I8 == inputDescA.dt || DT_I8 == inputDescB.dt) { -#ifdef _USE_INT8 - F32 scaleO = 1; - if (DT_F16 == inputDescA.dt) { - F16 *inD = (F16*)inputA; - INT8 *inQ = (INT8*)tmp; - F16 scale = -1; - if (featureScale.size() == 3 && featureScale[0][0] > 0) { - scale = featureScale[0][0]; - } - quantize_tensor(inputDescA, inD, &inputDescA, inQ, &scale); - scaleO *= scale; - inputA = (U8*)tmp; - tmp += tensorNumBytes(inputDescA); - } else { - scaleO *= inputTensorA.get_scale(); - } - if (DT_F16 == inputDescB.dt) { - F16 *inD = (F16*)inputB; - INT8 *inQ = (INT8*)tmp; - F16 scale = -1; - if (featureScale.size() == 3 && featureScale[1][0] > 0) { - scale = featureScale[1][0]; - } - quantize_tensor(inputDescB, inD, &inputDescB, inQ, &scale); - scaleO *= scale; - inputB = (U8*)tmp; - tmp += tensorNumBytes(inputDescB); - } else { - scaleO *= inputTensorB.get_scale(); - } - outputDesc.dt = DT_I32; - I32 *result = (I32*)tmp; - U8 *tmpReal = tmp + tensorNumBytes(outputDesc); - CHECK_STATUS(matmul(inputDescA, this->transposeA, inputA, - inputDescB, this->transposeB, inputB, - tmpReal, this->lenOfTemp, - outputDesc, result, this->schedule)); - if (DT_I8 == outputTensor.get_desc().dt) { - CHECK_STATUS(quantize_tensor(outputDesc, result, &outputDesc, outputTensor.get_val(), &scaleO)); - outputTensor.set_scale(scaleO); - } else { - CHECK_REQUIREMENT(DT_F16 == outputTensor.get_desc().dt) { 
- F16 *output = outputTensor.get_val(); - dequantize_int32_to_fp16(tensorNumElements(outputDesc), result, scaleO, output); - } - } -#endif - } else { - CHECK_STATUS(matmul(inputDescA, this->transposeA, inputA, - inputDescB, this->transposeB, inputB, - tmp, this->lenOfTemp, - outputDesc, outputTensor.get_val(), this->schedule)); - } - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - TensorDesc inDimA = inDims[0]; - TensorDesc inDimB = inDims[1]; - CHECK_STATUS(matmul_infer_output_size(inDimA, this->transposeA, inDimB, this->transposeB, &((*outDims)[0]), this->schedule)); - if (DT_F16_8Q == this->dt && featureScale.size() > 0 && -2 == (featureScale.back())[0]) { - (*outDims)[0].dt = DT_F16; - } - return SUCCESS; - } - - U32 infer_tmp_memory_size() override - { - TensorDesc inputDescA = (this->inputTensors[0]).get_desc(); - TensorDesc inputDescB = (this->inputTensors[1]).get_desc(); - U32 bytes = 0; - CHECK_STATUS(matmul_infer_forward_tmp_bytes(inputDescA, this->transposeA, inputDescB, this->transposeB, &bytes, this->schedule)); - return bytes; - } -}; - -#endif //_MATMUL_CPU_H diff --git a/inference/include/cpu/memory_cpu.hpp b/inference/include/cpu/memory_cpu.hpp deleted file mode 100644 index 365aca01..00000000 --- a/inference/include/cpu/memory_cpu.hpp +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
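
The scale bookkeeping in MatMulCPU::run above is easy to miss inside the quantization plumbing: each int8 operand carries its own scale, so the int32 accumulator sits on the product scale scaleA * scaleB, and a single divide (or a requantize back to int8) recovers real values. A worked toy example, with a naive triple loop standing in for the real matmul kernel:

```cpp
// Scale composition around an int8 GEMM: C_i32 is on scale scaleA * scaleB.
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    const int M = 1, K = 2, N = 1;
    // Pretend both operands were quantized earlier with these scales
    // (value_int8 = value_float * scale).
    float scaleA = 100.0f, scaleB = 50.0f;
    std::vector<int8_t> A = {50, -25};   // 0.5, -0.25
    std::vector<int8_t> B = {100, 20};   // 2.0,  0.4
    std::vector<int32_t> C(M * N, 0);
    for (int m = 0; m < M; m++)
        for (int n = 0; n < N; n++)
            for (int k = 0; k < K; k++)
                C[m * N + n] += (int32_t)A[m * K + k] * B[k * N + n];
    // One divide by the combined scale dequantizes the int32 result.
    float scaleO = scaleA * scaleB;
    printf("%f\n", C[0] / scaleO);  // ~ 0.5*2.0 + (-0.25)*0.4 = 0.9
    return 0;
}
```
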
- - -#ifndef _MEMORY_CPU_H -#define _MEMORY_CPU_H -#include -#include -#include -#include "memory.hpp" - -class CpuMemory : public Memory_ -{ -public: - CpuMemory(){ - len = 0; - type = CPUMem; - } - virtual ~CpuMemory() = default; - - virtual void alloc(TensorDesc desc) override - { - U32 size = tensorNumBytes(desc); - if (len < size) { - this->val = std::shared_ptr((U8*)operator new(size)); - len = size; - } - } - - virtual void alloc(U32 size) override - { - if (len < size) { - this->val = std::shared_ptr((U8*)operator new(size)); - len = size; - } - } - - virtual void set_val_by_copy(TensorDesc desc, U8* ptr) override { - memcpy(val.get(), ptr, tensorNumBytes(desc)); - } - - virtual void* get_val() override{ - return this->val.get(); - }; - - virtual MemoryType get_mem_type() override{ - return type; - } - - virtual void set_shared_ptr(PtrCasterShared val) override{ - this->val = val; - } - - virtual std::shared_ptr get_shared_ptr() override{ - return val; - } - -private: - std::shared_ptr val; - U32 len; - MemoryType type; -}; -#endif diff --git a/inference/include/cpu/multiply_cpu.hpp b/inference/include/cpu/multiply_cpu.hpp deleted file mode 100644 index 2f7e27fc..00000000 --- a/inference/include/cpu/multiply_cpu.hpp +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
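
CpuMemory above reallocates only when a request exceeds the current block, so repeated alloc calls across layers reuse one buffer, and shared_ptr ownership lets several views alias it safely. The stand-in class below isolates that grow-only policy; GrowOnlyBuffer is hypothetical, and it uses new[] with a matching array deleter rather than raw operator new.

```cpp
// Grow-only buffer in the style of CpuMemory::alloc.
#include <cstdio>
#include <memory>

class GrowOnlyBuffer {  // hypothetical stand-in for CpuMemory
public:
    void alloc(unsigned size)
    {
        if (len < size) {  // never shrink: avoids realloc thrash across layers
            val = std::shared_ptr<unsigned char>(
                new unsigned char[size],
                std::default_delete<unsigned char[]>());
            len = size;
        }
    }
    unsigned char *data() { return val.get(); }
    unsigned capacity() const { return len; }

private:
    std::shared_ptr<unsigned char> val;
    unsigned len = 0;
};

int main()
{
    GrowOnlyBuffer buf;
    buf.alloc(64);
    unsigned char *p = buf.data();
    buf.alloc(32);                    // no-op: capacity already sufficient
    printf("%d\n", p == buf.data());  // 1: same block reused
    buf.alloc(128);                   // grows into a new block
    printf("%u\n", buf.capacity());   // 128
    return 0;
}
```
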
- - -/** - * Project deploy - */ -#ifndef _MULTIPLY_CPU_H -#define _MULTIPLY_CPU_H -#include "operator.hpp" -#include "tensor_computing.h" -#include "multiply.hpp" - -class MultiplyCPU: public Multiply { -public: - MultiplyCPU(DataType dt, F32 scale, F32 bias) : Multiply(dt, scale, bias) {} - - void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - Tensor outputTensor = this->outputTensors[0]; - TensorDesc output_desc = outputTensor.get_desc(); - - if (DT_I8 == inputDesc.dt) { -#ifdef _USE_INT8 - CHECK_REQUIREMENT(0 == this->beta); - F32 scaleO = inputTensor.get_scale() / this->alpha; - outputTensor.set_scale(scaleO); - U8 *inPtr = inputTensor.get_val(); - U8 *outPtr = outputTensor.get_val(); - if (inPtr != outPtr) { - memcpy(outPtr, inPtr, tensorNumBytes(inputDesc)); - } -#endif - } else { - CHECK_STATUS(multiply(&(this->alpha), &(this->beta), - inputDesc, inputTensor.get_val(), - output_desc, outputTensor.get_val(), this->schedule)); - } - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - - EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - CHECK_STATUS(multiply_infer_output_size(inDims[0], &((*outDims)[0]), this->schedule)); - return SUCCESS; - } -}; - -#endif //_MULTIPLY_CPU_H diff --git a/inference/include/cpu/pooling_cpu.hpp b/inference/include/cpu/pooling_cpu.hpp deleted file mode 100644 index 4949c44f..00000000 --- a/inference/include/cpu/pooling_cpu.hpp +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#ifndef _POOLING_CPU_H -#define _POOLING_CPU_H -#include -#include "operator.hpp" -#include "tensor_computing.h" -#include "tensor_desc.h" -#include "op_type.h" -#include "pooling.hpp" - -class PoolingCPU: public Pooling { -public: - -/** - * @param mode - * @param ks - * @param stride - * @param padding - * @param name - */ - PoolingCPU(PoolingMode mode, U32 ksH, U32 ksW, U32 strideH, U32 strideW, U32 paddingT, U32 paddingB, U32 paddingL, U32 paddingR, RoundMode rm): - Pooling(mode, ksH, ksW, strideH, strideW, paddingT, paddingB, paddingL, paddingR, rm){} - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - PoolingDesc poolingDesc = Pooling::create_PoolingDesc(this->mode, this->kernelSizeH, this->kernelSizeW, this->strideH, this->strideW, - this->paddingT, this->paddingB, this->paddingL, this->paddingR, this->rm); - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - short scales[2]; -#ifdef _USE_INT8 - if (DT_I8 == inputDesc.dt) { - F16* scale = (F16*)scales; - scale[0] = inputTensor.get_scale(); - } -#endif - CHECK_STATUS(pooling(inputDesc, inputTensor.get_val(), - poolingDesc, scales, - outputDesc, outputTensor.get_val(), - this->schedule)); -#ifdef _USE_INT8 - if (DT_I8 == inputDesc.dt) { - F16 *scale = (F16*)scales; - outputTensor.set_scale(scale[1]); - } -#endif - - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - - virtual EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - auto inDim = inDims[0]; - DataType dt; - DataFormat df; - U32 width ; - U32 height; - U32 numChannels; - U32 num; - CHECK_STATUS(tensor4dGet(inDim, &dt, &df, &num, &numChannels, &height, &width)); - - TensorDesc inputDesc = tensor4df(dt, df, num, numChannels, height, width); - if (this->kernelSizeH == 0 && this->kernelSizeW == 0) { - Pooling::set_stride(1, 1); - } - PoolingDesc poolingDesc = Pooling::create_PoolingDesc(this->mode, this->kernelSizeH, this->kernelSizeW, this->strideH, this->strideW, - this->paddingT, this->paddingB, this->paddingL, this->paddingR, this->rm); - CHECK_STATUS(pooling_infer_output_size(inputDesc, poolingDesc, &((*outDims)[0]), this->schedule)); - return SUCCESS; - } - -}; - -#endif //_POOLINGCPU_H diff --git a/inference/include/cpu/reshape_cpu.hpp b/inference/include/cpu/reshape_cpu.hpp deleted file mode 100644 index e73d3bbf..00000000 --- a/inference/include/cpu/reshape_cpu.hpp +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _RESHAPE_CPU_H -#define _RESHAPE_CPU_H - -#include "operator.hpp" -#include "tensor_computing.h" -#include "reshape.hpp" - -class ReshapeCPU: public Reshape { -public: -/** - * @param shapeDims - * @param axis - * @param numAxes - */ - ReshapeCPU(DataType dt, I32* shapeDimsPtr, I32 shapeSize, I32 axis, I32 numAxes) : Reshape(dt, shapeDimsPtr, shapeSize, axis, numAxes) {} - - void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - - CHECK_STATUS(reshape(inputDesc, inputTensor.get_val(), outputDesc, outputTensor.get_val(), this->schedule)); - outputTensor.set_scale(inputTensor.get_scale()); - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - TensorDesc inputDesc = inDims[0]; - CHECK_STATUS(reshape_infer_output_size(inputDesc, &((*outDims)[0]), this->shapeDims.data(), this->shapeDims.size(), this->schedule)); - return SUCCESS; - } -}; - -#endif //_RESHAPE_CPU_H diff --git a/inference/include/cpu/scale_cpu.hpp b/inference/include/cpu/scale_cpu.hpp deleted file mode 100644 index 54af0044..00000000 --- a/inference/include/cpu/scale_cpu.hpp +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
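Before the ScaleCPU code below, a worked example of its axis normalization may help: Bolt stores `TensorDesc::dims` in reversed order (`dims[0]` is the innermost axis, `dims[nDims - 1]` is N), so a Caffe-style axis index has to be flipped. This is a self-contained illustration with example numbers, not library code:

```cpp
#include <cstdio>

int main()
{
    int nDims = 4;                         // a 4-D NCHW tensor
    int axis = 1;                          // channel axis in NCHW order
    int tmpAxis = (axis + nDims) % nDims;  // also maps a negative axis, e.g. -3 -> 1
    tmpAxis = nDims - 1 - tmpAxis;         // flip into reversed-dims indexing
    std::printf("%d\n", tmpAxis);          // prints 2: dims[2] holds C
    return 0;
}
```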
- - -#ifndef _SCALE_CPU_H -#define _SCALE_CPU_H - -#include <math.h> -#include "weight_operator.hpp" -#include "tensor_computing.h" -#include "tensor_desc.h" -#include "op_type.h" -#include "scale.hpp" - -class ScaleCPU: public Scale -{ -public: - ScaleCPU(DataType dt, int axis, int numChannels, int numSource): - Scale(dt, axis, numChannels, numSource) - { - this->alpha = nullptr; - this->beta = nullptr; - } - - virtual EE init_weight_bias_from_model(U8** modelPtr) override - { - auto curOpWs = this->get_weightspec_ptr(); - U32 weightNum = 0; - if(modelPtr == nullptr){ - weightNum = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); - if (0 == weightNum) { - weightNum = curOpWs.bytes_of_vec / UNI_MAX(1, bytesOf(curOpWs.mdt)); - } - } - - TensorDesc weightDesc = tensor1d(this->dt, weightNum); - TensorDesc biasDesc = weightDesc; - std::shared_ptr<Tensor> modelWeightTensor(new Tensor()); - std::shared_ptr<Tensor> modelBiasTensor(new Tensor()); - modelWeightTensor->set_desc(weightDesc); - modelBiasTensor->set_desc(biasDesc); - - U32 weightBytes = tensorNumBytes(weightDesc); - if(modelPtr != nullptr){ - modelWeightTensor->alloc(); - memcpy((U8*)modelWeightTensor->get_val(), *modelPtr, weightBytes); - *modelPtr += weightBytes; - } else { - modelWeightTensor->set_shared_ptr(std::shared_ptr<U8>(curOpWs.weight)); - } - - U8* biasVal = nullptr; - if(modelPtr != nullptr){ - if(this->hasBias){ - biasVal = *modelPtr; - *modelPtr += tensorNumBytes(biasDesc); - } - } else { - if(this->hasBias) biasVal = curOpWs.vec; - } - - if (biasVal) { - modelBiasTensor->set_shared_ptr(std::shared_ptr<U8>(biasVal)); - } else { - modelBiasTensor->alloc(); - memset((U8*)modelBiasTensor->get_val(), 0, tensorNumBytes(biasDesc)); - } - - this->weightTensors.push_back(*modelWeightTensor.get()); - this->biasTensors.push_back(*modelBiasTensor.get()); - return SUCCESS; - } - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - int inputTensorNumber = this->inputTensors.size(); - Tensor inputTensor = this->inputTensors[this->dataID]; - Tensor outputTensor = this->outputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - U8* inputPtr = inputTensor.get_val(); - - if (inputTensorNumber == 1) { - this->alpha = this->weightTensors[0].get_val(); - this->beta = this->biasTensors[0].get_val(); - CHECK_STATUS(scale(inputDesc, inputPtr, this->axis, this->alpha, this->beta, - inputTensor.get_desc(), outputTensor.get_val(), this->schedule)); - } else { - CHECK_STATUS(scale(inputDesc, inputPtr, - this->axis, this->inputTensors[1-this->dataID].get_val(), nullptr, - inputTensor.get_desc(), outputTensor.get_val(), this->schedule)); - } - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override - { - I32 tmpAxis = (this->axis + inDims[0].nDims) % inDims[0].nDims; - tmpAxis = inDims[0].nDims - 1 - tmpAxis; - CHECK_REQUIREMENT(tmpAxis < (I32)inDims[0].nDims); - U32 ic = inDims[0].dims[tmpAxis]; - - auto curOpWs = this->get_weightspec_ptr(); - this->alpha = curOpWs.weight; - this->beta = curOpWs.vec; - U32 numChannels; - if (0 != curOpWs.bytes_of_weight) { - numChannels = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); - } else if (0 != curOpWs.bytes_of_vec) { - numChannels = curOpWs.bytes_of_vec / UNI_MAX(1, bytesOf(curOpWs.mdt)); - } else { - numChannels = 0; - } - - TensorDesc inputDesc; - if (ic != numChannels && 0 != numChannels) { - std::cout << "[ERROR] ScaleCPU input channels (IC) do not match.
Perhaps some channel padding has been done earlier" << std::endl; - std::cout << " IC is now " << ic << " but should be " << numChannels << std::endl; - CHECK_STATUS(NOT_SUPPORTED); - } else { - if (inDims.size() > 1 && tensorNumElements(inDims[1]) > tensorNumElements(inDims[0])) { - this->dataID = 1; - } - inputDesc = inDims[this->dataID]; - } - - CHECK_STATUS(scale_infer_output_size(inputDesc, &((*outDims)[0]), this->schedule)); - return SUCCESS; - } - -#ifdef _USE_FP16 - void set_scale_alpha(F16* alpha) - { - this->alpha = (U8*)alpha; - } - - F16* get_scale_alpha() - { - return (F16*)(this->alpha); - } - - void set_scale_beta(F16* beta) - { - this->beta = (U8*)beta; - } - - F16* get_scale_beta() - { - return (F16*)(this->beta); - } -#endif - -private: - U8* alpha; - U8* beta; -}; - -#endif //_SCALE_CPU_H diff --git a/inference/include/cpu/slice_cpu.hpp b/inference/include/cpu/slice_cpu.hpp deleted file mode 100644 index fe735ede..00000000 --- a/inference/include/cpu/slice_cpu.hpp +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
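For reference, a plain scalar version of what the `scale()` kernel invoked by `ScaleCPU::run()` above computes, assuming the usual Caffe Scale semantics `y = alpha[c] * x + beta[c]` broadcast over an NCHW layout (a sketch under that assumption, not the optimized implementation):

```cpp
#include <cstddef>

void scale_nchw_ref(const float *x, const float *alpha, const float *beta,
    float *y, std::size_t n, std::size_t c, std::size_t hw)
{
    for (std::size_t i = 0; i < n; i++) {
        for (std::size_t j = 0; j < c; j++) {
            float b = (beta != nullptr) ? beta[j] : 0.0f;  // nullptr beta = no bias
            for (std::size_t k = 0; k < hw; k++) {
                std::size_t idx = (i * c + j) * hw + k;
                y[idx] = alpha[j] * x[idx] + b;
            }
        }
    }
}
```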
- - -#ifndef _SLICE_CPU_H -#define _SLICE_CPU_H - -#include "operator.hpp" -#include "tensor_computing.h" -#include "slice.hpp" - -class SliceCPU: public Slice { -public: - SliceCPU(DataType dt, I32 axis, I32* slicePointsPtr, I32 sliceSize) : Slice(dt, axis, slicePointsPtr, sliceSize) {} - - void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - - Vec outputTensors = this->get_output_tensors(); - Vec outputTensorDescs; - Vec outputPtrs; - for (U32 i = 0; i < outputTensors.size(); i++) { - outputTensors[i].set_scale(inputTensor.get_scale()); - outputTensorDescs.push_back(outputTensors[i].get_desc()); - outputPtrs.push_back(outputTensors[i].get_val()); - } - - CHECK_STATUS(slice(inputDesc, inputTensor.get_val(), this->axis, outputTensorDescs, &outputPtrs, this->schedule)); - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - TensorDesc in_dim = inDims[0]; - CHECK_STATUS(slice_infer_output_size(in_dim, outDims, this->axis, this->slicePoints.data(), this->schedule)); - return SUCCESS; - } - -}; - -#endif //_SLICE_CPU_H diff --git a/inference/include/cpu/softmax_cpu.hpp b/inference/include/cpu/softmax_cpu.hpp deleted file mode 100644 index ab5f74ca..00000000 --- a/inference/include/cpu/softmax_cpu.hpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
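The SoftmaxCPU that follows dequantizes INT8 input to FP16 before calling the floating-point softmax. A scalar sketch of that dequantize step, again assuming the `q = round(f * scale)` quantization convention used above (plain `float` stands in for `F16`):

```cpp
#include <cstddef>
#include <cstdint>

void dequantize_int8_ref(std::size_t num, const int8_t *q, float scale, float *f)
{
    for (std::size_t i = 0; i < num; i++) {
        f[i] = (float)q[i] / scale;  // recover the real value from the INT8 code
    }
}
```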
- - -#ifndef _SOFTMAX_CPU_H -#define _SOFTMAX_CPU_H - -#include "operator.hpp" -#include "tensor_computing.h" -#include "softmax.hpp" - -class SoftmaxCPU : public Softmax { -public: - - SoftmaxCPU(DataType dt, int axis): - Softmax(dt, axis) { } - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - - U8 *inputPtr = (U8*)inputTensor.get_val(); - - if (DT_I8 == inputDesc.dt) { -#ifdef _USE_INT8 - F32 inputScale = inputTensor.get_scale(); - INT8 *inQ = (INT8*)inputPtr; - U32 numData = tensorNumElements(inputDesc); - F16* inD = (F16*)this->temp->get_val(); - dequantize_int8_to_fp16(numData, inQ, inputScale, inD); - CHECK_STATUS(softmax(outputDesc, inD, this->axis, outputDesc, outputTensor.get_val(), this->schedule)); -#endif - } else { - CHECK_STATUS(softmax(inputDesc, inputPtr, this->axis, outputDesc, outputTensor.get_val(), this->schedule)); - } - - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - CHECK_STATUS(softmax_infer_output_size(inDims[0], &((*outDims)[0]), this->schedule)); - if (DT_I8 == (*outDims)[0].dt) { - (*outDims)[0].dt = DT_F16; - this->lenOfTemp = tensorNumBytes((*outDims)[0]); - } - return SUCCESS; - } - - U32 infer_tmp_memory_size() override - { - return this->lenOfTemp; - } -}; - -#endif //SOFTMAX_CPU_H diff --git a/inference/include/cpu/squeeze_cpu.hpp b/inference/include/cpu/squeeze_cpu.hpp deleted file mode 100644 index df81e720..00000000 --- a/inference/include/cpu/squeeze_cpu.hpp +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
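The SqueezeCPU below manipulates shapes directly in Bolt's reversed `dims` storage. A self-contained worked example of its single-axis branch, squeezing axis 1 of a (1, 1, 64, 7) tensor (the shape is invented for illustration):

```cpp
#include <cstdio>

int main()
{
    unsigned dims[4] = {7, 64, 1, 1};  // reversed storage of shape (1, 1, 64, 7)
    int nDims = 4;
    int axis = 1;                      // squeeze the second axis, which must be 1
    int flipped = nDims - 1 - axis;    // == 2 in reversed indexing
    unsigned out[3];
    int idx = 0;
    for (int i = 0; i < nDims; i++) {
        if (i != flipped) {
            out[idx++] = dims[i];      // drop only the squeezed axis
        }
    }
    std::printf("%u %u %u\n", out[0], out[1], out[2]);  // 7 64 1 -> shape (1, 64, 7)
    return 0;
}
```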
- - -#ifndef _SQUEEZE_CPU_H -#define _SQUEEZE_CPU_H - -#include "operator.hpp" -#include "squeeze.hpp" - -class SqueezeCPU: public Squeeze -{ -public: - /** - @param mode - */ - SqueezeCPU(DataType dt, I32 axis, I32 *dims, I32 dimSize) : Squeeze(dt, axis, dims, dimSize) { } - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - Tensor outputTensor = this->outputTensors[0]; - U8* inPtr = inputTensor.get_val(); - U8* outPtr = outputTensor.get_val(); - if(inPtr != outPtr) { - memcpy(outPtr, inPtr, tensorNumBytes(inputDesc)); - } - - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_output_tensors_size(VecinDims, Vec* outDims) override - { - auto outDimsPtr = &((*outDims)[0]); - outDimsPtr->dt = inDims[0].dt; - int axis = this->axis; - if (axis < 0) - axis += inDims[0].nDims; - if (axis >= 0 && axis < (int)(inDims[0].nDims)) { - axis = inDims[0].nDims - 1 - axis; - for (int i = 0; i < axis; i++) { - outDimsPtr->dims[i] = inDims[0].dims[i]; - } - if (inDims[0].dims[axis] != 1) { - CHECK_STATUS(NOT_MATCH); - } - for (int i = axis+1; i < (int)(inDims[0].nDims); i++) { - outDimsPtr->dims[i-1] = inDims[0].dims[i]; - } - outDimsPtr->nDims = inDims[0].nDims - 1; - } - else { - for (U32 i = 0; i < inDims[0].nDims; i++) - outDimsPtr->dims[i] = inDims[0].dims[i]; - for (U32 i = 0; i < this->dims.size(); i++) { - outDimsPtr->dims[inDims[0].nDims - 1 - this->dims[i]] = 0; - } - U32 index = 0; - for (U32 i = 0; i < inDims[0].nDims; i++) { - if (outDimsPtr->dims[i] != 0) - outDimsPtr->dims[index++] = outDimsPtr->dims[i]; - } - CHECK_REQUIREMENT(index + this->dims.size() == inDims[0].nDims); - outDimsPtr->nDims = index; - } - outDimsPtr->df = getTensorDefaultDataFormat(outDimsPtr->nDims); - return SUCCESS; - } -}; - -#endif //_SQUEEZE_CPU_H diff --git a/inference/include/cpu/transpose_cpu.hpp b/inference/include/cpu/transpose_cpu.hpp deleted file mode 100644 index 4caa6a57..00000000 --- a/inference/include/cpu/transpose_cpu.hpp +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#ifndef _TRANSPOSE_CPU_H -#define _TRANSPOSE_CPU_H - -#include <algorithm> -#include "operator.hpp" -#include "tensor_computing.h" -#include "transpose.hpp" - -class TransposeCPU: public Transpose { -public: - TransposeCPU(DataType dt, U32* transDimsPtr, U32 transDimsSize) : Transpose(dt, transDimsPtr, transDimsSize) {} - - void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - - if (DF_NCHWC8 == inputDesc.df) { - inputDesc.nDims = 5; - for (int i = 3; i >= 0; i--) { - inputDesc.dims[i + 1] = inputDesc.dims[i]; - } - inputDesc.dims[3] /= 8; - inputDesc.dims[0] = 8; - - TensorDesc desc = outputDesc; - desc.nDims = 5; - U32 idx = 4; - for (int i = 3; i >= 0; i--) { - if (1 == transDims[3 - i]) { // C - desc.dims[idx] = outputDesc.dims[i] / 8; - idx--; - desc.dims[idx] = 8; - idx--; - } else { - desc.dims[idx] = outputDesc.dims[i]; - idx--; - } - } - outputDesc = desc; - } - - CHECK_STATUS(transpose(inputDesc, inputTensor.get_val(), outputDesc, outputTensor.get_val(), this->transDims.data(), this->schedule)); - outputTensor.set_scale(inputTensor.get_scale()); - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override - { - TensorDesc inputDesc = inDims[0]; - if (DF_NCHWC8 == inputDesc.df) { - if (this->transDims.size() == 4) { - auto ptr = std::find(this->transDims.begin(), this->transDims.end(), 1); - this->transDims.insert(ptr+1, 4); - } - } - CHECK_STATUS(transpose_infer_output_size(inputDesc, &((*outDims)[0]), this->transDims.data(), this->schedule)); - return SUCCESS; - } -}; - -#endif //_TRANSPOSE_CPU_H diff --git a/inference/include/data_loader.hpp b/inference/include/data_loader.hpp deleted file mode 100644 index 9ba61194..00000000 --- a/inference/include/data_loader.hpp +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
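The NCHWC8 fix-up in `TransposeCPU::infer_output_tensors_size()` above is easy to miss: for a channel-tiled input, a 4-element permutation gains a fifth axis immediately after wherever C (index 1) appears, so the 8-wide channel tile travels with the channel dimension. A small self-contained illustration (the permutation value is an example):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<unsigned> transDims = {0, 2, 3, 1};  // example: NCHW -> NHWC
    // Same logic as the operator: insert the tile axis (index 4 in the 5-D
    // view) right after the channel axis.
    auto ptr = std::find(transDims.begin(), transDims.end(), 1u);
    transDims.insert(ptr + 1, 4u);
    for (unsigned d : transDims) {
        std::printf("%u ", d);  // prints: 0 2 3 1 4
    }
    return 0;
}
```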
- -#ifndef _H_DATA_LOADER -#define _H_DATA_LOADER - -#include <string> -#include "tensor_desc.h" -#include "tensor.hpp" - -#ifdef _BUILD_TEST -Vec<Tensor> load_txt(std::string dataPath, Vec<TensorDesc> dataDesc); - -Vec<Tensor> load_data(std::string directoryPath, - Vec<TensorDesc> dataDesc, - Vec<Vec<Tensor>>* datas); - -Vec<Tensor> load_image_with_scale(std::string directoryPath, - Vec<TensorDesc> dataDesc, - Vec<Vec<Tensor>>* datas, - ImageFormat imageFormat, - F32 scaleValue); - -Vec<Tensor> load_bin_with_type(std::string directoryPath, - Vec<TensorDesc> dataDesc, - Vec<Vec<Tensor>>* datas, - Vec<DataType> sourceDataType); -#endif -#endif diff --git a/inference/include/deconvolution.hpp b/inference/include/deconvolution.hpp deleted file mode 100644 index 31eb7df2..00000000 --- a/inference/include/deconvolution.hpp +++ /dev/null @@ -1,334 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
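A hedged usage sketch for the test-only loaders declared in data_loader.hpp above; the path and the 1x3x224x224 shape are invented, and `tensor4df`, `Vec` and `Tensor` come from the surrounding headers:

```cpp
#include "data_loader.hpp"

#ifdef _BUILD_TEST
void example_load()  // hypothetical driver
{
    Vec<TensorDesc> descs;
    descs.push_back(tensor4df(DT_F32, DF_NCHW, 1, 3, 224, 224));  // invented shape
    Vec<Tensor> inputs = load_txt(std::string("./example_input.txt"), descs);
}
#endif
```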
- - -#ifndef _DECONVOLUTION_H -#define _DECONVOLUTION_H -#include "weight_operator.hpp" -#include "tensor_computing.h" -#include "op_type.h" - -class Deconvolution: public WeightOperator { -public: - Deconvolution(DataType dt, U32 nf, U32 ksizeH, U32 ksizeW, U32 kstrideH, U32 kstrideW, U32 kpaddingT, U32 kpaddingB, U32 kpaddingL, U32 kpaddingR, - ActivationDesc dwActivationDesc, ActivationDesc pwActivationDesc, - ConvolutionMode convolutionType, U32 group, U32 dilateH, U32 dilateW) - { - this->dt = dt; - this->numOutputs = nf; - this->kernelSizeH = ksizeH; - this->kernelSizeW = ksizeW; - this->strideH = kstrideH; - this->strideW = kstrideW; - this->paddingT = kpaddingT; - this->paddingB = kpaddingB; - this->paddingL = kpaddingL; - this->paddingR = kpaddingR; - this->dwActivationDesc = dwActivationDesc; - this->pwActivationDesc = pwActivationDesc; - this->convolutionType = convolutionType; - this->group = group; - this->dilateH = dilateH; - this->dilateW = dilateW; - this->hasBias = false; - this->pwAlg = CONVOLUTION_ALGORITHM_NULL; - } - - OperatorType get_op_type() override - { - return OT_Deconvolution; - } - - ConvolutionDesc create_convDesc(U32 strideH, U32 strideW, U32 paddingT, U32 paddingB, U32 paddingL, U32 paddingR, U32 dilateH, U32 dilateW) - { - ConvolutionDesc convDesc; - convDesc.stride_h = strideH; - convDesc.stride_w = strideW; - convDesc.padding_top = paddingT; - convDesc.padding_bottom = paddingB; - convDesc.padding_left = paddingL; - convDesc.padding_right = paddingR; - convDesc.dilatedRate_h = dilateH; - convDesc.dilatedRate_w = dilateW; - return convDesc; - } - - EE init_weight_bias_from_model(U8** modelPtr) - { - auto curOpWs = this->get_weightspec_ptr(); - DataType filterDt = curOpWs.mdt; // weight data type may not be the same as input and output - if (modelPtr != nullptr) { - filterDt = this->dt; - } - DataType dtNoQ = (this->dt == DT_F16_8Q) ? DT_F16 : this->dt; - U32 isBNN = 0; - if (filterDt == DT_BIN01 || filterDt == DT_BIN11) { - isBNN = 1; - } - DataFormat filterDf; - U32 vectorLen = 0; // Vector must contain bias. BNN has one more scale vector. 
- switch (this->convolutionType) { - case Convolution_Deconvolution: { - filterDf = DF_NCHW; - vectorLen = this->numInputs; // bias length - if (isBNN == 1) { - this->dt = dtNoQ; // BNN convolution should not be quantized further - vectorLen *= 2; // Scale has the same vector length as bias, so double the length - } - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - return NOT_SUPPORTED; - } - } - TensorDesc filterTensorDesc = tensor4df(filterDt, filterDf, - this->numInputs, this->numOutputs, - this->kernelSizeH, this->kernelSizeW); - TensorDesc vectorTensorDesc = tensor1d(dtNoQ, vectorLen); // bias data type should be the same as input and output - - std::shared_ptr modelWeightTensor(new Tensor()); - std::shared_ptr modelVectorTensor(new Tensor()); - modelWeightTensor->set_desc(filterTensorDesc); - modelVectorTensor->set_desc(vectorTensorDesc); - - if (modelPtr != nullptr) { - modelWeightTensor->alloc(); - memcpy((U8*)modelWeightTensor->get_val(), *modelPtr, tensorNumBytes(filterTensorDesc)); - *modelPtr += tensorNumBytes(filterTensorDesc); - } else { - modelWeightTensor->set_shared_ptr(std::shared_ptr(curOpWs.weight)); - } - - U8* biasVal = NULL; - if(modelPtr != nullptr) { - if(this->hasBias){ - biasVal = *modelPtr; - *modelPtr += tensorNumBytes(vectorTensorDesc); - } - } else { - if(this->hasBias) biasVal = curOpWs.vec; - } - - if (biasVal) { - modelVectorTensor->set_shared_ptr(std::shared_ptr(biasVal)); - } else { - modelVectorTensor->alloc(); - if (isBNN == 1) { -#ifdef _USE_FP16 - F16 *vec = (F16*)modelVectorTensor->get_val(); - for (U32 i = 0; i < this->numInputs; i++) { // first half is scale - *vec = 1.0; - vec++; - } - memset(vec, 0, tensorNumBytes(vectorTensorDesc) / 2); // second half is bias -#endif - } else { - memset((U8*)modelVectorTensor->get_val(), 0, tensorNumBytes(vectorTensorDesc)); - } - } - - this->weightTensors.push_back(*modelWeightTensor.get()); - this->biasTensors.push_back(*modelVectorTensor.get()); - return SUCCESS; - } - - void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - - Tensor filterTensor = this->weightTensors[0]; - TensorDesc filterDesc = filterTensor.get_desc(); - - ConvolutionDesc convDesc = create_convDesc(this->strideH, this->strideW, - this->paddingT, this->paddingB, this->paddingL, this->paddingR, this->dilateH, this->dilateW); - - TensorDesc scaleDesc = filterDesc; // Dummy initialization - U8 *scalePtr = nullptr; - - Tensor biasTensor = this->biasTensors[0]; - TensorDesc biasDesc = biasTensor.get_desc(); - U8 *biasPtr = (U8*)biasTensor.get_val(); - - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - - switch (this->convolutionType) { - case Convolution_Deconvolution: { - if (filterDesc.dt == DT_BIN01 || filterDesc.dt == DT_BIN11) { - CHECK_STATUS(NOT_SUPPORTED); - } - CHECK_STATUS(deconvolution(inputDesc, inputTensor.get_val(), - filterDesc, filterTensor.get_val(), - convDesc, this->pwAlg, - scaleDesc, scalePtr, - biasDesc, biasPtr, - this->lenOfTemp, this->temp->get_val(), - outputDesc, (void*)outputTensor.get_val(), - this->pwActivationDesc, this->schedule)); - break; - } - default: { - std::cerr << "[ERROR] unsupported deconvolution type " << convolutionType << std::endl; - exit(1); - } - } - - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - EE infer_forward_algorithm() - { - TensorDesc inputDesc = (this->inputTensors[0]).get_desc(); - TensorDesc filterDesc = 
(this->weightTensors[0]).get_desc(); - - ConvolutionPolicy policy = CONVOLUTION_FASTEST; - ConvolutionDesc convDesc = create_convDesc(this->strideH, this->strideW, - this->paddingT, this->paddingB, this->paddingL, this->paddingR, this->dilateH, this->dilateW); - - DataType targetType = filterDesc.dt; - switch (this->convolutionType) { - case Convolution_Deconvolution: { - CHECK_STATUS(deconvolution_infer_forward_algorithm(inputDesc, filterDesc, - this->outputTensors[0].get_desc(), - convDesc, policy, &(this->pwAlg), targetType, this->schedule)); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - return SUCCESS; - } - - EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - TensorDesc inDim = inDims[0]; - DataType idt; - DataFormat idf; - U32 in, ic, ih, iw; - CHECK_STATUS(tensor4dGet(inDim, &idt, &idf, &in, &ic, &ih, &iw)); - this->numInputs = ic; - - TensorDesc filterDim = tensor4df(this->dt, DF_NCHW, this->numInputs, this->numOutputs, this->kernelSizeH, - this->kernelSizeW); - - ConvolutionDesc convDesc = create_convDesc(this->strideH, this->strideW, - this->paddingT, this->paddingB, this->paddingL, this->paddingR, this->dilateH, this->dilateW); - - DataType targetType = this->dt; - if (DT_F16_8Q == this->dt) { - targetType = DT_I8; - } - - U32 outBytes = 0; - CHECK_STATUS(deconvolution_infer_output_size(inDim, filterDim, convDesc, &((*outDims)[0]), targetType, &outBytes)); - return SUCCESS; - } - - U32 infer_tmp_memory_size() override - { - TensorDesc inputDesc = (this->inputTensors[0]).get_desc(); - TensorDesc filterDesc = (this->weightTensors[0]).get_desc(); - TensorDesc outputDesc = (this->outputTensors[0]).get_desc(); - ConvolutionDesc convDesc = create_convDesc(this->strideH, this->strideW, - this->paddingT, this->paddingB, this->paddingL, this->paddingR, this->dilateH, this->dilateW); - - U32 bytes = 0; - switch (this->convolutionType) { - case Convolution_Deconvolution: { - CHECK_STATUS(deconvolution_infer_forward_tmp_bytes(inputDesc, filterDesc, outputDesc, convDesc, this->pwAlg, &bytes, this->schedule)); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - return bytes; - } - - U32 infer_wtm_memory_size() override - { - TensorDesc filterDesc = (this->weightTensors[0]).get_desc(); - U32 bytes = 0; - switch (this->convolutionType) { - case Convolution_Deconvolution: { - CHECK_STATUS(deconvolution_transform_filter_bytes(filterDesc, this->pwAlg, &bytes, this->schedule)); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - return bytes; - } - - EE transform_filter() - { - this->wtm = std::shared_ptr(new Tensor()); - Tensor filterTensor = this->weightTensors[0]; - TensorDesc filterDesc = filterTensor.get_desc(); - U8* weightPtr = filterTensor.get_val(); - - TensorDesc wtmDesc; - - auto wtmBytes = this->infer_wtm_memory_size(); - std::shared_ptr sPtr((U8*) operator new(wtmBytes)); - auto cpuMem = new CpuMemory(); - cpuMem->set_shared_ptr_caster(sPtr); - Memory_* mem = (Memory_*)(cpuMem); - std::shared_ptr memsPtr(mem); - this->set_wtm_memory(wtmBytes, memsPtr); - - switch (this->convolutionType) { - case Convolution_Deconvolution: { - CHECK_STATUS(deconvolution_transform_filter(filterDesc, weightPtr, this->pwAlg, &wtmDesc, this->get_wtm()->get_val(), this->schedule)); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - this->get_wtm()->set_desc(wtmDesc); - this->weightTensors[0] = *this->get_wtm(); - return SUCCESS; - } - -public: - U32 numOutputs; - U32 numInputs; - U32 kernelSizeH; - U32 kernelSizeW; - U32 
strideH; - U32 strideW; - U32 paddingT; - U32 paddingB; - U32 paddingL; - U32 paddingR; - ConvolutionMode convolutionType; - U32 group; - U32 dilateH; - U32 dilateW; - - ActivationDesc dwActivationDesc; - ActivationDesc pwActivationDesc; - - ConvolutionForwardAlgorithm pwAlg; - DepthwiseConvolutionForwardAlgorithm dwAlg; -}; - -#endif //_DECONVOLUTION_H diff --git a/inference/include/depth2space.hpp b/inference/include/depth2space.hpp deleted file mode 100644 index 8dca41ce..00000000 --- a/inference/include/depth2space.hpp +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _DEPTH2SPACE_H -#define _DEPTH2SPACE_H - -#include "operator.hpp" - -class Depth2Space: public Operator -{ -public: - /** - @param mode - */ - Depth2Space(DataType dt) - { - this->dt = dt; - } - - OperatorType get_op_type() override - { - return OT_Depth2Space; - } - -}; - -#endif //_DEPTH2SPACE_H diff --git a/inference/include/detection_output.hpp b/inference/include/detection_output.hpp deleted file mode 100644 index aef2e14c..00000000 --- a/inference/include/detection_output.hpp +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
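A worked example of the descriptor that `Deconvolution::create_convDesc()` above assembles. The numbers describe a typical 2x-upsampling deconvolution (stride 2, 4x4 kernel, 1-pixel padding, no dilation) and are purely illustrative; `deconvolution_infer_output_size()` remains the authority on output shapes:

```cpp
#include "tensor_computing.h"

ConvolutionDesc example_upsample_desc()
{
    ConvolutionDesc convDesc;
    convDesc.stride_h = 2;
    convDesc.stride_w = 2;
    convDesc.padding_top = 1;
    convDesc.padding_bottom = 1;
    convDesc.padding_left = 1;
    convDesc.padding_right = 1;
    convDesc.dilatedRate_h = 1;
    convDesc.dilatedRate_w = 1;
    // Standard transposed-convolution arithmetic: out = stride * (in - 1)
    // + kernel - padT - padB, so a 16x16 input with a 4x4 kernel gives
    // 2 * (16 - 1) + 4 - 1 - 1 = 32, i.e. 2x upsampling.
    return convDesc;
}
```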
- - -#ifndef _DETECTION_OUTPUT_H -#define _DETECTION_OUTPUT_H -#include "operator.hpp" -#include "tensor_computing.h" - -class DetectionOutput: public Operator { -public: - DetectionOutput(DataType dt, DetectionOutputDesc detectionoutputDesc) - { - this->dt = dt; - this->detectionoutputDesc = detectionoutputDesc; - } - - OperatorType get_op_type() override - { - return OT_DetectionOutput; - } - - void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - - Vec inputDesc; - Vec inputPtr; - - for (Tensor tensorIn: this->inputTensors) { - inputDesc.push_back(tensorIn.get_desc()); - inputPtr.push_back((void*)tensorIn.get_val()); - } - auto outputDesc = this->outputTensors[0].get_desc(); - auto outputPtr = this->outputTensors[0].get_val(); - - CHECK_STATUS(detectionoutput(inputDesc, inputPtr, this->detectionoutputDesc, outputDesc, outputPtr, this->schedule)); - - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - EE infer_output_tensors_size(VecinDims, Vec* outDims) override - { - CHECK_STATUS(detectionoutput_infer_output_size(inDims, this->detectionoutputDesc, &((*outDims)[0]), this->schedule)); - return SUCCESS; - } -protected: - DetectionOutputDesc detectionoutputDesc; -}; -#endif //_DETECTION_OUTPUT_H \ No newline at end of file diff --git a/inference/include/eltwise.hpp b/inference/include/eltwise.hpp deleted file mode 100644 index b307ffd6..00000000 --- a/inference/include/eltwise.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -/** - * Project deploy - */ - - -#ifndef _ELTWISE_H -#define _ELTWISE_H - -#include "operator.hpp" - -class Eltwise: public Operator { -public: - Eltwise(EltwiseMode eltMode, I32 coeffSize, F32* coeffValues) - { - this->eltMode = eltMode; - this->coeffSize = coeffSize; - this->coeffValues = coeffValues; - this->lenOfTemp = 0; - } - - OperatorType get_op_type() override - { - return OT_Eltwise; - } - -protected: - EltwiseMode eltMode; - I32 coeffSize; - F32* coeffValues; -}; - -#endif //_ELTWISE_H diff --git a/inference/include/embedding.hpp b/inference/include/embedding.hpp deleted file mode 100644 index dd293f28..00000000 --- a/inference/include/embedding.hpp +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _EMBEDDING_H -#define _EMBEDDING_H -#include "weight_operator.hpp" -#include "tensor_computing.h" - -class Embedding: public WeightOperator { -public: - Embedding(DataType dt, U32 inputDim, U32 numOutput, bool transpose) - { - this->dt = dt; - this->inputDim = inputDim; - this->numOutput = numOutput; - this->transpose = transpose; - } - - OperatorType get_op_type() override - { - return OT_Embedding; - } - - virtual EE init_weight_bias_from_model(U8** modelPtr) { - UNUSED(modelPtr); - return NOT_SUPPORTED; - } -protected: - U32 inputDim; - U32 numOutput; - bool transpose; -}; - -#endif //_EMBEDDING__H diff --git a/inference/include/factory.hpp b/inference/include/factory.hpp deleted file mode 100644 index 9f1ca078..00000000 --- a/inference/include/factory.hpp +++ /dev/null @@ -1,536 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
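factory.hpp, which follows, centralizes operator construction: `Factory::createOperators()` switches on the parsed `OperatorType` and forwards each `OperatorSpec` to the matching pure-virtual `create*` call, so the CPU and GPU factory subclasses choose the concrete classes. A minimal sketch of that dispatch shape, reduced to two cases for illustration (`create_subset` is a hypothetical name; the real switch below covers every supported operator):

```cpp
#include "factory.hpp"

std::shared_ptr<Operator> create_subset(Factory *factory, OperatorSpec curOps, DataType dt)
{
    switch (curOps.type) {
        case OT_Softmax:
            return factory->createSoftmax(dt, curOps.ps.softmax_spec.axis);
        case OT_Concat:
            return factory->createConcat(curOps.ps.concat_spec.axis);
        default:
            CHECK_STATUS(NOT_SUPPORTED);  // same failure path as the real switch
            return nullptr;
    }
}
```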
- - -#ifndef _FACTORY_H -#define _FACTORY_H -#define NOT_SUPPORT Operator* cep = NULL;CHECK_STATUS(NOT_SUPPORTED); -#define NOT_USE1(a1) {UNUSED(a1);} -#define NOT_USE2(a1, a2) {NOT_USE1(a1) NOT_USE1(a2)} -#define NOT_USE3(a1, a2, a3) {NOT_USE2(a1, a2) NOT_USE1(a3)} -#define NOT_USE4(a1, a2, a3, a4) {NOT_USE2(a1, a2) NOT_USE2(a3, a4)} -#define NOT_USE5(a1, a2, a3, a4, a5) {NOT_USE4(a1, a2, a3, a4) NOT_USE1(a5)} -#define NOT_USE6(a1, a2, a3, a4, a5, a6) {NOT_USE4(a1, a2, a3, a4) NOT_USE2(a5, a6)} -#define NOT_USE8(a1, a2, a3, a4, a5, a6, a7, a8) {NOT_USE4(a1, a2, a3, a4) NOT_USE4(a5, a6, a7, a8)} -#define NOT_USE16(a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, ac, ad, ae, af, ag) {NOT_USE8(a1, a2, a3, a4, a5, a6, a7, a8) NOT_USE8(a9, aa, ab, ac, ad, ae, af, ag)} -#define OP_UNSUP(num,...) NOT_USE##num(__VA_ARGS__) NOT_SUPPORT - -class Factory { -public: - virtual ~Factory(){}; - virtual std::shared_ptr createConvolution(DataType dt, U32 nf, U32 ksizeH, U32 ksizeW, U32 kstrideH, U32 kstrideW, - U32 kpaddingT, U32 kpaddingB, U32 kpaddingL, U32 kpaddingR, - ActivationDesc dwActivationDesc, ActivationDesc pwActivationDesc, - ConvolutionMode convolutionType, U32 group, U32 dilateH, U32 dilateW) = 0; - - virtual std::shared_ptr createDeconvolution(DataType dt, U32 nf, U32 ksizeH, U32 ksizeW, U32 kstrideH, U32 kstrideW, - U32 kpaddingT, U32 kpaddingB, U32 kpaddingL, U32 kpaddingR, - ActivationDesc dwActivationDesc, ActivationDesc pwActivationDesc, - ConvolutionMode convolutionType, U32 group, U32 dilateH, U32 dilateW) = 0; - - virtual std::shared_ptr createPooling(PoolingMode mode, U32 ksH, U32 ksW, U32 strideH, U32 strideW, - U32 paddingT, U32 paddingB, U32 paddingL, U32 paddingR, RoundMode rm) = 0; - - virtual std::shared_ptr createFullyConnected(DataType dt, U32 numInput, U32 numOutput, - U32 numSlice, I32* slicePoint) = 0; - - virtual std::shared_ptr createSoftmax(DataType dt, int axis) = 0; - - virtual std::shared_ptr createConcat(int axis) = 0; - - virtual std::shared_ptr createActivation(ActivationDesc activationDesc) = 0; - - virtual std::shared_ptr createEltwise(EltwiseMode eltMode, I32 coeffSize, F32* coeffValues) = 0; - - virtual std::shared_ptr createScale(DataType dt, int axis, int numChannels, int numSource) = 0; - - virtual std::shared_ptr createLSTM(DataType dt, U32 numOutput, U32 numProjection, F32 zoneoutCell, F32 zoneoutOutput, bool biDirection) = 0; - - virtual std::shared_ptr createLSTMCell(DataType dt, U32 numOutput, U32 numProjection, F32 zoneoutCell, F32 zoneoutOutput, bool biDirection) = 0; - - virtual std::shared_ptr createLSTM(DataType dt, U32 numOutput, U32 numProjection, F32 zoneoutCell, F32 zoneoutOutput, I32 steps) = 0; - - virtual std::shared_ptr createEmbedding(DataType dt, U32 inputDim, U32 numOutput, bool transpose) = 0; - - virtual std::shared_ptr createMultiply(DataType dt, F32 scale, F32 bias) = 0; - - virtual std::shared_ptr createMatMul(DataType dt, bool transposeA, bool transposeB) = 0; - - virtual std::shared_ptr createLayerNorm(DataType dt, U32 weightNum) = 0; - - virtual std::shared_ptr createReshape(DataType dt, I32* shapeDims, I32 shapeSize, I32 axis, I32 numAxes) = 0; - - virtual std::shared_ptr createResize(DataType paramDT, void* paramPtr) = 0; - - virtual std::shared_ptr createSlice(DataType dt, I32 axis, I32* slicePoints, U32 sliceSize) = 0; - - virtual std::shared_ptr createTranspose(DataType dt, U32* transDims, U32 transSize) = 0; - - virtual std::shared_ptr createAttention(DataType dt, U32 numHeads, U32 fromSequenceLength, U32 toSequenceLength) = 
0; - - virtual std::shared_ptr createClip(DataType dt, F32 clipMinScalar, F32 clipMaxScalar) = 0; - - virtual std::shared_ptr createSqueeze(DataType dt, I32 axis, I32 *dims, I32 dimSize) = 0; - - virtual std::shared_ptr createUnsqueeze(DataType dt, I32 axis, I32 *dims, I32 dimSize) = 0; - - virtual std::shared_ptr createReduction(DataType dt, I32 axis, bool keepDim, ReductionMode reductionMode, float coeff) = 0; - - virtual std::shared_ptr createArgMax(DataType dt, I32 axis) = 0; - - virtual std::shared_ptr createCopy(DataType dt, I32 *srcDims, I32 *dstDims, I32 length) = 0; - - virtual std::shared_ptr createCheck(DataType dt, CheckMode checkMode) = 0; - - virtual std::shared_ptr createRepeat(DataType dt, I32 loops, I32 axis, - I32 jumpOperatorIndex, I32 currentOperatorIndex) = 0; - - virtual std::shared_ptr createBilateralSliceApply(U32 coefficiency_len, - bool has_offset, BilateralSliceApplyMode mode) = 0; - - virtual std::shared_ptr createPreAllocatedMemory(DataType dt, TensorDesc desc) = 0; - - virtual std::shared_ptr createSharedWeight(DataType dt, TensorDesc desc) = 0; - - virtual std::shared_ptr createJump(DataType dt, I32 jumpOperatorIndex, I32 currentOperatorIndex) = 0; - - virtual std::shared_ptr createSpace2Depth(DataType dt) = 0; - - virtual std::shared_ptr createDepth2Space(DataType dt) = 0; - - virtual std::shared_ptr createAttentionMask(DataType dt, I32 attentionLength, - bool sameLength, float mask) = 0; - - virtual std::shared_ptr createRelativePositionEmbedding(DataType dt, U32 inputDim, - U32 numOutput, bool transpose, I32 axis) = 0; - - virtual std::shared_ptr createRelativeShift(DataType dt, I32 axis, - I32 shiftLength) = 0; - - virtual std::shared_ptr createPadding(DataType dt, PadDesc padDesc) = 0; - - virtual std::shared_ptr createPriorBox(DataType dt, PriorBoxDesc priorboxDesc) = 0; - - virtual std::shared_ptr createDetectionOutput(DataType dt, DetectionOutputDesc detectionoutputDesc) = 0; - - std::shared_ptr createOperators(OperatorSpec curOps, DataType dt, HashMap operatorIndexMap, Vec inputTensorsName) { - OperatorType opType = curOps.type; - DataType dtNoQ = (dt == DT_F16_8Q) ? 
DT_F16 : dt; - std::string opName = curOps.name; - std::shared_ptr op; - switch (opType) { - case OT_Conv: { - ConvolutionParamSpec curConvParamSpec = curOps.ps.conv_spec; - U32 nf = curConvParamSpec.num_outputs; - U32 ksizeH = curConvParamSpec.kernel_size_h; - U32 ksizeW = curConvParamSpec.kernel_size_w; - ConvolutionMode curConvolutionType = curConvParamSpec.convolution_type; - U32 group = curConvParamSpec.group; - U32 dilateH = curConvParamSpec.dilatedRate_h; - U32 dilateW = curConvParamSpec.dilatedRate_w; - U32 kstrideH = curConvParamSpec.stride_h; - U32 kstrideW = curConvParamSpec.stride_w; - U32 paddingT = curConvParamSpec.padding_top; - U32 paddingB = curConvParamSpec.padding_bottom; - U32 paddingL = curConvParamSpec.padding_left; - U32 paddingR = curConvParamSpec.padding_right; - ActivationDesc dwActiveDesc; - ActivationDesc pwActiveDesc; - dwActiveDesc.mode = curConvParamSpec.dw_activation_type; - pwActiveDesc.mode = curConvParamSpec.pw_activation_type; - dwActiveDesc.value[0] = 0; - pwActiveDesc.value[0] = 0; - op = createConvolution(dt, nf, ksizeH, ksizeW, kstrideH, kstrideW, - paddingT, paddingB, paddingL, paddingR, - dwActiveDesc, pwActiveDesc, curConvolutionType, group, dilateH, dilateW); - break; - } - case OT_Deconvolution: { - ConvolutionParamSpec curConvParamSpec = curOps.ps.conv_spec; - U32 nf = curConvParamSpec.num_outputs; - U32 ksizeH = curConvParamSpec.kernel_size_h; - U32 ksizeW = curConvParamSpec.kernel_size_w; - ConvolutionMode curConvolutionType = curConvParamSpec.convolution_type; - U32 group = curConvParamSpec.group; - U32 dilateH = curConvParamSpec.dilatedRate_h; - U32 dilateW = curConvParamSpec.dilatedRate_w; - U32 kstrideH = curConvParamSpec.stride_h; - U32 kstrideW = curConvParamSpec.stride_w; - U32 paddingT = curConvParamSpec.padding_top; - U32 paddingB = curConvParamSpec.padding_bottom; - U32 paddingL = curConvParamSpec.padding_left; - U32 paddingR = curConvParamSpec.padding_right; - ActivationDesc dwActiveDesc; - ActivationDesc pwActiveDesc; - dwActiveDesc.mode = curConvParamSpec.dw_activation_type; - pwActiveDesc.mode = curConvParamSpec.pw_activation_type; - dwActiveDesc.value[0] = 0; - pwActiveDesc.value[0] = 0; - op = createDeconvolution(dtNoQ, nf, ksizeH, ksizeW, kstrideH, kstrideW, - paddingT, paddingB, paddingL, paddingR, - dwActiveDesc, pwActiveDesc, curConvolutionType, group, dilateH, dilateW); - break; - } - case OT_FC: { - FullyConnectedParamSpec curFcParamSpec = curOps.ps.fc_spec; - I32 curNumOutput = curFcParamSpec.num_outputs; - I32 curNumSlice = curFcParamSpec.num_slices; - op = createFullyConnected(dt, 0, curNumOutput, curNumSlice, curFcParamSpec.slice_point); - break; - } - case OT_Pooling: { - PoolingParamSpec curPoolingParamSpec = curOps.ps.pooling_spec; - PoolingMode mode = curPoolingParamSpec.mode; - U32 ksH = curPoolingParamSpec.kernel_size_h; - U32 ksW = curPoolingParamSpec.kernel_size_w; - U32 strideH = curPoolingParamSpec.stride_h; - U32 strideW = curPoolingParamSpec.stride_w; - U32 paddingT = curPoolingParamSpec.padding_top; - U32 paddingB = curPoolingParamSpec.padding_bottom; - U32 paddingL = curPoolingParamSpec.padding_left; - U32 paddingR = curPoolingParamSpec.padding_right; - RoundMode rm = curPoolingParamSpec.rm; - op = createPooling(mode, ksH, ksW, strideH, strideW, paddingT, paddingB, paddingL, paddingR, rm); - break; - } - case OT_Softmax: { - SoftmaxParamSpec curSoftmaxParamSpec = curOps.ps.softmax_spec; - I32 axis = curSoftmaxParamSpec.axis; - op = createSoftmax(dtNoQ, axis); - break; - } - case OT_Relu: { - 
ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_RELU; - activationDesc.value[0] = curOps.ps.relu_spec.neg_slope; - op = createActivation(activationDesc); - break; - } - case OT_Relu6: { - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_RELU6; - op = createActivation(activationDesc); - break; - } - case OT_HSwish: { - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_H_SWISH; - op = createActivation(activationDesc); - break; - } - case OT_Sigmoid: { - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_SIGMOID; - op = createActivation(activationDesc); - break; - } - case OT_HSigmoid: { - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_H_SIGMOID; - op = createActivation(activationDesc); - break; - } - case OT_Gelu: { - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_GELU; - op = createActivation(activationDesc); - break; - } - case OT_TanH: { - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_TANH; - op = createActivation(activationDesc); - break; - } - case OT_Concat: { - ConcatParamSpec curConcatParamSpec = curOps.ps.concat_spec; - I32 axis = curConcatParamSpec.axis; - op = createConcat(axis); - break; - } - case OT_Eltwise: { - EltwiseParamSpec curEltwiseParamSpec = curOps.ps.eltwise_spec; - EltwiseMode curEltMode = curEltwiseParamSpec.elt_mode; - EltwiseSumSpec curEltSumSpec = curEltwiseParamSpec.elt_sum_spec; - op = createEltwise(curEltMode, curEltSumSpec.coeff_size, curEltSumSpec.coeff_values); - break; - } - case OT_Embedding: { - EmbedParamSpec curEmbedParamSpec = curOps.ps.embed_spec; - U32 curInputDim = curEmbedParamSpec.input_dim; - U32 curNumOutput = curEmbedParamSpec.num_output; - bool curTranspose = curEmbedParamSpec.transpose; - op = createEmbedding(dtNoQ, curInputDim, curNumOutput, curTranspose); - break; - } - case OT_MatMul: { - MatMulParamSpec curMatMulParamSpec = curOps.ps.matmul_spec; - bool transposeA = curMatMulParamSpec.transpose_a; - bool transposeB = curMatMulParamSpec.transpose_b; - op = createMatMul(dt, transposeA, transposeB); - break; - } - case OT_Multiply: { - MultiplyParamSpec curMultiplyParamSpec = curOps.ps.multiply_spec; - F32 scale = curMultiplyParamSpec.scale; - F32 bias = curMultiplyParamSpec.bias; - op = createMultiply(dt, scale, bias); - break; - } - case OT_Scale: { - ScaleParamSpec curScaleParamSpec = curOps.ps.scale_spec; - I32 num = curScaleParamSpec.num_concat; - I32 axis = curScaleParamSpec.axis; - op = createScale(dtNoQ, axis, 0, num); - break; - } - case OT_LayerNorm: { - op = createLayerNorm(dt, 0); - break; - } - case OT_Reshape: { - ReshapeParamSpec curReshapeParamSpec = curOps.ps.reshape_spec; - I32* curShapeDims = curReshapeParamSpec.shape_dims; - I32 curShapeSize = curReshapeParamSpec.shape_size; - I32 curAxis = curReshapeParamSpec.axis; - I32 curNumAxes = curReshapeParamSpec.num_axes; - op = createReshape(dt, curShapeDims, curShapeSize, curAxis, curNumAxes); - break; - } - case OT_Upsample: { - UpsampleParamSpec curUpsampleParamSpec = curOps.ps.upsample_spec; - F32* paramPtr = curUpsampleParamSpec.scale; - op = createResize(DT_F32, paramPtr); - break; - } - case OT_Interp: { - InterpParamSpec curInterpParamSpec = curOps.ps.interp_spec; - U32 size[2]; - size[0] = curInterpParamSpec.height; - size[1] = curInterpParamSpec.width; - op = createResize(DT_U32, size); - break; - } - case OT_Slice: { - SliceParamSpec curSliceParamSpec = curOps.ps.slice_spec; - I32 curAxis = curSliceParamSpec.axis; - I32* curSlicePoints = 
curSliceParamSpec.slice_points; - I32 curSliceSize = curSliceParamSpec.slice_size; - op = createSlice(dt, curAxis, curSlicePoints, curSliceSize); - break; - } - case OT_Transpose: { - TransposeParamSpec curTransposeSpec = curOps.ps.transpose_spec; - U32* curTransDimsPtr = curTransposeSpec.trans_dims; - U32 curTransSize = curTransposeSpec.trans_size; - op = createTranspose(dt, curTransDimsPtr, curTransSize); - break; - } - case OT_Attention: { - AttentionParamSpec curAttentionSpec = curOps.ps.attention_spec; - U32 numHeads = curAttentionSpec.num_heads; - U32 fromSequenceLength = curAttentionSpec.from_sequence_length; - U32 toSequenceLength = curAttentionSpec.to_sequence_length; - op = createAttention(dtNoQ, numHeads, fromSequenceLength, toSequenceLength); - break; - } - case OT_Clip: { - ClipParamSpec curClipSpec = curOps.ps.clip_spec; - F32 curClipMinScalar = curClipSpec.min; - F32 curClipMaxScalar = curClipSpec.max; - op = createClip(dtNoQ, curClipMinScalar, curClipMaxScalar); - break; - } - case OT_LSTM: { - LSTMParamSpec curLSTMParamSpec = curOps.ps.lstm_spec; - U32 numOutput = curLSTMParamSpec.num_output; - U32 numProjection = curLSTMParamSpec.num_projection; - F32 zoneoutCell = curLSTMParamSpec.zoneout_cell; - F32 zoneoutOutput = curLSTMParamSpec.zoneout_output; - I32 steps = curLSTMParamSpec.steps; - op = createLSTM(dtNoQ, numOutput, numProjection, zoneoutCell, zoneoutOutput, steps); - break; - } - case OT_Squeeze: { - SqueezeParamSpec curSqueezeParamSpec = curOps.ps.squeeze_spec; - I32 axis = curSqueezeParamSpec.axis; - I32 *squeezeAxes = curSqueezeParamSpec.squeeze_axes; - I32 numAxes = curSqueezeParamSpec.axes_num; - op = createSqueeze(dtNoQ, axis, squeezeAxes, numAxes); - break; - } - case OT_Unsqueeze: { - UnsqueezeParamSpec curUnsqueezeParamSpec = curOps.ps.unsqueeze_spec; - I32 axis = curUnsqueezeParamSpec.axis; - I32 *unsqueezeAxes = curUnsqueezeParamSpec.unsqueeze_axes; - I32 numAxes = curUnsqueezeParamSpec.axes_num; - op = createUnsqueeze(dtNoQ, axis, unsqueezeAxes, numAxes); - break; - } - case OT_Reduction: { - ReductionParamSpec curReductionParamSpec = curOps.ps.reduction_spec; - I32 axis = curReductionParamSpec.axis; - bool keepDim = curReductionParamSpec.keep_dim; - ReductionMode reductionMode = curReductionParamSpec.reduction_mode; - float coeff = curReductionParamSpec.coeff; - op = createReduction(dtNoQ, axis, keepDim, reductionMode, coeff); - break; - } - case OT_ArgMax: { - ArgMaxParamSpec curArgMaxParamSpec = curOps.ps.argmax_spec; - I32 axis = curArgMaxParamSpec.axis; - op = createArgMax(dtNoQ, axis); - break; - } - case OT_PreAllocatedMemory: { - PreAllocatedMemoryParamSpec curPreAllocatedMemoryParamSpec = curOps.ps.preallocated_memory_spec; - TensorDesc desc= curPreAllocatedMemoryParamSpec.desc; - op = createPreAllocatedMemory(dtNoQ, desc); - break; - } - case OT_SharedWeight: { - SharedWeightParamSpec curSharedWeightParamSpec = curOps.ps.shared_weight_spec; - TensorDesc desc = curSharedWeightParamSpec.desc; - op = createSharedWeight(dtNoQ, desc); - break; - } - case OT_Repeat: { - RepeatParamSpec curRepeatParamSpec = curOps.ps.repeat_spec; - I32 loops = curRepeatParamSpec.loops; - I32 axis = curRepeatParamSpec.axis; - op = createRepeat(dtNoQ, loops, axis, operatorIndexMap[inputTensorsName[0]], operatorIndexMap[opName]); - break; - } - case OT_Check: { - CheckParamSpec curCheckParamSpec = curOps.ps.check_spec; - CheckMode checkMode = curCheckParamSpec.check_mode; - op = createCheck(dtNoQ, checkMode); - break; - } - case OT_Copy: { - CopyParamSpec 
curCopyParamSpec = curOps.ps.copy_spec; - I32 *srcDims = curCopyParamSpec.src_dims; - I32 *dstDims = curCopyParamSpec.dst_dims; - I32 length = curCopyParamSpec.length; - op = createCopy(dtNoQ, srcDims, dstDims, length); - break; - } - case OT_BilateralSliceApply: { - BilateralSliceApplyParamSpec curBilateralSliceApplyParamSpec = curOps.ps.bilateral_slice_apply_spec; - U32 coefficient_len = curBilateralSliceApplyParamSpec.coefficient_len; - bool has_offset = curBilateralSliceApplyParamSpec.has_offset; - BilateralSliceApplyMode mode = curBilateralSliceApplyParamSpec.mode; - op = createBilateralSliceApply(coefficient_len, has_offset, mode); - break; - } - case OT_Jump: { - op = createJump(dtNoQ, operatorIndexMap[inputTensorsName[0]], operatorIndexMap[opName]); - break; - } - case OT_Space2Depth: { - op = createSpace2Depth(dt); - break; - } - case OT_Depth2Space: { - op = createDepth2Space(dt); - break; - } - case OT_AttentionMask: { - AttentionMaskParamSpec curAttentionMaskParamSpec = curOps.ps.attention_mask_spec; - I32 attention_length = curAttentionMaskParamSpec.attention_length; - bool same_length = curAttentionMaskParamSpec.same_length; - float mask = curAttentionMaskParamSpec.mask; - op = createAttentionMask(dt, attention_length, same_length, mask); - break; - } - case OT_RelativePositionEmbedding: { - RelativePositionEmbedParamSpec curRelativePositionEmbedParamSpec = curOps.ps.relative_position_embed_spec; - U32 curInputDim = curRelativePositionEmbedParamSpec.input_dim; - U32 curNumOutput = curRelativePositionEmbedParamSpec.num_output; - bool curTranspose = curRelativePositionEmbedParamSpec.transpose; - I32 axis = curRelativePositionEmbedParamSpec.axis; - op = createRelativePositionEmbedding(dtNoQ, curInputDim, curNumOutput, curTranspose, axis); - break; - } - case OT_RelativeShift: { - RelativeShiftParamSpec curRelativeShiftParamSpec = curOps.ps.relative_shift_spec; - I32 axis = curRelativeShiftParamSpec.axis; - I32 shift_length = curRelativeShiftParamSpec.shift_length; - op = createRelativeShift(dt, axis, shift_length); - break; - } - case OT_Pad: { - PadParamSpec curPadParamSpec = curOps.ps.pad_spec; - PadDesc padDesc; - padDesc.top = curPadParamSpec.top; - padDesc.bottom = curPadParamSpec.bottom; - padDesc.left = curPadParamSpec.left; - padDesc.right = curPadParamSpec.right; - padDesc.constant_value = curPadParamSpec.constant_value; - padDesc.pad_mode = curPadParamSpec.pad_mode; - op = createPadding(dt, padDesc); - break; - } - case OT_PriorBox: { - PriorBoxParamSpec curPriorBoxParamSpec = curOps.ps.prior_box_spec; - PriorBoxDesc priorboxDesc; - for(int i = 0; i < 2; i++ ){ - if(curPriorBoxParamSpec.min_sizes[i] == 0) - break; - priorboxDesc.min_sizes.push_back(curPriorBoxParamSpec.min_sizes[i]); - } - for(int i = 0; i < 2; i++ ){ - if(curPriorBoxParamSpec.max_sizes[i] == 0) - break; - priorboxDesc.max_sizes.push_back(curPriorBoxParamSpec.max_sizes[i]); - } - for(int i = 0; i < 2; i++ ){ - if(curPriorBoxParamSpec.aspect_ratios[i] == 0) - break; - priorboxDesc.aspect_ratios.push_back(curPriorBoxParamSpec.aspect_ratios[i]); - } - priorboxDesc.flip = curPriorBoxParamSpec.flip; - priorboxDesc.clip = curPriorBoxParamSpec.clip; - for(int i = 0; i < 4; i++){ - priorboxDesc.variances[i] = curPriorBoxParamSpec.variances[i]; - } - priorboxDesc.image_h = curPriorBoxParamSpec.image_h; - priorboxDesc.image_w = curPriorBoxParamSpec.image_w; - priorboxDesc.step_h = curPriorBoxParamSpec.step_h; - priorboxDesc.step_w = curPriorBoxParamSpec.step_w; - priorboxDesc.offset = 
curPriorBoxParamSpec.offset; - op = createPriorBox(dt, priorboxDesc); - break; - } - case OT_DetectionOutput: { - DetectionOutputParamSpec curDetectionoutputParamSpec = curOps.ps.detection_output_spec; - DetectionOutputDesc detectionoutputDesc; - detectionoutputDesc.num_class = curDetectionoutputParamSpec.num_class; - detectionoutputDesc.nms_threshold = curDetectionoutputParamSpec.nms_threshold; - detectionoutputDesc.nms_top_k = curDetectionoutputParamSpec.nms_top_k; - detectionoutputDesc.keep_top_k = curDetectionoutputParamSpec.keep_top_k; - detectionoutputDesc.confidence_threshold = curDetectionoutputParamSpec.confidence_threshold; - op = createDetectionOutput(dt, detectionoutputDesc); - break; - } - default: { - std::cerr << "[ERROR] unsupported layer " << OperatorTypeName()[opType] << std::endl; - exit(1); - break; - } - } - return op; - } -}; - -#endif //_FACTORY_H diff --git a/inference/include/fully_connected.hpp b/inference/include/fully_connected.hpp deleted file mode 100644 index 5236d6a8..00000000 --- a/inference/include/fully_connected.hpp +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -/** - * Project deploy - */ -#ifndef _FCELTWISE_H -#define _FCELTWISE_H - -#include "weight_operator.hpp" -#include "tensor_computing.h" - -class FullyConnected: public WeightOperator { -public: - FullyConnected(DataType dt, U32 numInput, U32 numOutput, - U32 numSlice, I32* slicePoint) - { - this->dt = dt; - this->numInput = numInput; - this->numOutput = numOutput; - this->numSlice = numSlice; - slicePoints = Vec(numSlice); - memcpy(slicePoints.data(), slicePoint, sizeof(I32)*numSlice); - this->hasBias = false; - } - - OperatorType get_op_type() override - { - return OT_FC; - } - - virtual EE init_weight_bias_from_model(U8** modelPtr) = 0; - virtual EE transform_filter() = 0; - virtual EE infer_forward_algorithm(HashMap &algorithmMap) { - UNUSED(algorithmMap); - return SUCCESS; - } -public: - U32 numInput; - U32 numOutput; - U32 numSlice; - Vec slicePoints; -}; - -#endif //_FCELTWISE_H diff --git a/inference/include/inference.hpp b/inference/include/inference.hpp deleted file mode 100644 index 461c06dd..00000000 --- a/inference/include/inference.hpp +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
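The factory switch deleted above maps each OperatorType tag to a createXxx call, with a fail-fast default for unsupported layers. A minimal, self-contained sketch of the same create-by-enum pattern follows; every name in it (OpType, createOp, the Op structs) is an illustrative stand-in, not Bolt's real API:

    // Sketch of the create-by-enum factory pattern; all names are hypothetical.
    #include <iostream>
    #include <memory>

    enum class OpType { Relu, Concat, MatMul };

    struct Op { virtual ~Op() = default; };
    struct ReluOp : Op {};
    struct ConcatOp : Op { int axis; explicit ConcatOp(int a) : axis(a) {} };
    struct MatMulOp : Op { bool ta, tb; MatMulOp(bool a, bool b) : ta(a), tb(b) {} };

    std::shared_ptr<Op> createOp(OpType t) {
        switch (t) {
            case OpType::Relu:   return std::make_shared<ReluOp>();
            case OpType::Concat: return std::make_shared<ConcatOp>(/*axis=*/1);
            case OpType::MatMul: return std::make_shared<MatMulOp>(false, true);
            default:  // the deleted factory prints the layer name and exits here
                std::cerr << "[ERROR] unsupported layer" << std::endl;
                return nullptr;
        }
    }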
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _HPP_INFERENCE -#define _HPP_INFERENCE - -#include "cnn.hpp" -#ifdef _USE_MALI -#include "gcl.h" -#endif -#ifdef _BUILD_TEST -#include "sequential.hpp" -#endif -#include "thread_affinity.h" -#include "op_type.h" -#include "model_serialize_deserialize.hpp" -typedef enum{ - d_CPU = 0, - d_GPU = 1 -} DeviceTypeIn; - -inline HashMap extractInputDims(const ModelSpec *ms) { - HashMap inputDescMap; - int inputNum = ms->num_inputs; - for (int i = 0; i < inputNum; i++) { - inputDescMap[ms->input_names[i]] = ms->input_dims[i]; - } - return inputDescMap; -} - -inline Arch getCpuArchInfo(const char *cpuAffinityPolicyName){ - int *cpuids; - Arch *archs; - int cpuNum; - thread_affinity_init(&cpuNum, &archs, &cpuids); - CpuAffinityPolicy affinityPolicy = thread_affinity_get_policy_by_name(cpuAffinityPolicyName); - Arch arch = thread_affinity_set_by_policy(cpuNum, archs, cpuids, affinityPolicy, 0); - thread_affinity_destroy(&cpuNum, &archs, &cpuids); - return arch; -} - -inline Arch getArch(const char *cpuAffinityPolicyName, DeviceTypeIn device) -{ - Arch arch = CPU_GENERAL; - if(device == d_CPU){ - arch = getCpuArchInfo(cpuAffinityPolicyName); - } else if(device == d_GPU) { - arch = MALI; - } else { - CHECK_STATUS(NOT_SUPPORTED); - } - return arch; -} - -inline std::shared_ptr createPipelinefromMs(Arch arch, ModelSpec* ms, const char *algorithmMapPath) -{ - CNN* cnn; - cnn = new CNN(arch, ms->dt, ms->model_name); - - cnn->sort_operators_sequential(ms); - //TODO this function is not tested - //cnn ->sort_operators_tp(ms); - - // create ops - cnn->initialize_ops(ms); - - HashMap inputDescMap = extractInputDims(ms); - - cnn->loadAlgorithmMapFromText(algorithmMapPath); - - // assign space for output, tmp, bias, and trans_weight - cnn->ready(inputDescMap); - - CHECK_STATUS(cnn->mark_input_output(ms)); -#ifdef _USE_MALI - if(arch == MALI) cnn->mali_prepare(); -#endif - - cnn->saveAlgorithmMapToText(algorithmMapPath); - - return std::shared_ptr(cnn); -} - -inline std::shared_ptr createPipelineWithConfigure(const char *cpuAffinityPolicyName, - const char* modelPath, - DeviceTypeIn device, - const char *algorithmMapPath) -{ - // set cpu affinity - //TODO add mali support - Arch arch = getArch(cpuAffinityPolicyName, device); - - // deserialize model from file - ModelSpec ms; - deserialize_model_from_file(modelPath, &ms); - - std::shared_ptr pipeline = createPipelinefromMs(arch, &ms, algorithmMapPath); - - CHECK_STATUS(mt_destroy_model(&ms)); - return pipeline; -} 
- -inline std::shared_ptr<CNN> createPipeline(const char *cpuAffinityPolicyName, const char* modelPath, DeviceTypeIn device) -{ - return createPipelineWithConfigure(cpuAffinityPolicyName, modelPath, device, ""); -} - -#ifdef _BUILD_TEST -inline Sequential createSequentialPipeline(const char *cpuAffinityPolicyName, DataType dt, const char *modelName) -{ - // set cpu affinity - Arch arch = getCpuArchInfo(cpuAffinityPolicyName); - auto sequential = Sequential(arch, dt, modelName); - return sequential; -} -#endif -#endif diff --git a/inference/include/jump.hpp b/inference/include/jump.hpp deleted file mode 100644 index 07ccd39e..00000000 --- a/inference/include/jump.hpp +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _JUMP_H -#define _JUMP_H - -#include "operator.hpp" - -class Jump: public Operator -{ -public: - /** - @param mode - */ - Jump(DataType dt, I32 jumpOperatorIndex, I32 currentOperatorIndex) - { - this->dt = dt; - this->jumpOperatorIndex = jumpOperatorIndex; - this->nextOperatorIndex = currentOperatorIndex + 1; - } - - OperatorType get_op_type() override - { - return OT_Jump; - } - - void run() override - { } - - int get_next_operator_index() override - { - // check status - if (this->inputTensors.size() > 1) { - Tensor inputTensor = this->inputTensors[1]; - TensorDesc inputDesc = inputTensor.get_desc(); - I32 *ptr = (I32 *)(inputTensor.get_val()); - U32 length = tensorNumElements(inputDesc); - for (U32 i = 0; i < length; i++) { - if (ptr[i]) { - return this->jumpOperatorIndex; - } - } - } - return this->nextOperatorIndex; - } - EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override - { - UNUSED(inDims); - - (*outDims)[0].dt = this->dt; - (*outDims)[0].nDims = 0; - return SUCCESS; - } - -private: - int jumpOperatorIndex; - int nextOperatorIndex; -}; - -#endif //_JUMP_H diff --git a/inference/include/layer_norm.hpp b/inference/include/layer_norm.hpp deleted file mode 100644 index 59590fc8..00000000 --- a/inference/include/layer_norm.hpp +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
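The deleted inference.hpp was the public entry point: deserialize a .bolt model, pick an Arch from the affinity policy, and hand back a ready CNN pipeline. A hedged usage sketch, reusing only the header's own names (createPipeline, d_CPU); the model path and policy string are made-up inputs, and the run/IO calls are omitted because they live in cnn.hpp, outside this section:

    #include "inference.hpp"

    int main() {
        // returns a std::shared_ptr<CNN> that has already passed ready()
        auto pipeline = createPipeline("CPU_AFFINITY_HIGH_PERFORMANCE",
                                       "./model_f16.bolt", d_CPU);
        // feeding inputs and fetching outputs goes through the CNN interface
        return pipeline == nullptr;
    }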
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _LAYER_NORM_H -#define _LAYER_NORM_H - -#include "operator.hpp" -#include "tensor_computing.h" -#include "tensor_desc.h" -#include "op_type.h" - -class LayerNorm: public WeightOperator { -public: - LayerNorm(DataType dt, U32 weightNum){ - this->dt = dt; - this->weightNum = weightNum; - this->hasBias = false; - } - - OperatorType get_op_type() override - { - return OT_LayerNorm; - } - - virtual EE init_weight_bias_from_model(U8** modelPtr) = 0; -protected: - U32 weightNum; -}; - -#endif //_LAYER_NORM_H diff --git a/inference/include/lstm.hpp b/inference/include/lstm.hpp deleted file mode 100644 index 3d524444..00000000 --- a/inference/include/lstm.hpp +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -/** - * Project deploy - */ -#ifndef _LSTM_H -#define _LSTM_H - -#include "lstmcell.hpp" -#include "tensor_computing.h" - -class LSTM: public LSTMCell { -public: - LSTM(DataType dt, U32 numOutput, U32 numProjection, F32 zoneoutCell, F32 zoneoutOutput, bool biDirection) - :LSTMCell(dt, numOutput, numProjection, zoneoutCell, zoneoutOutput, biDirection) { - } - - void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - - Tensor weightTensor = this->weightTensors[0]; - TensorDesc weightDesc = weightTensor.get_desc(); - - Tensor biasTensor = this->biasTensors[0]; - TensorDesc biasDesc = biasTensor.get_desc(); - - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - - //NOTE: no clean tmp and output - CHECK_STATUS(lstm(inputDesc, inputTensor.get_val(), - weightDesc, weightTensor.get_val(), - biasDesc, biasTensor.get_val(), - this->lenOfTemp, this->temp->get_val(), - this->lstmDesc, - outputDesc, outputTensor.get_val(), this->schedule)); - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - TensorDesc inDim = inDims[0]; - - DataType dt; - DataFormat df; - U32 iB, inT, iX; - CHECK_STATUS(tensor3dGet(inDim, &dt, &df, &iB, &inT, &iX)); - U32 column = (this->lstmDesc.numProjection > 0) ? this->lstmDesc.numProjection : this->lstmDesc.numOutput; - this->xDim = iX; - this->filterRow = 4 * column; - this->filterCol = column + iX; - TensorDesc filter_dim = tensor2df(this->dt, DF_NK, this->filterRow, this->filterCol); - U32 outBytes = 0; - CHECK_STATUS(lstm_infer_output_size(inDim, filter_dim, this->lstmDesc, &((*outDims)[0]), &outBytes)); - return SUCCESS; - } - - U32 infer_tmp_memory_size() override - { - TensorDesc inputDesc = (this->inputTensors[0]).get_desc(); - TensorDesc filterDesc = (this->weightTensors[0]).get_desc(); - TensorDesc outputDesc = (this->outputTensors[0]).get_desc(); - U32 bytes = 0; - CHECK_STATUS(lstm_infer_forward_tmp_bytes(inputDesc, filterDesc, outputDesc, this->lstmDesc, &bytes, this->schedule)); - return bytes; - } -}; - -#endif //_LSTM_H diff --git a/inference/include/lstmcell.hpp b/inference/include/lstmcell.hpp deleted file mode 100644 index a21b963b..00000000 --- a/inference/include/lstmcell.hpp +++ /dev/null @@ -1,199 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
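The filter-shape arithmetic in infer_output_tensors_size above follows standard LSTM structure: the four gate transforms are stacked row-wise (hence 4 * column rows) and act on the concatenation of the recurrent state and the input (hence column + iX columns), with column switching to numProjection when a projection layer shrinks the state. A worked example with assumed sizes:

    // Hypothetical sizes: numOutput = 256, no projection, input width iX = 128.
    U32 numOutput = 256, numProjection = 0, iX = 128;
    U32 column    = (numProjection > 0) ? numProjection : numOutput; // 256
    U32 filterRow = 4 * column;  // 1024 rows: i/f/g/o gates stacked
    U32 filterCol = column + iX; //  384 cols: weights multiply concat(h, x)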
- - -/** - * Project deploy - */ -#ifndef _LSTMCELL_H -#define _LSTMCELL_H - -#include "weight_operator.hpp" -#include "tensor_computing.h" - - -class LSTMCell: public WeightOperator { -public: - LSTMCell(DataType dt, U32 numOutput, U32 numProjection, F32 zoneoutCell, F32 zoneoutOutput, bool biDirection) - { - this->dt = dt; - this->lstmDesc.numOutput = numOutput; - this->lstmDesc.numProjection = numProjection; - this->lstmDesc.forgetBias = 1.0; - this->lstmDesc.zoneoutCell = zoneoutCell; - this->lstmDesc.zoneoutOutput = zoneoutOutput; - this->lstmDesc.biDirection = biDirection; - this->lstmDesc.activationMode = ACTIVATION_TANH; - this->hasBias = false; - } - - OperatorType get_op_type() override - { - return OT_LSTM; - } - - void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor xTensor = this->inputTensors[0]; - U8* xPtr = (U8*)xTensor.get_val(); - Tensor weightTensor = this->weightTensors[0]; - Tensor biasTensor = this->biasTensors[0]; - Tensor stateTensor = this->inputTensors[1]; - Tensor hTensor = this->outputTensors[0]; - - U8 *tempPtr = (U8*)this->temp->get_val(); - - if (this->featureScale.size() > 1) { - CHECK_STATUS(clip(&(this->clipMin), &(this->clipMax), - xTensor.get_desc(), xTensor.get_val(), - xTensor.get_desc(), tempPtr, this->schedule)); - xPtr = tempPtr; - tempPtr += tensorNumBytes(xTensor.get_desc()); - } - - CHECK_STATUS(lstmcell(xTensor.get_desc(), xPtr, - weightTensor.get_desc(), weightTensor.get_val(), - biasTensor.get_desc(), biasTensor.get_val(), - stateTensor.get_val(), - this->lenOfTemp, tempPtr, - this->lstmDesc, this->xDim, this->lstmDesc.numOutput, - hTensor.get_desc(), hTensor.get_val(), this->schedule)); - - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - TensorDesc inDim = inDims[0]; - - DataType dt; - DataFormat df; - U32 iB, iX; - CHECK_STATUS(tensor2dfGet(inDim, &dt, &df, &iB, &iX)); - U32 column = (this->lstmDesc.numProjection > 0) ? 
this->lstmDesc.numProjection : this->lstmDesc.numOutput; - this->xDim = iX; - this->filterRow = 4 * column; - this->filterCol = this->lstmDesc.numOutput + iX; - TensorDesc filter_dim = tensor2df(this->dt, DF_NK, this->filterRow, this->filterCol); - U32 outBytes = 0; - CHECK_STATUS(lstmcell_infer_output_size(inDim, filter_dim, this->lstmDesc, &((*outDims)[0]), &outBytes)); - return SUCCESS; - } - - U32 infer_tmp_memory_size() override - { - TensorDesc inputDesc = (this->inputTensors[0]).get_desc(); - TensorDesc filterDesc = (this->weightTensors[0]).get_desc(); - TensorDesc outputDesc = (this->outputTensors[0]).get_desc(); - U32 bytes = 0; - CHECK_STATUS(lstmcell_infer_forward_tmp_bytes(inputDesc, filterDesc, outputDesc, this->lstmDesc, &bytes, this->schedule)); - - if (featureScale.size() > 1) { - CHECK_REQUIREMENT(featureScale[0][0] > 0); - CHECK_REQUIREMENT(featureScale[0][0] == featureScale[1][0]); - this->clipMax = 127.0 / featureScale[0][0]; - this->clipMin = -1 * this->clipMax; - - bytes += tensorNumBytes(inputDesc); - } - return bytes; - } - - U32 infer_wtm_memory_size() override - { - TensorDesc filterDesc = (this->weightTensors[0]).get_desc(); - U32 byte = 0; - CHECK_STATUS(lstm_transform_filter_bytes(filterDesc, this->lstmDesc, &byte, this->schedule)); - return byte; - } - - EE transform_filter() - { - this->wtm = std::shared_ptr(new Tensor()); - Tensor weightTensor = this->weightTensors[0]; - TensorDesc weightDesc = weightTensor.get_desc(); - U8* weightPtr = weightTensor.get_val(); - - auto wtmBytes = this->infer_wtm_memory_size(); - std::shared_ptr wtmPtr((U8*) operator new(wtmBytes)); - auto cpuMem = new CpuMemory(); - cpuMem->set_shared_ptr_caster(wtmPtr); - Memory_* mem = (Memory_*)(cpuMem); - std::shared_ptr memWtmPtr(mem); - this->set_wtm_memory(wtmBytes, memWtmPtr); - - TensorDesc wtmDesc; - CHECK_STATUS(lstm_transform_filter(weightDesc, weightPtr, this->lstmDesc, &wtmDesc, this->get_wtm()->get_val(), this->schedule)); - - this->get_wtm()->set_desc(wtmDesc); - this->weightTensors[0] = *this->get_wtm(); - return SUCCESS; - } - - EE init_weight_bias_from_model(U8** modelPtr) - { - int num = (this->lstmDesc.biDirection) ? 2 : 1; - U32 row = this->xDim + this->lstmDesc.numOutput; - U32 column = (this->lstmDesc.numProjection > 0) ? 
this->lstmDesc.numProjection : this->lstmDesc.numOutput; - TensorDesc weightDesc = tensor2df(this->dt, DF_NK, this->filterRow, this->filterCol); - TensorDesc biasDesc = tensor1d(this->dt, column * 4); - U32 weightBytes = num * bytesOf(this->dt) * ((row * column * 4) + (this->lstmDesc.numProjection * this->lstmDesc.numOutput)); - U32 biasBytes = num * tensorNumBytes(biasDesc); - - std::shared_ptr modelWeightTensor(new Tensor()); - std::shared_ptr modelBiasTensor(new Tensor()); - modelWeightTensor->set_desc(weightDesc); - modelBiasTensor->set_desc(biasDesc); - - auto curOpWs = this->get_weightspec_ptr(); - if(modelPtr != nullptr){ - modelWeightTensor->alloc(); - memcpy((U8*)modelWeightTensor->get_val(), *modelPtr, weightBytes); - *modelPtr += weightBytes; - } else { - modelWeightTensor->set_shared_ptr(std::shared_ptr(curOpWs.weight)); - } - - U8* biasVal = nullptr; - if(modelPtr != nullptr){ - if(this->hasBias){ - biasVal = *modelPtr; - *modelPtr += biasBytes; - } - } else { - if(this->hasBias) biasVal = curOpWs.vec; - } - - if (biasVal) { - modelBiasTensor->set_shared_ptr(std::shared_ptr(biasVal)); - } else { - modelBiasTensor->alloc(); - memset((U8*)modelBiasTensor->get_val(), 0, biasBytes); - } - - this->weightTensors.push_back(*modelWeightTensor.get()); - this->biasTensors.push_back(*modelBiasTensor.get()); - return SUCCESS; - } - -public: - LSTMDesc lstmDesc; - U32 filterRow; - U32 filterCol; - U32 xDim; - F32 clipMax; - F32 clipMin; -}; - -#endif //_LSTMCELL_H diff --git a/inference/include/matmul.hpp b/inference/include/matmul.hpp deleted file mode 100644 index 6674ee3b..00000000 --- a/inference/include/matmul.hpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
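The weightBytes expression in init_weight_bias_from_model above budgets one stacked gate matrix of shape (4 * column) x (xDim + numOutput) plus, when numProjection > 0, a projection matrix of numProjection x numOutput, all doubled for bidirectional cells. Evaluated with assumed sizes (FP16, unidirectional, no projection):

    U32 num    = 1;                     // 2 if lstmDesc.biDirection
    U32 xDim = 128, numOutput = 256, numProjection = 0;
    U32 row    = xDim + numOutput;      // 384
    U32 column = numOutput;             // 256 (numProjection would win if > 0)
    U32 weightBytes = num * 2 /*bytesOf(DT_F16)*/
                    * ((row * column * 4) + (numProjection * numOutput)); // 786432
    U32 biasBytes   = num * 2 * (column * 4);                             // 2048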
- - -/** - * Project deploy - */ -#ifndef _MATMUL_H -#define _MATMUL_H - -#include "operator.hpp" -#include "tensor_computing.h" - -class MatMul: public Operator { -public: - MatMul(DataType dt, bool transposeA, bool transposeB) - { - this->dt = dt; - this->transposeA = transposeA; - this->transposeB = transposeB; - } - - OperatorType get_op_type() override - { - return OT_MatMul; - } - - virtual EE infer_forward_algorithm(HashMap<std::string, std::string> &algorithmMap) { - UNUSED(algorithmMap); - return SUCCESS; - } - -protected: - bool transposeA; - bool transposeB; -}; - -#endif //_MATMUL_H diff --git a/inference/include/memory.hpp b/inference/include/memory.hpp deleted file mode 100644 index fd3c967e..00000000 --- a/inference/include/memory.hpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _MEMORY_H -#define _MEMORY_H - -#include <memory> -#include "tensor_desc.h" -#include "point_cast.hpp" - -typedef enum{ - OCLMem = 0, - CPUMem = 1 -} MemoryType; - -class Memory_ { -public: - Memory_(){} - virtual ~Memory_(){} - virtual void alloc(TensorDesc desc) = 0; - virtual void alloc(U32 size) = 0; - virtual void set_val_by_copy(TensorDesc desc, U8* ptr) = 0; - virtual void* get_val() = 0; - inline PtrCaster get_val_caster(){return PtrCaster(this->get_val());} - virtual MemoryType get_mem_type() = 0; - - virtual void set_shared_ptr(PtrCasterShared val) = 0; - virtual std::shared_ptr<U8> get_shared_ptr() = 0; - inline void set_shared_ptr_caster(std::shared_ptr<U8> val) {set_shared_ptr(PtrCasterShared(val));} - inline PtrCasterShared get_shared_ptr_caster() {return PtrCasterShared(this->get_shared_ptr());} -}; -#endif diff --git a/inference/include/model.hpp b/inference/include/model.hpp deleted file mode 100644 index dd74239e..00000000 --- a/inference/include/model.hpp +++ /dev/null @@ -1,384 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
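Memory_ is the abstraction that lets operators hold either host or device buffers behind one interface (CpuMemory, used by lstmcell.hpp's transform_filter above, and an OCL counterpart implement it elsewhere in the tree). A standalone analogue of the CPU side, written without the PtrCaster helpers so it compiles on its own; it is illustrative, not Bolt's CpuMemory:

    #include <cstddef>
    #include <cstring>
    #include <memory>

    class CpuBuffer {  // hypothetical stand-in for a Memory_ implementation
    public:
        void alloc(size_t size) {
            val = std::shared_ptr<unsigned char>(new unsigned char[size],
                                                 std::default_delete<unsigned char[]>());
            bytes = size;
        }
        void set_val_by_copy(const void *src, size_t size) {
            std::memcpy(val.get(), src, size < bytes ? size : bytes);
        }
        void *get_val() { return val.get(); }
        std::shared_ptr<unsigned char> get_shared_ptr() { return val; }
    private:
        std::shared_ptr<unsigned char> val;
        size_t bytes = 0;
    };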
- - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _MODEL_H -#define _MODEL_H - -#include "operator.hpp" -#include "tensor_desc.h" -#ifdef _USE_MALI -#include "gcl.h" -#include "libkernelbin.h" -#endif - -class Model { -public: - /** - * @param name - */ - Model() {} - Model(Arch schedule, DataType dt, std::string name) { - this->schedule = schedule; - this->dt = dt; - this->name = name; - this->memoryAssigned = false; - this->algorithmFileName = "algorithmInfo_"; - this->algorithmFileName += name; - this->algorithmFileName += "_"; - this->algorithmFileName += std::to_string(schedule); - this->algorithmFileName += "_"; - this->algorithmFileName += std::to_string(dt); -#ifdef _USE_MALI - if(schedule == MALI){ - GCLHandle_t handleInfo; - CHECK_STATUS(gcl_create_handle(&handleInfo)); - CHECK_STATUS(gcl_regist_binMap(handleInfo)); - this->handle = std::shared_ptr(handleInfo, gcl_destroy_handle); - } -#endif - } - - virtual EE infer_output_tensors_size(HashMap) = 0; - virtual void assign_output_tensor() = 0; - virtual void infer_tmp_memory_size() = 0; - virtual void assign_tmp_tensor() = 0; - - virtual void ready(HashMap inputDescMap) { - infer_output_tensors_size(inputDescMap); - assign_output_tensor(); - - infer_tmp_memory_size(); - assign_tmp_tensor(); - } - -#ifdef _USE_MALI - virtual EE infer_gclmem_descs(HashMap) {return NOT_SUPPORTED;} - virtual void run_mali_prepare(bool reset) { -#ifndef _DEBUG - for(auto op : ops) op->run(); - if(reset) return; - std::vector kernelIndex; - U32 len = handle->kernelVec.size(); - for(U32 i = 0; i < len; i++) { - auto kernelInfo = handle->kernelVec[i]; - U32 gs[3]; - U32 ls[3]; - bool findKernelThreadInfo = false; - findKernelThreadInfo = getKernelThreadInfoFromMap(kernelThreadMap, kernelInfo.name, gs, ls); - if(findKernelThreadInfo){ - handle->kernelVec[i].gs[0] = gs[0]; - handle->kernelVec[i].gs[1] = gs[1]; - handle->kernelVec[i].gs[2] = gs[2]; - handle->kernelVec[i].ls[0] = ls[0]; - handle->kernelVec[i].ls[1] = ls[1]; - handle->kernelVec[i].ls[2] = ls[2]; - } else { - kernelIndex.push_back(i); - } - } - CHECK_STATUS(gcl_run_kernelVec_select_ls(handle.get(), kernelIndex)); - for(U32 i = 0; i < len; i++) { - auto kernelInfo = handle->kernelVec[i]; - setKernelThreadInfoToMap(kernelThreadMap, kernelInfo.name, kernelInfo.gs, kernelInfo.ls); - } -#else - UNUSED(reset); -#endif - } -#endif - virtual void run() { - -#ifdef _DEBUG - const char* funcStr = "[DEBUG] run()"; -#endif - -#ifdef _USE_MALI - if(this->schedule == MALI) { -#ifndef _DEBUG - 
CHECK_STATUS(gcl_run_kernelVec(handle.get())); -#else - for (U32 opIndex = 0; opIndex < ops.size(); opIndex++) { - std::shared_ptr op = this->ops[opIndex]; - std::cout << funcStr << " op: " << op->get_name() << "/"<< OperatorTypeName()[op->get_op_type()] << std::endl; - op->run(); - Tensor outputTensor = op->get_output_tensors()[0]; - GCLMem_t output = outputTensor.get_val(); - if(output->desc.memFormat == DF_NCWHC4) { - U32 s1 = output->desc.stride[0]; - U32 s2 = output->desc.stride[1]; - U32 s3 = output->desc.stride[2]; - U32 ho = output->desc.offset[0]; - U32 wo = output->desc.offset[1]; - U32 bi = (s2 > 4) ? 4 : s2; - U32 bj = (s3 > 1) ? 2 : s3; - if(bi == 1 && s1 == 1) bj = (s3 > 8) ? 8 : s3; - U32 size = 4 * s1 * s2 * bj * bytesOf(outputTensor.get_desc().dt); - U8* hostPtr = new U8[(size_t)size]; - gcl_trans_memory(handle.get(), (void*)output, (void*)hostPtr, &size, DEVICE_BUF_TO_HOST, CL_TRUE); - F16* val = (F16*) hostPtr; - for(U32 i = 0; i < bi; i++){ - for(U32 j = 0; j < bj; j++){ - for(U32 k = 0; k < 4; k++){ - std::cout << val[k + ho * 4 + j * s1 * 4 * s2 + (i + wo) * s1 * 4] << " "; - } - } - std::cout << std::endl; - } - delete[] hostPtr; - } - /* - std::string name = op->get_name(); - for(auto outputTensor: op->get_output_tensors()) { - gcl_write_data_to_bin(handle.get(), outputTensor.get_desc(), outputTensor.get_val(), 0, name.c_str()); - name = name + "x"; - } - */ - } - - U32 len = handle->kernelVec.size(); - for(U32 i = 0; i < len; i++) { - auto kernelInfo = handle->kernelVec[i]; - U32 gs[3]; - U32 ls[3]; - bool findKernelThreadInfo = getKernelThreadInfoFromMap(kernelThreadMap, kernelInfo.name, gs, ls); - if(findKernelThreadInfo){ - handle->kernelVec[i].gs[0] = gs[0]; - handle->kernelVec[i].gs[1] = gs[1]; - handle->kernelVec[i].gs[2] = gs[2]; - handle->kernelVec[i].ls[0] = ls[0]; - handle->kernelVec[i].ls[1] = ls[1]; - handle->kernelVec[i].ls[2] = ls[2]; - } - } - CHECK_STATUS(gcl_run_kernelVec(handle.get())); -#endif - } else { -#endif - for (U32 opIndex = 0; opIndex < ops.size();) { - std::shared_ptr op = this->ops[opIndex]; -#ifdef _DEBUG - std::cout << funcStr << " op: " << op->get_name() << "/"<< OperatorTypeName()[op->get_op_type()] << std::endl; -#endif - if (op->get_op_type() == OT_Repeat || op->get_op_type() == OT_Jump ) { - opIndex = op->get_next_operator_index(); - } else { - op->run(); - opIndex++; - } -#ifdef _DEBUG - // debug for nan - Tensor outputTensor = op->get_output_tensors()[0]; - U32 elementNum = tensorNumElements(outputTensor.get_desc()); - for (U32 i = 0; i < elementNum; i++) { - F32 value = outputTensor.getElement(i); - if (i < 32) { - if (i % 8 == 0) { - if (i != 0) - std::cout << std::endl; - std::cout << " "; - } - std::cout << value << " "; - } - - if (UNI_ISINF(value)) { - std::cerr << "\n[ERROR] encounter inf at " << i << std::endl; - exit(1); - } - if (UNI_ISNAN(value)) { - std::cerr << "\n[ERROR] encounter nan at " << i << std::endl; - exit(1); - } - } - /* - std::string name = op->get_name(); - for(auto outputTensor: op->get_output_tensors()) { - auto desc = outputTensor.get_desc(); - if(desc.nDims == 3) desc.df = DF_MTK; - gcl_write_data_to_bin(NULL, desc, outputTensor.get_val(), 1, name.c_str()); - name = name + "x"; - }*/ - std::cout << std::endl; -#endif - } -#ifdef _USE_MALI - } -#endif - } - -#ifdef _USE_INT8 - virtual U32 find_next_dynamic_scale_op(Vec calibratedOpIdx, U32 startIdx) - { - CHECK_REQUIREMENT(startIdx < this->ops.size()) - for (U32 i = startIdx; i < this->ops.size(); ) { - auto op = this->ops[i]; - if 
(op->is_dynamic_scale()) { - bool calibrated = false; - for (auto idx : calibratedOpIdx) { - if (i == idx) { - calibrated = true; - break; - } - } - if (!calibrated) { - return i; - } - } - - if (op->get_op_type() == OT_Repeat || op->get_op_type() == OT_Jump ) { - i = op->get_next_operator_index(); - } else { - i++; - } - } - - return 0; // The first layer should never be quantized - } - - virtual std::shared_ptr get_op_by_index(U32 i) - { - return ops[i]; - } - - virtual void run_till_breakpoint(U32 opIdx) - { - CHECK_REQUIREMENT(MALI != this->schedule); - for (U32 i = 0; i < this->ops.size(); ) { - auto op = this->ops[i]; - if (op->get_op_type() == OT_Repeat || op->get_op_type() == OT_Jump ) { - if (opIdx == i) { - break; - } - i = op->get_next_operator_index(); - } else { - op->run(); - if (opIdx == i) { - break; - } - i++; - } - } - } -#endif - - virtual bool checkOperator() - { - for (auto op : this->ops) { - if (! op->checkOperator()) return false; - } - return true; - } - - std::string get_name() {return this->name;} - - void loadAlgorithmMapFromText(std::string algorithmMapPath) { - if (algorithmMapPath == std::string("")) return; - FILE *file = fopen(algorithmFileName.c_str(), "r"); - if (!file || feof(file)) - return; - int num = 0; - fscanf(file, "%d", &num); - char operatorName[100]; - char algorithm[100]; - for (int i = 0; i < num; i++) { - fscanf(file, "%s %s", operatorName, algorithm); - algorithmMap[operatorName] = algorithm; - } -#ifdef _USE_MALI - if(this->schedule == MALI) { - fscanf(file, "%d", &num); - char kernelName[100]; - char kernelThreadInfo[100]; - for (int i = 0; i < num; i++) { - fscanf(file, "%s %s", kernelName, kernelThreadInfo); - kernelThreadMap[kernelName] = kernelThreadInfo; - } - } -#endif - fclose(file); - } - - void saveAlgorithmMapToText(std::string algorithmMapPath) { - if (algorithmMapPath == std::string("")) return; - FILE* fileProb = fopen(algorithmFileName.c_str(), "r"); - if (fileProb) { - fclose(fileProb); - return; - } - - FILE *file = fopen(algorithmFileName.c_str(), "w"); - fprintf(file, "%ld\n", (I64)algorithmMap.size()); - for (auto iter: algorithmMap) { - fprintf(file, "%s %s\n", iter.first.c_str(), iter.second.c_str()); - } -#ifdef _USE_MALI - if(this->schedule == MALI) { - fprintf(file, "%ld\n", (I64)kernelThreadMap.size()); - for (auto iter: kernelThreadMap) { - fprintf(file, "%s %s\n", iter.first.c_str(), iter.second.c_str()); - } - } -#endif - fclose(file); - } -#ifdef _USE_MALI - void setKernelThreadInfoToMap(HashMap &kernelThreadMap, std::string name, U32 gs[3], U32 ls[3]) { - std::string kernelThreadInfo = "/"; - for(U32 i = 0; i < 3; i++) { - kernelThreadInfo += std::to_string(gs[i]); - kernelThreadInfo += "/"; - } - for(U32 i = 0; i < 3; i++) { - kernelThreadInfo += std::to_string(ls[i]); - kernelThreadInfo += "/"; - } - kernelThreadMap[name] = kernelThreadInfo; - } - - bool getKernelThreadInfoFromMap(HashMap &kernelThreadMap, std::string name, U32* gs, U32* ls) { - bool findKernelInfo = kernelThreadMap.count(name); - if(!findKernelInfo) return findKernelInfo; - std::string kernelThreadInfo = kernelThreadMap[name]; - U32 be = kernelThreadInfo.find_first_of("/"); - U32 end; - for(U32 i = 0; i < 3; i++) { - end = kernelThreadInfo.find("/", be + 1); - gs[i] = std::stoi(kernelThreadInfo.substr(be + 1, end - be - 1)); - be = end; - } - for(U32 i = 0; i < 3; i++) { - end = kernelThreadInfo.find("/", be + 1); - ls[i] = std::stoi(kernelThreadInfo.substr(be + 1, end - be - 1)); - be = end; - } - return findKernelInfo; - } -#endif - 
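Read together, saveAlgorithmMapToText and loadAlgorithmMapFromText fix a simple whitespace-separated format: a pair count, then one "operator algorithm" pair per line, and on MALI a second count followed by kernel entries whose value packs gs[0..2] and ls[0..2] as a slash-delimited string (see setKernelThreadInfoToMap below). A hypothetical algorithmInfo_<model>_<schedule>_<dt> file could therefore look like this; the names and numbers are invented, only the layout follows the fscanf/fprintf calls:

    2
    conv1 CONVOLUTION_ALGORITHM_DIRECT
    fc6 2
    1
    conv1_kernel0 /64/16/1/4/4/1/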
-protected: - Vec<std::shared_ptr<Operator>> ops; - Arch schedule; - DataType dt; -#ifdef _USE_MALI - std::shared_ptr<GCLHandle> handle; - HashMap<std::string, std::string> kernelThreadMap; -#endif - HashMap<std::string, std::string> algorithmMap; - bool memoryAssigned; - -private: - std::string name; - std::string algorithmFileName; -}; -#endif diff --git a/inference/include/multiply.hpp b/inference/include/multiply.hpp deleted file mode 100644 index ed8fd4ac..00000000 --- a/inference/include/multiply.hpp +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -/** - * Project deploy - */ -#ifndef _MULTIPLY_H -#define _MULTIPLY_H -#include "operator.hpp" -#include "tensor_computing.h" - -class Multiply: public Operator { -public: - Multiply(DataType dt, F32 scale, F32 bias) - { - this->dt = dt; - this->alpha = scale; - this->beta = bias; - } - - OperatorType get_op_type() override - { - return OT_Multiply; - } - -public: - F32 alpha; - F32 beta; -}; - -#endif //_MULTIPLY_H diff --git a/inference/include/ocl/activation_ocl.hpp b/inference/include/ocl/activation_ocl.hpp deleted file mode 100644 index 203c2430..00000000 --- a/inference/include/ocl/activation_ocl.hpp +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
- - -#ifndef _ACTIVATION_OCL_H -#define _ACTIVATION_OCL_H - -#include "operator.hpp" -#include "tensor_computing.h" -#include "tensor_desc.h" -#include "activation.hpp" - -class ActivationOCL: public Activation -{ -public: - /** - @param mode - */ - ActivationOCL(ActivationDesc activationDesc): Activation(activationDesc) {} - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - this->handle->curOpName = this->get_op_name(); - Tensor inputTensor = this->inputTensors[0]; - Tensor outputTensor = this->outputTensors[0]; - GCLMem_t inPtr = inputTensor.get_val(); - GCLMem_t outPtr = outputTensor.get_val(); - TensorDesc inputDesc = inputTensor.get_desc(); - TensorDesc outputDesc = outputTensor.get_desc(); - CHECK_STATUS(activation(inputDesc, inPtr, this->activationDesc, outputDesc, outPtr, this->schedule, &this->oclExtInfo)); - - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override - { - this->oclExtInfo.maliInfo.gclmemInputDesc = NULL; - this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL; - CHECK_STATUS(activation_infer_output_size(inDims[0], &((*outDims)[0]), this->schedule, &this->oclExtInfo)); - return SUCCESS; - } - - - virtual EE infer_gclmem_desc(Vec<GCLMemDesc>* gclmemInputDesc, Vec<GCLMemDesc>* gclmemOutputDesc) override - { - TensorDesc inputDesc = this->inputTensors[0].get_desc(); - this->oclExtInfo.maliInfo.gclmemInputDesc = &((*gclmemInputDesc)[0]); - this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]); - CHECK_STATUS(activation_infer_output_size(inputDesc, NULL, this->schedule, &this->oclExtInfo)); - return SUCCESS; - } -}; - -#endif //_ACTIVATION_OCL_H diff --git a/inference/include/ocl/bilateral_slice_apply_ocl.hpp b/inference/include/ocl/bilateral_slice_apply_ocl.hpp deleted file mode 100644 index 7008ae4e..00000000 --- a/inference/include/ocl/bilateral_slice_apply_ocl.hpp +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- - -#ifndef _BILATERAL_SLICE_APPLY_OCL_H -#define _BILATERAL_SLICE_APPLY_OCL_H -#include -#include "operator.hpp" -#include "tensor_computing.h" -#include "tensor_desc.h" -#include "op_type.h" -#include "bilateral_slice_apply.hpp" - -class BilateralSliceApplyOCL: public BilateralSliceApply { -public: - -/** - * @param coefficient_len - * @param has_offset - */ - BilateralSliceApplyOCL(U32 coefficient_len, bool has_offset, BilateralSliceApplyMode mode) : BilateralSliceApply(coefficient_len, has_offset, mode){} - virtual ~BilateralSliceApplyOCL(){} - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - this->handle->curOpName = this->get_op_name(); - Tensor inputTensor = this->inputTensors[0]; - Tensor gridTensor = this->inputTensors[1]; - TensorDesc inputDesc = inputTensor.get_desc(); - TensorDesc gridDesc = gridTensor.get_desc(); - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - - U8* guidePtr = NULL; - if(mode == BSliceApply_NULL) guidePtr = this->inputTensors[2].get_val(); - CHECK_STATUS(bilateral_slice_apply(inputDesc, inputTensor.get_val(), - guideDesc, guidePtr, - gridDesc, gridTensor.get_val(), - bilateralSliceApplyDesc, this->lenOfTemp, this->temp->get_val(), - outputDesc, outputTensor.get_val(), this->schedule, &this->oclExtInfo)); - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - auto inDim = inDims[0]; - auto gridDim = inDims[1]; - DataType dt; - DataFormat df; - U32 width; - U32 height; - U32 numChannels; - U32 num; - CHECK_STATUS(tensor4dGet(inDim, &dt, &df, &num, &numChannels, &height, &width)); - TensorDesc inputDesc = tensor4df(dt, df, num, numChannels, height, width); - guideDesc = tensor4df(DT_F16, df, num, 1, height, width); - - CHECK_STATUS(tensor4dGet(gridDim, &dt, &df, &num, &numChannels, &height, &width)); - TensorDesc gridDesc = tensor4df(dt, df, num, numChannels, height, width); - - bilateralSliceApplyDesc = BilateralSliceApply::create_BilateralSliceApplyDesc(this->coefficient_len, this->has_offset, this->mode); - this->oclExtInfo.maliInfo.gclmemInputDesc = NULL; - this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL; - CHECK_STATUS(bilateral_slice_apply_infer_output_size(inputDesc, guideDesc, gridDesc, bilateralSliceApplyDesc, &((*outDims)[0]), this->schedule, &this->oclExtInfo)); - return SUCCESS; - } - - virtual EE infer_gclmem_desc(Vec* gclmemInputDesc, Vec* gclmemOutputDesc) override - { - TensorDesc inputDesc = this->inputTensors[0].get_desc(); - TensorDesc gridDesc = this->inputTensors[1].get_desc(); - GCLMemDesc gclmemDesc[3]; - gclmemDesc[0] = (*gclmemInputDesc)[0]; - gclmemDesc[1] = (*gclmemInputDesc)[1]; - if(this->mode == BSliceApply_NULL) gclmemDesc[2] = (*gclmemInputDesc)[2]; - this->oclExtInfo.maliInfo.gclmemInputDesc = gclmemDesc; - this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]); - CHECK_STATUS(bilateral_slice_apply_infer_output_size(inputDesc, guideDesc, gridDesc, bilateralSliceApplyDesc, NULL, this->schedule, &this->oclExtInfo)); - (*gclmemInputDesc)[0] = gclmemDesc[0];//inputDesc - (*gclmemInputDesc)[1] = gclmemDesc[1];//gridDesc - if(this->mode == BSliceApply_NULL)(*gclmemInputDesc)[2] = gclmemDesc[2];//guideDesc - return SUCCESS; - } - - virtual U32 infer_tmp_memory_size()override - { - TensorDesc inputDesc = (this->inputTensors[0]).get_desc(); - TensorDesc gridDesc = (this->inputTensors[1]).get_desc(); - U32 bytes = 0; - 
CHECK_STATUS(bilateral_slice_apply_infer_forward_tmp_bytes(inputDesc, guideDesc, gridDesc, bilateralSliceApplyDesc, &bytes, this->schedule, &this->oclExtInfo)); - return bytes; - } -private: - TensorDesc guideDesc; - BilateralSliceApplyDesc bilateralSliceApplyDesc; -}; - -#endif //_BILATERAL_SLICE_APPLY_OCL_H diff --git a/inference/include/ocl/clip_ocl.hpp b/inference/include/ocl/clip_ocl.hpp deleted file mode 100644 index 2a8ed93c..00000000 --- a/inference/include/ocl/clip_ocl.hpp +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _CLIP_OCL_H -#define _CLIP_OCL_H - -#include "operator.hpp" -#include "tensor_computing.h" -#include "clip.hpp" - -class ClipOCL: public Clip -{ -public: - /** - @param mode - */ - ClipOCL(DataType dt, F32 clipMinScalar, F32 clipMaxScalar) : Clip(dt, clipMinScalar, clipMaxScalar) { } - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - this->handle->curOpName = this->get_op_name(); - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - - CHECK_STATUS(clip(&(this->clipMinScalar), &(this->clipMaxScalar), - inputDesc, inputTensor.get_val(), - outputDesc, outputTensor.get_val(), this->schedule, &this->oclExtInfo)); - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override - { - this->oclExtInfo.maliInfo.gclmemInputDesc = NULL; - this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL; - CHECK_STATUS(clip_infer_output_size(inDims[0], &((*outDims)[0]), this->schedule, &this->oclExtInfo)); - return SUCCESS; - } - - virtual EE infer_gclmem_desc(Vec<GCLMemDesc>* gclmemInputDesc, Vec<GCLMemDesc>* gclmemOutputDesc) override - { - TensorDesc inputDesc = this->inputTensors[0].get_desc(); - this->oclExtInfo.maliInfo.gclmemInputDesc = &((*gclmemInputDesc)[0]); - this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]); - CHECK_STATUS(clip_infer_output_size(inputDesc, NULL, this->schedule, &this->oclExtInfo)); - return SUCCESS; - } -}; - -#endif //_CLIP_OCL_H diff --git a/inference/include/ocl/concat_ocl.hpp b/inference/include/ocl/concat_ocl.hpp deleted file mode 100644 index 73d18aac..00000000 --- a/inference/include/ocl/concat_ocl.hpp +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
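Every OCL operator in this section sizes itself in two passes: infer_output_tensors_size runs the shared *_infer_output_size routine with the maliInfo GCLMem descriptors left NULL (tensor shapes only), then infer_gclmem_desc reruns the same routine with real descriptors attached so the compute library can record stride, offset, and byte-size requirements for device buffers. In pseudo-driver form (illustrative, not the framework's actual scheduler):

    // pass 1: shapes only; maliInfo.gclmem*Desc stay NULL
    CHECK_STATUS(op->infer_output_tensors_size(inDims, &outDims));
    // pass 2: same sizing routine, now filling device-memory descriptors
    CHECK_STATUS(op->infer_gclmem_desc(&gclmemInputDesc, &gclmemOutputDesc));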
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _CONCAT_OCL_H -#define _CONCAT_OCL_H - -#include "operator.hpp" -#include "tensor_computing.h" -#include "concat.hpp" - -class ConcatOCL: public Concat { -public: - ConcatOCL(int axis) : Concat(axis) {} - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - this->handle->curOpName = this->get_op_name(); - Vec<TensorDesc> inputDesc; - Vec<void*> inputPtr; - Vec<F32> inputScales; - - for (Tensor tensorIn: this->inputTensors) { - inputDesc.push_back(tensorIn.get_desc()); - inputPtr.push_back((void*)tensorIn.get_val()); -// inputScales.push_back(tensorIn.get_scale()); - } - auto outputDesc = this->outputTensors[0].get_desc(); - auto outputPtr = this->outputTensors[0].get_val(); -// F32 outputScale = 1.0; - - CHECK_STATUS(concat(inputDesc, inputPtr, NULL, outputDesc, outputPtr, NULL, this->axis, this->schedule, &this->oclExtInfo)); - -// if (DT_I8 == outputDesc.dt) { -// this->outputTensors[0].set_scale(outputScale); -// } - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override - { - this->oclExtInfo.maliInfo.gclmemInputDesc = NULL; - this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL; - CHECK_STATUS(concat_infer_output_size(inDims, &((*outDims)[0]), this->axis, this->schedule, &this->oclExtInfo)); - return SUCCESS; - } - - virtual EE infer_gclmem_desc(Vec<GCLMemDesc>* gclmemInputDesc, Vec<GCLMemDesc>* gclmemOutputDesc) override - { - Vec<TensorDesc> inputDesc; - for (Tensor tensorIn: this->inputTensors) inputDesc.push_back(tensorIn.get_desc()); - U32 num = inputDesc.size(); - GCLMemDesc_t memInDesc = (GCLMemDesc_t) operator new(sizeof(struct GCLMemDesc) * num) ; - for(U32 i = 0; i < num; i++) memInDesc[i] = (*gclmemInputDesc)[i]; - this->oclExtInfo.maliInfo.gclmemInputDesc = memInDesc; - this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]); - CHECK_STATUS(concat_infer_output_size(inputDesc, NULL, this->axis, this->schedule, &this->oclExtInfo)); - for(U32 i = 0; i < num; i++) (*gclmemInputDesc)[i] = memInDesc[i]; - delete memInDesc; - return SUCCESS; - } -}; - -#endif //_CONCAT_OCL_H diff --git a/inference/include/ocl/convolution_ocl.hpp b/inference/include/ocl/convolution_ocl.hpp deleted file mode 100644 index f4ff31a3..00000000 --- a/inference/include/ocl/convolution_ocl.hpp +++ /dev/null @@ -1,675 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _CONVELTWISEPOOLING_OCL_H -#define _CONVELTWISEPOOLING_OCL_H -#include "weight_operator.hpp" -#include "pooling.hpp" -#include "eltwise.hpp" -#include "tensor_computing.h" -#include "tensor_desc.h" -#include "op_type.h" -#include "convolution.hpp" -#include - -class ConvolutionOCL: public Convolution { -public: - ConvolutionOCL(DataType dt, U32 nf, U32 ksizeH, U32 ksizeW, U32 kstrideH, U32 kstrideW, U32 kpaddingT, U32 kpaddingB, U32 kpaddingL, U32 kpaddingR, - ActivationDesc dwActivationDesc, ActivationDesc pwActivationDesc, ConvolutionMode convolutionType, U32 group, U32 dilateH, U32 dilateW) : - Convolution(dt, nf, ksizeH, ksizeW, kstrideH, kstrideW, kpaddingT, kpaddingB, kpaddingL, kpaddingR, - dwActivationDesc, pwActivationDesc, convolutionType, group, dilateH, dilateW) {} - - virtual EE init_weight_bias_from_model(U8** modelPtr)override - { - auto curOpWs = this->get_weightspec_ptr(); - DataType filterDt = curOpWs.mdt; // weight data type may not be the same as input and output - if (modelPtr != nullptr) { - filterDt = DT_F16; - } - DataType dtNoQ = (this->dt == DT_F16_8Q) ? DT_F16 : this->dt; - DataFormat filterDf; - U32 vectorLen = 0; // Vector must contain bias. BNN has one more scale vector. 
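// Editor's note, as code comments so the deleted hunk still reads through: a
// hedged, standalone sketch of the bias-length rule the switch below applies.
// ConvMode, U32 and biasVectorLen are local stand-ins, not the project's
// real types; only the arithmetic mirrors the original.
#include <cstdint>
#include <cstdio>

using U32 = uint32_t;
enum class ConvMode { Pointwise, Depthwise, DepthwisePointwise };

// Pointwise and depthwise convolutions carry one bias value per output
// channel; the fused depthwise+pointwise case stores one bias per pointwise
// filter plus one per depthwise channel, back to back in a single vector.
static U32 biasVectorLen(ConvMode mode, U32 numFilters, U32 numChannels)
{
    switch (mode) {
        case ConvMode::Pointwise:          return numFilters;
        case ConvMode::Depthwise:          return numFilters;
        case ConvMode::DepthwisePointwise: return numFilters + numChannels;
    }
    return 0;
}

int main()
{
    printf("%u\n", biasVectorLen(ConvMode::DepthwisePointwise, 64, 32)); // 96
    return 0;
}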
- switch (this->convolutionType) { - case Convolution_Pointwise: { - filterDf = DF_NCHW; - vectorLen = this->numFilters; // bias length - this->oclExtInfo.maliInfo.forwardRunInfo->algorithm = CONVOLUTION_ALGORITHM_NULL; - break; - } - case Convolution_Depthwise: { - filterDf = DF_NCHW; - vectorLen = this->numFilters; - this->oclExtInfo.maliInfo.forwardRunInfo->algorithm = DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; - break; - } - case Convolution_Depthwise_Pointwise: { - filterDf = DF_CHW_NC; - vectorLen = this->numFilters + this->numChannels; - this->oclExtInfo.maliInfo.forwardRunInfo->algorithm = DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; - break; - } - case Convolution_Dilation: { - CHECK_STATUS(NOT_SUPPORTED); - return NOT_SUPPORTED; - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - return NOT_SUPPORTED; - } - TensorDesc filterTensorDesc = tensor4df(filterDt, filterDf, - this->numFilters, this->numChannels, - this->kernelSizeH, this->kernelSizeW); - if(this->convolutionType == Convolution_Depthwise) filterTensorDesc.dims[2] = 1; - TensorDesc vectorTensorDesc = tensor1d(dtNoQ, vectorLen); // bias data type should be the same as input and output - - std::shared_ptr modelWeightTensor(new Tensor(this->handle)); - std::shared_ptr modelVectorTensor(new Tensor(this->handle)); - std::shared_ptr modelWeightTensorExt; - std::shared_ptr modelVectorTensorExt; - modelWeightTensor->set_desc(filterTensorDesc); - GCLMem_t weightMem = modelWeightTensor->get_val(); - U32 ww, wh, wc, wn; - DataFormat df; - DataType dt; - U32 num, bytes; - TensorDesc filterTensorDescTmp = filterTensorDesc; - TensorDesc filterTensorDescExt = filterTensorDesc; - if(this->convolutionType == Convolution_Depthwise_Pointwise){ - filterTensorDescTmp.dims[2] = 1; - filterTensorDescTmp.dims[3] = this->numChannels; - filterTensorDescExt.dims[0] = 1; - filterTensorDescExt.dims[1] = 1; - filterTensorDescExt.dims[2] = this->numChannels; - filterTensorDescExt.dims[3] = this->numFilters; - filterTensorDescExt.df = DF_NCHW; - modelWeightTensorExt = std::shared_ptr(new Tensor(this->handle)); - modelWeightTensorExt->set_desc(filterTensorDescExt); - GCLMem_t weightMemExt = modelWeightTensorExt->get_val(); - tensorSelectGet(filterTensorDescExt, &dt, &df, &wn, &wc, &wh, &ww); - num = tensorNumElements(filterTensorDescExt); - bytes = tensorNumBytes(filterTensorDescExt); - weightMemExt->desc.stride[0] = ww * wh; - weightMemExt->desc.stride[1] = wc; - weightMemExt->desc.stride[2] = wn; - weightMemExt->desc.offset[0] = 0; - weightMemExt->desc.offset[1] = 0; - weightMemExt->desc.offset[2] = 0; - weightMemExt->desc.memType = GCL_MEM_BUF; - weightMemExt->desc.memFormat = df; - weightMemExt->desc.byteSize = bytes; - weightMemExt->desc.num = num; - weightMemExt->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; - } - tensorSelectGet(filterTensorDescTmp, &dt, &df, &wn, &wc, &wh, &ww); - num = tensorNumElements(filterTensorDescTmp); - bytes = tensorNumBytes(filterTensorDescTmp); - weightMem->desc.stride[0] = ww * wh; - weightMem->desc.stride[1] = wc; - weightMem->desc.stride[2] = wn; - weightMem->desc.offset[0] = 0; - weightMem->desc.offset[1] = 0; - weightMem->desc.offset[2] = 0; - weightMem->desc.memType = GCL_MEM_BUF; - weightMem->desc.memFormat = df; - weightMem->desc.byteSize = bytes; - weightMem->desc.num = num; - weightMem->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; - GCLMem_t vectorMem = modelVectorTensor->get_val(); - modelVectorTensor->set_desc(vectorTensorDesc); - U32 vectorLenTmp = vectorLen; - U32 vectorLenExt = vectorLen; - 
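// Editor's note: the descriptor bookkeeping above (stride/offset/num/byteSize
// filled field by field on GCLMem) follows one fixed pattern. A hedged sketch
// on a plain stand-in struct; BufDesc is illustrative, not the real
// GCLMemDesc, and byteSize is always element count times element size.
#include <cstdint>
#include <cassert>

using U32 = uint32_t;

struct BufDesc {
    U32 stride[3];
    U32 offset[3];
    U32 num;
    U32 byteSize;
};

// Describe an n*c*h*w filter as {plane, channels, filters} with zero offsets.
static BufDesc describeFilter(U32 w, U32 h, U32 c, U32 n, U32 bytesPerElem)
{
    BufDesc d = {};
    d.stride[0] = w * h;  // one w*h plane per channel
    d.stride[1] = c;
    d.stride[2] = n;
    d.num = d.stride[0] * d.stride[1] * d.stride[2];
    d.byteSize = d.num * bytesPerElem;
    return d;
}

int main()
{
    BufDesc d = describeFilter(3, 3, 16, 32, 2); // 3x3 fp16 filters
    assert(d.num == 9u * 16u * 32u && d.byteSize == d.num * 2u);
    return 0;
}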
TensorDesc vectorTensorDescTmp = vectorTensorDesc; - TensorDesc vectorTensorDescExt = vectorTensorDesc; - if(this->convolutionType == Convolution_Depthwise_Pointwise) { - vectorLenTmp = this->numChannels; - vectorLenExt = this->numFilters; - vectorTensorDescTmp.dims[0] = vectorLenTmp; - vectorTensorDescExt.dims[0] = vectorLenExt; - modelVectorTensorExt = std::shared_ptr(new Tensor(this->handle)); - modelVectorTensorExt->set_desc(vectorTensorDescExt); - GCLMem_t vectorMemExt = modelVectorTensorExt->get_val(); - vectorMemExt->desc.stride[0] = (vectorLenExt + 3) / 4; - vectorMemExt->desc.stride[1] = 1; - vectorMemExt->desc.stride[2] = 1; - vectorMemExt->desc.offset[0] = 0; - vectorMemExt->desc.offset[1] = 0; - vectorMemExt->desc.offset[2] = 0; - vectorMemExt->desc.memType = GCL_MEM_IMG_1D; - vectorMemExt->desc.byteSize = (vectorLenExt + 3) / 4 * 4 * bytesOf(dtNoQ); - vectorMemExt->desc.num = (vectorLenExt + 3) / 4; - vectorMemExt->desc.memFormat = DF_NHWC; - } - U32 iw, ih; - TensorDesc inputDesc = this->inputTensors[0].get_desc(); - tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); - if(wn == 1 || (ww == 1 && wh == 1 && iw == 1 && ih == 1)) { - vectorMem->desc.stride[0] = (vectorLenTmp + 7) / 8 * 8; - vectorMem->desc.memType = GCL_MEM_BUF; - vectorMem->desc.byteSize = (vectorLenTmp + 7) / 8 * 8 * sizeof(dtNoQ); - vectorMem->desc.num = (vectorLenTmp + 7) / 8 * 8; - }else{ - vectorMem->desc.stride[0] = (vectorLenTmp + 3) / 4; - vectorMem->desc.memType = GCL_MEM_IMG_1D; - vectorMem->desc.byteSize = (vectorLenTmp + 3) / 4 * 4 * bytesOf(dtNoQ); - vectorMem->desc.num = (vectorLenTmp + 3) / 4; - } - vectorMem->desc.stride[1] = 1; - vectorMem->desc.stride[2] = 1; - vectorMem->desc.offset[0] = 0; - vectorMem->desc.offset[1] = 0; - vectorMem->desc.offset[2] = 0; - vectorMem->desc.memFormat = DF_NHWC; - - if (modelPtr != nullptr) { - weightMem->desc.host_ptr = *modelPtr; - if(this->convolutionType == Convolution_Depthwise_Pointwise){ - GCLMem_t weightMemExt = modelWeightTensorExt->get_val(); - weightMemExt->desc.host_ptr = *modelPtr + weightMem->desc.byteSize; - } - *modelPtr += tensorNumBytes(filterTensorDesc); - } else { - weightMem->desc.host_ptr = curOpWs.weight; - if(this->convolutionType == Convolution_Depthwise_Pointwise){ - GCLMem_t weightMemExt = modelWeightTensorExt->get_val(); - weightMemExt->desc.host_ptr = curOpWs.weight + weightMem->desc.byteSize; - } - } - - U8* biasVal = nullptr; - U8* biasValExt = nullptr; - U8* biasTmp = nullptr; - U8* biasTmpExt = nullptr; - if(modelPtr != nullptr) { - if(this->hasBias){ - biasVal = *modelPtr; - if(this->convolutionType == Convolution_Depthwise_Pointwise) biasValExt= *modelPtr + vectorMem->desc.byteSize; - *modelPtr += tensorNumBytes(vectorTensorDesc); - } - } else { - if(this->hasBias) { - biasVal = curOpWs.vec; - if(this->convolutionType == Convolution_Depthwise_Pointwise) biasValExt = curOpWs.vec + vectorMem->desc.byteSize; - } - } - - if (biasVal != nullptr) { - vectorMem->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; - if((vectorLenTmp & 3) == 0){ - vectorMem->desc.host_ptr = biasVal; - } else { - biasTmp = (U8*)operator new(vectorMem->desc.byteSize); - memset(biasTmp, 0, vectorMem->desc.byteSize); - memcpy(biasTmp, biasVal, tensorNumBytes(vectorTensorDescTmp)); - vectorMem->desc.host_ptr = biasTmp; - } - if(this->convolutionType == Convolution_Depthwise_Pointwise){ - GCLMem_t vectorMemExt = modelVectorTensorExt->get_val(); - vectorMemExt->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; - if((vectorLenExt & 3) == 0){ 
- vectorMemExt->desc.host_ptr = biasValExt; - } else { - biasTmpExt = (U8*)operator new(vectorMemExt->desc.byteSize); - memset(biasTmpExt, 0, vectorMemExt->desc.byteSize); - memcpy(biasTmpExt, biasValExt, tensorNumBytes(vectorTensorDescExt)); - vectorMem->desc.host_ptr = biasTmpExt; - } - } - } else { - vectorMem->desc.host_ptr = nullptr; - vectorMem->desc.flags = CL_MEM_READ_WRITE; - if(this->convolutionType == Convolution_Depthwise_Pointwise){ - GCLMem_t vectorMemExt = modelVectorTensorExt->get_val(); - vectorMemExt->desc.host_ptr = nullptr; - vectorMemExt->desc.flags = CL_MEM_READ_WRITE; - } - } - - U8* weightTmp = nullptr; - if(wn == 1 && ww == 1 && wh == 1 && wc == 3){ - weightMem->desc.stride[1] = wc + wn; - weightMem->desc.num = ww * wh * (wc + wn) * wn; - weightMem->desc.byteSize = weightMem->desc.num * bytesOf(dt); - weightTmp = (U8*)operator new(weightMem->desc.byteSize + vectorMem->desc.byteSize); - memset(weightTmp, 0, weightMem->desc.byteSize + vectorMem->desc.byteSize); - memcpy(weightTmp, (U8*)weightMem->desc.host_ptr, weightMem->desc.byteSize); - if(vectorMem->desc.host_ptr){ - memcpy(weightTmp + weightMem->desc.byteSize, (U8*)vectorMem->desc.host_ptr, vectorMem->desc.byteSize); - } - weightMem->desc.host_ptr = weightTmp; - } - modelWeightTensor->alloc(); - modelVectorTensor->alloc(); - this->weightTensors.push_back(*modelWeightTensor.get()); - this->biasTensors.push_back(*modelVectorTensor.get()); - if(this->convolutionType == Convolution_Depthwise_Pointwise){ - modelWeightTensorExt->alloc(); - modelVectorTensorExt->alloc(); - this->weightTensors.push_back(*modelWeightTensorExt.get()); - this->biasTensors.push_back(*modelVectorTensorExt.get()); - } - if(weightTmp) delete weightTmp; - if(biasTmp) delete biasTmp; - if(biasTmpExt) delete biasTmpExt; - if(curOpWs.weight) delete curOpWs.weight; - if(curOpWs.vec) delete curOpWs.vec; - - return SUCCESS; - } - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - this->handle->curOpName = this->get_op_name(); - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - - Tensor filterTensor = this->weightTensors[0]; - TensorDesc filterDesc = filterTensor.get_desc(); - - TensorDesc scaleDesc = filterDesc; // Dummy initialization - U8 *scalePtr = nullptr; - - Tensor biasTensor = this->biasTensors[0]; - TensorDesc biasDesc = biasTensor.get_desc(); - - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - switch (this->convolutionType) { - case Convolution_Pointwise: { - CHECK_STATUS(convolution(inputDesc, inputTensor.get_val(), - filterDesc, filterTensor.get_val(), - convDesc, this->pwAlg, - scaleDesc, scalePtr, - biasDesc, biasTensor.get_val(), - this->lenOfTemp, this->temp->get_val(), - outputDesc, (void*)outputTensor.get_val(), - this->pwActivationDesc, this->schedule, &this->oclExtInfo)); - break; - } - case Convolution_Depthwise: { - CHECK_STATUS(depthwise_convolution(inputDesc, inputTensor.get_val(), - filterDesc, filterTensor.get_val(), - convDesc, this->dwAlg, - biasDesc, biasTensor.get_val(), - this->lenOfTemp, this->temp->get_val(), - outputDesc, (void*)outputTensor.get_val(), - this->dwActivationDesc, this->pwActivationDesc, - this->schedule, &this->oclExtInfo)); - break; - } - case Convolution_Depthwise_Pointwise: { - GCLMem filterMem[2]; - filterMem[0] = *((GCLMem_t)filterTensor.get_val()); - filterMem[1] = *((GCLMem_t)this->weightTensors[1].get_val()); - GCLMem biasMem[2]; - biasMem[0] = *((GCLMem_t)biasTensor.get_val()); 
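// Editor's note: the bias handling above keeps using ceil-to-multiple
// rounding: image-backed vectors round the element count up to whole float4
// texels ((len + 3) / 4) and buffer-backed ones pad to a multiple of 8
// elements ((len + 7) / 8 * 8), zero-filling the tail before upload. A hedged
// sketch of just that arithmetic; the helper names are local stand-ins.
#include <cstdint>
#include <cstring>
#include <vector>
#include <cassert>

using U32 = uint32_t;

static U32 ceilDiv(U32 len, U32 m) { return (len + m - 1) / m; }

// Copy src into a zero-initialized buffer whose size is rounded up to the
// next multiple, mirroring the memset/memcpy staging of unaligned biases.
static std::vector<uint8_t> padToMultiple(const uint8_t* src, U32 srcBytes, U32 multiple)
{
    U32 padded = ceilDiv(srcBytes, multiple) * multiple;
    std::vector<uint8_t> out(padded, 0); // zero-filled tail
    memcpy(out.data(), src, srcBytes);
    return out;
}

int main()
{
    assert(ceilDiv(10, 4) == 3); // 10 floats occupy 3 float4 texels
    uint8_t bias[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    assert(padToMultiple(bias, 10, 8).size() == 16);
    return 0;
}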
- biasMem[1] = *((GCLMem_t)this->biasTensors[1].get_val()); - if(this->dwAlg == DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM) biasMem[1] = *((GCLMem_t)this->bias_buf->get_val()); - CHECK_STATUS(depthwise_convolution(inputDesc, inputTensor.get_val(), - filterDesc, (void*)filterMem, - convDesc, this->dwAlg, - biasDesc, (void*)biasMem, - this->lenOfTemp, this->temp->get_val(), - outputDesc, (void*)outputTensor.get_val(), - this->dwActivationDesc, this->pwActivationDesc, - this->schedule, &this->oclExtInfo)); - break; - } - case Convolution_Dilation: { - CHECK_STATUS(NOT_SUPPORTED); - break; - } - default: - std::cerr << "[ERROR] unsupported convolution type " << this->convolutionType << std::endl; - exit(1); - } - - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_forward_algorithm(HashMap &algorithmMap)override - { - TensorDesc inputDesc = (this->inputTensors[0]).get_desc(); - TensorDesc filterDesc = (this->weightTensors[0]).get_desc(); - - ConvolutionPolicy policy = CONVOLUTION_TUNNING; - DataType targetType = filterDesc.dt; - switch (this->convolutionType) { - case Convolution_Pointwise: { - if (this->dt == DT_F16_8Q) { - targetType = DT_I8; - } - if (algorithmMap.find(this->name) != algorithmMap.end()) { - I32 algo[4]; - Operator::getAlgorithmInfoFromMap(algorithmMap, this->name, algo, 4); - this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; - this->runInfo.best_w[0] = algo[1]; - this->runInfo.best_c[0] = algo[2]; - this->runInfo.best_k[0] = algo[3]; - this->pwAlg = (ConvolutionForwardAlgorithm)algo[0]; - } else { - CHECK_STATUS(convolution_infer_forward_algorithm(inputDesc, filterDesc, - this->outputTensors[0].get_desc(), - convDesc, policy, &(this->pwAlg), targetType, this->pwActivationDesc, this->schedule, &this->oclExtInfo)); - I32 algo[4]; - algo[0] = this->runInfo.algorithm; - algo[1] = this->runInfo.best_w[0]; - algo[2] = this->runInfo.best_c[0]; - algo[3] = this->runInfo.best_k[0]; - this->pwAlg = (ConvolutionForwardAlgorithm)algo[0]; - Operator::setAlgorithmInfoToMap(algorithmMap, this->name, algo, 4); - } - break; - } - case Convolution_Depthwise: { - if (algorithmMap.find(this->name) != algorithmMap.end()) { - I32 algo[4]; - Operator::getAlgorithmInfoFromMap(algorithmMap, this->name, algo, 4); - this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; - this->runInfo.best_w[0] = algo[1]; - this->runInfo.best_c[0] = algo[2]; - this->runInfo.best_k[0] = algo[3]; - this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo[0]; - } else { - CHECK_STATUS(depthwise_convolution_infer_forward_algorithm(inputDesc, filterDesc, - this->outputTensors[0].get_desc(), - convDesc, policy, &(this->dwAlg), targetType, this->dwActivationDesc, this->pwActivationDesc, this->schedule, &this->oclExtInfo)); - I32 algo[4]; - algo[0] = this->runInfo.algorithm; - algo[1] = this->runInfo.best_w[0]; - algo[2] = this->runInfo.best_c[0]; - algo[3] = this->runInfo.best_k[0]; - this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo[0]; - Operator::setAlgorithmInfoToMap(algorithmMap, this->name, algo, 4); - } - break; - } - case Convolution_Depthwise_Pointwise: { - if (algorithmMap.find(this->name) != algorithmMap.end()) { - I32 algo[7]; - Operator::getAlgorithmInfoFromMap(algorithmMap, this->name, algo, 7); - this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; - this->runInfo.best_w[0] = algo[1]; - this->runInfo.best_c[0] = algo[2]; - this->runInfo.best_k[0] = algo[3]; - this->runInfo.best_w[1] = algo[4]; - this->runInfo.best_c[1] = algo[5]; - this->runInfo.best_k[1] 
= algo[6]; - this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo[0]; - } else { - CHECK_STATUS(depthwise_convolution_infer_forward_algorithm(inputDesc, filterDesc, - this->outputTensors[0].get_desc(), - convDesc, policy, &(this->dwAlg), targetType, this->dwActivationDesc, this->pwActivationDesc, this->schedule, &this->oclExtInfo)); - I32 algo[7]; - algo[0] = this->runInfo.algorithm; - algo[1] = this->runInfo.best_w[0]; - algo[2] = this->runInfo.best_c[0]; - algo[3] = this->runInfo.best_k[0]; - algo[4] = this->runInfo.best_w[1]; - algo[5] = this->runInfo.best_c[1]; - algo[6] = this->runInfo.best_k[1]; - this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo[0]; - Operator::setAlgorithmInfoToMap(algorithmMap, this->name, algo, 7); - } - break; - } - case Convolution_Dilation: { - CHECK_STATUS(NOT_SUPPORTED); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - } - return SUCCESS; - } - - virtual EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - TensorDesc inDim = inDims[0]; - DataType idt; - DataFormat idf; - U32 in, ic, ih, iw; - CHECK_STATUS(tensor4dGet(inDim, &idt, &idf, &in, &ic, &ih, &iw)); - this->numChannels = ic; - - TensorDesc filterDim = tensor4df(this->dt, DF_NCHW, this->numFilters, this->numChannels, this->kernelSizeH, - this->kernelSizeW); - - convDesc = Convolution::create_convDesc(this->strideH, this->strideW, this->paddingT, this->paddingB, - this->paddingL, this->paddingR, this->dilateH, this->dilateW); - - DataType targetType = DT_F16; // Default DT_F16 - if (DT_F16_8Q == this->dt && Convolution_Pointwise == this->convolutionType) { - targetType = DT_I8; - } - - U32 outBytes = 0; - this->oclExtInfo.maliInfo.gclmemInputDesc = NULL; - this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL; - switch (this->convolutionType) { - case Convolution_Pointwise: { - CHECK_STATUS(convolution_infer_output_size(inDim, filterDim, convDesc, &((*outDims)[0]), targetType, &outBytes, this->schedule, &this->oclExtInfo)); - break; - } - case Convolution_Depthwise: { - CHECK_STATUS(depthwise_convolution_infer_output_size(inDim, filterDim, convDesc, &((*outDims)[0]), targetType, &outBytes, - this->schedule, &this->oclExtInfo)); - break; - } - case Convolution_Depthwise_Pointwise: { - CHECK_STATUS(depthwise_convolution_infer_output_size(inDim, filterDim, convDesc, &((*outDims)[0]), targetType, &outBytes, - this->schedule, &this->oclExtInfo)); - break; - } - case Convolution_Dilation: { - return NOT_SUPPORTED; - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - } - return SUCCESS; - } - - virtual EE infer_gclmem_desc(Vec* gclmemInputDesc, Vec* gclmemOutputDesc) override - { - TensorDesc inputDesc = this->inputTensors[0].get_desc(); - TensorDesc filterDesc = this->weightTensors[0].get_desc(); - DataType targetType = DT_F16; // Default DT_F16 - if (DT_F16_8Q == this->dt && Convolution_Pointwise == this->convolutionType) { targetType = DT_I8;} - this->oclExtInfo.maliInfo.gclmemInputDesc = &((*gclmemInputDesc)[0]); - this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]); - switch (this->convolutionType) { - case Convolution_Pointwise: { - CHECK_STATUS(convolution_infer_output_size(inputDesc, filterDesc, convDesc, NULL, - targetType, NULL, this->schedule, &this->oclExtInfo)); - break; - } - case Convolution_Depthwise: { - CHECK_STATUS(depthwise_convolution_infer_output_size(inputDesc, filterDesc, convDesc, NULL, - targetType, NULL, this->schedule, &this->oclExtInfo)); - break; - } - case Convolution_Depthwise_Pointwise: { - 
CHECK_STATUS(depthwise_convolution_infer_output_size(inputDesc, filterDesc, convDesc, NULL, - targetType, NULL, this->schedule, &this->oclExtInfo)); - break; - } - case Convolution_Dilation: { - return NOT_SUPPORTED; - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - } - return SUCCESS; - - } - - virtual U32 infer_tmp_memory_size()override - { - TensorDesc inputDesc = (this->inputTensors[0]).get_desc(); - TensorDesc filterDesc = (this->weightTensors[0]).get_desc(); - TensorDesc outputDesc = (this->outputTensors[0]).get_desc(); - - U32 bytes = 0; - switch (this->convolutionType) { - case Convolution_Pointwise: { - CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputDesc, filterDesc, outputDesc, convDesc, this->pwAlg, &bytes, this->schedule, &this->oclExtInfo)); - break; - } - case Convolution_Depthwise: { - CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes(inputDesc, filterDesc, outputDesc, convDesc, this->dwAlg, &bytes, this->schedule, &this->oclExtInfo)); - break; - } - case Convolution_Depthwise_Pointwise: { - CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes(inputDesc, filterDesc, outputDesc, convDesc, this->dwAlg, &bytes, this->schedule, &this->oclExtInfo)); - break; - } - case Convolution_Dilation: { - CHECK_STATUS(NOT_SUPPORTED); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - } - return bytes; - } - - virtual GCLMemDesc infer_wtm_memory_size_mali()override - { - TensorDesc filterDesc = (this->weightTensors[0]).get_desc(); - U32 stride[3] = {0, 0, 0}; - U32 offset[3] = {0, 0, 0}; - GCLMemDesc tmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); - GCLMemDesc gclmemWtmDesc[2]; - gclmemWtmDesc[0] = tmpDesc; - gclmemWtmDesc[1] = tmpDesc; - U32 bytes = 0; - this->oclExtInfo.maliInfo.gclmemFilterDesc = gclmemWtmDesc; - switch (this->convolutionType) { - case Convolution_Pointwise: { - CHECK_STATUS(convolution_transform_filter_bytes(filterDesc, this->pwAlg, &bytes, this->schedule, &this->oclExtInfo)); - break; - } - case Convolution_Depthwise: { - CHECK_STATUS(depthwise_convolution_transform_filter_bytes(filterDesc, this->dwAlg, &bytes, this->schedule, &this->oclExtInfo)); - break; - } - case Convolution_Depthwise_Pointwise: { - CHECK_STATUS(depthwise_convolution_transform_filter_bytes(filterDesc, this->dwAlg, &bytes, this->schedule, &this->oclExtInfo)); - wtm_dp = std::shared_ptr(new Tensor(this->handle)); - OclMemory* wtmMem = (OclMemory*)wtm_dp->get_memory(); - wtmMem->set_mem_desc(gclmemWtmDesc[1]); - wtm_dp->alloc(); - if(this->dwAlg == DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM) { - bias_buf = std::shared_ptr(new Tensor(this->handle)); - DataType dt = this->biasTensors[1].get_desc().dt; - GCLMem_t biasImg = (GCLMem_t)this->biasTensors[1].get_val(); - GCLMem_t biasBuf = bias_buf->get_val(); - GCLMemDesc descImg = biasImg->desc; - U32 s1_img = descImg.stride[0]; - U32 s1 = (s1_img * 4 + 7) / 8 * 8; - U32 s2 = 1; - U32 s3 = 1; - U32 num = s1 * s2 * s3; - U32 byteSize = num * bytesOf(dt); - biasBuf->desc.stride[0] = s1; - biasBuf->desc.stride[1] = s2; - biasBuf->desc.stride[2] = s3; - biasBuf->desc.offset[0] = 0; - biasBuf->desc.offset[1] = 0; - biasBuf->desc.offset[2] = 0; - biasBuf->desc.memType = GCL_MEM_BUF; - biasBuf->desc.num = num; - biasBuf->desc.byteSize = byteSize; - biasBuf->desc.memFormat = DF_NHWC; - bias_buf->alloc(); - U32 region[3] = {s1_img, 1, 1}; - CHECK_STATUS(gcl_trans_memory(this->handle.get(), (void*)biasImg, (void*)biasBuf, region, DEVICE_IMG_TO_BUF, CL_TRUE)); - } - break; - } - case Convolution_Dilation: { - 
CHECK_STATUS(NOT_SUPPORTED); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - } - return gclmemWtmDesc[0]; - } - - virtual EE transform_filter()override - { - Tensor filterTensor = this->weightTensors[0]; - TensorDesc filterDesc = filterTensor.get_desc(); - GCLMem_t weightPtr = filterTensor.get_val(); - - TensorDesc wtmCpuDesc; - if (DT_F16_8Q == this->dt && Convolution_Pointwise == this->convolutionType && CONVOLUTION_ALGORITHM_WINOGRAD == this->pwAlg) { // int8 winograd - return NOT_SUPPORTED; - } else if (DT_F16_8Q == this->dt && Convolution_Pointwise == this->convolutionType) { // int8 tilegemm - return NOT_SUPPORTED; - } else { // All other cases - auto wtmDesc = this->infer_wtm_memory_size_mali(); - this->wtm = std::shared_ptr(new Tensor(this->handle)); - OclMemory* wtmMem = (OclMemory*)this->wtm->get_memory(); - wtmMem->set_mem_desc(wtmDesc); - this->wtm->alloc(); - - switch (this->convolutionType) { - case Convolution_Pointwise: { - CHECK_STATUS(convolution_transform_filter(filterDesc, weightPtr, this->pwAlg, &wtmCpuDesc, this->get_wtm()->get_val(), this->temp->get_val(), this->schedule, &this->oclExtInfo)); - break; - } - case Convolution_Depthwise: { - CHECK_STATUS(depthwise_convolution_transform_filter(filterDesc, weightPtr, this->dwAlg, &wtmCpuDesc, this->get_wtm()->get_val(), - this->schedule, &this->oclExtInfo)); - break; - } - case Convolution_Depthwise_Pointwise: { - GCLMem weightPtrDp[2]; - weightPtrDp[0] = *((GCLMem_t)this->weightTensors[0].get_val()); - weightPtrDp[1] = *((GCLMem_t)this->weightTensors[1].get_val()); - GCLMem weightPtrTranDp[2]; - weightPtrTranDp[0] = *((GCLMem_t)(this->get_wtm()->get_val())); - weightPtrTranDp[1] = *((GCLMem_t)(wtm_dp->get_val())); - CHECK_STATUS(depthwise_convolution_transform_filter(filterDesc, weightPtrDp, this->dwAlg, &wtmCpuDesc, weightPtrTranDp, - this->schedule, &this->oclExtInfo)); - this->weightTensors[1] = *wtm_dp.get(); - break; - } - case Convolution_Dilation: { - CHECK_STATUS(NOT_SUPPORTED); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - } - } - - this->get_wtm()->set_desc(wtmCpuDesc); - this->weightTensors[0] = *this->get_wtm(); - return SUCCESS; - } - - private: - std::shared_ptr wtm_dp; - std::shared_ptr bias_buf; - ConvolutionDesc convDesc; -}; - -#endif //_CONVELTWISEPOOLING_H diff --git a/inference/include/ocl/depth2space_ocl.hpp b/inference/include/ocl/depth2space_ocl.hpp deleted file mode 100644 index 535fc150..00000000 --- a/inference/include/ocl/depth2space_ocl.hpp +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -/** - * Project deploy - */ - - -#ifndef _DEPTH2SPACE_OCL_H -#define _DEPTH2SPACE_OCL_H - -#include "operator.hpp" -#include "depth2space.hpp" - -class Depth2SpaceOCL: public Depth2Space { -public: - Depth2SpaceOCL(DataType dt) : Depth2Space(dt) { } - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - this->handle->curOpName = this->get_op_name(); - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - - CHECK_STATUS(depth2space(inputDesc, inputTensor.get_val(), outputDesc, outputTensor.get_val(), this->schedule, &this->oclExtInfo)); - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - this->oclExtInfo.maliInfo.gclmemInputDesc = NULL; - this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL; - CHECK_STATUS(depth2space_infer_output_size(inDims[0], &((*outDims)[0]), this->schedule, &this->oclExtInfo)); - return SUCCESS; - } - - virtual EE infer_gclmem_desc(Vec* gclmemInputDesc, Vec* gclmemOutputDesc) override - { - TensorDesc inputDesc = this->inputTensors[0].get_desc(); - this->oclExtInfo.maliInfo.gclmemInputDesc = &((*gclmemInputDesc)[0]); - this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]); - CHECK_STATUS(depth2space_infer_output_size(inputDesc, NULL, this->schedule, &this->oclExtInfo)); - return SUCCESS; - } -}; - -#endif //_DEPTH2SPACE_OCL_H diff --git a/inference/include/ocl/eltwise_ocl.hpp b/inference/include/ocl/eltwise_ocl.hpp deleted file mode 100644 index 6818a05c..00000000 --- a/inference/include/ocl/eltwise_ocl.hpp +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
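// Editor's note on the two-phase protocol these OCL operators share (visible
// in clip and depth2space above, eltwise below): infer_output_tensors_size()
// calls the kernel's infer routine with the gclmem descriptor pointers nulled
// to get host-side shapes only, and infer_gclmem_desc() calls the same
// routine again with real descriptor pointers and a null shape pointer to
// fill the GPU layout. A hedged sketch with local stand-in types.
#include <cassert>
#include <cstddef>

struct Shape { int dims[4]; };
struct MemLayout { int stride[3]; };

// One underlying query serving both phases, as the *_infer_output_size
// functions above appear to do.
static int inferClip(const Shape& in, Shape* outShape, MemLayout* inLayout, MemLayout* outLayout)
{
    if (outShape) *outShape = in;                 // phase 1: shapes only
    if (inLayout && outLayout) {                  // phase 2: layouts only
        *inLayout = MemLayout{{in.dims[0], in.dims[1], in.dims[2]}};
        *outLayout = *inLayout;
    }
    return 0;
}

int main()
{
    Shape in{{8, 4, 2, 1}}, out{};
    assert(inferClip(in, &out, nullptr, nullptr) == 0); // phase 1
    MemLayout li{}, lo{};
    assert(inferClip(in, nullptr, &li, &lo) == 0);      // phase 2
    return 0;
}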
- - -/** - * Project deploy - */ - - -#ifndef _ELTWISE_OCL_H -#define _ELTWISE_OCL_H - -#include "operator.hpp" -#include "eltwise.hpp" - -class EltwiseOCL: public Eltwise { -public: - EltwiseOCL(EltwiseMode eltMode, I32 coeffSize, F32* coeffValues):Eltwise(eltMode, coeffSize, coeffValues){} - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - this->handle->curOpName = this->get_op_name(); - Vec inputDesc; - Vec inputPtr; - for (Tensor tensorIn: this->inputTensors) { - inputDesc.push_back(tensorIn.get_desc()); - inputPtr.push_back((void*)tensorIn.get_val()); - } - auto outputDesc = this->outputTensors[0].get_desc(); - auto outputPtr = this->outputTensors[0].get_val(); - if (inputDesc.size() == 2 - && inputDesc[1].nDims == 2 - && inputDesc[0].dims[inputDesc[0].nDims-1] == inputDesc[1].dims[0]) { - CHECK_STATUS(NOT_SUPPORTED); - //CHECK_STATUS(scale(this->inputTensors[1].get_val(), nullptr, outputDesc, this->inputTensors[0].get_val(), this->schedule)); - //memcpy(outputPtr, this->inputTensors[0].get_val(), tensorNumBytes(outputDesc)); - } else { - CHECK_STATUS(eltwise(inputDesc, inputPtr, outputDesc, outputPtr, this->eltMode, this->schedule, &this->oclExtInfo)); - } - - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - this->oclExtInfo.maliInfo.gclmemInputDesc = NULL; - this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL; - CHECK_STATUS(eltwise_infer_output_size(inDims, &((*outDims)[0]), this->schedule, &this->oclExtInfo)); - return SUCCESS; - } - - virtual EE infer_gclmem_desc(Vec* gclmemInputDesc, Vec* gclmemOutputDesc) override - { - Vec inputDesc; - for (Tensor tensorIn: this->inputTensors) inputDesc.push_back(tensorIn.get_desc()); - GCLMemDesc memInDesc[8]; - U32 inputNum = gclmemInputDesc->size(); - if(inputNum > 8) CHECK_STATUS(NOT_SUPPORTED); - for(U32 i = 0; i < inputNum; i++) memInDesc[i] = (*gclmemInputDesc)[i]; - this->oclExtInfo.maliInfo.gclmemInputDesc = memInDesc; - this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]); - CHECK_STATUS(eltwise_infer_output_size(inputDesc, NULL, this->schedule, &this->oclExtInfo)); - U32 num = (*gclmemInputDesc).size(); - for(U32 i = 0; i < num; i++) (*gclmemInputDesc)[i] = memInDesc[i]; - return SUCCESS; - } -}; - -#endif //_ELTWISE_OCL_H diff --git a/inference/include/ocl/embedding_ocl.hpp b/inference/include/ocl/embedding_ocl.hpp deleted file mode 100644 index 7067c5d9..00000000 --- a/inference/include/ocl/embedding_ocl.hpp +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _EMBEDDING_OCL_H -#define _EMBEDDING_OCL_H -#include "weight_operator.hpp" -#include "tensor_computing.h" -#include "embedding.hpp" - -class EmbeddingOCL: public Embedding { -public: - EmbeddingOCL(DataType dt, U32 inputDim, U32 numOutput, bool transpose) : - Embedding(dt, inputDim, numOutput, transpose) { } - - void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - this->handle->curOpName = this->get_op_name(); - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - Tensor weightTensor; - if (this->weightTensors.size() > 0) - weightTensor = this->weightTensors[0]; - else - weightTensor = this->inputTensors[1]; - TensorDesc weightDesc = weightTensor.get_desc(); - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - CHECK_STATUS(embedding(inputDesc, inputTensor.get_val(), - weightDesc, weightTensor.get_val(), - outputDesc, outputTensor.get_val(), - this->inputDim, this->numOutput, - this->transpose, this->dt, - this->schedule, &this->oclExtInfo)); - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - TensorDesc inputDesc = inDims[0]; - this->oclExtInfo.maliInfo.gclmemInputDesc = NULL; - this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL; - CHECK_STATUS(embedding_infer_output_size(inputDesc, &((*outDims)[0]), this->inputDim, this->numOutput, this->dt, this->schedule, &this->oclExtInfo)); - return SUCCESS; - } - - virtual EE infer_gclmem_desc(Vec* gclmemInputDesc, Vec* gclmemOutputDesc) override - { - TensorDesc inputDesc = this->inputTensors[0].get_desc(); - this->oclExtInfo.maliInfo.gclmemInputDesc = &((*gclmemInputDesc)[0]); - this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]); - CHECK_STATUS(embedding_infer_output_size(inputDesc, NULL, this->inputDim, this->numOutput, this->dt, this->schedule, &this->oclExtInfo)); - return SUCCESS; - } - - EE init_weight_bias_from_model(U8** modelPtr) override - { - TensorDesc weightDesc; - if (transpose) - weightDesc = tensor2df(this->dt, DF_TRANSPOSE, this->numOutput, this->inputDim); - else - weightDesc = tensor2df(this->dt, DF_NORMAL, this->inputDim, this->numOutput); - std::shared_ptr modelWeightTensor(new Tensor(this->handle)); - modelWeightTensor->set_desc(weightDesc); - - GCLMem_t weightMem = modelWeightTensor->get_val(); - U32 s0, s1, s2; - U32 num, bytes; - s0 = weightDesc.dims[0]; - s1 = weightDesc.dims[1]; - s2 = 1; - num = s0 * s1 * s2; - bytes = num * bytesOf(this->dt); - weightMem->desc.stride[0] = s0; - weightMem->desc.stride[1] = s1; - weightMem->desc.stride[2] = s2; - weightMem->desc.offset[0] = 0; - weightMem->desc.offset[1] = 0; - weightMem->desc.offset[2] = 0; - weightMem->desc.memType = GCL_MEM_BUF; - weightMem->desc.memFormat = DF_NORMAL; - weightMem->desc.num = num; - weightMem->desc.byteSize = bytes; - weightMem->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; - - - bool set_ptr = false; - auto curOpWs = this->get_weightspec_ptr(); - if(modelPtr != nullptr){ - weightMem->desc.host_ptr = *modelPtr; - *modelPtr += tensorNumBytes(weightDesc); - set_ptr = true; - } else { - if (curOpWs.weight != nullptr) { - weightMem->desc.host_ptr = curOpWs.weight; - set_ptr = true; - } - } - 
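// Editor's note: the weight-source selection pattern above (and in the other
// init_weight_bias_from_model() methods) reads weights from a serialized
// model stream when a cursor is supplied, advancing it past the blob, and
// otherwise falls back to the preloaded weight spec. A hedged sketch; the
// function and parameter names are illustrative stand-ins.
#include <cstdint>
#include <cstddef>
#include <cassert>

using U8 = uint8_t;

// Returns the weight pointer to bind; advances *modelPtr when reading from
// the stream. May return nullptr when neither source provides weights.
static const U8* selectWeightSource(U8** modelPtr, const U8* preloaded, size_t weightBytes)
{
    if (modelPtr != nullptr) {
        const U8* w = *modelPtr;
        *modelPtr += weightBytes; // cursor now points at the next blob
        return w;
    }
    return preloaded;
}

int main()
{
    U8 stream[8] = {0};
    U8* cursor = stream;
    const U8* w = selectWeightSource(&cursor, nullptr, sizeof(stream));
    assert(w == stream && cursor == stream + sizeof(stream));
    return 0;
}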
if(set_ptr) { - modelWeightTensor->alloc(); - this->weightTensors.push_back(*modelWeightTensor.get()); - if(curOpWs.weight) delete curOpWs.weight; - } - return SUCCESS; - } -}; - -#endif //_EMBEDDING_OCL_H diff --git a/inference/include/ocl/factory_ocl.hpp b/inference/include/ocl/factory_ocl.hpp deleted file mode 100644 index ece12ded..00000000 --- a/inference/include/ocl/factory_ocl.hpp +++ /dev/null @@ -1,310 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _FACTORY_OCL_H -#define _FACTORY_OCL_H -#include "operator.hpp" -/* -#include "deconvolution.hpp" -#include "activation.hpp" -#include "scale.hpp" -#include "lstm.hpp" -#include "lstmcell.hpp" -#include "resize.hpp" -#include "attention.hpp" -#include "squeeze.hpp" -#include "unsqueeze.hpp" -#include "reduction.hpp" -#include "argmax.hpp" -#include "check.hpp" -#include "repeat.hpp" -#include "preallocated_memory.hpp" -#include "shared_weight.hpp" -#include "copy.hpp" -*/ -#include "ocl/bilateral_slice_apply_ocl.hpp" -#include "ocl/pooling_ocl.hpp" -#include "ocl/convolution_ocl.hpp" -#include "ocl/eltwise_ocl.hpp" -#include "ocl/softmax_ocl.hpp" -#include "ocl/activation_ocl.hpp" -#include "ocl/fully_connected_ocl.hpp" -#include "ocl/scale_ocl.hpp" -#include "ocl/concat_ocl.hpp" -#include "ocl/clip_ocl.hpp" -#include "ocl/squeeze_ocl.hpp" -#include "ocl/reshape_ocl.hpp" -#include "ocl/space2depth_ocl.hpp" -#include "ocl/depth2space_ocl.hpp" -#include "ocl/embedding_ocl.hpp" -#include "ocl/layer_norm_ocl.hpp" -#include "ocl/matmul_ocl.hpp" -#include "ocl/multiply_ocl.hpp" -#include "ocl/transpose_ocl.hpp" -#include "ocl/slice_ocl.hpp" - -class FactoryOCL: public Factory { -public: - virtual std::shared_ptr createConvolution(DataType dt, U32 nf, U32 ksizeH, U32 ksizeW, U32 kstrideH, U32 kstrideW, - U32 kpaddingT, U32 kpaddingB, U32 kpaddingL, U32 kpaddingR, - ActivationDesc dwActivationDesc, ActivationDesc pwActivationDesc, - ConvolutionMode convolutionType, U32 group, U32 dilateH, U32 dilateW) override { - auto cep = (Convolution*)(new ConvolutionOCL(dt, nf, ksizeH, ksizeW, kstrideH, kstrideW, - kpaddingT, kpaddingB, kpaddingL, kpaddingR, - dwActivationDesc, pwActivationDesc, - convolutionType, group, dilateH, dilateW)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createDeconvolution(DataType dt, U32 nf, U32 ksizeH, U32 ksizeW, U32 kstrideH, U32 kstrideW, - U32 kpaddingT, U32 kpaddingB, U32 kpaddingL, U32 kpaddingR, - 
ActivationDesc dwActivationDesc, ActivationDesc pwActivationDesc, - ConvolutionMode convolutionType, U32 group, U32 dilateH, U32 dilateW) override { - /* - //auto cep = new DeconvolutionOCL(dt, nf, ksizeH, ksizeW, kstrideH, kstrideW, - kpaddingT, kpaddingB, kpaddingL, kpaddingR, - dwActivationDesc, pwActivationDesc, - convolutionType, group, dilateH, dilateW); - */ - OP_UNSUP(16, dt, nf, ksizeH, ksizeW, kstrideH, kstrideW, - kpaddingT, kpaddingB, kpaddingL, kpaddingR, - dwActivationDesc, pwActivationDesc, - convolutionType, group, dilateH, dilateW); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createPooling(PoolingMode mode, U32 ksH, U32 ksW, U32 strideH, U32 strideW, - U32 paddingT, U32 paddingB, U32 paddingL, U32 paddingR, RoundMode rm) override { - auto cep = (Pooling*)(new PoolingOCL(mode, ksH, ksW, strideH, strideW, paddingT, paddingB, paddingL, paddingR, rm)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createFullyConnected(DataType dt, U32 numInput, U32 numOutput, - U32 numSlice, I32* slicePoint) override { - auto cep = (FullyConnectedOCL*)(new FullyConnectedOCL(dt, - numInput, numOutput, numSlice, slicePoint)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createSoftmax(DataType dt, int axis) override { - auto cep = new SoftmaxOCL(dt, axis); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createConcat(int axis) override { - auto cep = (Concat*)(new ConcatOCL(axis)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createActivation(ActivationDesc activeDesc) override { - auto cep = (Activation*) new ActivationOCL(activeDesc); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createEltwise(EltwiseMode eltMode, I32 coeffSize, F32* coeffValues) override { - auto cep = (Eltwise*)new EltwiseOCL(eltMode, coeffSize, coeffValues); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createScale(DataType dt, int axis, int numChannels, int numSource) override { - auto cep = (Scale*)(new ScaleOCL(dt, axis, numChannels, numSource)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createLSTM(DataType dt, U32 numOutput, U32 numProjection, - F32 zoneoutCell, F32 zoneoutOutput, bool biDirection) override - { - OP_UNSUP(6, dt, numOutput, numProjection, zoneoutCell, zoneoutOutput, biDirection); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createLSTMCell(DataType dt, U32 numOutput, U32 numProjection, - F32 zoneoutCell, F32 zoneoutOutput, bool biDirection) override - { - OP_UNSUP(6, dt, numOutput, numProjection, zoneoutCell, zoneoutOutput, biDirection); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createLSTM(DataType dt, U32 numOutput, U32 numProjection, - F32 zoneoutCell, F32 zoneoutOutput, I32 steps) override { - if (steps == -2) - return FactoryOCL::createLSTM(dt, numOutput, numProjection, zoneoutCell, zoneoutOutput, true); - if (steps >= 0) - return FactoryOCL::createLSTM(dt, numOutput, numProjection, zoneoutCell, zoneoutOutput, false); - else - return FactoryOCL::createLSTMCell(dt, numOutput, numProjection, zoneoutCell, zoneoutOutput, false); - } - - virtual std::shared_ptr createEmbedding(DataType dt, U32 inputDim, U32 numOutput, bool transpose) override { - auto cep = (Embedding*)new EmbeddingOCL(dt, inputDim, numOutput, transpose); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createMultiply(DataType dt, F32 scale, F32 bias) override { - auto cep = (Multiply*)new MultiplyOCL(dt, scale, bias); - return std::shared_ptr(cep); 
- } - - virtual std::shared_ptr createMatMul(DataType dt, bool transposeA, bool transposeB) override { - auto cep = (MatMul*)(new MatMulOCL(dt, transposeA, transposeB)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createLayerNorm(DataType dt, U32 weightNum) override { - auto cep = (LayerNorm*) new LayerNormOCL(dt, weightNum); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createReshape(DataType dt, I32* shapeDims, I32 shapeSize, I32 axis, I32 numAxes) override { - auto cep = (Reshape*)(new ReshapeOCL(dt, shapeDims, shapeSize, axis, numAxes)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createResize(DataType paramDT, void* paramPtr) override { - //auto cep = new Resize(paramDT, paramPtr); - OP_UNSUP(2, paramDT, paramPtr); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createSlice(DataType dt, I32 axis, I32* slicePoints, U32 sliceSize) override { - auto cep = (Slice*)(new SliceOCL(dt, axis, slicePoints, sliceSize)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createTranspose(DataType dt, U32* transDims, U32 transSize) override { - auto cep = (Transpose*)(new TransposeOCL(dt, transDims, transSize)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createAttention(DataType dt, U32 numHeads, U32 fromSequenceLength, - U32 toSequenceLength) override { - //auto cep = new AttentionOCL(dt, numHeads, fromSequenceLength, toSequenceLength); - OP_UNSUP(4, dt, numHeads, fromSequenceLength, toSequenceLength); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createClip(DataType dt, F32 clipMinScalar, F32 clipMaxScalar) override { - auto cep = (Clip*)(new ClipOCL(dt, clipMinScalar, clipMaxScalar)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createSqueeze(DataType dt, I32 axis, I32 *dims, I32 dimSize) override { - auto cep = (Squeeze*)(new SqueezeOCL(dt, axis, dims, dimSize)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createUnsqueeze(DataType dt, I32 axis, I32 *dims, I32 dimSize) - override { - //auto cep = new UnsqueezeOCL(dt, axis, dims, dimSize); - OP_UNSUP(4, dt, axis, dims, dimSize); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createReduction(DataType dt, I32 axis, bool keepDim, ReductionMode reductionMode, float coeff) override { - OP_UNSUP(5, dt, axis, keepDim, reductionMode, coeff); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createArgMax(DataType dt, I32 axis) override { - //auto cep = new ArgMaxOCL(dt, axis); - OP_UNSUP(2, dt, axis); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createCopy(DataType dt, I32 *srcDims, I32 *dstDims, I32 length) override { - //auto cep = new CopyOCL(dt, srcDims, dstDims, length); - OP_UNSUP(4, dt, srcDims, dstDims, length); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createCheck(DataType dt, CheckMode checkMode) override { - //auto cep = new CheckOCL(dt, checkMode); - OP_UNSUP(2, dt, checkMode); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createRepeat(DataType dt, I32 loops, I32 axis, - I32 jumpOperatorIndex, I32 currentOperatorIndex) override { - //auto cep = new RepeatOCL(dt, loops, jumpOperatorIndex, currentOperatorIndex); - OP_UNSUP(5, dt, loops, axis, jumpOperatorIndex, currentOperatorIndex); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createBilateralSliceApply(U32 coefficiency_len, bool has_offset, - BilateralSliceApplyMode mode) override { - auto cep = (BilateralSliceApply*)(new 
BilateralSliceApplyOCL(coefficiency_len, has_offset, mode)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createPreAllocatedMemory(DataType dt, TensorDesc desc) override { - //auto cep = new PreAllocatedMemoryOCL(dt, desc); - OP_UNSUP(2, dt, desc) - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createSharedWeight(DataType dt, TensorDesc desc) override { - //auto cep = new SharedWeightOCL(dt, desc); - OP_UNSUP(2, dt, desc) - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createJump(DataType dt, I32 jumpOperatorIndex, I32 currentOperatorIndex) override { - OP_UNSUP(3, dt, jumpOperatorIndex, currentOperatorIndex); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createSpace2Depth(DataType dt) override { - auto cep = (Space2Depth*)(new Space2DepthOCL(dt)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createDepth2Space(DataType dt) override { - auto cep = (Space2Depth*)(new Depth2SpaceOCL(dt)); - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createAttentionMask(DataType dt, I32 attentionLength, - bool sameLength, float mask) override { - OP_UNSUP(4, dt, attentionLength, sameLength, mask) - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createRelativePositionEmbedding(DataType dt, U32 inputDim, - U32 numOutput, bool transpose, I32 axis) override { - OP_UNSUP(5, dt, inputDim, numOutput, transpose, axis) - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createRelativeShift(DataType dt, I32 axis, - I32 shiftLength) override { - OP_UNSUP(3, dt, axis, shiftLength) - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createPadding(DataType dt, PadDesc padDesc) override { - OP_UNSUP(2, dt, padDesc) - return std::shared_ptr(cep); - } - - virtual std::shared_ptr createPriorBox(DataType dt, PriorBoxDesc priorboxDesc) override { - OP_UNSUP(2, dt, priorboxDesc); - return std::shared_ptr(cep); - } - virtual std::shared_ptr createDetectionOutput(DataType dt, DetectionOutputDesc detectionoutputDesc) override { - OP_UNSUP(2, dt, detectionoutputDesc); - return std::shared_ptr(cep); - } -}; - -#endif //_FACTORY_OCL_H diff --git a/inference/include/ocl/fully_connected_ocl.hpp b/inference/include/ocl/fully_connected_ocl.hpp deleted file mode 100644 index 66240491..00000000 --- a/inference/include/ocl/fully_connected_ocl.hpp +++ /dev/null @@ -1,339 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
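// Editor's note on the factory convention in factory_ocl.hpp above: each
// create* method either wires up a concrete OCL operator or, where no GPU
// kernel exists yet, builds a placeholder in the spirit of the OP_UNSUP
// macro, so graph construction succeeds and failure surfaces only if the
// stub is actually executed. A hedged sketch; all names are stand-ins and
// the real macro's behavior is inferred, not quoted.
#include <memory>
#include <stdexcept>

struct Operator {
    virtual ~Operator() = default;
    virtual void run() = 0;
};

struct ClipOp : Operator {
    void run() override { /* would launch the clip kernel */ }
};

struct UnsupportedOp : Operator {
    void run() override { throw std::runtime_error("operator not supported on OCL"); }
};

struct OclFactory {
    std::shared_ptr<Operator> createClip() { return std::make_shared<ClipOp>(); }
    std::shared_ptr<Operator> createArgMax() { return std::make_shared<UnsupportedOp>(); }
};

int main()
{
    OclFactory f;
    f.createClip()->run();      // supported: runs
    // f.createArgMax()->run(); // would throw: no OCL ArgMax kernel
    return 0;
}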
- - -/** - * Project deploy - */ -#ifndef _FCELTWISE_OCL_H -#define _FCELTWISE_OCL_H - -#include "weight_operator.hpp" -#include "tensor_computing.h" -#include "fully_connected.hpp" - -class FullyConnectedOCL: public FullyConnected { -public: - FullyConnectedOCL(DataType dt, U32 numInput, U32 numOutput, U32 numSlice, - I32 *slicePoints): - FullyConnected(dt, numInput, numOutput, numSlice, slicePoints) { } - - virtual EE init_weight_bias_from_model(U8** modelPtr) override - { - auto curOpWs = this->get_weightspec_ptr(); - if(modelPtr == nullptr){ - this->numInput = curOpWs.bytes_of_weight / this->numOutput / UNI_MAX(1, bytesOf(curOpWs.mdt)); - } - TensorDesc weightDesc = tensor2df(this->dt, DF_NORMAL, this->numOutput, this->numInput); - TensorDesc biasDesc = tensor1d(this->dt, this->numOutput); - - std::shared_ptr modelWeightTensor(new Tensor(this->handle)); - std::shared_ptr modelBiasTensor(new Tensor(this->handle)); - modelWeightTensor->set_desc(weightDesc); - modelBiasTensor->set_desc(biasDesc); - GCLMem_t weightMem = modelWeightTensor->get_val(); - U32 s0, s1, s2; - U32 num, bytes; - s0 = this->numInput; - s1 = this->numOutput; - s2 = 1; - num = s0 * s1 * s2; - bytes = num * bytesOf(this->dt); - weightMem->desc.stride[0] = s0; - weightMem->desc.stride[1] = s1; - weightMem->desc.stride[2] = s2; - weightMem->desc.offset[0] = 0; - weightMem->desc.offset[1] = 0; - weightMem->desc.offset[2] = 0; - weightMem->desc.memType = GCL_MEM_BUF; - weightMem->desc.memFormat = DF_NORMAL; - weightMem->desc.num = num; - weightMem->desc.byteSize = bytes; - weightMem->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; - - GCLMem_t biasMem = modelBiasTensor->get_val(); - biasMem->desc.stride[0] = (this->numOutput + 3) / 4 * 4; - biasMem->desc.stride[1] = 1; - biasMem->desc.stride[2] = 1; - biasMem->desc.offset[0] = 0; - biasMem->desc.offset[1] = 0; - biasMem->desc.memType = GCL_MEM_BUF; - biasMem->desc.memFormat = DF_NCHW; - biasMem->desc.num = (this->numOutput + 3) / 4 * 4; - biasMem->desc.byteSize = (this->numOutput + 3) / 4 * 4 * bytesOf(this->dt); - - if(modelPtr != nullptr){ - weightMem->desc.host_ptr = *modelPtr; - *modelPtr += tensorNumBytes(weightDesc); - } else { - weightMem->desc.host_ptr = curOpWs.weight; - } - - U8* biasVal = nullptr; - U8* biasTmp = nullptr; - if(modelPtr != nullptr){ - if(this->hasBias){ - biasVal = *modelPtr; - *modelPtr += tensorNumBytes(biasDesc); - } - } else { - if(this->hasBias) biasVal = curOpWs.vec; - } - - if(biasVal){ - biasMem->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; - if((this->numOutput & 3) == 0){ - biasMem->desc.host_ptr = biasVal; - } else { - biasTmp = (U8*)operator new(biasMem->desc.byteSize); - memset(biasTmp, 0, biasMem->desc.byteSize); - memcpy(biasTmp, biasVal, this->numOutput * bytesOf(this->dt)); - biasMem->desc.host_ptr = biasTmp; - } - } else { - biasMem->desc.host_ptr = nullptr; - biasMem->desc.flags = CL_MEM_READ_WRITE; - } - - modelWeightTensor->alloc(); - modelBiasTensor->alloc(); - this->weightTensors.push_back(*modelWeightTensor.get()); - this->biasTensors.push_back(*modelBiasTensor.get()); - if(biasTmp) delete biasTmp; - if(curOpWs.weight) delete curOpWs.weight; - if(curOpWs.vec) delete curOpWs.vec; - return SUCCESS; - } - - virtual EE infer_forward_algorithm(HashMap &algorithmMap) override - { - TensorDesc inputDesc = (this->inputTensors[0]).get_desc(); - std::vector outputDescs; - this->oclExtInfo.maliInfo.forwardRunInfo->algorithm = CONVOLUTION_ALGORITHM_NULL; - for(U32 i = 0; i < this->outputTensors.size(); ++i) 
-        outputDescs.push_back(this->outputTensors[i].get_desc());
-        if (algorithmMap.find(this->name) != algorithmMap.end()) {
-            I32 algo[4];
-            Operator::getAlgorithmInfoFromMap(algorithmMap, this->name, algo, 4);
-            this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0];
-            this->runInfo.best_w[0] = algo[1];
-            this->runInfo.best_c[0] = algo[2];
-            this->runInfo.best_k[0] = algo[3];
-        } else {
-            CHECK_STATUS(fully_connected_infer_forward_algorithm(inputDesc, filterDesc4D, outputDescs, this->schedule, &this->oclExtInfo));
-            I32 algo[4];
-            algo[0] = this->runInfo.algorithm;
-            algo[1] = this->runInfo.best_w[0];
-            algo[2] = this->runInfo.best_c[0];
-            algo[3] = this->runInfo.best_k[0];
-            Operator::setAlgorithmInfoToMap(algorithmMap, this->name, algo, 4);
-        }
-        return SUCCESS;
-    }
-
-    virtual void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-        this->handle->curOpName = this->get_op_name();
-        Tensor inputTensor = this->inputTensors[0];
-        TensorDesc inputDesc = inputTensor.get_desc();
-        TensorDesc weightDesc = this->weightTensors[0].get_desc();
-        TensorDesc biasDesc = this->biasTensors[0].get_desc();
-        TensorDesc outputDesc = this->outputTensors[0].get_desc();
-        std::vector<GCLMem_t> outputGCLMemArray;
-        for(U32 i = 0; i < numSlice; i++) outputGCLMemArray.push_back(this->outputTensors[i].get_val());
-
-        Tensor outputTensor = this->outputTensors[0];
-
-        CHECK_STATUS(fully_connected(inputDesc, inputTensor.get_val(),
-            weightDesc, &wtmGCLMemArray,
-            this->temp->get_val(), this->lenOfTemp,
-            outputDesc, &outputGCLMemArray,
-            biasDesc, &biasGCLMemArray, this->schedule, &this->oclExtInfo));
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    virtual EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        TensorDesc inputDesc = inDims[0];
-        U32 ic, ih, iw;
-        if(inputDesc.df == DF_NCHW) tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, &ih, &iw);
-        if(inputDesc.df == DF_MKT) {
-            iw = 1;
-            ih = 1;
-            ic = inputDesc.dims[1];
-        }
-        filterDesc4D = tensor4df(this->dt, DF_NCHW, this->numOutput, ic, ih, iw);
-        this->oclExtInfo.maliInfo.gclmemInputDesc = NULL;
-        this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL;
-        CHECK_STATUS(fully_connected_infer_output_size(inputDesc, filterDesc4D, &((*outDims)[0]), this->schedule, &this->oclExtInfo));
-        if(this->numSlice > 1) {
-            for(U32 i = 0; i < this->numSlice; ++i) {
-                (*outDims)[i] = (*outDims)[0];
-                (*outDims)[i].dims[1] = this->slicePoints[i];
-            }
-        }
-        return SUCCESS;
-    }
-
-    virtual EE infer_gclmem_desc(Vec<GCLMemDesc>* gclmemInputDesc, Vec<GCLMemDesc>* gclmemOutputDesc) override
-    {
-        TensorDesc inputDesc = this->inputTensors[0].get_desc();
-        this->oclExtInfo.maliInfo.gclmemInputDesc = &((*gclmemInputDesc)[0]);
-        this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]);
-        CHECK_STATUS(fully_connected_infer_output_size(inputDesc, filterDesc4D, NULL, this->schedule, &this->oclExtInfo));
-        if(this->numSlice > 1) {
-            U32 h_str = (*gclmemOutputDesc)[0].stride[0];
-            U32 w_str = (*gclmemOutputDesc)[0].stride[1];
-            U32 c_str = (this->slicePoints[0] + 3) / 4;
-            U32 num = h_str * w_str * c_str * 4;
-            (*gclmemOutputDesc)[0].stride[2] = c_str;
-            (*gclmemOutputDesc)[0].num = num;
-            (*gclmemOutputDesc)[0].byteSize = num * bytesOf(this->dt);
-            for(U32 i = 1; i < this->numSlice; ++i) {
-                (*gclmemOutputDesc)[i] = (*gclmemOutputDesc)[0];
-                c_str = (this->slicePoints[i] + 3) / 4;
-                num = h_str * w_str * c_str * 4;
-                (*gclmemOutputDesc)[i].stride[2] = c_str;
-                (*gclmemOutputDesc)[i].num = num;
-                (*gclmemOutputDesc)[i].byteSize = num * bytesOf(this->dt);
-            }
-        }
-        return SUCCESS;
-    }
-
-    virtual U32 infer_tmp_memory_size() override
-    {
-        TensorDesc inputDesc = this->inputTensors[0].get_desc();
-        U32 bytes = 0;
-        CHECK_STATUS(fully_connected_infer_forward_tmp_bytes(inputDesc, filterDesc4D, &bytes, this->schedule, &this->oclExtInfo));
-        return bytes;
-    }
-
-    virtual GCLMemDesc infer_wtm_memory_size_mali() override
-    {
-        U32 stride[3] = {0, 0, 0};
-        U32 offset[3] = {0, 0, 0};
-        GCLMemDesc gclmemWtmDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
-        U32 bytes = 0;
-        this->oclExtInfo.maliInfo.gclmemFilterDesc = &gclmemWtmDesc;
-        CHECK_STATUS(fully_connected_transform_filter_bytes(filterDesc4D, &bytes, this->schedule, &this->oclExtInfo));
-        return gclmemWtmDesc;
-    }
-
-    virtual EE transform_filter() override
-    {
-        TensorDesc inputDesc = this->inputTensors[0].get_desc();
-        auto wtmDesc = this->infer_wtm_memory_size_mali();
-        if(this->numSlice == 1) {
-            this->wtm = std::shared_ptr<Tensor>(new Tensor(this->handle));
-            OclMemory* wtmMem = (OclMemory*)this->wtm->get_memory();
-            wtmMem->set_mem_desc(wtmDesc);
-            this->wtm->alloc();
-            wtmGCLMemArray.push_back((GCLMem_t)(this->get_wtm()->get_val()));
-        } else {
-            U32 item_c = this->oclExtInfo.maliInfo.forwardRunInfo->best_c[0];
-            U32 item_k = this->oclExtInfo.maliInfo.forwardRunInfo->best_k[0];
-            for(U32 i = 0; i < this->numSlice; i++) {
-                GCLMemDesc tmpDesc = wtmDesc;
-                U32 s0 = wtmDesc.stride[0];
-                U32 s1 = wtmDesc.stride[1];
-                U32 s2 = (this->slicePoints[i] + item_k - 1) / item_k;
-                U32 num = s0 * s1 * s2 * item_c * item_k / (item_k >> 2);
-                tmpDesc.stride[2] = s2;
-                tmpDesc.num = num;
-                tmpDesc.byteSize = num * bytesOf(this->dt);
-                auto tmpWtm = std::shared_ptr<Tensor>(new Tensor(this->handle));
-                auto tmpMem = (OclMemory*)tmpWtm->get_memory();
-                tmpMem->set_mem_desc(tmpDesc);
-                tmpWtm->alloc();
-                if(i == 0) {
-                    this->wtm = tmpWtm;
-                } else {
-                    wtmArray.push_back(tmpWtm);
-                }
-                wtmGCLMemArray.push_back((GCLMem_t)tmpWtm->get_val());
-            }
-        }
-        TensorDesc wtmCpuDesc;
-        GCLMem_t weightPtr = this->weightTensors[0].get_val();
-        CHECK_STATUS(fully_connected_transform_filter(inputDesc, filterDesc4D, weightPtr, &wtmCpuDesc, &wtmGCLMemArray,
-            this->schedule, &this->oclExtInfo));
-        if(this->numSlice == 1) {
-            this->get_wtm()->set_desc(wtmCpuDesc);
-            this->weightTensors[0] = *this->get_wtm();
-        }
-
-        GCLMem_t biasBuf = biasTensors[0].get_val();
-        U32 size[3] = {1, 1, 1};
-        if(this->numSlice > 1) {
-            U32 offset[4] = {0, 0, 0, 0};
-            for(U32 i = 0; i < this->numSlice; ++i) {
-                TensorDesc tmpCpuDesc = wtmCpuDesc;
-                tmpCpuDesc.dims[3] = this->slicePoints[i];
-                if(i == 0) {
-                    this->get_wtm()->set_desc(tmpCpuDesc);
-                    this->weightTensors[0] = *this->get_wtm();
-                } else {
-                    wtmArray[i - 1]->set_desc(tmpCpuDesc);
-                    this->weightTensors.push_back(*(wtmArray[i - 1].get()));
-                }
-
-                GCLMemDesc tmpDesc = biasBuf->desc;
-                U32 spNum = this->slicePoints[i];
-                size[0] = (spNum + 3) / 4;
-                tmpDesc.stride[0] = (spNum + 3) / 4;
-                tmpDesc.num = (spNum + 3) / 4;
-                tmpDesc.byteSize = spNum * bytesOf(this->dt);
-                tmpDesc.memType = GCL_MEM_IMG_1D;
-                tmpDesc.has_alloc = false;
-                auto tmpBias = std::shared_ptr<Tensor>(new Tensor(this->handle));
-                auto tmpMem = (OclMemory*)tmpBias->get_memory();
-                tmpMem->set_mem_desc(tmpDesc);
-                tmpBias->alloc();
-                CHECK_STATUS(gcl_trans_memory(this->handle.get(), biasBuf, tmpBias->get_val(), size, DEVICE_BUF_TO_IMG, CL_TRUE, offset))
-                offset[0] += spNum * bytesOf(this->dt);
-                biasImgArray.push_back(tmpBias);
-                biasGCLMemArray.push_back((GCLMem_t)tmpBias->get_val());
-            }
-        } else {
-            if(inputDesc.df == DF_MKT) {
-                GCLMemDesc tmpDesc = biasBuf->desc;
-                U32 spNum = tmpDesc.stride[0];
-                size[0] = (spNum + 3) / 4;
-                tmpDesc.stride[0] = (spNum + 3) / 4;
-                tmpDesc.num = (spNum + 3) / 4;
-                tmpDesc.byteSize = spNum * bytesOf(this->dt);
-                tmpDesc.memType = GCL_MEM_IMG_1D;
-                tmpDesc.has_alloc = false;
-                auto tmpBias = std::shared_ptr<Tensor>(new Tensor(this->handle));
-                auto tmpMem = (OclMemory*)tmpBias->get_memory();
-                tmpMem->set_mem_desc(tmpDesc);
-                tmpBias->alloc();
-                CHECK_STATUS(gcl_trans_memory(this->handle.get(), biasBuf, tmpBias->get_val(), size, DEVICE_BUF_TO_IMG, CL_TRUE))
-                biasImgArray.push_back(tmpBias);
-                biasGCLMemArray.push_back((GCLMem_t)tmpBias->get_val());
-            } else {
-                biasGCLMemArray.push_back(biasBuf);
-            }
-        }
-        return SUCCESS;
-    }
-private:
-    TensorDesc filterDesc4D;
-    std::vector<GCLMem_t> wtmGCLMemArray;
-    std::vector<std::shared_ptr<Tensor>> wtmArray;
-    std::vector<GCLMem_t> biasGCLMemArray;
-    std::vector<std::shared_ptr<Tensor>> biasImgArray;
-};
-
-#endif //_FCELTWISE_OCL_H
diff --git a/inference/include/ocl/layer_norm_ocl.hpp b/inference/include/ocl/layer_norm_ocl.hpp
deleted file mode 100644
index 46c1ebab..00000000
--- a/inference/include/ocl/layer_norm_ocl.hpp
+++ /dev/null
@@ -1,164 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _LAYER_NORM_OCL_H
-#define _LAYER_NORM_OCL_H
-
-#include "operator.hpp"
-#include "tensor_computing.h"
-#include "tensor_desc.h"
-#include "op_type.h"
-#include "layer_norm.hpp"
-
-class LayerNormOCL: public LayerNorm {
-public:
-    LayerNormOCL(DataType dt, U32 weightNum) : LayerNorm(dt, weightNum) {}
-
-    EE init_weight_bias_from_model(U8** modelPtr) override
-    {
-        auto curOpWs = this->get_weightspec_ptr();
-        if(modelPtr == nullptr) {
-            this->weightNum = curOpWs.bytes_of_weight / bytesOf(curOpWs.mdt);
-        }
-
-        DataType dtNoQ = (DT_F16_8Q == this->dt) ? DT_F16 : this->dt;
-        TensorDesc weightDesc = tensor1d(dtNoQ, this->weightNum);
-        TensorDesc biasDesc = tensor1d(dtNoQ, this->weightNum);
-        U32 weightBytes = tensorNumBytes(weightDesc);
-
-        std::shared_ptr<Tensor> modelWeightTensor(new Tensor(this->handle));
-        std::shared_ptr<Tensor> modelBiasTensor(new Tensor(this->handle));
-        modelWeightTensor->set_desc(weightDesc);
-        modelBiasTensor->set_desc(biasDesc);
-        GCLMem_t weightMem = modelWeightTensor->get_val();
-
-        U32 s0, s1, s2;
-        U32 num, bytes;
-        s0 = (this->weightNum + 3) / 4 * 4;
-        s1 = 1;
-        s2 = 1;
-        num = s0 * s1 * s2;
-        bytes = num * bytesOf(dtNoQ);
-        weightMem->desc.stride[0] = s0;
-        weightMem->desc.stride[1] = s1;
-        weightMem->desc.stride[2] = s2;
-        weightMem->desc.offset[0] = 0;
-        weightMem->desc.offset[1] = 0;
-        weightMem->desc.offset[2] = 0;
-        weightMem->desc.memType = GCL_MEM_BUF;
-        weightMem->desc.memFormat = DF_NORMAL;
-        weightMem->desc.num = num;
-        weightMem->desc.byteSize = bytes;
-        weightMem->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
-
-        GCLMem_t biasMem = modelBiasTensor->get_val();
-        biasMem->desc.stride[0] = s0;
-        biasMem->desc.stride[1] = s1;
-        biasMem->desc.stride[2] = s2;
-        biasMem->desc.offset[0] = 0;
-        biasMem->desc.offset[1] = 0;
-        biasMem->desc.memType = GCL_MEM_BUF;
-        biasMem->desc.memFormat = DF_NCHW;
-        biasMem->desc.num = num;
-        biasMem->desc.byteSize = bytes;
-
-
-        if(modelPtr != nullptr) {
-            weightMem->desc.host_ptr = *modelPtr;
-            *modelPtr += tensorNumBytes(weightDesc);
-        } else {
-            weightMem->desc.host_ptr = curOpWs.weight;
-        }
-
-        U8* weightTmp = nullptr;
-        if((this->weightNum & 3) != 0) {
-            weightTmp = (U8*)operator new(weightMem->desc.byteSize);
-            memset(weightTmp, 0, weightMem->desc.byteSize);
-            memcpy(weightTmp, weightMem->desc.host_ptr, weightBytes);
-            weightMem->desc.host_ptr = weightTmp;
-        }
-
-        U8* biasVal = nullptr;
-        U8* biasTmp = nullptr;
-        if(modelPtr != nullptr) {
-            if(this->hasBias) {
-                biasVal = *modelPtr;
-                *modelPtr += tensorNumBytes(biasDesc);
-            }
-        } else {
-            if(this->hasBias) biasVal = curOpWs.vec;
-        }
-        if(biasVal) {
-            biasMem->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
-            if((this->weightNum & 3) == 0) {
-                biasMem->desc.host_ptr = biasVal;
-            } else {
-                biasTmp = (U8*)operator new(biasMem->desc.byteSize);
-                memset(biasTmp, 0, biasMem->desc.byteSize);
-                memcpy(biasTmp, biasVal, weightBytes);
-                biasMem->desc.host_ptr = biasTmp;
-            }
-        } else {
-            biasMem->desc.host_ptr = nullptr;
-            biasMem->desc.flags = CL_MEM_READ_WRITE;
-        }
-
-        modelWeightTensor->alloc();
-        modelBiasTensor->alloc();
-        this->weightTensors.push_back(*modelWeightTensor.get());
-        this->biasTensors.push_back(*modelBiasTensor.get());
-        if(weightTmp) delete weightTmp;
-        if(biasTmp) delete biasTmp;
-        if(curOpWs.weight) delete curOpWs.weight;
-        if(curOpWs.vec) delete curOpWs.vec;
-        return SUCCESS;
-    }
-
-    void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-        this->handle->curOpName = this->get_op_name();
-        Tensor inputTensor = this->inputTensors[0];
-        TensorDesc inputDesc = inputTensor.get_desc();
-        Tensor weightTensor = this->weightTensors[0];
-        Tensor biasTensor = this->biasTensors[0];
-
-        Tensor outputTensor = this->outputTensors[0];
-        TensorDesc outputDesc = outputTensor.get_desc();
-        CHECK_STATUS(layer_normalization(weightTensor.get_val(), biasTensor.get_val(),
-            inputDesc, inputTensor.get_val(),
-            outputDesc, outputTensor.get_val(), this->schedule, &this->oclExtInfo));
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    virtual EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        TensorDesc in_dim = inDims[0];
-        this->oclExtInfo.maliInfo.gclmemInputDesc = NULL;
-        this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL;
-        CHECK_STATUS(normalization_infer_output_size(in_dim, &((*outDims)[0]), this->schedule, &this->oclExtInfo));
-        return SUCCESS;
-    }
-
-    virtual EE infer_gclmem_desc(Vec<GCLMemDesc>* gclmemInputDesc, Vec<GCLMemDesc>* gclmemOutputDesc) override
-    {
-        TensorDesc inputDesc = this->inputTensors[0].get_desc();
-        this->oclExtInfo.maliInfo.gclmemInputDesc = &((*gclmemInputDesc)[0]);
-        this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]);
-        CHECK_STATUS(normalization_infer_output_size(inputDesc, NULL, this->schedule, &this->oclExtInfo));
-        return SUCCESS;
-    }
-};
-
-#endif //_LAYER_NORM_OCL_H
diff --git a/inference/include/ocl/matmul_ocl.hpp b/inference/include/ocl/matmul_ocl.hpp
deleted file mode 100644
index 0cf217a2..00000000
--- a/inference/include/ocl/matmul_ocl.hpp
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
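A note on the deleted tuning cache, which appears in the FCEltwise code above and again in MatMulOCL just below: the auto-tuned forward algorithm is stored per operator name as a slash-separated string of four integers ({algorithm, best_w, best_c, best_k}). A minimal standalone sketch of that round-trip, using std::map and plain int rather than the Bolt typedefs (function names here are hypothetical stand-ins for setAlgorithmInfoToMap/getAlgorithmInfoFromMap):

#include <map>
#include <string>

// Encode algo[0..num-1] as "/a/b/c/d/" keyed by the op name.
static void setAlgoInfo(std::map<std::string, std::string> &m,
    const std::string &name, const int *algo, unsigned num)
{
    std::string s = "/";
    for (unsigned i = 0; i < num; i++) {
        s += std::to_string(algo[i]) + "/";
    }
    m[name] = s;
}

// Decode the cached string back into the integer array.
static void getAlgoInfo(std::map<std::string, std::string> &m,
    const std::string &name, int *algo, unsigned num)
{
    std::string s = m[name];
    size_t be = s.find_first_of("/");
    for (unsigned i = 0; i < num; i++) {
        size_t end = s.find("/", be + 1);
        algo[i] = std::stoi(s.substr(be + 1, end - be - 1));
        be = end;
    }
}

On a cache hit the operator skips the expensive *_infer_forward_algorithm search entirely, which is why both branches above end by writing the same four fields of runInfo.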
- - -/** - * Project deploy - */ -#ifndef _MATMUL_OCL_H -#define _MATMUL_OCL_H - -#include "operator.hpp" -#include "tensor_computing.h" -#include "matmul.hpp" - -class MatMulOCL: public MatMul { -public: - MatMulOCL(DataType dt, bool transposeA, bool transposeB) : MatMul(dt, transposeA, transposeB) {} - - void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - this->handle->curOpName = this->get_op_name(); - Tensor inputTensorA = this->inputTensors[0]; - TensorDesc inputDescA = inputTensorA.get_desc(); - Tensor inputTensorB = this->inputTensors[1]; - TensorDesc inputDescB = inputTensorB.get_desc(); - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - - U8 *inputA = inputTensorA.get_val(); - U8 *inputB = inputTensorB.get_val(); - U8 *tmp = (U8*)this->temp->get_val(); - - CHECK_STATUS(matmul(inputDescA, this->transposeA, inputA, - inputDescB, this->transposeB, inputB, tmp, this->lenOfTemp, - outputDesc, outputTensor.get_val(), this->schedule, &this->oclExtInfo)); - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_forward_algorithm(HashMap &algorithmMap) override - { - TensorDesc matrixADesc = (this->inputTensors[0]).get_desc(); - TensorDesc matrixBDesc = (this->inputTensors[1]).get_desc(); - TensorDesc matrixCDesc = this->outputTensors[0].get_desc(); - this->oclExtInfo.maliInfo.forwardRunInfo->algorithm = CONVOLUTION_ALGORITHM_NULL; - if (algorithmMap.find(this->name) != algorithmMap.end()) { - I32 algo[4]; - Operator::getAlgorithmInfoFromMap(algorithmMap, this->name, algo, 4); - this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; - this->runInfo.best_w[0] = algo[1]; - this->runInfo.best_c[0] = algo[2]; - this->runInfo.best_k[0] = algo[3]; - } else { - CHECK_STATUS(matmul_infer_forward_algorithm(matrixADesc, this->transposeA, matrixBDesc, this->transposeB, matrixCDesc, this->schedule, &this->oclExtInfo)); - I32 algo[4]; - algo[0] = this->runInfo.algorithm; - algo[1] = this->runInfo.best_w[0]; - algo[2] = this->runInfo.best_c[0]; - algo[3] = this->runInfo.best_k[0]; - Operator::setAlgorithmInfoToMap(algorithmMap, this->name, algo, 4); - } - return SUCCESS; - } - - EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - TensorDesc inDimA = inDims[0]; - TensorDesc inDimB = inDims[1]; - this->oclExtInfo.maliInfo.gclmemInputDesc = NULL; - this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL; - CHECK_STATUS(matmul_infer_output_size(inDimA, this->transposeA, inDimB, this->transposeB, &((*outDims)[0]), this->schedule, &this->oclExtInfo)); - return SUCCESS; - } - - virtual EE infer_gclmem_desc(Vec* gclmemInputDesc, Vec* gclmemOutputDesc) override - { - TensorDesc inDimA = this->inputTensors[0].get_desc(); - TensorDesc inDimB = this->inputTensors[1].get_desc(); - this->oclExtInfo.maliInfo.gclmemInputDesc = &((*gclmemInputDesc)[0]); - this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]); - CHECK_STATUS(matmul_infer_output_size(inDimA, this->transposeA, inDimB, this->transposeB, NULL, this->schedule, &this->oclExtInfo)); - return SUCCESS; - } - - U32 infer_tmp_memory_size() override - { - TensorDesc inputDescA = (this->inputTensors[0]).get_desc(); - TensorDesc inputDescB = (this->inputTensors[1]).get_desc(); - U32 bytes = 0; - CHECK_STATUS(matmul_infer_forward_tmp_bytes(inputDescA, this->transposeA, inputDescB, this->transposeB, &bytes, this->schedule, &this->oclExtInfo)); - return bytes; - } -}; - -#endif //_MATMUL_OCL_H diff --git a/inference/include/ocl/memory_ocl.hpp 
b/inference/include/ocl/memory_ocl.hpp deleted file mode 100644 index 9303da63..00000000 --- a/inference/include/ocl/memory_ocl.hpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _MEMORY_OCL_H -#define _MEMORY_OCL_H -#include "memory.hpp" -#include "gcl.h" -#include "tensor_computing.h" - -class OclMemory : public Memory_ -{ -public: - OclMemory(std::shared_ptr handle) { - this->handle = handle; - val = std::shared_ptr (gcl_create_gclmem()); - type = OCLMem; - }; - - virtual ~OclMemory() override { - release_gclmem(); - }; - - virtual void alloc(TensorDesc desc) override { - UNUSED(desc); - if(val->desc.byteSize && !val->desc.has_alloc) { - CHECK_STATUS(gcl_create_memory(handle.get(), val.get())); - if(val->desc.host_ptr == NULL) CHECK_STATUS(gcl_fill_memory_zero(handle.get(), val.get())); -// CHECK_STATUS(gcl_finish(handle.get())); - } - } - - virtual void alloc(U32 size) override { - if(val->desc.byteSize < size) { - if(val->desc.byteSize > 0) release_gclmem(); - val->desc.byteSize = size; - CHECK_STATUS(gcl_create_memory(handle.get(), val.get())); - if(val->desc.host_ptr == NULL) CHECK_STATUS(gcl_fill_memory_zero(handle.get(), val.get())); -// CHECK_STATUS(gcl_finish(handle.get())); - } - } - - void set_val_by_copy(TensorDesc desc, U8* ptr) override { - ExtInfo extInfo; - extInfo.maliInfo.handle = handle.get(); - CHECK_STATUS(tensor_computing_set_input((void*)val.get(), desc, (const void*)ptr, (void*)tmpBuf.get(), CL_TRUE, MALI, &extInfo)); - - } - - virtual void set_shared_ptr(PtrCasterShared val) override { - release_gclmem(); - this->val = val; - } - - virtual std::shared_ptr get_shared_ptr() override { - return val; - } - - virtual void* get_val() override { - return this->val.get(); - }; - - void get_val_to_hostptr(TensorDesc hostDesc, U8** hostPtr, bool blocking) { - UNUSED(hostPtr); - ExtInfo extInfo; - extInfo.maliInfo.handle = handle.get(); - CHECK_STATUS(tensor_computing_get_output((const void*)val.get(), hostDesc, NULL, NULL, blocking, MALI, &extInfo)); - } - - void set_tmpBuf(std::shared_ptr tmpBuf) { - this->tmpBuf = tmpBuf; - } - - void set_mem_desc(GCLMemDesc desc) {val.get()->desc = desc;} - - GCLMemDesc get_mem_desc() {return val.get()->desc;} - - MemoryType get_mem_type() override { - return type; - } - - - private: - void release_gclmem() { - if(val->desc.use_map) CHECK_STATUS(gcl_unmap_memory(handle.get(), val.get())); - 
CHECK_STATUS(gcl_release_memory(val.get())); - } - - std::shared_ptr handle; - std::shared_ptr val; - std::shared_ptr tmpBuf; - MemoryType type; -}; -#endif diff --git a/inference/include/ocl/multiply_ocl.hpp b/inference/include/ocl/multiply_ocl.hpp deleted file mode 100644 index 4bc4261f..00000000 --- a/inference/include/ocl/multiply_ocl.hpp +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _MULTIPLY_OCL_H -#define _MULTIPLY_OCL_H - -#include "operator.hpp" -#include "tensor_computing.h" -#include "multiply.hpp" - -class MultiplyOCL: public Multiply -{ -public: - MultiplyOCL(DataType dt, F32 scale, F32 bias) : Multiply(dt, scale, bias) { } - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - this->handle->curOpName = this->get_op_name(); - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - - CHECK_STATUS(multiply(&(this->alpha), &(this->beta), - inputDesc, inputTensor.get_val(), - outputDesc, outputTensor.get_val(), this->schedule, &this->oclExtInfo)); - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_output_tensors_size(VecinDims, Vec* outDims) override - { - this->oclExtInfo.maliInfo.gclmemInputDesc = NULL; - this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL; - CHECK_STATUS(multiply_infer_output_size(inDims[0], &((*outDims)[0]), this->schedule, &this->oclExtInfo)); - return SUCCESS; - } - - virtual EE infer_gclmem_desc(Vec* gclmemInputDesc, Vec* gclmemOutputDesc) override - { - TensorDesc inputDesc = this->inputTensors[0].get_desc(); - this->oclExtInfo.maliInfo.gclmemInputDesc = &((*gclmemInputDesc)[0]); - this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]); - CHECK_STATUS(multiply_infer_output_size(inputDesc, NULL, this->schedule, &this->oclExtInfo)); - return SUCCESS; - } -}; - -#endif //_MULTIPLY_OCL_H diff --git a/inference/include/ocl/pooling_ocl.hpp b/inference/include/ocl/pooling_ocl.hpp deleted file mode 100644 index afcdd487..00000000 --- a/inference/include/ocl/pooling_ocl.hpp +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _POOLING_OCL_H -#define _POOLING_OCL_H -#include -#include "operator.hpp" -#include "tensor_computing.h" -#include "tensor_desc.h" -#include "op_type.h" -#include "pooling.hpp" - -class PoolingOCL: public Pooling { -public: - -/** - * @param mode - * @param ks - * @param stride - * @param padding - * @param name - */ - PoolingOCL(PoolingMode mode, U32 ksH, U32 ksW, U32 strideH, U32 strideW, U32 paddingT, U32 paddingB, U32 paddingL, U32 paddingR, RoundMode rm): - Pooling(mode, ksH, ksW, strideH, strideW, paddingT, paddingB, paddingL, paddingR, rm){} - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - this->handle->curOpName = this->get_op_name(); - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - PoolingDesc poolingDesc = Pooling::create_PoolingDesc(this->mode, this->kernelSizeH, this->kernelSizeW, this->strideH, this->strideW, - this->paddingT, this->paddingB, this->paddingL, this->paddingR, this->rm); - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - short scales[2]; -#ifdef _USE_INT8 - if (DT_I8 == inputDesc.dt) { - F16* scale = (F16*)scales; - scale[0] = inputTensor.get_scale(); - } -#endif - CHECK_STATUS(pooling(inputDesc, inputTensor.get_val(), - poolingDesc, scales, - outputDesc, outputTensor.get_val(), - this->schedule, &this->oclExtInfo)); -#ifdef _USE_INT8 - if (DT_I8 == inputDesc.dt) { - F16 *scale = (F16*)scales; - outputTensor.set_scale(scale[1]); - } -#endif - - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - auto inDim = inDims[0]; - DataType dt; - DataFormat df; - U32 width ; - U32 height; - U32 numChannels; - U32 num; - CHECK_STATUS(tensor4dGet(inDim, &dt, &df, &num, &numChannels, &height, &width)); - - this->oclExtInfo.maliInfo.gclmemInputDesc = NULL; - this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL; - TensorDesc inputDesc = tensor4df(dt, df, num, numChannels, height, width); - if (this->kernelSizeH == 0 && this->kernelSizeW == 0) { - Pooling::set_stride(1, 1); - } - poolingDesc = Pooling::create_PoolingDesc(this->mode, this->kernelSizeH, this->kernelSizeW, this->strideH, this->strideW, - this->paddingT, this->paddingB, this->paddingL, this->paddingR, this->rm); - CHECK_STATUS(pooling_infer_output_size(inputDesc, poolingDesc, &((*outDims)[0]), this->schedule, &this->oclExtInfo)); - return SUCCESS; - } - - virtual EE 
infer_gclmem_desc(Vec* gclmemInputDesc, Vec* gclmemOutputDesc) override - { - TensorDesc inputDesc = this->inputTensors[0].get_desc(); - this->oclExtInfo.maliInfo.gclmemInputDesc = &((*gclmemInputDesc)[0]); - this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]); - CHECK_STATUS(pooling_infer_output_size(inputDesc, poolingDesc, NULL, this->schedule, &this->oclExtInfo)); - return SUCCESS; - } - - -private: - PoolingDesc poolingDesc; -}; - -#endif //_POOLING_OCL_H diff --git a/inference/include/ocl/reshape_ocl.hpp b/inference/include/ocl/reshape_ocl.hpp deleted file mode 100644 index b20abbdb..00000000 --- a/inference/include/ocl/reshape_ocl.hpp +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
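Worth isolating from the deleted PoolingOCL::run() above: when the input is DT_I8, FP16 quantization scales travel through a short[2] array reinterpreted as F16 values, with scale[0] carrying the input scale in and the kernel reporting the output scale in scale[1]. A self-contained miniature of that hand-off, with the real pooling() call replaced by a stub (poolingKernelStub is hypothetical, and FP16 is stood in for by its raw 16-bit pattern):

#include <cstdint>

typedef uint16_t F16bits;  // stand-in for Bolt's F16 on the host side

// Hypothetical kernel: consumes scales[0], reports the output scale in
// scales[1], as the deleted pooling() call is expected to.
static void poolingKernelStub(short *scales)
{
    F16bits *scale = (F16bits *)scales;
    scale[1] = scale[0];  // max pooling leaves the quantization scale unchanged
}

int main()
{
    short scales[2] = {0, 0};
    F16bits *scale = (F16bits *)scales;
    scale[0] = 0x3C00;            // FP16 bit pattern of 1.0: the input scale
    poolingKernelStub(scales);
    return scale[1] == 0x3C00 ? 0 : 1;  // output scale propagated back out
}

The same idea appears on both sides of the CHECK_STATUS(pooling(...)) call: scale in before, set_scale(scale[1]) after.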
-
-
-#ifndef _RESHAPE_OCL_H
-#define _RESHAPE_OCL_H
-
-#include "operator.hpp"
-#include "tensor_computing.h"
-#include "reshape.hpp"
-
-class ReshapeOCL: public Reshape {
-public:
-/**
- * @param shapeDims
- * @param axis
- * @param numAxes
- */
-    ReshapeOCL(DataType dt, I32* shapeDimsPtr, I32 shapeSize, I32 axis, I32 numAxes) : Reshape(dt, shapeDimsPtr, shapeSize, axis, numAxes) {}
-
-    virtual void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-        this->handle->curOpName = this->get_op_name();
-        Tensor inputTensor = this->inputTensors[0];
-        TensorDesc inputDesc = inputTensor.get_desc();
-
-        Tensor outputTensor = this->outputTensors[0];
-        TensorDesc outputDesc = outputTensor.get_desc();
-        GCLMem_t inputPtr = inputTensor.get_val();
-        GCLMem_t outputPtr = outputTensor.get_val();
-
-        if(inputPtr != outputPtr) {
-            CHECK_STATUS(reshape(inputDesc, inputTensor.get_val(), outputDesc, outputTensor.get_val(), this->schedule, &this->oclExtInfo));
-        }
-
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    virtual EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        this->oclExtInfo.maliInfo.gclmemInputDesc = NULL;
-        this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL;
-        CHECK_STATUS(reshape_infer_output_size(inDims[0], &((*outDims)[0]), this->shapeDims.data(), this->shapeDims.size(), this->schedule, &this->oclExtInfo));
-        return SUCCESS;
-    }
-
-    virtual EE infer_gclmem_desc(Vec<GCLMemDesc>* gclmemInputDesc, Vec<GCLMemDesc>* gclmemOutputDesc) override
-    {
-        TensorDesc inputDesc = this->inputTensors[0].get_desc();
-        this->oclExtInfo.maliInfo.gclmemInputDesc = &((*gclmemInputDesc)[0]);
-        this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]);
-        CHECK_STATUS(reshape_infer_output_size(inputDesc, NULL, this->shapeDims.data(), this->shapeDims.size(), this->schedule, &this->oclExtInfo));
-        return SUCCESS;
-    }
-};
-
-#endif //_RESHAPE_OCL_H
diff --git a/inference/include/ocl/scale_ocl.hpp b/inference/include/ocl/scale_ocl.hpp
deleted file mode 100644
index c54d7a07..00000000
--- a/inference/include/ocl/scale_ocl.hpp
+++ /dev/null
@@ -1,179 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
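ReshapeOCL above is the cleanest illustration of the two-phase shape inference every wrapper in this directory follows: the same *_infer_output_size entry point is called twice, first with the Mali descriptor pointers nulled out (logical TensorDesc only), then from infer_gclmem_desc with real pointers so the backend can also fill device strides. A minimal sketch of the convention, with simplified struct bodies and a hypothetical infer_output_size in place of the Bolt functions:

#include <cstddef>

struct TensorDesc { int dims[4]; };
struct GCLMemDesc { unsigned stride[3]; unsigned num; };

// Hypothetical kernel-side inference: fills outDesc when requested, and
// fills the OpenCL memory descriptors only when the caller passes them.
static int infer_output_size(TensorDesc in, TensorDesc *out,
    GCLMemDesc *gclIn, GCLMemDesc *gclOut)
{
    if (out) *out = in;                       // phase 1: logical shape only
    if (gclIn && gclOut) {                    // phase 2: device layout too
        gclOut->stride[0] = (unsigned)in.dims[0];
        gclOut->num = (unsigned)(in.dims[0] * in.dims[1] * in.dims[2] * in.dims[3]);
    }
    return 0;
}

int main()
{
    TensorDesc in = {{8, 8, 16, 1}}, out;
    infer_output_size(in, &out, NULL, NULL);  // as infer_output_tensors_size()
    GCLMemDesc gin = {}, gout = {};
    infer_output_size(in, NULL, &gin, &gout); // as infer_gclmem_desc()
    return 0;
}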
-
-
-#ifndef _SCALE_GPU_H
-#define _SCALE_GPU_H
-
-#include <math.h>
-#include "weight_operator.hpp"
-#include "tensor_computing.h"
-#include "tensor_desc.h"
-#include "op_type.h"
-#include "scale.hpp"
-
-class ScaleOCL: public Scale
-{
-public:
-    ScaleOCL(DataType dt, int axis, int numChannels, int numSource):
-        Scale(dt, axis, numChannels, numSource) {}
-    virtual EE init_weight_bias_from_model(U8** modelPtr) override
-    {
-        auto curOpWs = this->get_weightspec_ptr();
-        if(modelPtr == nullptr){
-            this->numChannels = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt));
-        }
-        if(this->numChannels == 0) return SUCCESS;
-
-        TensorDesc weightDesc = tensor1d(this->dt, this->numChannels);
-        TensorDesc biasDesc = weightDesc;
-        std::shared_ptr<Tensor> modelWeightTensor(new Tensor(this->handle));
-        std::shared_ptr<Tensor> modelBiasTensor(new Tensor(this->handle));
-        modelWeightTensor->set_desc(weightDesc);
-        modelBiasTensor->set_desc(biasDesc);
-        GCLMem_t weightMem = modelWeightTensor->get_val();
-        U32 s0, s1, s2;
-        U32 num, bytes;
-        s0 = (this->numChannels + 3) / 4 * 4;
-        s1 = 1;
-        s2 = 1;
-        num = s0 * s1 * s2;
-        bytes = num * 4 * bytesOf(this->dt);
-        weightMem->desc.stride[0] = s0;
-        weightMem->desc.stride[1] = s1;
-        weightMem->desc.stride[2] = s2;
-        weightMem->desc.offset[0] = 0;
-        weightMem->desc.offset[1] = 0;
-        weightMem->desc.offset[2] = 0;
-        weightMem->desc.memType = GCL_MEM_BUF;
-        weightMem->desc.memFormat = DF_NCHW;
-        weightMem->desc.num = num;
-        weightMem->desc.byteSize = bytes;
-        weightMem->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
-        GCLMem_t biasMem = modelBiasTensor->get_val();
-        *biasMem = *weightMem;
-        biasMem->desc.flags = CL_MEM_READ_WRITE;
-
-        U32 weightBytes = tensorNumBytes(weightDesc);
-        U8* weightTmp = nullptr;
-        if(modelPtr != nullptr){
-            weightMem->desc.host_ptr = *modelPtr;
-            *modelPtr += weightBytes;
-        } else {
-            weightMem->desc.host_ptr = curOpWs.weight;
-        }
-        if((this->numChannels & 3) != 0){
-            weightTmp = (U8*)operator new(weightMem->desc.byteSize);
-            memset(weightTmp, 0, weightMem->desc.byteSize);
-            memcpy(weightTmp, weightMem->desc.host_ptr, this->numChannels * bytesOf(this->dt));
-            weightMem->desc.host_ptr = weightTmp;
-        }
-
-        U8* biasVal = nullptr;
-        U8* biasTmp = nullptr;
-        if(modelPtr != nullptr){
-            if(this->hasBias){
-                biasVal = *modelPtr;
-                *modelPtr += tensorNumBytes(biasDesc);
-            }
-        } else {
-            if(this->hasBias) biasVal = curOpWs.vec;
-        }
-
-        if(biasVal){
-            biasMem->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
-            if((this->numChannels & 3) == 0){
-                biasMem->desc.host_ptr = biasVal;
-            } else {
-                biasTmp = (U8*)operator new(biasMem->desc.byteSize);
-                memset(biasTmp, 0, biasMem->desc.byteSize);
-                memcpy(biasTmp, biasVal, this->numChannels * bytesOf(this->dt));
-                biasMem->desc.host_ptr = biasTmp;
-            }
-        } else {
-            biasMem->desc.host_ptr = nullptr;
-            biasMem->desc.flags = CL_MEM_READ_WRITE;
-        }
-        modelWeightTensor->alloc();
-        modelBiasTensor->alloc();
-        this->weightTensors.push_back(*modelWeightTensor.get());
-        this->biasTensors.push_back(*modelBiasTensor.get());
-        if(weightTmp) delete weightTmp;
-        if(biasTmp) delete biasTmp;
-        if(curOpWs.weight) delete curOpWs.weight;
-        if(curOpWs.vec) delete curOpWs.vec;
-        return SUCCESS;
-    }
-
-    virtual void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-        this->handle->curOpName = this->get_op_name();
-        int inputNum = this->inputTensors.size();
-        Tensor inputTensor = this->inputTensors[this->dataID];
-        Tensor outputTensor = this->outputTensors[0];
-        GCLMem_t inPtr = inputTensor.get_val();
-        GCLMem_t outPtr = outputTensor.get_val();
-        TensorDesc inputDesc = inputTensor.get_desc();
-        TensorDesc outputDesc = outputTensor.get_desc();
-        U32 ic;
-        tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, NULL, NULL);
-
-        if(inputNum == 1 && ic != this->numChannels) CHECK_STATUS(NOT_MATCH);
-        if(inputNum == 1 && weightTensors.size() == 0) CHECK_STATUS(NOT_MATCH);
-        if(inputNum > 1){
-            U32 cNum = this->inputTensors[0].get_desc().dims[2];
-            for(int i = 1; i < inputNum; i++){
-                if(cNum != this->inputTensors[i].get_desc().dims[2]) CHECK_STATUS(NOT_MATCH);
-            }
-        }
-        if (inputNum == 1) {
-            CHECK_STATUS(scale(inputDesc, inPtr,
-                this->axis, this->weightTensors[0].get_val(), this->biasTensors[0].get_val(),
-                outputDesc, outPtr, this->schedule, &this->oclExtInfo));
-        } else {
-            CHECK_STATUS(scale(inputDesc, inPtr,
-                this->axis, this->inputTensors[1 - this->dataID].get_val(), NULL,
-                outputDesc, outPtr, this->schedule, &this->oclExtInfo));
-        }
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    virtual EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        if(inDims.size() > 1 && tensorNumElements(inDims[1]) > tensorNumElements(inDims[0])) this->dataID = 1;
-        TensorDesc inputDesc = inDims[dataID];
-        this->oclExtInfo.maliInfo.gclmemInputDesc = NULL;
-        this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL;
-        CHECK_STATUS(scale_infer_output_size(inputDesc, &((*outDims)[0]), this->schedule, &this->oclExtInfo));
-        return SUCCESS;
-    }
-
-    virtual EE infer_gclmem_desc(Vec<GCLMemDesc>* gclmemInputDesc, Vec<GCLMemDesc>* gclmemOutputDesc) override
-    {
-        TensorDesc inputDesc = this->inputTensors[0].get_desc();
-        if(this->inputTensors.size() > 1) {
-            TensorDesc tmpDesc = this->inputTensors[1].get_desc();
-            if(tensorNumElements(tmpDesc) > tensorNumElements(inputDesc)) {
-                this->dataID = 1;
-                inputDesc = tmpDesc;
-            }
-        }
-        this->oclExtInfo.maliInfo.gclmemInputDesc = &((*gclmemInputDesc)[dataID]);
-        this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]);
-        CHECK_STATUS(scale_infer_output_size(inputDesc, NULL, this->schedule, &this->oclExtInfo));
-        return SUCCESS;
-    }
-};
-
-#endif //_SCALE_GPU_H
diff --git a/inference/include/ocl/slice_ocl.hpp b/inference/include/ocl/slice_ocl.hpp
deleted file mode 100644
index de061438..00000000
--- a/inference/include/ocl/slice_ocl.hpp
+++ /dev/null
@@ -1,72 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
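The deleted ScaleOCL above has one non-obvious convention: when it receives two inputs, whichever holds more elements is treated as the data tensor and the other as the per-channel alpha vector, with the choice recorded in dataID. A reduced sketch of just that selection rule, over plain element counts:

#include <vector>

// Pick the larger of two inputs as the data tensor, as
// ScaleOCL::infer_output_tensors_size() does with tensorNumElements().
static int pickDataIndex(const std::vector<size_t> &numElements)
{
    int dataID = 0;
    if (numElements.size() > 1 && numElements[1] > numElements[0]) {
        dataID = 1;  // input 1 is the data; input 0 supplies alpha
    }
    return dataID;
}

The run() method then reads the scale vector from inputTensors[1 - dataID], which is why both infer methods must agree on the same rule.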
-
-
-#ifndef _SLICE_OCL_H
-#define _SLICE_OCL_H
-
-#include "operator.hpp"
-#include "tensor_computing.h"
-#include "slice.hpp"
-
-class SliceOCL: public Slice
-{
-public:
-    SliceOCL(DataType dt, I32 axis, I32* slicePointsPtr, I32 sliceSize) : Slice(dt, axis, slicePointsPtr, sliceSize) {}
-
-
-    virtual void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-        this->handle->curOpName = this->get_op_name();
-        Tensor inputTensor = this->inputTensors[0];
-        TensorDesc inputDesc = inputTensor.get_desc();
-
-        Vec<Tensor> outputTensors = this->get_output_tensors();
-        Vec<TensorDesc> outputTensorDescs;
-        Vec<void*> outputPtrs;
-        for (U32 i = 0; i < outputTensors.size(); i++) {
-            outputTensors[i].set_scale(inputTensor.get_scale());
-            outputTensorDescs.push_back(outputTensors[i].get_desc());
-            outputPtrs.push_back(outputTensors[i].get_val());
-        }
-        CHECK_STATUS(slice(inputDesc, inputTensor.get_val(), this->axis, outputTensorDescs, &outputPtrs, this->schedule, &this->oclExtInfo));
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    virtual EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        this->oclExtInfo.maliInfo.gclmemInputDesc = NULL;
-        this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL;
-        CHECK_STATUS(slice_infer_output_size(inDims[0], outDims, this->axis, this->slicePoints.data(), this->schedule, &this->oclExtInfo));
-        return SUCCESS;
-    }
-
-    virtual EE infer_gclmem_desc(Vec<GCLMemDesc>* gclmemInputDesc, Vec<GCLMemDesc>* gclmemOutputDesc) override
-    {
-        TensorDesc inputDesc = this->inputTensors[0].get_desc();
-        Vec<TensorDesc> outputDesc;
-        U32 sliceNum = (*gclmemOutputDesc).size();
-        for(U32 i = 0; i < sliceNum; ++i) outputDesc.push_back(this->outputTensors[i].get_desc());
-        GCLMemDesc_t memOutDesc = (GCLMemDesc_t) operator new(sizeof(struct GCLMemDesc) * sliceNum);
-        for(U32 i = 0; i < sliceNum; i++) memOutDesc[i] = (*gclmemOutputDesc)[i];
-        this->oclExtInfo.maliInfo.gclmemInputDesc = &((*gclmemInputDesc)[0]);
-        this->oclExtInfo.maliInfo.gclmemOutputDesc = memOutDesc;
-        CHECK_STATUS(slice_infer_output_size(inputDesc, &outputDesc, this->axis, this->slicePoints.data(), this->schedule, &this->oclExtInfo));
-        for(U32 i = 0; i < sliceNum; i++) (*gclmemOutputDesc)[i] = memOutDesc[i];
-        delete memOutDesc;
-        return SUCCESS;
-    }
-};
-
-#endif //_SLICE_OCL_H
diff --git a/inference/include/ocl/softmax_ocl.hpp b/inference/include/ocl/softmax_ocl.hpp
deleted file mode 100644
index d8273589..00000000
--- a/inference/include/ocl/softmax_ocl.hpp
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _SOFTMAX_OCL_H
-#define _SOFTMAX_OCL_H
-
-#include "operator.hpp"
-#include "tensor_computing.h"
-#include "softmax.hpp"
-
-class SoftmaxOCL : public Softmax {
-public:
-
-    SoftmaxOCL(DataType dt, int axis):
-        Softmax(dt, axis) {}
-
-    virtual void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-        this->handle->curOpName = this->get_op_name();
-        Tensor inputTensor = this->inputTensors[0];
-        TensorDesc inputDesc = inputTensor.get_desc();
-        Tensor outputTensor = this->outputTensors[0];
-        TensorDesc outputDesc = outputTensor.get_desc();
-        CHECK_STATUS(softmax(inputDesc, inputTensor.get_val(), axis, outputDesc, outputTensor.get_val(), this->schedule, &this->oclExtInfo));
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    virtual EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        this->oclExtInfo.maliInfo.gclmemInputDesc = NULL;
-        this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL;
-        CHECK_STATUS(softmax_infer_output_size(inDims[0], &((*outDims)[0]), this->schedule, &this->oclExtInfo));
-        return SUCCESS;
-    }
-
-    virtual EE infer_gclmem_desc(Vec<GCLMemDesc>* gclmemInputDesc, Vec<GCLMemDesc>* gclmemOutputDesc) override
-    {
-        TensorDesc inputDesc = this->inputTensors[0].get_desc();
-        this->oclExtInfo.maliInfo.gclmemInputDesc = &((*gclmemInputDesc)[0]);
-        this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]);
-        CHECK_STATUS(softmax_infer_output_size(inputDesc, NULL, this->schedule, &this->oclExtInfo));
-        return SUCCESS;
-    }
-};
-
-#endif //SOFTMAX_OCL_H
diff --git a/inference/include/ocl/space2depth_ocl.hpp b/inference/include/ocl/space2depth_ocl.hpp
deleted file mode 100644
index 404ee4d7..00000000
--- a/inference/include/ocl/space2depth_ocl.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
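Several of the deleted classes round channel counts up to multiples of four, since the Mali layouts (DF_NCWHC4 and the padded 1-D weight buffers) pack four channels per vector: the FCEltwise slice descriptors, SliceOCL, and the weight initializers all repeat (c + 3) / 4. The recurring arithmetic, isolated as a small sketch (function names are illustrative, not Bolt API):

// Channel-quad padding used throughout the deleted OCL descriptors:
// paddedChannelGroups counts float4 groups along the channel axis.
static unsigned paddedChannelGroups(unsigned channels)
{
    return (channels + 3) / 4;
}

// Padded element count for one slice of `channels` channels over an
// hStr x wStr plane, as in FCEltwise's infer_gclmem_desc above:
// num = h_str * w_str * c_str * 4.
static unsigned paddedElemCount(unsigned hStr, unsigned wStr, unsigned channels)
{
    return hStr * wStr * paddedChannelGroups(channels) * 4;
}

byteSize then follows as num * bytesOf(dt), which is exactly the pattern in the stride[2]/num/byteSize triples set above.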
- - -/** - * Project deploy - */ - - -#ifndef _SPACE2DEPTH_OCL_H -#define _SPACE2DEPTH_OCL_H - -#include "operator.hpp" -#include "space2depth.hpp" - -class Space2DepthOCL: public Space2Depth { -public: - Space2DepthOCL(DataType dt) : Space2Depth(dt) { } - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - this->handle->curOpName = this->get_op_name(); - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - - CHECK_STATUS(space2depth(inputDesc, inputTensor.get_val(), outputDesc, outputTensor.get_val(), this->schedule, &this->oclExtInfo)); - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - this->oclExtInfo.maliInfo.gclmemInputDesc = NULL; - this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL; - CHECK_STATUS(space2depth_infer_output_size(inDims[0], &((*outDims)[0]), this->schedule, &this->oclExtInfo)); - return SUCCESS; - } - - virtual EE infer_gclmem_desc(Vec* gclmemInputDesc, Vec* gclmemOutputDesc) override - { - TensorDesc inputDesc = this->inputTensors[0].get_desc(); - this->oclExtInfo.maliInfo.gclmemInputDesc = &((*gclmemInputDesc)[0]); - this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]); - CHECK_STATUS(space2depth_infer_output_size(inputDesc, NULL, this->schedule, &this->oclExtInfo)); - return SUCCESS; - } -}; - -#endif //_SPACE2DEPTH_OCL_H diff --git a/inference/include/ocl/squeeze_ocl.hpp b/inference/include/ocl/squeeze_ocl.hpp deleted file mode 100644 index 6a659462..00000000 --- a/inference/include/ocl/squeeze_ocl.hpp +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -/** - * Project deploy - */ - - -#ifndef _SQUEEZE_OCL_H -#define _SQUEEZE_OCL_H - -#include "operator.hpp" -#include "squeeze.hpp" - -class SqueezeOCL: public Squeeze { -public: - SqueezeOCL(DataType dt, I32 axis, I32 *dims, I32 dimSize) : Squeeze(dt, axis, dims, dimSize) { } - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - this->handle->curOpName = this->get_op_name(); - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - GCLMem_t inputPtr = inputTensor.get_val(); - GCLMem_t outputPtr = outputTensor.get_val(); - - if(inputPtr != outputPtr){ - CHECK_STATUS(squeeze(inputDesc, inputTensor.get_val(), outputDesc, outputTensor.get_val(), this->schedule, &this->oclExtInfo)); - } - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - this->oclExtInfo.maliInfo.gclmemInputDesc = NULL; - this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL; - CHECK_STATUS(squeeze_infer_output_size(inDims[0], &((*outDims)[0]), this->schedule, &this->oclExtInfo)); - return SUCCESS; - } - - virtual EE infer_gclmem_desc(Vec* gclmemInputDesc, Vec* gclmemOutputDesc) override - { - TensorDesc inputDesc = this->inputTensors[0].get_desc(); - this->oclExtInfo.maliInfo.gclmemInputDesc = &((*gclmemInputDesc)[0]); - this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]); - CHECK_STATUS(squeeze_infer_output_size(inputDesc, NULL, this->schedule, &this->oclExtInfo)); - return SUCCESS; - } -}; - -#endif //_SQUEEZE_OCL_H diff --git a/inference/include/ocl/transpose_ocl.hpp b/inference/include/ocl/transpose_ocl.hpp deleted file mode 100644 index 5d3dae05..00000000 --- a/inference/include/ocl/transpose_ocl.hpp +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-
-
-#ifndef _TRANSPOSE_OCL_H
-#define _TRANSPOSE_OCL_H
-
-#include "operator.hpp"
-#include "tensor_computing.h"
-#include "transpose.hpp"
-
-class TransposeOCL: public Transpose
-{
-public:
-    /**
-    @param mode
-    */
-    TransposeOCL(DataType dt, U32* transDimsPtr, U32 transDimsSize) : Transpose(dt, transDimsPtr, transDimsSize) { }
-
-    virtual void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-        this->handle->curOpName = this->get_op_name();
-        Tensor inputTensor = this->inputTensors[0];
-        TensorDesc inputDesc = inputTensor.get_desc();
-        Tensor outputTensor = this->outputTensors[0];
-        TensorDesc outputDesc = outputTensor.get_desc();
-
-        CHECK_STATUS(transpose(inputDesc, inputTensor.get_val(), outputDesc, outputTensor.get_val(),
-            this->transDims.data(), this->schedule, &this->oclExtInfo));
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    virtual EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        this->oclExtInfo.maliInfo.gclmemInputDesc = NULL;
-        this->oclExtInfo.maliInfo.gclmemOutputDesc = NULL;
-        CHECK_STATUS(transpose_infer_output_size(inDims[0], &((*outDims)[0]), this->transDims.data(), this->schedule, &this->oclExtInfo));
-        return SUCCESS;
-    }
-
-    virtual EE infer_gclmem_desc(Vec<GCLMemDesc>* gclmemInputDesc, Vec<GCLMemDesc>* gclmemOutputDesc) override
-    {
-        TensorDesc inputDesc = this->inputTensors[0].get_desc();
-        this->oclExtInfo.maliInfo.gclmemInputDesc = &((*gclmemInputDesc)[0]);
-        this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]);
-        CHECK_STATUS(transpose_infer_output_size(inputDesc, NULL, this->transDims.data(), this->schedule, &this->oclExtInfo));
-        return SUCCESS;
-    }
-};
-
-#endif //_TRANSPOSE_OCL_H
diff --git a/inference/include/operator.hpp b/inference/include/operator.hpp
deleted file mode 100644
index 0f025266..00000000
--- a/inference/include/operator.hpp
+++ /dev/null
@@ -1,275 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _OPERATOR_H
-#define _OPERATOR_H
-
-#include <string>
-#include "sys.h"
-#include "tensor_computing.h"
-#include "tensor.hpp"
-#include "op_type.h"
-#include <map>
-#include "model_tools.h"
-#define HashMap std::map
-
-#ifdef _USE_MALI
-#include "gcl.h"
-#endif
-
-class Operator {
-public:
-    virtual bool checkOperator() {
-        for (U32 i = 0; i < inputTensors.size(); i++) {
-            if (!tensorDescIsValid(inputTensors[i].get_desc()))
-                return false;
-        }
-        for (U32 i = 0; i < outputTensors.size(); i++) {
-            if (!tensorDescIsValid(outputTensors[i].get_desc()))
-                return false;
-        }
-        return true;
-    };
-
-    virtual void run() = 0;
-
-    /**
-     * @param inputTensors
-     * @param outputTensors
-     */
-    virtual void set_input_output_tensors(Vec<Tensor> it, Vec<Tensor> ot)
-    {
-        this->inputTensors = it;
-        this->outputTensors = ot;
-    }
-
-    virtual Vec<Tensor> get_input_tensors()
-    {
-        return this->inputTensors;
-    }
-
-    virtual Vec<Tensor> get_output_tensors()
-    {
-        return this->outputTensors;
-    }
-
-    virtual void set_input_tensors(Vec<Tensor> it)
-    {
-        this->inputTensors = it;
-    }
-
-    virtual void set_output_tensors(Vec<Tensor> ot)
-    {
-        this->outputTensors = ot;
-    }
-
-    virtual bool can_input_output_the_same() { return false; }
-
-    virtual EE infer_output_tensors_size(Vec<TensorDesc>, Vec<TensorDesc>*) = 0;
-
-    std::string get_name()
-    {
-        return this->name;
-    }
-    /**
-     * @param name
-     */
-    explicit Operator(std::string name)
-    {
-        this->name = name;
-    }
-
-    Operator():name("") { }
-
-    virtual bool is_weight()
-    {
-        return false;
-    }
-
-    virtual U32 infer_tmp_memory_size()
-    {
-        this->lenOfTemp = 0;
-        this->temp = std::shared_ptr<Memory_>();
-        return 0;
-    }
-
-    virtual void set_tmp_memory(U32 len, std::shared_ptr<Memory_> temp)
-    {
-        this->lenOfTemp = len;
-        this->temp = temp;
-    }
-#ifdef _USE_MALI
-    virtual EE set_mali_handle(std::shared_ptr<GCLHandle> handle){
-        this->handle = handle;
-        oclExtInfo.maliInfo.handle = handle.get();
-        runInfo.algorithm = 0;
-        runInfo.best_w[0] = 1;
-        runInfo.best_w[1] = 1;
-        runInfo.best_c[0] = 1;
-        runInfo.best_c[1] = 1;
-        runInfo.best_k[0] = 1;
-        runInfo.best_k[1] = 1;
-        oclExtInfo.maliInfo.forwardRunInfo = &runInfo;
-        return SUCCESS;
-    }
-    virtual EE infer_gclmem_desc(Vec<GCLMemDesc>*, Vec<GCLMemDesc>*){return NOT_SUPPORTED;}
-#endif
-
-    virtual U32 get_len_of_temp()
-    {
-        return this->lenOfTemp;
-    }
-
-    virtual std::shared_ptr<Memory_> get_tmp()
-    {
-        return this->temp;
-    }
-
-    virtual OperatorType get_op_type() = 0;
-
-    virtual void set_op_name(std::string opName) {
-        this->name = opName;
-    }
-
-    virtual void set_op_schedule(Arch opSchedule) {
-        this->schedule = opSchedule;
-    }
-
-    virtual Vec<I32> get_tensor_positions()
-    {
-        return this->tensorPos;
-    }
-
-    virtual void set_tensor_positions(Vec<I32> tensorPos)
-    {
-        this->tensorPos = tensorPos;
-    }
-
-    virtual ~Operator(){ }
-
-    virtual int get_next_operator_index()
-    {
-        return -1;
-    }
-
-    virtual void setAlgorithmInfoToMap(HashMap<std::string, std::string> &algorithmMap, std::string name, I32* algorithmArray, U32 ArrayNum)
-    {
-        std::string algoInfo = "/";
-        for(U32 i = 0; i < ArrayNum; i++) {
-            algoInfo += std::to_string(algorithmArray[i]);
-            algoInfo += "/";
-        }
-        algorithmMap[name] = algoInfo;
-    }
-
-    virtual void getAlgorithmInfoFromMap(HashMap<std::string, std::string> &algorithmMap, std::string name, I32* algorithmArray, U32 ArrayNum)
-    {
-        std::string algoInfo = algorithmMap[name];
-        U32 be = algoInfo.find_first_of("/");
-        U32 end;
-        for(U32 i = 0; i < ArrayNum; i++) {
-            end = algoInfo.find("/", be + 1);
-            algorithmArray[i] = std::stoi(algoInfo.substr(be + 1, end - be - 1));
-            be = end;
-        }
-    }
-
-    virtual void init_feature_scale(U32 num, QuantSpec* qs)
-    {
-        UNUSED(num);
-        UNUSED(qs);
-#ifdef _USE_INT8
-        if (1 == num && 0 == qs[0].scale[0]) {  // OP is labelled as no-quantization
-            if (DT_F16_8Q == this->dt) {
-                this->dt = DT_F16;
-            }
-            return;
-        }
-        featureScale.resize(num);
-        for (U32 i = 0; i < num; i++) {
-            featureScale[i].resize(qs[i].num_scale);
-            memcpy(featureScale[i].data(), qs[i].scale, qs[i].num_scale * bytesOf(DT_F32));
-        }
-#endif
-    }
-
-#ifdef _USE_INT8
-    virtual void set_feature_scale(Vec<Vec<F32>> fs) {
-        this->featureScale = fs;
-    }
-
-    virtual bool is_dynamic_scale()
-    {
-        OperatorType ot = this->get_op_type();
-        if (OT_Conv != ot) {
-            return false;
-        }
-
-        U32 numScale = featureScale.size();
-        U32 numQuant = (DT_F16_8Q == this->dt) ? inputTensors.size() : 0;
-
-        if (0 != numScale && 0 == featureScale[0][0]) {  // OP is labelled as no-quantization
-            return false;
-        }
-
-        if (0 != numScale && -2 == (featureScale.back())[0]) {  // OP is labelled as fp-output
-            numScale = 0;
-            numQuant += 1;
-        }
-
-        for (auto tensor : outputTensors) {
-            if (DT_I8 == tensor.get_desc().dt) {
-                numQuant++;
-            }
-        }
-        if (0 == numQuant) {
-            return false;
-        }
-
-        if (0 == numScale) {
-            return true;
-        }
-
-        CHECK_REQUIREMENT(numQuant == numScale);
-        return false;
-    }
-
-#endif
-    std::string get_op_name()
-    {
-        return this->name;
-    }
-
-public:
-    Arch schedule;
-    DataType dt;
-
-    Vec<Tensor> inputTensors;
-    Vec<Tensor> outputTensors;
-    Vec<I32> tensorPos;
-
-    U32 lenOfTemp;
-    std::shared_ptr<Memory_> temp;
-
-#ifdef _USE_MALI
-    std::shared_ptr<GCLHandle> handle;
-    ExtInfo oclExtInfo;
-    ForwardRunInfoMali runInfo;
-#endif
-
-    std::string name;
-    Vec<Vec<F32>> featureScale;
-};
-
-#endif //_OPERATOR_H
diff --git a/inference/include/padding.hpp b/inference/include/padding.hpp
deleted file mode 100644
index 48540b49..00000000
--- a/inference/include/padding.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
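The quantization bookkeeping in the deleted Operator base class leans on two sentinel scale values: 0 in the first scale vector marks an op excluded from quantization (init_feature_scale then falls DT_F16_8Q back to DT_F16), and -2 in the last vector marks an op that must emit floating-point output. A condensed sketch of those two checks, with std::vector<float> standing in for Vec<Vec<F32>>:

#include <vector>

// Sentinel conventions from init_feature_scale()/is_dynamic_scale():
//   scale ==  0 -> op labelled "no quantization"
//   scale == -2 -> op labelled "fp output"
static bool isNoQuantOp(const std::vector<std::vector<float>> &featureScale)
{
    return !featureScale.empty() && !featureScale[0].empty()
        && featureScale[0][0] == 0;
}

static bool wantsFpOutput(const std::vector<std::vector<float>> &featureScale)
{
    return !featureScale.empty() && !featureScale.back().empty()
        && featureScale.back()[0] == -2;
}

is_dynamic_scale() then decides a convolution needs runtime scale computation only when quantized tensors exist (numQuant > 0) but no static scales were recorded (numScale == 0).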
-
-
-#ifndef _PADDING_H
-#define _PADDING_H
-
-#include "operator.hpp"
-
-class Padding: public Operator
-{
-public:
-    /**
-    @param mode
-    */
-    Padding(DataType dt, PadDesc padDesc)
-    {
-        this->dt = dt;
-        this->padDesc = padDesc;
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_Pad;
-    }
-
-    void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-
-        Tensor inputTensor = this->inputTensors[0];
-        TensorDesc inputDesc = inputTensor.get_desc();
-        Tensor outputTensor = this->outputTensors[0];
-        TensorDesc outputDesc = outputTensor.get_desc();
-        U8* inPtr = inputTensor.get_val();
-        U8* outPtr = outputTensor.get_val();
-        CHECK_STATUS(padding(inputDesc, inPtr, this->padDesc, outputDesc, outPtr, this->schedule));
-
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        CHECK_STATUS(padding_infer_output_size(inDims[0], this->padDesc, &((*outDims)[0])));
-        return SUCCESS;
-    }
-
-private:
-    PadDesc padDesc;
-};
-
-#endif //_PADDING_H
diff --git a/inference/include/point_cast.hpp b/inference/include/point_cast.hpp
deleted file mode 100644
index 52c5c047..00000000
--- a/inference/include/point_cast.hpp
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
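Reviewer note, not part of the diff: Padding above is the simplest example of the `infer_output_tensors_size(Vec<TensorDesc>, Vec<TensorDesc>*)` contract that every operator deleted in this patch implements — the framework passes one descriptor per input plus a pre-sized output vector for the operator to fill. A minimal standalone sketch of that calling convention, using simplified stand-in types (`TensorDesc` here is a hypothetical reduction of Bolt's real struct, and the pad rule is invented for illustration):

```cpp
#include <vector>

typedef unsigned int U32;
enum EE { SUCCESS, NULL_POINTER };
struct TensorDesc { U32 dims[6]; U32 nDims; };  // dims[] stored innermost-first

// A pad-like rule: grow the two innermost (spatial) dims by 2*pad.
EE pad_infer_output_size(const TensorDesc &in, U32 pad, TensorDesc *out) {
    if (out == nullptr) return NULL_POINTER;
    *out = in;
    out->dims[0] += 2 * pad;  // width
    out->dims[1] += 2 * pad;  // height
    return SUCCESS;
}

// The operator-side entry point mirrors the deleted headers' signature.
EE infer_output_tensors_size(std::vector<TensorDesc> inDims,
                             std::vector<TensorDesc> *outDims) {
    return pad_infer_output_size(inDims[0], 1, &((*outDims)[0]));
}

int main() {
    std::vector<TensorDesc> in(1), out(1);  // caller pre-sizes the output vector
    in[0].nDims = 4;
    in[0].dims[0] = 224; in[0].dims[1] = 224; in[0].dims[2] = 3; in[0].dims[3] = 1;
    return infer_output_tensors_size(in, &out) == SUCCESS ? 0 : 1;
}
```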
-
-#ifndef _POINT_CAST_H
-#define _POINT_CAST_H
-#ifdef _USE_MALI
-#include "gcl.h"
-#endif
-
-class PtrCaster{
-public:
-    PtrCaster(void* p)
-        :ptr(p){}
-    inline operator U8*() {return (U8*)ptr;}
-    inline operator void*() {return ptr;}
-#ifdef __aarch64__
-    inline operator F16*() {return (F16*)ptr;}
-#endif
-    inline operator F32*() {return (F32*)ptr;}
-    inline operator U32*() {return (U32*)ptr;}
-    inline operator I32*() {return (I32*)ptr;}
-    inline operator INT8*() {return (INT8*)ptr;}
-#ifdef _USE_MALI
-    inline operator GCLMem_t(){return (GCLMem_t)ptr;}
-#endif
-private:
-    void* ptr;
-};
-
-class PtrCasterShared{
-public:
-    PtrCasterShared(std::shared_ptr<void> p){ptr = p;}
-    inline operator std::shared_ptr<U8>() {return std::static_pointer_cast<U8>(ptr);}
-    inline operator std::shared_ptr<void>() {return std::static_pointer_cast<void>(ptr);}
-#ifdef _USE_FP16
-    inline operator std::shared_ptr<F16>() {return std::static_pointer_cast<F16>(ptr);}
-#endif
-    inline operator std::shared_ptr<F32>() {return std::static_pointer_cast<F32>(ptr);}
-    inline operator std::shared_ptr<U32>() {return std::static_pointer_cast<U32>(ptr);}
-    inline operator std::shared_ptr<I32>() {return std::static_pointer_cast<I32>(ptr);}
-#ifdef _USE_MALI
-    inline operator std::shared_ptr<GCLMem>() {return std::static_pointer_cast<GCLMem>(ptr);}
-#endif
-
-private:
-    std::shared_ptr<void> ptr;
-};
-#endif
diff --git a/inference/include/pooling.hpp b/inference/include/pooling.hpp
deleted file mode 100644
index b8cfe7ca..00000000
--- a/inference/include/pooling.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
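Reviewer note, not part of the diff: the point of `PtrCaster` above is that a single return value from `Tensor::get_val()` converts implicitly to whatever pointer type the call site declares. A minimal standalone sketch of that pattern (the typedefs stand in for Bolt's `type.h`):

```cpp
// How PtrCaster is meant to be used: the declared type at the call
// site selects the conversion operator, so one accessor serves all types.
typedef unsigned char U8;
typedef float F32;

class PtrCaster {
public:
    PtrCaster(void *p) : ptr(p) {}
    inline operator U8 *() { return (U8 *)ptr; }
    inline operator F32 *() { return (F32 *)ptr; }
private:
    void *ptr;
};

int main() {
    F32 buffer[4] = {1, 2, 3, 4};
    PtrCaster val(buffer);
    U8 *bytes = val;    // picks operator U8*()
    F32 *floats = val;  // picks operator F32*()
    return bytes == (U8 *)floats ? 0 : 1;  // both views alias the same memory
}
```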
-
-
-#ifndef _POOLING_H
-#define _POOLING_H
-#include "operator.hpp"
-#include "tensor_computing.h"
-
-class Pooling: public Operator {
-public:
-    Pooling(PoolingMode mode, U32 ksH, U32 ksW, U32 strideH, U32 strideW,
-        U32 paddingT, U32 paddingB, U32 paddingL, U32 paddingR, RoundMode rm)
-    {
-        this->mode = mode;
-        this->kernelSizeH = ksH;
-        this->kernelSizeW = ksW;
-        this->strideH = strideH;
-        this->strideW = strideW;
-        this->paddingT = paddingT;
-        this->paddingB = paddingB;
-        this->paddingL = paddingL;
-        this->paddingR = paddingR;
-        this->rm = rm;
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_Pooling;
-    }
-
-    PoolingDesc create_PoolingDesc(PoolingMode pm, U32 ksH, U32 ksW, U32 strideH, U32 strideW,
-        U32 paddingT, U32 paddingB, U32 paddingL, U32 paddingR, RoundMode rm)
-    {
-        PoolingDesc poolingDesc;
-        poolingDesc.pm = pm;
-        poolingDesc.kernelSize_h = ksH;
-        poolingDesc.kernelSize_w = ksW;
-        poolingDesc.stride_h = strideH;
-        poolingDesc.stride_w = strideW;
-        poolingDesc.padding_top = paddingT;
-        poolingDesc.padding_bottom = paddingB;
-        poolingDesc.padding_left = paddingL;
-        poolingDesc.padding_right = paddingR;
-        poolingDesc.rm = rm;
-        return poolingDesc;
-    }
-
-    void set_kernelSize(U32 globalKernelSizeH, U32 globalKernelSizeW) {
-        this->kernelSizeH = globalKernelSizeH;
-        this->kernelSizeW = globalKernelSizeW;
-    }
-
-    void set_stride(U32 globalStrideH, U32 globalStrideW) {
-        this->strideH = globalStrideH;
-        this->strideW = globalStrideW;
-    }
-
-protected:
-    PoolingMode mode;
-    RoundMode rm;
-    U32 kernelSizeH;
-    U32 kernelSizeW;
-    U32 strideH;
-    U32 strideW;
-    U32 paddingT;
-    U32 paddingB;
-    U32 paddingL;
-    U32 paddingR;
-};
-
-#endif //_POOLING_H
diff --git a/inference/include/preallocated_memory.hpp b/inference/include/preallocated_memory.hpp
deleted file mode 100644
index 90444118..00000000
--- a/inference/include/preallocated_memory.hpp
+++ /dev/null
@@ -1,56 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
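Reviewer note, not part of the diff: Pooling above only records window geometry; the spatial output size is computed inside tensor_computing. For orientation, the conventional arithmetic that the `RoundMode` field usually selects between is sketched below — an assumption about standard CEIL/FLOOR pooling semantics, not code lifted from the library:

```cpp
#include <cmath>
#include <cstdio>

// Output length of one pooling-window sweep; ceilMode mirrors RoundMode.
int pooled_size(int in, int kernel, int stride, int padBefore, int padAfter, bool ceilMode) {
    double span = in + padBefore + padAfter - kernel;
    return ceilMode ? (int)std::ceil(span / stride) + 1
                    : (int)std::floor(span / stride) + 1;
}

int main() {
    // 7-wide input, 2-wide kernel, stride 2, no padding: FLOOR drops the
    // trailing partial window (3 outputs), CEIL keeps it (4 outputs).
    std::printf("floor: %d, ceil: %d\n",
        pooled_size(7, 2, 2, 0, 0, false),
        pooled_size(7, 2, 2, 0, 0, true));
    return 0;
}
```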
-
-
-#ifndef _PREALLOCATED_MEMORY_H
-#define _PREALLOCATED_MEMORY_H
-
-#include "operator.hpp"
-
-class PreAllocatedMemory: public Operator
-{
-public:
-    /**
-    @param mode
-    */
-    PreAllocatedMemory(DataType dt, TensorDesc desc)
-    {
-        this->dt = dt;
-        this->desc = desc;
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_PreAllocatedMemory;
-    }
-
-    void run() override {
-        Tensor outputTensor = this->outputTensors[0];
-        TensorDesc outputDesc = outputTensor.get_desc();
-        memset(outputTensor.get_val(), 0, tensorNumBytes(outputDesc));
-    }
-
-    EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        if (inDims.size() > 0)
-            CHECK_STATUS(NOT_MATCH);
-
-        (*outDims)[0] = this->desc;
-        return SUCCESS;
-    }
-
-private:
-    TensorDesc desc;
-};
-
-#endif //_PREALLOCATED_MEMORY_H
diff --git a/inference/include/prior_box.hpp b/inference/include/prior_box.hpp
deleted file mode 100644
index 12903c40..00000000
--- a/inference/include/prior_box.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _PRIOR_BOX_H
-#define _PRIOR_BOX_H
-#include "operator.hpp"
-#include "tensor_computing.h"
-
-class PriorBox: public Operator {
-public:
-    PriorBox(DataType dt, PriorBoxDesc priorboxDesc)
-    {
-        this->dt = dt;
-        this->priorboxDesc = priorboxDesc;
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_PriorBox;
-    }
-
-    void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-
-        Vec<TensorDesc> inputDesc;
-
-        for (Tensor tensorIn: this->inputTensors) {
-            inputDesc.push_back(tensorIn.get_desc());
-        }
-        auto outputDesc = this->outputTensors[0].get_desc();
-        auto outputPtr = this->outputTensors[0].get_val();
-        CHECK_STATUS(priorbox(inputDesc, this->priorboxDesc, outputDesc, outputPtr, this->schedule));
-
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        CHECK_STATUS(priorbox_infer_output_size(inDims, this->priorboxDesc, &((*outDims)[0]), this->schedule));
-        return SUCCESS;
-    }
-protected:
-    PriorBoxDesc priorboxDesc;
-};
-#endif //_PRIOR_BOX_H
\ No newline at end of file
diff --git a/inference/include/reduction.hpp b/inference/include/reduction.hpp
deleted file mode 100644
index 3019a5a2..00000000
--- a/inference/include/reduction.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _REDUCTION_H
-#define _REDUCTION_H
-
-#include "operator.hpp"
-#include "tensor_computing.h"
-
-class Reduction: public Operator
-{
-public:
-    /**
-    @param mode
-    */
-    Reduction(DataType dt, I32 axis, bool keepDim, ReductionMode reductionMode, float coeff)
-    {
-        this->dt = dt;
-        this->axis = axis;
-        this->keepDim = keepDim;
-        this->reductionMode = reductionMode;
-        this->coeff = coeff;
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_Reduction;
-    }
-
-    void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-        Tensor inputTensor = this->inputTensors[0];
-        TensorDesc inputDesc = inputTensor.get_desc();
-        Tensor outputTensor = this->outputTensors[0];
-        TensorDesc outputDesc = outputTensor.get_desc();
-        TensorDesc maskDesc;
-        U8 *mask;
-        if (this->inputTensors.size() > 1) {
-            maskDesc = this->inputTensors[1].get_desc();
-            mask = this->inputTensors[1].get_val();
-        } else {
-            maskDesc.nDims = 0;
-            mask = nullptr;
-        }
-
-        CHECK_STATUS(reduction(inputDesc, inputTensor.get_val(),
-            maskDesc, mask,
-            this->axis,
-            this->reductionMode,
-            this->coeff,
-            outputDesc, outputTensor.get_val(), this->schedule));
-
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        TensorDesc maskDesc;
-        if (inDims.size() > 1)
-            maskDesc = inDims[1];
-        else
-            maskDesc.nDims = 0;
-        CHECK_STATUS(reduction_infer_output_size(inDims[0], maskDesc, this->axis, this->keepDim, &((*outDims)[0])));
-        return SUCCESS;
-    }
-
-private:
-    I32 axis;
-    bool keepDim;
-    ReductionMode reductionMode;
-    float coeff;
-};
-
-#endif
diff --git a/inference/include/relative_position_embedding.hpp b/inference/include/relative_position_embedding.hpp
deleted file mode 100644
index 5fcb7866..00000000
--- a/inference/include/relative_position_embedding.hpp
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _RELATIVE_POSITION_EMBEDDING_H
-#define _RELATIVE_POSITION_EMBEDDING_H
-#include "weight_operator.hpp"
-#include "embedding.hpp"
-#include "tensor_computing.h"
-
-class RelativePositionEmbedding: public Embedding {
-public:
-    RelativePositionEmbedding(DataType dt, U32 inputDim, U32 numOutput, bool transpose, I32 axis)
-        :Embedding(dt, inputDim, numOutput, transpose)
-    {
-        this->axis = axis;
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_RelativePositionEmbedding;
-    }
-
-    void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-        Tensor inputTensor = this->inputTensors[0];
-        Tensor weightTensor;
-        if (this->weightTensors.size() > 0)
-            weightTensor = this->weightTensors[0];
-        else
-            weightTensor = this->inputTensors[1];
-        Tensor outputTensor = this->outputTensors[0];
-
-        TensorDesc inputDesc = inputTensor.get_desc();
-        U8* weightPtr = weightTensor.get_val();
-        U8* outputPtr = outputTensor.get_val();
-
-        I32 tmpAxis = (this->axis + inputDesc.nDims) % inputDesc.nDims;
-        U32 batch = inputDesc.dims[inputDesc.nDims-1];
-        U32 length = inputDesc.dims[inputDesc.nDims - 1 - tmpAxis];
-        for (U32 in = 0; in < batch; in++) {
-            U8* ptr = outputPtr + in * length * this->numOutput * bytesOf(this->dt);
-            if (length > this->inputDim) {
-                U32 size = (length - this->inputDim) * this->numOutput * bytesOf(this->dt);
-                memset(ptr, 0, size);
-                ptr += size;
-            }
-            U32 start = 0;
-            U32 copyLength = this->inputDim;
-            if (length < this->inputDim) {
-                start = this->inputDim - length;
-                copyLength = length;
-            }
-            if (transpose) {
-                for (U32 i = 0; i < copyLength; i++) {
-                    for (U32 j = 0; j < this->numOutput; j++) {
-                        memcpy(ptr, weightPtr+(j*this->inputDim+start+i)*bytesOf(this->dt),
-                            bytesOf(this->dt));
-                        ptr += bytesOf(this->dt);
-                    }
-                }
-            } else {
-                memcpy(ptr, weightPtr+start*this->numOutput*bytesOf(this->dt),
-                    copyLength*this->numOutput*bytesOf(this->dt));
-            }
-        }
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        TensorDesc inDim = inDims[0];
-        I32 tmpAxis = (this->axis + inDim.nDims) % inDim.nDims;
-        U32 batch = inDim.dims[inDim.nDims-1];
-        U32 length = inDim.dims[inDim.nDims - 1 - tmpAxis];
-        (*outDims)[0] = tensor3df(this->dt, DF_MTK, batch, length, this->numOutput);
-        return SUCCESS;
-    }
-private:
-    int axis;
-};
-
-#endif //_RELATIVE_POSITION_EMBEDDING_H
diff --git a/inference/include/relative_shift.hpp b/inference/include/relative_shift.hpp
deleted file mode 100644
index 422227bd..00000000
--- a/inference/include/relative_shift.hpp
+++ /dev/null
@@ -1,90 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _RELATIVE_SHIFT_H
-#define _RELATIVE_SHIFT_H
-#include "operator.hpp"
-
-class RelativeShift: public Operator {
-public:
-    RelativeShift(DataType dt, I32 axis, I32 shiftLength)
-    {
-        this->dt = dt;
-        this->axis = axis;
-        this->shiftLength = shiftLength;
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_RelativeShift;
-    }
-
-    void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-        Tensor inputTensor = this->inputTensors[0];
-        Tensor outputTensor = this->outputTensors[0];
-        U8* inputPtr = inputTensor.get_val();
-        U8* outputPtr = outputTensor.get_val();
-
-        TensorDesc inputDesc = inputTensor.get_desc();
-        I32 tmpAxis = (this->axis + inputDesc.nDims) % inputDesc.nDims;
-        tmpAxis = (I32)inputDesc.nDims - 1 - tmpAxis;
-        U32 length = inputDesc.dims[tmpAxis];
-        if (tmpAxis + 1 >= (I32)inputDesc.nDims) {
-            memcpy(outputPtr, inputPtr, tensorNumBytes(inputDesc));
-            return;
-        }
-        U32 loops = inputDesc.dims[tmpAxis+1];
-        U32 innerLength = 1;
-        U32 outerLength = 1;
-        for (I32 i = 0; i < tmpAxis; i++) {
-            innerLength *= inputDesc.dims[i];
-        }
-        for (U32 i = tmpAxis+2; i < inputDesc.nDims; i++) {
-            outerLength *= inputDesc.dims[i];
-        }
-        U32 tileSize = innerLength * bytesOf(inputDesc.dt);
-        U32 chunkSize = length * tileSize;
-        U8* dstPtr = outputPtr;
-        for (U32 i = 0; i < outerLength; i++) {
-            U8* srcPtr = inputPtr + i * loops * chunkSize;
-            U32 num = loops * length - (loops - shiftLength) * (shiftLength + length);
-            U32 start = shiftLength * length - num;
-            U32 srcIndex = start * tileSize;
-            memcpy(dstPtr, srcPtr+srcIndex, num*tileSize);
-            dstPtr += num * tileSize;
-            srcIndex += num * tileSize;
-            for (U32 j = shiftLength; j < loops; j++) {
-                memset(dstPtr, 0, shiftLength*tileSize);
-                dstPtr += shiftLength * tileSize;
-                memcpy(dstPtr, srcPtr+srcIndex, chunkSize);
-                dstPtr += chunkSize;
-                srcIndex += chunkSize;
-            }
-        }
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        (*outDims)[0] = inDims[0];
-        return SUCCESS;
-    }
-private:
-    int axis;
-    int shiftLength;
-};
-
-#endif //_RELATIVE_SHIFT_H
diff --git a/inference/include/repeat.hpp b/inference/include/repeat.hpp
deleted file mode 100644
index 8ad2f80d..00000000
--- a/inference/include/repeat.hpp
+++ /dev/null
@@ -1,99 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _REPEAT_H
-#define _REPEAT_H
-
-#include "operator.hpp"
-
-class Repeat: public Operator
-{
-public:
-    /**
-    @param mode
-    */
-    Repeat(DataType dt, I32 loops, I32 axis, I32 jumpOperatorIndex, I32 currentOperatorIndex)
-    {
-        this->dt = dt;
-        this->loops = loops;
-        this->axis = axis;
-        this->iter = 0;
-        this->jumpOperatorIndex = jumpOperatorIndex;
-        this->nextOperatorIndex = currentOperatorIndex + 1;
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_Repeat;
-    }
-
-    void run() override
-    { }
-
-    int get_next_operator_index() override
-    {
-        // check status
-        if (this->inputTensors.size() > 1) {
-            Tensor inputTensor = this->inputTensors[1];
-            TensorDesc inputDesc = inputTensor.get_desc();
-            I32 *ptr = (I32 *)(inputTensor.get_val());
-            U32 length = tensorNumElements(inputDesc);
-            for (U32 i = 0; i < length; i++) {
-                // end loop
-                if (ptr[i]) {
-                    this->iter = 0;
-                    return this->nextOperatorIndex;
-                }
-            }
-        }
-
-        // check loop
-        if (this->iter < this->loops) {
-            this->iter++;
-            return this->jumpOperatorIndex;
-        } else {
-            this->iter = 0;
-            return this->nextOperatorIndex;
-        }
-    }
-
-    EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        this->iter = 0;
-        if (this->axis >= 0) {
-            int axisIndex = 0;
-            if (inDims.size() > 2) {
-                axisIndex = 2;
-            } else {
-                std::cerr << "[ERROR] the axis feature of Repeat requires at least 3 input tensors" << std::endl;
-                exit(1);
-            }
-            TensorDesc desc = inDims[axisIndex];
-            this->loops = desc.dims[desc.nDims-1-axis];
-        }
-
-        (*outDims)[0].dt = this->dt;
-        (*outDims)[0].nDims = 0;
-        return SUCCESS;
-    }
-
-private:
-    int loops;
-    int axis;
-    int iter;
-    int jumpOperatorIndex;
-    int nextOperatorIndex;
-};
-
-#endif //_REPEAT_H
diff --git a/inference/include/reshape.hpp b/inference/include/reshape.hpp
deleted file mode 100644
index 3cf7f668..00000000
--- a/inference/include/reshape.hpp
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _RESHAPE_H
-#define _RESHAPE_H
-
-#include "operator.hpp"
-#include "tensor_computing.h"
-
-class Reshape: public Operator {
-public:
-    /**
-     * @param shapeDims
-     * @param axis
-     * @param numAxes
-     */
-    Reshape(DataType dt, I32* shapeDimsPtr, I32 shapeSize, I32 axis, I32 numAxes) {
-        this->dt = dt;
-        shapeDims = Vec<I32>(shapeSize);
-        memcpy(this->shapeDims.data(), shapeDimsPtr, shapeSize * sizeof(I32));
-        this->axis = axis;
-        this->numAxes = numAxes;
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_Reshape;
-    }
-
-protected:
-    Vec<I32> shapeDims;
-    I32 axis;
-    I32 numAxes;
-};
-
-#endif //_RESHAPE_H
diff --git a/inference/include/resize.hpp b/inference/include/resize.hpp
deleted file mode 100644
index 59150218..00000000
--- a/inference/include/resize.hpp
+++ /dev/null
@@ -1,90 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _RESIZE_H
-#define _RESIZE_H
-
-#include "operator.hpp"
-#include "image.h"
-
-class Resize: public Operator {
-public:
-    Resize(DataType paramDT, void* paramPtr)
-    {
-        switch (paramDT) {
-            case DT_F32: {
-                memcpy(this->scale, paramPtr, 4 * bytesOf(paramDT));
-                break;
-            }
-            case DT_U32: {
-                memcpy(this->size, paramPtr, 2 * bytesOf(paramDT));
-                break;
-            }
-            default: {
-                CHECK_STATUS(NOT_SUPPORTED);
-            }
-        }
-        this->paramDT = paramDT;
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_Resize;
-    }
-
-    void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-
-        Tensor inputTensor = this->inputTensors[0];
-        TensorDesc inputDesc = inputTensor.get_desc();
-
-        Tensor outputTensor = this->outputTensors[0];
-        TensorDesc outputDesc = outputTensor.get_desc();
-
-        CHECK_STATUS(resize(inputDesc, inputTensor.get_val(), outputDesc, outputTensor.get_val(), this->schedule));
-
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        TensorDesc inputDesc = inDims[0];
-        ResizeDesc resizeDesc;
-        resizeDesc.paramDT = this->paramDT;
-        U32 bytes;
-        switch (paramDT) {
-            case DT_F32: {
-                CHECK_REQUIREMENT(1 == scale[0] && 1 == scale[1]);
-                CHECK_STATUS(resize_infer_output_size(inputDesc, resizeDesc, this->scale + 2, &((*outDims)[0]), &bytes));
-                break;
-            }
-            case DT_U32: {
-                CHECK_STATUS(resize_infer_output_size(inputDesc, resizeDesc, this->size, &((*outDims)[0]), &bytes));
-                break;
-            }
-            default: {
-                CHECK_STATUS(NOT_SUPPORTED);
-            }
-        }
-        return SUCCESS;
-    }
-
-private:
-    DataType paramDT;
-    F32 scale[4];
-    U32 size[2];
-};
-
-#endif //_RESIZE_H
diff --git a/inference/include/scale.hpp b/inference/include/scale.hpp
deleted file mode 100644
index acedd315..00000000
--- a/inference/include/scale.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
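Reviewer note, not part of the diff: my reading of the Resize constructor above (not verified against model_tools) is that the parameter blob is either four F32 scale factors (n, c, h, w) or two U32 explicit sizes (h, w), with `paramDT` selecting which view gets copied. A standalone sketch of that convention, with all names and values illustrative:

```cpp
#include <cstdio>
#include <cstring>

typedef unsigned int U32;
typedef float F32;
enum DataType { DT_F32, DT_U32 };

struct ResizeParam {
    DataType paramDT;
    F32 scale[4];  // used when paramDT == DT_F32
    U32 size[2];   // used when paramDT == DT_U32
};

void load_param(ResizeParam *p, DataType dt, const void *raw) {
    p->paramDT = dt;
    if (dt == DT_F32) {
        std::memcpy(p->scale, raw, 4 * sizeof(F32));  // n,c scales expected to be 1
    } else {
        std::memcpy(p->size, raw, 2 * sizeof(U32));   // explicit output h,w
    }
}

int main() {
    F32 scales[4] = {1, 1, 2.0f, 2.0f};  // double the spatial resolution
    ResizeParam p;
    load_param(&p, DT_F32, scales);
    std::printf("h scale: %.1f, w scale: %.1f\n", p.scale[2], p.scale[3]);
    return 0;
}
```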
-
-
-#ifndef _SCALE_H
-#define _SCALE_H
-
-#include <math.h>
-#include "weight_operator.hpp"
-#include "tensor_computing.h"
-#include "tensor_desc.h"
-#include "op_type.h"
-
-class Scale: public WeightOperator
-{
-public:
-    Scale(DataType dt, int axis, int numChannels, int numSource)
-    {
-        this->dt = dt;
-        this->axis = axis;
-        this->numSource = numSource;
-        this->numChannels = numChannels;
-        this->dataID = 0;
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_Scale;
-    }
-
-    bool can_input_output_the_same() override
-    {
-        return true;
-    }
-
-    virtual EE init_weight_bias_from_model(U8** modelPtr) = 0;
-
-protected:
-    int axis;
-    U32 numChannels;
-    int numSource;  // How many source tensors compose this input
-    int dataID;
-};
-
-#endif //_SCALE_H
diff --git a/inference/include/sequential.hpp b/inference/include/sequential.hpp
deleted file mode 100644
index b4b03c7d..00000000
--- a/inference/include/sequential.hpp
+++ /dev/null
@@ -1,201 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _SEQUENTIAL_HPP
-#define _SEQUENTIAL_HPP
-
-#include "sys.h"
-#include "error.h"
-#include "type.h"
-#include "tensor.hpp"
-#include "operator.hpp"
-#include "convolution.hpp"
-#include "fully_connected.hpp"
-#include "model.hpp"
-#include "op_type.h"
-#include "tensor_desc.h"
-#include "sequential.hpp"
-#include "lstm.hpp"
-
-class Sequential:public Model {
-public:
-    Sequential(Arch arch, DataType dt, std::string name): Model(arch, dt, name) { }
-
-    void initialize_weight(std::shared_ptr<U8> _modelPtr) {
-        this->modelPtr = _modelPtr;
-    }
-
-    EE infer_output_tensors_size(HashMap<std::string, TensorDesc> inputDescMap) override
-    {
-        if (inputDescMap.size() != 1) {
-            return NOT_SUPPORTED;
-        }
-        Vec<TensorDesc> inDims;
-        for (auto iter: inputDescMap)
-            inDims.push_back(iter.second);
-        Vec<TensorDesc> outDims(1);
-        this->dimsOp = { inDims };
-        auto num = [](Vec<TensorDesc> inDims) -> U32 {
-            U32 ret = 0;
-            for (auto d: inDims) ret += tensorNumElements(d);
-            return ret;
-        };
-
-        maxOutputElements = num(inDims);
-        for (auto op: this->ops) {
-            CHECK_STATUS(op->infer_output_tensors_size(inDims, &outDims));
-            dimsOp.push_back(outDims);
-            auto numElements = num(outDims);
-            if(maxOutputElements < numElements) maxOutputElements = numElements;
-            inDims = outDims;
-        }
-        return SUCCESS;
-    }
-
-    void assign_output_tensor() override
-    {
-        auto firstPtr = (U8*)operator new(bytesOf(this->dt) * maxOutputElements);
-        std::shared_ptr<U8> firstSharedPtr(firstPtr);
-        auto secondPtr = (U8*)operator new(bytesOf(this->dt) * maxOutputElements);
-        std::shared_ptr<U8> secondSharedPtr(secondPtr);
-        for (U32 i = 0; i < this->ops.size(); i++) {
-            auto op = this->ops[i];
-            auto inDims = dimsOp[i];
-            auto outDims = dimsOp[i+1];
-
-            Vec<Tensor> inTensors;
-            U32 index = 0;
-            for (auto d: inDims) {
-                auto val = std::shared_ptr<U8>(firstSharedPtr, (U8*)firstPtr + index*bytesOf(this->dt));
-                std::shared_ptr<Tensor> tensor = std::shared_ptr<Tensor>(new Tensor());
-                tensor->set_desc(d);
-                tensor->set_shared_ptr(val);
-                inTensors.push_back(*tensor.get());
-                index += tensorNumElements(d);
-            }
-
-            Vec<Tensor> outTensors;
-            index = 0;
-            for (auto d: outDims) {
-                auto val = std::shared_ptr<U8>(secondSharedPtr, (U8*)secondPtr + index*bytesOf(this->dt));
-                std::shared_ptr<Tensor> tensor = std::shared_ptr<Tensor>(new Tensor());
-                tensor->set_desc(d);
-                tensor->set_shared_ptr(val);
-                outTensors.push_back(*tensor.get());
-                index += tensorNumElements(d);
-            }
-
-            op->set_input_output_tensors(inTensors, outTensors);
-
-            std::swap(firstPtr, secondPtr);
-            std::swap(firstSharedPtr, secondSharedPtr);
-        }
-    }
-
-    EE ConvBiasAssignmentAndWeightTransform() {
-        return SUCCESS;
-    }
-
-    EE FCBiasAssignmentAndWeight() {
-        return SUCCESS;
-    }
-
-
-    void ready(HashMap<std::string, TensorDesc> inputDescMap) override
-    {
-        for (auto op : this->ops) {
-            op->set_op_schedule(this->schedule);
-        }
-        this->infer_output_tensors_size(inputDescMap);
-        this->assign_output_tensor();
-
-        U8* curPtr = modelPtr.get();
-        for (auto op : this->ops) {
-            if (op->is_weight()) {
-                if (op->get_op_type() == OT_Conv) {
-                    auto convOpPtr = dynamic_cast<Convolution*>(op.get());
-                    CHECK_STATUS(convOpPtr->init_weight_bias_from_model(&curPtr));
-                    CHECK_STATUS(convOpPtr->infer_forward_algorithm(this->algorithmMap));
-                    CHECK_STATUS(convOpPtr->transform_filter());
-                } else if (op->get_op_type() == OT_FC) {
-                    auto fcOpPtr = dynamic_cast<FullyConnected*>(op.get());
-                    CHECK_STATUS(fcOpPtr->init_weight_bias_from_model(&curPtr));
-                    CHECK_STATUS(fcOpPtr->transform_filter());
-                } else if (op->get_op_type() == OT_LSTM) {
-                    auto lstmOpPtr = dynamic_cast<LSTM*>(op.get());
-                    CHECK_STATUS(lstmOpPtr->init_weight_bias_from_model(&curPtr));
-                    CHECK_STATUS(lstmOpPtr->transform_filter());
-                }
-            }
-        }
-
-        this->infer_tmp_memory_size();
-        this->assign_tmp_tensor();
-    }
-
-    void infer_tmp_memory_size() override
-    {
-        tmpElements.clear();
-        maxTmpElements = 0;
-
-        for (auto op: this->ops) {
-            auto len = op->infer_tmp_memory_size();
-            tmpElements.push_back(len);
-            if(len > maxTmpElements) maxTmpElements = len;
-        }
-    }
-
-    void assign_tmp_tensor() override
-    {
-        temp = std::shared_ptr<Memory_>(new CpuMemory());
-        temp->alloc(this->maxTmpElements);
-        for (auto op: this->ops) {
-            op->set_tmp_memory(this->maxTmpElements, temp);
-        }
-    }
-
-    void add(std::shared_ptr<Operator> op)
-    {
-        this->ops.push_back(op);
-    }
-
-    Vec<Tensor> get_inputTensors()
-    {
-        auto op = this->ops[0].get();
-        return op->get_input_tensors();
-    }
-
-    Vec<Tensor> get_output_tensors()
-    {
-        auto len = this->ops.size();
-        auto op = this->ops[len-1].get();
-        return op->get_output_tensors();
-    }
-
-    void set_input_tensors(Vec<Tensor> inputTensors)
-    {
-        auto op = this->ops[0].get();
-        op->set_input_tensors(inputTensors);
-    }
-private:
-    std::shared_ptr<U8> modelPtr;
-    U32 maxOutputElements;
-    Vec<Vec<TensorDesc>> dimsOp;
-    U32 maxTmpElements;
-    Vec<U32> tmpElements;
-    std::shared_ptr<Memory_> temp;
-};
-#endif
-
-
diff --git a/inference/include/sequential_ocl.hpp b/inference/include/sequential_ocl.hpp
deleted file mode 100644
index f7d5ec42..00000000
--- a/inference/include/sequential_ocl.hpp
+++ /dev/null
@@ -1,295 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
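Reviewer note, not part of the diff: `Sequential::assign_output_tensor` above allocates just two arenas, each sized for the largest activation in the network, and swaps them after every layer so that operator i's output buffer becomes operator i+1's input buffer. A minimal standalone sketch of that double-buffer ("ping-pong") scheme, with made-up layer sizes:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    std::vector<size_t> layerElems = {64, 256, 128, 16};  // hypothetical per-layer outputs
    size_t maxElems = *std::max_element(layerElems.begin(), layerElems.end());

    // Two arenas sized for the worst case cover every layer boundary.
    std::vector<float> a(maxElems), b(maxElems);
    float *in = a.data(), *out = b.data();

    for (size_t i = 0; i < layerElems.size(); i++) {
        // A real run_layer(i, in, out) would write layerElems[i] floats to out.
        std::printf("layer %zu: in=%p out=%p\n", i, (void *)in, (void *)out);
        std::swap(in, out);  // this swap is the whole trick
    }
    return 0;
}
```

The design keeps peak activation memory at roughly twice the largest layer output, independent of network depth; it works only because a purely sequential graph never needs an activation more than one step later.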
-
-#ifdef _USE_MALI
-#ifndef _SEQUENTIAL_OCL_HPP
-#define _SEQUENTIAL_OCL_HPP
-
-#include <string>
-#include "sys.h"
-#include "error.h"
-#include "type.h"
-#include <memory>
-#include "tensor.hpp"
-#include "operator.hpp"
-#include "model.hpp"
-#include "op_type.h"
-#include "tensor_desc.h"
-#include "memory.hpp"
-#include "weight_operator.hpp"
-#include "pooling.hpp"
-#include "convolution.hpp"
-#include "bilateral_slice_apply.hpp"
-#include "ocl/pooling_ocl.hpp"
-#include "ocl/memory_ocl.hpp"
-#include "ocl/convolution_ocl.hpp"
-#include "ocl/bilateral_slice_apply_ocl.hpp"
-#include "ocl/fully_connected_ocl.hpp"
-#include "ocl/scale_ocl.hpp"
-
-
-class SequentialOcl : public Model {
-public:
-    SequentialOcl(Arch A, DataType dt, std::string name) : Model(A, dt, name) {
-        input_output_same = false;
-    }
-    virtual ~SequentialOcl() {
-    }
-
-    EE ready(Vec<TensorDesc> dims, std::shared_ptr<U8> modelPtr, U32 numOutput)
-    {
-        CHECK_STATUS(this->ops[0]->set_mali_handle(this->handle));
-        this->ops[0]->set_op_schedule(this->schedule);
-        input_output_same = this->ops[0]->can_input_output_the_same();
-        CHECK_STATUS(this->infer_output_tensors_size(dims, numOutput));
-        Vec<Tensor> inTensors;
-        Vec<Tensor> outTensors;
-        for(U32 i = 0; i < inputTensors.size(); i++) inTensors.push_back(*inputTensors[i].get());
-        for(U32 i = 0; i < outputTensors.size(); i++) outTensors.push_back(*outputTensors[i].get());
-        this->ops[0]->set_input_output_tensors(inTensors, outTensors);
-
-        if(this->ops[0]->is_weight()) {
-            U8* curPtr = modelPtr.get();
-            if(this->ops[0]->get_op_type() == OT_Conv) {
-                auto convOpPtr = dynamic_cast<Convolution*>(this->ops[0].get());
-                auto weightOp = (WeightOperator*) convOpPtr;
-                weightOp->set_hasBias(true);
-                CHECK_STATUS(convOpPtr->init_weight_bias_from_model(&curPtr));
-                CHECK_STATUS(convOpPtr->infer_forward_algorithm(this->algorithmMap));
-                CHECK_STATUS(convOpPtr->transform_filter());
-            }
-            if(this->ops[0]->get_op_type() == OT_FC) {
-                auto fcOpPtr = dynamic_cast<FullyConnected*>(this->ops[0].get());
-                auto weightOp = (WeightOperator*) fcOpPtr;
-                weightOp->set_hasBias(true);
-                CHECK_STATUS(fcOpPtr->init_weight_bias_from_model(&curPtr));
-                CHECK_STATUS(fcOpPtr->transform_filter());
-            }
-            if(this->ops[0]->get_op_type() == OT_Scale) {
-                auto scaleOpPtr = dynamic_cast<Scale*>(this->ops[0].get());
-                auto weightOp = (WeightOperator*) scaleOpPtr;
-                weightOp->set_hasBias(true);
-                CHECK_STATUS(scaleOpPtr->init_weight_bias_from_model(&curPtr));
-            }
-        }
-        this->infer_gclmem_descs();
-        for(U32 i = 0; i < inputTensors.size(); i++) inTensors.push_back(*inputTensors[i].get());
-        for(U32 i = 0; i < outputTensors.size(); i++) outTensors.push_back(*outputTensors[i].get());
-        this->ops[0]->set_input_output_tensors(inTensors, outTensors);
-        this->infer_tmp_memory_size();
-        this->assign_tmp_tensor();
-        this->alloc_output_host_tensors(numOutput);
-        return SUCCESS;
-    }
-
-    virtual EE infer_output_tensors_size(HashMap<std::string, TensorDesc>) override{return NOT_SUPPORTED;}
-    virtual void assign_output_tensor() override{}
-    EE infer_output_tensors_size(Vec<TensorDesc> dims, U32 outputTensorNum)
-    {
-        Vec<TensorDesc> inTensorDescs;
-        Vec<TensorDesc> outTensorDescs;
-        for (U32 i = 0; i < dims.size(); ++i) inTensorDescs.push_back(dims[i]);
-        for (U32 i = 0; i < outputTensorNum; ++i) {
-            TensorDesc tmpDesc;
-            outTensorDescs.push_back(tmpDesc);
-        }
-
-
-        CHECK_STATUS(this->ops[0]->infer_output_tensors_size(inTensorDescs, &outTensorDescs));
-
-        if(!input_output_same) {
-            for(U32 i = 0; i < dims.size(); ++i) {
-                std::shared_ptr<Tensor> tmpTensor(new Tensor(this->handle));
-                tmpTensor->set_desc(inTensorDescs[i]);
-                inputTensors.push_back(tmpTensor);
-            }
-        }
-
-        for(U32 i = 0; i < outputTensorNum; ++i) {
-            std::shared_ptr<Tensor> tmpTensor(new Tensor(this->handle));
-            tmpTensor->set_desc(outTensorDescs[i]);
-            outputTensors.push_back(tmpTensor);
-            if(input_output_same) inputTensors.push_back(tmpTensor);
-        }
-        return SUCCESS;
-    }
-
-    virtual EE infer_gclmem_descs(HashMap<std::string, TensorDesc>) override {return NOT_SUPPORTED;}
-    EE infer_gclmem_descs()
-    {
-        /*infer GCLMemDesc & alloc gpu mem for tensors*/
-        Vec<GCLMemDesc> inGCLMemDescs;
-        Vec<GCLMemDesc> outGCLMemDescs;
-        for (U32 i = 0; i < inputTensors.size(); ++i) {
-            U32 stride[3] = {0, 0, 0};
-            U32 offset[3] = {0, 0, 0};
-            GCLMemDesc tmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
-            inGCLMemDescs.push_back(tmpDesc);
-        }
-
-        for (U32 i = 0; i < outputTensors.size(); ++i) {
-            U32 stride[3] = {0, 0, 0};
-            U32 offset[3] = {0, 0, 0};
-            GCLMemDesc gclTmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
-            outGCLMemDescs.push_back(gclTmpDesc);
-        }
-
-        CHECK_STATUS(this->ops[0]->infer_gclmem_desc(&inGCLMemDescs, &outGCLMemDescs));
-
-        if(!input_output_same) {
-            for(U32 i = 0; i < inputTensors.size(); ++i) {
-                Memory_* tmpmem = inputTensors[i]->get_memory();
-                OclMemory* mem = (OclMemory*)tmpmem;
-                mem->set_mem_desc(inGCLMemDescs[i]);
-                inputTensors[i]->alloc();
-            }
-        }
-
-        for(U32 i = 0; i < outputTensors.size(); ++i) {
-            Memory_* tmpmem = outputTensors[i]->get_memory();
-            OclMemory* mem = (OclMemory*)tmpmem;
-            mem->set_mem_desc(outGCLMemDescs[i]);
-            outputTensors[i]->alloc();
-            if(input_output_same) inputTensors[i] = outputTensors[i];
-        }
-        return SUCCESS;
-    }
-
-    void alloc_output_host_tensors(U32 outputTensorNum) {
-        for(U32 i = 0; i < outputTensorNum; i++) {
-            std::shared_ptr<GCLMem> val = outputTensors[i]->get_shared_ptr();
-            auto device_desc = val->desc;
-            U32 size = device_desc.byteSize * 2;
-            val->desc.use_map = true;
-            val->desc.flags = CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
-            outputTensors[i]->get_memory()->alloc(size);
-        }
-    }
-
-    void infer_tmp_memory_size() override
-    {
-        maxTmpElements = 0;
-        for (auto op: this->ops) {
-            auto len = op->infer_tmp_memory_size();
-            if(len > maxTmpElements) maxTmpElements = len;
-        }
-    }
-
-    void assign_tmp_tensor() override
-    {
-        this->temp = std::shared_ptr<Memory_>(new OclMemory(this->handle));
-        temp->alloc(this->maxTmpElements);
-        for (auto op: this->ops) {
-            op->set_tmp_memory(this->maxTmpElements, temp);
-        }
-    }
-
-
-    void add(std::shared_ptr<Operator> op)
-    {
-        this->ops.push_back(op);
-    }
-
-    void mark_input_output()
-    {
-        if(this->schedule == MALI) {
-            U32 tmpBufSize = 0;
-            for(U32 i = 0; i < inputTensors.size(); i++) {
-                Tensor* inputTensor = inputTensors[i].get();
-                TensorDesc desc = inputTensor->get_desc();
-                GCLMem_t mem = inputTensor->get_val();
-                U32 size = 0;
-                tensor_computing_set_input_infer_tmpBuf_size(mem, desc, &size, MALI);
-                tmpBufSize = (tmpBufSize < size) ? size : tmpBufSize;
-            }
-
-            for(U32 i = 0; i < outputTensors.size(); i++) {
-                Tensor* outputTensor = outputTensors[i].get();
-                TensorDesc desc = outputTensor->get_desc();
-                GCLMem_t mem = outputTensor->get_val();
-                U32 size = 0;
-                tensor_computing_get_output_infer_tmpBuf_size(mem, desc, &size, MALI);
-                tmpBufSize = (tmpBufSize < size) ? size : tmpBufSize;
-            }
-
-            if(tmpBufSize > maxTmpElements) {
-                maxTmpElements = tmpBufSize;
-                temp->alloc(maxTmpElements);
-            }
-            DataType dt = inputTensors[0]->get_desc().dt;
-            auto gclmem = std::static_pointer_cast<GCLMem>(temp->get_shared_ptr());
-            gclmem->desc.stride[0] = this->maxTmpElements / bytesOf(dt);
-            gclmem->desc.stride[1] = 1;
-            gclmem->desc.stride[2] = 1;
-            gclmem->desc.memFormat = DF_NCHW;
-        }
-    }
-
-    void mali_prepare() {
-        Model::run_mali_prepare(false);
-        CHECK_STATUS(gcl_finish(this->handle.get()));
-    }
-
-    void set_input_tensors(Vec<Tensor> modelInputTensors)
-    {
-        for(U32 i = 0; i < modelInputTensors.size(); i++) {
-            U8* tmpTensorPtr = modelInputTensors[i].get_val();
-            TensorDesc tmpTensorDesc = modelInputTensors[i].get_desc();
-            Memory_* mem = inputTensors[i]->get_memory();
-            OclMemory* oclMem = (OclMemory*) mem;
-            auto tempMem = std::static_pointer_cast<GCLMem>(temp->get_shared_ptr());
-            oclMem->set_tmpBuf(tempMem);
-            oclMem->set_val_by_copy(tmpTensorDesc, tmpTensorPtr);
-        }
-        gcl_finish(this->handle.get());
-    }
-
-    Vec<std::shared_ptr<Tensor>> get_output_tensors()
-    {
-        for(U32 i = 0; i < outputTensors.size(); i++) {
-            auto outputTensor = outputTensors[i];
-            auto host_desc = outputTensor->get_desc();
-            auto mem = (OclMemory*)outputTensor->get_memory();
-            mem->get_val_to_hostptr(host_desc, NULL, CL_TRUE);
-        }
-        return this->outputTensors;
-    }
-
-#ifdef _USE_MALI
-#else
-
-    //TODO 0823
-    EE ConvBiasAssignmentAndWeightTransform() {
-        return SUCCESS;
-    }
-
-    //TODO 0823
-    EE FCBiasAssignmentAndWeight() {
-        return SUCCESS;
-    }
-
-
-
-
-#endif
-private:
-    using Model::ready;
-    U32 maxTmpElements;
-    std::shared_ptr<Memory_> temp;
-    Vec<std::shared_ptr<Tensor>> inputTensors;
-    Vec<std::shared_ptr<Tensor>> outputTensors;
-    bool input_output_same;
-};
-#endif
-#endif
-
diff --git a/inference/include/shared_weight.hpp b/inference/include/shared_weight.hpp
deleted file mode 100644
index 3171548d..00000000
--- a/inference/include/shared_weight.hpp
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _SHARED_WEIGHT_H
-#define _SHARED_WEIGHT_H
-
-#include "weight_operator.hpp"
-#include "op_type.h"
-
-class SharedWeight: public WeightOperator
-{
-public:
-    /**
-    @param mode
-    */
-    SharedWeight(DataType dt, TensorDesc desc)
-    {
-        this->dt = dt;
-        this->desc = desc;
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_SharedWeight;
-    }
-
-    EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        UNUSED(inDims);
-        (*outDims)[0] = this->desc;
-        return SUCCESS;
-    }
-
-    void run() override { }
-
-    EE init_weight_bias_from_model(U8** modelPtr)
-    {
-        TensorDesc weightDesc = this->desc;
-        U32 weightBytes = tensorNumBytes(weightDesc);
-
-        std::shared_ptr<Tensor> modelWeightTensor(new Tensor());
-        modelWeightTensor->set_desc(weightDesc);
-
-        if(modelPtr != nullptr){
-            modelWeightTensor->alloc();
-            memcpy((U8*)modelWeightTensor->get_val(), *modelPtr, weightBytes);
-            *modelPtr += weightBytes;
-        } else {
-            auto curOpWs = this->get_weightspec_ptr();
-            modelWeightTensor->set_shared_ptr(std::shared_ptr<U8>(curOpWs.weight));
-        }
-        this->weightTensors.push_back(*modelWeightTensor.get());
-        return SUCCESS;
-    }
-
-private:
-    TensorDesc desc;
-};
-
-#endif //_SHARED_WEIGHT_H
diff --git a/inference/include/slice.hpp b/inference/include/slice.hpp
deleted file mode 100644
index 5c5b517e..00000000
--- a/inference/include/slice.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _SLICE_H
-#define _SLICE_H
-
-#include "operator.hpp"
-#include "tensor_computing.h"
-
-class Slice: public Operator {
-public:
-    Slice(DataType dt, I32 axis, I32* slicePointsPtr, I32 sliceSize)
-    {
-        this->dt = dt;
-        this->axis = axis;
-        this->slicePoints = Vec<I32>(sliceSize);
-        memcpy(this->slicePoints.data(), slicePointsPtr, sizeof(I32) * sliceSize);
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_Slice;
-    }
-
-protected:
-    Vec<I32> slicePoints;
-    I32 axis;
-};
-
-#endif //_SLICE_H
diff --git a/inference/include/squeeze.hpp b/inference/include/squeeze.hpp
deleted file mode 100644
index 63964494..00000000
--- a/inference/include/squeeze.hpp
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _SQUEEZE_H
-#define _SQUEEZE_H
-
-#include "operator.hpp"
-
-class Squeeze: public Operator
-{
-public:
-    /**
-    @param mode
-    */
-    Squeeze(DataType dt, I32 axis, I32 *dims, I32 dimSize)
-    {
-        this->dt = dt;
-        this->axis = axis;
-        this->dims = Vec<I32>(dimSize);
-        memcpy(this->dims.data(), dims, sizeof(I32) * dimSize);
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_Squeeze;
-    }
-
-protected:
-    I32 axis;
-    Vec<I32> dims;
-};
-
-#endif //_SQUEEZE_H
diff --git a/inference/include/tensor.hpp b/inference/include/tensor.hpp
deleted file mode 100644
index 71814530..00000000
--- a/inference/include/tensor.hpp
+++ /dev/null
@@ -1,196 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
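Reviewer note, not part of the diff: the Tensor class whose body follows dequantizes DT_I8 elements in `getElement` by dividing by the tensor's scale, i.e. it assumes symmetric quantization with real = stored / scale. A standalone sketch of that round trip (the scale value is illustrative):

```cpp
#include <cmath>
#include <cstdio>

typedef signed char INT8;
typedef float F32;

int main() {
    F32 scale = 63.5f;  // e.g. chosen so |real| <= 2.0 fits the int8 range
    F32 real = 0.73f;
    INT8 stored = (INT8)std::lround(real * scale);  // quantize: stored = round(real * scale)
    F32 recovered = stored / scale;                 // what getElement computes for DT_I8
    std::printf("stored=%d recovered=%f\n", stored, recovered);
    return 0;
}
```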
-
-
-#ifndef _TENSOR_H
-#define _TENSOR_H
-
-#include <iostream>
-#include <math.h>
-#include <map>
-#include <memory>
-#include <vector>
-#include "memory.hpp"
-#include "cpu/memory_cpu.hpp"
-#ifdef _USE_MALI
-#include "ocl/memory_ocl.hpp"
-#endif
-#include <string>
-#include "type.h"
-#include "tensor_desc.h"
-
-#define HashMap std::map
-#define Vec std::vector
-
-class Tensor {
-public:
-    Tensor()
-    {
-        this->val = std::shared_ptr<Memory_>(new CpuMemory());
-        this->scalePtr = std::shared_ptr<F32>((F32*)operator new(bytesOf(DT_F32)));
-    }
-
-#ifdef _USE_MALI
-    Tensor(std::shared_ptr<GCLHandle> handle)
-    {
-        this->val = std::shared_ptr<Memory_>(new OclMemory(handle));
-        this->scalePtr = std::shared_ptr<F32>((F32*)operator new(bytesOf(DT_F32)));
-    }
-#endif
-
-    void alloc()
-    {
-        this->val->alloc(desc);
-    }
-
-    void set_desc(TensorDesc d)
-    {
-        this->desc = d;
-    }
-
-    TensorDesc get_desc()
-    {
-        return this->desc;
-    }
-
-    void set_scale(F32 s)
-    {
-        if (nullptr == this->scalePtr.get()) {
-            this->scalePtr = std::shared_ptr<F32>((F32*)operator new(bytesOf(DT_F32)));
-        }
-        *(this->scalePtr.get()) = s;
-    }
-
-    F32 get_scale()
-    {
-        if (nullptr != this->scalePtr.get()) {
-            return *(this->scalePtr.get());
-        } else {
-            return 1.0;
-        }
-    }
-
-    void set_val_by_copy(TensorDesc desc, U8* ptr) {
-        this->val->set_val_by_copy(desc, ptr);
-    }
-
-    void set_shared_ptr(std::shared_ptr<U8> val)
-    {
-        this->val->set_shared_ptr_caster(val);
-    }
-
-    PtrCaster get_val()
-    {
-        return this->val->get_val_caster();
-    }
-
-    PtrCasterShared get_shared_ptr()
-    {
-        return this->val->get_shared_ptr_caster();
-    }
-
-    void set_memory(std::shared_ptr<Memory_> mem)
-    {
-        this->val = mem;
-    }
-
-    Memory_* get_memory()
-    {
-        return this->val.get();
-    }
-
-    bool isInvalid()
-    {
-        U32 num = tensorNumElements(this->desc);
-        for (U32 i = 0; i < num; i++) {
-            if (UNI_ISNAN(getElement(i)) || UNI_ISINF(getElement(i))) {
-                return true;
-            }
-        }
-        return false;
-    }
-
-    void print()
-    {
-        U32 num = tensorNumElements(this->desc);
-        std::cout << num << std::endl;
-        num = (num > 64) ? 64 : num;
-        for(U32 i = 0; i < num; i++) {
-            std::cout << getElement(i) << " ";
-        }
-        std::cout << std::endl;
-    }
-
-    F32 getElement(U32 index)
-    {
-        F32 value = 0;
-        U8* res = NULL;
-#ifdef _USE_MALI
-        if(val->get_mem_type() == OCLMem) {
-            std::shared_ptr<GCLMem> oclMem = this->val->get_shared_ptr_caster();
-            if(!oclMem->desc.use_map) {
-                std::cerr << "[ERROR] checking values of unmapped GCL memory is not supported" << std::endl;
-                exit(1);
-            }
-            res = oclMem->desc.map_ptr;
-        } else {
-#endif
-            res = (U8*)this->val->get_val();
-#ifdef _USE_MALI
-        }
-#endif
-        switch (this->desc.dt) {
-            case DT_F32: {
-                F32* data = (F32*)res;
-                value = data[index];
-                break;
-            }
-#ifdef __aarch64__
-            case DT_F16: {
-                F16* data = (F16*)res;
-                value = data[index];
-                break;
-            }
-#endif
-            case DT_U32: {
-                U32* data = (U32*)res;
-                value = data[index];
-                break;
-            }
-            case DT_I32: {
-                I32* data = (I32*)res;
-                value = data[index];
-                break;
-            }
-            case DT_I8: {
-                INT8* data = (INT8*)res;
-                value = data[index] / this->get_scale();
-                break;
-            }
-            case DT_U8: {
-                U8* data = (U8*)res;
-                value = data[index];
-                break;
-            }
-            default:
-                CHECK_STATUS(NOT_SUPPORTED);
-        }
-        return value;
-    }
-
-private:
-    TensorDesc desc;
-    std::shared_ptr<Memory_> val;
-    std::shared_ptr<F32> scalePtr;
-};
-#endif //_TENSOR_H
diff --git a/inference/include/transpose.hpp b/inference/include/transpose.hpp
deleted file mode 100644
index 6dcc0213..00000000
--- a/inference/include/transpose.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _TRANSPOSE_H
-#define _TRANSPOSE_H
-
-#include <string.h>
-#include "operator.hpp"
-#include "tensor_computing.h"
-
-class Transpose: public Operator {
-public:
-    Transpose(DataType dt, U32* transDimsPtr, U32 transDimsSize)
-    {
-        this->dt = dt;
-        this->transDims = Vec<U32>(transDimsSize);
-        memcpy(this->transDims.data(), transDimsPtr, sizeof(U32) * transDimsSize);
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_Transpose;
-    }
-protected:
-    Vec<U32> transDims;
-};
-
-#endif //_TRANSPOSE_H
diff --git a/inference/include/unsqueeze.hpp b/inference/include/unsqueeze.hpp
deleted file mode 100644
index 74baa4f5..00000000
--- a/inference/include/unsqueeze.hpp
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
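For orientation, the shape rule that Unsqueeze::infer_output_tensors_size implements below inserts a length-1 dimension at the requested axis, with negative axes counting from the end. A standalone mirror in logical dimension order, purely for illustration (bolt itself stores dims reversed, which is why the code below indexes from nDims):

    #include <vector>

    // Illustrative mirror of the axis-insertion rule; not the bolt API.
    std::vector<int> unsqueeze_shape(std::vector<int> shape, int axis)
    {
        if (axis < 0) {                         // negative axes count from the end
            axis += (int)shape.size() + 1;
        }
        shape.insert(shape.begin() + axis, 1);  // insert a length-1 dimension
        return shape;
    }
    // unsqueeze_shape({2, 3}, 0)  -> {1, 2, 3}
    // unsqueeze_shape({2, 3}, -1) -> {2, 3, 1}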
-
-
-#ifndef _UNSQUEEZE_H
-#define _UNSQUEEZE_H
-
-#include "operator.hpp"
-
-class Unsqueeze: public Operator
-{
-public:
-    /**
-     * @param axis axis at which to insert a size-1 dimension; dims/dimSize list the axes when several are inserted
-     */
-    Unsqueeze(DataType dt, I32 axis, I32 *dims, I32 dimSize)
-    {
-        this->dt = dt;
-        this->axis = axis;
-        this->dims = Vec<I32>(dimSize);
-        memcpy(this->dims.data(), dims, sizeof(I32) * dimSize);
-    }
-
-    OperatorType get_op_type() override
-    {
-        return OT_Unsqueeze;
-    }
-
-    void run() override
-    {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
-
-        Tensor inputTensor = this->inputTensors[0];
-        TensorDesc inputDesc = inputTensor.get_desc();
-        Tensor outputTensor = this->outputTensors[0];
-        U8* inPtr = inputTensor.get_val();
-        U8* outPtr = outputTensor.get_val();
-        if (inPtr != outPtr) {
-            memcpy(outPtr, inPtr, tensorNumBytes(inputDesc));
-        }
-
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
-    }
-
-    EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
-    {
-        auto outDimsPtr = &((*outDims)[0]);
-        outDimsPtr->dt = inDims[0].dt;
-        int axis = this->axis;
-        if (axis < 0)
-            axis += inDims[0].nDims + 1;
-        if (axis >= 0 && axis < (int)inDims[0].nDims + 1) {
-            axis = inDims[0].nDims - axis;
-            for (int i = 0; i < axis; i++) {
-                outDimsPtr->dims[i] = inDims[0].dims[i];
-            }
-            outDimsPtr->dims[axis] = 1;
-            for (int i = axis; i < (int)inDims[0].nDims; i++) {
-                outDimsPtr->dims[i+1] = inDims[0].dims[i];
-            }
-            outDimsPtr->nDims = inDims[0].nDims + 1;
-        } else {
-            outDimsPtr->nDims = this->dims.size() + inDims[0].nDims;
-            for (U32 i = 0; i < outDimsPtr->nDims; i++) {
-                outDimsPtr->dims[i] = 0;
-            }
-            for (U32 i = 0; i < this->dims.size(); i++) {
-                outDimsPtr->dims[outDimsPtr->nDims - 1 - this->dims[i]] = 1;
-            }
-            U32 index = 0;
-            for (U32 i = 0; i < outDimsPtr->nDims; i++) {
-                if (outDimsPtr->dims[i] == 0)
-                    outDimsPtr->dims[i] = inDims[0].dims[index++];
-            }
-            CHECK_REQUIREMENT(index == inDims[0].nDims);
-        }
-        outDimsPtr->df = getTensorDefaultDataFormat(outDimsPtr->nDims);
-        return SUCCESS;
-    }
-
-private:
-    I32 axis;
-    Vec<I32> dims;
-};
-
-#endif //_UNSQUEEZE_H
diff --git a/inference/include/utils.hpp b/inference/include/utils.hpp
deleted file mode 100644
index 92bb3ec8..00000000
--- a/inference/include/utils.hpp
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
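The UTIL_TIME_* macros declared in the utils.hpp below bracket a region with matching keys and accumulate per-key elapsed milliseconds (the macro expansions carry their own trailing semicolons). A hypothetical profiled region:

    #include "utils.hpp"

    // Sketch only: TIC/TOC keys must match for the sample to be accumulated.
    void profiled_region()
    {
        UTIL_TIME_INIT            // clear any previous samples
        UTIL_TIME_TIC("region")
        // ... work to be measured ...
        UTIL_TIME_TOC("region")
        UTIL_TIME_STATISTICS      // print the per-key accumulated times
    }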
- - -#ifndef _UTILS_HPP -#define _UTILS_HPP - -#include "ut_util.h" - -std::string extract_class_function(std::string&& pretty_function); -std::string extract_file_function(std::string&& pretty_function); - -#define __CLASS_FUNCTION__ extract_class_function(std::string(__PRETTY_FUNCTION__)) -#define __FILE_FUNCTION__ extract_file_function(std::string(__FILE__)+"::"+std::string(__FUNCTION__)) - -void ut_time_init(); -void ut_time_tic(std::string key); -void ut_time_toc(std::string key); -void ut_time_statistics(); - -#define UTIL_TIME_INIT ut_time_init(); -#define UTIL_TIME_TIC(str) ut_time_tic(str); -#define UTIL_TIME_TOC(str) ut_time_toc(str); -#define UTIL_TIME_STATISTICS ut_time_statistics(); - - -#endif //_UTILS_HPP diff --git a/inference/include/weight_operator.hpp b/inference/include/weight_operator.hpp deleted file mode 100644 index eee24e77..00000000 --- a/inference/include/weight_operator.hpp +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
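The weight_operator.hpp below sizes weights as element count times bytesOf(data type), summed over weightTensors (bias tensors are tracked separately). A worked example with hypothetical shapes:

    #include <cstdio>

    // Mirrors the accounting in WeightOperator::get_weight_size() below:
    // tensorNumElements(desc) * bytesOf(desc.dt), summed per weight tensor.
    int main()
    {
        unsigned elems = 64 * 3 * 3 * 3;  // a 64x3x3x3 convolution weight tensor
        unsigned bytes = elems * 2;       // DT_F16 occupies 2 bytes per element
        printf("%u elements -> %u bytes\n", elems, bytes);  // 1728 -> 3456
        return 0;
    }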
-
-
-#ifndef _WEIGHTOPERATOR_H
-#define _WEIGHTOPERATOR_H
-
-#include "operator.hpp"
-#include "utils.hpp"
-#include "model_tools.h"
-
-class WeightOperator: public Operator {
-public:
-    WeightOperator()
-    {
-        hasBias = false;
-        lenOfWtm = 0;
-
-        ws.mdt = DT_U8;
-        ws.bytes_of_weight = 0;
-        ws.weight = nullptr;
-        ws.bytes_of_vec = 0;
-        ws.vec = nullptr;
-    }
-
-    virtual bool is_weight() override
-    {
-        return true;
-    }
-
-    /**
-     * @return total bytes occupied by all weight tensors
-     */
-    U32 get_weight_size()
-    {
-        U32 ret = 0;
-        for (auto tensor : weightTensors) {
-            auto dim = tensor.get_desc();
-            ret += tensorNumElements(dim) * bytesOf(dim.dt);
-        }
-        return ret;
-    }
-
-    virtual void set_weightTensors(Vec<Tensor> weightTensors)
-    {
-        this->weightTensors = weightTensors;
-    }
-
-    virtual void set_biasTensors(Vec<Tensor> biasTensors)
-    {
-        this->biasTensors = biasTensors;
-    }
-
-    virtual U32 infer_wtm_memory_size()
-    {
-        this->lenOfWtm = 0;
-        this->wtm = std::shared_ptr<Tensor>();
-        return 0;
-    }
-#ifdef _USE_MALI
-    virtual GCLMemDesc infer_wtm_memory_size_mali()
-    {
-        this->lenOfWtm = 0;
-        this->wtm = std::shared_ptr<Tensor>();
-        U32 stride[3] = {0, 0, 0};
-        U32 offset[3] = {0, 0, 0};
-        GCLMemDesc tmpdesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
-        return tmpdesc;
-    }
-#endif
-
-    virtual void set_wtm_memory(U32 len, std::shared_ptr<Memory_> wtm)
-    {
-        this->lenOfWtm = len;
-        this->wtm->set_memory(wtm);
-    }
-
-    virtual U32 get_lenOfWtm()
-    {
-        return this->lenOfWtm;
-    }
-
-    virtual Tensor* get_wtm()
-    {
-        return this->wtm.get();
-    }
-
-    virtual void set_weightspec_ptr(WeightSpec ws)
-    {
-        this->ws = ws;  // plain assignment copies the POD WeightSpec member by member
-    }
-
-    virtual WeightSpec get_weightspec_ptr()
-    {
-        return ws;
-    }
-
-    virtual void set_hasBias(bool hasBiasOrNot)
-    {
-        this->hasBias = hasBiasOrNot;
-    }
-
-public:
-    Vec<Tensor> weightTensors;
-    Vec<Tensor> biasTensors;
-    bool hasBias;
-
-    U32 lenOfWtm;
-    std::shared_ptr<Tensor> wtm;
-    WeightSpec ws;
-};
-
-#endif //_WEIGHTOPERATOR_H
diff --git a/inference/src/BoltModel_Jni.cpp b/inference/src/BoltModel_Jni.cpp
deleted file mode 100644
index 315ed9d2..00000000
--- a/inference/src/BoltModel_Jni.cpp
+++ /dev/null
@@ -1,516 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
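The JNI bridge deleted below follows standard JNI symbol mangling, in which "_1" inside an exported name decodes to "_" in the Java-side native method. The Java class is not part of this diff, so the right-hand names here are inferred, not quoted:

    // Exported symbol (verbatim below)    -> native method on class BoltModel (inferred)
    // Java_BoltModel_model_1create        -> model_create(modelPath, affinity, device)
    // Java_BoltModel_model_1ready         -> model_ready(handle, inputs, shapes, types, formats)
    // Java_BoltModel_model_1resize_1input -> model_resize_input(handle, inputs, shapes, types, formats)
    // Java_BoltModel_IResult_1malloc_1all -> IResult_malloc_all(handle)
    // Java_BoltModel_IResult_1malloc_1part-> IResult_malloc_part(handle, numOutputs, names)
    // Java_BoltModel_getOutput            -> getOutput(resultHandle)
    // Java_BoltModel_IResult_1free        -> IResult_free(resultHandle)
    // Java_BoltModel_destroyModel         -> destroyModel(handle)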
- - -#ifdef __clang__ -#include -#include -#include -#include "type.h" -#include "tensor_desc.h" -#include "cnn.hpp" -#include "BoltModel.h" -#include "../exports/c/bolt.h" - -struct ModelHandleInfo { - void* cnn; - void* ms; - DEVICE_TYPE deviceType; -}; - -typedef struct { - U32 dims[4] = {0}; - char name[NAME_LEN] = {0}; - DataType dt; - DataFormat df; - void* dataPtr; -} DataDesc; - -typedef struct { - U32 num_outputs; - DataDesc* outputArr; - DEVICE_TYPE deviceType; -} ResultHandleInner; - -AFFINITY_TYPE str2AFFINITY_TYPE(std::string affinity_str) { - AFFINITY_TYPE ret = HIGH_PERFORMANCE; - if (affinity_str == "HIGH_PERFORMANCE") { - ret = HIGH_PERFORMANCE; - } else if (affinity_str == "LOW_POWER") { - ret = LOW_POWER; - } else { - std::cerr << "[ERROR] unsupported JNI CPU affinity setting " << affinity_str << std::endl; - exit(1); - } - return ret; -} - -DEVICE_TYPE str2DEVICE_TYPE(std::string device_str) { - DEVICE_TYPE ret = CPU; - if (device_str == "CPU") { - ret = CPU; - } else if (device_str == "GPU") { - ret = GPU; - } else { - std::cerr << "[ERROR] unsupported JNI device setting " << device_str << std::endl; - exit(1); - } - return ret; -} - -DATA_TYPE str2DATA_TYPE (std::string data_type) { - DATA_TYPE ret = FP_32; - if (data_type == "FP32") { - ret = FP_32; -#ifdef __aarch64__ - } else if (data_type == "FP16"){ - ret = FP_16; -#endif - } else if (data_type == "INT32") { - ret = INT_32; - } else if (data_type == "UINT32") { - ret = UINT_32; - } else { - std::cerr << "[ERROR] unsupported JNI data type setting " << data_type << std::endl; - exit(1); - } - return ret; -} - -DATA_FORMAT str2DATA_FORMAT (std::string data_format) { - DATA_FORMAT ret = NCHW; - if (data_format == "NCHW") { - ret = NCHW; - } else if (data_format == "NHWC") { - ret = NHWC; - } else if (data_format == "MTK") { - ret = MTK; - } else if (data_format == "NORMAL") { - ret = NORMAL; - } else { - std::cerr << "[ERROR] unsupported JNI data format setting " << data_format << std::endl; - exit(1); - } - return ret; -} - -std::string DataFormat2str (DataFormat data_format) { - std::string ret = "NCHW"; - switch (data_format) { - case DF_NCHW: - ret = "NCHW"; - break; - case DF_NCHWC8: - ret = "NCHWC8"; - break; - case DF_NHWC: - ret = "NHWC"; - break; - case DF_MTK: - ret = "MTK"; - break; - case DF_NORMAL: - ret = "NORMAL"; - break; - default: - std::cerr << "[ERROR] unsupported JNI data format setting " << data_format << std::endl; - exit(1); - } - return ret; -} - -void dataTypeConverterToFloat(void *src, DataType srcDataType, float *dst, int num) { - switch (srcDataType) { -#ifdef __aarch64__ - case DT_F16: { - F16 *srcPtr = (F16 *)src; - for (int i = 0; i < num; i++) { - dst[i] = srcPtr[i]; - } - break; - } -#endif - case DT_F32: { - memcpy(dst, src, sizeof(float)*num); - break; - } - case DT_U32: { - U32 *srcPtr = (U32 *)src; - for (int i = 0; i < num; i++) { - dst[i] = srcPtr[i]; - } - break; - } - case DT_I32: { - I32 *srcPtr = (I32 *)src; - for (int i = 0; i < num; i++) { - dst[i] = srcPtr[i]; - } - break; - } - default: - std::cerr << "[ERROR] unsupported source data type in " << __func__ << std::endl; - exit(1); - } -} - -void dataTypeConverterFromFloat(float *src, void *dst, DataType dstDataType, int num) { - switch (dstDataType) { -#ifdef __aarch64__ - case DT_F16: { - F16 *dstPtr = (F16 *)dst; - for (int i = 0; i < num; i++) { - dstPtr[i] = (F16)src[i]; - } - break; - } -#endif - case DT_F32: { - memcpy(dst, src, sizeof(float)*num); - break; - } - case DT_U32: { - U32 *dstPtr = (U32 *)dst; - for 
(int i = 0; i < num; i++) { - dstPtr[i] = (U32)src[i]; - } - break; - } - case DT_I32: { - I32 *dstPtr = (I32 *)dst; - for (int i = 0; i < num; i++) { - dstPtr[i] = (I32)src[i]; - } - break; - } - default: - std::cerr << "[ERROR] unsupported source data type in " << __func__ << std::endl; - exit(1); - } -} - -extern "C" JNIEXPORT jlong JNICALL Java_BoltModel_model_1create - (JNIEnv *env, jobject, jstring modelPath, jstring affinity, jstring device) { - const char* modelPathPtr = env->GetStringUTFChars(modelPath, JNI_FALSE); - const char* affinityPtr = env->GetStringUTFChars(affinity, JNI_FALSE); - const char* devicePtr = env->GetStringUTFChars(device, JNI_FALSE); - std::string affinity_str = (std::string)affinityPtr; - AFFINITY_TYPE affinity_cur = str2AFFINITY_TYPE(affinity_str); - std::string device_str = devicePtr; - DEVICE_TYPE device_cur = str2DEVICE_TYPE(device_str); - - long modelAddr = (long)CreateModel(modelPathPtr, affinity_cur, device_cur, NULL); - return modelAddr; -} - -extern "C" JNIEXPORT void JNICALL Java_BoltModel_model_1ready - (JNIEnv *env, jobject, jlong modelAddr, jint num_input, jobjectArray input_names, jintArray n, jintArray c, jintArray h, jintArray w, jobjectArray dt_input, jobjectArray df_input) { - ModelHandle ih = (ModelHandle)modelAddr; - - jint *curArray_n = env->GetIntArrayElements(n, 0); - int* datas_n = (int*)malloc(num_input * sizeof(int)); - jint *curArray_c = env->GetIntArrayElements(c, 0); - int* datas_c = (int*)malloc(num_input * sizeof(int)); - jint *curArray_h = env->GetIntArrayElements(h, 0); - int* datas_h = (int*)malloc(num_input * sizeof(int)); - jint *curArray_w = env->GetIntArrayElements(w, 0); - int* datas_w = (int*)malloc(num_input * sizeof(int)); - for (int i = 0; i < num_input; i++) { - datas_n[i] = curArray_n[i]; - datas_c[i] = curArray_c[i]; - datas_h[i] = curArray_h[i]; - datas_w[i] = curArray_w[i]; - } - - char** input_names_ptr = (char**)malloc(sizeof(char*) * num_input); - std::vector name_strs; - for (int i=0; i < num_input; i++) { - jstring cur_str = (jstring)(env->GetObjectArrayElement(input_names, i)); - const char* cur_str_ptr = env->GetStringUTFChars(cur_str, 0); - std::string tmp_str = cur_str_ptr; - name_strs.push_back(tmp_str); - input_names_ptr[i] = (char*)name_strs[i].c_str(); - - env->ReleaseStringUTFChars(cur_str, cur_str_ptr); - env->DeleteLocalRef(cur_str); - } - - for (int i=0; iGetArrayLength(dt_input); - int df_input_num = env->GetArrayLength(df_input); - - if (dt_input_num != df_input_num) { - std::cerr << "[ERROR]: num of input_datatype not equal to num of input_dataformat!" 
<< std::endl; - exit(1); - } - - for (int i=0; iGetObjectArrayElement(dt_input, i)); - const char* tmp_str_dt_ptr = env->GetStringUTFChars(tmp_str_dt, 0); - std::string cur_tmp_str_dt = tmp_str_dt_ptr; - dt_inputs_ptr[i] = str2DATA_TYPE(cur_tmp_str_dt); - - jstring tmp_str_df = (jstring)(env->GetObjectArrayElement(df_input, i)); - const char* tmp_str_df_ptr = env->GetStringUTFChars(tmp_str_df, 0); - std::string cur_tmp_str_df = tmp_str_df_ptr; - df_inputs_ptr[i] = str2DATA_FORMAT(cur_tmp_str_df); - } - PrepareModel(ih, num_input, datas_n, datas_c, datas_h, datas_w, input_names_ptr, dt_inputs_ptr, df_inputs_ptr); - - env->ReleaseIntArrayElements(n, curArray_n, 0); - free(datas_n); - env->ReleaseIntArrayElements(c, curArray_c, 0); - free(datas_c); - env->ReleaseIntArrayElements(h, curArray_h, 0); - free(datas_h); - env->ReleaseIntArrayElements(w, curArray_w, 0); - free(datas_w); - free(input_names_ptr); -} - -extern "C" JNIEXPORT void JNICALL Java_BoltModel_model_1resize_1input - (JNIEnv *env, jobject, jlong modelAddr, jint num_input, jobjectArray input_names, jintArray n, jintArray c, jintArray h, jintArray w, jobjectArray dt_input, jobjectArray df_input) { - ModelHandle ih = (ModelHandle)modelAddr; - - jint *curArray_n = env->GetIntArrayElements(n, 0); - int* datas_n = (int*)malloc(num_input * sizeof(int)); - jint *curArray_c = env->GetIntArrayElements(c, 0); - int* datas_c = (int*)malloc(num_input * sizeof(int)); - jint *curArray_h = env->GetIntArrayElements(h, 0); - int* datas_h = (int*)malloc(num_input * sizeof(int)); - jint *curArray_w = env->GetIntArrayElements(w, 0); - int* datas_w = (int*)malloc(num_input * sizeof(int)); - for (int i = 0; i < num_input; i++) { - datas_n[i] = curArray_n[i]; - datas_c[i] = curArray_c[i]; - datas_h[i] = curArray_h[i]; - datas_w[i] = curArray_w[i]; - } - - char** input_names_ptr = (char**)malloc(sizeof(char*) * num_input); - std::vector name_strs; - for (int i=0; i < num_input; i++) { - jstring cur_str = (jstring)(env->GetObjectArrayElement(input_names, i)); - const char* cur_str_ptr = env->GetStringUTFChars(cur_str, 0); - std::string tmp_str = cur_str_ptr; - name_strs.push_back(tmp_str); - input_names_ptr[i] = (char*)name_strs[i].c_str(); - - env->ReleaseStringUTFChars(cur_str, cur_str_ptr); - env->DeleteLocalRef(cur_str); - } - - for (int i=0; iGetArrayLength(dt_input); - int df_input_num = env->GetArrayLength(df_input); - - if (dt_input_num != df_input_num) { - std::cerr << "[ERROR]: num of input_datatype not equal to num of input_dataformat!" 
<< std::endl; - exit(1); - } - - for (int i=0; iGetObjectArrayElement(dt_input, i)); - const char* tmp_str_dt_ptr = env->GetStringUTFChars(tmp_str_dt, 0); - std::string cur_tmp_str_dt = tmp_str_dt_ptr; - dt_inputs_ptr[i] = str2DATA_TYPE(cur_tmp_str_dt); - - jstring tmp_str_df = (jstring)(env->GetObjectArrayElement(df_input, i)); - const char* tmp_str_df_ptr = env->GetStringUTFChars(tmp_str_df, 0); - std::string cur_tmp_str_df = tmp_str_df_ptr; - df_inputs_ptr[i] = str2DATA_FORMAT(cur_tmp_str_df); - } - - ResizeModelInput(ih, num_input, datas_n, datas_c, datas_h, datas_w, input_names_ptr, dt_inputs_ptr, df_inputs_ptr); - - env->ReleaseIntArrayElements(n, curArray_n, 0); - free(datas_n); - env->ReleaseIntArrayElements(c, curArray_c, 0); - free(datas_c); - env->ReleaseIntArrayElements(h, curArray_h, 0); - free(datas_h); - env->ReleaseIntArrayElements(w, curArray_w, 0); - free(datas_w); - free(input_names_ptr); -} - -extern "C" JNIEXPORT jlong JNICALL Java_BoltModel_IResult_1malloc_1all - (JNIEnv *, jobject, jlong modelAddr) { - ModelHandle ih = (ModelHandle)modelAddr; - ResultHandle ir = AllocAllResultHandle(ih); - return (long)ir; -} - -extern "C" JNIEXPORT jlong JNICALL Java_BoltModel_IResult_1malloc_1part - (JNIEnv *env, jobject, jlong modelAddr, jint num_outputs, jobjectArray outputNames) { - ModelHandle ih = (ModelHandle)modelAddr; - char** output_names_ptr = (char**)malloc(sizeof(char*) * num_outputs); - std::vector name_strs; - for (int i=0; i < num_outputs; i++) { - jstring cur_str = (jstring)(env->GetObjectArrayElement(outputNames, i)); - const char* cur_str_ptr = env->GetStringUTFChars(cur_str, 0); - std::string tmp_str = cur_str_ptr; - name_strs.push_back(tmp_str); - output_names_ptr[i] = (char*)name_strs[i].c_str(); - - env->ReleaseStringUTFChars(cur_str, cur_str_ptr); - env->DeleteLocalRef(cur_str); - } - - for (int i=0; icnn; - - char** input_names_ptr = (char**)malloc(sizeof(char*) * num_input); - std::vector name_strs; - for (int i=0; i < num_input; i++) { - jstring cur_str = (jstring)(env->GetObjectArrayElement(input_names, i)); - const char* cur_str_ptr = env->GetStringUTFChars(cur_str, 0); - std::string tmp_str = cur_str_ptr; - name_strs.push_back(tmp_str); - input_names_ptr[i] = (char*)name_strs[i].c_str(); - env->ReleaseStringUTFChars(cur_str, cur_str_ptr); - env->DeleteLocalRef(cur_str); - } - - for (int i=0; iGetArrayLength(inputData); - HashMap> inMap = cnn->get_inputs(); - for (int i=0; i(env->GetObjectArrayElement(inputData, i)); - jfloat* datas = env->GetFloatArrayElements(curArray, JNI_FALSE); - std::string curTensorName = name_strs[i]; - std::shared_ptr cur_input_tensor = inMap[curTensorName]; - jint clos = env->GetArrayLength(curArray); - TensorDesc tensorDesc = cur_input_tensor->get_desc(); - mem_ptr[i] = cur_input_tensor->get_val(); - dataTypeConverterFromFloat(datas, mem_ptr[i], tensorDesc.dt, clos); - } - - RunModel(ih, ir, num_input, input_names_ptr, mem_ptr); - free(input_names_ptr); - free(mem_ptr); -} - -int calculateLength(int *array, int num) { - int length = 0; - for (int j = 0; j < num; j++) { - if (array[j] == 0) - break; - else { - if (length == 0) - length = array[j]; - else - length *= array[j]; - } - } - return length; -} - -extern "C" JNIEXPORT jobject JNICALL Java_BoltModel_getOutput - (JNIEnv *env, jobject, jlong ResultHandleAddr) { - jclass stucls = env->FindClass("BoltResult"); - - jmethodID constrocMID = env->GetMethodID(stucls, "", "([[F[[I[Ljava/lang/String;[Ljava/lang/String;)V"); - - ResultHandleInner* ir_inner = 
(ResultHandleInner*)ResultHandleAddr; - DataDesc* outputArrPtr = (*ir_inner).outputArr; - int num_outputs = (*ir_inner).num_outputs; - - jobjectArray output_values; - jclass floatArrCls = env->FindClass("[F"); - output_values = env->NewObjectArray(num_outputs, floatArrCls, nullptr); - jobjectArray output_dimension; - jclass intArrCls = env->FindClass("[I"); - output_dimension = env->NewObjectArray(num_outputs, intArrCls, nullptr); - - jobjectArray output_names_arr; - output_names_arr = (jobjectArray)env->NewObjectArray(num_outputs, env->FindClass("java/lang/String"), env->NewStringUTF("")); - - jobjectArray df_arr; - df_arr = (jobjectArray)env->NewObjectArray(num_outputs, env->FindClass("java/lang/String"), env->NewStringUTF("")); - - for (int i=0; iSetObjectArrayElement(output_names_arr, i, env->NewStringUTF(cur_output_name.c_str())); - DataType cur_data_type = outputArrPtr[i].dt; - DataFormat cur_data_format = outputArrPtr[i].df; - std::string cur_data_format_str = DataFormat2str(cur_data_format); - env->SetObjectArrayElement(df_arr, i, env->NewStringUTF(cur_data_format_str.c_str())); - - void* cur_dataPtr = outputArrPtr[i].dataPtr; - int tensorNumber = calculateLength((int*)outputArrPtr[i].dims, 4); - jfloat tmp_output_values[tensorNumber]; - jfloatArray floatArr = env->NewFloatArray(tensorNumber); - - jint tmp_output_dimensions[4]; - jintArray intArr = env->NewIntArray(4); - - for (int j = 0; j < 4; j++) { - tmp_output_dimensions[j] = (int)(outputArrPtr[i].dims[j]); - } - - dataTypeConverterToFloat(cur_dataPtr, cur_data_type, tmp_output_values, tensorNumber); - env->SetFloatArrayRegion(floatArr, 0, tensorNumber, tmp_output_values); - env->SetObjectArrayElement(output_values, i, floatArr); - env->DeleteLocalRef(floatArr); - - env->SetIntArrayRegion(intArr, 0, 4, tmp_output_dimensions); - env->SetObjectArrayElement(output_dimension, i, intArr); - env->DeleteLocalRef(intArr); - } - - jobject bolt_result_obj = env->NewObject(stucls, constrocMID, output_values, output_dimension, output_names_arr, df_arr); - return bolt_result_obj; -} - -extern "C" JNIEXPORT void JNICALL Java_BoltModel_IResult_1free - (JNIEnv *, jobject, jlong ResultHandleAddr) { - ResultHandle ir = (ResultHandle)ResultHandleAddr; - FreeResultHandle(ir); -} - -extern "C" JNIEXPORT void JNICALL Java_BoltModel_destroyModel - (JNIEnv *, jobject, jlong modelAddr) { - ModelHandle ih = (ModelHandle)modelAddr; - DestroyModel(ih); -} -#endif diff --git a/inference/src/CMakeLists.txt b/inference/src/CMakeLists.txt deleted file mode 100644 index 12d2930c..00000000 --- a/inference/src/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -file(GLOB_RECURSE srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) - -# shared library -ADD_LIBRARY(inference SHARED ${srcs}) -# static library -ADD_LIBRARY(inference_static STATIC ${srcs}) -SET_TARGET_PROPERTIES(inference_static PROPERTIES OUTPUT_NAME "inference") -SET_TARGET_PROPERTIES(inference PROPERTIES CLEAN_DIRECT_OUTPUT 1) -SET_TARGET_PROPERTIES(inference_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) - diff --git a/inference/src/bolt.cpp b/inference/src/bolt.cpp deleted file mode 100644 index 88020044..00000000 --- a/inference/src/bolt.cpp +++ /dev/null @@ -1,462 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include -#include "inference.hpp" -#include "tensor.hpp" -#include "data_loader.hpp" -#include "result_format.hpp" -#include "utils.hpp" -#include "tensor_desc.h" -#include "../exports/c/bolt.h" - -struct ModelHandleInfo { - void* cnn; - void* ms; - DEVICE_TYPE deviceType; - void* algoPath; -}; - -typedef struct { - U32 dims[4] = {0}; - char name[NAME_LEN] = {0}; - DataType dt; - DataFormat df; - void* dataPtr; -} DataDesc; - -typedef struct { - U32 num_outputs; - DataDesc* outputArr; - DEVICE_TYPE deviceType; -} ResultHandleInner; - -DataType dt_mapping_user2bolt(DATA_TYPE dt_user) { - DataType ret = DT_F32; - switch (dt_user) { - case FP_32: - ret = DT_F32; - break; -#ifdef __aarch64__ - case FP_16: - ret = DT_F16; - break; -#endif - case INT_32: - ret = DT_I32; - break; - case UINT_32: - ret = DT_U32; - break; - default: - std::cerr << "[ERROR] unsupported user data type in C API" << std::endl; - exit(1); - } - return ret; -} - -DATA_TYPE dt_mapping_bolt2user(DataType dt_bolt) { - DATA_TYPE ret = FP_32; - switch (dt_bolt) { - case DT_F32: - ret = FP_32; - break; -#ifdef __aarch64__ - case DT_F16: - ret = FP_16; - break; -#endif - case DT_I32: - ret = INT_32; - break; - case DT_U32: - ret = UINT_32; - break; - default: - std::cerr << "[ERROR] unsupported bolt data type in C API" << std::endl; - exit(1); - } - return ret; -} - -DataFormat df_mapping_user2bolt(DATA_FORMAT df_user) { - DataFormat ret = DF_NCHW; - switch (df_user) { - case NCHW: - ret = DF_NCHW; - break; - case NHWC: - ret = DF_NHWC; - break; - case NCHWC8: - ret = DF_NCHWC8; - break; - case MTK: - ret = DF_MTK; - break; - case NORMAL: - ret = DF_NORMAL; - break; - default: { - std::cerr << "[ERROR] unsupported user data format in C API" << std::endl; - exit(1); - } - } - return ret; -} - -DATA_FORMAT df_mapping_bolt2user(DataFormat df_bolt) { - DATA_FORMAT ret = NCHW; - switch (df_bolt) { - case DF_NCHW: - ret = NCHW; - break; - case DF_NHWC: - ret = NHWC; - break; - case DF_NCHWC8: - ret = NCHWC8; - break; - case DF_MTK: - ret = MTK; - break; - case DF_NORMAL: - ret = NORMAL; - break; - default: { - std::cerr << "[ERROR] unsupported bolt data format in C API" << std::endl; - exit(1); - } - } - return ret; -} - -CpuAffinityPolicy affinity_mapping(AFFINITY_TYPE affinity_user) { - CpuAffinityPolicy ret = CPU_AFFINITY_HIGH_PERFORMANCE; - if (affinity_user == HIGH_PERFORMANCE) { - ret = CPU_AFFINITY_HIGH_PERFORMANCE; - } else if (affinity_user == LOW_POWER) { - ret = 
CPU_AFFINITY_LOW_POWER; - } else { - std::cerr << "[ERROR] unsupported user CPU affinity setting in C API" << std::endl; - exit(1); - } - return ret; -} - -inline Arch arch_acquire(AFFINITY_TYPE affinity, DEVICE_TYPE device) -{ - Arch ret = ARM_V8; - switch(device) { - case GPU: - ret = MALI; - break; - case CPU: { - Arch* archs; - int* cpuids; - int cpuNum; - thread_affinity_init(&cpuNum, &archs, &cpuids); - CpuAffinityPolicy affinityPolicy = affinity_mapping(affinity); - ret = thread_affinity_set_by_policy(cpuNum, archs, cpuids, affinityPolicy, 0); - thread_affinity_destroy(&cpuNum, &archs, &cpuids); - break; - } - default: { - std::cerr << "[ERROR] unsupported device type in C API" << std::endl; - exit(1); - } - } - return ret; -} - -ModelHandle CreateModel(const char* modelPath, AFFINITY_TYPE affinity, DEVICE_TYPE device, const char* algoPath) { - ModelSpec* ms = new ModelSpec(); - Arch arch; - - arch = arch_acquire(affinity, device); - deserialize_model_from_file(modelPath, ms); - CNN* cnn = new CNN(arch, ms->dt, ms->model_name); - cnn->sort_operators_sequential(ms); - cnn->initialize_ops(ms); - - ModelHandleInfo* handle = new ModelHandleInfo(); - handle->cnn = (void*)cnn; - handle->ms = (void*)ms; - handle->deviceType = device; - handle->algoPath = (void*)algoPath; - return (ModelHandle)handle; -} - -HashMap getInputDataFormatFromUser(ModelHandle ih, - const int num_input, - const int* n, const int* c, const int* h, const int* w, - char** name, - const DATA_TYPE* dt_input, - const DATA_FORMAT* df_input) -{ - ModelHandleInfo* ihInfo = (ModelHandleInfo*)ih; - ModelSpec*ms = (ModelSpec*)ihInfo->ms; - U32 num = ms->num_inputs; - if(num != (U32)num_input) { - std::cerr << "[ERROR] model has " << num << " input, not " << num_input << std::endl; - exit(1); - } - - HashMap modelInputDims; - for(U32 i = 0; i < num; ++i) { - std::string inputName = name[i]; - bool findTensorName = false; - for (U32 j = 0; j < num; ++j) { - std::string modelName = ms->input_names[j]; - if (modelName == inputName) { - DataType dt = (dt_input == NULL) ? DT_F32 : dt_mapping_user2bolt(dt_input[i]); - DataFormat df = (df_input == NULL) ? DF_NCHW : df_mapping_user2bolt(df_input[i]); - switch (df) { - case DF_NORMAL: - modelInputDims[inputName] = tensor2df(dt, df, n[i], c[i]); - break; - case DF_MTK: - modelInputDims[inputName] = tensor3df(dt, df, n[i], c[i], h[i]); - break; - case DF_NCHW: - modelInputDims[inputName] = tensor4df(dt, df, n[i], c[i], h[i], w[i]); - break; - default: - std::cerr << "[ERROR] unsupported data format in " << __func__ << std::endl; - exit(1); - } - findTensorName = true; - break; - } - } - - if(!findTensorName) { - std::cerr << "[ERROR] input data " << inputName << " is not a valid model input("; - for (U32 j = 0; j < num; ++j) { - std::cerr << ms->input_names[j]; - if (j != num - 1) - std::cerr << ", " << std::endl; - } - std::cerr << ")" << std::endl; - exit(1); - } - } - return modelInputDims; -} - -void PrepareModel(ModelHandle ih, - const int num_input, - const int* n, const int* c, const int* h, const int* w, - char** name, - const DATA_TYPE* dt_input = NULL, - const DATA_FORMAT* df_input = NULL) -{ - ModelHandleInfo* ihInfo = (ModelHandleInfo*)ih; - CNN* cnn = (CNN*)ihInfo->cnn; - ModelSpec*ms = (ModelSpec*)ihInfo->ms; - const char* algoPath = (ihInfo->algoPath) ? 
(const char*)ihInfo->algoPath : ""; - - HashMap modelInputDims = getInputDataFormatFromUser(ih, - num_input, n, c, h, w, name, dt_input, df_input); - cnn->loadAlgorithmMapFromText(algoPath); - cnn->ready(modelInputDims); - cnn->mark_input_output(ms); -#ifdef _USE_MALI - if (ihInfo->deviceType == GPU) cnn->mali_prepare(); -#endif - cnn->saveAlgorithmMapToText(algoPath); - return; -} - -void ResizeModelInput(ModelHandle ih, - const int num_input, - const int* n, const int* c, const int* h, const int* w, - char** name, - const DATA_TYPE* dt_input = NULL, - const DATA_FORMAT* df_input = NULL) -{ - ModelHandleInfo* ihInfo = (ModelHandleInfo*)ih; - CNN* cnn = (CNN*)ihInfo->cnn; - - HashMap modelInputDims = getInputDataFormatFromUser(ih, - num_input, n, c, h, w, name, dt_input, df_input); - cnn->reready(modelInputDims); -} - -ResultHandle AllocAllResultHandle(ModelHandle ih) { - ModelHandleInfo* ihInfo = (ModelHandleInfo*)ih; - CNN* cnn = (CNN*)ihInfo->cnn; - DEVICE_TYPE device = ihInfo->deviceType; - - ResultHandleInner* model_result_ptr = (ResultHandleInner*)malloc(sizeof(ResultHandleInner)); - Vec modelOutputTensorNames = cnn->get_model_output_tensor_names(); - int model_num_outputs = modelOutputTensorNames.size(); - DataDesc* outputArrPtr = (DataDesc*)malloc(sizeof(DataDesc) * model_num_outputs); - for (int i = 0; i < model_num_outputs; ++i) { - std::string name = modelOutputTensorNames[i]; - U32 length = name.size(); - length = (length > NAME_LEN) ? NAME_LEN : length; - memcpy(outputArrPtr[i].name, name.c_str(), length); - if (length < NAME_LEN) - outputArrPtr[i].name[length] = '\0'; - } - model_result_ptr->num_outputs = model_num_outputs; - model_result_ptr->outputArr = outputArrPtr; - model_result_ptr->deviceType = device; - return (void*)model_result_ptr; -} - -ResultHandle AllocSpecificResultHandle(ModelHandle ih, const int num_outputs, - char** outputNames) -{ - ModelHandleInfo* ihInfo = (ModelHandleInfo*)ih; - DEVICE_TYPE device = ihInfo->deviceType; - - ResultHandleInner* model_result_ptr = (ResultHandleInner*)malloc(sizeof(ResultHandleInner)); - int model_num_outputs = num_outputs; - DataDesc* outputArrPtr = (DataDesc*)malloc(sizeof(DataDesc) * model_num_outputs); - for (int i = 0; i < num_outputs; i++) { - U32 length = strlen(outputNames[i]); - memcpy(outputArrPtr[i].name, outputNames[i], strlen(outputNames[i])); - if (length < NAME_LEN) - outputArrPtr[i].name[length] = '\0'; - } - model_result_ptr->num_outputs = model_num_outputs; - model_result_ptr->outputArr = outputArrPtr; - model_result_ptr->deviceType = device; - return (void*)model_result_ptr; -} - -void copyTensorDescToDataDesc(TensorDesc srcDesc, DataDesc *dstDesc) { - dstDesc->dt = srcDesc.dt; - dstDesc->df = srcDesc.df; - if (srcDesc.nDims > 4) { - std::cerr << "[ERROR] user interface only support 4 dimensions, not " << srcDesc.nDims << std::endl; - exit(1); - } - for (U32 i = 0; i < srcDesc.nDims; i++) - dstDesc->dims[i] = srcDesc.dims[srcDesc.nDims-1-i]; - for (int i = srcDesc.nDims; i < 4; i++) - dstDesc->dims[i] = 1; -} - -void RunModel(ModelHandle ih, ResultHandle ir, const int num_input, char** inputNames, void** mem) { - ModelHandleInfo* ihInfo = (ModelHandleInfo*)ih; - CNN* cnn = (CNN*)ihInfo->cnn; - DEVICE_TYPE device = ihInfo->deviceType; - ResultHandleInner* ir_inner = (ResultHandleInner*)ir; - - for (int index = 0; index < num_input; index++) { - std::string input_name(inputNames[index]); - cnn->copy_to_named_input(input_name, (U8*)(mem[index])); - } - cnn->run(); - - DataDesc* outputArrPtr = 
ir_inner->outputArr; - if(device == CPU) { - for (U32 curIndex = 0; curIndex < ir_inner->num_outputs; curIndex++) { - Tensor output_tensor = cnn->get_tensor_by_name(outputArrPtr[curIndex].name); - copyTensorDescToDataDesc(output_tensor.get_desc(), &(outputArrPtr[curIndex])); - outputArrPtr[curIndex].dataPtr = output_tensor.get_val(); - } - } -#ifdef _USE_MALI - else if(device == GPU) { - HashMap> outMap = cnn->get_outputs(); - if (ir_inner->num_outputs != outMap.size()) { - std::cerr << "[ERROR] GPU currently not support AllocSpecificResultHandle" << std::endl; - exit(1); - } - int curIndex = 0; - for(const auto &p : outMap) { - std::string output_name = p.first; - auto mem = p.second->get_memory(); - std::shared_ptr oclMem = mem->get_shared_ptr_caster(); - copyTensorDescToDataDesc(p.second->get_desc(), &(outputArrPtr[curIndex])); - outputArrPtr[curIndex].dataPtr = oclMem->desc.map_ptr; - curIndex++; - } - } -#endif -} - -int GetNumOutputsFromResultHandle(ResultHandle ir) { - ResultHandleInner* ir_inner = (ResultHandleInner*)ir; - return (*ir_inner).num_outputs; -} - -void GetPtrFromResultHandle(ResultHandle ir, - int num_outputs, - char** outputNames, - void** data, - int* n, int* c, int* h, int* w, - DATA_TYPE* dt_output, DATA_FORMAT* df_output) -{ - ResultHandleInner* ir_inner = (ResultHandleInner*)ir; - DataDesc* outputArrPtr = (*ir_inner).outputArr; - for (int i = 0; i < num_outputs; i++) { - n[i] = outputArrPtr[i].dims[0]; - c[i] = outputArrPtr[i].dims[1]; - h[i] = outputArrPtr[i].dims[2]; - w[i] = outputArrPtr[i].dims[3]; - outputNames[i] = const_cast(outputArrPtr[i].name); - DataType dt = outputArrPtr[i].dt; - dt_output[i] = dt_mapping_bolt2user(dt); - df_output[i] = df_mapping_bolt2user(outputArrPtr[i].df); - data[i] = outputArrPtr[i].dataPtr; - } -} - -void CopyOutputsFromResultHandle(ResultHandle ir, - int num_outputs, - char** outputNames, - void** data, - int* n, int* c, int* h, int* w, - DATA_TYPE* dt_output, DATA_FORMAT* df_output) -{ - ResultHandleInner* ir_inner = (ResultHandleInner*)ir; - DataDesc* outputArrPtr = (*ir_inner).outputArr; - for (int i = 0; i < num_outputs; i++) { - n[i] = outputArrPtr[i].dims[0]; - c[i] = outputArrPtr[i].dims[1]; - h[i] = outputArrPtr[i].dims[2]; - w[i] = outputArrPtr[i].dims[3]; - outputNames[i] = const_cast(outputArrPtr[i].name); - DataType dt = outputArrPtr[i].dt; - dt_output[i] = dt_mapping_bolt2user(dt); - df_output[i] = df_mapping_bolt2user(outputArrPtr[i].df); - U32 size = n[i] * c[i] * h[i] * w[i] * bytesOf(dt); - memcpy((void*)data[i], (void*)outputArrPtr[i].dataPtr, size); - } -} - -void FreeResultHandle(ResultHandle ir) { - ResultHandleInner* ir_inner = (ResultHandleInner*)ir; - DataDesc* outputArrPtr = (*ir_inner).outputArr; - free(outputArrPtr); - free(ir_inner); -} - -void DestroyModel(ModelHandle ih) { - ModelHandleInfo* ihInfo = (ModelHandleInfo*)ih; - CNN* cnn = (CNN*)ihInfo->cnn; - ModelSpec* ms = (ModelSpec*)ihInfo->ms; - CHECK_STATUS(mt_destroy_model(ms)); - delete ms; - delete cnn; - delete ihInfo; -} - diff --git a/inference/src/data_loader.cpp b/inference/src/data_loader.cpp deleted file mode 100644 index a5550a2d..00000000 --- a/inference/src/data_loader.cpp +++ /dev/null @@ -1,422 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#ifdef _BUILD_TEST - -#include -#include -#include -#include -#include -#include - -#include "image_processing.hpp" -#include "data_loader.hpp" - -template -void init_one(T* data, U32 len) { - for (U32 i = 0; i < len; i++) { - data[i] = 1; - } -} - -void get_files(std::string directoryName, Vec &files) { - if (directoryName.empty()) { - std::cerr << "[ERROR] null data" << std::endl; - exit(1); - } - DIR *directory = opendir(directoryName.c_str()); - if (NULL == directory) { - std::cerr << "[ERROR] permission denied to access " << directoryName << std::endl; - exit(1); - } - struct dirent *file; - while ((file = readdir(directory)) != NULL) { - if (strcmp(file->d_name, ".") == 0 || strcmp(file->d_name, "..") == 0) { - continue; - } - if (file->d_type == DT_DIR) { - //std::string fileName = directoryName + "/" + file->d_name; - //get_files(fileName, files); - continue; - } else { - files.push_back(directoryName + "/" + file->d_name); - } - } - closedir(directory); -} - -Vec load_jpeg(std::string dataPath, Vec imageDesc, ImageFormat ImageFormat, F32 scaleValue) { - FILE *file = fopen(dataPath.c_str(), "rb"); - CHECK_REQUIREMENT(NULL != file); - - struct jpeg_decompress_struct info; - struct jpeg_error_mgr err; - - info.err = jpeg_std_error(&err); - jpeg_create_decompress(&info); - - jpeg_stdio_src(&info, file); - jpeg_read_header(&info, TRUE); - - jpeg_start_decompress(&info); - - U32 width = info.output_width; - U32 height = info.output_height; - U32 numChannels = info.output_components; - U32 dataSize = numChannels * height * width; - - DEBUG_info("[INFO] " << dataPath << ": channels " << numChannels << ", out color space " << info.out_color_space); - CHECK_REQUIREMENT(2 == info.out_color_space); // Support RGB for now - - U8 *data = (U8*)malloc(dataSize); - JSAMPROW row_pointer[1]; - while (info.output_scanline < info.output_height) { - row_pointer[0] = data + info.output_scanline * width * numChannels; - int ret = jpeg_read_scanlines(&info, row_pointer, 1); - CHECK_REQUIREMENT(ret == 1); - } - - jpeg_finish_decompress(&info); - jpeg_destroy_decompress(&info); - fclose(file); - - TensorDesc rgbDesc = tensor4df(DT_U8, DF_RGB, 1, 3, height, width); - U8 *rgb = (U8*)malloc(tensorNumBytes(rgbDesc)); - - U8 *r = rgb; - U8 *g = r + height * width; - U8 *b = g + height * width; - - U8 *dataMov = data; - - for (U32 i = 0; i < height * width; i++) { - r[i] = dataMov[0]; - g[i] = dataMov[1]; - b[i] = dataMov[2]; - dataMov += numChannels; - } - - free(data); 
- - std::shared_ptr imageData = load_resize_image(rgbDesc, rgb, imageDesc[0], ImageFormat, scaleValue); - free(rgb); - - Vec result; - std::shared_ptr tensorData(new Tensor()); - tensorData->set_desc(imageDesc[0]); - tensorData->set_shared_ptr(imageData); - result.push_back(*tensorData.get()); - return result; -} - -Vec load_fake_data(Vec dataDesc) { - Vec result; - for (U32 index = 0; index < dataDesc.size(); index++) { - U8 *ptr = nullptr; - switch (dataDesc[index].dt) { - case DT_F32: { - F32* dataPtr = (F32 *)operator new(tensorNumBytes(dataDesc[index])); - init_one(dataPtr, tensorNumElements(dataDesc[index])); - ptr = (U8 *)dataPtr; - break; - } -#ifdef __aarch64__ - case DT_F16: { - F16* dataPtr = (F16 *)operator new(tensorNumBytes(dataDesc[index])); - init_one(dataPtr, tensorNumElements(dataDesc[index])); - ptr = (U8 *)dataPtr; - break; - } -#endif - case DT_U32: { - U32* dataPtr = (U32 *)operator new(tensorNumBytes(dataDesc[index])); - init_one(dataPtr, tensorNumElements(dataDesc[index])); - ptr = (U8 *)dataPtr; - break; - } - case DT_I32: { - I32* dataPtr = (I32 *)operator new(tensorNumBytes(dataDesc[index])); - init_one(dataPtr, tensorNumElements(dataDesc[index])); - ptr = (U8 *)dataPtr; - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - std::shared_ptr data(new Tensor()); - data->set_desc(dataDesc[index]); - data->set_shared_ptr(std::shared_ptr(ptr)); - result.push_back(*data.get()); - } - return result; -} - -Vec load_txt(std::string dataPath, Vec dataDesc) { - Vec result; - FILE *f = fopen(dataPath.c_str(), "r"); - CHECK_REQUIREMENT(f != nullptr); - for (U32 index = 0; index < dataDesc.size(); index++) { - U8 *ptr = nullptr; - switch (dataDesc[index].dt) { - case DT_F32: { - F32* dataPtr = (F32 *)operator new(tensorNumBytes(dataDesc[index])); - for (U32 i = 0; i < tensorNumElements(dataDesc[index]); i++) { - fscanf(f, "%f", dataPtr+i); - } - ptr = (U8 *)dataPtr; - break; - } -#ifdef __aarch64__ - case DT_F16: { - F16* dataPtr = (F16 *)operator new(tensorNumBytes(dataDesc[index])); - F32 value; - for (U32 i = 0; i < tensorNumElements(dataDesc[index]); i++) { - fscanf(f, "%f", &value); - dataPtr[i] = (F16)value; - } - ptr = (U8 *)dataPtr; - break; - } -#endif - case DT_U32: { - U32* dataPtr = (U32 *)operator new(tensorNumBytes(dataDesc[index])); - for (U32 i = 0; i < tensorNumElements(dataDesc[index]); i++) { - fscanf(f, "%u", dataPtr+i); - } - ptr = (U8 *)dataPtr; - break; - } - case DT_I32: { - I32* dataPtr = (I32 *)operator new(tensorNumBytes(dataDesc[index])); - for (U32 i = 0; i < tensorNumElements(dataDesc[index]); i++) { - fscanf(f, "%d", dataPtr+i); - } - ptr = (U8 *)dataPtr; - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - std::shared_ptr data(new Tensor()); - data->set_desc(dataDesc[index]); - data->set_shared_ptr(std::shared_ptr(ptr)); - result.push_back(*data.get()); - } - fclose(f); - return result; -} - -Vec load_seq(std::string dataPath, Vec dataDesc) { - Vec result; - FILE *f = fopen(dataPath.c_str(), "r"); - CHECK_REQUIREMENT(f != nullptr); - for (U32 index = 0; index < dataDesc.size(); index++) { - U32 sequenceLen = 0; - fscanf(f, "%u", &sequenceLen); - TensorDesc sequenceDesc = dataDesc[index]; - sequenceDesc.dims[0] = sequenceLen; - for (U32 j = 1; j < sequenceDesc.nDims; j++) - sequenceDesc.dims[j] = 1; - - U8 *ptr = nullptr; - switch (dataDesc[index].dt) { - case DT_F32: { - F32* dataPtr = (F32 *)operator new(tensorNumBytes(sequenceDesc)); - for (U32 i = 0; i < tensorNumElements(sequenceDesc); i++) { - fscanf(f, "%f", 
dataPtr+i); - } - ptr = (U8 *)dataPtr; - break; - } -#ifdef __aarch64__ - case DT_F16: { - F16* dataPtr = (F16 *)operator new(tensorNumBytes(sequenceDesc)); - F32 value; - for (U32 i = 0; i < tensorNumElements(sequenceDesc); i++) { - fscanf(f, "%f", &value); - dataPtr[i] = (F16)value; - } - ptr = (U8 *)dataPtr; - break; - } -#endif - case DT_U32: { - U32* dataPtr = (U32 *)operator new(tensorNumBytes(sequenceDesc)); - for (U32 i = 0; i < tensorNumElements(sequenceDesc); i++) { - fscanf(f, "%u", dataPtr+i); - } - ptr = (U8 *)dataPtr; - break; - } - case DT_I32: { - I32* dataPtr = (I32 *)operator new(tensorNumBytes(sequenceDesc)); - for (U32 i = 0; i < tensorNumElements(sequenceDesc); i++) { - fscanf(f, "%d", dataPtr+i); - } - ptr = (U8 *)dataPtr; - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - - std::shared_ptr data(new Tensor()); - data->set_desc(sequenceDesc); - data->set_shared_ptr(std::shared_ptr(ptr)); - result.push_back(*data.get()); - } - fclose(f); - return result; -} - -Vec load_bin(std::string dataPath, Vec sourceDataType, Vec dataDesc) { - Vec result; -#ifdef __aarch64__ - FILE *f = fopen(dataPath.c_str(), "r"); - CHECK_REQUIREMENT(f != nullptr); - for (U32 index = 0; index < dataDesc.size(); index++) { - U32 len = tensorNumElements(dataDesc[index]); - U8* ptr = (U8 *)operator new(len * bytesOf(sourceDataType[index])); - U32 readLength = fread(ptr, bytesOf(sourceDataType[index]), len, f); - CHECK_REQUIREMENT(len == readLength); - - U8 *ptrNew = nullptr; - if (sourceDataType[index] != dataDesc[index].dt) { - ptrNew = (U8 *)operator new(len * bytesOf(dataDesc[index].dt)); - if (sourceDataType[index] == DT_F32 && dataDesc[index].dt == DT_F16) { - F32* ptr1 = (F32*)ptr; - F16* ptr2 = (F16*)ptrNew; - for (U32 i = 0; i < len; i++) - ptr2[i] = (F16)ptr1[i]; - } - else { - CHECK_STATUS(NOT_SUPPORTED); - } - } - else { - ptrNew = ptr; - } - std::shared_ptr data(new Tensor()); - data->set_desc(dataDesc[index]); - data->set_shared_ptr(std::shared_ptr(ptrNew)); - result.push_back(*data.get()); - } - fclose(f); -#endif - return result; -} - -int string_end_with(std::string s, std::string sub){ - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - std::transform(sub.begin(), sub.end(), sub.begin(), ::tolower); - return s.rfind(sub)==(s.length() - sub.length()) ? 
1 : 0; -} - -Vec load_data(std::string directoryPath, - Vec dataDesc, - Vec>* datas) -{ - Vec dataPaths; - if (directoryPath == "") { - Vec data = load_fake_data(dataDesc); - (*datas).push_back(data); - dataPaths.push_back("fake data"); - return dataPaths; - } - - Vec paths; - get_files(directoryPath, paths); - Vec data; - for (U32 i = 0; i < paths.size(); i++) { - std::string dataPath = paths[i]; - if (string_end_with(dataPath, ".txt")) - data = load_txt(dataPath, dataDesc); - else if (string_end_with(dataPath, ".seq")) - data = load_seq(dataPath, dataDesc); - else { - std::cerr << "[ERROR] can not load data " << dataPath << std::endl; - exit(1); - } - (*datas).push_back(data); - dataPaths.push_back(dataPath); - } - return dataPaths; -} - -Vec load_image_with_scale(std::string directoryPath, - Vec dataDesc, - Vec>* datas, - ImageFormat ImageFormat, - F32 scaleValue) -{ - Vec dataPaths; - if (directoryPath == "") { - Vec data = load_fake_data(dataDesc); - (*datas).push_back(data); - dataPaths.push_back("fake data"); - return dataPaths; - } - - Vec paths; - get_files(directoryPath, paths); - Vec data; - for (U32 i = 0; i < paths.size(); i++) { - std::string dataPath = paths[i]; - if (string_end_with(dataPath, ".jpg") || string_end_with(dataPath, ".jpeg")) - data = load_jpeg(dataPath, dataDesc, ImageFormat, scaleValue); - else if (string_end_with(dataPath, ".txt")) - data = load_txt(dataPath, dataDesc); - else { - std::cerr << "[ERROR] can not load jpeg data " << dataPath << std::endl; - exit(1); - } - (*datas).push_back(data); - dataPaths.push_back(dataPath); - } - return dataPaths; -} - -Vec load_bin_with_type(std::string directoryPath, - Vec dataDesc, - Vec>* datas, - Vec sourceDataType) -{ - Vec dataPaths; - if (directoryPath == "") { - Vec data = load_fake_data(dataDesc); - (*datas).push_back(data); - dataPaths.push_back("fake data"); - return dataPaths; - } - - Vec paths; - get_files(directoryPath, paths); - Vec data; - for (U32 i = 0; i < paths.size(); i++) { - std::string dataPath = paths[i]; - if (string_end_with(dataPath, ".bin")) - data = load_bin(dataPath, sourceDataType, dataDesc); - else { - std::cerr << "[ERROR] can not load binary data " << dataPath << std::endl; - exit(1); - } - (*datas).push_back(data); - dataPaths.push_back(dataPath); - } - return dataPaths; -} -#endif diff --git a/inference/src/result_format.cpp b/inference/src/result_format.cpp deleted file mode 100644 index 8ed4363c..00000000 --- a/inference/src/result_format.cpp +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#include <algorithm>
-#include "result_format.hpp"
-
-Vec<U32> topK_index(Tensor data, U32 topK)
-{
-    TensorDesc desc = data.get_desc();
-    U32 len = tensorNumElements(desc);
-
-    Vec<U32> index(len);
-    for (U32 i = 0; i < index.size(); i++) {
-        index[i] = i;
-    }
-
-    U8* res = NULL;
-    auto mem = data.get_memory();
-#ifdef _USE_MALI
-    if (mem->get_mem_type() == OCLMem) {
-        std::shared_ptr<GCLMem> oclMem = mem->get_shared_ptr_caster();
-        if (!oclMem->desc.use_map) {
-            std::cout << "[ERROR] checking unmapped gcl memory values is not supported" << std::endl;
-            exit(1);
-        }
-        res = oclMem->desc.map_ptr;
-    } else {
-#endif
-        res = (U8*)mem->get_val();
-#ifdef _USE_MALI
-    }
-#endif
-
-    switch (desc.dt) {
-#ifdef __aarch64__
-        case DT_F16: {
-            F16* dataPtr = (F16 *)res;
-            sort(index.begin(), index.end(),
-                [&](const int& a, const int& b) {
-                    return (dataPtr[a] > dataPtr[b]);
-                }
-            );
-            break;
-        }
-#endif
-        case DT_F32: {
-            F32* dataPtr = (F32 *)res;
-            sort(index.begin(), index.end(),
-                [&](const int& a, const int& b) {
-                    return (dataPtr[a] > dataPtr[b]);
-                }
-            );
-            break;
-        }
-        default:
-            break;
-    }
-
-    Vec<U32>::const_iterator first = index.begin() + 0;
-    Vec<U32>::const_iterator last = index.begin() + topK;
-    Vec<U32> indexesTopK(first, last);
-
-    return indexesTopK;
-}
diff --git a/inference/src/utils.cpp b/inference/src/utils.cpp
deleted file mode 100644
index 6e62eb89..00000000
--- a/inference/src/utils.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
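The extract_class_function() helper in the utils.cpp below reduces a __PRETTY_FUNCTION__ string to "Class::method" by cutting the parameter list at '(' and the return type at the last preceding space. A standalone mirror, for illustration only:

    #include <iostream>
    #include <string>

    // Illustrative mirror of extract_class_function() below.
    std::string class_function(std::string s)
    {
        auto pos = s.find('(');
        if (pos != std::string::npos)
            s.erase(pos);         // drop "(...)" and everything after it
        pos = s.rfind(' ');
        if (pos != std::string::npos)
            s.erase(0, pos + 1);  // drop the leading return type
        return s;
    }

    int main()
    {
        std::cout << class_function("void Squeeze::run()") << std::endl;  // Squeeze::run
        return 0;
    }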
-
-
-#include <string>
-#include <map>
-#include <vector>
-#include <algorithm>
-#include <iostream>
-
-#include "utils.hpp"
-
-
-std::string extract_class_function(std::string&& pretty_function) {
-    auto pos = pretty_function.find('(');
-    if (pos != std::string::npos)
-        pretty_function.erase(pretty_function.begin()+pos, pretty_function.end());
-
-    pos = pretty_function.rfind(' ');
-    if (pos != std::string::npos)
-        pretty_function.erase(pretty_function.begin(), pretty_function.begin()+pos+1);
-
-    return std::move(pretty_function);
-}
-
-
-std::string extract_file_function(std::string&& pretty_function) {
-    auto pos = pretty_function.find('(');
-    if (pos != std::string::npos)
-        pretty_function.erase(pretty_function.begin()+pos, pretty_function.end());
-
-    pos = pretty_function.rfind('/');
-    if (pos != std::string::npos)
-        pretty_function.erase(pretty_function.begin(), pretty_function.begin()+pos+1);
-
-    return std::move(pretty_function);
-}
-
-
-std::map<std::string, double> time_tic;
-std::map<std::string, double> time_toc;
-std::map<std::string, double> time_statistics;
-
-
-void ut_time_init() {
-    time_tic.clear();
-    time_toc.clear();
-    time_statistics.clear();
-}
-
-
-void ut_time_tic(std::string key) {
-    double value = ut_time_ms();
-    time_tic[key] = value;
-}
-
-
-void ut_time_toc(std::string key) {
-    double value = ut_time_ms();
-    time_toc[key] = value;
-    std::map<std::string, double>::iterator iter = time_tic.find(key);
-    if (iter == time_tic.end())
-        std::cout << "[WARNING] mismatched UTIL_TIME_TIC/UTIL_TIME_TOC " << key << std::endl;
-    else {
-        iter = time_statistics.find(key);
-        DEBUG_info(key << ": " << time_toc[key] - time_tic[key]);
-        if (iter == time_statistics.end())
-            time_statistics[key] = time_toc[key] - time_tic[key];
-        else
-            time_statistics[key] += time_toc[key] - time_tic[key];
-    }
-}
-
-void ut_time_statistics() {
-    std::vector<std::pair<std::string, double>> vec(time_statistics.begin(), time_statistics.end());
-
-    sort(vec.begin(), vec.end(),
-        [&](const std::pair<std::string, double>& a, const std::pair<std::string, double>& b) {
-            return (a.second > b.second);
-        }
-    );
-    std::cout << "[TIME]" << std::endl;
-    std::cout << "function\ttime" << std::endl;
-    for (U32 i = 0; i < vec.size(); ++i)
-        std::cout << vec[i].first << " " << vec[i].second << " ms" << std::endl;
-}
diff --git a/install.sh b/install.sh
index a291d52f..147ff233 100644
--- a/install.sh
+++ b/install.sh
@@ -1,9 +1,11 @@
 #!/bin/bash
 
 script_name=$0
-compiler_arch="gnu"
+compiler_arch="arm_gnu"
 skip=false
 build_threads="8"
+llvm_gpu="ON"
+finetune="OFF"
 
 print_help() {
     cat <<EOF
-  -c, --compiler <gnu|llvm|himix100|ndkv7>  use to set compiler(default: gnu).
+  -c, --compiler <arm_gnu|arm_llvm|arm_himix100|arm_ndkv7|x86_gnu|x86_ndk|arm_ios>  use to set compiler(default: arm_gnu).
   -s, --skip skip dependency library install and option set(default: false).
   -t, --threads use parallel build(default: 8).
+  -g, --gpu use gpu(default: llvm(on), others(off)).
+  -f, --finetune use finetuning(default: off).
 EOF
     exit 1;
 }
 
-TEMP=`getopt -o c:hs:t: --long compiler:help,skip:threads: \
+TEMP=`getopt -o c:g:hs:t:f: --long compiler:gpu:help,skip:threads:finetune: \
      -n ${script_name} -- "$@"`
 if [ $? != 0 ] ; then echo "[ERROR] terminating..."
 if [ $? != 0 ] ; then echo "[ERROR] terminating..." >&2 ; exit 1 ; fi
 
 eval set -- "$TEMP"
@@ -37,6 +41,22 @@ while true ; do
             build_threads=$2
             echo "[INFO] '${build_threads}' threads parallel to build" ;
             shift 2 ;;
+        -g|--gpu)
+            llvm_gpu=${2^^}
+            if [ "${llvm_gpu}" != "OFF" -a "${llvm_gpu}" != "ON" ] ; then
+                echo "[ERROR] the gpu option should be <ON|OFF>"
+                exit 1
+            fi
+            echo "[INFO] gpu option is ${llvm_gpu}";
+            shift 2 ;;
+        -f|--finetune)
+            finetune=${2^^}
+            if [ "${finetune}" != "OFF" -a "${finetune}" != "ON" ] ; then
+                echo "[ERROR] the finetune option should be <ON|OFF>"
+                exit 1
+            fi
+            echo "[INFO] finetune option is ${finetune}";
+            shift 2 ;;
         -h|--help)
             print_help ;
             shift ;;
@@ -47,7 +67,7 @@ while true ; do
 done
 
 exeIsValid(){
-    if type $1 2>/dev/null;
+    if type $1 &> /dev/null;
     then
         return 1
     else
@@ -63,7 +83,6 @@ fi
 
 script_abs=$(readlink -f "$0")
 script_dir=$(dirname $script_abs)
-current_dir=${PWD}
 
 export BOLT_ROOT=${script_dir}
 
@@ -74,7 +93,7 @@ mkdir build_${compiler_arch} install_${compiler_arch}
 options=""
 if [ ${skip} != true ] ; then
-    if [ ! -f "./third_party/${compiler_arch}.sh" ]; then
+    if [[ ! -f "./third_party/${compiler_arch}.sh" || ! -d "./third_party/${compiler_arch}" ]]; then
         ./third_party/install.sh -c ${compiler_arch} -t ${build_threads} || exit 1
     fi
     echo "[INFO] use ./third_party/${compiler_arch}.sh to set environment variable..."
@@ -82,21 +101,22 @@ if [ ${skip} != true ] ; then
     options="-DUSE_CROSS_COMPILE=ON \
             -DBUILD_TEST=ON "
-    if [ "${compiler_arch}" == "gnu" ] ; then
+    if [ "${compiler_arch}" == "arm_gnu" ] ; then
         exeIsValid aarch64-linux-gnu-g++
         if [ $? == 0 ] ; then
-            echo "[ERROR] please install GNU gcc ARM compiler and set shell environment PATH to find it"
+            echo "[ERROR] please install ARM GNU gcc compiler and set shell environment PATH to find it"
             exit 1
         fi
         options="${options} \
            -DUSE_GNU_GCC=ON \
            -DUSE_LLVM_CLANG=OFF \
            -DUSE_MALI=OFF \
+           -DUSE_NEON=ON \
            -DCMAKE_C_COMPILER=`which aarch64-linux-gnu-gcc` \
            -DCMAKE_CXX_COMPILER=`which aarch64-linux-gnu-g++` \
            -DCMAKE_STRIP=`which aarch64-linux-gnu-strip` "
     fi
-    if [ "${compiler_arch}" == "llvm" ] ; then
+    if [ "${compiler_arch}" == "arm_llvm" ] ; then
         exeIsValid aarch64-linux-android21-clang++
         if [ $? == 0 ] ; then
             echo "[ERROR] please install android ndk aarch64-linux-android21-clang++ compiler and set shell environment PATH to find it"
@@ -105,13 +125,15 @@ if [ ${skip} != true ] ; then
         options="${options} \
            -DUSE_GNU_GCC=OFF \
            -DUSE_LLVM_CLANG=ON \
-           -DUSE_MALI=ON \
-           -DUSE_DYNAMIC_LIBRARY=ON \
+           -DUSE_MALI=${llvm_gpu} \
+           -DUSE_NEON=ON \
+           -DUSE_DYNAMIC_LIBRARY=OFF \
+           -DUSE_TRAINING=${finetune} \
            -DCMAKE_C_COMPILER=`which aarch64-linux-android21-clang` \
            -DCMAKE_CXX_COMPILER=`which aarch64-linux-android21-clang++` \
            -DCMAKE_STRIP=`which aarch64-linux-android-strip` "
     fi
-    if [ "${compiler_arch}" == "himix100" ] ; then
+    if [ "${compiler_arch}" == "arm_himix100" ] ; then
         exeIsValid arm-himix100-linux-g++
         if [ $? == 0 ] ; then
             echo "[ERROR] please install Himix100 GNU gcc ARM compiler and set shell environment PATH to find it"
@@ -121,6 +143,7 @@ if [ ${skip} != true ] ; then
            -DUSE_GNU_GCC=ON \
            -DUSE_LLVM_CLANG=OFF \
            -DUSE_MALI=OFF \
+           -DUSE_NEON=ON \
            -DUSE_ARMV8=OFF \
            -DUSE_ARMV7=ON \
            -DUSE_FP32=ON \
@@ -130,16 +153,17 @@ if [ ${skip} != true ] ; then
            -DCMAKE_CXX_COMPILER=`which arm-himix100-linux-g++` \
            -DCMAKE_STRIP=`which arm-himix100-linux-strip` "
     fi
-    if [ "${compiler_arch}" == "ndkv7" ] ; then
+    if [ "${compiler_arch}" == "arm_ndkv7" ] ; then
         exeIsValid armv7a-linux-androideabi19-clang++
         if [ $? == 0 ] ; then
-            echo "[ERROR] please install Himix100 ndk armv7a-linux-androideabi19-clang++ compiler and set shell environment PATH to find it"
+            echo "[ERROR] please install android ndk armv7a-linux-androideabi19-clang++ compiler and set shell environment PATH to find it"
             exit 1
         fi
         options="${options} \
            -DUSE_GNU_GCC=OFF \
            -DUSE_LLVM_CLANG=ON \
            -DUSE_MALI=OFF \
+           -DUSE_NEON=ON \
            -DUSE_DYNAMIC_LIBRARY=ON \
            -DUSE_ARMV8=OFF \
            -DUSE_ARMV7=ON \
@@ -150,6 +174,69 @@ if [ ${skip} != true ] ; then
            -DCMAKE_CXX_COMPILER=`which armv7a-linux-androideabi19-clang++` \
            -DCMAKE_STRIP=`which arm-linux-androideabi-strip` "
     fi
+    if [ "${compiler_arch}" == "x86_gnu" ] ; then
+        export JNI_ROOT=/usr/lib/jvm/java-8-openjdk-amd64
+        exeIsValid g++
+        if [ $? == 0 ] ; then
+            echo "[ERROR] please install X86 GNU compiler and set shell environment PATH to find it"
+            exit 1
+        fi
+        options="${options} \
+           -DUSE_GNU_GCC=ON \
+           -DUSE_LLVM_CLANG=OFF \
+           -DUSE_MALI=OFF \
+           -DUSE_NEON=OFF \
+           -DUSE_X86=ON \
+           -DUSE_DYNAMIC_LIBRARY=OFF \
+           -DUSE_ARMV8=OFF \
+           -DUSE_ARMV7=OFF \
+           -DUSE_FP32=ON \
+           -DUSE_FP16=OFF \
+           -DUSE_INT8=OFF \
+           -DCMAKE_C_COMPILER=`which gcc` \
+           -DCMAKE_CXX_COMPILER=`which g++` \
+           -DCMAKE_STRIP=`which strip` "
+    fi
+    if [ "${compiler_arch}" == "x86_ndk" ] ; then
+        exeIsValid x86_64-linux-android21-clang++
+        if [ $? == 0 ] ; then
+            echo "[ERROR] please install android ndk x86_64-linux-android21-clang++ compiler and set shell environment PATH to find it"
+            exit 1
+        fi
+        options="${options} \
+           -DUSE_GNU_GCC=OFF \
+           -DUSE_LLVM_CLANG=ON \
+           -DUSE_MALI=OFF \
+           -DUSE_NEON=OFF \
+           -DUSE_X86=ON \
+           -DUSE_DYNAMIC_LIBRARY=ON \
+           -DUSE_ARMV8=OFF \
+           -DUSE_ARMV7=OFF \
+           -DUSE_FP32=ON \
+           -DUSE_FP16=OFF \
+           -DUSE_INT8=OFF \
+           -DCMAKE_C_COMPILER=`which x86_64-linux-android21-clang` \
+           -DCMAKE_CXX_COMPILER=`which x86_64-linux-android21-clang++` \
+           -DCMAKE_STRIP=`which x86_64-linux-android-strip` "
+    fi
+    if [ "${compiler_arch}" == "arm_ios" ] ; then
+        exeIsValid arm-apple-darwin11-clang++
+        if [ $? == 0 ] ; then
+            echo "[ERROR] please install ios arm-apple-darwin11-clang++ compiler and set shell environment PATH to find it"
+            exit 1
+        fi
+        options="${options} \
+           -DUSE_IOS_CLANG=ON \
+           -DUSE_NEON=ON \
+           -DBUILD_TEST=OFF \
+           -DUSE_ONNX=OFF \
+           -DUSE_TFLITE=OFF \
+           -DUSE_LIBRARY_TUNING=OFF \
+           -DUSE_DYNAMIC_LIBRARY=ON \
+           -DCMAKE_C_COMPILER=`which arm-apple-darwin11-clang` \
+           -DCMAKE_CXX_COMPILER=`which arm-apple-darwin11-clang++` \
+           -DCMAKE_STRIP=`which arm-apple-darwin11-strip` "
+    fi
 fi
 
 cd ${BOLT_ROOT}
@@ -157,7 +244,11 @@ cd build_${compiler_arch}
 cmake .. -DCMAKE_INSTALL_PREFIX=${BOLT_ROOT}/install_${compiler_arch} ${options}
 make -j${build_threads} || exit 1
 make install -j${build_threads} || exit 1
-if [ "${compiler_arch}" == "llvm" ] ; then
+if [ "${compiler_arch}" == "arm_llvm" ] ; then
     make test ARGS="-V"
 fi
 cd ..
+
+if [ "${compiler_arch}" == "arm_ios" ] ; then
+    bash ./kit/iOS/setup_lib_iOS.sh
+fi
diff --git a/kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/project.pbxproj b/kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/project.pbxproj
new file mode 100644
index 00000000..416ad346
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/project.pbxproj
@@ -0,0 +1,878 @@
+// !$*UTF8*$!
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 50; + objects = { + +/* Begin PBXBuildFile section */ + BCF5005B254806B800DE7797 /* imagenet_classes.txt in Resources */ = {isa = PBXBuildFile; fileRef = BCF5FEE8254806B700DE7797 /* imagenet_classes.txt */; }; + BCF5005C254806B800DE7797 /* ghostnet_f32.bolt in Resources */ = {isa = PBXBuildFile; fileRef = BCF5FEE9254806B700DE7797 /* ghostnet_f32.bolt */; }; + BCF5006D254806B800DE7797 /* ghostnet_f16.bolt in Resources */ = {isa = PBXBuildFile; fileRef = BCF50058254806B800DE7797 /* ghostnet_f16.bolt */; }; + BCF5006E254806B800DE7797 /* libflow.a in Frameworks */ = {isa = PBXBuildFile; fileRef = BCF50059254806B800DE7797 /* libflow.a */; }; + BCF5006F254806B800DE7797 /* libbolt.a in Frameworks */ = {isa = PBXBuildFile; fileRef = BCF5005A254806B800DE7797 /* libbolt.a */; }; + BCF50073254807B500DE7797 /* CoreMedia.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = BCF50072254807B400DE7797 /* CoreMedia.framework */; }; + BCF5015C254958DA00DE7797 /* libprotobuf.a in Frameworks */ = {isa = PBXBuildFile; fileRef = BCF5015B254958DA00DE7797 /* libprotobuf.a */; }; + BCF501D0254A943E00DE7797 /* image_classification.prototxt in Resources */ = {isa = PBXBuildFile; fileRef = BCF501CF254A943E00DE7797 /* image_classification.prototxt */; }; + BCF5FEB12548061D00DE7797 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = BCF5FEB02548061D00DE7797 /* AppDelegate.m */; }; + BCF5FEB42548061D00DE7797 /* SceneDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = BCF5FEB32548061D00DE7797 /* SceneDelegate.m */; }; + BCF5FEB72548061D00DE7797 /* ViewController.mm in Sources */ = {isa = PBXBuildFile; fileRef = BCF5FEB62548061D00DE7797 /* ViewController.mm */; }; + BCF5FEBA2548061D00DE7797 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = BCF5FEB82548061D00DE7797 /* Main.storyboard */; }; + BCF5FEBC2548062600DE7797 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = BCF5FEBB2548062600DE7797 /* Assets.xcassets */; }; + BCF5FEBF2548062600DE7797 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = BCF5FEBD2548062600DE7797 /* LaunchScreen.storyboard */; }; + BCF5FEC22548062600DE7797 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = BCF5FEC12548062600DE7797 /* main.m */; }; + BCF5FECC2548062700DE7797 /* ImageClassificationDemoTests.m in Sources */ = {isa = PBXBuildFile; fileRef = BCF5FECB2548062700DE7797 /* ImageClassificationDemoTests.m */; }; + BCF5FED72548062700DE7797 /* ImageClassificationDemoUITests.m in Sources */ = {isa = PBXBuildFile; fileRef = BCF5FED62548062700DE7797 /* ImageClassificationDemoUITests.m */; }; + BCF5FEE62548064F00DE7797 /* AVFoundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = BCF5FEE52548064F00DE7797 /* AVFoundation.framework */; }; +/* End PBXBuildFile section */ + +/* Begin PBXContainerItemProxy section */ + BCF5FEC82548062700DE7797 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = BCF5FEA42548061D00DE7797 /* Project object */; + proxyType = 1; + remoteGlobalIDString = BCF5FEAB2548061D00DE7797; + remoteInfo = ImageClassificationDemo; + }; + BCF5FED32548062700DE7797 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = BCF5FEA42548061D00DE7797 /* Project object */; + proxyType = 1; + remoteGlobalIDString = BCF5FEAB2548061D00DE7797; + remoteInfo = ImageClassificationDemo; + }; +/* End PBXContainerItemProxy section */ + +/* Begin PBXFileReference section */ + 
BCF50058254806B800DE7797 /* ghostnet_f16.bolt */ = {isa = PBXFileReference; lastKnownFileType = file; path = ghostnet_f16.bolt; sourceTree = ""; }; + BCF50059254806B800DE7797 /* libflow.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; path = libflow.a; sourceTree = ""; }; + BCF5005A254806B800DE7797 /* libbolt.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; path = libbolt.a; sourceTree = ""; }; + BCF50072254807B400DE7797 /* CoreMedia.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreMedia.framework; path = System/Library/Frameworks/CoreMedia.framework; sourceTree = SDKROOT; }; + BCF500B82549525700DE7797 /* cnn.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cnn.h; sourceTree = ""; }; + BCF500B92549525700DE7797 /* memory_tracker.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = memory_tracker.hpp; sourceTree = ""; }; + BCF500BA2549525700DE7797 /* model.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = model.hpp; sourceTree = ""; }; + BCF500BB2549525700DE7797 /* operator.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = operator.hpp; sourceTree = ""; }; + BCF501452549525700DE7797 /* kit_flags.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kit_flags.h; sourceTree = ""; }; + BCF501472549525700DE7797 /* node.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = node.h; sourceTree = ""; }; + BCF501482549525700DE7797 /* flow.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = flow.h; sourceTree = ""; }; + BCF501492549525700DE7797 /* flow_function_factory.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = flow_function_factory.h; sourceTree = ""; }; + BCF5014A2549525700DE7797 /* flow.pb.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = flow.pb.h; sourceTree = ""; }; + BCF5015B254958DA00DE7797 /* libprotobuf.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; path = libprotobuf.a; sourceTree = ""; }; + BCF50163254A927800DE7797 /* scoped_ptr.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = scoped_ptr.h; sourceTree = ""; }; + BCF50164254A927800DE7797 /* stl_util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = stl_util.h; sourceTree = ""; }; + BCF50165254A927800DE7797 /* atomicops_internals_solaris.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = atomicops_internals_solaris.h; sourceTree = ""; }; + BCF50166254A927800DE7797 /* port.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = port.h; sourceTree = ""; }; + BCF50167254A927800DE7797 /* atomic_sequence_num.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = atomic_sequence_num.h; sourceTree = ""; }; + BCF50168254A927800DE7797 /* atomicops_internals_arm_gcc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = atomicops_internals_arm_gcc.h; sourceTree = ""; }; + BCF50169254A927800DE7797 /* template_util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = template_util.h; sourceTree = ""; }; 
+ BCF5016A254A927800DE7797 /* atomicops_internals_macosx.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = atomicops_internals_macosx.h; sourceTree = ""; }; + BCF5016B254A927800DE7797 /* atomicops_internals_ppc_gcc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = atomicops_internals_ppc_gcc.h; sourceTree = ""; }; + BCF5016C254A927800DE7797 /* atomicops_internals_mips_gcc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = atomicops_internals_mips_gcc.h; sourceTree = ""; }; + BCF5016D254A927800DE7797 /* casts.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = casts.h; sourceTree = ""; }; + BCF5016E254A927800DE7797 /* logging.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = logging.h; sourceTree = ""; }; + BCF5016F254A927800DE7797 /* atomicops.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = atomicops.h; sourceTree = ""; }; + BCF50170254A927800DE7797 /* atomicops_internals_x86_gcc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = atomicops_internals_x86_gcc.h; sourceTree = ""; }; + BCF50171254A927800DE7797 /* atomicops_internals_power.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = atomicops_internals_power.h; sourceTree = ""; }; + BCF50172254A927800DE7797 /* atomicops_internals_x86_msvc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = atomicops_internals_x86_msvc.h; sourceTree = ""; }; + BCF50173254A927800DE7797 /* atomicops_internals_tsan.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = atomicops_internals_tsan.h; sourceTree = ""; }; + BCF50174254A927800DE7797 /* atomicops_internals_generic_gcc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = atomicops_internals_generic_gcc.h; sourceTree = ""; }; + BCF50175254A927800DE7797 /* atomicops_internals_arm_qnx.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = atomicops_internals_arm_qnx.h; sourceTree = ""; }; + BCF50176254A927800DE7797 /* common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = common.h; sourceTree = ""; }; + BCF50177254A927800DE7797 /* shared_ptr.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = shared_ptr.h; sourceTree = ""; }; + BCF50178254A927800DE7797 /* macros.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = macros.h; sourceTree = ""; }; + BCF50179254A927800DE7797 /* atomicops_internals_arm64_gcc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = atomicops_internals_arm64_gcc.h; sourceTree = ""; }; + BCF5017A254A927800DE7797 /* platform_macros.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = platform_macros.h; sourceTree = ""; }; + BCF5017B254A927800DE7797 /* once.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = once.h; sourceTree = ""; }; + BCF5017C254A927800DE7797 /* fastmem.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fastmem.h; sourceTree = ""; }; + BCF5017D254A927800DE7797 /* atomicops_internals_pnacl.h */ 
= {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = atomicops_internals_pnacl.h; sourceTree = ""; }; + BCF5017E254A927800DE7797 /* mutex.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mutex.h; sourceTree = ""; }; + BCF5017F254A927800DE7797 /* type_traits.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = type_traits.h; sourceTree = ""; }; + BCF50180254A927800DE7797 /* callback.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = callback.h; sourceTree = ""; }; + BCF50181254A927800DE7797 /* atomicops_internals_atomicword_compat.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = atomicops_internals_atomicword_compat.h; sourceTree = ""; }; + BCF50183254A927800DE7797 /* repeated_field.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = repeated_field.h; sourceTree = ""; }; + BCF50186254A927800DE7797 /* arenastring.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = arenastring.h; sourceTree = ""; }; + BCF50188254A927800DE7797 /* descriptor.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = descriptor.h; sourceTree = ""; }; + BCF5018A254A927800DE7797 /* zero_copy_stream_impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = zero_copy_stream_impl.h; sourceTree = ""; }; + BCF5018B254A927800DE7797 /* zero_copy_stream_impl_lite.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = zero_copy_stream_impl_lite.h; sourceTree = ""; }; + BCF5018C254A927800DE7797 /* strtod.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = strtod.h; sourceTree = ""; }; + BCF5018D254A927800DE7797 /* zero_copy_stream.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = zero_copy_stream.h; sourceTree = ""; }; + BCF5018E254A927800DE7797 /* coded_stream.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = coded_stream.h; sourceTree = ""; }; + BCF5018F254A927800DE7797 /* message.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = message.h; sourceTree = ""; }; + BCF50191254A927800DE7797 /* text_format.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = text_format.h; sourceTree = ""; }; + BCF50192254A927800DE7797 /* has_bits.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = has_bits.h; sourceTree = ""; }; + BCF50194254A927800DE7797 /* metadata.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = metadata.h; sourceTree = ""; }; + BCF50195254A927800DE7797 /* extension_set.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = extension_set.h; sourceTree = ""; }; + BCF50199254A927800DE7797 /* unknown_field_set.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = unknown_field_set.h; sourceTree = ""; }; + BCF5019A254A927800DE7797 /* message_lite.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = message_lite.h; sourceTree = ""; }; + BCF5019B254A927800DE7797 /* generated_message_util.h */ = {isa = PBXFileReference; fileEncoding = 
4; lastKnownFileType = sourcecode.c.h; path = generated_message_util.h; sourceTree = ""; }; + BCF5019E254A927800DE7797 /* arena.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = arena.h; sourceTree = ""; }; + BCF501B9254A937100DE7797 /* parse_command.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parse_command.h; sourceTree = ""; }; + BCF501BA254A937200DE7797 /* task.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = task.h; sourceTree = ""; }; + BCF501BB254A937200DE7797 /* sys.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sys.h; sourceTree = ""; }; + BCF501BC254A937200DE7797 /* thread_affinity.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = thread_affinity.h; sourceTree = ""; }; + BCF501BD254A937200DE7797 /* x86_avx2_expand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = x86_avx2_expand.h; sourceTree = ""; }; + BCF501BE254A937200DE7797 /* arm_neon_expand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = arm_neon_expand.h; sourceTree = ""; }; + BCF501BF254A937200DE7797 /* model_print.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = model_print.h; sourceTree = ""; }; + BCF501C0254A937200DE7797 /* op_type.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = op_type.h; sourceTree = ""; }; + BCF501C1254A937200DE7797 /* graph.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = graph.h; sourceTree = ""; }; + BCF501C2254A937200DE7797 /* model_serialize_deserialize.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = model_serialize_deserialize.hpp; sourceTree = ""; }; + BCF501C3254A937200DE7797 /* types.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = types.h; sourceTree = ""; }; + BCF501C4254A937200DE7797 /* error.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = error.h; sourceTree = ""; }; + BCF501C5254A937200DE7797 /* algorithm_map.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = algorithm_map.h; sourceTree = ""; }; + BCF501C6254A937200DE7797 /* ut_util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ut_util.h; sourceTree = ""; }; + BCF501C7254A937200DE7797 /* profiling.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = profiling.h; sourceTree = ""; }; + BCF501C8254A937200DE7797 /* schedule.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = schedule.h; sourceTree = ""; }; + BCF501C9254A937200DE7797 /* tensor_desc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tensor_desc.h; sourceTree = ""; }; + BCF501CA254A93D800DE7797 /* memory_ocl.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = memory_ocl.hpp; sourceTree = ""; }; + BCF501CB254A93D800DE7797 /* memory_cpu.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = memory_cpu.hpp; sourceTree = ""; }; + BCF501CC254A93D800DE7797 /* memory.hpp */ = {isa = PBXFileReference; fileEncoding = 4; 
lastKnownFileType = sourcecode.cpp.h; path = memory.hpp; sourceTree = ""; }; + BCF501CD254A93D800DE7797 /* tensor.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = tensor.hpp; sourceTree = ""; }; + BCF501CE254A93D800DE7797 /* point_cast.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = point_cast.hpp; sourceTree = ""; }; + BCF501CF254A943E00DE7797 /* image_classification.prototxt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = image_classification.prototxt; sourceTree = ""; }; + BCF5FEAC2548061D00DE7797 /* ImageClassificationDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = ImageClassificationDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; + BCF5FEAF2548061D00DE7797 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = ""; }; + BCF5FEB02548061D00DE7797 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = ""; }; + BCF5FEB22548061D00DE7797 /* SceneDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = SceneDelegate.h; sourceTree = ""; }; + BCF5FEB32548061D00DE7797 /* SceneDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = SceneDelegate.m; sourceTree = ""; }; + BCF5FEB52548061D00DE7797 /* ViewController.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ViewController.h; sourceTree = ""; }; + BCF5FEB62548061D00DE7797 /* ViewController.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = ViewController.mm; sourceTree = ""; }; + BCF5FEB92548061D00DE7797 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/Main.storyboard; sourceTree = ""; }; + BCF5FEBB2548062600DE7797 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + BCF5FEBE2548062600DE7797 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = ""; }; + BCF5FEC02548062600DE7797 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + BCF5FEC12548062600DE7797 /* main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = main.m; sourceTree = ""; }; + BCF5FEC72548062700DE7797 /* ImageClassificationDemoTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = ImageClassificationDemoTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; + BCF5FECB2548062700DE7797 /* ImageClassificationDemoTests.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = ImageClassificationDemoTests.m; sourceTree = ""; }; + BCF5FECD2548062700DE7797 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + BCF5FED22548062700DE7797 /* ImageClassificationDemoUITests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = ImageClassificationDemoUITests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; + BCF5FED62548062700DE7797 /* ImageClassificationDemoUITests.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = ImageClassificationDemoUITests.m; 
sourceTree = ""; }; + BCF5FED82548062700DE7797 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + BCF5FEE52548064F00DE7797 /* AVFoundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AVFoundation.framework; path = System/Library/Frameworks/AVFoundation.framework; sourceTree = SDKROOT; }; + BCF5FEE8254806B700DE7797 /* imagenet_classes.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = imagenet_classes.txt; sourceTree = ""; }; + BCF5FEE9254806B700DE7797 /* ghostnet_f32.bolt */ = {isa = PBXFileReference; lastKnownFileType = file; path = ghostnet_f32.bolt; sourceTree = ""; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + BCF5FEA92548061D00DE7797 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + BCF50073254807B500DE7797 /* CoreMedia.framework in Frameworks */, + BCF5FEE62548064F00DE7797 /* AVFoundation.framework in Frameworks */, + BCF5006E254806B800DE7797 /* libflow.a in Frameworks */, + BCF5006F254806B800DE7797 /* libbolt.a in Frameworks */, + BCF5015C254958DA00DE7797 /* libprotobuf.a in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + BCF5FEC42548062700DE7797 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; + BCF5FECF2548062700DE7797 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + BCF5009C2549525700DE7797 /* headers */ = { + isa = PBXGroup; + children = ( + BCF50161254A927800DE7797 /* protobuf */, + BCF5009D2549525700DE7797 /* memory */, + BCF500A42549525700DE7797 /* uni */, + BCF500B72549525700DE7797 /* engine */, + BCF501452549525700DE7797 /* kit_flags.h */, + BCF501462549525700DE7797 /* flow */, + ); + path = headers; + sourceTree = ""; + }; + BCF5009D2549525700DE7797 /* memory */ = { + isa = PBXGroup; + children = ( + BCF501CB254A93D800DE7797 /* memory_cpu.hpp */, + BCF501CA254A93D800DE7797 /* memory_ocl.hpp */, + BCF501CC254A93D800DE7797 /* memory.hpp */, + BCF501CE254A93D800DE7797 /* point_cast.hpp */, + BCF501CD254A93D800DE7797 /* tensor.hpp */, + ); + path = memory; + sourceTree = ""; + }; + BCF500A42549525700DE7797 /* uni */ = { + isa = PBXGroup; + children = ( + BCF501C5254A937200DE7797 /* algorithm_map.h */, + BCF501BE254A937200DE7797 /* arm_neon_expand.h */, + BCF501C4254A937200DE7797 /* error.h */, + BCF501C1254A937200DE7797 /* graph.h */, + BCF501BF254A937200DE7797 /* model_print.h */, + BCF501C2254A937200DE7797 /* model_serialize_deserialize.hpp */, + BCF501C0254A937200DE7797 /* op_type.h */, + BCF501B9254A937100DE7797 /* parse_command.h */, + BCF501C7254A937200DE7797 /* profiling.h */, + BCF501C8254A937200DE7797 /* schedule.h */, + BCF501BB254A937200DE7797 /* sys.h */, + BCF501BA254A937200DE7797 /* task.h */, + BCF501C9254A937200DE7797 /* tensor_desc.h */, + BCF501BC254A937200DE7797 /* thread_affinity.h */, + BCF501C3254A937200DE7797 /* types.h */, + BCF501C6254A937200DE7797 /* ut_util.h */, + BCF501BD254A937200DE7797 /* x86_avx2_expand.h */, + ); + path = uni; + sourceTree = ""; + }; + BCF500B72549525700DE7797 /* engine */ = { + isa = PBXGroup; + children = ( + BCF500B82549525700DE7797 /* cnn.h */, + BCF500B92549525700DE7797 /* memory_tracker.hpp */, 
+ BCF500BA2549525700DE7797 /* model.hpp */, + BCF500BB2549525700DE7797 /* operator.hpp */, + ); + path = engine; + sourceTree = ""; + }; + BCF501462549525700DE7797 /* flow */ = { + isa = PBXGroup; + children = ( + BCF501472549525700DE7797 /* node.h */, + BCF501482549525700DE7797 /* flow.h */, + BCF501492549525700DE7797 /* flow_function_factory.h */, + BCF5014A2549525700DE7797 /* flow.pb.h */, + ); + path = flow; + sourceTree = ""; + }; + BCF50161254A927800DE7797 /* protobuf */ = { + isa = PBXGroup; + children = ( + BCF50162254A927800DE7797 /* stubs */, + BCF50183254A927800DE7797 /* repeated_field.h */, + BCF50186254A927800DE7797 /* arenastring.h */, + BCF50188254A927800DE7797 /* descriptor.h */, + BCF50189254A927800DE7797 /* io */, + BCF5018F254A927800DE7797 /* message.h */, + BCF50191254A927800DE7797 /* text_format.h */, + BCF50192254A927800DE7797 /* has_bits.h */, + BCF50194254A927800DE7797 /* metadata.h */, + BCF50195254A927800DE7797 /* extension_set.h */, + BCF50199254A927800DE7797 /* unknown_field_set.h */, + BCF5019A254A927800DE7797 /* message_lite.h */, + BCF5019B254A927800DE7797 /* generated_message_util.h */, + BCF5019E254A927800DE7797 /* arena.h */, + ); + path = protobuf; + sourceTree = ""; + }; + BCF50162254A927800DE7797 /* stubs */ = { + isa = PBXGroup; + children = ( + BCF50163254A927800DE7797 /* scoped_ptr.h */, + BCF50164254A927800DE7797 /* stl_util.h */, + BCF50165254A927800DE7797 /* atomicops_internals_solaris.h */, + BCF50166254A927800DE7797 /* port.h */, + BCF50167254A927800DE7797 /* atomic_sequence_num.h */, + BCF50168254A927800DE7797 /* atomicops_internals_arm_gcc.h */, + BCF50169254A927800DE7797 /* template_util.h */, + BCF5016A254A927800DE7797 /* atomicops_internals_macosx.h */, + BCF5016B254A927800DE7797 /* atomicops_internals_ppc_gcc.h */, + BCF5016C254A927800DE7797 /* atomicops_internals_mips_gcc.h */, + BCF5016D254A927800DE7797 /* casts.h */, + BCF5016E254A927800DE7797 /* logging.h */, + BCF5016F254A927800DE7797 /* atomicops.h */, + BCF50170254A927800DE7797 /* atomicops_internals_x86_gcc.h */, + BCF50171254A927800DE7797 /* atomicops_internals_power.h */, + BCF50172254A927800DE7797 /* atomicops_internals_x86_msvc.h */, + BCF50173254A927800DE7797 /* atomicops_internals_tsan.h */, + BCF50174254A927800DE7797 /* atomicops_internals_generic_gcc.h */, + BCF50175254A927800DE7797 /* atomicops_internals_arm_qnx.h */, + BCF50176254A927800DE7797 /* common.h */, + BCF50177254A927800DE7797 /* shared_ptr.h */, + BCF50178254A927800DE7797 /* macros.h */, + BCF50179254A927800DE7797 /* atomicops_internals_arm64_gcc.h */, + BCF5017A254A927800DE7797 /* platform_macros.h */, + BCF5017B254A927800DE7797 /* once.h */, + BCF5017C254A927800DE7797 /* fastmem.h */, + BCF5017D254A927800DE7797 /* atomicops_internals_pnacl.h */, + BCF5017E254A927800DE7797 /* mutex.h */, + BCF5017F254A927800DE7797 /* type_traits.h */, + BCF50180254A927800DE7797 /* callback.h */, + BCF50181254A927800DE7797 /* atomicops_internals_atomicword_compat.h */, + ); + path = stubs; + sourceTree = ""; + }; + BCF50189254A927800DE7797 /* io */ = { + isa = PBXGroup; + children = ( + BCF5018A254A927800DE7797 /* zero_copy_stream_impl.h */, + BCF5018B254A927800DE7797 /* zero_copy_stream_impl_lite.h */, + BCF5018C254A927800DE7797 /* strtod.h */, + BCF5018D254A927800DE7797 /* zero_copy_stream.h */, + BCF5018E254A927800DE7797 /* coded_stream.h */, + ); + path = io; + sourceTree = ""; + }; + BCF5FEA32548061D00DE7797 = { + isa = PBXGroup; + children = ( + BCF5FEAE2548061D00DE7797 /* ImageClassificationDemo */, + 
BCF5FECA2548062700DE7797 /* ImageClassificationDemoTests */, + BCF5FED52548062700DE7797 /* ImageClassificationDemoUITests */, + BCF5FEAD2548061D00DE7797 /* Products */, + BCF5FEE42548064F00DE7797 /* Frameworks */, + ); + sourceTree = ""; + }; + BCF5FEAD2548061D00DE7797 /* Products */ = { + isa = PBXGroup; + children = ( + BCF5FEAC2548061D00DE7797 /* ImageClassificationDemo.app */, + BCF5FEC72548062700DE7797 /* ImageClassificationDemoTests.xctest */, + BCF5FED22548062700DE7797 /* ImageClassificationDemoUITests.xctest */, + ); + name = Products; + sourceTree = ""; + }; + BCF5FEAE2548061D00DE7797 /* ImageClassificationDemo */ = { + isa = PBXGroup; + children = ( + BCF5FEAF2548061D00DE7797 /* AppDelegate.h */, + BCF5FEB02548061D00DE7797 /* AppDelegate.m */, + BCF5FEB22548061D00DE7797 /* SceneDelegate.h */, + BCF5FEB32548061D00DE7797 /* SceneDelegate.m */, + BCF5FEB52548061D00DE7797 /* ViewController.h */, + BCF5FEB62548061D00DE7797 /* ViewController.mm */, + BCF5FEB82548061D00DE7797 /* Main.storyboard */, + BCF5FEE7254806B700DE7797 /* libbolt */, + BCF5FEBB2548062600DE7797 /* Assets.xcassets */, + BCF5FEBD2548062600DE7797 /* LaunchScreen.storyboard */, + BCF5FEC02548062600DE7797 /* Info.plist */, + BCF5FEC12548062600DE7797 /* main.m */, + ); + path = ImageClassificationDemo; + sourceTree = ""; + }; + BCF5FECA2548062700DE7797 /* ImageClassificationDemoTests */ = { + isa = PBXGroup; + children = ( + BCF5FECB2548062700DE7797 /* ImageClassificationDemoTests.m */, + BCF5FECD2548062700DE7797 /* Info.plist */, + ); + path = ImageClassificationDemoTests; + sourceTree = ""; + }; + BCF5FED52548062700DE7797 /* ImageClassificationDemoUITests */ = { + isa = PBXGroup; + children = ( + BCF5FED62548062700DE7797 /* ImageClassificationDemoUITests.m */, + BCF5FED82548062700DE7797 /* Info.plist */, + ); + path = ImageClassificationDemoUITests; + sourceTree = ""; + }; + BCF5FEE42548064F00DE7797 /* Frameworks */ = { + isa = PBXGroup; + children = ( + BCF50072254807B400DE7797 /* CoreMedia.framework */, + BCF5FEE52548064F00DE7797 /* AVFoundation.framework */, + ); + name = Frameworks; + sourceTree = ""; + }; + BCF5FEE7254806B700DE7797 /* libbolt */ = { + isa = PBXGroup; + children = ( + BCF501CF254A943E00DE7797 /* image_classification.prototxt */, + BCF5FEE8254806B700DE7797 /* imagenet_classes.txt */, + BCF5FEE9254806B700DE7797 /* ghostnet_f32.bolt */, + BCF50058254806B800DE7797 /* ghostnet_f16.bolt */, + BCF50059254806B800DE7797 /* libflow.a */, + BCF5005A254806B800DE7797 /* libbolt.a */, + BCF5015B254958DA00DE7797 /* libprotobuf.a */, + BCF5009C2549525700DE7797 /* headers */, + ); + path = libbolt; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + BCF5FEAB2548061D00DE7797 /* ImageClassificationDemo */ = { + isa = PBXNativeTarget; + buildConfigurationList = BCF5FEDB2548062700DE7797 /* Build configuration list for PBXNativeTarget "ImageClassificationDemo" */; + buildPhases = ( + BCF5FEA82548061D00DE7797 /* Sources */, + BCF5FEA92548061D00DE7797 /* Frameworks */, + BCF5FEAA2548061D00DE7797 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = ImageClassificationDemo; + productName = ImageClassificationDemo; + productReference = BCF5FEAC2548061D00DE7797 /* ImageClassificationDemo.app */; + productType = "com.apple.product-type.application"; + }; + BCF5FEC62548062700DE7797 /* ImageClassificationDemoTests */ = { + isa = PBXNativeTarget; + buildConfigurationList = BCF5FEDE2548062700DE7797 /* Build configuration list for PBXNativeTarget 
"ImageClassificationDemoTests" */; + buildPhases = ( + BCF5FEC32548062700DE7797 /* Sources */, + BCF5FEC42548062700DE7797 /* Frameworks */, + BCF5FEC52548062700DE7797 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + BCF5FEC92548062700DE7797 /* PBXTargetDependency */, + ); + name = ImageClassificationDemoTests; + productName = ImageClassificationDemoTests; + productReference = BCF5FEC72548062700DE7797 /* ImageClassificationDemoTests.xctest */; + productType = "com.apple.product-type.bundle.unit-test"; + }; + BCF5FED12548062700DE7797 /* ImageClassificationDemoUITests */ = { + isa = PBXNativeTarget; + buildConfigurationList = BCF5FEE12548062700DE7797 /* Build configuration list for PBXNativeTarget "ImageClassificationDemoUITests" */; + buildPhases = ( + BCF5FECE2548062700DE7797 /* Sources */, + BCF5FECF2548062700DE7797 /* Frameworks */, + BCF5FED02548062700DE7797 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + BCF5FED42548062700DE7797 /* PBXTargetDependency */, + ); + name = ImageClassificationDemoUITests; + productName = ImageClassificationDemoUITests; + productReference = BCF5FED22548062700DE7797 /* ImageClassificationDemoUITests.xctest */; + productType = "com.apple.product-type.bundle.ui-testing"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + BCF5FEA42548061D00DE7797 /* Project object */ = { + isa = PBXProject; + attributes = { + LastUpgradeCheck = 1160; + ORGANIZATIONNAME = leironghao; + TargetAttributes = { + BCF5FEAB2548061D00DE7797 = { + CreatedOnToolsVersion = 11.6; + }; + BCF5FEC62548062700DE7797 = { + CreatedOnToolsVersion = 11.6; + TestTargetID = BCF5FEAB2548061D00DE7797; + }; + BCF5FED12548062700DE7797 = { + CreatedOnToolsVersion = 11.6; + TestTargetID = BCF5FEAB2548061D00DE7797; + }; + }; + }; + buildConfigurationList = BCF5FEA72548061D00DE7797 /* Build configuration list for PBXProject "ImageClassificationDemo" */; + compatibilityVersion = "Xcode 9.3"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = BCF5FEA32548061D00DE7797; + productRefGroup = BCF5FEAD2548061D00DE7797 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + BCF5FEAB2548061D00DE7797 /* ImageClassificationDemo */, + BCF5FEC62548062700DE7797 /* ImageClassificationDemoTests */, + BCF5FED12548062700DE7797 /* ImageClassificationDemoUITests */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + BCF5FEAA2548061D00DE7797 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + BCF5006D254806B800DE7797 /* ghostnet_f16.bolt in Resources */, + BCF5FEBF2548062600DE7797 /* LaunchScreen.storyboard in Resources */, + BCF5005B254806B800DE7797 /* imagenet_classes.txt in Resources */, + BCF5FEBC2548062600DE7797 /* Assets.xcassets in Resources */, + BCF501D0254A943E00DE7797 /* image_classification.prototxt in Resources */, + BCF5FEBA2548061D00DE7797 /* Main.storyboard in Resources */, + BCF5005C254806B800DE7797 /* ghostnet_f32.bolt in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + BCF5FEC52548062700DE7797 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; + BCF5FED02548062700DE7797 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin 
PBXSourcesBuildPhase section */ + BCF5FEA82548061D00DE7797 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + BCF5FEB72548061D00DE7797 /* ViewController.mm in Sources */, + BCF5FEB12548061D00DE7797 /* AppDelegate.m in Sources */, + BCF5FEC22548062600DE7797 /* main.m in Sources */, + BCF5FEB42548061D00DE7797 /* SceneDelegate.m in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + BCF5FEC32548062700DE7797 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + BCF5FECC2548062700DE7797 /* ImageClassificationDemoTests.m in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + BCF5FECE2548062700DE7797 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + BCF5FED72548062700DE7797 /* ImageClassificationDemoUITests.m in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin PBXTargetDependency section */ + BCF5FEC92548062700DE7797 /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = BCF5FEAB2548061D00DE7797 /* ImageClassificationDemo */; + targetProxy = BCF5FEC82548062700DE7797 /* PBXContainerItemProxy */; + }; + BCF5FED42548062700DE7797 /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = BCF5FEAB2548061D00DE7797 /* ImageClassificationDemo */; + targetProxy = BCF5FED32548062700DE7797 /* PBXContainerItemProxy */; + }; +/* End PBXTargetDependency section */ + +/* Begin PBXVariantGroup section */ + BCF5FEB82548061D00DE7797 /* Main.storyboard */ = { + isa = PBXVariantGroup; + children = ( + BCF5FEB92548061D00DE7797 /* Base */, + ); + name = Main.storyboard; + sourceTree = ""; + }; + BCF5FEBD2548062600DE7797 /* LaunchScreen.storyboard */ = { + isa = PBXVariantGroup; + children = ( + BCF5FEBE2548062600DE7797 /* Base */, + ); + name = LaunchScreen.storyboard; + sourceTree = ""; + }; +/* End PBXVariantGroup section */ + +/* Begin XCBuildConfiguration section */ + BCF5FED92548062700DE7797 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + 
"DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 13.6; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + }; + name = Debug; + }; + BCF5FEDA2548062700DE7797 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 13.6; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + BCF5FEDC2548062700DE7797 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + CODE_SIGN_STYLE = Automatic; + DEVELOPMENT_TEAM = V3B23BH745; + ENABLE_BITCODE = NO; + INFOPLIST_FILE = ImageClassificationDemo/Info.plist; + IPHONEOS_DEPLOYMENT_TARGET = 9.1; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + LIBRARY_SEARCH_PATHS = ( + "$(inherited)", + "$(PROJECT_DIR)/ImageClassificationDemo/libbolt", + ); + MACH_O_TYPE = mh_execute; + PRODUCT_BUNDLE_IDENTIFIER = huawei.ImageClassificationDemo; + PRODUCT_NAME = "$(TARGET_NAME)"; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + BCF5FEDD2548062700DE7797 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + CODE_SIGN_STYLE = Automatic; + DEVELOPMENT_TEAM = V3B23BH745; + ENABLE_BITCODE = NO; + INFOPLIST_FILE = ImageClassificationDemo/Info.plist; + IPHONEOS_DEPLOYMENT_TARGET = 9.1; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + LIBRARY_SEARCH_PATHS = ( + "$(inherited)", + "$(PROJECT_DIR)/ImageClassificationDemo/libbolt", + ); + MACH_O_TYPE = mh_execute; + 
PRODUCT_BUNDLE_IDENTIFIER = huawei.ImageClassificationDemo; + PRODUCT_NAME = "$(TARGET_NAME)"; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; + BCF5FEDF2548062700DE7797 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + BUNDLE_LOADER = "$(TEST_HOST)"; + CODE_SIGN_STYLE = Automatic; + DEVELOPMENT_TEAM = V3B23BH745; + INFOPLIST_FILE = ImageClassificationDemoTests/Info.plist; + IPHONEOS_DEPLOYMENT_TARGET = 13.6; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + "@loader_path/Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = huawei.ImageClassificationDemoTests; + PRODUCT_NAME = "$(TARGET_NAME)"; + TARGETED_DEVICE_FAMILY = "1,2"; + TEST_HOST = "$(BUILT_PRODUCTS_DIR)/ImageClassificationDemo.app/ImageClassificationDemo"; + }; + name = Debug; + }; + BCF5FEE02548062700DE7797 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + BUNDLE_LOADER = "$(TEST_HOST)"; + CODE_SIGN_STYLE = Automatic; + DEVELOPMENT_TEAM = V3B23BH745; + INFOPLIST_FILE = ImageClassificationDemoTests/Info.plist; + IPHONEOS_DEPLOYMENT_TARGET = 13.6; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + "@loader_path/Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = huawei.ImageClassificationDemoTests; + PRODUCT_NAME = "$(TARGET_NAME)"; + TARGETED_DEVICE_FAMILY = "1,2"; + TEST_HOST = "$(BUILT_PRODUCTS_DIR)/ImageClassificationDemo.app/ImageClassificationDemo"; + }; + name = Release; + }; + BCF5FEE22548062700DE7797 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + CODE_SIGN_STYLE = Automatic; + DEVELOPMENT_TEAM = V3B23BH745; + INFOPLIST_FILE = ImageClassificationDemoUITests/Info.plist; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + "@loader_path/Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = huawei.ImageClassificationDemoUITests; + PRODUCT_NAME = "$(TARGET_NAME)"; + TARGETED_DEVICE_FAMILY = "1,2"; + TEST_TARGET_NAME = ImageClassificationDemo; + }; + name = Debug; + }; + BCF5FEE32548062700DE7797 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + CODE_SIGN_STYLE = Automatic; + DEVELOPMENT_TEAM = V3B23BH745; + INFOPLIST_FILE = ImageClassificationDemoUITests/Info.plist; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + "@loader_path/Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = huawei.ImageClassificationDemoUITests; + PRODUCT_NAME = "$(TARGET_NAME)"; + TARGETED_DEVICE_FAMILY = "1,2"; + TEST_TARGET_NAME = ImageClassificationDemo; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + BCF5FEA72548061D00DE7797 /* Build configuration list for PBXProject "ImageClassificationDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + BCF5FED92548062700DE7797 /* Debug */, + BCF5FEDA2548062700DE7797 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + BCF5FEDB2548062700DE7797 /* Build configuration list for PBXNativeTarget "ImageClassificationDemo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + BCF5FEDC2548062700DE7797 /* Debug */, + BCF5FEDD2548062700DE7797 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + BCF5FEDE2548062700DE7797 /* Build configuration list for PBXNativeTarget "ImageClassificationDemoTests" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + BCF5FEDF2548062700DE7797 /* Debug */, + BCF5FEE02548062700DE7797 /* Release */, + 
);
+            defaultConfigurationIsVisible = 0;
+            defaultConfigurationName = Release;
+        };
+        BCF5FEE12548062700DE7797 /* Build configuration list for PBXNativeTarget "ImageClassificationDemoUITests" */ = {
+            isa = XCConfigurationList;
+            buildConfigurations = (
+                BCF5FEE22548062700DE7797 /* Debug */,
+                BCF5FEE32548062700DE7797 /* Release */,
+            );
+            defaultConfigurationIsVisible = 0;
+            defaultConfigurationName = Release;
+        };
+/* End XCConfigurationList section */
+    };
+    rootObject = BCF5FEA42548061D00DE7797 /* Project object */;
+}
diff --git a/kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata b/kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata
new file mode 100644
index 00000000..96a1b668
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<Workspace
+   version = "1.0">
+   <FileRef
+      location = "self:ImageClassificationDemo.xcodeproj">
+   </FileRef>
+</Workspace>
diff --git a/kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist b/kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
new file mode 100644
index 00000000..18d98100
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>IDEDidComputeMac32BitWarning</key>
+	<true/>
+</dict>
+</plist>
diff --git a/kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/project.xcworkspace/xcuserdata/aizhen.xcuserdatad/UserInterfaceState.xcuserstate b/kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/project.xcworkspace/xcuserdata/aizhen.xcuserdatad/UserInterfaceState.xcuserstate
new file mode 100644
index 00000000..04a7242f
Binary files /dev/null and b/kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/project.xcworkspace/xcuserdata/aizhen.xcuserdatad/UserInterfaceState.xcuserstate differ
diff --git a/kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/xcuserdata/aizhen.xcuserdatad/xcschemes/xcschememanagement.plist b/kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/xcuserdata/aizhen.xcuserdatad/xcschemes/xcschememanagement.plist
new file mode 100644
index 00000000..f91123a8
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/xcuserdata/aizhen.xcuserdatad/xcschemes/xcschememanagement.plist
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>SchemeUserState</key>
+	<dict>
+		<key>ImageClassificationDemo.xcscheme_^#shared#^_</key>
+		<dict>
+			<key>orderHint</key>
+			<integer>0</integer>
+		</dict>
+	</dict>
+</dict>
+</plist>
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/AppDelegate.h b/kit/iOS/image_classification/ImageClassificationDemo/AppDelegate.h
new file mode 100644
index 00000000..913c6b8a
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/AppDelegate.h
@@ -0,0 +1,18 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#import <UIKit/UIKit.h>
+
+@interface AppDelegate : UIResponder <UIApplicationDelegate>
+
+@end
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/AppDelegate.m b/kit/iOS/image_classification/ImageClassificationDemo/AppDelegate.m
new file mode 100644
index 00000000..9300e239
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/AppDelegate.m
@@ -0,0 +1,46 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#import "AppDelegate.h"
+
+@interface AppDelegate ()
+
+@end
+
+@implementation AppDelegate
+
+
+- (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
+    // Override point for customization after application launch.
+    return YES;
+}
+
+
+#pragma mark - UISceneSession lifecycle
+
+
+- (UISceneConfiguration *)application:(UIApplication *)application configurationForConnectingSceneSession:(UISceneSession *)connectingSceneSession options:(UISceneConnectionOptions *)options {
+    // Called when a new scene session is being created.
+    // Use this method to select a configuration to create the new scene with.
+    return [[UISceneConfiguration alloc] initWithName:@"Default Configuration" sessionRole:connectingSceneSession.role];
+}
+
+
+- (void)application:(UIApplication *)application didDiscardSceneSessions:(NSSet<UISceneSession *> *)sceneSessions {
+    // Called when the user discards a scene session.
+    // If any sessions were discarded while the application was not running, this will be called shortly after application:didFinishLaunchingWithOptions.
+    // Use this method to release any resources that were specific to the discarded scenes, as they will not return.
+} + + +@end diff --git a/kit/iOS/image_classification/ImageClassificationDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/kit/iOS/image_classification/ImageClassificationDemo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 00000000..9221b9bb --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,98 @@ +{ + "images" : [ + { + "idiom" : "iphone", + "scale" : "2x", + "size" : "20x20" + }, + { + "idiom" : "iphone", + "scale" : "3x", + "size" : "20x20" + }, + { + "idiom" : "iphone", + "scale" : "2x", + "size" : "29x29" + }, + { + "idiom" : "iphone", + "scale" : "3x", + "size" : "29x29" + }, + { + "idiom" : "iphone", + "scale" : "2x", + "size" : "40x40" + }, + { + "idiom" : "iphone", + "scale" : "3x", + "size" : "40x40" + }, + { + "idiom" : "iphone", + "scale" : "2x", + "size" : "60x60" + }, + { + "idiom" : "iphone", + "scale" : "3x", + "size" : "60x60" + }, + { + "idiom" : "ipad", + "scale" : "1x", + "size" : "20x20" + }, + { + "idiom" : "ipad", + "scale" : "2x", + "size" : "20x20" + }, + { + "idiom" : "ipad", + "scale" : "1x", + "size" : "29x29" + }, + { + "idiom" : "ipad", + "scale" : "2x", + "size" : "29x29" + }, + { + "idiom" : "ipad", + "scale" : "1x", + "size" : "40x40" + }, + { + "idiom" : "ipad", + "scale" : "2x", + "size" : "40x40" + }, + { + "idiom" : "ipad", + "scale" : "1x", + "size" : "76x76" + }, + { + "idiom" : "ipad", + "scale" : "2x", + "size" : "76x76" + }, + { + "idiom" : "ipad", + "scale" : "2x", + "size" : "83.5x83.5" + }, + { + "idiom" : "ios-marketing", + "scale" : "1x", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/kit/iOS/image_classification/ImageClassificationDemo/Assets.xcassets/Contents.json b/kit/iOS/image_classification/ImageClassificationDemo/Assets.xcassets/Contents.json new file mode 100644 index 00000000..73c00596 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/kit/iOS/image_classification/ImageClassificationDemo/Base.lproj/LaunchScreen.storyboard b/kit/iOS/image_classification/ImageClassificationDemo/Base.lproj/LaunchScreen.storyboard new file mode 100644 index 00000000..865e9329 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/Base.lproj/LaunchScreen.storyboard @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/kit/iOS/image_classification/ImageClassificationDemo/Base.lproj/Main.storyboard b/kit/iOS/image_classification/ImageClassificationDemo/Base.lproj/Main.storyboard new file mode 100644 index 00000000..808a21ce --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/Base.lproj/Main.storyboard @@ -0,0 +1,24 @@ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/kit/iOS/image_classification/ImageClassificationDemo/Info.plist b/kit/iOS/image_classification/ImageClassificationDemo/Info.plist new file mode 100644 index 00000000..0b383a6a --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/Info.plist @@ -0,0 +1,66 @@ + + + + + CFBundleDevelopmentRegion + $(DEVELOPMENT_LANGUAGE) + CFBundleExecutable + $(EXECUTABLE_NAME) + CFBundleIdentifier + $(PRODUCT_BUNDLE_IDENTIFIER) + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + $(PRODUCT_NAME) + CFBundlePackageType + $(PRODUCT_BUNDLE_PACKAGE_TYPE) + CFBundleShortVersionString + 1.0 + CFBundleVersion + 1 + 
LSRequiresIPhoneOS
+	
+	NSCameraUsageDescription
+	Camera access is needed to capture images
+	UIApplicationSceneManifest
+	
+	UIApplicationSupportsMultipleScenes
+	
+	UISceneConfigurations
+	
+	UIWindowSceneSessionRoleApplication
+	
+	
+	UISceneConfigurationName
+	Default Configuration
+	UISceneDelegateClassName
+	SceneDelegate
+	UISceneStoryboardFile
+	Main
+	
+	
+	
+	
+	UILaunchStoryboardName
+	LaunchScreen
+	UIMainStoryboardFile
+	Main
+	UIRequiredDeviceCapabilities
+	
+	armv7
+	
+	UISupportedInterfaceOrientations
+	
+	UIInterfaceOrientationPortrait
+	UIInterfaceOrientationLandscapeLeft
+	UIInterfaceOrientationLandscapeRight
+	
+	UISupportedInterfaceOrientations~ipad
+	
+	UIInterfaceOrientationPortrait
+	UIInterfaceOrientationPortraitUpsideDown
+	UIInterfaceOrientationLandscapeLeft
+	UIInterfaceOrientationLandscapeRight
+	
+	
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/SceneDelegate.h b/kit/iOS/image_classification/ImageClassificationDemo/SceneDelegate.h
new file mode 100644
index 00000000..36e5317f
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/SceneDelegate.h
@@ -0,0 +1,20 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#import <UIKit/UIKit.h>
+
+@interface SceneDelegate : UIResponder <UIWindowSceneDelegate>
+
+@property (strong, nonatomic) UIWindow *window;
+
+@end
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/SceneDelegate.m b/kit/iOS/image_classification/ImageClassificationDemo/SceneDelegate.m
new file mode 100644
index 00000000..4165c8c1
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/SceneDelegate.m
@@ -0,0 +1,63 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#import "SceneDelegate.h"
+
+@interface SceneDelegate ()
+
+@end
+
+@implementation SceneDelegate
+
+
+- (void)scene:(UIScene *)scene willConnectToSession:(UISceneSession *)session options:(UISceneConnectionOptions *)connectionOptions {
+    // Use this method to optionally configure and attach the UIWindow `window` to the provided UIWindowScene `scene`.
+    // If using a storyboard, the `window` property will automatically be initialized and attached to the scene.
+    // This delegate does not imply the connecting scene or session are new (see `application:configurationForConnectingSceneSession` instead).
+}
+
+
+- (void)sceneDidDisconnect:(UIScene *)scene {
+    // Called as the scene is being released by the system.
+    // This occurs shortly after the scene enters the background, or when its session is discarded.
+    // Release any resources associated with this scene that can be re-created the next time the scene connects.
+    // The scene may re-connect later, as its session was not necessarily discarded (see `application:didDiscardSceneSessions` instead).
+}
+
+
+- (void)sceneDidBecomeActive:(UIScene *)scene {
+    // Called when the scene has moved from an inactive state to an active state.
+    // Use this method to restart any tasks that were paused (or not yet started) when the scene was inactive.
+}
+
+
+- (void)sceneWillResignActive:(UIScene *)scene {
+    // Called when the scene will move from an active state to an inactive state.
+    // This may occur due to temporary interruptions (ex. an incoming phone call).
+}
+
+
+- (void)sceneWillEnterForeground:(UIScene *)scene {
+    // Called as the scene transitions from the background to the foreground.
+    // Use this method to undo the changes made on entering the background.
+}
+
+
+- (void)sceneDidEnterBackground:(UIScene *)scene {
+    // Called as the scene transitions from the foreground to the background.
+    // Use this method to save data, release shared resources, and store enough scene-specific state information
+    // to restore the scene back to its current state.
+}
+
+
+@end
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/ViewController.h b/kit/iOS/image_classification/ImageClassificationDemo/ViewController.h
new file mode 100644
index 00000000..50e620ae
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/ViewController.h
@@ -0,0 +1,18 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#import <UIKit/UIKit.h>
+
+@interface ViewController : UIViewController
+
+@end
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/ViewController.mm b/kit/iOS/image_classification/ImageClassificationDemo/ViewController.mm
new file mode 100644
index 00000000..7a6be416
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/ViewController.mm
@@ -0,0 +1,299 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
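+
+// Demo pipeline (descriptive comment, not in the original source): each camera
+// frame is scaled to 224x224 RGBA, pixelProcess repacks it into the model's
+// float input tensor, Flow runs the bundled bolt model, and postProcess writes
+// the top-5 class indices to the "output" tensor shown in the score label.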
+
+#import "ViewController.h"
+#import <AVFoundation/AVFoundation.h>
+
+#include "kit_flags.h"
+#include "flow.h"
+
+using namespace std;
+@interface ViewController () <AVCaptureVideoDataOutputSampleBufferDelegate>
+
+@property (nonatomic,strong) UIImageView *imgView;
+@property (nonatomic,strong) UILabel *scoreLabel;
+@property (nonatomic,strong) AVCaptureVideoDataOutput *videoOutput;
+@property (nonatomic,strong) NSMutableArray *rgbDataArr;
+@property (nonatomic,strong) dispatch_queue_t queue;
+
+@property (nonatomic,strong) NSArray *transTypeArr;
+@property (nonatomic,assign) BOOL isFirst;
+
+@property (nonatomic,strong) NSString *dstPath;
+
+@end
+
+DataType inferencePrecision = DT_F32;
+const int topK=5;
+const int width=224;
+const int height=224;
+Flow flowExample;
+
+using namespace std;
+
+@implementation ViewController
+
+
+- (void)viewDidLoad
+{
+    [super viewDidLoad];
+    self.view.backgroundColor=[UIColor whiteColor];
+
+    flowRegisterFunction("pixelProcess", pixelProcess);
+    flowRegisterFunction("postProcess", postProcess);
+
+    NSString *typePath=[[NSBundle mainBundle]pathForResource:@"imagenet_classes" ofType:@"txt"];
+    NSString *typeStr=[NSString stringWithContentsOfFile:typePath encoding:NSUTF8StringEncoding error:nil];
+    _transTypeArr=[NSArray arrayWithArray:[typeStr componentsSeparatedByString:@"\n"]];
+
+    [self setupAVCapture];
+
+    _scoreLabel=[[UILabel alloc]initWithFrame:CGRectMake(0, self.view.frame.size.height / 2 + 140, self.view.frame.size.width, 160)];
+    _scoreLabel.font=[UIFont boldSystemFontOfSize:20];
+    _scoreLabel.textColor=[UIColor blueColor];
+    _scoreLabel.textAlignment=NSTextAlignmentCenter;
+    _scoreLabel.numberOfLines=6;
+    [self.view addSubview:_scoreLabel];
+
+    NSString *graphPathStr=[[NSBundle mainBundle]pathForResource:@"image_classification" ofType:@"prototxt"];
+
+    NSArray *path = NSSearchPathForDirectoriesInDomains(NSDocumentDirectory, NSUserDomainMask, YES);
+    NSString *docDirectory = [path objectAtIndex:0];
+
+    _dstPath = [docDirectory stringByAppendingPathComponent:@"image_classification.prototxt"];
+    [[NSFileManager defaultManager] copyItemAtPath:graphPathStr toPath:_dstPath error:nil];
+
+    NSString *myStr=[[NSString alloc]initWithContentsOfFile:_dstPath encoding:NSUTF8StringEncoding error:nil];
+    NSMutableArray *arr=[NSMutableArray arrayWithArray:[myStr componentsSeparatedByString:@"inference_parameter:"]];
+
+    NSString *boltPath=[[NSBundle mainBundle]pathForResource:@"ghostnet_f32" ofType:@"bolt"];
+
+    NSString *changeStr=[NSString stringWithFormat:@"%@inference_parameter:\"%@\"\ninference_parameter:\"\"\n}", arr[0], boltPath];
+
+    NSError *error=nil;
+    [changeStr writeToFile:_dstPath atomically:YES encoding:NSUTF8StringEncoding error:&error];
+    if (error) {
+        NSLog(@"%@",error);
+    }
+
+    char* gPath =(char *)[_dstPath UTF8String];
+    std::string imageClassificationGraphPath = gPath;
+    std::vector<std::string> graphPath = {imageClassificationGraphPath};
+    int threads = 1;
+
+    flowExample.init(graphPath, inferencePrecision, AFFINITY_CPU_HIGH_PERFORMANCE, threads, false);
+}
+
+EE pixelProcess(std::map<std::string, std::shared_ptr<Tensor>> &inputs,
+    std::shared_ptr<Tensor> &tmp,
+    std::map<std::string, std::shared_ptr<Tensor>> &outputs,
+    std::vector<std::string> parameter = std::vector<std::string>())
+{
+    // RGBA
+    unsigned char *myBuffer =(unsigned char *)((CpuMemory*)inputs["input:1"]->get_memory())->get_ptr();
+
+    F32 *oneArr = (F32 *)((CpuMemory *)outputs["input:0"]->get_memory())->get_ptr();
+
+    for (int i = 0; i < height; i++) {
+        for (int y = 0; y < width; y++) {
+            unsigned char r = myBuffer[i * width * 4 + y * 4];
+            unsigned char g = myBuffer[i * width * 4 + y * 4 + 1];
+            unsigned char b = myBuffer[i * width * 4 + y * 4 + 2];
+
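+            // Added comment: channels are written in BGR order with no
+            // mean/scale normalization; the bundled ghostnet_f32.bolt model
+            // is assumed to expect raw BGR input.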
+            oneArr[i * 3 * width + y * 3] = b;
+            oneArr[i * 3 * width + y * 3 + 1] = g;
+            oneArr[i * 3 * width + y * 3 + 2] = r;
+        }
+    }
+    return SUCCESS;
+}
+
+EE postProcess(std::map<std::string, std::shared_ptr<Tensor>> &inputs,
+    std::shared_ptr<Tensor> &tmp,
+    std::map<std::string, std::shared_ptr<Tensor>> &outputs,
+    std::vector<std::string> parameter = std::vector<std::string>())
+{
+    std::string flowInferenceNodeOutputName = "output";
+    std::string boltModelOutputName = "MobileNetV2/Predictions/Softmax:0";
+
+    int *flowInferenceNodeOutput = (int *)((CpuMemory *)outputs[flowInferenceNodeOutputName]->get_memory())->get_ptr();
+
+    F32 *score1000 =(F32 *)((CpuMemory *)inputs[boltModelOutputName]->get_memory())->get_ptr();
+
+    // Top-K selection: repeatedly take the argmax, then mask it out.
+    for (int i = 0; i < topK; i++) {
+        int max_index = 0;
+        for (int j = 1; j < 1000; j++) {
+            if (score1000[j] > score1000[max_index]) {
+                max_index = j;
+            }
+        }
+        flowInferenceNodeOutput[i] = max_index;
+        score1000[max_index] = -65504;
+    }
+    return SUCCESS;
+}
+
+
+std::map<std::string, std::shared_ptr<Tensor>> inputOutput(const unsigned char * myBuffer)
+{
+    std::map<std::string, std::shared_ptr<Tensor>> tensors;
+    TensorDesc inputDesc = tensor4df(DT_U8, DF_NCHW, 1, 224, 224, 4);
+
+    tensors["input:1"] = std::shared_ptr<Tensor>(new Tensor());
+    tensors["input:1"]->resize(inputDesc);
+    tensors["input:1"]->alloc();
+    void *ptr = (void *)((CpuMemory *)tensors["input:1"]->get_memory())->get_ptr();
+    memcpy(ptr, myBuffer, tensorNumBytes(inputDesc));
+
+    tensors["output"] = std::shared_ptr<Tensor>(new Tensor());
+    tensors["output"]->resize(
+        tensor2df(DT_I32, DF_NCHW, 1, topK));
+    tensors["output"]->alloc();
+
+    return tensors;
+}
+
+-(void)captureOutput:(AVCaptureOutput *)output didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer fromConnection:(AVCaptureConnection *)connection
+{
+    if (!_isFirst) {
+        // The first frame is dark
+        __weak typeof(self)weakSelf=self;
+        dispatch_after(dispatch_time(DISPATCH_TIME_NOW, (int64_t)(NSEC_PER_SEC*0.3)), dispatch_get_main_queue(), ^{
+            weakSelf.isFirst=YES;
+
+        });
+        return;
+    }
+    UIImage* image = [self imageWithImageSimple:[self imageFromSampleBuffer:sampleBuffer] scaledToSize:CGSizeMake(224, 224)];
+
+    [self.videoOutput setSampleBufferDelegate:nil queue:self.queue];
+
+    __weak typeof(self)weakSelf=self;
+    dispatch_async(dispatch_get_global_queue(0, 0), ^{
+        CGImageRef img = [image CGImage];
+        CFDataRef data = CGDataProviderCopyData(CGImageGetDataProvider(img));
+        const unsigned char *buffer = CFDataGetBytePtr(data);
+
+        [weakSelf beginLoadData:buffer];
+
+        CFRelease(data);
+        weakSelf.queue = dispatch_queue_create("myQueue", NULL);
+        [weakSelf.videoOutput setSampleBufferDelegate:weakSelf queue:weakSelf.queue];
+    });
+}
+
+-(void)beginLoadData:(const unsigned char * )myBuffer
+{
+
+    char* gPath =(char *)[_dstPath UTF8String];
+
+    int num = 1;
+    std::string imageClassificationGraphPath = gPath;
+
+    for (int i = 0; i < num; i++) {
+        std::map<std::string, std::shared_ptr<Tensor>> data = inputOutput(myBuffer);
+        Task task(imageClassificationGraphPath, data);
+        flowExample.enqueue(task);
+    }
+
+    std::vector<Task> results;
+    double start = ut_time_ms();
+    UNI_PROFILE(results = flowExample.dequeue(true), std::string("image_classification"),
+        std::string("image_classification"));
+    double end = ut_time_ms();
+
+    int *top5 =(int *)((CpuMemory *)results[0].data["output"]->get_memory())->get_ptr();
+
+    __weak typeof(self)weakSelf = self;
+    dispatch_async(dispatch_get_main_queue(), ^{
+        for (int i = 0; i < 5; i++) {
+            if (i == 0) {
+                weakSelf.scoreLabel.text=[NSString stringWithFormat:@"%d:%@",i+1,weakSelf.transTypeArr[top5[i]]];
+            } else {
+                weakSelf.scoreLabel.text=[NSString stringWithFormat:@"%@\n%d,%@",weakSelf.scoreLabel.text,i+1,weakSelf.transTypeArr[top5[i]]];
+            }
+        }
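+        // Append the average latency per run; num is 1 here, so this is
+        // simply the time of the single enqueue/dequeue measured above.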
weakSelf.scoreLabel.text=[NSString stringWithFormat:@"%@\ntime=%lfms",weakSelf.scoreLabel.text,(end - start) / num]; + }); +} + +-(void)setupAVCapture +{ + NSError *error=nil; + + AVCaptureSession *session=[[AVCaptureSession alloc] init]; + session.sessionPreset=AVCaptureSessionPreset1280x720; + [session beginConfiguration]; + + AVCaptureDevice *device=[AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo]; + AVCaptureDeviceInput *deviceInput=[AVCaptureDeviceInput deviceInputWithDevice:device error:&error]; + if ([session canAddInput:deviceInput]) { + [session addInput:deviceInput]; + } + + _videoOutput=[[AVCaptureVideoDataOutput alloc]init]; + _videoOutput.alwaysDiscardsLateVideoFrames=YES; + _videoOutput.videoSettings=[NSDictionary dictionaryWithObject:[NSNumber numberWithInt:kCVPixelFormatType_32BGRA] forKey:(id)kCVPixelBufferPixelFormatTypeKey]; + if ([session canAddOutput:_videoOutput]) { + [session addOutput:_videoOutput]; + } + + self.queue=dispatch_queue_create("myQueue", NULL); + [_videoOutput setSampleBufferDelegate:self queue:self.queue]; + AVCaptureVideoPreviewLayer *preLayer=[AVCaptureVideoPreviewLayer layerWithSession:session]; + preLayer.frame=CGRectMake((self.view.frame.size.width-width)/2, (self.view.frame.size.height-height)/2-100, width, height); + preLayer.videoGravity=AVLayerVideoGravityResizeAspectFill; + [self.view.layer addSublayer:preLayer]; + + [session commitConfiguration]; + [session startRunning]; +} + +-(UIImage *)imageFromSampleBuffer:(CMSampleBufferRef)sampleBuffer +{ + CVImageBufferRef imageBuffer=CMSampleBufferGetImageBuffer(sampleBuffer); + + CVPixelBufferLockBaseAddress(imageBuffer, 0); + + size_t bytesPerRow=CVPixelBufferGetBytesPerRow(imageBuffer); + + size_t width=CVPixelBufferGetWidth(imageBuffer); + size_t height=CVPixelBufferGetHeight(imageBuffer); + + uint8_t *baseAddress = (uint8_t *)CVPixelBufferGetBaseAddress(imageBuffer); + + CVPixelBufferUnlockBaseAddress(imageBuffer,0); + + CGColorSpaceRef colorSpace=CGColorSpaceCreateDeviceRGB(); + + CGContextRef context=CGBitmapContextCreate(baseAddress, width, height, 8, bytesPerRow, colorSpace, kCGBitmapByteOrder32Little|kCGImageAlphaPremultipliedFirst); + + CGImageRef quartzImage = CGBitmapContextCreateImage(context); + + CGContextRelease(context); + CGColorSpaceRelease(colorSpace); + + UIImage *image = [UIImage imageWithCGImage:quartzImage scale:1.0 orientation:UIImageOrientationRight]; + + CGImageRelease(quartzImage); + return (image); +} + +-(UIImage *)imageWithImageSimple:(UIImage*)image scaledToSize:(CGSize)newSize +{ + UIGraphicsBeginImageContext(newSize); + [image drawInRect:CGRectMake(0, 0, newSize.width, newSize.height)]; + UIImage *newImage=UIGraphicsGetImageFromCurrentImageContext(); + UIGraphicsEndImageContext(); + return newImage; +} +@end diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/flow/flow.pb.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/flow/flow.pb.h new file mode 100644 index 00000000..70e3fd4b --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/flow/flow.pb.h @@ -0,0 +1,1374 @@ +// Generated by the protocol buffer compiler. 
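+// (protoc 3.1.x output; the _USE_XCODE branches below switch between the
+// bundled local protobuf headers and the standard angle-bracket includes.)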
+// If regenerated, please remove header prefix before using in xcode
+// source: flow.proto
+
+#ifndef PROTOBUF_flow_2eproto__INCLUDED
+#define PROTOBUF_flow_2eproto__INCLUDED
+
+#include <string>
+
+#ifdef _USE_XCODE
+#include "common.h"
+#else
+#include <google/protobuf/stubs/common.h>
+#endif
+
+#if GOOGLE_PROTOBUF_VERSION < 3001000
+#error This file was generated by a newer version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please update
+#error your headers.
+#endif
+#if 3001000 < GOOGLE_PROTOBUF_MIN_PROTOC_VERSION
+#error This file was generated by an older version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please
+#error regenerate this file with a newer version of protoc.
+#endif
+
+#ifdef _USE_XCODE
+#include "arena.h"
+#include "arenastring.h"
+#include "generated_message_util.h"
+#include "metadata.h"
+#include "message.h"
+#include "repeated_field.h"
+#include "extension_set.h"
+#include "unknown_field_set.h"
+#else
+#include <google/protobuf/arena.h>
+#include <google/protobuf/arenastring.h>
+#include <google/protobuf/generated_message_util.h>
+#include <google/protobuf/metadata.h>
+#include <google/protobuf/message.h>
+#include <google/protobuf/repeated_field.h>
+#include <google/protobuf/extension_set.h>
+#include <google/protobuf/unknown_field_set.h>
+#endif
+// @@protoc_insertion_point(includes)
+
+namespace flow {
+
+// Internal implementation detail -- do not call these.
+void protobuf_AddDesc_flow_2eproto();
+void protobuf_InitDefaults_flow_2eproto();
+void protobuf_AssignDesc_flow_2eproto();
+void protobuf_ShutdownFile_flow_2eproto();
+
+class GraphParameter;
+class NodeParameter;
+
+// ===================================================================
+
+class GraphParameter : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:flow.GraphParameter) */ {
+ public:
+  GraphParameter();
+  virtual ~GraphParameter();
+
+  GraphParameter(const GraphParameter& from);
+
+  inline GraphParameter& operator=(const GraphParameter& from) {
+    CopyFrom(from);
+    return *this;
+  }
+
+  inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const {
+    return _internal_metadata_.unknown_fields();
+  }
+
+  inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() {
+    return _internal_metadata_.mutable_unknown_fields();
+  }
+
+  static const ::google::protobuf::Descriptor* descriptor();
+  static const GraphParameter& default_instance();
+
+  static const GraphParameter* internal_default_instance();
+
+  void Swap(GraphParameter* other);
+
+  // implements Message ----------------------------------------------
+
+  inline GraphParameter* New() const { return New(NULL); }
+
+  GraphParameter* New(::google::protobuf::Arena* arena) const;
+  void CopyFrom(const ::google::protobuf::Message& from);
+  void MergeFrom(const ::google::protobuf::Message& from);
+  void CopyFrom(const GraphParameter& from);
+  void MergeFrom(const GraphParameter& from);
+  void Clear();
+  bool IsInitialized() const;
+
+  size_t ByteSizeLong() const;
+  bool MergePartialFromCodedStream(
+      ::google::protobuf::io::CodedInputStream* input);
+  void SerializeWithCachedSizes(
+      ::google::protobuf::io::CodedOutputStream* output) const;
+  ::google::protobuf::uint8* InternalSerializeWithCachedSizesToArray(
+      bool deterministic, ::google::protobuf::uint8* output) const;
+  ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const {
+    return InternalSerializeWithCachedSizesToArray(false, output);
+  }
+  int GetCachedSize() const { return _cached_size_; }
+  private:
+  void SharedCtor();
+  void SharedDtor();
+  void SetCachedSize(int size) const;
+  void InternalSwap(GraphParameter* other);
+  void UnsafeMergeFrom(const GraphParameter& from);
+  private:
+  inline ::google::protobuf::Arena*
GetArenaNoVirtual() const { + return _internal_metadata_.arena(); + } + inline void* MaybeArenaPtr() const { + return _internal_metadata_.raw_arena_ptr(); + } + public: + + ::google::protobuf::Metadata GetMetadata() const; + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + // optional string name = 1; + bool has_name() const; + void clear_name(); + static const int kNameFieldNumber = 1; + const ::std::string& name() const; + void set_name(const ::std::string& value); + void set_name(const char* value); + void set_name(const char* value, size_t size); + ::std::string* mutable_name(); + ::std::string* release_name(); + void set_allocated_name(::std::string* name); + + // repeated string input = 2; + int input_size() const; + void clear_input(); + static const int kInputFieldNumber = 2; + const ::std::string& input(int index) const; + ::std::string* mutable_input(int index); + void set_input(int index, const ::std::string& value); + void set_input(int index, const char* value); + void set_input(int index, const char* value, size_t size); + ::std::string* add_input(); + void add_input(const ::std::string& value); + void add_input(const char* value); + void add_input(const char* value, size_t size); + const ::google::protobuf::RepeatedPtrField< ::std::string>& input() const; + ::google::protobuf::RepeatedPtrField< ::std::string>* mutable_input(); + + // repeated string output = 3; + int output_size() const; + void clear_output(); + static const int kOutputFieldNumber = 3; + const ::std::string& output(int index) const; + ::std::string* mutable_output(int index); + void set_output(int index, const ::std::string& value); + void set_output(int index, const char* value); + void set_output(int index, const char* value, size_t size); + ::std::string* add_output(); + void add_output(const ::std::string& value); + void add_output(const char* value); + void add_output(const char* value, size_t size); + const ::google::protobuf::RepeatedPtrField< ::std::string>& output() const; + ::google::protobuf::RepeatedPtrField< ::std::string>* mutable_output(); + + // repeated .flow.NodeParameter node = 4; + int node_size() const; + void clear_node(); + static const int kNodeFieldNumber = 4; + const ::flow::NodeParameter& node(int index) const; + ::flow::NodeParameter* mutable_node(int index); + ::flow::NodeParameter* add_node(); + ::google::protobuf::RepeatedPtrField< ::flow::NodeParameter >* + mutable_node(); + const ::google::protobuf::RepeatedPtrField< ::flow::NodeParameter >& + node() const; + + // @@protoc_insertion_point(class_scope:flow.GraphParameter) + private: + inline void set_has_name(); + inline void clear_has_name(); + + ::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_; + ::google::protobuf::internal::HasBits<1> _has_bits_; + mutable int _cached_size_; + ::google::protobuf::RepeatedPtrField< ::std::string> input_; + ::google::protobuf::RepeatedPtrField< ::std::string> output_; + ::google::protobuf::RepeatedPtrField< ::flow::NodeParameter > node_; + ::google::protobuf::internal::ArenaStringPtr name_; + friend void protobuf_InitDefaults_flow_2eproto_impl(); + friend void protobuf_AddDesc_flow_2eproto_impl(); + friend void protobuf_AssignDesc_flow_2eproto(); + friend void protobuf_ShutdownFile_flow_2eproto(); + + void InitAsDefaultInstance(); +}; +extern ::google::protobuf::internal::ExplicitlyConstructed GraphParameter_default_instance_; + +// 
------------------------------------------------------------------- + +class NodeParameter : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:flow.NodeParameter) */ { + public: + NodeParameter(); + virtual ~NodeParameter(); + + NodeParameter(const NodeParameter& from); + + inline NodeParameter& operator=(const NodeParameter& from) { + CopyFrom(from); + return *this; + } + + inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields(); + } + + inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields(); + } + + static const ::google::protobuf::Descriptor* descriptor(); + static const NodeParameter& default_instance(); + + static const NodeParameter* internal_default_instance(); + + void Swap(NodeParameter* other); + + // implements Message ---------------------------------------------- + + inline NodeParameter* New() const { return New(NULL); } + + NodeParameter* New(::google::protobuf::Arena* arena) const; + void CopyFrom(const ::google::protobuf::Message& from); + void MergeFrom(const ::google::protobuf::Message& from); + void CopyFrom(const NodeParameter& from); + void MergeFrom(const NodeParameter& from); + void Clear(); + bool IsInitialized() const; + + size_t ByteSizeLong() const; + bool MergePartialFromCodedStream( + ::google::protobuf::io::CodedInputStream* input); + void SerializeWithCachedSizes( + ::google::protobuf::io::CodedOutputStream* output) const; + ::google::protobuf::uint8* InternalSerializeWithCachedSizesToArray( + bool deterministic, ::google::protobuf::uint8* output) const; + ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const { + return InternalSerializeWithCachedSizesToArray(false, output); + } + int GetCachedSize() const { return _cached_size_; } + private: + void SharedCtor(); + void SharedDtor(); + void SetCachedSize(int size) const; + void InternalSwap(NodeParameter* other); + void UnsafeMergeFrom(const NodeParameter& from); + private: + inline ::google::protobuf::Arena* GetArenaNoVirtual() const { + return _internal_metadata_.arena(); + } + inline void* MaybeArenaPtr() const { + return _internal_metadata_.raw_arena_ptr(); + } + public: + + ::google::protobuf::Metadata GetMetadata() const; + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + // optional string name = 1; + bool has_name() const; + void clear_name(); + static const int kNameFieldNumber = 1; + const ::std::string& name() const; + void set_name(const ::std::string& value); + void set_name(const char* value); + void set_name(const char* value, size_t size); + ::std::string* mutable_name(); + ::std::string* release_name(); + void set_allocated_name(::std::string* name); + + // optional string type = 2; + bool has_type() const; + void clear_type(); + static const int kTypeFieldNumber = 2; + const ::std::string& type() const; + void set_type(const ::std::string& value); + void set_type(const char* value); + void set_type(const char* value, size_t size); + ::std::string* mutable_type(); + ::std::string* release_type(); + void set_allocated_type(::std::string* type); + + // optional string input_type = 3; + bool has_input_type() const; + void clear_input_type(); + static const int kInputTypeFieldNumber = 3; + const ::std::string& input_type() const; + void set_input_type(const ::std::string& value); + void 
set_input_type(const char* value); + void set_input_type(const char* value, size_t size); + ::std::string* mutable_input_type(); + ::std::string* release_input_type(); + void set_allocated_input_type(::std::string* input_type); + + // optional string input_format = 4; + bool has_input_format() const; + void clear_input_format(); + static const int kInputFormatFieldNumber = 4; + const ::std::string& input_format() const; + void set_input_format(const ::std::string& value); + void set_input_format(const char* value); + void set_input_format(const char* value, size_t size); + ::std::string* mutable_input_format(); + ::std::string* release_input_format(); + void set_allocated_input_format(::std::string* input_format); + + // repeated int32 input_dim = 5; + int input_dim_size() const; + void clear_input_dim(); + static const int kInputDimFieldNumber = 5; + ::google::protobuf::int32 input_dim(int index) const; + void set_input_dim(int index, ::google::protobuf::int32 value); + void add_input_dim(::google::protobuf::int32 value); + const ::google::protobuf::RepeatedField< ::google::protobuf::int32 >& + input_dim() const; + ::google::protobuf::RepeatedField< ::google::protobuf::int32 >* + mutable_input_dim(); + + // repeated string input = 6; + int input_size() const; + void clear_input(); + static const int kInputFieldNumber = 6; + const ::std::string& input(int index) const; + ::std::string* mutable_input(int index); + void set_input(int index, const ::std::string& value); + void set_input(int index, const char* value); + void set_input(int index, const char* value, size_t size); + ::std::string* add_input(); + void add_input(const ::std::string& value); + void add_input(const char* value); + void add_input(const char* value, size_t size); + const ::google::protobuf::RepeatedPtrField< ::std::string>& input() const; + ::google::protobuf::RepeatedPtrField< ::std::string>* mutable_input(); + + // repeated string output = 7; + int output_size() const; + void clear_output(); + static const int kOutputFieldNumber = 7; + const ::std::string& output(int index) const; + ::std::string* mutable_output(int index); + void set_output(int index, const ::std::string& value); + void set_output(int index, const char* value); + void set_output(int index, const char* value, size_t size); + ::std::string* add_output(); + void add_output(const ::std::string& value); + void add_output(const char* value); + void add_output(const char* value, size_t size); + const ::google::protobuf::RepeatedPtrField< ::std::string>& output() const; + ::google::protobuf::RepeatedPtrField< ::std::string>* mutable_output(); + + // optional uint32 tmp = 8 [default = 0]; + bool has_tmp() const; + void clear_tmp(); + static const int kTmpFieldNumber = 8; + ::google::protobuf::uint32 tmp() const; + void set_tmp(::google::protobuf::uint32 value); + + // optional string precision = 9 [default = "FLOAT32"]; + bool has_precision() const; + void clear_precision(); + static const int kPrecisionFieldNumber = 9; + const ::std::string& precision() const; + void set_precision(const ::std::string& value); + void set_precision(const char* value); + void set_precision(const char* value, size_t size); + ::std::string* mutable_precision(); + ::std::string* release_precision(); + void set_allocated_precision(::std::string* precision); + + // repeated string infer_output_size_parameter = 10; + int infer_output_size_parameter_size() const; + void clear_infer_output_size_parameter(); + static const int kInferOutputSizeParameterFieldNumber = 10; + const 
::std::string& infer_output_size_parameter(int index) const; + ::std::string* mutable_infer_output_size_parameter(int index); + void set_infer_output_size_parameter(int index, const ::std::string& value); + void set_infer_output_size_parameter(int index, const char* value); + void set_infer_output_size_parameter(int index, const char* value, size_t size); + ::std::string* add_infer_output_size_parameter(); + void add_infer_output_size_parameter(const ::std::string& value); + void add_infer_output_size_parameter(const char* value); + void add_infer_output_size_parameter(const char* value, size_t size); + const ::google::protobuf::RepeatedPtrField< ::std::string>& infer_output_size_parameter() const; + ::google::protobuf::RepeatedPtrField< ::std::string>* mutable_infer_output_size_parameter(); + + // repeated string preprocess_parameter = 11; + int preprocess_parameter_size() const; + void clear_preprocess_parameter(); + static const int kPreprocessParameterFieldNumber = 11; + const ::std::string& preprocess_parameter(int index) const; + ::std::string* mutable_preprocess_parameter(int index); + void set_preprocess_parameter(int index, const ::std::string& value); + void set_preprocess_parameter(int index, const char* value); + void set_preprocess_parameter(int index, const char* value, size_t size); + ::std::string* add_preprocess_parameter(); + void add_preprocess_parameter(const ::std::string& value); + void add_preprocess_parameter(const char* value); + void add_preprocess_parameter(const char* value, size_t size); + const ::google::protobuf::RepeatedPtrField< ::std::string>& preprocess_parameter() const; + ::google::protobuf::RepeatedPtrField< ::std::string>* mutable_preprocess_parameter(); + + // repeated string inference_parameter = 12; + int inference_parameter_size() const; + void clear_inference_parameter(); + static const int kInferenceParameterFieldNumber = 12; + const ::std::string& inference_parameter(int index) const; + ::std::string* mutable_inference_parameter(int index); + void set_inference_parameter(int index, const ::std::string& value); + void set_inference_parameter(int index, const char* value); + void set_inference_parameter(int index, const char* value, size_t size); + ::std::string* add_inference_parameter(); + void add_inference_parameter(const ::std::string& value); + void add_inference_parameter(const char* value); + void add_inference_parameter(const char* value, size_t size); + const ::google::protobuf::RepeatedPtrField< ::std::string>& inference_parameter() const; + ::google::protobuf::RepeatedPtrField< ::std::string>* mutable_inference_parameter(); + + // repeated string postprocess_parameter = 13; + int postprocess_parameter_size() const; + void clear_postprocess_parameter(); + static const int kPostprocessParameterFieldNumber = 13; + const ::std::string& postprocess_parameter(int index) const; + ::std::string* mutable_postprocess_parameter(int index); + void set_postprocess_parameter(int index, const ::std::string& value); + void set_postprocess_parameter(int index, const char* value); + void set_postprocess_parameter(int index, const char* value, size_t size); + ::std::string* add_postprocess_parameter(); + void add_postprocess_parameter(const ::std::string& value); + void add_postprocess_parameter(const char* value); + void add_postprocess_parameter(const char* value, size_t size); + const ::google::protobuf::RepeatedPtrField< ::std::string>& postprocess_parameter() const; + ::google::protobuf::RepeatedPtrField< ::std::string>* 
mutable_postprocess_parameter(); + + // @@protoc_insertion_point(class_scope:flow.NodeParameter) + private: + inline void set_has_name(); + inline void clear_has_name(); + inline void set_has_type(); + inline void clear_has_type(); + inline void set_has_input_type(); + inline void clear_has_input_type(); + inline void set_has_input_format(); + inline void clear_has_input_format(); + inline void set_has_tmp(); + inline void clear_has_tmp(); + inline void set_has_precision(); + inline void clear_has_precision(); + + ::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_; + ::google::protobuf::internal::HasBits<1> _has_bits_; + mutable int _cached_size_; + ::google::protobuf::RepeatedField< ::google::protobuf::int32 > input_dim_; + ::google::protobuf::RepeatedPtrField< ::std::string> input_; + ::google::protobuf::RepeatedPtrField< ::std::string> output_; + ::google::protobuf::RepeatedPtrField< ::std::string> infer_output_size_parameter_; + ::google::protobuf::RepeatedPtrField< ::std::string> preprocess_parameter_; + ::google::protobuf::RepeatedPtrField< ::std::string> inference_parameter_; + ::google::protobuf::RepeatedPtrField< ::std::string> postprocess_parameter_; + ::google::protobuf::internal::ArenaStringPtr name_; + ::google::protobuf::internal::ArenaStringPtr type_; + ::google::protobuf::internal::ArenaStringPtr input_type_; + ::google::protobuf::internal::ArenaStringPtr input_format_; + static ::std::string* _default_precision_; + ::google::protobuf::internal::ArenaStringPtr precision_; + ::google::protobuf::uint32 tmp_; + friend void protobuf_InitDefaults_flow_2eproto_impl(); + friend void protobuf_AddDesc_flow_2eproto_impl(); + friend void protobuf_AssignDesc_flow_2eproto(); + friend void protobuf_ShutdownFile_flow_2eproto(); + + void InitAsDefaultInstance(); +}; +extern ::google::protobuf::internal::ExplicitlyConstructed NodeParameter_default_instance_; + +// =================================================================== + + +// =================================================================== + +#if !PROTOBUF_INLINE_NOT_IN_HEADERS +// GraphParameter + +// optional string name = 1; +inline bool GraphParameter::has_name() const { + return (_has_bits_[0] & 0x00000001u) != 0; +} +inline void GraphParameter::set_has_name() { + _has_bits_[0] |= 0x00000001u; +} +inline void GraphParameter::clear_has_name() { + _has_bits_[0] &= ~0x00000001u; +} +inline void GraphParameter::clear_name() { + name_.ClearToEmptyNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); + clear_has_name(); +} +inline const ::std::string& GraphParameter::name() const { + // @@protoc_insertion_point(field_get:flow.GraphParameter.name) + return name_.GetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); +} +inline void GraphParameter::set_name(const ::std::string& value) { + set_has_name(); + name_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value); + // @@protoc_insertion_point(field_set:flow.GraphParameter.name) +} +inline void GraphParameter::set_name(const char* value) { + set_has_name(); + name_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value)); + // @@protoc_insertion_point(field_set_char:flow.GraphParameter.name) +} +inline void GraphParameter::set_name(const char* value, size_t size) { + set_has_name(); + name_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), + ::std::string(reinterpret_cast(value), size)); + // 
@@protoc_insertion_point(field_set_pointer:flow.GraphParameter.name) +} +inline ::std::string* GraphParameter::mutable_name() { + set_has_name(); + // @@protoc_insertion_point(field_mutable:flow.GraphParameter.name) + return name_.MutableNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); +} +inline ::std::string* GraphParameter::release_name() { + // @@protoc_insertion_point(field_release:flow.GraphParameter.name) + clear_has_name(); + return name_.ReleaseNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); +} +inline void GraphParameter::set_allocated_name(::std::string* name) { + if (name != NULL) { + set_has_name(); + } else { + clear_has_name(); + } + name_.SetAllocatedNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), name); + // @@protoc_insertion_point(field_set_allocated:flow.GraphParameter.name) +} + +// repeated string input = 2; +inline int GraphParameter::input_size() const { + return input_.size(); +} +inline void GraphParameter::clear_input() { + input_.Clear(); +} +inline const ::std::string& GraphParameter::input(int index) const { + // @@protoc_insertion_point(field_get:flow.GraphParameter.input) + return input_.Get(index); +} +inline ::std::string* GraphParameter::mutable_input(int index) { + // @@protoc_insertion_point(field_mutable:flow.GraphParameter.input) + return input_.Mutable(index); +} +inline void GraphParameter::set_input(int index, const ::std::string& value) { + // @@protoc_insertion_point(field_set:flow.GraphParameter.input) + input_.Mutable(index)->assign(value); +} +inline void GraphParameter::set_input(int index, const char* value) { + input_.Mutable(index)->assign(value); + // @@protoc_insertion_point(field_set_char:flow.GraphParameter.input) +} +inline void GraphParameter::set_input(int index, const char* value, size_t size) { + input_.Mutable(index)->assign( + reinterpret_cast(value), size); + // @@protoc_insertion_point(field_set_pointer:flow.GraphParameter.input) +} +inline ::std::string* GraphParameter::add_input() { + // @@protoc_insertion_point(field_add_mutable:flow.GraphParameter.input) + return input_.Add(); +} +inline void GraphParameter::add_input(const ::std::string& value) { + input_.Add()->assign(value); + // @@protoc_insertion_point(field_add:flow.GraphParameter.input) +} +inline void GraphParameter::add_input(const char* value) { + input_.Add()->assign(value); + // @@protoc_insertion_point(field_add_char:flow.GraphParameter.input) +} +inline void GraphParameter::add_input(const char* value, size_t size) { + input_.Add()->assign(reinterpret_cast(value), size); + // @@protoc_insertion_point(field_add_pointer:flow.GraphParameter.input) +} +inline const ::google::protobuf::RepeatedPtrField< ::std::string>& +GraphParameter::input() const { + // @@protoc_insertion_point(field_list:flow.GraphParameter.input) + return input_; +} +inline ::google::protobuf::RepeatedPtrField< ::std::string>* +GraphParameter::mutable_input() { + // @@protoc_insertion_point(field_mutable_list:flow.GraphParameter.input) + return &input_; +} + +// repeated string output = 3; +inline int GraphParameter::output_size() const { + return output_.size(); +} +inline void GraphParameter::clear_output() { + output_.Clear(); +} +inline const ::std::string& GraphParameter::output(int index) const { + // @@protoc_insertion_point(field_get:flow.GraphParameter.output) + return output_.Get(index); +} +inline ::std::string* GraphParameter::mutable_output(int index) { + // 
@@protoc_insertion_point(field_mutable:flow.GraphParameter.output) + return output_.Mutable(index); +} +inline void GraphParameter::set_output(int index, const ::std::string& value) { + // @@protoc_insertion_point(field_set:flow.GraphParameter.output) + output_.Mutable(index)->assign(value); +} +inline void GraphParameter::set_output(int index, const char* value) { + output_.Mutable(index)->assign(value); + // @@protoc_insertion_point(field_set_char:flow.GraphParameter.output) +} +inline void GraphParameter::set_output(int index, const char* value, size_t size) { + output_.Mutable(index)->assign( + reinterpret_cast(value), size); + // @@protoc_insertion_point(field_set_pointer:flow.GraphParameter.output) +} +inline ::std::string* GraphParameter::add_output() { + // @@protoc_insertion_point(field_add_mutable:flow.GraphParameter.output) + return output_.Add(); +} +inline void GraphParameter::add_output(const ::std::string& value) { + output_.Add()->assign(value); + // @@protoc_insertion_point(field_add:flow.GraphParameter.output) +} +inline void GraphParameter::add_output(const char* value) { + output_.Add()->assign(value); + // @@protoc_insertion_point(field_add_char:flow.GraphParameter.output) +} +inline void GraphParameter::add_output(const char* value, size_t size) { + output_.Add()->assign(reinterpret_cast(value), size); + // @@protoc_insertion_point(field_add_pointer:flow.GraphParameter.output) +} +inline const ::google::protobuf::RepeatedPtrField< ::std::string>& +GraphParameter::output() const { + // @@protoc_insertion_point(field_list:flow.GraphParameter.output) + return output_; +} +inline ::google::protobuf::RepeatedPtrField< ::std::string>* +GraphParameter::mutable_output() { + // @@protoc_insertion_point(field_mutable_list:flow.GraphParameter.output) + return &output_; +} + +// repeated .flow.NodeParameter node = 4; +inline int GraphParameter::node_size() const { + return node_.size(); +} +inline void GraphParameter::clear_node() { + node_.Clear(); +} +inline const ::flow::NodeParameter& GraphParameter::node(int index) const { + // @@protoc_insertion_point(field_get:flow.GraphParameter.node) + return node_.Get(index); +} +inline ::flow::NodeParameter* GraphParameter::mutable_node(int index) { + // @@protoc_insertion_point(field_mutable:flow.GraphParameter.node) + return node_.Mutable(index); +} +inline ::flow::NodeParameter* GraphParameter::add_node() { + // @@protoc_insertion_point(field_add:flow.GraphParameter.node) + return node_.Add(); +} +inline ::google::protobuf::RepeatedPtrField< ::flow::NodeParameter >* +GraphParameter::mutable_node() { + // @@protoc_insertion_point(field_mutable_list:flow.GraphParameter.node) + return &node_; +} +inline const ::google::protobuf::RepeatedPtrField< ::flow::NodeParameter >& +GraphParameter::node() const { + // @@protoc_insertion_point(field_list:flow.GraphParameter.node) + return node_; +} + +inline const GraphParameter* GraphParameter::internal_default_instance() { + return &GraphParameter_default_instance_.get(); +} +// ------------------------------------------------------------------- + +// NodeParameter + +// optional string name = 1; +inline bool NodeParameter::has_name() const { + return (_has_bits_[0] & 0x00000001u) != 0; +} +inline void NodeParameter::set_has_name() { + _has_bits_[0] |= 0x00000001u; +} +inline void NodeParameter::clear_has_name() { + _has_bits_[0] &= ~0x00000001u; +} +inline void NodeParameter::clear_name() { + name_.ClearToEmptyNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); + 
clear_has_name(); +} +inline const ::std::string& NodeParameter::name() const { + // @@protoc_insertion_point(field_get:flow.NodeParameter.name) + return name_.GetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); +} +inline void NodeParameter::set_name(const ::std::string& value) { + set_has_name(); + name_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value); + // @@protoc_insertion_point(field_set:flow.NodeParameter.name) +} +inline void NodeParameter::set_name(const char* value) { + set_has_name(); + name_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value)); + // @@protoc_insertion_point(field_set_char:flow.NodeParameter.name) +} +inline void NodeParameter::set_name(const char* value, size_t size) { + set_has_name(); + name_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), + ::std::string(reinterpret_cast(value), size)); + // @@protoc_insertion_point(field_set_pointer:flow.NodeParameter.name) +} +inline ::std::string* NodeParameter::mutable_name() { + set_has_name(); + // @@protoc_insertion_point(field_mutable:flow.NodeParameter.name) + return name_.MutableNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); +} +inline ::std::string* NodeParameter::release_name() { + // @@protoc_insertion_point(field_release:flow.NodeParameter.name) + clear_has_name(); + return name_.ReleaseNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); +} +inline void NodeParameter::set_allocated_name(::std::string* name) { + if (name != NULL) { + set_has_name(); + } else { + clear_has_name(); + } + name_.SetAllocatedNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), name); + // @@protoc_insertion_point(field_set_allocated:flow.NodeParameter.name) +} + +// optional string type = 2; +inline bool NodeParameter::has_type() const { + return (_has_bits_[0] & 0x00000002u) != 0; +} +inline void NodeParameter::set_has_type() { + _has_bits_[0] |= 0x00000002u; +} +inline void NodeParameter::clear_has_type() { + _has_bits_[0] &= ~0x00000002u; +} +inline void NodeParameter::clear_type() { + type_.ClearToEmptyNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); + clear_has_type(); +} +inline const ::std::string& NodeParameter::type() const { + // @@protoc_insertion_point(field_get:flow.NodeParameter.type) + return type_.GetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); +} +inline void NodeParameter::set_type(const ::std::string& value) { + set_has_type(); + type_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value); + // @@protoc_insertion_point(field_set:flow.NodeParameter.type) +} +inline void NodeParameter::set_type(const char* value) { + set_has_type(); + type_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value)); + // @@protoc_insertion_point(field_set_char:flow.NodeParameter.type) +} +inline void NodeParameter::set_type(const char* value, size_t size) { + set_has_type(); + type_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), + ::std::string(reinterpret_cast(value), size)); + // @@protoc_insertion_point(field_set_pointer:flow.NodeParameter.type) +} +inline ::std::string* NodeParameter::mutable_type() { + set_has_type(); + // @@protoc_insertion_point(field_mutable:flow.NodeParameter.type) + return type_.MutableNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); +} +inline ::std::string* 
+
+// optional string type = 2;
+inline bool NodeParameter::has_type() const {
+  return (_has_bits_[0] & 0x00000002u) != 0;
+}
+inline void NodeParameter::set_has_type() {
+  _has_bits_[0] |= 0x00000002u;
+}
+inline void NodeParameter::clear_has_type() {
+  _has_bits_[0] &= ~0x00000002u;
+}
+inline void NodeParameter::clear_type() {
+  type_.ClearToEmptyNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
+  clear_has_type();
+}
+inline const ::std::string& NodeParameter::type() const {
+  // @@protoc_insertion_point(field_get:flow.NodeParameter.type)
+  return type_.GetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
+}
+inline void NodeParameter::set_type(const ::std::string& value) {
+  set_has_type();
+  type_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value);
+  // @@protoc_insertion_point(field_set:flow.NodeParameter.type)
+}
+inline void NodeParameter::set_type(const char* value) {
+  set_has_type();
+  type_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value));
+  // @@protoc_insertion_point(field_set_char:flow.NodeParameter.type)
+}
+inline void NodeParameter::set_type(const char* value, size_t size) {
+  set_has_type();
+  type_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(reinterpret_cast<const char*>(value), size));
+  // @@protoc_insertion_point(field_set_pointer:flow.NodeParameter.type)
+}
+inline ::std::string* NodeParameter::mutable_type() {
+  set_has_type();
+  // @@protoc_insertion_point(field_mutable:flow.NodeParameter.type)
+  return type_.MutableNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
+}
+inline ::std::string* NodeParameter::release_type() {
+  // @@protoc_insertion_point(field_release:flow.NodeParameter.type)
+  clear_has_type();
+  return type_.ReleaseNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
+}
+inline void NodeParameter::set_allocated_type(::std::string* type) {
+  if (type != NULL) {
+    set_has_type();
+  } else {
+    clear_has_type();
+  }
+  type_.SetAllocatedNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), type);
+  // @@protoc_insertion_point(field_set_allocated:flow.NodeParameter.type)
+}
+
+// optional string input_type = 3;
+inline bool NodeParameter::has_input_type() const {
+  return (_has_bits_[0] & 0x00000004u) != 0;
+}
+inline void NodeParameter::set_has_input_type() {
+  _has_bits_[0] |= 0x00000004u;
+}
+inline void NodeParameter::clear_has_input_type() {
+  _has_bits_[0] &= ~0x00000004u;
+}
+inline void NodeParameter::clear_input_type() {
+  input_type_.ClearToEmptyNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
+  clear_has_input_type();
+}
+inline const ::std::string& NodeParameter::input_type() const {
+  // @@protoc_insertion_point(field_get:flow.NodeParameter.input_type)
+  return input_type_.GetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
+}
+inline void NodeParameter::set_input_type(const ::std::string& value) {
+  set_has_input_type();
+  input_type_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value);
+  // @@protoc_insertion_point(field_set:flow.NodeParameter.input_type)
+}
+inline void NodeParameter::set_input_type(const char* value) {
+  set_has_input_type();
+  input_type_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value));
+  // @@protoc_insertion_point(field_set_char:flow.NodeParameter.input_type)
+}
+inline void NodeParameter::set_input_type(const char* value, size_t size) {
+  set_has_input_type();
+  input_type_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(reinterpret_cast<const char*>(value), size));
+  // @@protoc_insertion_point(field_set_pointer:flow.NodeParameter.input_type)
+}
+inline ::std::string* NodeParameter::mutable_input_type() {
+  set_has_input_type();
+  // @@protoc_insertion_point(field_mutable:flow.NodeParameter.input_type)
+  return input_type_.MutableNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
+}
+inline ::std::string* NodeParameter::release_input_type() {
+  // @@protoc_insertion_point(field_release:flow.NodeParameter.input_type)
+  clear_has_input_type();
+  return input_type_.ReleaseNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
+}
+inline void NodeParameter::set_allocated_input_type(::std::string* input_type) {
+  if (input_type != NULL) {
+    set_has_input_type();
+  } else {
+    clear_has_input_type();
+  }
+  input_type_.SetAllocatedNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), input_type);
+  // @@protoc_insertion_point(field_set_allocated:flow.NodeParameter.input_type)
+}
+
+// optional string input_format = 4;
+inline bool NodeParameter::has_input_format() const {
+  return (_has_bits_[0] & 0x00000008u) != 0;
+}
+inline void NodeParameter::set_has_input_format() {
+  _has_bits_[0] |= 0x00000008u;
+}
+inline void NodeParameter::clear_has_input_format() {
+  _has_bits_[0] &= ~0x00000008u;
+}
+inline void NodeParameter::clear_input_format() {
+  input_format_.ClearToEmptyNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
+  clear_has_input_format();
+}
+inline const ::std::string& NodeParameter::input_format() const {
+  // @@protoc_insertion_point(field_get:flow.NodeParameter.input_format)
+  return input_format_.GetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
+}
+inline void NodeParameter::set_input_format(const ::std::string& value) {
+  set_has_input_format();
+  input_format_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value);
+  // @@protoc_insertion_point(field_set:flow.NodeParameter.input_format)
+}
+inline void NodeParameter::set_input_format(const char* value) {
+  set_has_input_format();
+  input_format_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value));
+  // @@protoc_insertion_point(field_set_char:flow.NodeParameter.input_format)
+}
+inline void NodeParameter::set_input_format(const char* value, size_t size) {
+  set_has_input_format();
+  input_format_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(reinterpret_cast<const char*>(value), size));
+  // @@protoc_insertion_point(field_set_pointer:flow.NodeParameter.input_format)
+}
+inline ::std::string* NodeParameter::mutable_input_format() {
+  set_has_input_format();
+  // @@protoc_insertion_point(field_mutable:flow.NodeParameter.input_format)
+  return input_format_.MutableNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
+}
+inline ::std::string* NodeParameter::release_input_format() {
+  // @@protoc_insertion_point(field_release:flow.NodeParameter.input_format)
+  clear_has_input_format();
+  return input_format_.ReleaseNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
+}
+inline void NodeParameter::set_allocated_input_format(::std::string* input_format) {
+  if (input_format != NULL) {
+    set_has_input_format();
+  } else {
+    clear_has_input_format();
+  }
+  input_format_.SetAllocatedNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), input_format);
+  // @@protoc_insertion_point(field_set_allocated:flow.NodeParameter.input_format)
+}
+
+// repeated int32 input_dim = 5;
+inline int NodeParameter::input_dim_size() const {
+  return input_dim_.size();
+}
+inline void NodeParameter::clear_input_dim() {
+  input_dim_.Clear();
+}
+inline ::google::protobuf::int32 NodeParameter::input_dim(int index) const {
+  // @@protoc_insertion_point(field_get:flow.NodeParameter.input_dim)
+  return input_dim_.Get(index);
+}
+inline void NodeParameter::set_input_dim(int index, ::google::protobuf::int32 value) {
+  input_dim_.Set(index, value);
+  // @@protoc_insertion_point(field_set:flow.NodeParameter.input_dim)
+}
+inline void NodeParameter::add_input_dim(::google::protobuf::int32 value) {
+  input_dim_.Add(value);
+  // @@protoc_insertion_point(field_add:flow.NodeParameter.input_dim)
+}
+inline const ::google::protobuf::RepeatedField< ::google::protobuf::int32 >&
+NodeParameter::input_dim() const {
+  // @@protoc_insertion_point(field_list:flow.NodeParameter.input_dim)
+  return input_dim_;
+}
+inline ::google::protobuf::RepeatedField< ::google::protobuf::int32 >*
+NodeParameter::mutable_input_dim() {
+  // @@protoc_insertion_point(field_mutable_list:flow.NodeParameter.input_dim)
+  return &input_dim_;
+}
+
+// repeated string input = 6;
+inline int NodeParameter::input_size() const {
+  return input_.size();
+}
+inline void NodeParameter::clear_input() {
+  input_.Clear();
+}
+inline const ::std::string& NodeParameter::input(int index) const {
+  // @@protoc_insertion_point(field_get:flow.NodeParameter.input)
+  return input_.Get(index);
+}
+inline ::std::string* NodeParameter::mutable_input(int index) {
+  // @@protoc_insertion_point(field_mutable:flow.NodeParameter.input)
+  return input_.Mutable(index);
+}
+inline void NodeParameter::set_input(int index, const ::std::string& value) {
+  // @@protoc_insertion_point(field_set:flow.NodeParameter.input)
+  input_.Mutable(index)->assign(value);
+}
+inline void NodeParameter::set_input(int index, const char* value) {
+  input_.Mutable(index)->assign(value);
+  // @@protoc_insertion_point(field_set_char:flow.NodeParameter.input)
+}
+inline void NodeParameter::set_input(int index, const char* value, size_t size) {
+  input_.Mutable(index)->assign(
+      reinterpret_cast<const char*>(value), size);
+  // @@protoc_insertion_point(field_set_pointer:flow.NodeParameter.input)
+}
+inline ::std::string* NodeParameter::add_input() {
+  // @@protoc_insertion_point(field_add_mutable:flow.NodeParameter.input)
+  return input_.Add();
+}
+inline void NodeParameter::add_input(const ::std::string& value) {
+  input_.Add()->assign(value);
+  // @@protoc_insertion_point(field_add:flow.NodeParameter.input)
+}
+inline void NodeParameter::add_input(const char* value) {
+  input_.Add()->assign(value);
+  // @@protoc_insertion_point(field_add_char:flow.NodeParameter.input)
+}
+inline void NodeParameter::add_input(const char* value, size_t size) {
+  input_.Add()->assign(reinterpret_cast<const char*>(value), size);
+  // @@protoc_insertion_point(field_add_pointer:flow.NodeParameter.input)
+}
+inline const ::google::protobuf::RepeatedPtrField< ::std::string>&
+NodeParameter::input() const {
+  // @@protoc_insertion_point(field_list:flow.NodeParameter.input)
+  return input_;
+}
+inline ::google::protobuf::RepeatedPtrField< ::std::string>*
+NodeParameter::mutable_input() {
+  // @@protoc_insertion_point(field_mutable_list:flow.NodeParameter.input)
+  return &input_;
+}
+
+// repeated string output = 7;
+inline int NodeParameter::output_size() const {
+  return output_.size();
+}
+inline void NodeParameter::clear_output() {
+  output_.Clear();
+}
+inline const ::std::string& NodeParameter::output(int index) const {
+  // @@protoc_insertion_point(field_get:flow.NodeParameter.output)
+  return output_.Get(index);
+}
+inline ::std::string* NodeParameter::mutable_output(int index) {
+  // @@protoc_insertion_point(field_mutable:flow.NodeParameter.output)
+  return output_.Mutable(index);
+}
+inline void NodeParameter::set_output(int index, const ::std::string& value) {
+  // @@protoc_insertion_point(field_set:flow.NodeParameter.output)
+  output_.Mutable(index)->assign(value);
+}
+inline void NodeParameter::set_output(int index, const char* value) {
+  output_.Mutable(index)->assign(value);
+  // @@protoc_insertion_point(field_set_char:flow.NodeParameter.output)
+}
+inline void NodeParameter::set_output(int index, const char* value, size_t size) {
+  output_.Mutable(index)->assign(
+      reinterpret_cast<const char*>(value), size);
+  // @@protoc_insertion_point(field_set_pointer:flow.NodeParameter.output)
+}
+inline ::std::string* NodeParameter::add_output() {
+  // @@protoc_insertion_point(field_add_mutable:flow.NodeParameter.output)
+  return output_.Add();
+}
+inline void NodeParameter::add_output(const ::std::string& value) {
+  output_.Add()->assign(value);
+  // @@protoc_insertion_point(field_add:flow.NodeParameter.output)
+}
+inline void NodeParameter::add_output(const char* value) {
+  output_.Add()->assign(value);
+  // @@protoc_insertion_point(field_add_char:flow.NodeParameter.output)
+}
+inline void NodeParameter::add_output(const char* value, size_t size) {
+  output_.Add()->assign(reinterpret_cast<const char*>(value), size);
+  // @@protoc_insertion_point(field_add_pointer:flow.NodeParameter.output)
+}
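// A usage sketch for the repeated-field accessors above (illustrative only):
//
//   flow::NodeParameter node;
//   node.add_input("tensor_in");
//   node.add_output("tensor_out");
//   node.add_input_dim(1);                    // repeated int32 field
//   for (int i = 0; i < node.input_size(); ++i) {
//     const ::std::string& in = node.input(i);
//   }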
+inline const ::google::protobuf::RepeatedPtrField< ::std::string>&
+NodeParameter::output() const {
+  // @@protoc_insertion_point(field_list:flow.NodeParameter.output)
+  return output_;
+}
+inline ::google::protobuf::RepeatedPtrField< ::std::string>*
+NodeParameter::mutable_output() {
+  // @@protoc_insertion_point(field_mutable_list:flow.NodeParameter.output)
+  return &output_;
+}
+
+// optional uint32 tmp = 8 [default = 0];
+inline bool NodeParameter::has_tmp() const {
+  return (_has_bits_[0] & 0x00000080u) != 0;
+}
+inline void NodeParameter::set_has_tmp() {
+  _has_bits_[0] |= 0x00000080u;
+}
+inline void NodeParameter::clear_has_tmp() {
+  _has_bits_[0] &= ~0x00000080u;
+}
+inline void NodeParameter::clear_tmp() {
+  tmp_ = 0u;
+  clear_has_tmp();
+}
+inline ::google::protobuf::uint32 NodeParameter::tmp() const {
+  // @@protoc_insertion_point(field_get:flow.NodeParameter.tmp)
+  return tmp_;
+}
+inline void NodeParameter::set_tmp(::google::protobuf::uint32 value) {
+  set_has_tmp();
+  tmp_ = value;
+  // @@protoc_insertion_point(field_set:flow.NodeParameter.tmp)
+}
+
+// optional string precision = 9 [default = "FLOAT32"];
+inline bool NodeParameter::has_precision() const {
+  return (_has_bits_[0] & 0x00000100u) != 0;
+}
+inline void NodeParameter::set_has_precision() {
+  _has_bits_[0] |= 0x00000100u;
+}
+inline void NodeParameter::clear_has_precision() {
+  _has_bits_[0] &= ~0x00000100u;
+}
+inline void NodeParameter::clear_precision() {
+  precision_.ClearToDefaultNoArena(_default_precision_);
+  clear_has_precision();
+}
+inline const ::std::string& NodeParameter::precision() const {
+  // @@protoc_insertion_point(field_get:flow.NodeParameter.precision)
+  return precision_.GetNoArena(_default_precision_);
+}
+inline void NodeParameter::set_precision(const ::std::string& value) {
+  set_has_precision();
+  precision_.SetNoArena(_default_precision_, value);
+  // @@protoc_insertion_point(field_set:flow.NodeParameter.precision)
+}
+inline void NodeParameter::set_precision(const char* value) {
+  set_has_precision();
+  precision_.SetNoArena(_default_precision_, ::std::string(value));
+  // @@protoc_insertion_point(field_set_char:flow.NodeParameter.precision)
+}
+inline void NodeParameter::set_precision(const char* value, size_t size) {
+  set_has_precision();
+  precision_.SetNoArena(_default_precision_,
+      ::std::string(reinterpret_cast<const char*>(value), size));
+  // @@protoc_insertion_point(field_set_pointer:flow.NodeParameter.precision)
+}
+inline ::std::string* NodeParameter::mutable_precision() {
+  set_has_precision();
+  // @@protoc_insertion_point(field_mutable:flow.NodeParameter.precision)
+  return precision_.MutableNoArena(_default_precision_);
+}
+inline ::std::string* NodeParameter::release_precision() {
+  // @@protoc_insertion_point(field_release:flow.NodeParameter.precision)
+  clear_has_precision();
+  return precision_.ReleaseNoArena(_default_precision_);
+}
+inline void NodeParameter::set_allocated_precision(::std::string* precision) {
+  if (precision != NULL) {
+    set_has_precision();
+  } else {
+    clear_has_precision();
+  }
+  precision_.SetAllocatedNoArena(_default_precision_, precision);
+  // @@protoc_insertion_point(field_set_allocated:flow.NodeParameter.precision)
+}
+
+// repeated string infer_output_size_parameter = 10;
+inline int NodeParameter::infer_output_size_parameter_size() const {
+  return infer_output_size_parameter_.size();
+}
+inline void NodeParameter::clear_infer_output_size_parameter() {
+  infer_output_size_parameter_.Clear();
+}
+inline const ::std::string& NodeParameter::infer_output_size_parameter(int index) const {
+  // @@protoc_insertion_point(field_get:flow.NodeParameter.infer_output_size_parameter)
+  return infer_output_size_parameter_.Get(index);
+}
+inline ::std::string* NodeParameter::mutable_infer_output_size_parameter(int index) {
+  // @@protoc_insertion_point(field_mutable:flow.NodeParameter.infer_output_size_parameter)
+  return infer_output_size_parameter_.Mutable(index);
+}
+inline void NodeParameter::set_infer_output_size_parameter(int index, const ::std::string& value) {
+  // @@protoc_insertion_point(field_set:flow.NodeParameter.infer_output_size_parameter)
+  infer_output_size_parameter_.Mutable(index)->assign(value);
+}
+inline void NodeParameter::set_infer_output_size_parameter(int index, const char* value) {
+  infer_output_size_parameter_.Mutable(index)->assign(value);
+  // @@protoc_insertion_point(field_set_char:flow.NodeParameter.infer_output_size_parameter)
+}
+inline void NodeParameter::set_infer_output_size_parameter(int index, const char* value, size_t size) {
+  infer_output_size_parameter_.Mutable(index)->assign(
+      reinterpret_cast<const char*>(value), size);
+  // @@protoc_insertion_point(field_set_pointer:flow.NodeParameter.infer_output_size_parameter)
+}
+inline ::std::string* NodeParameter::add_infer_output_size_parameter() {
+  // @@protoc_insertion_point(field_add_mutable:flow.NodeParameter.infer_output_size_parameter)
+  return infer_output_size_parameter_.Add();
+}
+inline void NodeParameter::add_infer_output_size_parameter(const ::std::string& value) {
+  infer_output_size_parameter_.Add()->assign(value);
+  // @@protoc_insertion_point(field_add:flow.NodeParameter.infer_output_size_parameter)
+}
+inline void NodeParameter::add_infer_output_size_parameter(const char* value) {
+  infer_output_size_parameter_.Add()->assign(value);
+  // @@protoc_insertion_point(field_add_char:flow.NodeParameter.infer_output_size_parameter)
+}
+inline void NodeParameter::add_infer_output_size_parameter(const char* value, size_t size) {
+  infer_output_size_parameter_.Add()->assign(reinterpret_cast<const char*>(value), size);
+  // @@protoc_insertion_point(field_add_pointer:flow.NodeParameter.infer_output_size_parameter)
+}
+inline const ::google::protobuf::RepeatedPtrField< ::std::string>&
+NodeParameter::infer_output_size_parameter() const {
+  // @@protoc_insertion_point(field_list:flow.NodeParameter.infer_output_size_parameter)
+  return infer_output_size_parameter_;
+}
+inline ::google::protobuf::RepeatedPtrField< ::std::string>*
+NodeParameter::mutable_infer_output_size_parameter() {
+  // @@protoc_insertion_point(field_mutable_list:flow.NodeParameter.infer_output_size_parameter)
+  return &infer_output_size_parameter_;
+}
+
+// repeated string preprocess_parameter = 11;
+inline int NodeParameter::preprocess_parameter_size() const {
+  return preprocess_parameter_.size();
+}
+inline void NodeParameter::clear_preprocess_parameter() {
+  preprocess_parameter_.Clear();
+}
+inline const ::std::string& NodeParameter::preprocess_parameter(int index) const {
+  // @@protoc_insertion_point(field_get:flow.NodeParameter.preprocess_parameter)
+  return preprocess_parameter_.Get(index);
+}
+inline ::std::string* NodeParameter::mutable_preprocess_parameter(int index) {
+  // @@protoc_insertion_point(field_mutable:flow.NodeParameter.preprocess_parameter)
+  return preprocess_parameter_.Mutable(index);
+}
+inline void NodeParameter::set_preprocess_parameter(int index, const ::std::string& value) {
+  // @@protoc_insertion_point(field_set:flow.NodeParameter.preprocess_parameter)
+  preprocess_parameter_.Mutable(index)->assign(value);
+}
+inline void NodeParameter::set_preprocess_parameter(int index, const char* value) {
+  preprocess_parameter_.Mutable(index)->assign(value);
+  // @@protoc_insertion_point(field_set_char:flow.NodeParameter.preprocess_parameter)
+}
+inline void NodeParameter::set_preprocess_parameter(int index, const char* value, size_t size) {
+  preprocess_parameter_.Mutable(index)->assign(
+      reinterpret_cast<const char*>(value), size);
+  // @@protoc_insertion_point(field_set_pointer:flow.NodeParameter.preprocess_parameter)
+}
+inline ::std::string* NodeParameter::add_preprocess_parameter() {
+  // @@protoc_insertion_point(field_add_mutable:flow.NodeParameter.preprocess_parameter)
+  return preprocess_parameter_.Add();
+}
+inline void NodeParameter::add_preprocess_parameter(const ::std::string& value) {
+  preprocess_parameter_.Add()->assign(value);
+  // @@protoc_insertion_point(field_add:flow.NodeParameter.preprocess_parameter)
+}
+inline void NodeParameter::add_preprocess_parameter(const char* value) {
+  preprocess_parameter_.Add()->assign(value);
+  // @@protoc_insertion_point(field_add_char:flow.NodeParameter.preprocess_parameter)
+}
+inline void NodeParameter::add_preprocess_parameter(const char* value, size_t size) {
+  preprocess_parameter_.Add()->assign(reinterpret_cast<const char*>(value), size);
+  // @@protoc_insertion_point(field_add_pointer:flow.NodeParameter.preprocess_parameter)
+}
+inline const ::google::protobuf::RepeatedPtrField< ::std::string>&
+NodeParameter::preprocess_parameter() const {
+  // @@protoc_insertion_point(field_list:flow.NodeParameter.preprocess_parameter)
+  return preprocess_parameter_;
+}
+inline ::google::protobuf::RepeatedPtrField< ::std::string>*
+NodeParameter::mutable_preprocess_parameter() {
+  // @@protoc_insertion_point(field_mutable_list:flow.NodeParameter.preprocess_parameter)
+  return &preprocess_parameter_;
+}
+
+// repeated string inference_parameter = 12;
+inline int NodeParameter::inference_parameter_size() const {
+  return inference_parameter_.size();
+}
+inline void NodeParameter::clear_inference_parameter() {
+  inference_parameter_.Clear();
+}
+inline const ::std::string& NodeParameter::inference_parameter(int index) const {
+  // @@protoc_insertion_point(field_get:flow.NodeParameter.inference_parameter)
+  return inference_parameter_.Get(index);
+}
+inline ::std::string* NodeParameter::mutable_inference_parameter(int index) {
+  // @@protoc_insertion_point(field_mutable:flow.NodeParameter.inference_parameter)
+  return inference_parameter_.Mutable(index);
+}
+inline void NodeParameter::set_inference_parameter(int index, const ::std::string& value) {
+  // @@protoc_insertion_point(field_set:flow.NodeParameter.inference_parameter)
+  inference_parameter_.Mutable(index)->assign(value);
+}
+inline void NodeParameter::set_inference_parameter(int index, const char* value) {
+  inference_parameter_.Mutable(index)->assign(value);
+  // @@protoc_insertion_point(field_set_char:flow.NodeParameter.inference_parameter)
+}
+inline void NodeParameter::set_inference_parameter(int index, const char* value, size_t size) {
+  inference_parameter_.Mutable(index)->assign(
+      reinterpret_cast<const char*>(value), size);
+  // @@protoc_insertion_point(field_set_pointer:flow.NodeParameter.inference_parameter)
+}
+inline ::std::string* NodeParameter::add_inference_parameter() {
+  // @@protoc_insertion_point(field_add_mutable:flow.NodeParameter.inference_parameter)
+  return inference_parameter_.Add();
+}
+inline void NodeParameter::add_inference_parameter(const ::std::string& value) {
+  inference_parameter_.Add()->assign(value);
+  // @@protoc_insertion_point(field_add:flow.NodeParameter.inference_parameter)
+}
+inline void NodeParameter::add_inference_parameter(const char* value) {
+  inference_parameter_.Add()->assign(value);
+  // @@protoc_insertion_point(field_add_char:flow.NodeParameter.inference_parameter)
+}
+inline void NodeParameter::add_inference_parameter(const char* value, size_t size) {
+  inference_parameter_.Add()->assign(reinterpret_cast<const char*>(value), size);
+  // @@protoc_insertion_point(field_add_pointer:flow.NodeParameter.inference_parameter)
+}
+inline const ::google::protobuf::RepeatedPtrField< ::std::string>&
+NodeParameter::inference_parameter() const {
+  // @@protoc_insertion_point(field_list:flow.NodeParameter.inference_parameter)
+  return inference_parameter_;
+}
+inline ::google::protobuf::RepeatedPtrField< ::std::string>*
+NodeParameter::mutable_inference_parameter() {
+  // @@protoc_insertion_point(field_mutable_list:flow.NodeParameter.inference_parameter)
+  return &inference_parameter_;
+}
+
+// repeated string postprocess_parameter = 13;
+inline int NodeParameter::postprocess_parameter_size() const {
+  return postprocess_parameter_.size();
+}
+inline void NodeParameter::clear_postprocess_parameter() {
+  postprocess_parameter_.Clear();
+}
+inline const ::std::string& NodeParameter::postprocess_parameter(int index) const {
+  // @@protoc_insertion_point(field_get:flow.NodeParameter.postprocess_parameter)
+  return postprocess_parameter_.Get(index);
+}
+inline ::std::string* NodeParameter::mutable_postprocess_parameter(int index) {
+  // @@protoc_insertion_point(field_mutable:flow.NodeParameter.postprocess_parameter)
+  return postprocess_parameter_.Mutable(index);
+}
+inline void NodeParameter::set_postprocess_parameter(int index, const ::std::string& value) {
+  // @@protoc_insertion_point(field_set:flow.NodeParameter.postprocess_parameter)
+  postprocess_parameter_.Mutable(index)->assign(value);
+}
+inline void NodeParameter::set_postprocess_parameter(int index, const char* value) {
+  postprocess_parameter_.Mutable(index)->assign(value);
+  // @@protoc_insertion_point(field_set_char:flow.NodeParameter.postprocess_parameter)
+}
+inline void NodeParameter::set_postprocess_parameter(int index, const char* value, size_t size) {
+  postprocess_parameter_.Mutable(index)->assign(
+      reinterpret_cast<const char*>(value), size);
+  // @@protoc_insertion_point(field_set_pointer:flow.NodeParameter.postprocess_parameter)
+}
+inline ::std::string* NodeParameter::add_postprocess_parameter() {
+  // @@protoc_insertion_point(field_add_mutable:flow.NodeParameter.postprocess_parameter)
+  return postprocess_parameter_.Add();
+}
+inline void NodeParameter::add_postprocess_parameter(const ::std::string& value) {
+  postprocess_parameter_.Add()->assign(value);
+  // @@protoc_insertion_point(field_add:flow.NodeParameter.postprocess_parameter)
+}
+inline void NodeParameter::add_postprocess_parameter(const char* value) {
+  postprocess_parameter_.Add()->assign(value);
+  // @@protoc_insertion_point(field_add_char:flow.NodeParameter.postprocess_parameter)
+}
+inline void NodeParameter::add_postprocess_parameter(const char* value, size_t size) {
+  postprocess_parameter_.Add()->assign(reinterpret_cast<const char*>(value), size);
+  // @@protoc_insertion_point(field_add_pointer:flow.NodeParameter.postprocess_parameter)
+} +inline const ::google::protobuf::RepeatedPtrField< ::std::string>& +NodeParameter::postprocess_parameter() const { + // @@protoc_insertion_point(field_list:flow.NodeParameter.postprocess_parameter) + return postprocess_parameter_; +} +inline ::google::protobuf::RepeatedPtrField< ::std::string>* +NodeParameter::mutable_postprocess_parameter() { + // @@protoc_insertion_point(field_mutable_list:flow.NodeParameter.postprocess_parameter) + return &postprocess_parameter_; +} + +inline const NodeParameter* NodeParameter::internal_default_instance() { + return &NodeParameter_default_instance_.get(); +} +#endif // !PROTOBUF_INLINE_NOT_IN_HEADERS +// ------------------------------------------------------------------- + + +// @@protoc_insertion_point(namespace_scope) + +} // namespace flow + +// @@protoc_insertion_point(global_scope) + +#endif // PROTOBUF_flow_2eproto__INCLUDED diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/kit_flags.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/kit_flags.h new file mode 100644 index 00000000..e5d64274 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/kit_flags.h @@ -0,0 +1,26 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _KIT_FLAGS_H +#define _KIT_FLAGS_H + +#define _USE_XCODE +#define _USE_IOS +#define _USE_NEON +#define __aarch64__ +#define _USE_FP32 +#define _USE_FP16 +#define _USE_INT8 +//#define _THREAD_SAFE + +#endif diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/arena.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/arena.h new file mode 100644 index 00000000..774b9a97 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/arena.h @@ -0,0 +1,930 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This file defines an Arena allocator for better allocation performance.
+
+#ifndef GOOGLE_PROTOBUF_ARENA_H__
+#define GOOGLE_PROTOBUF_ARENA_H__
+
+#include <limits>
+#ifdef max
+#undef max  // Visual Studio defines this macro
+#endif
+#if __cplusplus >= 201103L
+#include "type_traits.h"
+#endif
+#if defined(_MSC_VER) && !_HAS_EXCEPTIONS
+// Work around bugs in MSVC <typeinfo> header when _HAS_EXCEPTIONS=0.
+#include <exception>
+#include <typeinfo>
+namespace std {
+using type_info = ::type_info;
+}
+#else
+#include <typeinfo>
+#endif
+
+#include "atomic_sequence_num.h"
+#include "atomicops.h"
+#include "common.h"
+#include "logging.h"
+#include "mutex.h"
+#include "type_traits.h"
+
+
+namespace google {
+namespace protobuf {
+
+class Arena;    // defined below
+class Message;  // message.h
+
+namespace internal {
+class ArenaString;  // arenastring.h
+class LazyField;    // lazy_field.h
+
+template <typename Type>
+class GenericTypeHandler;  // repeated_field.h
+
+// Templated cleanup methods.
+template <typename T>
+void arena_destruct_object(void* object) {
+  reinterpret_cast<T*>(object)->~T();
+}
+template <typename T>
+void arena_delete_object(void* object) {
+  delete reinterpret_cast<T*>(object);
+}
+inline void arena_free(void* object, size_t size) {
+#if defined(__GXX_DELETE_WITH_SIZE__) || defined(__cpp_sized_deallocation)
+  ::operator delete(object, size);
+#else
+  ::operator delete(object);
+#endif
+}
+
+}  // namespace internal
+
+// ArenaOptions provides optional additional parameters to arena construction
+// that control its block-allocation behavior.
+struct ArenaOptions {
+  // This defines the size of the first block requested from the system malloc.
+  // Subsequent block sizes will increase in a geometric series up to a maximum.
+  size_t start_block_size;
+
+  // This defines the maximum block size requested from system malloc (unless an
+  // individual arena allocation request occurs with a size larger than this
+  // maximum). Requested block sizes increase up to this value, then remain
+  // here.
+  size_t max_block_size;
+
+  // An initial block of memory for the arena to use, or NULL for none. If
+  // provided, the block must live at least as long as the arena itself. The
+  // creator of the Arena retains ownership of the block after the Arena is
+  // destroyed.
+  char* initial_block;
+
+  // The size of the initial block, if provided.
+  size_t initial_block_size;
+
+  // A function pointer to an alloc method that returns memory blocks of size
+  // requested. By default, it contains a ptr to the malloc function.
+  //
+  // NOTE: block_alloc and dealloc functions are expected to behave like
+  // malloc and free, including Asan poisoning.
+  void* (*block_alloc)(size_t);
+  // A function pointer to a dealloc method that takes ownership of the blocks
+  // from the arena. By default, it contains a ptr to a wrapper function that
+  // calls free.
+  void (*block_dealloc)(void*, size_t);
+
+  // Hooks for adding external functionality such as user-specific metrics
+  // collection, specific debugging abilities, etc.
+  // Init hook may return a pointer to a cookie to be stored in the arena.
+  // reset and destruction hooks will then be called with the same cookie
+  // pointer. This allows us to save an external object per arena instance and
+  // use it on the other hooks (Note: It is just as legal for init to return
+  // NULL and not use the cookie feature).
+  // on_arena_reset and on_arena_destruction also receive the space used in
+  // the arena just before the reset.
+  void* (*on_arena_init)(Arena* arena);
+  void (*on_arena_reset)(Arena* arena, void* cookie, uint64 space_used);
+  void (*on_arena_destruction)(Arena* arena, void* cookie, uint64 space_used);
+
+  // type_info is promised to be static - its lifetime extends to
+  // match program's lifetime (It is given by typeid operator).
+  // Note: typeid(void) will be passed as allocated_type every time we
+  // intentionally want to avoid monitoring an allocation. (i.e. internal
+  // allocations for managing the arena)
+  void (*on_arena_allocation)(const std::type_info* allocated_type,
+                              uint64 alloc_size, void* cookie);
+
+  ArenaOptions()
+      : start_block_size(kDefaultStartBlockSize),
+        max_block_size(kDefaultMaxBlockSize),
+        initial_block(NULL),
+        initial_block_size(0),
+        block_alloc(&::operator new),
+        block_dealloc(&internal::arena_free),
+        on_arena_init(NULL),
+        on_arena_reset(NULL),
+        on_arena_destruction(NULL),
+        on_arena_allocation(NULL) {}
+
+ private:
+  // Constants define default starting block size and max block size for
+  // arena allocator behavior -- see descriptions above.
+  static const size_t kDefaultStartBlockSize = 256;
+  static const size_t kDefaultMaxBlockSize = 8192;
+};
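// A minimal sketch of configuring an arena through ArenaOptions (the sizes
// and the stack buffer are illustrative, not recommendations):
//
//   google::protobuf::ArenaOptions options;
//   options.start_block_size = 1024;        // first block requested
//   options.max_block_size = 64 * 1024;     // cap for the geometric growth
//   char initial[4096];
//   options.initial_block = initial;        // used before any malloc
//   options.initial_block_size = sizeof(initial);
//   google::protobuf::Arena arena(options);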
+
+// Support for non-RTTI environments. (The metrics hooks API uses type
+// information.)
+#ifndef GOOGLE_PROTOBUF_NO_RTTI
+#define RTTI_TYPE_ID(type) (&typeid(type))
+#else
+#define RTTI_TYPE_ID(type) (NULL)
+#endif
+
+// Arena allocator. Arena allocation replaces ordinary (heap-based) allocation
+// with new/delete, and improves performance by aggregating allocations into
+// larger blocks and freeing allocations all at once. Protocol messages are
+// allocated on an arena by using Arena::CreateMessage<T>(Arena*), below, and
+// are automatically freed when the arena is destroyed.
+//
+// This is a thread-safe implementation: multiple threads may allocate from the
+// arena concurrently. Destruction is not thread-safe and the destructing
+// thread must synchronize with users of the arena first.
+//
+// An arena provides two allocation interfaces: CreateMessage<T>, which works
+// for arena-enabled proto2 message types as well as other types that satisfy
+// the appropriate protocol (described below), and Create<T>, which works for
+// any arbitrary type T. CreateMessage<T> is better when the type T supports it,
+// because this interface (i) passes the arena pointer to the created object so
+// that its sub-objects and internal allocations can use the arena too, and (ii)
+// elides the object's destructor call when possible. Create<T> does not place
+// any special requirements on the type T, and will invoke the object's
+// destructor when the arena is destroyed.
+//
+// The arena message allocation protocol, required by CreateMessage<T>, is as
+// follows:
+//
+// - The type T must have (at least) two constructors: a constructor with no
+// arguments, called when a T is allocated on the heap; and a constructor with
+// a google::protobuf::Arena* argument, called when a T is allocated on an arena. If the
+// second constructor is called with a NULL arena pointer, it must be
+// equivalent to invoking the first (no-argument) constructor.
+//
+// - The type T must have a particular type trait: a nested type
+// |InternalArenaConstructable_|. This is usually a typedef to |void|. If no
+// such type trait exists, then the instantiation CreateMessage<T> will fail
+// to compile.
+//
+// - The type T *may* have the type trait |DestructorSkippable_|. If this type
+// trait is present in the type, then its destructor will not be called if and
+// only if it was passed a non-NULL arena pointer. If this type trait is not
+// present on the type, then its destructor is always called when the
+// containing arena is destroyed.
+//
+// - One- and two-user-argument forms of CreateMessage() also exist that
+// forward these constructor arguments to T's constructor: for example,
+// CreateMessage<T>(Arena*, arg1, arg2) forwards to a constructor T(Arena*,
+// arg1, arg2).
+//
+// This protocol is implemented by all arena-enabled proto2 message classes as
+// well as RepeatedPtrField.
+//
+// Do NOT subclass Arena. This class will be marked as final when C++11 is
+// enabled.
+class LIBPROTOBUF_EXPORT Arena {
+ public:
+  // Arena constructor taking custom options. See ArenaOptions below for
+  // descriptions of the options available.
+  explicit Arena(const ArenaOptions& options) : options_(options) {
+    Init();
+  }
+
+  // Default constructor with sensible default options, tuned for average
+  // use-cases.
+  Arena() {
+    Init();
+  }
+
+  // Destructor deletes all owned heap allocated objects, and destructs objects
+  // that have non-trivial destructors, except for proto2 message objects whose
+  // destructors can be skipped. Also, frees all blocks except the initial block
+  // if it was passed in.
+  ~Arena();
+
+  // API to create proto2 message objects on the arena. If the arena passed in
+  // is NULL, then a heap allocated object is returned. Type T must be a message
+  // defined in a .proto file with cc_enable_arenas set to true, otherwise a
+  // compilation error will occur.
+  //
+  // RepeatedField and RepeatedPtrField may also be instantiated directly on an
+  // arena with this method.
+  //
+  // This function also accepts any type T that satisfies the arena message
+  // allocation protocol, documented above.
+  template <typename T> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  static T* CreateMessage(::google::protobuf::Arena* arena) {
+    if (arena == NULL) {
+      return new T;
+    } else {
+      return arena->CreateMessageInternal<T>(static_cast<T*>(0));
+    }
+  }
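// Usage sketch; MyMessage stands for any message type generated with
// cc_enable_arenas = true (no such type is defined in this header):
//
//   google::protobuf::Arena arena;
//   MyMessage* msg =
//       google::protobuf::Arena::CreateMessage<MyMessage>(&arena);
//   // msg is freed (and its destructor possibly skipped) when arena dies.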
+
+  // One-argument form of CreateMessage. This is useful for constructing objects
+  // that implement the arena message construction protocol described above but
+  // take additional constructor arguments.
+  template <typename T, typename Arg> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  static T* CreateMessage(::google::protobuf::Arena* arena, const Arg& arg) {
+    if (arena == NULL) {
+      return new T(NULL, arg);
+    } else {
+      return arena->CreateMessageInternal<T>(static_cast<T*>(0),
+                                             arg);
+    }
+  }
+
+  // Two-argument form of CreateMessage. This is useful for constructing objects
+  // that implement the arena message construction protocol described above but
+  // take additional constructor arguments.
+  template <typename T, typename Arg1, typename Arg2> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  static T* CreateMessage(::google::protobuf::Arena* arena,
+                          const Arg1& arg1,
+                          const Arg2& arg2) {
+    if (arena == NULL) {
+      return new T(NULL, arg1, arg2);
+    } else {
+      return arena->CreateMessageInternal<T>(static_cast<T*>(0),
+                                             arg1, arg2);
+    }
+  }
+
+  // API to create any objects on the arena. Note that only the object will
+  // be created on the arena; the underlying ptrs (in case of a proto2 message)
+  // will still be heap allocated. Proto messages should usually be allocated
+  // with CreateMessage<T>() instead.
+  //
+  // Note that even if T satisfies the arena message construction protocol
+  // (InternalArenaConstructable_ trait and optional DestructorSkippable_
+  // trait), as described above, this function does not follow the protocol;
+  // instead, it treats T as a black-box type, just as if it did not have these
+  // traits. Specifically, T's constructor arguments will always be only those
+  // passed to Create<T>() -- no additional arena pointer is implicitly added.
+  // Furthermore, the destructor will always be called at arena destruction time
+  // (unless the destructor is trivial). Hence, from T's point of view, it is as
+  // if the object were allocated on the heap (except that the underlying memory
+  // is obtained from the arena).
+  template <typename T> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  static T* Create(::google::protobuf::Arena* arena) {
+    if (arena == NULL) {
+      return new T();
+    } else {
+      return arena->CreateInternal<T>(google::protobuf::internal::has_trivial_destructor<T>::value);
+    }
+  }
+
+  // Version of the above with one constructor argument for the created object.
+  template <typename T, typename Arg> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  static T* Create(::google::protobuf::Arena* arena, const Arg& arg) {
+    if (arena == NULL) {
+      return new T(arg);
+    } else {
+      return arena->CreateInternal<T>(google::protobuf::internal::has_trivial_destructor<T>::value,
+                                      arg);
+    }
+  }
+
+  // Version of the above with two constructor arguments for the created object.
+  template <typename T, typename Arg1, typename Arg2> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  static T* Create(::google::protobuf::Arena* arena, const Arg1& arg1, const Arg2& arg2) {
+    if (arena == NULL) {
+      return new T(arg1, arg2);
+    } else {
+      return arena->CreateInternal<T>(google::protobuf::internal::has_trivial_destructor<T>::value,
+                                      arg1, arg2);
+    }
+  }
+
+  // Version of the above with three constructor arguments for the created
+  // object.
+  template <typename T, typename Arg1, typename Arg2, typename Arg3>
+  GOOGLE_ATTRIBUTE_ALWAYS_INLINE static T* Create(::google::protobuf::Arena* arena,
+                                                  const Arg1& arg1, const Arg2& arg2,
+                                                  const Arg3& arg3) {
+    if (arena == NULL) {
+      return new T(arg1, arg2, arg3);
+    } else {
+      return arena->CreateInternal<T>(google::protobuf::internal::has_trivial_destructor<T>::value,
+                                      arg1, arg2, arg3);
+    }
+  }
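// Usage sketch for Create<T> with an arbitrary non-message type:
//
//   google::protobuf::Arena arena;
//   std::string* s =
//       google::protobuf::Arena::Create<std::string>(&arena, "hello");
//   // The object itself lives in arena memory; its destructor is
//   // registered and runs when the arena is destroyed.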
+
+  // Version of the above with four constructor arguments for the created
+  // object.
+  template <typename T, typename Arg1, typename Arg2, typename Arg3,
+            typename Arg4>
+  GOOGLE_ATTRIBUTE_ALWAYS_INLINE static T* Create(::google::protobuf::Arena* arena,
+                                                  const Arg1& arg1, const Arg2& arg2,
+                                                  const Arg3& arg3, const Arg4& arg4) {
+    if (arena == NULL) {
+      return new T(arg1, arg2, arg3, arg4);
+    } else {
+      return arena->CreateInternal<T>(google::protobuf::internal::has_trivial_destructor<T>::value,
+                                      arg1, arg2, arg3, arg4);
+    }
+  }
+
+  // Version of the above with five constructor arguments for the created
+  // object.
+  template <typename T, typename Arg1, typename Arg2, typename Arg3,
+            typename Arg4, typename Arg5>
+  GOOGLE_ATTRIBUTE_ALWAYS_INLINE static T* Create(::google::protobuf::Arena* arena,
+                                                  const Arg1& arg1, const Arg2& arg2,
+                                                  const Arg3& arg3, const Arg4& arg4,
+                                                  const Arg5& arg5) {
+    if (arena == NULL) {
+      return new T(arg1, arg2, arg3, arg4, arg5);
+    } else {
+      return arena->CreateInternal<T>(google::protobuf::internal::has_trivial_destructor<T>::value,
+                                      arg1, arg2, arg3, arg4, arg5);
+    }
+  }
+
+  // Version of the above with six constructor arguments for the created
+  // object.
+  template <typename T, typename Arg1, typename Arg2, typename Arg3,
+            typename Arg4, typename Arg5, typename Arg6>
+  GOOGLE_ATTRIBUTE_ALWAYS_INLINE static T* Create(::google::protobuf::Arena* arena,
+                                                  const Arg1& arg1, const Arg2& arg2,
+                                                  const Arg3& arg3, const Arg4& arg4,
+                                                  const Arg5& arg5, const Arg6& arg6) {
+    if (arena == NULL) {
+      return new T(arg1, arg2, arg3, arg4, arg5, arg6);
+    } else {
+      return arena->CreateInternal<T>(google::protobuf::internal::has_trivial_destructor<T>::value,
+                                      arg1, arg2, arg3, arg4, arg5, arg6);
+    }
+  }
+
+  // Version of the above with seven constructor arguments for the created
+  // object.
+  template <typename T, typename Arg1, typename Arg2, typename Arg3,
+            typename Arg4, typename Arg5, typename Arg6, typename Arg7>
+  GOOGLE_ATTRIBUTE_ALWAYS_INLINE static T* Create(::google::protobuf::Arena* arena,
+                                                  const Arg1& arg1, const Arg2& arg2,
+                                                  const Arg3& arg3, const Arg4& arg4,
+                                                  const Arg5& arg5, const Arg6& arg6,
+                                                  const Arg7& arg7) {
+    if (arena == NULL) {
+      return new T(arg1, arg2, arg3, arg4, arg5, arg6, arg7);
+    } else {
+      return arena->CreateInternal<T>(google::protobuf::internal::has_trivial_destructor<T>::value,
+                                      arg1, arg2, arg3, arg4, arg5, arg6, arg7);
+    }
+  }
+
+  // Version of the above with eight constructor arguments for the created
+  // object.
+  template <typename T, typename Arg1, typename Arg2, typename Arg3,
+            typename Arg4, typename Arg5, typename Arg6, typename Arg7,
+            typename Arg8>
+  GOOGLE_ATTRIBUTE_ALWAYS_INLINE static T* Create(::google::protobuf::Arena* arena,
+                                                  const Arg1& arg1, const Arg2& arg2,
+                                                  const Arg3& arg3, const Arg4& arg4,
+                                                  const Arg5& arg5, const Arg6& arg6,
+                                                  const Arg7& arg7, const Arg8& arg8) {
+    if (arena == NULL) {
+      return new T(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8);
+    } else {
+      return arena->CreateInternal<T>(
+          google::protobuf::internal::has_trivial_destructor<T>::value,
+          arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8);
+    }
+  }
+
+  // Create an array of object type T on the arena *without* invoking the
+  // constructor of T. If `arena` is null, then the return value should be freed
+  // with `delete[] x;` (or `::operator delete[](x);`).
+  // To ensure safe uses, this function checks at compile time
+  // (when compiled as C++11) that T is trivially default-constructible and
+  // trivially destructible.
+  template <typename T> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  static T* CreateArray(::google::protobuf::Arena* arena, size_t num_elements) {
+    GOOGLE_CHECK_LE(num_elements,
+                    std::numeric_limits<size_t>::max() / sizeof(T))
+        << "Requested size is too large to fit into size_t.";
+    if (arena == NULL) {
+      return static_cast<T*>(::operator new[](num_elements * sizeof(T)));
+    } else {
+      return arena->CreateInternalRawArray<T>(num_elements);
+    }
+  }
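// Usage sketch for CreateArray<T>; T must be trivially default-constructible
// and trivially destructible (e.g. built-in types or POD structs):
//
//   google::protobuf::Arena arena;
//   int* scratch = google::protobuf::Arena::CreateArray<int>(&arena, 256);
//   scratch[0] = 42;  // uninitialized storage owned by the arena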
+
+  // Returns the total space used by the arena, which is the sum of the sizes
+  // of the underlying blocks. The total space used may not include the new
+  // blocks that are allocated by this arena from other threads concurrently
+  // with the call to this method.
+  GOOGLE_ATTRIBUTE_NOINLINE uint64 SpaceAllocated() const;
+  // As above, but does not include any free space in underlying blocks.
+  GOOGLE_ATTRIBUTE_NOINLINE uint64 SpaceUsed() const;
+
+  // Combines SpaceAllocated and SpaceUsed. Returns a pair of
+  // <space_allocated, space_used>.
+  GOOGLE_ATTRIBUTE_NOINLINE std::pair<uint64, uint64> SpaceAllocatedAndUsed() const;
+
+  // Frees all storage allocated by this arena after calling destructors
+  // registered with OwnDestructor() and freeing objects registered with Own().
+  // Any objects allocated on this arena are unusable after this call. It also
+  // returns the total space used by the arena, which is the sum of the sizes
+  // of the allocated blocks. This method is not thread-safe.
+  GOOGLE_ATTRIBUTE_NOINLINE uint64 Reset();
+
+  // Adds |object| to a list of heap-allocated objects to be freed with |delete|
+  // when the arena is destroyed or reset.
+  template <typename T> GOOGLE_ATTRIBUTE_NOINLINE
+  void Own(T* object) {
+    OwnInternal(object, google::protobuf::internal::is_convertible<T*, ::google::protobuf::Message*>());
+  }
+
+  // Adds |object| to a list of objects whose destructors will be manually
+  // called when the arena is destroyed or reset. This differs from Own() in
+  // that it does not free the underlying memory with |delete|; hence, it is
+  // normally only used for objects that are placement-newed into
+  // arena-allocated memory.
+  template <typename T> GOOGLE_ATTRIBUTE_NOINLINE
+  void OwnDestructor(T* object) {
+    if (object != NULL) {
+      AddListNode(object, &internal::arena_destruct_object<T>);
+    }
+  }
+
+  // Adds a custom member function on an object to the list of destructors that
+  // will be manually called when the arena is destroyed or reset. This differs
+  // from OwnDestructor() in that any member function may be specified, not only
+  // the class destructor.
+  GOOGLE_ATTRIBUTE_NOINLINE void OwnCustomDestructor(void* object,
+                                                     void (*destruct)(void*)) {
+    AddListNode(object, destruct);
+  }
+
+  // Retrieves the arena associated with |value| if |value| is an arena-capable
+  // message, or NULL otherwise. This differs from value->GetArena() in that the
+  // latter is a virtual call, while this method is a templated call that
+  // resolves at compile-time.
+  template <typename T> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  static ::google::protobuf::Arena* GetArena(const T* value) {
+    return GetArenaInternal(value, static_cast<T*>(0));
+  }
+
+ private:
+  struct InternalIsArenaConstructableHelper {
+    template <typename U>
+    static char ArenaConstructable(
+        const typename U::InternalArenaConstructable_*);
+    template <typename U>
+    static double ArenaConstructable(...);
+  };
+
+ public:
+  // Helper typetrait that indicates support for arenas in a type T at compile
+  // time. This is public only to allow construction of higher-level templated
+  // utilities. is_arena_constructable<T>::value is true if the message type T
+  // has arena support enabled, and false otherwise.
+  //
+  // This is inside Arena because only Arena has the friend relationships
+  // necessary to see the underlying generated code traits.
+  template <typename T>
+  struct is_arena_constructable
+      : public google::protobuf::internal::integral_constant<
+            bool, sizeof(InternalIsArenaConstructableHelper::ArenaConstructable<
+                         const T>(static_cast<const T*>(0))) == sizeof(char)> {
+  };
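// Usage sketch for Own(): handing an existing heap object to the arena so it
// is deleted together with everything else (illustrative only):
//
//   google::protobuf::Arena arena;
//   std::string* heap_str = new std::string("temp");
//   arena.Own(heap_str);  // arena calls `delete heap_str` at destruction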
+
+ private:
+  // Blocks are variable length malloc-ed objects. The following structure
+  // describes the common header for all blocks.
+  struct Block {
+    void* owner;  // &ThreadCache of thread that owns this block, or
+                  // &this->owner if not yet owned by a thread.
+    Block* next;  // Next block in arena (may have different owner)
+    // ((char*) &block) + pos is next available byte. It is always
+    // aligned at a multiple of 8 bytes.
+    size_t pos;
+    size_t size;  // total size of the block.
+    GOOGLE_ATTRIBUTE_ALWAYS_INLINE size_t avail() const { return size - pos; }
+    // data follows
+  };
+
+  template <typename Type> friend class ::google::protobuf::internal::GenericTypeHandler;
+  friend class MockArena;              // For unit-testing.
+  friend class internal::ArenaString;  // For AllocateAligned.
+  friend class internal::LazyField;    // For CreateMaybeMessage.
+
+  struct ThreadCache {
+    // The ThreadCache is considered valid as long as this matches the
+    // lifecycle_id of the arena being used.
+    int64 last_lifecycle_id_seen;
+    Block* last_block_used_;
+  };
+
+  static const size_t kHeaderSize = sizeof(Block);
+  static google::protobuf::internal::SequenceNumber lifecycle_id_generator_;
+#if defined(GOOGLE_PROTOBUF_NO_THREADLOCAL)
+  // Android ndk does not support GOOGLE_THREAD_LOCAL keyword so we use a custom thread
+  // local storage class we implemented.
+  // iOS also does not support the GOOGLE_THREAD_LOCAL keyword.
+  static ThreadCache& thread_cache();
+#elif defined(PROTOBUF_USE_DLLS)
+  // Thread local variables cannot be exposed through DLL interface but we can
+  // wrap them in static functions.
+  static ThreadCache& thread_cache();
+#else
+  static GOOGLE_THREAD_LOCAL ThreadCache thread_cache_;
+  static ThreadCache& thread_cache() { return thread_cache_; }
+#endif
+
+  // SFINAE for skipping addition to delete list for a message type when created
+  // with CreateMessage. This is mainly to skip proto2/proto1 message objects
+  // with cc_enable_arenas=true from being part of the delete list. Also, note,
+  // compiler will optimize out the branch in CreateInternal<T>.
+  template <typename T>
+  static inline bool SkipDeleteList(typename T::DestructorSkippable_*) {
+    return true;
+  }
+
+  // For message objects that don't have the DestructorSkippable_ trait, we
+  // always add to the delete list.
+  template <typename T>
+  static inline bool SkipDeleteList(...) {
+    return google::protobuf::internal::has_trivial_destructor<T>::value;
+  }
+
+ private:
+  struct InternalIsDestructorSkippableHelper {
+    template <typename U>
+    static char DestructorSkippable(
+        const typename U::DestructorSkippable_*);
+    template <typename U>
+    static double DestructorSkippable(...);
+  };
+
+ public:
+  // Helper typetrait that indicates whether the destructor of type T should be
+  // called when arena is destroyed at compile time. This is only to allow
+  // construction of higher-level templated utilities.
+  // is_destructor_skippable<T>::value is true if the destructor of the message
+  // type T should not be called when arena is destroyed or false otherwise.
+  // This is inside Arena because only Arena has the friend relationships
+  // necessary to see the underlying generated code traits.
+  template <typename T>
+  struct is_destructor_skippable
+      : public google::protobuf::internal::integral_constant<
+            bool,
+            sizeof(InternalIsDestructorSkippableHelper::DestructorSkippable<
+                   const T>(static_cast<const T*>(0))) == sizeof(char) ||
+                google::protobuf::internal::has_trivial_destructor<T>::value> {};
+
+ private:
+  // CreateMessage<T> requires that T supports arenas, but this private method
+  // works whether or not T supports arenas. These are not exposed to user code
+  // as it can cause confusing API usages, and end up having double free in
+  // user code. These are used only internally from LazyField and Repeated
+  // fields, since they are designed to work in all mode combinations.
+  template <typename Msg> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  static Msg* CreateMaybeMessage(
+      Arena* arena, typename Msg::InternalArenaConstructable_*) {
+    return CreateMessage<Msg>(arena);
+  }
+
+  template <typename T> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  static T* CreateMaybeMessage(Arena* arena, ...) {
+    return Create<T>(arena);
+  }
+
+  // Just allocate the required size for the given type assuming the
+  // type has a trivial constructor.
+  template <typename T> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  T* CreateInternalRawArray(size_t num_elements) {
+    GOOGLE_CHECK_LE(num_elements,
+                    std::numeric_limits<size_t>::max() / sizeof(T))
+        << "Requested size is too large to fit into size_t.";
+    return static_cast<T*>(
+        AllocateAligned(RTTI_TYPE_ID(T), sizeof(T) * num_elements));
+  }
+
+  template <typename T> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  T* CreateInternal(bool skip_explicit_ownership) {
+    T* t = new (AllocateAligned(RTTI_TYPE_ID(T), sizeof(T))) T();
+    if (!skip_explicit_ownership) {
+      AddListNode(t, &internal::arena_destruct_object<T>);
+    }
+    return t;
+  }
+
+  template <typename T, typename Arg> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  T* CreateInternal(bool skip_explicit_ownership, const Arg& arg) {
+    T* t = new (AllocateAligned(RTTI_TYPE_ID(T), sizeof(T))) T(arg);
+    if (!skip_explicit_ownership) {
+      AddListNode(t, &internal::arena_destruct_object<T>);
+    }
+    return t;
+  }
+
+  template <typename T, typename Arg1, typename Arg2> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  T* CreateInternal(
+      bool skip_explicit_ownership, const Arg1& arg1, const Arg2& arg2) {
+    T* t = new (AllocateAligned(RTTI_TYPE_ID(T), sizeof(T))) T(arg1, arg2);
+    if (!skip_explicit_ownership) {
+      AddListNode(t, &internal::arena_destruct_object<T>);
+    }
+    return t;
+  }
+
+  template <typename T, typename Arg1, typename Arg2, typename Arg3>
+  GOOGLE_ATTRIBUTE_ALWAYS_INLINE T* CreateInternal(bool skip_explicit_ownership,
+                                                   const Arg1& arg1,
+                                                   const Arg2& arg2,
+                                                   const Arg3& arg3) {
+    T* t = new (AllocateAligned(RTTI_TYPE_ID(T), sizeof(T)))
+        T(arg1, arg2, arg3);
+    if (!skip_explicit_ownership) {
+      AddListNode(t, &internal::arena_destruct_object<T>);
+    }
+    return t;
+  }
+
+  template <typename T, typename Arg1, typename Arg2, typename Arg3,
+            typename Arg4>
+  GOOGLE_ATTRIBUTE_ALWAYS_INLINE T* CreateInternal(bool skip_explicit_ownership,
+                                                   const Arg1& arg1,
+                                                   const Arg2& arg2,
+                                                   const Arg3& arg3,
+                                                   const Arg4& arg4) {
+    T* t = new (AllocateAligned(RTTI_TYPE_ID(T), sizeof(T)))
+        T(arg1, arg2, arg3, arg4);
+    if (!skip_explicit_ownership) {
+      AddListNode(t, &internal::arena_destruct_object<T>);
+    }
+    return t;
+  }
+
+  template <typename T, typename Arg1, typename Arg2, typename Arg3,
+            typename Arg4, typename Arg5>
+  GOOGLE_ATTRIBUTE_ALWAYS_INLINE T* CreateInternal(bool skip_explicit_ownership,
+                                                   const Arg1& arg1,
+                                                   const Arg2& arg2,
+                                                   const Arg3& arg3,
+                                                   const Arg4& arg4,
+                                                   const Arg5& arg5) {
+    T* t = new (AllocateAligned(RTTI_TYPE_ID(T), sizeof(T)))
+        T(arg1, arg2, arg3, arg4, arg5);
+    if (!skip_explicit_ownership) {
+      AddListNode(t, &internal::arena_destruct_object<T>);
+    }
+    return t;
+  }
+
+  template <typename T, typename Arg1, typename Arg2, typename Arg3,
+            typename Arg4, typename Arg5, typename Arg6>
+  GOOGLE_ATTRIBUTE_ALWAYS_INLINE T* CreateInternal(bool skip_explicit_ownership,
+                                                   const Arg1& arg1,
+                                                   const Arg2& arg2,
+                                                   const Arg3& arg3,
+                                                   const Arg4& arg4,
+                                                   const Arg5& arg5,
+                                                   const Arg6& arg6) {
+    T* t = new (AllocateAligned(RTTI_TYPE_ID(T), sizeof(T)))
+        T(arg1, arg2, arg3, arg4, arg5, arg6);
+    if (!skip_explicit_ownership) {
+      AddListNode(t, &internal::arena_destruct_object<T>);
+    }
+    return t;
+  }
+
+  template <typename T, typename Arg1, typename Arg2, typename Arg3,
+            typename Arg4, typename Arg5, typename Arg6, typename Arg7>
+  GOOGLE_ATTRIBUTE_ALWAYS_INLINE T* CreateInternal(bool skip_explicit_ownership,
+                                                   const Arg1& arg1,
+                                                   const Arg2& arg2,
+                                                   const Arg3& arg3,
+                                                   const Arg4& arg4,
+                                                   const Arg5& arg5,
+                                                   const Arg6& arg6,
+                                                   const Arg7& arg7) {
+    T* t = new (AllocateAligned(RTTI_TYPE_ID(T), sizeof(T)))
+        T(arg1, arg2, arg3, arg4, arg5, arg6, arg7);
+    if (!skip_explicit_ownership) {
+      AddListNode(t, &internal::arena_destruct_object<T>);
+    }
+    return t;
+  }
+
+  template <typename T, typename Arg1, typename Arg2, typename Arg3,
+            typename Arg4, typename Arg5, typename Arg6, typename Arg7,
+            typename Arg8>
+  GOOGLE_ATTRIBUTE_ALWAYS_INLINE T* CreateInternal(bool skip_explicit_ownership,
+                                                   const Arg1& arg1,
+                                                   const Arg2& arg2,
+                                                   const Arg3& arg3,
+                                                   const Arg4& arg4,
+                                                   const Arg5& arg5,
+                                                   const Arg6& arg6,
+                                                   const Arg7& arg7,
+                                                   const Arg8& arg8) {
+    T* t = new (AllocateAligned(RTTI_TYPE_ID(T), sizeof(T)))
+        T(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8);
+    if (!skip_explicit_ownership) {
+      AddListNode(t, &internal::arena_destruct_object<T>);
+    }
+    return t;
+  }
+
+  template <typename T> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  T* CreateMessageInternal(typename T::InternalArenaConstructable_*) {
+    return CreateInternal<T>(SkipDeleteList<T>(static_cast<T*>(0)),
+                             this);
+  }
+
+  template <typename T, typename Arg> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  T* CreateMessageInternal(typename T::InternalArenaConstructable_*,
+                           const Arg& arg) {
+    return CreateInternal<T>(SkipDeleteList<T>(static_cast<T*>(0)),
+                             this, arg);
+  }
+
+  template <typename T, typename Arg1, typename Arg2> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  T* CreateMessageInternal(typename T::InternalArenaConstructable_*,
+                           const Arg1& arg1, const Arg2& arg2) {
+    return CreateInternal<T>(SkipDeleteList<T>(static_cast<T*>(0)),
+                             this, arg1, arg2);
+  }
+
+  // CreateInArenaStorage is used to implement map field. Without it,
+  // google::protobuf::Map need to call generated message's protected arena constructor,
+  // which needs to declare google::protobuf::Map as friend of generated message.
+  template <typename T>
+  static void CreateInArenaStorage(T* ptr, Arena* arena) {
+    CreateInArenaStorageInternal(ptr, arena,
+                                 typename is_arena_constructable<T>::type());
+    RegisterDestructorInternal(ptr, arena,
+                               typename is_destructor_skippable<T>::type());
+  }
+
+  template <typename T>
+  static void CreateInArenaStorageInternal(
+      T* ptr, Arena* arena, google::protobuf::internal::true_type) {
+    new (ptr) T(arena);
+  }
+  template <typename T>
+  static void CreateInArenaStorageInternal(
+      T* ptr, Arena* arena, google::protobuf::internal::false_type) {
+    new (ptr) T();
+  }
+
+  template <typename T>
+  static void RegisterDestructorInternal(
+      T* ptr, Arena* arena, google::protobuf::internal::true_type) {}
+  template <typename T>
+  static void RegisterDestructorInternal(
+      T* ptr, Arena* arena, google::protobuf::internal::false_type) {
+    arena->OwnDestructor(ptr);
+  }
+
+  // These implement Own(), which registers an object for deletion (destructor
+  // call and operator delete()). The second parameter has type 'true_type' if T
+  // is a subtype of ::google::protobuf::Message and 'false_type' otherwise. Collapsing
+  // all template instantiations to one for generic Message reduces code size,
+  // using the virtual destructor instead.
+  template <typename T> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  void OwnInternal(T* object, google::protobuf::internal::true_type) {
+    if (object != NULL) {
+      AddListNode(object, &internal::arena_delete_object< ::google::protobuf::Message >);
+    }
+  }
+  template <typename T> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  void OwnInternal(T* object, google::protobuf::internal::false_type) {
+    if (object != NULL) {
+      AddListNode(object, &internal::arena_delete_object<T>);
+    }
+  }
+
+  // Implementation for GetArena(). Only message objects with
+  // InternalArenaConstructable_ tags can be associated with an arena, and such
+  // objects must implement a GetArenaNoVirtual() method.
+  template <typename T> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  static ::google::protobuf::Arena* GetArenaInternal(
+      const T* value, typename T::InternalArenaConstructable_*) {
+    return value->GetArenaNoVirtual();
+  }
+
+  template <typename T> GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+  static ::google::protobuf::Arena* GetArenaInternal(const T* value, ...) {
+    return NULL;
+  }
+
+  // Allocate and also optionally call on_arena_allocation callback with the
+  // allocated type info when the hooks are in place in ArenaOptions and
+  // the cookie is not null.
+  void* AllocateAligned(const std::type_info* allocated, size_t n);
+
+  // Allocate an internal allocation, avoiding optional typed monitoring.
+  GOOGLE_ATTRIBUTE_ALWAYS_INLINE void* AllocateAligned(size_t n) {
+    return AllocateAligned(NULL, n);
+  }
+
+  void Init();
+
+  // Free all blocks and return the total space used, which is the sum of the
+  // sizes of all the allocated blocks.
+  uint64 FreeBlocks();
+
+  // Add object pointer and cleanup function pointer to the list.
+  // TODO(rohananil, cfallin): We could pass in a sub-arena into this method
+  // to avoid polluting blocks of this arena with list nodes. This would help in
+  // mixed mode (where many protobufs have cc_enable_arenas=false), and is an
+  // alternative to a chunked linked-list, but with extra overhead of *next.
+  void AddListNode(void* elem, void (*cleanup)(void*));
+  // Delete or Destruct all objects owned by the arena.
+  void CleanupList();
+  uint64 ResetInternal();
+
+  inline void SetThreadCacheBlock(Block* block) {
+    thread_cache().last_block_used_ = block;
+    thread_cache().last_lifecycle_id_seen = lifecycle_id_;
+  }
+
+  int64 lifecycle_id_;  // Unique for each arena. Changes on Reset().
+
+  google::protobuf::internal::AtomicWord blocks_;  // Head of linked list of all allocated blocks
+  google::protobuf::internal::AtomicWord hint_;    // Fast thread-local block access
+
+  // Node contains the ptr of the object to be cleaned up and the associated
+  // cleanup function ptr.
+  struct Node {
+    void* elem;              // Pointer to the object to be cleaned up.
+    void (*cleanup)(void*);  // Function pointer to the destructor or deleter.
+    Node* next;              // Next node in the list.
+  };
+
+  google::protobuf::internal::AtomicWord cleanup_list_;  // Head of a linked list of nodes containing object
+                                                         // ptrs and cleanup methods.
+
+  bool owns_first_block_;  // Indicates that arena owns the first block
+  Mutex blocks_lock_;
+
+  void AddBlock(Block* b);
+  // Access must be synchronized, either by blocks_lock_ or by being called from
+  // Init()/Reset().
+  void AddBlockInternal(Block* b);
+  void* SlowAlloc(size_t n);
+  Block* FindBlock(void* me);
+  Block* NewBlock(void* me, Block* my_last_block, size_t n,
+                  size_t start_block_size, size_t max_block_size);
+  static void* AllocFromBlock(Block* b, size_t n);
+  template <typename Key, typename T>
+  friend class Map;
+
+  // The arena may save a cookie it receives from the external on_init hook
+  // and then use it when calling the on_reset and on_destruction hooks.
+  void* hooks_cookie_;
+
+  ArenaOptions options_;
+
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Arena);
+};
+
+// Defined above for supporting environments without RTTI.
+#undef RTTI_TYPE_ID + +} // namespace protobuf + +} // namespace google +#endif // GOOGLE_PROTOBUF_ARENA_H__ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/arenastring.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/arenastring.h new file mode 100644 index 00000000..52f5e736 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/arenastring.h @@ -0,0 +1,314 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLE_PROTOBUF_ARENASTRING_H__ +#define GOOGLE_PROTOBUF_ARENASTRING_H__ + +#include <string> + +#include "logging.h" +#include "common.h" +#include "fastmem.h" +#include "arena.h" +#include "generated_message_util.h" + + + +// This is the implementation of arena string fields written for the open-source +// release. The ArenaStringPtr struct below is an internal implementation class +// and *should not be used* by user code. It is used to collect string +// operations together into one place and abstract away the underlying +// string-field pointer representation, so that (for example) an alternate +// implementation that knew more about ::std::string's internals could integrate more +// closely with the arena allocator. + +namespace google { +namespace protobuf { +namespace internal { + +struct LIBPROTOBUF_EXPORT ArenaStringPtr { + inline void Set(const ::std::string* default_value, + const ::std::string& value, ::google::protobuf::Arena* arena) { + if (ptr_ == default_value) { + CreateInstance(arena, &value); + } else { + *ptr_ = value; + } + } + + // Basic accessors.
+ inline const ::std::string& Get(const ::std::string* /* default_value */) const { + return *ptr_; + } + + inline ::std::string* Mutable(const ::std::string* default_value, + ::google::protobuf::Arena* arena) { + if (ptr_ == default_value) { + CreateInstance(arena, default_value); + } + return ptr_; + } + + // Release returns a ::std::string* instance that is heap-allocated and is not + // Own()'d by any arena. If the field was not set, it returns NULL. The caller + // retains ownership. Clears this field back to NULL state. Used to implement + // release_() methods on generated classes. + inline ::std::string* Release(const ::std::string* default_value, + ::google::protobuf::Arena* arena) { + if (ptr_ == default_value) { + return NULL; + } + ::std::string* released = NULL; + if (arena != NULL) { + // ptr_ is owned by the arena -- we need to return a copy. + released = new ::std::string(*ptr_); + } else { + released = ptr_; + } + ptr_ = const_cast< ::std::string* >(default_value); + return released; + } + + // UnsafeArenaRelease returns a ::std::string*, but it may be arena-owned (i.e. + // have its destructor already registered) if arena != NULL. If the field was + // not set, this returns NULL. This method clears this field back to NULL + // state. Used to implement unsafe_arena_release_() methods on + // generated classes. + inline ::std::string* UnsafeArenaRelease(const ::std::string* default_value, + ::google::protobuf::Arena* /* arena */) { + if (ptr_ == default_value) { + return NULL; + } + ::std::string* released = ptr_; + ptr_ = const_cast< ::std::string* >(default_value); + return released; + } + + // Takes a string that is heap-allocated, and takes ownership. The string's + // destructor is registered with the arena. Used to implement + // set_allocated_ in generated classes. + inline void SetAllocated(const ::std::string* default_value, + ::std::string* value, ::google::protobuf::Arena* arena) { + if (arena == NULL && ptr_ != default_value) { + Destroy(default_value, arena); + } + if (value != NULL) { + ptr_ = value; + if (arena != NULL) { + arena->Own(value); + } + } else { + ptr_ = const_cast< ::std::string* >(default_value); + } + } + + // Takes a string that has lifetime equal to the arena's lifetime. The arena + // must be non-null. It is safe only to pass this method a value returned by + // UnsafeArenaRelease() on another field of a message in the same arena. Used + // to implement unsafe_arena_set_allocated_ in generated classes. + inline void UnsafeArenaSetAllocated(const ::std::string* default_value, + ::std::string* value, + ::google::protobuf::Arena* /* arena */) { + if (value != NULL) { + ptr_ = value; + } else { + ptr_ = const_cast< ::std::string* >(default_value); + } + } + + // Swaps internal pointers. Arena-safety semantics: this is guarded by the + // logic in Swap()/UnsafeArenaSwap() at the message level, so this method is + // 'unsafe' if called directly. + GOOGLE_ATTRIBUTE_ALWAYS_INLINE void Swap(ArenaStringPtr* other) { + std::swap(ptr_, other->ptr_); + } + + // Frees storage (if not on an arena) and sets field to default value. + inline void Destroy(const ::std::string* default_value, + ::google::protobuf::Arena* arena) { + if (arena == NULL && ptr_ != default_value) { + delete ptr_; + } + ptr_ = const_cast< ::std::string* >(default_value); + } + + // Clears content, but keeps allocated string if arena != NULL, to avoid the + // overhead of heap operations. After this returns, the content (as seen by + // the user) will always be the empty string. 
Assumes that |default_value| + // is an empty string. + inline void ClearToEmpty(const ::std::string* default_value, + ::google::protobuf::Arena* /* arena */) { + if (ptr_ == default_value) { + // Already set to default (which is empty) -- do nothing. + } else { + ptr_->clear(); + } + } + + // Clears content, but keeps allocated string if arena != NULL, to avoid the + // overhead of heap operations. After this returns, the content (as seen by + // the user) will always be equal to |default_value|. + inline void ClearToDefault(const ::std::string* default_value, + ::google::protobuf::Arena* /* arena */) { + if (ptr_ == default_value) { + // Already set to default -- do nothing. + } else { + // Have another allocated string -- rather than throwing this away and + // resetting ptr_ to the canonical default string instance, we just reuse + // this instance. + *ptr_ = *default_value; + } + } + + // Called from generated code / reflection runtime only. Resets value to point + // to a default string pointer, with the semantics that this ArenaStringPtr + // does not own the pointed-to memory. Disregards initial value of ptr_ (so + // this is the *ONLY* safe method to call after construction or when + // reinitializing after becoming the active field in a oneof union). + inline void UnsafeSetDefault(const ::std::string* default_value) { + // Casting away 'const' is safe here: accessors ensure that ptr_ is only + // returned as a const if it is equal to default_value. + ptr_ = const_cast< ::std::string* >(default_value); + } + + // The 'NoArena' variants of methods below assume arena == NULL and are + // optimized to provide very little overhead relative to a raw string pointer + // (while still being in-memory compatible with other code that assumes + // ArenaStringPtr). Note the invariant that a class instance that has only + // ever been mutated by NoArena methods must *only* be in the String state + // (i.e., tag bits are not used), *NEVER* ArenaString. This allows all + // tagged-pointer manipulations to be avoided. + inline void SetNoArena(const ::std::string* default_value, + const ::std::string& value) { + if (ptr_ == default_value) { + CreateInstanceNoArena(&value); + } else { + *ptr_ = value; + } + } + + void AssignWithDefault(const ::std::string* default_value, ArenaStringPtr value); + + inline const ::std::string& GetNoArena(const ::std::string* /* default_value */) const { + return *ptr_; + } + + inline ::std::string* MutableNoArena(const ::std::string* default_value) { + if (ptr_ == default_value) { + CreateInstanceNoArena(default_value); + } + return ptr_; + } + + inline ::std::string* ReleaseNoArena(const ::std::string* default_value) { + if (ptr_ == default_value) { + return NULL; + } else { + ::std::string* released = ptr_; + ptr_ = const_cast< ::std::string* >(default_value); + return released; + } + } + + inline void SetAllocatedNoArena(const ::std::string* default_value, + ::std::string* value) { + if (ptr_ != default_value) { + delete ptr_; + } + if (value != NULL) { + ptr_ = value; + } else { + ptr_ = const_cast< ::std::string* >(default_value); + } + } + + inline void DestroyNoArena(const ::std::string* default_value) { + if (ptr_ != default_value) { + delete ptr_; + } + ptr_ = NULL; + } + + inline void ClearToEmptyNoArena(const ::std::string* default_value) { + if (ptr_ == default_value) { + // Nothing: already equal to default (which is the empty string). 
+ } else { + ptr_->clear(); + } + } + + inline void ClearToDefaultNoArena(const ::std::string* default_value) { + if (ptr_ == default_value) { + // Nothing: already set to default. + } else { + // Reuse existing allocated instance. + *ptr_ = *default_value; + } + } + + // Internal accessor used only at parse time to provide direct access to the + // raw pointer from the shared parse routine (in the non-arenas case). The + // parse routine does the string allocation in order to save code size in the + // generated parsing code. + inline ::std::string** UnsafeRawStringPointer() { + return &ptr_; + } + + private: + ::std::string* ptr_; + + GOOGLE_ATTRIBUTE_NOINLINE void CreateInstance(::google::protobuf::Arena* arena, + const ::std::string* initial_value) { + // Assumes ptr_ is not NULL. + if (initial_value != NULL) { + ptr_ = new ::std::string(*initial_value); + } else { + ptr_ = new ::std::string(); + } + if (arena != NULL) { + arena->Own(ptr_); + } + } + GOOGLE_ATTRIBUTE_NOINLINE void CreateInstanceNoArena(const ::std::string* initial_value) { + if (initial_value != NULL) { + ptr_ = new ::std::string(*initial_value); + } else { + ptr_ = new ::std::string(); + } + } +}; + +} // namespace internal +} // namespace protobuf + + + +} // namespace google +#endif // GOOGLE_PROTOBUF_ARENASTRING_H__ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/descriptor.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/descriptor.h new file mode 100644 index 00000000..5fa74b6e --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/descriptor.h @@ -0,0 +1,1924 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Author: kenton@google.com (Kenton Varda) +// Based on original Protocol Buffers design by +// Sanjay Ghemawat, Jeff Dean, and others. 
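// ----------------------------------------------------------------------------
// Illustrative aside (not part of the vendored headers): how generated code
// typically drives the ArenaStringPtr defined in arenastring.h above. The
// invariant is that ptr_ aliases the shared default string until the first
// write. A minimal sketch, assuming the GetEmptyStringAlreadyInited() helper
// declared in generated_message_util.h:
//
//   using google::protobuf::internal::ArenaStringPtr;
//   using google::protobuf::internal::GetEmptyStringAlreadyInited;
//
//   ArenaStringPtr name;
//   const ::std::string* default_str = &GetEmptyStringAlreadyInited();
//   name.UnsafeSetDefault(default_str);         // no heap allocation yet
//   name.SetNoArena(default_str, "alice");      // first write allocates
//   const ::std::string& v = name.GetNoArena(default_str);  // "alice"
//   name.DestroyNoArena(default_str);           // frees the heap string
// ----------------------------------------------------------------------------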
+// +// This file contains classes which describe a type of protocol message. +// You can use a message's descriptor to learn at runtime what fields +// it contains and what the types of those fields are. The Message +// interface also allows you to dynamically access and modify individual +// fields by passing the FieldDescriptor of the field you are interested +// in. +// +// Most users will not care about descriptors, because they will write +// code specific to certain protocol types and will simply use the classes +// generated by the protocol compiler directly. Advanced users who want +// to operate on arbitrary types (not known at compile time) may want to +// read descriptors in order to learn about the contents of a message. +// A very small number of users will want to construct their own +// Descriptors, either because they are implementing Message manually or +// because they are writing something like the protocol compiler. +// +// For an example of how you might use descriptors, see the code example +// at the top of message.h. + +#ifndef GOOGLE_PROTOBUF_DESCRIPTOR_H__ +#define GOOGLE_PROTOBUF_DESCRIPTOR_H__ + +#include <memory> +#ifndef _SHARED_PTR_H +#include "shared_ptr.h" +#endif +#include <set> +#include <string> +#include <vector> +#include "common.h" + +// TYPE_BOOL is defined in the MacOS's ConditionalMacros.h. +#ifdef TYPE_BOOL +#undef TYPE_BOOL +#endif // TYPE_BOOL + +namespace google { +namespace protobuf { + +// Defined in this file. +class Descriptor; +class FieldDescriptor; +class OneofDescriptor; +class EnumDescriptor; +class EnumValueDescriptor; +class ServiceDescriptor; +class MethodDescriptor; +class FileDescriptor; +class DescriptorDatabase; +class DescriptorPool; + +// Defined in descriptor.proto +class DescriptorProto; +class FieldDescriptorProto; +class OneofDescriptorProto; +class EnumDescriptorProto; +class EnumValueDescriptorProto; +class ServiceDescriptorProto; +class MethodDescriptorProto; +class FileDescriptorProto; +class MessageOptions; +class FieldOptions; +class OneofOptions; +class EnumOptions; +class EnumValueOptions; +class ServiceOptions; +class MethodOptions; +class FileOptions; +class UninterpretedOption; +class SourceCodeInfo; + +// Defined in message.h +class Message; + +// Defined in descriptor.cc +class DescriptorBuilder; +class FileDescriptorTables; + +// Defined in unknown_field_set.h. +class UnknownField; + +// Defined in generated_message_reflection.h. +namespace internal { +class GeneratedMessageReflection; +} // namespace internal + +// Defined in command_line_interface.cc +namespace compiler { +class CommandLineInterface; +} // namespace compiler + +namespace descriptor_unittest { +class DescriptorTest; +} // namespace descriptor_unittest + +// Defined in printer.h +namespace io { +class Printer; +} // namespace io + +// NB, all indices are zero-based. +struct SourceLocation { + int start_line; + int end_line; + int start_column; + int end_column; + + // Doc comments found at the source location. + // See the comments in SourceCodeInfo.Location (descriptor.proto) for details. + string leading_comments; + string trailing_comments; + std::vector<string> leading_detached_comments; +}; + +// Options when generating machine-parsable output from a descriptor with +// DebugString(). +struct DebugStringOptions { + // include original user comments as recorded in SourceLocation entries. N.B.
+ // that this must be |false| by default: several other pieces of code (for + // example, the C++ code generation for fields in the proto compiler) rely on + // DebugString() output being unobstructed by user comments. + bool include_comments; + // If true, elide the braced body in the debug string. + bool elide_group_body; + bool elide_oneof_body; + + DebugStringOptions() + : include_comments(false), + elide_group_body(false), + elide_oneof_body(false) {} +}; + +// Describes a type of protocol message, or a particular group within a +// message. To obtain the Descriptor for a given message object, call +// Message::GetDescriptor(). Generated message classes also have a +// static method called descriptor() which returns the type's descriptor. +// Use DescriptorPool to construct your own descriptors. +class LIBPROTOBUF_EXPORT Descriptor { + public: + // The name of the message type, not including its scope. + const string& name() const; + + // The fully-qualified name of the message type, scope delimited by + // periods. For example, message type "Foo" which is declared in package + // "bar" has full name "bar.Foo". If a type "Baz" is nested within + // Foo, Baz's full_name is "bar.Foo.Baz". To get only the part that + // comes after the last '.', use name(). + const string& full_name() const; + + // Index of this descriptor within the file or containing type's message + // type array. + int index() const; + + // The .proto file in which this message type was defined. Never NULL. + const FileDescriptor* file() const; + + // If this Descriptor describes a nested type, this returns the type + // in which it is nested. Otherwise, returns NULL. + const Descriptor* containing_type() const; + + // Get options for this message type. These are specified in the .proto file + // by placing lines like "option foo = 1234;" in the message definition. + // Allowed options are defined by MessageOptions in + // google/protobuf/descriptor.proto, and any available extensions of that + // message. + const MessageOptions& options() const; + + // Write the contents of this Descriptor into the given DescriptorProto. + // The target DescriptorProto must be clear before calling this; if it + // isn't, the result may be garbage. + void CopyTo(DescriptorProto* proto) const; + + // Write the contents of this descriptor in a human-readable form. Output + // will be suitable for re-parsing. + string DebugString() const; + + // Similar to DebugString(), but additionally takes options (e.g., + // include original user comments in output). + string DebugStringWithOptions(const DebugStringOptions& options) const; + + // Returns true if this is a placeholder for an unknown type. This will + // only be the case if this descriptor comes from a DescriptorPool + // with AllowUnknownDependencies() set. + bool is_placeholder() const; + + // Field stuff ----------------------------------------------------- + + // The number of fields in this message type. + int field_count() const; + // Gets a field by index, where 0 <= index < field_count(). + // These are returned in the order they were defined in the .proto file. + const FieldDescriptor* field(int index) const; + + // Looks up a field by declared tag number. Returns NULL if no such field + // exists. + const FieldDescriptor* FindFieldByNumber(int number) const; + // Looks up a field by name. Returns NULL if no such field exists. + const FieldDescriptor* FindFieldByName(const string& name) const; + + // Looks up a field by lowercased name (as returned by lowercase_name()).
+ // This lookup may be ambiguous if multiple field names differ only by case, + // in which case the field returned is chosen arbitrarily from the matches. + const FieldDescriptor* FindFieldByLowercaseName( + const string& lowercase_name) const; + + // Looks up a field by camel-case name (as returned by camelcase_name()). + // This lookup may be ambiguous if multiple field names differ in a way that + // leads them to have identical camel-case names, in which case the field + // returned is chosen arbitrarily from the matches. + const FieldDescriptor* FindFieldByCamelcaseName( + const string& camelcase_name) const; + + // The number of oneofs in this message type. + int oneof_decl_count() const; + // Get a oneof by index, where 0 <= index < oneof_decl_count(). + // These are returned in the order they were defined in the .proto file. + const OneofDescriptor* oneof_decl(int index) const; + + // Looks up a oneof by name. Returns NULL if no such oneof exists. + const OneofDescriptor* FindOneofByName(const string& name) const; + + // Nested type stuff ----------------------------------------------- + + // The number of nested types in this message type. + int nested_type_count() const; + // Gets a nested type by index, where 0 <= index < nested_type_count(). + // These are returned in the order they were defined in the .proto file. + const Descriptor* nested_type(int index) const; + + // Looks up a nested type by name. Returns NULL if no such nested type + // exists. + const Descriptor* FindNestedTypeByName(const string& name) const; + + // Enum stuff ------------------------------------------------------ + + // The number of enum types in this message type. + int enum_type_count() const; + // Gets an enum type by index, where 0 <= index < enum_type_count(). + // These are returned in the order they were defined in the .proto file. + const EnumDescriptor* enum_type(int index) const; + + // Looks up an enum type by name. Returns NULL if no such enum type exists. + const EnumDescriptor* FindEnumTypeByName(const string& name) const; + + // Looks up an enum value by name, among all enum types in this message. + // Returns NULL if no such value exists. + const EnumValueDescriptor* FindEnumValueByName(const string& name) const; + + // Extensions ------------------------------------------------------ + + // A range of field numbers which are designated for third-party + // extensions. + struct ExtensionRange { + int start; // inclusive + int end; // exclusive + }; + + // The number of extension ranges in this message type. + int extension_range_count() const; + // Gets an extension range by index, where 0 <= index < + // extension_range_count(). These are returned in the order they were defined + // in the .proto file. + const ExtensionRange* extension_range(int index) const; + + // Returns true if the number is in one of the extension ranges. + bool IsExtensionNumber(int number) const; + + // Returns NULL if no extension range contains the given number. + const ExtensionRange* FindExtensionRangeContainingNumber(int number) const; + + // The number of extensions -- extending *other* messages -- that were + // defined nested within this message type's scope. + int extension_count() const; + // Get an extension by index, where 0 <= index < extension_count(). + // These are returned in the order they were defined in the .proto file. 
+ const FieldDescriptor* extension(int index) const; + + // Looks up a named extension (which extends some *other* message type) + // defined within this message type's scope. + const FieldDescriptor* FindExtensionByName(const string& name) const; + + // Similar to FindFieldByLowercaseName(), but finds extensions defined within + // this message type's scope. + const FieldDescriptor* FindExtensionByLowercaseName(const string& name) const; + + // Similar to FindFieldByCamelcaseName(), but finds extensions defined within + // this message type's scope. + const FieldDescriptor* FindExtensionByCamelcaseName(const string& name) const; + + // Reserved fields ------------------------------------------------- + + // A range of reserved field numbers. + struct ReservedRange { + int start; // inclusive + int end; // exclusive + }; + + // The number of reserved ranges in this message type. + int reserved_range_count() const; + // Gets a reserved range by index, where 0 <= index < + // reserved_range_count(). These are returned in the order they were defined + // in the .proto file. + const ReservedRange* reserved_range(int index) const; + + // Returns true if the number is in one of the reserved ranges. + bool IsReservedNumber(int number) const; + + // Returns NULL if no reserved range contains the given number. + const ReservedRange* FindReservedRangeContainingNumber(int number) const; + + // The number of reserved field names in this message type. + int reserved_name_count() const; + + // Gets a reserved name by index, where 0 <= index < reserved_name_count(). + const string& reserved_name(int index) const; + + // Returns true if the field name is reserved. + bool IsReservedName(const string& name) const; + + // Source Location --------------------------------------------------- + + // Updates |*out_location| to the source location of the complete + // extent of this message declaration. Returns false and leaves + // |*out_location| unchanged iff location information was not available. + bool GetSourceLocation(SourceLocation* out_location) const; + + private: + typedef MessageOptions OptionsType; + + // Allows tests to test CopyTo(proto, true). + friend class ::google::protobuf::descriptor_unittest::DescriptorTest; + + // Allows access to GetLocationPath for annotations. + friend class ::google::protobuf::io::Printer; + + // Fill the json_name field of FieldDescriptorProto. + void CopyJsonNameTo(DescriptorProto* proto) const; + + // Internal version of DebugString; controls the level of indenting for + // correct depth. Takes |options| to control debug-string options, and + // |include_opening_clause| to indicate whether the "message ... " part of the + // clause has already been generated (this varies depending on context). + void DebugString(int depth, string *contents, + const DebugStringOptions& options, + bool include_opening_clause) const; + + // Walks up the descriptor tree to generate the source location path + // to this descriptor from the file root. + void GetLocationPath(std::vector<int>* output) const; + + const string* name_; + const string* full_name_; + const FileDescriptor* file_; + const Descriptor* containing_type_; + const MessageOptions* options_; + + // True if this is a placeholder for an unknown type. + bool is_placeholder_; + // True if this is a placeholder and the type name wasn't fully-qualified.
+ bool is_unqualified_placeholder_; + + int field_count_; + FieldDescriptor* fields_; + int oneof_decl_count_; + OneofDescriptor* oneof_decls_; + int nested_type_count_; + Descriptor* nested_types_; + int enum_type_count_; + EnumDescriptor* enum_types_; + int extension_range_count_; + ExtensionRange* extension_ranges_; + int extension_count_; + FieldDescriptor* extensions_; + int reserved_range_count_; + ReservedRange* reserved_ranges_; + int reserved_name_count_; + const string** reserved_names_; + // IMPORTANT: If you add a new field, make sure to search for all instances + // of Allocate() and AllocateArray() in descriptor.cc + // and update them to initialize the field. + + // Must be constructed using DescriptorPool. + Descriptor() {} + friend class DescriptorBuilder; + friend class EnumDescriptor; + friend class FieldDescriptor; + friend class OneofDescriptor; + friend class MethodDescriptor; + friend class FileDescriptor; + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Descriptor); +}; + +// Describes a single field of a message. To get the descriptor for a given +// field, first get the Descriptor for the message in which it is defined, +// then call Descriptor::FindFieldByName(). To get a FieldDescriptor for +// an extension, do one of the following: +// - Get the Descriptor or FileDescriptor for its containing scope, then +// call Descriptor::FindExtensionByName() or +// FileDescriptor::FindExtensionByName(). +// - Given a DescriptorPool, call DescriptorPool::FindExtensionByNumber(). +// - Given a Reflection for a message object, call +// Reflection::FindKnownExtensionByName() or +// Reflection::FindKnownExtensionByNumber(). +// Use DescriptorPool to construct your own descriptors. +class LIBPROTOBUF_EXPORT FieldDescriptor { + public: + // Identifies a field type. 0 is reserved for errors. The order is weird + // for historical reasons. Types 12 and up are new in proto2. + enum Type { + TYPE_DOUBLE = 1, // double, exactly eight bytes on the wire. + TYPE_FLOAT = 2, // float, exactly four bytes on the wire. + TYPE_INT64 = 3, // int64, varint on the wire. Negative numbers + // take 10 bytes. Use TYPE_SINT64 if negative + // values are likely. + TYPE_UINT64 = 4, // uint64, varint on the wire. + TYPE_INT32 = 5, // int32, varint on the wire. Negative numbers + // take 10 bytes. Use TYPE_SINT32 if negative + // values are likely. + TYPE_FIXED64 = 6, // uint64, exactly eight bytes on the wire. + TYPE_FIXED32 = 7, // uint32, exactly four bytes on the wire. + TYPE_BOOL = 8, // bool, varint on the wire. + TYPE_STRING = 9, // UTF-8 text. + TYPE_GROUP = 10, // Tag-delimited message. Deprecated. + TYPE_MESSAGE = 11, // Length-delimited message. + + TYPE_BYTES = 12, // Arbitrary byte array. + TYPE_UINT32 = 13, // uint32, varint on the wire + TYPE_ENUM = 14, // Enum, varint on the wire + TYPE_SFIXED32 = 15, // int32, exactly four bytes on the wire + TYPE_SFIXED64 = 16, // int64, exactly eight bytes on the wire + TYPE_SINT32 = 17, // int32, ZigZag-encoded varint on the wire + TYPE_SINT64 = 18, // int64, ZigZag-encoded varint on the wire + + MAX_TYPE = 18, // Constant useful for defining lookup tables + // indexed by Type. + }; + + // Specifies the C++ data type used to represent the field. There is a + // fixed mapping from Type to CppType where each Type maps to exactly one + // CppType. 0 is reserved for errors. 
+ enum CppType { + CPPTYPE_INT32 = 1, // TYPE_INT32, TYPE_SINT32, TYPE_SFIXED32 + CPPTYPE_INT64 = 2, // TYPE_INT64, TYPE_SINT64, TYPE_SFIXED64 + CPPTYPE_UINT32 = 3, // TYPE_UINT32, TYPE_FIXED32 + CPPTYPE_UINT64 = 4, // TYPE_UINT64, TYPE_FIXED64 + CPPTYPE_DOUBLE = 5, // TYPE_DOUBLE + CPPTYPE_FLOAT = 6, // TYPE_FLOAT + CPPTYPE_BOOL = 7, // TYPE_BOOL + CPPTYPE_ENUM = 8, // TYPE_ENUM + CPPTYPE_STRING = 9, // TYPE_STRING, TYPE_BYTES + CPPTYPE_MESSAGE = 10, // TYPE_MESSAGE, TYPE_GROUP + + MAX_CPPTYPE = 10, // Constant useful for defining lookup tables + // indexed by CppType. + }; + + // Identifies whether the field is optional, required, or repeated. 0 is + // reserved for errors. + enum Label { + LABEL_OPTIONAL = 1, // optional + LABEL_REQUIRED = 2, // required + LABEL_REPEATED = 3, // repeated + + MAX_LABEL = 3, // Constant useful for defining lookup tables + // indexed by Label. + }; + + // Valid field numbers are positive integers up to kMaxNumber. + static const int kMaxNumber = (1 << 29) - 1; + + // First field number reserved for the protocol buffer library implementation. + // Users may not declare fields that use reserved numbers. + static const int kFirstReservedNumber = 19000; + // Last field number reserved for the protocol buffer library implementation. + // Users may not declare fields that use reserved numbers. + static const int kLastReservedNumber = 19999; + + const string& name() const; // Name of this field within the message. + const string& full_name() const; // Fully-qualified name of the field. + const string& json_name() const; // JSON name of this field. + const FileDescriptor* file() const;// File in which this field was defined. + bool is_extension() const; // Is this an extension field? + int number() const; // Declared tag number. + + // Same as name() except converted to lower-case. This (and especially the + // FindFieldByLowercaseName() method) can be useful when parsing formats + // which prefer to use lowercase naming style. (Although, technically + // field names should be lowercased anyway according to the protobuf style + // guide, so this only makes a difference when dealing with old .proto files + // which do not follow the guide.) + const string& lowercase_name() const; + + // Same as name() except converted to camel-case. In this conversion, any + // time an underscore appears in the name, it is removed and the next + // letter is capitalized. Furthermore, the first letter of the name is + // lower-cased. Examples: + // FooBar -> fooBar + // foo_bar -> fooBar + // fooBar -> fooBar + // This (and especially the FindFieldByCamelcaseName() method) can be useful + // when parsing formats which prefer to use camel-case naming style. + const string& camelcase_name() const; + + Type type() const; // Declared type of this field. + const char* type_name() const; // Name of the declared type. + CppType cpp_type() const; // C++ type of this field. + const char* cpp_type_name() const; // Name of the C++ type. 
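// ----------------------------------------------------------------------------
// Illustrative aside (not part of the vendored header): a typical reflective
// walk over a message's fields using the Descriptor and FieldDescriptor
// accessors declared above. MyMessage is a hypothetical generated type.
//
//   const google::protobuf::Descriptor* d = MyMessage::descriptor();
//   for (int i = 0; i < d->field_count(); ++i) {
//     const google::protobuf::FieldDescriptor* f = d->field(i);
//     printf("%s: number=%d type=%s cpp_type=%s\n",
//            f->full_name().c_str(), f->number(),
//            f->type_name(), f->cpp_type_name());
//   }
// ----------------------------------------------------------------------------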
+ Label label() const; // optional/required/repeated + + bool is_required() const; // shorthand for label() == LABEL_REQUIRED + bool is_optional() const; // shorthand for label() == LABEL_OPTIONAL + bool is_repeated() const; // shorthand for label() == LABEL_REPEATED + bool is_packable() const; // shorthand for is_repeated() && + // IsTypePackable(type()) + bool is_packed() const; // shorthand for is_packable() && + // options().packed() + bool is_map() const; // shorthand for type() == TYPE_MESSAGE && + // message_type()->options().map_entry() + + // Index of this field within the message's field array, or the file or + // extension scope's extensions array. + int index() const; + + // Does this field have an explicitly-declared default value? + bool has_default_value() const; + + // Whether the user has specified the json_name field option in the .proto + // file. + bool has_json_name() const; + + // Get the field default value if cpp_type() == CPPTYPE_INT32. If no + // explicit default was defined, the default is 0. + int32 default_value_int32() const; + // Get the field default value if cpp_type() == CPPTYPE_INT64. If no + // explicit default was defined, the default is 0. + int64 default_value_int64() const; + // Get the field default value if cpp_type() == CPPTYPE_UINT32. If no + // explicit default was defined, the default is 0. + uint32 default_value_uint32() const; + // Get the field default value if cpp_type() == CPPTYPE_UINT64. If no + // explicit default was defined, the default is 0. + uint64 default_value_uint64() const; + // Get the field default value if cpp_type() == CPPTYPE_FLOAT. If no + // explicit default was defined, the default is 0.0. + float default_value_float() const; + // Get the field default value if cpp_type() == CPPTYPE_DOUBLE. If no + // explicit default was defined, the default is 0.0. + double default_value_double() const; + // Get the field default value if cpp_type() == CPPTYPE_BOOL. If no + // explicit default was defined, the default is false. + bool default_value_bool() const; + // Get the field default value if cpp_type() == CPPTYPE_ENUM. If no + // explicit default was defined, the default is the first value defined + // in the enum type (all enum types are required to have at least one value). + // This never returns NULL. + const EnumValueDescriptor* default_value_enum() const; + // Get the field default value if cpp_type() == CPPTYPE_STRING. If no + // explicit default was defined, the default is the empty string. + const string& default_value_string() const; + + // The Descriptor for the message of which this is a field. For extensions, + // this is the extended type. Never NULL. + const Descriptor* containing_type() const; + + // If the field is a member of a oneof, this is the one, otherwise this is + // NULL. + const OneofDescriptor* containing_oneof() const; + + // If the field is a member of a oneof, returns the index in that oneof. + int index_in_oneof() const; + + // An extension may be declared within the scope of another message. If this + // field is an extension (is_extension() is true), then extension_scope() + // returns that message, or NULL if the extension was declared at global + // scope. If this is not an extension, extension_scope() is undefined (may + // assert-fail). + const Descriptor* extension_scope() const; + + // If type is TYPE_MESSAGE or TYPE_GROUP, returns a descriptor for the + // message or the group type. Otherwise, returns null. 
+ const Descriptor* message_type() const; + // If type is TYPE_ENUM, returns a descriptor for the enum. Otherwise, + // returns null. + const EnumDescriptor* enum_type() const; + + // Get the FieldOptions for this field. This includes things listed in + // square brackets after the field definition. E.g., the field: + // optional string text = 1 [ctype=CORD]; + // has the "ctype" option set. Allowed options are defined by FieldOptions + // in google/protobuf/descriptor.proto, and any available extensions of that + // message. + const FieldOptions& options() const; + + // See Descriptor::CopyTo(). + void CopyTo(FieldDescriptorProto* proto) const; + + // See Descriptor::DebugString(). + string DebugString() const; + + // See Descriptor::DebugStringWithOptions(). + string DebugStringWithOptions(const DebugStringOptions& options) const; + + // Helper method to get the CppType for a particular Type. + static CppType TypeToCppType(Type type); + + // Helper method to get the name of a Type. + static const char* TypeName(Type type); + + // Helper method to get the name of a CppType. + static const char* CppTypeName(CppType cpp_type); + + // Return true iff [packed = true] is valid for fields of this type. + static inline bool IsTypePackable(Type field_type); + + // Source Location --------------------------------------------------- + + // Updates |*out_location| to the source location of the complete + // extent of this field declaration. Returns false and leaves + // |*out_location| unchanged iff location information was not available. + bool GetSourceLocation(SourceLocation* out_location) const; + + private: + typedef FieldOptions OptionsType; + + // Allows access to GetLocationPath for annotations. + friend class ::google::protobuf::io::Printer; + + // Fill the json_name field of FieldDescriptorProto. + void CopyJsonNameTo(FieldDescriptorProto* proto) const; + + // See Descriptor::DebugString(). + enum PrintLabelFlag { PRINT_LABEL, OMIT_LABEL }; + void DebugString(int depth, PrintLabelFlag print_label_flag, + string* contents, const DebugStringOptions& options) const; + + // formats the default value appropriately and returns it as a string. + // Must have a default value to call this. If quote_string_type is true, then + // types of CPPTYPE_STRING will be surrounded by quotes and CEscaped. + string DefaultValueAsString(bool quote_string_type) const; + + // Helper function that returns the field type name for DebugString. + string FieldTypeNameDebugString() const; + + // Walks up the descriptor tree to generate the source location path + // to this descriptor from the file root. + void GetLocationPath(std::vector<int>* output) const; + + const string* name_; + const string* full_name_; + const string* lowercase_name_; + const string* camelcase_name_; + // Whether the user has specified the json_name field option in the .proto + // file. + bool has_json_name_; + // If has_json_name_ is true, it's the value specified by the user. + // Otherwise, it has the same value as camelcase_name_.
+ const string* json_name_; + const FileDescriptor* file_; + int number_; + Type type_; + Label label_; + bool is_extension_; + int index_in_oneof_; + const Descriptor* containing_type_; + const OneofDescriptor* containing_oneof_; + const Descriptor* extension_scope_; + const Descriptor* message_type_; + const EnumDescriptor* enum_type_; + const FieldOptions* options_; + // IMPORTANT: If you add a new field, make sure to search for all instances + // of Allocate() and AllocateArray() in + // descriptor.cc and update them to initialize the field. + + bool has_default_value_; + union { + int32 default_value_int32_; + int64 default_value_int64_; + uint32 default_value_uint32_; + uint64 default_value_uint64_; + float default_value_float_; + double default_value_double_; + bool default_value_bool_; + + const EnumValueDescriptor* default_value_enum_; + const string* default_value_string_; + }; + + static const CppType kTypeToCppTypeMap[MAX_TYPE + 1]; + + static const char * const kTypeToName[MAX_TYPE + 1]; + + static const char * const kCppTypeToName[MAX_CPPTYPE + 1]; + + static const char * const kLabelToName[MAX_LABEL + 1]; + + // Must be constructed using DescriptorPool. + FieldDescriptor() {} + friend class DescriptorBuilder; + friend class FileDescriptor; + friend class Descriptor; + friend class OneofDescriptor; + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(FieldDescriptor); +}; + +// Describes a oneof defined in a message type. +class LIBPROTOBUF_EXPORT OneofDescriptor { + public: + const string& name() const; // Name of this oneof. + const string& full_name() const; // Fully-qualified name of the oneof. + + // Index of this oneof within the message's oneof array. + int index() const; + + // The Descriptor for the message containing this oneof. + const Descriptor* containing_type() const; + + // The number of (non-extension) fields which are members of this oneof. + int field_count() const; + // Get a member of this oneof, in the order in which they were declared in the + // .proto file. Does not include extensions. + const FieldDescriptor* field(int index) const; + + const OneofOptions& options() const; + + // See Descriptor::CopyTo(). + void CopyTo(OneofDescriptorProto* proto) const; + + // See Descriptor::DebugString(). + string DebugString() const; + + // See Descriptor::DebugStringWithOptions(). + string DebugStringWithOptions(const DebugStringOptions& options) const; + + // Source Location --------------------------------------------------- + + // Updates |*out_location| to the source location of the complete + // extent of this oneof declaration. Returns false and leaves + // |*out_location| unchanged iff location information was not available. + bool GetSourceLocation(SourceLocation* out_location) const; + + private: + typedef OneofOptions OptionsType; + + // Allows access to GetLocationPath for annotations. + friend class ::google::protobuf::io::Printer; + + // See Descriptor::DebugString(). + void DebugString(int depth, string* contents, + const DebugStringOptions& options) const; + + // Walks up the descriptor tree to generate the source location path + // to this descriptor from the file root. 
+ void GetLocationPath(std::vector* output) const; + + const string* name_; + const string* full_name_; + const Descriptor* containing_type_; + bool is_extendable_; + int field_count_; + const FieldDescriptor** fields_; + const OneofOptions* options_; + + // IMPORTANT: If you add a new field, make sure to search for all instances + // of Allocate() and AllocateArray() + // in descriptor.cc and update them to initialize the field. + + // Must be constructed using DescriptorPool. + OneofDescriptor() {} + friend class DescriptorBuilder; + friend class Descriptor; + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(OneofDescriptor); +}; + +// Describes an enum type defined in a .proto file. To get the EnumDescriptor +// for a generated enum type, call TypeName_descriptor(). Use DescriptorPool +// to construct your own descriptors. +class LIBPROTOBUF_EXPORT EnumDescriptor { + public: + // The name of this enum type in the containing scope. + const string& name() const; + + // The fully-qualified name of the enum type, scope delimited by periods. + const string& full_name() const; + + // Index of this enum within the file or containing message's enum array. + int index() const; + + // The .proto file in which this enum type was defined. Never NULL. + const FileDescriptor* file() const; + + // The number of values for this EnumDescriptor. Guaranteed to be greater + // than zero. + int value_count() const; + // Gets a value by index, where 0 <= index < value_count(). + // These are returned in the order they were defined in the .proto file. + const EnumValueDescriptor* value(int index) const; + + // Looks up a value by name. Returns NULL if no such value exists. + const EnumValueDescriptor* FindValueByName(const string& name) const; + // Looks up a value by number. Returns NULL if no such value exists. If + // multiple values have this number, the first one defined is returned. + const EnumValueDescriptor* FindValueByNumber(int number) const; + + // If this enum type is nested in a message type, this is that message type. + // Otherwise, NULL. + const Descriptor* containing_type() const; + + // Get options for this enum type. These are specified in the .proto file by + // placing lines like "option foo = 1234;" in the enum definition. Allowed + // options are defined by EnumOptions in google/protobuf/descriptor.proto, + // and any available extensions of that message. + const EnumOptions& options() const; + + // See Descriptor::CopyTo(). + void CopyTo(EnumDescriptorProto* proto) const; + + // See Descriptor::DebugString(). + string DebugString() const; + + // See Descriptor::DebugStringWithOptions(). + string DebugStringWithOptions(const DebugStringOptions& options) const; + + + // Returns true if this is a placeholder for an unknown enum. This will + // only be the case if this descriptor comes from a DescriptorPool + // with AllowUnknownDependencies() set. + bool is_placeholder() const; + + // Source Location --------------------------------------------------- + + // Updates |*out_location| to the source location of the complete + // extent of this enum declaration. Returns false and leaves + // |*out_location| unchanged iff location information was not available. + bool GetSourceLocation(SourceLocation* out_location) const; + + private: + typedef EnumOptions OptionsType; + + // Allows access to GetLocationPath for annotations. + friend class ::google::protobuf::io::Printer; + + // Looks up a value by number. 
If the value does not exist, dynamically + // creates a new EnumValueDescriptor for that value, assuming that it was + // unknown. If a new descriptor is created, this is done in a thread-safe way, + // and future calls will return the same value descriptor pointer. + // + // This is private but is used by GeneratedMessageReflection (which is + // friended below) to return a valid EnumValueDescriptor from GetEnum() when + // this feature is enabled. + const EnumValueDescriptor* + FindValueByNumberCreatingIfUnknown(int number) const; + + + // See Descriptor::DebugString(). + void DebugString(int depth, string *contents, + const DebugStringOptions& options) const; + + // Walks up the descriptor tree to generate the source location path + // to this descriptor from the file root. + void GetLocationPath(std::vector* output) const; + + const string* name_; + const string* full_name_; + const FileDescriptor* file_; + const Descriptor* containing_type_; + const EnumOptions* options_; + + // True if this is a placeholder for an unknown type. + bool is_placeholder_; + // True if this is a placeholder and the type name wasn't fully-qualified. + bool is_unqualified_placeholder_; + + int value_count_; + EnumValueDescriptor* values_; + // IMPORTANT: If you add a new field, make sure to search for all instances + // of Allocate() and AllocateArray() in + // descriptor.cc and update them to initialize the field. + + // Must be constructed using DescriptorPool. + EnumDescriptor() {} + friend class DescriptorBuilder; + friend class Descriptor; + friend class FieldDescriptor; + friend class EnumValueDescriptor; + friend class FileDescriptor; + friend class internal::GeneratedMessageReflection; + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(EnumDescriptor); +}; + +// Describes an individual enum constant of a particular type. To get the +// EnumValueDescriptor for a given enum value, first get the EnumDescriptor +// for its type, then use EnumDescriptor::FindValueByName() or +// EnumDescriptor::FindValueByNumber(). Use DescriptorPool to construct +// your own descriptors. +class LIBPROTOBUF_EXPORT EnumValueDescriptor { + public: + const string& name() const; // Name of this enum constant. + int index() const; // Index within the enums's Descriptor. + int number() const; // Numeric value of this enum constant. + + // The full_name of an enum value is a sibling symbol of the enum type. + // e.g. the full name of FieldDescriptorProto::TYPE_INT32 is actually + // "google.protobuf.FieldDescriptorProto.TYPE_INT32", NOT + // "google.protobuf.FieldDescriptorProto.Type.TYPE_INT32". This is to conform + // with C++ scoping rules for enums. + const string& full_name() const; + + // The type of this value. Never NULL. + const EnumDescriptor* type() const; + + // Get options for this enum value. These are specified in the .proto file + // by adding text like "[foo = 1234]" after an enum value definition. + // Allowed options are defined by EnumValueOptions in + // google/protobuf/descriptor.proto, and any available extensions of that + // message. + const EnumValueOptions& options() const; + + // See Descriptor::CopyTo(). + void CopyTo(EnumValueDescriptorProto* proto) const; + + // See Descriptor::DebugString(). + string DebugString() const; + + // See Descriptor::DebugStringWithOptions(). 
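// ----------------------------------------------------------------------------
// Illustrative aside (not part of the vendored header): value lookup on the
// EnumDescriptor declared above, obtained via the generated
// TypeName_descriptor() function mentioned in its class comment. Color,
// "RED", and the value numbers are hypothetical.
//
//   const google::protobuf::EnumDescriptor* e = Color_descriptor();
//   const google::protobuf::EnumValueDescriptor* red =
//       e->FindValueByName("RED");
//   if (red != NULL) {
//     printf("%s = %d\n", red->full_name().c_str(), red->number());
//   }
//   // Number lookup returns the first value defined with that number, or
//   // NULL if none exists.
//   const google::protobuf::EnumValueDescriptor* zero = e->FindValueByNumber(0);
// ----------------------------------------------------------------------------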
+ string DebugStringWithOptions(const DebugStringOptions& options) const; + + + // Source Location --------------------------------------------------- + + // Updates |*out_location| to the source location of the complete + // extent of this enum value declaration. Returns false and leaves + // |*out_location| unchanged iff location information was not available. + bool GetSourceLocation(SourceLocation* out_location) const; + + private: + typedef EnumValueOptions OptionsType; + + // Allows access to GetLocationPath for annotations. + friend class ::google::protobuf::io::Printer; + + // See Descriptor::DebugString(). + void DebugString(int depth, string *contents, + const DebugStringOptions& options) const; + + // Walks up the descriptor tree to generate the source location path + // to this descriptor from the file root. + void GetLocationPath(std::vector* output) const; + + const string* name_; + const string* full_name_; + int number_; + const EnumDescriptor* type_; + const EnumValueOptions* options_; + // IMPORTANT: If you add a new field, make sure to search for all instances + // of Allocate() and AllocateArray() + // in descriptor.cc and update them to initialize the field. + + // Must be constructed using DescriptorPool. + EnumValueDescriptor() {} + friend class DescriptorBuilder; + friend class EnumDescriptor; + friend class FileDescriptorTables; + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(EnumValueDescriptor); +}; + +// Describes an RPC service. To get the ServiceDescriptor for a service, +// call Service::GetDescriptor(). Generated service classes also have a +// static method called descriptor() which returns the type's +// ServiceDescriptor. Use DescriptorPool to construct your own descriptors. +class LIBPROTOBUF_EXPORT ServiceDescriptor { + public: + // The name of the service, not including its containing scope. + const string& name() const; + // The fully-qualified name of the service, scope delimited by periods. + const string& full_name() const; + // Index of this service within the file's services array. + int index() const; + + // The .proto file in which this service was defined. Never NULL. + const FileDescriptor* file() const; + + // Get options for this service type. These are specified in the .proto file + // by placing lines like "option foo = 1234;" in the service definition. + // Allowed options are defined by ServiceOptions in + // google/protobuf/descriptor.proto, and any available extensions of that + // message. + const ServiceOptions& options() const; + + // The number of methods this service defines. + int method_count() const; + // Gets a MethodDescriptor by index, where 0 <= index < method_count(). + // These are returned in the order they were defined in the .proto file. + const MethodDescriptor* method(int index) const; + + // Look up a MethodDescriptor by name. + const MethodDescriptor* FindMethodByName(const string& name) const; + // See Descriptor::CopyTo(). + void CopyTo(ServiceDescriptorProto* proto) const; + + // See Descriptor::DebugString(). + string DebugString() const; + + // See Descriptor::DebugStringWithOptions(). + string DebugStringWithOptions(const DebugStringOptions& options) const; + + + // Source Location --------------------------------------------------- + + // Updates |*out_location| to the source location of the complete + // extent of this service declaration. Returns false and leaves + // |*out_location| unchanged iff location information was not available. 
+ bool GetSourceLocation(SourceLocation* out_location) const; + + private: + typedef ServiceOptions OptionsType; + + // Allows access to GetLocationPath for annotations. + friend class ::google::protobuf::io::Printer; + + // See Descriptor::DebugString(). + void DebugString(string *contents, const DebugStringOptions& options) const; + + // Walks up the descriptor tree to generate the source location path + // to this descriptor from the file root. + void GetLocationPath(std::vector* output) const; + + const string* name_; + const string* full_name_; + const FileDescriptor* file_; + const ServiceOptions* options_; + int method_count_; + MethodDescriptor* methods_; + // IMPORTANT: If you add a new field, make sure to search for all instances + // of Allocate() and AllocateArray() in + // descriptor.cc and update them to initialize the field. + + // Must be constructed using DescriptorPool. + ServiceDescriptor() {} + friend class DescriptorBuilder; + friend class FileDescriptor; + friend class MethodDescriptor; + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ServiceDescriptor); +}; + +// Describes an individual service method. To obtain a MethodDescriptor given +// a service, first get its ServiceDescriptor, then call +// ServiceDescriptor::FindMethodByName(). Use DescriptorPool to construct your +// own descriptors. +class LIBPROTOBUF_EXPORT MethodDescriptor { + public: + // Name of this method, not including containing scope. + const string& name() const; + // The fully-qualified name of the method, scope delimited by periods. + const string& full_name() const; + // Index within the service's Descriptor. + int index() const; + + // Gets the service to which this method belongs. Never NULL. + const ServiceDescriptor* service() const; + + // Gets the type of protocol message which this method accepts as input. + const Descriptor* input_type() const; + // Gets the type of protocol message which this message produces as output. + const Descriptor* output_type() const; + + // Gets whether the client streams multiple requests. + bool client_streaming() const; + // Gets whether the server streams multiple responses. + bool server_streaming() const; + + // Get options for this method. These are specified in the .proto file by + // placing lines like "option foo = 1234;" in curly-braces after a method + // declaration. Allowed options are defined by MethodOptions in + // google/protobuf/descriptor.proto, and any available extensions of that + // message. + const MethodOptions& options() const; + + // See Descriptor::CopyTo(). + void CopyTo(MethodDescriptorProto* proto) const; + + // See Descriptor::DebugString(). + string DebugString() const; + + // See Descriptor::DebugStringWithOptions(). + string DebugStringWithOptions(const DebugStringOptions& options) const; + + + // Source Location --------------------------------------------------- + + // Updates |*out_location| to the source location of the complete + // extent of this method declaration. Returns false and leaves + // |*out_location| unchanged iff location information was not available. + bool GetSourceLocation(SourceLocation* out_location) const; + + private: + typedef MethodOptions OptionsType; + + // Allows access to GetLocationPath for annotations. + friend class ::google::protobuf::io::Printer; + + // See Descriptor::DebugString(). + void DebugString(int depth, string *contents, + const DebugStringOptions& options) const; + + // Walks up the descriptor tree to generate the source location path + // to this descriptor from the file root. 
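// ----------------------------------------------------------------------------
// Illustrative aside (not part of the vendored header): navigating the
// ServiceDescriptor and MethodDescriptor declared above. MyService and its
// "Ping" method are hypothetical names.
//
//   const google::protobuf::ServiceDescriptor* s = MyService::descriptor();
//   const google::protobuf::MethodDescriptor* m = s->FindMethodByName("Ping");
//   if (m != NULL) {
//     printf("%s(%s) returns %s\n", m->full_name().c_str(),
//            m->input_type()->full_name().c_str(),
//            m->output_type()->full_name().c_str());
//   }
// ----------------------------------------------------------------------------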
+
+ private:
+  typedef MethodOptions OptionsType;
+
+  // Allows access to GetLocationPath for annotations.
+  friend class ::google::protobuf::io::Printer;
+
+  // See Descriptor::DebugString().
+  void DebugString(int depth, string *contents,
+                   const DebugStringOptions& options) const;
+
+  // Walks up the descriptor tree to generate the source location path
+  // to this descriptor from the file root.
+  void GetLocationPath(std::vector<int>* output) const;
+
+  const string* name_;
+  const string* full_name_;
+  const ServiceDescriptor* service_;
+  const Descriptor* input_type_;
+  const Descriptor* output_type_;
+  const MethodOptions* options_;
+  bool client_streaming_;
+  bool server_streaming_;
+  // IMPORTANT: If you add a new field, make sure to search for all instances
+  // of Allocate<MethodDescriptor>() and AllocateArray<MethodDescriptor>() in
+  // descriptor.cc and update them to initialize the field.
+
+  // Must be constructed using DescriptorPool.
+  MethodDescriptor() {}
+  friend class DescriptorBuilder;
+  friend class ServiceDescriptor;
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(MethodDescriptor);
+};
+
+
+// Describes a whole .proto file. To get the FileDescriptor for a compiled-in
+// file, get the descriptor for something defined in that file and call
+// descriptor->file(). Use DescriptorPool to construct your own descriptors.
+class LIBPROTOBUF_EXPORT FileDescriptor {
+ public:
+  // The filename, relative to the source tree.
+  // e.g. "google/protobuf/descriptor.proto"
+  const string& name() const;
+
+  // The package, e.g. "google.protobuf.compiler".
+  const string& package() const;
+
+  // The DescriptorPool in which this FileDescriptor and all its contents were
+  // allocated. Never NULL.
+  const DescriptorPool* pool() const;
+
+  // The number of files imported by this one.
+  int dependency_count() const;
+  // Gets an imported file by index, where 0 <= index < dependency_count().
+  // These are returned in the order they were defined in the .proto file.
+  const FileDescriptor* dependency(int index) const;
+
+  // The number of files public imported by this one.
+  // The public dependency list is a subset of the dependency list.
+  int public_dependency_count() const;
+  // Gets a public imported file by index, where 0 <= index <
+  // public_dependency_count().
+  // These are returned in the order they were defined in the .proto file.
+  const FileDescriptor* public_dependency(int index) const;
+
+  // The number of files that are imported for weak fields.
+  // The weak dependency list is a subset of the dependency list.
+  int weak_dependency_count() const;
+  // Gets a weak imported file by index, where 0 <= index <
+  // weak_dependency_count().
+  // These are returned in the order they were defined in the .proto file.
+  const FileDescriptor* weak_dependency(int index) const;
+
+  // Number of top-level message types defined in this file. (This does not
+  // include nested types.)
+  int message_type_count() const;
+  // Gets a top-level message type, where 0 <= index < message_type_count().
+  // These are returned in the order they were defined in the .proto file.
+  const Descriptor* message_type(int index) const;
+
+  // Number of top-level enum types defined in this file. (This does not
+  // include nested types.)
+  int enum_type_count() const;
+  // Gets a top-level enum type, where 0 <= index < enum_type_count().
+  // These are returned in the order they were defined in the .proto file.
+  const EnumDescriptor* enum_type(int index) const;
+
+  // Number of services defined in this file.
+  int service_count() const;
+  // Gets a service, where 0 <= index < service_count().
+  // These are returned in the order they were defined in the .proto file.
+  const ServiceDescriptor* service(int index) const;
+
+  // Number of extensions defined at file scope. (This does not include
+  // extensions nested within message types.)
+  int extension_count() const;
+  // Gets an extension's descriptor, where 0 <= index < extension_count().
+ // These are returned in the order they were defined in the .proto file. + const FieldDescriptor* extension(int index) const; + + // Get options for this file. These are specified in the .proto file by + // placing lines like "option foo = 1234;" at the top level, outside of any + // other definitions. Allowed options are defined by FileOptions in + // google/protobuf/descriptor.proto, and any available extensions of that + // message. + const FileOptions& options() const; + + // Syntax of this file. + enum Syntax { + SYNTAX_UNKNOWN = 0, + SYNTAX_PROTO2 = 2, + SYNTAX_PROTO3 = 3, + }; + Syntax syntax() const; + static const char* SyntaxName(Syntax syntax); + + // Find a top-level message type by name. Returns NULL if not found. + const Descriptor* FindMessageTypeByName(const string& name) const; + // Find a top-level enum type by name. Returns NULL if not found. + const EnumDescriptor* FindEnumTypeByName(const string& name) const; + // Find an enum value defined in any top-level enum by name. Returns NULL if + // not found. + const EnumValueDescriptor* FindEnumValueByName(const string& name) const; + // Find a service definition by name. Returns NULL if not found. + const ServiceDescriptor* FindServiceByName(const string& name) const; + // Find a top-level extension definition by name. Returns NULL if not found. + const FieldDescriptor* FindExtensionByName(const string& name) const; + // Similar to FindExtensionByName(), but searches by lowercased-name. See + // Descriptor::FindFieldByLowercaseName(). + const FieldDescriptor* FindExtensionByLowercaseName(const string& name) const; + // Similar to FindExtensionByName(), but searches by camelcased-name. See + // Descriptor::FindFieldByCamelcaseName(). + const FieldDescriptor* FindExtensionByCamelcaseName(const string& name) const; + + // See Descriptor::CopyTo(). + // Notes: + // - This method does NOT copy source code information since it is relatively + // large and rarely needed. See CopySourceCodeInfoTo() below. + void CopyTo(FileDescriptorProto* proto) const; + // Write the source code information of this FileDescriptor into the given + // FileDescriptorProto. See CopyTo() above. + void CopySourceCodeInfoTo(FileDescriptorProto* proto) const; + // Fill the json_name field of FieldDescriptorProto for all fields. Can only + // be called after CopyTo(). + void CopyJsonNameTo(FileDescriptorProto* proto) const; + + // See Descriptor::DebugString(). + string DebugString() const; + + // See Descriptor::DebugStringWithOptions(). + string DebugStringWithOptions(const DebugStringOptions& options) const; + + // Returns true if this is a placeholder for an unknown file. This will + // only be the case if this descriptor comes from a DescriptorPool + // with AllowUnknownDependencies() set. + bool is_placeholder() const; + + // Updates |*out_location| to the source location of the complete extent of + // this file declaration (namely, the empty path). + bool GetSourceLocation(SourceLocation* out_location) const; + + // Updates |*out_location| to the source location of the complete + // extent of the declaration or declaration-part denoted by |path|. + // Returns false and leaves |*out_location| unchanged iff location + // information was not available. (See SourceCodeInfo for + // description of path encoding.) 
+  bool GetSourceLocation(const std::vector<int>& path,
+                         SourceLocation* out_location) const;
+
+ private:
+  typedef FileOptions OptionsType;
+
+  const string* name_;
+  const string* package_;
+  const DescriptorPool* pool_;
+  int dependency_count_;
+  const FileDescriptor** dependencies_;
+  int public_dependency_count_;
+  int* public_dependencies_;
+  int weak_dependency_count_;
+  int* weak_dependencies_;
+  int message_type_count_;
+  Descriptor* message_types_;
+  int enum_type_count_;
+  EnumDescriptor* enum_types_;
+  int service_count_;
+  ServiceDescriptor* services_;
+  int extension_count_;
+  Syntax syntax_;
+  bool is_placeholder_;
+  FieldDescriptor* extensions_;
+  const FileOptions* options_;
+
+  const FileDescriptorTables* tables_;
+  const SourceCodeInfo* source_code_info_;
+  // IMPORTANT: If you add a new field, make sure to search for all instances
+  // of Allocate<FileDescriptor>() and AllocateArray<FileDescriptor>() in
+  // descriptor.cc and update them to initialize the field.
+
+  FileDescriptor() {}
+  friend class DescriptorBuilder;
+  friend class Descriptor;
+  friend class FieldDescriptor;
+  friend class OneofDescriptor;
+  friend class EnumDescriptor;
+  friend class EnumValueDescriptor;
+  friend class MethodDescriptor;
+  friend class ServiceDescriptor;
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(FileDescriptor);
+};
+
+// ===================================================================
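+
+// A minimal traversal sketch for FileDescriptor (hedged: MyMessage stands for
+// any generated message type; only the API declared above is assumed):
+//
+//   const FileDescriptor* file = MyMessage::descriptor()->file();
+//   for (int i = 0; i < file->message_type_count(); i++) {
+//     const Descriptor* message = file->message_type(i);
+//     // Inspect each top-level message defined in this .proto file.
+//   }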
+
+// Used to construct descriptors.
+//
+// Normally you won't want to build your own descriptors. Message classes
+// constructed by the protocol compiler will provide them for you. However,
+// if you are implementing Message on your own, or if you are writing a
+// program which can operate on totally arbitrary types and needs to load
+// them from some sort of database, you might need to.
+//
+// Since Descriptors are composed of a whole lot of cross-linked bits of
+// data that would be a pain to put together manually, the
+// DescriptorPool class is provided to make the process easier. It can
+// take a FileDescriptorProto (defined in descriptor.proto), validate it,
+// and convert it to a set of nicely cross-linked Descriptors.
+//
+// DescriptorPool also helps with memory management. Descriptors are
+// composed of many objects containing static data and pointers to each
+// other. In all likelihood, when it comes time to delete this data,
+// you'll want to delete it all at once. In fact, it is not uncommon to
+// have a whole pool of descriptors all cross-linked with each other which
+// you wish to delete all at once. This class represents such a pool, and
+// handles the memory management for you.
+//
+// You can also search for descriptors within a DescriptorPool by name, and
+// extensions by number.
+class LIBPROTOBUF_EXPORT DescriptorPool {
+ public:
+  // Create a normal, empty DescriptorPool.
+  DescriptorPool();
+
+  // Constructs a DescriptorPool that, when it can't find something among the
+  // descriptors already in the pool, looks for it in the given
+  // DescriptorDatabase.
+  // Notes:
+  // - If a DescriptorPool is constructed this way, its BuildFile*() methods
+  //   must not be called (they will assert-fail). The only way to populate
+  //   the pool with descriptors is to call the Find*By*() methods.
+  // - The Find*By*() methods may block the calling thread if the
+  //   DescriptorDatabase blocks. This in turn means that parsing messages
+  //   may block if they need to look up extensions.
+  // - The Find*By*() methods will use mutexes for thread-safety, thus making
+  //   them slower even when they don't have to fall back to the database.
+  //   In fact, even the Find*By*() methods of descriptor objects owned by
+  //   this pool will be slower, since they will have to obtain locks too.
+  // - An ErrorCollector may optionally be given to collect validation errors
+  //   in files loaded from the database. If not given, errors will be printed
+  //   to GOOGLE_LOG(ERROR). Remember that files are built on-demand, so this
+  //   ErrorCollector may be called from any thread that calls one of the
+  //   Find*By*() methods.
+  // - The DescriptorDatabase must not be mutated during the lifetime of
+  //   the DescriptorPool. Even if the client takes care to avoid data races,
+  //   changes to the content of the DescriptorDatabase may not be reflected
+  //   in subsequent lookups in the DescriptorPool.
+  class ErrorCollector;
+  explicit DescriptorPool(DescriptorDatabase* fallback_database,
+                          ErrorCollector* error_collector = NULL);
+
+  ~DescriptorPool();
+
+  // Get a pointer to the generated pool. Generated protocol message classes
+  // which are compiled into the binary will allocate their descriptors in
+  // this pool. Do not add your own descriptors to this pool.
+  static const DescriptorPool* generated_pool();
+
+
+  // Find a FileDescriptor in the pool by file name. Returns NULL if not
+  // found.
+  const FileDescriptor* FindFileByName(const string& name) const;
+
+  // Find the FileDescriptor in the pool which defines the given symbol.
+  // If any of the Find*ByName() methods below would succeed, then this is
+  // equivalent to calling that method and calling the result's file() method.
+  // Otherwise this returns NULL.
+  const FileDescriptor* FindFileContainingSymbol(
+      const string& symbol_name) const;
+
+  // Looking up descriptors ------------------------------------------
+  // These find descriptors by fully-qualified name. These will find both
+  // top-level descriptors and nested descriptors. They return NULL if not
+  // found.
+
+  const Descriptor* FindMessageTypeByName(const string& name) const;
+  const FieldDescriptor* FindFieldByName(const string& name) const;
+  const FieldDescriptor* FindExtensionByName(const string& name) const;
+  const OneofDescriptor* FindOneofByName(const string& name) const;
+  const EnumDescriptor* FindEnumTypeByName(const string& name) const;
+  const EnumValueDescriptor* FindEnumValueByName(const string& name) const;
+  const ServiceDescriptor* FindServiceByName(const string& name) const;
+  const MethodDescriptor* FindMethodByName(const string& name) const;
+
+  // Finds an extension of the given type by number. The extendee must be
+  // a member of this DescriptorPool or one of its underlays.
+  const FieldDescriptor* FindExtensionByNumber(const Descriptor* extendee,
+                                               int number) const;
+
+  // Finds extensions of extendee. The extensions will be appended to
+  // out in an undefined order. Only extensions defined directly in
+  // this DescriptorPool or one of its underlays are guaranteed to be
+  // found: extensions defined in the fallback database might not be found
+  // depending on the database implementation.
+  void FindAllExtensions(const Descriptor* extendee,
+                         std::vector<const FieldDescriptor*>* out) const;
+
+  // Building descriptors --------------------------------------------
+
+  // When converting a FileDescriptorProto to a FileDescriptor, various
+  // errors might be detected in the input. The caller may handle these
+  // programmatically by implementing an ErrorCollector.
+  class LIBPROTOBUF_EXPORT ErrorCollector {
+   public:
+    inline ErrorCollector() {}
+    virtual ~ErrorCollector();
+
+    // These constants specify what exact part of the construct is broken.
+    // This is useful e.g. for mapping the error back to an exact location
+    // in a .proto file.
+    enum ErrorLocation {
+      NAME,           // the symbol name, or the package name for files
+      NUMBER,         // field or extension range number
+      TYPE,           // field type
+      EXTENDEE,       // field extendee
+      DEFAULT_VALUE,  // field default value
+      INPUT_TYPE,     // method input type
+      OUTPUT_TYPE,    // method output type
+      OPTION_NAME,    // name in assignment
+      OPTION_VALUE,   // value in option assignment
+      OTHER           // some other problem
+    };
+
+    // Reports an error in the FileDescriptorProto. Use this function if the
+    // problem should interrupt building the FileDescriptorProto.
+    virtual void AddError(
+        const string& filename,      // File name in which the error occurred.
+        const string& element_name,  // Full name of the erroneous element.
+        const Message* descriptor,   // Descriptor of the erroneous element.
+        ErrorLocation location,      // One of the location constants, above.
+        const string& message        // Human-readable error message.
+        ) = 0;
+
+    // Reports a warning in the FileDescriptorProto. Use this function if the
+    // problem should NOT interrupt building the FileDescriptorProto.
+    virtual void AddWarning(
+        const string& /*filename*/,      // File name in which the error occurred.
+        const string& /*element_name*/,  // Full name of the erroneous element.
+        const Message* /*descriptor*/,   // Descriptor of the erroneous element.
+        ErrorLocation /*location*/,      // One of the location constants, above.
+        const string& /*message*/        // Human-readable error message.
+        ) {}
+
+   private:
+    GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
+  };
+
+  // Convert the FileDescriptorProto to real descriptors and place them in
+  // this DescriptorPool. All dependencies of the file must already be in
+  // the pool. Returns the resulting FileDescriptor, or NULL if there were
+  // problems with the input (e.g. the message was invalid, or dependencies
+  // were missing). Details about the errors are written to GOOGLE_LOG(ERROR).
+  const FileDescriptor* BuildFile(const FileDescriptorProto& proto);
+
+  // Same as BuildFile() except errors are sent to the given ErrorCollector.
+  const FileDescriptor* BuildFileCollectingErrors(
+      const FileDescriptorProto& proto,
+      ErrorCollector* error_collector);
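+
+  // A small sketch of collecting build errors (hedged: MyErrorCollector and
+  // `proto` are illustrative names; only BuildFileCollectingErrors() and the
+  // ErrorCollector interface above are assumed):
+  //
+  //   class MyErrorCollector : public DescriptorPool::ErrorCollector {
+  //    public:
+  //     virtual void AddError(const string& filename,
+  //                           const string& element_name,
+  //                           const Message* descriptor,
+  //                           ErrorLocation location,
+  //                           const string& message) {
+  //       GOOGLE_LOG(ERROR) << filename << ": " << element_name
+  //                         << ": " << message;
+  //     }
+  //   };
+  //
+  //   MyErrorCollector collector;
+  //   DescriptorPool pool;
+  //   const FileDescriptor* file =
+  //       pool.BuildFileCollectingErrors(proto, &collector);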
+
+  // By default, it is an error if a FileDescriptorProto contains references
+  // to types or other files that are not found in the DescriptorPool (or its
+  // backing DescriptorDatabase, if any). If you call
+  // AllowUnknownDependencies(), however, then unknown types and files
+  // will be replaced by placeholder descriptors (which can be identified by
+  // the is_placeholder() method). This can allow you to
+  // perform some useful operations with a .proto file even if you do not
+  // have access to other .proto files on which it depends. However, some
+  // heuristics must be used to fill in the gaps in information, and these
+  // can lead to descriptors which are inaccurate. For example, the
+  // DescriptorPool may be forced to guess whether an unknown type is a message
+  // or an enum, as well as what package it resides in. Furthermore,
+  // placeholder types will not be discoverable via FindMessageTypeByName()
+  // and similar methods, which could confuse some descriptor-based algorithms.
+  // Generally, the results of this option should be handled with extreme care.
+  void AllowUnknownDependencies() { allow_unknown_ = true; }
+
+  // By default, weak imports are allowed to be missing, in which case we will
+  // use a placeholder for the dependency and convert the field to be an Empty
+  // message field. If you call EnforceWeakDependencies(true), however, the
+  // DescriptorPool will report an "import not found" error.
+  void EnforceWeakDependencies(bool enforce) { enforce_weak_ = enforce; }
+
+  // Internal stuff --------------------------------------------------
+  // These methods MUST NOT be called from outside the proto2 library.
+  // These methods may contain hidden pitfalls and may be removed in a
+  // future library version.
+
+  // Create a DescriptorPool which is overlaid on top of some other pool.
+  // If you search for a descriptor in the overlay and it is not found, the
+  // underlay will be searched as a backup. If the underlay has its own
+  // underlay, that will be searched next, and so on. This also means that
+  // files built in the overlay will be cross-linked with the underlay's
+  // descriptors if necessary. The underlay remains property of the caller;
+  // it must remain valid for the lifetime of the newly-constructed pool.
+  //
+  // Example: Say you want to parse a .proto file at runtime in order to use
+  // its type with a DynamicMessage. Say this .proto file has dependencies,
+  // but you know that all the dependencies will be things that are already
+  // compiled into the binary. For ease of use, you'd like to load the types
+  // right out of generated_pool() rather than have to parse redundant copies
+  // of all these .protos at runtime. But, you don't want to add the parsed
+  // types directly into generated_pool(): this is not allowed, and would be
+  // bad design anyway. So, instead, you could use generated_pool() as an
+  // underlay for a new DescriptorPool in which you add only the new file.
+  //
+  // WARNING: Use of underlays can lead to many subtle gotchas. Instead,
+  // try to formulate what you want to do in terms of DescriptorDatabases.
+  explicit DescriptorPool(const DescriptorPool* underlay);
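+
+  // A minimal sketch of the example above (hedged: `runtime_proto` is a
+  // hypothetical FileDescriptorProto parsed at runtime; only the constructors
+  // declared here are assumed):
+  //
+  //   DescriptorPool pool(DescriptorPool::generated_pool());  // underlay
+  //   const FileDescriptor* file = pool.BuildFile(runtime_proto);
+  //   // Types in runtime_proto may now reference compiled-in dependencies.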
+
+  // Called by generated classes at init time to add their descriptors to
+  // generated_pool. Do NOT call this in your own code! filename must be a
+  // permanent string (e.g. a string literal).
+  static void InternalAddGeneratedFile(
+      const void* encoded_file_descriptor, int size);
+
+
+  // For internal use only: Gets a non-const pointer to the generated pool.
+  // This is called at static-initialization time only, so thread-safety is
+  // not a concern. If both an underlay and a fallback database are present,
+  // the underlay takes precedence.
+  static DescriptorPool* internal_generated_pool();
+
+  // For internal use only: Changes the behavior of BuildFile() such that it
+  // allows the file to make reference to message types declared in other files
+  // which it did not officially declare as dependencies.
+  void InternalDontEnforceDependencies();
+
+  // For internal use only.
+  void internal_set_underlay(const DescriptorPool* underlay) {
+    underlay_ = underlay;
+  }
+
+  // For internal (unit test) use only: Returns true if a FileDescriptor has
+  // been constructed for the given file, false otherwise. Useful for testing
+  // lazy descriptor initialization behavior.
+  bool InternalIsFileLoaded(const string& filename) const;
+
+
+  // Add a file to unused_import_track_files_. DescriptorBuilder will log
+  // warnings for those files if there is any unused import.
+  void AddUnusedImportTrackFile(const string& file_name);
+  void ClearUnusedImportTrackFiles();
+
+ private:
+  friend class Descriptor;
+  friend class FieldDescriptor;
+  friend class EnumDescriptor;
+  friend class ServiceDescriptor;
+  friend class FileDescriptor;
+  friend class DescriptorBuilder;
+  friend class FileDescriptorTables;
+
+  // Return true if the given name is a sub-symbol of any non-package
+  // descriptor that already exists in the descriptor pool. (The full
+  // definition of such types is already known.)
+  bool IsSubSymbolOfBuiltType(const string& name) const;
+
+  // Tries to find something in the fallback database and link in the
+  // corresponding proto file. Returns true if successful, in which case
+  // the caller should search for the thing again. These are declared
+  // const because they are called by (semantically) const methods.
+  bool TryFindFileInFallbackDatabase(const string& name) const;
+  bool TryFindSymbolInFallbackDatabase(const string& name) const;
+  bool TryFindExtensionInFallbackDatabase(const Descriptor* containing_type,
+                                          int field_number) const;
+
+  // Like BuildFile() but called internally when the file has been loaded from
+  // fallback_database_. Declared const because it is called by (semantically)
+  // const methods.
+  const FileDescriptor* BuildFileFromDatabase(
+      const FileDescriptorProto& proto) const;
+
+  // If fallback_database_ is NULL, this is NULL. Otherwise, this is a mutex
+  // which must be locked while accessing tables_.
+  Mutex* mutex_;
+
+  // See constructor.
+  DescriptorDatabase* fallback_database_;
+  ErrorCollector* default_error_collector_;
+  const DescriptorPool* underlay_;
+
+  // This class contains a lot of hash maps with complicated types that
+  // we'd like to keep out of the header.
+  class Tables;
+  google::protobuf::scoped_ptr<Tables> tables_;
+
+  bool enforce_dependencies_;
+  bool allow_unknown_;
+  bool enforce_weak_;
+  std::set<string> unused_import_track_files_;
+
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(DescriptorPool);
+};
+
+// inline methods ====================================================
+
+// These macros make this repetitive code more readable.
+#define PROTOBUF_DEFINE_ACCESSOR(CLASS, FIELD, TYPE) \
+  inline TYPE CLASS::FIELD() const { return FIELD##_; }
+
+// Strings fields are stored as pointers but returned as const references.
+#define PROTOBUF_DEFINE_STRING_ACCESSOR(CLASS, FIELD) \
+  inline const string& CLASS::FIELD() const { return *FIELD##_; }
+
+// Arrays take an index parameter, obviously.
+#define PROTOBUF_DEFINE_ARRAY_ACCESSOR(CLASS, FIELD, TYPE) \ + inline TYPE CLASS::FIELD(int index) const { return FIELD##s_ + index; } + +#define PROTOBUF_DEFINE_OPTIONS_ACCESSOR(CLASS, TYPE) \ + inline const TYPE& CLASS::options() const { return *options_; } + +PROTOBUF_DEFINE_STRING_ACCESSOR(Descriptor, name) +PROTOBUF_DEFINE_STRING_ACCESSOR(Descriptor, full_name) +PROTOBUF_DEFINE_ACCESSOR(Descriptor, file, const FileDescriptor*) +PROTOBUF_DEFINE_ACCESSOR(Descriptor, containing_type, const Descriptor*) + +PROTOBUF_DEFINE_ACCESSOR(Descriptor, field_count, int) +PROTOBUF_DEFINE_ACCESSOR(Descriptor, oneof_decl_count, int) +PROTOBUF_DEFINE_ACCESSOR(Descriptor, nested_type_count, int) +PROTOBUF_DEFINE_ACCESSOR(Descriptor, enum_type_count, int) + +PROTOBUF_DEFINE_ARRAY_ACCESSOR(Descriptor, field, const FieldDescriptor*) +PROTOBUF_DEFINE_ARRAY_ACCESSOR(Descriptor, oneof_decl, const OneofDescriptor*) +PROTOBUF_DEFINE_ARRAY_ACCESSOR(Descriptor, nested_type, const Descriptor*) +PROTOBUF_DEFINE_ARRAY_ACCESSOR(Descriptor, enum_type, const EnumDescriptor*) + +PROTOBUF_DEFINE_ACCESSOR(Descriptor, extension_range_count, int) +PROTOBUF_DEFINE_ACCESSOR(Descriptor, extension_count, int) +PROTOBUF_DEFINE_ARRAY_ACCESSOR(Descriptor, extension_range, + const Descriptor::ExtensionRange*) +PROTOBUF_DEFINE_ARRAY_ACCESSOR(Descriptor, extension, + const FieldDescriptor*) + +PROTOBUF_DEFINE_ACCESSOR(Descriptor, reserved_range_count, int) +PROTOBUF_DEFINE_ARRAY_ACCESSOR(Descriptor, reserved_range, + const Descriptor::ReservedRange*) +PROTOBUF_DEFINE_ACCESSOR(Descriptor, reserved_name_count, int) + +PROTOBUF_DEFINE_OPTIONS_ACCESSOR(Descriptor, MessageOptions) +PROTOBUF_DEFINE_ACCESSOR(Descriptor, is_placeholder, bool) + +PROTOBUF_DEFINE_STRING_ACCESSOR(FieldDescriptor, name) +PROTOBUF_DEFINE_STRING_ACCESSOR(FieldDescriptor, full_name) +PROTOBUF_DEFINE_STRING_ACCESSOR(FieldDescriptor, json_name) +PROTOBUF_DEFINE_STRING_ACCESSOR(FieldDescriptor, lowercase_name) +PROTOBUF_DEFINE_STRING_ACCESSOR(FieldDescriptor, camelcase_name) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, file, const FileDescriptor*) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, number, int) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, is_extension, bool) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, type, FieldDescriptor::Type) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, label, FieldDescriptor::Label) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, containing_type, const Descriptor*) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, containing_oneof, + const OneofDescriptor*) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, index_in_oneof, int) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, extension_scope, const Descriptor*) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, message_type, const Descriptor*) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, enum_type, const EnumDescriptor*) +PROTOBUF_DEFINE_OPTIONS_ACCESSOR(FieldDescriptor, FieldOptions) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, has_default_value, bool) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, has_json_name, bool) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, default_value_int32 , int32 ) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, default_value_int64 , int64 ) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, default_value_uint32, uint32) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, default_value_uint64, uint64) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, default_value_float , float ) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, default_value_double, double) +PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, default_value_bool , bool ) 
+PROTOBUF_DEFINE_ACCESSOR(FieldDescriptor, default_value_enum, + const EnumValueDescriptor*) +PROTOBUF_DEFINE_STRING_ACCESSOR(FieldDescriptor, default_value_string) + +PROTOBUF_DEFINE_STRING_ACCESSOR(OneofDescriptor, name) +PROTOBUF_DEFINE_STRING_ACCESSOR(OneofDescriptor, full_name) +PROTOBUF_DEFINE_ACCESSOR(OneofDescriptor, containing_type, const Descriptor*) +PROTOBUF_DEFINE_ACCESSOR(OneofDescriptor, field_count, int) +PROTOBUF_DEFINE_OPTIONS_ACCESSOR(OneofDescriptor, OneofOptions) + +PROTOBUF_DEFINE_STRING_ACCESSOR(EnumDescriptor, name) +PROTOBUF_DEFINE_STRING_ACCESSOR(EnumDescriptor, full_name) +PROTOBUF_DEFINE_ACCESSOR(EnumDescriptor, file, const FileDescriptor*) +PROTOBUF_DEFINE_ACCESSOR(EnumDescriptor, containing_type, const Descriptor*) +PROTOBUF_DEFINE_ACCESSOR(EnumDescriptor, value_count, int) +PROTOBUF_DEFINE_ARRAY_ACCESSOR(EnumDescriptor, value, + const EnumValueDescriptor*) +PROTOBUF_DEFINE_OPTIONS_ACCESSOR(EnumDescriptor, EnumOptions) +PROTOBUF_DEFINE_ACCESSOR(EnumDescriptor, is_placeholder, bool) + +PROTOBUF_DEFINE_STRING_ACCESSOR(EnumValueDescriptor, name) +PROTOBUF_DEFINE_STRING_ACCESSOR(EnumValueDescriptor, full_name) +PROTOBUF_DEFINE_ACCESSOR(EnumValueDescriptor, number, int) +PROTOBUF_DEFINE_ACCESSOR(EnumValueDescriptor, type, const EnumDescriptor*) +PROTOBUF_DEFINE_OPTIONS_ACCESSOR(EnumValueDescriptor, EnumValueOptions) + +PROTOBUF_DEFINE_STRING_ACCESSOR(ServiceDescriptor, name) +PROTOBUF_DEFINE_STRING_ACCESSOR(ServiceDescriptor, full_name) +PROTOBUF_DEFINE_ACCESSOR(ServiceDescriptor, file, const FileDescriptor*) +PROTOBUF_DEFINE_ACCESSOR(ServiceDescriptor, method_count, int) +PROTOBUF_DEFINE_ARRAY_ACCESSOR(ServiceDescriptor, method, + const MethodDescriptor*) +PROTOBUF_DEFINE_OPTIONS_ACCESSOR(ServiceDescriptor, ServiceOptions) + +PROTOBUF_DEFINE_STRING_ACCESSOR(MethodDescriptor, name) +PROTOBUF_DEFINE_STRING_ACCESSOR(MethodDescriptor, full_name) +PROTOBUF_DEFINE_ACCESSOR(MethodDescriptor, service, const ServiceDescriptor*) +PROTOBUF_DEFINE_ACCESSOR(MethodDescriptor, input_type, const Descriptor*) +PROTOBUF_DEFINE_ACCESSOR(MethodDescriptor, output_type, const Descriptor*) +PROTOBUF_DEFINE_OPTIONS_ACCESSOR(MethodDescriptor, MethodOptions) +PROTOBUF_DEFINE_ACCESSOR(MethodDescriptor, client_streaming, bool) +PROTOBUF_DEFINE_ACCESSOR(MethodDescriptor, server_streaming, bool) + +PROTOBUF_DEFINE_STRING_ACCESSOR(FileDescriptor, name) +PROTOBUF_DEFINE_STRING_ACCESSOR(FileDescriptor, package) +PROTOBUF_DEFINE_ACCESSOR(FileDescriptor, pool, const DescriptorPool*) +PROTOBUF_DEFINE_ACCESSOR(FileDescriptor, dependency_count, int) +PROTOBUF_DEFINE_ACCESSOR(FileDescriptor, public_dependency_count, int) +PROTOBUF_DEFINE_ACCESSOR(FileDescriptor, weak_dependency_count, int) +PROTOBUF_DEFINE_ACCESSOR(FileDescriptor, message_type_count, int) +PROTOBUF_DEFINE_ACCESSOR(FileDescriptor, enum_type_count, int) +PROTOBUF_DEFINE_ACCESSOR(FileDescriptor, service_count, int) +PROTOBUF_DEFINE_ACCESSOR(FileDescriptor, extension_count, int) +PROTOBUF_DEFINE_OPTIONS_ACCESSOR(FileDescriptor, FileOptions) +PROTOBUF_DEFINE_ACCESSOR(FileDescriptor, is_placeholder, bool) + +PROTOBUF_DEFINE_ARRAY_ACCESSOR(FileDescriptor, message_type, const Descriptor*) +PROTOBUF_DEFINE_ARRAY_ACCESSOR(FileDescriptor, enum_type, const EnumDescriptor*) +PROTOBUF_DEFINE_ARRAY_ACCESSOR(FileDescriptor, service, + const ServiceDescriptor*) +PROTOBUF_DEFINE_ARRAY_ACCESSOR(FileDescriptor, extension, + const FieldDescriptor*) + +#undef PROTOBUF_DEFINE_ACCESSOR +#undef PROTOBUF_DEFINE_STRING_ACCESSOR +#undef 
PROTOBUF_DEFINE_ARRAY_ACCESSOR
+
+// A few accessors differ from the macros...
+
+inline bool Descriptor::IsExtensionNumber(int number) const {
+  return FindExtensionRangeContainingNumber(number) != NULL;
+}
+
+inline bool Descriptor::IsReservedNumber(int number) const {
+  return FindReservedRangeContainingNumber(number) != NULL;
+}
+
+inline bool Descriptor::IsReservedName(const string& name) const {
+  for (int i = 0; i < reserved_name_count(); i++) {
+    if (name == reserved_name(i)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Can't use PROTOBUF_DEFINE_ARRAY_ACCESSOR because reserved_names_ is actually
+// an array of pointers rather than the usual array of objects.
+inline const string& Descriptor::reserved_name(int index) const {
+  return *reserved_names_[index];
+}
+
+inline bool FieldDescriptor::is_required() const {
+  return label() == LABEL_REQUIRED;
+}
+
+inline bool FieldDescriptor::is_optional() const {
+  return label() == LABEL_OPTIONAL;
+}
+
+inline bool FieldDescriptor::is_repeated() const {
+  return label() == LABEL_REPEATED;
+}
+
+inline bool FieldDescriptor::is_packable() const {
+  return is_repeated() && IsTypePackable(type());
+}
+
+// To save space, index() is computed by looking at the descriptor's position
+// in the parent's array of children.
+inline int FieldDescriptor::index() const {
+  if (!is_extension_) {
+    return static_cast<int>(this - containing_type_->fields_);
+  } else if (extension_scope_ != NULL) {
+    return static_cast<int>(this - extension_scope_->extensions_);
+  } else {
+    return static_cast<int>(this - file_->extensions_);
+  }
+}
+
+inline int Descriptor::index() const {
+  if (containing_type_ == NULL) {
+    return static_cast<int>(this - file_->message_types_);
+  } else {
+    return static_cast<int>(this - containing_type_->nested_types_);
+  }
+}
+
+inline int OneofDescriptor::index() const {
+  return static_cast<int>(this - containing_type_->oneof_decls_);
+}
+
+inline int EnumDescriptor::index() const {
+  if (containing_type_ == NULL) {
+    return static_cast<int>(this - file_->enum_types_);
+  } else {
+    return static_cast<int>(this - containing_type_->enum_types_);
+  }
+}
+
+inline int EnumValueDescriptor::index() const {
+  return static_cast<int>(this - type_->values_);
+}
+
+inline int ServiceDescriptor::index() const {
+  return static_cast<int>(this - file_->services_);
+}
+
+inline int MethodDescriptor::index() const {
+  return static_cast<int>(this - service_->methods_);
+}
+
+inline const char* FieldDescriptor::type_name() const {
+  return kTypeToName[type_];
+}
+
+inline FieldDescriptor::CppType FieldDescriptor::cpp_type() const {
+  return kTypeToCppTypeMap[type_];
+}
+
+inline const char* FieldDescriptor::cpp_type_name() const {
+  return kCppTypeToName[kTypeToCppTypeMap[type_]];
+}
+
+inline FieldDescriptor::CppType FieldDescriptor::TypeToCppType(Type type) {
+  return kTypeToCppTypeMap[type];
+}
+
+inline const char* FieldDescriptor::TypeName(Type type) {
+  return kTypeToName[type];
+}
+
+inline const char* FieldDescriptor::CppTypeName(CppType cpp_type) {
+  return kCppTypeToName[cpp_type];
+}
+
+inline bool FieldDescriptor::IsTypePackable(Type field_type) {
+  return (field_type != FieldDescriptor::TYPE_STRING &&
+          field_type != FieldDescriptor::TYPE_GROUP &&
+          field_type != FieldDescriptor::TYPE_MESSAGE &&
+          field_type != FieldDescriptor::TYPE_BYTES);
+}
+
+inline const FileDescriptor* FileDescriptor::dependency(int index) const {
+  return dependencies_[index];
+}
+
+inline const FileDescriptor* FileDescriptor::public_dependency(
+    int index) const {
+  return
dependencies_[public_dependencies_[index]]; +} + +inline const FileDescriptor* FileDescriptor::weak_dependency( + int index) const { + return dependencies_[weak_dependencies_[index]]; +} + +inline FileDescriptor::Syntax FileDescriptor::syntax() const { + return syntax_; +} + +// Can't use PROTOBUF_DEFINE_ARRAY_ACCESSOR because fields_ is actually an array +// of pointers rather than the usual array of objects. +inline const FieldDescriptor* OneofDescriptor::field(int index) const { + return fields_[index]; +} + +} // namespace protobuf + +} // namespace google +#endif // GOOGLE_PROTOBUF_DESCRIPTOR_H__ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/extension_set.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/extension_set.h new file mode 100644 index 00000000..16ea76f8 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/extension_set.h @@ -0,0 +1,1318 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Author: kenton@google.com (Kenton Varda) +// Based on original Protocol Buffers design by +// Sanjay Ghemawat, Jeff Dean, and others. +// +// This header is logically internal, but is made public because it is used +// from protocol-compiler-generated code, which may reside in other components. 
+
+#ifndef GOOGLE_PROTOBUF_EXTENSION_SET_H__
+#define GOOGLE_PROTOBUF_EXTENSION_SET_H__
+
+#include <vector>
+#include <map>
+#include <utility>
+#include <string>
+
+
+#include "common.h"
+#include "logging.h"
+#include "once.h"
+
+#include "repeated_field.h"
+
+namespace google {
+
+namespace protobuf {
+  class Arena;
+  class Descriptor;       // descriptor.h
+  class FieldDescriptor;  // descriptor.h
+  class DescriptorPool;   // descriptor.h
+  class MessageLite;      // message_lite.h
+  class Message;          // message.h
+  class MessageFactory;   // message.h
+  class UnknownFieldSet;  // unknown_field_set.h
+  namespace io {
+    class CodedInputStream;   // coded_stream.h
+    class CodedOutputStream;  // coded_stream.h
+  }
+  namespace internal {
+    class FieldSkipper;  // wire_format_lite.h
+  }
+}
+
+namespace protobuf {
+namespace internal {
+
+// Used to store values of type WireFormatLite::FieldType without having to
+// #include wire_format_lite.h. Also, ensures that we use only one byte to
+// store these values, which is important to keep the layout of
+// ExtensionSet::Extension small.
+typedef uint8 FieldType;
+
+// A function which, given an integer value, returns true if the number
+// matches one of the defined values for the corresponding enum type. This
+// is used with RegisterEnumExtension, below.
+typedef bool EnumValidityFunc(int number);
+
+// Version of the above which takes an argument. This is needed to deal with
+// extensions that are not compiled in.
+typedef bool EnumValidityFuncWithArg(const void* arg, int number);
+
+// Information about a registered extension.
+struct ExtensionInfo {
+  inline ExtensionInfo() {}
+  inline ExtensionInfo(FieldType type_param, bool isrepeated, bool ispacked)
+      : type(type_param), is_repeated(isrepeated), is_packed(ispacked),
+        descriptor(NULL) {}
+
+  FieldType type;
+  bool is_repeated;
+  bool is_packed;
+
+  struct EnumValidityCheck {
+    EnumValidityFuncWithArg* func;
+    const void* arg;
+  };
+
+  union {
+    EnumValidityCheck enum_validity_check;
+    const MessageLite* message_prototype;
+  };
+
+  // The descriptor for this extension, if one exists and is known. May be
+  // NULL. Must not be NULL if the descriptor for the extension does not
+  // live in the same pool as the descriptor for the containing type.
+  const FieldDescriptor* descriptor;
+};
+
+// Abstract interface for an object which looks up extension definitions. Used
+// when parsing.
+class LIBPROTOBUF_EXPORT ExtensionFinder {
+ public:
+  virtual ~ExtensionFinder();
+
+  // Find the extension with the given containing type and number.
+  virtual bool Find(int number, ExtensionInfo* output) = 0;
+};
+
+// Implementation of ExtensionFinder which finds extensions defined in .proto
+// files which have been compiled into the binary.
+class LIBPROTOBUF_EXPORT GeneratedExtensionFinder : public ExtensionFinder {
+ public:
+  GeneratedExtensionFinder(const MessageLite* containing_type)
+      : containing_type_(containing_type) {}
+  virtual ~GeneratedExtensionFinder() {}
+
+  // Returns true and fills in *output if found, otherwise returns false.
+  virtual bool Find(int number, ExtensionInfo* output);
+
+ private:
+  const MessageLite* containing_type_;
+};
+
+// A FieldSkipper used for parsing MessageSet.
+class MessageSetFieldSkipper;
+
+// Note: extension_set_heavy.cc defines DescriptorPoolExtensionFinder for
+// finding extensions from a DescriptorPool.
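+
+// A tiny lookup sketch (hedged: Foo is a hypothetical generated message with
+// a compiled-in extension number 123; only the classes above are assumed):
+//
+//   GeneratedExtensionFinder finder(&Foo::default_instance());
+//   ExtensionInfo info;
+//   if (finder.Find(123, &info)) {
+//     // info.type, info.is_repeated and info.is_packed describe the
+//     // registered extension.
+//   }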
+
+// This is an internal helper class intended for use within the protocol buffer
+// library and generated classes. Clients should not use it directly. Instead,
+// use the generated accessors such as GetExtension() of the class being
+// extended.
+//
+// This class manages extensions for a protocol message object. The
+// message's HasExtension(), GetExtension(), MutableExtension(), and
+// ClearExtension() methods are just thin wrappers around the embedded
+// ExtensionSet. When parsing, if a tag number is encountered which is
+// inside one of the message type's extension ranges, the tag is passed
+// off to the ExtensionSet for parsing. Etc.
+class LIBPROTOBUF_EXPORT ExtensionSet {
+ public:
+  ExtensionSet();
+  explicit ExtensionSet(::google::protobuf::Arena* arena);
+  ~ExtensionSet();
+
+  // These are called at startup by protocol-compiler-generated code to
+  // register known extensions. The registrations are used by ParseField()
+  // to look up extensions for parsed field numbers. Note that dynamic parsing
+  // does not use ParseField(); only protocol-compiler-generated parsing
+  // methods do.
+  static void RegisterExtension(const MessageLite* containing_type,
+                                int number, FieldType type,
+                                bool is_repeated, bool is_packed);
+  static void RegisterEnumExtension(const MessageLite* containing_type,
+                                    int number, FieldType type,
+                                    bool is_repeated, bool is_packed,
+                                    EnumValidityFunc* is_valid);
+  static void RegisterMessageExtension(const MessageLite* containing_type,
+                                       int number, FieldType type,
+                                       bool is_repeated, bool is_packed,
+                                       const MessageLite* prototype);
+
+  // =================================================================
+
+  // Add all fields which are currently present to the given vector. This
+  // is useful to implement Reflection::ListFields().
+  void AppendToList(const Descriptor* containing_type,
+                    const DescriptorPool* pool,
+                    std::vector<const FieldDescriptor*>* output) const;
+
+  // =================================================================
+  // Accessors
+  //
+  // Generated message classes include type-safe templated wrappers around
+  // these methods. Generally you should use those rather than call these
+  // directly, unless you are doing low-level memory management.
+  //
+  // When calling any of these accessors, the extension number requested
+  // MUST exist in the DescriptorPool provided to the constructor. Otherwise,
+  // the method will fail an assert. Normally, though, you would not call
+  // these directly; you would either call the generated accessors of your
+  // message class (e.g. GetExtension()) or you would call the accessors
+  // of the reflection interface. In both cases, it is impossible to
+  // trigger this assert failure: the generated accessors only accept
+  // linked-in extension types as parameters, while the Reflection interface
+  // requires you to provide the FieldDescriptor describing the extension.
+  //
+  // When calling any of these accessors, a protocol-compiler-generated
+  // implementation of the extension corresponding to the number MUST
+  // be linked in, and the FieldDescriptor used to refer to it MUST be
+  // the one generated by that linked-in code. Otherwise, the method will
+  // die on an assert failure. The message objects returned by the message
+  // accessors are guaranteed to be of the correct linked-in type.
+  //
+  // These methods pretty much match Reflection except that:
+  // - They're not virtual.
+  // - They identify fields by number rather than FieldDescriptors.
+  // - They identify enum values using integers rather than descriptors.
+  // - Strings provide Mutable() in addition to Set() accessors.
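+
+  // For normal use, prefer the generated type-safe wrappers; a sketch
+  // (hedged: Foo and bar_ext are hypothetical generated names):
+  //
+  //   Foo foo;
+  //   foo.SetExtension(bar_ext, 42);          // thin wrapper over SetInt32()
+  //   int value = foo.GetExtension(bar_ext);  // thin wrapper over GetInt32()
+  //   foo.ClearExtension(bar_ext);            // wraps ClearExtension(number)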
+
+  bool Has(int number) const;
+  int ExtensionSize(int number) const;  // Size of a repeated extension.
+  int NumExtensions() const;  // The number of extensions
+  FieldType ExtensionType(int number) const;
+  void ClearExtension(int number);
+
+  // singular fields -------------------------------------------------
+
+  int32  GetInt32 (int number, int32 default_value) const;
+  int64  GetInt64 (int number, int64 default_value) const;
+  uint32 GetUInt32(int number, uint32 default_value) const;
+  uint64 GetUInt64(int number, uint64 default_value) const;
+  float  GetFloat (int number, float default_value) const;
+  double GetDouble(int number, double default_value) const;
+  bool   GetBool  (int number, bool default_value) const;
+  int    GetEnum  (int number, int default_value) const;
+  const string & GetString (int number, const string& default_value) const;
+  const MessageLite& GetMessage(int number,
+                                const MessageLite& default_value) const;
+  const MessageLite& GetMessage(int number, const Descriptor* message_type,
+                                MessageFactory* factory) const;
+
+  // |descriptor| may be NULL so long as it is known that the descriptor for
+  // the extension lives in the same pool as the descriptor for the containing
+  // type.
+#define desc const FieldDescriptor* descriptor  // avoid line wrapping
+  void SetInt32 (int number, FieldType type, int32 value, desc);
+  void SetInt64 (int number, FieldType type, int64 value, desc);
+  void SetUInt32(int number, FieldType type, uint32 value, desc);
+  void SetUInt64(int number, FieldType type, uint64 value, desc);
+  void SetFloat (int number, FieldType type, float value, desc);
+  void SetDouble(int number, FieldType type, double value, desc);
+  void SetBool  (int number, FieldType type, bool value, desc);
+  void SetEnum  (int number, FieldType type, int value, desc);
+  void SetString(int number, FieldType type, const string& value, desc);
+  string * MutableString (int number, FieldType type, desc);
+  MessageLite* MutableMessage(int number, FieldType type,
+                              const MessageLite& prototype, desc);
+  MessageLite* MutableMessage(const FieldDescriptor* descriptor,
+                              MessageFactory* factory);
+  // Adds the given message to the ExtensionSet, taking ownership of the
+  // message object. Existing message with the same number will be deleted.
+  // If "message" is NULL, this is equivalent to "ClearExtension(number)".
+  void SetAllocatedMessage(int number, FieldType type,
+                           const FieldDescriptor* descriptor,
+                           MessageLite* message);
+  void UnsafeArenaSetAllocatedMessage(int number, FieldType type,
+                                      const FieldDescriptor* descriptor,
+                                      MessageLite* message);
+  MessageLite* ReleaseMessage(int number, const MessageLite& prototype);
+  MessageLite* UnsafeArenaReleaseMessage(
+      int number, const MessageLite& prototype);
+
+  MessageLite* ReleaseMessage(const FieldDescriptor* descriptor,
+                              MessageFactory* factory);
+  MessageLite* UnsafeArenaReleaseMessage(const FieldDescriptor* descriptor,
+                                         MessageFactory* factory);
+#undef desc
+  ::google::protobuf::Arena* GetArenaNoVirtual() const { return arena_; }
+
+  // repeated fields -------------------------------------------------
+
+  // Fetches a RepeatedField extension by number; returns |default_value|
+  // if no such extension exists. User should not touch this directly; it is
+  // used by the GetRepeatedExtension() method.
+  const void* GetRawRepeatedField(int number, const void* default_value) const;
+  // Fetches a mutable version of a RepeatedField extension by number,
+  // instantiating one if none exists.
Similar to above, user should not use + // this directly; it underlies MutableRepeatedExtension(). + void* MutableRawRepeatedField(int number, FieldType field_type, + bool packed, const FieldDescriptor* desc); + + // This is an overload of MutableRawRepeatedField to maintain compatibility + // with old code using a previous API. This version of + // MutableRawRepeatedField() will GOOGLE_CHECK-fail on a missing extension. + // (E.g.: borg/clients/internal/proto1/proto2_reflection.cc.) + void* MutableRawRepeatedField(int number); + + int32 GetRepeatedInt32 (int number, int index) const; + int64 GetRepeatedInt64 (int number, int index) const; + uint32 GetRepeatedUInt32(int number, int index) const; + uint64 GetRepeatedUInt64(int number, int index) const; + float GetRepeatedFloat (int number, int index) const; + double GetRepeatedDouble(int number, int index) const; + bool GetRepeatedBool (int number, int index) const; + int GetRepeatedEnum (int number, int index) const; + const string & GetRepeatedString (int number, int index) const; + const MessageLite& GetRepeatedMessage(int number, int index) const; + + void SetRepeatedInt32 (int number, int index, int32 value); + void SetRepeatedInt64 (int number, int index, int64 value); + void SetRepeatedUInt32(int number, int index, uint32 value); + void SetRepeatedUInt64(int number, int index, uint64 value); + void SetRepeatedFloat (int number, int index, float value); + void SetRepeatedDouble(int number, int index, double value); + void SetRepeatedBool (int number, int index, bool value); + void SetRepeatedEnum (int number, int index, int value); + void SetRepeatedString(int number, int index, const string& value); + string * MutableRepeatedString (int number, int index); + MessageLite* MutableRepeatedMessage(int number, int index); + +#define desc const FieldDescriptor* descriptor // avoid line wrapping + void AddInt32 (int number, FieldType type, bool packed, int32 value, desc); + void AddInt64 (int number, FieldType type, bool packed, int64 value, desc); + void AddUInt32(int number, FieldType type, bool packed, uint32 value, desc); + void AddUInt64(int number, FieldType type, bool packed, uint64 value, desc); + void AddFloat (int number, FieldType type, bool packed, float value, desc); + void AddDouble(int number, FieldType type, bool packed, double value, desc); + void AddBool (int number, FieldType type, bool packed, bool value, desc); + void AddEnum (int number, FieldType type, bool packed, int value, desc); + void AddString(int number, FieldType type, const string& value, desc); + string * AddString (int number, FieldType type, desc); + MessageLite* AddMessage(int number, FieldType type, + const MessageLite& prototype, desc); + MessageLite* AddMessage(const FieldDescriptor* descriptor, + MessageFactory* factory); + void AddAllocatedMessage(const FieldDescriptor* descriptor, + MessageLite* new_entry); +#undef desc + + void RemoveLast(int number); + MessageLite* ReleaseLast(int number); + void SwapElements(int number, int index1, int index2); + + // ----------------------------------------------------------------- + // TODO(kenton): Hardcore memory management accessors + + // ================================================================= + // convenience methods for implementing methods of Message + // + // These could all be implemented in terms of the other methods of this + // class, but providing them here helps keep the generated code size down. 
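+
+  // For instance, a generated MergeFrom() typically forwards straight to the
+  // embedded set (a sketch for illustration only; `_extensions_` is assumed
+  // to be the ExtensionSet member emitted by the protocol compiler):
+  //
+  //   void Foo::MergeFrom(const Foo& from) {
+  //     _extensions_.MergeFrom(from._extensions_);
+  //     // ... merge regular (non-extension) fields ...
+  //   }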
+ + void Clear(); + void MergeFrom(const ExtensionSet& other); + void Swap(ExtensionSet* other); + void SwapExtension(ExtensionSet* other, int number); + bool IsInitialized() const; + + // Parses a single extension from the input. The input should start out + // positioned immediately after the tag. + bool ParseField(uint32 tag, io::CodedInputStream* input, + ExtensionFinder* extension_finder, + FieldSkipper* field_skipper); + + // Specific versions for lite or full messages (constructs the appropriate + // FieldSkipper automatically). |containing_type| is the default + // instance for the containing message; it is used only to look up the + // extension by number. See RegisterExtension(), above. Unlike the other + // methods of ExtensionSet, this only works for generated message types -- + // it looks up extensions registered using RegisterExtension(). + bool ParseField(uint32 tag, io::CodedInputStream* input, + const MessageLite* containing_type); + bool ParseField(uint32 tag, io::CodedInputStream* input, + const Message* containing_type, + UnknownFieldSet* unknown_fields); + bool ParseField(uint32 tag, io::CodedInputStream* input, + const MessageLite* containing_type, + io::CodedOutputStream* unknown_fields); + + // Parse an entire message in MessageSet format. Such messages have no + // fields, only extensions. + bool ParseMessageSet(io::CodedInputStream* input, + ExtensionFinder* extension_finder, + MessageSetFieldSkipper* field_skipper); + + // Specific versions for lite or full messages (constructs the appropriate + // FieldSkipper automatically). + bool ParseMessageSet(io::CodedInputStream* input, + const MessageLite* containing_type); + bool ParseMessageSet(io::CodedInputStream* input, + const Message* containing_type, + UnknownFieldSet* unknown_fields); + + // Write all extension fields with field numbers in the range + // [start_field_number, end_field_number) + // to the output stream, using the cached sizes computed when ByteSize() was + // last called. Note that the range bounds are inclusive-exclusive. + void SerializeWithCachedSizes(int start_field_number, + int end_field_number, + io::CodedOutputStream* output) const; + + // Same as SerializeWithCachedSizes, but without any bounds checking. + // The caller must ensure that target has sufficient capacity for the + // serialized extensions. + // + // Returns a pointer past the last written byte. + uint8* InternalSerializeWithCachedSizesToArray(int start_field_number, + int end_field_number, + bool deterministic, + uint8* target) const; + + // Like above but serializes in MessageSet format. + void SerializeMessageSetWithCachedSizes(io::CodedOutputStream* output) const; + uint8* InternalSerializeMessageSetWithCachedSizesToArray(bool deterministic, + uint8* target) const; + + // For backward-compatibility, versions of two of the above methods that + // are never forced to serialize deterministically. + uint8* SerializeWithCachedSizesToArray(int start_field_number, + int end_field_number, + uint8* target) const; + uint8* SerializeMessageSetWithCachedSizesToArray(uint8* target) const; + + // Returns the total serialized size of all the extensions. + size_t ByteSize() const; + + // Like ByteSize() but uses MessageSet format. + size_t MessageSetByteSize() const; + + // Returns (an estimate of) the total number of bytes used for storing the + // extensions in memory, excluding sizeof(*this). 
If the ExtensionSet is
+  // for a lite message (and thus possibly contains lite messages), the results
+  // are undefined (might work, might crash, might corrupt data, might not even
+  // be linked in). It's up to the protocol compiler to avoid calling this on
+  // such ExtensionSets (easy enough since lite messages don't implement
+  // SpaceUsed()).
+  int SpaceUsedExcludingSelf() const;
+
+ private:
+
+  // Interface of a lazily parsed singular message extension.
+  class LIBPROTOBUF_EXPORT LazyMessageExtension {
+   public:
+    LazyMessageExtension() {}
+    virtual ~LazyMessageExtension() {}
+
+    virtual LazyMessageExtension* New(::google::protobuf::Arena* arena) const = 0;
+    virtual const MessageLite& GetMessage(
+        const MessageLite& prototype) const = 0;
+    virtual MessageLite* MutableMessage(const MessageLite& prototype) = 0;
+    virtual void SetAllocatedMessage(MessageLite *message) = 0;
+    virtual void UnsafeArenaSetAllocatedMessage(MessageLite *message) = 0;
+    virtual MessageLite* ReleaseMessage(const MessageLite& prototype) = 0;
+    virtual MessageLite* UnsafeArenaReleaseMessage(
+        const MessageLite& prototype) = 0;
+
+    virtual bool IsInitialized() const = 0;
+    virtual int ByteSize() const = 0;
+    virtual int SpaceUsed() const = 0;
+
+    virtual void MergeFrom(const LazyMessageExtension& other) = 0;
+    virtual void Clear() = 0;
+
+    virtual bool ReadMessage(const MessageLite& prototype,
+                             io::CodedInputStream* input) = 0;
+    virtual void WriteMessage(int number,
+                              io::CodedOutputStream* output) const = 0;
+    virtual uint8* WriteMessageToArray(int number, uint8* target) const = 0;
+    virtual uint8* InternalWriteMessageToArray(int number, bool,
+                                               uint8* target) const {
+      // TODO(gpike): make this pure virtual. This is a placeholder because we
+      // need to update third_party/upb, for example.
+      return WriteMessageToArray(number, target);
+    }
+
+   private:
+    GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(LazyMessageExtension);
+  };
+  struct Extension {
+    // The order of these fields packs Extension into 24 bytes when using 8
+    // byte alignment. Consider this when adding or removing fields here.
+    union {
+      int32 int32_value;
+      int64 int64_value;
+      uint32 uint32_value;
+      uint64 uint64_value;
+      float float_value;
+      double double_value;
+      bool bool_value;
+      int enum_value;
+      string* string_value;
+      MessageLite* message_value;
+      LazyMessageExtension* lazymessage_value;
+
+      RepeatedField<int32>* repeated_int32_value;
+      RepeatedField<int64>* repeated_int64_value;
+      RepeatedField<uint32>* repeated_uint32_value;
+      RepeatedField<uint64>* repeated_uint64_value;
+      RepeatedField<float>* repeated_float_value;
+      RepeatedField<double>* repeated_double_value;
+      RepeatedField<bool>* repeated_bool_value;
+      RepeatedField<int>* repeated_enum_value;
+      RepeatedPtrField<string>* repeated_string_value;
+      RepeatedPtrField<MessageLite>* repeated_message_value;
+    };
+
+    FieldType type;
+    bool is_repeated;
+
+    // For singular types, indicates if the extension is "cleared". This
+    // happens when an extension is set and then later cleared by the caller.
+    // We want to keep the Extension object around for reuse, so instead of
+    // removing it from the map, we just set is_cleared = true. This has no
+    // meaning for repeated types; for those, the size of the RepeatedField
+    // simply becomes zero when cleared.
+    bool is_cleared : 4;
+
+    // For singular message types, indicates whether lazy parsing is enabled
+    // for this extension. This field is only valid when type == TYPE_MESSAGE
+    // and !is_repeated because we only support lazy parsing for singular
+    // message types currently.
+    // If is_lazy = true, the extension is stored in
+    // lazymessage_value. Otherwise, the extension will be message_value.
+    bool is_lazy : 4;
+
+    // For repeated types, this indicates if the [packed=true] option is set.
+    bool is_packed;
+
+    // For packed fields, the size of the packed data is recorded here when
+    // ByteSize() is called then used during serialization.
+    // TODO(kenton): Use atomic<int> when C++ supports it.
+    mutable int cached_size;
+
+    // The descriptor for this extension, if one exists and is known. May be
+    // NULL. Must not be NULL if the descriptor for the extension does not
+    // live in the same pool as the descriptor for the containing type.
+    const FieldDescriptor* descriptor;
+
+    // Some helper methods for operations on a single Extension.
+    void SerializeFieldWithCachedSizes(
+        int number,
+        io::CodedOutputStream* output) const;
+    uint8* InternalSerializeFieldWithCachedSizesToArray(
+        int number,
+        bool deterministic,
+        uint8* target) const;
+    void SerializeMessageSetItemWithCachedSizes(
+        int number,
+        io::CodedOutputStream* output) const;
+    uint8* InternalSerializeMessageSetItemWithCachedSizesToArray(
+        int number,
+        bool deterministic,
+        uint8* target) const;
+    size_t ByteSize(int number) const;
+    size_t MessageSetItemByteSize(int number) const;
+    void Clear();
+    int GetSize() const;
+    void Free();
+    int SpaceUsedExcludingSelf() const;
+  };
+  typedef std::map<int, Extension> ExtensionMap;
+
+
+  // Merges existing Extension from other_extension
+  void InternalExtensionMergeFrom(int number, const Extension& other_extension);
+
+  // Returns true and fills field_number and extension if extension is found.
+  // Note to support packed repeated field compatibility, it also fills whether
+  // the tag on wire is packed, which can be different from
+  // extension->is_packed (whether packed=true is specified).
+  bool FindExtensionInfoFromTag(uint32 tag, ExtensionFinder* extension_finder,
+                                int* field_number, ExtensionInfo* extension,
+                                bool* was_packed_on_wire);
+
+  // Returns true and fills extension if extension is found.
+  // Note to support packed repeated field compatibility, it also fills whether
+  // the tag on wire is packed, which can be different from
+  // extension->is_packed (whether packed=true is specified).
+  bool FindExtensionInfoFromFieldNumber(int wire_type, int field_number,
+                                        ExtensionFinder* extension_finder,
+                                        ExtensionInfo* extension,
+                                        bool* was_packed_on_wire);
+
+  // Parses a single extension from the input. The input should start out
+  // positioned immediately after the wire tag. This method is called in
+  // ParseField() after field number and was_packed_on_wire is extracted from
+  // the wire tag and ExtensionInfo is found by the field number.
+  bool ParseFieldWithExtensionInfo(int field_number,
+                                   bool was_packed_on_wire,
+                                   const ExtensionInfo& extension,
+                                   io::CodedInputStream* input,
+                                   FieldSkipper* field_skipper);
+
+  // Like ParseField(), but this method may parse singular message extensions
+  // lazily depending on the value of FLAGS_eagerly_parse_message_sets.
+  bool ParseFieldMaybeLazily(int wire_type, int field_number,
+                             io::CodedInputStream* input,
+                             ExtensionFinder* extension_finder,
+                             MessageSetFieldSkipper* field_skipper);
+
+  // Gets the extension with the given number, creating it if it does not
+  // already exist. Returns true if the extension did not already exist.
+ bool MaybeNewExtension(int number, const FieldDescriptor* descriptor, + Extension** result); + + // Gets the repeated extension for the given descriptor, creating it if + // it does not exist. + Extension* MaybeNewRepeatedExtension(const FieldDescriptor* descriptor); + + // Parse a single MessageSet item -- called just after the item group start + // tag has been read. + bool ParseMessageSetItem(io::CodedInputStream* input, + ExtensionFinder* extension_finder, + MessageSetFieldSkipper* field_skipper); + + // Hack: RepeatedPtrFieldBase declares ExtensionSet as a friend. This + // friendship should automatically extend to ExtensionSet::Extension, but + // unfortunately some older compilers (e.g. GCC 3.4.4) do not implement this + // correctly. So, we must provide helpers for calling methods of that + // class. + + // Defined in extension_set_heavy.cc. + static inline int RepeatedMessage_SpaceUsedExcludingSelf( + RepeatedPtrFieldBase* field); + + // The Extension struct is small enough to be passed by value, so we use it + // directly as the value type in the map rather than use pointers. We use + // a map rather than hash_map here because we expect most ExtensionSets will + // only contain a small number of extensions whereas hash_map is optimized + // for 100 elements or more. Also, we want AppendToList() to order fields + // by field number. + ExtensionMap extensions_; + ::google::protobuf::Arena* arena_; + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ExtensionSet); +}; + +// These are just for convenience... +inline void ExtensionSet::SetString(int number, FieldType type, + const string& value, + const FieldDescriptor* descriptor) { + MutableString(number, type, descriptor)->assign(value); +} +inline void ExtensionSet::SetRepeatedString(int number, int index, + const string& value) { + MutableRepeatedString(number, index)->assign(value); +} +inline void ExtensionSet::AddString(int number, FieldType type, + const string& value, + const FieldDescriptor* descriptor) { + AddString(number, type, descriptor)->assign(value); +} + +// =================================================================== +// Glue for generated extension accessors + +// ------------------------------------------------------------------- +// Template magic + +// First we have a set of classes representing "type traits" for different +// field types. A type traits class knows how to implement basic accessors +// for extensions of a particular type given an ExtensionSet. The signature +// for a type traits class looks like this: +// +// class TypeTraits { +// public: +// typedef ? ConstType; +// typedef ? MutableType; +// // TypeTraits for singular fields and repeated fields will define the +// // symbol "Singular" or "Repeated" respectively. These two symbols will +// // be used in extension accessors to distinguish between singular +// // extensions and repeated extensions. If the TypeTraits for the passed +// // in extension doesn't have the expected symbol defined, it means the +// // user is passing a repeated extension to a singular accessor, or the +// // opposite. In that case the C++ compiler will generate an error +// // message "no matching member function" to inform the user. +// typedef ? Singular +// typedef ? Repeated +// +// static inline ConstType Get(int number, const ExtensionSet& set); +// static inline void Set(int number, ConstType value, ExtensionSet* set); +// static inline MutableType Mutable(int number, ExtensionSet* set); +// +// // Variants for repeated fields. 
+//   static inline ConstType Get(int number, const ExtensionSet& set,
+//                               int index);
+//   static inline void Set(int number, int index,
+//                          ConstType value, ExtensionSet* set);
+//   static inline MutableType Mutable(int number, int index,
+//                                     ExtensionSet* set);
+//   static inline void Add(int number, ConstType value, ExtensionSet* set);
+//   static inline MutableType Add(int number, ExtensionSet* set);
+// };
+//
+// Not all of these methods make sense for all field types. For example, the
+// "Mutable" methods only make sense for strings and messages, and the
+// repeated methods only make sense for repeated types. So, each type
+// traits class implements only the set of methods from this signature that it
+// actually supports. This will cause a compiler error if the user tries to
+// access an extension using a method that doesn't make sense for its type.
+// For example, if "foo" is an extension of type "optional int32", then if you
+// try to write code like:
+//   my_message.MutableExtension(foo)
+// you will get a compile error because PrimitiveTypeTraits<int32> does not
+// have a "Mutable()" method.
+
+// -------------------------------------------------------------------
+// PrimitiveTypeTraits
+
+// Since the ExtensionSet has different methods for each primitive type,
+// we must explicitly define the methods of the type traits class for each
+// known type.
+template <typename Type>
+class PrimitiveTypeTraits {
+ public:
+  typedef Type ConstType;
+  typedef Type MutableType;
+  typedef PrimitiveTypeTraits<Type> Singular;
+
+  static inline ConstType Get(int number, const ExtensionSet& set,
+                              ConstType default_value);
+  static inline void Set(int number, FieldType field_type,
+                         ConstType value, ExtensionSet* set);
+};
+
+template <typename Type>
+class RepeatedPrimitiveTypeTraits {
+ public:
+  typedef Type ConstType;
+  typedef Type MutableType;
+  typedef RepeatedPrimitiveTypeTraits<Type> Repeated;
+
+  typedef RepeatedField<Type> RepeatedFieldType;
+
+  static inline Type Get(int number, const ExtensionSet& set, int index);
+  static inline void Set(int number, int index, Type value, ExtensionSet* set);
+  static inline void Add(int number, FieldType field_type,
+                         bool is_packed, Type value, ExtensionSet* set);
+
+  static inline const RepeatedField<Type>&
+      GetRepeated(int number, const ExtensionSet& set);
+  static inline RepeatedField<Type>*
+      MutableRepeated(int number, FieldType field_type,
+                      bool is_packed, ExtensionSet* set);
+
+  static const RepeatedFieldType* GetDefaultRepeatedField();
+};
+
+LIBPROTOBUF_EXPORT extern ProtobufOnceType repeated_primitive_generic_type_traits_once_init_;
+
+class LIBPROTOBUF_EXPORT RepeatedPrimitiveGenericTypeTraits {
+ private:
+  template<typename Type> friend class RepeatedPrimitiveTypeTraits;
+  static void InitializeDefaultRepeatedFields();
+  static void DestroyDefaultRepeatedFields();
+  static const RepeatedField<int32>* default_repeated_field_int32_;
+  static const RepeatedField<int64>* default_repeated_field_int64_;
+  static const RepeatedField<uint32>* default_repeated_field_uint32_;
+  static const RepeatedField<uint64>* default_repeated_field_uint64_;
+  static const RepeatedField<double>* default_repeated_field_double_;
+  static const RepeatedField<float>* default_repeated_field_float_;
+  static const RepeatedField<bool>* default_repeated_field_bool_;
+};
+
+#define PROTOBUF_DEFINE_PRIMITIVE_TYPE(TYPE, METHOD) \
+template<> inline TYPE PrimitiveTypeTraits<TYPE>::Get( \
+    int number, const ExtensionSet& set, TYPE default_value) { \
+  return set.Get##METHOD(number, default_value); \
+} \
+template<> inline void PrimitiveTypeTraits<TYPE>::Set( \
+    int number, FieldType field_type, TYPE value, ExtensionSet* set) { \
+  set->Set##METHOD(number, field_type, value, NULL); \
+} \
+ \
+template<> inline TYPE RepeatedPrimitiveTypeTraits<TYPE>::Get( \
+    int number, const ExtensionSet& set, int index) { \
+  return set.GetRepeated##METHOD(number, index); \
+} \
+template<> inline void RepeatedPrimitiveTypeTraits<TYPE>::Set( \
+    int number, int index, TYPE value, ExtensionSet* set) { \
+  set->SetRepeated##METHOD(number, index, value); \
+} \
+template<> inline void RepeatedPrimitiveTypeTraits<TYPE>::Add( \
+    int number, FieldType field_type, bool is_packed, \
+    TYPE value, ExtensionSet* set) { \
+  set->Add##METHOD(number, field_type, is_packed, value, NULL); \
+} \
+template<> inline const RepeatedField<TYPE>* \
+    RepeatedPrimitiveTypeTraits<TYPE>::GetDefaultRepeatedField() { \
+  ::google::protobuf::GoogleOnceInit( \
+      &repeated_primitive_generic_type_traits_once_init_, \
+      &RepeatedPrimitiveGenericTypeTraits::InitializeDefaultRepeatedFields); \
+  return RepeatedPrimitiveGenericTypeTraits:: \
+      default_repeated_field_##TYPE##_; \
+} \
+template<> inline const RepeatedField<TYPE>& \
+    RepeatedPrimitiveTypeTraits<TYPE>::GetRepeated(int number, \
+                                                   const ExtensionSet& set) { \
+  return *reinterpret_cast<const RepeatedField<TYPE>*>( \
+      set.GetRawRepeatedField( \
+          number, GetDefaultRepeatedField())); \
+} \
+template<> inline RepeatedField<TYPE>* \
+    RepeatedPrimitiveTypeTraits<TYPE>::MutableRepeated(int number, \
+                                                       FieldType field_type, \
+                                                       bool is_packed, \
+                                                       ExtensionSet* set) { \
+  return reinterpret_cast<RepeatedField<TYPE>*>( \
+      set->MutableRawRepeatedField(number, field_type, is_packed, NULL)); \
+}
+
+PROTOBUF_DEFINE_PRIMITIVE_TYPE( int32,  Int32)
+PROTOBUF_DEFINE_PRIMITIVE_TYPE( int64,  Int64)
+PROTOBUF_DEFINE_PRIMITIVE_TYPE(uint32, UInt32)
+PROTOBUF_DEFINE_PRIMITIVE_TYPE(uint64, UInt64)
+PROTOBUF_DEFINE_PRIMITIVE_TYPE( float,  Float)
+PROTOBUF_DEFINE_PRIMITIVE_TYPE(double, Double)
+PROTOBUF_DEFINE_PRIMITIVE_TYPE(  bool,   Bool)
+
+#undef PROTOBUF_DEFINE_PRIMITIVE_TYPE
+
+// -------------------------------------------------------------------
+// StringTypeTraits
+
+// Strings support both Set() and Mutable().
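+//
+// As an illustrative sketch (not part of the upstream header), given a
+// hypothetical extension
+//   extend Foo { optional string name = 100; }
+// the generated accessors route through these traits roughly as:
+//   string* s = msg.MutableExtension(name);  // StringTypeTraits::Mutable()
+//   s->assign("hello");
+//   msg.SetExtension(name, "world");         // StringTypeTraits::Set()
+// both of which forward to ExtensionSet::MutableString()/SetString().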
+class LIBPROTOBUF_EXPORT StringTypeTraits {
+ public:
+  typedef const string& ConstType;
+  typedef string* MutableType;
+  typedef StringTypeTraits Singular;
+
+  static inline const string& Get(int number, const ExtensionSet& set,
+                                  ConstType default_value) {
+    return set.GetString(number, default_value);
+  }
+  static inline void Set(int number, FieldType field_type,
+                         const string& value, ExtensionSet* set) {
+    set->SetString(number, field_type, value, NULL);
+  }
+  static inline string* Mutable(int number, FieldType field_type,
+                                ExtensionSet* set) {
+    return set->MutableString(number, field_type, NULL);
+  }
+};
+
+LIBPROTOBUF_EXPORT extern ProtobufOnceType repeated_string_type_traits_once_init_;
+
+class LIBPROTOBUF_EXPORT RepeatedStringTypeTraits {
+ public:
+  typedef const string& ConstType;
+  typedef string* MutableType;
+  typedef RepeatedStringTypeTraits Repeated;
+
+  typedef RepeatedPtrField<string> RepeatedFieldType;
+
+  static inline const string& Get(int number, const ExtensionSet& set,
+                                  int index) {
+    return set.GetRepeatedString(number, index);
+  }
+  static inline void Set(int number, int index,
+                         const string& value, ExtensionSet* set) {
+    set->SetRepeatedString(number, index, value);
+  }
+  static inline string* Mutable(int number, int index, ExtensionSet* set) {
+    return set->MutableRepeatedString(number, index);
+  }
+  static inline void Add(int number, FieldType field_type,
+                         bool /*is_packed*/, const string& value,
+                         ExtensionSet* set) {
+    set->AddString(number, field_type, value, NULL);
+  }
+  static inline string* Add(int number, FieldType field_type,
+                            ExtensionSet* set) {
+    return set->AddString(number, field_type, NULL);
+  }
+  static inline const RepeatedPtrField<string>&
+      GetRepeated(int number, const ExtensionSet& set) {
+    return *reinterpret_cast<const RepeatedPtrField<string>*>(
+        set.GetRawRepeatedField(number, GetDefaultRepeatedField()));
+  }
+
+  static inline RepeatedPtrField<string>*
+      MutableRepeated(int number, FieldType field_type,
+                      bool is_packed, ExtensionSet* set) {
+    return reinterpret_cast<RepeatedPtrField<string>*>(
+        set->MutableRawRepeatedField(number, field_type,
+                                     is_packed, NULL));
+  }
+
+  static const RepeatedFieldType* GetDefaultRepeatedField() {
+    ::google::protobuf::GoogleOnceInit(&repeated_string_type_traits_once_init_,
+                                       &InitializeDefaultRepeatedFields);
+    return default_repeated_field_;
+  }
+
+ private:
+  static void InitializeDefaultRepeatedFields();
+  static void DestroyDefaultRepeatedFields();
+  static const RepeatedFieldType *default_repeated_field_;
+};
+
+// -------------------------------------------------------------------
+// EnumTypeTraits
+
+// ExtensionSet represents enums using integers internally, so we have to
+// static_cast around.
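+//
+// As an illustrative sketch (not part of the upstream header), given a
+// hypothetical
+//   enum Color { RED = 1; }
+//   extend Foo { optional Color color = 101; }
+// the generated call
+//   msg.SetExtension(color, RED);
+// resolves to EnumTypeTraits<Color, Color_IsValid>::Set(), which DCHECKs
+// Color_IsValid(RED) and then stores the value as a plain int via
+// ExtensionSet::SetEnum().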
+template <typename Type, bool IsValid(int)>
+class EnumTypeTraits {
+ public:
+  typedef Type ConstType;
+  typedef Type MutableType;
+  typedef EnumTypeTraits<Type, IsValid> Singular;
+
+  static inline ConstType Get(int number, const ExtensionSet& set,
+                              ConstType default_value) {
+    return static_cast<Type>(set.GetEnum(number, default_value));
+  }
+  static inline void Set(int number, FieldType field_type,
+                         ConstType value, ExtensionSet* set) {
+    GOOGLE_DCHECK(IsValid(value));
+    set->SetEnum(number, field_type, value, NULL);
+  }
+};
+
+template <typename Type, bool IsValid(int)>
+class RepeatedEnumTypeTraits {
+ public:
+  typedef Type ConstType;
+  typedef Type MutableType;
+  typedef RepeatedEnumTypeTraits<Type, IsValid> Repeated;
+
+  typedef RepeatedField<Type> RepeatedFieldType;
+
+  static inline ConstType Get(int number, const ExtensionSet& set, int index) {
+    return static_cast<Type>(set.GetRepeatedEnum(number, index));
+  }
+  static inline void Set(int number, int index,
+                         ConstType value, ExtensionSet* set) {
+    GOOGLE_DCHECK(IsValid(value));
+    set->SetRepeatedEnum(number, index, value);
+  }
+  static inline void Add(int number, FieldType field_type,
+                         bool is_packed, ConstType value, ExtensionSet* set) {
+    GOOGLE_DCHECK(IsValid(value));
+    set->AddEnum(number, field_type, is_packed, value, NULL);
+  }
+  static inline const RepeatedField<Type>& GetRepeated(int number,
+                                                       const ExtensionSet&
+                                                       set) {
+    // Hack: the `Extension` struct stores a RepeatedField<int> for enums.
+    // RepeatedField<int> cannot implicitly convert to RepeatedField<Type>
+    // so we need to do some casting magic. See message.h for similar
+    // contortions for non-extension fields.
+    return *reinterpret_cast<const RepeatedField<Type>*>(
+        set.GetRawRepeatedField(number, GetDefaultRepeatedField()));
+  }
+
+  static inline RepeatedField<Type>* MutableRepeated(int number,
+                                                     FieldType field_type,
+                                                     bool is_packed,
+                                                     ExtensionSet* set) {
+    return reinterpret_cast<RepeatedField<Type>*>(
+        set->MutableRawRepeatedField(number, field_type, is_packed, NULL));
+  }
+
+  static const RepeatedFieldType* GetDefaultRepeatedField() {
+    // Hack: as noted above, repeated enum fields are internally stored as a
+    // RepeatedField<int>. We need to be able to instantiate global static
+    // objects to return as default (empty) repeated fields on non-existent
+    // extensions. We would not be able to know a-priori all of the enum types
+    // (values of |Type|) to instantiate all of these, so we just re-use int32's
+    // default repeated field object.
+    return reinterpret_cast<const RepeatedFieldType*>(
+        RepeatedPrimitiveTypeTraits<int32>::GetDefaultRepeatedField());
+  }
+};
+
+// -------------------------------------------------------------------
+// MessageTypeTraits
+
+// ExtensionSet guarantees that when manipulating extensions with message
+// types, the implementation used will be the compiled-in class representing
+// that type. So, we can static_cast down to the exact type we expect.
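+//
+// As an illustrative sketch (not part of the upstream header), given a
+// hypothetical
+//   extend Foo { optional Bar bar = 102; }
+// the generated call
+//   Bar* b = msg.MutableExtension(bar);
+// resolves to MessageTypeTraits<Bar>::Mutable(), which obtains a
+// MessageLite* from the ExtensionSet (created from Bar::default_instance()
+// if absent) and static_casts it back to Bar* -- safe because the stored
+// implementation is always the compiled-in class, as noted above.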
+template <typename Type>
+class MessageTypeTraits {
+ public:
+  typedef const Type& ConstType;
+  typedef Type* MutableType;
+  typedef MessageTypeTraits<Type> Singular;
+
+  static inline ConstType Get(int number, const ExtensionSet& set,
+                              ConstType default_value) {
+    return static_cast<const Type&>(
+        set.GetMessage(number, default_value));
+  }
+  static inline MutableType Mutable(int number, FieldType field_type,
+                                    ExtensionSet* set) {
+    return static_cast<Type*>(
+        set->MutableMessage(number, field_type, Type::default_instance(), NULL));
+  }
+  static inline void SetAllocated(int number, FieldType field_type,
+                                  MutableType message, ExtensionSet* set) {
+    set->SetAllocatedMessage(number, field_type, NULL, message);
+  }
+  static inline void UnsafeArenaSetAllocated(int number, FieldType field_type,
+                                             MutableType message,
+                                             ExtensionSet* set) {
+    set->UnsafeArenaSetAllocatedMessage(number, field_type, NULL, message);
+  }
+  static inline MutableType Release(int number, FieldType /* field_type */,
+                                    ExtensionSet* set) {
+    return static_cast<Type*>(set->ReleaseMessage(
+        number, Type::default_instance()));
+  }
+  static inline MutableType UnsafeArenaRelease(int number,
+                                               FieldType /* field_type */,
+                                               ExtensionSet* set) {
+    return static_cast<Type*>(set->UnsafeArenaReleaseMessage(
+        number, Type::default_instance()));
+  }
+};
+
+// forward declaration
+class RepeatedMessageGenericTypeTraits;
+
+template <typename Type>
+class RepeatedMessageTypeTraits {
+ public:
+  typedef const Type& ConstType;
+  typedef Type* MutableType;
+  typedef RepeatedMessageTypeTraits<Type> Repeated;
+
+  typedef RepeatedPtrField<Type> RepeatedFieldType;
+
+  static inline ConstType Get(int number, const ExtensionSet& set, int index) {
+    return static_cast<const Type&>(set.GetRepeatedMessage(number, index));
+  }
+  static inline MutableType Mutable(int number, int index, ExtensionSet* set) {
+    return static_cast<Type*>(set->MutableRepeatedMessage(number, index));
+  }
+  static inline MutableType Add(int number, FieldType field_type,
+                                ExtensionSet* set) {
+    return static_cast<Type*>(
+        set->AddMessage(number, field_type, Type::default_instance(), NULL));
+  }
+  static inline const RepeatedPtrField<Type>& GetRepeated(int number,
+                                                          const ExtensionSet&
+                                                          set) {
+    // See notes above in RepeatedEnumTypeTraits::GetRepeated(): same
+    // casting hack applies here, because a RepeatedPtrField<MessageLite>
+    // cannot naturally become a RepeatedPtrField<Type> even though Type is
+    // presumably a message. google::protobuf::Message goes through similar contortions
+    // with a reinterpret_cast<>.
+    return *reinterpret_cast<const RepeatedPtrField<Type>*>(
+        set.GetRawRepeatedField(number, GetDefaultRepeatedField()));
+  }
+  static inline RepeatedPtrField<Type>* MutableRepeated(int number,
+                                                        FieldType field_type,
+                                                        bool is_packed,
+                                                        ExtensionSet* set) {
+    return reinterpret_cast<RepeatedPtrField<Type>*>(
+        set->MutableRawRepeatedField(number, field_type, is_packed, NULL));
+  }
+
+  static const RepeatedFieldType* GetDefaultRepeatedField();
+};
+
+LIBPROTOBUF_EXPORT extern ProtobufOnceType repeated_message_generic_type_traits_once_init_;
+
+// This class exists only to hold a generic default empty repeated field for all
+// message-type repeated field extensions.
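+//
+// As an illustrative sketch (not part of the upstream header): calling
+//   const RepeatedPtrField<Bar>& v = msg.GetRepeatedExtension(bar);
+// on a message that carries no such extension returns a reference to the
+// single shared empty field below, reinterpret_cast to RepeatedPtrField<Bar>,
+// which is why one GetDefaultRepeatedField() can serve every message type
+// at once.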
+class LIBPROTOBUF_EXPORT RepeatedMessageGenericTypeTraits {
+ public:
+  typedef RepeatedPtrField< ::google::protobuf::MessageLite*> RepeatedFieldType;
+ private:
+  template<typename Type> friend class RepeatedMessageTypeTraits;
+  static void InitializeDefaultRepeatedFields();
+  static void DestroyDefaultRepeatedFields();
+  static const RepeatedFieldType* default_repeated_field_;
+};
+
+template<typename Type> inline
+    const typename RepeatedMessageTypeTraits<Type>::RepeatedFieldType*
+    RepeatedMessageTypeTraits<Type>::GetDefaultRepeatedField() {
+  ::google::protobuf::GoogleOnceInit(
+      &repeated_message_generic_type_traits_once_init_,
+      &RepeatedMessageGenericTypeTraits::InitializeDefaultRepeatedFields);
+  return reinterpret_cast<const RepeatedFieldType*>(
+      RepeatedMessageGenericTypeTraits::default_repeated_field_);
+}
+
+// -------------------------------------------------------------------
+// ExtensionIdentifier
+
+// This is the type of actual extension objects. E.g. if you have:
+//   extends Foo with optional int32 bar = 1234;
+// then "bar" will be defined in C++ as:
+//   ExtensionIdentifier<Foo, PrimitiveTypeTraits<int32>, 1, false> bar(1234);
+//
+// Note that we could, in theory, supply the field number as a template
+// parameter, and thus make an instance of ExtensionIdentifier have no
+// actual contents. However, if we did that, then using an extension
+// identifier would not necessarily cause the compiler to output any sort
+// of reference to any symbol defined in the extension's .pb.o file. Some
+// linkers will actually drop object files that are not explicitly referenced,
+// but that would be bad because it would cause this extension to not be
+// registered at static initialization, and therefore using it would crash.
+
+template <typename ExtendeeType, typename TypeTraitsType,
+          FieldType field_type, bool is_packed>
+class ExtensionIdentifier {
+ public:
+  typedef TypeTraitsType TypeTraits;
+  typedef ExtendeeType Extendee;
+
+  ExtensionIdentifier(int number, typename TypeTraits::ConstType default_value)
+      : number_(number), default_value_(default_value) {}
+  inline int number() const { return number_; }
+  typename TypeTraits::ConstType default_value() const {
+    return default_value_;
+  }
+
+ private:
+  const int number_;
+  typename TypeTraits::ConstType default_value_;
+};
+
+// -------------------------------------------------------------------
+// Generated accessors
+
+// This macro should be expanded in the context of a generated type which
+// has extensions.
+//
+// We use "_proto_TypeTraits" as a type name below because "TypeTraits"
+// causes problems if the class has a nested message or enum type with that
+// name and "_TypeTraits" is technically reserved for the C++ library since
+// it starts with an underscore followed by a capital letter.
+//
+// For similar reason, we use "_field_type" and "_is_packed" as parameter names
+// below, so that "field_type" and "is_packed" can be used as field names.
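+//
+// As an illustrative sketch (not part of the upstream header), for
+//   message Foo { extensions 100 to 199; }
+//   extend Foo { optional int32 bar = 100; }
+// protoc expands this macro inside class Foo, so user code reads:
+//   Foo msg;
+//   msg.SetExtension(bar, 42);
+//   if (msg.HasExtension(bar)) {
+//     int32 v = msg.GetExtension(bar);
+//   }
+// with each accessor forwarding to _extensions_ through the traits class
+// carried by the ExtensionIdentifier "bar".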
+#define GOOGLE_PROTOBUF_EXTENSION_ACCESSORS(CLASSNAME) \
+  /* Has, Size, Clear */ \
+  template <typename _proto_TypeTraits, \
+            ::google::protobuf::internal::FieldType _field_type, \
+            bool _is_packed> \
+  inline bool HasExtension( \
+      const ::google::protobuf::internal::ExtensionIdentifier< \
+        CLASSNAME, _proto_TypeTraits, _field_type, _is_packed>& id) const { \
+    return _extensions_.Has(id.number()); \
+  } \
+ \
+  template <typename _proto_TypeTraits, \
+            ::google::protobuf::internal::FieldType _field_type, \
+            bool _is_packed> \
+  inline void ClearExtension( \
+      const ::google::protobuf::internal::ExtensionIdentifier< \
+        CLASSNAME, _proto_TypeTraits, _field_type, _is_packed>& id) { \
+    _extensions_.ClearExtension(id.number()); \
+  } \
+ \
+  template <typename _proto_TypeTraits, \
+            ::google::protobuf::internal::FieldType _field_type, \
+            bool _is_packed> \
+  inline int ExtensionSize( \
+      const ::google::protobuf::internal::ExtensionIdentifier< \
+        CLASSNAME, _proto_TypeTraits, _field_type, _is_packed>& id) const { \
+    return _extensions_.ExtensionSize(id.number()); \
+  } \
+ \
+  /* Singular accessors */ \
+  template <typename _proto_TypeTraits, \
+            ::google::protobuf::internal::FieldType _field_type, \
+            bool _is_packed> \
+  inline typename _proto_TypeTraits::Singular::ConstType GetExtension( \
+      const ::google::protobuf::internal::ExtensionIdentifier< \
+        CLASSNAME, _proto_TypeTraits, _field_type, _is_packed>& id) const { \
+    return _proto_TypeTraits::Get(id.number(), _extensions_, \
+                                  id.default_value()); \
+  } \
+ \
+  template <typename _proto_TypeTraits, \
+            ::google::protobuf::internal::FieldType _field_type, \
+            bool _is_packed> \
+  inline typename _proto_TypeTraits::Singular::MutableType MutableExtension( \
+      const ::google::protobuf::internal::ExtensionIdentifier< \
+        CLASSNAME, _proto_TypeTraits, _field_type, _is_packed>& id) { \
+    return _proto_TypeTraits::Mutable(id.number(), _field_type, \
+                                      &_extensions_); \
+  } \
+ \
+  template <typename _proto_TypeTraits, \
+            ::google::protobuf::internal::FieldType _field_type, \
+            bool _is_packed> \
+  inline void SetExtension( \
+      const ::google::protobuf::internal::ExtensionIdentifier< \
+        CLASSNAME, _proto_TypeTraits, _field_type, _is_packed>& id, \
+      typename _proto_TypeTraits::Singular::ConstType value) { \
+    _proto_TypeTraits::Set(id.number(), _field_type, value, &_extensions_); \
+  } \
+ \
+  template <typename _proto_TypeTraits, \
+            ::google::protobuf::internal::FieldType _field_type, \
+            bool _is_packed> \
+  inline void SetAllocatedExtension( \
+      const ::google::protobuf::internal::ExtensionIdentifier< \
+        CLASSNAME, _proto_TypeTraits, _field_type, _is_packed>& id, \
+      typename _proto_TypeTraits::Singular::MutableType value) { \
+    _proto_TypeTraits::SetAllocated(id.number(), _field_type, \
+                                    value, &_extensions_); \
+  } \
+  template <typename _proto_TypeTraits, \
+            ::google::protobuf::internal::FieldType _field_type, \
+            bool _is_packed> \
+  inline void UnsafeArenaSetAllocatedExtension( \
+      const ::google::protobuf::internal::ExtensionIdentifier< \
+        CLASSNAME, _proto_TypeTraits, _field_type, _is_packed>& id, \
+      typename _proto_TypeTraits::Singular::MutableType value) { \
+    _proto_TypeTraits::UnsafeArenaSetAllocated(id.number(), _field_type, \
+                                               value, &_extensions_); \
+  } \
+  template <typename _proto_TypeTraits, \
+            ::google::protobuf::internal::FieldType _field_type, \
+            bool _is_packed> \
+  inline typename _proto_TypeTraits::Singular::MutableType ReleaseExtension( \
+      const ::google::protobuf::internal::ExtensionIdentifier< \
+        CLASSNAME, _proto_TypeTraits, _field_type, _is_packed>& id) { \
+    return _proto_TypeTraits::Release(id.number(), _field_type, \
+                                      &_extensions_); \
+  } \
+  template <typename _proto_TypeTraits, \
+            ::google::protobuf::internal::FieldType _field_type, \
+            bool _is_packed> \
+  inline typename _proto_TypeTraits::Singular::MutableType \
+      UnsafeArenaReleaseExtension( \
+      const ::google::protobuf::internal::ExtensionIdentifier< \
+        CLASSNAME, _proto_TypeTraits, _field_type, _is_packed>& id) { \
+    return _proto_TypeTraits::UnsafeArenaRelease(id.number(), _field_type, \
+                                                 &_extensions_); \
+  } \
+ \
+  /* Repeated accessors */ \
+  template <typename _proto_TypeTraits, \
+            ::google::protobuf::internal::FieldType _field_type, \
+            bool _is_packed> \
+  inline typename _proto_TypeTraits::Repeated::ConstType GetExtension( \
+      const ::google::protobuf::internal::ExtensionIdentifier< \
+        CLASSNAME, _proto_TypeTraits, _field_type, _is_packed>& id, \
+      int index) const { \
+    return _proto_TypeTraits::Get(id.number(), _extensions_, index); \
+  } \
+ \
+  template <typename _proto_TypeTraits, \
+            ::google::protobuf::internal::FieldType _field_type, \
+            bool _is_packed> \
+  inline typename _proto_TypeTraits::Repeated::MutableType MutableExtension( \
+      const ::google::protobuf::internal::ExtensionIdentifier< \
+        CLASSNAME, _proto_TypeTraits, _field_type, _is_packed>& id, \
+      int index) { \
+    return _proto_TypeTraits::Mutable(id.number(), index, &_extensions_); \
+  } \
+ \
+  template <typename _proto_TypeTraits, \
+            ::google::protobuf::internal::FieldType _field_type, \
+            bool _is_packed> \
+  inline void SetExtension( \
+      const ::google::protobuf::internal::ExtensionIdentifier< \
+        CLASSNAME, _proto_TypeTraits, _field_type, _is_packed>& id, \
+      int index, typename _proto_TypeTraits::Repeated::ConstType value) { \
+    _proto_TypeTraits::Set(id.number(), index, value, &_extensions_); \
+  } \
+ \
+  template <typename _proto_TypeTraits, \
+            ::google::protobuf::internal::FieldType _field_type, \
+            bool _is_packed> \
+  inline typename _proto_TypeTraits::Repeated::MutableType AddExtension( \
+      const ::google::protobuf::internal::ExtensionIdentifier< \
+        CLASSNAME, _proto_TypeTraits, _field_type, _is_packed>& id) { \
+    return _proto_TypeTraits::Add(id.number(), _field_type, &_extensions_); \
+  } \
+ \
+  template <typename _proto_TypeTraits, \
+            ::google::protobuf::internal::FieldType _field_type, \
+            bool _is_packed> \
+  inline void AddExtension( \
+      const ::google::protobuf::internal::ExtensionIdentifier< \
+        CLASSNAME, _proto_TypeTraits, _field_type, _is_packed>& id, \
+      typename _proto_TypeTraits::Repeated::ConstType value) { \
+    _proto_TypeTraits::Add(id.number(), _field_type, _is_packed, \
+                           value, &_extensions_); \
+  } \
+ \
+  template <typename _proto_TypeTraits, \
+            ::google::protobuf::internal::FieldType _field_type, \
+            bool _is_packed> \
+  inline const typename _proto_TypeTraits::Repeated::RepeatedFieldType& \
+      GetRepeatedExtension( \
+          const ::google::protobuf::internal::ExtensionIdentifier< \
+              CLASSNAME, _proto_TypeTraits, _field_type, \
+              _is_packed>& id) const { \
+    return _proto_TypeTraits::GetRepeated(id.number(), _extensions_); \
+  } \
+ \
+  template <typename _proto_TypeTraits, \
+            ::google::protobuf::internal::FieldType _field_type, \
+            bool _is_packed> \
+  inline typename _proto_TypeTraits::Repeated::RepeatedFieldType* \
+      MutableRepeatedExtension( \
+          const ::google::protobuf::internal::ExtensionIdentifier< \
+              CLASSNAME, _proto_TypeTraits, _field_type, \
+              _is_packed>& id) { \
+    return _proto_TypeTraits::MutableRepeated(id.number(), _field_type, \
+                                              _is_packed, &_extensions_); \
+  }
+
+}  // namespace internal
+}  // namespace protobuf
+
+}  // namespace google
+#endif  // GOOGLE_PROTOBUF_EXTENSION_SET_H__
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/generated_message_util.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/generated_message_util.h
new file mode 100644
index 00000000..325f6467
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/generated_message_util.h
@@ -0,0 +1,169 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc. All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Author: kenton@google.com (Kenton Varda)
+// Based on original Protocol Buffers design by
+// Sanjay Ghemawat, Jeff Dean, and others.
+//
+// This file contains miscellaneous helper code used by generated code --
+// including lite types -- but which should not be used directly by users.
+
+#ifndef GOOGLE_PROTOBUF_GENERATED_MESSAGE_UTIL_H__
+#define GOOGLE_PROTOBUF_GENERATED_MESSAGE_UTIL_H__
+
+#include <assert.h>
+#include <string>
+
+#include "common.h"
+#include "once.h"
+#include "has_bits.h"
+
+namespace google {
+
+namespace protobuf {
+
+class Arena;
+namespace io { class CodedInputStream; }
+
+namespace internal {
+
+
+// Annotation for the compiler to emit a deprecation message if a field marked
+// with option 'deprecated=true' is used in the code, or for other things in
+// generated code which are deprecated.
+//
+// For internal use in the pb.cc files, deprecation warnings are suppressed
+// there.
+#undef DEPRECATED_PROTOBUF_FIELD
+#define PROTOBUF_DEPRECATED
+
+#define GOOGLE_PROTOBUF_DEPRECATED_ATTR
+
+
+// Constants for special floating point values.
+LIBPROTOBUF_EXPORT double Infinity();
+LIBPROTOBUF_EXPORT double NaN();
+
+// This type is used to define a global variable, without its constructor
+// and destructor run on start and end of the program lifetime. This circumvents
+// the initial construction order fiasco, while keeping the address of the
+// empty string a compile time constant.
+template <typename T>
+class ExplicitlyConstructed {
+ public:
+  void DefaultConstruct() {
+    new (&union_) T();
+    init_ = true;
+  }
+
+  bool IsInitialized() { return init_; }
+  void Shutdown() {
+    if (init_) {
+      init_ = false;
+      get_mutable()->~T();
+    }
+  }
+
+  const T& get() const { return reinterpret_cast<const T&>(union_); }
+  T* get_mutable() { return reinterpret_cast<T*>(&union_); }
+
+ private:
+  // Prefer c++14 aligned_storage, but for compatibility this will do.
+  union AlignedUnion {
+    char space[sizeof(T)];
+    int64 align_to_int64;
+    void* align_to_ptr;
+  } union_;
+  bool init_;  // false by linker
+};
+
+// TODO(jieluo): Change to template. We have tried to use template,
+// but it causes net/rpc/python:rpcutil_test to fail (the empty string will
+// be initialized twice). It may be related to swig. Change to template after
+// we find the solution.
+
+// Default empty string object. Don't use this directly. Instead, call
+// GetEmptyString() to get the reference.
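+//
+// As an illustrative sketch (not part of the upstream header), generated
+// code for an unset string field typically hands back this object, e.g.
+//   const string& name() const {
+//     return name_.GetNoArena(&GetEmptyStringAlreadyInited());
+//   }
+// ExplicitlyConstructed keeps the string at a fixed address without
+// registering a static constructor or destructor for it.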
+extern ExplicitlyConstructed< ::std::string> fixed_address_empty_string;
+LIBPROTOBUF_EXPORT extern ProtobufOnceType empty_string_once_init_;
+LIBPROTOBUF_EXPORT void InitEmptyString();
+
+
+LIBPROTOBUF_EXPORT inline const ::std::string& GetEmptyStringAlreadyInited() {
+  return fixed_address_empty_string.get();
+}
+
+LIBPROTOBUF_EXPORT inline const ::std::string& GetEmptyString() {
+  ::google::protobuf::GoogleOnceInit(&empty_string_once_init_, &InitEmptyString);
+  return GetEmptyStringAlreadyInited();
+}
+
+LIBPROTOBUF_EXPORT int StringSpaceUsedExcludingSelf(const string& str);
+
+
+// True if IsInitialized() is true for all elements of t. Type is expected
+// to be a RepeatedPtrField<some message type>. It's useful to have this
+// helper here to keep the protobuf compiler from ever having to emit loops in
+// IsInitialized() methods. We want the C++ compiler to inline this or not
+// as it sees fit.
+template <class Type> bool AllAreInitialized(const Type& t) {
+  for (int i = t.size(); --i >= 0; ) {
+    if (!t.Get(i).IsInitialized()) return false;
+  }
+  return true;
+}
+
+// Helper function to crash on merge failure.
+// Moved out of generated code to reduce binary size.
+LIBPROTOBUF_EXPORT void MergeFromFail(const char* file, int line) GOOGLE_ATTRIBUTE_NORETURN;
+
+// We compute sizes as size_t but cache them as int. This function converts a
+// computed size to a cached size. Since we don't proceed with serialization if
+// the total size was > INT_MAX, it is not important what this function returns
+// for inputs > INT_MAX.
+inline int ToCachedSize(size_t size) {
+  return static_cast<int>(size);
+}
+
+// We mainly calculate sizes in terms of size_t, but some functions that compute
+// sizes return "int". These int sizes are expected to always be positive.
+// This function is more efficient than casting an int to size_t directly on
+// 64-bit platforms because it avoids making the compiler emit a sign extending
+// instruction, which we don't want and don't want to pay for.
+inline size_t FromIntSize(int size) {
+  // Convert to unsigned before widening so sign extension is not necessary.
+  return static_cast<unsigned int>(size);
+}
+
+}  // namespace internal
+}  // namespace protobuf
+
+}  // namespace google
+#endif  // GOOGLE_PROTOBUF_GENERATED_MESSAGE_UTIL_H__
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/has_bits.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/has_bits.h
new file mode 100644
index 00000000..058a8057
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/has_bits.h
@@ -0,0 +1,72 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc. All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLE_PROTOBUF_HAS_BITS_H__ +#define GOOGLE_PROTOBUF_HAS_BITS_H__ + +#include "common.h" + +namespace google { +namespace protobuf { +namespace internal { + +template +class HasBits { + public: + HasBits() GOOGLE_ATTRIBUTE_ALWAYS_INLINE { Clear(); } + + void Clear() GOOGLE_ATTRIBUTE_ALWAYS_INLINE { + memset(has_bits_, 0, sizeof(has_bits_)); + } + + ::google::protobuf::uint32& operator[](int index) GOOGLE_ATTRIBUTE_ALWAYS_INLINE { + return has_bits_[index]; + } + + const ::google::protobuf::uint32& operator[](int index) const GOOGLE_ATTRIBUTE_ALWAYS_INLINE { + return has_bits_[index]; + } + + bool operator==(const HasBits& rhs) const { + return memcmp(has_bits_, rhs.has_bits_, sizeof(has_bits_)) == 0; + } + + bool operator!=(const HasBits& rhs) const { + return !(*this == rhs); + } + private: + ::google::protobuf::uint32 has_bits_[doublewords]; +}; + +} // namespace internal +} // namespace protobuf + +} // namespace google +#endif // GOOGLE_PROTOBUF_HAS_BITS_H__ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/coded_stream.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/coded_stream.h new file mode 100644 index 00000000..007bde5f --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/coded_stream.h @@ -0,0 +1,1367 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Author: kenton@google.com (Kenton Varda) +// Based on original Protocol Buffers design by +// Sanjay Ghemawat, Jeff Dean, and others. +// +// This file contains the CodedInputStream and CodedOutputStream classes, +// which wrap a ZeroCopyInputStream or ZeroCopyOutputStream, respectively, +// and allow you to read or write individual pieces of data in various +// formats. In particular, these implement the varint encoding for +// integers, a simple variable-length encoding in which smaller numbers +// take fewer bytes. +// +// Typically these classes will only be used internally by the protocol +// buffer library in order to encode and decode protocol buffers. Clients +// of the library only need to know about this class if they wish to write +// custom message parsing or serialization procedures. +// +// CodedOutputStream example: +// // Write some data to "myfile". First we write a 4-byte "magic number" +// // to identify the file type, then write a length-delimited string. The +// // string is composed of a varint giving the length followed by the raw +// // bytes. +// int fd = open("myfile", O_CREAT | O_WRONLY); +// ZeroCopyOutputStream* raw_output = new FileOutputStream(fd); +// CodedOutputStream* coded_output = new CodedOutputStream(raw_output); +// +// int magic_number = 1234; +// char text[] = "Hello world!"; +// coded_output->WriteLittleEndian32(magic_number); +// coded_output->WriteVarint32(strlen(text)); +// coded_output->WriteRaw(text, strlen(text)); +// +// delete coded_output; +// delete raw_output; +// close(fd); +// +// CodedInputStream example: +// // Read a file created by the above code. +// int fd = open("myfile", O_RDONLY); +// ZeroCopyInputStream* raw_input = new FileInputStream(fd); +// CodedInputStream coded_input = new CodedInputStream(raw_input); +// +// coded_input->ReadLittleEndian32(&magic_number); +// if (magic_number != 1234) { +// cerr << "File not in expected format." << endl; +// return; +// } +// +// uint32 size; +// coded_input->ReadVarint32(&size); +// +// char* text = new char[size + 1]; +// coded_input->ReadRaw(buffer, size); +// text[size] = '\0'; +// +// delete coded_input; +// delete raw_input; +// close(fd); +// +// cout << "Text is: " << text << endl; +// delete [] text; +// +// For those who are interested, varint encoding is defined as follows: +// +// The encoding operates on unsigned integers of up to 64 bits in length. +// Each byte of the encoded value has the format: +// * bits 0-6: Seven bits of the number being encoded. +// * bit 7: Zero if this is the last byte in the encoding (in which +// case all remaining bits of the number are zero) or 1 if +// more bytes follow. +// The first byte contains the least-significant 7 bits of the number, the +// second byte (if present) contains the next-least-significant 7 bits, +// and so on. So, the binary number 1011000101011 would be encoded in two +// bytes as "10101011 00101100". +// +// In theory, varint could be used to encode integers of any length. 
+// However, for practicality we set a limit at 64 bits. The maximum encoded
+// length of a number is thus 10 bytes.
+
+#ifndef GOOGLE_PROTOBUF_IO_CODED_STREAM_H__
+#define GOOGLE_PROTOBUF_IO_CODED_STREAM_H__
+
+#include <assert.h>
+#include <string>
+#include <utility>
+#ifdef _MSC_VER
+  // Assuming windows is always little-endian.
+  #if !defined(PROTOBUF_DISABLE_LITTLE_ENDIAN_OPT_FOR_TEST)
+    #define PROTOBUF_LITTLE_ENDIAN 1
+  #endif
+  #if _MSC_VER >= 1300 && !defined(__INTEL_COMPILER)
+    // If MSVC has "/RTCc" set, it will complain about truncating casts at
+    // runtime. This file contains some intentional truncating casts.
+    #pragma runtime_checks("c", off)
+  #endif
+#else
+  #include <sys/param.h>  // __BYTE_ORDER
+  #if ((defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)) || \
+       (defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN)) && \
+      !defined(PROTOBUF_DISABLE_LITTLE_ENDIAN_OPT_FOR_TEST)
+    #define PROTOBUF_LITTLE_ENDIAN 1
+  #endif
+#endif
+#include "common.h"
+
+namespace google {
+
+namespace protobuf {
+
+class DescriptorPool;
+class MessageFactory;
+
+namespace io {
+
+// Defined in this file.
+class CodedInputStream;
+class CodedOutputStream;
+
+// Defined in other files.
+class ZeroCopyInputStream;   // zero_copy_stream.h
+class ZeroCopyOutputStream;  // zero_copy_stream.h
+
+// Class which reads and decodes binary data which is composed of varint-
+// encoded integers and fixed-width pieces. Wraps a ZeroCopyInputStream.
+// Most users will not need to deal with CodedInputStream.
+//
+// Most methods of CodedInputStream that return a bool return false if an
+// underlying I/O error occurs or if the data is malformed. Once such a
+// failure occurs, the CodedInputStream is broken and is no longer useful.
+class LIBPROTOBUF_EXPORT CodedInputStream {
+ public:
+  // Create a CodedInputStream that reads from the given ZeroCopyInputStream.
+  explicit CodedInputStream(ZeroCopyInputStream* input);
+
+  // Create a CodedInputStream that reads from the given flat array. This is
+  // faster than using an ArrayInputStream. PushLimit(size) is implied by
+  // this constructor.
+  explicit CodedInputStream(const uint8* buffer, int size);
+
+  // Destroy the CodedInputStream and position the underlying
+  // ZeroCopyInputStream at the first unread byte. If an error occurred while
+  // reading (causing a method to return false), then the exact position of
+  // the input stream may be anywhere between the last value that was read
+  // successfully and the stream's byte limit.
+  ~CodedInputStream();
+
+  // Return true if this CodedInputStream reads from a flat array instead of
+  // a ZeroCopyInputStream.
+  inline bool IsFlat() const;
+
+  // Skips a number of bytes. Returns false if an underlying read error
+  // occurs.
+  bool Skip(int count);
+
+  // Sets *data to point directly at the unread part of the CodedInputStream's
+  // underlying buffer, and *size to the size of that buffer, but does not
+  // advance the stream's current position. This will always either produce
+  // a non-empty buffer or return false. If the caller consumes any of
+  // this data, it should then call Skip() to skip over the consumed bytes.
+  // This may be useful for implementing external fast parsing routines for
+  // types of data not covered by the CodedInputStream interface.
+  bool GetDirectBufferPointer(const void** data, int* size);
+
+  // Like GetDirectBufferPointer, but this method is inlined, and does not
+  // attempt to Refresh() if the buffer is currently empty.
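+  //
+  // As an illustrative sketch (not part of the upstream header), a fast
+  // parsing path might use it as:
+  //   const void* data;
+  //   int size;
+  //   input->GetDirectBufferPointerInline(&data, &size);
+  //   if (size > 0) {
+  //     // consume some prefix of data, then input->Skip() past it
+  //   }
+  // Unlike GetDirectBufferPointer(), *size may come back as 0 here because
+  // no Refresh() is attempted on an empty buffer.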
+ GOOGLE_ATTRIBUTE_ALWAYS_INLINE void GetDirectBufferPointerInline(const void** data, + int* size); + + // Read raw bytes, copying them into the given buffer. + bool ReadRaw(void* buffer, int size); + + // Like the above, with inlined optimizations. This should only be used + // by the protobuf implementation. + GOOGLE_ATTRIBUTE_ALWAYS_INLINE bool InternalReadRawInline(void* buffer, int size); + + // Like ReadRaw, but reads into a string. + // + // Implementation Note: ReadString() grows the string gradually as it + // reads in the data, rather than allocating the entire requested size + // upfront. This prevents denial-of-service attacks in which a client + // could claim that a string is going to be MAX_INT bytes long in order to + // crash the server because it can't allocate this much space at once. + bool ReadString(string* buffer, int size); + // Like the above, with inlined optimizations. This should only be used + // by the protobuf implementation. + GOOGLE_ATTRIBUTE_ALWAYS_INLINE bool InternalReadStringInline(string* buffer, + int size); + + + // Read a 32-bit little-endian integer. + bool ReadLittleEndian32(uint32* value); + // Read a 64-bit little-endian integer. + bool ReadLittleEndian64(uint64* value); + + // These methods read from an externally provided buffer. The caller is + // responsible for ensuring that the buffer has sufficient space. + // Read a 32-bit little-endian integer. + static const uint8* ReadLittleEndian32FromArray(const uint8* buffer, + uint32* value); + // Read a 64-bit little-endian integer. + static const uint8* ReadLittleEndian64FromArray(const uint8* buffer, + uint64* value); + + // Read an unsigned integer with Varint encoding, truncating to 32 bits. + // Reading a 32-bit value is equivalent to reading a 64-bit one and casting + // it to uint32, but may be more efficient. + bool ReadVarint32(uint32* value); + // Read an unsigned integer with Varint encoding. + bool ReadVarint64(uint64* value); + + // Reads a varint off the wire into an "int". This should be used for reading + // sizes off the wire (sizes of strings, submessages, bytes fields, etc). + // + // The value from the wire is interpreted as unsigned. If its value exceeds + // the representable value of an integer on this platform, instead of + // truncating we return false. Truncating (as performed by ReadVarint32() + // above) is an acceptable approach for fields representing an integer, but + // when we are parsing a size from the wire, truncating the value would result + // in us misparsing the payload. + bool ReadVarintSizeAsInt(int* value); + + // Read a tag. This calls ReadVarint32() and returns the result, or returns + // zero (which is not a valid tag) if ReadVarint32() fails. Also, it updates + // the last tag value, which can be checked with LastTagWas(). + // Always inline because this is only called in one place per parse loop + // but it is called for every iteration of said loop, so it should be fast. + // GCC doesn't want to inline this by default. + GOOGLE_ATTRIBUTE_ALWAYS_INLINE uint32 ReadTag(); + + // This usually a faster alternative to ReadTag() when cutoff is a manifest + // constant. It does particularly well for cutoff >= 127. The first part + // of the return value is the tag that was read, though it can also be 0 in + // the cases where ReadTag() would return 0. If the second part is true + // then the tag is known to be in [0, cutoff]. If not, the tag either is + // above cutoff or is 0. 
(There's intentional wiggle room when tag is 0, + // because that can arise in several ways, and for best performance we want + // to avoid an extra "is tag == 0?" check here.) + GOOGLE_ATTRIBUTE_ALWAYS_INLINE std::pair ReadTagWithCutoff( + uint32 cutoff); + + // Usually returns true if calling ReadVarint32() now would produce the given + // value. Will always return false if ReadVarint32() would not return the + // given value. If ExpectTag() returns true, it also advances past + // the varint. For best performance, use a compile-time constant as the + // parameter. + // Always inline because this collapses to a small number of instructions + // when given a constant parameter, but GCC doesn't want to inline by default. + GOOGLE_ATTRIBUTE_ALWAYS_INLINE bool ExpectTag(uint32 expected); + + // Like above, except this reads from the specified buffer. The caller is + // responsible for ensuring that the buffer is large enough to read a varint + // of the expected size. For best performance, use a compile-time constant as + // the expected tag parameter. + // + // Returns a pointer beyond the expected tag if it was found, or NULL if it + // was not. + GOOGLE_ATTRIBUTE_ALWAYS_INLINE static const uint8* ExpectTagFromArray( + const uint8* buffer, + uint32 expected); + + // Usually returns true if no more bytes can be read. Always returns false + // if more bytes can be read. If ExpectAtEnd() returns true, a subsequent + // call to LastTagWas() will act as if ReadTag() had been called and returned + // zero, and ConsumedEntireMessage() will return true. + bool ExpectAtEnd(); + + // If the last call to ReadTag() or ReadTagWithCutoff() returned the + // given value, returns true. Otherwise, returns false; + // + // This is needed because parsers for some types of embedded messages + // (with field type TYPE_GROUP) don't actually know that they've reached the + // end of a message until they see an ENDGROUP tag, which was actually part + // of the enclosing message. The enclosing message would like to check that + // tag to make sure it had the right number, so it calls LastTagWas() on + // return from the embedded parser to check. + bool LastTagWas(uint32 expected); + + // When parsing message (but NOT a group), this method must be called + // immediately after MergeFromCodedStream() returns (if it returns true) + // to further verify that the message ended in a legitimate way. For + // example, this verifies that parsing did not end on an end-group tag. + // It also checks for some cases where, due to optimizations, + // MergeFromCodedStream() can incorrectly return true. + bool ConsumedEntireMessage(); + + // Limits ---------------------------------------------------------- + // Limits are used when parsing length-delimited embedded messages. + // After the message's length is read, PushLimit() is used to prevent + // the CodedInputStream from reading beyond that length. Once the + // embedded message has been parsed, PopLimit() is called to undo the + // limit. + + // Opaque type used with PushLimit() and PopLimit(). Do not modify + // values of this type yourself. The only reason that this isn't a + // struct with private internals is for efficiency. + typedef int Limit; + + // Places a limit on the number of bytes that the stream may read, + // starting from the current position. Once the stream hits this limit, + // it will act like the end of the input has been reached until PopLimit() + // is called. + // + // As the names imply, the stream conceptually has a stack of limits. 
The + // shortest limit on the stack is always enforced, even if it is not the + // top limit. + // + // The value returned by PushLimit() is opaque to the caller, and must + // be passed unchanged to the corresponding call to PopLimit(). + Limit PushLimit(int byte_limit); + + // Pops the last limit pushed by PushLimit(). The input must be the value + // returned by that call to PushLimit(). + void PopLimit(Limit limit); + + // Returns the number of bytes left until the nearest limit on the + // stack is hit, or -1 if no limits are in place. + int BytesUntilLimit() const; + + // Returns current position relative to the beginning of the input stream. + int CurrentPosition() const; + + // Total Bytes Limit ----------------------------------------------- + // To prevent malicious users from sending excessively large messages + // and causing integer overflows or memory exhaustion, CodedInputStream + // imposes a hard limit on the total number of bytes it will read. + + // Sets the maximum number of bytes that this CodedInputStream will read + // before refusing to continue. To prevent integer overflows in the + // protocol buffers implementation, as well as to prevent servers from + // allocating enormous amounts of memory to hold parsed messages, the + // maximum message length should be limited to the shortest length that + // will not harm usability. The theoretical shortest message that could + // cause integer overflows is 512MB. The default limit is 64MB. Apps + // should set shorter limits if possible. If warning_threshold is not -1, + // a warning will be printed to stderr after warning_threshold bytes are + // read. For backwards compatibility all negative values get squashed to -1, + // as other negative values might have special internal meanings. + // An error will always be printed to stderr if the limit is reached. + // + // This is unrelated to PushLimit()/PopLimit(). + // + // Hint: If you are reading this because your program is printing a + // warning about dangerously large protocol messages, you may be + // confused about what to do next. The best option is to change your + // design such that excessively large messages are not necessary. + // For example, try to design file formats to consist of many small + // messages rather than a single large one. If this is infeasible, + // you will need to increase the limit. Chances are, though, that + // your code never constructs a CodedInputStream on which the limit + // can be set. You probably parse messages by calling things like + // Message::ParseFromString(). In this case, you will need to change + // your code to instead construct some sort of ZeroCopyInputStream + // (e.g. an ArrayInputStream), construct a CodedInputStream around + // that, then call Message::ParseFromCodedStream() instead. Then + // you can adjust the limit. Yes, it's more work, but you're doing + // something unusual. + void SetTotalBytesLimit(int total_bytes_limit, int warning_threshold); + + // The Total Bytes Limit minus the Current Position, or -1 if there + // is no Total Bytes Limit. + int BytesUntilTotalBytesLimit() const; + + // Recursion Limit ------------------------------------------------- + // To prevent corrupt or malicious messages from causing stack overflows, + // we must keep track of the depth of recursion when parsing embedded + // messages and groups. CodedInputStream keeps track of this because it + // is the only object that is passed down the stack during parsing. + + // Sets the maximum recursion depth. The default is 100. 
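+  //
+  // As an illustrative sketch (not part of the upstream header), a caller
+  // expecting unusually deep nesting can raise the budget before parsing:
+  //   CodedInputStream coded_input(raw_input);  // raw_input: ZeroCopyInputStream*
+  //   coded_input.SetRecursionLimit(1000);
+  //   message.ParseFromCodedStream(&coded_input);
+  // Each embedded message or group being parsed consumes one unit of the
+  // budget until its parse completes.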
+  void SetRecursionLimit(int limit);
+
+
+  // Increments the current recursion depth. Returns true if the depth is
+  // under the limit, false if it has gone over.
+  bool IncrementRecursionDepth();
+
+  // Decrements the recursion depth if possible.
+  void DecrementRecursionDepth();
+
+  // Decrements the recursion depth blindly. This is faster than
+  // DecrementRecursionDepth(). It should be used only if all previous
+  // increments to recursion depth were successful.
+  void UnsafeDecrementRecursionDepth();
+
+  // Shorthand for make_pair(PushLimit(byte_limit), --recursion_budget_).
+  // Using this can reduce code size and complexity in some cases. The caller
+  // is expected to check that the second part of the result is non-negative (to
+  // bail out if the depth of recursion is too high) and, if all is well, to
+  // later pass the first part of the result to PopLimit() or similar.
+  std::pair<Limit, int> IncrementRecursionDepthAndPushLimit(
+      int byte_limit);
+
+  // Shorthand for PushLimit(ReadVarint32(&length) ? length : 0).
+  Limit ReadLengthAndPushLimit();
+
+  // Helper that is equivalent to: {
+  //  bool result = ConsumedEntireMessage();
+  //  PopLimit(limit);
+  //  UnsafeDecrementRecursionDepth();
+  //  return result; }
+  // Using this can reduce code size and complexity in some cases.
+  // Do not use unless the current recursion depth is greater than zero.
+  bool DecrementRecursionDepthAndPopLimit(Limit limit);
+
+  // Helper that is equivalent to: {
+  //  bool result = ConsumedEntireMessage();
+  //  PopLimit(limit);
+  //  return result; }
+  // Using this can reduce code size and complexity in some cases.
+  bool CheckEntireMessageConsumedAndPopLimit(Limit limit);
+
+  // Extension Registry ----------------------------------------------
+  // ADVANCED USAGE: 99.9% of people can ignore this section.
+  //
+  // By default, when parsing extensions, the parser looks for extension
+  // definitions in the pool which owns the outer message's Descriptor.
+  // However, you may call SetExtensionRegistry() to provide an alternative
+  // pool instead. This makes it possible, for example, to parse a message
+  // using a generated class, but represent some extensions using
+  // DynamicMessage.
+
+  // Set the pool used to look up extensions. Most users do not need to call
+  // this as the correct pool will be chosen automatically.
+  //
+  // WARNING: It is very easy to misuse this. Carefully read the requirements
+  // below. Do not use this unless you are sure you need it. Almost no one
+  // does.
+  //
+  // Let's say you are parsing a message into message object m, and you want
+  // to take advantage of SetExtensionRegistry(). You must follow these
+  // requirements:
+  //
+  // The given DescriptorPool must contain m->GetDescriptor(). It is not
+  // sufficient for it to simply contain a descriptor that has the same name
+  // and content -- it must be the *exact object*. In other words:
+  //   assert(pool->FindMessageTypeByName(m->GetDescriptor()->full_name()) ==
+  //          m->GetDescriptor());
+  // There are two ways to satisfy this requirement:
+  // 1) Use m->GetDescriptor()->pool() as the pool. This is generally useless
+  //    because this is the pool that would be used anyway if you didn't call
+  //    SetExtensionRegistry() at all.
+  // 2) Use a DescriptorPool which has m->GetDescriptor()->pool() as an
+  //    "underlay". Read the documentation for DescriptorPool for more
+  //    information about underlays.
+  //
+  // You must also provide a MessageFactory.
This factory will be used to + // construct Message objects representing extensions. The factory's + // GetPrototype() MUST return non-NULL for any Descriptor which can be found + // through the provided pool. + // + // If the provided factory might return instances of protocol-compiler- + // generated (i.e. compiled-in) types, or if the outer message object m is + // a generated type, then the given factory MUST have this property: If + // GetPrototype() is given a Descriptor which resides in + // DescriptorPool::generated_pool(), the factory MUST return the same + // prototype which MessageFactory::generated_factory() would return. That + // is, given a descriptor for a generated type, the factory must return an + // instance of the generated class (NOT DynamicMessage). However, when + // given a descriptor for a type that is NOT in generated_pool, the factory + // is free to return any implementation. + // + // The reason for this requirement is that generated sub-objects may be + // accessed via the standard (non-reflection) extension accessor methods, + // and these methods will down-cast the object to the generated class type. + // If the object is not actually of that type, the results would be undefined. + // On the other hand, if an extension is not compiled in, then there is no + // way the code could end up accessing it via the standard accessors -- the + // only way to access the extension is via reflection. When using reflection, + // DynamicMessage and generated messages are indistinguishable, so it's fine + // if these objects are represented using DynamicMessage. + // + // Using DynamicMessageFactory on which you have called + // SetDelegateToGeneratedFactory(true) should be sufficient to satisfy the + // above requirement. + // + // If either pool or factory is NULL, both must be NULL. + // + // Note that this feature is ignored when parsing "lite" messages as they do + // not have descriptors. + void SetExtensionRegistry(const DescriptorPool* pool, + MessageFactory* factory); + + // Get the DescriptorPool set via SetExtensionRegistry(), or NULL if no pool + // has been provided. + const DescriptorPool* GetExtensionPool(); + + // Get the MessageFactory set via SetExtensionRegistry(), or NULL if no + // factory has been provided. + MessageFactory* GetExtensionFactory(); + + private: + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(CodedInputStream); + + const uint8* buffer_; + const uint8* buffer_end_; // pointer to the end of the buffer. + ZeroCopyInputStream* input_; + int total_bytes_read_; // total bytes read from input_, including + // the current buffer + + // If total_bytes_read_ surpasses INT_MAX, we record the extra bytes here + // so that we can BackUp() on destruction. + int overflow_bytes_; + + // LastTagWas() stuff. + uint32 last_tag_; // result of last ReadTag() or ReadTagWithCutoff(). + + // This is set true by ReadTag{Fallback/Slow}() if it is called when exactly + // at EOF, or by ExpectAtEnd() when it returns true. This happens when we + // reach the end of a message and attempt to read another tag. + bool legitimate_message_end_; + + // See EnableAliasing(). + bool aliasing_enabled_; + + // Limits + Limit current_limit_; // if position = -1, no limit is applied + + // For simplicity, if the current buffer crosses a limit (either a normal + // limit created by PushLimit() or the total bytes limit), buffer_size_ + // only tracks the number of bytes before that limit. This field + // contains the number of bytes after it. 
Note that this implies that if
+  // buffer_size_ == 0 and buffer_size_after_limit_ > 0, we know we've
+  // hit a limit. However, if both are zero, it doesn't necessarily mean
+  // we aren't at a limit -- the buffer may have ended exactly at the limit.
+  int buffer_size_after_limit_;
+
+  // Maximum number of bytes to read, period. This is unrelated to
+  // current_limit_. Set using SetTotalBytesLimit().
+  int total_bytes_limit_;
+
+  // If positive/0: Limit for bytes read after which a warning due to size
+  // should be logged.
+  // If -1: Printing of warning disabled. Can be set by client.
+  // If -2: Internal: Limit has been reached, print full size when destructing.
+  int total_bytes_warning_threshold_;
+
+  // Current recursion budget, controlled by IncrementRecursionDepth() and
+  // similar. Starts at recursion_limit_ and goes down: if this reaches
+  // -1 we are over budget.
+  int recursion_budget_;
+  // Recursion depth limit, set by SetRecursionLimit().
+  int recursion_limit_;
+
+  // See SetExtensionRegistry().
+  const DescriptorPool* extension_pool_;
+  MessageFactory* extension_factory_;
+
+  // Private member functions.
+
+  // Advance the buffer by a given number of bytes.
+  void Advance(int amount);
+
+  // Back up input_ to the current buffer position.
+  void BackUpInputToCurrentPosition();
+
+  // Recomputes the value of buffer_size_after_limit_. Must be called after
+  // current_limit_ or total_bytes_limit_ changes.
+  void RecomputeBufferLimits();
+
+  // Writes an error message saying that we hit total_bytes_limit_.
+  void PrintTotalBytesLimitError();
+
+  // Called when the buffer runs out to request more data. Implies an
+  // Advance(BufferSize()).
+  bool Refresh();
+
+  // When parsing varints, we optimize for the common case of small values, and
+  // then optimize for the case when the varint fits within the current buffer
+  // piece. The Fallback method is used when we can't use the one-byte
+  // optimization. The Slow method is yet another fallback when the buffer is
+  // not large enough. Making the slow path out-of-line speeds up the common
+  // case by 10-15%. The slow path is fairly uncommon: it only triggers when a
+  // message crosses multiple buffers. Note: ReadVarint32Fallback() and
+  // ReadVarint64Fallback() are called frequently and generally not inlined, so
+  // they have been optimized to avoid "out" parameters. The former returns -1
+  // if it fails and the uint32 it read otherwise. The latter has a bool
+  // indicating success or failure as part of its return type.
+  int64 ReadVarint32Fallback(uint32 first_byte_or_zero);
+  int ReadVarintSizeAsIntFallback();
+  std::pair<uint64, bool> ReadVarint64Fallback();
+  bool ReadVarint32Slow(uint32* value);
+  bool ReadVarint64Slow(uint64* value);
+  int ReadVarintSizeAsIntSlow();
+  bool ReadLittleEndian32Fallback(uint32* value);
+  bool ReadLittleEndian64Fallback(uint64* value);
+  // Fallback/slow methods for reading tags. These do not update last_tag_,
+  // but will set legitimate_message_end_ if we are at the end of the input
+  // stream.
+  uint32 ReadTagFallback(uint32 first_byte_or_zero);
+  uint32 ReadTagSlow();
+  bool ReadStringFallback(string* buffer, int size);
+
+  // Return the size of the buffer.
+  int BufferSize() const;
+
+  static const int kDefaultTotalBytesLimit = 64 << 20;  // 64MB
+
+  static const int kDefaultTotalBytesWarningThreshold = 32 << 20;  // 32MB
+
+  static int default_recursion_limit_;  // 100 by default.
+};
+
+// Class which encodes and writes binary data which is composed of varint-
+// encoded integers and fixed-width pieces. Wraps a ZeroCopyOutputStream.
+// Most users will not need to deal with CodedOutputStream.
+//
+// Most methods of CodedOutputStream which return a bool return false if an
+// underlying I/O error occurs. Once such a failure occurs, the
+// CodedOutputStream is broken and is no longer useful. The Write* methods do
+// not return the stream status, but will invalidate the stream if an error
+// occurs. The client can probe HadError() to determine the status.
+//
+// Note that every method of CodedOutputStream which writes some data has
+// a corresponding static "ToArray" version. These versions write directly
+// to the provided buffer, returning a pointer past the last written byte.
+// They require that the buffer has sufficient capacity for the encoded data.
+// This allows an optimization where we check if an output stream has enough
+// space for an entire message before we start writing and, if there is, we
+// call only the ToArray methods to avoid doing bound checks for each
+// individual value.
+// i.e., in the example above:
+//
+//   CodedOutputStream* coded_output = new CodedOutputStream(raw_output);
+//   int magic_number = 1234;
+//   char text[] = "Hello world!";
+//
+//   int coded_size = sizeof(magic_number) +
+//                    CodedOutputStream::VarintSize32(strlen(text)) +
+//                    strlen(text);
+//
+//   uint8* buffer =
+//       coded_output->GetDirectBufferForNBytesAndAdvance(coded_size);
+//   if (buffer != NULL) {
+//     // The output stream has enough space in the buffer: write directly to
+//     // the array.
+//     buffer = CodedOutputStream::WriteLittleEndian32ToArray(magic_number,
+//                                                            buffer);
+//     buffer = CodedOutputStream::WriteVarint32ToArray(strlen(text), buffer);
+//     buffer = CodedOutputStream::WriteRawToArray(text, strlen(text), buffer);
+//   } else {
+//     // Make bound-checked writes, which will ask the underlying stream for
+//     // more space as needed.
+//     coded_output->WriteLittleEndian32(magic_number);
+//     coded_output->WriteVarint32(strlen(text));
+//     coded_output->WriteRaw(text, strlen(text));
+//   }
+//
+//   delete coded_output;
+class LIBPROTOBUF_EXPORT CodedOutputStream {
+ public:
+  // Create a CodedOutputStream that writes to the given ZeroCopyOutputStream.
+  explicit CodedOutputStream(ZeroCopyOutputStream* output);
+  CodedOutputStream(ZeroCopyOutputStream* output, bool do_eager_refresh);
+
+  // Destroy the CodedOutputStream and position the underlying
+  // ZeroCopyOutputStream immediately after the last byte written.
+  ~CodedOutputStream();
+
+  // Trims any unused space in the underlying buffer so that its size matches
+  // the number of bytes written by this stream. The underlying buffer will
+  // automatically be trimmed when this stream is destroyed; this call is only
+  // necessary if the underlying buffer is accessed *before* the stream is
+  // destroyed.
+  void Trim();
+
+  // Skips a number of bytes, leaving the bytes unmodified in the underlying
+  // buffer. Returns false if an underlying write error occurs. This is
+  // mainly useful with GetDirectBufferPointer().
+  bool Skip(int count);
+
+  // Sets *data to point directly at the unwritten part of the
+  // CodedOutputStream's underlying buffer, and *size to the size of that
+  // buffer, but does not advance the stream's current position. This will
+  // always either produce a non-empty buffer or return false. If the caller
+  // writes any data to this buffer, it should then call Skip() to skip over
+  // the consumed bytes. This may be useful for implementing external fast
+  // serialization routines for types of data not covered by the
+  // CodedOutputStream interface.
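+  //
+  // Illustrative sketch (an added example, not from the original comment;
+  // "output", "src" and "n" are placeholders):
+  //   void* data;
+  //   int size;
+  //   if (output->GetDirectBufferPointer(&data, &size) && size >= n) {
+  //     memcpy(data, src, n);  // write directly into the stream's buffer
+  //     output->Skip(n);       // advance past the bytes just written
+  //   }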
+  bool GetDirectBufferPointer(void** data, int* size);
+
+  // If there are at least "size" bytes available in the current buffer,
+  // returns a pointer directly into the buffer and advances over these bytes.
+  // The caller may then write directly into this buffer (e.g. using the
+  // *ToArray static methods) rather than go through CodedOutputStream. If
+  // there are not enough bytes available, returns NULL. The return pointer is
+  // invalidated as soon as any other non-const method of CodedOutputStream
+  // is called.
+  inline uint8* GetDirectBufferForNBytesAndAdvance(int size);
+
+  // Write raw bytes, copying them from the given buffer.
+  void WriteRaw(const void* buffer, int size);
+  // Like WriteRaw() but will try to write aliased data if aliasing is
+  // turned on.
+  void WriteRawMaybeAliased(const void* data, int size);
+  // Like WriteRaw() but writing directly to the target array.
+  // This is _not_ inlined, as the compiler often optimizes memcpy into inline
+  // copy loops. Since this gets called by every field with string or bytes
+  // type, inlining may lead to a significant amount of code bloat, with only a
+  // minor performance gain.
+  static uint8* WriteRawToArray(const void* buffer, int size, uint8* target);
+
+  // Equivalent to WriteRaw(str.data(), str.size()).
+  void WriteString(const string& str);
+  // Like WriteString() but writing directly to the target array.
+  static uint8* WriteStringToArray(const string& str, uint8* target);
+  // Write the varint-encoded size of str followed by str.
+  static uint8* WriteStringWithSizeToArray(const string& str, uint8* target);
+
+
+  // Instructs the CodedOutputStream to allow the underlying
+  // ZeroCopyOutputStream to hold pointers to the original structure instead of
+  // copying, if it supports it (i.e. output->AllowsAliasing() is true). If the
+  // underlying stream does not support aliasing, then enabling it has no
+  // effect. For now, this only affects the behavior of
+  // WriteRawMaybeAliased().
+  //
+  // NOTE: It is caller's responsibility to ensure that the chunk of memory
+  // remains live until all of the data has been consumed from the stream.
+  void EnableAliasing(bool enabled);
+
+  // Write a 32-bit little-endian integer.
+  void WriteLittleEndian32(uint32 value);
+  // Like WriteLittleEndian32() but writing directly to the target array.
+  static uint8* WriteLittleEndian32ToArray(uint32 value, uint8* target);
+  // Write a 64-bit little-endian integer.
+  void WriteLittleEndian64(uint64 value);
+  // Like WriteLittleEndian64() but writing directly to the target array.
+  static uint8* WriteLittleEndian64ToArray(uint64 value, uint8* target);
+
+  // Write an unsigned integer with Varint encoding. Writing a 32-bit value
+  // is equivalent to casting it to uint64 and writing it as a 64-bit value,
+  // but may be more efficient.
+  void WriteVarint32(uint32 value);
+  // Like WriteVarint32() but writing directly to the target array.
+  static uint8* WriteVarint32ToArray(uint32 value, uint8* target);
+  // Write an unsigned integer with Varint encoding.
+  void WriteVarint64(uint64 value);
+  // Like WriteVarint64() but writing directly to the target array.
+  static uint8* WriteVarint64ToArray(uint64 value, uint8* target);
+
+  // Equivalent to WriteVarint32() except when the value is negative,
+  // in which case it must be sign-extended to a full 10 bytes.
+  void WriteVarint32SignExtended(int32 value);
+  // Like WriteVarint32SignExtended() but writing directly to the target array.
+  static uint8* WriteVarint32SignExtendedToArray(int32 value, uint8* target);
+
+  // This is identical to WriteVarint32(), but optimized for writing tags.
+  // In particular, if the input is a compile-time constant, this method
+  // compiles down to a couple instructions.
+  // Always inline because otherwise the aforementioned optimization can't work,
+  // but GCC by default doesn't want to inline this.
+  void WriteTag(uint32 value);
+  // Like WriteTag() but writing directly to the target array.
+  GOOGLE_ATTRIBUTE_ALWAYS_INLINE static uint8* WriteTagToArray(uint32 value,
+                                                               uint8* target);
+
+  // Returns the number of bytes needed to encode the given value as a varint.
+  static size_t VarintSize32(uint32 value);
+  // Returns the number of bytes needed to encode the given value as a varint.
+  static size_t VarintSize64(uint64 value);
+
+  // If negative, 10 bytes. Otherwise, same as VarintSize32().
+  static size_t VarintSize32SignExtended(int32 value);
+
+  // Compile-time equivalent of VarintSize32().
+  template <uint32 Value>
+  struct StaticVarintSize32 {
+    static const size_t value =
+        (Value < (1 << 7))
+            ? 1
+            : (Value < (1 << 14))
+                ? 2
+                : (Value < (1 << 21))
+                    ? 3
+                    : (Value < (1 << 28))
+                        ? 4
+                        : 5;
+  };
+
+  // Returns the total number of bytes written since this object was created.
+  inline int ByteCount() const;
+
+  // Returns true if there was an underlying I/O error since this object was
+  // created.
+  bool HadError() const { return had_error_; }
+
+  // Deterministic serialization, if requested, guarantees that for a given
+  // binary, equal messages will always be serialized to the same bytes. This
+  // implies:
+  //   . repeated serialization of a message will return the same bytes
+  //   . different processes of the same binary (which may be executing on
+  //     different machines) will serialize equal messages to the same bytes.
+  //
+  // Note the deterministic serialization is NOT canonical across languages; it
+  // is also unstable across different builds with schema changes due to unknown
+  // fields. Users who need canonical serialization, e.g., persistent storage in
+  // a canonical form, fingerprinting, etc., should define their own
+  // canonicalization specification and implement the serializer using
+  // reflection APIs rather than relying on this API.
+  //
+  // If deterministic serialization is requested, the serializer will
+  // sort map entries by keys in lexicographical order or numerical order.
+  // (This is an implementation detail and may be subject to change.)
+  //
+  // There are two ways to determine whether serialization should be
+  // deterministic for this CodedOutputStream. If SetSerializationDeterministic
+  // has not yet been called, then the default comes from the global default,
+  // which is false, until SetDefaultSerializationDeterministic has been called.
+  // Otherwise, SetSerializationDeterministic has been called, and the last
+  // value passed to it is all that matters.
+  void SetSerializationDeterministic(bool value) {
+    serialization_deterministic_is_overridden_ = true;
+    serialization_deterministic_override_ = value;
+  }
+  // See above.
Also, note that users of this CodedOutputStream may need to + // call IsSerializationDeterminstic() to serialize in the intended way. This + // CodedOutputStream cannot enforce a desire for deterministic serialization + // by itself. + bool IsSerializationDeterminstic() const { + return serialization_deterministic_is_overridden_ ? + serialization_deterministic_override_ : + default_serialization_deterministic_; + } + + private: + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(CodedOutputStream); + + ZeroCopyOutputStream* output_; + uint8* buffer_; + int buffer_size_; + int total_bytes_; // Sum of sizes of all buffers seen so far. + bool had_error_; // Whether an error occurred during output. + bool aliasing_enabled_; // See EnableAliasing(). + // See SetSerializationDeterministic() regarding these three fields. + bool serialization_deterministic_is_overridden_; + bool serialization_deterministic_override_; + static bool default_serialization_deterministic_; + + // Advance the buffer by a given number of bytes. + void Advance(int amount); + + // Called when the buffer runs out to request more data. Implies an + // Advance(buffer_size_). + bool Refresh(); + + // Like WriteRaw() but may avoid copying if the underlying + // ZeroCopyOutputStream supports it. + void WriteAliasedRaw(const void* buffer, int size); + + // If this write might cross the end of the buffer, we compose the bytes first + // then use WriteRaw(). + void WriteVarint32SlowPath(uint32 value); + + // Always-inlined versions of WriteVarint* functions so that code can be + // reused, while still controlling size. For instance, WriteVarint32ToArray() + // should not directly call this: since it is inlined itself, doing so + // would greatly increase the size of generated code. Instead, it should call + // WriteVarint32FallbackToArray. Meanwhile, WriteVarint32() is already + // out-of-line, so it should just invoke this directly to avoid any extra + // function call overhead. + GOOGLE_ATTRIBUTE_ALWAYS_INLINE static uint8* WriteVarint64ToArrayInline( + uint64 value, uint8* target); + + static size_t VarintSize32Fallback(uint32 value); + + // See above. Other projects may use "friend" to allow them to call this. + static void SetDefaultSerializationDeterministic() { + default_serialization_deterministic_ = true; + } +}; + +// inline methods ==================================================== +// The vast majority of varints are only one byte. These inline +// methods optimize for that case. 
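+//
+// For illustration (an added note, not from the original header): a varint
+// stores 7 payload bits per byte, least-significant group first, with the
+// high bit set on every byte except the last. So 300 (binary 100101100)
+// encodes as the two bytes 0xAC 0x02:
+//   uint8 buf[5];  // 5 bytes is the worst case for a 32-bit varint
+//   uint8* end = CodedOutputStream::WriteVarint32ToArray(300, buf);
+//   // Now end - buf == 2, buf[0] == 0xAC, buf[1] == 0x02.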
+
+inline bool CodedInputStream::ReadVarint32(uint32* value) {
+  uint32 v = 0;
+  if (GOOGLE_PREDICT_TRUE(buffer_ < buffer_end_)) {
+    v = *buffer_;
+    if (v < 0x80) {
+      *value = v;
+      Advance(1);
+      return true;
+    }
+  }
+  int64 result = ReadVarint32Fallback(v);
+  *value = static_cast<uint32>(result);
+  return result >= 0;
+}
+
+inline bool CodedInputStream::ReadVarint64(uint64* value) {
+  if (GOOGLE_PREDICT_TRUE(buffer_ < buffer_end_) && *buffer_ < 0x80) {
+    *value = *buffer_;
+    Advance(1);
+    return true;
+  }
+  std::pair<uint64, bool> p = ReadVarint64Fallback();
+  *value = p.first;
+  return p.second;
+}
+
+inline bool CodedInputStream::ReadVarintSizeAsInt(int* value) {
+  if (GOOGLE_PREDICT_TRUE(buffer_ < buffer_end_)) {
+    int v = *buffer_;
+    if (v < 0x80) {
+      *value = v;
+      Advance(1);
+      return true;
+    }
+  }
+  *value = ReadVarintSizeAsIntFallback();
+  return *value >= 0;
+}
+
+// static
+inline const uint8* CodedInputStream::ReadLittleEndian32FromArray(
+    const uint8* buffer,
+    uint32* value) {
+#if defined(PROTOBUF_LITTLE_ENDIAN)
+  memcpy(value, buffer, sizeof(*value));
+  return buffer + sizeof(*value);
+#else
+  *value = (static_cast<uint32>(buffer[0])) |
+           (static_cast<uint32>(buffer[1]) << 8) |
+           (static_cast<uint32>(buffer[2]) << 16) |
+           (static_cast<uint32>(buffer[3]) << 24);
+  return buffer + sizeof(*value);
+#endif
+}
+// static
+inline const uint8* CodedInputStream::ReadLittleEndian64FromArray(
+    const uint8* buffer,
+    uint64* value) {
+#if defined(PROTOBUF_LITTLE_ENDIAN)
+  memcpy(value, buffer, sizeof(*value));
+  return buffer + sizeof(*value);
+#else
+  uint32 part0 = (static_cast<uint32>(buffer[0])) |
+                 (static_cast<uint32>(buffer[1]) << 8) |
+                 (static_cast<uint32>(buffer[2]) << 16) |
+                 (static_cast<uint32>(buffer[3]) << 24);
+  uint32 part1 = (static_cast<uint32>(buffer[4])) |
+                 (static_cast<uint32>(buffer[5]) << 8) |
+                 (static_cast<uint32>(buffer[6]) << 16) |
+                 (static_cast<uint32>(buffer[7]) << 24);
+  *value = static_cast<uint64>(part0) |
+           (static_cast<uint64>(part1) << 32);
+  return buffer + sizeof(*value);
+#endif
+}
+
+inline bool CodedInputStream::ReadLittleEndian32(uint32* value) {
+#if defined(PROTOBUF_LITTLE_ENDIAN)
+  if (GOOGLE_PREDICT_TRUE(BufferSize() >= static_cast<int>(sizeof(*value)))) {
+    memcpy(value, buffer_, sizeof(*value));
+    Advance(sizeof(*value));
+    return true;
+  } else {
+    return ReadLittleEndian32Fallback(value);
+  }
+#else
+  return ReadLittleEndian32Fallback(value);
+#endif
+}
+
+inline bool CodedInputStream::ReadLittleEndian64(uint64* value) {
+#if defined(PROTOBUF_LITTLE_ENDIAN)
+  if (GOOGLE_PREDICT_TRUE(BufferSize() >= static_cast<int>(sizeof(*value)))) {
+    memcpy(value, buffer_, sizeof(*value));
+    Advance(sizeof(*value));
+    return true;
+  } else {
+    return ReadLittleEndian64Fallback(value);
+  }
+#else
+  return ReadLittleEndian64Fallback(value);
+#endif
+}
+
+inline uint32 CodedInputStream::ReadTag() {
+  uint32 v = 0;
+  if (GOOGLE_PREDICT_TRUE(buffer_ < buffer_end_)) {
+    v = *buffer_;
+    if (v < 0x80) {
+      last_tag_ = v;
+      Advance(1);
+      return v;
+    }
+  }
+  last_tag_ = ReadTagFallback(v);
+  return last_tag_;
+}
+
+inline std::pair<uint32, bool> CodedInputStream::ReadTagWithCutoff(
+    uint32 cutoff) {
+  // In performance-sensitive code we can expect cutoff to be a compile-time
+  // constant, and things like "cutoff >= kMax1ByteVarint" to be evaluated at
+  // compile time.
+  uint32 first_byte_or_zero = 0;
+  if (GOOGLE_PREDICT_TRUE(buffer_ < buffer_end_)) {
+    // Hot case: buffer_ non_empty, buffer_[0] in [1, 128).
+    // TODO(gpike): Is it worth rearranging this? E.g., if the number of fields
+    // is large enough then is it better to check for the two-byte case first?
+    first_byte_or_zero = buffer_[0];
+    if (static_cast<int8>(buffer_[0]) > 0) {
+      const uint32 kMax1ByteVarint = 0x7f;
+      uint32 tag = last_tag_ = buffer_[0];
+      Advance(1);
+      return std::make_pair(tag, cutoff >= kMax1ByteVarint || tag <= cutoff);
+    }
+    // Other hot case: cutoff >= 0x80, buffer_ has at least two bytes available,
+    // and tag is two bytes. The latter is tested by bitwise-and-not of the
+    // first byte and the second byte.
+    if (cutoff >= 0x80 &&
+        GOOGLE_PREDICT_TRUE(buffer_ + 1 < buffer_end_) &&
+        GOOGLE_PREDICT_TRUE((buffer_[0] & ~buffer_[1]) >= 0x80)) {
+      const uint32 kMax2ByteVarint = (0x7f << 7) + 0x7f;
+      uint32 tag = last_tag_ = (1u << 7) * buffer_[1] + (buffer_[0] - 0x80);
+      Advance(2);
+      // It might make sense to test for tag == 0 now, but it is so rare that
+      // we don't bother. A varint-encoded 0 should be one byte unless
+      // the encoder lost its mind. The second part of the return value of
+      // this function is allowed to be either true or false if the tag is 0,
+      // so we don't have to check for tag == 0. We may need to check whether
+      // it exceeds cutoff.
+      bool at_or_below_cutoff = cutoff >= kMax2ByteVarint || tag <= cutoff;
+      return std::make_pair(tag, at_or_below_cutoff);
+    }
+  }
+  // Slow path
+  last_tag_ = ReadTagFallback(first_byte_or_zero);
+  return std::make_pair(last_tag_, static_cast<uint32>(last_tag_ - 1) < cutoff);
+}
+
+inline bool CodedInputStream::LastTagWas(uint32 expected) {
+  return last_tag_ == expected;
+}
+
+inline bool CodedInputStream::ConsumedEntireMessage() {
+  return legitimate_message_end_;
+}
+
+inline bool CodedInputStream::ExpectTag(uint32 expected) {
+  if (expected < (1 << 7)) {
+    if (GOOGLE_PREDICT_TRUE(buffer_ < buffer_end_) && buffer_[0] == expected) {
+      Advance(1);
+      return true;
+    } else {
+      return false;
+    }
+  } else if (expected < (1 << 14)) {
+    if (GOOGLE_PREDICT_TRUE(BufferSize() >= 2) &&
+        buffer_[0] == static_cast<uint8>(expected | 0x80) &&
+        buffer_[1] == static_cast<uint8>(expected >> 7)) {
+      Advance(2);
+      return true;
+    } else {
+      return false;
+    }
+  } else {
+    // Don't bother optimizing for larger values.
+    return false;
+  }
+}
+
+inline const uint8* CodedInputStream::ExpectTagFromArray(
+    const uint8* buffer, uint32 expected) {
+  if (expected < (1 << 7)) {
+    if (buffer[0] == expected) {
+      return buffer + 1;
+    }
+  } else if (expected < (1 << 14)) {
+    if (buffer[0] == static_cast<uint8>(expected | 0x80) &&
+        buffer[1] == static_cast<uint8>(expected >> 7)) {
+      return buffer + 2;
+    }
+  }
+  return NULL;
+}
+
+inline void CodedInputStream::GetDirectBufferPointerInline(const void** data,
+                                                           int* size) {
+  *data = buffer_;
+  *size = static_cast<int>(buffer_end_ - buffer_);
+}
+
+inline bool CodedInputStream::ExpectAtEnd() {
+  // If we are at a limit we know no more bytes can be read. Otherwise, it's
+  // hard to say without calling Refresh(), and we'd rather not do that.
+
+  if (buffer_ == buffer_end_ &&
+      ((buffer_size_after_limit_ != 0) ||
+       (total_bytes_read_ == current_limit_))) {
+    last_tag_ = 0;                   // Pretend we called ReadTag()...
+    legitimate_message_end_ = true;  // ... and it hit EOF.
+    return true;
+  } else {
+    return false;
+  }
+}
+
+inline int CodedInputStream::CurrentPosition() const {
+  return total_bytes_read_ - (BufferSize() + buffer_size_after_limit_);
+}
+
+inline uint8* CodedOutputStream::GetDirectBufferForNBytesAndAdvance(int size) {
+  if (buffer_size_ < size) {
+    return NULL;
+  } else {
+    uint8* result = buffer_;
+    Advance(size);
+    return result;
+  }
+}
+
+inline uint8* CodedOutputStream::WriteVarint32ToArray(uint32 value,
+                                                      uint8* target) {
+  while (value >= 0x80) {
+    *target = static_cast<uint8>(value | 0x80);
+    value >>= 7;
+    ++target;
+  }
+  *target = static_cast<uint8>(value);
+  return target + 1;
+}
+
+inline void CodedOutputStream::WriteVarint32SignExtended(int32 value) {
+  if (value < 0) {
+    WriteVarint64(static_cast<uint64>(value));
+  } else {
+    WriteVarint32(static_cast<uint32>(value));
+  }
+}
+
+inline uint8* CodedOutputStream::WriteVarint32SignExtendedToArray(
+    int32 value, uint8* target) {
+  if (value < 0) {
+    return WriteVarint64ToArray(static_cast<uint64>(value), target);
+  } else {
+    return WriteVarint32ToArray(static_cast<uint32>(value), target);
+  }
+}
+
+inline uint8* CodedOutputStream::WriteLittleEndian32ToArray(uint32 value,
+                                                            uint8* target) {
+#if defined(PROTOBUF_LITTLE_ENDIAN)
+  memcpy(target, &value, sizeof(value));
+#else
+  target[0] = static_cast<uint8>(value);
+  target[1] = static_cast<uint8>(value >> 8);
+  target[2] = static_cast<uint8>(value >> 16);
+  target[3] = static_cast<uint8>(value >> 24);
+#endif
+  return target + sizeof(value);
+}
+
+inline uint8* CodedOutputStream::WriteLittleEndian64ToArray(uint64 value,
+                                                            uint8* target) {
+#if defined(PROTOBUF_LITTLE_ENDIAN)
+  memcpy(target, &value, sizeof(value));
+#else
+  uint32 part0 = static_cast<uint32>(value);
+  uint32 part1 = static_cast<uint32>(value >> 32);
+
+  target[0] = static_cast<uint8>(part0);
+  target[1] = static_cast<uint8>(part0 >> 8);
+  target[2] = static_cast<uint8>(part0 >> 16);
+  target[3] = static_cast<uint8>(part0 >> 24);
+  target[4] = static_cast<uint8>(part1);
+  target[5] = static_cast<uint8>(part1 >> 8);
+  target[6] = static_cast<uint8>(part1 >> 16);
+  target[7] = static_cast<uint8>(part1 >> 24);
+#endif
+  return target + sizeof(value);
+}
+
+inline void CodedOutputStream::WriteVarint32(uint32 value) {
+  if (buffer_size_ >= 5) {
+    // Fast path: We have enough bytes left in the buffer to guarantee that
+    // this write won't cross the end, so we can skip the checks.
+    uint8* target = buffer_;
+    uint8* end = WriteVarint32ToArray(value, target);
+    int size = static_cast<int>(end - target);
+    Advance(size);
+  } else {
+    WriteVarint32SlowPath(value);
+  }
+}
+
+inline void CodedOutputStream::WriteTag(uint32 value) {
+  WriteVarint32(value);
+}
+
+inline uint8* CodedOutputStream::WriteTagToArray(
+    uint32 value, uint8* target) {
+  return WriteVarint32ToArray(value, target);
+}
+
+inline size_t CodedOutputStream::VarintSize32(uint32 value) {
+  if (value < (1 << 7)) {
+    return 1;
+  } else {
+    return VarintSize32Fallback(value);
+  }
+}
+
+inline size_t CodedOutputStream::VarintSize32SignExtended(int32 value) {
+  if (value < 0) {
+    return 10;  // TODO(kenton): Make this a symbolic constant.
+  } else {
+    return VarintSize32(static_cast<uint32>(value));
+  }
+}
+
+inline void CodedOutputStream::WriteString(const string& str) {
+  WriteRaw(str.data(), static_cast<int>(str.size()));
+}
+
+inline void CodedOutputStream::WriteRawMaybeAliased(
+    const void* data, int size) {
+  if (aliasing_enabled_) {
+    WriteAliasedRaw(data, size);
+  } else {
+    WriteRaw(data, size);
+  }
+}
+
+inline uint8* CodedOutputStream::WriteStringToArray(
+    const string& str, uint8* target) {
+  return WriteRawToArray(str.data(), static_cast<int>(str.size()), target);
+}
+
+inline int CodedOutputStream::ByteCount() const {
+  return total_bytes_ - buffer_size_;
+}
+
+inline void CodedInputStream::Advance(int amount) {
+  buffer_ += amount;
+}
+
+inline void CodedOutputStream::Advance(int amount) {
+  buffer_ += amount;
+  buffer_size_ -= amount;
+}
+
+inline void CodedInputStream::SetRecursionLimit(int limit) {
+  recursion_budget_ += limit - recursion_limit_;
+  recursion_limit_ = limit;
+}
+
+inline bool CodedInputStream::IncrementRecursionDepth() {
+  --recursion_budget_;
+  return recursion_budget_ >= 0;
+}
+
+inline void CodedInputStream::DecrementRecursionDepth() {
+  if (recursion_budget_ < recursion_limit_) ++recursion_budget_;
+}
+
+inline void CodedInputStream::UnsafeDecrementRecursionDepth() {
+  assert(recursion_budget_ < recursion_limit_);
+  ++recursion_budget_;
+}
+
+inline void CodedInputStream::SetExtensionRegistry(const DescriptorPool* pool,
+                                                   MessageFactory* factory) {
+  extension_pool_ = pool;
+  extension_factory_ = factory;
+}
+
+inline const DescriptorPool* CodedInputStream::GetExtensionPool() {
+  return extension_pool_;
+}
+
+inline MessageFactory* CodedInputStream::GetExtensionFactory() {
+  return extension_factory_;
+}
+
+inline int CodedInputStream::BufferSize() const {
+  return static_cast<int>(buffer_end_ - buffer_);
+}
+
+inline CodedInputStream::CodedInputStream(ZeroCopyInputStream* input)
+    : buffer_(NULL),
+      buffer_end_(NULL),
+      input_(input),
+      total_bytes_read_(0),
+      overflow_bytes_(0),
+      last_tag_(0),
+      legitimate_message_end_(false),
+      aliasing_enabled_(false),
+      current_limit_(kint32max),
+      buffer_size_after_limit_(0),
+      total_bytes_limit_(kDefaultTotalBytesLimit),
+      total_bytes_warning_threshold_(kDefaultTotalBytesWarningThreshold),
+      recursion_budget_(default_recursion_limit_),
+      recursion_limit_(default_recursion_limit_),
+      extension_pool_(NULL),
+      extension_factory_(NULL) {
+  // Eagerly Refresh() so buffer space is immediately available.
+  Refresh();
+}
+
+inline CodedInputStream::CodedInputStream(const uint8* buffer, int size)
+    : buffer_(buffer),
+      buffer_end_(buffer + size),
+      input_(NULL),
+      total_bytes_read_(size),
+      overflow_bytes_(0),
+      last_tag_(0),
+      legitimate_message_end_(false),
+      aliasing_enabled_(false),
+      current_limit_(size),
+      buffer_size_after_limit_(0),
+      total_bytes_limit_(kDefaultTotalBytesLimit),
+      total_bytes_warning_threshold_(kDefaultTotalBytesWarningThreshold),
+      recursion_budget_(default_recursion_limit_),
+      recursion_limit_(default_recursion_limit_),
+      extension_pool_(NULL),
+      extension_factory_(NULL) {
+  // Note that setting current_limit_ == size is important to prevent some
+  // code paths from trying to access input_ and segfaulting.
+}
+
+inline bool CodedInputStream::IsFlat() const {
+  return input_ == NULL;
+}
+
+}  // namespace io
+}  // namespace protobuf
+
+
+#if _MSC_VER >= 1300 && !defined(__INTEL_COMPILER)
+  #pragma runtime_checks("c", restore)
+#endif  // _MSC_VER && !defined(__INTEL_COMPILER)
+
+}  // namespace google
+#endif  // GOOGLE_PROTOBUF_IO_CODED_STREAM_H__
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/strtod.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/strtod.h
new file mode 100644
index 00000000..f56e41c8
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/strtod.h
@@ -0,0 +1,55 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc. All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// A locale-independent version of strtod(), used to parse floating
+// point default values in .proto files, where the decimal separator
+// is always a dot.
+
+#ifndef GOOGLE_PROTOBUF_IO_STRTOD_H__
+#define GOOGLE_PROTOBUF_IO_STRTOD_H__
+
+namespace google {
+namespace protobuf {
+namespace io {
+
+// A locale-independent version of the standard strtod(), which always
+// uses a dot as the decimal separator.
+double NoLocaleStrtod(const char* str, char** endptr);
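+
+// For illustration (an added note, not part of the original header): in a
+// locale whose decimal separator is ',', plain strtod("2.5", NULL) stops at
+// the '.' and yields 2.0, whereas NoLocaleStrtod("2.5", NULL) yields 2.5
+// regardless of the process locale.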
+
+// Casts a double value to a float value. If the value is outside of the
+// representable range of float, it will be converted to positive or negative
+// infinity.
+float SafeDoubleToFloat(double value);
+
+}  // namespace io
+}  // namespace protobuf
+
+}  // namespace google
+#endif  // GOOGLE_PROTOBUF_IO_STRTOD_H__
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/zero_copy_stream.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/zero_copy_stream.h
new file mode 100644
index 00000000..c3e793ba
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/zero_copy_stream.h
@@ -0,0 +1,248 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc. All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Author: kenton@google.com (Kenton Varda)
+//  Based on original Protocol Buffers design by
+//  Sanjay Ghemawat, Jeff Dean, and others.
+//
+// This file contains the ZeroCopyInputStream and ZeroCopyOutputStream
+// interfaces, which represent abstract I/O streams to and from which
+// protocol buffers can be read and written. For a few simple
+// implementations of these interfaces, see zero_copy_stream_impl.h.
+//
+// These interfaces are different from classic I/O streams in that they
+// try to minimize the amount of data copying that needs to be done.
+// To accomplish this, responsibility for allocating buffers is moved to
+// the stream object, rather than being the responsibility of the caller.
+// So, the stream can return a buffer which actually points directly into
+// the final data structure where the bytes are to be stored, and the caller
+// can interact directly with that buffer, eliminating an intermediate copy
+// operation.
+//
+// As an example, consider the common case in which you are reading bytes
+// from an array that is already in memory (or perhaps an mmap()ed file).
+// With classic I/O streams, you would do something like:
+//   char buffer[BUFFER_SIZE];
+//   input->Read(buffer, BUFFER_SIZE);
+//   DoSomething(buffer, BUFFER_SIZE);
+// Then, the stream basically just calls memcpy() to copy the data from
+// the array into your buffer. With a ZeroCopyInputStream, you would do
+// this instead:
+//   const void* buffer;
+//   int size;
+//   input->Next(&buffer, &size);
+//   DoSomething(buffer, size);
+// Here, no copy is performed. The input stream returns a pointer directly
+// into the backing array, and the caller ends up reading directly from it.
+//
+// If you want to be able to read the old-fashioned way, you can create
+// a CodedInputStream or CodedOutputStream wrapping these objects and use
+// their ReadRaw()/WriteRaw() methods. These will, of course, add a copy
+// step, but Coded*Stream will handle buffering so at least it will be
+// reasonably efficient.
+//
+// ZeroCopyInputStream example:
+//   // Read in a file and print its contents to stdout.
+//   int fd = open("myfile", O_RDONLY);
+//   ZeroCopyInputStream* input = new FileInputStream(fd);
+//
+//   const void* buffer;
+//   int size;
+//   while (input->Next(&buffer, &size)) {
+//     cout.write(buffer, size);
+//   }
+//
+//   delete input;
+//   close(fd);
+//
+// ZeroCopyOutputStream example:
+//   // Copy the contents of "infile" to "outfile", using plain read() for
+//   // "infile" but a ZeroCopyOutputStream for "outfile".
+//   int infd = open("infile", O_RDONLY);
+//   int outfd = open("outfile", O_WRONLY);
+//   ZeroCopyOutputStream* output = new FileOutputStream(outfd);
+//
+//   void* buffer;
+//   int size;
+//   while (output->Next(&buffer, &size)) {
+//     int bytes = read(infd, buffer, size);
+//     if (bytes < size) {
+//       // Reached EOF.
+//       output->BackUp(size - bytes);
+//       break;
+//     }
+//   }
+//
+//   delete output;
+//   close(infd);
+//   close(outfd);
+
+#ifndef GOOGLE_PROTOBUF_IO_ZERO_COPY_STREAM_H__
+#define GOOGLE_PROTOBUF_IO_ZERO_COPY_STREAM_H__
+
+#include <string>
+#include "common.h"
+
+namespace google {
+
+namespace protobuf {
+namespace io {
+
+// Defined in this file.
+class ZeroCopyInputStream;
+class ZeroCopyOutputStream;
+
+// Abstract interface similar to an input stream but designed to minimize
+// copying.
+class LIBPROTOBUF_EXPORT ZeroCopyInputStream {
+ public:
+  inline ZeroCopyInputStream() {}
+  virtual ~ZeroCopyInputStream();
+
+  // Obtains a chunk of data from the stream.
+  //
+  // Preconditions:
+  // * "size" and "data" are not NULL.
+  //
+  // Postconditions:
+  // * If the returned value is false, there is no more data to return or
+  //   an error occurred. All errors are permanent.
+  // * Otherwise, "size" points to the actual number of bytes read and "data"
+  //   points to a pointer to a buffer containing these bytes.
+  // * Ownership of this buffer remains with the stream, and the buffer
+  //   remains valid only until some other method of the stream is called
+  //   or the stream is destroyed.
+  // * It is legal for the returned buffer to have zero size, as long
+  //   as repeatedly calling Next() eventually yields a buffer with non-zero
+  //   size.
+  virtual bool Next(const void** data, int* size) = 0;
+
+  // Backs up a number of bytes, so that the next call to Next() returns
+  // data again that was already returned by the last call to Next(). This
+  // is useful when writing procedures that are only supposed to read up
+  // to a certain point in the input, then return.
If Next() returns a + // buffer that goes beyond what you wanted to read, you can use BackUp() + // to return to the point where you intended to finish. + // + // Preconditions: + // * The last method called must have been Next(). + // * count must be less than or equal to the size of the last buffer + // returned by Next(). + // + // Postconditions: + // * The last "count" bytes of the last buffer returned by Next() will be + // pushed back into the stream. Subsequent calls to Next() will return + // the same data again before producing new data. + virtual void BackUp(int count) = 0; + + // Skips a number of bytes. Returns false if the end of the stream is + // reached or some input error occurred. In the end-of-stream case, the + // stream is advanced to the end of the stream (so ByteCount() will return + // the total size of the stream). + virtual bool Skip(int count) = 0; + + // Returns the total number of bytes read since this object was created. + virtual int64 ByteCount() const = 0; + + + private: + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ZeroCopyInputStream); +}; + +// Abstract interface similar to an output stream but designed to minimize +// copying. +class LIBPROTOBUF_EXPORT ZeroCopyOutputStream { + public: + inline ZeroCopyOutputStream() {} + virtual ~ZeroCopyOutputStream(); + + // Obtains a buffer into which data can be written. Any data written + // into this buffer will eventually (maybe instantly, maybe later on) + // be written to the output. + // + // Preconditions: + // * "size" and "data" are not NULL. + // + // Postconditions: + // * If the returned value is false, an error occurred. All errors are + // permanent. + // * Otherwise, "size" points to the actual number of bytes in the buffer + // and "data" points to the buffer. + // * Ownership of this buffer remains with the stream, and the buffer + // remains valid only until some other method of the stream is called + // or the stream is destroyed. + // * Any data which the caller stores in this buffer will eventually be + // written to the output (unless BackUp() is called). + // * It is legal for the returned buffer to have zero size, as long + // as repeatedly calling Next() eventually yields a buffer with non-zero + // size. + virtual bool Next(void** data, int* size) = 0; + + // Backs up a number of bytes, so that the end of the last buffer returned + // by Next() is not actually written. This is needed when you finish + // writing all the data you want to write, but the last buffer was bigger + // than you needed. You don't want to write a bunch of garbage after the + // end of your data, so you use BackUp() to back up. + // + // Preconditions: + // * The last method called must have been Next(). + // * count must be less than or equal to the size of the last buffer + // returned by Next(). + // * The caller must not have written anything to the last "count" bytes + // of that buffer. + // + // Postconditions: + // * The last "count" bytes of the last buffer returned by Next() will be + // ignored. + virtual void BackUp(int count) = 0; + + // Returns the total number of bytes written since this object was created. + virtual int64 ByteCount() const = 0; + + // Write a given chunk of data to the output. Some output streams may + // implement this in a way that avoids copying. Check AllowsAliasing() before + // calling WriteAliasedRaw(). It will GOOGLE_CHECK fail if WriteAliasedRaw() is + // called on a stream that does not allow aliasing. 
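+  //
+  // Illustrative guard (an added sketch, not from the original comment):
+  //   if (output->AllowsAliasing()) {
+  //     output->WriteAliasedRaw(data, size);  // stream may keep the pointer
+  //   }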
+  //
+  // NOTE: It is caller's responsibility to ensure that the chunk of memory
+  // remains live until all of the data has been consumed from the stream.
+  virtual bool WriteAliasedRaw(const void* data, int size);
+  virtual bool AllowsAliasing() const { return false; }
+
+
+ private:
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ZeroCopyOutputStream);
+};
+
+}  // namespace io
+}  // namespace protobuf
+
+}  // namespace google
+#endif  // GOOGLE_PROTOBUF_IO_ZERO_COPY_STREAM_H__
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/zero_copy_stream_impl.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/zero_copy_stream_impl.h
new file mode 100644
index 00000000..596950ca
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/zero_copy_stream_impl.h
@@ -0,0 +1,358 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc. All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Author: kenton@google.com (Kenton Varda)
+//  Based on original Protocol Buffers design by
+//  Sanjay Ghemawat, Jeff Dean, and others.
+//
+// This file contains common implementations of the interfaces defined in
+// zero_copy_stream.h which are only included in the full (non-lite)
+// protobuf library. These implementations include Unix file descriptors
+// and C++ iostreams. See also: zero_copy_stream_impl_lite.h
+
+#ifndef GOOGLE_PROTOBUF_IO_ZERO_COPY_STREAM_IMPL_H__
+#define GOOGLE_PROTOBUF_IO_ZERO_COPY_STREAM_IMPL_H__
+
+#include <string>
+#include <iosfwd>
+#include "zero_copy_stream.h"
+#include "zero_copy_stream_impl_lite.h"
+#include "common.h"
+
+
+namespace google {
+namespace protobuf {
+namespace io {
+
+
+// ===================================================================
+
+// A ZeroCopyInputStream which reads from a file descriptor.
+//
+// FileInputStream is preferred over using an ifstream with IstreamInputStream.
+// The latter will introduce an extra layer of buffering, harming performance. +// Also, it's conceivable that FileInputStream could someday be enhanced +// to use zero-copy file descriptors on OSs which support them. +class LIBPROTOBUF_EXPORT FileInputStream : public ZeroCopyInputStream { + public: + // Creates a stream that reads from the given Unix file descriptor. + // If a block_size is given, it specifies the number of bytes that + // should be read and returned with each call to Next(). Otherwise, + // a reasonable default is used. + explicit FileInputStream(int file_descriptor, int block_size = -1); + ~FileInputStream(); + + // Flushes any buffers and closes the underlying file. Returns false if + // an error occurs during the process; use GetErrno() to examine the error. + // Even if an error occurs, the file descriptor is closed when this returns. + bool Close(); + + // By default, the file descriptor is not closed when the stream is + // destroyed. Call SetCloseOnDelete(true) to change that. WARNING: + // This leaves no way for the caller to detect if close() fails. If + // detecting close() errors is important to you, you should arrange + // to close the descriptor yourself. + void SetCloseOnDelete(bool value) { copying_input_.SetCloseOnDelete(value); } + + // If an I/O error has occurred on this file descriptor, this is the + // errno from that error. Otherwise, this is zero. Once an error + // occurs, the stream is broken and all subsequent operations will + // fail. + int GetErrno() { return copying_input_.GetErrno(); } + + // implements ZeroCopyInputStream ---------------------------------- + bool Next(const void** data, int* size); + void BackUp(int count); + bool Skip(int count); + int64 ByteCount() const; + + private: + class LIBPROTOBUF_EXPORT CopyingFileInputStream : public CopyingInputStream { + public: + CopyingFileInputStream(int file_descriptor); + ~CopyingFileInputStream(); + + bool Close(); + void SetCloseOnDelete(bool value) { close_on_delete_ = value; } + int GetErrno() { return errno_; } + + // implements CopyingInputStream --------------------------------- + int Read(void* buffer, int size); + int Skip(int count); + + private: + // The file descriptor. + const int file_; + bool close_on_delete_; + bool is_closed_; + + // The errno of the I/O error, if one has occurred. Otherwise, zero. + int errno_; + + // Did we try to seek once and fail? If so, we assume this file descriptor + // doesn't support seeking and won't try again. + bool previous_seek_failed_; + + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(CopyingFileInputStream); + }; + + CopyingFileInputStream copying_input_; + CopyingInputStreamAdaptor impl_; + + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(FileInputStream); +}; + +// =================================================================== + +// A ZeroCopyOutputStream which writes to a file descriptor. +// +// FileOutputStream is preferred over using an ofstream with +// OstreamOutputStream. The latter will introduce an extra layer of buffering, +// harming performance. Also, it's conceivable that FileOutputStream could +// someday be enhanced to use zero-copy file descriptors on OSs which +// support them. +class LIBPROTOBUF_EXPORT FileOutputStream : public ZeroCopyOutputStream { + public: + // Creates a stream that writes to the given Unix file descriptor. + // If a block_size is given, it specifies the size of the buffers + // that should be returned by Next(). Otherwise, a reasonable default + // is used. 
+ explicit FileOutputStream(int file_descriptor, int block_size = -1); + ~FileOutputStream(); + + // Flushes any buffers and closes the underlying file. Returns false if + // an error occurs during the process; use GetErrno() to examine the error. + // Even if an error occurs, the file descriptor is closed when this returns. + bool Close(); + + // Flushes FileOutputStream's buffers but does not close the + // underlying file. No special measures are taken to ensure that + // underlying operating system file object is synchronized to disk. + bool Flush(); + + // By default, the file descriptor is not closed when the stream is + // destroyed. Call SetCloseOnDelete(true) to change that. WARNING: + // This leaves no way for the caller to detect if close() fails. If + // detecting close() errors is important to you, you should arrange + // to close the descriptor yourself. + void SetCloseOnDelete(bool value) { copying_output_.SetCloseOnDelete(value); } + + // If an I/O error has occurred on this file descriptor, this is the + // errno from that error. Otherwise, this is zero. Once an error + // occurs, the stream is broken and all subsequent operations will + // fail. + int GetErrno() { return copying_output_.GetErrno(); } + + // implements ZeroCopyOutputStream --------------------------------- + bool Next(void** data, int* size); + void BackUp(int count); + int64 ByteCount() const; + + private: + class LIBPROTOBUF_EXPORT CopyingFileOutputStream : public CopyingOutputStream { + public: + CopyingFileOutputStream(int file_descriptor); + ~CopyingFileOutputStream(); + + bool Close(); + void SetCloseOnDelete(bool value) { close_on_delete_ = value; } + int GetErrno() { return errno_; } + + // implements CopyingOutputStream -------------------------------- + bool Write(const void* buffer, int size); + + private: + // The file descriptor. + const int file_; + bool close_on_delete_; + bool is_closed_; + + // The errno of the I/O error, if one has occurred. Otherwise, zero. + int errno_; + + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(CopyingFileOutputStream); + }; + + CopyingFileOutputStream copying_output_; + CopyingOutputStreamAdaptor impl_; + + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(FileOutputStream); +}; + +// =================================================================== + +// A ZeroCopyInputStream which reads from a C++ istream. +// +// Note that for reading files (or anything represented by a file descriptor), +// FileInputStream is more efficient. +class LIBPROTOBUF_EXPORT IstreamInputStream : public ZeroCopyInputStream { + public: + // Creates a stream that reads from the given C++ istream. + // If a block_size is given, it specifies the number of bytes that + // should be read and returned with each call to Next(). Otherwise, + // a reasonable default is used. + explicit IstreamInputStream(std::istream* stream, int block_size = -1); + ~IstreamInputStream(); + + // implements ZeroCopyInputStream ---------------------------------- + bool Next(const void** data, int* size); + void BackUp(int count); + bool Skip(int count); + int64 ByteCount() const; + + private: + class LIBPROTOBUF_EXPORT CopyingIstreamInputStream : public CopyingInputStream { + public: + CopyingIstreamInputStream(std::istream* input); + ~CopyingIstreamInputStream(); + + // implements CopyingInputStream --------------------------------- + int Read(void* buffer, int size); + // (We use the default implementation of Skip().) + + private: + // The stream. 
+  std::istream* input_;
+
+    GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(CopyingIstreamInputStream);
+  };
+
+  CopyingIstreamInputStream copying_input_;
+  CopyingInputStreamAdaptor impl_;
+
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(IstreamInputStream);
+};
+
+// ===================================================================
+
+// A ZeroCopyOutputStream which writes to a C++ ostream.
+//
+// Note that for writing files (or anything represented by a file descriptor),
+// FileOutputStream is more efficient.
+class LIBPROTOBUF_EXPORT OstreamOutputStream : public ZeroCopyOutputStream {
+ public:
+  // Creates a stream that writes to the given C++ ostream.
+  // If a block_size is given, it specifies the size of the buffers
+  // that should be returned by Next(). Otherwise, a reasonable default
+  // is used.
+  explicit OstreamOutputStream(std::ostream* stream, int block_size = -1);
+  ~OstreamOutputStream();
+
+  // implements ZeroCopyOutputStream ---------------------------------
+  bool Next(void** data, int* size);
+  void BackUp(int count);
+  int64 ByteCount() const;
+
+ private:
+  class LIBPROTOBUF_EXPORT CopyingOstreamOutputStream : public CopyingOutputStream {
+   public:
+    CopyingOstreamOutputStream(std::ostream* output);
+    ~CopyingOstreamOutputStream();
+
+    // implements CopyingOutputStream --------------------------------
+    bool Write(const void* buffer, int size);
+
+   private:
+    // The stream.
+    std::ostream* output_;
+
+    GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(CopyingOstreamOutputStream);
+  };
+
+  CopyingOstreamOutputStream copying_output_;
+  CopyingOutputStreamAdaptor impl_;
+
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(OstreamOutputStream);
+};
+
+// ===================================================================
+
+// A ZeroCopyInputStream which reads from several other streams in sequence.
+// ConcatenatingInputStream is unable to distinguish between end-of-stream
+// and read errors in the underlying streams, so it assumes any errors mean
+// end-of-stream. So, if the underlying streams fail for any other reason,
+// ConcatenatingInputStream may do odd things. It is suggested that you do
+// not use ConcatenatingInputStream on streams that might produce read errors
+// other than end-of-stream.
+class LIBPROTOBUF_EXPORT ConcatenatingInputStream : public ZeroCopyInputStream {
+ public:
+  // All streams passed in as well as the array itself must remain valid
+  // until the ConcatenatingInputStream is destroyed.
+  ConcatenatingInputStream(ZeroCopyInputStream* const streams[], int count);
+  ~ConcatenatingInputStream();
+
+  // implements ZeroCopyInputStream ----------------------------------
+  bool Next(const void** data, int* size);
+  void BackUp(int count);
+  bool Skip(int count);
+  int64 ByteCount() const;
+
+
+ private:
+  // As streams are retired, streams_ is incremented and stream_count_ is
+  // decremented.
+  ZeroCopyInputStream* const* streams_;
+  int stream_count_;
+  int64 bytes_retired_;  // Bytes read from previous streams.
+
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ConcatenatingInputStream);
+};
+
+// ===================================================================
+
+// A ZeroCopyInputStream which wraps some other stream and limits it to
+// a particular byte count.
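+//
+// For example (editor's illustrative sketch, not part of the original
+// header): parsing one length-delimited message out of a larger stream,
+// assuming message_size was read beforehand:
+//
+//   LimitingInputStream limited(&raw_input, message_size);
+//   bool ok = my_message.ParseFromZeroCopyStream(&limited);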
+class LIBPROTOBUF_EXPORT LimitingInputStream : public ZeroCopyInputStream { + public: + LimitingInputStream(ZeroCopyInputStream* input, int64 limit); + ~LimitingInputStream(); + + // implements ZeroCopyInputStream ---------------------------------- + bool Next(const void** data, int* size); + void BackUp(int count); + bool Skip(int count); + int64 ByteCount() const; + + + private: + ZeroCopyInputStream* input_; + int64 limit_; // Decreases as we go, becomes negative if we overshoot. + int64 prior_bytes_read_; // Bytes read on underlying stream at construction + + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(LimitingInputStream); +}; + +// =================================================================== + +} // namespace io +} // namespace protobuf + +} // namespace google +#endif // GOOGLE_PROTOBUF_IO_ZERO_COPY_STREAM_IMPL_H__ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/zero_copy_stream_impl_lite.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/zero_copy_stream_impl_lite.h new file mode 100644 index 00000000..ca4c0036 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/zero_copy_stream_impl_lite.h @@ -0,0 +1,410 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Author: kenton@google.com (Kenton Varda) +// Based on original Protocol Buffers design by +// Sanjay Ghemawat, Jeff Dean, and others. +// +// This file contains common implementations of the interfaces defined in +// zero_copy_stream.h which are included in the "lite" protobuf library. +// These implementations cover I/O on raw arrays and strings, as well as +// adaptors which make it easy to implement streams based on traditional +// streams. 
Of course, many users will probably want to write their own
+// implementations of these interfaces specific to the particular I/O
+// abstractions they prefer to use, but these should cover the most common
+// cases.
+
+#ifndef GOOGLE_PROTOBUF_IO_ZERO_COPY_STREAM_IMPL_LITE_H__
+#define GOOGLE_PROTOBUF_IO_ZERO_COPY_STREAM_IMPL_LITE_H__
+
+#include <memory>
+#ifndef _SHARED_PTR_H
+#include "shared_ptr.h"
+#endif
+#include <string>
+#include <iosfwd>
+#include "zero_copy_stream.h"
+#include "callback.h"
+#include "common.h"
+#include "stl_util.h"
+
+
+namespace google {
+namespace protobuf {
+namespace io {
+
+// ===================================================================
+
+// A ZeroCopyInputStream backed by an in-memory array of bytes.
+class LIBPROTOBUF_EXPORT ArrayInputStream : public ZeroCopyInputStream {
+ public:
+  // Create an InputStream that returns the bytes pointed to by "data".
+  // "data" remains the property of the caller but must remain valid until
+  // the stream is destroyed. If a block_size is given, calls to Next()
+  // will return data blocks no larger than the given size. Otherwise, the
+  // first call to Next() returns the entire array. block_size is mainly
+  // useful for testing; in production you would probably never want to set
+  // it.
+  ArrayInputStream(const void* data, int size, int block_size = -1);
+  ~ArrayInputStream();
+
+  // implements ZeroCopyInputStream ----------------------------------
+  bool Next(const void** data, int* size);
+  void BackUp(int count);
+  bool Skip(int count);
+  int64 ByteCount() const;
+
+
+ private:
+  const uint8* const data_;  // The byte array.
+  const int size_;           // Total size of the array.
+  const int block_size_;     // How many bytes to return at a time.
+
+  int position_;
+  int last_returned_size_;   // How many bytes we returned last time Next()
+                             // was called (used for error checking only).
+
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ArrayInputStream);
+};
+
+// ===================================================================
+
+// A ZeroCopyOutputStream backed by an in-memory array of bytes.
+class LIBPROTOBUF_EXPORT ArrayOutputStream : public ZeroCopyOutputStream {
+ public:
+  // Create an OutputStream that writes to the bytes pointed to by "data".
+  // "data" remains the property of the caller but must remain valid until
+  // the stream is destroyed. If a block_size is given, calls to Next()
+  // will return data blocks no larger than the given size. Otherwise, the
+  // first call to Next() returns the entire array. block_size is mainly
+  // useful for testing; in production you would probably never want to set
+  // it.
+  ArrayOutputStream(void* data, int size, int block_size = -1);
+  ~ArrayOutputStream();
+
+  // implements ZeroCopyOutputStream ---------------------------------
+  bool Next(void** data, int* size);
+  void BackUp(int count);
+  int64 ByteCount() const;
+
+ private:
+  uint8* const data_;        // The byte array.
+  const int size_;           // Total size of the array.
+  const int block_size_;     // How many bytes to return at a time.
+
+  int position_;
+  int last_returned_size_;   // How many bytes we returned last time Next()
+                             // was called (used for error checking only).
+
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ArrayOutputStream);
+};
+
+// ===================================================================
+
+// A ZeroCopyOutputStream which appends bytes to a string.
+class LIBPROTOBUF_EXPORT StringOutputStream : public ZeroCopyOutputStream {
+ public:
+  // Create a StringOutputStream which appends bytes to the given string.
+  // The string remains property of the caller, but it is mutated in arbitrary
+  // ways and MUST NOT be accessed in any way until you're done with the
+  // stream. Either be sure there's no further usage, or (safest) destroy the
+  // stream before using the contents.
+  //
+  // Hint: If you call target->reserve(n) before creating the stream,
+  // the first call to Next() will return at least n bytes of buffer
+  // space.
+  explicit StringOutputStream(string* target);
+  ~StringOutputStream();
+
+  // implements ZeroCopyOutputStream ---------------------------------
+  bool Next(void** data, int* size);
+  void BackUp(int count);
+  int64 ByteCount() const;
+
+ protected:
+  void SetString(string* target);
+
+ private:
+  static const int kMinimumSize = 16;
+
+  string* target_;
+
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(StringOutputStream);
+};
+
+// LazyStringOutputStream is a StringOutputStream with lazy acquisition of
+// the output string from a callback. The string is owned externally, and not
+// deleted in the stream destructor.
+class LIBPROTOBUF_EXPORT LazyStringOutputStream : public StringOutputStream {
+ public:
+  // Callback should be permanent (non-self-deleting). Ownership is transferred
+  // to the LazyStringOutputStream.
+  explicit LazyStringOutputStream(ResultCallback<string*>* callback);
+  ~LazyStringOutputStream();
+
+  // implements ZeroCopyOutputStream, overriding StringOutputStream -----------
+  bool Next(void** data, int* size);
+  int64 ByteCount() const;
+
+ private:
+  const google::protobuf::scoped_ptr<ResultCallback<string*> > callback_;
+  bool string_is_set_;
+
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(LazyStringOutputStream);
+};
+
+// Note: There is no StringInputStream. Instead, just create an
+// ArrayInputStream as follows:
+//   ArrayInputStream input(str.data(), str.size());
+
+// ===================================================================
+
+// A generic traditional input stream interface.
+//
+// Lots of traditional input streams (e.g. file descriptors, C stdio
+// streams, and C++ iostreams) expose an interface where every read
+// involves copying bytes into a buffer. If you want to take such an
+// interface and make a ZeroCopyInputStream based on it, simply implement
+// CopyingInputStream and then use CopyingInputStreamAdaptor.
+//
+// CopyingInputStream implementations should avoid buffering if possible.
+// CopyingInputStreamAdaptor does its own buffering and will read data
+// in large blocks.
+class LIBPROTOBUF_EXPORT CopyingInputStream {
+ public:
+  virtual ~CopyingInputStream();
+
+  // Reads up to "size" bytes into the given buffer. Returns the number of
+  // bytes read. Read() waits until at least one byte is available, or
+  // returns zero if no bytes will ever become available (EOF), or -1 if a
+  // permanent read error occurred.
+  virtual int Read(void* buffer, int size) = 0;
+
+  // Skips the next "count" bytes of input. Returns the number of bytes
+  // actually skipped. This will always be exactly equal to "count" unless
+  // EOF was reached or a permanent read error occurred.
+  //
+  // The default implementation just repeatedly calls Read() into a scratch
+  // buffer.
+  virtual int Skip(int count);
+};
+
+// A ZeroCopyInputStream which reads from a CopyingInputStream. This is
+// useful for implementing ZeroCopyInputStreams that read from traditional
+// streams. Note that this class is not really zero-copy.
+//
+// If you want to read from file descriptors or C++ istreams, this is
+// already implemented for you: use FileInputStream or IstreamInputStream
+// respectively.
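+//
+// For example (editor's illustrative sketch, not part of the original
+// header), a CopyingInputStream over a C stdio FILE* could look like:
+//
+//   class StdioInputStream : public CopyingInputStream {
+//    public:
+//     explicit StdioInputStream(FILE* file) : file_(file) {}
+//     int Read(void* buffer, int size) {
+//       size_t result = fread(buffer, 1, size, file_);
+//       // -1 signals a permanent error; 0 with no error means EOF.
+//       if (result == 0 && ferror(file_)) return -1;
+//       return static_cast<int>(result);
+//     }
+//    private:
+//     FILE* file_;
+//   };
+//
+//   StdioInputStream copying(some_file);
+//   CopyingInputStreamAdaptor adaptor(&copying);
+//   // adaptor can now be used wherever a ZeroCopyInputStream is expected.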
+class LIBPROTOBUF_EXPORT CopyingInputStreamAdaptor : public ZeroCopyInputStream {
+ public:
+  // Creates a stream that reads from the given CopyingInputStream.
+  // If a block_size is given, it specifies the number of bytes that
+  // should be read and returned with each call to Next(). Otherwise,
+  // a reasonable default is used. The caller retains ownership of
+  // copying_stream unless SetOwnsCopyingStream(true) is called.
+  explicit CopyingInputStreamAdaptor(CopyingInputStream* copying_stream,
+                                     int block_size = -1);
+  ~CopyingInputStreamAdaptor();
+
+  // Call SetOwnsCopyingStream(true) to tell the CopyingInputStreamAdaptor to
+  // delete the underlying CopyingInputStream when it is destroyed.
+  void SetOwnsCopyingStream(bool value) { owns_copying_stream_ = value; }
+
+  // implements ZeroCopyInputStream ----------------------------------
+  bool Next(const void** data, int* size);
+  void BackUp(int count);
+  bool Skip(int count);
+  int64 ByteCount() const;
+
+ private:
+  // Ensures that buffer_ is not NULL.
+  void AllocateBufferIfNeeded();
+  // Frees the buffer and resets buffer_used_.
+  void FreeBuffer();
+
+  // The underlying copying stream.
+  CopyingInputStream* copying_stream_;
+  bool owns_copying_stream_;
+
+  // True if we have seen a permanent error from the underlying stream.
+  bool failed_;
+
+  // The current position of copying_stream_, relative to the point where
+  // we started reading.
+  int64 position_;
+
+  // Data is read into this buffer. It may be NULL if no buffer is currently
+  // in use. Otherwise, it points to an array of size buffer_size_.
+  google::protobuf::scoped_array<uint8> buffer_;
+  const int buffer_size_;
+
+  // Number of valid bytes currently in the buffer (i.e. the size last
+  // returned by Next()). 0 <= buffer_used_ <= buffer_size_.
+  int buffer_used_;
+
+  // Number of bytes in the buffer which were backed up over by a call to
+  // BackUp(). These need to be returned again.
+  // 0 <= backup_bytes_ <= buffer_used_
+  int backup_bytes_;
+
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(CopyingInputStreamAdaptor);
+};
+
+// ===================================================================
+
+// A generic traditional output stream interface.
+//
+// Lots of traditional output streams (e.g. file descriptors, C stdio
+// streams, and C++ iostreams) expose an interface where every write
+// involves copying bytes from a buffer. If you want to take such an
+// interface and make a ZeroCopyOutputStream based on it, simply implement
+// CopyingOutputStream and then use CopyingOutputStreamAdaptor.
+//
+// CopyingOutputStream implementations should avoid buffering if possible.
+// CopyingOutputStreamAdaptor does its own buffering and will write data
+// in large blocks.
+class LIBPROTOBUF_EXPORT CopyingOutputStream {
+ public:
+  virtual ~CopyingOutputStream();
+
+  // Writes "size" bytes from the given buffer to the output. Returns true
+  // if successful, false on a write error.
+  virtual bool Write(const void* buffer, int size) = 0;
+};
+
+// A ZeroCopyOutputStream which writes to a CopyingOutputStream. This is
+// useful for implementing ZeroCopyOutputStreams that write to traditional
+// streams. Note that this class is not really zero-copy.
+//
+// If you want to write to file descriptors or C++ ostreams, this is
+// already implemented for you: use FileOutputStream or OstreamOutputStream
+// respectively.
+class LIBPROTOBUF_EXPORT CopyingOutputStreamAdaptor : public ZeroCopyOutputStream {
+ public:
+  // Creates a stream that writes to the given CopyingOutputStream.
+  // If a block_size is given, it specifies the size of the buffers
+  // that should be returned by Next(). Otherwise, a reasonable default
+  // is used.
+  explicit CopyingOutputStreamAdaptor(CopyingOutputStream* copying_stream,
+                                      int block_size = -1);
+  ~CopyingOutputStreamAdaptor();
+
+  // Writes all pending data to the underlying stream. Returns false if a
+  // write error occurred on the underlying stream. (The underlying
+  // stream itself is not necessarily flushed.)
+  bool Flush();
+
+  // Call SetOwnsCopyingStream(true) to tell the CopyingOutputStreamAdaptor to
+  // delete the underlying CopyingOutputStream when it is destroyed.
+  void SetOwnsCopyingStream(bool value) { owns_copying_stream_ = value; }
+
+  // implements ZeroCopyOutputStream ---------------------------------
+  bool Next(void** data, int* size);
+  void BackUp(int count);
+  int64 ByteCount() const;
+
+ private:
+  // Write the current buffer, if it is present.
+  bool WriteBuffer();
+  // Ensures that buffer_ is not NULL.
+  void AllocateBufferIfNeeded();
+  // Frees the buffer.
+  void FreeBuffer();
+
+  // The underlying copying stream.
+  CopyingOutputStream* copying_stream_;
+  bool owns_copying_stream_;
+
+  // True if we have seen a permanent error from the underlying stream.
+  bool failed_;
+
+  // The current position of copying_stream_, relative to the point where
+  // we started writing.
+  int64 position_;
+
+  // Data is written from this buffer. It may be NULL if no buffer is
+  // currently in use. Otherwise, it points to an array of size buffer_size_.
+  google::protobuf::scoped_array<uint8> buffer_;
+  const int buffer_size_;
+
+  // Number of valid bytes currently in the buffer (i.e. the size last
+  // returned by Next()). When BackUp() is called, we just reduce this.
+  // 0 <= buffer_used_ <= buffer_size_.
+  int buffer_used_;
+
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(CopyingOutputStreamAdaptor);
+};
+
+// ===================================================================
+
+// mutable_string_data() and as_string_data() are workarounds to improve
+// the performance of writing new data to an existing string. Unfortunately
+// the methods provided by the string class are suboptimal, and using memcpy()
+// is mildly annoying because it requires its pointer args to be non-NULL even
+// if we ask it to copy 0 bytes. Furthermore, string_as_array() has the
+// property that it always returns NULL if its arg is the empty string, exactly
+// what we want to avoid if we're using it in conjunction with memcpy()!
+// With C++11, the desired memcpy() boils down to memcpy(..., &(*s)[0], size),
+// where s is a string*. Without C++11, &(*s)[0] is not guaranteed to be safe,
+// so we use string_as_array(), and live with the extra logic that tests whether
+// *s is empty.
+
+// Return a pointer to mutable characters underlying the given string. The
+// return value is valid until the next time the string is resized. We
+// trust the caller to treat the return value as an array of length s->size().
+inline char* mutable_string_data(string* s) {
+#ifdef LANG_CXX11
+  // This should be simpler & faster than string_as_array() because the latter
+  // is guaranteed to return NULL when *s is empty, so it has to check for that.
+  return &(*s)[0];
+#else
+  return string_as_array(s);
+#endif
+}
+
+// as_string_data(s) is equivalent to
+//  ({ char* p = mutable_string_data(s); make_pair(p, p != NULL); })
+// Sometimes it's faster: in some scenarios p cannot be NULL, and then the
+// code can avoid that check.
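+//
+// For example (editor's illustrative sketch, not part of the original
+// header): copying n raw bytes into a pre-sized string:
+//
+//   string s;
+//   s.resize(n);
+//   std::pair<char*, bool> p = as_string_data(&s);
+//   if (p.second) memcpy(p.first, src, n);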
+inline std::pair<char*, bool> as_string_data(string* s) {
+  char *p = mutable_string_data(s);
+#ifdef LANG_CXX11
+  return std::make_pair(p, true);
+#else
+  return make_pair(p, p != NULL);
+#endif
+}
+
+}  // namespace io
+}  // namespace protobuf
+
+}  // namespace google
+#endif  // GOOGLE_PROTOBUF_IO_ZERO_COPY_STREAM_IMPL_LITE_H__
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/message.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/message.h
new file mode 100644
index 00000000..7898cb24
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/message.h
@@ -0,0 +1,1150 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc. All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Author: kenton@google.com (Kenton Varda)
+//  Based on original Protocol Buffers design by
+//  Sanjay Ghemawat, Jeff Dean, and others.
+//
+// Defines Message, the abstract interface implemented by non-lite
+// protocol message objects. Although it's possible to implement this
+// interface manually, most users will use the protocol compiler to
+// generate implementations.
+//
+// Example usage:
+//
+// Say you have a message defined as:
+//
+//   message Foo {
+//     optional string text = 1;
+//     repeated int32 numbers = 2;
+//   }
+//
+// Then, if you used the protocol compiler to generate a class from the above
+// definition, you could use it like so:
+//
+//   string data;  // Will store a serialized version of the message.
+//
+//   {
+//     // Create a message and serialize it.
+//     Foo foo;
+//     foo.set_text("Hello World!");
+//     foo.add_numbers(1);
+//     foo.add_numbers(5);
+//     foo.add_numbers(42);
+//
+//     foo.SerializeToString(&data);
+//   }
+//
+//   {
+//     // Parse the serialized message and check that it contains the
+//     // correct data.
+//     Foo foo;
+//     foo.ParseFromString(data);
+//
+//     assert(foo.text() == "Hello World!");
+//     assert(foo.numbers_size() == 3);
+//     assert(foo.numbers(0) == 1);
+//     assert(foo.numbers(1) == 5);
+//     assert(foo.numbers(2) == 42);
+//   }
+//
+//   {
+//     // Same as the last block, but do it dynamically via the Message
+//     // reflection interface.
+//     Message* foo = new Foo;
+//     const Descriptor* descriptor = foo->GetDescriptor();
+//
+//     // Get the descriptors for the fields we're interested in and verify
+//     // their types.
+//     const FieldDescriptor* text_field = descriptor->FindFieldByName("text");
+//     assert(text_field != NULL);
+//     assert(text_field->type() == FieldDescriptor::TYPE_STRING);
+//     assert(text_field->label() == FieldDescriptor::LABEL_OPTIONAL);
+//     const FieldDescriptor* numbers_field = descriptor->
+//                                            FindFieldByName("numbers");
+//     assert(numbers_field != NULL);
+//     assert(numbers_field->type() == FieldDescriptor::TYPE_INT32);
+//     assert(numbers_field->label() == FieldDescriptor::LABEL_REPEATED);
+//
+//     // Parse the message.
+//     foo->ParseFromString(data);
+//
+//     // Use the reflection interface to examine the contents.
+//     const Reflection* reflection = foo->GetReflection();
+//     assert(reflection->GetString(*foo, text_field) == "Hello World!");
+//     assert(reflection->FieldSize(*foo, numbers_field) == 3);
+//     assert(reflection->GetRepeatedInt32(*foo, numbers_field, 0) == 1);
+//     assert(reflection->GetRepeatedInt32(*foo, numbers_field, 1) == 5);
+//     assert(reflection->GetRepeatedInt32(*foo, numbers_field, 2) == 42);
+//
+//     delete foo;
+//   }
+
+#ifndef GOOGLE_PROTOBUF_MESSAGE_H__
+#define GOOGLE_PROTOBUF_MESSAGE_H__
+
+#include <iosfwd>
+#include <string>
+#include "type_traits.h"
+#include <vector>
+
+#include "arena.h"
+#include "message_lite.h"
+
+#include "common.h"
+#include "descriptor.h"
+
+
+#define GOOGLE_PROTOBUF_HAS_ONEOF
+#define GOOGLE_PROTOBUF_HAS_ARENAS
+
+namespace google {
+namespace protobuf {
+
+// Defined in this file.
+class Message;
+class Reflection;
+class MessageFactory;
+
+// Defined in other files.
+class MapKey;
+class MapValueRef;
+class MapIterator;
+class MapReflectionTester;
+
+namespace internal {
+class MapFieldBase;
+}
+class UnknownFieldSet;         // unknown_field_set.h
+namespace io {
+class ZeroCopyInputStream;     // zero_copy_stream.h
+class ZeroCopyOutputStream;    // zero_copy_stream.h
+class CodedInputStream;        // coded_stream.h
+class CodedOutputStream;       // coded_stream.h
+}
+namespace python {
+class MapReflectionFriend;     // scalar_map_container.h
+}
+
+
+template<typename T>
+class RepeatedField;     // repeated_field.h
+
+template<typename T>
+class RepeatedPtrField;  // repeated_field.h
+
+// A container to hold message metadata.
+struct Metadata {
+  const Descriptor* descriptor;
+  const Reflection* reflection;
+};
+
+// Abstract interface for protocol messages.
+//
+// See also MessageLite, which contains most every-day operations. Message
+// adds descriptors and reflection on top of that.
+//
+// The methods of this class that are virtual but not pure-virtual have
+// default implementations based on reflection. Message classes which are
+// optimized for speed will want to override these with faster implementations,
+// but classes optimized for code size may be happy with keeping them. See
+// the optimize_for option in descriptor.proto.
+class LIBPROTOBUF_EXPORT Message : public MessageLite {
+ public:
+  inline Message() {}
+  virtual ~Message() {}
+
+  // Basic Operations ------------------------------------------------
+
+  // Construct a new instance of the same type.
Ownership is passed to the
+  // caller. (This is also defined in MessageLite, but is defined again here
+  // for return-type covariance.)
+  virtual Message* New() const = 0;
+
+  // Construct a new instance on the arena. Ownership is passed to the caller
+  // if arena is a NULL. Default implementation allows for API compatibility
+  // during the Arena transition.
+  virtual Message* New(::google::protobuf::Arena* arena) const {
+    Message* message = New();
+    if (arena != NULL) {
+      arena->Own(message);
+    }
+    return message;
+  }
+
+  // Make this message into a copy of the given message. The given message
+  // must have the same descriptor, but need not necessarily be the same class.
+  // By default this is just implemented as "Clear(); MergeFrom(from);".
+  virtual void CopyFrom(const Message& from);
+
+  // Merge the fields from the given message into this message. Singular
+  // fields will be overwritten, if specified in from, except for embedded
+  // messages which will be merged. Repeated fields will be concatenated.
+  // The given message must be of the same type as this message (i.e. the
+  // exact same class).
+  virtual void MergeFrom(const Message& from);
+
+  // Verifies that IsInitialized() returns true. GOOGLE_CHECK-fails otherwise, with
+  // a nice error message.
+  void CheckInitialized() const;
+
+  // Slowly build a list of all required fields that are not set.
+  // This is much, much slower than IsInitialized() as it is implemented
+  // purely via reflection. Generally, you should not call this unless you
+  // have already determined that an error exists by calling IsInitialized().
+  void FindInitializationErrors(std::vector<string>* errors) const;
+
+  // Like FindInitializationErrors, but joins all the strings, delimited by
+  // commas, and returns them.
+  string InitializationErrorString() const;
+
+  // Clears all unknown fields from this message and all embedded messages.
+  // Normally, if unknown tag numbers are encountered when parsing a message,
+  // the tag and value are stored in the message's UnknownFieldSet and
+  // then written back out when the message is serialized. This allows servers
+  // which simply route messages to other servers to pass through messages
+  // that have new field definitions which they don't yet know about. However,
+  // this behavior can have security implications. To avoid it, call this
+  // method after parsing.
+  //
+  // See Reflection::GetUnknownFields() for more on unknown fields.
+  virtual void DiscardUnknownFields();
+
+  // Computes (an estimate of) the total number of bytes currently used for
+  // storing the message in memory. The default implementation calls the
+  // Reflection object's SpaceUsed() method.
+  //
+  // SpaceUsed() is noticeably slower than ByteSize(), as it is implemented
+  // using reflection (rather than the generated code implementation for
+  // ByteSize()). Like ByteSize(), its CPU time is linear in the number of
+  // fields defined for the proto.
+  virtual int SpaceUsed() const;
+
+  // Debugging & Testing----------------------------------------------
+
+  // Generates a human readable form of this message, useful for debugging
+  // and other purposes.
+  string DebugString() const;
+  // Like DebugString(), but with less whitespace.
+  string ShortDebugString() const;
+  // Like DebugString(), but do not escape UTF-8 byte sequences.
+  string Utf8DebugString() const;
+  // Convenience function useful in GDB. Prints DebugString() to stdout.
+ void PrintDebugString() const; + + // Heavy I/O ------------------------------------------------------- + // Additional parsing and serialization methods not implemented by + // MessageLite because they are not supported by the lite library. + + // Parse a protocol buffer from a file descriptor. If successful, the entire + // input will be consumed. + bool ParseFromFileDescriptor(int file_descriptor); + // Like ParseFromFileDescriptor(), but accepts messages that are missing + // required fields. + bool ParsePartialFromFileDescriptor(int file_descriptor); + // Parse a protocol buffer from a C++ istream. If successful, the entire + // input will be consumed. + bool ParseFromIstream(std::istream* input); + // Like ParseFromIstream(), but accepts messages that are missing + // required fields. + bool ParsePartialFromIstream(std::istream* input); + + // Serialize the message and write it to the given file descriptor. All + // required fields must be set. + bool SerializeToFileDescriptor(int file_descriptor) const; + // Like SerializeToFileDescriptor(), but allows missing required fields. + bool SerializePartialToFileDescriptor(int file_descriptor) const; + // Serialize the message and write it to the given C++ ostream. All + // required fields must be set. + bool SerializeToOstream(std::ostream* output) const; + // Like SerializeToOstream(), but allows missing required fields. + bool SerializePartialToOstream(std::ostream* output) const; + + + // Reflection-based methods ---------------------------------------- + // These methods are pure-virtual in MessageLite, but Message provides + // reflection-based default implementations. + + virtual string GetTypeName() const; + virtual void Clear(); + virtual bool IsInitialized() const; + virtual void CheckTypeAndMergeFrom(const MessageLite& other); + virtual bool MergePartialFromCodedStream(io::CodedInputStream* input); + virtual size_t ByteSizeLong() const; + virtual void SerializeWithCachedSizes(io::CodedOutputStream* output) const; + + private: + // This is called only by the default implementation of ByteSize(), to + // update the cached size. If you override ByteSize(), you do not need + // to override this. If you do not override ByteSize(), you MUST override + // this; the default implementation will crash. + // + // The method is private because subclasses should never call it; only + // override it. Yes, C++ lets you do that. Crazy, huh? + virtual void SetCachedSize(int size) const; + + public: + + // Introspection --------------------------------------------------- + + // Typedef for backwards-compatibility. + typedef google::protobuf::Reflection Reflection; + + // Get a Descriptor for this message's type. This describes what + // fields the message contains, the types of those fields, etc. + const Descriptor* GetDescriptor() const { return GetMetadata().descriptor; } + + // Get the Reflection interface for this Message, which can be used to + // read and modify the fields of the Message dynamically (in other words, + // without knowing the message type at compile time). This object remains + // property of the Message. + // + // This method remains virtual in case a subclass does not implement + // reflection and wants to override the default behavior. + virtual const Reflection* GetReflection() const { + return GetMetadata().reflection; + } + + protected: + // Get a struct containing the metadata for the Message. Most subclasses only + // need to implement this method, rather than the GetDescriptor() and + // GetReflection() wrappers. 
+  virtual Metadata GetMetadata() const = 0;
+
+
+ private:
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Message);
+};
+
+namespace internal {
+// Forward-declare interfaces used to implement RepeatedFieldRef.
+// These are protobuf internals that users shouldn't care about.
+class RepeatedFieldAccessor;
+}  // namespace internal
+
+// Forward-declare RepeatedFieldRef templates. The second type parameter is
+// used for SFINAE tricks. Users should ignore it.
+template<typename T, typename Enable = void>
+class RepeatedFieldRef;
+
+template<typename T, typename Enable = void>
+class MutableRepeatedFieldRef;
+
+// This interface contains methods that can be used to dynamically access
+// and modify the fields of a protocol message. Their semantics are
+// similar to the accessors the protocol compiler generates.
+//
+// To get the Reflection for a given Message, call Message::GetReflection().
+//
+// This interface is separate from Message only for efficiency reasons;
+// the vast majority of implementations of Message will share the same
+// implementation of Reflection (GeneratedMessageReflection,
+// defined in generated_message.h), and all Messages of a particular class
+// should share the same Reflection object (though you should not rely on
+// the latter fact).
+//
+// There are several ways that these methods can be used incorrectly. For
+// example, any of the following conditions will lead to undefined
+// results (probably assertion failures):
+// - The FieldDescriptor is not a field of this message type.
+// - The method called is not appropriate for the field's type. For
+//   each field type in FieldDescriptor::TYPE_*, there is only one
+//   Get*() method, one Set*() method, and one Add*() method that is
+//   valid for that type. It should be obvious which (except maybe
+//   for TYPE_BYTES, which are represented using strings in C++).
+// - A Get*() or Set*() method for singular fields is called on a repeated
+//   field.
+// - GetRepeated*(), SetRepeated*(), or Add*() is called on a non-repeated
+//   field.
+// - The Message object passed to any method is not of the right type for
+//   this Reflection object (i.e. message.GetReflection() != reflection).
+//
+// You might wonder why there is not any abstract representation for a field
+// of arbitrary type. E.g., why isn't there just a "GetField()" method that
+// returns "const Field&", where "Field" is some class with accessors like
+// "GetInt32Value()". The problem is that someone would have to deal with
+// allocating these Field objects. For generated message classes, having to
+// allocate space for an additional object to wrap every field would at least
+// double the message's memory footprint, probably worse. Allocating the
+// objects on-demand, on the other hand, would be expensive and prone to
+// memory leaks. So, instead we ended up with this flat interface.
+//
+// TODO(kenton): Create a utility class which callers can use to read and
+//   write fields from a Reflection without paying attention to the type.
+class LIBPROTOBUF_EXPORT Reflection {
+ public:
+  inline Reflection() {}
+  virtual ~Reflection();
+
+  // Get the UnknownFieldSet for the message. This contains fields which
+  // were seen when the Message was parsed but were not recognized according
+  // to the Message's definition. For proto3 protos, this method will always
+  // return an empty UnknownFieldSet.
+  virtual const UnknownFieldSet& GetUnknownFields(
+      const Message& message) const = 0;
+  // Get a mutable pointer to the UnknownFieldSet for the message.
This
+  // contains fields which were seen when the Message was parsed but were not
+  // recognized according to the Message's definition. For proto3 protos, this
+  // method will return a valid mutable UnknownFieldSet pointer but modifying
+  // it won't affect the serialized bytes of the message.
+  virtual UnknownFieldSet* MutableUnknownFields(Message* message) const = 0;
+
+  // Estimate the amount of memory used by the message object.
+  virtual int SpaceUsed(const Message& message) const = 0;
+
+  // Check if the given non-repeated field is set.
+  virtual bool HasField(const Message& message,
+                        const FieldDescriptor* field) const = 0;
+
+  // Get the number of elements of a repeated field.
+  virtual int FieldSize(const Message& message,
+                        const FieldDescriptor* field) const = 0;
+
+  // Clear the value of a field, so that HasField() returns false or
+  // FieldSize() returns zero.
+  virtual void ClearField(Message* message,
+                          const FieldDescriptor* field) const = 0;
+
+  // Check if the oneof is set. Returns true if any field in oneof
+  // is set, false otherwise.
+  // TODO(jieluo) - make it pure virtual after updating all
+  // the subclasses.
+  virtual bool HasOneof(const Message& /*message*/,
+                        const OneofDescriptor* /*oneof_descriptor*/) const {
+    return false;
+  }
+
+  virtual void ClearOneof(Message* /*message*/,
+                          const OneofDescriptor* /*oneof_descriptor*/) const {}
+
+  // Returns the field descriptor if the oneof is set. NULL otherwise.
+  // TODO(jieluo) - make it pure virtual.
+  virtual const FieldDescriptor* GetOneofFieldDescriptor(
+      const Message& /*message*/,
+      const OneofDescriptor* /*oneof_descriptor*/) const {
+    return NULL;
+  }
+
+  // Removes the last element of a repeated field.
+  // We don't provide a way to remove any element other than the last
+  // because it invites inefficient use, such as O(n^2) filtering loops
+  // that should have been O(n). If you want to remove an element other
+  // than the last, the best way to do it is to re-arrange the elements
+  // (using Swap()) so that the one you want removed is at the end, then
+  // call RemoveLast().
+  virtual void RemoveLast(Message* message,
+                          const FieldDescriptor* field) const = 0;
+  // Removes the last element of a repeated message field, and returns the
+  // pointer to the caller. Caller takes ownership of the returned pointer.
+  virtual Message* ReleaseLast(Message* message,
+                               const FieldDescriptor* field) const = 0;
+
+  // Swap the complete contents of two messages.
+  virtual void Swap(Message* message1, Message* message2) const = 0;
+
+  // Swap fields listed in fields vector of two messages.
+  virtual void SwapFields(Message* message1,
+                          Message* message2,
+                          const std::vector<const FieldDescriptor*>& fields)
+      const = 0;
+
+  // Swap two elements of a repeated field.
+  virtual void SwapElements(Message* message,
+                            const FieldDescriptor* field,
+                            int index1,
+                            int index2) const = 0;
+
+  // List all fields of the message which are currently set. This includes
+  // extensions. Singular fields will only be listed if HasField(field) would
+  // return true and repeated fields will only be listed if FieldSize(field)
+  // would return non-zero. Fields (both normal fields and extension fields)
+  // will be listed ordered by field number.
+  virtual void ListFields(
+      const Message& message,
+      std::vector<const FieldDescriptor*>* output) const = 0;
+
+  // Singular field getters ------------------------------------------
+  // These get the value of a non-repeated field. They return the default
+  // value for fields that aren't set.
+ + virtual int32 GetInt32 (const Message& message, + const FieldDescriptor* field) const = 0; + virtual int64 GetInt64 (const Message& message, + const FieldDescriptor* field) const = 0; + virtual uint32 GetUInt32(const Message& message, + const FieldDescriptor* field) const = 0; + virtual uint64 GetUInt64(const Message& message, + const FieldDescriptor* field) const = 0; + virtual float GetFloat (const Message& message, + const FieldDescriptor* field) const = 0; + virtual double GetDouble(const Message& message, + const FieldDescriptor* field) const = 0; + virtual bool GetBool (const Message& message, + const FieldDescriptor* field) const = 0; + virtual string GetString(const Message& message, + const FieldDescriptor* field) const = 0; + virtual const EnumValueDescriptor* GetEnum( + const Message& message, const FieldDescriptor* field) const = 0; + + // GetEnumValue() returns an enum field's value as an integer rather than + // an EnumValueDescriptor*. If the integer value does not correspond to a + // known value descriptor, a new value descriptor is created. (Such a value + // will only be present when the new unknown-enum-value semantics are enabled + // for a message.) + virtual int GetEnumValue( + const Message& message, const FieldDescriptor* field) const; + + // See MutableMessage() for the meaning of the "factory" parameter. + virtual const Message& GetMessage(const Message& message, + const FieldDescriptor* field, + MessageFactory* factory = NULL) const = 0; + + // Get a string value without copying, if possible. + // + // GetString() necessarily returns a copy of the string. This can be + // inefficient when the string is already stored in a string object in the + // underlying message. GetStringReference() will return a reference to the + // underlying string in this case. Otherwise, it will copy the string into + // *scratch and return that. + // + // Note: It is perfectly reasonable and useful to write code like: + // str = reflection->GetStringReference(field, &str); + // This line would ensure that only one copy of the string is made + // regardless of the field's underlying representation. When initializing + // a newly-constructed string, though, it's just as fast and more readable + // to use code like: + // string str = reflection->GetString(message, field); + virtual const string& GetStringReference(const Message& message, + const FieldDescriptor* field, + string* scratch) const = 0; + + + // Singular field mutators ----------------------------------------- + // These mutate the value of a non-repeated field. 
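+  //
+  // For example (editor's illustrative sketch, not part of the original
+  // header): reflection->SetInt32(&message, int32_field, 42);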
+
+  virtual void SetInt32 (Message* message,
+                         const FieldDescriptor* field, int32 value) const = 0;
+  virtual void SetInt64 (Message* message,
+                         const FieldDescriptor* field, int64 value) const = 0;
+  virtual void SetUInt32(Message* message,
+                         const FieldDescriptor* field, uint32 value) const = 0;
+  virtual void SetUInt64(Message* message,
+                         const FieldDescriptor* field, uint64 value) const = 0;
+  virtual void SetFloat (Message* message,
+                         const FieldDescriptor* field, float value) const = 0;
+  virtual void SetDouble(Message* message,
+                         const FieldDescriptor* field, double value) const = 0;
+  virtual void SetBool  (Message* message,
+                         const FieldDescriptor* field, bool value) const = 0;
+  virtual void SetString(Message* message,
+                         const FieldDescriptor* field,
+                         const string& value) const = 0;
+  virtual void SetEnum  (Message* message,
+                         const FieldDescriptor* field,
+                         const EnumValueDescriptor* value) const = 0;
+  // Set an enum field's value with an integer rather than EnumValueDescriptor.
+  // If the value does not correspond to a known enum value, either behavior is
+  // undefined (for proto2 messages), or the value is accepted silently for
+  // messages with new unknown-enum-value semantics.
+  virtual void SetEnumValue(Message* message,
+                            const FieldDescriptor* field,
+                            int value) const;
+
+  // Get a mutable pointer to a field with a message type. If a MessageFactory
+  // is provided, it will be used to construct instances of the sub-message;
+  // otherwise, the default factory is used. If the field is an extension that
+  // does not live in the same pool as the containing message's descriptor (e.g.
+  // it lives in an overlay pool), then a MessageFactory must be provided.
+  // If you have no idea what that meant, then you probably don't need to worry
+  // about it (don't provide a MessageFactory). WARNING: If the
+  // FieldDescriptor is for a compiled-in extension, then
+  // factory->GetPrototype(field->message_type()) MUST return an instance of
+  // the compiled-in class for this type, NOT DynamicMessage.
+  virtual Message* MutableMessage(Message* message,
+                                  const FieldDescriptor* field,
+                                  MessageFactory* factory = NULL) const = 0;
+  // Replaces the message specified by 'field' with the already-allocated object
+  // sub_message, passing ownership to the message. If the field contained a
+  // message, that message is deleted. If sub_message is NULL, the field is
+  // cleared.
+  virtual void SetAllocatedMessage(Message* message,
+                                   Message* sub_message,
+                                   const FieldDescriptor* field) const = 0;
+  // Releases the message specified by 'field' and returns the pointer;
+  // ReleaseMessage() will return the message object if it exists.
+  // Otherwise, it may or may not return NULL. In any case, if the return value
+  // is non-NULL, the caller takes ownership of the pointer.
+  // If the field existed (HasField() is true), then the returned pointer will
+  // be the same as the pointer returned by MutableMessage().
+  // This function has the same effect as ClearField().
+  virtual Message* ReleaseMessage(Message* message,
+                                  const FieldDescriptor* field,
+                                  MessageFactory* factory = NULL) const = 0;
+
+
+  // Repeated field getters ------------------------------------------
+  // These get the value of one element of a repeated field.
+ + virtual int32 GetRepeatedInt32 (const Message& message, + const FieldDescriptor* field, + int index) const = 0; + virtual int64 GetRepeatedInt64 (const Message& message, + const FieldDescriptor* field, + int index) const = 0; + virtual uint32 GetRepeatedUInt32(const Message& message, + const FieldDescriptor* field, + int index) const = 0; + virtual uint64 GetRepeatedUInt64(const Message& message, + const FieldDescriptor* field, + int index) const = 0; + virtual float GetRepeatedFloat (const Message& message, + const FieldDescriptor* field, + int index) const = 0; + virtual double GetRepeatedDouble(const Message& message, + const FieldDescriptor* field, + int index) const = 0; + virtual bool GetRepeatedBool (const Message& message, + const FieldDescriptor* field, + int index) const = 0; + virtual string GetRepeatedString(const Message& message, + const FieldDescriptor* field, + int index) const = 0; + virtual const EnumValueDescriptor* GetRepeatedEnum( + const Message& message, + const FieldDescriptor* field, int index) const = 0; + // GetRepeatedEnumValue() returns an enum field's value as an integer rather + // than an EnumValueDescriptor*. If the integer value does not correspond to a + // known value descriptor, a new value descriptor is created. (Such a value + // will only be present when the new unknown-enum-value semantics are enabled + // for a message.) + virtual int GetRepeatedEnumValue( + const Message& message, + const FieldDescriptor* field, int index) const; + virtual const Message& GetRepeatedMessage( + const Message& message, + const FieldDescriptor* field, int index) const = 0; + + // See GetStringReference(), above. + virtual const string& GetRepeatedStringReference( + const Message& message, const FieldDescriptor* field, + int index, string* scratch) const = 0; + + + // Repeated field mutators ----------------------------------------- + // These mutate the value of one element of a repeated field. + + virtual void SetRepeatedInt32 (Message* message, + const FieldDescriptor* field, + int index, int32 value) const = 0; + virtual void SetRepeatedInt64 (Message* message, + const FieldDescriptor* field, + int index, int64 value) const = 0; + virtual void SetRepeatedUInt32(Message* message, + const FieldDescriptor* field, + int index, uint32 value) const = 0; + virtual void SetRepeatedUInt64(Message* message, + const FieldDescriptor* field, + int index, uint64 value) const = 0; + virtual void SetRepeatedFloat (Message* message, + const FieldDescriptor* field, + int index, float value) const = 0; + virtual void SetRepeatedDouble(Message* message, + const FieldDescriptor* field, + int index, double value) const = 0; + virtual void SetRepeatedBool (Message* message, + const FieldDescriptor* field, + int index, bool value) const = 0; + virtual void SetRepeatedString(Message* message, + const FieldDescriptor* field, + int index, const string& value) const = 0; + virtual void SetRepeatedEnum(Message* message, + const FieldDescriptor* field, int index, + const EnumValueDescriptor* value) const = 0; + // Set an enum field's value with an integer rather than EnumValueDescriptor. + // If the value does not correspond to a known enum value, either behavior is + // undefined (for proto2 messages), or the value is accepted silently for + // messages with new unknown-enum-value semantics. 
+  virtual void SetRepeatedEnumValue(Message* message,
+                                    const FieldDescriptor* field, int index,
+                                    int value) const;
+  // Get a mutable pointer to an element of a repeated field with a message
+  // type.
+  virtual Message* MutableRepeatedMessage(
+      Message* message, const FieldDescriptor* field, int index) const = 0;
+
+
+  // Repeated field adders -------------------------------------------
+  // These add an element to a repeated field.
+
+  virtual void AddInt32 (Message* message,
+                         const FieldDescriptor* field, int32 value) const = 0;
+  virtual void AddInt64 (Message* message,
+                         const FieldDescriptor* field, int64 value) const = 0;
+  virtual void AddUInt32(Message* message,
+                         const FieldDescriptor* field, uint32 value) const = 0;
+  virtual void AddUInt64(Message* message,
+                         const FieldDescriptor* field, uint64 value) const = 0;
+  virtual void AddFloat (Message* message,
+                         const FieldDescriptor* field, float value) const = 0;
+  virtual void AddDouble(Message* message,
+                         const FieldDescriptor* field, double value) const = 0;
+  virtual void AddBool  (Message* message,
+                         const FieldDescriptor* field, bool value) const = 0;
+  virtual void AddString(Message* message,
+                         const FieldDescriptor* field,
+                         const string& value) const = 0;
+  virtual void AddEnum  (Message* message,
+                         const FieldDescriptor* field,
+                         const EnumValueDescriptor* value) const = 0;
+  // Set an enum field's value with an integer rather than EnumValueDescriptor.
+  // If the value does not correspond to a known enum value, either behavior is
+  // undefined (for proto2 messages), or the value is accepted silently for
+  // messages with new unknown-enum-value semantics.
+  virtual void AddEnumValue(Message* message,
+                            const FieldDescriptor* field,
+                            int value) const;
+  // See MutableMessage() for comments on the "factory" parameter.
+  virtual Message* AddMessage(Message* message,
+                              const FieldDescriptor* field,
+                              MessageFactory* factory = NULL) const = 0;
+
+  // Appends an already-allocated object 'new_entry' to the repeated field
+  // specified by 'field' passing ownership to the message.
+  // TODO(tmarek): Make virtual after all subclasses have been
+  // updated.
+  virtual void AddAllocatedMessage(Message* /* message */,
+                                   const FieldDescriptor* /*field */,
+                                   Message* /* new_entry */) const {}
+
+
+  // Get a RepeatedFieldRef object that can be used to read the underlying
+  // repeated field. The type parameter T must be set according to the
+  // field's cpp type. The following table shows the mapping from cpp type
+  // to acceptable T.
+  //
+  //   field->cpp_type()      T
+  //   CPPTYPE_INT32        int32
+  //   CPPTYPE_UINT32       uint32
+  //   CPPTYPE_INT64        int64
+  //   CPPTYPE_UINT64       uint64
+  //   CPPTYPE_DOUBLE       double
+  //   CPPTYPE_FLOAT        float
+  //   CPPTYPE_BOOL         bool
+  //   CPPTYPE_ENUM         generated enum type or int32
+  //   CPPTYPE_STRING       string
+  //   CPPTYPE_MESSAGE      generated message type or google::protobuf::Message
+  //
+  // A RepeatedFieldRef object can be copied and the resulting object will point
+  // to the same repeated field in the same message. The object can be used as
+  // long as the message is not destroyed.
+  //
+  // Note that to use this method users need to include the header file
+  // "google/protobuf/reflection.h" (which defines the RepeatedFieldRef
+  // class templates).
+  template<typename T>
+  RepeatedFieldRef<T> GetRepeatedFieldRef(
+      const Message& message, const FieldDescriptor* field) const;
+
+  // Like GetRepeatedFieldRef() but returns an object that can also be used to
+  // manipulate the underlying repeated field.
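+  //
+  // Illustrative usage of GetRepeatedFieldRef() (editor's sketch, not part
+  // of the original header; assumes a repeated int32 field):
+  //
+  //   RepeatedFieldRef<int32> ref =
+  //       reflection->GetRepeatedFieldRef<int32>(message, field);
+  //   for (int i = 0; i < ref.size(); ++i) {
+  //     DoSomething(ref.Get(i));
+  //   }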
+  template<typename T>
+  MutableRepeatedFieldRef<T> GetMutableRepeatedFieldRef(
+      Message* message, const FieldDescriptor* field) const;
+
+  // DEPRECATED. Please use Get(Mutable)RepeatedFieldRef() for repeated field
+  // access. The following repeated field accessors will be removed in the
+  // future.
+  //
+  // Repeated field accessors -------------------------------------------------
+  // The methods above, e.g. GetRepeatedInt32(msg, fd, index), provide singular
+  // access to the data in a RepeatedField. The methods below provide aggregate
+  // access by exposing the RepeatedField object itself with the Message.
+  // Applying these templates to inappropriate types will lead to an undefined
+  // reference at link time (e.g. GetRepeatedField<***double>), or possibly a
+  // template matching error at compile time (e.g. GetRepeatedPtrField<File>).
+  //
+  // Usage example: my_doubs = refl->GetRepeatedField<double>(msg, fd);
+
+  // DEPRECATED. Please use GetRepeatedFieldRef().
+  //
+  // for T = Cord and all protobuf scalar types except enums.
+  template<typename T>
+  const RepeatedField<T>& GetRepeatedField(
+      const Message&, const FieldDescriptor*) const;
+
+  // DEPRECATED. Please use GetMutableRepeatedFieldRef().
+  //
+  // for T = Cord and all protobuf scalar types except enums.
+  template<typename T>
+  RepeatedField<T>* MutableRepeatedField(
+      Message*, const FieldDescriptor*) const;
+
+  // DEPRECATED. Please use GetRepeatedFieldRef().
+  //
+  // for T = string, google::protobuf::internal::StringPieceField
+  //         google::protobuf::Message & descendants.
+  template<typename T>
+  const RepeatedPtrField<T>& GetRepeatedPtrField(
+      const Message&, const FieldDescriptor*) const;
+
+  // DEPRECATED. Please use GetMutableRepeatedFieldRef().
+  //
+  // for T = string, google::protobuf::internal::StringPieceField
+  //         google::protobuf::Message & descendants.
+  template<typename T>
+  RepeatedPtrField<T>* MutableRepeatedPtrField(
+      Message*, const FieldDescriptor*) const;
+
+  // Extensions ----------------------------------------------------------------
+
+  // Try to find an extension of this message type by fully-qualified field
+  // name. Returns NULL if no extension is known for this name or number.
+  virtual const FieldDescriptor* FindKnownExtensionByName(
+      const string& name) const = 0;
+
+  // Try to find an extension of this message type by field number.
+  // Returns NULL if no extension is known for this name or number.
+  virtual const FieldDescriptor* FindKnownExtensionByNumber(
+      int number) const = 0;
+
+  // Feature Flags -------------------------------------------------------------
+
+  // Does this message support storing arbitrary integer values in enum fields?
+  // If |true|, GetEnumValue/SetEnumValue and associated repeated-field versions
+  // take arbitrary integer values, and the legacy GetEnum() getter will
+  // dynamically create an EnumValueDescriptor for any integer value without
+  // one. If |false|, setting an unknown enum value via the integer-based
+  // setters results in undefined behavior (in practice, GOOGLE_DCHECK-fails).
+  //
+  // Generic code that uses reflection to handle messages with enum fields
+  // should check this flag before using the integer-based setter, and either
+  // downgrade to a compatible value or use the UnknownFieldSet if not.
+
+  // Feature Flags -------------------------------------------------------------
+
+  // Does this message support storing arbitrary integer values in enum fields?
+  // If |true|, GetEnumValue/SetEnumValue and associated repeated-field versions
+  // take arbitrary integer values, and the legacy GetEnum() getter will
+  // dynamically create an EnumValueDescriptor for any integer value without
+  // one. If |false|, setting an unknown enum value via the integer-based
+  // setters results in undefined behavior (in practice, GOOGLE_DCHECK-fails).
+  //
+  // Generic code that uses reflection to handle messages with enum fields
+  // should check this flag before using the integer-based setter, and either
+  // downgrade to a compatible value or use the UnknownFieldSet if not. For
+  // example:
+  //
+  //   int new_value = GetValueFromApplicationLogic();
+  //   if (reflection->SupportsUnknownEnumValues()) {
+  //     reflection->SetEnumValue(message, field, new_value);
+  //   } else {
+  //     if (field_descriptor->enum_type()->
+  //             FindValueByNumber(new_value) != NULL) {
+  //       reflection->SetEnumValue(message, field, new_value);
+  //     } else if (emit_unknown_enum_values) {
+  //       reflection->MutableUnknownFields(message)->AddVarint(
+  //           field->number(),
+  //           new_value);
+  //     } else {
+  //       // convert value to a compatible/default value.
+  //       new_value = CompatibleDowngrade(new_value);
+  //       reflection->SetEnumValue(message, field, new_value);
+  //     }
+  //   }
+  virtual bool SupportsUnknownEnumValues() const { return false; }
+
+  // Returns the MessageFactory associated with this message. This can be
+  // useful for determining if a message is a generated message or not, for
+  // example:
+  //
+  //   if (message->GetReflection()->GetMessageFactory() ==
+  //       google::protobuf::MessageFactory::generated_factory()) {
+  //     // This is a generated message.
+  //   }
+  //
+  // It can also be used to create more messages of this type, though
+  // Message::New() is an easier way to accomplish this.
+  virtual MessageFactory* GetMessageFactory() const;
+
+  // ---------------------------------------------------------------------------
+
+ protected:
+  // Obtain a pointer to a Repeated Field Structure and do some type checking:
+  //   on field->cpp_type(),
+  //   on field->field_option().ctype() (if ctype >= 0)
+  //   of field->message_type() (if message_type != NULL).
+  // We use two routines rather than four (const vs. mutable) x (scalar vs.
+  // pointer).
+  virtual void* MutableRawRepeatedField(
+      Message* message, const FieldDescriptor* field, FieldDescriptor::CppType,
+      int ctype, const Descriptor* message_type) const = 0;
+
+  // TODO(jieluo) - make it pure virtual after updating all the subclasses.
+  virtual const void* GetRawRepeatedField(
+      const Message& message, const FieldDescriptor* field,
+      FieldDescriptor::CppType cpptype, int ctype,
+      const Descriptor* message_type) const {
+    return MutableRawRepeatedField(
+        const_cast<Message*>(&message), field, cpptype, ctype, message_type);
+  }
+
+  // The following methods are used to implement (Mutable)RepeatedFieldRef.
+  // A Ref object will store a raw pointer to the repeated field data (obtained
+  // from RepeatedFieldData()) and a pointer to an Accessor (obtained from
+  // RepeatedFieldAccessor) which will be used to access the raw data.
+  //
+  // TODO(xiaofeng): Make these methods pure-virtual.
+
+  // Returns a raw pointer to the repeated field
+  //
+  // "cpp_type" and "message_type" are deduced from the type parameter T passed
+  // to Get(Mutable)RepeatedFieldRef. If T is a generated message type,
+  // "message_type" should be set to its descriptor. Otherwise "message_type"
+  // should be set to NULL. Implementations of this method should check whether
+  // "cpp_type"/"message_type" is consistent with the actual type of the field.
+  // We use one routine rather than two (const vs mutable) because it is
+  // protected and it doesn't change the message.
+  virtual void* RepeatedFieldData(
+      Message* message, const FieldDescriptor* field,
+      FieldDescriptor::CppType cpp_type,
+      const Descriptor* message_type) const;
+  // The returned pointer should point to a singleton instance which implements
+  // the RepeatedFieldAccessor interface.
+  virtual const internal::RepeatedFieldAccessor* RepeatedFieldAccessor(
+      const FieldDescriptor* field) const;
+
+ private:
+  template <typename T, typename Enable>
+  friend class RepeatedFieldRef;
+  template <typename T, typename Enable>
+  friend class MutableRepeatedFieldRef;
+  friend class ::google::protobuf::python::MapReflectionFriend;
+
+  // Special version for specialized implementations of string. We can't call
+  // MutableRawRepeatedField directly here because we don't have access to
+  // FieldOptions::* which are defined in descriptor.pb.h. Including that
+  // file here is not possible because it would cause a circular include cycle.
+  // We use one routine rather than two (const vs mutable) because it is
+  // private and mutating a repeated string field doesn't change the message.
+  void* MutableRawRepeatedString(
+      Message* message, const FieldDescriptor* field, bool is_string) const;
+
+  friend class MapReflectionTester;
+  // TODO(jieluo) - make the map APIs pure virtual after updating
+  // all the subclasses.
+  // Returns true if key is in map. Returns false if key is not in map field.
+  virtual bool ContainsMapKey(const Message& /* message */,
+                              const FieldDescriptor* /* field */,
+                              const MapKey& /* key */) const {
+    return false;
+  }
+
+  // If key is in map field: Saves the value pointer to val and returns
+  // false. If key is not in map field: Inserts the key into map, saves
+  // value pointer to val and returns true.
+  virtual bool InsertOrLookupMapValue(Message* /* message */,
+                                      const FieldDescriptor* /* field */,
+                                      const MapKey& /* key */,
+                                      MapValueRef* /* val */) const {
+    return false;
+  }
+
+  // Deletes and returns true if key is in the map field. Returns false
+  // otherwise.
+  virtual bool DeleteMapValue(Message* /* message */,
+                              const FieldDescriptor* /* field */,
+                              const MapKey& /* key */) const {
+    return false;
+  }
+
+  // Returns a MapIterator referring to the first element in the map field.
+  // If the map field is empty, this function returns the same as
+  // reflection::MapEnd. Mutation to the field may invalidate the iterator.
+  virtual MapIterator MapBegin(
+      Message* message,
+      const FieldDescriptor* field) const;
+
+  // Returns a MapIterator referring to the theoretical element that would
+  // follow the last element in the map field. It does not point to any
+  // real element. Mutation to the field may invalidate the iterator.
+  virtual MapIterator MapEnd(
+      Message* message,
+      const FieldDescriptor* field) const;
+
+  // Gets the number of map entries of a map field. The result may be
+  // different from FieldSize, which can have duplicate keys.
+  virtual int MapSize(const Message& /* message */,
+                      const FieldDescriptor* /* field */) const {
+    return 0;
+  }
+
+  // Helper method for MapIterator.
+  friend class MapIterator;
+  virtual internal::MapFieldBase* MapData(
+      Message* /* message */, const FieldDescriptor* /* field */) const {
+    return NULL;
+  }
+
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Reflection);
+};
+
+// Abstract interface for a factory for message objects.
+class LIBPROTOBUF_EXPORT MessageFactory {
+ public:
+  inline MessageFactory() {}
+  virtual ~MessageFactory();
+
+  // Given a Descriptor, gets or constructs the default (prototype) Message
+  // of that type. You can then call that message's New() method to construct
+  // a mutable message of that type.
+  //
+  // Calling this method twice with the same Descriptor returns the same
+  // object. The returned object remains property of the factory. Also, any
+  // objects created by calling the prototype's New() method share some data
+  // with the prototype, so these must be destroyed before the MessageFactory
+  // is destroyed.
+  //
+  // The given descriptor must outlive the returned message, and hence must
+  // outlive the MessageFactory.
+  //
+  // Some implementations do not support all types. GetPrototype() will
+  // return NULL if the descriptor passed in is not supported.
+  //
+  // This method may or may not be thread-safe depending on the implementation.
+  // Each implementation should document its own degree of thread-safety.
+  virtual const Message* GetPrototype(const Descriptor* type) = 0;
+
+  // Gets a MessageFactory which supports all generated, compiled-in messages.
+  // In other words, for any compiled-in type FooMessage, the following is true:
+  //   MessageFactory::generated_factory()->GetPrototype(
+  //     FooMessage::descriptor()) == FooMessage::default_instance()
+  // This factory supports all types which are found in
+  // DescriptorPool::generated_pool(). If given a descriptor from any other
+  // pool, GetPrototype() will return NULL. (You can also check if a
+  // descriptor is for a generated message by checking if
+  // descriptor->file()->pool() == DescriptorPool::generated_pool().)
+  //
+  // This factory is 100% thread-safe; calling GetPrototype() does not modify
+  // any shared data.
+  //
+  // This factory is a singleton. The caller must not delete the object.
+  static MessageFactory* generated_factory();
+
+  // For internal use only: Registers a .proto file at static initialization
+  // time, to be placed in generated_factory. The first time GetPrototype()
+  // is called with a descriptor from this file, |register_messages| will be
+  // called, with the file name as the parameter. It must call
+  // InternalRegisterGeneratedMessage() (below) to register each message type
+  // in the file. This strange mechanism is necessary because descriptors are
+  // built lazily, so we can't register types by their descriptor until we
+  // know that the descriptor exists. |filename| must be a permanent string.
+  static void InternalRegisterGeneratedFile(
+      const char* filename, void (*register_messages)(const string&));
+
+  // For internal use only: Registers a message type. Called only by the
+  // functions which are registered with InternalRegisterGeneratedFile(),
+  // above.
+  static void InternalRegisterGeneratedMessage(const Descriptor* descriptor,
+                                               const Message* prototype);
+
+
+ private:
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(MessageFactory);
+};
+
+#define DECLARE_GET_REPEATED_FIELD(TYPE)                         \
+template<>                                                       \
+LIBPROTOBUF_EXPORT                                               \
+const RepeatedField<TYPE>& Reflection::GetRepeatedField<TYPE>(   \
+    const Message& message, const FieldDescriptor* field) const; \
+                                                                 \
+template<>                                                       \
+LIBPROTOBUF_EXPORT                                               \
+RepeatedField<TYPE>* Reflection::MutableRepeatedField<TYPE>(     \
+    Message* message, const FieldDescriptor* field) const;
+
+DECLARE_GET_REPEATED_FIELD(int32)
+DECLARE_GET_REPEATED_FIELD(int64)
+DECLARE_GET_REPEATED_FIELD(uint32)
+DECLARE_GET_REPEATED_FIELD(uint64)
+DECLARE_GET_REPEATED_FIELD(float)
+DECLARE_GET_REPEATED_FIELD(double)
+DECLARE_GET_REPEATED_FIELD(bool)
+
+#undef DECLARE_GET_REPEATED_FIELD
+
+// =============================================================================
+// Implementation details for {Get,Mutable}RawRepeatedPtrField. We provide
+// specializations for <string>, <StringPieceField> and <Message> and handle
+// everything else with the default template which will match any type having
+// a method with signature "static const google::protobuf::Descriptor* descriptor()".
+// Such a type presumably is a descendant of google::protobuf::Message.
+
+template<>
+inline const RepeatedPtrField<string>& Reflection::GetRepeatedPtrField<string>(
+    const Message& message, const FieldDescriptor* field) const {
+  return *static_cast<RepeatedPtrField<string>* >(
+      MutableRawRepeatedString(const_cast<Message*>(&message), field, true));
+}
+
+template<>
+inline RepeatedPtrField<string>* Reflection::MutableRepeatedPtrField<string>(
+    Message* message, const FieldDescriptor* field) const {
+  return static_cast<RepeatedPtrField<string>* >(
+      MutableRawRepeatedString(message, field, true));
+}
+
+
+// -----
+
+template<>
+inline const RepeatedPtrField<Message>& Reflection::GetRepeatedPtrField(
+    const Message& message, const FieldDescriptor* field) const {
+  return *static_cast<const RepeatedPtrField<Message>* >(
+      GetRawRepeatedField(message, field, FieldDescriptor::CPPTYPE_MESSAGE,
+                          -1, NULL));
+}
+
+template<>
+inline RepeatedPtrField<Message>* Reflection::MutableRepeatedPtrField(
+    Message* message, const FieldDescriptor* field) const {
+  return static_cast<RepeatedPtrField<Message>* >(
+      MutableRawRepeatedField(message, field,
+                              FieldDescriptor::CPPTYPE_MESSAGE, -1,
+                              NULL));
+}
+
+template <typename PB>
+inline const RepeatedPtrField<PB>& Reflection::GetRepeatedPtrField(
+    const Message& message, const FieldDescriptor* field) const {
+  return *static_cast<const RepeatedPtrField<PB>* >(
+      GetRawRepeatedField(message, field, FieldDescriptor::CPPTYPE_MESSAGE,
+                          -1, PB::default_instance().GetDescriptor()));
+}
+
+template <typename PB>
+inline RepeatedPtrField<PB>* Reflection::MutableRepeatedPtrField(
+    Message* message, const FieldDescriptor* field) const {
+  return static_cast<RepeatedPtrField<PB>* >(
+      MutableRawRepeatedField(message, field,
+                              FieldDescriptor::CPPTYPE_MESSAGE, -1,
+                              PB::default_instance().GetDescriptor()));
+}
+}  // namespace protobuf
+
+}  // namespace google
+#endif  // GOOGLE_PROTOBUF_MESSAGE_H__
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/message_lite.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/message_lite.h
new file mode 100644
index 00000000..c623dd67
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/message_lite.h
@@ -0,0 +1,297 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc. All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Authors: wink@google.com (Wink Saville), +// kenton@google.com (Kenton Varda) +// Based on original Protocol Buffers design by +// Sanjay Ghemawat, Jeff Dean, and others. +// +// Defines MessageLite, the abstract interface implemented by all (lite +// and non-lite) protocol message objects. + +#ifndef GOOGLE_PROTOBUF_MESSAGE_LITE_H__ +#define GOOGLE_PROTOBUF_MESSAGE_LITE_H__ + +#include "common.h" + + +namespace google { +namespace protobuf { + class Arena; +namespace io { + class CodedInputStream; + class CodedOutputStream; + class ZeroCopyInputStream; + class ZeroCopyOutputStream; +} +namespace internal { + class WireFormatLite; +} + +// Interface to light weight protocol messages. +// +// This interface is implemented by all protocol message objects. Non-lite +// messages additionally implement the Message interface, which is a +// subclass of MessageLite. Use MessageLite instead when you only need +// the subset of features which it supports -- namely, nothing that uses +// descriptors or reflection. You can instruct the protocol compiler +// to generate classes which implement only MessageLite, not the full +// Message interface, by adding the following line to the .proto file: +// +// option optimize_for = LITE_RUNTIME; +// +// This is particularly useful on resource-constrained systems where +// the full protocol buffers runtime library is too big. +// +// Note that on non-constrained systems (e.g. servers) when you need +// to link in lots of protocol definitions, a better way to reduce +// total code footprint is to use optimize_for = CODE_SIZE. This +// will make the generated code smaller while still supporting all the +// same features (at the expense of speed). optimize_for = LITE_RUNTIME +// is best when you only have a small number of message types linked +// into your binary, in which case the size of the protocol buffers +// runtime itself is the biggest problem. +class LIBPROTOBUF_EXPORT MessageLite { + public: + inline MessageLite() {} + virtual ~MessageLite() {} + + // Basic Operations ------------------------------------------------ + + // Get the name of this message type, e.g. "foo.bar.BazProto". + virtual string GetTypeName() const = 0; + + // Construct a new instance of the same type. Ownership is passed to the + // caller. + virtual MessageLite* New() const = 0; + + // Construct a new instance on the arena. Ownership is passed to the caller + // if arena is a NULL. Default implementation for backwards compatibility. + virtual MessageLite* New(::google::protobuf::Arena* arena) const; + + // Get the arena, if any, associated with this message. Virtual method + // required for generic operations but most arena-related operations should + // use the GetArenaNoVirtual() generated-code method. Default implementation + // to reduce code size by avoiding the need for per-type implementations when + // types do not implement arena support. 
+ virtual ::google::protobuf::Arena* GetArena() const { return NULL; } + + // Get a pointer that may be equal to this message's arena, or may not be. If + // the value returned by this method is equal to some arena pointer, then this + // message is on that arena; however, if this message is on some arena, this + // method may or may not return that arena's pointer. As a tradeoff, this + // method may be more efficient than GetArena(). The intent is to allow + // underlying representations that use e.g. tagged pointers to sometimes store + // the arena pointer directly, and sometimes in a more indirect way, and allow + // a fastpath comparison against the arena pointer when it's easy to obtain. + virtual void* GetMaybeArenaPointer() const { return GetArena(); } + + // Clear all fields of the message and set them to their default values. + // Clear() avoids freeing memory, assuming that any memory allocated + // to hold parts of the message will be needed again to hold the next + // message. If you actually want to free the memory used by a Message, + // you must delete it. + virtual void Clear() = 0; + + // Quickly check if all required fields have values set. + virtual bool IsInitialized() const = 0; + + // This is not implemented for Lite messages -- it just returns "(cannot + // determine missing fields for lite message)". However, it is implemented + // for full messages. See message.h. + virtual string InitializationErrorString() const; + + // If |other| is the exact same class as this, calls MergeFrom(). Otherwise, + // results are undefined (probably crash). + virtual void CheckTypeAndMergeFrom(const MessageLite& other) = 0; + + // Parsing --------------------------------------------------------- + // Methods for parsing in protocol buffer format. Most of these are + // just simple wrappers around MergeFromCodedStream(). Clear() will be called + // before merging the input. + + // Fill the message with a protocol buffer parsed from the given input stream. + // Returns false on a read error or if the input is in the wrong format. A + // successful return does not indicate the entire input is consumed, ensure + // you call ConsumedEntireMessage() to check that if applicable. + bool ParseFromCodedStream(io::CodedInputStream* input); + // Like ParseFromCodedStream(), but accepts messages that are missing + // required fields. + bool ParsePartialFromCodedStream(io::CodedInputStream* input); + // Read a protocol buffer from the given zero-copy input stream. If + // successful, the entire input will be consumed. + bool ParseFromZeroCopyStream(io::ZeroCopyInputStream* input); + // Like ParseFromZeroCopyStream(), but accepts messages that are missing + // required fields. + bool ParsePartialFromZeroCopyStream(io::ZeroCopyInputStream* input); + // Read a protocol buffer from the given zero-copy input stream, expecting + // the message to be exactly "size" bytes long. If successful, exactly + // this many bytes will have been consumed from the input. + bool ParseFromBoundedZeroCopyStream(io::ZeroCopyInputStream* input, int size); + // Like ParseFromBoundedZeroCopyStream(), but accepts messages that are + // missing required fields. + bool ParsePartialFromBoundedZeroCopyStream(io::ZeroCopyInputStream* input, + int size); + // Parses a protocol buffer contained in a string. Returns true on success. + // This function takes a string in the (non-human-readable) binary wire + // format, matching the encoding output by MessageLite::SerializeToString(). 
+  // If you'd like to convert a human-readable string into a protocol buffer
+  // object, see google::protobuf::TextFormat::ParseFromString().
+  bool ParseFromString(const string& data);
+  // Like ParseFromString(), but accepts messages that are missing
+  // required fields.
+  bool ParsePartialFromString(const string& data);
+  // Parse a protocol buffer contained in an array of bytes.
+  bool ParseFromArray(const void* data, int size);
+  // Like ParseFromArray(), but accepts messages that are missing
+  // required fields.
+  bool ParsePartialFromArray(const void* data, int size);
+
+
+  // Reads a protocol buffer from the stream and merges it into this
+  // Message. Singular fields read from the input overwrite what is
+  // already in the Message and repeated fields are appended to those
+  // already present.
+  //
+  // It is the responsibility of the caller to call input->LastTagWas()
+  // (for groups) or input->ConsumedEntireMessage() (for non-groups) after
+  // this returns to verify that the message's end was delimited correctly.
+  //
+  // ParseFromCodedStream() is implemented as Clear() followed by
+  // MergeFromCodedStream().
+  bool MergeFromCodedStream(io::CodedInputStream* input);
+
+  // Like MergeFromCodedStream(), but succeeds even if required fields are
+  // missing in the input.
+  //
+  // MergeFromCodedStream() is just implemented as MergePartialFromCodedStream()
+  // followed by IsInitialized().
+  virtual bool MergePartialFromCodedStream(io::CodedInputStream* input) = 0;
+
+
+  // Serialization ---------------------------------------------------
+  // Methods for serializing in protocol buffer format. Most of these
+  // are just simple wrappers around ByteSize() and SerializeWithCachedSizes().
+
+  // Write a protocol buffer of this message to the given output. Returns
+  // false on a write error. If the message is missing required fields,
+  // this may GOOGLE_CHECK-fail.
+  bool SerializeToCodedStream(io::CodedOutputStream* output) const;
+  // Like SerializeToCodedStream(), but allows missing required fields.
+  bool SerializePartialToCodedStream(io::CodedOutputStream* output) const;
+  // Write the message to the given zero-copy output stream. All required
+  // fields must be set.
+  bool SerializeToZeroCopyStream(io::ZeroCopyOutputStream* output) const;
+  // Like SerializeToZeroCopyStream(), but allows missing required fields.
+  bool SerializePartialToZeroCopyStream(io::ZeroCopyOutputStream* output) const;
+  // Serialize the message and store it in the given string. All required
+  // fields must be set.
+  bool SerializeToString(string* output) const;
+  // Like SerializeToString(), but allows missing required fields.
+  bool SerializePartialToString(string* output) const;
+  // Serialize the message and store it in the given byte array. All required
+  // fields must be set.
+  bool SerializeToArray(void* data, int size) const;
+  // Like SerializeToArray(), but allows missing required fields.
+  bool SerializePartialToArray(void* data, int size) const;
+
+  // Make a string encoding the message. Is equivalent to calling
+  // SerializeToString() on a string and using that. Returns the empty
+  // string if SerializeToString() would have returned an error.
+  // Note: If you intend to generate many such strings, you may
+  // reduce heap fragmentation by instead re-using the same string
+  // object with calls to SerializeToString().
+  string SerializeAsString() const;
+  // Like SerializeAsString(), but allows missing required fields.
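+  // --------------------------------------------------------------------
+  // Illustrative sketch (editorial addition, not from the upstream
+  // header): a binary round-trip through the string-based API above.
+  // "MyMsg" is a hypothetical generated type.
+  //
+  //   MyMsg in, out;
+  //   string wire;
+  //   if (!in.SerializeToString(&wire)) {
+  //     // a required field was missing
+  //   }
+  //   if (!out.ParseFromString(wire)) {
+  //     // the input was malformed
+  //   }
+  // --------------------------------------------------------------------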
+ string SerializePartialAsString() const; + + // Like SerializeToString(), but appends to the data to the string's existing + // contents. All required fields must be set. + bool AppendToString(string* output) const; + // Like AppendToString(), but allows missing required fields. + bool AppendPartialToString(string* output) const; + + // Computes the serialized size of the message. This recursively calls + // ByteSize() on all embedded messages. Subclasses MUST override either + // ByteSize() or ByteSizeLong() (overriding both is fine). + // + // ByteSize() is generally linear in the number of fields defined for the + // proto. + virtual int ByteSize() const { return ByteSizeLong(); } + virtual size_t ByteSizeLong() const; + + // Serializes the message without recomputing the size. The message must + // not have changed since the last call to ByteSize(); if it has, the results + // are undefined. + virtual void SerializeWithCachedSizes( + io::CodedOutputStream* output) const = 0; + + // A version of SerializeWithCachedSizesToArray, below, that does + // not guarantee deterministic serialization. + virtual uint8* SerializeWithCachedSizesToArray(uint8* target) const { + return InternalSerializeWithCachedSizesToArray(false, target); + } + + // Returns the result of the last call to ByteSize(). An embedded message's + // size is needed both to serialize it (because embedded messages are + // length-delimited) and to compute the outer message's size. Caching + // the size avoids computing it multiple times. + // + // ByteSize() does not automatically use the cached size when available + // because this would require invalidating it every time the message was + // modified, which would be too hard and expensive. (E.g. if a deeply-nested + // sub-message is changed, all of its parents' cached sizes would need to be + // invalidated, which is too much work for an otherwise inlined setter + // method.) + virtual int GetCachedSize() const = 0; + + // Functions below here are not part of the public interface. It isn't + // enforced, but they should be treated as private, and will be private + // at some future time. Unfortunately the implementation of the "friend" + // keyword in GCC is broken at the moment, but we expect it will be fixed. + + // Like SerializeWithCachedSizes, but writes directly to *target, returning + // a pointer to the byte immediately after the last byte written. "target" + // must point at a byte array of at least ByteSize() bytes. If deterministic + // is true then we use deterministic serialization, e.g., map keys are sorted. + // FOR INTERNAL USE ONLY! + virtual uint8* InternalSerializeWithCachedSizesToArray(bool deterministic, + uint8* target) const; + + private: + friend class internal::WireFormatLite; + + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(MessageLite); +}; + +} // namespace protobuf + +} // namespace google +#endif // GOOGLE_PROTOBUF_MESSAGE_LITE_H__ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/metadata.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/metadata.h new file mode 100644 index 00000000..0b44334f --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/metadata.h @@ -0,0 +1,159 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. 
+// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This header file defines an internal class that encapsulates internal message +// metadata (Unknown-field set, Arena pointer, ...) and allows its +// representation to be made more space-efficient via various optimizations. +// +// Note that this is distinct from google::protobuf::Metadata, which encapsulates +// Descriptor and Reflection pointers. + +#ifndef GOOGLE_PROTOBUF_METADATA_H__ +#define GOOGLE_PROTOBUF_METADATA_H__ + +#include "common.h" +#include "arena.h" +#include "unknown_field_set.h" + +namespace google { +namespace protobuf { +namespace internal { + +// This is the representation for messages that support arena allocation. It +// uses a tagged pointer to either store the Arena pointer, if there are no +// unknown fields, or a pointer to a block of memory with both the Arena pointer +// and the UnknownFieldSet, if there are unknown fields. This optimization +// allows for "zero-overhead" storage of the Arena pointer, relative to the +// above baseline implementation. +// +// The tagged pointer uses the LSB to disambiguate cases, and uses bit 0 == 0 to +// indicate an arena pointer and bit 0 == 1 to indicate a UFS+Arena-container +// pointer. 
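+//
+// Illustrative sketch (editorial addition, not from the upstream header) of
+// the tagged-pointer scheme described above, in plain C++:
+//
+//   void* ptr = ...;                                  // arena or container
+//   intptr_t bits = reinterpret_cast<intptr_t>(ptr);
+//   bool has_container = (bits & 1) == 1;             // LSB carries the tag
+//   void* value =
+//       reinterpret_cast<void*>(bits & ~static_cast<intptr_t>(1));
+//
+// This relies on allocations being at least 2-byte aligned, so the low bit
+// is always free to carry the tag.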
+class LIBPROTOBUF_EXPORT InternalMetadataWithArena { + public: + InternalMetadataWithArena() : ptr_(NULL) {} + explicit InternalMetadataWithArena(Arena* arena) + : ptr_ (arena) {} + + ~InternalMetadataWithArena() { + if (have_unknown_fields() && arena() == NULL) { + delete PtrValue(); + } + ptr_ = NULL; + } + + GOOGLE_ATTRIBUTE_ALWAYS_INLINE const UnknownFieldSet& unknown_fields() const { + if (GOOGLE_PREDICT_FALSE(have_unknown_fields())) { + return PtrValue()->unknown_fields_; + } else { + return *UnknownFieldSet::default_instance(); + } + } + + GOOGLE_ATTRIBUTE_ALWAYS_INLINE UnknownFieldSet* mutable_unknown_fields() { + if (GOOGLE_PREDICT_TRUE(have_unknown_fields())) { + return &PtrValue()->unknown_fields_; + } else { + return mutable_unknown_fields_slow(); + } + } + + GOOGLE_ATTRIBUTE_ALWAYS_INLINE Arena* arena() const { + if (GOOGLE_PREDICT_FALSE(have_unknown_fields())) { + return PtrValue()->arena_; + } else { + return PtrValue(); + } + } + + GOOGLE_ATTRIBUTE_ALWAYS_INLINE bool have_unknown_fields() const { + return PtrTag() == kTagContainer; + } + + GOOGLE_ATTRIBUTE_ALWAYS_INLINE void Swap(InternalMetadataWithArena* other) { + // Semantics here are that we swap only the unknown fields, not the arena + // pointer. We cannot simply swap ptr_ with other->ptr_ because we need to + // maintain our own arena ptr. Also, our ptr_ and other's ptr_ may be in + // different states (direct arena pointer vs. container with UFS) so we + // cannot simply swap ptr_ and then restore the arena pointers. We reuse + // UFS's swap implementation instead. + if (have_unknown_fields() || other->have_unknown_fields()) { + mutable_unknown_fields()->Swap(other->mutable_unknown_fields()); + } + } + + GOOGLE_ATTRIBUTE_ALWAYS_INLINE void* raw_arena_ptr() const { + return ptr_; + } + + private: + void* ptr_; + + // Tagged pointer implementation. + enum { + // ptr_ is an Arena*. + kTagArena = 0, + // ptr_ is a Container*. + kTagContainer = 1, + }; + static const intptr_t kPtrTagMask = 1; + static const intptr_t kPtrValueMask = ~kPtrTagMask; + + // Accessors for pointer tag and pointer value. + GOOGLE_ATTRIBUTE_ALWAYS_INLINE int PtrTag() const { + return reinterpret_cast(ptr_) & kPtrTagMask; + } + + template T* PtrValue() const { + return reinterpret_cast( + reinterpret_cast(ptr_) & kPtrValueMask); + } + + // If ptr_'s tag is kTagContainer, it points to an instance of this struct. + struct Container { + UnknownFieldSet unknown_fields_; + Arena* arena_; + }; + + GOOGLE_ATTRIBUTE_NOINLINE UnknownFieldSet* mutable_unknown_fields_slow() { + Arena* my_arena = arena(); + Container* container = Arena::Create(my_arena); + ptr_ = reinterpret_cast( + reinterpret_cast(container) | kTagContainer); + container->arena_ = my_arena; + return &(container->unknown_fields_); + } +}; + +} // namespace internal +} // namespace protobuf + +} // namespace google +#endif // GOOGLE_PROTOBUF_METADATA_H__ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/repeated_field.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/repeated_field.h new file mode 100644 index 00000000..bdfa1063 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/repeated_field.h @@ -0,0 +1,2504 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. 
+// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Author: kenton@google.com (Kenton Varda) +// Based on original Protocol Buffers design by +// Sanjay Ghemawat, Jeff Dean, and others. +// +// RepeatedField and RepeatedPtrField are used by generated protocol message +// classes to manipulate repeated fields. These classes are very similar to +// STL's vector, but include a number of optimizations found to be useful +// specifically in the case of Protocol Buffers. RepeatedPtrField is +// particularly different from STL vector as it manages ownership of the +// pointers that it contains. +// +// Typically, clients should not need to access RepeatedField objects directly, +// but should instead use the accessor functions generated automatically by the +// protocol compiler. + +#ifndef GOOGLE_PROTOBUF_REPEATED_FIELD_H__ +#define GOOGLE_PROTOBUF_REPEATED_FIELD_H__ + +#ifdef _MSC_VER +// This is required for min/max on VS2013 only. +#include +#endif + +#include +#include +#include "casts.h" +#include "logging.h" +#include "common.h" +#include "type_traits.h" +#include "arena.h" +#include "generated_message_util.h" +#include "message_lite.h" + +namespace google { + +namespace upb { +namespace google_opensource { +class GMR_Handlers; +} // namespace google_opensource +} // namespace upb + +namespace protobuf { + +class Message; + +namespace internal { + +static const int kMinRepeatedFieldAllocationSize = 4; + +// A utility function for logging that doesn't need any template types. 
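+// Illustrative sketch (editorial addition, not from the upstream header):
+// direct use of RepeatedField with a scalar element type, mirroring what the
+// generated repeated-field accessors do under the hood.
+//
+//   RepeatedField<int32> field;
+//   field.Add(1);
+//   field.Add(2);
+//   for (int i = 0; i < field.size(); i++) {
+//     int32 v = field.Get(i);
+//   }
+//   field.Truncate(1);  // O(1) shrink; capacity is retained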
+void LogIndexOutOfBounds(int index, int size); + +template +inline int CalculateReserve(Iter begin, Iter end, std::forward_iterator_tag) { + return std::distance(begin, end); +} + +template +inline int CalculateReserve(Iter /*begin*/, Iter /*end*/, + std::input_iterator_tag /*unused*/) { + return -1; +} + +template +inline int CalculateReserve(Iter begin, Iter end) { + typedef typename std::iterator_traits::iterator_category Category; + return CalculateReserve(begin, end, Category()); +} +} // namespace internal + + +// RepeatedField is used to represent repeated fields of a primitive type (in +// other words, everything except strings and nested Messages). Most users will +// not ever use a RepeatedField directly; they will use the get-by-index, +// set-by-index, and add accessors that are generated for all repeated fields. +template +class RepeatedField { + public: + RepeatedField(); + explicit RepeatedField(Arena* arena); + RepeatedField(const RepeatedField& other); + template + RepeatedField(Iter begin, const Iter& end); + ~RepeatedField(); + + RepeatedField& operator=(const RepeatedField& other); + + bool empty() const; + int size() const; + + const Element& Get(int index) const; + Element* Mutable(int index); + + const Element& operator[](int index) const { return Get(index); } + Element& operator[](int index) { return *Mutable(index); } + + void Set(int index, const Element& value); + void Add(const Element& value); + Element* Add(); + // Remove the last element in the array. + void RemoveLast(); + + // Extract elements with indices in "[start .. start+num-1]". + // Copy them into "elements[0 .. num-1]" if "elements" is not NULL. + // Caution: implementation also moves elements with indices [start+num ..]. + // Calling this routine inside a loop can cause quadratic behavior. + void ExtractSubrange(int start, int num, Element* elements); + + void Clear(); + void MergeFrom(const RepeatedField& other); + void UnsafeMergeFrom(const RepeatedField& other); + void CopyFrom(const RepeatedField& other); + + // Reserve space to expand the field to at least the given size. If the + // array is grown, it will always be at least doubled in size. + void Reserve(int new_size); + + // Resize the RepeatedField to a new, smaller size. This is O(1). + void Truncate(int new_size); + + void AddAlreadyReserved(const Element& value); + Element* AddAlreadyReserved(); + int Capacity() const; + + // Like STL resize. Uses value to fill appended elements. + // Like Truncate() if new_size <= size(), otherwise this is + // O(new_size - size()). + void Resize(int new_size, const Element& value); + + // Gets the underlying array. This pointer is possibly invalidated by + // any add or remove operation. + Element* mutable_data(); + const Element* data() const; + + // Swap entire contents with "other". If they are separate arenas then, copies + // data between each other. + void Swap(RepeatedField* other); + + // Swap entire contents with "other". Should be called only if the caller can + // guarantee that both repeated fields are on the same arena or are on the + // heap. Swapping between different arenas is disallowed and caught by a + // GOOGLE_DCHECK (see API docs for details). + void UnsafeArenaSwap(RepeatedField* other); + + // Swap two elements. 
+ void SwapElements(int index1, int index2); + + // STL-like iterator support + typedef Element* iterator; + typedef const Element* const_iterator; + typedef Element value_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef int size_type; + typedef ptrdiff_t difference_type; + + iterator begin(); + const_iterator begin() const; + const_iterator cbegin() const; + iterator end(); + const_iterator end() const; + const_iterator cend() const; + + // Reverse iterator support + typedef std::reverse_iterator const_reverse_iterator; + typedef std::reverse_iterator reverse_iterator; + reverse_iterator rbegin() { + return reverse_iterator(end()); + } + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + reverse_iterator rend() { + return reverse_iterator(begin()); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + // Returns the number of bytes used by the repeated field, excluding + // sizeof(*this) + int SpaceUsedExcludingSelf() const; + + // Removes the element referenced by position. + // + // Returns an iterator to the element immediately following the removed + // element. + // + // Invalidates all iterators at or after the removed element, including end(). + iterator erase(const_iterator position); + + // Removes the elements in the range [first, last). + // + // Returns an iterator to the element immediately following the removed range. + // + // Invalidates all iterators at or after the removed range, including end(). + iterator erase(const_iterator first, const_iterator last); + + // Get the Arena on which this RepeatedField stores its elements. + ::google::protobuf::Arena* GetArena() const { + return GetArenaNoVirtual(); + } + + private: + static const int kInitialSize = 0; + // A note on the representation here (see also comment below for + // RepeatedPtrFieldBase's struct Rep): + // + // We maintain the same sizeof(RepeatedField) as before we added arena support + // so that we do not degrade performance by bloating memory usage. Directly + // adding an arena_ element to RepeatedField is quite costly. By using + // indirection in this way, we keep the same size when the RepeatedField is + // empty (common case), and add only an 8-byte header to the elements array + // when non-empty. We make sure to place the size fields directly in the + // RepeatedField class to avoid costly cache misses due to the indirection. + int current_size_; + int total_size_; + struct Rep { + Arena* arena; + Element elements[1]; + }; + // We can not use sizeof(Rep) - sizeof(Element) due to the trailing padding on + // the struct. We can not use sizeof(Arena*) as well because there might be + // a "gap" after the field arena and before the field elements (e.g., when + // Element is double and pointer is 32bit). + static const size_t kRepHeaderSize; + // Contains arena ptr and the elements array. We also keep the invariant that + // if rep_ is NULL, then arena is NULL. + Rep* rep_; + + friend class Arena; + typedef void InternalArenaConstructable_; + + // Move the contents of |from| into |to|, possibly clobbering |from| in the + // process. For primitive types this is just a memcpy(), but it could be + // specialized for non-primitive types to, say, swap each element instead. + void MoveArray(Element* to, Element* from, int size); + + // Copy the elements of |from| into |to|. 
+  void CopyArray(Element* to, const Element* from, int size);
+
+  inline void InternalSwap(RepeatedField* other);
+
+  // Internal helper expected by Arena methods.
+  inline Arena* GetArenaNoVirtual() const {
+    return (rep_ == NULL) ? NULL : rep_->arena;
+  }
+
+  // Internal helper to delete all elements and deallocate the storage.
+  // If Element has a trivial destructor (for example, if it's a fundamental
+  // type, like int32), the loop will be removed by the optimizer.
+  void InternalDeallocate(Rep* rep, int size) {
+    if (rep != NULL) {
+      Element* e = &rep->elements[0];
+      Element* limit = &rep->elements[size];
+      for (; e < limit; e++) {
+        e->Element::~Element();
+      }
+      if (rep->arena == NULL) {
+#if defined(__GXX_DELETE_WITH_SIZE__) || defined(__cpp_sized_deallocation)
+        const size_t bytes = size * sizeof(*e) + kRepHeaderSize;
+        ::operator delete(static_cast<void*>(rep), bytes);
+#else
+        ::operator delete(static_cast<void*>(rep));
+#endif
+      }
+    }
+  }
+};
+
+template <typename Element>
+const size_t RepeatedField<Element>::kRepHeaderSize =
+    reinterpret_cast<size_t>(&reinterpret_cast<Rep*>(16)->elements[0]) - 16;
+
+namespace internal {
+template <typename It> class RepeatedPtrIterator;
+template <typename It, typename VoidPtr> class RepeatedPtrOverPtrsIterator;
+}  // namespace internal
+
+namespace internal {
+
+// This is a helper template to copy an array of elements efficiently when
+// they have a trivial copy constructor, and correctly otherwise. This really
+// shouldn't be necessary, but our compiler doesn't optimize std::copy very
+// effectively.
+template <typename Element,
+          bool HasTrivialCopy = has_trivial_copy<Element>::value>
+struct ElementCopier {
+  void operator()(Element* to, const Element* from, int array_size);
+};
+
+}  // namespace internal
+
+namespace internal {
+
+// type-traits helper for RepeatedPtrFieldBase: we only want to invoke
+// arena-related "copy if on different arena" behavior if the necessary methods
+// exist on the contained type. In particular, we rely on MergeFrom() existing
+// as a general proxy for the fact that a copy will work, and we also provide a
+// specific override for string*.
+template <typename T>
+struct TypeImplementsMergeBehavior {
+  typedef char HasMerge;
+  typedef long HasNoMerge;
+
+  // We accept either of:
+  //   - void MergeFrom(const T& other)
+  //   - bool MergeFrom(const T& other)
+  //
+  // We mangle these names a bit to avoid compatibility issues in 'unclean'
+  // include environments that may have, e.g., "#define test ..." (yes, this
+  // exists).
+  template <typename U, typename RetType, RetType (U::*)(const T& arg)>
+  struct CheckType;
+  template <typename U> static HasMerge Check(
+      CheckType<U, void, &U::MergeFrom>*);
+  template <typename U> static HasMerge Check(
+      CheckType<U, bool, &U::MergeFrom>*);
+  template <typename U> static HasNoMerge Check(...);
+
+  // Resolves to either google::protobuf::internal::true_type or
+  // google::protobuf::internal::false_type.
+  typedef google::protobuf::internal::integral_constant<bool,
+      (sizeof(Check<T>(0)) == sizeof(HasMerge))> type;
+};
+
+template<>
+struct TypeImplementsMergeBehavior< ::std::string > {
+  typedef google::protobuf::internal::true_type type;
+};
+
+// This is the common base class for RepeatedPtrFields. It deals only in void*
+// pointers. Users should not use this interface directly.
+//
+// The methods of this interface correspond to the methods of RepeatedPtrField,
+// but may have a template argument called TypeHandler. Its signature is:
+//   class TypeHandler {
+//    public:
+//     typedef MyType Type;
+//     static Type* New();
+//     static void Delete(Type*);
+//     static void Clear(Type*);
+//     static void Merge(const Type& from, Type* to);
+//
+//     // Only needs to be implemented if SpaceUsedExcludingSelf() is called.
+// static int SpaceUsed(const Type&); +// }; +class LIBPROTOBUF_EXPORT RepeatedPtrFieldBase { + protected: + // The reflection implementation needs to call protected methods directly, + // reinterpreting pointers as being to Message instead of a specific Message + // subclass. + friend class GeneratedMessageReflection; + + // ExtensionSet stores repeated message extensions as + // RepeatedPtrField, but non-lite ExtensionSets need to + // implement SpaceUsed(), and thus need to call SpaceUsedExcludingSelf() + // reinterpreting MessageLite as Message. ExtensionSet also needs to make + // use of AddFromCleared(), which is not part of the public interface. + friend class ExtensionSet; + + // The MapFieldBase implementation needs to call protected methods directly, + // reinterpreting pointers as being to Message instead of a specific Message + // subclass. + friend class MapFieldBase; + + // To parse directly into a proto2 generated class, the upb class GMR_Handlers + // needs to be able to modify a RepeatedPtrFieldBase directly. + friend class upb::google_opensource::GMR_Handlers; + + RepeatedPtrFieldBase(); + explicit RepeatedPtrFieldBase(::google::protobuf::Arena* arena); + ~RepeatedPtrFieldBase() {} + + // Must be called from destructor. + template + void Destroy(); + + bool empty() const; + int size() const; + + template + const typename TypeHandler::Type& Get(int index) const; + template + typename TypeHandler::Type* Mutable(int index); + template + void Delete(int index); + template + typename TypeHandler::Type* Add(typename TypeHandler::Type* prototype = NULL); + + template + void RemoveLast(); + template + void Clear(); + template + void MergeFrom(const RepeatedPtrFieldBase& other); + template + void CopyFrom(const RepeatedPtrFieldBase& other); + + void CloseGap(int start, int num); + + void Reserve(int new_size); + + int Capacity() const; + + // Used for constructing iterators. + void* const* raw_data() const; + void** raw_mutable_data() const; + + template + typename TypeHandler::Type** mutable_data(); + template + const typename TypeHandler::Type* const* data() const; + + template + GOOGLE_ATTRIBUTE_ALWAYS_INLINE void Swap(RepeatedPtrFieldBase* other); + + void SwapElements(int index1, int index2); + + template + int SpaceUsedExcludingSelf() const; + + + // Advanced memory management -------------------------------------- + + // Like Add(), but if there are no cleared objects to use, returns NULL. + template + typename TypeHandler::Type* AddFromCleared(); + + template + void AddAllocated(typename TypeHandler::Type* value) { + typename TypeImplementsMergeBehavior::type t; + AddAllocatedInternal(value, t); + } + + template + void UnsafeArenaAddAllocated(typename TypeHandler::Type* value); + + template + typename TypeHandler::Type* ReleaseLast() { + typename TypeImplementsMergeBehavior::type t; + return ReleaseLastInternal(t); + } + + // Releases last element and returns it, but does not do out-of-arena copy. + // And just returns the raw pointer to the contained element in the arena. 
+ template + typename TypeHandler::Type* UnsafeArenaReleaseLast(); + + int ClearedCount() const; + template + void AddCleared(typename TypeHandler::Type* value); + template + typename TypeHandler::Type* ReleaseCleared(); + + protected: + inline void InternalSwap(RepeatedPtrFieldBase* other); + + template + void AddAllocatedInternal(typename TypeHandler::Type* value, + google::protobuf::internal::true_type); + template + void AddAllocatedInternal(typename TypeHandler::Type* value, + google::protobuf::internal::false_type); + + template GOOGLE_ATTRIBUTE_NOINLINE + void AddAllocatedSlowWithCopy(typename TypeHandler::Type* value, + Arena* value_arena, + Arena* my_arena); + template GOOGLE_ATTRIBUTE_NOINLINE + void AddAllocatedSlowWithoutCopy(typename TypeHandler::Type* value); + + template + typename TypeHandler::Type* ReleaseLastInternal(google::protobuf::internal::true_type); + template + typename TypeHandler::Type* ReleaseLastInternal(google::protobuf::internal::false_type); + + template GOOGLE_ATTRIBUTE_NOINLINE + void SwapFallback(RepeatedPtrFieldBase* other); + + inline Arena* GetArenaNoVirtual() const { + return arena_; + } + + private: + static const int kInitialSize = 0; + // A few notes on internal representation: + // + // We use an indirected approach, with struct Rep, to keep + // sizeof(RepeatedPtrFieldBase) equivalent to what it was before arena support + // was added, namely, 3 8-byte machine words on x86-64. An instance of Rep is + // allocated only when the repeated field is non-empty, and it is a + // dynamically-sized struct (the header is directly followed by elements[]). + // We place arena_ and current_size_ directly in the object to avoid cache + // misses due to the indirection, because these fields are checked frequently. + // Placing all fields directly in the RepeatedPtrFieldBase instance costs + // significant performance for memory-sensitive workloads. + Arena* arena_; + int current_size_; + int total_size_; + struct Rep { + int allocated_size; + void* elements[1]; + }; + static const size_t kRepHeaderSize = sizeof(Rep) - sizeof(void*); + // Contains arena ptr and the elements array. We also keep the invariant that + // if rep_ is NULL, then arena is NULL. + Rep* rep_; + + template + static inline typename TypeHandler::Type* cast(void* element) { + return reinterpret_cast(element); + } + template + static inline const typename TypeHandler::Type* cast(const void* element) { + return reinterpret_cast(element); + } + + // Non-templated inner function to avoid code duplication. Takes a function + // pointer to the type-specific (templated) inner allocate/merge loop. + void MergeFromInternal( + const RepeatedPtrFieldBase& other, + void (RepeatedPtrFieldBase::*inner_loop)(void**, void**, int, int)); + + template + void MergeFromInnerLoop( + void** our_elems, void** other_elems, int length, int already_allocated); + + // Internal helper: extend array space if necessary to contain |extend_amount| + // more elements, and return a pointer to the element immediately following + // the old list of elements. This interface factors out common behavior from + // Reserve() and MergeFrom() to reduce code size. |extend_amount| must be > 0. 
+ void** InternalExtend(int extend_amount); + + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(RepeatedPtrFieldBase); +}; + +template +class GenericTypeHandler { + public: + typedef GenericType Type; + static inline GenericType* New(Arena* arena) { + return ::google::protobuf::Arena::CreateMaybeMessage( + arena, static_cast(0)); + } + // We force NewFromPrototype() to be non-inline to reduce code size: + // else, several other methods get inlined copies of message types' + // constructors. + GOOGLE_ATTRIBUTE_NOINLINE static GenericType* NewFromPrototype( + const GenericType* prototype, ::google::protobuf::Arena* arena = NULL); + static inline void Delete(GenericType* value, Arena* arena) { + if (arena == NULL) { + delete value; + } + } + static inline ::google::protobuf::Arena* GetArena(GenericType* value) { + return ::google::protobuf::Arena::GetArena(value); + } + static inline void* GetMaybeArenaPointer(GenericType* value) { + return ::google::protobuf::Arena::GetArena(value); + } + + static inline void Clear(GenericType* value) { value->Clear(); } + GOOGLE_ATTRIBUTE_NOINLINE static void Merge(const GenericType& from, + GenericType* to); + static inline int SpaceUsed(const GenericType& value) { + return value.SpaceUsed(); + } + static inline const Type& default_instance() { + return Type::default_instance(); + } +}; + +template +GenericType* GenericTypeHandler::NewFromPrototype( + const GenericType* /* prototype */, ::google::protobuf::Arena* arena) { + return New(arena); +} +template +void GenericTypeHandler::Merge(const GenericType& from, + GenericType* to) { + to->MergeFrom(from); +} + +// NewFromPrototype() and Merge() cannot be defined here; if they're declared +// inline the compiler will complain about not matching GOOGLE_ATTRIBUTE_NOINLINE +// above, and if not, compilation will result in multiple definitions. These +// are therefore declared as specializations here and defined in +// message_lite.cc. +template<> +MessageLite* GenericTypeHandler::NewFromPrototype( + const MessageLite* prototype, google::protobuf::Arena* arena); +template<> +inline google::protobuf::Arena* GenericTypeHandler::GetArena( + MessageLite* value) { + return value->GetArena(); +} +template<> +inline void* GenericTypeHandler::GetMaybeArenaPointer( + MessageLite* value) { + return value->GetMaybeArenaPointer(); +} +template <> +void GenericTypeHandler::Merge(const MessageLite& from, + MessageLite* to); +template<> +inline void GenericTypeHandler::Clear(string* value) { + value->clear(); +} +template<> +void GenericTypeHandler::Merge(const string& from, + string* to); + +// Declarations of the specialization as we cannot define them here, as the +// header that defines ProtocolMessage depends on types defined in this header. +#define DECLARE_SPECIALIZATIONS_FOR_BASE_PROTO_TYPES(TypeName) \ + template<> \ + TypeName* GenericTypeHandler::NewFromPrototype( \ + const TypeName* prototype, google::protobuf::Arena* arena); \ + template<> \ + google::protobuf::Arena* GenericTypeHandler::GetArena( \ + TypeName* value); \ + template<> \ + void* GenericTypeHandler::GetMaybeArenaPointer( \ + TypeName* value); + +// Message specialization bodies defined in message.cc. This split is necessary +// to allow proto2-lite (which includes this header) to be independent of +// Message. 
+DECLARE_SPECIALIZATIONS_FOR_BASE_PROTO_TYPES(Message) + + +#undef DECLARE_SPECIALIZATIONS_FOR_BASE_PROTO_TYPES + +template <> +inline const MessageLite& GenericTypeHandler::default_instance() { + // Yes, the behavior of the code is undefined, but this function is only + // called when we're already deep into the world of undefined, because the + // caller called Get(index) out of bounds. + MessageLite* null = NULL; + return *null; +} + +template <> +inline const Message& GenericTypeHandler::default_instance() { + // Yes, the behavior of the code is undefined, but this function is only + // called when we're already deep into the world of undefined, because the + // caller called Get(index) out of bounds. + Message* null = NULL; + return *null; +} + + +// HACK: If a class is declared as DLL-exported in MSVC, it insists on +// generating copies of all its methods -- even inline ones -- to include +// in the DLL. But SpaceUsed() calls StringSpaceUsedExcludingSelf() which +// isn't in the lite library, therefore the lite library cannot link if +// StringTypeHandler is exported. So, we factor out StringTypeHandlerBase, +// export that, then make StringTypeHandler be a subclass which is NOT +// exported. +// TODO(kenton): Now that StringSpaceUsedExcludingSelf() is in the lite +// library, this can be cleaned up. +class LIBPROTOBUF_EXPORT StringTypeHandlerBase { + public: + typedef string Type; + + static inline string* New(Arena* arena) { + return Arena::Create(arena); + } + static inline string* NewFromPrototype(const string*, + ::google::protobuf::Arena* arena) { + return New(arena); + } + static inline ::google::protobuf::Arena* GetArena(string*) { + return NULL; + } + static inline void* GetMaybeArenaPointer(string* /* value */) { + return NULL; + } + static inline void Delete(string* value, Arena* arena) { + if (arena == NULL) { + delete value; + } + } + static inline void Clear(string* value) { value->clear(); } + static inline void Merge(const string& from, string* to) { *to = from; } + static inline const Type& default_instance() { + return ::google::protobuf::internal::GetEmptyString(); + } +}; + +class StringTypeHandler : public StringTypeHandlerBase { + public: + static int SpaceUsed(const string& value) { + return static_cast(sizeof(value)) + StringSpaceUsedExcludingSelf(value); + } +}; + + +} // namespace internal + +// RepeatedPtrField is like RepeatedField, but used for repeated strings or +// Messages. +template +class RepeatedPtrField : public internal::RepeatedPtrFieldBase { + public: + RepeatedPtrField(); + explicit RepeatedPtrField(::google::protobuf::Arena* arena); + + RepeatedPtrField(const RepeatedPtrField& other); + template + RepeatedPtrField(Iter begin, const Iter& end); + ~RepeatedPtrField(); + + RepeatedPtrField& operator=(const RepeatedPtrField& other); + + bool empty() const; + int size() const; + + const Element& Get(int index) const; + Element* Mutable(int index); + Element* Add(); + + const Element& operator[](int index) const { return Get(index); } + Element& operator[](int index) { return *Mutable(index); } + + // Remove the last element in the array. + // Ownership of the element is retained by the array. + void RemoveLast(); + + // Delete elements with indices in the range [start .. start+num-1]. + // Caution: implementation moves all elements with indices [start+num .. ]. + // Calling this routine inside a loop can cause quadratic behavior. 
+ void DeleteSubrange(int start, int num); + + void Clear(); + void MergeFrom(const RepeatedPtrField& other); + void UnsafeMergeFrom(const RepeatedPtrField& other) { MergeFrom(other); } + void CopyFrom(const RepeatedPtrField& other); + + // Reserve space to expand the field to at least the given size. This only + // resizes the pointer array; it doesn't allocate any objects. If the + // array is grown, it will always be at least doubled in size. + void Reserve(int new_size); + + int Capacity() const; + + // Gets the underlying array. This pointer is possibly invalidated by + // any add or remove operation. + Element** mutable_data(); + const Element* const* data() const; + + // Swap entire contents with "other". If they are on separate arenas, then + // copies data. + void Swap(RepeatedPtrField* other); + + // Swap entire contents with "other". Caller should guarantee that either both + // fields are on the same arena or both are on the heap. Swapping between + // different arenas with this function is disallowed and is caught via + // GOOGLE_DCHECK. + void UnsafeArenaSwap(RepeatedPtrField* other); + + // Swap two elements. + void SwapElements(int index1, int index2); + + // STL-like iterator support + typedef internal::RepeatedPtrIterator iterator; + typedef internal::RepeatedPtrIterator const_iterator; + typedef Element value_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef int size_type; + typedef ptrdiff_t difference_type; + + iterator begin(); + const_iterator begin() const; + const_iterator cbegin() const; + iterator end(); + const_iterator end() const; + const_iterator cend() const; + + // Reverse iterator support + typedef std::reverse_iterator const_reverse_iterator; + typedef std::reverse_iterator reverse_iterator; + reverse_iterator rbegin() { + return reverse_iterator(end()); + } + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + reverse_iterator rend() { + return reverse_iterator(begin()); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + // Custom STL-like iterator that iterates over and returns the underlying + // pointers to Element rather than Element itself. + typedef internal::RepeatedPtrOverPtrsIterator + pointer_iterator; + typedef internal::RepeatedPtrOverPtrsIterator + const_pointer_iterator; + pointer_iterator pointer_begin(); + const_pointer_iterator pointer_begin() const; + pointer_iterator pointer_end(); + const_pointer_iterator pointer_end() const; + + // Returns (an estimate of) the number of bytes used by the repeated field, + // excluding sizeof(*this). + int SpaceUsedExcludingSelf() const; + + // Advanced memory management -------------------------------------- + // When hardcore memory management becomes necessary -- as it sometimes + // does here at Google -- the following methods may be useful. + + // Add an already-allocated object, passing ownership to the + // RepeatedPtrField. + // + // Note that some special behavior occurs with respect to arenas: + // + // (i) if this field holds submessages, the new submessage will be copied if + // the original is in an arena and this RepeatedPtrField is either in a + // different arena, or on the heap. + // (ii) if this field holds strings, the passed-in string *must* be + // heap-allocated, not arena-allocated. There is no way to dynamically check + // this at runtime, so User Beware. 
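+  // (Illustrative sketch, not upstream text; MyMessage is a hypothetical
+  // generated type. The simple heap-to-heap case described above:
+  //   RepeatedPtrField<MyMessage> field;   // field lives on the heap
+  //   MyMessage* m = new MyMessage;
+  //   field.AddAllocated(m);               // field now owns and deletes m.)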
+ void AddAllocated(Element* value); + + // Remove the last element and return it, passing ownership to the caller. + // Requires: size() > 0 + // + // If this RepeatedPtrField is on an arena, an object copy is required to pass + // ownership back to the user (for compatible semantics). Use + // UnsafeArenaReleaseLast() if this behavior is undesired. + Element* ReleaseLast(); + + // Add an already-allocated object, skipping arena-ownership checks. The user + // must guarantee that the given object is in the same arena as this + // RepeatedPtrField. + // It is also useful in legacy code that uses temporary ownership to avoid + // copies. Example: + // RepeatedPtrField temp_field; + // temp_field.AddAllocated(new T); + // ... // Do something with temp_field + // temp_field.ExtractSubrange(0, temp_field.size(), NULL); + // If you put temp_field on the arena this fails, because the ownership + // transfers to the arena at the "AddAllocated" call and is not released + // anymore causing a double delete. UnsafeArenaAddAllocated prevents this. + void UnsafeArenaAddAllocated(Element* value); + + // Remove the last element and return it. Works only when operating on an + // arena. The returned pointer is to the original object in the arena, hence + // has the arena's lifetime. + // Requires: current_size_ > 0 + Element* UnsafeArenaReleaseLast(); + + // Extract elements with indices in the range "[start .. start+num-1]". + // The caller assumes ownership of the extracted elements and is responsible + // for deleting them when they are no longer needed. + // If "elements" is non-NULL, then pointers to the extracted elements + // are stored in "elements[0 .. num-1]" for the convenience of the caller. + // If "elements" is NULL, then the caller must use some other mechanism + // to perform any further operations (like deletion) on these elements. + // Caution: implementation also moves elements with indices [start+num ..]. + // Calling this routine inside a loop can cause quadratic behavior. + // + // Memory copying behavior is identical to ReleaseLast(), described above: if + // this RepeatedPtrField is on an arena, an object copy is performed for each + // returned element, so that all returned element pointers are to + // heap-allocated copies. If this copy is not desired, the user should call + // UnsafeArenaExtractSubrange(). + void ExtractSubrange(int start, int num, Element** elements); + + // Identical to ExtractSubrange() described above, except that when this + // repeated field is on an arena, no object copies are performed. Instead, the + // raw object pointers are returned. Thus, if on an arena, the returned + // objects must not be freed, because they will not be heap-allocated objects. + void UnsafeArenaExtractSubrange(int start, int num, Element** elements); + + // When elements are removed by calls to RemoveLast() or Clear(), they + // are not actually freed. Instead, they are cleared and kept so that + // they can be reused later. This can save lots of CPU time when + // repeatedly reusing a protocol message for similar purposes. + // + // Hardcore programs may choose to manipulate these cleared objects + // to better optimize memory management using the following routines. + + // Get the number of cleared objects that are currently being kept + // around for reuse. + int ClearedCount() const; + // Add an element to the pool of cleared objects, passing ownership to + // the RepeatedPtrField. The element must be cleared prior to calling + // this method. 
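+  // (Clarifying note, not upstream text: an element counts as "cleared" once
+  // its Clear() method has run; see TypeHandler::Clear() above.)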
+ // + // This method cannot be called when the repeated field is on an arena or when + // |value| is; both cases will trigger a GOOGLE_DCHECK-failure. + void AddCleared(Element* value); + // Remove a single element from the cleared pool and return it, passing + // ownership to the caller. The element is guaranteed to be cleared. + // Requires: ClearedCount() > 0 + // + // + // This method cannot be called when the repeated field is on an arena; doing + // so will trigger a GOOGLE_DCHECK-failure. + Element* ReleaseCleared(); + + // Removes the element referenced by position. + // + // Returns an iterator to the element immediately following the removed + // element. + // + // Invalidates all iterators at or after the removed element, including end(). + iterator erase(const_iterator position); + + // Removes the elements in the range [first, last). + // + // Returns an iterator to the element immediately following the removed range. + // + // Invalidates all iterators at or after the removed range, including end(). + iterator erase(const_iterator first, const_iterator last); + + // Gets the arena on which this RepeatedPtrField stores its elements. + ::google::protobuf::Arena* GetArena() const { + return GetArenaNoVirtual(); + } + + protected: + // Note: RepeatedPtrField SHOULD NOT be subclassed by users. We only + // subclass it in one place as a hack for compatibility with proto1. The + // subclass needs to know about TypeHandler in order to call protected + // methods on RepeatedPtrFieldBase. + class TypeHandler; + + // Internal arena accessor expected by helpers in Arena. + inline Arena* GetArenaNoVirtual() const; + + private: + // Implementations for ExtractSubrange(). The copying behavior must be + // included only if the type supports the necessary operations (e.g., + // MergeFrom()), so we must resolve this at compile time. ExtractSubrange() + // uses SFINAE to choose one of the below implementations. + void ExtractSubrangeInternal(int start, int num, Element** elements, + google::protobuf::internal::true_type); + void ExtractSubrangeInternal(int start, int num, Element** elements, + google::protobuf::internal::false_type); + + friend class Arena; + typedef void InternalArenaConstructable_; + +}; + +// implementation ==================================================== + +template +inline RepeatedField::RepeatedField() + : current_size_(0), + total_size_(0), + rep_(NULL) { +} + +template +inline RepeatedField::RepeatedField(Arena* arena) + : current_size_(0), + total_size_(0), + rep_(NULL) { + // In case arena is NULL, then we do not create rep_, as code has an invariant + // `rep_ == NULL then arena == NULL`. + if (arena != NULL) { + rep_ = reinterpret_cast( + ::google::protobuf::Arena::CreateArray(arena, kRepHeaderSize)); + rep_->arena = arena; + } +} + +template +inline RepeatedField::RepeatedField(const RepeatedField& other) + : current_size_(0), + total_size_(0), + rep_(NULL) { + CopyFrom(other); +} + +template +template +RepeatedField::RepeatedField(Iter begin, const Iter& end) + : current_size_(0), + total_size_(0), + rep_(NULL) { + int reserve = internal::CalculateReserve(begin, end); + if (reserve != -1) { + Reserve(reserve); + for (; begin != end; ++begin) { + AddAlreadyReserved(*begin); + } + } else { + for (; begin != end; ++begin) { + Add(*begin); + } + } +} + +template +RepeatedField::~RepeatedField() { + // See explanation in Reserve(): we need to invoke destructors here for the + // case that Element has a non-trivial destructor. 
+ InternalDeallocate(rep_, total_size_); +} + +template +inline RepeatedField& +RepeatedField::operator=(const RepeatedField& other) { + if (this != &other) + CopyFrom(other); + return *this; +} + +template +inline bool RepeatedField::empty() const { + return current_size_ == 0; +} + +template +inline int RepeatedField::size() const { + return current_size_; +} + +template +inline int RepeatedField::Capacity() const { + return total_size_; +} + +template +inline void RepeatedField::AddAlreadyReserved(const Element& value) { + GOOGLE_DCHECK_LT(current_size_, total_size_); + rep_->elements[current_size_++] = value; +} + +template +inline Element* RepeatedField::AddAlreadyReserved() { + GOOGLE_DCHECK_LT(current_size_, total_size_); + return &rep_->elements[current_size_++]; +} + +template +inline void RepeatedField::Resize(int new_size, const Element& value) { + GOOGLE_DCHECK_GE(new_size, 0); + if (new_size > current_size_) { + Reserve(new_size); + std::fill(&rep_->elements[current_size_], + &rep_->elements[new_size], value); + } + current_size_ = new_size; +} + +template +inline const Element& RepeatedField::Get(int index) const { + GOOGLE_DCHECK_GE(index, 0); + GOOGLE_DCHECK_LT(index, current_size_); + return rep_->elements[index]; +} + +template +inline Element* RepeatedField::Mutable(int index) { + GOOGLE_DCHECK_GE(index, 0); + GOOGLE_DCHECK_LT(index, current_size_); + return &rep_->elements[index]; +} + +template +inline void RepeatedField::Set(int index, const Element& value) { + GOOGLE_DCHECK_GE(index, 0); + GOOGLE_DCHECK_LT(index, current_size_); + rep_->elements[index] = value; +} + +template +inline void RepeatedField::Add(const Element& value) { + if (current_size_ == total_size_) Reserve(total_size_ + 1); + rep_->elements[current_size_++] = value; +} + +template +inline Element* RepeatedField::Add() { + if (current_size_ == total_size_) Reserve(total_size_ + 1); + return &rep_->elements[current_size_++]; +} + +template +inline void RepeatedField::RemoveLast() { + GOOGLE_DCHECK_GT(current_size_, 0); + current_size_--; +} + +template +void RepeatedField::ExtractSubrange( + int start, int num, Element* elements) { + GOOGLE_DCHECK_GE(start, 0); + GOOGLE_DCHECK_GE(num, 0); + GOOGLE_DCHECK_LE(start + num, this->current_size_); + + // Save the values of the removed elements if requested. + if (elements != NULL) { + for (int i = 0; i < num; ++i) + elements[i] = this->Get(i + start); + } + + // Slide remaining elements down to fill the gap. 
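+  // (Illustrative example, not upstream text: with elements {10, 20, 30, 40,
+  // 50}, ExtractSubrange(1, 2, out) copies 20 and 30 into out[0..1], slides
+  // 40 and 50 down, and truncates the field to {10, 40, 50}.)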
+ if (num > 0) { + for (int i = start + num; i < this->current_size_; ++i) + this->Set(i - num, this->Get(i)); + this->Truncate(this->current_size_ - num); + } +} + +template +inline void RepeatedField::Clear() { + current_size_ = 0; +} + +template +inline void RepeatedField::UnsafeMergeFrom(const RepeatedField& other) { + if (other.current_size_ != 0) { + Reserve(current_size_ + other.current_size_); + CopyArray(rep_->elements + current_size_, + other.rep_->elements, other.current_size_); + current_size_ += other.current_size_; + } +} + +template +inline void RepeatedField::MergeFrom(const RepeatedField& other) { + GOOGLE_CHECK_NE(&other, this); + UnsafeMergeFrom(other); +} + +template +inline void RepeatedField::CopyFrom(const RepeatedField& other) { + if (&other == this) return; + Clear(); + MergeFrom(other); +} + +template +inline typename RepeatedField::iterator RepeatedField::erase( + const_iterator position) { + return erase(position, position + 1); +} + +template +inline typename RepeatedField::iterator RepeatedField::erase( + const_iterator first, const_iterator last) { + size_type first_offset = first - cbegin(); + if (first != last) { + Truncate(std::copy(last, cend(), begin() + first_offset) - cbegin()); + } + return begin() + first_offset; +} + +template +inline Element* RepeatedField::mutable_data() { + return rep_ ? rep_->elements : NULL; +} + +template +inline const Element* RepeatedField::data() const { + return rep_ ? rep_->elements : NULL; +} + + +template +inline void RepeatedField::InternalSwap(RepeatedField* other) { + std::swap(rep_, other->rep_); + std::swap(current_size_, other->current_size_); + std::swap(total_size_, other->total_size_); +} + +template +void RepeatedField::Swap(RepeatedField* other) { + if (this == other) return; + if (GetArenaNoVirtual() == other->GetArenaNoVirtual()) { + InternalSwap(other); + } else { + RepeatedField temp(other->GetArenaNoVirtual()); + temp.MergeFrom(*this); + CopyFrom(*other); + other->UnsafeArenaSwap(&temp); + } +} + +template +void RepeatedField::UnsafeArenaSwap(RepeatedField* other) { + if (this == other) return; + GOOGLE_DCHECK(GetArenaNoVirtual() == other->GetArenaNoVirtual()); + InternalSwap(other); +} + +template +void RepeatedField::SwapElements(int index1, int index2) { + using std::swap; // enable ADL with fallback + swap(rep_->elements[index1], rep_->elements[index2]); +} + +template +inline typename RepeatedField::iterator +RepeatedField::begin() { + return rep_ ? rep_->elements : NULL; +} +template +inline typename RepeatedField::const_iterator +RepeatedField::begin() const { + return rep_ ? rep_->elements : NULL; +} +template +inline typename RepeatedField::const_iterator +RepeatedField::cbegin() const { + return rep_ ? rep_->elements : NULL; +} +template +inline typename RepeatedField::iterator +RepeatedField::end() { + return rep_ ? rep_->elements + current_size_ : NULL; +} +template +inline typename RepeatedField::const_iterator +RepeatedField::end() const { + return rep_ ? rep_->elements + current_size_ : NULL; +} +template +inline typename RepeatedField::const_iterator +RepeatedField::cend() const { + return rep_ ? rep_->elements + current_size_ : NULL; +} + +template +inline int RepeatedField::SpaceUsedExcludingSelf() const { + return rep_ ? + (total_size_ * sizeof(Element) + kRepHeaderSize) : 0; +} + +// Avoid inlining of Reserve(): new, copy, and delete[] lead to a significant +// amount of code bloat. 
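+// (Illustrative note, not upstream text, assuming the usual
+// kMinRepeatedFieldAllocationSize of 4: capacity grows to
+// max(4, max(total_size_ * 2, new_size)), so repeated Add() on a fresh field
+// yields capacities 4, 8, 16, ...)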
+template +void RepeatedField::Reserve(int new_size) { + if (total_size_ >= new_size) return; + Rep* old_rep = rep_; + Arena* arena = GetArenaNoVirtual(); + new_size = std::max(google::protobuf::internal::kMinRepeatedFieldAllocationSize, + std::max(total_size_ * 2, new_size)); + GOOGLE_CHECK_LE(static_cast(new_size), + (std::numeric_limits::max() - kRepHeaderSize) / + sizeof(Element)) + << "Requested size is too large to fit into size_t."; + size_t bytes = kRepHeaderSize + sizeof(Element) * new_size; + if (arena == NULL) { + rep_ = static_cast(::operator new(bytes)); + } else { + rep_ = reinterpret_cast( + ::google::protobuf::Arena::CreateArray(arena, bytes)); + } + rep_->arena = arena; + int old_total_size = total_size_; + total_size_ = new_size; + // Invoke placement-new on newly allocated elements. We shouldn't have to do + // this, since Element is supposed to be POD, but a previous version of this + // code allocated storage with "new Element[size]" and some code uses + // RepeatedField with non-POD types, relying on constructor invocation. If + // Element has a trivial constructor (e.g., int32), gcc (tested with -O2) + // completely removes this loop because the loop body is empty, so this has no + // effect unless its side-effects are required for correctness. + // Note that we do this before MoveArray() below because Element's copy + // assignment implementation will want an initialized instance first. + Element* e = &rep_->elements[0]; + Element* limit = &rep_->elements[total_size_]; + for (; e < limit; e++) { + new (e) Element(); + } + if (current_size_ > 0) { + MoveArray(rep_->elements, old_rep->elements, current_size_); + } + + // Likewise, we need to invoke destructors on the old array. + InternalDeallocate(old_rep, old_total_size); + +} + +template +inline void RepeatedField::Truncate(int new_size) { + GOOGLE_DCHECK_LE(new_size, current_size_); + if (current_size_ > 0) { + current_size_ = new_size; + } +} + +template +inline void RepeatedField::MoveArray( + Element* to, Element* from, int array_size) { + CopyArray(to, from, array_size); +} + +template +inline void RepeatedField::CopyArray( + Element* to, const Element* from, int array_size) { + internal::ElementCopier()(to, from, array_size); +} + +namespace internal { + +template +void ElementCopier::operator()( + Element* to, const Element* from, int array_size) { + std::copy(from, from + array_size, to); +} + +template +struct ElementCopier { + void operator()(Element* to, const Element* from, int array_size) { + memcpy(to, from, array_size * sizeof(Element)); + } +}; + +} // namespace internal + + +// ------------------------------------------------------------------- + +namespace internal { + +inline RepeatedPtrFieldBase::RepeatedPtrFieldBase() + : arena_(NULL), + current_size_(0), + total_size_(0), + rep_(NULL) { +} + +inline RepeatedPtrFieldBase::RepeatedPtrFieldBase(::google::protobuf::Arena* arena) + : arena_(arena), + current_size_(0), + total_size_(0), + rep_(NULL) { +} + +template +void RepeatedPtrFieldBase::Destroy() { + if (rep_ != NULL && arena_ == NULL) { + int n = rep_->allocated_size; + void* const* elements = rep_->elements; + for (int i = 0; i < n; i++) { + TypeHandler::Delete(cast(elements[i]), NULL); + } +#if defined(__GXX_DELETE_WITH_SIZE__) || defined(__cpp_sized_deallocation) + const size_t size = total_size_ * sizeof(elements[0]) + kRepHeaderSize; + ::operator delete(static_cast(rep_), size); +#else + ::operator delete(static_cast(rep_)); +#endif + } + rep_ = NULL; +} + +template +inline void 
RepeatedPtrFieldBase::Swap(RepeatedPtrFieldBase* other) { + if (other->GetArenaNoVirtual() == GetArenaNoVirtual()) { + InternalSwap(other); + } else { + SwapFallback(other); + } +} + +template +void RepeatedPtrFieldBase::SwapFallback(RepeatedPtrFieldBase* other) { + GOOGLE_DCHECK(other->GetArenaNoVirtual() != GetArenaNoVirtual()); + + // Copy semantics in this case. We try to improve efficiency by placing the + // temporary on |other|'s arena so that messages are copied cross-arena only + // once, not twice. + RepeatedPtrFieldBase temp(other->GetArenaNoVirtual()); + temp.MergeFrom(*this); + this->Clear(); + this->MergeFrom(*other); + other->Clear(); + other->InternalSwap(&temp); + temp.Destroy(); // Frees rep_ if `other` had no arena. +} + +inline bool RepeatedPtrFieldBase::empty() const { + return current_size_ == 0; +} + +inline int RepeatedPtrFieldBase::size() const { + return current_size_; +} + +template +inline const typename TypeHandler::Type& +RepeatedPtrFieldBase::Get(int index) const { + GOOGLE_DCHECK_GE(index, 0); + GOOGLE_DCHECK_LT(index, current_size_); + return *cast(rep_->elements[index]); +} + + +template +inline typename TypeHandler::Type* +RepeatedPtrFieldBase::Mutable(int index) { + GOOGLE_DCHECK_GE(index, 0); + GOOGLE_DCHECK_LT(index, current_size_); + return cast(rep_->elements[index]); +} + +template +inline void RepeatedPtrFieldBase::Delete(int index) { + GOOGLE_DCHECK_GE(index, 0); + GOOGLE_DCHECK_LT(index, current_size_); + TypeHandler::Delete(cast(rep_->elements[index]), arena_); +} + +template +inline typename TypeHandler::Type* RepeatedPtrFieldBase::Add( + typename TypeHandler::Type* prototype) { + if (rep_ != NULL && current_size_ < rep_->allocated_size) { + return cast(rep_->elements[current_size_++]); + } + if (!rep_ || rep_->allocated_size == total_size_) { + Reserve(total_size_ + 1); + } + ++rep_->allocated_size; + typename TypeHandler::Type* result = + TypeHandler::NewFromPrototype(prototype, arena_); + rep_->elements[current_size_++] = result; + return result; +} + +template +inline void RepeatedPtrFieldBase::RemoveLast() { + GOOGLE_DCHECK_GT(current_size_, 0); + TypeHandler::Clear(cast(rep_->elements[--current_size_])); +} + +template +void RepeatedPtrFieldBase::Clear() { + const int n = current_size_; + GOOGLE_DCHECK_GE(n, 0); + if (n > 0) { + void* const* elements = rep_->elements; + int i = 0; + do { + TypeHandler::Clear(cast(elements[i++])); + } while (i < n); + current_size_ = 0; + } +} + +// To avoid unnecessary code duplication and reduce binary size, we use a +// layered approach to implementing MergeFrom(). The toplevel method is +// templated, so we get a small thunk per concrete message type in the binary. +// This calls a shared implementation with most of the logic, passing a function +// pointer to another type-specific piece of code that calls the object-allocate +// and merge handlers. +template +inline void RepeatedPtrFieldBase::MergeFrom(const RepeatedPtrFieldBase& other) { + GOOGLE_DCHECK_NE(&other, this); + if (other.current_size_ == 0) return; + MergeFromInternal( + other, &RepeatedPtrFieldBase::MergeFromInnerLoop); +} + +inline void RepeatedPtrFieldBase::MergeFromInternal( + const RepeatedPtrFieldBase& other, + void (RepeatedPtrFieldBase::*inner_loop)(void**, void**, int, int)) { + // Note: wrapper has already guaranteed that other.rep_ != NULL here. 
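+  // (Clarifying note, not upstream text: InternalExtend(n) grows the pointer
+  // array if needed and returns &rep_->elements[current_size_], i.e. the
+  // first slot the inner loop below may write to.)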
+ int other_size = other.current_size_; + void** other_elements = other.rep_->elements; + void** new_elements = InternalExtend(other_size); + int allocated_elems = rep_->allocated_size - current_size_; + (this->*inner_loop)(new_elements, other_elements, + other_size, allocated_elems); + current_size_ += other_size; + if (rep_->allocated_size < current_size_) { + rep_->allocated_size = current_size_; + } +} + +// Merges other_elems to our_elems. +template +void RepeatedPtrFieldBase::MergeFromInnerLoop( + void** our_elems, void** other_elems, int length, int already_allocated) { + // Split into two loops, over ranges [0, allocated) and [allocated, length), + // to avoid a branch within the loop. + for (int i = 0; i < already_allocated && i < length; i++) { + // Already allocated: use existing element. + typename TypeHandler::Type* other_elem = + reinterpret_cast(other_elems[i]); + typename TypeHandler::Type* new_elem = + reinterpret_cast(our_elems[i]); + TypeHandler::Merge(*other_elem, new_elem); + } + Arena* arena = GetArenaNoVirtual(); + for (int i = already_allocated; i < length; i++) { + // Not allocated: alloc a new element first, then merge it. + typename TypeHandler::Type* other_elem = + reinterpret_cast(other_elems[i]); + typename TypeHandler::Type* new_elem = + TypeHandler::NewFromPrototype(other_elem, arena); + TypeHandler::Merge(*other_elem, new_elem); + our_elems[i] = new_elem; + } +} + +template +inline void RepeatedPtrFieldBase::CopyFrom(const RepeatedPtrFieldBase& other) { + if (&other == this) return; + RepeatedPtrFieldBase::Clear(); + RepeatedPtrFieldBase::MergeFrom(other); +} + +inline int RepeatedPtrFieldBase::Capacity() const { + return total_size_; +} + +inline void* const* RepeatedPtrFieldBase::raw_data() const { + return rep_ ? rep_->elements : NULL; +} + +inline void** RepeatedPtrFieldBase::raw_mutable_data() const { + return rep_ ? const_cast(rep_->elements) : NULL; +} + +template +inline typename TypeHandler::Type** RepeatedPtrFieldBase::mutable_data() { + // TODO(kenton): Breaks C++ aliasing rules. We should probably remove this + // method entirely. + return reinterpret_cast(raw_mutable_data()); +} + +template +inline const typename TypeHandler::Type* const* +RepeatedPtrFieldBase::data() const { + // TODO(kenton): Breaks C++ aliasing rules. We should probably remove this + // method entirely. + return reinterpret_cast(raw_data()); +} + +inline void RepeatedPtrFieldBase::SwapElements(int index1, int index2) { + using std::swap; // enable ADL with fallback + swap(rep_->elements[index1], rep_->elements[index2]); +} + +template +inline int RepeatedPtrFieldBase::SpaceUsedExcludingSelf() const { + int allocated_bytes = total_size_ * sizeof(void*); + if (rep_ != NULL) { + for (int i = 0; i < rep_->allocated_size; ++i) { + allocated_bytes += TypeHandler::SpaceUsed( + *cast(rep_->elements[i])); + } + allocated_bytes += kRepHeaderSize; + } + return allocated_bytes; +} + +template +inline typename TypeHandler::Type* RepeatedPtrFieldBase::AddFromCleared() { + if (rep_ != NULL && current_size_ < rep_->allocated_size) { + return cast(rep_->elements[current_size_++]); + } else { + return NULL; + } +} + +// AddAllocated version that implements arena-safe copying behavior. 
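+// (Clarifying note, not upstream text: the true_type/false_type overloads
+// below are presumably selected at compile time by a trait such as
+// TypeImplementsMergeBehavior, the mechanism ExtractSubrange() uses further
+// down; message types take this arena-aware path, while types without
+// MergeFrom() fall through to the unchecked version.)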
+template +void RepeatedPtrFieldBase::AddAllocatedInternal( + typename TypeHandler::Type* value, + google::protobuf::internal::true_type) { + Arena* element_arena = reinterpret_cast( + TypeHandler::GetMaybeArenaPointer(value)); + Arena* arena = GetArenaNoVirtual(); + if (arena == element_arena && rep_ && + rep_->allocated_size < total_size_) { + // Fast path: underlying arena representation (tagged pointer) is equal to + // our arena pointer, and we can add to array without resizing it (at least + // one slot that is not allocated). + void** elems = rep_->elements; + if (current_size_ < rep_->allocated_size) { + // Make space at [current] by moving first allocated element to end of + // allocated list. + elems[rep_->allocated_size] = elems[current_size_]; + } + elems[current_size_] = value; + current_size_ = current_size_ + 1; + rep_->allocated_size = rep_->allocated_size + 1; + return; + } else { + AddAllocatedSlowWithCopy( + value, TypeHandler::GetArena(value), arena); + } +} + +// Slowpath handles all cases, copying if necessary. +template +void RepeatedPtrFieldBase::AddAllocatedSlowWithCopy( + // Pass value_arena and my_arena to avoid duplicate virtual call (value) or + // load (mine). + typename TypeHandler::Type* value, Arena* value_arena, Arena* my_arena) { + // Ensure that either the value is in the same arena, or if not, we do the + // appropriate thing: Own() it (if it's on heap and we're in an arena) or copy + // it to our arena/heap (otherwise). + if (my_arena != NULL && value_arena == NULL) { + my_arena->Own(value); + } else if (my_arena != value_arena) { + typename TypeHandler::Type* new_value = + TypeHandler::NewFromPrototype(value, my_arena); + TypeHandler::Merge(*value, new_value); + TypeHandler::Delete(value, value_arena); + value = new_value; + } + + UnsafeArenaAddAllocated(value); +} + +// AddAllocated version that does not implement arena-safe copying behavior. +template +void RepeatedPtrFieldBase::AddAllocatedInternal( + typename TypeHandler::Type* value, + google::protobuf::internal::false_type) { + if (rep_ && rep_->allocated_size < total_size_) { + // Fast path: underlying arena representation (tagged pointer) is equal to + // our arena pointer, and we can add to array without resizing it (at least + // one slot that is not allocated). + void** elems = rep_->elements; + if (current_size_ < rep_->allocated_size) { + // Make space at [current] by moving first allocated element to end of + // allocated list. + elems[rep_->allocated_size] = elems[current_size_]; + } + elems[current_size_] = value; + current_size_ = current_size_ + 1; + ++rep_->allocated_size; + return; + } else { + UnsafeArenaAddAllocated(value); + } +} + +template +void RepeatedPtrFieldBase::UnsafeArenaAddAllocated( + typename TypeHandler::Type* value) { + // Make room for the new pointer. + if (!rep_ || current_size_ == total_size_) { + // The array is completely full with no cleared objects, so grow it. + Reserve(total_size_ + 1); + ++rep_->allocated_size; + } else if (rep_->allocated_size == total_size_) { + // There is no more space in the pointer array because it contains some + // cleared objects awaiting reuse. We don't want to grow the array in this + // case because otherwise a loop calling AddAllocated() followed by Clear() + // would leak memory. + TypeHandler::Delete( + cast(rep_->elements[current_size_]), arena_); + } else if (current_size_ < rep_->allocated_size) { + // We have some cleared objects. 
We don't care about their order, so we
+    // can just move the first one to the end to make space.
+    rep_->elements[rep_->allocated_size] = rep_->elements[current_size_];
+    ++rep_->allocated_size;
+  } else {
+    // There are no cleared objects.
+    ++rep_->allocated_size;
+  }
+
+  rep_->elements[current_size_++] = value;
+}
+
+// ReleaseLast() for types that implement merge/copy behavior.
+template <typename TypeHandler>
+inline typename TypeHandler::Type*
+RepeatedPtrFieldBase::ReleaseLastInternal(google::protobuf::internal::true_type) {
+  // First, release an element.
+  typename TypeHandler::Type* result = UnsafeArenaReleaseLast<TypeHandler>();
+  // Now perform a copy if we're on an arena.
+  Arena* arena = GetArenaNoVirtual();
+  if (arena == NULL) {
+    return result;
+  } else {
+    typename TypeHandler::Type* new_result =
+        TypeHandler::NewFromPrototype(result, NULL);
+    TypeHandler::Merge(*result, new_result);
+    return new_result;
+  }
+}
+
+// ReleaseLast() for types that *do not* implement merge/copy behavior -- this
+// is the same as UnsafeArenaReleaseLast(). Note that we GOOGLE_DCHECK-fail if
+// we're on an arena, since the user really should implement the copy operation
+// in this case.
+template <typename TypeHandler>
+inline typename TypeHandler::Type*
+RepeatedPtrFieldBase::ReleaseLastInternal(google::protobuf::internal::false_type) {
+  GOOGLE_DCHECK(GetArenaNoVirtual() == NULL)
+      << "ReleaseLast() called on a RepeatedPtrField that is on an arena, "
+      << "with a type that does not implement MergeFrom. This is unsafe; "
+      << "please implement MergeFrom for your type.";
+  return UnsafeArenaReleaseLast<TypeHandler>();
+}
+
+template <typename TypeHandler>
+inline typename TypeHandler::Type*
+RepeatedPtrFieldBase::UnsafeArenaReleaseLast() {
+  GOOGLE_DCHECK_GT(current_size_, 0);
+  typename TypeHandler::Type* result =
+      cast<TypeHandler>(rep_->elements[--current_size_]);
+  --rep_->allocated_size;
+  if (current_size_ < rep_->allocated_size) {
+    // There are cleared elements on the end; replace the removed element
+    // with the last allocated element.
+    rep_->elements[current_size_] = rep_->elements[rep_->allocated_size];
+  }
+  return result;
+}
+
+inline int RepeatedPtrFieldBase::ClearedCount() const {
+  return rep_ ?
(rep_->allocated_size - current_size_) : 0; +} + +template +inline void RepeatedPtrFieldBase::AddCleared( + typename TypeHandler::Type* value) { + GOOGLE_DCHECK(GetArenaNoVirtual() == NULL) + << "AddCleared() can only be used on a RepeatedPtrField not on an arena."; + GOOGLE_DCHECK(TypeHandler::GetArena(value) == NULL) + << "AddCleared() can only accept values not on an arena."; + if (!rep_ || rep_->allocated_size == total_size_) { + Reserve(total_size_ + 1); + } + rep_->elements[rep_->allocated_size++] = value; +} + +template +inline typename TypeHandler::Type* RepeatedPtrFieldBase::ReleaseCleared() { + GOOGLE_DCHECK(GetArenaNoVirtual() == NULL) + << "ReleaseCleared() can only be used on a RepeatedPtrField not on " + << "an arena."; + GOOGLE_DCHECK(GetArenaNoVirtual() == NULL); + GOOGLE_DCHECK(rep_ != NULL); + GOOGLE_DCHECK_GT(rep_->allocated_size, current_size_); + return cast(rep_->elements[--rep_->allocated_size]); +} + +} // namespace internal + +// ------------------------------------------------------------------- + +template +class RepeatedPtrField::TypeHandler + : public internal::GenericTypeHandler { +}; + +template <> +class RepeatedPtrField::TypeHandler + : public internal::StringTypeHandler { +}; + + +template +inline RepeatedPtrField::RepeatedPtrField() + : RepeatedPtrFieldBase() {} + +template +inline RepeatedPtrField::RepeatedPtrField(::google::protobuf::Arena* arena) : + RepeatedPtrFieldBase(arena) {} + +template +inline RepeatedPtrField::RepeatedPtrField( + const RepeatedPtrField& other) + : RepeatedPtrFieldBase() { + CopyFrom(other); +} + +template +template +inline RepeatedPtrField::RepeatedPtrField( + Iter begin, const Iter& end) { + int reserve = internal::CalculateReserve(begin, end); + if (reserve != -1) { + Reserve(reserve); + } + for (; begin != end; ++begin) { + *Add() = *begin; + } +} + +template +RepeatedPtrField::~RepeatedPtrField() { + Destroy(); +} + +template +inline RepeatedPtrField& RepeatedPtrField::operator=( + const RepeatedPtrField& other) { + if (this != &other) + CopyFrom(other); + return *this; +} + +template +inline bool RepeatedPtrField::empty() const { + return RepeatedPtrFieldBase::empty(); +} + +template +inline int RepeatedPtrField::size() const { + return RepeatedPtrFieldBase::size(); +} + +template +inline const Element& RepeatedPtrField::Get(int index) const { + return RepeatedPtrFieldBase::Get(index); +} + + +template +inline Element* RepeatedPtrField::Mutable(int index) { + return RepeatedPtrFieldBase::Mutable(index); +} + +template +inline Element* RepeatedPtrField::Add() { + return RepeatedPtrFieldBase::Add(); +} + +template +inline void RepeatedPtrField::RemoveLast() { + RepeatedPtrFieldBase::RemoveLast(); +} + +template +inline void RepeatedPtrField::DeleteSubrange(int start, int num) { + GOOGLE_DCHECK_GE(start, 0); + GOOGLE_DCHECK_GE(num, 0); + GOOGLE_DCHECK_LE(start + num, size()); + for (int i = 0; i < num; ++i) { + RepeatedPtrFieldBase::Delete(start + i); + } + ExtractSubrange(start, num, NULL); +} + +template +inline void RepeatedPtrField::ExtractSubrange( + int start, int num, Element** elements) { + typename internal::TypeImplementsMergeBehavior< + typename TypeHandler::Type>::type t; + ExtractSubrangeInternal(start, num, elements, t); +} + +// ExtractSubrange() implementation for types that implement merge/copy +// behavior. 
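+// (Illustrative sketch, not upstream text; MyMessage is a hypothetical
+// generated type. Caller-side view of the contract implemented below:
+//   RepeatedPtrField<MyMessage> field;   // assume size() >= 3
+//   MyMessage* out[2];
+//   field.ExtractSubrange(1, 2, out);    // caller now owns out[0] and out[1]
+//   delete out[0]; delete out[1];        // heap copies even when on an arena.)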
+template <typename Element>
+inline void RepeatedPtrField<Element>::ExtractSubrangeInternal(
+    int start, int num, Element** elements, google::protobuf::internal::true_type) {
+  GOOGLE_DCHECK_GE(start, 0);
+  GOOGLE_DCHECK_GE(num, 0);
+  GOOGLE_DCHECK_LE(start + num, size());
+
+  if (num > 0) {
+    // Save the values of the removed elements if requested.
+    if (elements != NULL) {
+      if (GetArenaNoVirtual() != NULL) {
+        // If we're on an arena, we perform a copy for each element so that the
+        // returned elements are heap-allocated.
+        for (int i = 0; i < num; ++i) {
+          Element* element = RepeatedPtrFieldBase::
+              Mutable<TypeHandler>(i + start);
+          typename TypeHandler::Type* new_value =
+              TypeHandler::NewFromPrototype(element, NULL);
+          TypeHandler::Merge(*element, new_value);
+          elements[i] = new_value;
+        }
+      } else {
+        for (int i = 0; i < num; ++i) {
+          elements[i] = RepeatedPtrFieldBase::Mutable<TypeHandler>(i + start);
+        }
+      }
+    }
+    CloseGap(start, num);
+  }
+}
+
+// ExtractSubrange() implementation for types that do not implement merge/copy
+// behavior.
+template <typename Element>
+inline void RepeatedPtrField<Element>::ExtractSubrangeInternal(
+    int start, int num, Element** elements, google::protobuf::internal::false_type) {
+  // This case is identical to UnsafeArenaExtractSubrange(). However, since
+  // ExtractSubrange() must return heap-allocated objects by contract, and we
+  // cannot fulfill this contract if we are on an arena, we must GOOGLE_DCHECK()
+  // that we are not on an arena.
+  GOOGLE_DCHECK(GetArenaNoVirtual() == NULL)
+      << "ExtractSubrange() when arena is non-NULL is only supported when "
+      << "the Element type supplies a MergeFrom() operation to make copies.";
+  UnsafeArenaExtractSubrange(start, num, elements);
+}
+
+template <typename Element>
+inline void RepeatedPtrField<Element>::UnsafeArenaExtractSubrange(
+    int start, int num, Element** elements) {
+  GOOGLE_DCHECK_GE(start, 0);
+  GOOGLE_DCHECK_GE(num, 0);
+  GOOGLE_DCHECK_LE(start + num, size());
+
+  if (num > 0) {
+    // Save the values of the removed elements if requested.
+ if (elements != NULL) { + for (int i = 0; i < num; ++i) { + elements[i] = RepeatedPtrFieldBase::Mutable(i + start); + } + } + CloseGap(start, num); + } +} + +template +inline void RepeatedPtrField::Clear() { + RepeatedPtrFieldBase::Clear(); +} + +template +inline void RepeatedPtrField::MergeFrom( + const RepeatedPtrField& other) { + RepeatedPtrFieldBase::MergeFrom(other); +} + +template +inline void RepeatedPtrField::CopyFrom( + const RepeatedPtrField& other) { + RepeatedPtrFieldBase::CopyFrom(other); +} + +template +inline typename RepeatedPtrField::iterator +RepeatedPtrField::erase(const_iterator position) { + return erase(position, position + 1); +} + +template +inline typename RepeatedPtrField::iterator +RepeatedPtrField::erase(const_iterator first, const_iterator last) { + size_type pos_offset = std::distance(cbegin(), first); + size_type last_offset = std::distance(cbegin(), last); + DeleteSubrange(pos_offset, last_offset - pos_offset); + return begin() + pos_offset; +} + +template +inline Element** RepeatedPtrField::mutable_data() { + return RepeatedPtrFieldBase::mutable_data(); +} + +template +inline const Element* const* RepeatedPtrField::data() const { + return RepeatedPtrFieldBase::data(); +} + +template +inline void RepeatedPtrField::Swap(RepeatedPtrField* other) { + if (this == other) + return; + RepeatedPtrFieldBase::Swap(other); +} + +template +inline void RepeatedPtrField::UnsafeArenaSwap( + RepeatedPtrField* other) { + GOOGLE_DCHECK(GetArenaNoVirtual() == other->GetArenaNoVirtual()); + if (this == other) + return; + RepeatedPtrFieldBase::InternalSwap(other); +} + +template +inline void RepeatedPtrField::SwapElements(int index1, int index2) { + RepeatedPtrFieldBase::SwapElements(index1, index2); +} + +template +inline Arena* RepeatedPtrField::GetArenaNoVirtual() const { + return RepeatedPtrFieldBase::GetArenaNoVirtual(); +} + +template +inline int RepeatedPtrField::SpaceUsedExcludingSelf() const { + return RepeatedPtrFieldBase::SpaceUsedExcludingSelf(); +} + +template +inline void RepeatedPtrField::AddAllocated(Element* value) { + RepeatedPtrFieldBase::AddAllocated(value); +} + +template +inline void RepeatedPtrField::UnsafeArenaAddAllocated(Element* value) { + RepeatedPtrFieldBase::UnsafeArenaAddAllocated(value); +} + +template +inline Element* RepeatedPtrField::ReleaseLast() { + return RepeatedPtrFieldBase::ReleaseLast(); +} + +template +inline Element* RepeatedPtrField::UnsafeArenaReleaseLast() { + return RepeatedPtrFieldBase::UnsafeArenaReleaseLast(); +} + +template +inline int RepeatedPtrField::ClearedCount() const { + return RepeatedPtrFieldBase::ClearedCount(); +} + +template +inline void RepeatedPtrField::AddCleared(Element* value) { + return RepeatedPtrFieldBase::AddCleared(value); +} + +template +inline Element* RepeatedPtrField::ReleaseCleared() { + return RepeatedPtrFieldBase::ReleaseCleared(); +} + +template +inline void RepeatedPtrField::Reserve(int new_size) { + return RepeatedPtrFieldBase::Reserve(new_size); +} + +template +inline int RepeatedPtrField::Capacity() const { + return RepeatedPtrFieldBase::Capacity(); +} + +// ------------------------------------------------------------------- + +namespace internal { + +// STL-like iterator implementation for RepeatedPtrField. You should not +// refer to this class directly; use RepeatedPtrField::iterator instead. 
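+// (Illustrative sketch, not upstream text: given some
+// RepeatedPtrField<string> field, iteration reads like any STL container:
+//   size_t total = 0;
+//   for (RepeatedPtrField<string>::const_iterator it = field.begin();
+//        it != field.end(); ++it) {
+//     total += it->size();
+//   })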
+// +// The iterator for RepeatedPtrField, RepeatedPtrIterator, is +// very similar to iterator_ptr in util/gtl/iterator_adaptors.h, +// but adds random-access operators and is modified to wrap a void** base +// iterator (since RepeatedPtrField stores its array as a void* array and +// casting void** to T** would violate C++ aliasing rules). +// +// This code based on net/proto/proto-array-internal.h by Jeffrey Yasskin +// (jyasskin@google.com). +template +class RepeatedPtrIterator + : public std::iterator< + std::random_access_iterator_tag, Element> { + public: + typedef RepeatedPtrIterator iterator; + typedef std::iterator< + std::random_access_iterator_tag, Element> superclass; + + // Shadow the value_type in std::iterator<> because const_iterator::value_type + // needs to be T, not const T. + typedef typename remove_const::type value_type; + + // Let the compiler know that these are type names, so we don't have to + // write "typename" in front of them everywhere. + typedef typename superclass::reference reference; + typedef typename superclass::pointer pointer; + typedef typename superclass::difference_type difference_type; + + RepeatedPtrIterator() : it_(NULL) {} + explicit RepeatedPtrIterator(void* const* it) : it_(it) {} + + // Allow "upcasting" from RepeatedPtrIterator to + // RepeatedPtrIterator. + template + RepeatedPtrIterator(const RepeatedPtrIterator& other) + : it_(other.it_) { + // Force a compiler error if the other type is not convertible to ours. + if (false) { + implicit_cast(0); + } + } + + // dereferenceable + reference operator*() const { return *reinterpret_cast(*it_); } + pointer operator->() const { return &(operator*()); } + + // {inc,dec}rementable + iterator& operator++() { ++it_; return *this; } + iterator operator++(int) { return iterator(it_++); } + iterator& operator--() { --it_; return *this; } + iterator operator--(int) { return iterator(it_--); } + + // equality_comparable + bool operator==(const iterator& x) const { return it_ == x.it_; } + bool operator!=(const iterator& x) const { return it_ != x.it_; } + + // less_than_comparable + bool operator<(const iterator& x) const { return it_ < x.it_; } + bool operator<=(const iterator& x) const { return it_ <= x.it_; } + bool operator>(const iterator& x) const { return it_ > x.it_; } + bool operator>=(const iterator& x) const { return it_ >= x.it_; } + + // addable, subtractable + iterator& operator+=(difference_type d) { + it_ += d; + return *this; + } + friend iterator operator+(iterator it, const difference_type d) { + it += d; + return it; + } + friend iterator operator+(const difference_type d, iterator it) { + it += d; + return it; + } + iterator& operator-=(difference_type d) { + it_ -= d; + return *this; + } + friend iterator operator-(iterator it, difference_type d) { + it -= d; + return it; + } + + // indexable + reference operator[](difference_type d) const { return *(*this + d); } + + // random access iterator + difference_type operator-(const iterator& x) const { return it_ - x.it_; } + + private: + template + friend class RepeatedPtrIterator; + + // The internal iterator. + void* const* it_; +}; + +// Provide an iterator that operates on pointers to the underlying objects +// rather than the objects themselves as RepeatedPtrIterator does. +// Consider using this when working with stl algorithms that change +// the array. +// The VoidPtr template parameter holds the type-agnostic pointer value +// referenced by the iterator. 
It should either be "void *" for a mutable +// iterator, or "const void *" for a constant iterator. +template +class RepeatedPtrOverPtrsIterator + : public std::iterator { + public: + typedef RepeatedPtrOverPtrsIterator iterator; + typedef std::iterator< + std::random_access_iterator_tag, Element*> superclass; + + // Shadow the value_type in std::iterator<> because const_iterator::value_type + // needs to be T, not const T. + typedef typename remove_const::type value_type; + + // Let the compiler know that these are type names, so we don't have to + // write "typename" in front of them everywhere. + typedef typename superclass::reference reference; + typedef typename superclass::pointer pointer; + typedef typename superclass::difference_type difference_type; + + RepeatedPtrOverPtrsIterator() : it_(NULL) {} + explicit RepeatedPtrOverPtrsIterator(VoidPtr* it) : it_(it) {} + + // dereferenceable + reference operator*() const { return *reinterpret_cast(it_); } + pointer operator->() const { return &(operator*()); } + + // {inc,dec}rementable + iterator& operator++() { ++it_; return *this; } + iterator operator++(int) { return iterator(it_++); } + iterator& operator--() { --it_; return *this; } + iterator operator--(int) { return iterator(it_--); } + + // equality_comparable + bool operator==(const iterator& x) const { return it_ == x.it_; } + bool operator!=(const iterator& x) const { return it_ != x.it_; } + + // less_than_comparable + bool operator<(const iterator& x) const { return it_ < x.it_; } + bool operator<=(const iterator& x) const { return it_ <= x.it_; } + bool operator>(const iterator& x) const { return it_ > x.it_; } + bool operator>=(const iterator& x) const { return it_ >= x.it_; } + + // addable, subtractable + iterator& operator+=(difference_type d) { + it_ += d; + return *this; + } + friend iterator operator+(iterator it, difference_type d) { + it += d; + return it; + } + friend iterator operator+(difference_type d, iterator it) { + it += d; + return it; + } + iterator& operator-=(difference_type d) { + it_ -= d; + return *this; + } + friend iterator operator-(iterator it, difference_type d) { + it -= d; + return it; + } + + // indexable + reference operator[](difference_type d) const { return *(*this + d); } + + // random access iterator + difference_type operator-(const iterator& x) const { return it_ - x.it_; } + + private: + template + friend class RepeatedPtrIterator; + + // The internal iterator. 
+ VoidPtr* it_; +}; + +void RepeatedPtrFieldBase::InternalSwap(RepeatedPtrFieldBase* other) { + std::swap(rep_, other->rep_); + std::swap(current_size_, other->current_size_); + std::swap(total_size_, other->total_size_); +} + +} // namespace internal + +template +inline typename RepeatedPtrField::iterator +RepeatedPtrField::begin() { + return iterator(raw_data()); +} +template +inline typename RepeatedPtrField::const_iterator +RepeatedPtrField::begin() const { + return iterator(raw_data()); +} +template +inline typename RepeatedPtrField::const_iterator +RepeatedPtrField::cbegin() const { + return begin(); +} +template +inline typename RepeatedPtrField::iterator +RepeatedPtrField::end() { + return iterator(raw_data() + size()); +} +template +inline typename RepeatedPtrField::const_iterator +RepeatedPtrField::end() const { + return iterator(raw_data() + size()); +} +template +inline typename RepeatedPtrField::const_iterator +RepeatedPtrField::cend() const { + return end(); +} + +template +inline typename RepeatedPtrField::pointer_iterator +RepeatedPtrField::pointer_begin() { + return pointer_iterator(raw_mutable_data()); +} +template +inline typename RepeatedPtrField::const_pointer_iterator +RepeatedPtrField::pointer_begin() const { + return const_pointer_iterator(const_cast(raw_mutable_data())); +} +template +inline typename RepeatedPtrField::pointer_iterator +RepeatedPtrField::pointer_end() { + return pointer_iterator(raw_mutable_data() + size()); +} +template +inline typename RepeatedPtrField::const_pointer_iterator +RepeatedPtrField::pointer_end() const { + return const_pointer_iterator( + const_cast(raw_mutable_data() + size())); +} + + +// Iterators and helper functions that follow the spirit of the STL +// std::back_insert_iterator and std::back_inserter but are tailor-made +// for RepeatedField and RepeatedPtrField. Typical usage would be: +// +// std::copy(some_sequence.begin(), some_sequence.end(), +// google::protobuf::RepeatedFieldBackInserter(proto.mutable_sequence())); +// +// Ported by johannes from util/gtl/proto-array-iterators.h + +namespace internal { +// A back inserter for RepeatedField objects. +template class RepeatedFieldBackInsertIterator + : public std::iterator { + public: + explicit RepeatedFieldBackInsertIterator( + RepeatedField* const mutable_field) + : field_(mutable_field) { + } + RepeatedFieldBackInsertIterator& operator=(const T& value) { + field_->Add(value); + return *this; + } + RepeatedFieldBackInsertIterator& operator*() { + return *this; + } + RepeatedFieldBackInsertIterator& operator++() { + return *this; + } + RepeatedFieldBackInsertIterator& operator++(int /* unused */) { + return *this; + } + + private: + RepeatedField* field_; +}; + +// A back inserter for RepeatedPtrField objects. 
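+// (Clarifying note, not upstream text: unlike the RepeatedField inserter
+// above, this one also accepts a T*; assigning a pointer copies
+// *ptr_to_value into a freshly Add()ed element -- see the second operator=
+// below -- and does not take ownership.)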
+template class RepeatedPtrFieldBackInsertIterator + : public std::iterator { + public: + RepeatedPtrFieldBackInsertIterator( + RepeatedPtrField* const mutable_field) + : field_(mutable_field) { + } + RepeatedPtrFieldBackInsertIterator& operator=(const T& value) { + *field_->Add() = value; + return *this; + } + RepeatedPtrFieldBackInsertIterator& operator=( + const T* const ptr_to_value) { + *field_->Add() = *ptr_to_value; + return *this; + } + RepeatedPtrFieldBackInsertIterator& operator*() { + return *this; + } + RepeatedPtrFieldBackInsertIterator& operator++() { + return *this; + } + RepeatedPtrFieldBackInsertIterator& operator++(int /* unused */) { + return *this; + } + + private: + RepeatedPtrField* field_; +}; + +// A back inserter for RepeatedPtrFields that inserts by transferring ownership +// of a pointer. +template class AllocatedRepeatedPtrFieldBackInsertIterator + : public std::iterator { + public: + explicit AllocatedRepeatedPtrFieldBackInsertIterator( + RepeatedPtrField* const mutable_field) + : field_(mutable_field) { + } + AllocatedRepeatedPtrFieldBackInsertIterator& operator=( + T* const ptr_to_value) { + field_->AddAllocated(ptr_to_value); + return *this; + } + AllocatedRepeatedPtrFieldBackInsertIterator& operator*() { + return *this; + } + AllocatedRepeatedPtrFieldBackInsertIterator& operator++() { + return *this; + } + AllocatedRepeatedPtrFieldBackInsertIterator& operator++( + int /* unused */) { + return *this; + } + + private: + RepeatedPtrField* field_; +}; + +// Almost identical to AllocatedRepeatedPtrFieldBackInsertIterator. This one +// uses the UnsafeArenaAddAllocated instead. +template +class UnsafeArenaAllocatedRepeatedPtrFieldBackInsertIterator + : public std::iterator { + public: + explicit UnsafeArenaAllocatedRepeatedPtrFieldBackInsertIterator( + ::google::protobuf::RepeatedPtrField* const mutable_field) + : field_(mutable_field) { + } + UnsafeArenaAllocatedRepeatedPtrFieldBackInsertIterator& operator=( + T const* const ptr_to_value) { + field_->UnsafeArenaAddAllocated(const_cast(ptr_to_value)); + return *this; + } + UnsafeArenaAllocatedRepeatedPtrFieldBackInsertIterator& operator*() { + return *this; + } + UnsafeArenaAllocatedRepeatedPtrFieldBackInsertIterator& operator++() { + return *this; + } + UnsafeArenaAllocatedRepeatedPtrFieldBackInsertIterator& operator++( + int /* unused */) { + return *this; + } + + private: + ::google::protobuf::RepeatedPtrField* field_; +}; + +} // namespace internal + +// Provides a back insert iterator for RepeatedField instances, +// similar to std::back_inserter(). +template internal::RepeatedFieldBackInsertIterator +RepeatedFieldBackInserter(RepeatedField* const mutable_field) { + return internal::RepeatedFieldBackInsertIterator(mutable_field); +} + +// Provides a back insert iterator for RepeatedPtrField instances, +// similar to std::back_inserter(). +template internal::RepeatedPtrFieldBackInsertIterator +RepeatedPtrFieldBackInserter(RepeatedPtrField* const mutable_field) { + return internal::RepeatedPtrFieldBackInsertIterator(mutable_field); +} + +// Special back insert iterator for RepeatedPtrField instances, just in +// case someone wants to write generic template code that can access both +// RepeatedFields and RepeatedPtrFields using a common name. 
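+// (Illustrative sketch, not upstream text: generic code can fill either
+// flavor through the common name, relying on the overload declared below:
+//   template <typename Src, typename Dst>
+//   void Append(const Src& src, Dst* dst) {
+//     std::copy(src.begin(), src.end(),
+//               google::protobuf::RepeatedFieldBackInserter(dst));
+//   })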
+template internal::RepeatedPtrFieldBackInsertIterator +RepeatedFieldBackInserter(RepeatedPtrField* const mutable_field) { + return internal::RepeatedPtrFieldBackInsertIterator(mutable_field); +} + +// Provides a back insert iterator for RepeatedPtrField instances +// similar to std::back_inserter() which transfers the ownership while +// copying elements. +template internal::AllocatedRepeatedPtrFieldBackInsertIterator +AllocatedRepeatedPtrFieldBackInserter( + RepeatedPtrField* const mutable_field) { + return internal::AllocatedRepeatedPtrFieldBackInsertIterator( + mutable_field); +} + +// Similar to AllocatedRepeatedPtrFieldBackInserter, using +// UnsafeArenaAddAllocated instead of AddAllocated. +// This is slightly faster if that matters. It is also useful in legacy code +// that uses temporary ownership to avoid copies. Example: +// RepeatedPtrField temp_field; +// temp_field.AddAllocated(new T); +// ... // Do something with temp_field +// temp_field.ExtractSubrange(0, temp_field.size(), NULL); +// If you put temp_field on the arena this fails, because the ownership +// transfers to the arena at the "AddAllocated" call and is not released anymore +// causing a double delete. Using UnsafeArenaAddAllocated prevents this. +template +internal::UnsafeArenaAllocatedRepeatedPtrFieldBackInsertIterator +UnsafeArenaAllocatedRepeatedPtrFieldBackInserter( + ::google::protobuf::RepeatedPtrField* const mutable_field) { + return internal::UnsafeArenaAllocatedRepeatedPtrFieldBackInsertIterator( + mutable_field); +} + +} // namespace protobuf + +} // namespace google +#endif // GOOGLE_PROTOBUF_REPEATED_FIELD_H__ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomic_sequence_num.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomic_sequence_num.h new file mode 100644 index 00000000..6ecbd052 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomic_sequence_num.h @@ -0,0 +1,54 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2014 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#ifndef GOOGLE_PROTOBUF_ATOMIC_SEQUENCE_NUM_H_ +#define GOOGLE_PROTOBUF_ATOMIC_SEQUENCE_NUM_H_ + +#include "atomicops.h" + +namespace google { +namespace protobuf { +namespace internal { + +class SequenceNumber { + public: + SequenceNumber() : word_(0) {} + + AtomicWord GetNext() { + return NoBarrier_AtomicIncrement(&word_, 1) - 1; + } + private: + AtomicWord word_; +}; + +} // namespace internal +} // namespace protobuf +} // namespace google + +#endif // GOOGLE_PROTOBUF_ATOMIC_SEQUENCE_NUM_H_ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops.h new file mode 100644 index 00000000..6e3f30cf --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops.h @@ -0,0 +1,246 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2012 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The routines exported by this module are subtle. If you use them, even if +// you get the code right, it will depend on careful reasoning about atomicity +// and memory ordering; it will be less readable, and harder to maintain. If +// you plan to use these routines, you should have a good reason, such as solid +// evidence that performance would otherwise suffer, or there being no +// alternative. 
+// You should assume only properties explicitly guaranteed by the
+// specifications in this file. You are almost certainly _not_ writing code
+// just for the x86; if you assume x86 semantics, x86 hardware bugs and
+// implementations on other architectures will cause your code to break. If you
+// do not know what you are doing, avoid these routines, and use a Mutex.
+//
+// It is incorrect to make direct assignments to/from an atomic variable.
+// You should use one of the Load or Store routines. The NoBarrier
+// versions are provided when no barriers are needed:
+//   NoBarrier_Store()
+//   NoBarrier_Load()
+// Although there is currently no compiler enforcement, you are encouraged
+// to use these.

+// This header and the implementations for each platform (located in
+// atomicops_internals_*) must be kept in sync with the upstream code (V8).

+#ifndef GOOGLE_PROTOBUF_ATOMICOPS_H_
+#define GOOGLE_PROTOBUF_ATOMICOPS_H_
+
+// Don't include this file for people not concerned about thread safety.
+#ifndef GOOGLE_PROTOBUF_NO_THREAD_SAFETY
+
+#include "common.h"
+#include "platform_macros.h"
+
+namespace google {
+namespace protobuf {
+namespace internal {
+
+#if defined(GOOGLE_PROTOBUF_ARCH_POWER)
+#if defined(_LP64) || defined(__LP64__)
+typedef int32 Atomic32;
+typedef intptr_t Atomic64;
+#else
+typedef intptr_t Atomic32;
+typedef int64 Atomic64;
+#endif
+#else
+typedef int32 Atomic32;
+#ifdef GOOGLE_PROTOBUF_ARCH_64_BIT
+// We need to be able to go between Atomic64 and AtomicWord implicitly. This
+// means Atomic64 and AtomicWord should be the same type on 64-bit.
+#if defined(__ILP32__) || defined(GOOGLE_PROTOBUF_OS_NACL)
+// NaCl's intptr_t is not actually 64-bits on 64-bit!
+// http://code.google.com/p/nativeclient/issues/detail?id=1162
+// sparcv9's pointer type is 32bits
+typedef int64 Atomic64;
+#else
+typedef intptr_t Atomic64;
+#endif
+#endif
+#endif
+
+// Use AtomicWord for a machine-sized pointer. It will use the Atomic32 or
+// Atomic64 routines below, depending on your architecture.
+typedef intptr_t AtomicWord;
+
+// Atomically execute:
+//   result = *ptr;
+//   if (*ptr == old_value)
+//     *ptr = new_value;
+//   return result;
+//
+// I.e., replace "*ptr" with "new_value" if "*ptr" used to be "old_value".
+// Always return the old value of "*ptr"
+//
+// This routine implies no memory barriers.
+Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
+                                  Atomic32 old_value,
+                                  Atomic32 new_value);
+
+// Atomically store new_value into *ptr, returning the previous value held in
+// *ptr. This routine implies no memory barriers.
+Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, Atomic32 new_value);
+
+// Atomically increment *ptr by "increment". Returns the new value of
+// *ptr with the increment applied. This routine implies no memory barriers.
+Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, Atomic32 increment);
+
+Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
+                                 Atomic32 increment);
+
+// These following lower-level operations are typically useful only to people
+// implementing higher-level synchronization operations like spinlocks,
+// mutexes, and condition-variables. They combine CompareAndSwap(), a load, or
+// a store with appropriate memory-ordering instructions. "Acquire" operations
+// ensure that no later memory access can be reordered ahead of the operation.
+// "Release" operations ensure that no previous memory access can be reordered
+// after the operation. "Barrier" operations have both "Acquire" and "Release"
+// semantics. A MemoryBarrier() has "Barrier" semantics, but does no memory
+// access.
+Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
+                                Atomic32 old_value,
+                                Atomic32 new_value);
+Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
+                                Atomic32 old_value,
+                                Atomic32 new_value);
+
+#if defined(__MINGW32__) && defined(MemoryBarrier)
+#undef MemoryBarrier
+#endif
+void MemoryBarrier();
+void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value);
+void Acquire_Store(volatile Atomic32* ptr, Atomic32 value);
+void Release_Store(volatile Atomic32* ptr, Atomic32 value);
+
+Atomic32 NoBarrier_Load(volatile const Atomic32* ptr);
+Atomic32 Acquire_Load(volatile const Atomic32* ptr);
+Atomic32 Release_Load(volatile const Atomic32* ptr);
+
+// 64-bit atomic operations (only available on 64-bit processors).
+#ifdef GOOGLE_PROTOBUF_ARCH_64_BIT
+Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr,
+                                  Atomic64 old_value,
+                                  Atomic64 new_value);
+Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, Atomic64 new_value);
+Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr, Atomic64 increment);
+Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr, Atomic64 increment);
+
+Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr,
+                                Atomic64 old_value,
+                                Atomic64 new_value);
+Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr,
+                                Atomic64 old_value,
+                                Atomic64 new_value);
+void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value);
+void Acquire_Store(volatile Atomic64* ptr, Atomic64 value);
+void Release_Store(volatile Atomic64* ptr, Atomic64 value);
+Atomic64 NoBarrier_Load(volatile const Atomic64* ptr);
+Atomic64 Acquire_Load(volatile const Atomic64* ptr);
+Atomic64 Release_Load(volatile const Atomic64* ptr);
+#endif  // GOOGLE_PROTOBUF_ARCH_64_BIT
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+// Include our platform specific implementation.
+#define GOOGLE_PROTOBUF_ATOMICOPS_ERROR \
+"Atomic operations are not supported on your platform"
+
+// ThreadSanitizer, http://clang.llvm.org/docs/ThreadSanitizer.html.
+#if defined(THREAD_SANITIZER)
+#include <google/protobuf/stubs/atomicops_internals_tsan.h>
+// MSVC.
+#elif defined(_MSC_VER)
+#if defined(GOOGLE_PROTOBUF_ARCH_IA32) || defined(GOOGLE_PROTOBUF_ARCH_X64)
+#include <google/protobuf/stubs/atomicops_internals_x86_msvc.h>
+#else
+#error GOOGLE_PROTOBUF_ATOMICOPS_ERROR
+#endif
+
+// Solaris
+#elif defined(GOOGLE_PROTOBUF_OS_SOLARIS)
+#include <google/protobuf/stubs/atomicops_internals_solaris.h>
+
+// AIX
+#elif defined(GOOGLE_PROTOBUF_OS_AIX)
+#include <google/protobuf/stubs/atomicops_internals_power.h>
+
+// Apple.
+#elif defined(GOOGLE_PROTOBUF_OS_APPLE)
+#include "atomicops_internals_macosx.h"
+
+// GCC.
+#elif defined(__GNUC__)
+#if defined(GOOGLE_PROTOBUF_ARCH_IA32) || defined(GOOGLE_PROTOBUF_ARCH_X64)
+#include <google/protobuf/stubs/atomicops_internals_x86_gcc.h>
+#elif defined(GOOGLE_PROTOBUF_ARCH_ARM) && defined(__linux__)
+#include <google/protobuf/stubs/atomicops_internals_arm_gcc.h>
+#elif defined(GOOGLE_PROTOBUF_ARCH_AARCH64)
+#include <google/protobuf/stubs/atomicops_internals_arm64_gcc.h>
+#elif defined(GOOGLE_PROTOBUF_ARCH_ARM_QNX)
+#include <google/protobuf/stubs/atomicops_internals_arm_qnx.h>
+#elif defined(GOOGLE_PROTOBUF_ARCH_MIPS) || defined(GOOGLE_PROTOBUF_ARCH_MIPS64)
+#include <google/protobuf/stubs/atomicops_internals_mips_gcc.h>
+#elif defined(GOOGLE_PROTOBUF_ARCH_POWER)
+#include <google/protobuf/stubs/atomicops_internals_power.h>
+#elif defined(__native_client__)
+#include <google/protobuf/stubs/atomicops_internals_pnacl.h>
+#elif defined(GOOGLE_PROTOBUF_ARCH_PPC)
+#include <google/protobuf/stubs/atomicops_internals_ppc_gcc.h>
+#elif (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7)) || (__GNUC__ > 4))
+#include <google/protobuf/stubs/atomicops_internals_generic_gcc.h>
+#elif defined(__clang__)
+#if __has_extension(c_atomic)
+#include <google/protobuf/stubs/atomicops_internals_generic_gcc.h>
+#else
+#error GOOGLE_PROTOBUF_ATOMICOPS_ERROR
+#endif
+#else
+#error GOOGLE_PROTOBUF_ATOMICOPS_ERROR
+#endif
+
+// Unknown.
+#else
+#error GOOGLE_PROTOBUF_ATOMICOPS_ERROR
+#endif
+
+// On some platforms we need additional declarations to make AtomicWord
+// compatible with our other Atomic* types.
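Before the platform-specific backend is selected below, a sketch of how the primitives declared above compose in practice: a test-and-set spinlock. This is a minimal illustration under the assumption that the vendored "atomicops.h" is on the include path; SpinLock is a hypothetical name, not a class from this diff.

#include "atomicops.h"

namespace example {

class SpinLock {
 public:
  void Lock() {
    // Acquire semantics: nothing after the CAS may be reordered before it,
    // so reads inside the critical section see the previous holder's writes.
    while (google::protobuf::internal::Acquire_CompareAndSwap(
               &state_, 0, 1) != 0) {
      // Spin until the lock is released.
    }
  }

  void Unlock() {
    // Release semantics: all writes made while holding the lock become
    // visible before state_ returns to 0.
    google::protobuf::internal::Release_Store(&state_, 0);
  }

 private:
  volatile google::protobuf::internal::Atomic32 state_ = 0;
};

}  // namespace example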
+#if defined(GOOGLE_PROTOBUF_OS_APPLE) +#include "atomicops_internals_atomicword_compat.h" +#endif + +#undef GOOGLE_PROTOBUF_ATOMICOPS_ERROR + +#endif // GOOGLE_PROTOBUF_NO_THREAD_SAFETY + +#endif // GOOGLE_PROTOBUF_ATOMICOPS_H_ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_arm64_gcc.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_arm64_gcc.h new file mode 100644 index 00000000..0a2d2b89 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_arm64_gcc.h @@ -0,0 +1,325 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2012 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is an internal atomic implementation, use atomicops.h instead. + +#ifndef GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_ARM64_GCC_H_ +#define GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_ARM64_GCC_H_ + +namespace google { +namespace protobuf { +namespace internal { + +inline void MemoryBarrier() { + __asm__ __volatile__ ("dmb ish" ::: "memory"); // NOLINT +} + +// NoBarrier versions of the operation include "memory" in the clobber list. +// This is not required for direct usage of the NoBarrier versions of the +// operations. However this is required for correctness when they are used as +// part of the Acquire or Release versions, to ensure that nothing from outside +// the call is reordered between the operation and the memory barrier. This does +// not change the code generated, so has no or minimal impact on the +// NoBarrier operations. + +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 prev; + int32_t temp; + + __asm__ __volatile__ ( // NOLINT + "0: \n\t" + "ldxr %w[prev], %[ptr] \n\t" // Load the previous value. 
+ "cmp %w[prev], %w[old_value] \n\t" + "bne 1f \n\t" + "stxr %w[temp], %w[new_value], %[ptr] \n\t" // Try to store the new value. + "cbnz %w[temp], 0b \n\t" // Retry if it did not work. + "1: \n\t" + : [prev]"=&r" (prev), + [temp]"=&r" (temp), + [ptr]"+Q" (*ptr) + : [old_value]"IJr" (old_value), + [new_value]"r" (new_value) + : "cc", "memory" + ); // NOLINT + + return prev; +} + +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + Atomic32 result; + int32_t temp; + + __asm__ __volatile__ ( // NOLINT + "0: \n\t" + "ldxr %w[result], %[ptr] \n\t" // Load the previous value. + "stxr %w[temp], %w[new_value], %[ptr] \n\t" // Try to store the new value. + "cbnz %w[temp], 0b \n\t" // Retry if it did not work. + : [result]"=&r" (result), + [temp]"=&r" (temp), + [ptr]"+Q" (*ptr) + : [new_value]"r" (new_value) + : "memory" + ); // NOLINT + + return result; +} + +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + Atomic32 result; + int32_t temp; + + __asm__ __volatile__ ( // NOLINT + "0: \n\t" + "ldxr %w[result], %[ptr] \n\t" // Load the previous value. + "add %w[result], %w[result], %w[increment]\n\t" + "stxr %w[temp], %w[result], %[ptr] \n\t" // Try to store the result. + "cbnz %w[temp], 0b \n\t" // Retry on failure. + : [result]"=&r" (result), + [temp]"=&r" (temp), + [ptr]"+Q" (*ptr) + : [increment]"IJr" (increment) + : "memory" + ); // NOLINT + + return result; +} + +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + MemoryBarrier(); + Atomic32 result = NoBarrier_AtomicIncrement(ptr, increment); + MemoryBarrier(); + + return result; +} + +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 prev = NoBarrier_CompareAndSwap(ptr, old_value, new_value); + MemoryBarrier(); + + return prev; +} + +inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + MemoryBarrier(); + Atomic32 prev = NoBarrier_CompareAndSwap(ptr, old_value, new_value); + + return prev; +} + +inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; +} + +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; + MemoryBarrier(); +} + +inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { + __asm__ __volatile__ ( // NOLINT + "stlr %w[value], %[ptr] \n\t" + : [ptr]"=Q" (*ptr) + : [value]"r" (value) + : "memory" + ); // NOLINT +} + +inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { + return *ptr; +} + +inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) { + Atomic32 value; + + __asm__ __volatile__ ( // NOLINT + "ldar %w[value], %[ptr] \n\t" + : [value]"=r" (value) + : [ptr]"Q" (*ptr) + : "memory" + ); // NOLINT + + return value; +} + +inline Atomic32 Release_Load(volatile const Atomic32* ptr) { + MemoryBarrier(); + return *ptr; +} + +// 64-bit versions of the operations. +// See the 32-bit versions for comments. 
+ +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 prev; + int32_t temp; + + __asm__ __volatile__ ( // NOLINT + "0: \n\t" + "ldxr %[prev], %[ptr] \n\t" + "cmp %[prev], %[old_value] \n\t" + "bne 1f \n\t" + "stxr %w[temp], %[new_value], %[ptr] \n\t" + "cbnz %w[temp], 0b \n\t" + "1: \n\t" + : [prev]"=&r" (prev), + [temp]"=&r" (temp), + [ptr]"+Q" (*ptr) + : [old_value]"IJr" (old_value), + [new_value]"r" (new_value) + : "cc", "memory" + ); // NOLINT + + return prev; +} + +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + Atomic64 result; + int32_t temp; + + __asm__ __volatile__ ( // NOLINT + "0: \n\t" + "ldxr %[result], %[ptr] \n\t" + "stxr %w[temp], %[new_value], %[ptr] \n\t" + "cbnz %w[temp], 0b \n\t" + : [result]"=&r" (result), + [temp]"=&r" (temp), + [ptr]"+Q" (*ptr) + : [new_value]"r" (new_value) + : "memory" + ); // NOLINT + + return result; +} + +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + Atomic64 result; + int32_t temp; + + __asm__ __volatile__ ( // NOLINT + "0: \n\t" + "ldxr %[result], %[ptr] \n\t" + "add %[result], %[result], %[increment] \n\t" + "stxr %w[temp], %[result], %[ptr] \n\t" + "cbnz %w[temp], 0b \n\t" + : [result]"=&r" (result), + [temp]"=&r" (temp), + [ptr]"+Q" (*ptr) + : [increment]"IJr" (increment) + : "memory" + ); // NOLINT + + return result; +} + +inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + MemoryBarrier(); + Atomic64 result = NoBarrier_AtomicIncrement(ptr, increment); + MemoryBarrier(); + + return result; +} + +inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 prev = NoBarrier_CompareAndSwap(ptr, old_value, new_value); + MemoryBarrier(); + + return prev; +} + +inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + MemoryBarrier(); + Atomic64 prev = NoBarrier_CompareAndSwap(ptr, old_value, new_value); + + return prev; +} + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + *ptr = value; +} + +inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) { + *ptr = value; + MemoryBarrier(); +} + +inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) { + __asm__ __volatile__ ( // NOLINT + "stlr %x[value], %[ptr] \n\t" + : [ptr]"=Q" (*ptr) + : [value]"r" (value) + : "memory" + ); // NOLINT +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + return *ptr; +} + +inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) { + Atomic64 value; + + __asm__ __volatile__ ( // NOLINT + "ldar %x[value], %[ptr] \n\t" + : [value]"=r" (value) + : [ptr]"Q" (*ptr) + : "memory" + ); // NOLINT + + return value; +} + +inline Atomic64 Release_Load(volatile const Atomic64* ptr) { + MemoryBarrier(); + return *ptr; +} + +} // namespace internal +} // namespace protobuf +} // namespace google + +#endif // GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_ARM64_GCC_H_ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_arm_gcc.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_arm_gcc.h new file mode 100644 index 00000000..90e727b0 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_arm_gcc.h @@ -0,0 +1,151 @@ +// Protocol Buffers - 
Google's data interchange format +// Copyright 2012 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is an internal atomic implementation, use atomicops.h instead. +// +// LinuxKernelCmpxchg and Barrier_AtomicIncrement are from Google Gears. + +#ifndef GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_ARM_GCC_H_ +#define GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_ARM_GCC_H_ + +namespace google { +namespace protobuf { +namespace internal { + +// 0xffff0fc0 is the hard coded address of a function provided by +// the kernel which implements an atomic compare-exchange. On older +// ARM architecture revisions (pre-v6) this may be implemented using +// a syscall. This address is stable, and in active use (hard coded) +// by at least glibc-2.7 and the Android C library. 
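The header next points a weak function pointer at that fixed kernel address. For contrast: on a toolchain that may assume ARMv6 or newer, the same single compare-and-swap could be obtained from a compiler builtin, as in the hypothetical sketch below; the kernel-helper route that follows works even on pre-v6 cores, which is the point of this file.

#include <cstdint>

// Illustrative only: GCC/Clang builtin CAS, viable when ldrex/strex are
// guaranteed to exist (ARMv6+). Returns the value *ptr held before the call.
inline int32_t BuiltinCmpxchg(volatile int32_t* ptr, int32_t old_value,
                              int32_t new_value) {
  return __sync_val_compare_and_swap(ptr, old_value, new_value);
}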
+typedef Atomic32 (*LinuxKernelCmpxchgFunc)(Atomic32 old_value,
+                                           Atomic32 new_value,
+                                           volatile Atomic32* ptr);
+LinuxKernelCmpxchgFunc pLinuxKernelCmpxchg __attribute__((weak)) =
+    (LinuxKernelCmpxchgFunc) 0xffff0fc0;
+
+typedef void (*LinuxKernelMemoryBarrierFunc)(void);
+LinuxKernelMemoryBarrierFunc pLinuxKernelMemoryBarrier __attribute__((weak)) =
+    (LinuxKernelMemoryBarrierFunc) 0xffff0fa0;
+
+
+inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
+                                         Atomic32 old_value,
+                                         Atomic32 new_value) {
+  Atomic32 prev_value = *ptr;
+  do {
+    if (!pLinuxKernelCmpxchg(old_value, new_value,
+                             const_cast<Atomic32*>(ptr))) {
+      return old_value;
+    }
+    prev_value = *ptr;
+  } while (prev_value == old_value);
+  return prev_value;
+}
+
+inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
+                                         Atomic32 new_value) {
+  Atomic32 old_value;
+  do {
+    old_value = *ptr;
+  } while (pLinuxKernelCmpxchg(old_value, new_value,
+                               const_cast<Atomic32*>(ptr)));
+  return old_value;
+}
+
+inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
+                                          Atomic32 increment) {
+  return Barrier_AtomicIncrement(ptr, increment);
+}
+
+inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
+                                        Atomic32 increment) {
+  for (;;) {
+    // Atomic exchange the old value with an incremented one.
+    Atomic32 old_value = *ptr;
+    Atomic32 new_value = old_value + increment;
+    if (pLinuxKernelCmpxchg(old_value, new_value,
+                            const_cast<Atomic32*>(ptr)) == 0) {
+      // The exchange took place as expected.
+      return new_value;
+    }
+    // Otherwise, *ptr changed mid-loop and we need to retry.
+  }
+}
+
+inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+}
+
+inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+}
+
+inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) {
+  *ptr = value;
+}
+
+inline void MemoryBarrier() {
+  pLinuxKernelMemoryBarrier();
+}
+
+inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
+  *ptr = value;
+  MemoryBarrier();
+}
+
+inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) {
+  MemoryBarrier();
+  *ptr = value;
+}
+
+inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) {
+  return *ptr;
+}
+
+inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) {
+  Atomic32 value = *ptr;
+  MemoryBarrier();
+  return value;
+}
+
+inline Atomic32 Release_Load(volatile const Atomic32* ptr) {
+  MemoryBarrier();
+  return *ptr;
+}
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#endif  // GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_ARM_GCC_H_
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_arm_qnx.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_arm_qnx.h
new file mode 100644
index 00000000..17dfaa51
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_arm_qnx.h
@@ -0,0 +1,146 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2012 Google Inc. All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This file is an internal atomic implementation, use atomicops.h instead.
+
+#ifndef GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_ARM_QNX_H_
+#define GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_ARM_QNX_H_
+
+// For _smp_cmpxchg()
+#include <arm/smpxchg.h>
+
+namespace google {
+namespace protobuf {
+namespace internal {
+
+inline Atomic32 QNXCmpxchg(Atomic32 old_value,
+                           Atomic32 new_value,
+                           volatile Atomic32* ptr) {
+  return static_cast<Atomic32>(
+      _smp_cmpxchg((volatile unsigned *)ptr,
+                   (unsigned)old_value,
+                   (unsigned)new_value));
+}
+
+
+inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
+                                         Atomic32 old_value,
+                                         Atomic32 new_value) {
+  Atomic32 prev_value = *ptr;
+  do {
+    if (!QNXCmpxchg(old_value, new_value,
+                    const_cast<Atomic32*>(ptr))) {
+      return old_value;
+    }
+    prev_value = *ptr;
+  } while (prev_value == old_value);
+  return prev_value;
+}
+
+inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
+                                         Atomic32 new_value) {
+  Atomic32 old_value;
+  do {
+    old_value = *ptr;
+  } while (QNXCmpxchg(old_value, new_value,
+                      const_cast<Atomic32*>(ptr)));
+  return old_value;
+}
+
+inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
+                                          Atomic32 increment) {
+  return Barrier_AtomicIncrement(ptr, increment);
+}
+
+inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
+                                        Atomic32 increment) {
+  for (;;) {
+    // Atomic exchange the old value with an incremented one.
+    Atomic32 old_value = *ptr;
+    Atomic32 new_value = old_value + increment;
+    if (QNXCmpxchg(old_value, new_value,
+                   const_cast<Atomic32*>(ptr)) == 0) {
+      // The exchange took place as expected.
+      return new_value;
+    }
+    // Otherwise, *ptr changed mid-loop and we need to retry.
+ } +} + +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; +} + +inline void MemoryBarrier() { + __sync_synchronize(); +} + +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; + MemoryBarrier(); +} + +inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { + MemoryBarrier(); + *ptr = value; +} + +inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { + return *ptr; +} + +inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) { + Atomic32 value = *ptr; + MemoryBarrier(); + return value; +} + +inline Atomic32 Release_Load(volatile const Atomic32* ptr) { + MemoryBarrier(); + return *ptr; +} + +} // namespace internal +} // namespace protobuf +} // namespace google + +#endif // GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_ARM_QNX_H_ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_atomicword_compat.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_atomicword_compat.h new file mode 100644 index 00000000..eb198ff5 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_atomicword_compat.h @@ -0,0 +1,122 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2012 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is an internal atomic implementation, use atomicops.h instead. 
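Before the AtomicWord compatibility header below, note the convention every backend in this diff follows: Release_Store is barrier-then-store, and Acquire_Load is load-then-barrier. Together they implement the classic publish pattern, sketched here against the vendored API; Producer, Consumer, and the payload value are illustrative, not code from the diff.

#include "atomicops.h"

#include <cstdio>

using google::protobuf::internal::Acquire_Load;
using google::protobuf::internal::Atomic32;
using google::protobuf::internal::Release_Store;

static int g_payload = 0;
static volatile Atomic32 g_ready = 0;

void Producer() {
  g_payload = 42;              // 1. prepare the data
  Release_Store(&g_ready, 1);  // 2. publish it; the barrier precedes the store
}

void Consumer() {
  if (Acquire_Load(&g_ready) == 1) {  // the barrier follows the load
    // The release/acquire pairing guarantees this prints 42, never 0.
    std::printf("payload = %d\n", g_payload);
  }
}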
+
+#ifndef GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_ATOMICWORD_COMPAT_H_
+#define GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_ATOMICWORD_COMPAT_H_
+
+// AtomicWord is a synonym for intptr_t, and Atomic32 is a synonym for int32,
+// which in turn means int. On some LP32 platforms, intptr_t is an int, but
+// on others, it's a long. When AtomicWord and Atomic32 are based on different
+// fundamental types, their pointers are incompatible.
+//
+// This file defines function overloads to allow both AtomicWord and Atomic32
+// data to be used with this interface.
+//
+// On LP64 platforms, AtomicWord and Atomic64 are both always long,
+// so this problem doesn't occur.
+
+#if !defined(GOOGLE_PROTOBUF_ARCH_64_BIT)
+
+namespace google {
+namespace protobuf {
+namespace internal {
+
+inline AtomicWord NoBarrier_CompareAndSwap(volatile AtomicWord* ptr,
+                                           AtomicWord old_value,
+                                           AtomicWord new_value) {
+  return NoBarrier_CompareAndSwap(
+      reinterpret_cast<volatile Atomic32*>(ptr), old_value, new_value);
+}
+
+inline AtomicWord NoBarrier_AtomicExchange(volatile AtomicWord* ptr,
+                                           AtomicWord new_value) {
+  return NoBarrier_AtomicExchange(
+      reinterpret_cast<volatile Atomic32*>(ptr), new_value);
+}
+
+inline AtomicWord NoBarrier_AtomicIncrement(volatile AtomicWord* ptr,
+                                            AtomicWord increment) {
+  return NoBarrier_AtomicIncrement(
+      reinterpret_cast<volatile Atomic32*>(ptr), increment);
+}
+
+inline AtomicWord Barrier_AtomicIncrement(volatile AtomicWord* ptr,
+                                          AtomicWord increment) {
+  return Barrier_AtomicIncrement(
+      reinterpret_cast<volatile Atomic32*>(ptr), increment);
+}
+
+inline AtomicWord Acquire_CompareAndSwap(volatile AtomicWord* ptr,
+                                         AtomicWord old_value,
+                                         AtomicWord new_value) {
+  return Acquire_CompareAndSwap(
+      reinterpret_cast<volatile Atomic32*>(ptr), old_value, new_value);
+}
+
+inline AtomicWord Release_CompareAndSwap(volatile AtomicWord* ptr,
+                                         AtomicWord old_value,
+                                         AtomicWord new_value) {
+  return Release_CompareAndSwap(
+      reinterpret_cast<volatile Atomic32*>(ptr), old_value, new_value);
+}
+
+inline void NoBarrier_Store(volatile AtomicWord *ptr, AtomicWord value) {
+  NoBarrier_Store(reinterpret_cast<volatile Atomic32*>(ptr), value);
+}
+
+inline void Acquire_Store(volatile AtomicWord* ptr, AtomicWord value) {
+  return Acquire_Store(reinterpret_cast<volatile Atomic32*>(ptr), value);
+}
+
+inline void Release_Store(volatile AtomicWord* ptr, AtomicWord value) {
+  return Release_Store(reinterpret_cast<volatile Atomic32*>(ptr), value);
+}
+
+inline AtomicWord NoBarrier_Load(volatile const AtomicWord *ptr) {
+  return NoBarrier_Load(reinterpret_cast<volatile const Atomic32*>(ptr));
+}
+
+inline AtomicWord Acquire_Load(volatile const AtomicWord* ptr) {
+  return Acquire_Load(reinterpret_cast<volatile const Atomic32*>(ptr));
+}
+
+inline AtomicWord Release_Load(volatile const AtomicWord* ptr) {
+  return Release_Load(reinterpret_cast<volatile const Atomic32*>(ptr));
+}
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#endif  // !defined(GOOGLE_PROTOBUF_ARCH_64_BIT)
+
+#endif  // GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_ATOMICWORD_COMPAT_H_
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_generic_gcc.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_generic_gcc.h
new file mode 100644
index 00000000..7314ee4f
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_generic_gcc.h
@@ -0,0 +1,155 @@
+// Copyright 2013 Red Hat Inc. All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Red Hat Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is an internal atomic implementation, use atomicops.h instead. + +#ifndef GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_GENERIC_GCC_H_ +#define GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_GENERIC_GCC_H_ + +namespace google { +namespace protobuf { +namespace internal { + +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + __atomic_compare_exchange_n(ptr, &old_value, new_value, true, + __ATOMIC_RELAXED, __ATOMIC_RELAXED); + return old_value; +} + +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + return __atomic_exchange_n(ptr, new_value, __ATOMIC_RELAXED); +} + +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + return __atomic_add_fetch(ptr, increment, __ATOMIC_RELAXED); +} + +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + return __atomic_add_fetch(ptr, increment, __ATOMIC_SEQ_CST); +} + +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + __atomic_compare_exchange_n(ptr, &old_value, new_value, true, + __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE); + return old_value; +} + +inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + __atomic_compare_exchange_n(ptr, &old_value, new_value, true, + __ATOMIC_RELEASE, __ATOMIC_ACQUIRE); + return old_value; +} + +inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { + __atomic_store_n(ptr, value, __ATOMIC_RELAXED); +} + +inline void MemoryBarrier() { + __sync_synchronize(); +} + +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { + __atomic_store_n(ptr, value, __ATOMIC_SEQ_CST); +} + +inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { + __atomic_store_n(ptr, value, __ATOMIC_RELEASE); +} + +inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { + return __atomic_load_n(ptr, __ATOMIC_RELAXED); +} + +inline Atomic32 
Acquire_Load(volatile const Atomic32* ptr) { + return __atomic_load_n(ptr, __ATOMIC_ACQUIRE); +} + +inline Atomic32 Release_Load(volatile const Atomic32* ptr) { + return __atomic_load_n(ptr, __ATOMIC_SEQ_CST); +} + +#ifdef __LP64__ + +inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) { + __atomic_store_n(ptr, value, __ATOMIC_RELEASE); +} + +inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) { + return __atomic_load_n(ptr, __ATOMIC_ACQUIRE); +} + +inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + __atomic_compare_exchange_n(ptr, &old_value, new_value, true, + __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE); + return old_value; +} + +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + __atomic_compare_exchange_n(ptr, &old_value, new_value, true, + __ATOMIC_RELAXED, __ATOMIC_RELAXED); + return old_value; +} + +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + return __atomic_add_fetch(ptr, increment, __ATOMIC_RELAXED); +} + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + __atomic_store_n(ptr, value, __ATOMIC_RELAXED); +} + +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + return __atomic_exchange_n(ptr, new_value, __ATOMIC_RELAXED); +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + return __atomic_load_n(ptr, __ATOMIC_RELAXED); +} + +#endif // defined(__LP64__) + +} // namespace internal +} // namespace protobuf +} // namespace google + +#endif // GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_GENERIC_GCC_H_ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_macosx.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_macosx.h new file mode 100644 index 00000000..79633241 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_macosx.h @@ -0,0 +1,225 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2012 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
+// IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This file is an internal atomic implementation, use atomicops.h instead.
+
+#ifndef GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_MACOSX_H_
+#define GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_MACOSX_H_
+
+#include <libkern/OSAtomic.h>
+
+namespace google {
+namespace protobuf {
+namespace internal {
+
+inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
+                                         Atomic32 old_value,
+                                         Atomic32 new_value) {
+  Atomic32 prev_value;
+  do {
+    if (OSAtomicCompareAndSwap32(old_value, new_value,
+                                 const_cast<Atomic32*>(ptr))) {
+      return old_value;
+    }
+    prev_value = *ptr;
+  } while (prev_value == old_value);
+  return prev_value;
+}
+
+inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
+                                         Atomic32 new_value) {
+  Atomic32 old_value;
+  do {
+    old_value = *ptr;
+  } while (!OSAtomicCompareAndSwap32(old_value, new_value,
+                                     const_cast<Atomic32*>(ptr)));
+  return old_value;
+}
+
+inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
+                                          Atomic32 increment) {
+  return OSAtomicAdd32(increment, const_cast<Atomic32*>(ptr));
+}
+
+inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
+                                        Atomic32 increment) {
+  return OSAtomicAdd32Barrier(increment, const_cast<Atomic32*>(ptr));
+}
+
+inline void MemoryBarrier() {
+  OSMemoryBarrier();
+}
+
+inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  Atomic32 prev_value;
+  do {
+    if (OSAtomicCompareAndSwap32Barrier(old_value, new_value,
+                                        const_cast<Atomic32*>(ptr))) {
+      return old_value;
+    }
+    prev_value = *ptr;
+  } while (prev_value == old_value);
+  return prev_value;
+}
+
+inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  return Acquire_CompareAndSwap(ptr, old_value, new_value);
+}
+
+inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) {
+  *ptr = value;
+}
+
+inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
+  *ptr = value;
+  MemoryBarrier();
+}
+
+inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) {
+  MemoryBarrier();
+  *ptr = value;
+}
+
+inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) {
+  return *ptr;
+}
+
+inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) {
+  Atomic32 value = *ptr;
+  MemoryBarrier();
+  return value;
+}
+
+inline Atomic32 Release_Load(volatile const Atomic32* ptr) {
+  MemoryBarrier();
+  return *ptr;
+}
+
+#ifdef __LP64__
+
+// 64-bit implementation on 64-bit platform
+
+inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr,
+                                         Atomic64 old_value,
+                                         Atomic64 new_value) {
+  Atomic64 prev_value;
+  do {
+    if (OSAtomicCompareAndSwap64(old_value, new_value,
+                                 reinterpret_cast<volatile int64_t*>(ptr))) {
+      return old_value;
+    }
+    prev_value = *ptr;
+  } while (prev_value == old_value);
+  return prev_value;
+}
+
+inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr,
+                                         Atomic64 new_value) {
+  Atomic64 old_value;
+  do {
+    old_value = *ptr;
+  } while (!OSAtomicCompareAndSwap64(old_value, new_value,
+                                     reinterpret_cast<volatile int64_t*>(ptr)));
+  return old_value;
+}
+
+inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr,
+                                          Atomic64 increment) {
+  return OSAtomicAdd64(increment, reinterpret_cast<volatile int64_t*>(ptr));
+}
+
+inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr,
+                                        Atomic64 increment) {
+  return OSAtomicAdd64Barrier(increment,
+                              reinterpret_cast<volatile int64_t*>(ptr));
+}
+
+inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  Atomic64 prev_value;
+  do {
+    if (OSAtomicCompareAndSwap64Barrier(
+            old_value, new_value, reinterpret_cast<volatile int64_t*>(ptr))) {
+      return old_value;
+    }
+    prev_value = *ptr;
+  } while (prev_value == old_value);
+  return prev_value;
+}
+
+inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  // The lib kern interface does not distinguish between
+  // Acquire and Release memory barriers; they are equivalent.
+  return Acquire_CompareAndSwap(ptr, old_value, new_value);
+}
+
+inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) {
+  *ptr = value;
+}
+
+inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) {
+  *ptr = value;
+  MemoryBarrier();
+}
+
+inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) {
+  MemoryBarrier();
+  *ptr = value;
+}
+
+inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) {
+  return *ptr;
+}
+
+inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) {
+  Atomic64 value = *ptr;
+  MemoryBarrier();
+  return value;
+}
+
+inline Atomic64 Release_Load(volatile const Atomic64* ptr) {
+  MemoryBarrier();
+  return *ptr;
+}
+
+#endif  // defined(__LP64__)
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#endif  // GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_MACOSX_H_
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_mips_gcc.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_mips_gcc.h
new file mode 100644
index 00000000..f5837c9e
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_mips_gcc.h
@@ -0,0 +1,313 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2012 Google Inc. All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is an internal atomic implementation, use atomicops.h instead. + +#ifndef GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_MIPS_GCC_H_ +#define GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_MIPS_GCC_H_ + +#define ATOMICOPS_COMPILER_BARRIER() __asm__ __volatile__("" : : : "memory") + +namespace google { +namespace protobuf { +namespace internal { + +// Atomically execute: +// result = *ptr; +// if (*ptr == old_value) +// *ptr = new_value; +// return result; +// +// I.e., replace "*ptr" with "new_value" if "*ptr" used to be "old_value". +// Always return the old value of "*ptr" +// +// This routine implies no memory barriers. +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 prev, tmp; + __asm__ __volatile__(".set push\n" + ".set noreorder\n" + "1:\n" + "ll %0, %5\n" // prev = *ptr + "bne %0, %3, 2f\n" // if (prev != old_value) goto 2 + "move %2, %4\n" // tmp = new_value + "sc %2, %1\n" // *ptr = tmp (with atomic check) + "beqz %2, 1b\n" // start again on atomic error + "nop\n" // delay slot nop + "2:\n" + ".set pop\n" + : "=&r" (prev), "=m" (*ptr), "=&r" (tmp) + : "r" (old_value), "r" (new_value), "m" (*ptr) + : "memory"); + return prev; +} + +// Atomically store new_value into *ptr, returning the previous value held in +// *ptr. This routine implies no memory barriers. +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + Atomic32 temp, old; + __asm__ __volatile__(".set push\n" + ".set noreorder\n" + "1:\n" + "ll %1, %4\n" // old = *ptr + "move %0, %3\n" // temp = new_value + "sc %0, %2\n" // *ptr = temp (with atomic check) + "beqz %0, 1b\n" // start again on atomic error + "nop\n" // delay slot nop + ".set pop\n" + : "=&r" (temp), "=&r" (old), "=m" (*ptr) + : "r" (new_value), "m" (*ptr) + : "memory"); + + return old; +} + +// Atomically increment *ptr by "increment". Returns the new value of +// *ptr with the increment applied. This routine implies no memory barriers. +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + Atomic32 temp, temp2; + + __asm__ __volatile__(".set push\n" + ".set noreorder\n" + "1:\n" + "ll %0, %4\n" // temp = *ptr + "addu %1, %0, %3\n" // temp2 = temp + increment + "sc %1, %2\n" // *ptr = temp2 (with atomic check) + "beqz %1, 1b\n" // start again on atomic error + "addu %1, %0, %3\n" // temp2 = temp + increment + ".set pop\n" + : "=&r" (temp), "=&r" (temp2), "=m" (*ptr) + : "Ir" (increment), "m" (*ptr) + : "memory"); + // temp2 now holds the final value. + return temp2; +} + +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + ATOMICOPS_COMPILER_BARRIER(); + Atomic32 res = NoBarrier_AtomicIncrement(ptr, increment); + ATOMICOPS_COMPILER_BARRIER(); + return res; +} + +// "Acquire" operations +// ensure that no later memory access can be reordered ahead of the operation. 
+// "Release" operations ensure that no previous memory access can be reordered +// after the operation. "Barrier" operations have both "Acquire" and "Release" +// semantics. A MemoryBarrier() has "Barrier" semantics, but does no memory +// access. +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + ATOMICOPS_COMPILER_BARRIER(); + Atomic32 res = NoBarrier_CompareAndSwap(ptr, old_value, new_value); + ATOMICOPS_COMPILER_BARRIER(); + return res; +} + +inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + ATOMICOPS_COMPILER_BARRIER(); + Atomic32 res = NoBarrier_CompareAndSwap(ptr, old_value, new_value); + ATOMICOPS_COMPILER_BARRIER(); + return res; +} + +inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; +} + +inline void MemoryBarrier() { + __asm__ __volatile__("sync" : : : "memory"); +} + +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; + MemoryBarrier(); +} + +inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { + MemoryBarrier(); + *ptr = value; +} + +inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { + return *ptr; +} + +inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) { + Atomic32 value = *ptr; + MemoryBarrier(); + return value; +} + +inline Atomic32 Release_Load(volatile const Atomic32* ptr) { + MemoryBarrier(); + return *ptr; +} + +#if defined(__LP64__) +// 64-bit versions of the atomic ops. + +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 prev, tmp; + __asm__ __volatile__(".set push\n" + ".set noreorder\n" + "1:\n" + "lld %0, %5\n" // prev = *ptr + "bne %0, %3, 2f\n" // if (prev != old_value) goto 2 + "move %2, %4\n" // tmp = new_value + "scd %2, %1\n" // *ptr = tmp (with atomic check) + "beqz %2, 1b\n" // start again on atomic error + "nop\n" // delay slot nop + "2:\n" + ".set pop\n" + : "=&r" (prev), "=m" (*ptr), "=&r" (tmp) + : "r" (old_value), "r" (new_value), "m" (*ptr) + : "memory"); + return prev; +} + +// Atomically store new_value into *ptr, returning the previous value held in +// *ptr. This routine implies no memory barriers. +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + Atomic64 temp, old; + __asm__ __volatile__(".set push\n" + ".set noreorder\n" + "1:\n" + "lld %1, %4\n" // old = *ptr + "move %0, %3\n" // temp = new_value + "scd %0, %2\n" // *ptr = temp (with atomic check) + "beqz %0, 1b\n" // start again on atomic error + "nop\n" // delay slot nop + ".set pop\n" + : "=&r" (temp), "=&r" (old), "=m" (*ptr) + : "r" (new_value), "m" (*ptr) + : "memory"); + + return old; +} + +// Atomically increment *ptr by "increment". Returns the new value of +// *ptr with the increment applied. This routine implies no memory barriers. +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + Atomic64 temp, temp2; + + __asm__ __volatile__(".set push\n" + ".set noreorder\n" + "1:\n" + "lld %0, %4\n" // temp = *ptr + "daddu %1, %0, %3\n" // temp2 = temp + increment + "scd %1, %2\n" // *ptr = temp2 (with atomic check) + "beqz %1, 1b\n" // start again on atomic error + "daddu %1, %0, %3\n" // temp2 = temp + increment + ".set pop\n" + : "=&r" (temp), "=&r" (temp2), "=m" (*ptr) + : "Ir" (increment), "m" (*ptr) + : "memory"); + // temp2 now holds the final value. 
+ return temp2; +} + +inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + MemoryBarrier(); + Atomic64 res = NoBarrier_AtomicIncrement(ptr, increment); + MemoryBarrier(); + return res; +} + +// "Acquire" operations +// ensure that no later memory access can be reordered ahead of the operation. +// "Release" operations ensure that no previous memory access can be reordered +// after the operation. "Barrier" operations have both "Acquire" and "Release" +// semantics. A MemoryBarrier() has "Barrier" semantics, but does no memory +// access. +inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 res = NoBarrier_CompareAndSwap(ptr, old_value, new_value); + MemoryBarrier(); + return res; +} + +inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + MemoryBarrier(); + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + *ptr = value; +} + +inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) { + *ptr = value; + MemoryBarrier(); +} + +inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) { + MemoryBarrier(); + *ptr = value; +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + return *ptr; +} + +inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) { + Atomic64 value = *ptr; + MemoryBarrier(); + return value; +} + +inline Atomic64 Release_Load(volatile const Atomic64* ptr) { + MemoryBarrier(); + return *ptr; +} +#endif + +} // namespace internal +} // namespace protobuf +} // namespace google + +#undef ATOMICOPS_COMPILER_BARRIER + +#endif // GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_MIPS_GCC_H_ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_pnacl.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_pnacl.h new file mode 100644 index 00000000..3b314fd0 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_pnacl.h @@ -0,0 +1,231 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2012 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
+// IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This file is an internal atomic implementation, use atomicops.h instead.
+
+#ifndef GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_PNACL_H_
+#define GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_PNACL_H_
+
+#include <atomic>
+
+namespace google {
+namespace protobuf {
+namespace internal {
+
+// This implementation is transitional and maintains the original API for
+// atomicops.h. This requires casting memory locations to the atomic types, and
+// assumes that the API and the C++11 implementation are layout-compatible,
+// which isn't true for all implementations or hardware platforms. The static
+// assertion should detect this issue, were it to fire then this header
+// shouldn't be used.
+//
+// TODO(jfb) If this header manages to stay committed then the API should be
+//           modified, and all call sites updated.
+typedef volatile std::atomic<Atomic32>* AtomicLocation32;
+static_assert(sizeof(*(AtomicLocation32) nullptr) == sizeof(Atomic32),
+              "incompatible 32-bit atomic layout");
+
+inline void MemoryBarrier() {
+#if defined(__GLIBCXX__)
+  // Work around libstdc++ bug 51038 where atomic_thread_fence was declared but
+  // not defined, leading to the linker complaining about undefined references.
+  __atomic_thread_fence(std::memory_order_seq_cst);
+#else
+  std::atomic_thread_fence(std::memory_order_seq_cst);
+#endif
+}
+
+inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
+                                         Atomic32 old_value,
+                                         Atomic32 new_value) {
+  ((AtomicLocation32)ptr)
+      ->compare_exchange_strong(old_value,
+                                new_value,
+                                std::memory_order_relaxed,
+                                std::memory_order_relaxed);
+  return old_value;
+}
+
+inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
+                                         Atomic32 new_value) {
+  return ((AtomicLocation32)ptr)
+      ->exchange(new_value, std::memory_order_relaxed);
+}
+
+inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
+                                          Atomic32 increment) {
+  return increment +
+         ((AtomicLocation32)ptr)
+             ->fetch_add(increment, std::memory_order_relaxed);
+}
+
+inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
+                                        Atomic32 increment) {
+  return increment + ((AtomicLocation32)ptr)->fetch_add(increment);
+}
+
+inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  ((AtomicLocation32)ptr)
+      ->compare_exchange_strong(old_value,
+                                new_value,
+                                std::memory_order_acquire,
+                                std::memory_order_acquire);
+  return old_value;
+}
+
+inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  ((AtomicLocation32)ptr)
+      ->compare_exchange_strong(old_value,
+                                new_value,
+                                std::memory_order_release,
+                                std::memory_order_relaxed);
+  return old_value;
+}
+
+inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) {
+  ((AtomicLocation32)ptr)->store(value, std::memory_order_relaxed);
+}
+
+inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
+  ((AtomicLocation32)ptr)->store(value, std::memory_order_relaxed);
+  MemoryBarrier();
+}
+
+inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) {
+  ((AtomicLocation32)ptr)->store(value, std::memory_order_release);
+}
+
+inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) {
+  return ((AtomicLocation32)ptr)->load(std::memory_order_relaxed);
+}
+
+inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) {
+  return ((AtomicLocation32)ptr)->load(std::memory_order_acquire);
+}
+
+inline Atomic32 Release_Load(volatile const Atomic32* ptr) {
+  MemoryBarrier();
+  return ((AtomicLocation32)ptr)->load(std::memory_order_relaxed);
+}
+
+#if defined(GOOGLE_PROTOBUF_ARCH_64_BIT)
+
+typedef volatile std::atomic<Atomic64>* AtomicLocation64;
+static_assert(sizeof(*(AtomicLocation64) nullptr) == sizeof(Atomic64),
+              "incompatible 64-bit atomic layout");
+
+inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr,
+                                         Atomic64 old_value,
+                                         Atomic64 new_value) {
+  ((AtomicLocation64)ptr)
+      ->compare_exchange_strong(old_value,
+                                new_value,
+                                std::memory_order_relaxed,
+                                std::memory_order_relaxed);
+  return old_value;
+}
+
+inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr,
+                                         Atomic64 new_value) {
+  return ((AtomicLocation64)ptr)
+      ->exchange(new_value, std::memory_order_relaxed);
+}
+
+inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr,
+                                          Atomic64 increment) {
+  return increment +
+         ((AtomicLocation64)ptr)
+             ->fetch_add(increment, std::memory_order_relaxed);
+}
+
+inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr,
+                                        Atomic64 increment) {
+  return increment + ((AtomicLocation64)ptr)->fetch_add(increment);
+}
+
+inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  ((AtomicLocation64)ptr)
+      ->compare_exchange_strong(old_value,
+                                new_value,
+                                std::memory_order_acquire,
+                                std::memory_order_acquire);
+  return old_value;
+}
+
+inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  ((AtomicLocation64)ptr)
+      ->compare_exchange_strong(old_value,
+                                new_value,
+                                std::memory_order_release,
+                                std::memory_order_relaxed);
+  return old_value;
+}
+
+inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) {
+  ((AtomicLocation64)ptr)->store(value, std::memory_order_relaxed);
+}
+
+inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) {
+  ((AtomicLocation64)ptr)->store(value, std::memory_order_relaxed);
+  MemoryBarrier();
+}
+
+inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) {
+  ((AtomicLocation64)ptr)->store(value, std::memory_order_release);
+}
+
+inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) {
+  return ((AtomicLocation64)ptr)->load(std::memory_order_relaxed);
+}
+
+inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) {
+  return ((AtomicLocation64)ptr)->load(std::memory_order_acquire);
+}
+
+inline Atomic64 Release_Load(volatile const Atomic64* ptr) {
+  MemoryBarrier();
+  return ((AtomicLocation64)ptr)->load(std::memory_order_relaxed);
+}
+
+#endif  // defined(GOOGLE_PROTOBUF_ARCH_64_BIT)
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#endif  // GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_PNACL_H_
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_power.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_power.h
new file mode 100644
index 00000000..b8a42f21
--- /dev/null
+++
b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_power.h @@ -0,0 +1,440 @@ +// Copyright 2014 Bloomberg Finance LP. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Bloomberg Finance LP. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is an internal atomic implementation, use atomicops.h instead. + +#ifndef GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_AIX_H_ +#define GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_AIX_H_ + +namespace google { +namespace protobuf { +namespace internal { + +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 result; + + asm volatile ( + "1: lwarx %[res], %[zero], %[obj] \n\t" // load and reserve + " cmpw %[cmp], %[res] \n\t" // compare values + " bne- 2f \n\t" + " stwcx. %[val], %[zero], %[obj] \n\t" // store new value + " bne- 1b \n\t" + "2: \n\t" + : [res] "=&b" (result) + : [obj] "b" (ptr), + [cmp] "b" (old_value), + [val] "b" (new_value), + [zero] "i" (0) + : "cr0", "ctr"); + + return result; +} + +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + Atomic32 result; + + asm volatile ( + "1: lwarx %[res], %[zero], %[obj] \n\t" + " stwcx. %[val], %[zero], %[obj] \n\t" + " bne- 1b \n\t" + : [res] "=&b" (result) + : [obj] "b" (ptr), + [val] "b" (new_value), + [zero] "i" (0) + : "cr0", "ctr"); + + return result; +} + +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + Atomic32 result; + + asm volatile ( + "1: lwarx %[res], %[zero], %[obj] \n\t" // load and reserve + " add %[res], %[val], %[res] \n\t" // add the operand + " stwcx. 
%[res], %[zero], %[obj] \n\t" // store old value + // if still reserved + " bne- 1b \n\t" + : [res] "=&b" (result) + : [obj] "b" (ptr), + [val] "b" (increment), + [zero] "i" (0) + : "cr0", "ctr"); + + return result; +} + +inline void MemoryBarrier(void) { + asm volatile ( + " lwsync \n\t" + " isync \n\t" + : + : + : "memory"); +} + +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + Atomic32 result; + + asm volatile ( + " lwsync \n\t" + + "1: lwarx %[res], %[zero], %[obj] \n\t" // load and reserve + " add %[res], %[val], %[res] \n\t" // add the operand + " stwcx. %[res], %[zero], %[obj] \n\t" // store old value + // if still reserved + " bne- 1b \n\t" + " isync \n\t" + : [res] "=&b" (result) + : [obj] "b" (ptr), + [val] "b" (increment), + [zero] "i" (0) + : "cr0", "ctr"); + + return result; +} + +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 result; + + asm volatile ( + "1: lwarx %[res], %[zero], %[obj] \n\t" // load and reserve + " cmpw %[cmp], %[res] \n\t" // compare values + " bne- 2f \n\t" + " stwcx. %[val], %[zero], %[obj] \n\t" // store new value + " bne- 1b \n\t" + + " isync \n\t" + "2: \n\t" + : [res] "=&b" (result) + : [obj] "b" (ptr), + [cmp] "b" (old_value), + [val] "b" (new_value), + [zero] "i" (0) + : "cr0", "ctr"); + + return result; +} + +inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 result; + + asm volatile ( + " lwsync \n\t" + + "1: lwarx %[res], %[zero], %[obj] \n\t" // load and reserve + " cmpw %[cmp], %[res] \n\t" // compare values + " bne- 2f \n\t" + " stwcx. %[val], %[zero], %[obj] \n\t" // store new value + " bne- 1b \n\t" + + "2: \n\t" + : [res] "=&b" (result) + : [obj] "b" (ptr), + [cmp] "b" (old_value), + [val] "b" (new_value), + [zero] "i" (0) + : "cr0", "ctr"); + + return result; +} + +inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; +} + +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { + asm volatile ( + " stw %[val], %[obj] \n\t" + " isync \n\t" + : [obj] "=m" (*ptr) + : [val] "b" (value)); +} + +inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { + asm volatile ( + " lwsync \n\t" + " stw %[val], %[obj] \n\t" + : [obj] "=m" (*ptr) + : [val] "b" (value)); +} + +inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { + return *ptr; +} + +inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) { + Atomic32 result; + + asm volatile ( + "1: lwz %[res], %[obj] \n\t" + " cmpw %[res], %[res] \n\t" // create data + // dependency for + // load/load ordering + " bne- 1b \n\t" // never taken + + " isync \n\t" + : [res] "=b" (result) + : [obj] "m" (*ptr), + [zero] "i" (0) + : "cr0", "ctr"); + + return result; +} + +inline Atomic32 Release_Load(volatile const Atomic32* ptr) { + Atomic32 result; + + asm volatile ( + " lwsync \n\t" + + "1: lwz %[res], %[obj] \n\t" + " cmpw %[res], %[res] \n\t" // create data + // dependency for + // load/load ordering + " bne- 1b \n\t" // never taken + : [res] "=b" (result) + : [obj] "m" (*ptr), + [zero] "i" (0) + : "cr0", "ctr"); + + return result; +} + +#ifdef GOOGLE_PROTOBUF_ARCH_64_BIT +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 result; + + asm volatile ( + "1: ldarx %[res], %[zero], %[obj] \n\t" // load and reserve + " cmpd %[cmp], %[res] \n\t" // compare values + " bne- 2f \n\t" + + " stdcx. 
%[val], %[zero], %[obj] \n\t" // store the new value + " bne- 1b \n\t" + "2: \n\t" + : [res] "=&b" (result) + : [obj] "b" (ptr), + [cmp] "b" (old_value), + [val] "b" (new_value), + [zero] "i" (0) + : "cr0", "ctr"); + + return result; +} + +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + Atomic64 result; + + asm volatile ( + "1: ldarx %[res], %[zero], %[obj] \n\t" + " stdcx. %[val], %[zero], %[obj] \n\t" + " bne- 1b \n\t" + : [res] "=&b" (result) + : [obj] "b" (ptr), + [val] "b" (new_value), + [zero] "i" (0) + : "cr0", "ctr"); + + return result; +} + +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + Atomic64 result; + + asm volatile ( + "1: ldarx %[res], %[zero], %[obj] \n\t" // load and reserve + " add %[res], %[res], %[val] \n\t" // add the operand + " stdcx. %[res], %[zero], %[obj] \n\t" // store old value if + // still reserved + + " bne- 1b \n\t" + : [res] "=&b" (result) + : [obj] "b" (ptr), + [val] "b" (increment), + [zero] "i" (0) + : "cr0", "ctr"); + + return result; +} + +inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + + Atomic64 result; + + asm volatile ( + " lwsync \n\t" + + "1: ldarx %[res], %[zero], %[obj] \n\t" // load and reserve + " add %[res], %[res], %[val] \n\t" // add the operand + " stdcx. %[res], %[zero], %[obj] \n\t" // store old value if + // still reserved + + " bne- 1b \n\t" + + " isync \n\t" + : [res] "=&b" (result) + : [obj] "b" (ptr), + [val] "b" (increment), + [zero] "i" (0) + : "cr0", "ctr"); + + return result; +} + +inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 result; + + asm volatile ( + "1: ldarx %[res], %[zero], %[obj] \n\t" // load and reserve + " cmpd %[cmp], %[res] \n\t" // compare values + " bne- 2f \n\t" + + " stdcx. %[val], %[zero], %[obj] \n\t" // store the new value + " bne- 1b \n\t" + " isync \n\t" + "2: \n\t" + : [res] "=&b" (result) + : [obj] "b" (ptr), + [cmp] "b" (old_value), + [val] "b" (new_value), + [zero] "i" (0) + : "cr0", "ctr"); + + return result; +} + +inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 result; + + asm volatile ( + " lwsync \n\t" + + "1: ldarx %[res], %[zero], %[obj] \n\t" // load and reserve + " cmpd %[cmp], %[res] \n\t" // compare values + " bne- 2f \n\t" + + " stdcx. 
%[val], %[zero], %[obj] \n\t" // store the new value + " bne- 1b \n\t" + "2: \n\t" + : [res] "=&b" (result) + : [obj] "b" (ptr), + [cmp] "b" (old_value), + [val] "b" (new_value), + [zero] "i" (0) + : "cr0", "ctr"); + + return result; +} + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + *ptr = value; +} + +inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) { + asm volatile ( + " std %[val], %[obj] \n\t" + " isync \n\t" + : [obj] "=m" (*ptr) + : [val] "b" (value)); +} + +inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) { + asm volatile ( + " lwsync \n\t" + " std %[val], %[obj] \n\t" + : [obj] "=m" (*ptr) + : [val] "b" (value)); +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + return *ptr; +} + +inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) { + Atomic64 result; + + asm volatile ( + "1: ld %[res], %[obj] \n\t" + " cmpd %[res], %[res] \n\t" // create data + // dependency for + // load/load ordering + " bne- 1b \n\t" // never taken + + " isync \n\t" + : [res] "=b" (result) + : [obj] "m" (*ptr), + [zero] "i" (0) + : "cr0", "ctr"); + + return result; +} + +inline Atomic64 Release_Load(volatile const Atomic64* ptr) { + Atomic64 result; + + asm volatile ( + " lwsync \n\t" + + "1: ld %[res], %[obj] \n\t" + " cmpd %[res], %[res] \n\t" // create data + // dependency for + // load/load ordering + " bne- 1b \n\t" // never taken + : [res] "=b" (result) + : [obj] "m" (*ptr), + [zero] "i" (0) + : "cr0", "ctr"); + + return result; +} +#endif + +} // namespace internal +} // namespace protobuf +} // namespace google + +#endif // GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_SPARC_GCC_H_ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_ppc_gcc.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_ppc_gcc.h new file mode 100644 index 00000000..8231a578 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_ppc_gcc.h @@ -0,0 +1,155 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2015 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Author: ogabbay@advaoptical.com (Oded Gabbay) +// Cleaned up by: bsilver16384@gmail.com (Brian Silverman) +// +// This file is an internal atomic implementation, use atomicops.h instead. + +#ifndef GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_PPC_GCC_H_ +#define GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_PPC_GCC_H_ + +#define ATOMICOPS_COMPILER_BARRIER() __asm__ __volatile__("" : : : "memory") + +namespace google { +namespace protobuf { +namespace internal { + +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32 *ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 prev; + + __asm__ __volatile__( + "0: \n\t" + "lwarx %[prev],0,%[ptr] \n\t" + "cmpw 0,%[prev],%[old_value] \n\t" + "bne- 1f \n\t" + "stwcx. %[new_value],0,%[ptr] \n\t" + "bne- 0b \n\t" + "1: \n\t" + : [prev] "=&r"(prev), "+m"(*ptr) + : [ptr] "r"(ptr), [old_value] "r"(old_value), [new_value] "r"(new_value) + : "cc", "memory"); + + return prev; +} + +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32 *ptr, + Atomic32 new_value) { + Atomic32 old; + + __asm__ __volatile__( + "0: \n\t" + "lwarx %[old],0,%[ptr] \n\t" + "stwcx. %[new_value],0,%[ptr] \n\t" + "bne- 0b \n\t" + : [old] "=&r"(old), "+m"(*ptr) + : [ptr] "r"(ptr), [new_value] "r"(new_value) + : "cc", "memory"); + + return old; +} + +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32 *ptr, + Atomic32 increment) { + Atomic32 temp; + + __asm__ __volatile__( + "0: \n\t" + "lwarx %[temp],0,%[ptr] \n\t" + "add %[temp],%[increment],%[temp] \n\t" + "stwcx. 
%[temp],0,%[ptr] \n\t" + "bne- 0b \n\t" + : [temp] "=&r"(temp) + : [increment] "r"(increment), [ptr] "r"(ptr) + : "cc", "memory"); + + return temp; +} + +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32 *ptr, + Atomic32 increment) { + MemoryBarrier(); + Atomic32 res = NoBarrier_AtomicIncrement(ptr, increment); + MemoryBarrier(); + return res; +} + +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32 *ptr, + Atomic32 old_value, Atomic32 new_value) { + Atomic32 res = NoBarrier_CompareAndSwap(ptr, old_value, new_value); + MemoryBarrier(); + return res; +} + +inline Atomic32 Release_CompareAndSwap(volatile Atomic32 *ptr, + Atomic32 old_value, Atomic32 new_value) { + MemoryBarrier(); + Atomic32 res = NoBarrier_CompareAndSwap(ptr, old_value, new_value); + return res; +} + +inline void NoBarrier_Store(volatile Atomic32 *ptr, Atomic32 value) { + *ptr = value; +} + +inline void MemoryBarrier() { __asm__ __volatile__("sync" : : : "memory"); } + +inline void Acquire_Store(volatile Atomic32 *ptr, Atomic32 value) { + *ptr = value; + MemoryBarrier(); +} + +inline void Release_Store(volatile Atomic32 *ptr, Atomic32 value) { + MemoryBarrier(); + *ptr = value; +} + +inline Atomic32 NoBarrier_Load(volatile const Atomic32 *ptr) { return *ptr; } + +inline Atomic32 Acquire_Load(volatile const Atomic32 *ptr) { + Atomic32 value = *ptr; + MemoryBarrier(); + return value; +} + +inline Atomic32 Release_Load(volatile const Atomic32 *ptr) { + MemoryBarrier(); + return *ptr; +} + +} // namespace internal +} // namespace protobuf +} // namespace google + +#undef ATOMICOPS_COMPILER_BARRIER + +#endif // GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_PPC_GCC_H_ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_solaris.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_solaris.h new file mode 100644 index 00000000..d8057ecd --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_solaris.h @@ -0,0 +1,188 @@ +// Copyright 2014 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is an internal atomic implementation, use atomicops.h instead. + +#ifndef GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_SPARC_GCC_H_ +#define GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_SPARC_GCC_H_ + +#include + +namespace google { +namespace protobuf { +namespace internal { + +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + return (Atomic32)atomic_cas_32((volatile uint32_t*)ptr, (uint32_t)old_value, (uint32_t)new_value); +} + +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + return (Atomic32)atomic_swap_32((volatile uint32_t*)ptr, (uint32_t)new_value); +} + +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + return (Atomic32)atomic_add_32_nv((volatile uint32_t*)ptr, (uint32_t)increment); +} + +inline void MemoryBarrier(void) { + membar_producer(); + membar_consumer(); +} + +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + MemoryBarrier(); + Atomic32 ret = NoBarrier_AtomicIncrement(ptr, increment); + MemoryBarrier(); + + return ret; +} + +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 ret = NoBarrier_CompareAndSwap(ptr, old_value, new_value); + MemoryBarrier(); + + return ret; +} + +inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + MemoryBarrier(); + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; +} + +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; + membar_producer(); +} + +inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { + membar_consumer(); + *ptr = value; +} + +inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { + return *ptr; +} + +inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) { + Atomic32 val = *ptr; + membar_consumer(); + return val; +} + +inline Atomic32 Release_Load(volatile const Atomic32* ptr) { + membar_producer(); + return *ptr; +} + +#ifdef GOOGLE_PROTOBUF_ARCH_64_BIT +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + return atomic_cas_64((volatile uint64_t*)ptr, (uint64_t)old_value, (uint64_t)new_value); +} + +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, Atomic64 new_value) { + return atomic_swap_64((volatile uint64_t*)ptr, (uint64_t)new_value); +} + +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr, Atomic64 increment) { + return atomic_add_64_nv((volatile uint64_t*)ptr, increment); +} + +inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr, Atomic64 increment) { + MemoryBarrier(); + Atomic64 ret = atomic_add_64_nv((volatile uint64_t*)ptr, increment); + MemoryBarrier(); + return ret; +} + +inline Atomic64 Acquire_CompareAndSwap(volatile 
Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 ret = NoBarrier_CompareAndSwap(ptr, old_value, new_value); + MemoryBarrier(); + return ret; +} + +inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + MemoryBarrier(); + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + *ptr = value; +} + +inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) { + *ptr = value; + membar_producer(); +} + +inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) { + membar_consumer(); + *ptr = value; +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + return *ptr; +} + +inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) { + Atomic64 ret = *ptr; + membar_consumer(); + return ret; +} + +inline Atomic64 Release_Load(volatile const Atomic64* ptr) { + membar_producer(); + return *ptr; +} +#endif + +} // namespace internal +} // namespace protobuf +} // namespace google + +#endif // GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_SPARC_GCC_H_ + diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_tsan.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_tsan.h new file mode 100644 index 00000000..0c903545 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_tsan.h @@ -0,0 +1,219 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2013 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is an internal atomic implementation for compiler-based +// ThreadSanitizer (http://clang.llvm.org/docs/ThreadSanitizer.html). +// Use atomicops.h instead. 
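Every stub in this family implements the same portable contract: Acquire_* operations keep later memory accesses from being reordered before them, Release_* operations keep earlier accesses from being reordered after them, and Barrier_* / MemoryBarrier() do both. For orientation only, a minimal sketch of that contract written against C++11 std::atomic follows; the vendored headers predate C++11 and do not contain this code.

#include <atomic>
#include <cstdint>

typedef int32_t Atomic32;

std::atomic<Atomic32> g_ready(0);  // publication flag
Atomic32 g_payload = 0;            // plain data guarded by the flag

void Producer() {
  g_payload = 42;                               // ordinary write
  g_ready.store(1, std::memory_order_release);  // like Release_Store()
}

int Consumer() {
  // Like Acquire_Load(): once the flag is observed, so is the payload write.
  if (g_ready.load(std::memory_order_acquire) == 1) {
    return g_payload;  // guaranteed to see 42
  }
  return -1;
}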
+ +#ifndef GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_TSAN_H_ +#define GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_TSAN_H_ + +#define ATOMICOPS_COMPILER_BARRIER() __asm__ __volatile__("" : : : "memory") + +#include + +namespace google { +namespace protobuf { +namespace internal { + +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32 *ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 cmp = old_value; + __tsan_atomic32_compare_exchange_strong(ptr, &cmp, new_value, + __tsan_memory_order_relaxed, __tsan_memory_order_relaxed); + return cmp; +} + +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32 *ptr, + Atomic32 new_value) { + return __tsan_atomic32_exchange(ptr, new_value, + __tsan_memory_order_relaxed); +} + +inline Atomic32 Acquire_AtomicExchange(volatile Atomic32 *ptr, + Atomic32 new_value) { + return __tsan_atomic32_exchange(ptr, new_value, + __tsan_memory_order_acquire); +} + +inline Atomic32 Release_AtomicExchange(volatile Atomic32 *ptr, + Atomic32 new_value) { + return __tsan_atomic32_exchange(ptr, new_value, + __tsan_memory_order_release); +} + +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32 *ptr, + Atomic32 increment) { + return increment + __tsan_atomic32_fetch_add(ptr, increment, + __tsan_memory_order_relaxed); +} + +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32 *ptr, + Atomic32 increment) { + return increment + __tsan_atomic32_fetch_add(ptr, increment, + __tsan_memory_order_acq_rel); +} + +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32 *ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 cmp = old_value; + __tsan_atomic32_compare_exchange_strong(ptr, &cmp, new_value, + __tsan_memory_order_acquire, __tsan_memory_order_acquire); + return cmp; +} + +inline Atomic32 Release_CompareAndSwap(volatile Atomic32 *ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 cmp = old_value; + __tsan_atomic32_compare_exchange_strong(ptr, &cmp, new_value, + __tsan_memory_order_release, __tsan_memory_order_relaxed); + return cmp; +} + +inline void NoBarrier_Store(volatile Atomic32 *ptr, Atomic32 value) { + __tsan_atomic32_store(ptr, value, __tsan_memory_order_relaxed); +} + +inline void Acquire_Store(volatile Atomic32 *ptr, Atomic32 value) { + __tsan_atomic32_store(ptr, value, __tsan_memory_order_relaxed); + __tsan_atomic_thread_fence(__tsan_memory_order_seq_cst); +} + +inline void Release_Store(volatile Atomic32 *ptr, Atomic32 value) { + __tsan_atomic32_store(ptr, value, __tsan_memory_order_release); +} + +inline Atomic32 NoBarrier_Load(volatile const Atomic32 *ptr) { + return __tsan_atomic32_load(ptr, __tsan_memory_order_relaxed); +} + +inline Atomic32 Acquire_Load(volatile const Atomic32 *ptr) { + return __tsan_atomic32_load(ptr, __tsan_memory_order_acquire); +} + +inline Atomic32 Release_Load(volatile const Atomic32 *ptr) { + __tsan_atomic_thread_fence(__tsan_memory_order_seq_cst); + return __tsan_atomic32_load(ptr, __tsan_memory_order_relaxed); +} + +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64 *ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 cmp = old_value; + __tsan_atomic64_compare_exchange_strong(ptr, &cmp, new_value, + __tsan_memory_order_relaxed, __tsan_memory_order_relaxed); + return cmp; +} + +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64 *ptr, + Atomic64 new_value) { + return __tsan_atomic64_exchange(ptr, new_value, __tsan_memory_order_relaxed); +} + +inline Atomic64 Acquire_AtomicExchange(volatile Atomic64 *ptr, + Atomic64 new_value) { + return __tsan_atomic64_exchange(ptr, 
new_value, __tsan_memory_order_acquire); +} + +inline Atomic64 Release_AtomicExchange(volatile Atomic64 *ptr, + Atomic64 new_value) { + return __tsan_atomic64_exchange(ptr, new_value, __tsan_memory_order_release); +} + +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64 *ptr, + Atomic64 increment) { + return increment + __tsan_atomic64_fetch_add(ptr, increment, + __tsan_memory_order_relaxed); +} + +inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64 *ptr, + Atomic64 increment) { + return increment + __tsan_atomic64_fetch_add(ptr, increment, + __tsan_memory_order_acq_rel); +} + +inline void NoBarrier_Store(volatile Atomic64 *ptr, Atomic64 value) { + __tsan_atomic64_store(ptr, value, __tsan_memory_order_relaxed); +} + +inline void Acquire_Store(volatile Atomic64 *ptr, Atomic64 value) { + __tsan_atomic64_store(ptr, value, __tsan_memory_order_relaxed); + __tsan_atomic_thread_fence(__tsan_memory_order_seq_cst); +} + +inline void Release_Store(volatile Atomic64 *ptr, Atomic64 value) { + __tsan_atomic64_store(ptr, value, __tsan_memory_order_release); +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64 *ptr) { + return __tsan_atomic64_load(ptr, __tsan_memory_order_relaxed); +} + +inline Atomic64 Acquire_Load(volatile const Atomic64 *ptr) { + return __tsan_atomic64_load(ptr, __tsan_memory_order_acquire); +} + +inline Atomic64 Release_Load(volatile const Atomic64 *ptr) { + __tsan_atomic_thread_fence(__tsan_memory_order_seq_cst); + return __tsan_atomic64_load(ptr, __tsan_memory_order_relaxed); +} + +inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64 *ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 cmp = old_value; + __tsan_atomic64_compare_exchange_strong(ptr, &cmp, new_value, + __tsan_memory_order_acquire, __tsan_memory_order_acquire); + return cmp; +} + +inline Atomic64 Release_CompareAndSwap(volatile Atomic64 *ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 cmp = old_value; + __tsan_atomic64_compare_exchange_strong(ptr, &cmp, new_value, + __tsan_memory_order_release, __tsan_memory_order_relaxed); + return cmp; +} + +inline void MemoryBarrier() { + __tsan_atomic_thread_fence(__tsan_memory_order_seq_cst); +} + +} // namespace internal +} // namespace protobuf +} // namespace google + +#undef ATOMICOPS_COMPILER_BARRIER + +#endif // GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_TSAN_H_ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_x86_gcc.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_x86_gcc.h new file mode 100644 index 00000000..edccc59d --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_x86_gcc.h @@ -0,0 +1,293 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2012 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is an internal atomic implementation, use atomicops.h instead. + +#ifndef GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_X86_GCC_H_ +#define GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_X86_GCC_H_ + +namespace google { +namespace protobuf { +namespace internal { + +// This struct is not part of the public API of this module; clients may not +// use it. +// Features of this x86. Values may not be correct before main() is run, +// but are set conservatively. +struct AtomicOps_x86CPUFeatureStruct { + bool has_amd_lock_mb_bug; // Processor has AMD memory-barrier bug; do lfence + // after acquire compare-and-swap. + bool has_sse2; // Processor has SSE2. +}; +extern struct AtomicOps_x86CPUFeatureStruct AtomicOps_Internalx86CPUFeatures; + +#define ATOMICOPS_COMPILER_BARRIER() __asm__ __volatile__("" : : : "memory") + +// 32-bit low-level operations on any platform. + +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 prev; + __asm__ __volatile__("lock; cmpxchgl %1,%2" + : "=a" (prev) + : "q" (new_value), "m" (*ptr), "0" (old_value) + : "memory"); + return prev; +} + +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + __asm__ __volatile__("xchgl %1,%0" // The lock prefix is implicit for xchg. + : "=r" (new_value) + : "m" (*ptr), "0" (new_value) + : "memory"); + return new_value; // Now it's the previous value. 
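+  // On x86, an xchg instruction with a memory operand is implicitly
+  // locked, so this exchange is also a full memory barrier; the non-SSE2
+  // MemoryBarrier() fallback below relies on exactly that property.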
+} + +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + Atomic32 temp = increment; + __asm__ __volatile__("lock; xaddl %0,%1" + : "+r" (temp), "+m" (*ptr) + : : "memory"); + // temp now holds the old value of *ptr + return temp + increment; +} + +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + Atomic32 temp = increment; + __asm__ __volatile__("lock; xaddl %0,%1" + : "+r" (temp), "+m" (*ptr) + : : "memory"); + // temp now holds the old value of *ptr + if (AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug) { + __asm__ __volatile__("lfence" : : : "memory"); + } + return temp + increment; +} + +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 x = NoBarrier_CompareAndSwap(ptr, old_value, new_value); + if (AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug) { + __asm__ __volatile__("lfence" : : : "memory"); + } + return x; +} + +inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; +} + +#if defined(__x86_64__) + +// 64-bit implementations of memory barrier can be simpler, because it +// "mfence" is guaranteed to exist. +inline void MemoryBarrier() { + __asm__ __volatile__("mfence" : : : "memory"); +} + +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; + MemoryBarrier(); +} + +#else + +inline void MemoryBarrier() { + if (AtomicOps_Internalx86CPUFeatures.has_sse2) { + __asm__ __volatile__("mfence" : : : "memory"); + } else { // mfence is faster but not present on PIII + Atomic32 x = 0; + NoBarrier_AtomicExchange(&x, 0); // acts as a barrier on PIII + } +} + +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { + if (AtomicOps_Internalx86CPUFeatures.has_sse2) { + *ptr = value; + __asm__ __volatile__("mfence" : : : "memory"); + } else { + NoBarrier_AtomicExchange(ptr, value); + // acts as a barrier on PIII + } +} +#endif + +inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { + ATOMICOPS_COMPILER_BARRIER(); + *ptr = value; // An x86 store acts as a release barrier. + // See comments in Atomic64 version of Release_Store(), below. +} + +inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { + return *ptr; +} + +inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) { + Atomic32 value = *ptr; // An x86 load acts as a acquire barrier. + // See comments in Atomic64 version of Release_Store(), below. + ATOMICOPS_COMPILER_BARRIER(); + return value; +} + +inline Atomic32 Release_Load(volatile const Atomic32* ptr) { + MemoryBarrier(); + return *ptr; +} + +#if defined(__x86_64__) + +// 64-bit low-level operations on 64-bit platform. + +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 prev; + __asm__ __volatile__("lock; cmpxchgq %1,%2" + : "=a" (prev) + : "q" (new_value), "m" (*ptr), "0" (old_value) + : "memory"); + return prev; +} + +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + __asm__ __volatile__("xchgq %1,%0" // The lock prefix is implicit for xchg. + : "=r" (new_value) + : "m" (*ptr), "0" (new_value) + : "memory"); + return new_value; // Now it's the previous value. 
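+  // This 64-bit mirror of the routine above is only compiled under
+  // __x86_64__, where lock-prefixed read-modify-write instructions such
+  // as xaddq and cmpxchgq act as full fences on their own; that is why
+  // the 64-bit Release_CompareAndSwap() below can call the NoBarrier
+  // version without an extra barrier.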
+} + +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + Atomic64 temp = increment; + __asm__ __volatile__("lock; xaddq %0,%1" + : "+r" (temp), "+m" (*ptr) + : : "memory"); + // temp now contains the previous value of *ptr + return temp + increment; +} + +inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + Atomic64 temp = increment; + __asm__ __volatile__("lock; xaddq %0,%1" + : "+r" (temp), "+m" (*ptr) + : : "memory"); + // temp now contains the previous value of *ptr + if (AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug) { + __asm__ __volatile__("lfence" : : : "memory"); + } + return temp + increment; +} + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + *ptr = value; +} + +inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) { + *ptr = value; + MemoryBarrier(); +} + +inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) { + ATOMICOPS_COMPILER_BARRIER(); + + *ptr = value; // An x86 store acts as a release barrier + // for current AMD/Intel chips as of Jan 2008. + // See also Acquire_Load(), below. + + // When new chips come out, check: + // IA-32 Intel Architecture Software Developer's Manual, Volume 3: + // System Programming Guide, Chatper 7: Multiple-processor management, + // Section 7.2, Memory Ordering. + // Last seen at: + // http://developer.intel.com/design/pentium4/manuals/index_new.htm + // + // x86 stores/loads fail to act as barriers for a few instructions (clflush + // maskmovdqu maskmovq movntdq movnti movntpd movntps movntq) but these are + // not generated by the compiler, and are rare. Users of these instructions + // need to know about cache behaviour in any case since all of these involve + // either flushing cache lines or non-temporal cache hints. +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + return *ptr; +} + +inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) { + Atomic64 value = *ptr; // An x86 load acts as a acquire barrier, + // for current AMD/Intel chips as of Jan 2008. + // See also Release_Store(), above. + ATOMICOPS_COMPILER_BARRIER(); + return value; +} + +inline Atomic64 Release_Load(volatile const Atomic64* ptr) { + MemoryBarrier(); + return *ptr; +} + +inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 x = NoBarrier_CompareAndSwap(ptr, old_value, new_value); + if (AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug) { + __asm__ __volatile__("lfence" : : : "memory"); + } + return x; +} + +inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +#endif // defined(__x86_64__) + +} // namespace internal +} // namespace protobuf +} // namespace google + +#undef ATOMICOPS_COMPILER_BARRIER + +#endif // GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_X86_GCC_H_ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_x86_msvc.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_x86_msvc.h new file mode 100644 index 00000000..e53a641f --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_x86_msvc.h @@ -0,0 +1,150 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2012 Google Inc. All rights reserved. 
+// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is an internal atomic implementation, use atomicops.h instead. + +#ifndef GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_X86_MSVC_H_ +#define GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_X86_MSVC_H_ + +namespace google { +namespace protobuf { +namespace internal { + +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + return Barrier_AtomicIncrement(ptr, increment); +} + +#if !(defined(_MSC_VER) && _MSC_VER >= 1400) +#error "We require at least vs2005 for MemoryBarrier" +#endif + +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; +} + +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { + NoBarrier_AtomicExchange(ptr, value); + // acts as a barrier in this implementation +} + +inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; // works w/o barrier for current Intel chips as of June 2005 + // See comments in Atomic64 version of Release_Store() below. +} + +inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { + return *ptr; +} + +inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) { + Atomic32 value = *ptr; + return value; +} + +inline Atomic32 Release_Load(volatile const Atomic32* ptr) { + MemoryBarrier(); + return *ptr; +} + +#if defined(_WIN64) + +// 64-bit low-level operations on 64-bit platform. 
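Unlike the GCC variants earlier in this patch, the MSVC header holds no inline assembly: it declares the portable primitives and leaves the core compare-and-swap and exchange definitions to a companion platform source file built on the compiler's Interlocked intrinsics. Below is a hedged sketch of what such a definition could look like for the 64-bit compare-and-swap; ExampleCompareAndSwap is an invented name and not part of the vendored header.

#include <intrin.h>

typedef __int64 Atomic64;

// Illustrative only: note the intrinsic argument order, which is
// (destination, exchange, comparand), not (destination, comparand, exchange).
inline Atomic64 ExampleCompareAndSwap(volatile Atomic64* ptr,
                                      Atomic64 old_value,
                                      Atomic64 new_value) {
  // Returns the value *ptr held before the call; it equals old_value
  // exactly when the swap succeeded.
  return _InterlockedCompareExchange64(ptr, new_value, old_value);
}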
+ +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + return Barrier_AtomicIncrement(ptr, increment); +} + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + *ptr = value; +} + +inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) { + NoBarrier_AtomicExchange(ptr, value); + // acts as a barrier in this implementation +} + +inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) { + *ptr = value; // works w/o barrier for current Intel chips as of June 2005 + + // When new chips come out, check: + // IA-32 Intel Architecture Software Developer's Manual, Volume 3: + // System Programming Guide, Chatper 7: Multiple-processor management, + // Section 7.2, Memory Ordering. + // Last seen at: + // http://developer.intel.com/design/pentium4/manuals/index_new.htm +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + return *ptr; +} + +inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) { + Atomic64 value = *ptr; + return value; +} + +inline Atomic64 Release_Load(volatile const Atomic64* ptr) { + MemoryBarrier(); + return *ptr; +} + +inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +#endif // defined(_WIN64) + +} // namespace internal +} // namespace protobuf +} // namespace google + +#endif // GOOGLE_PROTOBUF_ATOMICOPS_INTERNALS_X86_MSVC_H_ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/callback.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/callback.h new file mode 100644 index 00000000..7fcdf884 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/callback.h @@ -0,0 +1,546 @@ +#ifndef GOOGLE_PROTOBUF_STUBS_CALLBACK_H_ +#define GOOGLE_PROTOBUF_STUBS_CALLBACK_H_ + +#include "macros.h" +#include "type_traits.h" + +// =================================================================== +// emulates google3/base/callback.h + +namespace google { +namespace protobuf { + +// Abstract interface for a callback. When calling an RPC, you must provide +// a Closure to call when the procedure completes. See the Service interface +// in service.h. +// +// To automatically construct a Closure which calls a particular function or +// method with a particular set of parameters, use the NewCallback() function. +// Example: +// void FooDone(const FooResponse* response) { +// ... +// } +// +// void CallFoo() { +// ... +// // When done, call FooDone() and pass it a pointer to the response. +// Closure* callback = NewCallback(&FooDone, response); +// // Make the call. +// service->Foo(controller, request, response, callback); +// } +// +// Example that calls a method: +// class Handler { +// public: +// ... +// +// void FooDone(const FooResponse* response) { +// ... +// } +// +// void CallFoo() { +// ... +// // When done, call FooDone() and pass it a pointer to the response. +// Closure* callback = NewCallback(this, &Handler::FooDone, response); +// // Make the call. +// service->Foo(controller, request, response, callback); +// } +// }; +// +// Currently NewCallback() supports binding zero, one, or two arguments. 
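A concrete usage sketch of the two flavors described above; FetchDone and its status code are invented names, and the include path assumes the stubs are laid out as in upstream protobuf.

#include <iostream>
#include <google/protobuf/stubs/callback.h>

using google::protobuf::Closure;
using google::protobuf::NewCallback;
using google::protobuf::NewPermanentCallback;

static void FetchDone(int status) {
  std::cout << "fetch finished, status " << status << std::endl;
}

int main() {
  // Self-deleting closure: intended to be Run() exactly once.
  Closure* once = NewCallback(&FetchDone, 200);
  once->Run();  // calls FetchDone(200), then the closure deletes itself

  // Permanent closure: reusable, and the caller keeps ownership.
  Closure* retry = NewPermanentCallback(&FetchDone, 503);
  retry->Run();
  retry->Run();
  delete retry;  // permanent callbacks must be freed by the caller
  return 0;
}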
+// +// Callbacks created with NewCallback() automatically delete themselves when +// executed. They should be used when a callback is to be called exactly +// once (usually the case with RPC callbacks). If a callback may be called +// a different number of times (including zero), create it with +// NewPermanentCallback() instead. You are then responsible for deleting the +// callback (using the "delete" keyword as normal). +// +// Note that NewCallback() is a bit touchy regarding argument types. Generally, +// the values you provide for the parameter bindings must exactly match the +// types accepted by the callback function. For example: +// void Foo(string s); +// NewCallback(&Foo, "foo"); // WON'T WORK: const char* != string +// NewCallback(&Foo, string("foo")); // WORKS +// Also note that the arguments cannot be references: +// void Foo(const string& s); +// string my_str; +// NewCallback(&Foo, my_str); // WON'T WORK: Can't use referecnes. +// However, correctly-typed pointers will work just fine. +class LIBPROTOBUF_EXPORT Closure { + public: + Closure() {} + virtual ~Closure(); + + virtual void Run() = 0; + + private: + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Closure); +}; + +template +class ResultCallback { + public: + ResultCallback() {} + virtual ~ResultCallback() {} + + virtual R Run() = 0; + + private: + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ResultCallback); +}; + +template +class LIBPROTOBUF_EXPORT ResultCallback1 { + public: + ResultCallback1() {} + virtual ~ResultCallback1() {} + + virtual R Run(A1) = 0; + + private: + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ResultCallback1); +}; + +template +class LIBPROTOBUF_EXPORT ResultCallback2 { + public: + ResultCallback2() {} + virtual ~ResultCallback2() {} + + virtual R Run(A1,A2) = 0; + + private: + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ResultCallback2); +}; + +namespace internal { + +class LIBPROTOBUF_EXPORT FunctionClosure0 : public Closure { + public: + typedef void (*FunctionType)(); + + FunctionClosure0(FunctionType function, bool self_deleting) + : function_(function), self_deleting_(self_deleting) {} + ~FunctionClosure0(); + + void Run() { + bool needs_delete = self_deleting_; // read in case callback deletes + function_(); + if (needs_delete) delete this; + } + + private: + FunctionType function_; + bool self_deleting_; +}; + +template +class MethodClosure0 : public Closure { + public: + typedef void (Class::*MethodType)(); + + MethodClosure0(Class* object, MethodType method, bool self_deleting) + : object_(object), method_(method), self_deleting_(self_deleting) {} + ~MethodClosure0() {} + + void Run() { + bool needs_delete = self_deleting_; // read in case callback deletes + (object_->*method_)(); + if (needs_delete) delete this; + } + + private: + Class* object_; + MethodType method_; + bool self_deleting_; +}; + +template +class FunctionClosure1 : public Closure { + public: + typedef void (*FunctionType)(Arg1 arg1); + + FunctionClosure1(FunctionType function, bool self_deleting, + Arg1 arg1) + : function_(function), self_deleting_(self_deleting), + arg1_(arg1) {} + ~FunctionClosure1() {} + + void Run() { + bool needs_delete = self_deleting_; // read in case callback deletes + function_(arg1_); + if (needs_delete) delete this; + } + + private: + FunctionType function_; + bool self_deleting_; + Arg1 arg1_; +}; + +template +class MethodClosure1 : public Closure { + public: + typedef void (Class::*MethodType)(Arg1 arg1); + + MethodClosure1(Class* object, MethodType method, bool self_deleting, + Arg1 arg1) + : object_(object), method_(method), 
self_deleting_(self_deleting), + arg1_(arg1) {} + ~MethodClosure1() {} + + void Run() { + bool needs_delete = self_deleting_; // read in case callback deletes + (object_->*method_)(arg1_); + if (needs_delete) delete this; + } + + private: + Class* object_; + MethodType method_; + bool self_deleting_; + Arg1 arg1_; +}; + +template +class FunctionClosure2 : public Closure { + public: + typedef void (*FunctionType)(Arg1 arg1, Arg2 arg2); + + FunctionClosure2(FunctionType function, bool self_deleting, + Arg1 arg1, Arg2 arg2) + : function_(function), self_deleting_(self_deleting), + arg1_(arg1), arg2_(arg2) {} + ~FunctionClosure2() {} + + void Run() { + bool needs_delete = self_deleting_; // read in case callback deletes + function_(arg1_, arg2_); + if (needs_delete) delete this; + } + + private: + FunctionType function_; + bool self_deleting_; + Arg1 arg1_; + Arg2 arg2_; +}; + +template +class MethodClosure2 : public Closure { + public: + typedef void (Class::*MethodType)(Arg1 arg1, Arg2 arg2); + + MethodClosure2(Class* object, MethodType method, bool self_deleting, + Arg1 arg1, Arg2 arg2) + : object_(object), method_(method), self_deleting_(self_deleting), + arg1_(arg1), arg2_(arg2) {} + ~MethodClosure2() {} + + void Run() { + bool needs_delete = self_deleting_; // read in case callback deletes + (object_->*method_)(arg1_, arg2_); + if (needs_delete) delete this; + } + + private: + Class* object_; + MethodType method_; + bool self_deleting_; + Arg1 arg1_; + Arg2 arg2_; +}; + +template +class FunctionResultCallback_0_0 : public ResultCallback { + public: + typedef R (*FunctionType)(); + + FunctionResultCallback_0_0(FunctionType function, bool self_deleting) + : function_(function), self_deleting_(self_deleting) {} + ~FunctionResultCallback_0_0() {} + + R Run() { + bool needs_delete = self_deleting_; // read in case callback deletes + R result = function_(); + if (needs_delete) delete this; + return result; + } + + private: + FunctionType function_; + bool self_deleting_; +}; + +template +class FunctionResultCallback_1_0 : public ResultCallback { + public: + typedef R (*FunctionType)(P1); + + FunctionResultCallback_1_0(FunctionType function, bool self_deleting, + P1 p1) + : function_(function), self_deleting_(self_deleting), p1_(p1) {} + ~FunctionResultCallback_1_0() {} + + R Run() { + bool needs_delete = self_deleting_; // read in case callback deletes + R result = function_(p1_); + if (needs_delete) delete this; + return result; + } + + private: + FunctionType function_; + bool self_deleting_; + P1 p1_; +}; + +template +class FunctionResultCallback_0_1 : public ResultCallback1 { + public: + typedef R (*FunctionType)(Arg1 arg1); + + FunctionResultCallback_0_1(FunctionType function, bool self_deleting) + : function_(function), self_deleting_(self_deleting) {} + ~FunctionResultCallback_0_1() {} + + R Run(Arg1 a1) { + bool needs_delete = self_deleting_; // read in case callback deletes + R result = function_(a1); + if (needs_delete) delete this; + return result; + } + + private: + FunctionType function_; + bool self_deleting_; +}; + +template +class FunctionResultCallback_1_1 : public ResultCallback1 { + public: + typedef R (*FunctionType)(P1, A1); + + FunctionResultCallback_1_1(FunctionType function, bool self_deleting, + P1 p1) + : function_(function), self_deleting_(self_deleting), p1_(p1) {} + ~FunctionResultCallback_1_1() {} + + R Run(A1 a1) { + bool needs_delete = self_deleting_; // read in case callback deletes + R result = function_(p1_, a1); + if (needs_delete) delete this; + return 
+  }
+
+ private:
+  FunctionType function_;
+  bool self_deleting_;
+  P1 p1_;
+};
+
+template <typename T>
+struct InternalConstRef {
+  typedef typename remove_reference<T>::type base_type;
+  typedef const base_type& type;
+};
+
+template <typename R, typename T, typename P1, typename P2, typename P3,
+          typename P4, typename P5, typename A1, typename A2>
+class MethodResultCallback_5_2 : public ResultCallback2<R, A1, A2> {
+ public:
+  typedef R (T::*MethodType)(P1, P2, P3, P4, P5, A1, A2);
+  MethodResultCallback_5_2(T* object, MethodType method, bool self_deleting,
+                           P1 p1, P2 p2, P3 p3, P4 p4, P5 p5)
+      : object_(object),
+        method_(method),
+        self_deleting_(self_deleting),
+        p1_(p1),
+        p2_(p2),
+        p3_(p3),
+        p4_(p4),
+        p5_(p5) {}
+  ~MethodResultCallback_5_2() {}
+
+  R Run(A1 a1, A2 a2) {
+    bool needs_delete = self_deleting_;
+    R result = (object_->*method_)(p1_, p2_, p3_, p4_, p5_, a1, a2);
+    if (needs_delete) delete this;
+    return result;
+  }
+
+ private:
+  T* object_;
+  MethodType method_;
+  bool self_deleting_;
+  typename remove_reference<P1>::type p1_;
+  typename remove_reference<P2>::type p2_;
+  typename remove_reference<P3>::type p3_;
+  typename remove_reference<P4>::type p4_;
+  typename remove_reference<P5>::type p5_;
+};
+
+}  // namespace internal
+
+// See Closure.
+inline Closure* NewCallback(void (*function)()) {
+  return new internal::FunctionClosure0(function, true);
+}
+
+// See Closure.
+inline Closure* NewPermanentCallback(void (*function)()) {
+  return new internal::FunctionClosure0(function, false);
+}
+
+// See Closure.
+template <typename Class>
+inline Closure* NewCallback(Class* object, void (Class::*method)()) {
+  return new internal::MethodClosure0<Class>(object, method, true);
+}
+
+// See Closure.
+template <typename Class>
+inline Closure* NewPermanentCallback(Class* object, void (Class::*method)()) {
+  return new internal::MethodClosure0<Class>(object, method, false);
+}
+
+// See Closure.
+template <typename Arg1>
+inline Closure* NewCallback(void (*function)(Arg1),
+                            Arg1 arg1) {
+  return new internal::FunctionClosure1<Arg1>(function, true, arg1);
+}
+
+// See Closure.
+template <typename Arg1>
+inline Closure* NewPermanentCallback(void (*function)(Arg1),
+                                     Arg1 arg1) {
+  return new internal::FunctionClosure1<Arg1>(function, false, arg1);
+}
+
+// See Closure.
+template <typename Class, typename Arg1>
+inline Closure* NewCallback(Class* object, void (Class::*method)(Arg1),
+                            Arg1 arg1) {
+  return new internal::MethodClosure1<Class, Arg1>(object, method, true, arg1);
+}
+
+// See Closure.
+template <typename Class, typename Arg1>
+inline Closure* NewPermanentCallback(Class* object, void (Class::*method)(Arg1),
+                                     Arg1 arg1) {
+  return new internal::MethodClosure1<Class, Arg1>(object, method, false, arg1);
+}
+
+// See Closure.
+template <typename Arg1, typename Arg2>
+inline Closure* NewCallback(void (*function)(Arg1, Arg2),
+                            Arg1 arg1, Arg2 arg2) {
+  return new internal::FunctionClosure2<Arg1, Arg2>(
+    function, true, arg1, arg2);
+}
+
+// See Closure.
+template <typename Arg1, typename Arg2>
+inline Closure* NewPermanentCallback(void (*function)(Arg1, Arg2),
+                                     Arg1 arg1, Arg2 arg2) {
+  return new internal::FunctionClosure2<Arg1, Arg2>(
+    function, false, arg1, arg2);
+}
+
+// See Closure.
+template <typename Class, typename Arg1, typename Arg2>
+inline Closure* NewCallback(Class* object, void (Class::*method)(Arg1, Arg2),
+                            Arg1 arg1, Arg2 arg2) {
+  return new internal::MethodClosure2<Class, Arg1, Arg2>(
+    object, method, true, arg1, arg2);
+}
+
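+// Example usage of the factories above (illustrative only; Done and
+// MyHandler are hypothetical names, not part of this library):
+//   void Done();
+//   class MyHandler {
+//    public:
+//     void OnEvent(int code);
+//   };
+//
+//   Closure* once = NewCallback(&Done);
+//   once->Run();   // runs Done() and then deletes itself
+//
+//   MyHandler handler;
+//   Closure* many = NewPermanentCallback(&handler, &MyHandler::OnEvent, 42);
+//   many->Run();   // can be run repeatedly...
+//   many->Run();
+//   delete many;   // ...but must be deleted manually
+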
+// See Closure.
+template <typename Class, typename Arg1, typename Arg2>
+inline Closure* NewPermanentCallback(
+    Class* object, void (Class::*method)(Arg1, Arg2),
+    Arg1 arg1, Arg2 arg2) {
+  return new internal::MethodClosure2<Class, Arg1, Arg2>(
+    object, method, false, arg1, arg2);
+}
+
+// See ResultCallback
+template<typename R>
+inline ResultCallback<R>* NewCallback(R (*function)()) {
+  return new internal::FunctionResultCallback_0_0<R>(function, true);
+}
+
+// See ResultCallback
+template<typename R>
+inline ResultCallback<R>* NewPermanentCallback(R (*function)()) {
+  return new internal::FunctionResultCallback_0_0<R>(function, false);
+}
+
+// See ResultCallback
+template<typename R, typename P1>
+inline ResultCallback<R>* NewCallback(R (*function)(P1), P1 p1) {
+  return new internal::FunctionResultCallback_1_0<R, P1>(
+      function, true, p1);
+}
+
+// See ResultCallback
+template<typename R, typename P1>
+inline ResultCallback<R>* NewPermanentCallback(
+    R (*function)(P1), P1 p1) {
+  return new internal::FunctionResultCallback_1_0<R, P1>(
+      function, false, p1);
+}
+
+// See ResultCallback1
+template<typename R, typename A1>
+inline ResultCallback1<R, A1>* NewCallback(R (*function)(A1)) {
+  return new internal::FunctionResultCallback_0_1<R, A1>(function, true);
+}
+
+// See ResultCallback1
+template<typename R, typename A1>
+inline ResultCallback1<R, A1>* NewPermanentCallback(R (*function)(A1)) {
+  return new internal::FunctionResultCallback_0_1<R, A1>(function, false);
+}
+
+// See ResultCallback1
+template<typename R, typename P1, typename A1>
+inline ResultCallback1<R, A1>* NewCallback(R (*function)(P1, A1), P1 p1) {
+  return new internal::FunctionResultCallback_1_1<R, P1, A1>(
+      function, true, p1);
+}
+
+// See ResultCallback1
+template<typename R, typename P1, typename A1>
+inline ResultCallback1<R, A1>* NewPermanentCallback(
+    R (*function)(P1, A1), P1 p1) {
+  return new internal::FunctionResultCallback_1_1<R, P1, A1>(
+      function, false, p1);
+}
+
+// See MethodResultCallback_5_2
+template <typename R, typename T, typename P1, typename P2, typename P3,
+          typename P4, typename P5, typename A1, typename A2>
+inline ResultCallback2<R, A1, A2>* NewPermanentCallback(
+    T* object, R (T::*function)(P1, P2, P3, P4, P5, A1, A2),
+    typename internal::InternalConstRef<P1>::type p1,
+    typename internal::InternalConstRef<P2>::type p2,
+    typename internal::InternalConstRef<P3>::type p3,
+    typename internal::InternalConstRef<P4>::type p4,
+    typename internal::InternalConstRef<P5>::type p5) {
+  return new internal::MethodResultCallback_5_2<R, T, P1, P2, P3, P4, P5, A1,
+                                                A2>(object, function, false,
+                                                    p1, p2, p3, p4, p5);
+}
+
+// A function which does nothing.  Useful for creating no-op callbacks, e.g.:
+//   Closure* nothing = NewCallback(&DoNothing);
+void LIBPROTOBUF_EXPORT DoNothing();
+
+
+}  // namespace protobuf
+}  // namespace google
+
+#endif  // GOOGLE_PROTOBUF_STUBS_CALLBACK_H_
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/casts.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/casts.h
new file mode 100644
index 00000000..b38bd290
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/casts.h
@@ -0,0 +1,133 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2014 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLE_PROTOBUF_CASTS_H__
+#define GOOGLE_PROTOBUF_CASTS_H__
+
+#include "common.h"
+#include "type_traits.h"
+
+namespace google {
+namespace protobuf {
+namespace internal {
+// Use implicit_cast as a safe version of static_cast or const_cast
+// for upcasting in the type hierarchy (i.e. casting a pointer to Foo
+// to a pointer to SuperclassOfFoo or casting a pointer to Foo to
+// a const pointer to Foo).
+// When you use implicit_cast, the compiler checks that the cast is safe.
+// Such explicit implicit_casts are necessary in surprisingly many
+// situations where C++ demands an exact type match instead of an
+// argument type convertible to a target type.
+//
+// The From type can be inferred, so the preferred syntax for using
+// implicit_cast is the same as for static_cast etc.:
+//
+//   implicit_cast<ToType>(expr)
+//
+// implicit_cast would have been part of the C++ standard library,
+// but the proposal was submitted too late.  It will probably make
+// its way into the language in the future.
+template<typename To, typename From>
+inline To implicit_cast(From const &f) {
+  return f;
+}
+
+// When you upcast (that is, cast a pointer from type Foo to type
+// SuperclassOfFoo), it's fine to use implicit_cast<>, since upcasts
+// always succeed.  When you downcast (that is, cast a pointer from
+// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because
+// how do you know the pointer is really of type SubclassOfFoo?  It
+// could be a bare Foo, or of type DifferentSubclassOfFoo.  Thus,
+// when you downcast, you should use this macro.  In debug mode, we
+// use dynamic_cast<> to double-check the downcast is legal (we die
+// if it's not).  In normal mode, we do the efficient static_cast<>
+// instead.  Thus, it's important to test in debug mode to make sure
+// the cast is legal!
+//    This is the only place in the code we should use dynamic_cast<>.
+// In particular, you SHOULDN'T be using dynamic_cast<> in order to
+// do RTTI (eg code like this:
+//    if (dynamic_cast<Subclass1>(foo)) HandleASubclass1Object(foo);
+//    if (dynamic_cast<Subclass2>(foo)) HandleASubclass2Object(foo);
+// You should design the code some other way not to need this.
+
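+// An illustrative sketch of the casts above (Foo and Bar are hypothetical
+// types, not part of this library):
+//   class Foo { public: virtual ~Foo() {} };
+//   class Bar : public Foo {};
+//
+//   Bar bar;
+//   Foo* base = implicit_cast<Foo*>(&bar);  // upcast: always safe
+//   Bar* derived = down_cast<Bar*>(base);   // downcast: checked in debug mode
+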
+template<typename To, typename From>     // use like this: down_cast<T*>(foo);
+inline To down_cast(From* f) {                   // so we only accept pointers
+  // Ensures that To is a sub-type of From *.  This test is here only
+  // for compile-time type checking, and has no overhead in an
+  // optimized build at run-time, as it will be optimized away
+  // completely.
+  if (false) {
+    implicit_cast<From*, To>(0);
+  }
+
+#if !defined(NDEBUG) && !defined(GOOGLE_PROTOBUF_NO_RTTI)
+  assert(f == NULL || dynamic_cast<To>(f) != NULL);  // RTTI: debug mode only!
+#endif
+  return static_cast<To>(f);
+}
+
+template<typename To, typename From>    // use like this: down_cast<T&>(foo);
+inline To down_cast(From& f) {
+  typedef typename remove_reference<To>::type* ToAsPointer;
+  // Ensures that To is a sub-type of From *.  This test is here only
+  // for compile-time type checking, and has no overhead in an
+  // optimized build at run-time, as it will be optimized away
+  // completely.
+  if (false) {
+    implicit_cast<From*, ToAsPointer>(0);
+  }
+
+#if !defined(NDEBUG) && !defined(GOOGLE_PROTOBUF_NO_RTTI)
+  // RTTI: debug mode only!
+  assert(dynamic_cast<ToAsPointer>(&f) != NULL);
+#endif
+  return *static_cast<ToAsPointer>(&f);
+}
+
+template<typename To, typename From>
+inline To bit_cast(const From& from) {
+  GOOGLE_COMPILE_ASSERT(sizeof(From) == sizeof(To),
+                        bit_cast_with_different_sizes);
+  To dest;
+  memcpy(&dest, &from, sizeof(dest));
+  return dest;
+}
+
+}  // namespace internal
+
+// We made these internal so that they would show up as such in the docs,
+// but we don't want to stick "internal::" in front of them everywhere.
+using internal::implicit_cast;
+using internal::down_cast;
+using internal::bit_cast;
+
+}  // namespace protobuf
+}  // namespace google
+#endif  // GOOGLE_PROTOBUF_CASTS_H__
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/common.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/common.h
new file mode 100644
index 00000000..7b486b3a
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/common.h
@@ -0,0 +1,225 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Author: kenton@google.com (Kenton Varda) and others
+//
+// Contains basic types and utilities used by the rest of the library.
+
+#ifndef GOOGLE_PROTOBUF_COMMON_H__
+#define GOOGLE_PROTOBUF_COMMON_H__
+
+#include <string>
+
+#include "port.h"
+#include "macros.h"
+#include "platform_macros.h"
+
+// TODO(liujisi): Remove the following includes after the include clean-up.
+#include "logging.h"
+#include "scoped_ptr.h"
+#include "mutex.h"
+#include "callback.h"
+
+#ifndef PROTOBUF_USE_EXCEPTIONS
+#if defined(_MSC_VER) && defined(_CPPUNWIND)
+  #define PROTOBUF_USE_EXCEPTIONS 1
+#elif defined(__EXCEPTIONS)
+  #define PROTOBUF_USE_EXCEPTIONS 1
+#else
+  #define PROTOBUF_USE_EXCEPTIONS 0
+#endif
+#endif
+
+#if PROTOBUF_USE_EXCEPTIONS
+#include <exception>
+#endif
+#if defined(__APPLE__)
+#include <TargetConditionals.h>  // for TARGET_OS_IPHONE
+#endif
+
+#if defined(__ANDROID__) || defined(GOOGLE_PROTOBUF_OS_ANDROID) || (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE) || defined(GOOGLE_PROTOBUF_OS_IPHONE)
+#include <pthread.h>
+#endif
+
+#if defined(_WIN32) && defined(GetMessage)
+// Allow GetMessage to be used as a valid method name in protobuf classes.
+// windows.h defines GetMessage() as a macro.  Let's re-define it as an inline
+// function.  The inline function should be equivalent for C++ users.
+inline BOOL GetMessage_Win32(
+    LPMSG lpMsg, HWND hWnd,
+    UINT wMsgFilterMin, UINT wMsgFilterMax) {
+  return GetMessage(lpMsg, hWnd, wMsgFilterMin, wMsgFilterMax);
+}
+#undef GetMessage
+inline BOOL GetMessage(
+    LPMSG lpMsg, HWND hWnd,
+    UINT wMsgFilterMin, UINT wMsgFilterMax) {
+  return GetMessage_Win32(lpMsg, hWnd, wMsgFilterMin, wMsgFilterMax);
+}
+#endif
+
+namespace std {}
+
+namespace google {
+namespace protobuf {
+namespace internal {
+
+// Some of these constants are macros rather than const ints so that they can
+// be used in #if directives.
+
+// The current version, represented as a single integer to make comparison
+// easier:  major * 10^6 + minor * 10^3 + micro
+#define GOOGLE_PROTOBUF_VERSION 3001000
+
+// The minimum library version which works with the current version of the
+// headers.
+#define GOOGLE_PROTOBUF_MIN_LIBRARY_VERSION 3001000
+
+// The minimum header version which works with the current version of
+// the library.  This constant should only be used by protoc's C++ code
+// generator.
+static const int kMinHeaderVersionForLibrary = 3001000;
+
+// The minimum protoc version which works with the current version of the
+// headers.
+#define GOOGLE_PROTOBUF_MIN_PROTOC_VERSION 3001000
+
+// The minimum header version which works with the current version of
+// protoc.  This constant should only be used in VerifyVersion().
+static const int kMinHeaderVersionForProtoc = 3001000;
+
+// Verifies that the headers and libraries are compatible.  Use the macro
+// below to call this.
+void LIBPROTOBUF_EXPORT VerifyVersion(int headerVersion, int minLibraryVersion,
+                                      const char* filename);
+
+// Converts a numeric version number to a string.
+std::string LIBPROTOBUF_EXPORT VersionString(int version);
+
+}  // namespace internal
+
+// Place this macro in your main() function (or somewhere before you attempt
+// to use the protobuf library) to verify that the version you link against
+// matches the headers you compiled against.  If a version mismatch is
+// detected, the process will abort.
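+// For example (illustrative sketch):
+//   int main(int argc, char* argv[]) {
+//     GOOGLE_PROTOBUF_VERIFY_VERSION;
+//     // ... use the protobuf library ...
+//     return 0;
+//   }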
+#define GOOGLE_PROTOBUF_VERIFY_VERSION                                    \
+  ::google::protobuf::internal::VerifyVersion(                            \
+    GOOGLE_PROTOBUF_VERSION, GOOGLE_PROTOBUF_MIN_LIBRARY_VERSION,         \
+    __FILE__)
+
+
+// ===================================================================
+// from google3/util/utf8/public/unilib.h
+
+class StringPiece;
+namespace internal {
+
+// Checks if the buffer contains structurally-valid UTF-8.  Implemented in
+// structurally_valid.cc.
+LIBPROTOBUF_EXPORT bool IsStructurallyValidUTF8(const char* buf, int len);
+
+inline bool IsStructurallyValidUTF8(const std::string& str) {
+  return IsStructurallyValidUTF8(str.data(), static_cast<int>(str.length()));
+}
+
+// Returns initial number of bytes of structurally valid UTF-8.
+LIBPROTOBUF_EXPORT int UTF8SpnStructurallyValid(const StringPiece& str);
+
+// Coerce UTF-8 byte string in src_str to be
+// a structurally-valid equal-length string by selectively
+// overwriting illegal bytes with replace_char (typically ' ' or '?').
+// replace_char must be legal printable 7-bit ASCII 0x20..0x7e.
+// src_str is read-only.
+//
+// Returns pointer to output buffer, src_str.data() if no changes were made,
+// or dst if some bytes were changed.  dst is allocated by the caller
+// and must be at least as big as src_str.
+//
+// Optimized for: all structurally valid and no byte copying is done.
+//
+LIBPROTOBUF_EXPORT char* UTF8CoerceToStructurallyValid(
+    const StringPiece& str, char* dst, char replace_char);
+
+}  // namespace internal
+
+
+// ===================================================================
+// Shutdown support.
+
+// Shut down the entire protocol buffers library, deleting all static-duration
+// objects allocated by the library or by generated .pb.cc files.
+//
+// There are two reasons you might want to call this:
+// * You use a draconian definition of "memory leak" in which you expect
+//   every single malloc() to have a corresponding free(), even for objects
+//   which live until program exit.
+// * You are writing a dynamically-loaded library which needs to clean up
+//   after itself when the library is unloaded.
+//
+// It is safe to call this multiple times.  However, it is not safe to use
+// any other part of the protocol buffers library after
+// ShutdownProtobufLibrary() has been called.
+LIBPROTOBUF_EXPORT void ShutdownProtobufLibrary();
+
+namespace internal {
+
+// Register a function to be called when ShutdownProtobufLibrary() is called.
+LIBPROTOBUF_EXPORT void OnShutdown(void (*func)());
+
+}  // namespace internal
+
+#if PROTOBUF_USE_EXCEPTIONS
+class FatalException : public std::exception {
+ public:
+  FatalException(const char* filename, int line, const std::string& message)
+      : filename_(filename), line_(line), message_(message) {}
+  virtual ~FatalException() throw();
+
+  virtual const char* what() const throw();
+
+  const char* filename() const { return filename_; }
+  int line() const { return line_; }
+  const std::string& message() const { return message_; }
+
+ private:
+  const char* filename_;
+  const int line_;
+  const std::string message_;
+};
+#endif
+
+// This is at the end of the file instead of the beginning to work around a bug
+// in some versions of MSVC.
+using namespace std;  // Don't do this at home, kids.
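+
+// An illustrative teardown sequence for leak-checked builds (sketch only):
+//   int main() {
+//     GOOGLE_PROTOBUF_VERIFY_VERSION;
+//     // ... build and use messages ...
+//     google::protobuf::ShutdownProtobufLibrary();  // free library singletons
+//     return 0;
+//   }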
+
+}  // namespace protobuf
+}  // namespace google
+
+#endif  // GOOGLE_PROTOBUF_COMMON_H__
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/fastmem.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/fastmem.h
new file mode 100644
index 00000000..eb025fab
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/fastmem.h
@@ -0,0 +1,152 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2014 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Fast memory copying and comparison routines.
+//   strings::fastmemcmp_inlined() replaces memcmp()
+//   strings::memcpy_inlined() replaces memcpy()
+//   strings::memeq(a, b, n) replaces memcmp(a, b, n) == 0
+//
+// strings::*_inlined() routines are inline versions of the
+// routines exported by this module.  Sometimes using the inlined
+// versions is faster.  Measure before using the inlined versions.
+//
+// Performance measurement:
+//   strings::fastmemcmp_inlined
+//   Analysis: memcmp, fastmemcmp_inlined, fastmemcmp
+//   2012-01-30
+
+#ifndef GOOGLE_PROTOBUF_STUBS_FASTMEM_H_
+#define GOOGLE_PROTOBUF_STUBS_FASTMEM_H_
+
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "common.h"
+
+namespace google {
+namespace protobuf {
+namespace internal {
+
+// Return true if the n bytes at a equal the n bytes at b.
+// The regions are allowed to overlap.
+//
+// The performance is similar to the performance of memcmp(), but faster for
+// moderately-sized inputs, or inputs that share a common prefix and differ
+// somewhere in their last 8 bytes.  Further optimizations can be added later
+// if it makes sense to do so.
+inline bool memeq(const char* a, const char* b, size_t n) {
+  size_t n_rounded_down = n & ~static_cast<size_t>(7);
+  if (GOOGLE_PREDICT_FALSE(n_rounded_down == 0)) {  // n <= 7
+    return memcmp(a, b, n) == 0;
+  }
+  // n >= 8
+  uint64 u = GOOGLE_UNALIGNED_LOAD64(a) ^ GOOGLE_UNALIGNED_LOAD64(b);
+  uint64 v = GOOGLE_UNALIGNED_LOAD64(a + n - 8) ^ GOOGLE_UNALIGNED_LOAD64(b + n - 8);
+  if ((u | v) != 0) {  // The first or last 8 bytes differ.
+    return false;
+  }
+  a += 8;
+  b += 8;
+  n = n_rounded_down - 8;
+  if (n > 128) {
+    // As of 2012, memcmp on x86-64 uses a big unrolled loop with SSE2
+    // instructions, and while we could try to do something faster, it
+    // doesn't seem worth pursuing.
+    return memcmp(a, b, n) == 0;
+  }
+  for (; n >= 16; n -= 16) {
+    uint64 x = GOOGLE_UNALIGNED_LOAD64(a) ^ GOOGLE_UNALIGNED_LOAD64(b);
+    uint64 y = GOOGLE_UNALIGNED_LOAD64(a + 8) ^ GOOGLE_UNALIGNED_LOAD64(b + 8);
+    if ((x | y) != 0) {
+      return false;
+    }
+    a += 16;
+    b += 16;
+  }
+  // n must be 0 or 8 now because it was a multiple of 8 at the top of the loop.
+  return n == 0 || GOOGLE_UNALIGNED_LOAD64(a) == GOOGLE_UNALIGNED_LOAD64(b);
+}
+
+inline int fastmemcmp_inlined(const char *a, const char *b, size_t n) {
+  if (n >= 64) {
+    return memcmp(a, b, n);
+  }
+  const char* a_limit = a + n;
+  while (a + sizeof(uint64) <= a_limit &&
+         GOOGLE_UNALIGNED_LOAD64(a) == GOOGLE_UNALIGNED_LOAD64(b)) {
+    a += sizeof(uint64);
+    b += sizeof(uint64);
+  }
+  if (a + sizeof(uint32) <= a_limit &&
+      GOOGLE_UNALIGNED_LOAD32(a) == GOOGLE_UNALIGNED_LOAD32(b)) {
+    a += sizeof(uint32);
+    b += sizeof(uint32);
+  }
+  while (a < a_limit) {
+    int d = static_cast<uint32>(*a++) - static_cast<uint32>(*b++);
+    if (d) return d;
+  }
+  return 0;
+}
+
+// The standard memcpy operation is slow for variable small sizes.
+// This implementation inlines the optimal realization for sizes 1 to 16.
+// To avoid code bloat, don't use it in spots that are not
+// performance-critical, nor when you don't expect very frequent
+// values of size <= 16.
+inline void memcpy_inlined(char *dst, const char *src, size_t size) {
+  // Compiler inlines code with minimal amount of data movement when third
+  // parameter of memcpy is a constant.
+  switch (size) {
+    case 1: memcpy(dst, src, 1); break;
+    case 2: memcpy(dst, src, 2); break;
+    case 3: memcpy(dst, src, 3); break;
+    case 4: memcpy(dst, src, 4); break;
+    case 5: memcpy(dst, src, 5); break;
+    case 6: memcpy(dst, src, 6); break;
+    case 7: memcpy(dst, src, 7); break;
+    case 8: memcpy(dst, src, 8); break;
+    case 9: memcpy(dst, src, 9); break;
+    case 10: memcpy(dst, src, 10); break;
+    case 11: memcpy(dst, src, 11); break;
+    case 12: memcpy(dst, src, 12); break;
+    case 13: memcpy(dst, src, 13); break;
+    case 14: memcpy(dst, src, 14); break;
+    case 15: memcpy(dst, src, 15); break;
+    case 16: memcpy(dst, src, 16); break;
+    default: memcpy(dst, src, size); break;
+  }
+}
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#endif  // GOOGLE_PROTOBUF_STUBS_FASTMEM_H_
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/logging.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/logging.h
new file mode 100644
index 00000000..0deec170
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/logging.h
@@ -0,0 +1,237 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLE_PROTOBUF_STUBS_LOGGING_H_ +#define GOOGLE_PROTOBUF_STUBS_LOGGING_H_ + +#include "macros.h" +#include "port.h" + +// =================================================================== +// emulates google3/base/logging.h + +namespace google { +namespace protobuf { + +enum LogLevel { + LOGLEVEL_INFO, // Informational. This is never actually used by + // libprotobuf. + LOGLEVEL_WARNING, // Warns about issues that, although not technically a + // problem now, could cause problems in the future. For + // example, a // warning will be printed when parsing a + // message that is near the message size limit. + LOGLEVEL_ERROR, // An error occurred which should never happen during + // normal use. + LOGLEVEL_FATAL, // An error occurred from which the library cannot + // recover. This usually indicates a programming error + // in the code which calls the library, especially when + // compiled in debug mode. 
+ +#ifdef NDEBUG + LOGLEVEL_DFATAL = LOGLEVEL_ERROR +#else + LOGLEVEL_DFATAL = LOGLEVEL_FATAL +#endif +}; + +class StringPiece; +namespace util { +class Status; +} +class uint128; +namespace internal { + +class LogFinisher; + +class LIBPROTOBUF_EXPORT LogMessage { + public: + LogMessage(LogLevel level, const char* filename, int line); + ~LogMessage(); + + LogMessage& operator<<(const std::string& value); + LogMessage& operator<<(const char* value); + LogMessage& operator<<(char value); + LogMessage& operator<<(int value); + LogMessage& operator<<(uint value); + LogMessage& operator<<(long value); + LogMessage& operator<<(unsigned long value); + LogMessage& operator<<(long long value); + LogMessage& operator<<(unsigned long long value); + LogMessage& operator<<(double value); + LogMessage& operator<<(void* value); + LogMessage& operator<<(const StringPiece& value); + LogMessage& operator<<(const ::google::protobuf::util::Status& status); + LogMessage& operator<<(const uint128& value); + + private: + friend class LogFinisher; + void Finish(); + + LogLevel level_; + const char* filename_; + int line_; + std::string message_; +}; + +// Used to make the entire "LOG(BLAH) << etc." expression have a void return +// type and print a newline after each message. +class LIBPROTOBUF_EXPORT LogFinisher { + public: + void operator=(LogMessage& other); +}; + +template +bool IsOk(T status) { return status.ok(); } +template<> +inline bool IsOk(bool status) { return status; } + +} // namespace internal + +// Undef everything in case we're being mixed with some other Google library +// which already defined them itself. Presumably all Google libraries will +// support the same syntax for these so it should not be a big deal if they +// end up using our definitions instead. +#undef GOOGLE_LOG +#undef GOOGLE_LOG_IF + +#undef GOOGLE_CHECK +#undef GOOGLE_CHECK_OK +#undef GOOGLE_CHECK_EQ +#undef GOOGLE_CHECK_NE +#undef GOOGLE_CHECK_LT +#undef GOOGLE_CHECK_LE +#undef GOOGLE_CHECK_GT +#undef GOOGLE_CHECK_GE +#undef GOOGLE_CHECK_NOTNULL + +#undef GOOGLE_DLOG +#undef GOOGLE_DCHECK +#undef GOOGLE_DCHECK_OK +#undef GOOGLE_DCHECK_EQ +#undef GOOGLE_DCHECK_NE +#undef GOOGLE_DCHECK_LT +#undef GOOGLE_DCHECK_LE +#undef GOOGLE_DCHECK_GT +#undef GOOGLE_DCHECK_GE + +#define GOOGLE_LOG(LEVEL) \ + ::google::protobuf::internal::LogFinisher() = \ + ::google::protobuf::internal::LogMessage( \ + ::google::protobuf::LOGLEVEL_##LEVEL, __FILE__, __LINE__) +#define GOOGLE_LOG_IF(LEVEL, CONDITION) \ + !(CONDITION) ? 
(void)0 : GOOGLE_LOG(LEVEL) + +#define GOOGLE_CHECK(EXPRESSION) \ + GOOGLE_LOG_IF(FATAL, !(EXPRESSION)) << "CHECK failed: " #EXPRESSION ": " +#define GOOGLE_CHECK_OK(A) GOOGLE_CHECK(::google::protobuf::internal::IsOk(A)) +#define GOOGLE_CHECK_EQ(A, B) GOOGLE_CHECK((A) == (B)) +#define GOOGLE_CHECK_NE(A, B) GOOGLE_CHECK((A) != (B)) +#define GOOGLE_CHECK_LT(A, B) GOOGLE_CHECK((A) < (B)) +#define GOOGLE_CHECK_LE(A, B) GOOGLE_CHECK((A) <= (B)) +#define GOOGLE_CHECK_GT(A, B) GOOGLE_CHECK((A) > (B)) +#define GOOGLE_CHECK_GE(A, B) GOOGLE_CHECK((A) >= (B)) + +namespace internal { +template +T* CheckNotNull(const char* /* file */, int /* line */, + const char* name, T* val) { + if (val == NULL) { + GOOGLE_LOG(FATAL) << name; + } + return val; +} +} // namespace internal +#define GOOGLE_CHECK_NOTNULL(A) \ + ::google::protobuf::internal::CheckNotNull(\ + __FILE__, __LINE__, "'" #A "' must not be NULL", (A)) + +#ifdef NDEBUG + +#define GOOGLE_DLOG(LEVEL) GOOGLE_LOG_IF(LEVEL, false) + +#define GOOGLE_DCHECK(EXPRESSION) while(false) GOOGLE_CHECK(EXPRESSION) +#define GOOGLE_DCHECK_OK(E) GOOGLE_DCHECK(::google::protobuf::internal::IsOk(E)) +#define GOOGLE_DCHECK_EQ(A, B) GOOGLE_DCHECK((A) == (B)) +#define GOOGLE_DCHECK_NE(A, B) GOOGLE_DCHECK((A) != (B)) +#define GOOGLE_DCHECK_LT(A, B) GOOGLE_DCHECK((A) < (B)) +#define GOOGLE_DCHECK_LE(A, B) GOOGLE_DCHECK((A) <= (B)) +#define GOOGLE_DCHECK_GT(A, B) GOOGLE_DCHECK((A) > (B)) +#define GOOGLE_DCHECK_GE(A, B) GOOGLE_DCHECK((A) >= (B)) + +#else // NDEBUG + +#define GOOGLE_DLOG GOOGLE_LOG + +#define GOOGLE_DCHECK GOOGLE_CHECK +#define GOOGLE_DCHECK_OK GOOGLE_CHECK_OK +#define GOOGLE_DCHECK_EQ GOOGLE_CHECK_EQ +#define GOOGLE_DCHECK_NE GOOGLE_CHECK_NE +#define GOOGLE_DCHECK_LT GOOGLE_CHECK_LT +#define GOOGLE_DCHECK_LE GOOGLE_CHECK_LE +#define GOOGLE_DCHECK_GT GOOGLE_CHECK_GT +#define GOOGLE_DCHECK_GE GOOGLE_CHECK_GE + +#endif // !NDEBUG + +typedef void LogHandler(LogLevel level, const char* filename, int line, + const std::string& message); + +// The protobuf library sometimes writes warning and error messages to +// stderr. These messages are primarily useful for developers, but may +// also help end users figure out a problem. If you would prefer that +// these messages be sent somewhere other than stderr, call SetLogHandler() +// to set your own handler. This returns the old handler. Set the handler +// to NULL to ignore log messages (but see also LogSilencer, below). +// +// Obviously, SetLogHandler is not thread-safe. You should only call it +// at initialization time, and probably not from library code. If you +// simply want to suppress log messages temporarily (e.g. because you +// have some code that tends to trigger them frequently and you know +// the warnings are not important to you), use the LogSilencer class +// below. +LIBPROTOBUF_EXPORT LogHandler* SetLogHandler(LogHandler* new_func); + +// Create a LogSilencer if you want to temporarily suppress all log +// messages. As long as any LogSilencer objects exist, non-fatal +// log messages will be discarded (the current LogHandler will *not* +// be called). Constructing a LogSilencer is thread-safe. You may +// accidentally suppress log messages occurring in another thread, but +// since messages are generally for debugging purposes only, this isn't +// a big deal. If you want to intercept log messages, use SetLogHandler(). 
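+//
+// For example (illustrative sketch; ParseManyMalformedMessages is a
+// hypothetical noisy function):
+//   LogHandler* old_handler = SetLogHandler(NULL);  // discard all messages
+//   ParseManyMalformedMessages();
+//   SetLogHandler(old_handler);                     // restore prior handler
+//
+// or, with RAII:
+//   {
+//     LogSilencer silencer;  // non-fatal logs are dropped within this scope
+//     ParseManyMalformedMessages();
+//   }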
+class LIBPROTOBUF_EXPORT LogSilencer { + public: + LogSilencer(); + ~LogSilencer(); +}; + +} // namespace protobuf +} // namespace google + +#endif // GOOGLE_PROTOBUF_STUBS_LOGGING_H_ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/macros.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/macros.h new file mode 100644 index 00000000..90d254ed --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/macros.h @@ -0,0 +1,168 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLE_PROTOBUF_MACROS_H__ +#define GOOGLE_PROTOBUF_MACROS_H__ + +#include "port.h" + +namespace google { +namespace protobuf { + +#undef GOOGLE_DISALLOW_EVIL_CONSTRUCTORS +#define GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(TypeName) \ + TypeName(const TypeName&); \ + void operator=(const TypeName&) + +#undef GOOGLE_DISALLOW_IMPLICIT_CONSTRUCTORS +#define GOOGLE_DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \ + TypeName(); \ + TypeName(const TypeName&); \ + void operator=(const TypeName&) + +// =================================================================== +// from google3/base/basictypes.h + +// The GOOGLE_ARRAYSIZE(arr) macro returns the # of elements in an array arr. +// The expression is a compile-time constant, and therefore can be +// used in defining new arrays, for example. +// +// GOOGLE_ARRAYSIZE catches a few type errors. If you see a compiler error +// +// "warning: division by zero in ..." +// +// when using GOOGLE_ARRAYSIZE, you are (wrongfully) giving it a pointer. +// You should only use GOOGLE_ARRAYSIZE on statically allocated arrays. +// +// The following comments are on the implementation details, and can +// be ignored by the users. 
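+//
+// For instance (illustrative sketch):
+//   static const int kPrimes[] = {2, 3, 5, 7};
+//   GOOGLE_COMPILE_ASSERT(GOOGLE_ARRAYSIZE(kPrimes) == 4, kPrimes_wrong_size);
+//   for (size_t i = 0; i < GOOGLE_ARRAYSIZE(kPrimes); ++i) {
+//     // visit kPrimes[i]
+//   }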
+// +// ARRAYSIZE(arr) works by inspecting sizeof(arr) (the # of bytes in +// the array) and sizeof(*(arr)) (the # of bytes in one array +// element). If the former is divisible by the latter, perhaps arr is +// indeed an array, in which case the division result is the # of +// elements in the array. Otherwise, arr cannot possibly be an array, +// and we generate a compiler error to prevent the code from +// compiling. +// +// Since the size of bool is implementation-defined, we need to cast +// !(sizeof(a) & sizeof(*(a))) to size_t in order to ensure the final +// result has type size_t. +// +// This macro is not perfect as it wrongfully accepts certain +// pointers, namely where the pointer size is divisible by the pointee +// size. Since all our code has to go through a 32-bit compiler, +// where a pointer is 4 bytes, this means all pointers to a type whose +// size is 3 or greater than 4 will be (righteously) rejected. +// +// Kudos to Jorg Brown for this simple and elegant implementation. + +#undef GOOGLE_ARRAYSIZE +#define GOOGLE_ARRAYSIZE(a) \ + ((sizeof(a) / sizeof(*(a))) / \ + static_cast(!(sizeof(a) % sizeof(*(a))))) + +// The COMPILE_ASSERT macro can be used to verify that a compile time +// expression is true. For example, you could use it to verify the +// size of a static array: +// +// COMPILE_ASSERT(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES, +// content_type_names_incorrect_size); +// +// or to make sure a struct is smaller than a certain size: +// +// COMPILE_ASSERT(sizeof(foo) < 128, foo_too_large); +// +// The second argument to the macro is the name of the variable. If +// the expression is false, most compilers will issue a warning/error +// containing the name of the variable. + +namespace internal { + +template +struct CompileAssert { +}; + +} // namespace internal + +#undef GOOGLE_COMPILE_ASSERT +#if __cplusplus >= 201103L +#define GOOGLE_COMPILE_ASSERT(expr, msg) static_assert(expr, #msg) +#else +#define GOOGLE_COMPILE_ASSERT(expr, msg) \ + ::google::protobuf::internal::CompileAssert<(bool(expr))> \ + msg[bool(expr) ? 1 : -1]; \ + (void)msg +// Implementation details of COMPILE_ASSERT: +// +// - COMPILE_ASSERT works by defining an array type that has -1 +// elements (and thus is invalid) when the expression is false. +// +// - The simpler definition +// +// #define COMPILE_ASSERT(expr, msg) typedef char msg[(expr) ? 1 : -1] +// +// does not work, as gcc supports variable-length arrays whose sizes +// are determined at run-time (this is gcc's extension and not part +// of the C++ standard). As a result, gcc fails to reject the +// following code with the simple definition: +// +// int foo; +// COMPILE_ASSERT(foo, msg); // not supposed to compile as foo is +// // not a compile-time constant. +// +// - By using the type CompileAssert<(bool(expr))>, we ensures that +// expr is a compile-time constant. (Template arguments must be +// determined at compile-time.) +// +// - The outter parentheses in CompileAssert<(bool(expr))> are necessary +// to work around a bug in gcc 3.4.4 and 4.0.1. If we had written +// +// CompileAssert +// +// instead, these compilers will refuse to compile +// +// COMPILE_ASSERT(5 > 0, some_message); +// +// (They seem to think the ">" in "5 > 0" marks the end of the +// template argument list.) +// +// - The array size is (bool(expr) ? 1 : -1), instead of simply +// +// ((expr) ? 1 : -1). +// +// This is to avoid running into a bug in MS VC 7.1, which +// causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1. 
+#endif // __cplusplus >= 201103L + +} // namespace protobuf +} // namespace google + +#endif // GOOGLE_PROTOBUF_MACROS_H__ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/mutex.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/mutex.h new file mode 100644 index 00000000..763f0abd --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/mutex.h @@ -0,0 +1,148 @@ +// Copyright (c) 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLE_PROTOBUF_STUBS_MUTEX_H_ +#define GOOGLE_PROTOBUF_STUBS_MUTEX_H_ + +#ifdef GOOGLE_PROTOBUF_NO_THREADLOCAL +#include +#endif + +#include "macros.h" + +// =================================================================== +// emulates google3/base/mutex.h +namespace google { +namespace protobuf { +namespace internal { + +// A Mutex is a non-reentrant (aka non-recursive) mutex. At most one thread T +// may hold a mutex at a given time. If T attempts to Lock() the same Mutex +// while holding it, T will deadlock. +class LIBPROTOBUF_EXPORT Mutex { + public: + // Create a Mutex that is not held by anybody. + Mutex(); + + // Destructor + ~Mutex(); + + // Block if necessary until this Mutex is free, then acquire it exclusively. + void Lock(); + + // Release this Mutex. Caller must hold it exclusively. + void Unlock(); + + // Crash if this Mutex is not held exclusively by this thread. + // May fail to crash when it should; will never crash when it should not. + void AssertHeld(); + + private: + struct Internal; + Internal* mInternal; + + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Mutex); +}; + +// Undefine the macros to workaround the conflicts with Google internal +// MutexLock implementation. +// TODO(liujisi): Remove the undef once internal macros are removed. +#undef MutexLock +#undef ReaderMutexLock +#undef WriterMutexLock +#undef MutexLockMaybe + +// MutexLock(mu) acquires mu when constructed and releases it when destroyed. 
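+// For example (illustrative sketch; Counter is a hypothetical type):
+//   class Counter {
+//    public:
+//     void Increment() {
+//       MutexLock lock(&mu_);  // unlocked automatically on scope exit
+//       ++count_;
+//     }
+//    private:
+//     Mutex mu_;
+//     int count_;
+//   };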
+class LIBPROTOBUF_EXPORT MutexLock { + public: + explicit MutexLock(Mutex *mu) : mu_(mu) { this->mu_->Lock(); } + ~MutexLock() { this->mu_->Unlock(); } + private: + Mutex *const mu_; + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(MutexLock); +}; + +// TODO(kenton): Implement these? Hard to implement portably. +typedef MutexLock ReaderMutexLock; +typedef MutexLock WriterMutexLock; + +// MutexLockMaybe is like MutexLock, but is a no-op when mu is NULL. +class LIBPROTOBUF_EXPORT MutexLockMaybe { + public: + explicit MutexLockMaybe(Mutex *mu) : + mu_(mu) { if (this->mu_ != NULL) { this->mu_->Lock(); } } + ~MutexLockMaybe() { if (this->mu_ != NULL) { this->mu_->Unlock(); } } + private: + Mutex *const mu_; + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(MutexLockMaybe); +}; + +#if defined(GOOGLE_PROTOBUF_NO_THREADLOCAL) +template +class ThreadLocalStorage { + public: + ThreadLocalStorage() { + pthread_key_create(&key_, &ThreadLocalStorage::Delete); + } + ~ThreadLocalStorage() { + pthread_key_delete(key_); + } + T* Get() { + T* result = static_cast(pthread_getspecific(key_)); + if (result == NULL) { + result = new T(); + pthread_setspecific(key_, result); + } + return result; + } + private: + static void Delete(void* value) { + delete static_cast(value); + } + pthread_key_t key_; + + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ThreadLocalStorage); +}; +#endif + +} // namespace internal + +// We made these internal so that they would show up as such in the docs, +// but we don't want to stick "internal::" in front of them everywhere. +using internal::Mutex; +using internal::MutexLock; +using internal::ReaderMutexLock; +using internal::WriterMutexLock; +using internal::MutexLockMaybe; + + +} // namespace protobuf +} // namespace google + +#endif // GOOGLE_PROTOBUF_STUBS_MUTEX_H_ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/once.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/once.h new file mode 100644 index 00000000..2bd79eb7 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/once.h @@ -0,0 +1,167 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Author: kenton@google.com (Kenton Varda) +// +// emulates google3/base/once.h +// +// This header is intended to be included only by internal .cc files and +// generated .pb.cc files. Users should not use this directly. +// +// This is basically a portable version of pthread_once(). +// +// This header declares: +// * A type called ProtobufOnceType. +// * A macro GOOGLE_PROTOBUF_DECLARE_ONCE() which declares a variable of type +// ProtobufOnceType. This is the only legal way to declare such a variable. +// The macro may only be used at the global scope (you cannot create local or +// class member variables of this type). +// * A function GoogleOnceInit(ProtobufOnceType* once, void (*init_func)()). +// This function, when invoked multiple times given the same ProtobufOnceType +// object, will invoke init_func on the first call only, and will make sure +// none of the calls return before that first call to init_func has finished. +// * The user can provide a parameter which GoogleOnceInit() forwards to the +// user-provided function when it is called. Usage example: +// int a = 10; +// GoogleOnceInit(&my_once, &MyFunctionExpectingIntArgument, &a); +// * This implementation guarantees that ProtobufOnceType is a POD (i.e. no +// static initializer generated). +// +// This implements a way to perform lazy initialization. It's more efficient +// than using mutexes as no lock is needed if initialization has already +// happened. +// +// Example usage: +// void Init(); +// GOOGLE_PROTOBUF_DECLARE_ONCE(once_init); +// +// // Calls Init() exactly once. +// void InitOnce() { +// GoogleOnceInit(&once_init, &Init); +// } +// +// Note that if GoogleOnceInit() is called before main() has begun, it must +// only be called by the thread that will eventually call main() -- that is, +// the thread that performs dynamic initialization. In general this is a safe +// assumption since people don't usually construct threads before main() starts, +// but it is technically not guaranteed. Unfortunately, Win32 provides no way +// whatsoever to statically-initialize its synchronization primitives, so our +// only choice is to assume that dynamic initialization is single-threaded. 
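+//
+// GoogleOnceDynamic (declared below) provides the same guarantee for
+// per-object lazy initialization.  An illustrative sketch (Table is a
+// hypothetical type):
+//   class Table {
+//    public:
+//     void Lookup() {
+//       once_.Init(&Table::BuildIndex, this);  // BuildIndex runs exactly once
+//       // ... use the index ...
+//     }
+//    private:
+//     static void BuildIndex(Table* self);
+//     GoogleOnceDynamic once_;
+//   };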
+ +#ifndef GOOGLE_PROTOBUF_STUBS_ONCE_H__ +#define GOOGLE_PROTOBUF_STUBS_ONCE_H__ + +#include "atomicops.h" +#include "callback.h" +#include "common.h" + +namespace google { +namespace protobuf { + +#ifdef GOOGLE_PROTOBUF_NO_THREAD_SAFETY + +typedef bool ProtobufOnceType; + +#define GOOGLE_PROTOBUF_ONCE_INIT false + +inline void GoogleOnceInit(ProtobufOnceType* once, void (*init_func)()) { + if (!*once) { + *once = true; + init_func(); + } +} + +template +inline void GoogleOnceInit(ProtobufOnceType* once, void (*init_func)(Arg), + Arg arg) { + if (!*once) { + *once = true; + init_func(arg); + } +} + +#else + +enum { + ONCE_STATE_UNINITIALIZED = 0, + ONCE_STATE_EXECUTING_CLOSURE = 1, + ONCE_STATE_DONE = 2 +}; + +typedef internal::AtomicWord ProtobufOnceType; + +#define GOOGLE_PROTOBUF_ONCE_INIT ::google::protobuf::ONCE_STATE_UNINITIALIZED + +LIBPROTOBUF_EXPORT +void GoogleOnceInitImpl(ProtobufOnceType* once, Closure* closure); + +inline void GoogleOnceInit(ProtobufOnceType* once, void (*init_func)()) { + if (internal::Acquire_Load(once) != ONCE_STATE_DONE) { + internal::FunctionClosure0 func(init_func, false); + GoogleOnceInitImpl(once, &func); + } +} + +template +inline void GoogleOnceInit(ProtobufOnceType* once, void (*init_func)(Arg*), + Arg* arg) { + if (internal::Acquire_Load(once) != ONCE_STATE_DONE) { + internal::FunctionClosure1 func(init_func, false, arg); + GoogleOnceInitImpl(once, &func); + } +} + +#endif // GOOGLE_PROTOBUF_NO_THREAD_SAFETY + +class GoogleOnceDynamic { + public: + GoogleOnceDynamic() : state_(GOOGLE_PROTOBUF_ONCE_INIT) { } + + // If this->Init() has not been called before by any thread, + // execute (*func_with_arg)(arg) then return. + // Otherwise, wait until that prior invocation has finished + // executing its function, then return. + template + void Init(void (*func_with_arg)(T*), T* arg) { + GoogleOnceInit(&this->state_, + func_with_arg, + arg); + } + private: + ProtobufOnceType state_; +}; + +#define GOOGLE_PROTOBUF_DECLARE_ONCE(NAME) \ + ::google::protobuf::ProtobufOnceType NAME = GOOGLE_PROTOBUF_ONCE_INIT + +} // namespace protobuf +} // namespace google + +#endif // GOOGLE_PROTOBUF_STUBS_ONCE_H__ diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/platform_macros.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/platform_macros.h new file mode 100644 index 00000000..4ba4b348 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/platform_macros.h @@ -0,0 +1,125 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2012 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLE_PROTOBUF_PLATFORM_MACROS_H_ +#define GOOGLE_PROTOBUF_PLATFORM_MACROS_H_ + +#define GOOGLE_PROTOBUF_PLATFORM_ERROR \ +#error "Host platform was not detected as supported by protobuf" + +// Processor architecture detection. For more info on what's defined, see: +// http://msdn.microsoft.com/en-us/library/b0084kay.aspx +// http://www.agner.org/optimize/calling_conventions.pdf +// or with gcc, run: "echo | gcc -E -dM -" +#if defined(_M_X64) || defined(__x86_64__) +#define GOOGLE_PROTOBUF_ARCH_X64 1 +#define GOOGLE_PROTOBUF_ARCH_64_BIT 1 +#elif defined(_M_IX86) || defined(__i386__) +#define GOOGLE_PROTOBUF_ARCH_IA32 1 +#define GOOGLE_PROTOBUF_ARCH_32_BIT 1 +#elif defined(__QNX__) +#define GOOGLE_PROTOBUF_ARCH_ARM_QNX 1 +#define GOOGLE_PROTOBUF_ARCH_32_BIT 1 +#elif defined(__ARMEL__) +#define GOOGLE_PROTOBUF_ARCH_ARM 1 +#define GOOGLE_PROTOBUF_ARCH_32_BIT 1 +#elif defined(__aarch64__) +#define GOOGLE_PROTOBUF_ARCH_AARCH64 1 +#define GOOGLE_PROTOBUF_ARCH_64_BIT 1 +#elif defined(__MIPSEL__) +#if defined(__LP64__) +#define GOOGLE_PROTOBUF_ARCH_MIPS64 1 +#define GOOGLE_PROTOBUF_ARCH_64_BIT 1 +#else +#define GOOGLE_PROTOBUF_ARCH_MIPS 1 +#define GOOGLE_PROTOBUF_ARCH_32_BIT 1 +#endif +#elif defined(__pnacl__) +#define GOOGLE_PROTOBUF_ARCH_32_BIT 1 +#elif defined(sparc) +#define GOOGLE_PROTOBUF_ARCH_SPARC 1 +#if defined(__sparc_v9__) || defined(__sparcv9) || defined(__arch64__) +#define GOOGLE_PROTOBUF_ARCH_64_BIT 1 +#else +#define GOOGLE_PROTOBUF_ARCH_32_BIT 1 +#endif +#elif defined(_POWER) || defined(__powerpc64__) || defined(__PPC64__) +#define GOOGLE_PROTOBUF_ARCH_POWER 1 +#define GOOGLE_PROTOBUF_ARCH_64_BIT 1 +#elif defined(__PPC__) +#define GOOGLE_PROTOBUF_ARCH_PPC 1 +#define GOOGLE_PROTOBUF_ARCH_32_BIT 1 +#elif defined(__GNUC__) +# if (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7)) || (__GNUC__ > 4)) +// We fallback to the generic Clang/GCC >= 4.7 implementation in atomicops.h +# elif defined(__clang__) +# if !__has_extension(c_atomic) +GOOGLE_PROTOBUF_PLATFORM_ERROR +# endif +// We fallback to the generic Clang/GCC >= 4.7 implementation in atomicops.h +# endif +# if __LP64__ +# define GOOGLE_PROTOBUF_ARCH_64_BIT 1 +# else +# define GOOGLE_PROTOBUF_ARCH_32_BIT 1 +# endif +#else +GOOGLE_PROTOBUF_PLATFORM_ERROR +#endif + +#if defined(__APPLE__) +#define GOOGLE_PROTOBUF_OS_APPLE +#include +#if TARGET_OS_IPHONE +#define GOOGLE_PROTOBUF_OS_IPHONE +#endif +#elif defined(__EMSCRIPTEN__) +#define GOOGLE_PROTOBUF_OS_EMSCRIPTEN +#elif defined(__native_client__) +#define GOOGLE_PROTOBUF_OS_NACL +#elif defined(sun) +#define GOOGLE_PROTOBUF_OS_SOLARIS +#elif defined(_AIX) +#define GOOGLE_PROTOBUF_OS_AIX +#elif defined(__ANDROID__) +#define GOOGLE_PROTOBUF_OS_ANDROID +#endif + +#undef 
+
+#if defined(GOOGLE_PROTOBUF_OS_ANDROID) || defined(GOOGLE_PROTOBUF_OS_IPHONE)
+// The Android NDK does not support the __thread keyword very well yet. Here
+// we use pthread_key_create()/pthread_getspecific()/... methods for
+// TLS support on Android.
+// iOS also does not support the __thread keyword.
+#define GOOGLE_PROTOBUF_NO_THREADLOCAL
+#endif
+
+#endif  // GOOGLE_PROTOBUF_PLATFORM_MACROS_H_
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/port.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/port.h
new file mode 100644
index 00000000..376be5f7
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/port.h
@@ -0,0 +1,448 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLE_PROTOBUF_STUBS_PORT_H_
+#define GOOGLE_PROTOBUF_STUBS_PORT_H_
+
+#include <assert.h>
+#include <stdlib.h>
+#include <cstddef>
+#include <string>
+#include <string.h>
+#if defined(__osf__)
+// Tru64 lacks stdint.h, but has inttypes.h which defines a superset of
+// what stdint.h would define.
+#include <inttypes.h>
+#elif !defined(_MSC_VER)
+#include <stdint.h>
+#endif
+
+#undef PROTOBUF_LITTLE_ENDIAN
+#ifdef _WIN32
+  // Assuming windows is always little-endian.
+  // TODO(xiaofeng): The PROTOBUF_LITTLE_ENDIAN is not only used for
+  // optimization but also for correctness. We should define a
+  // different macro to test the big-endian code path in coded_stream.
+  #if !defined(PROTOBUF_DISABLE_LITTLE_ENDIAN_OPT_FOR_TEST)
+    #define PROTOBUF_LITTLE_ENDIAN 1
+  #endif
+  #if _MSC_VER >= 1300 && !defined(__INTEL_COMPILER)
+    // If MSVC has "/RTCc" set, it will complain about truncating casts at
+    // runtime.  This file contains some intentional truncating casts.
+    #pragma runtime_checks("c", off)
+  #endif
+#else
+  #include <sys/param.h>   // __BYTE_ORDER
+  #if ((defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)) || \
+       (defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN)) && \
+      !defined(PROTOBUF_DISABLE_LITTLE_ENDIAN_OPT_FOR_TEST)
+    #define PROTOBUF_LITTLE_ENDIAN 1
+  #endif
+#endif
+#if defined(_MSC_VER) && defined(PROTOBUF_USE_DLLS)
+  #ifdef LIBPROTOBUF_EXPORTS
+    #define LIBPROTOBUF_EXPORT __declspec(dllexport)
+  #else
+    #define LIBPROTOBUF_EXPORT __declspec(dllimport)
+  #endif
+  #ifdef LIBPROTOC_EXPORTS
+    #define LIBPROTOC_EXPORT __declspec(dllexport)
+  #else
+    #define LIBPROTOC_EXPORT __declspec(dllimport)
+  #endif
+#else
+  #define LIBPROTOBUF_EXPORT
+  #define LIBPROTOC_EXPORT
+#endif
+
+// These #includes are for the byte swap functions declared later on.
+#ifdef _MSC_VER
+#include <stdlib.h>  // NOLINT(build/include)
+#elif defined(__APPLE__)
+#include <libkern/OSByteOrder.h>
+#elif defined(__GLIBC__) || defined(__CYGWIN__)
+#include <byteswap.h>  // IWYU pragma: export
+#endif
+
+// ===================================================================
+// from google3/base/port.h
+namespace google {
+namespace protobuf {
+
+typedef unsigned int uint;
+
+#ifdef _MSC_VER
+typedef signed __int8  int8;
+typedef __int16 int16;
+typedef __int32 int32;
+typedef __int64 int64;
+
+typedef unsigned __int8  uint8;
+typedef unsigned __int16 uint16;
+typedef unsigned __int32 uint32;
+typedef unsigned __int64 uint64;
+#else
+typedef int8_t  int8;
+typedef int16_t int16;
+typedef int32_t int32;
+typedef int64_t int64;
+
+typedef uint8_t  uint8;
+typedef uint16_t uint16;
+typedef uint32_t uint32;
+typedef uint64_t uint64;
+#endif
+
+// long long macros to be used because gcc and vc++ use different suffixes,
+// and different size specifiers in format strings
+#undef GOOGLE_LONGLONG
+#undef GOOGLE_ULONGLONG
+#undef GOOGLE_LL_FORMAT
+
+#ifdef _MSC_VER
+#define GOOGLE_LONGLONG(x) x##I64
+#define GOOGLE_ULONGLONG(x) x##UI64
+#define GOOGLE_LL_FORMAT "I64"  // As in printf("%I64d", ...)
+#else
+// By long long, we actually mean int64.
+#define GOOGLE_LONGLONG(x) x##LL
+#define GOOGLE_ULONGLONG(x) x##ULL
+// Used to format real long long integers.
+#define GOOGLE_LL_FORMAT "ll"  // As in "%lld". Note that "q" is poor form also.
+#endif
+
+static const int32 kint32max = 0x7FFFFFFF;
+static const int32 kint32min = -kint32max - 1;
+static const int64 kint64max = GOOGLE_LONGLONG(0x7FFFFFFFFFFFFFFF);
+static const int64 kint64min = -kint64max - 1;
+static const uint32 kuint32max = 0xFFFFFFFFu;
+static const uint64 kuint64max = GOOGLE_ULONGLONG(0xFFFFFFFFFFFFFFFF);
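+
+// Illustrative sketch (hypothetical snippet, not upstream code):
+// GOOGLE_LL_FORMAT hides the MSVC/gcc difference in 64-bit printf length
+// modifiers.
+//
+//   int64 v = GOOGLE_LONGLONG(0x7FFFFFFFFFFFFFFF);
+//   printf("v = %" GOOGLE_LL_FORMAT "d\n", v);  // "%I64d" on MSVC, "%lld" elsewhere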
+
+// -------------------------------------------------------------------
+// Annotations:  Some parts of the code have been annotated in ways that might
+//   be useful to some compilers or tools, but are not supported universally.
+//   You can #define these annotations yourself if the default implementation
+//   is not right for you.
+
+#ifndef GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+#if defined(__GNUC__) && (__GNUC__ > 3 ||(__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
+// For functions we want to force inline.
+// Introduced in gcc 3.1.
+#define GOOGLE_ATTRIBUTE_ALWAYS_INLINE __attribute__ ((always_inline))
+#else
+// Other compilers will have to figure it out for themselves.
+#define GOOGLE_ATTRIBUTE_ALWAYS_INLINE
+#endif
+#endif
+
+#ifndef GOOGLE_ATTRIBUTE_NOINLINE
+#if defined(__GNUC__) && (__GNUC__ > 3 ||(__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
+// For functions we want to force not inline.
+// Introduced in gcc 3.1.
+#define GOOGLE_ATTRIBUTE_NOINLINE __attribute__ ((noinline))
+#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
+// Seems to have been around since at least Visual Studio 2005
+#define GOOGLE_ATTRIBUTE_NOINLINE __declspec(noinline)
+#else
+// Other compilers will have to figure it out for themselves.
+#define GOOGLE_ATTRIBUTE_NOINLINE
+#endif
+#endif
+
+#ifndef GOOGLE_ATTRIBUTE_NORETURN
+#ifdef __GNUC__
+// Tell the compiler that a given function never returns.
+#define GOOGLE_ATTRIBUTE_NORETURN __attribute__((noreturn))
+#else
+#define GOOGLE_ATTRIBUTE_NORETURN
+#endif
+#endif
+
+#ifndef GOOGLE_ATTRIBUTE_DEPRECATED
+#ifdef __GNUC__
+// If the method/variable/type is used anywhere, produce a warning.
+#define GOOGLE_ATTRIBUTE_DEPRECATED __attribute__((deprecated))
+#else
+#define GOOGLE_ATTRIBUTE_DEPRECATED
+#endif
+#endif
+
+#ifndef GOOGLE_PREDICT_TRUE
+#ifdef __GNUC__
+// Provided at least since GCC 3.0.
+#define GOOGLE_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
+#else
+#define GOOGLE_PREDICT_TRUE(x) (x)
+#endif
+#endif
+
+#ifndef GOOGLE_PREDICT_FALSE
+#ifdef __GNUC__
+// Provided at least since GCC 3.0.
+#define GOOGLE_PREDICT_FALSE(x) (__builtin_expect(x, 0))
+#else
+#define GOOGLE_PREDICT_FALSE(x) (x)
+#endif
+#endif
+
+// Delimits a block of code which may write to memory which is simultaneously
+// written by other threads, but which has been determined to be thread-safe
+// (e.g. because it is an idempotent write).
+#ifndef GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN
+#define GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN()
+#endif
+#ifndef GOOGLE_SAFE_CONCURRENT_WRITES_END
+#define GOOGLE_SAFE_CONCURRENT_WRITES_END()
+#endif
+
+#if defined(__clang__) && defined(__has_cpp_attribute) \
+    && !defined(GOOGLE_PROTOBUF_OS_APPLE)
+# if defined(GOOGLE_PROTOBUF_OS_NACL) || defined(EMSCRIPTEN) || \
+     __has_cpp_attribute(clang::fallthrough)
+#  define GOOGLE_FALLTHROUGH_INTENDED [[clang::fallthrough]]
+# endif
+#endif
+
+#ifndef GOOGLE_FALLTHROUGH_INTENDED
+# define GOOGLE_FALLTHROUGH_INTENDED
+#endif
+
+#define GOOGLE_GUARDED_BY(x)
+#define GOOGLE_ATTRIBUTE_COLD
+
+// x86 and x86-64 can perform unaligned loads/stores directly.
+#if defined(_M_X64) || defined(__x86_64__) || \
+    defined(_M_IX86) || defined(__i386__)
+
+#define GOOGLE_UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
+#define GOOGLE_UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
+#define GOOGLE_UNALIGNED_LOAD64(_p) (*reinterpret_cast<const uint64 *>(_p))
+
+#define GOOGLE_UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
+#define GOOGLE_UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
+#define GOOGLE_UNALIGNED_STORE64(_p, _val) (*reinterpret_cast<uint64 *>(_p) = (_val))
+
+#else
+inline uint16 GOOGLE_UNALIGNED_LOAD16(const void *p) {
+  uint16 t;
+  memcpy(&t, p, sizeof t);
+  return t;
+}
+
+inline uint32 GOOGLE_UNALIGNED_LOAD32(const void *p) {
+  uint32 t;
+  memcpy(&t, p, sizeof t);
+  return t;
+}
+
+inline uint64 GOOGLE_UNALIGNED_LOAD64(const void *p) {
+  uint64 t;
+  memcpy(&t, p, sizeof t);
+  return t;
+}
+
+inline void GOOGLE_UNALIGNED_STORE16(void *p, uint16 v) {
+  memcpy(p, &v, sizeof v);
+}
+
+inline void GOOGLE_UNALIGNED_STORE32(void *p, uint32 v) {
+  memcpy(p, &v, sizeof v);
+}
+
+inline void GOOGLE_UNALIGNED_STORE64(void *p, uint64 v) {
+  memcpy(p, &v, sizeof v);
+}
+#endif
+
+#if defined(_MSC_VER)
+#define GOOGLE_THREAD_LOCAL __declspec(thread)
+#else
+#define GOOGLE_THREAD_LOCAL __thread
+#endif
+
+// The following guarantees declaration of the byte swap functions.
+#ifdef _MSC_VER
+#define bswap_16(x) _byteswap_ushort(x)
+#define bswap_32(x) _byteswap_ulong(x)
+#define bswap_64(x) _byteswap_uint64(x)
+
+#elif defined(__APPLE__)
+// Mac OS X / Darwin features
+#define bswap_16(x) OSSwapInt16(x)
+#define bswap_32(x) OSSwapInt32(x)
+#define bswap_64(x) OSSwapInt64(x)
+
+#elif !defined(__GLIBC__) && !defined(__CYGWIN__)
+
+static inline uint16 bswap_16(uint16 x) {
+  return static_cast<uint16>(((x & 0xFF) << 8) | ((x & 0xFF00) >> 8));
+}
+#define bswap_16(x) bswap_16(x)
+static inline uint32 bswap_32(uint32 x) {
+  return (((x & 0xFF) << 24) |
+          ((x & 0xFF00) << 8) |
+          ((x & 0xFF0000) >> 8) |
+          ((x & 0xFF000000) >> 24));
+}
+#define bswap_32(x) bswap_32(x)
+static inline uint64 bswap_64(uint64 x) {
+  return (((x & GOOGLE_ULONGLONG(0xFF)) << 56) |
+          ((x & GOOGLE_ULONGLONG(0xFF00)) << 40) |
+          ((x & GOOGLE_ULONGLONG(0xFF0000)) << 24) |
+          ((x & GOOGLE_ULONGLONG(0xFF000000)) << 8) |
+          ((x & GOOGLE_ULONGLONG(0xFF00000000)) >> 8) |
+          ((x & GOOGLE_ULONGLONG(0xFF0000000000)) >> 24) |
+          ((x & GOOGLE_ULONGLONG(0xFF000000000000)) >> 40) |
+          ((x & GOOGLE_ULONGLONG(0xFF00000000000000)) >> 56));
+}
+#define bswap_64(x) bswap_64(x)
+
+#endif
+
+// ===================================================================
+// from google3/util/bits/bits.h
+
+class Bits {
+ public:
+  static uint32 Log2FloorNonZero(uint32 n) {
+#if defined(__GNUC__)
+    return 31 ^ __builtin_clz(n);
+#elif defined(COMPILER_MSVC) && defined(_M_IX86)
+    _asm {
+      bsr ebx, n
+      mov n, ebx
+    }
+    return n;
+#else
+    return Log2FloorNonZero_Portable(n);
+#endif
+  }
+
+  static uint64 Log2FloorNonZero64(uint64 n) {
+#if defined(__GNUC__)
+    return 63 ^ __builtin_clzll(n);
+#else
+    return Log2FloorNonZero64_Portable(n);
+#endif
+  }
+ private:
+  static int Log2FloorNonZero_Portable(uint32 n) {
+    if (n == 0)
+      return -1;
+    int log = 0;
+    uint32 value = n;
+    for (int i = 4; i >= 0; --i) {
+      int shift = (1 << i);
+      uint32 x = value >> shift;
+      if (x != 0) {
+        value = x;
+        log += shift;
+      }
+    }
+    assert(value == 1);
+    return log;
+  }
+
+  static int Log2FloorNonZero64_Portable(uint64 n) {
+    const uint32 topbits = static_cast<uint32>(n >> 32);
+    if (topbits == 0) {
+      // Top bits are zero, so scan in bottom bits
+      return Log2FloorNonZero(static_cast<uint32>(n));
+    } else {
+      return 32 + Log2FloorNonZero(topbits);
+    }
+  }
+};
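+
+// Illustrative sketch (hypothetical snippet, not upstream code):
+// Log2FloorNonZero computes floor(log2(n)) for n > 0, e.g.
+// Bits::Log2FloorNonZero(16) == 4 and Bits::Log2FloorNonZero(17) == 4.
+//
+//   uint32 pow2 = 1u << Bits::Log2FloorNonZero(n);  // largest power of two <= n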
+
+// ===================================================================
+// from google3/util/endian/endian.h
+LIBPROTOBUF_EXPORT uint32 ghtonl(uint32 x);
+
+class BigEndian {
+ public:
+#ifdef PROTOBUF_LITTLE_ENDIAN
+
+  static uint16 FromHost16(uint16 x) { return bswap_16(x); }
+  static uint16 ToHost16(uint16 x) { return bswap_16(x); }
+
+  static uint32 FromHost32(uint32 x) { return bswap_32(x); }
+  static uint32 ToHost32(uint32 x) { return bswap_32(x); }
+
+  static uint64 FromHost64(uint64 x) { return bswap_64(x); }
+  static uint64 ToHost64(uint64 x) { return bswap_64(x); }
+
+  static bool IsLittleEndian() { return true; }
+
+#else
+
+  static uint16 FromHost16(uint16 x) { return x; }
+  static uint16 ToHost16(uint16 x) { return x; }
+
+  static uint32 FromHost32(uint32 x) { return x; }
+  static uint32 ToHost32(uint32 x) { return x; }
+
+  static uint64 FromHost64(uint64 x) { return x; }
+  static uint64 ToHost64(uint64 x) { return x; }
+
+  static bool IsLittleEndian() { return false; }
+
+#endif /* ENDIAN */
+
+  // Functions to do unaligned loads and stores in big-endian order.
+  static uint16 Load16(const void *p) {
+    return ToHost16(GOOGLE_UNALIGNED_LOAD16(p));
+  }
+
+  static void Store16(void *p, uint16 v) {
+    GOOGLE_UNALIGNED_STORE16(p, FromHost16(v));
+  }
+
+  static uint32 Load32(const void *p) {
+    return ToHost32(GOOGLE_UNALIGNED_LOAD32(p));
+  }
+
+  static void Store32(void *p, uint32 v) {
+    GOOGLE_UNALIGNED_STORE32(p, FromHost32(v));
+  }
+
+  static uint64 Load64(const void *p) {
+    return ToHost64(GOOGLE_UNALIGNED_LOAD64(p));
+  }
+
+  static void Store64(void *p, uint64 v) {
+    GOOGLE_UNALIGNED_STORE64(p, FromHost64(v));
+  }
+};
+
+
+}  // namespace protobuf
+}  // namespace google
+
+#endif  // GOOGLE_PROTOBUF_STUBS_PORT_H_
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/scoped_ptr.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/scoped_ptr.h
new file mode 100644
index 00000000..56198845
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/scoped_ptr.h
@@ -0,0 +1,236 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLE_PROTOBUF_STUBS_SCOPED_PTR_H_
+#define GOOGLE_PROTOBUF_STUBS_SCOPED_PTR_H_
+
+#include "port.h"
+
+namespace google {
+namespace protobuf {
+
+// ===================================================================
+// from google3/base/scoped_ptr.h
+
+namespace internal {
+
+// This is an implementation designed to match the anticipated future TR2
+// implementation of the scoped_ptr class, and its closely-related brethren,
+// scoped_array, scoped_ptr_malloc, and make_scoped_ptr.
+
+template <class C> class scoped_ptr;
+template <class C> class scoped_array;
+
+// A scoped_ptr<T> is like a T*, except that the destructor of scoped_ptr<T>
+// automatically deletes the pointer it holds (if any).
+// That is, scoped_ptr<T> owns the T object that it points to.
+// Like a T*, a scoped_ptr<T> may hold either NULL or a pointer to a T object.
+//
+// The size of a scoped_ptr is small:
+// sizeof(scoped_ptr<C>) == sizeof(C*)
+template <class C>
+class scoped_ptr {
+ public:
+
+  // The element type
+  typedef C element_type;
+
+  // Constructor.  Defaults to initializing with NULL.
+  // There is no way to create an uninitialized scoped_ptr.
+  // The input parameter must be allocated with new.
+  explicit scoped_ptr(C* p = NULL) : ptr_(p) { }
+
+  // Destructor.  If there is a C object, delete it.
+  // We don't need to test ptr_ == NULL because C++ does that for us.
+  ~scoped_ptr() {
+    enum { type_must_be_complete = sizeof(C) };
+    delete ptr_;
+  }
+
+  // Reset.  Deletes the current owned object, if any.
+  // Then takes ownership of a new object, if given.
+  // this->reset(this->get()) works.
+  void reset(C* p = NULL) {
+    if (p != ptr_) {
+      enum { type_must_be_complete = sizeof(C) };
+      delete ptr_;
+      ptr_ = p;
+    }
+  }
+
+  // Accessors to get the owned object.
+  // operator* and operator-> will assert() if there is no current object.
+  C& operator*() const {
+    assert(ptr_ != NULL);
+    return *ptr_;
+  }
+  C* operator->() const {
+    assert(ptr_ != NULL);
+    return ptr_;
+  }
+  C* get() const { return ptr_; }
+
+  // Comparison operators.
+  // These return whether two scoped_ptr refer to the same object, not just to
+  // two different but equal objects.
+  bool operator==(C* p) const { return ptr_ == p; }
+  bool operator!=(C* p) const { return ptr_ != p; }
+
+  // Swap two scoped pointers.
+  void swap(scoped_ptr& p2) {
+    C* tmp = ptr_;
+    ptr_ = p2.ptr_;
+    p2.ptr_ = tmp;
+  }
+
+  // Release a pointer.
+  // The return value is the current pointer held by this object.
+  // If this object holds a NULL pointer, the return value is NULL.
+  // After this operation, this object will hold a NULL pointer,
+  // and will not own the object any more.
+  C* release() {
+    C* retVal = ptr_;
+    ptr_ = NULL;
+    return retVal;
+  }
+
+ private:
+  C* ptr_;
+
+  // Forbid comparison of scoped_ptr types.  If C2 != C, it totally doesn't
+  // make sense, and if C2 == C, it still doesn't make sense because you should
+  // never have the same object owned by two different scoped_ptrs.
+  template <class C2> bool operator==(scoped_ptr<C2> const& p2) const;
+  template <class C2> bool operator!=(scoped_ptr<C2> const& p2) const;
+
+  // Disallow evil constructors
+  scoped_ptr(const scoped_ptr&);
+  void operator=(const scoped_ptr&);
+};
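+
+// Illustrative usage sketch (Foo is a hypothetical type):
+//
+//   {
+//     scoped_ptr<Foo> foo(new Foo);
+//     foo->DoSomething();
+//   }  // foo goes out of scope; the Foo is deleted automatically.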
+
+// scoped_array<C> is like scoped_ptr<C>, except that the caller must allocate
+// with new [] and the destructor deletes objects with delete [].
+//
+// As with scoped_ptr<C>, a scoped_array<C> either points to an object
+// or is NULL.  A scoped_array<C> owns the object that it points to.
+//
+// Size: sizeof(scoped_array<C>) == sizeof(C*)
+template <class C>
+class scoped_array {
+ public:
+
+  // The element type
+  typedef C element_type;
+
+  // Constructor.  Defaults to initializing with NULL.
+  // There is no way to create an uninitialized scoped_array.
+  // The input parameter must be allocated with new [].
+  explicit scoped_array(C* p = NULL) : array_(p) { }
+
+  // Destructor.  If there is a C object, delete it.
+  // We don't need to test ptr_ == NULL because C++ does that for us.
+  ~scoped_array() {
+    enum { type_must_be_complete = sizeof(C) };
+    delete[] array_;
+  }
+
+  // Reset.  Deletes the current owned object, if any.
+  // Then takes ownership of a new object, if given.
+  // this->reset(this->get()) works.
+  void reset(C* p = NULL) {
+    if (p != array_) {
+      enum { type_must_be_complete = sizeof(C) };
+      delete[] array_;
+      array_ = p;
+    }
+  }
+
+  // Get one element of the current object.
+  // Will assert() if there is no current object, or index i is negative.
+  C& operator[](std::ptrdiff_t i) const {
+    assert(i >= 0);
+    assert(array_ != NULL);
+    return array_[i];
+  }
+
+  // Get a pointer to the zeroth element of the current object.
+  // If there is no current object, return NULL.
+  C* get() const {
+    return array_;
+  }
+
+  // Comparison operators.
+  // These return whether two scoped_array refer to the same object, not just to
+  // two different but equal objects.
+  bool operator==(C* p) const { return array_ == p; }
+  bool operator!=(C* p) const { return array_ != p; }
+
+  // Swap two scoped arrays.
+  void swap(scoped_array& p2) {
+    C* tmp = array_;
+    array_ = p2.array_;
+    p2.array_ = tmp;
+  }
+
+  // Release an array.
+  // The return value is the current pointer held by this object.
+  // If this object holds a NULL pointer, the return value is NULL.
+  // After this operation, this object will hold a NULL pointer,
+  // and will not own the object any more.
+  C* release() {
+    C* retVal = array_;
+    array_ = NULL;
+    return retVal;
+  }
+
+ private:
+  C* array_;
+
+  // Forbid comparison of different scoped_array types.
+  template <class C2> bool operator==(scoped_array<C2> const& p2) const;
+  template <class C2> bool operator!=(scoped_array<C2> const& p2) const;
+
+  // Disallow evil constructors
+  scoped_array(const scoped_array&);
+  void operator=(const scoped_array&);
+};
+
+}  // namespace internal
+
+// We made these internal so that they would show up as such in the docs,
+// but we don't want to stick "internal::" in front of them everywhere.
+using internal::scoped_ptr;
+using internal::scoped_array;
+
+
+}  // namespace protobuf
+}  // namespace google
+
+#endif  // GOOGLE_PROTOBUF_STUBS_SCOPED_PTR_H_
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/shared_ptr.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/shared_ptr.h
new file mode 100644
index 00000000..ebdc3f35
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/shared_ptr.h
@@ -0,0 +1,470 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2014 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// from google3/util/gtl/shared_ptr.h
+
+#ifndef GOOGLE_PROTOBUF_STUBS_SHARED_PTR_H__
+#define GOOGLE_PROTOBUF_STUBS_SHARED_PTR_H__
+
+#include "atomicops.h"
+
+#include <algorithm>  // for swap
+#include <stddef.h>
+#include <memory>
+
+namespace google {
+namespace protobuf {
+namespace internal {
+
+// Alias to std::shared_ptr for any C++11 platform,
+// and for any supported MSVC compiler.
+#if !defined(UTIL_GTL_USE_STD_SHARED_PTR) && \
+    (defined(COMPILER_MSVC) || defined(LANG_CXX11))
+#define UTIL_GTL_USE_STD_SHARED_PTR 1
+#endif
+
+#if defined(UTIL_GTL_USE_STD_SHARED_PTR) && UTIL_GTL_USE_STD_SHARED_PTR
+
+// These are transitional.  They will be going away soon.
+// Please just #include <memory> and just type std::shared_ptr yourself,
+// instead of relying on this file.
+//
+// Migration doc: http://go/std-shared-ptr-lsc
+using std::enable_shared_from_this;
+using std::shared_ptr;
+using std::static_pointer_cast;
+using std::weak_ptr;
+
+#else  // below, UTIL_GTL_USE_STD_SHARED_PTR not set or set to 0.
+
+// For everything else there is the google3 implementation.
+inline bool RefCountDec(volatile Atomic32 *ptr) {
+  return Barrier_AtomicIncrement(ptr, -1) != 0;
+}
+
+inline void RefCountInc(volatile Atomic32 *ptr) {
+  NoBarrier_AtomicIncrement(ptr, 1);
+}
+
+template <typename T> class shared_ptr;
+template <typename T> class weak_ptr;
+
+// This class is an internal implementation detail for shared_ptr. If two
+// shared_ptrs point to the same object, they also share a control block.
+// An "empty" shared_pointer refers to NULL and also has a NULL control block.
+// It contains all of the state that's needed for reference counting or any
+// other kind of resource management. In this implementation the control block
+// happens to consist of two atomic words, the reference count (the number
+// of shared_ptrs that share ownership of the object) and the weak count
+// (the number of weak_ptrs that observe the object, plus 1 if the
+// refcount is nonzero).
+//
+// The "plus 1" is to prevent a race condition in the shared_ptr and
+// weak_ptr destructors. We need to make sure the control block is
+// only deleted once, so we need to make sure that at most one
+// object sees the weak count decremented from 1 to 0.
+class SharedPtrControlBlock {
+  template <typename T> friend class shared_ptr;
+  template <typename T> friend class weak_ptr;
+ private:
+  SharedPtrControlBlock() : refcount_(1), weak_count_(1) { }
+  Atomic32 refcount_;
+  Atomic32 weak_count_;
+};
+
+// Forward declaration. The class is defined below.
+template <typename T> class enable_shared_from_this;
+
+template <typename T>
+class shared_ptr {
+  template <typename U> friend class weak_ptr;
+ public:
+  typedef T element_type;
+
+  shared_ptr() : ptr_(NULL), control_block_(NULL) {}
+
+  explicit shared_ptr(T* ptr)
+      : ptr_(ptr),
+        control_block_(ptr != NULL ? new SharedPtrControlBlock : NULL) {
+    // If p is non-null and T inherits from enable_shared_from_this, we
+    // set up the data that shared_from_this needs.
+    MaybeSetupWeakThis(ptr);
+  }
+
+  // Copy constructor: makes this object a copy of ptr, and increments
+  // the reference count.
+  template <typename U>
+  shared_ptr(const shared_ptr<U>& ptr)
+      : ptr_(NULL),
+        control_block_(NULL) {
+    Initialize(ptr);
+  }
+  // Need non-templated version to prevent the compiler-generated default
+  shared_ptr(const shared_ptr<T>& ptr)
+      : ptr_(NULL),
+        control_block_(NULL) {
+    Initialize(ptr);
+  }
+
+  // Assignment operator. Replaces the existing shared_ptr with ptr.
+  // Increment ptr's reference count and decrement the one being replaced.
+  template <typename U>
+  shared_ptr<T>& operator=(const shared_ptr<U>& ptr) {
+    if (ptr_ != ptr.ptr_) {
+      shared_ptr me(ptr);   // will hold our previous state to be destroyed.
+      swap(me);
+    }
+    return *this;
+  }
+
+  // Need non-templated version to prevent the compiler-generated default
+  shared_ptr<T>& operator=(const shared_ptr<T>& ptr) {
+    if (ptr_ != ptr.ptr_) {
+      shared_ptr me(ptr);   // will hold our previous state to be destroyed.
+      swap(me);
+    }
+    return *this;
+  }
+
+  // TODO(austern): Consider providing this constructor. The draft C++ standard
+  // (20.8.10.2.1) includes it. However, it says that this constructor throws
+  // a bad_weak_ptr exception when ptr is expired. Is it better to provide this
+  // constructor and make it do something else, like fail with a CHECK, or to
+  // leave this constructor out entirely?
+  //
+  // template <typename U>
+  // shared_ptr(const weak_ptr<U>& ptr);
+
+  ~shared_ptr() {
+    if (ptr_ != NULL) {
+      if (!RefCountDec(&control_block_->refcount_)) {
+        delete ptr_;
+
+        // weak_count_ is defined as the number of weak_ptrs that observe
+        // ptr_, plus 1 if refcount_ is nonzero.
+        if (!RefCountDec(&control_block_->weak_count_)) {
+          delete control_block_;
+        }
+      }
+    }
+  }
+
+  // Replaces underlying raw pointer with the one passed in.  The reference
+  // count is set to one (or zero if the pointer is NULL) for the pointer
+  // being passed in and decremented for the one being replaced.
+  //
+  // If you have a compilation error with this code, make sure you aren't
+  // passing NULL, nullptr, or 0 to this function.  Call reset without an
+  // argument to reset to a null ptr.
+  template <typename Y>
+  void reset(Y* p) {
+    if (p != ptr_) {
+      shared_ptr tmp(p);
+      tmp.swap(*this);
+    }
+  }
+
+  void reset() {
+    reset(static_cast<T*>(NULL));
+  }
+
+  // Exchanges the contents of this with the contents of r.  This function
+  // supports more efficient swapping since it eliminates the need for a
+  // temporary shared_ptr object.
+  void swap(shared_ptr& r) {
+    using std::swap;  // http://go/using-std-swap
+    swap(ptr_, r.ptr_);
+    swap(control_block_, r.control_block_);
+  }
+
+  // The following function is useful for gaining access to the underlying
+  // pointer when a shared_ptr remains in scope so the reference-count is
+  // known to be > 0 (e.g. for parameter passing).
+  T* get() const {
+    return ptr_;
+  }
+
+  T& operator*() const {
+    return *ptr_;
+  }
+
+  T* operator->() const {
+    return ptr_;
+  }
+
+  long use_count() const {
+    return control_block_ ? control_block_->refcount_ : 1;
+  }
+
+  bool unique() const {
+    return use_count() == 1;
+  }
+
+ private:
+  // If r is non-empty, initialize *this to share ownership with r,
+  // increasing the underlying reference count.
+  // If r is empty, *this remains empty.
+  // Requires: this is empty, namely this->ptr_ == NULL.
+  template <typename U>
+  void Initialize(const shared_ptr<U>& r) {
+    // This performs a static_cast on r.ptr_ to U*, which is a no-op since it
+    // is already a U*.  So initialization here requires that r.ptr_ is
+    // implicitly convertible to T*.
+    InitializeWithStaticCast<T>(r);
+  }
+
+  // Initializes *this as described in Initialize, but additionally performs a
+  // static_cast from r.ptr_ (V*) to U*.
+  // NOTE(gfc): We'd need a more general form to support const_pointer_cast and
+  // dynamic_pointer_cast, but those operations are sufficiently discouraged
+  // that supporting static_pointer_cast is sufficient.
+  template <typename U, typename V>
+  void InitializeWithStaticCast(const shared_ptr<V>& r) {
+    if (r.control_block_ != NULL) {
+      RefCountInc(&r.control_block_->refcount_);
+
+      ptr_ = static_cast<U*>(r.ptr_);
+      control_block_ = r.control_block_;
+    }
+  }
+
+  // Helper function for the constructor that takes a raw pointer.  If T
+  // doesn't inherit from enable_shared_from_this then we have nothing to
+  // do, so this function is trivial and inline.  The other version is
+  // declared out of line, after the class definition of
+  // enable_shared_from_this.
+  void MaybeSetupWeakThis(enable_shared_from_this<T>* ptr);
+  void MaybeSetupWeakThis(...) { }
+
+  T* ptr_;
+  SharedPtrControlBlock* control_block_;
+
+#ifndef SWIG
+  template <typename U>
+  friend class shared_ptr;
+
+  template <typename U, typename V>
+  friend shared_ptr<U> static_pointer_cast(const shared_ptr<V>& rhs);
+#endif
+};
+
+// Matches the interface of std::swap as an aid to generic programming.
+template <typename T> void swap(shared_ptr<T>& r, shared_ptr<T>& s) {
+  r.swap(s);
+}
+
+template <typename T, typename U>
+shared_ptr<T> static_pointer_cast(const shared_ptr<U>& rhs) {
+  shared_ptr<T> lhs;
+  lhs.template InitializeWithStaticCast<T>(rhs);
+  return lhs;
+}
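+
+// Illustrative sketch (Base/Derived are hypothetical types, Derived deriving
+// from Base):
+//
+//   shared_ptr<Base> b(new Derived);
+//   shared_ptr<Derived> d = static_pointer_cast<Derived>(b);
+//   // b and d share one control block, so b.use_count() == 2.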
+
+// See comments at the top of the file for a description of why this
+// class exists, and the draft C++ standard (as of July 2009 the
+// latest draft is N2914) for the detailed specification.
+template <typename T>
+class weak_ptr {
+  template <typename U> friend class weak_ptr;
+ public:
+  typedef T element_type;
+
+  // Create an empty (i.e. already expired) weak_ptr.
+  weak_ptr() : ptr_(NULL), control_block_(NULL) { }
+
+  // Create a weak_ptr that observes the same object that ptr points
+  // to.  Note that there is no race condition here: we know that the
+  // control block can't disappear while we're looking at it because
+  // it is owned by at least one shared_ptr, ptr.
+  template <typename U> weak_ptr(const shared_ptr<U>& ptr) {
+    CopyFrom(ptr.ptr_, ptr.control_block_);
+  }
+
+  // Copy a weak_ptr. The object it points to might disappear, but we
+  // don't care: we're only working with the control block, and it can't
+  // disappear while we're looking at because it's owned by at least one
+  // weak_ptr, ptr.
+  template <typename U> weak_ptr(const weak_ptr<U>& ptr) {
+    CopyFrom(ptr.ptr_, ptr.control_block_);
+  }
+
+  // Need non-templated version to prevent default copy constructor
+  weak_ptr(const weak_ptr& ptr) {
+    CopyFrom(ptr.ptr_, ptr.control_block_);
+  }
+
+  // Destroy the weak_ptr. If no shared_ptr owns the control block, and if
+  // we are the last weak_ptr to own it, then it can be deleted. Note that
+  // weak_count_ is defined as the number of weak_ptrs sharing this control
+  // block, plus 1 if there are any shared_ptrs. We therefore know that it's
+  // safe to delete the control block when weak_count_ reaches 0, without
+  // having to perform any additional tests.
+  ~weak_ptr() {
+    if (control_block_ != NULL &&
+        !RefCountDec(&control_block_->weak_count_)) {
+      delete control_block_;
+    }
+  }
+
+  weak_ptr& operator=(const weak_ptr& ptr) {
+    if (&ptr != this) {
+      weak_ptr tmp(ptr);
+      tmp.swap(*this);
+    }
+    return *this;
+  }
+  template <typename U> weak_ptr& operator=(const weak_ptr<U>& ptr) {
+    weak_ptr tmp(ptr);
+    tmp.swap(*this);
+    return *this;
+  }
+  template <typename U> weak_ptr& operator=(const shared_ptr<U>& ptr) {
+    weak_ptr tmp(ptr);
+    tmp.swap(*this);
+    return *this;
+  }
+
+  void swap(weak_ptr& ptr) {
+    using std::swap;  // http://go/using-std-swap
+    swap(ptr_, ptr.ptr_);
+    swap(control_block_, ptr.control_block_);
+  }
+
+  void reset() {
+    weak_ptr tmp;
+    tmp.swap(*this);
+  }
+
+  // Return the number of shared_ptrs that own the object we are observing.
+  // Note that this number can be 0 (if this pointer has expired).
+  long use_count() const {
+    return control_block_ != NULL ? control_block_->refcount_ : 0;
+  }
+
+  bool expired() const { return use_count() == 0; }
+
+  // Return a shared_ptr that owns the object we are observing. If we
+  // have expired, the shared_ptr will be empty. We have to be careful
+  // about concurrency, though, since some other thread might be
+  // destroying the last owning shared_ptr while we're in this
+  // function.  We want to increment the refcount only if it's nonzero
+  // and get the new value, and we want that whole operation to be
+  // atomic.
+  shared_ptr<T> lock() const {
+    shared_ptr<T> result;
+    if (control_block_ != NULL) {
+      Atomic32 old_refcount;
+      do {
+        old_refcount = control_block_->refcount_;
+        if (old_refcount == 0)
+          break;
+      } while (old_refcount !=
+               NoBarrier_CompareAndSwap(
+                   &control_block_->refcount_, old_refcount,
+                   old_refcount + 1));
+      if (old_refcount > 0) {
+        result.ptr_ = ptr_;
+        result.control_block_ = control_block_;
+      }
+    }
+
+    return result;
+  }
+
+ private:
+  void CopyFrom(T* ptr, SharedPtrControlBlock* control_block) {
+    ptr_ = ptr;
+    control_block_ = control_block;
+    if (control_block_ != NULL)
+      RefCountInc(&control_block_->weak_count_);
+  }
+
+ private:
+  element_type* ptr_;
+  SharedPtrControlBlock* control_block_;
+};
+
+template <typename T> void swap(weak_ptr<T>& r, weak_ptr<T>& s) {
+  r.swap(s);
+}
+
+// See comments at the top of the file for a description of why this class
+// exists, and section 20.8.10.5 of the draft C++ standard (as of July 2009
+// the latest draft is N2914) for the detailed specification.
+template <typename T>
+class enable_shared_from_this {
+  friend class shared_ptr<T>;
+ public:
+  // Precondition: there must be a shared_ptr that owns *this and that was
+  // created, directly or indirectly, from a raw pointer of type T*. (The
+  // latter part of the condition is technical but not quite redundant; it
+  // rules out some complicated uses involving inheritance hierarchies.)
+  shared_ptr<T> shared_from_this() {
+    // Behavior is undefined if the precondition isn't satisfied; we choose
+    // to die with a CHECK failure.
+    CHECK(!weak_this_.expired()) << "No shared_ptr owns this object";
+    return weak_this_.lock();
+  }
+  shared_ptr<const T> shared_from_this() const {
+    CHECK(!weak_this_.expired()) << "No shared_ptr owns this object";
+    return weak_this_.lock();
+  }
+
+ protected:
+  enable_shared_from_this() { }
+  enable_shared_from_this(const enable_shared_from_this& other) { }
+  enable_shared_from_this& operator=(const enable_shared_from_this& other) {
+    return *this;
+  }
+  ~enable_shared_from_this() { }
+
+ private:
+  weak_ptr<T> weak_this_;
+};
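+
+// Illustrative sketch (Foo is a hypothetical type): a weak_ptr observes
+// without owning, and lock() recovers ownership only while the object lives.
+//
+//   shared_ptr<Foo> owner(new Foo);
+//   weak_ptr<Foo> observer(owner);
+//   if (shared_ptr<Foo> alive = observer.lock()) {
+//     // safe to use *alive here
+//   }
+//   owner.reset();  // observer.expired() is now true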
+
+// This is a helper function called by shared_ptr's constructor from a raw
+// pointer.  If T inherits from enable_shared_from_this<T>, it sets up
+// weak_this_ so that shared_from_this works correctly.  If T does not
+// inherit from enable_shared_from_this we get a different overload, defined
+// inline, which does nothing.
+template <typename T>
+void shared_ptr<T>::MaybeSetupWeakThis(enable_shared_from_this<T>* ptr) {
+  if (ptr) {
+    CHECK(ptr->weak_this_.expired()) << "Object already owned by a shared_ptr";
+    ptr->weak_this_ = *this;
+  }
+}
+
+#endif  // UTIL_GTL_USE_STD_SHARED_PTR
+
+}  // internal
+}  // namespace protobuf
+}  // namespace google
+
+#endif  // GOOGLE_PROTOBUF_STUBS_SHARED_PTR_H__
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/stl_util.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/stl_util.h
new file mode 100644
index 00000000..adebf10e
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/stl_util.h
@@ -0,0 +1,121 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// from google3/util/gtl/stl_util.h
+
+#ifndef GOOGLE_PROTOBUF_STUBS_STL_UTIL_H__
+#define GOOGLE_PROTOBUF_STUBS_STL_UTIL_H__
+
+#include "common.h"
+
+namespace google {
+namespace protobuf {
+
+// STLDeleteContainerPointers()
+//  For a range within a container of pointers, calls delete
+//  (non-array version) on these pointers.
+// NOTE: for these three functions, we could just implement a DeleteObject
+// functor and then call for_each() on the range and functor, but this
+// requires us to pull in all of algorithm.h, which seems expensive.
+// For hash_[multi]set, it is important that this deletes behind the iterator
+// because the hash_set may call the hash function on the iterator when it is
+// advanced, which could result in the hash function trying to dereference a
+// stale pointer.
+template <class ForwardIterator>
+void STLDeleteContainerPointers(ForwardIterator begin,
+                                ForwardIterator end) {
+  while (begin != end) {
+    ForwardIterator temp = begin;
+    ++begin;
+    delete *temp;
+  }
+}
+
+// Inside Google, this function implements a horrible, disgusting hack in which
+// we reach into the string's private implementation and resize it without
+// initializing the new bytes.  In some cases doing this can significantly
+// improve performance.  However, since it's totally non-portable it has no
+// place in open source code.  Feel free to fill this function in with your
+// own disgusting hack if you want the perf boost.
+inline void STLStringResizeUninitialized(string* s, size_t new_size) {
+  s->resize(new_size);
+}
+
+// Return a mutable char* pointing to a string's internal buffer,
+// which may not be null-terminated. Writing through this pointer will
+// modify the string.
+//
+// string_as_array(&str)[i] is valid for 0 <= i < str.size() until the
+// next call to a string method that invalidates iterators.
+//
+// As of 2006-04, there is no standard-blessed way of getting a
+// mutable reference to a string's internal buffer. However, issue 530
+// (http://www.open-std.org/JTC1/SC22/WG21/docs/lwg-active.html#530)
+// proposes this as the method. According to Matt Austern, this should
+// already work on all current implementations.
+inline char* string_as_array(string* str) {
+  // DO NOT USE const_cast<char*>(str->data())! See the unittest for why.
+  return str->empty() ? NULL : &*str->begin();
+}
+
+// STLDeleteElements() deletes all the elements in an STL container and clears
+// the container.  This function is suitable for use with a vector, set,
+// hash_set, or any other STL container which defines sensible begin(), end(),
+// and clear() methods.
+//
+// If container is NULL, this function is a no-op.
+//
+// As an alternative to calling STLDeleteElements() directly, consider
+// ElementDeleter (defined below), which ensures that your container's elements
+// are deleted when the ElementDeleter goes out of scope.
+template <class T>
+void STLDeleteElements(T *container) {
+  if (!container) return;
+  STLDeleteContainerPointers(container->begin(), container->end());
+  container->clear();
+}
+
+// Given an STL container consisting of (key, value) pairs, STLDeleteValues
+// deletes all the "value" components and clears the container.  Does nothing
+// in the case it's given a NULL pointer.
+template <class T>
+void STLDeleteValues(T *v) {
+  if (!v) return;
+  for (typename T::iterator i = v->begin(); i != v->end(); ++i) {
+    delete i->second;
+  }
+  v->clear();
+}
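+
+// Illustrative sketch (Foo is a hypothetical type):
+//
+//   std::vector<Foo*> v;
+//   v.push_back(new Foo);
+//   STLDeleteElements(&v);   // deletes every Foo and clears v
+//
+//   std::map<int, Foo*> m;
+//   m[0] = new Foo;
+//   STLDeleteValues(&m);     // deletes every mapped Foo and clears m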
+
+}  // namespace protobuf
+}  // namespace google
+
+#endif  // GOOGLE_PROTOBUF_STUBS_STL_UTIL_H__
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/template_util.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/template_util.h
new file mode 100644
index 00000000..feef904b
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/template_util.h
@@ -0,0 +1,138 @@
+// Copyright 2005 Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ----
+// Author: lar@google.com (Laramie Leavitt)
+//
+// Template metaprogramming utility functions.
+//
+// This code is compiled directly on many platforms, including client
+// platforms like Windows, Mac, and embedded systems.  Before making
+// any changes here, make sure that you're not breaking any platforms.
+//
+//
+// The names chosen here reflect those used in tr1 and the boost::mpl
+// library, there are similar operations used in the Loki library as
+// well.  I prefer the boost names for 2 reasons:
+// 1. I think that portions of the Boost libraries are more likely to
+// be included in the c++ standard.
+// 2. It is not impossible that some of the boost libraries will be
+// included in our own build in the future.
+// Both of these outcomes means that we may be able to directly replace
+// some of these with boost equivalents.
+//
+#ifndef GOOGLE_PROTOBUF_TEMPLATE_UTIL_H_
+#define GOOGLE_PROTOBUF_TEMPLATE_UTIL_H_
+
+namespace google {
+namespace protobuf {
+namespace internal {
+
+// Types small_ and big_ are guaranteed such that sizeof(small_) <
+// sizeof(big_)
+typedef char small_;
+
+struct big_ {
+  char dummy[2];
+};
+
+// Identity metafunction.
+template <class T>
+struct identity_ {
+  typedef T type;
+};
+
+// integral_constant, defined in tr1, is a wrapper for an integer
+// value. We don't really need this generality; we could get away
+// with hardcoding the integer type to bool. We use the fully
+// general integer_constant for compatibility with tr1.
+
+template <class T, T v>
+struct integral_constant {
+  static const T value = v;
+  typedef T value_type;
+  typedef integral_constant<T, v> type;
+};
+
+template <class T, T v> const T integral_constant<T, v>::value;
+
+
+// Abbreviations: true_type and false_type are structs that represent boolean
+// true and false values. Also define the boost::mpl versions of those names,
+// true_ and false_.
+typedef integral_constant<bool, true>  true_type;
+typedef integral_constant<bool, false> false_type;
+typedef true_type  true_;
+typedef false_type false_;
+
+// if_ is a templatized conditional statement.
+// if_<cond, A, B> is a compile time evaluation of cond.
+// if_<>::type contains A if cond is true, B otherwise.
+template<bool cond, typename A, typename B>
+struct if_{
+  typedef A type;
+};
+
+template<typename A, typename B>
+struct if_<false, A, B> {
+  typedef B type;
+};
+
+
+// type_equals_ is a template type comparator, similar to Loki IsSameType.
+// type_equals_<A, B>::value is true iff "A" is the same type as "B".
+//
+// New code should prefer base::is_same, defined in base/type_traits.h.
+// It is functionally identical, but is_same is the standard spelling.
+template<typename A, typename B>
+struct type_equals_ : public false_ {
+};
+
+template<typename A>
+struct type_equals_<A, A> : public true_ {
+};
+
+// and_ is a template && operator.
+// and_<A, B>::value evaluates "A::value && B::value".
+template<typename A, typename B>
+struct and_ : public integral_constant<bool, (A::value && B::value)> {
+};
+
+// or_ is a template || operator.
+// or_<A, B>::value evaluates "A::value || B::value".
+template<typename A, typename B>
+struct or_ : public integral_constant<bool, (A::value || B::value)> {
+};
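+
+// Illustrative sketch (hypothetical snippet): these metafunctions are
+// evaluated entirely at compile time, e.g.
+//
+//   if_<true, int, char>::type   // int
+//   if_<false, int, char>::type  // char
+//   and_<true_, false_>::value   // false
+//   or_<true_, false_>::value    // true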
+
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#endif  // GOOGLE_PROTOBUF_TEMPLATE_UTIL_H_
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/type_traits.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/type_traits.h
new file mode 100644
index 00000000..019e3448
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/type_traits.h
@@ -0,0 +1,372 @@
+// Copyright (c) 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ----
+// Author: Matt Austern
+//
+// This code is compiled directly on many platforms, including client
+// platforms like Windows, Mac, and embedded systems.  Before making
+// any changes here, make sure that you're not breaking any platforms.
+//
+// Define a small subset of tr1 type traits. The traits we define are:
+//   enable_if
+//   is_integral
+//   is_floating_point
+//   is_pointer
+//   is_enum
+//   is_reference
+//   is_pod
+//   has_trivial_constructor
+//   has_trivial_copy
+//   has_trivial_assign
+//   has_trivial_destructor
+//   remove_const
+//   remove_volatile
+//   remove_cv
+//   remove_reference
+//   add_reference
+//   remove_pointer
+//   is_same
+//   is_convertible
+// We can add more type traits as required.
+
+#ifndef GOOGLE_PROTOBUF_TYPE_TRAITS_H_
+#define GOOGLE_PROTOBUF_TYPE_TRAITS_H_
+
+#include <cstddef>   // for NULL
+#include <utility>   // For pair
+
+#include "template_util.h"  // For true_type and false_type
+
+namespace google {
+namespace protobuf {
+namespace internal {
+
+template<typename B, typename D>
+struct is_base_of {
+  typedef char (&yes)[1];
+  typedef char (&no)[2];
+
+  // BEGIN GOOGLE LOCAL MODIFICATION -- check is a #define on Mac.
+  #undef check
+  // END GOOGLE LOCAL MODIFICATION
+
+  static yes check(const B*);
+  static no check(const void*);
+
+  enum {
+    value = sizeof(check(static_cast<const D*>(NULL))) == sizeof(yes),
+  };
+};
+
+template <bool cond, class T = void> struct enable_if;
+template <class T> struct is_integral;
+template <class T> struct is_floating_point;
+template <class T> struct is_pointer;
+// MSVC can't compile this correctly, and neither can gcc 3.3.5 (at least)
+#if !defined(_MSC_VER) && !(defined(__GNUC__) && __GNUC__ <= 3)
+// is_enum uses is_convertible, which is not available on MSVC.
+template <class T> struct is_enum;
+#endif
+template <class T> struct is_reference;
+template <class T> struct is_pod;
+template <class T> struct has_trivial_constructor;
+template <class T> struct has_trivial_copy;
+template <class T> struct has_trivial_assign;
+template <class T> struct has_trivial_destructor;
+template <class T> struct remove_const;
+template <class T> struct remove_volatile;
+template <class T> struct remove_cv;
+template <class T> struct remove_reference;
+template <class T> struct add_reference;
+template <class T> struct remove_pointer;
+template <class T, class U> struct is_same;
+#if !(defined(__GNUC__) && __GNUC__ <= 3)
+template <class From, class To> struct is_convertible;
+#endif
+
+// enable_if, equivalent semantics to c++11 std::enable_if, specifically:
+//   "If B is true, the member typedef type shall equal T; otherwise, there
+//    shall be no member typedef type."
+// Specified by 20.9.7.6 [Other transformations]
+
+template<bool cond, class T> struct enable_if { typedef T type; };
+template<class T> struct enable_if<false, T> {};
+// is_integral is false except for the built-in integer types. A
+// cv-qualified type is integral if and only if the underlying type is.
+template <class T> struct is_integral : false_type { };
+template<> struct is_integral<bool> : true_type { };
+template<> struct is_integral<char> : true_type { };
+template<> struct is_integral<signed char> : true_type { };
+template<> struct is_integral<unsigned char> : true_type { };
+#if defined(_MSC_VER)
+// wchar_t is not by default a distinct type from unsigned short in
+// Microsoft C.
+// See http://msdn2.microsoft.com/en-us/library/dh8che7s(VS.80).aspx
+template<> struct is_integral<__wchar_t> : true_type { };
+#else
+template<> struct is_integral<wchar_t> : true_type { };
+#endif
+template<> struct is_integral<short> : true_type { };
+template<> struct is_integral<unsigned short> : true_type { };
+template<> struct is_integral<int> : true_type { };
+template<> struct is_integral<unsigned int> : true_type { };
+template<> struct is_integral<long> : true_type { };
+template<> struct is_integral<unsigned long> : true_type { };
+#ifdef HAVE_LONG_LONG
+template<> struct is_integral<long long> : true_type { };
+template<> struct is_integral<unsigned long long> : true_type { };
+#endif
+#if defined(_MSC_VER)
+// With VC, __int8, __int16, and __int32 are synonymous with standard types
+// with the same size, but __int64 has no equivalent (i.e., it's neither
+// long, nor long long and should be treated differently).
+// https://msdn.microsoft.com/en-us/library/29dh1w7z.aspx
+template<> struct is_integral<__int64> : true_type { };
+template<> struct is_integral<unsigned __int64> : true_type {};
+#endif
+template <class T> struct is_integral<const T> : is_integral<T> { };
+template <class T> struct is_integral<volatile T> : is_integral<T> { };
+template <class T> struct is_integral<const volatile T> : is_integral<T> { };
+
+// is_floating_point is false except for the built-in floating-point types.
+// A cv-qualified type is floating-point if and only if the underlying type is.
+template <class T> struct is_floating_point : false_type { };
+template<> struct is_floating_point<float> : true_type { };
+template<> struct is_floating_point<double> : true_type { };
+template<> struct is_floating_point<long double> : true_type { };
+template <class T> struct is_floating_point<const T>
+    : is_floating_point<T> { };
+template <class T> struct is_floating_point<volatile T>
+    : is_floating_point<T> { };
+template <class T> struct is_floating_point<const volatile T>
+    : is_floating_point<T> { };
+
+// is_pointer is false except for pointer types. A cv-qualified type (e.g.
+// "int* const", as opposed to "int const*") is cv-qualified if and only if
+// the underlying type is.
+template <class T> struct is_pointer : false_type { };
+template <class T> struct is_pointer<T*> : true_type { };
+template <class T> struct is_pointer<const T> : is_pointer<T> { };
+template <class T> struct is_pointer<volatile T> : is_pointer<T> { };
+template <class T> struct is_pointer<const volatile T> : is_pointer<T> { };
+
+#if !defined(_MSC_VER) && !(defined(__GNUC__) && __GNUC__ <= 3)
+
+namespace type_traits_internal {
+
+template <class T> struct is_class_or_union {
+  template <class U> static small_ tester(void (U::*)());
+  template <class U> static big_ tester(...);
+  static const bool value = sizeof(tester<T>(0)) == sizeof(small_);
+};
+
+// is_convertible chokes if the first argument is an array. That's why
+// we use add_reference here.
+template <bool NotUnum, class T> struct is_enum_impl
+    : is_convertible<typename add_reference<T>::type, int> { };
+
+template <class T> struct is_enum_impl<true, T> : false_type { };
+
+}  // namespace type_traits_internal
+
+// Specified by TR1 [4.5.1] primary type categories.
+
+// Implementation note:
+//
+// Each type is either void, integral, floating point, array, pointer,
+// reference, member object pointer, member function pointer, enum,
+// union or class. Out of these, only integral, floating point, reference,
+// class and enum types are potentially convertible to int. Therefore,
+// if a type is not a reference, integral, floating point or class and
+// is convertible to int, it's an enum. Adding cv-qualification to a type
+// does not change whether it's an enum.
+//
+// Is-convertible-to-int check is done only if all other checks pass,
+// because it can't be used with some types (e.g. void or classes with
+// inaccessible conversion operators).
+template <class T> struct is_enum
+    : type_traits_internal::is_enum_impl<
+          is_same<T, void>::value ||
+              is_integral<T>::value ||
+              is_floating_point<T>::value ||
+              is_reference<T>::value ||
+              type_traits_internal::is_class_or_union<T>::value,
+          T> { };
+
+template <class T> struct is_enum<const T> : is_enum<T> { };
+template <class T> struct is_enum<volatile T> : is_enum<T> { };
+template <class T> struct is_enum<const volatile T> : is_enum<T> { };
+
+#endif
+
+// is_reference is false except for reference types.
+template<typename T> struct is_reference : false_type {};
+template<typename T> struct is_reference<T&> : true_type {};
+
+
+// We can't get is_pod right without compiler help, so fail conservatively.
+// We will assume it's false except for arithmetic types, enumerations,
+// pointers and cv-qualified versions thereof. Note that std::pair<T, U>
+// is not a POD even if T and U are PODs.
+template <class T> struct is_pod
+ : integral_constant<bool, (is_integral<T>::value ||
+                            is_floating_point<T>::value ||
+#if !defined(_MSC_VER) && !(defined(__GNUC__) && __GNUC__ <= 3)
+                            // is_enum is not available on MSVC.
+                            is_enum<T>::value ||
+#endif
+                            is_pointer<T>::value)> { };
+template <class T> struct is_pod<const T> : is_pod<T> { };
+template <class T> struct is_pod<volatile T> : is_pod<T> { };
+template <class T> struct is_pod<const volatile T> : is_pod<T> { };
+
+
+// We can't get has_trivial_constructor right without compiler help, so
+// fail conservatively. We will assume it's false except for: (1) types
+// for which is_pod is true. (2) std::pair of types with trivial
+
+// We can't get has_trivial_constructor right without compiler help, so
+// fail conservatively. We will assume it's false except for: (1) types
+// for which is_pod is true. (2) std::pair of types with trivial
+// constructors. (3) array of a type with a trivial constructor.
+// (4) const versions thereof.
+template <class T> struct has_trivial_constructor : is_pod<T> { };
+template <class T, class U> struct has_trivial_constructor<std::pair<T, U> >
+    : integral_constant<bool,
+                        (has_trivial_constructor<T>::value &&
+                         has_trivial_constructor<U>::value)> { };
+template <class A, int N> struct has_trivial_constructor<A[N]>
+    : has_trivial_constructor<A> { };
+template <class T> struct has_trivial_constructor<const T>
+    : has_trivial_constructor<T> { };
+
+// We can't get has_trivial_copy right without compiler help, so fail
+// conservatively. We will assume it's false except for: (1) types
+// for which is_pod is true. (2) std::pair of types with trivial copy
+// constructors. (3) array of a type with a trivial copy constructor.
+// (4) const versions thereof.
+template <class T> struct has_trivial_copy : is_pod<T> { };
+template <class T, class U> struct has_trivial_copy<std::pair<T, U> >
+    : integral_constant<bool,
+                        (has_trivial_copy<T>::value &&
+                         has_trivial_copy<U>::value)> { };
+template <class A, int N> struct has_trivial_copy<A[N]>
+    : has_trivial_copy<A> { };
+template <class T> struct has_trivial_copy<const T> : has_trivial_copy<T> { };
+
+// We can't get has_trivial_assign right without compiler help, so fail
+// conservatively. We will assume it's false except for: (1) types
+// for which is_pod is true. (2) std::pair of types with trivial copy
+// constructors. (3) array of a type with a trivial assign constructor.
+template <class T> struct has_trivial_assign : is_pod<T> { };
+template <class T, class U> struct has_trivial_assign<std::pair<T, U> >
+    : integral_constant<bool,
+                        (has_trivial_assign<T>::value &&
+                         has_trivial_assign<U>::value)> { };
+template <class A, int N> struct has_trivial_assign<A[N]>
+    : has_trivial_assign<A> { };
+
+// We can't get has_trivial_destructor right without compiler help, so
+// fail conservatively. We will assume it's false except for: (1) types
+// for which is_pod is true. (2) std::pair of types with trivial
+// destructors. (3) array of a type with a trivial destructor.
+// (4) const versions thereof.
+template <class T> struct has_trivial_destructor : is_pod<T> { };
+template <class T, class U> struct has_trivial_destructor<std::pair<T, U> >
+    : integral_constant<bool,
+                        (has_trivial_destructor<T>::value &&
+                         has_trivial_destructor<U>::value)> { };
+template <class A, int N> struct has_trivial_destructor<A[N]>
+    : has_trivial_destructor<A> { };
+template <class T> struct has_trivial_destructor<const T>
+    : has_trivial_destructor<T> { };
+
+// Specified by TR1 [4.7.1]
+template <typename T> struct remove_const { typedef T type; };
+template <typename T> struct remove_const<T const> { typedef T type; };
+template <typename T> struct remove_volatile { typedef T type; };
+template <typename T> struct remove_volatile<T volatile> { typedef T type; };
+template <typename T> struct remove_cv {
+  typedef typename remove_const<typename remove_volatile<T>::type>::type type;
+};
+
+// Specified by TR1 [4.7.2] Reference modifications.
+template <typename T> struct remove_reference { typedef T type; };
+template <typename T> struct remove_reference<T&> { typedef T type; };
+
+template <typename T> struct add_reference { typedef T& type; };
+template <typename T> struct add_reference<T&> { typedef T& type; };
+
+// Specified by TR1 [4.7.4] Pointer modifications.
+template <typename T> struct remove_pointer { typedef T type; };
+template <typename T> struct remove_pointer<T*> { typedef T type; };
+template <typename T> struct remove_pointer<T* const> { typedef T type; };
+template <typename T> struct remove_pointer<T* volatile> { typedef T type; };
+template <typename T> struct remove_pointer<T* const volatile> {
+  typedef T type; };
+
+// Specified by TR1 [4.6] Relationships between types
+template <typename T, typename U> struct is_same : public false_type { };
+template <typename T> struct is_same<T, T> : public true_type { };
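+
+// Illustrative sketch (not part of the upstream header) of the
+// transformations above; all checks are compile-time:
+//
+//   typedef char a1[is_same<remove_cv<const volatile int>::type,
+//                           int>::value ? 1 : -1];
+//   typedef char a2[is_same<remove_pointer<int* const>::type,
+//                           int>::value ? 1 : -1];
+//   // add_reference<T&> is specialized to yield T& again, so it never
+//   // tries to form an (illegal, pre-C++11) reference to a reference:
+//   typedef char a3[is_same<add_reference<int&>::type,
+//                           int&>::value ? 1 : -1];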
+
+// Specified by TR1 [4.6] Relationships between types
+#if !(defined(__GNUC__) && __GNUC__ <= 3)
+namespace type_traits_internal {
+
+// This class is an implementation detail for is_convertible, and you
+// don't need to know how it works to use is_convertible. For those
+// who care: we declare two different functions, one whose argument is
+// of type To and one with a variadic argument list. We give them
+// return types of different size, so we can use sizeof to trick the
+// compiler into telling us which function it would have chosen if we
+// had called it with an argument of type From. See Alexandrescu's
+// _Modern C++ Design_ for more details on this sort of trick.
+
+template <typename From, typename To>
+struct ConvertHelper {
+  static small_ Test(To);
+  static big_ Test(...);
+  static From Create();
+  enum {
+    value = sizeof(Test(Create())) == sizeof(small_)
+  };
+};
+
+}  // namespace type_traits_internal
+
+// Inherits from true_type if From is convertible to To, false_type otherwise.
+template <typename From, typename To>
+struct is_convertible
+    : integral_constant<bool,
+                        type_traits_internal::ConvertHelper<From, To>::value> {
+};
+#endif
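+
+// Illustrative sketch (not part of the upstream header): ConvertHelper never
+// evaluates Test() or Create() -- only the sizeof of the call expression is
+// inspected, so no definitions are required. Celsius is hypothetical:
+//
+//   struct Celsius { Celsius(double) {} };
+//   typedef char c1[is_convertible<double, Celsius>::value ? 1 : -1];
+//   typedef char c2[!is_convertible<void*, Celsius>::value ? 1 : -1];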
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#endif  // GOOGLE_PROTOBUF_TYPE_TRAITS_H_
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/text_format.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/text_format.h
new file mode 100644
index 00000000..cabbf331
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/text_format.h
@@ -0,0 +1,521 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc. All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Author: jschorr@google.com (Joseph Schorr)
+//  Based on original Protocol Buffers design by
+//  Sanjay Ghemawat, Jeff Dean, and others.
+//
+// Utilities for printing and parsing protocol messages in a human-readable,
+// text-based format.
+
+#ifndef GOOGLE_PROTOBUF_TEXT_FORMAT_H__
+#define GOOGLE_PROTOBUF_TEXT_FORMAT_H__
+
+#include <map>
+#include <memory>
+#ifndef _SHARED_PTR_H
+#include "shared_ptr.h"
+#endif
+#include <string>
+#include <vector>
+
+#include "common.h"
+#include "descriptor.h"
+#include "message.h"
+
+namespace google {
+namespace protobuf {
+
+namespace io {
+  class ErrorCollector;  // tokenizer.h
+}
+
+// This class implements protocol buffer text format. Printing and parsing
+// protocol messages in text format is useful for debugging and human editing
+// of messages.
+//
+// This class is really a namespace that contains only static methods.
+class LIBPROTOBUF_EXPORT TextFormat {
+ public:
+  // Outputs a textual representation of the given message to the given
+  // output stream.
+  static bool Print(const Message& message, io::ZeroCopyOutputStream* output);
+
+  // Print the fields in an UnknownFieldSet. They are printed by tag number
+  // only. Embedded messages are heuristically identified by attempting to
+  // parse them.
+  static bool PrintUnknownFields(const UnknownFieldSet& unknown_fields,
+                                 io::ZeroCopyOutputStream* output);
+
+  // Like Print(), but outputs directly to a string.
+  // Note: output will be cleared prior to printing, and will
+  // be left empty even if printing fails.
+  static bool PrintToString(const Message& message, string* output);
+
+  // Like PrintUnknownFields(), but outputs directly to a string.
+  static bool PrintUnknownFieldsToString(const UnknownFieldSet& unknown_fields,
+                                         string* output);
+
+  // Outputs a textual representation of the value of the field supplied on
+  // the message supplied. For non-repeated fields, an index of -1 must
+  // be supplied. Note that this method will print the default value for a
+  // field if it is not set.
+  static void PrintFieldValueToString(const Message& message,
+                                      const FieldDescriptor* field,
+                                      int index,
+                                      string* output);
+
+  // The default printer that converts scalar values from fields into
+  // their string representation.
+  // You can derive from this FieldValuePrinter if you want to have
+  // fields to be printed in a different way and register it at the
+  // Printer.
+  class LIBPROTOBUF_EXPORT FieldValuePrinter {
+   public:
+    FieldValuePrinter();
+    virtual ~FieldValuePrinter();
+    virtual string PrintBool(bool val) const;
+    virtual string PrintInt32(int32 val) const;
+    virtual string PrintUInt32(uint32 val) const;
+    virtual string PrintInt64(int64 val) const;
+    virtual string PrintUInt64(uint64 val) const;
+    virtual string PrintFloat(float val) const;
+    virtual string PrintDouble(double val) const;
+    virtual string PrintString(const string& val) const;
+    virtual string PrintBytes(const string& val) const;
+    virtual string PrintEnum(int32 val, const string& name) const;
+    virtual string PrintFieldName(const Message& message,
+                                  const Reflection* reflection,
+                                  const FieldDescriptor* field) const;
+    virtual string PrintMessageStart(const Message& message,
+                                     int field_index,
+                                     int field_count,
+                                     bool single_line_mode) const;
+    virtual string PrintMessageEnd(const Message& message,
+                                   int field_index,
+                                   int field_count,
+                                   bool single_line_mode) const;
+
+   private:
+    GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(FieldValuePrinter);
+  };
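+
+  // Illustrative sketch (not part of the upstream header): a custom printer
+  // overrides only the methods whose formatting it wants to change, and is
+  // then handed to a Printer (see SetDefaultFieldValuePrinter below).
+  // HexPrinter is hypothetical; StringPrintf is assumed to come from the
+  // bundled stubs:
+  //
+  //   class HexPrinter : public TextFormat::FieldValuePrinter {
+  //    public:
+  //     virtual string PrintInt32(int32 val) const {
+  //       return StringPrintf("0x%08x", val);
+  //     }
+  //   };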
+
+  // Class for those users which require more fine-grained control over how
+  // a protocol buffer message is printed out.
+  class LIBPROTOBUF_EXPORT Printer {
+   public:
+    Printer();
+    ~Printer();
+
+    // Like TextFormat::Print
+    bool Print(const Message& message, io::ZeroCopyOutputStream* output) const;
+    // Like TextFormat::PrintUnknownFields
+    bool PrintUnknownFields(const UnknownFieldSet& unknown_fields,
+                            io::ZeroCopyOutputStream* output) const;
+    // Like TextFormat::PrintToString
+    bool PrintToString(const Message& message, string* output) const;
+    // Like TextFormat::PrintUnknownFieldsToString
+    bool PrintUnknownFieldsToString(const UnknownFieldSet& unknown_fields,
+                                    string* output) const;
+    // Like TextFormat::PrintFieldValueToString
+    void PrintFieldValueToString(const Message& message,
+                                 const FieldDescriptor* field,
+                                 int index,
+                                 string* output) const;
+
+    // Adjust the initial indent level of all output. Each indent level is
+    // equal to two spaces.
+    void SetInitialIndentLevel(int indent_level) {
+      initial_indent_level_ = indent_level;
+    }
+
+    // If printing in single line mode, then the entire message will be output
+    // on a single line with no line breaks.
+    void SetSingleLineMode(bool single_line_mode) {
+      single_line_mode_ = single_line_mode;
+    }
+
+    bool IsInSingleLineMode() {
+      return single_line_mode_;
+    }
+
+    // If use_field_number is true, uses field number instead of field name.
+    void SetUseFieldNumber(bool use_field_number) {
+      use_field_number_ = use_field_number;
+    }
+
+    // Set true to print repeated primitives in a format like:
+    //   field_name: [1, 2, 3, 4]
+    // instead of printing each value on its own line. Short format applies
+    // only to primitive values -- i.e. everything except strings and
+    // sub-messages/groups.
+    void SetUseShortRepeatedPrimitives(bool use_short_repeated_primitives) {
+      use_short_repeated_primitives_ = use_short_repeated_primitives;
+    }
+
+    // Set true to output UTF-8 instead of ASCII. The only difference
+    // is that bytes >= 0x80 in string fields will not be escaped,
+    // because they are assumed to be part of UTF-8 multi-byte
+    // sequences. This will change the default FieldValuePrinter.
+    void SetUseUtf8StringEscaping(bool as_utf8);
+
+    // Set the default FieldValuePrinter that is used for all fields that
+    // don't have a field-specific printer registered.
+    // Takes ownership of the printer.
+    void SetDefaultFieldValuePrinter(const FieldValuePrinter* printer);
+
+    // Sets whether we want to hide unknown fields or not.
+    // Usually unknown fields are printed in a generic way that includes the
+    // tag number of the field instead of field name. However, sometimes it
+    // is useful to be able to print the message without unknown fields (e.g.
+    // for the python protobuf version to maintain consistency between its pure
+    // python and c++ implementations).
+    void SetHideUnknownFields(bool hide) {
+      hide_unknown_fields_ = hide;
+    }
+
+    // If print_message_fields_in_index_order is true, print fields of a proto
+    // message using the order defined in source code instead of the field
+    // number. By default, use the field number order.
+    void SetPrintMessageFieldsInIndexOrder(
+        bool print_message_fields_in_index_order) {
+      print_message_fields_in_index_order_ =
+          print_message_fields_in_index_order;
+    }
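+
+    // Illustrative sketch (not part of the upstream header): a Printer is
+    // configured once and then reused; the setters above combine freely.
+    // msg stands for any google::protobuf::Message:
+    //
+    //   TextFormat::Printer printer;
+    //   printer.SetSingleLineMode(true);
+    //   printer.SetUseShortRepeatedPrimitives(true);
+    //   string out;
+    //   printer.PrintToString(msg, &out);  // e.g. "id: 123 vals: [1, 2, 3]"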
+
+    // If expand==true, expand google.protobuf.Any payloads. The output
+    // will be of form
+    //    [type_url] { <value_printed_in_text> }
+    //
+    // If expand==false, print Any using the default printer. The output will
+    // look like
+    //    type_url: "<type_url>"  value: "serialized_content"
+    void SetExpandAny(bool expand) {
+      expand_any_ = expand;
+    }
+
+    // If non-zero, we truncate all string fields that are longer than this
+    // threshold. This is useful when the proto message has very long strings,
+    // e.g., dump of encoded image file.
+    //
+    // NOTE(hfgong): Setting a non-zero value breaks round-trip safe
+    // property of TextFormat::Printer. That is, from the printed message, we
+    // cannot fully recover the original string field any more.
+    void SetTruncateStringFieldLongerThan(
+        const int64 truncate_string_field_longer_than) {
+      truncate_string_field_longer_than_ = truncate_string_field_longer_than;
+    }
+
+    // Register a custom field-specific FieldValuePrinter for fields
+    // with a particular FieldDescriptor.
+    // Returns "true" if the registration succeeded, or "false", if there is
+    // already a printer for that FieldDescriptor.
+    // Takes ownership of the printer on successful registration.
+    bool RegisterFieldValuePrinter(const FieldDescriptor* field,
+                                   const FieldValuePrinter* printer);
+
+   private:
+    // Forward declaration of an internal class used to print the text
+    // output to the OutputStream (see text_format.cc for implementation).
+    class TextGenerator;
+
+    // Internal Print method, used for writing to the OutputStream via
+    // the TextGenerator class.
+    void Print(const Message& message,
+               TextGenerator& generator) const;
+
+    // Print a single field.
+    void PrintField(const Message& message,
+                    const Reflection* reflection,
+                    const FieldDescriptor* field,
+                    TextGenerator& generator) const;
+
+    // Print a repeated primitive field in short form.
+    void PrintShortRepeatedField(const Message& message,
+                                 const Reflection* reflection,
+                                 const FieldDescriptor* field,
+                                 TextGenerator& generator) const;
+
+    // Print the name of a field -- i.e. everything that comes before the
+    // ':' for a single name/value pair.
+    void PrintFieldName(const Message& message,
+                        const Reflection* reflection,
+                        const FieldDescriptor* field,
+                        TextGenerator& generator) const;
+
+    // Outputs a textual representation of the value of the field supplied on
+    // the message supplied or the default value if not set.
+    void PrintFieldValue(const Message& message,
+                         const Reflection* reflection,
+                         const FieldDescriptor* field,
+                         int index,
+                         TextGenerator& generator) const;
+
+    // Print the fields in an UnknownFieldSet. They are printed by tag number
+    // only. Embedded messages are heuristically identified by attempting to
+    // parse them.
+    void PrintUnknownFields(const UnknownFieldSet& unknown_fields,
+                            TextGenerator& generator) const;
+
+    bool PrintAny(const Message& message, TextGenerator& generator) const;
+
+    int initial_indent_level_;
+
+    bool single_line_mode_;
+
+    bool use_field_number_;
+
+    bool use_short_repeated_primitives_;
+
+    bool hide_unknown_fields_;
+
+    bool print_message_fields_in_index_order_;
+
+    bool expand_any_;
+
+    int64 truncate_string_field_longer_than_;
+
+    google::protobuf::scoped_ptr<const FieldValuePrinter>
+        default_field_value_printer_;
+    typedef std::map<const FieldDescriptor*,
+                     const FieldValuePrinter*> CustomPrinterMap;
+    CustomPrinterMap custom_printers_;
+  };
+
+  // Parses a text-format protocol message from the given input stream to
+  // the given message object. This function parses the human-readable format
+  // written by Print(). Returns true on success. The message is cleared first,
+  // even if the function fails -- See Merge() to avoid this behavior.
+  //
+  // Example input: "user {\n id: 123 extra { gender: MALE language: 'en' }\n}"
+  //
+  // One use for this function is parsing handwritten strings in test code.
+  // Another use is to parse the output from
+  // google::protobuf::Message::DebugString() (or ShortDebugString()), because
+  // these functions output using google::protobuf::TextFormat::Print().
+  //
+  // If you would like to read a protocol buffer serialized in the
+  // (non-human-readable) binary wire format, see
+  // google::protobuf::MessageLite::ParseFromString().
+  static bool Parse(io::ZeroCopyInputStream* input, Message* output);
+  // Like Parse(), but reads directly from a string.
+  static bool ParseFromString(const string& input, Message* output);
+
+  // Like Parse(), but the data is merged into the given message, as if
+  // using Message::MergeFrom().
+  static bool Merge(io::ZeroCopyInputStream* input, Message* output);
+  // Like Merge(), but reads directly from a string.
+  static bool MergeFromString(const string& input, Message* output);
+
+  // Parse the given text as a single field value and store it into the
+  // given field of the given message. If the field is a repeated field,
+  // the new value will be added to the end.
+  static bool ParseFieldValueFromString(const string& input,
+                                        const FieldDescriptor* field,
+                                        Message* message);
+
+  // Interface that TextFormat::Parser can use to find extensions.
+  // This class may be extended in the future to find more information
+  // like fields, etc.
+  class LIBPROTOBUF_EXPORT Finder {
+   public:
+    virtual ~Finder();
+
+    // Try to find an extension of *message by fully-qualified field
+    // name. Returns NULL if no extension is known for this name or number.
+    virtual const FieldDescriptor* FindExtension(
+        Message* message,
+        const string& name) const = 0;
+  };
+
+  // A location in the parsed text.
+  struct ParseLocation {
+    int line;
+    int column;
+
+    ParseLocation() : line(-1), column(-1) {}
+    ParseLocation(int line_param, int column_param)
+        : line(line_param), column(column_param) {}
+  };
+
+  // Data structure which is populated with the locations of each field
+  // value parsed from the text.
+  class LIBPROTOBUF_EXPORT ParseInfoTree {
+   public:
+    ParseInfoTree();
+    ~ParseInfoTree();
+
+    // Returns the parse location for index-th value of the field in the parsed
+    // text. If none exists, returns a location with line = -1. Index should be
+    // -1 for not-repeated fields.
+    ParseLocation GetLocation(const FieldDescriptor* field, int index) const;
+
+    // Returns the parse info tree for the given field, which must be a message
+    // type. The nested information tree is owned by the root tree and will be
+    // deleted when it is deleted.
+    ParseInfoTree* GetTreeForNested(const FieldDescriptor* field,
+                                    int index) const;
+
+   private:
+    // Allow the text format parser to record information into the tree.
+    friend class TextFormat;
+
+    // Records the starting location of a single value for a field.
+    void RecordLocation(const FieldDescriptor* field, ParseLocation location);
+
+    // Creates and records a nested tree for a nested message field.
+    ParseInfoTree* CreateNested(const FieldDescriptor* field);
+
+    // Defines the map from the index-th field descriptor to its parse
+    // location.
+    typedef std::map<const FieldDescriptor*,
+                     std::vector<ParseLocation> > LocationMap;
+
+    // Defines the map from the index-th field descriptor to the nested parse
+    // info tree.
+    typedef std::map<const FieldDescriptor*,
+                     std::vector<ParseInfoTree*> > NestedMap;
+
+    LocationMap locations_;
+    NestedMap nested_;
+
+    GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ParseInfoTree);
+  };
+
+  // For more control over parsing, use this class.
+  class LIBPROTOBUF_EXPORT Parser {
+   public:
+    Parser();
+    ~Parser();
+
+    // Like TextFormat::Parse().
+    bool Parse(io::ZeroCopyInputStream* input, Message* output);
+    // Like TextFormat::ParseFromString().
+    bool ParseFromString(const string& input, Message* output);
+    // Like TextFormat::Merge().
+    bool Merge(io::ZeroCopyInputStream* input, Message* output);
+    // Like TextFormat::MergeFromString().
+    bool MergeFromString(const string& input, Message* output);
+
+    // Set where to report parse errors. If NULL (the default), errors will
+    // be printed to stderr.
+    void RecordErrorsTo(io::ErrorCollector* error_collector) {
+      error_collector_ = error_collector;
+    }
+
+    // Set how parser finds extensions. If NULL (the default), the
+    // parser will use the standard Reflection object associated with
+    // the message being parsed.
+    void SetFinder(Finder* finder) {
+      finder_ = finder;
+    }
+
+    // Sets where location information about the parse will be written. If NULL
+    // (the default), then no location will be written.
+    void WriteLocationsTo(ParseInfoTree* tree) {
+      parse_info_tree_ = tree;
+    }
+
+    // Normally parsing fails if, after parsing, output->IsInitialized()
+    // returns false. Call AllowPartialMessage(true) to skip this check.
+    void AllowPartialMessage(bool allow) {
+      allow_partial_ = allow;
+    }
+
+    // Allow field names to be matched case-insensitively.
+    // This is not advisable if there are fields that only differ in case, or
+    // if you want to enforce writing in the canonical form.
+    // This is 'false' by default.
+    void AllowCaseInsensitiveField(bool allow) {
+      allow_case_insensitive_field_ = allow;
+    }
+
+    // Like TextFormat::ParseFieldValueFromString
+    bool ParseFieldValueFromString(const string& input,
+                                   const FieldDescriptor* field,
+                                   Message* output);
+
+    // Allow the parser to accept field numbers in place of field names.
+    void AllowFieldNumber(bool allow) {
+      allow_field_number_ = allow;
+    }
+
+   private:
+    // Forward declaration of an internal class used to parse text
+    // representations (see text_format.cc for implementation).
+    class ParserImpl;
+
+    // Like TextFormat::Merge(). The provided implementation is used
+    // to do the parsing.
+    bool MergeUsingImpl(io::ZeroCopyInputStream* input,
+                        Message* output,
+                        ParserImpl* parser_impl);
+
+    io::ErrorCollector* error_collector_;
+    Finder* finder_;
+    ParseInfoTree* parse_info_tree_;
+    bool allow_partial_;
+    bool allow_case_insensitive_field_;
+    bool allow_unknown_field_;
+    bool allow_unknown_enum_;
+    bool allow_field_number_;
+    bool allow_relaxed_whitespace_;
+    bool allow_singular_overwrites_;
+  };
+
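+  // Illustrative sketch (not part of the upstream header): the static
+  // TextFormat helpers cover the common case; a Parser object exposes the
+  // knobs above. MyMessage is a hypothetical generated type:
+  //
+  //   MyMessage msg;
+  //   TextFormat::Parser parser;
+  //   parser.AllowPartialMessage(true);  // skip the IsInitialized() check
+  //   if (!parser.ParseFromString("id: 123", &msg)) {
+  //     // report the error (see RecordErrorsTo above)
+  //   }
+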
+ private:
+  // Hack: ParseInfoTree declares TextFormat as a friend which should extend
+  // the friendship to TextFormat::Parser::ParserImpl, but unfortunately some
+  // old compilers (e.g. GCC 3.4.6) don't implement this correctly. We provide
+  // helpers for ParserImpl to call methods of ParseInfoTree.
+  static inline void RecordLocation(ParseInfoTree* info_tree,
+                                    const FieldDescriptor* field,
+                                    ParseLocation location);
+  static inline ParseInfoTree* CreateNested(ParseInfoTree* info_tree,
+                                            const FieldDescriptor* field);
+
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(TextFormat);
+};
+
+inline void TextFormat::RecordLocation(ParseInfoTree* info_tree,
+                                       const FieldDescriptor* field,
+                                       ParseLocation location) {
+  info_tree->RecordLocation(field, location);
+}
+
+inline TextFormat::ParseInfoTree* TextFormat::CreateNested(
+    ParseInfoTree* info_tree, const FieldDescriptor* field) {
+  return info_tree->CreateNested(field);
+}
+
+}  // namespace protobuf
+
+}  // namespace google
+#endif  // GOOGLE_PROTOBUF_TEXT_FORMAT_H__
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/unknown_field_set.h b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/unknown_field_set.h
new file mode 100644
index 00000000..257f1c0a
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/unknown_field_set.h
@@ -0,0 +1,354 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc. All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Author: kenton@google.com (Kenton Varda)
+//  Based on original Protocol Buffers design by
+//  Sanjay Ghemawat, Jeff Dean, and others.
+//
+// Contains classes used to keep track of unrecognized fields seen while
+// parsing a protocol message.
+
+#ifndef GOOGLE_PROTOBUF_UNKNOWN_FIELD_SET_H__
+#define GOOGLE_PROTOBUF_UNKNOWN_FIELD_SET_H__
+
+#include <assert.h>
+#include <string>
+#include <vector>
+#include "common.h"
+#include "logging.h"
+
+namespace google {
+namespace protobuf {
+  namespace io {
+    class CodedInputStream;         // coded_stream.h
+    class CodedOutputStream;        // coded_stream.h
+    class ZeroCopyInputStream;      // zero_copy_stream.h
+  }
+  namespace internal {
+    class InternalMetadataWithArena;  // metadata.h
+    class WireFormat;               // wire_format.h
+    class MessageSetFieldSkipperUsingCord;
+                                    // extension_set_heavy.cc
+  }
+
+class Message;                      // message.h
+class UnknownField;                 // below
+
+// An UnknownFieldSet contains fields that were encountered while parsing a
+// message but were not defined by its type. Keeping track of these can be
+// useful, especially in that they may be written if the message is serialized
+// again without being cleared in between. This means that software which
+// simply receives messages and forwards them to other servers does not need
+// to be updated every time a new field is added to the message definition.
+//
+// To get the UnknownFieldSet attached to any message, call
+// Reflection::GetUnknownFields().
+//
+// This class is necessarily tied to the protocol buffer wire format, unlike
+// the Reflection interface which is independent of any serialization scheme.
+class LIBPROTOBUF_EXPORT UnknownFieldSet {
+ public:
+  UnknownFieldSet();
+  ~UnknownFieldSet();
+
+  // Remove all fields.
+  inline void Clear();
+
+  // Remove all fields and deallocate internal data objects
+  void ClearAndFreeMemory();
+
+  // Is this set empty?
+  inline bool empty() const;
+
+  // Merge the contents of some other UnknownFieldSet with this one.
+  void MergeFrom(const UnknownFieldSet& other);
+
+  // Similar to above, but this function will destroy the contents of other.
+  void MergeFromAndDestroy(UnknownFieldSet* other);
+
+  // Merge the contents of an UnknownFieldSet with the UnknownFieldSet in
+  // *metadata, if there is one. If *metadata doesn't have an UnknownFieldSet
+  // then add one to it and make it be a copy of the first arg.
+  static void MergeToInternalMetdata(
+      const UnknownFieldSet& other,
+      internal::InternalMetadataWithArena* metadata);
+
+  // Swaps the contents of some other UnknownFieldSet with this one.
+  inline void Swap(UnknownFieldSet* x);
+
+  // Computes (an estimate of) the total number of bytes currently used for
+  // storing the unknown fields in memory. Does NOT include
+  // sizeof(*this) in the calculation.
+  int SpaceUsedExcludingSelf() const;
+
+  // Version of SpaceUsed() including sizeof(*this).
+  int SpaceUsed() const;
+
+  // Returns the number of fields present in the UnknownFieldSet.
+  inline int field_count() const;
+  // Get a field in the set, where 0 <= index < field_count(). The fields
+  // appear in the order in which they were added.
+  inline const UnknownField& field(int index) const;
+  // Get a mutable pointer to a field in the set, where
+  // 0 <= index < field_count(). The fields appear in the order in which
+  // they were added.
+  inline UnknownField* mutable_field(int index);
+
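+  // Illustrative sketch (not part of the upstream header): unknown fields
+  // are inspected by index, branching on each field's wire type. msg and
+  // reflection are hypothetical:
+  //
+  //   const UnknownFieldSet& ufs = reflection->GetUnknownFields(msg);
+  //   for (int i = 0; i < ufs.field_count(); i++) {
+  //     const UnknownField& f = ufs.field(i);
+  //     if (f.type() == UnknownField::TYPE_VARINT) {
+  //       // f.number() is the wire tag, f.varint() the raw value.
+  //     }
+  //   }
+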
+  // Adding fields ---------------------------------------------------
+
+  void AddVarint(int number, uint64 value);
+  void AddFixed32(int number, uint32 value);
+  void AddFixed64(int number, uint64 value);
+  void AddLengthDelimited(int number, const string& value);
+  string* AddLengthDelimited(int number);
+  UnknownFieldSet* AddGroup(int number);
+
+  // Adds an unknown field from another set.
+  void AddField(const UnknownField& field);
+
+  // Delete fields with indices in the range [start .. start+num-1].
+  // Caution: implementation moves all fields with indices [start+num .. ].
+  void DeleteSubrange(int start, int num);
+
+  // Delete all fields with a specific field number. The order of the
+  // remaining fields is preserved.
+  // Caution: implementation moves all fields after the first deleted field.
+  void DeleteByNumber(int number);
+
+  // Parsing helpers -------------------------------------------------
+  // These work exactly like the similarly-named methods of Message.
+
+  bool MergeFromCodedStream(io::CodedInputStream* input);
+  bool ParseFromCodedStream(io::CodedInputStream* input);
+  bool ParseFromZeroCopyStream(io::ZeroCopyInputStream* input);
+  bool ParseFromArray(const void* data, int size);
+  inline bool ParseFromString(const string& data) {
+    return ParseFromArray(data.data(), static_cast<int>(data.size()));
+  }
+
+  static const UnknownFieldSet* default_instance();
+ private:
+  // For InternalMergeFrom
+  friend class UnknownField;
+  // Merges from other UnknownFieldSet. This method assumes, that this object
+  // is newly created and has fields_ == NULL;
+  void InternalMergeFrom(const UnknownFieldSet& other);
+  void ClearFallback();
+
+  // fields_ is either NULL, or a pointer to a vector that is *non-empty*. We
+  // never hold the empty vector because we want the 'do we have any unknown
+  // fields' check to be fast, and avoid a cache miss: the UFS instance gets
+  // embedded in the message object, so 'fields_ != NULL' tests a member
+  // variable hot in the cache, without the need to go touch a vector somewhere
+  // else in memory.
+  std::vector<UnknownField>* fields_;
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(UnknownFieldSet);
+};
+
+// Represents one field in an UnknownFieldSet.
+class LIBPROTOBUF_EXPORT UnknownField {
+ public:
+  enum Type {
+    TYPE_VARINT,
+    TYPE_FIXED32,
+    TYPE_FIXED64,
+    TYPE_LENGTH_DELIMITED,
+    TYPE_GROUP
+  };
+
+  // The field's field number, as seen on the wire.
+  inline int number() const;
+
+  // The field type.
+  inline Type type() const;
+
+  // Accessors -------------------------------------------------------
+  // Each method works only for UnknownFields of the corresponding type.
+
+  inline uint64 varint() const;
+  inline uint32 fixed32() const;
+  inline uint64 fixed64() const;
+  inline const string& length_delimited() const;
+  inline const UnknownFieldSet& group() const;
+
+  inline void set_varint(uint64 value);
+  inline void set_fixed32(uint32 value);
+  inline void set_fixed64(uint64 value);
+  inline void set_length_delimited(const string& value);
+  inline string* mutable_length_delimited();
+  inline UnknownFieldSet* mutable_group();
+
+  // Serialization API.
+  // These methods can take advantage of the underlying implementation and may
+  // achieve a better performance than using getters to retrieve the data and
+  // do the serialization yourself.
+  void SerializeLengthDelimitedNoTag(io::CodedOutputStream* output) const;
+  uint8* SerializeLengthDelimitedNoTagToArray(uint8* target) const;
+
+  inline size_t GetLengthDelimitedSize() const;
+
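+  // Illustrative sketch (not part of the upstream header): the accessors
+  // assert() on the stored type, so callers branch on type() first:
+  //
+  //   if (f.type() == UnknownField::TYPE_LENGTH_DELIMITED) {
+  //     const string& payload = f.length_delimited();  // safe here
+  //   }
+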
+ private:
+  friend class UnknownFieldSet;
+
+  // If this UnknownField contains a pointer, delete it.
+  void Delete();
+
+  // Reset all the underlying pointers to NULL. A special function to be only
+  // used while merging from a temporary UFS.
+  void Reset();
+
+  // Make a deep copy of any pointers in this UnknownField.
+  void DeepCopy(const UnknownField& other);
+
+  // Set the wire type of this UnknownField. Should only be used when this
+  // UnknownField is being created.
+  inline void SetType(Type type);
+
+  union LengthDelimited {
+    string* string_value_;
+  };
+
+  uint32 number_;
+  uint32 type_;
+  union {
+    uint64 varint_;
+    uint32 fixed32_;
+    uint64 fixed64_;
+    mutable union LengthDelimited length_delimited_;
+    UnknownFieldSet* group_;
+  };
+};
+
+// ===================================================================
+// inline implementations
+
+inline UnknownFieldSet::UnknownFieldSet() : fields_(NULL) {}
+
+inline UnknownFieldSet::~UnknownFieldSet() { Clear(); }
+
+inline void UnknownFieldSet::ClearAndFreeMemory() { Clear(); }
+
+inline void UnknownFieldSet::Clear() {
+  if (fields_ != NULL) {
+    ClearFallback();
+  }
+}
+
+inline bool UnknownFieldSet::empty() const {
+  // Invariant: fields_ is never empty if present.
+  return !fields_;
+}
+
+inline void UnknownFieldSet::Swap(UnknownFieldSet* x) {
+  std::swap(fields_, x->fields_);
+}
+
+inline int UnknownFieldSet::field_count() const {
+  return fields_ ? static_cast<int>(fields_->size()) : 0;
+}
+inline const UnknownField& UnknownFieldSet::field(int index) const {
+  GOOGLE_DCHECK(fields_ != NULL);
+  return (*fields_)[index];
+}
+inline UnknownField* UnknownFieldSet::mutable_field(int index) {
+  return &(*fields_)[index];
+}
+
+inline void UnknownFieldSet::AddLengthDelimited(
+    int number, const string& value) {
+  AddLengthDelimited(number)->assign(value);
+}
+
+
+inline int UnknownField::number() const { return number_; }
+inline UnknownField::Type UnknownField::type() const {
+  return static_cast<Type>(type_);
+}
+
+inline uint64 UnknownField::varint() const {
+  assert(type() == TYPE_VARINT);
+  return varint_;
+}
+inline uint32 UnknownField::fixed32() const {
+  assert(type() == TYPE_FIXED32);
+  return fixed32_;
+}
+inline uint64 UnknownField::fixed64() const {
+  assert(type() == TYPE_FIXED64);
+  return fixed64_;
+}
+inline const string& UnknownField::length_delimited() const {
+  assert(type() == TYPE_LENGTH_DELIMITED);
+  return *length_delimited_.string_value_;
+}
+inline const UnknownFieldSet& UnknownField::group() const {
+  assert(type() == TYPE_GROUP);
+  return *group_;
+}
+
+inline void UnknownField::set_varint(uint64 value) {
+  assert(type() == TYPE_VARINT);
+  varint_ = value;
+}
+inline void UnknownField::set_fixed32(uint32 value) {
+  assert(type() == TYPE_FIXED32);
+  fixed32_ = value;
+}
+inline void UnknownField::set_fixed64(uint64 value) {
+  assert(type() == TYPE_FIXED64);
+  fixed64_ = value;
+}
+inline void UnknownField::set_length_delimited(const string& value) {
+  assert(type() == TYPE_LENGTH_DELIMITED);
+  length_delimited_.string_value_->assign(value);
+}
+inline string* UnknownField::mutable_length_delimited() {
+  assert(type() == TYPE_LENGTH_DELIMITED);
+  return length_delimited_.string_value_;
+}
+inline UnknownFieldSet* UnknownField::mutable_group() {
+  assert(type() == TYPE_GROUP);
+  return group_;
+}
+
+inline size_t UnknownField::GetLengthDelimitedSize() const {
+  GOOGLE_DCHECK_EQ(TYPE_LENGTH_DELIMITED, type());
+  return length_delimited_.string_value_->size();
+}
+
+inline void UnknownField::SetType(Type type) {
+  type_ = type;
+}
+
+
+}  // namespace protobuf
+
+}  // namespace google
+#endif  // GOOGLE_PROTOBUF_UNKNOWN_FIELD_SET_H__
diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/image_classification.prototxt b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/image_classification.prototxt
new file mode 100644
index 00000000..f716933b
--- /dev/null
+++ 
b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/image_classification.prototxt @@ -0,0 +1,24 @@ +name: "imageClassification_flow" +input: "input:1" +output: "output" +node { + name: "input:1" + type: "Input" + output: "input:1" + input_type: "UINT8" + input_format: "NCHW" + input_dim: 1 + input_dim: 224 + input_dim: 224 + input_dim: 4 +} +node { + name: "imageClassification_inference" + type: "Inference" + input: "input:1" + output: "output" + preprocess_parameter: "pixelProcess" + postprocess_parameter: "postProcess" + inference_parameter: "/private/var/containers/Bundle/Application/B3BB416D-1160-4B8E-BF44-E92AFC9DECD3/ImageClassificationDemo.app/ghostnet_f32.bolt" + inference_parameter:"" +} diff --git a/kit/iOS/image_classification/ImageClassificationDemo/libbolt/imagenet_classes.txt b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/imagenet_classes.txt new file mode 100644 index 00000000..a509c007 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/libbolt/imagenet_classes.txt @@ -0,0 +1,1000 @@ +tench, Tinca tinca +goldfish, Carassius auratus +great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias +tiger shark, Galeocerdo cuvieri +hammerhead, hammerhead shark +electric ray, crampfish, numbfish, torpedo +stingray +cock +hen +ostrich, Struthio camelus +brambling, Fringilla montifringilla +goldfinch, Carduelis carduelis +house finch, linnet, Carpodacus mexicanus +junco, snowbird +indigo bunting, indigo finch, indigo bird, Passerina cyanea +robin, American robin, Turdus migratorius +bulbul +jay +magpie +chickadee +water ouzel, dipper +kite +bald eagle, American eagle, Haliaeetus leucocephalus +vulture +great grey owl, great gray owl, Strix nebulosa +European fire salamander, Salamandra salamandra +common newt, Triturus vulgaris +eft +spotted salamander, Ambystoma maculatum +axolotl, mud puppy, Ambystoma mexicanum +bullfrog, Rana catesbeiana +tree frog, tree-frog +tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui +loggerhead, loggerhead turtle, Caretta caretta +leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea +mud turtle +terrapin +box turtle, box tortoise +banded gecko +common iguana, iguana, Iguana iguana +American chameleon, anole, Anolis carolinensis +whiptail, whiptail lizard +agama +frilled lizard, Chlamydosaurus kingi +alligator lizard +Gila monster, Heloderma suspectum +green lizard, Lacerta viridis +African chameleon, Chamaeleo chamaeleon +Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis +African crocodile, Nile crocodile, Crocodylus niloticus +American alligator, Alligator mississipiensis +triceratops +thunder snake, worm snake, Carphophis amoenus +ringneck snake, ring-necked snake, ring snake +hognose snake, puff adder, sand viper +green snake, grass snake +king snake, kingsnake +garter snake, grass snake +water snake +vine snake +night snake, Hypsiglena torquata +boa constrictor, Constrictor constrictor +rock python, rock snake, Python sebae +Indian cobra, Naja naja +green mamba +sea snake +horned viper, cerastes, sand viper, horned asp, Cerastes cornutus +diamondback, diamondback rattlesnake, Crotalus adamanteus +sidewinder, horned rattlesnake, Crotalus cerastes +trilobite +harvestman, daddy longlegs, Phalangium opilio +scorpion +black and gold garden spider, Argiope aurantia +barn spider, Araneus cavaticus +garden spider, Aranea diademata +black widow, Latrodectus mactans +tarantula +wolf spider, hunting spider +tick +centipede +black 
grouse +ptarmigan +ruffed grouse, partridge, Bonasa umbellus +prairie chicken, prairie grouse, prairie fowl +peacock +quail +partridge +African grey, African gray, Psittacus erithacus +macaw +sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita +lorikeet +coucal +bee eater +hornbill +hummingbird +jacamar +toucan +drake +red-breasted merganser, Mergus serrator +goose +black swan, Cygnus atratus +tusker +echidna, spiny anteater, anteater +platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus +wallaby, brush kangaroo +koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus +wombat +jellyfish +sea anemone, anemone +brain coral +flatworm, platyhelminth +nematode, nematode worm, roundworm +conch +snail +slug +sea slug, nudibranch +chiton, coat-of-mail shell, sea cradle, polyplacophore +chambered nautilus, pearly nautilus, nautilus +Dungeness crab, Cancer magister +rock crab, Cancer irroratus +fiddler crab +king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica +American lobster, Northern lobster, Maine lobster, Homarus americanus +spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish +crayfish, crawfish, crawdad, crawdaddy +hermit crab +isopod +white stork, Ciconia ciconia +black stork, Ciconia nigra +spoonbill +flamingo +little blue heron, Egretta caerulea +American egret, great white heron, Egretta albus +bittern +crane +limpkin, Aramus pictus +European gallinule, Porphyrio porphyrio +American coot, marsh hen, mud hen, water hen, Fulica americana +bustard +ruddy turnstone, Arenaria interpres +red-backed sandpiper, dunlin, Erolia alpina +redshank, Tringa totanus +dowitcher +oystercatcher, oyster catcher +pelican +king penguin, Aptenodytes patagonica +albatross, mollymawk +grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus +killer whale, killer, orca, grampus, sea wolf, Orcinus orca +dugong, Dugong dugon +sea lion +Chihuahua +Japanese spaniel +Maltese dog, Maltese terrier, Maltese +Pekinese, Pekingese, Peke +Shih-Tzu +Blenheim spaniel +papillon +toy terrier +Rhodesian ridgeback +Afghan hound, Afghan +basset, basset hound +beagle +bloodhound, sleuthhound +bluetick +black-and-tan coonhound +Walker hound, Walker foxhound +English foxhound +redbone +borzoi, Russian wolfhound +Irish wolfhound +Italian greyhound +whippet +Ibizan hound, Ibizan Podenco +Norwegian elkhound, elkhound +otterhound, otter hound +Saluki, gazelle hound +Scottish deerhound, deerhound +Weimaraner +Staffordshire bullterrier, Staffordshire bull terrier +American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier +Bedlington terrier +Border terrier +Kerry blue terrier +Irish terrier +Norfolk terrier +Norwich terrier +Yorkshire terrier +wire-haired fox terrier +Lakeland terrier +Sealyham terrier, Sealyham +Airedale, Airedale terrier +cairn, cairn terrier +Australian terrier +Dandie Dinmont, Dandie Dinmont terrier +Boston bull, Boston terrier +miniature schnauzer +giant schnauzer +standard schnauzer +Scotch terrier, Scottish terrier, Scottie +Tibetan terrier, chrysanthemum dog +silky terrier, Sydney silky +soft-coated wheaten terrier +West Highland white terrier +Lhasa, Lhasa apso +flat-coated retriever +curly-coated retriever +golden retriever +Labrador retriever +Chesapeake Bay retriever +German short-haired pointer +vizsla, Hungarian pointer +English setter +Irish setter, red setter +Gordon setter +Brittany spaniel +clumber, clumber spaniel +English springer, 
English springer spaniel +Welsh springer spaniel +cocker spaniel, English cocker spaniel, cocker +Sussex spaniel +Irish water spaniel +kuvasz +schipperke +groenendael +malinois +briard +kelpie +komondor +Old English sheepdog, bobtail +Shetland sheepdog, Shetland sheep dog, Shetland +collie +Border collie +Bouvier des Flandres, Bouviers des Flandres +Rottweiler +German shepherd, German shepherd dog, German police dog, alsatian +Doberman, Doberman pinscher +miniature pinscher +Greater Swiss Mountain dog +Bernese mountain dog +Appenzeller +EntleBucher +boxer +bull mastiff +Tibetan mastiff +French bulldog +Great Dane +Saint Bernard, St Bernard +Eskimo dog, husky +malamute, malemute, Alaskan malamute +Siberian husky +dalmatian, coach dog, carriage dog +affenpinscher, monkey pinscher, monkey dog +basenji +pug, pug-dog +Leonberg +Newfoundland, Newfoundland dog +Great Pyrenees +Samoyed, Samoyede +Pomeranian +chow, chow chow +keeshond +Brabancon griffon +Pembroke, Pembroke Welsh corgi +Cardigan, Cardigan Welsh corgi +toy poodle +miniature poodle +standard poodle +Mexican hairless +timber wolf, grey wolf, gray wolf, Canis lupus +white wolf, Arctic wolf, Canis lupus tundrarum +red wolf, maned wolf, Canis rufus, Canis niger +coyote, prairie wolf, brush wolf, Canis latrans +dingo, warrigal, warragal, Canis dingo +dhole, Cuon alpinus +African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus +hyena, hyaena +red fox, Vulpes vulpes +kit fox, Vulpes macrotis +Arctic fox, white fox, Alopex lagopus +grey fox, gray fox, Urocyon cinereoargenteus +tabby, tabby cat +tiger cat +Persian cat +Siamese cat, Siamese +Egyptian cat +cougar, puma, catamount, mountain lion, painter, panther, Felis concolor +lynx, catamount +leopard, Panthera pardus +snow leopard, ounce, Panthera uncia +jaguar, panther, Panthera onca, Felis onca +lion, king of beasts, Panthera leo +tiger, Panthera tigris +cheetah, chetah, Acinonyx jubatus +brown bear, bruin, Ursus arctos +American black bear, black bear, Ursus americanus, Euarctos americanus +ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus +sloth bear, Melursus ursinus, Ursus ursinus +mongoose +meerkat, mierkat +tiger beetle +ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle +ground beetle, carabid beetle +long-horned beetle, longicorn, longicorn beetle +leaf beetle, chrysomelid +dung beetle +rhinoceros beetle +weevil +fly +bee +ant, emmet, pismire +grasshopper, hopper +cricket +walking stick, walkingstick, stick insect +cockroach, roach +mantis, mantid +cicada, cicala +leafhopper +lacewing, lacewing fly +dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk +damselfly +admiral +ringlet, ringlet butterfly +monarch, monarch butterfly, milkweed butterfly, Danaus plexippus +cabbage butterfly +sulphur butterfly, sulfur butterfly +lycaenid, lycaenid butterfly +starfish, sea star +sea urchin +sea cucumber, holothurian +wood rabbit, cottontail, cottontail rabbit +hare +Angora, Angora rabbit +hamster +porcupine, hedgehog +fox squirrel, eastern fox squirrel, Sciurus niger +marmot +beaver +guinea pig, Cavia cobaya +sorrel +zebra +hog, pig, grunter, squealer, Sus scrofa +wild boar, boar, Sus scrofa +warthog +hippopotamus, hippo, river horse, Hippopotamus amphibius +ox +water buffalo, water ox, Asiatic buffalo, Bubalus bubalis +bison +ram, tup +bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis +ibex, Capra ibex +hartebeest +impala, Aepyceros melampus +gazelle 
+Arabian camel, dromedary, Camelus dromedarius +llama +weasel +mink +polecat, fitch, foulmart, foumart, Mustela putorius +black-footed ferret, ferret, Mustela nigripes +otter +skunk, polecat, wood pussy +badger +armadillo +three-toed sloth, ai, Bradypus tridactylus +orangutan, orang, orangutang, Pongo pygmaeus +gorilla, Gorilla gorilla +chimpanzee, chimp, Pan troglodytes +gibbon, Hylobates lar +siamang, Hylobates syndactylus, Symphalangus syndactylus +guenon, guenon monkey +patas, hussar monkey, Erythrocebus patas +baboon +macaque +langur +colobus, colobus monkey +proboscis monkey, Nasalis larvatus +marmoset +capuchin, ringtail, Cebus capucinus +howler monkey, howler +titi, titi monkey +spider monkey, Ateles geoffroyi +squirrel monkey, Saimiri sciureus +Madagascar cat, ring-tailed lemur, Lemur catta +indri, indris, Indri indri, Indri brevicaudatus +Indian elephant, Elephas maximus +African elephant, Loxodonta africana +lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens +giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca +barracouta, snoek +eel +coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch +rock beauty, Holocanthus tricolor +anemone fish +sturgeon +gar, garfish, garpike, billfish, Lepisosteus osseus +lionfish +puffer, pufferfish, blowfish, globefish +abacus +abaya +academic gown, academic robe, judge's robe +accordion, piano accordion, squeeze box +acoustic guitar +aircraft carrier, carrier, flattop, attack aircraft carrier +airliner +airship, dirigible +altar +ambulance +amphibian, amphibious vehicle +analog clock +apiary, bee house +apron +ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin +assault rifle, assault gun +backpack, back pack, knapsack, packsack, rucksack, haversack +bakery, bakeshop, bakehouse +balance beam, beam +balloon +ballpoint, ballpoint pen, ballpen, Biro +Band Aid +banjo +bannister, banister, balustrade, balusters, handrail +barbell +barber chair +barbershop +barn +barometer +barrel, cask +barrow, garden cart, lawn cart, wheelbarrow +baseball +basketball +bassinet +bassoon +bathing cap, swimming cap +bath towel +bathtub, bathing tub, bath, tub +beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon +beacon, lighthouse, beacon light, pharos +beaker +bearskin, busby, shako +beer bottle +beer glass +bell cote, bell cot +bib +bicycle-built-for-two, tandem bicycle, tandem +bikini, two-piece +binder, ring-binder +binoculars, field glasses, opera glasses +birdhouse +boathouse +bobsled, bobsleigh, bob +bolo tie, bolo, bola tie, bola +bonnet, poke bonnet +bookcase +bookshop, bookstore, bookstall +bottlecap +bow +bow tie, bow-tie, bowtie +brass, memorial tablet, plaque +brassiere, bra, bandeau +breakwater, groin, groyne, mole, bulwark, seawall, jetty +breastplate, aegis, egis +broom +bucket, pail +buckle +bulletproof vest +bullet train, bullet +butcher shop, meat market +cab, hack, taxi, taxicab +caldron, cauldron +candle, taper, wax light +cannon +canoe +can opener, tin opener +cardigan +car mirror +carousel, carrousel, merry-go-round, roundabout, whirligig +carpenter's kit, tool kit +carton +car wheel +cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM +cassette +cassette player +castle +catamaran +CD player +cello, violoncello +cellular telephone, cellular phone, cellphone, cell, mobile phone +chain +chainlink fence +chain mail, ring mail, mail, chain armor, chain 
armour, ring armor, ring armour +chain saw, chainsaw +chest +chiffonier, commode +chime, bell, gong +china cabinet, china closet +Christmas stocking +church, church building +cinema, movie theater, movie theatre, movie house, picture palace +cleaver, meat cleaver, chopper +cliff dwelling +cloak +clog, geta, patten, sabot +cocktail shaker +coffee mug +coffeepot +coil, spiral, volute, whorl, helix +combination lock +computer keyboard, keypad +confectionery, confectionary, candy store +container ship, containership, container vessel +convertible +corkscrew, bottle screw +cornet, horn, trumpet, trump +cowboy boot +cowboy hat, ten-gallon hat +cradle +crane +crash helmet +crate +crib, cot +Crock Pot +croquet ball +crutch +cuirass +dam, dike, dyke +desk +desktop computer +dial telephone, dial phone +diaper, nappy, napkin +digital clock +digital watch +dining table, board +dishrag, dishcloth +dishwasher, dish washer, dishwashing machine +disk brake, disc brake +dock, dockage, docking facility +dogsled, dog sled, dog sleigh +dome +doormat, welcome mat +drilling platform, offshore rig +drum, membranophone, tympan +drumstick +dumbbell +Dutch oven +electric fan, blower +electric guitar +electric locomotive +entertainment center +envelope +espresso maker +face powder +feather boa, boa +file, file cabinet, filing cabinet +fireboat +fire engine, fire truck +fire screen, fireguard +flagpole, flagstaff +flute, transverse flute +folding chair +football helmet +forklift +fountain +fountain pen +four-poster +freight car +French horn, horn +frying pan, frypan, skillet +fur coat +garbage truck, dustcart +gasmask, respirator, gas helmet +gas pump, gasoline pump, petrol pump, island dispenser +goblet +go-kart +golf ball +golfcart, golf cart +gondola +gong, tam-tam +gown +grand piano, grand +greenhouse, nursery, glasshouse +grille, radiator grille +grocery store, grocery, food market, market +guillotine +hair slide +hair spray +half track +hammer +hamper +hand blower, blow dryer, blow drier, hair dryer, hair drier +hand-held computer, hand-held microcomputer +handkerchief, hankie, hanky, hankey +hard disc, hard disk, fixed disk +harmonica, mouth organ, harp, mouth harp +harp +harvester, reaper +hatchet +holster +home theater, home theatre +honeycomb +hook, claw +hoopskirt, crinoline +horizontal bar, high bar +horse cart, horse-cart +hourglass +iPod +iron, smoothing iron +jack-o'-lantern +jean, blue jean, denim +jeep, landrover +jersey, T-shirt, tee shirt +jigsaw puzzle +jinrikisha, ricksha, rickshaw +joystick +kimono +knee pad +knot +lab coat, laboratory coat +ladle +lampshade, lamp shade +laptop, laptop computer +lawn mower, mower +lens cap, lens cover +letter opener, paper knife, paperknife +library +lifeboat +lighter, light, igniter, ignitor +limousine, limo +liner, ocean liner +lipstick, lip rouge +Loafer +lotion +loudspeaker, speaker, speaker unit, loudspeaker system, speaker system +loupe, jeweler's loupe +lumbermill, sawmill +magnetic compass +mailbag, postbag +mailbox, letter box +maillot +maillot, tank suit +manhole cover +maraca +marimba, xylophone +mask +matchstick +maypole +maze, labyrinth +measuring cup +medicine chest, medicine cabinet +megalith, megalithic structure +microphone, mike +microwave, microwave oven +military uniform +milk can +minibus +miniskirt, mini +minivan +missile +mitten +mixing bowl +mobile home, manufactured home +Model T +modem +monastery +monitor +moped +mortar +mortarboard +mosque +mosquito net +motor scooter, scooter +mountain bike, all-terrain bike, off-roader +mountain tent 
+mouse, computer mouse +mousetrap +moving van +muzzle +nail +neck brace +necklace +nipple +notebook, notebook computer +obelisk +oboe, hautboy, hautbois +ocarina, sweet potato +odometer, hodometer, mileometer, milometer +oil filter +organ, pipe organ +oscilloscope, scope, cathode-ray oscilloscope, CRO +overskirt +oxcart +oxygen mask +packet +paddle, boat paddle +paddlewheel, paddle wheel +padlock +paintbrush +pajama, pyjama, pj's, jammies +palace +panpipe, pandean pipe, syrinx +paper towel +parachute, chute +parallel bars, bars +park bench +parking meter +passenger car, coach, carriage +patio, terrace +pay-phone, pay-station +pedestal, plinth, footstall +pencil box, pencil case +pencil sharpener +perfume, essence +Petri dish +photocopier +pick, plectrum, plectron +pickelhaube +picket fence, paling +pickup, pickup truck +pier +piggy bank, penny bank +pill bottle +pillow +ping-pong ball +pinwheel +pirate, pirate ship +pitcher, ewer +plane, carpenter's plane, woodworking plane +planetarium +plastic bag +plate rack +plow, plough +plunger, plumber's helper +Polaroid camera, Polaroid Land camera +pole +police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria +poncho +pool table, billiard table, snooker table +pop bottle, soda bottle +pot, flowerpot +potter's wheel +power drill +prayer rug, prayer mat +printer +prison, prison house +projectile, missile +projector +puck, hockey puck +punching bag, punch bag, punching ball, punchball +purse +quill, quill pen +quilt, comforter, comfort, puff +racer, race car, racing car +racket, racquet +radiator +radio, wireless +radio telescope, radio reflector +rain barrel +recreational vehicle, RV, R.V. +reel +reflex camera +refrigerator, icebox +remote control, remote +restaurant, eating house, eating place, eatery +revolver, six-gun, six-shooter +rifle +rocking chair, rocker +rotisserie +rubber eraser, rubber, pencil eraser +rugby ball +rule, ruler +running shoe +safe +safety pin +saltshaker, salt shaker +sandal +sarong +sax, saxophone +scabbard +scale, weighing machine +school bus +schooner +scoreboard +screen, CRT screen +screw +screwdriver +seat belt, seatbelt +sewing machine +shield, buckler +shoe shop, shoe-shop, shoe store +shoji +shopping basket +shopping cart +shovel +shower cap +shower curtain +ski +ski mask +sleeping bag +slide rule, slipstick +sliding door +slot, one-armed bandit +snorkel +snowmobile +snowplow, snowplough +soap dispenser +soccer ball +sock +solar dish, solar collector, solar furnace +sombrero +soup bowl +space bar +space heater +space shuttle +spatula +speedboat +spider web, spider's web +spindle +sports car, sport car +spotlight, spot +stage +steam locomotive +steel arch bridge +steel drum +stethoscope +stole +stone wall +stopwatch, stop watch +stove +strainer +streetcar, tram, tramcar, trolley, trolley car +stretcher +studio couch, day bed +stupa, tope +submarine, pigboat, sub, U-boat +suit, suit of clothes +sundial +sunglass +sunglasses, dark glasses, shades +sunscreen, sunblock, sun blocker +suspension bridge +swab, swob, mop +sweatshirt +swimming trunks, bathing trunks +swing +switch, electric switch, electrical switch +syringe +table lamp +tank, army tank, armored combat vehicle, armoured combat vehicle +tape player +teapot +teddy, teddy bear +television, television system +tennis ball +thatch, thatched roof +theater curtain, theatre curtain +thimble +thresher, thrasher, threshing machine +throne +tile roof +toaster +tobacco shop, tobacconist shop, tobacconist +toilet seat +torch +totem pole +tow truck, tow car, 
wrecker +toyshop +tractor +trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi +tray +trench coat +tricycle, trike, velocipede +trimaran +tripod +triumphal arch +trolleybus, trolley coach, trackless trolley +trombone +tub, vat +turnstile +typewriter keyboard +umbrella +unicycle, monocycle +upright, upright piano +vacuum, vacuum cleaner +vase +vault +velvet +vending machine +vestment +viaduct +violin, fiddle +volleyball +waffle iron +wall clock +wallet, billfold, notecase, pocketbook +wardrobe, closet, press +warplane, military plane +washbasin, handbasin, washbowl, lavabo, wash-hand basin +washer, automatic washer, washing machine +water bottle +water jug +water tower +whiskey jug +whistle +wig +window screen +window shade +Windsor tie +wine bottle +wing +wok +wooden spoon +wool, woolen, woollen +worm fence, snake fence, snake-rail fence, Virginia fence +wreck +yawl +yurt +web site, website, internet site, site +comic book +crossword puzzle, crossword +street sign +traffic light, traffic signal, stoplight +book jacket, dust cover, dust jacket, dust wrapper +menu +plate +guacamole +consomme +hot pot, hotpot +trifle +ice cream, icecream +ice lolly, lolly, lollipop, popsicle +French loaf +bagel, beigel +pretzel +cheeseburger +hotdog, hot dog, red hot +mashed potato +head cabbage +broccoli +cauliflower +zucchini, courgette +spaghetti squash +acorn squash +butternut squash +cucumber, cuke +artichoke, globe artichoke +bell pepper +cardoon +mushroom +Granny Smith +strawberry +orange +lemon +fig +pineapple, ananas +banana +jackfruit, jak, jack +custard apple +pomegranate +hay +carbonara +chocolate sauce, chocolate syrup +dough +meat loaf, meatloaf +pizza, pizza pie +potpie +burrito +red wine +espresso +cup +eggnog +alp +bubble +cliff, drop, drop-off +coral reef +geyser +lakeside, lakeshore +promontory, headland, head, foreland +sandbar, sand bar +seashore, coast, seacoast, sea-coast +valley, vale +volcano +ballplayer, baseball player +groom, bridegroom +scuba diver +rapeseed +daisy +yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum +corn +acorn +hip, rose hip, rosehip +buckeye, horse chestnut, conker +coral fungus +agaric +gyromitra +stinkhorn, carrion fungus +earthstar +hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa +bolete +ear, spike, capitulum +toilet tissue, toilet paper, bathroom tissue diff --git a/kit/iOS/image_classification/ImageClassificationDemo/main.m b/kit/iOS/image_classification/ImageClassificationDemo/main.m new file mode 100644 index 00000000..7ef81b65 --- /dev/null +++ b/kit/iOS/image_classification/ImageClassificationDemo/main.m @@ -0,0 +1,24 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#import <UIKit/UIKit.h>
+#import "AppDelegate.h"
+
+int main(int argc, char * argv[]) {
+    NSString * appDelegateClassName;
+    @autoreleasepool {
+        // Setup code that might create autoreleased objects goes here.
+        appDelegateClassName = NSStringFromClass([AppDelegate class]);
+    }
+    return UIApplicationMain(argc, argv, nil, appDelegateClassName);
+}
diff --git a/kit/iOS/image_classification/ImageClassificationDemoTests/ImageClassificationDemoTests.m b/kit/iOS/image_classification/ImageClassificationDemoTests/ImageClassificationDemoTests.m
new file mode 100644
index 00000000..e48891c1
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemoTests/ImageClassificationDemoTests.m
@@ -0,0 +1,42 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#import <XCTest/XCTest.h>
+
+@interface ImageClassificationDemoTests : XCTestCase
+
+@end
+
+@implementation ImageClassificationDemoTests
+
+- (void)setUp {
+    // Put setup code here. This method is called before the invocation of each test method in the class.
+}
+
+- (void)tearDown {
+    // Put teardown code here. This method is called after the invocation of each test method in the class.
+}
+
+- (void)testExample {
+    // This is an example of a functional test case.
+    // Use XCTAssert and related functions to verify your tests produce the correct results.
+}
+
+- (void)testPerformanceExample {
+    // This is an example of a performance test case.
+    [self measureBlock:^{
+        // Put the code you want to measure the time of here.
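+        // A hedged sketch of a workload one might measure here, e.g. a single
+        // inference pass over a bundled image (hypothetical names: neither
+        // `runInference` nor "test_image" is defined anywhere in this demo):
+        //     UIImage *image = [UIImage imageNamed:@"test_image"];
+        //     XCTAssertNotNil(image);
+        //     runInference(image);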
+    }];
+}
+
+@end
diff --git a/kit/iOS/image_classification/ImageClassificationDemoTests/Info.plist b/kit/iOS/image_classification/ImageClassificationDemoTests/Info.plist
new file mode 100644
index 00000000..64d65ca4
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemoTests/Info.plist
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>CFBundleDevelopmentRegion</key>
+    <string>$(DEVELOPMENT_LANGUAGE)</string>
+    <key>CFBundleExecutable</key>
+    <string>$(EXECUTABLE_NAME)</string>
+    <key>CFBundleIdentifier</key>
+    <string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
+    <key>CFBundleInfoDictionaryVersion</key>
+    <string>6.0</string>
+    <key>CFBundleName</key>
+    <string>$(PRODUCT_NAME)</string>
+    <key>CFBundlePackageType</key>
+    <string>$(PRODUCT_BUNDLE_PACKAGE_TYPE)</string>
+    <key>CFBundleShortVersionString</key>
+    <string>1.0</string>
+    <key>CFBundleVersion</key>
+    <string>1</string>
+</dict>
+</plist>
diff --git a/kit/iOS/image_classification/ImageClassificationDemoUITests/ImageClassificationDemoUITests.m b/kit/iOS/image_classification/ImageClassificationDemoUITests/ImageClassificationDemoUITests.m
new file mode 100644
index 00000000..612c5c77
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemoUITests/ImageClassificationDemoUITests.m
@@ -0,0 +1,53 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#import <XCTest/XCTest.h>
+
+@interface ImageClassificationDemoUITests : XCTestCase
+
+@end
+
+@implementation ImageClassificationDemoUITests
+
+- (void)setUp {
+    // Put setup code here. This method is called before the invocation of each test method in the class.
+
+    // In UI tests it is usually best to stop immediately when a failure occurs.
+    self.continueAfterFailure = NO;
+
+    // In UI tests it’s important to set the initial state - such as interface orientation - required for your tests before they run. The setUp method is a good place to do this.
+}
+
+- (void)tearDown {
+    // Put teardown code here. This method is called after the invocation of each test method in the class.
+}
+
+- (void)testExample {
+    // UI tests must launch the application that they test.
+    XCUIApplication *app = [[XCUIApplication alloc] init];
+    [app launch];
+
+    // Use recording to get started writing UI tests.
+    // Use XCTAssert and related functions to verify your tests produce the correct results.
+}
+
+- (void)testLaunchPerformance {
+    if (@available(macOS 10.15, iOS 13.0, tvOS 13.0, *)) {
+        // This measures how long it takes to launch your application.
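+        // XCTOSSignpostMetric.applicationLaunchMetric is the reason for the
+        // iOS 13 availability check above; it times app launch (roughly from
+        // process creation until the app is drawing frames), and
+        // measureWithMetrics:block: runs the block several times and reports
+        // the aggregate in the Xcode test results.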
+        [self measureWithMetrics:@[XCTOSSignpostMetric.applicationLaunchMetric] block:^{
+            [[[XCUIApplication alloc] init] launch];
+        }];
+    }
+}
+
+@end
diff --git a/kit/iOS/image_classification/ImageClassificationDemoUITests/Info.plist b/kit/iOS/image_classification/ImageClassificationDemoUITests/Info.plist
new file mode 100644
index 00000000..64d65ca4
--- /dev/null
+++ b/kit/iOS/image_classification/ImageClassificationDemoUITests/Info.plist
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>CFBundleDevelopmentRegion</key>
+    <string>$(DEVELOPMENT_LANGUAGE)</string>
+    <key>CFBundleExecutable</key>
+    <string>$(EXECUTABLE_NAME)</string>
+    <key>CFBundleIdentifier</key>
+    <string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
+    <key>CFBundleInfoDictionaryVersion</key>
+    <string>6.0</string>
+    <key>CFBundleName</key>
+    <string>$(PRODUCT_NAME)</string>
+    <key>CFBundlePackageType</key>
+    <string>$(PRODUCT_BUNDLE_PACKAGE_TYPE)</string>
+    <key>CFBundleShortVersionString</key>
+    <string>1.0</string>
+    <key>CFBundleVersion</key>
+    <string>1</string>
+</dict>
+</plist>
diff --git a/kit/iOS/setup_lib_iOS.sh b/kit/iOS/setup_lib_iOS.sh
new file mode 100644
index 00000000..4c8c9b21
--- /dev/null
+++ b/kit/iOS/setup_lib_iOS.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+script_abs=$(readlink -f "$0")
+script_dir=$(dirname $script_abs)
+
+export BOLT_ROOT=${script_dir}/../../
+
+cp ${BOLT_ROOT}/install_arm_ios/lib/libbolt.a ${script_dir}/image_classification/ImageClassificationDemo/libbolt/
+cp ${BOLT_ROOT}/install_arm_ios/lib/libflow.a ${script_dir}/image_classification/ImageClassificationDemo/libbolt/
+cp ${BOLT_ROOT}/third_party/arm_ios/protobuf/lib/libprotobuf.a ${script_dir}/image_classification/ImageClassificationDemo/libbolt/
+
+rm -rf ${script_dir}/image_classification/ImageClassificationDemo/libbolt/headers/memory/
+cp -r ${BOLT_ROOT}/common/memory/include/ ${script_dir}/image_classification/ImageClassificationDemo/libbolt/headers/memory/
+rm -rf ${script_dir}/image_classification/ImageClassificationDemo/libbolt/headers/uni/
+cp -r ${BOLT_ROOT}/common/uni/include/ ${script_dir}/image_classification/ImageClassificationDemo/libbolt/headers/uni/
+
+rm -rf ${script_dir}/image_classification/ImageClassificationDemo/libbolt/headers/engine/
+mkdir ${script_dir}/image_classification/ImageClassificationDemo/libbolt/headers/engine/
+cp ${BOLT_ROOT}/inference/engine/include/cnn.h ${script_dir}/image_classification/ImageClassificationDemo/libbolt/headers/engine/
+cp ${BOLT_ROOT}/inference/engine/include/memory_tracker.hpp ${script_dir}/image_classification/ImageClassificationDemo/libbolt/headers/engine/
+cp ${BOLT_ROOT}/inference/engine/include/model.hpp ${script_dir}/image_classification/ImageClassificationDemo/libbolt/headers/engine/
+cp ${BOLT_ROOT}/inference/engine/include/operator.hpp ${script_dir}/image_classification/ImageClassificationDemo/libbolt/headers/engine/
+cp ${BOLT_ROOT}/inference/flow/include/flow.h ${script_dir}/image_classification/ImageClassificationDemo/libbolt/headers/flow/
+cp ${BOLT_ROOT}/inference/flow/include/flow_function_factory.h ${script_dir}/image_classification/ImageClassificationDemo/libbolt/headers/flow/
+cp ${BOLT_ROOT}/inference/flow/include/node.h ${script_dir}/image_classification/ImageClassificationDemo/libbolt/headers/flow/
diff --git a/kits/CMakeLists.txt b/kits/CMakeLists.txt
deleted file mode 100644
index 78eec25f..00000000
--- a/kits/CMakeLists.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-cmake_minimum_required(VERSION 3.2)
-
-file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake)
-if (BOLT_CONFIGURE_FILE)
-    include(${BOLT_CONFIGURE_FILE})
-else (BOLT_CONFIGURE_FILE)
-    message(FATAL_ERROR "
-FATAL: can not find bolt.cmake in directory,
-       please set shell or cmake environment variable BOLT_ROOT.
-    ")
-endif (BOLT_CONFIGURE_FILE)
-
-project(kits)
-
-set_policy()
-
-SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes")
-find_package(Uni)
-find_package(ModelTools)
-find_package(Image)
-find_package(TensorComputing)
-find_package(Inference)
-find_package(jpeg)
-if(USE_MALI)
-    find_package(Gcl)
-endif(USE_MALI)
-
-set_project_install_directory()
-set_c_cxx_flags()
-set_test_c_cxx_flags()
-
-if (BUILD_TEST)
-    inference(bert bert/bert.cpp)
-    inference(tinybert bert/tinybert.cpp)
-    inference(classification image_classification/classification.cpp)
-    inference(classification_bin image_classification/classification_bin.cpp)
-    inference(nmt machine_translation/nmt.cpp)
-    inference(nmt_tsc machine_translation/nmt_tsc.cpp)
-    inference(asr_rnnt automatic_speech_recognition/asr_rnnt.cpp)
-    inference(asr_convolution_transformer automatic_speech_recognition/asr_convolution_transformer.cpp)
-    inference(tts text_to_speech/tts.cpp)
-    inference(vad automatic_speech_recognition/vad.cpp)
-    if (USE_MALI)
-        if (USE_FP16)
-            inference(hdr high_dynamic_range/hdr.cpp)
-            inference(super_resolution super_resolution/super_resolution.cpp)
-        endif (USE_FP16)
-    endif (USE_MALI)
-endif (BUILD_TEST)
diff --git a/kits/automatic_speech_recognition/asr_convolution_transformer.cpp b/kits/automatic_speech_recognition/asr_convolution_transformer.cpp
deleted file mode 100644
index 141220f6..00000000
--- a/kits/automatic_speech_recognition/asr_convolution_transformer.cpp
+++ /dev/null
@@ -1,203 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- - -#include - -#include "inference.hpp" -#include "tensor.hpp" -#include "data_loader.hpp" -#include "utils.hpp" - -void print_help(char* argv[]) { - std::cout << "usage: " << argv[0] << " modelPath sequencesDirectory subNetworkName[encoder|prediction_net] cpuAffinityPolicyName" << std::endl; -} - -HashMap prepareStates(DataType dt, std::string sequenceDirectory, - std::string shapeMapFileName) -{ - HashMap shapeMap; - std::string filePath = sequenceDirectory + "/" + shapeMapFileName; - FILE *shapeMapFile = fopen(filePath.c_str(), "r"); - char buffer[NAME_LEN]; - while (fscanf(shapeMapFile, "%s", buffer) != EOF) { - TensorDesc desc; - fscanf(shapeMapFile, "%u", &(desc.nDims)); - for (U32 i = 0; i < desc.nDims; i++) - fscanf(shapeMapFile, "%u", &(desc.dims[desc.nDims - 1 - i])); - if (std::string(buffer) == std::string("label")) { - desc.dt = DT_U32; - } else { - desc.dt = dt; - } - std::string inputName(buffer); - if (inputName.find(std::string("layer1_mem")) != std::string::npos) { - desc.df = DF_NCHWC8; - } else { - desc.df = DF_NCHW; - } - shapeMap[inputName] = desc; - } - fclose(shapeMapFile); - - HashMap tensorMap; - for (auto iter: shapeMap) { - std::string filePath = sequenceDirectory + "/" + iter.first + ".txt"; - TensorDesc desc = iter.second; - tensorMap[iter.first] = load_txt(filePath, Vec{desc})[0]; - } - return tensorMap; -} - -void saveStates(std::shared_ptr pipeline, std::string sequenceDirectory, - std::string outputFileName, std::string outputStatesFileName) -{ - char buffer[NAME_LEN]; - std::string outputFilePath = sequenceDirectory + "/" + outputFileName; - std::string outputStatesFilePath = sequenceDirectory + "/" + outputStatesFileName; - FILE *outputFile = fopen(outputFilePath.c_str(), "r"); - FILE *outputStatesFile = fopen(outputStatesFilePath.c_str(), "w"); - while (!feof(outputFile)) { - fscanf(outputFile, "%s", buffer); - Tensor tensor = pipeline->get_tensor_by_name(buffer); - TensorDesc desc = tensor.get_desc(); - - // write states - fprintf(outputStatesFile, "%s\n", buffer); - fprintf(outputStatesFile, "%u\n", desc.nDims); - for (U32 i = 0; i < desc.nDims; i++) - fprintf(outputStatesFile, "%u ", desc.dims[desc.nDims-1-i]); - - // write data - U32 num = tensorNumElements(desc); - std::string outputDataPath = sequenceDirectory + "/" + std::string(buffer) + ".txt"; - FILE *outputDataFile = fopen(outputDataPath.c_str(), "w"); - for (U32 i = 0; i < num; i++) { - fprintf(outputDataFile, "%f ", tensor.getElement(i)); - if (i % 10 == 9) - fprintf(outputDataFile, "\n"); - } - fclose(outputDataFile); - } - fclose(outputFile); - fclose(outputStatesFile); -} - -int verify(Tensor tensor, std::string subNetworkName, HashMap inputDescMap) -{ - U32 num = tensorNumElements(tensor.get_desc()); - F32 sum = 0; - for (U32 i = 0; i < num; i++) { - sum += tensor.getElement(i); - } - std::cout << "Output sum is " << sum << "\n"; - I32 result = 0; - if (subNetworkName == std::string("encoder")) { - if (inputDescMap["sounds"].dims[1] == 15) { - if (abs(sum - 44.4) >= 1) { - result = 1; - } - } else if (inputDescMap["sounds"].dims[1] == 8) { - if (abs(sum - 102.3) >= 1) { - result = 1; - } - } else { - result = 1; - } - } else if (subNetworkName == std::string("prediction_net")) { - if (abs(sum - 21.7) >= 1) { - result = 1; - } - } else if (subNetworkName == std::string("joint_net")) { - if (abs(sum - (-24.6)) >= 1) { - result = 1; - } - } - return result; -} - -int main(int argc, char* argv[]) { - if (argc < 5) { - print_help(argv); - return 1; - } - UTIL_TIME_INIT - - char* modelPath 
= argv[1]; - char* sequenceDirectory = argv[2]; - std::string subNetworkName = std::string(argv[3]); - char* cpuAffinityPolicyName = argv[4]; - DeviceTypeIn device = d_CPU; - std::string outputTensorName; - if (subNetworkName == std::string("encoder")) { - outputTensorName = "encoder_block3_transformer_ln"; - } else if (subNetworkName == std::string("prediction_net")) { - outputTensorName = "prediction_net_ln"; - } else if (subNetworkName == std::string("joint_net")) { - outputTensorName = "joint_output_fc"; - } else { - std::cerr << "[ERROR] unrecognized sub network(encoder|prediction_net|joint_net) " << subNetworkName << std::endl; - exit(1); - } - - DataType dt; - std::string modelPathStr = std::string(modelPath); - //"_f[16|32].bolt" - std::string modelPathSuffix = modelPathStr.substr(modelPathStr.size() - 9); - if (modelPathSuffix == std::string("_f16.bolt")) - dt = DT_F16; - else if (modelPathSuffix == std::string("_f32.bolt")) - dt = DT_F32; - else if (modelPathSuffix == std::string("t8_q.bolt")) - dt = DT_F16; - else { - std::cerr << "[ERROR] unrecognized model file path suffix " << modelPathSuffix << std::endl; - exit(1); - } - auto pipeline = createPipeline(cpuAffinityPolicyName, modelPath, device); - - double totalTime = 0; - int loops = 1; - U32 falseResult = 0; - for (int i = 0; i < loops; i++) { - HashMap input = prepareStates(dt, sequenceDirectory, "input_shape.txt"); - HashMap inputDescMap; - for (auto iter: input) - inputDescMap[iter.first] = iter.second.get_desc(); - pipeline->reready(inputDescMap); - for (auto iter: input) { - U8* tensorPointer = iter.second.get_val(); - pipeline->copy_to_named_input(iter.first, tensorPointer); - } - - double timeBegin = ut_time_ms(); - pipeline->run(); - double timeEnd = ut_time_ms(); - totalTime += (timeEnd - timeBegin); - Tensor output = pipeline->get_tensor_by_name(outputTensorName); - falseResult += verify(output, subNetworkName, inputDescMap); - saveStates(pipeline, sequenceDirectory, "output_name.txt", "output_shape.txt"); - } - UTIL_TIME_STATISTICS - - std::cout << "[SUMMARY]:" << std::endl; - U32 validSequence = loops; - CI_info("speech recognition rate: " << 100.0 * (validSequence - falseResult) / validSequence << " %"); - CI_info("avg_time:" << 1.0 * totalTime / validSequence << "ms/sequence"); - if (falseResult > 0) { - std::cerr << "[ERROR] verify failed" << std::endl; - exit(1); - } - - return 0; -} diff --git a/kits/automatic_speech_recognition/asr_rnnt.cpp b/kits/automatic_speech_recognition/asr_rnnt.cpp deleted file mode 100644 index 1495e3d2..00000000 --- a/kits/automatic_speech_recognition/asr_rnnt.cpp +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include - -#include "inference.hpp" -#include "tensor.hpp" -#include "data_loader.hpp" -#include "utils.hpp" - -void print_help(char* argv[]) { - std::cout << "usage: " << argv[0] << " modelPath sequencesDirectory cpuAffinityPolicyName" << std::endl; -} - -int main(int argc, char* argv[]) { - if (argc < 3) { - print_help(argv); - return 1; - } - UTIL_TIME_INIT - - char* modelPath = argv[1]; - char* sequenceDirectory = argv[2]; - char* cpuAffinityPolicyName = (char*)""; - if (argc > 3) cpuAffinityPolicyName = argv[3]; - DeviceTypeIn device = d_CPU; - - auto pipeline = createPipeline(cpuAffinityPolicyName, modelPath, device); - - // load sequences - HashMap> inMap = pipeline->get_inputs(); - Vec sequenceDescs; - TensorDesc soundInputDesc = (*(inMap["sounds"])).get_desc(); - sequenceDescs.push_back(soundInputDesc); - - Vec> sequences, results; - Vec sequencePaths = load_data(sequenceDirectory+std::string("/input"), sequenceDescs, &sequences); - Vec resultDescs; - resultDescs.push_back(soundInputDesc); - Vec resultPaths = load_data(sequenceDirectory+std::string("/result"), resultDescs, &results); - - double totalTime = 0; - U32 sequenceIndex = 0; - U32 invalidSequence = 0; - U32 falseResult = 0; - std::cout << "[RESULT]:" << std::endl; - for (auto sequence: sequences) { - std::cout << sequencePaths[sequenceIndex] << ": " << std::endl; - TensorDesc desc = sequence[0].get_desc(); - TensorDesc inputDesc = tensor3d(soundInputDesc.dt, 1, tensorNumElements(desc)/soundInputDesc.dims[0], soundInputDesc.dims[0]); - HashMap inputDescMap; - inputDescMap["sounds"] = inputDesc; - pipeline->reready(inputDescMap); - - auto modelInputTensorNames = pipeline->get_model_input_tensor_names(); - HashMap> model_tensors_input; - for (int index = 0; index < (int)modelInputTensorNames.size(); index++) { - U8* tensorPointer = sequence[index].get_val(); - pipeline->copy_to_named_input(modelInputTensorNames[index], tensorPointer); - } - - double timeBegin = ut_time_ms(); - pipeline->run(); - double timeEnd = ut_time_ms(); - totalTime += (timeEnd - timeBegin); - - Tensor output = pipeline->get_tensor_by_name("labels"); - output.print(); - bool invalid = output.isInvalid(); - if (invalid) { - totalTime -= (timeEnd - timeBegin); - std::cout << "nan" << std::endl; - invalidSequence ++; - } - if (resultPaths.size() > sequenceIndex) { - U32 *result = (U32*)results[sequenceIndex][0].get_val(); - U32 inferenceSize = tensorNumElements(output.get_desc()); - for (U32 i = 0; i < tensorNumElements(results[sequenceIndex][0].get_desc()); i++) { - if (i >= inferenceSize || result[i] != output.getElement(i)) { - falseResult++; - break; - } - } - } - - sequenceIndex++; - } - - UTIL_TIME_STATISTICS - - std::cout << "[SUMMARY]:" << std::endl; - U32 validSequence = UNI_MAX(1, sequenceIndex - invalidSequence); - CI_info("speech recognition rate: " << 100.0 * (validSequence - falseResult) / validSequence << " %"); - CI_info("avg_time:" << 1.0 * totalTime / validSequence << "ms/sequence"); - - return 0; -} diff --git a/kits/automatic_speech_recognition/vad.cpp b/kits/automatic_speech_recognition/vad.cpp deleted file mode 100644 index b9dc269d..00000000 --- a/kits/automatic_speech_recognition/vad.cpp +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (C) 2019. 
Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include - -#include "inference.hpp" -#include "tensor.hpp" -#include "data_loader.hpp" -#include "utils.hpp" - -void print_help(char* argv[]) { - std::cout << "usage: " << argv[0] << " modelPath cpuAffinityPolicyName" << std::endl; -} - -int verify(Tensor vad, Tensor eoq) -{ - I32 result = 0; - U32 num = tensorNumElements(vad.get_desc()); - CHECK_REQUIREMENT(2 == num); - if (abs(vad.getElement(0) - 0.999107) >= 0.0005) { - result = 1; - } - if (abs(vad.getElement(1) - 0.0009) >= 0.0005) { - result = 1; - } - - num = tensorNumElements(eoq.get_desc()); - CHECK_REQUIREMENT(2 == num); - if (abs(eoq.getElement(0) - 1) >= 0.0005) { - result = 1; - } - if (abs(eoq.getElement(1) - 1.4e-8) >= 0.0005) { - result = 1; - } - return result; -} - -int main(int argc, char* argv[]) -{ - if (argc < 3) { - print_help(argv); - return 1; - } - UTIL_TIME_INIT - - char* modelPath = argv[1]; - char* cpuAffinityPolicyName = argv[2]; - DeviceTypeIn device = d_CPU; - auto pipeline = createPipeline(cpuAffinityPolicyName, modelPath, device); - - HashMap> inMap = pipeline->get_inputs(); - TensorDesc cacheDesc = (*(inMap["input_cache"])).get_desc(); - cacheDesc.df = DF_NCHWC8; - - HashMap inputDescMap; - inputDescMap["input_fea"] = (*(inMap["input_fea"])).get_desc(); - inputDescMap["input_cache"] = cacheDesc; - pipeline->reready(inputDescMap); - - Vec cache; - cache.resize(tensorNumBytes(cacheDesc), 0); - - double totalTime = 0; - int loops = 1; - U32 falseResult = 0; - for (int i = 0; i < loops; i++) { - pipeline->copy_to_named_input("input_cache", cache.data()); - pipeline->copy_to_named_input("input_fea", cache.data()); - - double timeBegin = ut_time_ms(); - pipeline->run(); - double timeEnd = ut_time_ms(); - totalTime += (timeEnd - timeBegin); - Tensor vad = pipeline->get_tensor_by_name("output_vad"); - std::cout << "output_vad: " << vad.getElement(0) << " " << vad.getElement(1) << std::endl; - Tensor eoq = pipeline->get_tensor_by_name("output_eoq"); - std::cout << "output_eoq: " << eoq.getElement(0) << " " << eoq.getElement(1) << std::endl; - falseResult += verify(vad, eoq); - Tensor outCache = pipeline->get_tensor_by_name("output_cache"); - memcpy(cache.data(), (U8*)outCache.get_val(), tensorNumBytes(cacheDesc)); - } - UTIL_TIME_STATISTICS - - std::cout << "[SUMMARY]:" << std::endl; - U32 validSequence = loops; - CI_info("vad rate: " << 100.0 * (validSequence - falseResult) / validSequence << " %"); - 
CI_info("avg_time:" << 1.0 * totalTime / validSequence << "ms/sequence"); - if (falseResult > 0) { - std::cerr << "[ERROR] verify failed" << std::endl; - exit(1); - } - - return 0; -} diff --git a/kits/bert/bert.cpp b/kits/bert/bert.cpp deleted file mode 100644 index 3de83c44..00000000 --- a/kits/bert/bert.cpp +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include "inference.hpp" -#include "tensor.hpp" -#include "data_loader.hpp" -#include "utils.hpp" - -void print_help(char* argv[]) { - std::cout << "usage: " << argv[0] << " modelPath sequenceDirectory cpuAffinityPolicyName" << std::endl; -} - -int main(int argc, char* argv[]) { - UTIL_TIME_INIT - - char* modelPath = (char*)""; - char* sequenceDirectory = (char*)""; - char* cpuAffinityPolicyName = (char*)""; - if (argc < 2) { - print_help(argv); - return 1; - } - modelPath = argv[1]; - if (argc > 2) sequenceDirectory = argv[2]; - if (argc > 3) cpuAffinityPolicyName = argv[3]; - DeviceTypeIn device = d_CPU; - auto pipeline = createPipeline(cpuAffinityPolicyName, modelPath, device); - - // load sequences - HashMap> inMap = pipeline->get_inputs(); - Vec sequenceDescs; - TensorDesc wordInputDesc = (*(inMap["bert_words"])).get_desc(); - wordInputDesc.dt = DT_U32; - sequenceDescs.push_back(wordInputDesc); - TensorDesc positionInputDesc = (*(inMap["bert_positions"])).get_desc(); - positionInputDesc.dt = DT_U32; - sequenceDescs.push_back(positionInputDesc); - TensorDesc tokenTypeInputDesc = (*(inMap["bert_token_type"])).get_desc(); - tokenTypeInputDesc.dt = DT_U32; - sequenceDescs.push_back(tokenTypeInputDesc); - Vec> sequences; - Vec sequencePaths = load_data(sequenceDirectory+std::string("/input"), sequenceDescs, &sequences); - - double totalTime = 0; - U32 sequenceIndex = 0; - U32 invalidSequence = 0; - std::cout << "[RESULT]:" << std::endl; - for (auto sequence: sequences) { - std::cout << sequencePaths[sequenceIndex] << std::endl; - HashMap inputDescMap; - inputDescMap["bert_words"] = sequence[0].get_desc(); - inputDescMap["bert_positions"] = sequence[1].get_desc(); - inputDescMap["bert_token_type"] = sequence[2].get_desc(); - pipeline->reready(inputDescMap); - - auto modelInputTensorNames = pipeline->get_model_input_tensor_names(); - for (int index = 0; index < (int)modelInputTensorNames.size(); index++) { - U8* tmp = sequence[index].get_val(); - pipeline->copy_to_named_input(modelInputTensorNames[index], tmp); - } - - double 
timeBegin = ut_time_ms(); - pipeline->run(); - double timeEnd = ut_time_ms(); - totalTime += (timeEnd - timeBegin); - - // stage5: process result - HashMap> outMap = pipeline->get_outputs(); - bool invalid = false; - for (auto iter: outMap) { - std::string key = iter.first; - std::shared_ptr value = iter.second; - Tensor result = *value; - invalid = result.isInvalid(); - if (key == "other") - continue; - U32 resultElementNum = tensorNumElements(result.get_desc()); - std::cout << " " << key << ": "; - std::cout << tensorDesc2Str(result.get_desc()); - std::cout << std::endl; - std::cout << " "; - for (U32 index = 0; index < resultElementNum; index++) { - std::cout << result.getElement(index) << " "; - } - std::cout< - -#include "inference.hpp" -#include "tensor.hpp" -#include "data_loader.hpp" -#include "utils.hpp" - -void print_help(char* argv[]) { - std::cout << "usage: " << argv[0] << " modelPath sequencesDirectory cpuAffinityPolicyName" << std::endl; -} - -int main(int argc, char* argv[]) { - if (argc < 3) { - print_help(argv); - return 1; - } - - UTIL_TIME_INIT - - char* modelPath = argv[1]; - char* sequenceDirectory = argv[2]; - char* cpuAffinityPolicyName = (char *)""; - char* algorithmMapPath = (char*)""; - DeviceTypeIn device = d_CPU; - if (argc > 3) { - const char* deviceName = "GPU"; - const char* argvName = argv[3]; - if(strcmp(deviceName, argvName) == 0) { - device = d_GPU; - } else { - cpuAffinityPolicyName = argv[3]; - } - } - if (argc > 4) { - algorithmMapPath = argv[8]; - } - - auto pipeline = createPipelineWithConfigure(cpuAffinityPolicyName, modelPath, device, algorithmMapPath); - - // load sequences - HashMap> inMap = pipeline->get_inputs(); - Vec sequenceDescs; - TensorDesc wordInputDesc = (*(inMap["tinybert_words"])).get_desc(); - wordInputDesc.dt = DT_U32; - sequenceDescs.push_back(wordInputDesc); - TensorDesc positionInputDesc = (*(inMap["tinybert_positions"])).get_desc(); - positionInputDesc.dt = DT_U32; - sequenceDescs.push_back(positionInputDesc); - TensorDesc tokenTypeInputDesc = (*(inMap["tinybert_token_type"])).get_desc(); - tokenTypeInputDesc.dt = DT_U32; - sequenceDescs.push_back(tokenTypeInputDesc); - Vec> sequences, intents, slots; - Vec sequencePaths = load_data(sequenceDirectory+std::string("/input"), sequenceDescs, &sequences); - - // load result - Vec intentDescs; - TensorDesc intentDesc = tensor1d(DT_F32, 2); - intentDescs.push_back(intentDesc); - Vec intentPaths = load_data(sequenceDirectory+std::string("/intent"), intentDescs, &intents); - Vec slotDescs; - slotDescs.push_back(wordInputDesc); - Vec slotPaths = load_data(sequenceDirectory+std::string("/slot"), slotDescs, &slots); - - double totalTime = 0; - U32 sequenceIndex = 0; - int falseIntent = 0; - int falseSlot = 0; - std::cout << "[RESULT]:" << std::endl; - for (auto sequence: sequences) { - std::cout << sequencePaths[sequenceIndex] << ":" << std::endl; - - HashMap inputDescMap; - inputDescMap["tinybert_words"] = sequence[0].get_desc(); - inputDescMap["tinybert_positions"] = sequence[1].get_desc(); - inputDescMap["tinybert_token_type"] = sequence[2].get_desc(); - pipeline->reready(inputDescMap); - - double timeBegin = ut_time_ms(); - auto modelInputTensorNames = pipeline->get_model_input_tensor_names(); - for (int index = 0; index < (int)modelInputTensorNames.size(); index++) { - U8* tmp = sequence[index].get_val(); - pipeline->copy_to_named_input(modelInputTensorNames[index], tmp); - } - - pipeline->run(); - - Tensor intentSoftmax = pipeline->get_tensor_by_name("intent_softmax"); - double 
timeEnd = ut_time_ms(); - totalTime += (timeEnd - timeBegin); - U32 intentNum = tensorNumElements(intentSoftmax.get_desc()); - U32 intentMaxIndex = 0; - for (U32 index = 1; index < intentNum; index++) { - if (intentSoftmax.getElement(index) > intentSoftmax.getElement(intentMaxIndex)) - intentMaxIndex = index; - } - std::cout << " intent: " << intentMaxIndex << " " << intentSoftmax.getElement(intentMaxIndex) << std::endl; - if (intentPaths.size() > 0) { - F32 *intentResult = (F32*)intents[sequenceIndex][0].get_val(); - if (intentMaxIndex != intentResult[0] || abs(intentSoftmax.getElement(intentMaxIndex) - intentResult[1]) > 0.1) { - falseIntent++; - } - } - Tensor slotSoftmax = pipeline->get_tensor_by_name("slot_softmax"); - auto slotDesc = slotSoftmax.get_desc(); - U32 slotNum = slotDesc.dims[1]; - U32 slotRange = slotDesc.dims[0]; - if(slotDesc.df == DF_MKT) { - slotNum = slotDesc.dims[0]; - slotRange = slotDesc.dims[1]; - } - Vec slotSoftmaxResult; - std::cout << " slot: "; - for (U32 i = 0; i < slotNum; i++) { - U32 slotMaxIndex = 0; - for (U32 index = 1; index < slotRange; index++) { - if (slotSoftmax.getElement(i*slotRange + index) > - slotSoftmax.getElement(i*slotRange + slotMaxIndex)) - slotMaxIndex = index; - } - slotSoftmaxResult.push_back(slotMaxIndex); - std::cout << slotMaxIndex << " "; - } - std::cout << std::endl; - if (slotPaths.size() > sequenceIndex) { - U32 *slotResult = (U32*)slots[sequenceIndex][0].get_val(); - for (U32 i = 0; i < slotSoftmaxResult.size(); i++) { - if (slotSoftmaxResult.size() != slots[sequenceIndex][0].get_desc().dims[0] || slotResult[i] != slotSoftmaxResult[i]) { - falseSlot++; - break; - } - } - } - - sequenceIndex++; - } - UTIL_TIME_STATISTICS - std::cout << "[SUMMARY]:" << std::endl; - U32 validSequence = UNI_MAX(1, sequenceIndex); - CI_info("intent correct rate: " << 100.0 * (validSequence - falseIntent) / validSequence << " %"); - CI_info("slot correct rate: " << 100.0 * (validSequence - falseSlot) / validSequence << " %"); - CI_info("avg_time:" << 1.0 * totalTime / validSequence << "ms/sequence"); - - return 0; -} diff --git a/kits/high_dynamic_range/hdr.cpp b/kits/high_dynamic_range/hdr.cpp deleted file mode 100644 index 76f667ee..00000000 --- a/kits/high_dynamic_range/hdr.cpp +++ /dev/null @@ -1,352 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- -#ifdef _USE_FP16 -#include -#include -#include "ut_util.h" -#include "type.h" -#include "tensor_desc.h" -#include "sequential_ocl.hpp" -#include "factory.hpp" -#include "ocl/factory_ocl.hpp" -#include "tensor.hpp" -#include "data_loader.hpp" -const int& min(const int& a, const int& b) -{ - return (b < a) ? b : a; -} - -const int& max(const int& a, const int& b) -{ - return (b < a) ? a : b; -} -void print_help() { - - std::cout << "please set argvs: " < -inline void calGuide(const int w, const int h, const int c, F16* para, T* input, F16* guide, std::string DATA_DT){ - float in_val[3]; - for(int i = 0; i < h; i++) { - for(int j = 0; j < w; j++){ - if(DATA_DT == "UCHAR"){ - in_val[0] = input[c * (j + w * i)] / 256.0; - in_val[1] = input[c * (j + w * i) + 1] / 256.0; - in_val[2] = input[c * (j + w * i) + 2] / 256.0; - } else { - in_val[0] = input[c * (j + w * i)]; - in_val[1] = input[c * (j + w * i) + 1]; - in_val[2] = input[c * (j + w * i) + 2]; - } - guide[j + w * i] = in_val[0] * para[0] + in_val[1] * para[1] + in_val[2] * para[2] + para[3]; - } - } - -} -template -inline void bilateralSliceApply(const int w, const int h, const int gw, const int gh, const int gd, const int input_chans, const int output_chans, const bool has_offset, - F16* grid, F16* guide, T* input, T* output, std::string DATA_DT) -{ - int grid_chans = input_chans * output_chans; - int coeff_stride = input_chans; - if(has_offset){ - grid_chans += output_chans; - coeff_stride += 1; - } - int sz = grid_chans; - int sx = grid_chans * gd; - int sy = grid_chans * gd * gw; - - float in_val[3]; - float out_val[3]; - for(int y = 0; y < h; ++y){ - float gy = (y + 0.5f) * gh / (1.0f * h); - int fy = static_cast(floor(gy - 0.5)); - for(int x = 0; x < w; ++x){ - float gx = (x + 0.5f) * gw / (1.0f * w); - float gz = guide[x + w * y] * gd; - int fx = static_cast(floor(gx - 0.5f)); - int fz = static_cast(floor(gz - 0.5f)); - float coeff_sample[12] = {0.0f}; - for(int xx = fx; xx < fx + 2; ++xx){ - int x_ = max(min(xx, gw - 1), 0); - float wx = fmax(1.0f- fabs(xx + 0.5 - gx), 0.0f); - for(int yy = fy; yy < fy + 2; ++yy){ - int y_ = max(min(yy, gh - 1), 0); - float wy = fmax(1.0f - fabs(yy + 0.5 - gy), 0.0f); - for(int zz = fz; zz < fz + 2; ++zz){ - int z_ = max(min(zz, gd-1), 0); - float wz = fmax(1.0f - fabs(zz + 0.5 - gz), 0.0f); - for(int in_c = 0; in_c < grid_chans; ++in_c){ - int grid_idx = in_c + sz * z_ + sx * x_ + sy * y_; - coeff_sample[in_c] += grid[grid_idx] * wx * wy * wz; - } - } - } - } - if(DATA_DT == "UCHAR"){ - in_val[0] = input[input_chans * (x + w * y)] / 256.0; - in_val[1] = input[input_chans * (x + w * y) + 1] / 256.0; - in_val[2] = input[input_chans * (x + w * y) + 2] / 256.0; - } else { - in_val[0] = input[input_chans * (x + w * y)]; - in_val[1] = input[input_chans * (x + w * y) + 1]; - in_val[2] = input[input_chans * (x + w * y) + 2]; - } - - if(has_offset){ - out_val[0] = in_val[0] * coeff_sample[0] + in_val[1] * coeff_sample[1] + in_val[2] * coeff_sample[2] + coeff_sample[3]; - out_val[1] = in_val[0] * coeff_sample[4] + in_val[1] * coeff_sample[5] + in_val[2] * coeff_sample[6] + coeff_sample[7]; - out_val[2] = in_val[0] * coeff_sample[8] + in_val[1] * coeff_sample[9] + in_val[2] * coeff_sample[10] + coeff_sample[11]; - } else { - out_val[0] = in_val[0] * coeff_sample[0] + in_val[1] * coeff_sample[1] + in_val[2] * coeff_sample[2]; - out_val[1] = in_val[0] * coeff_sample[3] + in_val[1] * coeff_sample[4] + in_val[2] * coeff_sample[5]; - out_val[2] = in_val[0] * coeff_sample[6] + in_val[1] * coeff_sample[7] + 
in_val[2] * coeff_sample[8]; - } - - if(DATA_DT == "UCHAR"){ - output[input_chans * (x + w * y)] = (U8)(out_val[0] * 256.0); - output[input_chans * (x + w * y) + 1] = (U8)(out_val[1] * 256.0); - output[input_chans * (x + w * y) + 2] = (U8)(out_val[2] * 256.0); - } else { - output[input_chans * (x + w * y)] = out_val[0]; - output[input_chans * (x + w * y) + 1] = out_val[1]; - output[input_chans * (x + w * y) + 2] = out_val[2]; - } - } - } -} - -template -void HDR_CPU(const int w, const int h, const int gw, const int gh, const int gd, const int input_chans, const int output_chans, const bool has_offset, - F16* grid, T* input, T* output, std::string DATA_DT){ - - U8* guideptr = (U8*) operator new (w * h * bytesOf(DT_F16)); - F16* guide = (F16*) guideptr; - F16 para[4]; - calWeight(para); - calGuide(w, h, input_chans, para, input, guide, DATA_DT); - bilateralSliceApply(w, h, gw, gh, gd, input_chans, output_chans, has_offset, grid, guide, input, output, DATA_DT); -} - -template -void buildInputTensor(DataType dt, DataFormat df, U32 n, U32 c, U32 h, U32 w, Vec* dims, Vec* inputTensors){ - TensorDesc inputDesc = tensor4df(dt, df, n, c, h, w); - U32 inputNum = tensorNumElements(inputDesc); - U32 inputSize = tensorNumBytes(inputDesc); - U8* inputVal = new U8[inputSize]; - - T* data = (T*) inputVal; - if(dt == DT_F16){ - for(U32 i = 0; i < inputNum; i++) data[i] = (T)(rand() & 255) / (256.0); - } - if(dt == DT_U8){ - for(U32 i = 0; i < inputNum; i++) { - data[i] = (T)(rand() & 255); - } - } - std::shared_ptr inputTensor = std::shared_ptr(new Tensor()); - inputTensor->set_desc(inputDesc); - inputTensor->set_shared_ptr(std::shared_ptr(inputVal)); - - dims->push_back(inputDesc); - inputTensors->push_back(*inputTensor.get()); -} - -int main(int argc, char* argv[]) { - - if(argc != 6 && argc != 5) { - printf("%d\n", argc); - print_help(); - return 0; - } - std::string INPUT_DT = "F16"; - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - if(argc == 6) INPUT_DT = argv[5]; - U32 gw = 16; - U32 gh = 16; - U32 gc = 96; - U32 gd = 8; - U32 coe = gc / gd; - bool has_offset = true; - - const Arch A = MALI; - DataType dt = DT_F16; - auto model = new SequentialOcl(A, dt, "OT_BilateralSliceApply"); - std::shared_ptr model_ptr = std::shared_ptr(model); - - Factory* factory_ocl = (Factory*)(new FactoryOCL()); - std::shared_ptr factory; - factory = std::shared_ptr(factory_ocl); - - - BilateralSliceApplyMode mode = BSliceApply_CONV; - auto op = factory->createBilateralSliceApply(coe, has_offset, mode); - model_ptr->add(op); - - Vec dims; - Vec inputTensors; - if(INPUT_DT=="UCHAR") { - buildInputTensor(DT_U8, DF_NHWC, in, ic, ih, iw, &dims, &inputTensors); - } else { - buildInputTensor(dt, DF_NHWC, in, ic, ih, iw, &dims, &inputTensors); - } - buildInputTensor(dt, DF_NHWC, 1, gc, gh, gw, &dims, &inputTensors);//grid - - F16* grid_val = (F16*)inputTensors[1].get_val(); - for(U32 i = 0; i < tensorNumElements(dims[1]); i++) grid_val[i] = grid_val[i] / 8.0; - U8* input = new U8[tensorNumBytes(dims[0])]; - U8* grid = new U8[tensorNumBytes(dims[1])]; - memcpy((void*)input, inputTensors[0].get_val(), tensorNumBytes(dims[0])); - memcpy((void*)grid, inputTensors[1].get_val(), tensorNumBytes(dims[1])); -// model_ptr->loadAlgorithmMapFromText("./"); - model_ptr->ready(dims, NULL, 1); - model_ptr->mark_input_output(); - model_ptr->mali_prepare(); -// model_ptr->saveAlgorithmMapToText("./"); - - double totalTime = 0; - double max_time = -DBL_MAX; - double min_time = DBL_MAX; - int 
loop = 10; - Vec> ocl_output; - for(int i = 0; i < loop; i++) { - double timeBegin = ut_time_ms(); - model_ptr->set_input_tensors(inputTensors); - model_ptr->run(); - ocl_output = model_ptr->get_output_tensors(); - double timeEnd = ut_time_ms(); - double t = timeEnd - timeBegin; - totalTime += t; - if(max_time < t) max_time = t; - if(min_time > t) min_time = t; - } - std::shared_ptr oclMem = ocl_output[0]->get_shared_ptr(); - - int e0, e1, e2, e3, e4, e5, e6; - e0 = 0; e1 = 0; e2 = 0; e3 = 0; e4 = 0; e5 = 0; e6 = 0; - float maxrel = 0; - float maxabs = 0; - if(INPUT_DT == "UCHAR") { - U8* output = new U8[iw * ih * ic * sizeof(U8)]; - HDR_CPU (iw, ih, gw, gh, gd, ic, ic, has_offset, (F16*)grid, input, output, INPUT_DT); - U8* ocl_res = (oclMem->desc.map_ptr); - for(U32 i = 0; i < ih; i++){ - for(U32 j = 0; j < iw; j++){ - U8 c, g; - int d; - int index = (i * iw + j) * 3; - for(int k = 0 ; k < 3; k++){ - c = output[index + k]; - g = ocl_res[index + k]; - d = c - g; - if(d < 0) d = -d; - maxabs = ((float)d > maxabs) ? (float)d : maxabs; - maxrel = ((float)d * 2/ (c + g + 0.000001) > maxrel) ? (float)d * 2 / (c + g + 0.000001): maxrel; - if(d >= 30) {e0++; continue;} - if(d >= 20) {e1++; continue;} - if(d >= 10) {e2++; continue;} - if(d >= 5) {e3++; continue;} - if(d >= 2) {e4++; continue;} - if(d >= 1) {e5++; continue;} - e6++; - } - } - } - std::cout << " abs(diff) >=30 number = " << e0 << std::endl; - std::cout << "20 <= abs(diff) < 30 number = " << e1 << std::endl; - std::cout << "10 <= abs(diff) < 20 number = " << e2 << std::endl; - std::cout << "5 <= abs(diff) < 10 number = " << e3 << std::endl; - std::cout << "2 <= abs(diff) < 5 number = " << e4 << std::endl; - std::cout << "1 <= abs(diff) < 2 number = " << e5 << std::endl; - std::cout << "0 <= abs(diff) < 1 number = " << e6 << std::endl; - std::cout << "maxabs = " << maxabs << std::endl; - std::cout << "maxrel = " << maxrel << std::endl; - delete[] output; - } else { - U8* output = new U8[iw * ih * ic * sizeof(F16)]; - HDR_CPU(iw, ih, gw, gh, gd, ic, ic, has_offset, (F16*)grid, (F16*)input, (F16*)output, INPUT_DT); - F16* cpu_res = (F16*)output; - F16* gpu_res = (F16*)oclMem->desc.map_ptr; - for(U32 i = 0; i < ih; i++){ - for(U32 j = 0; j < iw; j++){ - float c, g, d; - int index = (i * iw + j) * 3; - for(int k = 0 ; k < 3; k++){ - c = cpu_res[index + k]; - g = gpu_res[index + k]; - d = c - g; - if(d < 0) d = -d; - maxabs = ((float)d > maxabs) ? (float)d : maxabs; - maxrel = ((float)d * 2/ (c + g + 0.000001) > maxrel) ? 
(float)d * 2 / (c + g + 0.000001): maxrel; - if(d >= 1) {e0++; continue;} - if(d >= 0.1) {e1++; continue;} - if(d >= 0.01) {e2++; continue;} - if(d >= 0.001) {e3++; continue;} - if(d >= 0.0001) {e4++; continue;} - if(d >= 0.00001) {e5++; continue;} - e6++; - } - } - } - std::cout << " abs(diff) >=1 number = " << e0 << std::endl; - std::cout << "0.1 <= abs(diff) < 1 number = " << e1 << std::endl; - std::cout << "0.01 <= abs(diff) < 0.1 number = " << e2 << std::endl; - std::cout << "0.001 <= abs(diff) < 0.01 number = " << e3 << std::endl; - std::cout << "0.0001 <= abs(diff) < 0.001 number = " << e4 << std::endl; - std::cout << "0.00001 <= abs(diff) < 0.0001 number = " << e5 << std::endl; - std::cout << "0 <= abs(diff) < 0.00001 number = " << e6 << std::endl; - std::cout << "maxabs = " << maxabs << std::endl; - std::cout << "maxrel = " << maxrel << std::endl; - - delete[] output; - } - - printf("avg_time: %lf ms\n", 1.0 * totalTime / loop); - printf("max_time: %lf ms\n", 1.0 * max_time); - printf("min_time: %lf ms\n", 1.0 * min_time); - UTIL_TIME_STATISTICS - - delete[] input; - delete[] grid; - return 0; -} -#endif diff --git a/kits/image_classification/classification.cpp b/kits/image_classification/classification.cpp deleted file mode 100644 index 99b1a75f..00000000 --- a/kits/image_classification/classification.cpp +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include -#include -#include "inference.hpp" -#include "tensor.hpp" -#include "data_loader.hpp" -#include "result_format.hpp" -#include "utils.hpp" - -void print_help(char* argv[]) -{ - std::cout << "usage: " << argv[0] << " modelPath imagesDirectory imageStyle scaleValue topK correctLabel cpuAffinityPolicyName algorithmMapPath" << std::endl; -} - -int main(int argc, char* argv[]) -{ - UTIL_TIME_INIT - - char* modelPath = (char*)""; - char* imageDir = (char*)""; - char* cpuAffinityPolicyName = (char*)""; - char* algorithmMapPath = (char*)""; - ImageFormat imageFormat = RGB; - DeviceTypeIn device = d_CPU; - F32 scaleValue = 1; - int topK = 5; - int category = -1; - if (argc < 2) { - print_help(argv); - return 1; - } - modelPath = argv[1]; - if (argc > 2) { - imageDir = argv[2]; - } - if (argc > 3) { - imageFormat = (std::string(argv[3]) == std::string("BGR") ? 
BGR : RGB); - if (std::string(argv[3]) == std::string("RGB_SC")) { - imageFormat = RGB_SC; - } else if (std::string(argv[3]) == std::string("BGR_SC_RAW")) { - imageFormat = BGR_SC_RAW; - } else if (std::string(argv[3]) == std::string("RGB_SC_RAW")) { - imageFormat = RGB_SC_RAW; - } - } - if (argc > 4) { - scaleValue = atof(argv[4]); - } - if (argc > 5) { - topK = atoi(argv[5]); - } - if (argc > 6) { - category = atoi(argv[6]); - } - if (argc > 7) { - const char* deviceName = "GPU"; - const char* argvName = argv[7]; - if(strcmp(deviceName, argvName) == 0) { - device = d_GPU; - } else { - cpuAffinityPolicyName = argv[7]; - } - } - if (argc > 8) { - algorithmMapPath = argv[8]; - } - //DeviceTypeIn device = d_CPU; - auto cnn = createPipelineWithConfigure(cpuAffinityPolicyName, modelPath, device, algorithmMapPath); - - // load images - HashMap> inMap = cnn->get_inputs(); - TensorDesc imageDesc = (*(inMap.begin()->second)).get_desc(); - Vec imageDescs; - imageDescs.push_back(imageDesc); - Vec> images; - Vec imagePaths = load_image_with_scale(imageDir, imageDescs, &images, imageFormat, scaleValue); - - std::map categoryNum; - double totalTime = 0; - double max_time = -DBL_MAX; - double min_time = DBL_MAX; - U32 imageIndex = 0; - std::cout << "[RESULT]:" << std::endl; - int top1Index = 0; - int top1Match = 0; - int topKMatch = 0; - const int INVALID_INDEX = INT_MAX; - - for (auto image: images) { - std::cout << imagePaths[imageIndex] << " : "; - // stage3: set input - double timeBegin = ut_time_ms(); - if(device == d_CPU){ - auto curModelInputTensorNames = cnn->get_model_input_tensor_names(); - for (int index = 0; index < (int)curModelInputTensorNames.size(); index++) { - cnn->copy_to_named_input(curModelInputTensorNames[index], image[index].get_val()); - DEBUG_info("curModelInputTensorNames[index]: " << curModelInputTensorNames[index]); - } - } else { - auto curModelInputTensorNames = cnn->get_model_input_tensor_names(); - HashMap> modelInputTensors; - for (int index = 0; index < (int)curModelInputTensorNames.size(); index++) { - modelInputTensors[curModelInputTensorNames[index]] = image[index].get_shared_ptr(); - } - cnn->set_input_tensors_value(modelInputTensors); - } - // stage4: run - cnn->run(); - - // stage5: process result - HashMap> outMap = cnn->get_outputs(); - double timeEnd = ut_time_ms(); - totalTime += (timeEnd - timeBegin); - Tensor result = *(outMap.begin()->second); - bool invalid = result.isInvalid(); - if (!invalid) { - Vec topKResult = topK_index(result, topK); - top1Index = topKResult[0]; - if (category != -1) { - if (top1Index == category) { - top1Match ++; - } - for (int i = 0; i < topK; i++) { - if(topKResult[i] == category) { - topKMatch++; - break; - } - } - for (int i = 0; i < topK; i++) { - std::cout << topKResult[i] << " "; - } - std::cout << std::endl; - } - - if ((timeEnd - timeBegin) >= max_time) { - max_time = (timeEnd - timeBegin); - } - - if ((timeEnd - timeBegin) <= min_time) { - min_time = (timeEnd - timeBegin); - } - } - else{ - totalTime -= (timeEnd - timeBegin); - top1Index = INVALID_INDEX; - std::cout << "nan" << std::endl; - } - if (categoryNum.count(top1Index) == 0) { - categoryNum[top1Index] = 1; - } else { - categoryNum[top1Index] = categoryNum[top1Index] + 1; - } - imageIndex++; - } - - UTIL_TIME_STATISTICS - - std::cout << "[CATEGORY]:" << std::endl; - std::cout << "category\tnum" << std::endl; - U32 nanImages = 0; - for (auto elem : categoryNum) { - if(elem.first == INVALID_INDEX){ - std::cout << "nan\t" << elem.second << std::endl; - nanImages = 
elem.second; - } - else { - std::cout << elem.first << "\t" << elem.second << std::endl; - } - } - U32 validImages = imageIndex - nanImages; - std::cout << "[SUMMARY]:" << std::endl; - CI_info("top1:" << 1.0 * top1Match / validImages); - CI_info("top" << topK << ":" << 1.0 * topKMatch / validImages); - CI_info("avg_time:" << 1.0 * totalTime / validImages << "ms/image"); - CI_info("max_time:" << 1.0 * max_time << "ms/image"); - CI_info("min_time:" << 1.0 * min_time << "ms/image"); - return 0; -} diff --git a/kits/image_classification/classification_bin.cpp b/kits/image_classification/classification_bin.cpp deleted file mode 100644 index 6471760a..00000000 --- a/kits/image_classification/classification_bin.cpp +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
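// The classification flow above computes top-1/top-K accuracy from a topK_index
// helper. A minimal sketch of top-K selection over raw scores (illustrative name
// and signature, not Bolt's actual helper):
#include <algorithm>
#include <numeric>
#include <vector>

std::vector<int> topKIndexSketch(const std::vector<float> &scores, int k)
{
    std::vector<int> idx(scores.size());
    std::iota(idx.begin(), idx.end(), 0);  // 0, 1, 2, ...
    k = std::min(k, (int)idx.size());
    std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
        [&](int a, int b) { return scores[a] > scores[b]; });  // descending by score
    idx.resize(k);
    return idx;  // idx[0] is the top-1 prediction
}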
- - -#include -#include - -#include "inference.hpp" -#include "tensor.hpp" -#include "type.h" -#include "tensor_desc.h" -#include "model_serialize_deserialize.hpp" -#include "data_loader.hpp" -#include "result_format.hpp" -#include "utils.hpp" - - -void print_help(char* argv[]) { - std::cout << "usage: " << argv[0] << " modelPath binaryFileDirectory cpuAffinityPolicyName algorithmMapPath" << std::endl; -} - -int main(int argc, char* argv[]) { - UTIL_TIME_INIT - - char* modelPath = (char*)""; - char* sequenceDirectory = (char*)""; - char* cpuAffinityPolicyName = (char*)""; - char* algorithmMapPath = (char*)""; - if (argc < 2) { - print_help(argv); - return 1; - } - modelPath = argv[1]; - if (argc > 2) sequenceDirectory = argv[2]; - if (argc > 3) cpuAffinityPolicyName = argv[3]; - if (argc > 4) algorithmMapPath = argv[4]; - - DeviceTypeIn device = d_CPU; - auto pipeline = createPipelineWithConfigure(cpuAffinityPolicyName, modelPath, device, algorithmMapPath); - - // load data - HashMap> inMap = pipeline->get_inputs(); - Vec sourceDataTypes; - sourceDataTypes.push_back(DT_F32); - Vec inputDescs; - TensorDesc inputDesc = (*(inMap["Data"])).get_desc(); - inputDescs.push_back(inputDesc); - Vec> sequences; - Vec sequencePaths = load_bin_with_type(sequenceDirectory, inputDescs, &sequences, sourceDataTypes); - - double totalTime = 0; - U32 sequenceIndex = 0; - U32 invalidSequence = 0; - std::cout << "[RESULT]:" << std::endl; - for (auto sequence: sequences) { - std::cout << sequencePaths[sequenceIndex] << std::endl; - // stage3: set input - Vec input; - input = sequence; - - auto modelInputTensorNames = pipeline->get_model_input_tensor_names(); - HashMap> model_tensors_input; - for (int index = 0; index < (int)modelInputTensorNames.size(); index++) { - model_tensors_input[modelInputTensorNames[index]] = input[index].get_shared_ptr(); - } - pipeline->set_input_tensors_value(model_tensors_input); - - // stage4: run - double timeBegin = ut_time_ms(); - pipeline->run(); - double timeEnd = ut_time_ms(); - totalTime += (timeEnd - timeBegin); - - // stage5: process result - HashMap> outMap = pipeline->get_outputs(); - bool invalid = false; - for (auto iter: outMap) { - std::string key = iter.first; - std::shared_ptr value = iter.second; - Tensor result = *value; - invalid = result.isInvalid(); - if (key == "other") - continue; - U32 resultElementNum = tensorNumElements(result.get_desc()); - std::cout << " " << key << ": "; - std::cout << tensorDesc2Str(result.get_desc()); - std::cout << " "; - for (U32 index = 0; index < resultElementNum; index++) { - std::cout << result.getElement(index) << " "; - } - std::cout< - -#include "inference.hpp" -#include "tensor.hpp" -#include "data_loader.hpp" -#include "utils.hpp" - -void print_help(char* argv[]) { - std::cout << "usage: " << argv[0] << " modelPath sequencesDirectory cpuAffinityPolicyName" << std::endl; -} - -int main(int argc, char* argv[]) { - if (argc < 3) { - print_help(argv); - return 1; - } - UTIL_TIME_INIT - - char* modelPath = argv[1]; - char* sequenceDirectory = argv[2]; - char* cpuAffinityPolicyName = (char*)""; - if (argc > 3) cpuAffinityPolicyName = argv[3]; - DeviceTypeIn device = d_CPU; - - auto pipeline = createPipeline(cpuAffinityPolicyName, modelPath, device); - - // load sequences - HashMap> inMap = pipeline->get_inputs(); - Vec sequenceDescs; - TensorDesc wordInputDesc = (*(inMap["nmt_words"])).get_desc(); - wordInputDesc.dt = DT_U32; - sequenceDescs.push_back(wordInputDesc); - TensorDesc positionInputDesc = 
(*(inMap["nmt_positions"])).get_desc(); - positionInputDesc.dt = DT_U32; - sequenceDescs.push_back(positionInputDesc); - - Vec> sequences, results; - Vec sequencePaths = load_data(sequenceDirectory+std::string("/input"), sequenceDescs, &sequences); - Vec resultDescs; - resultDescs.push_back(wordInputDesc); - Vec resultPaths = load_data(sequenceDirectory+std::string("/result"), resultDescs, &results); - - double totalTime = 0; - U32 sequenceIndex = 0; - U32 invalidSequence = 0; - U32 falseResult = 0; - std::cout << "[RESULT]:" << std::endl; - for (auto sequence: sequences) { - std::cout << sequencePaths[sequenceIndex] << ": " << std::endl; - HashMap inputDescMap; - inputDescMap["nmt_words"] = sequence[0].get_desc(); - inputDescMap["nmt_positions"] = sequence[1].get_desc(); - pipeline->reready(inputDescMap); - - auto modelInputTensorNames = pipeline->get_model_input_tensor_names(); - HashMap> model_tensors_input; - for (int index = 0; index < (int)modelInputTensorNames.size(); index++) { - U8* tensorPointer = sequence[index].get_val(); - pipeline->copy_to_named_input(modelInputTensorNames[index], tensorPointer); - } - - double timeBegin = ut_time_ms(); - pipeline->run(); - double timeEnd = ut_time_ms(); - totalTime += (timeEnd - timeBegin); - - Tensor output = pipeline->get_tensor_by_name("decoder_output"); - output.print(); - bool invalid = output.isInvalid(); - if (invalid) { - totalTime -= (timeEnd - timeBegin); - std::cout << "nan" << std::endl; - invalidSequence ++; - } - if (resultPaths.size() > sequenceIndex) { - U32 *result = (U32*)results[sequenceIndex][0].get_val(); - U32 inferenceSize = tensorNumElements(output.get_desc()); - for (U32 i = 0; i < tensorNumElements(results[sequenceIndex][0].get_desc()); i++) { - if (i >= inferenceSize || result[i] != output.getElement(i)) { - falseResult++; - break; - } - } - } - - sequenceIndex++; - } - - UTIL_TIME_STATISTICS - - std::cout << "[SUMMARY]:" << std::endl; - U32 validSequence = UNI_MAX(1, sequenceIndex - invalidSequence); - CI_info("translation correct rate: " << 100.0 * (validSequence - falseResult) / validSequence << " %"); - CI_info("avg_time:" << 1.0 * totalTime / validSequence << "ms/sequence"); - if (falseResult > 0) { - std::cerr << "[ERROR] verify failed" << std::endl; - exit(1); - } - - return 0; -} diff --git a/kits/machine_translation/nmt_tsc.cpp b/kits/machine_translation/nmt_tsc.cpp deleted file mode 100644 index bcdbb55d..00000000 --- a/kits/machine_translation/nmt_tsc.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include - -#include "inference.hpp" -#include "tensor.hpp" -#include "data_loader.hpp" -#include "utils.hpp" - -void print_help(char* argv[]) { - std::cout << "usage: " << argv[0] << " modelPath sequencesDirectory subNetworkName[encoder|prediction_net] cpuAffinityPolicyName" << std::endl; -} - -HashMap prepareStates(DataType dt, std::string sequenceDirectory, - std::string shapeMapFileName) -{ - HashMap shapeMap; - std::string filePath = sequenceDirectory + "/" + shapeMapFileName; - FILE *shapeMapFile = fopen(filePath.c_str(), "r"); - char buffer[NAME_LEN]; - while (fscanf(shapeMapFile, "%s", buffer) != EOF) { - TensorDesc desc; - fscanf(shapeMapFile, "%u", &(desc.nDims)); - for (U32 i = 0; i < desc.nDims; i++) - fscanf(shapeMapFile, "%u", &(desc.dims[desc.nDims - 1 - i])); - if (std::string(buffer) == std::string("encoder_words") - || std::string(buffer) == std::string("encoder_positions") - || std::string(buffer) == std::string("decoder_words") - || std::string(buffer) == std::string("decoder_positions")) - desc.dt = DT_U32; - else - desc.dt = dt; - desc.df = DF_NCHW; - shapeMap[buffer] = desc; - } - fclose(shapeMapFile); - - HashMap tensorMap; - for (auto iter: shapeMap) { - std::string filePath = sequenceDirectory + "/" + iter.first + ".txt"; - TensorDesc desc = iter.second; - tensorMap[iter.first] = load_txt(filePath, Vec{desc})[0]; - } - return tensorMap; -} - -void saveStates(std::shared_ptr pipeline, std::string sequenceDirectory, - std::string outputFileName, std::string outputStatesFileName) -{ - char buffer[NAME_LEN]; - std::string outputFilePath = sequenceDirectory + "/" + outputFileName; - std::string outputStatesFilePath = sequenceDirectory + "/" + outputStatesFileName; - FILE *outputFile = fopen(outputFilePath.c_str(), "r"); - FILE *outputStatesFile = fopen(outputStatesFilePath.c_str(), "w"); - while (!feof(outputFile)) { - fscanf(outputFile, "%s", buffer); - Tensor tensor = pipeline->get_tensor_by_name(buffer); - TensorDesc desc = tensor.get_desc(); - - // write states - fprintf(outputStatesFile, "%s\n", buffer); - fprintf(outputStatesFile, "%u\n", desc.nDims); - for (U32 i = 0; i < desc.nDims; i++) - fprintf(outputStatesFile, "%u ", desc.dims[desc.nDims-1-i]); - - // write data - U32 num = tensorNumElements(desc); - std::string outputDataPath = sequenceDirectory + "/" + std::string(buffer) + ".txt"; - FILE *outputDataFile = fopen(outputDataPath.c_str(), "w"); - for (U32 i = 0; i < num; i++) { - fprintf(outputDataFile, "%f ", tensor.getElement(i)); - if (i % 10 == 9) - fprintf(outputDataFile, "\n"); - } - fclose(outputDataFile); - } - fclose(outputFile); - fclose(outputStatesFile); -} - -int main(int argc, char* argv[]) { - if (argc < 5) { - print_help(argv); - return 1; - } - UTIL_TIME_INIT - - char* modelPath = argv[1]; - char* sequenceDirectory = argv[2]; - std::string subNetworkName = std::string(argv[3]); - char* cpuAffinityPolicyName = argv[4]; - DeviceTypeIn device = d_CPU; - std::string outputTensorName; - if (subNetworkName == std::string("encoder")) { - outputTensorName = "transformer_decoder_layer5_multihead_v"; - } else if (subNetworkName == std::string("decoder")) { - outputTensorName = "transformer_decoder_embedding_argmax"; - } else { - std::cerr << "[ERROR] unrecognized sub 
network(encoder|decoder) " << subNetworkName << std::endl; - exit(1); - } - - DataType dt; - std::string modelPathStr = std::string(modelPath); - //"_f[16|32].bolt" - std::string modelPathSuffix = modelPathStr.substr(modelPathStr.size() - 9); - if (modelPathSuffix == std::string("_f16.bolt")) - dt = DT_F16; - else if (modelPathSuffix == std::string("_f32.bolt")) - dt = DT_F32; - else { - std::cerr << "[ERROR] unrecognized model file path suffix " << modelPathSuffix << std::endl; - exit(1); - } - auto pipeline = createPipeline(cpuAffinityPolicyName, modelPath, device); - - double totalTime = 0; - int loops = 60; - for (int i = 0; i < loops; i++) { - HashMap input = prepareStates(dt, sequenceDirectory, "input_shape.txt"); - HashMap inputDescMap; - for (auto iter: input) { - inputDescMap[iter.first] = iter.second.get_desc(); - } - pipeline->infer_output_tensors_size(inputDescMap); - pipeline->assign_output_tensor(); - for (auto iter: input) { - U8* tensorPointer = iter.second.get_val(); - pipeline->copy_to_named_input(iter.first, tensorPointer); - } - - double timeBegin = ut_time_ms(); - pipeline->run(); - double timeEnd = ut_time_ms(); - totalTime += (timeEnd - timeBegin); - Tensor output = pipeline->get_tensor_by_name(outputTensorName); - output.print(); - saveStates(pipeline, sequenceDirectory, "output_name.txt", "output_shape.txt"); - } - UTIL_TIME_STATISTICS - - std::cout << "[SUMMARY]:" << std::endl; - U32 validSequence = loops; - CI_info("avg_time:" << 1.0 * totalTime / validSequence << "ms/sequence"); - - return 0; -} diff --git a/kits/super_resolution/super_resolution.cpp b/kits/super_resolution/super_resolution.cpp deleted file mode 100644 index 9450e82e..00000000 --- a/kits/super_resolution/super_resolution.cpp +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
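// prepareStates in nmt_tsc.cpp reads a whitespace-separated shape file: each entry
// is "<tensor_name> <nDims> <dim_0> ... <dim_{nDims-1}>", with dims listed
// outermost first (the original reverses them into desc.dims). A sketch of the
// same parse using iostreams instead of fscanf (illustrative types):
#include <fstream>
#include <map>
#include <string>
#include <vector>

std::map<std::string, std::vector<unsigned>> readShapeMap(const std::string &path)
{
    std::map<std::string, std::vector<unsigned>> shapes;
    std::ifstream in(path);
    std::string name;
    unsigned nDims = 0;
    while (in >> name >> nDims) {
        std::vector<unsigned> dims(nDims);
        for (unsigned i = 0; i < nDims; i++) {
            in >> dims[i];  // outermost dimension first, as in the file
        }
        shapes[name] = dims;
    }
    return shapes;
}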
- - -#include <iostream> -#include <float.h> -#include <string.h> -#include "inference.hpp" -#include "tensor.hpp" -#include "data_loader.hpp" -#include "result_format.hpp" -#include "utils.hpp" - -void print_help(char* argv[]) -{ - std::cout << "usage: " << argv[0] << " modelPath cpuAffinityPolicyName algorithmMapPath" << std::endl; -} - -int main(int argc, char* argv[]) -{ - UTIL_TIME_INIT - - char* modelPath = (char*)""; - char* cpuAffinityPolicyName = (char*)""; - char* algorithmMapPath = (char*)""; - DeviceTypeIn device = d_CPU; - U32 testNum = 1; - if (argc < 2) { - print_help(argv); - return 1; - } - modelPath = argv[1]; - if (argc > 2) { - const char* deviceName = "GPU"; - const char* argvName = argv[2]; - if(strcmp(deviceName, argvName) == 0) { - device = d_GPU; - } else { - cpuAffinityPolicyName = argv[2]; - } - } - if (argc > 3) { - algorithmMapPath = argv[3]; - } - /* - char* imageDir = (char*)""; - ImageFormat ImageFormat = RGB; - if (argc > 4) { - imageDir = argv[2]; - } - if (argc > 5) { - ImageFormat = (std::string(argv[3]) == std::string("BGR") ? BGR : RGB); - if (std::string(argv[3]) == std::string("RGB_SC")) { - ImageFormat = RGB_SC; - } else if (std::string(argv[3]) == std::string("BGR_SC_RAW")) { - ImageFormat = BGR_SC_RAW; - } else if (std::string(argv[3]) == std::string("RGB_SC_RAW")) { - ImageFormat = RGB_SC_RAW; - } - } - */ - auto cnn = createPipelineWithConfigure(cpuAffinityPolicyName, modelPath, device, algorithmMapPath); - - // set input value (TODO: use image) - HashMap<std::string, std::shared_ptr<Tensor>> inMap = cnn->get_inputs(); - TensorDesc imageDesc = (*(inMap.begin()->second)).get_desc(); - Vec<TensorDesc> imageDescs; - imageDescs.push_back(imageDesc); - Vec<Vec<Tensor>> images; - for(U32 i = 0; i < testNum; i++) { - std::shared_ptr<U8> imageData((U8*) operator new(tensorNumBytes(imageDescs[0]))); - F16* val = (F16*)imageData.get(); - for(U32 i = 0; i < tensorNumElements(imageDescs[0]); i++) val[i] = (i % 1024) / 1024.0 - 0.5; - std::shared_ptr<Tensor> tensorData(new Tensor()); - tensorData->set_desc(imageDescs[0]); - tensorData->set_shared_ptr(imageData); - Vec<Tensor> image; - image.push_back(*tensorData.get()); - images.push_back(image); - } - - double max_time = -DBL_MAX; - double min_time = DBL_MAX; - double totalTime = 0; - U32 imageIndex = 0; - U32 invalidIndex = 0; - std::cout << "[RESULT]:" << std::endl; - - for (auto image: images) { - // stage3: set input - double timeBegin = ut_time_ms(); - if(device == d_CPU){ - auto curModelInputTensorNames = cnn->get_model_input_tensor_names(); - for (int index = 0; index < (int)curModelInputTensorNames.size(); index++) { - cnn->copy_to_named_input(curModelInputTensorNames[index], image[index].get_val()); - DEBUG_info("curModelInputTensorNames[index]: " << curModelInputTensorNames[index]); - } - } else { - auto curModelInputTensorNames = cnn->get_model_input_tensor_names(); - HashMap<std::string, std::shared_ptr<Tensor>> modelInputTensors; - for (int index = 0; index < (int)curModelInputTensorNames.size(); index++) { - modelInputTensors[curModelInputTensorNames[index]] = image[index].get_shared_ptr(); - } - cnn->set_input_tensors_value(modelInputTensors); - } - // stage4: run - cnn->run(); - - // stage5: process result - HashMap<std::string, std::shared_ptr<Tensor>> outMap = cnn->get_outputs(); - double timeEnd = ut_time_ms(); - totalTime += (timeEnd - timeBegin); - Tensor result = *(outMap.begin()->second); - bool invalid = result.isInvalid(); - if (!invalid) { - if ((timeEnd - timeBegin) >= max_time) { - max_time = (timeEnd - timeBegin); - } - - if ((timeEnd - timeBegin) <= min_time) { - min_time = (timeEnd - timeBegin); - } - } - else{ - std::cout << "warning: the result is nan" <<
std::endl; - totalTime -=(timeEnd - timeBegin); - invalidIndex++; - } - imageIndex++; - } - - UTIL_TIME_STATISTICS - - std::cout << "[SUMMARY]:" << std::endl; - CI_info("avg_time:" << 1.0 * totalTime / (imageIndex - invalidIndex) << "ms/image"); - CI_info("max_time:" << 1.0 * max_time << "ms/image"); - CI_info("min_time:" << 1.0 * min_time << "ms/image"); - return 0; -} diff --git a/kits/text_to_speech/tts.cpp b/kits/text_to_speech/tts.cpp deleted file mode 100644 index 26bea2fd..00000000 --- a/kits/text_to_speech/tts.cpp +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include - -#include "inference.hpp" -#include "tensor.hpp" -#include "data_loader.hpp" -#include "utils.hpp" - -void print_help(char* argv[]) { - std::cout << "usage: " << argv[0] << " modelPath sequencesDirectory subNetworkName[encoder_decoder|postnet] cpuAffinityPolicyName" << std::endl; -} - -HashMap prepareStates(DataType dt, std::string sequenceDirectory, - std::string shapeMapFileName) -{ - HashMap shapeMap; - std::string filePath = sequenceDirectory + "/" + shapeMapFileName; - FILE *shapeMapFile = fopen(filePath.c_str(), "r"); - char buffer[NAME_LEN]; - while (fscanf(shapeMapFile, "%s", buffer) != EOF) { - TensorDesc desc; - fscanf(shapeMapFile, "%u", &(desc.nDims)); - for (U32 i = 0; i < desc.nDims; i++) - fscanf(shapeMapFile, "%u", &(desc.dims[desc.nDims - 1 - i])); - if (std::string(buffer) == std::string("tts_words")) { - desc.dt = DT_U32; - } else { - desc.dt = dt; - } - if (std::string(buffer) == std::string("tts_words") || std::string(buffer) == std::string("tts_alignments")) { - desc.df = DF_NORMAL; - } else { - desc.df = DF_NCHW; - } - shapeMap[buffer] = desc; - } - fclose(shapeMapFile); - - HashMap tensorMap; - for (auto iter: shapeMap) { - std::string filePath = sequenceDirectory + "/" + iter.first + ".txt"; - TensorDesc desc = iter.second; - tensorMap[iter.first] = load_txt(filePath, Vec{desc})[0]; - } - return tensorMap; -} - -void saveStates(std::shared_ptr pipeline, std::string sequenceDirectory, - std::string outputFileName, std::string outputStatesFileName) -{ - char buffer[NAME_LEN]; - std::string outputFilePath = sequenceDirectory + "/" + outputFileName; - std::string outputStatesFilePath = sequenceDirectory + "/" + outputStatesFileName; - FILE *outputFile = fopen(outputFilePath.c_str(), "r"); - FILE *outputStatesFile = fopen(outputStatesFilePath.c_str(), "w"); - while (!feof(outputFile)) { - 
fscanf(outputFile, "%s", buffer); - Tensor tensor = pipeline->get_tensor_by_name(buffer); - TensorDesc desc = tensor.get_desc(); - - // write states - fprintf(outputStatesFile, "%s\n", buffer); - fprintf(outputStatesFile, "%u\n", desc.nDims); - for (U32 i = 0; i < desc.nDims; i++) - fprintf(outputStatesFile, "%u ", desc.dims[desc.nDims-1-i]); - - // write data - U32 num = tensorNumElements(desc); - std::string outputDataPath = sequenceDirectory + "/" + std::string(buffer) + ".txt"; - FILE *outputDataFile = fopen(outputDataPath.c_str(), "w"); - for (U32 i = 0; i < num; i++) { - fprintf(outputDataFile, "%f ", tensor.getElement(i)); - if (i % 10 == 9) - fprintf(outputDataFile, "\n"); - } - fclose(outputDataFile); - } - fclose(outputFile); - fclose(outputStatesFile); -} - -int verify(Tensor tensor, std::string subNetworkName) -{ - U32 num = tensorNumElements(tensor.get_desc()); - F32 sum = 0; - for (U32 i = 0; i < num; i++) { - sum += tensor.getElement(i); - } - I32 result = 0; - if (subNetworkName == std::string("encoder_decoder")) { - if (abs(sum - 6921) >= 1100) { - result = 1; - } - } else if (subNetworkName == std::string("postnet")) { - if (abs(sum - (-11987.7)) >= 1) { - result = 1; - } - } else if (subNetworkName == std::string("melgan_vocoder")) { - if (abs(sum - (-0.665192)) >= 0.7) { - result = 1; - } - } - return result; -} - -int main(int argc, char* argv[]) { - if (argc < 5) { - print_help(argv); - return 1; - } - UTIL_TIME_INIT - - char* modelPath = argv[1]; - char* sequenceDirectory = argv[2]; - std::string subNetworkName = std::string(argv[3]); - char* cpuAffinityPolicyName = argv[4]; - DeviceTypeIn device = d_CPU; - std::vector outputTensorNames; - if (subNetworkName == std::string("encoder_decoder")) { - outputTensorNames.push_back("decoder_result"); - outputTensorNames.push_back("decoder_position"); - } else if (subNetworkName == std::string("postnet")) { - outputTensorNames.push_back("mel"); - } else if (subNetworkName == std::string("melgan_vocoder")) { - outputTensorNames.push_back("output"); - } else { - std::cerr << "[ERROR] unrecognized sub network(encoder_decoder|postnet) " << subNetworkName << std::endl; - exit(1); - } - - DataType dt; - std::string modelPathStr = std::string(modelPath); - //"_f[16|32].bolt" - std::string modelPathSuffix = modelPathStr.substr(modelPathStr.size() - 9); - if (modelPathSuffix == std::string("_f16.bolt")) - dt = DT_F16; - else if (modelPathSuffix == std::string("_f32.bolt")) - dt = DT_F32; - else if (modelPathSuffix == std::string("t8_q.bolt")) - dt = DT_F16; - else { - std::cerr << "[ERROR] unrecognized model file path suffix " << modelPathSuffix << std::endl; - exit(1); - } - auto pipeline = createPipeline(cpuAffinityPolicyName, modelPath, device); - - double totalTime = 0; - int loops = 1; - U32 falseResult = 0; - for (int i = 0; i < loops; i++) { - HashMap input = prepareStates(dt, sequenceDirectory, "input_shape.txt"); - HashMap inputDescMap; - for (auto iter: input) - inputDescMap[iter.first] = iter.second.get_desc(); - pipeline->reready(inputDescMap); - for (auto iter: input) { - U8* tensorPointer = iter.second.get_val(); - pipeline->copy_to_named_input(iter.first, tensorPointer); - } - - double timeBegin = ut_time_ms(); - pipeline->run(); - double timeEnd = ut_time_ms(); - totalTime += (timeEnd - timeBegin); - std::vector output; - for (auto outputTensorName: outputTensorNames) { - Tensor outputTensor = pipeline->get_tensor_by_name(outputTensorName); - output.push_back(outputTensor); - } - falseResult += verify(output[0], 
subNetworkName); - saveStates(pipeline, sequenceDirectory, "output_name.txt", "output_shape.txt"); - } - UTIL_TIME_STATISTICS - - std::cout << "[SUMMARY]:" << std::endl; - U32 validSequence = loops; - CI_info("text to speech correct rate: " << 100.0 * (validSequence - falseResult) / validSequence << " %"); - CI_info("avg_time:" << 1.0 * totalTime / validSequence << "ms/sequence"); - if (falseResult > 0) { - std::cerr << "[ERROR] verify failed" << std::endl; - exit(1); - } - - return 0; -} diff --git a/model-tools/CMakeLists.txt b/model-tools/CMakeLists.txt deleted file mode 100644 index 34b5d2d7..00000000 --- a/model-tools/CMakeLists.txt +++ /dev/null @@ -1,34 +0,0 @@ -cmake_minimum_required(VERSION 3.2) - -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) -if (BOLT_CONFIGURE_FILE) - include(${BOLT_CONFIGURE_FILE}) -else (BOLT_CONFIGURE_FILE) - message(FATAL_ERROR " -FATAL: can not find bolt.cmake in directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (BOLT_CONFIGURE_FILE) - -project(model-tools) - -set_policy() - -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") -find_package(Uni) -find_package(ModelTools) - -set_project_install_directory() - -set_c_cxx_flags() - -add_subdirectory(src) -if (USE_CAFFE) - add_subdirectory(src/caffe) -endif(USE_CAFFE) -if (USE_ONNX) - add_subdirectory(src/onnx) -endif(USE_ONNX) -if (USE_TFLITE) - add_subdirectory(src/tflite) -endif(USE_TFLITE) diff --git a/model-tools/include/OPOptimizers/BNScaleOptimizer.hpp b/model-tools/include/OPOptimizers/BNScaleOptimizer.hpp deleted file mode 100644 index be198c1a..00000000 --- a/model-tools/include/OPOptimizers/BNScaleOptimizer.hpp +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
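// tts.cpp's verify above is a checksum test: it sums every output element and
// accepts the run when the sum lands inside a per-subnetwork tolerance band (for
// example 6921 +/- 1100 for encoder_decoder). The same idea in isolation:
#include <cmath>

int verifyChecksum(const float *data, unsigned n, float expectedSum, float tolerance)
{
    float sum = 0;
    for (unsigned i = 0; i < n; i++) {
        sum += data[i];
    }
    return (std::fabs(sum - expectedSum) < tolerance) ? 0 : 1;  // 0 = pass, 1 = fail
}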
- - -#ifndef _H_BNSCALEOPTIMIZER -#define _H_BNSCALEOPTIMIZER - -#include -#include -#include -#include "model_tools.h" -#include "op_type.h" -#include "OPOptimizer.hpp" - -// Fold remaining BN into the following scale -class BNScaleOptimizer: public OPOptimizer { - virtual bool optimize(ModelSpec* spec) override { - const int queryNum = 1; - OperatorType queryOps[queryNum] = {OT_Scale}; - - bool hasOptimized = false; - for (int i = 1; i < spec->num_operator_specs; i++) { - if (spec->ops[i].type == OT_BatchNorm) { - int bnOpIndex = i; - int scaleOpIndex = searchOperatorIndexForward(spec, i+1, queryOps, queryNum); - if (scaleOpIndex == -1) { - std::cout << "[WARNING] encounter unoptimized BatchNorm layer (no Scale following): " << spec->ops[i].name << std::endl; - continue; - } - - // tensor relationship rewrite - str_copy(spec->ops[scaleOpIndex].input_tensors_name[0], spec->ops[bnOpIndex].input_tensors_name[0], NAME_LEN); - hasOptimized = true; - - // bn - int bnWeightIndex = searchWeightIndex(spec, spec->ops[bnOpIndex].name); - CHECK_REQUIREMENT(bnWeightIndex >= 0); - CHECK_REQUIREMENT(spec->ws[bnWeightIndex].mdt == DT_F32); - F32 epsCur = spec->ops[bnOpIndex].ps.bn_spec.eps; - F32 gamaCur = spec->ops[bnOpIndex].ps.bn_spec.gama; - U32 channelCur = spec->ws[bnWeightIndex].bytes_of_weight / bytesOf(spec->ws[bnWeightIndex].mdt); - F32* meanPtr = (F32 *)spec->ws[bnWeightIndex].weight; - F32* varPtr = (F32 *)spec->ws[bnWeightIndex].vec; - - std::vector stdValue(channelCur); - for (U32 j=0; j < channelCur; j++) { - stdValue[j] = sqrt(gamaCur * varPtr[j] + epsCur); - } - - // scale - int scaleWeightIndex = searchWeightIndex(spec, spec->ops[scaleOpIndex].name); - CHECK_REQUIREMENT(scaleWeightIndex >= 0); - CHECK_REQUIREMENT(spec->ws[scaleWeightIndex].mdt == DT_F32); - U32 channelAlpha = spec->ws[scaleWeightIndex].bytes_of_weight / bytesOf(spec->ws[scaleWeightIndex].mdt); - CHECK_REQUIREMENT(channelAlpha == channelCur); - - if (spec->ws[scaleWeightIndex].vec == nullptr) { - spec->ws[scaleWeightIndex].bytes_of_vec = channelCur * sizeof(F32); - spec->ws[scaleWeightIndex].vec = (U8 *)mt_new_storage(spec->ws[scaleWeightIndex].bytes_of_vec); - memset(spec->ws[scaleWeightIndex].vec, 0, spec->ws[scaleWeightIndex].bytes_of_vec); - } - - F32* alphaPtr = (F32*)spec->ws[scaleWeightIndex].weight; - F32* betaPtr = (F32*)spec->ws[scaleWeightIndex].vec; - - for (U32 m = 0; m < channelCur; m++) { - alphaPtr[m] /= stdValue[m]; - betaPtr[m] = betaPtr[m] - alphaPtr[m] * gamaCur * meanPtr[m]; - } - // free BN memory - if(spec->ws[bnWeightIndex].weight != nullptr) { - spec->ws[bnWeightIndex].bytes_of_weight = 0; - delete [] spec->ws[bnWeightIndex].weight; - spec->ws[bnWeightIndex].weight = nullptr; - } - if(spec->ws[bnWeightIndex].vec != nullptr) { - spec->ws[bnWeightIndex].bytes_of_vec = 0; - delete [] spec->ws[bnWeightIndex].vec; - spec->ws[bnWeightIndex].vec = nullptr; - } - setOperatorInvalid(spec, bnOpIndex); - - // If the previous OP is Concat, we need to take care of the possible padded channels before Concat. 
- const int queryNum = 1; - OperatorType queryOps[queryNum] = {OT_Concat}; - - int concatOpIndex = searchOperatorIndexBackward(spec, i-1, queryOps, queryNum); - if (concatOpIndex != -1) { - spec->ops[scaleOpIndex].ps.scale_spec.num_concat = spec->ops[concatOpIndex].num_inputs; - // Rename concat output and scale input to avoid desc differences for inplace tensor - std::string oldName = spec->ops[concatOpIndex].output_tensors_name[0]; - std::string breakName = "break_" + oldName; - str_copy(spec->ops[concatOpIndex].output_tensors_name[0], breakName.c_str(), NAME_LEN); - str_copy(spec->ops[scaleOpIndex].input_tensors_name[0], breakName.c_str(), NAME_LEN); - } - } - } - return hasOptimized; - } -}; -#endif diff --git a/model-tools/include/OPOptimizers/ChannelPaddingOptimizer.hpp b/model-tools/include/OPOptimizers/ChannelPaddingOptimizer.hpp deleted file mode 100644 index 393f8d89..00000000 --- a/model-tools/include/OPOptimizers/ChannelPaddingOptimizer.hpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
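// BNScaleOptimizer above folds a Caffe-style BatchNorm into the Scale layer that
// follows it. With s_c = sqrt(g * var_c + eps), where g is the stored scale
// factor, the fused Scale becomes alpha_c' = alpha_c / s_c and
// beta_c' = beta_c - alpha_c' * g * mean_c. The per-channel arithmetic in
// isolation (illustrative signature):
#include <cmath>
#include <cstddef>
#include <vector>

void foldBNIntoScale(std::vector<float> &alpha, std::vector<float> &beta,
    const std::vector<float> &mean, const std::vector<float> &var, float g, float eps)
{
    for (std::size_t c = 0; c < alpha.size(); c++) {
        float s = std::sqrt(g * var[c] + eps);
        alpha[c] /= s;                      // scale absorbs 1/std
        beta[c] -= alpha[c] * g * mean[c];  // bias absorbs the normalized mean
    }
}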
- - -#ifndef _H_CHANNELPADDINGOPTIMIZER -#define _H_CHANNELPADDINGOPTIMIZER - -#include -#include -#include "model_tools.h" -#include "OPOptimizer.hpp" - -class ChannelPaddingOptimizer: public OPOptimizer { - virtual bool optimize(ModelSpec* spec) override { - bool hasOptimized = false; // If padding optimization has never been done, we do not need to check the number of input channels - for (int i = 0; i < spec->num_operator_specs; i++) { - bool padding = false; // Whether to check input channels and actually pad - bool optimizeOC = false; - U32 numKernels = 0; - U32 kernelSizeH = 0; - U32 kernelSizeW = 0; - if (spec->ops[i].type == OT_Conv || spec->ops[i].type == OT_Deconvolution) { - if (spec->ops[i].ps.conv_spec.convolution_type != Convolution_Pointwise && spec->ops[i].ps.conv_spec.convolution_type != Convolution_Deconvolution) { - // Depthwise not supported for the time being - continue; - } - - numKernels = spec->ops[i].ps.conv_spec.num_outputs; - kernelSizeH = spec->ops[i].ps.conv_spec.kernel_size_h; - kernelSizeW = spec->ops[i].ps.conv_spec.kernel_size_w; - if (numKernels % 8 != 0) { // Check output channels - optimizeOC = true; - } - padding = hasOptimized || optimizeOC; // If padding has been done before, we need to check the input channels as well - } else if (spec->ops[i].type == OT_FC) { - numKernels = spec->ops[i].ps.fc_spec.num_outputs; - kernelSizeH = 1; - kernelSizeW = 1; - padding = hasOptimized; - } else { - continue; - } - - if (padding) { - int weightIndex = searchWeightIndex(spec, spec->ops[i].name); - CHECK_REQUIREMENT(weightIndex >= 0); - CHECK_REQUIREMENT(spec->ws[weightIndex].mdt == DT_F32); // BNN not supported for the time being - U32 weightSize = spec->ws[weightIndex].bytes_of_weight / bytesOf(spec->ws[weightIndex].mdt); - U32 inputChannels = weightSize / (numKernels * kernelSizeH * kernelSizeW); - if (inputChannels % 8 == 0 && false == optimizeOC) { - continue; - } - - U32 numKernelsNew = optimizeOC ? ((numKernels / 8 + 1) * 8) : numKernels; - U32 inputChannelsNew = (inputChannels % 8) ? 
((inputChannels / 8 + 1) * 8) : inputChannels; - - U8 *weight = spec->ws[weightIndex].weight; - U8 *vec = spec->ws[weightIndex].vec; - U32 vecBytes = spec->ws[weightIndex].bytes_of_vec; - spec->ws[weightIndex].bytes_of_weight = bytesOf(spec->ws[weightIndex].mdt) - * numKernelsNew * inputChannelsNew * kernelSizeH * kernelSizeW; - spec->ws[weightIndex].bytes_of_vec = bytesOf(spec->ws[weightIndex].mdt) * numKernelsNew; - spec->ws[weightIndex].weight = (U8 *)mt_new_storage(spec->ws[weightIndex].bytes_of_weight); - spec->ws[weightIndex].vec = (U8 *)mt_new_storage(spec->ws[weightIndex].bytes_of_vec); - memset(spec->ws[weightIndex].weight, 0, spec->ws[weightIndex].bytes_of_weight); - memset(spec->ws[weightIndex].vec, 0, spec->ws[weightIndex].bytes_of_vec); - if (spec->ops[i].type == OT_Conv) - spec->ops[i].ps.conv_spec.num_outputs = numKernelsNew; - if (spec->ops[i].type == OT_FC) - spec->ops[i].ps.fc_spec.num_outputs = numKernelsNew; - // process weight - U32 blockSize = bytesOf(spec->ws[weightIndex].mdt) * kernelSizeH * kernelSizeW; - for (U32 oc = 0; oc < numKernels; oc++) { - for (U32 ic = 0; ic < inputChannels; ic++) { - U32 oldIndex = (oc * inputChannels + ic) * blockSize; - U32 newIndex = (oc * inputChannelsNew + ic) * blockSize; - memcpy(spec->ws[weightIndex].weight + newIndex, weight + oldIndex, blockSize); - } - } - delete [] weight; - // process bias - if(vec != nullptr) { - memcpy(spec->ws[weightIndex].vec, vec, vecBytes); - delete [] vec; - } - - hasOptimized = true; - } - } - return hasOptimized; - } -}; -#endif diff --git a/model-tools/include/OPOptimizers/ClipClipOptimizer.hpp b/model-tools/include/OPOptimizers/ClipClipOptimizer.hpp deleted file mode 100644 index d6c1ae3a..00000000 --- a/model-tools/include/OPOptimizers/ClipClipOptimizer.hpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
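// ChannelPaddingOptimizer above rounds channel counts up to the next multiple of 8
// and repacks the NCHW weight blob so each output channel's block sits at the
// padded input-channel stride, zero-filling the new space. A sketch of the
// rounding and repack (illustrative, F32 only, always padding both dimensions):
#include <cstddef>
#include <cstring>
#include <vector>

static unsigned padTo8(unsigned c)
{
    return (c % 8) ? (c / 8 + 1) * 8 : c;
}

std::vector<float> repackWeights(const float *w, unsigned oc, unsigned ic, unsigned khw)
{
    unsigned icNew = padTo8(ic);
    std::vector<float> out((std::size_t)padTo8(oc) * icNew * khw, 0.0f);  // padding stays zero
    for (unsigned o = 0; o < oc; o++) {
        for (unsigned i = 0; i < ic; i++) {
            std::memcpy(&out[((std::size_t)o * icNew + i) * khw],
                &w[((std::size_t)o * ic + i) * khw], khw * sizeof(float));
        }
    }
    return out;
}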
- - -#ifndef _H_CLIPCLIPOPTIMIZER -#define _H_CLIPCLIPOPTIMIZER - -#include -#include -#include "model_tools.h" -#include "OPOptimizer.hpp" - -class ClipClipOptimizer: public OPOptimizer { - virtual bool optimize(ModelSpec* spec) override { - const int queryNum = 1; - OperatorType queryOps[queryNum] = {OT_Clip}; - bool hasOptimized = false; - for (int i = 1; i < spec->num_operator_specs; i++) { - if (spec->ops[i].type == OT_Clip) { - int opIndex0 = i; - int opIndex1 = searchOperatorIndexForward(spec, opIndex0+1, queryOps, queryNum); - if (opIndex1 == -1) - continue; - - str_copy(spec->ops[opIndex0].output_tensors_name[0], spec->ops[opIndex1].output_tensors_name[0], NAME_LEN); - hasOptimized = true; - spec->ops[opIndex0].ps.clip_spec.min = UNI_MAX(spec->ops[opIndex0].ps.clip_spec.min, - spec->ops[opIndex1].ps.clip_spec.min); - spec->ops[opIndex0].ps.clip_spec.max = UNI_MIN(spec->ops[opIndex0].ps.clip_spec.max, - spec->ops[opIndex1].ps.clip_spec.max); - setOperatorInvalid(spec, opIndex1); - i = opIndex1; - } - } - return hasOptimized; - } -}; -#endif diff --git a/model-tools/include/OPOptimizers/ConstUpsampleOptimizer.hpp b/model-tools/include/OPOptimizers/ConstUpsampleOptimizer.hpp deleted file mode 100644 index 997a328e..00000000 --- a/model-tools/include/OPOptimizers/ConstUpsampleOptimizer.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
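// ClipClipOptimizer above merges two consecutive Clip layers into one by
// intersecting their ranges, since clip(clip(x, a), b) equals clipping x to
// [max(a.min, b.min), min(a.max, b.max)]. In isolation:
#include <algorithm>

struct ClipRange {
    float min;
    float max;
};

ClipRange mergeClips(ClipRange a, ClipRange b)
{
    return {std::max(a.min, b.min), std::min(a.max, b.max)};
}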
- - -#ifndef _H_CONSTUPSAMPLEOPTIMIZER -#define _H_CONSTUPSAMPLEOPTIMIZER - -#include -#include -#include "model_tools.h" -#include "OPOptimizer.hpp" - -class ConstUpsampleOptimizer: public OPOptimizer { - virtual bool optimize(ModelSpec* spec) override { - const int queryNum = 1; - OperatorType queryOps[queryNum] = {OT_Constant}; - bool hasOptimized = false; - for (int i = 1; i < spec->num_operator_specs; i++) { - if (spec->ops[i].type == OT_Upsample) { - int upsampleOpIndex = i; - int constOpIndex = searchOperatorIndexBackward(spec, i-1, queryOps, queryNum); - if (constOpIndex == -1) { - std::cout << "[WARNING] encounter unoptimized Upsample (no Constant before): " << spec->ops[upsampleOpIndex].name << std::endl; - continue; - } - - CHECK_REQUIREMENT(spec->ops[constOpIndex].ps.const_f32_spec.size == 4); - memcpy(spec->ops[upsampleOpIndex].ps.upsample_spec.scale, spec->ops[constOpIndex].ps.const_f32_spec.values, 4*bytesOf(DT_F32)); - delete [] spec->ops[constOpIndex].ps.const_f32_spec.values; - setOperatorInvalid(spec, constOpIndex); - } - } - return hasOptimized; - } -}; -#endif diff --git a/model-tools/include/OPOptimizers/ConvActivationOptimizer.hpp b/model-tools/include/OPOptimizers/ConvActivationOptimizer.hpp deleted file mode 100644 index bf81f398..00000000 --- a/model-tools/include/OPOptimizers/ConvActivationOptimizer.hpp +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#ifndef _H_CONVACTIVATIONOPTIMIZER -#define _H_CONVACTIVATIONOPTIMIZER - -#include -#include -#include "model_tools.h" -#include "OPOptimizer.hpp" - -// Optimize both Convolution and Deconvolution -class ConvActivationOptimizer: public OPOptimizer { - virtual bool optimize(ModelSpec* spec) override { - const int queryNum = 2; - OperatorType queryOps[queryNum] = {OT_Conv, OT_Deconvolution}; - bool hasOptimized = false; - for (int i = 1; i< spec->num_operator_specs; i++) { - OperatorType curOT = spec->ops[i].type; - if (curOT == OT_Relu && spec->ops[i].ps.relu_spec.neg_slope == 0) { - if (spec->ops[i].num_inputs > 0 && spec->ops[i].num_outputs > 0) { - std::string inputName = spec->ops[i].input_tensors_name[0]; - std::string outputName = spec->ops[i].output_tensors_name[0]; - if (inputName != outputName) { - std::cout << "[WARNING] encounter unoptimized Relu layer (not inPlace): " << spec->ops[i].name << std::endl; - continue; - } - } - int atOpIndex = i; - int convOpIndex = searchOperatorIndexBackward(spec, atOpIndex - 1, queryOps, queryNum); - - if (convOpIndex == -1) { - std::cout << "[WARNING] encounter unoptimized Relu layer (no Conv or Deconv before): " << spec->ops[atOpIndex].name << std::endl; - continue; - } - - // tensor relationship rewrite - str_copy(spec->ops[convOpIndex].output_tensors_name[0], spec->ops[atOpIndex].output_tensors_name[0], NAME_LEN); - hasOptimized = true; - switch (spec->ops[convOpIndex].ps.conv_spec.convolution_type) { - case Convolution_Pointwise: { - spec->ops[convOpIndex].ps.conv_spec.pw_activation_type = ACTIVATION_RELU; - break; - } - case Convolution_Deconvolution: { - spec->ops[convOpIndex].ps.conv_spec.pw_activation_type = ACTIVATION_RELU; - break; - } - case Convolution_Depthwise: { - spec->ops[convOpIndex].ps.conv_spec.dw_activation_type = ACTIVATION_RELU; - break; - } - case Convolution_Dilation: { - spec->ops[convOpIndex].ps.conv_spec.dw_activation_type = ACTIVATION_RELU; - break; - } - default: { - CHECK_REQUIREMENT(0); - break; - } - } - spec->ops[convOpIndex].ps.conv_spec.activation_spec.relu_spec = spec->ops[atOpIndex].ps.relu_spec; - setOperatorInvalid(spec, atOpIndex); - } - if (spec->ops[i].type == OT_Clip || spec->ops[i].type == OT_Sigmoid) { - // tensor_computing does not support fusion - std::cout << "[Info] encounter unoptimized " << OperatorTypeName()[spec->ops[i].type] << " layer" < -#include -#include -#include "model_tools.h" -#include "OPOptimizer.hpp" - -class ConvBNOptimizer: public OPOptimizer { - virtual bool optimize(ModelSpec* spec) override { - const int queryNum = 3; - OperatorType queryOps[queryNum] = {OT_Conv, OT_FC, OT_Deconvolution}; - - bool hasOptimized = false; - for (int i = 1; i < spec->num_operator_specs; i++) { - if (spec->ops[i].type == OT_BatchNorm) { - int bnOpIndex = i; - int convOpIndex = searchOperatorIndexBackward(spec, i-1, queryOps, queryNum); - if (convOpIndex == -1) { - std::cout << "[Info] encounter unoptimize BatchNorm layer(no Conv before): " << spec->ops[i].name << std::endl; - continue; - } - - // tensor relationship rewrite - str_copy(spec->ops[convOpIndex].output_tensors_name[0], spec->ops[bnOpIndex].output_tensors_name[0], NAME_LEN); - hasOptimized = true; - - // bn - int bnWeightIndex = searchWeightIndex(spec, spec->ops[bnOpIndex].name); - CHECK_REQUIREMENT(bnWeightIndex >= 0); - CHECK_REQUIREMENT(spec->ws[bnWeightIndex].mdt == DT_F32); - F32 epsCur = spec->ops[bnOpIndex].ps.bn_spec.eps; - F32 gamaCur = spec->ops[bnOpIndex].ps.bn_spec.gama; - U32 channelCur = 
spec->ws[bnWeightIndex].bytes_of_weight / bytesOf(spec->ws[bnWeightIndex].mdt); - F32* meanPtr = (F32 *)spec->ws[bnWeightIndex].weight; - F32* varPtr = (F32 *)spec->ws[bnWeightIndex].vec; - - std::vector<F32> stdValue(channelCur); - for (U32 j=0; j < channelCur; j++) { - stdValue[j] = sqrt(gamaCur * varPtr[j] + epsCur); - } - - // conv - int convWeightIndex = searchWeightIndex(spec, spec->ops[convOpIndex].name); - CHECK_REQUIREMENT(convWeightIndex >= 0); - // Now weight mdt can be DT_BIN01 or DT_BIN11 - U32 isBNN = 0; - if (spec->ws[convWeightIndex].mdt == DT_BIN01 || spec->ws[convWeightIndex].mdt == DT_BIN11) { - isBNN = 1; - } - F32* weightTemp = (F32*)spec->ws[convWeightIndex].weight; - if(spec->ws[convWeightIndex].vec == nullptr){ - spec->ws[convWeightIndex].bytes_of_vec = channelCur * sizeof(F32); - if (isBNN == 1) { - spec->ws[convWeightIndex].bytes_of_vec *= 2; - } - spec->ws[convWeightIndex].vec = (U8 *)mt_new_storage(spec->ws[convWeightIndex].bytes_of_vec); - memset(spec->ws[convWeightIndex].vec, 0, spec->ws[convWeightIndex].bytes_of_vec); - } - F32* vecTemp = (F32*)spec->ws[convWeightIndex].vec; - if (isBNN == 1) { // Do not modify weights for BNN - F32* scale = vecTemp; - F32* bias = vecTemp + channelCur; - for (U32 m = 0; m < channelCur; m++) { - scale[m] = 1.0 / stdValue[m]; // This is the first possible source of a meaningful scale, so just initialize - bias[m] = (bias[m] - meanPtr[m]) / stdValue[m]; - } - } else { - int weightDataSize = spec->ws[convWeightIndex].bytes_of_weight / bytesOf(spec->ws[convWeightIndex].mdt); - int weightPerChannel = weightDataSize / channelCur; - // NCHW - for (U32 m = 0; m < channelCur; m++) { - F32* convWeightPerChannel = weightTemp + weightPerChannel * m; - for (int n = 0; n < weightPerChannel; n++) { - convWeightPerChannel[n] /= stdValue[m]; - } - vecTemp[m] = (vecTemp[m] - gamaCur * meanPtr[m]) / stdValue[m]; - } - } - // free BN memory - if(spec->ws[bnWeightIndex].weight != nullptr) { - spec->ws[bnWeightIndex].bytes_of_weight = 0; - delete [] spec->ws[bnWeightIndex].weight; - spec->ws[bnWeightIndex].weight = nullptr; - } - if(spec->ws[bnWeightIndex].vec != nullptr) { - spec->ws[bnWeightIndex].bytes_of_vec = 0; - delete [] spec->ws[bnWeightIndex].vec; - spec->ws[bnWeightIndex].vec = nullptr; - } - setOperatorInvalid(spec, bnOpIndex); - } - } - return hasOptimized; - } -}; -#endif diff --git a/model-tools/include/OPOptimizers/ConvScaleOptimizer.hpp b/model-tools/include/OPOptimizers/ConvScaleOptimizer.hpp deleted file mode 100644 index 77a2ecee..00000000 --- a/model-tools/include/OPOptimizers/ConvScaleOptimizer.hpp +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
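// ConvBNOptimizer above folds BatchNorm statistics straight into the preceding
// convolution: with s_c = sqrt(g * var_c + eps), each output channel's weights
// are divided by s_c and the bias becomes (b_c - g * mean_c) / s_c. The non-BNN
// branch in isolation (F32, contiguous per-output-channel weight blocks):
#include <cmath>
#include <cstddef>

void foldBNIntoConv(float *weights, float *bias, const float *mean, const float *var,
    unsigned channels, unsigned weightsPerChannel, float g, float eps)
{
    for (unsigned c = 0; c < channels; c++) {
        float s = std::sqrt(g * var[c] + eps);
        float *wc = weights + (std::size_t)c * weightsPerChannel;
        for (unsigned i = 0; i < weightsPerChannel; i++) {
            wc[i] /= s;                         // weights absorb 1/std
        }
        bias[c] = (bias[c] - g * mean[c]) / s;  // bias absorbs the shifted mean
    }
}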
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_CONVSCALEOPTIMIZER -#define _H_CONVSCALEOPTIMIZER - -#include -#include -#include "model_tools.h" -#include "OPOptimizer.hpp" - -class ConvScaleOptimizer: public OPOptimizer { - virtual bool optimize(ModelSpec* spec) override { - const int queryNum = 3; - OperatorType queryOps[queryNum] = {OT_Conv, OT_FC, OT_Deconvolution}; - - bool hasOptimized = false; - for (int i = 1; i < spec->num_operator_specs; i++) { - if (spec->ops[i].type == OT_Scale) { - int scaleOpIndex = i; - if (spec->ops[scaleOpIndex].num_inputs > 1){ - std::cout << "[WARNING] encounter unoptimize Scale layer(multi-inputs): " << spec->ops[i].name <ops[i].name <ops[convOpIndex].output_tensors_name[0], spec->ops[scaleOpIndex].output_tensors_name[0], NAME_LEN); - hasOptimized = true; - - // scale - int scaleWeightIndex = searchWeightIndex(spec, spec->ops[scaleOpIndex].name); - CHECK_REQUIREMENT(scaleWeightIndex >= 0); - CHECK_REQUIREMENT(spec->ws[scaleWeightIndex].mdt == DT_F32); - U32 channelAlpha = spec->ws[scaleWeightIndex].bytes_of_weight / bytesOf(spec->ws[scaleWeightIndex].mdt); - U32 channelBeta = spec->ws[scaleWeightIndex].bytes_of_vec / bytesOf(spec->ws[scaleWeightIndex].mdt); - U32 channelCur = UNI_MAX(channelAlpha, channelBeta); - F32* alphaPtr = (F32*)spec->ws[scaleWeightIndex].weight; - F32* betaPtr = (F32*)spec->ws[scaleWeightIndex].vec; - - // conv - int convWeightIndex = searchWeightIndex(spec, spec->ops[convOpIndex].name); - CHECK_REQUIREMENT(convWeightIndex >= 0); - // mdt can now be DT_BIN01 or DT_BIN11 - U32 isBNN = 0; - if (spec->ws[convWeightIndex].mdt==DT_BIN01 || spec->ws[convWeightIndex].mdt==DT_BIN11) { - isBNN = 1; - } - F32* weightTemp = (F32*)spec->ws[convWeightIndex].weight; - if (spec->ws[convWeightIndex].vec == nullptr) { - spec->ws[convWeightIndex].bytes_of_vec = channelCur * sizeof(F32); - if (isBNN == 1) { - spec->ws[convWeightIndex].bytes_of_vec *= 2; - } - spec->ws[convWeightIndex].vec = (U8 *)mt_new_storage(spec->ws[convWeightIndex].bytes_of_vec); - memset(spec->ws[convWeightIndex].vec, 0, spec->ws[convWeightIndex].bytes_of_vec); - } - F32* vecTemp = (F32*)spec->ws[convWeightIndex].vec; - if (isBNN == 1) { - F32 *scale = vecTemp; - F32 *bias = vecTemp + channelCur; - for (U32 m = 0; m < channelCur; m++) { - if (scale[m] == 0) { - scale[m] = alphaPtr[m]; - } else { - scale[m] *= alphaPtr[m]; - } - bias[m] *= alphaPtr[m]; - if (betaPtr != nullptr) { - bias[m] += betaPtr[m]; - } - } - } else { - int weightDataSize = spec->ws[convWeightIndex].bytes_of_weight / bytesOf(spec->ws[convWeightIndex].mdt); - int weightPerChannel = weightDataSize / channelCur; - // NCHW - for (U32 m = 0; m < channelCur; m++){ - F32* convWeightPerChannel = weightTemp + weightPerChannel * m; - if (alphaPtr != nullptr) { - for(int n = 0; n < weightPerChannel; n++){ - convWeightPerChannel[n] *= alphaPtr[m]; - } - vecTemp[m] = alphaPtr[m] * vecTemp[m]; - } - if (betaPtr != nullptr) { - vecTemp[m] += betaPtr[m]; - } - } - } - // free scale memory - if (spec->ws[scaleWeightIndex].weight != nullptr) { - spec->ws[scaleWeightIndex].bytes_of_weight = 0; - delete [] spec->ws[scaleWeightIndex].weight; - spec->ws[scaleWeightIndex].weight = nullptr; - } - if(spec->ws[scaleWeightIndex].vec != nullptr) { - spec->ws[scaleWeightIndex].bytes_of_vec = 0; - 
delete [] spec->ws[scaleWeightIndex].vec; - spec->ws[scaleWeightIndex].vec = nullptr; - } - setOperatorInvalid(spec, scaleOpIndex); - } - } - return hasOptimized; - } -}; -#endif diff --git a/model-tools/include/OPOptimizers/DeprecatedOPOptimizer.hpp b/model-tools/include/OPOptimizers/DeprecatedOPOptimizer.hpp deleted file mode 100644 index cb559d69..00000000 --- a/model-tools/include/OPOptimizers/DeprecatedOPOptimizer.hpp +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_DEPRECATEDOPOPTIMIZER -#define _H_DEPRECATEDOPOPTIMIZER - -#include -#include -#include "model_tools.h" -#include "OPOptimizer.hpp" - -class DeprecatedOPOptimizer: public OPOptimizer { - virtual bool optimize(ModelSpec* spec) override { - bool hasOptimized = false; - - for (int i = 0; i< spec->num_operator_specs; i++) { - if (spec->ops[i].type == OT_None && i > 0) { - str_copy(spec->ops[i - 1].output_tensors_name[0], spec->ops[i].output_tensors_name[0], NAME_LEN); - hasOptimized = true; - continue; - } - - if (spec->ops[i].type == OT_Pad) { - if(spec->ops[i].ps.pad_spec.top == 0 && spec->ops[i].ps.pad_spec.bottom == 0 && - spec->ops[i].ps.pad_spec.left == 0 && spec->ops[i].ps.pad_spec.right == 0){ - str_copy(spec->ops[i + 1].input_tensors_name[0], spec->ops[i].input_tensors_name[0], NAME_LEN); - hasOptimized = true; - spec->ops[i].type = OT_None; // trick - continue; - } - } - - if(spec->ops[i].type == OT_Input) { - spec->ops[i].type = OT_None; // trick - continue; - } - } - return hasOptimized; - } - public: - static bool isDeprecatedOp(OperatorType opType){ - if (opType == OT_None) { - return true; - } else { - return false; - } - } - static bool isDeprecatedOpWeight(const ModelSpec* spec, int index){ - if (index >= spec->num_weight_specs) { - return true; - } else { - if (spec->ws[index].bytes_of_weight == 0 && spec->ws[index].bytes_of_vec == 0) { - return true; - } else { - return false; - } - } - } -}; -#endif diff --git a/model-tools/include/OPOptimizers/DepthwisePointwiseOptimizer.hpp b/model-tools/include/OPOptimizers/DepthwisePointwiseOptimizer.hpp deleted file mode 100644 index bb7e3355..00000000 --- a/model-tools/include/OPOptimizers/DepthwisePointwiseOptimizer.hpp +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
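// ConvScaleOptimizer above absorbs a per-channel Scale (y = alpha * x + beta)
// into the preceding convolution, so the Scale op can be dropped. The
// per-channel update in isolation (F32, illustrative signature):
#include <cstddef>

void foldScaleIntoConv(float *weights, float *bias, const float *alpha,
    const float *beta, unsigned channels, unsigned weightsPerChannel)
{
    for (unsigned c = 0; c < channels; c++) {
        float *wc = weights + (std::size_t)c * weightsPerChannel;
        for (unsigned i = 0; i < weightsPerChannel; i++) {
            wc[i] *= alpha[c];                   // weights absorb the scale
        }
        bias[c] = alpha[c] * bias[c] + beta[c];  // bias absorbs scale and shift
    }
}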
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_DEPTHWISEPOINTWISEOPTIMIZER -#define _H_DEPTHWISEPOINTWISEOPTIMIZER - -#include -#include -#include "model_tools.h" -#include "OPOptimizer.hpp" - -class DepthwisePointwiseOptimizer: public OPOptimizer { - virtual bool optimize(ModelSpec* spec) override { - const int queryNum = 1; - OperatorType queryOps[queryNum] = {OT_Conv}; - - bool hasOptimized = false; - for (int i = 1; i < spec->num_operator_specs; i++) { - // process depthwise convolution - if (spec->ops[i].type == OT_Conv && spec->ops[i].ps.conv_spec.convolution_type == Convolution_Depthwise) { - int dwConvOpIndex = i; - int convOpIndex = searchOperatorIndexForward(spec, i + 1, queryOps, queryNum); - if (convOpIndex == -1 || \ - spec->ops[convOpIndex].ps.conv_spec.convolution_type != Convolution_Pointwise) { - std::cout << "[WARNING] encounter unoptimize Depthwise Convolution(no Pointwise Convolution after): " << spec->ops[dwConvOpIndex].name << std::endl; - continue; - } - - // reallocate weights and bias - int dwConvWeightIndex = searchWeightIndex(spec, spec->ops[dwConvOpIndex].name); - int convWeightIndex = searchWeightIndex(spec, spec->ops[convOpIndex].name); - CHECK_REQUIREMENT(dwConvWeightIndex != -1); - CHECK_REQUIREMENT(convWeightIndex != -1); - CHECK_REQUIREMENT(spec->ws[dwConvWeightIndex].mdt == DT_F32); - CHECK_REQUIREMENT(spec->ws[convWeightIndex].mdt == DT_F32); - - U32 weightSize = spec->ws[dwConvWeightIndex].bytes_of_weight + spec->ws[convWeightIndex].bytes_of_weight; - U8* weight = (U8 *)mt_new_storage(weightSize); - memcpy(weight, spec->ws[dwConvWeightIndex].weight, spec->ws[dwConvWeightIndex].bytes_of_weight); - memcpy(weight + spec->ws[dwConvWeightIndex].bytes_of_weight, - spec->ws[convWeightIndex].weight, - spec->ws[convWeightIndex].bytes_of_weight); - - U32 vecSize = sizeof(F32) * (spec->ops[dwConvOpIndex].ps.conv_spec.num_outputs \ - + spec->ops[convOpIndex].ps.conv_spec.num_outputs); - U8* vec = (U8 *)mt_new_storage(vecSize); - U8* ptr = vec; - if (spec->ws[dwConvWeightIndex].bytes_of_vec == 0) { - memset(ptr, 0, sizeof(F32)*(spec->ops[dwConvOpIndex].ps.conv_spec.num_outputs)); - } else { - CHECK_REQUIREMENT(sizeof(F32)*(spec->ops[dwConvOpIndex].ps.conv_spec.num_outputs) == spec->ws[dwConvWeightIndex].bytes_of_vec); - memcpy(ptr, spec->ws[dwConvWeightIndex].vec, spec->ws[dwConvWeightIndex].bytes_of_vec); - } - ptr = vec + sizeof(F32)*(spec->ops[dwConvOpIndex].ps.conv_spec.num_outputs); - if (spec->ws[convWeightIndex].bytes_of_vec == 0) { - memset(ptr, 0, 
sizeof(F32)*(spec->ops[convOpIndex].ps.conv_spec.num_outputs)); - } else { - CHECK_REQUIREMENT(sizeof(F32)*(spec->ops[convOpIndex].ps.conv_spec.num_outputs) == spec->ws[convWeightIndex].bytes_of_vec); - memcpy(ptr, spec->ws[convWeightIndex].vec, spec->ws[convWeightIndex].bytes_of_vec); - } - - // free and reallocate - if (spec->ws[dwConvWeightIndex].weight != nullptr) { - spec->ws[dwConvWeightIndex].bytes_of_weight = 0; - delete [] spec->ws[dwConvWeightIndex].weight; - spec->ws[dwConvWeightIndex].weight = nullptr; - } - if (spec->ws[dwConvWeightIndex].vec != nullptr) { - spec->ws[dwConvWeightIndex].bytes_of_vec = 0; - delete [] spec->ws[dwConvWeightIndex].vec; - spec->ws[dwConvWeightIndex].vec = nullptr; - } - if (spec->ws[convWeightIndex].weight != nullptr) { - spec->ws[convWeightIndex].bytes_of_weight = 0; - delete [] spec->ws[convWeightIndex].weight; - spec->ws[convWeightIndex].weight = nullptr; - } - if (spec->ws[convWeightIndex].vec != nullptr) { - spec->ws[convWeightIndex].bytes_of_vec = 0; - delete [] spec->ws[convWeightIndex].vec; - spec->ws[convWeightIndex].vec = nullptr; - } - - // retain depthwise convolution operator - str_copy(spec->ops[dwConvOpIndex].output_tensors_name[0], spec->ops[convOpIndex].output_tensors_name[0], NAME_LEN); - spec->ops[dwConvOpIndex].ps.conv_spec.num_outputs = spec->ops[convOpIndex].ps.conv_spec.num_outputs; - spec->ops[dwConvOpIndex].ps.conv_spec.convolution_type = Convolution_Depthwise_Pointwise; - spec->ops[dwConvOpIndex].ps.conv_spec.pw_activation_type = spec->ops[convOpIndex].ps.conv_spec.pw_activation_type; - spec->ws[dwConvWeightIndex].bytes_of_weight = weightSize; - spec->ws[dwConvWeightIndex].weight = weight; - spec->ws[dwConvWeightIndex].bytes_of_vec = vecSize; - spec->ws[dwConvWeightIndex].vec = vec; - hasOptimized = true; - - setOperatorInvalid(spec, convOpIndex); - i = convOpIndex; - } - } - return hasOptimized; - } -}; -#endif diff --git a/model-tools/include/OPOptimizers/FCFCOptimizer.hpp b/model-tools/include/OPOptimizers/FCFCOptimizer.hpp deleted file mode 100644 index d216ed9b..00000000 --- a/model-tools/include/OPOptimizers/FCFCOptimizer.hpp +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
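The DepthwisePointwiseOptimizer deleted above relies on a simple buffer layout: the fused Convolution_Depthwise_Pointwise operator reads a single weight allocation holding the depthwise filters followed by the pointwise filters, and a single bias allocation holding the two bias vectors back to back (zero-filled where a layer had no bias, which is what the memset calls handle). A minimal sketch of that packing, using illustrative types rather than the ModelSpec/WeightSpec structures:

#include <vector>

struct Blob { std::vector<float> weight, bias; };  // illustrative, not a Bolt type

// Pack a depthwise blob and a pointwise blob into the [dwWeight | pwWeight] /
// [dwBias | pwBias] layout the fused operator expects. Callers pass zero-filled
// bias vectors when a layer had none, mirroring the memset in the pass above.
Blob fuse(const Blob &dw, const Blob &pw) {
    Blob out;
    out.weight.reserve(dw.weight.size() + pw.weight.size());
    out.weight.insert(out.weight.end(), dw.weight.begin(), dw.weight.end());
    out.weight.insert(out.weight.end(), pw.weight.begin(), pw.weight.end());
    out.bias.reserve(dw.bias.size() + pw.bias.size());
    out.bias.insert(out.bias.end(), dw.bias.begin(), dw.bias.end());
    out.bias.insert(out.bias.end(), pw.bias.begin(), pw.bias.end());
    return out;
}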
- - -#ifndef _H_FCFCOPTIMIZER -#define _H_FCFCOPTIMIZER - -#include -#include -#include "model_tools.h" -#include "OPOptimizer.hpp" - -class FCFCOptimizer: public OPOptimizer { - virtual bool optimize(ModelSpec* spec) override - { - const int queryNum = 1; - OperatorType queryOps[queryNum] = {OT_FC}; - bool hasOptimized = false; - for (int i = 1; i < spec->num_operator_specs; i++) { - if (spec->ops[i].type == OT_FC) { - int curOpIndex = i; - int prevOpIndex = searchOperatorIndexBackward(spec, curOpIndex - 1, queryOps, queryNum); - if (prevOpIndex == -1) { - continue; - } - if (strncmp(spec->ops[curOpIndex].input_tensors_name[0], spec->ops[prevOpIndex].input_tensors_name[0], NAME_LEN)) { - continue; - } - - int prevWeightIndex = searchWeightIndex(spec, spec->ops[prevOpIndex].name); - int curWeightIndex = searchWeightIndex(spec, spec->ops[curOpIndex].name); - CHECK_REQUIREMENT(prevWeightIndex != -1); - CHECK_REQUIREMENT(curWeightIndex != -1); - CHECK_REQUIREMENT(spec->ws[prevWeightIndex].mdt == DT_F32); - CHECK_REQUIREMENT(spec->ws[curWeightIndex].mdt == DT_F32); - - U32 weightSize = spec->ws[prevWeightIndex].bytes_of_weight + spec->ws[curWeightIndex].bytes_of_weight; - U8* weight = (U8 *)mt_new_storage(weightSize); - memcpy(weight, spec->ws[prevWeightIndex].weight, spec->ws[prevWeightIndex].bytes_of_weight); - memcpy(weight + spec->ws[prevWeightIndex].bytes_of_weight, - spec->ws[curWeightIndex].weight, - spec->ws[curWeightIndex].bytes_of_weight); - - U32 vecSize = sizeof(F32) * (spec->ops[prevOpIndex].ps.fc_spec.num_outputs \ - + spec->ops[curOpIndex].ps.fc_spec.num_outputs); - U8* vec = (U8 *)mt_new_storage(vecSize); - U8* ptr = vec; - if (spec->ws[prevWeightIndex].bytes_of_vec == 0) { - memset(ptr, 0, sizeof(F32)*(spec->ops[prevOpIndex].ps.fc_spec.num_outputs)); - } else { - CHECK_REQUIREMENT(sizeof(F32)*(spec->ops[prevOpIndex].ps.fc_spec.num_outputs) == spec->ws[prevWeightIndex].bytes_of_vec); - memcpy(ptr, spec->ws[prevWeightIndex].vec, spec->ws[prevWeightIndex].bytes_of_vec); - } - ptr = vec + sizeof(F32)*(spec->ops[prevOpIndex].ps.fc_spec.num_outputs); - if (spec->ws[curWeightIndex].bytes_of_vec == 0) { - memset(ptr, 0, sizeof(F32)*(spec->ops[curOpIndex].ps.fc_spec.num_outputs)); - } else { - CHECK_REQUIREMENT(sizeof(F32)*(spec->ops[curOpIndex].ps.fc_spec.num_outputs) == spec->ws[curWeightIndex].bytes_of_vec); - memcpy(ptr, spec->ws[curWeightIndex].vec, spec->ws[curWeightIndex].bytes_of_vec); - } - - if (spec->ws[prevWeightIndex].weight != nullptr) { - spec->ws[prevWeightIndex].bytes_of_weight = 0; - delete [] spec->ws[prevWeightIndex].weight; - spec->ws[prevWeightIndex].weight = nullptr; - } - if (spec->ws[prevWeightIndex].vec != nullptr) { - spec->ws[prevWeightIndex].bytes_of_vec = 0; - delete [] spec->ws[prevWeightIndex].vec; - spec->ws[prevWeightIndex].vec = nullptr; - } - if (spec->ws[curWeightIndex].weight != nullptr) { - spec->ws[curWeightIndex].bytes_of_weight = 0; - delete [] spec->ws[curWeightIndex].weight; - spec->ws[curWeightIndex].weight = nullptr; - } - if (spec->ws[curWeightIndex].vec != nullptr) { - spec->ws[curWeightIndex].bytes_of_vec = 0; - delete [] spec->ws[curWeightIndex].vec; - spec->ws[curWeightIndex].vec = nullptr; - } - - // FC params - spec->ops[prevOpIndex].ps.fc_spec.num_slices++; - U32 slices = spec->ops[prevOpIndex].ps.fc_spec.num_slices; - CHECK_REQUIREMENT(slices <= sizeof(spec->ops[prevOpIndex].ps.fc_spec.slice_point) / sizeof(int)); - spec->ops[prevOpIndex].ps.fc_spec.slice_point[slices - 1] = spec->ops[curOpIndex].ps.fc_spec.num_outputs; - 
spec->ops[prevOpIndex].ps.fc_spec.num_outputs += spec->ops[curOpIndex].ps.fc_spec.num_outputs; - - // operator spec - spec->ops[prevOpIndex].num_outputs = slices; - I8 **names = (I8**)mt_new_storage(slices * sizeof(I8 *)); - - for (U32 j = 0; j < slices - 1; j++) { - names[j] = spec->ops[prevOpIndex].output_tensors_name[j]; - } - names[slices - 1] = spec->ops[curOpIndex].output_tensors_name[0]; - delete [] spec->ops[prevOpIndex].output_tensors_name; - delete [] spec->ops[curOpIndex].output_tensors_name; - spec->ops[curOpIndex].output_tensors_name = nullptr; - spec->ops[curOpIndex].num_outputs = 0; - spec->ops[prevOpIndex].output_tensors_name = names; - - // weight spec - spec->ws[prevWeightIndex].bytes_of_weight = weightSize; - spec->ws[prevWeightIndex].weight = weight; - spec->ws[prevWeightIndex].bytes_of_vec = vecSize; - spec->ws[prevWeightIndex].vec = vec; - hasOptimized = true; - - setOperatorInvalid(spec, curOpIndex); - i = curOpIndex; - } - } - return hasOptimized; - } -}; -#endif diff --git a/model-tools/include/OPOptimizers/FlattenGemmOptimizer.hpp b/model-tools/include/OPOptimizers/FlattenGemmOptimizer.hpp deleted file mode 100644 index a8002779..00000000 --- a/model-tools/include/OPOptimizers/FlattenGemmOptimizer.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
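The FCFCOptimizer above merges two fully-connected layers that read the same input tensor into one sliced FC: stacking the weight matrices row-wise turns two matrix-vector products over the same activation into one, and slice_point records where the second layer's outputs begin. A standalone sketch of the arithmetic (illustrative names, plain float math, not the ModelSpec bookkeeping):

#include <vector>

// y1 = W1 * x (o1 outputs) and y2 = W2 * x (o2 outputs) over the same x
// equal one GEMV [y1; y2] = [W1; W2] * x, sliced back at row o1.
std::vector<float> fused_fc(const std::vector<float> &W1, const std::vector<float> &W2,
                            const std::vector<float> &x, int o1, int o2) {
    int d = (int)x.size();
    std::vector<float> y(o1 + o2, 0.0f);
    auto gemv = [&](const std::vector<float> &W, int rows, float *out) {
        for (int r = 0; r < rows; r++)
            for (int c = 0; c < d; c++)
                out[r] += W[r * d + c] * x[c];
    };
    gemv(W1, o1, y.data());       // rows 0..o1-1: first FC's slice
    gemv(W2, o2, y.data() + o1);  // rows o1..o1+o2-1: second FC's slice
    return y;
}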
- - -#ifndef _H_FLATTENGEMMOPTIMIZER -#define _H_FLATTENGEMMOPTIMIZER - -#include -#include -#include "model_tools.h" -#include "OPOptimizer.hpp" - -class FlattenGemmOptimizer: public OPOptimizer { - virtual bool optimize(ModelSpec* spec) override { - const int queryNum = 1; - OperatorType queryOps[queryNum] = {OT_FC}; - bool hasOptimized = false; - for (int i = 1; i < spec->num_operator_specs; i++) { - if (spec->ops[i].type == OT_Flatten) { - int flattenIndex = i; - int gemmIndex = searchOperatorIndexForward(spec, flattenIndex + 1, queryOps, queryNum); - if (gemmIndex == -1 || spec->ops[i].ps.flatten_spec.axis != 1) { - std::cerr << "[ERROR] encountered flatten but cannot be optimized\n"; - continue; - } - - str_copy(spec->ops[gemmIndex].input_tensors_name[0], spec->ops[flattenIndex].input_tensors_name[0], NAME_LEN); - hasOptimized = true; - setOperatorInvalid(spec, flattenIndex); - i = gemmIndex; - } - } - return hasOptimized; - } -}; -#endif diff --git a/model-tools/include/OPOptimizers/InPlaceOptimizer.hpp b/model-tools/include/OPOptimizers/InPlaceOptimizer.hpp deleted file mode 100644 index 934a0ffe..00000000 --- a/model-tools/include/OPOptimizers/InPlaceOptimizer.hpp +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
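FlattenGemmOptimizer above works because the FC (Gemm) implementation flattens its input internally, so a preceding Flatten with axis 1 contributes nothing; the pass simply points the FC at the Flatten's input and invalidates the Flatten. The rewiring idiom, reduced to a toy graph (illustrative types, not ModelSpec):

#include <string>
#include <vector>

struct Node { std::string name, in, out; bool valid = true; };  // toy IR node

// Bypass a no-op node: this is all the str_copy(...input_tensors_name...) and
// setOperatorInvalid() calls in the pass above amount to.
void bypass(std::vector<Node> &graph, int noop, int consumer) {
    graph[consumer].in = graph[noop].in;  // consumer now reads the no-op's input
    graph[noop].valid = false;            // analogous to setOperatorInvalid()
}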
- - -#ifndef _H_INPLACEOPTIMIZER -#define _H_INPLACEOPTIMIZER - -#include -#include -#include -#include -#include "model_tools.h" -#include "OPOptimizer.hpp" - -class InPlaceOptimizer: public OPOptimizer { - virtual bool optimize(ModelSpec* spec) override { - bool hasOptimized = false; - std::vector unrepeatedInputNames; - std::vector repeated; - - // Insert pass - for (int i = 0; i < spec->num_operator_specs; i++) { - if (isInPlaceOp(spec->ops[i].type)) { - CHECK_REQUIREMENT(1 == spec->ops[i].num_inputs); - std::string inputName = spec->ops[i].input_tensors_name[0]; - if (find(unrepeatedInputNames.begin(), unrepeatedInputNames.end(), inputName) == unrepeatedInputNames.end()) { - unrepeatedInputNames.push_back(inputName); - } else { - repeated.push_back(inputName); - } - } - } - - for (std::string name : repeated) { - std::vector::iterator it = find(unrepeatedInputNames.begin(), unrepeatedInputNames.end(), name); - if (it != unrepeatedInputNames.end()) { - unrepeatedInputNames.erase(it); - } - } - - // Erase pass - for (int i = 0; i < spec->num_operator_specs; i++) { - if (OT_None == spec->ops[i].type || isInPlaceOp(spec->ops[i].type)) { - continue; - } - - for (U32 j = 0; j < spec->ops[i].num_inputs; j++) { - std::string inputName = spec->ops[i].input_tensors_name[j]; - std::vector::iterator it = find(unrepeatedInputNames.begin(), unrepeatedInputNames.end(), inputName); - if (it != unrepeatedInputNames.end()) { - unrepeatedInputNames.erase(it); - } - } - } - - for (int i = spec->num_operator_specs - 1; i >= 0; i--) { - if (isInPlaceOp(spec->ops[i].type)) { - CHECK_REQUIREMENT(spec->ops[i].num_inputs == 1); - std::string inputName = spec->ops[i].input_tensors_name[0]; - if (find(unrepeatedInputNames.begin(), unrepeatedInputNames.end(), inputName) == unrepeatedInputNames.end()) { - // Input is used multiple times, so should not be in-place - continue; - } - - CHECK_REQUIREMENT(spec->ops[i].num_outputs == 1); - str_copy(spec->ops[i].input_tensors_name[0], spec->ops[i].output_tensors_name[0], NAME_LEN); - hasOptimized = true; - - I32 found = 0; - for (int j = i - 1; j >= 0; j--) { - if (spec->ops[j].type != OT_None) { - for (U32 k = 0; k < spec->ops[j].num_outputs; k++) { - std::string prevOutputName = spec->ops[j].output_tensors_name[k]; - if (prevOutputName == inputName) { - str_copy(spec->ops[j].output_tensors_name[k], spec->ops[i].input_tensors_name[0], NAME_LEN); - found = 1; - break; - } - } - } - if (1 == found) { - break; - } - } - - if (0 == found) { - std::string newName = spec->ops[i].input_tensors_name[0]; - std::cout << "[Warning] in-place tensor seems to be model input: " << inputName << ". 
New name: " << newName < -#include -#include "model_tools.h" -#include "OPOptimizer.hpp" - -class NoQuantLabelOptimizer: public OPOptimizer { -public: - NoQuantLabelOptimizer(float clipVal) - { - if (clipVal > 0) { - this->uniScale = 127.0 / clipVal; - } else { - this->uniScale = 0; - } - } - - virtual bool optimize(ModelSpec* spec) override - { - bool hasOptimized = false; - bool firstOpChecked = false; - - for (int i = 0; i < spec->num_operator_specs; i++) { - if (spec->ops[i].type == OT_None) { - continue; - } - - if (uniScale > 0) { - if (spec->ops[i].type == OT_FC || spec->ops[i].type == OT_MatMul || spec->ops[i].type == OT_LSTM) { - this->label_clip_input(spec->ops + i); - if (spec->ops[i].type == OT_FC || spec->ops[i].type == OT_LSTM) { - int weightIdx = searchWeightIndex(spec, spec->ops[i].name); - CHECK_REQUIREMENT(-1 != weightIdx); - CHECK_REQUIREMENT(DT_F32 == spec->ws[weightIdx].mdt); - std::cout << "[INFO] Clipping the weight of FC or LSTM\n"; - F32 clipMax = 127.0 / uniScale; - F32 clipMin = -1 * clipMax; - U32 len = spec->ws[weightIdx].bytes_of_weight / bytesOf(DT_F32); - F32 *w = (F32*)spec->ws[weightIdx].weight; - for (U32 j = 0; j < len; j++) { - if (w[j] > clipMax) { - w[j] = clipMax; - } else if (w[j] < clipMin) { - w[j] = clipMin; - } - } - } - } - continue; - } - - if (!firstOpChecked) { - if (OT_Conv == spec->ops[i].type) { - this->label_OP_as_no_quant(spec->ops + i); - hasOptimized = true; - } - firstOpChecked = true; - } - - if (spec->ops[i].type == OT_Eltwise) { - for (U32 j = 0; j < spec->ops[i].num_inputs; j++) { - std::string curIn = spec->ops[i].input_tensors_name[j]; - this->label_fp_outputs(spec, curIn); - hasOptimized = true; - } - } - - // Activation other than ReLU - if (spec->ops[i].type == OT_Relu6 || - spec->ops[i].type == OT_HSwish || - spec->ops[i].type == OT_HSigmoid || - spec->ops[i].type == OT_Sigmoid || - spec->ops[i].type == OT_Clip || - spec->ops[i].type == OT_Gelu || - spec->ops[i].type == OT_TanH) { - - std::string curIn = spec->ops[i].input_tensors_name[0]; - this->label_fp_outputs(spec, curIn); - hasOptimized = true; - } - - if (spec->ops[i].type == OT_Concat) { - for (U32 j = 0; j < spec->ops[i].num_inputs; j++) { - std::string curIn = spec->ops[i].input_tensors_name[j]; - int prevIndex = searchOperatorIndexByOutput(spec, curIn); - if (-1 == prevIndex) { // model input - std::string outName = spec->ops[i].output_tensors_name[0]; - this->label_fp_outputs(spec, outName); - break; - } - } - } - - if (spec->ops[i].type == OT_Softmax) { - std::string output = spec->ops[i].output_tensors_name[0]; - bool isModelOutput = false; - - for (int j = 0; j < spec->num_outputs; j++) { - std::string name = spec->output_names[j]; - if (name == output) { - isModelOutput = true; - break; - } - } - - if (isModelOutput) { - std::string inputName = spec->ops[i].input_tensors_name[0]; - int prevKeyIndex = searchOperatorIndexByOutput(spec, inputName); - while (-1 != prevKeyIndex) { - OperatorType ot = spec->ops[prevKeyIndex].type; - if (OT_Conv == ot || OT_FC == ot || OT_MatMul == ot) { - break; - } else { - inputName = spec->ops[prevKeyIndex].input_tensors_name[0]; - prevKeyIndex = searchOperatorIndexByOutput(spec, inputName); - } - } - - if (-1 == prevKeyIndex) { - std::cout << "[INFO] Softmax receives model input directly\n"; - continue; - } - this->label_OP_as_no_quant(spec->ops + prevKeyIndex); - - std::string prevIn = spec->ops[prevKeyIndex].input_tensors_name[0]; - this->label_fp_outputs(spec, prevIn); - } else { - std::string curIn = 
spec->ops[i].input_tensors_name[0]; - this->label_fp_outputs(spec, curIn); - } - hasOptimized = true; - } - } - return hasOptimized; - } - - float uniScale; - - static void label_OP_as_no_quant(OperatorSpec* ptr) - { - CHECK_REQUIREMENT(0 == ptr->num_quant_feature); - ptr->num_quant_feature = 1; - ptr->feature_scale = (QuantSpec*)mt_new_storage(sizeof(QuantSpec)); - ptr->feature_scale[0].num_scale = 1; - ptr->feature_scale[0].scale = (F32*)mt_new_storage(sizeof(F32)); - ptr->feature_scale[0].scale[0] = 0; - } - - void label_fp_outputs(ModelSpec* ms, std::string tensorName) - { - int prevIndex = searchOperatorIndexByOutput(ms, tensorName); - if (-1 == prevIndex) { - return; - } - OperatorSpec *ptr = ms->ops + prevIndex; - if (0 == ptr->num_quant_feature) { - ptr->num_quant_feature = 1; - ptr->feature_scale = (QuantSpec*)mt_new_storage(sizeof(QuantSpec)); - ptr->feature_scale[0].num_scale = 1; - ptr->feature_scale[0].scale = (F32*)mt_new_storage(sizeof(F32)); - ptr->feature_scale[0].scale[0] = -2; - } else if (-2 == ptr->feature_scale[0].scale[0] || 0 == ptr->feature_scale[0].scale[0]) { - return; // Already processed the upstream - } - - OperatorType ot = ms->ops[prevIndex].type; - if (OT_Conv != ot && OT_FC != ot && OT_MatMul != ot) { - for (U32 i = 0; i < ms->ops[prevIndex].num_inputs; i++) { - std::string name = ms->ops[prevIndex].input_tensors_name[i]; - label_fp_outputs(ms, name); - } - } - } - - void label_clip_input(OperatorSpec* ptr) - { - CHECK_REQUIREMENT(0 == ptr->num_quant_feature); - ptr->num_quant_feature = ptr->num_inputs + ptr->num_outputs; - ptr->feature_scale = (QuantSpec*)mt_new_storage(sizeof(QuantSpec) * ptr->num_quant_feature); - U32 i; - for (i = 0; i < ptr->num_inputs; i++) { - ptr->feature_scale[i].num_scale = 1; - ptr->feature_scale[i].scale = (F32*)mt_new_storage(sizeof(F32)); - ptr->feature_scale[i].scale[0] = this->uniScale; - } - for (; i < ptr->num_quant_feature; i++) { - ptr->feature_scale[i].num_scale = 1; - ptr->feature_scale[i].scale = (F32*)mt_new_storage(sizeof(F32)); - ptr->feature_scale[i].scale[0] = -2; - } - } -}; -#endif diff --git a/model-tools/include/OPOptimizers/OPOptimizer.hpp b/model-tools/include/OPOptimizers/OPOptimizer.hpp deleted file mode 100644 index 4020d83b..00000000 --- a/model-tools/include/OPOptimizers/OPOptimizer.hpp +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
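The clipping logic in NoQuantLabelOptimizer above follows standard symmetric int8 quantization: a clip value V fixes the scale at 127/V (the uniScale member), and clamping weights into [-V, V] guarantees the quantized values fit in int8. A self-contained sketch of that arithmetic, separate from the ModelSpec bookkeeping:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Clamp weights to [-clipVal, clipVal] and quantize with scale = 127/clipVal,
// matching the uniScale/clipMax computation in the optimizer above.
void clip_and_quantize(std::vector<float> &w, float clipVal, std::vector<int8_t> &q) {
    float scale = 127.0f / clipVal;
    q.resize(w.size());
    for (size_t i = 0; i < w.size(); i++) {
        w[i] = std::max(-clipVal, std::min(clipVal, w[i]));
        q[i] = (int8_t)std::lround(w[i] * scale);  // |q| <= 127 by construction
    }
}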
- - -#ifndef _H_OPOPTIMIZER -#define _H_OPOPTIMIZER - -#include -#include -#include -#include -#include -#include "model_tools.h" -#include "op_type.h" - -class OPOptimizer { - public: - /** - * @param spec - */ - virtual ~OPOptimizer() {} - virtual bool optimize(ModelSpec* spec) = 0; - int searchWeightIndex(ModelSpec* spec, char* op_name) - { - if (spec->num_weight_specs <= 0) { - return -1; - } - - std::map weightsIndex; - for (int i=0; i < spec->num_weight_specs; i++) { - std::string key = spec->ws[i].op_name; // attention, this is static structure attribute - weightsIndex[key] = i; - } - - std::string opNameStr = op_name; - std::map::iterator iter = weightsIndex.find(opNameStr); - if(iter == weightsIndex.end()) - return -1; - else - return weightsIndex[opNameStr]; - } - - int searchOperatorIndex(ModelSpec* spec, char* op_name) - { - if (spec->num_operator_specs <= 0) { - return -1; - } - - std::string opNameStr = op_name; - for (int i = 0; i < spec->num_operator_specs; i++) { - std::string key = spec->ops[i].name; - if (key == opNameStr) { - return i; - } - } - return -1; - } - - bool isValidOperator(ModelSpec* spec, int index){ - if (index >= spec->num_operator_specs) { - return false; - } - - if (spec->ops[index].type != OT_None) { - return true; - } else { - return false; - } - } - - void setOperatorInvalid(ModelSpec* spec, int index) { - if (index >= spec->num_operator_specs || index < 0) { - return; - } - spec->ops[index].type = OT_None; - } - - int searchOperatorIndexBackward(ModelSpec* spec, int end, OperatorType *queryOps, int queryNum, bool unskip=true) - { - for (int i = end; i >= 0; i--) { - if (isValidOperator(spec, i)) { - for (int j=0; jops[i].type == opType) { - return i; - } - } - if (unskip) { - return -1; - } - } - } - return -1; - } - - int searchOperatorIndexByOutput(ModelSpec* spec, std::string tensorName) - { - for (int i = 0; i < spec->num_operator_specs; i++) { - if (isValidOperator(spec, i)) { - for (U32 j = 0; j < spec->ops[i].num_outputs; j++) { - std::string outName = spec->ops[i].output_tensors_name[j]; - if (outName == tensorName) { - return i; - } - } - } - } - return -1; - } - - int searchOperatorIndexForward(ModelSpec* spec, int start, OperatorType *queryOps, int queryNum, bool unskip=true){ - for (int i = start; i < spec->num_operator_specs; i++) { - if (isValidOperator(spec, i)) { - for (int j=0; jops[i].type == opType) { - return i; - } - } - if (unskip) { - return -1; - } - } - } - return -1; - } -}; -#endif diff --git a/model-tools/include/OPOptimizers/PadConvOptimizer.hpp b/model-tools/include/OPOptimizers/PadConvOptimizer.hpp deleted file mode 100644 index 38217869..00000000 --- a/model-tools/include/OPOptimizers/PadConvOptimizer.hpp +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_PADCONVOPTIMIZER -#define _H_PADCONVOPTIMIZER - -#include -#include -#include "model_tools.h" -#include "OPOptimizer.hpp" - -// Optimize both Convolution and Deconvolution -class PadConvOptimizer: public OPOptimizer { - virtual bool optimize(ModelSpec* spec) override { - const int queryNum = 2; - OperatorType queryOps[queryNum] = {OT_Conv, OT_Deconvolution}; - bool hasOptimized = false; - for (int i = 0; i< spec->num_operator_specs; i++) { - OperatorType curOT = spec->ops[i].type; - if (curOT == OT_Pad && spec->ops[i].ps.pad_spec.pad_mode == Pad_Constant && spec->ops[i].ps.pad_spec.constant_value == 0) { - int padOpIndex = i; - if (spec->ops[padOpIndex].ps.pad_spec.constant_value != 0) { - std::cout << "[WARNING] encounter unoptimized Pad layer (value not 0): " << spec->ops[i].name << std::endl; - continue; - } - - int convOpIndex = searchOperatorIndexForward(spec, padOpIndex + 1, queryOps, queryNum); - - if (convOpIndex == -1) { - std::cout << "[WARNING] encounter unoptimized Pad layer (no Conv or Deconv after): " << spec->ops[padOpIndex].name << std::endl; - continue; - } - - // tensor relationship rewrite - str_copy(spec->ops[convOpIndex].input_tensors_name[0], spec->ops[padOpIndex].input_tensors_name[0], NAME_LEN); - hasOptimized = true; - spec->ops[convOpIndex].ps.conv_spec.padding_bottom += spec->ops[padOpIndex].ps.pad_spec.bottom; - spec->ops[convOpIndex].ps.conv_spec.padding_left += spec->ops[padOpIndex].ps.pad_spec.left; - spec->ops[convOpIndex].ps.conv_spec.padding_right += spec->ops[padOpIndex].ps.pad_spec.right; - spec->ops[convOpIndex].ps.conv_spec.padding_top += spec->ops[padOpIndex].ps.pad_spec.top; - setOperatorInvalid(spec, padOpIndex); - } - } - return hasOptimized; - } -}; -#endif diff --git a/model-tools/include/OPOptimizers/SqueezeReshapeOptimizer.hpp b/model-tools/include/OPOptimizers/SqueezeReshapeOptimizer.hpp deleted file mode 100644 index edd3e534..00000000 --- a/model-tools/include/OPOptimizers/SqueezeReshapeOptimizer.hpp +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_SQUEEZERESHAPEOPTIMIZER -#define _H_SQUEEZERESHAPEOPTIMIZER - -#include -#include -#include "model_tools.h" -#include "OPOptimizer.hpp" - -class SqueezeReshapeOptimizer: public OPOptimizer { - virtual bool optimize(ModelSpec* spec) override { - const int queryNum = 1; - OperatorType queryOps[queryNum] = {OT_Reshape}; - bool hasOptimized = false; - for (int i = 1; i < spec->num_operator_specs; i++) { - if (spec->ops[i].type == OT_Squeeze) { - int squeezeIndex = i; - int reshapeIndex = searchOperatorIndexForward(spec, squeezeIndex + 1, queryOps, queryNum); - if (reshapeIndex == -1) { - continue; - } - - str_copy(spec->ops[reshapeIndex].input_tensors_name[0], spec->ops[squeezeIndex].input_tensors_name[0], NAME_LEN); - hasOptimized = true; - setOperatorInvalid(spec, squeezeIndex); - i = reshapeIndex; - } - } - return hasOptimized; - } -}; -#endif diff --git a/model-tools/include/OPOptimizers/TransposeMatMulToFCOptimizer.hpp b/model-tools/include/OPOptimizers/TransposeMatMulToFCOptimizer.hpp deleted file mode 100644 index 397914ca..00000000 --- a/model-tools/include/OPOptimizers/TransposeMatMulToFCOptimizer.hpp +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
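SqueezeReshapeOptimizer above can drop the Squeeze because a following Reshape fully determines the output shape and neither op moves data: both only reinterpret dimensions while preserving the element count. A toy check of that invariant (illustrative, not the engine's shape inference):

#include <cassert>
#include <functional>
#include <numeric>
#include <vector>

size_t elems(const std::vector<size_t> &dims) {
    return std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies<size_t>());
}

int main() {
    std::vector<size_t> x = {1, 8, 1, 16};   // input
    std::vector<size_t> squeezed = {8, 16};  // after Squeeze
    std::vector<size_t> target = {128};      // Reshape target
    // Reshape(x) and Reshape(Squeeze(x)) describe the same tensor.
    assert(elems(x) == elems(squeezed) && elems(squeezed) == elems(target));
    return 0;
}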
- - -#ifndef _H_TRANSPOSEMATMULTOFCOPTIMIZER -#define _H_TRANSPOSEMATMULTOFCOPTIMIZER - -#include -#include -#include "model_tools.h" -#include "OPOptimizer.hpp" - -class TransposeMatMulToFCOptimizer: public OPOptimizer { - virtual bool optimize(ModelSpec* spec) override { - const int queryNum = 1; - OperatorType queryOps[queryNum] = {OT_MatMul}; - bool hasOptimized = false; - for (int i = 1; i < spec->num_operator_specs; i++) { - if (spec->ops[i].type == OT_Transpose) { - int transposeOpIndex = i; - int transposeWeightIndex = searchWeightIndex(spec, spec->ops[transposeOpIndex].name); - if (transposeWeightIndex < 0) { - // This transpose does not have weight - continue; - } - - int matmulOpIndex = searchOperatorIndexForward(spec, transposeOpIndex + 1, queryOps, queryNum); - if (-1 == matmulOpIndex) { - std::cerr << "[ERROR] encountered transpose with weight but cannot be optimized\n"; - CHECK_STATUS(NOT_SUPPORTED); - } - - hasOptimized = true; - - // Update matmul to fc - spec->ops[matmulOpIndex].type = OT_FC; - spec->ops[matmulOpIndex].num_inputs = 1; - free(spec->ops[matmulOpIndex].input_tensors_name[1]); - spec->ops[matmulOpIndex].ps.fc_spec.num_outputs = spec->ws[transposeWeightIndex].bytes_of_vec; - - // Adjust the owner of the weight - str_copy(spec->ws[transposeWeightIndex].op_name, spec->ops[matmulOpIndex].name, NAME_LEN); - spec->ws[transposeWeightIndex].bytes_of_vec = 0; - - setOperatorInvalid(spec, transposeOpIndex); - i = matmulOpIndex; - } - } - return hasOptimized; - } -}; -#endif diff --git a/model-tools/include/OPOptimizers/TransposeMulToScaleOptimizer.hpp b/model-tools/include/OPOptimizers/TransposeMulToScaleOptimizer.hpp deleted file mode 100644 index f2868b21..00000000 --- a/model-tools/include/OPOptimizers/TransposeMulToScaleOptimizer.hpp +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
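TransposeMatMulToFCOptimizer above exploits the identity that a MatMul against a constant transposed weight is exactly a fully-connected layer: y = x * W^T means y[o] = sum_i x[i] * W[o][i], i.e. FC with the rows of W as output filters. A minimal illustration:

#include <vector>

// Equivalent to MatMul(x, transpose(W)) for constant W: one output per row of W.
std::vector<float> fc(const std::vector<float> &x,
                      const std::vector<std::vector<float>> &W) {
    std::vector<float> y(W.size(), 0.0f);
    for (size_t o = 0; o < W.size(); o++)
        for (size_t i = 0; i < x.size(); i++)
            y[o] += x[i] * W[o][i];
    return y;
}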
- - -#ifndef _H_TRANSPOSEMULTOSCALEOPTIMIZER -#define _H_TRANSPOSEMULTOSCALEOPTIMIZER - -#include -#include -#include "model_tools.h" -#include "OPOptimizer.hpp" - -class TransposeMulToScaleOptimizer: public OPOptimizer { - virtual bool optimize(ModelSpec* spec) override { - const int queryNum = 2; - OperatorType queryOps[queryNum] = {OT_Transpose, OT_Reshape}; - bool hasOptimized = false; - for (int i = 1; i < spec->num_operator_specs; i++) { - if (spec->ops[i].type == OT_Eltwise && spec->ops[i].ps.eltwise_spec.elt_mode == ELTWISE_PROD) { - int mulOpIndex = i; - int transposeOpIndex00 = searchOperatorIndexBackward(spec, mulOpIndex - 1, queryOps, queryNum, false); - if (transposeOpIndex00 == -1) - continue; - int transposeOpIndex01 = searchOperatorIndexBackward(spec, transposeOpIndex00 - 1, queryOps, queryNum, false); - if (transposeOpIndex01 == -1) - continue; - int transposeOpIndex10 = searchOperatorIndexForward(spec, mulOpIndex + 1, queryOps, queryNum, false); - if (transposeOpIndex10 == -1) - continue; - - if (transposeOpIndex10 == mulOpIndex + 1 - || (transposeOpIndex10 == mulOpIndex + 2 && spec->ops[mulOpIndex+1].type == OT_Relu)) { - str_copy(spec->ops[mulOpIndex].input_tensors_name[0], spec->ops[transposeOpIndex00].input_tensors_name[0], NAME_LEN); - str_copy(spec->ops[mulOpIndex].input_tensors_name[1], spec->ops[transposeOpIndex01].input_tensors_name[0], NAME_LEN); - str_copy(spec->ops[transposeOpIndex10-1].output_tensors_name[0], spec->ops[transposeOpIndex10].output_tensors_name[0], NAME_LEN); - - hasOptimized = true; - spec->ops[mulOpIndex].type = OT_Scale; - spec->ops[mulOpIndex].ps.scale_spec.axis = 1; - - setOperatorInvalid(spec, transposeOpIndex00); - setOperatorInvalid(spec, transposeOpIndex01); - setOperatorInvalid(spec, transposeOpIndex10); - i = transposeOpIndex10; - } - } - } - return hasOptimized; - } -}; -#endif diff --git a/model-tools/include/converter.h b/model-tools/include/converter.h deleted file mode 100644 index 1013356c..00000000 --- a/model-tools/include/converter.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
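TransposeMulToScaleOptimizer above rests on the fact that an elementwise product commutes with any layout permutation: the Transpose feeding each Mul input cancels against the Transpose after it, leaving a channel-wise Scale on axis 1. A toy verification of the commutation (illustrative only):

#include <cassert>
#include <vector>

int main() {
    std::vector<float> a = {1, 2, 3, 4}, b = {5, 6, 7, 8};
    std::vector<int> perm = {2, 0, 3, 1};  // an arbitrary index permutation
    std::vector<float> c(4);
    for (int i = 0; i < 4; i++)
        c[i] = a[i] * b[i];                         // multiply, then permute
    for (int i = 0; i < 4; i++)
        assert(c[perm[i]] == a[perm[i]] * b[perm[i]]);  // permute, then multiply
    return 0;
}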
- - -#ifndef _H_CONVERTER -#define _H_CONVERTER -#include "type.h" -#include "error.h" -#include "model_tools.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _USE_CAFFE_MODEL - EE caffe_converter(std::string dir, std::string mfn, ModelSpec* ms); -#endif - -#ifdef _USE_TENSORFLOW_MODEL - EE tensorflow_converter(std::string dir, std::string mfn, ModelSpec* ms); -#endif - -#ifdef _USE_ONNX_MODEL - EE onnx_converter(std::string dir, std::string mfn, int removePreprocessOpNum, ModelSpec* ms); -#endif - -#ifdef _USE_TFLITE_MODEL - EE tflite_converter(std::string dir, std::string mfn, ModelSpec* ms); -#endif - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/model-tools/include/model_optimizer.hpp b/model-tools/include/model_optimizer.hpp deleted file mode 100644 index 5d45b26d..00000000 --- a/model-tools/include/model_optimizer.hpp +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
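converter.h above exposes one entry point per front end, each compiled in only under its _USE_* flag. A hypothetical caller-side dispatcher over those declarations (the wrapper name and the zero removePreprocessOpNum default are illustrative, not part of this header):

#include <string>
#include "converter.h"  // declares the *_converter entry points above

EE convert_any(const std::string &type, std::string dir, std::string mfn, ModelSpec *ms) {
#ifdef _USE_CAFFE_MODEL
    if (type == "caffe") return caffe_converter(dir, mfn, ms);
#endif
#ifdef _USE_TENSORFLOW_MODEL
    if (type == "tensorflow") return tensorflow_converter(dir, mfn, ms);
#endif
#ifdef _USE_ONNX_MODEL
    if (type == "onnx") return onnx_converter(dir, mfn, 0, ms);  // 0: strip no preprocess ops
#endif
#ifdef _USE_TFLITE_MODEL
    if (type == "tflite") return tflite_converter(dir, mfn, ms);
#endif
    return NOT_SUPPORTED;  // assumed EE error code, as used elsewhere in model-tools
}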
- - -#ifndef _H_MODELOPTIMIZER -#define _H_MODELOPTIMIZER - -#include -#include -#include "model_tools.h" -#include "model_serialize_deserialize.hpp" -#include "OPOptimizers/OPOptimizer.hpp" -#include "OPOptimizers/DeprecatedOPOptimizer.hpp" -#include "OPOptimizers/ConvBNOptimizer.hpp" -#include "OPOptimizers/BNScaleOptimizer.hpp" -#include "OPOptimizers/ConvScaleOptimizer.hpp" -#include "OPOptimizers/PadConvOptimizer.hpp" -#include "OPOptimizers/InPlaceOptimizer.hpp" -#include "OPOptimizers/ConvActivationOptimizer.hpp" -#include "OPOptimizers/ChannelPaddingOptimizer.hpp" -#include "OPOptimizers/DepthwisePointwiseOptimizer.hpp" -#include "OPOptimizers/TransposeMulToScaleOptimizer.hpp" -#include "OPOptimizers/TransposeMatMulToFCOptimizer.hpp" -#include "OPOptimizers/FlattenGemmOptimizer.hpp" -#include "OPOptimizers/FCFCOptimizer.hpp" -#include "OPOptimizers/ClipClipOptimizer.hpp" -#include "OPOptimizers/SqueezeReshapeOptimizer.hpp" -#include "OPOptimizers/NoQuantLabelOptimizer.hpp" -#include "OPOptimizers/MemoryReuseOptimizer.hpp" - - - -class ConvEltwisePoolingOptimizer: public OPOptimizer { - virtual bool optimize(ModelSpec* spec) override { - if(spec == nullptr) - return false; - - bool hasOptimized = false; - // TODO: add fusion(low priority) - return hasOptimized; - } -}; - - -class FCEltwiseOptimizer: public OPOptimizer { - virtual bool optimize(ModelSpec* spec) override { - if(spec == nullptr) - return false; - - bool hasOptimized = false; - // TODO: add fusion(middle priority) - return hasOptimized; - } -}; - -class ModelSpecOptimizer { - public: - ModelSpecOptimizer() { } - /** - * @param model - */ - bool optimize(ModelSpec* spec) { - bool optimizeOrNot = false; - for (auto opo: opos) { - if (opo->optimize(spec)) { - optimizeOrNot = true; - } - } - return optimizeOrNot; - } - - void suggest(float clipVal=0) - { - // strict order - this->opos.push_back(std::shared_ptr(new DeprecatedOPOptimizer())); - - // Removing ConvBN leads to error - this->opos.push_back(std::shared_ptr(new ConvBNOptimizer())); - this->opos.push_back(std::shared_ptr(new BNScaleOptimizer())); - this->opos.push_back(std::shared_ptr(new ConvScaleOptimizer())); - this->opos.push_back(std::shared_ptr(new PadConvOptimizer())); - - this->opos.push_back(std::shared_ptr(new InPlaceOptimizer())); - - this->opos.push_back(std::shared_ptr(new ConvActivationOptimizer())); - this->opos.push_back(std::shared_ptr(new ChannelPaddingOptimizer())); - this->opos.push_back(std::shared_ptr(new DepthwisePointwiseOptimizer())); - this->opos.push_back(std::shared_ptr(new TransposeMulToScaleOptimizer())); - this->opos.push_back(std::shared_ptr(new TransposeMatMulToFCOptimizer())); - this->opos.push_back(std::shared_ptr(new FlattenGemmOptimizer())); - this->opos.push_back(std::shared_ptr(new FCFCOptimizer())); - this->opos.push_back(std::shared_ptr(new ClipClipOptimizer())); - this->opos.push_back(std::shared_ptr(new SqueezeReshapeOptimizer())); - - this->opos.push_back(std::shared_ptr(new NoQuantLabelOptimizer(clipVal))); - - // Please leave MemoryReuseOptimizer at last - this->opos.push_back(std::shared_ptr(new MemoryReuseOptimizer())); - } - - void suggest_for_training() - { - // strict order - this->opos.push_back(std::shared_ptr(new DeprecatedOPOptimizer())); - - this->opos.push_back(std::shared_ptr(new PadConvOptimizer())); - - this->opos.push_back(std::shared_ptr(new NoQuantLabelOptimizer(0))); - } - - void empty() {} - - private: - // ModelSpecOptimizer() { } - /** - * @param opo - */ - std::vector> opos; -}; - -#endif diff --git 
a/model-tools/include/model_serialize_deserialize.hpp b/model-tools/include/model_serialize_deserialize.hpp deleted file mode 100644 index f6494d67..00000000 --- a/model-tools/include/model_serialize_deserialize.hpp +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_MODEL_SERIALIZE_DESERIALIZE -#define _H_MODEL_SERIALIZE_DESERIALIZE - -#include "model_tools.h" -#include - - -#if defined(_BUILD_TEST) || defined(_USE_CAFFE) || defined(_USE_ONNX) || defined(_USE_TFLITE) -EE serialize_header(const ModelSpec* spec, std::string* tmp); - -EE serialize_operators(const ModelSpec* spec, std::string* tmp); - -EE serialize_weights(const ModelSpec* spec, std::string* tmp); - -EE serialize_model(const ModelSpec* spec, std::string* bytes); - -EE write_to_file(std::string* bytes, const char* fn); - -EE serialize_model_to_file(const ModelSpec* spec, const char* fn); - -EE ms_datatype_converter(ModelSpec* original_ms, ModelSpec* target_ms, DataConvertType convert_mode, bool quantStorage=false); -#endif - -EE deserialize_header(char* bytes, ModelSpec* spec, U32* pos); - -EE deserialize_operator(char* bytes, ModelSpec* spec, U32* pos); - -EE deserialize_weight(char* bytes, ModelSpec* spec, U32* pos); - -EE operator_relationship(ModelSpec* spec); - -EE deserialize_model_from_file(const char* fn, ModelSpec* spec); - -EE str_copy(I8 *dst, const I8 *src, I32 src_len); - -void* mt_new_storage(size_t size); -#endif diff --git a/model-tools/include/model_tools.h b/model-tools/include/model_tools.h deleted file mode 100644 index 9b630b5c..00000000 --- a/model-tools/include/model_tools.h +++ /dev/null @@ -1,427 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_MODEL_TOOLS -#define _H_MODEL_TOOLS - -#include "error.h" -#include "type.h" -#include "tensor_desc.h" -#include "op_type.h" - -#ifdef __cplusplus -extern "C" { -#endif - - typedef enum { - F32_to_F32, - F32_to_F16, - F32_to_I8 - } DataConvertType; - - typedef struct { - U32 height; - U32 width; - } InterpParamSpec; - - typedef struct { - int axis; - } FlattenParamSpec; - - typedef struct { - int gather_axis; - } GatherParamSpec; - - typedef struct { - int axis; - int squeeze_axes[8]; - int axes_num; - } SqueezeParamSpec; - - typedef struct { - int axis; - int unsqueeze_axes[8]; - int axes_num; - } UnsqueezeParamSpec; - - typedef struct { - char upsample_mode[NAME_LEN]; - F32 scale[4]; - } UpsampleParamSpec; - - typedef struct { - int cast_to; - } CastParamSpec; - - typedef struct { - int axis; - int num_concat; - } ScaleParamSpec; - - typedef struct { - float neg_slope; - } ReLUParamSpec; - - typedef struct { - int coeff_size; - float* coeff_values; - } EltwiseSumSpec; - - typedef struct { - EltwiseMode elt_mode; - EltwiseSumSpec elt_sum_spec; - } EltwiseParamSpec; - - typedef struct { - float min; - float max; - } ClipParamSpec; - - typedef union { - ReLUParamSpec relu_spec; - ClipParamSpec clip_spec; - } ActivationSpec; - - typedef struct { - U32 num_outputs; - U32 kernel_size_h; - U32 kernel_size_w; - U32 stride_h; - U32 stride_w; - U32 padding_top; - U32 padding_bottom; - U32 padding_left; - U32 padding_right; - U32 group; - U32 dilatedRate_h; - U32 dilatedRate_w; - ConvolutionMode convolution_type; - ActivationMode dw_activation_type; - ActivationMode pw_activation_type; - ActivationSpec activation_spec; - } ConvolutionParamSpec; - - typedef struct { - U32 kernel_size_h; - U32 kernel_size_w; - U32 stride_h; - U32 stride_w; - U32 padding_top; - U32 padding_bottom; - U32 padding_left; - U32 padding_right; - RoundMode rm; - PoolingMode mode; - } PoolingParamSpec; - - typedef struct { - U32 num_outputs; - U32 num_slices; - I32 slice_point[32]; - } FullyConnectedParamSpec; - - typedef struct{ - int axis; - F32 eps; - F32 gama; - F32 momentum; - } BatchNormParamSpec; - - typedef struct { - U32 top; - U32 bottom; - U32 left; - U32 right; - F32 constant_value; - PadMode pad_mode; - } PadParamSpec; - - typedef struct { - U32 input_dim; - U32 num_output; - bool bias_term; - bool transpose; - } EmbedParamSpec; - - typedef struct { - float scale; - float bias; - } MultiplyParamSpec; - - typedef struct { - I32 shape_dims[8]; - I32 shape_size; - I32 axis; - I32 num_axes; - } ReshapeParamSpec; - - typedef struct { - I32 slice_points[8]; - U32 slice_size; - I32 axis; - } SliceParamSpec; - - typedef struct { - U32 trans_dims[8]; - U32 trans_size; - } TransposeParamSpec; - - typedef struct { - U32 num_heads; - U32 from_sequence_length; - U32 to_sequence_length; - } AttentionParamSpec; - - typedef struct { - U32 num_output; - I32 steps; - I32 num_projection; - float zoneout_cell; - float zoneout_output; - } LSTMParamSpec; - - typedef struct { - U32 coefficient_len; - bool has_offset; - BilateralSliceApplyMode mode; - 
} BilateralSliceApplyParamSpec; - typedef struct { - I32 axis; - ReductionMode reduction_mode; - float coeff; - bool keep_dim; - } ReductionParamSpec; - - typedef struct { - I32 axis; - } ArgMaxParamSpec; - - typedef struct { - I32 src_dims[3]; - I32 dst_dims[3]; - I32 length; - } CopyParamSpec; - - typedef struct { - CheckMode check_mode; - } CheckParamSpec; - - typedef struct { - int loops; - int axis; - } RepeatParamSpec; - - typedef struct { - TensorDesc desc; - } PreAllocatedMemoryParamSpec; - - typedef struct { - TensorDesc desc; - } SharedWeightParamSpec; - - typedef struct { - bool transpose_a; - bool transpose_b; - } MatMulParamSpec; - - typedef struct { - int attention_length; - bool same_length; - float mask; - } AttentionMaskParamSpec; - - typedef struct { - U32 input_dim; - U32 num_output; - bool bias_term; - bool transpose; - int axis; - } RelativePositionEmbedParamSpec; - - typedef struct { - int axis; - int shift_length; - } RelativeShiftParamSpec; - - typedef struct { - int axis; - int num_concat; - } ConcatParamSpec; - - typedef struct { - int axis; - } SoftmaxParamSpec; - - typedef struct { - U32 begin_arr[8]; - U32 size_arr[8]; - U32 dim_size; - } TfSliceParamSpec; - - typedef struct { - F32 min_sizes[2]; - F32 max_sizes[2]; - F32 aspect_ratios[2]; - U32 flip; - U32 clip; - F32 variances[4]; - U32 image_h; - U32 image_w; - F32 step_h; - F32 step_w; - F32 offset; - } PriorBoxParamSpec; - - typedef struct { - U32 num_class; - F32 nms_threshold; - U32 nms_top_k; - U32 keep_top_k; - F32 confidence_threshold; - } DetectionOutputParamSpec; - - typedef union ParameterSpec { - ParameterSpec() {} - ConvolutionParamSpec conv_spec; - PoolingParamSpec pooling_spec; - FullyConnectedParamSpec fc_spec; - BatchNormParamSpec bn_spec; - EltwiseParamSpec eltwise_spec; - ReLUParamSpec relu_spec; - ClipParamSpec clip_spec; - PadParamSpec pad_spec; - EmbedParamSpec embed_spec; - MultiplyParamSpec multiply_spec; - ReshapeParamSpec reshape_spec; - SliceParamSpec slice_spec; - TransposeParamSpec transpose_spec; - AttentionParamSpec attention_spec; - LSTMParamSpec lstm_spec; - GatherParamSpec gather_spec; - UnsqueezeParamSpec unsqueeze_spec; - SqueezeParamSpec squeeze_spec; - UpsampleParamSpec upsample_spec; - CastParamSpec cast_spec; - BilateralSliceApplyParamSpec bilateral_slice_apply_spec; - ScaleParamSpec scale_spec; - ReductionParamSpec reduction_spec; - CopyParamSpec copy_spec; - CheckParamSpec check_spec; - RepeatParamSpec repeat_spec; - PreAllocatedMemoryParamSpec preallocated_memory_spec; - SharedWeightParamSpec shared_weight_spec; - ArgMaxParamSpec argmax_spec; - MatMulParamSpec matmul_spec; - InterpParamSpec interp_spec; - FlattenParamSpec flatten_spec; - AttentionMaskParamSpec attention_mask_spec; - RelativePositionEmbedParamSpec relative_position_embed_spec; - RelativeShiftParamSpec relative_shift_spec; - ConcatParamSpec concat_spec; - SoftmaxParamSpec softmax_spec; - TfSliceParamSpec tf_slice_spec; - PriorBoxParamSpec prior_box_spec; - DetectionOutputParamSpec detection_output_spec; - } ParameterSpec; - - typedef struct { - int num_scale; - F32 *scale; - } QuantSpec; - - typedef struct { - I8 name[NAME_LEN]; - OperatorType type; - U32 num_inputs; - I8 **input_tensors_name; - U32 num_outputs; - I8 **output_tensors_name; - I32 *tensor_positions; - U32 num_quant_feature; - QuantSpec *feature_scale; - ParameterSpec ps; - } OperatorSpec; - - typedef struct { - I8 op_name[NAME_LEN]; - DataType mdt = DT_U8; - U32 bytes_of_weight = 0; - U8* weight; - U32 bytes_of_vec = 0; - U8* vec; - U32 
num_quant_scale; // Merged FC may have multiple weight scales - QuantSpec *weight_scale; - } WeightSpec; - - typedef struct { - I8 op[NAME_LEN]; - U32 num_inputs; - I8 **input_op_names; - U32 num_outputs; - I8 **output_op_names; - } OperatorRelationshipMapEntry; - - typedef struct { - I32 version; - I32 magic_number; - - I8 model_name[NAME_LEN]; - DataType dt; - - I32 num_inputs; - I8 **input_names; - TensorDesc *input_dims; - - I32 num_outputs; - I8 **output_names; - - I32 num_operator_specs; - OperatorSpec *ops; - - I32 num_weight_specs; - WeightSpec *ws; - - I32 num_op_tensor_entries; - OperatorRelationshipMapEntry *op_relationship_entries; - } ModelSpec; -/**** - * @breif - * - * @param dir - * @param mfn model file name without extension - * - **/ - inline I32 mt_version(){ - static const I32 version = 190930; - return version; - } - - inline I32 mt_magic_number(){ - static const I32 magic_number = 1141119; - return magic_number; - } - - // you must invoke this before use md - EE mt_create_model(ModelSpec* md); - EE mt_load(CI8* dir, CI8* mfn, ModelSpec* md); - EE mt_store(CI8* dir, CI8* mfn, const ModelSpec* md); - // you must invoke this before exit to clean up resource usage - EE mt_destroy_model(ModelSpec* md); - - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/model-tools/src/CMakeLists.txt b/model-tools/src/CMakeLists.txt deleted file mode 100644 index 1edeb073..00000000 --- a/model-tools/src/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -set(srcs "${CMAKE_CURRENT_SOURCE_DIR}/model_tools.cpp;${CMAKE_CURRENT_SOURCE_DIR}/model_deserialize.cpp") - -if(BUILD_TEST OR USE_CAFFE OR USE_ONNX OR USE_TFLITE) - set(srcs "${srcs};${CMAKE_CURRENT_SOURCE_DIR}/data_type_converter.cpp;${CMAKE_CURRENT_SOURCE_DIR}/model_serialize.cpp;${CMAKE_CURRENT_SOURCE_DIR}/model_print.cpp") -endif(BUILD_TEST OR USE_CAFFE OR USE_ONNX OR USE_TFLITE) - -# shared library -ADD_LIBRARY(${PROJECT_NAME} SHARED ${srcs}) - -# static library -ADD_LIBRARY(${PROJECT_NAME}_static STATIC ${srcs}) - -SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") -SET_TARGET_PROPERTIES(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) -SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) diff --git a/model-tools/src/caffe/CMakeLists.txt b/model-tools/src/caffe/CMakeLists.txt deleted file mode 100644 index b6999807..00000000 --- a/model-tools/src/caffe/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) - -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) - -protobuf_generate_cpp(CAFFE_PROTO_SRCS CAFFE_PROTO_HDRS caffe.proto) - -protobuf_generate_python(CAFFE_PROTO_PYTHON_SRCS caffe.proto) - -include_directories(${PROTOBUF_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) -include_directories(../) - -add_custom_target(caffe_pb2.py ALL - DEPENDS ${CAFFE_PROTO_PYTHON_SRCS} - COMMAND ${CMAKE_COMMAND} -E copy ${CAFFE_PROTO_PYTHON_SRCS} ${PROJECT_SOURCE_DIR}/tools/tensorflow2caffe/Caffe) - -# shared library -ADD_LIBRARY(${PROJECT_NAME}_caffe SHARED ${srcs} ${CAFFE_PROTO_HDRS} ${CAFFE_PROTO_SRCS}) - -# static library -ADD_LIBRARY(${PROJECT_NAME}_caffe_static STATIC ${srcs} ${CAFFE_PROTO_HDRS} ${CAFFE_PROTO_SRCS}) - -SET_TARGET_PROPERTIES(${PROJECT_NAME}_caffe_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}_caffe") -SET_TARGET_PROPERTIES(${PROJECT_NAME}_caffe PROPERTIES CLEAN_DIRECT_OUTPUT 1) -SET_TARGET_PROPERTIES(${PROJECT_NAME}_caffe_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) diff --git a/model-tools/src/caffe/caffe.proto 
b/model-tools/src/caffe/caffe.proto deleted file mode 100644 index 6a9eb25e..00000000 --- a/model-tools/src/caffe/caffe.proto +++ /dev/null @@ -1,1778 +0,0 @@ -// 2019.12.01-Modified -// Huawei Technologies Co., Ltd. - - -syntax = "proto2"; - -package caffe; - -// Specifies the shape (dimensions) of a Blob. -message BlobShape { - repeated int64 dim = 1 [packed = true]; -} - -message BlobProto { - optional BlobShape shape = 7; - repeated float data = 5 [packed = true]; - repeated float diff = 6 [packed = true]; - repeated double double_data = 8 [packed = true]; - repeated double double_diff = 9 [packed = true]; - - // 4D dimensions -- deprecated. Use "shape" instead. - optional int32 num = 1 [default = 0]; - optional int32 channels = 2 [default = 0]; - optional int32 height = 3 [default = 0]; - optional int32 width = 4 [default = 0]; -} - -// The BlobProtoVector is simply a way to pass multiple blobproto instances -// around. -message BlobProtoVector { - repeated BlobProto blobs = 1; -} - -message Datum { - optional int32 channels = 1; - optional int32 height = 2; - optional int32 width = 3; - // the actual image data, in bytes - optional bytes data = 4; - optional int32 label = 5; - // Optionally, the datum could also hold float data. - repeated float float_data = 6; - // If true data contains an encoded image that need to be decoded - optional bool encoded = 7 [default = false]; -} - -message FillerParameter { - // The filler type. - optional string type = 1 [default = 'constant']; - optional float value = 2 [default = 0]; // the value in constant filler - optional float min = 3 [default = 0]; // the min value in uniform filler - optional float max = 4 [default = 1]; // the max value in uniform filler - optional float mean = 5 [default = 0]; // the mean value in Gaussian filler - optional float std = 6 [default = 1]; // the std value in Gaussian filler - // The expected number of non-zero output weights for a given input in - // Gaussian filler -- the default -1 means don't perform sparsification. - optional int32 sparse = 7 [default = -1]; - // Normalize the filler variance by fan_in, fan_out, or their average. - // Applies to 'xavier' and 'msra' fillers. - enum VarianceNorm { - FAN_IN = 0; - FAN_OUT = 1; - AVERAGE = 2; - } - optional VarianceNorm variance_norm = 8 [default = FAN_IN]; -} - -message NetParameter { - optional string name = 1; // consider giving the network a name - // DEPRECATED. See InputParameter. The input blobs to the network. - repeated string input = 3; - // DEPRECATED. See InputParameter. The shape of the input blobs. - repeated BlobShape input_shape = 8; - - // 4D input dimensions -- deprecated. Use "input_shape" instead. - // If specified, for each input blob there should be four - // values specifying the num, channels, height and width of the input blob. - // Thus, there should be a total of (4 * #input) numbers. - repeated int32 input_dim = 4; - - // Whether the network will force every layer to carry out backward operation. - // If set False, then whether to carry out backward is determined - // automatically according to the net structure and learning rates. - optional bool force_backward = 5 [default = false]; - // The current "state" of the network, including the phase, level, and stage. - // Some layers may be included/excluded depending on this state and the states - // specified in the layers' include and exclude fields. 
- optional NetState state = 6; - - // Print debugging information about results while running Net::Forward, - // Net::Backward, and Net::Update. - optional bool debug_info = 7 [default = false]; - - // The layers that make up the net. Each of their configurations, including - // connectivity and behavior, is specified as a LayerParameter. - repeated LayerParameter layer = 100; // ID 100 so layers are printed last. - - // DEPRECATED: use 'layer' instead. - repeated V1LayerParameter layers = 2; -} - -// NOTE -// Update the next available ID when you add a new SolverParameter field. -// -// SolverParameter next available ID: 43 (last added: weights) -message SolverParameter { - ////////////////////////////////////////////////////////////////////////////// - // Specifying the train and test networks - // - // Exactly one train net must be specified using one of the following fields: - // train_net_param, train_net, net_param, net - // One or more test nets may be specified using any of the following fields: - // test_net_param, test_net, net_param, net - // If more than one test net field is specified (e.g., both net and - // test_net are specified), they will be evaluated in the field order given - // above: (1) test_net_param, (2) test_net, (3) net_param/net. - // A test_iter must be specified for each test_net. - // A test_level and/or a test_stage may also be specified for each test_net. - ////////////////////////////////////////////////////////////////////////////// - - // Proto filename for the train net, possibly combined with one or more - // test nets. - optional string net = 24; - // Inline train net param, possibly combined with one or more test nets. - optional NetParameter net_param = 25; - - optional string train_net = 1; // Proto filename for the train net. - repeated string test_net = 2; // Proto filenames for the test nets. - optional NetParameter train_net_param = 21; // Inline train net params. - repeated NetParameter test_net_param = 22; // Inline test net params. - - // The states for the train/test nets. Must be unspecified or - // specified once per net. - // - // By default, train_state will have phase = TRAIN, - // and all test_state's will have phase = TEST. - // Other defaults are set according to the NetState defaults. - optional NetState train_state = 26; - repeated NetState test_state = 27; - - // The number of iterations for each test net. - repeated int32 test_iter = 3; - - // The number of iterations between two testing phases. - optional int32 test_interval = 4 [default = 0]; - optional bool test_compute_loss = 19 [default = false]; - // If true, run an initial test pass before the first iteration, - // ensuring memory availability and printing the starting value of the loss. - optional bool test_initialization = 32 [default = true]; - optional float base_lr = 5; // The base learning rate - // the number of iterations between displaying info. If display = 0, no info - // will be displayed. - optional int32 display = 6; - // Display the loss averaged over the last average_loss iterations - optional int32 average_loss = 33 [default = 1]; - optional int32 max_iter = 7; // the maximum number of iterations - // accumulate gradients over `iter_size` x `batch_size` instances - optional int32 iter_size = 36 [default = 1]; - - // The learning rate decay policy. The currently implemented learning rate - // policies are as follows: - // - fixed: always return base_lr. 
- // - step: return base_lr * gamma ^ (floor(iter / step)) - // - exp: return base_lr * gamma ^ iter - // - inv: return base_lr * (1 + gamma * iter) ^ (- power) - // - multistep: similar to step but it allows non-uniform steps defined by - // stepvalue - // - poly: the effective learning rate follows a polynomial decay, to be - // zero by the max_iter. return base_lr (1 - iter/max_iter) ^ (power) - // - sigmoid: the effective learning rate follows a sigmoid decay - // return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize)))) - // - // where base_lr, max_iter, gamma, step, stepvalue and power are defined - // in the solver parameter protocol buffer, and iter is the current iteration. - optional string lr_policy = 8; - optional float gamma = 9; // The parameter to compute the learning rate. - optional float power = 10; // The parameter to compute the learning rate. - optional float momentum = 11; // The momentum value. - optional float weight_decay = 12; // The weight decay. - // regularization types supported: L1 and L2 - // controlled by weight_decay - optional string regularization_type = 29 [default = "L2"]; - // the stepsize for learning rate policy "step" - optional int32 stepsize = 13; - // the stepsize for learning rate policy "multistep" - repeated int32 stepvalue = 34; - - // Set clip_gradients to >= 0 to clip parameter gradients to that L2 norm, - // whenever their actual L2 norm is larger. - optional float clip_gradients = 35 [default = -1]; - - optional int32 snapshot = 14 [default = 0]; // The snapshot interval - // The prefix for the snapshot. - // If not set, it defaults to the prototxt file path without extension. - // If set to a directory, it is augmented with the prototxt file name - // without extension. - optional string snapshot_prefix = 15; - // whether to snapshot diff in the results or not. Snapshotting diff will help - // debugging but the final protocol buffer size will be much larger. - optional bool snapshot_diff = 16 [default = false]; - enum SnapshotFormat { - HDF5 = 0; - BINARYPROTO = 1; - } - optional SnapshotFormat snapshot_format = 37 [default = BINARYPROTO]; - // the mode solver will use: 0 for CPU and 1 for GPU. Use GPU by default. - enum SolverMode { - CPU = 0; - GPU = 1; - } - optional SolverMode solver_mode = 17 [default = GPU]; - // the device_id that will be used in GPU mode. Use device_id = 0 by default. - optional int32 device_id = 18 [default = 0]; - // If non-negative, the seed with which the Solver will initialize the Caffe - // random number generator -- useful for reproducible results. Otherwise, - // (and by default) initialize using a seed derived from the system clock. - optional int64 random_seed = 20 [default = -1]; - - // type of the solver - optional string type = 40 [default = "SGD"]; - - // numerical stability for RMSProp, AdaGrad, AdaDelta and Adam - optional float delta = 31 [default = 1e-8]; - // parameters for the Adam solver - optional float momentum2 = 39 [default = 0.999]; - - // RMSProp decay value - // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t) - optional float rms_decay = 38 [default = 0.99]; - - // If true, print information about the state of the net that may help with - // debugging learning problems. - optional bool debug_info = 23 [default = false]; - - // If false, don't save a snapshot after training finishes.
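The lr_policy schedules listed above map directly to code. A plain-C++ sketch of a few of the policies (illustrative only, not Caffe's implementation):

#include <cmath>
#include <cstdio>
#include <string>

// Effective learning rate for some lr_policy values described above; base_lr,
// gamma, power and stepsize correspond to the SolverParameter fields.
static float effective_lr(const std::string &policy, float base_lr, float gamma,
                          float power, int stepsize, int iter, int max_iter)
{
    if (policy == "fixed")
        return base_lr;
    if (policy == "step")
        return base_lr * std::pow(gamma, std::floor((float)iter / stepsize));
    if (policy == "exp")
        return base_lr * std::pow(gamma, (float)iter);
    if (policy == "inv")
        return base_lr * std::pow(1.0f + gamma * iter, -power);
    if (policy == "poly")
        return base_lr * std::pow(1.0f - (float)iter / max_iter, power);
    return base_lr;
}

int main()
{
    // step policy with gamma = 0.5: the rate halves every 10000 iterations,
    // so at iteration 25000 it is 0.01 * 0.5^2 = 0.0025.
    std::printf("%f\n", effective_lr("step", 0.01f, 0.5f, 0.0f, 10000, 25000, 0));
}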
- optional bool snapshot_after_train = 28 [default = true]; - - // DEPRECATED: old solver enum types, use string instead - enum SolverType { - SGD = 0; - NESTEROV = 1; - ADAGRAD = 2; - RMSPROP = 3; - ADADELTA = 4; - ADAM = 5; - } - // DEPRECATED: use type instead of solver_type - optional SolverType solver_type = 30 [default = SGD]; - - // Overlap compute and communication for data parallel training - optional bool layer_wise_reduce = 41 [default = true]; - - // Path to caffemodel file(s) with pretrained weights to initialize finetuning. - // The same as the command-line --weights parameter of the caffe train command. - // If the command-line --weights parameter is specified, it has higher priority - // and overrides this field. - // If the --snapshot command-line parameter is specified, these are ignored. - // If several model files are expected, they can be listed either in one - // weights parameter separated by ',' (like in a command string) or - // in repeated weights parameters separately. - repeated string weights = 42; -} - -// A message that stores the solver snapshots -message SolverState { - optional int32 iter = 1; // The current iteration - optional string learned_net = 2; // The file that stores the learned net. - repeated BlobProto history = 3; // The history for sgd solvers - optional int32 current_step = 4 [default = 0]; // The current step for learning rate -} - -enum Phase { - TRAIN = 0; - TEST = 1; -} - -message NetState { - optional Phase phase = 1 [default = TEST]; - optional int32 level = 2 [default = 0]; - repeated string stage = 3; -} - -message NetStateRule { - // Set phase to require the NetState have a particular phase (TRAIN or TEST) - // to meet this rule. - optional Phase phase = 1; - - // Set the minimum and/or maximum levels in which the layer should be used. - // Leave undefined to meet the rule regardless of level. - optional int32 min_level = 2; - optional int32 max_level = 3; - - // Customizable sets of stages to include or exclude. - // The net must have ALL of the specified stages and NONE of the specified - // "not_stage"s to meet the rule. - // (Use multiple NetStateRules to specify conjunctions of stages.) - repeated string stage = 4; - repeated string not_stage = 5; -} - -// Specifies training parameters (multipliers on global learning constants, -// and the name and other settings used for weight sharing). -message ParamSpec { - // The names of the parameter blobs -- useful for sharing parameters among - // layers, but never required otherwise. To share a parameter between two - // layers, give it a (non-empty) name. - optional string name = 1; - - // Whether to require shared weights to have the same shape, or just the same - // count -- defaults to STRICT if unspecified. - optional DimCheckMode share_mode = 2; - enum DimCheckMode { - // STRICT (default) requires that num, channels, height, width each match. - STRICT = 0; - // PERMISSIVE requires only the count (num*channels*height*width) to match. - PERMISSIVE = 1; - } - - // The multiplier on the global learning rate for this parameter. - optional float lr_mult = 3 [default = 1.0]; - - // The multiplier on the global weight decay for this parameter. - optional float decay_mult = 4 [default = 1.0]; -} - -// NOTE -// Update the next available ID when you add a new LayerParameter field.
-// -// LayerParameter next available layer-specific ID: 149 (last added: clip_param) -message LayerParameter { - optional string name = 1; // the layer name - optional string type = 2; // the layer type - repeated string bottom = 3; // the name of each bottom blob - repeated string top = 4; // the name of each top blob - - // The train / test phase for computation. - optional Phase phase = 10; - - // The amount of weight to assign each top blob in the objective. - // Each layer assigns a default value, usually of either 0 or 1, - // to each top blob. - repeated float loss_weight = 5; - - // Specifies training parameters (multipliers on global learning constants, - // and the name and other settings used for weight sharing). - repeated ParamSpec param = 6; - - // The blobs containing the numeric parameters of the layer. - repeated BlobProto blobs = 7; - - // Specifies whether to backpropagate to each bottom. If unspecified, - // Caffe will automatically infer whether each input needs backpropagation - // to compute parameter gradients. If set to true for some inputs, - // backpropagation to those inputs is forced; if set false for some inputs, - // backpropagation to those inputs is skipped. - // - // The size must be either 0 or equal to the number of bottoms. - repeated bool propagate_down = 11; - - // Rules controlling whether and when a layer is included in the network, - // based on the current NetState. You may specify a non-zero number of rules - // to include OR exclude, but not both. If no include or exclude rules are - // specified, the layer is always included. If the current NetState meets - // ANY (i.e., one or more) of the specified rules, the layer is - // included/excluded. - repeated NetStateRule include = 8; - repeated NetStateRule exclude = 9; - - // Parameters for data pre-processing. - optional TransformationParameter transform_param = 100; - - // Parameters shared by loss layers. - optional LossParameter loss_param = 101; - - // Layer type-specific parameters. - // - // Note: certain layers may have more than one computational engine - // for their implementation. These layers include an Engine type and - // engine parameter for selecting the implementation. - // The default for the engine is set by the ENGINE switch at compile-time. 
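Before the long list of layer-specific parameter fields below: LayerParameter and the ParamSpec multipliers above are ordinary proto2 messages, so a layer description can also be built with the protoc-generated C++ bindings instead of prototxt. A sketch, assuming the generated caffe.pb.h header is available:

#include "caffe.pb.h"  // protoc-generated from this caffe.proto (assumed)

// Build a convolution layer description programmatically.
caffe::LayerParameter make_conv_layer()
{
    caffe::LayerParameter layer;
    layer.set_name("conv1");
    layer.set_type("Convolution");
    layer.add_bottom("data");   // input blob
    layer.add_top("conv1");     // output blob

    // Per-parameter training multipliers (ParamSpec above): weights then bias.
    layer.add_param()->set_lr_mult(1.0f);
    layer.add_param()->set_lr_mult(2.0f);

    caffe::ConvolutionParameter *conv = layer.mutable_convolution_param();
    conv->set_num_output(64);
    conv->add_kernel_size(3);
    conv->add_stride(1);
    conv->add_pad(1);
    return layer;
}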
- optional AccuracyParameter accuracy_param = 102; - optional ArgMaxParameter argmax_param = 103; - optional BatchNormParameter batch_norm_param = 139; - optional BiasParameter bias_param = 141; - optional ClipParameter clip_param = 148; - optional ConcatParameter concat_param = 104; - optional ContrastiveLossParameter contrastive_loss_param = 105; - optional ConvolutionParameter convolution_param = 106; - optional CropParameter crop_param = 144; - optional DataParameter data_param = 107; - optional DropoutParameter dropout_param = 108; - optional DummyDataParameter dummy_data_param = 109; - optional EltwiseParameter eltwise_param = 110; - optional ELUParameter elu_param = 140; - optional EmbedParameter embed_param = 137; - optional ExpParameter exp_param = 111; - optional FlattenParameter flatten_param = 135; - optional HDF5DataParameter hdf5_data_param = 112; - optional HDF5OutputParameter hdf5_output_param = 113; - optional HingeLossParameter hinge_loss_param = 114; - optional ImageDataParameter image_data_param = 115; - optional InfogainLossParameter infogain_loss_param = 116; - optional InnerProductParameter inner_product_param = 117; - optional InputParameter input_param = 143; - optional LogParameter log_param = 134; - optional LRNParameter lrn_param = 118; - optional MemoryDataParameter memory_data_param = 119; - optional MVNParameter mvn_param = 120; - optional ParameterParameter parameter_param = 145; - optional PoolingParameter pooling_param = 121; - optional PowerParameter power_param = 122; - optional PReLUParameter prelu_param = 131; - optional PythonParameter python_param = 130; - optional RecurrentParameter recurrent_param = 146; - optional ReductionParameter reduction_param = 136; - optional ReLUParameter relu_param = 123; - optional ReshapeParameter reshape_param = 133; - optional ScaleParameter scale_param = 142; - optional SigmoidParameter sigmoid_param = 124; - optional SoftmaxParameter softmax_param = 125; - optional SPPParameter spp_param = 132; - optional SliceParameter slice_param = 126; - optional SwishParameter swish_param = 147; - optional TanHParameter tanh_param = 127; - optional ThresholdParameter threshold_param = 128; - optional TileParameter tile_param = 138; - optional WindowDataParameter window_data_param = 129; - - optional TransposeParameter transpose_param = 150; - optional MultiplyParameter multiply_param = 151; - optional AttentionParameter attention_param = 152; - optional GeluParameter gelu_param = 153; - optional LayerNormParameter layer_norm_param = 154; - optional MatMulParameter matmul_param = 155; - - optional PreAllocatedMemoryParameter preallocated_memory_param = 156; - optional SharedWeightParameter shared_weight_param = 157; - optional CopyParameter copy_param = 158; - optional SqueezeParameter squeeze_param = 159; - optional UnsqueezeParameter unsqueeze_param = 160; - optional AxisMeanParameter axis_mean_param = 161; - optional CheckParameter check_param = 162; - optional RepeatParameter repeat_param = 163; - optional LSTMParameter lstm_param = 164; - optional InterpParameter interp_param = 165; - optional JumpParameter jump_param = 166; - optional AttentionMaskParameter attention_mask_param = 167; - optional RelativePositionEmbedParameter relative_position_embed_param = 168; - optional RelativeShiftParameter relative_shift_param = 169; - optional PaddingParameter paddding_param = 170; - - optional PermuteParameter permute_param = 171; - optional PriorBoxParameter prior_box_param = 172; - optional DetectionOutputParameter 
detection_output_param = 173; - optional Yolov3DetectionOutputParameter yolov3_detection_output_param = 174; -} - -// Message that stores parameters used by PriorBoxLayer -message PriorBoxParameter { - // Encode/decode type. - enum CodeType { - CORNER = 1; - CENTER_SIZE = 2; - CORNER_SIZE = 3; - } - // Minimum box size (in pixels). Required! - repeated float min_size = 1; - // Maximum box size (in pixels). Required! - repeated float max_size = 2; - // Various aspect ratios. Duplicate ratios will be ignored. - // If none is provided, we use default ratio 1. - repeated float aspect_ratio = 3; - // If true, will flip each aspect ratio. - // For example, if there is aspect ratio "r", - // we will generate aspect ratio "1.0/r" as well. - optional bool flip = 4 [default = true]; - // If true, will clip the prior so that it is within [0, 1] - optional bool clip = 5 [default = false]; - // Variance for adjusting the prior bboxes. - repeated float variance = 6; - // By default, we calculate img_height, img_width, step_x, step_y based on - // bottom[0] (feat) and bottom[1] (img), unless these values are explicitly - // provided. - // Explicitly provide the img_size. - optional uint32 img_size = 7; - // Either img_size or img_h/img_w should be specified; not both. - optional uint32 img_h = 8; - optional uint32 img_w = 9; - - // Explicitly provide the step size. - optional float step = 10; - // Either step or step_h/step_w should be specified; not both. - optional float step_h = 11; - optional float step_w = 12; - - // Offset to the top left corner of each cell. - optional float offset = 13 [default = 0.5]; -} - -// Message that stores parameters used by data transformer for resize policy -message ResizeParameter { - // Probability of using this resize policy - optional float prob = 1 [default = 1]; - - enum Resize_mode { - WARP = 1; - FIT_SMALL_SIZE = 2; - FIT_LARGE_SIZE_AND_PAD = 3; - } - optional Resize_mode resize_mode = 2 [default = WARP]; - optional uint32 height = 3 [default = 0]; - optional uint32 width = 4 [default = 0]; - // A parameter used to update bbox in FIT_SMALL_SIZE mode. - optional uint32 height_scale = 8 [default = 0]; - optional uint32 width_scale = 9 [default = 0]; - - enum Pad_mode { - CONSTANT = 1; - MIRRORED = 2; - REPEAT_NEAREST = 3; - } - // Padding mode for BE_SMALL_SIZE_AND_PAD mode and object centering - optional Pad_mode pad_mode = 5 [default = CONSTANT]; - // if specified can be repeated once (would fill all the channels) - // or can be repeated the same number of times as channels - // (would use them for the corresponding channel) - repeated float pad_value = 6; - - enum Interp_mode { // Same as in OpenCV - LINEAR = 1; - AREA = 2; - NEAREST = 3; - CUBIC = 4; - LANCZOS4 = 5; - } - // interpolation for resizing - repeated Interp_mode interp_mode = 7; -} - -message NonMaximumSuppressionParameter { - // Threshold to be used in nms. - optional float nms_threshold = 1 [default = 0.3]; - // Maximum number of results to be kept. - optional int32 top_k = 2; - // Parameter for adaptive nms. - optional float eta = 3 [default = 1.0]; -} - -message SaveOutputParameter { - // Output directory. If not empty, we will save the results. - optional string output_directory = 1; - // Output name prefix. - optional string output_name_prefix = 2; - // Output format. - // VOC - PASCAL VOC output format. - // COCO - MS COCO output format. - optional string output_format = 3; - // If you want to output results, you must also provide the following two files.
- // Otherwise, we will ignore saving results. - // label map file. - optional string label_map_file = 4; - // A file which contains a list of names and sizes with same order - // of the input DB. The file is in the following format: - // name height width - // ... - optional string name_size_file = 5; - // Number of test images. It can be less than the lines specified in - // name_size_file. For example, when we only want to evaluate on part - // of the test images. - optional uint32 num_test_image = 6; - // The resize parameter used in saving the data. - optional ResizeParameter resize_param = 7; -} - -// Message that stores parameters used by DetectionOutputLayer -message DetectionOutputParameter { - // Number of classes to be predicted. Required! - optional uint32 num_classes = 1; - // If true, bounding boxes are shared among different classes. - optional bool share_location = 2 [default = true]; - // Background label id. If there is no background class, - // set it as -1. - optional int32 background_label_id = 3 [default = 0]; - // Parameters used for non maximum suppression. - optional NonMaximumSuppressionParameter nms_param = 4; - // Parameters used for saving detection results. - optional SaveOutputParameter save_output_param = 5; - // Type of coding method for bbox. - optional PriorBoxParameter.CodeType code_type = 6 [default = CORNER]; - // If true, variance is encoded in target; otherwise we need to adjust the - // predicted offset accordingly. - optional bool variance_encoded_in_target = 8 [default = false]; - // Number of total bboxes to be kept per image after nms step. - // -1 means keeping all bboxes after nms step. - optional int32 keep_top_k = 7 [default = -1]; - // Only consider detections whose confidences are larger than a threshold. - // If not provided, consider all boxes. - optional float confidence_threshold = 9; - // If true, visualize the detection results. - optional bool visualize = 10 [default = false]; - // The threshold used to visualize the detection results. - optional float visualize_threshold = 11; - // If provided, save outputs to video file. - optional string save_file = 12; -} - -message Yolov3DetectionOutputParameter { - // Yolov3 detection output layer - optional uint32 num_classes = 1 [default = 20]; - optional uint32 num_box = 2 [default = 3]; - optional float confidence_threshold = 3 [default = 0.01]; - optional float nms_threshold = 4 [default = 0.45]; - repeated float biases = 5; - repeated uint32 anchors_scale = 6; - optional uint32 mask_group_num = 7 [default = 2]; - repeated uint32 mask = 8; -} - -message PermuteParameter { - // The new orders of the axes of data. Notice it should be within - // the same range as the input data, and it starts from 0. - // Do not provide repeated order.
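The DetectionOutputParameter and NonMaximumSuppressionParameter fields above describe the standard greedy NMS step followed by a keep_top_k cut. A self-contained sketch of that step (illustrative, not the Caffe implementation):

#include <algorithm>
#include <vector>

struct Box { float x1, y1, x2, y2, score; };

static float iou(const Box &a, const Box &b)
{
    float ix = std::max(0.0f, std::min(a.x2, b.x2) - std::max(a.x1, b.x1));
    float iy = std::max(0.0f, std::min(a.y2, b.y2) - std::max(a.y1, b.y1));
    float inter = ix * iy;
    float uni = (a.x2 - a.x1) * (a.y2 - a.y1)
              + (b.x2 - b.x1) * (b.y2 - b.y1) - inter;
    return uni > 0 ? inter / uni : 0.0f;
}

// Keep boxes greedily by descending score, dropping any box whose overlap with
// an already-kept box exceeds nms_threshold, then truncate to keep_top_k
// (-1 keeps everything), mirroring the proto fields above.
std::vector<Box> nms(std::vector<Box> boxes, float nms_threshold, int keep_top_k)
{
    std::sort(boxes.begin(), boxes.end(),
              [](const Box &a, const Box &b) { return a.score > b.score; });
    std::vector<Box> kept;
    for (const Box &b : boxes) {
        bool suppressed = false;
        for (const Box &k : kept)
            if (iou(b, k) > nms_threshold) { suppressed = true; break; }
        if (!suppressed) kept.push_back(b);
    }
    if (keep_top_k >= 0 && (int)kept.size() > keep_top_k) kept.resize(keep_top_k);
    return kept;
}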
- repeated uint32 order = 1; -} - -message PreAllocatedMemoryParameter { - enum DataType { - FLOAT32 = 0; - INT32 = 1; - UINT32 = 2; - } - optional DataType data_type = 1; - optional BlobShape shape = 2; -} - -message SharedWeightParameter { - enum DataType { - FLOAT32 = 0; - INT32 = 1; - UINT32 = 2; - } - optional DataType data_type = 1; - optional BlobShape shape = 2; -} - -message CopyParameter { - optional int32 src_batch_stride = 1; - optional int32 src_stride = 2; - optional int32 src_offset = 3; - optional int32 dst_batch_stride = 4; - optional int32 dst_stride = 5; - optional int32 dst_offset = 6; - optional int32 length = 7; -} - -message SqueezeParameter { - optional int32 axis = 1; -} - -message UnsqueezeParameter { - optional int32 axis = 1; -} - -message AxisMeanParameter { - optional int32 axis = 1; -} - -message CheckParameter { - enum CheckOp { - EQUAL = 0; - GREAT = 1; - GREATEQUAL = 2; - } - optional CheckOp operation = 1; -} - -message RepeatParameter { - optional int32 loops = 1 [default = 0]; - optional int32 axis = 2 [default = -1]; -} - -message LSTMParameter { - optional uint32 num_output = 1; - optional int32 steps = 2 [default = -1]; - optional int32 num_proj = 3 [default = 0]; - optional float zoneout_cell = 4 [default = 0]; - optional float zoneout_output = 5 [default = 0]; -} - -message InterpParameter { - required uint32 height = 1; - required uint32 width = 2; -} - -message JumpParameter { -} - -message AttentionMaskParameter { - optional int32 attention_length = 1 [default = -1]; - optional bool same_length = 2 [default = false]; - optional float mask = 3 [default = 10000]; -} - -message RelativePositionEmbedParameter { - optional uint32 num_output = 1; // The number of outputs for the layer - optional uint32 input_dim = 2; - optional bool bias_term = 3 [default = true]; // Whether to use a bias term - optional FillerParameter weight_filler = 4; // The filler for the weight - optional FillerParameter bias_filler = 5; // The filler for the bias - optional bool transpose = 6 [default = false]; // Whether to use transpose dict - optional int32 axis = 7 [default = 1]; -} - -message RelativeShiftParameter { - optional int32 axis = 1 [default = 1]; - optional int32 shift_length = 2 [default = 1]; -} - -message PaddingParameter { - repeated int32 shape = 1; - repeated int32 value = 2; -} - -// Message that stores parameters used to apply transformation -// to the data layer's data -message TransformationParameter { - // For data pre-processing, we can do simple scaling and subtracting the - // data mean, if provided. Note that the mean subtraction is always carried - // out before scaling. - optional float scale = 1 [default = 1]; - // Specify if we want to randomly mirror data. - optional bool mirror = 2 [default = false]; - // Specify if we would like to randomly crop an image. - optional uint32 crop_size = 3 [default = 0]; - // mean_file and mean_value cannot be specified at the same time - optional string mean_file = 4; - // if specified can be repeated once (would subtract it from all the channels) - // or can be repeated the same number of times as channels - // (would subtract them from the corresponding channel) - repeated float mean_value = 5; - // Force the decoded image to have 3 color channels. - optional bool force_color = 6 [default = false]; - // Force the decoded image to have 1 color channels. 
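The TransformationParameter comment above pins down the preprocessing order: mean subtraction always happens first, then scaling. Per pixel that is a single expression; a trivial sketch:

// y = scale * (x - mean): mean subtraction before scaling, per the comment above.
inline float transform_pixel(float x, float mean, float scale)
{
    return scale * (x - mean);
}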
- optional bool force_gray = 7 [default = false]; -} - -// Message that stores parameters shared by loss layers -message LossParameter { - // If specified, ignore instances with the given label. - optional int32 ignore_label = 1; - // How to normalize the loss for loss layers that aggregate across batches, - // spatial dimensions, or other dimensions. Currently only implemented in - // SoftmaxWithLoss and SigmoidCrossEntropyLoss layers. - enum NormalizationMode { - // Divide by the number of examples in the batch times spatial dimensions. - // Outputs that receive the ignore label will NOT be ignored in computing - // the normalization factor. - FULL = 0; - // Divide by the total number of output locations that do not take the - // ignore_label. If ignore_label is not set, this behaves like FULL. - VALID = 1; - // Divide by the batch size. - BATCH_SIZE = 2; - // Do not normalize the loss. - NONE = 3; - } - // For historical reasons, the default normalization for - // SigmoidCrossEntropyLoss is BATCH_SIZE and *not* VALID. - optional NormalizationMode normalization = 3 [default = VALID]; - // Deprecated. Ignored if normalization is specified. If normalization - // is not specified, then setting this to false will be equivalent to - // normalization = BATCH_SIZE to be consistent with previous behavior. - optional bool normalize = 2; -} - -// Messages that store parameters used by individual layer types follow, in -// alphabetical order. - -message AccuracyParameter { - // When computing accuracy, count as correct by comparing the true label to - // the top k scoring classes. By default, only compare to the top scoring - // class (i.e. argmax). - optional uint32 top_k = 1 [default = 1]; - - // The "label" axis of the prediction blob, whose argmax corresponds to the - // predicted label -- may be negative to index from the end (e.g., -1 for the - // last axis). For example, if axis == 1 and the predictions are - // (N x C x H x W), the label blob is expected to contain N*H*W ground truth - // labels with integer values in {0, 1, ..., C-1}. - optional int32 axis = 2 [default = 1]; - - // If specified, ignore instances with the given label. - optional int32 ignore_label = 3; -} - -message ArgMaxParameter { - // If true produce pairs (argmax, maxval) - optional bool out_max_val = 1 [default = false]; - optional uint32 top_k = 2 [default = 1]; - // The axis along which to maximise -- may be negative to index from the - // end (e.g., -1 for the last axis). - // By default ArgMaxLayer maximizes over the flattened trailing dimensions - // for each index of the first / num dimension. - optional int32 axis = 3; -} - -// Message that stores parameters used by ClipLayer -message ClipParameter { - required float min = 1; - required float max = 2; -} - -message ConcatParameter { - // The axis along which to concatenate -- may be negative to index from the - // end (e.g., -1 for the last axis). Other axes must have the - // same dimension for all the bottom blobs. - // By default, ConcatLayer concatenates blobs along the "channels" axis (1). - optional int32 axis = 2 [default = 1]; - - // DEPRECATED: alias for "axis" -- does not support negative indexing. - optional uint32 concat_dim = 1 [default = 1]; -} - -message BatchNormParameter { - // If false, normalization is performed over the current mini-batch - // and global statistics are accumulated (but not yet used) by a moving - // average. - // If true, those accumulated mean and variance values are used for the - // normalization. 
- // By default, it is set to false when the network is in the training - // phase and true when the network is in the testing phase. - optional bool use_global_stats = 1; - // What fraction of the moving average remains each iteration? - // Smaller values make the moving average decay faster, giving more - // weight to the recent values. - // Each iteration updates the moving average @f$S_{t-1}@f$ with the - // current mean @f$ Y_t @f$ by - // @f$ S_t = (1-\beta)Y_t + \beta \cdot S_{t-1} @f$, where @f$ \beta @f$ - // is the moving_average_fraction parameter. - optional float moving_average_fraction = 2 [default = .999]; - // Small value to add to the variance estimate so that we don't divide by - // zero. - optional float eps = 3 [default = 1e-5]; - - // on which dim to do batch normalization - optional int32 axis = 4 [default = 1]; -} - -message BiasParameter { - // The first axis of bottom[0] (the first input Blob) along which to apply - // bottom[1] (the second input Blob). May be negative to index from the end - // (e.g., -1 for the last axis). - // - // For example, if bottom[0] is 4D with shape 100x3x40x60, the output - // top[0] will have the same shape, and bottom[1] may have any of the - // following shapes (for the given value of axis): - // (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60 - // (axis == 1 == -3) 3; 3x40; 3x40x60 - // (axis == 2 == -2) 40; 40x60 - // (axis == 3 == -1) 60 - // Furthermore, bottom[1] may have the empty shape (regardless of the value of - // "axis") -- a scalar bias. - optional int32 axis = 1 [default = 1]; - - // (num_axes is ignored unless just one bottom is given and the bias is - // a learned parameter of the layer. Otherwise, num_axes is determined by the - // number of axes by the second bottom.) - // The number of axes of the input (bottom[0]) covered by the bias - // parameter, or -1 to cover all axes of bottom[0] starting from `axis`. - // Set num_axes := 0, to add a zero-axis Blob: a scalar. - optional int32 num_axes = 2 [default = 1]; - - // (filler is ignored unless just one bottom is given and the bias is - // a learned parameter of the layer.) - // The initialization for the learned bias parameter. - // Default is the zero (0) initialization, resulting in the BiasLayer - // initially performing the identity operation. - optional FillerParameter filler = 3; -} - -message ContrastiveLossParameter { - // margin for dissimilar pair - optional float margin = 1 [default = 1.0]; - // The first implementation of this cost did not exactly match the cost of - // Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2. - // legacy_version = false (the default) uses (margin - d)^2 as proposed in the - // Hadsell paper. New models should probably use this version. - // legacy_version = true uses (margin - d^2). This is kept to support / - // reproduce existing models and results - optional bool legacy_version = 2 [default = false]; -} - -message ConvolutionParameter { - optional uint32 num_output = 1; // The number of outputs for the layer - optional bool bias_term = 2 [default = true]; // whether to have bias terms - - // Pad, kernel size, and stride are all given as a single value for equal - // dimensions in all spatial dimensions, or once per spatial dimension. - repeated uint32 pad = 3; // The padding size; defaults to 0 - repeated uint32 kernel_size = 4; // The kernel size - repeated uint32 stride = 6; // The stride; defaults to 1 - // Factor used to dilate the kernel, (implicitly) zero-filling the resulting - // holes. 
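The moving-average update quoted in the BatchNormParameter comment above, written out as code:

// S_t = (1 - beta) * Y_t + beta * S_{t-1}, where beta is
// moving_average_fraction (default 0.999); smaller beta decays the
// accumulated statistics faster, weighting recent batches more.
inline float update_moving_average(float s_prev, float y_t, float beta)
{
    return (1.0f - beta) * y_t + beta * s_prev;
}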
(Kernel dilation is sometimes referred to by its use in the - // algorithme à trous from Holschneider et al. 1987.) - repeated uint32 dilation = 18; // The dilation; defaults to 1 - - // For 2D convolution only, the *_h and *_w versions may also be used to - // specify both spatial dimensions. - optional uint32 pad_h = 9 [default = 0]; // The padding height (2D only) - optional uint32 pad_w = 10 [default = 0]; // The padding width (2D only) - optional uint32 kernel_h = 11; // The kernel height (2D only) - optional uint32 kernel_w = 12; // The kernel width (2D only) - optional uint32 stride_h = 13; // The stride height (2D only) - optional uint32 stride_w = 14; // The stride width (2D only) - - optional uint32 group = 5 [default = 1]; // The group size for group conv - - optional FillerParameter weight_filler = 7; // The filler for the weight - optional FillerParameter bias_filler = 8; // The filler for the bias - enum Engine { - DEFAULT = 0; - CAFFE = 1; - CUDNN = 2; - } - optional Engine engine = 15 [default = DEFAULT]; - - // The axis to interpret as "channels" when performing convolution. - // Preceding dimensions are treated as independent inputs; - // succeeding dimensions are treated as "spatial". - // With (N, C, H, W) inputs, and axis == 1 (the default), we perform - // N independent 2D convolutions, sliding C-channel (or (C/g)-channels, for - // groups g>1) filters across the spatial axes (H, W) of the input. - // With (N, C, D, H, W) inputs, and axis == 1, we perform - // N independent 3D convolutions, sliding (C/g)-channels - // filters across the spatial axes (D, H, W) of the input. - optional int32 axis = 16 [default = 1]; - - // Whether to force use of the general ND convolution, even if a specific - // implementation for blobs of the appropriate number of spatial dimensions - // is available. (Currently, there is only a 2D-specific convolution - // implementation; for input blobs with num_axes != 2, this option is - // ignored and the ND implementation will be used.) - optional bool force_nd_im2col = 17 [default = false]; -} - -message CropParameter { - // To crop, elements of the first bottom are selected to fit the dimensions - // of the second, reference bottom. The crop is configured by - // - the crop `axis` to pick the dimensions for cropping - // - the crop `offset` to set the shift for all/each dimension - // to align the cropped bottom with the reference bottom. - // All dimensions up to but excluding `axis` are preserved, while - // the dimensions including and trailing `axis` are cropped. - // If only one `offset` is set, then all dimensions are offset by this amount. - // Otherwise, the number of offsets must equal the number of cropped axes to - // shift the crop in each dimension accordingly. - // Note: standard dimensions are N,C,H,W so the default is a spatial crop, - // and `axis` may be negative to index from the end (e.g., -1 for the last - // axis). - optional int32 axis = 1 [default = 2]; - repeated uint32 offset = 2; -} - -message DataParameter { - enum DB { - LEVELDB = 0; - LMDB = 1; - } - // Specify the data source. - optional string source = 1; - // Specify the batch size. - optional uint32 batch_size = 4; - // The rand_skip variable is for the data layer to skip a few data points - // to avoid all asynchronous sgd clients to start at the same point. The skip - // point would be set as rand_skip * rand(0,1). Note that rand_skip should not - // be larger than the number of keys in the database. - // DEPRECATED. 
Each solver accesses a different subset of the database. - optional uint32 rand_skip = 7 [default = 0]; - optional DB backend = 8 [default = LEVELDB]; - // DEPRECATED. See TransformationParameter. For data pre-processing, we can do - // simple scaling and subtracting the data mean, if provided. Note that the - // mean subtraction is always carried out before scaling. - optional float scale = 2 [default = 1]; - optional string mean_file = 3; - // DEPRECATED. See TransformationParameter. Specify if we would like to randomly - // crop an image. - optional uint32 crop_size = 5 [default = 0]; - // DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror - // data. - optional bool mirror = 6 [default = false]; - // Force the encoded image to have 3 color channels - optional bool force_encoded_color = 9 [default = false]; - // Prefetch queue (Increase if data feeding bandwidth varies, within the - // limit of device memory for GPU training) - optional uint32 prefetch = 10 [default = 4]; -} - -message DropoutParameter { - optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio -} - -// DummyDataLayer fills any number of arbitrarily shaped blobs with random -// (or constant) data generated by "Fillers" (see "message FillerParameter"). -message DummyDataParameter { - // This layer produces N >= 1 top blobs. DummyDataParameter must specify 1 or N - // shape fields, and 0, 1 or N data_fillers. - // - // If 0 data_fillers are specified, ConstantFiller with a value of 0 is used. - // If 1 data_filler is specified, it is applied to all top blobs. If N are - // specified, the ith is applied to the ith top blob. - repeated FillerParameter data_filler = 1; - repeated BlobShape shape = 6; - - // 4D dimensions -- deprecated. Use "shape" instead. - repeated uint32 num = 2; - repeated uint32 channels = 3; - repeated uint32 height = 4; - repeated uint32 width = 5; -} - -message EltwiseParameter { - enum EltwiseOp { - PROD = 0; - SUM = 1; - MAX = 2; - } - optional EltwiseOp operation = 1 [default = SUM]; // element-wise operation - repeated float coeff = 2; // blob-wise coefficient for SUM operation - - // Whether to use an asymptotically slower (for >2 inputs) but stabler method - // of computing the gradient for the PROD operation. (No effect for SUM op.) - optional bool stable_prod_grad = 3 [default = true]; -} - -// Message that stores parameters used by ELULayer -message ELUParameter { - // Described in: - // Clevert, D.-A., Unterthiner, T., & Hochreiter, S. (2015). Fast and Accurate - // Deep Network Learning by Exponential Linear Units (ELUs). arXiv - optional float alpha = 1 [default = 1]; -} - -// Message that stores parameters used by EmbedLayer -message EmbedParameter { - optional uint32 num_output = 1; // The number of outputs for the layer - // The input is given as integers to be interpreted as one-hot - // vector indices with dimension num_input. Hence num_input should be - // 1 greater than the maximum possible input value. - optional uint32 input_dim = 2; - - optional bool bias_term = 3 [default = true]; // Whether to use a bias term - optional FillerParameter weight_filler = 4; // The filler for the weight - optional FillerParameter bias_filler = 5; // The filler for the bias - optional bool transpose = 6 [default = false]; // Whether to use transpose dict -} - -// Message that stores parameters used by ExpLayer -message ExpParameter { - // ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0. 
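A numeric sketch of the ExpLayer rule that begins above and is completed just below (the default base of -1 selects the natural base e):

#include <cmath>
#include <cstdio>

// y = base^(shift + scale*x); base = -1 means natural base e.
static double exp_layer(double x, double base, double scale, double shift)
{
    double z = shift + scale * x;
    return base < 0 ? std::exp(z) : std::pow(base, z);
}

int main()
{
    std::printf("%f\n", exp_layer(1.0, -1.0, 1.0, 0.0));  // prints e ~ 2.718282
}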
- // Or if base is set to the default (-1), base is set to e, - // so y = exp(shift + scale * x). - optional float base = 1 [default = -1.0]; - optional float scale = 2 [default = 1.0]; - optional float shift = 3 [default = 0.0]; -} - -/// Message that stores parameters used by FlattenLayer -message FlattenParameter { - // The first axis to flatten: all preceding axes are retained in the output. - // May be negative to index from the end (e.g., -1 for the last axis). - optional int32 axis = 1 [default = 1]; - - // The last axis to flatten: all following axes are retained in the output. - // May be negative to index from the end (e.g., the default -1 for the last - // axis). - optional int32 end_axis = 2 [default = -1]; -} - -// Message that stores parameters used by HDF5DataLayer -message HDF5DataParameter { - // Specify the data source. - optional string source = 1; - // Specify the batch size. - optional uint32 batch_size = 2; - - // Specify whether to shuffle the data. - // If shuffle == true, the ordering of the HDF5 files is shuffled, - // and the ordering of data within any given HDF5 file is shuffled, - // but data between different files are not interleaved; all of a file's - // data are output (in a random order) before moving onto another file. - optional bool shuffle = 3 [default = false]; -} - -message HDF5OutputParameter { - optional string file_name = 1; -} - -message HingeLossParameter { - enum Norm { - L1 = 1; - L2 = 2; - } - // Specify the Norm to use L1 or L2 - optional Norm norm = 1 [default = L1]; -} - -message ImageDataParameter { - // Specify the data source. - optional string source = 1; - // Specify the batch size. - optional uint32 batch_size = 4 [default = 1]; - // The rand_skip variable is for the data layer to skip a few data points - // to avoid all asynchronous sgd clients to start at the same point. The skip - // point would be set as rand_skip * rand(0,1). Note that rand_skip should not - // be larger than the number of keys in the database. - optional uint32 rand_skip = 7 [default = 0]; - // Whether or not ImageLayer should shuffle the list of files at every epoch. - optional bool shuffle = 8 [default = false]; - // It will also resize images if new_height or new_width are not zero. - optional uint32 new_height = 9 [default = 0]; - optional uint32 new_width = 10 [default = 0]; - // Specify if the images are color or gray - optional bool is_color = 11 [default = true]; - // DEPRECATED. See TransformationParameter. For data pre-processing, we can do - // simple scaling and subtracting the data mean, if provided. Note that the - // mean subtraction is always carried out before scaling. - optional float scale = 2 [default = 1]; - optional string mean_file = 3; - // DEPRECATED. See TransformationParameter. Specify if we would like to randomly - // crop an image. - optional uint32 crop_size = 5 [default = 0]; - // DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror - // data. - optional bool mirror = 6 [default = false]; - optional string root_folder = 12 [default = ""]; -} - -message InfogainLossParameter { - // Specify the infogain matrix source. 
- optional string source = 1; - optional int32 axis = 2 [default = 1]; // axis of prob -} - -message InnerProductParameter { - optional uint32 num_output = 1; // The number of outputs for the layer - optional bool bias_term = 2 [default = true]; // whether to have bias terms - optional FillerParameter weight_filler = 3; // The filler for the weight - optional FillerParameter bias_filler = 4; // The filler for the bias - - // The first axis to be lumped into a single inner product computation; - // all preceding axes are retained in the output. - // May be negative to index from the end (e.g., -1 for the last axis). - optional int32 axis = 5 [default = 1]; - // Specify whether to transpose the weight matrix or not. - // If transpose == true, any operations will be performed on the transpose - // of the weight matrix. The weight matrix itself is not going to be transposed - // but rather the transfer flag of operations will be toggled accordingly. - optional bool transpose = 6 [default = false]; -} - -message InputParameter { - // This layer produces N >= 1 top blob(s) to be assigned manually. - // Define N shapes to set a shape for each top. - // Define 1 shape to set the same shape for every top. - // Define no shape to defer to reshaping manually. - repeated BlobShape shape = 1; -} - -// Message that stores parameters used by LogLayer -message LogParameter { - // LogLayer computes outputs y = log_base(shift + scale * x), for base > 0. - // Or if base is set to the default (-1), base is set to e, - // so y = ln(shift + scale * x) = log_e(shift + scale * x) - optional float base = 1 [default = -1.0]; - optional float scale = 2 [default = 1.0]; - optional float shift = 3 [default = 0.0]; -} - -// Message that stores parameters used by LRNLayer -message LRNParameter { - optional uint32 local_size = 1 [default = 5]; - optional float alpha = 2 [default = 1.]; - optional float beta = 3 [default = 0.75]; - enum NormRegion { - ACROSS_CHANNELS = 0; - WITHIN_CHANNEL = 1; - } - optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS]; - optional float k = 5 [default = 1.]; - enum Engine { - DEFAULT = 0; - CAFFE = 1; - CUDNN = 2; - } - optional Engine engine = 6 [default = DEFAULT]; -} - -message MemoryDataParameter { - optional uint32 batch_size = 1; - optional uint32 channels = 2; - optional uint32 height = 3; - optional uint32 width = 4; -} - -message MVNParameter { - // This parameter can be set to false to normalize mean only - optional bool normalize_variance = 1 [default = true]; - - // This parameter can be set to true to perform DNN-like MVN - optional bool across_channels = 2 [default = false]; - - // Epsilon for not dividing by zero while normalizing variance - optional float eps = 3 [default = 1e-9]; -} - -message ParameterParameter { - optional BlobShape shape = 1; -} - -message PoolingParameter { - enum PoolMethod { - MAX = 0; - AVE = 1; - STOCHASTIC = 2; - } - optional PoolMethod pool = 1 [default = MAX]; // The pooling method - // Pad, kernel size, and stride are all given as a single value for equal - // dimensions in height and width or as Y, X pairs. 
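For the pooling geometry described in the comment above, the output extent per spatial dimension follows the usual formula; whether the division rounds up or down is the round_mode field defined a few lines below. A sketch, assuming that conventional formula:

#include <cmath>

// Pooled output size for one spatial dimension from the pad/kernel/stride
// fields above: (in + 2*pad - kernel) / stride, rounded, plus one position.
inline int pooled_size(int in, int kernel, int pad, int stride, bool use_ceil)
{
    float v = (float)(in + 2 * pad - kernel) / stride;
    return (use_ceil ? (int)std::ceil(v) : (int)std::floor(v)) + 1;
}
// e.g. pooled_size(8, 3, 0, 2, true) == 4, while floor rounding gives 3.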
- optional uint32 pad = 4 [default = 0]; // The padding size (equal in Y, X) - optional uint32 pad_h = 9 [default = 0]; // The padding height - optional uint32 pad_w = 10 [default = 0]; // The padding width - optional uint32 kernel_size = 2; // The kernel size (square) - optional uint32 kernel_h = 5; // The kernel height - optional uint32 kernel_w = 6; // The kernel width - optional uint32 stride = 3 [default = 1]; // The stride (equal in Y, X) - optional uint32 stride_h = 7; // The stride height - optional uint32 stride_w = 8; // The stride width - enum Engine { - DEFAULT = 0; - CAFFE = 1; - CUDNN = 2; - } - optional Engine engine = 11 [default = DEFAULT]; - // If global_pooling then it will pool over the size of the bottom by doing - // kernel_h = bottom->height and kernel_w = bottom->width - optional bool global_pooling = 12 [default = false]; - // How to calculate the output size - using ceil (default) or floor rounding. - enum RoundMode { - CEIL = 0; - FLOOR = 1; - } - optional RoundMode round_mode = 13 [default = CEIL]; -} - -message PowerParameter { - // PowerLayer computes outputs y = (shift + scale * x) ^ power. - optional float power = 1 [default = 1.0]; - optional float scale = 2 [default = 1.0]; - optional float shift = 3 [default = 0.0]; -} - -message PythonParameter { - optional string module = 1; - optional string layer = 2; - // This value is set to the attribute `param_str` of the `PythonLayer` object - // in Python before calling the `setup()` method. This could be a number, - // string, dictionary in Python dict format, JSON, etc. You may parse this - // string in `setup` method and use it in `forward` and `backward`. - optional string param_str = 3 [default = '']; - // DEPRECATED - optional bool share_in_parallel = 4 [default = false]; -} - -// Message that stores parameters used by RecurrentLayer -message RecurrentParameter { - // The dimension of the output (and usually hidden state) representation -- - // must be explicitly set to non-zero. - optional uint32 num_output = 1 [default = 0]; - - optional FillerParameter weight_filler = 2; // The filler for the weight - optional FillerParameter bias_filler = 3; // The filler for the bias - - // Whether to enable displaying debug_info in the unrolled recurrent net. - optional bool debug_info = 4 [default = false]; - - // Whether to add as additional inputs (bottoms) the initial hidden state - // blobs, and add as additional outputs (tops) the final timestep hidden state - // blobs. The number of additional bottom/top blobs required depends on the - // recurrent architecture -- e.g., 1 for RNNs, 2 for LSTMs. - optional bool expose_hidden = 5 [default = false]; -} - -// Message that stores parameters used by ReductionLayer -message ReductionParameter { - enum ReductionOp { - SUM = 1; - ASUM = 2; - SUMSQ = 3; - MEAN = 4; - } - - optional ReductionOp operation = 1 [default = SUM]; // reduction operation - - // The first axis to reduce to a scalar -- may be negative to index from the - // end (e.g., -1 for the last axis). - // (Currently, only reduction along ALL "tail" axes is supported; reduction - // of axis M through N, where N < num_axes - 1, is unsupported.) - // Suppose we have an n-axis bottom Blob with shape: - // (d0, d1, d2, ..., d(m-1), dm, d(m+1), ..., d(n-1)). - // If axis == m, the output Blob will have shape - // (d0, d1, d2, ..., d(m-1)), - // and the ReductionOp operation is performed (d0 * d1 * d2 * ... * d(m-1)) - // times, each including (dm * d(m+1) * ... * d(n-1)) individual data. 
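A worked instance of the tail-axis reduction shape rule spelled out above:

#include <vector>

// Keep the dimensions before `axis`; everything from `axis` onward is reduced.
std::vector<int> reduced_shape(const std::vector<int> &bottom, int axis)
{
    return std::vector<int>(bottom.begin(), bottom.begin() + axis);
}
// reduced_shape({2, 3, 4, 5}, 2) -> {2, 3}: the ReductionOp runs 2*3 = 6 times,
// each pass covering 4*5 = 20 values, matching the comment above.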
- // If axis == 0 (the default), the output Blob always has the empty shape - // (count 1), performing reduction across the entire input -- - // often useful for creating new loss functions. - optional int32 axis = 2 [default = 0]; - - optional float coeff = 3 [default = 1.0]; // coefficient for output - - optional bool keep_dim = 4 [default = false]; -} - -// Message that stores parameters used by ReLULayer -message ReLUParameter { - // Allow non-zero slope for negative inputs to speed up optimization - // Described in: - // Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities - // improve neural network acoustic models. In ICML Workshop on Deep Learning - // for Audio, Speech, and Language Processing. - optional float negative_slope = 1 [default = 0]; - enum Engine { - DEFAULT = 0; - CAFFE = 1; - CUDNN = 2; - } - optional Engine engine = 2 [default = DEFAULT]; -} - -message ReshapeParameter { - // Specify the output dimensions. If some of the dimensions are set to 0, - // the corresponding dimension from the bottom layer is used (unchanged). - // Exactly one dimension may be set to -1, in which case its value is - // inferred from the count of the bottom blob and the remaining dimensions. - // For example, suppose we want to reshape a 2D blob "input" with shape 2 x 8: - // - // layer { - // type: "Reshape" bottom: "input" top: "output" - // reshape_param { ... } - // } - // - // If "input" is 2D with shape 2 x 8, then the following reshape_param - // specifications are all equivalent, producing a 3D blob "output" with shape - // 2 x 2 x 4: - // - // reshape_param { shape { dim: 2 dim: 2 dim: 4 } } - // reshape_param { shape { dim: 0 dim: 2 dim: 4 } } - // reshape_param { shape { dim: 0 dim: 2 dim: -1 } } - // reshape_param { shape { dim: 0 dim:-1 dim: 4 } } - // - optional BlobShape shape = 1; - - // axis and num_axes control the portion of the bottom blob's shape that are - // replaced by (included in) the reshape. By default (axis == 0 and - // num_axes == -1), the entire bottom blob shape is included in the reshape, - // and hence the shape field must specify the entire output shape. - // - // axis may be non-zero to retain some portion of the beginning of the input - // shape (and may be negative to index from the end; e.g., -1 to begin the - // reshape after the last axis, including nothing in the reshape, - // -2 to include only the last axis, etc.). - // - // For example, suppose "input" is a 2D blob with shape 2 x 8. - // Then the following ReshapeLayer specifications are all equivalent, - // producing a blob "output" with shape 2 x 2 x 4: - // - // reshape_param { shape { dim: 2 dim: 2 dim: 4 } } - // reshape_param { shape { dim: 2 dim: 4 } axis: 1 } - // reshape_param { shape { dim: 2 dim: 4 } axis: -3 } - // - // num_axes specifies the extent of the reshape. - // If num_axes >= 0 (and axis >= 0), the reshape will be performed only on - // input axes in the range [axis, axis+num_axes]. - // num_axes may also be -1, the default, to include all remaining axes - // (starting from axis). - // - // For example, suppose "input" is a 2D blob with shape 2 x 8. - // Then the following ReshapeLayer specifications are equivalent, - // producing a blob "output" with shape 1 x 2 x 8. 
- // - // reshape_param { shape { dim: 1 dim: 2 dim: 8 } } - // reshape_param { shape { dim: 1 dim: 2 } num_axes: 1 } - // reshape_param { shape { dim: 1 } num_axes: 0 } - // - // On the other hand, these would produce output blob shape 2 x 1 x 8: - // - // reshape_param { shape { dim: 2 dim: 1 dim: 8 } } - // reshape_param { shape { dim: 1 } axis: 1 num_axes: 0 } - // - optional int32 axis = 2 [default = 0]; - optional int32 num_axes = 3 [default = -1]; -} - -message ScaleParameter { - // The first axis of bottom[0] (the first input Blob) along which to apply - // bottom[1] (the second input Blob). May be negative to index from the end - // (e.g., -1 for the last axis). - // - // For example, if bottom[0] is 4D with shape 100x3x40x60, the output - // top[0] will have the same shape, and bottom[1] may have any of the - // following shapes (for the given value of axis): - // (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60 - // (axis == 1 == -3) 3; 3x40; 3x40x60 - // (axis == 2 == -2) 40; 40x60 - // (axis == 3 == -1) 60 - // Furthermore, bottom[1] may have the empty shape (regardless of the value of - // "axis") -- a scalar multiplier. - optional int32 axis = 1 [default = 1]; - - // (num_axes is ignored unless just one bottom is given and the scale is - // a learned parameter of the layer. Otherwise, num_axes is determined by the - // number of axes by the second bottom.) - // The number of axes of the input (bottom[0]) covered by the scale - // parameter, or -1 to cover all axes of bottom[0] starting from `axis`. - // Set num_axes := 0, to multiply with a zero-axis Blob: a scalar. - optional int32 num_axes = 2 [default = 1]; - - // (filler is ignored unless just one bottom is given and the scale is - // a learned parameter of the layer.) - // The initialization for the learned scale parameter. - // Default is the unit (1) initialization, resulting in the ScaleLayer - // initially performing the identity operation. - optional FillerParameter filler = 3; - - // Whether to also learn a bias (equivalent to a ScaleLayer+BiasLayer, but - // may be more efficient). Initialized with bias_filler (defaults to 0). - optional bool bias_term = 4 [default = false]; - optional FillerParameter bias_filler = 5; -} - -message SigmoidParameter { - enum Engine { - DEFAULT = 0; - CAFFE = 1; - CUDNN = 2; - } - optional Engine engine = 1 [default = DEFAULT]; -} - -message SliceParameter { - // The axis along which to slice -- may be negative to index from the end - // (e.g., -1 for the last axis). - // By default, SliceLayer concatenates blobs along the "channels" axis (1). - optional int32 axis = 3 [default = 1]; - repeated int32 slice_point = 2; - - // DEPRECATED: alias for "axis" -- does not support negative indexing. - optional uint32 slice_dim = 1 [default = 1]; -} - -// Message that stores parameters used by SoftmaxLayer, SoftmaxWithLossLayer -message SoftmaxParameter { - enum Engine { - DEFAULT = 0; - CAFFE = 1; - CUDNN = 2; - } - optional Engine engine = 1 [default = DEFAULT]; - - // The axis along which to perform the softmax -- may be negative to index - // from the end (e.g., -1 for the last axis). - // Any other axes will be evaluated as independent softmaxes. - optional int32 axis = 2 [default = 1]; -} - -// Message that stores parameters used by SwishLayer -message SwishParameter { - // Beta parameter for the Swish activation function - // Described in: - // Prajit Ramachandran, Barret Zoph, Quoc V. Le. (2017). Searching for - // Activation Functions. 
https://arxiv.org/abs/1710.05941v2 - optional float beta = 1 [default = 1]; -} - -message TanHParameter { - enum Engine { - DEFAULT = 0; - CAFFE = 1; - CUDNN = 2; - } - optional Engine engine = 1 [default = DEFAULT]; -} - -// Message that stores parameters used by TileLayer -message TileParameter { - // The index of the axis to tile. - optional int32 axis = 1 [default = 1]; - - // The number of copies (tiles) of the blob to output. - optional int32 tiles = 2; -} - -// Message that stores parameters used by ThresholdLayer -message ThresholdParameter { - optional float threshold = 1 [default = 0]; // Strictly positive values -} - -message WindowDataParameter { - // Specify the data source. - optional string source = 1; - // For data pre-processing, we can do simple scaling and subtracting the - // data mean, if provided. Note that the mean subtraction is always carried - // out before scaling. - optional float scale = 2 [default = 1]; - optional string mean_file = 3; - // Specify the batch size. - optional uint32 batch_size = 4; - // Specify if we would like to randomly crop an image. - optional uint32 crop_size = 5 [default = 0]; - // Specify if we want to randomly mirror data. - optional bool mirror = 6 [default = false]; - // Foreground (object) overlap threshold - optional float fg_threshold = 7 [default = 0.5]; - // Background (non-object) overlap threshold - optional float bg_threshold = 8 [default = 0.5]; - // Fraction of batch that should be foreground objects - optional float fg_fraction = 9 [default = 0.25]; - // Amount of contextual padding to add around a window - // (used only by the window_data_layer) - optional uint32 context_pad = 10 [default = 0]; - // Mode for cropping out a detection window - // warp: cropped window is warped to a fixed size and aspect ratio - // square: the tightest square around the window is cropped - optional string crop_mode = 11 [default = "warp"]; - // cache_images: will load all images in memory for faster access - optional bool cache_images = 12 [default = false]; - // append root_folder to locate images - optional string root_folder = 13 [default = ""]; -} - -message SPPParameter { - enum PoolMethod { - MAX = 0; - AVE = 1; - STOCHASTIC = 2; - } - optional uint32 pyramid_height = 1; - optional PoolMethod pool = 2 [default = MAX]; // The pooling method - enum Engine { - DEFAULT = 0; - CAFFE = 1; - CUDNN = 2; - } - optional Engine engine = 6 [default = DEFAULT]; -} - -// DEPRECATED: use LayerParameter. 
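The ReshapeParameter comments above fully specify the shape-inference rule (dim: 0 copies the corresponding bottom dimension, and a single dim: -1 is inferred from the total element count), which is the part converters most often get wrong. As a reference, a minimal illustrative C++ sketch of just that rule (hypothetical helper; axis/num_axes handling omitted):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch of the dim: 0 / dim: -1 semantics documented in ReshapeParameter.
std::vector<int64_t> infer_reshape(const std::vector<int64_t> &in,
    const std::vector<int64_t> &spec)
{
    int64_t total = 1;
    for (int64_t d : in) {
        total *= d;
    }
    std::vector<int64_t> out(spec.size(), 0);
    int64_t known = 1;
    int inferred = -1;
    for (std::size_t i = 0; i < spec.size(); i++) {
        if (spec[i] == 0) {
            assert(i < in.size());
            out[i] = in[i];              // dim: 0 -> copy from the bottom blob
        } else if (spec[i] == -1) {
            assert(inferred == -1);      // at most one axis may be inferred
            inferred = (int)i;
            continue;
        } else {
            out[i] = spec[i];
        }
        known *= out[i];
    }
    if (inferred >= 0) {
        assert(total % known == 0);
        out[inferred] = total / known;   // dim: -1 -> infer from the count
    }
    return out;
}

With in = {2, 8}, spec = {0, 2, -1} yields {2, 2, 4}, matching the reshape_param examples above.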
-message V1LayerParameter { - repeated string bottom = 2; - repeated string top = 3; - optional string name = 4; - repeated NetStateRule include = 32; - repeated NetStateRule exclude = 33; - enum LayerType { - NONE = 0; - ABSVAL = 35; - ACCURACY = 1; - ARGMAX = 30; - BNLL = 2; - CONCAT = 3; - CONTRASTIVE_LOSS = 37; - CONVOLUTION = 4; - DATA = 5; - DECONVOLUTION = 39; - DROPOUT = 6; - DUMMY_DATA = 32; - EUCLIDEAN_LOSS = 7; - ELTWISE = 25; - EXP = 38; - FLATTEN = 8; - HDF5_DATA = 9; - HDF5_OUTPUT = 10; - HINGE_LOSS = 28; - IM2COL = 11; - IMAGE_DATA = 12; - INFOGAIN_LOSS = 13; - INNER_PRODUCT = 14; - LRN = 15; - MEMORY_DATA = 29; - MULTINOMIAL_LOGISTIC_LOSS = 16; - MVN = 34; - POOLING = 17; - POWER = 26; - RELU = 18; - SIGMOID = 19; - SIGMOID_CROSS_ENTROPY_LOSS = 27; - SILENCE = 36; - SOFTMAX = 20; - SOFTMAX_LOSS = 21; - SPLIT = 22; - SLICE = 33; - TANH = 23; - WINDOW_DATA = 24; - THRESHOLD = 31; - } - optional LayerType type = 5; - repeated BlobProto blobs = 6; - repeated string param = 1001; - repeated DimCheckMode blob_share_mode = 1002; - enum DimCheckMode { - STRICT = 0; - PERMISSIVE = 1; - } - repeated float blobs_lr = 7; - repeated float weight_decay = 8; - repeated float loss_weight = 35; - optional AccuracyParameter accuracy_param = 27; - optional ArgMaxParameter argmax_param = 23; - optional ConcatParameter concat_param = 9; - optional ContrastiveLossParameter contrastive_loss_param = 40; - optional ConvolutionParameter convolution_param = 10; - optional DataParameter data_param = 11; - optional DropoutParameter dropout_param = 12; - optional DummyDataParameter dummy_data_param = 26; - optional EltwiseParameter eltwise_param = 24; - optional ExpParameter exp_param = 41; - optional HDF5DataParameter hdf5_data_param = 13; - optional HDF5OutputParameter hdf5_output_param = 14; - optional HingeLossParameter hinge_loss_param = 29; - optional ImageDataParameter image_data_param = 15; - optional InfogainLossParameter infogain_loss_param = 16; - optional InnerProductParameter inner_product_param = 17; - optional LRNParameter lrn_param = 18; - optional MemoryDataParameter memory_data_param = 22; - optional MVNParameter mvn_param = 34; - optional PoolingParameter pooling_param = 19; - optional PowerParameter power_param = 21; - optional ReLUParameter relu_param = 30; - optional SigmoidParameter sigmoid_param = 38; - optional SoftmaxParameter softmax_param = 39; - optional SliceParameter slice_param = 31; - optional TanHParameter tanh_param = 37; - optional ThresholdParameter threshold_param = 25; - optional WindowDataParameter window_data_param = 20; - optional TransformationParameter transform_param = 36; - optional LossParameter loss_param = 42; - optional V0LayerParameter layer = 1; -} - -// DEPRECATED: V0LayerParameter is the old way of specifying layer parameters -// in Caffe. We keep this message type around for legacy support. -message V0LayerParameter { - optional string name = 1; // the layer name - optional string type = 2; // the string to specify the layer type - - // Parameters to specify layers with inner products. 
- optional uint32 num_output = 3; // The number of outputs for the layer - optional bool biasterm = 4 [default = true]; // whether to have bias terms - optional FillerParameter weight_filler = 5; // The filler for the weight - optional FillerParameter bias_filler = 6; // The filler for the bias - - optional uint32 pad = 7 [default = 0]; // The padding size - optional uint32 kernelsize = 8; // The kernel size - optional uint32 group = 9 [default = 1]; // The group size for group conv - optional uint32 stride = 10 [default = 1]; // The stride - enum PoolMethod { - MAX = 0; - AVE = 1; - STOCHASTIC = 2; - } - optional PoolMethod pool = 11 [default = MAX]; // The pooling method - optional float dropout_ratio = 12 [default = 0.5]; // dropout ratio - - optional uint32 local_size = 13 [default = 5]; // for local response norm - optional float alpha = 14 [default = 1.]; // for local response norm - optional float beta = 15 [default = 0.75]; // for local response norm - optional float k = 22 [default = 1.]; - - // For data layers, specify the data source - optional string source = 16; - // For data pre-processing, we can do simple scaling and subtracting the - // data mean, if provided. Note that the mean subtraction is always carried - // out before scaling. - optional float scale = 17 [default = 1]; - optional string meanfile = 18; - // For data layers, specify the batch size. - optional uint32 batchsize = 19; - // For data layers, specify if we would like to randomly crop an image. - optional uint32 cropsize = 20 [default = 0]; - // For data layers, specify if we want to randomly mirror data. - optional bool mirror = 21 [default = false]; - - // The blobs containing the numeric parameters of the layer - repeated BlobProto blobs = 50; - // The ratio that is multiplied on the global learning rate. If you want to - // set the learning ratio for one blob, you need to set it for all blobs. - repeated float blobs_lr = 51; - // The weight decay that is multiplied on the global weight decay. - repeated float weight_decay = 52; - - // The rand_skip variable is for the data layer to skip a few data points - // to avoid all asynchronous sgd clients to start at the same point. The skip - // point would be set as rand_skip * rand(0,1). Note that rand_skip should not - // be larger than the number of keys in the database. - optional uint32 rand_skip = 53 [default = 0]; - - // Fields related to detection (det_*) - // foreground (object) overlap threshold - optional float det_fg_threshold = 54 [default = 0.5]; - // background (non-object) overlap threshold - optional float det_bg_threshold = 55 [default = 0.5]; - // Fraction of batch that should be foreground objects - optional float det_fg_fraction = 56 [default = 0.25]; - - // optional bool OBSOLETE_can_clobber = 57 [default = true]; - - // Amount of contextual padding to add around a window - // (used only by the window_data_layer) - optional uint32 det_context_pad = 58 [default = 0]; - - // Mode for cropping out a detection window - // warp: cropped window is warped to a fixed size and aspect ratio - // square: the tightest square around the window is cropped - optional string det_crop_mode = 59 [default = "warp"]; - - // For ReshapeLayer, one needs to specify the new dimensions. - optional int32 new_num = 60 [default = 0]; - optional int32 new_channels = 61 [default = 0]; - optional int32 new_height = 62 [default = 0]; - optional int32 new_width = 63 [default = 0]; - - // Whether or not ImageLayer should shuffle the list of files at every epoch. 
- // It will also resize images if new_height or new_width are not zero. - optional bool shuffle_images = 64 [default = false]; - - // For ConcatLayer, one needs to specify the dimension for concatenation, and - // the other dimensions must be the same for all the bottom blobs. - // By default it will concatenate blobs along the channels dimension. - optional uint32 concat_dim = 65 [default = 1]; - - optional HDF5OutputParameter hdf5_output_param = 1001; -} - -message PReLUParameter { - // Parametric ReLU described in K. He et al, Delving Deep into Rectifiers: - // Surpassing Human-Level Performance on ImageNet Classification, 2015. - - // Initial value of a_i. Default is a_i=0.25 for all i. - optional FillerParameter filler = 1; - // Whether or not slope parameters are shared across channels. - optional bool channel_shared = 2 [default = false]; -} - -message TransposeParameter { - optional BlobShape dim = 1; -} - -message MultiplyParameter { - optional float scale = 1 [default = 1]; - optional float bias = 2 [default = 0]; -} - -message AttentionParameter { - optional uint32 num_heads = 1; - optional uint32 from_sequence_length = 2; - optional uint32 to_sequence_length = 3; -} - -message GeluParameter { -} - -message LayerNormParameter { -} -message MatMulParameter { - optional bool transpose_a = 1 [default = false]; // Whether to use transpose matrixA - optional bool transpose_b = 2 [default = false]; // Whether to use transpose matrixB -} diff --git a/model-tools/src/caffe/caffe_adaptee.h b/model-tools/src/caffe/caffe_adaptee.h deleted file mode 100644 index 17463367..00000000 --- a/model-tools/src/caffe/caffe_adaptee.h +++ /dev/null @@ -1,1285 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
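The deleted header below implements the Caffe front end of the converter: parse_file() loads the .prototxt (protobuf text format) and the .caffemodel (protobuf binary), adapt_operators() maps each Caffe layer to an OperatorSpec, and adapt_weights() copies the trained blobs into WeightSpec storage. A self-contained sketch of the protobuf loading pattern it relies on (hypothetical wrapper name; the two-argument SetTotalBytesLimit matches the older protobuf API used here, raising the default size cap, 64 MB in those releases, so large .caffemodel files can parse):

#include <climits>
#include <fstream>
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <google/protobuf/text_format.h>
#include "caffe.pb.h"

// Hypothetical wrapper showing the same load pattern as
// read_from_prototxt/read_from_caffemodel below.
bool load_caffe_pair(const char *prototxt, const char *caffemodel,
    caffe::NetParameter *proto, caffe::NetParameter *net)
{
    // Network definition: protobuf text format.
    std::ifstream tfs(prototxt, std::ifstream::in);
    if (!tfs.is_open()) {
        return false;
    }
    google::protobuf::io::IstreamInputStream tin(&tfs);
    if (!google::protobuf::TextFormat::Parse(&tin, proto)) {
        return false;
    }

    // Trained weights: protobuf binary. Raise the coded-stream byte limit
    // so models larger than the default cap still parse.
    std::ifstream bfs(caffemodel, std::ifstream::in | std::ifstream::binary);
    if (!bfs.is_open()) {
        return false;
    }
    google::protobuf::io::IstreamInputStream bin(&bfs);
    google::protobuf::io::CodedInputStream coded(&bin);
    coded.SetTotalBytesLimit(INT_MAX, INT_MAX / 2);
    return net->ParseFromCodedStream(&coded);
}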
- - -#ifndef _H_CAFFEADAPTEE -#define _H_CAFFEADAPTEE - -#include <string> -#include <fstream> -#include <iostream> -#include <map> -#include <vector> -#include <google/protobuf/io/coded_stream.h> -#include <google/protobuf/io/zero_copy_stream_impl.h> -#include <google/protobuf/message.h> -#include <google/protobuf/text_format.h> -#include "caffe.pb.h" - -#include "type.h" -#include "converter.h" -#include "model_serialize_deserialize.hpp" -#include "model_tools.h" -#include "model_adaptee.h" -#include "ut_util.h" - -class CaffeAdaptee: public ModelAdaptee { -public: - CaffeAdaptee() {} - ~CaffeAdaptee() {} - -protected: - - // read prototxt - EE read_from_prototxt(const char* path, google::protobuf::Message* message) { - std::ifstream fs(path, std::ifstream::in); - if (!fs.is_open()) { - return NOT_FOUND; - } - - google::protobuf::io::IstreamInputStream input(&fs); - bool ret = google::protobuf::TextFormat::Parse(&input, message); - fs.close(); - return (ret) ? SUCCESS : NOT_SUPPORTED; - } - - // read caffemodel(bin) - EE read_from_caffemodel(const char* path, google::protobuf::Message* message) { - std::ifstream fs(path, std::ifstream::in | std::ifstream::binary); - if (!fs.is_open()) { - return NOT_FOUND; - } - - google::protobuf::io::IstreamInputStream input(&fs); - google::protobuf::io::CodedInputStream codedstr(&input); - - codedstr.SetTotalBytesLimit(INT_MAX, INT_MAX / 2); - - bool ret = message->ParseFromCodedStream(&codedstr); - fs.close(); - - return (ret) ? SUCCESS : NOT_SUPPORTED; - } - - OperatorType convert_caffe_type(std::string inputType) - { - if (inputType == "Convolution") { - return OT_Conv; - } else if (inputType == "Deconvolution") { - return OT_Deconvolution; - } else if (inputType == "BatchNorm") { - return OT_BatchNorm; - } else if (inputType == "Scale") { - return OT_Scale; - } else if (inputType == "Eltwise") { - return OT_Eltwise; - } else if (inputType == "InnerProduct") { - return OT_FC; - } else if (inputType == "Pooling") { - return OT_Pooling; - } else if (inputType == "ReLU") { - return OT_Relu; - } else if (inputType == "ReLU6") { - return OT_Relu6; - } else if (inputType == "HSwish") { - return OT_HSwish; - } else if (inputType == "Sigmoid") { - return OT_Sigmoid; - } else if (inputType == "HSigmoid") { - return OT_HSigmoid; - } else if (inputType == "Softmax") { - return OT_Softmax; - } else if (inputType == "Concat") { - return OT_Concat; - } else if (inputType == "Embed") { - return OT_Embedding; - } else if (inputType == "Gelu") { - return OT_Gelu; - } else if (inputType == "LayerNorm") { - return OT_LayerNorm; - } else if (inputType == "MatMul") { - return OT_MatMul; - } else if (inputType == "Multiply") { - return OT_Multiply; - } else if (inputType == "Reshape") { - return OT_Reshape; - } else if (inputType == "Slice") { - return OT_Slice; - } else if (inputType == "Transpose") { - return OT_Transpose; - } else if (inputType == "Attention") { - return OT_Attention; - } else if (inputType == "Input") { - return OT_Input; - } else if (inputType == "LSTM") { - return OT_LSTM; - } else if (inputType == "TanH") { - return OT_TanH; - } else if (inputType == "SoftmaxWithLoss") { - return OT_SoftmaxWithLoss; - } else if (inputType == "Squeeze") { - return OT_Squeeze; - } else if (inputType == "Unsqueeze") { - return OT_Unsqueeze; - } else if (inputType == "Reduction") { - return OT_Reduction; - } else if (inputType == "ArgMax") { - return OT_ArgMax; - } else if (inputType == "PreAllocatedMemory") { - return OT_PreAllocatedMemory; - } else if (inputType == "SharedWeight") { - return OT_SharedWeight; - } else if (inputType == "Copy") { - return OT_Copy; - } else if (inputType == "Check") { - return OT_Check; - } else if (inputType == 
"Repeat") { - return OT_Repeat; - } else if (inputType == "Interp") { - return OT_Interp; - } else if (inputType == "Jump") { - return OT_Jump; - } else if (inputType == "AttentionMask") { - return OT_AttentionMask; - } else if (inputType == "RelativePositionEmbed") { - return OT_RelativePositionEmbedding; - } else if (inputType == "RelativeShift") { - return OT_RelativeShift; - } else if (inputType == "Dropout") { - return OT_None; - } else if (inputType == "Flatten") { - return OT_Flatten; - } else if (inputType == "Permute") { - return OT_Permute; - } else if (inputType == "Clip") { - return OT_Clip; - } else if (inputType == "PriorBox") { - return OT_PriorBox; - } else if (inputType == "DetectionOutput") { - return OT_DetectionOutput; - } else { - std::cerr << "[ERROR] encounter unsupported operator " << inputType << std::endl; - exit(1); - } - } - - int net_search_layerId(caffe::NetParameter& netParams, std::string& layerName) { - int i = 0; - if (netParams.layer_size() > 0) { - for (i = 0; i < netParams.layer_size(); i++) { - if (netParams.layer(i).name() == layerName) { - return i; - } - } - } - else{ - for (i = 0; i < netParams.layers_size(); i++) { - if (netParams.layers(i).name() == layerName) { - return i; - } - } - } - return -1; - } - - caffe::BlobProto net_get_blob(caffe::NetParameter& netParams, int layerId, int blobId) { - if (netParams.layer_size() > 0) { - return netParams.layer(layerId).blobs(blobId); - } - else{ - return netParams.layers(layerId).blobs(blobId); - } - } - - int net_get_blobs_size(caffe::NetParameter& netParams, int layerId) { - if (netParams.layer_size() > 0) { - return netParams.layer(layerId).blobs_size(); - } - else{ - return netParams.layers(layerId).blobs_size(); - } - } - - void net_copy_blob(WeightSpec* wsPtr, int weightIndex, caffe::NetParameter& netParams, - int netLayerId, int blobNum, OperatorType operatorType) - { - wsPtr[weightIndex].mdt = DT_F32; - wsPtr[weightIndex].bytes_of_weight = 0; - wsPtr[weightIndex].weight = nullptr; - wsPtr[weightIndex].bytes_of_vec = 0; - wsPtr[weightIndex].vec = nullptr; - - std::vector> weights; - std::vector> biases; - // Batchnorm may have 3 blobs, but the third blob can be ignored - if (operatorType == OT_BatchNorm) { - if (blobNum >= 3) - blobNum = 2; - } - if (blobNum >= 1) { - caffe::BlobProto blob0 = net_get_blob(netParams, netLayerId, 0); - U32 elemSize = sizeof(*(blob0.data().data())); - CHECK_REQUIREMENT(elemSize == bytesOf(wsPtr[weightIndex].mdt)); - U32 blobSize = elemSize * blob0.data_size(); - wsPtr[weightIndex].bytes_of_weight += blobSize; - weights.push_back(std::make_pair(blob0, blobSize)); - } - if (blobNum >= 2) { - caffe::BlobProto blob1 = net_get_blob(netParams, netLayerId, 1); - U32 elemSize = sizeof(*(blob1.data().data())); - CHECK_REQUIREMENT(sizeof(*(blob1.data().data())) == bytesOf(wsPtr[weightIndex].mdt)); - U32 blobSize = elemSize * blob1.data_size(); - wsPtr[weightIndex].bytes_of_vec += blobSize; - biases.push_back(std::make_pair(blob1, blobSize)); - } - if (blobNum >= 3) { - caffe::BlobProto blob2 = net_get_blob(netParams, netLayerId, 2); - U32 elemSize = sizeof(*(blob2.data().data())); - CHECK_REQUIREMENT(elemSize == bytesOf(wsPtr[weightIndex].mdt)); - U32 blobSize = elemSize * blob2.data_size(); - wsPtr[weightIndex].bytes_of_weight += blobSize; - weights.push_back(std::make_pair(blob2, blobSize)); - } - if (weights.size() > 0) { - wsPtr[weightIndex].weight = (U8*)mt_new_storage(wsPtr[weightIndex].bytes_of_weight); - U8 *ptr = wsPtr[weightIndex].weight; - for (U32 i = 0; i < 
weights.size(); i++) { - memcpy(ptr, weights[i].first.data().data(), weights[i].second); - ptr += weights[i].second; - } - } - if (biases.size() > 0) { - wsPtr[weightIndex].vec = (U8*)mt_new_storage(wsPtr[weightIndex].bytes_of_vec); - U8 *ptr = wsPtr[weightIndex].vec; - for (U32 i = 0; i < biases.size(); i++) { - memcpy(ptr, biases[i].first.data().data(), biases[i].second); - ptr += biases[i].second; - } - } - } - - EE parse_file(std::string dir, std::string mfn) override { - EE ret = SUCCESS; - std::string prototxtSuffix = ".prototxt"; - std::string caffeModelSuffix = ".caffemodel"; - std::string prototxtPath = dir + "/" + mfn + prototxtSuffix; - std::string caffeModelPath = dir + "/" + mfn + caffeModelSuffix; - - // load prototxt - ret = read_from_prototxt(prototxtPath.c_str(), &proto); - if (proto.layer_size() <= 0) { - std::cerr << "[ERROR] null caffemodel " << caffeModelPath << std::endl; - exit(-1); - } - - // load model bin. - ret = read_from_caffemodel(caffeModelPath.c_str(), &net); - return ret; - } - - // the first loop can specify the input info and output info - EE adapt_operators(ModelSpec* ms) override - { - EE ret = SUCCESS; - // model_name - str_copy(ms->model_name, proto.name().c_str(), proto.name().length()); - ms->dt = DT_F32; // set default value - - ms->num_operator_specs = proto.layer_size(); - OperatorSpec* opsPtr = (OperatorSpec*)mt_new_storage(sizeof(OperatorSpec) * ms->num_operator_specs); - ms->ops = opsPtr; - for (I32 i = 0; i < ms->num_operator_specs; i++) { - ms->ops[i].tensor_positions = nullptr; - ms->ops[i].num_quant_feature = 0; - ms->ops[i].feature_scale = nullptr; - } - - int inputsNumber = 0; - weightNumber = 0; // set global variable initial value - std::map<std::string, int> outputCounts; - for (int i = 0; i < proto.input_size(); i++) { - outputCounts[proto.input(i).c_str()] = 1; - } - for (int i = 0; i < proto.layer_size(); i++) { - const caffe::LayerParameter curLayer = proto.layer(i); - this->layer = curLayer; - - if (layer.type() == "Input") { // layer, the global variable - inputsNumber++; - } - str_copy(opsPtr[i].name, layer.name().c_str(), layer.name().length()); - - opsPtr[i].type = convert_caffe_type(layer.type()); - int bottomSize = layer.bottom_size(); - opsPtr[i].num_inputs = bottomSize; - opsPtr[i].input_tensors_name = (I8**)mt_new_storage(bottomSize * sizeof(I8 *)); - for (int j = 0; j < bottomSize; j++) { - opsPtr[i].input_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - str_copy(opsPtr[i].input_tensors_name[j], layer.bottom(j).c_str(), layer.bottom(j).length()); - if (outputCounts.find(layer.bottom(j)) == outputCounts.end()) { - if (opsPtr[i].type != OT_Jump) { - std::cerr << "[ERROR] encounter no output as this operator's input " << layer.bottom(j) << std::endl; - exit(1); - } - } else { - outputCounts[layer.bottom(j)]--; - } - } - int topSize = layer.top_size(); - opsPtr[i].num_outputs = topSize; - opsPtr[i].output_tensors_name = (I8 **)mt_new_storage(topSize * sizeof(I8 *)); - for (int j = 0; j < topSize; j++) { - opsPtr[i].output_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - str_copy(opsPtr[i].output_tensors_name[j], layer.top(j).c_str(), layer.top(j).length()); - if (outputCounts.find(layer.top(j)) == outputCounts.end()) { - outputCounts[layer.top(j)] = 1; - } else { - outputCounts[layer.top(j)]++; - } - } - - ParameterSpec curPs; - OperatorType curType = convert_caffe_type(layer.type()); - ret = adapt_operator(curType, &curPs); - ms->ops[i].ps = curPs; - - if (layer.type() == "Flatten") { - opsPtr[i].type = 
OT_Reshape; - } else if (layer.type() == "Permute") { - opsPtr[i].type = OT_Transpose; - } - } - - inputsNumber = (inputsNumber > proto.input_size()) ? inputsNumber : proto.input_size(); - ms->num_inputs = inputsNumber; - ms->input_names = (I8**)mt_new_storage(inputsNumber * sizeof(I8 *)); - ms->input_dims = (TensorDesc*)mt_new_storage(sizeof(TensorDesc) * inputsNumber); - for (int i = 0; i < inputsNumber; i++) { - ms->input_names[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - - if (proto.input_size() > 0) { - str_copy(ms->input_names[i], proto.input(i).c_str(), proto.input(i).length()); - switch (proto.input_dim_size()) { - case 2: - ms->input_dims[i] = tensor2df(DT_U32, DF_NORMAL, - proto.input_dim(0), - proto.input_dim(1)); - break; - case 3: - ms->input_dims[i] = tensor3df(DT_F32, DF_MTK, - proto.input_dim(0), - proto.input_dim(1), - proto.input_dim(2)); - break; - case 4: - ms->input_dims[i] = tensor4df(DT_F32, DF_NCHW, - proto.input_dim(0), - proto.input_dim(1), - proto.input_dim(2), - proto.input_dim(3)); - break; - default: { - std::cerr << "[ERROR] unsupported input dim" << std::endl; - exit(-1); - } - } - } - if (i < proto.input_shape_size()) { - str_copy(ms->input_names[i], proto.input(i).c_str(), proto.input(i).length()); - switch (proto.input_shape(i).dim_size()) { - case 2: - ms->input_dims[i] = tensor2df(DT_U32, DF_NORMAL, - proto.input_shape(i).dim(0), - proto.input_shape(i).dim(1)); - break; - case 3: - ms->input_dims[i] = tensor3df(DT_F32, DF_NCHW, - proto.input_shape(i).dim(0), - proto.input_shape(i).dim(1), - proto.input_shape(i).dim(2)); - break; - case 4: - ms->input_dims[i] = tensor4df(DT_F32, DF_NCHW, - proto.input_shape(i).dim(0), - proto.input_shape(i).dim(1), - proto.input_shape(i).dim(2), - proto.input_shape(i).dim(3)); - break; - default: { - std::cerr << "[ERROR] unsupport input dim" << std::endl; - exit(-1); - } - } - } - } - - int outputsNumber = 0; - for (auto iter: outputCounts) { - if (iter.second > 0) { - outputsNumber ++; - } - } - ms->num_outputs = outputsNumber; - ms->output_names = (I8**)mt_new_storage(outputsNumber * sizeof(I8*)); - outputsNumber = 0; - for (auto iter: outputCounts) { - if (iter.second > 0) { - ms->output_names[outputsNumber] = (I8*)mt_new_storage(NAME_LEN * sizeof(I8)); - str_copy(ms->output_names[outputsNumber], iter.first.c_str(), iter.first.length()); - outputsNumber ++; - } - } - ms->num_weight_specs = this->weightNumber; // use the global variable - return ret; - } - - EE adapt_weights(ModelSpec* ms) override - { - EE ret = SUCCESS; - WeightSpec* wsPtr = (WeightSpec*)mt_new_storage(sizeof(WeightSpec) * ms->num_weight_specs); - for (int j = 0; j < ms->num_weight_specs; j++) { - wsPtr[j].num_quant_scale = 0; - wsPtr[j].weight_scale = nullptr; - } - ms->ws = wsPtr; - int inNamesIndex = 0; - int weightIndex = 0; - - for (int i = 0; i < proto.layer_size(); i++) { - const caffe::LayerParameter layer = proto.layer(i); - std::string layerName = layer.name(); - std::string layerType = layer.type(); - - if (layerType == "Input") { - str_copy(ms->input_names[inNamesIndex], layerName.c_str(), layerName.length()); - switch (layer.input_param().shape(0).dim_size()) { - case 2: - ms->input_dims[inNamesIndex] = tensor2df(DT_U32, DF_NORMAL, - layer.input_param().shape(0).dim(0), - layer.input_param().shape(0).dim(1)); - break; - case 3: - ms->input_dims[inNamesIndex] = tensor3df(DT_F32, DF_MTK, - layer.input_param().shape(0).dim(0), - layer.input_param().shape(0).dim(1), - layer.input_param().shape(0).dim(2)); - break; - case 4: - 
ms->input_dims[inNamesIndex] = tensor4df(DT_F32, DF_NCHW, layer.input_param().shape(0).dim(0), - layer.input_param().shape(0).dim(1), - layer.input_param().shape(0).dim(2), - layer.input_param().shape(0).dim(3)); - break; - default: { - std::cerr << "[ERROR] unsupport input dim" << std::endl; - exit(-1); - } - } - inNamesIndex++; - }else if (layerType == "Convolution" - || layerType == "InnerProduct" - || layerType == "BatchNorm" - || layerType == "Embed" - || layerType == "LSTM" - || layerType == "SharedWeight" - || layerType == "RelativePositionEmbed" - || layerType == "Deconvolution") { - int netLayerId = net_search_layerId(net, layerName); - CHECK_REQUIREMENT(netLayerId >= 0); - str_copy(wsPtr[weightIndex].op_name, layerName.c_str(), layerName.length()); - U32 blobNum = net_get_blobs_size(net, netLayerId); - net_copy_blob(wsPtr, weightIndex, net, netLayerId, blobNum, convert_caffe_type(layerType)); - - if (layerType == "BatchNorm" && blobNum > 2) { - caffe::BlobProto blob2 = net_get_blob(net, netLayerId, 2); - float cur_gama = blob2.data().data()[0] == 0 ? 1.0 : 1.0 / blob2.data().data()[0]; - ms->ops[i].ps.bn_spec.gama = cur_gama; - } - - weightIndex++; - }else if (layerType == "Scale" || layerType == "LayerNorm") { - int netLayerId = net_search_layerId(net, layerName); - CHECK_REQUIREMENT(netLayerId >= 0); - str_copy(wsPtr[weightIndex].op_name, layerName.c_str(), layerName.length()); - U32 blobNum = net_get_blobs_size(net, netLayerId); - if (layer.bottom_size() == 1) { - CHECK_REQUIREMENT(blobNum >= 1); - } - else { - CHECK_REQUIREMENT(blobNum == 0); - } - net_copy_blob(wsPtr, weightIndex, net, netLayerId, blobNum, convert_caffe_type(layerType)); - weightIndex++; - } - } - - CHECK_REQUIREMENT(weightIndex == weightNumber); - // relationship init null - ms->num_op_tensor_entries = 0; - ms->op_relationship_entries = nullptr; - return ret; - } - - ParameterSpec adapt_Interp() override - { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - InterpParamSpec interpPs; - auto caffeInterpParam = layer.interp_param(); - interpPs.height = caffeInterpParam.height(); - interpPs.width = caffeInterpParam.width(); - curPs.interp_spec = interpPs; - return curPs; - } - - ParameterSpec adapt_Conv() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - weightNumber = weightNumber + 1; - ConvolutionParamSpec cps; - initialization_zero(&cps, sizeof(cps)); - cps.num_outputs = layer.convolution_param().num_output(); - if (layer.convolution_param().has_kernel_w() && layer.convolution_param().has_kernel_h()) { - cps.kernel_size_w = layer.convolution_param().kernel_w(); - cps.kernel_size_h = layer.convolution_param().kernel_h(); - } else { - cps.kernel_size_h = (layer.convolution_param().kernel_size_size() > 0) ? \ - layer.convolution_param().kernel_size(0) : 1; - cps.kernel_size_w = (layer.convolution_param().kernel_size_size() > 1) ? \ - layer.convolution_param().kernel_size(1) : cps.kernel_size_h; - } - - cps.group = (layer.convolution_param().has_group()) ? layer.convolution_param().group() : 1; // group[default=1] - - cps.dilatedRate_h = (layer.convolution_param().dilation_size() != 0) ? 
layer.convolution_param().dilation(0) : 1; - cps.dilatedRate_w = cps.dilatedRate_h; - - if (cps.group != cps.num_outputs) { - std::cout << "[INFO] Convolution group != num_outputs" << std::endl; - cps.group = 1; - } else { - std::cout << "[INFO] Depthwise Convolution" << std::endl; - } - if (cps.group == 1) { - if (cps.dilatedRate_h > 1) { - cps.convolution_type = Convolution_Dilation; - } else { - cps.convolution_type = Convolution_Pointwise; - } - } else { - cps.convolution_type = Convolution_Depthwise; - } - cps.dw_activation_type = ACTIVATION_NULL; - cps.pw_activation_type = ACTIVATION_NULL; - if (layer.convolution_param().has_stride_w() && layer.convolution_param().has_stride_h()) { - cps.stride_w = layer.convolution_param().stride_w(); - cps.stride_h = layer.convolution_param().stride_h(); - } else { - cps.stride_h = (layer.convolution_param().stride_size() != 0) ? \ - layer.convolution_param().stride(0) : 1; // stride[default=1] - cps.stride_w = (layer.convolution_param().stride_size() > 1) ? \ - layer.convolution_param().stride(1) : cps.stride_h; - } - if (layer.convolution_param().has_pad_w() && layer.convolution_param().has_pad_h()) { - cps.padding_left = layer.convolution_param().pad_w(); - cps.padding_right = cps.padding_left; - cps.padding_top = layer.convolution_param().pad_h(); - cps.padding_bottom = cps.padding_top; - } else { - cps.padding_top = (layer.convolution_param().pad_size() > 0) ? layer.convolution_param().pad(0) : 0; - cps.padding_bottom = (layer.convolution_param().pad_size() > 1) ? layer.convolution_param().pad(1) : cps.padding_top; - cps.padding_left = (layer.convolution_param().pad_size() > 2) ? layer.convolution_param().pad(2) : cps.padding_top; - cps.padding_right = (layer.convolution_param().pad_size() > 3) ? layer.convolution_param().pad(3) : cps.padding_top; - } - curPs.conv_spec = cps; - return curPs; - } - - ParameterSpec adapt_Deconvolution() override - { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - weightNumber = weightNumber + 1; - ConvolutionParamSpec cps; - cps.num_outputs = layer.convolution_param().num_output(); - if (layer.convolution_param().has_kernel_w() && layer.convolution_param().has_kernel_h()) { - cps.kernel_size_w = layer.convolution_param().kernel_w(); - cps.kernel_size_h = layer.convolution_param().kernel_h(); - } else { - cps.kernel_size_h = layer.convolution_param().kernel_size(0); - cps.kernel_size_w = cps.kernel_size_h; - } - - cps.group = (layer.convolution_param().has_group()) ? layer.convolution_param().group() : 1; - if (1 != cps.group) { - std::cerr << "[ERROR] Deconvolution group != 1" << std::endl; - std::cerr << "[ERROR]: UNSUPPORTED!" << std::endl; - CHECK_STATUS(NOT_SUPPORTED); - } - cps.dilatedRate_h = 1; - cps.dilatedRate_w = 1; - cps.convolution_type = Convolution_Deconvolution; - cps.dw_activation_type = ACTIVATION_NULL; - cps.pw_activation_type = ACTIVATION_NULL; - if (layer.convolution_param().has_stride_w() && layer.convolution_param().has_stride_h()) { - cps.stride_w = layer.convolution_param().stride_w(); - cps.stride_h = layer.convolution_param().stride_h(); - } else { - cps.stride_h = (layer.convolution_param().stride_size() != 0) ? 
layer.convolution_param().stride(0) : 1; // stride[default=1] - cps.stride_w = cps.stride_h; - } - if (layer.convolution_param().has_pad_w() && layer.convolution_param().has_pad_h()) { - cps.padding_left = layer.convolution_param().pad_w(); - cps.padding_right = cps.padding_left; - cps.padding_top = layer.convolution_param().pad_h(); - cps.padding_bottom = cps.padding_top; - } else { - cps.padding_top = (layer.convolution_param().pad_size() != 0) ? layer.convolution_param().pad(0) : 0; // pad[default=0] - cps.padding_bottom = cps.padding_top; - cps.padding_left = cps.padding_top; - cps.padding_right = cps.padding_top; - } - curPs.conv_spec = cps; - return curPs; - } - - ParameterSpec adapt_Pooling() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - PoolingParamSpec pps; - initialization_zero(&pps, sizeof(pps)); - if (layer.pooling_param().has_kernel_w() && layer.pooling_param().has_kernel_h()) { - pps.kernel_size_w = layer.pooling_param().kernel_w(); - pps.kernel_size_h = layer.pooling_param().kernel_h(); - } else { - pps.kernel_size_h = layer.pooling_param().kernel_size(); - pps.kernel_size_w = pps.kernel_size_h; - } - if (layer.pooling_param().has_stride_w() && layer.pooling_param().has_stride_h()) { - pps.stride_w = layer.pooling_param().stride_w(); - pps.stride_h = layer.pooling_param().stride_h(); - } else { - pps.stride_h = layer.pooling_param().stride(); - pps.stride_w = pps.stride_h; - } - bool global_pooling = layer.pooling_param().global_pooling(); - if (global_pooling) { - pps.kernel_size_h = 0; - pps.kernel_size_w = 0; - pps.stride_h = 1; - pps.stride_w = 1; - }else { - CHECK_REQUIREMENT(pps.kernel_size_h > 0); - } - if (layer.pooling_param().has_pad_w() && layer.pooling_param().has_pad_h()) { - pps.padding_left = layer.pooling_param().pad_w(); - pps.padding_right = pps.padding_left; - pps.padding_top = layer.pooling_param().pad_h(); - pps.padding_bottom = pps.padding_top; - } else { - pps.padding_top = layer.pooling_param().has_pad() ? 
layer.pooling_param().pad() : 0; - pps.padding_bottom = pps.padding_top; - pps.padding_left = pps.padding_top; - pps.padding_right = pps.padding_top; - } - - if (layer.pooling_param().has_round_mode() && layer.pooling_param().round_mode() == 1) { - pps.rm = FLOOR; - }else{ - pps.rm = CEIL; - } - switch (layer.pooling_param().pool()) { - case caffe::PoolingParameter_PoolMethod_MAX: { - pps.mode = POOLING_MAX; - break; - } - case caffe::PoolingParameter_PoolMethod_AVE: { - pps.mode = POOLING_MEAN; - break; - } - default: { - std::cerr << "[ERROR] encounter unsupported Pooling method " << layer.pooling_param().pool() << std::endl; - exit(1); - } - } - curPs.pooling_spec = pps; - return curPs; - } - - ParameterSpec adapt_Fc() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - weightNumber = weightNumber + 1; - FullyConnectedParamSpec ips; - ips.num_outputs = layer.inner_product_param().num_output(); - ips.num_slices = 1; - ips.slice_point[0] = ips.num_outputs; - curPs.fc_spec = ips; - return curPs; - } - - ParameterSpec adapt_BatchNorm() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - weightNumber = weightNumber + 1; - BatchNormParamSpec bnps; - bnps.axis = layer.batch_norm_param().axis(); - bnps.eps = layer.batch_norm_param().eps(); - bnps.gama = 1; - bnps.momentum = layer.batch_norm_param().moving_average_fraction(); - curPs.bn_spec = bnps; - return curPs; - } - - ParameterSpec adapt_LayerNorm() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - weightNumber = weightNumber + 1; - return curPs; - } - - ParameterSpec adapt_Eltwise() override{ - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - EltwiseParamSpec eps; - initialization_zero(&eps, sizeof(eps)); - EltwiseSumSpec ess; - initialization_zero(&ess, sizeof(ess)); - - auto caffeEltwiseParam = layer.eltwise_param(); - auto op = caffeEltwiseParam.operation(); - switch (op) - { - case caffe::EltwiseParameter_EltwiseOp_PROD: - eps.elt_mode = ELTWISE_PROD; - break; - case caffe::EltwiseParameter_EltwiseOp_SUM: - eps.elt_mode = ELTWISE_SUM; - break; - case caffe::EltwiseParameter_EltwiseOp_MAX: - eps.elt_mode = ELTWISE_MAX; - break; - default: { - std::cerr << "[ERROR] unknown eltwise mode" << std::endl; - exit(-1); - } - } - U32 bytes = caffeEltwiseParam.coeff_size() * sizeof(F32); - ess.coeff_size = caffeEltwiseParam.coeff_size(); - ess.coeff_values = (F32 *)mt_new_storage(bytes); - memcpy(ess.coeff_values, caffeEltwiseParam.coeff().data(), bytes); - for (int j = 0; j < caffeEltwiseParam.coeff_size(); j++) { - CHECK_REQUIREMENT(ess.coeff_values[j] == 1); - } - eps.elt_sum_spec = ess; - curPs.eltwise_spec = eps; - return curPs; - } - - ParameterSpec adapt_Embedding() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - weightNumber = weightNumber + 1; - EmbedParamSpec embedPs; - auto caffeEmbedParam = layer.embed_param(); - embedPs.input_dim = caffeEmbedParam.input_dim(); - embedPs.num_output = caffeEmbedParam.num_output(); - embedPs.bias_term = caffeEmbedParam.bias_term() == 0 ? false: true; - embedPs.transpose = caffeEmbedParam.transpose() == 0 ? 
false : true; - curPs.embed_spec = embedPs; - return curPs; - } - - ParameterSpec adapt_Multiply() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - MultiplyParamSpec multiplyPs; - auto caffeMultiplyParam = layer.multiply_param(); - multiplyPs.scale = caffeMultiplyParam.scale(); - multiplyPs.bias = caffeMultiplyParam.bias(); - curPs.multiply_spec = multiplyPs; - return curPs; - } - - ParameterSpec adapt_Reshape() override - { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - ReshapeParamSpec reshapePs; - auto caffeReshapeParam = layer.reshape_param(); - reshapePs.shape_size = caffeReshapeParam.shape().dim_size(); - for (I32 iter = 0; iter < caffeReshapeParam.shape().dim_size(); iter++) { - reshapePs.shape_dims[iter] = caffeReshapeParam.shape().dim(iter); - } - reshapePs.axis = caffeReshapeParam.axis(); - reshapePs.num_axes = caffeReshapeParam.num_axes(); - curPs.reshape_spec = reshapePs; - return curPs; - } - - ParameterSpec adapt_Flatten() override - { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - ReshapeParamSpec reshapePs; - auto caffeFlattenParam = layer.flatten_param(); - CHECK_REQUIREMENT(-1 == caffeFlattenParam.end_axis()); // Currently compute as reshape layer - reshapePs.shape_size = caffeFlattenParam.axis() + 1; - for (I32 iter = 0; iter < reshapePs.shape_size - 1; iter++) { - reshapePs.shape_dims[iter] = 0; - } - reshapePs.shape_dims[reshapePs.shape_size - 1] = -1; - reshapePs.axis = 0; - reshapePs.num_axes = -1; - curPs.reshape_spec = reshapePs; - return curPs; - } - - ParameterSpec adapt_Slice() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - SliceParamSpec slicePs; - auto caffeSliceParam = layer.slice_param(); - for (I32 iter = 0; iter < caffeSliceParam.slice_point().size(); iter++) { - slicePs.slice_points[iter] = caffeSliceParam.slice_point(iter); - } - slicePs.slice_size = caffeSliceParam.slice_point().size(); - slicePs.axis = caffeSliceParam.axis(); - curPs.slice_spec = slicePs; - return curPs; - } - - ParameterSpec adapt_Transpose() override - { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - TransposeParamSpec transPs; - auto caffeTransposeParam = layer.transpose_param(); - for (I32 iter=0; iter < caffeTransposeParam.dim().dim_size(); iter++) { - transPs.trans_dims[iter] = caffeTransposeParam.dim().dim(iter); - } - transPs.trans_size = caffeTransposeParam.dim().dim_size(); - curPs.transpose_spec = transPs; - return curPs; - } - - ParameterSpec adapt_Permute() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - TransposeParamSpec transPs; - auto caffePermuteParam = layer.permute_param(); - for (I32 iter=0; iter < caffePermuteParam.order().size(); iter++) { - transPs.trans_dims[iter] = caffePermuteParam.order(iter); - } - transPs.trans_size = caffePermuteParam.order().size(); - curPs.transpose_spec = transPs; - return curPs; - } - - ParameterSpec adapt_Attention() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - AttentionParamSpec attentionPs; - auto caffe_attention_param = layer.attention_param(); - attentionPs.num_heads = caffe_attention_param.num_heads(); - attentionPs.from_sequence_length = caffe_attention_param.from_sequence_length(); - attentionPs.to_sequence_length = caffe_attention_param.to_sequence_length(); - curPs.attention_spec = attentionPs; - return curPs; - } - - ParameterSpec adapt_LSTM() override { - ParameterSpec curPs; - initialization_zero(&curPs, 
sizeof(curPs)); - weightNumber = weightNumber + 1; - LSTMParamSpec lstmPs; - auto caffeLSTMParam = layer.lstm_param(); - lstmPs.num_output = caffeLSTMParam.num_output(); - lstmPs.steps = caffeLSTMParam.steps(); - lstmPs.num_projection = caffeLSTMParam.num_proj(); - lstmPs.zoneout_cell = caffeLSTMParam.zoneout_cell(); - lstmPs.zoneout_output = caffeLSTMParam.zoneout_output(); - curPs.lstm_spec = lstmPs; - return curPs; - } - - ParameterSpec adapt_Scale() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - weightNumber = weightNumber + 1; - ScaleParamSpec scalePs; - auto caffeScaleParam = layer.scale_param(); - scalePs.axis = caffeScaleParam.axis(); - curPs.scale_spec = scalePs; - return curPs; - } - - ParameterSpec adapt_Reduction() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - ReductionParamSpec reductionPs; - auto caffeReductionParam = layer.reduction_param(); - reductionPs.axis = caffeReductionParam.axis(); - auto op = caffeReductionParam.operation(); - switch (op) - { - case caffe::ReductionParameter_ReductionOp_SUM: - reductionPs.reduction_mode = REDUCTION_SUM; - break; - case caffe::ReductionParameter_ReductionOp_MEAN: - reductionPs.reduction_mode = REDUCTION_MEAN; - break; - default: { - std::cerr << "[ERROR] unknown reduction mode" << std::endl; - exit(-1); - } - } - reductionPs.coeff = caffeReductionParam.coeff(); - reductionPs.keep_dim = caffeReductionParam.keep_dim(); - curPs.reduction_spec = reductionPs; - return curPs; - } - - ParameterSpec adapt_Squeeze() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - SqueezeParamSpec squeezePs; - auto caffeSqueezeParam = layer.squeeze_param(); - squeezePs.axis = caffeSqueezeParam.axis(); - squeezePs.axes_num = 0; - curPs.squeeze_spec = squeezePs; - return curPs; - } - - ParameterSpec adapt_Unsqueeze() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - UnsqueezeParamSpec unsqueezePs; - auto caffeUnsqueezeParam = layer.unsqueeze_param(); - unsqueezePs.axis = caffeUnsqueezeParam.axis(); - unsqueezePs.axes_num = 0; - curPs.unsqueeze_spec = unsqueezePs; - return curPs; - } - - ParameterSpec adapt_ArgMax() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - ArgMaxParamSpec argmaxPs; - auto caffeArgMaxParam = layer.argmax_param(); - argmaxPs.axis = caffeArgMaxParam.axis(); - curPs.argmax_spec = argmaxPs; - return curPs; - } - - ParameterSpec adapt_Repeat() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - RepeatParamSpec repeatPs; - auto caffeRepeatParam = layer.repeat_param(); - repeatPs.loops = caffeRepeatParam.loops(); - repeatPs.axis = caffeRepeatParam.axis(); - curPs.repeat_spec = repeatPs; - return curPs; - } - - ParameterSpec adapt_Check() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - CheckParamSpec checkPs; - auto caffeCheckParam = layer.check_param(); - auto op = caffeCheckParam.operation(); - switch (op) - { - case caffe::CheckParameter_CheckOp_EQUAL: - checkPs.check_mode = CHECK_EQUAL; - break; - case caffe::CheckParameter_CheckOp_GREAT: - checkPs.check_mode = CHECK_GREAT; - break; - case caffe::CheckParameter_CheckOp_GREATEQUAL: - checkPs.check_mode = CHECK_GREATEQUAL; - break; - default: { - std::cerr << "[ERROR] unknown check mode" << std::endl; - exit(-1); - } - } - curPs.check_spec = checkPs; - return curPs; - } - - ParameterSpec adapt_PreAllocatedMemory() override { - ParameterSpec curPs; - 
initialization_zero(&curPs, sizeof(curPs)); - PreAllocatedMemoryParamSpec preAllocatedMemoryPs; - auto caffePreAllocatedMemoryParam = layer.preallocated_memory_param(); - preAllocatedMemoryPs.desc.nDims = caffePreAllocatedMemoryParam.shape().dim_size(); - for (I32 iter=0; iter < caffePreAllocatedMemoryParam.shape().dim_size(); iter++) { - preAllocatedMemoryPs.desc.dims[preAllocatedMemoryPs.desc.nDims - 1 - iter] = caffePreAllocatedMemoryParam.shape().dim(iter); - } - preAllocatedMemoryPs.desc.df = getTensorDefaultDataFormat(preAllocatedMemoryPs.desc.nDims); - auto dt = caffePreAllocatedMemoryParam.data_type(); - switch (dt) - { - case caffe::PreAllocatedMemoryParameter_DataType_FLOAT32: - preAllocatedMemoryPs.desc.dt = DT_F32; - break; - case caffe::PreAllocatedMemoryParameter_DataType_UINT32: - preAllocatedMemoryPs.desc.dt = DT_U32; - break; - case caffe::PreAllocatedMemoryParameter_DataType_INT32: - preAllocatedMemoryPs.desc.dt = DT_I32; - break; - default: { - std::cerr << "[ERROR] unknown memory data type" << std::endl; - exit(-1); - } - } - curPs.preallocated_memory_spec = preAllocatedMemoryPs; - return curPs; - } - - ParameterSpec adapt_SharedWeight() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - weightNumber = weightNumber + 1; - SharedWeightParamSpec sharedWeightPs; - auto caffeSharedWeightParam = layer.shared_weight_param(); - sharedWeightPs.desc.nDims = caffeSharedWeightParam.shape().dim_size(); - for (I32 iter=0; iter < caffeSharedWeightParam.shape().dim_size(); iter++) { - sharedWeightPs.desc.dims[sharedWeightPs.desc.nDims - 1 - iter] = caffeSharedWeightParam.shape().dim(iter); - } - sharedWeightPs.desc.df = getTensorDefaultDataFormat(sharedWeightPs.desc.nDims); - auto dt = caffeSharedWeightParam.data_type(); - switch (dt) - { - case caffe::SharedWeightParameter_DataType_FLOAT32: - sharedWeightPs.desc.dt = DT_F32; - break; - case caffe::SharedWeightParameter_DataType_UINT32: - sharedWeightPs.desc.dt = DT_U32; - break; - case caffe::SharedWeightParameter_DataType_INT32: - sharedWeightPs.desc.dt = DT_I32; - break; - default: { - std::cerr << "[ERROR] unknown weight data type" << std::endl; - exit(-1); - } - } - curPs.shared_weight_spec = sharedWeightPs; - return curPs; - } - - ParameterSpec adapt_Copy() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - CopyParamSpec copyPs; - auto caffeCopyParam = layer.copy_param(); - copyPs.src_dims[0] = caffeCopyParam.src_batch_stride(); - copyPs.src_dims[1] = caffeCopyParam.src_stride(); - copyPs.src_dims[2] = caffeCopyParam.src_offset(); - copyPs.dst_dims[0] = caffeCopyParam.dst_batch_stride(); - copyPs.dst_dims[1] = caffeCopyParam.dst_stride(); - copyPs.dst_dims[2] = caffeCopyParam.dst_offset(); - copyPs.length = caffeCopyParam.length(); - curPs.copy_spec = copyPs; - return curPs; - } - - ParameterSpec adapt_MatMul() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - MatMulParamSpec matmulPs; - auto caffeMatMulParam = layer.matmul_param(); - matmulPs.transpose_a = caffeMatMulParam.transpose_a(); - matmulPs.transpose_b = caffeMatMulParam.transpose_b(); - curPs.matmul_spec = matmulPs; - return curPs; - } - - ParameterSpec adapt_AttentionMask() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - AttentionMaskParamSpec attentionMaskPs; - auto caffeAttentionMaskParam = layer.attention_mask_param(); - attentionMaskPs.attention_length = caffeAttentionMaskParam.attention_length(); - attentionMaskPs.same_length = 
caffeAttentionMaskParam.same_length(); - attentionMaskPs.mask = caffeAttentionMaskParam.mask(); - curPs.attention_mask_spec = attentionMaskPs; - return curPs; - } - - ParameterSpec adapt_RelativePositionEmbedding() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - weightNumber = weightNumber + 1; - RelativePositionEmbedParamSpec relativePositionEmbedPs; - auto caffeRelativePositionEmbedParam = layer.relative_position_embed_param(); - relativePositionEmbedPs.input_dim = caffeRelativePositionEmbedParam.input_dim(); - relativePositionEmbedPs.num_output = caffeRelativePositionEmbedParam.num_output(); - relativePositionEmbedPs.bias_term = caffeRelativePositionEmbedParam.bias_term() == 0 ? false: true; - relativePositionEmbedPs.transpose = caffeRelativePositionEmbedParam.transpose() == 0 ? false : true; - relativePositionEmbedPs.axis = caffeRelativePositionEmbedParam.axis(); - curPs.relative_position_embed_spec = relativePositionEmbedPs; - return curPs; - } - - ParameterSpec adapt_RelativeShift() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - RelativeShiftParamSpec relativeShiftPs; - auto caffeRelativeShiftParam = layer.relative_shift_param(); - relativeShiftPs.axis = caffeRelativeShiftParam.axis(); - relativeShiftPs.shift_length = caffeRelativeShiftParam.shift_length(); - curPs.relative_shift_spec = relativeShiftPs; - return curPs; - } - - ParameterSpec adapt_Concat() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - ConcatParamSpec concatPs; - auto caffeConcatParam = layer.concat_param(); - concatPs.axis = caffeConcatParam.axis(); - curPs.concat_spec = concatPs; - return curPs; - } - - ParameterSpec adapt_Softmax() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - SoftmaxParamSpec softmaxPs; - auto caffeSoftmaxParam = layer.softmax_param(); - softmaxPs.axis = caffeSoftmaxParam.axis(); - curPs.softmax_spec = softmaxPs; - return curPs; - } - - ParameterSpec adapt_PriorBox() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - PriorBoxParamSpec priorboxPs; - auto caffePriorBoxParam = layer.prior_box_param(); - CHECK_REQUIREMENT(caffePriorBoxParam.min_size_size() <= 2 && caffePriorBoxParam.max_size_size() <= 2); - for (int i = 0; i < 2; i++){ - priorboxPs.min_sizes[i] = 0; - if(i < caffePriorBoxParam.min_size_size()) - priorboxPs.min_sizes[i] = caffePriorBoxParam.min_size(i); - } - for (int i = 0; i < 2; i++){ - priorboxPs.max_sizes[i] = 0; - if(i < caffePriorBoxParam.max_size_size()) - priorboxPs.max_sizes[i] = caffePriorBoxParam.max_size(i); - } - CHECK_REQUIREMENT(caffePriorBoxParam.aspect_ratio_size() <= 2); - for (int i = 0; i < 2; i++){ - priorboxPs.aspect_ratios[i] = 0; - if(i < caffePriorBoxParam.aspect_ratio_size()) - priorboxPs.aspect_ratios[i] = caffePriorBoxParam.aspect_ratio(i); - } - if(caffePriorBoxParam.has_flip()){ - if(caffePriorBoxParam.flip()) - priorboxPs.flip = 1; - else - priorboxPs.flip = 0; - } else - priorboxPs.flip = 1; - if(caffePriorBoxParam.has_clip()){ - if(caffePriorBoxParam.clip()) - priorboxPs.clip = 1; - else - priorboxPs.clip = 0; - } else - priorboxPs.clip = 0; - if (caffePriorBoxParam.variance_size() == 4){ - priorboxPs.variances[0] = caffePriorBoxParam.variance(0); - priorboxPs.variances[1] = caffePriorBoxParam.variance(1); - priorboxPs.variances[2] = caffePriorBoxParam.variance(2); - priorboxPs.variances[3] = caffePriorBoxParam.variance(3); - } - else if (caffePriorBoxParam.variance_size() == 1){ - 
priorboxPs.variances[0] = caffePriorBoxParam.variance(0); - priorboxPs.variances[1] = caffePriorBoxParam.variance(0); - priorboxPs.variances[2] = caffePriorBoxParam.variance(0); - priorboxPs.variances[3] = caffePriorBoxParam.variance(0); - } - priorboxPs.image_w = 0; - priorboxPs.image_h = 0; - if (caffePriorBoxParam.has_img_size()){ - priorboxPs.image_w = caffePriorBoxParam.img_size(); - priorboxPs.image_h = caffePriorBoxParam.img_size(); - } - if (caffePriorBoxParam.has_img_w() && caffePriorBoxParam.has_img_h()){ - priorboxPs.image_w = caffePriorBoxParam.img_w(); - priorboxPs.image_h = caffePriorBoxParam.img_h(); - } - priorboxPs.step_w = 0; - priorboxPs.step_h = 0; - if(caffePriorBoxParam.has_step()){ - priorboxPs.step_w = caffePriorBoxParam.step(); - priorboxPs.step_h = caffePriorBoxParam.step(); - } - if (caffePriorBoxParam.has_step_w() && caffePriorBoxParam.has_step_h()){ - priorboxPs.step_w = caffePriorBoxParam.step_w(); - priorboxPs.step_h = caffePriorBoxParam.step_h(); - } - priorboxPs.offset = caffePriorBoxParam.offset(); - curPs.prior_box_spec = priorboxPs; - return curPs; - } - - ParameterSpec adapt_DetectionOutput() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - DetectionOutputParamSpec detectionoutputPs; - auto caffeDetectionOutputParam = layer.detection_output_param(); - detectionoutputPs.num_class = caffeDetectionOutputParam.num_classes(); - CHECK_REQUIREMENT((caffeDetectionOutputParam.background_label_id() == 0)&&(caffeDetectionOutputParam.share_location() == true)); - detectionoutputPs.nms_threshold = caffeDetectionOutputParam.nms_param().nms_threshold(); - detectionoutputPs.nms_top_k = caffeDetectionOutputParam.nms_param().top_k(); - detectionoutputPs.keep_top_k = caffeDetectionOutputParam.keep_top_k(); - detectionoutputPs.confidence_threshold = caffeDetectionOutputParam.confidence_threshold(); - curPs.detection_output_spec = detectionoutputPs; - return curPs; - } - - ParameterSpec adapt_Clip() override - { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - ClipParamSpec clipParam; - auto caffeClipParam = layer.clip_param(); - clipParam.min = caffeClipParam.min(); - clipParam.max = caffeClipParam.max(); - curPs.clip_spec = clipParam; - return curPs; - } - - ParameterSpec adapt_Relu() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - ReLUParamSpec reluSpec; - reluSpec.neg_slope = 0.0; - curPs.relu_spec = reluSpec; - return curPs; - } - -private: - caffe::NetParameter proto; - caffe::NetParameter net; - caffe::LayerParameter layer; - int weightNumber; -}; -#endif diff --git a/model-tools/src/data_type_converter.cpp b/model-tools/src/data_type_converter.cpp deleted file mode 100644 index 60be6b0a..00000000 --- a/model-tools/src/data_type_converter.cpp +++ /dev/null @@ -1,329 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include <string> -#include <fstream> -#include <iostream> -#include <map> -#include <vector> -#include <cmath> -#include <cstring> -#include <climits> -#include "model_serialize_deserialize.hpp" -#include "model_tools.h" -#include "model_optimizer.hpp" -#include "OPOptimizers/DeprecatedOPOptimizer.hpp" -#include <math.h> - - -template <typename T> -EE ws_datatype_converter(U8* originalPtr, U8* targetPtr, int paramNum) -{ - F32* f32PtrParam = (F32*)originalPtr; - T* targetPtrParam = (T*)targetPtr; - for (int j = 0; j < paramNum; j++) { - F32 originalParam = f32PtrParam[j]; - T changedParam = (T)originalParam; - targetPtrParam[j] = changedParam; - } - return SUCCESS; -} - -EE ws_datatype_converter_bnn(U8* originalPtr, U8* targetPtr, int paramNum) -{ - F32* f32PtrParam = (F32*)originalPtr; - BIN8* targetPtrParam = (BIN8*)targetPtr; - for (int i = 0; i < paramNum; i+=8) { - BIN8 temp = 0; // Initialize all bits to 0 - for (int j = 0; j < 8; j++) { - U32 bitNo = 7 - j; - if (f32PtrParam[i + j] == 1.0) { // Set bit if weight is 1.0. Works for both DOREFA and XNOR - temp |= (1 << bitNo); - } - } - targetPtrParam[i/8] = temp; - } - return SUCCESS; -} - -// return quantization scale -F32 ws_datatype_converter_int8(U8* originalPtr, U8* targetPtr, int paramNum) -{ - F32* f32PtrParam = (F32*)originalPtr; - INT8* targetPtrParam = (INT8*)targetPtr; - - F32 maxabs = 0; - for (int i = 0; i < paramNum; i++) { - if (std::abs(f32PtrParam[i]) > maxabs) { - maxabs = std::abs(f32PtrParam[i]); - } - } - - F32 scale = 127.0 / maxabs; - for (int i = 0; i < paramNum; i++) { - targetPtrParam[i] = round(f32PtrParam[i] * scale); - } - return scale; -} - - -inline EE getTargetDataType(DataConvertType convertMode, DataType *type) { - if (*type != DT_F32) - return SUCCESS; - - switch (convertMode) { - case F32_to_F32:{ - *type = DT_F32; - break; - } - case F32_to_F16:{ - *type = DT_F16; - break; - } - case F32_to_I8:{ - *type = DT_I8; - break; - } - default: - return NOT_SUPPORTED; - } - return SUCCESS; -} - - -EE ms_datatype_converter(ModelSpec* originalMs, ModelSpec* targetMs, DataConvertType convertMode, bool quantStorage) -{ - str_copy(targetMs->model_name, originalMs->model_name, NAME_LEN); - targetMs->dt = originalMs->dt; - CHECK_STATUS(getTargetDataType(convertMode, &(targetMs->dt))); - - targetMs->num_inputs = originalMs->num_inputs; - targetMs->input_names = (I8 **)mt_new_storage(targetMs->num_inputs * sizeof(I8 *)); - for (I32 j = 0; j < targetMs->num_inputs; j++) { - targetMs->input_names[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - str_copy(targetMs->input_names[j], originalMs->input_names[j], NAME_LEN); - } - targetMs->input_dims = (TensorDesc*)mt_new_storage(targetMs->num_inputs * sizeof(TensorDesc)); - memcpy(targetMs->input_dims, originalMs->input_dims, targetMs->num_inputs * sizeof(TensorDesc)); - for (I32 i = 0; i < targetMs->num_inputs; i++) { - CHECK_STATUS(getTargetDataType(convertMode, &(targetMs->input_dims[i].dt))); - } - - targetMs->num_outputs = originalMs->num_outputs; - targetMs->output_names = (I8 **)mt_new_storage(targetMs->num_outputs * sizeof(I8 *)); - for (int j = 0; j < targetMs->num_outputs; j++) 
{ - targetMs->output_names[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - str_copy(targetMs->output_names[j], originalMs->output_names[j], NAME_LEN); - } - - targetMs->num_operator_specs = originalMs->num_operator_specs; - OperatorSpec* opsPtr = (OperatorSpec*)mt_new_storage(targetMs->num_operator_specs * sizeof(OperatorSpec)); - std::map weightDataTypeMap; - - for (int i = 0; i < targetMs->num_operator_specs; i++) { - str_copy(opsPtr[i].name, originalMs->ops[i].name, NAME_LEN); - opsPtr[i].type = originalMs->ops[i].type; - opsPtr[i].num_inputs = originalMs->ops[i].num_inputs; - opsPtr[i].input_tensors_name = (I8 **)mt_new_storage(opsPtr[i].num_inputs * sizeof(I8 *)); - for (U32 j = 0; j < opsPtr[i].num_inputs; j++) { - opsPtr[i].input_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - memcpy(opsPtr[i].input_tensors_name[j], originalMs->ops[i].input_tensors_name[j], NAME_LEN); - } - opsPtr[i].num_outputs = originalMs->ops[i].num_outputs; - opsPtr[i].output_tensors_name = (I8 **)mt_new_storage(opsPtr[i].num_outputs * sizeof(I8 *)); - for (U32 j = 0; j < opsPtr[i].num_outputs; j++) { - opsPtr[i].output_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - memcpy(opsPtr[i].output_tensors_name[j], originalMs->ops[i].output_tensors_name[j], NAME_LEN); - } - - if (OT_None != opsPtr[i].type) { - U32 numTensors = opsPtr[i].num_inputs + opsPtr[i].num_outputs; - opsPtr[i].tensor_positions = (I32*)mt_new_storage(numTensors * sizeof(I32)); - memcpy(opsPtr[i].tensor_positions, originalMs->ops[i].tensor_positions, numTensors * sizeof(I32)); - } else { - opsPtr[i].tensor_positions = nullptr; - } - - opsPtr[i].num_quant_feature = originalMs->ops[i].num_quant_feature; - if (0 == opsPtr[i].num_quant_feature) { - opsPtr[i].feature_scale = nullptr; - } else { - opsPtr[i].feature_scale = (QuantSpec*)mt_new_storage(opsPtr[i].num_quant_feature * sizeof(QuantSpec)); - for (U32 j = 0; j < opsPtr[i].num_quant_feature; j++) { - opsPtr[i].feature_scale[j].num_scale = originalMs->ops[i].feature_scale[j].num_scale; - int num = opsPtr[i].feature_scale[j].num_scale; - - opsPtr[i].feature_scale[j].scale = (F32*)mt_new_storage(num * sizeof(F32)); - memcpy(opsPtr[i].feature_scale[j].scale, originalMs->ops[i].feature_scale[j].scale, num * sizeof(F32)); - } - } - - opsPtr[i].ps = originalMs->ops[i].ps; - - switch (opsPtr[i].type) { - case OT_Eltwise: { - if (opsPtr[i].ps.eltwise_spec.elt_mode == ELTWISE_SUM) { - U32 bytes = opsPtr[i].ps.eltwise_spec.elt_sum_spec.coeff_size * sizeof(float); - opsPtr[i].ps.eltwise_spec.elt_sum_spec.coeff_values = (float *)mt_new_storage(bytes); - memcpy(opsPtr[i].ps.eltwise_spec.elt_sum_spec.coeff_values, - originalMs->ops[i].ps.eltwise_spec.elt_sum_spec.coeff_values, bytes); - } - break; - } - case OT_SharedWeight: { - weightDataTypeMap[opsPtr[i].name] = opsPtr[i].ps.shared_weight_spec.desc.dt; - CHECK_STATUS(getTargetDataType(convertMode, &(opsPtr[i].ps.shared_weight_spec.desc.dt))); - break; - } - case OT_PreAllocatedMemory: { - CHECK_STATUS(getTargetDataType(convertMode, &(opsPtr[i].ps.preallocated_memory_spec.desc.dt))); - break; - } - default: - break; - } - } - targetMs->ops = opsPtr; - targetMs->num_weight_specs = originalMs->num_weight_specs; - WeightSpec* wsPtr = (WeightSpec*)mt_new_storage(targetMs->num_weight_specs * sizeof(WeightSpec)); - for (int i = 0; i < targetMs->num_weight_specs; i++) { - str_copy(wsPtr[i].op_name, originalMs->ws[i].op_name, NAME_LEN); - - int weightNum = 0; - if (originalMs->ws[i].mdt == DT_BIN01 || originalMs->ws[i].mdt == 
DT_BIN11) { - wsPtr[i].mdt = originalMs->ws[i].mdt; - weightNum = originalMs->ws[i].bytes_of_weight / bytesOf(DT_F32); - wsPtr[i].bytes_of_weight = weightNum * bytesOf(wsPtr[i].mdt) / 8; - } else { - DataType wdt = originalMs->ws[i].mdt; - if (weightDataTypeMap.find(wsPtr[i].op_name) != weightDataTypeMap.end()) { - wdt = weightDataTypeMap[wsPtr[i].op_name]; - } - CHECK_STATUS(getTargetDataType(convertMode, &wdt)); - - wsPtr[i].mdt = wdt; - if (quantStorage && (wdt == DT_F32 || wdt == DT_F16)) { - wsPtr[i].mdt = DT_I8; - } - - weightNum = originalMs->ws[i].bytes_of_weight / bytesOf(originalMs->ws[i].mdt); - wsPtr[i].bytes_of_weight = weightNum * bytesOf(wsPtr[i].mdt); - } - - wsPtr[i].num_quant_scale = originalMs->ws[i].num_quant_scale; - if (0 == wsPtr[i].num_quant_scale) { - wsPtr[i].weight_scale = nullptr; - } else { - wsPtr[i].weight_scale = (QuantSpec*)mt_new_storage(wsPtr[i].num_quant_scale * sizeof(QuantSpec)); - for (U32 j = 0; j < wsPtr[i].num_quant_scale; j++) { - wsPtr[i].weight_scale[j].num_scale = originalMs->ws[i].weight_scale[j].num_scale; - int num = wsPtr[i].weight_scale[j].num_scale; - - wsPtr[i].weight_scale[j].scale = (F32*)mt_new_storage(num * sizeof(F32)); - memcpy(wsPtr[i].weight_scale[j].scale, originalMs->ws[i].weight_scale[j].scale, num * sizeof(F32)); - } - } - - wsPtr[i].weight = (U8*)mt_new_storage(wsPtr[i].bytes_of_weight); - - DataType vdt = DT_F32; - int biasNum = originalMs->ws[i].bytes_of_vec / bytesOf(DT_F32); - CHECK_STATUS(getTargetDataType(convertMode, &vdt)); - wsPtr[i].bytes_of_vec = biasNum * bytesOf(vdt); - wsPtr[i].vec = (U8*)mt_new_storage(wsPtr[i].bytes_of_vec); - - switch (wsPtr[i].mdt) { - case DT_F32: { - CHECK_STATUS(ws_datatype_converter(originalMs->ws[i].weight, wsPtr[i].weight, weightNum)); - CHECK_STATUS(ws_datatype_converter(originalMs->ws[i].vec, wsPtr[i].vec, biasNum)); - break; - } - case DT_I32: { - CHECK_STATUS(ws_datatype_converter(originalMs->ws[i].weight, wsPtr[i].weight, weightNum)); - CHECK_STATUS(ws_datatype_converter(originalMs->ws[i].vec, wsPtr[i].vec, biasNum)); - break; - } - case DT_U32: { - CHECK_STATUS(ws_datatype_converter(originalMs->ws[i].weight, wsPtr[i].weight, weightNum)); - CHECK_STATUS(ws_datatype_converter(originalMs->ws[i].vec, wsPtr[i].vec, biasNum)); - break; - } -#ifdef __aarch64__ - case DT_F16: { - CHECK_STATUS(ws_datatype_converter(originalMs->ws[i].weight, wsPtr[i].weight, weightNum)); - CHECK_STATUS(ws_datatype_converter(originalMs->ws[i].vec, wsPtr[i].vec, biasNum)); - break; - } -#endif - case DT_I8: { - F32 scale = ws_datatype_converter_int8(originalMs->ws[i].weight, wsPtr[i].weight, weightNum); - wsPtr[i].num_quant_scale = 1; - wsPtr[i].weight_scale = (QuantSpec*)mt_new_storage(sizeof(QuantSpec)); - wsPtr[i].weight_scale[0].num_scale = 1; - wsPtr[i].weight_scale[0].scale = (F32*)mt_new_storage(sizeof(F32)); - wsPtr[i].weight_scale[0].scale[0] = scale; - - if (DT_F32 == vdt) { - CHECK_STATUS(ws_datatype_converter(originalMs->ws[i].vec, wsPtr[i].vec, biasNum)); - } else { -#ifdef __aarch64__ - CHECK_STATUS(ws_datatype_converter(originalMs->ws[i].vec, wsPtr[i].vec, biasNum)); -#endif - } - break; - } -#ifdef __aarch64__ - case DT_BIN01: { - CHECK_STATUS(ws_datatype_converter_bnn(originalMs->ws[i].weight, wsPtr[i].weight, weightNum)); - CHECK_STATUS(ws_datatype_converter(originalMs->ws[i].vec, wsPtr[i].vec, biasNum)); // Assume F16 for the vector - break; - } - case DT_BIN11: { - CHECK_STATUS(ws_datatype_converter_bnn(originalMs->ws[i].weight, wsPtr[i].weight, weightNum)); - 
CHECK_STATUS(ws_datatype_converter(originalMs->ws[i].vec, wsPtr[i].vec, biasNum)); // Assume F16 for the vector - break; - } -#endif - default: - return NOT_SUPPORTED; - } - } - targetMs->ws = wsPtr; - - if (nullptr != originalMs->op_relationship_entries) { - targetMs->num_op_tensor_entries = originalMs->num_op_tensor_entries; - targetMs->op_relationship_entries = (OperatorRelationshipMapEntry *)mt_new_storage(targetMs->num_op_tensor_entries * sizeof(OperatorRelationshipMapEntry)); - for (int i = 0; i < targetMs->num_op_tensor_entries; i++) { - str_copy(targetMs->op_relationship_entries[i].op, originalMs->op_relationship_entries[i].op, NAME_LEN); - - targetMs->op_relationship_entries[i].num_inputs = originalMs->op_relationship_entries[i].num_inputs; - targetMs->op_relationship_entries[i].input_op_names = (I8 **)mt_new_storage(targetMs->op_relationship_entries[i].num_inputs * sizeof(I8 *)); - for (U32 j = 0; j < targetMs->op_relationship_entries[i].num_inputs; j++) { - targetMs->op_relationship_entries[i].input_op_names[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - str_copy(targetMs->op_relationship_entries[i].input_op_names[j], originalMs->op_relationship_entries[i].input_op_names[j], NAME_LEN); - } - - targetMs->op_relationship_entries[i].num_outputs = originalMs->op_relationship_entries[i].num_outputs; - targetMs->op_relationship_entries[i].output_op_names = (I8 **)mt_new_storage(targetMs->op_relationship_entries[i].num_outputs * sizeof(I8 *)); - for (U32 j = 0; j < targetMs->op_relationship_entries[i].num_outputs; j++) { - targetMs->op_relationship_entries[i].output_op_names[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - str_copy(targetMs->op_relationship_entries[i].output_op_names[j], originalMs->op_relationship_entries[i].output_op_names[j], NAME_LEN); - } - } - } else { - targetMs->num_op_tensor_entries = 0; - targetMs->op_relationship_entries = nullptr; - } - return SUCCESS; -} diff --git a/model-tools/src/model_adaptee.h b/model-tools/src/model_adaptee.h deleted file mode 100644 index 8b3f6444..00000000 --- a/model-tools/src/model_adaptee.h +++ /dev/null @@ -1,410 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
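The int8 path of the converter above stores a single per-tensor scale: scale = 127 / max|w| and q[i] = round(w[i] * scale), so dequantization is simply q[i] / scale. For reference, a standalone sketch of the same scheme, using plain float/int8_t in place of the project's F32/INT8 typedefs; the helper name and sample values are illustrative only:

    // Standalone sketch of the symmetric int8 quantization used by
    // ws_datatype_converter_int8: scale = 127 / max|w|, q = round(w * scale).
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    static float quantize_symmetric(const float* w, int8_t* q, int n)
    {
        float maxabs = 0;
        for (int i = 0; i < n; i++) {
            maxabs = std::fmax(maxabs, std::fabs(w[i]));
        }
        float scale = 127.0f / maxabs;  // assumes at least one non-zero weight
        for (int i = 0; i < n; i++) {
            q[i] = (int8_t)std::lround(w[i] * scale);
        }
        return scale;  // kept as the single entry in weight_scale
    }

    int main()
    {
        float w[4] = {-0.5f, 0.25f, 1.0f, -0.125f};
        int8_t q[4];
        float scale = quantize_symmetric(w, q, 4);
        for (int i = 0; i < 4; i++) {
            // q / scale approximates the original weight
            printf("%f -> %d -> %f\n", w[i], q[i], q[i] / scale);
        }
        return 0;
    }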
- - -#ifndef _H_MODELADAPTEE -#define _H_MODELADAPTEE - -#include <string> -#include "model_tools.h" -#include "op_type.h" -#include "error.h" - -#define UNIMPLEMENT() printf("UNIMPLEMENTED %s, error occurs in file: %s - line: %d\n", __PRETTY_FUNCTION__, __FILE__, __LINE__) - -class ModelAdaptee { -public: - virtual EE adapt(std::string dir, std::string mfn, ModelSpec* ms) { - EE ret = parse_file(dir, mfn); - if (SUCCESS == ret) { - ret = adapt_operators(ms); - } - if (SUCCESS == ret) { - ret = adapt_weights(ms); - } - return ret; - } - - ModelAdaptee() {} - virtual ~ModelAdaptee() {} - -protected: - virtual EE parse_file(std::string dir, std::string mfn) = 0; - - virtual EE adapt_operators(ModelSpec* ms) = 0; - - virtual EE adapt_weights(ModelSpec* ms) = 0; - - - virtual EE adapt_operator(OperatorType type, ParameterSpec* ps) { - if (type == OT_Input - || type == OT_TanH - || type == OT_Gelu - || type == OT_Jump - ) - { - } else if (type == OT_Conv) { - *ps = adapt_Conv(); - } else if (type == OT_Deconvolution) { - *ps = adapt_Deconvolution(); - } else if (type == OT_FC) { - *ps = adapt_Fc(); - } else if (type == OT_Pooling) { - *ps = adapt_Pooling(); - } else if (type == OT_Eltwise) { - *ps = adapt_Eltwise(); - } else if (type == OT_BatchNorm) { - *ps = adapt_BatchNorm(); - } else if (type == OT_Scale) { - *ps = adapt_Scale(); - } else if (type == OT_Clip) { - *ps = adapt_Clip(); - } else if (type == OT_LSTM) { - *ps = adapt_LSTM(); - } else if (type == OT_Embedding) { - *ps = adapt_Embedding(); - } else if (type == OT_Pad) { - *ps = adapt_Pad(); - } else if (type == OT_LayerNorm) { - *ps = adapt_LayerNorm(); - } else if (type == OT_Multiply) { - *ps = adapt_Multiply(); - } else if (type == OT_Reshape) { - *ps = adapt_Reshape(); - } else if (type == OT_Slice) { - *ps = adapt_Slice(); - } else if (type == OT_Transpose) { - *ps = adapt_Transpose(); - } else if (type == OT_Attention) { - *ps = adapt_Attention(); - } else if (type == OT_Upsample) { - *ps = adapt_Upsample(); - } else if (type == OT_Cast) { - *ps = adapt_Cast(); - } else if (type == OT_Gather) { - *ps = adapt_Gather(); - } else if (type == OT_Unsqueeze) { - *ps = adapt_Unsqueeze(); - } else if (type == OT_Squeeze) { - *ps = adapt_Squeeze(); - } else if (type == OT_ArgMax) { - *ps = adapt_ArgMax(); - } else if (type == OT_Repeat) { - *ps = adapt_Repeat(); - } else if (type == OT_Check) { - *ps = adapt_Check(); - } else if (type == OT_SharedWeight) { - *ps = adapt_SharedWeight(); - } else if (type == OT_PreAllocatedMemory) { - *ps = adapt_PreAllocatedMemory(); - } else if (type == OT_Copy) { - *ps = adapt_Copy(); - } else if (type == OT_MatMul) { - *ps = adapt_MatMul(); - } else if (type == OT_Interp) { - *ps = adapt_Interp(); - } else if (type == OT_Flatten) { - *ps = adapt_Flatten(); - } else if (type == OT_AttentionMask) { - *ps = adapt_AttentionMask(); - } else if (type == OT_RelativePositionEmbedding) { - *ps = adapt_RelativePositionEmbedding(); - } else if (type == OT_RelativeShift) { - *ps = adapt_RelativeShift(); - } else if (type == OT_Concat) { - *ps = adapt_Concat(); - } else if (type == OT_Softmax) { - *ps = adapt_Softmax(); - } else if (type == OT_Relu) { - *ps = adapt_Relu(); - } else if (type == OT_Reduction) { - *ps = adapt_Reduction(); - } else if (type == OT_Permute) { - *ps = adapt_Permute(); - } else if (type == OT_PriorBox) { - *ps = adapt_PriorBox(); - } else if (type == OT_DetectionOutput) { - *ps = adapt_DetectionOutput(); - } - // Only these OPs have parameters - return SUCCESS; - } -
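The dispatch above is a classic template method: adapt() fixes the parse/operators/weights pipeline, and each format-specific front end overrides only the per-operator hooks it supports, inheriting UNIMPLEMENT stubs for the rest. A minimal self-contained sketch of that pattern; the types and the TinyCaffeAdaptee class below are simplified stand-ins, not the project's definitions:

    #include <cstdio>
    #include <string>

    enum OpType { OT_CONV, OT_RELU };   // stand-in for the project's OperatorType
    struct Param { int kernel = 0; };   // stand-in for ParameterSpec

    class Adaptee {                     // mirrors ModelAdaptee's shape
    public:
        int adapt(const std::string& path) {  // fixed pipeline, like adapt()
            parse_file(path);
            return adapt_operator(OT_CONV);
        }
        virtual ~Adaptee() {}
    protected:
        virtual void parse_file(const std::string& path) = 0;
        virtual int adapt_operator(OpType t) {
            Param p;
            if (t == OT_CONV) { p = adapt_Conv(); }
            printf("kernel=%d\n", p.kernel);
            return 0;
        }
        virtual Param adapt_Conv() { return Param(); }  // overridable hook
    };

    class TinyCaffeAdaptee : public Adaptee {  // hypothetical front end
    protected:
        void parse_file(const std::string& path) override { (void)path; }
        Param adapt_Conv() override { Param p; p.kernel = 3; return p; }
    };

    int main() { TinyCaffeAdaptee a; return a.adapt("model.prototxt"); }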
virtual ParameterSpec adapt_Permute() - { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Interp() - { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Flatten() - { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Conv() - { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Deconvolution() - { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Fc() - { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Pooling() - { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Eltwise() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_BatchNorm() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Scale() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Clip() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_LSTM() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Embedding() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Pad() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_LayerNorm() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Multiply() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Reshape() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Slice() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Transpose() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Attention() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Upsample() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Cast() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Gather() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Unsqueeze() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Reduction() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Squeeze() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_ArgMax() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Repeat() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Check() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_PreAllocatedMemory() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_SharedWeight() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Copy() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_MatMul() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_AttentionMask() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } 
- - virtual ParameterSpec adapt_RelativePositionEmbedding() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_RelativeShift() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_Concat() { - UNIMPLEMENT(); - ParameterSpec curPs; - ConcatParamSpec concatPs; - concatPs.axis = 1; - curPs.concat_spec = concatPs; - return curPs; - } - - virtual ParameterSpec adapt_Softmax() { - UNIMPLEMENT(); - ParameterSpec curPs; - SoftmaxParamSpec softmaxPs; - softmaxPs.axis = -1; - curPs.softmax_spec = softmaxPs; - return curPs; - } - - virtual ParameterSpec adapt_Relu() { - ParameterSpec curPs; - ReLUParamSpec reluPs; - reluPs.neg_slope = 0; - curPs.relu_spec = reluPs; - return curPs; - } - - virtual ParameterSpec adapt_PriorBox() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - - virtual ParameterSpec adapt_DetectionOutput() { - UNIMPLEMENT(); - ParameterSpec curPs; - return curPs; - } - -}; - -#undef UNIMPLEMENT -#endif diff --git a/model-tools/src/model_deserialize.cpp b/model-tools/src/model_deserialize.cpp deleted file mode 100644 index 30c4695c..00000000 --- a/model-tools/src/model_deserialize.cpp +++ /dev/null @@ -1,474 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include <cstring> -#include <iostream> -#include <map> -#include <set> -#include <sstream> -#include <string> -#include <vector> -#include "model_serialize_deserialize.hpp" -#include "model_tools.h" -#include <fcntl.h> -#include <sys/mman.h> -#include <sys/stat.h> -#include <unistd.h> - -EE str_copy(I8* dst, const I8* src, I32 srcLen) { - memset(dst, 0, NAME_LEN); - I32 copyLen = NAME_LEN - 1; - if (copyLen > srcLen) - copyLen = srcLen; - memcpy(dst, src, copyLen*sizeof(I8)); - return SUCCESS; -} - -void* mt_new_storage(size_t size) -{ - if (size == 0) { - return nullptr; - } else { - U8* s = new U8[size]; - return (void*)s; - } -} - -EE operator_relationship(ModelSpec* spec) { - std::map<std::string, bool> opCanInChange; - std::set<std::string> inplaceTensors; - std::map<std::string, int> inplaceTensorInNum; - std::map<std::string, int> inplaceTensorOutNum; - std::map<std::string, std::vector<std::string>> opInTensorNew; - std::map<std::string, std::string> opOutTensorNew; - std::map<std::string, std::string> tensorOpMapping; - std::map<std::string, std::vector<std::string>> tensorFlowsToOpSet; - - for (int i = 0; i < spec->num_operator_specs; i++) { - if (spec->ops[i].num_inputs == 1 && spec->ops[i].num_outputs == 1) { - std::string inputTensorStr = spec->ops[i].input_tensors_name[0]; - std::string outputTensorStr = spec->ops[i].output_tensors_name[0]; - if (inputTensorStr.compare(outputTensorStr) == 0) { - inplaceTensors.insert(inputTensorStr); - opCanInChange.insert(std::make_pair(inputTensorStr, true)); - } - } - } - - for (int i = 0; i < spec->num_operator_specs; i++) { - std::string currentOpName = spec->ops[i].name; - int in_tensor_number = spec->ops[i].num_inputs; - std::vector<std::string> inTensorVec; - - // dealing with the relationship of op -- input tensors - for (int j = 0; j < in_tensor_number; j++) { - std::string tmpInTensor = spec->ops[i].input_tensors_name[j]; - if (inplaceTensors.find(tmpInTensor) != inplaceTensors.end()) { // check whether this tensor is written inplace - int inId; - if (inplaceTensorInNum.find(tmpInTensor) == inplaceTensorInNum.end()) { - inId = 1; - inplaceTensorInNum.insert(std::make_pair(tmpInTensor, inId)); - opCanInChange[tmpInTensor] = true; - }else{ - if (opCanInChange[tmpInTensor] == false) { - inId = inplaceTensorInNum[tmpInTensor]+1; - inplaceTensorInNum[tmpInTensor] = inId; - opCanInChange[tmpInTensor] = true; - }else{ - inId = inplaceTensorInNum[tmpInTensor]; - opCanInChange[tmpInTensor] = true; - } - } - std::ostringstream stream; - stream << inId; - std::string tmpInTensorChanged = tmpInTensor + "_" + stream.str(); - inTensorVec.push_back(tmpInTensorChanged); - - if (tensorFlowsToOpSet.find(tmpInTensorChanged) == tensorFlowsToOpSet.end()) { - std::vector<std::string> tmpVector; - tmpVector.push_back(currentOpName); - tensorFlowsToOpSet.insert(std::make_pair(tmpInTensorChanged, tmpVector)); - }else{ - tensorFlowsToOpSet[tmpInTensorChanged].push_back(currentOpName); - } - - }else{ - inTensorVec.push_back(tmpInTensor); - - if (tensorFlowsToOpSet.find(tmpInTensor) == tensorFlowsToOpSet.end()) { - std::vector<std::string> tmpVector; - tmpVector.push_back(currentOpName); - tensorFlowsToOpSet.insert(std::make_pair(tmpInTensor, tmpVector)); - }else{ - tensorFlowsToOpSet[tmpInTensor].push_back(currentOpName); - } - } - } - opInTensorNew.insert(std::make_pair(currentOpName, inTensorVec)); - - // dealing with the relationship of op -- output tensors - std::string tmpOutTensor = spec->ops[i].output_tensors_name[0]; - if (inplaceTensors.find(tmpOutTensor) != inplaceTensors.end()) { - int outId; - if (inplaceTensorOutNum.find(tmpOutTensor) == inplaceTensorOutNum.end()) { - outId = 1; - inplaceTensorOutNum.insert(std::make_pair(tmpOutTensor, outId)); - opCanInChange[tmpOutTensor] = false; - }else{ - outId =
inplaceTensorOutNum[tmpOutTensor] + 1; - // note: map::insert would not update an existing key, so assign directly - inplaceTensorOutNum[tmpOutTensor] = outId; - opCanInChange[tmpOutTensor] = false; - } - std::ostringstream stream; - stream << outId; - std::string tmpOutTensorChanged = tmpOutTensor + "_" + stream.str(); - opOutTensorNew.insert(std::make_pair(currentOpName, tmpOutTensorChanged)); - tensorOpMapping.insert(std::make_pair(tmpOutTensorChanged, currentOpName)); - }else{ - opOutTensorNew.insert(std::make_pair(currentOpName, tmpOutTensor)); - tensorOpMapping.insert(std::make_pair(tmpOutTensor, currentOpName)); - } - } - - // assign op-op relationship - int opNum = spec->num_operator_specs; - spec->num_op_tensor_entries = opNum; - OperatorSpec* opsPtr2 = spec->ops; - OperatorRelationshipMapEntry* oprmePtr = (OperatorRelationshipMapEntry*)mt_new_storage(sizeof(OperatorRelationshipMapEntry) * opNum); - spec->op_relationship_entries = oprmePtr; - for (int j = 0; j < opNum; j++) { - str_copy(oprmePtr[j].op, opsPtr2[j].name, NAME_LEN); - int opInOpNum = opInTensorNew[opsPtr2[j].name].size(); - oprmePtr[j].num_inputs = opInOpNum; - oprmePtr[j].input_op_names = (I8 **)mt_new_storage(opInOpNum * sizeof(I8 *)); - for (int k = 0; k < opInOpNum; k++) { - oprmePtr[j].input_op_names[k] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - std::string ten_name = opInTensorNew[opsPtr2[j].name][k]; - std::string tensor2op = tensorOpMapping[ten_name]; - str_copy(oprmePtr[j].input_op_names[k], tensor2op.c_str(), tensor2op.length()); - } - - int opOutOpNum = tensorFlowsToOpSet[opOutTensorNew[opsPtr2[j].name]].size(); - oprmePtr[j].num_outputs = opOutOpNum; - oprmePtr[j].output_op_names = (I8 **)mt_new_storage(opOutOpNum * sizeof(I8 *)); - for (int k = 0; k < opOutOpNum; k++) { - oprmePtr[j].output_op_names[k] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - std::string tensor2op = tensorFlowsToOpSet[opOutTensorNew[opsPtr2[j].name]][k]; - str_copy(oprmePtr[j].output_op_names[k], tensor2op.c_str(), tensor2op.length()); - } - } - return SUCCESS; -} - -template <typename T> -void dequantize_int8_weight(int num, F32 scale, INT8* q, T* d) -{ - F32 factor = 1 / scale; - T table[255]; - int base = -127; - for (int i = 0; i < 255; i++) { - table[i] = factor * base; - base++; - } - T *mid = table + 127; - for (int i = 0; i < num; i++) { - d[i] = *(mid + q[i]); - } -} - -EE deserialize_header(const char* bytes, ModelSpec* spec, U32* pos) -{ - const char* pointer = bytes + *pos; - memcpy(&spec->version, pointer, sizeof(I32)); - pointer += sizeof(I32); - *pos += sizeof(I32); - if (spec->version != mt_version()) { - std::cerr << "[ERROR] version not_match: code " << mt_version() << \ ", bolt model " << spec->version << std::endl; - CHECK_STATUS(NOT_MATCH); - } - - memcpy(&spec->magic_number, pointer, sizeof(I32)); - pointer += sizeof(I32); - *pos += sizeof(I32); - if (spec->magic_number != mt_magic_number()) { - std::cerr << "[ERROR] magic_number not_match: code " << mt_magic_number() << \ ", bolt model " << spec->magic_number << std::endl; - CHECK_STATUS(NOT_MATCH); - } - - str_copy(spec->model_name, pointer, NAME_LEN); - pointer += NAME_LEN; - *pos += NAME_LEN; - - spec->dt = *((DataType*)pointer); - pointer += sizeof(DataType); - *pos += sizeof(DataType); - - spec->num_inputs = *((I32*)pointer); - pointer += sizeof(I32); - *pos += sizeof(I32); - - spec->input_names = (I8**)mt_new_storage(spec->num_inputs * sizeof(I8*)); - for (int i = 0; i < spec->num_inputs; i++) { - spec->input_names[i] = (I8*)mt_new_storage(NAME_LEN *
sizeof(I8)); - str_copy(spec->input_names[i], pointer, NAME_LEN); - pointer += NAME_LEN; - *pos += NAME_LEN; - } - - spec->input_dims = (TensorDesc *)mt_new_storage(spec->num_inputs * sizeof(TensorDesc)); - memcpy(spec->input_dims, pointer, spec->num_inputs * sizeof(TensorDesc)); - pointer += spec->num_inputs * sizeof(TensorDesc); - *pos += spec->num_inputs * sizeof(TensorDesc); - - spec->num_outputs = *((I32*)pointer); - pointer += sizeof(I32); - *pos += sizeof(I32); - - spec->output_names = (I8**)mt_new_storage(spec->num_outputs * sizeof(I8*)); - for (int i = 0; i < spec->num_outputs; i++) { - spec->output_names[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - str_copy(spec->output_names[i], pointer, NAME_LEN); - pointer += NAME_LEN; - *pos += NAME_LEN; - } - return SUCCESS; -} - -EE deserialize_operator(const char* bytes, ModelSpec* spec, U32* pos) -{ - const char* pointer = bytes + *pos; - I32* p4numOperatorSpecs = (I32 *)pointer; - spec->num_operator_specs = *p4numOperatorSpecs; - pointer += sizeof(U32); - *pos += sizeof(U32); - - OperatorSpec *ptr = (OperatorSpec*)mt_new_storage(spec->num_operator_specs * sizeof(OperatorSpec)); - spec->ops = ptr; - for (int i = 0; i < spec->num_operator_specs; i++) { - str_copy(ptr[i].name, pointer, NAME_LEN); - pointer += NAME_LEN * sizeof(I8); - *pos += NAME_LEN * sizeof(I8); - - ptr[i].type = *((OperatorType *)pointer); - pointer += sizeof(OperatorType); - *pos += sizeof(OperatorType); - - ptr[i].num_inputs = *((U32 *)pointer); - pointer += sizeof(U32); - *pos += sizeof(U32); - - ptr[i].input_tensors_name = (I8 **)mt_new_storage(ptr[i].num_inputs * sizeof(I8 *)); - for (U32 j = 0; j < ptr[i].num_inputs; j++) { - ptr[i].input_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - str_copy(ptr[i].input_tensors_name[j], pointer, NAME_LEN); - pointer += NAME_LEN * sizeof(I8); - *pos += NAME_LEN * sizeof(I8); - } - - ptr[i].num_outputs = *((U32 *)pointer); - pointer += sizeof(U32); - *pos += sizeof(U32); - - ptr[i].output_tensors_name = (I8 **)mt_new_storage(ptr[i].num_outputs * sizeof(I8 *)); - for (U32 j = 0; j < ptr[i].num_outputs; j++) { - ptr[i].output_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - str_copy(ptr[i].output_tensors_name[j], pointer, NAME_LEN); - pointer += NAME_LEN * sizeof(I8); - *pos += NAME_LEN * sizeof(I8); - } - - U32 numTensors = ptr[i].num_inputs + ptr[i].num_outputs; - ptr[i].tensor_positions = (I32*)mt_new_storage(numTensors * sizeof(I32)); - memcpy(ptr[i].tensor_positions, pointer, numTensors * sizeof(I32)); - pointer += numTensors * sizeof(I32); - *pos += numTensors * sizeof(I32); - - ptr[i].num_quant_feature = *((U32 *)pointer); - pointer += sizeof(U32); - *pos += sizeof(U32); - - if (0 == ptr[i].num_quant_feature) { - ptr[i].feature_scale = nullptr; - } else { - ptr[i].feature_scale = (QuantSpec*)mt_new_storage(ptr[i].num_quant_feature * sizeof(QuantSpec)); - } - for (U32 j = 0; j < ptr[i].num_quant_feature; j++) { - ptr[i].feature_scale[j].num_scale = *((int *)pointer); - int num = ptr[i].feature_scale[j].num_scale; - pointer += sizeof(int); - *pos += sizeof(int); - - ptr[i].feature_scale[j].scale = (F32*)mt_new_storage(num * sizeof(F32)); - memcpy(ptr[i].feature_scale[j].scale, pointer, num * sizeof(F32)); - pointer += num * sizeof(F32); - *pos += num * sizeof(F32); - } - - memcpy(&(ptr[i].ps), pointer, sizeof(ParameterSpec)); - pointer += sizeof(ParameterSpec); - *pos += sizeof(ParameterSpec); - if (ptr[i].type == OT_Eltwise && ptr[i].ps.eltwise_spec.elt_mode == ELTWISE_SUM) { - U32 bytes = ptr[i].ps.eltwise_spec.elt_sum_spec.coeff_size * sizeof(float); - ptr[i].ps.eltwise_spec.elt_sum_spec.coeff_values = (float *)mt_new_storage(bytes); - memcpy(ptr[i].ps.eltwise_spec.elt_sum_spec.coeff_values, pointer, bytes); - pointer += bytes; - *pos += bytes; - } - } - return SUCCESS; -} - -EE deserialize_weight(const char* bytes, ModelSpec* spec, U32* pos) -{ - const char* pointer = bytes + *pos; - I32* p4numWeightSpecs = (I32*)pointer; - spec->num_weight_specs = *p4numWeightSpecs; - pointer += sizeof(U32); - *pos += sizeof(U32); - - WeightSpec* ptr = (WeightSpec*)mt_new_storage(spec->num_weight_specs * sizeof(WeightSpec)); - spec->ws = ptr; - for (int i = 0; i < spec->num_weight_specs; i++) { - U32* length = (U32*)pointer; - pointer += sizeof(U32); - *pos += sizeof(U32); - U32 weightBiasBytes = 0; - - str_copy(ptr[i].op_name, pointer, NAME_LEN); - pointer += NAME_LEN; - *pos += NAME_LEN; - - memcpy(&(ptr[i].mdt), pointer, sizeof(DataType)); - pointer += sizeof(U32); - *pos += sizeof(U32); - - bool quantWeight = false; - if (DT_I8 == ptr[i].mdt && DT_I8 != spec->dt) { - ptr[i].mdt = (spec->dt == DT_F16_8Q) ?
DT_F16 : spec->dt; - quantWeight = true; - } - - memcpy(&(ptr[i].bytes_of_weight), pointer, sizeof(U32)); - U32 alignSize = ptr[i].bytes_of_weight; - if (quantWeight) { - ptr[i].bytes_of_weight *= bytesOf(ptr[i].mdt); - } - pointer += sizeof(U32); - *pos += sizeof(U32); - - ptr[i].weight = (U8*)mt_new_storage(ptr[i].bytes_of_weight); - INT8 *serialWeight = (INT8*)pointer; - - pointer += alignSize; - *pos += alignSize; - weightBiasBytes += alignSize; - - memcpy(&(ptr[i].bytes_of_vec), pointer, sizeof(U32)); - pointer += sizeof(U32); - *pos += sizeof(U32); - - U8* ppp4 = (U8*)mt_new_storage(ptr[i].bytes_of_vec); - memcpy(ppp4, pointer, ptr[i].bytes_of_vec); - ptr[i].vec = ppp4; - - pointer += ptr[i].bytes_of_vec; - *pos += ptr[i].bytes_of_vec; - weightBiasBytes += ptr[i].bytes_of_vec; - - memcpy(&(ptr[i].num_quant_scale), pointer, sizeof(U32)); - pointer += sizeof(U32); - *pos += sizeof(U32); - - if (0 != ptr[i].num_quant_scale) { - ptr[i].weight_scale = (QuantSpec*)mt_new_storage(ptr[i].num_quant_scale * sizeof(QuantSpec)); - } - for (U32 j = 0; j < ptr[i].num_quant_scale; j++) { - ptr[i].weight_scale[j].num_scale = *((int *)pointer); - int num = ptr[i].weight_scale[j].num_scale; - pointer += sizeof(int); - *pos += sizeof(int); - - ptr[i].weight_scale[j].scale = (F32*)mt_new_storage(num * sizeof(F32)); - memcpy(ptr[i].weight_scale[j].scale, pointer, num * sizeof(F32)); - pointer += num * sizeof(F32); - *pos += num * sizeof(F32); - } - - CHECK_REQUIREMENT(*length == weightBiasBytes); - - if (quantWeight) { - CHECK_REQUIREMENT(1 == ptr[i].num_quant_scale && 1 == ptr[i].weight_scale[0].num_scale); - F32 scale = ptr[i].weight_scale[0].scale[0]; - if (DT_F32 == ptr[i].mdt) { - dequantize_int8_weight(alignSize, scale, serialWeight, (F32*)ptr[i].weight); - } else { -#ifdef __aarch64__ - dequantize_int8_weight(alignSize, scale, serialWeight, (F16*)ptr[i].weight); -#endif - } - } else { - memcpy(ptr[i].weight, serialWeight, ptr[i].bytes_of_weight); - } - } - return SUCCESS; -} - -EE deserialize_model(const char* bytes, ModelSpec* spec) -{ - U32 pos = 0; - CHECK_STATUS(deserialize_header(bytes, spec, &pos)); - CHECK_STATUS(deserialize_operator(bytes, spec, &pos)); - CHECK_STATUS(deserialize_weight(bytes, spec, &pos)); - CHECK_STATUS(operator_relationship(spec)); - return SUCCESS; -} - -int read_from_file(const char* fn, char** bytes) -{ - int fd = open(fn, O_RDONLY); - CHECK_REQUIREMENT(-1 != fd); - - struct stat ss; - CHECK_REQUIREMENT(fstat(fd, &ss) != -1); - - int fileLength = ss.st_size; - *bytes = (char*)mmap(nullptr, fileLength, PROT_READ, - MAP_SHARED, fd, 0); - CHECK_REQUIREMENT(MAP_FAILED != *bytes); - close(fd); - return fileLength; -} - -EE deserialize_model_from_file(const char* fn, ModelSpec* spec) -{ - char *bytes = nullptr; - int fileLength = read_from_file(fn, &bytes); - CHECK_STATUS(deserialize_model(bytes, spec)); - munmap(bytes, fileLength); - return SUCCESS; -} diff --git a/model-tools/src/model_print.cpp b/model-tools/src/model_print.cpp deleted file mode 100644 index 65890e8f..00000000 --- a/model-tools/src/model_print.cpp +++ /dev/null @@ -1,178 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include <bitset> -#include "model_tools.h" -#include "model_print.h" -#include "model_optimizer.hpp" -#include <iostream> -#include "OPOptimizers/DeprecatedOPOptimizer.hpp" - -F32 convert_F32(void* ptr, int index, DataType dt){ - F32 value = 0; - switch (dt){ - case DT_F32: { - value = ((F32*)ptr)[index]; - break; - } - case DT_I32: { - value = ((I32*)ptr)[index]; - break; - } - case DT_U32: { - value = ((U32*)ptr)[index]; - break; - } -#ifdef __aarch64__ - case DT_F16: { - value = ((F16*)ptr)[index]; - break; - } - case DT_F16_8Q: { - value = ((F16*)ptr)[index]; - break; - } -#endif - case DT_I8: { - value = ((I8*)ptr)[index]; - break; - } - case DT_BIN01: { - std::bitset<8> Val(((BIN8*)ptr)[index / 8]); - if (Val.test(7 - (index % 8))) { - value = 1.0; - } - break; - } - case DT_BIN11: { - std::bitset<8> Val(((BIN8*)ptr)[index / 8]); - if (Val.test(7 - (index % 8))) { - value = 1.0; - } else { - value = -1.0; - } - break; - } - default: - CHECK_REQUIREMENT(0); - break; - } - return value; -} - - -void print_header(const ModelSpec ms){ - printf("[Model] %s\n", ms.model_name); - printf(" [Input]"); - for(int i = 0; i < ms.num_inputs; i++){ - printf(" %s(", ms.input_names[i]); - std::cout << tensorDesc2Str(ms.input_dims[i]); - printf(")"); - } - printf("\n"); - - printf(" [Output]"); - for(int i = 0; i < ms.num_outputs; i++){ - printf(" %s", ms.output_names[i]); - } - printf("\n"); -} - - -void print_operator_tensor_relationship(const ModelSpec ms, bool deleteDeprecatedOp){ - int number = ms.num_operator_specs; - printf(" [Ops] %d\n", number); - for(int i = 0; i < number; i++){ - if(deleteDeprecatedOp) { - if(DeprecatedOPOptimizer::isDeprecatedOp(ms.ops[i].type)) - continue; - } -#ifdef _DEBUG - if (OT_Conv == ms.ops[i].type) { - printf("Kernel shape is %d x %d\n", ms.ops[i].ps.conv_spec.kernel_size_h, ms.ops[i].ps.conv_spec.kernel_size_w); - } -#endif - printf(" Op %3d %32s %16s|", i, ms.ops[i].name, OperatorTypeName()[ms.ops[i].type]); - for(U32 j = 0; j < ms.ops[i].num_inputs; j++){ - printf(" %s,", ms.ops[i].input_tensors_name[j]); - } - printf(" -> "); - for(U32 j = 0; j < ms.ops[i].num_outputs; j++){ - printf(" %s,", ms.ops[i].output_tensors_name[j]); - } - printf("\n"); - if (nullptr != ms.ops[i].tensor_positions) { - printf(" Tensor Positions: "); - for (U32 j = 0; j < ms.ops[i].num_inputs + ms.ops[i].num_outputs; j++) { - printf("%d ", ms.ops[i].tensor_positions[j]); - } - printf("\n"); - } - if (nullptr != ms.ops[i].feature_scale) { - printf(" Quant Scale: 
"); - for (U32 j = 0; j < ms.ops[i].num_quant_feature; j++) { - printf("%f ", ms.ops[i].feature_scale[j].scale[0]); - } - printf("\n"); - } - } -} - -void print_weights(const ModelSpec ms) -{ - int number = ms.num_weight_specs; - printf(" [Weights] %d\n", number); - for (int i = 0; i < number; i++) { - if (DeprecatedOPOptimizer::isDeprecatedOpWeight(&ms, i)) { - printf(" Weight %3d %32s | Delete mdt %d weight: %p %uB bias: %p %uB\n", i, ms.ws[i].op_name, ms.ws[i].mdt, - ms.ws[i].weight, ms.ws[i].bytes_of_weight, ms.ws[i].vec, ms.ws[i].bytes_of_vec); - continue; - } - - printf(" Weight %3d %32s | Retain mdt %d weight: %p %uB bias: %p %uB example: ", i, ms.ws[i].op_name, ms.ws[i].mdt, - ms.ws[i].weight, ms.ws[i].bytes_of_weight, ms.ws[i].vec, ms.ws[i].bytes_of_vec); - if (ms.ws[i].bytes_of_weight > 0 && ms.ws[i].weight != nullptr) { - printf("%f", convert_F32(ms.ws[i].weight, 0, ms.ws[i].mdt)); - } - if (ms.ws[i].bytes_of_vec > 0 && ms.ws[i].vec != nullptr) { - printf(",%f", convert_F32(ms.ws[i].vec, 0, ms.ws[i].mdt)); - } - printf("\n"); - } -} - - -void print_relationship(const ModelSpec ms){ - int number = ms.num_op_tensor_entries; - printf(" [Relationships] %d\n", number); - for(int i = 0; i < number; i++){ - printf(" Relation %3d %32s |", i, ms.op_relationship_entries[i].op); - for(U32 j = 0; j < ms.op_relationship_entries[i].num_inputs; j++){ - printf(" %s,", ms.op_relationship_entries[i].input_op_names[j]); - } - printf(" -> "); - for(U32 j = 0; j < ms.op_relationship_entries[i].num_outputs; j++){ - printf(" %s,", ms.op_relationship_entries[i].output_op_names[j]); - } - printf("\n"); - } -} - - -void print_ms(const ModelSpec ms){ - print_header(ms); - print_operator_tensor_relationship(ms); - print_weights(ms); - print_relationship(ms); -} diff --git a/model-tools/src/model_serialize.cpp b/model-tools/src/model_serialize.cpp deleted file mode 100644 index 37c4311a..00000000 --- a/model-tools/src/model_serialize.cpp +++ /dev/null @@ -1,322 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include <cstring> -#include <fstream> -#include <string> -#include "model_serialize_deserialize.hpp" -#include "model_tools.h" -#include "model_optimizer.hpp" -#include "OPOptimizers/DeprecatedOPOptimizer.hpp" - -EE serialize_header(const ModelSpec* spec, std::string* tmp) { - U32 bufSize = sizeof(I32) * 2 \ + sizeof(I8) * NAME_LEN + sizeof(DataType) + sizeof(I32) \ + sizeof(I8) * NAME_LEN * spec->num_inputs + sizeof(TensorDesc) * spec->num_inputs \ + sizeof(I32) + sizeof(I8) * NAME_LEN * spec->num_outputs; - I8* data = (I8*)mt_new_storage(bufSize); - - I32* pointer4version = (I32*)data; - memcpy(pointer4version, &spec->version, sizeof(I32)); - pointer4version += 1; // advancing an I32 pointer by 1 moves 4 bytes - - I32* pointer4magicNumber = (I32*)pointer4version; - memcpy(pointer4magicNumber, &spec->magic_number, sizeof(I32)); - pointer4magicNumber += 1; - - I8* pointer4modelName = (I8*)pointer4magicNumber; - str_copy(pointer4modelName, spec->model_name, NAME_LEN); - pointer4modelName += NAME_LEN; - - DataType* pointer4dt = (DataType*)pointer4modelName; - *pointer4dt = spec->dt; - pointer4dt++; - - I32* pointer4numInputs = (I32*)pointer4dt; - *pointer4numInputs = spec->num_inputs; - pointer4numInputs++; - - I8* pointer4InputNames = (I8*)pointer4numInputs; - for (int i = 0; i < spec->num_inputs; i++) { - str_copy(pointer4InputNames, spec->input_names[i], NAME_LEN); - pointer4InputNames += NAME_LEN; - } - - TensorDesc* pointer4TensorDesc = (TensorDesc*)pointer4InputNames; - memcpy(pointer4TensorDesc, spec->input_dims, sizeof(TensorDesc) * spec->num_inputs); - pointer4TensorDesc += spec->num_inputs; - - I32* pointer4numOutputs = (I32 *)pointer4TensorDesc; - *pointer4numOutputs = spec->num_outputs; - pointer4numOutputs++; - - I8* pointer4outputNames = (I8 *)pointer4numOutputs; - for (int i = 0; i < spec->num_outputs; i++) { - str_copy(pointer4outputNames, spec->output_names[i], NAME_LEN); - pointer4outputNames += NAME_LEN; - } - - tmp->clear(); - CHECK_REQUIREMENT(pointer4outputNames - data == bufSize); - tmp->assign(data, data + bufSize); - delete [] data; - return SUCCESS; -} - - -U32 operator_memory_size(OperatorSpec* ops) -{ - // sizeof(U32) * 4 : type + num_inputs + num_output + num_quant_feature - U32 allocatedBufferSize = sizeof(I8) * NAME_LEN + sizeof(U32) * 4 - + ops->num_inputs * NAME_LEN * sizeof(I8) - + ops->num_outputs * NAME_LEN * sizeof(I8) - + (ops->num_inputs + ops->num_outputs) * sizeof(I32) - + sizeof(ParameterSpec); - - for (U32 i = 0; i < ops->num_quant_feature; i++) { - allocatedBufferSize += sizeof(int); // num_scale - allocatedBufferSize += ops->feature_scale[i].num_scale * sizeof(F32); - } - switch (ops->type) { - case OT_Eltwise: { - if (ops->ps.eltwise_spec.elt_mode == ELTWISE_SUM) - allocatedBufferSize += ops->ps.eltwise_spec.elt_sum_spec.coeff_size * sizeof(float); - break; - } - default: - break; - } - return allocatedBufferSize; -} - - -EE serialize_operators(const ModelSpec* spec, std::string* tmp) { - OperatorSpec* opsTmp = spec->ops; - int removeOpNum = 0; - U32 bufSize = sizeof(I32); - for (int i = 0; i < spec->num_operator_specs; i++) { - if (DeprecatedOPOptimizer::isDeprecatedOp(opsTmp->type)) { - removeOpNum++; - } - else { - bufSize += operator_memory_size(opsTmp); - } - opsTmp++; - } - - char* data = (char*)mt_new_storage(bufSize); - - I32* pointer4numOperatorSpecs = (I32 *)data; - *pointer4numOperatorSpecs = spec->num_operator_specs - removeOpNum; // deprecated ops are excluded from the count - pointer4numOperatorSpecs++; - - OperatorSpec* opsPointer = spec->ops; - I8*
pointer4opsName = (I8*)pointer4numOperatorSpecs; - - for (int i = 0; i < spec->num_operator_specs; i++) { - if (DeprecatedOPOptimizer::isDeprecatedOp(opsPointer[i].type)) { - continue; - } - - str_copy(pointer4opsName, opsPointer[i].name, NAME_LEN); // to copy the name of op - pointer4opsName += NAME_LEN; - - U32* pointer4opsType = (U32 *)pointer4opsName; - *pointer4opsType = opsPointer[i].type; - pointer4opsType++; - - U32* pointer4opsNumInputs = pointer4opsType; - *pointer4opsNumInputs = opsPointer[i].num_inputs; - pointer4opsNumInputs++; - - I8* pointer4opsInputTensorsName = (I8 *)pointer4opsNumInputs; - for (U32 j = 0; j < opsPointer[i].num_inputs; j++) { - str_copy(pointer4opsInputTensorsName, opsPointer[i].input_tensors_name[j], NAME_LEN); - pointer4opsInputTensorsName += NAME_LEN; - } - - U32* pointer4opsNumOutputs = (U32 *)pointer4opsInputTensorsName; - *pointer4opsNumOutputs = opsPointer[i].num_outputs; - pointer4opsNumOutputs++; - - I8* pointer4opsOutputTensorsName = (I8 *)pointer4opsNumOutputs; - for (U32 j = 0; j < opsPointer[i].num_outputs; j++) { - str_copy(pointer4opsOutputTensorsName, opsPointer[i].output_tensors_name[j], NAME_LEN); - pointer4opsOutputTensorsName += NAME_LEN; - } - - I32* pointer4tensorPos = (I32*)pointer4opsOutputTensorsName; - U32 numTensors = opsPointer[i].num_inputs + opsPointer[i].num_outputs; - if (nullptr != opsPointer[i].tensor_positions) { - memcpy(pointer4tensorPos, opsPointer[i].tensor_positions, numTensors*sizeof(I32)); - } else { - memset(pointer4tensorPos, 0, numTensors*sizeof(I32)); - } - pointer4tensorPos += numTensors; - - U32* pointer4numint8 = (U32*)pointer4tensorPos; - *pointer4numint8 = opsPointer[i].num_quant_feature; - pointer4numint8++; - - int* pointer4quant = (int*)pointer4numint8; - for (U32 j = 0; j < opsPointer[i].num_quant_feature; j++) { - *pointer4quant = opsPointer[i].feature_scale[j].num_scale; - int num = *pointer4quant; - pointer4quant++; - memcpy(pointer4quant, opsPointer[i].feature_scale[j].scale, num * sizeof(F32)); - pointer4quant += num; - } - - char* pointer4parameterSpecs = (char *)pointer4quant; - memcpy(pointer4parameterSpecs, &(opsPointer[i].ps), sizeof(ParameterSpec)); - if (opsPointer[i].type == OT_Eltwise) { - memset(&(pointer4parameterSpecs[(char*)(&(opsPointer[i].ps.eltwise_spec.elt_sum_spec.coeff_values)) - (char*)(&(opsPointer[i].ps))]), 0, sizeof(float*)); - } - pointer4parameterSpecs += sizeof(ParameterSpec); - switch (opsPointer[i].type) { - case OT_Eltwise: { - if (opsPointer[i].ps.eltwise_spec.elt_mode == ELTWISE_SUM) { - U32 bytes = opsPointer[i].ps.eltwise_spec.elt_sum_spec.coeff_size * sizeof(float); - memcpy(pointer4parameterSpecs, opsPointer[i].ps.eltwise_spec.elt_sum_spec.coeff_values, bytes); - pointer4parameterSpecs += bytes; - } - break; - } - default: - break; - } - pointer4opsName = (I8 *)pointer4parameterSpecs; - } - - tmp->clear(); - CHECK_REQUIREMENT(pointer4opsName - data == bufSize); - tmp->assign(data, data + bufSize); - delete [] data; - return SUCCESS; -} - -EE serialize_weights(const ModelSpec* spec, std::string* tmp) -{ - WeightSpec* tmpPointer = spec->ws; - U32 bufSize = sizeof(I32); - U32 weightCount = 0; - for (int i = 0; i < spec->num_weight_specs; i++) { - if (DeprecatedOPOptimizer::isDeprecatedOpWeight(spec, i)) { - continue; - } - - // U32 x 5: length, mdt, bytes_of_weight, bytes_of_vec, num_quant_scale - bufSize += sizeof(I8) * NAME_LEN + sizeof(U32) * 5 + tmpPointer[i].bytes_of_weight + tmpPointer[i].bytes_of_vec; - for (U32 j = 0; j < tmpPointer[i].num_quant_scale; j++) 
{ - bufSize += sizeof(int); // num_scale - bufSize += tmpPointer[i].weight_scale[j].num_scale * sizeof(F32); - } - - weightCount++; - } - char* data = (char*)mt_new_storage(bufSize); - - I32* pointer4numWeightSpecs = (I32*)data; - *pointer4numWeightSpecs = weightCount; - pointer4numWeightSpecs++; - - WeightSpec* wsPointer = spec -> ws; - char* pointer4wsOpName = (char*)pointer4numWeightSpecs; - for (int i = 0; i < spec->num_weight_specs; i++) { - if (DeprecatedOPOptimizer::isDeprecatedOpWeight(spec, i)) { - continue; - } - - U32* length = (U32*)pointer4wsOpName; - U32 len; - len = wsPointer[i].bytes_of_weight + wsPointer[i].bytes_of_vec; - *length = len; - pointer4wsOpName += sizeof(U32); - - str_copy(pointer4wsOpName, wsPointer[i].op_name, NAME_LEN); - pointer4wsOpName += NAME_LEN; - - U32* pointer4wsMdt = (U32*)pointer4wsOpName; - *pointer4wsMdt = wsPointer[i].mdt; - pointer4wsMdt++; - - U32* pointer4wsBytesOfWeight = (U32*)pointer4wsMdt; - *pointer4wsBytesOfWeight = wsPointer[i].bytes_of_weight; - pointer4wsBytesOfWeight++; - - U8* pointer4wsWeight = (U8*)pointer4wsBytesOfWeight; - memcpy(pointer4wsWeight, wsPointer[i].weight, wsPointer[i].bytes_of_weight); - pointer4wsWeight += wsPointer[i].bytes_of_weight; - - U32* pointer4wsBytesOfVec = (U32*)pointer4wsWeight; - *pointer4wsBytesOfVec = wsPointer[i].bytes_of_vec; - pointer4wsBytesOfVec++; - - U8* pointer4wsVec = (U8*)pointer4wsBytesOfVec; - memcpy(pointer4wsVec, wsPointer[i].vec, wsPointer[i].bytes_of_vec); - pointer4wsVec += wsPointer[i].bytes_of_vec; - - U32* pointer4numquant = (U32*)pointer4wsVec; - *pointer4numquant = wsPointer[i].num_quant_scale; - pointer4numquant++; - - int* pointer4quant = (int*)pointer4numquant; - for (U32 j = 0; j < wsPointer[i].num_quant_scale; j++) { - *pointer4quant = wsPointer[i].weight_scale[j].num_scale; - int num = *pointer4quant; - pointer4quant++; - memcpy(pointer4quant, wsPointer[i].weight_scale[j].scale, num * sizeof(F32)); - pointer4quant += num; - } - - pointer4wsOpName = (char*)pointer4quant; - } - - tmp->clear(); - CHECK_REQUIREMENT(pointer4wsOpName - data == bufSize); - tmp->assign(data, data + bufSize); - delete [] data; - return SUCCESS; -} - -EE serialize_model(const ModelSpec* spec, std::string* bytes) { - bytes->clear(); - std::string tmp; - - CHECK_STATUS(serialize_header(spec, &tmp)); - *bytes += tmp; - - CHECK_STATUS(serialize_operators(spec, &tmp)); - *bytes += tmp; - - CHECK_STATUS(serialize_weights(spec, &tmp)); - *bytes += tmp; - return SUCCESS; -} - -EE write_to_file(std::string* bytes, const char* fn) { - std::ofstream out(fn); - out << *bytes; - out.close(); - return SUCCESS; -} - -EE serialize_model_to_file(const ModelSpec* spec, const char* fn) { - std::string bytes = ""; - CHECK_STATUS(serialize_model(spec, &bytes)); - CHECK_STATUS(write_to_file(&bytes, fn)); - return SUCCESS; -} diff --git a/model-tools/src/model_tools.cpp b/model-tools/src/model_tools.cpp deleted file mode 100644 index 5924be18..00000000 --- a/model-tools/src/model_tools.cpp +++ /dev/null @@ -1,209 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include - -#include "model_serialize_deserialize.hpp" -#include "model_optimizer.hpp" - - -extern "C" EE mt_create_model(ModelSpec* ms) { - if (nullptr == ms) return NULL_POINTER; - - ms->version = mt_version(); - ms->magic_number = mt_magic_number(); - ms->input_names = nullptr; - ms->num_inputs = 0; - ms->input_dims = nullptr; - ms->num_outputs = 0; - ms->output_names = nullptr; - ms->num_operator_specs = 0; - ms->ops = nullptr; - ms->num_weight_specs = 0; - ms->ws = nullptr; - ms->num_op_tensor_entries = 0; - ms->op_relationship_entries = nullptr; - - return SUCCESS; -} - - -extern "C" EE mt_destroy_model(ModelSpec* ms) -{ - if (nullptr == ms) { - return NULL_POINTER; - } - - if (nullptr != ms->input_names) { - for (int i = 0; i < ms->num_inputs; i++) { - if (nullptr != ms->input_names[i]) { - delete [] ms->input_names[i]; - } - ms->input_names[i] = nullptr; - } - delete [] ms->input_names; - ms->input_names = nullptr; - } - - if (nullptr != ms->input_dims) { - delete [] ms->input_dims; - ms->input_dims = nullptr; - } - - if (nullptr != ms->output_names) { - for (int i = 0; i < ms->num_outputs; i++) { - if (nullptr != ms->output_names[i]) { - delete [] ms->output_names[i]; - } - ms->output_names[i] = nullptr; - } - delete [] ms->output_names; - ms->output_names = nullptr; - } - - if (nullptr != ms->ops) { - int op_num = ms->num_operator_specs; - for (int i = 0; i < op_num; i++) { - if (nullptr != ms->ops[i].input_tensors_name) { - for (U32 j = 0; j < ms->ops[i].num_inputs; j++) { - if (nullptr != ms->ops[i].input_tensors_name[j]) { - delete [] ms->ops[i].input_tensors_name[j]; - } - ms->ops[i].input_tensors_name[j] = nullptr; - } - delete [] ms->ops[i].input_tensors_name; - ms->ops[i].input_tensors_name = nullptr; - } - if (nullptr != ms->ops[i].output_tensors_name) { - for (U32 j = 0; j < ms->ops[i].num_outputs; j++) { - if (nullptr != ms->ops[i].output_tensors_name[j]) { - delete [] ms->ops[i].output_tensors_name[j]; - } - ms->ops[i].output_tensors_name[j] = nullptr; - } - delete [] ms->ops[i].output_tensors_name; - ms->ops[i].output_tensors_name = nullptr; - } - - if (nullptr != ms->ops[i].tensor_positions) { - delete [] ms->ops[i].tensor_positions; - } - - if (0 != ms->ops[i].num_quant_feature && nullptr != ms->ops[i].feature_scale) { - for (U32 j = 0; j < ms->ops[i].num_quant_feature; j++) { - if (0 != ms->ops[i].feature_scale[j].num_scale) { - if (nullptr != ms->ops[i].feature_scale[j].scale) { - delete [] 
ms->ops[i].feature_scale[j].scale; - } - } - } - delete [] ms->ops[i].feature_scale; - } - - // process op memory - switch (ms->ops[i].type) { - case OT_Eltwise: { - if (ms->ops[i].ps.eltwise_spec.elt_mode == ELTWISE_SUM && - nullptr != ms->ops[i].ps.eltwise_spec.elt_sum_spec.coeff_values) { - delete [] ms->ops[i].ps.eltwise_spec.elt_sum_spec.coeff_values; - } - ms->ops[i].ps.eltwise_spec.elt_sum_spec.coeff_values = nullptr; - break; - } - default: - break; - } - } - delete [] ms->ops; - ms->ops = nullptr; - } - - if (nullptr != ms->ws) { - int weightOpNum = ms->num_weight_specs; - for (int i = 0; i < weightOpNum; i++) { - if (nullptr != ms->ws[i].weight) { - delete [] ms->ws[i].weight; - } - ms->ws[i].weight = nullptr; - if (nullptr != ms->ws[i].vec) { - delete [] ms->ws[i].vec; - } - ms->ws[i].vec = nullptr; - } - delete [] ms->ws; - ms->ws = nullptr; - } - - if (nullptr != ms->op_relationship_entries) { - int numOpRelationPair = ms->num_op_tensor_entries; - for (int i = 0; i < numOpRelationPair; i++) { - if (nullptr != ms->op_relationship_entries[i].input_op_names) { - for (U32 j = 0; j < ms->op_relationship_entries[i].num_inputs; j++) { - if (nullptr != ms->op_relationship_entries[i].input_op_names[j]) { - delete [] ms->op_relationship_entries[i].input_op_names[j]; - } - ms->op_relationship_entries[i].input_op_names[j] = nullptr; - } - delete [] ms->op_relationship_entries[i].input_op_names; - ms->op_relationship_entries[i].input_op_names = nullptr; - } - if (nullptr != ms->op_relationship_entries[i].output_op_names) { - for (U32 j = 0; j < ms->op_relationship_entries[i].num_outputs; j++) { - if (nullptr != ms->op_relationship_entries[i].output_op_names[j]) - delete [] ms->op_relationship_entries[i].output_op_names[j]; - ms->op_relationship_entries[i].output_op_names[j] = nullptr; - } - delete [] ms->op_relationship_entries[i].output_op_names; - ms->op_relationship_entries[i].output_op_names = nullptr; - } - } - delete [] ms->op_relationship_entries; - ms->op_relationship_entries = nullptr; - } - - return SUCCESS; -} - -std::string concat_dir_file(std::string dir, std::string file) { - std::string ret; - if (!dir.empty()) { - int len = dir.size(); - char& last = dir.at(len - 1); - if ('/' != last) { - ret = dir +'/'; - }else{ - ret = dir; - } - ret += file; - } else { - ret = file; - } - - return ret; -} - -EE mt_load(CI8* dir, CI8* mfn, ModelSpec* md) { - std::string completePath = concat_dir_file(dir, mfn); - deserialize_model_from_file(completePath.c_str(), md); - return SUCCESS; -} - -#if defined(_USE_CAFFE) || defined(_USE_ONNX) || defined(_USE_TFLITE) -EE mt_store(CI8* dir, CI8* mfn, const ModelSpec* md) { - std::string completePath = concat_dir_file(dir, mfn); - serialize_model_to_file(md, completePath.c_str()); - return SUCCESS; -} -#endif diff --git a/model-tools/src/onnx/CMakeLists.txt b/model-tools/src/onnx/CMakeLists.txt deleted file mode 100644 index 875932a8..00000000 --- a/model-tools/src/onnx/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) - -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) - -protobuf_generate_cpp(ONNX_PROTO_SRCS ONNX_PROTO_HDRS onnx.proto) - -include_directories(${PROTOBUF_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) -include_directories(../) - -# shared library -ADD_LIBRARY(${PROJECT_NAME}_onnx SHARED ${srcs} ${ONNX_PROTO_HDRS} ${ONNX_PROTO_SRCS}) - -# static library -ADD_LIBRARY(${PROJECT_NAME}_onnx_static STATIC ${srcs} ${ONNX_PROTO_HDRS} ${ONNX_PROTO_SRCS}) - 
-SET_TARGET_PROPERTIES(${PROJECT_NAME}_onnx_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}_onnx") -SET_TARGET_PROPERTIES(${PROJECT_NAME}_onnx PROPERTIES CLEAN_DIRECT_OUTPUT 1) -SET_TARGET_PROPERTIES(${PROJECT_NAME}_onnx_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) diff --git a/model-tools/src/onnx/onnx.proto b/model-tools/src/onnx/onnx.proto deleted file mode 100644 index c90fb042..00000000 --- a/model-tools/src/onnx/onnx.proto +++ /dev/null @@ -1,505 +0,0 @@ -// -// WARNING: This file is automatically generated! Please edit onnx.in.proto. -// - - -// Copyright (c) ONNX Project Contributors. -// Licensed under the MIT license. - -syntax = "proto2"; - -package onnx; - -// Overview -// -// ONNX is an open specification that is comprised of the following components: -// -// 1) A definition of an extensible computation graph model. -// 2) Definitions of standard data types. -// 3) Definitions of built-in operators. -// -// This document describes the syntax of models and their computation graphs, -// as well as the standard data types. Together, they are referred to as the ONNX -// Intermediate Representation, or 'IR' for short. -// -// The normative semantic specification of the ONNX IR is found in docs/IR.md. -// Definitions of the built-in neural network operators may be found in docs/Operators.md. - -// Notes -// -// Release -// -// We are still in the very early stage of defining ONNX. The current -// version of ONNX is a starting point. While we are actively working -// towards a complete spec, we would like to get the community involved -// by sharing our working version of ONNX. -// -// Protobuf compatibility -// -// To simplify framework compatibility, ONNX is defined using the subset of protobuf -// that is compatible with both protobuf v2 and v3. This means that we do not use any -// protobuf features that are only available in one of the two versions. -// -// Here are the most notable contortions we have to carry out to work around -// these limitations: -// -// - No 'map' (added protobuf 3.0). We instead represent mappings as lists -// of key-value pairs, where order does not matter and duplicates -// are not allowed. - - -// Versioning -// -// ONNX versioning is specified in docs/IR.md and elaborated on in docs/Versioning.md -// -// To be compatible with both proto2 and proto3, we will use a version number -// that is not defined by the default value but an explicit enum number. -enum Version { - // proto3 requires the first enum value to be zero. - // We add this just to appease the compiler. - _START_VERSION = 0; - // The version field is always serialized and we will use it to store the - // version that the graph is generated from. This helps us set up version - // control. - // For the IR, we are using simple numbers starting with 0x00000001, - // which was the version we published on Oct 10, 2017.
- IR_VERSION_2017_10_10 = 0x0000000000000001;
-
- // IR_VERSION 2 published on Oct 30, 2017
- // - Added type discriminator to AttributeProto to support proto3 users
- IR_VERSION_2017_10_30 = 0x0000000000000002;
-
- // IR VERSION 3 published on Nov 3, 2017
- // - For operator versioning:
- //    - Added new message OperatorSetIdProto
- //    - Added opset_import in ModelProto
- // - For vendor extensions, added domain in NodeProto
- IR_VERSION_2017_11_3 = 0x0000000000000003;
-
- // IR VERSION 4 published on Jan 22, 2019
- // - Relax constraint that initializers should be a subset of graph inputs
- // - Add type BFLOAT16
- IR_VERSION_2019_1_22 = 0x0000000000000004;
-
- // IR VERSION 5 published on March 18, 2019
- // - Add message TensorAnnotation.
- // - Add quantization annotation in GraphProto to map tensor with its scale and zero point quantization parameters.
- IR_VERSION = 0x0000000000000005;
-}
-
-// Attributes
-//
-// A named attribute containing either singular float, integer, string, graph,
-// and tensor values, or repeated float, integer, string, graph, and tensor values.
-// An AttributeProto MUST contain the name field, and *only one* of the
-// following content fields, effectively enforcing a C/C++ union equivalent.
-message AttributeProto {
-
-  // Note: this enum is structurally identical to the OpSchema::AttrType
-  // enum defined in schema.h. If you rev one, you likely need to rev the other.
-  enum AttributeType {
-    UNDEFINED = 0;
-    FLOAT = 1;
-    INT = 2;
-    STRING = 3;
-    TENSOR = 4;
-    GRAPH = 5;
-
-    FLOATS = 6;
-    INTS = 7;
-    STRINGS = 8;
-    TENSORS = 9;
-    GRAPHS = 10;
-  }
-
-  // The name field MUST be present for this version of the IR.
-  optional string name = 1; // namespace Attribute
-
-  // if ref_attr_name is not empty, ref_attr_name is the attribute name in parent function.
-  // In this case, this AttributeProto does not contain data, and it's a reference of attribute
-  // in parent scope.
-  // NOTE: This should ONLY be used in function (sub-graph). It's invalid to be used in main graph.
-  optional string ref_attr_name = 21;
-
-  // A human-readable documentation for this attribute. Markdown is allowed.
-  optional string doc_string = 13;
-
-  // The type field MUST be present for this version of the IR.
-  // For 0.0.1 versions of the IR, this field was not defined, and
-  // implementations needed to use has_field heuristics to determine
-  // which value field was in use. For IR_VERSION 0.0.2 or later, this
-  // field MUST be set and match the f|i|s|t|... field in use. This
-  // change was made to accommodate proto3 implementations.
-  optional AttributeType type = 20; // discriminator that indicates which field below is in use
-
-  // Exactly ONE of the following fields must be present for this version of the IR
-  optional float f = 2; // float
-  optional int64 i = 3; // int
-  optional bytes s = 4; // UTF-8 string
-  optional TensorProto t = 5; // tensor value
-  optional GraphProto g = 6; // graph
-  // Do not use field below, it's deprecated.
-  // optional ValueProto v = 12; // value - subsumes everything but graph
-
-  repeated float floats = 7; // list of floats
-  repeated int64 ints = 8; // list of ints
-  repeated bytes strings = 9; // list of UTF-8 strings
-  repeated TensorProto tensors = 10; // list of tensors
-  repeated GraphProto graphs = 11; // list of graphs
-}
-
-// Defines information on value, including the name, the type, and
-// the shape of the value.
-message ValueInfoProto {
-  // This field MUST be present in this version of the IR.
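Since AttributeProto is effectively a tagged union, readers are expected to switch on the `type` discriminator before touching a value field; a minimal sketch, assuming the generated C++ bindings (`attr_as_float` is a hypothetical helper):

```cpp
#include "onnx.pb.h"

// Return a scalar attribute as float, falling back when the union holds
// something other than a numeric scalar.
float attr_as_float(const onnx::AttributeProto& a, float fallback) {
    switch (a.type()) {
        case onnx::AttributeProto::FLOAT: return a.f();
        case onnx::AttributeProto::INT:   return (float)a.i();
        default:                          return fallback;
    }
}
```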
- optional string name = 1; // namespace Value - // This field MUST be present in this version of the IR. - optional TypeProto type = 2; - // A human-readable documentation for this value. Markdown is allowed. - optional string doc_string = 3; -} - -// Nodes -// -// Computation graphs are made up of a DAG of nodes, which represent what is -// commonly called a "layer" or "pipeline stage" in machine learning frameworks. -// -// For example, it can be a node of type "Conv" that takes in an image, a filter -// tensor and a bias tensor, and produces the convolved output. -message NodeProto { - repeated string input = 1; // namespace Value - repeated string output = 2; // namespace Value - - // An optional identifier for this node in a graph. - // This field MAY be absent in ths version of the IR. - optional string name = 3; // namespace Node - - // The symbolic identifier of the Operator to execute. - optional string op_type = 4; // namespace Operator - // The domain of the OperatorSet that specifies the operator named by op_type. - optional string domain = 7; // namespace Domain - - // Additional named attributes. - repeated AttributeProto attribute = 5; - - // A human-readable documentation for this node. Markdown is allowed. - optional string doc_string = 6; -} - -// Models -// -// ModelProto is a top-level file/container format for bundling a ML model and -// associating its computation graph with metadata. -// -// The semantics of the model are described by the associated GraphProto. -message ModelProto { - // The version of the IR this model targets. See Version enum above. - // This field MUST be present. - optional int64 ir_version = 1; - - // The OperatorSets this model relies on. - // All ModelProtos MUST have at least one entry that - // specifies which version of the ONNX OperatorSet is - // being imported. - // - // All nodes in the ModelProto's graph will bind against the operator - // with the same-domain/same-op_type operator with the HIGHEST version - // in the referenced operator sets. - repeated OperatorSetIdProto opset_import = 8; - - // The name of the framework or tool used to generate this model. - // This field SHOULD be present to indicate which implementation/tool/framework - // emitted the model. - optional string producer_name = 2; - - // The version of the framework or tool used to generate this model. - // This field SHOULD be present to indicate which implementation/tool/framework - // emitted the model. - optional string producer_version = 3; - - // Domain name of the model. - // We use reverse domain names as name space indicators. For example: - // `com.facebook.fair` or `com.microsoft.cognitiveservices` - // - // Together with `model_version` and GraphProto.name, this forms the unique identity of - // the graph. - optional string domain = 4; - - // The version of the graph encoded. See Version enum below. - optional int64 model_version = 5; - - // A human-readable documentation for this model. Markdown is allowed. - optional string doc_string = 6; - - // The parameterized graph that is evaluated to execute the model. - optional GraphProto graph = 7; - - // Named metadata values; keys should be distinct. - repeated StringStringEntryProto metadata_props = 14; -}; - -// StringStringEntryProto follows the pattern for cross-proto-version maps. 
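Because proto2/proto3 compatibility rules out `map`, consumers usually flatten these repeated entries back into a real map; a sketch against the generated bindings (`metadata_to_map` is a hypothetical helper):

```cpp
#include <map>
#include <string>
#include "onnx.pb.h"

std::map<std::string, std::string> metadata_to_map(const onnx::ModelProto& m) {
    std::map<std::string, std::string> out;
    for (const auto& kv : m.metadata_props()) {
        out[kv.key()] = kv.value();  // spec: keys are distinct, order irrelevant
    }
    return out;
}
```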
-// See https://developers.google.com/protocol-buffers/docs/proto3#maps -message StringStringEntryProto { - optional string key = 1; - optional string value= 2; -}; - -message TensorAnnotation { - optional string tensor_name = 1; - // pairs to annotate tensor specified by above. - // The keys used in the mapping below must be pre-defined in ONNX spec. - // For example, for 8-bit linear quantization case, 'SCALE_TENSOR', 'ZERO_POINT_TENSOR' will be pre-defined as - // quantization parameter keys. - repeated StringStringEntryProto quant_parameter_tensor_names = 2; -} - - - -// Graphs -// -// A graph defines the computational logic of a model and is comprised of a parameterized -// list of nodes that form a directed acyclic graph based on their inputs and outputs. -// This is the equivalent of the "network" or "graph" in many deep learning -// frameworks. -message GraphProto { - // The nodes in the graph, sorted topologically. - repeated NodeProto node = 1; - - // The name of the graph. - optional string name = 2; // namespace Graph - - // A list of named tensor values, used to specify constant inputs of the graph. - // Each TensorProto entry must have a distinct name (within the list) that - // MAY also appear in the input list. - repeated TensorProto initializer = 5; - - // A human-readable documentation for this graph. Markdown is allowed. - optional string doc_string = 10; - - // The inputs and outputs of the graph. - repeated ValueInfoProto input = 11; - repeated ValueInfoProto output = 12; - - // Information for the values in the graph. The ValueInfoProto.name's - // must be distinct. It is optional for a value to appear in value_info list. - repeated ValueInfoProto value_info = 13; - - // This field carries information to indicate the mapping among a tensor and its - // quantization parameter tensors. For example: - // For tensor 'a', it may have {'SCALE_TENSOR', 'a_scale'} and {'ZERO_POINT_TENSOR', 'a_zero_point'} annotated, - // which means, tensor 'a_scale' and tensor 'a_zero_point' are scale and zero point of tensor 'a' in the model. - repeated TensorAnnotation quantization_annotation = 14; - - // DO NOT USE the following fields, they were deprecated from earlier versions. - // repeated string input = 3; - // repeated string output = 4; - // optional int64 ir_version = 6; - // optional int64 producer_version = 7; - // optional string producer_tag = 8; - // optional string domain = 9; -} - -// Tensors -// -// A serialized tensor value. -message TensorProto { - enum DataType { - UNDEFINED = 0; - // Basic types. - FLOAT = 1; // float - UINT8 = 2; // uint8_t - INT8 = 3; // int8_t - UINT16 = 4; // uint16_t - INT16 = 5; // int16_t - INT32 = 6; // int32_t - INT64 = 7; // int64_t - STRING = 8; // string - BOOL = 9; // bool - - // IEEE754 half-precision floating-point format (16 bits wide). - // This format has 1 sign bit, 5 exponent bits, and 10 mantissa bits. - FLOAT16 = 10; - - DOUBLE = 11; - UINT32 = 12; - UINT64 = 13; - COMPLEX64 = 14; // complex with float32 real and imaginary components - COMPLEX128 = 15; // complex with float64 real and imaginary components - - // Non-IEEE floating-point format based on IEEE754 single-precision - // floating-point number truncated to 16 bits. - // This format has 1 sign bit, 8 exponent bits, and 7 mantissa bits. - BFLOAT16 = 16; - - // Future extensions go here. - } - - // The shape of the tensor. - repeated int64 dims = 1; - - // The data type of the tensor. 
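A converter's first pass over a GraphProto is typically to index the initializers by name, so weight inputs can be told apart from activations; this mirrors what the deleted OnnxAdaptee below does in its parse_file (sketch against the generated bindings):

```cpp
#include <map>
#include <string>
#include "onnx.pb.h"

std::map<std::string, onnx::TensorProto> index_initializers(const onnx::GraphProto& g) {
    std::map<std::string, onnx::TensorProto> weights;
    for (int i = 0; i < g.initializer_size(); i++) {
        weights[g.initializer(i).name()] = g.initializer(i);
    }
    return weights;
}
```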
- // This field MUST have a valid TensorProto.DataType value
-  optional int32 data_type = 2;
-
-  // For very large tensors, we may want to store them in chunks, in which
-  // case the following fields will specify the segment that is stored in
-  // the current TensorProto.
-  message Segment {
-    optional int64 begin = 1;
-    optional int64 end = 2;
-  }
-  optional Segment segment = 3;
-
-  // Tensor content must be organized in row-major order.
-  //
-  // Depending on the data_type field, exactly one of the fields below with
-  // name ending in _data is used to store the elements of the tensor.
-
-  // For float and complex64 values
-  // Complex64 tensors are encoded as a single array of floats,
-  // with the real components appearing in odd numbered positions,
-  // and the corresponding imaginary component appearing in the
-  // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i]
-  // is encoded as [1.0, 2.0, 3.0, 4.0])
-  // When this field is present, the data_type field MUST be FLOAT or COMPLEX64.
-  repeated float float_data = 4 [packed = true];
-
-  // For int32, uint8, int8, uint16, int16, bool, and float16 values
-  // float16 values must be bit-wise converted to an uint16_t prior
-  // to writing to the buffer.
-  // When this field is present, the data_type field MUST be
-  // INT32, INT16, INT8, UINT16, UINT8, BOOL, or FLOAT16
-  repeated int32 int32_data = 5 [packed = true];
-
-  // For strings.
-  // Each element of string_data is a UTF-8 encoded Unicode
-  // string. No trailing null, no leading BOM. The protobuf "string"
-  // scalar type is not used to match ML community conventions.
-  // When this field is present, the data_type field MUST be STRING
-  repeated bytes string_data = 6;
-
-  // For int64.
-  // When this field is present, the data_type field MUST be INT64
-  repeated int64 int64_data = 7 [packed = true];
-
-  // Optionally, a name for the tensor.
-  optional string name = 8; // namespace Value
-
-  // A human-readable documentation for this tensor. Markdown is allowed.
-  optional string doc_string = 12;
-
-  // Serializations can either use one of the fields above, or use this
-  // raw bytes field. The only exception is the string case, where one is
-  // required to store the content in the repeated bytes string_data field.
-  //
-  // When this raw_data field is used to store tensor value, elements MUST
-  // be stored as fixed-width, little-endian order.
-  // Floating-point data types MUST be stored in IEEE 754 format.
-  // Complex64 elements must be written as two consecutive FLOAT values, real component first.
-  // Complex128 elements must be written as two consecutive DOUBLE values, real component first.
-  // Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false).
-  //
-  // Note: the advantage of a specific field rather than the raw_data field is
-  // that in some cases (e.g. int data), protobuf does a better packing via
-  // variable length storage, and may lead to a smaller binary footprint.
-  // When this field is present, the data_type field MUST NOT be STRING or UNDEFINED
-  optional bytes raw_data = 9;
-
-  // Data can be stored inside the protobuf file using type-specific fields or raw_data.
-  // Alternatively, raw bytes data can be stored in an external file, using the external_data field.
-  // external_data stores key-value pairs describing data location. Recognized keys are:
-  // - "location" (required) - POSIX filesystem path relative to the directory where the ONNX
-  //                           protobuf model was stored
-  // - "offset" (optional) - position of byte at which stored data begins. Integer stored as string.
-  //                         Offset values SHOULD be multiples of 4096 (page size) to enable mmap support.
-  // - "length" (optional) - number of bytes containing data. Integer stored as string.
-  // - "checksum" (optional) - SHA1 digest of the file specified under the 'location' key.
-  repeated StringStringEntryProto external_data = 13;
-
-  // Location of the data for this tensor. MUST be one of:
-  // - DEFAULT - data stored inside the protobuf message. Data is stored in raw_data (if set) otherwise in type-specified field.
-  // - EXTERNAL - data stored in an external location as described by external_data field.
-  enum DataLocation {
-    DEFAULT = 0;
-    EXTERNAL = 1;
-  }
-
-  // If value not set, data is stored in raw_data (if set) otherwise in type-specified field.
-  optional DataLocation data_location = 14;
-
-  // For double
-  // Complex128 tensors are encoded as a single array of doubles,
-  // with the real components appearing in odd numbered positions,
-  // and the corresponding imaginary component appearing in the
-  // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i]
-  // is encoded as [1.0, 2.0, 3.0, 4.0])
-  // When this field is present, the data_type field MUST be DOUBLE or COMPLEX128
-  repeated double double_data = 10 [packed = true];
-
-  // For uint64 and uint32 values
-  // When this field is present, the data_type field MUST be
-  // UINT32 or UINT64
-  repeated uint64 uint64_data = 11 [packed = true];
-}
-
-// Defines a tensor shape. A dimension can be either an integer value
-// or a symbolic variable. A symbolic variable represents an unknown
-// dimension.
-message TensorShapeProto {
-  message Dimension {
-    oneof value {
-      int64 dim_value = 1;
-      string dim_param = 2;   // namespace Shape
-    };
-    // Standard denotation can optionally be used to denote tensor
-    // dimensions with standard semantic descriptions to ensure
-    // that operations are applied to the correct axis of a tensor.
-    // Refer to https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md#denotation-definition
-    // for pre-defined dimension denotations.
-    optional string denotation = 3;
-  };
-  repeated Dimension dim = 1;
-}
-
-// Types
-//
-// The standard ONNX data types.
-message TypeProto {
-
-  message Tensor {
-    // This field MUST NOT have the value of UNDEFINED
-    // This field MUST have a valid TensorProto.DataType value
-    // This field MUST be present for this version of the IR.
-    optional int32 elem_type = 1;
-    optional TensorShapeProto shape = 2;
-  }
-
-
-  oneof value {
-    // The type of a tensor.
-    Tensor tensor_type = 1;
-
-  }
-
-  // An optional denotation can be used to denote the whole
-  // type with a standard semantic description as to what is
-  // stored inside. Refer to https://github.com/onnx/onnx/blob/master/docs/TypeDenotation.md#type-denotation-definition
-  // for pre-defined type denotations.
-  optional string denotation = 6;
-}
-
-// Operator Sets
-//
-// OperatorSets are uniquely identified by a (domain, opset_version) pair.
-message OperatorSetIdProto {
-  // The domain of the operator set being identified.
-  // The empty string ("") or absence of this field implies the operator
-  // set that is defined as part of the ONNX specification.
-  // This field MUST be present in this version of the IR when referring to any other operator set.
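Resolving those external_data entries amounts to scanning the documented key/value pairs; a hedged sketch (`external_location` is a hypothetical helper; only the keys listed above are consulted):

```cpp
#include <string>
#include "onnx.pb.h"

// Returns the required "location" value; "offset" stays a string per the spec.
std::string external_location(const onnx::TensorProto& t, std::string* offset) {
    std::string location;
    for (const auto& kv : t.external_data()) {
        if (kv.key() == "location") {
            location = kv.value();
        } else if (kv.key() == "offset" && offset != nullptr) {
            *offset = kv.value();  // integer stored as string
        }
    }
    return location;
}
```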
- optional string domain = 1;
-
-  // The version of the operator set being identified.
-  // This field MUST be present in this version of the IR.
-  optional int64 version = 2;
-}
\ No newline at end of file
diff --git a/model-tools/src/onnx/onnx_adaptee.h b/model-tools/src/onnx/onnx_adaptee.h
deleted file mode 100644
index 1c7e4b35..00000000
--- a/model-tools/src/onnx/onnx_adaptee.h
+++ /dev/null
@@ -1,1235 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _H_ONNXADAPTEE
-#define _H_ONNXADAPTEE
-
-#include <string>
-#include <fstream>
-#include <iostream>
-#include <map>
-#include <vector>
-#include <climits>
-#include <google/protobuf/message.h>
-#include <google/protobuf/io/coded_stream.h>
-#include <google/protobuf/io/zero_copy_stream_impl.h>
-#include "onnx.pb.h"
-
-#include "type.h"
-#include "converter.h"
-#include "model_serialize_deserialize.hpp"
-#include "model_tools.h"
-#include "model_adaptee.h"
-#include "ut_util.h"
-
-class OnnxAdaptee: public ModelAdaptee {
-public:
-    OnnxAdaptee(int removePreprocessOpNum_outside) {
-        this->removePreprocessOpNum = removePreprocessOpNum_outside;
-    }
-    ~OnnxAdaptee() {}
-
-protected:
-    EE read_from_onnx_file(const char* path, google::protobuf::Message* message) {
-        std::ifstream fs(path, std::ifstream::in | std::ifstream::binary);
-        if (!fs.is_open()) {
-            return NOT_FOUND;
-        }
-
-        google::protobuf::io::IstreamInputStream input(&fs);
-        google::protobuf::io::CodedInputStream codedstr(&input);
-
-        codedstr.SetTotalBytesLimit(INT_MAX, INT_MAX / 2);
-
-        bool ret = message->ParseFromCodedStream(&codedstr);
-        fs.close();
-
-        return (ret) ?
SUCCESS : NOT_SUPPORTED; - } - - OperatorType convert_onnx_type(std::string inputType) { - if (inputType == "Conv") { - return OT_Conv; - } else if (inputType == "BatchNormalization") { - return OT_BatchNorm; - } else if (inputType == "Sum" || inputType == "Add" || inputType == "Mul" || inputType == "Div") { - return OT_Eltwise; - } else if (inputType == "Gemm") { - return OT_FC; - } else if (inputType == "AveragePool" || inputType == "MaxPool" - || inputType == "ReduceMean" || inputType == "GlobalAveragePool") { - return OT_Pooling; - } else if (inputType == "Relu" || inputType == "LeakyRelu") { - return OT_Relu; - } else if (inputType == "Softmax") { - return OT_Softmax; - } else if (inputType == "Concat") { - return OT_Concat; - } else if (inputType == "Pad") { - return OT_Pad; - } else if (inputType == "Max" || inputType == "Min" || inputType == "Clip") { - return OT_Clip; - } else if (inputType == "Reshape") { - return OT_Reshape; - } else if (inputType == "Squeeze") { - return OT_Squeeze; - } else if (inputType == "Transpose") { - return OT_Transpose; - } else if (inputType == "Gather") { - return OT_Gather; - } else if (inputType == "Unsqueeze") { - return OT_Unsqueeze; - } else if (inputType == "Upsample") { - return OT_Upsample; - } else if (inputType == "Cast") { - return OT_Cast; - } else if (inputType == "Constant") { - return OT_Constant; - } else if (inputType == "MatMul") { - return OT_MatMul; - } else if (inputType == "Flatten") { - return OT_Flatten; - } else if (inputType == "ConvTranspose") { - return OT_Deconvolution; - } else if (inputType == "Tanh") { - return OT_TanH; - } else if (inputType == "LogSoftmax") { - return OT_LogSoftmax; - } else { - return OT_None; - } - } - - std::vector get_node_vector_ints_attribute_by_name(const onnx::NodeProto& node, const char* key) - { - std::vector result; - for (int i = 0; i < node.attribute_size(); i++) { - const onnx::AttributeProto& attribute = node.attribute(i); - if (attribute.name() == key) { - result.resize(attribute.ints_size()); - for (int j = 0; j < attribute.ints_size(); j++) { - result[j] = attribute.ints(j); - } - break; - } - } - return result; - } - - std::vector get_node_vector_float_tensor_attribute_by_name(const onnx::NodeProto& node, const char* key) - { - std::vector result; - for (int i = 0; i < node.attribute_size(); i++) { - const onnx::AttributeProto& attribute = node.attribute(i); - if (attribute.name() == key) { - CHECK_REQUIREMENT(4 == attribute.type()); - const onnx::TensorProto& tp = attribute.t(); - F32 *value; - if (tp.has_raw_data()) { - const std::string& rawData = tp.raw_data(); - value = (F32*)(rawData.data()); - } else if (tp.data_type() == 1) { - value = (F32*)(tp.float_data().data()); - } else { - std::cout << "[WARNING] Constant not extracted\n"; - return result; - } - - result.resize(tp.dims(0)); - for (int j = 0; j < tp.dims(0); j++) { - result[j] = value[j]; - } - break; - } - } - return result; - } - - int get_node_single_int_attribute_by_name(const onnx::NodeProto& node, const char* key, int defaultValue = 0) { - for (int i = 0; i < node.attribute_size(); i++) { - const onnx::AttributeProto& attribute = node.attribute(i); - if (attribute.name() == key) { - return attribute.i(); - } - } - return defaultValue; - } - - std::string get_node_str_attribute_by_name(const onnx::NodeProto& node, const char* key, - const std::string& defaultValue = std::string()) { - for (int i = 0; i < node.attribute_size(); i++) { - const onnx::AttributeProto& attribute = node.attribute(i); - if 
(attribute.name() == key) {
-                return attribute.s();
-            }
-        }
-        return defaultValue;
-    }
-
-    float get_node_float_attribute_by_name(const onnx::NodeProto& node, const char* key, float defaultValue = 0.f) {
-        for (int i = 0; i < node.attribute_size(); i++) {
-            const onnx::AttributeProto& attribute = node.attribute(i);
-            if (attribute.name() == key) {
-                return attribute.f();
-            }
-        }
-        return defaultValue;
-    }
-
-    int get_data_size_from_tensor_proto(const onnx::TensorProto& tensorProto) {
-        if (tensorProto.has_raw_data()) {
-            const std::string& rawData = tensorProto.raw_data();
-            int size = (int)rawData.size() / sizeof(float);
-            return size;
-        } else if (tensorProto.data_type() == 1) {
-            return tensorProto.float_data_size();
-        }
-        return 0;
-    }
-
-    float* get_ptr_from_weight_obj(const onnx::TensorProto& tensorProto) {
-        if (tensorProto.has_raw_data()) {
-            const std::string& rawData = tensorProto.raw_data();
-            float* paramPtr = (float*)(rawData.data());
-            return paramPtr;
-        } else if (tensorProto.data_type() == 1) {
-            float* paramPtr = (float*)(tensorProto.float_data().data());
-            return paramPtr;
-        }
-        return nullptr;
-    }
-
-    std::vector<int> get_reshapeInfo_from_tensorProto(const onnx::TensorProto& tp)
-    {
-        int size = 0;
-        std::vector<int> shape;
-
-        // int64
-        if (tp.data_type() == 7) {
-            const int64_t* shapeData = 0;
-            if (tp.has_raw_data()) {
-                shapeData = (const int64_t*)tp.raw_data().data();
-                size = tp.raw_data().size() / 8;
-            } else {
-                shapeData = tp.int64_data().data();
-                size = tp.int64_data_size();
-            }
-
-            for (int j = 0; j < size; j++) {
-                shape.push_back(shapeData[j]);
-            }
-        } else if (tp.data_type() == 6) {  // int32
-            const int32_t* shapeData = 0;
-            if (tp.has_raw_data()) {
-                shapeData = (const int32_t*)tp.raw_data().data();
-                size = tp.raw_data().size() / 4;
-            } else {
-                shapeData = tp.int32_data().data();
-                size = tp.int32_data_size();
-            }
-
-            for (int j = 0; j < size; j++) {
-                shape.push_back(shapeData[j]);
-            }
-        }
-        return shape;
-    }
-
-    EE parse_file(std::string dir, std::string mfn) override
-    {
-        std::string onnxSuffix = ".onnx";
-        std::string onnxPath = dir + "/" + mfn + onnxSuffix;
-        this->modelName = mfn;
-
-        EE ret = read_from_onnx_file(onnxPath.c_str(), &onnxModel);
-        if (ret != SUCCESS) {
-            std::cerr << "[ERROR] failed to load " << onnxPath;
-            exit(1);
-        }
-
-        onnxGraph = onnxModel.graph();
-
-        for (int i = 0; i < onnxGraph.initializer_size(); i++) {
-            const onnx::TensorProto& initializer = onnxGraph.initializer(i);
-            weights[initializer.name()] = initializer;
-        }
-        return ret;
-    }
-
-    EE adapt_operators(ModelSpec* ms) override
-    {
-        EE ret = SUCCESS;
-        str_copy(ms->model_name, modelName.c_str(), modelName.length());
-        ms->model_name[NAME_LEN - 1] = '\0';
-        ms->dt = DT_F32;
-
-        int onnxNodeCount = onnxGraph.node_size();
-
-        int input_node_num = onnxGraph.input().size();
-        int output_node_num = onnxGraph.output().size();
-        if (input_node_num != 1) {
-            std::cerr << "[WARNING] number of input nodes is not 1" << std::endl;
-            // return NOT_SUPPORTED;
-        }
-
-        std::vector<std::string> exactly_input_names;
-        std::vector<std::vector<int>> input_dimens;
-        for (int i = 0; i < input_node_num; i++) {
-            const onnx::ValueInfoProto& input_node = onnxGraph.input(i);
-            exactly_input_names.push_back(input_node.name());
-            std::vector<int> dims_list;
-            int node_dimension_size = input_node.type().tensor_type().shape().dim().size();
-
-            if (node_dimension_size == 4) {
-                // extraction for 4 dimension tensor
-                int dim_0 = input_node.type().tensor_type().shape().dim(0).dim_value();
-                if (dim_0 == 0) {
-                    dims_list.push_back(1);
-                    dims_list.push_back(input_node.type().tensor_type().shape().dim(3).dim_value());
-                    dims_list.push_back(input_node.type().tensor_type().shape().dim(1).dim_value());
-                    dims_list.push_back(input_node.type().tensor_type().shape().dim(2).dim_value());
-                } else {
-                    dims_list.push_back(input_node.type().tensor_type().shape().dim(0).dim_value());
-                    dims_list.push_back(input_node.type().tensor_type().shape().dim(1).dim_value());
-                    dims_list.push_back(input_node.type().tensor_type().shape().dim(2).dim_value());
-                    dims_list.push_back(input_node.type().tensor_type().shape().dim(3).dim_value());
-                }
-            } else if (node_dimension_size == 3 || node_dimension_size == 2) {
-                for (int j = 0; j < node_dimension_size; j++) {
-                    dims_list.push_back(input_node.type().tensor_type().shape().dim(j).dim_value());
-                }
-            }
-            input_dimens.push_back(dims_list);
-        }
-
-        ms->num_inputs = input_node_num;
-        ms->input_names = (I8**)mt_new_storage(ms->num_inputs * sizeof(I8*));
-        if (exactly_input_names.size() == 1) {
-            const onnx::NodeProto& theFirstNode = onnxGraph.node(removePreprocessOpNum);
-            std::string modelInputName = theFirstNode.input(0);
-            exactly_input_names[0] = modelInputName;
-        }
-//        const onnx::NodeProto& theFirstNode = onnxGraph.node(removePreprocessOpNum);  // need to be flexible
-//        std::string modelInputName = theFirstNode.input(0);
-//        ms->input_names[0] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8));
-//        str_copy(ms->input_names[0], modelInputName.c_str(), modelInputName.length());
-        for (int k = 0; k < input_node_num; k++) {
-            ms->input_names[k] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8));
-            str_copy(ms->input_names[k], exactly_input_names[k].c_str(), exactly_input_names[k].length());
-        }
-        ms->input_dims = (TensorDesc*)mt_new_storage(sizeof(TensorDesc) * ms->num_inputs);
-        for (int i = 0; i < ms->num_inputs; i++) {
-            int curInputDimSize = input_dimens[i].size();
-            TensorDesc input_desc;
-            if (curInputDimSize == 4) {
-                input_desc = tensor4d(DT_F32, input_dimens[i][0], input_dimens[i][1], input_dimens[i][2], input_dimens[i][3]);
-            } else if (curInputDimSize == 3) {
-                input_desc = tensor3df(DT_F32, DF_MTK, input_dimens[i][0], input_dimens[i][1], input_dimens[i][2]);
-            } else if (curInputDimSize == 2) {
-                input_desc = tensor2df(DT_F32, DF_NORMAL, input_dimens[i][0], input_dimens[i][1]);
-            } else {
-                std::cerr << "[ERROR] unsupported input dimension!"
<< std::endl; - } - ms->input_dims[i] = input_desc; - } - - ms->num_outputs = output_node_num; - ms->output_names = (I8**)mt_new_storage(ms->num_outputs * sizeof(I8*)); - // const onnx::NodeProto& the_last_node = onnxGraph.node(onnxNodeCount - 1); - // std::string modelOutputName = the_last_node.output(0); - // ms->output_names[0] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - // str_copy(ms->output_names[0], modelOutputName.c_str(), modelOutputName.length()); - for (int k=0; k< output_node_num; k++) { - ms->output_names[k] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - str_copy(ms->output_names[k], onnxGraph.output(k).name().c_str(), onnxGraph.output(k).name().length()); - } - - int bnOpNum = 0; - int constantOpNum = 0; - for (int i=0; i < onnxNodeCount; i++) { - const onnx::NodeProto& tmpNode = onnxGraph.node(i); - if (tmpNode.op_type() == "BatchNormalization") { - bnOpNum++; - } else if (tmpNode.op_type() == "Constant") { - if (i >= removePreprocessOpNum) { - constantOpNum++; - } - } - } - - ms->num_operator_specs = onnxNodeCount + bnOpNum - constantOpNum - removePreprocessOpNum; // appending space for scale op - OperatorSpec* opsPtr = (OperatorSpec*)mt_new_storage(sizeof(OperatorSpec) * ms->num_operator_specs); - ms->ops = opsPtr; - for (I32 i = 0; i < ms->num_operator_specs; i++) { - ms->ops[i].tensor_positions = nullptr; - ms->ops[i].num_quant_feature = 0; - ms->ops[i].feature_scale = nullptr; - } - - // Some models transformed from TF store weight and bias as Constant OP - int numUnseenConstants = 0; - nodeIndex = 0; - for (int i = 0; i < removePreprocessOpNum; i++) { - this->node = onnxGraph.node(nodeIndex); - this->op = node.op_type(); - if (op == "Constant") { - handle_Constant(); - numUnseenConstants++; - } - nodeIndex++; - } - if (0 != numUnseenConstants) { - std::cout << removePreprocessOpNum << " OPs are skipped, and " << numUnseenConstants << " of them are Constant OP.\n"; - } - - nodeIndex = removePreprocessOpNum; - int opIndex = 0; - for (int i = removePreprocessOpNum; i < onnxNodeCount; i++) { - this->node = onnxGraph.node(nodeIndex); - this->op = node.op_type(); - if (op == "Constant") { - handle_Constant(); - nodeIndex++; - continue; - } - std::string opName = node.name(); - if (opName.empty()) { - opName = node.output(0); - } - int opInputNum = (int)node.input_size(); - opFinalInputNum = opInputNum; - std::vector inputNames; - std::vector op_weight_objs; - for (int j = 0; j < opInputNum; j++) { - const std::string& input_name = node.input(j); - if (weights.find(input_name) != weights.end()) { - opFinalInputNum--; - op_weight_objs.push_back(input_name); - } else { - inputNames.push_back(input_name); - if (op == "Max" || op == "Min") { - opFinalInputNum = 1; - break; - } - } - } - int opOutputNum = (int)node.output_size(); - std::vector outputNames; - for (int j = 0; j < opOutputNum; j++) { - const std::string& output_name = node.output(j); - outputNames.push_back(output_name); - } - - str_copy(opsPtr[opIndex].name, opName.c_str(), opName.length()); - OperatorType opType = convert_onnx_type(op); - opsPtr[opIndex].type = opType; - opsPtr[opIndex].num_inputs = opFinalInputNum; - opsPtr[opIndex].input_tensors_name = (I8**)mt_new_storage(opsPtr[opIndex].num_inputs * sizeof(I8 *)); - for (U32 j = 0; j < opsPtr[opIndex].num_inputs; j++) { - opsPtr[opIndex].input_tensors_name[j] = (I8*)mt_new_storage(NAME_LEN * sizeof(I8)); - str_copy(opsPtr[opIndex].input_tensors_name[j], inputNames[j].c_str(), inputNames[j].length()); - } - opsPtr[opIndex].num_outputs = opOutputNum; - 
opsPtr[opIndex].output_tensors_name = (I8**)mt_new_storage(opsPtr[opIndex].num_outputs * sizeof(I8 *)); - for (U32 j = 0; j < opsPtr[opIndex].num_outputs; j++) { - opsPtr[opIndex].output_tensors_name[j] = (I8*)mt_new_storage(NAME_LEN * sizeof(I8)); - str_copy(opsPtr[opIndex].output_tensors_name[j], outputNames[j].c_str(), outputNames[j].length()); - } - - if ((op == "Add" || op == "Mul" || op == "Div") && opFinalInputNum == 1) { - weightOpIndexLists.push_back(nodeIndex); - opsPtr[opIndex].type = OT_Scale; - } else if (op == "Transpose" && opFinalInputNum == 0) { - weightOpIndexLists.push_back(nodeIndex); - } else { - ParameterSpec curPs; - ret = adapt_operator(opType, &curPs); - CHECK_STATUS(ret); - opsPtr[opIndex].ps = curPs; - - if (opType == OT_BatchNorm) { - std::string scaleInputName = outputNames[0]; - std::string scaleOpName = "scale_" + opName; - opIndex++; - str_copy(opsPtr[opIndex].name, scaleOpName.c_str(), scaleOpName.length()); - opsPtr[opIndex].type = OT_Scale; - opsPtr[opIndex].num_inputs = 1; - opsPtr[opIndex].input_tensors_name = (I8 **)mt_new_storage(sizeof(I8 *)); - opsPtr[opIndex].input_tensors_name[0] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - str_copy(opsPtr[opIndex].input_tensors_name[0], scaleInputName.c_str(), scaleInputName.length()); - opsPtr[opIndex].num_outputs = 1; - opsPtr[opIndex].output_tensors_name = (I8 **)mt_new_storage(sizeof(I8 *)); - opsPtr[opIndex].output_tensors_name[0] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - str_copy(opsPtr[opIndex].output_tensors_name[0], scaleInputName.c_str(), scaleInputName.length()); - } - } - - nodeIndex++; - opIndex++; - } - ms->num_weight_specs = weightOpIndexLists.size() + bnOpNum; - return ret; - } - - EE adapt_weights(ModelSpec* ms) override - { - EE ret = SUCCESS; - WeightSpec* wsPtr = (WeightSpec*)mt_new_storage(sizeof(WeightSpec) * ms->num_weight_specs); - for (int j = 0; j < ms->num_weight_specs; j++) { - wsPtr[j].num_quant_scale = 0; - wsPtr[j].weight_scale = nullptr; - } - ms->ws = wsPtr; - int weightOpIndexIndeed = 0; - for (U32 i = 0; i < (U32)ms->num_weight_specs; i++) { - int weightOpIndex = weightOpIndexLists[weightOpIndexIndeed]; - const onnx::NodeProto& weightNode = onnxGraph.node(weightOpIndex); - std::string weightOpName = weightNode.name(); - if (weightOpName.empty()) { - weightOpName = weightNode.output(0); - } - const std::string& weightOpType = weightNode.op_type(); - - if (weightOpType == "Conv" || weightOpType == "ConvTranspose") { - // to check that if any op has bias - int convInputNum = weightNode.input_size(); // if convInputNum == 3, means has bias , otherwise , do not have bias - - const onnx::TensorProto& convWeightTp = weights[weightNode.input(1)]; - - int convWeightNum = get_data_size_from_tensor_proto(convWeightTp); - float* convWeightParamPtr = get_ptr_from_weight_obj(convWeightTp); - str_copy(wsPtr[i].op_name, weightOpName.c_str(), weightOpName.length()); - - // traverse weight elements to see whether it is bnn convolution - U32 isDOREFA = 0; - U32 isXNOR = 0; - for (I32 i = 0; i < convWeightNum; i++) { - float cur = convWeightParamPtr[i]; - if (cur!=1.0 && cur!=0 && cur!=-1.0) { - isDOREFA = 0; - isXNOR = 0; - break; - } - if (cur == 0) { - if (isXNOR == 1) { - isDOREFA = 0; - isXNOR = 0; - break; - } else if (isDOREFA == 0) { - isDOREFA = 1; - } - } else if (cur == -1.0) { - if (isDOREFA == 1) { - isDOREFA = 0; - isXNOR = 0; - break; - } else if (isXNOR == 0) { - isXNOR = 1; - } - } - } - if (isDOREFA == 1) { - wsPtr[i].mdt = DT_BIN01; - } else if (isXNOR == 1) { - 
wsPtr[i].mdt = DT_BIN11; - } else { - wsPtr[i].mdt = DT_F32; // Assume weights will not all be 1.0 - } - //wsPtr[i].mdt = DT_F32; - wsPtr[i].bytes_of_weight = convWeightNum * sizeof(float); // Please do not change to bytesOf(mdt) - wsPtr[i].weight = (U8*)mt_new_storage(wsPtr[i].bytes_of_weight); - memcpy(wsPtr[i].weight, convWeightParamPtr, wsPtr[i].bytes_of_weight); - - int convBiasNum = 0; - float* convBiasParamPtr = nullptr; - if (convInputNum == 3) { - const onnx::TensorProto& convBiasTp = weights[weightNode.input(2)]; - convBiasNum = get_data_size_from_tensor_proto(convBiasTp); - convBiasParamPtr = get_ptr_from_weight_obj(convBiasTp); - wsPtr[i].bytes_of_vec = convBiasNum * sizeof(float); - if (isDOREFA || isXNOR) { - wsPtr[i].bytes_of_vec *= 2; // BNN conv must have a scale vector and a bias vector, so that it can fuse with BN - } - wsPtr[i].vec = (U8*)mt_new_storage(wsPtr[i].bytes_of_vec); - if (isDOREFA == 1 || isXNOR == 1) { - U32 vecBytes = convBiasNum * sizeof(float); - F32 *scale = (F32*)wsPtr[i].vec; - for (I32 j = 0; j < convBiasNum; j++) { - scale[j] = 1.0; - } - memcpy(wsPtr[i].vec + vecBytes, convBiasParamPtr, vecBytes); // Copy bias (if any) to the second half for BNN - } else { - memcpy(wsPtr[i].vec, convBiasParamPtr, wsPtr[i].bytes_of_vec); - } - } else { - wsPtr[i].bytes_of_vec = 0; - wsPtr[i].vec = nullptr; - } - } else if (weightOpType == "Gemm") { - // attention: fc op weight bias order is different from conv op - const onnx::TensorProto& fcWeightTp = weights[weightNode.input(1)]; - const onnx::TensorProto& fcBiasTp = weights[weightNode.input(2)]; - int fcWeightNum = get_data_size_from_tensor_proto(fcWeightTp); - int fcBiasNum = get_data_size_from_tensor_proto(fcBiasTp); - float* fcWeightParamPtr = get_ptr_from_weight_obj(fcWeightTp); - float* fcBiasParamPtr = get_ptr_from_weight_obj(fcBiasTp); - str_copy(wsPtr[i].op_name, weightOpName.c_str(), weightOpName.length()); - wsPtr[i].mdt = DT_F32; - wsPtr[i].bytes_of_weight = fcWeightNum * sizeof(float); - wsPtr[i].weight = (U8*)mt_new_storage(wsPtr[i].bytes_of_weight); - memcpy(wsPtr[i].weight, fcWeightParamPtr, wsPtr[i].bytes_of_weight); - wsPtr[i].bytes_of_vec = fcBiasNum * sizeof(float); - wsPtr[i].vec = (U8*)mt_new_storage(wsPtr[i].bytes_of_vec); - memcpy(wsPtr[i].vec, fcBiasParamPtr, wsPtr[i].bytes_of_vec); - } else if(weightOpType == "BatchNormalization") { - const onnx::TensorProto& scale = weights[weightNode.input(1)]; - const onnx::TensorProto& bias = weights[weightNode.input(2)]; - const onnx::TensorProto& mean = weights[weightNode.input(3)]; - const onnx::TensorProto& var = weights[weightNode.input(4)]; - - float* meanPtr = get_ptr_from_weight_obj(mean); - int bnMeanNum = get_data_size_from_tensor_proto(mean); - float* varPtr = get_ptr_from_weight_obj(var); - int bnVarNum = get_data_size_from_tensor_proto(var); - - str_copy(wsPtr[i].op_name, weightOpName.c_str(), weightOpName.length()); - wsPtr[i].mdt = DT_F32; - wsPtr[i].bytes_of_weight = bnMeanNum * sizeof(float); - wsPtr[i].bytes_of_vec = bnVarNum * sizeof(float); - - wsPtr[i].weight = (U8*)mt_new_storage(wsPtr[i].bytes_of_weight); - memcpy(wsPtr[i].weight, meanPtr, wsPtr[i].bytes_of_weight); - wsPtr[i].vec = (U8*)mt_new_storage(wsPtr[i].bytes_of_vec); - memcpy(wsPtr[i].vec, varPtr, wsPtr[i].bytes_of_vec); - - // for scale - std::string scaleWeightOpName = "scale_" + weightOpName; - i = i + 1; - float* scalePtr = get_ptr_from_weight_obj(scale); - int scaleWeightNum = get_data_size_from_tensor_proto(scale); - float* biasPtr = get_ptr_from_weight_obj(bias); 
- int scaleBiasNum = get_data_size_from_tensor_proto(bias); - - str_copy(wsPtr[i].op_name, scaleWeightOpName.c_str(), scaleWeightOpName.length()); - wsPtr[i].mdt = DT_F32; - wsPtr[i].bytes_of_weight = scaleWeightNum * sizeof(float); - wsPtr[i].bytes_of_vec = scaleBiasNum * sizeof(float); - - wsPtr[i].weight = (U8*)mt_new_storage(wsPtr[i].bytes_of_weight); - memcpy(wsPtr[i].weight, scalePtr, wsPtr[i].bytes_of_weight); - wsPtr[i].vec = (U8*)mt_new_storage(wsPtr[i].bytes_of_vec); - memcpy(wsPtr[i].vec, biasPtr, wsPtr[i].bytes_of_vec); - } else if(weightOpType == "Add") { - const onnx::TensorProto& bias = weights[weightNode.input(1)]; - float* bias_ptr = get_ptr_from_weight_obj(bias); - int bias_num = get_data_size_from_tensor_proto(bias); - - str_copy(wsPtr[i].op_name, weightOpName.c_str(), weightOpName.length()); - wsPtr[i].mdt = DT_F32; - wsPtr[i].bytes_of_weight = 0; - wsPtr[i].bytes_of_vec = bias_num * sizeof(float); - wsPtr[i].weight = nullptr; - wsPtr[i].vec = (U8*)mt_new_storage(wsPtr[i].bytes_of_vec); - memcpy(wsPtr[i].vec, bias_ptr, wsPtr[i].bytes_of_vec); - } else if(weightOpType == "Mul") { - const onnx::TensorProto& weight = weights[weightNode.input(1)]; - float* weight_ptr = get_ptr_from_weight_obj(weight); - int weight_num = get_data_size_from_tensor_proto(weight); - - str_copy(wsPtr[i].op_name, weightOpName.c_str(), weightOpName.length()); - wsPtr[i].mdt = DT_F32; - wsPtr[i].bytes_of_weight = weight_num * sizeof(float); - wsPtr[i].bytes_of_vec = 0; - wsPtr[i].weight = (U8*)mt_new_storage(wsPtr[i].bytes_of_weight); - memcpy(wsPtr[i].weight, weight_ptr, wsPtr[i].bytes_of_weight); - wsPtr[i].vec = nullptr; - } else if(weightOpType == "Div") { - const onnx::TensorProto& weight = weights[weightNode.input(1)]; - float* weight_ptr = get_ptr_from_weight_obj(weight); - int weight_num = get_data_size_from_tensor_proto(weight); - - str_copy(wsPtr[i].op_name, weightOpName.c_str(), weightOpName.length()); - wsPtr[i].mdt = DT_F32; - wsPtr[i].bytes_of_weight = weight_num * sizeof(float); - wsPtr[i].bytes_of_vec = 0; - wsPtr[i].weight = (U8*)mt_new_storage(wsPtr[i].bytes_of_weight); - F32 *scale = (F32*)wsPtr[i].weight; - for (int j = 0; j < weight_num; j++) { - scale[j] = 1 / weight_ptr[j]; - } - wsPtr[i].vec = nullptr; - } else if (weightOpType == "Transpose") { - const onnx::TensorProto& weight = weights[weightNode.input(0)]; - float* weight_ptr = get_ptr_from_weight_obj(weight); - int weight_num = get_data_size_from_tensor_proto(weight); - - str_copy(wsPtr[i].op_name, weightOpName.c_str(), weightOpName.length()); - wsPtr[i].mdt = DT_F32; - wsPtr[i].bytes_of_weight = weight_num * sizeof(float); - // For the time being, use bytes_of_vec to record the horizontal length of weight - wsPtr[i].bytes_of_vec = weight.dims(0); - wsPtr[i].weight = (U8*)mt_new_storage(wsPtr[i].bytes_of_weight); - memcpy(wsPtr[i].weight, weight_ptr, wsPtr[i].bytes_of_weight); - wsPtr[i].vec = nullptr; - } - weightOpIndexIndeed++; - } - return ret; - } - - ParameterSpec adapt_Reshape() override - { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - ReshapeParamSpec reshapePs; - initialization_zero(&reshapePs, sizeof(reshapePs)); - std::vector reshapeInfo; - if (node.input_size() == 1) { - reshapeInfo = get_node_vector_ints_attribute_by_name(node, "shape"); - } else { - reshapeInfo = get_reshapeInfo_from_tensorProto(weights[node.input(1)]); // tp:weights[node.input(1)] - } - reshapePs.shape_size = reshapeInfo.size(); - memcpy(reshapePs.shape_dims, reshapeInfo.data(), reshapePs.shape_size * 
sizeof(I32));
-        curPs.reshape_spec = reshapePs;
-        return curPs;
-    }
-
-    ParameterSpec adapt_Upsample() override
-    {
-        ParameterSpec curPs;
-        initialization_zero(&curPs, sizeof(curPs));
-        UpsampleParamSpec upsamplePs;
-        std::string upsampleMode = get_node_str_attribute_by_name(node, "mode", "linear");
-        str_copy(upsamplePs.upsample_mode, upsampleMode.c_str(), upsampleMode.length());
-
-        // Get scales from Constant
-        const onnx::TensorProto& scales = weights[node.input(1)];
-        CHECK_REQUIREMENT(scales.dims(0) == 4);
-        F32 *value = nullptr;
-        if (scales.has_raw_data()) {
-            const std::string& rawData = scales.raw_data();
-            value = (F32*)(rawData.data());
-        } else if (scales.data_type() == 1) {
-            value = (F32*)(scales.float_data().data());
-        } else {
-            std::cerr << "[ERROR] Upsample cannot extract scales from Constant\n";
-            CHECK_STATUS(NOT_SUPPORTED);
-        }
-        memcpy(upsamplePs.scale, value, 4 * bytesOf(DT_F32));
-        curPs.upsample_spec = upsamplePs;
-        return curPs;
-    }
-
-    ParameterSpec adapt_Transpose() override
-    {
-        ParameterSpec curPs;
-        initialization_zero(&curPs, sizeof(curPs));
-        TransposeParamSpec transposePs;
-        std::vector<int> transpose_info = get_node_vector_ints_attribute_by_name(node, "perm");
-        transposePs.trans_size = transpose_info.size();
-        memcpy(transposePs.trans_dims, transpose_info.data(), transposePs.trans_size * sizeof(U32));
-        curPs.transpose_spec = transposePs;
-        return curPs;
-    }
-
-    ParameterSpec adapt_Clip() override
-    {
-        ParameterSpec curPs;
-        initialization_zero(&curPs, sizeof(curPs));
-        ClipParamSpec clipParam;
-        if (op == "Max") {
-            clipParam.min = 0;
-            clipParam.max = UNI_F16_MAX;
-        } else if (op == "Min") {
-            clipParam.min = -UNI_F16_MAX;
-            clipParam.max = 1;
-        } else {  // op == "Clip"
-            clipParam.min = get_node_float_attribute_by_name(node, "min", -UNI_F16_MAX);
-            clipParam.max = get_node_float_attribute_by_name(node, "max", UNI_F16_MAX);
-        }
-        curPs.clip_spec = clipParam;
-        return curPs;
-    }
-
-    ParameterSpec adapt_Conv() override
-    {
-        weightOpIndexLists.push_back(nodeIndex);
-        ParameterSpec curPs;
-        initialization_zero(&curPs, sizeof(curPs));
-        std::vector<int> kernelShape = get_node_vector_ints_attribute_by_name(node, "kernel_shape");
-        std::vector<int> dilations = get_node_vector_ints_attribute_by_name(node, "dilations");
-        std::vector<int> strides = get_node_vector_ints_attribute_by_name(node, "strides");
-        std::vector<int> pads = get_node_vector_ints_attribute_by_name(node, "pads");
-        int group = get_node_single_int_attribute_by_name(node, "group", 1);
-
-        const onnx::TensorProto& weight = weights[node.input(1)];
-        ConvolutionParamSpec cps;
-        initialization_zero(&cps, sizeof(cps));
-        cps.num_outputs = weight.dims(0);
-
-        if (kernelShape.size() == 2) {
-            cps.kernel_size_h = kernelShape[0];
-            cps.kernel_size_w = kernelShape[1];
-        } else if (kernelShape.size() == 1) {
-            cps.kernel_size_h = kernelShape[0];
-            cps.kernel_size_w = 1;
-        } else {
-            std::cerr << "[ERROR] convolution: kernel_size unknown" << std::endl;
-            exit(1);
-        }
-
-        if (dilations.size() == 2) {
-            cps.dilatedRate_h = dilations[0];
-            cps.dilatedRate_w = dilations[1];
-        } else if (dilations.size() == 1) {
-            cps.dilatedRate_h = dilations[0];
-            cps.dilatedRate_w = 1;
-        } else {
-            std::cout << "[WARNING] convolution: dilation unknown.
Default to 1" << std::endl; - cps.dilatedRate_h = 1; - cps.dilatedRate_w = 1; - } - - if (strides.size() == 2) { - cps.stride_h = strides[0]; - cps.stride_w = strides[1]; - } else if (strides.size() == 1) { - cps.stride_h = strides[0]; - cps.stride_w = 1; - } else { - std::cerr << "[ERROR] convolution: stride unknown" << std::endl; - exit(1); - } - - if(pads.size() == 4) { - if (cps.kernel_size_h == cps.kernel_size_w && (pads[0] != pads[2] || pads[1] != pads[3])) { - cps.padding_top = UNI_MAX(pads[0], pads[2]); - cps.padding_bottom = UNI_MAX(pads[0], pads[2]); - cps.padding_left = UNI_MAX(pads[1], pads[3]); - cps.padding_right = UNI_MAX(pads[1], pads[3]); - } else { - cps.padding_top = pads[0]; - cps.padding_left = pads[1]; - cps.padding_bottom = pads[2]; - cps.padding_right = pads[3]; - } - } else if (pads.size() == 2) { - cps.padding_top = pads[0]; - cps.padding_bottom = pads[1]; - cps.padding_left = 0; - cps.padding_right = 0; - } else { - std::cerr << "[ERROR] deconvolution: pad unknown" << std::endl; - exit(1); - } - - cps.group = group; - if (cps.group == 1) { - if (cps.dilatedRate_h > 1 || cps.dilatedRate_w > 1) { - cps.convolution_type = Convolution_Dilation; - } else { - cps.convolution_type = Convolution_Pointwise; - } - } else { - cps.convolution_type = Convolution_Depthwise; - } - - cps.dw_activation_type = ACTIVATION_NULL; - cps.pw_activation_type = ACTIVATION_NULL; - curPs.conv_spec = cps; - return curPs; - } - - ParameterSpec adapt_Deconvolution() override - { - weightOpIndexLists.push_back(nodeIndex); - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - std::vector kernelShape = get_node_vector_ints_attribute_by_name(node, "kernel_shape"); - std::vector dilations = get_node_vector_ints_attribute_by_name(node, "dilations"); - std::vector strides = get_node_vector_ints_attribute_by_name(node, "strides"); - std::vector pads = get_node_vector_ints_attribute_by_name(node, "pads"); - int group = get_node_single_int_attribute_by_name(node, "group", 1); - - const onnx::TensorProto& weight = weights[node.input(1)]; - ConvolutionParamSpec cps; - initialization_zero(&cps, sizeof(cps)); - cps.num_outputs = weight.dims(1); - - if (kernelShape.size() == 2) { - cps.kernel_size_h = kernelShape[0]; - cps.kernel_size_w = kernelShape[1]; - } else if (kernelShape.size() == 1) { - cps.kernel_size_h = kernelShape[0]; - cps.kernel_size_w = 1; - } else { - std::cerr << "[ERROR] deconvolution: kernel_size unknown" << std::endl; - exit(1); - } - - if (dilations.size() == 2) { - cps.dilatedRate_h = dilations[0]; - cps.dilatedRate_w = dilations[1]; - } else if (dilations.size() == 1) { - cps.dilatedRate_h = dilations[0]; - cps.dilatedRate_w = 1; - } - else { - std::cerr << "[ERROR] deconvolution: dilation unknown" << std::endl; - exit(1); - } - - if (strides.size() == 2) { - cps.stride_h = strides[0]; - cps.stride_w = strides[1]; - } else if (strides.size() == 1) { - cps.stride_h = strides[0]; - cps.stride_w = 1; - } - else { - std::cerr << "[ERROR] deconvolution: stride unknown" << std::endl; - exit(1); - } - - if(pads.size() == 4) { - cps.padding_top = pads[0]; - cps.padding_left = pads[1]; - cps.padding_bottom = pads[2]; - cps.padding_right = pads[3]; - } else if (pads.size() == 2) { - cps.padding_top = pads[0]; - cps.padding_bottom = pads[1]; - cps.padding_left = 0; - cps.padding_right = 0; - } else { - std::cerr << "[ERROR] deconvolution: pad unknown" << std::endl; - exit(1); - } - - cps.group = group; - cps.convolution_type = Convolution_Deconvolution; - cps.dw_activation_type 
= ACTIVATION_NULL; - cps.pw_activation_type = ACTIVATION_NULL; - curPs.conv_spec = cps; - return curPs; - } - - ParameterSpec adapt_Pooling() override - { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - PoolingParamSpec pps; - initialization_zero(&pps, sizeof(pps)); - std::string autoPad = get_node_str_attribute_by_name(node, "auto_pad"); // deprecated - std::vector kernelShape = get_node_vector_ints_attribute_by_name(node, "kernel_shape"); - std::vector strides = get_node_vector_ints_attribute_by_name(node, "strides"); - std::vector pads = get_node_vector_ints_attribute_by_name(node, "pads"); - - if (op == "AveragePool" || op == "ReduceMean" || op == "GlobalAveragePool") { - pps.mode = POOLING_MEAN; - } else { - pps.mode = POOLING_MAX; - } - - if (autoPad == "SAME_UPPER") { - pps.rm = CEIL; - } else { - pps.rm = FLOOR; - } - - if (kernelShape.size() == 2) { - pps.kernel_size_h = kernelShape[0]; - pps.kernel_size_w = kernelShape[1]; - } else { - pps.kernel_size_h = 0; - pps.kernel_size_w = 0; - std::cout << "[INFO] pooling: kernel_size unknown. This could be global pooling." << std::endl; - } - - if (strides.size() == 2) { - pps.stride_h = strides[0]; - pps.stride_w = strides[1]; - } else { - pps.stride_h = 0; - pps.stride_w = 0; - std::cout << "[INFO] pooling: stride unknown. This could be global pooling." << std::endl; - } - - if (pads.size() == 4) { - pps.padding_top = pads[0]; - pps.padding_bottom = pads[2]; - pps.padding_left = pads[1]; - pps.padding_right = pads[3]; - } else { - pps.padding_top = 0; - pps.padding_bottom = 0; - pps.padding_left = 0; - pps.padding_right = 0; - } - curPs.pooling_spec = pps; - return curPs; - } - - ParameterSpec adapt_Flatten() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - FlattenParamSpec flattenPs; - flattenPs.axis = get_node_single_int_attribute_by_name(node, "axis", 1); - curPs.flatten_spec = flattenPs; - return curPs; - } - - ParameterSpec adapt_MatMul() override - { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - MatMulParamSpec matmulPs; - matmulPs.transpose_a = false; - matmulPs.transpose_b = false; - curPs.matmul_spec = matmulPs; - return curPs; - } - - ParameterSpec adapt_Fc() override - { - weightOpIndexLists.push_back(nodeIndex); - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - FullyConnectedParamSpec fcParamSpec; - fcParamSpec.num_outputs = -1; - float alpha = get_node_float_attribute_by_name(node, "alpha", 1.f); - float beta = get_node_float_attribute_by_name(node, "beta", 1.f); - int transA = get_node_single_int_attribute_by_name(node, "transA", 0); - int transB = get_node_single_int_attribute_by_name(node, "transB", 0); - - if (alpha == 1.f && beta == 1.f) { - if (transA ==0 && transB == 1) { - const onnx::TensorProto& C = weights[node.input(2)]; - int num_output = get_data_size_from_tensor_proto(C); - fcParamSpec.num_outputs = num_output; - } - }else{ - std::cerr << "[ERROR] fc: num_output unknown" << std::endl; - exit(1); - } - fcParamSpec.num_slices = 1; - fcParamSpec.slice_point[0] = fcParamSpec.num_outputs; - curPs.fc_spec = fcParamSpec; - return curPs; - } - - ParameterSpec adapt_BatchNorm() override - { - weightOpIndexLists.push_back(nodeIndex); - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - float epsilon = get_node_float_attribute_by_name(node, "epsilon", 1e-5f); - BatchNormParamSpec bnPs; - bnPs.eps = epsilon; - bnPs.axis = 1; - bnPs.gama = 1; - bnPs.momentum = get_node_float_attribute_by_name(node, 
"momentum", 0.9); - curPs.bn_spec = bnPs; - return curPs; - } - - ParameterSpec adapt_Eltwise() override - { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - EltwiseParamSpec eps; - initialization_zero(&eps, sizeof(eps)); - if (op == "Add") { - eps.elt_mode = ELTWISE_SUM; - EltwiseSumSpec elt_sum_spec; - initialization_zero(&elt_sum_spec, sizeof(elt_sum_spec)); - elt_sum_spec.coeff_size = 2; - F32* f_ptr = (F32*)mt_new_storage(elt_sum_spec.coeff_size * sizeof(float)); - for (I32 j = 0; j < elt_sum_spec.coeff_size; j++) { - f_ptr[j] = 1.0; - } - elt_sum_spec.coeff_values = f_ptr; - eps.elt_sum_spec = elt_sum_spec; - } else if (op == "Mul") { - eps.elt_mode = ELTWISE_PROD; - } else { - CHECK_STATUS(NOT_IMPLEMENTED); - } - curPs.eltwise_spec = eps; - return curPs; - } - - void handle_Constant() - { - for (int i = 0; i < node.attribute_size(); i++) { - const onnx::AttributeProto& attribute = node.attribute(i); - if (attribute.name() == "value") { - CHECK_REQUIREMENT(4 == attribute.type()); - const onnx::TensorProto& tp = attribute.t(); - weights[node.output(0)] = tp; - break; - } - } - } - - ParameterSpec adapt_Pad() override - { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - PadParamSpec padPs; - PadMode padMode; - std::string padModeStr = get_node_str_attribute_by_name(node, "mode"); - std::vector padVec = get_node_vector_ints_attribute_by_name(node, "pads"); - F32 padValue = get_node_float_attribute_by_name(node, "value", 0.f); - - if (padModeStr == "constant") { - padMode = Pad_Constant; - } else if (padModeStr == "edge") { - padMode = Pad_Edge; - } else if (padModeStr == "reflect") { - padMode = Pad_Reflect; - } else { - std::cerr << "[ERROR] unknown pad mode: " << padModeStr << std::endl; - exit(1); - } - - U32 padSize = padVec.size(); - if (padSize == 8) { // NCHW - padPs.top = padVec[2]; - padPs.left = padVec[3]; - padPs.bottom = padVec[6]; - padPs.right = padVec[7]; - } else if (padSize == 6) { // NCH - padPs.top = padVec[2]; - padPs.left = 0; - padPs.bottom = padVec[5]; - padPs.right = 0; - } else if (padSize == 4) { // HW - padPs.top = padVec[0]; - padPs.left = padVec[1]; - padPs.bottom = padVec[2]; - padPs.right = padVec[3]; - } else { - std::cerr << "[ERROR] unsupported pad length" << std::endl; - exit(1); - } - padPs.constant_value = padValue; - padPs.pad_mode = padMode; - curPs.pad_spec = padPs; - return curPs; - } - - ParameterSpec adapt_Gather() override - { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - GatherParamSpec gps; - int gatherAxis = get_node_single_int_attribute_by_name(node, "axis", 0); - gps.gather_axis = gatherAxis; - curPs.gather_spec = gps; - return curPs; - } - - ParameterSpec adapt_Squeeze() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - SqueezeParamSpec squeezePs; - std::vector squeezeAxes = get_node_vector_ints_attribute_by_name(node, "axes"); - squeezePs.axes_num = squeezeAxes.size(); - squeezePs.axis = -1 * squeezeAxes.size(); - for (int squeeze_i = 0; squeeze_i < (int)squeezeAxes.size(); squeeze_i++) { - squeezePs.squeeze_axes[squeeze_i] = squeezeAxes[squeeze_i]; - } - curPs.squeeze_spec = squeezePs; - return curPs; - } - - ParameterSpec adapt_Unsqueeze() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - UnsqueezeParamSpec unsqueezePs; - std::vector unsqueezeAxes = get_node_vector_ints_attribute_by_name(node, "axes"); - unsqueezePs.axes_num = unsqueezeAxes.size(); - unsqueezePs.axis = -1 * unsqueezeAxes.size(); - for 
(int unsqueeze_i = 0; unsqueeze_i < (int)unsqueezeAxes.size(); unsqueeze_i++) { - unsqueezePs.unsqueeze_axes[unsqueeze_i] = unsqueezeAxes[unsqueeze_i]; - } - curPs.unsqueeze_spec = unsqueezePs; - return curPs; - } - - ParameterSpec adapt_Cast() override - { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - CastParamSpec castPs; - int castTo = get_node_single_int_attribute_by_name(node, "to", 0); - castPs.cast_to = castTo; - curPs.cast_spec = castPs; - return curPs; - } - - ParameterSpec adapt_Concat() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - ConcatParamSpec concatPs; - concatPs.axis = get_node_single_int_attribute_by_name(node, "axis", 1); - curPs.concat_spec = concatPs; - return curPs; - } - - ParameterSpec adapt_Softmax() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - SoftmaxParamSpec softmaxPs; - softmaxPs.axis = get_node_single_int_attribute_by_name(node, "axis", 1); - curPs.softmax_spec = softmaxPs; - return curPs; - } - - ParameterSpec adapt_Relu() override { - ParameterSpec curPs; - initialization_zero(&curPs, sizeof(curPs)); - ReLUParamSpec reluPs; - reluPs.neg_slope = get_node_float_attribute_by_name(node, "alpha", 0.0); - curPs.relu_spec = reluPs; - return curPs; - } - -private: - std::string op; // op type - std::string modelName; - int removePreprocessOpNum; - TensorDesc inputDesc; - onnx::ModelProto onnxModel; - onnx::GraphProto onnxGraph; - onnx::NodeProto node; - std::map weights; - int nodeIndex; - std::vector weightOpIndexLists; - int opFinalInputNum; -}; -#endif diff --git a/model-tools/src/tflite/CMakeLists.txt b/model-tools/src/tflite/CMakeLists.txt deleted file mode 100644 index f84431fc..00000000 --- a/model-tools/src/tflite/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) - -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) - -include_directories(../) - -# shared library -ADD_LIBRARY(${PROJECT_NAME}_tflite SHARED ${srcs}) - -# static library -ADD_LIBRARY(${PROJECT_NAME}_tflite_static STATIC ${srcs}) - -SET_TARGET_PROPERTIES(${PROJECT_NAME}_tflite_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}_tflite") -SET_TARGET_PROPERTIES(${PROJECT_NAME}_tflite PROPERTIES CLEAN_DIRECT_OUTPUT 1) -SET_TARGET_PROPERTIES(${PROJECT_NAME}_tflite_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) diff --git a/model-tools/src/tflite/tflite_adaptee.h b/model-tools/src/tflite/tflite_adaptee.h deleted file mode 100644 index 17f984e0..00000000 --- a/model-tools/src/tflite/tflite_adaptee.h +++ /dev/null @@ -1,792 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include -#include -#include -#include -#include -#include -#include -#include "type.h" -#include "converter.h" -#include "model_serialize_deserialize.hpp" -#include "model_tools.h" -#include "model_adaptee.h" - -class TfliteAdaptee: public ModelAdaptee { -public: - TfliteAdaptee() {} - ~TfliteAdaptee() {} - -protected: - OperatorType convert_tflite_type(tflite::BuiltinOperator tfliteType) - { - if (tfliteType == tflite::BuiltinOperator_ADD) { - if (tfliteModelBuffer[tensors[ops[curIndex]->inputs[1]]->buffer]->data.size() > 0) { - return OT_Scale; - } else { - return OT_Eltwise; - } - } else if (tfliteType == tflite::BuiltinOperator_CONCATENATION) { - return OT_Concat; - } else if (tfliteType == tflite::BuiltinOperator_CONV_2D) { - return OT_Conv; - } else if (tfliteType == tflite::BuiltinOperator_DEPTHWISE_CONV_2D) { - return OT_Conv; - } else if (tfliteType == tflite::BuiltinOperator_LOGISTIC) { - return OT_Sigmoid; - } else if (tfliteType == tflite::BuiltinOperator_MAX_POOL_2D) { - return OT_Pooling; - } else if (tfliteType == tflite::BuiltinOperator_AVERAGE_POOL_2D) { - return OT_Pooling; - } else if (tfliteType == tflite::BuiltinOperator_RESHAPE) { - return OT_Reshape; - } else if (tfliteType == tflite::BuiltinOperator_RESIZE_BILINEAR) { - return OT_Resize; - } else if (tfliteType == tflite::BuiltinOperator_SOFTMAX) { - return OT_Softmax; - } else if (tfliteType == tflite::BuiltinOperator_FULLY_CONNECTED) { - if (tfliteModelBuffer[tensors[ops[curIndex]->inputs[1]]->buffer]->data.size() > 0) { - return OT_FC; - } else { - return OT_MatMul; - } - } else if (tfliteType == tflite::BuiltinOperator_TRANSPOSE) { - return OT_Transpose; - } else if (tfliteType == tflite::BuiltinOperator_SLICE) { - return OT_Slice; - } else if (tfliteType == tflite::BuiltinOperator_PACK) { - return OT_Concat; - } else if (tfliteType == tflite::BuiltinOperator_MUL) { - if (tfliteModelBuffer[tensors[ops[curIndex]->inputs[1]]->buffer]->data.size() > 0) { - return OT_Scale; - } else { - return OT_Eltwise; - } - } else if (tfliteType == tflite::BuiltinOperator_DIV) { - if (tfliteModelBuffer[tensors[ops[curIndex]->inputs[1]]->buffer]->data.size() > 0) { - return OT_Scale; - } else { - return OT_Eltwise; - } - } else if (tfliteType == tflite::BuiltinOperator_SUB) { - if (tfliteModelBuffer[tensors[ops[curIndex]->inputs[1]]->buffer]->data.size() > 0) { - return OT_Scale; - } else { - return OT_Eltwise; - } - } else if (tfliteType == tflite::BuiltinOperator_RELU6) { - return OT_Relu6; - } else if (tfliteType == tflite::BuiltinOperator_TANH) { - return OT_TanH; - } else { - std::cerr << "[ERROR] tflite op " << tfliteType << " not implemented yet" << std::endl; - return OT_None; - } - } - - EE parse_file(std::string dir, std::string mfn) override - { - EE ret = SUCCESS; - std::string tfliteSuffix = ".tflite"; - - this->modelName = mfn; - - std::string model_name = dir + "/" + mfn + tfliteSuffix; - std::ifstream inputFile(model_name.c_str(), std::ios::binary); - inputFile.seekg(0, std::ios::end); - const auto size = inputFile.tellg(); - inputFile.seekg(0, std::ios::beg); - - char* buffer = new char[size]; - inputFile.read(buffer, size); - inputFile.close(); - - flatbuffers::Verifier verify((uint8_t*)buffer, size); - 
CHECK_REQUIREMENT(tflite::VerifyModelBuffer(verify)); - - auto tfliteModel = tflite::UnPackModel(buffer); - - tfliteOpSet.clear(); - for (int i = 0; i < (int)(tfliteModel->operator_codes).size(); i++) { - tfliteOpSet.push_back(std::move((tfliteModel->operator_codes)[i])); - } - - const auto subGraphsSize = tfliteModel->subgraphs.size(); - CHECK_REQUIREMENT(subGraphsSize == 1); - - tfliteModelBuffer.clear(); - for (int i = 0; i < (int)(tfliteModel->buffers).size(); i++) { - tfliteModelBuffer.push_back(std::move((tfliteModel->buffers)[i])); - } - - if (subGraphsSize != 1) { - CHECK_STATUS(NOT_SUPPORTED); - } - - ops.clear(); - for (int i=0; i < (int)(tfliteModel->subgraphs[0]->operators).size(); i++) { - ops.push_back(std::move((tfliteModel->subgraphs[0]->operators)[i])); - } - - tensors.clear(); - for (int i=0; i < (int)(tfliteModel->subgraphs[0]->tensors).size(); i++) { - tensors.push_back(std::move((tfliteModel->subgraphs[0]->tensors)[i])); - } - - inputs.clear(); - for (int i=0; i < (int)(tfliteModel->subgraphs[0]->inputs).size(); i++) { - inputs.push_back(std::move((tfliteModel->subgraphs[0]->inputs)[i])); - } - - outputs.clear(); - for (int i=0; i < (int)(tfliteModel->subgraphs[0]->outputs).size(); i++) { - outputs.push_back(std::move((tfliteModel->subgraphs[0]->outputs)[i])); - } - - return ret; - } - - EE adapt_operators(ModelSpec* ms) override - { - EE ret = SUCCESS; - str_copy(ms->model_name, modelName.c_str(), modelName.length()); - ms->model_name[NAME_LEN - 1] = '\0'; - ms->dt = DT_F32; - - int opNums = ops.size(); - - ms->num_inputs = inputs.size(); - ms->input_names = (I8**)mt_new_storage(ms->num_inputs * sizeof(I8*)); - ms->input_dims = (TensorDesc*)mt_new_storage(sizeof(TensorDesc) * ms->num_inputs); - for (I32 i = 0; i < ms->num_inputs; i++) { - const int inputIdx = inputs[i]; - const auto& inputTensor = tensors[inputIdx]; - const auto& inputShape = inputTensor->shape; - ms->input_names[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - str_copy(ms->input_names[i], (inputTensor->name).c_str(), (inputTensor->name).length()); - switch (inputShape.size()) { - case 2: { - ms->input_dims[i] = tensor2df(DT_F32, DF_NORMAL, - inputShape[0], - inputShape[1]); - break; - } - case 3: { - ms->input_dims[i] = tensor3df(DT_F32, DF_MTK, - inputShape[0], - inputShape[1], - inputShape[2]); - break; - } - case 4: { - ms->input_dims[i] = tensor4df(DT_F32, DF_NCHW, - inputShape[0], - inputShape[3], - inputShape[1], - inputShape[2]); - break; - } - default: { - CHECK_STATUS(NOT_IMPLEMENTED); - } - } - } - ms->num_outputs = outputs.size(); - ms->output_names = (I8**)mt_new_storage(ms->num_outputs * sizeof(I8*)); - for (I32 i = 0; i < ms->num_outputs; i++) { - const int outputIdx = outputs[i]; - const auto& outputTensor = tensors[outputIdx]; - ms->output_names[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); - str_copy(ms->output_names[i], (outputTensor->name).c_str(), (outputTensor->name).length()); - } - - ms->num_operator_specs = opNums; - opsPtr = (OperatorSpec*)mt_new_storage(sizeof(OperatorSpec) * ms->num_operator_specs); - ms->ops = opsPtr; - for (I32 i = 0; i < ms->num_operator_specs; i++) { - ms->ops[i].tensor_positions = nullptr; - ms->ops[i].num_quant_feature = 0; - ms->ops[i].feature_scale = nullptr; - } - - for (int j = 0; j < ms->num_operator_specs; j++) { - std::string curOpName = "op" + std::to_string(j); - str_copy(opsPtr[j].name, curOpName.c_str(), curOpName.length()); - curIndex = j; - const int opcodeIndex = ops[j]->opcode_index; - this->opCode = 
tfliteOpSet[opcodeIndex]->builtin_code; - OperatorType opType = convert_tflite_type(opCode); - opsPtr[j].type = opType; - int opInputTensorSize = (modifiedInputsOp.count(opType) == 0) ? ops[j]->inputs.size() : modifiedInputsOp[opType]; - int opOutputTensorSize = (modifiedOutputsOp.count(opType) == 0) ? ops[j]->outputs.size() : modifiedOutputsOp[opType]; - opsPtr[j].num_inputs = opInputTensorSize; - opsPtr[j].input_tensors_name = (I8**)mt_new_storage(opsPtr[j].num_inputs * sizeof(I8*)); - for (int iter = 0; iter < opInputTensorSize; iter++) { - const int inIndex = ops[j]->inputs[iter]; - const auto& inTensor = tensors[inIndex]; - opsPtr[j].input_tensors_name[iter] = (I8*)mt_new_storage(NAME_LEN * sizeof(I8)); - str_copy(opsPtr[j].input_tensors_name[iter], (inTensor->name).c_str(), (inTensor->name).length()); - } - opsPtr[j].num_outputs = opOutputTensorSize; - opsPtr[j].output_tensors_name = (I8**)mt_new_storage(opsPtr[j].num_outputs * sizeof(I8*)); - for (int iter = 0; iter < opOutputTensorSize; iter++) { - const int outIndex = ops[j]->outputs[iter]; - const auto& outTensor = tensors[outIndex]; - opsPtr[j].output_tensors_name[iter] = (I8*)mt_new_storage(NAME_LEN * sizeof(I8)); - std::string outputName; - if (opType == OT_Slice) { - if (1 != iter) { - outputName = "slice_other_" + std::to_string(j) + "_" + std::to_string(iter); - } else { - const int trueIndex = ops[j]->outputs[0]; - const auto& out = tensors[trueIndex]; - outputName = out->name; - } - } else { - outputName = outTensor->name; - } - str_copy(opsPtr[j].output_tensors_name[iter], outputName.c_str(), outputName.length()); - } - - ParameterSpec curPs; - ret = adapt_operator(opType, &curPs); - opsPtr[j].ps = curPs; - } - - ms->num_weight_specs = modelWeightOpNum; - return ret; - } - - void from_nhwc_to_nchw(TensorDesc desc, F32* src, F32* dst) - { - DataType dt; - DataFormat df; - U32 n, c, h, w; - CHECK_STATUS(tensor4dGet(desc, &dt, &df, &n, &c, &h, &w)); - CHECK_REQUIREMENT(DF_NHWC == df); - - if (1 == h && 1 == w) { - memcpy(dst, src, tensorNumBytes(desc)); - } else { - for (U32 o = 0; o < n; o++) { - for (U32 hw = 0; hw < h * w; hw++) { - for (U32 cc = 0; cc < c; cc++) { - dst[o*c*h*w + cc*h*w + hw] = src[o*h*w*c + hw*c + cc]; - } - } - } - } - } - - EE adapt_weights(ModelSpec* ms) override - { - WeightSpec* wsPtr = (WeightSpec*)mt_new_storage(sizeof(WeightSpec) * ms->num_weight_specs); - for (int j = 0; j < ms->num_weight_specs; j++) { - wsPtr[j].num_quant_scale = 0; - wsPtr[j].weight_scale = nullptr; - } - ms->ws = wsPtr; - int weightMovIndex = 0; - for (int j = 0; j < ms->num_operator_specs; j++) { - std::string curOpName = "op" + std::to_string(j); - curIndex = j; - const int opcodeIndex = ops[j]->opcode_index; - opCode = tfliteOpSet[opcodeIndex]->builtin_code; - - if (opCode == tflite::BuiltinOperator_CONV_2D || opCode == tflite::BuiltinOperator_DEPTHWISE_CONV_2D) { - str_copy(wsPtr[weightMovIndex].op_name, curOpName.c_str(), curOpName.length()); - wsPtr[weightMovIndex].mdt = DT_F32; - // input 2/3: input/weight/bias - const int weightIndex = ops[j]->inputs[1]; - const auto& weightTensor = tensors[weightIndex]; - auto conv2DWeightPtr = reinterpret_cast(tfliteModelBuffer[weightTensor->buffer]->data.data()); - const auto& weightShape = weightTensor->shape; - CHECK_REQUIREMENT(weightShape.size() == 4); - const int conv2d_co = weightShape[0]; - const int conv2d_kh = weightShape[1]; - const int conv2d_kw = weightShape[2]; - const int conv2d_ci = weightShape[3]; - wsPtr[weightMovIndex].bytes_of_weight = conv2d_co * conv2d_kh * 
conv2d_kw * conv2d_ci * sizeof(float); - wsPtr[weightMovIndex].weight = (U8*)mt_new_storage(wsPtr[weightMovIndex].bytes_of_weight); - TensorDesc weightDesc = tensor4df(DT_F32, DF_NHWC, conv2d_co, conv2d_ci, conv2d_kh, conv2d_kw); - from_nhwc_to_nchw(weightDesc, (F32*)conv2DWeightPtr, (F32*)(wsPtr[weightMovIndex].weight)); - - if (ops[j]->inputs.size() == 3) { - const int biasIndex = ops[j]->inputs[2]; - const auto& biasTensor = tensors[biasIndex]; - auto conv2DBiasPtr = reinterpret_cast(tfliteModelBuffer[biasTensor->buffer]->data.data()); - if (opCode == tflite::BuiltinOperator_CONV_2D) { - wsPtr[weightMovIndex].bytes_of_vec = conv2d_co * sizeof(float); - } else { - wsPtr[weightMovIndex].bytes_of_vec = conv2d_ci * sizeof(float); - } - wsPtr[weightMovIndex].vec = (U8*)mt_new_storage(wsPtr[weightMovIndex].bytes_of_vec); - memcpy(wsPtr[weightMovIndex].vec, conv2DBiasPtr, wsPtr[weightMovIndex].bytes_of_vec); - } else { - wsPtr[weightMovIndex].bytes_of_vec = 0; - wsPtr[weightMovIndex].vec = nullptr; - } - weightMovIndex++; - } else if (OT_Scale == ms->ops[j].type) { - str_copy(wsPtr[weightMovIndex].op_name, curOpName.c_str(), curOpName.length()); - wsPtr[weightMovIndex].mdt = DT_F32; - switch (opCode) { - case tflite::BuiltinOperator_ADD: { - wsPtr[weightMovIndex].bytes_of_weight = 0; - wsPtr[weightMovIndex].weight = nullptr; - - const int biasIndex = ops[j]->inputs[1]; - const auto& biasTensor = tensors[biasIndex]; - auto biasPtr = reinterpret_cast(tfliteModelBuffer[biasTensor->buffer]->data.data()); - wsPtr[weightMovIndex].bytes_of_vec = tfliteModelBuffer[biasTensor->buffer]->data.size(); - wsPtr[weightMovIndex].vec = (U8*)mt_new_storage(wsPtr[weightMovIndex].bytes_of_vec); - memcpy(wsPtr[weightMovIndex].vec, biasPtr, wsPtr[weightMovIndex].bytes_of_vec); - break; - } - case tflite::BuiltinOperator_SUB: { - wsPtr[weightMovIndex].bytes_of_weight = 0; - wsPtr[weightMovIndex].weight = nullptr; - - const int biasIndex = ops[j]->inputs[1]; - const auto& biasTensor = tensors[biasIndex]; - auto biasPtr = reinterpret_cast(tfliteModelBuffer[biasTensor->buffer]->data.data()); - int size = tfliteModelBuffer[biasTensor->buffer]->data.size() / sizeof(float); - wsPtr[weightMovIndex].bytes_of_vec = size * sizeof(float); - wsPtr[weightMovIndex].vec = (U8*)mt_new_storage(wsPtr[weightMovIndex].bytes_of_vec); - F32 *ptr = (F32*)wsPtr[weightMovIndex].vec; - for (int k = 0; k < size; k++) { - ptr[k] = -1 * biasPtr[k]; - } - break; - } - case tflite::BuiltinOperator_MUL: { - const int scaleIndex = ops[j]->inputs[1]; - const auto& scaleTensor = tensors[scaleIndex]; - auto scalePtr = reinterpret_cast(tfliteModelBuffer[scaleTensor->buffer]->data.data()); - wsPtr[weightMovIndex].bytes_of_weight = tfliteModelBuffer[scaleTensor->buffer]->data.size(); - wsPtr[weightMovIndex].weight = (U8*)mt_new_storage(wsPtr[weightMovIndex].bytes_of_weight); - memcpy(wsPtr[weightMovIndex].weight, scalePtr, wsPtr[weightMovIndex].bytes_of_weight); - - wsPtr[weightMovIndex].bytes_of_vec = 0; - wsPtr[weightMovIndex].vec = nullptr; - break; - } - case tflite::BuiltinOperator_DIV: { - const int scaleIndex = ops[j]->inputs[1]; - const auto& scaleTensor = tensors[scaleIndex]; - auto scalePtr = reinterpret_cast(tfliteModelBuffer[scaleTensor->buffer]->data.data()); - int size = tfliteModelBuffer[scaleTensor->buffer]->data.size() / sizeof(float); - wsPtr[weightMovIndex].bytes_of_weight = size * sizeof(float); - wsPtr[weightMovIndex].weight = (U8*)mt_new_storage(wsPtr[weightMovIndex].bytes_of_weight); - F32 *ptr = (F32*)wsPtr[weightMovIndex].weight; 
- for (int k = 0; k < size; k++) { - ptr[k] = 1 / scalePtr[k]; - } - - wsPtr[weightMovIndex].bytes_of_vec = 0; - wsPtr[weightMovIndex].vec = nullptr; - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - weightMovIndex++; - } - } - return SUCCESS; - } - - ParameterSpec adapt_Eltwise() override - { - ParameterSpec curPs; - EltwiseParamSpec eltPs; - if (opCode == tflite::BuiltinOperator_ADD) { - eltPs.elt_mode = ELTWISE_SUM; - EltwiseSumSpec elt_sum_spec; - elt_sum_spec.coeff_size = 2; - F32* f_ptr = (F32*)mt_new_storage(elt_sum_spec.coeff_size * sizeof(float)); - for (I32 j = 0; j < elt_sum_spec.coeff_size; j++) { - f_ptr[j] = 1.0; - } - elt_sum_spec.coeff_values = f_ptr; - eltPs.elt_sum_spec = elt_sum_spec; - } else if (opCode == tflite::BuiltinOperator_MAXIMUM) { - eltPs.elt_mode = ELTWISE_MAX; - } else if (opCode == tflite::BuiltinOperator_MUL) { - eltPs.elt_mode = ELTWISE_PROD; - } else { - CHECK_STATUS(NOT_IMPLEMENTED); - } - curPs.eltwise_spec = eltPs; - return curPs; - } - - ParameterSpec adapt_Scale() override - { - ParameterSpec curPs; - modelWeightOpNum++; - ScaleParamSpec scalePs; - scalePs.axis = 0; - curPs.scale_spec = scalePs; - return curPs; - } - - ParameterSpec adapt_Conv() override - { - ParameterSpec curPs; - modelWeightOpNum++; - const int weightIndex = ops[curIndex]->inputs[1]; - const auto& weightTensor = tensors[weightIndex]; - - const auto& weightShape = weightTensor->shape; - CHECK_REQUIREMENT(weightShape.size() == 4); - - ConvolutionParamSpec convPs; - convPs.kernel_size_h = weightShape[1]; - convPs.kernel_size_w = weightShape[2]; - - if (opCode == tflite::BuiltinOperator_CONV_2D) { - convPs.num_outputs = weightShape[0]; - - const auto& tfliteConvOption = ops[curIndex]->builtin_options.AsConv2DOptions(); - convPs.dilatedRate_h = tfliteConvOption->dilation_h_factor; - convPs.dilatedRate_w = tfliteConvOption->dilation_w_factor; - convPs.stride_h = tfliteConvOption->stride_h; - convPs.stride_w = tfliteConvOption->stride_w; - const auto activationFunc = tfliteConvOption->fused_activation_function; - - if (1 == tfliteConvOption->padding) { // VALID - convPs.padding_top = 0; - convPs.padding_bottom = 0; - convPs.padding_left = 0; - convPs.padding_right = 0; - } else { // SAME - convPs.padding_top = (convPs.kernel_size_h - 1) / 2; - convPs.padding_bottom = (convPs.kernel_size_h - 1) / 2; - convPs.padding_left = (convPs.kernel_size_w - 1) / 2; - convPs.padding_right = (convPs.kernel_size_w - 1) / 2; - } - - convPs.group = 1; - - convPs.dw_activation_type = ACTIVATION_NULL; - convPs.pw_activation_type = ACTIVATION_NULL; - - if (convPs.dilatedRate_h > 1 || convPs.dilatedRate_w > 1) { - convPs.convolution_type = Convolution_Dilation; - } else { - convPs.convolution_type = Convolution_Pointwise; - } - if (activationFunc == tflite::ActivationFunctionType_RELU) { - convPs.pw_activation_type = ACTIVATION_RELU; - } else if (activationFunc == tflite::ActivationFunctionType_RELU6) { - convPs.pw_activation_type = ACTIVATION_RELU6; - } else if (activationFunc != tflite::ActivationFunctionType_NONE) { - std::cout << "[ERROR] tflite activation " << activationFunc << " not merged with conv yet\n"; - CHECK_STATUS(NOT_IMPLEMENTED); - } - } else if (opCode == tflite::BuiltinOperator_DEPTHWISE_CONV_2D) { - convPs.num_outputs = weightShape[3]; - - const auto& tfliteConvOption = ops[curIndex]->builtin_options.AsDepthwiseConv2DOptions(); - convPs.dilatedRate_h = tfliteConvOption->dilation_h_factor; - convPs.dilatedRate_w = tfliteConvOption->dilation_w_factor; - convPs.stride_h = 
tfliteConvOption->stride_h; - convPs.stride_w = tfliteConvOption->stride_w; - const auto activationFunc = tfliteConvOption->fused_activation_function; - - if (1 == tfliteConvOption->padding) { // VALID - convPs.padding_top = 0; - convPs.padding_bottom = 0; - convPs.padding_left = 0; - convPs.padding_right = 0; - } else { // SAME - convPs.padding_top = (convPs.kernel_size_h - 1) / 2; - convPs.padding_bottom = (convPs.kernel_size_h - 1) / 2; - convPs.padding_left = (convPs.kernel_size_w - 1) / 2; - convPs.padding_right = (convPs.kernel_size_w - 1) / 2; - } - - convPs.group = convPs.num_outputs; - - convPs.dw_activation_type = ACTIVATION_NULL; - convPs.pw_activation_type = ACTIVATION_NULL; - - convPs.convolution_type = Convolution_Depthwise; - if (activationFunc == tflite::ActivationFunctionType_RELU) { - convPs.dw_activation_type = ACTIVATION_RELU; - } else if (activationFunc == tflite::ActivationFunctionType_RELU6) { - convPs.dw_activation_type = ACTIVATION_RELU6; - } else if (activationFunc != tflite::ActivationFunctionType_NONE) { - std::cout << "[ERROR] tflite activation " << activationFunc << " not merged with depthwise conv yet\n"; - CHECK_STATUS(NOT_IMPLEMENTED); - } - } else { - CHECK_STATUS(NOT_SUPPORTED); - } - curPs.conv_spec = convPs; - return curPs; - } - - ParameterSpec adapt_Pooling() override - { - ParameterSpec curPs; - const auto& tflitePoolOption = ops[curIndex]->builtin_options.AsPool2DOptions(); - PoolingParamSpec poolingPs; - poolingPs.kernel_size_h = tflitePoolOption->filter_height; - poolingPs.kernel_size_w = tflitePoolOption->filter_width; - poolingPs.stride_h = tflitePoolOption->stride_h; - poolingPs.stride_w = tflitePoolOption->stride_w; - // TODO: padding support - poolingPs.padding_top = 0; - poolingPs.padding_bottom = 0; - poolingPs.padding_left = 0; - poolingPs.padding_right = 0; - poolingPs.rm = CEIL; - if (opCode == tflite::BuiltinOperator_MAX_POOL_2D) { - poolingPs.mode = POOLING_MAX; - } else if (opCode == tflite::BuiltinOperator_AVERAGE_POOL_2D) { - poolingPs.mode = POOLING_MEAN; - } - curPs.pooling_spec = poolingPs; - return curPs; - } - - ParameterSpec adapt_Reshape() override - { - ParameterSpec curPs; - const auto& shapeTensor = tensors[ops[curIndex]->inputs[1]]; - const auto& shapeData = tfliteModelBuffer[shapeTensor->buffer]->data; - CHECK_REQUIREMENT((shapeTensor->shape[0]) == (int)(shapeData.size() / sizeof(int))); - - ReshapeParamSpec reshapePs; - reshapePs.shape_size = shapeTensor->shape[0]; - - auto reshapeDimPtr = reinterpret_cast(shapeData.data()); - std::vector reshapeDim(reshapeDimPtr, reshapeDimPtr + reshapePs.shape_size); - - const auto& inputTensor = tensors[ops[curIndex]->inputs[0]]; - const auto& inputShape = inputTensor->shape; - if ((U32)reshapePs.shape_size < inputShape.size() && - 4 == inputShape.size() && - (1 != inputShape[1] || 1 != inputShape[2]) ) { - opsPtr[curIndex].type = OT_Transpose; - TransposeParamSpec transposePs; - transposePs.trans_size = 4; - bool taken[4] = {false}; - U32 i; - for (i = 0; i < reshapeDim.size(); i++) { - for (U32 j = 0; j < inputShape.size(); j++) { - if (inputShape[j] == reshapeDim[i]) { - taken[j] = true; - transposePs.trans_dims[i] = j; - break; - } - } - } - for (U32 j = 0; j < 4; j++) { - if (!taken[j]) { - transposePs.trans_dims[i] = j; - i++; - taken[j] = true; - } - } - curPs.transpose_spec = transposePs; - } else { - if (4 == reshapeDim.size()) { - reshapePs.shape_dims[0] = reshapeDim[0]; - reshapePs.shape_dims[1] = reshapeDim[3]; - reshapePs.shape_dims[2] = reshapeDim[1]; - 
reshapePs.shape_dims[3] = reshapeDim[2]; - } else { - for (int iter = 0; iter < (int)reshapeDim.size() ; iter++) { - reshapePs.shape_dims[iter] = reshapeDim[iter]; - } - } - curPs.reshape_spec = reshapePs; - } - return curPs; - } - - ParameterSpec adapt_Transpose() override - { - ParameterSpec curPs; - TransposeParamSpec transPs; - const auto& dimsTensor = tensors[ops[curIndex]->inputs[1]]; - const auto& dimsData = tfliteModelBuffer[dimsTensor->buffer]->data; - CHECK_REQUIREMENT((dimsTensor->shape[0]) == (int)(dimsData.size() / sizeof(int))); - transPs.trans_size = dimsTensor->shape[0]; - auto dims = reinterpret_cast(dimsData.data()); - for (U32 i = 0; i < transPs.trans_size; i++) - transPs.trans_dims[i] = dims[i]; - curPs.transpose_spec = transPs; - std::cout << "[INFO] tflite operator transpose dims: "; - for (U32 i = 0; i < transPs.trans_size; i++) - std::cout << transPs.trans_dims[i] << " "; - std::cout << std::endl; - return curPs; - } - - ParameterSpec adapt_Slice() override - { - // TODO: Tensorflow slice is not similar with Caffe - // currently only support one axis slice - ParameterSpec curPs; - SliceParamSpec slicePs; - const auto& inputShape = tensors[ops[curIndex]->inputs[0]]->shape; - const auto& beginTensor = tensors[ops[curIndex]->inputs[1]]; - auto beginData = reinterpret_cast((tfliteModelBuffer[beginTensor->buffer]->data).data()); - const auto& sizeTensor = tensors[ops[curIndex]->inputs[2]]; - auto sizeData = reinterpret_cast((tfliteModelBuffer[sizeTensor->buffer]->data).data()); - I32 axis = INT_MIN; - for (I32 i = 0; i < beginTensor->shape[0]; i++) { - if (! (beginData[i] == 0 && (sizeData[i] == -1 || sizeData[i] == inputShape[i]))) { - if (axis != INT_MIN) { - std::cerr << "[ERROR] currently not support multi axis slice" << std::endl; - exit(1); - } else { - axis = i; - } - } - } - slicePs.axis = axis; - slicePs.slice_size = 2; - slicePs.slice_points[0] = beginData[axis]; - I32 size = sizeData[axis]; - if (size == -1) { - slicePs.slice_points[1] = inputShape[axis]; - } else { - slicePs.slice_points[1] = beginData[axis] + sizeData[axis]; - } - if (4 == inputShape.size()) { - switch (slicePs.axis) { - case 0: - slicePs.axis = 0; - break; - case 1: - slicePs.axis = 2; - break; - case 2: - slicePs.axis = 3; - break; - case 3: - slicePs.axis = 1; - break; - default: - CHECK_STATUS(NOT_SUPPORTED); - } - } - curPs.slice_spec = slicePs; - return curPs; - } - - ParameterSpec adapt_MatMul() override // matrix X matrix - { - ParameterSpec curPs; - MatMulParamSpec matmulPs; - matmulPs.transpose_a = false; - matmulPs.transpose_b = false; - curPs.matmul_spec = matmulPs; - return curPs; - } - - ParameterSpec adapt_Fc() override - { - modelWeightOpNum++; - ParameterSpec curPs; - FullyConnectedParamSpec ips; - const int index = ops[curIndex]->inputs[1]; - const auto& tensor = tensors[index]; - I32 size = tfliteModelBuffer[tensor->buffer]->data.size(); - CHECK_REQUIREMENT(size != 0); - const auto& weightShape = tensor->shape; - ips.num_outputs = weightShape[1]; - ips.num_slices = 1; - ips.slice_point[0] = ips.num_outputs; - curPs.fc_spec = ips; - return curPs; - } - - - ParameterSpec adapt_Concat() override - { - ParameterSpec curPs; - ConcatParamSpec concatPs; - const auto& tfliteConcatOption = ops[curIndex]->builtin_options.AsConcatenationOptions(); - CHECK_REQUIREMENT(tflite::ActivationFunctionType_NONE == tfliteConcatOption->fused_activation_function); - concatPs.axis = tfliteConcatOption->axis; - - const auto& outputTensor = tensors[ops[curIndex]->outputs[0]]; - const auto& 
outputShape = outputTensor->shape; - if (4 == outputShape.size()) { - switch (concatPs.axis) { - case 0: - concatPs.axis = 0; - break; - case 1: - concatPs.axis = 2; - break; - case 2: - concatPs.axis = 3; - break; - case 3: - concatPs.axis = 1; - break; - default: - CHECK_STATUS(NOT_SUPPORTED); - } - } - curPs.concat_spec = concatPs; - return curPs; - } - - ParameterSpec adapt_Softmax() override - { - const auto& tfliteSoftmaxOption = ops[curIndex]->builtin_options.AsSoftmaxOptions(); - CHECK_REQUIREMENT(1 == tfliteSoftmaxOption->beta); - - ParameterSpec curPs; - SoftmaxParamSpec softmaxPs; - softmaxPs.axis = -1; - - const auto& inputTensor = tensors[ops[curIndex]->inputs[0]]; - const auto& inputShape = inputTensor->shape; - if (4 == inputShape.size()) { - softmaxPs.axis = 1; - } - curPs.softmax_spec = softmaxPs; - return curPs; - } - -public: - std::map modifiedInputsOp {{OT_Conv, 1}, {OT_Reshape, 1}, - {OT_Resize, 1}, {OT_Transpose, 1}, {OT_FC, 1}, {OT_Slice, 1}, {OT_Scale, 1}}; - std::map modifiedOutputsOp {{OT_Slice, 3}}; -private: - std::vector> tfliteModelBuffer; - std::vector> tfliteOpSet; - std::vector> ops; - std::vector> tensors; - std::vector inputs; - std::vector outputs; - tflite::BuiltinOperator opCode; - int modelWeightOpNum; - int curIndex; - std::string modelName; - OperatorSpec* opsPtr; -}; diff --git a/model-tools/tools/ms2bolt/CMakeLists.txt b/model-tools/tools/ms2bolt/CMakeLists.txt deleted file mode 100644 index b4503e0b..00000000 --- a/model-tools/tools/ms2bolt/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -project(model-tools C CXX) - -function(b_1_b name) - add_executable(${name} ${name}.cpp) - TARGET_LINK_LIBRARIES(${name} ${PROJECT_NAME}_static) -endfunction() - -b_1_b(caffe2mt) -b_1_b(mt2caffe) - -b_1_b(tensorflow2mt) -b_1_b(mt2tensorflow) diff --git a/model-tools/tools/ms2bolt/fixedMs2bolt.cpp b/model-tools/tools/ms2bolt/fixedMs2bolt.cpp deleted file mode 100644 index 96c7bee5..00000000 --- a/model-tools/tools/ms2bolt/fixedMs2bolt.cpp +++ /dev/null @@ -1,286 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
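Both adapt_Slice and adapt_Concat above remap a TFLite axis from NHWC onto Bolt's NCHW layout with the same switch (0 -> 0, 1 -> 2, 2 -> 3, 3 -> 1). A minimal standalone sketch of that mapping — the helper name nhwc_axis_to_nchw is ours, not part of the source:

    #include <stdexcept>

    // Maps an axis index of a 4-D NHWC tensor to the corresponding NCHW axis:
    // N stays at 0, H (1) moves to 2, W (2) moves to 3, C (3) moves to 1.
    static int nhwc_axis_to_nchw(int axis)
    {
        switch (axis) {
            case 0: return 0;  // N -> N
            case 1: return 2;  // H -> position 2 in NCHW
            case 2: return 3;  // W -> position 3 in NCHW
            case 3: return 1;  // C -> position 1 in NCHW
            default: throw std::invalid_argument("axis must be in [0, 3]");
        }
    }

Factoring the switch into one helper like this would also keep the slice and concat adapters from drifting apart.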
- - -#include -#include -#include -#include "model_tools.h" -#include "model_serialize_deserialize.hpp" -#include "converter.h" - -int main() -{ - ModelSpec fixedMs; - - std::string modelName = "leNet"; - DataType dt = DT_F32; - str_copy(fixedMs.modelName, modelName.c_str(), NAME_LEN); - fixedMs.dt = dt; - std::cout << "group1 " << std::endl; - - int numInputs = 1; - char* inputNames = "data"; - std::cout << "inputNames[0] " << inputNames[0] << std::endl; - fixedMs.inputNames = {"data"}; - - std::cout << "memcpy success " << std::endl; - fixedMs.input_dims = (TensorDesc*)mt_new_storage(sizeof(TensorDesc) * numInputs); - fixedMs.input_dims[0].dt = DT_F32; - fixedMs.input_dims[0].df = DF_NCHW; - fixedMs.input_dims[0].nDims = 4; - fixedMs.input_dims[0].dims[0] = 64; - fixedMs.input_dims[0].dims[1] = 1; - fixedMs.input_dims[0].dims[2] = 28; - fixedMs.input_dims[0].dims[3] = 28; - fixedMs.numInputs = numInputs; - std::cout << "group2 " << std::endl; - - int numOutputs = 1; - fixedMs.output_names = {"prob"}; - fixedMs.numOutputs = numOutputs; - std::cout << "group3 " << std::endl; - - int numOperatorSpecs = 7; - OperatorSpec opsArr[7]; - - // set each ops - std::string opName0 = "conv1"; - OperatorType opType0 = OT_Conv; - int opNumInputs0 = 1; - char** opInputTensorsName0 = {"data"}; - int opNumOutputs0 = 1; - char** opOutputTensorsName0 = {"conv1"}; - ConvolutionParamSpec convCps1; - convCps1.num_outputs = 64; - convCps1.kernel_size = 5; - convCps1.stride = 1; - convCps1.padding = 2; - str_copy(opsArr[0].name, opName0.c_str(), opName0.length()); - opsArr[0].type = opType0; - opsArr[0].numInputs = opNumInputs0; - opsArr[0].input_tensors_name = opInputTensorsName0; - opsArr[0].numOutputs = opNumOutputs0; - opsArr[0].output_tensors_name = opOutputTensorsName0; - opsArr[0].ps.conv_spec = convCps1; - - std::string opName1 = "pooling1"; - OperatorType opType1 = OT_Pooling; - int opNumInputs1 = 1; - char** opInputTensorsName1 = {"conv1"}; - int opNumOutputs1 = 1; - char** opOutputTensorsName1 = {"pooling1"}; - PoolingParamSpec poolingPps1; - poolingPps1.kernel_size = 2; - poolingPps1.stride = 2; - // poolingPps1.padding = 0; - poolingPps1.mode = Max; - str_copy(opsArr[1].name, opName1.c_str(), opName1.length()); - opsArr[1].type = opType1; - opsArr[1].numInputs = opNumInputs1; - opsArr[1].input_tensors_name = opInputTensorsName1; - opsArr[1].numOutputs = opNumOutputs1; - opsArr[1].output_tensors_name = opOutputTensorsName1; - opsArr[1].ps.pooling_spec = poolingPps1; - - std::string opName2 = "conv2"; - OperatorType opType2 = OT_Conv; - int opNumInputs2 = 1; - char** opInputTensorsName2 = {"pooling1"}; - int opNumOutputs2 = 1; - char** opOutputTensorsName2 = {"conv2"}; - ConvolutionParamSpec convCps2; - convCps2.num_outputs = 32; - convCps2.kernel_size = 5; - convCps2.stride = 1; - convCps2.padding = 2; - str_copy(opsArr[2].name, opName2.c_str(), opName2.length()); - opsArr[2].type = opType2; - opsArr[2].numInputs = opNumInputs2; - opsArr[2].input_tensors_name = opInputTensorsName2; - opsArr[2].numOutputs = opNumOutputs2; - opsArr[2].output_tensors_name = opOutputTensorsName2; - opsArr[2].ps.conv_spec = convCps2; - - std::string opName3 = "pooling2"; - OperatorType opType3 = OT_Pooling; - int opNumInputs3 = 1; - char** opInputTensorsName3 = {"conv2"}; - int opNumOutputs3 = 1; - char **op_output_tensors_name_3 = {"pooling2"}; - PoolingParamSpec poolingPps2; - poolingPps2.kernel_size = 2; - poolingPps2.stride = 2; - // poolingPps2.padding = 0; // pooling no padding? 
- poolingPps2.mode = Max; - str_copy(opsArr[3].name, opName3.c_str(), opName3.length()); - opsArr[3].type = opType3; - opsArr[3].numInputs = opNumInputs3; - opsArr[3].input_tensors_name = opInputTensorsName3; - opsArr[3].numOutputs = opNumOutputs3; - opsArr[3].output_tensors_name = op_output_tensors_name_3; - opsArr[3].ps.pooling_spec = poolingPps2; - - std::string opName4 = "fc1"; - OperatorType opType4 = OT_FC; - int opNumInputs4 = 1; - char** opInputTensorsName4 = {"pooling2"}; - int opNumOutputs4 = 1; - char** opOutputTensorsName4 = {"fc1"}; - FullyConnectedParamSpec fcps1; - fcps1.numOutputs = 100; - str_copy(opsArr[4].name, opName4.c_str(), opName4.length()); - opsArr[4].type = opType4; - opsArr[4].numInputs = opNumInputs4; - opsArr[4].input_tensors_name = opInputTensorsName4; - opsArr[4].numOutputs = opNumOutputs4; - opsArr[4].output_tensors_name = opOutputTensorsName4; - opsArr[4].ps.fc_spec = fcps1; - - std::string opName5 = "fc2"; - OperatorType opType5 = OT_FC; - int opNumInputs5 = 1; - char** opInputTensorsName5 = {"fc1"}; - int opNumOutputs5 = 1; - char** opOutputTensorsName5 = {"fc2"}; - FullyConnectedParamSpec fcps2; - fcps2.numOutputs = 50; - str_copy(opsArr[5].name, opName5.c_str(), opName5.length()); - opsArr[5].type = opType5; - opsArr[5].numInputs = opNumInputs5; - opsArr[5].input_tensors_name = opInputTensorsName5; - opsArr[5].numOutputs = opNumOutputs5; - opsArr[5].output_tensors_name = opOutputTensorsName5; - opsArr[5].ps.fc_spec = fcps2; - - std::string opName6 = "prob"; - OperatorType opType6 = OT_Softmax; - int opNumInputs6 = 1; - char** opInputTensorsName6 = {"fc2"}; - int opNumOutputs6 = 1; - char** opOutputTensorsName6 = {"prob"}; - str_copy(opsArr[6].name, opName6.c_str(), opName6.length()); - opsArr[6].type = opType6; - opsArr[6].numInputs = opNumInputs6; - opsArr[6].input_tensors_name = opInputTensorsName6; - opsArr[6].numOutputs = opNumOutputs6; - opsArr[6].output_tensors_name = opOutputTensorsName6; - - fixedMs.numOperatorSpecs = numOperatorSpecs; - fixedMs.ops = &opsArr[0]; - std::cout << "group4 " << std::endl; - - // weight op info - I32 numWeightSpecs = 4; - WeightSpec wsArr[4]; - F32 floatValue32 = 1.0; - - // set each ws - std::string weightOpNameConv1 = "conv1"; - DataType mdtConv1 = DT_F32; - U32 bytesOfWeightConv1 = 1*5*5*20*bytesOf(mdtConv1); - F32* conv1WeightPtr = (F32*)mt_new_storage(bytesOfWeightConv1); - for (int i = 0; i < 1*5*5*20; i++) { - conv1WeightPtr[i] = 1.0; - } - U32 bytesOfVecConv1 = 20*bytesOf(mdtConv1); - F32* convVecPtr1 = (F32*)mt_new_storage(bytesOfVecConv1); - for (int i = 0; i < 20; i++) { - convVecPtr1[i] = 1.0; - } - str_copy(wsArr[0].op_name, weightOpNameConv1.c_str(), weightOpNameConv1.length()); - wsArr[0].mdt = mdtConv1; - wsArr[0].bytes_of_weight = bytesOfWeightConv1; - wsArr[0].weight = (U8*)conv1WeightPtr; - wsArr[0].bytes_of_vec = bytesOfVecConv1; - wsArr[0].vec = (U8*)convVecPtr1; - - std::string weightOpNameConv2 = "conv2"; - DataType mdtConv2 = DT_F32; - U32 bytesOfWeightConv2 = 64*5*5*32*bytesOf(mdtConv2); - F32* conv2WeightPtr = (F32*)mt_new_storage(bytesOfWeightConv2); - for (int i = 0; i < 64*5*5*32; i++) { - conv2WeightPtr[i] = 1.0; - } - U32 bytesOfVecConv2 = 32 * bytesOf(mdtConv2); - F32* convVecPtr2 = (F32*)mt_new_storage(bytesOfVecConv2); - for (int i=0; i<32; i++) { - convVecPtr2[i] = 1.0; - } - str_copy(wsArr[1].op_name, weightOpNameConv2.c_str(), weightOpNameConv2.length()); - wsArr[1].mdt = mdtConv2; - wsArr[1].bytes_of_weight = bytesOfWeightConv2; - wsArr[1].weight = (U8*)conv2WeightPtr; - 
wsArr[1].bytes_of_vec = bytesOfVecConv2; - wsArr[1].vec = (U8*)convVecPtr2; - - std::string weightOpNameFc1 = "fc1"; - DataType mdtFc1 = DT_F32; - U32 bytesOfWeightFc1 = 32*4*4*bytesOf(mdtFc1); - F32* fcWeightPtr1 = (F32*)mt_new_storage(bytesOfWeightFc1); - for (int i=0; i<32*4*4; i++) { - fcWeightPtr1[i] = 1.0; - } - U32 bytesOfVecFc1 = 100*bytesOf(mdtFc1); - F32* fcVecPtr1 = (F32*)mt_new_storage(bytesOfVecFc1); - for (int i = 0; i < 100; i++) { - fcVecPtr1[i] = 1.0; - } - str_copy(wsArr[2].op_name, weightOpNameFc1.c_str(), weightOpNameFc1.length()); - wsArr[2].mdt = mdtFc1; - wsArr[2].bytes_of_weight = bytesOfWeightFc1; - wsArr[2].weight = (U8*)fcWeightPtr1; - wsArr[2].bytes_of_vec = bytesOfVecFc1; - wsArr[2].vec = (U8*)fcVecPtr1; - - std::string weightOpNameFc2 = "fc2"; - DataType mdtFc2 = DT_F32; - U32 bytesOfWeightFc2 = 100*10*bytesOf(mdtFc2); - F32* fcWeightPtr2 = (F32*)mt_new_storage(bytesOfWeightFc2); - for (int i = 0; i < 100*10; i++) { - fcWeightPtr2[i] = 1.0; - } - U32 bytesOfVecFc2 = 10 * bytesOf(mdtFc2); - F32* fcVecPtr2 = (F32*)mt_new_storage(bytesOfVecFc2); - for (int i = 0; i < 10; i++) { - fcVecPtr2[i] = 1.0; - } - str_copy(wsArr[3].op_name, weightOpNameFc2.c_str(), weightOpNameFc2.length()); - wsArr[3].mdt = mdtFc2; - wsArr[3].bytes_of_weight = bytesOfWeightFc2; - wsArr[3].weight = (U8*)fcWeightPtr2; - wsArr[3].bytes_of_vec = bytesOfVecFc2; - wsArr[3].vec = (U8*)fcVecPtr2; - fixedMs.numWeightSpecs = numWeightSpecs; - fixedMs.ws = &wsArr[0]; - - int number = fixedMs.numWeightSpecs; - for (int i=0; i < number; i++) { - std::cout << "op name is : " << fixedMs.ws[i].op_name << std::endl; - std::cout << "op mdt is: " << fixedMs.ws[i].mdt << std::endl; - std::cout << "op bytes_of_weight is: " << fixedMs.ws[i].bytes_of_weight << std::endl; - std::cout << "op bytes_of_vec is: " << fixedMs.ws[i].bytes_of_vec << std::endl; - std::cout << "op weight address: " << (void*)fixedMs.ws[i].weight << std::endl; - std::cout << "op bias address: " << (void*)fixedMs.ws[i].vec << std::endl; - std::cout << "first weight value: " << ((F32*)fixedMs.ws[i].weight)[0] << std::endl; - if(fixedMs.ws[i].bytes_of_vec > 0) { - std::cout << "first bias value: " << ((F32*)fixedMs.ws[i].vec)[0] << std::endl; - } - std::cout << "\n\n\n"; - } - - std::string storePath = "./fixedMs.bolt"; - serialize_model_to_file(&fixedMs, storePath.c_str()); - - return 0; -} diff --git a/model_tools/CMakeLists.txt b/model_tools/CMakeLists.txt new file mode 100644 index 00000000..b043cb09 --- /dev/null +++ b/model_tools/CMakeLists.txt @@ -0,0 +1,22 @@ +cmake_minimum_required(VERSION 3.2) + +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) +if (BOLT_CONFIGURE_FILE) + include(${BOLT_CONFIGURE_FILE}) +else (BOLT_CONFIGURE_FILE) + message(FATAL_ERROR " +FATAL: can not find bolt.cmake in /common/cmakes directory, + please set shell or cmake environment variable BOLT_ROOT. + ") +endif (BOLT_CONFIGURE_FILE) + +project(model_tools) + +set_policy() + +set_c_cxx_flags() + +include_model_tools() + +add_subdirectory(src) +add_subdirectory(tools) diff --git a/model_tools/include/OPOptimizers/ActivationOptimizer.hpp b/model_tools/include/OPOptimizers/ActivationOptimizer.hpp new file mode 100644 index 00000000..f436e0be --- /dev/null +++ b/model_tools/include/OPOptimizers/ActivationOptimizer.hpp @@ -0,0 +1,74 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_ACTIVATIONOPTIMIZER +#define _H_ACTIVATIONOPTIMIZER + +#include "OPOptimizer.hpp" + +class ActivationOptimizer : public OPOptimizer { + bool optimize(ModelSpec *spec) override + { + bool hasOptimized = false; + for (int i = 0; i < spec->num_operator_specs; i++) { + if (OT_Conv == spec->ops[i].type || OT_Eltwise == spec->ops[i].type) { + int prevOpIndex = i; + std::vector> nextOpIndexes = searchOperatorIndexByInput(spec, + spec->ops[prevOpIndex].output_tensors_name[0], prevOpIndex + 1, + spec->num_operator_specs); + if (nextOpIndexes.size() != 1 || OT_Relu != spec->ops[nextOpIndexes[0].first].type || + spec->ops[nextOpIndexes[0].first].ps.relu_spec.neg_slope != 0) { + continue; + } + int atOpIndex = nextOpIndexes[0].first; + + // tensor relationship rewrite + if (spec->ops[prevOpIndex].type == OT_Conv) { + switch (spec->ops[prevOpIndex].ps.conv_spec.convolution_type) { + case Convolution_Pointwise: { + spec->ops[prevOpIndex].ps.conv_spec.pw_activation_type = ACTIVATION_RELU; + break; + } + case Convolution_Deconvolution: { + spec->ops[prevOpIndex].ps.conv_spec.pw_activation_type = ACTIVATION_RELU; + break; + } + case Convolution_Depthwise: { + spec->ops[prevOpIndex].ps.conv_spec.dw_activation_type = ACTIVATION_RELU; + break; + } + case Convolution_Dilation: { + spec->ops[prevOpIndex].ps.conv_spec.pw_activation_type = ACTIVATION_RELU; + break; + } + default: { + CHECK_REQUIREMENT(0); + break; + } + } + spec->ops[prevOpIndex].ps.conv_spec.activation_spec.relu_spec = + spec->ops[atOpIndex].ps.relu_spec; + } + if (spec->ops[prevOpIndex].type == OT_Eltwise) { + spec->ops[prevOpIndex].ps.eltwise_spec.activation_type = ACTIVATION_RELU; + spec->ops[prevOpIndex].ps.eltwise_spec.activation_spec.relu_spec = + spec->ops[atOpIndex].ps.relu_spec; + } + setOperatorInvalid(spec, atOpIndex, true); + hasOptimized = true; + } + } + return hasOptimized; + } +}; +#endif diff --git a/model_tools/include/OPOptimizers/BNScaleOptimizer.hpp b/model_tools/include/OPOptimizers/BNScaleOptimizer.hpp new file mode 100644 index 00000000..2df826c4 --- /dev/null +++ b/model_tools/include/OPOptimizers/BNScaleOptimizer.hpp @@ -0,0 +1,110 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_BNSCALEOPTIMIZER +#define _H_BNSCALEOPTIMIZER + +#include "OPOptimizer.hpp" + +class BNScaleOptimizer : public OPOptimizer { + bool optimize(ModelSpec *spec) override + { + bool hasOptimized = false; + for (int i = 0; i < spec->num_operator_specs; i++) { + if (spec->ops[i].type == OT_BatchNorm) { + int bnOpIndex = i; + std::vector<std::pair<int, int>> nextOpIndexes = searchOperatorIndexByInput(spec, + spec->ops[bnOpIndex].output_tensors_name[0], bnOpIndex + 1, + spec->num_operator_specs); + if (nextOpIndexes.size() != 1 || OT_Scale != spec->ops[nextOpIndexes[0].first].type) { + UNI_WARNING_LOG( + "encountered unoptimized BN layer (no Scale): %s\n", spec->ops[i].name); + continue; + } + int scaleOpIndex = nextOpIndexes[0].first; + + // bn + int bnWeightIndex = searchWeightIndex(spec, spec->ops[bnOpIndex].name); + CHECK_REQUIREMENT(bnWeightIndex >= 0); + CHECK_REQUIREMENT(spec->ws[bnWeightIndex].mdt == DT_F32); + F32 epsCur = spec->ops[bnOpIndex].ps.bn_spec.eps; + F32 gamaCur = spec->ops[bnOpIndex].ps.bn_spec.gama; + U32 channelCur = + spec->ws[bnWeightIndex].bytes_of_weight / bytesOf(spec->ws[bnWeightIndex].mdt); + F32 *meanPtr = (F32 *)spec->ws[bnWeightIndex].weight; + F32 *varPtr = (F32 *)spec->ws[bnWeightIndex].vec; + + std::vector<F32> stdValue(channelCur); + for (U32 j = 0; j < channelCur; j++) { + stdValue[j] = sqrt(gamaCur * varPtr[j] + epsCur); + } + + // scale + int scaleWeightIndex = searchWeightIndex(spec, spec->ops[scaleOpIndex].name); + CHECK_REQUIREMENT(scaleWeightIndex >= 0); + CHECK_REQUIREMENT(spec->ws[scaleWeightIndex].mdt == DT_F32); + U32 channelAlpha = spec->ws[scaleWeightIndex].bytes_of_weight / + bytesOf(spec->ws[scaleWeightIndex].mdt); + CHECK_REQUIREMENT(channelAlpha == channelCur); + + if (spec->ws[scaleWeightIndex].vec == nullptr) { + spec->ws[scaleWeightIndex].bytes_of_vec = channelCur * sizeof(F32); + spec->ws[scaleWeightIndex].vec = + (U8 *)mt_new_storage(spec->ws[scaleWeightIndex].bytes_of_vec); + memset( + spec->ws[scaleWeightIndex].vec, 0, spec->ws[scaleWeightIndex].bytes_of_vec); + } + + F32 *alphaPtr = (F32 *)spec->ws[scaleWeightIndex].weight; + F32 *betaPtr = (F32 *)spec->ws[scaleWeightIndex].vec; + + for (U32 m = 0; m < channelCur; m++) { + alphaPtr[m] /= stdValue[m]; + betaPtr[m] = betaPtr[m] - alphaPtr[m] * gamaCur * meanPtr[m]; + } + // free BN memory + if (spec->ws[bnWeightIndex].weight != nullptr) { + spec->ws[bnWeightIndex].bytes_of_weight = 0; + delete spec->ws[bnWeightIndex].weight; + spec->ws[bnWeightIndex].weight = 
nullptr; + } + if (spec->ws[bnWeightIndex].vec != nullptr) { + spec->ws[bnWeightIndex].bytes_of_vec = 0; + delete spec->ws[bnWeightIndex].vec; + spec->ws[bnWeightIndex].vec = nullptr; + } + setOperatorInvalid(spec, bnOpIndex, true); + hasOptimized = true; + + // If the previous OP is Concat, we need to take care of the possible padded channels before Concat. + const int queryNum = 1; + OperatorType queryOps[queryNum] = {OT_Concat}; + + int concatOpIndex = searchOperatorIndexBackward(spec, i - 1, queryOps, queryNum); + if (concatOpIndex != -1) { + spec->ops[scaleOpIndex].ps.scale_spec.num_concat = + spec->ops[concatOpIndex].num_inputs; + // Rename concat output and scale input to avoid desc differences for inplace tensor + std::string oldName = spec->ops[concatOpIndex].output_tensors_name[0]; + std::string breakName = "break_" + oldName; + str_copy(spec->ops[concatOpIndex].output_tensors_name[0], breakName.c_str(), + NAME_LEN); + str_copy( + spec->ops[scaleOpIndex].input_tensors_name[0], breakName.c_str(), NAME_LEN); + } + } + } + return hasOptimized; + } +}; +#endif diff --git a/model_tools/include/OPOptimizers/CastOptimizer.hpp b/model_tools/include/OPOptimizers/CastOptimizer.hpp new file mode 100644 index 00000000..d5e2f3e4 --- /dev/null +++ b/model_tools/include/OPOptimizers/CastOptimizer.hpp @@ -0,0 +1,50 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
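The arithmetic in BNScaleOptimizer above folds a BatchNorm into the Scale layer that follows it: with std_j = sqrt(gamma * var_j + eps), the scale weights become alpha_j / std_j and the biases become beta_j - (alpha_j / std_j) * gamma * mean_j, after which the BN weights can be freed. A minimal sketch of the same per-channel update on plain vectors (the function name and types are ours, assuming the layout used above):

    #include <cmath>
    #include <vector>

    // Folds per-channel BatchNorm statistics (mean, var) into an existing
    // Scale layer's (alpha, beta), mirroring BNScaleOptimizer:
    //   alpha' = alpha / sqrt(gamma * var + eps)
    //   beta'  = beta - alpha' * gamma * mean
    static void fold_bn_into_scale(std::vector<float> &alpha, std::vector<float> &beta,
        const std::vector<float> &mean, const std::vector<float> &var, float gamma, float eps)
    {
        for (size_t j = 0; j < alpha.size(); j++) {
            float std_j = std::sqrt(gamma * var[j] + eps);
            alpha[j] /= std_j;
            beta[j] -= alpha[j] * gamma * mean[j];
        }
    }

After this fold, running Scale alone is numerically equivalent to BN followed by Scale, which is why the pass can mark the BN operator invalid.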
+ +#ifndef _H_CastOPTIMIZER +#define _H_CastOPTIMIZER + +#include "model_tools.h" +#include "OPOptimizer.hpp" + +class CastOptimizer : public OPOptimizer { + bool optimize(ModelSpec *spec) override + { + // const int queryNum = 3; + // OperatorType queryOps[queryNum] = {OT_Conv, OT_FC, OT_Deconvolution}; + + bool hasOptimized = false; + for (int i = 1; i < spec->num_operator_specs; i++) { + if (spec->ops[i].type == OT_Cast) { + // rewrite the relationship + // the op[i-1].output ==> op[i+1].input + str_copy(spec->ops[i + 1].input_tensors_name[0], + spec->ops[i - 1].output_tensors_name[0], NAME_LEN); + hasOptimized = true; + // cut off the op[i] input and output information + for (U32 j = 0; j < spec->ops[i].num_inputs; j++) { + free(spec->ops[i].input_tensors_name[j]); + } + spec->ops[i].num_inputs = 0; + for (U32 j = 0; j < spec->ops[i].num_outputs; j++) { + free(spec->ops[i].output_tensors_name[j]); + } + spec->ops[i].num_outputs = 0; + + setOperatorInvalid(spec, i); + } + } + return hasOptimized; + } +}; +#endif diff --git a/model_tools/include/OPOptimizers/ChannelPaddingOptimizer.hpp b/model_tools/include/OPOptimizers/ChannelPaddingOptimizer.hpp new file mode 100644 index 00000000..9b13b36c --- /dev/null +++ b/model_tools/include/OPOptimizers/ChannelPaddingOptimizer.hpp @@ -0,0 +1,357 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
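The cast-removal pass that follows works purely by tensor renaming: the consumer's first input is rewired to the tensor produced before the Cast, the Cast's own tensor names are freed, and the operator is marked invalid. A simplified sketch of the same bypass on an illustrative graph type (Node and remove_casts are ours, not Bolt's; like the original, it assumes producer and consumer sit at positions i - 1 and i + 1):

    #include <string>
    #include <vector>

    struct Node {
        std::string type;
        std::vector<std::string> inputs;
        std::vector<std::string> outputs;
        bool valid = true;
    };

    // Bypasses every "Cast" node by pointing its consumer at the tensor
    // produced before the Cast, then invalidating the Cast itself.
    static void remove_casts(std::vector<Node> &graph)
    {
        for (size_t i = 1; i + 1 < graph.size(); i++) {
            if (graph[i].type != "Cast") {
                continue;
            }
            graph[i + 1].inputs[0] = graph[i - 1].outputs[0];
            graph[i].inputs.clear();
            graph[i].outputs.clear();
            graph[i].valid = false;
        }
    }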
+ +#ifndef _H_CHANNELPADDINGOPTIMIZER +#define _H_CHANNELPADDINGOPTIMIZER + +#include +#include +#include "OPOptimizer.hpp" + +class ChannelPaddingOptimizer : public OPOptimizer { + void insertChannelResizeOperator(ModelSpec *spec, + int index, + const char *name, + char *input_name, + const char *symmetric, + int group, + int numChannelsBefore, + int numChannelAfter) + { + OperatorSpec channelResizeOperator = mt_create_operator(name, OT_ChannelResize, 1, 1); + if (symmetric == nullptr || symmetric == NULL) { + memset(channelResizeOperator.ps.channel_resize_spec.symmetric, 0, NAME_LEN); + } else { + str_copy(channelResizeOperator.ps.channel_resize_spec.symmetric, symmetric, + strlen(symmetric)); + } + channelResizeOperator.ps.channel_resize_spec.group = group; + channelResizeOperator.ps.channel_resize_spec.channel_before = numChannelsBefore; + channelResizeOperator.ps.channel_resize_spec.channel_after = numChannelAfter; + // channel cut + if (numChannelAfter < numChannelsBefore) { + str_copy(channelResizeOperator.output_tensors_name[0], input_name, strlen(input_name)); + str_copy(channelResizeOperator.input_tensors_name[0], name, strlen(name)); + str_copy(input_name, name, strlen(name)); + } else { + str_copy(channelResizeOperator.input_tensors_name[0], input_name, strlen(input_name)); + str_copy(channelResizeOperator.output_tensors_name[0], name, strlen(name)); + str_copy(input_name, name, strlen(name)); + } + mt_insert_operator(spec, index, channelResizeOperator); + } + + bool isBlankChannelResizeOperator(OperatorSpec currentOperator) + { + return currentOperator.ps.channel_resize_spec.group == 0; + } + + bool canMergeChannelCutSpan(OperatorSpec channelCutOperator, OperatorSpec channelSpanOperator) + { + CHECK_REQUIREMENT(channelSpanOperator.type == OT_ChannelResize && + channelCutOperator.type == OT_ChannelResize); + if (isBlankChannelResizeOperator(channelSpanOperator)) { + return true; + } + if (channelSpanOperator.ps.channel_resize_spec.group == + channelCutOperator.ps.channel_resize_spec.group && + channelSpanOperator.ps.channel_resize_spec.channel_before == + channelCutOperator.ps.channel_resize_spec.channel_after && + channelSpanOperator.ps.channel_resize_spec.channel_after == + channelCutOperator.ps.channel_resize_spec.channel_before) { + return true; + } + return false; + } + + bool optimize(ModelSpec *spec) override + { + return optimize1(spec); + } + + bool optimize1(ModelSpec *spec) + { + bool hasOptimized = false; + int channelAlign = 8; + std::string channelResizeNamePrefix = "ChannelResize_"; + for (int i = 0; i < spec->num_operator_specs; i++) { + if (spec->ops[i].type == OT_Conv && + spec->ops[i].ps.conv_spec.convolution_type == Convolution_Depthwise) { + OperatorSpec currentOperator = spec->ops[i]; + U32 numKernels = currentOperator.ps.conv_spec.num_outputs; + U32 paddingBase = channelAlign; + if (numKernels % paddingBase == 0) { + continue; + } + U32 numKernelsNew = + (numKernels / paddingBase + (numKernels % paddingBase != 0)) * paddingBase; + spec->ops[i].ps.conv_spec.num_outputs = numKernelsNew; + int weightIndex = searchWeightIndex(spec, currentOperator.name); + CHECK_REQUIREMENT(weightIndex >= 0); + U32 weightSize = spec->ws[weightIndex].bytes_of_weight; + U32 weightSizeNew = + spec->ws[weightIndex].bytes_of_weight / numKernels * numKernelsNew; + U8 *weight = spec->ws[weightIndex].weight; + spec->ws[weightIndex].bytes_of_weight = weightSizeNew; + spec->ws[weightIndex].weight = + (U8 *)mt_new_storage(spec->ws[weightIndex].bytes_of_weight); + 
memcpy(spec->ws[weightIndex].weight, weight, weightSize); + memset(spec->ws[weightIndex].weight + weightSize, 0, weightSizeNew - weightSize); + delete weight; + U8 *vec = spec->ws[weightIndex].vec; + if (vec != nullptr) { + U32 vecSize = spec->ws[weightIndex].bytes_of_vec; + U32 vecSizeNew = spec->ws[weightIndex].bytes_of_vec / numKernels * numKernelsNew; + spec->ws[weightIndex].bytes_of_vec = vecSizeNew; + spec->ws[weightIndex].vec = + (U8 *)mt_new_storage(spec->ws[weightIndex].bytes_of_vec); + memcpy((U8 *)(spec->ws[weightIndex].vec), vec, vecSize); + memset((U8 *)(spec->ws[weightIndex].vec + vecSize), 0, vecSizeNew - vecSize); + delete vec; + } + std::string channelResizeName1 = channelResizeNamePrefix + std::to_string(i); + std::string channelResizeName2 = channelResizeNamePrefix + std::to_string(i + 2); + insertChannelResizeOperator(spec, i, channelResizeName1.c_str(), + currentOperator.input_tensors_name[0], channelResizeName2.c_str(), 1, + numKernels, numKernelsNew); + insertChannelResizeOperator(spec, i + 2, channelResizeName2.c_str(), + currentOperator.output_tensors_name[0], nullptr, 1, numKernelsNew, numKernels); + i += 2; + continue; + } + if ((spec->ops[i].type == OT_Conv && + (spec->ops[i].ps.conv_spec.convolution_type == Convolution_Pointwise || + spec->ops[i].ps.conv_spec.convolution_type == Convolution_Dilation)) || + (spec->ops[i].type == OT_Deconvolution && + spec->ops[i].ps.conv_spec.convolution_type == Convolution_Deconvolution)) { + OperatorSpec currentOperator = spec->ops[i]; + U32 groups = currentOperator.ps.conv_spec.group; + U32 paddingBase = channelAlign * groups; + U32 numKernels = currentOperator.ps.conv_spec.num_outputs; + U32 numKernelsNew = + (numKernels / paddingBase + (numKernels % paddingBase != 0)) * paddingBase; + int weightIndex = searchWeightIndex(spec, currentOperator.name); + CHECK_REQUIREMENT(weightIndex >= 0); + U32 weightSize = spec->ws[weightIndex].bytes_of_weight; + U32 inputChannels = weightSize / bytesOf(spec->ws[weightIndex].mdt) / numKernels / + spec->ops[i].ps.conv_spec.kernel_t / spec->ops[i].ps.conv_spec.kernel_h / + spec->ops[i].ps.conv_spec.kernel_w; + U32 inputChannelsNew = inputChannels; + if (spec->ops[i].type == OT_Deconvolution) { + inputChannelsNew = + (inputChannels / paddingBase + (inputChannels % paddingBase != 0)) * + paddingBase; + } + if (inputChannelsNew == inputChannels && numKernels == numKernelsNew) { + continue; + } + spec->ops[i].ps.conv_spec.num_outputs = numKernelsNew; + U32 tileSize = weightSize / numKernels / inputChannels; + U32 weightSizeNew = tileSize * numKernelsNew * inputChannelsNew; + U8 *weight = spec->ws[weightIndex].weight; + spec->ws[weightIndex].bytes_of_weight = weightSizeNew; + spec->ws[weightIndex].weight = + (U8 *)mt_new_storage(spec->ws[weightIndex].bytes_of_weight); + memset(spec->ws[weightIndex].weight, 0, weightSizeNew); + U32 ocGroupSize = numKernels / groups; + U32 ocGroupSizeNew = numKernelsNew / groups; + U32 icGroupSize = inputChannels / groups; + U32 icGroupSizeNew = inputChannelsNew / groups; + for (U32 g = 0; g < groups; g++) { + for (U32 oc = 0; oc < ocGroupSize; oc++) { + for (U32 ic = 0; ic < icGroupSize; ic++) { + U32 index, indexNew; + if (spec->ops[i].type == OT_Deconvolution) { + index = (((g * icGroupSize + ic) * ocGroupSize) + oc) * tileSize; + indexNew = + (((g * icGroupSizeNew + ic) * ocGroupSizeNew) + oc) * tileSize; + } else { + index = (((g * ocGroupSize + oc) * icGroupSize) + ic) * tileSize; + indexNew = + (((g * ocGroupSizeNew + oc) * icGroupSizeNew) + ic) * tileSize; + 
} + memcpy((U8 *)(spec->ws[weightIndex].weight) + indexNew, weight + index, + tileSize); + } + } + } + delete weight; + U8 *vec = spec->ws[weightIndex].vec; + if (vec != nullptr) { + U32 vecSize = spec->ws[weightIndex].bytes_of_vec; + U32 vecSizeNew = spec->ws[weightIndex].bytes_of_vec / numKernels * numKernelsNew; + spec->ws[weightIndex].bytes_of_vec = vecSizeNew; + spec->ws[weightIndex].vec = + (U8 *)mt_new_storage(spec->ws[weightIndex].bytes_of_vec); + memcpy((U8 *)(spec->ws[weightIndex].vec), vec, vecSize); + memset((U8 *)(spec->ws[weightIndex].vec + vecSize), 0, vecSizeNew - vecSize); + delete vec; + } + int channelResizeIndex1 = i; + int channelResizeIndex2 = i + 2; + std::string channelResizeName1 = + channelResizeNamePrefix + std::to_string(channelResizeIndex1); + std::string channelResizeName2 = + channelResizeNamePrefix + std::to_string(channelResizeIndex2); + if (inputChannels != inputChannelsNew) { + const char *symmetric = channelResizeName2.c_str(); + if (numKernels == numKernelsNew) { + symmetric = nullptr; + } + insertChannelResizeOperator(spec, channelResizeIndex1, + channelResizeName1.c_str(), currentOperator.input_tensors_name[0], + symmetric, groups, inputChannels, inputChannelsNew); + i += 1; + } + if (numKernels != numKernelsNew) { + if (inputChannels == inputChannelsNew) { + channelResizeIndex2 = i + 1; + channelResizeName2 = + channelResizeNamePrefix + std::to_string(channelResizeIndex2); + } + insertChannelResizeOperator(spec, channelResizeIndex2, + channelResizeName2.c_str(), currentOperator.output_tensors_name[0], nullptr, + groups, numKernelsNew, numKernels); + i += 1; + } + continue; + } + } + + for (int i = 0; i < spec->num_operator_specs; i++) { + // cut operator + if (spec->ops[i].type == OT_ChannelResize) { + // span operator + std::vector> output = searchOperatorIndexByInput( + spec, spec->ops[i].output_tensors_name[0], i + 1, spec->num_operator_specs); + for (U32 j = 0; j < output.size(); j++) { + int nextIndex = output[j].first; + if (spec->ops[nextIndex].type == OT_ChannelResize && + canMergeChannelCutSpan(spec->ops[i], spec->ops[nextIndex])) { + if (output.size() == 1) { + setOperatorInvalid(spec, i, true); + setOperatorInvalid(spec, nextIndex, true); + } + if (nextIndex + 2 < spec->num_operator_specs && + isBlankChannelResizeOperator(spec->ops[nextIndex + 2])) { + spec->ops[nextIndex + 2].ps = spec->ops[i].ps; + } + } + } + } + } + return hasOptimized; + } + + bool optimize2(ModelSpec *spec) + { + bool hasOptimized = false; + for (int i = 0; i < spec->num_operator_specs; i++) { + bool padding = false; // Whether to check input channels and actually pad + bool optimizeOC = false; + U32 numKernels = 0; + U32 kernelSizeH = 0; + U32 kernelSizeW = 0; + if (spec->ops[i].type == OT_Conv || spec->ops[i].type == OT_Deconvolution) { + if (spec->ops[i].ps.conv_spec.convolution_type != Convolution_Depthwise && + spec->ops[i].ps.conv_spec.convolution_type != Convolution_Pointwise && + spec->ops[i].ps.conv_spec.convolution_type != Convolution_Deconvolution) { + continue; + } + + numKernels = spec->ops[i].ps.conv_spec.num_outputs; + kernelSizeH = spec->ops[i].ps.conv_spec.kernel_h; + kernelSizeW = spec->ops[i].ps.conv_spec.kernel_w; + if (numKernels % 8 != 0) { // Check output channels + optimizeOC = true; + } + padding = hasOptimized || + optimizeOC; // If padding has been done before, we need to check the input channels as well + } else if (spec->ops[i].type == OT_FC) { + numKernels = spec->ops[i].ps.fc_spec.num_outputs; + kernelSizeH = 1; + kernelSizeW = 1; + 
padding = hasOptimized; + } else { + continue; + } + + if (padding) { + std::string curIn = spec->ops[i].input_tensors_name[0]; + auto prevIndexes = searchOperatorIndexByOutput(spec, curIn, 0, i); + if (prevIndexes.size() == 0) { // input is model input + if (!optimizeOC) { + continue; + } + } + int weightIndex = searchWeightIndex(spec, spec->ops[i].name); + CHECK_REQUIREMENT(weightIndex >= 0); + CHECK_REQUIREMENT( + spec->ws[weightIndex].mdt == DT_F32); // BNN not supported for the time being + U32 weightSize = + spec->ws[weightIndex].bytes_of_weight / bytesOf(spec->ws[weightIndex].mdt); + U32 inputChannels = weightSize / (numKernels * kernelSizeH * kernelSizeW); + if (inputChannels % 8 == 0 && false == optimizeOC) { + continue; + } + + U32 numKernelsNew = optimizeOC ? ((numKernels / 8 + 1) * 8) : numKernels; + U32 inputChannelsNew = (inputChannels % 8) ? ((inputChannels / 8 + 1) * 8) + : inputChannels; + + if (spec->ops[i].ps.conv_spec.convolution_type == Convolution_Depthwise) { + inputChannelsNew = 1; + } + + U8 *weight = spec->ws[weightIndex].weight; + U8 *vec = spec->ws[weightIndex].vec; + U32 vecBytes = spec->ws[weightIndex].bytes_of_vec; + spec->ws[weightIndex].bytes_of_weight = bytesOf(spec->ws[weightIndex].mdt) * + numKernelsNew * inputChannelsNew * kernelSizeH * kernelSizeW; + spec->ws[weightIndex].bytes_of_vec = + bytesOf(spec->ws[weightIndex].mdt) * numKernelsNew; + spec->ws[weightIndex].weight = + (U8 *)mt_new_storage(spec->ws[weightIndex].bytes_of_weight); + spec->ws[weightIndex].vec = (U8 *)mt_new_storage(spec->ws[weightIndex].bytes_of_vec); + memset(spec->ws[weightIndex].weight, 0, spec->ws[weightIndex].bytes_of_weight); + memset(spec->ws[weightIndex].vec, 0, spec->ws[weightIndex].bytes_of_vec); + if (spec->ops[i].type == OT_Conv) { + spec->ops[i].ps.conv_spec.num_outputs = numKernelsNew; + } + if (spec->ops[i].type == OT_FC) { + spec->ops[i].ps.fc_spec.num_outputs = numKernelsNew; + } + // process weight + U32 blockSize = bytesOf(spec->ws[weightIndex].mdt) * kernelSizeH * kernelSizeW; + for (U32 oc = 0; oc < numKernels; oc++) { + for (U32 ic = 0; ic < inputChannels; ic++) { + U32 oldIndex = (oc * inputChannels + ic) * blockSize; + U32 newIndex = (oc * inputChannelsNew + ic) * blockSize; + memcpy( + spec->ws[weightIndex].weight + newIndex, weight + oldIndex, blockSize); + } + } + delete weight; + // process bias + if (vec != nullptr) { + memcpy(spec->ws[weightIndex].vec, vec, vecBytes); + delete vec; + } + + hasOptimized = true; + } + } + return hasOptimized; + } +}; +#endif diff --git a/model_tools/include/OPOptimizers/ClipClipOptimizer.hpp b/model_tools/include/OPOptimizers/ClipClipOptimizer.hpp new file mode 100644 index 00000000..4b17b628 --- /dev/null +++ b/model_tools/include/OPOptimizers/ClipClipOptimizer.hpp @@ -0,0 +1,48 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_CLIPCLIPOPTIMIZER
+#define _H_CLIPCLIPOPTIMIZER
+
+#include "OPOptimizer.hpp"
+
+class ClipClipOptimizer : public OPOptimizer {
+    bool optimize(ModelSpec *spec) override
+    {
+        bool hasOptimized = false;
+        for (int i = 0; i < spec->num_operator_specs; i++) {
+            if (spec->ops[i].type == OT_Clip) {
+                int opIndex0 = i;
+                std::vector<std::pair<int, int>> nextOpIndexes = searchOperatorIndexByInput(spec,
+                    spec->ops[opIndex0].output_tensors_name[0], opIndex0 + 1,
+                    spec->num_operator_specs);
+                if (nextOpIndexes.size() != 1 || OT_Clip != spec->ops[nextOpIndexes[0].first].type) {
+                    UNI_WARNING_LOG("encountered Clip layer that cannot be optimized (not "
+                                    "followed by another Clip): %s\n",
+                        spec->ops[i].name);
+                    continue;
+                }
+                int opIndex1 = nextOpIndexes[0].first;
+
+                spec->ops[opIndex0].ps.clip_spec.min = UNI_MAX(
+                    spec->ops[opIndex0].ps.clip_spec.min, spec->ops[opIndex1].ps.clip_spec.min);
+                spec->ops[opIndex0].ps.clip_spec.max = UNI_MIN(
+                    spec->ops[opIndex0].ps.clip_spec.max, spec->ops[opIndex1].ps.clip_spec.max);
+                setOperatorInvalid(spec, opIndex1, true);
+                hasOptimized = true;
+                i = opIndex1;
+            }
+        }
+        return hasOptimized;
+    }
+};
+#endif
diff --git a/model_tools/include/OPOptimizers/DeprecatedOPOptimizer.hpp b/model_tools/include/OPOptimizers/DeprecatedOPOptimizer.hpp
new file mode 100644
index 00000000..542bdf98
--- /dev/null
+++ b/model_tools/include/OPOptimizers/DeprecatedOPOptimizer.hpp
@@ -0,0 +1,66 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
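+// DeprecatedOPOptimizer removes operators that provably cannot change their
+// input: an all-zero Pad, an Input placeholder, and a Pooling whose kernel and
+// stride are 1 with no padding. A minimal standalone sketch of the
+// identity-pooling test (PoolingDims is a hypothetical stand-in for Bolt's
+// pooling parameters, not Bolt API):
+struct PoolingDims {
+    int kernel_t, kernel_h, kernel_w;
+    int stride_t, stride_h, stride_w;
+    int pad_before, pad_after, pad_top, pad_bottom, pad_left, pad_right;
+};
+static inline bool pooling_is_identity(const PoolingDims &p)
+{
+    // A 1x1x1 window moved one step at a time over an unpadded tensor
+    // returns every element unchanged, so the operator can be erased.
+    return p.kernel_t == 1 && p.kernel_h == 1 && p.kernel_w == 1 &&
+        p.stride_t == 1 && p.stride_h == 1 && p.stride_w == 1 &&
+        p.pad_before == 0 && p.pad_after == 0 && p.pad_top == 0 &&
+        p.pad_bottom == 0 && p.pad_left == 0 && p.pad_right == 0;
+}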
+ +#ifndef _H_DEPRECATEDOPOPTIMIZER +#define _H_DEPRECATEDOPOPTIMIZER + +#include +#include +#include "model_tools.h" +#include "OPOptimizer.hpp" + +class DeprecatedOPOptimizer : public OPOptimizer { + bool optimize(ModelSpec *spec) override + { + bool hasOptimized = false; + + for (int i = 0; i < spec->num_operator_specs; i++) { + if (spec->ops[i].type == OT_Pad) { + if (spec->ops[i].ps.pad_spec.before == 0 && spec->ops[i].ps.pad_spec.after == 0 && + spec->ops[i].ps.pad_spec.top == 0 && spec->ops[i].ps.pad_spec.bottom == 0 && + spec->ops[i].ps.pad_spec.left == 0 && spec->ops[i].ps.pad_spec.right == 0) { + setOperatorInvalid(spec, i, true); + hasOptimized = true; + continue; + } + } + + if (spec->ops[i].type == OT_Input) { + spec->ops[i].type = OT_None; // trick + hasOptimized = true; + continue; + } + + if (spec->ops[i].type == OT_Pooling) { + if (spec->ops[i].ps.pooling_spec.kernel_t == 1 && + spec->ops[i].ps.pooling_spec.kernel_h == 1 && + spec->ops[i].ps.pooling_spec.kernel_w == 1 && + spec->ops[i].ps.pooling_spec.stride_t == 1 && + spec->ops[i].ps.pooling_spec.stride_h == 1 && + spec->ops[i].ps.pooling_spec.stride_w == 1 && + spec->ops[i].ps.pooling_spec.padding_before == 0 && + spec->ops[i].ps.pooling_spec.padding_after == 0 && + spec->ops[i].ps.pooling_spec.padding_top == 0 && + spec->ops[i].ps.pooling_spec.padding_bottom == 0 && + spec->ops[i].ps.pooling_spec.padding_left == 0 && + spec->ops[i].ps.pooling_spec.padding_right == 0) { + setOperatorInvalid(spec, i, true); + hasOptimized = true; + continue; + } + } + } + return hasOptimized; + } +}; +#endif diff --git a/model_tools/include/OPOptimizers/DepthwisePointwiseOptimizer.hpp b/model_tools/include/OPOptimizers/DepthwisePointwiseOptimizer.hpp new file mode 100644 index 00000000..f9aa0abb --- /dev/null +++ b/model_tools/include/OPOptimizers/DepthwisePointwiseOptimizer.hpp @@ -0,0 +1,123 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
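+// DepthwisePointwiseOptimizer fuses a depthwise convolution with the pointwise
+// convolution that consumes it; both weight buffers and both bias vectors are
+// packed back to back, depthwise first. A minimal sketch of that packing
+// (illustrative helper, not Bolt API):
+#include <cstring>
+#include <cstdlib>
+static unsigned char *pack_dw_pw(const unsigned char *dw, size_t dwBytes,
+    const unsigned char *pw, size_t pwBytes)
+{
+    // One allocation holding the depthwise bytes first and the pointwise bytes
+    // after, mirroring how the fused operator later reads its weight region.
+    unsigned char *fused = (unsigned char *)malloc(dwBytes + pwBytes);
+    memcpy(fused, dw, dwBytes);
+    memcpy(fused + dwBytes, pw, pwBytes);
+    return fused;
+}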
+
+#ifndef _H_DEPTHWISEPOINTWISEOPTIMIZER
+#define _H_DEPTHWISEPOINTWISEOPTIMIZER
+
+#include "OPOptimizer.hpp"
+
+class DepthwisePointwiseOptimizer : public OPOptimizer {
+    bool optimize(ModelSpec *spec) override
+    {
+        bool hasOptimized = false;
+        for (int i = 0; i < spec->num_operator_specs; i++) {
+            // process depthwise convolution
+            if (spec->ops[i].type == OT_Conv &&
+                spec->ops[i].ps.conv_spec.convolution_type == Convolution_Depthwise) {
+                int dwConvOpIndex = i;
+                std::vector<std::pair<int, int>> nextOpIndexes = searchOperatorIndexByInput(spec,
+                    spec->ops[dwConvOpIndex].output_tensors_name[0], dwConvOpIndex + 1,
+                    spec->num_operator_specs);
+                if (nextOpIndexes.size() != 1 || OT_Conv != spec->ops[nextOpIndexes[0].first].type ||
+                    spec->ops[nextOpIndexes[0].first].ps.conv_spec.convolution_type !=
+                        Convolution_Pointwise) {
+                    UNI_WARNING_LOG("encountered DepthwiseConv layer that cannot be optimized "
+                                    "(not followed by PointwiseConv): "
+                                    "%s\n",
+                        spec->ops[i].name);
+                    continue;
+                }
+                int convOpIndex = nextOpIndexes[0].first;
+
+                // reallocate weights and bias
+                int dwConvWeightIndex = searchWeightIndex(spec, spec->ops[dwConvOpIndex].name);
+                int convWeightIndex = searchWeightIndex(spec, spec->ops[convOpIndex].name);
+                CHECK_REQUIREMENT(dwConvWeightIndex != -1);
+                CHECK_REQUIREMENT(convWeightIndex != -1);
+                CHECK_REQUIREMENT(spec->ws[dwConvWeightIndex].mdt == DT_F32);
+                CHECK_REQUIREMENT(spec->ws[convWeightIndex].mdt == DT_F32);
+
+                U32 weightSize = spec->ws[dwConvWeightIndex].bytes_of_weight +
+                    spec->ws[convWeightIndex].bytes_of_weight;
+                U8 *weight = (U8 *)mt_new_storage(weightSize);
+                memcpy(weight, spec->ws[dwConvWeightIndex].weight,
+                    spec->ws[dwConvWeightIndex].bytes_of_weight);
+                memcpy(weight + spec->ws[dwConvWeightIndex].bytes_of_weight,
+                    spec->ws[convWeightIndex].weight, spec->ws[convWeightIndex].bytes_of_weight);
+
+                U32 vecSize = sizeof(F32) *
+                    (spec->ops[dwConvOpIndex].ps.conv_spec.num_outputs +
+                        spec->ops[convOpIndex].ps.conv_spec.num_outputs);
+                U8 *vec = (U8 *)mt_new_storage(vecSize);
+                U8 *ptr = vec;
+                if (spec->ws[dwConvWeightIndex].bytes_of_vec == 0) {
+                    memset(
+                        ptr, 0, sizeof(F32) * (spec->ops[dwConvOpIndex].ps.conv_spec.num_outputs));
+                } else {
+                    CHECK_REQUIREMENT(
+                        sizeof(F32) * (spec->ops[dwConvOpIndex].ps.conv_spec.num_outputs) ==
+                        spec->ws[dwConvWeightIndex].bytes_of_vec);
+                    memcpy(ptr, spec->ws[dwConvWeightIndex].vec,
+                        spec->ws[dwConvWeightIndex].bytes_of_vec);
+                }
+                ptr = vec + sizeof(F32) * (spec->ops[dwConvOpIndex].ps.conv_spec.num_outputs);
+                if (spec->ws[convWeightIndex].bytes_of_vec == 0) {
+                    memset(ptr, 0, sizeof(F32) * (spec->ops[convOpIndex].ps.conv_spec.num_outputs));
+                } else {
+                    CHECK_REQUIREMENT(
+                        sizeof(F32) * (spec->ops[convOpIndex].ps.conv_spec.num_outputs) ==
+                        spec->ws[convWeightIndex].bytes_of_vec);
+                    memcpy(
+                        ptr, spec->ws[convWeightIndex].vec, spec->ws[convWeightIndex].bytes_of_vec);
+                }
+
+                // free and reallocate
+                if (spec->ws[dwConvWeightIndex].weight != nullptr) {
+                    spec->ws[dwConvWeightIndex].bytes_of_weight = 0;
+                    delete spec->ws[dwConvWeightIndex].weight;
+                    spec->ws[dwConvWeightIndex].weight = nullptr;
+                }
+                if (spec->ws[dwConvWeightIndex].vec != nullptr) {
+                    spec->ws[dwConvWeightIndex].bytes_of_vec = 0;
+                    delete spec->ws[dwConvWeightIndex].vec;
+                    spec->ws[dwConvWeightIndex].vec = nullptr;
+                }
+                if (spec->ws[convWeightIndex].weight != nullptr) {
+                    spec->ws[convWeightIndex].bytes_of_weight = 0;
+                    delete spec->ws[convWeightIndex].weight;
+                    spec->ws[convWeightIndex].weight = nullptr;
+                }
+                if (spec->ws[convWeightIndex].vec != nullptr) {
spec->ws[convWeightIndex].bytes_of_vec = 0; + delete spec->ws[convWeightIndex].vec; + spec->ws[convWeightIndex].vec = nullptr; + } + + // retain depthwise convolution operator + spec->ops[dwConvOpIndex].ps.conv_spec.num_outputs = + spec->ops[convOpIndex].ps.conv_spec.num_outputs; + spec->ops[dwConvOpIndex].ps.conv_spec.convolution_type = + Convolution_Depthwise_Pointwise; + spec->ops[dwConvOpIndex].ps.conv_spec.pw_activation_type = + spec->ops[convOpIndex].ps.conv_spec.pw_activation_type; + spec->ws[dwConvWeightIndex].bytes_of_weight = weightSize; + spec->ws[dwConvWeightIndex].weight = weight; + spec->ws[dwConvWeightIndex].bytes_of_vec = vecSize; + spec->ws[dwConvWeightIndex].vec = vec; + setOperatorInvalid(spec, convOpIndex, true); + hasOptimized = true; + } + } + return hasOptimized; + } +}; +#endif diff --git a/model_tools/include/OPOptimizers/FCFCOptimizer.hpp b/model_tools/include/OPOptimizers/FCFCOptimizer.hpp new file mode 100644 index 00000000..3f72f5ed --- /dev/null +++ b/model_tools/include/OPOptimizers/FCFCOptimizer.hpp @@ -0,0 +1,135 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
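+// FCFCOptimizer merges two fully-connected layers that share the same input:
+// the weight matrices are stacked row-wise so one GEMM yields both outputs,
+// and fc_spec.slice_point records where the second layer's rows begin. A
+// minimal sketch of the stacked computation y = [W1; W2] * x (illustrative,
+// row-major, not Bolt API):
+static void stacked_fc(const float *W1, int m1, const float *W2, int m2,
+    const float *x, int k, float *y /* length m1 + m2 */)
+{
+    // y[0..m1) = W1 * x and y[m1..m1+m2) = W2 * x, i.e. y = [W1; W2] * x.
+    for (int i = 0; i < m1; i++) {
+        float acc = 0;
+        for (int j = 0; j < k; j++) acc += W1[i * k + j] * x[j];
+        y[i] = acc;
+    }
+    for (int i = 0; i < m2; i++) {
+        float acc = 0;
+        for (int j = 0; j < k; j++) acc += W2[i * k + j] * x[j];
+        y[m1 + i] = acc;
+    }
+}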
+ +#ifndef _H_FCFCOPTIMIZER +#define _H_FCFCOPTIMIZER + +#include "model_tools.h" +#include "OPOptimizer.hpp" + +class FCFCOptimizer : public OPOptimizer { + bool optimize(ModelSpec *spec) override + { + const int queryNum = 1; + OperatorType queryOps[queryNum] = {OT_FC}; + bool hasOptimized = false; + for (int i = 1; i < spec->num_operator_specs; i++) { + if (spec->ops[i].type == OT_FC) { + int curOpIndex = i; + int prevOpIndex = + searchOperatorIndexBackward(spec, curOpIndex - 1, queryOps, queryNum); + if (prevOpIndex == -1) { + continue; + } + if (strncmp(spec->ops[curOpIndex].input_tensors_name[0], + spec->ops[prevOpIndex].input_tensors_name[0], NAME_LEN)) { + continue; + } + + int prevWeightIndex = searchWeightIndex(spec, spec->ops[prevOpIndex].name); + int curWeightIndex = searchWeightIndex(spec, spec->ops[curOpIndex].name); + CHECK_REQUIREMENT(prevWeightIndex != -1); + CHECK_REQUIREMENT(curWeightIndex != -1); + CHECK_REQUIREMENT(spec->ws[prevWeightIndex].mdt == DT_F32); + CHECK_REQUIREMENT(spec->ws[curWeightIndex].mdt == DT_F32); + + U32 weightSize = spec->ws[prevWeightIndex].bytes_of_weight + + spec->ws[curWeightIndex].bytes_of_weight; + U8 *weight = (U8 *)mt_new_storage(weightSize); + memcpy(weight, spec->ws[prevWeightIndex].weight, + spec->ws[prevWeightIndex].bytes_of_weight); + memcpy(weight + spec->ws[prevWeightIndex].bytes_of_weight, + spec->ws[curWeightIndex].weight, spec->ws[curWeightIndex].bytes_of_weight); + + U32 vecSize = sizeof(F32) * + (spec->ops[prevOpIndex].ps.fc_spec.num_outputs + + spec->ops[curOpIndex].ps.fc_spec.num_outputs); + U8 *vec = (U8 *)mt_new_storage(vecSize); + U8 *ptr = vec; + if (spec->ws[prevWeightIndex].bytes_of_vec == 0) { + memset(ptr, 0, sizeof(F32) * (spec->ops[prevOpIndex].ps.fc_spec.num_outputs)); + } else { + CHECK_REQUIREMENT(sizeof(F32) * (spec->ops[prevOpIndex].ps.fc_spec.num_outputs) == + spec->ws[prevWeightIndex].bytes_of_vec); + memcpy( + ptr, spec->ws[prevWeightIndex].vec, spec->ws[prevWeightIndex].bytes_of_vec); + } + ptr = vec + sizeof(F32) * (spec->ops[prevOpIndex].ps.fc_spec.num_outputs); + if (spec->ws[curWeightIndex].bytes_of_vec == 0) { + memset(ptr, 0, sizeof(F32) * (spec->ops[curOpIndex].ps.fc_spec.num_outputs)); + } else { + CHECK_REQUIREMENT(sizeof(F32) * (spec->ops[curOpIndex].ps.fc_spec.num_outputs) == + spec->ws[curWeightIndex].bytes_of_vec); + memcpy(ptr, spec->ws[curWeightIndex].vec, spec->ws[curWeightIndex].bytes_of_vec); + } + + if (spec->ws[prevWeightIndex].weight != nullptr) { + spec->ws[prevWeightIndex].bytes_of_weight = 0; + delete spec->ws[prevWeightIndex].weight; + spec->ws[prevWeightIndex].weight = nullptr; + } + if (spec->ws[prevWeightIndex].vec != nullptr) { + spec->ws[prevWeightIndex].bytes_of_vec = 0; + delete spec->ws[prevWeightIndex].vec; + spec->ws[prevWeightIndex].vec = nullptr; + } + if (spec->ws[curWeightIndex].weight != nullptr) { + spec->ws[curWeightIndex].bytes_of_weight = 0; + delete spec->ws[curWeightIndex].weight; + spec->ws[curWeightIndex].weight = nullptr; + } + if (spec->ws[curWeightIndex].vec != nullptr) { + spec->ws[curWeightIndex].bytes_of_vec = 0; + delete spec->ws[curWeightIndex].vec; + spec->ws[curWeightIndex].vec = nullptr; + } + + // FC params + spec->ops[prevOpIndex].ps.fc_spec.num_slices++; + U32 slices = spec->ops[prevOpIndex].ps.fc_spec.num_slices; + CHECK_REQUIREMENT( + slices <= sizeof(spec->ops[prevOpIndex].ps.fc_spec.slice_point) / sizeof(int)); + spec->ops[prevOpIndex].ps.fc_spec.slice_point[slices - 1] = + spec->ops[curOpIndex].ps.fc_spec.num_outputs; + 
spec->ops[prevOpIndex].ps.fc_spec.num_outputs += + spec->ops[curOpIndex].ps.fc_spec.num_outputs; + + // operator spec + spec->ops[prevOpIndex].num_outputs = slices; + I8 **names = (I8 **)mt_new_storage(slices * sizeof(I8 *)); + + for (U32 j = 0; j < slices - 1; j++) { + names[j] = spec->ops[prevOpIndex].output_tensors_name[j]; + } + names[slices - 1] = spec->ops[curOpIndex].output_tensors_name[0]; + delete spec->ops[prevOpIndex].output_tensors_name; + delete spec->ops[curOpIndex].output_tensors_name; + spec->ops[curOpIndex].output_tensors_name = nullptr; + spec->ops[curOpIndex].num_outputs = 0; + spec->ops[prevOpIndex].output_tensors_name = names; + + // weight spec + spec->ws[prevWeightIndex].bytes_of_weight = weightSize; + spec->ws[prevWeightIndex].weight = weight; + spec->ws[prevWeightIndex].bytes_of_vec = vecSize; + spec->ws[prevWeightIndex].vec = vec; + hasOptimized = true; + + setOperatorInvalid(spec, curOpIndex); + i = curOpIndex; + } + } + return hasOptimized; + } +}; +#endif diff --git a/model_tools/include/OPOptimizers/GeluOptimizer.hpp b/model_tools/include/OPOptimizers/GeluOptimizer.hpp new file mode 100644 index 00000000..71e62789 --- /dev/null +++ b/model_tools/include/OPOptimizers/GeluOptimizer.hpp @@ -0,0 +1,74 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
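+// GeluOptimizer pattern-matches the five-operator erf expansion of GELU
+// (Scale, Scale, Erf, Scale, Eltwise-mul) and collapses it into one OT_Gelu.
+// The identity being recognized, as a standalone reference (illustrative, not
+// Bolt API):
+#include <cmath>
+static inline float gelu_reference(float x)
+{
+    // gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
+    return 0.5f * x * (1.0f + std::erf(x * 0.70710678f));
+}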
+ +#ifndef _H_GeluOPTIMIZER +#define _H_GeluOPTIMIZER + +#include "model_tools.h" +#include "OPOptimizer.hpp" + +class GeluOptimizer : public OPOptimizer { + bool optimize(ModelSpec *spec) override + { + bool hasOptimized = false; + for (int i = 0; i < spec->num_operator_specs; i++) { + if (spec->ops[i].type == OT_Erf) { + int erfIndex = i; + int firMulIndex = erfIndex - 2; + int divIndex = erfIndex - 1; + int AddIndex = erfIndex + 1; + int secMulIndex = erfIndex + 2; + + if (spec->ops[firMulIndex].type == OT_Scale) { + if (spec->ops[divIndex].type == OT_Scale) { + if (spec->ops[AddIndex].type == OT_Scale) { + if (spec->ops[secMulIndex].type == OT_Eltwise) { + spec->ops[secMulIndex].num_inputs = 1; + free(spec->ops[secMulIndex].input_tensors_name[1]); + memcpy(spec->ops[secMulIndex].input_tensors_name[0], + spec->ops[divIndex].input_tensors_name[0], NAME_LEN); + spec->ops[secMulIndex].type = OT_Gelu; + setOperatorInvalid(spec, firMulIndex); + setOperatorInvalid(spec, divIndex); + setOperatorInvalid(spec, erfIndex); + setOperatorInvalid(spec, AddIndex); + + int firMulWeightIndex = + searchWeightIndex(spec, spec->ops[firMulIndex].name); + spec->ws[firMulWeightIndex].bytes_of_weight = 0; + delete spec->ws[firMulWeightIndex].weight; + spec->ws[firMulWeightIndex].weight = nullptr; + + int divWeightIndex = + searchWeightIndex(spec, spec->ops[divIndex].name); + spec->ws[divWeightIndex].bytes_of_weight = 0; + delete spec->ws[divWeightIndex].weight; + spec->ws[divWeightIndex].weight = nullptr; + + int AddWeightIndex = + searchWeightIndex(spec, spec->ops[AddIndex].name); + spec->ws[AddWeightIndex].bytes_of_vec = 0; + delete spec->ws[AddWeightIndex].vec; + spec->ws[AddWeightIndex].vec = nullptr; + + hasOptimized = true; + } + } + } + } + } + } + return hasOptimized; + } +}; +#endif diff --git a/model_tools/include/OPOptimizers/InPlaceOptimizer.hpp b/model_tools/include/OPOptimizers/InPlaceOptimizer.hpp new file mode 100644 index 00000000..aed8948c --- /dev/null +++ b/model_tools/include/OPOptimizers/InPlaceOptimizer.hpp @@ -0,0 +1,156 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
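+// InPlaceOptimizer lets purely element-wise operators (Relu, Sigmoid, Power,
+// ...) overwrite their input: when the input tensor has exactly one consumer,
+// the producer's output is renamed to the activation's output so the runtime
+// binds both names to one buffer. The aliasing is safe because each element is
+// read and written independently, e.g. (illustrative, not Bolt API):
+static inline void relu_in_place(float *x, int n)
+{
+    // Reading x[i] and writing x[i] in the same step never disturbs another
+    // element, which is what makes reusing the input buffer legal.
+    for (int i = 0; i < n; i++) {
+        x[i] = x[i] > 0 ? x[i] : 0;
+    }
+}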
+ +#ifndef _H_INPLACEOPTIMIZER +#define _H_INPLACEOPTIMIZER + +#include +#include +#include +#include "model_tools.h" +#include "OPOptimizer.hpp" + +class InPlaceOptimizer : public OPOptimizer { + bool optimize(ModelSpec *spec) override + { + bool hasOptimized = false; + std::vector unrepeatedInputNames; + std::vector repeated; + + // Insert pass + for (int i = 0; i < spec->num_operator_specs; i++) { + if (isInPlaceOp(spec->ops[i].type)) { + CHECK_REQUIREMENT(1 == spec->ops[i].num_inputs); + std::string inputName = spec->ops[i].input_tensors_name[0]; + if (find(unrepeatedInputNames.begin(), unrepeatedInputNames.end(), inputName) == + unrepeatedInputNames.end()) { + unrepeatedInputNames.push_back(inputName); + } else { + repeated.push_back(inputName); + } + } + } + + for (std::string name : repeated) { + std::vector::iterator it = + find(unrepeatedInputNames.begin(), unrepeatedInputNames.end(), name); + if (it != unrepeatedInputNames.end()) { + unrepeatedInputNames.erase(it); + } + } + + // Erase pass + for (int i = 0; i < spec->num_operator_specs; i++) { + if (OT_None == spec->ops[i].type || isInPlaceOp(spec->ops[i].type)) { + continue; + } + + for (U32 j = 0; j < spec->ops[i].num_inputs; j++) { + std::string inputName = spec->ops[i].input_tensors_name[j]; + std::vector::iterator it = + find(unrepeatedInputNames.begin(), unrepeatedInputNames.end(), inputName); + if (it != unrepeatedInputNames.end()) { + unrepeatedInputNames.erase(it); + } + } + } + + for (int i = spec->num_operator_specs - 1; i >= 1; i--) { + if (isInPlaceOp(spec->ops[i].type)) { + CHECK_REQUIREMENT(spec->ops[i].num_inputs == 1); + std::string inputName = spec->ops[i].input_tensors_name[0]; + if (find(unrepeatedInputNames.begin(), unrepeatedInputNames.end(), inputName) == + unrepeatedInputNames.end()) { + // Input is used multiple times, so should not be in-place + continue; + } + + std::vector modelOutputs = searchString( + spec->output_names, spec->num_outputs, spec->ops[i].input_tensors_name[0]); + if (modelOutputs.size() > 0) { + continue; + } + CHECK_REQUIREMENT(spec->ops[i].num_outputs == 1); + str_copy(spec->ops[i].input_tensors_name[0], spec->ops[i].output_tensors_name[0], + NAME_LEN); + hasOptimized = true; + + I32 found = 0; + for (int j = i - 1; j >= 0; j--) { + if (spec->ops[j].type != OT_None) { + for (U32 k = 0; k < spec->ops[j].num_outputs; k++) { + std::string prevOutputName = spec->ops[j].output_tensors_name[k]; + if (prevOutputName == inputName) { + str_copy(spec->ops[j].output_tensors_name[k], + spec->ops[i].input_tensors_name[0], NAME_LEN); + found = 1; + break; + } + } + } + if (1 == found) { + break; + } + } + + if (0 == found) { + std::string newName = spec->ops[i].input_tensors_name[0]; + UNI_WARNING_LOG("in-place tensor seems to be model input: %s. 
New name: %s\n", + inputName.c_str(), newName.c_str()); + } + } + } + return hasOptimized; + } + +public: + static bool isInPlaceOp(OperatorType opType) + { + bool ret = false; + switch (opType) { + case OT_Relu: { + ret = true; + break; + } + case OT_Relu6: { + ret = true; + break; + } + case OT_HSwish: { + ret = true; + break; + } + case OT_HSigmoid: { + ret = true; + break; + } + case OT_Sigmoid: { + ret = true; + break; + } + case OT_Gelu: { + ret = true; + break; + } + case OT_Power: { + ret = true; + break; + } + default: { + break; + } + } + return ret; + } +}; +#endif diff --git a/model_tools/include/OPOptimizers/InnerProductOptimizer.hpp b/model_tools/include/OPOptimizers/InnerProductOptimizer.hpp new file mode 100644 index 00000000..825a5728 --- /dev/null +++ b/model_tools/include/OPOptimizers/InnerProductOptimizer.hpp @@ -0,0 +1,72 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
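+// InnerProductOptimizer folds a bias-only Scale that follows a bias-free FC
+// into the FC itself, since (W * x) + b is just a fully-connected layer with
+// bias b. Reference form of the folded computation (illustrative, not Bolt
+// API):
+static void fc_with_folded_bias(const float *W, const float *b, const float *x,
+    int m, int k, float *y)
+{
+    for (int i = 0; i < m; i++) {
+        float acc = b[i];  // the Scale's add-vector becomes the FC bias
+        for (int j = 0; j < k; j++) {
+            acc += W[i * k + j] * x[j];
+        }
+        y[i] = acc;
+    }
+}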
+ +#ifndef _H_InnerProductOPTIMIZER +#define _H_InnerProductOPTIMIZER + +#include +#include "model_tools.h" +#include "OPOptimizer.hpp" + +class InnerProductOptimizer : public OPOptimizer { + bool optimize(ModelSpec *spec) override + { + bool hasOptimized = false; + for (int i = 0; i < spec->num_operator_specs - 1; i++) { + if (spec->ops[i].type == OT_FC && spec->ops[i + 1].type == OT_Scale) { + int firScaleIndex = i; + int secScaleIndex = i + 1; + std::string firScaleOutput = spec->ops[firScaleIndex].output_tensors_name[0]; + std::string secScaleInput = spec->ops[secScaleIndex].input_tensors_name[0]; + if (spec->ops[firScaleIndex].num_inputs == 1 && + spec->ops[firScaleIndex].num_outputs == 1) { + if (spec->ops[secScaleIndex].num_inputs == 1 && + spec->ops[secScaleIndex].num_outputs == 1) { + if (firScaleOutput == secScaleInput) { + int firScaleWeightIndex = + searchWeightIndex(spec, spec->ops[firScaleIndex].name); + int secScaleWeightIndex = + searchWeightIndex(spec, spec->ops[secScaleIndex].name); + if (spec->ws[firScaleWeightIndex].bytes_of_weight != 0 && + spec->ws[firScaleWeightIndex].bytes_of_vec == 0 && + spec->ws[secScaleWeightIndex].bytes_of_weight == 0 && + spec->ws[secScaleWeightIndex].bytes_of_vec != 0) { + if (spec->ops[firScaleIndex].ps.fc_spec.num_outputs * sizeof(float) != + spec->ws[secScaleWeightIndex].bytes_of_vec) { + continue; + } + spec->ws[firScaleWeightIndex].bytes_of_vec = + spec->ws[secScaleWeightIndex].bytes_of_vec; + U8 *ln_vec = (U8 *)mt_new_storage( + spec->ws[secScaleWeightIndex].bytes_of_vec); + memcpy(ln_vec, spec->ws[secScaleWeightIndex].vec, + spec->ws[secScaleWeightIndex].bytes_of_vec); + spec->ws[firScaleWeightIndex].vec = ln_vec; + + spec->ws[secScaleWeightIndex].bytes_of_vec = 0; + delete spec->ws[secScaleWeightIndex].vec; + spec->ws[secScaleWeightIndex].vec = nullptr; + memcpy(spec->ops[firScaleIndex].output_tensors_name[0], + spec->ops[secScaleIndex].output_tensors_name[0], NAME_LEN); + setOperatorInvalid(spec, secScaleIndex); + hasOptimized = true; + } + } + } + } + } + } + return hasOptimized; + } +}; +#endif diff --git a/model_tools/include/OPOptimizers/InvariantSliceOptimizer.hpp b/model_tools/include/OPOptimizers/InvariantSliceOptimizer.hpp new file mode 100644 index 00000000..f2d4a391 --- /dev/null +++ b/model_tools/include/OPOptimizers/InvariantSliceOptimizer.hpp @@ -0,0 +1,45 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
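+// InvariantSliceOptimizer drops a Slice whose slice_size is 0, i.e. one that
+// forwards its input unchanged. The removal is a pure rename: the producer
+// adopts the Slice's output name so downstream consumers still resolve. A toy
+// sketch with std::string standing in for Bolt's fixed-length name buffers
+// (illustrative, not Bolt API):
+#include <string>
+static inline void bypass_identity_op(std::string &producerOutput,
+    const std::string &identityOpOutput)
+{
+    // After this, tensors that referenced the identity op's output read
+    // directly from the producer, and the op can be marked invalid.
+    producerOutput = identityOpOutput;
+}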
+ +#ifndef _H_InvariantSliceOPTIMIZER +#define _H_InvariantSliceOPTIMIZER + +#include +#include +#include +#include +#include "model_tools.h" +#include "OPOptimizer.hpp" + +class InvariantSliceOptimizer : public OPOptimizer { + bool optimize(ModelSpec *spec) override + { + // const int queryNum = 3; + // OperatorType queryOps[queryNum] = {OT_Conv, OT_FC, OT_Deconvolution}; + bool hasOptimized = false; + for (int i = 1; i < spec->num_operator_specs; i++) { + if (spec->ops[i].type == OT_Slice) { + int sliceOpIndex = i; + int curSliceSize = spec->ops[sliceOpIndex].ps.slice_spec.slice_size; + if (curSliceSize == 0) { + memcpy(spec->ops[sliceOpIndex - 1].output_tensors_name[0], + spec->ops[sliceOpIndex].output_tensors_name[0], NAME_LEN); + setOperatorInvalid(spec, sliceOpIndex); + hasOptimized = true; + } + } + } + return hasOptimized; + } +}; +#endif diff --git a/model_tools/include/OPOptimizers/LayerNormOptimizer.hpp b/model_tools/include/OPOptimizers/LayerNormOptimizer.hpp new file mode 100644 index 00000000..8f5463ad --- /dev/null +++ b/model_tools/include/OPOptimizers/LayerNormOptimizer.hpp @@ -0,0 +1,148 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
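+// LayerNormOptimizer recognizes the unrolled layer-normalization subgraph
+// (two ReduceMean, two Sub, Pow(2), Sqrt, Scale, Div, then the Mul/Add pair
+// carrying gamma and beta) and rewrites the trailing Mul into one
+// OT_LayerNorm. The formula being matched, as a standalone reference
+// (illustrative; eps is an assumed stabilizer, not taken from the pass):
+#include <cmath>
+static void layer_norm_reference(const float *x, const float *gamma,
+    const float *beta, int n, float *y, float eps = 1e-5f)
+{
+    float mean = 0, var = 0;
+    for (int i = 0; i < n; i++) mean += x[i];
+    mean /= n;
+    for (int i = 0; i < n; i++) var += (x[i] - mean) * (x[i] - mean);
+    var /= n;
+    float inv = 1.0f / std::sqrt(var + eps);
+    // y = gamma * (x - mean) / sqrt(var + eps) + beta
+    for (int i = 0; i < n; i++) y[i] = gamma[i] * (x[i] - mean) * inv + beta[i];
+}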
+ +#ifndef _H_LayerNormOPTIMIZER +#define _H_LayerNormOPTIMIZER + +#include +#include +#include "model_tools.h" +#include "OPOptimizer.hpp" + +class LayerNormOptimizer : public OPOptimizer { + bool optimize(ModelSpec *spec) override + { + bool hasOptimized = false; + for (int i = 0; i < spec->num_operator_specs; i++) { + if (spec->ops[i].type == OT_Power && spec->ops[i].ps.power_spec.power == 2) { + int powOpIndex = i; + int backAddIndex = -1; + for (int j = powOpIndex - 1; j >= 0; j--) { + if (spec->ops[j].type == OT_Eltwise) { + if (spec->ops[j].ps.eltwise_spec.elt_mode == ELTWISE_SUM) { + backAddIndex = j; + break; + } else { + continue; + } + } + } + + if (backAddIndex == -1) { + continue; + } + + int forDivIndex = -1; + for (int j = powOpIndex + 1; j < spec->num_operator_specs; j++) { + if (spec->ops[j].type == OT_Eltwise) { + if (spec->ops[j].ps.eltwise_spec.elt_mode == ELTWISE_DIV) { + forDivIndex = j; + break; + } else { + continue; + } + } + } + + if (forDivIndex == -1) { + continue; + } + + std::map info_map; + bool tag = true; + for (int k = backAddIndex + 1; k < forDivIndex; k++) { + std::string tmp_str = ""; + if (spec->ops[k].type == OT_Pooling) { + tmp_str = "ReduceMean"; + } else if (spec->ops[k].type == OT_Eltwise) { + tmp_str = "Sub"; + } else if (spec->ops[k].type == OT_Scale) { + tmp_str = "Scale"; + } else if (spec->ops[k].type == OT_Power && + spec->ops[k].ps.power_spec.power == 2) { + tmp_str = "Pow"; + } else if (spec->ops[k].type == OT_Power && + spec->ops[k].ps.power_spec.power == 0.5) { + tmp_str = "Sqrt"; + } else { + tag = false; + break; + } + + if (info_map.find(tmp_str) == info_map.end()) { + info_map[tmp_str] = 1; + } else { + info_map[tmp_str] = info_map[tmp_str] + 1; + } + } + + if (tag == false) { + continue; + } + + if (info_map["ReduceMean"] == 2 && info_map["Sub"] == 2 && info_map["Scale"] == 1 && + info_map["Pow"] == 1 && info_map["Sqrt"] == 1 && + spec->ops[forDivIndex + 1].type == OT_Scale && + spec->ops[forDivIndex + 2].type == OT_Scale) { + hasOptimized = true; + } else { + continue; + } + + int tailMulIndex = forDivIndex + 1; + int tailAddIndex = forDivIndex + 2; + spec->ops[tailMulIndex].type = OT_LayerNorm; + int tailMulWeightIndex = searchWeightIndex(spec, spec->ops[tailMulIndex].name); + int tailAddWeightIndex = searchWeightIndex(spec, spec->ops[tailAddIndex].name); + CHECK_REQUIREMENT(tailAddWeightIndex >= 0); + CHECK_REQUIREMENT(spec->ws[tailAddWeightIndex].mdt == DT_F32); + + spec->ws[tailMulWeightIndex].bytes_of_vec = + spec->ws[tailAddWeightIndex].bytes_of_vec; + U8 *ln_vec = (U8 *)mt_new_storage(spec->ws[tailAddWeightIndex].bytes_of_vec); + memcpy(ln_vec, spec->ws[tailAddWeightIndex].vec, + spec->ws[tailAddWeightIndex].bytes_of_vec); + spec->ws[tailMulWeightIndex].vec = ln_vec; + + if (spec->ws[tailAddWeightIndex].weight != nullptr) { + spec->ws[tailAddWeightIndex].bytes_of_weight = 0; + delete spec->ws[tailAddWeightIndex].weight; + spec->ws[tailAddWeightIndex].weight = nullptr; + } + + if (spec->ws[tailAddWeightIndex].vec != nullptr) { + spec->ws[tailAddWeightIndex].bytes_of_vec = 0; + delete spec->ws[tailAddWeightIndex].vec; + spec->ws[tailAddWeightIndex].vec = nullptr; + } + + memcpy(spec->ops[tailMulIndex].output_tensors_name[0], + spec->ops[tailAddIndex].output_tensors_name[0], NAME_LEN); + memcpy(spec->ops[backAddIndex].output_tensors_name[0], + spec->ops[tailMulIndex].input_tensors_name[0], NAME_LEN); + + for (int k = backAddIndex + 1; k <= forDivIndex; k++) { + setOperatorInvalid(spec, k); + } + setOperatorInvalid(spec, 
tailAddIndex); + + int AddScaleWeightIndex = searchWeightIndex(spec, spec->ops[forDivIndex - 2].name); + spec->ws[AddScaleWeightIndex].bytes_of_vec = 0; + delete spec->ws[AddScaleWeightIndex].vec; + spec->ws[AddScaleWeightIndex].vec = nullptr; + } + } + return hasOptimized; + } +}; +#endif diff --git a/model-tools/include/OPOptimizers/MemoryReuseOptimizer.hpp b/model_tools/include/OPOptimizers/MemoryReuseOptimizer.hpp similarity index 76% rename from model-tools/include/OPOptimizers/MemoryReuseOptimizer.hpp rename to model_tools/include/OPOptimizers/MemoryReuseOptimizer.hpp index a6d262bc..2d9982c0 100644 --- a/model-tools/include/OPOptimizers/MemoryReuseOptimizer.hpp +++ b/model_tools/include/OPOptimizers/MemoryReuseOptimizer.hpp @@ -1,29 +1,27 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- #ifndef _H_MEMORYREUSEOPTIMIZER #define _H_MEMORYREUSEOPTIMIZER #include #include -#include #include #include "model_tools.h" #include "OPOptimizer.hpp" -class MemoryReuseOptimizer: public OPOptimizer { - virtual bool optimize(ModelSpec* spec) override +class MemoryReuseOptimizer : public OPOptimizer { + bool optimize(ModelSpec *spec) override { bool hasOptimized = false; std::map endOfLife; @@ -31,11 +29,10 @@ class MemoryReuseOptimizer: public OPOptimizer { for (i = 0; i < spec->num_operator_specs; i++) { if (OT_None != spec->ops[i].type) { if (OT_Repeat == spec->ops[i].type) { - int loopEnd = i; std::string startName = spec->ops[i].input_tensors_name[0]; - int loopStart = searchOperatorIndexByOutput(spec, startName); - CHECK_REQUIREMENT(-1 != loopStart); - loops.push_back(std::make_pair(loopStart, loopEnd)); + int loopStart = searchOperatorIndexByName(spec, startName); + CHECK_REQUIREMENT(loopStart >= 0); + loops.push_back(std::make_pair(loopStart, i)); continue; } @@ -63,16 +60,32 @@ class MemoryReuseOptimizer: public OPOptimizer { } // model inputs should not overwrite each other + // if model input is used within a loop, the space should not be reused for (int j = 0; j < spec->num_inputs; j++) { std::string inputName = spec->input_names[j]; - allocate(inputName, endOfLife[inputName], 0); + int lastId = endOfLife[inputName]; + + bool loopExternal = false; + for (auto loop : loops) { + int loopStart = std::get<0>(loop); + int loopEnd = std::get<1>(loop); + if (lastId >= loopStart && lastId <= loopEnd) { + loopExternal = true; + } + } + if (loopExternal) { + aliveTensors.insert(std::make_pair(inputName, -1)); + } else { + allocate(inputName, endOfLife[inputName], 0); + } } for (int i = 0; i < spec->num_operator_specs; i++) { if (OT_None != spec->ops[i].type) { U32 numInputs = spec->ops[i].num_inputs; U32 numOutputs = spec->ops[i].num_outputs; - spec->ops[i].tensor_positions = (I32*)mt_new_storage((numInputs + numOutputs) * bytesOf(DT_I32)); + spec->ops[i].tensor_positions = + (I32 *)mt_new_storage((numInputs + numOutputs) * bytesOf(DT_I32)); std::vector> layerTensors; @@ -111,7 +124,8 @@ class MemoryReuseOptimizer: public OPOptimizer { for (auto loop : loops) { int loopStart = std::get<0>(loop); int loopEnd = std::get<1>(loop); - if (lastId >= loopStart && lastId <= loopEnd && (i < loopStart || i > loopEnd)) { + if (lastId >= loopStart && lastId <= loopEnd && + (i < loopStart || i > loopEnd)) { loopExternal = true; } } @@ -127,10 +141,10 @@ class MemoryReuseOptimizer: public OPOptimizer { } // Sort the unallocated tensors according to their death time - sort(layerTensors.begin(), layerTensors.end(), [=](std::tuple a, std::tuple b) - { - return std::get<1>(a) > std::get<1>(b); - }); + sort(layerTensors.begin(), layerTensors.end(), + [=](std::tuple a, std::tuple b) { + return std::get<1>(a) > std::get<1>(b); + }); for (auto tuple : layerTensors) { std::string tensorName = std::get<0>(tuple); @@ -142,14 +156,15 @@ class MemoryReuseOptimizer: public OPOptimizer { #ifdef _DEBUG for (U32 j = 0; j < spec->ops[i].num_inputs; j++) { std::string inputName = spec->ops[i].input_tensors_name[j]; - std::cout << "Input Tensor " << inputName << " at " << spec->ops[i].tensor_positions[j] << std::endl; + UNI_DEBUG_LOG("Input Tensor %s at %d\n", inputName.c_str(), + spec->ops[i].tensor_positions[j]); } for (U32 j = 0; j < spec->ops[i].num_outputs; j++) { std::string outputName = spec->ops[i].output_tensors_name[j]; - std::cout << "Output Tensor " << outputName << " at " << 
spec->ops[i].tensor_positions[numInputs+j] << std::endl; + UNI_DEBUG_LOG("Output Tensor %s at %d\n", outputName.c_str(), + spec->ops[i].tensor_positions[numInputs + j]); } - std::cout << std::endl; #endif } } @@ -157,13 +172,13 @@ class MemoryReuseOptimizer: public OPOptimizer { } public: - private: std::vector> storages; std::map aliveTensors; - std::vector> loops; // If a tensor used in a loop is produced outside, it should not be overwritten + std::vector> + loops; // If a tensor used in a loop is produced outside, it should not be overwritten I32 allocate(std::string tensorName, int deathTime, int curID) { @@ -182,11 +197,16 @@ class MemoryReuseOptimizer: public OPOptimizer { return pos; } - bool isOPtoBypass(OperatorType ot) { + bool isOPtoBypass(OperatorType ot) + { char *environmentSetting = getenv("BOLT_MEMORY_REUSE_OPTIMIZATION"); - bool memoryReuse = (environmentSetting != NULL && std::string(environmentSetting) == std::string("OFF")) ? false : true; - if (! memoryReuse) + bool memoryReuse = + (environmentSetting != NULL && std::string(environmentSetting) == std::string("OFF")) + ? false + : true; + if (!memoryReuse) { return true; + } switch (ot) { case OT_None: { return true; diff --git a/model_tools/include/OPOptimizers/MultiHeadAttentionOptimizer.hpp b/model_tools/include/OPOptimizers/MultiHeadAttentionOptimizer.hpp new file mode 100644 index 00000000..f174ea39 --- /dev/null +++ b/model_tools/include/OPOptimizers/MultiHeadAttentionOptimizer.hpp @@ -0,0 +1,478 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
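+// MultiHeadAttentionOptimizer walks upstream from a LayerNorm through the
+// Q/K/V projections, their Reshape/Transpose pairs, two MatMuls, the Softmax,
+// and both residual Eltwise adds, then fuses the whole block into a single
+// OT_MultiHeadAttention whose weight buffer concatenates all constituent
+// weights. The core computation being matched, single head with illustrative
+// sizes (not Bolt API):
+#include <cmath>
+static void attention_core(const float *Q, const float *K, const float *V,
+    int n, int d, float *scores /* n*n scratch */, float *out /* n*d */)
+{
+    float scale = 1.0f / std::sqrt((float)d);
+    for (int i = 0; i < n; i++) {
+        // scores row i: Q_i . K_j, scaled, then softmax-normalized
+        float maxv = -1e30f, sum = 0;
+        for (int j = 0; j < n; j++) {
+            float s = 0;
+            for (int t = 0; t < d; t++) s += Q[i * d + t] * K[j * d + t];
+            scores[i * n + j] = s * scale;
+            if (scores[i * n + j] > maxv) maxv = scores[i * n + j];
+        }
+        for (int j = 0; j < n; j++) {
+            scores[i * n + j] = std::exp(scores[i * n + j] - maxv);
+            sum += scores[i * n + j];
+        }
+        // out row i: softmax(QK^T / sqrt(d)) * V
+        for (int t = 0; t < d; t++) {
+            float acc = 0;
+            for (int j = 0; j < n; j++) acc += scores[i * n + j] * V[j * d + t];
+            out[i * d + t] = acc / sum;
+        }
+    }
+}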
+
+#ifndef _H_MultiHeadAttentionOPTIMIZER
+#define _H_MultiHeadAttentionOPTIMIZER
+
+#include <map>
+#include <string>
+#include <utility>
+#include "model_tools.h"
+#include "OPOptimizer.hpp"
+
+class MultiHeadAttentionOptimizer : public OPOptimizer {
+    int upstreamOpIndex(ModelSpec *spec, int curIndex, std::string curInputName)
+    {
+        for (int i = curIndex - 1; i >= 0; i--) {
+            if (spec->ops[i].type == OT_None) {
+                continue;
+            }
+            if (spec->ops[i].num_outputs == 1) {
+                std::string curOutputName = spec->ops[i].output_tensors_name[0];
+                if (curInputName.compare(curOutputName) == 0) {
+                    return i;
+                }
+            }
+        }
+        return -1;
+    }
+
+    bool optimize(ModelSpec *spec) override
+    {
+        bool hasOptimized = false;
+
+        std::map<OperatorType, int> firLoopMap;
+        std::map<OperatorType, int> secLoopMap;
+        int firEltIndex = -1;
+        int secEltIndex = -1;
+        for (int i = 1; i < spec->num_operator_specs; i++) {
+            bool firstBoolTag = false;
+            bool secBoolTag = false;
+
+            int backIpIndex1 = -1;
+            int backIpIndex2 = -1;
+            int backLayerNorm1Index = -1;
+            ActivationMode globalActi = ACTIVATION_RELU;
+
+            int backIpIndex = -1;
+            int backReshapeIndex = -1;
+            int leftReshapeIndex = -1;
+            int midReshapeIndex = -1;
+            int rightReshapeIndex = -1;
+            int leftIpIndex = -1;
+            int midIpIndex = -1;
+            int rightIpIndex = -1;
+            int globalPowerIndex = -1;
+
+            if (spec->ops[i].type == OT_LayerNorm) {
+                int layerNormOpIndex = i;
+                std::string lnOutput = spec->ops[layerNormOpIndex].output_tensors_name[0];
+                for (int j = i + 1; j < spec->num_operator_specs; j++) {
+                    if (spec->ops[j].type == OT_None) {
+                        continue;
+                    }
+                    if (secEltIndex != -1) {
+                        break;
+                    }
+                    if (spec->ops[j].type == OT_Eltwise && firEltIndex == -1) {
+                        firEltIndex = j;
+                        continue;
+                    }
+
+                    if (spec->ops[j].type == OT_Eltwise && firEltIndex != -1) {
+                        secEltIndex = j;
+                        continue;
+                    }
+
+                    if (firEltIndex == -1) {
+                        if (firLoopMap.find(spec->ops[j].type) == firLoopMap.end()) {
+                            firLoopMap.insert(std::pair<OperatorType, int>(spec->ops[j].type, 1));
+                        } else {
+                            firLoopMap[spec->ops[j].type] = firLoopMap[spec->ops[j].type] + 1;
+                        }
+                    } else {
+                        if (secLoopMap.find(spec->ops[j].type) == secLoopMap.end()) {
+                            secLoopMap.insert(std::pair<OperatorType, int>(spec->ops[j].type, 1));
+                        } else {
+                            secLoopMap[spec->ops[j].type] = secLoopMap[spec->ops[j].type] + 1;
+                        }
+                    }
+                }
+
+                int secLoopMap_size = 0;
+                std::map<OperatorType, int>::iterator secLoop_iter;
+                secLoop_iter = secLoopMap.begin();
+                while (secLoop_iter != secLoopMap.end()) {
+                    secLoopMap_size += secLoop_iter->second;
+                    secLoop_iter++;
+                }
+                if (secLoopMap_size == 4) {
+                    std::string secEltInput0 = spec->ops[secEltIndex].input_tensors_name[0];
+                    std::string secEltInput1 = spec->ops[secEltIndex].input_tensors_name[1];
+
+                    std::string secLoopEltPreFcName = "";
+                    std::string secLoopEltPreNotFcName = "";
+                    int secLoopEltPreIndex0 = upstreamOpIndex(spec, secEltIndex, secEltInput0);
+                    if (spec->ops[secLoopEltPreIndex0].type == OT_FC) {
+                        secLoopEltPreFcName = secEltInput0;
+                        secLoopEltPreNotFcName = secEltInput1;
+                    } else {
+                        secLoopEltPreFcName = secEltInput1;
+                        secLoopEltPreNotFcName = secEltInput0;
+                    }
+
+                    backIpIndex1 = upstreamOpIndex(spec, secEltIndex, secLoopEltPreFcName);
+                    if (!(backIpIndex1 != -1 && spec->ops[backIpIndex1].type == OT_FC)) {
+                        continue;
+                    }
+
+                    std::string backIpIndex1Input = spec->ops[backIpIndex1].input_tensors_name[0];
+                    int actiIndex = upstreamOpIndex(spec, backIpIndex1, backIpIndex1Input);
+                    if (actiIndex != -1) {
+                        if (spec->ops[actiIndex].type == OT_Relu) {
+                            globalActi = ACTIVATION_RELU;
+                        } else if (spec->ops[actiIndex].type == OT_Gelu) {
+                            globalActi = ACTIVATION_GELU;
+                        } else {
+                            continue;
+                        }
+                    } else {
+                        continue;
+                    }
+
+                    std::string actiInput = spec->ops[actiIndex].input_tensors_name[0];
+                    backIpIndex2 = upstreamOpIndex(spec, actiIndex, actiInput);
+                    if (!(backIpIndex2 != -1 && spec->ops[backIpIndex2].type == OT_FC)) {
+                        continue;
+                    }
+
+                    std::string backIpIndex2Input = spec->ops[backIpIndex2].input_tensors_name[0];
+                    backLayerNorm1Index = upstreamOpIndex(spec, backIpIndex2, backIpIndex2Input);
+                    if (!(backLayerNorm1Index != -1 &&
+                            spec->ops[backLayerNorm1Index].type == OT_LayerNorm)) {
+                        continue;
+                    }
+
+                    std::string firstEltOutput = spec->ops[firEltIndex].output_tensors_name[0];
+                    std::string backLayerNorm1Output =
+                        spec->ops[backLayerNorm1Index].output_tensors_name[0];
+                    if (secLoopEltPreNotFcName.compare(firstEltOutput) == 0) {
+                        secBoolTag = false;
+                    } else if (secLoopEltPreNotFcName.compare(backLayerNorm1Output) == 0) {
+                        secBoolTag = true;
+                    } else {
+                        continue;
+                    }
+
+                } else {
+                    continue;
+                }
+
+                int firLoopMap_size = 0;
+                std::map<OperatorType, int>::iterator firLoop_iter;
+                firLoop_iter = firLoopMap.begin();
+                while (firLoop_iter != firLoopMap.end()) {
+                    firLoopMap_size += firLoop_iter->second;
+                    firLoop_iter++;
+                }
+                if (firLoopMap_size == 16) {
+                    std::string firstEltInput0 = spec->ops[firEltIndex].input_tensors_name[0];
+                    std::string firstEltInput1 = spec->ops[firEltIndex].input_tensors_name[1];
+
+                    std::string firLoopEltPreFcName = "";
+                    std::string firLoopEltPreNotFcName = "";
+                    int firLoopEltPreIndex0 = upstreamOpIndex(spec, firEltIndex, firstEltInput0);
+                    if (spec->ops[firLoopEltPreIndex0].type == OT_FC) {
+                        firLoopEltPreFcName = firstEltInput0;
+                        firLoopEltPreNotFcName = firstEltInput1;
+                    } else {
+                        firLoopEltPreFcName = firstEltInput1;
+                        firLoopEltPreNotFcName = firstEltInput0;
+                    }
+
+                    backIpIndex = upstreamOpIndex(spec, firEltIndex, firLoopEltPreFcName);
+                    if (!(backIpIndex != -1 && spec->ops[backIpIndex].type == OT_FC)) {
+                        continue;
+                    }
+
+                    std::string backIpInput = spec->ops[backIpIndex].input_tensors_name[0];
+                    backReshapeIndex = upstreamOpIndex(spec, backIpIndex, backIpInput);
+                    if (!(backReshapeIndex != -1 && spec->ops[backReshapeIndex].type == OT_Reshape)) {
+                        continue;
+                    }
+
+                    std::string backReshapeInput = spec->ops[backReshapeIndex].input_tensors_name[0];
+                    int backTransposeIndex =
+                        upstreamOpIndex(spec, backReshapeIndex, backReshapeInput);
+                    if (!(backTransposeIndex != -1 &&
+                            spec->ops[backTransposeIndex].type == OT_Transpose)) {
+                        continue;
+                    }
+
+                    std::string backTransposeInput =
+                        spec->ops[backTransposeIndex].input_tensors_name[0];
+                    int backMatmulIndex1 =
+                        upstreamOpIndex(spec, backTransposeIndex, backTransposeInput);
+                    if (!(backMatmulIndex1 != -1 && spec->ops[backMatmulIndex1].type == OT_MatMul)) {
+                        continue;
+                    }
+
+                    std::string backMatmulIndex1Input0 =
+                        spec->ops[backMatmulIndex1].input_tensors_name[0];
+                    std::string backMatmulIndex1Input1 =
+                        spec->ops[backMatmulIndex1].input_tensors_name[1];
+                    int leftTransposeIndex =
+                        upstreamOpIndex(spec, backMatmulIndex1, backMatmulIndex1Input1);
+                    if (!(leftTransposeIndex != -1 &&
+                            spec->ops[leftTransposeIndex].type == OT_Transpose)) {
+                        continue;
+                    }
+
+                    std::string leftTransposeInput =
+                        spec->ops[leftTransposeIndex].input_tensors_name[0];
+                    leftReshapeIndex = upstreamOpIndex(spec, leftTransposeIndex, leftTransposeInput);
+                    if (!(leftReshapeIndex != -1 && spec->ops[leftReshapeIndex].type == OT_Reshape)) {
+                        continue;
+                    }
+
+                    std::string leftReshapeInput = spec->ops[leftReshapeIndex].input_tensors_name[0];
+                    leftIpIndex = upstreamOpIndex(spec, leftReshapeIndex, leftReshapeInput);
+                    if (!(leftIpIndex != -1 && spec->ops[leftIpIndex].type == OT_FC)) {
+                        continue;
+                    }
+
+                    int softmaxIndex =
+                        upstreamOpIndex(spec, backMatmulIndex1, backMatmulIndex1Input0);
+                    if (!(softmaxIndex != -1 && spec->ops[softmaxIndex].type == OT_Softmax)) {
+                        continue;
+                    }
+
+                    std::string softmaxInput = spec->ops[softmaxIndex].input_tensors_name[0];
+                    int preSoftmaxIndex = upstreamOpIndex(spec, softmaxIndex, softmaxInput);
+                    int backMatmulIndex2 = -1;
+                    if (preSoftmaxIndex != -1 &&
+                        (spec->ops[preSoftmaxIndex].type == OT_Scale ||
+                            spec->ops[preSoftmaxIndex].type == OT_Power)) {
+                        globalPowerIndex = preSoftmaxIndex;
+                        std::string scaleInput = spec->ops[preSoftmaxIndex].input_tensors_name[0];
+                        backMatmulIndex2 = upstreamOpIndex(spec, preSoftmaxIndex, scaleInput);
+                    } else if (preSoftmaxIndex != -1 &&
+                        spec->ops[preSoftmaxIndex].type == OT_MatMul) {
+                        backMatmulIndex2 = preSoftmaxIndex;
+                    } else {
+                        continue;
+                    }
+                    if (!(backMatmulIndex2 != -1 && spec->ops[backMatmulIndex2].type == OT_MatMul)) {
+                        continue;
+                    }
+
+                    std::string backMatmulIndex2Input0 =
+                        spec->ops[backMatmulIndex2].input_tensors_name[0];
+                    int midBackOpIndex =
+                        upstreamOpIndex(spec, backMatmulIndex2, backMatmulIndex2Input0);
+                    int midTransposeIndex = -1;
+                    if (midBackOpIndex != -1 &&
+                        (spec->ops[midBackOpIndex].type == OT_Scale ||
+                            spec->ops[midBackOpIndex].type == OT_Power)) {
+                        globalPowerIndex = midBackOpIndex;
+                        std::string scaleInput = spec->ops[midBackOpIndex].input_tensors_name[0];
+                        midTransposeIndex = upstreamOpIndex(spec, midBackOpIndex, scaleInput);
+                    } else if (midBackOpIndex != -1 &&
+                        spec->ops[midBackOpIndex].type == OT_Transpose) {
+                        midTransposeIndex = midBackOpIndex;
+                    }
+                    if (!(midTransposeIndex != -1 &&
+                            spec->ops[midTransposeIndex].type == OT_Transpose)) {
+                        continue;
+                    }
+
+                    std::string midTransposeInput =
+                        spec->ops[midTransposeIndex].input_tensors_name[0];
+                    midReshapeIndex = upstreamOpIndex(spec, midTransposeIndex, midTransposeInput);
+                    if (!(midReshapeIndex != -1 && spec->ops[midReshapeIndex].type == OT_Reshape)) {
+                        continue;
+                    }
+
+                    std::string midReshapeInput = spec->ops[midReshapeIndex].input_tensors_name[0];
+                    midIpIndex = upstreamOpIndex(spec, midReshapeIndex, midReshapeInput);
+                    if (!(midIpIndex != -1 && spec->ops[midIpIndex].type == OT_FC)) {
+                        continue;
+                    }
+
+                    std::string backMatmulIndex2Input1 =
+                        spec->ops[backMatmulIndex2].input_tensors_name[1];
+                    int rightBackOpIndex =
+                        upstreamOpIndex(spec, backMatmulIndex2, backMatmulIndex2Input1);
+                    int rightTransposeIndex = -1;
+                    if (rightBackOpIndex != -1 &&
+                        (spec->ops[rightBackOpIndex].type == OT_Scale ||
+                            spec->ops[rightBackOpIndex].type == OT_Power)) {
+                        globalPowerIndex = rightBackOpIndex;
+                        std::string scaleInput = spec->ops[rightBackOpIndex].input_tensors_name[0];
+                        rightTransposeIndex = upstreamOpIndex(spec, rightBackOpIndex, scaleInput);
+                    } else if (rightBackOpIndex != -1 &&
+                        spec->ops[rightBackOpIndex].type == OT_Transpose) {
+                        rightTransposeIndex = rightBackOpIndex;
+                    }
+                    if (!(rightTransposeIndex != -1 &&
+                            spec->ops[rightTransposeIndex].type == OT_Transpose)) {
+                        continue;
+                    }
+
+                    std::string rightTransposeInput =
+                        spec->ops[rightTransposeIndex].input_tensors_name[0];
+                    rightReshapeIndex =
+                        upstreamOpIndex(spec, rightTransposeIndex, rightTransposeInput);
+                    if (!(rightReshapeIndex != -1 &&
+                            spec->ops[rightReshapeIndex].type == OT_Reshape)) {
+                        continue;
+                    }
+
+                    std::string rightReshapeInput =
+                        spec->ops[rightReshapeIndex].input_tensors_name[0];
+                    rightIpIndex = upstreamOpIndex(spec, rightReshapeIndex, rightReshapeInput);
+                    if (!(rightIpIndex != -1 && spec->ops[rightIpIndex].type == OT_FC)) {
+                        continue;
+                    }
+
+                    std::string ln2ip0 = spec->ops[leftIpIndex].input_tensors_name[0];
+                    std::string ln2ip1 = spec->ops[midIpIndex].input_tensors_name[0];
+                    std::string ln2ip2 = spec->ops[rightIpIndex].input_tensors_name[0];
+                    if (ln2ip0.compare(ln2ip1) == 0 && ln2ip1.compare(ln2ip2) == 0) {
+                        if (firLoopEltPreNotFcName.compare(ln2ip0) == 0) {
+                            firstBoolTag = true;
+                        } else {
+                            firstBoolTag = false;
+                        }
+                    }
+
+                    MultiheadAttentionParamSpec multihead_spec;
+                    multihead_spec.fc_desc[0] = spec->ops[leftIpIndex].ps.fc_spec;
+                    multihead_spec.fc_desc[1] = spec->ops[midIpIndex].ps.fc_spec;
+                    multihead_spec.fc_desc[2] = spec->ops[rightIpIndex].ps.fc_spec;
+                    multihead_spec.fc_desc[3] = spec->ops[backIpIndex].ps.fc_spec;
+                    multihead_spec.fc_desc[4] = spec->ops[backIpIndex2].ps.fc_spec;
+                    multihead_spec.fc_desc[5] = spec->ops[backIpIndex1].ps.fc_spec;
+                    multihead_spec.power_spec = spec->ops[globalPowerIndex].ps.power_spec;
+                    multihead_spec.eltwiseWithLayerNormIn[0] = firstBoolTag;
+                    multihead_spec.eltwiseWithLayerNormIn[1] = secBoolTag;
+                    multihead_spec.actiMode = globalActi;
+                    multihead_spec.reshapeDesc[0] = spec->ops[leftReshapeIndex].ps.reshape_spec;
+                    multihead_spec.reshapeDesc[1] = spec->ops[midReshapeIndex].ps.reshape_spec;
+                    multihead_spec.reshapeDesc[2] = spec->ops[rightReshapeIndex].ps.reshape_spec;
+                    multihead_spec.reshapeDesc[3] = spec->ops[backReshapeIndex].ps.reshape_spec;
+                    multihead_spec.eltwiseDesc[0] = spec->ops[firEltIndex].ps.eltwise_spec;
+                    multihead_spec.eltwiseDesc[1] = spec->ops[secEltIndex].ps.eltwise_spec;
+
+                    spec->ops[layerNormOpIndex].type = OT_MultiHeadAttention;
+                    spec->ops[layerNormOpIndex].ps.multiheadAttention_spec = multihead_spec;
+
+                    int lnWeightIndex = searchWeightIndex(spec, spec->ops[layerNormOpIndex].name);
+                    int leftIpWeightIndex = searchWeightIndex(spec, spec->ops[leftIpIndex].name);
+                    int midIpWeightIndex = searchWeightIndex(spec, spec->ops[midIpIndex].name);
+                    int rightIpWeightIndex = searchWeightIndex(spec, spec->ops[rightIpIndex].name);
+                    int backIpWeightIndex = searchWeightIndex(spec, spec->ops[backIpIndex].name);
+                    int backLayerNorm1WeightIndex =
+                        searchWeightIndex(spec, spec->ops[backLayerNorm1Index].name);
+                    int backIpWeightIndex2 = searchWeightIndex(spec, spec->ops[backIpIndex2].name);
+                    int backIpWeightIndex1 = searchWeightIndex(spec, spec->ops[backIpIndex1].name);
+
+                    U32 weightSize = spec->ws[lnWeightIndex].bytes_of_weight +
+                        spec->ws[leftIpWeightIndex].bytes_of_weight +
+                        spec->ws[midIpWeightIndex].bytes_of_weight +
+                        spec->ws[rightIpWeightIndex].bytes_of_weight +
+                        spec->ws[backIpWeightIndex].bytes_of_weight +
+                        spec->ws[backLayerNorm1WeightIndex].bytes_of_weight +
+                        spec->ws[backIpWeightIndex2].bytes_of_weight +
+                        spec->ws[backIpWeightIndex1].bytes_of_weight;
+                    U32 biasSize = spec->ws[lnWeightIndex].bytes_of_vec +
+                        spec->ws[leftIpWeightIndex].bytes_of_vec +
+                        spec->ws[midIpWeightIndex].bytes_of_vec +
+                        spec->ws[rightIpWeightIndex].bytes_of_vec +
+                        spec->ws[backIpWeightIndex].bytes_of_vec +
+                        spec->ws[backLayerNorm1WeightIndex].bytes_of_vec +
+                        spec->ws[backIpWeightIndex2].bytes_of_vec +
+                        spec->ws[backIpWeightIndex1].bytes_of_vec;
+                    U8 *multihead_weight = (U8 *)mt_new_storage(weightSize);
+                    U8 *multihead_vec = (U8 *)mt_new_storage(biasSize);
+                    int weightOffset = 0;
+                    memcpy(&multihead_weight[weightOffset], spec->ws[lnWeightIndex].weight,
+                        spec->ws[lnWeightIndex].bytes_of_weight);
+                    weightOffset += spec->ws[lnWeightIndex].bytes_of_weight;
+                    memcpy(&multihead_weight[weightOffset],
spec->ws[leftIpWeightIndex].weight, + spec->ws[leftIpWeightIndex].bytes_of_weight); + weightOffset += spec->ws[leftIpWeightIndex].bytes_of_weight; + memcpy(&multihead_weight[weightOffset], spec->ws[midIpWeightIndex].weight, + spec->ws[midIpWeightIndex].bytes_of_weight); + weightOffset += spec->ws[midIpWeightIndex].bytes_of_weight; + memcpy(&multihead_weight[weightOffset], spec->ws[rightIpWeightIndex].weight, + spec->ws[rightIpWeightIndex].bytes_of_weight); + weightOffset += spec->ws[rightIpWeightIndex].bytes_of_weight; + memcpy(&multihead_weight[weightOffset], spec->ws[backIpWeightIndex].weight, + spec->ws[backIpWeightIndex].bytes_of_weight); + weightOffset += spec->ws[backIpWeightIndex].bytes_of_weight; + memcpy(&multihead_weight[weightOffset], + spec->ws[backLayerNorm1WeightIndex].weight, + spec->ws[backLayerNorm1WeightIndex].bytes_of_weight); + weightOffset += spec->ws[backLayerNorm1WeightIndex].bytes_of_weight; + memcpy(&multihead_weight[weightOffset], spec->ws[backIpWeightIndex2].weight, + spec->ws[backIpWeightIndex2].bytes_of_weight); + weightOffset += spec->ws[backIpWeightIndex2].bytes_of_weight; + memcpy(&multihead_weight[weightOffset], spec->ws[backIpWeightIndex1].weight, + spec->ws[backIpWeightIndex1].bytes_of_weight); + + int vecOffset = 0; + memcpy(&multihead_vec[vecOffset], spec->ws[lnWeightIndex].vec, + spec->ws[lnWeightIndex].bytes_of_vec); + vecOffset += spec->ws[lnWeightIndex].bytes_of_vec; + memcpy(&multihead_vec[vecOffset], spec->ws[leftIpWeightIndex].vec, + spec->ws[leftIpWeightIndex].bytes_of_vec); + vecOffset += spec->ws[leftIpWeightIndex].bytes_of_vec; + memcpy(&multihead_vec[vecOffset], spec->ws[midIpWeightIndex].vec, + spec->ws[midIpWeightIndex].bytes_of_vec); + vecOffset += spec->ws[midIpWeightIndex].bytes_of_vec; + memcpy(&multihead_vec[vecOffset], spec->ws[rightIpWeightIndex].vec, + spec->ws[rightIpWeightIndex].bytes_of_vec); + vecOffset += spec->ws[rightIpWeightIndex].bytes_of_vec; + memcpy(&multihead_vec[vecOffset], spec->ws[backIpWeightIndex].vec, + spec->ws[backIpWeightIndex].bytes_of_vec); + vecOffset += spec->ws[backIpWeightIndex].bytes_of_vec; + memcpy(&multihead_vec[vecOffset], spec->ws[backLayerNorm1WeightIndex].vec, + spec->ws[backLayerNorm1WeightIndex].bytes_of_vec); + vecOffset += spec->ws[backLayerNorm1WeightIndex].bytes_of_vec; + memcpy(&multihead_vec[vecOffset], spec->ws[backIpWeightIndex2].vec, + spec->ws[backIpWeightIndex2].bytes_of_vec); + vecOffset += spec->ws[backIpWeightIndex2].bytes_of_vec; + memcpy(&multihead_vec[vecOffset], spec->ws[backIpWeightIndex1].vec, + spec->ws[backIpWeightIndex1].bytes_of_vec); + spec->ws[lnWeightIndex].bytes_of_weight = weightSize; + spec->ws[lnWeightIndex].bytes_of_vec = biasSize; + + delete spec->ws[lnWeightIndex].weight; + delete spec->ws[lnWeightIndex].vec; + spec->ws[lnWeightIndex].weight = multihead_weight; + spec->ws[lnWeightIndex].vec = multihead_vec; + + memcpy(spec->ops[layerNormOpIndex].output_tensors_name[0], + spec->ops[secEltIndex].output_tensors_name[0], NAME_LEN); + + for (int k = layerNormOpIndex + 1; k <= secEltIndex; k++) { + setOperatorInvalid(spec, k); + } + + for (int k = lnWeightIndex + 1; k <= backIpWeightIndex1; k++) { + setWeightOperatorInvalid(spec, k); + } + + hasOptimized = true; + + firEltIndex = -1; + secEltIndex = -1; + firLoopMap.clear(); + secLoopMap.clear(); + } + } + } + return hasOptimized; + } +}; +#endif diff --git a/model_tools/include/OPOptimizers/NoQuantLabelOptimizer.hpp b/model_tools/include/OPOptimizers/NoQuantLabelOptimizer.hpp new file mode 100644 index 
00000000..090c2fcd
--- /dev/null
+++ b/model_tools/include/OPOptimizers/NoQuantLabelOptimizer.hpp
@@ -0,0 +1,264 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_NOQUANTLABELOPTIMIZER
+#define _H_NOQUANTLABELOPTIMIZER
+
+#include <string>
+#include <vector>
+#include "model_tools.h"
+#include "OPOptimizer.hpp"
+
+class NoQuantLabelOptimizer : public OPOptimizer {
+public:
+    NoQuantLabelOptimizer(bool actFP16, float clipVal)
+    {
+        if (clipVal > 0) {
+            this->uniScale = 127.0 / clipVal;
+        } else {
+            this->uniScale = 0;
+        }
+        this->hasCache = false;
+        this->actFP16 = actFP16;
+    }
+
+    bool optimize(ModelSpec *spec) override
+    {
+        bool hasOptimized = false;
+
+        for (int i = 0; i < spec->num_operator_specs; i++) {
+            if (spec->ops[i].type == OT_None) {
+                continue;
+            }
+            if (uniScale > 0) {
+                if (spec->ops[i].type == OT_FC || spec->ops[i].type == OT_MatMul ||
+                    spec->ops[i].type == OT_RNN) {
+                    this->label_clip_input(spec->ops + i);
+                    if (spec->ops[i].type == OT_FC || spec->ops[i].type == OT_RNN) {
+                        int weightIdx = searchWeightIndex(spec, spec->ops[i].name);
+                        CHECK_REQUIREMENT(-1 != weightIdx);
+                        CHECK_REQUIREMENT(DT_F32 == spec->ws[weightIdx].mdt);
+                        UNI_INFO_LOG("Clipping the weight of FC or LSTM\n");
+                        F32 clipMax = 127.0 / uniScale;
+                        F32 clipMin = -1 * clipMax;
+                        U32 len = spec->ws[weightIdx].bytes_of_weight / bytesOf(DT_F32);
+                        F32 *w = (F32 *)spec->ws[weightIdx].weight;
+                        for (U32 j = 0; j < len; j++) {
+                            if (w[j] > clipMax) {
+                                w[j] = clipMax;
+                            } else if (w[j] < clipMin) {
+                                w[j] = clipMin;
+                            }
+                        }
+                    }
+                }
+                continue;
+            }
+
+            if (OT_Conv == spec->ops[i].type) {
+                std::string curIn = spec->ops[i].input_tensors_name[0];
+                if (this->is_kin_to_model_input(spec, curIn, i)) {  // input is model input
+                    this->label_OP_as_no_quant(spec->ops + i);
+                    hasOptimized = true;
+                }
+            }
+
+            // Activation other than ReLU
+            if (spec->ops[i].type == OT_Relu6 || spec->ops[i].type == OT_HSwish ||
+                spec->ops[i].type == OT_HSigmoid || spec->ops[i].type == OT_Sigmoid ||
+                spec->ops[i].type == OT_Clip || spec->ops[i].type == OT_Gelu ||
+                spec->ops[i].type == OT_TanH || spec->ops[i].type == OT_Resize) {
+                std::string curIn = spec->ops[i].input_tensors_name[0];
+                this->label_fp_outputs(spec, curIn);
+                hasOptimized = true;
+            }
+
+            if (spec->ops[i].type == OT_Concat) {
+                for (U32 j = 0; j < spec->ops[i].num_inputs; j++) {
+                    std::string curIn = spec->ops[i].input_tensors_name[j];
+                    std::vector<std::pair<int, int>> prevIndex =
+                        searchOperatorIndexByOutput(spec, curIn, 0, i);
+                    if (prevIndex.size() == 0) {  // model input
+                        this->hasCache = true;
+                        std::string outName = spec->ops[i].output_tensors_name[0];
+                        this->label_fp_outputs(spec, outName);
+                        break;
+                    }
+                }
+            }
+
+            if (spec->ops[i].type == OT_FC || spec->ops[i].type == OT_Conv ||
+                spec->ops[i].type == OT_MatMul) {
+                std::string output = spec->ops[i].output_tensors_name[0];
+                bool isModelOutput = false;
+
+                for (int j = 0; j < spec->num_outputs; j++) {
+                    std::string name = spec->output_names[j];
+                    if (name == output) {
+                        isModelOutput = true;
+                        break;
+                    }
+                }
+
+                if (isModelOutput) {
+                    this->label_fp_outputs(spec, output);
+                    hasOptimized = true;
+                }
+            }
+
+            if (spec->ops[i].type == OT_Softmax) {
+                std::string inputName = spec->ops[i].input_tensors_name[0];
+                int prevKeyIndex;
+                std::vector<std::pair<int, int>> prevKeyIndexes =
+                    searchOperatorIndexByOutput(spec, inputName, 0, i);
+                while (prevKeyIndexes.size() != 0) {
+                    prevKeyIndex = prevKeyIndexes[0].first;
+                    OperatorType ot = spec->ops[prevKeyIndex].type;
+                    if (OT_Conv == ot || OT_FC == ot || OT_MatMul == ot) {
+                        break;
+                    } else {
+                        inputName =
+                            spec->ops[prevKeyIndex].input_tensors_name[prevKeyIndexes[0].second];
+                        prevKeyIndexes =
+                            searchOperatorIndexByOutput(spec, inputName, 0, prevKeyIndex);
+                    }
+                }
+                if (prevKeyIndexes.size() == 0) {
+                    UNI_INFO_LOG("Softmax receives model input directly\n");
+                    continue;
+                }
+                prevKeyIndex = prevKeyIndexes[0].first;
+                this->label_OP_as_no_quant(spec->ops + prevKeyIndex);
+
+                for (U32 j = 0; j < spec->ops[prevKeyIndex].num_inputs; j++) {
+                    std::string prevIn = spec->ops[prevKeyIndex].input_tensors_name[j];
+                    this->label_fp_outputs(spec, prevIn);
+                }
+                hasOptimized = true;
+            }
+
+            if (spec->ops[i].type == OT_Eltwise || spec->ops[i].type == OT_DetectionOutput ||
+                spec->ops[i].type == OT_Scale || actFP16) {
+                for (U32 j = 0; j < spec->ops[i].num_inputs; j++) {
+                    std::string curIn = spec->ops[i].input_tensors_name[j];
+                    this->label_fp_outputs(spec, curIn);
+                    hasOptimized = true;
+                }
+            }
+        }
+        return hasOptimized;
+    }
+
+    static void label_OP_as_no_quant(OperatorSpec *ptr)
+    {
+        switch (ptr->num_quant_feature) {
+            case 0: {
+                ptr->num_quant_feature = 1;
+                ptr->feature_scale = (QuantSpec *)mt_new_storage(sizeof(QuantSpec));
+                ptr->feature_scale[0].num_scale = 1;
+                ptr->feature_scale[0].scale = (F32 *)mt_new_storage(sizeof(F32));
+                ptr->feature_scale[0].scale[0] = 0;
+                break;
+            }
+            case 1: {
+                CHECK_REQUIREMENT(1 == ptr->feature_scale[0].num_scale);
+                ptr->feature_scale[0].scale[0] = 0;
+                break;
+            }
+            default: {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+        }
+    }
+
+    void label_fp_outputs(ModelSpec *ms, std::string tensorName)
+    {
+        std::vector<std::pair<int, int>> prevIndices =
+            searchOperatorIndexByOutput(ms, tensorName, 0, ms->num_operator_specs, false);
+        if (prevIndices.size() == 0) {
+            return;
+        }
+        int prevIndex = prevIndices[prevIndices.size() - 1].first;
+        OperatorSpec *ptr = ms->ops + prevIndex;
+        if (0 == ptr->num_quant_feature) {
+            ptr->num_quant_feature = 1;
+            ptr->feature_scale = (QuantSpec *)mt_new_storage(sizeof(QuantSpec));
+            ptr->feature_scale[0].num_scale = 1;
+            ptr->feature_scale[0].scale = (F32 *)mt_new_storage(sizeof(F32));
+            ptr->feature_scale[0].scale[0] = -2;
+        } else if (-2 == ptr->feature_scale[0].scale[0] || 0 == ptr->feature_scale[0].scale[0]) {
+            return;  // Already processed the upstream
+        }
+
+        OperatorType ot = ms->ops[prevIndex].type;
+        if (OT_Conv != ot && OT_FC != ot && OT_MatMul != ot && OT_PriorBox != ot) {
+            for (U32 i = 0; i < ms->ops[prevIndex].num_inputs; i++) {
+                std::string name = ms->ops[prevIndex].input_tensors_name[i];
+                label_fp_outputs(ms, name);
+            }
+        }
+        if (hasCache && OT_MatMul == ot) {
+            for (U32 i = 0; i < ms->ops[prevIndex].num_inputs; i++) {
+                std::string name = ms->ops[prevIndex].input_tensors_name[i];
+                label_fp_outputs(ms, name);
+            }
+        }
+    }
+
+    void label_clip_input(OperatorSpec *ptr)
+    {
+        CHECK_REQUIREMENT(0 == ptr->num_quant_feature);
+        ptr->num_quant_feature = ptr->num_inputs + ptr->num_outputs;
+        ptr->feature_scale = (QuantSpec *)mt_new_storage(sizeof(QuantSpec) * ptr->num_quant_feature);
+        U32 i;
+        for (i = 0; i < ptr->num_inputs; i++) {
+            ptr->feature_scale[i].num_scale = 1;
+            ptr->feature_scale[i].scale = (F32 *)mt_new_storage(sizeof(F32));
+            ptr->feature_scale[i].scale[0] = this->uniScale;
+        }
+        for (; i < ptr->num_quant_feature; i++) {
+            ptr->feature_scale[i].num_scale = 1;
+            ptr->feature_scale[i].scale = (F32 *)mt_new_storage(sizeof(F32));
+            ptr->feature_scale[i].scale[0] = -2;
+        }
+    }
+
+    bool is_kin_to_model_input(ModelSpec *ms, std::string name, int bound)
+    {
+        if (0 == bound) {
+            return true;
+        }
+        std::vector<std::pair<int, int>> prevIndices =
+            searchOperatorIndexByOutput(ms, name, 0, bound, false);
+        if (0 == prevIndices.size()) {
+            return true;
+        }
+        int prevIndex = prevIndices[prevIndices.size() - 1].first;
+        OperatorType ot = ms->ops[prevIndex].type;
+        if (OT_Conv == ot || OT_FC == ot || OT_MatMul == ot) {
+            return false;
+        }
+        for (U32 i = 0; i < ms->ops[prevIndex].num_inputs; i++) {
+            if (!is_kin_to_model_input(ms, ms->ops[prevIndex].input_tensors_name[i], prevIndex)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+private:
+    float uniScale;
+    bool hasCache;
+    bool actFP16;
+};
+#endif
diff --git a/model_tools/include/OPOptimizers/OPOptimizer.hpp b/model_tools/include/OPOptimizers/OPOptimizer.hpp
new file mode 100644
index 00000000..5cce29ae
--- /dev/null
+++ b/model_tools/include/OPOptimizers/OPOptimizer.hpp
@@ -0,0 +1,236 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
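+
+// OPOptimizer is the base class for every graph-rewriting pass in this directory:
+// each pass implements optimize() and reports whether it changed the ModelSpec.
+// A minimal driver sketch (illustrative only; run_passes is an assumption made for
+// this note, not an API defined in this patch):
+//
+//   bool run_passes(ModelSpec *spec, std::vector<OPOptimizer *> &passes)
+//   {
+//       bool changed = false;
+//       for (OPOptimizer *pass : passes) {
+//           changed |= pass->optimize(spec);  // true when the pass rewrote the graph
+//       }
+//       return changed;  // callers may loop until this settles to false
+//   }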
+
+#ifndef _H_OPOPTIMIZER
+#define _H_OPOPTIMIZER
+
+#include <string>
+#include <vector>
+#include "types.h"
+#include "op_type.h"
+#include "model_tools.h"
+
+class OPOptimizer {
+public:
+    virtual ~OPOptimizer()
+    {}
+
+    virtual bool optimize(ModelSpec *spec) = 0;
+
+    int searchWeightIndex(ModelSpec *spec, char *op_name)
+    {
+        if (spec->num_weight_specs <= 0) {
+            return -1;
+        }
+
+        std::string opNameStr = op_name;
+        for (int i = 0; i < spec->num_weight_specs; i++) {
+            std::string key = spec->ws[i].op_name;
+            if (key == opNameStr) {
+                return i;
+            }
+        }
+        return -1;
+    }
+
+    bool isValidOperator(ModelSpec *spec, int index)
+    {
+        if (index >= spec->num_operator_specs) {
+            return false;
+        }
+
+        if (spec->ops[index].type != OT_None) {
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    void setOperatorInvalid(ModelSpec *spec, int index, bool removeEdge = false)
+    {
+        if (index >= spec->num_operator_specs || index < 0) {
+            return;
+        }
+        spec->ops[index].type = OT_None;
+        if (removeEdge) {
+            if (spec->ops[index].num_inputs == 1 && spec->ops[index].num_outputs == 1 &&
+                std::string(spec->ops[index].input_tensors_name[0]) ==
+                    std::string(spec->ops[index].output_tensors_name[0])) {
+                return;
+            }
+            if (spec->ops[index].num_inputs > 0) {
+                for (U32 i = 0; i < spec->ops[index].num_outputs; i++) {
+                    std::vector<std::pair<int, int>> operatorIndexes0 = searchOperatorIndexByInput(
+                        spec, spec->ops[index].output_tensors_name[i], index + 1);
+                    for (U32 j = 0; j < operatorIndexes0.size(); j++) {
+                        str_copy(spec->ops[operatorIndexes0[j].first]
+                                     .input_tensors_name[operatorIndexes0[j].second],
+                            spec->ops[index].input_tensors_name[0], NAME_LEN);
+                    }
+                    std::vector<int> outputs = searchString(spec->output_names, spec->num_outputs,
+                        spec->ops[index].output_tensors_name[i]);
+                    for (U32 j = 0; j < outputs.size(); j++) {
+                        str_copy(spec->output_names[outputs[j]],
+                            spec->ops[index].input_tensors_name[0], NAME_LEN);
+                    }
+                }
+            }
+        }
+    }
+
+    void setWeightOperatorInvalid(ModelSpec *spec, int index)
+    {
+        spec->ws[index].bytes_of_weight = 0;
+        spec->ws[index].bytes_of_vec = 0;
+        delete spec->ws[index].weight;
+        spec->ws[index].weight = nullptr;
+        delete spec->ws[index].vec;
+        spec->ws[index].vec = nullptr;
+    }
+
+    int searchOperatorIndexByName(ModelSpec *spec, std::string name)
+    {
+        int result = -1;
+        for (int i = 0; i < spec->num_operator_specs; i++) {
+            if (spec->ops[i].name == name) {
+                result = i;
+                break;
+            }
+        }
+        return result;
+    }
+
+    int searchOperatorIndexBackward(
+        ModelSpec *spec, int end, OperatorType *queryOps, int queryNum, bool unskip = true)
+    {
+        for (int i = end; i >= 0; i--) {
+            if (isValidOperator(spec, i)) {
+                for (int j = 0; j < queryNum; j++) {
+                    OperatorType opType = queryOps[j];
+                    if (spec->ops[i].type == opType) {
+                        return i;
+                    }
+                }
+                if (unskip) {
+                    return -1;
+                }
+            }
+        }
+        return -1;
+    }
+
+    int searchOperatorIndexForward(ModelSpec *spec,
+        int start,
+        OperatorType *queryOps,
+        int queryNum,
+        bool unskip = true,
+        std::string str = "")
+    {
+        std::string strEmpty = "";
+        for (int i = start; i < spec->num_operator_specs; i++) {
+            if (isValidOperator(spec, i)) {
+                for (int j = 0; j < queryNum; j++) {
+                    OperatorType opType = queryOps[j];
+                    if (spec->ops[i].type == opType) {
+                        if (str != strEmpty && spec->ops[i].num_inputs > 0) {
+                            std::string inputName0 =
+                                spec->ops[i].input_tensors_name[0];  // May examine more inputs in the future
+                            if (inputName0 != str) {
+                                continue;
+                            }
+                        }
+                        return i;
+                    }
+                }
+                if (unskip) {
+                    return -1;
+                }
+            }
+        }
+        return -1;
+    }
+
+    std::vector<std::pair<int, int>> searchOperatorIndexByInput(ModelSpec *spec,
+        std::string tensorName,
+        int left = 0,
+        int right = 0,
+        bool nearestNeighbor = true)
+    {
+        std::vector<std::pair<int, int>> result;
+        if (right == 0) {
+            right = spec->num_operator_specs;
+        }
+        for (int i = left; i < right; i++) {
+            if (isValidOperator(spec, i)) {
+                bool hasFind = false;
+                for (int j = 0; j < (int)spec->ops[i].num_inputs; j++) {
+                    if (spec->ops[i].input_tensors_name[j] == tensorName) {
+                        result.push_back(std::make_pair(i, j));
+                        hasFind = true;
+                    }
+                }
+                if (nearestNeighbor) {
+                    for (int j = 0; j < (int)spec->ops[i].num_outputs; j++) {
+                        if (spec->ops[i].output_tensors_name[j] == tensorName && hasFind) {
+                            return result;
+                        }
+                    }
+                }
+            }
+        }
+        return result;
+    }
+
+    std::vector<std::pair<int, int>> searchOperatorIndexByOutput(ModelSpec *spec,
+        std::string tensorName,
+        int left = 0,
+        int right = 0,
+        bool nearestNeighbor = true)
+    {
+        std::vector<std::pair<int, int>> result;
+        if (right == 0) {
+            right = spec->num_operator_specs;
+        }
+        for (int i = right - 1; i >= left; i--) {
+            if (isValidOperator(spec, i)) {
+                bool hasFind = false;
+                for (int j = 0; j < (int)spec->ops[i].num_outputs; j++) {
+                    if (spec->ops[i].output_tensors_name[j] == tensorName) {
+                        result.push_back(std::make_pair(i, j));
+                        hasFind = true;
+                    }
+                }
+                if (nearestNeighbor) {
+                    for (int j = 0; j < (int)spec->ops[i].num_inputs; j++) {
+                        if (spec->ops[i].input_tensors_name[j] == tensorName && hasFind) {
+                            return result;
+                        }
+                    }
+                }
+            }
+        }
+        return result;
+    }
+
+    std::vector<int> searchString(char **array, int num, const char *data)
+    {
+        std::vector<int> result;
+        for (int i = 0; i < num; i++) {
+            if (std::string(array[i]) == std::string(data)) {
+                result.push_back(i);
+            }
+        }
+        return result;
+    }
+};
+#endif
diff --git a/model_tools/include/OPOptimizers/PadOptimizer.hpp b/model_tools/include/OPOptimizers/PadOptimizer.hpp
new file mode 100644
index 00000000..bcde306d
--- /dev/null
+++ b/model_tools/include/OPOptimizers/PadOptimizer.hpp
@@ -0,0 +1,72 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
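+
+// Idea behind this pass, on illustrative numbers (not from any particular model):
+// a Pad with pad_mode == Pad_Constant, constant_value == 0 and top/bottom/left/right
+// all 1, feeding a Conv whose padding is 0 on every side, is folded away by adding 1
+// to each of the Conv's padding fields. Zero-padding then convolving equals
+// convolving with padding, so the output is unchanged and one operator disappears.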
+
+#ifndef _H_PADOPTIMIZER
+#define _H_PADOPTIMIZER
+
+#include "OPOptimizer.hpp"
+
+class PadOptimizer : public OPOptimizer {
+    bool optimize(ModelSpec *spec) override
+    {
+        bool hasOptimized = false;
+        for (int i = 0; i < spec->num_operator_specs; i++) {
+            if (spec->ops[i].type == OT_Pad && spec->ops[i].ps.pad_spec.pad_mode == Pad_Constant &&
+                spec->ops[i].ps.pad_spec.constant_value == 0) {
+                int padOpIndex = i;
+                std::vector<std::pair<int, int>> nextOpIndexes = searchOperatorIndexByInput(spec,
+                    spec->ops[padOpIndex].output_tensors_name[0], padOpIndex + 1,
+                    spec->num_operator_specs);
+                if (nextOpIndexes.size() != 1 ||
+                    (OT_Pooling != spec->ops[nextOpIndexes[0].first].type &&
+                        OT_Conv != spec->ops[nextOpIndexes[0].first].type)) {
+                    continue;
+                }
+                int nextOpIndex = nextOpIndexes[0].first;
+
+                if (spec->ops[nextOpIndex].type == OT_Pooling) {
+                    spec->ops[nextOpIndex].ps.pooling_spec.padding_before +=
+                        spec->ops[padOpIndex].ps.pad_spec.before;
+                    spec->ops[nextOpIndex].ps.pooling_spec.padding_after +=
+                        spec->ops[padOpIndex].ps.pad_spec.after;
+                    spec->ops[nextOpIndex].ps.pooling_spec.padding_top +=
+                        spec->ops[padOpIndex].ps.pad_spec.top;
+                    spec->ops[nextOpIndex].ps.pooling_spec.padding_bottom +=
+                        spec->ops[padOpIndex].ps.pad_spec.bottom;
+                    spec->ops[nextOpIndex].ps.pooling_spec.padding_left +=
+                        spec->ops[padOpIndex].ps.pad_spec.left;
+                    spec->ops[nextOpIndex].ps.pooling_spec.padding_right +=
+                        spec->ops[padOpIndex].ps.pad_spec.right;
+                }
+                if (spec->ops[nextOpIndex].type == OT_Conv) {
+                    spec->ops[nextOpIndex].ps.conv_spec.padding_before +=
+                        spec->ops[padOpIndex].ps.pad_spec.before;
+                    spec->ops[nextOpIndex].ps.conv_spec.padding_after +=
+                        spec->ops[padOpIndex].ps.pad_spec.after;
+                    spec->ops[nextOpIndex].ps.conv_spec.padding_top +=
+                        spec->ops[padOpIndex].ps.pad_spec.top;
+                    spec->ops[nextOpIndex].ps.conv_spec.padding_bottom +=
+                        spec->ops[padOpIndex].ps.pad_spec.bottom;
+                    spec->ops[nextOpIndex].ps.conv_spec.padding_left +=
+                        spec->ops[padOpIndex].ps.pad_spec.left;
+                    spec->ops[nextOpIndex].ps.conv_spec.padding_right +=
+                        spec->ops[padOpIndex].ps.pad_spec.right;
+                }
+                setOperatorInvalid(spec, padOpIndex, true);
+                hasOptimized = true;
+            }
+        }
+        return hasOptimized;
+    }
+};
+#endif
diff --git a/model_tools/include/OPOptimizers/PowerOptimizer.hpp b/model_tools/include/OPOptimizers/PowerOptimizer.hpp
new file mode 100644
index 00000000..c53a738b
--- /dev/null
+++ b/model_tools/include/OPOptimizers/PowerOptimizer.hpp
@@ -0,0 +1,133 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_POWEROPTIMIZER
+#define _H_POWEROPTIMIZER
+
+#include "OPOptimizer.hpp"
+
+class PowerOptimizer : public OPOptimizer {
+    bool optimizeUnusedPower(ModelSpec *spec)
+    {
+        bool hasOptimized = false;
+        float threshold = 0.0001;
+        for (int i = 0; i < spec->num_operator_specs; i++) {
+            if (spec->ops[i].type == OT_Power &&
+                UNI_ABS(spec->ops[i].ps.power_spec.scale - 1) < threshold &&
+                UNI_ABS(spec->ops[i].ps.power_spec.shift) < threshold &&
+                UNI_ABS(spec->ops[i].ps.power_spec.power - 1) < threshold) {
+                setOperatorInvalid(spec, i, true);
+                hasOptimized = true;
+            }
+        }
+        return hasOptimized;
+    }
+
+    bool optimizePowerEltwise(ModelSpec *spec)
+    {
+        bool hasOptimized = false;
+        float threshold = 0.0001;
+        for (int i = 0; i < spec->num_operator_specs - 2; i++) {
+            if (spec->ops[i].type == OT_Power &&
+                UNI_ABS(spec->ops[i].ps.power_spec.shift) < threshold &&
+                UNI_ABS(spec->ops[i].ps.power_spec.power - 1) < threshold) {
+                int powerIndex0 = i;
+                std::vector<std::pair<int, int>> nextOpIndexes = searchOperatorIndexByInput(spec,
+                    spec->ops[powerIndex0].output_tensors_name[0], powerIndex0 + 1,
+                    spec->num_operator_specs);
+                if (nextOpIndexes.size() != 1 ||
+                    OT_Eltwise != spec->ops[nextOpIndexes[0].first].type ||
+                    spec->ops[nextOpIndexes[0].first].num_inputs != 2 ||
+                    spec->ops[nextOpIndexes[0].first].ps.eltwise_spec.elt_mode != ELTWISE_SUM) {
+                    continue;
+                }
+                int eltwiseIndex = nextOpIndexes[0].first;
+
+                std::vector<std::pair<int, int>> prevOpIndexes = searchOperatorIndexByOutput(spec,
+                    spec->ops[eltwiseIndex].input_tensors_name[1 - nextOpIndexes[0].second], 0,
+                    eltwiseIndex);
+                if (prevOpIndexes.size() != 1 || OT_Power != spec->ops[prevOpIndexes[0].first].type ||
+                    std::string(spec->ops[powerIndex0].input_tensors_name[0]) !=
+                        std::string(spec->ops[prevOpIndexes[0].first].input_tensors_name[0]) ||
+                    UNI_ABS(spec->ops[prevOpIndexes[0].first].ps.power_spec.shift) >= threshold ||
+                    UNI_ABS(spec->ops[prevOpIndexes[0].first].ps.power_spec.power - 1) >= threshold ||
+                    UNI_ABS(spec->ops[powerIndex0].ps.power_spec.scale +
+                            spec->ops[prevOpIndexes[0].first].ps.power_spec.scale - 1) >= threshold) {
+                    continue;
+                }
+                int powerIndex1 = prevOpIndexes[0].first;
+                nextOpIndexes = searchOperatorIndexByInput(spec,
+                    spec->ops[powerIndex1].output_tensors_name[0], powerIndex1 + 1,
+                    spec->num_operator_specs);
+                if (nextOpIndexes.size() != 1) {
+                    continue;
+                }
+
+                setOperatorInvalid(spec, powerIndex0, true);
+                setOperatorInvalid(spec, powerIndex1, true);
+                if (spec->ops[eltwiseIndex].ps.eltwise_spec.activation_type == ACTIVATION_NULL) {
+                    setOperatorInvalid(spec, eltwiseIndex, true);
+                } else if (spec->ops[eltwiseIndex].ps.eltwise_spec.activation_type ==
+                    ACTIVATION_RELU) {
+                    spec->ops[eltwiseIndex].num_inputs = 1;
+                    str_copy(spec->ops[eltwiseIndex].input_tensors_name[0],
+                        spec->ops[i].input_tensors_name[0], NAME_LEN);
+                    delete spec->ops[eltwiseIndex].input_tensors_name[1];
+                    spec->ops[eltwiseIndex].input_tensors_name[1] = nullptr;
+                    ReLUParamSpec reluParam =
+                        spec->ops[eltwiseIndex].ps.eltwise_spec.activation_spec.relu_spec;
+                    spec->ops[eltwiseIndex].ps.relu_spec = reluParam;
+                } else {
+                    UNI_ERROR_LOG("eltwise with a non-ReLU activation cannot be merged with Power\n");
+                }
+                i = eltwiseIndex;
+                hasOptimized = true;
+            }
+        }
+        return hasOptimized;
+    }
+
+    bool optimizeSquareSqrt(ModelSpec *spec)
+    {
+        bool hasOptimized = false;
+        for (int i = 0; i < spec->num_operator_specs; i++) {
+            if (spec->ops[i].type == OT_Power) {
+                std::vector<std::pair<int, int>> nextOpIndexes = searchOperatorIndexByInput(
+                    spec, spec->ops[i].output_tensors_name[0], i + 1, spec->num_operator_specs);
+                if (nextOpIndexes.size() != 1 || OT_Power != spec->ops[nextOpIndexes[0].first].type) {
+                    continue;
+                }
+                int nextPower = nextOpIndexes[0].first;
+                if (spec->ops[nextPower].ps.power_spec.scale == 1 &&
+                    spec->ops[nextPower].ps.power_spec.shift == 0) {
+                    // (x^p)^q == x^(p*q) when the outer Power applies no scale or shift
+                    spec->ops[i].ps.power_spec.power *= spec->ops[nextPower].ps.power_spec.power;
+                    setOperatorInvalid(spec, nextPower, true);
+                    i = nextPower;
+                    hasOptimized = true;
+                }
+            }
+        }
+        return hasOptimized;
+    }
+
+    bool optimize(ModelSpec *spec) override
+    {
+        bool hasOptimized = false;
+        hasOptimized |= optimizeUnusedPower(spec);
+        hasOptimized |= optimizeSquareSqrt(spec);
+        hasOptimized |= optimizeUnusedPower(spec);
+        hasOptimized |= optimizePowerEltwise(spec);
+        return hasOptimized;
+    }
+};
+#endif
diff --git a/model_tools/include/OPOptimizers/RNNOptimizer.hpp b/model_tools/include/OPOptimizers/RNNOptimizer.hpp
new file mode 100644
index 00000000..4ba01b9d
--- /dev/null
+++ b/model_tools/include/OPOptimizers/RNNOptimizer.hpp
@@ -0,0 +1,74 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
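+
+// The first branch below strips the shape chain that ONNX exporters commonly emit to
+// build zero initial states for an RNN (the pattern sketched here is an assumption
+// about the exporter, not taken from a specific model):
+//
+//   x -> Shape -> Slice -> Unsqueeze -> Concat -> ConstantOfShape -> initial h/c
+//   x ------------------------------------------------------------> RNN
+//
+// The chain only materializes a zero tensor, so the five ops and the RNN's extra
+// state inputs are dropped, assuming the runtime zero-initializes missing states.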
+
+#ifndef _H_RNNOPTIMIZER
+#define _H_RNNOPTIMIZER
+
+#include <string>
+#include "model_tools.h"
+#include "OPOptimizer.hpp"
+
+class RNNOptimizer : public OPOptimizer {
+    bool optimize(ModelSpec *spec) override
+    {
+        OperatorType queryShape[1] = {OT_Shape};
+
+        bool hasOptimized = false;
+        for (int i = 0; i < spec->num_operator_specs; i++) {
+            if (spec->ops[i].type == OT_RNN && spec->ops[i].num_inputs >= 1) {
+                int rnnOpIndex = i;
+                std::string rnn_input = spec->ops[rnnOpIndex].input_tensors_name[0];
+                int shapeOpIndex = searchOperatorIndexBackward(spec, i - 1, queryShape, 1, false);
+                if (shapeOpIndex < 1) {
+                    UNI_WARNING_LOG("rnn %s cannot be optimized\n", spec->ops[i].name);
+                    continue;
+                }
+
+                std::string last_output = spec->ops[shapeOpIndex - 1].output_tensors_name[0];
+                std::string shape_input = spec->ops[shapeOpIndex].input_tensors_name[0];
+                if ((last_output == rnn_input) && (last_output == shape_input)) {
+                    if (spec->ops[shapeOpIndex + 1].type == OT_Slice) {
+                        if (spec->ops[shapeOpIndex + 2].type == OT_Unsqueeze) {
+                            if (spec->ops[shapeOpIndex + 3].type == OT_Concat) {
+                                if (spec->ops[shapeOpIndex + 4].type == OT_ConstantOfShape) {
+                                    for (int j = 0; j < 5; j++) {
+                                        setOperatorInvalid(spec, shapeOpIndex + j);
+                                    }
+
+                                    int rnn_original_input_size = spec->ops[rnnOpIndex].num_inputs;
+                                    for (int k = 1; k < rnn_original_input_size; k++) {
+                                        delete spec->ops[rnnOpIndex].input_tensors_name[k];
+                                        spec->ops[rnnOpIndex].input_tensors_name[k] = nullptr;
+                                    }
+                                    spec->ops[rnnOpIndex].num_inputs = 1;
+                                    hasOptimized = true;
+                                }
+                            }
+                        }
+                    }
+                }
+                // bi-RNN
+                if (rnnOpIndex + 2 < spec->num_operator_specs &&
+                    spec->ops[rnnOpIndex + 1].type == OT_Transpose &&
+                    spec->ops[rnnOpIndex + 2].type == OT_Reshape &&
+                    spec->ops[rnnOpIndex].ps.rnn_spec.steps == -2) {
+                    memcpy(spec->ops[rnnOpIndex].output_tensors_name[0],
+                        spec->ops[rnnOpIndex + 2].output_tensors_name[0], NAME_LEN);
+                    setOperatorInvalid(spec, rnnOpIndex + 1);
+                    setOperatorInvalid(spec, rnnOpIndex + 2);
+                    hasOptimized = true;
+                }
+            }
+        }
+        return hasOptimized;
+    }
+};
+#endif
diff --git a/model_tools/include/OPOptimizers/ShGaUnCoReOptimizer.hpp b/model_tools/include/OPOptimizers/ShGaUnCoReOptimizer.hpp
new file mode 100644
index 00000000..08ee2752
--- /dev/null
+++ b/model_tools/include/OPOptimizers/ShGaUnCoReOptimizer.hpp
@@ -0,0 +1,59 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
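+
+// The class name abbreviates the matched chain: Shape -> Gather -> Unsqueeze ->
+// Unsqueeze -> Concat -> Reshape, a dynamic computation of a two-element target
+// shape. The pass assumes the chain amounts to a static reshape to (1, -1), writes
+// that into the surviving Reshape's shape_dims, and invalidates the five shape ops.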
+
+#ifndef _H_ShGaUnCoReOPTIMIZER
+#define _H_ShGaUnCoReOPTIMIZER
+
+#include "OPOptimizer.hpp"
+
+class ShGaUnCoReOptimizer : public OPOptimizer {
+    bool optimize(ModelSpec *spec) override
+    {
+        bool hasOptimized = false;
+        for (int i = 1; i < spec->num_operator_specs; i++) {
+            if (spec->ops[i].type == OT_Shape) {
+                int shapeOpIndex = i;
+                if (spec->ops[i + 1].type == OT_Gather && spec->ops[i + 2].type == OT_Unsqueeze &&
+                    spec->ops[i + 3].type == OT_Unsqueeze && spec->ops[i + 4].type == OT_Concat &&
+                    spec->ops[i + 5].type == OT_Reshape) {
+                    for (int k = 1; k < (int)(spec->ops[shapeOpIndex - 1].num_outputs); k++) {
+                        delete spec->ops[shapeOpIndex - 1].output_tensors_name[k];
+                        spec->ops[shapeOpIndex - 1].output_tensors_name[k] = nullptr;
+                    }
+                    spec->ops[shapeOpIndex - 1].num_outputs = 1;
+
+                    for (int k = 1; k < (int)(spec->ops[i + 5].num_inputs); k++) {
+                        delete spec->ops[i + 5].input_tensors_name[k];
+                        spec->ops[i + 5].input_tensors_name[k] = nullptr;
+                    }
+                    spec->ops[i + 5].num_inputs = 1;
+
+                    // make the reshape proper
+                    spec->ops[i + 5].ps.reshape_spec.shape_dims[0] = 1;
+                    spec->ops[i + 5].ps.reshape_spec.shape_dims[1] = -1;
+                    spec->ops[i + 5].ps.reshape_spec.shape_size = 2;
+
+                    // drop the redundant ops
+                    setOperatorInvalid(spec, i);
+                    setOperatorInvalid(spec, i + 1);
+                    setOperatorInvalid(spec, i + 2);
+                    setOperatorInvalid(spec, i + 3);
+                    setOperatorInvalid(spec, i + 4);
+                    hasOptimized = true;
+                }
+            }
+        }
+        return hasOptimized;
+    }
+};
+#endif
diff --git a/model_tools/include/OPOptimizers/SqueezeReshapeOptimizer.hpp b/model_tools/include/OPOptimizers/SqueezeReshapeOptimizer.hpp
new file mode 100644
index 00000000..cd9c7854
--- /dev/null
+++ b/model_tools/include/OPOptimizers/SqueezeReshapeOptimizer.hpp
@@ -0,0 +1,49 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
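+
+// Rationale for this pass: a Reshape determines its output shape on its own, so a
+// Squeeze whose output feeds that Reshape adds nothing; the Reshape is rewired to
+// read the Squeeze's input directly and the Squeeze is invalidated.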
+ +#ifndef _H_SQUEEZERESHAPEOPTIMIZER +#define _H_SQUEEZERESHAPEOPTIMIZER + +#include "model_tools.h" +#include "OPOptimizer.hpp" + +class SqueezeReshapeOptimizer : public OPOptimizer { + bool optimize(ModelSpec *spec) override + { + const int queryNum = 1; + OperatorType queryOps[queryNum] = {OT_Reshape}; + bool hasOptimized = false; + for (int i = 1; i < spec->num_operator_specs; i++) { + if (spec->ops[i].type == OT_Squeeze) { + int squeezeIndex = i; + int reshapeIndex = + searchOperatorIndexForward(spec, squeezeIndex + 1, queryOps, queryNum); + if (reshapeIndex == -1) { + continue; + } + if (strncmp(spec->ops[squeezeIndex].output_tensors_name[0], + spec->ops[reshapeIndex].input_tensors_name[0], NAME_LEN)) { + continue; + } + + str_copy(spec->ops[reshapeIndex].input_tensors_name[0], + spec->ops[squeezeIndex].input_tensors_name[0], NAME_LEN); + hasOptimized = true; + setOperatorInvalid(spec, squeezeIndex); + i = reshapeIndex; + } + } + return hasOptimized; + } +}; +#endif diff --git a/model_tools/include/OPOptimizers/StdDeviationOptimizer.hpp b/model_tools/include/OPOptimizers/StdDeviationOptimizer.hpp new file mode 100644 index 00000000..b97499eb --- /dev/null +++ b/model_tools/include/OPOptimizers/StdDeviationOptimizer.hpp @@ -0,0 +1,45 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
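+
+// The fusion below uses std(x) = sqrt(mean((x - mean(x))^2)): SquaredDifference
+// followed by a REDUCTION_MEAN Reduction and a Power (assumed here to be the square
+// root) computes exactly one standard deviation, so the three ops collapse into a
+// single Reduction with reduction_mode = REDUCTION_STD_DEVIATION.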
+ +#ifndef _H_STDDEVIATIONOPTIMIZER +#define _H_STDDEVIATIONOPTIMIZER + +#include "model_tools.h" +#include "OPOptimizer.hpp" + +class StdDeviationOptimizer : public OPOptimizer { + bool optimize(ModelSpec *spec) override + { + bool hasOptimized = false; + for (int i = 0; i < spec->num_operator_specs - 2; i++) { + if (OT_SqDiff == spec->ops[i].type) { + if (OT_Reduction == spec->ops[i + 1].type && OT_Power == spec->ops[i + 2].type) { + CHECK_REQUIREMENT( + REDUCTION_MEAN == spec->ops[i + 1].ps.reduction_spec.reduction_mode); + spec->ops[i + 1].ps.reduction_spec.reduction_mode = REDUCTION_STD_DEVIATION; + + str_copy(spec->ops[i + 1].input_tensors_name[0], + spec->ops[i].input_tensors_name[0], NAME_LEN); + str_copy(spec->ops[i + 1].output_tensors_name[0], + spec->ops[i + 2].output_tensors_name[0], NAME_LEN); + hasOptimized = true; + setOperatorInvalid(spec, i); + setOperatorInvalid(spec, i + 2); + i += 2; + } + } + } + return hasOptimized; + } +}; +#endif diff --git a/model_tools/include/OPOptimizers/TransposeMatMulToFCOptimizer.hpp b/model_tools/include/OPOptimizers/TransposeMatMulToFCOptimizer.hpp new file mode 100644 index 00000000..91a530e1 --- /dev/null +++ b/model_tools/include/OPOptimizers/TransposeMatMulToFCOptimizer.hpp @@ -0,0 +1,66 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
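+
+// Identity behind the rewrite below: MatMul(x, W) where W arrives through a Transpose
+// that owns a constant weight is the same computation as an FC layer owning that
+// weight (assuming FC stores weights output-major, as the num_outputs bookkeeping
+// below implies). The MatMul is retyped to FC, its second input is dropped, and the
+// weight record changes owner.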
+ +#ifndef _H_TRANSPOSEMATMULTOFCOPTIMIZER +#define _H_TRANSPOSEMATMULTOFCOPTIMIZER + +#include "model_tools.h" +#include "OPOptimizer.hpp" + +class TransposeMatMulToFCOptimizer : public OPOptimizer { + bool optimize(ModelSpec *spec) override + { + const int queryNum = 1; + OperatorType queryOps[queryNum] = {OT_MatMul}; + bool hasOptimized = false; + for (int i = 1; i < spec->num_operator_specs; i++) { + if (spec->ops[i].type == OT_Transpose) { + int transposeOpIndex = i; + int transposeWeightIndex = searchWeightIndex(spec, spec->ops[transposeOpIndex].name); + if (transposeWeightIndex < 0) { + // This transpose does not have weight + continue; + } + + int matmulOpIndex = + searchOperatorIndexForward(spec, transposeOpIndex + 1, queryOps, queryNum); + if (-1 == matmulOpIndex) { + UNI_ERROR_LOG("encountered transpose with weight but cannot be optimized\n"); + CHECK_STATUS(NOT_SUPPORTED); + } + + hasOptimized = true; + + // Update matmul to fc + spec->ops[matmulOpIndex].type = OT_FC; + spec->ops[matmulOpIndex].num_inputs = 1; + delete spec->ops[matmulOpIndex].input_tensors_name[1]; + spec->ops[matmulOpIndex].ps.fc_spec.num_outputs = + spec->ws[transposeWeightIndex].bytes_of_vec; + spec->ops[matmulOpIndex].ps.fc_spec.num_slices = 1; + spec->ops[matmulOpIndex].ps.fc_spec.slice_point[0] = + spec->ops[matmulOpIndex].ps.fc_spec.num_outputs; + + // Adjust the owner of the weight + str_copy(spec->ws[transposeWeightIndex].op_name, spec->ops[matmulOpIndex].name, + NAME_LEN); + spec->ws[transposeWeightIndex].bytes_of_vec = 0; + + setOperatorInvalid(spec, transposeOpIndex); + i = matmulOpIndex; + } + } + return hasOptimized; + } +}; +#endif diff --git a/model_tools/include/OPOptimizers/TransposeMulToScaleOptimizer.hpp b/model_tools/include/OPOptimizers/TransposeMulToScaleOptimizer.hpp new file mode 100644 index 00000000..7736c0b9 --- /dev/null +++ b/model_tools/include/OPOptimizers/TransposeMulToScaleOptimizer.hpp @@ -0,0 +1,61 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
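+
+// This pass targets the layout sandwich some converters emit around a per-channel
+// multiply (an assumption about the exporter, not a documented pattern): Transpose
+// ops before and after an Eltwise PROD only move the channel axis, so the multiply
+// is equivalent to a Scale on axis 1 and the three Transpose ops can be dropped.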
+ +#ifndef _H_TRANSPOSEMULTOSCALEOPTIMIZER +#define _H_TRANSPOSEMULTOSCALEOPTIMIZER + +#include "OPOptimizer.hpp" + +class TransposeMulToScaleOptimizer : public OPOptimizer { + bool optimize(ModelSpec *spec) override + { + const int queryNum = 2; + OperatorType queryOps[queryNum] = {OT_Transpose, OT_Reshape}; + bool hasOptimized = false; + for (int i = 1; i < spec->num_operator_specs; i++) { + if (spec->ops[i].type == OT_Eltwise && + spec->ops[i].ps.eltwise_spec.elt_mode == ELTWISE_PROD) { + int mulOpIndex = i; + int transposeOpIndex00 = + searchOperatorIndexBackward(spec, mulOpIndex - 1, queryOps, queryNum, false); + if (transposeOpIndex00 == -1) { + continue; + } + int transposeOpIndex01 = searchOperatorIndexBackward( + spec, transposeOpIndex00 - 1, queryOps, queryNum, false); + if (transposeOpIndex01 == -1) { + continue; + } + int transposeOpIndex10 = + searchOperatorIndexForward(spec, mulOpIndex + 1, queryOps, queryNum, false); + if (transposeOpIndex10 == -1) { + continue; + } + + if (transposeOpIndex10 == mulOpIndex + 1 || + (transposeOpIndex10 == mulOpIndex + 2 && + spec->ops[mulOpIndex + 1].type == OT_Relu)) { + spec->ops[mulOpIndex].type = OT_Scale; + spec->ops[mulOpIndex].ps.scale_spec.axis = 1; + setOperatorInvalid(spec, transposeOpIndex00, true); + setOperatorInvalid(spec, transposeOpIndex01, true); + setOperatorInvalid(spec, transposeOpIndex10, true); + hasOptimized = true; + i = transposeOpIndex10; + } + } + } + return hasOptimized; + } +}; +#endif diff --git a/model_tools/include/OPOptimizers/WeightBNOptimizer.hpp b/model_tools/include/OPOptimizers/WeightBNOptimizer.hpp new file mode 100644 index 00000000..29bec2ee --- /dev/null +++ b/model_tools/include/OPOptimizers/WeightBNOptimizer.hpp @@ -0,0 +1,116 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
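+
+// Folding equations used below (standard BatchNorm folding, per output channel c):
+//   s[c]  = sqrt(gama * var[c] + eps)
+//   W'[c] = W[c] / s[c]
+//   b'[c] = (b[c] - gama * mean[c]) / s[c]
+// so Conv followed by BatchNorm equals the rescaled Conv alone, and the BN node and
+// its weights can be freed.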
+
+#ifndef _H_WEIGHTBNOPTIMIZER
+#define _H_WEIGHTBNOPTIMIZER
+
+#include <math.h>
+#include "OPOptimizer.hpp"
+
+class WeightBNOptimizer : public OPOptimizer {
+    bool optimize(ModelSpec *spec) override
+    {
+        bool hasOptimized = false;
+        for (int i = 0; i < spec->num_operator_specs; i++) {
+            if (OT_Conv == spec->ops[i].type || OT_FC == spec->ops[i].type) {
+                int prevOpIndex = i;
+                if (OT_Conv == spec->ops[prevOpIndex].type) {
+                    if (ACTIVATION_NULL != spec->ops[prevOpIndex].ps.conv_spec.dw_activation_type ||
+                        ACTIVATION_NULL != spec->ops[prevOpIndex].ps.conv_spec.pw_activation_type) {
+                        continue;
+                    }
+                }
+                std::vector<std::pair<int, int>> nextOpIndexes = searchOperatorIndexByInput(spec,
+                    spec->ops[prevOpIndex].output_tensors_name[0], prevOpIndex + 1,
+                    spec->num_operator_specs);
+                if (nextOpIndexes.size() != 1 ||
+                    OT_BatchNorm != spec->ops[nextOpIndexes[0].first].type) {
+                    continue;
+                }
+                int bnOpIndex = nextOpIndexes[0].first;
+
+                // bn
+                int bnWeightIndex = searchWeightIndex(spec, spec->ops[bnOpIndex].name);
+                CHECK_REQUIREMENT(bnWeightIndex >= 0);
+                CHECK_REQUIREMENT(spec->ws[bnWeightIndex].mdt == DT_F32);
+                F32 epsCur = spec->ops[bnOpIndex].ps.bn_spec.eps;
+                F32 gamaCur = spec->ops[bnOpIndex].ps.bn_spec.gama;
+                U32 channelCur =
+                    spec->ws[bnWeightIndex].bytes_of_weight / bytesOf(spec->ws[bnWeightIndex].mdt);
+                F32 *meanPtr = (F32 *)spec->ws[bnWeightIndex].weight;
+                F32 *varPtr = (F32 *)spec->ws[bnWeightIndex].vec;
+
+                std::vector<F32> stdValue(channelCur);
+                for (U32 j = 0; j < channelCur; j++) {
+                    stdValue[j] = sqrt(gamaCur * varPtr[j] + epsCur);
+                }
+
+                // conv
+                int convWeightIndex = searchWeightIndex(spec, spec->ops[prevOpIndex].name);
+                CHECK_REQUIREMENT(convWeightIndex >= 0);
+                // Now weight mdt can be DT_BIN01 or DT_BIN11
+                U32 isBNN = 0;
+                if (spec->ws[convWeightIndex].mdt == DT_BIN01 ||
+                    spec->ws[convWeightIndex].mdt == DT_BIN11) {
+                    isBNN = 1;
+                }
+                F32 *weightTemp = (F32 *)spec->ws[convWeightIndex].weight;
+                if (spec->ws[convWeightIndex].vec == nullptr) {
+                    spec->ws[convWeightIndex].bytes_of_vec = channelCur * sizeof(F32);
+                    if (isBNN == 1) {
+                        spec->ws[convWeightIndex].bytes_of_vec *= 2;
+                    }
+                    spec->ws[convWeightIndex].vec =
+                        (U8 *)mt_new_storage(spec->ws[convWeightIndex].bytes_of_vec);
+                    memset(spec->ws[convWeightIndex].vec, 0, spec->ws[convWeightIndex].bytes_of_vec);
+                }
+                F32 *vecTemp = (F32 *)spec->ws[convWeightIndex].vec;
+                if (isBNN == 1) {  // Do not modify weights for BNN
+                    F32 *scale = vecTemp;
+                    F32 *bias = vecTemp + channelCur;
+                    for (U32 m = 0; m < channelCur; m++) {
+                        // This is the first possible source of a meaningful scale, so just initialize
+                        scale[m] = 1.0 / stdValue[m];
+                        bias[m] = (bias[m] - meanPtr[m]) / stdValue[m];
+                    }
+                } else {
+                    int weightDataSize = spec->ws[convWeightIndex].bytes_of_weight /
+                        bytesOf(spec->ws[convWeightIndex].mdt);
+                    int weightPerChannel = weightDataSize / channelCur;
+                    // NCHW
+                    for (U32 m = 0; m < channelCur; m++) {
+                        F32 *convWeightPerChannel = weightTemp + weightPerChannel * m;
+                        for (int n = 0; n < weightPerChannel; n++) {
+                            convWeightPerChannel[n] /= stdValue[m];
+                        }
+                        vecTemp[m] = (vecTemp[m] - gamaCur * meanPtr[m]) / stdValue[m];
+                    }
+                }
+                // free BN memory
+                if (spec->ws[bnWeightIndex].weight != nullptr) {
+                    spec->ws[bnWeightIndex].bytes_of_weight = 0;
+                    delete spec->ws[bnWeightIndex].weight;
+                    spec->ws[bnWeightIndex].weight = nullptr;
+                }
+                if (spec->ws[bnWeightIndex].vec != nullptr) {
+                    spec->ws[bnWeightIndex].bytes_of_vec = 0;
+                    delete spec->ws[bnWeightIndex].vec;
+                    spec->ws[bnWeightIndex].vec = nullptr;
+                }
+                setOperatorInvalid(spec, bnOpIndex, true);
+                hasOptimized = true;
+            }
+        }
+        return hasOptimized;
+    }
+};
+#endif
diff --git a/model_tools/include/OPOptimizers/WeightScaleOptimizer.hpp b/model_tools/include/OPOptimizers/WeightScaleOptimizer.hpp
new file mode 100644
index 00000000..c9436625
--- /dev/null
+++ b/model_tools/include/OPOptimizers/WeightScaleOptimizer.hpp
@@ -0,0 +1,144 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_WEIGHTSCALEOPTIMIZER
+#define _H_WEIGHTSCALEOPTIMIZER
+
+#include "OPOptimizer.hpp"
+
+class WeightScaleOptimizer : public OPOptimizer {
+    bool optimize(ModelSpec *spec) override
+    {
+        bool hasOptimized = false;
+        for (int i = 0; i < spec->num_operator_specs; i++) {
+            if (OT_Conv == spec->ops[i].type || OT_FC == spec->ops[i].type ||
+                OT_Scale == spec->ops[i].type) {
+                int prevOpIndex = i;
+                if (OT_Conv == spec->ops[prevOpIndex].type) {
+                    if (ACTIVATION_NULL != spec->ops[prevOpIndex].ps.conv_spec.dw_activation_type ||
+                        ACTIVATION_NULL != spec->ops[prevOpIndex].ps.conv_spec.pw_activation_type) {
+                        continue;
+                    }
+                }
+                std::vector<std::pair<int, int>> nextOpIndexes = searchOperatorIndexByInput(spec,
+                    spec->ops[prevOpIndex].output_tensors_name[0], prevOpIndex + 1,
+                    spec->num_operator_specs);
+                if (nextOpIndexes.size() != 1 || OT_Scale != spec->ops[nextOpIndexes[0].first].type) {
+                    continue;
+                }
+                int scaleOpIndex = nextOpIndexes[0].first;
+                if (spec->ops[scaleOpIndex].num_inputs > 1) {
+                    UNI_WARNING_LOG(
+                        "encountered a multi-input Scale layer that cannot be optimized: %s\n",
+                        spec->ops[i].name);
+                    continue;
+                }
+
+                // scale
+                int scaleWeightIndex = searchWeightIndex(spec, spec->ops[scaleOpIndex].name);
+                CHECK_REQUIREMENT(scaleWeightIndex >= 0);
+                CHECK_REQUIREMENT(spec->ws[scaleWeightIndex].mdt == DT_F32);
+                U32 channelAlpha = spec->ws[scaleWeightIndex].bytes_of_weight /
+                    bytesOf(spec->ws[scaleWeightIndex].mdt);
+                U32 channelBeta = spec->ws[scaleWeightIndex].bytes_of_vec /
+                    bytesOf(spec->ws[scaleWeightIndex].mdt);
+                U32 channelCur = UNI_MAX(channelAlpha, channelBeta);
+                F32 *alphaPtr = (F32 *)spec->ws[scaleWeightIndex].weight;
+                F32 *betaPtr = (F32 *)spec->ws[scaleWeightIndex].vec;
+
+                if (spec->ws[scaleWeightIndex].bytes_of_weight == 4 ||
+                    spec->ws[scaleWeightIndex].bytes_of_vec == 4) {
+                    continue;
+                }
+
+                int convWeightIndex = searchWeightIndex(spec, spec->ops[prevOpIndex].name);
+                CHECK_REQUIREMENT(convWeightIndex >= 0);
+                // mdt can now be DT_BIN01 or DT_BIN11
+                U32 isBNN = 0;
+                if (spec->ws[convWeightIndex].mdt == DT_BIN01 ||
spec->ws[convWeightIndex].mdt == DT_BIN11) { + isBNN = 1; + } + + // scale + scale + if (spec->ws[convWeightIndex].weight == nullptr || + spec->ws[convWeightIndex].bytes_of_weight == 0) { + spec->ws[convWeightIndex].bytes_of_weight = channelCur * sizeof(F32); + spec->ws[convWeightIndex].weight = + (U8 *)mt_new_storage(spec->ws[convWeightIndex].bytes_of_weight); + F32 *ptr = (F32 *)spec->ws[convWeightIndex].weight; + for (U32 m = 0; m < channelCur; m++) { + ptr[m] = 1; + } + } + F32 *weightTemp = (F32 *)spec->ws[convWeightIndex].weight; + if (spec->ws[convWeightIndex].vec == nullptr) { + spec->ws[convWeightIndex].bytes_of_vec = channelCur * sizeof(F32); + if (isBNN == 1) { + spec->ws[convWeightIndex].bytes_of_vec *= 2; + } + spec->ws[convWeightIndex].vec = + (U8 *)mt_new_storage(spec->ws[convWeightIndex].bytes_of_vec); + memset(spec->ws[convWeightIndex].vec, 0, spec->ws[convWeightIndex].bytes_of_vec); + } + F32 *vecTemp = (F32 *)spec->ws[convWeightIndex].vec; + if (isBNN == 1) { + F32 *scale = vecTemp; + F32 *bias = vecTemp + channelCur; + for (U32 m = 0; m < channelCur; m++) { + if (scale[m] == 0) { + scale[m] = alphaPtr[m]; + } else { + scale[m] *= alphaPtr[m]; + } + bias[m] *= alphaPtr[m]; + if (betaPtr != nullptr) { + bias[m] += betaPtr[m]; + } + } + } else { + int weightDataSize = spec->ws[convWeightIndex].bytes_of_weight / + bytesOf(spec->ws[convWeightIndex].mdt); + int weightPerChannel = weightDataSize / channelCur; + // NCHW + for (U32 m = 0; m < channelCur; m++) { + F32 *convWeightPerChannel = weightTemp + weightPerChannel * m; + if (alphaPtr != nullptr) { + for (int n = 0; n < weightPerChannel; n++) { + convWeightPerChannel[n] *= alphaPtr[m]; + } + vecTemp[m] = alphaPtr[m] * vecTemp[m]; + } + if (betaPtr != nullptr) { + vecTemp[m] += betaPtr[m]; + } + } + } + // free scale memory + if (spec->ws[scaleWeightIndex].weight != nullptr) { + spec->ws[scaleWeightIndex].bytes_of_weight = 0; + delete spec->ws[scaleWeightIndex].weight; + spec->ws[scaleWeightIndex].weight = nullptr; + } + if (spec->ws[scaleWeightIndex].vec != nullptr) { + spec->ws[scaleWeightIndex].bytes_of_vec = 0; + delete spec->ws[scaleWeightIndex].vec; + spec->ws[scaleWeightIndex].vec = nullptr; + } + setOperatorInvalid(spec, scaleOpIndex, true); + hasOptimized = true; + } + } + return hasOptimized; + } +}; +#endif diff --git a/model_tools/include/converter.h b/model_tools/include/converter.h new file mode 100644 index 00000000..4514d93a --- /dev/null +++ b/model_tools/include/converter.h @@ -0,0 +1,44 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_CONVERTER +#define _H_CONVERTER +#include "types.h" +#include "error.h" +#include "model_tools.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _USE_CAFFE +EE caffe_converter(std::string dir, std::string mfn, ModelSpec *ms); +#endif + +#ifdef _USE_ONNX +EE onnx_converter(std::string dir, std::string mfn, int removePreprocessOpNum, ModelSpec *ms); +#endif + +#ifdef _USE_TFLITE +EE tflite_converter(std::string dir, std::string mfn, ModelSpec *ms); +#endif + +#ifdef _USE_TENSORFLOW +EE tensorflow_converter(std::string dir, std::string mfn, ModelSpec *ms); +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/model_tools/include/model_optimizer.hpp b/model_tools/include/model_optimizer.hpp new file mode 100644 index 00000000..02fbbfea --- /dev/null +++ b/model_tools/include/model_optimizer.hpp @@ -0,0 +1,140 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
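+
+// ModelSpecOptimizer (below) maintains an ordered list of graph passes and
+// runs each of them once over a ModelSpec. A minimal usage sketch, assuming a
+// caller that already holds a deserialized ModelSpec (the driver code is
+// hypothetical, not part of this patch):
+//     ModelSpecOptimizer mso;
+//     mso.suggest(false);                  // isPTQ = false: full pass list
+//     bool changed = mso.optimize(&spec);  // true if any pass rewrote the graph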
+
+#ifndef _H_MODELOPTIMIZER
+#define _H_MODELOPTIMIZER
+
+#include <vector>
+#include <memory>
+#include "model_tools.h"
+#include "model_serialize_deserialize.hpp"
+#include "OPOptimizers/OPOptimizer.hpp"
+#include "OPOptimizers/DeprecatedOPOptimizer.hpp"
+#include "OPOptimizers/WeightBNOptimizer.hpp"
+#include "OPOptimizers/BNScaleOptimizer.hpp"
+#include "OPOptimizers/WeightScaleOptimizer.hpp"
+#include "OPOptimizers/PadOptimizer.hpp"
+#include "OPOptimizers/InPlaceOptimizer.hpp"
+#include "OPOptimizers/ActivationOptimizer.hpp"
+#include "OPOptimizers/ChannelPaddingOptimizer.hpp"
+#include "OPOptimizers/DepthwisePointwiseOptimizer.hpp"
+#include "OPOptimizers/TransposeMulToScaleOptimizer.hpp"
+#include "OPOptimizers/TransposeMatMulToFCOptimizer.hpp"
+#include "OPOptimizers/FCFCOptimizer.hpp"
+#include "OPOptimizers/ClipClipOptimizer.hpp"
+#include "OPOptimizers/SqueezeReshapeOptimizer.hpp"
+#include "OPOptimizers/NoQuantLabelOptimizer.hpp"
+#include "OPOptimizers/MemoryReuseOptimizer.hpp"
+#include "OPOptimizers/ShGaUnCoReOptimizer.hpp"
+#include "OPOptimizers/RNNOptimizer.hpp"
+#include "OPOptimizers/CastOptimizer.hpp"
+#include "OPOptimizers/LayerNormOptimizer.hpp"
+#include "OPOptimizers/InnerProductOptimizer.hpp"
+#include "OPOptimizers/GeluOptimizer.hpp"
+#include "OPOptimizers/InvariantSliceOptimizer.hpp"
+#include "OPOptimizers/MultiHeadAttentionOptimizer.hpp"
+#include "OPOptimizers/StdDeviationOptimizer.hpp"
+#include "OPOptimizers/PowerOptimizer.hpp"
+
+class ModelSpecOptimizer {
+public:
+    ModelSpecOptimizer()
+    {}
+
+    bool optimize(ModelSpec *spec)
+    {
+        bool optimizeOrNot = false;
+        for (auto opo : opos) {
+            if (opo->optimize(spec)) {
+                optimizeOrNot = true;
+            }
+        }
+        return optimizeOrNot;
+    }
+
+    void suggest(bool isPTQ)
+    {
+        // strict order
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new DeprecatedOPOptimizer()));
+
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new LayerNormOptimizer()));
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new GeluOptimizer()));
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new TransposeMatMulToFCOptimizer()));
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new InnerProductOptimizer()));
+        // this->opos.push_back(std::shared_ptr<OPOptimizer>(new MultiHeadAttentionOptimizer()));
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new InvariantSliceOptimizer()));
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new InPlaceOptimizer()));
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new PowerOptimizer()));
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new ActivationOptimizer()));
+        if (!isPTQ) {
+            // Fuse BN with previous conv or fc
+            this->opos.push_back(std::shared_ptr<OPOptimizer>(new WeightBNOptimizer()));
+            // Fuse scale with previous conv or fc
+            this->opos.push_back(std::shared_ptr<OPOptimizer>(new WeightScaleOptimizer()));
+        }
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new BNScaleOptimizer()));
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new PadOptimizer()));
+
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new ActivationOptimizer()));
+        if (!isPTQ) {
+            this->opos.push_back(std::shared_ptr<OPOptimizer>(new DepthwisePointwiseOptimizer()));
+        }
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new TransposeMulToScaleOptimizer()));
+
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new ClipClipOptimizer()));
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new SqueezeReshapeOptimizer()));
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new ShGaUnCoReOptimizer()));
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new RNNOptimizer()));
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new CastOptimizer()));
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new StdDeviationOptimizer()));
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new ChannelPaddingOptimizer()));
+
+        // Please leave MemoryReuseOptimizer at last
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new MemoryReuseOptimizer()));
+    }
+
+    void suggest_for_training()
+    {
+        // strict order
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new DeprecatedOPOptimizer()));
+
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new PadOptimizer()));
+
+        this->opos.push_back(std::shared_ptr<OPOptimizer>(new NoQuantLabelOptimizer(false, 0)));
+    }
+
+    void suggest_for_ptq(std::string inferPrecision, bool fuseBN, F32 clipVal)
+    {
+        if (fuseBN) {
+            // Fuse BN with previous conv or fc
+            this->opos.push_back(std::shared_ptr<OPOptimizer>(new WeightBNOptimizer()));
+            // Fuse scale with previous conv or fc
+            this->opos.push_back(std::shared_ptr<OPOptimizer>(new WeightScaleOptimizer()));
+        }
+
+        bool hiddenMode = (inferPrecision == "HIDDEN");
+        if (!hiddenMode) {
+            this->opos.push_back(std::shared_ptr<OPOptimizer>(new DepthwisePointwiseOptimizer()));
+        }
+
+        this->opos.push_back(
+            std::shared_ptr<OPOptimizer>(new NoQuantLabelOptimizer(hiddenMode, clipVal)));
+    }
+
+    void empty()
+    {}
+
+private:
+    std::vector<std::shared_ptr<OPOptimizer>> opos;
+};
+
+#endif
diff --git a/model_tools/include/model_quantization.h b/model_tools/include/model_quantization.h
new file mode 100644
index 00000000..0729c789
--- /dev/null
+++ b/model_tools/include/model_quantization.h
@@ -0,0 +1,26 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_MODEL_QUANTIZATION
+#define _H_MODEL_QUANTIZATION
+#include <string>
+#include <fstream>
+#include <iostream>
+#include <vector>
+#include "online_conversion.h"
+
+std::vector<std::string> SplitScale(const std::string &s, char delim);
+
+void add_scale_from_file(ModelSpec *spec, const char *scaleFile);
+
+#endif
diff --git a/model_tools/include/model_tools.h b/model_tools/include/model_tools.h
new file mode 100644
index 00000000..fbd7cb48
--- /dev/null
+++ b/model_tools/include/model_tools.h
@@ -0,0 +1,35 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_MODEL_TOOLS +#define _H_MODEL_TOOLS + +#include "tensor_desc.h" +#include "op_type.h" + +#ifdef __cplusplus +extern "C" { +#endif + +EE mt_create_model(ModelSpec *md); +EE mt_load(CI8 *dir, CI8 *mfn, ModelSpec *md); +#if defined(_USE_CAFFE) || defined(_USE_ONNX) || defined(_USE_TFLITE) || defined(_USE_TENSORFLOW) +EE mt_store(CI8 *dir, CI8 *mfn, const ModelSpec *md); +#endif +EE mt_destroy_model(ModelSpec *md); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/model_tools/include/online_conversion.h b/model_tools/include/online_conversion.h new file mode 100644 index 00000000..536d464b --- /dev/null +++ b/model_tools/include/online_conversion.h @@ -0,0 +1,31 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
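+
+// A hedged call-flow sketch for the declarations below (the directory, model
+// name and precision string are illustrative assumptions, not fixed values):
+//     void *ms = OnlineModelConversion("./model_dir", "mobilenet_v1", "FP32", 0);
+//     // ... serialize or further optimize the returned ModelSpec ...
+//     OnlineModelReclaim(ms);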
+ +#ifndef _H_ONLINE_CONVERSION +#define _H_ONLINE_CONVERSION +#include "types.h" +#include "error.h" +#include "model_tools.h" +#include "model_serialize_deserialize.hpp" +#include "model_optimizer.hpp" +#include "converter.h" +#include "model_print.h" + +void *OnlineModelConversion(const char *storagePath, + const char *modelName, + const char *inferPrecision, + I32 removeProcessOpsNum); + +void OnlineModelReclaim(void *ms); + +#endif diff --git a/model_tools/src/CMakeLists.txt b/model_tools/src/CMakeLists.txt new file mode 100644 index 00000000..b135ff34 --- /dev/null +++ b/model_tools/src/CMakeLists.txt @@ -0,0 +1,36 @@ +set(srcs "${CMAKE_CURRENT_SOURCE_DIR}/model_tools.cpp") + +if(BUILD_TEST OR USE_CAFFE OR USE_ONNX OR USE_TFLITE OR USE_TENSORFLOW) + set(srcs "${srcs};${CMAKE_CURRENT_SOURCE_DIR}/data_type_converter.cpp;${CMAKE_CURRENT_SOURCE_DIR}/online_conversion.cpp;${CMAKE_CURRENT_SOURCE_DIR}/model_quantization.cpp;") +endif(BUILD_TEST OR USE_CAFFE OR USE_ONNX OR USE_TFLITE OR USE_TENSORFLOW) + +# shared library +add_library(${PROJECT_NAME} SHARED ${srcs}) + +# static library +add_library(${PROJECT_NAME}_static STATIC ${srcs}) + +if (USE_CAFFE) + add_subdirectory(caffe) + target_link_libraries (${PROJECT_NAME} LINK_PUBLIC ${PROJECT_NAME}_caffe) +endif(USE_CAFFE) +if (USE_ONNX) + add_subdirectory(onnx) + target_link_libraries (${PROJECT_NAME} LINK_PUBLIC ${PROJECT_NAME}_onnx) +endif(USE_ONNX) +if (USE_TFLITE) + add_subdirectory(tflite) + target_link_libraries (${PROJECT_NAME} LINK_PUBLIC ${PROJECT_NAME}_tflite) +endif(USE_TFLITE) +if (USE_TENSORFLOW) + add_subdirectory(tensorflow) + target_link_libraries (${PROJECT_NAME} LINK_PUBLIC ${PROJECT_NAME}_tensorflow) +endif(USE_TENSORFLOW) +target_link_libraries (${PROJECT_NAME} LINK_PUBLIC uni) + +set_target_properties(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") +set_target_properties(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +install(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) diff --git a/model_tools/src/caffe/CMakeLists.txt b/model_tools/src/caffe/CMakeLists.txt new file mode 100644 index 00000000..60d533f5 --- /dev/null +++ b/model_tools/src/caffe/CMakeLists.txt @@ -0,0 +1,34 @@ +file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) +file(GLOB commonsrcs ${CMAKE_CURRENT_SOURCE_DIR}/../model_*.cpp) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +include_directories(${Protobuf_INCLUDE_DIR}) + +set(Protobuf_IMPORT_DIRS ${BOLT_ROOT}/third_party/proto) +protobuf_generate_cpp(CAFFE_PROTO_SRCS CAFFE_PROTO_HDRS ${BOLT_ROOT}/third_party/proto/caffe.proto) + +protobuf_generate_python(CAFFE_PROTO_PYTHON_SRCS ${BOLT_ROOT}/third_party/proto/caffe.proto) + +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(../) + +add_custom_target(caffe_pb2.py ALL + DEPENDS ${CAFFE_PROTO_PYTHON_SRCS} + COMMAND ${CMAKE_COMMAND} -E copy ${CAFFE_PROTO_PYTHON_SRCS} ${PROJECT_SOURCE_DIR}/tools/tensorflow2caffe/Caffe) + +# shared library +add_library(${PROJECT_NAME}_caffe SHARED ${srcs} ${CAFFE_PROTO_HDRS} ${CAFFE_PROTO_SRCS} ${commonsrcs}) +if (USE_IOS_CLANG) + target_link_libraries(${PROJECT_NAME}_caffe LINK_PUBLIC uni ${Protobuf_LIBRARY}) +endif(USE_IOS_CLANG) + +# static library +add_library(${PROJECT_NAME}_caffe_static STATIC ${srcs} ${CAFFE_PROTO_HDRS} ${CAFFE_PROTO_SRCS}) + +set_target_properties(${PROJECT_NAME}_caffe_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}_caffe") 
+set_target_properties(${PROJECT_NAME}_caffe PROPERTIES CLEAN_DIRECT_OUTPUT 1)
+set_target_properties(${PROJECT_NAME}_caffe_static PROPERTIES CLEAN_DIRECT_OUTPUT 1)
+install(TARGETS ${PROJECT_NAME}_caffe ${PROJECT_NAME}_caffe_static
+        LIBRARY DESTINATION lib
+        ARCHIVE DESTINATION lib)
diff --git a/model_tools/src/caffe/caffe_adaptee.h b/model_tools/src/caffe/caffe_adaptee.h
new file mode 100644
index 00000000..5577af98
--- /dev/null
+++ b/model_tools/src/caffe/caffe_adaptee.h
@@ -0,0 +1,1449 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_CAFFEADAPTEE
+#define _H_CAFFEADAPTEE
+
+#include <string>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/io/zero_copy_stream_impl.h>
+#include <google/protobuf/message.h>
+#include <google/protobuf/text_format.h>
+#include "caffe.pb.h"
+
+#include "converter.h"
+#include "model_tools.h"
+#include "model_adaptee.h"
+#include "ut_util.h"
+
+class CaffeAdaptee : public ModelAdaptee {
+public:
+    CaffeAdaptee()
+    {}
+
+    ~CaffeAdaptee()
+    {}
+
+protected:
+    // read prototxt
+    EE read_from_prototxt(const char *path, google::protobuf::Message *message)
+    {
+        std::ifstream fs(path, std::ifstream::in);
+        if (!fs.is_open()) {
+            return NOT_FOUND;
+        }
+
+        google::protobuf::io::IstreamInputStream input(&fs);
+        bool ret = google::protobuf::TextFormat::Parse(&input, message);
+        fs.close();
+        return (ret) ? SUCCESS : NOT_SUPPORTED;
+    }
+
+    // read caffemodel(bin)
+    EE read_from_caffemodel(const char *path, google::protobuf::Message *message)
+    {
+        std::ifstream fs(path, std::ifstream::in | std::ifstream::binary);
+        if (!fs.is_open()) {
+            return NOT_FOUND;
+        }
+
+        google::protobuf::io::IstreamInputStream input(&fs);
+        google::protobuf::io::CodedInputStream codedstr(&input);
+
+        codedstr.SetTotalBytesLimit(INT_MAX, INT_MAX / 2);
+
+        bool ret = message->ParseFromCodedStream(&codedstr);
+        fs.close();
+
+        return (ret) ?
SUCCESS : NOT_SUPPORTED; + } + + OperatorType convert_caffe_type(std::string inputType) + { + if (inputType == "Convolution") { + return OT_Conv; + } else if (inputType == "Deconvolution") { + return OT_Deconvolution; + } else if (inputType == "L2Norm") { + return OT_L2Normalization; + } else if (inputType == "BatchNorm") { + return OT_BatchNorm; + } else if (inputType == "Scale") { + return OT_Scale; + } else if (inputType == "Eltwise") { + return OT_Eltwise; + } else if (inputType == "InnerProduct") { + return OT_FC; + } else if (inputType == "Pooling") { + return OT_Pooling; + } else if (inputType == "ReLU") { + return OT_Relu; + } else if (inputType == "ReLU6") { + return OT_Relu6; + } else if (inputType == "HSwish") { + return OT_HSwish; + } else if (inputType == "Sigmoid") { + return OT_Sigmoid; + } else if (inputType == "HSigmoid") { + return OT_HSigmoid; + } else if (inputType == "Softmax") { + return OT_Softmax; + } else if (inputType == "Concat") { + return OT_Concat; + } else if (inputType == "Embed") { + return OT_Embedding; + } else if (inputType == "Gelu") { + return OT_Gelu; + } else if (inputType == "LayerNorm") { + return OT_LayerNorm; + } else if (inputType == "MatMul") { + return OT_MatMul; + } else if (inputType == "Power") { + return OT_Power; + } else if (inputType == "Reshape") { + return OT_Reshape; + } else if (inputType == "Slice") { + return OT_Slice; + } else if (inputType == "Attention") { + return OT_Attention; + } else if (inputType == "Input") { + return OT_Input; + } else if (inputType == "LSTM") { + return OT_RNN; + } else if (inputType == "TanH") { + return OT_TanH; + } else if (inputType == "SoftmaxWithLoss") { + return OT_SoftmaxWithLoss; + } else if (inputType == "Squeeze") { + return OT_Squeeze; + } else if (inputType == "Unsqueeze") { + return OT_Unsqueeze; + } else if (inputType == "Reduction") { + return OT_Reduction; + } else if (inputType == "ArgMax") { + return OT_ArgMax; + } else if (inputType == "PreAllocatedMemory") { + return OT_PreAllocatedMemory; + } else if (inputType == "SharedWeight") { + return OT_SharedWeight; + } else if (inputType == "Copy") { + return OT_Copy; + } else if (inputType == "Check") { + return OT_Check; + } else if (inputType == "Repeat") { + return OT_Repeat; + } else if (inputType == "Interp") { + return OT_Resize; + } else if (inputType == "Jump") { + return OT_Jump; + } else if (inputType == "AttentionMask") { + return OT_AttentionMask; + } else if (inputType == "RelativePositionEmbed") { + return OT_RelativePositionEmbedding; + } else if (inputType == "RelativeShift") { + return OT_RelativeShift; + } else if (inputType == "Dropout") { + return OT_None; + } else if (inputType == "Flatten") { + return OT_Reshape; + } else if (inputType == "Permute") { + return OT_Transpose; + } else if (inputType == "Clip") { + return OT_Clip; + } else if (inputType == "PriorBox") { + return OT_PriorBox; + } else if (inputType == "DetectionOutput") { + return OT_DetectionOutput; + } else if (inputType == "Yolov3DetectionOutput") { + return OT_Yolov3DetectionOutput; + } else if (inputType == "Mish") { + return OT_Mish; + } else if (inputType == "PReLU") { + return OT_PRelu; + } else if (inputType == "Tile") { + return OT_Tile; + } else if (inputType == "Pad") { + return OT_Pad; + } else { + UNI_ERROR_LOG("encounter unsupported operator %s\n", inputType.c_str()); + } + return OT_None; + } + + int net_search_layerId(caffe::NetParameter &netParams, std::string &layerName) + { + int i = 0; + if (netParams.layer_size() > 0) { + for (i = 0; 
i < netParams.layer_size(); i++) {
+                if (netParams.layer(i).name() == layerName) {
+                    return i;
+                }
+            }
+        } else {
+            for (i = 0; i < netParams.layers_size(); i++) {
+                if (netParams.layers(i).name() == layerName) {
+                    return i;
+                }
+            }
+        }
+        return -1;
+    }
+
+    caffe::BlobProto net_get_blob(caffe::NetParameter &netParams, int layerId, int blobId)
+    {
+        if (netParams.layer_size() > 0) {
+            return netParams.layer(layerId).blobs(blobId);
+        } else {
+            return netParams.layers(layerId).blobs(blobId);
+        }
+    }
+
+    int net_get_blobs_size(caffe::NetParameter &netParams, int layerId)
+    {
+        if (netParams.layer_size() > 0) {
+            return netParams.layer(layerId).blobs_size();
+        } else {
+            return netParams.layers(layerId).blobs_size();
+        }
+    }
+
+    void net_copy_blob(WeightSpec *wsPtr,
+        int weightIndex,
+        caffe::NetParameter &netParams,
+        int netLayerId,
+        int blobNum,
+        OperatorType operatorType)
+    {
+        wsPtr[weightIndex].mdt = DT_F32;
+        wsPtr[weightIndex].bytes_of_weight = 0;
+        wsPtr[weightIndex].weight = nullptr;
+        wsPtr[weightIndex].bytes_of_vec = 0;
+        wsPtr[weightIndex].vec = nullptr;
+
+        std::vector<std::pair<caffe::BlobProto, U32>> weights;
+        std::vector<std::pair<caffe::BlobProto, U32>> biases;
+        // BatchNorm may have 3 blobs, but the third blob can be ignored
+        if (operatorType == OT_BatchNorm) {
+            if (blobNum >= 3) {
+                blobNum = 2;
+            }
+        }
+        if (blobNum >= 1) {
+            caffe::BlobProto blob0 = net_get_blob(netParams, netLayerId, 0);
+            U32 elemSize = sizeof(*(blob0.data().data()));
+            CHECK_REQUIREMENT(elemSize == bytesOf(wsPtr[weightIndex].mdt));
+            U32 blobSize = elemSize * blob0.data_size();
+            wsPtr[weightIndex].bytes_of_weight += blobSize;
+            weights.push_back(std::make_pair(blob0, blobSize));
+        }
+        if (blobNum >= 2) {
+            caffe::BlobProto blob1 = net_get_blob(netParams, netLayerId, 1);
+            U32 elemSize = sizeof(*(blob1.data().data()));
+            CHECK_REQUIREMENT(sizeof(*(blob1.data().data())) == bytesOf(wsPtr[weightIndex].mdt));
+            U32 blobSize = elemSize * blob1.data_size();
+            wsPtr[weightIndex].bytes_of_vec += blobSize;
+            biases.push_back(std::make_pair(blob1, blobSize));
+        }
+        if (blobNum >= 3) {
+            caffe::BlobProto blob2 = net_get_blob(netParams, netLayerId, 2);
+            U32 elemSize = sizeof(*(blob2.data().data()));
+            CHECK_REQUIREMENT(elemSize == bytesOf(wsPtr[weightIndex].mdt));
+            U32 blobSize = elemSize * blob2.data_size();
+            wsPtr[weightIndex].bytes_of_weight += blobSize;
+            weights.push_back(std::make_pair(blob2, blobSize));
+        }
+        if (weights.size() > 0) {
+            wsPtr[weightIndex].weight = (U8 *)mt_new_storage(wsPtr[weightIndex].bytes_of_weight);
+            U8 *ptr = wsPtr[weightIndex].weight;
+            for (U32 i = 0; i < weights.size(); i++) {
+                memcpy(ptr, weights[i].first.data().data(), weights[i].second);
+                ptr += weights[i].second;
+            }
+        }
+        if (biases.size() > 0) {
+            wsPtr[weightIndex].vec = (U8 *)mt_new_storage(wsPtr[weightIndex].bytes_of_vec);
+            U8 *ptr = wsPtr[weightIndex].vec;
+            for (U32 i = 0; i < biases.size(); i++) {
+                memcpy(ptr, biases[i].first.data().data(), biases[i].second);
+                ptr += biases[i].second;
+            }
+        }
+    }
+
+    EE parse_file(std::string dir, std::string mfn) override
+    {
+        EE ret = SUCCESS;
+        std::string prototxtSuffix = ".prototxt";
+        std::string caffeModelSuffix = ".caffemodel";
+        std::string prototxtPath = dir + "/" + mfn + prototxtSuffix;
+        std::string caffeModelPath = dir + "/" + mfn + caffeModelSuffix;
+
+        // load prototxt
+        ret = read_from_prototxt(prototxtPath.c_str(), (google::protobuf::Message *)(&proto));
+        if (proto.layer_size() <= 0 || ret != SUCCESS) {
+            UNI_ERROR_LOG("fail to load caffe prototxt file %s\n", prototxtPath.c_str());
+        }
+
+        // load model bin
+        ret = read_from_caffemodel(caffeModelPath.c_str(), (google::protobuf::Message *)(&net));
+        if (ret != SUCCESS) {
+            UNI_ERROR_LOG("fail to load caffe model file %s\n", caffeModelPath.c_str());
+        }
+        return ret;
+    }
+
+    // the first pass fills in the operator, input and output info
+    EE adapt_operators(ModelSpec *ms) override
+    {
+        EE ret = SUCCESS;
+        // model_name
+        str_copy(ms->model_name, proto.name().c_str(), proto.name().length());
+        ms->dt = DT_F32;  // set default value
+
+        ms->num_operator_specs = proto.layer_size();
+        OperatorSpec *opsPtr =
+            (OperatorSpec *)mt_new_storage(sizeof(OperatorSpec) * ms->num_operator_specs);
+        ms->ops = opsPtr;
+        for (I32 i = 0; i < ms->num_operator_specs; i++) {
+            ms->ops[i].tensor_positions = nullptr;
+            ms->ops[i].num_quant_feature = 0;
+            ms->ops[i].feature_scale = nullptr;
+        }
+
+        int inputsNumber = 0;
+        weightNumber = 0;  // set global variable initial value
+        std::map<std::string, int> outputCounts;
+        for (int i = 0; i < proto.input_size(); i++) {
+            outputCounts[proto.input(i).c_str()] = 1;
+        }
+        for (int i = 0; i < proto.layer_size(); i++) {
+            const caffe::LayerParameter curLayer = proto.layer(i);
+            this->layer = curLayer;
+
+            if (layer.type() == "Input") {  // layer is the member variable set just above
+                inputsNumber++;
+            }
+            str_copy(opsPtr[i].name, layer.name().c_str(), layer.name().length());
+
+            this->op = layer.type();
+            opsPtr[i].type = convert_caffe_type(layer.type());
+            int bottomSize = layer.bottom_size();
+            opsPtr[i].num_inputs = bottomSize;
+            opsPtr[i].input_tensors_name = (I8 **)mt_new_storage(bottomSize * sizeof(I8 *));
+            for (int j = 0; j < bottomSize; j++) {
+                opsPtr[i].input_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8));
+                str_copy(opsPtr[i].input_tensors_name[j], layer.bottom(j).c_str(),
+                    layer.bottom(j).length());
+                if (outputCounts.find(layer.bottom(j)) == outputCounts.end()) {
+                    if (opsPtr[i].type != OT_Jump) {
+                        UNI_ERROR_LOG("no earlier output found to serve as this operator's input %s\n",
+                            layer.bottom(j).c_str());
+                    }
+                } else {
+                    outputCounts[layer.bottom(j)]--;
+                }
+            }
+            int topSize = layer.top_size();
+            opsPtr[i].num_outputs = topSize;
+            opsPtr[i].output_tensors_name = (I8 **)mt_new_storage(topSize * sizeof(I8 *));
+            for (int j = 0; j < topSize; j++) {
+                opsPtr[i].output_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8));
+                str_copy(
+                    opsPtr[i].output_tensors_name[j], layer.top(j).c_str(), layer.top(j).length());
+                if (outputCounts.find(layer.top(j)) == outputCounts.end()) {
+                    outputCounts[layer.top(j)] = 1;
+                } else {
+                    outputCounts[layer.top(j)]++;
+                }
+            }
+
+            CHECK_STATUS(adapt_operator(opsPtr[i].type, &(ms->ops[i].ps)));
+        }
+
+        inputsNumber = (inputsNumber > proto.input_size()) ?
inputsNumber : proto.input_size(); + ms->num_inputs = inputsNumber; + ms->input_names = (I8 **)mt_new_storage(inputsNumber * sizeof(I8 *)); + ms->input_dims = (TensorDesc *)mt_new_storage(sizeof(TensorDesc) * inputsNumber); + for (int i = 0; i < inputsNumber; i++) { + ms->input_names[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + + if (proto.input_size() > 0) { + str_copy(ms->input_names[i], proto.input(i).c_str(), proto.input(i).length()); + switch (proto.input_dim_size()) { + case 2: + ms->input_dims[i] = + tensor2df(DT_U32, DF_NORMAL, proto.input_dim(0), proto.input_dim(1)); + break; + case 3: + ms->input_dims[i] = tensor3df(DT_F32, DF_MTK, proto.input_dim(0), + proto.input_dim(1), proto.input_dim(2)); + break; + case 4: + ms->input_dims[i] = tensor4df(DT_F32, DF_NCHW, proto.input_dim(0), + proto.input_dim(1), proto.input_dim(2), proto.input_dim(3)); + break; + default: { + UNI_ERROR_LOG("unsupported input dim\n"); + } + } + } + if (i < proto.input_shape_size()) { + str_copy(ms->input_names[i], proto.input(i).c_str(), proto.input(i).length()); + switch (proto.input_shape(i).dim_size()) { + case 2: + ms->input_dims[i] = tensor2df(DT_U32, DF_NORMAL, + proto.input_shape(i).dim(0), proto.input_shape(i).dim(1)); + break; + case 3: + ms->input_dims[i] = tensor3df(DT_F32, DF_NCHW, proto.input_shape(i).dim(0), + proto.input_shape(i).dim(1), proto.input_shape(i).dim(2)); + break; + case 4: + ms->input_dims[i] = tensor4df(DT_F32, DF_NCHW, proto.input_shape(i).dim(0), + proto.input_shape(i).dim(1), proto.input_shape(i).dim(2), + proto.input_shape(i).dim(3)); + break; + default: { + UNI_ERROR_LOG("unsupported input dim\n"); + } + } + } + } + + for (int i = 0; i < proto.output_size(); i++) { + std::string name = proto.output(i); + if (outputCounts.find(name) == outputCounts.end()) { + UNI_ERROR_LOG("can not find output %s in tensors\n", name.c_str()); + } else { + outputCounts[name] = (outputCounts[name] > 0) ? 
outputCounts[name] : 1; + } + } + int outputsNumber = 0; + for (auto iter : outputCounts) { + if (iter.second > 0) { + outputsNumber++; + } + } + ms->num_outputs = outputsNumber; + ms->output_names = (I8 **)mt_new_storage(outputsNumber * sizeof(I8 *)); + outputsNumber = 0; + for (auto iter : outputCounts) { + if (iter.second > 0) { + ms->output_names[outputsNumber] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(ms->output_names[outputsNumber], iter.first.c_str(), iter.first.length()); + outputsNumber++; + } + } + ms->num_weight_specs = this->weightNumber; // use the global variable + return ret; + } + + EE adapt_weights(ModelSpec *ms) override + { + EE ret = SUCCESS; + WeightSpec *wsPtr = (WeightSpec *)mt_new_storage(sizeof(WeightSpec) * ms->num_weight_specs); + for (int j = 0; j < ms->num_weight_specs; j++) { + wsPtr[j].num_quant_scale = 0; + wsPtr[j].weight_scale = nullptr; + } + ms->ws = wsPtr; + int inNamesIndex = 0; + int weightIndex = 0; + + for (int i = 0; i < proto.layer_size(); i++) { + const caffe::LayerParameter layer = proto.layer(i); + std::string layerName = layer.name(); + std::string layerType = layer.type(); + + if (layerType == "Input") { + str_copy(ms->input_names[inNamesIndex], layerName.c_str(), layerName.length()); + switch (layer.input_param().shape(0).dim_size()) { + case 2: + ms->input_dims[inNamesIndex] = tensor2df(DT_U32, DF_NORMAL, + layer.input_param().shape(0).dim(0), + layer.input_param().shape(0).dim(1)); + break; + case 3: + ms->input_dims[inNamesIndex] = tensor3df(DT_F32, DF_MTK, + layer.input_param().shape(0).dim(0), layer.input_param().shape(0).dim(1), + layer.input_param().shape(0).dim(2)); + break; + case 4: + ms->input_dims[inNamesIndex] = tensor4df(DT_F32, DF_NCHW, + layer.input_param().shape(0).dim(0), + layer.input_param().shape(0).dim(1), layer.input_param().shape(0).dim(2), + layer.input_param().shape(0).dim(3)); + break; + default: { + UNI_ERROR_LOG("unsupported input dim\n"); + } + } + inNamesIndex++; + } else if (layerType == "Convolution" || layerType == "InnerProduct" || + layerType == "BatchNorm" || layerType == "Embed" || layerType == "LSTM" || + layerType == "SharedWeight" || layerType == "RelativePositionEmbed" || + layerType == "Deconvolution" || layerType == "PReLU") { + int netLayerId = net_search_layerId(net, layerName); + CHECK_REQUIREMENT(netLayerId >= 0); + str_copy(wsPtr[weightIndex].op_name, layerName.c_str(), layerName.length()); + U32 blobNum = net_get_blobs_size(net, netLayerId); + net_copy_blob( + wsPtr, weightIndex, net, netLayerId, blobNum, convert_caffe_type(layerType)); + + if (layerType == "BatchNorm" && blobNum > 2) { + caffe::BlobProto blob2 = net_get_blob(net, netLayerId, 2); + float cur_gama = blob2.data().data()[0] == 0 ? 
1.0 + : 1.0 / blob2.data().data()[0]; + ms->ops[i].ps.bn_spec.gama = cur_gama; + } + + weightIndex++; + } else if (layerType == "Scale" || layerType == "LayerNorm") { + int netLayerId = net_search_layerId(net, layerName); + CHECK_REQUIREMENT(netLayerId >= 0); + str_copy(wsPtr[weightIndex].op_name, layerName.c_str(), layerName.length()); + U32 blobNum = net_get_blobs_size(net, netLayerId); + if (layer.bottom_size() == 1) { + CHECK_REQUIREMENT(blobNum >= 1); + } else { + CHECK_REQUIREMENT(blobNum == 0); + } + net_copy_blob( + wsPtr, weightIndex, net, netLayerId, blobNum, convert_caffe_type(layerType)); + weightIndex++; + } + } + + CHECK_REQUIREMENT(weightIndex == weightNumber); + // relationship init null + ms->num_op_tensor_entries = 0; + ms->op_relationship_entries = nullptr; + return ret; + } + + ParameterSpec adapt_Resize() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ResizeParamSpec resizePs; + initialization_zero(&resizePs, sizeof(resizePs)); + auto caffeInterpParam = layer.interp_param(); + resizePs.sizes[0] = caffeInterpParam.height(); + resizePs.sizes[1] = caffeInterpParam.width(); + resizePs.num_sizes = 2; + resizePs.num_scales = 0; + curPs.resize_spec = resizePs; + return curPs; + } + + ParameterSpec adapt_Conv() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + weightNumber = weightNumber + 1; + ConvolutionParamSpec cps; + initialization_zero(&cps, sizeof(cps)); + cps.num_outputs = layer.convolution_param().num_output(); + cps.num_outputs_origin = cps.num_outputs; + cps.kernel_t = 1; + cps.stride_t = 1; + cps.padding_before = 0; + cps.padding_after = 0; + cps.dilatedRate_t = 1; + if (layer.convolution_param().has_kernel_w() && layer.convolution_param().has_kernel_h()) { + cps.kernel_w = layer.convolution_param().kernel_w(); + cps.kernel_h = layer.convolution_param().kernel_h(); + } else { + cps.kernel_h = (layer.convolution_param().kernel_size_size() > 0) + ? layer.convolution_param().kernel_size(0) + : 1; + cps.kernel_w = (layer.convolution_param().kernel_size_size() > 1) + ? layer.convolution_param().kernel_size(1) + : cps.kernel_h; + } + + cps.group = (layer.convolution_param().has_group()) ? layer.convolution_param().group() + : 1; // group[default=1] + + cps.dilatedRate_h = (layer.convolution_param().dilation_size() != 0) + ? layer.convolution_param().dilation(0) + : 1; + cps.dilatedRate_w = cps.dilatedRate_h; + + if (cps.group != 1 && cps.group == cps.num_outputs) { + cps.convolution_type = Convolution_Depthwise; + } else { + if (cps.dilatedRate_h > 1 || cps.dilatedRate_w > 1) { + cps.convolution_type = Convolution_Dilation; + } else { + cps.convolution_type = Convolution_Pointwise; + } + } + cps.dw_activation_type = ACTIVATION_NULL; + cps.pw_activation_type = ACTIVATION_NULL; + if (layer.convolution_param().has_stride_w() && layer.convolution_param().has_stride_h()) { + cps.stride_w = layer.convolution_param().stride_w(); + cps.stride_h = layer.convolution_param().stride_h(); + } else { + cps.stride_h = (layer.convolution_param().stride_size() != 0) + ? layer.convolution_param().stride(0) + : 1; // stride[default=1] + cps.stride_w = (layer.convolution_param().stride_size() > 1) + ? 
layer.convolution_param().stride(1) + : cps.stride_h; + } + if (layer.convolution_param().has_pad_w() && layer.convolution_param().has_pad_h()) { + cps.padding_left = layer.convolution_param().pad_w(); + cps.padding_right = cps.padding_left; + cps.padding_top = layer.convolution_param().pad_h(); + cps.padding_bottom = cps.padding_top; + } else { + cps.padding_top = + (layer.convolution_param().pad_size() > 0) ? layer.convolution_param().pad(0) : 0; + cps.padding_bottom = (layer.convolution_param().pad_size() > 1) + ? layer.convolution_param().pad(1) + : cps.padding_top; + cps.padding_left = (layer.convolution_param().pad_size() > 2) + ? layer.convolution_param().pad(2) + : cps.padding_top; + cps.padding_right = (layer.convolution_param().pad_size() > 3) + ? layer.convolution_param().pad(3) + : cps.padding_top; + } + curPs.conv_spec = cps; + return curPs; + } + + ParameterSpec adapt_Deconvolution() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + weightNumber = weightNumber + 1; + ConvolutionParamSpec cps; + initialization_zero(&cps, sizeof(cps)); + cps.num_outputs = layer.convolution_param().num_output(); + cps.num_outputs_origin = cps.num_outputs; + cps.kernel_t = 1; + cps.stride_t = 1; + cps.padding_before = 0; + cps.padding_after = 0; + cps.dilatedRate_t = 1; + if (layer.convolution_param().has_kernel_w() && layer.convolution_param().has_kernel_h()) { + cps.kernel_w = layer.convolution_param().kernel_w(); + cps.kernel_h = layer.convolution_param().kernel_h(); + } else { + cps.kernel_h = layer.convolution_param().kernel_size(0); + cps.kernel_w = cps.kernel_h; + } + + cps.group = (layer.convolution_param().has_group()) ? layer.convolution_param().group() : 1; + if (1 != cps.group) { + UNI_ERROR_LOG("Deconvolution group != 1 UNSUPPORTED!"); + } + cps.dilatedRate_h = 1; + cps.dilatedRate_w = 1; + cps.convolution_type = Convolution_Deconvolution; + cps.dw_activation_type = ACTIVATION_NULL; + cps.pw_activation_type = ACTIVATION_NULL; + if (layer.convolution_param().has_stride_w() && layer.convolution_param().has_stride_h()) { + cps.stride_w = layer.convolution_param().stride_w(); + cps.stride_h = layer.convolution_param().stride_h(); + } else { + cps.stride_h = (layer.convolution_param().stride_size() != 0) + ? layer.convolution_param().stride(0) + : 1; // stride[default=1] + cps.stride_w = cps.stride_h; + } + if (layer.convolution_param().has_pad_w() && layer.convolution_param().has_pad_h()) { + cps.padding_left = layer.convolution_param().pad_w(); + cps.padding_right = cps.padding_left; + cps.padding_top = layer.convolution_param().pad_h(); + cps.padding_bottom = cps.padding_top; + } else { + cps.padding_top = (layer.convolution_param().pad_size() != 0) + ? 
layer.convolution_param().pad(0) + : 0; // pad[default=0] + cps.padding_bottom = cps.padding_top; + cps.padding_left = cps.padding_top; + cps.padding_right = cps.padding_top; + } + curPs.conv_spec = cps; + return curPs; + } + + ParameterSpec adapt_Pooling() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + PoolingParamSpec pps; + initialization_zero(&pps, sizeof(pps)); + pps.kernel_t = 1; + pps.stride_t = 1; + pps.padding_before = 0; + pps.padding_after = 0; + if (layer.pooling_param().has_kernel_w() && layer.pooling_param().has_kernel_h()) { + pps.kernel_w = layer.pooling_param().kernel_w(); + pps.kernel_h = layer.pooling_param().kernel_h(); + } else { + pps.kernel_h = layer.pooling_param().kernel_size(); + pps.kernel_w = pps.kernel_h; + } + if (layer.pooling_param().has_stride_w() && layer.pooling_param().has_stride_h()) { + pps.stride_w = layer.pooling_param().stride_w(); + pps.stride_h = layer.pooling_param().stride_h(); + } else { + pps.stride_h = layer.pooling_param().stride(); + pps.stride_w = pps.stride_h; + } + bool global_pooling = layer.pooling_param().global_pooling(); + if (global_pooling) { + pps.kernel_h = 0; + pps.kernel_w = 0; + pps.stride_h = 1; + pps.stride_w = 1; + } else { + CHECK_REQUIREMENT(pps.kernel_h > 0); + } + if (layer.pooling_param().has_pad_w() && layer.pooling_param().has_pad_h()) { + pps.padding_left = layer.pooling_param().pad_w(); + pps.padding_right = pps.padding_left; + pps.padding_top = layer.pooling_param().pad_h(); + pps.padding_bottom = pps.padding_top; + } else { + pps.padding_top = layer.pooling_param().has_pad() ? layer.pooling_param().pad() : 0; + pps.padding_bottom = pps.padding_top; + pps.padding_left = pps.padding_top; + pps.padding_right = pps.padding_top; + } + + if (layer.pooling_param().has_round_mode() && layer.pooling_param().round_mode() == 1) { + pps.rm = FLOOR; + } else { + pps.rm = CEIL; + } + switch (layer.pooling_param().pool()) { + case caffe::PoolingParameter_PoolMethod_MAX: { + pps.mode = POOLING_MAX; + break; + } + case caffe::PoolingParameter_PoolMethod_AVE: { + pps.mode = POOLING_MEAN; + break; + } + default: { + UNI_ERROR_LOG("encounter unsupported Pooling method %d\n", + (int)(layer.pooling_param().pool())); + } + } + curPs.pooling_spec = pps; + return curPs; + } + + ParameterSpec adapt_Fc() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + weightNumber = weightNumber + 1; + FullyConnectedParamSpec ips; + initialization_zero(&ips, sizeof(ips)); + ips.num_outputs = layer.inner_product_param().num_output(); + ips.num_slices = 1; + ips.slice_point[0] = ips.num_outputs; + curPs.fc_spec = ips; + return curPs; + } + + ParameterSpec adapt_BatchNorm() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + weightNumber = weightNumber + 1; + BatchNormParamSpec bnps; + initialization_zero(&bnps, sizeof(bnps)); + bnps.axis = layer.batch_norm_param().axis(); + bnps.eps = layer.batch_norm_param().eps(); + bnps.gama = 1; + bnps.momentum = layer.batch_norm_param().moving_average_fraction(); + curPs.bn_spec = bnps; + return curPs; + } + + ParameterSpec adapt_LayerNorm() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + weightNumber = weightNumber + 1; + return curPs; + } + + ParameterSpec adapt_Eltwise() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + EltwiseParamSpec eps; + initialization_zero(&eps, sizeof(eps)); + EltwiseSumSpec ess; + initialization_zero(&ess, 
sizeof(ess)); + + auto caffeEltwiseParam = layer.eltwise_param(); + auto op = caffeEltwiseParam.operation(); + switch (op) { + case caffe::EltwiseParameter_EltwiseOp_PROD: + eps.elt_mode = ELTWISE_PROD; + break; + case caffe::EltwiseParameter_EltwiseOp_SUM: + eps.elt_mode = ELTWISE_SUM; + break; + case caffe::EltwiseParameter_EltwiseOp_MAX: + eps.elt_mode = ELTWISE_MAX; + break; + case caffe::EltwiseParameter_EltwiseOp_DIV: + eps.elt_mode = ELTWISE_DIV; + break; + default: { + UNI_ERROR_LOG("unknown eltwise mode\n"); + } + } + U32 bytes = caffeEltwiseParam.coeff_size() * sizeof(F32); + ess.coeff_size = caffeEltwiseParam.coeff_size(); + memcpy(ess.coeff_values, caffeEltwiseParam.coeff().data(), bytes); + for (int j = 0; j < caffeEltwiseParam.coeff_size(); j++) { + CHECK_REQUIREMENT(ess.coeff_values[j] == 1); + } + eps.elt_sum_spec = ess; + eps.activation_type = ACTIVATION_NULL; + curPs.eltwise_spec = eps; + return curPs; + } + + ParameterSpec adapt_Embedding() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + weightNumber = weightNumber + 1; + EmbedParamSpec embedPs; + initialization_zero(&embedPs, sizeof(embedPs)); + auto caffeEmbedParam = layer.embed_param(); + embedPs.input_dim = caffeEmbedParam.input_dim(); + embedPs.num_output = caffeEmbedParam.num_output(); + embedPs.bias_term = caffeEmbedParam.bias_term() == 0 ? false : true; + embedPs.transpose = caffeEmbedParam.transpose() == 0 ? false : true; + curPs.embed_spec = embedPs; + return curPs; + } + + ParameterSpec adapt_Power() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + PowerParamSpec powerPs; + initialization_zero(&powerPs, sizeof(powerPs)); + auto caffePowerParam = layer.power_param(); + powerPs.scale = caffePowerParam.scale(); + powerPs.shift = caffePowerParam.shift(); + powerPs.power = caffePowerParam.power(); + curPs.power_spec = powerPs; + return curPs; + } + + ParameterSpec adapt_Reshape() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ReshapeParamSpec reshapePs; + initialization_zero(&reshapePs, sizeof(reshapePs)); + if (this->op == "Flatten") { + auto caffeFlattenParam = layer.flatten_param(); + CHECK_REQUIREMENT( + -1 == caffeFlattenParam.end_axis()); // Currently compute as reshape layer + reshapePs.shape_size = caffeFlattenParam.axis() + 1; + for (I32 iter = 0; iter < reshapePs.shape_size - 1; iter++) { + reshapePs.shape_dims[iter] = 0; + } + reshapePs.shape_dims[reshapePs.shape_size - 1] = -1; + reshapePs.axis = 0; + reshapePs.num_axes = -1; + } else { + auto caffeReshapeParam = layer.reshape_param(); + reshapePs.shape_size = caffeReshapeParam.shape().dim_size(); + for (I32 iter = 0; iter < caffeReshapeParam.shape().dim_size(); iter++) { + reshapePs.shape_dims[iter] = caffeReshapeParam.shape().dim(iter); + } + reshapePs.axis = caffeReshapeParam.axis(); + reshapePs.num_axes = caffeReshapeParam.num_axes(); + } + curPs.reshape_spec = reshapePs; + return curPs; + } + + ParameterSpec adapt_Slice() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + SliceParamSpec slicePs; + initialization_zero(&slicePs, sizeof(slicePs)); + auto caffeSliceParam = layer.slice_param(); + for (I32 iter = 0; iter < caffeSliceParam.slice_point().size(); iter++) { + slicePs.slice_points[iter] = caffeSliceParam.slice_point(iter); + } + slicePs.slice_size = caffeSliceParam.slice_point().size(); + slicePs.axis = caffeSliceParam.axis(); + curPs.slice_spec = slicePs; + return curPs; + } + + ParameterSpec 
adapt_Transpose() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + TransposeParamSpec transPs; + initialization_zero(&transPs, sizeof(transPs)); + auto caffePermuteParam = layer.permute_param(); + for (I32 iter = 0; iter < caffePermuteParam.order().size(); iter++) { + transPs.trans_dims[iter] = caffePermuteParam.order(iter); + } + transPs.trans_size = caffePermuteParam.order().size(); + curPs.transpose_spec = transPs; + return curPs; + } + + ParameterSpec adapt_Tile() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + TileParamSpec tilePS; + auto caffeTileParam = layer.tile_param(); + tilePS.repeatsInfo[0] = caffeTileParam.tiles(); + tilePS.axis = caffeTileParam.axis(); + curPs.tile_spec = tilePS; + return curPs; + } + + ParameterSpec adapt_Pad() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + PadParamSpec padPs; + auto caffePadParam = layer.padding_param(); + padPs.before = 0; + padPs.after = 0; + padPs.top = caffePadParam.shape(0); + padPs.bottom = caffePadParam.shape(1); + padPs.left = caffePadParam.shape(2); + padPs.right = caffePadParam.shape(3); + padPs.constant_value = 0; + padPs.pad_mode = Pad_Constant; + curPs.pad_spec = padPs; + return curPs; + } + + ParameterSpec adapt_Attention() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + AttentionParamSpec attentionPs; + initialization_zero(&attentionPs, sizeof(attentionPs)); + auto caffe_attention_param = layer.attention_param(); + attentionPs.num_heads = caffe_attention_param.num_heads(); + attentionPs.from_sequence_length = caffe_attention_param.from_sequence_length(); + attentionPs.to_sequence_length = caffe_attention_param.to_sequence_length(); + curPs.attention_spec = attentionPs; + return curPs; + } + + ParameterSpec adapt_RNN() override + { + weightNumber = weightNumber + 1; + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + RNNParamSpec rnnPs; + initialization_zero(&rnnPs, sizeof(rnnPs)); + auto caffeLSTMParam = layer.lstm_param(); + rnnPs.mode = RNN_LSTM; + rnnPs.numOutput = caffeLSTMParam.num_output(); + rnnPs.steps = caffeLSTMParam.steps(); + if (rnnPs.steps == -2) { + rnnPs.steps = 0; + rnnPs.biDirection = true; + } else { + rnnPs.biDirection = false; + } + rnnPs.numProjection = caffeLSTMParam.num_proj(); + rnnPs.zoneoutCell = caffeLSTMParam.zoneout_cell(); + rnnPs.zoneoutOutput = caffeLSTMParam.zoneout_output(); + rnnPs.forgetBias = 1.0; + rnnPs.activationMode = ACTIVATION_TANH; + curPs.rnn_spec = rnnPs; + return curPs; + } + + ParameterSpec adapt_Scale() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + weightNumber = weightNumber + 1; + ScaleParamSpec scalePs; + initialization_zero(&scalePs, sizeof(scalePs)); + auto caffeScaleParam = layer.scale_param(); + scalePs.axis = caffeScaleParam.axis(); + curPs.scale_spec = scalePs; + return curPs; + } + + ParameterSpec adapt_Reduction() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ReductionParamSpec reductionPs; + initialization_zero(&reductionPs, sizeof(reductionPs)); + auto caffeReductionParam = layer.reduction_param(); + reductionPs.axes[0] = caffeReductionParam.axis(); + reductionPs.axes_num = 1; + auto op = caffeReductionParam.operation(); + switch (op) { + case caffe::ReductionParameter_ReductionOp_SUM: + reductionPs.reduction_mode = REDUCTION_SUM; + break; + case caffe::ReductionParameter_ReductionOp_MEAN: + reductionPs.reduction_mode = 
REDUCTION_MEAN; + break; + default: { + UNI_ERROR_LOG("unknown reduction mode\n"); + } + } + reductionPs.coeff = caffeReductionParam.coeff(); + reductionPs.keep_dim = caffeReductionParam.keep_dim(); + curPs.reduction_spec = reductionPs; + return curPs; + } + + ParameterSpec adapt_Squeeze() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + SqueezeParamSpec squeezePs; + initialization_zero(&squeezePs, sizeof(squeezePs)); + auto caffeSqueezeParam = layer.squeeze_param(); + squeezePs.axes[0] = caffeSqueezeParam.axis(); + squeezePs.axes_num = 1; + curPs.squeeze_spec = squeezePs; + return curPs; + } + + ParameterSpec adapt_Unsqueeze() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + UnsqueezeParamSpec unsqueezePs; + initialization_zero(&unsqueezePs, sizeof(unsqueezePs)); + auto caffeUnsqueezeParam = layer.unsqueeze_param(); + unsqueezePs.axes[0] = caffeUnsqueezeParam.axis(); + unsqueezePs.axes_num = 1; + curPs.unsqueeze_spec = unsqueezePs; + return curPs; + } + + ParameterSpec adapt_ArgMax() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ArgMaxParamSpec argmaxPs; + initialization_zero(&argmaxPs, sizeof(argmaxPs)); + auto caffeArgMaxParam = layer.argmax_param(); + argmaxPs.axis = caffeArgMaxParam.axis(); + curPs.argmax_spec = argmaxPs; + return curPs; + } + + ParameterSpec adapt_Repeat() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + RepeatParamSpec repeatPs; + initialization_zero(&repeatPs, sizeof(repeatPs)); + auto caffeRepeatParam = layer.repeat_param(); + repeatPs.loops = caffeRepeatParam.loops(); + repeatPs.axis = caffeRepeatParam.axis(); + curPs.repeat_spec = repeatPs; + return curPs; + } + + ParameterSpec adapt_Check() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + CheckParamSpec checkPs; + initialization_zero(&checkPs, sizeof(checkPs)); + auto caffeCheckParam = layer.check_param(); + auto op = caffeCheckParam.operation(); + switch (op) { + case caffe::CheckParameter_CheckOp_EQUAL: + checkPs.check_mode = CHECK_EQUAL; + break; + case caffe::CheckParameter_CheckOp_GREAT: + checkPs.check_mode = CHECK_GREAT; + break; + case caffe::CheckParameter_CheckOp_GREATEQUAL: + checkPs.check_mode = CHECK_GREATEQUAL; + break; + default: { + UNI_ERROR_LOG("unknown check mode\n"); + } + } + curPs.check_spec = checkPs; + return curPs; + } + + ParameterSpec adapt_PreAllocatedMemory() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + PreAllocatedMemoryParamSpec preAllocatedMemoryPs; + initialization_zero(&preAllocatedMemoryPs, sizeof(preAllocatedMemoryPs)); + auto caffePreAllocatedMemoryParam = layer.preallocated_memory_param(); + preAllocatedMemoryPs.desc.nDims = caffePreAllocatedMemoryParam.shape().dim_size(); + for (I32 iter = 0; iter < caffePreAllocatedMemoryParam.shape().dim_size(); iter++) { + preAllocatedMemoryPs.desc.dims[preAllocatedMemoryPs.desc.nDims - 1 - iter] = + caffePreAllocatedMemoryParam.shape().dim(iter); + } + preAllocatedMemoryPs.desc.df = getTensorDefaultDataFormat(preAllocatedMemoryPs.desc.nDims); + auto dt = caffePreAllocatedMemoryParam.data_type(); + switch (dt) { + case caffe::PreAllocatedMemoryParameter_DataType_FLOAT32: + preAllocatedMemoryPs.desc.dt = DT_F32; + break; + case caffe::PreAllocatedMemoryParameter_DataType_UINT32: + preAllocatedMemoryPs.desc.dt = DT_U32; + break; + case caffe::PreAllocatedMemoryParameter_DataType_INT32: + preAllocatedMemoryPs.desc.dt = DT_I32; 
+ break; + default: { + UNI_ERROR_LOG("unknown memory data type\n"); + } + } + curPs.preallocated_memory_spec = preAllocatedMemoryPs; + return curPs; + } + + ParameterSpec adapt_SharedWeight() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + weightNumber = weightNumber + 1; + SharedWeightParamSpec sharedWeightPs; + initialization_zero(&sharedWeightPs, sizeof(sharedWeightPs)); + auto caffeSharedWeightParam = layer.shared_weight_param(); + sharedWeightPs.desc.nDims = caffeSharedWeightParam.shape().dim_size(); + for (I32 iter = 0; iter < caffeSharedWeightParam.shape().dim_size(); iter++) { + sharedWeightPs.desc.dims[sharedWeightPs.desc.nDims - 1 - iter] = + caffeSharedWeightParam.shape().dim(iter); + } + sharedWeightPs.desc.df = getTensorDefaultDataFormat(sharedWeightPs.desc.nDims); + auto dt = caffeSharedWeightParam.data_type(); + switch (dt) { + case caffe::SharedWeightParameter_DataType_FLOAT32: + sharedWeightPs.desc.dt = DT_F32; + break; + case caffe::SharedWeightParameter_DataType_UINT32: + sharedWeightPs.desc.dt = DT_U32; + break; + case caffe::SharedWeightParameter_DataType_INT32: + sharedWeightPs.desc.dt = DT_I32; + break; + default: { + UNI_ERROR_LOG("unknown weight data type\n"); + } + } + curPs.shared_weight_spec = sharedWeightPs; + return curPs; + } + + ParameterSpec adapt_Copy() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + CopyParamSpec copyPs; + initialization_zero(&copyPs, sizeof(copyPs)); + auto caffeCopyParam = layer.copy_param(); + copyPs.src_dims[0] = caffeCopyParam.src_batch_stride(); + copyPs.src_dims[1] = caffeCopyParam.src_stride(); + copyPs.src_dims[2] = caffeCopyParam.src_offset(); + copyPs.dst_dims[0] = caffeCopyParam.dst_batch_stride(); + copyPs.dst_dims[1] = caffeCopyParam.dst_stride(); + copyPs.dst_dims[2] = caffeCopyParam.dst_offset(); + copyPs.length = caffeCopyParam.length(); + curPs.copy_spec = copyPs; + return curPs; + } + + ParameterSpec adapt_MatMul() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + MatMulParamSpec matmulPs; + initialization_zero(&matmulPs, sizeof(matmulPs)); + auto caffeMatMulParam = layer.matmul_param(); + matmulPs.transpose_a = caffeMatMulParam.transpose_a(); + matmulPs.transpose_b = caffeMatMulParam.transpose_b(); + curPs.matmul_spec = matmulPs; + return curPs; + } + + ParameterSpec adapt_AttentionMask() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + AttentionMaskParamSpec attentionMaskPs; + initialization_zero(&attentionMaskPs, sizeof(attentionMaskPs)); + auto caffeAttentionMaskParam = layer.attention_mask_param(); + attentionMaskPs.attention_length = caffeAttentionMaskParam.attention_length(); + attentionMaskPs.same_length = caffeAttentionMaskParam.same_length(); + attentionMaskPs.mask = caffeAttentionMaskParam.mask(); + curPs.attention_mask_spec = attentionMaskPs; + return curPs; + } + + ParameterSpec adapt_RelativePositionEmbedding() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + weightNumber = weightNumber + 1; + EmbedParamSpec p; + initialization_zero(&p, sizeof(p)); + auto caffeRelativePositionEmbedParam = layer.relative_position_embed_param(); + p.input_dim = caffeRelativePositionEmbedParam.input_dim(); + p.num_output = caffeRelativePositionEmbedParam.num_output(); + p.bias_term = caffeRelativePositionEmbedParam.bias_term() == 0 ? false : true; + p.transpose = caffeRelativePositionEmbedParam.transpose() == 0 ?
false : true; + p.axis = caffeRelativePositionEmbedParam.axis(); + curPs.embed_spec = p; + return curPs; + } + + ParameterSpec adapt_RelativeShift() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + RelativeShiftParamSpec relativeShiftPs; + initialization_zero(&relativeShiftPs, sizeof(relativeShiftPs)); + auto caffeRelativeShiftParam = layer.relative_shift_param(); + relativeShiftPs.axis = caffeRelativeShiftParam.axis(); + relativeShiftPs.shift_length = caffeRelativeShiftParam.shift_length(); + curPs.relative_shift_spec = relativeShiftPs; + return curPs; + } + + ParameterSpec adapt_Concat() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ConcatParamSpec concatPs; + initialization_zero(&concatPs, sizeof(concatPs)); + auto caffeConcatParam = layer.concat_param(); + concatPs.axis = caffeConcatParam.axis(); + curPs.concat_spec = concatPs; + return curPs; + } + + ParameterSpec adapt_Softmax() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + SoftmaxParamSpec softmaxPs; + initialization_zero(&softmaxPs, sizeof(softmaxPs)); + auto caffeSoftmaxParam = layer.softmax_param(); + softmaxPs.axis = caffeSoftmaxParam.axis(); + curPs.softmax_spec = softmaxPs; + return curPs; + } + + ParameterSpec adapt_PriorBox() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + PriorBoxParamSpec priorboxPs; + initialization_zero(&priorboxPs, sizeof(priorboxPs)); + auto caffePriorBoxParam = layer.prior_box_param(); + CHECK_REQUIREMENT( + caffePriorBoxParam.min_size_size() <= 2 && caffePriorBoxParam.max_size_size() <= 2); + for (int i = 0; i < 2; i++) { + priorboxPs.min_sizes[i] = 0; + if (i < caffePriorBoxParam.min_size_size()) { + priorboxPs.min_sizes[i] = caffePriorBoxParam.min_size(i); + } + } + for (int i = 0; i < 2; i++) { + priorboxPs.max_sizes[i] = 0; + if (i < caffePriorBoxParam.max_size_size()) { + priorboxPs.max_sizes[i] = caffePriorBoxParam.max_size(i); + } + } + CHECK_REQUIREMENT(caffePriorBoxParam.aspect_ratio_size() <= 2); + for (int i = 0; i < 2; i++) { + priorboxPs.aspect_ratios[i] = 0; + if (i < caffePriorBoxParam.aspect_ratio_size()) { + priorboxPs.aspect_ratios[i] = caffePriorBoxParam.aspect_ratio(i); + } + } + if (caffePriorBoxParam.has_flip()) { + if (caffePriorBoxParam.flip()) { + priorboxPs.flip = 1; + } else { + priorboxPs.flip = 0; + } + } else { + priorboxPs.flip = 1; + } + if (caffePriorBoxParam.has_clip()) { + if (caffePriorBoxParam.clip()) { + priorboxPs.clip = 1; + } else { + priorboxPs.clip = 0; + } + } else { + priorboxPs.clip = 0; + } + if (caffePriorBoxParam.variance_size() == 4) { + priorboxPs.variances[0] = caffePriorBoxParam.variance(0); + priorboxPs.variances[1] = caffePriorBoxParam.variance(1); + priorboxPs.variances[2] = caffePriorBoxParam.variance(2); + priorboxPs.variances[3] = caffePriorBoxParam.variance(3); + } else if (caffePriorBoxParam.variance_size() == 1) { + priorboxPs.variances[0] = caffePriorBoxParam.variance(0); + priorboxPs.variances[1] = caffePriorBoxParam.variance(0); + priorboxPs.variances[2] = caffePriorBoxParam.variance(0); + priorboxPs.variances[3] = caffePriorBoxParam.variance(0); + } + priorboxPs.image_w = 0; + priorboxPs.image_h = 0; + if (caffePriorBoxParam.has_img_size()) { + priorboxPs.image_w = caffePriorBoxParam.img_size(); + priorboxPs.image_h = caffePriorBoxParam.img_size(); + } + if (caffePriorBoxParam.has_img_w() && caffePriorBoxParam.has_img_h()) { + priorboxPs.image_w = caffePriorBoxParam.img_w(); + 
priorboxPs.image_h = caffePriorBoxParam.img_h(); + } + priorboxPs.step_w = 0; + priorboxPs.step_h = 0; + if (caffePriorBoxParam.has_step()) { + priorboxPs.step_w = caffePriorBoxParam.step(); + priorboxPs.step_h = caffePriorBoxParam.step(); + } + if (caffePriorBoxParam.has_step_w() && caffePriorBoxParam.has_step_h()) { + priorboxPs.step_w = caffePriorBoxParam.step_w(); + priorboxPs.step_h = caffePriorBoxParam.step_h(); + } + priorboxPs.offset = caffePriorBoxParam.offset(); + curPs.prior_box_spec = priorboxPs; + return curPs; + } + + ParameterSpec adapt_DetectionOutput() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + DetectionOutputParamSpec detectionoutputPs; + initialization_zero(&detectionoutputPs, sizeof(detectionoutputPs)); + auto caffeDetectionOutputParam = layer.detection_output_param(); + detectionoutputPs.num_class = caffeDetectionOutputParam.num_classes(); + CHECK_REQUIREMENT((caffeDetectionOutputParam.background_label_id() == 0) && + (caffeDetectionOutputParam.share_location() == true)); + detectionoutputPs.nms_threshold = caffeDetectionOutputParam.nms_param().nms_threshold(); + detectionoutputPs.nms_top_k = caffeDetectionOutputParam.nms_param().top_k(); + detectionoutputPs.keep_top_k = caffeDetectionOutputParam.keep_top_k(); + detectionoutputPs.confidence_threshold = caffeDetectionOutputParam.confidence_threshold(); + curPs.detection_output_spec = detectionoutputPs; + return curPs; + } + + ParameterSpec adapt_Yolov3DetectionOutput() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + Yolov3DetectionOutputParamSpec yolov3detectionoutputPs; + initialization_zero(&yolov3detectionoutputPs, sizeof(yolov3detectionoutputPs)); + auto caffeYolov3DetectionOutputParam = layer.yolov3_detection_output_param(); + yolov3detectionoutputPs.num_class = caffeYolov3DetectionOutputParam.num_classes(); + yolov3detectionoutputPs.num_box = caffeYolov3DetectionOutputParam.num_box(); + yolov3detectionoutputPs.confidence_threshold = + caffeYolov3DetectionOutputParam.confidence_threshold(); + yolov3detectionoutputPs.nms_threshold = caffeYolov3DetectionOutputParam.nms_threshold(); + for (int i = 0; i < 18; i++) { + yolov3detectionoutputPs.biases[i] = 0; + if (i < caffeYolov3DetectionOutputParam.biases_size()) { + yolov3detectionoutputPs.biases[i] = caffeYolov3DetectionOutputParam.biases(i); + } + } + for (int i = 0; i < 3; i++) { + yolov3detectionoutputPs.anchors_scale[i] = 0; + if (i < caffeYolov3DetectionOutputParam.anchors_scale_size()) { + yolov3detectionoutputPs.anchors_scale[i] = + caffeYolov3DetectionOutputParam.anchors_scale(i); + } + } + yolov3detectionoutputPs.mask_group_num = caffeYolov3DetectionOutputParam.mask_group_num(); + for (int i = 0; i < 9; i++) { + yolov3detectionoutputPs.mask[i] = 0; + if (i < caffeYolov3DetectionOutputParam.mask_size()) { + yolov3detectionoutputPs.mask[i] = caffeYolov3DetectionOutputParam.mask(i); + } + } + curPs.yolov3_detection_output_spec = yolov3detectionoutputPs; + return curPs; + } + + ParameterSpec adapt_Clip() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ClipParamSpec clipParam; + initialization_zero(&clipParam, sizeof(clipParam)); + auto caffeClipParam = layer.clip_param(); + clipParam.min = caffeClipParam.min(); + clipParam.max = caffeClipParam.max(); + curPs.clip_spec = clipParam; + return curPs; + } + + ParameterSpec adapt_Relu() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ReLUParamSpec reluSpec; + 
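// Editor's note: neg_slope is pinned to 0 here, i.e. a plain ReLU. Caffe's
// ReLUParameter does define an optional negative_slope field, so a fuller
// adapter could forward it instead (sketch; assumes the field exists in this
// proto fork):
//     reluSpec.neg_slope = layer.relu_param().negative_slope();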
initialization_zero(&reluSpec, sizeof(reluSpec)); + reluSpec.neg_slope = 0.0; + curPs.relu_spec = reluSpec; + return curPs; + } + + ParameterSpec adapt_PRelu() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + weightNumber = weightNumber + 1; + return curPs; + } + +private: + std::string op; + caffe::NetParameter proto; + caffe::NetParameter net; + caffe::LayerParameter layer; + int weightNumber; +}; +#endif diff --git a/model-tools/src/caffe/caffe_wrapper.cpp b/model_tools/src/caffe/caffe_wrapper.cpp similarity index 76% rename from model-tools/src/caffe/caffe_wrapper.cpp rename to model_tools/src/caffe/caffe_wrapper.cpp index c36e6ee3..043e543c 100644 --- a/model-tools/src/caffe/caffe_wrapper.cpp +++ b/model_tools/src/caffe/caffe_wrapper.cpp @@ -1,25 +1,25 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include #include "converter.h" #include "model_tools.h" #include "caffe_adaptee.h" -EE caffe_converter(std::string dir, std::string mfn, ModelSpec* ms) { - ModelAdaptee* ade = new CaffeAdaptee(); - EE ret = ade->adapt(dir, mfn, ms); +EE caffe_converter(std::string dir, std::string mfn, ModelSpec *ms) +{ + ModelAdaptee *ade = new CaffeAdaptee(); + EE ret = ade->adapt(dir, mfn, ms); delete ade; return ret; } diff --git a/model_tools/src/data_type_converter.cpp b/model_tools/src/data_type_converter.cpp new file mode 100644 index 00000000..27eafa9b --- /dev/null +++ b/model_tools/src/data_type_converter.cpp @@ -0,0 +1,424 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include +#include "model_serialize_deserialize.hpp" +#include "model_tools.h" +#include "OPOptimizers/DeprecatedOPOptimizer.hpp" + +template +EE ws_datatype_converter(U8 *originalPtr, U8 *targetPtr, int paramNum) +{ + F32 *f32PtrParam = (F32 *)originalPtr; + T *targetPtrParam = (T *)targetPtr; + for (int j = 0; j < paramNum; j++) { + F32 originalParam = f32PtrParam[j]; + T changedParam = (T)originalParam; + targetPtrParam[j] = changedParam; + } + return SUCCESS; +} + +// Return the weight scale +F32 ws_datatype_converter_bnn(U8 *originalPtr, U8 *targetPtr, int paramNum) +{ + F32 *f32PtrParam = (F32 *)originalPtr; + BIN8 *targetPtrParam = (BIN8 *)targetPtr; + for (int i = 0; i < paramNum; i += 8) { + BIN8 temp = 0; // Initialize all bits to 0 + for (int j = 0; j < 8; j++) { + U32 bitNo = 7 - j; + if (f32PtrParam[i + j] > + 0) { // Set bit if weight is positive. 
Works for both DOREFA and XNOR + temp |= (1 << bitNo); + } + } + targetPtrParam[i / 8] = temp; + } + + F32 scale = 1; + for (int i = 0; i < paramNum; i++) { + scale = f32PtrParam[i]; + if (scale > 0) { + break; + } + } + return scale; +} + +// return quantization scale +F32 ws_datatype_converter_int8(U8 *originalPtr, U8 *targetPtr, int paramNum) +{ + F32 *f32PtrParam = (F32 *)originalPtr; + INT8 *targetPtrParam = (INT8 *)targetPtr; + + F32 maxabs = 0; + for (int i = 0; i < paramNum; i++) { + if (abs(f32PtrParam[i]) > maxabs) { + maxabs = abs(f32PtrParam[i]); + } + } + + F32 scale = 127.0 / maxabs; + for (int i = 0; i < paramNum; i++) { + targetPtrParam[i] = round(f32PtrParam[i] * scale); + } + return scale; +} + +inline EE getTargetDataType(DataConvertType convertMode, DataType *type) +{ + if (*type != DT_F32) { + return SUCCESS; + } + + switch (convertMode) { + case F32_to_F32: { + *type = DT_F32; + break; + } + case F32_to_F16: { + *type = DT_F16; + break; + } + case F32_to_I8: { + *type = DT_I8; + break; + } + default: + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline DataType get_storage_type( + ModelSpec *ms, std::string opName, std::string storageMode, DataType originalType) +{ + if ("NOQUANT" == storageMode) { + return originalType; + } + if ("FP16" == storageMode) { + return DT_F16; + } + for (int i = 0; i < ms->num_operator_specs; i++) { + std::string name = ms->ops[i].name; + if (name == opName) { + auto opType = ms->ops[i].type; + if (OT_LayerNorm == opType || OT_Scale == opType || OT_PRelu == opType) { + return originalType; + } + if ("INT8" == storageMode) { + return DT_I8; + } + if (1 == ms->ops[i].num_quant_feature + && 1 == ms->ops[i].feature_scale[0].num_scale + && 0 == ms->ops[i].feature_scale[0].scale[0]) { + return originalType; + } else { + return DT_I8; + } + } + } + UNI_ERROR_LOG("No OP found with name %s\n", opName.c_str()); + return originalType; +} + +EE ms_datatype_converter( + ModelSpec *originalMs, ModelSpec *targetMs, DataConvertType convertMode, std::string storageMode) +{ + str_copy(targetMs->model_name, originalMs->model_name, NAME_LEN); + targetMs->dt = originalMs->dt; + CHECK_STATUS(getTargetDataType(convertMode, &(targetMs->dt))); + + targetMs->num_inputs = originalMs->num_inputs; + targetMs->input_names = (I8 **)mt_new_storage(targetMs->num_inputs * sizeof(I8 *)); + for (I32 j = 0; j < targetMs->num_inputs; j++) { + targetMs->input_names[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(targetMs->input_names[j], originalMs->input_names[j], NAME_LEN); + } + targetMs->input_dims = (TensorDesc *)mt_new_storage(targetMs->num_inputs * sizeof(TensorDesc)); + memcpy(targetMs->input_dims, originalMs->input_dims, targetMs->num_inputs * sizeof(TensorDesc)); + for (I32 i = 0; i < targetMs->num_inputs; i++) { + CHECK_STATUS(getTargetDataType(convertMode, &(targetMs->input_dims[i].dt))); + } + + targetMs->num_outputs = originalMs->num_outputs; + targetMs->output_names = (I8 **)mt_new_storage(targetMs->num_outputs * sizeof(I8 *)); + for (int j = 0; j < targetMs->num_outputs; j++) { + targetMs->output_names[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(targetMs->output_names[j], originalMs->output_names[j], NAME_LEN); + } + + targetMs->num_operator_specs = originalMs->num_operator_specs; + OperatorSpec *opsPtr = + (OperatorSpec *)mt_new_storage(targetMs->num_operator_specs * sizeof(OperatorSpec)); + std::map weightDataTypeMap; + + for (int i = 0; i < targetMs->num_operator_specs; i++) { + str_copy(opsPtr[i].name, 
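// Editor's note (worked example for ws_datatype_converter_bnn above): eight F32
// weights pack into one BIN8, first weight in the highest bit. With signs
// {+,-,+,+,-,-,+,-}, bits 7, 5, 4 and 1 are set:
//     BIN8 packed = 0xB2;  // 0b10110010
// The returned scale (the first positive weight) is fused into the bias later,
// in the DT_BIN01/DT_BIN11 cases below.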
originalMs->ops[i].name, NAME_LEN); + opsPtr[i].type = originalMs->ops[i].type; + opsPtr[i].num_inputs = originalMs->ops[i].num_inputs; + opsPtr[i].input_tensors_name = (I8 **)mt_new_storage(opsPtr[i].num_inputs * sizeof(I8 *)); + for (U32 j = 0; j < opsPtr[i].num_inputs; j++) { + opsPtr[i].input_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + memcpy(opsPtr[i].input_tensors_name[j], originalMs->ops[i].input_tensors_name[j], + NAME_LEN); + } + opsPtr[i].num_outputs = originalMs->ops[i].num_outputs; + opsPtr[i].output_tensors_name = (I8 **)mt_new_storage(opsPtr[i].num_outputs * sizeof(I8 *)); + for (U32 j = 0; j < opsPtr[i].num_outputs; j++) { + opsPtr[i].output_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + memcpy(opsPtr[i].output_tensors_name[j], originalMs->ops[i].output_tensors_name[j], + NAME_LEN); + } + + if (OT_None != opsPtr[i].type) { + U32 numTensors = opsPtr[i].num_inputs + opsPtr[i].num_outputs; + opsPtr[i].tensor_positions = (I32 *)mt_new_storage(numTensors * sizeof(I32)); + memcpy(opsPtr[i].tensor_positions, originalMs->ops[i].tensor_positions, + numTensors * sizeof(I32)); + } else { + opsPtr[i].tensor_positions = nullptr; + } + + opsPtr[i].num_quant_feature = originalMs->ops[i].num_quant_feature; + if (0 == opsPtr[i].num_quant_feature) { + opsPtr[i].feature_scale = nullptr; + } else { + opsPtr[i].feature_scale = + (QuantSpec *)mt_new_storage(opsPtr[i].num_quant_feature * sizeof(QuantSpec)); + for (U32 j = 0; j < opsPtr[i].num_quant_feature; j++) { + opsPtr[i].feature_scale[j].num_scale = originalMs->ops[i].feature_scale[j].num_scale; + int num = opsPtr[i].feature_scale[j].num_scale; + + opsPtr[i].feature_scale[j].scale = (F32 *)mt_new_storage(num * sizeof(F32)); + memcpy(opsPtr[i].feature_scale[j].scale, originalMs->ops[i].feature_scale[j].scale, + num * sizeof(F32)); + } + } + + opsPtr[i].ps = originalMs->ops[i].ps; + + switch (opsPtr[i].type) { + case OT_SharedWeight: { + weightDataTypeMap[opsPtr[i].name] = opsPtr[i].ps.shared_weight_spec.desc.dt; + CHECK_STATUS( + getTargetDataType(convertMode, &(opsPtr[i].ps.shared_weight_spec.desc.dt))); + break; + } + case OT_PreAllocatedMemory: { + CHECK_STATUS(getTargetDataType( + convertMode, &(opsPtr[i].ps.preallocated_memory_spec.desc.dt))); + break; + } + default: + break; + } + } + targetMs->ops = opsPtr; + targetMs->num_weight_specs = originalMs->num_weight_specs; + WeightSpec *wsPtr = + (WeightSpec *)mt_new_storage(targetMs->num_weight_specs * sizeof(WeightSpec)); + for (int i = 0; i < targetMs->num_weight_specs; i++) { + str_copy(wsPtr[i].op_name, originalMs->ws[i].op_name, NAME_LEN); + + int weightNum = 0; + if (originalMs->ws[i].mdt == DT_BIN01 || originalMs->ws[i].mdt == DT_BIN11) { + wsPtr[i].mdt = originalMs->ws[i].mdt; + weightNum = originalMs->ws[i].bytes_of_weight / bytesOf(DT_F32); + wsPtr[i].bytes_of_weight = weightNum * bytesOf(wsPtr[i].mdt) / 8; + } else { + DataType wdt = originalMs->ws[i].mdt; + if (weightDataTypeMap.find(wsPtr[i].op_name) != weightDataTypeMap.end()) { + wdt = weightDataTypeMap[wsPtr[i].op_name]; + } + CHECK_STATUS(getTargetDataType(convertMode, &wdt)); + + wsPtr[i].mdt = wdt; + if (wdt == DT_F32 || wdt == DT_F16) { + wsPtr[i].mdt = get_storage_type(targetMs, wsPtr[i].op_name, storageMode, wdt); + } + + weightNum = originalMs->ws[i].bytes_of_weight / bytesOf(originalMs->ws[i].mdt); + wsPtr[i].bytes_of_weight = weightNum * bytesOf(wsPtr[i].mdt); + } + + wsPtr[i].num_quant_scale = originalMs->ws[i].num_quant_scale; + if (0 == wsPtr[i].num_quant_scale) { + 
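// Editor's note (illustrative numbers for the DT_I8 branch below): the int8
// path keeps one symmetric scale per weight tensor, computed by
// ws_datatype_converter_int8 as 127 / max|w|. For example, with max|w| = 0.5:
//     F32 scale = 127.0f / 0.5f;            // 254.0, stored in weight_scale[0]
//     INT8 q = (INT8)round(0.30f * scale);  // round(76.2) == 76
//     F32 w = q / scale;                    // ~0.2992, small rounding error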
wsPtr[i].weight_scale = nullptr; + } else { + wsPtr[i].weight_scale = + (QuantSpec *)mt_new_storage(wsPtr[i].num_quant_scale * sizeof(QuantSpec)); + for (U32 j = 0; j < wsPtr[i].num_quant_scale; j++) { + wsPtr[i].weight_scale[j].num_scale = originalMs->ws[i].weight_scale[j].num_scale; + int num = wsPtr[i].weight_scale[j].num_scale; + + wsPtr[i].weight_scale[j].scale = (F32 *)mt_new_storage(num * sizeof(F32)); + memcpy(wsPtr[i].weight_scale[j].scale, originalMs->ws[i].weight_scale[j].scale, + num * sizeof(F32)); + } + } + + wsPtr[i].weight = (U8 *)mt_new_storage(wsPtr[i].bytes_of_weight); + + DataType vdt = DT_F32; + int biasNum = originalMs->ws[i].bytes_of_vec / bytesOf(DT_F32); + CHECK_STATUS(getTargetDataType(convertMode, &vdt)); + if (DT_F32 == vdt && DT_F16 == wsPtr[i].mdt) { + vdt = DT_F16; + } + wsPtr[i].bytes_of_vec = biasNum * bytesOf(vdt); + wsPtr[i].vec = (U8 *)mt_new_storage(wsPtr[i].bytes_of_vec); + + switch (wsPtr[i].mdt) { + case DT_F32: { + CHECK_STATUS(ws_datatype_converter( + originalMs->ws[i].weight, wsPtr[i].weight, weightNum)); + CHECK_STATUS( + ws_datatype_converter(originalMs->ws[i].vec, wsPtr[i].vec, biasNum)); + break; + } + case DT_I32: { + if (DT_I32 == originalMs->ws[i].mdt) { + if (wsPtr[i].bytes_of_weight > 0) { + memcpy(wsPtr[i].weight, originalMs->ws[i].weight, wsPtr[i].bytes_of_weight); + } + if (wsPtr[i].bytes_of_vec > 0) { + memcpy(wsPtr[i].vec, originalMs->ws[i].vec, wsPtr[i].bytes_of_vec); + } + break; + } + CHECK_STATUS(ws_datatype_converter( + originalMs->ws[i].weight, wsPtr[i].weight, weightNum)); + CHECK_STATUS( + ws_datatype_converter(originalMs->ws[i].vec, wsPtr[i].vec, biasNum)); + break; + } + case DT_U32: { + if (DT_U32 == originalMs->ws[i].mdt) { + if (wsPtr[i].bytes_of_weight > 0) { + memcpy(wsPtr[i].weight, originalMs->ws[i].weight, wsPtr[i].bytes_of_weight); + } + if (wsPtr[i].bytes_of_vec > 0) { + memcpy(wsPtr[i].vec, originalMs->ws[i].vec, wsPtr[i].bytes_of_vec); + } + break; + } + CHECK_STATUS(ws_datatype_converter( + originalMs->ws[i].weight, wsPtr[i].weight, weightNum)); + CHECK_STATUS( + ws_datatype_converter(originalMs->ws[i].vec, wsPtr[i].vec, biasNum)); + break; + } +#ifdef _USE_FP16 + case DT_F16: { + CHECK_STATUS(ws_datatype_converter( + originalMs->ws[i].weight, wsPtr[i].weight, weightNum)); + CHECK_STATUS( + ws_datatype_converter(originalMs->ws[i].vec, wsPtr[i].vec, biasNum)); + break; + } +#endif + case DT_I8: { + F32 scale = ws_datatype_converter_int8( + originalMs->ws[i].weight, wsPtr[i].weight, weightNum); + wsPtr[i].num_quant_scale = 1; + wsPtr[i].weight_scale = (QuantSpec *)mt_new_storage(sizeof(QuantSpec)); + wsPtr[i].weight_scale[0].num_scale = 1; + wsPtr[i].weight_scale[0].scale = (F32 *)mt_new_storage(sizeof(F32)); + wsPtr[i].weight_scale[0].scale[0] = scale; + + if (DT_F32 == vdt) { + CHECK_STATUS( + ws_datatype_converter(originalMs->ws[i].vec, wsPtr[i].vec, biasNum)); + } else { +#ifdef _USE_FP16 + CHECK_STATUS( + ws_datatype_converter(originalMs->ws[i].vec, wsPtr[i].vec, biasNum)); +#endif + } + break; + } +#ifdef __aarch64__ + case DT_BIN01: { + F32 scale = + ws_datatype_converter_bnn(originalMs->ws[i].weight, wsPtr[i].weight, weightNum); + CHECK_STATUS(ws_datatype_converter( + originalMs->ws[i].vec, wsPtr[i].vec, biasNum)); // Assume F16 for the vector + // Fuse the weight scale + if (1 != scale) { + F16 *scalePtr = (F16 *)wsPtr[i].vec; + for (int i = 0; i < biasNum / 2; i++) { + scalePtr[i] *= scale; + } + } + break; + } + case DT_BIN11: { + F32 scale = + ws_datatype_converter_bnn(originalMs->ws[i].weight, 
wsPtr[i].weight, weightNum); + CHECK_STATUS(ws_datatype_converter( + originalMs->ws[i].vec, wsPtr[i].vec, biasNum)); // Assume F16 for the vector + // Fuse the weight scale + if (1 != scale) { + F16 *scalePtr = (F16 *)wsPtr[i].vec; + for (int i = 0; i < biasNum / 2; i++) { + scalePtr[i] *= scale; + } + } + break; + } +#endif + default: + return NOT_SUPPORTED; + } + } + targetMs->ws = wsPtr; + + if (nullptr != originalMs->op_relationship_entries) { + targetMs->num_op_tensor_entries = originalMs->num_op_tensor_entries; + targetMs->op_relationship_entries = (OperatorRelationshipMapEntry *)mt_new_storage( + targetMs->num_op_tensor_entries * sizeof(OperatorRelationshipMapEntry)); + for (int i = 0; i < targetMs->num_op_tensor_entries; i++) { + str_copy(targetMs->op_relationship_entries[i].op, + originalMs->op_relationship_entries[i].op, NAME_LEN); + + targetMs->op_relationship_entries[i].num_inputs = + originalMs->op_relationship_entries[i].num_inputs; + targetMs->op_relationship_entries[i].input_op_names = (I8 **)mt_new_storage( + targetMs->op_relationship_entries[i].num_inputs * sizeof(I8 *)); + for (U32 j = 0; j < targetMs->op_relationship_entries[i].num_inputs; j++) { + targetMs->op_relationship_entries[i].input_op_names[j] = + (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(targetMs->op_relationship_entries[i].input_op_names[j], + originalMs->op_relationship_entries[i].input_op_names[j], NAME_LEN); + } + + targetMs->op_relationship_entries[i].num_outputs = + originalMs->op_relationship_entries[i].num_outputs; + targetMs->op_relationship_entries[i].output_op_names = (I8 **)mt_new_storage( + targetMs->op_relationship_entries[i].num_outputs * sizeof(I8 *)); + for (U32 j = 0; j < targetMs->op_relationship_entries[i].num_outputs; j++) { + targetMs->op_relationship_entries[i].output_op_names[j] = + (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(targetMs->op_relationship_entries[i].output_op_names[j], + originalMs->op_relationship_entries[i].output_op_names[j], NAME_LEN); + } + } + } else { + targetMs->num_op_tensor_entries = 0; + targetMs->op_relationship_entries = nullptr; + } + return SUCCESS; +} diff --git a/model_tools/src/model_adaptee.h b/model_tools/src/model_adaptee.h new file mode 100644 index 00000000..9041fcbe --- /dev/null +++ b/model_tools/src/model_adaptee.h @@ -0,0 +1,219 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
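Editor's note on the interface below (illustrative sketch, not part of the patch): each format converter (caffe, onnx, tflite, tensorflow) subclasses ModelAdaptee, implements the three parse/adapt hooks, and overrides only the adapt_* methods its format actually emits; any operator left unimplemented falls through to the error-logging stub generated by REGISTER_EMPTY_ADAPT_OPERATOR. A minimal hypothetical subclass:

class ToyAdaptee : public ModelAdaptee {
protected:
    EE parse_file(std::string dir, std::string mfn) override { return SUCCESS; }
    EE adapt_operators(ModelSpec *ms) override { return SUCCESS; }
    EE adapt_weights(ModelSpec *ms) override { return SUCCESS; }
    // Only operators the format can produce need real adapters.
    ParameterSpec adapt_Relu() override
    {
        ParameterSpec ps;
        initialization_zero(&ps, sizeof(ps));
        ps.relu_spec.neg_slope = 0.1f;  // e.g. a leaky variant
        return ps;
    }
};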
+ +#ifndef _H_MODELADAPTEE +#define _H_MODELADAPTEE + +#include "model_tools.h" +#include "ut_util.h" + +#define REGISTER_EMPTY_ADAPT_OPERATOR(name) \ + virtual ParameterSpec name() \ + { \ + UNI_ERROR_LOG( \ + "%s %s %d UNIMPLEMENT THIS OPERATOR CURRENTLY \n", __FILE__, __func__, __LINE__); \ + ParameterSpec curPs; \ + initialization_zero(&curPs, sizeof(ParameterSpec)); \ + return curPs; \ + } + +class ModelAdaptee { +public: + virtual EE adapt(std::string dir, std::string mfn, ModelSpec *ms) + { + EE ret = parse_file(dir, mfn); + ret = adapt_operators(ms); + ret = adapt_weights(ms); + return ret; + } + + ModelAdaptee() + {} + + virtual ~ModelAdaptee() + {} + +protected: + virtual EE parse_file(std::string dir, std::string mfn) = 0; + + virtual EE adapt_operators(ModelSpec *ms) = 0; + + virtual EE adapt_weights(ModelSpec *ms) = 0; + + virtual EE adapt_operator(OperatorType type, ParameterSpec *ps) + { + if (type == OT_Conv) { + *ps = adapt_Conv(); + } else if (type == OT_Deconvolution) { + *ps = adapt_Deconvolution(); + } else if (type == OT_FC) { + *ps = adapt_Fc(); + } else if (type == OT_RNN) { + *ps = adapt_RNN(); + } else if (type == OT_MatMul) { + *ps = adapt_MatMul(); + } else if (type == OT_Resize) { + *ps = adapt_Resize(); + } else if (type == OT_Pooling) { + *ps = adapt_Pooling(); + } else if (type == OT_Scale) { + *ps = adapt_Scale(); + } else if (type == OT_PRelu) { + *ps = adapt_PRelu(); + } else if (type == OT_BatchNorm) { + *ps = adapt_BatchNorm(); + } else if (type == OT_LayerNorm) { + *ps = adapt_LayerNorm(); + } else if (type == OT_Reduction) { + *ps = adapt_Reduction(); + } else if (type == OT_ArgMax) { + *ps = adapt_ArgMax(); + } else if (type == OT_Softmax) { + *ps = adapt_Softmax(); + } else if (type == OT_Clip) { + *ps = adapt_Clip(); + } else if (type == OT_Power) { + *ps = adapt_Power(); + } else if (type == OT_Relu) { + *ps = adapt_Relu(); + } else if (type == OT_Gather) { + *ps = adapt_Gather(); + } else if (type == OT_Embedding) { + *ps = adapt_Embedding(); + } else if (type == OT_Pad) { + *ps = adapt_Pad(); + } else if (type == OT_Eltwise) { + *ps = adapt_Eltwise(); + } else if (type == OT_Concat) { + *ps = adapt_Concat(); + } else if (type == OT_Slice) { + *ps = adapt_Slice(); + } else if (type == OT_TfSlice) { + *ps = adapt_TfSlice(); + } else if (type == OT_Cast) { + *ps = adapt_Cast(); + } else if (type == OT_Transpose) { + *ps = adapt_Transpose(); + } else if (type == OT_Reshape) { + *ps = adapt_Reshape(); + } else if (type == OT_Squeeze) { + *ps = adapt_Squeeze(); + } else if (type == OT_Unsqueeze) { + *ps = adapt_Unsqueeze(); + } else if (type == OT_Space2Depth) { + *ps = adapt_Space2Depth(); + } else if (type == OT_Depth2Space) { + *ps = adapt_Depth2Space(); + } else if (type == OT_PreAllocatedMemory) { + *ps = adapt_PreAllocatedMemory(); + } else if (type == OT_SharedWeight) { + *ps = adapt_SharedWeight(); + } else if (type == OT_Copy) { + *ps = adapt_Copy(); + } else if (type == OT_Check) { + *ps = adapt_Check(); + } else if (type == OT_Repeat) { + *ps = adapt_Repeat(); + } else if (type == OT_Attention) { + *ps = adapt_Attention(); + } else if (type == OT_AttentionMask) { + *ps = adapt_AttentionMask(); + } else if (type == OT_RelativePositionEmbedding) { + *ps = adapt_RelativePositionEmbedding(); + } else if (type == OT_RelativeShift) { + *ps = adapt_RelativeShift(); + } else if (type == OT_PriorBox) { + *ps = adapt_PriorBox(); + } else if (type == OT_DetectionOutput) { + *ps = adapt_DetectionOutput(); + } else if (type == OT_Yolov3DetectionOutput) 
{ + *ps = adapt_Yolov3DetectionOutput(); + } else if (type == OT_Tile) { + *ps = adapt_Tile(); + } else if (type == OT_Splice) { + *ps = adapt_Splice(); + } + return SUCCESS; + } + + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Conv) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Deconvolution) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Fc) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_RNN) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_MatMul) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Resize) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Pooling) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Scale) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_PRelu) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_BatchNorm) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_LayerNorm) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Reduction) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_ArgMax) + + virtual ParameterSpec adapt_Softmax() + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(ParameterSpec)); + curPs.softmax_spec.axis = -1; + return curPs; + } + + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Clip) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Power) + + virtual ParameterSpec adapt_Relu() + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(ParameterSpec)); + curPs.relu_spec.neg_slope = 0; + return curPs; + } + + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Gather) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Embedding) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Pad) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Eltwise) + + virtual ParameterSpec adapt_Concat() + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(ParameterSpec)); + curPs.concat_spec.axis = 1; + return curPs; + } + + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Slice) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_TfSlice) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Cast) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Transpose) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Reshape) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Squeeze) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Unsqueeze) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Space2Depth) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Depth2Space) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_PreAllocatedMemory) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_SharedWeight) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Copy) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Check) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Repeat) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Attention) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_AttentionMask) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_RelativePositionEmbedding) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_RelativeShift) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_PriorBox) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_DetectionOutput) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Yolov3DetectionOutput) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Tile) + REGISTER_EMPTY_ADAPT_OPERATOR(adapt_Splice) +}; +#endif diff --git a/model_tools/src/model_quantization.cpp b/model_tools/src/model_quantization.cpp new file mode 100644 index 00000000..b44cecb7 --- /dev/null +++ b/model_tools/src/model_quantization.cpp @@ -0,0 +1,96 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "model_quantization.h" + +std::vector SplitScale(const std::string &s, char delim) +{ + std::vector res; + std::stringstream ss(s); + std::string item; + + while (std::getline(ss, item, delim)) { + res.push_back(item); + } + + return res; +} + +void add_scale_from_file(ModelSpec *ms, const char *scaleFile) +{ + std::fstream file(std::string(scaleFile), std::ios::in); + CHECK_REQUIREMENT(file && file.is_open()); + std::map> scaleMap; + std::string line; + UNI_DEBUG_LOG("Scale Table is : \n"); + while (std::getline(file, line)) { + auto res = SplitScale(line, ' '); + CHECK_REQUIREMENT(res.size() == 2); + std::string tensorName = res[0]; + std::vector quantScale; + quantScale.push_back(atof(res[1].c_str())); + scaleMap[tensorName] = quantScale; + UNI_DEBUG_LOG("Tensor[%s] %f\n", tensorName.c_str(), quantScale[0]); + } + file.close(); + for (I32 i = 0; i < (*ms).num_operator_specs; i++) { + if (isDeprecatedOp((*ms).ops[i].type)) { + continue; + } + if ((*ms).ops[i].num_quant_feature == 1 && (*ms).ops[i].feature_scale[0].scale[0] == 0) { + continue; + } + std::vector> scales; + for (U32 j = 0; j < (*ms).ops[i].num_inputs; j++) { + auto it = scaleMap.find((*ms).ops[i].input_tensors_name[j]); + std::vector inputScale; + if (it != scaleMap.end()) { + inputScale.push_back(127.0f / scaleMap[(*ms).ops[i].input_tensors_name[j]][0]); + } else { + inputScale.push_back(-1); + } + scales.push_back(inputScale); + } + for (U32 j = 0; j < (*ms).ops[i].num_outputs; j++) { + auto it = scaleMap.find((*ms).ops[i].output_tensors_name[j]); + std::vector outputScale; + if ((*ms).ops[i].num_quant_feature == 1 && -2 == (*ms).ops[i].feature_scale[0].scale[0]) { + outputScale.push_back(-2); + } else if (it != scaleMap.end()) { + outputScale.push_back(127.0f / scaleMap[(*ms).ops[i].output_tensors_name[j]][0]); + } else { + outputScale.push_back(-1); + } + scales.push_back(outputScale); + } + // Store scales into result model + if (nullptr != (*ms).ops[i].feature_scale) { // Could be labelled with -2 + for (U32 k = 0; k < (*ms).ops[i].num_quant_feature; k++) { + if (nullptr != (*ms).ops[i].feature_scale[k].scale) { + delete (*ms).ops[i].feature_scale[k].scale; + } + } + delete (*ms).ops[i].feature_scale; + } + + (*ms).ops[i].num_quant_feature = scales.size(); + (*ms).ops[i].feature_scale = (QuantSpec *)mt_new_storage(scales.size() * sizeof(QuantSpec)); + + for (U32 k = 0; k < scales.size(); k++) { + (*ms).ops[i].feature_scale[k].num_scale = scales[k].size(); + U32 
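// Editor's note: add_scale_from_file above reads one "tensorName value" pair
// per line, where value is the calibrated maximum |activation|; the stored
// feature scale becomes 127 / value. A hypothetical scale table:
//     conv1_out 0.915
//     fc1_out 2.337
// gives feature scales of roughly 138.8 and 54.3.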
scaleBytes = scales[k].size() * sizeof(F32); + (*ms).ops[i].feature_scale[k].scale = (F32 *)mt_new_storage(scaleBytes); + memcpy((*ms).ops[i].feature_scale[k].scale, scales[k].data(), scaleBytes); + } + } +} diff --git a/model_tools/src/model_tools.cpp b/model_tools/src/model_tools.cpp new file mode 100644 index 00000000..e48c576c --- /dev/null +++ b/model_tools/src/model_tools.cpp @@ -0,0 +1,169 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include + +#include "model_serialize_deserialize.hpp" +#include "model_tools.h" + +extern "C" EE mt_create_model(ModelSpec *ms) +{ + if (nullptr == ms) { + return NULL_POINTER; + } + + ms->version = sg_boltVersion; + ms->magic_number = sg_magicNumber; + ms->input_names = nullptr; + ms->num_inputs = 0; + ms->input_dims = nullptr; + ms->num_outputs = 0; + ms->output_names = nullptr; + ms->num_operator_specs = 0; + ms->ops = nullptr; + ms->num_weight_specs = 0; + ms->ws = nullptr; + ms->num_op_tensor_entries = 0; + ms->op_relationship_entries = nullptr; + + return SUCCESS; +} + +extern "C" EE mt_destroy_model(ModelSpec *ms) +{ + if (nullptr == ms) { + return NULL_POINTER; + } + + if (nullptr != ms->input_names) { + for (int i = 0; i < ms->num_inputs; i++) { + if (nullptr != ms->input_names[i]) { + delete ms->input_names[i]; + } + ms->input_names[i] = nullptr; + } + delete ms->input_names; + ms->input_names = nullptr; + } + + if (nullptr != ms->input_dims) { + delete ms->input_dims; + ms->input_dims = nullptr; + } + + if (nullptr != ms->output_names) { + for (int i = 0; i < ms->num_outputs; i++) { + if (nullptr != ms->output_names[i]) { + delete ms->output_names[i]; + } + ms->output_names[i] = nullptr; + } + delete ms->output_names; + ms->output_names = nullptr; + } + + if (nullptr != ms->ops) { + int op_num = ms->num_operator_specs; + for (int i = 0; i < op_num; i++) { + if (nullptr != ms->ops[i].input_tensors_name) { + for (U32 j = 0; j < ms->ops[i].num_inputs; j++) { + if (nullptr != ms->ops[i].input_tensors_name[j]) { + delete ms->ops[i].input_tensors_name[j]; + } + ms->ops[i].input_tensors_name[j] = nullptr; + } + delete ms->ops[i].input_tensors_name; + ms->ops[i].input_tensors_name = nullptr; + } + if (nullptr != ms->ops[i].output_tensors_name) { + for (U32 j = 0; j < ms->ops[i].num_outputs; j++) { + if (nullptr != ms->ops[i].output_tensors_name[j]) { + delete ms->ops[i].output_tensors_name[j]; + } + ms->ops[i].output_tensors_name[j] = nullptr; + } + delete 
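// Editor's note (usage sketch for the mt_* API; directory and file name are
// placeholders): a caller pairs mt_create_model with mt_destroy_model and
// loads a serialized model through mt_load, defined at the end of this file.
//     ModelSpec ms;
//     CHECK_STATUS(mt_create_model(&ms));
//     CHECK_STATUS(mt_load("/data/models", "example_f16.bolt", &ms));
//     // ... run inference over ms ...
//     CHECK_STATUS(mt_destroy_model(&ms));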
ms->ops[i].output_tensors_name; + ms->ops[i].output_tensors_name = nullptr; + } + + if (nullptr != ms->ops[i].tensor_positions) { + delete ms->ops[i].tensor_positions; + } + + if (0 != ms->ops[i].num_quant_feature && nullptr != ms->ops[i].feature_scale) { + for (U32 j = 0; j < ms->ops[i].num_quant_feature; j++) { + if (0 != ms->ops[i].feature_scale[j].num_scale) { + if (nullptr != ms->ops[i].feature_scale[j].scale) { + delete ms->ops[i].feature_scale[j].scale; + } + } + } + delete ms->ops[i].feature_scale; + } + } + delete ms->ops; + ms->ops = nullptr; + } + + if (nullptr != ms->ws) { + int weightOpNum = ms->num_weight_specs; + for (int i = 0; i < weightOpNum; i++) { + if (nullptr != ms->ws[i].weight) { + delete ms->ws[i].weight; + } + ms->ws[i].weight = nullptr; + if (nullptr != ms->ws[i].vec) { + delete ms->ws[i].vec; + } + ms->ws[i].vec = nullptr; + } + delete ms->ws; + ms->ws = nullptr; + } + + if (nullptr != ms->op_relationship_entries) { + int numOpRelationPair = ms->num_op_tensor_entries; + for (int i = 0; i < numOpRelationPair; i++) { + if (nullptr != ms->op_relationship_entries[i].input_op_names) { + for (U32 j = 0; j < ms->op_relationship_entries[i].num_inputs; j++) { + if (nullptr != ms->op_relationship_entries[i].input_op_names[j]) { + delete ms->op_relationship_entries[i].input_op_names[j]; + } + ms->op_relationship_entries[i].input_op_names[j] = nullptr; + } + delete ms->op_relationship_entries[i].input_op_names; + ms->op_relationship_entries[i].input_op_names = nullptr; + } + if (nullptr != ms->op_relationship_entries[i].output_op_names) { + for (U32 j = 0; j < ms->op_relationship_entries[i].num_outputs; j++) { + if (nullptr != ms->op_relationship_entries[i].output_op_names[j]) { + delete ms->op_relationship_entries[i].output_op_names[j]; + } + ms->op_relationship_entries[i].output_op_names[j] = nullptr; + } + delete ms->op_relationship_entries[i].output_op_names; + ms->op_relationship_entries[i].output_op_names = nullptr; + } + } + delete ms->op_relationship_entries; + ms->op_relationship_entries = nullptr; + } + + return SUCCESS; +} + +EE mt_load(CI8 *dir, CI8 *mfn, ModelSpec *md) +{ + std::string completePath = concat_dir_file(dir, mfn); + return deserialize_model_from_file(completePath.c_str(), md); +} diff --git a/model_tools/src/online_conversion.cpp b/model_tools/src/online_conversion.cpp new file mode 100644 index 00000000..fa10bb36 --- /dev/null +++ b/model_tools/src/online_conversion.cpp @@ -0,0 +1,91 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "online_conversion.h" + +bool fileExist(const std::string &name) +{ + if (FILE *file = fopen(name.c_str(), "r")) { + fclose(file); + return true; + } else { + return false; + } +} + +void *OnlineModelConversion(const char *storagePath, + const char *modelName, + const char *inferPrecision, + I32 removeProcessOpsNum) +{ + DataConvertType converterMode = F32_to_F16; + if (inferPrecision == std::string("PTQ")) { + converterMode = F32_to_F32; + } else if (inferPrecision == std::string("FP16")) { + converterMode = F32_to_F16; + } else if (inferPrecision == std::string("FP32")) { + converterMode = F32_to_F32; + } else { + UNI_ERROR_LOG("Unknown converter data precision : %s", inferPrecision); + } + + ModelSpec *originalMs = new ModelSpec(); + ModelSpec *targetMs = new ModelSpec(); + CHECK_STATUS(mt_create_model(originalMs)); + CHECK_STATUS(mt_create_model(targetMs)); + + std::string spStr = storagePath; + std::string mnStr = modelName; + std::string prefix = (spStr.at(spStr.size() - 1) == '/') ? (spStr + mnStr) + : (spStr + "/" + mnStr); + + if (0) { +#ifdef _USE_CAFFE + } else if (fileExist(prefix + ".prototxt") && fileExist(prefix + ".caffemodel")) { + caffe_converter(storagePath, modelName, originalMs); +#endif +#ifdef _USE_ONNX + } else if (fileExist(prefix + ".onnx")) { + onnx_converter(storagePath, modelName, removeProcessOpsNum, originalMs); +#endif +#ifdef _USE_TFLITE + } else if (fileExist(prefix + ".tflite")) { + tflite_converter(storagePath, modelName, originalMs); +#endif +#ifdef _USE_TENSORFLOW + } else if (fileExist(prefix + ".json")) { + tensorflow_converter(storagePath, modelName, originalMs); +#endif + } else { + UNI_ERROR_LOG("Can not find any valid model, FAIL!"); + } + + ModelSpecOptimizer msOptimizer; + msOptimizer.suggest(inferPrecision == std::string("PTQ")); + msOptimizer.optimize(originalMs); + + CHECK_STATUS(ms_datatype_converter(originalMs, targetMs, converterMode, "NOQUANT")); + CHECK_STATUS(mt_destroy_model(originalMs)); + delete originalMs; + operator_relationship(targetMs); + return (void *)targetMs; +} + +void OnlineModelReclaim(void *ms) +{ + ModelSpec *targetMs = (ModelSpec *)ms; + CHECK_STATUS(mt_destroy_model(targetMs)); + delete targetMs; + return; +} diff --git a/model_tools/src/onnx/CMakeLists.txt b/model_tools/src/onnx/CMakeLists.txt new file mode 100644 index 00000000..3ca6d044 --- /dev/null +++ b/model_tools/src/onnx/CMakeLists.txt @@ -0,0 +1,28 @@ +file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) +file(GLOB commonsrcs ${CMAKE_CURRENT_SOURCE_DIR}/../model_*.cpp) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +include_directories(${Protobuf_INCLUDE_DIR}) + +set(Protobuf_IMPORT_DIRS ${BOLT_ROOT}/third_party/proto) +protobuf_generate_cpp(ONNX_PROTO_SRCS ONNX_PROTO_HDRS ${BOLT_ROOT}/third_party/proto/onnx.proto) + +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(../) + +# shared library +add_library(${PROJECT_NAME}_onnx SHARED ${srcs} ${ONNX_PROTO_HDRS} ${ONNX_PROTO_SRCS} ${commonsrcs}) +if (USE_IOS_CLANG) + target_link_libraries(${PROJECT_NAME}_onnx LINK_PUBLIC uni ${Protobuf_LIBRARY}) +endif (USE_IOS_CLANG) + +# static library +add_library(${PROJECT_NAME}_onnx_static STATIC ${srcs} ${ONNX_PROTO_HDRS} ${ONNX_PROTO_SRCS}) + 
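# Editor's note (hypothetical consumer, not part of this patch): a converter
# tool would link the library built above together with protobuf, e.g.:
#     target_link_libraries(X2bolt PRIVATE ${PROJECT_NAME}_onnx ${Protobuf_LIBRARY})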
+set_target_properties(${PROJECT_NAME}_onnx_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}_onnx") +set_target_properties(${PROJECT_NAME}_onnx PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT_NAME}_onnx_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +install(TARGETS ${PROJECT_NAME}_onnx ${PROJECT_NAME}_onnx_static + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) diff --git a/model_tools/src/onnx/onnx_adaptee.h b/model_tools/src/onnx/onnx_adaptee.h new file mode 100644 index 00000000..58dc2cfc --- /dev/null +++ b/model_tools/src/onnx/onnx_adaptee.h @@ -0,0 +1,1768 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_ONNXADAPTEE +#define _H_ONNXADAPTEE + +#include +#include +#include +#include +#include +#include +#include "onnx.pb.h" + +#include "converter.h" +#include "model_tools.h" +#include "model_adaptee.h" +#include "ut_util.h" + +class OnnxAdaptee : public ModelAdaptee { +public: + OnnxAdaptee(int removePreprocessOpNum_outside) + { + this->removePreprocessOpNum = removePreprocessOpNum_outside; + } + ~OnnxAdaptee() + {} + +protected: + DataType get_weight_data_type(U32 weightLen, F32 *weight) + { + if (1 >= weightLen) { + return DT_F32; + } + F32 val0 = 1; + F32 val1 = 0; + for (U32 i = 0; i < weightLen; i++) { + F32 cur = weight[i]; + if (cur <= 0 && val0 <= 0 && cur != val0) { + return DT_F32; + } + if (cur > 0 && val1 > 0 && cur != val1) { + return DT_F32; + } + if (cur <= 0 && val0 > 0) { + val0 = cur; + } + if (cur > 0 && val1 <= 0) { + val1 = cur; + } + } + if (val0 == 0) { + return DT_BIN01; + } + CHECK_REQUIREMENT(0 == val0 + val1); + return DT_BIN11; + } + + std::vector getOperatorWeightInputIndex(int weightOpIndex) + { + const onnx::NodeProto &weightNode = onnxGraph.node(weightOpIndex); + std::vector index; + for (int i = 0; i < weightNode.input_size(); i++) { + if (weights.end() != weights.find(weightNode.input(i))) { + index.push_back(i); + } + } + return index; + } + + EE read_from_onnx_file(const char *path, google::protobuf::Message *message) + { + std::ifstream fs(path, std::ifstream::in | std::ifstream::binary); + if (!fs.is_open()) { + return NOT_FOUND; + } + + google::protobuf::io::IstreamInputStream input(&fs); + google::protobuf::io::CodedInputStream codedstr(&input); + + codedstr.SetTotalBytesLimit(INT_MAX, INT_MAX / 2); + + bool ret = message->ParseFromCodedStream(&codedstr); + fs.close(); + + return (ret) ? 
SUCCESS : NOT_SUPPORTED; + } + + OperatorType convert_onnx_type(std::string inputType) + { + if (inputType == "Conv") { + return OT_Conv; + } else if (inputType == "BatchNormalization") { + return OT_BatchNorm; + } else if (inputType == "Sum" || inputType == "Add" || inputType == "Mul" || + inputType == "Div" || inputType == "Sub") { + return OT_Eltwise; + } else if (inputType == "Gemm") { + return OT_FC; + } else if (inputType == "AveragePool" || inputType == "MaxPool" || + inputType == "ReduceMean" || inputType == "GlobalAveragePool" || + inputType == "ReduceMax") { + return OT_Pooling; + } else if (inputType == "Relu" || inputType == "LeakyRelu") { + return OT_Relu; + } else if (inputType == "Softmax") { + return OT_Softmax; + } else if (inputType == "Concat") { + return OT_Concat; + } else if (inputType == "Pad") { + return OT_Pad; + } else if (inputType == "Max" || inputType == "Min" || inputType == "Clip") { + return OT_Clip; + } else if (inputType == "Reshape") { + return OT_Reshape; + } else if (inputType == "Squeeze") { + return OT_Squeeze; + } else if (inputType == "Transpose") { + return OT_Transpose; + } else if (inputType == "Gather") { + return OT_Gather; + } else if (inputType == "Unsqueeze") { + return OT_Unsqueeze; + } else if (inputType == "Resize" || inputType == "Upsample") { + return OT_Resize; + } else if (inputType == "Cast") { + return OT_Cast; + } else if (inputType == "Constant") { + return OT_Constant; + } else if (inputType == "MatMul") { + return OT_MatMul; + } else if (inputType == "Flatten") { + return OT_Reshape; + } else if (inputType == "ConvTranspose") { + return OT_Deconvolution; + } else if (inputType == "Tanh") { + return OT_TanH; + } else if (inputType == "LogSoftmax") { + return OT_LogSoftmax; + } else if (inputType == "Shape") { + return OT_Shape; + } else if (inputType == "Erf") { + return OT_Erf; + } else if (inputType == "Pow" || inputType == "Sqrt") { + return OT_Power; + } else if (inputType == "RNN" || inputType == "LSTM" || inputType == "GRU") { + return OT_RNN; + } else if (inputType == "ConstantOfShape") { + return OT_ConstantOfShape; + } else if (inputType == "SpaceToDepth") { + return OT_Space2Depth; + } else if (inputType == "DepthToSpace") { + return OT_Depth2Space; + } else if (inputType == "PRelu") { + return OT_PRelu; + } else if (inputType == "ArgMax") { + return OT_ArgMax; + } else if (inputType == "Tile") { + return OT_Tile; + } else if (inputType == "Sigmoid") { + return OT_Sigmoid; + } else if (inputType == "Slice") { + return OT_Slice; + } else if (inputType == "ReduceSum") { + return OT_Reduction; + } else if (inputType == "Split") { + return OT_Slice; + } else if (inputType == "Splice") { + return OT_Splice; + } else if (inputType == "Greater") { + return OT_Greater; + } else { + UNI_ERROR_LOG( + "encounter unsupported operator in onnx converter: %s\n", inputType.c_str()); + } + return OT_None; + } + + std::vector get_node_vector_ints_attribute_by_name( + const onnx::NodeProto &node, const char *key) + { + std::vector result; + for (int i = 0; i < node.attribute_size(); i++) { + const onnx::AttributeProto &attribute = node.attribute(i); + if (attribute.name() == key) { + result.resize(attribute.ints_size()); + for (int j = 0; j < attribute.ints_size(); j++) { + result[j] = attribute.ints(j); + } + break; + } + } + return result; + } + + std::vector get_node_vector_float_tensor_attribute_by_name( + const onnx::NodeProto &node, const char *key) + { + std::vector result; + for (int i = 0; i < node.attribute_size(); i++) { + 
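// Editor's note (usage sketch for these get_node_*_attribute_by_name helpers;
// "strides", "axis" and "epsilon" are typical ONNX attribute names, shown only
// for illustration):
//     std::vector<int> strides = get_node_vector_ints_attribute_by_name(node, "strides");
//     int axis = get_node_single_int_attribute_by_name(node, "axis", 0);
//     float eps = get_node_float_attribute_by_name(node, "epsilon", 1e-5f);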
const onnx::AttributeProto &attribute = node.attribute(i); + if (attribute.name() == key) { + CHECK_REQUIREMENT(4 == attribute.type()); + const onnx::TensorProto &tp = attribute.t(); + F32 *value; + if (tp.has_raw_data()) { + const std::string &rawData = tp.raw_data(); + value = (F32 *)(rawData.data()); + } else if (tp.data_type() == 1) { + value = (F32 *)(tp.float_data().data()); + } else { + UNI_WARNING_LOG("Constant not extracted\n"); + return result; + } + + result.resize(tp.dims(0)); + for (int j = 0; j < tp.dims(0); j++) { + result[j] = value[j]; + } + break; + } + } + return result; + } + + int get_node_single_int_attribute_by_name( + const onnx::NodeProto &node, const char *key, int defaultValue = 0) + { + for (int i = 0; i < node.attribute_size(); i++) { + const onnx::AttributeProto &attribute = node.attribute(i); + if (attribute.name() == key) { + return attribute.i(); + } + } + return defaultValue; + } + + std::string get_node_str_attribute_by_name(const onnx::NodeProto &node, + const char *key, + const std::string &defaultValue = std::string()) + { + for (int i = 0; i < node.attribute_size(); i++) { + const onnx::AttributeProto &attribute = node.attribute(i); + if (attribute.name() == key) { + return attribute.s(); + } + } + return defaultValue; + } + + float get_node_float_attribute_by_name( + const onnx::NodeProto &node, const char *key, float defaultValue = 0.f) + { + for (int i = 0; i < node.attribute_size(); i++) { + const onnx::AttributeProto &attribute = node.attribute(i); + if (attribute.name() == key) { + return attribute.f(); + } + } + return defaultValue; + } + + int get_data_size_from_tensor_proto(const onnx::TensorProto &tensorProto) + { + if (tensorProto.has_raw_data()) { + const std::string &rawData = tensorProto.raw_data(); + int size = (int)rawData.size() / sizeof(float); + return size; + } else if (tensorProto.data_type() == 1) { + return tensorProto.float_data_size(); + } + return 0; + } + + float *get_ptr_from_weight_obj(const onnx::TensorProto &tensorProto) + { + if (tensorProto.has_raw_data()) { + const std::string &rawData = tensorProto.raw_data(); + float *paramPtr = (float *)(rawData.data()); + return paramPtr; + } else if (tensorProto.data_type() == 1) { + float *paramPtr = (float *)(tensorProto.float_data().data()); + return paramPtr; + } + return nullptr; + } + + std::vector get_reshapeInfo_from_tensorProto(const onnx::TensorProto &tp) + { + int size = 0; + std::vector shape; + + // int64 + if (tp.data_type() == 7 || tp.data_type() == 0) { + const int64_t *shapeData = 0; + if (tp.has_raw_data()) { + shapeData = (const int64_t *)tp.raw_data().data(); + size = tp.raw_data().size() / 8; + } else { + shapeData = tp.int64_data().data(); + size = tp.int64_data_size(); + } + + for (int j = 0; j < size; j++) { + shape.push_back(shapeData[j]); + } + } else if (tp.data_type() == 6) { // int32 + const int32_t *shapeData = 0; + if (tp.has_raw_data()) { + shapeData = (const int32_t *)tp.raw_data().data(); + size = tp.raw_data().size() / 4; + } else { + shapeData = tp.int32_data().data(); + size = tp.int32_data_size(); + } + + for (int j = 0; j < size; j++) { + shape.push_back(shapeData[j]); + } + } else { + UNI_ERROR_LOG("UnSupport data type\n"); + } + return shape; + } + + float getSinFloat_from_tensorProto(const onnx::TensorProto &tp) + { + float value = 0; + int exponentSize = get_data_size_from_tensor_proto(tp); + if (tp.data_type() != 1 || exponentSize != 1) { + UNI_ERROR_LOG("UnSupport this data type or the num of params exceeds 1.\n"); + } else { + if 
(tp.has_raw_data()) {
+                const std::string &raw_data = tp.raw_data();
+                value = ((float *)raw_data.data())[0];
+            } else {
+                value = ((float *)tp.float_data().data())[0];
+            }
+        }
+        return value;
+    }
+
+    EE parse_file(std::string dir, std::string mfn) override
+    {
+        std::string onnxSuffix = ".onnx";
+        std::string onnxPath = dir + "/" + mfn + onnxSuffix;
+
+        this->modelName = mfn;
+
+        EE ret = read_from_onnx_file(onnxPath.c_str(), (google::protobuf::Message *)(&onnxModel));
+        if (ret != SUCCESS) {
+            UNI_ERROR_LOG("failed to load onnx model %s\n", onnxPath.c_str());
+        }
+
+        onnxGraph = onnxModel.graph();
+
+        for (int i = 0; i < onnxGraph.initializer_size(); i++) {
+            const onnx::TensorProto &initializer = onnxGraph.initializer(i);
+            weights[initializer.name()] = initializer;
+        }
+        return ret;
+    }
+
+    EE adapt_operators(ModelSpec *ms) override
+    {
+        EE ret = SUCCESS;
+        str_copy(ms->model_name, modelName.c_str(), modelName.length());
+        ms->model_name[NAME_LEN - 1] = '\0';
+        ms->dt = DT_F32;
+
+        int onnxNodeCount = onnxGraph.node_size();
+        int input_node_num = onnxGraph.input().size();
+        int output_node_num = onnxGraph.output().size();
+        if (input_node_num != 1) {
+            UNI_WARNING_LOG("number of input nodes is not 1\n");
+        }
+
+        std::vector<std::string> exactly_input_names;
+        std::vector<std::vector<int>> input_dimens;
+        for (int i = 0; i < input_node_num; i++) {
+            auto input_node = onnxGraph.input(i);
+            std::string cur_input_name = input_node.name();
+            if (weights.find(cur_input_name) != weights.end()) {
+                continue;
+            }
+            exactly_input_names.push_back(cur_input_name);
+
+            std::vector<int> dims_list;
+            int node_dimension_size = input_node.type().tensor_type().shape().dim().size();
+            if (node_dimension_size == 4) {
+                // extract the shape of a 4-dimensional input tensor (batch 0 defaults to 1)
+                int dim_0 = input_node.type().tensor_type().shape().dim(0).dim_value();
+                if (dim_0 == 0) {
+                    dims_list.push_back(1);
+                } else {
+                    dims_list.push_back(input_node.type().tensor_type().shape().dim(0).dim_value());
+                }
+                dims_list.push_back(input_node.type().tensor_type().shape().dim(1).dim_value());
+                dims_list.push_back(input_node.type().tensor_type().shape().dim(2).dim_value());
+                dims_list.push_back(input_node.type().tensor_type().shape().dim(3).dim_value());
+            } else if (node_dimension_size == 3 || node_dimension_size == 2) {
+                for (int j = 0; j < node_dimension_size; j++) {
+                    dims_list.push_back(input_node.type().tensor_type().shape().dim(j).dim_value());
+                }
+            } else {
+                UNI_WARNING_LOG("unsupported input dimension!\n");
+            }
+            input_dimens.push_back(dims_list);
+        }
+
+        input_node_num = exactly_input_names.size();
+        ms->num_inputs = input_node_num;
+        ms->input_names = (I8 **)mt_new_storage(ms->num_inputs * sizeof(I8 *));
+        if (exactly_input_names.size() == 1) {
+            const onnx::NodeProto &theFirstNode = onnxGraph.node(removePreprocessOpNum);
+            for (int k = 0; k < theFirstNode.input_size(); k++) {
+                if (weights.find(theFirstNode.input(k)) != weights.end()) {
+                    continue;
+                } else {
+                    std::string modelInputName = theFirstNode.input(k);
+                    exactly_input_names[0] = modelInputName;
+                    break;
+                }
+            }
+        }
+
+        for (int k = 0; k < input_node_num; k++) {
+            ms->input_names[k] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8));
+            str_copy(ms->input_names[k], exactly_input_names[k].c_str(),
+                exactly_input_names[k].length());
+        }
+        ms->input_dims = (TensorDesc *)mt_new_storage(sizeof(TensorDesc) * ms->num_inputs);
+        for (int i = 0; i < ms->num_inputs; i++) {
+            int curInputDimSize = input_dimens[i].size();
+            TensorDesc input_desc;
+            if (curInputDimSize == 4) {
+                input_desc = tensor4d(DT_F32, input_dimens[i][0],
input_dimens[i][1], + input_dimens[i][2], input_dimens[i][3]); + } else if (curInputDimSize == 3) { + input_desc = ms->input_dims[i] = tensor3df( + DT_F32, DF_MTK, input_dimens[i][0], input_dimens[i][1], input_dimens[i][2]); + } else if (curInputDimSize == 2) { + input_desc = ms->input_dims[i] = + tensor2df(DT_F32, DF_NORMAL, input_dimens[i][0], input_dimens[i][1]); + } else { + UNI_WARNING_LOG("not support input dimension!\n"); + } + ms->input_dims[i] = input_desc; + } + + ms->num_outputs = output_node_num; + ms->output_names = (I8 **)mt_new_storage(ms->num_outputs * sizeof(I8 *)); + for (int k = 0; k < output_node_num; k++) { + ms->output_names[k] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(ms->output_names[k], onnxGraph.output(k).name().c_str(), + onnxGraph.output(k).name().length()); + } + + int bnOpNum = 0; + int constantOpNum = 0; + for (int i = 0; i < onnxNodeCount; i++) { + const onnx::NodeProto &tmpNode = onnxGraph.node(i); + if (tmpNode.op_type() == "BatchNormalization") { + bnOpNum++; + } else if (tmpNode.op_type() == "Constant") { + if (i >= removePreprocessOpNum) { + constantOpNum++; + } + } + } + + ms->num_operator_specs = onnxNodeCount + bnOpNum - constantOpNum - + removePreprocessOpNum; // appending space for scale op + OperatorSpec *opsPtr = + (OperatorSpec *)mt_new_storage(sizeof(OperatorSpec) * ms->num_operator_specs); + ms->ops = opsPtr; + for (I32 i = 0; i < ms->num_operator_specs; i++) { + ms->ops[i].tensor_positions = nullptr; + ms->ops[i].num_quant_feature = 0; + ms->ops[i].feature_scale = nullptr; + } + + // Some models transformed from TF store weight and bias as Constant OP + int numUnseenConstants = 0; + nodeIndex = 0; + for (int i = 0; i < removePreprocessOpNum; i++) { + this->node = onnxGraph.node(nodeIndex); + this->op = node.op_type(); + if (op == "Constant") { + handle_Constant(); + numUnseenConstants++; + } + nodeIndex++; + } + if (0 != numUnseenConstants) { + UNI_INFO_LOG("%d OPs are skipped, and %d of them are Constant OP.\n", + removePreprocessOpNum, numUnseenConstants); + } + + nodeIndex = removePreprocessOpNum; + int opIndex = 0; + for (int i = removePreprocessOpNum; i < onnxNodeCount; i++) { + this->node = onnxGraph.node(nodeIndex); + this->op = node.op_type(); + if (op == "Constant") { + handle_Constant(); + nodeIndex++; + continue; + } + std::string opName = node.name(); + if (opName.empty()) { + opName = node.output(0); + } + int opInputNum = (int)node.input_size(); + opFinalInputNum = opInputNum; + std::vector inputNames; + std::vector op_weight_objs; + for (int j = 0; j < opInputNum; j++) { + const std::string &input_name = node.input(j); + if (weights.find(input_name) != weights.end()) { + opFinalInputNum--; + op_weight_objs.push_back(input_name); + } else { + inputNames.push_back(input_name); + if (op == "Max" || op == "Min") { + opFinalInputNum = 1; + break; + } + } + } + int opOutputNum = (int)node.output_size(); + std::vector outputNames; + for (int j = 0; j < opOutputNum; j++) { + const std::string &output_name = node.output(j); + outputNames.push_back(output_name); + } + + str_copy(opsPtr[opIndex].name, opName.c_str(), opName.length()); + OperatorType opType = convert_onnx_type(op); + + // op type correction + if (op == "MatMul" && opFinalInputNum == 1) { + opType = OT_FC; + } + + opsPtr[opIndex].type = opType; + opsPtr[opIndex].num_inputs = opFinalInputNum; + opsPtr[opIndex].input_tensors_name = + (I8 **)mt_new_storage(opsPtr[opIndex].num_inputs * sizeof(I8 *)); + for (U32 j = 0; j < opsPtr[opIndex].num_inputs; j++) { + 
opsPtr[opIndex].input_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(opsPtr[opIndex].input_tensors_name[j], inputNames[j].c_str(), + inputNames[j].length()); + } + opsPtr[opIndex].num_outputs = opOutputNum; + opsPtr[opIndex].output_tensors_name = + (I8 **)mt_new_storage(opsPtr[opIndex].num_outputs * sizeof(I8 *)); + for (U32 j = 0; j < opsPtr[opIndex].num_outputs; j++) { + opsPtr[opIndex].output_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(opsPtr[opIndex].output_tensors_name[j], outputNames[j].c_str(), + outputNames[j].length()); + } + + if ((op == "Add" || op == "Mul" || op == "Div") && opFinalInputNum == 1) { + weightOpIndexLists.push_back(nodeIndex); + opsPtr[opIndex].type = OT_Scale; + initialization_zero(&(opsPtr[opIndex].ps), sizeof(opsPtr[opIndex].ps)); + opsPtr[opIndex].ps.scale_spec.axis = 1; + } else if (op == "Transpose" && opFinalInputNum == 0) { + weightOpIndexLists.push_back(nodeIndex); + } else { + if (op == "Gather") { + if (weights.find(node.input(0)) != weights.end()) { + weightOpIndexLists.push_back(nodeIndex); + if (weights.find(node.input(1)) != weights.end()) { // both provided + opsPtr[opIndex].type = OT_SharedWeight; + opType = OT_SharedWeight; + } else { + opsPtr[opIndex].type = OT_Embedding; + opType = OT_Embedding; + } + } else if (weights.find(node.input(1)) != weights.end()) { + opType = OT_Slice; + opsPtr[opIndex].type = OT_Slice; + initialization_zero(&(opsPtr[opIndex].ps), sizeof(opsPtr[opIndex].ps)); + opsPtr[opIndex].ps.slice_spec.slice_points[0] = 1; + opsPtr[opIndex].ps.slice_spec.slice_size = 1; + opsPtr[opIndex].ps.slice_spec.axis = 1; + opsPtr[opIndex].num_outputs = 2; + free(opsPtr[opIndex].output_tensors_name[0]); + free(opsPtr[opIndex].output_tensors_name); + opsPtr[opIndex].output_tensors_name = + (I8 **)mt_new_storage(opsPtr[opIndex].num_outputs * sizeof(I8 *)); + opsPtr[opIndex].output_tensors_name[0] = + (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(opsPtr[opIndex].output_tensors_name[0], outputNames[0].c_str(), + outputNames[0].length()); + opsPtr[opIndex].output_tensors_name[1] = + (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + std::string reduntStr = "DropOut_Str"; + str_copy(opsPtr[opIndex].output_tensors_name[1], reduntStr.c_str(), + reduntStr.length()); + } + } + + ParameterSpec curPs; + ret = adapt_operator(opType, &curPs); + CHECK_STATUS(ret); + opsPtr[opIndex].ps = curPs; + + if (opType == OT_BatchNorm) { + std::string scaleInputName = outputNames[0]; + std::string scaleOpName = "scale_" + opName; + opIndex++; + str_copy(opsPtr[opIndex].name, scaleOpName.c_str(), scaleOpName.length()); + opsPtr[opIndex].type = OT_Scale; + opsPtr[opIndex].ps.scale_spec.axis = 1; + opsPtr[opIndex].num_inputs = 1; + opsPtr[opIndex].input_tensors_name = (I8 **)mt_new_storage(sizeof(I8 *)); + opsPtr[opIndex].input_tensors_name[0] = + (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(opsPtr[opIndex].input_tensors_name[0], scaleInputName.c_str(), + scaleInputName.length()); + opsPtr[opIndex].num_outputs = 1; + opsPtr[opIndex].output_tensors_name = (I8 **)mt_new_storage(sizeof(I8 *)); + opsPtr[opIndex].output_tensors_name[0] = + (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(opsPtr[opIndex].output_tensors_name[0], scaleInputName.c_str(), + scaleInputName.length()); + + ParameterSpec scalePs; + ret = adapt_operator(opsPtr[opIndex].type, &scalePs); + CHECK_STATUS(ret); + opsPtr[opIndex].ps = scalePs; + } + } + + nodeIndex++; + opIndex++; + } + ms->num_weight_specs = 
weightOpIndexLists.size() + bnOpNum; + return ret; + } + + EE adapt_weights(ModelSpec *ms) override + { + EE ret = SUCCESS; + WeightSpec *wsPtr = (WeightSpec *)mt_new_storage(sizeof(WeightSpec) * ms->num_weight_specs); + for (int j = 0; j < ms->num_weight_specs; j++) { + wsPtr[j].num_quant_scale = 0; + wsPtr[j].weight_scale = nullptr; + } + ms->ws = wsPtr; + int weightOpIndexIndeed = 0; + for (U32 i = 0; i < (U32)ms->num_weight_specs; i++) { + int weightOpIndex = weightOpIndexLists[weightOpIndexIndeed]; + const onnx::NodeProto &weightNode = onnxGraph.node(weightOpIndex); + std::string weightOpName = weightNode.name(); + if (weightOpName.empty()) { + weightOpName = weightNode.output(0); + } + const std::string &weightOpType = weightNode.op_type(); + + if (weightOpType == "Conv" || weightOpType == "ConvTranspose") { + // to check that if any op has bias + int convInputNum = + weightNode.input_size(); // if convInputNum == 3, means has bias , otherwise , do not have bias + + const onnx::TensorProto &convWeightTp = weights[weightNode.input(1)]; + + int convWeightNum = get_data_size_from_tensor_proto(convWeightTp); + float *convWeightParamPtr = get_ptr_from_weight_obj(convWeightTp); + str_copy(wsPtr[i].op_name, weightOpName.c_str(), weightOpName.length()); + + // traverse weight elements to see whether it is bnn convolution + wsPtr[i].mdt = get_weight_data_type(convWeightNum, convWeightParamPtr); + wsPtr[i].bytes_of_weight = + convWeightNum * sizeof(float); // Please do not change to bytesOf(mdt) + wsPtr[i].weight = (U8 *)mt_new_storage(wsPtr[i].bytes_of_weight); + memcpy(wsPtr[i].weight, convWeightParamPtr, wsPtr[i].bytes_of_weight); + + int convBiasNum = 0; + float *convBiasParamPtr = nullptr; + if (convInputNum == 3) { + const onnx::TensorProto &convBiasTp = weights[weightNode.input(2)]; + convBiasNum = get_data_size_from_tensor_proto(convBiasTp); + convBiasParamPtr = get_ptr_from_weight_obj(convBiasTp); + wsPtr[i].bytes_of_vec = convBiasNum * sizeof(float); + if (DT_BIN11 == wsPtr[i].mdt || DT_BIN01 == wsPtr[i].mdt) { + wsPtr[i].bytes_of_vec *= + 2; // BNN conv must have a scale vector and a bias vector, so that it can fuse with BN + } + wsPtr[i].vec = (U8 *)mt_new_storage(wsPtr[i].bytes_of_vec); + if (DT_BIN11 == wsPtr[i].mdt || DT_BIN01 == wsPtr[i].mdt) { + U32 vecBytes = convBiasNum * sizeof(float); + F32 *scale = (F32 *)wsPtr[i].vec; + for (I32 j = 0; j < convBiasNum; j++) { + scale[j] = 1.0; + } + memcpy(wsPtr[i].vec + vecBytes, convBiasParamPtr, + vecBytes); // Copy bias (if any) to the second half for BNN + } else { + memcpy(wsPtr[i].vec, convBiasParamPtr, wsPtr[i].bytes_of_vec); + } + } else { + wsPtr[i].bytes_of_vec = 0; + wsPtr[i].vec = nullptr; + } + } else if (weightOpType == "Gemm") { + // attention: fc op weight bias order is different from conv op + const onnx::TensorProto &fcWeightTp = weights[weightNode.input(1)]; + const onnx::TensorProto &fcBiasTp = weights[weightNode.input(2)]; + int fcWeightNum = get_data_size_from_tensor_proto(fcWeightTp); + int fcBiasNum = get_data_size_from_tensor_proto(fcBiasTp); + float *fcWeightParamPtr = get_ptr_from_weight_obj(fcWeightTp); + float *fcBiasParamPtr = get_ptr_from_weight_obj(fcBiasTp); + str_copy(wsPtr[i].op_name, weightOpName.c_str(), weightOpName.length()); + wsPtr[i].mdt = DT_F32; + wsPtr[i].bytes_of_weight = fcWeightNum * sizeof(float); + wsPtr[i].weight = (U8 *)mt_new_storage(wsPtr[i].bytes_of_weight); + memcpy(wsPtr[i].weight, fcWeightParamPtr, wsPtr[i].bytes_of_weight); + wsPtr[i].bytes_of_vec = fcBiasNum * sizeof(float); + 
wsPtr[i].vec = (U8 *)mt_new_storage(wsPtr[i].bytes_of_vec); + memcpy(wsPtr[i].vec, fcBiasParamPtr, wsPtr[i].bytes_of_vec); + } else if (weightOpType == "BatchNormalization") { + const onnx::TensorProto &scale = weights[weightNode.input(1)]; + const onnx::TensorProto &bias = weights[weightNode.input(2)]; + const onnx::TensorProto &mean = weights[weightNode.input(3)]; + const onnx::TensorProto &var = weights[weightNode.input(4)]; + + float *meanPtr = get_ptr_from_weight_obj(mean); + int bnMeanNum = get_data_size_from_tensor_proto(mean); + float *varPtr = get_ptr_from_weight_obj(var); + int bnVarNum = get_data_size_from_tensor_proto(var); + + str_copy(wsPtr[i].op_name, weightOpName.c_str(), weightOpName.length()); + wsPtr[i].mdt = DT_F32; + wsPtr[i].bytes_of_weight = bnMeanNum * sizeof(float); + wsPtr[i].bytes_of_vec = bnVarNum * sizeof(float); + + wsPtr[i].weight = (U8 *)mt_new_storage(wsPtr[i].bytes_of_weight); + memcpy(wsPtr[i].weight, meanPtr, wsPtr[i].bytes_of_weight); + wsPtr[i].vec = (U8 *)mt_new_storage(wsPtr[i].bytes_of_vec); + memcpy(wsPtr[i].vec, varPtr, wsPtr[i].bytes_of_vec); + + // for scale + std::string scaleWeightOpName = "scale_" + weightOpName; + i = i + 1; + float *scalePtr = get_ptr_from_weight_obj(scale); + int scaleWeightNum = get_data_size_from_tensor_proto(scale); + float *biasPtr = get_ptr_from_weight_obj(bias); + int scaleBiasNum = get_data_size_from_tensor_proto(bias); + + str_copy(wsPtr[i].op_name, scaleWeightOpName.c_str(), scaleWeightOpName.length()); + wsPtr[i].mdt = DT_F32; + wsPtr[i].bytes_of_weight = scaleWeightNum * sizeof(float); + wsPtr[i].bytes_of_vec = scaleBiasNum * sizeof(float); + + wsPtr[i].weight = (U8 *)mt_new_storage(wsPtr[i].bytes_of_weight); + memcpy(wsPtr[i].weight, scalePtr, wsPtr[i].bytes_of_weight); + wsPtr[i].vec = (U8 *)mt_new_storage(wsPtr[i].bytes_of_vec); + memcpy(wsPtr[i].vec, biasPtr, wsPtr[i].bytes_of_vec); + } else if (weightOpType == "Add") { + const onnx::TensorProto &bias = weights[weightNode.input(1)]; + float *bias_ptr = get_ptr_from_weight_obj(bias); + int bias_num = get_data_size_from_tensor_proto(bias); + + str_copy(wsPtr[i].op_name, weightOpName.c_str(), weightOpName.length()); + wsPtr[i].mdt = DT_F32; + wsPtr[i].bytes_of_weight = 0; + wsPtr[i].bytes_of_vec = bias_num * sizeof(float); + wsPtr[i].weight = nullptr; + wsPtr[i].vec = (U8 *)mt_new_storage(wsPtr[i].bytes_of_vec); + memcpy(wsPtr[i].vec, bias_ptr, wsPtr[i].bytes_of_vec); + } else if (weightOpType == "Mul") { + auto indices = getOperatorWeightInputIndex(weightOpIndex); + CHECK_REQUIREMENT(0 != indices.size()); + const onnx::TensorProto &weight = weights[weightNode.input(indices[0])]; + float *weight_ptr = get_ptr_from_weight_obj(weight); + int weight_num = get_data_size_from_tensor_proto(weight); + + str_copy(wsPtr[i].op_name, weightOpName.c_str(), weightOpName.length()); + wsPtr[i].mdt = DT_F32; + wsPtr[i].bytes_of_weight = weight_num * sizeof(float); + wsPtr[i].bytes_of_vec = 0; + wsPtr[i].weight = (U8 *)mt_new_storage(wsPtr[i].bytes_of_weight); + memcpy(wsPtr[i].weight, weight_ptr, wsPtr[i].bytes_of_weight); + wsPtr[i].vec = nullptr; + } else if (weightOpType == "MatMul" || weightOpType == "PRelu") { + const onnx::TensorProto &weight = weights[weightNode.input(1)]; + float *weight_ptr = get_ptr_from_weight_obj(weight); + int weight_num = get_data_size_from_tensor_proto(weight); + + str_copy(wsPtr[i].op_name, weightOpName.c_str(), weightOpName.length()); + wsPtr[i].mdt = DT_F32; + wsPtr[i].bytes_of_weight = weight_num * sizeof(float); + wsPtr[i].bytes_of_vec = 0; 
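+                // The copy below transposes the two-dimensional MatMul weight
+                // while storing it: ONNX keeps it row-major as [row x column],
+                // and element (n, m) of the source lands at (m, n) of the
+                // destination, so the stored layout is [column x row].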
+ wsPtr[i].weight = (U8 *)mt_new_storage(wsPtr[i].bytes_of_weight); + int row = weight.dims(0); + int column = weight.dims(1); + for (int m = 0, index = 0; m < column; m++) { + for (int n = 0; n < row; n++, index += sizeof(float)) { + memcpy(wsPtr[i].weight + index, weight_ptr + n * column + m, sizeof(float)); + } + } + wsPtr[i].vec = nullptr; + } else if (weightOpType == "Div") { + const onnx::TensorProto &weight = weights[weightNode.input(1)]; + float *weight_ptr = get_ptr_from_weight_obj(weight); + int weight_num = get_data_size_from_tensor_proto(weight); + + str_copy(wsPtr[i].op_name, weightOpName.c_str(), weightOpName.length()); + wsPtr[i].mdt = DT_F32; + wsPtr[i].bytes_of_weight = weight_num * sizeof(float); + wsPtr[i].bytes_of_vec = 0; + wsPtr[i].weight = (U8 *)mt_new_storage(wsPtr[i].bytes_of_weight); + F32 *scale = (F32 *)wsPtr[i].weight; + memcpy(scale, weight_ptr, wsPtr[i].bytes_of_weight); + for (int j = 0; j < weight_num; j++) { + scale[j] = 1 / scale[j]; + } + wsPtr[i].vec = nullptr; + } else if (weightOpType == "Transpose") { + const onnx::TensorProto &weight = weights[weightNode.input(0)]; + float *weight_ptr = get_ptr_from_weight_obj(weight); + int weight_num = get_data_size_from_tensor_proto(weight); + + str_copy(wsPtr[i].op_name, weightOpName.c_str(), weightOpName.length()); + wsPtr[i].mdt = DT_F32; + wsPtr[i].bytes_of_weight = weight_num * sizeof(float); + // For the time being, use bytes_of_vec to record the horizontal length of weight + wsPtr[i].bytes_of_vec = weight.dims(0); + wsPtr[i].weight = (U8 *)mt_new_storage(wsPtr[i].bytes_of_weight); + memcpy(wsPtr[i].weight, weight_ptr, wsPtr[i].bytes_of_weight); + wsPtr[i].vec = nullptr; + } else if (weightOpType == "LSTM") { + const onnx::TensorProto &W = weights[weightNode.input(1)]; + const onnx::TensorProto &R = weights[weightNode.input(2)]; + const onnx::TensorProto &B = weights[weightNode.input(3)]; + + float *W_ptr = get_ptr_from_weight_obj(W); + float *R_ptr = get_ptr_from_weight_obj(R); + float *B_ptr = get_ptr_from_weight_obj(B); + + int W_dim_size = W.dims_size(); + int R_dim_size = R.dims_size(); + int iter_times = 1; + std::vector W_dims_vec; + std::vector R_dims_vec; + if (W_dim_size != R_dim_size) { + UNI_ERROR_LOG("not support onnx LSTM W != R\n"); + } else { + for (int k = 0; k < W_dim_size - 1; k++) { + W_dims_vec.push_back(W.dims(k)); + R_dims_vec.push_back(R.dims(k)); + iter_times *= W.dims(k); + } + } + int W_con_dim_size = W.dims(W_dim_size - 1); + int R_con_dim_size = R.dims(R_dim_size - 1); + int W_weight_num = get_data_size_from_tensor_proto(W); + int R_weight_num = get_data_size_from_tensor_proto(R); + int B_weight_num = get_data_size_from_tensor_proto(B); + + wsPtr[i].mdt = DT_F32; + str_copy(wsPtr[i].op_name, weightOpName.c_str(), weightOpName.length()); + wsPtr[i].bytes_of_weight = (W_weight_num + R_weight_num) * sizeof(float); + wsPtr[i].weight = (U8 *)mt_new_storage(wsPtr[i].bytes_of_weight); + int begin_index = 0; + for (int k = 0; k < iter_times; k++) { + memcpy(((float *)wsPtr[i].weight) + begin_index, W_ptr + k * W_con_dim_size, + W_con_dim_size * sizeof(float)); + memcpy(((float *)wsPtr[i].weight) + begin_index + W_con_dim_size, + R_ptr + k * R_con_dim_size, R_con_dim_size * sizeof(float)); + begin_index += (W_con_dim_size + R_con_dim_size); + } + wsPtr[i].bytes_of_vec = B_weight_num * sizeof(float); + wsPtr[i].vec = (U8 *)mt_new_storage(wsPtr[i].bytes_of_vec); + memcpy(wsPtr[i].vec, B_ptr, wsPtr[i].bytes_of_vec); + } else if (weightOpType == "Gather") { + auto weightTp = 
weights[weightNode.input(0)]; + int weightNum = get_data_size_from_tensor_proto(weightTp); + float *weightParamPtr = get_ptr_from_weight_obj(weightTp); + str_copy(wsPtr[i].op_name, weightOpName.c_str(), weightOpName.length()); + wsPtr[i].mdt = DT_F32; + wsPtr[i].bytes_of_weight = weightNum * sizeof(float); + wsPtr[i].weight = (U8 *)mt_new_storage(wsPtr[i].bytes_of_weight); + memcpy(wsPtr[i].weight, weightParamPtr, wsPtr[i].bytes_of_weight); + wsPtr[i].bytes_of_vec = 0; + wsPtr[i].vec = nullptr; + } else if (weightOpType == "Splice") { + std::vector indices = + get_node_vector_ints_attribute_by_name(weightNode, "forward_indexes"); + str_copy(wsPtr[i].op_name, weightOpName.c_str(), weightOpName.length()); + wsPtr[i].mdt = DT_U32; + wsPtr[i].bytes_of_weight = indices.size() * sizeof(U32); + wsPtr[i].weight = (U8 *)mt_new_storage(wsPtr[i].bytes_of_weight); + memcpy(wsPtr[i].weight, indices.data(), wsPtr[i].bytes_of_weight); + wsPtr[i].bytes_of_vec = 0; + wsPtr[i].vec = nullptr; + } + + weightOpIndexIndeed++; + } + return ret; + } + + ParameterSpec adapt_SharedWeight() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + const onnx::TensorProto& data = weights[node.input(0)]; + const onnx::TensorProto& ind = weights[node.input(1)]; + SharedWeightParamSpec sharedWeightPs; + sharedWeightPs.desc.nDims = 3; + sharedWeightPs.desc.dims[2] = 1; + sharedWeightPs.desc.dims[1] = ind.dims(1); + sharedWeightPs.desc.dims[0] = data.dims(1); + sharedWeightPs.desc.df = DF_NORMAL; + sharedWeightPs.desc.dt = DT_F32; + UNI_DEBUG_LOG("SharedWeight: %s\n" ,tensorDesc2Str(sharedWeightPs.desc).c_str()); + curPs.shared_weight_spec = sharedWeightPs; + return curPs; + } + + ParameterSpec adapt_Reshape() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ReshapeParamSpec reshapePs; + initialization_zero(&reshapePs, sizeof(reshapePs)); + std::vector reshapeInfo; + if (this->op == "Flatten") { + int axis = get_node_single_int_attribute_by_name(node, "axis", 1); + for (int i = 0; i < axis; i++) { + reshapeInfo.push_back(0); + } + reshapeInfo.push_back(-1); + } else { + if (node.input_size() == 1) { + reshapeInfo = get_node_vector_ints_attribute_by_name(node, "shape"); + } else { + reshapeInfo = get_reshapeInfo_from_tensorProto( + weights[node.input(1)]); // tp:weights[node.input(1)] + } + } + reshapePs.shape_size = reshapeInfo.size(); + memcpy(reshapePs.shape_dims, reshapeInfo.data(), reshapePs.shape_size * sizeof(I32)); + curPs.reshape_spec = reshapePs; + return curPs; + } + + ParameterSpec adapt_Resize() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ResizeParamSpec resizePs; + initialization_zero(&resizePs, sizeof(resizePs)); + std::string mode = get_node_str_attribute_by_name(node, "mode", "linear"); + str_copy(resizePs.mode, mode.c_str(), mode.length()); + resizePs.num_scales = 0; + resizePs.num_sizes = 0; + std::string scalesIndex = ""; + std::string sizesIndex = ""; + if (node.op_type() == "Resize") { + scalesIndex = node.input(2); + if (node.input_size() == 4) { + sizesIndex = node.input(3); + } + } else if (node.op_type() == "Upsample") { + scalesIndex = node.input(1); + } else { + UNI_ERROR_LOG("unsupported resize op name %s\n", node.op_type().c_str()); + } + if (scalesIndex != "") { + const onnx::TensorProto &scales = weights[scalesIndex]; + if (scales.dims(0) > 0) { + CHECK_REQUIREMENT(scales.dims(0) == 4); + resizePs.num_scales = scales.dims(0); + F32 *scalesPtr = nullptr; + if (scales.has_raw_data()) { + const 
std::string &rawData = scales.raw_data(); + scalesPtr = (F32 *)(rawData.data()); + } else if (scales.data_type() == 1) { + scalesPtr = (F32 *)(scales.float_data().data()); + } else { + UNI_ERROR_LOG("Resize extract scales failed\n"); + } + memcpy(resizePs.scales, scalesPtr, resizePs.num_scales * bytesOf(DT_F32)); + } + } + if (sizesIndex != "") { + const onnx::TensorProto &sizes = weights[sizesIndex]; + if (sizes.dims(0) > 0) { + CHECK_REQUIREMENT(sizes.dims(0) == 4); + if (sizes.has_raw_data()) { + const std::string &rawData = sizes.raw_data(); + I64 *sizesPtr = (I64 *)(rawData.data()); + resizePs.sizes[0] = sizesPtr[2]; + resizePs.sizes[1] = sizesPtr[3]; + } else if (sizes.data_type() == 1) { + resizePs.sizes[0] = sizes.int64_data(2); + resizePs.sizes[1] = sizes.int64_data(3); + } else { + UNI_ERROR_LOG("Resize extract sizes failed\n"); + } + resizePs.num_sizes = 2; + } + } + curPs.resize_spec = resizePs; + return curPs; + } + + ParameterSpec adapt_Transpose() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + TransposeParamSpec transposePs; + initialization_zero(&transposePs, sizeof(transposePs)); + std::vector transpose_info = get_node_vector_ints_attribute_by_name(node, "perm"); + transposePs.trans_size = transpose_info.size(); + memcpy(transposePs.trans_dims, transpose_info.data(), transposePs.trans_size * sizeof(U32)); + curPs.transpose_spec = transposePs; + return curPs; + } + + ParameterSpec adapt_Clip() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ClipParamSpec clipParam; + initialization_zero(&clipParam, sizeof(clipParam)); + if (op == "Max") { + clipParam.min = 0; + clipParam.max = UNI_F16_MAX; + } else if (op == "Min") { + clipParam.min = -UNI_F16_MAX; + clipParam.max = 1; + } else { // op == "Clip" + if (node.input_size() == 1) { + clipParam.min = get_node_float_attribute_by_name(node, "min", -UNI_F16_MAX); + clipParam.max = get_node_float_attribute_by_name(node, "max", UNI_F16_MAX); + } else { + auto minTp = weights[node.input(1)]; + auto maxTp = weights[node.input(2)]; + clipParam.min = getSinFloat_from_tensorProto(minTp); + clipParam.max = getSinFloat_from_tensorProto(maxTp); + } + } + curPs.clip_spec = clipParam; + return curPs; + } + + ParameterSpec adapt_Conv() override + { + weightOpIndexLists.push_back(nodeIndex); + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ConvolutionParamSpec cps; + initialization_zero(&cps, sizeof(cps)); + std::vector kernelShape = get_node_vector_ints_attribute_by_name(node, "kernel_shape"); + std::vector dilations = get_node_vector_ints_attribute_by_name(node, "dilations"); + std::vector strides = get_node_vector_ints_attribute_by_name(node, "strides"); + std::vector pads = get_node_vector_ints_attribute_by_name(node, "pads"); + int group = get_node_single_int_attribute_by_name(node, "group", 1); + + const onnx::TensorProto &weight = weights[node.input(1)]; + cps.num_outputs = weight.dims(0); + cps.num_outputs_origin = cps.num_outputs; + cps.kernel_t = 1; + cps.stride_t = 1; + cps.padding_before = 0; + cps.padding_after = 0; + cps.dilatedRate_t = 1; + if (kernelShape.size() == 2) { + cps.kernel_h = kernelShape[0]; + cps.kernel_w = kernelShape[1]; + } else if (kernelShape.size() == 1) { + cps.kernel_h = kernelShape[0]; + cps.kernel_w = 1; + } else { + UNI_ERROR_LOG("convolution: kernel_size unknown\n"); + } + + if (dilations.size() == 2) { + cps.dilatedRate_h = dilations[0]; + cps.dilatedRate_w = dilations[1]; + } else if (dilations.size() == 1) { + 
cps.dilatedRate_h = dilations[0]; + cps.dilatedRate_w = 1; + } else { + UNI_WARNING_LOG("convolution: dilation unknown. Default to 1\n"); + cps.dilatedRate_h = 1; + cps.dilatedRate_w = 1; + } + + if (strides.size() == 2) { + cps.stride_h = strides[0]; + cps.stride_w = strides[1]; + } else if (strides.size() == 1) { + cps.stride_h = strides[0]; + cps.stride_w = 1; + } else { + UNI_ERROR_LOG("convolution: stride unknown\n"); + } + + if (pads.size() == 4) { + if (cps.kernel_h == cps.kernel_w && (pads[0] != pads[2] || pads[1] != pads[3])) { + cps.padding_top = UNI_MAX(pads[0], pads[2]); + cps.padding_bottom = UNI_MAX(pads[0], pads[2]); + cps.padding_left = UNI_MAX(pads[1], pads[3]); + cps.padding_right = UNI_MAX(pads[1], pads[3]); + } else { + cps.padding_top = pads[0]; + cps.padding_left = pads[1]; + cps.padding_bottom = pads[2]; + cps.padding_right = pads[3]; + } + } else if (pads.size() == 2) { + cps.padding_top = pads[0]; + cps.padding_bottom = pads[1]; + cps.padding_left = 0; + cps.padding_right = 0; + } else { + UNI_ERROR_LOG("deconvolution: pad unknown\n"); + } + + cps.group = group; + if (cps.group != 1 && cps.group == cps.num_outputs) { + cps.convolution_type = Convolution_Depthwise; + } else { + if (cps.dilatedRate_h > 1 || cps.dilatedRate_w > 1) { + cps.convolution_type = Convolution_Dilation; + } else { + cps.convolution_type = Convolution_Pointwise; + } + } + + cps.dw_activation_type = ACTIVATION_NULL; + cps.pw_activation_type = ACTIVATION_NULL; + curPs.conv_spec = cps; + return curPs; + } + + ParameterSpec adapt_Deconvolution() override + { + weightOpIndexLists.push_back(nodeIndex); + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ConvolutionParamSpec cps; + initialization_zero(&cps, sizeof(cps)); + std::vector kernelShape = get_node_vector_ints_attribute_by_name(node, "kernel_shape"); + std::vector dilations = get_node_vector_ints_attribute_by_name(node, "dilations"); + std::vector strides = get_node_vector_ints_attribute_by_name(node, "strides"); + std::vector pads = get_node_vector_ints_attribute_by_name(node, "pads"); + int group = get_node_single_int_attribute_by_name(node, "group", 1); + + const onnx::TensorProto &weight = weights[node.input(1)]; + cps.num_outputs = weight.dims(1); + cps.kernel_t = 1; + cps.stride_t = 1; + cps.padding_before = 0; + cps.padding_after = 0; + cps.dilatedRate_t = 1; + if (kernelShape.size() == 2) { + cps.kernel_h = kernelShape[0]; + cps.kernel_w = kernelShape[1]; + } else if (kernelShape.size() == 1) { + cps.kernel_h = kernelShape[0]; + cps.kernel_w = 1; + } else { + UNI_ERROR_LOG("deconvolution: kernel_size unknown\n"); + } + + if (dilations.size() == 2) { + cps.dilatedRate_h = dilations[0]; + cps.dilatedRate_w = dilations[1]; + } else if (dilations.size() == 1) { + cps.dilatedRate_h = dilations[0]; + cps.dilatedRate_w = 1; + } else { + UNI_ERROR_LOG("deconvolution: dilation unknown\n"); + } + + if (strides.size() == 2) { + cps.stride_h = strides[0]; + cps.stride_w = strides[1]; + } else if (strides.size() == 1) { + cps.stride_h = strides[0]; + cps.stride_w = 1; + } else { + UNI_ERROR_LOG("deconvolution: stride unknown\n"); + } + + if (pads.size() == 4) { + cps.padding_top = pads[0]; + cps.padding_left = pads[1]; + cps.padding_bottom = pads[2]; + cps.padding_right = pads[3]; + } else if (pads.size() == 2) { + cps.padding_top = pads[0]; + cps.padding_bottom = pads[1]; + cps.padding_left = 0; + cps.padding_right = 0; + } else { + UNI_ERROR_LOG("deconvolution: pad unknown\n"); + } + + cps.group = group; + if (1 == group) { + 
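+            // ONNX ConvTranspose weights are shaped (C_in, C_out/group, kH, kW),
+            // which is why num_outputs was taken from dims(1) above; the grouped
+            // (depthwise) branch below overrides it with dims(0).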
cps.convolution_type = Convolution_Deconvolution; + } else { + cps.convolution_type = Convolution_Depthwise_Deconvolution; + cps.num_outputs = weight.dims(0); + } + cps.num_outputs_origin = cps.num_outputs; + cps.dw_activation_type = ACTIVATION_NULL; + cps.pw_activation_type = ACTIVATION_NULL; + curPs.conv_spec = cps; + return curPs; + } + + ParameterSpec adapt_Pooling() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + PoolingParamSpec pps; + initialization_zero(&pps, sizeof(pps)); + std::string autoPad = get_node_str_attribute_by_name(node, "auto_pad"); // deprecated + std::vector kernelShape = get_node_vector_ints_attribute_by_name(node, "kernel_shape"); + std::vector strides = get_node_vector_ints_attribute_by_name(node, "strides"); + std::vector pads = get_node_vector_ints_attribute_by_name(node, "pads"); + + pps.kernel_t = 1; + pps.stride_t = 1; + pps.padding_before = 0; + pps.padding_after = 0; + if (op == "AveragePool" || op == "ReduceMean" || op == "GlobalAveragePool") { + pps.mode = POOLING_MEAN; + } else { + pps.mode = POOLING_MAX; + } + + if (autoPad == "SAME_UPPER") { + pps.rm = CEIL; + } else { + pps.rm = FLOOR; + } + + if (kernelShape.size() == 2) { + pps.kernel_h = kernelShape[0]; + pps.kernel_w = kernelShape[1]; + } else { + pps.kernel_h = 0; + pps.kernel_w = 0; + UNI_INFO_LOG("pooling: kernel_size unknown. This could be global pooling.\n"); + } + + if (strides.size() == 2) { + pps.stride_h = strides[0]; + pps.stride_w = strides[1]; + } else { + pps.stride_h = 1; + pps.stride_w = 1; + UNI_INFO_LOG("pooling: stride unknown. This could be global pooling.\n"); + } + + if (pads.size() == 4) { + pps.padding_top = pads[0]; + pps.padding_bottom = pads[2]; + pps.padding_left = pads[1]; + pps.padding_right = pads[3]; + } else { + pps.padding_top = 0; + pps.padding_bottom = 0; + pps.padding_left = 0; + pps.padding_right = 0; + } + curPs.pooling_spec = pps; + return curPs; + } + + ParameterSpec adapt_MatMul() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + MatMulParamSpec matmulPs; + initialization_zero(&matmulPs, sizeof(matmulPs)); + matmulPs.transpose_a = false; + matmulPs.transpose_b = false; + curPs.matmul_spec = matmulPs; + return curPs; + } + + ParameterSpec adapt_Fc() override + { + weightOpIndexLists.push_back(nodeIndex); + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + FullyConnectedParamSpec fcParamSpec; + initialization_zero(&fcParamSpec, sizeof(fcParamSpec)); + fcParamSpec.num_outputs = -1; + + if (op == "MatMul") { + const onnx::TensorProto &matmulTp = weights[node.input(1)]; + if (matmulTp.dims_size() == 2) { + fcParamSpec.num_outputs = matmulTp.dims(1); + } else { + UNI_ERROR_LOG("onnx model adaptor not support matmul\n"); + } + } else { + float alpha = get_node_float_attribute_by_name(node, "alpha", 1.f); + float beta = get_node_float_attribute_by_name(node, "beta", 1.f); + int transA = get_node_single_int_attribute_by_name(node, "transA", 0); + int transB = get_node_single_int_attribute_by_name(node, "transB", 0); + if (alpha == 1.f && beta == 1.f) { + if (transA == 0 && transB == 1) { + const onnx::TensorProto &C = weights[node.input(2)]; + int num_output = get_data_size_from_tensor_proto(C); + fcParamSpec.num_outputs = num_output; + } + } else { + UNI_ERROR_LOG("onnx model adaptor fully connect layer num_output is unkown\n"); + } + } + fcParamSpec.num_slices = 1; + fcParamSpec.slice_point[0] = fcParamSpec.num_outputs; + curPs.fc_spec = fcParamSpec; + return curPs; + } + + 
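+    // Note: each ONNX BatchNormalization node becomes two ops at adapt time: a
+    // BatchNorm op that keeps (mean, variance) and a trailing Scale op that
+    // keeps (gamma, beta), so y = gamma * (x - mean) / sqrt(variance + eps) + beta
+    // is computed as Scale(BatchNorm(x)). adapt_weights() fills the two weight
+    // specs in the same order.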
ParameterSpec adapt_BatchNorm() override + { + weightOpIndexLists.push_back(nodeIndex); + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + BatchNormParamSpec bnPs; + initialization_zero(&bnPs, sizeof(bnPs)); + bnPs.eps = get_node_float_attribute_by_name(node, "epsilon", 1e-5f); + bnPs.axis = 1; + bnPs.gama = 1; + bnPs.momentum = get_node_float_attribute_by_name(node, "momentum", 0.9); + curPs.bn_spec = bnPs; + return curPs; + } + + ParameterSpec adapt_Eltwise() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + EltwiseParamSpec eps; + initialization_zero(&eps, sizeof(eps)); + if (op == "Add") { + eps.elt_mode = ELTWISE_SUM; + eps.elt_sum_spec.coeff_size = 2; + for (I32 j = 0; j < eps.elt_sum_spec.coeff_size; j++) { + eps.elt_sum_spec.coeff_values[j] = 1.0; + } + } else if (op == "Mul") { + eps.elt_mode = ELTWISE_PROD; + } else if (op == "Sub") { + eps.elt_mode = ELTWISE_SUB; + } else if (op == "Div") { + eps.elt_mode = ELTWISE_DIV; + } else { + CHECK_STATUS(NOT_IMPLEMENTED); + } + eps.activation_type = ACTIVATION_NULL; + curPs.eltwise_spec = eps; + return curPs; + } + + void handle_Constant() + { + for (int i = 0; i < node.attribute_size(); i++) { + const onnx::AttributeProto &attribute = node.attribute(i); + if (attribute.name() == "value") { + CHECK_REQUIREMENT(4 == attribute.type()); + const onnx::TensorProto &tp = attribute.t(); + weights[node.output(0)] = tp; + break; + } + } + } + + ParameterSpec adapt_Pad() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + PadParamSpec padPs; + initialization_zero(&padPs, sizeof(padPs)); + std::string padModeStr = get_node_str_attribute_by_name(node, "mode"); + std::vector padVec = get_node_vector_ints_attribute_by_name(node, "pads"); + F32 padValue = get_node_float_attribute_by_name(node, "value", 0.f); + if (padModeStr == "constant") { + padPs.pad_mode = Pad_Constant; + } else if (padModeStr == "edge") { + padPs.pad_mode = Pad_Edge; + } else if (padModeStr == "reflect") { + padPs.pad_mode = Pad_Reflect; + } else { + UNI_ERROR_LOG("unknown pad mode: %s\n", padModeStr.c_str()); + } + + padPs.before = 0; + padPs.after = 0; + U32 padSize = padVec.size(); + if (padSize == 8) { // NCHW + padPs.top = padVec[2]; + padPs.left = padVec[3]; + padPs.bottom = padVec[6]; + padPs.right = padVec[7]; + } else if (padSize == 6) { // NCH + padPs.top = padVec[2]; + padPs.left = 0; + padPs.bottom = padVec[5]; + padPs.right = 0; + } else if (padSize == 4) { // HW + padPs.top = padVec[0]; + padPs.left = padVec[1]; + padPs.bottom = padVec[2]; + padPs.right = padVec[3]; + } else { + UNI_ERROR_LOG("unsupported pad length\n"); + } + padPs.constant_value = padValue; + curPs.pad_spec = padPs; + return curPs; + } + + ParameterSpec adapt_Gather() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + GatherParamSpec gps; + initialization_zero(&gps, sizeof(gps)); + gps.gather_axis = get_node_single_int_attribute_by_name(node, "axis", 0); + curPs.gather_spec = gps; + return curPs; + } + + ParameterSpec adapt_Slice() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + SliceParamSpec slice_ps; + initialization_zero(&slice_ps, sizeof(slice_ps)); + if (op == "Gather") { + ParameterSpec gather_ps = adapt_Gather(); + slice_ps.slice_points[0] = 1; + slice_ps.slice_size = 1; + slice_ps.axis = gather_ps.gather_spec.gather_axis; + } else if (op == "Slice") { + std::vector startsInfo = get_node_vector_ints_attribute_by_name(node, "starts"); + 
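+            // Only the simplest form is handled here: a slice that starts at 0
+            // on a single axis, mapped to slice_points[0] = ends[0] along axes[0].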
CHECK_REQUIREMENT(0 == startsInfo[0]); // Support this only case + std::vector endsInfo = get_node_vector_ints_attribute_by_name(node, "ends"); + std::vector axesInfo = get_node_vector_ints_attribute_by_name(node, "axes"); + slice_ps.slice_points[0] = endsInfo[0]; + slice_ps.slice_size = 1; + slice_ps.axis = axesInfo[0]; + } else if (op == "Split") { + std::vector splitInfo = get_node_vector_ints_attribute_by_name(node, "split"); + slice_ps.axis = get_node_single_int_attribute_by_name(node, "axis", 0); + if (0 == splitInfo.size()) { // Split equally by default. Set all slice_points to 0 + slice_ps.slice_size = (int)node.output_size(); + memset(slice_ps.slice_points, 0, slice_ps.slice_size * sizeof(I32)); + } else { + slice_ps.slice_size = splitInfo.size(); + slice_ps.slice_points[0] = splitInfo[0]; + for (U32 i = 1; i < slice_ps.slice_size; i++) { + slice_ps.slice_points[i] = slice_ps.slice_points[i - 1] + splitInfo[i]; + } + } + } + curPs.slice_spec = slice_ps; + return curPs; + } + + ParameterSpec adapt_Embedding() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + EmbedParamSpec embed_ps; + initialization_zero(&embed_ps, sizeof(embed_ps)); + std::string embed_weight_name = node.input(0); + auto tensor_proto = weights[embed_weight_name]; + int size_of_dims = tensor_proto.dims_size(); + if (size_of_dims != 2) { + UNI_ERROR_LOG("unsupported onnx embedding parameter\n"); + } + embed_ps.input_dim = tensor_proto.dims(0); + embed_ps.num_output = tensor_proto.dims(1); + embed_ps.bias_term = false; + embed_ps.transpose = false; + curPs.embed_spec = embed_ps; + return curPs; + } + + ParameterSpec adapt_Squeeze() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + SqueezeParamSpec squeezePs; + initialization_zero(&squeezePs, sizeof(squeezePs)); + std::vector squeezeAxes = get_node_vector_ints_attribute_by_name(node, "axes"); + squeezePs.axes_num = squeezeAxes.size(); + for (int squeeze_i = 0; squeeze_i < (int)squeezeAxes.size(); squeeze_i++) { + squeezePs.axes[squeeze_i] = squeezeAxes[squeeze_i]; + } + curPs.squeeze_spec = squeezePs; + return curPs; + } + + ParameterSpec adapt_Unsqueeze() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + UnsqueezeParamSpec unsqueezePs; + initialization_zero(&unsqueezePs, sizeof(unsqueezePs)); + std::vector unsqueezeAxes = get_node_vector_ints_attribute_by_name(node, "axes"); + unsqueezePs.axes_num = unsqueezeAxes.size(); + for (int unsqueeze_i = 0; unsqueeze_i < (int)unsqueezeAxes.size(); unsqueeze_i++) { + unsqueezePs.axes[unsqueeze_i] = unsqueezeAxes[unsqueeze_i]; + } + curPs.unsqueeze_spec = unsqueezePs; + return curPs; + } + + ParameterSpec adapt_Cast() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + CastParamSpec castPs; + initialization_zero(&castPs, sizeof(castPs)); + int cast_to = get_node_single_int_attribute_by_name(node, "to", 0); + if (cast_to == 1) { + castPs.castPrecision = ToFloat; + } else if (cast_to == 5 || cast_to == 6 || cast_to == 7) { + castPs.castPrecision = ToInt; + } else { + castPs.castPrecision = KeepPrecision; + } + curPs.cast_spec = castPs; + return curPs; + } + + ParameterSpec adapt_Concat() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ConcatParamSpec concatPs; + initialization_zero(&concatPs, sizeof(concatPs)); + concatPs.axis = get_node_single_int_attribute_by_name(node, "axis", 1); + curPs.concat_spec = concatPs; + return curPs; + } + + ParameterSpec 
adapt_Softmax() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + SoftmaxParamSpec softmaxPs; + initialization_zero(&softmaxPs, sizeof(softmaxPs)); + softmaxPs.axis = get_node_single_int_attribute_by_name(node, "axis", 1); + curPs.softmax_spec = softmaxPs; + return curPs; + } + + ParameterSpec adapt_Relu() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ReLUParamSpec reluPs; + initialization_zero(&reluPs, sizeof(reluPs)); + reluPs.neg_slope = get_node_float_attribute_by_name(node, "alpha", 0.0); + curPs.relu_spec = reluPs; + return curPs; + } + + ParameterSpec adapt_RNN() override + { + weightOpIndexLists.push_back(nodeIndex); + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + RNNParamSpec rnnPs; + initialization_zero(&rnnPs, sizeof(rnnPs)); + if (this->op == "RNN") { + rnnPs.mode = RNN_RNN; + } else if (this->op == "LSTM") { + rnnPs.mode = RNN_LSTM; + } else if (this->op == "GRU") { + int linear_before_reset = + get_node_single_int_attribute_by_name(node, "linear_before_reset", 0); + if (linear_before_reset == 0) { + rnnPs.mode = RNN_GRU; + } else { + rnnPs.mode = RNN_GRU_LBR; + } + } else { + UNI_ERROR_LOG("not support %s currently\n", this->op.c_str()); + } + rnnPs.numOutput = get_node_single_int_attribute_by_name(node, "hidden_size", 1); + rnnPs.biDirection = + get_node_str_attribute_by_name(node, "direction", "forward") == "bidirectional" ? true + : false; + rnnPs.steps = 0; + rnnPs.numProjection = 0; + rnnPs.zoneoutCell = 0; + rnnPs.zoneoutOutput = 0; + rnnPs.forgetBias = 1.0; + rnnPs.activationMode = ACTIVATION_TANH; + curPs.rnn_spec = rnnPs; + return curPs; + } + + ParameterSpec adapt_Power() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + PowerParamSpec powerPs; + initialization_zero(&powerPs, sizeof(powerPs)); + powerPs.scale = 1; + powerPs.shift = 0; + if (this->op == "Pow") { + auto tp = weights[node.input(1)]; + powerPs.power = getSinFloat_from_tensorProto(tp); + } else if (this->op == "Sqrt") { + powerPs.power = 0.5; + } else { + UNI_ERROR_LOG("onnx model read failed in adapt_Power for %s\n", this->op.c_str()); + } + curPs.power_spec = powerPs; + return curPs; + } + + ParameterSpec adapt_Scale() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ScaleParamSpec scale_ps; + initialization_zero(&scale_ps, sizeof(scale_ps)); + scale_ps.axis = 1; + curPs.scale_spec = scale_ps; + return curPs; + } + + ParameterSpec adapt_Space2Depth() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + Space2DepthParamSpec s2dPs; + initialization_zero(&s2dPs, sizeof(s2dPs)); + s2dPs.blockSize = get_node_single_int_attribute_by_name(node, "blocksize", 1); + curPs.space2depth_spec = s2dPs; + return curPs; + } + + ParameterSpec adapt_Depth2Space() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + Depth2SpaceParamSpec d2sPs; + initialization_zero(&d2sPs, sizeof(d2sPs)); + d2sPs.blockSize = get_node_single_int_attribute_by_name(node, "blocksize", 1); + std::string d2s_mode = get_node_str_attribute_by_name(node, "mode", "DCR"); + str_copy(d2sPs.reMode, d2s_mode.c_str(), d2s_mode.length(), 8); + curPs.depth2space_spec = d2sPs; + return curPs; + } + + ParameterSpec adapt_Reduction() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ReductionParamSpec rsPs; + initialization_zero(&rsPs, sizeof(rsPs)); + std::vector axesInfo = + 
get_node_vector_ints_attribute_by_name(node, "axes");  // default one element
+        int keepdimsInfo = get_node_single_int_attribute_by_name(node, "keepdims", 0);
+        rsPs.axes[0] = axesInfo[0];
+        rsPs.axes_num = 1;
+        rsPs.keep_dim = (keepdimsInfo != 0);
+        rsPs.coeff = 1.0;
+        if (op == "ReduceSum") {
+            rsPs.reduction_mode = REDUCTION_SUM;
+        } else {
+            rsPs.reduction_mode = REDUCTION_MEAN;
+        }
+        curPs.reduction_spec = rsPs;
+        return curPs;
+    }
+
+    ParameterSpec adapt_ArgMax() override
+    {
+        ParameterSpec curPs;
+        initialization_zero(&curPs, sizeof(curPs));
+        ArgMaxParamSpec amPs;
+        initialization_zero(&amPs, sizeof(amPs));
+        amPs.axis = get_node_single_int_attribute_by_name(node, "axis", -1);
+        curPs.argmax_spec = amPs;
+        return curPs;
+    }
+
+    ParameterSpec adapt_PRelu() override
+    {
+        weightOpIndexLists.push_back(nodeIndex);
+        ParameterSpec curPs;
+        initialization_zero(&curPs, sizeof(curPs));
+        return curPs;
+    }
+
+    ParameterSpec adapt_Tile() override
+    {
+        ParameterSpec curPs;
+        initialization_zero(&curPs, sizeof(curPs));
+        TileParamSpec tilePs;
+        initialization_zero(&tilePs, sizeof(tilePs));
+        std::vector<int> tileInfo = get_reshapeInfo_from_tensorProto(weights[node.input(1)]);
+        if (tileInfo.size() > 0 && tileInfo.size() <= 8) {
+            tilePs.dimsSize = tileInfo.size();
+        } else {
+            UNI_ERROR_LOG("tile with %d dimensions is not supported currently\n",
+                (int)tileInfo.size());
+        }
+        for (U32 i = 0; i < tileInfo.size(); i++) {
+            tilePs.repeatsInfo[i] = tileInfo[i];
+        }
+        curPs.tile_spec = tilePs;
+        return curPs;
+    }
+
+    ParameterSpec adapt_Splice() override
+    {
+        weightOpIndexLists.push_back(nodeIndex);
+        ParameterSpec curPs;
+        initialization_zero(&curPs, sizeof(curPs));
+        SpliceParamSpec splicePs;
+        initialization_zero(&splicePs, sizeof(splicePs));
+        splicePs.outputDim = get_node_single_int_attribute_by_name(node, "output_dim", 600);
+        std::vector<int> indices = get_node_vector_ints_attribute_by_name(node, "forward_indexes");
+        splicePs.numIndices = indices.size();
+        curPs.splice_spec = splicePs;
+        return curPs;
+    }
+
+private:
+    std::string op;
+    std::string modelName;
+    int removePreprocessOpNum;
+    TensorDesc inputDesc;
+    onnx::ModelProto onnxModel;
+    onnx::GraphProto onnxGraph;
+    onnx::NodeProto node;
+    std::map<std::string, onnx::TensorProto> weights;
+    int nodeIndex;
+    std::vector<int> weightOpIndexLists;
+    int opFinalInputNum;
+};
+#endif
diff --git a/model-tools/src/onnx/onnx_wrapper.cpp b/model_tools/src/onnx/onnx_wrapper.cpp
similarity index 81%
rename from model-tools/src/onnx/onnx_wrapper.cpp
rename to model_tools/src/onnx/onnx_wrapper.cpp
index f2174d9f..9c09f665 100644
--- a/model-tools/src/onnx/onnx_wrapper.cpp
+++ b/model_tools/src/onnx/onnx_wrapper.cpp
@@ -1,24 +1,24 @@
 // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), 
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
 #include <string>
 #include "converter.h"
 #include "model_tools.h"
 #include "onnx_adaptee.h"
 
-EE onnx_converter(std::string dir, std::string mfn, int removePreprocessOpNum, ModelSpec* ms) {
-    ModelAdaptee* ade = new OnnxAdaptee(removePreprocessOpNum);
+EE onnx_converter(std::string dir, std::string mfn, int removePreprocessOpNum, ModelSpec *ms)
+{
+    ModelAdaptee *ade = new OnnxAdaptee(removePreprocessOpNum);
     EE ret = ade->adapt(dir, mfn, ms);
     delete ade;
     return ret;
diff --git a/model_tools/src/tensorflow/CMakeLists.txt b/model_tools/src/tensorflow/CMakeLists.txt
new file mode 100644
index 00000000..c5613b03
--- /dev/null
+++ b/model_tools/src/tensorflow/CMakeLists.txt
@@ -0,0 +1,24 @@
+file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
+file(GLOB commonsrcs ${CMAKE_CURRENT_SOURCE_DIR}/../model_*.cpp)
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+include_directories(${JSONCPP_INCLUDE_DIR})
+
+include_directories(../)
+
+# shared library
+add_library(${PROJECT_NAME}_tensorflow SHARED ${srcs} ${commonsrcs})
+if (USE_IOS_CLANG)
+    target_link_libraries(${PROJECT_NAME}_tensorflow LINK_PUBLIC ${JSONCPP_LIBRARY} uni)
+endif (USE_IOS_CLANG)
+
+# static library
+add_library(${PROJECT_NAME}_tensorflow_static STATIC ${srcs})
+
+set_target_properties(${PROJECT_NAME}_tensorflow_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}_tensorflow")
+set_target_properties(${PROJECT_NAME}_tensorflow PROPERTIES CLEAN_DIRECT_OUTPUT 1)
+set_target_properties(${PROJECT_NAME}_tensorflow_static PROPERTIES CLEAN_DIRECT_OUTPUT 1)
+install(TARGETS ${PROJECT_NAME}_tensorflow ${PROJECT_NAME}_tensorflow_static
+        LIBRARY DESTINATION lib
+        ARCHIVE DESTINATION lib)
diff --git a/model_tools/src/tensorflow/tensorflow_adaptee.h b/model_tools/src/tensorflow/tensorflow_adaptee.h
new file mode 100644
index 00000000..db579438
--- 
/dev/null +++ b/model_tools/src/tensorflow/tensorflow_adaptee.h @@ -0,0 +1,977 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_TENSORFLOWADAPTEE +#define _H_TENSORFLOWADAPTEE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "converter.h" +#include "model_tools.h" +#include "model_adaptee.h" +#include "ut_util.h" + +class TensorflowAdaptee : public ModelAdaptee { +public: + TensorflowAdaptee() + { + this->modelInputLayerNum = 0; + this->entityOpCount = 0; + this->weightOpNum = 0; + this->curInDegree = 0; + this->curNodeIndex = 0; + } + + ~TensorflowAdaptee() + {} + +protected: + std::string cleanRedundantC(std::string str) + { + std::string result = ""; + for (int i = 1; i < (int)(str.length() - 1); i++) { + if (str.at(i) == '\\') { + continue; + } else { + result += str.at(i); + } + } + return result; + } + + OperatorType convert_tensorflow_type(std::string tfType) + { + if (tfType.compare("Mul") == 0 || tfType.compare("Sub") == 0 || + tfType.compare("Add") == 0 || tfType.compare("RealDiv") == 0) { + if (curInDegree == 1) { + return OT_Power; + } else { + return OT_Eltwise; + } + } else if (tfType.compare("FusedBatchNorm") == 0) { + return OT_BatchNorm; + } else if (tfType.compare("Relu6") == 0) { + return OT_Relu6; + } else if (tfType.compare("DepthwiseConv2dNative") == 0) { + return OT_Conv; + } else if (tfType.compare("MaxPool") == 0) { + return OT_Pooling; + } else if (tfType.compare("ConcatV2") == 0) { + return OT_Concat; + } else if (tfType.compare("Relu") == 0) { + return OT_Relu; + } else if (tfType.compare("ResizeBilinear") == 0) { + return OT_Resize; + } else if (tfType.compare("ArgMax") == 0) { + return OT_ArgMax; + } else if (tfType.compare("ExpandDims") == 0) { + return OT_Unsqueeze; + } else if (tfType.compare("Pad") == 0 || tfType.compare("PadV2") == 0) { + return OT_Pad; + } else if (tfType.compare("Transpose") == 0) { + return OT_Transpose; + } else if (tfType.compare("BiasAdd") == 0) { + return OT_FC; + } else if (tfType.compare("Conv2DBackpropInput") == 0 || tfType.compare("Conv2D") == 0) { + return OT_Conv; + } else if (tfType.compare("Cast") == 0) { + return OT_Cast; + } else if (tfType.compare("Reshape") == 0) { + return OT_Reshape; + } else if (tfType.compare("Rsqrt") == 0) { + return OT_Power; + } else if (tfType.compare("Squeeze") == 0) { + return OT_Squeeze; + } else if (tfType.compare("Sigmoid") == 0) { + return OT_Sigmoid; + } 
else if (tfType.compare("MatMul") == 0) { + if (this->curInDegree == 1) { + return OT_FC; + } else { + return OT_MatMul; + } + } else if (tfType.compare("Softmax") == 0) { + return OT_Softmax; + } else if (tfType.compare("AvgPool") == 0) { + return OT_Pooling; + } else if (tfType.compare("Mean") == 0) { + return OT_Reduction; + } else if (tfType.compare("Shape") == 0) { + return OT_Shape; + } else { + UNI_ERROR_LOG("tensorflow op %s is not supported yet\n", tfType.c_str()); + return OT_None; + } + } + + EE parse_file(std::string dir, std::string mfn) override + { + EE ret; + std::string tfSuffix = ".json"; + this->modelName = mfn; + std::string modelAbsPath = dir + "/" + mfn + tfSuffix; + std::string::size_type idx; + std::ifstream inFile; + inFile.open(modelAbsPath); + std::stringstream strStream; + strStream << inFile.rdbuf(); + std::string strValueFromPy = strStream.str(); + std::string strValue = cleanRedundantC(strValueFromPy); + std::string tailStr = strValue.substr(strValue.length() - 18, 18); + newStrValue = ""; + idx = tailStr.find("library"); + if (idx == std::string::npos) { + newStrValue = strValue; + } else { + newStrValue = strValue.substr(0, strValue.length() - 16) + "}"; + } + Json::Reader reader; + Json::Value value; + if (reader.parse(newStrValue, value)) { + this->entityOpCount = value["node"].size(); + for (int i = 0; i < (int)(value["node"].size()); i++) { + if ((value["node"][i]["op"].asString()).compare("Const") == 0) { + constId[value["node"][i]["name"].asString()] = i; + this->entityOpCount = this->entityOpCount - 1; + } else if ((value["node"][i]["op"].asString()).compare("Identity") == 0) { + idenConst[value["node"][i]["name"].asString()] = + value["node"][i]["input"][0].asString(); + this->entityOpCount = this->entityOpCount - 1; + } else if ((value["node"][i]["op"].asString()).compare("Placeholder") == 0) { + this->modelInputLayerNum = this->modelInputLayerNum + 1; + this->entityOpCount = this->entityOpCount - 1; + } + } + ret = SUCCESS; + } else { + ret = FILE_ERROR; + } + return ret; + } + + EE adapt_operators(ModelSpec *ms) override + { + EE ret = SUCCESS; + ms->dt = DT_F32; + str_copy(ms->model_name, modelName.c_str(), modelName.length()); + ms->num_inputs = this->modelInputLayerNum; + ms->input_names = (I8 **)mt_new_storage(ms->num_inputs * sizeof(I8 *)); + ms->input_dims = (TensorDesc *)mt_new_storage(sizeof(TensorDesc) * ms->num_inputs); + int traverseInputLayerIndex = 0; + + ms->num_operator_specs = this->entityOpCount; + OperatorSpec *opsPtr = + (OperatorSpec *)mt_new_storage(sizeof(OperatorSpec) * ms->num_operator_specs); + ms->ops = opsPtr; + int traverseEntityOpIndex = 0; + + std::map unmapOps; + Json::Reader reader; + Json::Value value; + if (reader.parse(newStrValue, value)) { + this->ttValue = value; + for (int i = 0; i < (int)(value["node"].size()); i++) { + std::string layerName = value["node"][i]["name"].asString(); + this->opType = value["node"][i]["op"].asString(); + if (opType.compare("Placeholder") == 0) { + ms->input_names[traverseInputLayerIndex] = + (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(ms->input_names[traverseInputLayerIndex], layerName.c_str(), + layerName.length()); + int placeholder_shape_size = + value["node"][i]["attr"]["shape"]["shape"]["dim"].size(); + std::vector placeholderDimVec; + for (int j = 0; j < placeholder_shape_size; j++) { + placeholderDimVec.push_back(std::stoi( + value["node"][i]["attr"]["shape"]["shape"]["dim"][j]["size"].asString())); + } + if (placeholder_shape_size == 0) { + UNI_ERROR_LOG("The 
input dimensions are not specific, please provide a new " + "model with confirmed input."); + } + if (placeholder_shape_size == 1) { + ms->input_dims[traverseInputLayerIndex] = + tensor1d(DT_F32, placeholderDimVec[0]); + } else if (placeholder_shape_size == 2) { + ms->input_dims[traverseInputLayerIndex] = tensor2df( + DT_F32, DF_NORMAL, placeholderDimVec[0], placeholderDimVec[1]); + } else if (placeholder_shape_size == 3) { + ms->input_dims[traverseInputLayerIndex] = tensor3df(DT_F32, DF_MTK, + placeholderDimVec[0], placeholderDimVec[1], placeholderDimVec[2]); + } else if (placeholder_shape_size == 4) { + placeholderDimVec[0] = 1; + ms->input_dims[traverseInputLayerIndex] = tensor4df(DT_F32, DF_NCHW, + placeholderDimVec[0], placeholderDimVec[3], placeholderDimVec[1], + placeholderDimVec[2]); + } else { + UNI_ERROR_LOG("NOT SUPPORT THIS INPUT CURRENTLY\n"); + } + traverseInputLayerIndex++; + } else if (opType.compare("Const") == 0) { + int tensorDimSize = + value["node"][i]["attr"]["value"]["tensor"]["tensorShape"]["dim"].size(); + std::vector tensorDims; + int tensorDimsNum = 1; + for (int j = 0; j < tensorDimSize; j++) { + tensorDims.push_back(std::stoi( + value["node"][i]["attr"]["value"]["tensor"]["tensorShape"]["dim"][j]["s" + "i" + "z" + "e"] + .asString())); + tensorDimsNum *= tensorDims[j]; + } + } else if (opType.compare("Identity") != 0) { + std::vector inList; + std::vector constList; + + this->nodeV = value["node"][i]; + ParameterSpec tmpPs; + + str_copy( + opsPtr[traverseEntityOpIndex].name, layerName.c_str(), layerName.length()); + + if (opType.compare("Conv2DBackpropInput") == 0) { + UNI_WARNING_LOG("Filter the input0_size\n"); + } else if (opType.compare("FusedBatchNorm") == 0 && + value["node"][i]["input"].size() != 5) { // To collect more special cases + constList.push_back(value["node"][i]["input"][0].asString()); + } else { + inList.push_back(value["node"][i]["input"][0].asString()); + } + for (int k = 1; k < (int)(value["node"][i]["input"].size()); k++) { + std::string curIn = value["node"][i]["input"][k].asString(); + if (idenConst.find(curIn) == idenConst.end() && + constId.find(curIn) == constId.end()) { + inList.push_back(curIn); + } else { + if (constId.find(idenConst[curIn]) == constId.end() && + constId.find(curIn) == constId.end()) { + inList.push_back(curIn); + } else { + constList.push_back(curIn); + } + } + } + + if (constList.size() > 0) { + weightConstInput[layerName] = constList; + + if (opType != "Mul" && opType != "Sub" && opType != "Add" && + opType != "RealDiv" && opType != "ConcatV2" && opType != "PadV2" && + opType != "ArgMax" && opType != "Transpose" && opType != "Pad" && + opType != "ExpandDims" && opType != "ResizeBilinear" && + opType != "Reshape" && opType != "Mean") { // TODO: expand more cases + weightIds.push_back(i); + this->weightOpNum = this->weightOpNum + 1; + } + } + + opsPtr[traverseEntityOpIndex].num_inputs = inList.size(); + opsPtr[traverseEntityOpIndex].input_tensors_name = (I8 **)mt_new_storage( + opsPtr[traverseEntityOpIndex].num_inputs * sizeof(I8 *)); + for (int k = 0; k < (int)(opsPtr[traverseEntityOpIndex].num_inputs); k++) { + opsPtr[traverseEntityOpIndex].input_tensors_name[k] = + (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(opsPtr[traverseEntityOpIndex].input_tensors_name[k], + inList[k].c_str(), inList[k].length()); + } + opsPtr[traverseEntityOpIndex].num_outputs = 1; + opsPtr[traverseEntityOpIndex].output_tensors_name = (I8 **)mt_new_storage( + opsPtr[traverseEntityOpIndex].num_outputs * sizeof(I8 *)); + for 
(int k = 0; k < (int)(opsPtr[traverseEntityOpIndex].num_outputs); k++) { + opsPtr[traverseEntityOpIndex].output_tensors_name[k] = + (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(opsPtr[traverseEntityOpIndex].output_tensors_name[k], + layerName.c_str(), layerName.length()); + } + opsPtr[traverseEntityOpIndex].tensor_positions = nullptr; + opsPtr[traverseEntityOpIndex].num_quant_feature = 0; + opsPtr[traverseEntityOpIndex].feature_scale = nullptr; + + this->curInDegree = inList.size(); + OperatorType curOpType = convert_tensorflow_type(opType); + opsPtr[traverseEntityOpIndex].type = curOpType; + CHECK_STATUS(adapt_operator(curOpType, &tmpPs)); + opsPtr[traverseEntityOpIndex].ps = tmpPs; + + traverseEntityOpIndex++; + } + } + } else { + std::cout << "reading failed" << std::endl; + } + return ret; + } + + EE adapt_weights(ModelSpec *ms) override + { + ms->num_weight_specs = weightOpNum; + WeightSpec *wsPtr = (WeightSpec *)mt_new_storage(sizeof(WeightSpec) * ms->num_weight_specs); + for (int j = 0; j < ms->num_weight_specs; j++) { + wsPtr[j].num_quant_scale = 0; + wsPtr[j].weight_scale = nullptr; + } + ms->ws = wsPtr; + Json::Reader reader; + Json::Value value; + if (reader.parse(newStrValue, value)) { + for (int j = 0; j < ms->num_weight_specs; j++) { + int curWeightIndex = weightIds[j]; + std::string weightOpType = value["node"][curWeightIndex]["op"].asString(); + std::string weightOpName = value["node"][curWeightIndex]["name"].asString(); + str_copy(wsPtr[j].op_name, weightOpName.c_str(), weightOpName.length()); + std::vector constList = weightConstInput[weightOpName]; + if (weightOpType.compare("Conv2D") == 0 || + weightOpType.compare("Conv2DBackpropInput") == 0 || + weightOpType.compare("MatMul") == 0 || + weightOpType.compare("DepthwiseConv2dNative") == 0) { // To collect more op + + if (constList.size() == 1) { + std::string curIdenStr = constList[0]; + std::string curConstStr = idenConst[curIdenStr]; + int curConstIndex = constId[curConstStr]; + if (constId.find(curIdenStr) != constId.end()) { + curConstIndex = constId[curIdenStr]; + } + int tensorContentSize = + value["node"][curConstIndex]["attr"]["value"]["tensor"]["tensorContent"] + .size(); + wsPtr[j].mdt = DT_F32; + wsPtr[j].bytes_of_weight = tensorContentSize * sizeof(float); + float *fp32Ptr = (float *)mt_new_storage(wsPtr[j].bytes_of_weight); + for (int k = 0; k < tensorContentSize; k++) { + fp32Ptr[k] = std::stof( + value["node"][curConstIndex]["attr"]["value"]["tensor"]["tensorCont" + "ent"][k] + .asString()); + } + wsPtr[j].weight = (U8 *)fp32Ptr; + wsPtr[j].bytes_of_vec = 0; + wsPtr[j].vec = nullptr; + } else { + CHECK_STATUS(NOT_IMPLEMENTED); + } + } else if (weightOpType.compare("BiasAdd") == 0) { + if (constList.size() == 1) { + std::string curIdenStr = constList[0]; + std::string curConstStr = idenConst[curIdenStr]; + int curConstIndex = constId[curConstStr]; + + int tensorContentSize = + value["node"][curConstIndex]["attr"]["value"]["tensor"]["tensorContent"] + .size(); + wsPtr[j].mdt = DT_F32; + wsPtr[j].bytes_of_weight = 0; + wsPtr[j].weight = nullptr; + wsPtr[j].bytes_of_vec = tensorContentSize * sizeof(float); + float *fp32Ptr = (float *)mt_new_storage(wsPtr[j].bytes_of_vec); + for (int k = 0; k < tensorContentSize; k++) { + fp32Ptr[k] = std::stof( + value["node"][curConstIndex]["attr"]["value"]["tensor"]["tensorCont" + "ent"][k] + .asString()); + } + wsPtr[j].vec = (U8 *)fp32Ptr; + } else { + CHECK_STATUS(NOT_IMPLEMENTED); + } + + } else if (weightOpType.compare("FusedBatchNorm") == 0) { + if 
(constList.size() == 4) { + std::string curScaleIdenStr = constList[0]; + std::string curScaleConstStr = idenConst[curScaleIdenStr]; + int curScaleConstIndex = constId[curScaleConstStr]; + if (constId.find(curScaleIdenStr) != constId.end()) { + curScaleConstIndex = constId[curScaleIdenStr]; + } + + std::string curOffsetIdenStr = constList[1]; + std::string curOffsetConstStr = idenConst[curOffsetIdenStr]; + int curOffsetConstIndex = constId[curOffsetConstStr]; + if (constId.find(curOffsetIdenStr) != constId.end()) { + curOffsetConstIndex = constId[curOffsetIdenStr]; + } + + std::string curMeanIdenStr = constList[2]; + std::string curMeanConstStr = idenConst[curMeanIdenStr]; + int curMeanConstIndex = constId[curMeanConstStr]; + if (constId.find(curMeanIdenStr) != constId.end()) { + curMeanConstIndex = constId[curMeanIdenStr]; + } + + std::string curVarianceIdenStr = constList[3]; + std::string curVarianceConstStr = idenConst[curVarianceIdenStr]; + int curVarianceConstIndex = constId[curVarianceConstStr]; + if (constId.find(curVarianceIdenStr) != constId.end()) { + curVarianceConstIndex = constId[curVarianceIdenStr]; + } + + int iterSize = + value["node"][curScaleConstIndex]["attr"]["value"]["tensor"]["tensorCon" + "tent"] + .size(); + wsPtr[j].mdt = DT_F32; + wsPtr[j].bytes_of_weight = iterSize * sizeof(float); + float *fp32FirPtr = (float *)mt_new_storage(wsPtr[j].bytes_of_weight); + wsPtr[j].weight = (U8 *)fp32FirPtr; + wsPtr[j].bytes_of_vec = iterSize * sizeof(float); + float *fp32SecPtr = (float *)mt_new_storage(wsPtr[j].bytes_of_vec); + wsPtr[j].vec = (U8 *)fp32SecPtr; + + for (int k = 0; k < iterSize; k++) { + float tmpScale = std::stof( + value["node"][curScaleConstIndex]["attr"]["value"]["tensor"]["tenso" + "rCont" + "ent"][0] + .asString()); + float tmpOffset = std::stof( + value["node"][curOffsetConstIndex]["attr"]["value"]["tensor"]["tens" + "orCo" + "nten" + "t"][0] + .asString()); + float tmpMean = std::stof( + value["node"][curMeanConstIndex]["attr"]["value"]["tensor"]["tensor" + "Conten" + "t"][0] + .asString()); + float tmpVariance = std::stof( + value["node"][curVarianceConstIndex]["attr"]["value"]["tensor"]["te" + "ns" + "or" + "Co" + "nt" + "en" + "t"][0] + .asString()); + + float tmpNewMean = + tmpMean - tmpOffset * sqrt(tmpVariance / powf(tmpScale, 2)); + float tmpNewVariance = tmpVariance / (powf(tmpScale, 2)); + fp32FirPtr[k] = tmpNewMean; + fp32SecPtr[k] = tmpNewVariance; + } + } else { + CHECK_STATUS(NOT_IMPLEMENTED); + } + } + } + } + return SUCCESS; + } + + ParameterSpec adapt_Eltwise() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + EltwiseParamSpec eps; + initialization_zero(&eps, sizeof(eps)); + if (opType == "Add") { + eps.elt_mode = ELTWISE_SUM; + eps.activation_type = ACTIVATION_NULL; + } else if (opType == "Sub") { + eps.elt_mode = ELTWISE_SUB; + eps.activation_type = ACTIVATION_NULL; + } + curPs.eltwise_spec = eps; + return curPs; + } + + ParameterSpec adapt_ArgMax() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ArgMaxParamSpec aps; + initialization_zero(&aps, sizeof(aps)); + aps.axis = 1; // TODO + curPs.argmax_spec = aps; + return curPs; + } + + ParameterSpec adapt_Conv() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ConvolutionParamSpec convPs; + initialization_zero(&convPs, sizeof(convPs)); + convPs.kernel_t = 1; + convPs.stride_t = 1; + convPs.padding_before = 0; + convPs.padding_after = 0; + convPs.dilatedRate_t = 1; + + std::string conv_op 
= nodeV["name"].asString(); + int dilationsInfo[4] = {0, 0, 0, 0}; + int stridesInfo[4] = {0, 0, 0, 0}; + if (opType.compare("DepthwiseConv2dNative") == 0) { + for (int i = 0; i < (int)(nodeV["attr"]["dilations"]["list"]["i"].size()); i++) { + dilationsInfo[i] = 1; + } + } else { + dilationsInfo[0] = 1; + dilationsInfo[1] = 1; + } + for (int i = 0; i < (int)(nodeV["attr"]["strides"]["list"]["i"].size()); i++) { + stridesInfo[i] = std::stoi( + nodeV["attr"]["strides"]["list"]["i"][i].asString()); // TODO extract real data + } + convPs.dilatedRate_h = dilationsInfo[1]; // atten + convPs.dilatedRate_w = dilationsInfo[2]; + convPs.stride_h = stridesInfo[1]; + convPs.stride_w = stridesInfo[2]; + + std::vector curConvIdens = this->weightConstInput[conv_op]; + int curConstId = -1; + if (constId.find(curConvIdens[0]) != constId.end()) { + curConstId = constId[curConvIdens[0]]; + } else { + curConstId = constId[idenConst[curConvIdens[0]]]; + } + std::string constOpName = this->ttValue["node"][curConstId]["name"].asString(); + std::vector convWeightKernels; + for (int k = 0; k < + (int)(this->ttValue["node"][curConstId]["attr"]["value"]["tensor"]["tensorShape"]["di" + "m"] + .size()); + k++) { + convWeightKernels.push_back( + std::stoi(this->ttValue["node"][curConstId]["attr"]["value"]["tensor"] + ["tensorShape"]["dim"][k]["size"] + .asString())); + } + + if (convWeightKernels.size() < 4) { + UNI_ERROR_LOG("Not support this conv"); + } + if (opType.compare("DepthwiseConv2dNative") == 0) { + convPs.num_outputs = convWeightKernels[2]; + } else { + convPs.num_outputs = convWeightKernels[3]; + } + convPs.kernel_h = convWeightKernels[0]; + convPs.kernel_w = convWeightKernels[1]; + + std::string tfPaddingMode = + nodeV["attr"]["padding"]["s"].asString(); // choose one of VALID/SAME + if (tfPaddingMode.at(0) == 'V') { + tfPaddingMode = "VALID"; + convPs.padding_top = 0; + convPs.padding_bottom = 0; + convPs.padding_left = 0; + convPs.padding_right = 0; + } else { + tfPaddingMode = "SAME"; + convPs.padding_top = (U32)INT_MAX; + convPs.padding_bottom = (U32)INT_MAX; + convPs.padding_left = (U32)INT_MAX; + convPs.padding_right = (U32)INT_MAX; + } + + convPs.group = 1; + convPs.dw_activation_type = ACTIVATION_NULL; + convPs.pw_activation_type = ACTIVATION_NULL; + + if (convPs.group != 1 && convPs.group == convPs.num_outputs) { + convPs.convolution_type = Convolution_Depthwise; + } else { + if (convPs.dilatedRate_h > 1 || convPs.dilatedRate_w > 1) { + convPs.convolution_type = Convolution_Dilation; + } else { + convPs.convolution_type = Convolution_Pointwise; + } + } + + if (opType.compare("DepthwiseConv2dNative") == 0) { + convPs.convolution_type = Convolution_Depthwise; + } + curPs.conv_spec = convPs; + return curPs; + } + + ParameterSpec adapt_BatchNorm() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + BatchNormParamSpec bps; + initialization_zero(&bps, sizeof(bps)); + bps.axis = 0; + bps.eps = nodeV["attr"]["epsilon"]["f"].asFloat(); + bps.gama = 0; + bps.momentum = 0; + curPs.bn_spec = bps; + return curPs; + } + + ParameterSpec adapt_Fc() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + FullyConnectedParamSpec fps; + initialization_zero(&fps, sizeof(fps)); + // to locate the const weight op + std::string curOpName = nodeV["name"].asString(); + std::vector curConvIdens = this->weightConstInput[curOpName]; + int curConstId = -1; + if (constId.find(curConvIdens[0]) != constId.end()) { + curConstId = constId[curConvIdens[0]]; + } else { + 
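+ // Editor's note: this two-step lookup is the adaptee's recurring pattern for
+ // resolving weights. A weight input is normally an Identity node wrapping a
+ // Const: idenConst maps the Identity's name to the Const's name, and constId
+ // maps a Const name to its node index, so the direct constId hit is tried
+ // first and the Identity indirection is only the fallback.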
curConstId = constId[idenConst[curConvIdens[0]]]; + } + int dimLengthIndex = + this->ttValue["node"][curConstId]["attr"]["value"]["tensor"]["tensorShape"].size() - 1; + fps.num_outputs = + std::stoi(this->ttValue["node"][curConstId]["attr"]["value"]["tensor"]["tensorShape"] + ["dim"][dimLengthIndex]["size"] + .asString()); // fc_dimSize is static two-dimension + fps.num_slices = 1; + curPs.fc_spec = fps; + return curPs; + } + + ParameterSpec adapt_Pooling() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + PoolingParamSpec pps; + initialization_zero(&pps, sizeof(pps)); + std::vector kernelSize; // ihwo + std::vector stridesInfo; + for (int i = 0; i < (int)(nodeV["attr"]["ksize"]["list"]["i"].size()); i++) { + kernelSize.push_back(std::stoi(nodeV["attr"]["ksize"]["list"]["i"][i].asString())); + } + for (int i = 0; i < (int)(nodeV["attr"]["strides"]["list"]["i"].size()); i++) { + stridesInfo.push_back(std::stoi(nodeV["attr"]["strides"]["list"]["i"][i].asString())); + } + pps.kernel_t = 1; + pps.kernel_h = kernelSize[1]; + pps.kernel_w = kernelSize[2]; + pps.stride_t = 1; + pps.stride_h = 1; + pps.stride_w = 1; + pps.padding_before = 0; + pps.padding_after = 0; + pps.padding_top = 0; + pps.padding_bottom = 0; + pps.padding_left = 0; + pps.padding_right = 0; + pps.rm = CEIL; + if (opType.compare("MaxPool") == 0) { + pps.mode = POOLING_MAX; + } else { // refer to "AvgPool" + pps.mode = POOLING_MEAN; + } + curPs.pooling_spec = pps; + return curPs; + } + + ParameterSpec adapt_Reduction() override + { + // Mapping to + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ReductionParamSpec reductionPs; + initialization_zero(&reductionPs, sizeof(reductionPs)); + if (opType.compare("Mean") == 0) { + reductionPs.reduction_mode = REDUCTION_MEAN; + } else { + UNI_ERROR_LOG("not support this reduction mode\n"); + } + std::string reductionOpName = nodeV["name"].asString(); + std::vector constInputs = weightConstInput[reductionOpName]; + int constReductionOpIndex = -1; + if (constId.find(constInputs[0]) != constId.end()) { + constReductionOpIndex = constId[constInputs[0]]; + } else { + constReductionOpIndex = constId[idenConst[constInputs[0]]]; + } + reductionPs.axes_num = + this->ttValue["node"][constReductionOpIndex]["attr"]["value"]["tensor"]["tensorContent"] + .size(); + for (int i = 0; i < reductionPs.axes_num; i++) { + reductionPs.axes[i] = std::stoi( + this->ttValue["node"][constReductionOpIndex]["attr"]["value"]["tensor"]["tensorCont" + "ent"][i] + .asString()); + } + curPs.reduction_spec = reductionPs; + return curPs; + } + + ParameterSpec adapt_Pad() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + PadParamSpec padPs; + initialization_zero(&padPs, sizeof(padPs)); + + std::string curOpName = nodeV["name"].asString(); + std::vector curConvIdens = this->weightConstInput[curOpName]; + int curConstId = -1; + if (constId.find(curConvIdens[0]) != constId.end()) { + curConstId = constId[curConvIdens[0]]; + } else { + curConstId = constId[idenConst[curConvIdens[0]]]; + } + + std::vector padInfos; + for (int i = 0; i < (int)(this->ttValue["node"][curConstId]["attr"]["value"]["tensor"]["ten" + "sor" + "Con" + "ten" + "t"] + .size()); + i++) { + padInfos.push_back( + std::stoi(this->ttValue["node"][curConstId]["attr"]["value"]["tensor"]["tensorConte" + "nt"][i] + .asString())); + } + padPs.before = 0; + padPs.after = 0; + padPs.top = padInfos[2]; + padPs.bottom = padInfos[3]; + padPs.left = padInfos[4]; + padPs.right = 
padInfos[5]; + padPs.constant_value = 0; // TODO: for PadV2 + padPs.pad_mode = Pad_Constant; + curPs.pad_spec = padPs; + return curPs; + } + + ParameterSpec adapt_Concat() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ConcatParamSpec concatPs; + initialization_zero(&concatPs, sizeof(concatPs)); + concatPs.axis = std::stoi(nodeV["attr"]["N"]["i"].asString()); + curPs.concat_spec = concatPs; + return curPs; + } + + ParameterSpec adapt_Resize() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ResizeParamSpec resizePs; + initialization_zero(&resizePs, sizeof(resizePs)); + + std::string curOpName = nodeV["name"].asString(); + std::vector<std::string> curConvIdens = this->weightConstInput[curOpName]; + int curConstId = -1; + if (constId.find(curConvIdens[0]) != constId.end()) { + curConstId = constId[curConvIdens[0]]; + } else { + curConstId = constId[idenConst[curConvIdens[0]]]; + } + resizePs.num_sizes = 2; + resizePs.num_scales = 0; + for (int k = 0; k < (int)(this->ttValue["node"][curConstId]["attr"]["value"]["tensor"]["tensorContent"].size()); k++) { + resizePs.sizes[k] = std::stoi(this->ttValue["node"][curConstId]["attr"]["value"]["tensor"]["tensorContent"][k].asString()); + } + + curPs.resize_spec = resizePs; + return curPs; + } + + ParameterSpec adapt_Power() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + PowerParamSpec powerPs; + initialization_zero(&powerPs, sizeof(powerPs)); + float curScale = 1.0; + float curShift = 0.0; + + if (opType.compare("Rsqrt") == 0) { + // rsqrt(x) = x^(-1/2): identity scale/shift with power -0.5 + powerPs.scale = 1; + powerPs.shift = 0; + powerPs.power = -0.5; + curPs.power_spec = powerPs; + return curPs; + } + + std::string curOpName = nodeV["name"].asString(); + std::vector<std::string> curConvIdens = this->weightConstInput[curOpName]; + int curConstId = -1; + if (constId.find(curConvIdens[0]) != constId.end()) { + curConstId = constId[curConvIdens[0]]; + } else { + curConstId = constId[idenConst[curConvIdens[0]]]; + } + + if (opType.compare("Mul") == 0) { + curScale = std::stof(this->ttValue["node"][curConstId]["attr"]["value"]["tensor"]["tensorContent"][0].asString()); + } else if (opType.compare("Sub") == 0) { + curShift = -1 * std::stof(this->ttValue["node"][curConstId]["attr"]["value"]["tensor"]["tensorContent"][0].asString()); + } else if (opType.compare("RealDiv") == 0) { + curScale = 1.0 / std::stof(this->ttValue["node"][curConstId]["attr"]["value"]["tensor"]["tensorContent"][0].asString()); + } + powerPs.scale = curScale; + powerPs.shift = curShift; + powerPs.power = 1; + curPs.power_spec = powerPs; + return curPs; + } + + ParameterSpec adapt_Transpose() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + TransposeParamSpec transPs; + initialization_zero(&transPs, sizeof(transPs)); + // extract the perm info from the const input + std::string curOpName = nodeV["name"].asString(); + std::vector<std::string> curConvIdens = this->weightConstInput[curOpName]; + int curConstId = -1; + if (constId.find(curConvIdens[0]) != constId.end()) { + curConstId = constId[curConvIdens[0]]; + } else { + curConstId = constId[idenConst[curConvIdens[0]]]; + } + + transPs.trans_size = this->ttValue["node"][curConstId]["attr"]["value"]["tensor"]["tensorContent"].size(); + for (int i = 0; i < (int)(transPs.trans_size); i++) { + transPs.trans_dims[i] = std::stoi(this->ttValue["node"][curConstId]["attr"]["value"]["tensor"]["tensorContent"][i].asString()); + } + curPs.transpose_spec = transPs; + return curPs; + } + + ParameterSpec adapt_Reshape() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ReshapeParamSpec reshapePs; + initialization_zero(&reshapePs, sizeof(reshapePs)); + + std::string curOpName = nodeV["name"].asString(); + std::vector<std::string> curConvIdens = this->weightConstInput[curOpName]; + if (curConvIdens.size() == 0) { + return curPs; + } + int curConstId = -1; + if (constId.find(curConvIdens[0]) != constId.end()) { + curConstId = constId[curConvIdens[0]]; + } else { + curConstId = constId[idenConst[curConvIdens[0]]]; + } + reshapePs.shape_size = this->ttValue["node"][curConstId]["attr"]["value"]["tensor"]["tensorContent"].size(); + for (int k = 0; k < reshapePs.shape_size; k++) { + reshapePs.shape_dims[k] = std::stoi(this->ttValue["node"][curConstId]["attr"]["value"]["tensor"]["tensorContent"][k].asString()); + } + reshapePs.axis = 8; + reshapePs.num_axes = -1; + curPs.reshape_spec = reshapePs; + return curPs; + } + + ParameterSpec adapt_Squeeze() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + SqueezeParamSpec squeezePs; + initialization_zero(&squeezePs, sizeof(squeezePs)); + std::vector<int> squeezeDimsInfo; + squeezePs.axes_num = nodeV["attr"]["squeeze_dims"]["list"]["i"].size(); + for (int i = 0; i < (int)(nodeV["attr"]["squeeze_dims"]["list"]["i"].size()); i++) { + squeezePs.axes[i] = std::stoi(nodeV["attr"]["squeeze_dims"]["list"]["i"][i].asString()); + } + curPs.squeeze_spec = squeezePs; + return curPs; + } + + ParameterSpec adapt_Unsqueeze() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + UnsqueezeParamSpec unsqueezePs; + initialization_zero(&unsqueezePs, sizeof(unsqueezePs)); + std::string unsqueeze_op = nodeV["name"].asString(); + int expandDimIndex = constId[idenConst[weightConstInput[unsqueeze_op][0]]]; + unsqueezePs.axes_num = this->ttValue["node"][expandDimIndex]["attr"]["value"]["tensor"]["tensorContent"].size(); + for (int k = 0; k < unsqueezePs.axes_num; k++) { + unsqueezePs.axes[k] = std::stoi(this->ttValue["node"][expandDimIndex]["attr"]["value"]["tensor"]["tensorContent"][k].asString()); + } + curPs.unsqueeze_spec = unsqueezePs; + return curPs; + } + + ParameterSpec adapt_Cast() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + CastParamSpec castPs; + initialization_zero(&castPs, sizeof(castPs)); + castPs.castPrecision = ToFloat; + curPs.cast_spec = castPs; + return curPs; + } + +private: + int modelInputLayerNum; + int entityOpCount; + + std::string modelName; + std::string newStrValue; + Json::Value nodeV; + int curNodeIndex; + std::string opType; + + Json::Value ttValue; + + std::map<std::string, int> constId; + std::map<std::string, std::string> idenConst; + std::map<std::string, std::vector<std::string>> weightConstInput; + std::vector<int> weightIds; + + int weightOpNum; + int curInDegree; +}; +#endif diff --git a/model_tools/src/tensorflow/tensorflow_wrapper.cpp b/model_tools/src/tensorflow/tensorflow_wrapper.cpp new file mode 100644 index 00000000..02178a9f --- /dev/null +++ b/model_tools/src/tensorflow/tensorflow_wrapper.cpp @@ -0,0 +1,25 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
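Editor's note: the wrapper this file defines mirrors onnx_converter(). A caller such as the X2bolt tool is expected to drive it roughly as in the hypothetical sketch below; the helper names (mt_create_model, mt_destroy_model, serialize_model_to_file) are assumed from model_tools and the paths are placeholders.

```cpp
// Hypothetical driver sketch, not part of the patch.
#include <string>
#include "model_tools.h"

extern EE tensorflow_converter(std::string dir, std::string mfn, ModelSpec *ms);

int convert_example()
{
    ModelSpec ms;
    CHECK_STATUS(mt_create_model(&ms));  // assumed model_tools helper
    // parse_file() expects <dir>/<mfn>.json, i.e. a TensorFlow graph that was
    // already dumped to JSON by an external script.
    EE ret = tensorflow_converter("/data/models", "my_graph", &ms);
    if (ret == SUCCESS) {
        ret = serialize_model_to_file(&ms, "/data/models/my_graph_f32.bolt");
    }
    CHECK_STATUS(mt_destroy_model(&ms));
    return (ret == SUCCESS) ? 0 : 1;
}
```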
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "converter.h" +#include "model_tools.h" +#include "tensorflow_adaptee.h" + +EE tensorflow_converter(std::string dir, std::string mfn, ModelSpec *ms) +{ + ModelAdaptee *ade = new TensorflowAdaptee(); + EE ret = ade->adapt(dir, mfn, ms); + delete ade; + return ret; +} diff --git a/model_tools/src/tflite/CMakeLists.txt b/model_tools/src/tflite/CMakeLists.txt new file mode 100644 index 00000000..587642c1 --- /dev/null +++ b/model_tools/src/tflite/CMakeLists.txt @@ -0,0 +1,22 @@ +file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) +file(GLOB commonsrcs ${CMAKE_CURRENT_SOURCE_DIR}/../model_*.cpp) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +include_directories(${TFLITE_INCLUDE_DIR}) + +include_directories(../) + +# shared library +add_library(${PROJECT_NAME}_tflite SHARED ${srcs} ${commonsrcs}) +target_link_libraries (${PROJECT_NAME}_tflite LINK_PUBLIC uni) + +# static library +add_library(${PROJECT_NAME}_tflite_static STATIC ${srcs}) + +set_target_properties(${PROJECT_NAME}_tflite_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}_tflite") +set_target_properties(${PROJECT_NAME}_tflite PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT_NAME}_tflite_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +install(TARGETS ${PROJECT_NAME}_tflite ${PROJECT_NAME}_tflite_static + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) diff --git a/model_tools/src/tflite/tflite_adaptee.h b/model_tools/src/tflite/tflite_adaptee.h new file mode 100644 index 00000000..9210abcd --- /dev/null +++ b/model_tools/src/tflite/tflite_adaptee.h @@ -0,0 +1,1455 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_TFLITEADAPTEE +#define _H_TFLITEADAPTEE +#include +#include +#include +#include +#include +#include +#include +#include +#include "converter.h" +#include "model_tools.h" +#include "model_adaptee.h" +#include "types.h" +#include "ut_util.h" + +class TfliteAdaptee : public ModelAdaptee { +public: + TfliteAdaptee() + { + this->weightFormat = DF_NHWC; + } + + ~TfliteAdaptee() + {} + +protected: + std::vector getOperatorTensorInputIndex(int operatorIndex) + { + std::vector index; + for (U32 i = 0; i < this->tfliteOperators[operatorIndex]->inputs.size(); i++) { + if (this->tfliteModelBuffer + [this->tfliteTensors[this->tfliteOperators[operatorIndex]->inputs[i]]->buffer] + ->data.size() == 0) { + index.push_back(i); + } + } + return index; + } + + std::vector getOperatorWeightInputIndex(int operatorIndex) + { + std::vector index; + for (U32 i = 0; i < this->tfliteOperators[operatorIndex]->inputs.size(); i++) { + if (this->tfliteModelBuffer + [this->tfliteTensors[this->tfliteOperators[operatorIndex]->inputs[i]]->buffer] + ->data.size() > 0) { + index.push_back(i); + } + } + return index; + } + + OperatorType convert_tflite_type(tflite::BuiltinOperator tfliteOperatorType) + { + std::vector weightInputIndex = getOperatorWeightInputIndex(this->tfliteOperatorIndex); + if (tfliteOperatorType == tflite::BuiltinOperator_ADD || + tfliteOperatorType == tflite::BuiltinOperator_MUL || + tfliteOperatorType == tflite::BuiltinOperator_DIV || + tfliteOperatorType == tflite::BuiltinOperator_SUB) { + if (weightInputIndex.size() > 0) { + if (this->tfliteModelBuffer[this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex] + ->inputs[weightInputIndex[0]]] + ->buffer] + ->data.size() == sizeof(float)) { + return OT_Power; + } else { + return OT_Scale; + } + } else { + return OT_Eltwise; + } + } else if (tfliteOperatorType == tflite::BuiltinOperator_CONCATENATION || + tfliteOperatorType == tflite::BuiltinOperator_PACK) { + return OT_Concat; + } else if (tfliteOperatorType == tflite::BuiltinOperator_CONV_2D) { + return OT_Conv; + } else if (tfliteOperatorType == tflite::BuiltinOperator_DEPTHWISE_CONV_2D) { + return OT_Conv; + } else if (tfliteOperatorType == tflite::BuiltinOperator_LOGISTIC) { + return OT_Sigmoid; + } else if (tfliteOperatorType == tflite::BuiltinOperator_MAX_POOL_2D) { + return OT_Pooling; + } else if (tfliteOperatorType == tflite::BuiltinOperator_AVERAGE_POOL_2D) { + return OT_Pooling; + } else if (tfliteOperatorType == tflite::BuiltinOperator_RESHAPE) { + return OT_Reshape; + } else if (tfliteOperatorType == tflite::BuiltinOperator_RESIZE_BILINEAR) { + return OT_Resize; + } else if (tfliteOperatorType == tflite::BuiltinOperator_SOFTMAX) { + return OT_Softmax; + } else if (tfliteOperatorType == tflite::BuiltinOperator_FULLY_CONNECTED) { + if (weightInputIndex.size() > 0) { + return OT_FC; + } else { + return OT_MatMul; + } + } else if (tfliteOperatorType == tflite::BuiltinOperator_TRANSPOSE) { + return OT_Transpose; + } else if (tfliteOperatorType == tflite::BuiltinOperator_SLICE || + tfliteOperatorType == tflite::BuiltinOperator_STRIDED_SLICE) { + return OT_TfSlice; + } else if (tfliteOperatorType == tflite::BuiltinOperator_RELU || + tfliteOperatorType == tflite::BuiltinOperator_LEAKY_RELU) { + return OT_Relu; + 
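+ // Editor's note: as with ADD/MUL/SUB/DIV above (a 4-byte scalar constant
+ // operand maps to OT_Power, a longer constant to OT_Scale, and no constant
+ // operand to OT_Eltwise), several branches below key off the raw byte size
+ // of a constant input; e.g. MEAN becomes global pooling only when its axes
+ // tensor holds exactly two int32 values (8 bytes), and OT_Reduction otherwise.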
} else if (tfliteOperatorType == tflite::BuiltinOperator_RELU6) { + return OT_Relu6; + } else if (tfliteOperatorType == tflite::BuiltinOperator_TANH) { + return OT_TanH; + } else if (tfliteOperatorType == tflite::BuiltinOperator_MEAN) { + if (this->tfliteModelBuffer + [this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[1]] + ->buffer] + ->data.size() != 8) { + return OT_Reduction; + } else { + return OT_Pooling; + } + } else if (tfliteOperatorType == tflite::BuiltinOperator_MAXIMUM) { + return OT_Clip; + } else if (tfliteOperatorType == tflite::BuiltinOperator_MINIMUM) { + return OT_Clip; + } else if (tfliteOperatorType == tflite::BuiltinOperator_TRANSPOSE_CONV) { + return OT_Deconvolution; + } else if (tfliteOperatorType == tflite::BuiltinOperator_SQUARED_DIFFERENCE) { + return OT_SqDiff; + } else if (tfliteOperatorType == tflite::BuiltinOperator_SQRT || + tfliteOperatorType == tflite::BuiltinOperator_POW) { + return OT_Power; + } else if (tfliteOperatorType == tflite::BuiltinOperator_L2_NORMALIZATION) { + return OT_L2Normalization; + } else if (tfliteOperatorType == tflite::BuiltinOperator_PAD || + tfliteOperatorType == tflite::BuiltinOperator_MIRROR_PAD) { + return OT_Pad; + } else if (tfliteOperatorType == tflite::BuiltinOperator_HARD_SWISH) { + return OT_HSwish; + } else if (tfliteOperatorType == tflite::BuiltinOperator_SHAPE) { + return OT_Shape; + } else if (tfliteOperatorType == tflite::BuiltinOperator_SQUEEZE) { + return OT_Squeeze; + } else if (tfliteOperatorType == tflite::BuiltinOperator_EXPAND_DIMS) { + return OT_Unsqueeze; + } else if (tfliteOperatorType == tflite::BuiltinOperator_NEG) { + return OT_Power; + } else { + UNI_ERROR_LOG("tflite operator %s not implemented yet\n", + tflite::EnumNamesBuiltinOperator()[tfliteOperatorType]); + return OT_None; + } + } + + EE parse_file(std::string dir, std::string mfn) override + { + EE ret = SUCCESS; + std::string tfliteSuffix = ".tflite"; + + this->modelName = mfn; + + std::string model_name = dir + "/" + mfn + tfliteSuffix; + std::ifstream inputFile(model_name.c_str(), std::ios::binary); + if (!inputFile.is_open()) { + UNI_ERROR_LOG("can not find tflite model file %s\n", model_name.c_str()); + } + inputFile.seekg(0, std::ios::end); + const auto size = inputFile.tellg(); + inputFile.seekg(0, std::ios::beg); + + char *buffer = new char[size]; + inputFile.read(buffer, size); + inputFile.close(); + + flatbuffers::Verifier verify((uint8_t *)buffer, size); + CHECK_REQUIREMENT(tflite::VerifyModelBuffer(verify)); + + auto tfliteModel = tflite::UnPackModel(buffer); + + tfliteOpSet.clear(); + for (int i = 0; i < (int)(tfliteModel->operator_codes).size(); i++) { + tfliteOpSet.push_back(std::move((tfliteModel->operator_codes)[i])); + } + + const auto subGraphsSize = tfliteModel->subgraphs.size(); + CHECK_REQUIREMENT(subGraphsSize == 1); + + tfliteModelBuffer.clear(); + for (int i = 0; i < (int)(tfliteModel->buffers).size(); i++) { + tfliteModelBuffer.push_back(std::move((tfliteModel->buffers)[i])); + } + + if (subGraphsSize != 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + + this->tfliteOperators.clear(); + for (int i = 0; i < (int)(tfliteModel->subgraphs[0]->operators).size(); i++) { + this->tfliteOperators.push_back(std::move((tfliteModel->subgraphs[0]->operators)[i])); + } + + this->tfliteTensors.clear(); + for (int i = 0; i < (int)(tfliteModel->subgraphs[0]->tensors).size(); i++) { + this->tfliteTensors.push_back(std::move((tfliteModel->subgraphs[0]->tensors)[i])); + } + + inputs.clear(); + for (int i = 0; i < 
(int)(tfliteModel->subgraphs[0]->inputs).size(); i++) { + inputs.push_back(std::move((tfliteModel->subgraphs[0]->inputs)[i])); + } + + outputs.clear(); + for (int i = 0; i < (int)(tfliteModel->subgraphs[0]->outputs).size(); i++) { + outputs.push_back(std::move((tfliteModel->subgraphs[0]->outputs)[i])); + } + + return ret; + } + + EE adapt_operators(ModelSpec *ms) override + { + this->modelWeightOpNum = 0; + EE ret = SUCCESS; + ms->dt = DT_F32; + str_copy(ms->model_name, modelName.c_str(), modelName.length()); + ms->num_inputs = inputs.size(); + ms->input_names = (I8 **)mt_new_storage(ms->num_inputs * sizeof(I8 *)); + ms->input_dims = (TensorDesc *)mt_new_storage(sizeof(TensorDesc) * ms->num_inputs); + for (I32 i = 0; i < ms->num_inputs; i++) { + const int inputIdx = inputs[i]; + const auto &inputTensor = this->tfliteTensors[inputIdx]; + std::vector inputShape(inputTensor->shape); + ms->input_names[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(ms->input_names[i], (inputTensor->name).c_str(), (inputTensor->name).length()); + if (this->weightFormat == DF_NHWC) { + shiftRight(inputShape.data(), inputShape.size(), 1, inputShape.size() - 1); + } + switch (inputShape.size()) { + case 1: { + ms->input_dims[i] = tensor1d(DT_F32, inputShape[0]); + break; + } + case 2: { + ms->input_dims[i] = tensor2df(DT_F32, DF_NORMAL, inputShape[0], inputShape[1]); + break; + } + case 3: { + ms->input_dims[i] = + tensor3df(DT_F32, DF_MTK, inputShape[0], inputShape[1], inputShape[2]); + break; + } + case 4: { + ms->input_dims[i] = tensor4df(DT_F32, DF_NCHW, inputShape[0], inputShape[1], + inputShape[2], inputShape[3]); + break; + } + default: { + UNI_ERROR_LOG("unsupport tflite input size %d\n", (int)inputShape.size()); + break; + } + } + } + ms->num_outputs = outputs.size(); + ms->output_names = (I8 **)mt_new_storage(ms->num_outputs * sizeof(I8 *)); + for (I32 i = 0; i < ms->num_outputs; i++) { + const int outputIdx = outputs[i]; + const auto &outputTensor = this->tfliteTensors[outputIdx]; + ms->output_names[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy( + ms->output_names[i], (outputTensor->name).c_str(), (outputTensor->name).length()); + } + + this->boltOperators = std::vector(this->tfliteOperators.size()); + for (this->boltOperatorIndex = 0, this->tfliteOperatorIndex = 0; + this->tfliteOperatorIndex < this->tfliteOperators.size(); + this->boltOperatorIndex++, this->tfliteOperatorIndex++) { + UNI_DEBUG_LOG("load and process operator %d\n", this->tfliteOperatorIndex); + std::string operatorName = "op" + std::to_string(this->tfliteOperatorIndex); + str_copy(this->boltOperators[this->boltOperatorIndex].name, operatorName.c_str(), + operatorName.length()); + const int opcodeIndex = this->tfliteOperators[this->tfliteOperatorIndex]->opcode_index; + this->opCode = tfliteOpSet[opcodeIndex]->builtin_code; + this->boltOperators[this->boltOperatorIndex].type = convert_tflite_type(this->opCode); + this->boltOperators[this->boltOperatorIndex].num_inputs = + (modifiedInputsOp.find(this->boltOperators[this->boltOperatorIndex].type) == + modifiedInputsOp.end()) + ? 
this->tfliteOperators[this->tfliteOperatorIndex]->inputs.size() + : modifiedInputsOp[this->boltOperators[this->boltOperatorIndex].type]; + this->boltOperators[this->boltOperatorIndex].input_tensors_name = (I8 **)mt_new_storage( + this->boltOperators[this->boltOperatorIndex].num_inputs * sizeof(I8 *)); + + int inputStartPoint = 0; + if (opCode == tflite::BuiltinOperator_TRANSPOSE_CONV) { + inputStartPoint = 2; + } + if (opCode == tflite::BuiltinOperator_MUL) { + std::vector tensorInputIndex = + getOperatorTensorInputIndex(this->tfliteOperatorIndex); + inputStartPoint = tensorInputIndex[0]; + } + + for (U32 iter = 0; iter < this->boltOperators[this->boltOperatorIndex].num_inputs; + iter++) { + const int inIndex = + this->tfliteOperators[this->tfliteOperatorIndex]->inputs[iter + inputStartPoint]; + const auto &inTensor = this->tfliteTensors[inIndex]; + this->boltOperators[this->boltOperatorIndex].input_tensors_name[iter] = + (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(this->boltOperators[this->boltOperatorIndex].input_tensors_name[iter], + (inTensor->name).c_str(), (inTensor->name).length()); + } + this->boltOperators[this->boltOperatorIndex].num_outputs = + this->tfliteOperators[this->tfliteOperatorIndex]->outputs.size(); + this->boltOperators[this->boltOperatorIndex].output_tensors_name = (I8 **)mt_new_storage( + this->boltOperators[this->boltOperatorIndex].num_outputs * sizeof(I8 *)); + for (U32 iter = 0; iter < this->boltOperators[this->boltOperatorIndex].num_outputs; + iter++) { + const int outIndex = this->tfliteOperators[this->tfliteOperatorIndex]->outputs[iter]; + const auto &outTensor = this->tfliteTensors[outIndex]; + this->boltOperators[this->boltOperatorIndex].output_tensors_name[iter] = + (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + str_copy(this->boltOperators[this->boltOperatorIndex].output_tensors_name[iter], + outTensor->name.c_str(), outTensor->name.length()); + } + this->boltOperatorInsertBefore = 0; + this->boltOperatorInsertAfter = 0; + ParameterSpec boltParameterSpec; + ret = adapt_operator( + this->boltOperators[this->boltOperatorIndex].type, &(boltParameterSpec)); + this->boltOperators[this->boltOperatorIndex + this->boltOperatorInsertBefore].ps = + boltParameterSpec; + this->boltOperatorIndex += + this->boltOperatorInsertBefore + this->boltOperatorInsertAfter; + } + + ms->num_operator_specs = this->boltOperators.size(); + ms->ops = (OperatorSpec *)mt_new_storage(sizeof(OperatorSpec) * ms->num_operator_specs); + memcpy(ms->ops, this->boltOperators.data(), sizeof(OperatorSpec) * ms->num_operator_specs); + for (I32 i = 0; i < ms->num_operator_specs; i++) { + this->boltOperatorNameMap[ms->ops[i].name] = i; + ms->ops[i].tensor_positions = nullptr; + ms->ops[i].num_quant_feature = 0; + ms->ops[i].feature_scale = nullptr; + } + ms->num_weight_specs = modelWeightOpNum; + return ret; + } + + int NHWCAxisToNCHWAxis(int nhwcAxis, int dimSize) + { + // tflite may not record tensor shape + if (dimSize == 0) { + dimSize = 4; + } + if (nhwcAxis < 0) { + nhwcAxis += dimSize; + } + int nchwAxis = nhwcAxis; + // only transpose 4-dim parameter + if (dimSize >= 4) { + if (nhwcAxis != 0) { + nchwAxis++; + } + if (nhwcAxis == dimSize - 1) { + nchwAxis = 1; + } + } + return nchwAxis; + } + + void bitsToCharArray(int bit, char *array, int length) + { + for (int i = 0; i < length; i++) { + array[i] = bit & 1; + bit = bit >> 1; + } + } + + template + void shiftRight(T *array, int length, int left, int right) + { + // only transpose 4-dim parameter + if (length >= 4) { + T data = 
array[right]; + for (int i = right; i > left; i--) { + array[i] = array[i - 1]; + } + array[left] = data; + } + } + + std::vector transformTfliteTensorToVector(const std::unique_ptr &tensor) + { + const auto &weightShape = tensor->shape; + U32 size = 1; + for (U32 i = 0; i < weightShape.size(); i++) { + size *= weightShape[i]; + } + std::vector result(size); + switch (tensor->type) { + case tflite::TensorType_FLOAT32: { + auto weight = reinterpret_cast( + this->tfliteModelBuffer[tensor->buffer]->data.data()); + UNI_memcpy(result.data(), weight, sizeof(float) * size); + break; + } + case tflite::TensorType_INT64: { + auto weight = reinterpret_cast( + this->tfliteModelBuffer[tensor->buffer]->data.data()); + for (U32 i = 0; i < size; i++) { + result[i] = weight[i]; + } + break; + } + case tflite::TensorType_INT32: { + auto weight = reinterpret_cast( + this->tfliteModelBuffer[tensor->buffer]->data.data()); + for (U32 i = 0; i < size; i++) { + result[i] = weight[i]; + } + break; + } + case tflite::TensorType_INT8: { + auto weight = reinterpret_cast( + this->tfliteModelBuffer[tensor->buffer]->data.data()); + float scale = 1, shift = 0; + CHECK_REQUIREMENT(tensor->quantization->scale.size() == 1); + scale = tensor->quantization->scale[0]; + if (tensor->quantization->zero_point.size() > 0) { + shift = tensor->quantization->zero_point[0]; + } + for (U32 i = 0; i < size; i++) { + result[i] = weight[i] * scale + shift; + } + break; + } + default: { + UNI_ERROR_LOG("tflite adaptor not support %s type weight data\n", + tflite::EnumNamesTensorType()[tensor->type]); + break; + } + } + return result; + } + + EE adapt_weights(ModelSpec *ms) override + { + WeightSpec *wsPtr = (WeightSpec *)mt_new_storage(sizeof(WeightSpec) * ms->num_weight_specs); + for (int j = 0; j < ms->num_weight_specs; j++) { + wsPtr[j].num_quant_scale = 0; + wsPtr[j].weight_scale = nullptr; + } + ms->ws = wsPtr; + UNI_memcpy(ms->ws, this->boltSharedWeights.data(), + this->boltSharedWeights.size() * sizeof(WeightSpec)); + int weightMovIndex = this->boltSharedWeights.size(); + for (this->tfliteOperatorIndex = 0; + this->tfliteOperatorIndex < this->tfliteOperators.size(); this->tfliteOperatorIndex++) { + std::string operatorName = "op" + std::to_string(this->tfliteOperatorIndex); + this->boltOperatorIndex = this->boltOperatorNameMap[operatorName]; + const int opcodeIndex = this->tfliteOperators[this->tfliteOperatorIndex]->opcode_index; + opCode = tfliteOpSet[opcodeIndex]->builtin_code; + + if (opCode == tflite::BuiltinOperator_CONV_2D || + opCode == tflite::BuiltinOperator_DEPTHWISE_CONV_2D) { + str_copy(wsPtr[weightMovIndex].op_name, operatorName.c_str(), operatorName.length()); + wsPtr[weightMovIndex].mdt = DT_F32; + // input 2/3: input/weight/bias + const int weightIndex = this->tfliteOperators[this->tfliteOperatorIndex]->inputs[1]; + const auto &weightTensor = this->tfliteTensors[weightIndex]; + std::vector conv2DWeight = transformTfliteTensorToVector(weightTensor); + const auto &weightShape = weightTensor->shape; + CHECK_REQUIREMENT(weightShape.size() == 4); + const int conv2d_co = weightShape[0]; + const int conv2d_kh = weightShape[1]; + const int conv2d_kw = weightShape[2]; + const int conv2d_ci = weightShape[3]; + wsPtr[weightMovIndex].bytes_of_weight = + conv2d_co * conv2d_kh * conv2d_kw * conv2d_ci * sizeof(float); + wsPtr[weightMovIndex].weight = + (U8 *)mt_new_storage(wsPtr[weightMovIndex].bytes_of_weight); + TensorDesc nhwcWeightDesc = + tensor4df(DT_F32, DF_NHWC, conv2d_co, conv2d_ci, conv2d_kh, conv2d_kw); + TensorDesc 
nchwWeightDesc = + tensor4df(DT_F32, DF_NCHW, conv2d_co, conv2d_ci, conv2d_kh, conv2d_kw); + transformToNCHW(nhwcWeightDesc, conv2DWeight.data(), nchwWeightDesc, + wsPtr[weightMovIndex].weight); + + if (this->tfliteOperators[this->tfliteOperatorIndex]->inputs.size() == 3) { + const int biasIndex = + this->tfliteOperators[this->tfliteOperatorIndex]->inputs[2]; + const auto &biasTensor = this->tfliteTensors[biasIndex]; + std::vector conv2DBias = transformTfliteTensorToVector(biasTensor); + if (opCode == tflite::BuiltinOperator_CONV_2D) { + wsPtr[weightMovIndex].bytes_of_vec = conv2d_co * sizeof(float); + } else { + wsPtr[weightMovIndex].bytes_of_vec = conv2d_ci * sizeof(float); + } + wsPtr[weightMovIndex].vec = + (U8 *)mt_new_storage(wsPtr[weightMovIndex].bytes_of_vec); + memcpy(wsPtr[weightMovIndex].vec, conv2DBias.data(), + wsPtr[weightMovIndex].bytes_of_vec); + } else { + wsPtr[weightMovIndex].bytes_of_vec = 0; + wsPtr[weightMovIndex].vec = nullptr; + } + weightMovIndex++; + } else if (OT_Scale == ms->ops[this->boltOperatorIndex].type) { + str_copy(wsPtr[weightMovIndex].op_name, operatorName.c_str(), operatorName.length()); + wsPtr[weightMovIndex].mdt = DT_F32; + std::vector weightInputIndex = + getOperatorWeightInputIndex(this->tfliteOperatorIndex); + if (weightInputIndex.size() == 0) { + UNI_ERROR_LOG("recognize op %d to scale operator is not supported\n", + this->tfliteOperatorIndex); + } + switch (opCode) { + case tflite::BuiltinOperator_ADD: { + wsPtr[weightMovIndex].bytes_of_weight = 0; + wsPtr[weightMovIndex].weight = nullptr; + + const int biasIndex = this->tfliteOperators[this->tfliteOperatorIndex] + ->inputs[weightInputIndex[0]]; + const auto &biasTensor = this->tfliteTensors[biasIndex]; + std::vector bias = transformTfliteTensorToVector(biasTensor); + wsPtr[weightMovIndex].bytes_of_vec = bias.size() * sizeof(float); + if (wsPtr[weightMovIndex].bytes_of_vec == 4) { + ms->ops[this->boltOperatorIndex].ps.scale_spec.axis = 0; + } + wsPtr[weightMovIndex].vec = + (U8 *)mt_new_storage(wsPtr[weightMovIndex].bytes_of_vec); + memcpy(wsPtr[weightMovIndex].vec, bias.data(), + wsPtr[weightMovIndex].bytes_of_vec); + break; + } + case tflite::BuiltinOperator_SUB: { + wsPtr[weightMovIndex].bytes_of_weight = 0; + wsPtr[weightMovIndex].weight = nullptr; + + const int biasIndex = this->tfliteOperators[this->tfliteOperatorIndex] + ->inputs[weightInputIndex[0]]; + const auto &biasTensor = this->tfliteTensors[biasIndex]; + std::vector bias = transformTfliteTensorToVector(biasTensor); + wsPtr[weightMovIndex].bytes_of_vec = bias.size() * sizeof(float); + if (wsPtr[weightMovIndex].bytes_of_vec == 4) { + ms->ops[this->boltOperatorIndex].ps.scale_spec.axis = 0; + } + wsPtr[weightMovIndex].vec = + (U8 *)mt_new_storage(wsPtr[weightMovIndex].bytes_of_vec); + F32 *ptr = (F32 *)wsPtr[weightMovIndex].vec; + for (U32 k = 0; k < bias.size(); k++) { + ptr[k] = -1 * bias[k]; + } + break; + } + case tflite::BuiltinOperator_MUL: { + const int scaleIndex = this->tfliteOperators[this->tfliteOperatorIndex] + ->inputs[weightInputIndex[0]]; + const auto &scaleTensor = this->tfliteTensors[scaleIndex]; + std::vector scale = transformTfliteTensorToVector(scaleTensor); + wsPtr[weightMovIndex].bytes_of_weight = scale.size() * sizeof(float); + if (wsPtr[weightMovIndex].bytes_of_weight == 4) { + ms->ops[this->boltOperatorIndex].ps.scale_spec.axis = 0; + } + wsPtr[weightMovIndex].weight = + (U8 *)mt_new_storage(wsPtr[weightMovIndex].bytes_of_weight); + memcpy(wsPtr[weightMovIndex].weight, scale.data(), + 
wsPtr[weightMovIndex].bytes_of_weight); + + wsPtr[weightMovIndex].bytes_of_vec = 0; + wsPtr[weightMovIndex].vec = nullptr; + break; + } + case tflite::BuiltinOperator_DIV: { + const int scaleIndex = this->tfliteOperators[this->tfliteOperatorIndex] + ->inputs[weightInputIndex[0]]; + const auto &scaleTensor = this->tfliteTensors[scaleIndex]; + std::vector scale = transformTfliteTensorToVector(scaleTensor); + wsPtr[weightMovIndex].bytes_of_weight = scale.size() * sizeof(float); + if (wsPtr[weightMovIndex].bytes_of_weight == 4) { + ms->ops[this->boltOperatorIndex].ps.scale_spec.axis = 0; + } + wsPtr[weightMovIndex].weight = + (U8 *)mt_new_storage(wsPtr[weightMovIndex].bytes_of_weight); + F32 *ptr = (F32 *)wsPtr[weightMovIndex].weight; + for (U32 k = 0; k < scale.size(); k++) { + ptr[k] = 1 / scale[k]; + } + + wsPtr[weightMovIndex].bytes_of_vec = 0; + wsPtr[weightMovIndex].vec = nullptr; + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + weightMovIndex++; + } else if (OT_FC == ms->ops[this->boltOperatorIndex].type) { + str_copy(wsPtr[weightMovIndex].op_name, operatorName.c_str(), operatorName.length()); + wsPtr[weightMovIndex].mdt = DT_F32; + const int weightIndex = this->tfliteOperators[this->tfliteOperatorIndex]->inputs[1]; + const auto &weightTensor = this->tfliteTensors[weightIndex]; + std::vector fcWeight = transformTfliteTensorToVector(weightTensor); + const auto &weightShape = weightTensor->shape; + CHECK_REQUIREMENT(weightShape.size() == 2); + wsPtr[weightMovIndex].bytes_of_weight = fcWeight.size() * sizeof(float); + wsPtr[weightMovIndex].weight = + (U8 *)mt_new_storage(wsPtr[weightMovIndex].bytes_of_weight); + memcpy(wsPtr[weightMovIndex].weight, fcWeight.data(), + wsPtr[weightMovIndex].bytes_of_weight); + + if (this->tfliteOperators[this->tfliteOperatorIndex]->inputs.size() == 3) { + const int biasIndex = + this->tfliteOperators[this->tfliteOperatorIndex]->inputs[2]; + const auto &biasTensor = this->tfliteTensors[biasIndex]; + std::vector fcBias = transformTfliteTensorToVector(biasTensor); + wsPtr[weightMovIndex].bytes_of_vec = fcBias.size() * sizeof(float); + wsPtr[weightMovIndex].vec = + (U8 *)mt_new_storage(wsPtr[weightMovIndex].bytes_of_vec); + memcpy(wsPtr[weightMovIndex].vec, fcBias.data(), + wsPtr[weightMovIndex].bytes_of_vec); + } else { + wsPtr[weightMovIndex].bytes_of_vec = 0; + wsPtr[weightMovIndex].vec = nullptr; + } + weightMovIndex++; + } else if (OT_Deconvolution == ms->ops[this->boltOperatorIndex].type) { + str_copy(wsPtr[weightMovIndex].op_name, operatorName.c_str(), operatorName.length()); + wsPtr[weightMovIndex].mdt = DT_F32; + const int weightIndex = this->tfliteOperators[this->tfliteOperatorIndex]->inputs[1]; + const auto &weightTensor = this->tfliteTensors[weightIndex]; + std::vector deConvWeight = transformTfliteTensorToVector(weightTensor); + const auto &weightShape = weightTensor->shape; + CHECK_REQUIREMENT(weightShape.size() == 4); + U32 conv2d_co = weightShape[0]; + U32 conv2d_kh = weightShape[1]; + U32 conv2d_kw = weightShape[2]; + U32 conv2d_ci = weightShape[3]; + wsPtr[weightMovIndex].bytes_of_weight = deConvWeight.size() * sizeof(float); + wsPtr[weightMovIndex].weight = + (U8 *)mt_new_storage(wsPtr[weightMovIndex].bytes_of_weight); + U32 filterDims[4] = {conv2d_ci, conv2d_kw, conv2d_kh, conv2d_co}; + U32 ftmDims[4] = {conv2d_kw, conv2d_kh, conv2d_co, conv2d_ci}; + U32 filterTransformDims[4] = {3, 0, 1, 2}; + CHECK_STATUS(array_transpose(DT_F32, filterDims, deConvWeight.data(), ftmDims, + wsPtr[weightMovIndex].weight, filterTransformDims, 
4)); + if (this->tfliteOperators[this->tfliteOperatorIndex]->inputs.size() == 4) { + UNI_ERROR_LOG("tflite adaptor not support to process deconvolution's bias\n"); + } + wsPtr[weightMovIndex].bytes_of_vec = 0; + wsPtr[weightMovIndex].vec = nullptr; + weightMovIndex++; + } + } + return SUCCESS; + } + + ActivationMode getActivationOperatorType( + const tflite::ActivationFunctionType &tfliteActivationType) + { + ActivationMode ret = ACTIVATION_NULL; + switch (tfliteActivationType) { + case tflite::ActivationFunctionType_NONE: + ret = ACTIVATION_NULL; + break; + case tflite::ActivationFunctionType_RELU: + ret = ACTIVATION_RELU; + break; + case tflite::ActivationFunctionType_RELU6: + ret = ACTIVATION_RELU6; + break; + default: + UNI_ERROR_LOG("tflite activation %s not recognized\n", + tflite::EnumNamesActivationFunctionType()[tfliteActivationType]); + break; + } + return ret; + } + + void insertActivationOperator(ActivationMode activationMode) + { + if (activationMode == ACTIVATION_NULL) { + return; + } + OperatorSpec activation; + int index = this->boltOperatorIndex + this->boltOperatorInsertBefore + + this->boltOperatorInsertAfter; + const char *name = this->boltOperators[index].output_tensors_name[0]; + if (activationMode == ACTIVATION_RELU) { + activation = mt_create_operator(name, OT_Relu, 1, 1); + activation.ps.relu_spec.neg_slope = 0; + } else { + UNI_ERROR_LOG("tflite adaptor not support %d type activation fusion\n", activationMode); + } + str_copy(activation.input_tensors_name[0], name, NAME_LEN); + str_copy(activation.output_tensors_name[0], name, NAME_LEN); + this->boltOperators.insert(this->boltOperators.begin() + index + 1, activation); + this->boltOperatorInsertAfter++; + } + + ParameterSpec adapt_Eltwise() override + { + ParameterSpec curPs; + EltwiseParamSpec eltPs; + ActivationMode activationMode = ACTIVATION_NULL; + if (opCode == tflite::BuiltinOperator_ADD) { + eltPs.elt_mode = ELTWISE_SUM; + EltwiseSumSpec elt_sum_spec; + elt_sum_spec.coeff_size = 2; + for (I32 j = 0; j < elt_sum_spec.coeff_size; j++) { + elt_sum_spec.coeff_values[j] = 1.0; + } + eltPs.elt_sum_spec = elt_sum_spec; + const auto &tfliteEltwiseOption = + this->tfliteOperators[this->tfliteOperatorIndex]->builtin_options.AsAddOptions(); + activationMode = + getActivationOperatorType(tfliteEltwiseOption->fused_activation_function); + } else if (opCode == tflite::BuiltinOperator_SUB) { + eltPs.elt_mode = ELTWISE_SUB; + } else if (opCode == tflite::BuiltinOperator_MAXIMUM) { + eltPs.elt_mode = ELTWISE_MAX; + } else if (opCode == tflite::BuiltinOperator_MINIMUM) { + eltPs.elt_mode = ELTWISE_MIN; + } else if (opCode == tflite::BuiltinOperator_DIV) { + eltPs.elt_mode = ELTWISE_DIV; + } else if (opCode == tflite::BuiltinOperator_MUL) { + eltPs.elt_mode = ELTWISE_PROD; + const auto &tfliteEltwiseOption = + this->tfliteOperators[this->tfliteOperatorIndex]->builtin_options.AsMulOptions(); + activationMode = + getActivationOperatorType(tfliteEltwiseOption->fused_activation_function); + } else { + CHECK_STATUS(NOT_IMPLEMENTED); + } + eltPs.activation_type = activationMode; + curPs.eltwise_spec = eltPs; + return curPs; + } + + ParameterSpec adapt_Scale() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + modelWeightOpNum++; + ScaleParamSpec scalePs; + initialization_zero(&scalePs, sizeof(scalePs)); + scalePs.axis = 1; + curPs.scale_spec = scalePs; + return curPs; + } + + ParameterSpec adapt_Conv() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + 
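+ // Editor's note on the two branches below: TFLite stores CONV_2D filters as
+ // [co, kh, kw, ci] and DEPTHWISE_CONV_2D filters as [1, kh, kw, co], hence
+ // num_outputs is read from weightShape[0] in the first case and from
+ // weightShape[3] in the second. The SAME-padding blocks approximate TFLite's
+ // rule pad_total = max((ceil(in / stride) - 1) * stride + k - in, 0) using
+ // (k - 1) / 2 plus even-kernel / even-input corrections.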
modelWeightOpNum++; + const int weightIndex = this->tfliteOperators[this->tfliteOperatorIndex]->inputs[1]; + const auto &weightTensor = this->tfliteTensors[weightIndex]; + + const auto &weightShape = weightTensor->shape; + CHECK_REQUIREMENT(weightShape.size() == 4); + + ConvolutionParamSpec convPs; + initialization_zero(&convPs, sizeof(convPs)); + convPs.kernel_h = weightShape[1]; + convPs.kernel_w = weightShape[2]; + convPs.kernel_t = 1; + convPs.stride_t = 1; + convPs.padding_before = 0; + convPs.padding_after = 0; + convPs.dilatedRate_t = 1; + if (opCode == tflite::BuiltinOperator_CONV_2D) { + convPs.num_outputs = weightShape[0]; + convPs.num_outputs_origin = convPs.num_outputs; + + const auto &tfliteConvOption = + this->tfliteOperators[this->tfliteOperatorIndex]->builtin_options.AsConv2DOptions(); + convPs.dilatedRate_h = tfliteConvOption->dilation_h_factor; + convPs.dilatedRate_w = tfliteConvOption->dilation_w_factor; + convPs.stride_h = tfliteConvOption->stride_h; + convPs.stride_w = tfliteConvOption->stride_w; + const auto activationFunc = tfliteConvOption->fused_activation_function; + if (1 == tfliteConvOption->padding) { // VALID + convPs.padding_top = 0; + convPs.padding_bottom = 0; + convPs.padding_left = 0; + convPs.padding_right = 0; + } else { // SAME + const auto &inputTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[0]]; + const auto &inputShape = inputTensor->shape; + convPs.padding_top = (convPs.kernel_h - 1) / 2; + convPs.padding_bottom = (convPs.kernel_h - 1) / 2; + if (convPs.kernel_h % 2 == 0) { + convPs.padding_top += 1; + } + if (convPs.padding_top != 0 && inputShape[1] % 2 == 0 && + tfliteConvOption->stride_h % 2 == 0) { + convPs.padding_top -= 1; + } + convPs.padding_left = (convPs.kernel_w - 1) / 2; + convPs.padding_right = (convPs.kernel_w - 1) / 2; + if (convPs.kernel_w % 2 == 0) { + convPs.padding_left += 1; + } + if (convPs.padding_left != 0 && inputShape[2] % 2 == 0 && + tfliteConvOption->stride_w % 2 == 0) { + convPs.padding_left -= 1; + } + } + convPs.group = 1; + convPs.dw_activation_type = ACTIVATION_NULL; + convPs.pw_activation_type = getActivationOperatorType(activationFunc); + if (convPs.dilatedRate_h > 1 || convPs.dilatedRate_w > 1) { + convPs.convolution_type = Convolution_Dilation; + } else { + convPs.convolution_type = Convolution_Pointwise; + } + } else if (opCode == tflite::BuiltinOperator_DEPTHWISE_CONV_2D) { + convPs.num_outputs = weightShape[3]; + convPs.num_outputs_origin = convPs.num_outputs; + + const auto &tfliteConvOption = this->tfliteOperators[this->tfliteOperatorIndex] + ->builtin_options.AsDepthwiseConv2DOptions(); + convPs.dilatedRate_h = tfliteConvOption->dilation_h_factor; + convPs.dilatedRate_w = tfliteConvOption->dilation_w_factor; + convPs.stride_h = tfliteConvOption->stride_h; + convPs.stride_w = tfliteConvOption->stride_w; + const auto activationFunc = tfliteConvOption->fused_activation_function; + + if (1 == tfliteConvOption->padding) { // VALID + convPs.padding_top = 0; + convPs.padding_bottom = 0; + convPs.padding_left = 0; + convPs.padding_right = 0; + } else { // SAME + const auto &inputTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[0]]; + const auto &inputShape = inputTensor->shape; + convPs.padding_top = (convPs.kernel_h - 1) / 2; + convPs.padding_bottom = (convPs.kernel_h - 1) / 2; + if (convPs.kernel_h % 2 == 0) { + convPs.padding_top += 1; + } + if (convPs.padding_top != 0 && inputShape[1] % 2 == 0 && + tfliteConvOption->stride_h % 2 == 
0) { + convPs.padding_top -= 1; + } + convPs.padding_left = (convPs.kernel_w - 1) / 2; + convPs.padding_right = (convPs.kernel_w - 1) / 2; + if (convPs.kernel_w % 2 == 0) { + convPs.padding_left += 1; + } + if (convPs.padding_left != 0 && inputShape[2] % 2 == 0 && + tfliteConvOption->stride_w % 2 == 0) { + convPs.padding_left -= 1; + } + } + + convPs.group = convPs.num_outputs; + // When depth_multiplier > 1 and equals the output channel count, the depthwise convolution is really a pointwise convolution + if (tfliteConvOption->depth_multiplier > 1 && + tfliteConvOption->depth_multiplier == weightShape[3]) { + convPs.convolution_type = Convolution_Pointwise; + convPs.dw_activation_type = ACTIVATION_NULL; + convPs.pw_activation_type = getActivationOperatorType(activationFunc); + convPs.group = 1; + } else { + convPs.convolution_type = Convolution_Depthwise; + convPs.dw_activation_type = getActivationOperatorType(activationFunc); + convPs.pw_activation_type = ACTIVATION_NULL; + } + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + curPs.conv_spec = convPs; + return curPs; + } + + ParameterSpec adapt_Reduction() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ReductionParamSpec reductionPs; + initialization_zero(&reductionPs, sizeof(reductionPs)); + if (opCode == tflite::BuiltinOperator_MEAN) { + reductionPs.reduction_mode = REDUCTION_MEAN; + } else { + UNI_ERROR_LOG("unsupported reduction mode\n"); + } + + const auto &inputTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[0]]; + const auto &inputShape = inputTensor->shape; + const auto &axisTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[1]]; + const auto &axisData = tfliteModelBuffer[axisTensor->buffer]->data; + reductionPs.axes_num = axisData.size() / sizeof(int); + auto axisPtr = reinterpret_cast<const int *>(axisData.data()); + memcpy(reductionPs.axes, axisPtr, axisData.size()); + if (this->weightFormat == DF_NHWC) { + for (int i = 0; i < reductionPs.axes_num; i++) { + reductionPs.axes[i] = NHWCAxisToNCHWAxis(reductionPs.axes[i], inputShape.size()); + } + } + reductionPs.coeff = 1; + reductionPs.keep_dim = false; + curPs.reduction_spec = reductionPs; + return curPs; + } + + ParameterSpec adapt_Pooling() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + PoolingParamSpec poolingPs; + initialization_zero(&poolingPs, sizeof(poolingPs)); + poolingPs.kernel_t = 1; + poolingPs.stride_t = 1; + poolingPs.padding_before = 0; + poolingPs.padding_after = 0; + poolingPs.padding_top = 0; + poolingPs.padding_bottom = 0; + poolingPs.padding_left = 0; + poolingPs.padding_right = 0; + poolingPs.rm = CEIL; + + if (opCode == tflite::BuiltinOperator_MEAN) { // Interpret as global pooling + const auto &inputTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[0]]; + const auto &inputShape = inputTensor->shape; + CHECK_REQUIREMENT(inputShape.size() == 4); + + const auto &axisTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[1]]; + const auto &axisData = tfliteModelBuffer[axisTensor->buffer]->data; + auto axisPtr = reinterpret_cast<const int *>(axisData.data()); + CHECK_REQUIREMENT(1 == axisPtr[0] && 2 == axisPtr[1]); + poolingPs.mode = POOLING_MEAN; + poolingPs.kernel_h = 0; + poolingPs.kernel_w = 0; + poolingPs.stride_h = 1; + poolingPs.stride_w = 1; + } else { + const auto &tflitePoolOption = + this->tfliteOperators[this->tfliteOperatorIndex]->builtin_options.AsPool2DOptions(); +
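// Pool2DOptions names the pooling window filter_height/filter_width rather than + // kernel_h/kernel_w. A fused activation cannot be expressed inside OT_Pooling, so it is + // split out into a standalone activation operator via insertActivationOperator() below. +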
poolingPs.kernel_h = tflitePoolOption->filter_height; + poolingPs.kernel_w = tflitePoolOption->filter_width; + poolingPs.stride_h = tflitePoolOption->stride_h; + poolingPs.stride_w = tflitePoolOption->stride_w; + if (opCode == tflite::BuiltinOperator_MAX_POOL_2D) { + poolingPs.mode = POOLING_MAX; + } else if (opCode == tflite::BuiltinOperator_AVERAGE_POOL_2D) { + poolingPs.mode = POOLING_MEAN; + } + insertActivationOperator( + getActivationOperatorType(tflitePoolOption->fused_activation_function)); + } + curPs.pooling_spec = poolingPs; + return curPs; + } + + ParameterSpec adapt_Reshape() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + const auto &shapeTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[1]]; + const auto &shapeData = tfliteModelBuffer[shapeTensor->buffer]->data; + ReshapeParamSpec reshapePs; + initialization_zero(&reshapePs, sizeof(reshapePs)); + if (shapeTensor->shape.size() == 1) { + CHECK_REQUIREMENT((shapeTensor->shape[0]) == (int)(shapeData.size() / sizeof(int))); + reshapePs.shape_size = shapeTensor->shape[0]; + } else if (shapeTensor->shape.size() == 2) { + CHECK_REQUIREMENT((shapeTensor->shape[1]) == (int)(shapeData.size() / sizeof(int))); + reshapePs.shape_size = shapeTensor->shape[1]; + } + auto reshapeDimPtr = reinterpret_cast<const int *>(shapeData.data()); + std::vector<int> reshapeDim(reshapeDimPtr, reshapeDimPtr + reshapePs.shape_size); + + for (int iter = 0; iter < (int)reshapeDim.size(); iter++) { + int axis = iter; + if (this->weightFormat == DF_NHWC) { + axis = NHWCAxisToNCHWAxis(iter, reshapeDim.size()); + } + reshapePs.shape_dims[axis] = reshapeDim[iter]; + } + reshapePs.axis = 8; + reshapePs.num_axes = -1; + curPs.reshape_spec = reshapePs; + return curPs; + } + + ParameterSpec adapt_Transpose() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + TransposeParamSpec transPs; + initialization_zero(&transPs, sizeof(transPs)); + const auto &dimsTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[1]]; + const auto &dimsData = tfliteModelBuffer[dimsTensor->buffer]->data; + CHECK_REQUIREMENT((dimsTensor->shape[0]) == (int)(dimsData.size() / sizeof(int))); + transPs.trans_size = dimsTensor->shape[0]; + auto dims = reinterpret_cast<const int *>(dimsData.data()); + for (U32 i = 0; i < transPs.trans_size; i++) { + if (this->weightFormat == DF_NHWC) { + transPs.trans_dims[i] = NHWCAxisToNCHWAxis(dims[i], transPs.trans_size); + } else { + transPs.trans_dims[i] = dims[i]; + } + } + curPs.transpose_spec = transPs; + return curPs; + } + + ParameterSpec adapt_TfSlice() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + TfSliceParamSpec tfSlicePs; + initialization_zero(&tfSlicePs, sizeof(tfSlicePs)); + if (opCode == tflite::BuiltinOperator_STRIDED_SLICE) { + const auto &stridedSliceOption = this->tfliteOperators[this->tfliteOperatorIndex] + ->builtin_options.AsStridedSliceOptions(); + const auto &beginTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[1]]; + tfSlicePs.dim_size = beginTensor->shape[0]; + auto beginData = reinterpret_cast<const int *>( + (tfliteModelBuffer[beginTensor->buffer]->data).data()); + memcpy(tfSlicePs.begin, beginData, sizeof(int) * tfSlicePs.dim_size); + const auto &endTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[2]]; + auto endData = reinterpret_cast<const int *>( + (tfliteModelBuffer[endTensor->buffer]->data).data()); + memcpy(tfSlicePs.end, endData, sizeof(int) * tfSlicePs.dim_size); + const auto &stridesTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[3]]; + auto stridesData = reinterpret_cast<const int *>( + (tfliteModelBuffer[stridesTensor->buffer]->data).data()); + memcpy(tfSlicePs.strides, stridesData, sizeof(int) * tfSlicePs.dim_size); + bitsToCharArray( + stridedSliceOption->begin_mask, tfSlicePs.begin_mask, tfSlicePs.dim_size); + bitsToCharArray(stridedSliceOption->end_mask, tfSlicePs.end_mask, tfSlicePs.dim_size); + bitsToCharArray( + stridedSliceOption->ellipsis_mask, tfSlicePs.ellipsis_mask, tfSlicePs.dim_size); + bitsToCharArray( + stridedSliceOption->new_axis_mask, tfSlicePs.new_axis_mask, tfSlicePs.dim_size); + bitsToCharArray(stridedSliceOption->shrink_axis_mask, tfSlicePs.shrink_axis_mask, + tfSlicePs.dim_size); + } else { + const auto &beginTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[1]]; + tfSlicePs.dim_size = beginTensor->shape[0]; + auto beginData = reinterpret_cast<const int *>( + (tfliteModelBuffer[beginTensor->buffer]->data).data()); + memcpy(tfSlicePs.begin, beginData, sizeof(int) * tfSlicePs.dim_size); + const auto &sizeTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[2]]; + auto sizeData = reinterpret_cast<const int *>( + (tfliteModelBuffer[sizeTensor->buffer]->data).data()); + for (U32 i = 0; i < tfSlicePs.dim_size; i++) { + tfSlicePs.end[i] = tfSlicePs.begin[i] + sizeData[i]; + tfSlicePs.strides[i] = 1; + } + memset(tfSlicePs.begin_mask, 0, sizeof(char) * tfSlicePs.dim_size); + memset(tfSlicePs.end_mask, 0, sizeof(char) * tfSlicePs.dim_size); + memset(tfSlicePs.ellipsis_mask, 0, sizeof(char) * tfSlicePs.dim_size); + memset(tfSlicePs.new_axis_mask, 0, sizeof(char) * tfSlicePs.dim_size); + memset(tfSlicePs.shrink_axis_mask, 0, sizeof(char) * tfSlicePs.dim_size); + } + if (this->weightFormat == DF_NHWC) { + shiftRight(tfSlicePs.begin, tfSlicePs.dim_size, 1, tfSlicePs.dim_size - 1); + shiftRight(tfSlicePs.end, tfSlicePs.dim_size, 1, tfSlicePs.dim_size - 1); + shiftRight(tfSlicePs.strides, tfSlicePs.dim_size, 1, tfSlicePs.dim_size - 1); + shiftRight(tfSlicePs.begin_mask, tfSlicePs.dim_size, 1, tfSlicePs.dim_size - 1); + shiftRight(tfSlicePs.end_mask, tfSlicePs.dim_size, 1, tfSlicePs.dim_size - 1); + shiftRight(tfSlicePs.ellipsis_mask, tfSlicePs.dim_size, 1, tfSlicePs.dim_size - 1); + shiftRight(tfSlicePs.new_axis_mask, tfSlicePs.dim_size, 1, tfSlicePs.dim_size - 1); + shiftRight( + tfSlicePs.shrink_axis_mask, tfSlicePs.dim_size, 1, tfSlicePs.dim_size - 1); + } + curPs.tfslice_spec = tfSlicePs; + return curPs; + } + + ParameterSpec adapt_MatMul() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + MatMulParamSpec matmulPs; + initialization_zero(&matmulPs, sizeof(matmulPs)); + matmulPs.transpose_a = false; + matmulPs.transpose_b = false; + curPs.matmul_spec = matmulPs; + return curPs; + } + + ParameterSpec adapt_Fc() override + { + modelWeightOpNum++; + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + FullyConnectedParamSpec ips; + initialization_zero(&ips, sizeof(ips)); + const int index = this->tfliteOperators[this->tfliteOperatorIndex]->inputs[1]; + const auto &tensor = this->tfliteTensors[index]; + I32 size = tfliteModelBuffer[tensor->buffer]->data.size(); + CHECK_REQUIREMENT(size != 0); + const auto &weightShape = tensor->shape; + ips.num_outputs = weightShape[0]; + ips.num_slices = 1; + ips.slice_point[0] = ips.num_outputs; + curPs.fc_spec = ips; + const auto &tfliteFullyConnectedOption = this->tfliteOperators[this->tfliteOperatorIndex] + ->builtin_options.AsFullyConnectedOptions(); + insertActivationOperator( + getActivationOperatorType(tfliteFullyConnectedOption->fused_activation_function)); + return curPs; + } + + ParameterSpec adapt_Concat() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ConcatParamSpec concatPs; + initialization_zero(&concatPs, sizeof(concatPs)); + std::vector<int> pinnedInput; + if (this->opCode == tflite::BuiltinOperator_CONCATENATION) { + const auto &tfliteConcatOption = this->tfliteOperators[this->tfliteOperatorIndex] + ->builtin_options.AsConcatenationOptions(); + insertActivationOperator( + getActivationOperatorType(tfliteConcatOption->fused_activation_function)); + concatPs.axis = tfliteConcatOption->axis; + pinnedInput = getOperatorWeightInputIndex(this->tfliteOperatorIndex); + } else { + const auto &tflitePackOption = + this->tfliteOperators[this->tfliteOperatorIndex]->builtin_options.AsPackOptions(); + concatPs.axis = tflitePackOption->axis; + int id = tflitePackOption->values_count - 1; + pinnedInput.push_back(id); + } + for (U32 i = 0; i < pinnedInput.size(); i++) { + int id = pinnedInput[i]; + OperatorSpec sharedWeight = mt_create_operator( + this->boltOperators[this->boltOperatorIndex].input_tensors_name[id], + OT_SharedWeight, 0, 1); + str_copy(sharedWeight.output_tensors_name[0], + this->boltOperators[this->boltOperatorIndex].input_tensors_name[id], NAME_LEN); + SharedWeightParamSpec sharedWeightPs; + const auto &weightTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[id]]; + auto weightData = reinterpret_cast<const float *>( + (tfliteModelBuffer[weightTensor->buffer]->data).data()); + sharedWeightPs.desc = tensor1d(DT_U32, 1); + WeightSpec weightSpec = mt_create_weight( + this->boltOperators[this->boltOperatorIndex].input_tensors_name[id], DT_F32, + bytesOf(DT_F32), 0, 0); + ((float *)weightSpec.weight)[0] = weightData[0]; + this->boltSharedWeights.push_back(weightSpec); + sharedWeight.ps.shared_weight_spec = sharedWeightPs; + this->boltOperators.insert( + this->boltOperators.begin() + this->boltOperatorIndex, sharedWeight); + this->boltOperatorInsertBefore++; + this->modelWeightOpNum++; + } + const auto &outputTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->outputs[0]]; + const auto &outputShape = outputTensor->shape; + if (this->weightFormat == DF_NHWC) { + concatPs.axis = NHWCAxisToNCHWAxis(concatPs.axis, outputShape.size()); + } + curPs.concat_spec = concatPs; + return curPs; + } + + ParameterSpec adapt_Softmax() override + { + const auto &tfliteSoftmaxOption = + this->tfliteOperators[this->tfliteOperatorIndex]->builtin_options.AsSoftmaxOptions(); + CHECK_REQUIREMENT(1 == tfliteSoftmaxOption->beta); + + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + SoftmaxParamSpec softmaxPs; + initialization_zero(&softmaxPs, sizeof(softmaxPs)); + softmaxPs.axis = -1; + + const auto &inputTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[0]]; + const auto &inputShape = inputTensor->shape; + if (this->weightFormat == DF_NHWC) { + softmaxPs.axis = NHWCAxisToNCHWAxis(softmaxPs.axis, inputShape.size()); + } + curPs.softmax_spec = softmaxPs; + return curPs; + } + + ParameterSpec adapt_Resize() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ResizeParamSpec resizePs; + initialization_zero(&resizePs, sizeof(resizePs)); + const auto &dimsTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[1]]; + const auto &dimsData = tfliteModelBuffer[dimsTensor->buffer]->data; + CHECK_REQUIREMENT((dimsTensor->shape[0]) == 2); + auto dims = reinterpret_cast<const int *>(dimsData.data()); + resizePs.sizes[0] = dims[0]; + resizePs.sizes[1] = dims[1]; + resizePs.num_sizes = 2; + resizePs.num_scales = 0; + curPs.resize_spec = resizePs; + return curPs; + } + + ParameterSpec adapt_Clip() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ClipParamSpec clipPs; + initialization_zero(&clipPs, sizeof(clipPs)); + const auto &clipTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[1]]; + const auto &clipData = tfliteModelBuffer[clipTensor->buffer]->data; + if (opCode == tflite::BuiltinOperator_MINIMUM) { + clipPs.max = clipData[0]; + clipPs.min = std::numeric_limits<float>::min(); + } else if (opCode == tflite::BuiltinOperator_MAXIMUM) { + clipPs.max = std::numeric_limits<float>::max(); + clipPs.min = clipData[0]; + } + curPs.clip_spec = clipPs; + return curPs; + } + + ParameterSpec adapt_Deconvolution() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + modelWeightOpNum++; + const int weightIndex = this->tfliteOperators[this->tfliteOperatorIndex]->inputs[1]; + const auto &weightTensor = this->tfliteTensors[weightIndex]; + + const auto &weightShape = weightTensor->shape; + CHECK_REQUIREMENT(weightShape.size() == 4); + + ConvolutionParamSpec convPs; + initialization_zero(&convPs, sizeof(convPs)); + convPs.kernel_t = 1; + convPs.kernel_h = weightShape[1]; + convPs.kernel_w = weightShape[2]; + convPs.num_outputs = weightShape[0]; + convPs.num_outputs_origin = convPs.num_outputs; + + const auto &tfliteDeConvOption = + this->tfliteOperators[this->tfliteOperatorIndex]->builtin_options.AsTransposeConvOptions(); + convPs.stride_t = 1; + convPs.stride_h = tfliteDeConvOption->stride_h; + convPs.stride_w = tfliteDeConvOption->stride_w; + convPs.group = 1; + + convPs.dilatedRate_t = 1; + convPs.dilatedRate_h = 1; + convPs.dilatedRate_w = 1; + convPs.convolution_type = Convolution_Deconvolution; + convPs.dw_activation_type = ACTIVATION_NULL; + convPs.pw_activation_type = ACTIVATION_NULL; + + convPs.padding_before = 0; + convPs.padding_after = 0; + if (tfliteDeConvOption->padding == 1) { + convPs.padding_top = 0; + convPs.padding_bottom = 0; + convPs.padding_left = 0; + convPs.padding_right = 0; + } else { + convPs.padding_top = (convPs.kernel_h - convPs.stride_h) / 2; + convPs.padding_bottom = convPs.kernel_h - convPs.stride_h - convPs.padding_top; + convPs.padding_left = (convPs.kernel_w - convPs.stride_w) / 2; + convPs.padding_right = convPs.kernel_w - convPs.stride_w - convPs.padding_left; + } + + curPs.conv_spec = convPs; + return curPs; + } + + ParameterSpec adapt_Power() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + PowerParamSpec powerPs; + initialization_zero(&powerPs, sizeof(powerPs)); + powerPs.scale = 1; + powerPs.shift = 0; + powerPs.power = 1; + float weight = 0; + std::vector<int> weightInputIndex = getOperatorWeightInputIndex(this->tfliteOperatorIndex); + if (weightInputIndex.size() > 0) { + const auto &weightTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex] + ->inputs[weightInputIndex[0]]]; + weight = transformTfliteTensorToVector(weightTensor)[0]; + } + if (opCode == tflite::BuiltinOperator_SQRT) { + powerPs.power = 0.5; + } else if (opCode == tflite::BuiltinOperator_POW) { +
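// Every scalar case here folds into the Power op y = (scale * x + shift) ^ power: + // POW -> power = w, ADD -> shift = w, SUB -> shift = -w, MUL -> scale = w, + // DIV -> scale = 1 / w, NEG -> scale = -1, SQRT -> power = 0.5. +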
powerPs.power = weight; + } else if (opCode == tflite::BuiltinOperator_ADD) { + powerPs.shift = weight; + } else if (opCode == tflite::BuiltinOperator_SUB) { + powerPs.shift = weight * -1; + } else if (opCode == tflite::BuiltinOperator_MUL) { + powerPs.scale = weight; + } else if (opCode == tflite::BuiltinOperator_DIV) { + powerPs.scale = 1.0 / weight; + } else if (opCode == tflite::BuiltinOperator_NEG) { + powerPs.scale = -1.0; + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + curPs.power_spec = powerPs; + return curPs; + } + + ParameterSpec adapt_Pad() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + PadParamSpec padPs; + initialization_zero(&padPs, sizeof(padPs)); + const auto &beginTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[1]]; + auto beginData = reinterpret_cast<const int *>( + (tfliteModelBuffer[beginTensor->buffer]->data).data()); + padPs.before = 0; + padPs.after = 0; + padPs.top = beginData[2]; + padPs.bottom = beginData[3]; + padPs.left = beginData[4]; + padPs.right = beginData[5]; + padPs.constant_value = 0; + if (this->opCode == tflite::BuiltinOperator_PAD) { + padPs.pad_mode = Pad_Constant; + } else { + padPs.pad_mode = Pad_Reflect; + } + curPs.pad_spec = padPs; + return curPs; + } + + ParameterSpec adapt_Relu() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + ReLUParamSpec reluPs; + initialization_zero(&reluPs, sizeof(reluPs)); + if (this->opCode == tflite::BuiltinOperator_RELU) { + reluPs.neg_slope = 0; + } else { + const auto &tfliteLeakyReluOption = + this->tfliteOperators[this->tfliteOperatorIndex]->builtin_options.AsLeakyReluOptions(); + reluPs.neg_slope = tfliteLeakyReluOption->alpha; + } + curPs.relu_spec = reluPs; + return curPs; + } + + ParameterSpec adapt_Squeeze() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + SqueezeParamSpec squeezePs; + initialization_zero(&squeezePs, sizeof(squeezePs)); + const auto &tfliteSqueezeOption = + this->tfliteOperators[this->tfliteOperatorIndex]->builtin_options.AsSqueezeOptions(); + squeezePs.axes_num = tfliteSqueezeOption->squeeze_dims.size(); + const auto &inputTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[0]]; + const auto &inputShape = inputTensor->shape; + for (int i = 0; i < squeezePs.axes_num; i++) { + if (this->weightFormat == DF_NHWC) { + squeezePs.axes[i] = + NHWCAxisToNCHWAxis(tfliteSqueezeOption->squeeze_dims[i], inputShape.size()); + } else { + squeezePs.axes[i] = tfliteSqueezeOption->squeeze_dims[i]; + } + } + curPs.squeeze_spec = squeezePs; + return curPs; + } + + ParameterSpec adapt_Unsqueeze() override + { + ParameterSpec curPs; + initialization_zero(&curPs, sizeof(curPs)); + UnsqueezeParamSpec unsqueezePs; + initialization_zero(&unsqueezePs, sizeof(unsqueezePs)); + const auto &weightTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[1]]; + auto weightData = reinterpret_cast<const int *>( + (tfliteModelBuffer[weightTensor->buffer]->data).data()); + const auto &inputTensor = + this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[0]]; + const auto &inputShape = inputTensor->shape; + if (this->weightFormat == DF_NHWC) { + unsqueezePs.axes[0] = NHWCAxisToNCHWAxis(weightData[0], inputShape.size()); + } else { + unsqueezePs.axes[0] = weightData[0]; + } + unsqueezePs.axes_num = 1; + curPs.unsqueeze_spec = unsqueezePs; + return curPs; + } + +public:
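+    // For each operator type listed below, one of the TFLite inputs carries constant data that
+    // the adaptor folds into weights or parameters; the mapped value appears to record how many
+    // runtime data inputs the bolt operator keeps (currently always 1).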
+    std::map<OperatorType, int> modifiedInputsOp{{OT_Conv, 1}, {OT_Reshape, 1}, {OT_Resize, 1}, + {OT_Transpose, 1}, {OT_FC, 1}, {OT_Slice, 1}, {OT_Scale, 1}, {OT_Pooling, 1}, {OT_Clip, 1}, + {OT_Deconvolution, 1}, {OT_SqDiff, 1}, {OT_Reduction, 1}, {OT_Pad, 1}, {OT_Power, 1}, + {OT_TfSlice, 1}}; + +private: + DataFormat weightFormat; + std::vector<std::unique_ptr<tflite::BufferT>> tfliteModelBuffer; + std::vector<std::unique_ptr<tflite::OperatorCodeT>> tfliteOpSet; + std::vector<std::unique_ptr<tflite::OperatorT>> tfliteOperators; + std::vector<std::unique_ptr<tflite::TensorT>> tfliteTensors; + std::vector<int> inputs; + std::vector<int> outputs; + U32 tfliteOperatorIndex; + tflite::BuiltinOperator opCode; + int modelWeightOpNum; + std::string modelName; + + U32 boltOperatorIndex; + U32 boltOperatorInsertBefore; // 1 tflite operator -> (before + 1 + after) bolt operators + U32 boltOperatorInsertAfter; + std::map<std::string, int> boltOperatorNameMap; + std::vector<OperatorSpec> boltOperators; + std::vector<WeightSpec> boltSharedWeights; +}; +#endif diff --git a/model-tools/src/tflite/tflite_wrapper.cpp b/model_tools/src/tflite/tflite_wrapper.cpp similarity index 76% rename from model-tools/src/tflite/tflite_wrapper.cpp rename to model_tools/src/tflite/tflite_wrapper.cpp index 7700e18e..f0bb943a 100644 --- a/model-tools/src/tflite/tflite_wrapper.cpp +++ b/model_tools/src/tflite/tflite_wrapper.cpp @@ -1,25 +1,25 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
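+// tflite_converter is the model_tools entry point for TFLite models: it instantiates the
+// TfliteAdaptee defined in tflite_adaptee.h and runs adapt() to fill in the ModelSpec.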
- #include <string> #include "converter.h" #include "model_tools.h" #include "tflite_adaptee.h" -EE tflite_converter(std::string dir, std::string mfn, ModelSpec* ms) { - ModelAdaptee* ade = new TfliteAdaptee(); - EE ret = ade->adapt(dir, mfn, ms); +EE tflite_converter(std::string dir, std::string mfn, ModelSpec *ms) +{ + ModelAdaptee *ade = new TfliteAdaptee(); + EE ret = ade->adapt(dir, mfn, ms); delete ade; return ret; } diff --git a/model_tools/tools/CMakeLists.txt b/model_tools/tools/CMakeLists.txt new file mode 100644 index 00000000..bbd931d4 --- /dev/null +++ b/model_tools/tools/CMakeLists.txt @@ -0,0 +1,10 @@ +set_test_c_cxx_flags() + +if(BUILD_TEST OR USE_CAFFE OR USE_ONNX OR USE_TFLITE OR USE_TENSORFLOW) +model_tools_test(X2bolt "X2bolt/X2bolt.cpp") +install(TARGETS X2bolt + RUNTIME DESTINATION tools) +model_tools_test(post_training_quantization "quantization/post_training_quantization.cpp") +install(TARGETS post_training_quantization + RUNTIME DESTINATION tools) +endif(BUILD_TEST OR USE_CAFFE OR USE_ONNX OR USE_TFLITE OR USE_TENSORFLOW) diff --git a/model_tools/tools/X2bolt/X2bolt.cpp b/model_tools/tools/X2bolt/X2bolt.cpp new file mode 100644 index 00000000..916f5bb8 --- /dev/null +++ b/model_tools/tools/X2bolt/X2bolt.cpp @@ -0,0 +1,138 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <iostream> +#include <getopt.h> +#include "online_conversion.h" +#include "model_tools.h" +#include "model_serialize_deserialize.hpp" + +void print_X2bolt_usage() +{ + std::cout << "X2bolt(version:" << sg_boltVersion + << ") " + "converter usage: (<> must be filled in with exact value; [] is " + "optional)\n" + "./X2bolt -d <modelDirectory> -m <modelName> -i <inferencePrecision> -v -s -h " + "-r [removeOperatorNum]\n" + "Parameter description:\n" + "1. -d <modelDirectory>: The directory where your model is stored.\n" + "2. -m <modelName>: The name of your model. " + "Tips: If your model is trained from caffe, please ensure that the prototxt and " + "caffemodel have the same name, otherwise an error occurs.\n" + "3. -i <inferencePrecision>: The inference precision. Currently, you can only " + "choose one of " + "{FP32, FP16, PTQ}. PTQ produces the input for the post_training_quantization tool.\n" + "4. -r [removeOperatorNum]: The number of preprocessing operators to remove from the onnx model. " + "The default value is 0.\n" + "5. -v : X2bolt version information.\n" + "6. -s : Bolt model detail information.\n" + "7. -h : X2bolt help information.\n" + "Example: ./X2bolt -d /local/models/ -m resnet50 -i FP16\n" + "If model conversion is successful, you can find the resnet50_f16.bolt file in " + "/local/models. Otherwise, please check the usage information above.\n" + << std::endl; +} + +void print_version() +{ + std::cout << "Current model converter version is : " << sg_boltVersion << std::endl; +}
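+ +// The serialized file name below encodes the chosen precision: _f32.bolt for FP32, _f16.bolt
+// for FP16, and _ptq_input.bolt for PTQ, which the post_training_quantization tool consumes.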
+int main(int argc, char *argv[]) +{ + std::cout << "\nEnter './X2bolt --help' to get more usage information.\nEnter './X2bolt " + "--version' to get the version.\n\n"; + std::vector<std::string> lineArgs(argv, argv + argc); + for (std::string arg : lineArgs) { + if (arg == "--help" || arg == "-help" || arg == "--h" || arg == "-h") { + print_X2bolt_usage(); + return -1; + } else if (arg == "--version" || arg == "-version" || arg == "--v" || arg == "-v") { + print_version(); + return -1; + } + } + CHECK_REQUIREMENT(argc >= 4); + char *storagePath = (char *)" "; + char *modelName = (char *)" "; + char *inferPrecision = (char *)" "; + I32 removeProcessOpsNum = 0; + bool show_model_info = false; + + int option; + const char *optionstring = "d:m:i:r:s"; + while ((option = getopt(argc, argv, optionstring)) != -1) { + switch (option) { + case 'd': + std::cout << "option is -d , value is: " << optarg << std::endl; + storagePath = optarg; + break; + case 'm': + std::cout << "option is -m , value is: " << optarg << std::endl; + modelName = optarg; + break; + case 'i': + std::cout << "option is -i , value is: " << optarg << std::endl; + inferPrecision = optarg; + break; + case 'r': + std::cout << "option is -r [removeOperatorNum], value is: " << optarg << std::endl; + removeProcessOpsNum = atoi(optarg); + break; + case 's': + show_model_info = true; + break; + default: + std::cerr << "Unrecognized input option. Please check the parameters carefully." + << std::endl; + print_X2bolt_usage(); + return -1; + } + } + + void *onlineModel = OnlineModelConversion( + storagePath, modelName, inferPrecision, removeProcessOpsNum); + ModelSpec *ms = (ModelSpec *)onlineModel; + + std::string modelStorePath = std::string(storagePath) + "/" + std::string(modelName); + if (0) { +#if _USE_FP32 + } else if (std::string(inferPrecision).compare(std::string("PTQ")) == 0) { + modelStorePath += std::string("_ptq_input.bolt"); +#endif +#if _USE_FP16 + } else if (std::string(inferPrecision).compare(std::string("FP16")) == 0) { + modelStorePath += std::string("_f16.bolt"); +#endif +#if _USE_FP32 + } else if (std::string(inferPrecision).compare(std::string("FP32")) == 0) { + modelStorePath += std::string("_f32.bolt"); +#endif + } else { + std::cerr << "NOT SUPPORT THIS PRECISION " << inferPrecision << std::endl; + return -1; + } + CHECK_STATUS(serialize_model_to_file(ms, modelStorePath.c_str())); + OnlineModelReclaim(onlineModel); + if (show_model_info) { + ModelSpec resultMs; + CHECK_STATUS(deserialize_model_from_file(modelStorePath.c_str(), &resultMs)); + print_header(resultMs); + print_operator_tensor_relationship(resultMs); + print_weights(resultMs); + CHECK_STATUS(mt_destroy_model(&resultMs)); + } + std::cout << "Model Conversion Succeeded!" << std::endl; + return 0; +}
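A minimal sketch of how the two tools added in this diff chain together (the paths and model name are illustrative, not taken from the diff): X2bolt run with -i PTQ writes the _ptq_input.bolt intermediate, which post_training_quantization then consumes.

./X2bolt -d /local/models/ -m resnet50 -i PTQ
./post_training_quantization -p /local/models/resnet50_ptq_input.bolt -i INT8 -b true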
diff --git a/model-tools/tools/pytorch2caffe/README.md b/model_tools/tools/pytorch2caffe/README.md similarity index 100% rename from model-tools/tools/pytorch2caffe/README.md rename to model_tools/tools/pytorch2caffe/README.md diff --git a/model-tools/tools/pytorch2caffe/lenet.py b/model_tools/tools/pytorch2caffe/lenet.py similarity index 100% rename from model-tools/tools/pytorch2caffe/lenet.py rename to model_tools/tools/pytorch2caffe/lenet.py diff --git a/model_tools/tools/quantization/post_training_quantization.cpp b/model_tools/tools/quantization/post_training_quantization.cpp new file mode 100644 index 00000000..fb86d4d3 --- /dev/null +++ b/model_tools/tools/quantization/post_training_quantization.cpp @@ -0,0 +1,177 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <iostream> +#include <getopt.h> +#include "online_conversion.h" +#include "model_tools.h" +#include "model_quantization.h" + +void print_quantization_usage() +{ + std::cout << "post_training_quantization : " + "./post_training_quantization -p <modelPath>\n" + "Parameter description:\n" + "1. -p <modelPath>: Path to the input model. The suffix should be _ptq_input.bolt.\n" + "2. -i [inferencePrecision]: The inference precision. Currently, you can only " + "choose one of " + "{FP32, FP16, INT8}. Default is INT8.\n" + "3. -b [BatchNormFusion]: Whether to fuse convolution or FC with BN. Default is true.\n" + "4. -q [quantStorage]: Store the model in quantized form. You can choose one of " + "{FP16, INT8, MIX}. Default is MIX.\n" + "5. -c [clipValue]: To clip the input for gemm if clipValue > 0. The default " + "value is 0.\n" + "6. -s [scaleFileDirectory]: The directory of the scale file.\n" + "7. -V : Verbose mode.\n" + << std::endl; +} + +int main(int argc, char *argv[]) +{ + std::cout << "\nEnter './post_training_quantization --help' to get more usage information."
<< std::endl; + std::vector<std::string> lineArgs(argv, argv + argc); + for (std::string arg : lineArgs) { + if (arg == "--help" || arg == "-help" || arg == "--h" || arg == "-h") { + print_quantization_usage(); + return -1; + } + } + CHECK_REQUIREMENT(argc >= 2); + char *modelPath = nullptr; + char *inferPrecision = (char *)"INT8"; + bool fuseBN = true; + char *quantStorage = (char *)"NOQUANT"; + F32 clipVal = 0.0; + char *scaleFile = nullptr; + bool verbose = false; + + int option; + const char *optionstring = "p:i:b:q:c:s:V"; + while ((option = getopt(argc, argv, optionstring)) != -1) { + switch (option) { + case 'p': + std::cout << "option is -p , value is: " << optarg << std::endl; + modelPath = optarg; + break; + case 'i': + std::cout << "option is -i [inferencePrecision], value is: " << optarg << std::endl; + inferPrecision = optarg; + break; + case 'b': + std::cout << "option is -b [BatchNormFusion], value is: " << optarg << std::endl; + fuseBN = (std::string(optarg).compare("false") == 0) ? false : true; + break; + case 'q': + std::cout << "option is -q [quantStorage], value is: " << optarg << std::endl; + quantStorage = optarg; + break; + case 'c': + std::cout << "option is -c [clipValue], value is: " << optarg << std::endl; + clipVal = atof(optarg); + break; + case 's': + std::cout << "option is -s [scaleFileDirectory], value is: " << optarg << std::endl; + scaleFile = optarg; + break; + case 'V': + verbose = true; + break; + default: + std::cout << "Unrecognized input option. Please check the parameters carefully." + << std::endl; + print_quantization_usage(); + return -1; + } + } + ModelSpec ms; + std::string storePath = std::string(modelPath); + CHECK_STATUS(deserialize_model_from_file(storePath.c_str(), &ms)); + if (ms.dt != DT_F32 || std::string::npos == storePath.find("ptq_input.bolt")) { + CHECK_STATUS(mt_destroy_model(&ms)); + UNI_ERROR_LOG("Input model does not match.
Please produce it with: ./X2bolt -i PTQ\n"); + return 0; + } + auto relationNum = ms.num_op_tensor_entries; + auto relationPtr = ms.op_relationship_entries; + ms.num_op_tensor_entries = 0; + ms.op_relationship_entries = nullptr; +#ifdef _DEBUG + print_ms(ms); +#endif + + DataConvertType converterMode = F32_to_F16; + if (inferPrecision == std::string("INT8")) { + converterMode = F32_to_F16; + } else if (inferPrecision == std::string("HIDDEN")) { + converterMode = F32_to_F16; + } else if (inferPrecision == std::string("FP16")) { + converterMode = F32_to_F16; + } else if (inferPrecision == std::string("FP32")) { + converterMode = F32_to_F32; + } else { + UNI_ERROR_LOG("Unknown converter data precision : %s", inferPrecision); + } + + ModelSpecOptimizer msOptimizer; + msOptimizer.suggest_for_ptq(inferPrecision, fuseBN, clipVal); + msOptimizer.optimize(&ms); + + ModelSpec *targetMs = new ModelSpec(); + CHECK_STATUS(mt_create_model(targetMs)); + CHECK_STATUS(ms_datatype_converter(&ms, targetMs, converterMode, quantStorage)); + if ("INT8" == std::string(inferPrecision)) { + targetMs->dt = DT_F16_8Q; + } + + if (nullptr != scaleFile) { + add_scale_from_file(targetMs, scaleFile); + } + + auto suffixPos = storePath.find("ptq_input.bolt"); + storePath.erase(suffixPos, 14); + if (0) { +#if _USE_INT8 + } else if (std::string(inferPrecision).compare(std::string("INT8")) == 0) { + storePath += std::string("int8_q.bolt"); +#endif +#if _USE_FP16 + } else if (std::string(inferPrecision).compare(std::string("FP16")) == 0) { + storePath += std::string("f16_q.bolt"); +#endif +#if _USE_FP32 + } else if (std::string(inferPrecision).compare(std::string("FP32")) == 0) { + storePath += std::string("f32_q.bolt"); +#endif + } else { + std::cerr << "NOT SUPPORT THIS PRECISION " << inferPrecision << std::endl; + return -1; + } + + CHECK_STATUS(serialize_model_to_file(targetMs, storePath.c_str())); + CHECK_STATUS(mt_destroy_model(targetMs)); + delete targetMs; + ms.num_op_tensor_entries = relationNum; + ms.op_relationship_entries = relationPtr; + CHECK_STATUS(mt_destroy_model(&ms)); + + if (verbose) { + ModelSpec resultMs; + CHECK_STATUS(deserialize_model_from_file(storePath.c_str(), &resultMs)); + print_header(resultMs); + print_operator_tensor_relationship(resultMs); + print_weights(resultMs); + CHECK_STATUS(mt_destroy_model(&resultMs)); + } + std::cout << "Post Training Quantization Succeeded! 
" << std::endl; + return 0; +} \ No newline at end of file diff --git a/model-tools/tools/tensorflow2caffe/Caffe/__init__.py b/model_tools/tools/tensorflow2caffe/Caffe/__init__.py similarity index 100% rename from model-tools/tools/tensorflow2caffe/Caffe/__init__.py rename to model_tools/tools/tensorflow2caffe/Caffe/__init__.py diff --git a/model-tools/tools/tensorflow2caffe/Caffe/caffe_net.py b/model_tools/tools/tensorflow2caffe/Caffe/caffe_net.py similarity index 97% rename from model-tools/tools/tensorflow2caffe/Caffe/caffe_net.py rename to model_tools/tools/tensorflow2caffe/Caffe/caffe_net.py index 727fd3fb..43ec1cc6 100644 --- a/model-tools/tools/tensorflow2caffe/Caffe/caffe_net.py +++ b/model_tools/tools/tensorflow2caffe/Caffe/caffe_net.py @@ -21,6 +21,9 @@ def layer_index(self, layer_name): return i return -1 + def add_output(self, outputs): + for item in outputs: + self.net.output.append(item) def add_layer(self, layer_params, before='', after=''): if (not self.add_layer_set): diff --git a/model-tools/tools/tensorflow2caffe/Caffe/layer_parameter.py b/model_tools/tools/tensorflow2caffe/Caffe/layer_parameter.py similarity index 91% rename from model-tools/tools/tensorflow2caffe/Caffe/layer_parameter.py rename to model_tools/tools/tensorflow2caffe/Caffe/layer_parameter.py index 22c4810a..f46869f8 100644 --- a/model-tools/tools/tensorflow2caffe/Caffe/layer_parameter.py +++ b/model_tools/tools/tensorflow2caffe/Caffe/layer_parameter.py @@ -63,6 +63,12 @@ def reshape_param(self, shape, axis=0, num_axes=-1): reshape_param.num_axes = num_axes self.layerParameter.reshape_param.CopyFrom(reshape_param) + def tile_param(self, axis=1, tiles=2): + tile_param = pb.TileParameter() + tile_param.axis = axis + tile_param.tiles = tiles + self.layerParameter.tile_param.CopyFrom(tile_param) + def slice_param(self, axis, slice_point): slice_param = pb.SliceParameter() slice_param.axis = axis @@ -147,32 +153,11 @@ def set_params_by_dict(self,dic): def copy_from(self,layer_param): pass - # caffe.proto - # optional TransposeParameter transpose_param = 100010; - # message TransposeParameter { - # optional BlobShape dim = 1; - # } - # - # prototxt example - # layer { - # name: "layer0_out_att_self_query_t" - # type: "Transpose" - # bottom: "layer0_out_att_self_query_r" - # top: "layer0_out_att_self_query_t" - # transpose_param { - # dim { - # dim: 0 - # dim: 2 - # dim: 1 - # dim: 3 - # } - # } - # } - def transpose_param(self, dim): - transpose_param = pb.TransposeParameter() + def permute_param(self, dim): + permute_param = pb.PermuteParameter() for i in dim: - transpose_param.dim.dim.append(i) - self.layerParameter.transpose_param.CopyFrom(transpose_param) + permute_param.order.append(i) + self.layerParameter.permute_param.CopyFrom(permute_param) def convert_data_type(self, type_str): if (type_str == "FLOAT32"): @@ -252,29 +237,12 @@ def memory_param(self, shape, data_type): preallocated_memory_param.shape.dim.append(i) self.layerParameter.preallocated_memory_param.CopyFrom(preallocated_memory_param) - # caffe.proto - # optional MultiplyParameter multiply_param = 100013; - # message MultiplyParameter { - # optional float scale = 1 [default = 1]; - # optional float bias = 2 [default = 0]; - # } - # - # prototxt example - # layer { - # name: "layer0_out_att_self_qks" - # type: "Multiply" - # bottom: "layer0_out_att_self_qk" - # top: "layer0_out_att_self_qks" - # multiply_param { - # scale: 0.125 - # bias: 0 - # } - # } - def multiply_param(self, scale, bias): - multiply_param = pb.MultiplyParameter() - 
multiply_param.scale = scale - multiply_param.bias = bias - self.layerParameter.multiply_param.CopyFrom(multiply_param) + def power_param(self, scale, shift, power): + power_param = pb.PowerParameter() + power_param.scale = scale + power_param.shift = shift + power_param.power = power + self.layerParameter.power_param.CopyFrom(power_param) # caffe.proto # optional AttentionParameter attention_param = 100014; @@ -490,8 +458,8 @@ def copy_param(self, src_batch_stride, src_stride, src_offset, # optional uint32 num_output = 1; # optional int32 steps = 2 [default = -1]; # optional int32 num_proj = 3 [default = 0]; - # optional float zoneout_cell = 4 [default = 0]; - # optional float zoneout_output = 5 [default = 0]; + # optional float zoneoutCell = 4 [default = 0]; + # optional float zoneoutOutput = 5 [default = 0]; # } # # prototxt example @@ -504,17 +472,17 @@ def copy_param(self, src_batch_stride, src_stride, src_offset, # num_output: 1024 # steps: -1 # num_proj: 0 - # zoneout_cell: 0 - # zoneout_output: 0 + # zoneoutCell: 0 + # zoneoutOutput: 0 # } # } - def lstm_param(self, num_output, steps, num_proj, zoneout_cell, zoneout_output): + def lstm_param(self, num_output, steps, num_proj, zoneoutCell, zoneoutOutput): lstm_param = pb.LSTMParameter() lstm_param.num_output = num_output lstm_param.steps = steps lstm_param.num_proj = num_proj - lstm_param.zoneout_cell = zoneout_cell - lstm_param.zoneout_output = zoneout_output + lstm_param.zoneoutCell = zoneoutCell + lstm_param.zoneoutOutput = zoneoutOutput self.layerParameter.lstm_param.CopyFrom(lstm_param) # caffe.proto @@ -680,10 +648,10 @@ def relative_shift_param(self, axis, shift_length): def padding_param(self, shape, value=None): padding_param = pb.PaddingParameter() for i in np.array(shape).flatten(): - padding_param.shape.dim.append(i) + padding_param.shape.append(i) if (value is not None): for i in np.array(value).flatten(): - padding_param.value.dim.append(i) + padding_param.value.append(i) self.layerParameter.padding_param.CopyFrom(padding_param) # caffe.proto diff --git a/model-tools/tools/tensorflow2caffe/README.md b/model_tools/tools/tensorflow2caffe/README.md similarity index 97% rename from model-tools/tools/tensorflow2caffe/README.md rename to model_tools/tools/tensorflow2caffe/README.md index 59f8885d..9d492159 100644 --- a/model-tools/tools/tensorflow2caffe/README.md +++ b/model_tools/tools/tensorflow2caffe/README.md @@ -9,7 +9,7 @@ If you want to add your layers, you can refer section 4 step by step. 1. tensorflow python environment -2. caffe python environment, you need to compile the model-tools project to generate the caffe_pbs2.py in the [Caffe](./Caffe) directory. +2. caffe python environment, you need to compile the model_tools project to generate the caffe_pbs2.py in the [Caffe](./Caffe) directory. ## How to use? 
diff --git a/model_tools/tools/tensorflow2caffe/asr/convolution_transformer_params.py b/model_tools/tools/tensorflow2caffe/asr/convolution_transformer_params.py new file mode 100644 index 00000000..3b77c038 --- /dev/null +++ b/model_tools/tools/tensorflow2caffe/asr/convolution_transformer_params.py @@ -0,0 +1,245 @@ +#!/usr/local/bin/python +# -*- coding: utf-8 -*- + + +base_params = { + "sequence.max_length": 15, + "sequence.num_units": 128, + + "random_seed": 0, + "use_horovod": False, + "num_gpus": 8, + "batch_size_per_gpu": [[300, 400, 500, 600, 700, 800, 900, 1000, 1200, 1400], + [120, 90, 72, 60, 50, 44, 38, 20, 18, 18, 18]], +# "iter_size": 4, +# [16, 10, 8, 6, 6, 4]], + "max_steps": 5000 * 999999, # 3200 is estimated steps per epoch + # "num_epochs": 200, + + "save_summaries_steps": 100, + "print_loss_steps": 50, + "print_samples_steps": 5000, + "eval_steps": 5000, + "save_checkpoint_steps": 5000, + "num_checkpoints": 6, + #"logdir": output_dir, + + #"optimizer": NovoGrad, + "optimizer_params": { + "beta1": 0.95, + "beta2": 0.99, + "epsilon": 1e-08, + "weight_decay": 1e-04, + "grad_averaging": False, + "exclude_from_weight_decay": ["LayerNorm", "layer_norm", "bias", "bn/gamm", "bn/beta"], + }, + #"lr_policy": poly_decay, + "lr_policy_params": { + "learning_rate": 2.4e-2, + "min_lr": 1.2e-5, + "power": 2., + "warmup_steps": 5000 * 2, + "begin_decay_at": 5000 * 2, + "decay_steps": 5000 * 85, + }, + "larc_params": { + "larc_eta": 0.001, + }, + #"dtype": tf.float32, +# "dtype": "mixed", +# "loss_scaling": "Backoff", + # weight decay +# "regularizer": tf.contrib.layers.l2_regularizer, +# "regularizer_params": { +# 'scale': 0.0005 +# }, + #"initializer": tf.contrib.layers.xavier_initializer, + + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + "encoder": "ICRNNEncoder", + "encoder_params": { +# "use_conv_mask": True, + "net_blocks": [ + {"conv_type": "conv2d", + "conv_layers": [ + { + "states": [1, 1, 128, 1], + "kernel_size": [3, 7], "stride": [1, 2], + "num_channels": 32, "padding": "SAME" + }, + { + "states": [1, 1, 64, 32], + "kernel_size": [3, 5], "stride": [2, 2], + "num_channels": 32, "padding": "SAME" + }, + ], + "num_rnn_layers": 0}, + {"conv_type": "conv1d", + "conv_layers": [ + { + "kernel_size": [1], "stride": [1], + "num_channels": 384, "padding": "SAME", + "activation_fn": None, + }, + ], + "num_rnn_layers": 0, + "transformer_block_params": { + "n_layer": 2, + "d_model": 384, + "n_head": 6, + "d_head": 64, + "d_inner": 384 * 4, + "dropout_keep_prob": 0.9, + "att_trunc_len": [5, 7], + "norm_type": 'bn', + "use_xl_pos_enc": False, + }, + }, + {"conv_type": "conv1d", + "conv_layers": [ + { + "states": [1, 1, 384], + #"kernel_size": [3, 7], "stride": [1, 2], + "kernel_size": [3], "stride": [1], + "num_channels": 1024, "padding": "SAME" + }, + { + "states": [1, 1, 1024], + "kernel_size": [3], "stride": [2], + "num_channels": 1280, "padding": "SAME" + }, + { + "kernel_size": [1], "stride": [1], + "num_channels": 512, "padding": "SAME", + "activation_fn": None, + }, + ], + "num_rnn_layers": 0, + "transformer_block_params": { + "n_layer": 2, + "d_model": 512, + "n_head": 8, + "d_head": 64, + "d_inner": 512 * 4, + "dropout_keep_prob": 0.9, + "att_trunc_len": [7, 9], + "norm_type": 'bn', + "use_xl_pos_enc": False, + }, + }, + {"conv_type": "conv1d", + "conv_layers": [ + { + "states": [1, 1, 512], + "kernel_size": [3], "stride": [1], + "num_channels": 1024, "padding": "SAME" + }, + { + "states": 
[1, 1, 1024], + "kernel_size": [3], "stride": [2], + "num_channels": 1280, "padding": "SAME" + }, + { + "kernel_size": [1], "stride": [1], + "num_channels": 512, "padding": "SAME", + "activation_fn": None, + }, + ], + "num_rnn_layers": 0, + "transformer_block_params": { + "n_layer": 4, + "d_model": 512, + "n_head": 8, + "d_head": 64, + "d_inner": 512 * 4, + "dropout_keep_prob": 0.9, + "att_trunc_len": [9, 15, 23, 31], + "norm_type": 'bn', + "output_norm_type": 'ln', + "use_xl_pos_enc": False, + }, + }, + ], + "rnn_cell_dim": 0, + + "use_cudnn_rnn": False, + #"rnn_type": "cudnn_gru", + "rnn_type": "omni_lstm", + "rnn_unidirectional": True, + + "row_conv": False, +# "row_conv_width": 3, + "output_fc": False, + + "n_hidden": 512, + + "dropout_keep_prob": 0.85, + "activation_fn": "relu", + # "data_format": "BCFT", # "channels_first",'BCTF', 'BTFC', 'BCFT', 'BFTC' + }, + + "decoder": "TransducerDecoder", + "decoder_params": { + "use_sa_pred_net": True, + "blank_last": True, + "prepend_start_token": True, + "start_token_id": 1, + "pred_net_params": { +# 'emb_drop_word': True, + 'emb_keep_prob': 0.9, + "emb_size": 128, + 'mask_label_prob': 0.1, + 'mask_label_id': 3, + 'norm_inputs': False, + "transformer_block_params": { + "n_layer": 4, + "d_model": 512, + "n_head": 8, + "d_head": 64, + "d_inner": 512 * 4, + "dropout_keep_prob": 0.9, + "input_keep_prob": 1.0, + "input_project": True, + "att_trunc_len": [3, 5, 7, 9], + "norm_type": 'bn', + "output_norm_type": 'ln', + "use_xl_pos_enc": False, + }, + }, + + "joint_net_params": { + "hidden_units": 512, + "activation_fn": "relu", + "tie_embedding": False, + }, + }, + + "data_layer": "Speech2TextDataLayer", + "data_layer_params": { + #"data_dir": data_dir, + "bpe": False, + "text_type": 'zh_sep', + #"vocab_file": transcript_data_dir + "/pinyin/pinyin_vocab.txt", + + "feat_pad_value": -4.0, + + #"features_mean_path": transcript_data_dir + "/feat_stats_128_gain_mel/feat_mean.npy", + #"features_std_dev_path": transcript_data_dir + "/feat_stats_128_gain_mel/feat_std.npy", + + "num_audio_features": 128, + "input_type": "logfbank", + "norm_per_feature": True, + "window": "hanning", + "precompute_mel_basis": True, + "sample_freq": 16000, + "gain": 1.0/32767.0, + "pad_to": 16, +# "dither": 1e-5, + "backend": "librosa" + }, + + #"loss": FakeTransducerLoss, + "loss_params": {}, +} diff --git a/model_tools/tools/tensorflow2caffe/asr/convolution_transformer_params_v2.py b/model_tools/tools/tensorflow2caffe/asr/convolution_transformer_params_v2.py new file mode 100644 index 00000000..ff205b90 --- /dev/null +++ b/model_tools/tools/tensorflow2caffe/asr/convolution_transformer_params_v2.py @@ -0,0 +1,245 @@ +#!/usr/local/bin/python +# -*- coding: utf-8 -*- + + +base_params = { + "sequence.max_length": 15, + "sequence.num_units": 128, + + "random_seed": 0, + "use_horovod": False, + "num_gpus": 8, + "batch_size_per_gpu": [[300, 400, 500, 600, 700, 800, 900, 1000, 1200, 1400], + [120, 90, 72, 60, 50, 44, 38, 20, 18, 18, 18]], +# "iter_size": 4, +# [16, 10, 8, 6, 6, 4]], + "max_steps": 5000 * 999999, # 3200 is estimated steps per epoch + # "num_epochs": 200, + + "save_summaries_steps": 100, + "print_loss_steps": 50, + "print_samples_steps": 5000, + "eval_steps": 5000, + "save_checkpoint_steps": 5000, + "num_checkpoints": 6, + #"logdir": output_dir, + + #"optimizer": NovoGrad, + "optimizer_params": { + "beta1": 0.95, + "beta2": 0.99, + "epsilon": 1e-08, + "weight_decay": 1e-04, + "grad_averaging": False, + "exclude_from_weight_decay": ["LayerNorm", "layer_norm", "bias", 
"bn/gamm", "bn/beta"], + }, + #"lr_policy": poly_decay, + "lr_policy_params": { + "learning_rate": 2.4e-2, + "min_lr": 1.2e-5, + "power": 2., + "warmup_steps": 5000 * 2, + "begin_decay_at": 5000 * 2, + "decay_steps": 5000 * 85, + }, + "larc_params": { + "larc_eta": 0.001, + }, + #"dtype": tf.float32, +# "dtype": "mixed", +# "loss_scaling": "Backoff", + # weight decay +# "regularizer": tf.contrib.layers.l2_regularizer, +# "regularizer_params": { +# 'scale': 0.0005 +# }, + #"initializer": tf.contrib.layers.xavier_initializer, + + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + "encoder": "ICRNNEncoder", + "encoder_params": { +# "use_conv_mask": True, + "net_blocks": [ + {"conv_type": "conv2d", + "conv_layers": [ + { + "states": [1, 1, 128, 1], + "kernel_size": [3, 7], "stride": [1, 2], + "num_channels": 32, "padding": "SAME" + }, + { + "states": [1, 1, 64, 32], + "kernel_size": [3, 5], "stride": [2, 2], + "num_channels": 32, "padding": "SAME" + }, + ], + "num_rnn_layers": 0}, + {"conv_type": "conv1d", + "conv_layers": [ + { + "kernel_size": [1], "stride": [1], + "num_channels": 384, "padding": "SAME", + "activation_fn": None, + }, + ], + "num_rnn_layers": 0, + "transformer_block_params": { + "n_layer": 2, + "d_model": 384, + "n_head": 6, + "d_head": 64, + "d_inner": 384 * 4, + "dropout_keep_prob": 0.9, + "att_trunc_len": [5, 7], + "norm_type": 'bn', + "use_xl_pos_enc": False, + }, + }, + {"conv_type": "conv1d", + "conv_layers": [ + { + "states": [1, 1, 384], + #"kernel_size": [3, 7], "stride": [1, 2], + "kernel_size": [3], "stride": [1], + "num_channels": 1024, "padding": "SAME" + }, + { + "states": [1, 1, 1024], + "kernel_size": [3], "stride": [2], + "num_channels": 1280, "padding": "SAME" + }, + { + "kernel_size": [1], "stride": [1], + "num_channels": 512, "padding": "SAME", + "activation_fn": None, + }, + ], + "num_rnn_layers": 0, + "transformer_block_params": { + "n_layer": 2, + "d_model": 512, + "n_head": 8, + "d_head": 64, + "d_inner": 512 * 4, + "dropout_keep_prob": 0.9, + "att_trunc_len": [7, 9], + "norm_type": 'bn', + "use_xl_pos_enc": False, + }, + }, + {"conv_type": "conv1d", + "conv_layers": [ + { + "states": [1, 1, 512], + "kernel_size": [3], "stride": [1], + "num_channels": 1024, "padding": "SAME" + }, + { + "states": [1, 1, 1024], + "kernel_size": [3], "stride": [2], + "num_channels": 1280, "padding": "SAME" + }, + { + "kernel_size": [1], "stride": [1], + "num_channels": 512, "padding": "SAME", + "activation_fn": None, + }, + ], + "num_rnn_layers": 0, + "transformer_block_params": { + "n_layer": 8, + "d_model": 512, + "n_head": 8, + "d_head": 64, + "d_inner": 512 * 4, + "dropout_keep_prob": 0.9, + "att_trunc_len": [15, 15, 23, 23, 31, 31, 31, 31], + "norm_type": 'bn', + "output_norm_type": 'ln', + "use_xl_pos_enc": False, + }, + }, + ], + "rnn_cell_dim": 0, + + "use_cudnn_rnn": False, + #"rnn_type": "cudnn_gru", + "rnn_type": "omni_lstm", + "rnn_unidirectional": True, + + "row_conv": False, +# "row_conv_width": 3, + "output_fc": False, + + "n_hidden": 512, + + "dropout_keep_prob": 0.85, + "activation_fn": "relu", + # "data_format": "BCFT", # "channels_first",'BCTF', 'BTFC', 'BCFT', 'BFTC' + }, + + "decoder": "TransducerDecoder", + "decoder_params": { + "use_sa_pred_net": True, + "blank_last": True, + "prepend_start_token": True, + "start_token_id": 1, + "pred_net_params": { +# 'emb_drop_word': True, + 'emb_keep_prob': 0.9, + "emb_size": 128, + 'mask_label_prob': 0.1, + 'mask_label_id': 
3,
+            'norm_inputs': False,
+            "transformer_block_params": {
+                "n_layer": 4,
+                "d_model": 512,
+                "n_head": 8,
+                "d_head": 64,
+                "d_inner": 512 * 4,
+                "dropout_keep_prob": 0.9,
+                "input_keep_prob": 1.0,
+                "input_project": True,
+                "att_trunc_len": [7, 7, 9, 9],
+                "norm_type": 'bn',
+                "output_norm_type": 'ln',
+                "use_xl_pos_enc": False,
+            },
+        },
+
+        "joint_net_params": {
+            "hidden_units": 512,
+            "activation_fn": "relu",
+            "tie_embedding": False,
+        },
+    },
+
+    "data_layer": "Speech2TextDataLayer",
+    "data_layer_params": {
+        #"data_dir": data_dir,
+        "bpe": False,
+        "text_type": 'zh_sep',
+        #"vocab_file": transcript_data_dir + "/pinyin/pinyin_vocab.txt",
+
+        "feat_pad_value": -4.0,
+
+        #"features_mean_path": transcript_data_dir + "/feat_stats_128_gain_mel/feat_mean.npy",
+        #"features_std_dev_path": transcript_data_dir + "/feat_stats_128_gain_mel/feat_std.npy",
+
+        "num_audio_features": 128,
+        "input_type": "logfbank",
+        "norm_per_feature": True,
+        "window": "hanning",
+        "precompute_mel_basis": True,
+        "sample_freq": 16000,
+        "gain": 1.0/32767.0,
+        "pad_to": 16,
+        # "dither": 1e-5,
+        "backend": "librosa"
+    },
+
+    #"loss": FakeTransducerLoss,
+    "loss_params": {},
+}
diff --git a/model_tools/tools/tensorflow2caffe/asr/tensorflow2caffe_convolution_transformer.py b/model_tools/tools/tensorflow2caffe/asr/tensorflow2caffe_convolution_transformer.py
new file mode 100644
index 00000000..12114ac4
--- /dev/null
+++ b/model_tools/tools/tensorflow2caffe/asr/tensorflow2caffe_convolution_transformer.py
@@ -0,0 +1,1297 @@
+#!/usr/local/bin/python
+# -*- coding: utf-8 -*-
+
+import math
+import numpy as np
+import sys
+sys.path.append("../")
+from tensorflow2caffe import Tensorflow2Caffe
+from convolution_transformer_params import base_params
+
+class Tensorflow2CaffeConvolutionTransformer(Tensorflow2Caffe):
+    def __init__(self,
+            tensorflow_model_path, caffe_model_path_prefix, caffe_model_name,
+            nchwc8=True, first_frame=True,
+            check=False, calc=False, quantization=False):
+        Tensorflow2Caffe.__init__(self, tensorflow_model_path,
+            caffe_model_path_prefix, caffe_model_name, check, calc, quantization)
+        self.base_params = base_params
+        self.params = {}
+        self.mode = "infer"
+        # nchwc8 selects the NCHWc8-tiled convolution path; first_frame controls
+        # whether streaming states start as zeros or are reloaded from disk
+        self.nchwc8 = nchwc8
+        self.first_frame = first_frame
+        self.state_data_path = "./data"
+        self.save_state = True
+
+    def layer_normed_fc(self, input_name, activation_fn, output_name_prefix, scope_id):
+        fc_name = output_name_prefix + "_fc"
+        self.extract_dense(input_name, fc_name, scope_id, scope_name="fully_connected")
+        ln_name = output_name_prefix + "_ln"
+        self.extract_layer_norm(fc_name, ln_name, scope_id, ["LayerNorm", "gamma", "beta"])
+        result_name = ""
+        if (activation_fn == "relu"):
+            relu_name = output_name_prefix + "_relu"
+            result_name = self.add_relu(ln_name, relu_name)
+        else:
+            print("[ERROR] unsupported activation function %s" % (activation_fn))
+            exit(1)
+        return result_name
+
+    def positionwise_FF(self, input, d_model, d_inner, dropout, kernel_initializer,
+                        scope='ff', is_training=True, norm_type='ln',
+                        scope_id=-1, output_name_prefix=""):
+        self.scopes[scope_id] = scope
+        output = input
+        if norm_type == 'bn':
+            output = self.extract_batch_norm(output, output_name_prefix+"_FF_bn", scope_id+1,
+                data_format="NCHW", axis=-1,
+                layer_names=["batch_normalization", "moving_mean", "moving_variance"])
+        elif norm_type == 'pre_ln':
+            output = self.extract_layer_norm(output, output_name_prefix+"_FF_ln", scope_id+1, ["LayerNorm", "gamma", "beta"])
+        self.add_quantization(scope_id+1, "quant_ffn_input", output)
+        output = self.extract_dense(output, output_name_prefix+"_FF_fc1", scope_id+1, scope_name="layer_1")
+        output = self.add_relu(output, output+"_FF_relu")
+        self.add_quantization(scope_id+1, "quant_ffn_middle", output)
+        output = self.extract_dense(output, output_name_prefix+"_FF_fc2", scope_id+1, scope_name="layer_2")
+        output = self.add_sum([output, input], output_name=output_name_prefix+"_FF_sum")
+        if norm_type == 'ln':
+            output = self.extract_layer_norm(output, output_name_prefix+"_FF_ln", scope_id+1, ["LayerNorm", "gamma", "beta"])
+        else:
+            assert norm_type in ['bn', 'pre_ln']
+        return output
+
+    def group_norm(self, x, scope_id, data_format, group_num=32):
+        if data_format == 'channels_last':
+            x = self.add_transpose(x, x+"_t1", [0, 3, 1, 2])
+        x = self.extract_group_norm(x, group_num, x+"_gn", scope_id, data_format="NCHW", layer_names=None)
+        output = self.add_transpose(x, x+"_t2", [0, 2, 3, 1])
+        return output
+
+    def neg_slice(self, input_name, axis, length, output_name_prefix):
+        other_name = output_name_prefix + "_other"
+        output_name = output_name_prefix + "_neg_slice"
+        self.add_slice(input_name, [other_name, output_name], axis=axis, slice_point=[-length])
+        return output_name
+
+    def row_conv(self, name, input_layer, batch, channels, width, activation_fn,
+                 data_format, norm_type='batch_norm', gn_group_num=0,
+                 scope_id=-1, output_name_prefix=""):
+        print("[ERROR] currently not support row_conv")
+        exit(1)
+        if width < 2:
+            return input_layer
+
+        if data_format == 'channels_last':
+            x = self.add_reshape(input_layer, [batch, -1, 1, channels], output_name_prefix+"_row_conv_r")
+            x = self.add_transpose(x, [0, 3, 1, 2], output_name_prefix+"_row_conv_t")  # B C T
+        else:
+            x = self.add_transpose(input_layer, [0, 2, 1], output_name_prefix+"_row_conv_t")  # B C T
+            x = self.add_reshape(x, [batch, channels, -1, 1], output_name_prefix+"_row_conv_r")
+        y = self.extract_convolution(x, output_name_prefix+"_row_conv", scope_id+1,
+                                     channels, [width, 1], [1, 1], [(width-1)//2, (width-1)//2, 0, 0],
+                                     data_format="NCHW", weight_format="NHWC",
+                                     dilation=1, groups=channels, layer_names=None)
+        if norm_type != 'batch_norm':
+            y = self.add_nchwc8_nchw(y, output_name_prefix+"_nchwc8_nchw")
+        if norm_type == 'batch_norm':
+            bn = self.extract_batch_norm(y, output_name_prefix+"_row_conv_bn", scope_id+1)
+            bn = self.add_nchwc8_nchw(bn, output_name_prefix+"_nchwc8_nchw")
+        elif norm_type == 'group_norm':
+            assert data_format == 'channels_last'
+            bn = self.group_norm(y, scope_id+1, data_format)
+        elif norm_type == 'layer_norm':
+            assert data_format == 'channels_last'
+            bn = self.extract_layer_norm(y, output_name_prefix+"_row_conv_ln", scope_id+1)
+        else:
+            assert norm_type == 'skip'
+            bn = y
+        if (activation_fn == "relu"):
+            relu_name = output_name_prefix + "_relu"
+            output = self.add_relu(bn, relu_name)
+        else:
+            print("[ERROR] unsupported activation function %s" % (activation_fn))
+            exit(1)
+        output = self.add_transpose(output, [0, 2, 3, 1], output_name_prefix+"_t")
+        output = self.add_reshape(output, [batch, -1, channels], output_name_prefix+"_r")
+        return output
+
+    def conv_proj_res(self, layer_type, name, inputs, filters, proj_filters, kernel_size, activation_fn, strides, padding, data_format,
+                      dilation=1, norm_type='batch_norm',
+                      scope_id=-1, output_name_prefix=""):
+        print("[ERROR] currently not support conv_proj_res")
+        exit(1)
+        self.scopes[scope_id] = name
+        assert norm_type == 'batch_norm'
+        output = self.extract_batch_norm(inputs, output_name_prefix+"_bn", scope_id+1)
+        assert layer_type == 
'conv1d' + assert activation_fn is not None + output = self.extract_convolution(output, output_name_prefix+"_row_conv", scope_id+1, + kernel_size[0], kernel_size[2:4], strides, padding, + data_format="NCHW", weight_format="NHWC", + dilation=dilation, groups=1, layer_names=None) + if (activation_fn == "relu"): + relu_name = output_name_prefix + "_relu" + output = self.add_relu(output, relu_name) + else: + print("[ERROR] unsupported activation function" % (activation_fn)) + exit(1) + output = self.extract_convolution(output, proj_filters, [1], [1], padding, + 1, 1, output_name_prefix+"_conv") + output = self.extract_convolution(output, output_name_prefix+"_row_conv", scope_id+1, + proj_filters, [1, 1], [1, 1], padding, + data_format="NCHW", weight_format="NHWC", + dilation=1, groups=1, layer_names=None) + output = self.add_nchwc8_nchw(output) + output = self.add_sum([output, inputs], output_name_prefix+"_conv_proj_res") + return output + + def conv_bn_actv(self, layer_type, name, inputs, filters, kernel_size, activation_fn, + strides, padding, data_format, + dilation=1, norm_type='batch_norm', gn_group_num=0, + scope_id=-1, output_name_prefix=""): + groups = 1 + if layer_type == 'sep_conv1d': + groups = filters + if data_format == "channels_last": + inputs = self.add_transpose(inputs, output_name_prefix+"_pre_t", [0, 3, 1, 2]) + self.add_quantization(scope_id, "quant_"+name, inputs) + conv = self.extract_convolution(inputs, output_name_prefix+"_conv", scope_id, + filters, kernel_size, strides, padding, + data_format="NCHW", weight_format="NHWC", + dilation=dilation, groups=groups, layer_names=[name, "kernel", "bias"]) + if norm_type != 'batch_norm': + conv = self.add_nchwc8_nchw(conv, output_name_prefix+"_nchwc8_nchw") + squeeze = False + if "conv1d" in layer_type and norm_type == 'batch_norm': + #axis = 1 if data_format == 'channels_last' else 2 + ## NWC --> NHWC + #conv = self.add_expand_dims(conv, axis=axis, output_name=conv+"_expand") + squeeze = True + + if norm_type == 'skip': + bn = conv + elif norm_type == 'group_norm': + print("[ERROR] currently not support online group norm") + exit(1) + elif norm_type == 'layer_norm': + assert data_format == 'channels_last' + bn = self.extract_layer_norm(conv, output_name_prefix+"_ln", scope_id+1, ["LayerNorm", "gamma", "beta"]) + elif norm_type == 'online_batch_norm': + print("[ERROR] currently not support online batch norm") + exit(1) + else: + assert norm_type == 'batch_norm' + bn = self.extract_batch_norm(conv, output_name_prefix+"_bn", scope_id+1) + bn = self.add_nchwc8_nchw(bn, output_name_prefix+"_nchwc8_nchw") + if data_format == "channels_last": + bn = self.add_transpose(bn, output_name_prefix+"_post_t", [0, 2, 3, 1]) + if squeeze: + bn = self.add_squeeze(bn, axis=2, output_name=bn+"_squeeze") + if (activation_fn is None): + output = bn + elif (activation_fn == "relu"): + relu_name = output_name_prefix + "_relu" + output = self.add_relu(bn, relu_name) + else: + print("[ERROR] unsupported activation function %s" % (activation_fn)) + exit(1) + return output + + def conv_layer_wrapper(self, inputs, conv_type, conv_layer_params, + gn_group_num, name, mask, layout, data_format, + decode_state=None, is_decoding=False, + scope_id=-1, output_name_prefix=""): + self.scopes[scope_id] = name + ch_out = conv_layer_params['num_channels'] + kernel_size = conv_layer_params['kernel_size'] # [T,F] format + strides = conv_layer_params['stride'] # [T,F] format + padding = conv_layer_params['padding'] + new_mems = None + if is_decoding and kernel_size[0] > 
1: + assert decode_state is not None + inputs = self.add_concat([decode_state, inputs], output_name_prefix+"_concat", axis=1) + if strides[0] == 1: + mem_len = 2 + else: + assert strides[0] == 2 + mem_len = 1 + new_mems = self.neg_slice(inputs, axis=1, length=mem_len, output_name_prefix=output_name_prefix) + + if padding.lower() == 'same': + if conv_type == 'conv2d': + assert kernel_size[1] % 2 == 1 + pad_num = kernel_size[1] // 2 + left_pad_num = pad_num - 1 if self.get_tensor_shape(inputs)[-2] % 2 == 0 else pad_num + else: + pad_num = 0 + left_pad_num = 0 + padding = 'VALID' + padding = [0, 0, left_pad_num, pad_num] + else: + padding = [0, 0, 0, 0] + if kernel_size[0] == 1: + assert decode_state is None + assert len(kernel_size) == 1 + new_mems = None + act = conv_layer_params['activation_fn'] + norm_type = conv_layer_params.get('norm_type', 'batch_norm') + + if mask is not None and not is_decoding: + use_2d_conv = conv_type == 'conv2d' + if use_2d_conv: + assert data_format == 'channels_last' + + if layout == 'BFTC' or layout == 'BCFT': + kernel_size = kernel_size[::-1] + strides = strides[::-1] + if conv_type == 'conv1d': + inputs = self.add_expand_dims(inputs, 2, output_name_prefix+"_conv1d_expand") + strides.append(1) + kernel_size.append(1) + shape = self.get_tensor_shape(inputs) + padding[1] = math.ceil((shape[1] + padding[0] + padding[1] - kernel_size[0] + 1) / strides[0]) \ + * strides[0] + kernel_size[0] - 1 - padding[0] - shape[1] + padding[3] = math.ceil((shape[2] + padding[2] + padding[3] - kernel_size[1] + 1) / strides[1]) \ + * strides[1] + kernel_size[1] - 1 - padding[2] - shape[2] + if conv_type == 'conv_proj_res': + proj_filters = conv_layer_params['proj_num_channels'] + inputs = self.conv_proj_res( + layer_type='conv1d', + name=name, + inputs=inputs, + filters=ch_out, + proj_filters=proj_filters, + kernel_size=kernel_size, + activation_fn=act, + strides=strides, + padding=padding, + norm_type=norm_type, + data_format=data_format, + scope_id=scope_id, + output_name_prefix=output_name_prefix + ) + else: + inputs = self.conv_bn_actv( + layer_type=conv_type, + name=name, + inputs=inputs, + filters=ch_out, + kernel_size=kernel_size, + activation_fn=act, + strides=strides, + padding=padding, + norm_type=norm_type, + gn_group_num=gn_group_num, + data_format=data_format, + scope_id=scope_id, + output_name_prefix=output_name_prefix + ) + return inputs, new_mems + + def conv_bn_actv_nchwc8(self, layer_type, name, inputs, filters, kernel_size, activation_fn, + strides, padding, data_format, + dilation=1, norm_type='batch_norm', gn_group_num=0, + layer_id=0, layer_num=1, + scope_id=-1, output_name_prefix=""): + groups = 1 + if layer_type == 'sep_conv1d': + groups = filters + if data_format == "channels_last" and layer_id == 0: + inputs = self.add_transpose(inputs, output_name_prefix+"_pre_t", [0, 3, 1, 2]) + self.add_quantization(scope_id, "quant_"+name, inputs) + conv = self.extract_convolution(inputs, output_name_prefix+"_conv", scope_id, + filters, kernel_size, strides, padding, + data_format="NCHW", weight_format="NHWC", + dilation=dilation, groups=groups, layer_names=[name, "kernel", "bias"]) + if norm_type != 'batch_norm' and norm_type != 'skip': + print("[ERROR] currently not support %s in conv_bn_actv" % (norm_type)) + exit(1) + squeeze = False + if "conv1d" in layer_type and norm_type == 'batch_norm': + squeeze = True + if norm_type == 'skip': + bn = conv + elif norm_type == 'group_norm': + print("[ERROR] currently not support online group norm") + exit(1) + elif norm_type 
== 'layer_norm': + print("[ERROR] currently not support layer norm") + exit(1) + elif norm_type == 'online_batch_norm': + print("[ERROR] currently not support online batch norm") + exit(1) + else: + assert norm_type == 'batch_norm' + bn = self.extract_batch_norm(conv, output_name_prefix+"_bn", scope_id+1) + if (activation_fn is None): + output = bn + elif (activation_fn == "relu"): + output = self.add_relu(bn, output_name_prefix+"_relu") + else: + print("[ERROR] unsupported activation function %s" % (activation_fn)) + exit(1) + if (layer_id == layer_num - 1): + output = self.add_transpose(output, output_name_prefix+"_nchwc8_nchw_t", [0, 2, 3, 1, 4]) + output_shape = self.get_tensor_shape(output) + if squeeze: + #output = self.add_reshape(output, output_name_prefix+"_nchwc8_nchw_r", [0, 0, 0, -1]) + #output = self.add_squeeze(output, axis=2, output_name=bn+"_squeeze") + output = self.add_reshape(output, output_name_prefix+"_nchwc8_nchw_r", [self.batch, -1, output_shape[2]*output_shape[3]*output_shape[4]]) + else: + output = self.add_reshape(output, output_name_prefix+"_nchwc8_nchw_r", [self.batch, -1, output_shape[2], output_shape[3]*output_shape[4]]) + return output + + def conv_layer_wrapper_nchwc8(self, inputs, conv_type, conv_layer_params, + gn_group_num, name, mask, layout, data_format, + decode_state=None, is_decoding=False, + layer_id=0, layer_num=1, + scope_id=-1, output_name_prefix=""): + self.scopes[scope_id] = name + ch_out = conv_layer_params['num_channels'] + kernel_size = conv_layer_params['kernel_size'] # [T,F] format + strides = conv_layer_params['stride'] # [T,F] format + padding = conv_layer_params['padding'] + new_mems = None + if is_decoding and kernel_size[0] > 1: + assert decode_state is not None + if layer_id == 0: + axis = 1 + else: + axis = 2 + decode_state_shape = self.get_tensor_shape(decode_state) + if (len(decode_state_shape) == 4): + self.data_dict[decode_state] = self.data_dict[decode_state].reshape( + [self.batch, decode_state_shape[1]//8, -1, decode_state_shape[3], 8]) + assert(decode_state_shape[1]//8 != 0) + inputs = self.add_concat([decode_state, inputs], output_name_prefix+"_concat", axis) + if strides[0] == 1: + mem_len = 2 + else: + assert strides[0] == 2 + mem_len = 1 + new_mems = new_mems = self.neg_slice(inputs, axis=axis, length=mem_len, output_name_prefix=output_name_prefix) + if (layer_id == 0): + h_axis = 1 + w_axis = 2 + else: + h_axis = 2 + w_axis = 3 + if padding.lower() == 'same': + if conv_type == 'conv2d': + assert kernel_size[1] % 2 == 1 + pad_num = kernel_size[1] // 2 + left_pad_num = pad_num - 1 if self.get_tensor_shape(inputs)[w_axis] % 2 == 0 else pad_num + else: + pad_num = 0 + left_pad_num = 0 + padding = 'VALID' + padding = [0, 0, left_pad_num, pad_num] + else: + padding = [0, 0, 0, 0] + if kernel_size[0] == 1: + assert decode_state is None + assert len(kernel_size) == 1 + new_mems = None + act = conv_layer_params['activation_fn'] + norm_type = conv_layer_params.get('norm_type', 'batch_norm') + + if mask is not None and not is_decoding: + use_2d_conv = conv_type == 'conv2d' + if use_2d_conv: + assert data_format == 'channels_last' + + if layout == 'BFTC' or layout == 'BCFT': + kernel_size = kernel_size[::-1] + strides = strides[::-1] + if conv_type == 'conv1d': + if (layer_id == 0): + inputs = self.add_expand_dims(inputs, 2, output_name_prefix+"_conv1d_expand") + strides.append(1) + kernel_size.append(1) + shape = self.get_tensor_shape(inputs) + padding[1] = math.ceil((shape[h_axis] + padding[0] + padding[1] - kernel_size[0] + 1) / 
strides[0]) * strides[0] + kernel_size[0] - 1 - padding[0] - shape[h_axis] + padding[3] = math.ceil((shape[w_axis] + padding[2] + padding[3] - kernel_size[1] + 1) / strides[1]) * strides[1] + kernel_size[1] - 1 - padding[2] - shape[w_axis] + if conv_type == 'conv_proj_res': + print("[ERROR] currently not support conv_proj in wonv_wrapper_nchwc8") + exit(1) + else: + inputs = self.conv_bn_actv_nchwc8( + layer_type=conv_type, + name=name, + inputs=inputs, + filters=ch_out, + kernel_size=kernel_size, + activation_fn=act, + strides=strides, + padding=padding, + norm_type=norm_type, + gn_group_num=gn_group_num, + data_format=data_format, + layer_id=layer_id, + layer_num=layer_num, + scope_id=scope_id, + output_name_prefix=output_name_prefix + ) + return inputs, new_mems + + def rel_shift(self, x): + x = self.add_relative_shift(x, x+"_rel_shift", axis=3, shift_length=1) + #x_size = self.get_tensor_shape(x) + #x = self.add_pad(x, x+"_pad", [[0, 0], [0, 0], [0, 0], [1, 0]]) + #x = self.add_reshape(x, x+"_r", [x_size[0], x_size[1], x_size[3] + 1, x_size[2]]) + #result_name = x + "_slice" + #self.add_slice(x, [x+"_other", result_name], axis=2, slice_point=[1]) + #x = self.add_reshape(result_name, x+"_rel_shif", x_size) + return x + + def rel_multihead_attn(self, w, r, r_w_bias, r_r_bias, attn_mask, mems, d_model, + n_head, d_head, dropout, dropatt, is_training, + kernel_initializer, scope='rel_attn', norm_type='ln', + use_mq_attn=False, use_xl_pos_enc=True, + attn_mask_parameters=None, + pos_emb_parameters=None, + scope_id=-1, output_name_prefix=""): + if is_training: + assert mems is None + + scale = 1 / (d_head ** 0.5) + self.scopes[scope_id] = scope + query_depth = n_head * d_head + key_depth = n_head * d_head + value_depth = n_head * d_head + if norm_type == 'bn': + #w_t = self.add_transpose(w, output_name_prefix+"_bn_pre_t", [0, 2, 1]) + #w_t = self.add_expand_dims(w_t, axis=3, output_name=output_name_prefix+"_bn_expand") + #w_norm = self.extract_batch_norm(w_t, output_name_prefix+"_bn", scope_id+1, + # data_format="NCHW", layer_names=["batch_normalization", "moving_mean", "moving_variance"]) + #w_norm = self.add_squeeze(w_norm, axis=3, output_name=output_name_prefix+"_bn_squeeze") + #w_norm = self.add_transpose(w_norm, output_name_prefix+"_bn_post_t", [0, 2, 1]) + w_norm = self.extract_batch_norm(w, output_name_prefix+"_bn", scope_id+1, + data_format="NCHW", axis=-1, layer_names=["batch_normalization", "moving_mean", "moving_variance"]) + elif norm_type == 'pre_ln': + w_norm = self.extract_layer_norm(w, output_name_prefix+"_ln", scope_id+1, ["LayerNorm", "gamma", "beta"]) + else: + assert norm_type == 'ln' + w_norm = w + self.add_quantization(scope_id+1, "quant_attn_input", w_norm) + if use_mq_attn: + w_head_q = output_name_prefix + "_multihead_q" + w_head_k = output_name_prefix + "_multihead_k" + w_head_v = output_name_prefix + "_multihead_v" + self.extract_dense(w_norm, scope_id+1, "mhead_q") + self.extract_denses(w_norm, [w_head_k, w_head_v], [key_depth, value_depth], scope_id+1, "shead_kv") + self.add_quantization(scope_id+1, "quant_heads_q", w_head_q) + self.add_quantization(scope_id+1, "quant_heads_kv", w_head_k) + self.add_quantization(scope_id+1, "quant_heads_kv", w_head_v) + w_head_q = self.add_reshape(w_head_q, w_head_q+"_r", [self.batch, -1, n_head, d_head]) + w_head_k = self.add_reshape(w_head_k, w_head_k+"_r", [self.batch, -1, d_head, 1]) + w_head_v = self.add_reshape(w_head_v, w_head_v+"_r", [self.batch, -1, d_head]) + else: + w_head_q = output_name_prefix + "_multihead_q" + 
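# Relative-attention scoring assembled below: a content term AC = q·k and a position term BD = q·r (r_w_bias/r_r_bias are added to q only when use_xl_pos_enc); BD is realigned by rel_shift, and (AC + BD) is scaled by 1/sqrt(d_head) before the softmax. + 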
w_head_k = output_name_prefix + "_multihead_k" + w_head_v = output_name_prefix + "_multihead_v" + self.extract_denses(w_norm, [w_head_q, w_head_k, w_head_v], [key_depth, key_depth, value_depth], scope_id+1, "qkv") + self.add_quantization(scope_id+1, "quant_heads_qkv", w_head_q) + self.add_quantization(scope_id+1, "quant_heads_qkv", w_head_k) + self.add_quantization(scope_id+1, "quant_heads_qkv", w_head_v) + w_head_q = self.add_reshape(w_head_q, w_head_q+"_r", [self.batch, -1, n_head, d_head]) + w_head_k = self.add_reshape(w_head_k, w_head_k+"_r", [self.batch, -1, n_head, d_head]) + w_head_v = self.add_reshape(w_head_v, w_head_v+"_r", [self.batch, -1, n_head, d_head]) + if mems is not None: + k_mems, v_mems = mems + w_head_k = self.add_concat([k_mems, w_head_k], w_head_k+"_concat", axis=1) + w_head_v = self.add_concat([v_mems, w_head_v], w_head_v+"_concat", axis=1) + new_mems = w_head_k, w_head_v + else: + new_mems = None + + if (r is None): + r = self.add_relative_position_embedding(w_head_k, pos_emb_parameters, axis=1, output_name=output_name_prefix+"_rel_pos_emb") + if use_xl_pos_enc: + r_head_k = self.extract_dense(r, output_name_prefix+"_multihead_r", scope_id+1, scope_name="r") + else: + r_head_k = r + r_head_k = self.add_reshape(r_head_k, r_head_k+"_r", [self.batch, -1, n_head, d_head]) + if use_xl_pos_enc: + rw_head_q = self.add_sum([w_head_q, r_w_bias], w_head_q+"_rw") + rr_head_q = self.add_sum([w_head_q, r_r_bias], w_head_q+"_rr") + else: + rw_head_q = w_head_q + rr_head_q = w_head_q + + if use_mq_attn: + rw_head_qt = self.add_transpose(rw_head_q, rw_head_q+"_t", [0, 2, 1, 3]) + w_head_kt = self.add_transpose(w_head_k, w_head_k+"_t", [0, 2, 3, 1]) + AC = self.add_matmul(rw_head_qt, w_head_kt, output_name_prefix+"_AC") + else: + rw_head_qt = self.add_transpose(rw_head_q, rw_head_q+"_t", [0, 2, 1, 3]) + w_head_kt = self.add_transpose(w_head_k, w_head_k+"_t", [0, 2, 3, 1]) + AC = self.add_matmul(rw_head_qt, w_head_kt, output_name_prefix+"_AC") + if use_xl_pos_enc: + rr_head_qt = self.add_transpose(rr_head_q, rr_head_q+"_t", [0, 2, 1, 3]) + else: + rr_head_qt = rw_head_qt + r_head_kt = self.add_transpose(r_head_k, r_head_k+"_t", [0, 2, 3, 1]) + BD = self.add_matmul(rr_head_qt, r_head_kt, output_name_prefix+"_BD") + BD = self.rel_shift(BD) + + attn_score = self.add_sum([AC, BD], output_name_prefix+"_ACBD") + attn_score = self.add_power(attn_score, output_name_prefix+"_ACBD_s", scale=scale) + if (attn_mask is None): + attn_trunc_len, same_length = attn_mask_parameters + if attn_trunc_len is not None: + attn_score = self.add_attention_mask(attn_score, attn_score+"_mask", attn_trunc_len, same_length, 1e30) + attn_prob = self.add_softmax(attn_score, attn_score+"_softmax", 3) + self.add_quantization(scope_id+1, "quant_attn_prob", attn_prob) + + if use_mq_attn: + w_head_vt = self.add_transpose(w_head_v, w_head_v+"_t", [0, 2, 1, 3]) + attn_vec = self.add_matmul(attn_prob, w_head_vt, output_name_prefix+"_cont") + else: + w_head_vt = self.add_transpose(w_head_v, w_head_v+"_t", [0, 2, 1, 3]) + attn_vec = self.add_matmul(attn_prob, w_head_vt, output_name_prefix+"_cont") + + attn_vec = self.add_transpose(attn_vec, output_name_prefix+"_cont_t", [0, 2, 1, 3]) + attn_vec = self.add_reshape(attn_vec, output_name_prefix+"_cont_r", [self.batch, -1, n_head*d_head]) + self.add_quantization(scope_id+1, "quant_attn_vec", attn_vec) + attn_out = self.extract_dense(attn_vec, attn_vec+"_fc", scope_id+1, scope_name="o") + output = self.add_sum([attn_out, w], output_name_prefix+"_rel_multihead_sum") + if 
norm_type == 'ln': + output = self.extract_layer_norm(output, output+"_ln2", scope_id+1, ["LayerNorm", "gamma", "beta"]) + else: + assert norm_type in ['bn', 'pre_ln'] + output = output + return output, new_mems + + def _cache_decode_mem(self, curr_kv, prev_mem, mem_len=0, output_name_prefix=""): + assert prev_mem is not None + assert mem_len >= 0 + k_mem, v_mem = curr_kv + k_prev_mem, v_prev_mem = prev_mem + new_k_mem = k_mem #self.add_concat([k_prev_mem, k_mem], output_name_prefix+"_concat_k", 1) + new_v_mem = v_mem #self.add_concat([v_prev_mem, v_mem], output_name_prefix+"_concat_v", 1) + if mem_len > 0: + assert mem_len > 0 + new_k_mem = self.neg_slice(new_k_mem, axis=1, length=mem_len, output_name_prefix=output_name_prefix+"_k") + new_v_mem = self.neg_slice(new_v_mem, axis=1, length=mem_len, output_name_prefix=output_name_prefix+"_v") + return (new_k_mem, new_v_mem) + + def transformer_block(self, input, n_layer, d_model, n_head, d_head, d_inner, dropout, dropatt, dropinp, initializer, mode, + mems=None, att_trunc_len=0, pos_emb_cache=None, same_length=False, clamp_len=-1, untie_r=False, + scope='transformer', norm_type='ln', output_norm_type='', use_mq_attn=False, pre_compute=False, + use_xl_pos_enc=True, mult_query_decode=False, + scope_id=-1, output_name_prefix=""): + print('[INFO] transformer block params: n_layer: {}, d_model: {}, n_head: {}, d_head: {}, d_inner: {}, dropout: {}, dropatt: {}, dropinp: {}'\ + .format(n_layer, d_model, n_head, d_head, d_inner, dropout, dropatt, dropinp)) + assert mode in ['train', 'eval', 'infer'] + is_training = mode == 'train' + is_decoding = mems is not None + if is_decoding: + assert mode == 'infer' + if isinstance(d_inner, int): + d_inner = [d_inner] * n_layer + self.scopes[scope_id] = scope + if use_xl_pos_enc: + print("[ERROR] currently not support xl pos encoding") + exit(1) + #if untie_r: + # r_w_bias = tf.get_variable('r_w_bias', [n_layer, n_head, d_head], + # initializer=initializer) + # r_r_bias = tf.get_variable('r_r_bias', [n_layer, n_head, d_head], + # initializer=initializer) + #else: + # r_w_bias = tf.get_variable('r_w_bias', [n_head, d_head], + # initializer=initializer) + # r_r_bias = tf.get_variable('r_r_bias', [n_head, d_head], + # initializer=initializer) + else: + if untie_r: + r_w_bias = [None] * n_layer + r_r_bias = [None] * n_layer + else: + r_w_bias = None + r_r_bias = None + is_decoding_var_att_trunc = is_decoding and not isinstance(att_trunc_len, int) + #qlen = self.get_tensor_shape(input)[0] + #if is_decoding_var_att_trunc: + # mlen = [self.get_tensor_shape(mems[l_i][0])[0] for l_i in range(n_layer)] + # klen = [mlen[l_i] + qlen for l_i in range(n_layer)] + #else: + # mlen = self.get_tensor_shape(mems[0][0])[0] if mems is not None else 0 + # klen = mlen + qlen + if not is_decoding: + assert mems is None + #assert mlen == 0 + assert not same_length + + attn_mask = [None] * n_layer + if is_decoding_var_att_trunc: + #if pre_compute and mode == 'infer': + # if mult_query_decode: + # print('precompute multi query attn mask for decoding var att') + # attn_mask_caches = [tf.constant(mask_cache(8, 32, trunc_len), dtype=tf.float32) for trunc_len in att_trunc_len] + # attn_mask = [neg_slice_m(attn_mask_cache, [qlen, qlen+mlen[l_i]]) for l_i, attn_mask_cache in enumerate(attn_mask_caches)] + # else: + # print('precompute attn mask for decoding var att') + # # todo remove unnecessary mask + # attn_mask = [tf.zeros([qlen, qlen + mlen[l_i]]) for l_i in range(n_layer)] + #else: + # attn_mask = [_create_mask(qlen, mlen[l_i], 
att_trunc_len=att_trunc_len[l_i], same_length=same_length) for l_i in range(n_layer)] + attn_mask_parameters = [] + for l_i in range(n_layer): + trunc_len = att_trunc_len[l_i] + if pre_compute and mode == 'infer': + if not mult_query_decode: + trunc_len = None #-1 + attn_mask_parameters.append((trunc_len, same_length)) + elif not isinstance(att_trunc_len, int): + assert len(att_trunc_len) == n_layer + assert not is_decoding + #if pre_compute and mode == 'infer': + # print('precompute attn mask') + # attn_mask_caches = [tf.constant(mask_cache(1024, 0, trunc_len), dtype=tf.float32) for trunc_len in att_trunc_len] + # attn_mask = [attn_mask_cache[:qlen, :qlen] for attn_mask_cache in attn_mask_caches] + #else: + # attn_mask = [_create_mask(qlen, mlen, att_trunc_len=trunc_len, same_length=same_length) for trunc_len in att_trunc_len] + attn_mask_parameters = [] + for l_i in range(n_layer): + trunc_len = att_trunc_len[l_i] + attn_mask_parameters.append((trunc_len, same_length)) + else: + #attn_mask = _create_mask(qlen, mlen, att_trunc_len=att_trunc_len, same_length=same_length) + #attn_mask = [attn_mask] * n_layer + att_trunc_len = [att_trunc_len] * n_layer + attn_mask_parameters = [(att_trunc_len, same_length)] * n_layer + if use_xl_pos_enc: + #if is_decoding_var_att_trunc: + # pos_emb = [neg_slice(pos_emb_cache, klen[l_i]) for l_i in range(n_layer)] + #else: + # if pos_emb_cache is None: + # pos_seq = tf.range(klen - 1, -1, -1.0) + # if clamp_len > 0: + # pos_seq = tf.minimum(pos_seq, clamp_len) + # inv_freq = 1 / (10000 ** (tf.range(0, d_model, 2.0) / d_model)) + # pos_emb = positional_embedding(pos_seq, inv_freq) + # else: + # pos_emb = neg_slice(pos_emb_cache, klen) + pos_emb_parameters = [pos_emb_cache] * n_layer + pos_emb = [None] * n_layers + else: + #if not isinstance(att_trunc_len, int): + # max_relative_position = [att_trunc_len[i] + 1 for i in range(n_layer)] + #else: + # max_relative_position = [(att_trunc_len + 1) if att_trunc_len > 0 else 16] * n_layer + #pos_emb = [get_shaw_relative_embeddings_left(max_relative_position[i], + # klen[i] if is_decoding_var_att_trunc else klen, + # d_model, + # 'rel_pos_emb_{}'.format(i)) + # for i in range(n_layer)] + pos_emb_parameters = [] + for i in range(n_layer): + self.scopes[scope_id+1] = "rel_pos_emb_" + str(i) + weight_name = output_name_prefix + "_rel_pos_emb_" + str(i) + self.add_weight(weight_name, scope_id=scope_id+2, weight_name=None, weight=None, transpose=None, data_type="FLOAT32") + pos_emb_parameters.append(weight_name) + pos_emb = [None] * n_layer + + if mems is None: + mems = [None] * n_layer + output=input + new_mems = [] + for i in range(n_layer): + self.scopes[scope_id+1] = "layer_" + str(i) + output, kv_mems = self.rel_multihead_attn( + w=output, + r=pos_emb[i] if (is_decoding_var_att_trunc or not use_xl_pos_enc) else pos_emb, + r_w_bias=r_w_bias if not untie_r else r_w_bias[i], + r_r_bias=r_r_bias if not untie_r else r_r_bias[i], + attn_mask=attn_mask[i], + mems=mems[i], + d_model=d_model, + n_head=n_head, + d_head=d_head, + dropout=dropout, + dropatt=dropatt, + is_training=is_training, + kernel_initializer=initializer, + norm_type=norm_type, + use_mq_attn=use_mq_attn, + use_xl_pos_enc=use_xl_pos_enc, + attn_mask_parameters=attn_mask_parameters[i], + pos_emb_parameters=pos_emb_parameters[i], + scope_id=scope_id+2, + output_name_prefix=output_name_prefix+"_layer"+str(i)) + + # cache new mems + if is_decoding: + new_mems.append(self._cache_decode_mem(kv_mems, mems[i], att_trunc_len[i], 
output_name_prefix=output_name_prefix+"_layer"+str(i))) + + output = self.positionwise_FF( + input=output, + d_model=d_model, + d_inner=d_inner[i], + dropout=dropout, + kernel_initializer=initializer, + is_training=is_training, + norm_type=norm_type, + scope_id=scope_id+2, + output_name_prefix=output_name_prefix+"_layer"+str(i)) + + if output_norm_type == 'ln': + output = self.extract_layer_norm(output, output_name_prefix+"_ln", scope_id+1) + elif output_norm_type == 'bn': + #output_t = self.add_transpose(output, output_name_prefix+"_bn_pre_t", [0, 2, 1]) + #output_t = self.add_expand_dims(output_t, axis=3, output_name=output_name_prefix+"_bn_expand") + #output = self.extract_batch_norm(output_t, output_name_prefix+"_bn", scope_id+1, data_format="NCHW", layer_names=["batch_normalization", "moving_mean", "moving_variance"]) + #output = self.add_squeeze(output, axis=3, output_name=output_name_prefix+"_bn_squeeze") + #output = self.add_transpose(output, output_name_prefix+"_bn_post_t", [0, 2, 1]) + output = self.extract_batch_norm(output, output_name_prefix+"_bn", scope_id+1, + data_format="NCHW", axis=-1, layer_names=["batch_normalization", "moving_mean", "moving_variance"]) + else: + assert output_norm_type == '' + + if not is_decoding: + assert len(new_mems) == 0 + return output, new_mems + + def proj_transformer_block(self, input, n_layer, d_model, n_head, d_head, d_inner, dropout, dropatt, dropinp, initializer, mode, + input_project=False, decode_state=None, att_trunc_len=0, pos_emb_cache=None, norm_type='ln', output_norm_type='', + use_mq_attn=False, pre_compute=False, use_xl_pos_enc=True, mult_query_decode=False, scope_id=-1, output_name_prefix=""): + self.scopes[scope_id] = 'transformer_block' + orig_input_dim = self.get_tensor_shape(input)[-1] + if input_project: + assert orig_input_dim != d_model + input = self.extract_dense(input, output_name_prefix+"_proj", scope_id+1, scope_name="input_proj") + else: + assert orig_input_dim == d_model + # [B, T, C] --> [T, B, C] + #input = self.add_transpose(input, output_name_prefix+"_pre_t", [1, 0, 2]) + output, new_decode_state = self.transformer_block(input, + n_layer=n_layer, d_model=d_model, n_head=n_head, d_head=d_head, d_inner=d_inner, + dropout=dropout, dropatt=dropatt, dropinp=dropinp, initializer=initializer, mode=mode, + mems=decode_state, att_trunc_len=att_trunc_len, pos_emb_cache=pos_emb_cache, + norm_type=norm_type, output_norm_type=output_norm_type, use_mq_attn=use_mq_attn, + pre_compute=pre_compute, use_xl_pos_enc=use_xl_pos_enc, mult_query_decode=mult_query_decode, + scope_id=scope_id+1, output_name_prefix=output_name_prefix) + # [T, B, C] --> [B, T, C] + #output = self.add_transpose(output, output_name_prefix+"_post_t", [1, 0, 2]) + return output, new_decode_state + + def transformer_block_wrapper(self, input, initializer, mode, transformer_block_params, decode_state=None, pos_emb_cache=None, + pre_compute=False, mult_query_decode=False, + scope_id=-1, output_name_prefix=""): + n_layer = transformer_block_params['n_layer'] + d_model = transformer_block_params['d_model'] + n_head = transformer_block_params['n_head'] + d_head = transformer_block_params['d_head'] + d_inner = transformer_block_params['d_inner'] + dropout_keep_prob = transformer_block_params['dropout_keep_prob'] + input_keep_prob = transformer_block_params.get('input_keep_prob', dropout_keep_prob) + att_keep_prob = transformer_block_params.get('att_keep_prob', dropout_keep_prob) + input_project = transformer_block_params.get('input_project', False) + att_trunc_len = 
transformer_block_params.get('att_trunc_len', 0) + norm_type = transformer_block_params.get('norm_type', 'ln') + output_norm_type = transformer_block_params.get('output_norm_type', '') + use_mq_attn = transformer_block_params.get('use_mq_attn', False) + use_xl_pos_enc = transformer_block_params.get('use_xl_pos_enc', True) + + valid_params = {'n_layer', 'd_model', 'n_head', 'd_head', 'd_inner', 'dropout_keep_prob', 'input_keep_prob', + 'att_keep_prob', 'input_project', 'att_trunc_len', 'norm_type', 'output_norm_type', 'use_mq_attn', + 'use_xl_pos_enc'} + for k in transformer_block_params.keys(): + if k not in valid_params: + raise ValueError('unknown transformer parameter: {}'.format(k)) + + output, new_decode_state = self.proj_transformer_block(input, n_layer=n_layer, d_model=d_model, n_head=n_head, d_head=d_head, + d_inner=d_inner, dropout=1.0-dropout_keep_prob, dropatt=1.0-att_keep_prob, + dropinp=1.0 - input_keep_prob, + initializer=initializer, mode=mode, + input_project=input_project, decode_state=decode_state, + att_trunc_len=att_trunc_len, pos_emb_cache=pos_emb_cache, + norm_type=norm_type, output_norm_type=output_norm_type, + use_mq_attn=use_mq_attn, + pre_compute=pre_compute, + use_xl_pos_enc=use_xl_pos_enc, + mult_query_decode=mult_query_decode, + scope_id=scope_id, + output_name_prefix=output_name_prefix) + return output, new_decode_state + + def _transformer_block(self, input, transformer_block_params, initializer, pos_emb_cache=None, decode_state=None, scope_id=-1, output_name_prefix=""): + output, new_decode_state = self.transformer_block_wrapper(input, + initializer=initializer, + mode=self.mode, + transformer_block_params=transformer_block_params, + pos_emb_cache=pos_emb_cache, + pre_compute=None, + decode_state=decode_state, + mult_query_decode=True, + scope_id=scope_id, + output_name_prefix=output_name_prefix) + return output, new_decode_state + + def prepare_convolution_states(self, conv_layers, output_name_prefix, init_with_none=False): + conv_states = [] + for i in range(len(conv_layers)): + if (init_with_none): + state_shape = [0] * 4 + else: + state_shape = conv_layers[i].get('states', None) + if (self.nchwc8 and i != 0 and state_shape is not None): + if (len(state_shape) == 3): + tmp = state_shape[1] + state_shape[1] = state_shape[2] + state_shape[2] = tmp + state_shape.append(1) + elif (len(state_shape) == 4): + tmp = state_shape[3] + state_shape[3] = state_shape[2] + state_shape[2] = state_shape[1] + state_shape[1] = tmp + else: + print("[ERROR] unsupported state shape %d" % (len(state_shape))) + exit(1) + if (state_shape is not None): + state_name = output_name_prefix + "_layer" + str(i) + "_mem" + if (self.first_frame): + data = {state_name: np.zeros(state_shape)} + else: + file_data = np.load(self.state_data_path + "/" + state_name + ".npy") + data = {state_name: file_data} + state_shape = file_data.shape + #self.add_memory(state_name, state_shape, data_type="FLOAT32") + self.add_input(state_name, state_shape) + self.set_input(data) + conv_states.append(state_name) + else: + conv_states.append(None) + return conv_states + + def prepare_transformer_states(self, transformer_layers, output_name_prefix, init_with_none=False): + transformer_states = [] + layers = transformer_layers.get('n_layer', 0) + attn_trunc_lens = transformer_layers.get("att_trunc_len", [None]*layers) + n_head = transformer_layers['n_head'] + d_head = transformer_layers['d_head'] + for i in range(layers): + if (attn_trunc_lens[i] is not None): + kmem_name = output_name_prefix + "_layer" + 
str(i) + "_kmem" + vmem_name = output_name_prefix + "_layer" + str(i) + "_vmem" + if (init_with_none): + state_shape = [0] * 4 + else: + state_shape = [1, attn_trunc_lens[i], n_head, d_head] + if (self.first_frame): + data = {kmem_name: np.zeros(state_shape), + vmem_name: np.zeros(state_shape)} + state_shape_k = state_shape + state_shape_v = state_shape + else: + file_data_k = np.load(self.state_data_path + "/" + kmem_name + ".npy") + file_data_v = np.load(self.state_data_path + "/" + vmem_name + ".npy") + data = {kmem_name: file_data_k, + vmem_name: file_data_v} + state_shape_k = file_data_k.shape + state_shape_v = file_data_v.shape + #self.add_memory(kmem_name, state_shape, data_type="FLOAT32") + #self.add_memory(vmem_name, state_shape, data_type="FLOAT32") + self.add_input(kmem_name, state_shape_k) + self.add_input(vmem_name, state_shape_v) + self.set_input(data) + else: + kmem_name = None + vmem_name = None + transformer_states.append((kmem_name, vmem_name)) + return transformer_states + + def prepare_states(self, params, output_name_prefix, + init_convolution_with_none=False, + init_transformer_with_none=False, + block_id_start=0, + block_id_end=-1): + block_states = [] + if (block_id_end == -1): + block_id_end = len(params['net_blocks']) + for block_i in range(block_id_start, block_id_end): + block_params = params['net_blocks'][block_i] + block_state = [] + trunk_i = -1 + if ('conv_layers' in block_params.keys()): + trunk_i = trunk_i + 1 + conv_layers = block_params['conv_layers'] + block_state.append(self.prepare_convolution_states(conv_layers, + output_name_prefix+"_block"+str(block_i)+"_trunk"+str(trunk_i), + init_convolution_with_none)) + + if ('transformer_block_params' in block_params.keys()): + trunk_i = trunk_i + 1 + transformer_layers = block_params['transformer_block_params'] + block_state.append(self.prepare_transformer_states(transformer_layers, + output_name_prefix+"_block"+str(block_i)+"_trunk"+str(trunk_i), + init_transformer_with_none)) + block_states.append(block_state) + return block_states + + def extract_encoder(self, input_dict, scope_id, block_id_start=0, block_id_end=-1): + self.params = self.base_params["encoder_params"] + self.scopes[scope_id] = "ds2_encoder" + source_sequence = input_dict['source_tensors'] + decoding_states = input_dict.get('decoding_states', None) + is_decoding = decoding_states is not None + self._pre_compute = False + if is_decoding: + assert self.mode == 'infer' + + data_format = self.params.get('data_format', 'channels_last') + use_output_fc = self.params.get('output_fc', False) + output_norm_type = self.params.get('output_norm_type', '') + residual_type = self.params.get('residual_type', None) + use_group_dense = residual_type == 'group_dense' + if self.params['rnn_type'] not in ['wd_cudnn_lstm']: + assert residual_type is None + + #max_len = tf.reduce_max(src_length) + self._get_additional_pad_num(src_length) + + #if self.params['use_conv_mask']: + # mask = tf.sequence_mask( + # lengths=src_length, maxlen=max_len, + # dtype=source_sequence.dtype + # ) + # mask = tf.expand_dims(mask, 2) + #else: + # mask = None + mask = None + + # BTF -> BCTF + if (block_id_start == 0): + input_layer = self.add_expand_dims(source_sequence, axis=-1, output_name="encoder_input_expand") + else: + input_layer = source_sequence + + assert data_format == 'channels_last' + if data_format=='channels_last' or data_format=='BTFC': + layout = 'BTFC' + dformat = 'channels_last' + elif data_format=='channels_first' or data_format=='BCTF': + layout = 'BCTF' + 
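# layout letters: B=batch, C=channels, T=time, F=frequency features; the transposes below normalize whichever layout is configured to BTFC (channels_last) before the conv blocks run. + 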
dformat = 'channels_first' + elif data_format=='BFTC': + layout = 'BFTC' + dformat = 'channels_last' + elif data_format=='BCFT': + layout = 'BCFT' + dformat = 'channels_first' + else: + print("[WARNING] unsupported data format: will use channels_last (BTFC) instead") + layout = 'BTFC' + dformat = 'channels_last' + + #input_layer is BTFC + if layout == 'BCTF': + top_layer = self.add_transpose(input_layer, "encoder_input_transpose", [0, 3, 1, 2]) + elif layout == 'BFTC': + top_layer = self.add_transpose(input_layer, "encoder_input_transpose", [0, 2, 1, 3]) + elif layout == 'BCFT': + top_layer = self.add_transpose(input_layer, "encoder_input_transpose", [0, 3, 2, 1]) + else: + top_layer = input_layer + new_decode_states = [] + if (block_id_end == -1): + block_id_end = len(self.params['net_blocks']); + for block_i in range(block_id_start, block_id_end): + block_local_id = block_i - block_id_start + block_params = self.params['net_blocks'][block_i] + self.scopes[scope_id+1] = "block_" + str(block_i) + output_name_prefix = "encoder_block" + str(block_i) + new_block_decode_states = [] + conv_layers = block_params['conv_layers'] + block_conv_type = block_params.get('conv_type', 'conv1d') + use_2d_conv = block_conv_type == 'conv2d' + gn_group_num = block_params.get('gn_group_num', [0] * len(conv_layers)) + + # ----- Convolutional layers --------------------------------------------- + new_conv_decode_states = [] + for idx_conv in range(len(conv_layers)): + conv_layer_params = conv_layers[idx_conv] + conv_type = conv_layer_params.get('conv_type', block_conv_type) + conv_layer_params.setdefault('activation_fn', self.params['activation_fn']) + + if (self.nchwc8): + top_layer, new_conv_decode_state_i = self.conv_layer_wrapper_nchwc8( + inputs=top_layer, + conv_type=conv_type, + conv_layer_params=conv_layer_params, + gn_group_num=gn_group_num[idx_conv], + name="conv{}".format(idx_conv + 1), + #inputs_len=src_length, + mask=mask, + #max_len=max_len, + layout=layout, + data_format=dformat, + decode_state=decoding_states[block_local_id][0][idx_conv] if is_decoding else None, + is_decoding=is_decoding, + layer_id=idx_conv, + layer_num=len(conv_layers), + scope_id=scope_id+2, + output_name_prefix=output_name_prefix+"_conv"+str(idx_conv) + ) + else: + top_layer, new_conv_decode_state_i = self.conv_layer_wrapper( + inputs=top_layer, + conv_type=conv_type, + conv_layer_params=conv_layer_params, + gn_group_num=gn_group_num[idx_conv], + name="conv{}".format(idx_conv + 1), + #inputs_len=src_length, + mask=mask, + #max_len=max_len, + layout=layout, + data_format=dformat, + decode_state=decoding_states[block_local_id][0][idx_conv] if is_decoding else None, + is_decoding=is_decoding, + scope_id=scope_id+2, + output_name_prefix=output_name_prefix+"_conv"+str(idx_conv) + ) + new_conv_decode_states.append(new_conv_decode_state_i) + #if (block_i == 2 and idx_conv == 2): + # exit(1) + new_block_decode_states.append(new_conv_decode_states) + + # convert layout --> BTFC + if data_format == 'channels_first': + top_layer = self.add_transpose(top_layer, output_name_prefix+"_t", [0, 2, 3, 1]) + + if layout == 'BCTF': # BCTF --> BTFC + top_layer = self.add_transpose(top_layer, output_name_prefix+"_t", [0, 2, 3, 1]) + elif layout == 'BFTC': # BFTC --> BTFC + top_layer = self.add_transpose(top_layer, output_name_prefix+"_t", [0, 2, 1, 3]) + elif layout == 'BCFT': # BCFT --> BTFC + top_layer = self.add_transpose(top_layer, output_name_prefix+"_t", [0, 3, 2, 1]) + + num_rnn_layers = block_params['num_rnn_layers'] + 
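# the conv stack of this block is finished at this point; the code below optionally flattens [B, T, F, C] to [B, T, F*C], applies the optional layer-normed FC, then the RNN branch (unsupported, and num_rnn_layers is 0 in the shipped config) and the transformer trunk. + 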
group_dense_group_spec = block_params.get('group_dense_group_spec', None) + if use_group_dense: + print("[ERROR] currently not support group dense") + exit(1) + + ## reshape to [B, T, FxC] + if use_2d_conv: + f = self.get_tensor_shape(top_layer)[2] + c = self.get_tensor_shape(top_layer)[3] + fc = f * c + top_layer = self.add_reshape(top_layer, output_name_prefix+"_r", [self.batch, -1, fc]) + + if self.params.get('ln_fc_after_conv', False): + assert not use_group_dense + top_layer = self.layer_normed_fc(top_layer, "relu", output_name_prefix, scope_id+1) + + # ----- RNN --------------------------------------------------------------- + if num_rnn_layers > 0: + print("[ERROR] currently not support RNN") + exit(1) + + transformer_block_params = block_params.get('transformer_block_params', None) + if transformer_block_params: + if self._pre_compute and self.mode == 'infer': + print("[ERROR] currently not support pre_compute in encoder") + exit(1) + else: + pos_emb_cache = None + + if is_decoding: + transformer_decode_state = decoding_states[block_local_id][-1] + assert transformer_decode_state is not None + else: + transformer_decode_state = None + initializer = None + top_layer, new_transformer_decode_state = self._transformer_block(top_layer, transformer_block_params, + initializer=initializer, + pos_emb_cache=pos_emb_cache, decode_state=transformer_decode_state, + scope_id=scope_id+2, + output_name_prefix=output_name_prefix+"_transformer") + new_block_decode_states.append(new_transformer_decode_state) + #if (block_i == 3): + # exit(1) + new_decode_states.append(new_block_decode_states) + + if self.params['row_conv']: + channels = self.get_tensor_shape(top_layer)[-1] + top_layer = row_conv( + name="row_conv", + input_layer=top_layer, + batch=self.batch, + channels=channels, + activation_fn=self.params['activation_fn'], + width=self.params['row_conv_width'], + data_format=data_format, + norm_type=self.params.get('norm_type', 'batch_norm'), + gn_group_num=gn_group_num[-1] + ) + + if use_output_fc: + assert output_norm_type == '' + output_norm_type = 'ln_fc' + if output_norm_type == 'ln_fc': + #c = self.get_tensor_shape(top_layer)[-1] + #top_layer = self.add_reshape(top_layer, "encoder_output_r", [-1, c]) + outputs = self.layer_normed_fc(top_layer, self.params['activation_fn'], "encoder_output", scope_id+1) + elif output_norm_type == 'layer_norm': + outputs = self.extract_layer_norm(top_layer, "encoder_output_ln", scope_id+1) + elif output_norm_type == 'batch_norm': + outputs = self.extract_batch_norm(top_layer, "encoder_output_bn", scope_id+1, data_format="NCHW") + else: + outputs = top_layer + return { + 'outputs': outputs, + 'decode_state': new_decode_states + } + + def extract_prediction_net(self, input_dict, scope_id): + pred_net_params = self.base_params["decoder_params"]['pred_net_params'] + assert self.mode == 'infer' + source_sequence = input_dict['source_tensors'] + decoding_states = input_dict.get('decoding_states', None) + assert decoding_states is not None + embedded_inputs = "prediction_net_embedding" + self.extract_embedding(source_sequence, scope_id, "PredNetEmbeddingMatrix", embedded_inputs) + + if pred_net_params['norm_inputs']: + pred_net_outputs = tf.contrib.layers.layer_norm(embedded_inputs, begin_norm_axis=-1) + else: + pred_net_outputs = embedded_inputs + transformer_block_params = pred_net_params['transformer_block_params'] + init_dict = None + initializer = None + pred_net_outputs, new_pred_net_cell_state = self._transformer_block(pred_net_outputs, + 
transformer_block_params, initializer=initializer,
+            decode_state=decoding_states, pos_emb_cache=None,
+            scope_id=scope_id,
+            output_name_prefix="prediction_net")
+        #pred_net_outputs_last_dim = pred_net_outputs.get_shape().as_list()[-1]
+        #pred_net_outputs = tf.reshape(pred_net_outputs, [self.batch, pred_net_outputs_last_dim])
+        return {
+            'outputs': pred_net_outputs,
+            'decode_state': new_pred_net_cell_state
+        }
+
+    # RNN-T joint network: project encoder and prediction-net outputs into a
+    # shared space, sum them, apply the activation, classify, and take the
+    # per-step argmax (greedy decoding).
+    def extract_joint_net(self, input_dict, scope_id):
+        encoder_output = input_dict["encoder"]
+        prediction_net_output = input_dict["prediction_net"]
+        fc0_name = "joint_encoder_fc"
+        self.add_quantization(scope_id, "quant_enc_joint_input", encoder_output)
+        self.extract_dense(encoder_output, fc0_name, scope_id, scope_name="joint_encoder_fc")
+        #ep0_name = "joint_net_expand0"
+        #self.add_expand_dims(fc0_name, axis=2, output_name=ep0_name)
+        fc1_name = "joint_pred_net_fc"
+        self.add_quantization(scope_id, "quant_pred_joint_input", prediction_net_output)
+        self.extract_dense(prediction_net_output, fc1_name, scope_id, scope_name="joint_pred_net_fc")
+        #ep1_name = "joint_net_expand1"
+        #self.add_expand_dims(fc1_name, axis=1, output_name=ep1_name)
+        sum_name = "joint_net_sum"
+        #self.add_sum([ep0_name, ep1_name], sum_name)
+        self.add_sum([fc0_name, fc1_name], sum_name)
+        activation_fn = self.base_params["decoder_params"]["joint_net_params"]["activation_fn"]
+        if (activation_fn == "relu"):
+            relu_name = "joint_net_relu"
+            result_name = self.add_relu(sum_name, relu_name)
+        else:
+            print("[ERROR] unsupported activation function %s" % (activation_fn))
+            exit(1)
+        self.add_quantization(scope_id, "quant_joint_middle", result_name)
+        result_name = self.extract_dense(result_name, "joint_output_fc", scope_id, scope_name="joint_output_fc")
+        argmax_name = self.add_argmax(result_name, axis=-1, output_name="output_argmax")
+        return argmax_name
+
+    def generate_encoder(self, input=None, block_id_start=0, block_id_end=-1):
+        sounds_input_name = "sounds"
+        if (block_id_start == 0):
+            sounds_input_shape = [self.batch, self.base_params["sequence.max_length"], self.base_params["sequence.num_units"]]
+        else:
+            sounds_input_shape = input[sounds_input_name].shape
+        self.add_input(sounds_input_name, sounds_input_shape)
+        self.set_input(input)
+
+        input_dict = {'source_tensors': sounds_input_name,
+                      'decoding_states': self.prepare_states(self.base_params["encoder_params"],
+                          "encoder", False, True, block_id_start, block_id_end)
+                     }
+        self.save_input()
+        output = self.extract_encoder(input_dict, 0, block_id_start, block_id_end)
+        self.save_caffe_model()
+        if (self.save_state and self.calculate):
+            for index in range(len(output['decode_state'])):
+                block_i = block_id_start + index
+                file_path_prefix_block = self.state_data_path + "/encoder_block" + str(block_i)
+                for trunk_i in range(len(output['decode_state'][index])):
+                    file_path_prefix_trunk = file_path_prefix_block + "_trunk" + str(trunk_i)
+                    for layer_i in range(len(output['decode_state'][index][trunk_i])):
+                        data = output['decode_state'][index][trunk_i][layer_i]
+                        if (data is None):
+                            continue
+                        if (isinstance(data, str)):
+                            file_path = file_path_prefix_trunk + "_layer" + str(layer_i) + "_mem.npy"
+                            np.save(file_path, self.get_tensor(data))
+                            print("[INFO] save encoder block %d trunk %d layer %d state to %s" % (block_i, trunk_i, layer_i, file_path))
+                        elif (isinstance(data, tuple)):
+                            file_k_path = file_path_prefix_trunk + "_layer" + str(layer_i) + "_kmem.npy"
+                            file_v_path = file_path_prefix_trunk + "_layer" + str(layer_i) + "_vmem.npy"
+                            state_k_data, 
state_v_data = data + np.save(file_k_path, self.get_tensor(state_k_data)) + np.save(file_v_path, self.get_tensor(state_v_data)) + print("[INFO] save encoder block %d trunk %d layer %d k state to %s" % (block_i, trunk_i, layer_i, file_k_path)) + print("[INFO] save encoder block %d trunk %d layer %d v state to %s" % (block_i, trunk_i, layer_i, file_v_path)) + else: + print("[ERROR] unrecognized state array type") + exit(1) + + def generate_prediction_net(self, input=None): + label_input_name = "label" + label_input_shape = [self.batch, 1] + self.add_input(label_input_name, label_input_shape) + self.set_input(input) + + input_dict = {'source_tensors': label_input_name, + 'decoding_states': self.prepare_transformer_states( + self.base_params["decoder_params"]["pred_net_params"]["transformer_block_params"], "prediction_net", + True) + } + self.save_input() + output = self.extract_prediction_net(input_dict, 0) + self.save_caffe_model() + if (self.save_state and self.calculate): + file_path_prefix = self.state_data_path + "/prediction_net" + for layer_i in range(len(output['decode_state'])): + data = output['decode_state'][layer_i] + if (data is None): + continue + if (isinstance(data, str)): + file_path = file_path_prefix + "_layer" + str(layer_i) + "_mem.npy" + np.save(file_path, self.get_tensor(data)) + print("[INFO] save prediction net layer %d state to %s" % (layer_i, file_path)) + elif (isinstance(data, tuple)): + file_k_path = file_path_prefix + "_layer" + str(layer_i) + "_kmem.npy" + file_v_path = file_path_prefix + "_layer" + str(layer_i) + "_vmem.npy" + state_k_data, state_v_data = data + np.save(file_k_path, self.get_tensor(state_k_data)) + np.save(file_v_path, self.get_tensor(state_v_data)) + print("[INFO] save prediction net layer %d k state to %s" % (layer_i, file_k_path)) + print("[INFO] save prediction net layer %d v state to %s" % (layer_i, file_v_path)) + else: + print("[ERROR] unrecognized state array type") + exit(1) + + def generate_joint_net(self, input_shape, input=None): + encoder_output_name = "encoder" + encoder_output_shape = input_shape["encoder"] + self.add_input(encoder_output_name, encoder_output_shape) + prediction_net_output_name = "prediction_net" + prediction_net_output_shape = input_shape["prediction_net"] + self.add_input(prediction_net_output_name, prediction_net_output_shape) + self.set_input(input) + + input_dict = {'encoder': encoder_output_name, + 'prediction_net': prediction_net_output_name} + self.save_input() + self.extract_joint_net(input_dict, 0) + self.save_caffe_model() diff --git a/model_tools/tools/tensorflow2caffe/asr/tensorflow2caffe_convolution_transformer_keras.py b/model_tools/tools/tensorflow2caffe/asr/tensorflow2caffe_convolution_transformer_keras.py new file mode 100644 index 00000000..949ccf95 --- /dev/null +++ b/model_tools/tools/tensorflow2caffe/asr/tensorflow2caffe_convolution_transformer_keras.py @@ -0,0 +1,420 @@ +#!/usr/local/bin/python +# -*- coding: utf-8 -*- + +import math +import numpy as np +import sys +sys.path.append("../") +from tensorflow2caffe import Tensorflow2Caffe + +class Tensorflow2CaffeConvolutionTransformerKeras(Tensorflow2Caffe): + def __init__(self, + tensorflow_model_path, caffe_model_path_prefix, caffe_model_name, + base_params=None, + nchwc8=True, first_frame=True, + check=False, calc=False): + Tensorflow2Caffe.__init__(self, tensorflow_model_path, + caffe_model_path_prefix, caffe_model_name, check, calc) + self.params = base_params + self.nchwc8 = nchwc8 + self.first_frame = first_frame + self.mode = "infer" 
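+ # Streaming convention: with first_frame=True every cached conv/attention state starts as zeros; on later frames prepare_states reloads the per-state .npy files under state_data_path that an earlier frame's run saved.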
+ self.state_data_path = "./data" + self.save_state = True + + @staticmethod + def default_params(): + return { + "max_sequence_length": 40, + "encoder": [ + {"num": 1, + "shape": [1, 1, 1, 41]}, + {"num": 1, + "shape": [1, 32, 1, 21]}, + {"num": 3, + "shape": [1, 8, 512]}, + {"num": 1, + "shape": [1, 512, 1, 1]}, + {"num": 3, + "shape": [1, 12, 512]}, + {"num": 1, + "shape": [1, 512, 1, 1]}, + {"num": 3, + "shape": [1, 16, 512]}, + {"num": 1, + "shape": [1, 512, 1, 1]} + ], + "prediction_net": [ + {"num": 3, + "shape": [1, 5, 512]} + ] + } + + def rel_shift(self, x): + x = self.add_relative_shift(x, x+"_rel_shift", axis=3, shift_length=1) + return x + + def rel_attn_core(self, q_head, k_head_h, v_head_h, k_head_r, recep_field, same_length, scale, scope_id, scope_name_prefix): + """Core relative positional attention operations.""" + + # content based attention score + r_w_bias = self.add_weight(scope_name_prefix+"_r_w_bias", weight_name=scope_name_prefix+"/r_w_bias", + transpose=None, data_type="FLOAT32") + q_head_b = self.add_sum([q_head, r_w_bias], q_head+"_w_sum") + ac = self.add_matmul(q_head_b, k_head_h, scope_name_prefix+"_ac") + + # position based attention score + r_r_bias = self.add_weight(scope_name_prefix+"_r_r_bias", weight_name=scope_name_prefix+"/r_r_bias", + transpose=None, data_type="FLOAT32") + q_head_b = self.add_sum([q_head, r_r_bias], q_head+"_r_sum") + bd = self.add_matmul(q_head_b, k_head_r, scope_name_prefix+"_bd") + bd = self.rel_shift(bd) + + # merge attention scores and perform masking + acbd = self.add_sum([ac, bd], scope_name_prefix+"_acbd") + attn_score = self.add_power(acbd, scope_name_prefix+"_attn_score", scale=scale) + attn_score = self.add_attention_mask(attn_score, attn_score+"_mask", recep_field, same_length, 1e30) + + # attention probability + attn_prob = self.add_softmax(attn_score, scope_name_prefix+"_attn_prob", 3) + # attention output + attn_vec = self.add_matmul(attn_prob, v_head_h, scope_name_prefix+"_attn_vec") + attn_vec = self.add_transpose(attn_vec, attn_vec+"_t", [0, 2, 1, 3]) + return attn_vec + + def relative_positional_encoding(self, d_model, recep_field, scope_name_prefix): + """create relative positional encoding.""" + time_len = self.params['max_sequence_length'] + freq_seq = np.arange(0, d_model, 2.0, dtype=np.float32) + inv_freq = 1 / (10000 ** (freq_seq / d_model)) + + fwd_pos_seq = np.arange(time_len, -1, -1.0, dtype=np.float32) + fwd_pos_seq = np.clip(fwd_pos_seq, -recep_field, recep_field) + sinusoid_inp = np.matmul(np.expand_dims(fwd_pos_seq, 1), np.expand_dims(inv_freq, 0)) + pos_emb = np.concatenate([np.sin(sinusoid_inp), np.cos(sinusoid_inp)], axis=-1) + pos_emb = np.tile(pos_emb, (1, 1)) + pos_emb = self.add_weight(scope_name_prefix+"_pos_dict", weight=pos_emb, data_type="FLOAT32") + return pos_emb + + def RelHistoricalSelfAttention2D(self, inputs, n_head, d_head, recep_field, same_length, use_4d, scope_id, scope_name_prefix): + d_model = n_head * d_head + h_input = inputs + scale = 1 / (d_head ** 0.5) + + pos_emb = self.relative_positional_encoding(d_model, recep_field, scope_name_prefix) + pos_emb = self.add_relative_position_embedding(inputs, pos_emb, 1, scope_name_prefix+"_pos_emb", transpose=False) + + q_head_h = self.extract_dense(h_input, h_input+"_q", scope_id, [scope_name_prefix+"_hsa", "query_weights", "bias"]) + k_head_h = self.extract_dense(h_input, h_input+"_k", scope_id, [scope_name_prefix+"_hsa", "key_weights", "bias"]) + v_head_h = self.extract_dense(h_input, h_input+"_v", scope_id, [scope_name_prefix+"_hsa", 
"value_weights", "bias"]) + q_head_h = self.add_reshape(q_head_h, q_head_h+"_r", [self.batch, -1, n_head, d_head]) + k_head_h = self.add_reshape(k_head_h, k_head_h+"_r", [self.batch, -1, n_head, d_head]) + v_head_h = self.add_reshape(v_head_h, v_head_h+"_r", [self.batch, -1, n_head, d_head]) + q_head_h = self.add_transpose(q_head_h, q_head_h+"_t", [0, 2, 1, 3]) + k_head_h = self.add_transpose(k_head_h, k_head_h+"_t", [0, 2, 3, 1]) + v_head_h = self.add_transpose(v_head_h, v_head_h+"_t", [0, 2, 1, 3]) + + # positional heads + k_head_r = self.extract_dense(pos_emb, pos_emb+"_k", scope_id, [scope_name_prefix+"_hsa", "rel_weights", "bias"]) + k_head_r = self.add_reshape(k_head_r, k_head_r+"_r", [self.batch, -1, n_head, d_head]) + k_head_r = self.add_transpose(k_head_r, k_head_r+"_t", [0, 2, 3, 1]) + + # core attention ops + attn_vec = self.rel_attn_core(q_head_h, k_head_h, v_head_h, k_head_r, recep_field, same_length, scale, scope_id, scope_name_prefix+"_hsa") + + # post processing + attn_vec = self.add_reshape(attn_vec, attn_vec+"_r", [self.batch, -1, d_model]) + output = self.extract_dense(attn_vec, attn_vec+"_output", scope_id, [scope_name_prefix+"_hsa", "output_weights", "bias"]) + return output + + def TransformerTransition(self, inputs, activation_fn, scope_id, tt_name_prefix): + x = self.extract_dense(inputs, inputs+"_x", scope_id, [tt_name_prefix, "weights1", "biases1"]) + if (activation_fn == "relu"): + x = self.add_relu(x, x+"_relu") + else: + print("[ERROR] unsupported activation function" % (activation_fn)) + exit(1) + output = self.extract_dense(x, inputs+"_xx", scope_id, [tt_name_prefix, "weights2", "biases2"]) + return output + + def HistoricalSelfAttentionBlock(self, x, n_head, d_head, recep_field, merge_size, + scope_id, scope_name_prefix, tt_name_prefix, use_4d=False, + use_bn=False, use_final_norm=False, + state=None, activation_fn="relu", same_length=False): + + # self-attention block + residual = x + x = self.add_concat([state, x], scope_name_prefix+"_concat", axis=1) + slice_result = scope_name_prefix + "_slice" + self.add_slice(x, [scope_name_prefix+"_other1", slice_result], 1, [-recep_field]) + if use_bn: + x = self.extract_batch_norm(x, x+"_norm0", scope_id, + data_format="NCHW", axis=-1, layer_names=[scope_name_prefix+"_norm0", "moving_mean", "moving_variance"]) + x = self.RelHistoricalSelfAttention2D(x, n_head, d_head, recep_field, same_length, use_4d, + scope_id, scope_name_prefix) + content = scope_name_prefix + "_padding_drop" + self.add_slice(x, [scope_name_prefix+"_other2", content], 1, [recep_field]) + x = content + x = self.add_sum([residual, x], scope_name_prefix+"_sum") + if not use_bn: + x = self.extract_layer_norm(x, x+"_norm0", scope_id, [scope_name_prefix+"_norm0", "gamma", "beta"]) + + # feed forword block + residual = x + if use_bn: + x = self.extract_batch_norm(x, x+"_norm1", scope_id, + data_format="NCHW", axis=-1, layer_names=[scope_name_prefix+"_norm1", "moving_mean", "moving_variance"]) + x = self.TransformerTransition(x, activation_fn, scope_id, tt_name_prefix) + x = self.add_sum([residual, x], scope_name_prefix+"_sum2") + if not use_bn: + x = self.extract_layer_norm(x, x+"_norm1", scope_id, [scope_name_prefix+"_norm1", "gamma", "beta"]) + + if use_final_norm: + x = self.extract_layer_norm(x, x+"_final_norm", scope_id, [scope_name_prefix+"_final_norm", "gamma", "beta"]) + return x, slice_result + + def generate_prediction_joint_net(self, input_shape, input=None): + prediction_net_input_name = "prediction_net" + prediction_net_input_shape = 
input_shape[prediction_net_input_name] + self.add_input(prediction_net_input_name, prediction_net_input_shape) + encoder_output_name = "encoder" + encoder_output_shape = input_shape[encoder_output_name] + self.add_input(encoder_output_name, encoder_output_shape) + self.set_input(input) + + states = self.prepare_states(self.params["prediction_net"], "prediction_net_mem") + self.save_input() + + attention_id = 1 + x = prediction_net_input_name + new_states = [] + for i in range(3): + tt_name_prefix = "transformer_transition_" + str(attention_id+8) + x, state = self.HistoricalSelfAttentionBlock(x, + n_head=8, d_head=64, + recep_field=5, merge_size=1, + scope_id=0, scope_name_prefix="decoder_hsa_" + str(attention_id), tt_name_prefix=tt_name_prefix, use_4d=False, + state=states[i]) + new_states.append(state) + attention_id += 1 + + x = self.add_concat([x, encoder_output_name], "joint_net_input", 2) + x = self.extract_dense(x, x+"_fc1", 0, ["joint_forward", "kernel", "bias"]) + x = self.add_tanh(x, "joint_net_tanh") + x = self.extract_dense(x, "joint_output_fc", 0, ["joint_classification", "kernel", "bias"]) + self.save_caffe_model() + + def prepare_states(self, states_shape, output_name_prefix, init_with_none=False): + states = [] + state_id = 0 + for item in states_shape: + num = item["num"] + state_shape = item["shape"] + if (init_with_none): + state_shape = [0] * len(state_shape) + for i in range(num): + mem_name = output_name_prefix + str(state_id) + if (self.first_frame): + data = {mem_name: np.zeros(state_shape)} + else: + file_data = np.load(self.state_data_path + "/" + mem_name + ".npy") + data = {mem_name: file_data} + state_shape = file_data.shape + self.add_input(mem_name, state_shape) + self.set_input(data) + states.append(mem_name) + state_id += 1 + return states + + def Conv2DBlock(self, x, filters, scope_name_prefix, kernel_size=[3, 3], strides=[1, 1], + use_relu=True, axis=2, state=None, scope_id=0): + if (len(self.get_tensor_shape(x)) == 5 and len(self.get_tensor_shape(state)) == 4): + shape = self.get_tensor_shape(state) + self.data_dict[state] = self.data_dict[state].reshape( + [self.batch, shape[1]//8, -1, shape[3], 8]) + x = self.add_concat([state, x], scope_name_prefix+"_concat", axis=axis) + slice_result = scope_name_prefix + "_slice" + self.add_slice(x, [scope_name_prefix+"_other1", slice_result], axis, [-1]) + padding = self.calculate_convolution_padding(self.get_tensor_shape(x), kernel_size, strides, 'same') + y = self.extract_convolution(x, scope_name_prefix+"_conv", scope_id, + filters, kernel_size, strides, padding, + data_format="NCHW", weight_format="NHWC", + dilation=1, groups=1, layer_names=[scope_name_prefix+"_conv", "kernel", "bias"]) + y = self.extract_batch_norm(y, scope_name_prefix+"_bn", scope_id, layer_names=[scope_name_prefix+"_bn", "moving_mean", "moving_variance"]) + if use_relu: + y = self.add_relu6(y, scope_name_prefix+"_relu6") + content = scope_name_prefix + "_padding_drop" + self.add_slice(y, [scope_name_prefix+"_other2", content], axis, [1]) + return content, slice_result + + def AlignmentBlock(self, x, filters, scope_name_prefix, extend=4, scope_id=0): + stride = [1, 1] + shape = self.get_tensor_shape(x) + x = self.add_reshape(x, scope_name_prefix+"_r", [self.batch, -1, shape[2], 1]) + if extend > 0: + padding0 = self.calculate_convolution_padding(self.get_tensor_shape(x), [1, 1], stride, 'same') + x = self.extract_convolution(x, scope_name_prefix+"_conv0", scope_id, + filters*extend, [1, 1], stride, padding0, + data_format="NCHW", 
weight_format="NHWC", + dilation=1, groups=1, layer_names=[scope_name_prefix+"_conv0", "kernel", "bias"]) + x = self.extract_batch_norm(x, scope_name_prefix+"_bn0", scope_id, layer_names=[scope_name_prefix+"_bn0", "moving_mean", "moving_variance"]) + x = self.add_relu6(x, scope_name_prefix+"_act0") + + padding1 = self.calculate_convolution_padding(self.get_tensor_shape(x), [1, 1], stride, 'same') + x = self.extract_convolution(x, scope_name_prefix+"_conv1", scope_id, + filters, [1, 1], stride, padding1, + data_format="NCHW", weight_format="NHWC", + dilation=1, groups=1, layer_names=[scope_name_prefix+"_conv1", "kernel", "bias"]) + x = self.extract_batch_norm(x, scope_name_prefix+"_bn1", scope_id, layer_names=[scope_name_prefix+"_bn1", "moving_mean", "moving_variance"]) + else: + padding0 = self.calculate_convolution_padding(self.get_tensor_shape(x), [1, 1], stride, 'same') + x = self.extract_convolution(x, scope_name_prefix+"_conv0", scope_id, + filters, [1, 1], stride, padding0, + data_format="NCHW", weight_format="NHWC", + dilation=1, groups=1, layer_names=[scope_name_prefix+"_conv0", "kernel", "bias"]) + x = self.extract_batch_norm(x, scope_name_prefix+"_bn0", scope_id, layer_names=[scope_name_prefix+"_bn0", "moving_mean", "moving_variance"]) + return x + + def ResConvBlock(self, x, filters, scope_name_prefix, stride=2, axis=2, state=None, scope_id=0): + stride = [stride, stride] + padding0 = self.calculate_convolution_padding(self.get_tensor_shape(x), [1, 1], stride, 'same') + residual = self.extract_convolution(x, scope_name_prefix+"_resconv", scope_id, + filters[0], [1, 1], stride, padding0, + data_format="NCHW", weight_format="NHWC", + dilation=1, groups=1, layer_names=[scope_name_prefix+"_resconv", "kernel", "bias"]) + residual = self.extract_batch_norm(residual, scope_name_prefix+"_resbn", scope_id, layer_names=[scope_name_prefix+"_resbn", "moving_mean", "moving_variance"]) + + x = self.add_relu6(x, scope_name_prefix+"_act0") + x = self.add_concat([state, x], scope_name_prefix+"_concat", axis=axis) + slice_result = scope_name_prefix + "_slice" + self.add_slice(x, [scope_name_prefix+"_other1", slice_result], axis, [-1]) + padding1 = self.calculate_convolution_padding(self.get_tensor_shape(x), [3, 1], stride, 'same') + x = self.extract_convolution(x, scope_name_prefix+"_conv1", scope_id, + filters[1], [3, 1], stride, padding1, + data_format="NCHW", weight_format="NHWC", + dilation=1, groups=1, layer_names=[scope_name_prefix+"_conv1", "kernel", "bias"]) + x = self.extract_batch_norm(x, scope_name_prefix+"_bn1", scope_id, layer_names=[scope_name_prefix+"_bn1", "moving_mean", "moving_variance"]) + x = self.add_relu6(x, scope_name_prefix+"_act1") + content = scope_name_prefix + "_padding_drop" + self.add_slice(x, [scope_name_prefix+"_other2", content], axis, [1]) + x = content + + padding2 = self.calculate_convolution_padding(self.get_tensor_shape(x), [1, 1], stride, 'same') + x = self.extract_convolution(x, scope_name_prefix+"_conv2", scope_id, + filters[2], [1, 1], [1, 1], padding2, + data_format="NCHW", weight_format="NHWC", + dilation=1, groups=1, layer_names=[scope_name_prefix+"_conv2", "kernel", "bias"]) + x = self.extract_batch_norm(x, scope_name_prefix+"_bn2", scope_id, layer_names=[scope_name_prefix+"_bn2", "moving_mean", "moving_variance"]) + x = self.add_relu6(x, scope_name_prefix+"_act2") + + padding3 = self.calculate_convolution_padding(self.get_tensor_shape(x), [1, 1], stride, 'same') + x = self.extract_convolution(x, scope_name_prefix+"_conv3", scope_id, + filters[3], [1, 
1], [1, 1], padding3, + data_format="NCHW", weight_format="NHWC", + dilation=1, groups=1, layer_names=[scope_name_prefix+"_conv3", "kernel", "bias"]) + x = self.extract_batch_norm(x, scope_name_prefix+"_bn3", scope_id, layer_names=[scope_name_prefix+"_bn3", "moving_mean", "moving_variance"]) + x = self.add_sum([x, residual], scope_name_prefix+"_sum") + return x, slice_result + + def TransformerEncoder(self, x, states): + scope_id = 0 + new_states = [] + x = self.transpose_nhwc_nchw(x) + # main blocks + attention_id = 1 + state_id = 0 + + # single-conv block + x, state = self.Conv2DBlock(x, filters=32, scope_name_prefix="conv_1", + kernel_size=[3, 11], strides=[1, 2], + axis=2, state=states[state_id], scope_id=scope_id) + state_id += 1 + new_states.append(state) + x, state = self.Conv2DBlock(x, filters=32, scope_name_prefix="conv_2", + kernel_size=[3, 7], strides=[2, 1], + axis=2, state=states[state_id], scope_id=scope_id) + state_id += 1 + new_states.append(state) + x = self.AlignmentBlock(x, 512, scope_name_prefix="alignment", scope_id=scope_id) + + # attention block 1-3 + x = self.transpose_nchc8_nhc(x) + for i in range(3): + tt_name_prefix = "transformer_transition" + if (attention_id != 1): + tt_name_prefix += "_" + str(attention_id-1) + x, state = self.HistoricalSelfAttentionBlock(x, n_head=8, d_head=64, recep_field=8, merge_size=2, + scope_id=scope_id, scope_name_prefix="hsa_" + str(attention_id), tt_name_prefix=tt_name_prefix, use_4d=True, + state=states[state_id]) + state_id += 1 + new_states.append(state) + attention_id += 1 + x = self.transpose_nhc_nchw(x) + + # res-conv block 1 + x, state = self.ResConvBlock(x, filters=[512, 1024, 2048, 512], scope_name_prefix="conv_block_1", + stride=2, axis=2, state=states[state_id], scope_id=scope_id) + state_id += 1 + new_states.append(state) + + # attention block 4-6 + x = self.transpose_nchc8_nhc(x) + for i in range(3): + tt_name_prefix = "transformer_transition" + if (attention_id != 1): + tt_name_prefix += "_" + str(attention_id-1) + x, state = self.HistoricalSelfAttentionBlock(x, n_head=8, d_head=64, recep_field=12, merge_size=4, + scope_id=scope_id, scope_name_prefix="hsa_" + str(attention_id), tt_name_prefix=tt_name_prefix, use_4d=True, + state=states[state_id]) + state_id += 1 + new_states.append(state) + attention_id += 1 + x = self.transpose_nhc_nchw(x) + + # res-conv block 2 + x, state = self.ResConvBlock(x, filters=[512, 1024, 2048, 512], scope_name_prefix="conv_block_2", + stride=2, axis=2, state=states[state_id], scope_id=scope_id) + state_id += 1 + new_states.append(state) + + # attention block 7-9 + x = self.transpose_nchc8_nhc(x) + for i in range(3): + tt_name_prefix = "transformer_transition" + if (attention_id != 1): + tt_name_prefix += "_" + str(attention_id-1) + x, state = self.HistoricalSelfAttentionBlock(x, n_head=8, d_head=64, recep_field=16, merge_size=8, + scope_id=scope_id, scope_name_prefix="hsa_" + str(attention_id), tt_name_prefix=tt_name_prefix, use_4d=True, + state=states[state_id]) + state_id += 1 + new_states.append(state) + attention_id += 1 + x = self.transpose_nhc_nchw(x) + + # res-conv block 3 + x, state = self.ResConvBlock(x, filters=[1024, 1024, 4096, 1024], scope_name_prefix="conv_block_3", + stride=1, axis=2, state=states[state_id], scope_id=scope_id) + state_id += 1 + new_states.append(state) + + # layers for output + stride = [1, 1] + padding = self.calculate_convolution_padding(self.get_tensor_shape(x), [1, 1], stride, 'valid') + x = self.extract_convolution(x, "joint_encoder_trans", scope_id, + 
512, [1, 1], stride, padding, + data_format="NCHW", weight_format="NHWC", + dilation=1, groups=1, layer_names=["joint_encoder_trans", "kernel", "bias"]) + x = self.extract_batch_norm(x, "joint_encoder_trans_bn", scope_id, layer_names=["joint_encoder_trans_bn", "moving_mean", "moving_variance"]) + x = self.add_relu6(x, "encoder_output") + return x, new_states + + def generate_encoder(self, input_shape, input=None): + encoder_input_name = "encoder" + encoder_input_shape = input_shape[encoder_input_name] + self.add_input(encoder_input_name, encoder_input_shape) + self.set_input(input) + + states = self.prepare_states(self.params["encoder"], "encoder_mem") + self.save_input() + self.TransformerEncoder(encoder_input_name, states) + self.save_caffe_model() diff --git a/model_tools/tools/tensorflow2caffe/asr/tensorflow2caffe_rnnt.py b/model_tools/tools/tensorflow2caffe/asr/tensorflow2caffe_rnnt.py new file mode 100644 index 00000000..dc74609c --- /dev/null +++ b/model_tools/tools/tensorflow2caffe/asr/tensorflow2caffe_rnnt.py @@ -0,0 +1,319 @@ +#!/usr/local/bin/python
# -*- coding: utf-8 -*- + +import math +import numpy as np +import sys +sys.path.append("../") +from tensorflow2caffe import Tensorflow2Caffe + +class Tensorflow2CaffeRNNT(Tensorflow2Caffe): + def __init__(self, + tensorflow_model_path, caffe_model_path_prefix, caffe_model_name, + params, + check=False, calc=False): + Tensorflow2Caffe.__init__(self, tensorflow_model_path, + caffe_model_path_prefix, caffe_model_name, check, calc) + self.params = params + self.check_params() + + @staticmethod + def default_params(): + return { + "BLANK": 0, + + "sequence.max_length": 128, + "sequence.num_units": 240, + + "encoder.lstm_cells": 6, + "encoder.lstm_cell_type": "LSTMP", + "encoder.lstm_cell_state_shape": 1664, + "encoder.use_layer_normed_fc": True, + "encoder.activation": "relu", + "encoder.num_output": 640, + + "prediction_net.lstm_cells": 2, + "prediction_net.lstm_cell_type": "LSTMP", + "prediction_net.lstm_cell_state_shape": 1664, + "prediction_net.use_layer_normed_fc": True, + "prediction_net.activation": "relu", + "prediction_net.num_output": 640, + + "output.activation": None, + } + + def check_params(self): + # NOTE: the consistency check below is disabled (early return); + # encoder.lstm_cell_input_shape is not defined in default_params + return + if (self.params["sequence.num_units"] != self.params["encoder.lstm_cell_input_shape"]): + print("[ERROR] sequence.num_units(%d) must equal encoder.lstm_cell_input_shape(%d)" + % (self.params["sequence.num_units"], self.params["encoder.lstm_cell_input_shape"])) + exit(1) + + + def layer_normed_fc(self, input_name, activation_fn, output_name_prefix, scope_id): + fc_name = output_name_prefix + "_fc" + self.extract_dense(input_name, fc_name, scope_id, scope_name="fully_connected") + ln_name = output_name_prefix + "_ln" + self.extract_layer_norm(fc_name, ln_name, scope_id, ["LayerNorm", "gamma", "beta"]) + result_name = "" + if (activation_fn == "relu"): + relu_name = output_name_prefix + "_relu" + result_name = self.add_relu(ln_name, relu_name) + else: + print("[ERROR] unsupported activation function %s" % (activation_fn)) + exit(1) + return result_name + + def extract_encoder(self, input_name, states, scope_id): + self.scopes[scope_id] = "encoder" + + self.scopes[scope_id+1] = "rnn" + self.scopes[scope_id+2] = "multi_rnn_cell" + last_input_name = input_name + for layer_idx in range(self.params["encoder.lstm_cells"]): + if (self.params["encoder.lstm_cell_type"] == "LSTMP"): + self.scopes[scope_id+3] = "cell_" + str(layer_idx) + self.scopes[scope_id+4] = "lstm_cell" + lstm_output_name = "encoder_lstm" + str(layer_idx) + "_cell"
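+ # LSTMP cell weights live under the TF scope cell_<layer_idx>/lstm_cell; use_proj=True selects the projected (LSTMP) variant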
"_cell" + self.extract_lstm(last_input_name, states[layer_idx], lstm_output_name, scope_id+4, scope_name = "lstm_cell", use_proj=True) + last_input_name = lstm_output_name + else: + print("[ERROR] unsupported lstm type %s" % (self.params["encoder.lstm_cell_type"])) + exit(1) + if (self.params["encoder.use_layer_normed_fc"]): + result_name = self.layer_normed_fc(last_input_name, self.params["encoder.activation"], "encoder", scope_id+1) + else: + result_name = last_input_name + return result_name + + def extract_prediction_net(self, input_name, states, scope_id): + self.scopes[scope_id] = "prediction_net" + + eb_name = "prediction_net_embedding" + self.scopes[scope_id+1] = "embedding" + self.extract_embedding(input_name, scope_id+2, "embedding", eb_name) + + squeeze_name = eb_name + "_squeeze" + self.add_squeeze(eb_name, axis=1, output_name=squeeze_name) + + self.scopes[scope_id+1] = "rnn" + self.scopes[scope_id+2] = "multi_rnn_cell" + last_input_name = squeeze_name + for layer_idx in range(self.params["prediction_net.lstm_cells"]): + if (self.params["prediction_net.lstm_cell_type"] == "LSTMP"): + self.scopes[scope_id+3] = "cell_" + str(layer_idx) + self.scopes[scope_id+4] = "lstm_cell" + lstm_output_name = "prediction_net_lstm" + str(layer_idx) + "_cell" + self.extract_lstm(last_input_name, states[layer_idx], lstm_output_name, scope_id+4, scope_name = "lstm_cell", use_proj=True) + last_input_name = lstm_output_name + else: + print("[ERROR] unsupported lstm type %s" % (self.params["prediction_net.lstm_cell_type"])) + exit(1) + + if (self.params["prediction_net.use_layer_normed_fc"]): + result_name = self.layer_normed_fc(last_input_name, self.params["prediction_net.activation"], "prediction_net", scope_id+1) + else: + result_name = last_input_name + return result_name + + def extract_joint_net(self, input_names, scope_id): + self.scopes[scope_id] = "joint_net" + + fc0_name = "joint_net_fc" + self.extract_dense(input_names[0], fc0_name, scope_id+1, scope_name = "dense") + #ep0_name = "joint_net_expand0" + #self.add_expand_dims(fc0_name, axis=2, output_name=ep0_name) + fc1_name = "joint_net_fc_1" + self.extract_dense(input_names[1], fc1_name, scope_id+1, scope_name = "dense_1") + #ep1_name = "joint_net_expand1" + #self.add_expand_dims(fc1_name, axis=1, output_name=ep1_name) + sum_name = "joint_net_sum" + #self.add_sum([ep0_name, ep1_name], sum_name) + self.add_sum([fc0_name, fc1_name], sum_name) + tanh_name = "joint_net_tanh" + self.add_tanh(sum_name, tanh_name) + return tanh_name + + def extract_output(self, input_name, scope_id): + self.scopes[scope_id] = "rnnt_output" + + fc_name = "output_fc" + self.extract_dense(input_name, fc_name, scope_id+1, scope_name = "rnnt_output_fc") + + activation_name = "" + if (self.params["output.activation"] is not None): + print("[ERROR] unsupported activation function" % (self.params["output.activation"])) + exit(1) + else: + activation_name = fc_name + argmax_name = self.add_argmax(activation_name, axis=-1, output_name="output_argmax") + return argmax_name + + def generate_encoder(self, input=None): + encoder_input_name = "sounds" + encoder_input_shape = [self.batch, self.params["sequence.num_units"]] + self.add_input(encoder_input_name, encoder_input_shape) + self.set_input(input) + self.scopes[0] = "rnnt" + encoder_states = [] + for layer_idx in range(self.params["encoder.lstm_cells"]): + state_shape = [self.batch, self.params["encoder.lstm_cell_state_shape"]] + state_name = "encoder_lstm" + str(layer_idx) + "_state" + self.add_input(state_name, 
state_shape) + encoder_states.append(state_name) + encoder_output = self.extract_encoder(encoder_input_name, encoder_states, 1) + self.save_caffe_model() + + def generate_prediction_net(self, input=None): + prediction_net_input_name = "prediction_net_input" + prediction_net_input_shape = [self.batch, 1] + self.add_input(prediction_net_input_name, prediction_net_input_shape) + self.set_input(input) + self.scopes[0] = "rnnt" + prediction_net_states = [] + for layer_idx in range(self.params["prediction_net.lstm_cells"]): + state_shape = [self.batch, self.params["prediction_net.lstm_cell_state_shape"]] + state_name = "prediction_net_lstm" + str(layer_idx) + "_state" + self.add_input(state_name, state_shape) + prediction_net_states.append(state_name) + prediction_net_output = self.extract_prediction_net(prediction_net_input_name, prediction_net_states, 1) + self.save_caffe_model() + + def generate_joint_net(self, input=None): + encoder_output = "encoder" + encoder_output_shape = [self.batch, self.params["encoder.num_output"]] + prediction_net_output = "prediction_net" + prediction_net_output_shape = [self.batch, self.params["prediction_net.num_output"]] + self.add_input(encoder_output, encoder_output_shape) + self.add_input(prediction_net_output, prediction_net_output_shape) + self.set_input(input) + self.scopes[0] = "rnnt" + joint_net_output = self.extract_joint_net([encoder_output, prediction_net_output], 1) + label_output = self.extract_output(joint_net_output, 0) + self.save_caffe_model() + + def generate(self, input=None): + sounds_input_name = "sounds" + sounds_input_shape = [self.batch, self.params["sequence.max_length"], self.params["sequence.num_units"]] + self.add_input(sounds_input_name, sounds_input_shape) + self.set_input(input) + + labels_output_name = "labels" + labels_output_shape = [self.batch, self.params["sequence.max_length"]] + self.add_memory(labels_output_name, labels_output_shape, data_type="INT32") + + position_input_name = "position" + position_input_shape = [self.batch, 1] + self.add_memory(position_input_name, position_input_shape, data_type="INT32") + + encoder_input_name = "encoder_input" + encoder_input_shape = [self.batch, self.params["sequence.num_units"]] + self.add_memory(encoder_input_name, encoder_input_shape, data_type="FLOAT32") + + prediction_net_input_name = "prediction_net_input" + prediction_net_input_shape = [self.batch, 1] + self.add_memory(prediction_net_input_name, prediction_net_input_shape, data_type="INT32") + + prediction_net_status_name = "prediction_net_status" + prediction_net_status_shape = [self.batch, 1] + self.add_memory(prediction_net_status_name, prediction_net_status_shape, data_type="INT32") + + blank = "BLANK" + weight = np.array([[self.params["BLANK"]] * self.batch]) + self.add_weight(blank, weight=weight, data_type="INT32") + negative_one = "negative_one" + weight = np.array([[-1] * self.batch]) + self.add_weight(negative_one, weight=weight, data_type="INT32") + zero = "zero" + weight = np.array([[0]*self.batch], dtype=int) + self.add_weight(zero, weight=weight, data_type="INT32") + + # init position + self.add_copy(negative_one, 1, 1, 0, + position_input_name, 1, 1, 0, + 1, output_name="init_position") + + # init prediction_net input + self.add_copy(blank, 1, 1, 0, + prediction_net_input_name, 1, 1, 0, + 1, output_name="init_prediction_net_input") + + # init prediction_net status + self.add_copy(zero, 1, 1, 0, + prediction_net_status_name, 1, 1, 0, + 1, output_name="init_prediction_net_status") + + encoder_states = [] + for 
layer_idx in range(self.params["encoder.lstm_cells"]): + state_shape = [self.batch, self.params["encoder.lstm_cell_state_shape"]] + state_name = "encoder_lstm" + str(layer_idx) + "_state" + self.add_memory(state_name, state_shape, data_type="FLOAT32") + encoder_states.append(state_name) + prediction_net_states = [] + for layer_idx in range(self.params["prediction_net.lstm_cells"]): + state_shape = [self.batch, self.params["prediction_net.lstm_cell_state_shape"]] + state_name = "prediction_net_lstm" + str(layer_idx) + "_state" + self.add_memory(state_name, state_shape, data_type="FLOAT32") + prediction_net_states.append(state_name) + + sequence_length = 1 + if (input is not None): + sequence_length = self.get_tensor(sounds_input_name).shape[-2] + + copy_sound_name = "copy_to_encoder_input" + repeat_name = "loops" + prediction_net_output = "" + self.add_jump(repeat_name, "jump_to_repeat", prediction_net_status_name) + for step in range(sequence_length): + self.scopes[0] = "rnnt" + self.set_add_layer(step==0) + + position_input_name_new = position_input_name+"_add_one" + self.add_power(position_input_name, position_input_name_new, scale=1, shift=1, power=1) + self.add_copy(position_input_name_new, 1, 1, 0, + position_input_name, 1, 1, 0, + 1, output_name="update_position") + + jump_name = "skip_blank" + self.add_jump(copy_sound_name, jump_name, prediction_net_status_name) + if (isinstance(self.get_tensor(prediction_net_status_name)[0][0], bool) and not self.get_tensor(prediction_net_status_name)[0][0]) \ + or (isinstance(self.get_tensor(prediction_net_status_name)[0][0], int) and self.get_tensor(prediction_net_status_name)[0][0] == 0) \ + or (isinstance(self.get_tensor(prediction_net_status_name)[0][0], float) and self.get_tensor(prediction_net_status_name)[0][0] == 0): + prediction_net_output = self.extract_prediction_net(prediction_net_input_name, prediction_net_states, 1) + + self.add_copy(sounds_input_name, + self.params["sequence.max_length"]*self.params["sequence.num_units"], self.params["sequence.num_units"], 0, + encoder_input_name, + self.params["sequence.num_units"], self.params["sequence.num_units"], 0, + self.params["sequence.num_units"], + output_name=copy_sound_name, + src_index_name=position_input_name, + dst_index_name=zero) + encoder_output = self.extract_encoder(encoder_input_name, encoder_states, 1) + + joint_net_output = self.extract_joint_net([encoder_output, prediction_net_output], 1) + + label_output = self.extract_output(joint_net_output, 0) + self.add_copy(label_output, 1, 1, 0, + prediction_net_input_name, 1, 1, 0, + 1, output_name="copy_to_prediction_net_input") + self.add_copy(label_output, + 1, 1, 0, + labels_output_name, + 1, 1, 0, + 1, + output_name="copy_to_global_labels", + src_index_name=zero, + dst_index_name=position_input_name) + status_name = "check_label_is_blank" + self.add_check(blank, label_output, "equal", status_name) + # set prediction_net status + self.add_copy(status_name, 1, 1, 0, + prediction_net_status_name, 1, 1, 0, + 1, output_name="set_prediction_net_status") + self.add_repeat(self.params["sequence.max_length"]-1, position_input_name_new, + output_name=repeat_name, + status_name=zero, + axis_name=sounds_input_name, axis=1) + + self.save_caffe_model() diff --git a/model_tools/tools/tensorflow2caffe/asr/transform_convolution_transformer.py b/model_tools/tools/tensorflow2caffe/asr/transform_convolution_transformer.py new file mode 100644 index 00000000..2f27cded --- /dev/null +++ 
b/model_tools/tools/tensorflow2caffe/asr/transform_convolution_transformer.py @@ -0,0 +1,55 @@ +#!/usr/local/bin/python +# -*- coding: utf-8 -*- + +from tensorflow2caffe_convolution_transformer import Tensorflow2CaffeConvolutionTransformer +import numpy as np + +def transform_encoder(model_path_prefix, data_path_prefix, quantization, block_id_start, block_id_end): + tensorflow_model_path = model_path_prefix + "/encoder.pb" + caffe_model_path_prefix = "asr_convolution_transformer_encoder" + caffe_model_name = "asr_convolution_transformer_encoder" + first_frame = True + asr_caffe = Tensorflow2CaffeConvolutionTransformer(tensorflow_model_path, caffe_model_path_prefix, caffe_model_name, + nchwc8=True, first_frame=first_frame, + check=False, calc=True, quantization=quantization) + data = {} + if (first_frame): + data["sounds"] = np.load(data_path_prefix + "/sound0_frame0.npy") + else: + data["sounds"] = np.load(data_path_prefix + "/sound0_frame1.npy") + asr_caffe.generate_encoder(data, block_id_start, block_id_end) + + +def transform_prediction_net(model_path_prefix, quantization): + tensorflow_model_path = model_path_prefix + "/pred_net.pb" + caffe_model_path_prefix = "asr_convolution_transformer_prediction_net" + caffe_model_name = "asr_convolution_transformer_prediction_net" + asr_caffe = Tensorflow2CaffeConvolutionTransformer(tensorflow_model_path, caffe_model_path_prefix, caffe_model_name, + check=False, calc=True, quantization=quantization) + data = {} + data["label"] = np.array([[1]]) + asr_caffe.generate_prediction_net(data) + +def transform_joint_net(model_path_prefix, data_path_prefix, quantization): + tensorflow_model_path = model_path_prefix + "/joint_net.pb" + caffe_model_path_prefix = "asr_convolution_transformer_joint_net" + caffe_model_name = "asr_convolution_transformer_joint_net" + asr_caffe = Tensorflow2CaffeConvolutionTransformer(tensorflow_model_path, caffe_model_path_prefix, caffe_model_name, + check=False, calc=True, quantization=quantization) + shapes = {} + shapes["encoder"] = [1, 512] + shapes["prediction_net"] = [1, 512] + data = {} + data["encoder"] = np.load(data_path_prefix + "/encoder.npy") + data["prediction_net"] = np.load(data_path_prefix + "/pred_net.npy") + asr_caffe.print_weight_map() + asr_caffe.generate_joint_net(shapes, data) + +if __name__ == '__main__': + model_path_prefix = "/data/models/asr/OpenSeq2Seq/pipeline/model/transducer" + data_path_prefix = "/data/models/asr/OpenSeq2Seq" + + quantization = False + transform_encoder(model_path_prefix, data_path_prefix, quantization, 0, -1) + transform_prediction_net(model_path_prefix, quantization) + transform_joint_net(model_path_prefix, data_path_prefix, quantization) diff --git a/model_tools/tools/tensorflow2caffe/asr/transform_convolution_transformer_keras.py b/model_tools/tools/tensorflow2caffe/asr/transform_convolution_transformer_keras.py new file mode 100644 index 00000000..94b08198 --- /dev/null +++ b/model_tools/tools/tensorflow2caffe/asr/transform_convolution_transformer_keras.py @@ -0,0 +1,42 @@ +#!/usr/local/bin/python +# -*- coding: utf-8 -*- + +from tensorflow2caffe_convolution_transformer_keras import Tensorflow2CaffeConvolutionTransformerKeras +import numpy as np + +def transform_encoder(model_path_prefix): + tensorflow_model_path = model_path_prefix + "/transformer_t_db_s8_r33_v1_i160_20200330_encoder.pb" + caffe_model_path_prefix = "asr_convolution_transformer_encoder" + caffe_model_name = "asr_convolution_transformer_encoder" + params = 
Tensorflow2CaffeConvolutionTransformerKeras.default_params() + asr_caffe = Tensorflow2CaffeConvolutionTransformerKeras(tensorflow_model_path, caffe_model_path_prefix, caffe_model_name, + params, + check=False, calc=True) + shapes = {} + shapes["encoder"] = [1, 32, 41, 1] + data = {} + data["encoder"] = np.ones(shapes["encoder"]) + asr_caffe.generate_encoder(shapes, data) + +def transform_prediction_joint_net(model_path_prefix): + tensorflow_model_path = model_path_prefix + "/transformer_t_db_s8_r33_v1_i160_20200330_decoder_joint.pb" + caffe_model_path_prefix = "asr_convolution_transformer_joint_net" + caffe_model_name = "asr_convolution_transformer_joint_net" + params = Tensorflow2CaffeConvolutionTransformerKeras.default_params() + asr_caffe = Tensorflow2CaffeConvolutionTransformerKeras(tensorflow_model_path, caffe_model_path_prefix, caffe_model_name, + params, + check=False, calc=True) + shapes = {} + shapes["prediction_net"] = [1, 1, 512] + shapes["encoder"] = [1, 1, 512] + data = {} + data["prediction_net"] = np.ones(shapes["prediction_net"]) + data["encoder"] = np.ones(shapes["encoder"]) + asr_caffe.print_weight_map() + asr_caffe.generate_prediction_joint_net(shapes, data) + +if __name__ == '__main__': + model_path_prefix = "/data/models/asr/transformer_t_db_s8_r33_v1_i160_20200330153153" + + transform_encoder(model_path_prefix) + transform_prediction_joint_net(model_path_prefix) diff --git a/model_tools/tools/tensorflow2caffe/asr/transform_rnnt.py b/model_tools/tools/tensorflow2caffe/asr/transform_rnnt.py new file mode 100644 index 00000000..07d4f187 --- /dev/null +++ b/model_tools/tools/tensorflow2caffe/asr/transform_rnnt.py @@ -0,0 +1,35 @@ +#!/usr/local/bin/python +# -*- coding: utf-8 -*- + +from tensorflow2caffe_rnnt import Tensorflow2CaffeRNNT +import numpy as np + + +if __name__ == '__main__': + tensorflow_model_path = "/data/models/asr/rnnt-170M/checkpoints/rnnt_spm2048_ms/model.ckpt-424000" + params = Tensorflow2CaffeRNNT.default_params() + + mode = "sub" + if mode == "all": + caffe_model_path_prefix = "asr_rnnt" + caffe_model_name = "asr_rnnt" + asr_caffe = Tensorflow2CaffeRNNT(tensorflow_model_path, caffe_model_path_prefix, caffe_model_name, + params, + check=False, calc=True) + data = {} + input = np.loadtxt('/data/models/asr/rnnt-450M/sound_data_0.txt') + data["sounds"] = np.reshape(input, [1, -1, params['sequence.num_units']]) + asr_caffe.generate(data) + else: + asr_caffe = Tensorflow2CaffeRNNT(tensorflow_model_path, "asr_rnnt_encoder", "asr_rnnt_encoder", + params, + check=False, calc=False) + asr_caffe.generate_encoder({}) + asr_caffe = Tensorflow2CaffeRNNT(tensorflow_model_path, "asr_rnnt_prediction_net", "asr_rnnt_prediction_net", + params, + check=False, calc=False) + asr_caffe.generate_prediction_net({}) + asr_caffe = Tensorflow2CaffeRNNT(tensorflow_model_path, "asr_rnnt_joint_net", "asr_rnnt_joint_net", + params, + check=False, calc=False) + asr_caffe.generate_joint_net({}) diff --git a/model-tools/tools/tensorflow2caffe/bert/albert/tensorflow2caffe_albert.py b/model_tools/tools/tensorflow2caffe/bert/albert/tensorflow2caffe_albert.py similarity index 100% rename from model-tools/tools/tensorflow2caffe/bert/albert/tensorflow2caffe_albert.py rename to model_tools/tools/tensorflow2caffe/bert/albert/tensorflow2caffe_albert.py diff --git a/model-tools/tools/tensorflow2caffe/bert/albert/transform_albert.py b/model_tools/tools/tensorflow2caffe/bert/albert/transform_albert.py similarity index 100% rename from 
model-tools/tools/tensorflow2caffe/bert/albert/transform_albert.py rename to model_tools/tools/tensorflow2caffe/bert/albert/transform_albert.py diff --git a/model-tools/tools/tensorflow2caffe/bert/tensorflow2caffe_bert.py b/model_tools/tools/tensorflow2caffe/bert/tensorflow2caffe_bert.py similarity index 98% rename from model-tools/tools/tensorflow2caffe/bert/tensorflow2caffe_bert.py rename to model_tools/tools/tensorflow2caffe/bert/tensorflow2caffe_bert.py index 63979326..44f2a204 100644 --- a/model-tools/tools/tensorflow2caffe/bert/tensorflow2caffe_bert.py +++ b/model_tools/tools/tensorflow2caffe/bert/tensorflow2caffe_bert.py @@ -89,7 +89,7 @@ def extract_encoder_attention(self, input_name, attention_mask_input_name, outpu query_key_name = output_name_prefix + "att_self_qk" self.add_matmul(query_transpose_name, key_transpose_name, query_key_name) query_key_scale_name = output_name_prefix + "att_self_qks" - self.add_multiply(query_key_name, query_key_scale_name, 1.0/math.sqrt(size_per_head)) + self.add_power(query_key_name, query_key_scale_name, scale=1.0/math.sqrt(size_per_head)) # query * key + mask if (attention_mask_input_name is None): diff --git a/model-tools/tools/tensorflow2caffe/bert/tinybert/adb_run.sh b/model_tools/tools/tensorflow2caffe/bert/tinybert/adb_run.sh similarity index 78% rename from model-tools/tools/tensorflow2caffe/bert/tinybert/adb_run.sh rename to model_tools/tools/tensorflow2caffe/bert/tinybert/adb_run.sh index dd0dfd26..1580f72e 100644 --- a/model-tools/tools/tensorflow2caffe/bert/tinybert/adb_run.sh +++ b/model_tools/tools/tensorflow2caffe/bert/tinybert/adb_run.sh @@ -9,4 +9,4 @@ adb shell mkdir ${device_dir}/data adb shell mkdir ${device_dir}/data/input adb shell mkdir ${device_dir}/data/result adb push sequence.seq ${device_dir}/data/input/0.seq -adb shell "cd ${device_dir} && ./tinybert tinybert_f16.bolt data" &> result.txt +adb shell "cd ${device_dir} && ./tinybert -m tinybert_f16.bolt -i data -a CPU_AFFINITY_HIGH_PERFORMANCE" &> result.txt diff --git a/model-tools/tools/tensorflow2caffe/bert/tinybert/result.txt b/model_tools/tools/tensorflow2caffe/bert/tinybert/result.txt similarity index 100% rename from model-tools/tools/tensorflow2caffe/bert/tinybert/result.txt rename to model_tools/tools/tensorflow2caffe/bert/tinybert/result.txt diff --git a/model-tools/tools/tensorflow2caffe/bert/tinybert/sequence.seq b/model_tools/tools/tensorflow2caffe/bert/tinybert/sequence.seq similarity index 100% rename from model-tools/tools/tensorflow2caffe/bert/tinybert/sequence.seq rename to model_tools/tools/tensorflow2caffe/bert/tinybert/sequence.seq diff --git a/model-tools/tools/tensorflow2caffe/bert/tinybert/tensorflow2caffe_tinybert.py b/model_tools/tools/tensorflow2caffe/bert/tinybert/tensorflow2caffe_tinybert.py similarity index 76% rename from model-tools/tools/tensorflow2caffe/bert/tinybert/tensorflow2caffe_tinybert.py rename to model_tools/tools/tensorflow2caffe/bert/tinybert/tensorflow2caffe_tinybert.py index b1266e1f..c21406a0 100644 --- a/model-tools/tools/tensorflow2caffe/bert/tinybert/tensorflow2caffe_tinybert.py +++ b/model_tools/tools/tensorflow2caffe/bert/tinybert/tensorflow2caffe_tinybert.py @@ -16,9 +16,7 @@ def __init__(self, max_seq_length, embedding_dim, encoder_layers, num_heads, check, calc) self.max_ambiguate = (self.max_seq_length * self.max_seq_length - 1) // 2 - def extract_dense_prefix(self, input_name, dense_name, weight_name_prefix): - kernel_name = weight_name_prefix + "weight" - bias_name = weight_name_prefix + "bias" + def 
extract_dense_prefix(self, input_name, dense_name, kernel_name, bias_name): kernel = self.get_weight(kernel_name) bias = self.get_weight(bias_name) layer = caffe_net.LayerParameter(name=dense_name, type='InnerProduct', @@ -35,11 +33,12 @@ def extract_dense_prefix(self, input_name, dense_name, weight_name_prefix): self.data_dict[dense_name] = Operators.fully_connect(self.data_dict[input_name], kernel.transpose((1, 0)), bias, dense_name) + return dense_name def extract_intent_classifier(self, input_name): dense_name = "intent_classifier" weight_name_prefix = "intent_classifier_" - self.extract_dense_prefix(input_name, dense_name, weight_name_prefix) + self.extract_dense_prefix(input_name, dense_name, weight_name_prefix+"weight", weight_name_prefix+"bias") softmax_name = "intent_softmax" self.add_softmax(dense_name, softmax_name, -1) @@ -49,7 +48,7 @@ def extract_intent_classifier(self, input_name): def extract_slot_classifier(self, input_name): dense_name = "slot_classifier" weight_name_prefix = "slot_classifier_" - self.extract_dense_prefix(input_name, dense_name, weight_name_prefix) + self.extract_dense_prefix(input_name, dense_name, weight_name_prefix+"weight", weight_name_prefix+"bias") softmax_name = "slot_softmax" self.add_softmax(dense_name, softmax_name, -1) @@ -59,13 +58,24 @@ def extract_slot_classifier(self, input_name): def extract_mrpc_classifier(self, input_name): dense_name = "mrpc_classifier" weight_name_prefix = "classifier_" - self.extract_dense_prefix(input_name, dense_name, weight_name_prefix) + self.extract_dense_prefix(input_name, dense_name, weight_name_prefix+"weight", weight_name_prefix+"bias") softmax_name = "mrpc_softmax" self.add_softmax(dense_name, softmax_name, -1) return softmax_name + def extract_tts_preprocess_task(self, x, scope_id, scope_name, weight_name, bias_name): + self.scopes[scope_id] = scope_name + for i in range(3): + name = scope_name + '_dense%d' % (i+1) + x = self.extract_dense(x, name, scope_id+1, scope_name='dense_%d' % (i+1)) + x = self.add_relu(x, name+"_relu") + logits = self.extract_dense_prefix(x, scope_name+"_dense", weight_name, bias_name) + #logits = tf.reshape(logits, [batch_size, seq_length, num_labels]) + pred_ids = self.add_argmax(logits, axis=-1, output_name=scope_name+"_argmax") + return pred_ids + def generate_intent_slot_task(self, input=None): word_input_name = "tinybert_words" position_input_name = "tinybert_positions" @@ -81,6 +91,7 @@ def generate_intent_slot_task(self, input=None): self.add_input(token_input_name, token_input_shape) #self.add_input(attention_mask_input_name, attention_mask_input_shape) self.set_input(input) + self.save_input() attention_mask_name = None #"attention" #self.add_attention(attention_mask_input_name, self.num_heads, self.max_seq_length, self.max_seq_length, attention_mask_name); @@ -108,6 +119,7 @@ def generate_mrpc_task(self, input=None): self.add_input(token_input_name, token_input_shape) #self.add_input(attention_mask_input_name, attention_mask_input_shape) self.set_input(input) + self.save_input() attention_mask_name = None #"attention" #self.add_attention(attention_mask_input_name, self.num_heads, self.max_seq_length, self.max_seq_length, attention_mask_name); @@ -137,6 +149,7 @@ def generate_disambiguate_task(self, input=None): self.add_input(word_mask_input_name, word_mask_input_shape) self.add_input(dict_input_name, dict_input_shape) self.set_input(input) + self.save_input() attention_mask_name = None @@ -149,7 +162,31 @@ def generate_disambiguate_task(self, input=None): output_name = 
self.add_concat([output_name1, output_name2], "mask_result_concat", 2) dense_name = "slot_classifier1" weight_name_prefix = "slot_classifier1_" - output_name = self.extract_dense_prefix(output_name, dense_name, weight_name_prefix) + output_name = self.extract_dense_prefix(output_name, dense_name, weight_name_prefix+"weight", weight_name_prefix+"bias") slots = self.extract_slot_classifier(dense_name) self.save_caffe_model() + + def generate_tts_preprocess_task(self, input=None): + word_input_name = "tinybert_words" + position_input_name = "tinybert_positions" + token_input_name = "tinybert_token_type" + word_input_shape = [self.batch, self.max_seq_length] + position_input_shape = [self.batch, self.max_seq_length] + token_input_shape = [self.batch, self.max_seq_length] + + self.add_input(word_input_name, word_input_shape) + self.add_input(position_input_name, position_input_shape) + self.add_input(token_input_name, token_input_shape) + self.set_input(input) + self.save_input() + + attention_mask_name = None + + output_name = self.extract_embeddings(word_input_name, position_input_name, token_input_name) + output_names = self.extract_encoder(output_name, attention_mask_name) + self.scopes[0] = "loss" + output1 = self.extract_tts_preprocess_task(output_names[-1], 1, 'dense_layer', "output_weights1", "output_bias1") + output2 = self.extract_tts_preprocess_task(output_names[-1], 1, 'dense_layer2', "output_weights2", "output_bias2") + + self.save_caffe_model() diff --git a/model-tools/tools/tensorflow2caffe/bert/tinybert/tinybert-infer.py b/model_tools/tools/tensorflow2caffe/bert/tinybert/tinybert-infer.py similarity index 100% rename from model-tools/tools/tensorflow2caffe/bert/tinybert/tinybert-infer.py rename to model_tools/tools/tensorflow2caffe/bert/tinybert/tinybert-infer.py diff --git a/model-tools/tools/tensorflow2caffe/bert/tinybert/tokenization.py b/model_tools/tools/tensorflow2caffe/bert/tinybert/tokenization.py similarity index 100% rename from model-tools/tools/tensorflow2caffe/bert/tinybert/tokenization.py rename to model_tools/tools/tensorflow2caffe/bert/tinybert/tokenization.py diff --git a/model-tools/tools/tensorflow2caffe/bert/tinybert/transform_bert.py b/model_tools/tools/tensorflow2caffe/bert/tinybert/transform_bert.py similarity index 100% rename from model-tools/tools/tensorflow2caffe/bert/tinybert/transform_bert.py rename to model_tools/tools/tensorflow2caffe/bert/tinybert/transform_bert.py diff --git a/model-tools/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_disambiguate.py b/model_tools/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_disambiguate.py similarity index 93% rename from model-tools/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_disambiguate.py rename to model_tools/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_disambiguate.py index 04ef08af..88eceaad 100644 --- a/model-tools/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_disambiguate.py +++ b/model_tools/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_disambiguate.py @@ -16,8 +16,8 @@ embedding_dim = params["emb_size"] encoder_layers = params["num_hidden_layers"] num_heads = params["num_attention_heads"] - caffe_model_path_prefix = "tinybert_intent_slot" - caffe_model_name = "tinybert_intent_slot" + caffe_model_path_prefix = "tinybert_disambiguate" + caffe_model_name = "tinybert_disambiguate" bert_caffe = Tensorflow2CaffeTinyBert(tensorflow_model_path, caffe_model_path_prefix, caffe_model_name, diff --git 
a/model-tools/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_intent_slot.py b/model_tools/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_intent_slot.py similarity index 100% rename from model-tools/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_intent_slot.py rename to model_tools/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_intent_slot.py diff --git a/model-tools/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_mrpc.py b/model_tools/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_mrpc.py similarity index 100% rename from model-tools/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_mrpc.py rename to model_tools/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_mrpc.py diff --git a/model_tools/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_tts_preprocess.py b/model_tools/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_tts_preprocess.py new file mode 100644 index 00000000..16e1a461 --- /dev/null +++ b/model_tools/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_tts_preprocess.py @@ -0,0 +1,31 @@ +#!/usr/local/bin/python +# -*- coding: utf-8 -*- + +from tensorflow2caffe_tinybert import Tensorflow2CaffeTinyBert +import numpy as np +import json + +if __name__ == '__main__': + tensorflow_model_path = "/data/models/bert/tinybert/tts-tinybert/model.ckpt-173506" + configure_file_path = "/data/models/bert/tinybert/tts-tinybert/config.json" + configure_file = open(configure_file_path) + params = json.load(configure_file) + configure_file.close() + + max_seq_length = 64 + embedding_dim = params["emb_size"] + encoder_layers = params["num_hidden_layers"] + num_heads = params["num_attention_heads"] + caffe_model_path_prefix = "tts_tinybert" + caffe_model_name = "tts_tinybert" + + bert_caffe = Tensorflow2CaffeTinyBert(tensorflow_model_path, + caffe_model_path_prefix, caffe_model_name, + max_seq_length, embedding_dim, encoder_layers, num_heads, + True, True) + data = {} + data["tinybert_words"] = np.array([[101,1045,2342,1037,14764,2005,2296,5353,3531,102]]) + tinybert_length = len(data["tinybert_words"][0]) + data["tinybert_positions"] = np.array([[i for i in range(tinybert_length)]]) + data["tinybert_token_type"] = np.array([[0] * tinybert_length]) + bert_caffe.generate_tts_preprocess_task(data) diff --git a/model-tools/tools/tensorflow2caffe/bert/transform_bert.py b/model_tools/tools/tensorflow2caffe/bert/transform_bert.py similarity index 100% rename from model-tools/tools/tensorflow2caffe/bert/transform_bert.py rename to model_tools/tools/tensorflow2caffe/bert/transform_bert.py diff --git a/model_tools/tools/tensorflow2caffe/nmt/tensorflow2caffe_transformer_lstm.py b/model_tools/tools/tensorflow2caffe/nmt/tensorflow2caffe_transformer_lstm.py new file mode 100644 index 00000000..0c2a1147 --- /dev/null +++ b/model_tools/tools/tensorflow2caffe/nmt/tensorflow2caffe_transformer_lstm.py @@ -0,0 +1,661 @@ +#!/usr/local/bin/python +# -*- coding: utf-8 -*- + +import math +import numpy as np +import sys +sys.path.append("../") +from tensorflow2caffe import Tensorflow2Caffe + +class Tensorflow2CaffeTransformerLstm(Tensorflow2Caffe): + def __init__(self, + tensorflow_model_path, caffe_model_path_prefix, caffe_model_name, + max_seq_length, max_decode_length, + encoder_params, decoder_params, + use_small_word_list=False, max_candidate_size=0, + check=False, calc=False): + Tensorflow2Caffe.__init__(self, tensorflow_model_path, + caffe_model_path_prefix, caffe_model_name, check, calc) + self.scopes[0] = "seq2seq_model" + 
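# NOTE: self.scopes is a list of TensorFlow variable-scope names indexed by nesting depth; scopes[0] is the checkpoint root under which all weights are resolved +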
self.encoder_params = encoder_params + self.decoder_params = decoder_params + self.max_seq_length = max_seq_length + self.max_decode_length = max_decode_length + self.use_small_word_list = use_small_word_list + self.max_candidate_size = max_candidate_size + if (self.use_small_word_list and self.max_candidate_size == 0): + self.max_candidate_size = self.max_seq_length * 50 + 2000 + self.encoder_outputs = {} + + @staticmethod + def default_encoder_params(): + return { + "num_units": 512, + "num_layers": 6, + "layer.preprocess": "n", + "layer.postprocess": "da", + "ffn.num_units": 2048, + "ffn.activation": "relu", # relu or swish + "attention.num_heads": 8, + "attention.branch": False, # weighted transformer in https://arxiv.org/pdf/1711.02132.pdf + "attention.relpos": 0, # relative position representation in https://arxiv.org/pdf/1803.02155.pdf + "dropout_rate": 0.1, + "position.enable": True, + "position.combiner_fn": "tensorflow.add", + "initializer": "uniform_unit_scaling", + "init_scale": 1.0, + "share_level": 1 # share_level=N: every N consecutive layers share the same params + } + + @staticmethod + def default_decoder_params(): + return { + "num_layers": 6, + "num_units": 512, + "layer.preprocess": "n", + "layer.postprocess": "da", + "attention.self_average": False, + "attention.num_heads": 8, + "attention.branch": False, + "attention.relpos": 0, + "ffn.num_units": 2048, + "ffn.activation": "relu", + "dropout_rate": 0.1, + "position.enable": True, + "position.combiner_fn": "tensorflow.add", + "position.max_length": 1000, + "decode_length_factor": 2., + "flex_decode_length": True, + "initializer": "uniform_unit_scaling", + "init_scale": 1.0, + "attention.weighted_avg": False, + "forget_bias": 1.0, + "rnn.cell_type": "lstm", + "sum_att": False + } + + def ffn(self, x, output_name_prefix, scope_id, activation="relu"): + self.scopes[scope_id] = "ffn" + + dense_name_1 = output_name_prefix + "_ffn_conv1" + self.extract_dense(x, dense_name_1, scope_id+1, "conv1") + + activation_name = output_name_prefix + "_ffn_act" + activation_support = False + if (activation == "relu"): + activation_support = True + self.add_relu(dense_name_1, activation_name) + if (activation == "swish"): + activation_support = True + self.add_swish(dense_name_1, activation_name) + if (not activation_support): + print("[ERROR] unsupported FFN activation %s" % (activation)) + exit(1) + + dense_name_2 = output_name_prefix + "_ffn_conv2" + self.extract_dense(activation_name, dense_name_2, scope_id+1, "conv2") + return dense_name_2 + + def additive_attention(self, q, + k, + v, + mask, + attention_mask, + output_name_prefix, + name=None): + print("[ERROR] unsupported additive attention") + exit(1) + + def dot_product_attention(self, q, + k, + v, + mask, + attention_mask, + output_name_prefix, + edge_k=None, + edge_v=None, + name=None): + if (edge_k is not None): + sum_name = output_name_prefix + "_dot_ek" + k = self.add_sum([k, edge_k], sum_name) + # query * key + query_key_name = output_name_prefix + "_dot_qk" + self.add_matmul(q, k, query_key_name) + + if (mask is not None): + scores = output_name_prefix + "_dot_scores" + self.add_prod([query_key_name, mask], scores) + query_key_name = output_name_prefix + "_dot_scores_mask" + self.add_sum([scores, attention_mask], query_key_name) + + # softmax + scores_normalized = output_name_prefix + "_dot_score_norm" + self.add_softmax(query_key_name, scores_normalized, 3) + + if edge_v is not None: + sum_name = output_name_prefix + "_dot_ev" + v = self.add_sum([v, edge_v], sum_name) + context = output_name_prefix 
+ "_dot_cont" + self.add_matmul(scores_normalized, v, context) + + return scores_normalized, context + + def multihead_attention(self, query, + memory, + mask, + attention_mask, + key_depth, + value_depth, + output_depth, + num_heads, + sequence_length, + output_name_prefix, + scope_id, + name=None, + cache=None, + branch=False, + filter_depth=None, + activation="relu", + relpos=0, + sum_att=False, + **kwargs): + self.scopes[scope_id] = "multihead_attention" + key_depth_per_head = key_depth // num_heads + value_depth_per_head = value_depth // num_heads + if memory is None: + query_name = output_name_prefix + "_multihead_q" + key_name = output_name_prefix + "_multihead_k" + value_name = output_name_prefix + "_multihead_v" + self.extract_denses(query, [query_name, key_name, value_name], [key_depth, key_depth, value_depth], scope_id+1, "qkv") + key_reshape_name = key_name + "_r" + value_reshape_name = value_name + "_r" + self.add_reshape(key_name, key_reshape_name, [self.batch, -1, num_heads, key_depth_per_head]) + self.add_reshape(value_name, value_reshape_name, [self.batch, -1, num_heads, value_depth_per_head]) + key_transpose_name = key_name + "_t" + value_transpose_name = value_name + "_t" + self.add_transpose(key_reshape_name, key_transpose_name, [0, 2, 3, 1]) + self.add_transpose(value_reshape_name, value_transpose_name, [0, 2, 1, 3]) + else: + query_name = output_name_prefix + "_multihead_q" + self.extract_dense(query, query_name, scope_id+1, "q") + #key_name = output_name_prefix + "_multihead_k" + #value_name = output_name_prefix + "_multihead_v" + #self.extract_denses(memory, [key_name, value_name], [key_depth, value_depth], scope_id+1, "kv") + #key_name = memory["key"] + #value_name = memory["value"] + key_transpose_name = memory["key"] + value_transpose_name = memory["value"] + + # reshape + query_reshape_name = query_name + "_r" + self.add_reshape(query_name, query_reshape_name, [self.batch, -1, num_heads, key_depth_per_head]) + + # transpose + query_transpose_name = query_name + "_t" + self.add_transpose(query_reshape_name, query_transpose_name, [0, 2, 1, 3]) + + edge_k = None + edge_v = None + if cache is not None: + print("[ERROR] cache NOT_SUPPORTED") + exit(0) + + query_scale_name = output_name_prefix + "_multihead_qs" + self.add_power(query_transpose_name, query_scale_name, scale=1.0/math.sqrt(key_depth_per_head)) + + if relpos > 0: + print("[ERROR] relpos>0 NOT_SUPPORTED") + exit(0) + if sum_att: + scores, x = self.additive_attention( + query_scale_name, key_transpose_name, value_transpose_name, mask, attention_mask) + else: + scores, x = self.dot_product_attention( + query_scale_name, key_transpose_name, value_transpose_name, mask, attention_mask, output_name_prefix, + edge_k=edge_k, edge_v=edge_v) + if branch: + print("[ERROR] branch=True NOT_SUPPORTED") + exit(0) + else: + # transpose + x_t = output_name_prefix + "_multihead_out_t" + self.add_transpose(x, x_t, [0, 2, 1, 3]) + # reshape + x_r = output_name_prefix + "_multihead_out_r" + #self.add_reshape(x_t, x_r, [self.batch, sequence_length, num_heads*value_depth_per_head]) + self.add_reshape(x_t, x_r, [self.batch, -1, num_heads*value_depth_per_head]) + # dense + x = output_name_prefix + "_multihead_out_dense" + self.extract_dense(x_r, x, scope_id+1, "output_transform") + return scores, x + + def self_attention_sublayer(self, x, mask, attention_mask, num_units, num_heads, sequence_length, output_name_prefix, scope_id, memory=None, cache=None, branch=False, + filter_depth=None, activation="relu", relpos=0, sum_att=False): 
+ att_scores, x = self.multihead_attention( + query=x, + memory=memory, + mask=mask, + attention_mask=attention_mask, + key_depth=num_units, + value_depth=num_units, + output_depth=num_units, + num_heads=num_heads, + sequence_length=sequence_length, + output_name_prefix=output_name_prefix, + scope_id=scope_id, + cache=cache, + branch=branch, + filter_depth=filter_depth, + activation=activation, + relpos=relpos, + sum_att=sum_att + ) + return att_scores, x + + def layer_process(self, x, output_name_prefix, scope_id, y=None, mode=None): + if not mode or mode == "none": + return x + + index = 0 + for m in mode: + if m == 'a': + output_name = output_name_prefix + "_a" + str(index) + x = self.add_sum([x, y], output_name) + elif m == 'n': + output_name = output_name_prefix + "_n" + str(index) + x = self.extract_layer_norm(x, output_name, scope_id, ["layer_norm", "gamma", "beta"]) + elif m == 'd': + print("[INFO] dropout") + else: + print("[ERROR] unknown layer process %s" % (m)) + index += 1 + return x + + def position_encoding(self, length, depth, output_name_prefix=None, + min_timescale=1, + max_timescale=1e4): + positions = np.arange(length) + depths = np.arange(depth) + # corresponds to log(10000^(1/(d-1))) + log_timescale_increment = ( + math.log(max_timescale / min_timescale) / (depth - 1)) + # corresponds to 1 / 10000^(i/(d-1)), i=0...d-1 + inv_timescales = min_timescale * np.exp(depths * -1 * log_timescale_increment) + # pos / 10000^(i/(d-1)) + scaled_time = np.expand_dims(positions, 1) * np.expand_dims(inv_timescales, 0) + # instead of interleaving sin and cos, it is equivalent to put all sin values first and all cos values second, + # since both halves are driven by the same position + position_embedding_weight = np.concatenate((np.sin(scaled_time), np.cos(scaled_time)), axis=1) + + output_name = "_position_dict" + if (output_name_prefix is not None): + output_name = output_name_prefix + output_name + self.add_weight(output_name=output_name, weight=position_embedding_weight) + return output_name + + def encode(self, inputs, sequence_mask, attention_mask, scope_id, output_name_prefix): + num_units = self.encoder_params["num_units"] + + self.scopes[scope_id] = "target_space_emb" + adder = output_name_prefix + "_emb" + weight_name = output_name_prefix + "_target_space_emb" + self.add_weight(weight_name, scope_id=scope_id+1) + self.add_sum([inputs, weight_name], adder) + + if (self.encoder_params["position.enable"]): + position_input_name = "nmt_positions" + position_input_shape = [self.batch, self.max_seq_length] + self.add_input(position_input_name, position_input_shape) + #weight = np.array([[i for i in range(sequence_length)] * self.batch]) + #self.add_weight(position_input_name, weight=weight, data_type="INT32") + + position_embedding_dict_name = self.position_encoding(length=self.max_seq_length, + depth=self.encoder_params["num_units"] // 2, + output_name_prefix=output_name_prefix) + position_embedding_name = output_name_prefix + "position_embedding" + self.add_embedding(position_input_name, + position_embedding_dict_name, + position_embedding_name) + + if (self.encoder_params["position.combiner_fn"] != "tensorflow.add"): + print("[ERROR] position embedding unsupported") + exit(1) + output_name = "we+pe" + self.add_sum([adder, position_embedding_name], output_name) + adder = output_name + + x = adder + for i in range(self.encoder_params["num_layers"]): + layer_idx = i // self.encoder_params["share_level"] + self.scopes[scope_id] = "layer_" + str(layer_idx) + self.scopes[scope_id+1] = "self_attention" + 
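# parameter sharing: source layer i reads checkpoint scope layer_(i // share_level) +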
output_name_prefix_new = output_name_prefix + "_layer" + str(i) + + # preprocess + x_preprocess = self.layer_process(x, + output_name_prefix_new + "_pre1", + scope_id+2, + mode=self.encoder_params["layer.preprocess"]) + + # attention + _, y = self.self_attention_sublayer(x=x_preprocess, + mask=sequence_mask, + attention_mask=attention_mask, + num_units=self.encoder_params["num_units"], + num_heads=self.encoder_params["attention.num_heads"], + sequence_length=self.max_seq_length, + output_name_prefix=output_name_prefix_new, + scope_id=scope_id+2, + branch=self.encoder_params["attention.branch"], + filter_depth=self.encoder_params["ffn.num_units"], + activation=self.encoder_params["ffn.activation"], + relpos=self.encoder_params["attention.relpos"], + ) + + # post process + x = self.layer_process(x, + output_name_prefix_new + "_post1", + scope_id+2, + y=y, mode=self.encoder_params["layer.postprocess"]) + + # ffn + self.scopes[scope_id+1] = "ffn" + x_preprocess = self.layer_process(x, + output_name_prefix_new + "_pre2", + scope_id+2, + mode=self.encoder_params["layer.preprocess"]) + y = self.ffn(x_preprocess, output_name_prefix_new, scope_id+2, activation=self.encoder_params["ffn.activation"]) + x = self.layer_process(x, + output_name_prefix_new + "_post2", + scope_id+2, + y=y, mode=self.encoder_params["layer.postprocess"]) + + outputs = self.layer_process(x, + output_name_prefix + "_att_post", + scope_id, + mode=self.encoder_params["layer.preprocess"]) + self.encoder_outputs["encoder_output"] = outputs + return outputs + + def encoder_post_process(self, output_name_prefix): + self.scopes[0] = "seq2seq_model" + self.scopes[1] = "rnnformer_decoder" + self.scopes[3] = "encdec_attention" + self.scopes[4] = "multihead_attention" + key_depth = self.decoder_params["num_units"] + value_depth = self.decoder_params["num_units"] + num_heads = self.decoder_params["attention.num_heads"] + key_depth_per_head = key_depth // num_heads + value_depth_per_head = value_depth // num_heads + encoder_outputs = [] + for layer_idx in range(self.decoder_params["num_layers"]): + self.scopes[2] = "layer_%d" % (layer_idx) + key_name = output_name_prefix + "_layer" + str(layer_idx) + "_multihead_k" + value_name = output_name_prefix + "_layer" + str(layer_idx) + "_multihead_v" + memory = self.encoder_outputs["encoder_output"] + self.extract_denses(memory, [key_name, value_name], [key_depth, value_depth], 5, ["kv", "kernel", "bias"]) + key_reshape_name = key_name + "_r" + value_reshape_name = value_name + "_r" + self.add_reshape(key_name, key_reshape_name, [self.batch, -1, num_heads, key_depth_per_head]) + self.add_reshape(value_name, value_reshape_name, [self.batch, -1, num_heads, value_depth_per_head]) + key_transpose_name = key_name + "_t" + value_transpose_name = value_name + "_t" + self.add_transpose(key_reshape_name, key_transpose_name, [0, 2, 3, 1]) + self.add_transpose(value_reshape_name, value_transpose_name, [0, 2, 1, 3]) + encoder_outputs.append({"key": key_transpose_name, "value": value_transpose_name}) + self.encoder_outputs["encoder_output"] = encoder_outputs + + def attention_ffn_block(self, inputs, encoder_mask, attention_mask, scope_id, output_name_prefix, state=None, position=None): + x = inputs + attentions = [] + state_cache = [] + for layer_idx in range(self.decoder_params["num_layers"]): + self.scopes[scope_id] = "layer_%d" % (layer_idx) + output_name_prefix_new = output_name_prefix + "_layer_" + str(layer_idx) + + # RNN sublayer + self.scopes[scope_id+1] = "rnn_sublayer" + cur_state = state[layer_idx] 
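+            # Each decoder layer stacks three sublayers, each wrapped in layer_process
+            # (with the defaults, preprocess "n" is a layer norm and postprocess "da"
+            # is dropout + residual add):
+            #   1. an LSTM cell over the squeezed one-step input; cur_state carries
+            #      the concatenated [cell, hidden] state across decode steps
+            #   2. encoder-decoder multi-head attention over the K/V tensors
+            #      precomputed by encoder_post_process
+            #   3. a position-wise feed-forward network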
+            # Preprocess
+            x_process = output_name_prefix_new + "_pre1"
+            x = self.layer_process(x, x_process, scope_id+2, mode=self.decoder_params["layer.preprocess"])
+
+            x = self.add_squeeze(x, axis=1, output_name=x+"_squeeze")
+
+            y = output_name_prefix_new + "_cell"
+            self.extract_lstm(x, cur_state, y, scope_id+2, scope_name = "basic_lstm_cell")
+            state_cache.append(cur_state)
+            # Postprocess
+            x_process = output_name_prefix_new + "_post1"
+            x = self.layer_process(x, x_process, scope_id+2, y=y, mode=self.decoder_params["layer.postprocess"])
+            x = self.add_expand_dims(x, axis=1, output_name=x+"_expand")
+
+            # Encdec sublayer
+            self.scopes[scope_id+1] = "encdec_attention"
+            # Preprocess
+            x_preprocess = output_name_prefix_new + "_pre2"
+            x_preprocess = self.layer_process(x, x_preprocess, scope_id+2, mode=self.decoder_params["layer.preprocess"])
+            # Cross attention
+            att_scores, y = self.self_attention_sublayer(x=x_preprocess,
+                mask=encoder_mask,
+                attention_mask=attention_mask,
+                num_units=self.decoder_params["num_units"],
+                num_heads=self.decoder_params["attention.num_heads"],
+                sequence_length=1,
+                output_name_prefix=output_name_prefix_new,
+                scope_id=scope_id+2,
+                memory=self.encoder_outputs["encoder_output"][layer_idx],
+                branch=self.decoder_params["attention.branch"],
+                filter_depth=self.decoder_params["ffn.num_units"],
+                activation=self.decoder_params["ffn.activation"],
+                sum_att=self.decoder_params["sum_att"]
+            )
+            # Postprocess
+            x_process = output_name_prefix_new + "_post2"
+            x = self.layer_process(x, x_process, scope_id+2, y=y, mode=self.decoder_params["layer.postprocess"])
+
+            att_context = x
+
+            if not self.decoder_params["attention.weighted_avg"]:
+                print("[WARNING] unused attention scores")
+                attentions.append(att_scores)
+            else:
+                print("[ERROR] unsupported attention weighted average")
+
+            #if attention is None:
+            #    attention = att_scores
+            #else:
+            #    if not self.decoder_params["attention.weighted_avg"]:
+            #        output_name = attention + "add_scores"
+            #        #attention = self.add_sum([attention, att_scores], output_name)
+            #    else:
+            #        print("[ERROR] unsupported attention weighted average")
+            #        #attention = np.concatenate([attention, att_scores], axis = 1)
+
+            # FFN sublayer
+            self.scopes[scope_id+1] = "ffn"
+            # Preprocess
+            x_preprocess = output_name_prefix_new + "_pre3"
+            x_preprocess = self.layer_process(x, x_preprocess, scope_id+2, mode=self.decoder_params["layer.preprocess"])
+            # FFN
+            y = self.ffn(x_preprocess, output_name_prefix_new, scope_id+2, activation=self.decoder_params["ffn.activation"])
+            # Postprocess
+            x_process = output_name_prefix_new + "_post3"
+            x = self.layer_process(x, x_process, scope_id+2, y=y, mode=self.decoder_params["layer.postprocess"])
+        state = state_cache
+
+        # Preprocess
+        x_process = output_name_prefix_new + "_pre4"
+        outputs = self.layer_process(x, x_process, scope_id, mode=self.decoder_params["layer.preprocess"])
+
+        attention = None
+        if not self.decoder_params["attention.weighted_avg"]:
+            attention = self.add_sum(attentions, "decoder_attention_sum")
+            attention = self.add_reduce_mean(attention, axis=1, keep_dim=False, output_name="decoder_attention_mean")
+            attention = self.add_power(attention, "decoder_attention_avg", scale=1.0/self.decoder_params["num_layers"])
+        else:
+            print("[ERROR] unsupported attention weighted average")
+
+        return outputs, state, attention, att_context
+
+    def add_projection(self, input_name, weight_name, output_name_prefix):
+        matmul_name = output_name_prefix + "_matmul"
+        self.add_matmul(input_name, weight_name, matmul_name,
transpose_a=False, transpose_b=True) + + argmax_name = self.add_argmax(matmul_name, axis=-1, output_name=output_name_prefix+"_argmax") + + return argmax_name + + def extract_decoder(self, sequence_mask, attention_mask, max_decode_length, scope_id, output_name_prefix): + # sos=1 + sos = "sos" + weight = np.array([[1] * self.batch]) + self.add_weight(sos, weight=weight, data_type="INT32") + + negative_one = "negative_one" + weight = np.array([[-1] * self.batch]) + self.add_weight(negative_one, weight=weight, data_type="INT32") + + decoder_start_name = output_name_prefix + "_words" + decoder_start_shape = [self.batch, 1] + self.add_memory(decoder_start_name, decoder_start_shape, data_type="INT32") + self.add_copy(sos, 1, 1, 0, + decoder_start_name, 1, 1, 0, + 1, output_name="init_decoder") + + position_input_name = output_name_prefix + "_position" + position_input_shape = [self.batch, 1] + self.add_memory(position_input_name, position_input_shape, data_type="INT32") + self.add_copy(negative_one, 1, 1, 0, + position_input_name, 1, 1, 0, + 1, output_name="init_decoder_position") + + zero = "zero" + weight = np.array([[0]*self.batch]) + self.add_weight(zero, weight=weight, data_type="INT32") + + # eos=2 + eos = "eos" + weight = np.array([[2]*self.batch]) + self.add_weight(eos, weight=weight, data_type="INT32") + + decoder_output_shape = [self.batch, max_decode_length] + decoder_output = "decoder_output" + self.add_memory(decoder_output, decoder_output_shape, data_type="INT32") + + decoder_attention_shape = [self.batch, max_decode_length, self.max_seq_length] + decoder_attention = "decoder_attention" + self.add_memory(decoder_attention, decoder_attention_shape, data_type="FLOAT32") + + position_embedding_dict_name = self.position_encoding(length=self.decoder_params["position.max_length"], + depth=self.decoder_params["num_units"] // 2, + output_name_prefix=output_name_prefix) + state = [] + state_shape = [self.batch, self.encoder_params["num_units"]+self.decoder_params["num_units"]] + for layer_idx in range(self.decoder_params["num_layers"]): + state_name = output_name_prefix + "_layer" + str(layer_idx) + "_state" + self.add_memory(state_name, state_shape, data_type="FLOAT32") + state.append(state_name) + sample_ids = decoder_start_name + for step in range(max_decode_length): + # whether to add caffe layer + self.set_add_layer(step==0) + + position_input_name_new = position_input_name+"_add_one" + self.add_power(position_input_name, position_input_name_new, scale=1, shift=1) + self.add_copy(position_input_name_new, 1, 1, 0, + position_input_name, 1, 1, 0, + 1, output_name="update_position") + + + output_name_prefix_new = output_name_prefix + "_step" + str(step) + cur_inputs = output_name_prefix_new + "_words_embedding" + self.add_embedding(sample_ids, self.source_modality, cur_inputs)#, transpose=True) + cur_inputs_scale = output_name_prefix_new + "_words_embedding" + "_s" + self.add_power(cur_inputs, cur_inputs_scale, scale=(self.decoder_params['num_units'] ** 0.5)) + + position_embedding_name = output_name_prefix_new + "_position_embedding" + self.add_embedding(position_input_name, position_embedding_dict_name, position_embedding_name) + + output_name = output_name_prefix_new + "_embedding" + cur_inputs_pos = self.add_sum([cur_inputs_scale, position_embedding_name], output_name) + + cell_outputs, state, attention, att_context = self.attention_ffn_block(inputs=cur_inputs_pos, + encoder_mask=sequence_mask, + attention_mask=attention_mask, + scope_id=scope_id, + output_name_prefix = 
output_name_prefix_new, + state=state, + position=step) + self.add_copy(attention, + -1, -1, 0, + decoder_attention, + max_decode_length*self.max_seq_length, self.max_seq_length, 0, + -1, + output_name="copy_attention_to_global_buffer", + src_index_name=zero, + dst_index_name=position_input_name) + + current_ids = self.add_projection(cell_outputs, self.source_modality, output_name) + self.add_copy(current_ids, 1, 1, 0, + sample_ids, 1, 1, 0, + 1, output_name="copy_to_next_input") + self.add_copy(current_ids, + 1, 1, 0, + decoder_output, + max_decode_length, 1, 0, + 1, + output_name="copy_word_to_global_buffer", + src_index_name=zero, + dst_index_name=position_input_name) + status = output_name + "_check" + self.add_check(current_ids, eos, "equal", status) + self.add_repeat(max_decode_length-1, position_input_name_new, output_name="repeat", status_name=status) + if (self.get_tensor(status)[0]): + break; + + return self.get_tensor(decoder_output), self.get_tensor(decoder_attention) + + def generate(self, input=None): + encoder_word_input_name = "nmt_words" + encoder_word_input_shape = [self.batch, self.max_seq_length] + self.add_input(encoder_word_input_name, encoder_word_input_shape) + if (self.use_small_word_list): + decoder_candidate_input_name = "nmt_candidates" + decoder_candidate_input_shape = [self.batch, self.max_candidate_size] + self.add_input(decoder_candidate_input_name, decoder_candidate_input_shape) + self.set_input(input) + + self.scopes[1] = "source_modality" + self.scopes[2] = "embedding" + self.source_modality = "source_modality" + self.add_weight(output_name=self.source_modality, scope_id=3)#, transpose=[1,0]) + + encoder_embedding = "encoder_embedding" + self.add_embedding(encoder_word_input_name, self.source_modality, encoder_embedding)#, transpose=True) + encoder_embedding = self.add_power(encoder_embedding, encoder_embedding+"_s", scale=math.sqrt(self.encoder_params["num_units"])) + + if (self.use_small_word_list): + small_words_embedding = "small_words_embedding" + self.add_embedding(decoder_candidate_input_name, self.source_modality, small_words_embedding)#, transpose=True) + self.source_modality = small_words_embedding + mask_input_name = None + encoder_attention_mask = None + decoder_attention_mask = None + #mask_input_name = "nmt_mask" + #mask_input_shape = [self.batch, self.max_seq_length] + #self.add_input(mask_input_name, mask_input_shape) + + self.scopes[1] = "transformer_encoder" + #encoder_attention_mask = "encoder_attention_mask" + #self.add_attention(mask_input_name, self.encoder_params['attention.num_heads'], self.max_seq_length, self.max_seq_length, encoder_attention_mask) + encoders = self.encode(encoder_embedding, mask_input_name, encoder_attention_mask, 2, output_name_prefix="transformer") + self.encoder_post_process("transformer_decoder") + + self.scopes[1] = "rnnformer_decoder" + #decoder_attention_mask = "decoder_attention_mask" + #self.add_attention(mask_input_name, self.decoder_params['attention.num_heads'], 1, self.max_seq_length, decoder_attention_mask) + decoders = self.extract_decoder(mask_input_name, decoder_attention_mask, self.max_decode_length, 2, output_name_prefix="rnnformer") + + self.save_caffe_model() diff --git a/model_tools/tools/tensorflow2caffe/nmt/tensorflow2caffe_transformer_tsc.py b/model_tools/tools/tensorflow2caffe/nmt/tensorflow2caffe_transformer_tsc.py new file mode 100644 index 00000000..9eb9356e --- /dev/null +++ b/model_tools/tools/tensorflow2caffe/nmt/tensorflow2caffe_transformer_tsc.py @@ -0,0 +1,709 @@ 
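The unrolled loop in extract_decoder above emulates a greedy search with the engine's copy/check/repeat primitives; the TSC converter that follows splits the same search into separate encoder and decoder Caffe models, re-fed step by step from transform_transformer_tsc.py. A minimal NumPy sketch of the shared control flow, with a hypothetical step_fn standing in for the per-step attention/FFN stack:

import numpy as np

def greedy_decode(step_fn, embedding, max_decode_length, sos=1, eos=2):
    # step_fn(x, state) -> (hidden, state); embedding: [vocab, num_units]
    word, state, result = sos, None, []
    for position in range(max_decode_length):
        x = embedding[word] * np.sqrt(embedding.shape[1])  # scaled word embedding
        hidden, state = step_fn(x, state)
        logits = hidden @ embedding.T   # tied projection, as in add_projection
        word = int(np.argmax(logits))
        result.append(word)
        if word == eos:                 # add_check/add_repeat emulate this early exit
            break
    return result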
+#!/usr/local/bin/python +# -*- coding: utf-8 -*- + +import math +import numpy as np +import sys +sys.path.append("../") +from tensorflow2caffe import Tensorflow2Caffe + +class Tensorflow2CaffeTransformerTSC(Tensorflow2Caffe): + def __init__(self, + tensorflow_model_path, caffe_model_path_prefix, caffe_model_name, + max_seq_length, max_decode_length, + encoder_params, decoder_params, + check=False, calc=False): + Tensorflow2Caffe.__init__(self, tensorflow_model_path, + caffe_model_path_prefix, caffe_model_name, check, calc) + self.scopes[0] = "seq2seq_model" + self.encoder_params = encoder_params + self.decoder_params = decoder_params + self.max_seq_length = max_seq_length + self.max_decode_length = max_decode_length + self.encoder_outputs = {} + + @staticmethod + def default_encoder_params(): + return { + "num_layers": 6, + "num_units": 512, + "layer.preprocess": "n", + "layer.postprocess": "da", + "ffn.num_units": 2048, + "ffn.activation": "relu", # relu or swish + "attention.num_heads": 8, + "attention.branch": False, # weighted transformer in https://arxiv.org/pdf/1711.02132.pdf + "attention.relpos": 0, # relative position representation in https://arxiv.org/pdf/1803.02155.pdf + "dropout_rate": 0.1, + "position.enable": True, + "position.combiner_fn": "tensorflow.add", + "initializer": "uniform_unit_scaling", + "init_scale": 1.0, + "share_level": 1 + } + + @staticmethod + def default_decoder_params(): + return { + "num_layers": 6, + "num_units": 512, + "layer.preprocess": "n", + "layer.postprocess": "da", + "attention.self_average": False, + "attention.num_heads": 8, + "attention.branch": False, + "attention.relpos": 0, + "ffn.num_units": 2048, + "ffn.activation": "relu", + "dropout_rate": 0.1, + "position.enable": True, + "position.combiner_fn": "tensorflow.add", + "position.max_length": 1000, + "decode_length_factor": 2., + "flex_decode_length": True, + "initializer": "uniform_unit_scaling", + "init_scale": 1.0, + "attention.weighted_avg": False, + "forget_bias": 1.0, + "rnn.cell_type": "lstm", + "sum_att": False, + "share_level": 1 + } + + def ffn(self, x, output_name_prefix, scope_id, activation="relu", share_index=0, share_num=1): + self.scopes[scope_id] = "ffn_layer" + + dense_name_1 = output_name_prefix + "_ffn_conv1" + self.extract_dense(x, dense_name_1, scope_id+1, ["input_layer/linear", "matrix", "bias"], + share_index=share_index, share_num=share_num) + + activation_name = output_name_prefix + "_ffn_act" + activation_support = False + if (activation == "relu"): + activation_support = True + self.add_relu(dense_name_1, activation_name) + if (activation == "swish"): + activation_support = True + self.add_swish(dense_name_1, activation_name) + if (not activation_support): + print("[ERROR] unsupported FFN activation %s" % (activation)) + exit(0) + + dense_name_2 = output_name_prefix + "_ffn_conv2" + self.extract_dense(activation_name, dense_name_2, scope_id+1, ["output_layer/linear", "matrix", "bias"], + share_index=share_index, share_num=share_num) + return dense_name_2 + + def additive_attention(self, q, + k, + v, + mask, + attention_mask, + output_name_prefix, + name=None): + print("[ERROR] unsupported additive attention") + exit(0) + + def dot_product_attention(self, q, + k, + v, + mask, + attention_mask, + output_name_prefix, + edge_k=None, + edge_v=None, + name=None): + if (edge_k is not None): + sum_name = output_name_prefix + "_dot_ek" + k = self.add_sum([k, edge_k], sum_name) + # query * key + query_key_name = output_name_prefix + "_dot_qk" + self.add_matmul(q, k, 
query_key_name)
+
+        if (mask is not None):
+            scores = output_name_prefix + "_dot_scores"
+            self.add_prod([query_key_name, mask], scores)
+            query_key_name = output_name_prefix + "_dot_scores_mask"
+            self.add_sum([scores, attention_mask], query_key_name)
+
+        # softmax
+        scores_normalized = output_name_prefix + "_dot_score_norm"
+        self.add_softmax(query_key_name, scores_normalized, 3)
+
+        if edge_v is not None:
+            sum_name = output_name_prefix + "_dot_ev"
+            v = self.add_sum([v, edge_v], sum_name)
+        context = output_name_prefix + "_dot_cont"
+        self.add_matmul(scores_normalized, v, context)
+
+        return scores_normalized, context
+
+    def multihead_attention(self, query,
+                            memory,
+                            mask,
+                            attention_mask,
+                            key_depth,
+                            value_depth,
+                            output_depth,
+                            num_heads,
+                            sequence_length,
+                            output_name_prefix,
+                            scope_id,
+                            name=None,
+                            cache=None,
+                            branch=False,
+                            filter_depth=None,
+                            activation="relu",
+                            relpos=0,
+                            sum_att=False,
+                            share_index=0,
+                            share_num=1,
+                            **kwargs):
+        self.scopes[scope_id] = "multihead_attention"
+        if memory is None:
+            query_name = output_name_prefix + "_multihead_q"
+            key_name = output_name_prefix + "_multihead_k"
+            value_name = output_name_prefix + "_multihead_v"
+            self.extract_denses(query, [query_name, key_name, value_name],
+                [key_depth, key_depth, value_depth], scope_id+1, ["qkv_transform", "matrix", "bias"],
+                share_index=share_index, share_num=share_num)
+
+            # concatenate the cached key/value of the previous steps with the current key/value along axis=1
+            if cache is not None:
+                key_name = self.add_concat([cache["self_key"], key_name], key_name + "_cache", axis=1)
+                value_name = self.add_concat([cache["self_value"], value_name], value_name + "_cache", axis=1)
+                # update the cache
+                cache["self_key"] = key_name
+                cache["self_value"] = value_name
+
+        else:
+            query_name = output_name_prefix + "_multihead_q"
+            self.extract_dense(query, query_name, scope_id+1, ["q_transform", "matrix", "bias"],
+                share_index=share_index, share_num=share_num)
+            #key_name = output_name_prefix + "_multihead_k"
+            #value_name = output_name_prefix + "_multihead_v"
+            #self.extract_denses(memory, [key_name, value_name], [key_depth, value_depth], scope_id+1, ["kv_transform", "matrix", "bias"])
+            key_name = memory["key"]
+            value_name = memory["value"]
+
+        # reshape
+        query_reshape_name = query_name + "_r"
+        key_reshape_name = key_name + "_r"
+        value_reshape_name = value_name + "_r"
+        key_depth_per_head = key_depth // num_heads
+        value_depth_per_head = value_depth // num_heads
+        #self.add_reshape(query_name, query_reshape_name, [self.batch, sequence_length, num_heads, key_depth_per_head])
+        #self.add_reshape(key_name, key_reshape_name, [self.batch, self.max_seq_length, num_heads, key_depth_per_head])
+        #self.add_reshape(value_name, value_reshape_name, [self.batch, self.max_seq_length, num_heads, value_depth_per_head])
+        self.add_reshape(query_name, query_reshape_name, [self.batch, -1, num_heads, key_depth_per_head])
+        self.add_reshape(key_name, key_reshape_name, [self.batch, -1, num_heads, key_depth_per_head])
+        self.add_reshape(value_name, value_reshape_name, [self.batch, -1, num_heads, value_depth_per_head])
+
+        # transpose
+        query_transpose_name = query_name + "_t"
+        key_transpose_name = key_name + "_t"
+        value_transpose_name = value_name + "_t"
+        self.add_transpose(query_reshape_name, query_transpose_name, [0, 2, 1, 3])
+        self.add_transpose(key_reshape_name, key_transpose_name, [0, 2, 3, 1])
+        self.add_transpose(value_reshape_name, value_transpose_name, [0, 2, 1, 3])
+
+        edge_k = None
+        edge_v = None
+
+        query_scale_name = output_name_prefix +
"_multihead_qs" + self.add_power(query_transpose_name, query_scale_name, scale=1.0/math.sqrt(key_depth_per_head)) + + if relpos > 0: + print("[ERROR] relpos>0 NOT_SUPPORTED") + exit(0) + if sum_att: + scores, x = self.additive_attention( + query_scale_name, key_transpose_name, value_transpose_name, mask, attention_mask) + else: + scores, x = self.dot_product_attention( + query_scale_name, key_transpose_name, value_transpose_name, mask, attention_mask, output_name_prefix, + edge_k=edge_k, edge_v=edge_v) + if branch: + print("[ERROR] branch=True NOT_SUPPORTED") + exit(0) + else: + # transpose + x_t = output_name_prefix + "_multihead_out_t" + self.add_transpose(x, x_t, [0, 2, 1, 3]) + # reshape + x_r = output_name_prefix + "_multihead_out_r" + #self.add_reshape(x_t, x_r, [self.batch, sequence_length, num_heads*value_depth_per_head]) + self.add_reshape(x_t, x_r, [self.batch, -1, num_heads*value_depth_per_head]) + # dense + x = output_name_prefix + "_multihead_out_dense" + self.extract_dense(x_r, x, scope_id+1, ["output_transform", "matrix", "bias"], + share_index=share_index, share_num=share_num) + return scores, x + + def self_attention_sublayer(self, x, mask, attention_mask, num_units, num_heads, sequence_length, + output_name_prefix, scope_id, memory=None, cache=None, branch=False, + filter_depth=None, activation="relu", relpos=0, sum_att=False, + share_index=0, share_num=1): + att_scores, x = self.multihead_attention( + query=x, + memory=memory, + mask=mask, + attention_mask=attention_mask, + key_depth=num_units, + value_depth=num_units, + output_depth=num_units, + num_heads=num_heads, + sequence_length=sequence_length, + output_name_prefix=output_name_prefix, + scope_id=scope_id, + cache=cache, + branch=branch, + filter_depth=filter_depth, + activation=activation, + relpos=relpos, + sum_att=sum_att, + share_index=share_index, + share_num=share_num + ) + return att_scores, x + + def layer_process(self, x, output_name_prefix, scope_id, y=None, mode=None): + if not mode or mode == "none": + return x + + index = 0 + for m in mode: + if m == 'a': + output_name = output_name_prefix + "_a" + str(index) + x = self.add_sum([x, y], output_name) + elif m == 'n': + output_name = output_name_prefix + "_n" + str(index) + x = self.extract_layer_norm(x, output_name, scope_id, ["layer_norm", "scale", "offset"]) + elif m == 'd': + print("[INFO] dropout") + else: + print("[ERROR] unknown layer process %s" % (m)) + index += 1 + return x + + def position_encoding(self, length, depth, output_name_prefix=None, + min_timescale=1, + max_timescale=1e4): + positions = np.arange(length) + depths = np.arange(depth) + # correspond to log(10000^(1/(d-1))) + log_timescale_increment = ( + math.log(max_timescale / min_timescale) / (depth - 1)) + # correspond to 1 / 10000^(i/(d-1)), i=0....d-1 + inv_timescales = min_timescale * np.exp(depths * -1 * log_timescale_increment) + # pos / 10000^(i/(d-1)) + scaled_time = np.expand_dims(positions, 1) * np.expand_dims(inv_timescales, 0) + # intead of using SIN and COS interleaved + # it's the same to first use SIN then COS + # as they are applied to the same position + position_embedding_weight = np.concatenate((np.sin(scaled_time), np.cos(scaled_time)), axis=1) + + output_name = "_position_dict" + if (output_name_prefix is not None): + output_name = output_name_prefix + output_name + self.add_weight(output_name=output_name, weight=position_embedding_weight) + return output_name + + def encode(self, inputs, sequence_mask, attention_mask, scope_id, output_name_prefix): + num_units = 
self.encoder_params["num_units"] + + if (self.encoder_params["position.enable"]): + position_input_name = "encoder_positions" + position_input_shape = [self.batch, self.max_seq_length] + self.add_input(position_input_name, position_input_shape) + #weight = np.array([[i for i in range(sequence_length)] * self.batch]) + #self.add_weight(position_input_name, weight=weight, data_type="INT32") + + position_embedding_dict_name = self.position_encoding(length=self.max_seq_length, + depth=self.encoder_params["num_units"] // 2, + output_name_prefix=output_name_prefix) + position_embedding_name = output_name_prefix + "position_embedding" + self.add_embedding(position_input_name, + position_embedding_dict_name, + position_embedding_name) + + if (self.encoder_params["position.combiner_fn"] != "tensorflow.add"): + print("[ERROR] position embedding unsupported") + exit(0) + output_name = "we+pe" + self.add_sum([inputs, position_embedding_name], output_name) + adder = output_name + + x = adder + for i in range(self.encoder_params["num_layers"]//self.encoder_params["share_level"]): + x_input = x + for j in range(self.encoder_params["share_level"]): + self.set_add_layer(j==0) + + layer_idx = i + self.scopes[scope_id] = "layer_" + str(layer_idx) + self.scopes[scope_id+1] = "self_attention" + output_name_prefix_new = output_name_prefix + "_layer" + str(i) + + # preprocess + x_preprocess = self.layer_process(x, + output_name_prefix_new + "_pre1", + scope_id+2, + mode=self.encoder_params["layer.preprocess"]) + loop_start = x_preprocess + + # attention + _, y = self.self_attention_sublayer(x=x_preprocess, + mask=sequence_mask, + attention_mask=attention_mask, + num_units=self.encoder_params["num_units"], + num_heads=self.encoder_params["attention.num_heads"], + sequence_length=self.max_seq_length, + output_name_prefix=output_name_prefix_new, + scope_id=scope_id+2, + branch=self.encoder_params["attention.branch"], + filter_depth=self.encoder_params["ffn.num_units"], + activation=self.encoder_params["ffn.activation"], + relpos=self.encoder_params["attention.relpos"], + ) + + # post process + x = self.layer_process(x, + output_name_prefix_new + "_post1", + scope_id+2, + y=y, mode=self.encoder_params["layer.postprocess"]) + + # ffn + self.scopes[scope_id+1] = "feed_forward" + x_preprocess = self.layer_process(x, + output_name_prefix_new + "_pre2", + scope_id+2, + mode=self.encoder_params["layer.preprocess"]) + y = self.ffn(x_preprocess, output_name_prefix_new, scope_id+2, activation=self.encoder_params["ffn.activation"]) + x = self.layer_process(x, + output_name_prefix_new + "_post2", + scope_id+2, + y=y, mode=self.encoder_params["layer.postprocess"]) + if (self.encoder_params["share_level"] > 1): + self.add_copy(x, + -1, -1, 0, + x_input, + -1, -1, 0, + -1, + output_name="encoder_copy"+str(i)) + self.add_repeat(self.encoder_params["share_level"]-1, loop_start, output_name="encoder_repeat"+str(i)) + + self.set_add_layer(True) + outputs = self.layer_process(x, + output_name_prefix + "_att_post", + scope_id, + mode=self.encoder_params["layer.preprocess"]) + self.encoder_outputs["encoder_output"] = outputs + return outputs + + def attention_ffn_block(self, inputs, encoder_mask, attention_mask, scope_id, output_name_prefix, state=None, position=None): + x = inputs + attention = None + state_cache = [] + for layer_idx in range(self.decoder_params["num_layers"]): + self.scopes[scope_id] = "layer_%d" % (layer_idx // self.decoder_params["share_level"]) + output_name_prefix_new = output_name_prefix + "_layer_" + 
str(layer_idx) + + # RNN sublayer + self.scopes[scope_id+1] = "self_attention" + cur_state = state[layer_idx] + # Preprocess + x_process = output_name_prefix_new + "_pre1" + x_process = self.layer_process(x, x_process, scope_id+2, mode=self.decoder_params["layer.preprocess"]) + + #x = self.add_squeeze(x, axis=1, output_name=x+"_squeeze") + + y = output_name_prefix_new + "_self_attention" + #self.extract_lstm(x, cur_state, y, scope_id+2, scope_name = "basic_lstm_cell") + # attention + _, y = self.self_attention_sublayer(x_process, + None, None, + num_units=self.encoder_params["num_units"], + num_heads=self.encoder_params["attention.num_heads"], + sequence_length=self.max_seq_length, + output_name_prefix=y, + scope_id=scope_id + 2, + branch=self.encoder_params["attention.branch"], + filter_depth=self.encoder_params["ffn.num_units"], + activation=self.encoder_params["ffn.activation"], + relpos=self.encoder_params["attention.relpos"], + cache=cur_state, + share_index=layer_idx % self.decoder_params["share_level"], + share_num=self.decoder_params["share_level"] + ) + + state_cache.append(cur_state) + #Postprocess + x_process = output_name_prefix_new + "_post1" + x = self.layer_process(x, x_process, scope_id+2, y=y, mode=self.decoder_params["layer.postprocess"]) + #x = self.add_expand_dims(x, axis=1, output_name=x+"_expand") + + # Encdec sublayer + self.scopes[scope_id+1] = "encdec_attention" + # Preprocess + x_preprocess = output_name_prefix_new + "_pre2" + x_preprocess = self.layer_process(x, x_preprocess, scope_id+2, mode=self.decoder_params["layer.preprocess"]) + # Cross attention + att_scores, y = self.self_attention_sublayer(x=x_preprocess, + mask=encoder_mask, + attention_mask=attention_mask, + num_units=self.decoder_params["num_units"], + num_heads=self.decoder_params["attention.num_heads"], + sequence_length=1, + output_name_prefix=output_name_prefix_new, + scope_id=scope_id+2, + memory=self.encoder_outputs["encoder_output"][layer_idx], + branch=self.decoder_params["attention.branch"], + filter_depth=self.decoder_params["ffn.num_units"], + activation=self.decoder_params["ffn.activation"], + sum_att=self.decoder_params["sum_att"], + share_index=layer_idx % self.decoder_params["share_level"], + share_num=self.decoder_params["share_level"] + ) + # Post process + x_process = output_name_prefix_new + "_post2" + x = self.layer_process(x, x_process, scope_id+2, y=y, mode=self.decoder_params["layer.postprocess"]) + + att_context = x + + if not self.decoder_params["attention.weighted_avg"]: + print("[WARNING] unused attention scores") + #att_scores = self.add_axis_mean(att_scores, axis=1, output_name=att_scores+"_mean") + else: + print("[ERROR] unsupported attention weighted average") + + if attention is None: + attention = att_scores + else: + if not self.decoder_params["attention.weighted_avg"]: + output_name = attention + "add_socres" + #attention = self.add_sum([attention, att_scores], output_name) + else: + print("[ERROR] unsupported attention weighted average") + #attention = np.concatenate([attention, att_scores], axis = 1) + + # FFN sublayer + self.scopes[scope_id+1] = "feed_forward" + # Preprocess + x_preprocess = output_name_prefix_new + "_pre3" + x_preprocess = self.layer_process(x, x_preprocess, scope_id+2, mode=self.decoder_params["layer.preprocess"]) + # FFN + y = self.ffn(x_preprocess, output_name_prefix_new, scope_id+2, activation=self.decoder_params["ffn.activation"], + share_index=layer_idx % self.decoder_params["share_level"], + share_num=self.decoder_params["share_level"]) + 
# Postprocess
+            x_process = output_name_prefix_new + "_post3"
+            x = self.layer_process(x, x_process, scope_id+2, y=y, mode=self.decoder_params["layer.postprocess"])
+        state = state_cache
+
+        # Preprocess
+        x_process = output_name_prefix_new + "_pre4"
+        outputs = self.layer_process(x, x_process, scope_id, mode=self.decoder_params["layer.preprocess"])
+
+        if not self.decoder_params["attention.weighted_avg"]:
+            output_name = attention + "_div"
+            #attention = self.add_power(attention, output_name, scale=1.0/self.decoder_params["num_layers"])
+        else:
+            print("[ERROR] unsupported attention weighted average")
+
+        return outputs, state, att_context
+
+    def add_projection(self, input_name, weight_name, output_name_prefix):
+        matmul_name = output_name_prefix + "_matmul"
+        self.add_matmul(input_name, weight_name, matmul_name, transpose_a=False, transpose_b=True)
+
+        argmax_name = self.add_argmax(matmul_name, axis=-1, output_name=output_name_prefix+"_argmax")
+
+        return argmax_name
+
+    def encoder_post_process(self, output_name_prefix):
+        self.scopes[0] = "transformer"
+        self.scopes[1] = "decoder"
+        self.scopes[3] = "encdec_attention"
+        self.scopes[4] = "multihead_attention"
+        key_depth = self.decoder_params["num_units"]
+        value_depth = self.decoder_params["num_units"]
+        encoder_outputs = []
+        for layer_idx in range(self.decoder_params["num_layers"]):
+            self.scopes[2] = "layer_%d" % (layer_idx // self.decoder_params["share_level"])
+            key_name = output_name_prefix + "_layer" + str(layer_idx) + "_multihead_k"
+            value_name = output_name_prefix + "_layer" + str(layer_idx) + "_multihead_v"
+            memory = self.encoder_outputs["encoder_output"]
+            self.extract_denses(memory, [key_name, value_name], [key_depth, value_depth], 5, ["kv_transform", "matrix", "bias"],
+                share_index=layer_idx % self.decoder_params["share_level"],
+                share_num=self.decoder_params["share_level"])
+            encoder_outputs.append({"key": key_name, "value": value_name})
+        self.encoder_outputs["encoder_output"] = encoder_outputs
+
+    def save_encoder_states(self):
+        output_name_prefix = "decoder"
+        for layer_idx in range(self.decoder_params["num_layers"]):
+            key_name = output_name_prefix + "_layer" + str(layer_idx) + "_multihead_k"
+            value_name = output_name_prefix + "_layer" + str(layer_idx) + "_multihead_v"
+            key_data = self.get_tensor(self.encoder_outputs["encoder_output"][layer_idx]["key"])
+            value_data = self.get_tensor(self.encoder_outputs["encoder_output"][layer_idx]["value"])
+            np.savetxt(key_name+".txt", key_data.reshape([key_data.size]))
+            np.savetxt(value_name+".txt", value_data.reshape([value_data.size]))
+
+    def prepare_decoder_states(self, decoder_position_input):
+        output_name_prefix = "decoder"
+        self.encoder_outputs["encoder_output"] = []
+        states = []
+        position = self.get_tensor(decoder_position_input)[0][0]
+        for layer_idx in range(self.decoder_params["num_layers"]):
+            prefix = "decoder_layer" + str(layer_idx)
+            key0_name = prefix + "_kmem"
+            value0_name = prefix + "_vmem"
+            if (position == 0):
+                state0_shape = [self.batch, 0, 0]
+                key0_data = np.zeros(state0_shape)
+                value0_data = np.zeros(state0_shape)
+                state0_shape = [self.batch, self.max_decode_length, self.decoder_params["num_units"]]
+            else:
+                key0_data = np.load(key0_name + ".npy")
+                value0_data = np.load(value0_name + ".npy")
+                state0_shape = key0_data.shape
+            self.add_input(key0_name, state0_shape)
+            self.add_input(value0_name, state0_shape)
+            states.append({"self_key": key0_name, "self_value": value0_name})
+
+            key1_name = output_name_prefix + "_layer" + str(layer_idx) +
"_multihead_k" + value1_name = output_name_prefix + "_layer" + str(layer_idx) + "_multihead_v" + key1_data = np.loadtxt(key1_name + ".txt") + value1_data = np.loadtxt(value1_name + ".txt") + key1_data = key1_data.reshape([self.batch, -1, self.decoder_params["num_units"]]) + value1_data = value1_data.reshape([self.batch, -1, self.decoder_params["num_units"]]) + self.add_input(key1_name, key1_data.shape) + self.add_input(value1_name, value1_data.shape) + self.encoder_outputs["encoder_output"].append({"key": key1_name, "value": value1_name}) + data = {key0_name: key0_data, + value0_name: value0_data, + key1_name: key1_data, + value1_name: value1_data} + self.set_input(data) + return states + + def save_decoder_states(self, decoder_states, decoder_position_input): + position = self.get_tensor(decoder_position_input)[0][0] + states = [] + for layer_idx in range(self.decoder_params["num_layers"]): + prefix = "decoder_layer" + str(layer_idx) + key_name = prefix + "_kmem" + value_name = prefix + "_vmem" + np.save(key_name +".npy", self.get_tensor(decoder_states[layer_idx]["self_key"])) + np.save(value_name +".npy", self.get_tensor(decoder_states[layer_idx]["self_value"])) + states.append(decoder_states[layer_idx]["self_key"]) + states.append(decoder_states[layer_idx]["self_value"]) + return states + + def generate_decoder(self, input=None): + position = input["decoder_positions"][0][0] + decoder_word_input_name = "decoder_words" + decoder_word_input_shape = [self.batch, 1] + self.add_input(decoder_word_input_name, decoder_word_input_shape) + decoder_position_input_name = "decoder_positions" + decoder_position_input_shape = [self.batch, 1] + self.add_input(decoder_position_input_name, decoder_position_input_shape) + self.set_input(input) + + output_name_prefix = "transformer_decoder" + self.scopes[0] = "transformer" + self.scopes[1] = "target_embedding" + self.target_modality = "target_embedding" + self.add_weight(output_name=self.target_modality, scope_id=2) + + decoder_states = self.prepare_decoder_states(decoder_position_input_name) + self.save_input() + + zero = "zero" + zero_weight = np.zeros([self.batch, self.decoder_params["num_units"]]) + self.add_weight(output_name=zero, weight=zero_weight) + + sos = "sos" + sos_weight = np.zeros([self.batch, 1]) + self.add_weight(output_name=sos, weight=sos_weight) + + position_embedding_name = output_name_prefix + "_position_embedding" + position_embedding_dict_name = self.position_encoding(length=self.decoder_params["position.max_length"], + depth=self.decoder_params["num_units"] // 2, + output_name_prefix=output_name_prefix) + + # word embedding + word_embedding_shape = [self.batch, self.decoder_params["num_units"]] + word_embedding_result = output_name_prefix + "_words_embedding_buffer" + self.add_memory(word_embedding_result, word_embedding_shape, data_type="FLOAT32") + is_first_word = "is_first_word" + self.add_check(sos, decoder_position_input_name, "equal", is_first_word) + jump_name = "skip_first_word_embedding" + self.add_jump(position_embedding_name, jump_name, is_first_word) + is_first_word_data = self.get_tensor(is_first_word).tolist()[0] + if (isinstance(is_first_word_data, bool) and not is_first_word_data) \ + or (isinstance(is_first_word_data, int) and is_first_word_data == 0) \ + or (isinstance(is_first_word_data, float) and is_first_word_data == 0): + cur_inputs = output_name_prefix + "_words_embedding" + self.add_embedding(decoder_word_input_name, self.target_modality, cur_inputs) + cur_inputs_scale = output_name_prefix + 
"_words_embedding" + "_s" + self.add_power(cur_inputs, cur_inputs_scale, scale=(self.decoder_params['num_units'] ** 0.5)) + self.add_copy(cur_inputs_scale, self.decoder_params["num_units"], self.decoder_params["num_units"], 0, + word_embedding_result, self.decoder_params["num_units"], self.decoder_params["num_units"], 0, + self.decoder_params["num_units"], output_name="copy_word_embedding") + + # position embedding + self.add_embedding(decoder_position_input_name, position_embedding_dict_name, position_embedding_name) + + output_name = output_name_prefix + "_embedding" + cur_inputs_pos = self.add_sum([word_embedding_result, position_embedding_name], output_name) + self.scopes[1] = "decoder" + cell_outputs, state, att_context = self.attention_ffn_block(inputs=cur_inputs_pos, + encoder_mask=None, + attention_mask=None, + scope_id=2, + output_name_prefix=output_name_prefix, + state=decoder_states, + position=decoder_position_input_name) + + self.scopes[1] = "softmax" + self.softmax = "softmax" + self.add_weight(output_name=self.softmax, scope_id=2) + current_ids = self.add_projection(cell_outputs, self.softmax, output_name) + # current_ids = self.add_projection(cell_outputs, self.target_modality, output_name) + output = self.save_decoder_states(state, decoder_position_input_name) + output.append(current_ids) + self.add_output(output) + self.save_caffe_model() + return current_ids + + def generate_encoder(self, input=None): + encoder_word_input_name = "encoder_words" + encoder_word_input_shape = [self.batch, self.max_seq_length] + self.add_input(encoder_word_input_name, encoder_word_input_shape) + self.set_input(input) + + self.scopes[0] = "transformer" + self.scopes[1] = "source_embedding" + self.source_modality = "source_embedding" + self.add_weight(output_name=self.source_modality, scope_id=2) + + encoder_embedding = "encoder_embedding" + self.add_embedding(encoder_word_input_name, self.source_modality, encoder_embedding) + encoder_embedding = self.add_power(encoder_embedding, encoder_embedding+"_s", scale=math.sqrt(self.encoder_params["num_units"])) + + # source embedding bias + self.scopes[1] = "bias" + embedding_bias = "bias" + self.add_weight(output_name=embedding_bias, scope_id=2) + encoder_embedding = self.add_sum([encoder_embedding, embedding_bias], encoder_embedding + "_b") + + self.scopes[1] = "encoder" + encoder_attention_mask = "encoder_attention_mask" + encoders = self.encode(encoder_embedding, None, encoder_attention_mask, 2, output_name_prefix="transformer_encoder") + self.encoder_post_process("transformer_decoder") + + self.save_input() + self.save_caffe_model() + self.save_encoder_states() diff --git a/model_tools/tools/tensorflow2caffe/nmt/transform_transformer_lstm.py b/model_tools/tools/tensorflow2caffe/nmt/transform_transformer_lstm.py new file mode 100644 index 00000000..1538c6e7 --- /dev/null +++ b/model_tools/tools/tensorflow2caffe/nmt/transform_transformer_lstm.py @@ -0,0 +1,41 @@ +#!/usr/local/bin/python +# -*- coding: utf-8 -*- + +from tensorflow2caffe_transformer_lstm import Tensorflow2CaffeTransformerLstm +import numpy as np +import json + +if __name__ == '__main__': + tensorflow_model_path = "/data/yuxianzhi/models/nmt/tfm-rnn-288/model.ckpt-217809" + configure_file_path = "/data/yuxianzhi/models/nmt/tfm-rnn-288/train_options.json" + caffe_model_path_prefix = "transformer_lstm_nmt" + caffe_model_name = "transformer_lstm_nmt" + configure_file = open(configure_file_path) + params = json.load(configure_file) + configure_file.close() + encoder_params = 
Tensorflow2CaffeTransformerLstm.default_encoder_params() + decoder_params = Tensorflow2CaffeTransformerLstm.default_decoder_params() + for key,value in params["model_params"]["encoder.params"].items(): + if (key in encoder_params): + encoder_params[key] = value + for key,value in params["model_params"]["decoder.params"].items(): + if (key in decoder_params): + decoder_params[key] = value + + max_seq_length = 128 + max_decode_length = 128 + use_small_word_list = False + max_candidates_size = max_seq_length * 50 + 2000 + nmt_caffe = Tensorflow2CaffeTransformerLstm(tensorflow_model_path, + caffe_model_path_prefix, caffe_model_name, + max_seq_length, max_decode_length, + encoder_params, decoder_params, + use_small_word_list, max_candidates_size, + check=False, calc=True) + + data = {} + data["nmt_words"] = np.array([[2056,1176,6492,897,285,50,121,809,53,2988,263,1252,14,76,407,383,2]]) + nmt_length = len(data["nmt_words"][0]) + data["nmt_positions"] = np.array([[i for i in range(nmt_length)]]) + data["nmt_candidates"] = np.array([[i for i in range(nmt_length)]]) + nmt_caffe.generate(data) diff --git a/model_tools/tools/tensorflow2caffe/nmt/transform_transformer_tsc.py b/model_tools/tools/tensorflow2caffe/nmt/transform_transformer_tsc.py new file mode 100644 index 00000000..0d6bc834 --- /dev/null +++ b/model_tools/tools/tensorflow2caffe/nmt/transform_transformer_tsc.py @@ -0,0 +1,41 @@ +#!/usr/local/bin/python +# -*- coding: utf-8 -*- + +from tensorflow2caffe_transformer_tsc import Tensorflow2CaffeTransformerTSC +import numpy as np + +if __name__ == '__main__': + tensorflow_model_path = "/data/models/nmt/nmt_tsc/model/model.ckpt-353000" + encoder_params = Tensorflow2CaffeTransformerTSC.default_encoder_params() + decoder_params = Tensorflow2CaffeTransformerTSC.default_decoder_params() + + max_seq_length = 128 + max_decode_length = 128 + nmt_caffe = Tensorflow2CaffeTransformerTSC(tensorflow_model_path, + "nmt_tsc_encoder", "nmt_tsc_encoder", + max_seq_length, max_decode_length, + encoder_params, decoder_params, + check=False, calc=True) + encoder_data = {} + encoder_data["encoder_words"] = np.array([[13024, 1657, 35399, 0]]) # result:[6160, 3057, 113, 157, 0] + encoder_length = len(encoder_data["encoder_words"][0]) + encoder_data["encoder_positions"] = np.array([[i for i in range(encoder_length)]]) + nmt_caffe.generate_encoder(encoder_data) + + word = 0 + results = [] + for i in range(max_decode_length): + nmt_caffe = Tensorflow2CaffeTransformerTSC(tensorflow_model_path, + "nmt_tsc_decoder", "nmt_tsc_decoder", + max_seq_length, max_decode_length, + encoder_params, decoder_params, + check=False, calc=True) + decoder_data = {} + decoder_data["decoder_words"] = np.array([[word]]) + decoder_data["decoder_positions"] = np.array([[i]]) + result = nmt_caffe.generate_decoder(decoder_data) + word = nmt_caffe.get_tensor(result).tolist()[0][0][0] + results.append(word) + if (word == 0): + break; + print(results) diff --git a/model-tools/tools/tensorflow2caffe/operators.py b/model_tools/tools/tensorflow2caffe/operators.py similarity index 87% rename from model-tools/tools/tensorflow2caffe/operators.py rename to model_tools/tools/tensorflow2caffe/operators.py index c6afa874..5d1e5c9f 100644 --- a/model-tools/tools/tensorflow2caffe/operators.py +++ b/model_tools/tools/tensorflow2caffe/operators.py @@ -126,6 +126,23 @@ def group_norm(_x, groups, gamma, beta, name): Operators.print_data(y, name) return y + @staticmethod + def l2_norm(_x, name): + if (not Operators.calculate): + return None; + x = _x.copy() + if 
(len(x.shape) == 2): + for i in range(x.shape[0]): + tmp = 0 + for j in range(x.shape[1]): + tmp += x[i][j] * x[i][j] + tmp = tmp ** 0.5 + for j in range(x.shape[1]): + x[i][j] = x[i][j]/tmp + + Operators.print_data(x, name) + return x + @staticmethod def convolution(x, kernels, bias, num_output, kernel_size, strides, paddings, @@ -147,7 +164,7 @@ def convolution(x, kernels, bias, x = x.reshape(x_shape) y = np.zeros([x_shape[0], num_output, h, w]) - x_pad = np.pad(x, [[0, 0], [0, 0], [paddings[0], paddings[1]], [paddings[2], paddings[3]]]) + x_pad = np.pad(x, [[0, 0], [0, 0], [paddings[0], paddings[1]], [paddings[2], paddings[3]]], mode='constant') for f in range(y.shape[1]): for i in range(y.shape[2]): for j in range(y.shape[3]): @@ -268,11 +285,12 @@ def transpose(_x, dim, name): return x @staticmethod - def multiply(_x, scale, bias, name): + def power(_x, scale, shift, power, name): if (not Operators.calculate): return None; x = _x * scale - x = x + bias + x = x + shift + x = np.power(x, power) Operators.print_data(x, name) return x @@ -283,7 +301,40 @@ def matmultiply(_x, _y, name): x = np.multiply(_x, _y) Operators.print_data(x, name) return x - + + @staticmethod + def divide(_x, _y, name): + if (not Operators.calculate): + return None; + x = np.divide(_x, _y) + Operators.print_data(x, name) + return x + + @staticmethod + def tile(_x, loops, axis, name): + if (not Operators.calculate): + return None + input = _x + input = np.array(input, dtype=float) + input_shape_list = list(input.shape) + length = loops + for i in input_shape_list: + length *= i + temp_shape_list = input_shape_list + temp_shape_list[axis] = input_shape_list[axis] * loops + temp_list = [] + if axis == -1 and len(input_shape_list) == 3: + for i in range(input_shape_list[0]): + for j in range(input_shape_list[1]): + for n in range(loops): + temp_list.append(input[i][j][:]) + if axis == -1 and len(input_shape_list) == 2: + for i in range(input_shape_list[0]): + for n in range(loops): + temp_list.append(input[i][:]) + x = Operators.reshape(temp_list, tuple(temp_shape_list), name) + return x + @staticmethod def slice(x, axis, slice_points, names, print_flag=True): if (not Operators.calculate): @@ -413,7 +464,7 @@ def attention(_x, num_attention, from_seq_length, to_seq_length, name): return x @staticmethod - def embedding(_x, w, transpose, name): + def embedding(_x, _w, transpose, name): if (not Operators.calculate): return None; x = _x.copy() @@ -421,7 +472,13 @@ def embedding(_x, w, transpose, name): print("[ERROR] batch != 1") exit(0) x = np.reshape(x, [len(x[0])]) - + if (len(_w.shape) == 2): + w = _w + elif (len(_w.shape) == 3): + w = _w[0] + else: + print("[ERROR] can not support more dimension embedding") + exit(0) y = [] for i in x: index = int(i) @@ -431,9 +488,9 @@ def embedding(_x, w, transpose, name): y.append(w[index]) y = np.array(y) if (transpose): - x = np.reshape(y, [1, len(x), len(w)]) + x = np.reshape(y, [1, len(x), w.shape[-2]]) else: - x = np.reshape(y, [1, len(x), len(w[0])]) + x = np.reshape(y, [1, len(x), w.shape[-1]]) Operators.print_data(x, name) return x; @@ -560,7 +617,7 @@ def reduction(input, mask, operation, axis, name): return x @staticmethod - def lstm(inputs, state, w, b, projection, projection_bias, zoneout_cell, zoneout_output, + def lstm(inputs, state, w, b, projection, projection_bias, zoneoutCell, zoneoutOutput, name, state_name, printFlag=True): if (not Operators.calculate): return None, None @@ -584,8 +641,8 @@ def lstm(inputs, state, w, b, projection, projection_bias, 
zoneout_cell, zoneout new_h = np.matmul(new_h, projection) if (projection_bias is not None): new_h = new_h + projection_bias - o_c = new_c * (1 - zoneout_cell) + c * zoneout_cell - o_h = new_h * (1 - zoneout_output) + h * zoneout_output + o_c = new_c * (1 - zoneoutCell) + c * zoneoutCell + o_h = new_h * (1 - zoneoutOutput) + h * zoneoutOutput new_state = np.concatenate([o_c, o_h], axis = 1) if (printFlag): @@ -594,7 +651,30 @@ def lstm(inputs, state, w, b, projection, projection_bias, zoneout_cell, zoneout return new_h, new_state @staticmethod - def bi_lstm(inputs, w, b, projection, projection_bias, zoneout_cell, zoneout_output, name): + def fw_lstm(inputs, w, b, projection, projection_bias, zoneoutCell, zoneoutOutput, name): + if (not Operators.calculate): + return None + inputs = np.reshape(inputs, [-1, inputs.shape[-1]]) + state_length = w.shape[0] // 4 + if (projection is not None): + state_length += projection.shape[0] + else: + state_length += w.shape[0] // 4 + state = np.zeros([1, state_length]) + loops = inputs.shape[0] + results = [] + for i in range(loops): + result, state = Operators.lstm(inputs[i], state, w, b, projection, projection_bias, + zoneoutCell, zoneoutOutput, None, None, False) + results.append(result) + results = np.array(results) + shape = results.shape + results = results.reshape([1, shape[0], -1]) + Operators.print_data(results, name) + return results + + @staticmethod + def bi_lstm(inputs, w, b, projection, projection_bias, zoneoutCell, zoneoutOutput, name): if (not Operators.calculate): return None fw = w[0] @@ -630,12 +710,12 @@ def bi_lstm(inputs, w, b, projection, projection_bias, zoneout_cell, zoneout_out fw_results = [] for i in range(loops): fw_result, fw_state = Operators.lstm(inputs[i], fw_state, fw, fb, fp, fpb, - zoneout_cell, zoneout_output, None, None, False) + zoneoutCell, zoneoutOutput, None, None, False) fw_results.append(fw_result) bw_results = [] for i in range(loops): bw_result, bw_state = Operators.lstm(inputs[loops-1-i], bw_state, bw, bb, bp, bpb, - zoneout_cell, zoneout_output, None, None, False) + zoneoutCell, zoneoutOutput, None, None, False) bw_results.append(bw_result) results = [] for i in range(loops): @@ -677,6 +757,8 @@ def copy(_src, dst_shape = dst.shape src = src.reshape([src_shape[0], src.size//src_shape[0]]) dst = dst.reshape([dst_shape[0], dst.size//dst_shape[0]]) + print("%d %d %d %d %d %d" % (src_batch_stride, src_stride, src_offset, + dst_batch_stride, dst_stride, dst_offset)) if (length < 0): length = src.size; if (src_batch_stride < 0): @@ -687,6 +769,8 @@ def copy(_src, dst_batch_stride = dst.shape[1] if (dst_stride < 0): dst_stride = dst.shape[1] + print("%d %d %d %d %d %d" % (src_batch_stride, src_stride, src_offset, + dst_batch_stride, dst_stride, dst_offset)) for i in range(batch): src_j = 0 if src_index is not None: @@ -773,7 +857,7 @@ def pad(_x, pad_shapes, pad_values, name): if (pad_values is not None): assert(np.array(pad_values).sum() == 0) x = _x.copy() - x = np.pad(x, pad_shapes) + x = np.pad(x, pad_shapes, mode='constant') Operators.print_data(x, name) return x @@ -785,7 +869,7 @@ def relative_shift(_x, axis, shift_length, name): shapes = [i for i in x.shape] pad_shapes = [[0, 0] for i in range(len(shapes))] pad_shapes[axis][0] = shift_length - x = np.pad(x, pad_shapes) + x = np.pad(x, pad_shapes, mode='constant') tmp = shapes[axis-1] shapes[axis-1] = shapes[axis] + 1 shapes[axis] = tmp diff --git a/model_tools/tools/tensorflow2caffe/punctuation/tensorflow2caffe_punctuation.py 
b/model_tools/tools/tensorflow2caffe/punctuation/tensorflow2caffe_punctuation.py new file mode 100644 index 00000000..fb0161df --- /dev/null +++ b/model_tools/tools/tensorflow2caffe/punctuation/tensorflow2caffe_punctuation.py @@ -0,0 +1,51 @@ +#!/usr/local/bin/python +# -*- coding: utf-8 -*- + +import sys +sys.path.append("../") +from tensorflow2caffe import Tensorflow2Caffe +from Caffe import caffe_net +from operators import Operators + + + +class Tensorflow2CaffePunctuation(Tensorflow2Caffe): + def __init__(self, + tensorflow_model_path, caffe_model_path_prefix, caffe_model_name, + check=False, calc=False): + Tensorflow2Caffe.__init__(self, tensorflow_model_path, + caffe_model_path_prefix, caffe_model_name, check, calc) + + def generate(self, input = None): + #batch_seq_length = 16 + input_text_name = "input_text" + input_text_shape = [self.batch, 16] + self.add_input(input_text_name, input_text_shape) + self.set_input(input) + #embedding + x = self.extract_embedding(input_text_name, 0, "emb_table", "embedding_lookup") + #bilstm + x = self.extract_lstm(x, None, "BiLSTM", 0, steps = -2, scope_name = ["BiLSTM/fw/lstm_cell", "BiLSTM/bw/lstm_cell"]) + #FC + weight = self.get_weight("W") + bias = self.get_weight("b") + layer = caffe_net.LayerParameter("wb_fc_output", type='InnerProduct', + bottom=[x], top=["wb_fc_output"]) + num_output = len(weight[0]) + weight = weight.transpose((1,0)) + layer.inner_product_param(num_output, bias_term=bias is not None) + if len(bias) != num_output: + print("[ERROR] extract_dense failed") + exit(0) + layer.add_data(weight, bias) + self.caffe_model.add_layer(layer) + self.data_dict["wb_fc_output"] = Operators.fully_connect(self.data_dict[x], + weight.transpose((1, 0)), bias, + "wb_fc_output") + x = "wb_fc_output" + #softmax + x = self.add_softmax(x, "softmax_output", -1) + #argmax + x = self.add_argmax(x, -1, "output") + self.save_caffe_model() + diff --git a/model_tools/tools/tensorflow2caffe/punctuation/transform_punctuation.py b/model_tools/tools/tensorflow2caffe/punctuation/transform_punctuation.py new file mode 100644 index 00000000..3994b0da --- /dev/null +++ b/model_tools/tools/tensorflow2caffe/punctuation/transform_punctuation.py @@ -0,0 +1,18 @@ +#!/usr/local/bin/python +# -*- coding: utf-8 -*- + +from tensorflow2caffe_punctuation import Tensorflow2CaffePunctuation +import numpy as np + + +if __name__ == '__main__': + tensorflow_model_path = "bilstm.pb" + caffe_model_path_prefix = "punctuation" + caffe_model_name = "punctuation" + + bilstm = Tensorflow2CaffePunctuation(tensorflow_model_path, caffe_model_path_prefix, caffe_model_name, + False, True) + data = {} + data["input_text"] = np.array([[6, 13, 5, 14, 31, 234, 325, 161, 5, 182, 180, 266, 31, 234, 460, 62]]) + bilstm.print_weight_map() + bilstm.generate(data) \ No newline at end of file diff --git a/model-tools/tools/tensorflow2caffe/requirements.txt b/model_tools/tools/tensorflow2caffe/requirements.txt similarity index 100% rename from model-tools/tools/tensorflow2caffe/requirements.txt rename to model_tools/tools/tensorflow2caffe/requirements.txt diff --git a/model_tools/tools/tensorflow2caffe/rotation/tensorflow2caffe_rotation.py b/model_tools/tools/tensorflow2caffe/rotation/tensorflow2caffe_rotation.py new file mode 100644 index 00000000..4f2ab2dc --- /dev/null +++ b/model_tools/tools/tensorflow2caffe/rotation/tensorflow2caffe_rotation.py @@ -0,0 +1,45 @@ +#!/usr/local/bin/python +# -*- coding: utf-8 -*- + +import sys +sys.path.append("../") +from tensorflow2caffe import Tensorflow2Caffe + 
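The hand-built InnerProduct in the punctuation converter above transposes the checkpoint kernel twice. The TensorFlow checkpoint stores W as [input_dim, num_output], Caffe's InnerProduct blob expects [num_output, input_dim], and the NumPy reference computation then needs the original orientation back. A stand-alone sketch with hypothetical shapes:

import numpy as np

x = np.random.rand(1, 16, 128)         # BiLSTM output: [batch, steps, hidden]
W = np.random.rand(128, 4)             # TensorFlow layout: [input_dim, num_output]
b = np.zeros(4)

W_caffe = W.transpose((1, 0))          # [num_output, input_dim], what add_data stores
y = x @ W_caffe.transpose((1, 0)) + b  # reference math restores the TF layout
assert y.shape == (1, 16, 4)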
+class Tensorflow2CaffeRotation(Tensorflow2Caffe): + def __init__(self, + tensorflow_model_path, caffe_model_path_prefix, caffe_model_name, + check=False, calc=False): + Tensorflow2Caffe.__init__(self, tensorflow_model_path, + caffe_model_path_prefix, caffe_model_name, check, calc) + + def generate(self, input=None): + steps = 30 + input_name = "input" + input_shape = [self.batch, 3, steps] + self.add_input(input_name, input_shape) + self.set_input(input) + + kernel_size = [3, 1] + strides = [1, 1] + padding = self.calculate_convolution_padding(self.get_tensor_shape(input_name), kernel_size, strides, 'same') + x = self.extract_convolution(input_name, "conv1d_1", 0, + 64, kernel_size, strides, padding, + data_format="NCHW", weight_format="NHWC", + dilation=1, groups=1, layer_names=['conv1d', "kernel", "bias"]) + x = self.add_tanh(x, "tanh_1") + x = self.extract_convolution(x, "conv1d_2", 0, + 64, kernel_size, strides, padding, + data_format="NCHW", weight_format="NHWC", + dilation=1, groups=1, layer_names=['conv1d_1', "kernel", "bias"]) + x = self.add_tanh(x, "tanh_2") + x = self.transpose_nchc8_nhc(x) + x = self.extract_lstm(x, None, "lstm_backbone", steps=steps, + scope_id=0, scope_name="lstm_backbone") + x1 = self.extract_lstm(x, None, "lstm_pose_2", steps=steps, + scope_id=0, scope_name="lstm_pose_2") + x2 = self.extract_lstm(x, None, "lstm_scene_2", steps=steps, + scope_id=0, scope_name="lstm_scene_2") + x1 = self.extract_dense(x1, "pose_output", 0, "pose_output") + x2 = self.extract_dense(x2, "scene_output", 0, "scene_output") + + self.save_caffe_model() diff --git a/model_tools/tools/tensorflow2caffe/rotation/transform_rotation.py b/model_tools/tools/tensorflow2caffe/rotation/transform_rotation.py new file mode 100644 index 00000000..a79e3fb7 --- /dev/null +++ b/model_tools/tools/tensorflow2caffe/rotation/transform_rotation.py @@ -0,0 +1,18 @@ +#!/usr/local/bin/python +# -*- coding: utf-8 -*- + +from tensorflow2caffe_rotation import Tensorflow2CaffeRotation +import numpy as np + + +if __name__ == '__main__': + tensorflow_model_path = "/data/models/rotation/20200617LSTM_2convfront_209-99.970848083496.pb" + caffe_model_path_prefix = "rotation" + caffe_model_name = "rotation" + + rotation = Tensorflow2CaffeRotation(tensorflow_model_path, caffe_model_path_prefix, caffe_model_name, + False, True) + data = {} + data["input"] = np.ones([1,3,30,1]) + rotation.print_weight_map() + rotation.generate(data) diff --git a/model-tools/tools/tensorflow2caffe/tensorflow2caffe.py b/model_tools/tools/tensorflow2caffe/tensorflow2caffe.py similarity index 83% rename from model-tools/tools/tensorflow2caffe/tensorflow2caffe.py rename to model_tools/tools/tensorflow2caffe/tensorflow2caffe.py index b97f38a8..2132590e 100644 --- a/model-tools/tools/tensorflow2caffe/tensorflow2caffe.py +++ b/model_tools/tools/tensorflow2caffe/tensorflow2caffe.py @@ -15,7 +15,8 @@ class Tensorflow2Caffe: def __init__(self, tensorflow_model_path, caffe_model_path_prefix, caffe_model_name, check=False, - calculate=False): + calculate=False, + quantization=False): self.scopes = ["" for i in range(100)] self.caffe_model = caffe_net.CaffeModel('') self.caffe_model.net.name = caffe_model_name @@ -29,6 +30,8 @@ def __init__(self, tensorflow_model_path, caffe_model_path_prefix, caffe_model_n self.inputs = [] self.weight_size_map = {} Operators.set_calculate(calculate) + self.quantization = quantization + self.quantization_max = {} def load_tensorflow_model_from_ckpt(self, tensorflow_model_path): self.tensor_map = 
checkpoint_utils.list_variables(tensorflow_model_path) @@ -117,6 +120,7 @@ def print_weight_map(self): keys = sorted(list(self.weight_map.keys())) for key in keys: print("[INFO] weight %s shape %s" % (key, self.weight_map[key].shape)) + print(self.weight_map[key].reshape([-1])[0]) def print_weight_statistics(self): #for key in self.weight_map.keys(): @@ -142,6 +146,13 @@ def save_caffe_model(self): print("[INFO] save caffe model to %s.*" % (self.caffe_model_path_prefix)) self.caffe_model.save_prototxt(self.caffe_model_path_prefix + ".prototxt") self.caffe_model.save(self.caffe_model_path_prefix + ".caffemodel") + if (self.quantization): + print("[INFO] save int8 quantization max value to %s.*" % (self.caffe_model_path_prefix+"_quant.txt")) + quantizationMaxFile = open(self.caffe_model_path_prefix + "_quant.txt", "w") + for key, value in self.quantization_max.items(): + if (value is not None): + quantizationMaxFile.write("%s %f\n" % (key, value)) + quantizationMaxFile.close() def print_tensor(self, name): Operators.print_data(self.get_tensor(name), name) @@ -241,6 +252,18 @@ def add_input(self, input_name, input_shape): self.data_dict[input_name] = None return input_name + def add_output(self, output_names): + self.caffe_model.add_output(output_names) + for item in output_names: + print("[INFO] add model output %s" % (item)) + + def add_quantization(self, scope_id, tensorflow_weight_name, output_name): + self.scopes[scope_id] = tensorflow_weight_name + self.scopes[scope_id+1] = "a_quant_max" + weight_name = self.generate_name(self.scopes, scope_id+2) + weight = self.get_weight(weight_name) + self.quantization_max[output_name] = weight + def add_concat(self, input_names, output_name, axis): layer = caffe_net.LayerParameter(name=output_name, type='Concat', bottom=input_names, top=[output_name]) @@ -417,7 +440,7 @@ def extract_convolution(self, input_name, output_name, scope_id, self.data_dict[output_name] = None return output_name - def extract_dense(self, input_name, output_name, scope_id, scope_name="dense"): + def extract_dense(self, input_name, output_name, scope_id, scope_name="dense", share_index=0, share_num=1): if (isinstance(scope_name, str)): layer_names = [scope_name, "kernel", "bias"] elif (isinstance(scope_name, list)): @@ -426,25 +449,37 @@ def extract_dense(self, input_name, output_name, scope_id, scope_name="dense"): print("[ERROR] unsupported dense scope_name") exit(1) kernel, bias = self.get_weights(scope_id, layer_names) - layer = caffe_net.LayerParameter(name=output_name, type='InnerProduct', - bottom=[input_name], top=[output_name]) - num_output = len(kernel[0]) - kernel = kernel.transpose((1, 0)) - layer.inner_product_param(num_output, bias_term=bias is not None) - if bias is not None: - if len(bias) != num_output: - print("[ERROR] extract_dense failed") - exit(0) - layer.add_data(kernel, bias) + if (share_num == 1): + layer = caffe_net.LayerParameter(name=output_name, type='InnerProduct', + bottom=[input_name], top=[output_name]) + num_output = len(kernel[0]) + kernel = kernel.transpose((1, 0)) + layer.inner_product_param(num_output, bias_term=bias is not None) + if bias is not None: + if len(bias) != num_output: + print("[ERROR] extract_dense failed") + exit(0) + layer.add_data(kernel, bias) + else: + layer.add_data(kernel) + self.caffe_model.add_layer(layer) + self.data_dict[output_name] = Operators.fully_connect(self.data_dict[input_name], + kernel.transpose((1, 0)), bias, + output_name) else: - layer.add_data(kernel) - self.caffe_model.add_layer(layer) - 
self.data_dict[output_name] = Operators.fully_connect(self.data_dict[input_name], - kernel.transpose((1, 0)), bias, - output_name) + self.scopes[scope_id] = layer_names[0] + kernel_name = self.generate_name(self.scopes, scope_id+1) + "/kernel" + bias_name = self.generate_name(self.scopes, scope_id+1) + "/bias" + if (share_index == 0): + self.add_weight(kernel_name, weight=kernel) + if (bias is not None): + self.add_weight(bias_name, weight=bias) + tmp_name = self.add_matmul(input_name, kernel_name, output_name+"/matmul"+str(share_index)) + if (bias is not None): + self.add_sum([tmp_name, bias_name], output_name) return output_name - def extract_denses(self, input_name, output_names, output_nums, scope_id, scope_name="dense"): + def extract_denses(self, input_name, output_names, output_nums, scope_id, scope_name="dense", share_index=0, share_num=1): if (isinstance(scope_name, str)): layer_names = [scope_name, "kernel", "bias"] elif (isinstance(scope_name, list)): @@ -453,32 +488,51 @@ def extract_denses(self, input_name, output_names, output_nums, scope_id, scope_ print("[ERROR] unsupported dense scope_name") exit(1) kernels, biases = self.get_weights(scope_id, layer_names) - last_sum = 0 - for index in range(len(output_nums)): - kernel = kernels[:, last_sum:last_sum+output_nums[index]] - bias = None - if biases is not None: - bias = biases[last_sum:last_sum+output_nums[index]] - layer = caffe_net.LayerParameter(name=output_names[index], type='InnerProduct', - bottom=[input_name], top=[output_names[index]]) - num_output = len(kernel[0]) - kernel = kernel.transpose((1, 0)) - layer.inner_product_param(num_output, bias_term=bias is not None) - if bias is not None: - if len(bias) != num_output: - print("[ERROR] extract_denses failed") - exit(0) - layer.add_data(kernel, bias) - else: - layer.add_data(kernel) - self.caffe_model.add_layer(layer) - self.data_dict[output_names[index]] = Operators.fully_connect(self.data_dict[input_name], - kernel.transpose((1, 0)), bias, - output_names[index]) - last_sum = last_sum + output_nums[index] - if (last_sum != len(kernels[0])): - print("[ERROR] extract_denses failed") - exit(0) + if (share_num == 1): + last_sum = 0 + for index in range(len(output_nums)): + kernel = kernels[:, last_sum:last_sum+output_nums[index]] + bias = None + if biases is not None: + bias = biases[last_sum:last_sum+output_nums[index]] + layer = caffe_net.LayerParameter(name=output_names[index], type='InnerProduct', + bottom=[input_name], top=[output_names[index]]) + num_output = len(kernel[0]) + kernel = kernel.transpose((1, 0)) + layer.inner_product_param(num_output, bias_term=bias is not None) + if bias is not None: + if len(bias) != num_output: + print("[ERROR] extract_denses failed") + exit(0) + layer.add_data(kernel, bias) + else: + layer.add_data(kernel) + self.caffe_model.add_layer(layer) + self.data_dict[output_names[index]] = Operators.fully_connect(self.data_dict[input_name], + kernel.transpose((1, 0)), bias, + output_names[index]) + last_sum = last_sum + output_nums[index] + if (last_sum != len(kernels[0])): + print("[ERROR] extract_denses failed") + exit(0) + else: + self.scopes[scope_id] = layer_names[0] + kernel_name = self.generate_name(self.scopes, scope_id+1) + "/kernel" + bias_name = self.generate_name(self.scopes, scope_id+1) + "/bias" + if (share_index == 0): + self.add_weight(kernel_name, weight=kernels) + if (biases is not None): + self.add_weight(bias_name, weight=biases) + tmp_name = self.add_matmul(input_name, kernel_name, self.generate_name(self.scopes, 
scope_id+1)+"/matmul"+str(share_index)) + if (biases is not None): + tmp_name = self.add_sum([tmp_name, bias_name], self.generate_name(self.scopes, scope_id+1)+"/sum"+str(share_index)) + slice_point = [] + last_sum = 0 + for i in range(len(output_nums)-1): + last_sum = last_sum + output_nums[i] + slice_point.append(last_sum) + shape_len = len(self.get_tensor_shape(self.generate_name(self.scopes, scope_id+1)+"/matmul"+str(share_index))) + self.add_slice(tmp_name, output_names, shape_len-1, slice_point) return output_names def add_reshape(self, input_name, output_name, shape): @@ -498,9 +552,9 @@ def add_squeeze(self, input_name, output_name, axis): return output_name def add_transpose(self, input_name, output_name, dim): - layer = caffe_net.LayerParameter(name=output_name, type='Transpose', + layer = caffe_net.LayerParameter(name=output_name, type='Permute', bottom=[input_name], top=[output_name]) - layer.transpose_param(dim) + layer.permute_param(dim) self.caffe_model.add_layer(layer) self.data_dict[output_name] = Operators.transpose(self.data_dict[input_name], dim, output_name) return output_name @@ -514,12 +568,24 @@ def add_matmul(self, input_a_name, input_b_name, output_name, transpose_a=False, self.data_dict[input_b_name], transpose_b, output_name) return output_name - def add_multiply(self, input_name, output_name, scale=1, bias=0): - layer = caffe_net.LayerParameter(name=output_name, type='Multiply', + def add_power(self, input_name, output_name, scale=1, shift=0, power=1): + layer = caffe_net.LayerParameter(name=output_name, type='Power', bottom=[input_name], top=[output_name]) - layer.multiply_param(scale, bias) + layer.power_param(scale, shift, power) self.caffe_model.add_layer(layer) - self.data_dict[output_name] = Operators.multiply(self.data_dict[input_name], scale, bias, output_name) + self.data_dict[output_name] = Operators.power(self.data_dict[input_name], scale, shift, power, output_name) + return output_name + + def add_div(self, input_names, output_name): + layer = caffe_net.LayerParameter(name=output_name, type='Eltwise', + bottom=input_names, + top=[output_name]) + layer.eltwise_param(3) #Div + self.caffe_model.add_layer(layer) + data = [] + for name in input_names: + data.append(self.data_dict[name]) + self.data_dict[output_name] = Operators.divide(self.data_dict[input_names[0]], self.data_dict[input_names[1]], output_name) return output_name def add_prod(self, input_names, output_name): @@ -533,6 +599,14 @@ def add_prod(self, input_names, output_name): self.data_dict[output_name] = Operators.matmultiply(self.data_dict[output_name], self.data_dict[input_names[i]], output_name) return output_name + + def add_l2norm(self, input_name, output_name): + layer = caffe_net.LayerParameter(name=output_name, type='L2Norm', + bottom=[input_name], + top=[output_name]) + self.caffe_model.add_layer(layer) + self.data_dict[output_name] = Operators.l2_norm(self.data_dict[input_name], output_name) + return output_name def add_slice(self, input_name, output_names, axis, slice_point): layer = caffe_net.LayerParameter(name=output_names[0], type='Slice', @@ -660,11 +734,11 @@ def add_embedding(self, input_name, weight_name, output_name, transpose=False): bottom=[input_name,weight_name], top=[output_name]) weight = self.data_dict[weight_name] if transpose: - input_dim = len(weight[0]) - embedding_dim = len(weight) + input_dim = weight.shape[-1] + embedding_dim = weight.shape[-2] else: - input_dim = len(weight) - embedding_dim = len(weight[0]) + input_dim = weight.shape[-2] + embedding_dim = 
weight.shape[-1] layer.embed_param(input_dim, embedding_dim, transpose) self.caffe_model.add_layer(layer) self.data_dict[output_name] = Operators.embedding(self.data_dict[input_name], weight, transpose, output_name) @@ -705,7 +779,7 @@ def add_reduce_mean(self, input_name, axis, keep_dim, output_name): bottom=[input_name], top=[output_name]) layer.reduction_param(operation, axis, keep_dim) self.caffe_model.add_layer(layer) - self.data_dict[output_name] = Operators.reduction(self.data_dict[input_name], operation, axis, output_name) + self.data_dict[output_name] = Operators.reduction(self.data_dict[input_name], None, operation, axis, output_name) return output_name def add_reduce_sum(self, input_name, axis, keep_dim, output_name, mask_input_name=None): @@ -733,6 +807,14 @@ def add_expand_dims(self, input_name, axis, output_name): self.data_dict[output_name] = Operators.expand_dims(self.data_dict[input_name], axis, output_name) return output_name + def add_tile(self, input_name, loops, axis, output_name): + layer = caffe_net.LayerParameter(name=output_name, type='Tile', + bottom=[input_name], top=[output_name]) + layer.tile_param(axis, loops) + self.caffe_model.add_layer(layer) + self.data_dict[output_name] = Operators.tile(self.data_dict[input_name], loops, axis, output_name) + return output_name + def add_argmax(self, input_name, axis, output_name): layer = caffe_net.LayerParameter(name=output_name, type='ArgMax', bottom=[input_name], top=[output_name]) @@ -743,7 +825,7 @@ def add_argmax(self, input_name, axis, output_name): def extract_lstm(self, input_name, state_name, output_name, scope_id, steps=-1, scope_name="basic_lstm_cell", - use_proj=False, zoneout_cell=0, zoneout_output=0): + use_proj=False, zoneoutCell=0, zoneoutOutput=0): if (isinstance(scope_name, str)): scope_name = [scope_name] kernels = [] @@ -767,6 +849,9 @@ def extract_lstm(self, input_name, state_name, output_name, scope_id, num_output = projection.shape[1] else: num_output = num_output_4 // 4 + if (len(kernel) != self.get_tensor_shape(input_name)[-1] + num_output): + kernel_2, bias_2 = self.get_weights(scope_id, [scope_name[i], "recurrent_kernel", "bias"]) + kernel = np.concatenate([kernel, kernel_2], axis = 0) kernels.append(kernel.transpose([1, 0])) if (bias is None): bias = np.zeros([num_output_4 // 2]) @@ -784,31 +869,45 @@ def extract_lstm(self, input_name, state_name, output_name, scope_id, bottom.append(state_name) layer = caffe_net.LayerParameter(name=output_name, type='LSTM', bottom=bottom, top=[output_name]) - layer.lstm_param(num_output, steps, projection_size, zoneout_cell, zoneout_output) + layer.lstm_param(num_output, steps, projection_size, zoneoutCell, zoneoutOutput) if (use_proj): - layer.add_data(np.concatenate(kernels, axis=0), np.concatenate(biases, axis=0), - np.concatenate(projections, axis=0), np.concatenate(projection_biases, axis=0)) + if (projection_biases[0] is not None): + layer.add_data(np.concatenate(kernels, axis=0), np.concatenate(biases, axis=0), + np.concatenate(projections, axis=0), np.concatenate(projection_biases, axis=0)) + else: + layer.add_data(np.concatenate(kernels, axis=0), np.concatenate(biases, axis=0), + np.concatenate(projections, axis=0)) else: layer.add_data(np.concatenate(kernels, axis=0), np.concatenate(biases, axis=0)) self.caffe_model.add_layer(layer) - if (len(scope_name) == 1): + #if (len(scope_name) == 1): + if (steps >= 0): + self.data_dict[output_name] = Operators.fw_lstm(self.data_dict[input_name], + kernels[0], + biases[0], + projections[0], + 
projection_biases[0], + zoneoutCell, zoneoutOutput, + output_name) + elif (steps == -1): self.data_dict[output_name], self.data_dict[state_name] = Operators.lstm(self.data_dict[input_name], self.data_dict[state_name], kernels[0], biases[0], projections[0], projection_biases[0], - zoneout_cell, zoneout_output, + zoneoutCell, zoneoutOutput, output_name, state_name) - elif (len(scope_name) == 2): + #elif (len(scope_name) == 2): + elif (steps == -2): self.data_dict[output_name] = Operators.bi_lstm(self.data_dict[input_name], kernels, biases, projections, projection_biases, - zoneout_cell, zoneout_output, + zoneoutCell, zoneoutOutput, output_name) return output_name @@ -890,9 +989,9 @@ def add_memory(self, memory_name, memory_shapes, data_type): return memory_name def add_pad(self, input_name, output_name, padding_shapes, padding_values=None): - layer = caffe_net.LayerParameter(name=output_name, type='Padding', + layer = caffe_net.LayerParameter(name=output_name, type='Pad', bottom=[input_name], top=[output_name]) - layer.pad_param(padding_shapes, padding_values) + layer.padding_param(padding_shapes, padding_values) self.caffe_model.add_layer(layer) self.data_dict[output_name] = Operators.pad(self.data_dict[input_name], padding_shapes, padding_values, output_name) return output_name diff --git a/model_tools/tools/tensorflow2caffe/tts/tensorflow2caffe_tactron2.py b/model_tools/tools/tensorflow2caffe/tts/tensorflow2caffe_tactron2.py new file mode 100644 index 00000000..f1722eb8 --- /dev/null +++ b/model_tools/tools/tensorflow2caffe/tts/tensorflow2caffe_tactron2.py @@ -0,0 +1,520 @@ +#!/usr/local/bin/python +# -*- coding: utf-8 -*- + +import math +import numpy as np +import sys +sys.path.append("../") +from tensorflow2caffe import Tensorflow2Caffe + +class Tensorflow2CaffeTactron2(Tensorflow2Caffe): + def __init__(self, + tensorflow_model_path, caffe_model_path_prefix, caffe_model_name, + params, + check=False, calc=False): + Tensorflow2Caffe.__init__(self, tensorflow_model_path, + caffe_model_path_prefix, caffe_model_name, check, calc) + self.params = params + + class Parameters: + def __init__(self): + self.streaming = False + + self.max_sequence_length = 128 # max input sequence + self.num_mels = 80 #Number of mel-spectrogram channels and local conditioning dimensionality + self.outputs_per_step = 3 #number of frames to generate at each decoding step (increase to speed up computation and allows for higher batch size, decreases G&L audio quality) + self.tacotron_zoneout_rate = 0.1 #zoneout rate for all LSTM cells in the network + + #Mel and Linear spectrograms normalization/scaling and clipping + self.signal_normalization = True #Whether to normalize mel spectrograms to some predefined range (following below parameters) + self.allow_clipping_in_normalization = True #Only relevant if mel_normalization = True + self.symmetric_mels = True #Whether to scale the data to be symmetric around 0. (Also multiplies the output range by 2, faster and cleaner convergence) + self.max_abs_value = 4. #max absolute value of data. 
If symmetric, data will be [-max, max] else [0, max] (Must not be too big to avoid gradient explosion, not too small for fast convergence) + + #Limits + self.min_level_db = -120 + self.ref_level_db = 20 + + # Emotion dims + self.emotion_dim = 64 + + #Encoder parameters + self.enc_conv_num_layers = 3 #number of encoder convolutional layers + self.enc_conv_kernel_size = 5 #size of encoder convolution filters for each layer + self.enc_conv_channels = 512 #number of encoder convolution filters for each layer + self.encoder_lstm_units = 256 #number of lstm units for each direction (forward and backward) + + #Attention mechanism + self.mask_encoder = True #whether to mask encoder padding while computing attention. Set to True for better prosody but slower convergence. + self.mask_decoder = False #Whether to use loss mask for padded sequences (if False, loss function will not be weighted, else recommended pos_weight = 20) + self.smoothing = False #Whether to smooth the attention normalization function + self.attention_dim = 128 #dimension of attention space + self.attention_filters = 32 #number of attention convolution filters + self.attention_kernel = 31 #kernel size of attention convolution + self.cumulative_weights = True #Whether to cumulate (sum) all previous attention weights or simply feed previous weights (Recommended: True) + + #Attention synthesis constraints + #"Monotonic" constraint forces the model to only look at the forward attention_win_size steps. + #"Window" allows the model to look at attention_win_size neighbors, both forward and backward steps. + self.synthesis_constraint = False #Whether to use attention window constraints in synthesis only (Useful for long utterances synthesis) + self.synthesis_constraint_type = 'window' #can be in ('window', 'monotonic'). + self.attention_win_size = 7 #Side of the window. Current step does not count. If mode is window and attention_win_size is odd, the one extra step is given to the backward part of the window. 
+ + #Decoder + self.prenet_layers = [256, 256] #number of layers and number of units of prenet + self.decoder_query_layers = 1 #number of decoder query lstm lstm layers + self.decoder_layers = 2 #number of decoder lstm layers + self.decoder_lstm_units = 512 #number of decoder lstm units on each layer + self.max_iters = 2000 #Max decoder steps during inference (Just for safety from infinite loop cases) + + #Residual postnet + self.postnet_num_layers = 5 #number of postnet convolutional layers + self.postnet_kernel_size = 5 #size of postnet convolution filters for each layer + self.postnet_channels = 512 #number of postnet convolution filters for each layer + + def EncoderConvolutions(self, inputs, hparams, activation="relu", scope_id=0, scope="enc_conv_layers", output_name_prefix=""): + self.scopes[scope_id] = scope + kernel_size = [hparams.enc_conv_kernel_size, 1] + strides = [1, 1] + channels = hparams.enc_conv_channels + activation = activation + enc_conv_num_layers = hparams.enc_conv_num_layers + + inputs = self.transpose_nhc_nchw(inputs) + x = inputs + for i in range(enc_conv_num_layers): + self.scopes[scope_id+1] = 'conv_layer_{}_'.format(i + 1) + scope + padding = self.calculate_convolution_padding(self.get_tensor_shape(x), kernel_size, strides, 'same') + x = self.extract_convolution(x, output_name_prefix+"_conv_"+str(i+1), scope_id+2, + channels, kernel_size, strides, padding, + data_format="NCHW", weight_format="NHWC", + dilation=1, groups=1, layer_names=['conv1d', "kernel", "bias"]) + x = self.extract_batch_norm(x, output_name_prefix+"_bn_"+str(i+1), scope_id+2, + layer_names=["batch_normalization", "moving_mean", "moving_variance"]) + if activation == "relu": + x = self.add_relu(x, output_name_prefix+"_"+scope+"_"+activation+"_"+str(i+1)) + else: + print("[ERROR] unsupported activation layer %s in EncoderConvolutions" % (activation)) + exit(1) + x = self.transpose_nchc8_nhc(x) + return x + + def EncoderRNN(self, inputs, size=256, zoneout=0.1, scope_id=0, scope="encoder_LSTM", output_name_prefix=""): + self.scopes[scope_id] = scope + self.scopes[scope_id+1] = "bidirectional_rnn" + lstm_output_name = output_name_prefix + "_Bi-LSTM" + outputs = self.extract_lstm(inputs, None, lstm_output_name, + scope_id+2, steps=-2, scope_name=["fw/encoder_fw_LSTM", "bw/encoder_bw_LSTM"], + zoneoutCell=zoneout, zoneoutOutput=zoneout) + return outputs + + def Prenet(self, inputs, layers_sizes=[256, 256], activation="relu", scope_id=0, scope='prenet', output_name_prefix=""): + self.scopes[scope_id] = scope + x = inputs + for i, size in enumerate(layers_sizes): + x = self.extract_dense(x, output_name_prefix+"_prenet_dense_"+str(i+1), scope_id+1, 'dense_{}'.format(i + 1)) + if activation == "relu": + x = self.add_relu(x, output_name_prefix+"_prenet_relu_"+str(i+1)) + else: + print("[ERROR] unsupported activation layer %s in Prenet" % (activation)) + exit(1) + return x + + def DecoderRNN(self, inputs, states, layers=2, size=1024, zoneout=0.1, scope_id=0, scope="decoder_rnn", output_name_prefix=""): + self.scopes[scope_id] = scope + self.scopes[scope_id+1] = "multi_rnn_cell" + x = inputs + for i in range(layers): + lstm_output_name = output_name_prefix + "_lstm" + str(i) + x = self.extract_lstm(x, states[i], lstm_output_name, + scope_id=scope_id+2, scope_name='cell_{}/decoder_LSTM_{}'.format(i, i+1), + zoneoutCell=zoneout, + zoneoutOutput=zoneout) + return x + + def FrameProjection(self, inputs, shape=80, activation=None, scope_id=0, scope="Linear_projection", output_name_prefix=""): + 
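+ # Dense projection of the decoder output to one frame group (callers expect num_mels * outputs_per_step or num_mels values); optional ReLU activation, linear output otherwise.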
self.scopes[scope_id] = scope + x = self.extract_dense(inputs, output_name_prefix+"_"+scope, scope_id+1, 'projection_{}'.format(scope)) + if activation == "relu": + x = self.add_relu(x, output_name_prefix+"_"+scope+"_"+activation) + elif activation == None: + x = x + else: + print("[ERROR] unsupported activation layer %s in FrameProjection" % (activation)) + exit(1) + return x + + def StopProjection(self, inputs, shape=1, activation="sigmoid", scope_id=0, scope="stop_token_projection", output_name_prefix=""): + self.scopes[scope_id] = scope + x = self.extract_dense(inputs, output_name_prefix+"_"+scope, scope_id+1, 'projection_{}'.format(scope)) + if activation == "sigmoid": + x = self.add_sigmoid(x, output_name_prefix+"_"+scope+"_"+activation) + else: + print("[ERROR] unsupported activation layer %s in StopProjection" % (activation)) + exit(1) + return x + + def Postnet(self, inputs, hparams, activation="tanh", scope_id=0, scope="postnet_convolutions", output_name_prefix=""): + self.scopes[scope_id] = scope + kernel_size = [hparams.postnet_kernel_size, 1] + channels = hparams.postnet_channels + strides = [1, 1] + activation = activation + postnet_num_layers = hparams.postnet_num_layers + + inputs = self.transpose_nhc_nchw(inputs) + x = inputs + for i in range(postnet_num_layers - 1): + self.scopes[scope_id+1] = 'conv_layer_{}_'.format(i + 1) + scope + padding = self.calculate_convolution_padding(self.get_tensor_shape(x), kernel_size, strides, 'same') + x = self.extract_convolution(x, output_name_prefix+"_conv_"+str(i+1), scope_id+2, + channels, kernel_size, strides, padding, + data_format="NCHW", weight_format="NHWC", + dilation=1, groups=1, layer_names=['conv1d', "kernel", "bias"]) + x = self.extract_batch_norm(x, output_name_prefix+"_bn_"+str(i+1), scope_id+2, + layer_names=["batch_normalization", "moving_mean", "moving_variance"]) + if activation == "tanh": + x = self.add_tanh(x, output_name_prefix+"_"+scope+"_"+activation+"_"+str(i+1)) + else: + print("[ERROR] unsupported activation layer %s in EncoderConvolutions" % (activation)) + exit(1) + + layer_id = 5 + self.scopes[scope_id+1] = 'conv_layer_{}_'.format(layer_id) + scope + padding = self.calculate_convolution_padding(self.get_tensor_shape(x), kernel_size, strides, 'same') + x = self.extract_convolution(x, output_name_prefix+"_conv_"+str(layer_id), scope_id+2, + channels, kernel_size, strides, padding, + data_format="NCHW", weight_format="NHWC", + dilation=1, groups=1, layer_names=['conv1d', "kernel", "bias"]) + x = self.extract_batch_norm(x, output_name_prefix+"_bn_"+str(layer_id), scope_id+2, + layer_names=["batch_normalization", "moving_mean", "moving_variance"]) + + x = self.transpose_nchc8_nhc(x) + return x + + def _compute_attention(self, cell_output, attention_state, + attention_layer, prev_max_attentions, encoder_outputs, hp, scope_id, output_name_prefix): + alignments, next_attention_state = self.LocationSensitiveAttention( + cell_output, state=attention_state, prev_max_attentions=prev_max_attentions, + num_units=hp.attention_dim, memory=encoder_outputs, hparams=hp, scope_id=scope_id, + mask_encoder=hp.mask_encoder, smoothing=hp.smoothing, + cumulate_weights=hp.cumulative_weights, output_name_prefix=output_name_prefix) + expanded_alignments = self.add_expand_dims(alignments, 1, output_name_prefix+"_alignment_expand") + context = self.add_matmul(expanded_alignments, encoder_outputs, output_name_prefix+"_context") + context = self.add_squeeze(context, output_name_prefix+"_context_squeeze", 1) + if attention_layer is not 
None: + print("[ERROR] unsupported attention layer") + exit(1) + else: + attention = context + return attention, alignments, next_attention_state + + def LocationSensitiveAttention(self, + query, state, prev_max_attentions, + num_units, + memory, + hparams, + scope_id=0, + mask_encoder=True, + memory_sequence_length=None, + smoothing=False, + cumulate_weights=True, + name='LocationSensitiveAttention', + output_name_prefix=""): + _cumulate = cumulate_weights + synthesis_constraint = hparams.synthesis_constraint + attention_win_size = hparams.attention_win_size + constraint_type = hparams.synthesis_constraint_type + previous_alignments = state + + keys = self.extract_dense(memory, output_name_prefix+"_keys", scope_id-1, "memory_layer") + + self.scopes[scope_id-1] = "decoder" + self.scopes[scope_id] = "Location_Sensitive_Attention" + + processed_query = self.extract_dense(query, output_name_prefix+"_query", scope_id+1, "query_layer") + processed_query = self.add_expand_dims(processed_query, 1, output_name_prefix+"_query_expand") + + expanded_alignments = self.add_expand_dims(previous_alignments, 2, output_name_prefix+"_align_expand") + expanded_alignments = self.transpose_nhc_nchw(expanded_alignments) + kernel_size = [hparams.attention_kernel, 1] + strides = [1, 1] + padding = self.calculate_convolution_padding(self.get_tensor_shape(expanded_alignments), kernel_size, strides, 'same') + f = self.extract_convolution(expanded_alignments, output_name_prefix+"_conv", scope_id+1, + hparams.attention_filters, kernel_size, strides, padding, + data_format="NCHW", weight_format="NHWC", + dilation=1, groups=1, layer_names=['location_features_convolution', "kernel", "bias"]) + f = self.transpose_nchc8_nhc(f) + processed_location_features = self.extract_dense(f, output_name_prefix+"_location", scope_id+1, ["", "location_features_layer/kernel", "attention_bias"]) + + #energy = self._location_sensitive_score(processed_query, processed_location_features, self.keys) + sum_result = self.add_sum([processed_query, processed_location_features, keys], output_name_prefix+"_sum1") + tanh_result = self.add_tanh(sum_result, output_name_prefix+"_tanh") + fc_result = self.extract_scale(tanh_result, output_name_prefix+"_scale", scope_id+1, axis=-1, layer_names=["", "attention_variable_projection", "bias"]) + energy = self.add_reduce_sum(fc_result, 2, False, output_name="decoder_reduce_sum") + + if synthesis_constraint: + print("[ERROR] not support synthesis_constraint") + exit(1) + if (smoothing): + print("[ERROR] unsupported smoothing softmax") + exit(1) + else: + alignments = self.add_softmax(energy, output_name_prefix+"_softmax", -1) + + if _cumulate: + next_state = self.add_sum([alignments, previous_alignments], output_name_prefix+"_sum2") + else: + next_state = alignments + + return alignments, next_state + + def prepare_decoder_states(self, layers, num_units, output_name_prefix): + states = [] + for i in range(layers): + state_shape = [self.batch, num_units] + state_name = output_name_prefix + "_layer" + str(i) + "_state" + if (self.params.streaming): + state_name = self.add_input(state_name, state_shape) + else: + state_name = self.add_memory(state_name, state_shape, data_type="FLOAT32") + states.append(state_name) + return states + + def generate_encoder(self, inputs=None): + hp = self.params + word_input_name = "tts_words" + word_input_shape = [self.batch, hp.max_sequence_length] + self.add_input(word_input_name, word_input_shape) + if (hp.emotion_dim > 0): + emotion_input_name = "tts_emotions" + 
emotion_input_shape = [self.batch, hp.max_sequence_length] + self.add_input(emotion_input_name, emotion_input_shape) + self.set_input(inputs) + self.save_input() + + self.scopes[0] = "Tacotron_model" + self.scopes[1] = "inference" + embedding_inputs = "tts_word_embedding" + self.extract_embedding(word_input_name, 2, "inputs_embedding", embedding_inputs) + if (hp.emotion_dim > 0): + emotion_embedding_inputs = "tts_emotion_embedding" + self.extract_embedding(emotion_input_name, 2, "emotion_embedding_table", emotion_embedding_inputs) + if (hp.emotion_dim != self.get_tensor_shape(emotion_embedding_inputs)[2]): + print("[ERROR] speaker embedding dim emotion_dim(%d) is set to %d" % + (self.get_tensor_shape(emotion_embedding_inputs)[2], hp.emotion_dim)) + exit(1) + + convolution_result = self.EncoderConvolutions(embedding_inputs, hparams=hp, scope='encoder_convolutions', scope_id=2, output_name_prefix="encoder") + rnn_result = self.EncoderRNN(convolution_result, size=hp.encoder_lstm_units, + zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM', scope_id=2, output_name_prefix="encoder") + if (hp.emotion_dim > 0): + rnn_result = self.add_concat([rnn_result, emotion_embedding_inputs], "encoder_concat", axis=2) + if (self.params.streaming): + self.save_caffe_model() + return rnn_result + + def generate_decoder(self, rnn_result, inputs=None): + hp = self.params + cumulated_alignments = "tts_alignments" + cumulated_alignments_shape = [self.batch, hp.max_sequence_length] + self.add_input(cumulated_alignments, cumulated_alignments_shape) + decoder_input_name = "decoder_input" + decoder_input_shape = [self.batch, hp.num_mels] + decoder_attention_name = "decoder_attention" + rnn_result_dim = hp.encoder_lstm_units * 2 + hp.emotion_dim + # TODO rn_result shape + decoder_attention_shape = [self.batch, rnn_result_dim] + if (self.params.streaming): + self.add_input(decoder_input_name, decoder_input_shape) + self.add_input(decoder_attention_name, decoder_attention_shape) + rnn_result_shape = [self.batch, hp.max_sequence_length, rnn_result_dim] + self.add_input(rnn_result, rnn_result_shape) + alignments_history = "tts_alignments_history" + alignments_history_shape = [self.batch, hp.max_iters, hp.max_sequence_length] + self.add_memory(alignments_history, alignments_history_shape, data_type="FLOAT32") + else: + self.add_memory(decoder_input_name, decoder_input_shape, data_type="FLOAT32") + self.add_memory(decoder_attention_name, decoder_attention_shape, data_type="FLOAT32") + if (hp.decoder_query_layers > 0): + decoder_query_lstm_states = self.prepare_decoder_states(hp.decoder_query_layers, hp.decoder_lstm_units*2, "decoder_query") + decoder_lstm_states = self.prepare_decoder_states(hp.decoder_layers, hp.decoder_lstm_units*2, "decoder_lstm") + self.set_input(inputs) + self.save_input() + negative_one = "negative_one" + weight = np.array([[-1] * self.batch]) + self.add_weight(negative_one, weight=weight, data_type="INT32") + zero = "zero" + weight = np.array([[0]*self.batch]) + self.add_weight(zero, weight=weight, data_type="INT32") + position_input_name = "decoder_position" + position_input_shape = [self.batch, 1] + self.add_memory(position_input_name, position_input_shape, data_type="INT32") + self.add_copy(negative_one, 1, 1, 0, + position_input_name, 1, 1, 0, + 1, output_name="init_decoder_position") + + decoder_result_name = "decoder_result" + decoder_result_shape = [self.batch, hp.outputs_per_step*hp.max_iters, hp.num_mels] + self.add_memory(decoder_result_name, decoder_result_shape, data_type="FLOAT32") + x 
= decoder_input_name + self.scopes[0] = "Tacotron_model" + self.scopes[1] = "inference" + self.scopes[2] = "decoder" + index = 0 + for i in range(hp.max_iters): + self.set_add_layer(i == 0) + position_input_name_new = position_input_name+"_add_one" + self.add_power(position_input_name, position_input_name_new, scale=1, shift=1, power=1) + self.add_copy(position_input_name_new, 1, 1, 0, + position_input_name, 1, 1, 0, + 1, output_name="update_position") + + prenet = self.Prenet(x, layers_sizes=hp.prenet_layers, scope_id=3, scope='decoder_prenet', output_name_prefix="decoder") + + LSTM_input = self.add_concat([prenet, decoder_attention_name], "decoder_concat1", axis=-1) + + if (hp.decoder_query_layers > 0): + LSTM_output = self.DecoderRNN(LSTM_input, decoder_query_lstm_states, layers=hp.decoder_query_layers, + size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, + scope_id=3, scope='decoder_query_LSTM', output_name_prefix="decoder_query_lstm") + else: + LSTM_output = self.DecoderRNN(LSTM_input, decoder_lstm_states, layers=hp.decoder_layers, + size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, + scope_id=3, scope='decoder_LSTM', output_name_prefix="decoder_lstm") + context_vector, alignments, new_cumulated_alignments = self._compute_attention( + LSTM_output, + cumulated_alignments, + attention_layer=None, + prev_max_attentions=None, + encoder_outputs=rnn_result, + hp=hp, scope_id=3, output_name_prefix="decoder_attention") + self.add_copy(context_vector, + rnn_result_dim, rnn_result_dim, 0, + decoder_attention_name, + rnn_result_dim, rnn_result_dim, 0, + rnn_result_dim, + output_name="copy_decoder_attention") + self.add_copy(new_cumulated_alignments, + -1, -1, 0, + cumulated_alignments, + -1, -1, 0, + -1, + output_name="copy_cumulated_alignments") + if (self.params.streaming): + self.add_copy(alignments, + -1, -1, 0, + alignments_history, + hp.max_sequence_length, hp.max_sequence_length, 0, + -1, + output_name="copy_to_alignments_history", + src_index_name=zero, + dst_index_name=position_input_name) + projections_input = self.add_concat([LSTM_output, context_vector], "decoder_concat2", axis=-1) + if (hp.decoder_query_layers > 0): + multi_rnn_output = self.DecoderRNN(projections_input, decoder_lstm_states, layers=hp.decoder_layers, + size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, + scope_id=3, scope='decoder_LSTM', output_name_prefix="decoder_lstm") + else: + multi_rnn_output = projections_input + + frame_projection = self.FrameProjection(multi_rnn_output, hp.num_mels * hp.outputs_per_step, + scope_id=3, scope='linear_transform_projection', output_name_prefix="decoder") + + stop_projection = self.StopProjection(multi_rnn_output, shape=hp.outputs_per_step, + scope_id=3, scope='stop_token_projection', output_name_prefix="decoder") + stop_projection = self.add_power(stop_projection, "decoder_stop_sub", scale=1, shift=-0.5, power=1) + stop_projection = self.add_relu(stop_projection, "decoder_stop_relu") + stop_projection = self.add_reduce_sum(stop_projection, 1, False, "decoder_stop_sum") + self.add_copy(frame_projection, + hp.outputs_per_step*hp.num_mels, hp.outputs_per_step*hp.num_mels, 0, + decoder_result_name, + hp.outputs_per_step*hp.max_iters*hp.num_mels, hp.outputs_per_step*hp.num_mels, 0, + hp.outputs_per_step*hp.num_mels, + output_name="copy_to_global_decoder_buffer", + src_index_name=zero, + dst_index_name=position_input_name) + next_input = "decoder_next_input" + self.add_slice(frame_projection, ["other", next_input], 1, 
[(hp.outputs_per_step-1)*hp.num_mels]) + + self.add_copy(next_input, + hp.num_mels, hp.num_mels, 0, + x, + hp.num_mels, hp.num_mels, 0, + hp.num_mels, + output_name="copy_to_next_decoder_input") + status = "decoder_check" + self.add_check(stop_projection, zero, "great", status) + index = index + 1 + self.add_repeat(hp.max_iters-1, position_input_name_new, output_name="repeat", status_name=status) + if (self.get_tensor(status)[0] or index > hp.max_iters-1): + break; + if (self.params.streaming): + mels = decoder_result_name + if (hp.signal_normalization): + mels = self.convert_db_melgan_log(self._denormalize(mels, hp), hp) + outputs = [mels, position_input_name, stop_projection, decoder_attention_name, + cumulated_alignments, alignments_history] + outputs.extend(decoder_query_lstm_states) + outputs.extend(decoder_lstm_states) + else: + outputs = [decoder_result_name, position_input_name] + self.add_output(outputs) + self.save_caffe_model() + return self.get_tensor(decoder_result_name), self.get_tensor(position_input_name) + + def generate_encoder_decoder(self, inputs=None): + rnn_result = self.generate_encoder(inputs) + return self.generate_decoder(rnn_result, inputs) + + def convert_db_melgan_log(self, mel, hparams): + #return (mel + hparams.ref_level_db) / 20 + return self.add_power(mel, "mel", scale=1/20.0, shift=hparams.ref_level_db/20.0, power=1) + + def _denormalize(self, D, hparams): + if hparams.allow_clipping_in_normalization: + if hparams.symmetric_mels: + #return (((np.clip(D, -hparams.max_abs_value, + # hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / ( + # 2 * hparams.max_abs_value)) + # + hparams.min_level_db) + clip_result = self.add_clip(D, "mel_clip", -hparams.max_abs_value, hparams.max_abs_value) + b = -hparams.min_level_db / (2 * hparams.max_abs_value) + return self.add_power(clip_result, "mel_denormalize", scale=b, shift=hparams.max_abs_value*b+hparams.min_level_db, power=1) + else: + #return ((np.clip(D, 0, + # hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) + clip_result = self.add_clip(D, "mel_clip", 0, hparams.max_abs_value) + return self.add_power(clip_result, "mel_denormalize", scale=-hparams.min_level_db / hparams.max_abs_value, + shift=hparams.min_level_db, power=1) + + + if hparams.symmetric_mels: + #return (((D + hparams.max_abs_value) * -hparams.min_level_db / ( + # 2 * hparams.max_abs_value)) + hparams.min_level_db) + b = -hparams.min_level_db / (2 * hparams.max_abs_value) + return self.add_power(D, "mel_denormalize", scale=b, shift=hparams.max_abs_value*b+hparams.min_level_db, power=1) + else: + #return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) + return self.add_power(D, "mel_denormalize", scale=-hparams.min_level_db / hparams.max_abs_value, shift=hparams.min_level_db, power=1) + + def generate_postnet(self, inputs=None): + hp = self.params + decoder_result_name = "tts_decoder" + decoder_result_shape = [self.batch, hp.outputs_per_step*hp.max_iters, hp.num_mels] + self.add_input(decoder_result_name, decoder_result_shape) + self.set_input(inputs) + self.save_input() + + self.scopes[0] = "Tacotron_model" + self.scopes[1] = "inference" + + #Postnet + postnet = self.Postnet(decoder_result_name, hparams=hp, scope_id=2, scope='postnet_convolutions', output_name_prefix="postnet") + projected_residual = self.FrameProjection(postnet, hp.num_mels, scope_id=2, scope='postnet_projection', output_name_prefix="postnet_projection") + mel_outputs = 
self.add_sum([decoder_result_name, projected_residual], "mel_sum") + mel_outputs = self.transpose_nhc_nchw(mel_outputs) + if (hp.signal_normalization): + mels = self.convert_db_melgan_log(self._denormalize(mel_outputs, hp), hp) + + self.save_caffe_model() diff --git a/model_tools/tools/tensorflow2caffe/tts/transform_tactron2.py b/model_tools/tools/tensorflow2caffe/tts/transform_tactron2.py new file mode 100644 index 00000000..b7ae033a --- /dev/null +++ b/model_tools/tools/tensorflow2caffe/tts/transform_tactron2.py @@ -0,0 +1,73 @@ +#!/usr/local/bin/python +# -*- coding: utf-8 -*- + +from tensorflow2caffe_tactron2 import Tensorflow2CaffeTactron2 +import os +import numpy as np + +def text_to_speech(tensorflow_model_path): + params = Tensorflow2CaffeTactron2.Parameters() + tts_caffe = Tensorflow2CaffeTactron2(tensorflow_model_path, "tts_encoder_decoder", "tts_encoder_decoder", + params, + check=False, calc=True) + + data = {} + data["tts_words"] = np.array([[4, 25, 14, 33, 11, 20, 1, 9, 14, 33, 27, 2, 20, 35, 15, 1, 10, 37, 11, 2, 30, + 34, 15, 7, 21, 1, 25, 14, 35, 21, 27, 3, 25, 14, 34, 27, 1, 25, 14, 35, 27, 1, 17, 36, 7, 20, 1, 37, 7, 0]]) + data["tts_alignments"] = np.zeros(data["tts_words"].shape) + data["tts_emotions"] = np.array([[4]*data["tts_words"].size]) + decoder_result, num = tts_caffe.generate_encoder_decoder(data) + os.system('mv input_shape.txt encoder_decoder_input_shape.txt') + + tts_caffe = Tensorflow2CaffeTactron2(tensorflow_model_path, "tts_postnet", "tts_postnet", + params, + check=False, calc=True) + data = {} + data["tts_decoder"] = decoder_result[:, :int(num+1)*params.outputs_per_step, :] + tts_caffe.generate_postnet(data) + os.system('mv input_shape.txt postnet_input_shape.txt') + + +def genrate_streaming_lstm_states(layers, num_units, output_name_prefix): + states = {} + for i in range(layers): + state_shape = [1, num_units] + state_name = output_name_prefix + "_layer" + str(i) + "_state" + states[state_name] = np.zeros(state_shape) + return states + +def text_to_speech_streaming(tensorflow_model_path): + params = Tensorflow2CaffeTactron2.Parameters() + params.streaming = True + params.max_iters = 12 + tts_caffe = Tensorflow2CaffeTactron2(tensorflow_model_path, "tts_encoder", "tts_encoder", + params, + check=False, calc=True) + + data = {} + data["tts_words"] = np.array([[4, 25, 14, 33, 11, 20, 1, 9, 14, 33, 27, 2, 20, 35, 15, 1, 10, 37, 11, 2, 30, + 34, 15, 7, 21, 1, 25, 14, 35, 21, 27, 3, 25, 14, 34, 27, 1, 25, 14, 35, 27, 1, 17, 36, 7, 20, 1, 37, 7, 0]]) + data["tts_emotions"] = np.array([[4]*data["tts_words"].size]) + rnn_result = tts_caffe.generate_encoder(data) + rnn_result_data = tts_caffe.get_tensor(rnn_result) + os.system('mv input_shape.txt encoder_input_shape.txt') + + tts_caffe = Tensorflow2CaffeTactron2(tensorflow_model_path, "tts_decoder", "tts_decoder", + params, + check=False, calc=True) + tts_caffe.print_weight_map() + data = {} + data[rnn_result] = rnn_result_data + data["tts_alignments"] = np.zeros([1, rnn_result_data.shape[1]]) + data["decoder_input"] = np.zeros([1, params.num_mels]) + data["decoder_attention"] = np.zeros([1, rnn_result_data.shape[2]]) + data.update(genrate_streaming_lstm_states(1, params.decoder_lstm_units*2, "decoder_query")) + data.update(genrate_streaming_lstm_states(params.decoder_layers, params.decoder_lstm_units*2, "decoder_lstm")) + tts_caffe.generate_decoder(rnn_result, data) + os.system('mv input_shape.txt decoder_input_shape.txt') + +if __name__ == '__main__': + tensorflow_model_path = 
"/data/models/tts/taco_pretrained-290000/tacotron_model.ckpt-290000" + # text_to_speech(tensorflow_model_path) + text_to_speech_streaming(tensorflow_model_path) + diff --git a/model_tools/tools/tensorflow2json/tf2json.py b/model_tools/tools/tensorflow2json/tf2json.py new file mode 100644 index 00000000..a77dfa02 --- /dev/null +++ b/model_tools/tools/tensorflow2json/tf2json.py @@ -0,0 +1,55 @@ +from google.protobuf import json_format +import sys +import json +import numpy as np +import tensorflow as tf +from tensorflow.python.platform import gfile +from tensorflow.python.framework import tensor_util +import math + +GRAPH_PB_PATH = "" +SAVE_JSON_PATH = "" +if (len(sys.argv) != 2): + print("Error input, please input 2 params(GRAPH_PB_PATH and SAVE_JSON_PATH) respectively.\n") +else: + GRAPH_PB_PATH = sys.argv[0] + SAVE_JSON_PATH = sys.argv[1] + +global_weight_dict = {} +with tf.Session() as sess: + with gfile.FastGFile(GRAPH_PB_PATH, "rb") as f: + graph_def = tf.GraphDef() + graph_def.ParseFromString(f.read()) + json_string = json_format.MessageToJson(graph_def) + + json_string = json_string.replace(' ', '') + json_string = json_string.replace('\n', '') + + d = json.loads(json_string) + sess.graph.as_default() + tf.import_graph_def(graph_def, name='') + graph_nodes=[n for n in graph_def.node] + for item in graph_nodes: + if item.op == "Const": + weight_values = (tensor_util.MakeNdarray(item.attr['value'].tensor).astype("float64")).flatten().tolist() + for wvIndex in range(len(weight_values)): + if weight_values[wvIndex] == math.inf: + weight_values[wvIndex] = np.finfo(np.float64).max + elif weight_values[wvIndex] == -math.inf: + weight_values[wvIndex] = np.finfo(np.float64).min + tmp_numpy_arr = tensor_util.MakeNdarray(item.attr['value'].tensor) + weight_values_new = tmp_numpy_arr.astype('float64').flatten().tolist() + weight_op_name = item.name + global_weight_dict[weight_op_name] = weight_values + + totalConstIndex = 0 + constIndex = 0 + for node in d["node"]: + if node["op"] == "Const": + totalConstIndex = totalConstIndex + 1 + node["attr"]["value"]["tensor"]["tensorContent"] = global_weight_dict[node["name"]] + constIndex = constIndex + 1 + + final_dict = json.dumps(d) + with open(SAVE_JSON_PATH, 'w', encoding="utf-8") as f: + json.dump(final_dict, f, ensure_ascii=False) diff --git a/quick_benchmark.sh b/quick_benchmark.sh deleted file mode 100644 index 95975c4a..00000000 --- a/quick_benchmark.sh +++ /dev/null @@ -1,154 +0,0 @@ -#!/bin/bash - -script_name=$0 -script_abs=$(readlink -f "$0") -script_dir=$(dirname $script_abs) - -host_test_dir="" -host_kit_dir="" -use_static_library=true -host_lib_dir="" -exe_on_device=true -array=($(adb devices | grep ".device$")) -device=${array[0]} -cpu_mask="40" -device_dir="" -gpu=false - -print_help() { - cat < run specified program in test . - -k, --kit run specified program in kit . - -l, --lib use specified library in . - -d, --device run test on device. - -c, --cpu_mask taskset cpu mask(default: 40). - -g, --gpu run gpu test. - -p, --path run test on device in specified PATH. -EOF - exit 1; -} - - -TEMP=`getopt -o t:k:c:hl:d:p:g --long test:kit:cpu_mask:help,lib:device:path:gpu \ - -n ${script_name} -- "$@"` -if [ $? != 0 ] ; then echo "[ERROR] terminating..." 
>&2 ; exit 1 ; fi -eval set -- "$TEMP" -while true ; do - case "$1" in - -t|--test) - host_test_dir=$2 - echo "[INFO] run test in '${host_test_dir}'" ; - shift 2 ;; - -k|--kit) - host_kit_dir=$2 - echo "[INFO] run test in '${host_kit_dir}'" ; - shift 2 ;; - -c|--cpu_mask) - cpu_mask=$2 - echo "[INFO] CPU mask '${cpu_mask}'" ; - shift 2 ;; - -l|--lib) - use_static_library=false; - host_lib_dir=$2 - echo "[INFO] use library in ${host_lib_dir}" ; - shift 2 ;; - -d|--device) - device=$2 - exe_on_device=true - echo "[INFO] test on device \`${device}'" ; - shift 2 ;; - -p|--path) - device_dir=$2 - echo "[INFO] test on device directory \`${device_dir}'" ; - shift 2 ;; - -g|--gpu) - gpu=true; - shift ;; - -h|--help) - print_help ; - shift ;; - --) shift ; - break ;; - *) echo "[ERROR]" ; exit 1 ;; - esac -done - - -if [ ${exe_on_device} == true ] ; then - status=`adb -s ${device} shell "ls ${device_dir} && echo 'success'" | tail -n 1` - if [ "${status}" != "success" ] ; then - adb -s ${device} shell "mkdir ${device_dir}" - fi - if [ ${use_static_library} != true ] ; then - for file in `ls ${host_lib_dir}/*.so` - do - adb -s ${device} push ${file} ${device_dir} || exit 1 - done - ${script_dir}/scripts/push_third_party.sh -l ${script_dir}/third_party/llvm -d ${device} -p ${device_dir} || exit 1 - fi -fi - -function device_excute(){ - adb -s ${device} shell "export LD_LIBRARY_PATH=${device_dir} && taskset ${cpu_mask} $@ || echo '[FAILURE]'" &> status.txt - cat status.txt || exit 1 - if [ `grep -c "\[FAILURE\]" status.txt` -ne '0' ] ; then - exit 1 - fi - rm status.txt -} - -# mmm -adb -s ${device} push ${host_test_dir}/test_mmm_int8 ${device_dir} || exit 1 -adb -s ${device} push ${host_test_dir}/test_mmm ${device_dir} || exit 1 -echo " " ; echo "--- Matrix Matrix Multiplication" -device_excute ${device_dir}/test_mmm 384 768 768 - -# conv_ic=3 -adb -s ${device} push ${host_test_dir}/test_convolution ${device_dir} || exit 1 -echo " " ; echo "--- Conv IC=3" -device_excute ${device_dir}/test_convolution 1 3 227 227 96 3 11 11 4 0 1 96 55 55 - -# conv_5x5 -adb -s ${device} push ${host_test_dir}/test_convolution_bnn ${device_dir} || exit 1 -adb -s ${device} push ${host_test_dir}/test_convolution_int8 ${device_dir} || exit 1 -echo " " ; echo "--- Conv 5x5" -device_excute ${device_dir}/test_convolution_bnn 1 96 27 27 256 96 5 5 2 0 1 256 13 13 -device_excute ${device_dir}/test_convolution_int8 1 96 27 27 256 96 5 5 2 0 1 256 13 13 -device_excute ${device_dir}/test_convolution 1 96 27 27 256 96 5 5 2 0 1 256 13 13 - -# conv_3x3 -echo " " ; echo "--- Conv 3x3" -device_excute ${device_dir}/test_convolution_bnn 1 128 28 28 256 256 3 3 1 1 1 192 28 28 -device_excute ${device_dir}/test_convolution_int8 1 128 28 28 256 256 3 3 1 1 1 192 28 28 -device_excute ${device_dir}/test_convolution 1 128 28 28 256 256 3 3 1 1 1 192 28 28 - -# conv_5x5 -adb -s ${device} push ${host_test_dir}/test_depthwise_convolution ${device_dir} || exit 1 -echo " " ; echo "--- Depthwise-Pointwise Conv" -device_excute ${device_dir}/test_depthwise_convolution 1 256 28 28 256 256 3 3 1 1 1 256 28 28 - -# OCL -if [ ${gpu} == true ] ; then - adb -s ${device} push ${host_kit_dir}/hdr ${device_dir} || exit 1 - echo " " ; echo " " ; echo "--- GPU Network Test (HDR_OCL)" - echo " " ; echo "=== Input FP16" - device_excute ${device_dir}/hdr 1 3 720 1280 - echo " " ; echo "=== Input UCHAR" - device_excute ${device_dir}/hdr 1 3 720 1280 UCHAR - - adb -s ${device} push ${host_test_dir}/test_convolution_ocl ${device_dir} || exit 1 - adb -s ${device} push 
${host_test_dir}/test_depthwise_convolution_ocl ${device_dir} || exit 1 - adb -s ${device} push ${host_test_dir}/test_fully_connected_ocl ${device_dir} || exit 1 - device_excute ${device_dir}/test_convolution_ocl 64 112 112 64 5 5 1 2 - device_excute ${device_dir}/test_convolution_ocl 64 112 112 64 3 3 1 1 - device_excute ${device_dir}/test_depthwise_convolution_ocl 64 112 112 64 3 3 1 1 - device_excute ${device_dir}/test_fully_connected_ocl 24 1 1 96 -fi - -adb -s ${device} shell "rm -rf ${device_dir}" - diff --git a/scripts/build_light_bolt.sh b/scripts/build_light_bolt.sh index c0085dc1..f702c71a 100644 --- a/scripts/build_light_bolt.sh +++ b/scripts/build_light_bolt.sh @@ -5,13 +5,16 @@ script_abs=$(readlink -f "$0") script_dir=$(dirname $script_abs) BOLT_ROOT=${script_dir}/.. -build_dir=$1 -use_mali=$2 -use_debug=$3 -use_android=$4 -CXX=$5 -AR=$6 -STRIP=$7 +CXX=$1 +AR=$2 +STRIP=$3 +build_dir=$4 +use_mali=$5 +use_debug=$6 +use_android=$7 +use_android_log=$8 +use_ios=$9 +use_openmp=${10} allSrcs="" skip_list=() @@ -31,30 +34,45 @@ searchFiles() { done if [[ ${skip} == false ]] then - srcs="${srcs} ${line}" + srcs="${srcs} ${build_dir}/${line}" fi done } -allSrcs=`find ${build_dir} -name "*.o"` -skip_list=("static" "model-tools" "tests" "tools" "kits" "data_loader") -searchFiles -jniLibrarySrcs="${srcs} ${build_dir}/model-tools/src/CMakeFiles/model-tools.dir/model_deserialize.cpp.o \ -${build_dir}/model-tools/src/CMakeFiles/model-tools.dir/model_tools.cpp.o" +if [ $use_ios == "OFF" ]; +then + allSrcs=`find ${build_dir} -name "*.o" -printf "%P\n"` + skip_list=("static" "model_tools" "tests" "tools" "examples" "flow" "data_loader") + searchFiles + jniLibrarySrcs="${srcs} \ + ${build_dir}/model_tools/src/CMakeFiles/model_tools.dir/model_tools.cpp.o" +fi -allSrcs=`find ${build_dir} -name "*.o" | grep "static.dir"` -skip_list=("tests" "tools" "kits" "BoltModel_Jni") +allSrcs=`find ${build_dir} -name "*.o" -printf "%P\n"| grep "static.dir"` +skip_list=("tests" "tools" "examples" "BoltModel_Jni" "flow" "data_loader") searchFiles -staticLibrarySrcs="${srcs} ${build_dir}/model-tools/src/CMakeFiles/model-tools_static.dir/model_deserialize.cpp.o \ -${build_dir}/model-tools/src/CMakeFiles/model-tools_static.dir/model_tools.cpp.o" +staticLibrarySrcs="${srcs} \ +${build_dir}/model_tools/src/CMakeFiles/model_tools_static.dir/model_tools.cpp.o" -allSrcs=`find ${build_dir} -name "*.o"` -skip_list=("static" "tests" "tools" "kits") +allSrcs=`find ${build_dir} -name "*.o" -printf "%P\n"` +skip_list=("static" "tests" "tools" "examples" "BoltModel_Jni" "flow" "data_loader") searchFiles -sharedLibrarySrcs=${srcs} +sharedLibrarySrcs="${srcs} \ +${build_dir}/model_tools/src/CMakeFiles/model_tools_static.dir/model_tools.cpp.o" + +if [ -f "${build_dir}/common/gcl/tools/kernel_source_compile/libkernelsource.so" ] && [ $use_mali == "ON" ]; +then + gclLibrarySrcs="${build_dir}/common/gcl/tools/kernel_source_compile/CMakeFiles/kernelsource.dir/src/cl/gcl_kernel_source.cpp.o \ + ${build_dir}/common/gcl/tools/kernel_source_compile/CMakeFiles/kernelsource.dir/src/cl/inline_cl_source.cpp.o \ + ${build_dir}/common/gcl/tools/kernel_source_compile/CMakeFiles/kernelsource.dir/src/option/gcl_kernel_option.cpp.o \ + ${build_dir}/common/gcl/tools/kernel_source_compile/CMakeFiles/kernelsource.dir/src/option/inline_cl_option.cpp.o" + jniLibrarySrcs="${jniLibrarySrcs} ${gclLibrarySrcs}" + staticLibrarySrcs="${staticLibrarySrcs} ${gclLibrarySrcs}" + sharedLibrarySrcs="${sharedLibrarySrcs} ${gclLibrarySrcs}" +fi -if [ -f 
"${BOLT_ROOT}/third_party/llvm/opencl/lib64/libOpenCL.so" ] && [ $use_mali == "ON" ]; +if [ -f "${BOLT_ROOT}/third_party/arm_llvm/opencl/lib64/libOpenCL.so" ] && [ $use_mali == "ON" ]; then - cp ${BOLT_ROOT}/third_party/llvm/opencl/lib64/libOpenCL.so ${build_dir} + cp ${BOLT_ROOT}/third_party/arm_llvm/opencl/lib64/libOpenCL.so ${build_dir} ${STRIP} ${build_dir}/libOpenCL.so || exit 1 fi @@ -66,41 +84,49 @@ if [ -f "${build_dir}/libbolt.so" ]; then rm -rf ${build_dir}/libbolt.so fi +if [ -f "${build_dir}/libbolt.dylib" ]; +then + rm -rf ${build_dir}/libbolt.dylib +fi if [ -f "${build_dir}/libBoltModel.so" ]; then rm -rf ${build_dir}/libBoltModel.so fi -if [ -f "${BOLT_ROOT}/gcl/tools/kernel_lib_compile/lib/libkernelbin.so" ] && [ $use_mali == "ON" ]; +lib="" +if [ $use_android_log == "ON" ] && [ $use_android == "ON" ]; then - if [ $use_debug == "ON" ] && [ $use_android == "ON" ]; - then - ${STRIP} ${BOLT_ROOT}/gcl/tools/kernel_lib_compile/lib/libkernelbin.so || exit 1 - ${CXX} -shared -o ${build_dir}/libBoltModel.so ${jniLibrarySrcs} \ - -L${BOLT_ROOT}/third_party/llvm/opencl/lib64 -lOpenCL \ - -L${BOLT_ROOT}/gcl/tools/kernel_lib_compile/lib -lkernelbin -llog || exit 1 - else - ${CXX} -shared -o ${build_dir}/libBoltModel.so ${jniLibrarySrcs} \ - -L${BOLT_ROOT}/third_party/llvm/opencl/lib64 -lOpenCL \ - -L${BOLT_ROOT}/gcl/tools/kernel_lib_compile/lib -lkernelbin || exit 1 - fi - ${CXX} -shared -o ${build_dir}/libbolt.so ${sharedLibrarySrcs} \ - -L${BOLT_ROOT}/third_party/llvm/opencl/lib64 -lOpenCL \ - -L${BOLT_ROOT}/gcl/tools/kernel_lib_compile/lib -lkernelbin || exit 1 + lib="${lib} -llog" +fi +if [ $use_openmp == "ON" ]; +then + lib="${lib} -fopenmp" +fi +if [ -f "${build_dir}/common/gcl/tools/kernel_source_compile/libkernelsource.so" ] && [ $use_mali == "ON" ]; +then + ${STRIP} ${build_dir}/common/gcl/tools/kernel_source_compile/libkernelsource.so || exit 1 + lib="${lib} -L${BOLT_ROOT}/third_party/arm_llvm/opencl/lib64 -lOpenCL" +fi + +if [ $use_ios == "ON" ]; +then + ${CXX} -shared -o ${build_dir}/libbolt.dylib ${sharedLibrarySrcs} ${lib} || exit 1 else - if [ $use_debug == "ON" ] && [ $use_android == "ON" ]; - then - ${CXX} -shared -o ${build_dir}/libBoltModel.so ${jniLibrarySrcs} -llog || exit 1 - else - ${CXX} -shared -o ${build_dir}/libBoltModel.so ${jniLibrarySrcs} || exit 1 - fi - ${CXX} -shared -o ${build_dir}/libbolt.so ${sharedLibrarySrcs} || exit 1 + ${CXX} -shared -o ${build_dir}/libBoltModel.so ${jniLibrarySrcs} ${lib} -Wl,-soname,libBoltModel.so || exit 1 + ${CXX} -shared -o ${build_dir}/libbolt.so ${sharedLibrarySrcs} ${lib} -Wl,-soname,libbolt.so || exit 1 fi + ${AR} -rc ${build_dir}/libbolt.a ${staticLibrarySrcs} || exit 1 if [ $use_debug == "OFF" ]; then - ${STRIP} ${build_dir}/libBoltModel.so || exit 1 - ${STRIP} ${build_dir}/libbolt.so || exit 1 - ${STRIP} -g -S -d --strip-debug --strip-unneeded ${build_dir}/libbolt.a || exit 1 + if [ $use_ios == "OFF" ]; + then + ${STRIP} ${build_dir}/libBoltModel.so || exit 1 + fi + if [ $use_ios == "OFF" ]; + then + ${STRIP} ${build_dir}/libbolt.so || exit 1 + ${STRIP} -g -S -d --strip-debug --strip-unneeded ${build_dir}/libbolt.a || exit 1 + fi fi diff --git a/scripts/operator_driver.sh b/scripts/operator_driver.sh deleted file mode 100644 index 5cb0e137..00000000 --- a/scripts/operator_driver.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash - -script_name=$0 -script_abs=$(readlink -f "$0") -script_dir=$(dirname $script_abs) - -cpu_mask="2" -exe_host_path="" -parameter_file_path="" -excute_on_device=false -use_static_library=false 
-device="" -device_dir="" -exe_device_path="" - - -print_help() { - cat < run specified program. - -i, --input parameter file PATH. - -s, --static use the static library(default: false). - -c, --cpu_mask taskset cpu mask(default: 2). - -d, --device run test on device. - -p, --path run test on device in specified . -EOF - exit 1; -} - -TEMP=`getopt -o c:d:e:i:p:hs: --long cpu_mask:device:exe:input:path:help,static: \ - -n ${script_name} -- "$@"` -if [ $? != 0 ] ; then echo "[ERROR] terminating..." >&2 ; exit 1 ; fi -eval set -- "$TEMP" -while true ; do - case "$1" in - -c|--cpu_mask) - cpu_mask=$2 - echo "[INFO] CPU mask '${cpu_mask}'" ; - shift 2 ;; - -d|--device) - device=$2 - exe_on_device=true - echo "[INFO] test on device '${device}'" ; - shift 2 ;; - -p|--path) - device_dir=$2 - echo "[INFO] test on device path '${device_dir}'" ; - shift 2 ;; - -s|--static) - use_static_library=$2 - echo "[INFO] use static library: ${use_static_library}" ; - shift 2;; - -e|--exe) - exe_host_path=$2 - echo "[INFO] exe '${exe_host_path}'" ; - shift 2 ;; - -i|--input) - parameter_file_path=$2 - echo "[INFO] parameter \`${parameter_file_path}'" ; - shift 2 ;; - -h|--help) - print_help ; - shift ;; - --) shift ; - break ;; - *) echo "[ERROR]" ; exit 1 ;; - esac -done - -if [ "${exe_host_path}" == "" ] || [ ! -f ${exe_host_path} ] ; then - echo "[ERROR] exe '${exe}' doesn't exist"; - exit 1 -fi - -if [ "${parameter_file_path}" == "" ] || [ ! -f ${parameter_file_path} ] ; then - echo "[ERROR] parameter '${parameter_file_path}' doesn't exist"; - exit 1 -fi - -if [ ${exe_on_device} == true ] ; then - exe_name=${exe_host_path##*/} - exe_device_path="${device_dir}/${exe_name}" - adb -s ${device} push ${exe_host_path} ${exe_device_path} || exit 1 -fi - -while read params -do - # filter out the params that starts with '#' - if [[ ! "$params" =~ ^#.* ]]; then - params_len=${#params} - if [[ $params_len -gt 0 ]]; then - #echo " parameter: ${params}" - if [ ${exe_on_device} == true ] ; then - if [ ${use_static_library} == true ] ; then - adb -s ${device} shell "taskset ${cpu_mask} ${exe_device_path} ${params} || echo '[FAILURE]'" &> status.txt - else - adb -s ${device} shell "export LD_LIBRARY_PATH=${device_dir} && taskset ${cpu_mask} ${exe_device_path} ${params} || echo '[FAILURE]'" &> status.txt - fi - else - if [ ${use_static_library} == true ] ; then - ${exe_host_path} ${params} || echo '[FAILURE]' &> status.txt - else - export LD_LIBRARY_PATH=${exe_host_path}/../lib:${LD_LIBRARY_PATH} && ${exe_host_path} ${params} || echo '[FAILURE]' &> status.txt - fi - fi - cat status.txt || exit 1 - if [ `grep -c "\[FAILURE\]" status.txt` -ne '0' ] ; then - exit 1 - fi - rm status.txt - fi - fi -done < ${parameter_file_path} - -if [ ${exe_on_device} == true ] ; then - adb -s ${device} shell "rm -rf ${exe_device_path}" -fi diff --git a/scripts/operator_test.sh b/scripts/operator_test.sh deleted file mode 100644 index 499529a1..00000000 --- a/scripts/operator_test.sh +++ /dev/null @@ -1,150 +0,0 @@ -#!/bin/bash - -script_name=$0 -script_abs=$(readlink -f "$0") -script_dir=$(dirname $script_abs) -driver_script_path="${script_dir}/operator_driver.sh" - -host_bin_dir="" -use_static_library=true -host_lib_dir="" -excute_on_device=false -device="" -cpu_mask="2" -device_dir="" - -print_help() { - cat < run specified program in . - -l, --lib use sprcified library in . - -d, --device run test on device. - -c, --cpu_mask taskset cpu mask(default: 2). - -p, --path run test on device in specified PATH. 
diff --git a/scripts/operator_test.sh b/scripts/operator_test.sh deleted file mode 100644 index 499529a1..00000000 --- a/scripts/operator_test.sh +++ /dev/null @@ -1,150 +0,0 @@
-#!/bin/bash
-
-script_name=$0
-script_abs=$(readlink -f "$0")
-script_dir=$(dirname $script_abs)
-driver_script_path="${script_dir}/operator_driver.sh"
-
-host_bin_dir=""
-use_static_library=true
-host_lib_dir=""
-excute_on_device=false
-device=""
-cpu_mask="2"
-device_dir=""
-
-print_help() {
-    cat <<EOF
-Usage: ${script_name} [OPTION]...
-Run the operator test suite.
-  -h, --help                display this help and exit.
-  -b, --bin <PATH>          run specified program in <PATH>.
-  -l, --lib <PATH>          use specified library in <PATH>.
-  -d, --device <device_id>  run test on device.
-  -c, --cpu_mask            taskset cpu mask(default: 2).
-  -p, --path <PATH>         run test on device in specified PATH.
-EOF
-    exit 1;
-}
-
-
-TEMP=`getopt -o b:c:hl:d:p: --long bin:cpu_mask:help,lib:device:path: \
-     -n ${script_name} -- "$@"`
-if [ $? != 0 ] ; then echo "[ERROR] terminating..." >&2 ; exit 1 ; fi
-eval set -- "$TEMP"
-while true ; do
-    case "$1" in
-        -b|--bin)
-            host_bin_dir=$2
-            echo "[INFO] run test in '${host_bin_dir}'" ;
-            shift 2 ;;
-        -c|--cpu_mask)
-            cpu_mask=$2
-            echo "[INFO] CPU mask '${cpu_mask}'" ;
-            shift 2 ;;
-        -l|--lib)
-            use_static_library=false;
-            host_lib_dir=$2
-            echo "[INFO] use library in ${host_lib_dir}" ;
-            shift 2 ;;
-        -d|--device)
-            device=$2
-            exe_on_device=true
-            echo "[INFO] test on device \`${device}'" ;
-            shift 2 ;;
-        -p|--path)
-            device_dir=$2
-            echo "[INFO] test on device directory \`${device_dir}'" ;
-            shift 2 ;;
-        -h|--help)
-            print_help ;
-            shift ;;
-        --) shift ;
-            break ;;
-        *) echo "[ERROR]" ; exit 1 ;;
-    esac
-done
-
-run_command() {
-    params=" -c ${cpu_mask} -e $1 -i $2"
-    if [ ${exe_on_device} == true ] ; then
-        params="${params} -p ${device_dir} -d ${device}"
-    fi
-    if [ ${use_static_library} == true ] ; then
-        params="${params} -s ${use_static_library}"
-    fi
-    ${driver_script_path} ${params} || exit 1
-}
-
-if [ ${exe_on_device} == true ] ; then
-    status=`adb -s ${device} shell "ls ${device_dir} && echo 'success'" | tail -n 1`
-    if [ "${status}" == "success" ] ; then
-        if [ ${use_static_library} != true ] ; then
-            adb -s ${device} push ${host_lib_dir}/libblas-enhance.so ${device_dir}
-            adb -s ${device} push ${host_lib_dir}/libtensor_computing.so ${device_dir}
-        fi
-    else
-        adb -s ${device} shell "mkdir ${device_dir}"
-    fi
-fi
-
-
-# FP32 & FP16 operator test
-# blas-enhance
-run_command ${host_bin_dir}/test_mmm ${script_dir}/params/mmm.csv
-run_command ${host_bin_dir}/test_mvm ${script_dir}/params/mvm.csv
-
-# tensor_computing
-run_command ${host_bin_dir}/test_activation ${script_dir}/params/activation.csv
-run_command ${host_bin_dir}/test_attention ${script_dir}/params/attention.csv
-run_command ${host_bin_dir}/test_reduction ${script_dir}/params/reduction.csv
-run_command ${host_bin_dir}/test_clip ${script_dir}/params/clip.csv
-run_command ${host_bin_dir}/test_concat ${script_dir}/params/concat.csv
-run_command ${host_bin_dir}/test_convolution ${script_dir}/params/convolution.csv
-run_command ${host_bin_dir}/test_convolution ${script_dir}/params/alexnet_convolution.csv
-run_command ${host_bin_dir}/test_convolution ${script_dir}/params/googlenet_convolution.csv
-run_command ${host_bin_dir}/test_convolution ${script_dir}/params/resnet50_convolution.csv
-run_command ${host_bin_dir}/test_deconvolution ${script_dir}/params/deconvolution.csv
-run_command ${host_bin_dir}/test_depthwise_convolution ${script_dir}/params/mobilenetv1_depthwise_convolution.csv
-run_command ${host_bin_dir}/test_depthwise_convolution ${script_dir}/params/mobilenetv2_depthwise_convolution.csv
-run_command ${host_bin_dir}/test_depthwise_convolution ${script_dir}/params/mobilenetv3_depthwise_convolution.csv
-run_command ${host_bin_dir}/test_dilated_convolution ${script_dir}/params/dilated_convolution.csv
-run_command ${host_bin_dir}/test_eltwise ${script_dir}/params/eltwise.csv
-run_command ${host_bin_dir}/test_fully_connected ${script_dir}/params/lenet_fully_connected.csv
-run_command ${host_bin_dir}/test_lstm ${script_dir}/params/lstm.csv
-run_command ${host_bin_dir}/test_multiply ${script_dir}/params/multiply.csv
-run_command ${host_bin_dir}/test_pooling ${script_dir}/params/pooling.csv
-run_command ${host_bin_dir}/test_reshape ${script_dir}/params/reshape.csv
-run_command
${host_bin_dir}/test_softmax ${script_dir}/params/softmax.csv -run_command ${host_bin_dir}/test_split ${script_dir}/params/split.csv -run_command ${host_bin_dir}/test_slice ${script_dir}/params/slice.csv -run_command ${host_bin_dir}/test_scale ${script_dir}/params/scale.csv -run_command ${host_bin_dir}/test_transpose ${script_dir}/params/transpose.csv - -# INT8 operator test -# blas-enhance -run_command ${host_bin_dir}/test_mmm_int8 ${script_dir}/params/mmm.csv -run_command ${host_bin_dir}/test_mvm_int8 ${script_dir}/params/mvm.csv - -# tensor_computing -run_command ${host_bin_dir}/test_concat_int8 ${script_dir}/params/concat.csv -run_command ${host_bin_dir}/test_pooling_int8 ${script_dir}/params/pooling.csv -run_command ${host_bin_dir}/test_convolution_int8 ${script_dir}/params/alexnet_convolution.csv -run_command ${host_bin_dir}/test_convolution_int8 ${script_dir}/params/googlenet_convolution.csv -run_command ${host_bin_dir}/test_convolution_int8 ${script_dir}/params/resnet50_convolution.csv -run_command ${host_bin_dir}/test_depthwise_convolution_int8 ${script_dir}/params/mobilenetv1_depthwise_convolution.csv -run_command ${host_bin_dir}/test_depthwise_convolution_int8 ${script_dir}/params/mobilenetv2_depthwise_convolution.csv -run_command ${host_bin_dir}/test_depthwise_convolution_int8 ${script_dir}/params/mobilenetv3_depthwise_convolution.csv - -# BNN operator test -run_command ${host_bin_dir}/test_convolution_bnn ${script_dir}/params/bnn_convolution.csv - - -if [ ${exe_on_device} == true ] ; then - if [ ${use_static_library} != true ] ; then - adb -s ${device} shell "rm -rf ${device_dir}/libblas-enhance.so" - adb -s ${device} shell "rm -rf ${device_dir}/libtensor_computing.so" - fi -fi diff --git a/scripts/params/alexnet_convolution.csv b/scripts/params/alexnet_convolution.csv deleted file mode 100644 index 23682d37..00000000 --- a/scripts/params/alexnet_convolution.csv +++ /dev/null @@ -1,6 +0,0 @@ -#in_n in_c in_h in_w f_n f_c f_h f_w stride padding out_n out_c out_h out_w -1 3 227 227 96 3 11 11 4 0 1 96 55 55 -1 96 27 27 256 96 5 5 2 0 1 256 13 13 -1 256 13 13 384 256 3 3 1 1 1 384 13 13 -1 384 13 13 384 384 3 3 1 1 1 384 13 13 -1 384 13 13 256 384 3 3 1 1 1 256 13 13 diff --git a/scripts/params/argmax.csv b/scripts/params/argmax.csv deleted file mode 100644 index b7bdd748..00000000 --- a/scripts/params/argmax.csv +++ /dev/null @@ -1,5 +0,0 @@ -#in ic ih iw axis -1 64 24 24 -1 -1 8 100 100 1 -1 8 100 100 2 -1 8 100 100 3 diff --git a/scripts/params/bnn_convolution.csv b/scripts/params/bnn_convolution.csv deleted file mode 100644 index 09d7386b..00000000 --- a/scripts/params/bnn_convolution.csv +++ /dev/null @@ -1,53 +0,0 @@ -#in_n in_c in_h in_w f_n f_c f_h f_w stride padding out_n out_c out_h out_w -1 64 56 56 256 64 1 1 1 0 1 256 56 56 -1 64 56 56 64 64 1 1 1 0 1 64 56 56 -1 64 56 56 64 64 3 3 1 1 1 64 56 56 -1 64 56 56 256 64 1 1 1 0 1 256 56 56 -1 256 56 56 64 256 1 1 1 0 1 64 56 56 -1 64 56 56 64 64 3 3 1 1 1 64 56 56 -1 64 56 56 256 64 1 1 1 0 1 256 56 56 -1 256 56 56 64 256 1 1 1 0 1 64 56 56 -1 64 56 56 64 64 3 3 1 1 1 64 56 56 -1 64 56 56 256 64 1 1 1 0 1 256 56 56 -1 256 56 56 512 256 1 1 2 0 1 512 28 28 -1 256 56 56 128 256 1 1 2 0 1 128 28 28 -1 128 28 28 128 128 3 3 1 1 1 128 28 28 -1 128 28 28 512 128 1 1 1 0 1 512 28 28 -1 512 28 28 128 512 1 1 1 0 1 128 28 28 -1 128 28 28 128 128 3 3 1 1 1 128 28 28 -1 128 28 28 512 128 1 1 1 0 1 512 28 28 -1 512 28 28 128 512 1 1 1 0 1 128 28 28 -1 128 28 28 128 128 3 3 1 1 1 128 28 28 -1 128 28 28 512 128 1 1 1 0 1 512 28 28 -1 
512 28 28 128 512 1 1 1 0 1 128 28 28 -1 128 28 28 128 128 3 3 1 1 1 128 28 28 -1 128 28 28 512 128 1 1 1 0 1 512 28 28 -1 512 28 28 1024 512 1 1 2 0 1 1024 14 14 -1 512 28 28 256 512 1 1 2 0 1 256 14 14 -1 256 14 14 256 256 3 3 1 1 1 256 14 14 -1 256 14 14 1024 256 1 1 1 0 1 1024 14 14 -1 1024 14 14 256 1024 1 1 1 0 1 256 14 14 -1 256 14 14 256 256 3 3 1 1 1 256 14 14 -1 256 14 14 1024 256 1 1 1 0 1 1024 14 14 -1 1024 14 14 256 1024 1 1 1 0 1 256 14 14 -1 256 14 14 256 256 3 3 1 1 1 256 14 14 -1 256 14 14 1024 256 1 1 1 0 1 1024 14 14 -1 1024 14 14 256 1024 1 1 1 0 1 256 14 14 -1 256 14 14 256 256 3 3 1 1 1 256 14 14 -1 256 14 14 1024 256 1 1 1 0 1 1024 14 14 -1 1024 14 14 256 1024 1 1 1 0 1 256 14 14 -1 256 14 14 256 256 3 3 1 1 1 256 14 14 -1 256 14 14 1024 256 1 1 1 0 1 1024 14 14 -1 1024 14 14 256 1024 1 1 1 0 1 256 14 14 -1 256 14 14 256 256 3 3 1 1 1 256 14 14 -1 256 14 14 1024 256 1 1 1 0 1 1024 14 14 -1 1024 14 14 2048 1024 1 1 2 0 1 2048 7 7 -1 1024 14 14 512 1024 1 1 2 0 1 512 7 7 -1 512 7 7 512 512 3 3 1 1 1 512 7 7 -1 512 7 7 2048 512 1 1 1 0 1 2048 7 7 -1 2048 7 7 512 2048 1 1 1 0 1 512 7 7 -1 512 7 7 512 512 3 3 1 1 1 512 7 7 -1 512 7 7 2048 512 1 1 1 0 1 2048 7 7 -1 2048 7 7 512 2048 1 1 1 0 1 512 7 7 -1 512 7 7 512 512 3 3 1 1 1 512 7 7 -1 512 7 7 2048 512 1 1 1 0 1 2048 7 7 diff --git a/scripts/params/check.csv b/scripts/params/check.csv deleted file mode 100644 index 96065aa5..00000000 --- a/scripts/params/check.csv +++ /dev/null @@ -1,3 +0,0 @@ -#in ic ih iw -1 64 24 24 -1 8 100 100 diff --git a/scripts/params/concat.csv b/scripts/params/concat.csv deleted file mode 100644 index d2167561..00000000 --- a/scripts/params/concat.csv +++ /dev/null @@ -1,3 +0,0 @@ -#num axis [in ic ih iw]* on oc oh ow -2 1 1 8 16 16 1 16 16 16 1 24 16 16 -2 1 1 3 7 7 1 16 7 7 1 19 7 7 diff --git a/scripts/params/convolution.csv b/scripts/params/convolution.csv deleted file mode 100644 index 135dccfd..00000000 --- a/scripts/params/convolution.csv +++ /dev/null @@ -1,9 +0,0 @@ -#in_n in_c in_h in_w f_n f_c f_h f_w stride padding out_n out_c out_h out_w -1 1 227 227 96 1 11 11 4 0 1 96 55 55 -1 2 227 227 96 2 11 11 4 0 1 96 55 55 -1 3 227 227 96 3 11 11 4 0 1 96 55 55 -1 4 227 227 96 4 11 11 4 0 1 96 55 55 -1 5 227 227 96 5 11 11 4 0 1 96 55 55 -1 6 227 227 96 6 11 11 4 0 1 96 55 55 -1 7 227 227 96 7 11 11 4 0 1 96 55 55 -1 8 227 227 96 8 11 11 4 0 1 96 55 55 diff --git a/scripts/params/deconvolution.csv b/scripts/params/deconvolution.csv deleted file mode 100644 index 18386e0b..00000000 --- a/scripts/params/deconvolution.csv +++ /dev/null @@ -1,2 +0,0 @@ -#in_n in_c in_h in_w f_n f_c f_h f_w stride padding out_n out_c out_h out_w -1 128 32 32 128 128 2 2 2 0 1 128 64 64 diff --git a/scripts/params/dilated_convolution.csv b/scripts/params/dilated_convolution.csv deleted file mode 100644 index 5188f5e1..00000000 --- a/scripts/params/dilated_convolution.csv +++ /dev/null @@ -1,5 +0,0 @@ -#in_n in_c in_h in_w f_n f_c f_h f_w stride padding rate out_n out_c out_h out_w -1 96 27 27 256 96 5 5 2 0 2 1 256 10 10 -1 256 13 13 384 256 3 3 1 1 2 1 384 11 11 -1 384 13 13 384 384 3 3 1 1 3 1 384 9 9 -1 384 13 13 256 384 3 3 1 1 4 1 256 7 7 diff --git a/scripts/params/googlenet_convolution.csv b/scripts/params/googlenet_convolution.csv deleted file mode 100644 index 6c6851ca..00000000 --- a/scripts/params/googlenet_convolution.csv +++ /dev/null @@ -1,58 +0,0 @@ -#in_n in_c in_h in_w f_n f_c f_h f_w stride padding out_n out_c out_h out_w -1 3 224 224 64 3 7 7 2 3 1 64 112 112 -1 64 56 56 64 64 1 1 1 0 1 64 56 
56 -1 64 56 56 192 64 3 3 1 1 1 192 56 56 -1 192 28 28 64 192 1 1 1 0 1 64 28 28 -1 192 28 28 96 192 1 1 1 0 1 96 28 28 -1 96 28 28 128 96 3 3 1 1 1 128 28 28 -1 192 28 28 16 192 1 1 1 0 1 16 28 28 -1 16 28 28 32 16 5 5 1 2 1 32 28 28 -1 192 28 28 32 192 1 1 1 0 1 32 28 28 -1 256 28 28 128 256 1 1 1 0 1 128 28 28 -1 256 28 28 128 256 1 1 1 0 1 128 28 28 -1 128 28 28 192 128 3 3 1 1 1 192 28 28 -1 256 28 28 32 256 1 1 1 0 1 32 28 28 -1 32 28 28 96 32 5 5 1 2 1 96 28 28 -1 256 28 28 64 256 1 1 1 0 1 64 28 28 -1 480 14 14 192 480 1 1 1 0 1 192 14 14 -1 480 14 14 96 480 1 1 1 0 1 96 14 14 -1 96 14 14 208 96 3 3 1 1 1 208 14 14 -1 480 14 14 16 480 1 1 1 0 1 16 14 14 -1 16 14 14 48 16 5 5 1 2 1 48 14 14 -1 480 14 14 64 480 1 1 1 0 1 64 14 14 -1 512 14 14 160 512 1 1 1 0 1 160 14 14 -1 512 14 14 112 512 1 1 1 0 1 112 14 14 -1 112 14 14 224 112 3 3 1 1 1 224 14 14 -1 512 14 14 24 512 1 1 1 0 1 24 14 14 -1 24 14 14 64 24 5 5 1 2 1 64 14 14 -1 512 14 14 64 512 1 1 1 0 1 64 14 14 -1 512 14 14 128 512 1 1 1 0 1 128 14 14 -1 512 14 14 128 512 1 1 1 0 1 128 14 14 -1 128 14 14 256 128 3 3 1 1 1 256 14 14 -1 512 14 14 24 512 1 1 1 0 1 24 14 14 -1 24 14 14 64 24 5 5 1 2 1 64 14 14 -1 512 14 14 64 512 1 1 1 0 1 64 14 14 -1 512 14 14 112 512 1 1 1 0 1 112 14 14 -1 512 14 14 144 512 1 1 1 0 1 144 14 14 -1 144 14 14 288 144 3 3 1 1 1 288 14 14 -1 512 14 14 32 512 1 1 1 0 1 32 14 14 -1 32 14 14 64 32 5 5 1 2 1 64 14 14 -1 512 14 14 64 512 1 1 1 0 1 64 14 14 -1 528 14 14 256 528 1 1 1 0 1 256 14 14 -1 528 14 14 160 528 1 1 1 0 1 160 14 14 -1 160 14 14 320 160 3 3 1 1 1 320 14 14 -1 528 14 14 32 528 1 1 1 0 1 32 14 14 -1 32 14 14 128 32 5 5 1 2 1 128 14 14 -1 528 14 14 128 528 1 1 1 0 1 128 14 14 -1 832 7 7 256 832 1 1 1 0 1 256 7 7 -1 832 7 7 160 832 1 1 1 0 1 160 7 7 -1 160 7 7 320 160 3 3 1 1 1 320 7 7 -1 832 7 7 32 832 1 1 1 0 1 32 7 7 -1 32 7 7 128 32 5 5 1 2 1 128 7 7 -1 832 7 7 128 832 1 1 1 0 1 128 7 7 -1 832 7 7 384 832 1 1 1 0 1 384 7 7 -1 832 7 7 192 832 1 1 1 0 1 192 7 7 -1 192 7 7 384 192 3 3 1 1 1 384 7 7 -1 832 7 7 48 832 1 1 1 0 1 48 7 7 -1 48 7 7 128 48 5 5 1 2 1 128 7 7 -1 832 7 7 128 832 1 1 1 0 1 128 7 7 diff --git a/scripts/params/lenet_convolution.csv b/scripts/params/lenet_convolution.csv deleted file mode 100644 index 61df2925..00000000 --- a/scripts/params/lenet_convolution.csv +++ /dev/null @@ -1,3 +0,0 @@ -#in_n in_c in_h in_w f_n f_c f_h f_w stride padding out_n out_c out_h out_w -1 1 32 32 6 1 5 5 1 0 1 6 28 28 -1 6 14 14 16 6 5 5 1 0 1 16 10 10 diff --git a/scripts/params/mobilenetv1_depthwise_convolution.csv b/scripts/params/mobilenetv1_depthwise_convolution.csv deleted file mode 100644 index 319ede80..00000000 --- a/scripts/params/mobilenetv1_depthwise_convolution.csv +++ /dev/null @@ -1,14 +0,0 @@ -#in_n in_c in_h in_w f_n f_c f_h f_w stride padding out_n out_c out_h out_w -1 32 112 112 64 32 3 3 1 1 1 64 112 112 -1 64 112 112 128 64 3 3 2 1 1 128 56 56 -1 128 56 56 128 128 3 3 1 1 1 128 56 56 -1 128 56 56 256 128 3 3 2 1 1 256 28 28 -1 256 28 28 256 256 3 3 1 1 1 256 28 28 -1 256 28 28 512 256 3 3 2 1 1 512 14 14 -1 512 14 14 512 512 3 3 1 1 1 512 14 14 -1 512 14 14 512 512 3 3 1 1 1 512 14 14 -1 512 14 14 512 512 3 3 1 1 1 512 14 14 -1 512 14 14 512 512 3 3 1 1 1 512 14 14 -1 512 14 14 512 512 3 3 1 1 1 512 14 14 -1 512 14 14 1024 512 3 3 2 1 1 1024 7 7 -1 1024 7 7 1024 1024 3 3 1 1 1 1024 7 7 diff --git a/scripts/params/mobilenetv2_depthwise_convolution.csv b/scripts/params/mobilenetv2_depthwise_convolution.csv deleted file mode 100644 index fcf13249..00000000 --- 
a/scripts/params/mobilenetv2_depthwise_convolution.csv +++ /dev/null @@ -1,18 +0,0 @@ -#in_n in_c in_h in_w f_n f_c f_h f_w stride padding out_n out_c out_h out_w -1 32 112 112 32 32 3 3 1 1 1 16 112 112 -1 96 112 112 24 96 3 3 2 1 1 24 56 56 -1 144 56 56 24 144 3 3 1 1 1 24 56 56 -1 144 56 56 32 144 3 3 2 1 1 32 28 28 -1 192 28 28 32 192 3 3 1 1 1 32 28 28 -1 192 28 28 32 192 3 3 1 1 1 32 28 28 -1 192 28 28 64 192 3 3 1 1 1 64 28 28 -1 384 28 28 64 384 3 3 1 1 1 64 28 28 -1 384 28 28 64 384 3 3 1 1 1 64 28 28 -1 384 28 28 64 384 3 3 1 1 1 64 28 28 -1 384 28 28 96 384 3 3 2 1 1 96 14 14 -1 576 14 14 96 576 3 3 1 1 1 96 14 14 -1 576 14 14 96 576 3 3 1 1 1 96 14 14 -1 576 14 14 160 576 3 3 2 1 1 160 7 7 -1 960 7 7 160 960 3 3 1 1 1 160 7 7 -1 960 7 7 160 960 3 3 1 1 1 160 7 7 -1 960 7 7 320 960 3 3 1 1 1 320 7 7 diff --git a/scripts/params/mobilenetv3_convolution.csv b/scripts/params/mobilenetv3_convolution.csv deleted file mode 100644 index 9c6f2c1d..00000000 --- a/scripts/params/mobilenetv3_convolution.csv +++ /dev/null @@ -1,33 +0,0 @@ -#in_n in_c in_h in_w f_n f_c f_h f_w stride padding out_n out_c out_h out_w -1 3 224 224 16 3 3 3 2 0 1 16 112 112 -1 16 112 112 16 16 1 1 1 0 1 16 112 112 -1 16 112 112 16 16 1 1 1 0 1 16 112 112 -1 16 112 112 64 16 1 1 1 0 1 64 112 112 -1 64 56 56 24 64 1 1 1 0 1 24 56 56 -1 24 56 56 72 24 1 1 1 0 1 72 56 56 -1 72 56 56 24 72 1 1 1 0 1 24 56 56 -1 24 56 56 72 24 1 1 1 0 1 72 56 56 -1 72 28 28 40 72 1 1 1 0 1 40 28 28 -1 40 28 28 120 40 1 1 1 0 1 120 28 28 -1 120 28 28 40 120 1 1 1 0 1 40 28 28 -1 40 28 28 120 40 1 1 1 0 1 120 28 28 -1 120 28 28 40 120 1 1 1 0 1 40 28 28 -1 40 28 28 240 40 1 1 1 0 1 240 28 28 -1 240 14 14 80 240 1 1 1 0 1 80 14 14 -1 80 14 14 200 80 1 1 1 0 1 200 14 14 -1 200 14 14 80 200 1 1 1 0 1 80 14 14 -1 80 14 14 184 80 1 1 1 0 1 184 14 14 -1 184 14 14 80 184 1 1 1 0 1 80 14 14 -1 80 14 14 184 80 1 1 1 0 1 184 14 14 -1 184 14 14 80 184 1 1 1 0 1 80 14 14 -1 80 14 14 480 80 1 1 1 0 1 480 14 14 -1 480 14 14 112 480 1 1 1 0 1 112 14 14 -1 112 14 14 672 112 1 1 1 0 1 672 14 14 -1 672 14 14 112 672 1 1 1 0 1 112 14 14 -1 112 14 14 672 112 1 1 1 0 1 672 14 14 -1 672 14 14 112 672 1 1 1 0 1 112 14 14 -1 112 14 14 672 112 1 1 1 0 1 672 14 14 -1 672 7 7 160 672 1 1 1 0 1 160 7 7 -1 160 7 7 960 160 1 1 1 0 1 960 7 7 -1 960 7 7 160 960 1 1 1 0 1 160 7 7 -1 160 7 7 960 160 1 1 1 0 1 960 7 7 diff --git a/scripts/params/mobilenetv3_depthwise_convolution.csv b/scripts/params/mobilenetv3_depthwise_convolution.csv deleted file mode 100644 index 09a77153..00000000 --- a/scripts/params/mobilenetv3_depthwise_convolution.csv +++ /dev/null @@ -1,16 +0,0 @@ -#in_n in_c in_h in_w f_n f_c f_h f_w stride padding out_n out_c out_h out_w -1 16 112 112 16 16 3 3 1 1 1 16 112 112 -1 64 112 112 24 64 3 3 2 0 1 24 56 56 -1 72 56 56 24 72 3 3 1 1 1 24 56 56 -1 72 56 56 40 72 5 5 2 1 1 40 28 28 -1 120 28 28 40 120 5 5 1 2 1 40 28 28 -1 120 28 28 40 120 5 5 1 2 1 40 28 28 -1 240 28 28 80 240 3 3 2 0 1 80 14 14 -1 200 14 14 80 200 3 3 1 1 1 80 14 14 -1 184 14 14 80 184 3 3 1 1 1 80 14 14 -1 184 14 14 112 184 3 3 1 1 1 80 14 14 -1 480 14 14 112 480 3 3 1 1 1 112 14 14 -1 672 14 14 160 672 3 3 1 1 1 112 14 14 -1 672 14 14 160 672 5 5 1 2 1 160 14 14 -1 672 14 14 160 672 5 5 2 1 1 160 7 7 -1 960 7 7 160 960 5 5 1 2 1 160 7 7 diff --git a/scripts/params/multiply.csv b/scripts/params/multiply.csv deleted file mode 100644 index 55d4743d..00000000 --- a/scripts/params/multiply.csv +++ /dev/null @@ -1,3 +0,0 @@ -#len alpha beta -1000 1.1 0.2 -999 -0.2 -0.1 diff --git 
a/scripts/params/reduction.csv b/scripts/params/reduction.csv deleted file mode 100644 index b7bdd748..00000000 --- a/scripts/params/reduction.csv +++ /dev/null @@ -1,5 +0,0 @@ -#in ic ih iw axis -1 64 24 24 -1 -1 8 100 100 1 -1 8 100 100 2 -1 8 100 100 3 diff --git a/scripts/params/resnet50_convolution.csv b/scripts/params/resnet50_convolution.csv deleted file mode 100644 index 30148c8f..00000000 --- a/scripts/params/resnet50_convolution.csv +++ /dev/null @@ -1,54 +0,0 @@ -#in_n in_c in_h in_w f_n f_c f_h f_w stride padding out_n out_c out_h out_w -1 3 224 224 64 3 7 7 2 3 1 64 112 112 -1 64 56 56 256 64 1 1 1 0 1 256 56 56 -1 64 56 56 64 64 1 1 1 0 1 64 56 56 -1 64 56 56 64 64 3 3 1 1 1 64 56 56 -1 64 56 56 256 64 1 1 1 0 1 256 56 56 -1 256 56 56 64 256 1 1 1 0 1 64 56 56 -1 64 56 56 64 64 3 3 1 1 1 64 56 56 -1 64 56 56 256 64 1 1 1 0 1 256 56 56 -1 256 56 56 64 256 1 1 1 0 1 64 56 56 -1 64 56 56 64 64 3 3 1 1 1 64 56 56 -1 64 56 56 256 64 1 1 1 0 1 256 56 56 -1 256 56 56 512 256 1 1 2 0 1 512 28 28 -1 256 56 56 128 256 1 1 2 0 1 128 28 28 -1 128 28 28 128 128 3 3 1 1 1 128 28 28 -1 128 28 28 512 128 1 1 1 0 1 512 28 28 -1 512 28 28 128 512 1 1 1 0 1 128 28 28 -1 128 28 28 128 128 3 3 1 1 1 128 28 28 -1 128 28 28 512 128 1 1 1 0 1 512 28 28 -1 512 28 28 128 512 1 1 1 0 1 128 28 28 -1 128 28 28 128 128 3 3 1 1 1 128 28 28 -1 128 28 28 512 128 1 1 1 0 1 512 28 28 -1 512 28 28 128 512 1 1 1 0 1 128 28 28 -1 128 28 28 128 128 3 3 1 1 1 128 28 28 -1 128 28 28 512 128 1 1 1 0 1 512 28 28 -1 512 28 28 1024 512 1 1 2 0 1 1024 14 14 -1 512 28 28 256 512 1 1 2 0 1 256 14 14 -1 256 14 14 256 256 3 3 1 1 1 256 14 14 -1 256 14 14 1024 256 1 1 1 0 1 1024 14 14 -1 1024 14 14 256 1024 1 1 1 0 1 256 14 14 -1 256 14 14 256 256 3 3 1 1 1 256 14 14 -1 256 14 14 1024 256 1 1 1 0 1 1024 14 14 -1 1024 14 14 256 1024 1 1 1 0 1 256 14 14 -1 256 14 14 256 256 3 3 1 1 1 256 14 14 -1 256 14 14 1024 256 1 1 1 0 1 1024 14 14 -1 1024 14 14 256 1024 1 1 1 0 1 256 14 14 -1 256 14 14 256 256 3 3 1 1 1 256 14 14 -1 256 14 14 1024 256 1 1 1 0 1 1024 14 14 -1 1024 14 14 256 1024 1 1 1 0 1 256 14 14 -1 256 14 14 256 256 3 3 1 1 1 256 14 14 -1 256 14 14 1024 256 1 1 1 0 1 1024 14 14 -1 1024 14 14 256 1024 1 1 1 0 1 256 14 14 -1 256 14 14 256 256 3 3 1 1 1 256 14 14 -1 256 14 14 1024 256 1 1 1 0 1 1024 14 14 -1 1024 14 14 2048 1024 1 1 2 0 1 2048 7 7 -1 1024 14 14 512 1024 1 1 2 0 1 512 7 7 -1 512 7 7 512 512 3 3 1 1 1 512 7 7 -1 512 7 7 2048 512 1 1 1 0 1 2048 7 7 -1 2048 7 7 512 2048 1 1 1 0 1 512 7 7 -1 512 7 7 512 512 3 3 1 1 1 512 7 7 -1 512 7 7 2048 512 1 1 1 0 1 2048 7 7 -1 2048 7 7 512 2048 1 1 1 0 1 512 7 7 -1 512 7 7 512 512 3 3 1 1 1 512 7 7 -1 512 7 7 2048 512 1 1 1 0 1 2048 7 7 diff --git a/scripts/push_third_party.sh b/scripts/push_third_party.sh index 51d7a406..bb6b5662 100644 --- a/scripts/push_third_party.sh +++ b/scripts/push_third_party.sh @@ -7,8 +7,9 @@ script_dir=$(dirname $script_abs) host_lib_dir="" device="" device_dir="" +compiler="" -TEMP=`getopt -o l:d:p: --long lib:device:path \ +TEMP=`getopt -o l:d:p:c: --long lib:device:path:compiler: \ -n ${script_name} -- "$@"` if [ $? != 0 ] ; then echo "[ERROR] terminating..." 
>&2 ; exit 1 ; fi eval set -- "$TEMP" @@ -26,12 +27,34 @@ while true ; do device_dir=$2 echo "[INFO] test on device directory ${device_dir}" ; shift 2 ;; + -c|--compiler) + compiler=$2 + echo "[INFO] push ${compiler} library" ; + shift 2 ;; --) shift ; break ;; *) echo "[ERROR] $1" ; exit 1 ;; esac done -adb -s ${device} push ${host_lib_dir}/protobuf/lib/libprotobuf.so.11 ${device_dir} || exit 1 -adb -s ${device} push ${host_lib_dir}/opencl/lib64 ${device_dir} || exit 1 -adb -s ${device} push ${host_lib_dir}/jpeg/lib/libjpeg.so.9 ${device_dir} || exit 1 +adb -s ${device} push ${host_lib_dir}/protobuf/lib/libprotobuf.so.11 ${device_dir} > /dev/null || exit 1 +if [[ "${compiler}" == "arm_llvm" ]]; then + adb -s ${device} push ${host_lib_dir}/opencl/lib64 ${device_dir} > /dev/null || exit 1 + if [[ -f "${host_lib_dir}/opencl/lib64/libc++_shared.so" ]]; then + cxx_shared_path=${host_lib_dir}/opencl/lib64/libc++_shared.so + else + clang_path=`which aarch64-linux-android21-clang++` + clang_dir=$(dirname ${clang_path}) + cxx_shared_path=${clang_dir}/../sysroot/usr/lib/aarch64-linux-android/libc++_shared.so + fi +fi +if [[ "${compiler}" == "arm_ndkv7" ]]; then + clang_path=`which armv7a-linux-androideabi19-clang++` + clang_dir=$(dirname ${clang_path}) + cxx_shared_path=${clang_dir}/../sysroot/usr/lib/arm-linux-androideabi/libc++_shared.so +fi +if [[ -f "${cxx_shared_path}" ]]; then + adb -s ${device} push ${cxx_shared_path} ${device_dir} > /dev/null || exit 1 +fi +adb -s ${device} push ${host_lib_dir}/jpeg/lib/libjpeg.so.9 ${device_dir} > /dev/null || exit 1 +adb -s ${device} push ${host_lib_dir}/jsoncpp/lib/libjsoncpp.so ${device_dir} > /dev/null || exit 1 diff --git a/scripts/quick_benchmark.sh b/scripts/quick_benchmark.sh new file mode 100644 index 00000000..c3fc94af --- /dev/null +++ b/scripts/quick_benchmark.sh @@ -0,0 +1,151 @@ +#!/bin/bash + +script_name=$0 +script_abs=$(readlink -f "$0") +script_dir=$(dirname $script_abs) + +host_dir="" +use_static_library=true +host_lib_dir="" +exe_on_device=true +array=($(adb devices | sed 's/\r//' | grep ".device$")) +device=${array[0]} +array=($(adb devices | grep ".device$")) +#device=${array[0]} +device="GCL5T19822000030" +cpu_mask="40" +device_dir="" +gpu=false + +print_help() { + cat < run specified program in /[tests|examples|kits]. + -l, --lib use specified library in . + -d, --device run test on device. + -c, --cpu_mask taskset cpu mask(default: 40). + -g, --gpu run gpu test. + -p, --path run test on device in specified PATH. +EOF + exit 1; +} + + +TEMP=`getopt -o t:c:hl:d:p:g --long test:cpu_mask:help,lib:device:path:gpu \ + -n ${script_name} -- "$@"` +if [ $? != 0 ] ; then echo "[ERROR] terminating..." 
>&2 ; exit 1 ; fi +eval set -- "$TEMP" +while true ; do + case "$1" in + -t|--test) + host_dir=$2 + echo "[INFO] run test in '${host_dir}'" ; + shift 2 ;; + -c|--cpu_mask) + cpu_mask=$2 + echo "[INFO] CPU mask '${cpu_mask}'" ; + shift 2 ;; + -l|--lib) + use_static_library=false; + host_lib_dir=$2 + echo "[INFO] use library in ${host_lib_dir}" ; + shift 2 ;; + -d|--device) + device=$2 + exe_on_device=true + echo "[INFO] test on device \`${device}'" ; + shift 2 ;; + -p|--path) + device_dir=$2 + echo "[INFO] test on device directory \`${device_dir}'" ; + shift 2 ;; + -g|--gpu) + gpu=true; + shift ;; + -h|--help) + print_help ; + shift ;; + --) shift ; + break ;; + *) echo "[ERROR]" ; exit 1 ;; + esac +done + + +if [ ${exe_on_device} == true ] ; then + status=`adb -s ${device} shell "ls ${device_dir}/ && echo 'success'" | tail -n 1` + if [ "${status}" != "success" ] ; then + adb -s ${device} shell "rm -rf ${device_dir}" + adb -s ${device} shell "mkdir ${device_dir}" + fi + if [ ${use_static_library} != true ] ; then + for file in `ls ${host_lib_dir}/*.so` + do + adb -s ${device} push ${file} ${device_dir} > /dev/null || exit 1 + done + ${script_dir}/push_third_party.sh -l ${script_dir}/../third_party/arm_llvm -d ${device} -p ${device_dir} -c arm_llvm || exit 1 + fi +fi + +function device_excute(){ + adb -s ${device} shell "export LD_LIBRARY_PATH=${device_dir} && taskset ${cpu_mask} $@ || echo '[FAILURE]'" &> status.txt + cat status.txt || exit 1 + if [ `grep -c "\[FAILURE\]" status.txt` -ne '0' ] ; then + exit 1 + fi + rm status.txt +} + +# mmm +adb -s ${device} push ${host_dir}/tests/test_mmm_int8 ${device_dir} > /dev/null || exit 1 +adb -s ${device} push ${host_dir}/tests/test_mmm ${device_dir} > /dev/null || exit 1 +echo " " ; echo "--- Matrix Matrix Multiplication" +device_excute ${device_dir}/test_mmm 384 768 768 + +# conv_ic=3 +adb -s ${device} push ${host_dir}/tests/test_convolution ${device_dir} > /dev/null || exit 1 +echo " " ; echo "--- Conv IC=3" +device_excute ${device_dir}/test_convolution 1 3 227 227 96 3 11 11 1 4 0 1 96 55 55 + +# conv_5x5 +adb -s ${device} push ${host_dir}/tests/test_convolution_bnn ${device_dir} > /dev/null || exit 1 +adb -s ${device} push ${host_dir}/tests/test_convolution_int8 ${device_dir} > /dev/null || exit 1 +echo " " ; echo "--- Conv 5x5" +device_excute ${device_dir}/test_convolution_bnn 1 96 27 27 256 96 5 5 1 2 0 1 256 13 13 +device_excute ${device_dir}/test_convolution_int8 1 96 27 27 256 96 5 5 1 2 0 1 256 13 13 +device_excute ${device_dir}/test_convolution 1 96 27 27 256 96 5 5 1 2 0 1 256 13 13 + +# conv_3x3 +echo " " ; echo "--- Conv 3x3" +device_excute ${device_dir}/test_convolution_bnn 1 128 28 28 256 128 3 3 1 1 1 1 256 28 28 +device_excute ${device_dir}/test_convolution_int8 1 128 28 28 256 128 3 3 1 1 1 1 256 28 28 +device_excute ${device_dir}/test_convolution 1 128 28 28 256 128 3 3 1 1 1 1 256 28 28 + +# depthwise-pointwise convolution +adb -s ${device} push ${host_dir}/tests/test_depthwise_convolution ${device_dir} > /dev/null || exit 1 +echo " " ; echo "--- Depthwise-Pointwise Conv" +device_excute ${device_dir}/test_depthwise_convolution 1 256 28 28 256 256 3 3 1 1 1 1 256 28 28 + +# OCL +if [ ${gpu} == true ] ; then + adb -s ${device} push ${host_dir}/examples/hdr ${device_dir} > /dev/null || exit 1 + echo " " ; echo " " ; echo "--- GPU Network Test (HDR_OCL)" + echo " " ; echo "=== Input FP16" + device_excute ${device_dir}/hdr 1 3 720 1280 + echo " " ; echo "=== Input UCHAR" + device_excute ${device_dir}/hdr 1 3 720 1280 UCHAR + + adb 
-s ${device} push ${host_dir}/tests/test_convolution_ocl ${device_dir} > /dev/null || exit 1 + adb -s ${device} push ${host_dir}/tests/test_depthwise_convolution_ocl ${device_dir} > /dev/null || exit 1 + adb -s ${device} push ${host_dir}/tests/test_fully_connected_ocl ${device_dir} > /dev/null || exit 1 + device_excute ${device_dir}/test_convolution_ocl 64 112 112 64 5 5 1 2 + device_excute ${device_dir}/test_convolution_ocl 64 112 112 64 3 3 1 1 + device_excute ${device_dir}/test_depthwise_convolution_ocl 64 112 112 64 3 3 1 1 + device_excute ${device_dir}/test_fully_connected_ocl 24 1 1 96 +fi + +adb -s ${device} shell "rm -rf ${device_dir}" diff --git a/tensor_computing/CMakeLists.txt b/tensor_computing/CMakeLists.txt deleted file mode 100644 index d5cebaf4..00000000 --- a/tensor_computing/CMakeLists.txt +++ /dev/null @@ -1,29 +0,0 @@ -cmake_minimum_required(VERSION 3.2) - -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) -if (BOLT_CONFIGURE_FILE) - include(${BOLT_CONFIGURE_FILE}) -else (BOLT_CONFIGURE_FILE) - message(FATAL_ERROR " -FATAL: can not find bolt.cmake in directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (BOLT_CONFIGURE_FILE) - -project(tensor_computing) - -set_policy() - -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") -find_package(Uni) -find_package(BlasEnhance) -find_package(TensorComputing) -if(USE_MALI) - find_package(Gcl) -endif(USE_MALI) - -set_project_install_directory() - -set_c_cxx_flags() - -add_subdirectory(src) diff --git a/tensor_computing/include/tensor_computing.h b/tensor_computing/include/tensor_computing.h deleted file mode 100644 index 5a5fe676..00000000 --- a/tensor_computing/include/tensor_computing.h +++ /dev/null @@ -1,360 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
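// [Editor's note, not part of the original header] The declarations below follow a
// multi-step pattern per operator: infer the output size, choose a forward algorithm,
// query the transformed-filter and scratch-memory sizes, transform the filter, then
// execute. A hypothetical sketch of that chain for convolution, using only signatures
// declared in this file (buffer allocation and the ExtInfo/MALI plumbing are elided;
// DT_F16 and CONVOLUTION_FASTEST are illustrative choices, not requirements):
//
//     TensorDesc outputDesc, ftmDesc;
//     U32 outputBytes, ftmBytes, tmpBytes;
//     ConvolutionForwardAlgorithm algorithm = CONVOLUTION_ALGORITHM_NULL;
//     CHECK_STATUS(convolution_infer_output_size(inputDesc, filterDesc, convDesc,
//         &outputDesc, DT_F16, &outputBytes, arch));
//     CHECK_STATUS(convolution_infer_forward_algorithm(inputDesc, filterDesc, outputDesc,
//         convDesc, CONVOLUTION_FASTEST, &algorithm, DT_F16, activationDesc, arch));
//     CHECK_STATUS(convolution_transform_filter_bytes(filterDesc, algorithm, &ftmBytes, arch));
//     CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputDesc, filterDesc, outputDesc,
//         convDesc, algorithm, &tmpBytes, arch));
//     // ... allocate filterTransformed (ftmBytes) and tmp (tmpBytes) here ...
//     CHECK_STATUS(convolution_transform_filter(filterDesc, filter, algorithm,
//         &ftmDesc, filterTransformed, tmp, arch));
//     CHECK_STATUS(convolution(inputDesc, input, ftmDesc, filterTransformed, convDesc,
//         algorithm, scaleDesc, scale, biasDesc, bias, tmpBytes, tmp,
//         outputDesc, output, activationDesc, arch));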
- - -#ifndef _H_TENSOR_COMPUTING -#define _H_TENSOR_COMPUTING - -#include -#include -#include "sys.h" -#include "tensor_computing_type.h" - -#ifdef __cplusplus -extern "C" { -#endif - EE convolution_infer_output_size(TensorDesc inputDesc, TensorDesc filterDesc, ConvolutionDesc convDesc, - TensorDesc* outputDesc, DataType targetDataType, U32* outputBytes, Arch arch, ExtInfo_t extInfo = NULL); - - EE convolution_infer_forward_algorithm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionPolicy policy, ConvolutionForwardAlgorithm *algorithm, DataType targetDataType, - ActivationDesc activationDesc, Arch arch, ExtInfo_t extInfo = NULL); - - EE convolution_transform_filter_bytes(TensorDesc filterDesc, ConvolutionForwardAlgorithm algorithm, U32* bytes, Arch arch, ExtInfo_t extInfo = NULL); - - EE convolution_transform_filter(TensorDesc filterDesc, const void* filter, ConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, void* filterTransformed, void* tmp, Arch arch, ExtInfo_t extInfo = NULL); - - EE convolution_infer_forward_tmp_bytes(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes, Arch arch, ExtInfo_t extInfo = NULL); - - EE convolution(TensorDesc inputDesc, void* input, - TensorDesc filterDesc, const void* filter, - ConvolutionDesc convDesc, - ConvolutionForwardAlgorithm algorithm, - TensorDesc scaleDesc, const void* scale, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, void* output, - ActivationDesc activationDesc, - Arch arch, ExtInfo_t extInfo = NULL); - - EE deconvolution_infer_output_size(TensorDesc inputDesc, TensorDesc filterDesc, ConvolutionDesc convDesc, - TensorDesc* outputDesc, DataType targetDataType, U32* outputBytes); - - EE deconvolution_infer_forward_algorithm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionPolicy policy, ConvolutionForwardAlgorithm *algorithm, DataType targetDataType, Arch arch); - - EE deconvolution_transform_filter_bytes(TensorDesc filterDesc, ConvolutionForwardAlgorithm algorithm, U32* bytes, Arch arch); - - EE deconvolution_transform_filter(TensorDesc filterDesc, const void* filter, ConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, void* filterTransformed, Arch arch); - - EE deconvolution_infer_forward_tmp_bytes(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes, Arch arch); - - EE deconvolution(TensorDesc inputDesc, void* input, - TensorDesc filterDesc, const void* filter, - ConvolutionDesc convDesc, - ConvolutionForwardAlgorithm algorithm, - TensorDesc scaleDesc, const void* scale, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, void* output, - ActivationDesc activationDesc, - Arch arch); - - EE deconvolution_infer_output_size(TensorDesc inputDesc, TensorDesc filterDesc, ConvolutionDesc convDesc, - TensorDesc* outputDesc, DataType targetDataType, U32* outputBytes); - - EE deconvolution_infer_forward_algorithm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionPolicy policy, ConvolutionForwardAlgorithm *algorithm, DataType targetDataType, Arch arch); - - EE deconvolution_transform_filter_bytes(TensorDesc filterDesc, ConvolutionForwardAlgorithm algorithm, U32* bytes, Arch arch); - - EE 
deconvolution_transform_filter(TensorDesc filterDesc, const void* filter, ConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, void* filterTransformed, Arch arch); - - EE deconvolution_infer_forward_tmp_bytes(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes, Arch arch); - - EE deconvolution(TensorDesc inputDesc, void* input, - TensorDesc filterDesc, const void* filter, - ConvolutionDesc convDesc, - ConvolutionForwardAlgorithm algorithm, - TensorDesc scaleDesc, const void* scale, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, void* output, - ActivationDesc activationDesc, - Arch arch); - - EE depthwise_convolution_infer_output_size(TensorDesc inputDesc, TensorDesc filterDesc, ConvolutionDesc convDesc, - TensorDesc* outputDesc, DataType targetDataType, U32* outputBytes, Arch arch, ExtInfo_t extInfo = NULL); - - EE depthwise_convolution_infer_forward_algorithm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionPolicy policy, DepthwiseConvolutionForwardAlgorithm *algorithm, DataType targetDataType, - ActivationDesc depthwiseActivationDesc, ActivationDesc pointwiseActivationDesc, Arch arch, ExtInfo_t extInfo = NULL); - - EE depthwise_convolution_transform_filter_bytes(TensorDesc filterDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32* bytes, Arch arch, ExtInfo_t extInfo = NULL); - - EE depthwise_convolution_transform_filter(TensorDesc filterDesc, const void* filter, DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, void* filterTransformed, Arch arch, ExtInfo_t extInfo = NULL); - - EE depthwise_convolution_infer_forward_tmp_bytes(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32 *bytes, Arch arch, ExtInfo_t extInfo = NULL); - - EE depthwise_convolution(TensorDesc inputDesc, void* input, - TensorDesc filterDesc, const void* filter, - ConvolutionDesc convDesc, - DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, void* output, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc, - Arch arch, ExtInfo_t extInfo = NULL); - - EE detectionoutput_infer_output_size(std::vector inputDesc, DetectionOutputDesc detectionoutputDesc, TensorDesc* outputDesc, Arch arch, ExtInfo_t extInfo = NULL); - - EE detectionoutput(std::vector inputDesc, std::vector input, DetectionOutputDesc detectionoutputDesc, TensorDesc outputDesc, void* output, Arch arch, ExtInfo_t extInfo = NULL); - - EE pooling_infer_output_size(TensorDesc inputDesc, PoolingDesc poolingDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo = NULL); - - EE pooling(TensorDesc inputDesc, const void* input, PoolingDesc poolingDesc, const void* scale, TensorDesc outputDesc, void* output, Arch arch, ExtInfo_t extInfo = NULL); - - EE priorbox_infer_output_size(std::vector inputDesc, PriorBoxDesc priorboxDesc, TensorDesc* outputDesc, Arch arch, ExtInfo_t extInfo = NULL); - - EE priorbox(std::vector inputDesc, PriorBoxDesc priorboxDesc, TensorDesc outputDesc, void* output, Arch arch, ExtInfo_t extInfo = NULL); - - EE activation_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo = NULL); - - EE activation(TensorDesc inputDesc, void* input, ActivationDesc activationDesc, 
TensorDesc outputDesc, void* output, Arch arch, ExtInfo_t extInfo = NULL); - - EE concat_infer_output_size(std::vector inputDesc, TensorDesc* outputDesc, I32 axis, Arch arch, ExtInfo_t extInfo = NULL); - - EE concat(std::vector inputDesc, std::vector input, void* inputScale, - TensorDesc outputDesc, void* output, void* outputScale, I32 axis, Arch arch, ExtInfo_t extInfo = NULL); - - EE eltwise(std::vector inputDesc, std::vector input, TensorDesc outputDesc, void* output, EltwiseMode eltwiseMode, Arch arch, ExtInfo_t extInfo = NULL); - - EE eltwise_infer_output_size(std::vector inputDesc, TensorDesc* outputDesc, Arch arch, ExtInfo_t extInfo = NULL); - - EE split(TensorDesc inputDesc, void* input, std::vector outputDesc, std::vector* output, Arch arch); - - EE split_infer_output_size(TensorDesc inputDesc, std::vector* outputDesc); - - EE fully_connected_infer_output_size(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo = NULL); - - EE fully_connected_infer_forward_algorithm(TensorDesc inputDesc, TensorDesc filterDesc, std::vector outputDescs, Arch arch, ExtInfo_t extInfo = NULL); - - EE fully_connected_infer_forward_tmp_bytes(TensorDesc inputDesc, TensorDesc filterDesc, U32 *bytes, Arch arch, ExtInfo_t extInfo = NULL); - - EE fully_connected_transform_filter_bytes(TensorDesc filterDesc, U32* bytes, Arch arch, ExtInfo_t extInfo = NULL); - - EE fully_connected_transform_filter(TensorDesc inputDesc, TensorDesc filterDesc, const void* filter, - TensorDesc *ftmDesc, void* filterTransformed, Arch arch, ExtInfo_t extInfo = NULL); - - EE fully_connected(TensorDesc inputDesc, const void* input, TensorDesc weightDesc, const void* weight, void* tmp, U32 bytes, - TensorDesc outputDesc, void* output, TensorDesc biasDesc, const void* bias, Arch arch, ExtInfo_t extInfo = NULL); - - EE softmax_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo = NULL); - - EE softmax(TensorDesc inputDesc, const void* input, int axis, TensorDesc outputDesc, void* output, Arch arch, ExtInfo_t extInfo = NULL); - - EE lstm_transform_filter(TensorDesc filterDesc, - const void* filter, - LSTMDesc lstmDesc, - TensorDesc *ftmDesc, - void* filterTransformed, - Arch arch); - - EE lstm_transform_filter_bytes(TensorDesc filterDesc, LSTMDesc lstmDesc, U32* bytes, Arch arch); - - EE lstm_infer_output_size(TensorDesc inputDesc, - TensorDesc filterDesc, - LSTMDesc lstmDesc, - TensorDesc* outputDesc, - U32* outputBytes); - - EE lstm_infer_forward_tmp_bytes(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - LSTMDesc lstmDesc, - U32 *bytes, Arch arch); - - EE lstm(TensorDesc inputDesc, const void* input, - TensorDesc filterDesc, const void* filter, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - LSTMDesc lstmDesc, - TensorDesc outputDesc, void* output, - Arch arch); - - EE lstmcell_infer_output_size(TensorDesc inputDesc, - TensorDesc filterDesc, - LSTMDesc lstmDesc, - TensorDesc* outputDesc, - U32* outputBytes); - - EE lstmcell_infer_forward_tmp_bytes(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - LSTMDesc lstmDesc, - U32 *bytes, Arch arch); - - EE lstmcell(TensorDesc xDesc, const void* currentX, - TensorDesc filterDesc, const void* filter, - TensorDesc biasDesc, const void* bias, - void *state, - U32 tmpBytes, void *tmp, - LSTMDesc lstmDesc, U32 batchStrideX, U32 batchStrideH, - TensorDesc hDesc, void* currentH, - Arch arch); - - EE scale(TensorDesc inputDesc, void* input, - I32 
axis, void *alpha, void *beta, - TensorDesc outputDesc, void* output, - Arch arch, ExtInfo_t extInfo=NULL); - - EE scale_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo = NULL); - - EE normalization_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo = NULL); - - EE layer_normalization(void *alpha, void *beta, - TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output, Arch arch, ExtInfo_t extInfo = NULL); - - EE slice_infer_output_size(TensorDesc inputDesc, std::vector* outputDesc, I32 axis, I32 *slice_point, Arch arch, ExtInfo_t extInfo = NULL); - - EE slice(TensorDesc inputDesc, void* input, int axis, std::vector outputDesc, std::vector* output, Arch arch, ExtInfo_t extInfo = NULL); - - EE transpose(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output, U32 *dim, Arch arch, ExtInfo_t extInfo = NULL); - - EE transpose_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, U32 *dim, Arch arch, ExtInfo_t extInfo = NULL); - - EE matmul_infer_output_size(TensorDesc matrixADesc, bool transposeA, TensorDesc matrixBDesc, bool transposeB, TensorDesc *matrixCDesc, Arch arch, ExtInfo_t extInfo = NULL); - - EE matmul_infer_forward_algorithm(TensorDesc matrixADesc, bool transposeA, TensorDesc matrixBDesc, bool transposeB, TensorDesc matrixCDesc, Arch arch, ExtInfo_t extInfo = NULL); - - EE matmul_infer_forward_tmp_bytes(TensorDesc matrixADesc, bool transposeA, TensorDesc matrixBDesc, bool transposeB, U32 *bytes, Arch arch, ExtInfo_t extInfo = NULL); - - EE matmul(TensorDesc matrixADesc, bool transposeA, const void* matrixA, - TensorDesc matrixBDesc, bool transposeB, const void* matrixB, - void* tmp, U32 bytes, - TensorDesc matirxCDesc, void* matrixC, Arch arch, ExtInfo_t extInfo = NULL); - - EE reshape_infer_output_size(TensorDesc inputDesc, TensorDesc* outputDesc, I32 *shape, I32 shape_size, Arch arch, ExtInfo_t extInfo = NULL); - - EE reshape(TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output, Arch arch, ExtInfo_t extInfo = NULL); - - EE attention(TensorDesc inputDesc, const void *input, - TensorDesc outputDesc, void *output, - Arch arch); - - EE attention_infer_output_size(TensorDesc inputDesc, - U32 numHeads, U32 fromSequenceLength, U32 toSequenceLength, - TensorDesc *outputDesc); - - EE multiply(void *alpha, void *beta, TensorDesc inputDesc, void* input, TensorDesc outputDesc, void *output, Arch arch, ExtInfo_t extInfo = NULL); - - EE multiply_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo = NULL); - - EE clip(void *min_value, void *max_value, TensorDesc inputDesc, void* input, TensorDesc outputDesc, void *output, Arch arch, ExtInfo_t extInfo = NULL); - - EE clip_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo = NULL); - - EE quantize_tensor(TensorDesc dDesc, const void* data, TensorDesc* qDesc, void* qData, void *scale); - -#ifdef _USE_INT8 - void dequantize_int8_to_fp16(U32 len, INT8* q, F32 scale, F16* d); - - void dequantize_int32_to_fp16(U32 len, I32* q, F32 scale, F16* d, U32 biasLen=0, F16* biasPtr=nullptr); -#endif - - EE tensor_computing_set_input_infer_tmpBuf_size(void* input, TensorDesc hostDesc, U32* tmpBufSize, Arch arch); - - EE tensor_computing_set_input(void* input, TensorDesc hostDesc, const void* hostPtr, void* tmpBuf, bool blocking, Arch arch, ExtInfo_t extInfo = NULL); - - EE tensor_computing_get_output_infer_tmpBuf_size(const void* 
input, TensorDesc hostDesc, U32* tmpBufSize, Arch arch); - - EE tensor_computing_get_output(const void* input, TensorDesc hostDesc, void** hostPtr, void* tmpBuf, bool blocking, Arch arch, ExtInfo_t extInfo = NULL); - - EE bilateral_slice_apply_infer_output_size(TensorDesc inputDesc, TensorDesc guideDesc, TensorDesc gridDesc, BilateralSliceApplyDesc bilateralSliceApplyDesc, - TensorDesc* outputDesc, Arch arch, ExtInfo_t extInfo = NULL); - - EE bilateral_slice_apply_infer_forward_tmp_bytes(TensorDesc inputDesc, TensorDesc guideDesc, TensorDesc gridDesc, BilateralSliceApplyDesc bilateralSliceApplyDesc, - U32* bytes, Arch arch, ExtInfo_t extInfo = NULL); - - EE bilateral_slice_apply(TensorDesc inputDesc, const void* input, TensorDesc guideDesc, const void* guide, TensorDesc gridDesc, const void* grid, - BilateralSliceApplyDesc bilateralSliceApplyDesc, U32 tmpBytes, const void* tmpBuf, TensorDesc outputDesc, const void* output, Arch arch, ExtInfo_t extInfo = NULL); - - EE argmax(TensorDesc inputDesc, const void* input, - I32 axis, - TensorDesc outputDesc, void* output, Arch arch); - - EE argmax_infer_output_size(TensorDesc inputDesc, int axis, TensorDesc *outputDesc); - - EE reduction(TensorDesc inputDesc, const void* input, - TensorDesc maskDesc, const void *mask, - I32 axis, - ReductionMode reductionMode, - float coeff, - TensorDesc outputDesc, void* output, Arch arch); - - EE reduction_infer_output_size(TensorDesc inputDesc, TensorDesc maskDesc, int axis, bool keepDim, TensorDesc *outputDesc); - - EE check(TensorDesc inputDescA, const void* inputA, - TensorDesc inputDescB, const void* inputB, - CheckMode checkMode, - TensorDesc outputDesc, void* output, Arch arch); - - EE check_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc); - - EE squeeze_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo = NULL); - - EE squeeze(TensorDesc inputDesc, const void* input, TensorDesc outputDesc, void* output, Arch arch, ExtInfo_t extInfo = NULL); - - EE space2depth_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo = NULL); - - EE space2depth(TensorDesc inputDesc, const void* input, TensorDesc outputDesc, void* output, Arch arch, ExtInfo_t extInfo = NULL); - - EE depth2space_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo = NULL); - - EE depth2space(TensorDesc inputDesc, const void* input, TensorDesc outputDesc, void* output, Arch arch, ExtInfo_t extInfo = NULL); - - EE attention_mask(TensorDesc inputDesc, const void* input, - I32 attentionLength, bool sameLength, float mask, - TensorDesc outputDesc, void* output, Arch arch); - - EE attention_mask_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc); - - EE padding_infer_output_size(TensorDesc inputDesc, PadDesc padDesc, TensorDesc* outputDesc); - - EE padding(TensorDesc inputDesc, const void* input, PadDesc padDesc, TensorDesc outputDesc, void* output, Arch arch); - EE embedding_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, U32 inputDim, U32 numOutput, DataType dt, Arch arch, ExtInfo_t extInfo = NULL); - - EE embedding(TensorDesc inputDesc, void* input, TensorDesc weightDesc, void* weight, TensorDesc outputDesc, void *output, - U32 inputDim, U32 numOutput, bool transpose, DataType dt, Arch arch, ExtInfo_t extInfo = NULL); -#ifdef __cplusplus -} -#ifdef _USE_FP16 - void update_histogram(U32 len, const F16* data, int numBins, F32 interval, F32* histo); -#endif - std::vector 
compress_histogram(std::vector &histogram, F32 numPerBin, F32 last_max); - - std::vector compute_scale_with_KL(std::vector &histogram,F32 interval); -#endif -#endif - diff --git a/tensor_computing/include/tensor_computing_type.h b/tensor_computing/include/tensor_computing_type.h deleted file mode 100644 index 3468509b..00000000 --- a/tensor_computing/include/tensor_computing_type.h +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_TENSOR_COMPUTING_TYPE -#define _H_TENSOR_COMPUTING_TYPE - -#include -#include "type.h" -#ifdef _USE_MALI -#include "gcl.h" -#define ALIGN(len, align_num) ((len + align_num - 1) / align_num * align_num) -#endif - -typedef struct { - U32 top; - U32 bottom; - U32 left; - U32 right; - F32 constant_value; - PadMode pad_mode; -} PadDesc; - -typedef struct { - ActivationMode mode; - float value[4] = {0, 0, 0, 0}; -} ActivationDesc; - -typedef struct { - U32 stride_h; - U32 stride_w; - U32 padding_top; - U32 padding_bottom; - U32 padding_left; - U32 padding_right; - U32 dilatedRate_h; - U32 dilatedRate_w; -} ConvolutionDesc; - -typedef enum { - CONVOLUTION_NO_TMP_MEM, - CONVOLUTION_FASTEST, - CONVOLUTION_TUNNING, - CONVOLUTION_LIBRARY_SEARCH, -} ConvolutionPolicy; - -typedef enum { - CONVOLUTION_ALGORITHM_DIRECT, - CONVOLUTION_ALGORITHM_GEMM, - CONVOLUTION_ALGORITHM_GEMM_ICNCHW, - CONVOLUTION_ALGORITHM_WINOGRAD, - CONVOLUTION_ALGORITHM_BNN, - CONVOLUTION_ALGORITHM_DIRECT_SPE_CK, - CONVOLUTION_ALGORITHM_NULL -} ConvolutionForwardAlgorithm; - -typedef struct { - F32 xmin; - F32 ymin; - F32 xmax; - F32 ymax; - U32 label; -}BoxRect; - -typedef struct { - U32 num_class; - F32 nms_threshold; - U32 nms_top_k; - U32 keep_top_k; - F32 confidence_threshold; -}DetectionOutputDesc; - -typedef struct { - PoolingMode pm; - U32 stride_h; - U32 stride_w; - U32 padding_top; - U32 padding_bottom; - U32 padding_left; - U32 padding_right; - U32 kernelSize_h; - U32 kernelSize_w; - RoundMode rm; -} PoolingDesc; - -typedef struct { - std::vector min_sizes; - std::vector max_sizes; - std::vector aspect_ratios; - U32 flip; - U32 clip; - F32 variances[4]; - U32 image_h; - U32 image_w; - F32 step_h; - F32 step_w; - F32 offset; -} PriorBoxDesc; - -typedef struct { - bool biDirection; - U32 numOutput; - U32 numProjection; - F32 forgetBias; - F32 zoneoutCell; - F32 zoneoutOutput; - ActivationMode activationMode; -} LSTMDesc; - -typedef struct { - U32 coefficient_len; - bool has_offset; - 
BilateralSliceApplyMode mode; -} BilateralSliceApplyDesc; - -typedef enum { - DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, - DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT, - DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT_NO_PADDING, - DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_3X3S1P1, - DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM, - DEPTHWISE_CONVOLUTION_ALGORITHM_NULL -} DepthwiseConvolutionForwardAlgorithm; - -#ifdef _USE_MALI -typedef struct { - I32 algorithm; - U32 best_w[2]; - U32 best_c[2]; - U32 best_k[2]; -} ForwardRunInfoMali; -typedef ForwardRunInfoMali* ForwardRunInfoMali_t; - -typedef struct { - GCLHandle_t handle; - GCLMemDesc_t gclmemInputDesc; - GCLMemDesc_t gclmemOutputDesc; - GCLMemDesc_t gclmemFilterDesc; - ForwardRunInfoMali_t forwardRunInfo; -} MaliInfo; -typedef MaliInfo* MaliInfo_t; -#endif - -typedef union{ -#ifdef _USE_MALI - MaliInfo maliInfo; -#endif -} ExtInfo; -typedef ExtInfo* ExtInfo_t; -#endif diff --git a/tensor_computing/src/CMakeLists.txt b/tensor_computing/src/CMakeLists.txt deleted file mode 100644 index 5f5d7256..00000000 --- a/tensor_computing/src/CMakeLists.txt +++ /dev/null @@ -1,39 +0,0 @@ -if (USE_GENERAL) - file(GLOB general_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/general/*.cpp) -endif (USE_GENERAL) - -if (USE_NEON) - if (USE_FP32) - file(GLOB arm_fp32_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/fp32/*.cpp) - endif (USE_FP32) - if (USE_FP16) - file(GLOB arm_fp16_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/fp16/*.cpp) - file(GLOB arm_bnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/bnn/*.cpp) - endif (USE_FP16) - if (USE_INT8) - file(GLOB arm_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/*.cpp) - endif (USE_INT8) - file(GLOB arm_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/*.cpp) - set(arm_srcs "${arm_srcs};${arm_fp16_srcs};${arm_fp32_srcs};${arm_int8_srcs};${arm_bnn_srcs}") -endif (USE_NEON) - -if (USE_MALI) - file(GLOB mali_fp16_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/mali/fp16/*.cpp) - file(GLOB mali_uchar_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/mali/uchar/*.cpp) - file(GLOB mali_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/mali/*.cpp) - set(mali_srcs "${mali_srcs};${mali_fp16_srcs};${mali_uchar_srcs}") -endif (USE_MALI) - -file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) -set(srcs "${srcs};${general_srcs};${arm_srcs};${mali_srcs}") - -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) - -# shared library -ADD_LIBRARY(${PROJECT_NAME} SHARED ${srcs}) - -# static library -ADD_LIBRARY(${PROJECT_NAME}_static STATIC ${srcs}) -SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") -SET_TARGET_PROPERTIES(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) -SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) diff --git a/tensor_computing/src/activation.cpp b/tensor_computing/src/activation.cpp deleted file mode 100644 index d900910d..00000000 --- a/tensor_computing/src/activation.cpp +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -inline EE activation_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) -{ - if (nullptr == outputDesc) - CHECK_STATUS(NULL_POINTER); - - *outputDesc = inputDesc; - return SUCCESS; -} - -EE activation_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if(arch == MALI){ -#ifdef _USE_MALI - ret = activation_infer_output_size_mali(inputDesc, outputDesc, extInfo->maliInfo.gclmemInputDesc, extInfo->maliInfo.gclmemOutputDesc); -#endif - } else { - ret = activation_infer_output_size_cpu(inputDesc, outputDesc); - } - return ret; -} - -EE activation(TensorDesc inputDesc, void* input, ActivationDesc activationDesc, TensorDesc outputDesc, void* output, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = activation_general(inputDesc, input, activationDesc, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = activation_arm(inputDesc, input, activationDesc, outputDesc, output); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = activation_mali(extInfo->maliInfo.handle, inputDesc, (GCLMem_t)input, outputDesc, (GCLMem_t)output, activationDesc.mode); -#endif - } - return ret; -} diff --git a/tensor_computing/src/argmax.cpp b/tensor_computing/src/argmax.cpp deleted file mode 100644 index cebd63d7..00000000 --- a/tensor_computing/src/argmax.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif - -EE argmax(TensorDesc inputDesc, const void* input, - I32 axis, - TensorDesc outputDesc, void* output, Arch arch) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = argmax_general(inputDesc, input, axis, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = argmax_arm(inputDesc, input, axis, outputDesc, output); -#endif - } - return ret; -} - -EE argmax_infer_output_size(TensorDesc inputDesc, int axis, TensorDesc *outputDesc) -{ - if (nullptr == outputDesc) - CHECK_STATUS(NULL_POINTER); - - *outputDesc = inputDesc; - if (axis < 0) - axis += inputDesc.nDims; - axis = inputDesc.nDims - 1 - axis; - for (int i = axis; i < (I32)(inputDesc.nDims)-1; i++) { - (*outputDesc).dims[i] = (*outputDesc).dims[i+1]; - } - (*outputDesc).nDims = inputDesc.nDims - 1; - (*outputDesc).dt = DT_U32; - return SUCCESS; -} diff --git a/tensor_computing/src/attention.cpp b/tensor_computing/src/attention.cpp deleted file mode 100644 index 8003c092..00000000 --- a/tensor_computing/src/attention.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
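The argmax shape rule above is easy to misread because dims are stored innermost-first. A minimal standalone sketch of the same logic, with TensorDesc replaced by a bare std::vector (the example shape is illustrative, not taken from this diff):

    #include <cstdio>
    #include <vector>

    // Mirrors the deleted argmax_infer_output_size: dims[0] is the last
    // logical axis, exactly as in the library's TensorDesc.
    std::vector<unsigned> argmax_output_dims(std::vector<unsigned> dims, int axis)
    {
        int nDims = (int)dims.size();
        if (axis < 0)
            axis += nDims;            // e.g. axis = -1 on a 4-D tensor -> 3
        axis = nDims - 1 - axis;      // convert to the reversed storage order
        for (int i = axis; i < nDims - 1; i++)
            dims[i] = dims[i + 1];    // squeeze the reduced axis out
        dims.pop_back();              // nDims shrinks by one; dt becomes DT_U32
        return dims;
    }

    int main()
    {
        // NCHW 1x8x4x4 stored as {4,4,8,1}; argmax over channels (axis = 1)
        std::vector<unsigned> out = argmax_output_dims({4, 4, 8, 1}, 1);
        for (unsigned d : out)
            printf("%u ", d);         // prints: 4 4 1
        return 0;
    }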
- - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif - -EE attention(TensorDesc inputDesc, const void *input, - TensorDesc outputDesc, void *output, Arch arch) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = attention_general(inputDesc, input, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = attention_arm(inputDesc, input, outputDesc, output); -#endif - } - return ret; -} - -EE attention_infer_output_size(TensorDesc inputDesc, - U32 numHeads, U32 fromSequenceLength, U32 toSequenceLength, - TensorDesc *outputDesc) -{ - if (nullptr == outputDesc) - CHECK_STATUS(NULL_POINTER); - - DataType dt; - U32 batch, sequenceLength; - CHECK_STATUS(tensor2dGet(inputDesc, &dt, &batch, &sequenceLength)); - - *outputDesc = tensor4df(dt, DF_NCHW, batch, numHeads, fromSequenceLength, toSequenceLength); - - return SUCCESS; -} diff --git a/tensor_computing/src/attention_mask.cpp b/tensor_computing/src/attention_mask.cpp deleted file mode 100644 index d31b4dc3..00000000 --- a/tensor_computing/src/attention_mask.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif - -EE attention_mask(TensorDesc inputDesc, const void* input, - I32 attentionLength, bool sameLength, float mask, - TensorDesc outputDesc, void* output, Arch arch) -{ - // reinit mask value to avoid overflow - if (bytesOf(inputDesc.dt) == 2 && mask > 10000) - mask = 10000; - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = attention_mask_general(inputDesc, input, attentionLength, sameLength, mask, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = attention_mask_arm(inputDesc, input, attentionLength, sameLength, mask, outputDesc, output); -#endif - } - return ret; -} - -EE attention_mask_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc) -{ - if (nullptr == outputDesc) - CHECK_STATUS(NULL_POINTER); - if (inputDesc.nDims < 2) - return NOT_MATCH; - *outputDesc = inputDesc; - return SUCCESS; -} diff --git a/tensor_computing/src/bilateral_slice_apply.cpp b/tensor_computing/src/bilateral_slice_apply.cpp deleted file mode 100644 index 67cfbafb..00000000 --- a/tensor_computing/src/bilateral_slice_apply.cpp +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include "tensor_computing.h" -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -inline EE bilateral_slice_apply_infer_output_size_cpu() -{ - return NOT_SUPPORTED; -} - -EE bilateral_slice_apply_infer_output_size(TensorDesc inputDesc, TensorDesc guideDesc, TensorDesc gridDesc, BilateralSliceApplyDesc bilateralSliceApplyDesc, - TensorDesc* outputDesc, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if(arch == MALI) { -#ifdef _USE_MALI - if(extInfo->maliInfo.gclmemInputDesc) { - ret = bilateral_slice_apply_infer_output_size_mali(inputDesc, guideDesc, gridDesc, bilateralSliceApplyDesc, outputDesc, - &extInfo->maliInfo.gclmemInputDesc[0], &extInfo->maliInfo.gclmemInputDesc[2], &extInfo->maliInfo.gclmemInputDesc[1], extInfo->maliInfo.gclmemOutputDesc); - } else { - ret = bilateral_slice_apply_infer_output_size_mali(inputDesc, guideDesc, gridDesc, bilateralSliceApplyDesc, outputDesc, - NULL, NULL, NULL, extInfo->maliInfo.gclmemOutputDesc); - } -#endif - } - return ret; -} - -EE bilateral_slice_apply_infer_forward_tmp_bytes(TensorDesc inputDesc, TensorDesc guideDesc, TensorDesc gridDesc, BilateralSliceApplyDesc bilateralSliceApplyDesc, - U32* bytes, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if(arch == MALI) { -#ifdef _USE_MALI - ret = bilateral_slice_apply_infer_forward_tmp_bytes_mali(inputDesc, guideDesc, gridDesc, bilateralSliceApplyDesc, - extInfo->maliInfo.forwardRunInfo, bytes); -#endif - } - return ret; -} - -EE bilateral_slice_apply(TensorDesc inputDesc, const void* input, TensorDesc guideDesc, const void* guide, - TensorDesc gridDesc, const void* grid, BilateralSliceApplyDesc bilateralSliceApplyDesc, - U32 tmpBytes, const void* tmpBuf, TensorDesc outputDesc, - const void* output, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if(arch == MALI){ -#ifdef _USE_MALI - ret = bilateral_slice_apply_mali(extInfo->maliInfo.handle, inputDesc, (GCLMem_t)input, guideDesc, (GCLMem_t)guide, gridDesc, (GCLMem_t)grid, - bilateralSliceApplyDesc, extInfo->maliInfo.forwardRunInfo, tmpBytes, (GCLMem_t)tmpBuf, outputDesc, (GCLMem_t)output); -#endif - } - return ret; -} diff --git a/tensor_computing/src/check.cpp b/tensor_computing/src/check.cpp deleted file mode 100644 index f2fb8882..00000000 --- a/tensor_computing/src/check.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif - -EE check(TensorDesc inputDescA, const void* inputA, - TensorDesc inputDescB, const void* inputB, - CheckMode checkMode, - TensorDesc outputDesc, void* output, Arch arch) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = check_general(inputDescA, inputA, inputDescB, inputB, checkMode, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = check_arm(inputDescA, inputA, inputDescB, inputB, checkMode, outputDesc, output); -#endif - } - return ret; -} - -EE check_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc) -{ - if (nullptr == outputDesc) - CHECK_STATUS(NULL_POINTER); - - (*outputDesc).dt = DT_I32; - (*outputDesc).nDims = 1; - (*outputDesc).dims[0] = inputDesc.dims[inputDesc.nDims-1]; - return SUCCESS; -} diff --git a/tensor_computing/src/clip.cpp b/tensor_computing/src/clip.cpp deleted file mode 100644 index 8d6e628e..00000000 --- a/tensor_computing/src/clip.cpp +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -inline EE clip_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) -{ - if (nullptr == outputDesc) - CHECK_STATUS(NULL_POINTER); - *outputDesc = inputDesc; - return SUCCESS; -} - -EE clip_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if(arch == MALI){ -#ifdef _USE_MALI - ret = clip_infer_output_size_mali(inputDesc, outputDesc, extInfo->maliInfo.gclmemInputDesc, extInfo->maliInfo.gclmemOutputDesc); -#endif - } else { - ret = clip_infer_output_size_cpu(inputDesc, outputDesc); - } - return ret; -} - -EE clip(void *min_value, void *max_value, TensorDesc inputDesc, void* input, TensorDesc outputDesc, void *output, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = clip_general(min_value, max_value, inputDesc, input, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = clip_arm(min_value, max_value, inputDesc, input, outputDesc, output); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = clip_mali(extInfo->maliInfo.handle, min_value, max_value, inputDesc, (GCLMem_t)input, outputDesc, (GCLMem_t)output); -#endif - } - return ret; -} - diff --git a/tensor_computing/src/concat.cpp b/tensor_computing/src/concat.cpp deleted file mode 100644 index 8394434f..00000000 --- a/tensor_computing/src/concat.cpp +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include <vector> -#include -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -inline EE concat_infer_output_size_cpu(std::vector<TensorDesc> inputDesc, TensorDesc* outputDesc, I32 axis) - { - if (inputDesc.size() < 1) { - CHECK_STATUS(NOT_MATCH); - } - if (inputDesc.size() == 1) { - *outputDesc = inputDesc[0]; - return SUCCESS; - } - - for (U32 i = 1; i < inputDesc.size(); i++) { - if (inputDesc[i].nDims != 0) { - *outputDesc = inputDesc[i]; - break; - } - } - I32 dim = outputDesc->nDims; - axis = (axis + dim) % dim; - axis = dim - 1 - axis; - outputDesc->dims[axis] = 0; - - for (U32 i = 0; i < inputDesc.size(); i++) { - if (inputDesc[i].nDims == 0) - continue; - - if (inputDesc[i].nDims != (U32)dim) - return NOT_MATCH; - - for (I32 j = 0; j < dim; j++) { - if (j == axis) - outputDesc->dims[j] += inputDesc[i].dims[j]; - else { - outputDesc->dims[j] = UNI_MAX(inputDesc[i].dims[j], outputDesc->dims[j]); - if (inputDesc[i].dims[j] != 0 && outputDesc->dims[j] != 0 && outputDesc->dims[j] != inputDesc[i].dims[j]) { - return NOT_MATCH; - } - } - } - } - return SUCCESS; -} - -EE concat_infer_output_size(std::vector<TensorDesc> inputDesc, TensorDesc* outputDesc, I32 axis, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if(arch == MALI){ -#ifdef _USE_MALI - ret = concat_infer_output_size_mali(inputDesc, outputDesc, axis, extInfo->maliInfo.gclmemInputDesc, extInfo->maliInfo.gclmemOutputDesc); -#endif - } else { - ret = concat_infer_output_size_cpu(inputDesc, outputDesc, axis); - } - return ret; -} - -EE concat(std::vector<TensorDesc> inputDesc, std::vector<void*> input, void* inputScale, - TensorDesc outputDesc, void* output, void* outputScale, I32 axis, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = concat_general(inputDesc, input, - outputDesc, output, - axis); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = concat_arm(inputDesc, input, inputScale, - outputDesc, output, outputScale, - axis); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = concat_mali(extInfo->maliInfo.handle, inputDesc, input, NULL, outputDesc, (GCLMem_t)output, NULL, axis); -#endif - } - return ret; -} diff --git a/tensor_computing/src/convolution.cpp b/tensor_computing/src/convolution.cpp deleted file mode 100644 index 9ad031f6..00000000 --- a/tensor_computing/src/convolution.cpp +++ /dev/null @@ -1,211 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -inline EE convolution_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc filterDesc, ConvolutionDesc convDesc, - TensorDesc* outputDesc, DataType targetDataType, U32* outputBytes) -{ - if (nullptr == outputDesc || nullptr == outputBytes) - CHECK_STATUS(NULL_POINTER); - DataType idt, fdt; - DataFormat idf, fdf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - I32 oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - if (fh < 1 || fw < 1) { - CHECK_STATUS(NOT_SUPPORTED); - } - - I32 strideH = convDesc.stride_h; - I32 strideW = convDesc.stride_w; - I32 paddingT = convDesc.padding_top; - I32 paddingB = convDesc.padding_bottom; - I32 paddingL = convDesc.padding_left; - I32 paddingR = convDesc.padding_right; - I32 dilateH = convDesc.dilatedRate_h; - I32 dilateW = convDesc.dilatedRate_w; - - U32 fhDilated = (fh - 1) * dilateH + 1; - U32 fwDilated = (fw - 1) * dilateW + 1; - oh = (ih + paddingT + paddingB - fhDilated) / strideH + 1; - ow = (iw + paddingL + paddingR - fwDilated) / strideW + 1; - - if (fn % 8 != 0) { - CHECK_STATUS(NOT_SUPPORTED); - } - - *outputDesc = tensor4df(targetDataType, DF_NCHWC8, in, fn, oh, ow); - *outputBytes = tensorNumBytes(*outputDesc); - return SUCCESS; -} - -EE convolution_infer_output_size(TensorDesc inputDesc, TensorDesc filterDesc, ConvolutionDesc convDesc, - TensorDesc* outputDesc, DataType targetDataType, U32* outputBytes, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == MALI) { -#ifdef _USE_MALI - ret = convolution_infer_output_size_mali(inputDesc, filterDesc, convDesc, outputDesc, - extInfo->maliInfo.gclmemInputDesc, extInfo->maliInfo.gclmemOutputDesc, extInfo->maliInfo.forwardRunInfo); -#endif - } else { - ret = convolution_infer_output_size_cpu(inputDesc, filterDesc, convDesc, outputDesc, targetDataType, outputBytes); - } - return ret; -} - -EE convolution_infer_forward_algorithm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionPolicy policy, ConvolutionForwardAlgorithm *algorithm, - DataType targetDataType, ActivationDesc activationDesc, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = SUCCESS; -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = convolution_infer_forward_algorithm_arm(inputDesc, filterDesc, outputDesc, convDesc, policy, algorithm, targetDataType); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = convolution_infer_forward_algorithm_mali(extInfo->maliInfo.handle, inputDesc, - filterDesc, convDesc, outputDesc, policy, activationDesc.mode, extInfo->maliInfo.forwardRunInfo); -#endif - } - return ret; -} - -EE convolution_transform_filter_bytes(TensorDesc filterDesc, ConvolutionForwardAlgorithm algorithm, U32* bytes, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { 
-#ifdef _USE_GENERAL - ret = SUCCESS; -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = convolution_transform_filter_bytes_arm(filterDesc, algorithm, bytes); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = convolution_transform_filter_bytes_mali(filterDesc, extInfo->maliInfo.forwardRunInfo, extInfo->maliInfo.gclmemFilterDesc, bytes); -#endif - } - return ret; -} - -EE convolution_transform_filter(TensorDesc filterDesc, const void* filter, ConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, void* filterTransformed, void* tmp, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = SUCCESS; -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = convolution_transform_filter_arm(filterDesc, filter, algorithm, ftmDesc, filterTransformed); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = convolution_transform_filter_mali(extInfo->maliInfo.handle, filterDesc, (GCLMem_t)filter, extInfo->maliInfo.forwardRunInfo, - ftmDesc, (GCLMem_t)filterTransformed, (GCLMem_t)tmp); -#endif - } - return ret; -} - -EE convolution_infer_forward_tmp_bytes(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = SUCCESS; -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = convolution_infer_forward_tmp_bytes_arm(inputDesc, filterDesc, outputDesc, convDesc, algorithm, bytes); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = convolution_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, outputDesc, convDesc, extInfo->maliInfo.forwardRunInfo, bytes); -#endif - } - return ret; -} - -EE convolution(TensorDesc inputDesc, void* input, - TensorDesc filterDesc, const void* filter, - ConvolutionDesc convDesc, - ConvolutionForwardAlgorithm algorithm, - TensorDesc scaleDesc, const void* scale, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, void* output, - ActivationDesc activationDesc, - Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = convolution_general(inputDesc, input, - filterDesc, filter, - convDesc, - scaleDesc, scale, - biasDesc, bias, - outputDesc, output, - activationDesc); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = convolution_arm(inputDesc, input, - filterDesc, filter, - convDesc, - algorithm, - scaleDesc, scale, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - activationDesc, - arch); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = convolution_mali(extInfo->maliInfo.handle, inputDesc, (GCLMem_t)input, - filterDesc, (GCLMem_t)filter, - convDesc, - extInfo->maliInfo.forwardRunInfo, - scaleDesc, (GCLMem_t)scale, - biasDesc, (GCLMem_t)bias, - tmpBytes, (GCLMem_t)tmp, - outputDesc, (GCLMem_t)output, - activationDesc.mode); -#endif - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/activation.cpp b/tensor_computing/src/cpu/arm/activation.cpp deleted file mode 100644 index 0f70aca0..00000000 --- a/tensor_computing/src/cpu/arm/activation.cpp +++ /dev/null @@ -1,60 +0,0 @@ -// 
Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/tensor_computing_arm.h" -#ifdef _USE_FP32 -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif -#ifdef _USE_INT8 -#include "cpu/arm/int8/tensor_computing_int8.h" -#endif - -EE activation_arm(TensorDesc inputDesc, void* input, ActivationDesc activationDesc, TensorDesc outputDesc, void* output) -{ - if (nullptr == input || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - U32 len = tensorNumElements(inputDesc); - CHECK_REQUIREMENT(len == tensorNumElements(outputDesc)); - switch (idt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = activation_fp32((F32*)input, len, activationDesc, (F32*)output); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = activation_fp16((F16*)input, len, activationDesc, (F16*)output); - break; - } -#endif -#ifdef _USE_INT8 - case DT_I8: { - ret = activation_int8((INT8*)input, len, activationDesc, (INT8*)output); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - - return ret; -} diff --git a/tensor_computing/src/cpu/arm/argmax.cpp b/tensor_computing/src/cpu/arm/argmax.cpp deleted file mode 100644 index eee15dfb..00000000 --- a/tensor_computing/src/cpu/arm/argmax.cpp +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/tensor_computing_arm.h" - -template<typename T> -U32 array_argmax(const T* input, U32 len, U32 stride) { - U32 index = 0; - U32 j = stride; - for (U32 i = 1; i < len; i++, j+=stride) { - if(input[j] > input[index]) - index = j; - } - return index / stride; -} - -template<typename T> -EE argmax(TensorDesc inputDesc, const T* input, - I32 axis, - TensorDesc outputDesc, U32* output) -{ - UNUSED(outputDesc); - - if (nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - if (axis < 0) - axis = inputDesc.nDims + axis; - axis = inputDesc.nDims - 1 - axis; - U32 loopInner = 1; - for (int i = 0; i < axis; i++) { - loopInner *= inputDesc.dims[i]; - } - U32 loopOuter = 1; - for (U32 i = axis+1; i < inputDesc.nDims; i++) { - loopOuter *= inputDesc.dims[i]; - } - - U32 len = inputDesc.dims[axis]; - for (U32 i = 0; i < loopOuter; i++) { - for (U32 j = 0; j < loopInner; j++) { - const T* array = input + i * (len * loopInner) + j; - output[i*loopInner+j] = array_argmax(array, len, loopInner); - } - } - return SUCCESS; -} - -EE argmax_arm(TensorDesc inputDesc, const void* input, - I32 axis, - TensorDesc outputDesc, void* output) -{ - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = argmax<F32>(inputDesc, (const F32*)input, axis, outputDesc, (U32*)output); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = argmax<F16>(inputDesc, (const F16*)input, axis, outputDesc, (U32*)output); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - - return ret; -} diff --git a/tensor_computing/src/cpu/arm/arm_functions.h b/tensor_computing/src/cpu/arm/arm_functions.h deleted file mode 100644 index cec166a8..00000000 --- a/tensor_computing/src/cpu/arm/arm_functions.h +++ /dev/null @@ -1,193 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
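The stride parameter in array_argmax above is what lets argmax_arm reduce over a non-innermost axis without transposing: for NCHW data reduced over C, stride is H*W and each spatial offset gets its own strided walk. A standalone copy of the helper with a tiny CHW example (the data values are illustrative):

    #include <cstdio>

    // Same logic as the deleted helper: walk len elements, stride apart.
    template<typename T>
    unsigned array_argmax(const T *input, unsigned len, unsigned stride)
    {
        unsigned index = 0, j = stride;
        for (unsigned i = 1; i < len; i++, j += stride) {
            if (input[j] > input[index])
                index = j;
        }
        return index / stride;
    }

    int main()
    {
        // 3 channels of a 2x2 plane, laid out CHW: channel c holds value c
        // everywhere, except channel 2 peaks at spatial position 1.
        float data[12] = {0, 0, 0, 0, 1, 1, 1, 1, 2, 9, 2, 2};
        // argmax over C at spatial offset 1: stride = H*W = 4, len = 3
        printf("%u\n", array_argmax(data + 1, 3, 4));   // prints 2
        return 0;
    }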
- - -#ifndef _H_ARM_FUNCTIONS -#define _H_ARM_FUNCTIONS - -#include "cpu/arm/fp16/arm_functions_fp16.h" -#include "cpu/arm/fp32/arm_functions_fp32.h" - -// array sum -inline F32 array_sum(DataType dt, const void *data, I32 len) { - F32 result = 0; - switch(dt) { -#ifdef _USE_FP16 - case DT_F16: - result = array_sum_f16((const F16*)data, len); - break; -#endif -#ifdef _USE_FP32 - case DT_F32: - result = array_sum_f32((const F32*)data, len); - break; -#endif - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - return result; -} - -// array mean -inline F32 array_mean(DataType dt, const void *data, I32 len) { - F32 result = 0; - switch(dt) { -#ifdef _USE_FP16 - case DT_F16: - result = array_mean_f16((const F16*)data, len); - break; -#endif -#ifdef _USE_FP32 - case DT_F32: - result = array_mean_f32((const F32*)data, len); - break; -#endif - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - return result; -} - -// array var -inline F32 array_var(DataType dt, const void *data, I32 len, F32 mean) { - F32 result = 0; - switch(dt) { -#ifdef _USE_FP16 - case DT_F16: - result = array_var_f16((const F16*)data, len, mean); - break; -#endif -#ifdef _USE_FP32 - case DT_F32: - result = array_var_f32((const F32*)data, len, mean); - break; -#endif - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - return result; -} - -// array max -inline F32 array_max(DataType dt, const void* data, I32 len) { - F32 result = 0; - switch(dt) { -#ifdef _USE_FP16 - case DT_F16: - result = array_max_f16((const F16*)data, len); - break; -#endif -#ifdef _USE_FP32 - case DT_F32: - result = array_max_f32((const F32*)data, len); - break; -#endif - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - return result; -} - -inline F32 array_maxabs(DataType dt, const void* data, I32 len) -{ - F32 result = 0; - switch(dt) { -#ifdef _USE_FP16 - case DT_F16: - result = array_maxabs_f16((const F16*)data, len); - break; -#endif - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - return result; -} - -template<typename T> -inline void array_scale_template(T *input, T *output, I32 len, F32 alpha, F32 beta) { - for (I32 i = 0; i < len; i++) { - output[i] = alpha * input[i] + beta; - } -} - -inline void array_scale(DataType dt, void *input, void *output, I32 len, F32 alpha, F32 beta) { - switch(dt) { -#ifdef _USE_FP16 - case DT_F16: - array_scale_f16((F16*)input, (F16*)output, len, alpha, beta); - break; -#endif -#ifdef _USE_FP32 - case DT_F32: - array_scale_f32((F32*)input, (F32*)output, len, alpha, beta); - break; -#endif - case DT_I32: { - array_scale_template((I32 *)input, (I32 *)output, len, alpha, beta); - break; - } - case DT_U32: { - array_scale_template((U32 *)input, (U32 *)output, len, alpha, beta); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } -} - -inline EE array_activation(DataType dt, void* input, U32 len, ActivationDesc activationDesc, void* output) -{ - EE result = SUCCESS; - switch(dt) { -#ifdef _USE_FP16 - case DT_F16: - result = activation_fp16((F16*)input, len, activationDesc, (F16*)output); - break; -#endif -#ifdef _USE_FP32 - case DT_F32: - result = activation_fp32((F32*)input, len, activationDesc, (F32*)output); - break; -#endif - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - return result; -} - -inline void array_add(DataType dt, const void *inputA, const void *inputB, void *output, I32 len) { - switch(dt) { -#ifdef _USE_FP16 - case DT_F16: - array_add_f16((const F16*)inputA, (const F16*)inputB, (F16*)output, len); - break; -#endif -#ifdef _USE_FP32 - case DT_F32: -
array_add_f32((const F32*)inputA, (const F32*)inputB, (F32*)output, len); - break; -#endif - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } -} - -#endif diff --git a/tensor_computing/src/cpu/arm/attention.cpp b/tensor_computing/src/cpu/arm/attention.cpp deleted file mode 100644 index 3b9ce727..00000000 --- a/tensor_computing/src/cpu/arm/attention.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/tensor_computing_arm.h" -#ifdef _USE_FP32 -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif - -EE attention_arm(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output) -{ - DataType dt; - DataFormat df; - U32 batch, numHeads, fromSequenceLength, toSequenceLength; - CHECK_REQUIREMENT(tensorIs2d(inputDesc)); - CHECK_REQUIREMENT(tensorIs4d(outputDesc)); - CHECK_STATUS(tensor4dGet(outputDesc, &dt, &df, &batch, &numHeads, &fromSequenceLength, &toSequenceLength)); - - EE ret = SUCCESS; - switch (dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = attention_fp32(batch, numHeads, fromSequenceLength, toSequenceLength, (const F32*)input, (F32*)output); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = attention_fp16(batch, numHeads, fromSequenceLength, toSequenceLength, (const F16*)input, (F16*)output); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/attention_mask.cpp b/tensor_computing/src/cpu/arm/attention_mask.cpp deleted file mode 100644 index 38928807..00000000 --- a/tensor_computing/src/cpu/arm/attention_mask.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/tensor_computing_arm.h" -#ifdef _USE_FP32 -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif - -EE attention_mask_arm(TensorDesc inputDesc, const void* input, - I32 attentionLength, bool sameLength, float mask, - TensorDesc outputDesc, void* output) -{ - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = attention_mask_fp32(inputDesc, (const F32*)input, - attentionLength, sameLength, mask, outputDesc, (F32*)output); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = attention_mask_fp16(inputDesc, (const F16*)input, - attentionLength, sameLength, mask, outputDesc, (F16*)output); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - - return ret; -} diff --git a/tensor_computing/src/cpu/arm/bnn/convolution.cpp b/tensor_computing/src/cpu/arm/bnn/convolution.cpp deleted file mode 100644 index 6ffdb2da..00000000 --- a/tensor_computing/src/cpu/arm/bnn/convolution.cpp +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
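All of the cpu/arm wrappers deleted above (activation_arm, argmax_arm, attention_arm, attention_mask_arm) share one skeleton: switch on the descriptor's DataType and forward to a precision-specific kernel compiled in under _USE_FP16/_USE_FP32/_USE_INT8. A condensed sketch of that pattern; op_arm and the kernel names are placeholders, not symbols from this diff:

    // Generic shape of an arm dispatcher in this tree (sketch only).
    EE op_arm(TensorDesc desc, const void *in, void *out)
    {
        EE ret = SUCCESS;
        switch (desc.dt) {
    #ifdef _USE_FP32
            case DT_F32:
                ret = op_fp32((const F32 *)in, (F32 *)out);  // fp32 kernel
                break;
    #endif
    #ifdef _USE_FP16
            case DT_F16:
                ret = op_fp16((const F16 *)in, (F16 *)out);  // fp16 kernel
                break;
    #endif
            default:
                ret = NOT_SUPPORTED;  // precision not compiled in
                break;
        }
        return ret;
    }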
- - -#ifdef _USE_FP16 -#include "cpu/arm/bnn/tensor_computing_bnn.h" - -EE convolution_infer_forward_tmp_bytes_bnn(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes) -{ - UNUSED(outputDesc); - - if (nullptr == bytes) - CHECK_STATUS(NULL_POINTER); - DataType idt, fdt; - DataFormat idf, fdf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - EE ret = SUCCESS; - switch (algorithm) { - case CONVOLUTION_ALGORITHM_BNN: - *bytes = ic*ih_pad*iw_pad + 8*fh*fw*ic + ic*ih*iw; - break; - default: - ret = NOT_MATCH; - break; - } - *bytes /= 8; - *bytes *= sizeof(BIN8); - *bytes += 32; - return ret; -} - -EE convolution_bnn(TensorDesc inputDesc, const F16* input, - TensorDesc filterDesc, const BIN8* filter, - ConvolutionDesc convDesc, - TensorDesc scaleDesc, const F16* scale, - TensorDesc biasDesc, const F16* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* output, - ActivationDesc activationDesc, - Arch arch) -{ - if(nullptr == input || nullptr == filter || nullptr == output || nullptr == scale || nullptr == bias || nullptr == tmp) - CHECK_STATUS(NULL_POINTER); - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - - if (idt != DT_F16) - CHECK_STATUS(NOT_MATCH); - if (odt != DT_F16) - CHECK_STATUS(NOT_MATCH); - if (idf != DF_NCHWC8 || odf != DF_NCHWC8) - CHECK_STATUS(NOT_MATCH); - - EE ret = SUCCESS; - switch (fdt) { - case DT_BIN01: - ret = convolution_dorefa(inputDesc, (F16*)input, - filterDesc, (BIN8*)filter, - convDesc, - scaleDesc, (F16*)scale, - biasDesc, (F16*)bias, - tmpBytes, tmp, - outputDesc, (F16*)output, - activationDesc, arch); - break; - case DT_BIN11: - ret = convolution_xnor(inputDesc, (F16*)input, - filterDesc, (BIN8*)filter, - convDesc, - scaleDesc, (F16*)scale, - biasDesc, (F16*)bias, - tmpBytes, tmp, - outputDesc, (F16*)output, - activationDesc, arch); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} -#endif diff --git a/tensor_computing/src/cpu/arm/bnn/convolution_dorefa.h b/tensor_computing/src/cpu/arm/bnn/convolution_dorefa.h deleted file mode 100644 index da79838c..00000000 --- a/tensor_computing/src/cpu/arm/bnn/convolution_dorefa.h +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_CONVOLUTION_DOREFA -#define _H_CONVOLUTION_DOREFA - -#ifdef _USE_FP16 -#include -#include -#include "sys.h" -#include "type.h" -#include "error.h" -#include "tensor_desc.h" -#include "tensor_computing_type.h" - -EE convolution_dorefa_A55(TensorDesc inputDesc, const F16* input, - TensorDesc filterDesc, const BIN8* filterArray, - ConvolutionDesc convDesc, - TensorDesc scaleDesc, const F16* scaleArray, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc); - -EE convolution_dorefa_A76(TensorDesc inputDesc, const F16* input, - TensorDesc filterDesc, const BIN8* filterArray, - ConvolutionDesc convDesc, - TensorDesc scaleDesc, const F16* scaleArray, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc); - -inline EE convolution_dorefa(TensorDesc inputDesc, const F16* input, - TensorDesc filterDesc, const BIN8* filter, - ConvolutionDesc convDesc, - TensorDesc scaleDesc, const F16* scale, - TensorDesc biasDesc, const F16* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* output, - ActivationDesc activationDesc, - Arch arch) -{ - EE ret = SUCCESS; - switch (arch) { - case ARM_A55: - ret = convolution_dorefa_A55(inputDesc, input, - filterDesc, filter, - convDesc, - scaleDesc, scale, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - activationDesc); - break; - case ARM_A76: - ret = convolution_dorefa_A76(inputDesc, input, - filterDesc, filter, - convDesc, - scaleDesc, scale, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - activationDesc); - break; - default: - return NOT_SUPPORTED; - } - return ret; -} -#endif -#endif diff --git a/tensor_computing/src/cpu/arm/bnn/convolution_dorefa_A55.cpp b/tensor_computing/src/cpu/arm/bnn/convolution_dorefa_A55.cpp deleted file mode 100644 index cdd6960d..00000000 --- a/tensor_computing/src/cpu/arm/bnn/convolution_dorefa_A55.cpp +++ /dev/null @@ -1,777 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifdef _USE_FP16 -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" - -#include "cpu/arm/bnn/convolution_dorefa.h" - -EE convolution_dorefa_A55(TensorDesc inputDesc, const F16* input, - TensorDesc filterDesc, const BIN8* filterArray, - ConvolutionDesc convDesc, - TensorDesc scaleDesc, const F16* scaleArray, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc) -{ - UNUSED(scaleDesc); - UNUSED(biasDesc); - UNUSED(tmpBytes); - UNUSED(activationDesc); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - if (fdf != DF_NCHWN16C8) - CHECK_STATUS(NOT_MATCH); - if (!(ic == fc && oc == fn)) - CHECK_STATUS(NOT_MATCH); - - oc /= 8; - ic /= 8; - - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - U32 ohow = oh*ow; - U32 ihiw = ih_pad*iw_pad; - - BIN8* inArray = ((BIN8*)tmp) + ic*ihiw + 8*fh*fw*ic; // ic has been divided by 8 - BIN8 *inArray_pad; - for (U32 n = 0; n < in; n++) { - const F16 *in = input + n*ic*ih*iw*8; - for (U32 i = 0; i < ic*ih*iw; i++) { - BIN8 temp = 0; - for (U32 j = 0; j < 8; j++) { - if (in[i*8+j] >= 0.5) { - temp |= (1 << (7-j)); // set - } - } - inArray[i] = temp; - } - - if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { - inArray_pad = inArray + n*ic*ih*iw; // ic has been divided by 8 - } else { - // copy input into a input with padding - inArray_pad = (BIN8*)tmp; - BIN8 *inArray_pad_mov = inArray_pad; - BIN8 *inArray_mov = inArray + n*ic*ih*iw; - for (U32 c = 0; c < ic; c++) { // All divide by 8 - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*bytesOf(DT_BIN01)); - inArray_pad_mov += iw_pad; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*bytesOf(DT_BIN01)); - inArray_pad_mov += paddingL; - memcpy(inArray_pad_mov, inArray_mov, iw*bytesOf(DT_BIN01)); - inArray_pad_mov += 
iw; - inArray_mov += iw; - memset(inArray_pad_mov, 0, paddingR*bytesOf(DT_BIN01)); - inArray_pad_mov += paddingR; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*bytesOf(DT_BIN01)); - inArray_pad_mov += iw_pad; - } - } - } - for (U32 hw = 0; hw < ohow-7; hw+=8) { - const F16 *s0 = scaleArray; - const F16 *s1 = scaleArray + 8; - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - BIN8 *in_order = ((BIN8*)tmp) + ic*ihiw; // ic has been divided by 8 - // reorder input - // NCHWc8 => NHWChw8c8 + im2col - U32 in_h[8]; - U32 in_w[8]; - for (U32 i = 0; i < 8; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; - } - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - BIN8 *in_hw8c8 = inArray_pad + c*ihiw + fh_idx*iw_pad + fw_idx; - // NHWChw8c8 - BIN8 *in_order_hw8c8 = in_order + c*fh*fw*8 + fh_idx*fw*8 + fw_idx*8; // This 8 comes from hw8 - for (U32 i = 0; i < 8; i++) { - in_order_hw8c8[i] = *(in_hw8c8 + in_h[i]*iw_pad + in_w[i]); - } - } - } - } - - // compute - for (U32 o = 0; o < oc; o+=2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. - BIN8 *in_hw0 = in_order; - const BIN8 *f_o0c0 = filterArray + o*8*fh*fw*ic; // ic has been divided by 8 - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // scale and bias - const F16 *s_o0 = s0; - const F16 *s_o1 = s1; - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - - 13 14 - 15 16 - 17 18 - 19 20 - */ - "eor v5.16b, v5.16b, v5.16b\n" - "ldr d29, [%[in_0]]\n" //in_0 - "eor v6.16b, v6.16b, v6.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 - "eor v7.16b, v7.16b, v7.16b\n" - "ldr x2, [%[f_0], #8]\n" - "eor v8.16b, v8.16b, v8.16b\n" - "ins v0.d[1], x2\n" - "eor v9.16b, v9.16b, v9.16b\n" - "dup v1.16b, v29.b[0]\n" //duplicate a full register - "eor v10.16b, v10.16b, v10.16b\n" - "dup v2.16b, v29.b[1]\n" - "eor v11.16b, v11.16b, v11.16b\n" - "eor v12.16b, v12.16b, v12.16b\n" - "eor v13.16b, v13.16b, v13.16b\n" - "eor v14.16b, v14.16b, v14.16b\n" - "eor v15.16b, v15.16b, v15.16b\n" - "eor v16.16b, v16.16b, v16.16b\n" - "eor v17.16b, v17.16b, v17.16b\n" - "eor v18.16b, v18.16b, v18.16b\n" - "eor v19.16b, v19.16b, v19.16b\n" - "eor v20.16b, v20.16b, v20.16b\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - - "0:\n" - "eor v21.16b, v21.16b, v21.16b\n" - "mov x9, %[fhfw]\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - "eor v25.16b, v25.16b, v25.16b\n" - "eor v26.16b, v26.16b, v26.16b\n" - "eor v27.16b, v27.16b, v27.16b\n" - "eor v28.16b, v28.16b, v28.16b\n" - - "mov x4, #4\n" - - "1:\n" - "and v3.16b, v1.16b, v0.16b\n" - "ldr d30, [x0, 16]!\n" // next filter - - "and v4.16b, v2.16b, v0.16b\n" - "ldr x1, [x0, 8]\n" - - "cnt v3.16b, v3.16b\n" - "subs x4, x4, #1\n" - - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[2]\n" - - "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter - "dup v2.16b, v29.b[3]\n" - - "add v22.16b, v22.16b, v4.16b\n" - "ins v30.d[1], x1\n" - - "and v3.16b, v1.16b, v0.16b\n" - "and v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[4]\n" - "add v23.16b, v23.16b, v3.16b\n" - "dup v2.16b, v29.b[5]\n" - "add v24.16b, v24.16b, v4.16b\n" - - "and v3.16b, 
v1.16b, v0.16b\n" - "and v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[6]\n" - "add v25.16b, v25.16b, v3.16b\n" - "dup v2.16b, v29.b[7]\n" - "add v26.16b, v26.16b, v4.16b\n" - - "and v3.16b, v1.16b, v0.16b\n" - "ldr d29, [x3, 8]!\n" - "and v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "mov v0.16b, v30.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" - "add v27.16b, v27.16b, v3.16b\n" - "dup v2.16b, v29.b[1]\n" - "add v28.16b, v28.16b, v4.16b\n" - "bne 1b\n" - - "movi v3.16b, #1\n" - "umlal v5.8h, v21.8b, v3.8b\n" - "umlal v7.8h, v22.8b, v3.8b\n" - "umlal v9.8h, v23.8b, v3.8b\n" - "umlal v11.8h, v24.8b, v3.8b\n" - "umlal v13.8h, v25.8b, v3.8b\n" - "umlal v15.8h, v26.8b, v3.8b\n" - "umlal v17.8h, v27.8b, v3.8b\n" - "umlal v19.8h, v28.8b, v3.8b\n" - - "umlal2 v6.8h, v21.16b, v3.16b\n" - "umlal2 v8.8h, v22.16b, v3.16b\n" - "umlal2 v10.8h, v23.16b, v3.16b\n" - "umlal2 v12.8h, v24.16b, v3.16b\n" - "umlal2 v14.8h, v25.16b, v3.16b\n" - "umlal2 v16.8h, v26.16b, v3.16b\n" - "umlal2 v18.8h, v27.16b, v3.16b\n" - "umlal2 v20.8h, v28.16b, v3.16b\n" - - "subs x9, x9, #1\n" - "beq 4f\n" // 1x1, continue with the next 32 input channels - - "2:\n" - "eor v21.16b, v21.16b, v21.16b\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - "eor v25.16b, v25.16b, v25.16b\n" - "eor v26.16b, v26.16b, v26.16b\n" - "eor v27.16b, v27.16b, v27.16b\n" - "eor v28.16b, v28.16b, v28.16b\n" - - "mov x4, #32\n" // Assume 256 will not happen - "3:\n" - "and v3.16b, v1.16b, v0.16b\n" - "ldr d30, [x0, 16]!\n" // next filter - - "and v4.16b, v2.16b, v0.16b\n" - "ldr x1, [x0, 8]\n" - - "cnt v3.16b, v3.16b\n" - "subs x4, x4, #1\n" - - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[2]\n" - - "uqadd v21.16b, v21.16b, v3.16b\n" - "dup v2.16b, v29.b[3]\n" - - "uqadd v22.16b, v22.16b, v4.16b\n" - "ins v30.d[1], x1\n" - - "and v3.16b, v1.16b, v0.16b\n" - "and v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[4]\n" - "uqadd v23.16b, v23.16b, v3.16b\n" - "dup v2.16b, v29.b[5]\n" - "uqadd v24.16b, v24.16b, v4.16b\n" - - "and v3.16b, v1.16b, v0.16b\n" - "and v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[6]\n" - "uqadd v25.16b, v25.16b, v3.16b\n" - "dup v2.16b, v29.b[7]\n" - "uqadd v26.16b, v26.16b, v4.16b\n" - - "and v3.16b, v1.16b, v0.16b\n" - "ldr d29, [x3, 8]!\n" - "and v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "mov v0.16b, v30.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" - "uqadd v27.16b, v27.16b, v3.16b\n" - "dup v2.16b, v29.b[1]\n" - "uqadd v28.16b, v28.16b, v4.16b\n" - "bne 3b\n" - - "movi v3.16b, #1\n" - "umlal v5.8h, v21.8b, v3.8b\n" - "umlal v7.8h, v22.8b, v3.8b\n" - "umlal v9.8h, v23.8b, v3.8b\n" - "umlal v11.8h, v24.8b, v3.8b\n" - "umlal v13.8h, v25.8b, v3.8b\n" - "umlal v15.8h, v26.8b, v3.8b\n" - "umlal v17.8h, v27.8b, v3.8b\n" - "umlal v19.8h, v28.8b, v3.8b\n" - - "umlal2 v6.8h, v21.16b, v3.16b\n" - "umlal2 v8.8h, v22.16b, v3.16b\n" - "umlal2 v10.8h, v23.16b, v3.16b\n" - "umlal2 v12.8h, v24.16b, v3.16b\n" - "umlal2 v14.8h, v25.16b, v3.16b\n" - "umlal2 v16.8h, v26.16b, v3.16b\n" - "umlal2 v18.8h, v27.16b, v3.16b\n" - "umlal2 v20.8h, v28.16b, v3.16b\n" - - "subs x9, x9, #8\n" - "bne 2b\n" - - "4:\n" // Wrap up computation for 32 input channels - "subs x2, x2, #32\n" - "bne 0b\n" - - // pipelined - "ucvtf v5.8h, v5.8h\n" - "ucvtf v6.8h, v6.8h\n" - "ldr q21, [%[b_0]]\n" - "ucvtf v7.8h, 
v7.8h\n" - "ldr q22, [%[b_1]]\n" - "ucvtf v8.8h, v8.8h\n" - "ldr q23, [%[s_0]]\n" - "ucvtf v9.8h, v9.8h\n" - "ldr q24, [%[s_1]]\n" - "ucvtf v10.8h, v10.8h\n" - "ucvtf v11.8h, v11.8h\n" - "mov v1.16b, v21.16b\n" - "ucvtf v12.8h, v12.8h\n" - "mov v2.16b, v22.16b\n" - "ucvtf v13.8h, v13.8h\n" - "fmla v1.8h, v5.8h, v23.8h\n" - "ucvtf v14.8h, v14.8h\n" - "fmla v2.8h, v6.8h, v24.8h\n" - "ucvtf v15.8h, v15.8h\n" - "mov v3.16b, v21.16b\n" - "ucvtf v16.8h, v16.8h\n" - "mov v4.16b, v22.16b\n" - "ucvtf v17.8h, v17.8h\n" - "fmla v3.8h, v7.8h, v23.8h\n" - "ucvtf v18.8h, v18.8h\n" - "fmla v4.8h, v8.8h, v24.8h\n" - "ucvtf v19.8h, v19.8h\n" - "mov v5.16b, v21.16b\n" - "ucvtf v20.8h, v20.8h\n" - "mov v6.16b, v22.16b\n" - - "fmla v5.8h, v9.8h, v23.8h\n" - "mov v7.16b, v21.16b\n" - "fmla v6.8h, v10.8h, v24.8h\n" - "mov v8.16b, v22.16b\n" - "fmla v7.8h, v11.8h, v23.8h\n" - "mov v9.16b, v21.16b\n" - "fmla v8.8h, v12.8h, v24.8h\n" - "mov v10.16b, v22.16b\n" - "fmla v9.8h, v13.8h, v23.8h\n" - "mov v11.16b, v21.16b\n" - "fmla v10.8h, v14.8h, v24.8h\n" - "mov v12.16b, v22.16b\n" - "fmla v11.8h, v15.8h, v23.8h\n" - "mov v13.16b, v21.16b\n" - "fmla v12.8h, v16.8h, v24.8h\n" - "mov v14.16b, v22.16b\n" - "fmla v13.8h, v17.8h, v23.8h\n" - "mov v15.16b, v21.16b\n" - "fmla v14.8h, v18.8h, v24.8h\n" - "mov v16.16b, v22.16b\n" - "fmla v15.8h, v19.8h, v23.8h\n" - "fmla v16.8h, v20.8h, v24.8h\n" - - "str q1, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q5, [%[out_0], #32]\n" //out_o0hw2 - "str q7, [%[out_0], #48]\n" //out_o0hw3 - "str q9, [%[out_0], #64]\n" //out_o0hw4 - "str q11, [%[out_0], #80]\n" //out_o0hw5 - "str q13, [%[out_0], #96]\n" //out_o0hw6 - "str q15, [%[out_0], #112]\n" //out_o0hw7 - - "str q2, [%[out_1]]\n" //out_o1hw0 - "str q4, [%[out_1], #16]\n" //out_o1hw1 - "str q6, [%[out_1], #32]\n" //out_o1hw2 - "str q8, [%[out_1], #48]\n" //out_o1hw3 - "str q10, [%[out_1], #64]\n" //out_o1hw4 - "str q12, [%[out_1], #80]\n" //out_o1hw5 - "str q14, [%[out_1], #96]\n" //out_o1hw6 - "str q16, [%[out_1], #112]\n" //out_o1hw7 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [fhfw]"r"((I64)fh*fw), - [s_0]"r"(s_o0), - [s_1]"r"(s_o1), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", "x1", "x2", "x3", "x4", "x9" - ); - s0 += 16; - s1 += 16; - b0 += 16; - b1 += 16; - } - } - // ohow_remainder % 8 / 4 - U32 ohow_s = (ohow / 8) * 8; - - for (U32 hw = ohow_s; hw < ohow-3; hw+=4) { - const F16 *s0 = scaleArray; - const F16 *s1 = scaleArray + 8; - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - BIN8 *in_order = ((BIN8*)tmp) + ic*ihiw; // ic has been divided by 8 - // reorder input - // NCHWc8 => NHWChw4c8 + im2col - U32 in_h[4]; - U32 in_w[4]; - for (U32 i = 0; i < 4; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; - } - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - BIN8 *in_hw4c8 = inArray_pad + c*ihiw + fh_idx*iw_pad + fw_idx; - // NHWChw4c8 - BIN8 *in_order_hw4c8 = in_order + c*fh*fw*4 + fh_idx*fw*4 + fw_idx*4; - for (U32 i = 0; i < 4; i++) { - in_order_hw4c8[i] = *(in_hw4c8 + in_h[i]*iw_pad + in_w[i]); - } - } - } - } - - // compute - for (U32 o = 0; o < oc; o+=2) { // oc should be multiple of 
32. It will at least be multiple of 16 in the future. - BIN8 *in_hw0 = in_order; - const BIN8 *f_o0c0 = filterArray + o*8*fh*fw*ic; // ic has been divided by 8 - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // scale and bias - const F16 *s_o0 = s0; - const F16 *s_o1 = s1; - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - */ - "eor v5.16b, v5.16b, v5.16b\n" - "ldr s29, [%[in_0]]\n" //in_0 - "eor v6.16b, v6.16b, v6.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 - "eor v7.16b, v7.16b, v7.16b\n" - "ldr x2, [%[f_0], #8]\n" - "eor v8.16b, v8.16b, v8.16b\n" - "ins v0.d[1], x2\n" - "eor v9.16b, v9.16b, v9.16b\n" - "dup v1.16b, v29.b[0]\n" //duplicate a full register - "eor v10.16b, v10.16b, v10.16b\n" - "dup v2.16b, v29.b[1]\n" - "eor v11.16b, v11.16b, v11.16b\n" - "eor v12.16b, v12.16b, v12.16b\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - - "0:\n" - "eor v21.16b, v21.16b, v21.16b\n" - "mov x9, %[fhfw]\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - - "mov x4, #4\n" - - "1:\n" - "and v3.16b, v1.16b, v0.16b\n" - "ldr d30, [x0, 16]!\n" // next filter - - "and v4.16b, v2.16b, v0.16b\n" - "ldr x1, [x0, 8]\n" - - "cnt v3.16b, v3.16b\n" - "subs x4, x4, #1\n" - - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[2]\n" - - "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter - "dup v2.16b, v29.b[3]\n" - - "add v22.16b, v22.16b, v4.16b\n" - "ins v30.d[1], x1\n" - - "and v3.16b, v1.16b, v0.16b\n" - "ldr s29, [x3, 4]!\n" - "and v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "mov v0.16b, v30.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" - "add v23.16b, v23.16b, v3.16b\n" - "dup v2.16b, v29.b[1]\n" - "add v24.16b, v24.16b, v4.16b\n" - "bne 1b\n" - - "movi v3.16b, #1\n" - "umlal v5.8h, v21.8b, v3.8b\n" - "umlal v7.8h, v22.8b, v3.8b\n" - "umlal v9.8h, v23.8b, v3.8b\n" - "umlal v11.8h, v24.8b, v3.8b\n" - - "umlal2 v6.8h, v21.16b, v3.16b\n" - "umlal2 v8.8h, v22.16b, v3.16b\n" - "umlal2 v10.8h, v23.16b, v3.16b\n" - "umlal2 v12.8h, v24.16b, v3.16b\n" - - "subs x9, x9, #1\n" - "beq 4f\n" // 1x1, continue with the next 32 input channels - - "2:\n" - "eor v21.16b, v21.16b, v21.16b\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - - "mov x4, #32\n" // Assume 256 will not happen - "3:\n" - "and v3.16b, v1.16b, v0.16b\n" - "ldr d30, [x0, 16]!\n" // next filter - - "and v4.16b, v2.16b, v0.16b\n" - "ldr x1, [x0, 8]\n" - - "cnt v3.16b, v3.16b\n" - "subs x4, x4, #1\n" - - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[2]\n" - - "uqadd v21.16b, v21.16b, v3.16b\n" - "dup v2.16b, v29.b[3]\n" - - "uqadd v22.16b, v22.16b, v4.16b\n" - "ins v30.d[1], x1\n" - - "and v3.16b, v1.16b, v0.16b\n" - "ldr s29, [x3, 4]!\n" - "and v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "mov v0.16b, v30.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" - "uqadd v23.16b, v23.16b, v3.16b\n" - "dup v2.16b, v29.b[1]\n" - "uqadd v24.16b, v24.16b, v4.16b\n" - "bne 3b\n" - - "movi v3.16b, #1\n" - "umlal v5.8h, v21.8b, v3.8b\n" - "umlal v7.8h, v22.8b, v3.8b\n" - "umlal v9.8h, v23.8b, v3.8b\n" - "umlal v11.8h, v24.8b, v3.8b\n" - - "umlal2 v6.8h, v21.16b, v3.16b\n" - "umlal2 v8.8h, v22.16b, v3.16b\n" - "umlal2 v10.8h, v23.16b, v3.16b\n" - "umlal2 v12.8h, v24.16b, v3.16b\n" - - "subs x9, x9, #8\n" - 
"bne 2b\n" - - "4:\n" // Wrap up computation for 32 input channels - "subs x2, x2, #32\n" - "bne 0b\n" - - // pipelined - "ucvtf v5.8h, v5.8h\n" - "ucvtf v6.8h, v6.8h\n" - "ldr q21, [%[b_0]]\n" - "ucvtf v7.8h, v7.8h\n" - "ldr q22, [%[b_1]]\n" - "ucvtf v8.8h, v8.8h\n" - "ldr q23, [%[s_0]]\n" - "ucvtf v9.8h, v9.8h\n" - "ldr q24, [%[s_1]]\n" - "ucvtf v10.8h, v10.8h\n" - "ucvtf v11.8h, v11.8h\n" - "mov v1.16b, v21.16b\n" - "ucvtf v12.8h, v12.8h\n" - "mov v2.16b, v22.16b\n" - "fmla v1.8h, v5.8h, v23.8h\n" - "fmla v2.8h, v6.8h, v24.8h\n" - "mov v3.16b, v21.16b\n" - "mov v4.16b, v22.16b\n" - "fmla v3.8h, v7.8h, v23.8h\n" - "fmla v4.8h, v8.8h, v24.8h\n" - "mov v5.16b, v21.16b\n" - "mov v6.16b, v22.16b\n" - - "fmla v5.8h, v9.8h, v23.8h\n" - "mov v7.16b, v21.16b\n" - "fmla v6.8h, v10.8h, v24.8h\n" - "mov v8.16b, v22.16b\n" - "fmla v7.8h, v11.8h, v23.8h\n" - "fmla v8.8h, v12.8h, v24.8h\n" - - "str q1, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q5, [%[out_0], #32]\n" //out_o0hw2 - "str q7, [%[out_0], #48]\n" //out_o0hw3 - - "str q2, [%[out_1]]\n" //out_o1hw0 - "str q4, [%[out_1], #16]\n" //out_o1hw1 - "str q6, [%[out_1], #32]\n" //out_o1hw2 - "str q8, [%[out_1], #48]\n" //out_o1hw3 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [fhfw]"r"((I64)fh*fw), - [s_0]"r"(s_o0), - [s_1]"r"(s_o1), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v21", "v22", "v23", "v24", "v29", "v30", "x0", "x1", "x2", "x3", "x4", "x9" - ); - s0 += 16; - s1 += 16; - b0 += 16; - b1 += 16; - } - } - // ohow_reminder % 4 - ohow_s = (ohow / 4) * 4; - for (U32 hw = ohow_s; hw < ohow; hw++) { - const F16 *s0 = scaleArray; - const F16 *s1 = scaleArray + 8; - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - BIN8 *in_order = ((BIN8*)tmp) + ic*ih_pad*iw_pad; // ic has been divided by 8 - // reorder input - // NCHWc8 => NHWChw1c8 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - BIN8 *in_hw1c8 = inArray_pad + c*ihiw + fh_idx*iw_pad + fw_idx; - BIN8 *in_0 = in_hw1c8 + in_h_0*iw_pad + in_w_0; - BIN8 *in_order_hw1c8 = in_order + c*fh*fw + fh_idx*fw + fw_idx; - *in_order_hw1c8 = (*in_0); - } - } - } - // compute - for (U32 o = 0; o < oc; o+=2) { - BIN8 *in_hw0 = in_order; - const BIN8 *f_o = filterArray + o*8*fh*fw*ic; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - - uint16x8_t sum[2] = {0}; - uint8x8_t v1 = vdup_n_u8(1); - for (U32 i = 0; i < ic*8; i += 32) { - uint8x8_t sub0[2] = {0}; - - for (U32 j = 0; j < 4; j++) { - uint8x8_t f_0 = vld1_u8(f_o); - uint8x8_t f_1 = vld1_u8(f_o+8); - f_o += 16; - uint8x8_t in_1 = vdup_n_u8(*in_hw0); - in_hw0++; - f_0 = vand_u8(in_1, f_0); - f_1 = vand_u8(in_1, f_1); - f_0 = vcnt_u8(f_0); - f_1 = vcnt_u8(f_1); - sub0[0] = vadd_u8(sub0[0], f_0); - sub0[1] = vadd_u8(sub0[1], f_1); - } - sum[0] = vmlal_u8(sum[0], sub0[0], v1); - sum[1] = vmlal_u8(sum[1], sub0[1], v1); - - for (U32 j = 1; j < fh*fw; j += 8) { - uint8x8_t sub1[2] = {0}; - for (U32 k = 0; k < 32; k++) { - uint8x8_t f_0 = vld1_u8(f_o); - uint8x8_t f_1 = vld1_u8(f_o+8); - f_o += 16; - uint8x8_t in_1 = vdup_n_u8(*in_hw0); - in_hw0++; - f_0 = vand_u8(in_1, f_0); - f_1 = vand_u8(in_1, f_1); - f_0 = vcnt_u8(f_0); - f_1 
= vcnt_u8(f_1); - sub1[0] = vadd_u8(sub1[0], f_0); - sub1[1] = vadd_u8(sub1[1], f_1); - } - sum[0] = vmlal_u8(sum[0], sub1[0], v1); - sum[1] = vmlal_u8(sum[1], sub1[1], v1); - } - } - - float16x8_t res_o0 = vcvtq_f16_u16(sum[0]); - float16x8_t res_o1 = vcvtq_f16_u16(sum[1]); - float16x8_t scale_o0 = vld1q_f16(s0); - s0 += 16; - float16x8_t scale_o1 = vld1q_f16(s1); - s1 += 16; - float16x8_t bias_o0 = vld1q_f16(b0); - b0 += 16; - float16x8_t bias_o1 = vld1q_f16(b1); - b1 += 16; - res_o0 = vmulq_f16(res_o0, scale_o0); - res_o1 = vmulq_f16(res_o1, scale_o1); - res_o0 = vaddq_f16(res_o0, bias_o0); - res_o1 = vaddq_f16(res_o1, bias_o1); - vst1q_f16(out_o0hw0, res_o0); - vst1q_f16(out_o1hw0, res_o1); - } - } - } - return SUCCESS; -} -#endif diff --git a/tensor_computing/src/cpu/arm/bnn/convolution_dorefa_A76.cpp b/tensor_computing/src/cpu/arm/bnn/convolution_dorefa_A76.cpp deleted file mode 100644 index 84fc3514..00000000 --- a/tensor_computing/src/cpu/arm/bnn/convolution_dorefa_A76.cpp +++ /dev/null @@ -1,757 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
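The two DoReFa kernels in this deleted pair (A55 above, A76 below) share one contract: FP16 activations are binarized against a 0.5 threshold, packed MSB-first eight to a byte (DT_BIN01), and each 8-lane multiply reduces to AND plus popcount (the `and`/`cnt` pairs in the assembly, the `vand_u8`/`vcnt_u8` pairs in the intrinsics tail). A minimal scalar sketch of that contract, with plain float standing in for F16, C++20 std::popcount standing in for the NEON cnt instruction, and illustrative helper names not taken from the sources:

    #include <bit>
    #include <cstdint>

    // Pack 8 activations into one byte, MSB first, as the kernels'
    // binarization loop does (bit set when the value is >= 0.5).
    uint8_t pack_dorefa8(const float *in8) {
        uint8_t bits = 0;
        for (int j = 0; j < 8; j++) {
            if (in8[j] >= 0.5f) {
                bits |= (uint8_t)(1 << (7 - j));
            }
        }
        return bits;
    }

    // Dot product of 8 {0,1} lanes: each matching set bit contributes 1.
    int dorefa_dot8(uint8_t act, uint8_t wgt) {
        return std::popcount((unsigned)(act & wgt));
    }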
- - -#ifdef _USE_FP16 -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" - -#include "cpu/arm/bnn/convolution_dorefa.h" - -EE convolution_dorefa_A76(TensorDesc inputDesc, const F16* input, - TensorDesc filterDesc, const BIN8* filterArray, - ConvolutionDesc convDesc, - TensorDesc scaleDesc, const F16* scaleArray, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc) -{ - UNUSED(scaleDesc); - UNUSED(biasDesc); - UNUSED(tmpBytes); - UNUSED(activationDesc); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - if (fdf != DF_NCHWN16C8) - CHECK_STATUS(NOT_MATCH); - if (!(ic == fc && oc == fn)) - CHECK_STATUS(NOT_MATCH); - - oc /= 8; - ic /= 8; - - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - U32 ohow = oh*ow; - U32 ihiw = ih_pad*iw_pad; - - BIN8* inArray = ((BIN8*)tmp) + ic*ihiw + 8*fh*fw*ic; // ic has been divided by 8 - BIN8 *inArray_pad; - for (U32 n = 0; n < in; n++) { - const F16 *in = input + n*ic*ih*iw*8; - for (U32 i = 0; i < ic*ih*iw; i++) { - BIN8 temp = 0; - for (U32 j = 0; j < 8; j++) { - if (in[i*8+j] >= 0.5) { - temp |= (1 << (7-j)); // set - } - } - inArray[i] = temp; - } - - if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { - inArray_pad = inArray + n*ic*ih*iw; // ic has been divided by 8 - } else { - // copy input into a input with padding - inArray_pad = (BIN8*)tmp; - BIN8 *inArray_pad_mov = inArray_pad; - BIN8 *inArray_mov = inArray + n*ic*ih*iw; - for (U32 c = 0; c < ic; c++) { // All divide by 8 - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*bytesOf(DT_BIN01)); - inArray_pad_mov += iw_pad; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*bytesOf(DT_BIN01)); - inArray_pad_mov += paddingL; - memcpy(inArray_pad_mov, inArray_mov, iw*bytesOf(DT_BIN01)); - inArray_pad_mov += iw; - inArray_mov += iw; - memset(inArray_pad_mov, 0, paddingR*bytesOf(DT_BIN01)); - inArray_pad_mov += paddingR; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*bytesOf(DT_BIN01)); - inArray_pad_mov += iw_pad; - } - } - } - for (U32 hw = 0; hw < ohow-7; hw+=8) { - const F16 *s0 = scaleArray; - const F16 *s1 = scaleArray + 8; - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - BIN8 *in_order = ((BIN8*)tmp) + ic*ihiw; // ic has been divided by 8 - // reorder input - // NCHWc8 => NHWChw8c8 + im2col - U32 in_h[8]; - U32 in_w[8]; - for (U32 i = 0; i < 8; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; - } - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - BIN8 *in_hw8c8 = inArray_pad + c*ihiw + fh_idx*iw_pad + fw_idx; - // NHWChw8c8 - BIN8 *in_order_hw8c8 = in_order + c*fh*fw*8 + fh_idx*fw*8 + fw_idx*8; // This 8 comes from hw8 - for (U32 i = 0; i < 8; i++) { - in_order_hw8c8[i] = 
*(in_hw8c8 + in_h[i]*iw_pad + in_w[i]); - } - } - } - } - - // compute - for (U32 o = 0; o < oc; o+=2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. - BIN8 *in_hw0 = in_order; - const BIN8 *f_o0c0 = filterArray + o*8*fh*fw*ic; // ic has been divided by 8 - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // scale and bias - const F16 *s_o0 = s0; - const F16 *s_o1 = s1; - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr d29, [%[in_0]]\n" //in_0 - "ldr q0, [%[f_0]]\n" //f_0 - /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - - 13 14 - 15 16 - 17 18 - 19 20 - */ - "eor v5.16b, v5.16b, v5.16b\n" - "eor v6.16b, v6.16b, v6.16b\n" - "eor v7.16b, v7.16b, v7.16b\n" - "eor v8.16b, v8.16b, v8.16b\n" - "eor v9.16b, v9.16b, v9.16b\n" - "dup v1.16b, v29.b[0]\n" //duplicate a full register - "eor v10.16b, v10.16b, v10.16b\n" - "dup v2.16b, v29.b[1]\n" - "eor v11.16b, v11.16b, v11.16b\n" - "eor v12.16b, v12.16b, v12.16b\n" - "eor v13.16b, v13.16b, v13.16b\n" - "eor v14.16b, v14.16b, v14.16b\n" - "eor v15.16b, v15.16b, v15.16b\n" - "eor v16.16b, v16.16b, v16.16b\n" - "eor v17.16b, v17.16b, v17.16b\n" - "eor v18.16b, v18.16b, v18.16b\n" - "eor v19.16b, v19.16b, v19.16b\n" - "eor v20.16b, v20.16b, v20.16b\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - - "0:\n" - "eor v21.16b, v21.16b, v21.16b\n" - "mov x9, %[fhfw]\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - "eor v25.16b, v25.16b, v25.16b\n" - "eor v26.16b, v26.16b, v26.16b\n" - "eor v27.16b, v27.16b, v27.16b\n" - "eor v28.16b, v28.16b, v28.16b\n" - - "mov x4, #4\n" - - "1:\n" - "and v3.16b, v1.16b, v0.16b\n" - "and v4.16b, v2.16b, v0.16b\n" - - "cnt v3.16b, v3.16b\n" - "subs x4, x4, #1\n" - - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[2]\n" - - "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter - "dup v2.16b, v29.b[3]\n" - - "add v22.16b, v22.16b, v4.16b\n" - - "and v3.16b, v1.16b, v0.16b\n" - "and v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[4]\n" - "add v23.16b, v23.16b, v3.16b\n" - "dup v2.16b, v29.b[5]\n" - "add v24.16b, v24.16b, v4.16b\n" - - "and v3.16b, v1.16b, v0.16b\n" - "and v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[6]\n" - "add v25.16b, v25.16b, v3.16b\n" - "dup v2.16b, v29.b[7]\n" - "add v26.16b, v26.16b, v4.16b\n" - - "and v3.16b, v1.16b, v0.16b\n" - "ldr d29, [x3, 8]!\n" - "and v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "ldr q0, [x0, 16]!\n" // next filter - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" - "add v27.16b, v27.16b, v3.16b\n" - "dup v2.16b, v29.b[1]\n" - "add v28.16b, v28.16b, v4.16b\n" - "bne 1b\n" - - "movi v3.16b, #1\n" - "umlal v5.8h, v21.8b, v3.8b\n" - "umlal v7.8h, v22.8b, v3.8b\n" - "umlal v9.8h, v23.8b, v3.8b\n" - "umlal v11.8h, v24.8b, v3.8b\n" - "umlal v13.8h, v25.8b, v3.8b\n" - "umlal v15.8h, v26.8b, v3.8b\n" - "umlal v17.8h, v27.8b, v3.8b\n" - "umlal v19.8h, v28.8b, v3.8b\n" - - "umlal2 v6.8h, v21.16b, v3.16b\n" - "umlal2 v8.8h, v22.16b, v3.16b\n" - "umlal2 v10.8h, v23.16b, v3.16b\n" - "umlal2 v12.8h, v24.16b, v3.16b\n" - "umlal2 v14.8h, v25.16b, v3.16b\n" - "umlal2 v16.8h, v26.16b, v3.16b\n" - "umlal2 v18.8h, v27.16b, v3.16b\n" - "umlal2 v20.8h, v28.16b, v3.16b\n" - - "subs x9, x9, #1\n" - "beq 4f\n" // 1x1, continue 
with the next 32 input channels - - "2:\n" - "eor v21.16b, v21.16b, v21.16b\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - "eor v25.16b, v25.16b, v25.16b\n" - "eor v26.16b, v26.16b, v26.16b\n" - "eor v27.16b, v27.16b, v27.16b\n" - "eor v28.16b, v28.16b, v28.16b\n" - - "mov x4, #32\n" // Assume 256 will not happen - "3:\n" - "and v3.16b, v1.16b, v0.16b\n" - "and v4.16b, v2.16b, v0.16b\n" - - "cnt v3.16b, v3.16b\n" - "subs x4, x4, #1\n" - - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[2]\n" - - "uqadd v21.16b, v21.16b, v3.16b\n" - "dup v2.16b, v29.b[3]\n" - - "uqadd v22.16b, v22.16b, v4.16b\n" - - "and v3.16b, v1.16b, v0.16b\n" - "and v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[4]\n" - "uqadd v23.16b, v23.16b, v3.16b\n" - "dup v2.16b, v29.b[5]\n" - "uqadd v24.16b, v24.16b, v4.16b\n" - - "and v3.16b, v1.16b, v0.16b\n" - "and v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[6]\n" - "uqadd v25.16b, v25.16b, v3.16b\n" - "dup v2.16b, v29.b[7]\n" - "uqadd v26.16b, v26.16b, v4.16b\n" - - "and v3.16b, v1.16b, v0.16b\n" - "ldr d29, [x3, 8]!\n" - "and v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "ldr q0, [x0, 16]!\n" // next filter - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" - "uqadd v27.16b, v27.16b, v3.16b\n" - "dup v2.16b, v29.b[1]\n" - "uqadd v28.16b, v28.16b, v4.16b\n" - "bne 3b\n" - - "movi v3.16b, #1\n" - "umlal v5.8h, v21.8b, v3.8b\n" - "umlal v7.8h, v22.8b, v3.8b\n" - "umlal v9.8h, v23.8b, v3.8b\n" - "umlal v11.8h, v24.8b, v3.8b\n" - "umlal v13.8h, v25.8b, v3.8b\n" - "umlal v15.8h, v26.8b, v3.8b\n" - "umlal v17.8h, v27.8b, v3.8b\n" - "umlal v19.8h, v28.8b, v3.8b\n" - - "umlal2 v6.8h, v21.16b, v3.16b\n" - "umlal2 v8.8h, v22.16b, v3.16b\n" - "umlal2 v10.8h, v23.16b, v3.16b\n" - "umlal2 v12.8h, v24.16b, v3.16b\n" - "umlal2 v14.8h, v25.16b, v3.16b\n" - "umlal2 v16.8h, v26.16b, v3.16b\n" - "umlal2 v18.8h, v27.16b, v3.16b\n" - "umlal2 v20.8h, v28.16b, v3.16b\n" - - "subs x9, x9, #8\n" - "bne 2b\n" - - "4:\n" // Wrap up computation for 32 input channels - "subs x2, x2, #32\n" - "bne 0b\n" - - // pipelined - "ucvtf v5.8h, v5.8h\n" - "ucvtf v6.8h, v6.8h\n" - "ldr q21, [%[b_0]]\n" - "ucvtf v7.8h, v7.8h\n" - "ldr q22, [%[b_1]]\n" - "ucvtf v8.8h, v8.8h\n" - "ldr q23, [%[s_0]]\n" - "ucvtf v9.8h, v9.8h\n" - "ldr q24, [%[s_1]]\n" - "ucvtf v10.8h, v10.8h\n" - "ucvtf v11.8h, v11.8h\n" - "mov v1.16b, v21.16b\n" - "ucvtf v12.8h, v12.8h\n" - "mov v2.16b, v22.16b\n" - "ucvtf v13.8h, v13.8h\n" - "fmla v1.8h, v5.8h, v23.8h\n" - "ucvtf v14.8h, v14.8h\n" - "fmla v2.8h, v6.8h, v24.8h\n" - "ucvtf v15.8h, v15.8h\n" - "mov v3.16b, v21.16b\n" - "ucvtf v16.8h, v16.8h\n" - "mov v4.16b, v22.16b\n" - "ucvtf v17.8h, v17.8h\n" - "fmla v3.8h, v7.8h, v23.8h\n" - "ucvtf v18.8h, v18.8h\n" - "fmla v4.8h, v8.8h, v24.8h\n" - "ucvtf v19.8h, v19.8h\n" - "mov v5.16b, v21.16b\n" - "ucvtf v20.8h, v20.8h\n" - "mov v6.16b, v22.16b\n" - - "fmla v5.8h, v9.8h, v23.8h\n" - "mov v7.16b, v21.16b\n" - "fmla v6.8h, v10.8h, v24.8h\n" - "mov v8.16b, v22.16b\n" - "fmla v7.8h, v11.8h, v23.8h\n" - "mov v9.16b, v21.16b\n" - "fmla v8.8h, v12.8h, v24.8h\n" - "mov v10.16b, v22.16b\n" - "fmla v9.8h, v13.8h, v23.8h\n" - "mov v11.16b, v21.16b\n" - "fmla v10.8h, v14.8h, v24.8h\n" - "mov v12.16b, v22.16b\n" - "fmla v11.8h, v15.8h, v23.8h\n" - "mov v13.16b, v21.16b\n" - "fmla v12.8h, v16.8h, v24.8h\n" - "mov v14.16b, v22.16b\n" - "fmla v13.8h, v17.8h, v23.8h\n" - "mov v15.16b, 
v21.16b\n" - "fmla v14.8h, v18.8h, v24.8h\n" - "mov v16.16b, v22.16b\n" - "fmla v15.8h, v19.8h, v23.8h\n" - "fmla v16.8h, v20.8h, v24.8h\n" - - "str q1, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q5, [%[out_0], #32]\n" //out_o0hw2 - "str q7, [%[out_0], #48]\n" //out_o0hw3 - "str q9, [%[out_0], #64]\n" //out_o0hw4 - "str q11, [%[out_0], #80]\n" //out_o0hw5 - "str q13, [%[out_0], #96]\n" //out_o0hw6 - "str q15, [%[out_0], #112]\n" //out_o0hw7 - - "str q2, [%[out_1]]\n" //out_o1hw0 - "str q4, [%[out_1], #16]\n" //out_o1hw1 - "str q6, [%[out_1], #32]\n" //out_o1hw2 - "str q8, [%[out_1], #48]\n" //out_o1hw3 - "str q10, [%[out_1], #64]\n" //out_o1hw4 - "str q12, [%[out_1], #80]\n" //out_o1hw5 - "str q14, [%[out_1], #96]\n" //out_o1hw6 - "str q16, [%[out_1], #112]\n" //out_o1hw7 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [fhfw]"r"((I64)fh*fw), - [s_0]"r"(s_o0), - [s_1]"r"(s_o1), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", "x1", "x2", "x3", "x4", "x9" - ); - s0 += 16; - s1 += 16; - b0 += 16; - b1 += 16; - } - } - // ohow_remainder % 8 / 4 - U32 ohow_s = (ohow / 8) * 8; - - for (U32 hw = ohow_s; hw < ohow-3; hw+=4) { - const F16 *s0 = scaleArray; - const F16 *s1 = scaleArray + 8; - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - BIN8 *in_order = ((BIN8*)tmp) + ic*ihiw; // ic has been divided by 8 - // reorder input - // NCHWc8 => NHWChw4c8 + im2col - U32 in_h[4]; - U32 in_w[4]; - for (U32 i = 0; i < 4; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; - } - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - BIN8 *in_hw4c8 = inArray_pad + c*ihiw + fh_idx*iw_pad + fw_idx; - // NHWChw4c8 - BIN8 *in_order_hw4c8 = in_order + c*fh*fw*4 + fh_idx*fw*4 + fw_idx*4; - for (U32 i = 0; i < 4; i++) { - in_order_hw4c8[i] = *(in_hw4c8 + in_h[i]*iw_pad + in_w[i]); - } - } - } - } - - // compute - for (U32 o = 0; o < oc; o+=2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. 
- BIN8 *in_hw0 = in_order; - const BIN8 *f_o0c0 = filterArray + o*8*fh*fw*ic; // ic has been divided by 8 - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // scale and bias - const F16 *s_o0 = s0; - const F16 *s_o1 = s1; - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr q0, [%[f_0]]\n" //f_0 - "ldr s29, [%[in_0]]\n" //in_0 - /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - */ - "eor v5.16b, v5.16b, v5.16b\n" - "eor v6.16b, v6.16b, v6.16b\n" - "eor v7.16b, v7.16b, v7.16b\n" - "eor v8.16b, v8.16b, v8.16b\n" - "eor v9.16b, v9.16b, v9.16b\n" - "dup v1.16b, v29.b[0]\n" //duplicate a full register - "eor v10.16b, v10.16b, v10.16b\n" - "dup v2.16b, v29.b[1]\n" - "eor v11.16b, v11.16b, v11.16b\n" - "eor v12.16b, v12.16b, v12.16b\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - - "0:\n" - "eor v21.16b, v21.16b, v21.16b\n" - "mov x9, %[fhfw]\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - - "mov x4, #4\n" - - "1:\n" - "and v3.16b, v1.16b, v0.16b\n" - "and v4.16b, v2.16b, v0.16b\n" - - "cnt v3.16b, v3.16b\n" - "subs x4, x4, #1\n" - - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[2]\n" - - "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter - "dup v2.16b, v29.b[3]\n" - - "add v22.16b, v22.16b, v4.16b\n" - - "and v3.16b, v1.16b, v0.16b\n" - "ldr s29, [x3, 4]!\n" - "and v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "ldr q0, [x0, 16]!\n" // next filter - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" - "add v23.16b, v23.16b, v3.16b\n" - "dup v2.16b, v29.b[1]\n" - "add v24.16b, v24.16b, v4.16b\n" - "bne 1b\n" - - "movi v3.16b, #1\n" - "umlal v5.8h, v21.8b, v3.8b\n" - "umlal v7.8h, v22.8b, v3.8b\n" - "umlal v9.8h, v23.8b, v3.8b\n" - "umlal v11.8h, v24.8b, v3.8b\n" - - "umlal2 v6.8h, v21.16b, v3.16b\n" - "umlal2 v8.8h, v22.16b, v3.16b\n" - "umlal2 v10.8h, v23.16b, v3.16b\n" - "umlal2 v12.8h, v24.16b, v3.16b\n" - - "subs x9, x9, #1\n" - "beq 4f\n" // 1x1, continue with the next 32 input channels - - "2:\n" - "eor v21.16b, v21.16b, v21.16b\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - - "mov x4, #32\n" // Assume 256 will not happen - "3:\n" - "and v3.16b, v1.16b, v0.16b\n" - "and v4.16b, v2.16b, v0.16b\n" - - "cnt v3.16b, v3.16b\n" - "subs x4, x4, #1\n" - - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[2]\n" - - "uqadd v21.16b, v21.16b, v3.16b\n" - "dup v2.16b, v29.b[3]\n" - - "uqadd v22.16b, v22.16b, v4.16b\n" - - "and v3.16b, v1.16b, v0.16b\n" - "ldr s29, [x3, 4]!\n" - "and v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "ldr q0, [x0, 16]!\n" // next filter - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" - "uqadd v23.16b, v23.16b, v3.16b\n" - "dup v2.16b, v29.b[1]\n" - "uqadd v24.16b, v24.16b, v4.16b\n" - "bne 3b\n" - - "movi v3.16b, #1\n" - "umlal v5.8h, v21.8b, v3.8b\n" - "umlal v7.8h, v22.8b, v3.8b\n" - "umlal v9.8h, v23.8b, v3.8b\n" - "umlal v11.8h, v24.8b, v3.8b\n" - - "umlal2 v6.8h, v21.16b, v3.16b\n" - "umlal2 v8.8h, v22.16b, v3.16b\n" - "umlal2 v10.8h, v23.16b, v3.16b\n" - "umlal2 v12.8h, v24.16b, v3.16b\n" - - "subs x9, x9, #8\n" - "bne 2b\n" - - "4:\n" // Wrap up computation for 32 input channels - "subs x2, x2, #32\n" - "bne 0b\n" - - // pipelined - "ucvtf v5.8h, v5.8h\n" - "ucvtf v6.8h, v6.8h\n" - "ldr q21, [%[b_0]]\n" - "ucvtf v7.8h, v7.8h\n" - "ldr q22, [%[b_1]]\n" - 
"ucvtf v8.8h, v8.8h\n" - "ldr q23, [%[s_0]]\n" - "ucvtf v9.8h, v9.8h\n" - "ldr q24, [%[s_1]]\n" - "ucvtf v10.8h, v10.8h\n" - "ucvtf v11.8h, v11.8h\n" - "mov v1.16b, v21.16b\n" - "ucvtf v12.8h, v12.8h\n" - "mov v2.16b, v22.16b\n" - "fmla v1.8h, v5.8h, v23.8h\n" - "fmla v2.8h, v6.8h, v24.8h\n" - "mov v3.16b, v21.16b\n" - "mov v4.16b, v22.16b\n" - "fmla v3.8h, v7.8h, v23.8h\n" - "fmla v4.8h, v8.8h, v24.8h\n" - "mov v5.16b, v21.16b\n" - "mov v6.16b, v22.16b\n" - - "fmla v5.8h, v9.8h, v23.8h\n" - "mov v7.16b, v21.16b\n" - "fmla v6.8h, v10.8h, v24.8h\n" - "mov v8.16b, v22.16b\n" - "fmla v7.8h, v11.8h, v23.8h\n" - "fmla v8.8h, v12.8h, v24.8h\n" - - "str q1, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q5, [%[out_0], #32]\n" //out_o0hw2 - "str q7, [%[out_0], #48]\n" //out_o0hw3 - - "str q2, [%[out_1]]\n" //out_o1hw0 - "str q4, [%[out_1], #16]\n" //out_o1hw1 - "str q6, [%[out_1], #32]\n" //out_o1hw2 - "str q8, [%[out_1], #48]\n" //out_o1hw3 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [fhfw]"r"((I64)fh*fw), - [s_0]"r"(s_o0), - [s_1]"r"(s_o1), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v21", "v22", "v23", "v24", "v29", "v30", "x0", "x1", "x2", "x3", "x4", "x9" - ); - s0 += 16; - s1 += 16; - b0 += 16; - b1 += 16; - } - } - // ohow_reminder % 4 - ohow_s = (ohow / 4) * 4; - for (U32 hw = ohow_s; hw < ohow; hw++) { - const F16 *s0 = scaleArray; - const F16 *s1 = scaleArray + 8; - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - BIN8 *in_order = ((BIN8*)tmp) + ic*ih_pad*iw_pad; // ic has been divided by 8 - // reorder input - // NCHWc8 => NHWChw1c8 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - BIN8 *in_hw1c8 = inArray_pad + c*ihiw + fh_idx*iw_pad + fw_idx; - BIN8 *in_0 = in_hw1c8 + in_h_0*iw_pad + in_w_0; - BIN8 *in_order_hw1c8 = in_order + c*fh*fw + fh_idx*fw + fw_idx; - *in_order_hw1c8 = (*in_0); - } - } - } - // compute - for (U32 o = 0; o < oc; o+=2) { - BIN8 *in_hw0 = in_order; - const BIN8 *f_o = filterArray + o*8*fh*fw*ic; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - - uint16x8_t sum[2] = {0}; - uint8x8_t v1 = vdup_n_u8(1); - for (U32 i = 0; i < ic*8; i += 32) { - uint8x8_t sub0[2] = {0}; - - for (U32 j = 0; j < 4; j++) { - uint8x8_t f_0 = vld1_u8(f_o); - uint8x8_t f_1 = vld1_u8(f_o+8); - f_o += 16; - uint8x8_t in_1 = vdup_n_u8(*in_hw0); - in_hw0++; - f_0 = vand_u8(in_1, f_0); - f_1 = vand_u8(in_1, f_1); - f_0 = vcnt_u8(f_0); - f_1 = vcnt_u8(f_1); - sub0[0] = vadd_u8(sub0[0], f_0); - sub0[1] = vadd_u8(sub0[1], f_1); - } - sum[0] = vmlal_u8(sum[0], sub0[0], v1); - sum[1] = vmlal_u8(sum[1], sub0[1], v1); - - for (U32 j = 1; j < fh*fw; j += 8) { - uint8x8_t sub1[2] = {0}; - for (U32 k = 0; k < 32; k++) { - uint8x8_t f_0 = vld1_u8(f_o); - uint8x8_t f_1 = vld1_u8(f_o+8); - f_o += 16; - uint8x8_t in_1 = vdup_n_u8(*in_hw0); - in_hw0++; - f_0 = vand_u8(in_1, f_0); - f_1 = vand_u8(in_1, f_1); - f_0 = vcnt_u8(f_0); - f_1 = vcnt_u8(f_1); - sub1[0] = vadd_u8(sub1[0], f_0); - sub1[1] = vadd_u8(sub1[1], f_1); - } - sum[0] = vmlal_u8(sum[0], sub1[0], v1); - sum[1] = vmlal_u8(sum[1], sub1[1], v1); - } - } - - float16x8_t res_o0 = vcvtq_f16_u16(sum[0]); - float16x8_t 
res_o1 = vcvtq_f16_u16(sum[1]); - float16x8_t scale_o0 = vld1q_f16(s0); - s0 += 16; - float16x8_t scale_o1 = vld1q_f16(s1); - s1 += 16; - float16x8_t bias_o0 = vld1q_f16(b0); - b0 += 16; - float16x8_t bias_o1 = vld1q_f16(b1); - b1 += 16; - res_o0 = vmulq_f16(res_o0, scale_o0); - res_o1 = vmulq_f16(res_o1, scale_o1); - res_o0 = vaddq_f16(res_o0, bias_o0); - res_o1 = vaddq_f16(res_o1, bias_o1); - vst1q_f16(out_o0hw0, res_o0); - vst1q_f16(out_o1hw0, res_o1); - } - } - } - return SUCCESS; -} -#endif diff --git a/tensor_computing/src/cpu/arm/bnn/convolution_transform_bnn.h b/tensor_computing/src/cpu/arm/bnn/convolution_transform_bnn.h deleted file mode 100644 index 14e39a71..00000000 --- a/tensor_computing/src/cpu/arm/bnn/convolution_transform_bnn.h +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
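Both DoReFa kernels above finish the same way: the 16-bit popcount accumulators are converted to FP16 (`ucvtf` in the assembly, `vcvtq_f16_u16` in the tail) and each output channel is completed as sum * scale + bias (`fmla`, or `vmulq_f16` followed by `vaddq_f16`). A scalar sketch of that epilogue, simplified to a flat channel loop (the real code walks scaleArray/biasArray in interleaved blocks of 8 per output-channel pair) and with float again standing in for F16:

    #include <cstdint>

    // Scalar model of the epilogue: convert each unsigned popcount
    // accumulator and fold in the per-channel scale and bias.
    void bnn_epilogue(const uint16_t *sum, const float *scale,
                      const float *bias, float *out, int channels) {
        for (int o = 0; o < channels; o++) {
            out[o] = (float)sum[o] * scale[o] + bias[o];
        }
    }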
- - -#ifndef _H_CONVOLUTION_TRANSFORM_BNN -#define _H_CONVOLUTION_TRANSFORM_BNN - -#ifdef _USE_FP16 -#include <bitset> -#include <string.h> - -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing.h" - - -inline void bitwise_copy(BIN8 srcVal, U32 srcBit, BIN8* dest, U32 destBit) { - std::bitset<8> Src(srcVal); - if (Src.test(srcBit)) { - *dest |= (1 << destBit); - } else { - *dest &= ~(1 << destBit); - } -} - -inline EE convolution_transform_filter_bnn(TensorDesc filterDesc, const BIN8* filterArray, - TensorDesc* ftmDesc, BIN8* ftmArray) -{ - /* - * NCHW => (N/16)*(C/8)*(H*W)*n16*c8 - */ - if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) - CHECK_STATUS(NULL_POINTER); - - DataType fdt; - DataFormat fdf; - U32 fn, fc, fh, fw; - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - switch (fdf) { - case DF_NCHWN16C8: - // Everything is ready - memcpy(ftmArray, filterArray, fn*fc*fh*fw/8*bytesOf(fdt)); - break; - case DF_NCHW: { - /* - * NCHW => NCHWN16C8 - * Now assume fn is divisible by 32 - */ - U32 oc = fn / 16; - U32 ic = fc / 8; - for (U32 o = 0; o < oc; o++) { - for (U32 c = 0; c < ic; c++) { - for (U32 hw = 0; hw < fh*fw; hw++) { - for (U32 o16 = 0; o16 < 16; o16++) { - for (U32 c8 = 0; c8 < 8; c8++) { - U32 ftmBitPos = o*fh*fw*ic*128 + c*fh*fw*128 + hw*128 + o16*8 + c8; - U32 ftmSlot = ftmBitPos / 8; - U32 ftmBitNo = 7 - (ftmBitPos % 8); - - U32 filterBitPos = (o*16+o16)*ic*8*fh*fw + (c*8+c8)*fh*fw + hw; - U32 filterSlot = filterBitPos / 8; - U32 filterBitNo = 7 - (filterBitPos % 8); - bitwise_copy(filterArray[filterSlot], filterBitNo, ftmArray+ftmSlot, ftmBitNo); - } - } - } - } - } - break; - } - default: - return NOT_MATCH; - } - *ftmDesc = tensor4df(fdt, DF_NCHWN16C8, fn, fc, fh, fw); - return SUCCESS; -} -#endif -#endif diff --git a/tensor_computing/src/cpu/arm/bnn/convolution_xnor.h b/tensor_computing/src/cpu/arm/bnn/convolution_xnor.h deleted file mode 100644 index a11cf4b7..00000000 --- a/tensor_computing/src/cpu/arm/bnn/convolution_xnor.h +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
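The filter transform just deleted addresses single bits: the destination position o*fh*fw*ic*128 + c*fh*fw*128 + hw*128 + o16*8 + c8 groups 16 output channels by 8 input channels, i.e. 128 bits per filter tap, and both layouts number bits from the MSB (7 - pos % 8). A small helper restating that address arithmetic (hypothetical, for illustration only):

    #include <cstdint>

    struct BitAddr {
        uint32_t byteIdx;  // which BIN8 byte holds the bit
        uint32_t bitNo;    // bit within the byte, counted from the MSB
    };

    // Destination address of one weight bit in NCHWN16C8, restating the
    // ftmBitPos arithmetic above. ic is fc/8 and fhfw is fh*fw.
    BitAddr nchwn16c8_addr(uint32_t o, uint32_t c, uint32_t hw,
                           uint32_t o16, uint32_t c8,
                           uint32_t ic, uint32_t fhfw) {
        uint32_t pos = o*fhfw*ic*128 + c*fhfw*128 + hw*128 + o16*8 + c8;
        return {pos / 8, 7 - (pos % 8)};
    }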
- - -#ifndef _H_CONVOLUTION_XNOR -#define _H_CONVOLUTION_XNOR - -#ifdef _USE_FP16 -#include -#include -#include "sys.h" -#include "type.h" -#include "error.h" -#include "tensor_desc.h" -#include "tensor_computing_type.h" - -EE convolution_xnor_A55(TensorDesc inputDesc, const F16* input, - TensorDesc filterDesc, const BIN8* filterArray, - ConvolutionDesc convDesc, - TensorDesc scaleDesc, const F16* scaleArray, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc); - -EE convolution_xnor_A76(TensorDesc inputDesc, const F16* input, - TensorDesc filterDesc, const BIN8* filterArray, - ConvolutionDesc convDesc, - TensorDesc scaleDesc, const F16* scaleArray, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc); - -inline EE convolution_xnor(TensorDesc inputDesc, const F16* input, - TensorDesc filterDesc, const BIN8* filter, - ConvolutionDesc convDesc, - TensorDesc scaleDesc, const F16* scale, - TensorDesc biasDesc, const F16* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* output, - ActivationDesc activationDesc, - Arch arch) -{ - EE ret = SUCCESS; - switch (arch) { - case ARM_A55: - ret = convolution_xnor_A55(inputDesc, input, - filterDesc, filter, - convDesc, - scaleDesc, scale, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - activationDesc); - break; - case ARM_A76: - ret = convolution_xnor_A76(inputDesc, input, - filterDesc, filter, - convDesc, - scaleDesc, scale, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - activationDesc); - break; - default: - return NOT_SUPPORTED; - } - return ret; -} -#endif -#endif diff --git a/tensor_computing/src/cpu/arm/bnn/convolution_xnor_A55.cpp b/tensor_computing/src/cpu/arm/bnn/convolution_xnor_A55.cpp deleted file mode 100644 index f46381e8..00000000 --- a/tensor_computing/src/cpu/arm/bnn/convolution_xnor_A55.cpp +++ /dev/null @@ -1,797 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
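The XNOR kernels dispatched above encode +1/-1 as bit 1/bit 0, so over K lanes dot(a, w) = K - 2 * popcount(a XOR w): agreeing bits contribute +1, differing bits -1. The deleted code below carries K as the precomputed base_s = fh*fw*ic*8 and subtracts twice the XOR popcount (the eor/cnt sequence accumulated with umlsl against the constant 2). A one-byte scalar statement of the identity (sketch only; std::popcount stands in for NEON cnt):

    #include <bit>
    #include <cstdint>

    // 8-lane XNOR dot product over {+1,-1} values packed as bits:
    // agreeing lanes add 1, differing lanes subtract 1.
    int xnor_dot8(uint8_t act, uint8_t wgt) {
        return 8 - 2 * std::popcount((unsigned)(act ^ wgt));
    }

Two identical bytes give 8 and two complementary bytes give -8; the kernels accumulate only the popcount side in u8/u16 vectors and fold in the base and the factor of 2 at the end, which is why base_v is materialized as eight int16 copies of base_s.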
- - -#ifdef _USE_FP16 -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" - -#include "cpu/arm/bnn/convolution_xnor.h" - -EE convolution_xnor_A55(TensorDesc inputDesc, const F16* input, - TensorDesc filterDesc, const BIN8* filterArray, - ConvolutionDesc convDesc, - TensorDesc scaleDesc, const F16* scaleArray, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc) -{ - UNUSED(scaleDesc); - UNUSED(biasDesc); - UNUSED(tmpBytes); - UNUSED(activationDesc); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - if (fdf != DF_NCHWN16C8) - CHECK_STATUS(NOT_MATCH); - if (!(ic == fc && oc == fn)) - CHECK_STATUS(NOT_MATCH); - - oc /= 8; - ic /= 8; - - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - U32 ohow = oh*ow; - U32 ihiw = ih_pad*iw_pad; - - BIN8* inArray = ((BIN8*)tmp) + ic*ihiw + 8*fh*fw*ic; // ic has been divided by 8 - BIN8 *inArray_pad; - - for (U32 n = 0; n < in; n++) { - const F16 *in = input + n*ic*ih*iw*8; - for (U32 i=0; i<ic*ih*iw; i++) { - BIN8 temp = 0; - for (U32 j=0; j<8; j++) { - if (in[i*8+j] >= 0) { - temp |= (1 << (7-j)); // set - } - } - inArray[i] = temp; - } - - if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { - inArray_pad = inArray + n*ic*ih*iw; // ic has been divided by 8 - } else { - // copy input into a input with padding - inArray_pad = (BIN8*)tmp; - BIN8 *inArray_pad_mov = inArray_pad; - BIN8 *inArray_mov = inArray + n*ic*ih*iw; - for (U32 c = 0; c < ic; c++) { // All divide by 8 - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*bytesOf(DT_BIN11)); - inArray_pad_mov += iw_pad; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*bytesOf(DT_BIN11)); - inArray_pad_mov += paddingL; - memcpy(inArray_pad_mov, inArray_mov, iw*bytesOf(DT_BIN11)); - inArray_pad_mov += iw; - inArray_mov += iw; - memset(inArray_pad_mov, 0, paddingR*bytesOf(DT_BIN11)); - inArray_pad_mov += paddingR; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*bytesOf(DT_BIN11)); - inArray_pad_mov += iw_pad; - } - } - } - // ohow / 8 - short base_s = fh*fw*ic*8; // For xnorNet, actual_sum = base_s - 2 * noOf1sFromXOR - short base_v[8]; // Assume the base can be represented as int16 - for (U32 i=0; i<8; i++) { - base_v[i] = base_s; - } - for (U32 hw = 0; hw < ohow-7; hw+=8) { - const F16 *s0 = scaleArray; - const F16 *s1 = scaleArray + 8; - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - BIN8 *in_order = ((BIN8*)tmp) + ic*ihiw; // ic has been divided by 8 - // reorder input - // NCHWc8 => NHWChw8c8 + im2col - U32 in_h[8]; - U32 in_w[8]; - for (U32 i=0; i<8; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; - } - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - BIN8 *in_hw8c8 = inArray_pad + c*ihiw + fh_idx*iw_pad + fw_idx; - // NHWChw8c8 - BIN8 *in_order_hw8c8 = in_order +
c*fh*fw*8 + fh_idx*fw*8 + fw_idx*8; // This 8 comes from hw8 - for (U32 i=0; i<8; i++) { - in_order_hw8c8[i] = *(in_hw8c8 + in_h[i]*iw_pad + in_w[i]); - } - } - } - } - - // compute - for (U32 o = 0; o < oc; o+=2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. - BIN8 *in_hw0 = in_order; - const BIN8 *f_o0c0 = filterArray + o*8*fh*fw*ic;// ic has been divided by 8 - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // scale and bias - const F16 *s_o0 = s0; - const F16 *s_o1 = s1; - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr q4, [%[base]]\n" - /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - - 13 14 - 15 16 - 17 18 - 19 20 - */ - "mov v5.16b, v4.16b\n" - "ldr d29, [%[in_0]]\n" //in_0 - "mov v6.16b, v4.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 - "mov v7.16b, v4.16b\n" - "ldr x2, [%[f_0], #8]\n" - "mov v8.16b, v4.16b\n" - "ins v0.d[1], x2\n" - "mov v9.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" //duplicate a full register - "mov v10.16b, v4.16b\n" - "dup v2.16b, v29.b[1]\n" - "mov v11.16b, v4.16b\n" - "mov v12.16b, v4.16b\n" - "mov v13.16b, v4.16b\n" - "mov v14.16b, v4.16b\n" - "mov v15.16b, v4.16b\n" - "mov v16.16b, v4.16b\n" - "mov v17.16b, v4.16b\n" - "mov v18.16b, v4.16b\n" - "mov v19.16b, v4.16b\n" - "mov v20.16b, v4.16b\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - - "0:\n" - "eor v21.16b, v21.16b, v21.16b\n" - "mov x9, %[fhfw]\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - "eor v25.16b, v25.16b, v25.16b\n" - "eor v26.16b, v26.16b, v26.16b\n" - "eor v27.16b, v27.16b, v27.16b\n" - "eor v28.16b, v28.16b, v28.16b\n" - - "mov x4, #4\n" - - "1:\n" - "eor v3.16b, v1.16b, v0.16b\n" - "ldr d30, [x0, 16]!\n" // next filter - - "eor v4.16b, v2.16b, v0.16b\n" - "ldr x1, [x0, 8]\n" - - "cnt v3.16b, v3.16b\n" - "subs x4, x4, #1\n" - - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[2]\n" - - "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter - "dup v2.16b, v29.b[3]\n" - - "add v22.16b, v22.16b, v4.16b\n" - "ins v30.d[1], x1\n" - - "eor v3.16b, v1.16b, v0.16b\n" - "eor v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[4]\n" - "add v23.16b, v23.16b, v3.16b\n" - "dup v2.16b, v29.b[5]\n" - "add v24.16b, v24.16b, v4.16b\n" - - "eor v3.16b, v1.16b, v0.16b\n" - "eor v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[6]\n" - "add v25.16b, v25.16b, v3.16b\n" - "dup v2.16b, v29.b[7]\n" - "add v26.16b, v26.16b, v4.16b\n" - - "eor v3.16b, v1.16b, v0.16b\n" - "ldr d29, [x3, 8]!\n" - "eor v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "mov v0.16b, v30.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" - "add v27.16b, v27.16b, v3.16b\n" - "dup v2.16b, v29.b[1]\n" - "add v28.16b, v28.16b, v4.16b\n" - "bne 1b\n" - - "movi v3.16b, #2\n" - "umlsl v5.8h, v21.8b, v3.8b\n" - "umlsl v7.8h, v22.8b, v3.8b\n" - "umlsl v9.8h, v23.8b, v3.8b\n" - "umlsl v11.8h, v24.8b, v3.8b\n" - "umlsl v13.8h, v25.8b, v3.8b\n" - "umlsl v15.8h, v26.8b, v3.8b\n" - "umlsl v17.8h, v27.8b, v3.8b\n" - "umlsl v19.8h, v28.8b, v3.8b\n" - - "umlsl2 v6.8h, v21.16b, v3.16b\n" - "umlsl2 v8.8h, v22.16b, v3.16b\n" - "umlsl2 v10.8h, v23.16b, v3.16b\n" - "umlsl2 v12.8h, v24.16b, v3.16b\n" - "umlsl2 v14.8h, v25.16b, v3.16b\n" - "umlsl2 v16.8h, v26.16b, v3.16b\n" - "umlsl2 v18.8h, 
v27.16b, v3.16b\n" - "umlsl2 v20.8h, v28.16b, v3.16b\n" - - "subs x9, x9, #1\n" - "beq 4f\n" // 1x1, continue with the next 32 input channels - - "2:\n" - "eor v21.16b, v21.16b, v21.16b\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - "eor v25.16b, v25.16b, v25.16b\n" - "eor v26.16b, v26.16b, v26.16b\n" - "eor v27.16b, v27.16b, v27.16b\n" - "eor v28.16b, v28.16b, v28.16b\n" - - "mov x4, #32\n" // Assume 256 will not happen - "3:\n" - "eor v3.16b, v1.16b, v0.16b\n" - "ldr d30, [x0, 16]!\n" // next filter - - "eor v4.16b, v2.16b, v0.16b\n" - "ldr x1, [x0, 8]\n" - - "cnt v3.16b, v3.16b\n" - "subs x4, x4, #1\n" - - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[2]\n" - - "uqadd v21.16b, v21.16b, v3.16b\n" - "dup v2.16b, v29.b[3]\n" - - "uqadd v22.16b, v22.16b, v4.16b\n" - "ins v30.d[1], x1\n" - - "eor v3.16b, v1.16b, v0.16b\n" - "eor v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[4]\n" - "uqadd v23.16b, v23.16b, v3.16b\n" - "dup v2.16b, v29.b[5]\n" - "uqadd v24.16b, v24.16b, v4.16b\n" - - "eor v3.16b, v1.16b, v0.16b\n" - "eor v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[6]\n" - "uqadd v25.16b, v25.16b, v3.16b\n" - "dup v2.16b, v29.b[7]\n" - "uqadd v26.16b, v26.16b, v4.16b\n" - - "eor v3.16b, v1.16b, v0.16b\n" - "ldr d29, [x3, 8]!\n" - "eor v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "mov v0.16b, v30.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" - "uqadd v27.16b, v27.16b, v3.16b\n" - "dup v2.16b, v29.b[1]\n" - "uqadd v28.16b, v28.16b, v4.16b\n" - "bne 3b\n" - - "movi v3.16b, #2\n" // actual sum = base - 2 * noOf1s - "umlsl v5.8h, v21.8b, v3.8b\n" - "umlsl v7.8h, v22.8b, v3.8b\n" - "umlsl v9.8h, v23.8b, v3.8b\n" - "umlsl v11.8h, v24.8b, v3.8b\n" - "umlsl v13.8h, v25.8b, v3.8b\n" - "umlsl v15.8h, v26.8b, v3.8b\n" - "umlsl v17.8h, v27.8b, v3.8b\n" - "umlsl v19.8h, v28.8b, v3.8b\n" - - "umlsl2 v6.8h, v21.16b, v3.16b\n" - "umlsl2 v8.8h, v22.16b, v3.16b\n" - "umlsl2 v10.8h, v23.16b, v3.16b\n" - "umlsl2 v12.8h, v24.16b, v3.16b\n" - "umlsl2 v14.8h, v25.16b, v3.16b\n" - "umlsl2 v16.8h, v26.16b, v3.16b\n" - "umlsl2 v18.8h, v27.16b, v3.16b\n" - "umlsl2 v20.8h, v28.16b, v3.16b\n" - - "subs x9, x9, #8\n" - "bne 2b\n" - - "4:\n" // Wrap up computation for 32 input channels - "subs x2, x2, #32\n" - "bne 0b\n" - - // pipelined - "scvtf v5.8h, v5.8h\n" - "scvtf v6.8h, v6.8h\n" - "ldr q21, [%[b_0]]\n" - "scvtf v7.8h, v7.8h\n" - "ldr q22, [%[b_1]]\n" - "scvtf v8.8h, v8.8h\n" - "ldr q23, [%[s_0]]\n" - "scvtf v9.8h, v9.8h\n" - "ldr q24, [%[s_1]]\n" - "scvtf v10.8h, v10.8h\n" - "scvtf v11.8h, v11.8h\n" - "mov v1.16b, v21.16b\n" - "scvtf v12.8h, v12.8h\n" - "mov v2.16b, v22.16b\n" - "scvtf v13.8h, v13.8h\n" - "fmla v1.8h, v5.8h, v23.8h\n" - "scvtf v14.8h, v14.8h\n" - "fmla v2.8h, v6.8h, v24.8h\n" - "scvtf v15.8h, v15.8h\n" - "mov v3.16b, v21.16b\n" - "scvtf v16.8h, v16.8h\n" - "mov v4.16b, v22.16b\n" - "scvtf v17.8h, v17.8h\n" - "fmla v3.8h, v7.8h, v23.8h\n" - "scvtf v18.8h, v18.8h\n" - "fmla v4.8h, v8.8h, v24.8h\n" - "scvtf v19.8h, v19.8h\n" - "mov v5.16b, v21.16b\n" - "scvtf v20.8h, v20.8h\n" - "mov v6.16b, v22.16b\n" - - "fmla v5.8h, v9.8h, v23.8h\n" - "mov v7.16b, v21.16b\n" - "fmla v6.8h, v10.8h, v24.8h\n" - "mov v8.16b, v22.16b\n" - "fmla v7.8h, v11.8h, v23.8h\n" - "mov v9.16b, v21.16b\n" - "fmla v8.8h, v12.8h, v24.8h\n" - "mov v10.16b, v22.16b\n" - "fmla v9.8h, v13.8h, v23.8h\n" - "mov v11.16b, v21.16b\n" - "fmla v10.8h, v14.8h, 
v24.8h\n" - "mov v12.16b, v22.16b\n" - "fmla v11.8h, v15.8h, v23.8h\n" - "mov v13.16b, v21.16b\n" - "fmla v12.8h, v16.8h, v24.8h\n" - "mov v14.16b, v22.16b\n" - "fmla v13.8h, v17.8h, v23.8h\n" - "mov v15.16b, v21.16b\n" - "fmla v14.8h, v18.8h, v24.8h\n" - "mov v16.16b, v22.16b\n" - "fmla v15.8h, v19.8h, v23.8h\n" - "fmla v16.8h, v20.8h, v24.8h\n" - - "str q1, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q5, [%[out_0], #32]\n" //out_o0hw2 - "str q7, [%[out_0], #48]\n" //out_o0hw3 - "str q9, [%[out_0], #64]\n" //out_o0hw4 - "str q11, [%[out_0], #80]\n" //out_o0hw5 - "str q13, [%[out_0], #96]\n" //out_o0hw6 - "str q15, [%[out_0], #112]\n" //out_o0hw7 - - "str q2, [%[out_1]]\n" //out_o1hw0 - "str q4, [%[out_1], #16]\n" //out_o1hw1 - "str q6, [%[out_1], #32]\n" //out_o1hw2 - "str q8, [%[out_1], #48]\n" //out_o1hw3 - "str q10, [%[out_1], #64]\n" //out_o1hw4 - "str q12, [%[out_1], #80]\n" //out_o1hw5 - "str q14, [%[out_1], #96]\n" //out_o1hw6 - "str q16, [%[out_1], #112]\n" //out_o1hw7 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [fhfw]"r"((I64)fh*fw), - [base]"r"(base_v), - [s_0]"r"(s_o0), - [s_1]"r"(s_o1), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", "x1", "x2", "x3", "x4", "x9" - ); - s0 += 16; - s1 += 16; - b0 += 16; - b1 += 16; - } - } - // ohow_remainder % 8 / 4 - U32 ohow_s = (ohow / 8) * 8; - - for (U32 hw = ohow_s; hw < ohow-3; hw+=4) { - const F16 *s0 = scaleArray; - const F16 *s1 = scaleArray + 8; - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - BIN8 *in_order = ((BIN8*)tmp) + ic*ihiw; // ic has been divided by 8 - // reorder input - // NCHWc8 => NHWChw4c8 + im2col - U32 in_h[4]; - U32 in_w[4]; - for (U32 i=0; i<4; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; - } - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - BIN8 *in_hw4c8 = inArray_pad + c*ihiw + fh_idx*iw_pad + fw_idx; - // NHWChw8c8 - BIN8 *in_order_hw4c8 = in_order + c*fh*fw*4 + fh_idx*fw*4 + fw_idx*4; - for (U32 i=0; i<4; i++) { - in_order_hw4c8[i] = *(in_hw4c8 + in_h[i]*iw_pad + in_w[i]); - } - } - } - } - - // compute - for (U32 o = 0; o < oc; o+=2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. 
- BIN8 *in_hw0 = in_order; - const BIN8 *f_o0c0 = filterArray + o*8*fh*fw*ic;// ic has been divided by 8 - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // scale and bias - const F16 *s_o0 = s0; - const F16 *s_o1 = s1; - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr q4, [%[base]]\n" - /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - */ - "mov v5.16b, v4.16b\n" - "ldr s29, [%[in_0]]\n" //in_0 - "mov v6.16b, v4.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 - "mov v7.16b, v4.16b\n" - "ldr x2, [%[f_0], #8]\n" - "mov v8.16b, v4.16b\n" - "ins v0.d[1], x2\n" - "mov v9.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" //duplicate a full register - "mov v10.16b, v4.16b\n" - "dup v2.16b, v29.b[1]\n" - "mov v11.16b, v4.16b\n" - "mov v12.16b, v4.16b\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - - "0:\n" - "eor v21.16b, v21.16b, v21.16b\n" - "mov x9, %[fhfw]\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - - "mov x4, #4\n" - - "1:\n" - "eor v3.16b, v1.16b, v0.16b\n" - "ldr d30, [x0, 16]!\n" // next filter - - "eor v4.16b, v2.16b, v0.16b\n" - "ldr x1, [x0, 8]\n" - - "cnt v3.16b, v3.16b\n" - "subs x4, x4, #1\n" - - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[2]\n" - - "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter - "dup v2.16b, v29.b[3]\n" - - "add v22.16b, v22.16b, v4.16b\n" - "ins v30.d[1], x1\n" - - "eor v3.16b, v1.16b, v0.16b\n" - "ldr s29, [x3, 4]!\n" - "eor v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "mov v0.16b, v30.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" - "add v23.16b, v23.16b, v3.16b\n" - "dup v2.16b, v29.b[1]\n" - "add v24.16b, v24.16b, v4.16b\n" - "bne 1b\n" - - "movi v3.16b, #2\n" - "umlsl v5.8h, v21.8b, v3.8b\n" - "umlsl v7.8h, v22.8b, v3.8b\n" - "umlsl v9.8h, v23.8b, v3.8b\n" - "umlsl v11.8h, v24.8b, v3.8b\n" - - "umlsl2 v6.8h, v21.16b, v3.16b\n" - "umlsl2 v8.8h, v22.16b, v3.16b\n" - "umlsl2 v10.8h, v23.16b, v3.16b\n" - "umlsl2 v12.8h, v24.16b, v3.16b\n" - - "subs x9, x9, #1\n" - "beq 4f\n" // 1x1, continue with the next 32 input channels - - "2:\n" - "eor v21.16b, v21.16b, v21.16b\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - - "mov x4, #32\n" // Assume 256 will not happen - "3:\n" - "eor v3.16b, v1.16b, v0.16b\n" - "ldr d30, [x0, 16]!\n" // next filter - - "eor v4.16b, v2.16b, v0.16b\n" - "ldr x1, [x0, 8]\n" - - "cnt v3.16b, v3.16b\n" - "subs x4, x4, #1\n" - - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[2]\n" - - "uqadd v21.16b, v21.16b, v3.16b\n" - "dup v2.16b, v29.b[3]\n" - - "uqadd v22.16b, v22.16b, v4.16b\n" - "ins v30.d[1], x1\n" - - "eor v3.16b, v1.16b, v0.16b\n" - "ldr s29, [x3, 4]!\n" - "eor v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "mov v0.16b, v30.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" - "uqadd v23.16b, v23.16b, v3.16b\n" - "dup v2.16b, v29.b[1]\n" - "uqadd v24.16b, v24.16b, v4.16b\n" - "bne 3b\n" - - "movi v3.16b, #2\n" - "umlsl v5.8h, v21.8b, v3.8b\n" - "umlsl v7.8h, v22.8b, v3.8b\n" - "umlsl v9.8h, v23.8b, v3.8b\n" - "umlsl v11.8h, v24.8b, v3.8b\n" - - "umlsl2 v6.8h, v21.16b, v3.16b\n" - "umlsl2 v8.8h, v22.16b, v3.16b\n" - "umlsl2 v10.8h, v23.16b, v3.16b\n" - "umlsl2 v12.8h, v24.16b, v3.16b\n" - - "subs x9, x9, #8\n" - "bne 2b\n" - - "4:\n" // Wrap up computation for 32 input channels - "subs x2, x2, #32\n" - "bne 
0b\n" - - // pipelined - "scvtf v5.8h, v5.8h\n" - "scvtf v6.8h, v6.8h\n" - "ldr q21, [%[b_0]]\n" - "scvtf v7.8h, v7.8h\n" - "ldr q22, [%[b_1]]\n" - "scvtf v8.8h, v8.8h\n" - "ldr q23, [%[s_0]]\n" - "scvtf v9.8h, v9.8h\n" - "ldr q24, [%[s_1]]\n" - "scvtf v10.8h, v10.8h\n" - "scvtf v11.8h, v11.8h\n" - "mov v1.16b, v21.16b\n" - "scvtf v12.8h, v12.8h\n" - "mov v2.16b, v22.16b\n" - "fmla v1.8h, v5.8h, v23.8h\n" - "fmla v2.8h, v6.8h, v24.8h\n" - "mov v3.16b, v21.16b\n" - "mov v4.16b, v22.16b\n" - "fmla v3.8h, v7.8h, v23.8h\n" - "fmla v4.8h, v8.8h, v24.8h\n" - "mov v5.16b, v21.16b\n" - "mov v6.16b, v22.16b\n" - - "fmla v5.8h, v9.8h, v23.8h\n" - "mov v7.16b, v21.16b\n" - "fmla v6.8h, v10.8h, v24.8h\n" - "mov v8.16b, v22.16b\n" - "fmla v7.8h, v11.8h, v23.8h\n" - "fmla v8.8h, v12.8h, v24.8h\n" - - "str q1, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q5, [%[out_0], #32]\n" //out_o0hw2 - "str q7, [%[out_0], #48]\n" //out_o0hw3 - - "str q2, [%[out_1]]\n" //out_o1hw0 - "str q4, [%[out_1], #16]\n" //out_o1hw1 - "str q6, [%[out_1], #32]\n" //out_o1hw2 - "str q8, [%[out_1], #48]\n" //out_o1hw3 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [fhfw]"r"((I64)fh*fw), - [base]"r"(base_v), - [s_0]"r"(s_o0), - [s_1]"r"(s_o1), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v21", "v22", "v23", "v24", "v29", "v30", "x0", "x1", "x2", "x3", "x4", "x9" - ); - s0 += 16; - s1 += 16; - b0 += 16; - b1 += 16; - } - } - // ohow_reminder % 4 - ohow_s = (ohow / 4) * 4; - for (U32 hw = ohow_s; hw < ohow; hw++) { - const F16 *s0 = scaleArray; - const F16 *s1 = scaleArray + 8; - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - BIN8 *in_order = ((BIN8*)tmp) + ic*ihiw; // ic has been divided by 8 - // reorder input - // NCHWc8 => NHWChw1c8 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - BIN8 *in_hw1c8 = inArray_pad + c*ihiw + fh_idx*iw_pad + fw_idx; - BIN8 *in_0 = in_hw1c8 + in_h_0*iw_pad + in_w_0; - BIN8 *in_order_hw1c8 = in_order + c*fh*fw + fh_idx*fw + fw_idx; - *in_order_hw1c8 = (*in_0); - } - } - } - // compute - for (U32 o = 0; o < oc; o+=2) { - BIN8 *in_hw0 = in_order; - const BIN8 *f_o = filterArray + o*8*fh*fw*ic; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - - uint16x8_t sum[2] = {0}; - uint8x8_t v2 = vdup_n_u8(2); - for (U32 i = 0; i < ic*8; i += 32) { - uint8x8_t sub0[2] = {0}; - - for (U32 j = 0; j < 4; j++) { - uint8x8_t f_0 = vld1_u8(f_o); - uint8x8_t f_1 = vld1_u8(f_o+8); - f_o += 16; - uint8x8_t in_1 = vdup_n_u8(*in_hw0); - in_hw0++; - f_0 = veor_u8(in_1, f_0); - f_1 = veor_u8(in_1, f_1); - f_0 = vcnt_u8(f_0); - f_1 = vcnt_u8(f_1); - sub0[0] = vadd_u8(sub0[0], f_0); - sub0[1] = vadd_u8(sub0[1], f_1); - } - sum[0] = vmlal_u8(sum[0], sub0[0], v2); - sum[1] = vmlal_u8(sum[1], sub0[1], v2); - - for (U32 j = 1; j < fh*fw; j+=8) { - uint8x8_t sub1[2] = {0}; - for (U32 k = 0; k < 32; k++) { - uint8x8_t f_0 = vld1_u8(f_o); - uint8x8_t f_1 = vld1_u8(f_o+8); - f_o += 16; - uint8x8_t in_1 = vdup_n_u8(*in_hw0); - in_hw0++; - f_0 = veor_u8(in_1, f_0); - f_1 = veor_u8(in_1, f_1); - f_0 = vcnt_u8(f_0); - f_1 = vcnt_u8(f_1); - sub1[0] = vadd_u8(sub1[0], f_0); - sub1[1] = vadd_u8(sub1[1], f_1); - 
} - sum[0] = vmlal_u8(sum[0], sub1[0], v2); - sum[1] = vmlal_u8(sum[1], sub1[1], v2); - } - } - short temp[16]; - vst1q_u16((uint16_t*)temp, sum[0]); - vst1q_u16((uint16_t*)(temp+8), sum[1]); - int16x8_t base_abs = vdupq_n_s16(base_s); - int16x8_t ssum[2]; - ssum[0] = vld1q_s16(temp); - ssum[1] = vld1q_s16(temp+8); - ssum[0] = vsubq_s16(base_abs, ssum[0]); - ssum[1] = vsubq_s16(base_abs, ssum[1]); - - float16x8_t res_o0 = vcvtq_f16_s16(ssum[0]); - float16x8_t res_o1 = vcvtq_f16_s16(ssum[1]); - float16x8_t scale_o0 = vld1q_f16(s0); - s0 += 16; - float16x8_t scale_o1 = vld1q_f16(s1); - s1 += 16; - float16x8_t bias_o0 = vld1q_f16(b0); - b0 += 16; - float16x8_t bias_o1 = vld1q_f16(b1); - b1 += 16; - res_o0 = vmulq_f16(res_o0, scale_o0); - res_o1 = vmulq_f16(res_o1, scale_o1); - res_o0 = vaddq_f16(res_o0, bias_o0); - res_o1 = vaddq_f16(res_o1, bias_o1); - vst1q_f16(out_o0hw0, res_o0); - vst1q_f16(out_o1hw0, res_o1); - } - } - } - return SUCCESS; -} -#endif diff --git a/tensor_computing/src/cpu/arm/bnn/convolution_xnor_A76.cpp b/tensor_computing/src/cpu/arm/bnn/convolution_xnor_A76.cpp deleted file mode 100644 index 46f0f57f..00000000 --- a/tensor_computing/src/cpu/arm/bnn/convolution_xnor_A76.cpp +++ /dev/null @@ -1,776 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
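Annotation on the two XNOR kernels deleted above and below: both reduce the binary dot product to XOR plus popcount. A minimal scalar sketch of that arithmetic follows, assuming +1/-1 values packed 8 per byte with a set bit meaning +1; xnor_dot is a hypothetical helper written for illustration, not code from this repository.

#include <stdint.h>

// Positions where input and filter bits differ contribute -1 to the dot
// product and agreeing positions contribute +1, hence
//     dot = totalBits - 2 * popcount(x XOR w).
// The NEON kernels accumulate popcount(EOR) with CNT/UQADD and fold in
// "base - 2 * ones" via UMLSL against the preloaded base_v vectors.
static int32_t xnor_dot(const uint8_t *x, const uint8_t *w, int nBytes)
{
    int32_t ones = 0;
    for (int i = 0; i < nBytes; i++) {
        ones += __builtin_popcount((unsigned)(x[i] ^ w[i]));
    }
    return nBytes * 8 - 2 * ones;  // cf. base_s = fh*fw*ic*8 in the kernels
}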
- - -#ifdef _USE_FP16 -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" - -#include "cpu/arm/bnn/convolution_xnor.h" - -EE convolution_xnor_A76(TensorDesc inputDesc, const F16* input, - TensorDesc filterDesc, const BIN8* filterArray, - ConvolutionDesc convDesc, - TensorDesc scaleDesc, const F16* scaleArray, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc) -{ - UNUSED(scaleDesc); - UNUSED(biasDesc); - UNUSED(tmpBytes); - UNUSED(activationDesc); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - if (fdf != DF_NCHWN16C8) - CHECK_STATUS(NOT_MATCH); - if (!(ic == fc && oc == fn)) - CHECK_STATUS(NOT_MATCH); - - oc /= 8; - ic /= 8; - - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - U32 ohow = oh*ow; - U32 ihiw = ih_pad*iw_pad; - - BIN8* inArray = ((BIN8*)tmp) + ic*ihiw + 8*fh*fw*ic; // ic has been divided by 8 - BIN8 *inArray_pad; - - for (U32 n = 0; n < in; n++) { - const F16 *in = input + n*ic*ih*iw*8; - for (U32 i=0; i < ic*ih*iw; i++) { - BIN8 temp = 0; - for (U32 j = 0; j < 8; j++) { - if (in[i*8 + j] >= 0) { - temp |= (1 << (7-j)); // set - } - } - inArray[i] = temp; - } - - if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { - inArray_pad = inArray + n*ic*ih*iw; // ic has been divided by 8 - } else { - // copy input into an input with padding - inArray_pad = (BIN8*)tmp; - BIN8 *inArray_pad_mov = inArray_pad; - BIN8 *inArray_mov = inArray + n*ic*ih*iw; - for (U32 c = 0; c < ic; c++) { // All divide by 8 - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*bytesOf(DT_BIN11)); - inArray_pad_mov += iw_pad; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*bytesOf(DT_BIN11)); - inArray_pad_mov += paddingL; - memcpy(inArray_pad_mov, inArray_mov, iw*bytesOf(DT_BIN11)); - inArray_pad_mov += iw; - inArray_mov += iw; - memset(inArray_pad_mov, 0, paddingR*bytesOf(DT_BIN11)); - inArray_pad_mov += paddingR; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*bytesOf(DT_BIN11)); - inArray_pad_mov += iw_pad; - } - } - } - // ohow / 8 - short base_s = fh*fw*ic*8; // For xnorNet, actual_sum = base_s - 2 * noOf1sFromXOR - short base_v[8]; // Assume the base can be represented as int16 - for (U32 i=0; i<8; i++) { - base_v[i] = base_s; - } - for (U32 hw = 0; hw < ohow - 7; hw+=8) { - const F16 *s0 = scaleArray; - const F16 *s1 = scaleArray + 8; - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - BIN8 *in_order = ((BIN8*)tmp) + ic*ihiw; // ic has been divided by 8 - // reorder input - // NCHWc8 => NHWChw8c8 + im2col - U32 in_h[8]; - U32 in_w[8]; - for (U32 i=0; i<8; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; - } - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - BIN8 *in_hw8c8 = inArray_pad + c*ihiw + fh_idx*iw_pad + fw_idx; - // NHWChw8c8 - BIN8 *in_order_hw8c8 = in_order + 
c*fh*fw*8 + fh_idx*fw*8 + fw_idx*8; // This 8 comes from hw8 - for (U32 i=0; i<8; i++) { - in_order_hw8c8[i] = *(in_hw8c8 + in_h[i]*iw_pad + in_w[i]); - } - } - } - } - - // compute - for (U32 o = 0; o < oc; o+=2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. - BIN8 *in_hw0 = in_order; - const BIN8 *f_o0c0 = filterArray + o*8*fh*fw*ic;// ic has been divided by 8 - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // scale and bias - const F16 *s_o0 = s0; - const F16 *s_o1 = s1; - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr q4, [%[base]]\n" - "ldr q0, [%[f_0]]\n" //f_0 - "ldr d29, [%[in_0]]\n" //in_0 - /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - - 13 14 - 15 16 - 17 18 - 19 20 - */ - "mov v5.16b, v4.16b\n" - "mov v6.16b, v4.16b\n" - "mov v7.16b, v4.16b\n" - "mov v8.16b, v4.16b\n" - "mov v9.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" //duplicate a full register - "mov v10.16b, v4.16b\n" - "dup v2.16b, v29.b[1]\n" - "mov v11.16b, v4.16b\n" - "mov v12.16b, v4.16b\n" - "mov v13.16b, v4.16b\n" - "mov v14.16b, v4.16b\n" - "mov v15.16b, v4.16b\n" - "mov v16.16b, v4.16b\n" - "mov v17.16b, v4.16b\n" - "mov v18.16b, v4.16b\n" - "mov v19.16b, v4.16b\n" - "mov v20.16b, v4.16b\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - - "0:\n" - "eor v21.16b, v21.16b, v21.16b\n" - "mov x9, %[fhfw]\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - "eor v25.16b, v25.16b, v25.16b\n" - "eor v26.16b, v26.16b, v26.16b\n" - "eor v27.16b, v27.16b, v27.16b\n" - "eor v28.16b, v28.16b, v28.16b\n" - - "mov x4, #4\n" - - "1:\n" - "eor v3.16b, v1.16b, v0.16b\n" - "eor v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "subs x4, x4, #1\n" - - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[2]\n" - - "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter - "dup v2.16b, v29.b[3]\n" - - "add v22.16b, v22.16b, v4.16b\n" - - "eor v3.16b, v1.16b, v0.16b\n" - "eor v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[4]\n" - "add v23.16b, v23.16b, v3.16b\n" - "dup v2.16b, v29.b[5]\n" - "add v24.16b, v24.16b, v4.16b\n" - - "eor v3.16b, v1.16b, v0.16b\n" - "eor v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[6]\n" - "add v25.16b, v25.16b, v3.16b\n" - "dup v2.16b, v29.b[7]\n" - "add v26.16b, v26.16b, v4.16b\n" - - "eor v3.16b, v1.16b, v0.16b\n" - "ldr d29, [x3, 8]!\n" - "eor v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "ldr q0, [x0, 16]!\n" // next filter - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" - "add v27.16b, v27.16b, v3.16b\n" - "dup v2.16b, v29.b[1]\n" - "add v28.16b, v28.16b, v4.16b\n" - "bne 1b\n" - - "movi v3.16b, #2\n" - "umlsl v5.8h, v21.8b, v3.8b\n" - "umlsl v7.8h, v22.8b, v3.8b\n" - "umlsl v9.8h, v23.8b, v3.8b\n" - "umlsl v11.8h, v24.8b, v3.8b\n" - "umlsl v13.8h, v25.8b, v3.8b\n" - "umlsl v15.8h, v26.8b, v3.8b\n" - "umlsl v17.8h, v27.8b, v3.8b\n" - "umlsl v19.8h, v28.8b, v3.8b\n" - - "umlsl2 v6.8h, v21.16b, v3.16b\n" - "umlsl2 v8.8h, v22.16b, v3.16b\n" - "umlsl2 v10.8h, v23.16b, v3.16b\n" - "umlsl2 v12.8h, v24.16b, v3.16b\n" - "umlsl2 v14.8h, v25.16b, v3.16b\n" - "umlsl2 v16.8h, v26.16b, v3.16b\n" - "umlsl2 v18.8h, v27.16b, v3.16b\n" - "umlsl2 v20.8h, v28.16b, v3.16b\n" - - "subs x9, x9, #1\n" - "beq 4f\n" // 1x1, continue with the next 32 
input channels - - "2:\n" - "eor v21.16b, v21.16b, v21.16b\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - "eor v25.16b, v25.16b, v25.16b\n" - "eor v26.16b, v26.16b, v26.16b\n" - "eor v27.16b, v27.16b, v27.16b\n" - "eor v28.16b, v28.16b, v28.16b\n" - - "mov x4, #32\n" // Assume 256 will not happen - "3:\n" - "eor v3.16b, v1.16b, v0.16b\n" - "eor v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[2]\n" - - "uqadd v21.16b, v21.16b, v3.16b\n" - "dup v2.16b, v29.b[3]\n" - - "uqadd v22.16b, v22.16b, v4.16b\n" - "subs x4, x4, #1\n" - - "eor v3.16b, v1.16b, v0.16b\n" - "eor v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[4]\n" - "uqadd v23.16b, v23.16b, v3.16b\n" - "dup v2.16b, v29.b[5]\n" - "uqadd v24.16b, v24.16b, v4.16b\n" - - "eor v3.16b, v1.16b, v0.16b\n" - "eor v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[6]\n" - "uqadd v25.16b, v25.16b, v3.16b\n" - "dup v2.16b, v29.b[7]\n" - "uqadd v26.16b, v26.16b, v4.16b\n" - - "eor v3.16b, v1.16b, v0.16b\n" - "ldr d29, [x3, 8]!\n" - "eor v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "ldr q0, [x0, 16]!\n" // next filter - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" - "uqadd v27.16b, v27.16b, v3.16b\n" - "dup v2.16b, v29.b[1]\n" - "uqadd v28.16b, v28.16b, v4.16b\n" - "bne 3b\n" - - "movi v3.16b, #2\n" // actual sum = base - 2 * noOf1s - "umlsl v5.8h, v21.8b, v3.8b\n" - "umlsl v7.8h, v22.8b, v3.8b\n" - "umlsl v9.8h, v23.8b, v3.8b\n" - "umlsl v11.8h, v24.8b, v3.8b\n" - "umlsl v13.8h, v25.8b, v3.8b\n" - "umlsl v15.8h, v26.8b, v3.8b\n" - "umlsl v17.8h, v27.8b, v3.8b\n" - "umlsl v19.8h, v28.8b, v3.8b\n" - - "umlsl2 v6.8h, v21.16b, v3.16b\n" - "umlsl2 v8.8h, v22.16b, v3.16b\n" - "umlsl2 v10.8h, v23.16b, v3.16b\n" - "umlsl2 v12.8h, v24.16b, v3.16b\n" - "umlsl2 v14.8h, v25.16b, v3.16b\n" - "umlsl2 v16.8h, v26.16b, v3.16b\n" - "umlsl2 v18.8h, v27.16b, v3.16b\n" - "umlsl2 v20.8h, v28.16b, v3.16b\n" - - "subs x9, x9, #8\n" - "bne 2b\n" - - "4:\n" // Wrap up computation for 32 input channels - "subs x2, x2, #32\n" - "bne 0b\n" - - // pipelined - "scvtf v5.8h, v5.8h\n" - "scvtf v6.8h, v6.8h\n" - "ldr q21, [%[b_0]]\n" - "ldr q22, [%[b_1]]\n" - "scvtf v7.8h, v7.8h\n" - "scvtf v8.8h, v8.8h\n" - "ldr q23, [%[s_0]]\n" - "ldr q24, [%[s_1]]\n" - "scvtf v9.8h, v9.8h\n" - "scvtf v10.8h, v10.8h\n" - "scvtf v11.8h, v11.8h\n" - "scvtf v12.8h, v12.8h\n" - "mov v1.16b, v21.16b\n" - "mov v2.16b, v22.16b\n" - "scvtf v13.8h, v13.8h\n" - "scvtf v14.8h, v14.8h\n" - "fmla v1.8h, v5.8h, v23.8h\n" - "fmla v2.8h, v6.8h, v24.8h\n" - "scvtf v15.8h, v15.8h\n" - "scvtf v16.8h, v16.8h\n" - "mov v3.16b, v21.16b\n" - "mov v4.16b, v22.16b\n" - "scvtf v17.8h, v17.8h\n" - "scvtf v18.8h, v18.8h\n" - "fmla v3.8h, v7.8h, v23.8h\n" - "fmla v4.8h, v8.8h, v24.8h\n" - "scvtf v19.8h, v19.8h\n" - "scvtf v20.8h, v20.8h\n" - "mov v5.16b, v21.16b\n" - "mov v6.16b, v22.16b\n" - - "fmla v5.8h, v9.8h, v23.8h\n" - "mov v7.16b, v21.16b\n" - "fmla v6.8h, v10.8h, v24.8h\n" - "mov v8.16b, v22.16b\n" - "fmla v7.8h, v11.8h, v23.8h\n" - "mov v9.16b, v21.16b\n" - "fmla v8.8h, v12.8h, v24.8h\n" - "mov v10.16b, v22.16b\n" - "fmla v9.8h, v13.8h, v23.8h\n" - "mov v11.16b, v21.16b\n" - "fmla v10.8h, v14.8h, v24.8h\n" - "mov v12.16b, v22.16b\n" - "fmla v11.8h, v15.8h, v23.8h\n" - "mov v13.16b, v21.16b\n" - "fmla v12.8h, v16.8h, v24.8h\n" - "mov v14.16b, v22.16b\n" - "fmla v13.8h, v17.8h, v23.8h\n" - "mov 
v15.16b, v21.16b\n" - "fmla v14.8h, v18.8h, v24.8h\n" - "mov v16.16b, v22.16b\n" - "fmla v15.8h, v19.8h, v23.8h\n" - "fmla v16.8h, v20.8h, v24.8h\n" - - "str q1, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q5, [%[out_0], #32]\n" //out_o0hw2 - "str q7, [%[out_0], #48]\n" //out_o0hw3 - "str q9, [%[out_0], #64]\n" //out_o0hw4 - "str q11, [%[out_0], #80]\n" //out_o0hw5 - "str q13, [%[out_0], #96]\n" //out_o0hw6 - "str q15, [%[out_0], #112]\n" //out_o0hw7 - - "str q2, [%[out_1]]\n" //out_o1hw0 - "str q4, [%[out_1], #16]\n" //out_o1hw1 - "str q6, [%[out_1], #32]\n" //out_o1hw2 - "str q8, [%[out_1], #48]\n" //out_o1hw3 - "str q10, [%[out_1], #64]\n" //out_o1hw4 - "str q12, [%[out_1], #80]\n" //out_o1hw5 - "str q14, [%[out_1], #96]\n" //out_o1hw6 - "str q16, [%[out_1], #112]\n" //out_o1hw7 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [fhfw]"r"((I64)fh*fw), - [base]"r"(base_v), - [s_0]"r"(s_o0), - [s_1]"r"(s_o1), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", - "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", - "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x0", "x2", "x3", "x4", "x9" - ); - s0 += 16; - s1 += 16; - b0 += 16; - b1 += 16; - } - } - // ohow_remainder % 8 / 4 - U32 ohow_s = (ohow / 8) * 8; - - for (U32 hw = ohow_s; hw < ohow-3; hw+=4) { - const F16 *s0 = scaleArray; - const F16 *s1 = scaleArray + 8; - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - BIN8 *in_order = ((BIN8*)tmp) + ic*ihiw; // ic has been divided by 8 - // reorder input - // NCHWc8 => NHWChw4c8 + im2col - U32 in_h[4]; - U32 in_w[4]; - for (U32 i=0; i<4; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; - } - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - BIN8 *in_hw4c8 = inArray_pad + c*ihiw + fh_idx*iw_pad + fw_idx; - // NHWChw8c8 - BIN8 *in_order_hw4c8 = in_order + c*fh*fw*4 + fh_idx*fw*4 + fw_idx*4; - for (U32 i=0; i<4; i++) { - in_order_hw4c8[i] = *(in_hw4c8 + in_h[i]*iw_pad + in_w[i]); - } - } - } - } - - // compute - for (U32 o = 0; o < oc; o+=2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. 
- BIN8 *in_hw0 = in_order; - const BIN8 *f_o0c0 = filterArray + o*8*fh*fw*ic;// ic has been divided by 8 - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // scale and bias - const F16 *s_o0 = s0; - const F16 *s_o1 = s1; - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr q4, [%[base]]\n" - "ldr q0, [%[f_0]]\n" //f_0 - "ldr s29, [%[in_0]]\n" //in_0 - /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - */ - "mov v5.16b, v4.16b\n" - "mov v6.16b, v4.16b\n" - "mov v7.16b, v4.16b\n" - "mov v8.16b, v4.16b\n" - "mov v9.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" //duplicate a full register - "mov v10.16b, v4.16b\n" - "dup v2.16b, v29.b[1]\n" - "mov v11.16b, v4.16b\n" - "mov v12.16b, v4.16b\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - - "0:\n" - "eor v21.16b, v21.16b, v21.16b\n" - "mov x9, %[fhfw]\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - - "mov x4, #4\n" - - "1:\n" - "eor v3.16b, v1.16b, v0.16b\n" - "eor v4.16b, v2.16b, v0.16b\n" - - "cnt v3.16b, v3.16b\n" - "subs x4, x4, #1\n" - - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[2]\n" - - "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter - "dup v2.16b, v29.b[3]\n" - - "add v22.16b, v22.16b, v4.16b\n" - - "eor v3.16b, v1.16b, v0.16b\n" - "ldr s29, [x3, 4]!\n" - "eor v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "ldr q0, [x0, 16]!\n" // next filter - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" - "add v23.16b, v23.16b, v3.16b\n" - "dup v2.16b, v29.b[1]\n" - "add v24.16b, v24.16b, v4.16b\n" - "bne 1b\n" - - "movi v3.16b, #2\n" - "umlsl v5.8h, v21.8b, v3.8b\n" - "umlsl v7.8h, v22.8b, v3.8b\n" - "umlsl v9.8h, v23.8b, v3.8b\n" - "umlsl v11.8h, v24.8b, v3.8b\n" - - "umlsl2 v6.8h, v21.16b, v3.16b\n" - "umlsl2 v8.8h, v22.16b, v3.16b\n" - "umlsl2 v10.8h, v23.16b, v3.16b\n" - "umlsl2 v12.8h, v24.16b, v3.16b\n" - - "subs x9, x9, #1\n" - "beq 4f\n" // 1x1, continue with the next 32 input channels - - "2:\n" - "eor v21.16b, v21.16b, v21.16b\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - - "mov x4, #32\n" // Assume 256 will not happen - "3:\n" - "eor v3.16b, v1.16b, v0.16b\n" - "eor v4.16b, v2.16b, v0.16b\n" - - "cnt v3.16b, v3.16b\n" - "subs x4, x4, #1\n" - - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[2]\n" - - "uqadd v21.16b, v21.16b, v3.16b\n" - "dup v2.16b, v29.b[3]\n" - - "uqadd v22.16b, v22.16b, v4.16b\n" - - "eor v3.16b, v1.16b, v0.16b\n" - "ldr s29, [x3, 4]!\n" - "eor v4.16b, v2.16b, v0.16b\n" - "cnt v3.16b, v3.16b\n" - "ldr q0, [x0, 16]!\n" // next filter - "cnt v4.16b, v4.16b\n" - "dup v1.16b, v29.b[0]\n" - "uqadd v23.16b, v23.16b, v3.16b\n" - "dup v2.16b, v29.b[1]\n" - "uqadd v24.16b, v24.16b, v4.16b\n" - "bne 3b\n" - - "movi v3.16b, #2\n" - "umlsl v5.8h, v21.8b, v3.8b\n" - "umlsl v7.8h, v22.8b, v3.8b\n" - "umlsl v9.8h, v23.8b, v3.8b\n" - "umlsl v11.8h, v24.8b, v3.8b\n" - - "umlsl2 v6.8h, v21.16b, v3.16b\n" - "umlsl2 v8.8h, v22.16b, v3.16b\n" - "umlsl2 v10.8h, v23.16b, v3.16b\n" - "umlsl2 v12.8h, v24.16b, v3.16b\n" - - "subs x9, x9, #8\n" - "bne 2b\n" - - "4:\n" // Wrap up computation for 32 input channels - "subs x2, x2, #32\n" - "bne 0b\n" - - // pipelined - "scvtf v5.8h, v5.8h\n" - "scvtf v6.8h, v6.8h\n" - "ldr q21, [%[b_0]]\n" - "scvtf v7.8h, v7.8h\n" - "ldr q22, [%[b_1]]\n" - "scvtf v8.8h, v8.8h\n" - "ldr q23, [%[s_0]]\n" - 
"scvtf v9.8h, v9.8h\n" - "ldr q24, [%[s_1]]\n" - "scvtf v10.8h, v10.8h\n" - "scvtf v11.8h, v11.8h\n" - "mov v1.16b, v21.16b\n" - "scvtf v12.8h, v12.8h\n" - "mov v2.16b, v22.16b\n" - "fmla v1.8h, v5.8h, v23.8h\n" - "fmla v2.8h, v6.8h, v24.8h\n" - "mov v3.16b, v21.16b\n" - "mov v4.16b, v22.16b\n" - "fmla v3.8h, v7.8h, v23.8h\n" - "fmla v4.8h, v8.8h, v24.8h\n" - "mov v5.16b, v21.16b\n" - "mov v6.16b, v22.16b\n" - - "fmla v5.8h, v9.8h, v23.8h\n" - "mov v7.16b, v21.16b\n" - "fmla v6.8h, v10.8h, v24.8h\n" - "mov v8.16b, v22.16b\n" - "fmla v7.8h, v11.8h, v23.8h\n" - "fmla v8.8h, v12.8h, v24.8h\n" - - "str q1, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q5, [%[out_0], #32]\n" //out_o0hw2 - "str q7, [%[out_0], #48]\n" //out_o0hw3 - - "str q2, [%[out_1]]\n" //out_o1hw0 - "str q4, [%[out_1], #16]\n" //out_o1hw1 - "str q6, [%[out_1], #32]\n" //out_o1hw2 - "str q8, [%[out_1], #48]\n" //out_o1hw3 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [fhfw]"r"((I64)fh*fw), - [base]"r"(base_v), - [s_0]"r"(s_o0), - [s_1]"r"(s_o1), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v21", "v22", "v23", "v24", "v29", "v30", "x0", "x1", "x2", "x3", "x4", "x9" - ); - s0 += 16; - s1 += 16; - b0 += 16; - b1 += 16; - } - } - // ohow_reminder % 4 - ohow_s = (ohow / 4) * 4; - for (U32 hw = ohow_s; hw < ohow; hw++) { - const F16 *s0 = scaleArray; - const F16 *s1 = scaleArray + 8; - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - BIN8 *in_order = ((BIN8*)tmp) + ic*ihiw; // ic has been divided by 8 - // reorder input - // NCHWc8 => NHWChw1c8 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - BIN8 *in_hw1c8 = inArray_pad + c*ihiw + fh_idx*iw_pad + fw_idx; - BIN8 *in_0 = in_hw1c8 + in_h_0*iw_pad + in_w_0; - BIN8 *in_order_hw1c8 = in_order + c*fh*fw + fh_idx*fw + fw_idx; - *in_order_hw1c8 = (*in_0); - } - } - } - // compute - for (U32 o = 0; o < oc; o+=2) { - BIN8 *in_hw0 = in_order; - const BIN8 *f_o = filterArray + o*8*fh*fw*ic; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - - uint16x8_t sum[2] = {0}; - uint8x8_t v2 = vdup_n_u8(2); - for (U32 i = 0; i < ic*8; i += 32) { - uint8x8_t sub0[2] = {0}; - - for (U32 j = 0; j < 4; j++) { - uint8x8_t f_0 = vld1_u8(f_o); - uint8x8_t f_1 = vld1_u8(f_o+8); - f_o += 16; - uint8x8_t in_1 = vdup_n_u8(*in_hw0); - in_hw0++; - f_0 = veor_u8(in_1, f_0); - f_1 = veor_u8(in_1, f_1); - f_0 = vcnt_u8(f_0); - f_1 = vcnt_u8(f_1); - sub0[0] = vadd_u8(sub0[0], f_0); - sub0[1] = vadd_u8(sub0[1], f_1); - } - sum[0] = vmlal_u8(sum[0], sub0[0], v2); - sum[1] = vmlal_u8(sum[1], sub0[1], v2); - - for (U32 j = 1; j < fh*fw; j+=8) { - uint8x8_t sub1[2] = {0}; - for (U32 k = 0; k < 32; k++) { - uint8x8_t f_0 = vld1_u8(f_o); - uint8x8_t f_1 = vld1_u8(f_o+8); - f_o += 16; - uint8x8_t in_1 = vdup_n_u8(*in_hw0); - in_hw0++; - f_0 = veor_u8(in_1, f_0); - f_1 = veor_u8(in_1, f_1); - f_0 = vcnt_u8(f_0); - f_1 = vcnt_u8(f_1); - sub1[0] = vadd_u8(sub1[0], f_0); - sub1[1] = vadd_u8(sub1[1], f_1); - } - sum[0] = vmlal_u8(sum[0], sub1[0], v2); - sum[1] = vmlal_u8(sum[1], sub1[1], v2); - } - } - short temp[16]; - vst1q_u16((uint16_t*)temp, sum[0]); - vst1q_u16((uint16_t*)(temp+8), sum[1]); - 
int16x8_t base_abs = vdupq_n_s16(base_s); - int16x8_t ssum[2]; - ssum[0] = vld1q_s16(temp); - ssum[1] = vld1q_s16(temp+8); - ssum[0] = vsubq_s16(base_abs, ssum[0]); - ssum[1] = vsubq_s16(base_abs, ssum[1]); - - float16x8_t res_o0 = vcvtq_f16_s16(ssum[0]); - float16x8_t res_o1 = vcvtq_f16_s16(ssum[1]); - float16x8_t scale_o0 = vld1q_f16(s0); - s0 += 16; - float16x8_t scale_o1 = vld1q_f16(s1); - s1 += 16; - float16x8_t bias_o0 = vld1q_f16(b0); - b0 += 16; - float16x8_t bias_o1 = vld1q_f16(b1); - b1 += 16; - res_o0 = vmulq_f16(res_o0, scale_o0); - res_o1 = vmulq_f16(res_o1, scale_o1); - res_o0 = vaddq_f16(res_o0, bias_o0); - res_o1 = vaddq_f16(res_o1, bias_o1); - vst1q_f16(out_o0hw0, res_o0); - vst1q_f16(out_o1hw0, res_o1); - } - } - } - return SUCCESS; -} -#endif diff --git a/tensor_computing/src/cpu/arm/bnn/tensor_computing_bnn.h b/tensor_computing/src/cpu/arm/bnn/tensor_computing_bnn.h deleted file mode 100644 index 3dd951d3..00000000 --- a/tensor_computing/src/cpu/arm/bnn/tensor_computing_bnn.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_TENSOR_COMPUTING_BNN -#define _H_TENSOR_COMPUTING_BNN - -#ifdef _USE_FP16 -#include "cpu/arm/bnn/convolution_transform_bnn.h" -#include "cpu/arm/bnn/convolution_dorefa.h" -#include "cpu/arm/bnn/convolution_xnor.h" - -EE convolution_infer_forward_tmp_bytes_bnn(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes); - -EE convolution_bnn(TensorDesc inputDesc, const F16* input, - TensorDesc filterDesc, const BIN8* filter, - ConvolutionDesc convDesc, - TensorDesc scaleDesc, const F16* scale, - TensorDesc biasDesc, const F16* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* output, - ActivationDesc activationDesc, - Arch arch); -#endif -#endif diff --git a/tensor_computing/src/cpu/arm/check.cpp b/tensor_computing/src/cpu/arm/check.cpp deleted file mode 100644 index ac18a698..00000000 --- a/tensor_computing/src/cpu/arm/check.cpp +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include "cpu/arm/tensor_computing_arm.h" -#include "arm_neon_expand.h" -#ifdef _USE_FP32 -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif - -EE check_u32(TensorDesc inputDescA, const U32* inputA, - TensorDesc inputDescB, const U32* inputB, - CheckMode checkMode, - TensorDesc outputDesc, I32* output) -{ - if (nullptr == inputA || nullptr == inputB || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) - CHECK_STATUS(NOT_MATCH); - - U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims-1]; - if (tensorNumElements(outputDesc) != loopOuter) - CHECK_STATUS(NOT_MATCH); - I32 length = size / loopOuter; - for (U32 j = 0; j < loopOuter; j++) { - const U32 *arrayA = inputA + j * length; - const U32 *arrayB = inputB + j * length; - switch (checkMode) { - case CHECK_EQUAL: { - uint32x4_t count_v = vdupq_n_u32(0); - I32 i = 0; - for (; i < length-3; i+=4) { - uint32x4_t a = vld1q_u32(arrayA + i); - uint32x4_t b = vld1q_u32(arrayB + i); - count_v = vsubq_u32(count_v, vceqq_u32(a, b)); // vceqq sets equal lanes to all ones (-1), so subtract to count matches - } - I32 count = vaddvq_u32(count_v); - for (; i < length; i++) - if (arrayA[i] == arrayB[i]) - count++; - output[j] = (count == length); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - } - return SUCCESS; -} - -EE check_arm(TensorDesc inputDescA, const void* inputA, - TensorDesc inputDescB, const void* inputB, - CheckMode checkMode, - TensorDesc outputDesc, void* output) -{ - DataType idt = inputDescA.dt; - EE ret = SUCCESS; - switch (idt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = check_fp32(inputDescA, (const F32*)inputA, - inputDescB, (const F32*)inputB, - checkMode, - outputDesc, (I32*)output); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = check_fp16(inputDescA, (const F16*)inputA, - inputDescB, (const F16*)inputB, - checkMode, - outputDesc, (I32*)output); - break; - } -#endif - case DT_U32: { - ret = check_u32(inputDescA, (const U32*)inputA, - inputDescB, (const U32*)inputB, - checkMode, - outputDesc, (I32*)output); - break; - } - case DT_I32: { - ret = check_u32(inputDescA, (const U32*)inputA, - inputDescB, (const U32*)inputB, - checkMode, - outputDesc, (I32*)output); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - - return ret; -} diff --git a/tensor_computing/src/cpu/arm/clip.cpp b/tensor_computing/src/cpu/arm/clip.cpp deleted 
file mode 100644 index 1e061223..00000000 --- a/tensor_computing/src/cpu/arm/clip.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/tensor_computing_arm.h" -#ifdef _USE_FP32 -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif - -EE clip_arm(void *minValue, void *maxValue, TensorDesc inputDesc, void* input, TensorDesc outputDesc, void *output) -{ - UNUSED(outputDesc); - - if (nullptr == minValue - || nullptr == maxValue) - CHECK_STATUS(NULL_POINTER); - - EE ret = SUCCESS; - switch (inputDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = clip_fp32((F32 *)input, (F32 *)output, tensorNumElements(inputDesc), *((F32 *)minValue), *((F32 *)maxValue)); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = clip_fp16((F16 *)input, (F16 *)output, tensorNumElements(inputDesc), *((F32 *)minValue), *((F32 *)maxValue)); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/concat.cpp b/tensor_computing/src/cpu/arm/concat.cpp deleted file mode 100644 index 4ef50a34..00000000 --- a/tensor_computing/src/cpu/arm/concat.cpp +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
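Annotation on the concat implementation that follows: it flattens the dims inner to the concat axis into a tile and the dims outer to it into a loop count, then interleaves one contiguous block per input per outer iteration. Below is a minimal sketch of that indexing under assumed plain dense layouts (no NCHWC8 special case); concat_naive is illustrative only, not part of this repository.

#include <string.h>

// out has outerLen * (sum of axisLen[j]) * innerLen elements; input j has
// outerLen * axisLen[j] * innerLen. Per outer index i, input j contributes
// one contiguous block of axisLen[j] * innerLen elements (this mirrors
// blockSize = inputDesc[j].dims[axis] * tileSize in the real code).
static void concat_naive(const float *const *in, const int *axisLen, int num,
                         float *out, int innerLen, int outerLen)
{
    for (int i = 0; i < outerLen; i++) {        // "loops" in the deleted code
        for (int j = 0; j < num; j++) {
            int block = axisLen[j] * innerLen;  // elements per copied tile
            memcpy(out, in[j] + (size_t)i * block, block * sizeof(float));
            out += block;
        }
    }
}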
- - -#include "cpu/arm/tensor_computing_arm.h" -#ifdef _USE_INT8 -#include "cpu/arm/int8/tensor_computing_int8.h" -#endif -#include - -EE concat(std::vector inputDesc, std::vector input, TensorDesc outputDesc, void* output, int axis) -{ - if (nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - U32 num = inputDesc.size(); - if (num < 1) { - return NOT_MATCH; - } - - int dim = outputDesc.nDims; - axis = (axis + dim) % dim; - axis = dim - 1 - axis; - U32 tileSize = bytesOf(outputDesc.dt); - for (I32 i = 0; i < axis; i++) { - tileSize *= outputDesc.dims[i]; - } - U32 loops = 1; - for (I32 i = axis + 1; i < dim; i++) { - loops *= outputDesc.dims[i]; - } - - if (outputDesc.df == DF_NCHWC8) { - if (axis < 2) { - tileSize *= 8; - loops /= 8; - } - } - - // DF should either all be NCHWC8, or all be non-C8 - bool isC8 = DF_NCHWC8 == outputDesc.df; - - U8 *ptr = (U8 *)output; - for (U32 i = 0; i < loops; i++) { - for (U32 j = 0; j < num; j++) { - if (nullptr == input[j]) { - CHECK_STATUS(NULL_POINTER); - } - if (isC8) { - if (DF_NCHWC8 != inputDesc[j].df) { - CHECK_REQUIREMENT(4 == inputDesc[j].nDims); - CHECK_REQUIREMENT(1 == inputDesc[j].dims[1] && 1 == inputDesc[j].dims[0]); - } - } else { - if (DF_NCHWC8 == inputDesc[j].df) { - CHECK_REQUIREMENT(4 == inputDesc[j].nDims); - CHECK_REQUIREMENT(1 == inputDesc[j].dims[1] && 1 == inputDesc[j].dims[0]); - } - } - U32 blockSize = inputDesc[j].dims[axis] * tileSize; - U8* srcPtr = (U8*)((input)[j]) + i * blockSize; - memcpy(ptr, srcPtr, blockSize); - ptr += blockSize; - } - } - return SUCCESS; -} - -EE concat_arm(std::vector inputDesc, std::vector input, void* inputScale, - TensorDesc outputDesc, void* output, void* outputScale, int axis) -{ - EE ret = SUCCESS; - switch (outputDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: { - UNUSED(inputScale); - UNUSED(outputScale); - ret = concat(inputDesc, input, - outputDesc, output, - axis); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - UNUSED(inputScale); - UNUSED(outputScale); - ret = concat(inputDesc, input, - outputDesc, output, - axis); - break; - } -#endif -#ifdef _USE_INT8 - case DT_I8: { - ret = concat_int8(inputDesc, input, (F32*)inputScale, - outputDesc, output, (F32*)outputScale, - axis); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/convolution.cpp b/tensor_computing/src/cpu/arm/convolution.cpp deleted file mode 100644 index 1a185642..00000000 --- a/tensor_computing/src/cpu/arm/convolution.cpp +++ /dev/null @@ -1,399 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include <string.h> -#include <float.h> -#include "cpu/arm/tensor_computing_arm.h" -#ifdef _USE_FP32 -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif -#ifdef _USE_INT8 -#include "cpu/arm/int8/tensor_computing_int8.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/bnn/tensor_computing_bnn.h" -#endif -#include "ut_util.h" -#include "tensor_computing_library_algorithm_search.h" - -EE convolution_infer_forward_algorithm_arm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionPolicy policy, ConvolutionForwardAlgorithm *algorithm, DataType targetDataType) -{ - UNUSED(outputDesc); - if (nullptr == algorithm) - CHECK_STATUS(NULL_POINTER); - if (*algorithm != CONVOLUTION_ALGORITHM_NULL) - return SUCCESS; - if (policy == CONVOLUTION_LIBRARY_SEARCH) { -#ifdef _USE_LIBRARY_TUNING - if (libraryAlgorithmMap.size() == 0) { - loadLibraryAlgorithmMapFromTxt(); - } - std::string name = "convolution_cpu_" + getConvolutionAlgorithmMapNameFromInput(inputDesc, - filterDesc, convDesc, targetDataType); - if (libraryAlgorithmMap.find(name) != libraryAlgorithmMap.end()) { - *algorithm = (ConvolutionForwardAlgorithm)libraryAlgorithmMap[name]; - return SUCCESS; - } else -#endif - { - policy = CONVOLUTION_FASTEST; - } - } - - EE ret = SUCCESS; - if (policy == CONVOLUTION_FASTEST) { - DataType idt, fdt; - DataFormat idf, fdf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - if (dilateH > 1 || dilateW > 1) { - *algorithm = CONVOLUTION_ALGORITHM_GEMM; - return SUCCESS; - } - - if (ic % 8 != 0 || idf != DF_NCHWC8) { - *algorithm = CONVOLUTION_ALGORITHM_GEMM_ICNCHW; - } else if (fh == 3 && fw == 3 && strideH == 1 && strideW == 1 && paddingT == 1 && paddingB == 1 && paddingL == 1 && paddingR == 1) { - *algorithm = CONVOLUTION_ALGORITHM_WINOGRAD; - } else { - *algorithm = CONVOLUTION_ALGORITHM_GEMM; - } - - switch (targetDataType) { - case DT_BIN01: { - *algorithm = CONVOLUTION_ALGORITHM_BNN; - break; - } - case DT_BIN11: { - *algorithm = CONVOLUTION_ALGORITHM_BNN; - break; - } - case DT_I8: { - if (*algorithm == CONVOLUTION_ALGORITHM_WINOGRAD) { - *algorithm = CONVOLUTION_ALGORITHM_GEMM; - } - break; - } - default: - break; - } - -#ifndef __aarch64__ - if (CONVOLUTION_ALGORITHM_GEMM_ICNCHW != *algorithm) { - *algorithm = CONVOLUTION_ALGORITHM_GEMM; - } - return SUCCESS; -#endif - } else if (policy == CONVOLUTION_TUNNING) { - std::vector<ConvolutionForwardAlgorithm> convolutionAlgorithms; - U32 filterBytes = 0; - U32 tmpBytes = 0; - for (U32 i = 0; i < convolutionAlgorithms.size(); i++) { - U32 bytes = 0; - CHECK_STATUS(convolution_transform_filter_bytes_arm(filterDesc, convolutionAlgorithms[i], &bytes)); - filterBytes = (bytes > filterBytes) ? 
bytes : filterBytes; - CHECK_STATUS(convolution_infer_forward_tmp_bytes_arm(inputDesc, filterDesc, outputDesc, - convDesc, convolutionAlgorithms[i], &bytes)); - tmpBytes = (bytes > tmpBytes) ? bytes : tmpBytes; - } - TensorDesc biasDesc = tensor1d(filterDesc.dt, outputDesc.dims[3]); - TensorDesc scaleDesc = tensor1d(DT_F32, outputDesc.dims[2]); - U8 *input = ut_input_v(tensorNumElements(inputDesc), inputDesc.dt, UT_INIT_RANDOM); - U8 *filter = ut_input_v(tensorNumElements(filterDesc), filterDesc.dt, UT_INIT_RANDOM); - U8 *filterTransformed = ut_input_v(filterBytes/bytesOf(filterDesc.dt), filterDesc.dt, UT_INIT_RANDOM); - U8 *bias = ut_input_v(tensorNumElements(biasDesc), biasDesc.dt, UT_INIT_RANDOM); - U8 *scale = ut_input_v(tensorNumElements(scaleDesc), scaleDesc.dt, UT_INIT_RANDOM); - U8 *tmp = ut_input_v(tmpBytes/bytesOf(inputDesc.dt), inputDesc.dt, UT_INIT_ZERO); - U8 *output = ut_input_v(tensorNumElements(outputDesc), outputDesc.dt, UT_INIT_ZERO); - U32 algorithmIndex = 0; - double timeMin = FLT_MAX; - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_RELU; - activationDesc.value[0] = 0; - for (U32 i = 0; i < convolutionAlgorithms.size(); i++) { - TensorDesc ftmDesc; - CHECK_STATUS(convolution_transform_filter_arm(filterDesc, filter, - convolutionAlgorithms[i], - &ftmDesc, filterTransformed)); - - memset(tmp, 0, tmpBytes); - double timeStart = ut_time_ms(); - CHECK_STATUS(convolution_arm(inputDesc, input, - ftmDesc, filterTransformed, - convDesc, - convolutionAlgorithms[i], - scaleDesc, scale, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - activationDesc, - ARM_A76)); - double timeEnd = ut_time_ms(); - if (timeMin > timeEnd - timeStart) { - timeMin = timeEnd - timeStart; - algorithmIndex = i; - } - } - free(input); - free(filter); - free(filterTransformed); - free(bias); - free(scale); - free(tmp); - free(output); - *algorithm = convolutionAlgorithms[algorithmIndex]; - ret = SUCCESS; - } else { - ret = NOT_SUPPORTED; - } - return ret; -} - -EE convolution_transform_filter_bytes_arm(TensorDesc filterDesc, ConvolutionForwardAlgorithm algorithm, U32* bytes) -{ - if (nullptr == bytes) - CHECK_STATUS(NULL_POINTER); - EE ret = SUCCESS; - - DataType fdt; - DataFormat fdf; - U32 fn, fc, fh, fw; - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - switch (algorithm) { - case CONVOLUTION_ALGORITHM_WINOGRAD: - *bytes = fn * fc * 6 * 6; - break; - case CONVOLUTION_ALGORITHM_DIRECT: - *bytes = fn * fc * fh * fw; - break; - case CONVOLUTION_ALGORITHM_GEMM: - *bytes = fn * fc * fh * fw; - break; - case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: - *bytes = fn * fc * fh * fw; - break; - case CONVOLUTION_ALGORITHM_BNN: - *bytes = fn * fc * fh * fw; - break; - default: - return NOT_SUPPORTED; - } - *bytes *= bytesOf(fdt); - - switch (filterDesc.dt) { - case DT_BIN01: { - *bytes /= 8; - break; - } - case DT_BIN11: { - *bytes /= 8; - break; - } - default: - break; - } - *bytes += 32; - return ret; -} - -EE convolution_transform_filter_arm(TensorDesc filterDesc, const void* filter, - ConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, void* filterTransformed) -{ - EE ret = SUCCESS; - switch (filterDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = convolution_transform_filter_fp32(filterDesc, (F32*)filter, algorithm, ftmDesc, (F32*)filterTransformed); - break; - } -#endif -#ifdef 
_USE_INT8 - case DT_I8: { - ret = convolution_transform_filter_int8(filterDesc, filter, algorithm, ftmDesc, filterTransformed); - break; - } - case DT_F16_8Q: { - ret = convolution_transform_filter_int8(filterDesc, filter, algorithm, ftmDesc, filterTransformed); - break; - } -#endif -#ifdef _USE_FP16 - case DT_BIN01: { - ret = convolution_transform_filter_bnn(filterDesc, (BIN8*)filter, ftmDesc, (BIN8*)filterTransformed); - break; - } - case DT_BIN11: { - ret = convolution_transform_filter_bnn(filterDesc, (BIN8*)filter, ftmDesc, (BIN8*)filterTransformed); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes) -{ - EE ret = SUCCESS; - switch (filterDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = convolution_infer_forward_tmp_bytes_fp32(inputDesc, filterDesc, outputDesc, convDesc, algorithm, bytes); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = convolution_infer_forward_tmp_bytes_fp16(inputDesc, filterDesc, outputDesc, convDesc, algorithm, bytes); - break; - } -#endif -#ifdef _USE_INT8 - case DT_I8: { - ret = convolution_infer_forward_tmp_bytes_int8(inputDesc, filterDesc, outputDesc, convDesc, algorithm, bytes); - break; - } -#endif -#ifdef _USE_FP16 - case DT_BIN01: { - ret = convolution_infer_forward_tmp_bytes_bnn(inputDesc, filterDesc, outputDesc, convDesc, algorithm, bytes); - break; - } - case DT_BIN11: { - ret = convolution_infer_forward_tmp_bytes_bnn(inputDesc, filterDesc, outputDesc, convDesc, algorithm, bytes); - break; - } -#endif - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - return ret; - -} - -EE convolution_arm(TensorDesc inputDesc, void* input, - TensorDesc filterDesc, const void* filter, - ConvolutionDesc convDesc, - ConvolutionForwardAlgorithm algorithm, - TensorDesc scaleDesc, const void* scale, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, void* output, - ActivationDesc activationDesc, - Arch arch) -{ - EE ret = SUCCESS; - UNUSED(scaleDesc); - UNUSED(scale); - switch (filterDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = convolution_fp32(inputDesc, (F32*)input, - filterDesc, (F32*)filter, - convDesc, - algorithm, - biasDesc, (F32*)bias, - tmpBytes, tmp, - outputDesc, (F32*)output, - activationDesc, - arch); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = convolution_fp16(inputDesc, (F16*)input, - filterDesc, (F16*)filter, - convDesc, - algorithm, - biasDesc, (F16*)bias, - tmpBytes, tmp, - outputDesc, (F16*)output, - activationDesc, - arch); - break; - } -#endif -#ifdef _USE_INT8 - case DT_I8: { - ret = convolution_int8(inputDesc, (INT8*)input, - filterDesc, (INT8*)filter, - (F16*)scale, - convDesc, - algorithm, - biasDesc, (F16*)bias, - tmpBytes, tmp, - outputDesc, output, - activationDesc, - arch); - break; - } -#endif -#ifdef _USE_FP16 - case DT_BIN01: { - ret = convolution_bnn(inputDesc, (F16*)input, - filterDesc, (BIN8*)filter, - convDesc, - scaleDesc, (F16*)scale, - biasDesc, (F16*)bias, - tmpBytes, tmp, - outputDesc, (F16*)output, - activationDesc, - arch); - break; - } - case DT_BIN11: { - ret = convolution_bnn(inputDesc, (F16*)input, - filterDesc, (BIN8*)filter, - convDesc, - scaleDesc, (F16*)scale, - biasDesc, (F16*)bias, - tmpBytes, tmp, - outputDesc, (F16*)output, - activationDesc, - arch); - break; - } -#endif - default: - ret = 
NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/deconvolution.cpp b/tensor_computing/src/cpu/arm/deconvolution.cpp deleted file mode 100644 index 13ed5800..00000000 --- a/tensor_computing/src/cpu/arm/deconvolution.cpp +++ /dev/null @@ -1,173 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/tensor_computing_arm.h" -#ifdef _USE_FP32 -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif - -EE deconvolution_infer_forward_algorithm_arm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionPolicy policy, ConvolutionForwardAlgorithm *algorithm, DataType targetDataType) -{ - if (nullptr == algorithm) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt, fdt; - DataFormat idf, fdf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - ConvolutionDesc transposedCD; - transposedCD.stride_h = 1; - transposedCD.stride_w = 1; - transposedCD.padding_top = 1; - transposedCD.padding_bottom = 1; - transposedCD.padding_left = 1; - transposedCD.padding_right = 1; - transposedCD.dilatedRate_h = 1; - transposedCD.dilatedRate_w = 1; - - U32 tPadding = (fh - 1 - paddingT) - 1; // Leave out padding of length 1 to activate Winograd - U32 bPadding = (fh - 1 - paddingB) - 1; - U32 lPadding = (fw - 1 - paddingL) - 1; - U32 rPadding = (fw - 1 - paddingR) - 1; - - ih = ih + (ih - 1) * (strideH - 1) + tPadding + bPadding; - iw = iw + (iw - 1) * (strideW - 1) + lPadding + rPadding; - - TensorDesc inPaddedDesc = tensor4df(idt, idf, in, ic, ih, iw); - - // Swap fn and fc - filterDesc.dims[2] = filterDesc.dims[3]; - filterDesc.dims[3] = ic; - EE ret = convolution_infer_forward_algorithm_arm(inPaddedDesc, filterDesc, outputDesc, transposedCD, policy, algorithm, targetDataType); - return ret; -} - -EE deconvolution_transform_filter_bytes_arm(TensorDesc filterDesc, ConvolutionForwardAlgorithm algorithm, U32* bytes) { - return convolution_transform_filter_bytes_arm(filterDesc, algorithm, bytes); -} - -EE 
deconvolution_transform_filter_arm(TensorDesc filterDesc, const void* filter, - ConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, void* filterTransformed) -{ - EE ret = SUCCESS; - switch (filterDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = deconvolution_transform_filter_fp32(filterDesc, (F32*)filter, algorithm, ftmDesc, (F32*)filterTransformed); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = deconvolution_transform_filter_fp16(filterDesc, (F16*)filter, algorithm, ftmDesc, (F16*)filterTransformed); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE deconvolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes) -{ - EE ret = SUCCESS; - switch (filterDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = deconvolution_infer_forward_tmp_bytes_fp32(inputDesc, filterDesc, outputDesc, convDesc, algorithm, bytes); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = deconvolution_infer_forward_tmp_bytes_fp16(inputDesc, filterDesc, outputDesc, convDesc, algorithm, bytes); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; - -} - -EE deconvolution_arm(TensorDesc inputDesc, void* input, - TensorDesc filterDesc, const void* filter, - ConvolutionDesc convDesc, - ConvolutionForwardAlgorithm algorithm, - TensorDesc scaleDesc, const void* scale, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, void* output, - ActivationDesc activationDesc, - Arch arch) -{ - UNUSED(scaleDesc); - UNUSED(scale); - - EE ret = SUCCESS; - switch (filterDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = deconvolution_fp32(inputDesc, (F32*)input, - filterDesc, (F32*)filter, - convDesc, - algorithm, - biasDesc, (F32*)bias, - tmpBytes, tmp, - outputDesc, (F32*)output, - activationDesc, - arch); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = deconvolution_fp16(inputDesc, (F16*)input, - filterDesc, (F16*)filter, - convDesc, - algorithm, - biasDesc, (F16*)bias, - tmpBytes, tmp, - outputDesc, (F16*)output, - activationDesc, - arch); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/depthwise_convolution.cpp b/tensor_computing/src/cpu/arm/depthwise_convolution.cpp deleted file mode 100644 index b6f2fa72..00000000 --- a/tensor_computing/src/cpu/arm/depthwise_convolution.cpp +++ /dev/null @@ -1,273 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/tensor_computing_arm.h" -#ifdef _USE_FP32 -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif -#ifdef _USE_INT8 -#include "cpu/arm/int8/tensor_computing_int8.h" -#endif - -EE depthwise_convolution_infer_forward_algorithm_arm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionPolicy policy, DepthwiseConvolutionForwardAlgorithm *algorithm, DataType targetDataType) -{ - UNUSED(policy); - if (nullptr == algorithm) { - CHECK_STATUS(NULL_POINTER); - } - EE ret = SUCCESS; - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - - switch (fdf) - { - case DF_NCHW: { - *algorithm = DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT; - if (convDesc.dilatedRate_h != 1 || convDesc.dilatedRate_w != 1) { - return ret; - } - break; - } - case DF_CHW_NC: { - *algorithm = DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT; - if (convDesc.dilatedRate_h != 1 || convDesc.dilatedRate_w != 1) { - return ret; - } - break; - } - default: - return NOT_MATCH; - } - - switch (targetDataType) { - case DT_F16: { - if (fdf == DF_NCHW) { - break; - } - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - if (fh == 3 && fw == 3 && strideH == 1 && strideW == 1 && paddingT == 1 && paddingB == 1 && paddingL == 1 && paddingR == 1 && ow % 4 == 0 && ow >= 12) { - *algorithm = DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_3X3S1P1; - } - else if (fh == 3 && fw == 3 && strideH == 2 && strideW == 2 && ow >= 28) { - *algorithm = DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT_NO_PADDING; - } - break; - } - default: { - break; - } - } - return ret; -} - -EE depthwise_convolution_transform_filter_bytes_arm(TensorDesc filterDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32* bytes) -{ - if (nullptr == bytes) { - CHECK_STATUS(NULL_POINTER); - } - DataType fdt; - DataFormat fdf; - U32 fn, fc, fh, fw; - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - U32 fhfw = fh * fw; - switch (algorithm) { - case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: - *bytes = fc * fhfw; - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: - *bytes = fc * fhfw + fn * fc; - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT_NO_PADDING: - *bytes = fc * fhfw + fn * fc; - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_3X3S1P1: - *bytes = fc * fhfw + fn * fc; - break; - default: - return NOT_SUPPORTED; - } - *bytes *= bytesOf(fdt); - *bytes += 32; - return SUCCESS; -} - -EE depthwise_convolution_transform_filter_arm(TensorDesc filterDesc, const void* filter, - DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, void* filterTransformed) -{ - EE ret = SUCCESS; - switch (filterDesc.dt) { -#ifdef _USE_FP16 - case DT_F16: { - ret = 
depthwise_convolution_transform_filter_fp16(filterDesc, (F16*)filter, algorithm, ftmDesc, (F16*)filterTransformed); - break; - } -#endif -#ifdef _USE_FP32 - case DT_F32: { - ret = depthwise_convolution_transform_filter_fp32(filterDesc, (F32*)filter, algorithm, ftmDesc, (F32*)filterTransformed); - break; - } -#endif -#ifdef _USE_INT8 - case DT_I8: { - ret = depthwise_convolution_transform_filter_int8(filterDesc, (INT8*)filter, algorithm, ftmDesc, (INT8*)filterTransformed); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE depthwise_convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32 *bytes) -{ - if (nullptr == bytes) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - EE ret = SUCCESS; - switch (algorithm) { - case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: - *bytes = ic * ih_pad * iw_pad; - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: - *bytes = ic * ih_pad * iw_pad + ic * oh * ow; - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT_NO_PADDING: - *bytes = ic * oh * ow; - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_3X3S1P1: - *bytes = ic * oh * ow + ic * 8; - break; - default: { - ret = NOT_MATCH; - *bytes = 0; - break; - } - } - *bytes *= bytesOf(idt); - - switch (filterDesc.dt) { -#ifdef _USE_INT8 - case DT_I8: { - *bytes += ic * oh * ow * sizeof(I32); - break; - } -#endif - default: - break; - } - *bytes += 32; - return ret; -} - -EE depthwise_convolution_arm(TensorDesc inputDesc, void* input, - TensorDesc filterDesc, const void* filter, - ConvolutionDesc convDesc, - DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, void* output, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc, - Arch arch) -{ - EE ret = SUCCESS; - switch (filterDesc.dt) { -#ifdef _USE_FP16 - case DT_F16: { - ret = depthwise_convolution_fp16(inputDesc, (F16*)input, - filterDesc, (const F16*)filter, - convDesc, - algorithm, - biasDesc, (const F16*)bias, - tmpBytes, tmp, - outputDesc, (F16*)output, - depthwiseActivationDesc, - pointwiseActivationDesc, - arch); - break; - } -#endif -#ifdef _USE_FP32 - case DT_F32: { - ret = depthwise_convolution_fp32(inputDesc, (F32*)input, - filterDesc, (const F32*)filter, - convDesc, - algorithm, - biasDesc, (const F32*)bias, - tmpBytes, tmp, - outputDesc, (F32*)output, - depthwiseActivationDesc, - pointwiseActivationDesc, - arch); - break; - } -#endif -#ifdef _USE_INT8 - case DT_I8: { - ret = depthwise_convolution_int8(inputDesc, (INT8*)input, - filterDesc, (INT8*)filter, - convDesc, - algorithm, - biasDesc, (I32*)bias, - tmpBytes, tmp, - outputDesc, (I32*)output, - depthwiseActivationDesc, - pointwiseActivationDesc, - arch); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - 
} - return ret; -} diff --git a/tensor_computing/src/cpu/arm/detectionoutput.cpp b/tensor_computing/src/cpu/arm/detectionoutput.cpp deleted file mode 100644 index a744f525..00000000 --- a/tensor_computing/src/cpu/arm/detectionoutput.cpp +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include "cpu/arm/tensor_computing_arm.h" -#ifdef _USE_FP32 -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif - - -EE detectionoutput_qsort_descent_arm(std::vector<BoxRect>& boxes, std::vector<F32>& scores, int left, int right) -{ - if (boxes.empty() || scores.empty()) - return NOT_SUPPORTED; - - int i = left; - int j = right; - F32 temp = scores[(left+right) / 2]; - - while (i <= j){ - while(scores[i] > temp) - i++; - while(scores[j] < temp) - j--; - if(i<=j){ - std::swap(boxes[i], boxes[j]); - std::swap(scores[i], scores[j]); - i++; - j--; - } - } - - if (left < j) - detectionoutput_qsort_descent_arm(boxes, scores, left, j); - if (i < right) - detectionoutput_qsort_descent_arm(boxes, scores, i, right); - - return SUCCESS; -} - -F32 detectionoutput_intersectionarea_arm(BoxRect a, BoxRect b) -{ - if (a.xmin > b.xmax || a.xmax < b.xmin || a.ymin > b.ymax || a.ymax < b.ymin) - { - return 0.f; - } - F32 inter_width = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin); - F32 inter_height = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin); - - return inter_width * inter_height; -} - -EE detectionoutput_nms_pickedboxes_arm(std::vector<BoxRect> boxes, std::vector<I64>& picked, F32 nms_threshold) -{ - I64 n = boxes.size(); - - std::vector<F32> areas(n); - for(I64 i = 0; i < n; i++){ - BoxRect box = boxes[i]; - - F32 width = box.xmax - box.xmin; - F32 height = box.ymax - box.ymin; - - areas[i] = width * height; - } - for(I64 i = 0; i < n; i++){ - - BoxRect a = boxes[i]; - int keep = 1; - for(int j = 0; j < (int)picked.size(); j++){ - BoxRect b = boxes[picked[j]]; - F32 inter_area = detectionoutput_intersectionarea_arm(a,b); - F32 union_area = areas[i] + areas[picked[j]] - inter_area; - - if(inter_area / union_area > nms_threshold) - keep = 0; - } - if(keep){ - picked.push_back(i); - } - } - return SUCCESS; -} - -EE detectionoutput_arm(std::vector<TensorDesc> inputDesc, std::vector<void*> input, DetectionOutputDesc detectionoutputDesc, TensorDesc outputDesc, void* output) -{ - EE ret = SUCCESS; - switch (inputDesc[0].dt){ -#ifdef _USE_FP32 - case DT_F32: { - ret = 
detectionoutput_fp32(inputDesc, input, detectionoutputDesc, outputDesc, (F32*)output); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = detectionoutput_fp16(inputDesc, input, detectionoutputDesc, outputDesc, (F16*)output); - break; - } -#endif - default: { - ret = NOT_SUPPORTED; - break; - } - } - return ret; -} \ No newline at end of file diff --git a/tensor_computing/src/cpu/arm/eltwise.cpp b/tensor_computing/src/cpu/arm/eltwise.cpp deleted file mode 100644 index b50d4766..00000000 --- a/tensor_computing/src/cpu/arm/eltwise.cpp +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include <vector> -#include "cpu/arm/tensor_computing_arm.h" -#ifdef _USE_FP32 -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif - -std::vector<U32> calculateLocalIndex_arm(U32 index, TensorDesc desc) { - std::vector<U32> indexes(desc.nDims); - for (U32 i = 0; i < desc.nDims; i++) { - indexes[i] = index % desc.dims[i]; - index /= desc.dims[i]; - } - return indexes; -} - -U32 calculateGlobalIndex_arm(std::vector<U32> indexes, TensorDesc desc) { - U32 index = 0; - for (int i = ((int)desc.nDims) - 1; i >= 0; i--) { - index = index * desc.dims[i] + indexes[i]; - } - return index; - -} - -std::vector<U32> calculateRelativeLocalIndex_arm(std::vector<U32> indexes, TensorDesc desc) { - std::vector<U32> relativeIndexes(desc.nDims); - for (U32 i = 0; i < desc.nDims; i++) { - relativeIndexes[i] = indexes[i] % desc.dims[i]; - } - return relativeIndexes; -} - -// [1, 10, 10] + [1, 10, 10] = [1, 10, 10] -// [1, 10, 1] + [1, 1, 10] = [1, 10, 10] -// [1, 20, 10] + [10] = [1,
20, 10] + [1, 1, 10] = [1, 20, 10] -EE eltwise_arm(std::vector<TensorDesc> inputDesc, std::vector<void*> input, - TensorDesc outputDesc, void* output, EltwiseMode eltwiseMode) { - U32 num = inputDesc.size(); - if(num <= 1 || outputDesc.nDims < 1) return NOT_MATCH; - I32 oneCount = 0; - for (int i = 0; i < ((int)outputDesc.nDims)-1; i++) { - if(outputDesc.dims[i] == 1) - oneCount ++; - else - break; - } - TensorDesc newOutputDesc = outputDesc; - for (int i = 0; i < (int)outputDesc.nDims - oneCount; i++) - newOutputDesc.dims[i] = outputDesc.dims[oneCount+i]; - newOutputDesc.nDims = outputDesc.nDims - oneCount; - - std::vector<TensorDesc> newInputDesc(num); - for (U32 i = 0; i < num; i++) { - newInputDesc[i] = inputDesc[i]; - for (int j = 0; j < (int)inputDesc[i].nDims - oneCount; j++) - newInputDesc[i].dims[j] = inputDesc[i].dims[oneCount+j]; - newInputDesc[i].nDims = inputDesc[i].nDims - oneCount; - for (U32 j = newInputDesc[i].nDims; j < newOutputDesc.nDims; j++) { - newInputDesc[i].dims[j] = 1; - } - newInputDesc[i].nDims = newOutputDesc.nDims; - } - U32 size = tensorNumElements(newOutputDesc); - U32 lastDimSize = newOutputDesc.dims[0]; - std::vector<U32> lastDimSizes(num); - for (U32 i = 0; i < num; i++) - lastDimSizes[i] = newInputDesc[i].dims[0]; - for (U32 i = 1; i < newOutputDesc.nDims; i++) { - bool sameDim = true; - for (U32 j = 0; j < num; j++) { - if (newInputDesc[j].dims[i] != newOutputDesc.dims[i]) { - sameDim = false; - break; - } - } - if (sameDim) { - lastDimSize *= newOutputDesc.dims[i]; - for (U32 j = 0; j < num; j++) { - lastDimSizes[j] *= newInputDesc[j].dims[i]; - } - } else { - break; - } - } - - std::vector<void*> newInput(num); - EE ret = SUCCESS; - for (U32 i = 0; i < size; i+=lastDimSize) { - std::vector<U32> index = calculateLocalIndex_arm(i, newOutputDesc); - for (U32 j = 0; j < num; j++) { - std::vector<U32> relativeIndex = calculateRelativeLocalIndex_arm(index, newInputDesc[j]); - U32 globalIndex = calculateGlobalIndex_arm(relativeIndex, newInputDesc[j]); - newInput[j] = (U8*)(input[j]) + globalIndex * bytesOf(newInputDesc[j].dt); - } - U8* newOutput = (U8*)output + i * bytesOf(newOutputDesc.dt); - switch (newOutputDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = eltwise_fp32(newInput, lastDimSizes, num, lastDimSize, newOutput, eltwiseMode); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = eltwise_fp16(newInput, lastDimSizes, num, lastDimSize, newOutput, eltwiseMode); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp16/arm_functions_fp16.h b/tensor_computing/src/cpu/arm/fp16/arm_functions_fp16.h deleted file mode 100644 index b8c37cd2..00000000 --- a/tensor_computing/src/cpu/arm/fp16/arm_functions_fp16.h +++ /dev/null @@ -1,324 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_ARM_FUNCTIONS_FP16 -#define _H_ARM_FUNCTIONS_FP16 - -#ifdef _USE_FP16 -#include "arm_neon_expand.h" -#include <math.h> -#include "tensor_computing_type.h" - -// array sum -inline F32 array_sum_f16(const F16 *data, I32 len) { - if(len <= 0) return 0; - - I32 i = 0; - F32 sum_s = 0; - float16x8_t sum_v = vdupq_n_f16(0); - for(i = 0; i < len - 7; i+=8){ - float16x8_t in = vld1q_f16(data + i); - sum_v = vaddq_f16(sum_v, in); - } - sum_s += vaddvq_f16(sum_v); - for(; i < len; i++){ - sum_s += data[i]; - } - return sum_s; -} - -// array mean -inline F32 array_mean_f16(const F16 *data, I32 len) { - if(len <= 0) return 0; - return array_sum_f16(data, len) / len; -} - -// array var -inline F32 array_var_f16(const F16 *data, I32 len, F32 mean) { - if(len <= 0) return 0; - - I32 i = 0; - F32 sum_s = 0; - float32x4_t mean_v = vdupq_n_f32(mean); - for(i = 0; i < len - 3; i+=4){ - float16x4_t in = vld1_f16(data + i); - float32x4_t in_f32 = vcvt_f32_f16(in); - float32x4_t tmp_v = vsubq_f32(in_f32, mean_v); - float32x4_t sum_v = vmulq_f32(tmp_v, tmp_v); - sum_s += vaddvq_f32(sum_v); - } - for(; i < len; i++){ - F16 in = data[i]; - F32 tmp = in - mean; - sum_s += tmp * tmp; - } - return sum_s / len; -} - -// array max -inline F16 array_max_f16(const F16* data, I32 len) { - F16 max_s = data[0]; - I32 i = 0; - if(len >= 8){ - float16x8_t max_v, tmp_v; - max_v = vld1q_f16(data); - for(i = 8; i < len - 7; i+=8){ - tmp_v = vld1q_f16(data + i); - max_v = vmaxq_f16(tmp_v, max_v); - } - max_s = vmaxvq_f16(max_v); - } - - for(; i < len; i++){ - if(data[i] > max_s) - max_s = data[i]; - } - - return max_s; -} - -inline F16 array_maxabs_f16(const F16* data, I32 len) -{ - F16 max_s = std::abs(data[0]); - I32 i = 0; - if (len >= 8) { - float16x8_t max_v, tmp_v; - max_v = vld1q_f16(data); - max_v = vabsq_f16(max_v); - for(i = 8; i < len - 7; i+=8){ - tmp_v = vld1q_f16(data + i); - tmp_v = vabsq_f16(tmp_v); - max_v = vmaxq_f16(tmp_v, max_v); - } - max_s = vmaxvq_f16(max_v); - } - - for ( ; i < len; i++) { - if(std::abs(data[i]) > max_s) - max_s = std::abs(data[i]); - } - - return max_s; -} - -inline void array_scale_f16(F16 *input, F16 *output, I32 len, F32 alpha, F32 beta) { - I32 i = 0; -#ifdef _USE_F16_MIX_PRECISION - float32x4_t alpha_v = vdupq_n_f32(alpha); - float32x4_t beta_v = vdupq_n_f32(beta); - for(i = 0; i < len - 3; i+=4){ - float16x4_t in = vld1_f16(input + i); - float32x4_t in_f32 = vcvt_f32_f16(in); - float32x4_t result = vfmaq_f32(beta_v, alpha_v, in_f32); - vst1_f16(output + i, vcvt_f16_f32(result)); - } -#else - float16x8_t alpha_v = vdupq_n_f16(alpha); - float16x8_t beta_v = vdupq_n_f16(beta); - for (i = 0; i < len - 7; i += 8) { - float16x8_t in = vld1q_f16(input + i); - float16x8_t tmp_v = vfmaq_f16(beta_v, alpha_v, in); - vst1q_f16(output+i, tmp_v); - } -#endif - for (; i < len; i++) { - output[i] = alpha * input[i] + beta; - } -} - -inline EE activation_fp16(F16* input, U32 len, ActivationDesc activationDesc, F16* output) -{ - float16x8_t in, out; - float16x8_t zero = vdupq_n_f16(float16_t(0.)); - float16x8_t one = 
vdupq_n_f16(float16_t(1.)); - float16x8_t three = vdupq_n_f16(float16_t(3.)); - float16x8_t six = vdupq_n_f16(float16_t(6.)); - U32 len_main = len / 8; - U32 len_tail = len % 8; - - F16 value; - switch (activationDesc.mode){ - case ACTIVATION_NULL: { - break; - } - case ACTIVATION_RELU: { - if (activationDesc.value[0] == 0) { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vmaxq_f16(zero, in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = (input[i] < 0) ? 0 : input[i]; - } - } else { - float16x8_t scale = vdupq_n_f16(activationDesc.value[0]); - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - float16x8_t tmp = vmulq_f16(scale, in); - out = vmaxq_f16(tmp, in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - float tmp = activationDesc.value[0] * input[i]; - output[i] = (input[i] < tmp) ? tmp : input[i]; - } - } - break; - } - case ACTIVATION_RELU6: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vmaxq_f16(zero, in); - out = vminq_f16(six, out); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = (input[i] < 0) ? 0 : input[i]; - if (value > 6) { - value = 6; - } - output[i] = value; - } - break; - } - case ACTIVATION_H_SIGMOID: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vaddq_f16(in, three); - out = vmaxq_f16(out, zero); - out = vminq_f16(out, six); - out = vdivq_f16(out, six); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 6 : value; - value = value / 6; - output[i] = value; - } - break; - } - case ACTIVATION_H_SWISH: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vaddq_f16(in, three); - out = vmaxq_f16(out, zero); - out = vminq_f16(out, six); - out = vdivq_f16(out, six); - out = vmulq_f16(out, in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 
6 : value; - value = input[i] * value; - value = value / 6; - output[i] = value; - } - break; - } - case ACTIVATION_GELU: { - F16 two_div_PI_sqrt = sqrt(2 / 3.14159265358979323846); - float16x8_t vec0 = vdupq_n_f16(two_div_PI_sqrt); - float16x8_t vec1 = vdupq_n_f16(float16_t(0.044715)); - float16x8_t vec2 = vdupq_n_f16(float16_t(0.5)); - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vmulq_f16(in, in); - out = vmulq_f16(out, in); - out = vfmaq_f16(in, vec1, out); - out = vmulq_f16(vec0, out); - out = vtanhq_f16(out); - out = vaddq_f16(one, out); - out = vmulq_f16(vec2, out); - out = vmulq_f16(in, out); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i]; - value = two_div_PI_sqrt * (value + 0.044715 * pow(value, 3)); - value = 1.0 - 2.0 / (exp(2.0 * value) + 1.0); - value = 0.5 * (1.0 + value); - value = input[i] * value; - output[i] = value; - } - break; - } - case ACTIVATION_TANH: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vtanhq_f16(in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = 1.0 - 2.0 / (exp(2.0 * input[i]) + 1.0); - output[i] = value; - } - break; - } - case ACTIVATION_SIGMOID: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vsigmoidq_f16(in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = 1.0 / (1.0 + exp(-1.0 * input[i])); - output[i] = value; - } - break; - } - default: - return NOT_SUPPORTED; - } - - return SUCCESS; -} - -inline void array_add_f16(const F16* inputA, const F16* inputB, F16* output, I32 len) -{ - I32 i = 0; - for(i = 0; i < len - 7; i+=8){ - float16x8_t a = vld1q_f16(inputA + i); - float16x8_t b = vld1q_f16(inputB + i); - float16x8_t c = vaddq_f16(a, b); - vst1q_f16(output+i, c); - } - - for ( ; i < len; i++) { - output[i] = inputA[i] + inputB[i]; - } -} - -#endif -#endif diff --git a/tensor_computing/src/cpu/arm/fp16/attention.cpp b/tensor_computing/src/cpu/arm/fp16/attention.cpp deleted file mode 100644 index 46a2041f..00000000 --- a/tensor_computing/src/cpu/arm/fp16/attention.cpp +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
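The GELU path in activation_fp16 above implements the usual tanh approximation; its scalar tail computes tanh(v) through the identity tanh(v) = 1 - 2/(exp(2v) + 1). A minimal scalar reference for checking the NEON lanes against (plain float; illustrative only, not part of the library API):

    #include <math.h>
    /* GELU(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) */
    static float gelu_tanh_approx(float x)
    {
        const float c = sqrtf(2.0f / 3.14159265358979323846f);
        return 0.5f * x * (1.0f + tanhf(c * (x + 0.044715f * x * x * x)));
    }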
- - -#include <string.h> -#include "cpu/arm/fp16/tensor_computing_fp16.h" - -EE attention_fp16(U32 batch, U32 numHeads, I32 fromSequenceLength, I32 toSequenceLength, const F16 *input, F16 *output) -{ - if (nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - F16 mask_s = -10000.0; - I32 count = array_sum_f16(input, toSequenceLength); - I32 valid = UNI_MIN(count, fromSequenceLength); - float16x8_t mask_v = vdupq_n_f16(float16_t(mask_s)); - float16x8_t one_v = vdupq_n_f16(float16_t(1.0)); - for(U32 n = 0; n < batch; n++){ - for (U32 i = 0; i < numHeads; i++) { - if (i == 0) { - for (I32 j = 0; j < valid; j++) { - if (j == 0) { - I32 k = 0; - for (; k < toSequenceLength-7; k+=8) { - float16x8_t in_v = vld1q_f16(input + k); - float16x8_t tmp_v = vsubq_f16(one_v, in_v); - tmp_v = vmulq_f16(tmp_v, mask_v); - vst1q_f16(output+k, tmp_v); - } - for (; k < toSequenceLength; k++) { - F16 value = (1 - input[k]) * mask_s; - output[k] = value; - } - } - else { - memcpy(output+j*toSequenceLength, output, toSequenceLength*sizeof(F16)); - } - } - - for (I32 j = valid; j < fromSequenceLength; j++) { - if (j == valid) { - I32 k = 0; - for (; k < toSequenceLength-7; k+=8) { - vst1q_f16(output+j*toSequenceLength+k, mask_v); - } - for (; k < toSequenceLength; k++) { - output[j*toSequenceLength+k] = mask_s; - } - } - else { - memcpy(output+j*toSequenceLength, output+valid*toSequenceLength, toSequenceLength*sizeof(F16)); - } - } - } else { - memcpy(output+i*fromSequenceLength*toSequenceLength, output, fromSequenceLength*toSequenceLength*sizeof(F16)); - } - } - - input += toSequenceLength; - output += numHeads * fromSequenceLength * toSequenceLength; - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp16/attention_mask.cpp b/tensor_computing/src/cpu/arm/fp16/attention_mask.cpp deleted file mode 100644 index befcb250..00000000 --- a/tensor_computing/src/cpu/arm/fp16/attention_mask.cpp +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
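attention_fp16 above expands a per-batch 0/1 key-validity vector into an additive pre-softmax mask: rows below the valid count get (1 - input[k]) * -10000 per key, all later rows are fully masked, and head 0's block is then memcpy-replicated across the remaining heads and batches. A scalar sketch of one head (the names and the float stand-in for F16 are illustrative):

    void attention_ref(const float* valid01, int fromLen, int toLen, float* out)
    {
        const float mask = -10000.0f;
        int valid = 0;
        for (int k = 0; k < toLen; k++)   /* same role as array_sum_f16 */
            valid += (int)valid01[k];
        if (valid > fromLen) valid = fromLen;
        for (int i = 0; i < fromLen; i++)
            for (int k = 0; k < toLen; k++)
                out[i*toLen + k] = (i < valid) ? (1.0f - valid01[k]) * mask : mask;
    }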
- - -#include <string.h> -#include "cpu/arm/fp16/tensor_computing_fp16.h" - -EE attention_mask_fp16(TensorDesc inputDesc, const F16* input, - I32 attentionLength, bool sameLength, float maskValue, - TensorDesc outputDesc, F16* output) -{ - UNUSED(outputDesc); - if (nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - int qlen = inputDesc.dims[1]; - int klen = inputDesc.dims[0]; - int mlen = klen - qlen; - I32 length = qlen * klen; - std::vector<F16> mask; - if (attentionLength < 0) { - mask = std::vector<F16>(length, 0); - } else { - mask = std::vector<F16>(length, 1); - for (int i = 0; i < qlen; i++) { - int start, loops; - if (attentionLength > 0) { - int end = mlen + i; - start = UNI_MAX(end - attentionLength, 0); - loops = end - start + 1; - } else { - if (sameLength) { - start = i; - loops = qlen + 1; - } else { - start = 0; - loops = i + qlen + 1; - } - } - loops = UNI_MAX(loops, 0); - start = UNI_MIN(start, klen); - if (start + loops > klen) - loops = UNI_MAX(klen - start, 0); - memset(&mask[i*klen+start], 0, sizeof(F16)*loops); - } - } - I32 loops = tensorNumElements(inputDesc) / length; - float16x8_t one_v = vdupq_n_f16(1); - float16x8_t mask_value_v = vdupq_n_f16(maskValue); - for (int i = 0, index = 0; i < loops; i++) { - int j = 0; - for (; j < length-7; j+=8) { - float16x8_t in = vld1q_f16(input+index); - float16x8_t mask_v = vld1q_f16(&mask[j]); - float16x8_t tmp_v = vsubq_f16(one_v, mask_v); - tmp_v = vmulq_f16(in, tmp_v); - tmp_v = vfmsq_f16(tmp_v, mask_value_v, mask_v); - vst1q_f16(output+index, tmp_v); - index += 8; - } - for (; j < length; j++) { - output[index] = input[index] * (1 - mask[j]) - maskValue * mask[j]; - index++; - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp16/check.cpp b/tensor_computing/src/cpu/arm/fp16/check.cpp deleted file mode 100644 index 1c31c74e..00000000 --- a/tensor_computing/src/cpu/arm/fp16/check.cpp +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
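A worked example of the window arithmetic in attention_mask_fp16 above (the concrete sizes are chosen for illustration): with qlen = 4, klen = 6 (so mlen = 2) and attentionLength = 2, row i may attend to keys [max(mlen+i-2, 0) .. mlen+i], so the zeroed (unmasked) spans are:

    // i=0: keys 0..2   i=1: keys 1..3   i=2: keys 2..4   i=3: keys 3..5
    // Everything else keeps mask[j] = 1, and the vector loop then emits
    // output = input * (1 - mask) - maskValue * mask   per element.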
- - -#include "cpu/arm/fp16/tensor_computing_fp16.h" - -EE check_fp16(TensorDesc inputDescA, const F16* inputA, - TensorDesc inputDescB, const F16* inputB, - CheckMode checkMode, - TensorDesc outputDesc, I32* output) -{ - if (nullptr == inputA || nullptr == inputB || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) - CHECK_STATUS(NOT_MATCH); - - U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims-1]; - I32 length = size / loopOuter; - if (tensorNumElements(outputDesc) != loopOuter) - CHECK_STATUS(NOT_MATCH); - for (U32 j = 0; j < loopOuter; j++) { - const F16 *arrayA = inputA + j * length; - const F16 *arrayB = inputB + j * length; - switch (checkMode) { - case CHECK_GREAT: { - uint16x8_t count_v = vdupq_n_u16(0); - I32 i = 0; - for (; i < length-7; i+=8) { - float16x8_t a = vld1q_f16(arrayA + i); - float16x8_t b = vld1q_f16(arrayA + i); - count_v = vaddq_u16(count_v, vcgtq_f16(a, b)); - } - I32 count = vaddvq_u16(count_v); - for (; i < length; i++) - if (arrayA[i] > arrayB[i]) - count ++; - output[j] = (count == length); - break; - } - case CHECK_GREATEQUAL: { - uint16x8_t count_v = vdupq_n_u16(0); - I32 i = 0; - for (; i < length-7; i+=8) { - float16x8_t a = vld1q_f16(arrayA + i); - float16x8_t b = vld1q_f16(arrayA + i); - count_v = vaddq_u16(count_v, vcgeq_f16(a, b)); - } - I32 count = vaddvq_u16(count_v); - for (; i < length; i++) - if (arrayA[i] >= arrayB[i]) - count ++; - output[j] = (count == length); - break; - } - case CHECK_EQUAL: { - uint16x8_t count_v = vdupq_n_u16(0); - I32 i = 0; - for (; i < length-7; i+=8) { - float16x8_t a = vld1q_f16(arrayA + i); - float16x8_t b = vld1q_f16(arrayA + i); - count_v = vaddq_u16(count_v, vceqq_f16(a, b)); - } - I32 count = vaddvq_u16(count_v); - for (; i < length; i++) - if (arrayA[i] == arrayB[i]) - count ++; - output[j] = (count == length); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp16/clip.cpp b/tensor_computing/src/cpu/arm/fp16/clip.cpp deleted file mode 100644 index ce451034..00000000 --- a/tensor_computing/src/cpu/arm/fp16/clip.cpp +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include "cpu/arm/fp16/tensor_computing_fp16.h" - -EE clip_fp16(F16 *input, F16 *output, I32 len, F32 minValue, F32 maxValue) { - if (nullptr == input - || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - float16x8_t min_v = vdupq_n_f16(minValue); - float16x8_t max_v = vdupq_n_f16(maxValue); - - I32 i = 0; - for (i = 0; i < len - 7; i += 8) { - float16x8_t in = vld1q_f16(input + i); - float16x8_t tmp_v = vminq_f16(max_v, vmaxq_f16(min_v, in)); - vst1q_f16(output+i, tmp_v); - } - for (; i < len; i++) { - F16 value = input[i]; - value = (value > minValue) ? value : minValue; - value = (value < maxValue) ? value : maxValue; - output[i] = value; - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp16/convolution.cpp b/tensor_computing/src/cpu/arm/fp16/convolution.cpp deleted file mode 100644 index ec178a16..00000000 --- a/tensor_computing/src/cpu/arm/fp16/convolution.cpp +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#include "cpu/arm/fp16/convolution_winograd.h" -#include "cpu/arm/fp16/convolution_gemm.h" -#include "cpu/arm/fp16/convolution_gemm_icnchw.h" -#include "cpu/arm/fp16/convolution_direct.h" - -EE convolution_infer_forward_tmp_bytes_fp16(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes) -{ - if (nullptr == bytes) - CHECK_STATUS(NULL_POINTER); - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - EE ret = SUCCESS; - switch (algorithm) { - case CONVOLUTION_ALGORITHM_DIRECT: - *bytes = ic*ih_pad*iw_pad; - break; - case CONVOLUTION_ALGORITHM_GEMM: - *bytes = ic*ih_pad*iw_pad + 8*fh*fw*ic; - break; - case CONVOLUTION_ALGORITHM_WINOGRAD: { - U32 tile_h = (oh + 3) / 4; - U32 tile_w = (ow + 3) / 4; - U32 pad_left = paddingL; - U32 pad_right = paddingR + (tile_w*4 - ow); - U32 pad_top = paddingT; - U32 pad_bottom = paddingB + (tile_h*4 - oh); - ih_pad = ih + pad_top + pad_bottom; - iw_pad = iw + pad_left + pad_right; - *bytes = ic*ih_pad*iw_pad + (ic+oc)*6*6*8; - break; - } - case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: - *bytes = ic*ih_pad*iw_pad + 8*fh*fw*ic; - break; - default: - ret = NOT_MATCH; - break; - } - *bytes *= bytesOf(idt); - *bytes += 32; - return ret; -} - -EE convolution_fp16(TensorDesc inputDesc, F16* input, - TensorDesc filterDesc, const F16* filter, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, - TensorDesc biasDesc, const F16* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* output, - ActivationDesc activationDesc, - Arch arch) -{ - if (nullptr == input || nullptr == filter || nullptr == output || nullptr == bias || nullptr == tmp) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - - if (!(idt == DT_F16 && fdt == DT_F16 && odt == DT_F16)) { - CHECK_STATUS(NOT_MATCH); - } - if (!(odf == DF_NCHWC8)) { - CHECK_STATUS(NOT_MATCH); - } - if (!(ic == fc && oc == fn)) { - CHECK_STATUS(NOT_MATCH); - } - - // In some cases when we adjust the model input, the input tensor of conv can change from NCHW to NCHWc8 - // In this case we can simply change the algo, because they both require the same filter transform - if (CONVOLUTION_ALGORITHM_GEMM_ICNCHW == algorithm && DF_NCHWC8 == idf) { - algorithm = CONVOLUTION_ALGORITHM_GEMM; - } - - EE ret = SUCCESS; - switch (algorithm) { - case CONVOLUTION_ALGORITHM_DIRECT: - ret = convolution_direct(inputDesc, input, filterDesc, filter, convDesc, - biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc, arch); - break; - case CONVOLUTION_ALGORITHM_GEMM: - ret = convolution_gemm(inputDesc, input, filterDesc, filter, convDesc, - 
biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc, arch); - break; - case CONVOLUTION_ALGORITHM_WINOGRAD: - ret = convolution_winograd(inputDesc, input, filterDesc, filter, convDesc, - biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc, arch); - break; - case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: - ret = convolution_gemm_icnchw(inputDesc, input, filterDesc, filter, convDesc, - biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc, arch); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp16/convolution_direct.cpp b/tensor_computing/src/cpu/arm/fp16/convolution_direct.cpp deleted file mode 100644 index 9cb37781..00000000 --- a/tensor_computing/src/cpu/arm/fp16/convolution_direct.cpp +++ /dev/null @@ -1,503 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
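A worked instance of the Winograd branch in convolution_infer_forward_tmp_bytes_fp16 above, which pads the input so the output splits into 4x4 tiles (each transformed tile is 6x6, with input/output tile buffers sized for eight tiles at a time); the concrete shape is illustrative:

    // ih = iw = oh = ow = 30, paddingT/B/L/R = 1, ic = oc = 64, F16 data
    // tile_h = (30 + 3) / 4 = 8          tile_w = 8
    // pad_bottom = 1 + (8*4 - 30) = 3    pad_right = 3
    // ih_pad = 30 + 1 + 3 = 34           iw_pad = 34
    // bytes = (64*34*34 + (64+64)*6*6*8) * 2 + 32
    //       = (73984 + 36864) * 2 + 32 = 221728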
- - -#include <string.h> - -#include "cpu/arm/fp16/convolution_direct.h" - -EE convolution_direct(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc, - Arch arch) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - UNUSED(arch); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - if (fdf != DF_NCHWN16) - CHECK_STATUS(NOT_MATCH); - - oc /= 8; - ic /= 8; - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - - // naive, no blocking, in: NCHWc8, out: NOHWo8, filter: OCHWo16, no bias - - EE ret = SUCCESS; - for (U32 n = 0; n < in; n++) { - // copy input into a input with padding - F16 *inArray_pad = (F16*)tmp; - F16 *inArray_pad_mov = inArray_pad; - F16 *inArray_mov = inArray + n*ic*ih*iw*8; - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(idt)); - inArray_pad_mov += iw_pad*8; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*8*bytesOf(idt)); - inArray_pad_mov += paddingL*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*bytesOf(idt)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, paddingR*8*bytesOf(idt)); - inArray_pad_mov += paddingR*8; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(idt)); - inArray_pad_mov += iw_pad*8; - } - } - - // compute - const F16 *f0 = filterArray; - const F16 *f1 = f0 + fh*fw*16; - const F16 *f2 = f0 + fh*fw*16*2; - const F16 *f3 = f0 + fh*fw*16*3; - const F16 *f4 = f0 + fh*fw*16*4; - const F16 *f5 = f0 + fh*fw*16*5; - const F16 *f6 = f0 + fh*fw*16*6; - const F16 *f7 = f0 + fh*fw*16*7; - - F16 *outo0h0 = outArray + n*oc*oh*ow*8; - F16 *outo1h0 = outo0h0 + oh*ow*8; - F16 *outo0h1 = outo0h0 + ow*8; - F16 *outo1h1 = outo1h0 + ow*8; - for (U32 o = 0; o < oc; o+=2) { - for (U32 c = 0; c < ic; c++) { - F16 *out_o0h0 = outo0h0; - F16 *out_o1h0 = outo1h0; - F16 *out_o0h1 = outo0h1; - F16 *out_o1h1 = outo1h1; - - F16 *in_h0w0 = inArray_pad + n*ic*ih_pad*iw_pad*8 + c*ih_pad*iw_pad*8; - F16 *in_h0w1 = in_h0w0 + strideW*8; - F16 *in_h0w2 = in_h0w0 + strideW*8*2; - F16 *in_h0w3 = in_h0w0 + strideW*8*3; - F16 *in_h1w0 = in_h0w0 + strideH*iw_pad*8; - F16 *in_h1w1 = in_h1w0 + strideW*8; - F16 *in_h1w2 = in_h1w0 + strideW*8*2; - F16 *in_h1w3 = in_h1w0 + strideW*8*3; - - for (U32 h = 0; h < oh; h+=2) { - for (U32 w = 0; w < ow; w+=4) { - const F16 *f_c0 = f0; - const F16 *f_c1 = f1; - const F16 *f_c2 = f2; - const F16 *f_c3 = f3; - const F16 *f_c4 = f4; - const F16 *f_c5 = f5; - const F16 *f_c6 = f6; - const F16 *f_c7 = f7; - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - __asm__ __volatile__( - "ldr d16, [%[f_c0]]\n" - "ldr x4, [%[f_c0], #8]\n" - "ins v16.d[1], x4\n" - "ldr d0, [%[in_h0w0]]\n" - "ldr x0, 
[%[in_h0w0], #8]\n" - "ins v0.d[1], x0\n" - "ldr d1, [%[in_h0w1]]\n" - "ldr x1, [%[in_h0w1], #8]\n" - "ins v1.d[1], x1\n" - "ldr d2, [%[in_h0w2]]\n" - "ldr x2, [%[in_h0w2], #8]\n" - "ins v2.d[1], x2\n" - "ldr d3, [%[in_h0w3]]\n" - "ldr x3, [%[in_h0w3], #8]\n" - "ins v3.d[1], x3\n" - "ldr d4, [%[in_h1w0]]\n" - "ldr x0, [%[in_h1w0], #8]\n" - "ins v4.d[1], x0\n" - "ldr d5, [%[in_h1w1]]\n" - "ldr x1, [%[in_h1w1], #8]\n" - "ins v5.d[1], x1\n" - "ldr d6, [%[in_h1w2]]\n" - "ldr x2, [%[in_h1w2], #8]\n" - "ins v6.d[1], x2\n" - "ldr d7, [%[in_h1w3]]\n" - "ldr x3, [%[in_h1w3], #8]\n" - "ins v7.d[1], x3\n" - "ldr d8, [%[out_o0h0]]\n" - "ldr x0, [%[out_o0h0], #8]\n" - "ins v8.d[1], x0\n" - "ldr d9, [%[out_o0h0], #16]\n" - "ldr x1, [%[out_o0h0], #24]\n" - "ins v9.d[1], x1\n" - "ldr d10, [%[out_o0h0], #32]\n" - "ldr x2, [%[out_o0h0], #40]\n" - "ins v10.d[1], x2\n" - "ldr d11, [%[out_o0h0], #48]\n" - "ldr x3, [%[out_o0h0], #56]\n" - "ins v11.d[1], x3\n" - "ldr d12, [%[out_o0h1]]\n" - "ldr x0, [%[out_o0h1], #8]\n" - "ins v12.d[1], x0\n" - "ldr d13, [%[out_o0h1], #16]\n" - "ldr x1, [%[out_o0h1], #24]\n" - "ins v13.d[1], x1\n" - "ldr d14, [%[out_o0h1], #32]\n" - "ldr x2, [%[out_o0h1], #40]\n" - "ins v14.d[1], x2\n" - "ldr d15, [%[out_o0h1], #48]\n" - "ldr x3, [%[out_o0h1], #56]\n" - "ins v15.d[1], x3\n" - - "fmla v8.8h, v16.8h, v0.h[0]\n" - "ldr d18, [%[out_o1h0]]\n" - "fmla v9.8h, v16.8h, v1.h[0]\n" - "ldr x0, [%[out_o1h0], #8]\n" - "fmla v10.8h, v16.8h, v2.h[0]\n" - "ldr d17, [%[f_c1]]\n" - "fmla v11.8h, v16.8h, v3.h[0]\n" - "ldr x5, [%[f_c1], #8]\n" - "fmla v12.8h, v16.8h, v4.h[0]\n" - "ins v17.d[1], x5\n" - "fmla v13.8h, v16.8h, v5.h[0]\n" - "ins v18.d[1], x0\n" - "fmla v14.8h, v16.8h, v6.h[0]\n" - "ldr d19, [%[out_o1h0], #16]\n" - "fmla v15.8h, v16.8h, v7.h[0]\n" - "ldr x1, [%[out_o1h0], #24]\n" - "fmla v8.8h, v17.8h, v0.h[1]\n" - "ins v19.d[1], x1\n" - "fmla v9.8h, v17.8h, v1.h[1]\n" - "fmla v10.8h, v17.8h, v2.h[1]\n" - "ldr d16, [%[f_c2]]\n" - "fmla v11.8h, v17.8h, v3.h[1]\n" - "ldr x5, [%[f_c2], #8]\n" - "fmla v12.8h, v17.8h, v4.h[1]\n" - "ins v16.d[1], x5\n" - "fmla v13.8h, v17.8h, v5.h[1]\n" - "ldr d20, [%[out_o1h0], #32]\n" - "fmla v14.8h, v17.8h, v6.h[1]\n" - "ldr x2, [%[out_o1h0], #40]\n" - "fmla v15.8h, v17.8h, v7.h[1]\n" - "ins v20.d[1], x2\n" - "fmla v8.8h, v16.8h, v0.h[2]\n" - "ldr d21, [%[out_o1h0], #48]\n" - "fmla v9.8h, v16.8h, v1.h[2]\n" - "fmla v10.8h, v16.8h, v2.h[2]\n" - "ldr d17, [%[f_c3]]\n" - "fmla v11.8h, v16.8h, v3.h[2]\n" - "ldr x5, [%[f_c3], #8]\n" - "fmla v12.8h, v16.8h, v4.h[2]\n" - "ins v17.d[1], x5\n" - "fmla v13.8h, v16.8h, v5.h[2]\n" - "ldr x3, [%[out_o1h0], #56]\n" - "fmla v14.8h, v16.8h, v6.h[2]\n" - "ins v21.d[1], x3\n" - "fmla v15.8h, v16.8h, v7.h[2]\n" - "ldr d22, [%[out_o1h1]]\n" - "fmla v8.8h, v17.8h, v0.h[3]\n" - "ldr x0, [%[out_o1h1], #8]\n" - "fmla v9.8h, v17.8h, v1.h[3]\n" - "ins v22.d[1], x0\n" - "fmla v10.8h, v17.8h, v2.h[3]\n" - "ldr d16, [%[f_c4]]\n" - "fmla v11.8h, v17.8h, v3.h[3]\n" - "ldr x5, [%[f_c4], #8]\n" - "fmla v12.8h, v17.8h, v4.h[3]\n" - "ins v16.d[1], x5\n" - "fmla v13.8h, v17.8h, v5.h[3]\n" - "ldr d23, [%[out_o1h1], #16]\n" - "fmla v14.8h, v17.8h, v6.h[3]\n" - "ldr x1, [%[out_o1h1], #24]\n" - "fmla v15.8h, v17.8h, v7.h[3]\n" - "ins v23.d[1], x1\n" - "fmla v8.8h, v16.8h, v0.h[4]\n" - "fmla v9.8h, v16.8h, v1.h[4]\n" - "fmla v10.8h, v16.8h, v2.h[4]\n" - "ldr d17, [%[f_c5]]\n" - "fmla v11.8h, v16.8h, v3.h[4]\n" - "ldr x5, [%[f_c5], #8]\n" - "fmla v12.8h, v16.8h, v4.h[4]\n" - "ins v17.d[1], x5\n" - "fmla v13.8h, v16.8h, v5.h[4]\n" - "ldr d24, 
[%[out_o1h1], #32]\n" - "fmla v14.8h, v16.8h, v6.h[4]\n" - "ldr x2, [%[out_o1h1], #40]\n" - "fmla v15.8h, v16.8h, v7.h[4]\n" - "ins v24.d[1], x2\n" - "fmla v8.8h, v17.8h, v0.h[5]\n" - "fmla v9.8h, v17.8h, v1.h[5]\n" - "fmla v10.8h, v17.8h, v2.h[5]\n" - "ldr d16, [%[f_c6]]\n" - "fmla v11.8h, v17.8h, v3.h[5]\n" - "ldr x5, [%[f_c6], #8]\n" - "fmla v12.8h, v17.8h, v4.h[5]\n" - "ins v16.d[1], x5\n" - "fmla v13.8h, v17.8h, v5.h[5]\n" - "ldr d25, [%[out_o1h1], #48]\n" - "fmla v14.8h, v17.8h, v6.h[5]\n" - "ldr x3, [%[out_o1h1], #56]\n" - "fmla v15.8h, v17.8h, v7.h[5]\n" - "ins v25.d[1], x3\n" - "fmla v8.8h, v16.8h, v0.h[6]\n" - "fmla v9.8h, v16.8h, v1.h[6]\n" - "fmla v10.8h, v16.8h, v2.h[6]\n" - "ldr d17, [%[f_c7]]\n" - "fmla v11.8h, v16.8h, v3.h[6]\n" - "ldr x5, [%[f_c7], #8]\n" - "fmla v12.8h, v16.8h, v4.h[6]\n" - "ins v17.d[1], x5\n" - "fmla v13.8h, v16.8h, v5.h[6]\n" - "fmla v14.8h, v16.8h, v6.h[6]\n" - "fmla v15.8h, v16.8h, v7.h[6]\n" - "fmla v8.8h, v17.8h, v0.h[7]\n" - "fmla v9.8h, v17.8h, v1.h[7]\n" - "fmla v10.8h, v17.8h, v2.h[7]\n" - "ldr d16, [%[f_c0], #16]\n" - "fmla v11.8h, v17.8h, v3.h[7]\n" - "ldr x4, [%[f_c0], #24]\n" - "fmla v12.8h, v17.8h, v4.h[7]\n" - "ins v16.d[1], x4\n" - "fmla v13.8h, v17.8h, v5.h[7]\n" - "fmla v14.8h, v17.8h, v6.h[7]\n" - "fmla v15.8h, v17.8h, v7.h[7]\n" - - "fmla v18.8h, v16.8h, v0.h[0]\n" - "fmla v19.8h, v16.8h, v1.h[0]\n" - "fmla v20.8h, v16.8h, v2.h[0]\n" - "ldr d17, [%[f_c1], #16]\n" - "fmla v21.8h, v16.8h, v3.h[0]\n" - "ldr x5, [%[f_c1], #24]\n" - "fmla v22.8h, v16.8h, v4.h[0]\n" - "ins v17.d[1], x5\n" - "fmla v23.8h, v16.8h, v5.h[0]\n" - "fmla v24.8h, v16.8h, v6.h[0]\n" - "fmla v25.8h, v16.8h, v7.h[0]\n" - "fmla v18.8h, v17.8h, v0.h[1]\n" - "fmla v19.8h, v17.8h, v1.h[1]\n" - "fmla v20.8h, v17.8h, v2.h[1]\n" - "ldr d16, [%[f_c2], #16]\n" - "fmla v21.8h, v17.8h, v3.h[1]\n" - "ldr x4, [%[f_c2], #24]\n" - "fmla v22.8h, v17.8h, v4.h[1]\n" - "ins v16.d[1], x4\n" - "fmla v23.8h, v17.8h, v5.h[1]\n" - "fmla v24.8h, v17.8h, v6.h[1]\n" - "fmla v25.8h, v17.8h, v7.h[1]\n" - "fmla v18.8h, v16.8h, v0.h[2]\n" - "fmla v19.8h, v16.8h, v1.h[2]\n" - "fmla v20.8h, v16.8h, v2.h[2]\n" - "ldr d17, [%[f_c3], #16]\n" - "fmla v21.8h, v16.8h, v3.h[2]\n" - "ldr x5, [%[f_c3], #24]\n" - "fmla v22.8h, v16.8h, v4.h[2]\n" - "ins v17.d[1], x5\n" - "fmla v23.8h, v16.8h, v5.h[2]\n" - "fmla v24.8h, v16.8h, v6.h[2]\n" - "fmla v25.8h, v16.8h, v7.h[2]\n" - "fmla v18.8h, v17.8h, v0.h[3]\n" - "fmla v19.8h, v17.8h, v1.h[3]\n" - "fmla v20.8h, v17.8h, v2.h[3]\n" - "ldr d16, [%[f_c4], #16]\n" - "fmla v21.8h, v17.8h, v3.h[3]\n" - "ldr x4, [%[f_c4], #24]\n" - "fmla v22.8h, v17.8h, v4.h[3]\n" - "ins v16.d[1], x4\n" - "fmla v23.8h, v17.8h, v5.h[3]\n" - "fmla v24.8h, v17.8h, v6.h[3]\n" - "fmla v25.8h, v17.8h, v7.h[3]\n" - "fmla v18.8h, v16.8h, v0.h[4]\n" - "fmla v19.8h, v16.8h, v1.h[4]\n" - "fmla v20.8h, v16.8h, v2.h[4]\n" - "ldr d17, [%[f_c5], #16]\n" - "fmla v21.8h, v16.8h, v3.h[4]\n" - "ldr x5, [%[f_c5], #24]\n" - "fmla v22.8h, v16.8h, v4.h[4]\n" - "ins v17.d[1], x5\n" - "fmla v23.8h, v16.8h, v5.h[4]\n" - "fmla v24.8h, v16.8h, v6.h[4]\n" - "fmla v25.8h, v16.8h, v7.h[4]\n" - "fmla v18.8h, v17.8h, v0.h[5]\n" - "fmla v19.8h, v17.8h, v1.h[5]\n" - "fmla v20.8h, v17.8h, v2.h[5]\n" - "ldr d16, [%[f_c6], #16]\n" - "fmla v21.8h, v17.8h, v3.h[5]\n" - "ldr x4, [%[f_c6], #24]\n" - "fmla v22.8h, v17.8h, v4.h[5]\n" - "ins v16.d[1], x4\n" - "fmla v23.8h, v17.8h, v5.h[5]\n" - "fmla v24.8h, v17.8h, v6.h[5]\n" - "fmla v25.8h, v17.8h, v7.h[5]\n" - "fmla v18.8h, v16.8h, v0.h[6]\n" - "fmla v19.8h, v16.8h, v1.h[6]\n" - 
"fmla v20.8h, v16.8h, v2.h[6]\n" - "ldr d17, [%[f_c7], #16]\n" - "fmla v21.8h, v16.8h, v3.h[6]\n" - "ldr x5, [%[f_c7], #24]\n" - "fmla v22.8h, v16.8h, v4.h[6]\n" - "ins v17.d[1], x5\n" - "fmla v23.8h, v16.8h, v5.h[6]\n" - "fmla v24.8h, v16.8h, v6.h[6]\n" - "fmla v25.8h, v16.8h, v7.h[6]\n" - "fmla v18.8h, v17.8h, v0.h[7]\n" - "fmla v19.8h, v17.8h, v1.h[7]\n" - "fmla v20.8h, v17.8h, v2.h[7]\n" - "fmla v21.8h, v17.8h, v3.h[7]\n" - "fmla v22.8h, v17.8h, v4.h[7]\n" - "fmla v23.8h, v17.8h, v5.h[7]\n" - "fmla v24.8h, v17.8h, v6.h[7]\n" - "fmla v25.8h, v17.8h, v7.h[7]\n" - "str q8, [%[out_o0h0]]\n" - "str q9, [%[out_o0h0], #16]\n" - "str q10, [%[out_o0h0], #32]\n" - "str q11, [%[out_o0h0], #48]\n" - "str q12, [%[out_o0h1]]\n" - "str q13, [%[out_o0h1], #16]\n" - "str q14, [%[out_o0h1], #32]\n" - "str q15, [%[out_o0h1], #48]\n" - "str q18, [%[out_o1h0]]\n" - "str q19, [%[out_o1h0], #16]\n" - "str q20, [%[out_o1h0], #32]\n" - "str q21, [%[out_o1h0], #48]\n" - "str q22, [%[out_o1h1]]\n" - "str q23, [%[out_o1h1], #16]\n" - "str q24, [%[out_o1h1], #32]\n" - "str q25, [%[out_o1h1], #48]\n" - - :[out_o0h0]"+r"(out_o0h0), - [out_o0h1]"+r"(out_o0h1), - [out_o1h0]"+r"(out_o1h0), - [out_o1h1]"+r"(out_o1h1) - :[in_h0w0]"r"(in_h0w0), - [in_h0w1]"r"(in_h0w1), - [in_h0w2]"r"(in_h0w2), - [in_h0w3]"r"(in_h0w3), - [in_h1w0]"r"(in_h1w0), - [in_h1w1]"r"(in_h1w1), - [in_h1w2]"r"(in_h1w2), - [in_h1w3]"r"(in_h1w3), - [f_c0]"r"(f_c0), - [f_c1]"r"(f_c1), - [f_c2]"r"(f_c2), - [f_c3]"r"(f_c3), - [f_c4]"r"(f_c4), - [f_c5]"r"(f_c5), - [f_c6]"r"(f_c6), - [f_c7]"r"(f_c7) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "x0", "x1", "x2", "x3", "x4", "x5" ); - f_c0 += 16; - f_c1 += 16; - f_c2 += 16; - f_c3 += 16; - f_c4 += 16; - f_c5 += 16; - f_c6 += 16; - f_c7 += 16; - in_h0w0 += 8; - in_h0w1 += 8; - in_h0w2 += 8; - in_h0w3 += 8; - in_h1w0 += 8; - in_h1w1 += 8; - in_h1w2 += 8; - in_h1w3 += 8; - } - in_h0w0 += iw_pad*8 - fw*8; - in_h0w1 += iw_pad*8 - fw*8; - in_h0w2 += iw_pad*8 - fw*8; - in_h0w3 += iw_pad*8 - fw*8; - in_h1w0 += iw_pad*8 - fw*8; - in_h1w1 += iw_pad*8 - fw*8; - in_h1w2 += iw_pad*8 - fw*8; - in_h1w3 += iw_pad*8 - fw*8; - } - in_h0w0 = in_h0w0 + 4*strideW*8 - fh*iw_pad*8; - in_h0w1 = in_h0w1 + 4*strideW*8 - fh*iw_pad*8; - in_h0w2 = in_h0w2 + 4*strideW*8 - fh*iw_pad*8; - in_h0w3 = in_h0w3 + 4*strideW*8 - fh*iw_pad*8; - in_h1w0 = in_h1w0 + 4*strideW*8 - fh*iw_pad*8; - in_h1w1 = in_h1w1 + 4*strideW*8 - fh*iw_pad*8; - in_h1w2 = in_h1w2 + 4*strideW*8 - fh*iw_pad*8; - in_h1w3 = in_h1w3 + 4*strideW*8 - fh*iw_pad*8; - out_o0h0 += 32; - out_o1h0 += 32; - out_o0h1 += 32; - out_o1h1 += 32; - } - in_h0w0 += 2*strideH*iw_pad*8 - ow*strideW*8; - in_h0w1 += 2*strideH*iw_pad*8 - ow*strideW*8; - in_h0w2 += 2*strideH*iw_pad*8 - ow*strideW*8; - in_h0w3 += 2*strideH*iw_pad*8 - ow*strideW*8; - in_h1w0 += 2*strideH*iw_pad*8 - ow*strideW*8; - in_h1w1 += 2*strideH*iw_pad*8 - ow*strideW*8; - in_h1w2 += 2*strideH*iw_pad*8 - ow*strideW*8; - in_h1w3 += 2*strideH*iw_pad*8 - ow*strideW*8; - out_o0h0 += ow*8; - out_o1h0 += ow*8; - out_o0h1 += ow*8; - out_o1h1 += ow*8; - } - f0 += 8*fh*fw*16; - f1 += 8*fh*fw*16; - f2 += 8*fh*fw*16; - f3 += 8*fh*fw*16; - f4 += 8*fh*fw*16; - f5 += 8*fh*fw*16; - f6 += 8*fh*fw*16; - f7 += 8*fh*fw*16; - } - outo0h0 += 2*oh*ow*8; - outo1h0 += 2*oh*ow*8; - outo0h1 += 2*oh*ow*8; - outo1h1 += 2*oh*ow*8; - } - - // bias - F16 *out = outArray; - float16x8_t v_0 = vmovq_n_f16(0); - for 
(U32 o = 0; o < oc; o++) { - float16x8_t v_b = vld1q_f16(biasArray + o*8); - for (U32 hw = 0; hw < oh*ow; hw++) { - float16x8_t v = vld1q_f16(out); - switch (activationDesc.mode) { - case ACTIVATION_NULL: - vst1q_f16(out, vaddq_f16(v, v_b)); - break; - case ACTIVATION_RELU: - vst1q_f16(out, vmaxq_f16(vaddq_f16(v, v_b), v_0)); - break; - default: - return NOT_SUPPORTED; - } - out += 8; - } - } - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp16/convolution_direct.h b/tensor_computing/src/cpu/arm/fp16/convolution_direct.h deleted file mode 100644 index 87e98f25..00000000 --- a/tensor_computing/src/cpu/arm/fp16/convolution_direct.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_CONVOLUTION_DIRECT -#define _H_CONVOLUTION_DIRECT -#include "sys.h" -#include "type.h" -#include "error.h" -#include "tensor_desc.h" -#include "tensor_computing_type.h" - -EE convolution_direct(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc, - Arch arch); -#endif diff --git a/tensor_computing/src/cpu/arm/fp16/convolution_gemm.h b/tensor_computing/src/cpu/arm/fp16/convolution_gemm.h deleted file mode 100644 index d8f3bfc6..00000000 --- a/tensor_computing/src/cpu/arm/fp16/convolution_gemm.h +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_CONVOLUTION_GEMM -#define _H_CONVOLUTION_GEMM - -#include "sys.h" -#include "type.h" -#include "error.h" -#include "tensor_desc.h" - -#include "tensor_computing_type.h" - -EE convolution_gemm_A55(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc); - -EE convolution_gemm_A76(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc); - -inline EE convolution_gemm(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc, - Arch arch) -{ - EE ret = SUCCESS; - switch (arch) { - case ARM_A55: - ret = convolution_gemm_A55(inputDesc, inArray, - filterDesc, filterArray, - convDesc, - biasDesc, biasArray, - tmpBytes, tmp, - outputDesc, outArray, - activationDesc); - break; - case ARM_A76: - ret = convolution_gemm_A76(inputDesc, inArray, - filterDesc, filterArray, - convDesc, - biasDesc, biasArray, - tmpBytes, tmp, - outputDesc, outArray, - activationDesc); - break; - default: - return NOT_SUPPORTED; - } - return ret; -} -#endif diff --git a/tensor_computing/src/cpu/arm/fp16/convolution_gemm_A55.cpp b/tensor_computing/src/cpu/arm/fp16/convolution_gemm_A55.cpp deleted file mode 100644 index 8e22ec9b..00000000 --- a/tensor_computing/src/cpu/arm/fp16/convolution_gemm_A55.cpp +++ /dev/null @@ -1,1024 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
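Note on the deleted GEMM kernels (the A55 variant directly below and the A76 variant after it): both compute the same result. im2col repacks the NCHWc8 input into NHWChw8/hw4/hw1 tiles, then an inline-assembly microkernel accumulates a tile of two 8-channel output blocks by up to eight output pixels in registers v2-v17, one fmla per packed input scalar. In the ReLU6 paths, "movi v30.8h, #0x46, lsl #8" writes 0x4600 to every lane, which is 6.0 in IEEE fp16 (exponent 17 - 15 = 2, mantissa 1.5, so 1.5 * 2^2 = 6). As a reading aid, here is a scalar model of the accumulation; the helper name and plain-float layout are a hypothetical sketch, not bolt API:

    /* Scalar model of the packed fp16 GEMM performed by the deleted kernels.
     * Hypothetical reference (plain float, no NEON, no tiling): after im2col,
     * each output element is a K-deep dot product with K = ic*8*fh*fw. */
    static void conv_gemm_ref(const float *in_pack, /* [K][HW] packed input  */
                              const float *filt,    /* [K][OC] packed filter */
                              const float *bias,    /* [OC] per-channel bias */
                              float *out,           /* [OC][HW] output       */
                              int K, int HW, int OC)
    {
        for (int o = 0; o < OC; o++) {
            for (int hw = 0; hw < HW; hw++) {
                float acc = bias[o];          /* kernels preload bias into v2..v17 */
                for (int k = 0; k < K; k++) { /* the asm unrolls this loop by 2    */
                    acc += in_pack[k * HW + hw] * filt[k * OC + o];
                }
                out[o * HW + hw] = acc;       /* ReLU/ReLU6 clamp applied after    */
            }
        }
    }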
- - -#include <string.h> -#include "cpu/arm/fp16/convolution_gemm.h" - -EE convolution_gemm_A55(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (fdf != DF_NHWCN16) { - CHECK_STATUS(NOT_MATCH); - } - - oc /= 8; - ic /= 8; - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - I32 ohow = oh*ow; - U32 ihiw = ih_pad*iw_pad; - F16 *inArray_pad; - EE ret = SUCCESS; - for (U32 n = 0; n < in; n++) { - if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { - inArray_pad = inArray + n*ic*ih*iw*8; - } else { - // copy input into a input with padding - inArray_pad = (F16*)tmp; - F16 *inArray_pad_mov = inArray_pad; - F16 *inArray_mov = inArray + n*ic*ih*iw*8; - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(idt)); - inArray_pad_mov += iw_pad*8; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*8*bytesOf(idt)); - inArray_pad_mov += paddingL*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*bytesOf(idt)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, paddingR*8*bytesOf(idt)); - inArray_pad_mov += paddingR*8; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(idt)); - inArray_pad_mov += iw_pad*8; - } - } - } - // ohow / 8 - for (I32 hw = 0; hw < ohow-7; hw+=8) { - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - const F16 *f_o0c0 = filterArray; - F16 *in_pack = ((F16*)tmp) + ic*ih_pad*iw_pad*8; - // pack input - // NCHWc8 => NHWChw8 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - U32 in_h_1 = ((hw+1)/ow)*strideH; - U32 in_w_1 = ((hw+1)%ow)*strideW; - U32 in_h_2 = ((hw+2)/ow)*strideH; - U32 in_w_2 = ((hw+2)%ow)*strideW; - U32 in_h_3 = ((hw+3)/ow)*strideH; - U32 in_w_3 = ((hw+3)%ow)*strideW; - U32 in_h_4 = ((hw+4)/ow)*strideH; - U32 in_w_4 = ((hw+4)%ow)*strideW; - U32 in_h_5 = ((hw+5)/ow)*strideH; - U32 in_w_5 = ((hw+5)%ow)*strideW; - U32 in_h_6 = ((hw+6)/ow)*strideH; - U32 in_w_6 = ((hw+6)%ow)*strideW; - U32 in_h_7 = ((hw+7)/ow)*strideH; - U32 in_w_7 = ((hw+7)%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F16 *in_hw8c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8+ fw_idx*dilateW*8; - F16 *in_0 = in_hw8c8 + in_h_0*iw_pad*8 + in_w_0*8; - F16 *in_1 = in_hw8c8 + in_h_1*iw_pad*8 + in_w_1*8; - F16 *in_2 = in_hw8c8 + in_h_2*iw_pad*8 + in_w_2*8; - F16 *in_3 = in_hw8c8 + in_h_3*iw_pad*8 + in_w_3*8; - F16 *in_4 = in_hw8c8 + in_h_4*iw_pad*8 + in_w_4*8;
- F16 *in_5 = in_hw8c8 + in_h_5*iw_pad*8 + in_w_5*8; - F16 *in_6 = in_hw8c8 + in_h_6*iw_pad*8 + in_w_6*8; - F16 *in_7 = in_hw8c8 + in_h_7*iw_pad*8 + in_w_7*8; - - // NHWChw8 - F16 *in_pack_c8hw8 = in_pack + fh_idx*fw*ic*8*8 + fw_idx*ic*8*8 + c*8*8; - /* - * for (U32 c8 = 0; c8 < 8; c8++) { - * for (U32 hw8 = 0; hw8 < 8; hw8++) { - * in_pack_c8hw8[c8*8 + hw8] = in_hw8c8[hw8*8 + c8]; - * } - * } - */ - float16x8_t v0 = vld1q_f16(in_0); - float16x8_t v1 = vld1q_f16(in_1); - float16x8_t v2 = vld1q_f16(in_2); - float16x8_t v3 = vld1q_f16(in_3); - float16x8_t v4 = vld1q_f16(in_4); - float16x8_t v5 = vld1q_f16(in_5); - float16x8_t v6 = vld1q_f16(in_6); - float16x8_t v7 = vld1q_f16(in_7); - vst1q_f16(in_pack_c8hw8, - vzip1q_f16( - vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8, - vzip2q_f16( - vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*2, - vzip1q_f16( - vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*3, - vzip2q_f16( - vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*4, - vzip1q_f16( - vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*5, - vzip2q_f16( - vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*6, - vzip1q_f16( - vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*7, - vzip2q_f16( - vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - } - } - } - - // compute - for (I32 o = 0; o < I32(oc)-1; o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "ldr d23, [%[b_1]]\n" //b_o1 - "ldr x2, [%[b_1], #8]\n" - "ins v23.d[1], x2\n" - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr x1, [%[in_0], #8]\n" - "mov v4.16b, v22.16b\n" //out_o0hw2 - "ins v0.d[1], x1\n" - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v6.16b, v22.16b\n" //out_o0hw4 - "ldr x2, [%[f_0], #8]\n" - "mov v7.16b, v22.16b\n" //out_o0hw5 - "ins v18.d[1], x2\n" - "mov v8.16b, v22.16b\n" //out_o0hw6 - "ldr d19, [%[f_0], #16]\n" //f_o1c0 - "mov v9.16b, v22.16b\n" //out_o0hw7 - "ldr x3, [%[f_0], #24]\n" - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ins v19.d[1], x3\n" - "mov v11.16b, v23.16b\n" //out_o1hw1 - "mov v12.16b, v23.16b\n" //out_o1hw2 - "mov v13.16b, v23.16b\n" //out_o1hw3 - "mov v14.16b, v23.16b\n" //out_o1hw4 - "mov v15.16b, v23.16b\n" //out_o1hw5 - "mov v16.16b, v23.16b\n" //out_o1hw6 - "mov v17.16b, v23.16b\n" //out_o1hw7 - "0:\n" - "ldr d1, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr x1, [%[in_0], #24]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "ins v1.d[1], x1\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ldr d20, [%[f_0], #32]\n" //f_o0c0 - "fmla v5.8h, v18.8h, v0.h[3]\n" - "ldr 
x2, [%[f_0], #40]\n" - "fmla v6.8h, v18.8h, v0.h[4]\n" - "ins v20.d[1], x2\n" - "fmla v7.8h, v18.8h, v0.h[5]\n" - "ldr d21, [%[f_0], #48]\n" //f_o1c0 - "fmla v8.8h, v18.8h, v0.h[6]\n" - "ldr x3, [%[f_0], #56]\n" - "fmla v9.8h, v18.8h, v0.h[7]\n" - "ins v21.d[1], x3\n" - "fmla v10.8h, v19.8h, v0.h[0]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - "fmla v14.8h, v19.8h, v0.h[4]\n" - "fmla v15.8h, v19.8h, v0.h[5]\n" - "fmla v16.8h, v19.8h, v0.h[6]\n" - "fmla v17.8h, v19.8h, v0.h[7]\n" - - "ldr d0, [%[in_0], #32]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr x1, [%[in_0], #40]\n" - "fmla v3.8h, v20.8h, v1.h[1]\n" - "ins v0.d[1], x1\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr d18, [%[f_0], #64]\n" //f_o0c0 - "fmla v5.8h, v20.8h, v1.h[3]\n" - "ldr x2, [%[f_0], #72]\n" - "fmla v6.8h, v20.8h, v1.h[4]\n" - "ins v18.d[1], x2\n" - "fmla v7.8h, v20.8h, v1.h[5]\n" - "ldr d19, [%[f_0], #80]\n" //f_o1c0 - "fmla v8.8h, v20.8h, v1.h[6]\n" - "ldr x3, [%[f_0], #88]\n" - "fmla v9.8h, v20.8h, v1.h[7]\n" - "ins v19.d[1], x3\n" - "fmla v10.8h, v21.8h, v1.h[0]\n" - "add %[in_0], %[in_0], #32\n" - "fmla v11.8h, v21.8h, v1.h[1]\n" - "add %[f_0], %[f_0], #64\n" - "fmla v12.8h, v21.8h, v1.h[2]\n" - "subs x0, x0, #2\n" - "fmla v13.8h, v21.8h, v1.h[3]\n" - "fmla v14.8h, v21.8h, v1.h[4]\n" - "fmla v15.8h, v21.8h, v1.h[5]\n" - "fmla v16.8h, v21.8h, v1.h[6]\n" - "fmla v17.8h, v21.8h, v1.h[7]\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x0", "x1", "x2", "x3" - ); - - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - "fmax v3.8h, v3.8h, v1.8h\n" - "fmax v4.8h, v4.8h, v1.8h\n" - "fmax v5.8h, v5.8h, v1.8h\n" - "fmax v6.8h, v6.8h, v1.8h\n" - "fmax v7.8h, v7.8h, v1.8h\n" - "fmax v8.8h, v8.8h, v1.8h\n" - "fmax v9.8h, v9.8h, v1.8h\n" - "fmax v10.8h, v10.8h, v1.8h\n" - "fmax v11.8h, v11.8h, v1.8h\n" - "fmax v12.8h, v12.8h, v1.8h\n" - "fmax v13.8h, v13.8h, v1.8h\n" - "fmax v14.8h, v14.8h, v1.8h\n" - "fmax v15.8h, v15.8h, v1.8h\n" - "fmax v16.8h, v16.8h, v1.8h\n" - "fmax v17.8h, v17.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - "fmax v6.8h, v6.8h, v31.8h\n" - "fmax v7.8h, v7.8h, v31.8h\n" - "fmax v8.8h, v8.8h, v31.8h\n" - "fmax v9.8h, v9.8h, v31.8h\n" - "fmax v10.8h, v10.8h, v31.8h\n" - "fmax v11.8h, v11.8h, v31.8h\n" - "fmax v12.8h, v12.8h, v31.8h\n" - "fmax v13.8h, v13.8h, v31.8h\n" - "fmax v14.8h, v14.8h, v31.8h\n" - "fmax v15.8h, v15.8h, v31.8h\n" - "fmax v16.8h, v16.8h, v31.8h\n" - "fmax v17.8h, v17.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - "fmin v6.8h, v6.8h, v30.8h\n" - "fmin v7.8h, v7.8h, v30.8h\n" - "fmin v8.8h, v8.8h, v30.8h\n" - "fmin v9.8h, v9.8h, v30.8h\n" - "fmin v10.8h, 
v10.8h, v30.8h\n" - "fmin v11.8h, v11.8h, v30.8h\n" - "fmin v12.8h, v12.8h, v30.8h\n" - "fmin v13.8h, v13.8h, v30.8h\n" - "fmin v14.8h, v14.8h, v30.8h\n" - "fmin v15.8h, v15.8h, v30.8h\n" - "fmin v16.8h, v16.8h, v30.8h\n" - "fmin v17.8h, v17.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - "str q6, [%[out_0], #64]\n" //out_o0hw4 - "str q7, [%[out_0], #80]\n" //out_o0hw5 - "str q8, [%[out_0], #96]\n" //out_o0hw6 - "str q9, [%[out_0], #112]\n" //out_o0hw7 - "str q10, [%[out_1]]\n" //out_o1hw0 - "str q11, [%[out_1], #16]\n" //out_o1hw1 - "str q12, [%[out_1], #32]\n" //out_o1hw2 - "str q13, [%[out_1], #48]\n" //out_o1hw3 - "str q14, [%[out_1], #64]\n" //out_o1hw4 - "str q15, [%[out_1], #80]\n" //out_o1hw5 - "str q16, [%[out_1], #96]\n" //out_o1hw6 - "str q17, [%[out_1], #112]\n" //out_o1hw7 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0) - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + (oc-1)*8*fh*fw*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + (oc-1)*8; - __asm__ __volatile__( - "ldr q12, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" // ic_blk - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v12.16b\n" //out_o0hw0 - "ldr x1, [%[in_0], #8]\n" - "mov v3.16b, v12.16b\n" //out_o0hw1 - "ins v0.d[1], x1\n" - "mov v4.16b, v12.16b\n" //out_o0hw2 - "ldr d10, [%[f_0]]\n" //f_o0c0 - "mov v5.16b, v12.16b\n" //out_o0hw3 - "ldr x2, [%[f_0], #8]\n" - "mov v6.16b, v12.16b\n" //out_o0hw4 - "ins v10.d[1], x2\n" - "mov v7.16b, v12.16b\n" //out_o0hw5 - "mov v8.16b, v12.16b\n" //out_o0hw6 - "mov v9.16b, v12.16b\n" //out_o0hw7 - "0:\n" - "ldr d1, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v10.8h, v0.h[0]\n" - "ldr x1, [%[in_0], #24]\n" - "fmla v3.8h, v10.8h, v0.h[1]\n" - "ins v1.d[1], x1\n" - "fmla v4.8h, v10.8h, v0.h[2]\n" - "ldr d11, [%[f_0], #16]\n" //f_o0c0 - "fmla v5.8h, v10.8h, v0.h[3]\n" - "ldr x2, [%[f_0], #24]\n" - "fmla v6.8h, v10.8h, v0.h[4]\n" - "ins v11.d[1], x2\n" - "fmla v7.8h, v10.8h, v0.h[5]\n" - "subs x0, x0, #2\n" - "fmla v8.8h, v10.8h, v0.h[6]\n" - "fmla v9.8h, v10.8h, v0.h[7]\n" - - "ldr d0, [%[in_0], #32]\n" //in_hw0 - "fmla v2.8h, v11.8h, v1.h[0]\n" - "ldr x1, [%[in_0], #40]\n" - "fmla v3.8h, v11.8h, v1.h[1]\n" - "ins v0.d[1], x1\n" - "fmla v4.8h, v11.8h, v1.h[2]\n" - "ldr d10, [%[f_0], #32]\n" //f_o0c0 - "fmla v5.8h, v11.8h, v1.h[3]\n" - "ldr x2, [%[f_0], #40]\n" - "fmla v6.8h, v11.8h, v1.h[4]\n" - "ins v10.d[1], x2\n" - "fmla v7.8h, v11.8h, v1.h[5]\n" - "add %[in_0], %[in_0], #32\n" - "fmla v8.8h, v11.8h, v1.h[6]\n" - "add %[f_0], %[f_0], #32\n" - "fmla v9.8h, v11.8h, v1.h[7]\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_o0) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "x0", "x1", "x2" - ); - - switch (activationDesc.mode) { - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, 
v2.8h, v1.8h\n" //max(v2, 0) - "fmax v3.8h, v3.8h, v1.8h\n" - "fmax v4.8h, v4.8h, v1.8h\n" - "fmax v5.8h, v5.8h, v1.8h\n" - "fmax v6.8h, v6.8h, v1.8h\n" - "fmax v7.8h, v7.8h, v1.8h\n" - "fmax v8.8h, v8.8h, v1.8h\n" - "fmax v9.8h, v9.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - "fmax v6.8h, v6.8h, v31.8h\n" - "fmax v7.8h, v7.8h, v31.8h\n" - "fmax v8.8h, v8.8h, v31.8h\n" - "fmax v9.8h, v9.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - "fmin v6.8h, v6.8h, v30.8h\n" - "fmin v7.8h, v7.8h, v30.8h\n" - "fmin v8.8h, v8.8h, v30.8h\n" - "fmin v9.8h, v9.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw0 - "str q4, [%[out_0], #32]\n" //out_o0hw0 - "str q5, [%[out_0], #48]\n" //out_o0hw0 - "str q6, [%[out_0], #64]\n" //out_o0hw0 - "str q7, [%[out_0], #80]\n" //out_o0hw0 - "str q8, [%[out_0], #96]\n" //out_o0hw0 - "str q9, [%[out_0], #112]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9" - ); - } - } - - // ohow_reminder % 8 / 4 - U32 ohow_s = (ohow / 8) * 8; - //U32 ohow_s = (ohow/8)*8; - for (I32 hw = ohow_s; hw < ohow-3; hw+=4) { - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - const F16 *f_o0c0 = filterArray; - F16 *in_pack = ((F16*)tmp) + ic*ih_pad*iw_pad*8; - // pack input - // NCHWc8 => NHWChw4 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - U32 in_h_1 = ((hw+1)/ow)*strideH; - U32 in_w_1 = ((hw+1)%ow)*strideW; - U32 in_h_2 = ((hw+2)/ow)*strideH; - U32 in_w_2 = ((hw+2)%ow)*strideW; - U32 in_h_3 = ((hw+3)/ow)*strideH; - U32 in_w_3 = ((hw+3)%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F16 *in_hw4c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F16 *in_0 = in_hw4c8 + in_h_0*iw_pad*8 + in_w_0*8; - F16 *in_1 = in_hw4c8 + in_h_1*iw_pad*8 + in_w_1*8; - F16 *in_2 = in_hw4c8 + in_h_2*iw_pad*8 + in_w_2*8; - F16 *in_3 = in_hw4c8 + in_h_3*iw_pad*8 + in_w_3*8; - F16 *in_pack_c8hw4 = in_pack + fh_idx*fw*ic*8*4 + fw_idx*ic*8*4 + c*8*4; - - /* - * for (U32 c8 = 0; c8 < 8; c8++) { - * for (U32 hw4 = 0; hw4 < 4; hw4++) { - * in_pack_c8hw4[c8*4 + hw4] = in_hw4c8[hw4*8 + c8]; - * } - * } - */ - - __asm__ __volatile__( - "ldr q0, [%[in_0]]\n" - "ldr q1, [%[in_1]]\n" - "ldr q2, [%[in_2]]\n" - "ldr q3, [%[in_3]]\n" - "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[in_pack_0]]\n" - :[in_pack_0]"+r"(in_pack_c8hw4) - :[in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3) - :"memory", "cc", "v0", "v1", "v2", "v3" - ); - } - } - } - - // compute - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "ldr x1, [%[b_0], #8]\n" - 
"ins v22.d[1], x1\n" - "ldr d23, [%[b_1]]\n" //b_o1 - "ldr x2, [%[b_1], #8]\n" - "ins v23.d[1], x2\n" - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "ldr x2, [%[f_0], #8]\n" - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ins v18.d[1], x2\n" - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ldr d19, [%[f_0], #16]\n" //f_o1c0 - "mov v11.16b, v23.16b\n" //out_o1hw1 - "ldr x3, [%[f_0], #24]\n" - "mov v12.16b, v23.16b\n" //out_o1hw2 - "ins v19.d[1], x3\n" - "mov v13.16b, v23.16b\n" //out_o1hw3 - "0:\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], #32]\n" //f_o0c0 - "fmla v3.8h, v18.8h, v0.h[1]\n" - "ldr x2, [%[f_0], #40]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ins v20.d[1], x2\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "ldr d21, [%[f_0], #48]\n" //f_o1c0 - "fmla v10.8h, v19.8h, v0.h[0]\n" - "ldr x3, [%[f_0], #56]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "ins v21.d[1], x3\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "subs x0, x0, #2\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - - "ldr d0, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f_0], #64]\n" //f_o0c0 - "fmla v3.8h, v20.8h, v1.h[1]\n" - "ldr x2, [%[f_0], #72]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr d19, [%[f_0], #80]\n" //f_o1c0 - "fmla v5.8h, v20.8h, v1.h[3]\n" - "ins v18.d[1], x2\n" - "fmla v10.8h, v21.8h, v1.h[0]\n" - "ldr x3, [%[f_0], #88]\n" - "fmla v11.8h, v21.8h, v1.h[1]\n" - "ins v19.d[1], x3\n" - "fmla v12.8h, v21.8h, v1.h[2]\n" - "add %[in_0], %[in_0], #16\n" - "fmla v13.8h, v21.8h, v1.h[3]\n" - "add %[f_0], %[f_0], #64\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0", "x1", "x2", "x3" - ); - - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - "fmax v3.8h, v3.8h, v1.8h\n" - "fmax v4.8h, v4.8h, v1.8h\n" - "fmax v5.8h, v5.8h, v1.8h\n" - "fmax v10.8h, v10.8h, v1.8h\n" - "fmax v11.8h, v11.8h, v1.8h\n" - "fmax v12.8h, v12.8h, v1.8h\n" - "fmax v13.8h, v13.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - "fmax v10.8h, v10.8h, v31.8h\n" - "fmax v11.8h, v11.8h, v31.8h\n" - "fmax v12.8h, v12.8h, v31.8h\n" - "fmax v13.8h, v13.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - "fmin v10.8h, v10.8h, v30.8h\n" - "fmin v11.8h, v11.8h, v30.8h\n" - "fmin v12.8h, v12.8h, v30.8h\n" - "fmin v13.8h, v13.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - "str q10, 
[%[out_1]]\n" //out_o1hw0 - "str q11, [%[out_1], #16]\n" //out_o1hw1 - "str q12, [%[out_1], #32]\n" //out_o1hw2 - "str q13, [%[out_1], #48]\n" //out_o1hw3 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0) - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v10", - "v11", "v12", "v13" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + (oc-1)*8*fh*fw*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + (oc-1)*8; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "ldr x2, [%[f_0], #8]\n" - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ins v18.d[1], x2\n" - "0:\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], #16]\n" //f_o0c0 - "fmla v3.8h, v18.8h, v0.h[1]\n" - "ldr x2, [%[f_0], #24]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ins v20.d[1], x2\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "subs x0, x0, #2\n" - - "ldr d0, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f_0], #32]\n" //f_o0c0 - "fmla v3.8h, v20.8h, v1.h[1]\n" - "ldr x2, [%[f_0], #40]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ins v18.d[1], x2\n" - "fmla v5.8h, v20.8h, v1.h[3]\n" - "add %[in_0], %[in_0], #16\n" - "add %[f_0], %[f_0], #32\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_o0) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v18", "v20", "v22", "x0", "x1", "x2" - ); - - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - "fmax v3.8h, v3.8h, v1.8h\n" - "fmax v4.8h, v4.8h, v1.8h\n" - "fmax v5.8h, v5.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v3", "v4", "v5" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v2", "v3", "v4", "v5" - ); - } - } - - // ohow_reminder % 4 - ohow_s = (ohow / 4) * 4; - for (I32 hw = ohow_s; hw < ohow; hw++) { - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - const F16 *f_o0c0 = filterArray; - F16 *in_pack = ((F16*)tmp) + ic*ih_pad*iw_pad*8; - // pack input - // NCHWc8 => NHWChw4 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F16 *in_hw1c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F16 *in_0 = in_hw1c8 + in_h_0*iw_pad*8 + in_w_0*8; 
- F16 *in_pack_c8hw1 = in_pack + fh_idx*fw*ic*8 + fw_idx*ic*8 + c*8; - /* - * for (U32 c8 = 0; c8 < 8; c8++) { - * in_pack_c8hw1[c8] = in_0[c8]; - * } - */ - memcpy(in_pack_c8hw1, in_0, 8*bytesOf(idt)); - } - } - } - - // compute - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "ldr d23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "ldr x2, [%[b_1], #8]\n" - "ins v23.d[1], x2\n" - "ldr h0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ldr x2, [%[f_0], #8]\n" - "ins v18.d[1], x2\n" - "ldr d19, [%[f_0], #16]\n" //f_o1c0 - "ldr x3, [%[f_0], #24]\n" - "ins v19.d[1], x3\n" - "0:\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], #32]\n" //f_o0c0 - "fmla v10.8h, v19.8h, v0.h[0]\n" - "ldr x2, [%[f_0], #40]\n" - "ins v20.d[1], x2\n" - "ldr d21, [%[f_0], #48]\n" //f_o1c0 - "subs x0, x0, #2\n" - "ldr x3, [%[f_0], #56]\n" - "ins v21.d[1], x3\n" - - "ldr h0, [%[in_0], #4]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f_0], #64]\n" //f_o0c0 - "fmla v10.8h, v21.8h, v1.h[0]\n" - "ldr x2, [%[f_0], #72]\n" - "ins v18.d[1], x2\n" - "ldr d19, [%[f_0], #80]\n" //f_o1c0 - "add %[in_0], %[in_0], #4\n" - "ldr x3, [%[f_0], #88]\n" - "ins v19.d[1], x3\n" - "add %[f_0], %[f_0], #64\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", "v23", "x0", "x1", "x2", "x3" - ); - - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - "fmax v10.8h, v10.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v10" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v10.8h, v10.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v10.8h, v10.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v10", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q10, [%[out_1]]\n" //out_o1hw0 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0) - : - :"memory", "cc", "v2", "v10" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + (oc-1)*8*fh*fw*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + (oc-1)*8; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" //ic_blk - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "ldr h0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "ldr x2, [%[f_0], #8]\n" - "ins v18.d[1], x2\n" - "0:\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], #16]\n" //f_o0c0 - "subs x0, x0, #2\n" - "ldr x2, [%[f_0], #24]\n" - "ins v20.d[1], x2\n" - - "ldr h0, [%[in_0], #4]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, 
[%[f_0], #32]\n" //f_o0c0 - "ldr x2, [%[f_0], #40]\n" - "ins v18.d[1], x2\n" - "add %[in_0], %[in_0], #4\n" - "add %[f_0], %[f_0], #32\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_o0) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0", "x1", "x2" - ); - - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - : - : - :"memory", "cc", "v1", "v2" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmin v2.8h, v2.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v2" - ); - } - } - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp16/convolution_gemm_A76.cpp b/tensor_computing/src/cpu/arm/fp16/convolution_gemm_A76.cpp deleted file mode 100644 index 4fb234b3..00000000 --- a/tensor_computing/src/cpu/arm/fp16/convolution_gemm_A76.cpp +++ /dev/null @@ -1,943 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include <string.h> -#include "cpu/arm/fp16/convolution_gemm.h" - -EE convolution_gemm_A76(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (fdf != DF_NHWCN16) - CHECK_STATUS(NOT_MATCH); - - oc /= 8; - ic /= 8; - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - I32 ohow = oh*ow; - U32 ihiw = ih_pad*iw_pad; - F16 *inArray_pad; - EE ret = SUCCESS; - for (U32 n = 0; n < in; n++) { - if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { - inArray_pad = inArray + n*ic*ih*iw*8; - } else { - // copy input into a input with padding - inArray_pad = (F16*)tmp; - F16 *inArray_pad_mov = inArray_pad; - F16 *inArray_mov = inArray + n*ic*ih*iw*8; - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(idt)); - inArray_pad_mov += iw_pad*8; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*8*bytesOf(idt)); - inArray_pad_mov += paddingL*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*bytesOf(idt)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, paddingR*8*bytesOf(idt)); - inArray_pad_mov += paddingR*8; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(idt)); - inArray_pad_mov += iw_pad*8; - } - } - } - // ohow / 8 - for (I32 hw = 0; hw < ohow-7; hw+=8) { - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - const F16 *f_o0c0 = filterArray; - F16 *in_pack = ((F16*)tmp) + ic*ih_pad*iw_pad*8; - // pack input - // NCHWc8 => NHWChw8 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - U32 in_h_1 = ((hw+1)/ow)*strideH; - U32 in_w_1 = ((hw+1)%ow)*strideW; - U32 in_h_2 = ((hw+2)/ow)*strideH; - U32 in_w_2 = ((hw+2)%ow)*strideW; - U32 in_h_3 = ((hw+3)/ow)*strideH; - U32 in_w_3 = ((hw+3)%ow)*strideW; - U32 in_h_4 = ((hw+4)/ow)*strideH; - U32 in_w_4 = ((hw+4)%ow)*strideW; - U32 in_h_5 = ((hw+5)/ow)*strideH; - U32 in_w_5 = ((hw+5)%ow)*strideW; - U32 in_h_6 = ((hw+6)/ow)*strideH; - U32 in_w_6 = ((hw+6)%ow)*strideW; - U32 in_h_7 = ((hw+7)/ow)*strideH; - U32 in_w_7 = ((hw+7)%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F16 *in_hw8c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8+ fw_idx*dilateW*8; - F16 *in_0 = in_hw8c8 + in_h_0*iw_pad*8 + in_w_0*8; - F16 *in_1 = in_hw8c8 + in_h_1*iw_pad*8 + in_w_1*8; - F16 *in_2 = in_hw8c8 + in_h_2*iw_pad*8 + in_w_2*8; - F16 *in_3 = in_hw8c8 + in_h_3*iw_pad*8 + in_w_3*8; - F16 *in_4 = in_hw8c8 + in_h_4*iw_pad*8 + in_w_4*8; - F16
*in_5 = in_hw8c8 + in_h_5*iw_pad*8 + in_w_5*8; - F16 *in_6 = in_hw8c8 + in_h_6*iw_pad*8 + in_w_6*8; - F16 *in_7 = in_hw8c8 + in_h_7*iw_pad*8 + in_w_7*8; - - // NHWChw8 - F16 *in_pack_c8hw8 = in_pack + fh_idx*fw*ic*8*8 + fw_idx*ic*8*8 + c*8*8; - /* - * for (U32 c8 = 0; c8 < 8; c8++) { - * for (U32 hw8 = 0; hw8 < 8; hw8++) { - * in_pack_c8hw8[c8*8 + hw8] = in_hw8c8[hw8*8 + c8]; - * } - * } - */ - float16x8_t v0 = vld1q_f16(in_0); - float16x8_t v1 = vld1q_f16(in_1); - float16x8_t v2 = vld1q_f16(in_2); - float16x8_t v3 = vld1q_f16(in_3); - float16x8_t v4 = vld1q_f16(in_4); - float16x8_t v5 = vld1q_f16(in_5); - float16x8_t v6 = vld1q_f16(in_6); - float16x8_t v7 = vld1q_f16(in_7); - vst1q_f16(in_pack_c8hw8, - vzip1q_f16( - vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8, - vzip2q_f16( - vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*2, - vzip1q_f16( - vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*3, - vzip2q_f16( - vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*4, - vzip1q_f16( - vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*5, - vzip2q_f16( - vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*6, - vzip1q_f16( - vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*7, - vzip2q_f16( - vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - } - } - } - - // compute - for (I32 o = 0; o < I32(oc)-1; o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "ldr q23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr q0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov v6.16b, v22.16b\n" //out_o0hw4 - "mov v7.16b, v22.16b\n" //out_o0hw5 - "mov v8.16b, v22.16b\n" //out_o0hw6 - "ldr q19, [%[f_0], #16]\n" //f_o1c0 - "mov v9.16b, v22.16b\n" //out_o0hw7 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "mov v11.16b, v23.16b\n" //out_o1hw1 - "mov v12.16b, v23.16b\n" //out_o1hw2 - "mov v13.16b, v23.16b\n" //out_o1hw3 - "mov v14.16b, v23.16b\n" //out_o1hw4 - "mov v15.16b, v23.16b\n" //out_o1hw5 - "mov v16.16b, v23.16b\n" //out_o1hw6 - "mov v17.16b, v23.16b\n" //out_o1hw7 - "0:\n" - "ldr q1, [%[in_0], #16]\n" //in_hw0 - "ldr q20, [%[f_0], #32]\n" //f_o0c0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "ldr q21, [%[f_0], #48]\n" //f_o1c0 - "fmla v4.8h, v18.8h, v0.h[2]\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "fmla v6.8h, v18.8h, v0.h[4]\n" - "fmla v7.8h, v18.8h, v0.h[5]\n" - "fmla v8.8h, v18.8h, v0.h[6]\n" - "fmla v9.8h, v18.8h, v0.h[7]\n" - "fmla v10.8h, v19.8h, v0.h[0]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - 
"fmla v14.8h, v19.8h, v0.h[4]\n" - "fmla v15.8h, v19.8h, v0.h[5]\n" - "fmla v16.8h, v19.8h, v0.h[6]\n" - "fmla v17.8h, v19.8h, v0.h[7]\n" - "subs x0, x0, #2\n" - - "ldr q0, [%[in_0], #32]\n" //in_hw0 - "ldr q18, [%[f_0], #64]\n" //f_o0c0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "fmla v3.8h, v20.8h, v1.h[1]\n" - "ldr q19, [%[f_0], #80]\n" //f_o1c0 - "fmla v4.8h, v20.8h, v1.h[2]\n" - "fmla v5.8h, v20.8h, v1.h[3]\n" - "fmla v6.8h, v20.8h, v1.h[4]\n" - "fmla v7.8h, v20.8h, v1.h[5]\n" - "fmla v8.8h, v20.8h, v1.h[6]\n" - "fmla v9.8h, v20.8h, v1.h[7]\n" - "fmla v10.8h, v21.8h, v1.h[0]\n" - "fmla v11.8h, v21.8h, v1.h[1]\n" - "fmla v12.8h, v21.8h, v1.h[2]\n" - "fmla v13.8h, v21.8h, v1.h[3]\n" - "add %[in_0], %[in_0], #32\n" - "add %[f_0], %[f_0], #64\n" - "fmla v14.8h, v21.8h, v1.h[4]\n" - "fmla v15.8h, v21.8h, v1.h[5]\n" - "fmla v16.8h, v21.8h, v1.h[6]\n" - "fmla v17.8h, v21.8h, v1.h[7]\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "x0" - ); - - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - "fmax v3.8h, v3.8h, v1.8h\n" - "fmax v4.8h, v4.8h, v1.8h\n" - "fmax v5.8h, v5.8h, v1.8h\n" - "fmax v6.8h, v6.8h, v1.8h\n" - "fmax v7.8h, v7.8h, v1.8h\n" - "fmax v8.8h, v8.8h, v1.8h\n" - "fmax v9.8h, v9.8h, v1.8h\n" - "fmax v10.8h, v10.8h, v1.8h\n" - "fmax v11.8h, v11.8h, v1.8h\n" - "fmax v12.8h, v12.8h, v1.8h\n" - "fmax v13.8h, v13.8h, v1.8h\n" - "fmax v14.8h, v14.8h, v1.8h\n" - "fmax v15.8h, v15.8h, v1.8h\n" - "fmax v16.8h, v16.8h, v1.8h\n" - "fmax v17.8h, v17.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - "fmax v6.8h, v6.8h, v31.8h\n" - "fmax v7.8h, v7.8h, v31.8h\n" - "fmax v8.8h, v8.8h, v31.8h\n" - "fmax v9.8h, v9.8h, v31.8h\n" - "fmax v10.8h, v10.8h, v31.8h\n" - "fmax v11.8h, v11.8h, v31.8h\n" - "fmax v12.8h, v12.8h, v31.8h\n" - "fmax v13.8h, v13.8h, v31.8h\n" - "fmax v14.8h, v14.8h, v31.8h\n" - "fmax v15.8h, v15.8h, v31.8h\n" - "fmax v16.8h, v16.8h, v31.8h\n" - "fmax v17.8h, v17.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - "fmin v6.8h, v6.8h, v30.8h\n" - "fmin v7.8h, v7.8h, v30.8h\n" - "fmin v8.8h, v8.8h, v30.8h\n" - "fmin v9.8h, v9.8h, v30.8h\n" - "fmin v10.8h, v10.8h, v30.8h\n" - "fmin v11.8h, v11.8h, v30.8h\n" - "fmin v12.8h, v12.8h, v30.8h\n" - "fmin v13.8h, v13.8h, v30.8h\n" - "fmin v14.8h, v14.8h, v30.8h\n" - "fmin v15.8h, v15.8h, v30.8h\n" - "fmin v16.8h, v16.8h, v30.8h\n" - "fmin v17.8h, v17.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str 
q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - "str q6, [%[out_0], #64]\n" //out_o0hw4 - "str q7, [%[out_0], #80]\n" //out_o0hw5 - "str q8, [%[out_0], #96]\n" //out_o0hw6 - "str q9, [%[out_0], #112]\n" //out_o0hw7 - "str q10, [%[out_1]]\n" //out_o1hw0 - "str q11, [%[out_1], #16]\n" //out_o1hw1 - "str q12, [%[out_1], #32]\n" //out_o1hw2 - "str q13, [%[out_1], #48]\n" //out_o1hw3 - "str q14, [%[out_1], #64]\n" //out_o1hw4 - "str q15, [%[out_1], #80]\n" //out_o1hw5 - "str q16, [%[out_1], #96]\n" //out_o1hw6 - "str q17, [%[out_1], #112]\n" //out_o1hw7 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0) - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + (oc-1)*8*fh*fw*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + (oc-1)*8; - __asm__ __volatile__( - "ldr q12, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" // ic_blk - "ldr q0, [%[in_0]]\n" //in_hw0 - "ldr q10, [%[f_0]]\n" //f_o0c0 - "mov v2.16b, v12.16b\n" //out_o0hw0 - "mov v3.16b, v12.16b\n" //out_o0hw1 - "mov v4.16b, v12.16b\n" //out_o0hw2 - "mov v5.16b, v12.16b\n" //out_o0hw3 - "mov v6.16b, v12.16b\n" //out_o0hw4 - "mov v7.16b, v12.16b\n" //out_o0hw5 - "mov v8.16b, v12.16b\n" //out_o0hw6 - "mov v9.16b, v12.16b\n" //out_o0hw7 - "0:\n" - "ldr q1, [%[in_0], #16]\n" //in_hw0 - "ldr q11, [%[f_0], #16]\n" //f_o0c0 - "fmla v2.8h, v10.8h, v0.h[0]\n" - "fmla v3.8h, v10.8h, v0.h[1]\n" - "fmla v4.8h, v10.8h, v0.h[2]\n" - "fmla v5.8h, v10.8h, v0.h[3]\n" - "fmla v6.8h, v10.8h, v0.h[4]\n" - "fmla v7.8h, v10.8h, v0.h[5]\n" - "fmla v8.8h, v10.8h, v0.h[6]\n" - "fmla v9.8h, v10.8h, v0.h[7]\n" - "subs x0, x0, #2\n" - - "ldr q0, [%[in_0], #32]\n" //in_hw0 - "ldr q10, [%[f_0], #32]\n" //f_o0c0 - "fmla v2.8h, v11.8h, v1.h[0]\n" - "fmla v3.8h, v11.8h, v1.h[1]\n" - "fmla v4.8h, v11.8h, v1.h[2]\n" - "fmla v5.8h, v11.8h, v1.h[3]\n" - "add %[in_0], %[in_0], #32\n" - "add %[f_0], %[f_0], #32\n" - "fmla v6.8h, v11.8h, v1.h[4]\n" - "fmla v7.8h, v11.8h, v1.h[5]\n" - "fmla v8.8h, v11.8h, v1.h[6]\n" - "fmla v9.8h, v11.8h, v1.h[7]\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_o0) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "x0" - ); - - switch (activationDesc.mode) { - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - "fmax v3.8h, v3.8h, v1.8h\n" - "fmax v4.8h, v4.8h, v1.8h\n" - "fmax v5.8h, v5.8h, v1.8h\n" - "fmax v6.8h, v6.8h, v1.8h\n" - "fmax v7.8h, v7.8h, v1.8h\n" - "fmax v8.8h, v8.8h, v1.8h\n" - "fmax v9.8h, v9.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - "fmax v6.8h, v6.8h, v31.8h\n" - "fmax v7.8h, v7.8h, v31.8h\n" - "fmax v8.8h, v8.8h, v31.8h\n" - "fmax v9.8h, v9.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - "fmin v6.8h, v6.8h, v30.8h\n" - 
"fmin v7.8h, v7.8h, v30.8h\n" - "fmin v8.8h, v8.8h, v30.8h\n" - "fmin v9.8h, v9.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw0 - "str q4, [%[out_0], #32]\n" //out_o0hw0 - "str q5, [%[out_0], #48]\n" //out_o0hw0 - "str q6, [%[out_0], #64]\n" //out_o0hw0 - "str q7, [%[out_0], #80]\n" //out_o0hw0 - "str q8, [%[out_0], #96]\n" //out_o0hw0 - "str q9, [%[out_0], #112]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9" - ); - } - } - - // ohow_reminder % 8 / 4 - U32 ohow_s = (ohow / 8) * 8; - - for (I32 hw = ohow_s; hw < ohow-3; hw+=4) { - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - const F16 *f_o0c0 = filterArray; - F16 *in_pack = ((F16*)tmp) + ic*ih_pad*iw_pad*8; - // pack input - // NCHWc8 => NHWChw4 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - U32 in_h_1 = ((hw+1)/ow)*strideH; - U32 in_w_1 = ((hw+1)%ow)*strideW; - U32 in_h_2 = ((hw+2)/ow)*strideH; - U32 in_w_2 = ((hw+2)%ow)*strideW; - U32 in_h_3 = ((hw+3)/ow)*strideH; - U32 in_w_3 = ((hw+3)%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F16 *in_hw4c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F16 *in_0 = in_hw4c8 + in_h_0*iw_pad*8 + in_w_0*8; - F16 *in_1 = in_hw4c8 + in_h_1*iw_pad*8 + in_w_1*8; - F16 *in_2 = in_hw4c8 + in_h_2*iw_pad*8 + in_w_2*8; - F16 *in_3 = in_hw4c8 + in_h_3*iw_pad*8 + in_w_3*8; - F16 *in_pack_c8hw4 = in_pack + fh_idx*fw*ic*8*4 + fw_idx*ic*8*4 + c*8*4; - - /* - * for (U32 c8 = 0; c8 < 8; c8++) { - * for (U32 hw4 = 0; hw4 < 4; hw4++) { - * in_pack_c8hw4[c8*4 + hw4] = in_hw4c8[hw4*8 + c8]; - * } - * } - */ - - __asm__ __volatile__( - "ldr q0, [%[in_0]]\n" - "ldr q1, [%[in_1]]\n" - "ldr q2, [%[in_2]]\n" - "ldr q3, [%[in_3]]\n" - "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[in_pack_0]]\n" - :[in_pack_0]"+r"(in_pack_c8hw4) - :[in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3) - :"memory", "cc", "v0", "v1", "v2", "v3" - ); - } - } - } - - // compute - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "ldr q23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "ldr d0, [%[in_0]]\n" //in_hw0 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "ldr q19, [%[f_0], #16]\n" //f_o1c0 - "mov v2.16b, v22.16b\n" //out_o0hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "mov v5.16b, v22.16b\n" //out_o0hw3 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "mov v11.16b, v23.16b\n" //out_o1hw1 - "mov v12.16b, v23.16b\n" //out_o1hw2 - "mov v13.16b, v23.16b\n" //out_o1hw3 - "0:\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 - "ldr q20, [%[f_0], #32]\n" //f_o0c0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "ldr q21, [%[f_0], #48]\n" //f_o1c0 - "fmla v4.8h, v18.8h, v0.h[2]\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "fmla v10.8h, v19.8h, v0.h[0]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - "subs x0, x0, #2\n" - - "ldr d0, [%[in_0], #16]\n" //in_hw0 - "ldr q18, [%[f_0], 
#64]\n" //f_o0c0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "fmla v3.8h, v20.8h, v1.h[1]\n" - "ldr q19, [%[f_0], #80]\n" //f_o1c0 - "fmla v4.8h, v20.8h, v1.h[2]\n" - "fmla v5.8h, v20.8h, v1.h[3]\n" - "add %[in_0], %[in_0], #16\n" - "add %[f_0], %[f_0], #64\n" - "fmla v10.8h, v21.8h, v1.h[0]\n" - "fmla v11.8h, v21.8h, v1.h[1]\n" - "fmla v12.8h, v21.8h, v1.h[2]\n" - "fmla v13.8h, v21.8h, v1.h[3]\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0" - ); - - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - "fmax v3.8h, v3.8h, v1.8h\n" - "fmax v4.8h, v4.8h, v1.8h\n" - "fmax v5.8h, v5.8h, v1.8h\n" - "fmax v10.8h, v10.8h, v1.8h\n" - "fmax v11.8h, v11.8h, v1.8h\n" - "fmax v12.8h, v12.8h, v1.8h\n" - "fmax v13.8h, v13.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - "fmax v10.8h, v10.8h, v31.8h\n" - "fmax v11.8h, v11.8h, v31.8h\n" - "fmax v12.8h, v12.8h, v31.8h\n" - "fmax v13.8h, v13.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - "fmin v10.8h, v10.8h, v30.8h\n" - "fmin v11.8h, v11.8h, v30.8h\n" - "fmin v12.8h, v12.8h, v30.8h\n" - "fmin v13.8h, v13.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - "str q10, [%[out_1]]\n" //out_o1hw0 - "str q11, [%[out_1], #16]\n" //out_o1hw1 - "str q12, [%[out_1], #32]\n" //out_o1hw2 - "str q13, [%[out_1], #48]\n" //out_o1hw3 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0) - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v10", - "v11", "v12", "v13" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + (oc-1)*8*fh*fw*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + (oc-1)*8; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "mov v5.16b, v22.16b\n" //out_o0hw3 - "0:\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 - "ldr q20, [%[f_0], #16]\n" //f_o0c0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "subs x0, x0, #2\n" - - "ldr d0, [%[in_0], #16]\n" //in_hw0 - "ldr q18, [%[f_0], #32]\n" //f_o0c0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "fmla v3.8h, v20.8h, v1.h[1]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "fmla v5.8h, v20.8h, v1.h[3]\n" - "add %[in_0], %[in_0], #16\n" - "add %[f_0], 
%[f_0], #32\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_o0) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v18", "v20", "v22", "x0" - ); - - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - "fmax v3.8h, v3.8h, v1.8h\n" - "fmax v4.8h, v4.8h, v1.8h\n" - "fmax v5.8h, v5.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v3", "v4", "v5" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v2", "v3", "v4", "v5" - ); - } - } - - // ohow_reminder % 4 - ohow_s = (ohow / 4) * 4; - for (I32 hw = ohow_s; hw < ohow; hw++) { - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - const F16 *f_o0c0 = filterArray; - F16 *in_pack = ((F16*)tmp) + ic*ih_pad*iw_pad*8; - // pack input - // NCHWc8 => NHWChw4 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F16 *in_hw1c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F16 *in_0 = in_hw1c8 + in_h_0*iw_pad*8 + in_w_0*8; - F16 *in_pack_c8hw1 = in_pack + fh_idx*fw*ic*8 + fw_idx*ic*8 + c*8; - /* - * for (U32 c8 = 0; c8 < 8; c8++) { - * in_pack_c8hw1[c8] = in_0[c8]; - * } - */ - memcpy(in_pack_c8hw1, in_0, 8*bytesOf(idt)); - } - } - } - - // compute - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "ldr q23, [%[b_1]]\n" //b_o1 - "ldr h0, [%[in_0]]\n" //in_hw0 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "ldr q19, [%[f_0], #16]\n" //f_o1c0 - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "0:\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 - "ldr q20, [%[f_0], #32]\n" //f_o0c0 - "ldr q21, [%[f_0], #48]\n" //f_o1c0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "fmla v10.8h, v19.8h, v0.h[0]\n" - "subs x0, x0, #2\n" - - "ldr h0, [%[in_0], #4]\n" //in_hw0 - "ldr q18, [%[f_0], #64]\n" //f_o0c0 - "ldr q19, [%[f_0], #80]\n" //f_o1c0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "fmla v10.8h, v21.8h, v1.h[0]\n" - "add %[in_0], %[in_0], #4\n" - "add %[f_0], %[f_0], #64\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", "v23", "x0" - ); - - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, 
v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - "fmax v10.8h, v10.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v10" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v10.8h, v10.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v10.8h, v10.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v10", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q10, [%[out_1]]\n" //out_o1hw0 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0) - : - :"memory", "cc", "v2", "v10" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + (oc-1)*8*fh*fw*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + (oc-1)*8; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "ldr h0, [%[in_0]]\n" //in_hw0 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "0:\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 - "ldr q20, [%[f_0], #16]\n" //f_o0c0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "subs x0, x0, #2\n" - - "ldr h0, [%[in_0], #4]\n" //in_hw0 - "ldr q18, [%[f_0], #32]\n" //f_o0c0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "add %[in_0], %[in_0], #4\n" - "add %[f_0], %[f_0], #32\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_o0) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0" - ); - - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - : - : - :"memory", "cc", "v1", "v2" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmin v2.8h, v2.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v2" - ); - } - } - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp16/convolution_gemm_icnchw.h b/tensor_computing/src/cpu/arm/fp16/convolution_gemm_icnchw.h deleted file mode 100644 index 79172a6d..00000000 --- a/tensor_computing/src/cpu/arm/fp16/convolution_gemm_icnchw.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_CONVOLUTION_GEMM_ICNCHW -#define _H_CONVOLUTION_GEMM_ICNCHW - -#include -#include "sys.h" -#include "type.h" -#include "error.h" -#include "tensor_desc.h" - -#include "tensor_computing_type.h" - -EE convolution_gemm_icnchw_A55(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc); - -EE convolution_gemm_icnchw_A76(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc); - -inline EE convolution_gemm_icnchw(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc, - Arch arch) -{ - EE ret = SUCCESS; - switch (arch) { - case ARM_A55: - ret = convolution_gemm_icnchw_A55(inputDesc, inArray, - filterDesc, filterArray, - convDesc, - biasDesc, biasArray, - tmpBytes, tmp, - outputDesc, outArray, - activationDesc); - break; - case ARM_A76: - ret = convolution_gemm_icnchw_A76(inputDesc, inArray, - filterDesc, filterArray, - convDesc, - biasDesc, biasArray, - tmpBytes, tmp, - outputDesc, outArray, - activationDesc); - break; - default: - return NOT_SUPPORTED; - } - return ret; -} -#endif diff --git a/tensor_computing/src/cpu/arm/fp16/convolution_gemm_icnchw_A55.cpp b/tensor_computing/src/cpu/arm/fp16/convolution_gemm_icnchw_A55.cpp deleted file mode 100644 index 231ad0fb..00000000 --- a/tensor_computing/src/cpu/arm/fp16/convolution_gemm_icnchw_A55.cpp +++ /dev/null @@ -1,1045 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
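// ----------------------------------------------------------------------------
// [Editorial note, not part of the deleted sources] The A55 kernel that follows
// first repacks the padded NCHW input via im2col so that one vector load feeds
// eight output positions at once. Below is a minimal scalar sketch of that
// packing, assuming an AArch64 toolchain with __fp16 support; the function name
// pack_icnchw_hw8 is illustrative, and U32/F16 mirror the project's type.h.
// The unrolled in_0..in_7 pointers in the real code compute exactly these
// addresses.
#include <cstdint>

typedef uint32_t U32;
typedef __fp16 F16;

// Pack one tile of 8 output positions starting at flattened index hw:
// NCHW -> NHWChw8, i.e. in_pack[((fh_idx*fw + fw_idx)*ic + c)*8 + i] holds the
// input pixel that output position (hw + i) multiplies with filter tap
// (c, fh_idx, fw_idx). ihiw = ih_pad * iw_pad, as in the kernel.
static void pack_icnchw_hw8(const F16 *inArray_pad, F16 *in_pack,
    U32 ic, U32 ihiw, U32 iw_pad, U32 fh, U32 fw, U32 ow, U32 hw,
    U32 strideH, U32 strideW, U32 dilateH, U32 dilateW)
{
    for (U32 c = 0; c < ic; c++) {
        for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) {
            for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) {
                const F16 *in_hw = inArray_pad + c*ihiw + fh_idx*dilateH*iw_pad + dilateW*fw_idx;
                F16 *in_pack_hw8 = in_pack + fh_idx*fw*ic*8 + fw_idx*ic*8 + c*8;
                for (U32 i = 0; i < 8; i++) {
                    U32 in_h = ((hw + i) / ow) * strideH;  // row of output hw+i
                    U32 in_w = ((hw + i) % ow) * strideW;  // col of output hw+i
                    in_pack_hw8[i] = in_hw[in_h*iw_pad + in_w];
                }
            }
        }
    }
}
// ----------------------------------------------------------------------------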
- - -#include "cpu/arm/fp16/convolution_gemm_icnchw.h" - -EE convolution_gemm_icnchw_A55(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (fdf != DF_NHWCN16) - CHECK_STATUS(NOT_MATCH); - - oc /= 8; - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - I32 ohow = oh*ow; - U32 ihiw = ih_pad*iw_pad; - F16 *inArray_pad; - EE ret = SUCCESS; - for (U32 n = 0; n < in; n++) { - if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { - inArray_pad = inArray + n*ic*ih*iw; - } else { - // copy input into a input with padding - inArray_pad = (F16*)tmp; - F16 *inArray_pad_mov = inArray_pad; - F16 *inArray_mov = inArray + n*ic*ih*iw; - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*bytesOf(idt)); - inArray_pad_mov += iw_pad; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*bytesOf(idt)); - inArray_pad_mov += paddingL; - memcpy(inArray_pad_mov, inArray_mov, iw*bytesOf(idt)); - inArray_pad_mov += iw; - inArray_mov += iw; - memset(inArray_pad_mov, 0, paddingR*bytesOf(idt)); - inArray_pad_mov += paddingR; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*bytesOf(idt)); - inArray_pad_mov += iw_pad; - } - } - } - - // ohow / 8 - for (I32 hw = 0; hw < ohow-7; hw+=8) { - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - const F16 *f_o0c0 = filterArray; - F16 *in_pack = ((F16*)tmp) + ic*ih_pad*iw_pad; - // pack input - // NCHW => NHWChw8 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - U32 in_h_1 = ((hw+1)/ow)*strideH; - U32 in_w_1 = ((hw+1)%ow)*strideW; - U32 in_h_2 = ((hw+2)/ow)*strideH; - U32 in_w_2 = ((hw+2)%ow)*strideW; - U32 in_h_3 = ((hw+3)/ow)*strideH; - U32 in_w_3 = ((hw+3)%ow)*strideW; - U32 in_h_4 = ((hw+4)/ow)*strideH; - U32 in_w_4 = ((hw+4)%ow)*strideW; - U32 in_h_5 = ((hw+5)/ow)*strideH; - U32 in_w_5 = ((hw+5)%ow)*strideW; - U32 in_h_6 = ((hw+6)/ow)*strideH; - U32 in_w_6 = ((hw+6)%ow)*strideW; - U32 in_h_7 = ((hw+7)/ow)*strideH; - U32 in_w_7 = ((hw+7)%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F16 *in_hw = inArray_pad + c*ihiw + fh_idx*dilateH*iw_pad+ dilateW*fw_idx; - F16 *in_0 = in_hw + in_h_0*iw_pad + in_w_0; - F16 *in_1 = in_hw + in_h_1*iw_pad + in_w_1; - F16 *in_2 = in_hw + in_h_2*iw_pad + in_w_2; - F16 *in_3 = in_hw + in_h_3*iw_pad + in_w_3; - F16 *in_4 = in_hw + in_h_4*iw_pad + in_w_4; - F16 *in_5 = in_hw + in_h_5*iw_pad + in_w_5; - F16 *in_6 = in_hw + in_h_6*iw_pad + 
in_w_6; - F16 *in_7 = in_hw + in_h_7*iw_pad + in_w_7; - F16 *in_pack_hw8 = in_pack + fh_idx*fw*ic*8 + fw_idx*ic*8 + c*8; - *in_pack_hw8 = *in_0; - *(in_pack_hw8+1) = *in_1; - *(in_pack_hw8+2) = *in_2; - *(in_pack_hw8+3) = *in_3; - *(in_pack_hw8+4) = *in_4; - *(in_pack_hw8+5) = *in_5; - *(in_pack_hw8+6) = *in_6; - *(in_pack_hw8+7) = *in_7; - } - } - } - - // compute - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "ldr d23, [%[b_1]]\n" //b_o1 - "ldr x2, [%[b_1], #8]\n" - "ins v23.d[1], x2\n" - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr x1, [%[in_0], #8]\n" - "mov v4.16b, v22.16b\n" //out_o0hw2 - "ins v0.d[1], x1\n" - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v6.16b, v22.16b\n" //out_o0hw4 - "ldr x2, [%[f_0], #8]\n" - "mov v7.16b, v22.16b\n" //out_o0hw5 - "ins v18.d[1], x2\n" - "mov v8.16b, v22.16b\n" //out_o0hw6 - "ldr d19, [%[f_0], #16]\n" //f_o1c0 - "mov v9.16b, v22.16b\n" //out_o0hw7 - "ldr x3, [%[f_0], #24]\n" - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ins v19.d[1], x3\n" - "mov v11.16b, v23.16b\n" //out_o1hw1 - "mov v12.16b, v23.16b\n" //out_o1hw2 - "mov v13.16b, v23.16b\n" //out_o1hw3 - "mov v14.16b, v23.16b\n" //out_o1hw4 - "mov v15.16b, v23.16b\n" //out_o1hw5 - "mov v16.16b, v23.16b\n" //out_o1hw6 - "mov v17.16b, v23.16b\n" //out_o1hw7 - - "0:\n" - "cmp x0, #1\n" - "ble 1f\n" - "ldr d1, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr x1, [%[in_0], #24]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "ins v1.d[1], x1\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ldr d20, [%[f_0], #32]\n" //f_o0c0 - "fmla v5.8h, v18.8h, v0.h[3]\n" - "ldr x2, [%[f_0], #40]\n" - "fmla v6.8h, v18.8h, v0.h[4]\n" - "ins v20.d[1], x2\n" - "fmla v7.8h, v18.8h, v0.h[5]\n" - "ldr d21, [%[f_0], #48]\n" //f_o1c0 - "fmla v8.8h, v18.8h, v0.h[6]\n" - "ldr x3, [%[f_0], #56]\n" - "fmla v9.8h, v18.8h, v0.h[7]\n" - "ins v21.d[1], x3\n" - "fmla v10.8h, v19.8h, v0.h[0]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - "fmla v14.8h, v19.8h, v0.h[4]\n" - "fmla v15.8h, v19.8h, v0.h[5]\n" - "fmla v16.8h, v19.8h, v0.h[6]\n" - "fmla v17.8h, v19.8h, v0.h[7]\n" - - "ldr d0, [%[in_0], #32]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr x1, [%[in_0], #40]\n" - "fmla v3.8h, v20.8h, v1.h[1]\n" - "ins v0.d[1], x1\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr d18, [%[f_0], #64]\n" //f_o0c0 - "fmla v5.8h, v20.8h, v1.h[3]\n" - "ldr x2, [%[f_0], #72]\n" - "fmla v6.8h, v20.8h, v1.h[4]\n" - "ins v18.d[1], x2\n" - "fmla v7.8h, v20.8h, v1.h[5]\n" - "ldr d19, [%[f_0], #80]\n" //f_o1c0 - "fmla v8.8h, v20.8h, v1.h[6]\n" - "ldr x3, [%[f_0], #88]\n" - "fmla v9.8h, v20.8h, v1.h[7]\n" - "ins v19.d[1], x3\n" - "fmla v10.8h, v21.8h, v1.h[0]\n" - "add %[in_0], %[in_0], #32\n" - "fmla v11.8h, v21.8h, v1.h[1]\n" - "add %[f_0], %[f_0], #64\n" - "fmla v12.8h, v21.8h, v1.h[2]\n" - "sub x0, x0, #2\n" - "fmla v13.8h, v21.8h, v1.h[3]\n" - "fmla v14.8h, v21.8h, v1.h[4]\n" - "fmla v15.8h, v21.8h, v1.h[5]\n" - "fmla v16.8h, v21.8h, v1.h[6]\n" - "fmla v17.8h, v21.8h, v1.h[7]\n" - "b 0b\n" - - "1:\n" - "blt 2f\n" - "fmla v2.8h, v18.8h, v0.h[0]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "fmla 
v4.8h, v18.8h, v0.h[2]\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "fmla v6.8h, v18.8h, v0.h[4]\n" - "fmla v7.8h, v18.8h, v0.h[5]\n" - "fmla v8.8h, v18.8h, v0.h[6]\n" - "fmla v9.8h, v18.8h, v0.h[7]\n" - "add %[f_0], %[f_0], #32\n" - "fmla v10.8h, v19.8h, v0.h[0]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - "fmla v14.8h, v19.8h, v0.h[4]\n" - "fmla v15.8h, v19.8h, v0.h[5]\n" - "fmla v16.8h, v19.8h, v0.h[6]\n" - "fmla v17.8h, v19.8h, v0.h[7]\n" - "2:\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "x0", "x1", "x2", "x3" - ); - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - "fmax v3.8h, v3.8h, v1.8h\n" - "fmax v4.8h, v4.8h, v1.8h\n" - "fmax v5.8h, v5.8h, v1.8h\n" - "fmax v6.8h, v6.8h, v1.8h\n" - "fmax v7.8h, v7.8h, v1.8h\n" - "fmax v8.8h, v8.8h, v1.8h\n" - "fmax v9.8h, v9.8h, v1.8h\n" - "fmax v10.8h, v10.8h, v1.8h\n" - "fmax v11.8h, v11.8h, v1.8h\n" - "fmax v12.8h, v12.8h, v1.8h\n" - "fmax v13.8h, v13.8h, v1.8h\n" - "fmax v14.8h, v14.8h, v1.8h\n" - "fmax v15.8h, v15.8h, v1.8h\n" - "fmax v16.8h, v16.8h, v1.8h\n" - "fmax v17.8h, v17.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - "fmax v6.8h, v6.8h, v31.8h\n" - "fmax v7.8h, v7.8h, v31.8h\n" - "fmax v8.8h, v8.8h, v31.8h\n" - "fmax v9.8h, v9.8h, v31.8h\n" - "fmax v10.8h, v10.8h, v31.8h\n" - "fmax v11.8h, v11.8h, v31.8h\n" - "fmax v12.8h, v12.8h, v31.8h\n" - "fmax v13.8h, v13.8h, v31.8h\n" - "fmax v14.8h, v14.8h, v31.8h\n" - "fmax v15.8h, v15.8h, v31.8h\n" - "fmax v16.8h, v16.8h, v31.8h\n" - "fmax v17.8h, v17.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - "fmin v6.8h, v6.8h, v30.8h\n" - "fmin v7.8h, v7.8h, v30.8h\n" - "fmin v8.8h, v8.8h, v30.8h\n" - "fmin v9.8h, v9.8h, v30.8h\n" - "fmin v10.8h, v10.8h, v30.8h\n" - "fmin v11.8h, v11.8h, v30.8h\n" - "fmin v12.8h, v12.8h, v30.8h\n" - "fmin v13.8h, v13.8h, v30.8h\n" - "fmin v14.8h, v14.8h, v30.8h\n" - "fmin v15.8h, v15.8h, v30.8h\n" - "fmin v16.8h, v16.8h, v30.8h\n" - "fmin v17.8h, v17.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - "str q6, [%[out_0], #64]\n" //out_o0hw4 - "str q7, [%[out_0], #80]\n" //out_o0hw5 - "str q8, [%[out_0], #96]\n" //out_o0hw6 - "str q9, [%[out_0], #112]\n" //out_o0hw7 - "str q10, [%[out_1]]\n" //out_o1hw0 - "str q11, [%[out_1], #16]\n" //out_o1hw1 - "str q12, [%[out_1], #32]\n" //out_o1hw2 - 
"str q13, [%[out_1], #48]\n" //out_o1hw3 - "str q14, [%[out_1], #64]\n" //out_o1hw4 - "str q15, [%[out_1], #80]\n" //out_o1hw5 - "str q16, [%[out_1], #96]\n" //out_o1hw6 - "str q17, [%[out_1], #112]\n" //out_o1hw7 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0) - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + (oc-1)*8*fh*fw*ic; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + (oc-1)*8; - __asm__ __volatile__( - "ldr q12, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" // ic_blk - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v12.16b\n" //out_o0hw0 - "ldr x1, [%[in_0], #8]\n" - "mov v3.16b, v12.16b\n" //out_o0hw1 - "ins v0.d[1], x1\n" - "mov v4.16b, v12.16b\n" //out_o0hw2 - "ldr d10, [%[f_0]]\n" //f_o0c0 - "mov v5.16b, v12.16b\n" //out_o0hw3 - "ldr x2, [%[f_0], #8]\n" - "mov v6.16b, v12.16b\n" //out_o0hw4 - "ins v10.d[1], x2\n" - "mov v7.16b, v12.16b\n" //out_o0hw5 - "mov v8.16b, v12.16b\n" //out_o0hw6 - "mov v9.16b, v12.16b\n" //out_o0hw7 - - "0:\n" - "cmp x0, #1\n" - "ble 1f\n" - "ldr d1, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v10.8h, v0.h[0]\n" - "ldr x1, [%[in_0], #24]\n" - "fmla v3.8h, v10.8h, v0.h[1]\n" - "ins v1.d[1], x1\n" - "fmla v4.8h, v10.8h, v0.h[2]\n" - "ldr d11, [%[f_0], #16]\n" //f_o0c0 - "fmla v5.8h, v10.8h, v0.h[3]\n" - "ldr x2, [%[f_0], #24]\n" - "fmla v6.8h, v10.8h, v0.h[4]\n" - "ins v11.d[1], x2\n" - "fmla v7.8h, v10.8h, v0.h[5]\n" - "sub x0, x0, #2\n" - "fmla v8.8h, v10.8h, v0.h[6]\n" - "fmla v9.8h, v10.8h, v0.h[7]\n" - - "ldr d0, [%[in_0], #32]\n" //in_hw0 - "fmla v2.8h, v11.8h, v1.h[0]\n" - "ldr x1, [%[in_0], #40]\n" - "fmla v3.8h, v11.8h, v1.h[1]\n" - "ins v0.d[1], x1\n" - "fmla v4.8h, v11.8h, v1.h[2]\n" - "ldr d10, [%[f_0], #32]\n" //f_o0c0 - "fmla v5.8h, v11.8h, v1.h[3]\n" - "ldr x2, [%[f_0], #40]\n" - "fmla v6.8h, v11.8h, v1.h[4]\n" - "ins v10.d[1], x2\n" - "fmla v7.8h, v11.8h, v1.h[5]\n" - "add %[in_0], %[in_0], #32\n" - "fmla v8.8h, v11.8h, v1.h[6]\n" - "add %[f_0], %[f_0], #32\n" - "fmla v9.8h, v11.8h, v1.h[7]\n" - "b 0b\n" - - "1:\n" - "blt 2f\n" - "fmla v2.8h, v10.8h, v0.h[0]\n" - "fmla v3.8h, v10.8h, v0.h[1]\n" - "fmla v4.8h, v10.8h, v0.h[2]\n" - "fmla v5.8h, v10.8h, v0.h[3]\n" - "add %[f_0], %[f_0], #16\n" - "fmla v6.8h, v10.8h, v0.h[4]\n" - "fmla v7.8h, v10.8h, v0.h[5]\n" - "fmla v8.8h, v10.8h, v0.h[6]\n" - "fmla v9.8h, v10.8h, v0.h[7]\n" - "2:\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "x0", "x1", "x2" - ); - switch (activationDesc.mode) { - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - "fmax v3.8h, v3.8h, v1.8h\n" - "fmax v4.8h, v4.8h, v1.8h\n" - "fmax v5.8h, v5.8h, v1.8h\n" - "fmax v6.8h, v6.8h, v1.8h\n" - "fmax v7.8h, v7.8h, v1.8h\n" - "fmax v8.8h, v8.8h, v1.8h\n" - "fmax v9.8h, v9.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, 
v31.8h\n" - "fmax v6.8h, v6.8h, v31.8h\n" - "fmax v7.8h, v7.8h, v31.8h\n" - "fmax v8.8h, v8.8h, v31.8h\n" - "fmax v9.8h, v9.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - "fmin v6.8h, v6.8h, v30.8h\n" - "fmin v7.8h, v7.8h, v30.8h\n" - "fmin v8.8h, v8.8h, v30.8h\n" - "fmin v9.8h, v9.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw0 - "str q4, [%[out_0], #32]\n" //out_o0hw0 - "str q5, [%[out_0], #48]\n" //out_o0hw0 - "str q6, [%[out_0], #64]\n" //out_o0hw0 - "str q7, [%[out_0], #80]\n" //out_o0hw0 - "str q8, [%[out_0], #96]\n" //out_o0hw0 - "str q9, [%[out_0], #112]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9" - ); - } - } - // ohow_reminder % 8 / 4 - U32 ohow_s = (ohow / 8) * 8; - //U32 ohow_s = (ohow/8)*8; - for (I32 hw = ohow_s; hw < ohow-3; hw+=4) { - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - const F16 *f_o0c0 = filterArray; - F16 *in_pack = ((F16*)tmp) + ic*ih_pad*iw_pad; - - // pack input - // NCHWc8 => NHWChw4 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - U32 in_h_1 = ((hw+1)/ow)*strideH; - U32 in_w_1 = ((hw+1)%ow)*strideW; - U32 in_h_2 = ((hw+2)/ow)*strideH; - U32 in_w_2 = ((hw+2)%ow)*strideW; - U32 in_h_3 = ((hw+3)/ow)*strideH; - U32 in_w_3 = ((hw+3)%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F16 *in_hw = inArray_pad + c*ihiw + fh_idx*dilateH*iw_pad + dilateW*fw_idx; - F16 *in_0 = in_hw + in_h_0*iw_pad + in_w_0; - F16 *in_1 = in_hw + in_h_1*iw_pad + in_w_1; - F16 *in_2 = in_hw + in_h_2*iw_pad + in_w_2; - F16 *in_3 = in_hw + in_h_3*iw_pad + in_w_3; - F16 *in_pack_hw4 = in_pack + fh_idx*fw*ic*4 + fw_idx*ic*4 + c*4; - *in_pack_hw4 = *in_0; - *(in_pack_hw4+1) = *in_1; - *(in_pack_hw4+2) = *in_2; - *(in_pack_hw4+3) = *in_3; - } - } - } - - // compute - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "ldr d23, [%[b_1]]\n" //b_o1 - "ldr x2, [%[b_1], #8]\n" - "ins v23.d[1], x2\n" - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "ldr x2, [%[f_0], #8]\n" - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ins v18.d[1], x2\n" - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ldr d19, [%[f_0], #16]\n" //f_o1c0 - "mov v11.16b, v23.16b\n" //out_o1hw1 - "ldr x3, [%[f_0], #24]\n" - "mov v12.16b, v23.16b\n" //out_o1hw2 - "ins v19.d[1], x3\n" - "mov v13.16b, v23.16b\n" //out_o1hw3 - - "0:\n" - "cmp x0, #1\n" - "ble 1f\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], #32]\n" //f_o0c0 - "fmla v3.8h, v18.8h, v0.h[1]\n" - "ldr x2, [%[f_0], #40]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ins v20.d[1], x2\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "ldr d21, [%[f_0], #48]\n" //f_o1c0 - "fmla v10.8h, v19.8h, v0.h[0]\n" - 
"ldr x3, [%[f_0], #56]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "ins v21.d[1], x3\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "sub x0, x0, #2\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - - "ldr d0, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f_0], #64]\n" //f_o0c0 - "fmla v3.8h, v20.8h, v1.h[1]\n" - "ldr x2, [%[f_0], #72]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr d19, [%[f_0], #80]\n" //f_o1c0 - "fmla v5.8h, v20.8h, v1.h[3]\n" - "ins v18.d[1], x2\n" - "fmla v10.8h, v21.8h, v1.h[0]\n" - "ldr x3, [%[f_0], #88]\n" - "fmla v11.8h, v21.8h, v1.h[1]\n" - "ins v19.d[1], x3\n" - "fmla v12.8h, v21.8h, v1.h[2]\n" - "add %[in_0], %[in_0], #16\n" - "fmla v13.8h, v21.8h, v1.h[3]\n" - "add %[f_0], %[f_0], #64\n" - "b 0b\n" - - "1:\n" - "blt 2f\n" - "fmla v2.8h, v18.8h, v0.h[0]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "add %[f_0], %[f_0], #32\n" - "fmla v10.8h, v19.8h, v0.h[0]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - "2:\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0", "x1", "x2", "x3" - ); - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - "fmax v3.8h, v3.8h, v1.8h\n" - "fmax v4.8h, v4.8h, v1.8h\n" - "fmax v5.8h, v5.8h, v1.8h\n" - "fmax v10.8h, v10.8h, v1.8h\n" - "fmax v11.8h, v11.8h, v1.8h\n" - "fmax v12.8h, v12.8h, v1.8h\n" - "fmax v13.8h, v13.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - "fmax v10.8h, v10.8h, v31.8h\n" - "fmax v11.8h, v11.8h, v31.8h\n" - "fmax v12.8h, v12.8h, v31.8h\n" - "fmax v13.8h, v13.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - "fmin v10.8h, v10.8h, v30.8h\n" - "fmin v11.8h, v11.8h, v30.8h\n" - "fmin v12.8h, v12.8h, v30.8h\n" - "fmin v13.8h, v13.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - "str q10, [%[out_1]]\n" //out_o1hw0 - "str q11, [%[out_1], #16]\n" //out_o1hw1 - "str q12, [%[out_1], #32]\n" //out_o1hw2 - "str q13, [%[out_1], #48]\n" //out_o1hw3 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0) - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v10", - "v11", "v12", "v13" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + (oc-1)*8*fh*fw*ic; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + (oc-1)*8; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "mov x0, 
%[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "ldr x2, [%[f_0], #8]\n" - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ins v18.d[1], x2\n" - - "0:\n" - "cmp x0, #1\n" - "ble 1f\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], #16]\n" //f_o0c0 - "fmla v3.8h, v18.8h, v0.h[1]\n" - "ldr x2, [%[f_0], #24]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ins v20.d[1], x2\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "sub x0, x0, #2\n" - - "ldr d0, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f_0], #32]\n" //f_o0c0 - "fmla v3.8h, v20.8h, v1.h[1]\n" - "ldr x2, [%[f_0], #40]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ins v18.d[1], x2\n" - "fmla v5.8h, v20.8h, v1.h[3]\n" - "add %[in_0], %[in_0], #16\n" - "add %[f_0], %[f_0], #32\n" - "b 0b\n" - - "1:\n" - "blt 2f\n" - "fmla v2.8h, v18.8h, v0.h[0]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "add %[f_0], %[f_0], #16\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "2:\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v18", "v20", "v22", "x0", "x1", "x2" - ); - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - "fmax v3.8h, v3.8h, v1.8h\n" - "fmax v4.8h, v4.8h, v1.8h\n" - "fmax v5.8h, v5.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v3", "v4", "v5" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v2", "v3", "v4", "v5" - ); - } - } - // ohow_reminder % 4 - ohow_s = (ohow / 4) * 4; - for (I32 hw = ohow_s; hw < ohow; hw++) { - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - const F16 *f_o0c0 = filterArray; - F16 *in_pack = ((F16*)tmp) + ic*ih_pad*iw_pad; - // pack input - // NCHWc8 => NHWChw4 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F16 *in_hw = inArray_pad + c*ihiw + fh_idx*dilateH*iw_pad + dilateW*fw_idx; - F16 *in_0 = in_hw + in_h_0*iw_pad + in_w_0; - F16 *in_pack_hw1 = in_pack + fh_idx*fw*ic + fw_idx*ic + c; - /* - * for (U32 c8 = 0; c8 < 8; c8++) { - * in_pack_c8hw1[c8] = in_0[c8]; - * } - */ - *in_pack_hw1 = *in_0; - } - } - } - - // compute - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr 
d22, [%[b_0]]\n" //b_o0 - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "ldr d23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "ldr x2, [%[b_1], #8]\n" - "ins v23.d[1], x2\n" - "ldr h0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ldr x2, [%[f_0], #8]\n" - "ins v18.d[1], x2\n" - "ldr d19, [%[f_0], #16]\n" //f_o1c0 - "ldr x3, [%[f_0], #24]\n" - "ins v19.d[1], x3\n" - - "0:\n" - "cmp x0, #1\n" - "ble 1f\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], #32]\n" //f_o0c0 - "fmla v10.8h, v19.8h, v0.h[0]\n" - "ldr x2, [%[f_0], #40]\n" - "ins v20.d[1], x2\n" - "ldr d21, [%[f_0], #48]\n" //f_o1c0 - "sub x0, x0, #2\n" - "ldr x3, [%[f_0], #56]\n" - "ins v21.d[1], x3\n" - - "ldr h0, [%[in_0], #4]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f_0], #64]\n" //f_o0c0 - "fmla v10.8h, v21.8h, v1.h[0]\n" - "ldr x2, [%[f_0], #72]\n" - "ins v18.d[1], x2\n" - "ldr d19, [%[f_0], #80]\n" //f_o1c0 - "add %[in_0], %[in_0], #4\n" - "ldr x3, [%[f_0], #88]\n" - "ins v19.d[1], x3\n" - "add %[f_0], %[f_0], #64\n" - "b 0b\n" - - "1:\n" - "blt 2f\n" - "fmla v2.8h, v18.8h, v0.h[0]\n" - "add %[f_0], %[f_0], #32\n" - "fmla v10.8h, v19.8h, v0.h[0]\n" - "2:\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", "v23", "x0", "x1", "x2", "x3" - ); - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - "fmax v10.8h, v10.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v10" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v10.8h, v10.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v10.8h, v10.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v10", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q10, [%[out_1]]\n" //out_o1hw0 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0) - : - :"memory", "cc", "v2", "v10" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + (oc-1)*8*fh*fw*ic; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + (oc-1)*8; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" //ic_blk - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "ldr h0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "ldr x2, [%[f_0], #8]\n" - "ins v18.d[1], x2\n" - - "0:\n" - "cmp x0, #1\n" - "ble 1f\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], #16]\n" //f_o0c0 - "sub x0, x0, #2\n" - "ldr x2, [%[f_0], #24]\n" - "ins v20.d[1], x2\n" - - "ldr h0, [%[in_0], #4]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f_0], #32]\n" //f_o0c0 - "ldr x2, [%[f_0], #40]\n" - "ins v18.d[1], x2\n" - "add %[in_0], %[in_0], #4\n" - "add %[f_0], %[f_0], #32\n" - "b 0b\n" - - "1:\n" - "blt 2f\n" - "fmla v2.8h, v18.8h, v0.h[0]\n" - "add %[f_0], %[f_0], #16\n" - "2:\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - 
:[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0", "x1", "x2" - ); - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - : - : - :"memory", "cc", "v1", "v2" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmin v2.8h, v2.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v2" - ); - } - } - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp16/convolution_gemm_icnchw_A76.cpp b/tensor_computing/src/cpu/arm/fp16/convolution_gemm_icnchw_A76.cpp deleted file mode 100644 index 3b6490b5..00000000 --- a/tensor_computing/src/cpu/arm/fp16/convolution_gemm_icnchw_A76.cpp +++ /dev/null @@ -1,963 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include "cpu/arm/fp16/convolution_gemm_icnchw.h" - -EE convolution_gemm_icnchw_A76(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (fdf != DF_NHWCN16) - CHECK_STATUS(NOT_MATCH); - - oc /= 8; - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - I32 ohow = oh*ow; - U32 ihiw = ih_pad*iw_pad; - F16 *inArray_pad; - EE ret = SUCCESS; - for (U32 n = 0; n < in; n++) { - if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { - inArray_pad = inArray + n*ic*ih*iw; - } else { - // copy input into a input with padding - inArray_pad = (F16*)tmp; - F16 *inArray_pad_mov = inArray_pad; - F16 *inArray_mov = inArray + n*ic*ih*iw; - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*bytesOf(idt)); - inArray_pad_mov += iw_pad; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*bytesOf(idt)); - inArray_pad_mov += paddingL; - memcpy(inArray_pad_mov, inArray_mov, iw*bytesOf(idt)); - inArray_pad_mov += iw; - inArray_mov += iw; - memset(inArray_pad_mov, 0, paddingR*bytesOf(idt)); - inArray_pad_mov += paddingR; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*bytesOf(idt)); - inArray_pad_mov += iw_pad; - } - } - } - - // ohow / 8 - for (I32 hw = 0; hw < ohow-7; hw+=8) { - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - const F16 *f_o0c0 = filterArray; - F16 *in_pack = ((F16*)tmp) + ic*ih_pad*iw_pad; - // pack input - // NCHW => NHWChw8 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - U32 in_h_1 = ((hw+1)/ow)*strideH; - U32 in_w_1 = ((hw+1)%ow)*strideW; - U32 in_h_2 = ((hw+2)/ow)*strideH; - U32 in_w_2 = ((hw+2)%ow)*strideW; - U32 in_h_3 = ((hw+3)/ow)*strideH; - U32 in_w_3 = ((hw+3)%ow)*strideW; - U32 in_h_4 = ((hw+4)/ow)*strideH; - U32 in_w_4 = ((hw+4)%ow)*strideW; - U32 in_h_5 = ((hw+5)/ow)*strideH; - U32 in_w_5 = ((hw+5)%ow)*strideW; - U32 in_h_6 = ((hw+6)/ow)*strideH; - U32 in_w_6 = ((hw+6)%ow)*strideW; - U32 in_h_7 = ((hw+7)/ow)*strideH; - U32 in_w_7 = ((hw+7)%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F16 *in_hw = inArray_pad + c*ihiw + fh_idx*dilateH*iw_pad + dilateW*fw_idx; - F16 *in_0 = in_hw + in_h_0*iw_pad + in_w_0; - F16 *in_1 = in_hw + in_h_1*iw_pad + in_w_1; - F16 *in_2 = in_hw + in_h_2*iw_pad + in_w_2; - F16 *in_3 = in_hw + in_h_3*iw_pad + in_w_3; - F16 *in_4 = in_hw + in_h_4*iw_pad + in_w_4; - F16 *in_5 = in_hw + in_h_5*iw_pad + in_w_5; - F16 *in_6 = in_hw + in_h_6*iw_pad + 
in_w_6; - F16 *in_7 = in_hw + in_h_7*iw_pad + in_w_7; - F16 *in_pack_hw8 = in_pack + fh_idx*fw*ic*8 + fw_idx*ic*8 + c*8; - *in_pack_hw8 = *in_0; - *(in_pack_hw8+1) = *in_1; - *(in_pack_hw8+2) = *in_2; - *(in_pack_hw8+3) = *in_3; - *(in_pack_hw8+4) = *in_4; - *(in_pack_hw8+5) = *in_5; - *(in_pack_hw8+6) = *in_6; - *(in_pack_hw8+7) = *in_7; - } - } - } - - // compute - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "ldr q23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr q0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov v6.16b, v22.16b\n" //out_o0hw4 - "mov v7.16b, v22.16b\n" //out_o0hw5 - "mov v8.16b, v22.16b\n" //out_o0hw6 - "ldr q19, [%[f_0], #16]\n" //f_o1c0 - "mov v9.16b, v22.16b\n" //out_o0hw7 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "mov v11.16b, v23.16b\n" //out_o1hw1 - "mov v12.16b, v23.16b\n" //out_o1hw2 - "mov v13.16b, v23.16b\n" //out_o1hw3 - "mov v14.16b, v23.16b\n" //out_o1hw4 - "mov v15.16b, v23.16b\n" //out_o1hw5 - "mov v16.16b, v23.16b\n" //out_o1hw6 - "mov v17.16b, v23.16b\n" //out_o1hw7 - - "0:\n" - "cmp x0, #1\n" - "ble 1f\n" - "ldr q1, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ldr q20, [%[f_0], #32]\n" //f_o0c0 - "fmla v5.8h, v18.8h, v0.h[3]\n" - "fmla v6.8h, v18.8h, v0.h[4]\n" - "fmla v7.8h, v18.8h, v0.h[5]\n" - "ldr q21, [%[f_0], #48]\n" //f_o1c0 - "fmla v8.8h, v18.8h, v0.h[6]\n" - "fmla v9.8h, v18.8h, v0.h[7]\n" - "fmla v10.8h, v19.8h, v0.h[0]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - "fmla v14.8h, v19.8h, v0.h[4]\n" - "fmla v15.8h, v19.8h, v0.h[5]\n" - "fmla v16.8h, v19.8h, v0.h[6]\n" - "fmla v17.8h, v19.8h, v0.h[7]\n" - - "ldr q0, [%[in_0], #32]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "fmla v3.8h, v20.8h, v1.h[1]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr q18, [%[f_0], #64]\n" //f_o0c0 - "fmla v5.8h, v20.8h, v1.h[3]\n" - "fmla v6.8h, v20.8h, v1.h[4]\n" - "fmla v7.8h, v20.8h, v1.h[5]\n" - "ldr q19, [%[f_0], #80]\n" //f_o1c0 - "fmla v8.8h, v20.8h, v1.h[6]\n" - "fmla v9.8h, v20.8h, v1.h[7]\n" - "fmla v10.8h, v21.8h, v1.h[0]\n" - "add %[in_0], %[in_0], #32\n" - "fmla v11.8h, v21.8h, v1.h[1]\n" - "add %[f_0], %[f_0], #64\n" - "fmla v12.8h, v21.8h, v1.h[2]\n" - "sub x0, x0, #2\n" - "fmla v13.8h, v21.8h, v1.h[3]\n" - "fmla v14.8h, v21.8h, v1.h[4]\n" - "fmla v15.8h, v21.8h, v1.h[5]\n" - "fmla v16.8h, v21.8h, v1.h[6]\n" - "fmla v17.8h, v21.8h, v1.h[7]\n" - "b 0b\n" - - "1:\n" - "blt 2f\n" - "fmla v2.8h, v18.8h, v0.h[0]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "fmla v6.8h, v18.8h, v0.h[4]\n" - "fmla v7.8h, v18.8h, v0.h[5]\n" - "fmla v8.8h, v18.8h, v0.h[6]\n" - "fmla v9.8h, v18.8h, v0.h[7]\n" - "add %[f_0], %[f_0], #32\n" - "fmla v10.8h, v19.8h, v0.h[0]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - "fmla v14.8h, v19.8h, v0.h[4]\n" - "fmla v15.8h, v19.8h, v0.h[5]\n" - "fmla v16.8h, v19.8h, v0.h[6]\n" - "fmla v17.8h, v19.8h, v0.h[7]\n" - "2:\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) 
- :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "x0" - ); - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - "fmax v3.8h, v3.8h, v1.8h\n" - "fmax v4.8h, v4.8h, v1.8h\n" - "fmax v5.8h, v5.8h, v1.8h\n" - "fmax v6.8h, v6.8h, v1.8h\n" - "fmax v7.8h, v7.8h, v1.8h\n" - "fmax v8.8h, v8.8h, v1.8h\n" - "fmax v9.8h, v9.8h, v1.8h\n" - "fmax v10.8h, v10.8h, v1.8h\n" - "fmax v11.8h, v11.8h, v1.8h\n" - "fmax v12.8h, v12.8h, v1.8h\n" - "fmax v13.8h, v13.8h, v1.8h\n" - "fmax v14.8h, v14.8h, v1.8h\n" - "fmax v15.8h, v15.8h, v1.8h\n" - "fmax v16.8h, v16.8h, v1.8h\n" - "fmax v17.8h, v17.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - "fmax v6.8h, v6.8h, v31.8h\n" - "fmax v7.8h, v7.8h, v31.8h\n" - "fmax v8.8h, v8.8h, v31.8h\n" - "fmax v9.8h, v9.8h, v31.8h\n" - "fmax v10.8h, v10.8h, v31.8h\n" - "fmax v11.8h, v11.8h, v31.8h\n" - "fmax v12.8h, v12.8h, v31.8h\n" - "fmax v13.8h, v13.8h, v31.8h\n" - "fmax v14.8h, v14.8h, v31.8h\n" - "fmax v15.8h, v15.8h, v31.8h\n" - "fmax v16.8h, v16.8h, v31.8h\n" - "fmax v17.8h, v17.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - "fmin v6.8h, v6.8h, v30.8h\n" - "fmin v7.8h, v7.8h, v30.8h\n" - "fmin v8.8h, v8.8h, v30.8h\n" - "fmin v9.8h, v9.8h, v30.8h\n" - "fmin v10.8h, v10.8h, v30.8h\n" - "fmin v11.8h, v11.8h, v30.8h\n" - "fmin v12.8h, v12.8h, v30.8h\n" - "fmin v13.8h, v13.8h, v30.8h\n" - "fmin v14.8h, v14.8h, v30.8h\n" - "fmin v15.8h, v15.8h, v30.8h\n" - "fmin v16.8h, v16.8h, v30.8h\n" - "fmin v17.8h, v17.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - "str q6, [%[out_0], #64]\n" //out_o0hw4 - "str q7, [%[out_0], #80]\n" //out_o0hw5 - "str q8, [%[out_0], #96]\n" //out_o0hw6 - "str q9, [%[out_0], #112]\n" //out_o0hw7 - "str q10, [%[out_1]]\n" //out_o1hw0 - "str q11, [%[out_1], #16]\n" //out_o1hw1 - "str q12, [%[out_1], #32]\n" //out_o1hw2 - "str q13, [%[out_1], #48]\n" //out_o1hw3 - "str q14, [%[out_1], #64]\n" //out_o1hw4 - "str q15, [%[out_1], #80]\n" //out_o1hw5 - "str q16, [%[out_1], #96]\n" //out_o1hw6 - "str q17, [%[out_1], #112]\n" //out_o1hw7 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0) - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + (oc-1)*8*fh*fw*ic; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + 
(oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + (oc-1)*8; - __asm__ __volatile__( - "ldr q12, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" // ic_blk - "ldr q0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v12.16b\n" //out_o0hw0 - "mov v3.16b, v12.16b\n" //out_o0hw1 - "mov v4.16b, v12.16b\n" //out_o0hw2 - "ldr q10, [%[f_0]]\n" //f_o0c0 - "mov v5.16b, v12.16b\n" //out_o0hw3 - "mov v6.16b, v12.16b\n" //out_o0hw4 - "mov v7.16b, v12.16b\n" //out_o0hw5 - "mov v8.16b, v12.16b\n" //out_o0hw6 - "mov v9.16b, v12.16b\n" //out_o0hw7 - - "0:\n" - "cmp x0, #1\n" - "ble 1f\n" - "ldr q1, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v10.8h, v0.h[0]\n" - "fmla v3.8h, v10.8h, v0.h[1]\n" - "fmla v4.8h, v10.8h, v0.h[2]\n" - "ldr q11, [%[f_0], #16]\n" //f_o0c0 - "fmla v5.8h, v10.8h, v0.h[3]\n" - "fmla v6.8h, v10.8h, v0.h[4]\n" - "fmla v7.8h, v10.8h, v0.h[5]\n" - "sub x0, x0, #2\n" - "fmla v8.8h, v10.8h, v0.h[6]\n" - "fmla v9.8h, v10.8h, v0.h[7]\n" - - "ldr q0, [%[in_0], #32]\n" //in_hw0 - "fmla v2.8h, v11.8h, v1.h[0]\n" - "fmla v3.8h, v11.8h, v1.h[1]\n" - "fmla v4.8h, v11.8h, v1.h[2]\n" - "ldr q10, [%[f_0], #32]\n" //f_o0c0 - "fmla v5.8h, v11.8h, v1.h[3]\n" - "fmla v6.8h, v11.8h, v1.h[4]\n" - "fmla v7.8h, v11.8h, v1.h[5]\n" - "add %[in_0], %[in_0], #32\n" - "fmla v8.8h, v11.8h, v1.h[6]\n" - "add %[f_0], %[f_0], #32\n" - "fmla v9.8h, v11.8h, v1.h[7]\n" - "b 0b\n" - - "1:\n" - "blt 2f\n" - "fmla v2.8h, v10.8h, v0.h[0]\n" - "fmla v3.8h, v10.8h, v0.h[1]\n" - "fmla v4.8h, v10.8h, v0.h[2]\n" - "fmla v5.8h, v10.8h, v0.h[3]\n" - "add %[f_0], %[f_0], #16\n" - "fmla v6.8h, v10.8h, v0.h[4]\n" - "fmla v7.8h, v10.8h, v0.h[5]\n" - "fmla v8.8h, v10.8h, v0.h[6]\n" - "fmla v9.8h, v10.8h, v0.h[7]\n" - "2:\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "x0" - ); - switch (activationDesc.mode) { - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - "fmax v3.8h, v3.8h, v1.8h\n" - "fmax v4.8h, v4.8h, v1.8h\n" - "fmax v5.8h, v5.8h, v1.8h\n" - "fmax v6.8h, v6.8h, v1.8h\n" - "fmax v7.8h, v7.8h, v1.8h\n" - "fmax v8.8h, v8.8h, v1.8h\n" - "fmax v9.8h, v9.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - "fmax v6.8h, v6.8h, v31.8h\n" - "fmax v7.8h, v7.8h, v31.8h\n" - "fmax v8.8h, v8.8h, v31.8h\n" - "fmax v9.8h, v9.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - "fmin v6.8h, v6.8h, v30.8h\n" - "fmin v7.8h, v7.8h, v30.8h\n" - "fmin v8.8h, v8.8h, v30.8h\n" - "fmin v9.8h, v9.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw0 - "str q4, [%[out_0], #32]\n" //out_o0hw0 - "str q5, [%[out_0], #48]\n" //out_o0hw0 - "str q6, [%[out_0], #64]\n" //out_o0hw0 - "str q7, [%[out_0], #80]\n" //out_o0hw0 - "str q8, [%[out_0], #96]\n" //out_o0hw0 - "str q9, [%[out_0], 
#112]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9" - ); - } - } - // ohow_reminder % 8 / 4 - U32 ohow_s = (ohow / 8) * 8; - - for (I32 hw = ohow_s; hw < ohow-3; hw+=4) { - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - const F16 *f_o0c0 = filterArray; - F16 *in_pack = ((F16*)tmp) + ic*ih_pad*iw_pad; - - // pack input - // NCHWc8 => NHWChw4 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - U32 in_h_1 = ((hw+1)/ow)*strideH; - U32 in_w_1 = ((hw+1)%ow)*strideW; - U32 in_h_2 = ((hw+2)/ow)*strideH; - U32 in_w_2 = ((hw+2)%ow)*strideW; - U32 in_h_3 = ((hw+3)/ow)*strideH; - U32 in_w_3 = ((hw+3)%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F16 *in_hw = inArray_pad + c*ihiw + fh_idx*dilateH*iw_pad + dilateW*fw_idx; - F16 *in_0 = in_hw + in_h_0*iw_pad + in_w_0; - F16 *in_1 = in_hw + in_h_1*iw_pad + in_w_1; - F16 *in_2 = in_hw + in_h_2*iw_pad + in_w_2; - F16 *in_3 = in_hw + in_h_3*iw_pad + in_w_3; - F16 *in_pack_hw4 = in_pack + fh_idx*fw*ic*4 + fw_idx*ic*4 + c*4; - *in_pack_hw4 = *in_0; - *(in_pack_hw4+1) = *in_1; - *(in_pack_hw4+2) = *in_2; - *(in_pack_hw4+3) = *in_3; - } - } - } - - // compute - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "ldr q23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "mov v5.16b, v22.16b\n" //out_o0hw3 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ldr q19, [%[f_0], #16]\n" //f_o1c0 - "mov v11.16b, v23.16b\n" //out_o1hw1 - "mov v12.16b, v23.16b\n" //out_o1hw2 - "mov v13.16b, v23.16b\n" //out_o1hw3 - - "0:\n" - "cmp x0, #1\n" - "ble 1f\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f_0], #32]\n" //f_o0c0 - "fmla v3.8h, v18.8h, v0.h[1]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "ldr q21, [%[f_0], #48]\n" //f_o1c0 - "fmla v10.8h, v19.8h, v0.h[0]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "sub x0, x0, #2\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - - "ldr d0, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f_0], #64]\n" //f_o0c0 - "fmla v3.8h, v20.8h, v1.h[1]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr q19, [%[f_0], #80]\n" //f_o1c0 - "fmla v5.8h, v20.8h, v1.h[3]\n" - "fmla v10.8h, v21.8h, v1.h[0]\n" - "fmla v11.8h, v21.8h, v1.h[1]\n" - "fmla v12.8h, v21.8h, v1.h[2]\n" - "add %[in_0], %[in_0], #16\n" - "fmla v13.8h, v21.8h, v1.h[3]\n" - "add %[f_0], %[f_0], #64\n" - "b 0b\n" - - "1:\n" - "blt 2f\n" - "fmla v2.8h, v18.8h, v0.h[0]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "add %[f_0], %[f_0], #32\n" - "fmla v10.8h, v19.8h, v0.h[0]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - "2:\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0" - ); - 
switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - "fmax v3.8h, v3.8h, v1.8h\n" - "fmax v4.8h, v4.8h, v1.8h\n" - "fmax v5.8h, v5.8h, v1.8h\n" - "fmax v10.8h, v10.8h, v1.8h\n" - "fmax v11.8h, v11.8h, v1.8h\n" - "fmax v12.8h, v12.8h, v1.8h\n" - "fmax v13.8h, v13.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - "fmax v10.8h, v10.8h, v31.8h\n" - "fmax v11.8h, v11.8h, v31.8h\n" - "fmax v12.8h, v12.8h, v31.8h\n" - "fmax v13.8h, v13.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - "fmin v10.8h, v10.8h, v30.8h\n" - "fmin v11.8h, v11.8h, v30.8h\n" - "fmin v12.8h, v12.8h, v30.8h\n" - "fmin v13.8h, v13.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - "str q10, [%[out_1]]\n" //out_o1hw0 - "str q11, [%[out_1], #16]\n" //out_o1hw1 - "str q12, [%[out_1], #32]\n" //out_o1hw2 - "str q13, [%[out_1], #48]\n" //out_o1hw3 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0) - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v10", - "v11", "v12", "v13" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + (oc-1)*8*fh*fw*ic; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + (oc-1)*8; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "mov v5.16b, v22.16b\n" //out_o0hw3 - - "0:\n" - "cmp x0, #1\n" - "ble 1f\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f_0], #16]\n" //f_o0c0 - "fmla v3.8h, v18.8h, v0.h[1]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "sub x0, x0, #2\n" - - "ldr d0, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f_0], #32]\n" //f_o0c0 - "fmla v3.8h, v20.8h, v1.h[1]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "fmla v5.8h, v20.8h, v1.h[3]\n" - "add %[in_0], %[in_0], #16\n" - "add %[f_0], %[f_0], #32\n" - "b 0b\n" - - "1:\n" - "blt 2f\n" - "fmla v2.8h, v18.8h, v0.h[0]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "add %[f_0], %[f_0], #16\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "2:\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v18", "v20", "v22", "x0" - ); - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - "fmax v3.8h, v3.8h, v1.8h\n" - 
"fmax v4.8h, v4.8h, v1.8h\n" - "fmax v5.8h, v5.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v3", "v4", "v5" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v3", "v4", "v5", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v2", "v3", "v4", "v5" - ); - } - } - // ohow_reminder % 4 - ohow_s = (ohow / 4) * 4; - for (I32 hw = ohow_s; hw < ohow; hw++) { - const F16 *b0 = biasArray; - const F16 *b1 = biasArray + 8; - const F16 *f_o0c0 = filterArray; - F16 *in_pack = ((F16*)tmp) + ic*ih_pad*iw_pad; - // pack input - // NCHWc8 => NHWChw4 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F16 *in_hw = inArray_pad + c*ihiw + fh_idx*dilateH*iw_pad + dilateW*fw_idx; - F16 *in_0 = in_hw + in_h_0*iw_pad + in_w_0; - F16 *in_pack_hw1 = in_pack + fh_idx*fw*ic + fw_idx*ic + c; - /* - * for (U32 c8 = 0; c8 < 8; c8++) { - * in_pack_c8hw1[c8] = in_0[c8]; - * } - */ - *in_pack_hw1 = *in_0; - } - } - } - - // compute - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "ldr q23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "ldr h0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ldr q19, [%[f_0], #16]\n" //f_o1c0 - - "0:\n" - "cmp x0, #1\n" - "ble 1f\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f_0], #32]\n" //f_o0c0 - "fmla v10.8h, v19.8h, v0.h[0]\n" - "ldr q21, [%[f_0], #48]\n" //f_o1c0 - "sub x0, x0, #2\n" - - "ldr h0, [%[in_0], #4]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f_0], #64]\n" //f_o0c0 - "fmla v10.8h, v21.8h, v1.h[0]\n" - "ldr q19, [%[f_0], #80]\n" //f_o1c0 - "add %[in_0], %[in_0], #4\n" - "add %[f_0], %[f_0], #64\n" - "b 0b\n" - - "1:\n" - "blt 2f\n" - "fmla v2.8h, v18.8h, v0.h[0]\n" - "add %[f_0], %[f_0], #32\n" - "fmla v10.8h, v19.8h, v0.h[0]\n" - "2:\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", "v23", "x0" - ); - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - "fmax v10.8h, v10.8h, v1.8h\n" - : - : - :"memory", "cc", "v1", "v2", "v10" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - 
"fmax v10.8h, v10.8h, v31.8h\n" - - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v10.8h, v10.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v10", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q10, [%[out_1]]\n" //out_o1hw0 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0) - : - :"memory", "cc", "v2", "v10" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + (oc-1)*8*fh*fw*ic; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + (oc-1)*8; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" //ic_blk - "ldr h0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr q18, [%[f_0]]\n" //f_o0c0 - - "0:\n" - "cmp x0, #1\n" - "ble 1f\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f_0], #16]\n" //f_o0c0 - "sub x0, x0, #2\n" - - "ldr h0, [%[in_0], #4]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f_0], #32]\n" //f_o0c0 - "add %[in_0], %[in_0], #4\n" - "add %[f_0], %[f_0], #32\n" - "b 0b\n" - - "1:\n" - "blt 2f\n" - "fmla v2.8h, v18.8h, v0.h[0]\n" - "add %[f_0], %[f_0], #16\n" - "2:\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0" - ); - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v2.8h, v2.8h, v1.8h\n" //max(v2, 0) - : - : - :"memory", "cc", "v1", "v2" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v2.8h, v2.8h, v31.8h\n" - "fmin v2.8h, v2.8h, v30.8h\n" - : - : - :"memory", "cc", "v2", "v30", "v31" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q2, [%[out_0]]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v2" - ); - } - } - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp16/convolution_transform.cpp b/tensor_computing/src/cpu/arm/fp16/convolution_transform.cpp deleted file mode 100644 index 13a9dd6a..00000000 --- a/tensor_computing/src/cpu/arm/fp16/convolution_transform.cpp +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#include "cpu/arm/fp16/convolution_winograd_transform.h" - -inline EE convolution_transform_filter_kernel_fp16(TensorDesc filterDesc, const F16* filterArray, - TensorDesc *ftmDesc, F16* ftmArray, - DataFormat ftmDataFormat) -{ - if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) - CHECK_STATUS(NULL_POINTER); - DataType fdt; - DataFormat fdf; - U32 fn, fc, fh, fw; - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - if (fdf == ftmDataFormat) { - *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, fn*fc*fh*fw*bytesOf(fdt)); - return SUCCESS; - } - if (fdf != DF_NCHW) - CHECK_STATUS(NOT_SUPPORTED); - EE ret = SUCCESS; - switch (ftmDataFormat) { - case DF_NHWCN16: { - /* - * NCHW => NHWCN16 - * if there is remainder, it should be NHWCN8 - */ - U32 oc = fn / 16; - for (U32 o = 0; o < oc; o++) { - for (U32 hw = 0; hw < fh*fw; hw++) { - for (U32 c = 0; c < fc; c++) { - for (U32 o16 = 0; o16 < 16; o16++) { - ftmArray[o*fh*fw*fc*16 + hw*fc*16 + c*16 + o16] = filterArray[(o*16+o16)*fc*fh*fw + c*fh*fw + hw]; - } - } - } - } - if (fn != oc*16) { - for (U32 hw = 0; hw < fh*fw; hw++) { - for (U32 c = 0; c < fc; c++) { - for (U32 o8 = 0; o8 < 8; o8++) { - ftmArray[(oc*16)*fh*fw*fc + hw*fc*8 + c*8 + o8] = filterArray[(oc*16+o8)*fc*fh*fw + c*fh*fw + hw]; - } - } - } - } - *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); - break; - } - case DF_NCHWN16: { - /* - * NCHW => NCHWN16 - */ - U32 oc = fn / 16; - for (U32 o = 0; o < oc; o++) { - for (U32 chw = 0; chw < fc*fh*fw; chw++) { - for (U32 o16 = 0; o16 < 16; o16++) { - ftmArray[o*fc*fh*fw*16 + chw*16 + o16] = filterArray[(o*16+o16)*fc*fh*fw + chw]; - } - } - } - *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); - break; - } - case DF_HWNCN16: { - for (U32 o = 0; o < fn/16; o++) { - for (U32 c = 0; c < fc; c++) { - U32 f_off_0 = (o*16)*fc*fh*fw + c*fh*fw; - U32 f_off_1 = (o*16+8)*fc*fh*fw + c*fh*fw; - U32 ftm_off_0 = o*36*fc*16 + c*16; - U32 ftm_off_1 = o*36*fc*16 + c*16 + 8; - F16 F[9][8]; - F16 *F_ptr[9]; - F16 *Fw[36]; - for (U32 hw = 0; hw < 9; hw++) { - for (U32 oo = 0; oo < 8; oo++) { - F[hw][oo] = filterArray[f_off_0 + hw + oo*fc*fh*fw]; - } - F_ptr[hw] = F[hw]; - } - for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_0 + hw*fc*16; - } - trans_W_4x4_3x3(Fw, F_ptr); - for (U32 hw = 0; hw < 9; hw++) { - for (U32 oo = 0; oo < 8; oo++) { - F[hw][oo] = filterArray[f_off_1 + hw + oo*fc*fh*fw]; - } - F_ptr[hw] = F[hw]; - } - for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_1 + hw*fc*16; - } - trans_W_4x4_3x3(Fw, F_ptr); - } - } - U32 oc = (fn / 16) * 16; - if (oc != fn) { - for (U32 c = 0; c < fc; c++) { - U32 f_off_0 = oc*fc*fh*fw + c*fh*fw; - U32 ftm_off_0 = oc*36*fc + c*8; - F16 F[9][8]; - F16 *F_ptr[9]; - F16 *Fw[36]; - for (U32 hw = 0; hw < 9; hw++) { - for (U32 oo = 0; oo < 8; oo++) { - F[hw][oo] = filterArray[f_off_0 + hw + oo*fc*fh*fw]; - } - F_ptr[hw] = F[hw]; - } - for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_0 + hw*fc*8; - } - trans_W_4x4_3x3(Fw, F_ptr); - } - } - *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, 6, 6); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE 
convolution_transform_filter_fp16(TensorDesc filterDesc, const F16* filter, - ConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, F16* filterTransformed) -{ - DataFormat ftmDataFormat; - switch (algorithm) { - case CONVOLUTION_ALGORITHM_WINOGRAD: - ftmDataFormat = DF_HWNCN16; - break; - case CONVOLUTION_ALGORITHM_DIRECT: - ftmDataFormat = DF_NCHWN16; - break; - case CONVOLUTION_ALGORITHM_GEMM: - ftmDataFormat = DF_NHWCN16; - break; - case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: - ftmDataFormat = DF_NHWCN16; - break; - default: - return NOT_MATCH; - } - EE ret = convolution_transform_filter_kernel_fp16(filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat); - CHECK_STATUS(ret); - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp16/convolution_winograd.h b/tensor_computing/src/cpu/arm/fp16/convolution_winograd.h deleted file mode 100644 index 6c7c5511..00000000 --- a/tensor_computing/src/cpu/arm/fp16/convolution_winograd.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
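For reference, the DF_NCHW => DF_NHWCN16 reorder performed by the deleted transform above amounts to the index mapping below. This is a standalone sketch under the assumption that fn is a multiple of 16 (the deleted kernel additionally packs an 8-channel remainder block as NHWCN8); all names are local to the sketch.

static void nchw_to_nhwcn16_sketch(const __fp16 *src, __fp16 *dst,
                                   unsigned fn, unsigned fc,
                                   unsigned fh, unsigned fw)
{
    for (unsigned o = 0; o < fn / 16; o++)
        for (unsigned hw = 0; hw < fh * fw; hw++)
            for (unsigned c = 0; c < fc; c++)
                for (unsigned o16 = 0; o16 < 16; o16++)
                    // 16 consecutive output channels sit innermost, so the
                    // GEMM microkernels can load two adjacent 8-wide filter
                    // vectors per broadcast input scalar
                    dst[((o * fh * fw + hw) * fc + c) * 16 + o16] =
                        src[((o * 16 + o16) * fc + c) * fh * fw + hw];
}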
- - -#ifndef _H_CONVOLUTION_WINOGRAD -#define _H_CONVOLUTION_WINOGRAD - -#include "sys.h" -#include "type.h" -#include "error.h" -#include "tensor_desc.h" -#include "tensor_computing_type.h" - -EE convolution_winograd_A55(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc); - -EE convolution_winograd_A76(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc); - -inline EE convolution_winograd(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc, - Arch arch) -{ - EE ret = SUCCESS; - switch (arch) { - case ARM_A55: - ret = convolution_winograd_A55(inputDesc, inArray, - filterDesc, filterArray, - convDesc, - biasDesc, biasArray, - tmpBytes, tmp, - outputDesc, outArray, - activationDesc); - break; - case ARM_A76: - ret = convolution_winograd_A76(inputDesc, inArray, - filterDesc, filterArray, - convDesc, - biasDesc, biasArray, - tmpBytes, tmp, - outputDesc, outArray, - activationDesc); - break; - default: - return NOT_SUPPORTED; - } - return ret; -} -#endif diff --git a/tensor_computing/src/cpu/arm/fp16/convolution_winograd_A55.cpp b/tensor_computing/src/cpu/arm/fp16/convolution_winograd_A55.cpp deleted file mode 100644 index 06b7fd68..00000000 --- a/tensor_computing/src/cpu/arm/fp16/convolution_winograd_A55.cpp +++ /dev/null @@ -1,865 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
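The header deleted above is a thin micro-architecture dispatcher. A hypothetical call site, sketched against the header's own signature (all argument values illustrative), would look like this; the A55 and A76 kernels compute the same result and differ only in instruction scheduling, as the two files below show: the A55 variant splits each 128-bit load into "ldr d" + "ldr x" + "ins" pairs that the in-order A55 can dual-issue, while the A76 variant uses plain "ldr q" loads.

// Sketch: route a transformed-filter (DF_HWNCN16) FP16 Winograd convolution
// to the Cortex-A76 kernel; tmp must hold the in_pad/itm/otm scratch areas.
EE ret = convolution_winograd(inputDesc, input,
                              filterDesc, filterTransformed,
                              convDesc,
                              biasDesc, bias,
                              tmpBytes, tmp,
                              outputDesc, output,
                              activationDesc, ARM_A76);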
- - -#include "cpu/arm/fp16/convolution_winograd_transform.h" -#include "cpu/arm/fp16/convolution_winograd.h" - -EE convolution_winograd_A55(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - if (fdf != DF_HWNCN16) - CHECK_STATUS(NOT_MATCH); - if (!(fh == 6 && fw == 6)) - CHECK_STATUS(NOT_SUPPORTED); - - oc /= 8; - ic /= 8; - - U32 tile_h = (oh + 3) / 4; - U32 tile_w = (ow + 3) / 4; - I32 tiles = tile_h * tile_w; // num of 6x6 tiles - U32 pad_left = paddingL; - U32 pad_right = paddingR + (tile_w*4 - ow); - U32 pad_w_mod_4 = tile_w*4 - ow; - U32 pad_top = paddingT; - U32 pad_bottom = paddingB + (tile_h*4 - oh); - U32 pad_h_mod_4 = tile_h*4 - oh; - U32 ih_pad = ih + pad_top + pad_bottom; - U32 iw_pad = iw + pad_left + pad_right; - // tmp = in_pad + itm + otm - // in_pad: ic*ih_pad*iw_pad*8 - // itm: 6*6*ic*8*8 - // otm: oc*6*6*8*8 - F16* inArray_pad = (F16*)tmp; - F16* itmArray = inArray_pad + ic*ih_pad*iw_pad*8; - F16* otmArray = itmArray + 6*6*ic*8*8; - - EE ret = SUCCESS; - // copy input into a input with padding - for (U32 n = 0; n < in; n++) { - F16 *inArray_pad_mov = inArray_pad; - F16 *inArray_mov = inArray + n*ic*ih*iw*8; - for (U32 c = 0; c < ic; c++) { - memset(inArray_pad_mov, 0, pad_top*iw_pad*8*bytesOf(idt)); - inArray_pad_mov += pad_top*iw_pad*8; - for (U32 h = pad_top; h < ih_pad - pad_bottom; h++) { - memset(inArray_pad_mov, 0, pad_left*8*bytesOf(idt)); - inArray_pad_mov += pad_left*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*bytesOf(idt)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, pad_right*8*bytesOf(idt)); - inArray_pad_mov += pad_right*8; - } - memset(inArray_pad_mov, 0, pad_bottom*iw_pad*8*bytesOf(idt)); - inArray_pad_mov += pad_bottom*iw_pad*8; - } - - // tiles / 8 - for (I32 hw = 0; hw < tiles-7; hw+=8) { - const F16 *ftm_0 = filterArray; - F16 *otm_0 = otmArray; - // in trans - // NCHWc8 => (6*6)*C*c8*hw8 - for (U32 c = 0; c < ic; c++) { - F16 *inArray_pad_mov = inArray_pad + c*ih_pad*iw_pad*8; - F16 *Iw_ptr[36]; - F16 Iw0[36][8]; - F16 *I0[36]; - F16 Iw1[36][8]; - F16 *I1[36]; - F16 Iw2[36][8]; - F16 *I2[36]; - F16 Iw3[36][8]; - F16 *I3[36]; - F16 Iw4[36][8]; - F16 *I4[36]; - F16 Iw5[36][8]; - F16 *I5[36]; - F16 Iw6[36][8]; - F16 *I6[36]; - F16 Iw7[36][8]; - F16 *I7[36]; - F16 *itmArray_mov = itmArray + c*8*8; - U32 h0 = (hw/tile_w)*4; - U32 w0 = (hw%tile_w)*4; - U32 h1 = ((hw+1)/tile_w)*4; - U32 w1 = ((hw+1)%tile_w)*4; - U32 h2 = ((hw+2)/tile_w)*4; - U32 w2 = ((hw+2)%tile_w)*4; - U32 h3 = ((hw+3)/tile_w)*4; - U32 w3 = ((hw+3)%tile_w)*4; - U32 h4 = ((hw+4)/tile_w)*4; - U32 w4 = ((hw+4)%tile_w)*4; - U32 h5 = ((hw+5)/tile_w)*4; - U32 w5 = ((hw+5)%tile_w)*4; - U32 h6 = ((hw+6)/tile_w)*4; - U32 w6 = ((hw+6)%tile_w)*4; - U32 h7 = ((hw+7)/tile_w)*4; - U32 w7 = ((hw+7)%tile_w)*4; - for (U32 i = 
0; i < 6; i++) { - for (U32 j = 0; j < 6; j++) { - I0[i*6 + j] = inArray_pad_mov + (h0+i)*iw_pad*8 + (w0+j)*8; - I1[i*6 + j] = inArray_pad_mov + (h1+i)*iw_pad*8 + (w1+j)*8; - I2[i*6 + j] = inArray_pad_mov + (h2+i)*iw_pad*8 + (w2+j)*8; - I3[i*6 + j] = inArray_pad_mov + (h3+i)*iw_pad*8 + (w3+j)*8; - I4[i*6 + j] = inArray_pad_mov + (h4+i)*iw_pad*8 + (w4+j)*8; - I5[i*6 + j] = inArray_pad_mov + (h5+i)*iw_pad*8 + (w5+j)*8; - I6[i*6 + j] = inArray_pad_mov + (h6+i)*iw_pad*8 + (w6+j)*8; - I7[i*6 + j] = inArray_pad_mov + (h7+i)*iw_pad*8 + (w7+j)*8; - } - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw0[i]; - } - trans_I_4x4_3x3(Iw_ptr, I0); - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw1[i]; - } - trans_I_4x4_3x3(Iw_ptr, I1); - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw2[i]; - } - trans_I_4x4_3x3(Iw_ptr, I2); - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw3[i]; - } - trans_I_4x4_3x3(Iw_ptr, I3); - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw4[i]; - } - trans_I_4x4_3x3(Iw_ptr, I4); - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw5[i]; - } - trans_I_4x4_3x3(Iw_ptr, I5); - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw6[i]; - } - trans_I_4x4_3x3(Iw_ptr, I6); - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw7[i]; - } - trans_I_4x4_3x3(Iw_ptr, I7); - for (U32 i = 0; i < 36; i++) { - F16* itm = itmArray_mov + i*ic*8*8; - - // for (U32 c8 = 0; c8 < 8; c8++) { - // itm[c8*8] = Iw0[i][c8]; - // itm[c8*8 + 1] = Iw1[i][c8]; - // itm[c8*8 + 2] = Iw2[i][c8]; - // itm[c8*8 + 3] = Iw3[i][c8]; - // itm[c8*8 + 4] = Iw4[i][c8]; - // itm[c8*8 + 5] = Iw5[i][c8]; - // itm[c8*8 + 6] = Iw6[i][c8]; - // itm[c8*8 + 7] = Iw7[i][c8]; - // } - - float16x8_t v0 = vld1q_f16(Iw0[i]); - float16x8_t v1 = vld1q_f16(Iw1[i]); - float16x8_t v2 = vld1q_f16(Iw2[i]); - float16x8_t v3 = vld1q_f16(Iw3[i]); - float16x8_t v4 = vld1q_f16(Iw4[i]); - float16x8_t v5 = vld1q_f16(Iw5[i]); - float16x8_t v6 = vld1q_f16(Iw6[i]); - float16x8_t v7 = vld1q_f16(Iw7[i]); - vst1q_f16(itm, - vzip1q_f16( - vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(itm + 8, - vzip2q_f16( - vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(itm + 8*2, - vzip1q_f16( - vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(itm + 8*3, - vzip2q_f16( - vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(itm + 8*4, - vzip1q_f16( - vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(itm + 8*5, - vzip2q_f16( - vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(itm + 8*6, - vzip1q_f16( - vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(itm + 8*7, - vzip2q_f16( - vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - } - } - for (I32 o = 0; o < I32(oc-1); o+=2) { - const F16 *b_0 = biasArray + o*8; - const F16 *b_1 = b_0 + 8; - F16 *itm_0 = itmArray; - // dot prod - // (6*6)*C*c8*hw8 times O*(6*6)*C*c8*o16 = O*(6*6)*hw8*o16 - for (U32 idx = 0; idx < 36; idx++) { - __asm__ __volatile__( - "mov x0, %[ic]\n" //ic_blk - "eor v2.16b, v2.16b, v2.16b\n" //out_o0hw0 - "ldr d0, [%[in]]\n" //in_hw0 - "eor v4.16b, v4.16b, v4.16b\n" //out_o0hw1 - 
"ldr x1, [%[in], #8]\n" - "eor v6.16b, v6.16b, v6.16b\n" //out_o0hw2 - "ins v0.d[1], x1\n" - "eor v8.16b, v8.16b, v8.16b\n" //out_o0hw3 - "ldr d18, [%[f]]\n" //f_o0c0 - "eor v10.16b, v10.16b, v10.16b\n" //out_o0hw4 - "ldr x2, [%[f], #8]\n" - "eor v12.16b, v12.16b, v12.16b\n" //out_o0hw5 - "ins v18.d[1], x2\n" - "eor v14.16b, v14.16b, v14.16b\n" //out_o0hw6 - "ldr d19, [%[f], #16]\n" //f_o1c0 - "eor v16.16b, v16.16b, v16.16b\n" //out_o0hw7 - "ldr x3, [%[f], #24]\n" - "eor v3.16b, v3.16b, v3.16b\n" //out_o1hw0 - "ins v19.d[1], x3\n" - "eor v5.16b, v5.16b, v5.16b\n" //out_o1hw1 - "eor v7.16b, v7.16b, v7.16b\n" //out_o1hw2 - "eor v9.16b, v9.16b, v9.16b\n" //out_o1hw3 - "eor v11.16b, v11.16b, v11.16b\n" //out_o1hw4 - "eor v13.16b, v13.16b, v13.16b\n" //out_o1hw5 - "eor v15.16b, v15.16b, v15.16b\n" //out_o1hw6 - "eor v17.16b, v17.16b, v17.16b\n" //out_o1hw7 - "0:\n" - "ldr d1, [%[in], #16]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr x1, [%[in], #24]\n" - "fmla v4.8h, v18.8h, v0.h[1]\n" - "ins v1.d[1], x1\n" - "fmla v6.8h, v18.8h, v0.h[2]\n" - "ldr d20, [%[f], #32]\n" //f_o0c0 - "fmla v8.8h, v18.8h, v0.h[3]\n" - "ldr x2, [%[f], #40]\n" - "fmla v10.8h, v18.8h, v0.h[4]\n" - "ins v20.d[1], x2\n" - "fmla v12.8h, v18.8h, v0.h[5]\n" - "ldr d21, [%[f], #48]\n" //f_o1c0 - "fmla v14.8h, v18.8h, v0.h[6]\n" - "ldr x3, [%[f], #56]\n" - "fmla v16.8h, v18.8h, v0.h[7]\n" - "ins v21.d[1], x3\n" - "fmla v3.8h, v19.8h, v0.h[0]\n" - "fmla v5.8h, v19.8h, v0.h[1]\n" - "fmla v7.8h, v19.8h, v0.h[2]\n" - "fmla v9.8h, v19.8h, v0.h[3]\n" - "fmla v11.8h, v19.8h, v0.h[4]\n" - "fmla v13.8h, v19.8h, v0.h[5]\n" - "fmla v15.8h, v19.8h, v0.h[6]\n" - "fmla v17.8h, v19.8h, v0.h[7]\n" - - "ldr d0, [%[in], #32]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr x1, [%[in], #40]\n" - "fmla v4.8h, v20.8h, v1.h[1]\n" - "ins v0.d[1], x1\n" - "fmla v6.8h, v20.8h, v1.h[2]\n" - "ldr d18, [%[f], #64]\n" //f_o0c0 - "fmla v8.8h, v20.8h, v1.h[3]\n" - "ldr x2, [%[f], #72]\n" - "fmla v10.8h, v20.8h, v1.h[4]\n" - "ins v18.d[1], x2\n" - "fmla v12.8h, v20.8h, v1.h[5]\n" - "ldr d19, [%[f], #80]\n" //f_o1c0 - "fmla v14.8h, v20.8h, v1.h[6]\n" - "ldr x3, [%[f], #88]\n" - "fmla v16.8h, v20.8h, v1.h[7]\n" - "ins v19.d[1], x3\n" - "fmla v3.8h, v21.8h, v1.h[0]\n" - "add %[in], %[in], #32\n" - "fmla v5.8h, v21.8h, v1.h[1]\n" - "add %[f], %[f], #64\n" - "fmla v7.8h, v21.8h, v1.h[2]\n" - "subs x0, x0, #2\n" - "fmla v9.8h, v21.8h, v1.h[3]\n" - "fmla v11.8h, v21.8h, v1.h[4]\n" - "fmla v13.8h, v21.8h, v1.h[5]\n" - "fmla v15.8h, v21.8h, v1.h[6]\n" - "fmla v17.8h, v21.8h, v1.h[7]\n" - "bne 0b\n" - "st1 { v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" - "st1 { v6.8h, v7.8h, v8.8h, v9.8h}, [%[out]], #64\n" - "st1 {v10.8h, v11.8h, v12.8h, v13.8h}, [%[out]], #64\n" - "st1 {v14.8h, v15.8h, v16.8h, v17.8h}, [%[out]], #64\n" - :[out]"+r"(otm_0), - [in]"+r"(itm_0), - [f]"+r"(ftm_0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "x0", "x1", "x2", "x3" - ); - } - // out trans - // O*(6*6)*hw8*o16 => NOHWo8 - for (U32 hw8 = 0; hw8 < 8; hw8++) { - U32 h = (hw+hw8) / tile_w; - U32 w = (hw+hw8) % tile_w; - F16 *out_0 = outArray + n*oc*oh*ow*8 + o*oh*ow*8 + h*4*ow*8 + w*4*8; - F16 *out_1 = out_0 + oh*ow*8; - U32 otm_off_0 = o*8*36*8 + hw8*16; - U32 otm_off_1 = otm_off_0 + 8; - - F16 *Ow_0[36]; - F16 *Ow_1[36]; - F16 *O_0[16]; - F16 *O_1[16]; - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + otm_off_0 + idx*8*16; - 
Ow_1[idx] = otmArray + otm_off_1 + idx*8*16; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - O_1[i*4 + j] = out_1 + i*ow*8 + j*8; - } - } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - } - } - if (oc & 1) { - F16 *itm_0 = itmArray; - const F16 *ftm_0 = filterArray + (oc-1)*36*ic*8*8; - F16 *otm_0 = otmArray + (oc-1)*36*8*8; - const F16 *b_0 = biasArray + (oc-1)*8; - // dot prod - // (6*6)*C*c8*hw8 times O*(6*6)*C*c8*o8 = O*(6*6)*hw8*o8 - for (U32 idx = 0; idx < 36; idx++) { - __asm__ __volatile__( - "mov x0, %[ic]\n" //ic_blk - "eor v2.16b, v2.16b, v2.16b\n" //out_o0hw0 - "ldr d0, [%[in]]\n" //in_hw0 - "eor v3.16b, v3.16b, v3.16b\n" //out_o0hw1 - "ldr x1, [%[in], #8]\n" - "eor v4.16b, v4.16b, v4.16b\n" //out_o0hw2 - "ins v0.d[1], x1\n" - "eor v5.16b, v5.16b, v5.16b\n" //out_o0hw3 - "ldr d18, [%[f]]\n" //f_o0c0 - "eor v6.16b, v6.16b, v6.16b\n" //out_o0hw4 - "ldr x2, [%[f], #8]\n" - "eor v7.16b, v7.16b, v7.16b\n" //out_o0hw5 - "ins v18.d[1], x2\n" - "eor v8.16b, v8.16b, v8.16b\n" //out_o0hw6 - "eor v9.16b, v9.16b, v9.16b\n" //out_o0hw7 - "0:\n" - "ldr d1, [%[in], #16]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr x1, [%[in], #24]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "ins v1.d[1], x1\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ldr d20, [%[f], #16]\n" //f_o0c0 - "fmla v5.8h, v18.8h, v0.h[3]\n" - "ldr x2, [%[f], #24]\n" - "fmla v6.8h, v18.8h, v0.h[4]\n" - "ins v20.d[1], x2\n" - "fmla v7.8h, v18.8h, v0.h[5]\n" - "fmla v8.8h, v18.8h, v0.h[6]\n" - "subs x0, x0, #2\n" - "fmla v9.8h, v18.8h, v0.h[7]\n" - - "ldr d0, [%[in], #32]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr x1, [%[in], #40]\n" - "fmla v3.8h, v20.8h, v1.h[1]\n" - "ins v0.d[1], x1\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr d18, [%[f], #32]\n" //f_o0c0 - "fmla v5.8h, v20.8h, v1.h[3]\n" - "ldr x2, [%[f], #40]\n" - "fmla v6.8h, v20.8h, v1.h[4]\n" - "ins v18.d[1], x2\n" - "fmla v7.8h, v20.8h, v1.h[5]\n" - "add %[in], %[in], #32\n" - "fmla v8.8h, v20.8h, v1.h[6]\n" - "add %[f], %[f], #32\n" - "fmla v9.8h, v20.8h, v1.h[7]\n" - "bne 0b\n" - "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" - "st1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%[out]], #64\n" - :[out]"+r"(otm_0), - [in]"+r"(itm_0), - [f]"+r"(ftm_0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v18", "v20", "x0", "x1", "x2" - ); - } - // out trans - // O*(6*6)*hw8*o8 => NOWHo8 - for (U32 hw8 = 0; hw8 < 8; hw8++) { - U32 h = (hw+hw8) / tile_w; - U32 w = (hw+hw8) % tile_w; - F16 *out_0 = outArray + n*oc*oh*ow*8 + (oc-1)*oh*ow*8 + h*4*ow*8 + w*4*8; - U32 otm_off_0 = (oc-1)*8*36*8 + hw8*8; - - F16 *Ow_0[36]; - F16 *O_0[16]; - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + otm_off_0 + idx*8*8; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - } - } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - } - } - } - - // tiles_reminder % 8 / 4 - I32 tiles_s = (tiles / 8) * 8; - for (I32 hw = tiles_s; hw < tiles-3; hw+=4) { - const F16 *ftm_0 = filterArray; - F16 *otm_0 = otmArray; - // in trans - // NCHWc8 => (6*6)*C*c8*hw4 - for (U32 c = 0; c < ic; c++) { - F16 *inArray_pad_mov = inArray_pad + c*ih_pad*iw_pad*8; - F16 *Iw_ptr[36]; - F16 Iw0[36][8]; 
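// Note on the hw4 in-transform just below: the commented-out scalar loop
// states the intent (itm[c8*4 + t] = Iw_t[i][c8]), and the single "st4"
// structured store performs exactly that 4-way interleave in one instruction
// -- the hw4 counterpart of the nested vzip1q_f16/vzip2q_f16 transpose that
// the hw8 path above builds out of eight vld1q_f16 loads.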
- F16 *I0[36]; - F16 Iw1[36][8]; - F16 *I1[36]; - F16 Iw2[36][8]; - F16 *I2[36]; - F16 Iw3[36][8]; - F16 *I3[36]; - F16 *itmArray_mov = itmArray + c*8*4; - U32 h0 = (hw/tile_w)*4; - U32 w0 = (hw%tile_w)*4; - U32 h1 = ((hw+1)/tile_w)*4; - U32 w1 = ((hw+1)%tile_w)*4; - U32 h2 = ((hw+2)/tile_w)*4; - U32 w2 = ((hw+2)%tile_w)*4; - U32 h3 = ((hw+3)/tile_w)*4; - U32 w3 = ((hw+3)%tile_w)*4; - for (U32 i = 0; i < 6; i++) { - for (U32 j = 0; j < 6; j++) { - I0[i*6 + j] = inArray_pad_mov + (h0+i)*iw_pad*8 + (w0+j)*8; - I1[i*6 + j] = inArray_pad_mov + (h1+i)*iw_pad*8 + (w1+j)*8; - I2[i*6 + j] = inArray_pad_mov + (h2+i)*iw_pad*8 + (w2+j)*8; - I3[i*6 + j] = inArray_pad_mov + (h3+i)*iw_pad*8 + (w3+j)*8; - } - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw0[i]; - } - trans_I_4x4_3x3(Iw_ptr, I0); - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw1[i]; - } - trans_I_4x4_3x3(Iw_ptr, I1); - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw2[i]; - } - trans_I_4x4_3x3(Iw_ptr, I2); - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw3[i]; - } - trans_I_4x4_3x3(Iw_ptr, I3); - for (U32 i = 0; i < 36; i++) { - F16* itm = itmArray_mov + i*ic*8*4; - - // for (U32 c8 = 0; c8 < 8; c8++) { - // itm[c8*4] = Iw0[i][c8]; - // itm[c8*4 + 1] = Iw1[i][c8]; - // itm[c8*4 + 2] = Iw2[i][c8]; - // itm[c8*4 + 3] = Iw3[i][c8]; - // } - - __asm__ __volatile__( - "ldr q0, [%[in_0]]\n" - "ldr q1, [%[in_1]]\n" - "ldr q2, [%[in_2]]\n" - "ldr q3, [%[in_3]]\n" - "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[itm]]\n" - :[itm]"+r"(itm) - :[in_0]"r"(Iw0[i]), - [in_1]"r"(Iw1[i]), - [in_2]"r"(Iw2[i]), - [in_3]"r"(Iw3[i]) - :"memory", "cc", "v0", "v1", "v2", "v3" - ); - } - } - for (I32 o = 0; o < I32(oc-1); o+=2) { - const F16 *b_0 = biasArray + o*8; - const F16 *b_1 = b_0 + 8; - F16 *itm_0 = itmArray; - // dot prod - // (6*6)*C*c8*hw4 times O*(6*6)*C*c8*o16 = O*(6*6)*hw4*o16 - for (U32 idx = 0; idx < 36; idx++) { - __asm__ __volatile__( - "mov x0, %[ic]\n" //ic_blk - "eor v2.16b, v2.16b, v2.16b\n" //out_o0hw0 - "ldr d0, [%[in]]\n" //in_hw0 - "eor v4.16b, v4.16b, v4.16b\n" //out_o0hw1 - "ldr d18, [%[f]]\n" //f_o0c0 - "eor v6.16b, v6.16b, v6.16b\n" //out_o0hw2 - "ldr x2, [%[f], #8]\n" //f_o0c0 - "eor v8.16b, v8.16b, v8.16b\n" //out_o0hw3 - "ins v18.d[1], x2\n" //f_o0c0 - "ldr d19, [%[f], #16]\n" //f_o1c0 - "eor v3.16b, v3.16b, v3.16b\n" //out_o1hw0 - "ldr x3, [%[f], #24]\n" //f_o1c0 - "eor v5.16b, v5.16b, v5.16b\n" //out_o1hw1 - "ins v19.d[1], x3\n" //f_o1c0 - "eor v7.16b, v7.16b, v7.16b\n" //out_o1hw2 - "eor v9.16b, v9.16b, v9.16b\n" //out_o1hw3 - "0:\n" - "ldr d1, [%[in], #8]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f], #32]\n" //f_o0c0 - "fmla v4.8h, v18.8h, v0.h[1]\n" - "ldr x2, [%[f], #40]\n" //f_o0c0 - "fmla v6.8h, v18.8h, v0.h[2]\n" - "ins v20.d[1], x2\n" //f_o0c0 - "fmla v8.8h, v18.8h, v0.h[3]\n" - "ldr d21, [%[f], #48]\n" //f_o1c0 - "fmla v3.8h, v19.8h, v0.h[0]\n" - "ldr x3, [%[f], #56]\n" //f_o1c0 - "fmla v5.8h, v19.8h, v0.h[1]\n" - "ins v21.d[1], x3\n" //f_o1c0 - "fmla v7.8h, v19.8h, v0.h[2]\n" - "subs x0, x0, #2\n" - "fmla v9.8h, v19.8h, v0.h[3]\n" - - "ldr d0, [%[in], #16]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f], #64]\n" //f_o0c0 - "fmla v4.8h, v20.8h, v1.h[1]\n" - "ldr x2, [%[f], #72]\n" //f_o0c0 - "fmla v6.8h, v20.8h, v1.h[2]\n" - "ins v18.d[1], x2\n" //f_o0c0 - "fmla v8.8h, v20.8h, v1.h[3]\n" - "ldr d19, [%[f], #80]\n" //f_o1c0 - "fmla v3.8h, v21.8h, v1.h[0]\n" - "ldr x3, [%[f], #88]\n" //f_o1c0 - "fmla v5.8h, v21.8h, v1.h[1]\n" - "ins v19.d[1], x3\n" //f_o1c0 - "fmla v7.8h, v21.8h, v1.h[2]\n" 
- "add %[in], %[in], #16\n" - "fmla v9.8h, v21.8h, v1.h[3]\n" - "add %[f], %[f], #64\n" - "bne 0b\n" - "st1 { v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" - "st1 { v6.8h, v7.8h, v8.8h, v9.8h}, [%[out]], #64\n" - :[out]"+r"(otm_0), - [in]"+r"(itm_0), - [f]"+r"(ftm_0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v18", "v19", "v20", "v21", "x0", "x1", "x2", "x3" - ); - } - // out trans - // O*(6*6)*hw4*o16 => NOWHo8 - for (U32 hw4 = 0; hw4 < 4; hw4++) { - U32 h = (hw+hw4) / tile_w; - U32 w = (hw+hw4) % tile_w; - F16 *out_0 = outArray + n*oc*oh*ow*8 + o*oh*ow*8 + h*4*ow*8 + w*4*8; - F16 *out_1 = out_0 + oh*ow*8; - U32 otm_off_0 = o*8*36*4 + hw4*16; - U32 otm_off_1 = otm_off_0 + 8; - - F16 *Ow_0[36]; - F16 *Ow_1[36]; - F16 *O_0[16]; - F16 *O_1[16]; - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + otm_off_0 + idx*4*16; - Ow_1[idx] = otmArray + otm_off_1 + idx*4*16; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - O_1[i*4 + j] = out_1 + i*ow*8 + j*8; - } - } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - } - } - if (oc & 1) { - F16 *itm_0 = itmArray; - const F16 *ftm_0 = filterArray + (oc-1)*8*36*ic*8; - F16 *otm_0 = otmArray + (oc-1)*8*36*4; - const F16 *b_0 = biasArray + (oc-1)*8; - // dot prod - // (6*6)*C*c8*hw4 times O*(6*6)*C*c8*o8 = O*(6*6)*hw4*o8 - for (U32 idx = 0; idx < 36; idx++) { - __asm__ __volatile__( - "mov x0, %[ic]\n" //ic_blk - "eor v2.16b, v2.16b, v2.16b\n" //out_o0hw0 - "ldr d0, [%[in]]\n" //in_hw0 - "eor v3.16b, v3.16b, v3.16b\n" //out_o0hw1 - "ldr d18, [%[f]]\n" //f_o0c0 - "eor v4.16b, v4.16b, v4.16b\n" //out_o0hw2 - "ldr x2, [%[f], #8]\n" - "eor v5.16b, v5.16b, v5.16b\n" //out_o0hw3 - "ins v18.d[1], x2\n" - "0:\n" - "ldr d1, [%[in], #8]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f], #16]\n" //f_o0c0 - "fmla v3.8h, v18.8h, v0.h[1]\n" - "ldr x2, [%[f], #24]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ins v20.d[1], x2\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "subs x0, x0, #2\n" - - "ldr d0, [%[in], #16]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f], #32]\n" //f_o0c0 - "fmla v3.8h, v20.8h, v1.h[1]\n" - "ldr x2, [%[f], #40]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ins v18.d[1], x2\n" - "fmla v5.8h, v20.8h, v1.h[3]\n" - "add %[in], %[in], #16\n" - "add %[f], %[f], #32\n" - "bne 0b\n" - "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" - :[out]"+r"(otm_0), - [in]"+r"(itm_0), - [f]"+r"(ftm_0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v18", "v20", "x0", "x2" - ); - } - // out trans - // O*(6*6)*hw4*o8 => NOWHo8 - for (U32 hw4 = 0; hw4 < 4; hw4++) { - U32 h = (hw+hw4) / tile_w; - U32 w = (hw+hw4) % tile_w; - F16 *out_0 = outArray + n*oc*oh*ow*8 + (oc-1)*oh*ow*8 + h*4*ow*8 + w*4*8; - U32 otm_off_0 = (oc-1)*8*36*4 + hw4*8; - - F16 *Ow_0[36]; - F16 *O_0[16]; - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + otm_off_0 + idx*4*8; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - } - } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - } - } - } - - // tiles_reminder % 4 - tiles_s = (tiles / 4) * 4; - for (I32 hw = tiles_s; hw < tiles; hw++) { - const F16 *ftm_0 = 
filterArray; - F16 *otm_0 = otmArray; - // in trans - // NCHWc8 => (6*6)*C*c8*hw1 - for (U32 c = 0; c < ic; c++) { - F16 *inArray_pad_mov = inArray_pad + c*ih_pad*iw_pad*8; - F16 *Iw_ptr[36]; - F16 Iw0[36][8]; - F16 *I0[36]; - F16 *itmArray_mov = itmArray + c*8; - U32 h0 = (hw/tile_w)*4; - U32 w0 = (hw%tile_w)*4; - for (U32 i = 0; i < 6; i++) { - for (U32 j = 0; j < 6; j++) { - I0[i*6 + j] = inArray_pad_mov + (h0+i)*iw_pad*8 + (w0+j)*8; - } - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw0[i]; - } - trans_I_4x4_3x3(Iw_ptr, I0); - for (U32 i = 0; i < 36; i++) { - F16* itm = itmArray_mov + i*ic*8; - - // for (U32 c8 = 0; c8 < 8; c8++) { - // itm[c8] = Iw0[i][c8]; - // } - memcpy(itm, Iw0[i], 8*bytesOf(idt)); - } - } - for (I32 o = 0; o < I32(oc-1); o+=2) { - const F16 *b_0 = biasArray + o*8; - const F16 *b_1 = b_0 + 8; - F16 *itm_0 = itmArray; - // dot prod - // (6*6)*C*c8*hw1 times O*(6*6)*C*c8*o16 = O*(6*6)*hw1*o16 - for (U32 idx = 0; idx < 36; idx++) { - __asm__ __volatile__( - "mov x0, %[ic]\n" //ic_blk - "eor v2.16b, v2.16b, v2.16b\n" //out_o0hw0 - "ldr h0, [%[in]]\n" //in_hw0 - "eor v3.16b, v3.16b, v3.16b\n" //out_o1hw0 - "ldr d18, [%[f]]\n" //f_o0c0 - "ldr x2, [%[f], #8]\n" //f_o0c0 - "ins v18.d[1], x2\n" //f_o0c0 - "ldr d19, [%[f], #16]\n" //f_o1c0 - "ldr x3, [%[f], #24]\n" //f_o1c0 - "ins v19.d[1], x3\n" //f_o1c0 - "0:\n" - "ldr h1, [%[in], #2]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f], #32]\n" //f_o0c0 - "fmla v3.8h, v19.8h, v0.h[0]\n" - "ldr x2, [%[f], #40]\n" //f_o0c0 - "ins v20.d[1], x2\n" //f_o0c0 - "ldr d21, [%[f], #48]\n" //f_o1c0 - "subs x0, x0, #2\n" - "ldr x3, [%[f], #56]\n" //f_o1c0 - "ins v21.d[1], x3\n" //f_o1c0 - - "ldr h0, [%[in], #4]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f], #64]\n" //f_o0c0 - "fmla v3.8h, v21.8h, v1.h[0]\n" - "ldr x2, [%[f], #72]\n" //f_o0c0 - "ins v18.d[1], x2\n" //f_o0c0 - "ldr d19, [%[f], #80]\n" //f_o1c0 - "add %[in], %[in], #4\n" - "ldr x3, [%[f], #88]\n" //f_o1c0 - "ins v19.d[1], x3\n" //f_o1c0 - "add %[f], %[f], #64\n" - "bne 0b\n" - "st1 {v2.8h, v3.8h}, [%[out]], #32\n" - :[out]"+r"(otm_0), - [in]"+r"(itm_0), - [f]"+r"(ftm_0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v2", "v3", "v18", "v19", "v20", "v21", "x0", "x2", "x3" - ); - } - // out trans - // O*(6*6)*hw1*o16 => NOWHo8 - U32 h = hw / tile_w; - U32 w = hw % tile_w; - F16 *out_0 = outArray + n*oc*oh*ow*8 + o*oh*ow*8 + h*4*ow*8 + w*4*8; - F16 *out_1 = out_0 + oh*ow*8; - U32 otm_off_0 = o*8*36; - U32 otm_off_1 = otm_off_0 + 8; - - F16 *Ow_0[36]; - F16 *Ow_1[36]; - F16 *O_0[16]; - F16 *O_1[16]; - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + otm_off_0 + idx*16; - Ow_1[idx] = otmArray + otm_off_1 + idx*16; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - O_1[i*4 + j] = out_1 + i*ow*8 + j*8; - } - } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - } - if (oc & 1) { - F16 *itm_0 = itmArray; - const F16 *ftm_0 = filterArray + (oc-1)*8*36*ic*8; - F16 *otm_0 = otmArray + (oc-1)*8*36; - const F16 *b_0 = biasArray + (oc-1)*8; - // dot prod - // (6*6)*C*c8*hw1 times O*(6*6)*C*c8*o8 = O*(6*6)*hw1*o8 - for (U32 idx = 0; idx < 36; idx++) { - __asm__ __volatile__( - "mov x0, %[ic]\n" //ic_blk - "eor v2.16b, v2.16b, v2.16b\n" //out_o0hw0 - "ldr s0, [%[in]]\n" //in_hw0 - "ldr d18, 
[%[f]]\n" //f_o0c0 - "ldr x2, [%[f], #8]\n" - "ins v18.d[1], x2\n" - "0:\n" - "ldr h1, [%[in], #2]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f], #16]\n" //f_o0c0 - "ldr x2, [%[f], #24]\n" - "ins v20.d[1], x2\n" - "subs x0, x0, #2\n" - - "ldr h0, [%[in], #4]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f], #32]\n" //f_o0c0 - "ldr x2, [%[f], #40]\n" - "ins v18.d[1], x2\n" - "add %[in], %[in], #4\n" - "add %[f], %[f], #32\n" - "bne 0b\n" - "st1 {v2.8h}, [%[out]], #16\n" - :[out]"+r"(otm_0), - [in]"+r"(itm_0), - [f]"+r"(ftm_0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v2", "v18", "v20", "x0", "x2" - ); - } - // out trans - // O*(6*6)*hw1*o8 => NOWHo8 - U32 h = hw / tile_w; - U32 w = hw % tile_w; - F16 *out_0 = outArray + n*oc*oh*ow*8 + (oc-1)*oh*ow*8 + h*4*ow*8 + w*4*8; - U32 otm_off_0 = (oc-1)*8*36; - - F16 *Ow_0[36]; - F16 *O_0[16]; - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + otm_off_0 + idx*8; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - } - } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - } - } - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp16/convolution_winograd_A76.cpp b/tensor_computing/src/cpu/arm/fp16/convolution_winograd_A76.cpp deleted file mode 100644 index 03f6085f..00000000 --- a/tensor_computing/src/cpu/arm/fp16/convolution_winograd_A76.cpp +++ /dev/null @@ -1,733 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include "cpu/arm/fp16/convolution_winograd_transform.h" -#include "cpu/arm/fp16/convolution_winograd.h" - -EE convolution_winograd_A76(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - if (fdf != DF_HWNCN16) - CHECK_STATUS(NOT_MATCH); - if (!(fh == 6 && fw == 6)) - CHECK_STATUS(NOT_SUPPORTED); - - oc /= 8; - ic /= 8; - - U32 tile_h = (oh + 3) / 4; - U32 tile_w = (ow + 3) / 4; - // num of 6x6 tiles - I32 tiles = tile_h * tile_w; - U32 pad_left = paddingL; - U32 pad_right = paddingR + (tile_w*4 - ow); - U32 pad_w_mod_4 = tile_w*4 - ow; - U32 pad_top = paddingT; - U32 pad_bottom = paddingB + (tile_h*4 - oh); - U32 pad_h_mod_4 = tile_h*4 - oh; - U32 ih_pad = ih + pad_top + pad_bottom; - U32 iw_pad = iw + pad_left + pad_right; - // tmp = in_pad + itm + otm - // in_pad: ic*ih_pad*iw_pad*8 - // itm: 6*6*ic*8*8 - // otm: oc*6*6*8*8 - F16* inArray_pad = (F16*)tmp; - F16* itmArray = inArray_pad + ic*ih_pad*iw_pad*8; - F16* otmArray = itmArray + 6*6*ic*8*8; - - EE ret = SUCCESS; - // copy input into a input with padding - for (U32 n = 0; n < in; n++) { - F16 *inArray_pad_mov = inArray_pad; - F16 *inArray_mov = inArray + n*ic*ih*iw*8; - for (U32 c = 0; c < ic; c++) { - memset(inArray_pad_mov, 0, pad_top*iw_pad*8*bytesOf(idt)); - inArray_pad_mov += pad_top*iw_pad*8; - for (U32 h = pad_top; h < ih_pad - pad_bottom; h++) { - memset(inArray_pad_mov, 0, pad_left*8*bytesOf(idt)); - inArray_pad_mov += pad_left*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*bytesOf(idt)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, pad_right*8*bytesOf(idt)); - inArray_pad_mov += pad_right*8; - } - memset(inArray_pad_mov, 0, pad_bottom*iw_pad*8*bytesOf(idt)); - inArray_pad_mov += pad_bottom*iw_pad*8; - } - - // tiles / 8 - for (I32 hw = 0; hw < tiles-7; hw+=8) { - const F16 *ftm_0 = filterArray; - F16 *otm_0 = otmArray; - // in trans - // NCHWc8 => (6*6)*C*c8*hw8 - for (U32 c = 0; c < ic; c++) { - F16 *inArray_pad_mov = inArray_pad + c*ih_pad*iw_pad*8; - F16 *itmArray_mov = itmArray + c*8*8; - F16 *Iw_ptr[36]; - F16 Iw[8][36][8]; - F16 *I[8][36]; - U32 h[8]; - U32 w[8]; - for (U32 index = 0; index < 8; index++) { - h[index] = ((hw + index) / tile_w) * 4; - w[index] = ((hw + index) % tile_w) * 4; - } - for (U32 i = 0; i < 6; i++) { - for (U32 j = 0; j < 6; j++) { - for (U32 index = 0; index < 8; index++) { - I[index][i*6 + j] = inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8; - } - } - } - for (U32 index = 0; index < 8; index++) { - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw[index][i]; - } - trans_I_4x4_3x3(Iw_ptr, I[index]); - } - for (U32 i = 0; i < 36; i++) { - F16* itm = itmArray_mov + i*ic*8*8; - float16x8_t v0 = vld1q_f16(Iw[0][i]); - float16x8_t v1 = vld1q_f16(Iw[1][i]); - float16x8_t v2 = vld1q_f16(Iw[2][i]); - 
float16x8_t v3 = vld1q_f16(Iw[3][i]); - float16x8_t v4 = vld1q_f16(Iw[4][i]); - float16x8_t v5 = vld1q_f16(Iw[5][i]); - float16x8_t v6 = vld1q_f16(Iw[6][i]); - float16x8_t v7 = vld1q_f16(Iw[7][i]); - vst1q_f16(itm, - vzip1q_f16( - vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(itm + 8, - vzip2q_f16( - vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(itm + 8*2, - vzip1q_f16( - vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(itm + 8*3, - vzip2q_f16( - vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(itm + 8*4, - vzip1q_f16( - vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(itm + 8*5, - vzip2q_f16( - vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(itm + 8*6, - vzip1q_f16( - vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(itm + 8*7, - vzip2q_f16( - vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - } - } - for (I32 o = 0; o < I32(oc-1); o+=2) { - const F16 *b_0 = biasArray + o*8; - const F16 *b_1 = b_0 + 8; - F16 *itm_0 = itmArray; - // dot prod - // (6*6)*C*c8*hw8 times O*(6*6)*C*c8*o16 = O*(6*6)*hw8*o16 - for (U32 idx = 0; idx < 36; idx++) { - __asm__ __volatile__( - "mov x0, %[ic]\n" //ic_blk - "eor v2.16b, v2.16b, v2.16b\n" //out_o0hw0 - "ldr q0, [%[in]]\n" //in_hw0 - "eor v4.16b, v4.16b, v4.16b\n" //out_o0hw1 - "eor v6.16b, v6.16b, v6.16b\n" //out_o0hw2 - "eor v8.16b, v8.16b, v8.16b\n" //out_o0hw3 - "ldr q18, [%[f]]\n" //f_o0c0 - "eor v10.16b, v10.16b, v10.16b\n" //out_o0hw4 - "eor v12.16b, v12.16b, v12.16b\n" //out_o0hw5 - "eor v14.16b, v14.16b, v14.16b\n" //out_o0hw6 - "ldr q19, [%[f], #16]\n" //f_o1c0 - "eor v16.16b, v16.16b, v16.16b\n" //out_o0hw7 - "eor v3.16b, v3.16b, v3.16b\n" //out_o1hw0 - "eor v5.16b, v5.16b, v5.16b\n" //out_o1hw1 - "eor v7.16b, v7.16b, v7.16b\n" //out_o1hw2 - "eor v9.16b, v9.16b, v9.16b\n" //out_o1hw3 - "eor v11.16b, v11.16b, v11.16b\n" //out_o1hw4 - "eor v13.16b, v13.16b, v13.16b\n" //out_o1hw5 - "eor v15.16b, v15.16b, v15.16b\n" //out_o1hw6 - "eor v17.16b, v17.16b, v17.16b\n" //out_o1hw7 - "0:\n" - "ldr q1, [%[in], #16]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "fmla v4.8h, v18.8h, v0.h[1]\n" - "ldr q20, [%[f], #32]\n" //f_o0c0 - "fmla v6.8h, v18.8h, v0.h[2]\n" - "fmla v8.8h, v18.8h, v0.h[3]\n" - "ldr q21, [%[f], #48]\n" //f_o1c0 - "fmla v10.8h, v18.8h, v0.h[4]\n" - "fmla v12.8h, v18.8h, v0.h[5]\n" - "fmla v14.8h, v18.8h, v0.h[6]\n" - "fmla v16.8h, v18.8h, v0.h[7]\n" - "fmla v3.8h, v19.8h, v0.h[0]\n" - "fmla v5.8h, v19.8h, v0.h[1]\n" - "fmla v7.8h, v19.8h, v0.h[2]\n" - "fmla v9.8h, v19.8h, v0.h[3]\n" - "fmla v11.8h, v19.8h, v0.h[4]\n" - "fmla v13.8h, v19.8h, v0.h[5]\n" - "fmla v15.8h, v19.8h, v0.h[6]\n" - "fmla v17.8h, v19.8h, v0.h[7]\n" - - "ldr q0, [%[in], #32]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "fmla v4.8h, v20.8h, v1.h[1]\n" - "ldr q18, [%[f], #64]\n" //f_o0c0 - "fmla v6.8h, v20.8h, v1.h[2]\n" - "fmla v8.8h, v20.8h, v1.h[3]\n" - "ldr q19, [%[f], #80]\n" //f_o1c0 - "fmla v10.8h, v20.8h, v1.h[4]\n" - "fmla v12.8h, v20.8h, v1.h[5]\n" - "fmla v14.8h, v20.8h, v1.h[6]\n" - "fmla v16.8h, 
v20.8h, v1.h[7]\n" - "fmla v3.8h, v21.8h, v1.h[0]\n" - "add %[in], %[in], #32\n" - "fmla v5.8h, v21.8h, v1.h[1]\n" - "add %[f], %[f], #64\n" - "fmla v7.8h, v21.8h, v1.h[2]\n" - "subs x0, x0, #2\n" - "fmla v9.8h, v21.8h, v1.h[3]\n" - "fmla v11.8h, v21.8h, v1.h[4]\n" - "fmla v13.8h, v21.8h, v1.h[5]\n" - "fmla v15.8h, v21.8h, v1.h[6]\n" - "fmla v17.8h, v21.8h, v1.h[7]\n" - "bne 0b\n" - "st1 { v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" - "st1 { v6.8h, v7.8h, v8.8h, v9.8h}, [%[out]], #64\n" - "st1 {v10.8h, v11.8h, v12.8h, v13.8h}, [%[out]], #64\n" - "st1 {v14.8h, v15.8h, v16.8h, v17.8h}, [%[out]], #64\n" - :[out]"+r"(otm_0), - [in]"+r"(itm_0), - [f]"+r"(ftm_0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "x0" - ); - } - // out trans - // O*(6*6)*hw8*o16 => NOHWo8 - for (U32 hw8 = 0; hw8 < 8; hw8++) { - U32 h = (hw+hw8) / tile_w; - U32 w = (hw+hw8) % tile_w; - F16 *out_0 = outArray + n*oc*oh*ow*8 + o*oh*ow*8 + h*4*ow*8 + w*4*8; - F16 *out_1 = out_0 + oh*ow*8; - U32 otm_off_0 = o*8*36*8 + hw8*16; - U32 otm_off_1 = otm_off_0 + 8; - - F16 *Ow_0[36]; - F16 *Ow_1[36]; - F16 *O_0[16]; - F16 *O_1[16]; - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + otm_off_0 + idx*8*16; - Ow_1[idx] = otmArray + otm_off_1 + idx*8*16; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - O_1[i*4 + j] = out_1 + i*ow*8 + j*8; - } - } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - } - } - if (oc & 1) { - F16 *itm_0 = itmArray; - const F16 *ftm_0 = filterArray + (oc-1)*36*ic*8*8; - F16 *otm_0 = otmArray + (oc-1)*36*8*8; - const F16 *b_0 = biasArray + (oc-1)*8; - // dot prod - // (6*6)*C*c8*hw8 times O*(6*6)*C*c8*o8 = O*(6*6)*hw8*o8 - for (U32 idx = 0; idx < 36; idx++) { - __asm__ __volatile__( - "mov x0, %[ic]\n" //ic_blk - "eor v2.16b, v2.16b, v2.16b\n" //out_o0hw0 - "ldr q0, [%[in]]\n" //in_hw0 - "eor v3.16b, v3.16b, v3.16b\n" //out_o0hw1 - "eor v4.16b, v4.16b, v4.16b\n" //out_o0hw2 - "eor v5.16b, v5.16b, v5.16b\n" //out_o0hw3 - "ldr q18, [%[f]]\n" //f_o0c0 - "eor v6.16b, v6.16b, v6.16b\n" //out_o0hw4 - "eor v7.16b, v7.16b, v7.16b\n" //out_o0hw5 - "eor v8.16b, v8.16b, v8.16b\n" //out_o0hw6 - "eor v9.16b, v9.16b, v9.16b\n" //out_o0hw7 - "0:\n" - "ldr q1, [%[in], #16]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ldr q20, [%[f], #16]\n" //f_o0c0 - "fmla v5.8h, v18.8h, v0.h[3]\n" - "fmla v6.8h, v18.8h, v0.h[4]\n" - "fmla v7.8h, v18.8h, v0.h[5]\n" - "fmla v8.8h, v18.8h, v0.h[6]\n" - "subs x0, x0, #2\n" - "fmla v9.8h, v18.8h, v0.h[7]\n" - - "ldr q0, [%[in], #32]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "fmla v3.8h, v20.8h, v1.h[1]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr q18, [%[f], #32]\n" //f_o0c0 - "fmla v5.8h, v20.8h, v1.h[3]\n" - "fmla v6.8h, v20.8h, v1.h[4]\n" - "fmla v7.8h, v20.8h, v1.h[5]\n" - "add %[in], %[in], #32\n" - "fmla v8.8h, v20.8h, v1.h[6]\n" - "add %[f], %[f], #32\n" - "fmla v9.8h, v20.8h, v1.h[7]\n" - "bne 0b\n" - "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" - "st1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%[out]], #64\n" - :[out]"+r"(otm_0), - [in]"+r"(itm_0), - [f]"+r"(ftm_0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v2", 
"v3", "v4", "v5", "v6", "v7", "v8", "v9", "v18", "v20", "x0"); - } - // out trans - // O*(6*6)*hw8*o8 => NOWHo8 - for (U32 hw8 = 0; hw8 < 8; hw8++) { - U32 h = (hw+hw8) / tile_w; - U32 w = (hw+hw8) % tile_w; - F16 *out_0 = outArray + n*oc*oh*ow*8 + (oc-1)*oh*ow*8 + h*4*ow*8 + w*4*8; - U32 otm_off_0 = (oc-1)*8*36*8 + hw8*8; - - F16 *Ow_0[36]; - F16 *O_0[16]; - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + otm_off_0 + idx*8*8; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - } - } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - } - } - } - - // tiles_reminder % 8 / 4 - I32 tiles_s = (tiles / 8) * 8; - for (I32 hw = tiles_s; hw < tiles-3; hw+=4) { - const F16 *ftm_0 = filterArray; - F16 *otm_0 = otmArray; - // in trans - // NCHWc8 => (6*6)*C*c8*hw4 - for (U32 c = 0; c < ic; c++) { - F16 *inArray_pad_mov = inArray_pad + c*ih_pad*iw_pad*8; - F16 *Iw_ptr[36]; - F16 Iw0[36][8]; - F16 *I0[36]; - F16 Iw1[36][8]; - F16 *I1[36]; - F16 Iw2[36][8]; - F16 *I2[36]; - F16 Iw3[36][8]; - F16 *I3[36]; - F16 *itmArray_mov = itmArray + c*8*4; - U32 h0 = (hw/tile_w)*4; - U32 w0 = (hw%tile_w)*4; - U32 h1 = ((hw+1)/tile_w)*4; - U32 w1 = ((hw+1)%tile_w)*4; - U32 h2 = ((hw+2)/tile_w)*4; - U32 w2 = ((hw+2)%tile_w)*4; - U32 h3 = ((hw+3)/tile_w)*4; - U32 w3 = ((hw+3)%tile_w)*4; - for (U32 i = 0; i < 6; i++) { - for (U32 j = 0; j < 6; j++) { - I0[i*6 + j] = inArray_pad_mov + (h0+i)*iw_pad*8 + (w0+j)*8; - I1[i*6 + j] = inArray_pad_mov + (h1+i)*iw_pad*8 + (w1+j)*8; - I2[i*6 + j] = inArray_pad_mov + (h2+i)*iw_pad*8 + (w2+j)*8; - I3[i*6 + j] = inArray_pad_mov + (h3+i)*iw_pad*8 + (w3+j)*8; - } - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw0[i]; - } - trans_I_4x4_3x3(Iw_ptr, I0); - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw1[i]; - } - trans_I_4x4_3x3(Iw_ptr, I1); - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw2[i]; - } - trans_I_4x4_3x3(Iw_ptr, I2); - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw3[i]; - } - trans_I_4x4_3x3(Iw_ptr, I3); - for (U32 i = 0; i < 36; i++) { - F16* itm = itmArray_mov + i*ic*8*4; - - // for (U32 c8 = 0; c8 < 8; c8++) { - // itm[c8*4] = Iw0[i][c8]; - // itm[c8*4 + 1] = Iw1[i][c8]; - // itm[c8*4 + 2] = Iw2[i][c8]; - // itm[c8*4 + 3] = Iw3[i][c8]; - // } - - __asm__ __volatile__( - "ldr q0, [%[in_0]]\n" - "ldr q1, [%[in_1]]\n" - "ldr q2, [%[in_2]]\n" - "ldr q3, [%[in_3]]\n" - "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[itm]]\n" - :[itm]"+r"(itm) - :[in_0]"r"(Iw0[i]), - [in_1]"r"(Iw1[i]), - [in_2]"r"(Iw2[i]), - [in_3]"r"(Iw3[i]) - :"memory", "cc", "v0", "v1", "v2", "v3" - ); - } - } - for (I32 o = 0; o < I32(oc-1); o+=2) { - const F16 *b_0 = biasArray + o*8; - const F16 *b_1 = b_0 + 8; - F16 *itm_0 = itmArray; - // dot prod - // (6*6)*C*c8*hw4 times O*(6*6)*C*c8*o16 = O*(6*6)*hw4*o16 - for (U32 idx = 0; idx < 36; idx++) { - __asm__ __volatile__( - "mov x0, %[ic]\n" //ic_blk - "eor v2.16b, v2.16b, v2.16b\n" //out_o0hw0 - "ldr d0, [%[in]]\n" //in_hw0 - "eor v4.16b, v4.16b, v4.16b\n" //out_o0hw1 - "ldr q18, [%[f]]\n" //f_o0c0 - "eor v6.16b, v6.16b, v6.16b\n" //out_o0hw2 - "eor v8.16b, v8.16b, v8.16b\n" //out_o0hw3 - "ldr q19, [%[f], #16]\n" //f_o1c0 - "eor v3.16b, v3.16b, v3.16b\n" //out_o1hw0 - "eor v5.16b, v5.16b, v5.16b\n" //out_o1hw1 - "eor v7.16b, v7.16b, v7.16b\n" //out_o1hw2 - "eor v9.16b, v9.16b, v9.16b\n" //out_o1hw3 - "0:\n" - "ldr d1, [%[in], #8]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f], #32]\n" //f_o0c0 - "fmla v4.8h, 
v18.8h, v0.h[1]\n" - "fmla v6.8h, v18.8h, v0.h[2]\n" - "fmla v8.8h, v18.8h, v0.h[3]\n" - "ldr q21, [%[f], #48]\n" //f_o1c0 - "fmla v3.8h, v19.8h, v0.h[0]\n" - "fmla v5.8h, v19.8h, v0.h[1]\n" - "fmla v7.8h, v19.8h, v0.h[2]\n" - "subs x0, x0, #2\n" - "fmla v9.8h, v19.8h, v0.h[3]\n" - - "ldr d0, [%[in], #16]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f], #64]\n" //f_o0c0 - "fmla v4.8h, v20.8h, v1.h[1]\n" - "fmla v6.8h, v20.8h, v1.h[2]\n" - "fmla v8.8h, v20.8h, v1.h[3]\n" - "ldr q19, [%[f], #80]\n" //f_o1c0 - "fmla v3.8h, v21.8h, v1.h[0]\n" - "fmla v5.8h, v21.8h, v1.h[1]\n" - "fmla v7.8h, v21.8h, v1.h[2]\n" - "add %[in], %[in], #16\n" - "fmla v9.8h, v21.8h, v1.h[3]\n" - "add %[f], %[f], #64\n" - "bne 0b\n" - "st1 { v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" - "st1 { v6.8h, v7.8h, v8.8h, v9.8h}, [%[out]], #64\n" - :[out]"+r"(otm_0), - [in]"+r"(itm_0), - [f]"+r"(ftm_0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v18", "v19", "v20", "v21", "x0" - ); - } - // out trans - // O*(6*6)*hw4*o16 => NOWHo8 - for (U32 hw4 = 0; hw4 < 4; hw4++) { - U32 h = (hw+hw4) / tile_w; - U32 w = (hw+hw4) % tile_w; - F16 *out_0 = outArray + n*oc*oh*ow*8 + o*oh*ow*8 + h*4*ow*8 + w*4*8; - F16 *out_1 = out_0 + oh*ow*8; - U32 otm_off_0 = o*8*36*4 + hw4*16; - U32 otm_off_1 = otm_off_0 + 8; - - F16 *Ow_0[36]; - F16 *Ow_1[36]; - F16 *O_0[16]; - F16 *O_1[16]; - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + otm_off_0 + idx*4*16; - Ow_1[idx] = otmArray + otm_off_1 + idx*4*16; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - O_1[i*4 + j] = out_1 + i*ow*8 + j*8; - } - } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - } - } - if (oc & 1) { - F16 *itm_0 = itmArray; - const F16 *ftm_0 = filterArray + (oc-1)*8*36*ic*8; - F16 *otm_0 = otmArray + (oc-1)*8*36*4; - const F16 *b_0 = biasArray + (oc-1)*8; - // dot prod - // (6*6)*C*c8*hw4 times O*(6*6)*C*c8*o8 = O*(6*6)*hw4*o8 - for (U32 idx = 0; idx < 36; idx++) { - __asm__ __volatile__( - "mov x0, %[ic]\n" //ic_blk - "eor v2.16b, v2.16b, v2.16b\n" //out_o0hw0 - "ldr d0, [%[in]]\n" //in_hw0 - "eor v3.16b, v3.16b, v3.16b\n" //out_o0hw1 - "ldr q18, [%[f]]\n" //f_o0c0 - "eor v4.16b, v4.16b, v4.16b\n" //out_o0hw2 - "eor v5.16b, v5.16b, v5.16b\n" //out_o0hw3 - "0:\n" - "ldr d1, [%[in], #8]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f], #16]\n" //f_o0c0 - "fmla v3.8h, v18.8h, v0.h[1]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "subs x0, x0, #2\n" - - "ldr d0, [%[in], #16]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f], #32]\n" //f_o0c0 - "fmla v3.8h, v20.8h, v1.h[1]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "fmla v5.8h, v20.8h, v1.h[3]\n" - "add %[in], %[in], #16\n" - "add %[f], %[f], #32\n" - "bne 0b\n" - "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" - :[out]"+r"(otm_0), - [in]"+r"(itm_0), - [f]"+r"(ftm_0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v18", "v20", "x0" - ); - } - // out trans - // O*(6*6)*hw4*o8 => NOWHo8 - for (U32 hw4 = 0; hw4 < 4; hw4++) { - U32 h = (hw+hw4) / tile_w; - U32 w = (hw+hw4) % tile_w; - F16 *out_0 = outArray + n*oc*oh*ow*8 + (oc-1)*oh*ow*8 + h*4*ow*8 + w*4*8; - U32 otm_off_0 = (oc-1)*8*36*4 + hw4*8; - - F16 *Ow_0[36]; - F16 
*O_0[16]; - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + otm_off_0 + idx*4*8; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - } - } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - } - } - } - - // tiles_reminder % 4 - tiles_s = (tiles / 4) * 4; - for (I32 hw = tiles_s; hw < tiles; hw++) { - const F16 *ftm_0 = filterArray; - F16 *otm_0 = otmArray; - // in trans - // NCHWc8 => (6*6)*C*c8*hw1 - for (U32 c = 0; c < ic; c++) { - F16 *inArray_pad_mov = inArray_pad + c*ih_pad*iw_pad*8; - F16 *Iw_ptr[36]; - F16 Iw0[36][8]; - F16 *I0[36]; - F16 *itmArray_mov = itmArray + c*8; - U32 h0 = (hw/tile_w)*4; - U32 w0 = (hw%tile_w)*4; - for (U32 i = 0; i < 6; i++) { - for (U32 j = 0; j < 6; j++) { - I0[i*6 + j] = inArray_pad_mov + (h0+i)*iw_pad*8 + (w0+j)*8; - } - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw0[i]; - } - trans_I_4x4_3x3(Iw_ptr, I0); - for (U32 i = 0; i < 36; i++) { - F16* itm = itmArray_mov + i*ic*8; - - // for (U32 c8 = 0; c8 < 8; c8++) { - // itm[c8] = Iw0[i][c8]; - // } - memcpy(itm, Iw0[i], 8*bytesOf(idt)); - } - } - for (I32 o = 0; o < I32(oc-1); o+=2) { - const F16 *b_0 = biasArray + o*8; - const F16 *b_1 = b_0 + 8; - F16 *itm_0 = itmArray; - // dot prod - // (6*6)*C*c8*hw1 times O*(6*6)*C*c8*o16 = O*(6*6)*hw1*o16 - for (U32 idx = 0; idx < 36; idx++) { - __asm__ __volatile__( - "mov x0, %[ic]\n" //ic_blk - "eor v2.16b, v2.16b, v2.16b\n" //out_o0hw0 - "ldr h0, [%[in]]\n" //in_hw0 - "eor v3.16b, v3.16b, v3.16b\n" //out_o1hw0 - "ldr q18, [%[f]]\n" //f_o0c0 - "ldr q19, [%[f], #16]\n" //f_o1c0 - "0:\n" - "ldr h1, [%[in], #2]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f], #32]\n" //f_o0c0 - "fmla v3.8h, v19.8h, v0.h[0]\n" - "ldr q21, [%[f], #48]\n" //f_o1c0 - "subs x0, x0, #2\n" - - "ldr h0, [%[in], #4]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f], #64]\n" //f_o0c0 - "fmla v3.8h, v21.8h, v1.h[0]\n" - "ldr q19, [%[f], #80]\n" //f_o1c0 - "add %[in], %[in], #4\n" - "add %[f], %[f], #64\n" - "bne 0b\n" - "st1 {v2.8h, v3.8h}, [%[out]], #32\n" - :[out]"+r"(otm_0), - [in]"+r"(itm_0), - [f]"+r"(ftm_0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v2", "v3", "v18", "v19", "v20", "v21", "x0" - ); - } - // out trans - // O*(6*6)*hw1*o16 => NOWHo8 - U32 h = hw / tile_w; - U32 w = hw % tile_w; - F16 *out_0 = outArray + n*oc*oh*ow*8 + o*oh*ow*8 + h*4*ow*8 + w*4*8; - F16 *out_1 = out_0 + oh*ow*8; - U32 otm_off_0 = o*8*36; - U32 otm_off_1 = otm_off_0 + 8; - - F16 *Ow_0[36]; - F16 *Ow_1[36]; - F16 *O_0[16]; - F16 *O_1[16]; - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + otm_off_0 + idx*16; - Ow_1[idx] = otmArray + otm_off_1 + idx*16; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - O_1[i*4 + j] = out_1 + i*ow*8 + j*8; - } - } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - } - if (oc & 1) { - F16 *itm_0 = itmArray; - const F16 *ftm_0 = filterArray + (oc-1)*8*36*ic*8; - F16 *otm_0 = otmArray + (oc-1)*8*36; - const F16 *b_0 = biasArray + (oc-1)*8; - // dot prod - // (6*6)*C*c8*hw1 times O*(6*6)*C*c8*o8 = O*(6*6)*hw1*o8 - for (U32 idx = 0; idx < 36; idx++) { - __asm__ __volatile__( - "mov x0, %[ic]\n" //ic_blk - "eor v2.16b, v2.16b, v2.16b\n" //out_o0hw0 
- "ldr s0, [%[in]]\n" //in_hw0 - "ldr q18, [%[f]]\n" //f_o0c0 - "0:\n" - "ldr h1, [%[in], #2]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f], #16]\n" //f_o0c0 - "subs x0, x0, #2\n" - - "ldr h0, [%[in], #4]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f], #32]\n" //f_o0c0 - "add %[in], %[in], #4\n" - "add %[f], %[f], #32\n" - "bne 0b\n" - "st1 {v2.8h}, [%[out]], #16\n" - :[out]"+r"(otm_0), - [in]"+r"(itm_0), - [f]"+r"(ftm_0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v2", "v18", "v20", "x0" - ); - } - // out trans - // O*(6*6)*hw1*o8 => NOWHo8 - U32 h = hw / tile_w; - U32 w = hw % tile_w; - F16 *out_0 = outArray + n*oc*oh*ow*8 + (oc-1)*oh*ow*8 + h*4*ow*8 + w*4*8; - U32 otm_off_0 = (oc-1)*8*36; - - F16 *Ow_0[36]; - F16 *O_0[16]; - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + otm_off_0 + idx*8; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - } - } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - } - } - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp16/convolution_winograd_transform.h b/tensor_computing/src/cpu/arm/fp16/convolution_winograd_transform.h deleted file mode 100644 index 18893534..00000000 --- a/tensor_computing/src/cpu/arm/fp16/convolution_winograd_transform.h +++ /dev/null @@ -1,503 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#ifndef _H_WINOGRAD_TRANSFORM -#define _H_WINOGRAD_TRANSFORM - -#ifdef _USE_FP16 -#include -#include -#include "cpu/arm/fp16/arm_functions_fp16.h" - -inline void trans_W_4x4_3x3(F16 *Fw[36], F16* const F[9]) { - F16 T[6][3][8]; - - float16x8_t v_01666 = vmovq_n_f16(0.1666666666666667f); - float16x8_t v_minus_01666 = vmovq_n_f16(-0.1666666666666667f); - float16x8_t v_00833 = vmovq_n_f16(0.0833333333333333f); - float16x8_t v_minus_00833 = vmovq_n_f16(-0.0833333333333333f); - float16x8_t v_004166 = vmovq_n_f16(0.0416666666666667f); - float16x8_t v_025 = vmovq_n_f16(0.25f); - - for (U32 i = 0; i < 3; i++) { - float16x8_t v_F0 = vld1q_f16(F[0*3+i]); - float16x8_t v_F1 = vld1q_f16(F[1*3+i]); - float16x8_t v_F2 = vld1q_f16(F[2*3+i]); - - float16x8_t v_t0 = vmulq_f16(v_01666, v_F2); - float16x8_t v_t1 = vsubq_f16(vmulq_f16(v_minus_01666, v_F0), v_t0); - float16x8_t v_t2 = vfmaq_f16(v_t0, v_004166, v_F0); - - float16x8_t v_T0 = vmulq_f16(v_025, v_F0); - float16x8_t v_T1 = vfmaq_f16(v_t1, v_minus_01666, v_F1); - float16x8_t v_T2 = vfmaq_f16(v_t1, v_01666, v_F1); - float16x8_t v_T3 = vfmaq_f16(v_t2, v_00833, v_F1); - float16x8_t v_T4 = vfmaq_f16(v_t2, v_minus_00833, v_F1); - - vst1q_f16(T[0][i], v_T0); - vst1q_f16(T[1][i], v_T1); - vst1q_f16(T[2][i], v_T2); - vst1q_f16(T[3][i], v_T3); - vst1q_f16(T[4][i], v_T4); - vst1q_f16(T[5][i], v_F2); - } - for (U32 i = 0; i < 6; i++) { - float16x8_t v_T0 = vld1q_f16(T[i][0]); - float16x8_t v_T1 = vld1q_f16(T[i][1]); - float16x8_t v_T2 = vld1q_f16(T[i][2]); - - float16x8_t v_t0 = vmulq_f16(v_01666, v_T2); - float16x8_t v_t1 = vsubq_f16(vmulq_f16(v_minus_01666, v_T0), v_t0); - float16x8_t v_t2 = vfmaq_f16(v_t0, v_004166, v_T0); - - float16x8_t v_Fw0 = vmulq_f16(v_025, v_T0); - float16x8_t v_Fw1 = vfmaq_f16(v_t1, v_minus_01666, v_T1); - float16x8_t v_Fw2 = vfmaq_f16(v_t1, v_01666, v_T1); - float16x8_t v_Fw3 = vfmaq_f16(v_t2, v_00833, v_T1); - float16x8_t v_Fw4 = vfmaq_f16(v_t2, v_minus_00833, v_T1); - - vst1q_f16(Fw[i*6+0], v_Fw0); - vst1q_f16(Fw[i*6+1], v_Fw1); - vst1q_f16(Fw[i*6+2], v_Fw2); - vst1q_f16(Fw[i*6+3], v_Fw3); - vst1q_f16(Fw[i*6+4], v_Fw4); - vst1q_f16(Fw[i*6+5], v_T2); - } -} - -inline EE trans_O_4x4_3x3(F16* const Ow[36], F16 *O[16], const F16* bias, - U32 h, U32 w, U32 _pad_h_mod_4, U32 _pad_w_mod_4, U32 oh, U32 ow, ActivationDesc activationDesc) -{ - F16 T[4][6][8]; - // bias - float16x8_t v_b = vld1q_f16(bias); - - float16x8_t v_0 = vmovq_n_f16(0); - float16x8_t v_2 = vmovq_n_f16(2); - float16x8_t v_4 = vmovq_n_f16(4); - float16x8_t v_8 = vmovq_n_f16(8); - - for (U32 i = 0; i < 6; i++) { - float16x8_t v_Ow0 = vld1q_f16(Ow[i]); - float16x8_t v_Ow1 = vld1q_f16(Ow[1*6+i]); - float16x8_t v_Ow2 = vld1q_f16(Ow[2*6+i]); - float16x8_t v_Ow3 = vld1q_f16(Ow[3*6+i]); - float16x8_t v_Ow4 = vld1q_f16(Ow[4*6+i]); - float16x8_t v_Ow5 = vld1q_f16(Ow[5*6+i]); - - float16x8_t v_t0 = vaddq_f16(v_Ow1, v_Ow2); - float16x8_t v_t1 = vaddq_f16(v_Ow3, v_Ow4); - float16x8_t v_t2 = vsubq_f16(v_Ow1, v_Ow2); - float16x8_t v_t3 = vsubq_f16(v_Ow3, v_Ow4); - - float16x8_t v_T0 = vaddq_f16(vaddq_f16(v_t0, v_t1), v_Ow0); - float16x8_t v_T1 = vfmaq_f16(v_t2, v_t3, v_2); - float16x8_t v_T2 = vfmaq_f16(v_t0, v_t1, v_4); - float16x8_t v_T3 = vaddq_f16(vfmaq_f16(v_t2, v_t3, v_8), v_Ow5); - - vst1q_f16(T[0][i], v_T0); - vst1q_f16(T[1][i], v_T1); - vst1q_f16(T[2][i], v_T2); - vst1q_f16(T[3][i], v_T3); - } - - U32 pad_h_mod_4 = 0, pad_w_mod_4 = 0; - if (h == oh && w == ow) { - pad_h_mod_4 = _pad_h_mod_4; - pad_w_mod_4 = _pad_w_mod_4; - } else if (h == oh) { - pad_h_mod_4 = 
_pad_h_mod_4; - } else if (w == ow) { - pad_w_mod_4 = _pad_w_mod_4; - } - - for (U32 i = 0; i < 4 - pad_h_mod_4; i++) { - float16x8_t v_T0 = vld1q_f16(T[i][0]); - float16x8_t v_T1 = vld1q_f16(T[i][1]); - float16x8_t v_T2 = vld1q_f16(T[i][2]); - float16x8_t v_T3 = vld1q_f16(T[i][3]); - float16x8_t v_T4 = vld1q_f16(T[i][4]); - float16x8_t v_T5 = vld1q_f16(T[i][5]); - - float16x8_t v_t0 = vaddq_f16(v_T1, v_T2); - float16x8_t v_t1 = vaddq_f16(v_T3, v_T4); - float16x8_t v_t2 = vsubq_f16(v_T1, v_T2); - float16x8_t v_t3 = vsubq_f16(v_T3, v_T4); - - float16x8_t v_O0 = vaddq_f16(vaddq_f16(v_t0, v_t1), v_T0); - float16x8_t v_O1 = vfmaq_f16(v_t2, v_t3, v_2); - float16x8_t v_O2 = vfmaq_f16(v_t0, v_t1, v_4); - float16x8_t v_O3 = vaddq_f16(vfmaq_f16(v_t2, v_t3, v_8), v_T5); - - switch (activationDesc.mode) { - case ACTIVATION_NULL: { - if (pad_w_mod_4 == 0) { - vst1q_f16(O[i*4+0], vaddq_f16(v_O0, v_b)); - vst1q_f16(O[i*4+1], vaddq_f16(v_O1, v_b)); - vst1q_f16(O[i*4+2], vaddq_f16(v_O2, v_b)); - vst1q_f16(O[i*4+3], vaddq_f16(v_O3, v_b)); - } else if (pad_w_mod_4 == 1) { - vst1q_f16(O[i*4+0], vaddq_f16(v_O0, v_b)); - vst1q_f16(O[i*4+1], vaddq_f16(v_O1, v_b)); - vst1q_f16(O[i*4+2], vaddq_f16(v_O2, v_b)); - } else if (pad_w_mod_4 == 2) { - vst1q_f16(O[i*4+0], vaddq_f16(v_O0, v_b)); - vst1q_f16(O[i*4+1], vaddq_f16(v_O1, v_b)); - } else if (pad_w_mod_4 == 3) { - vst1q_f16(O[i*4+0], vaddq_f16(v_O0, v_b)); - } - break; - } - case ACTIVATION_RELU: { - if (pad_w_mod_4 == 0) { - vst1q_f16(O[i*4+0], vmaxq_f16(vaddq_f16(v_O0, v_b), v_0)); - vst1q_f16(O[i*4+1], vmaxq_f16(vaddq_f16(v_O1, v_b), v_0)); - vst1q_f16(O[i*4+2], vmaxq_f16(vaddq_f16(v_O2, v_b), v_0)); - vst1q_f16(O[i*4+3], vmaxq_f16(vaddq_f16(v_O3, v_b), v_0)); - } else if (pad_w_mod_4 == 1) { - vst1q_f16(O[i*4+0], vmaxq_f16(vaddq_f16(v_O0, v_b), v_0)); - vst1q_f16(O[i*4+1], vmaxq_f16(vaddq_f16(v_O1, v_b), v_0)); - vst1q_f16(O[i*4+2], vmaxq_f16(vaddq_f16(v_O2, v_b), v_0)); - } else if (pad_w_mod_4 == 2) { - vst1q_f16(O[i*4+0], vmaxq_f16(vaddq_f16(v_O0, v_b), v_0)); - vst1q_f16(O[i*4+1], vmaxq_f16(vaddq_f16(v_O1, v_b), v_0)); - } else if (pad_w_mod_4 == 3) { - vst1q_f16(O[i*4+0], vmaxq_f16(vaddq_f16(v_O0, v_b), v_0)); - } - break; - } - case ACTIVATION_SIGMOID: { - if (pad_w_mod_4 == 0) { - vst1q_f16(O[i*4+0], vsigmoidq_f16(vaddq_f16(v_O0, v_b))); - vst1q_f16(O[i*4+1], vsigmoidq_f16(vaddq_f16(v_O1, v_b))); - vst1q_f16(O[i*4+2], vsigmoidq_f16(vaddq_f16(v_O2, v_b))); - vst1q_f16(O[i*4+3], vsigmoidq_f16(vaddq_f16(v_O3, v_b))); - } else if (pad_w_mod_4 == 1) { - vst1q_f16(O[i*4+0], vsigmoidq_f16(vaddq_f16(v_O0, v_b))); - vst1q_f16(O[i*4+1], vsigmoidq_f16(vaddq_f16(v_O1, v_b))); - vst1q_f16(O[i*4+2], vsigmoidq_f16(vaddq_f16(v_O2, v_b))); - } else if (pad_w_mod_4 == 2) { - vst1q_f16(O[i*4+0], vsigmoidq_f16(vaddq_f16(v_O0, v_b))); - vst1q_f16(O[i*4+1], vsigmoidq_f16(vaddq_f16(v_O1, v_b))); - } else if (pad_w_mod_4 == 3) { - vst1q_f16(O[i*4+0], vsigmoidq_f16(vaddq_f16(v_O0, v_b))); - } - break; - } - default: - return NOT_SUPPORTED; - } - } - return SUCCESS; -} - - -inline void trans_I_4x4_3x3(F16 *Iw[36], F16* const I[36]) -{ - F16 T[6][6][8]; - - float16x8_t v_4 = vmovq_n_f16(4); - float16x8_t v_minus_4 = vmovq_n_f16(-4); - float16x8_t v_2 = vmovq_n_f16(2); - float16x8_t v_minus_5 = vmovq_n_f16(-5); - - for (U32 i = 0; i < 6; i++) { - float16x8_t v_I0 = vld1q_f16(I[0*6+i]); - float16x8_t v_I1 = vld1q_f16(I[1*6+i]); - float16x8_t v_I2 = vld1q_f16(I[2*6+i]); - float16x8_t v_I3 = vld1q_f16(I[3*6+i]); - float16x8_t v_I4 = vld1q_f16(I[4*6+i]); - float16x8_t v_I5 = 
vld1q_f16(I[5*6+i]); - - float16x8_t v_t0 = vfmaq_f16(v_I4, v_I2, v_minus_4); - float16x8_t v_t1 = vfmaq_f16(v_I3, v_I1, v_minus_4); - float16x8_t v_t2 = vsubq_f16(v_I4, v_I2); - float16x8_t v_t3 = vmulq_f16(vsubq_f16(v_I3, v_I1), v_2); - float16x8_t v_t4 = vfmaq_f16(v_I4, v_I0, v_4); - float16x8_t v_t5 = vfmaq_f16(v_I5, v_I1, v_4); - - float16x8_t v_T0 = vfmaq_f16(v_t4, v_I2, v_minus_5); - float16x8_t v_T1 = vaddq_f16(v_t1, v_t0); - float16x8_t v_T2 = vsubq_f16(v_t0, v_t1); - float16x8_t v_T3 = vaddq_f16(v_t3, v_t2); - float16x8_t v_T4 = vsubq_f16(v_t2, v_t3); - float16x8_t v_T5 = vfmaq_f16(v_t5, v_I3, v_minus_5); - - vst1q_f16(T[0][i], v_T0); - vst1q_f16(T[1][i], v_T1); - vst1q_f16(T[2][i], v_T2); - vst1q_f16(T[3][i], v_T3); - vst1q_f16(T[4][i], v_T4); - vst1q_f16(T[5][i], v_T5); - } - - for (U32 i = 0; i < 6; i++) { - float16x8_t v_T0 = vld1q_f16(T[i][0]); - float16x8_t v_T1 = vld1q_f16(T[i][1]); - float16x8_t v_T2 = vld1q_f16(T[i][2]); - float16x8_t v_T3 = vld1q_f16(T[i][3]); - float16x8_t v_T4 = vld1q_f16(T[i][4]); - float16x8_t v_T5 = vld1q_f16(T[i][5]); - - float16x8_t v_t0 = vfmaq_f16(v_T4, v_T2, v_minus_4); - float16x8_t v_t1 = vfmaq_f16(v_T3, v_T1, v_minus_4); - float16x8_t v_t2 = vsubq_f16(v_T4, v_T2); - float16x8_t v_t3 = vmulq_f16(vsubq_f16(v_T3, v_T1), v_2); - float16x8_t v_t4 = vfmaq_f16(v_T4, v_T0, v_4); - float16x8_t v_t5 = vfmaq_f16(v_T5, v_T1, v_4); - - float16x8_t v_Iw0 = vfmaq_f16(v_t4, v_T2, v_minus_5); - float16x8_t v_Iw1 = vaddq_f16(v_t1, v_t0); - float16x8_t v_Iw2 = vsubq_f16(v_t0, v_t1); - float16x8_t v_Iw3 = vaddq_f16(v_t3, v_t2); - float16x8_t v_Iw4 = vsubq_f16(v_t2, v_t3); - float16x8_t v_Iw5 = vfmaq_f16(v_t5, v_T3, v_minus_5); - - F16 max = vmaxvq_f16(v_Iw0); - F16 min = vminvq_f16(v_Iw0); - if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { - F16 check[8]; - vst1q_f16(check, v_Iw0); - for (U32 c = 0; c < 8; c++) { - F16 tmp = check[c]; - if (UNI_ISINF(tmp)) { - if (tmp > 0) { - check[c] = 65504; // FMAX for F16 - } else { - check[c] = -65504; - } - } else if (UNI_ISNAN(tmp)) { - tmp = (T[i][0][c] - T[i][2][c]) * 4; - if (UNI_ISINF(tmp)) { - if (tmp > 0) { - tmp = 65504; // FMAX for F16 - } else { - tmp = -65504; - } - } - F16 diff = T[i][4][c] - T[i][2][c]; - tmp += diff; - if (UNI_ISINF(tmp)) { - if (diff > 0) { - tmp = 65504; - } else { - tmp = -65504; - } - } - check[c] = tmp; - } - } - memcpy(Iw[i*6+0], check, 8*bytesOf(DT_F16)); - } else { - vst1q_f16(Iw[i*6+0], v_Iw0); - } - - max = vmaxvq_f16(v_Iw1); - min = vminvq_f16(v_Iw1); - if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { - F16 check[8]; - vst1q_f16(check, v_Iw1); - for (U32 c = 0; c < 8; c++) { - F16 tmp = check[c]; - if (UNI_ISINF(tmp)) { - if (tmp > 0) { - check[c] = 65504; // FMAX for F16 - } else { - check[c] = -65504; - } - } else if (UNI_ISNAN(tmp)) { - tmp = (T[i][1][c] + T[i][2][c]) * -4; - if (UNI_ISINF(tmp)) { - if (tmp > 0) { - tmp = 65504; // FMAX for F16 - } else { - tmp = -65504; - } - } - F16 sum = T[i][3][c] + T[i][4][c]; - tmp += sum; - if (UNI_ISINF(tmp)) { - if (sum > 0) { - tmp = 65504; - } else { - tmp = -65504; - } - } - check[c] = tmp; - } - } - memcpy(Iw[i*6+1], check, 8*bytesOf(DT_F16)); - } else { - vst1q_f16(Iw[i*6+1], v_Iw1); - } - - max = vmaxvq_f16(v_Iw2); - min = vminvq_f16(v_Iw2); - if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { - F16 check[8]; - vst1q_f16(check, v_Iw2); - for (U32 c = 0; c < 8; c++) { - F16 tmp = check[c]; - if (UNI_ISINF(tmp)) { - if (tmp > 0) { - check[c] = 65504; 
// FMAX for F16 - } else { - check[c] = -65504; - } - } else if (UNI_ISNAN(tmp)) { - tmp = (T[i][1][c] - T[i][2][c]) * 4; - if (UNI_ISINF(tmp)) { - if (tmp > 0) { - tmp = 65504; // FMAX for F16 - } else { - tmp = -65504; - } - } - F16 diff = T[i][4][c] - T[i][3][c]; - tmp += diff; - if (UNI_ISINF(tmp)) { - if (diff > 0) { - tmp = 65504; - } else { - tmp = -65504; - } - } - check[c] = tmp; - } - } - memcpy(Iw[i*6+2], check, 8*bytesOf(DT_F16)); - } else { - vst1q_f16(Iw[i*6+2], v_Iw2); - } - - max = vmaxvq_f16(v_Iw3); - min = vminvq_f16(v_Iw3); - if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { - F16 check[8]; - vst1q_f16(check, v_Iw3); - for (U32 c = 0; c < 8; c++) { - F16 tmp = check[c]; - if (UNI_ISINF(tmp)) { - if (tmp > 0) { - check[c] = 65504; // FMAX for F16 - } else { - check[c] = -65504; - } - } else if (UNI_ISNAN(tmp)) { - tmp = (T[i][3][c] - T[i][1][c]) * 2; - if (UNI_ISINF(tmp)) { - if (tmp > 0) { - tmp = 65504; // FMAX for F16 - } else { - tmp = -65504; - } - } - F16 diff = T[i][4][c] - T[i][2][c]; - tmp += diff; - if (UNI_ISINF(tmp)) { - if (diff > 0) { - tmp = 65504; - } else { - tmp = -65504; - } - } - check[c] = tmp; - } - } - memcpy(Iw[i*6+3], check, 8*bytesOf(DT_F16)); - } else { - vst1q_f16(Iw[i*6+3], v_Iw3); - } - - max = vmaxvq_f16(v_Iw4); - min = vminvq_f16(v_Iw4); - if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { - F16 check[8]; - vst1q_f16(check, v_Iw4); - for (U32 c = 0; c < 8; c++) { - F16 tmp = check[c]; - if (UNI_ISINF(tmp)) { - if (tmp > 0) { - check[c] = 65504; // FMAX for F16 - } else { - check[c] = -65504; - } - } else if (UNI_ISNAN(tmp)) { - tmp = (T[i][1][c] - T[i][3][c]) * 2; - if (UNI_ISINF(tmp)) { - if (tmp > 0) { - tmp = 65504; // FMAX for F16 - } else { - tmp = -65504; - } - } - F16 diff = T[i][4][c] - T[i][2][c]; - tmp += diff; - if (UNI_ISINF(tmp)) { - if (diff > 0) { - tmp = 65504; - } else { - tmp = -65504; - } - } - check[c] = tmp; - } - } - memcpy(Iw[i*6+4], check, 8*bytesOf(DT_F16)); - } else { - vst1q_f16(Iw[i*6+4], v_Iw4); - } - - max = vmaxvq_f16(v_Iw5); - min = vminvq_f16(v_Iw5); - if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { - F16 check[8]; - vst1q_f16(check, v_Iw5); - for (U32 c = 0; c < 8; c++) { - F16 tmp = check[c]; - if (UNI_ISINF(tmp)) { - if (tmp > 0) { - check[c] = 65504; // FMAX for F16 - } else { - check[c] = -65504; - } - } else if (UNI_ISNAN(tmp)) { - tmp = (T[i][1][c] - T[i][3][c]) * 4; - if (UNI_ISINF(tmp)) { - if (tmp > 0) { - tmp = 65504; // FMAX for F16 - } else { - tmp = -65504; - } - } - F16 diff = T[i][5][c] - T[i][3][c]; - tmp += diff; - if (UNI_ISINF(tmp)) { - if (diff > 0) { - tmp = 65504; - } else { - tmp = -65504; - } - } - check[c] = tmp; - } - } - memcpy(Iw[i*6+5], check, 8*bytesOf(DT_F16)); - } else { - vst1q_f16(Iw[i*6+5], v_Iw5); - } - } -} -#endif -#endif diff --git a/tensor_computing/src/cpu/arm/fp16/deconvolution.cpp b/tensor_computing/src/cpu/arm/fp16/deconvolution.cpp deleted file mode 100644 index 77795d96..00000000 --- a/tensor_computing/src/cpu/arm/fp16/deconvolution.cpp +++ /dev/null @@ -1,218 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#include "cpu/arm/fp16/convolution_winograd.h" -#include "cpu/arm/fp16/convolution_gemm.h" -#include - -EE deconvolution_infer_forward_tmp_bytes_fp16(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes) -{ - if (nullptr == bytes) { - CHECK_STATUS(NULL_POINTER); - } - - DataType idt, fdt; - DataFormat idf, fdf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - U32 tPadding = fh - 1 - paddingT; - U32 bPadding = fh - 1 - paddingB; - U32 lPadding = fw - 1 - paddingL; - U32 rPadding = fw - 1 - paddingR; - - ConvolutionDesc transposedCD; - transposedCD.stride_h = 1; - transposedCD.stride_w = 1; - transposedCD.padding_top = 0; - transposedCD.padding_bottom = 0; - transposedCD.padding_left = 0; - transposedCD.padding_right = 0; - transposedCD.dilatedRate_h = 1; - transposedCD.dilatedRate_w = 1; - - if (CONVOLUTION_ALGORITHM_WINOGRAD == algorithm) { - // If algorithm is not Winograd, leave out padding of length 1 - tPadding--; - bPadding--; - lPadding--; - rPadding--; - transposedCD.padding_top += 1; - transposedCD.padding_bottom += 1; - transposedCD.padding_left += 1; - transposedCD.padding_right += 1; - } - - ih = ih + (ih - 1) * (strideH - 1) + tPadding + bPadding; - iw = iw + (iw - 1) * (strideW - 1) + lPadding + rPadding; - TensorDesc inPaddedDesc = tensor4df(idt, idf, in, ic, ih, iw); - if (DF_NCHW == filterDesc.df) { - // Swap fn and fc - filterDesc.dims[2] = filterDesc.dims[3]; - filterDesc.dims[3] = ic; - } - EE ret = convolution_infer_forward_tmp_bytes_fp16(inPaddedDesc, filterDesc, outputDesc, transposedCD, algorithm, bytes); - *bytes += tensorNumBytes(inPaddedDesc); // for pre-convolution padding - return ret; -} - -EE deconvolution_fp16(TensorDesc inputDesc, F16* input, - TensorDesc filterDesc, const F16* filter, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, - TensorDesc biasDesc, const F16* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* output, - ActivationDesc activationDesc, - Arch arch) -{ - 
if (nullptr == input || nullptr == filter || nullptr == output || nullptr == bias || nullptr == tmp) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - - if (!(idt == DT_F16 && fdt == DT_F16 && odt == DT_F16)){ - CHECK_STATUS(NOT_MATCH); - } - if (!(idf == DF_NCHWC8 && odf == DF_NCHWC8)) { - CHECK_STATUS(NOT_MATCH); - } - - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - ConvolutionDesc transposedCD; - transposedCD.stride_h = 1; - transposedCD.stride_w = 1; - transposedCD.padding_top = 0; - transposedCD.padding_bottom = 0; - transposedCD.padding_left = 0; - transposedCD.padding_right = 0; - transposedCD.dilatedRate_h = 1; - transposedCD.dilatedRate_w = 1; - - U32 tPadding = fh - 1 - paddingT; - U32 bPadding = fh - 1 - paddingB; - U32 lPadding = fw - 1 - paddingL; - U32 rPadding = fw - 1 - paddingR; - - if (CONVOLUTION_ALGORITHM_WINOGRAD == algorithm) { - // If algorithm is not Winograd, leave out padding of length 1 - tPadding--; - bPadding--; - lPadding--; - rPadding--; - transposedCD.padding_top += 1; - transposedCD.padding_bottom += 1; - transposedCD.padding_left += 1; - transposedCD.padding_right += 1; - } - - U32 stuffH = strideH - 1; - U32 stuffW = strideW - 1; - U32 ihPadded = ih + (ih - 1) * stuffH + tPadding + bPadding; - U32 iwPadded = iw + (iw - 1) * stuffW + lPadding + rPadding; - TensorDesc inPaddedDesc = tensor4df(idt, idf, in, ic, ihPadded, iwPadded); - - F16 *inPad = (F16*)tmp; - F16 *inPadMov = inPad; - F16 *inputMov = input; - - ic /= 8; - - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < tPadding; h++) { - memset(inPadMov, 0, iwPadded*8*bytesOf(idt)); - inPadMov += iwPadded*8; - } - for (U32 h = 0; h < ih - 1; h++) { - memset(inPadMov, 0, lPadding*8*bytesOf(idt)); - inPadMov += lPadding*8; - for (U32 w = 0; w < iw - 1; w++) { - memcpy(inPadMov, inputMov, 8*bytesOf(idt)); - inPadMov += 8; - inputMov += 8; - memset(inPadMov, 0, stuffW*8*bytesOf(idt)); - inPadMov += stuffW * 8; - } - memcpy(inPadMov, inputMov, 8*bytesOf(idt)); - inPadMov += 8; - inputMov += 8; - memset(inPadMov, 0, rPadding*8*bytesOf(idt)); - inPadMov += rPadding*8; - - // stuffH - memset(inPadMov, 0, iwPadded*stuffH*8*bytesOf(idt)); - inPadMov += iwPadded*stuffH*8; - } - memset(inPadMov, 0, lPadding*8*bytesOf(idt)); - inPadMov += lPadding*8; - for (U32 w = 0; w < iw - 1; w++) { - memcpy(inPadMov, inputMov, 8*bytesOf(idt)); - inPadMov += 8; - inputMov += 8; - memset(inPadMov, 0, stuffW*8*bytesOf(idt)); - inPadMov += stuffW * 8; - } - memcpy(inPadMov, inputMov, 8*bytesOf(idt)); - inPadMov += 8; - inputMov += 8; - memset(inPadMov, 0, rPadding*8*bytesOf(idt)); - inPadMov += rPadding*8; - - for (U32 h = ihPadded - bPadding; h < ihPadded; h++) { - memset(inPadMov, 0, iwPadded*8*bytesOf(idt)); - inPadMov += iwPadded*8; - } - } - - EE ret = SUCCESS; - switch (algorithm) { - case CONVOLUTION_ALGORITHM_GEMM: - ret = convolution_gemm(inPaddedDesc, inPad, filterDesc, filter, transposedCD, - biasDesc, bias, tmpBytes - tensorNumBytes(inPaddedDesc), - inPad + tensorNumElements(inPaddedDesc), outputDesc, output, 
activationDesc, arch); - break; - case CONVOLUTION_ALGORITHM_WINOGRAD: - ret = convolution_winograd(inPaddedDesc, inPad, filterDesc, filter, transposedCD, - biasDesc, bias, tmpBytes - tensorNumBytes(inPaddedDesc), - inPad + tensorNumElements(inPaddedDesc), outputDesc, output, activationDesc, arch); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp16/deconvolution_transform.cpp b/tensor_computing/src/cpu/arm/fp16/deconvolution_transform.cpp deleted file mode 100644 index a0b6bd47..00000000 --- a/tensor_computing/src/cpu/arm/fp16/deconvolution_transform.cpp +++ /dev/null @@ -1,157 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
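The deconvolution path deleted above never implements a transposed convolution directly: it zero-stuffs the input (stride - 1 zeros between pixels), flips the border padding to fh-1-pad / fw-1-pad, and then runs an ordinary stride-1 convolution. For the Winograd algorithm, one unit of that border padding is moved out of the pre-padded input and into the stride-1 convolution descriptor instead. A minimal sketch of the dimension arithmetic, assuming the same variable meanings as the deleted code (the function name is illustrative):

struct PaddedDims { unsigned h, w; };

// Padded-input dimensions for deconvolution rewritten as stride-1 convolution
// (generic/GEMM case; the Winograd case decrements each border pad by 1 and
// adds that 1 to the stride-1 convolution's own padding instead).
PaddedDims deconv_padded_input(unsigned ih, unsigned iw, unsigned fh, unsigned fw,
    unsigned strideH, unsigned strideW,
    unsigned padT, unsigned padB, unsigned padL, unsigned padR)
{
    unsigned tPad = fh - 1 - padT, bPad = fh - 1 - padB;  // border re-padding
    unsigned lPad = fw - 1 - padL, rPad = fw - 1 - padR;
    unsigned stuffH = strideH - 1, stuffW = strideW - 1;  // zeros between pixels
    return { ih + (ih - 1) * stuffH + tPad + bPad,
             iw + (iw - 1) * stuffW + lPad + rPad };
}

This is also why deconvolution_infer_forward_tmp_bytes_fp16 adds tensorNumBytes(inPaddedDesc) on top of the convolution's own scratch requirement: the stuffed-and-padded input has to be materialized before the convolution runs.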
- - -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#include "cpu/arm/fp16/convolution_winograd_transform.h" - -inline EE deconvolution_transform_filter_kernel_fp16(TensorDesc filterDesc, const F16* filterArray, - TensorDesc *ftmDesc, F16* ftmArray, - DataFormat ftmDataFormat) -{ - // Procedure should be the same, but fhfw is reversed - if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) { - CHECK_STATUS(NULL_POINTER); - } - DataType fdt; - DataFormat fdf; - U32 fn, fc, fh, fw; - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - if (fdf == ftmDataFormat) { - *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, fn*fc*fh*fw*bytesOf(fdt)); - return SUCCESS; - } - if (fdf != DF_NCHW) { - CHECK_STATUS(NOT_SUPPORTED); - } - EE ret = SUCCESS; - switch (ftmDataFormat) { - case DF_NHWCN16: { - /* - * CNHW => NHWCN16 - * if there is remainder, it should be NHWCN8 - */ - U32 oc = fc / 16; - U32 hwMax = fh * fw - 1; - for (U32 o = 0; o < oc; o++) { - for (U32 hw = 0; hw < fh*fw; hw++) { - for (U32 c = 0; c < fn; c++) { - for (U32 o16 = 0; o16 < 16; o16++) { - ftmArray[o*fh*fw*fn*16 + hw*fn*16 + c*16 + o16] = filterArray[c*fc*fh*fw + (o*16+o16)*fh*fw + hwMax - hw]; - } - } - } - } - if (fc != oc * 16) { - for (U32 hw = 0; hw < fh*fw; hw++) { - for (U32 c = 0; c < fn; c++) { - for (U32 o8 = 0; o8 < 8; o8++) { - ftmArray[(oc*16)*fh*fw*fn + hw*fn*8 + c*8 + o8] = filterArray[c*fc*fh*fw + (oc*16+o8)*fh*fw + hwMax - hw]; - } - } - } - } - *ftmDesc = tensor4df(fdt, ftmDataFormat, fc, fn, fh, fw); - break; - } - case DF_HWNCN16: { - /* - * CNHW => NHWCN16 + NHWCN8 if there is remainder divided by 16 - */ - const U32 hwMax = 8; - - for (U32 o = 0; o < fc/16; o++) { - for (U32 c = 0; c < fn; c++) { - U32 f_off_0 = c*fc*fh*fw + (o*16)*fh*fw; - U32 f_off_1 = c*fc*fh*fw + (o*16+8)*fh*fw; - U32 ftm_off_0 = o*36*fn*16 + c*16; - U32 ftm_off_1 = o*36*fn*16 + c*16 + 8; - F16 F[9][8]; - F16 *F_ptr[9]; - F16 *Fw[36]; - - for (U32 hw = 0; hw < 9; hw++) { - for (U32 oo = 0; oo < 8; oo++) { - F[hw][oo] = filterArray[f_off_0 + hwMax - hw + oo*fh*fw]; - } - F_ptr[hw] = F[hw]; - } - for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_0 + hw*fn*16; - } - trans_W_4x4_3x3(Fw, F_ptr); - for (U32 hw = 0; hw < 9; hw++) { - for (U32 oo = 0; oo < 8; oo++) { - F[hw][oo] = filterArray[f_off_1 + hwMax - hw + oo*fh*fw]; - } - F_ptr[hw] = F[hw]; - } - for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_1 + hw*fn*16; - } - trans_W_4x4_3x3(Fw, F_ptr); - } - } - U32 oc = (fc / 16) * 16; - if (oc != fc) { - for (U32 c = 0; c < fn; c++) { - U32 f_off_0 = c*fc*fh*fw + oc*fh*fw; - U32 ftm_off_0 = oc*36*fn + c*8; - F16 F[9][8]; - F16 *F_ptr[9]; - F16 *Fw[36]; - for (U32 hw = 0; hw < 9; hw++) { - for (U32 oo = 0; oo < 8; oo++) { - F[hw][oo] = filterArray[f_off_0 + hwMax - hw + oo*fh*fw]; - } - F_ptr[hw] = F[hw]; - } - for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_0 + hw*fn*8; - } - trans_W_4x4_3x3(Fw, F_ptr); - } - } - *ftmDesc = tensor4df(fdt, ftmDataFormat, fc, fn, 6, 6); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE deconvolution_transform_filter_fp16(TensorDesc filterDesc, const F16* filter, - ConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, F16* filterTransformed) -{ - DataFormat ftmDataFormat; - switch (algorithm) { - case CONVOLUTION_ALGORITHM_WINOGRAD: - ftmDataFormat = DF_HWNCN16; - break; - case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: - ftmDataFormat = DF_NHWCN16; - break; - case CONVOLUTION_ALGORITHM_GEMM: 
- ftmDataFormat = DF_NHWCN16; - break; - default: - return NOT_MATCH; - } - EE ret = deconvolution_transform_filter_kernel_fp16(filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat); - CHECK_STATUS(ret); - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp16/depthwise_convolution.cpp b/tensor_computing/src/cpu/arm/fp16/depthwise_convolution.cpp deleted file mode 100644 index 946f584e..00000000 --- a/tensor_computing/src/cpu/arm/fp16/depthwise_convolution.cpp +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#include "cpu/arm/fp16/depthwise_convolution_direct.h" -#include "cpu/arm/fp16/depthwise_pointwise_convolution_direct.h" -#include "cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h" -#include "cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h" - -EE depthwise_convolution_fp16(TensorDesc inputDesc, F16* input, - TensorDesc filterDesc, const F16* filter, - ConvolutionDesc convDesc, DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc biasDesc, const F16* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* output, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc, - Arch arch) -{ - if(nullptr == input || nullptr == filter || nullptr == output || nullptr == bias || nullptr == tmp) - CHECK_STATUS(NULL_POINTER); - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - - if (!(idt == DT_F16 && fdt == DT_F16 && odt == DT_F16)) - CHECK_STATUS(NOT_MATCH); - if (!(idf == DF_NCHWC8 && odf == DF_NCHWC8)) - CHECK_STATUS(NOT_MATCH); - if (!(ic == fc && oc == fn)) - CHECK_STATUS(NOT_MATCH); - - EE ret = SUCCESS; - switch (algorithm) { - case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: - ret = depthwise_convolution_direct(inputDesc, input, - filterDesc, filter, - convDesc, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - depthwiseActivationDesc, - arch); - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: - ret = depthwise_pointwise_convolution_direct(inputDesc, input, - filterDesc, filter, - convDesc, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - 
depthwiseActivationDesc, - pointwiseActivationDesc, - arch); - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT_NO_PADDING: - ret = depthwise_pointwise_convolution_direct_no_padding(inputDesc, input, - filterDesc, filter, - convDesc, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - depthwiseActivationDesc, - pointwiseActivationDesc, - arch); - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_3X3S1P1: - ret = depthwise_pointwise_convolution_3x3s1p1(inputDesc, input, - filterDesc, filter, - convDesc, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - depthwiseActivationDesc, - pointwiseActivationDesc, - arch); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp16/depthwise_convolution_direct.h b/tensor_computing/src/cpu/arm/fp16/depthwise_convolution_direct.h deleted file mode 100644 index f30e7e3c..00000000 --- a/tensor_computing/src/cpu/arm/fp16/depthwise_convolution_direct.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
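As a reference for what the NEON kernels in the files that follow compute: a depthwise convolution applies one fh x fw filter per channel, and in the NCHWc8 layout used throughout, each channel step processes 8 packed channels at once (one fp16 vector lane per channel). A naive scalar sketch, assuming the input is already zero-padded and ignoring dilation (all names here are illustrative, not from the source):

#include <cstddef>
using F16 = float;  // stand-in for the engine's 16-bit float type in this sketch

// Reference NCHWc8 depthwise convolution for one image.
// in:   ic8 x ih x iw x 8 (already padded, so h*sh + fh <= ih must hold)
// filt: ic8 x fh x fw x 8, bias: ic8 x 8, out: ic8 x oh x ow x 8
void depthwise_ref(const F16 *in, const F16 *filt, const F16 *bias, F16 *out,
    size_t ic8, size_t ih, size_t iw, size_t fh, size_t fw,
    size_t oh, size_t ow, size_t sh, size_t sw)
{
    for (size_t c = 0; c < ic8; c++) {                // blocks of 8 channels
        for (size_t h = 0; h < oh; h++) {
            for (size_t w = 0; w < ow; w++) {
                for (size_t k = 0; k < 8; k++) {      // lanes within the c8 block
                    F16 acc = bias[c * 8 + k];
                    for (size_t i = 0; i < fh; i++) {
                        for (size_t j = 0; j < fw; j++) {
                            acc += in[((c * ih + h * sh + i) * iw + (w * sw + j)) * 8 + k]
                                 * filt[((c * fh + i) * fw + j) * 8 + k];
                        }
                    }
                    out[((c * oh + h) * ow + w) * 8 + k] = acc;
                }
            }
        }
    }
}

The assembly versions unroll the inner lane loop into a single fmla per filter tap and keep 8 (or 4, or 1) output pixels live in registers at once, mirroring the hw8/hw4/hw1 tiling of the Winograd kernel.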
- - -#ifndef _H_DEPTHWISE_CONVOLUTION_DIRECT -#define _H_DEPTHWISE_CONVOLUTION_DIRECT - -#include - -#include "sys.h" -#include "type.h" -#include "error.h" -#include "tensor_desc.h" -#include "tensor_computing_type.h" - -EE depthwise_convolution_direct_A55(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc); - -EE depthwise_convolution_direct_A76(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc); - -inline EE depthwise_convolution_direct(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - Arch arch) -{ - EE ret = SUCCESS; - switch (arch) { - case ARM_A55: - ret = depthwise_convolution_direct_A55(inputDesc, inArray, - filterDesc, filterArray, - convDesc, - biasDesc, biasArray, - tmpBytes, tmp, - outputDesc, outArray, - depthwiseActivationDesc); - break; - case ARM_A76: - ret = depthwise_convolution_direct_A76(inputDesc, inArray, - filterDesc, filterArray, - convDesc, - biasDesc, biasArray, - tmpBytes, tmp, - outputDesc, outArray, - depthwiseActivationDesc); - break; - default: - return NOT_SUPPORTED; - } - return ret; -} -#endif diff --git a/tensor_computing/src/cpu/arm/fp16/depthwise_convolution_direct_A55.cpp b/tensor_computing/src/cpu/arm/fp16/depthwise_convolution_direct_A55.cpp deleted file mode 100644 index cbe53304..00000000 --- a/tensor_computing/src/cpu/arm/fp16/depthwise_convolution_direct_A55.cpp +++ /dev/null @@ -1,500 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
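The A55 kernel that follows fuses the activation into the register tile; the NEON constants built with movi (#0x42, lsl #8 is the fp16 bit pattern for 3.0, #0x46, lsl #8 for 6.0) implement the ReLU6-based h-swish form. Scalar equivalents of the three fused activations, shown here only as a reading aid:

#include <algorithm>

inline float relu(float x)    { return std::max(x, 0.0f); }
inline float relu6(float x)   { return std::min(std::max(x, 0.0f), 6.0f); }
inline float h_swish(float x) { return x * relu6(x + 3.0f) / 6.0f; }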
- - -#include "cpu/arm/fp16/depthwise_convolution_direct.h" - -EE depthwise_convolution_direct_A55(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (fdf != DF_NCHWC8) - CHECK_STATUS(NOT_MATCH); - - oc /= 8; - ic /= 8; - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - U32 ihiw = ih*iw; - I32 ohow = oh*ow; - - for (U32 n = 0; n < in; n++) { - F16 *inArray_pad = (F16*)tmp; - F16 *inArray_pad_mov = inArray_pad; - F16 *inArray_mov = inArray + n*ic*ihiw*8; - for (U32 c = 0; c < ic; c++) { - // copy input into a input with padding - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*sizeof(F16)); - inArray_pad_mov += iw_pad*8; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*8*sizeof(F16)); - inArray_pad_mov += paddingL*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*sizeof(F16)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, paddingR*8*sizeof(F16)); - inArray_pad_mov += paddingR*8; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*sizeof(F16)); - inArray_pad_mov += iw_pad*8; - } - - const F16 *b = biasArray + c*8; - F16 *in_pad = inArray_pad + c*ih_pad*iw_pad*8; - const F16 *f = filterArray + c*fh*fw*8; - // ohow / 8 - for (I32 hw = 0; hw < ohow-7; hw+=8) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - U32 in_h_1 = (hw+1)/ow*strideH; - U32 in_w_1 = (hw+1)%ow*strideW; - U32 in_h_2 = (hw+2)/ow*strideH; - U32 in_w_2 = (hw+2)%ow*strideW; - U32 in_h_3 = (hw+3)/ow*strideH; - U32 in_w_3 = (hw+3)%ow*strideW; - U32 in_h_4 = (hw+4)/ow*strideH; - U32 in_w_4 = (hw+4)%ow*strideW; - U32 in_h_5 = (hw+5)/ow*strideH; - U32 in_w_5 = (hw+5)%ow*strideW; - U32 in_h_6 = (hw+6)/ow*strideH; - U32 in_w_6 = (hw+6)%ow*strideW; - U32 in_h_7 = (hw+7)/ow*strideH; - U32 in_w_7 = (hw+7)%ow*strideW; - F16 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; - //TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. 
- __asm__ __volatile__( - "ldr q8, [%[b]]\n" - "mov v0.16b, v8.16b\n" - "mov v1.16b, v8.16b\n" - "mov v2.16b, v8.16b\n" - "mov v3.16b, v8.16b\n" - "mov v4.16b, v8.16b\n" - "mov v5.16b, v8.16b\n" - "mov v6.16b, v8.16b\n" - "mov v7.16b, v8.16b\n" - : - :[b]"r"(b) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F16 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F16 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F16 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - F16 *in_1 = in_idx + in_h_1*iw_pad*8 + in_w_1*8; - F16 *in_2 = in_idx + in_h_2*iw_pad*8 + in_w_2*8; - F16 *in_3 = in_idx + in_h_3*iw_pad*8 + in_w_3*8; - F16 *in_4 = in_idx + in_h_4*iw_pad*8 + in_w_4*8; - F16 *in_5 = in_idx + in_h_5*iw_pad*8 + in_w_5*8; - F16 *in_6 = in_idx + in_h_6*iw_pad*8 + in_w_6*8; - F16 *in_7 = in_idx + in_h_7*iw_pad*8 + in_w_7*8; - __asm__ __volatile__( - "ldr q17, [%[f0]]\n" - "ldr q9, [%[in0]]\n" - "ldr q10, [%[in1]]\n" - "ldr q11, [%[in2]]\n" - "ldr q12, [%[in3]]\n" - "ldr q13, [%[in4]]\n" - "ldr q14, [%[in5]]\n" - "ldr q15, [%[in6]]\n" - "ldr q16, [%[in7]]\n" - "fmla v0.8h, v9.8h, v17.8h\n" - "fmla v1.8h, v10.8h, v17.8h\n" - "fmla v2.8h, v11.8h, v17.8h\n" - "fmla v3.8h, v12.8h, v17.8h\n" - "fmla v4.8h, v13.8h, v17.8h\n" - "fmla v5.8h, v14.8h, v17.8h\n" - "fmla v6.8h, v15.8h, v17.8h\n" - "fmla v7.8h, v16.8h, v17.8h\n" - : - :[in0]"r"(in_0), - [in1]"r"(in_1), - [in2]"r"(in_2), - [in3]"r"(in_3), - [in4]"r"(in_4), - [in5]"r"(in_5), - [in6]"r"(in_6), - [in7]"r"(in_7), - [f0]"r"(f_0) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v0.8h, v0.8h, v31.8h\n" - "fmax v1.8h, v1.8h, v31.8h\n" - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - "fmax v6.8h, v6.8h, v31.8h\n" - "fmax v7.8h, v7.8h, v31.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v31" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v0.8h, v0.8h, v31.8h\n" - "fmax v1.8h, v1.8h, v31.8h\n" - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - "fmax v6.8h, v6.8h, v31.8h\n" - "fmax v7.8h, v7.8h, v31.8h\n" - "fmin v0.8h, v0.8h, v30.8h\n" - "fmin v1.8h, v1.8h, v30.8h\n" - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - "fmin v6.8h, v6.8h, v30.8h\n" - "fmin v7.8h, v7.8h, v30.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v30", "v31" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "movi v29.8h, #0x42, lsl #8\n" // three - "movi v30.8h, #0x46, lsl #8\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v21.8h, v0.8h, v29.8h\n" - "fadd v22.8h, v1.8h, v29.8h\n" - "fadd v23.8h, v2.8h, v29.8h\n" - "fadd v24.8h, v3.8h, v29.8h\n" - "fadd v25.8h, v4.8h, v29.8h\n" - "fadd v26.8h, v5.8h, v29.8h\n" - "fadd v27.8h, v6.8h, v29.8h\n" - "fadd v28.8h, v7.8h, v29.8h\n" - "fmax v21.8h, v21.8h, v31.8h\n" - "fmax v22.8h, 
v22.8h, v31.8h\n" - "fmax v23.8h, v23.8h, v31.8h\n" - "fmax v24.8h, v24.8h, v31.8h\n" - "fmax v25.8h, v25.8h, v31.8h\n" - "fmax v26.8h, v26.8h, v31.8h\n" - "fmax v27.8h, v27.8h, v31.8h\n" - "fmax v28.8h, v28.8h, v31.8h\n" - "fmin v21.8h, v21.8h, v30.8h\n" - "fmin v22.8h, v22.8h, v30.8h\n" - "fmin v23.8h, v23.8h, v30.8h\n" - "fmin v24.8h, v24.8h, v30.8h\n" - "fmin v25.8h, v25.8h, v30.8h\n" - "fmin v26.8h, v26.8h, v30.8h\n" - "fmin v27.8h, v27.8h, v30.8h\n" - "fmin v28.8h, v28.8h, v30.8h\n" - "fdiv v21.8h, v21.8h, v30.8h\n" - "fdiv v22.8h, v22.8h, v30.8h\n" - "fdiv v23.8h, v23.8h, v30.8h\n" - "fdiv v24.8h, v24.8h, v30.8h\n" - "fdiv v25.8h, v25.8h, v30.8h\n" - "fdiv v26.8h, v26.8h, v30.8h\n" - "fdiv v27.8h, v27.8h, v30.8h\n" - "fdiv v28.8h, v28.8h, v30.8h\n" - "fmul v0.8h, v0.8h, v21.8h\n" - "fmul v1.8h, v1.8h, v22.8h\n" - "fmul v2.8h, v2.8h, v23.8h\n" - "fmul v3.8h, v3.8h, v24.8h\n" - "fmul v4.8h, v4.8h, v25.8h\n" - "fmul v5.8h, v5.8h, v26.8h\n" - "fmul v6.8h, v6.8h, v27.8h\n" - "fmul v7.8h, v7.8h, v28.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "str q0, [%[out]]\n" - "str q1, [%[out], #16]\n" - "str q2, [%[out], #32]\n" - "str q3, [%[out], #48]\n" - "str q4, [%[out], #64]\n" - "str q5, [%[out], #80]\n" - "str q6, [%[out], #96]\n" - "str q7, [%[out], #112]\n" - :[out]"+r"(out_ptr) - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19" - ); - } - - // ohow_reminder % 8 / 4 - U32 ohow_s = (ohow / 8) * 8; - for (I32 hw = ohow_s; hw < ohow-3; hw+=4) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - U32 in_h_1 = (hw+1)/ow*strideH; - U32 in_w_1 = (hw+1)%ow*strideW; - U32 in_h_2 = (hw+2)/ow*strideH; - U32 in_w_2 = (hw+2)%ow*strideW; - U32 in_h_3 = (hw+3)/ow*strideH; - U32 in_w_3 = (hw+3)%ow*strideW; - F16 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; - //TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. 
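// Positions left over from the 8-wide tile are handled four at a time here (same
// pattern, accumulators v0-v3 only), then singly below. The activation constants
// in these blocks are FP16 immediates built with movi ... lsl #8: 0x4600 is 6.0h
// and 0x4200 is 3.0h, so h-swish is evaluated exactly as written in the asm,
// x * min(max(x + 3, 0), 6) / 6, via fadd/fmax/fmin/fdiv/fmul.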
- __asm__ __volatile__( - "ldr q8, [%[b]]\n" - "mov v0.16b, v8.16b\n" - "mov v1.16b, v8.16b\n" - "mov v2.16b, v8.16b\n" - "mov v3.16b, v8.16b\n" - : - :[b]"r"(b) - :"memory", "cc", "v0", "v1", "v2", "v3", "v8" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F16 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F16 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F16 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - F16 *in_1 = in_idx + in_h_1*iw_pad*8 + in_w_1*8; - F16 *in_2 = in_idx + in_h_2*iw_pad*8 + in_w_2*8; - F16 *in_3 = in_idx + in_h_3*iw_pad*8 + in_w_3*8; - __asm__ __volatile__( - "ldr q17, [%[f0]]\n" - "ldr q9, [%[in0]]\n" - "ldr q10, [%[in1]]\n" - "ldr q11, [%[in2]]\n" - "ldr q12, [%[in3]]\n" - "fmla v0.8h, v9.8h, v17.8h\n" - "fmla v1.8h, v10.8h, v17.8h\n" - "fmla v2.8h, v11.8h, v17.8h\n" - "fmla v3.8h, v12.8h, v17.8h\n" - : - :[in0]"r"(in_0), - [in1]"r"(in_1), - [in2]"r"(in_2), - [in3]"r"(in_3), - [f0]"r"(f_0) - :"memory", "cc", "v0", "v1", "v2", "v3", "v9", "v10", "v11", "v12", "v17" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v0.8h, v0.8h, v31.8h\n" - "fmax v1.8h, v1.8h, v31.8h\n" - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v31" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v0.8h, v0.8h, v31.8h\n" - "fmax v1.8h, v1.8h, v31.8h\n" - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmin v0.8h, v0.8h, v30.8h\n" - "fmin v1.8h, v1.8h, v30.8h\n" - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v30", "v31" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "movi v29.8h, #0x42, lsl #8\n" // three - "movi v30.8h, #0x46, lsl #8\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v25.8h, v0.8h, v29.8h\n" - "fadd v26.8h, v1.8h, v29.8h\n" - "fadd v27.8h, v2.8h, v29.8h\n" - "fadd v28.8h, v3.8h, v29.8h\n" - "fmax v25.8h, v25.8h, v31.8h\n" - "fmax v26.8h, v26.8h, v31.8h\n" - "fmax v27.8h, v27.8h, v31.8h\n" - "fmax v28.8h, v28.8h, v31.8h\n" - "fmin v25.8h, v25.8h, v30.8h\n" - "fmin v26.8h, v26.8h, v30.8h\n" - "fmin v27.8h, v27.8h, v30.8h\n" - "fmin v28.8h, v28.8h, v30.8h\n" - "fdiv v25.8h, v25.8h, v30.8h\n" - "fdiv v26.8h, v26.8h, v30.8h\n" - "fdiv v27.8h, v27.8h, v30.8h\n" - "fdiv v28.8h, v28.8h, v30.8h\n" - "fmul v0.8h, v0.8h, v25.8h\n" - "fmul v1.8h, v1.8h, v26.8h\n" - "fmul v2.8h, v2.8h, v27.8h\n" - "fmul v3.8h, v3.8h, v28.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[out]]\n" - :[out]"+r"(out_ptr) - : - :"memory", "cc", "v0", "v1", "v2", "v3" - ); - } - - // ohow_reminder % 4 - ohow_s = (ohow / 4) * 4; - for (I32 hw = ohow_s; hw < ohow; hw++) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - F16 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; - //TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. 
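// Tail loop: any position not covered by the 8- or 4-wide tiles is computed with a
// single accumulator (v0). Note the 4-wide tile above writes all four accumulators
// with one st1 {v0.8h-v3.8h} store, whereas the 8-wide tile uses eight str q
// instructions with explicit offsets.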
- __asm__ __volatile__( - "ldr q8, [%[b]]\n" - "mov v0.16b, v8.16b\n" - : - :[b]"r"(b) - :"memory", "cc", "v0" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F16 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F16 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F16 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - __asm__ __volatile__( - "ldr q17, [%[f0]]\n" - "ldr q9, [%[in0]]\n" - "fmla v0.8h, v9.8h, v17.8h\n" - : - :[in0]"r"(in_0), - [f0]"r"(f_0) - :"memory", "cc", "v0", "v9", "v17" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v0.8h, v0.8h, v31.8h\n" - : - : - :"memory", "cc", "v0", "v31" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v0.8h, v0.8h, v31.8h\n" - "fmin v0.8h, v0.8h, v30.8h\n" - : - : - :"memory", "cc", "v0", "v30", "v31" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "movi v29.8h, #0x42, lsl #8\n" // three - "movi v30.8h, #0x46, lsl #8\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v28.8h, v0.8h, v29.8h\n" - "fmax v28.8h, v28.8h, v31.8h\n" - "fmin v28.8h, v28.8h, v30.8h\n" - "fdiv v28.8h, v28.8h, v30.8h\n" - "fmul v0.8h, v0.8h, v28.8h\n" - : - : - :"memory", "cc", "v0", "v28", "v29", "v30", "v31" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "str q0, [%[out]]\n" - :[out]"+r"(out_ptr) - : - :"memory", "cc", "v0" - ); - } - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp16/depthwise_convolution_direct_A76.cpp b/tensor_computing/src/cpu/arm/fp16/depthwise_convolution_direct_A76.cpp deleted file mode 100644 index 40b58257..00000000 --- a/tensor_computing/src/cpu/arm/fp16/depthwise_convolution_direct_A76.cpp +++ /dev/null @@ -1,500 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
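The A55 kernel above and its A76 twin below perform identical arithmetic. As a reading aid, the scalar computation for one NCHWc8 channel block is roughly the following minimal sketch; it assumes the padded input the kernel prepares and the repo's F16/U32 aliases, and the function name and signature are illustrative rather than part of the library:

    static void depthwise_direct_reference(const F16 *in_pad, const F16 *f, const F16 *b,
        F16 *out, U32 fh, U32 fw, U32 oh, U32 ow, U32 iw_pad,
        U32 strideH, U32 strideW, U32 dilateH, U32 dilateW)
    {
        for (U32 hw = 0; hw < oh * ow; hw++) {
            U32 h0 = hw / ow * strideH;    // top-left input position of this output
            U32 w0 = hw % ow * strideW;
            for (U32 c8 = 0; c8 < 8; c8++) {            // the 8 packed channels
                F16 acc = b[c8];                        // accumulator seeded with bias
                for (U32 kh = 0; kh < fh; kh++) {
                    for (U32 kw = 0; kw < fw; kw++) {
                        U32 h = h0 + kh * dilateH;
                        U32 w = w0 + kw * dilateW;
                        acc += in_pad[(h * iw_pad + w) * 8 + c8]
                             * f[(kh * fw + kw) * 8 + c8];
                    }
                }
                out[hw * 8 + c8] = acc;   // the asm applies the activation here
            }
        }
    }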
- - -#include "cpu/arm/fp16/depthwise_convolution_direct.h" - -EE depthwise_convolution_direct_A76(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (fdf != DF_NCHWC8) - CHECK_STATUS(NOT_MATCH); - - oc /= 8; - ic /= 8; - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - U32 ihiw = ih*iw; - I32 ohow = oh*ow; - - for (U32 n = 0; n < in; n++) { - F16 *inArray_pad = (F16*)tmp; - F16 *inArray_pad_mov = inArray_pad; - F16 *inArray_mov = inArray + n*ic*ihiw*8; - for (U32 c = 0; c < ic; c++) { - // copy input into a input with padding - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*sizeof(F16)); - inArray_pad_mov += iw_pad*8; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*8*sizeof(F16)); - inArray_pad_mov += paddingL*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*sizeof(F16)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, paddingR*8*sizeof(F16)); - inArray_pad_mov += paddingR*8; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*sizeof(F16)); - inArray_pad_mov += iw_pad*8; - } - - const F16 *b = biasArray + c*8; - F16 *in_pad = inArray_pad + c*ih_pad*iw_pad*8; - const F16 *f = filterArray + c*fh*fw*8; - // ohow / 8 - for (I32 hw = 0; hw < ohow-7; hw+=8) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - U32 in_h_1 = (hw+1)/ow*strideH; - U32 in_w_1 = (hw+1)%ow*strideW; - U32 in_h_2 = (hw+2)/ow*strideH; - U32 in_w_2 = (hw+2)%ow*strideW; - U32 in_h_3 = (hw+3)/ow*strideH; - U32 in_w_3 = (hw+3)%ow*strideW; - U32 in_h_4 = (hw+4)/ow*strideH; - U32 in_w_4 = (hw+4)%ow*strideW; - U32 in_h_5 = (hw+5)/ow*strideH; - U32 in_w_5 = (hw+5)%ow*strideW; - U32 in_h_6 = (hw+6)/ow*strideH; - U32 in_w_6 = (hw+6)%ow*strideW; - U32 in_h_7 = (hw+7)/ow*strideH; - U32 in_w_7 = (hw+7)%ow*strideW; - F16 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; - //TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. 
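// This A76 variant is the same NEON sequence as the A55 kernel above; the per-core
// split mirrors the 3x3s1p1 kernels later in this diff, where the A55 build breaks
// 128-bit loads into ldr d / ldr x / ins pairs (64-bit loads pair better with ALU
// ops on A55's narrower load path), while wide out-of-order cores keep plain ldr q.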
- __asm__ __volatile__( - "ldr q8, [%[b]]\n" - "mov v0.16b, v8.16b\n" - "mov v1.16b, v8.16b\n" - "mov v2.16b, v8.16b\n" - "mov v3.16b, v8.16b\n" - "mov v4.16b, v8.16b\n" - "mov v5.16b, v8.16b\n" - "mov v6.16b, v8.16b\n" - "mov v7.16b, v8.16b\n" - : - :[b]"r"(b) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F16 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F16 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F16 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - F16 *in_1 = in_idx + in_h_1*iw_pad*8 + in_w_1*8; - F16 *in_2 = in_idx + in_h_2*iw_pad*8 + in_w_2*8; - F16 *in_3 = in_idx + in_h_3*iw_pad*8 + in_w_3*8; - F16 *in_4 = in_idx + in_h_4*iw_pad*8 + in_w_4*8; - F16 *in_5 = in_idx + in_h_5*iw_pad*8 + in_w_5*8; - F16 *in_6 = in_idx + in_h_6*iw_pad*8 + in_w_6*8; - F16 *in_7 = in_idx + in_h_7*iw_pad*8 + in_w_7*8; - __asm__ __volatile__( - "ldr q17, [%[f0]]\n" - "ldr q9, [%[in0]]\n" - "ldr q10, [%[in1]]\n" - "ldr q11, [%[in2]]\n" - "ldr q12, [%[in3]]\n" - "ldr q13, [%[in4]]\n" - "ldr q14, [%[in5]]\n" - "ldr q15, [%[in6]]\n" - "ldr q16, [%[in7]]\n" - "fmla v0.8h, v9.8h, v17.8h\n" - "fmla v1.8h, v10.8h, v17.8h\n" - "fmla v2.8h, v11.8h, v17.8h\n" - "fmla v3.8h, v12.8h, v17.8h\n" - "fmla v4.8h, v13.8h, v17.8h\n" - "fmla v5.8h, v14.8h, v17.8h\n" - "fmla v6.8h, v15.8h, v17.8h\n" - "fmla v7.8h, v16.8h, v17.8h\n" - : - :[in0]"r"(in_0), - [in1]"r"(in_1), - [in2]"r"(in_2), - [in3]"r"(in_3), - [in4]"r"(in_4), - [in5]"r"(in_5), - [in6]"r"(in_6), - [in7]"r"(in_7), - [f0]"r"(f_0) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v0.8h, v0.8h, v31.8h\n" - "fmax v1.8h, v1.8h, v31.8h\n" - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - "fmax v6.8h, v6.8h, v31.8h\n" - "fmax v7.8h, v7.8h, v31.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v31" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v0.8h, v0.8h, v31.8h\n" - "fmax v1.8h, v1.8h, v31.8h\n" - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - "fmax v6.8h, v6.8h, v31.8h\n" - "fmax v7.8h, v7.8h, v31.8h\n" - "fmin v0.8h, v0.8h, v30.8h\n" - "fmin v1.8h, v1.8h, v30.8h\n" - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - "fmin v6.8h, v6.8h, v30.8h\n" - "fmin v7.8h, v7.8h, v30.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v30", "v31" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "movi v29.8h, #0x42, lsl #8\n" // three - "movi v30.8h, #0x46, lsl #8\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v21.8h, v0.8h, v29.8h\n" - "fadd v22.8h, v1.8h, v29.8h\n" - "fadd v23.8h, v2.8h, v29.8h\n" - "fadd v24.8h, v3.8h, v29.8h\n" - "fadd v25.8h, v4.8h, v29.8h\n" - "fadd v26.8h, v5.8h, v29.8h\n" - "fadd v27.8h, v6.8h, v29.8h\n" - "fadd v28.8h, v7.8h, v29.8h\n" - "fmax v21.8h, v21.8h, v31.8h\n" - "fmax v22.8h, 
v22.8h, v31.8h\n" - "fmax v23.8h, v23.8h, v31.8h\n" - "fmax v24.8h, v24.8h, v31.8h\n" - "fmax v25.8h, v25.8h, v31.8h\n" - "fmax v26.8h, v26.8h, v31.8h\n" - "fmax v27.8h, v27.8h, v31.8h\n" - "fmax v28.8h, v28.8h, v31.8h\n" - "fmin v21.8h, v21.8h, v30.8h\n" - "fmin v22.8h, v22.8h, v30.8h\n" - "fmin v23.8h, v23.8h, v30.8h\n" - "fmin v24.8h, v24.8h, v30.8h\n" - "fmin v25.8h, v25.8h, v30.8h\n" - "fmin v26.8h, v26.8h, v30.8h\n" - "fmin v27.8h, v27.8h, v30.8h\n" - "fmin v28.8h, v28.8h, v30.8h\n" - "fdiv v21.8h, v21.8h, v30.8h\n" - "fdiv v22.8h, v22.8h, v30.8h\n" - "fdiv v23.8h, v23.8h, v30.8h\n" - "fdiv v24.8h, v24.8h, v30.8h\n" - "fdiv v25.8h, v25.8h, v30.8h\n" - "fdiv v26.8h, v26.8h, v30.8h\n" - "fdiv v27.8h, v27.8h, v30.8h\n" - "fdiv v28.8h, v28.8h, v30.8h\n" - "fmul v0.8h, v0.8h, v21.8h\n" - "fmul v1.8h, v1.8h, v22.8h\n" - "fmul v2.8h, v2.8h, v23.8h\n" - "fmul v3.8h, v3.8h, v24.8h\n" - "fmul v4.8h, v4.8h, v25.8h\n" - "fmul v5.8h, v5.8h, v26.8h\n" - "fmul v6.8h, v6.8h, v27.8h\n" - "fmul v7.8h, v7.8h, v28.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "str q0, [%[out]]\n" - "str q1, [%[out], #16]\n" - "str q2, [%[out], #32]\n" - "str q3, [%[out], #48]\n" - "str q4, [%[out], #64]\n" - "str q5, [%[out], #80]\n" - "str q6, [%[out], #96]\n" - "str q7, [%[out], #112]\n" - :[out]"+r"(out_ptr) - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19" - ); - } - - // ohow_reminder % 8 / 4 - U32 ohow_s = (ohow / 8) * 8; - for (I32 hw = ohow_s; hw < ohow-3; hw+=4) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - U32 in_h_1 = (hw+1)/ow*strideH; - U32 in_w_1 = (hw+1)%ow*strideW; - U32 in_h_2 = (hw+2)/ow*strideH; - U32 in_w_2 = (hw+2)%ow*strideW; - U32 in_h_3 = (hw+3)/ow*strideH; - U32 in_w_3 = (hw+3)%ow*strideW; - F16 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; - //TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. 
- __asm__ __volatile__( - "ldr q8, [%[b]]\n" - "mov v0.16b, v8.16b\n" - "mov v1.16b, v8.16b\n" - "mov v2.16b, v8.16b\n" - "mov v3.16b, v8.16b\n" - : - :[b]"r"(b) - :"memory", "cc", "v0", "v1", "v2", "v3", "v8" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F16 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F16 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F16 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - F16 *in_1 = in_idx + in_h_1*iw_pad*8 + in_w_1*8; - F16 *in_2 = in_idx + in_h_2*iw_pad*8 + in_w_2*8; - F16 *in_3 = in_idx + in_h_3*iw_pad*8 + in_w_3*8; - __asm__ __volatile__( - "ldr q17, [%[f0]]\n" - "ldr q9, [%[in0]]\n" - "ldr q10, [%[in1]]\n" - "ldr q11, [%[in2]]\n" - "ldr q12, [%[in3]]\n" - "fmla v0.8h, v9.8h, v17.8h\n" - "fmla v1.8h, v10.8h, v17.8h\n" - "fmla v2.8h, v11.8h, v17.8h\n" - "fmla v3.8h, v12.8h, v17.8h\n" - : - :[in0]"r"(in_0), - [in1]"r"(in_1), - [in2]"r"(in_2), - [in3]"r"(in_3), - [f0]"r"(f_0) - :"memory", "cc", "v0", "v1", "v2", "v3", "v9", "v10", "v11", "v12", "v17" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v0.8h, v0.8h, v31.8h\n" - "fmax v1.8h, v1.8h, v31.8h\n" - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v31" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v0.8h, v0.8h, v31.8h\n" - "fmax v1.8h, v1.8h, v31.8h\n" - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmin v0.8h, v0.8h, v30.8h\n" - "fmin v1.8h, v1.8h, v30.8h\n" - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v30", "v31" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "movi v29.8h, #0x42, lsl #8\n" // three - "movi v30.8h, #0x46, lsl #8\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v25.8h, v0.8h, v29.8h\n" - "fadd v26.8h, v1.8h, v29.8h\n" - "fadd v27.8h, v2.8h, v29.8h\n" - "fadd v28.8h, v3.8h, v29.8h\n" - "fmax v25.8h, v25.8h, v31.8h\n" - "fmax v26.8h, v26.8h, v31.8h\n" - "fmax v27.8h, v27.8h, v31.8h\n" - "fmax v28.8h, v28.8h, v31.8h\n" - "fmin v25.8h, v25.8h, v30.8h\n" - "fmin v26.8h, v26.8h, v30.8h\n" - "fmin v27.8h, v27.8h, v30.8h\n" - "fmin v28.8h, v28.8h, v30.8h\n" - "fdiv v25.8h, v25.8h, v30.8h\n" - "fdiv v26.8h, v26.8h, v30.8h\n" - "fdiv v27.8h, v27.8h, v30.8h\n" - "fdiv v28.8h, v28.8h, v30.8h\n" - "fmul v0.8h, v0.8h, v25.8h\n" - "fmul v1.8h, v1.8h, v26.8h\n" - "fmul v2.8h, v2.8h, v27.8h\n" - "fmul v3.8h, v3.8h, v28.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[out]]\n" - :[out]"+r"(out_ptr) - : - :"memory", "cc", "v0", "v1", "v2", "v3" - ); - } - - // ohow_reminder % 4 - ohow_s = (ohow / 4) * 4; - for (I32 hw = ohow_s; hw < ohow; hw++) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - F16 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; - //TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. 
- __asm__ __volatile__( - "ldr q8, [%[b]]\n" - "mov v0.16b, v8.16b\n" - : - :[b]"r"(b) - :"memory", "cc", "v0" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F16 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F16 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F16 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - __asm__ __volatile__( - "ldr q17, [%[f0]]\n" - "ldr q9, [%[in0]]\n" - "fmla v0.8h, v9.8h, v17.8h\n" - : - :[in0]"r"(in_0), - [f0]"r"(f_0) - :"memory", "cc", "v0", "v9", "v17" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v0.8h, v0.8h, v31.8h\n" - : - : - :"memory", "cc", "v0", "v31" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v0.8h, v0.8h, v31.8h\n" - "fmin v0.8h, v0.8h, v30.8h\n" - : - : - :"memory", "cc", "v0", "v30", "v31" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "movi v29.8h, #0x42, lsl #8\n" // three - "movi v30.8h, #0x46, lsl #8\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v28.8h, v0.8h, v29.8h\n" - "fmax v28.8h, v28.8h, v31.8h\n" - "fmin v28.8h, v28.8h, v30.8h\n" - "fdiv v28.8h, v28.8h, v30.8h\n" - "fmul v0.8h, v0.8h, v28.8h\n" - : - : - :"memory", "cc", "v0", "v28", "v29", "v30", "v31" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "str q0, [%[out]]\n" - :[out]"+r"(out_ptr) - : - :"memory", "cc", "v0" - ); - } - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp16/depthwise_convolution_transform.cpp b/tensor_computing/src/cpu/arm/fp16/depthwise_convolution_transform.cpp deleted file mode 100644 index df1f87f9..00000000 --- a/tensor_computing/src/cpu/arm/fp16/depthwise_convolution_transform.cpp +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
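// Layouts produced by the transform below: DF_NCHWC8 stores each block of 8
// depthwise channels contiguously per filter tap (ftm[c*fh*fw*8 + hw*8 + c8]);
// DF_CHWC8_NCN16 keeps that depthwise packing and additionally transposes the
// trailing 1x1 pointwise filter into blocks of 16 output channels
// (pwFtm[o*fc*16 + c*16 + o16]), plus an 8-wide tail block when fn is not a
// multiple of 16.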
- - -#include <string.h> - -#include "cpu/arm/fp16/tensor_computing_fp16.h" - - -inline EE depthwise_convolution_transform_filter_kernel_fp16(TensorDesc filterDesc, const F16* filterArray, - TensorDesc *ftmDesc, F16* ftmArray, - DataFormat ftmDataFormat) -{ - if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) - CHECK_STATUS(NULL_POINTER); - DataType fdt; - DataFormat fdf; - U32 fn, fc, fh, fw; - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - - if (fdf == ftmDataFormat) { - *ftmDesc = filterDesc; - if (fdf == DF_NCHW || fdf == DF_NCHWC8) { - memcpy(ftmArray, filterArray, fn*fc*fh*fw*bytesOf(fdt)); - return SUCCESS; - } - if (fdf == DF_CHW_NC || fdf == DF_CHWC8_NCN16) { - memcpy(ftmArray, filterArray, (fc*fh*fw + fc*fn)*bytesOf(fdt)); - return SUCCESS; - } - return NOT_SUPPORTED; - } - - switch (fdf) { - case DF_NCHW: { - if (ftmDataFormat == DF_NCHWC8) { - U32 ic = fc / 8; - for (U32 c = 0; c < ic; c++) { - for (U32 hw = 0; hw < fh*fw; hw++) { - for (U32 c8 = 0; c8 < 8; c8++) { - ftmArray[c*fh*fw*8 + hw*8 + c8] = filterArray[(c*8+c8)*fh*fw + hw]; - } - } - } - *ftmDesc = tensor4df(fdt, DF_NCHWC8, fn, fc, fh, fw); - } - else { - return NOT_SUPPORTED; - } - break; - } - case DF_CHW_NC: { - if (ftmDataFormat == DF_CHWC8_NCN16) { - const F16 *pwFilterArray = filterArray + fc*fh*fw; - F16 *pwFtmArray = ftmArray + fc*fh*fw; - U32 oc = fn / 16; - U32 ic = fc / 8; - for (U32 c = 0; c < ic; c++) { - for (U32 hw = 0; hw < fh*fw; hw++) { - for (U32 c8 = 0; c8 < 8; c8++) { - ftmArray[c*fh*fw*8 + hw*8 + c8] = filterArray[(c*8+c8)*fh*fw + hw]; - } - } - } - for (U32 o = 0; o < oc; o++) { - for (U32 c = 0; c < fc; c++) { - for (U32 o16 = 0; o16 < 16; o16++) { - pwFtmArray[o*fc*16 + c*16 + o16] = pwFilterArray[(o*16+o16)*fc + c]; - } - } - } - if (fn != oc*16) { - for (U32 c = 0; c < fc; c++) { - for (U32 o8 = 0; o8 < 8; o8++) { - pwFtmArray[oc*16*fc + c*8 + o8] = pwFilterArray[(oc*16+o8)*fc + c]; - } - } - } - *ftmDesc = tensor4df(fdt, DF_CHWC8_NCN16, fn, fc, fh, fw); - } - else { - return NOT_SUPPORTED; - } - break; - } - default: - return NOT_SUPPORTED; - } - return SUCCESS; -} - -EE depthwise_convolution_transform_filter_fp16(TensorDesc filterDesc, const F16* filter, - DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, F16* filterTransformed) -{ - DataFormat ftmDataFormat; - switch (algorithm) { - case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: - ftmDataFormat = DF_NCHWC8; - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: - ftmDataFormat = DF_CHWC8_NCN16; - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT_NO_PADDING: - ftmDataFormat = DF_CHWC8_NCN16; - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_3X3S1P1: - ftmDataFormat = DF_CHWC8_NCN16; - break; - default: - return NOT_MATCH; - } - EE ret = depthwise_convolution_transform_filter_kernel_fp16(filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat); - CHECK_STATUS(ret); - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h b/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h deleted file mode 100644 index 644c38cc..00000000 --- a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_DEPTHWISE_POINTWISE_CONVOLUTION_3X3S1P1 -#define _H_DEPTHWISE_POINTWISE_CONVOLUTION_3X3S1P1 - -#include "sys.h" -#include "type.h" -#include "error.h" -#include "tensor_desc.h" -#include "tensor_computing_type.h" - -EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc); - -EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc); - - -inline EE depthwise_pointwise_convolution_3x3s1p1(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc, - Arch arch) -{ - EE ret = SUCCESS; - switch (arch) { - case ARM_A55: - ret = depthwise_pointwise_convolution_3x3s1p1_A55(inputDesc, inArray, - filterDesc, filterArray, - convDesc, - biasDesc, biasArray, - tmpBytes, tmp, - outputDesc, outArray, - depthwiseActivationDesc, - pointwiseActivationDesc); - break; - case ARM_A76: - ret = depthwise_pointwise_convolution_3x3s1p1_A76(inputDesc, inArray, - filterDesc, filterArray, - convDesc, - biasDesc, biasArray, - tmpBytes, tmp, - outputDesc, outArray, - depthwiseActivationDesc, - pointwiseActivationDesc); - break; - default: - return NOT_SUPPORTED; - } - return ret; -} -#endif diff --git a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A55.cpp b/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A55.cpp deleted file mode 100644 index 9262862d..00000000 --- a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A55.cpp +++ /dev/null @@ -1,1639 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h" - -EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - UNUSED(convDesc); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - - if (fdf != DF_CHWC8_NCN16) - CHECK_STATUS(NOT_MATCH); - - oc /= 8; - ic /= 8; - - I32 ohow = oh * ow; - F16 *pwArray = (F16*)tmp; - - for (U32 n = 0; n < in; n++) { - // dw_conv + padding - for (U32 c = 0; c < ic; c++) { - const F16 *b = biasArray + c*8; - F16 *in_c = inArray + c*ih*iw*8; - const F16 *f = filterArray + c*fh*fw*8; - F16 *out = pwArray + c*ohow*8; - F16 *in0 = in_c; - F16 *in1 = in0 + iw*8; - F16 *in2 = in1 + iw*8; - __asm__ __volatile__( - "mov x0, %[w]\n" - "ldr q28, [%[b]]\n" - "ldr q3, [%[f], #48]\n" - "ldr q4, [%[f], #64]\n" - "ldr q5, [%[f], #80]\n" - "ldr q6, [%[f], #96]\n" - "ldr q7, [%[f], #112]\n" - "ldr q8, [%[f], #128]\n" - "ldr q13, [%[in_0]]\n" - "ldr q14, [%[in_0], #16]\n" - "ldr q15, [%[in_0], #32]\n" - "ldr q16, [%[in_0], #48]\n" - "ldr q18, [%[in_1]]\n" - "ldr q19, [%[in_1], #16]\n" - "ldr q20, [%[in_1], #32]\n" - "ldr q21, [%[in_1], #48]\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 - - "ldr q17, [%[in_0], #64]\n" - "fmla v10.8h, v3.8h, v13.8h\n" - "fmla v11.8h, v3.8h, v14.8h\n" - "fmla v12.8h, v3.8h, v15.8h\n" - "ldr q22, [%[in_1], #64]\n" - "fmla v10.8h, v6.8h, v18.8h\n" - "fmla v11.8h, v6.8h, v19.8h\n" - "fmla v12.8h, v6.8h, v20.8h\n" - - "fmla v9.8h, v4.8h, v13.8h\n" - "fmla v10.8h, v4.8h, v14.8h\n" - "fmla v11.8h, v4.8h, v15.8h\n" - "fmla v12.8h, v4.8h, v16.8h\n" - "fmla v9.8h, v7.8h, v18.8h\n" - "fmla v10.8h, v7.8h, v19.8h\n" - "fmla v11.8h, v7.8h, v20.8h\n" - "fmla v12.8h, v7.8h, v21.8h\n" - - "ldr q13, [%[in_0], #80]\n" - "fmla v9.8h, v5.8h, v14.8h\n" - "fmla v10.8h, v5.8h, v15.8h\n" - "fmla 
v11.8h, v5.8h, v16.8h\n" - "fmla v12.8h, v5.8h, v17.8h\n" - "ldr q18, [%[in_1], #80]\n" - "fmla v9.8h, v8.8h, v19.8h\n" - "fmla v10.8h, v8.8h, v20.8h\n" - "fmla v11.8h, v8.8h, v21.8h\n" - "fmla v12.8h, v8.8h, v22.8h\n" - - "mov v14.16b, v17.16b\n" - "mov v19.16b, v22.16b\n" - "mov v15.16b, v13.16b\n" - "mov v20.16b, v18.16b\n" - "mov v13.16b, v16.16b\n" - "mov v18.16b, v21.16b\n" - "ldr q16, [%[in_0], #96]\n" - "ldr q21, [%[in_1], #96]\n" - "add %[in_0], %[in_0], #48\n" - "add %[in_1], %[in_1], #48\n" - - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse - "bne 111f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - - "111:\n" - "cmp %[depthwiseActivationMode], %[am_relu6]\n" - "bne 112f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) - "fmin v10.8h, v10.8h, v22.8h\n" - "fmin v11.8h, v11.8h, v22.8h\n" - "fmin v12.8h, v12.8h, v22.8h\n" - - "112:\n" - "cmp %[depthwiseActivationMode], %[am_h_swish]\n" - "bne 113f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three - "fadd v27.8h, v9.8h, v22.8h\n" - "fadd v29.8h, v10.8h, v22.8h\n" - "fadd v30.8h, v11.8h, v22.8h\n" - "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v27.8h, v27.8h, v17.8h\n" - "fmax v29.8h, v29.8h, v17.8h\n" - "fmax v30.8h, v30.8h, v17.8h\n" - "fmax v31.8h, v31.8h, v17.8h\n" - "fmin v27.8h, v27.8h, v22.8h\n" - "fmin v29.8h, v29.8h, v22.8h\n" - "fmin v30.8h, v30.8h, v22.8h\n" - "fmin v31.8h, v31.8h, v22.8h\n" - "fdiv v27.8h, v27.8h, v22.8h\n" - "fdiv v29.8h, v29.8h, v22.8h\n" - "fdiv v30.8h, v30.8h, v22.8h\n" - "fdiv v31.8h, v31.8h, v22.8h\n" - "fmul v9.8h, v27.8h, v9.8h\n" - "fmul v10.8h, v29.8h, v10.8h\n" - "fmul v11.8h, v30.8h, v11.8h\n" - "fmul v12.8h, v31.8h, v12.8h\n" - - "113:\n" - "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - - "0:\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 - - "ldr d17, [%[in_0], #64]\n" - "fmla v9.8h, v3.8h, v13.8h\n" - "ldr x1, [%[in_0], #72]\n" - "fmla v10.8h, v3.8h, v14.8h\n" - "ins v17.d[1], x1\n" - "fmla v11.8h, v3.8h, v15.8h\n" - "ldr d22, [%[in_1], #64]\n" - "fmla v12.8h, v3.8h, v16.8h\n" - "ldr x2, [%[in_1], #72]\n" - "fmla v9.8h, v6.8h, v18.8h\n" - "ins v22.d[1], x2\n" - "fmla v10.8h, v6.8h, v19.8h\n" - "fmla v11.8h, v6.8h, v20.8h\n" - "fmla v12.8h, v6.8h, v21.8h\n" - - "ldr d13, [%[in_0], #80]\n" - "fmla v9.8h, v4.8h, v14.8h\n" - "ldr x1, [%[in_0], #88]\n" - "fmla v10.8h, v4.8h, v15.8h\n" - "ins v13.d[1], x1\n" - "fmla v11.8h, v4.8h, v16.8h\n" - "ldr d18, [%[in_1], #80]\n" - "fmla v12.8h, v4.8h, v17.8h\n" - "ldr x2, [%[in_1], #88]\n" - "fmla v9.8h, v7.8h, v19.8h\n" - "ins v18.d[1], x2\n" - "fmla v10.8h, v7.8h, v20.8h\n" - "fmla v11.8h, v7.8h, v21.8h\n" - "fmla v12.8h, v7.8h, v22.8h\n" - - "ldr d14, [%[in_0], #96]\n" - "fmla v9.8h, v5.8h, v15.8h\n" - "ldr x1, [%[in_0], #104]\n" - "fmla v10.8h, v5.8h, v16.8h\n" - "ins v14.d[1], x1\n" - "fmla v11.8h, v5.8h, v17.8h\n" - "ldr d19, [%[in_1], #96]\n" - "fmla v12.8h, v5.8h, v13.8h\n" - "ldr x2, [%[in_1], #104]\n" - "fmla v9.8h, v8.8h, v20.8h\n" - "ins v19.d[1], x2\n" - "fmla 
v10.8h, v8.8h, v21.8h\n" - "fmla v11.8h, v8.8h, v22.8h\n" - "fmla v12.8h, v8.8h, v18.8h\n" - - "ldr d16, [%[in_0], #112]\n" - "mov v15.16b, v14.16b\n" - "ldr x1, [%[in_0], #120]\n" - "mov v20.16b, v19.16b\n" - "ins v16.d[1], x1\n" - "mov v14.16b, v13.16b\n" - "ldr d21, [%[in_1], #112]\n" - "mov v19.16b, v18.16b\n" - "ldr x2, [%[in_1], #120]\n" - "mov v13.16b, v17.16b\n" - "ins v21.d[1], x2\n" - "mov v18.16b, v22.16b\n" - - "add %[in_0], %[in_0], #64\n" - "add %[in_1], %[in_1], #64\n" - - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse - "bne 211f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - - "211:\n" - "cmp %[depthwiseActivationMode], %[am_relu6]\n" - "bne 212f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) - "fmin v10.8h, v10.8h, v22.8h\n" - "fmin v11.8h, v11.8h, v22.8h\n" - "fmin v12.8h, v12.8h, v22.8h\n" - - "212:\n" - "cmp %[depthwiseActivationMode], %[am_h_swish]\n" - "bne 213f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three - "fadd v27.8h, v9.8h, v22.8h\n" - "fadd v29.8h, v10.8h, v22.8h\n" - "fadd v30.8h, v11.8h, v22.8h\n" - "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v27.8h, v27.8h, v17.8h\n" - "fmax v29.8h, v29.8h, v17.8h\n" - "fmax v30.8h, v30.8h, v17.8h\n" - "fmax v31.8h, v31.8h, v17.8h\n" - "fmin v27.8h, v27.8h, v22.8h\n" - "fmin v29.8h, v29.8h, v22.8h\n" - "fmin v30.8h, v30.8h, v22.8h\n" - "fmin v31.8h, v31.8h, v22.8h\n" - "fdiv v27.8h, v27.8h, v22.8h\n" - "fdiv v29.8h, v29.8h, v22.8h\n" - "fdiv v30.8h, v30.8h, v22.8h\n" - "fdiv v31.8h, v31.8h, v22.8h\n" - "fmul v9.8h, v27.8h, v9.8h\n" - "fmul v10.8h, v29.8h, v10.8h\n" - "fmul v11.8h, v30.8h, v11.8h\n" - "fmul v12.8h, v31.8h, v12.8h\n" - - "213:\n" - "subs x0, x0, #4\n" - "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - "bne 0b\n" - - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 - - "ldr q17, [%[in_0], #64]\n" - "fmla v9.8h, v3.8h, v13.8h\n" - "fmla v10.8h, v3.8h, v14.8h\n" - "fmla v11.8h, v3.8h, v15.8h\n" - "fmla v12.8h, v3.8h, v16.8h\n" - "ldr q22, [%[in_1], #64]\n" - "fmla v9.8h, v6.8h, v18.8h\n" - "fmla v10.8h, v6.8h, v19.8h\n" - "fmla v11.8h, v6.8h, v20.8h\n" - "fmla v12.8h, v6.8h, v21.8h\n" - - "fmla v9.8h, v4.8h, v14.8h\n" - "fmla v10.8h, v4.8h, v15.8h\n" - "fmla v11.8h, v4.8h, v16.8h\n" - "fmla v12.8h, v4.8h, v17.8h\n" - "fmla v9.8h, v7.8h, v19.8h\n" - "fmla v10.8h, v7.8h, v20.8h\n" - "fmla v11.8h, v7.8h, v21.8h\n" - "fmla v12.8h, v7.8h, v22.8h\n" - - "fmla v9.8h, v5.8h, v15.8h\n" - "fmla v10.8h, v5.8h, v16.8h\n" - "fmla v11.8h, v5.8h, v17.8h\n" - "fmla v9.8h, v8.8h, v20.8h\n" - "fmla v10.8h, v8.8h, v21.8h\n" - "fmla v11.8h, v8.8h, v22.8h\n" - - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse - "bne 311f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - - "311:\n" - "cmp %[depthwiseActivationMode], %[am_relu6]\n" - "bne 312f\n" - "eor v17.16b, 
v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) - "fmin v10.8h, v10.8h, v22.8h\n" - "fmin v11.8h, v11.8h, v22.8h\n" - "fmin v12.8h, v12.8h, v22.8h\n" - - "312:\n" - "cmp %[depthwiseActivationMode], %[am_h_swish]\n" - "bne 313f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three - "fadd v27.8h, v9.8h, v22.8h\n" - "fadd v29.8h, v10.8h, v22.8h\n" - "fadd v30.8h, v11.8h, v22.8h\n" - "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v27.8h, v27.8h, v17.8h\n" - "fmax v29.8h, v29.8h, v17.8h\n" - "fmax v30.8h, v30.8h, v17.8h\n" - "fmax v31.8h, v31.8h, v17.8h\n" - "fmin v27.8h, v27.8h, v22.8h\n" - "fmin v29.8h, v29.8h, v22.8h\n" - "fmin v30.8h, v30.8h, v22.8h\n" - "fmin v31.8h, v31.8h, v22.8h\n" - "fdiv v27.8h, v27.8h, v22.8h\n" - "fdiv v29.8h, v29.8h, v22.8h\n" - "fdiv v30.8h, v30.8h, v22.8h\n" - "fdiv v31.8h, v31.8h, v22.8h\n" - "fmul v9.8h, v27.8h, v9.8h\n" - "fmul v10.8h, v29.8h, v10.8h\n" - "fmul v11.8h, v30.8h, v11.8h\n" - "fmul v12.8h, v31.8h, v12.8h\n" - - "313:\n" - "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - :[out]"+r"(out), - [in_0]"+r"(in0), - [in_1]"+r"(in1) - :[f]"r"(f), - [b]"r"(b), - [w]"r"((I64)ow-8), - [depthwiseActivationMode]"r"((I64)depthwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3" - ); - - for (U32 h = 0; h < oh-2; h++) { - in0 = in_c + h*iw*8; - in1 = in0 + iw*8; - in2 = in1 + iw*8; - __asm__ __volatile__( - "mov x0, %[w]\n" - "ldr q28, [%[b]]\n" - "ldr q0, [%[f]]\n" - "ldr q1, [%[f], #16]\n" - "ldr q2, [%[f], #32]\n" - "ldr q3, [%[f], #48]\n" - "ldr q4, [%[f], #64]\n" - "ldr q5, [%[f], #80]\n" - "ldr q6, [%[f], #96]\n" - "ldr q7, [%[f], #112]\n" - "ldr q8, [%[f], #128]\n" - "ldr q13, [%[in_0]]\n" - "ldr q14, [%[in_0], #16]\n" - "ldr q15, [%[in_0], #32]\n" - "ldr q16, [%[in_0], #48]\n" - "ldr q18, [%[in_1]]\n" - "ldr q19, [%[in_1], #16]\n" - "ldr q20, [%[in_1], #32]\n" - "ldr q21, [%[in_1], #48]\n" - "ldr q23, [%[in_2]]\n" - "ldr q24, [%[in_2], #16]\n" - "ldr q25, [%[in_2], #32]\n" - "ldr q26, [%[in_2], #48]\n" - - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 - - "ldr q17, [%[in_0], #64]\n" - "fmla v10.8h, v0.8h, v13.8h\n" - "fmla v11.8h, v0.8h, v14.8h\n" - "fmla v12.8h, v0.8h, v15.8h\n" - "ldr q22, [%[in_1], #64]\n" - "fmla v10.8h, v3.8h, v18.8h\n" - "fmla v11.8h, v3.8h, v19.8h\n" - "fmla v12.8h, v3.8h, v20.8h\n" - "ldr q27, [%[in_2], #64]\n" - "fmla v10.8h, v6.8h, v23.8h\n" - "fmla v11.8h, v6.8h, v24.8h\n" - "fmla v12.8h, v6.8h, v25.8h\n" - - "fmla v9.8h, v1.8h, v13.8h\n" - "fmla v10.8h, v1.8h, v14.8h\n" - "fmla v11.8h, v1.8h, v15.8h\n" - "fmla v12.8h, v1.8h, v16.8h\n" - "fmla v9.8h, v4.8h, v18.8h\n" - "fmla v10.8h, v4.8h, v19.8h\n" - "fmla v11.8h, v4.8h, v20.8h\n" - "fmla v12.8h, v4.8h, v21.8h\n" - "fmla v9.8h, v7.8h, v23.8h\n" - "fmla v10.8h, v7.8h, v24.8h\n" - "fmla v11.8h, v7.8h, v25.8h\n" - "fmla v12.8h, v7.8h, v26.8h\n" - - "ldr q13, 
[%[in_0], #80]\n" - "fmla v9.8h, v2.8h, v14.8h\n" - "fmla v10.8h, v2.8h, v15.8h\n" - "fmla v11.8h, v2.8h, v16.8h\n" - "fmla v12.8h, v2.8h, v17.8h\n" - "ldr q18, [%[in_1], #80]\n" - "fmla v9.8h, v5.8h, v19.8h\n" - "fmla v10.8h, v5.8h, v20.8h\n" - "fmla v11.8h, v5.8h, v21.8h\n" - "fmla v12.8h, v5.8h, v22.8h\n" - "ldr q23, [%[in_2], #80]\n" - "fmla v9.8h, v8.8h, v24.8h\n" - "fmla v10.8h, v8.8h, v25.8h\n" - "fmla v11.8h, v8.8h, v26.8h\n" - "fmla v12.8h, v8.8h, v27.8h\n" - - "mov v14.16b, v17.16b\n" - "mov v19.16b, v22.16b\n" - "mov v24.16b, v27.16b\n" - "mov v15.16b, v13.16b\n" - "mov v20.16b, v18.16b\n" - "mov v25.16b, v23.16b\n" - "mov v13.16b, v16.16b\n" - "mov v18.16b, v21.16b\n" - "mov v23.16b, v26.16b\n" - "ldr q16, [%[in_0], #96]\n" - "ldr q21, [%[in_1], #96]\n" - "ldr q26, [%[in_2], #96]\n" - "add %[in_0], %[in_0], #48\n" - "add %[in_1], %[in_1], #48\n" - "add %[in_2], %[in_2], #48\n" - - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse - "bne 111f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - - "111:\n" - "cmp %[depthwiseActivationMode], %[am_relu6]\n" - "bne 112f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) - "fmin v10.8h, v10.8h, v22.8h\n" - "fmin v11.8h, v11.8h, v22.8h\n" - "fmin v12.8h, v12.8h, v22.8h\n" - - "112:\n" - "cmp %[depthwiseActivationMode], %[am_h_swish]\n" - "bne 113f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three - "fadd v27.8h, v9.8h, v22.8h\n" - "fadd v29.8h, v10.8h, v22.8h\n" - "fadd v30.8h, v11.8h, v22.8h\n" - "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v27.8h, v27.8h, v17.8h\n" - "fmax v29.8h, v29.8h, v17.8h\n" - "fmax v30.8h, v30.8h, v17.8h\n" - "fmax v31.8h, v31.8h, v17.8h\n" - "fmin v27.8h, v27.8h, v22.8h\n" - "fmin v29.8h, v29.8h, v22.8h\n" - "fmin v30.8h, v30.8h, v22.8h\n" - "fmin v31.8h, v31.8h, v22.8h\n" - "fdiv v27.8h, v27.8h, v22.8h\n" - "fdiv v29.8h, v29.8h, v22.8h\n" - "fdiv v30.8h, v30.8h, v22.8h\n" - "fdiv v31.8h, v31.8h, v22.8h\n" - "fmul v9.8h, v27.8h, v9.8h\n" - "fmul v10.8h, v29.8h, v10.8h\n" - "fmul v11.8h, v30.8h, v11.8h\n" - "fmul v12.8h, v31.8h, v12.8h\n" - - "113:\n" - "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - - "0:\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 - - "ldr d17, [%[in_0], #64]\n" - "fmla v9.8h, v0.8h, v13.8h\n" - "ldr x1, [%[in_0], #72]\n" - "fmla v10.8h, v0.8h, v14.8h\n" - "ins v17.d[1], x1\n" - "fmla v11.8h, v0.8h, v15.8h\n" - "ldr d22, [%[in_1], #64]\n" - "fmla v12.8h, v0.8h, v16.8h\n" - "ldr x2, [%[in_1], #72]\n" - "fmla v9.8h, v3.8h, v18.8h\n" - "ins v22.d[1], x2\n" - "fmla v10.8h, v3.8h, v19.8h\n" - "ldr d27, [%[in_2], #64]\n" - "fmla v11.8h, v3.8h, v20.8h\n" - "ldr x3, [%[in_2], #72]\n" - "fmla v12.8h, v3.8h, v21.8h\n" - "ins v27.d[1], x3\n" - "fmla v9.8h, v6.8h, v23.8h\n" - "fmla v10.8h, v6.8h, v24.8h\n" - "fmla v11.8h, v6.8h, v25.8h\n" - "fmla v12.8h, v6.8h, v26.8h\n" - - "ldr d13, [%[in_0], #80]\n" - "fmla v9.8h, v1.8h, v14.8h\n" - "ldr x1, [%[in_0], #88]\n" - "fmla v10.8h, v1.8h, v15.8h\n" - "ins v13.d[1], x1\n" - "fmla 
v11.8h, v1.8h, v16.8h\n" - "ldr d18, [%[in_1], #80]\n" - "fmla v12.8h, v1.8h, v17.8h\n" - "ldr x2, [%[in_1], #88]\n" - "fmla v9.8h, v4.8h, v19.8h\n" - "ins v18.d[1], x2\n" - "fmla v10.8h, v4.8h, v20.8h\n" - "ldr d23, [%[in_2], #80]\n" - "fmla v11.8h, v4.8h, v21.8h\n" - "ldr x3, [%[in_2], #88]\n" - "fmla v12.8h, v4.8h, v22.8h\n" - "ins v23.d[1], x3\n" - "fmla v9.8h, v7.8h, v24.8h\n" - "fmla v10.8h, v7.8h, v25.8h\n" - "fmla v11.8h, v7.8h, v26.8h\n" - "fmla v12.8h, v7.8h, v27.8h\n" - - "ldr d14, [%[in_0], #96]\n" - "fmla v9.8h, v2.8h, v15.8h\n" - "ldr x1, [%[in_0], #104]\n" - "fmla v10.8h, v2.8h, v16.8h\n" - "ins v14.d[1], x1\n" - "fmla v11.8h, v2.8h, v17.8h\n" - "ldr d19, [%[in_1], #96]\n" - "fmla v12.8h, v2.8h, v13.8h\n" - "ldr x2, [%[in_1], #104]\n" - "fmla v9.8h, v5.8h, v20.8h\n" - "ins v19.d[1], x2\n" - "fmla v10.8h, v5.8h, v21.8h\n" - "ldr d24, [%[in_2], #96]\n" - "fmla v11.8h, v5.8h, v22.8h\n" - "ldr x3, [%[in_2], #104]\n" - "fmla v12.8h, v5.8h, v18.8h\n" - "ins v24.d[1], x3\n" - "fmla v9.8h, v8.8h, v25.8h\n" - "fmla v10.8h, v8.8h, v26.8h\n" - "fmla v11.8h, v8.8h, v27.8h\n" - "fmla v12.8h, v8.8h, v23.8h\n" - - - "ldr d16, [%[in_0], #112]\n" - "mov v15.16b, v14.16b\n" - "ldr x1, [%[in_0], #120]\n" - "mov v20.16b, v19.16b\n" - "ins v16.d[1], x1\n" - "mov v25.16b, v24.16b\n" - "ldr d21, [%[in_1], #112]\n" - "mov v14.16b, v13.16b\n" - "ldr x2, [%[in_1], #120]\n" - "mov v19.16b, v18.16b\n" - "ins v21.d[1], x2\n" - "mov v24.16b, v23.16b\n" - "ldr d26, [%[in_2], #112]\n" - "mov v13.16b, v17.16b\n" - "ldr x3, [%[in_2], #120]\n" - "mov v18.16b, v22.16b\n" - "ins v26.d[1], x3\n" - "mov v23.16b, v27.16b\n" - - "add %[in_0], %[in_0], #64\n" - "add %[in_1], %[in_1], #64\n" - "add %[in_2], %[in_2], #64\n" - - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse - "bne 211f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - - "211:\n" - "cmp %[depthwiseActivationMode], %[am_relu6]\n" - "bne 212f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) - "fmin v10.8h, v10.8h, v22.8h\n" - "fmin v11.8h, v11.8h, v22.8h\n" - "fmin v12.8h, v12.8h, v22.8h\n" - - "212:\n" - "cmp %[depthwiseActivationMode], %[am_h_swish]\n" - "bne 213f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three - "fadd v27.8h, v9.8h, v22.8h\n" - "fadd v29.8h, v10.8h, v22.8h\n" - "fadd v30.8h, v11.8h, v22.8h\n" - "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v27.8h, v27.8h, v17.8h\n" - "fmax v29.8h, v29.8h, v17.8h\n" - "fmax v30.8h, v30.8h, v17.8h\n" - "fmax v31.8h, v31.8h, v17.8h\n" - "fmin v27.8h, v27.8h, v22.8h\n" - "fmin v29.8h, v29.8h, v22.8h\n" - "fmin v30.8h, v30.8h, v22.8h\n" - "fmin v31.8h, v31.8h, v22.8h\n" - "fdiv v27.8h, v27.8h, v22.8h\n" - "fdiv v29.8h, v29.8h, v22.8h\n" - "fdiv v30.8h, v30.8h, v22.8h\n" - "fdiv v31.8h, v31.8h, v22.8h\n" - "fmul v9.8h, v27.8h, v9.8h\n" - "fmul v10.8h, v29.8h, v10.8h\n" - "fmul v11.8h, v30.8h, v11.8h\n" - "fmul v12.8h, v31.8h, v12.8h\n" - - "213:\n" - "subs x0, x0, #4\n" - "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - "bne 0b\n" - - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, 
v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 - - "ldr q17, [%[in_0], #64]\n" - "fmla v9.8h, v0.8h, v13.8h\n" - "fmla v10.8h, v0.8h, v14.8h\n" - "fmla v11.8h, v0.8h, v15.8h\n" - "fmla v12.8h, v0.8h, v16.8h\n" - "ldr q22, [%[in_1], #64]\n" - "fmla v9.8h, v3.8h, v18.8h\n" - "fmla v10.8h, v3.8h, v19.8h\n" - "fmla v11.8h, v3.8h, v20.8h\n" - "fmla v12.8h, v3.8h, v21.8h\n" - "ldr q27, [%[in_2], #64]\n" - "fmla v9.8h, v6.8h, v23.8h\n" - "fmla v10.8h, v6.8h, v24.8h\n" - "fmla v11.8h, v6.8h, v25.8h\n" - "fmla v12.8h, v6.8h, v26.8h\n" - - "fmla v9.8h, v1.8h, v14.8h\n" - "fmla v10.8h, v1.8h, v15.8h\n" - "fmla v11.8h, v1.8h, v16.8h\n" - "fmla v12.8h, v1.8h, v17.8h\n" - "fmla v9.8h, v4.8h, v19.8h\n" - "fmla v10.8h, v4.8h, v20.8h\n" - "fmla v11.8h, v4.8h, v21.8h\n" - "fmla v12.8h, v4.8h, v22.8h\n" - "fmla v9.8h, v7.8h, v24.8h\n" - "fmla v10.8h, v7.8h, v25.8h\n" - "fmla v11.8h, v7.8h, v26.8h\n" - "fmla v12.8h, v7.8h, v27.8h\n" - - "fmla v9.8h, v2.8h, v15.8h\n" - "fmla v10.8h, v2.8h, v16.8h\n" - "fmla v11.8h, v2.8h, v17.8h\n" - "fmla v9.8h, v5.8h, v20.8h\n" - "fmla v10.8h, v5.8h, v21.8h\n" - "fmla v11.8h, v5.8h, v22.8h\n" - "fmla v9.8h, v8.8h, v25.8h\n" - "fmla v10.8h, v8.8h, v26.8h\n" - "fmla v11.8h, v8.8h, v27.8h\n" - - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse - "bne 311f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - - "311:\n" - "cmp %[depthwiseActivationMode], %[am_relu6]\n" - "bne 312f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) - "fmin v10.8h, v10.8h, v22.8h\n" - "fmin v11.8h, v11.8h, v22.8h\n" - "fmin v12.8h, v12.8h, v22.8h\n" - - "312:\n" - "cmp %[depthwiseActivationMode], %[am_h_swish]\n" - "bne 313f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three - "fadd v27.8h, v9.8h, v22.8h\n" - "fadd v29.8h, v10.8h, v22.8h\n" - "fadd v30.8h, v11.8h, v22.8h\n" - "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v27.8h, v27.8h, v17.8h\n" - "fmax v29.8h, v29.8h, v17.8h\n" - "fmax v30.8h, v30.8h, v17.8h\n" - "fmax v31.8h, v31.8h, v17.8h\n" - "fmin v27.8h, v27.8h, v22.8h\n" - "fmin v29.8h, v29.8h, v22.8h\n" - "fmin v30.8h, v30.8h, v22.8h\n" - "fmin v31.8h, v31.8h, v22.8h\n" - "fdiv v27.8h, v27.8h, v22.8h\n" - "fdiv v29.8h, v29.8h, v22.8h\n" - "fdiv v30.8h, v30.8h, v22.8h\n" - "fdiv v31.8h, v31.8h, v22.8h\n" - "fmul v9.8h, v27.8h, v9.8h\n" - "fmul v10.8h, v29.8h, v10.8h\n" - "fmul v11.8h, v30.8h, v11.8h\n" - "fmul v12.8h, v31.8h, v12.8h\n" - - "313:\n" - "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - :[out]"+r"(out), - [in_0]"+r"(in0), - [in_1]"+r"(in1), - [in_2]"+r"(in2) - :[f]"r"(f), - [b]"r"(b), - [w]"r"((I64)ow-8), - [depthwiseActivationMode]"r"((I64)depthwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3" - ); - } - in0 = in_c + (ih-2)*iw*8; - in1 = in0 + 
iw*8; - in2 = in1 + iw*8; - __asm__ __volatile__( - "mov x0, %[w]\n" - "ldr q28, [%[b]]\n" - "ldr q0, [%[f]]\n" - "ldr q1, [%[f], #16]\n" - "ldr q2, [%[f], #32]\n" - "ldr q3, [%[f], #48]\n" - "ldr q4, [%[f], #64]\n" - "ldr q5, [%[f], #80]\n" - "ldr q13, [%[in_0]]\n" - "ldr q14, [%[in_0], #16]\n" - "ldr q15, [%[in_0], #32]\n" - "ldr q16, [%[in_0], #48]\n" - "ldr q18, [%[in_1]]\n" - "ldr q19, [%[in_1], #16]\n" - "ldr q20, [%[in_1], #32]\n" - "ldr q21, [%[in_1], #48]\n" - - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 - - "ldr q17, [%[in_0], #64]\n" - "fmla v10.8h, v0.8h, v13.8h\n" - "fmla v11.8h, v0.8h, v14.8h\n" - "fmla v12.8h, v0.8h, v15.8h\n" - "ldr q22, [%[in_1], #64]\n" - "fmla v10.8h, v3.8h, v18.8h\n" - "fmla v11.8h, v3.8h, v19.8h\n" - "fmla v12.8h, v3.8h, v20.8h\n" - - "fmla v9.8h, v1.8h, v13.8h\n" - "fmla v10.8h, v1.8h, v14.8h\n" - "fmla v11.8h, v1.8h, v15.8h\n" - "fmla v12.8h, v1.8h, v16.8h\n" - "fmla v9.8h, v4.8h, v18.8h\n" - "fmla v10.8h, v4.8h, v19.8h\n" - "fmla v11.8h, v4.8h, v20.8h\n" - "fmla v12.8h, v4.8h, v21.8h\n" - - "ldr q13, [%[in_0], #80]\n" - "fmla v9.8h, v2.8h, v14.8h\n" - "fmla v10.8h, v2.8h, v15.8h\n" - "fmla v11.8h, v2.8h, v16.8h\n" - "fmla v12.8h, v2.8h, v17.8h\n" - "ldr q18, [%[in_1], #80]\n" - "fmla v9.8h, v5.8h, v19.8h\n" - "fmla v10.8h, v5.8h, v20.8h\n" - "fmla v11.8h, v5.8h, v21.8h\n" - "fmla v12.8h, v5.8h, v22.8h\n" - - "mov v14.16b, v17.16b\n" - "mov v19.16b, v22.16b\n" - "mov v15.16b, v13.16b\n" - "mov v20.16b, v18.16b\n" - "mov v13.16b, v16.16b\n" - "mov v18.16b, v21.16b\n" - "ldr q16, [%[in_0], #96]\n" - "ldr q21, [%[in_1], #96]\n" - "add %[in_0], %[in_0], #48\n" - "add %[in_1], %[in_1], #48\n" - - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse - "bne 111f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - - "111:\n" - "cmp %[depthwiseActivationMode], %[am_relu6]\n" - "bne 112f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) - "fmin v10.8h, v10.8h, v22.8h\n" - "fmin v11.8h, v11.8h, v22.8h\n" - "fmin v12.8h, v12.8h, v22.8h\n" - - "112:\n" - "cmp %[depthwiseActivationMode], %[am_h_swish]\n" - "bne 113f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three - "fadd v27.8h, v9.8h, v22.8h\n" - "fadd v29.8h, v10.8h, v22.8h\n" - "fadd v30.8h, v11.8h, v22.8h\n" - "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v27.8h, v27.8h, v17.8h\n" - "fmax v29.8h, v29.8h, v17.8h\n" - "fmax v30.8h, v30.8h, v17.8h\n" - "fmax v31.8h, v31.8h, v17.8h\n" - "fmin v27.8h, v27.8h, v22.8h\n" - "fmin v29.8h, v29.8h, v22.8h\n" - "fmin v30.8h, v30.8h, v22.8h\n" - "fmin v31.8h, v31.8h, v22.8h\n" - "fdiv v27.8h, v27.8h, v22.8h\n" - "fdiv v29.8h, v29.8h, v22.8h\n" - "fdiv v30.8h, v30.8h, v22.8h\n" - "fdiv v31.8h, v31.8h, v22.8h\n" - "fmul v9.8h, v27.8h, v9.8h\n" - "fmul v10.8h, v29.8h, v10.8h\n" - "fmul v11.8h, v30.8h, v11.8h\n" - "fmul v12.8h, v31.8h, v12.8h\n" - - "113:\n" - "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - - "0:\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, 
v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 - - "ldr d17, [%[in_0], #64]\n" - "fmla v9.8h, v0.8h, v13.8h\n" - "ldr x1, [%[in_0], #72]\n" - "fmla v10.8h, v0.8h, v14.8h\n" - "ins v17.d[1], x1\n" - "fmla v11.8h, v0.8h, v15.8h\n" - "ldr d22, [%[in_1], #64]\n" - "fmla v12.8h, v0.8h, v16.8h\n" - "ldr x2, [%[in_1], #72]\n" - "fmla v9.8h, v3.8h, v18.8h\n" - "ins v22.d[1], x2\n" - "fmla v10.8h, v3.8h, v19.8h\n" - "fmla v11.8h, v3.8h, v20.8h\n" - "fmla v12.8h, v3.8h, v21.8h\n" - - "ldr d13, [%[in_0], #80]\n" - "fmla v9.8h, v1.8h, v14.8h\n" - "ldr x1, [%[in_0], #88]\n" - "fmla v10.8h, v1.8h, v15.8h\n" - "ins v13.d[1], x1\n" - "fmla v11.8h, v1.8h, v16.8h\n" - "ldr d18, [%[in_1], #80]\n" - "fmla v12.8h, v1.8h, v17.8h\n" - "ldr x2, [%[in_1], #88]\n" - "fmla v9.8h, v4.8h, v19.8h\n" - "ins v18.d[1], x2\n" - "fmla v10.8h, v4.8h, v20.8h\n" - "fmla v11.8h, v4.8h, v21.8h\n" - "fmla v12.8h, v4.8h, v22.8h\n" - - "ldr d14, [%[in_0], #96]\n" - "fmla v9.8h, v2.8h, v15.8h\n" - "ldr x1, [%[in_0], #104]\n" - "fmla v10.8h, v2.8h, v16.8h\n" - "ins v14.d[1], x1\n" - "fmla v11.8h, v2.8h, v17.8h\n" - "ldr d19, [%[in_1], #96]\n" - "fmla v12.8h, v2.8h, v13.8h\n" - "ldr x2, [%[in_1], #104]\n" - "fmla v9.8h, v5.8h, v20.8h\n" - "ins v19.d[1], x2\n" - "fmla v10.8h, v5.8h, v21.8h\n" - "fmla v11.8h, v5.8h, v22.8h\n" - "fmla v12.8h, v5.8h, v18.8h\n" - - - "ldr d16, [%[in_0], #112]\n" - "mov v15.16b, v14.16b\n" - "ldr x1, [%[in_0], #120]\n" - "mov v20.16b, v19.16b\n" - "ins v16.d[1], x1\n" - "ldr d21, [%[in_1], #112]\n" - "mov v14.16b, v13.16b\n" - "ldr x2, [%[in_1], #120]\n" - "mov v19.16b, v18.16b\n" - "ins v21.d[1], x2\n" - "mov v13.16b, v17.16b\n" - "mov v18.16b, v22.16b\n" - - "add %[in_0], %[in_0], #64\n" - "add %[in_1], %[in_1], #64\n" - - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse - "bne 211f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - - "211:\n" - "cmp %[depthwiseActivationMode], %[am_relu6]\n" - "bne 212f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) - "fmin v10.8h, v10.8h, v22.8h\n" - "fmin v11.8h, v11.8h, v22.8h\n" - "fmin v12.8h, v12.8h, v22.8h\n" - - "212:\n" - "cmp %[depthwiseActivationMode], %[am_h_swish]\n" - "bne 213f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three - "fadd v27.8h, v9.8h, v22.8h\n" - "fadd v29.8h, v10.8h, v22.8h\n" - "fadd v30.8h, v11.8h, v22.8h\n" - "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v27.8h, v27.8h, v17.8h\n" - "fmax v29.8h, v29.8h, v17.8h\n" - "fmax v30.8h, v30.8h, v17.8h\n" - "fmax v31.8h, v31.8h, v17.8h\n" - "fmin v27.8h, v27.8h, v22.8h\n" - "fmin v29.8h, v29.8h, v22.8h\n" - "fmin v30.8h, v30.8h, v22.8h\n" - "fmin v31.8h, v31.8h, v22.8h\n" - "fdiv v27.8h, v27.8h, v22.8h\n" - "fdiv v29.8h, v29.8h, v22.8h\n" - "fdiv v30.8h, v30.8h, v22.8h\n" - "fdiv v31.8h, v31.8h, v22.8h\n" - "fmul v9.8h, v27.8h, v9.8h\n" - "fmul v10.8h, v29.8h, v10.8h\n" - "fmul v11.8h, v30.8h, v11.8h\n" - "fmul v12.8h, v31.8h, v12.8h\n" - - "213:\n" - "subs x0, x0, #4\n" - "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - "bne 0b\n" - - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - 
"mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 - - "ldr q17, [%[in_0], #64]\n" - "fmla v9.8h, v0.8h, v13.8h\n" - "fmla v10.8h, v0.8h, v14.8h\n" - "fmla v11.8h, v0.8h, v15.8h\n" - "fmla v12.8h, v0.8h, v16.8h\n" - "ldr q22, [%[in_1], #64]\n" - "fmla v9.8h, v3.8h, v18.8h\n" - "fmla v10.8h, v3.8h, v19.8h\n" - "fmla v11.8h, v3.8h, v20.8h\n" - "fmla v12.8h, v3.8h, v21.8h\n" - - "fmla v9.8h, v1.8h, v14.8h\n" - "fmla v10.8h, v1.8h, v15.8h\n" - "fmla v11.8h, v1.8h, v16.8h\n" - "fmla v12.8h, v1.8h, v17.8h\n" - "fmla v9.8h, v4.8h, v19.8h\n" - "fmla v10.8h, v4.8h, v20.8h\n" - "fmla v11.8h, v4.8h, v21.8h\n" - "fmla v12.8h, v4.8h, v22.8h\n" - - "fmla v9.8h, v2.8h, v15.8h\n" - "fmla v10.8h, v2.8h, v16.8h\n" - "fmla v11.8h, v2.8h, v17.8h\n" - "fmla v9.8h, v5.8h, v20.8h\n" - "fmla v10.8h, v5.8h, v21.8h\n" - "fmla v11.8h, v5.8h, v22.8h\n" - - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse - "bne 311f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - - "311:\n" - "cmp %[depthwiseActivationMode], %[am_relu6]\n" - "bne 312f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) - "fmin v10.8h, v10.8h, v22.8h\n" - "fmin v11.8h, v11.8h, v22.8h\n" - "fmin v12.8h, v12.8h, v22.8h\n" - - "312:\n" - "cmp %[depthwiseActivationMode], %[am_h_swish]\n" - "bne 313f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three - "fadd v27.8h, v9.8h, v22.8h\n" - "fadd v29.8h, v10.8h, v22.8h\n" - "fadd v30.8h, v11.8h, v22.8h\n" - "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v27.8h, v27.8h, v17.8h\n" - "fmax v29.8h, v29.8h, v17.8h\n" - "fmax v30.8h, v30.8h, v17.8h\n" - "fmax v31.8h, v31.8h, v17.8h\n" - "fmin v27.8h, v27.8h, v22.8h\n" - "fmin v29.8h, v29.8h, v22.8h\n" - "fmin v30.8h, v30.8h, v22.8h\n" - "fmin v31.8h, v31.8h, v22.8h\n" - "fdiv v27.8h, v27.8h, v22.8h\n" - "fdiv v29.8h, v29.8h, v22.8h\n" - "fdiv v30.8h, v30.8h, v22.8h\n" - "fdiv v31.8h, v31.8h, v22.8h\n" - "fmul v9.8h, v27.8h, v9.8h\n" - "fmul v10.8h, v29.8h, v10.8h\n" - "fmul v11.8h, v30.8h, v11.8h\n" - "fmul v12.8h, v31.8h, v12.8h\n" - - "313:\n" - "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - :[out]"+r"(out), - [in_0]"+r"(in0), - [in_1]"+r"(in1) - :[f]"r"(f), - [b]"r"(b), - [w]"r"((I64)ow-8), - [depthwiseActivationMode]"r"((I64)depthwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3" - ); - } - - // pw_conv - for (I32 hw = 0; hw < ohow-7; hw+=8) { - const F16 *b0 = biasArray + ic*8; - const F16 *b1 = b0 + 8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - F16 *in_pack = pwArray + ohow*ic*8; - // pack input - // NCHWc8 => NHWChw8 - for (U32 c = 0; c < ic; c++) { - F16 *in_pack_c8hw8 = in_pack + c*8*8; - // it is 2% faster than in_hw8c8 = ... + hw*8; Amazing! 
- F16 *in_hw8c8 = pwArray + c*ohow*8; - // - // for (U32 c8 = 0; c8 < 8; c8++) { - // for (U32 hw8 = 0; hw8 < 8; hw8++) { - // in_pack_c8hw8[c8*8 + hw8] = in_hw8c8[hw8*8 + c8]; - // } - // } - // - float16x8_t v0 = vld1q_f16(in_hw8c8 + hw*8); - float16x8_t v1 = vld1q_f16(in_hw8c8 + hw*8 + 8); - float16x8_t v2 = vld1q_f16(in_hw8c8 + hw*8 + 8*2); - float16x8_t v3 = vld1q_f16(in_hw8c8 + hw*8 + 8*3); - float16x8_t v4 = vld1q_f16(in_hw8c8 + hw*8 + 8*4); - float16x8_t v5 = vld1q_f16(in_hw8c8 + hw*8 + 8*5); - float16x8_t v6 = vld1q_f16(in_hw8c8 + hw*8 + 8*6); - float16x8_t v7 = vld1q_f16(in_hw8c8 + hw*8 + 8*7); - vst1q_f16(in_pack_c8hw8, - vzip1q_f16( - vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8, - vzip2q_f16( - vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*2, - vzip1q_f16( - vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*3, - vzip2q_f16( - vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*4, - vzip1q_f16( - vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*5, - vzip2q_f16( - vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*6, - vzip1q_f16( - vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*7, - vzip2q_f16( - vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - } - // compute - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "ldr d23, [%[b_1]]\n" //b_o1 - "ldr x2, [%[b_1], #8]\n" - "ins v23.d[1], x2\n" - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr x1, [%[in_0], #8]\n" - "mov v4.16b, v22.16b\n" //out_o0hw2 - "ins v0.d[1], x1\n" - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v6.16b, v22.16b\n" //out_o0hw4 - "ldr x2, [%[f_0], #8]\n" - "mov v7.16b, v22.16b\n" //out_o0hw5 - "ins v18.d[1], x2\n" - "mov v8.16b, v22.16b\n" //out_o0hw6 - "ldr d19, [%[f_0], #16]\n" //f_o1c0 - "mov v9.16b, v22.16b\n" //out_o0hw7 - "ldr x3, [%[f_0], #24]\n" - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ins v19.d[1], x3\n" - "mov v11.16b, v23.16b\n" //out_o1hw1 - "mov v12.16b, v23.16b\n" //out_o1hw2 - "mov v13.16b, v23.16b\n" //out_o1hw3 - "mov v14.16b, v23.16b\n" //out_o1hw4 - "mov v15.16b, v23.16b\n" //out_o1hw5 - "mov v16.16b, v23.16b\n" //out_o1hw6 - "mov v17.16b, v23.16b\n" //out_o1hw7 - "0:\n" - "ldr d1, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr x1, [%[in_0], #24]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "ins v1.d[1], x1\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ldr d20, [%[f_0], #32]\n" //f_o0c0 - "fmla v5.8h, v18.8h, v0.h[3]\n" - "ldr x2, [%[f_0], #40]\n" - "fmla v6.8h, v18.8h, v0.h[4]\n" - "ins v20.d[1], x2\n" - "fmla 
v7.8h, v18.8h, v0.h[5]\n" - "ldr d21, [%[f_0], #48]\n" //f_o1c0 - "fmla v8.8h, v18.8h, v0.h[6]\n" - "ldr x3, [%[f_0], #56]\n" - "fmla v9.8h, v18.8h, v0.h[7]\n" - "ins v21.d[1], x3\n" - "fmla v10.8h, v19.8h, v0.h[0]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - "fmla v14.8h, v19.8h, v0.h[4]\n" - "fmla v15.8h, v19.8h, v0.h[5]\n" - "fmla v16.8h, v19.8h, v0.h[6]\n" - "fmla v17.8h, v19.8h, v0.h[7]\n" - - "ldr d0, [%[in_0], #32]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr x1, [%[in_0], #40]\n" - "fmla v3.8h, v20.8h, v1.h[1]\n" - "ins v0.d[1], x1\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr d18, [%[f_0], #64]\n" //f_o0c0 - "fmla v5.8h, v20.8h, v1.h[3]\n" - "ldr x2, [%[f_0], #72]\n" - "fmla v6.8h, v20.8h, v1.h[4]\n" - "ins v18.d[1], x2\n" - "fmla v7.8h, v20.8h, v1.h[5]\n" - "ldr d19, [%[f_0], #80]\n" //f_o1c0 - "fmla v8.8h, v20.8h, v1.h[6]\n" - "ldr x3, [%[f_0], #88]\n" - "fmla v9.8h, v20.8h, v1.h[7]\n" - "ins v19.d[1], x3\n" - "fmla v10.8h, v21.8h, v1.h[0]\n" - "add %[in_0], %[in_0], #32\n" - "fmla v11.8h, v21.8h, v1.h[1]\n" - "add %[f_0], %[f_0], #64\n" - "fmla v12.8h, v21.8h, v1.h[2]\n" - "subs x0, x0, #2\n" - "fmla v13.8h, v21.8h, v1.h[3]\n" - "fmla v14.8h, v21.8h, v1.h[4]\n" - "fmla v15.8h, v21.8h, v1.h[5]\n" - "fmla v16.8h, v21.8h, v1.h[6]\n" - "fmla v17.8h, v21.8h, v1.h[7]\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - "fmin v6.8h, v6.8h, v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmin v16.8h, v16.8h, v1.8h\n" - "fmin v17.8h, v17.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v19.8h, v2.8h, v18.8h\n" - "fadd v20.8h, v3.8h, v18.8h\n" - "fadd v21.8h, v4.8h, v18.8h\n" - "fadd v22.8h, v5.8h, v18.8h\n" - "fadd v23.8h, v6.8h, v18.8h\n" - "fadd v24.8h, 
v7.8h, v18.8h\n" - "fadd v25.8h, v8.8h, v18.8h\n" - "fadd v26.8h, v9.8h, v18.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmax v23.8h, v23.8h, v0.8h\n" - "fmax v24.8h, v24.8h, v0.8h\n" - "fmax v25.8h, v25.8h, v0.8h\n" - "fmax v26.8h, v26.8h, v0.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fmin v23.8h, v23.8h, v1.8h\n" - "fmin v24.8h, v24.8h, v1.8h\n" - "fmin v25.8h, v25.8h, v1.8h\n" - "fmin v26.8h, v26.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fdiv v21.8h, v21.8h, v1.8h\n" - "fdiv v22.8h, v22.8h, v1.8h\n" - "fdiv v23.8h, v23.8h, v1.8h\n" - "fdiv v24.8h, v24.8h, v1.8h\n" - "fdiv v25.8h, v25.8h, v1.8h\n" - "fdiv v26.8h, v26.8h, v1.8h\n" - "fmul v2.8h, v19.8h, v2.8h\n" - "fmul v3.8h, v20.8h, v3.8h\n" - "fmul v4.8h, v21.8h, v4.8h\n" - "fmul v5.8h, v22.8h, v5.8h\n" - "fmul v6.8h, v23.8h, v6.8h\n" - "fmul v7.8h, v24.8h, v7.8h\n" - "fmul v8.8h, v25.8h, v8.8h\n" - "fmul v9.8h, v26.8h, v9.8h\n" - - "fadd v19.8h, v10.8h, v18.8h\n" - "fadd v20.8h, v11.8h, v18.8h\n" - "fadd v21.8h, v12.8h, v18.8h\n" - "fadd v22.8h, v13.8h, v18.8h\n" - "fadd v23.8h, v14.8h, v18.8h\n" - "fadd v24.8h, v15.8h, v18.8h\n" - "fadd v25.8h, v16.8h, v18.8h\n" - "fadd v26.8h, v17.8h, v18.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmax v23.8h, v23.8h, v0.8h\n" - "fmax v24.8h, v24.8h, v0.8h\n" - "fmax v25.8h, v25.8h, v0.8h\n" - "fmax v26.8h, v26.8h, v0.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fmin v23.8h, v23.8h, v1.8h\n" - "fmin v24.8h, v24.8h, v1.8h\n" - "fmin v25.8h, v25.8h, v1.8h\n" - "fmin v26.8h, v26.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fdiv v21.8h, v21.8h, v1.8h\n" - "fdiv v22.8h, v22.8h, v1.8h\n" - "fdiv v23.8h, v23.8h, v1.8h\n" - "fdiv v24.8h, v24.8h, v1.8h\n" - "fdiv v25.8h, v25.8h, v1.8h\n" - "fdiv v26.8h, v26.8h, v1.8h\n" - "fmul v10.8h, v19.8h, v10.8h\n" - "fmul v11.8h, v20.8h, v11.8h\n" - "fmul v12.8h, v21.8h, v12.8h\n" - "fmul v13.8h, v22.8h, v13.8h\n" - "fmul v14.8h, v23.8h, v14.8h\n" - "fmul v15.8h, v24.8h, v15.8h\n" - "fmul v16.8h, v25.8h, v16.8h\n" - "fmul v17.8h, v26.8h, v17.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - "str q6, [%[out_0], #64]\n" //out_o0hw4 - "str q7, [%[out_0], #80]\n" //out_o0hw5 - "str q8, [%[out_0], #96]\n" //out_o0hw6 - "str q9, [%[out_0], #112]\n" //out_o0hw7 - "str q10, [%[out_1]]\n" //out_o1hw0 - "str q11, [%[out_1], #16]\n" //out_o1hw1 - "str q12, [%[out_1], #32]\n" //out_o1hw2 - "str q13, [%[out_1], #48]\n" //out_o1hw3 - "str q14, [%[out_1], #64]\n" //out_o1hw4 - "str q15, [%[out_1], #80]\n" //out_o1hw5 - "str q16, [%[out_1], #96]\n" //out_o1hw6 - "str q17, [%[out_1], #112]\n" //out_o1hw7 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", 
"v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; - __asm__ __volatile__( - "ldr q12, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" // ic_blk - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v12.16b\n" //out_o0hw0 - "ldr x1, [%[in_0], #8]\n" - "mov v3.16b, v12.16b\n" //out_o0hw1 - "ins v0.d[1], x1\n" - "mov v4.16b, v12.16b\n" //out_o0hw2 - "ldr d10, [%[f_0]]\n" //f_o0c0 - "mov v5.16b, v12.16b\n" //out_o0hw3 - "ldr x2, [%[f_0], #8]\n" - "mov v6.16b, v12.16b\n" //out_o0hw4 - "ins v10.d[1], x2\n" - "mov v7.16b, v12.16b\n" //out_o0hw5 - "mov v8.16b, v12.16b\n" //out_o0hw6 - "mov v9.16b, v12.16b\n" //out_o0hw7 - "0:\n" - "ldr d1, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v10.8h, v0.h[0]\n" - "ldr x1, [%[in_0], #24]\n" - "fmla v3.8h, v10.8h, v0.h[1]\n" - "ins v1.d[1], x1\n" - "fmla v4.8h, v10.8h, v0.h[2]\n" - "ldr d11, [%[f_0], #16]\n" //f_o0c0 - "fmla v5.8h, v10.8h, v0.h[3]\n" - "ldr x2, [%[f_0], #24]\n" - "fmla v6.8h, v10.8h, v0.h[4]\n" - "ins v11.d[1], x2\n" - "fmla v7.8h, v10.8h, v0.h[5]\n" - "subs x0, x0, #2\n" - "fmla v8.8h, v10.8h, v0.h[6]\n" - "fmla v9.8h, v10.8h, v0.h[7]\n" - - "ldr d0, [%[in_0], #32]\n" //in_hw0 - "fmla v2.8h, v11.8h, v1.h[0]\n" - "ldr x1, [%[in_0], #40]\n" - "fmla v3.8h, v11.8h, v1.h[1]\n" - "ins v0.d[1], x1\n" - "fmla v4.8h, v11.8h, v1.h[2]\n" - "ldr d10, [%[f_0], #32]\n" //f_o0c0 - "fmla v5.8h, v11.8h, v1.h[3]\n" - "ldr x2, [%[f_0], #40]\n" - "fmla v6.8h, v11.8h, v1.h[4]\n" - "ins v10.d[1], x2\n" - "fmla v7.8h, v11.8h, v1.h[5]\n" - "add %[in_0], %[in_0], #32\n" - "fmla v8.8h, v11.8h, v1.h[6]\n" - "add %[f_0], %[f_0], #32\n" - "fmla v9.8h, v11.8h, v1.h[7]\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - "fmin v6.8h, v6.8h, v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v10.8h, #0x42, lsl #8\n" //three - "fadd v11.8h, v2.8h, v10.8h\n" - "fadd v12.8h, v3.8h, v10.8h\n" - "fadd v13.8h, v4.8h, v10.8h\n" - "fadd v14.8h, v5.8h, v10.8h\n" - "fadd v15.8h, v6.8h, v10.8h\n" - "fadd v16.8h, v7.8h, v10.8h\n" - "fadd v17.8h, v8.8h, v10.8h\n" - "fadd v18.8h, v9.8h, v10.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, 
v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmax v18.8h, v18.8h, v0.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmin v16.8h, v16.8h, v1.8h\n" - "fmin v17.8h, v17.8h, v1.8h\n" - "fmin v18.8h, v18.8h, v1.8h\n" - "fdiv v11.8h, v11.8h, v1.8h\n" - "fdiv v12.8h, v12.8h, v1.8h\n" - "fdiv v13.8h, v13.8h, v1.8h\n" - "fdiv v14.8h, v14.8h, v1.8h\n" - "fdiv v15.8h, v15.8h, v1.8h\n" - "fdiv v16.8h, v16.8h, v1.8h\n" - "fdiv v17.8h, v17.8h, v1.8h\n" - "fdiv v18.8h, v18.8h, v1.8h\n" - "fmul v2.8h, v11.8h, v2.8h\n" - "fmul v3.8h, v12.8h, v3.8h\n" - "fmul v4.8h, v13.8h, v4.8h\n" - "fmul v5.8h, v14.8h, v5.8h\n" - "fmul v6.8h, v15.8h, v6.8h\n" - "fmul v7.8h, v16.8h, v7.8h\n" - "fmul v8.8h, v17.8h, v8.8h\n" - "fmul v9.8h, v18.8h, v9.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw0 - "str q4, [%[out_0], #32]\n" //out_o0hw0 - "str q5, [%[out_0], #48]\n" //out_o0hw0 - "str q6, [%[out_0], #64]\n" //out_o0hw0 - "str q7, [%[out_0], #80]\n" //out_o0hw0 - "str q8, [%[out_0], #96]\n" //out_o0hw0 - "str q9, [%[out_0], #112]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2" - ); - } - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A76.cpp b/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A76.cpp deleted file mode 100644 index f790ff80..00000000 --- a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A76.cpp +++ /dev/null @@ -1,1549 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
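Both deleted kernels share this activation epilogue, and they materialize the fp16 constants without any memory loads: "movi vN.8h, #0x42, lsl #8" writes 0x4200 to every halfword lane, which is 3.0 in IEEE half precision, and "movi vN.8h, #0x46, lsl #8" writes 0x4600, which is 6.0. The "will be reuse" comments mark v17, v22, v27 and v29-v31 as scratch registers the epilogue may clobber. A minimal scalar sketch of what the three cmp/bne branches compute per lane, with float standing in for F16 and an illustrative enum in place of the engine's ACTIVATION_* values:

#include <algorithm>

// Scalar reference for the activation epilogues in the kernels above and below.
enum class Act { None, Relu, Relu6, HSwish };  // stand-ins for ACTIVATION_*

static float activate(float y, Act mode) {
    switch (mode) {
        case Act::Relu:    // "fmax vD, vD, zero"
            return std::max(y, 0.0f);
        case Act::Relu6:   // "fmax vD, vD, zero" then "fmin vD, vD, six"
            return std::min(std::max(y, 0.0f), 6.0f);
        case Act::HSwish:  // y * relu6(y + 3) / 6, via fadd/fmax/fmin/fdiv/fmul
            return y * std::min(std::max(y + 3.0f, 0.0f), 6.0f) / 6.0f;
        default:
            return y;
    }
}

h-swish is by far the most expensive branch (it needs a vector divide), which is why the epilogue reserves the four extra scratch registers for its relu6(y + 3) intermediates.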
- - -#include "cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h" - -EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - UNUSED(convDesc); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - - if (fdf != DF_CHWC8_NCN16) - CHECK_STATUS(NOT_MATCH); - - oc /= 8; - ic /= 8; - - I32 ohow = oh * ow; - F16 *pwArray = (F16*)tmp; - - for (U32 n = 0; n < in; n++) { - // dw_conv + padding - for (U32 c = 0; c < ic; c++) { - const F16 *b = biasArray + c*8; - F16 *in_c = inArray + c*ih*iw*8; - const F16 *f = filterArray + c*fh*fw*8; - F16 *out = pwArray + c*ohow*8; - F16 *in0 = in_c; - F16 *in1 = in0 + iw*8; - F16 *in2 = in1 + iw*8; - __asm__ __volatile__( - "mov x0, %[w]\n" - "ldr q28, [%[b]]\n" - "ldr q3, [%[f], #48]\n" - "ldr q4, [%[f], #64]\n" - "ldr q5, [%[f], #80]\n" - "ldr q6, [%[f], #96]\n" - "ldr q7, [%[f], #112]\n" - "ldr q8, [%[f], #128]\n" - "ldr q13, [%[in_0]]\n" - "ldr q14, [%[in_0], #16]\n" - "ldr q15, [%[in_0], #32]\n" - "ldr q16, [%[in_0], #48]\n" - "ldr q18, [%[in_1]]\n" - "ldr q19, [%[in_1], #16]\n" - "ldr q20, [%[in_1], #32]\n" - "ldr q21, [%[in_1], #48]\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 - - "ldr q17, [%[in_0], #64]\n" - "fmla v10.8h, v3.8h, v13.8h\n" - "fmla v11.8h, v3.8h, v14.8h\n" - "fmla v12.8h, v3.8h, v15.8h\n" - "ldr q22, [%[in_1], #64]\n" - "fmla v10.8h, v6.8h, v18.8h\n" - "fmla v11.8h, v6.8h, v19.8h\n" - "fmla v12.8h, v6.8h, v20.8h\n" - - "fmla v9.8h, v4.8h, v13.8h\n" - "fmla v10.8h, v4.8h, v14.8h\n" - "fmla v11.8h, v4.8h, v15.8h\n" - "fmla v12.8h, v4.8h, v16.8h\n" - "fmla v9.8h, v7.8h, v18.8h\n" - "fmla v10.8h, v7.8h, v19.8h\n" - "fmla v11.8h, v7.8h, v20.8h\n" - "fmla v12.8h, v7.8h, v21.8h\n" - - "ldr q13, [%[in_0], #80]\n" - "fmla v9.8h, v5.8h, v14.8h\n" - "fmla v10.8h, v5.8h, v15.8h\n" - "fmla v11.8h, v5.8h, v16.8h\n" - "fmla v12.8h, v5.8h, v17.8h\n" - "ldr q18, [%[in_1], #80]\n" - "fmla v9.8h, v8.8h, v19.8h\n" - "fmla v10.8h, v8.8h, v20.8h\n" - "fmla v11.8h, v8.8h, v21.8h\n" - "fmla v12.8h, v8.8h, v22.8h\n" - - "mov v14.16b, v17.16b\n" - "mov v19.16b, v22.16b\n" - "mov v15.16b, v13.16b\n" - "mov v20.16b, v18.16b\n" - "mov v13.16b, v16.16b\n" - "mov v18.16b, v21.16b\n" - "ldr q16, [%[in_0], #96]\n" - "ldr q21, [%[in_1], #96]\n" - "add %[in_0], %[in_0], #48\n" - "add %[in_1], %[in_1], #48\n" - - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse - "bne 111f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - - "111:\n" - "cmp %[depthwiseActivationMode], %[am_relu6]\n" - "bne 112f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, 
v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) - "fmin v10.8h, v10.8h, v22.8h\n" - "fmin v11.8h, v11.8h, v22.8h\n" - "fmin v12.8h, v12.8h, v22.8h\n" - - "112:\n" - "cmp %[depthwiseActivationMode], %[am_h_swish]\n" - "bne 113f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three - "fadd v27.8h, v9.8h, v22.8h\n" - "fadd v29.8h, v10.8h, v22.8h\n" - "fadd v30.8h, v11.8h, v22.8h\n" - "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v27.8h, v27.8h, v17.8h\n" - "fmax v29.8h, v29.8h, v17.8h\n" - "fmax v30.8h, v30.8h, v17.8h\n" - "fmax v31.8h, v31.8h, v17.8h\n" - "fmin v27.8h, v27.8h, v22.8h\n" - "fmin v29.8h, v29.8h, v22.8h\n" - "fmin v30.8h, v30.8h, v22.8h\n" - "fmin v31.8h, v31.8h, v22.8h\n" - "fdiv v27.8h, v27.8h, v22.8h\n" - "fdiv v29.8h, v29.8h, v22.8h\n" - "fdiv v30.8h, v30.8h, v22.8h\n" - "fdiv v31.8h, v31.8h, v22.8h\n" - "fmul v9.8h, v27.8h, v9.8h\n" - "fmul v10.8h, v29.8h, v10.8h\n" - "fmul v11.8h, v30.8h, v11.8h\n" - "fmul v12.8h, v31.8h, v12.8h\n" - - "113:\n" - "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - - "0:\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 - - "ldr q17, [%[in_0], #64]\n" - "fmla v9.8h, v3.8h, v13.8h\n" - "fmla v10.8h, v3.8h, v14.8h\n" - "fmla v11.8h, v3.8h, v15.8h\n" - "ldr q22, [%[in_1], #64]\n" - "fmla v12.8h, v3.8h, v16.8h\n" - "fmla v9.8h, v6.8h, v18.8h\n" - "fmla v10.8h, v6.8h, v19.8h\n" - "fmla v11.8h, v6.8h, v20.8h\n" - "fmla v12.8h, v6.8h, v21.8h\n" - - "ldr q13, [%[in_0], #80]\n" - "fmla v9.8h, v4.8h, v14.8h\n" - "fmla v10.8h, v4.8h, v15.8h\n" - "fmla v11.8h, v4.8h, v16.8h\n" - "ldr q18, [%[in_1], #80]\n" - "fmla v12.8h, v4.8h, v17.8h\n" - "fmla v9.8h, v7.8h, v19.8h\n" - "fmla v10.8h, v7.8h, v20.8h\n" - "fmla v11.8h, v7.8h, v21.8h\n" - "fmla v12.8h, v7.8h, v22.8h\n" - - "ldr q14, [%[in_0], #96]\n" - "fmla v9.8h, v5.8h, v15.8h\n" - "fmla v10.8h, v5.8h, v16.8h\n" - "fmla v11.8h, v5.8h, v17.8h\n" - "ldr q19, [%[in_1], #96]\n" - "fmla v12.8h, v5.8h, v13.8h\n" - "fmla v9.8h, v8.8h, v20.8h\n" - "fmla v10.8h, v8.8h, v21.8h\n" - "fmla v11.8h, v8.8h, v22.8h\n" - "fmla v12.8h, v8.8h, v18.8h\n" - - "ldr q16, [%[in_0], #112]\n" - "mov v15.16b, v14.16b\n" - "mov v20.16b, v19.16b\n" - "mov v14.16b, v13.16b\n" - "ldr q21, [%[in_1], #112]\n" - "mov v19.16b, v18.16b\n" - "mov v13.16b, v17.16b\n" - "mov v18.16b, v22.16b\n" - - "add %[in_0], %[in_0], #64\n" - "add %[in_1], %[in_1], #64\n" - - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse - "bne 211f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - - "211:\n" - "cmp %[depthwiseActivationMode], %[am_relu6]\n" - "bne 212f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) - "fmin v10.8h, v10.8h, v22.8h\n" - "fmin v11.8h, v11.8h, v22.8h\n" - "fmin v12.8h, v12.8h, v22.8h\n" - - "212:\n" - "cmp %[depthwiseActivationMode], %[am_h_swish]\n" - "bne 213f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three - "fadd v27.8h, v9.8h, v22.8h\n" - "fadd v29.8h, v10.8h, v22.8h\n" - "fadd 
v30.8h, v11.8h, v22.8h\n" - "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v27.8h, v27.8h, v17.8h\n" - "fmax v29.8h, v29.8h, v17.8h\n" - "fmax v30.8h, v30.8h, v17.8h\n" - "fmax v31.8h, v31.8h, v17.8h\n" - "fmin v27.8h, v27.8h, v22.8h\n" - "fmin v29.8h, v29.8h, v22.8h\n" - "fmin v30.8h, v30.8h, v22.8h\n" - "fmin v31.8h, v31.8h, v22.8h\n" - "fdiv v27.8h, v27.8h, v22.8h\n" - "fdiv v29.8h, v29.8h, v22.8h\n" - "fdiv v30.8h, v30.8h, v22.8h\n" - "fdiv v31.8h, v31.8h, v22.8h\n" - "fmul v9.8h, v27.8h, v9.8h\n" - "fmul v10.8h, v29.8h, v10.8h\n" - "fmul v11.8h, v30.8h, v11.8h\n" - "fmul v12.8h, v31.8h, v12.8h\n" - - "213:\n" - "subs x0, x0, #4\n" - "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - "bne 0b\n" - - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 - - "ldr q17, [%[in_0], #64]\n" - "fmla v9.8h, v3.8h, v13.8h\n" - "fmla v10.8h, v3.8h, v14.8h\n" - "fmla v11.8h, v3.8h, v15.8h\n" - "fmla v12.8h, v3.8h, v16.8h\n" - "ldr q22, [%[in_1], #64]\n" - "fmla v9.8h, v6.8h, v18.8h\n" - "fmla v10.8h, v6.8h, v19.8h\n" - "fmla v11.8h, v6.8h, v20.8h\n" - "fmla v12.8h, v6.8h, v21.8h\n" - - "fmla v9.8h, v4.8h, v14.8h\n" - "fmla v10.8h, v4.8h, v15.8h\n" - "fmla v11.8h, v4.8h, v16.8h\n" - "fmla v12.8h, v4.8h, v17.8h\n" - "fmla v9.8h, v7.8h, v19.8h\n" - "fmla v10.8h, v7.8h, v20.8h\n" - "fmla v11.8h, v7.8h, v21.8h\n" - "fmla v12.8h, v7.8h, v22.8h\n" - - "fmla v9.8h, v5.8h, v15.8h\n" - "fmla v10.8h, v5.8h, v16.8h\n" - "fmla v11.8h, v5.8h, v17.8h\n" - "fmla v9.8h, v8.8h, v20.8h\n" - "fmla v10.8h, v8.8h, v21.8h\n" - "fmla v11.8h, v8.8h, v22.8h\n" - - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse - "bne 311f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - - "311:\n" - "cmp %[depthwiseActivationMode], %[am_relu6]\n" - "bne 312f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) - "fmin v10.8h, v10.8h, v22.8h\n" - "fmin v11.8h, v11.8h, v22.8h\n" - "fmin v12.8h, v12.8h, v22.8h\n" - - "312:\n" - "cmp %[depthwiseActivationMode], %[am_h_swish]\n" - "bne 313f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three - "fadd v27.8h, v9.8h, v22.8h\n" - "fadd v29.8h, v10.8h, v22.8h\n" - "fadd v30.8h, v11.8h, v22.8h\n" - "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v27.8h, v27.8h, v17.8h\n" - "fmax v29.8h, v29.8h, v17.8h\n" - "fmax v30.8h, v30.8h, v17.8h\n" - "fmax v31.8h, v31.8h, v17.8h\n" - "fmin v27.8h, v27.8h, v22.8h\n" - "fmin v29.8h, v29.8h, v22.8h\n" - "fmin v30.8h, v30.8h, v22.8h\n" - "fmin v31.8h, v31.8h, v22.8h\n" - "fdiv v27.8h, v27.8h, v22.8h\n" - "fdiv v29.8h, v29.8h, v22.8h\n" - "fdiv v30.8h, v30.8h, v22.8h\n" - "fdiv v31.8h, v31.8h, v22.8h\n" - "fmul v9.8h, v27.8h, v9.8h\n" - "fmul v10.8h, v29.8h, v10.8h\n" - "fmul v11.8h, v30.8h, v11.8h\n" - "fmul v12.8h, v31.8h, v12.8h\n" - - "313:\n" - "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - :[out]"+r"(out), - [in_0]"+r"(in0), - [in_1]"+r"(in1) - :[f]"r"(f), - [b]"r"(b), - [w]"r"((I64)ow-8), - [depthwiseActivationMode]"r"((I64)depthwiseActivationDesc.mode), - 
[am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3" - ); - - for (U32 h = 0; h < oh-2; h++) { - in0 = in_c + h*iw*8; - in1 = in0 + iw*8; - in2 = in1 + iw*8; - __asm__ __volatile__( - "mov x0, %[w]\n" - "ldr q28, [%[b]]\n" - "ldr q0, [%[f]]\n" - "ldr q1, [%[f], #16]\n" - "ldr q2, [%[f], #32]\n" - "ldr q3, [%[f], #48]\n" - "ldr q4, [%[f], #64]\n" - "ldr q5, [%[f], #80]\n" - "ldr q6, [%[f], #96]\n" - "ldr q7, [%[f], #112]\n" - "ldr q8, [%[f], #128]\n" - "ldr q13, [%[in_0]]\n" - "ldr q14, [%[in_0], #16]\n" - "ldr q15, [%[in_0], #32]\n" - "ldr q16, [%[in_0], #48]\n" - "ldr q18, [%[in_1]]\n" - "ldr q19, [%[in_1], #16]\n" - "ldr q20, [%[in_1], #32]\n" - "ldr q21, [%[in_1], #48]\n" - "ldr q23, [%[in_2]]\n" - "ldr q24, [%[in_2], #16]\n" - "ldr q25, [%[in_2], #32]\n" - "ldr q26, [%[in_2], #48]\n" - - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 - - "ldr q17, [%[in_0], #64]\n" - "fmla v10.8h, v0.8h, v13.8h\n" - "fmla v11.8h, v0.8h, v14.8h\n" - "fmla v12.8h, v0.8h, v15.8h\n" - "ldr q22, [%[in_1], #64]\n" - "fmla v10.8h, v3.8h, v18.8h\n" - "fmla v11.8h, v3.8h, v19.8h\n" - "fmla v12.8h, v3.8h, v20.8h\n" - "ldr q27, [%[in_2], #64]\n" - "fmla v10.8h, v6.8h, v23.8h\n" - "fmla v11.8h, v6.8h, v24.8h\n" - "fmla v12.8h, v6.8h, v25.8h\n" - - "fmla v9.8h, v1.8h, v13.8h\n" - "fmla v10.8h, v1.8h, v14.8h\n" - "fmla v11.8h, v1.8h, v15.8h\n" - "fmla v12.8h, v1.8h, v16.8h\n" - "fmla v9.8h, v4.8h, v18.8h\n" - "fmla v10.8h, v4.8h, v19.8h\n" - "fmla v11.8h, v4.8h, v20.8h\n" - "fmla v12.8h, v4.8h, v21.8h\n" - "fmla v9.8h, v7.8h, v23.8h\n" - "fmla v10.8h, v7.8h, v24.8h\n" - "fmla v11.8h, v7.8h, v25.8h\n" - "fmla v12.8h, v7.8h, v26.8h\n" - - "ldr q13, [%[in_0], #80]\n" - "fmla v9.8h, v2.8h, v14.8h\n" - "fmla v10.8h, v2.8h, v15.8h\n" - "fmla v11.8h, v2.8h, v16.8h\n" - "fmla v12.8h, v2.8h, v17.8h\n" - "ldr q18, [%[in_1], #80]\n" - "fmla v9.8h, v5.8h, v19.8h\n" - "fmla v10.8h, v5.8h, v20.8h\n" - "fmla v11.8h, v5.8h, v21.8h\n" - "fmla v12.8h, v5.8h, v22.8h\n" - "ldr q23, [%[in_2], #80]\n" - "fmla v9.8h, v8.8h, v24.8h\n" - "fmla v10.8h, v8.8h, v25.8h\n" - "fmla v11.8h, v8.8h, v26.8h\n" - "fmla v12.8h, v8.8h, v27.8h\n" - - "mov v14.16b, v17.16b\n" - "mov v19.16b, v22.16b\n" - "mov v24.16b, v27.16b\n" - "mov v15.16b, v13.16b\n" - "mov v20.16b, v18.16b\n" - "mov v25.16b, v23.16b\n" - "mov v13.16b, v16.16b\n" - "mov v18.16b, v21.16b\n" - "mov v23.16b, v26.16b\n" - "ldr q16, [%[in_0], #96]\n" - "ldr q21, [%[in_1], #96]\n" - "ldr q26, [%[in_2], #96]\n" - "add %[in_0], %[in_0], #48\n" - "add %[in_1], %[in_1], #48\n" - "add %[in_2], %[in_2], #48\n" - - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse - "bne 111f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - - "111:\n" - "cmp %[depthwiseActivationMode], %[am_relu6]\n" - "bne 112f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, 
v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) - "fmin v10.8h, v10.8h, v22.8h\n" - "fmin v11.8h, v11.8h, v22.8h\n" - "fmin v12.8h, v12.8h, v22.8h\n" - - "112:\n" - "cmp %[depthwiseActivationMode], %[am_h_swish]\n" - "bne 113f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three - "fadd v27.8h, v9.8h, v22.8h\n" - "fadd v29.8h, v10.8h, v22.8h\n" - "fadd v30.8h, v11.8h, v22.8h\n" - "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v27.8h, v27.8h, v17.8h\n" - "fmax v29.8h, v29.8h, v17.8h\n" - "fmax v30.8h, v30.8h, v17.8h\n" - "fmax v31.8h, v31.8h, v17.8h\n" - "fmin v27.8h, v27.8h, v22.8h\n" - "fmin v29.8h, v29.8h, v22.8h\n" - "fmin v30.8h, v30.8h, v22.8h\n" - "fmin v31.8h, v31.8h, v22.8h\n" - "fdiv v27.8h, v27.8h, v22.8h\n" - "fdiv v29.8h, v29.8h, v22.8h\n" - "fdiv v30.8h, v30.8h, v22.8h\n" - "fdiv v31.8h, v31.8h, v22.8h\n" - "fmul v9.8h, v27.8h, v9.8h\n" - "fmul v10.8h, v29.8h, v10.8h\n" - "fmul v11.8h, v30.8h, v11.8h\n" - "fmul v12.8h, v31.8h, v12.8h\n" - - "113:\n" - "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - - "0:\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 - - "ldr q17, [%[in_0], #64]\n" - "fmla v9.8h, v0.8h, v13.8h\n" - "fmla v10.8h, v0.8h, v14.8h\n" - "fmla v11.8h, v0.8h, v15.8h\n" - "ldr q22, [%[in_1], #64]\n" - "fmla v12.8h, v0.8h, v16.8h\n" - "fmla v9.8h, v3.8h, v18.8h\n" - "fmla v10.8h, v3.8h, v19.8h\n" - "ldr q27, [%[in_2], #64]\n" - "fmla v11.8h, v3.8h, v20.8h\n" - "fmla v12.8h, v3.8h, v21.8h\n" - "fmla v9.8h, v6.8h, v23.8h\n" - "fmla v10.8h, v6.8h, v24.8h\n" - "fmla v11.8h, v6.8h, v25.8h\n" - "fmla v12.8h, v6.8h, v26.8h\n" - - "ldr q13, [%[in_0], #80]\n" - "fmla v9.8h, v1.8h, v14.8h\n" - "fmla v10.8h, v1.8h, v15.8h\n" - "fmla v11.8h, v1.8h, v16.8h\n" - "ldr q18, [%[in_1], #80]\n" - "fmla v12.8h, v1.8h, v17.8h\n" - "fmla v9.8h, v4.8h, v19.8h\n" - "fmla v10.8h, v4.8h, v20.8h\n" - "ldr q23, [%[in_2], #80]\n" - "fmla v11.8h, v4.8h, v21.8h\n" - "fmla v12.8h, v4.8h, v22.8h\n" - "fmla v9.8h, v7.8h, v24.8h\n" - "fmla v10.8h, v7.8h, v25.8h\n" - "fmla v11.8h, v7.8h, v26.8h\n" - "fmla v12.8h, v7.8h, v27.8h\n" - - "ldr q14, [%[in_0], #96]\n" - "fmla v9.8h, v2.8h, v15.8h\n" - "fmla v10.8h, v2.8h, v16.8h\n" - "fmla v11.8h, v2.8h, v17.8h\n" - "ldr q19, [%[in_1], #96]\n" - "fmla v12.8h, v2.8h, v13.8h\n" - "fmla v9.8h, v5.8h, v20.8h\n" - "fmla v10.8h, v5.8h, v21.8h\n" - "ldr q24, [%[in_2], #96]\n" - "fmla v11.8h, v5.8h, v22.8h\n" - "fmla v12.8h, v5.8h, v18.8h\n" - "fmla v9.8h, v8.8h, v25.8h\n" - "fmla v10.8h, v8.8h, v26.8h\n" - "fmla v11.8h, v8.8h, v27.8h\n" - "fmla v12.8h, v8.8h, v23.8h\n" - - - "ldr q16, [%[in_0], #112]\n" - "mov v15.16b, v14.16b\n" - "mov v20.16b, v19.16b\n" - "mov v25.16b, v24.16b\n" - "ldr q21, [%[in_1], #112]\n" - "mov v14.16b, v13.16b\n" - "mov v19.16b, v18.16b\n" - "mov v24.16b, v23.16b\n" - "ldr q26, [%[in_2], #112]\n" - "mov v13.16b, v17.16b\n" - "mov v18.16b, v22.16b\n" - "mov v23.16b, v27.16b\n" - - "add %[in_0], %[in_0], #64\n" - "add %[in_1], %[in_1], #64\n" - "add %[in_2], %[in_2], #64\n" - - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse - "bne 211f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - - "211:\n" - "cmp %[depthwiseActivationMode], %[am_relu6]\n" - "bne 212f\n" - "eor v17.16b, 
v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) - "fmin v10.8h, v10.8h, v22.8h\n" - "fmin v11.8h, v11.8h, v22.8h\n" - "fmin v12.8h, v12.8h, v22.8h\n" - - "212:\n" - "cmp %[depthwiseActivationMode], %[am_h_swish]\n" - "bne 213f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three - "fadd v27.8h, v9.8h, v22.8h\n" - "fadd v29.8h, v10.8h, v22.8h\n" - "fadd v30.8h, v11.8h, v22.8h\n" - "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v27.8h, v27.8h, v17.8h\n" - "fmax v29.8h, v29.8h, v17.8h\n" - "fmax v30.8h, v30.8h, v17.8h\n" - "fmax v31.8h, v31.8h, v17.8h\n" - "fmin v27.8h, v27.8h, v22.8h\n" - "fmin v29.8h, v29.8h, v22.8h\n" - "fmin v30.8h, v30.8h, v22.8h\n" - "fmin v31.8h, v31.8h, v22.8h\n" - "fdiv v27.8h, v27.8h, v22.8h\n" - "fdiv v29.8h, v29.8h, v22.8h\n" - "fdiv v30.8h, v30.8h, v22.8h\n" - "fdiv v31.8h, v31.8h, v22.8h\n" - "fmul v9.8h, v27.8h, v9.8h\n" - "fmul v10.8h, v29.8h, v10.8h\n" - "fmul v11.8h, v30.8h, v11.8h\n" - "fmul v12.8h, v31.8h, v12.8h\n" - - "213:\n" - "subs x0, x0, #4\n" - "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - "bne 0b\n" - - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 - - "ldr q17, [%[in_0], #64]\n" - "fmla v9.8h, v0.8h, v13.8h\n" - "fmla v10.8h, v0.8h, v14.8h\n" - "fmla v11.8h, v0.8h, v15.8h\n" - "fmla v12.8h, v0.8h, v16.8h\n" - "ldr q22, [%[in_1], #64]\n" - "fmla v9.8h, v3.8h, v18.8h\n" - "fmla v10.8h, v3.8h, v19.8h\n" - "fmla v11.8h, v3.8h, v20.8h\n" - "fmla v12.8h, v3.8h, v21.8h\n" - "ldr q27, [%[in_2], #64]\n" - "fmla v9.8h, v6.8h, v23.8h\n" - "fmla v10.8h, v6.8h, v24.8h\n" - "fmla v11.8h, v6.8h, v25.8h\n" - "fmla v12.8h, v6.8h, v26.8h\n" - - "fmla v9.8h, v1.8h, v14.8h\n" - "fmla v10.8h, v1.8h, v15.8h\n" - "fmla v11.8h, v1.8h, v16.8h\n" - "fmla v12.8h, v1.8h, v17.8h\n" - "fmla v9.8h, v4.8h, v19.8h\n" - "fmla v10.8h, v4.8h, v20.8h\n" - "fmla v11.8h, v4.8h, v21.8h\n" - "fmla v12.8h, v4.8h, v22.8h\n" - "fmla v9.8h, v7.8h, v24.8h\n" - "fmla v10.8h, v7.8h, v25.8h\n" - "fmla v11.8h, v7.8h, v26.8h\n" - "fmla v12.8h, v7.8h, v27.8h\n" - - "fmla v9.8h, v2.8h, v15.8h\n" - "fmla v10.8h, v2.8h, v16.8h\n" - "fmla v11.8h, v2.8h, v17.8h\n" - "fmla v9.8h, v5.8h, v20.8h\n" - "fmla v10.8h, v5.8h, v21.8h\n" - "fmla v11.8h, v5.8h, v22.8h\n" - "fmla v9.8h, v8.8h, v25.8h\n" - "fmla v10.8h, v8.8h, v26.8h\n" - "fmla v11.8h, v8.8h, v27.8h\n" - - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse - "bne 311f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - - "311:\n" - "cmp %[depthwiseActivationMode], %[am_relu6]\n" - "bne 312f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) - "fmin v10.8h, v10.8h, v22.8h\n" - "fmin v11.8h, v11.8h, v22.8h\n" - "fmin v12.8h, v12.8h, v22.8h\n" - - "312:\n" - "cmp %[depthwiseActivationMode], %[am_h_swish]\n" - "bne 313f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, 
#0x42, lsl #8\n" //three - "fadd v27.8h, v9.8h, v22.8h\n" - "fadd v29.8h, v10.8h, v22.8h\n" - "fadd v30.8h, v11.8h, v22.8h\n" - "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v27.8h, v27.8h, v17.8h\n" - "fmax v29.8h, v29.8h, v17.8h\n" - "fmax v30.8h, v30.8h, v17.8h\n" - "fmax v31.8h, v31.8h, v17.8h\n" - "fmin v27.8h, v27.8h, v22.8h\n" - "fmin v29.8h, v29.8h, v22.8h\n" - "fmin v30.8h, v30.8h, v22.8h\n" - "fmin v31.8h, v31.8h, v22.8h\n" - "fdiv v27.8h, v27.8h, v22.8h\n" - "fdiv v29.8h, v29.8h, v22.8h\n" - "fdiv v30.8h, v30.8h, v22.8h\n" - "fdiv v31.8h, v31.8h, v22.8h\n" - "fmul v9.8h, v27.8h, v9.8h\n" - "fmul v10.8h, v29.8h, v10.8h\n" - "fmul v11.8h, v30.8h, v11.8h\n" - "fmul v12.8h, v31.8h, v12.8h\n" - - "313:\n" - "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - :[out]"+r"(out), - [in_0]"+r"(in0), - [in_1]"+r"(in1), - [in_2]"+r"(in2) - :[f]"r"(f), - [b]"r"(b), - [w]"r"((I64)ow-8), - [depthwiseActivationMode]"r"((I64)depthwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3" - ); - } - in0 = in_c + (ih-2)*iw*8; - in1 = in0 + iw*8; - in2 = in1 + iw*8; - __asm__ __volatile__( - "mov x0, %[w]\n" - "ldr q28, [%[b]]\n" - "ldr q0, [%[f]]\n" - "ldr q1, [%[f], #16]\n" - "ldr q2, [%[f], #32]\n" - "ldr q3, [%[f], #48]\n" - "ldr q4, [%[f], #64]\n" - "ldr q5, [%[f], #80]\n" - "ldr q13, [%[in_0]]\n" - "ldr q14, [%[in_0], #16]\n" - "ldr q15, [%[in_0], #32]\n" - "ldr q16, [%[in_0], #48]\n" - "ldr q18, [%[in_1]]\n" - "ldr q19, [%[in_1], #16]\n" - "ldr q20, [%[in_1], #32]\n" - "ldr q21, [%[in_1], #48]\n" - - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 - - "ldr q17, [%[in_0], #64]\n" - "fmla v10.8h, v0.8h, v13.8h\n" - "fmla v11.8h, v0.8h, v14.8h\n" - "fmla v12.8h, v0.8h, v15.8h\n" - "ldr q22, [%[in_1], #64]\n" - "fmla v10.8h, v3.8h, v18.8h\n" - "fmla v11.8h, v3.8h, v19.8h\n" - "fmla v12.8h, v3.8h, v20.8h\n" - - "fmla v9.8h, v1.8h, v13.8h\n" - "fmla v10.8h, v1.8h, v14.8h\n" - "fmla v11.8h, v1.8h, v15.8h\n" - "fmla v12.8h, v1.8h, v16.8h\n" - "fmla v9.8h, v4.8h, v18.8h\n" - "fmla v10.8h, v4.8h, v19.8h\n" - "fmla v11.8h, v4.8h, v20.8h\n" - "fmla v12.8h, v4.8h, v21.8h\n" - - "ldr q13, [%[in_0], #80]\n" - "fmla v9.8h, v2.8h, v14.8h\n" - "fmla v10.8h, v2.8h, v15.8h\n" - "fmla v11.8h, v2.8h, v16.8h\n" - "fmla v12.8h, v2.8h, v17.8h\n" - "ldr q18, [%[in_1], #80]\n" - "fmla v9.8h, v5.8h, v19.8h\n" - "fmla v10.8h, v5.8h, v20.8h\n" - "fmla v11.8h, v5.8h, v21.8h\n" - "fmla v12.8h, v5.8h, v22.8h\n" - - "mov v14.16b, v17.16b\n" - "mov v19.16b, v22.16b\n" - "mov v15.16b, v13.16b\n" - "mov v20.16b, v18.16b\n" - "mov v13.16b, v16.16b\n" - "mov v18.16b, v21.16b\n" - "ldr q16, [%[in_0], #96]\n" - "ldr q21, [%[in_1], #96]\n" - "add %[in_0], %[in_0], #48\n" - "add %[in_1], %[in_1], #48\n" - - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse - "bne 111f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - - "111:\n" - "cmp %[depthwiseActivationMode], %[am_relu6]\n" - "bne 
112f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) - "fmin v10.8h, v10.8h, v22.8h\n" - "fmin v11.8h, v11.8h, v22.8h\n" - "fmin v12.8h, v12.8h, v22.8h\n" - - "112:\n" - "cmp %[depthwiseActivationMode], %[am_h_swish]\n" - "bne 113f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three - "fadd v27.8h, v9.8h, v22.8h\n" - "fadd v29.8h, v10.8h, v22.8h\n" - "fadd v30.8h, v11.8h, v22.8h\n" - "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v27.8h, v27.8h, v17.8h\n" - "fmax v29.8h, v29.8h, v17.8h\n" - "fmax v30.8h, v30.8h, v17.8h\n" - "fmax v31.8h, v31.8h, v17.8h\n" - "fmin v27.8h, v27.8h, v22.8h\n" - "fmin v29.8h, v29.8h, v22.8h\n" - "fmin v30.8h, v30.8h, v22.8h\n" - "fmin v31.8h, v31.8h, v22.8h\n" - "fdiv v27.8h, v27.8h, v22.8h\n" - "fdiv v29.8h, v29.8h, v22.8h\n" - "fdiv v30.8h, v30.8h, v22.8h\n" - "fdiv v31.8h, v31.8h, v22.8h\n" - "fmul v9.8h, v27.8h, v9.8h\n" - "fmul v10.8h, v29.8h, v10.8h\n" - "fmul v11.8h, v30.8h, v11.8h\n" - "fmul v12.8h, v31.8h, v12.8h\n" - - "113:\n" - "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - - "0:\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 - - "ldr q17, [%[in_0], #64]\n" - "fmla v9.8h, v0.8h, v13.8h\n" - "fmla v10.8h, v0.8h, v14.8h\n" - "fmla v11.8h, v0.8h, v15.8h\n" - "ldr q22, [%[in_1], #64]\n" - "fmla v12.8h, v0.8h, v16.8h\n" - "fmla v9.8h, v3.8h, v18.8h\n" - "fmla v10.8h, v3.8h, v19.8h\n" - "fmla v11.8h, v3.8h, v20.8h\n" - "fmla v12.8h, v3.8h, v21.8h\n" - - "ldr q13, [%[in_0], #80]\n" - "fmla v9.8h, v1.8h, v14.8h\n" - "fmla v10.8h, v1.8h, v15.8h\n" - "fmla v11.8h, v1.8h, v16.8h\n" - "ldr q18, [%[in_1], #80]\n" - "fmla v12.8h, v1.8h, v17.8h\n" - "fmla v9.8h, v4.8h, v19.8h\n" - "fmla v10.8h, v4.8h, v20.8h\n" - "fmla v11.8h, v4.8h, v21.8h\n" - "fmla v12.8h, v4.8h, v22.8h\n" - - "ldr q14, [%[in_0], #96]\n" - "fmla v9.8h, v2.8h, v15.8h\n" - "fmla v10.8h, v2.8h, v16.8h\n" - "fmla v11.8h, v2.8h, v17.8h\n" - "ldr q19, [%[in_1], #96]\n" - "fmla v12.8h, v2.8h, v13.8h\n" - "fmla v9.8h, v5.8h, v20.8h\n" - "fmla v10.8h, v5.8h, v21.8h\n" - "fmla v11.8h, v5.8h, v22.8h\n" - "fmla v12.8h, v5.8h, v18.8h\n" - - - "ldr q16, [%[in_0], #112]\n" - "mov v15.16b, v14.16b\n" - "mov v20.16b, v19.16b\n" - "ldr q21, [%[in_1], #112]\n" - "mov v14.16b, v13.16b\n" - "mov v19.16b, v18.16b\n" - "mov v13.16b, v17.16b\n" - "mov v18.16b, v22.16b\n" - - "add %[in_0], %[in_0], #64\n" - "add %[in_1], %[in_1], #64\n" - - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse - "bne 211f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - - "211:\n" - "cmp %[depthwiseActivationMode], %[am_relu6]\n" - "bne 212f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) - "fmin v10.8h, v10.8h, v22.8h\n" - "fmin v11.8h, v11.8h, v22.8h\n" - "fmin v12.8h, v12.8h, v22.8h\n" - - "212:\n" - "cmp %[depthwiseActivationMode], 
%[am_h_swish]\n" - "bne 213f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three - "fadd v27.8h, v9.8h, v22.8h\n" - "fadd v29.8h, v10.8h, v22.8h\n" - "fadd v30.8h, v11.8h, v22.8h\n" - "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v27.8h, v27.8h, v17.8h\n" - "fmax v29.8h, v29.8h, v17.8h\n" - "fmax v30.8h, v30.8h, v17.8h\n" - "fmax v31.8h, v31.8h, v17.8h\n" - "fmin v27.8h, v27.8h, v22.8h\n" - "fmin v29.8h, v29.8h, v22.8h\n" - "fmin v30.8h, v30.8h, v22.8h\n" - "fmin v31.8h, v31.8h, v22.8h\n" - "fdiv v27.8h, v27.8h, v22.8h\n" - "fdiv v29.8h, v29.8h, v22.8h\n" - "fdiv v30.8h, v30.8h, v22.8h\n" - "fdiv v31.8h, v31.8h, v22.8h\n" - "fmul v9.8h, v27.8h, v9.8h\n" - "fmul v10.8h, v29.8h, v10.8h\n" - "fmul v11.8h, v30.8h, v11.8h\n" - "fmul v12.8h, v31.8h, v12.8h\n" - - "213:\n" - "subs x0, x0, #4\n" - "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - "bne 0b\n" - - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 - - "ldr q17, [%[in_0], #64]\n" - "fmla v9.8h, v0.8h, v13.8h\n" - "fmla v10.8h, v0.8h, v14.8h\n" - "fmla v11.8h, v0.8h, v15.8h\n" - "fmla v12.8h, v0.8h, v16.8h\n" - "ldr q22, [%[in_1], #64]\n" - "fmla v9.8h, v3.8h, v18.8h\n" - "fmla v10.8h, v3.8h, v19.8h\n" - "fmla v11.8h, v3.8h, v20.8h\n" - "fmla v12.8h, v3.8h, v21.8h\n" - - "fmla v9.8h, v1.8h, v14.8h\n" - "fmla v10.8h, v1.8h, v15.8h\n" - "fmla v11.8h, v1.8h, v16.8h\n" - "fmla v12.8h, v1.8h, v17.8h\n" - "fmla v9.8h, v4.8h, v19.8h\n" - "fmla v10.8h, v4.8h, v20.8h\n" - "fmla v11.8h, v4.8h, v21.8h\n" - "fmla v12.8h, v4.8h, v22.8h\n" - - "fmla v9.8h, v2.8h, v15.8h\n" - "fmla v10.8h, v2.8h, v16.8h\n" - "fmla v11.8h, v2.8h, v17.8h\n" - "fmla v9.8h, v5.8h, v20.8h\n" - "fmla v10.8h, v5.8h, v21.8h\n" - "fmla v11.8h, v5.8h, v22.8h\n" - - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse - "bne 311f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - - "311:\n" - "cmp %[depthwiseActivationMode], %[am_relu6]\n" - "bne 312f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) - "fmax v10.8h, v10.8h, v17.8h\n" - "fmax v11.8h, v11.8h, v17.8h\n" - "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) - "fmin v10.8h, v10.8h, v22.8h\n" - "fmin v11.8h, v11.8h, v22.8h\n" - "fmin v12.8h, v12.8h, v22.8h\n" - - "312:\n" - "cmp %[depthwiseActivationMode], %[am_h_swish]\n" - "bne 313f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three - "fadd v27.8h, v9.8h, v22.8h\n" - "fadd v29.8h, v10.8h, v22.8h\n" - "fadd v30.8h, v11.8h, v22.8h\n" - "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v27.8h, v27.8h, v17.8h\n" - "fmax v29.8h, v29.8h, v17.8h\n" - "fmax v30.8h, v30.8h, v17.8h\n" - "fmax v31.8h, v31.8h, v17.8h\n" - "fmin v27.8h, v27.8h, v22.8h\n" - "fmin v29.8h, v29.8h, v22.8h\n" - "fmin v30.8h, v30.8h, v22.8h\n" - "fmin v31.8h, v31.8h, v22.8h\n" - "fdiv v27.8h, v27.8h, v22.8h\n" - "fdiv v29.8h, v29.8h, v22.8h\n" - "fdiv v30.8h, v30.8h, v22.8h\n" - "fdiv v31.8h, v31.8h, v22.8h\n" - "fmul v9.8h, v27.8h, v9.8h\n" - "fmul v10.8h, v29.8h, v10.8h\n" - "fmul v11.8h, v30.8h, v11.8h\n" - "fmul v12.8h, v31.8h, v12.8h\n" - - "313:\n" - "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, 
[%[out]], #64\n" - :[out]"+r"(out), - [in_0]"+r"(in0), - [in_1]"+r"(in1) - :[f]"r"(f), - [b]"r"(b), - [w]"r"((I64)ow-8), - [depthwiseActivationMode]"r"((I64)depthwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3" - ); - } - - // pw_conv - for (I32 hw = 0; hw < ohow-7; hw+=8) { - const F16 *b0 = biasArray + ic*8; - const F16 *b1 = b0 + 8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - F16 *in_pack = pwArray + ohow*ic*8; - // pack input - // NCHWc8 => NHWChw8 - for (U32 c = 0; c < ic; c++) { - F16 *in_pack_c8hw8 = in_pack + c*8*8; - // it is 2% faster than in_hw8c8 = ... + hw*8; Amazing! - F16 *in_hw8c8 = pwArray + c*ohow*8; - // - // for (U32 c8 = 0; c8 < 8; c8++) { - // for (U32 hw8 = 0; hw8 < 8; hw8++) { - // in_pack_c8hw8[c8*8 + hw8] = in_hw8c8[hw8*8 + c8]; - // } - // } - // - float16x8_t v0 = vld1q_f16(in_hw8c8 + hw*8); - float16x8_t v1 = vld1q_f16(in_hw8c8 + hw*8 + 8); - float16x8_t v2 = vld1q_f16(in_hw8c8 + hw*8 + 8*2); - float16x8_t v3 = vld1q_f16(in_hw8c8 + hw*8 + 8*3); - float16x8_t v4 = vld1q_f16(in_hw8c8 + hw*8 + 8*4); - float16x8_t v5 = vld1q_f16(in_hw8c8 + hw*8 + 8*5); - float16x8_t v6 = vld1q_f16(in_hw8c8 + hw*8 + 8*6); - float16x8_t v7 = vld1q_f16(in_hw8c8 + hw*8 + 8*7); - vst1q_f16(in_pack_c8hw8, - vzip1q_f16( - vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8, - vzip2q_f16( - vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*2, - vzip1q_f16( - vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*3, - vzip2q_f16( - vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), - vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*4, - vzip1q_f16( - vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*5, - vzip2q_f16( - vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*6, - vzip1q_f16( - vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*7, - vzip2q_f16( - vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), - vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - } - // compute - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "ldr q23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr q0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov v6.16b, v22.16b\n" //out_o0hw4 - "mov v7.16b, v22.16b\n" //out_o0hw5 - "mov v8.16b, v22.16b\n" //out_o0hw6 - "ldr q19, [%[f_0], #16]\n" //f_o1c0 - "mov v9.16b, 
v22.16b\n" //out_o0hw7 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "mov v11.16b, v23.16b\n" //out_o1hw1 - "mov v12.16b, v23.16b\n" //out_o1hw2 - "mov v13.16b, v23.16b\n" //out_o1hw3 - "mov v14.16b, v23.16b\n" //out_o1hw4 - "mov v15.16b, v23.16b\n" //out_o1hw5 - "mov v16.16b, v23.16b\n" //out_o1hw6 - "mov v17.16b, v23.16b\n" //out_o1hw7 - "0:\n" - "ldr q1, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ldr q20, [%[f_0], #32]\n" //f_o0c0 - "fmla v5.8h, v18.8h, v0.h[3]\n" - "fmla v6.8h, v18.8h, v0.h[4]\n" - "fmla v7.8h, v18.8h, v0.h[5]\n" - "ldr q21, [%[f_0], #48]\n" //f_o1c0 - "fmla v8.8h, v18.8h, v0.h[6]\n" - "fmla v9.8h, v18.8h, v0.h[7]\n" - "fmla v10.8h, v19.8h, v0.h[0]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - "fmla v14.8h, v19.8h, v0.h[4]\n" - "fmla v15.8h, v19.8h, v0.h[5]\n" - "fmla v16.8h, v19.8h, v0.h[6]\n" - "fmla v17.8h, v19.8h, v0.h[7]\n" - - "ldr q0, [%[in_0], #32]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "fmla v3.8h, v20.8h, v1.h[1]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr q18, [%[f_0], #64]\n" //f_o0c0 - "fmla v5.8h, v20.8h, v1.h[3]\n" - "fmla v6.8h, v20.8h, v1.h[4]\n" - "fmla v7.8h, v20.8h, v1.h[5]\n" - "ldr q19, [%[f_0], #80]\n" //f_o1c0 - "fmla v8.8h, v20.8h, v1.h[6]\n" - "fmla v9.8h, v20.8h, v1.h[7]\n" - "fmla v10.8h, v21.8h, v1.h[0]\n" - "add %[in_0], %[in_0], #32\n" - "fmla v11.8h, v21.8h, v1.h[1]\n" - "add %[f_0], %[f_0], #64\n" - "fmla v12.8h, v21.8h, v1.h[2]\n" - "subs x0, x0, #2\n" - "fmla v13.8h, v21.8h, v1.h[3]\n" - "fmla v14.8h, v21.8h, v1.h[4]\n" - "fmla v15.8h, v21.8h, v1.h[5]\n" - "fmla v16.8h, v21.8h, v1.h[6]\n" - "fmla v17.8h, v21.8h, v1.h[7]\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - "fmin v6.8h, v6.8h, v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmin v16.8h, v16.8h, v1.8h\n" - "fmin v17.8h, 
v17.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v19.8h, v2.8h, v18.8h\n" - "fadd v20.8h, v3.8h, v18.8h\n" - "fadd v21.8h, v4.8h, v18.8h\n" - "fadd v22.8h, v5.8h, v18.8h\n" - "fadd v23.8h, v6.8h, v18.8h\n" - "fadd v24.8h, v7.8h, v18.8h\n" - "fadd v25.8h, v8.8h, v18.8h\n" - "fadd v26.8h, v9.8h, v18.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmax v23.8h, v23.8h, v0.8h\n" - "fmax v24.8h, v24.8h, v0.8h\n" - "fmax v25.8h, v25.8h, v0.8h\n" - "fmax v26.8h, v26.8h, v0.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fmin v23.8h, v23.8h, v1.8h\n" - "fmin v24.8h, v24.8h, v1.8h\n" - "fmin v25.8h, v25.8h, v1.8h\n" - "fmin v26.8h, v26.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fdiv v21.8h, v21.8h, v1.8h\n" - "fdiv v22.8h, v22.8h, v1.8h\n" - "fdiv v23.8h, v23.8h, v1.8h\n" - "fdiv v24.8h, v24.8h, v1.8h\n" - "fdiv v25.8h, v25.8h, v1.8h\n" - "fdiv v26.8h, v26.8h, v1.8h\n" - "fmul v2.8h, v19.8h, v2.8h\n" - "fmul v3.8h, v20.8h, v3.8h\n" - "fmul v4.8h, v21.8h, v4.8h\n" - "fmul v5.8h, v22.8h, v5.8h\n" - "fmul v6.8h, v23.8h, v6.8h\n" - "fmul v7.8h, v24.8h, v7.8h\n" - "fmul v8.8h, v25.8h, v8.8h\n" - "fmul v9.8h, v26.8h, v9.8h\n" - - "fadd v19.8h, v10.8h, v18.8h\n" - "fadd v20.8h, v11.8h, v18.8h\n" - "fadd v21.8h, v12.8h, v18.8h\n" - "fadd v22.8h, v13.8h, v18.8h\n" - "fadd v23.8h, v14.8h, v18.8h\n" - "fadd v24.8h, v15.8h, v18.8h\n" - "fadd v25.8h, v16.8h, v18.8h\n" - "fadd v26.8h, v17.8h, v18.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmax v23.8h, v23.8h, v0.8h\n" - "fmax v24.8h, v24.8h, v0.8h\n" - "fmax v25.8h, v25.8h, v0.8h\n" - "fmax v26.8h, v26.8h, v0.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fmin v23.8h, v23.8h, v1.8h\n" - "fmin v24.8h, v24.8h, v1.8h\n" - "fmin v25.8h, v25.8h, v1.8h\n" - "fmin v26.8h, v26.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fdiv v21.8h, v21.8h, v1.8h\n" - "fdiv v22.8h, v22.8h, v1.8h\n" - "fdiv v23.8h, v23.8h, v1.8h\n" - "fdiv v24.8h, v24.8h, v1.8h\n" - "fdiv v25.8h, v25.8h, v1.8h\n" - "fdiv v26.8h, v26.8h, v1.8h\n" - "fmul v10.8h, v19.8h, v10.8h\n" - "fmul v11.8h, v20.8h, v11.8h\n" - "fmul v12.8h, v21.8h, v12.8h\n" - "fmul v13.8h, v22.8h, v13.8h\n" - "fmul v14.8h, v23.8h, v14.8h\n" - "fmul v15.8h, v24.8h, v15.8h\n" - "fmul v16.8h, v25.8h, v16.8h\n" - "fmul v17.8h, v26.8h, v17.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - "str q6, [%[out_0], #64]\n" //out_o0hw4 - "str q7, [%[out_0], #80]\n" //out_o0hw5 - "str q8, [%[out_0], #96]\n" //out_o0hw6 - "str q9, [%[out_0], #112]\n" //out_o0hw7 - "str q10, [%[out_1]]\n" //out_o1hw0 - "str q11, [%[out_1], #16]\n" //out_o1hw1 - "str q12, [%[out_1], #32]\n" //out_o1hw2 - "str q13, [%[out_1], #48]\n" //out_o1hw3 - "str q14, [%[out_1], #64]\n" //out_o1hw4 - "str q15, [%[out_1], #80]\n" //out_o1hw5 - "str q16, [%[out_1], #96]\n" //out_o1hw6 - "str q17, [%[out_1], #112]\n" //out_o1hw7 - 
:[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; - __asm__ __volatile__( - "ldr q12, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" // ic_blk - "ldr q0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v12.16b\n" //out_o0hw0 - "mov v3.16b, v12.16b\n" //out_o0hw1 - "mov v4.16b, v12.16b\n" //out_o0hw2 - "ldr q10, [%[f_0]]\n" //f_o0c0 - "mov v5.16b, v12.16b\n" //out_o0hw3 - "mov v6.16b, v12.16b\n" //out_o0hw4 - "mov v7.16b, v12.16b\n" //out_o0hw5 - "mov v8.16b, v12.16b\n" //out_o0hw6 - "mov v9.16b, v12.16b\n" //out_o0hw7 - "0:\n" - "ldr q1, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v10.8h, v0.h[0]\n" - "fmla v3.8h, v10.8h, v0.h[1]\n" - "fmla v4.8h, v10.8h, v0.h[2]\n" - "ldr q11, [%[f_0], #16]\n" //f_o0c0 - "fmla v5.8h, v10.8h, v0.h[3]\n" - "fmla v6.8h, v10.8h, v0.h[4]\n" - "fmla v7.8h, v10.8h, v0.h[5]\n" - "subs x0, x0, #2\n" - "fmla v8.8h, v10.8h, v0.h[6]\n" - "fmla v9.8h, v10.8h, v0.h[7]\n" - - "ldr q0, [%[in_0], #32]\n" //in_hw0 - "fmla v2.8h, v11.8h, v1.h[0]\n" - "fmla v3.8h, v11.8h, v1.h[1]\n" - "fmla v4.8h, v11.8h, v1.h[2]\n" - "ldr q10, [%[f_0], #32]\n" //f_o0c0 - "fmla v5.8h, v11.8h, v1.h[3]\n" - "fmla v6.8h, v11.8h, v1.h[4]\n" - "fmla v7.8h, v11.8h, v1.h[5]\n" - "add %[in_0], %[in_0], #32\n" - "fmla v8.8h, v11.8h, v1.h[6]\n" - "add %[f_0], %[f_0], #32\n" - "fmla v9.8h, v11.8h, v1.h[7]\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - "fmin v6.8h, v6.8h, v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v10.8h, #0x42, lsl #8\n" //three - "fadd v11.8h, v2.8h, v10.8h\n" - "fadd v12.8h, v3.8h, v10.8h\n" - "fadd v13.8h, v4.8h, v10.8h\n" - "fadd v14.8h, v5.8h, v10.8h\n" - "fadd v15.8h, v6.8h, v10.8h\n" - "fadd v16.8h, v7.8h, v10.8h\n" - "fadd v17.8h, v8.8h, v10.8h\n" - "fadd v18.8h, v9.8h, 
v10.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmax v18.8h, v18.8h, v0.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmin v16.8h, v16.8h, v1.8h\n" - "fmin v17.8h, v17.8h, v1.8h\n" - "fmin v18.8h, v18.8h, v1.8h\n" - "fdiv v11.8h, v11.8h, v1.8h\n" - "fdiv v12.8h, v12.8h, v1.8h\n" - "fdiv v13.8h, v13.8h, v1.8h\n" - "fdiv v14.8h, v14.8h, v1.8h\n" - "fdiv v15.8h, v15.8h, v1.8h\n" - "fdiv v16.8h, v16.8h, v1.8h\n" - "fdiv v17.8h, v17.8h, v1.8h\n" - "fdiv v18.8h, v18.8h, v1.8h\n" - "fmul v2.8h, v11.8h, v2.8h\n" - "fmul v3.8h, v12.8h, v3.8h\n" - "fmul v4.8h, v13.8h, v4.8h\n" - "fmul v5.8h, v14.8h, v5.8h\n" - "fmul v6.8h, v15.8h, v6.8h\n" - "fmul v7.8h, v16.8h, v7.8h\n" - "fmul v8.8h, v17.8h, v8.8h\n" - "fmul v9.8h, v18.8h, v9.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw0 - "str q4, [%[out_0], #32]\n" //out_o0hw0 - "str q5, [%[out_0], #48]\n" //out_o0hw0 - "str q6, [%[out_0], #64]\n" //out_o0hw0 - "str q7, [%[out_0], #80]\n" //out_o0hw0 - "str q8, [%[out_0], #96]\n" //out_o0hw0 - "str q9, [%[out_0], #112]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2" - ); - } - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct.h b/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct.h deleted file mode 100644 index 5b1f0a2c..00000000 --- a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct.h +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
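For reference, the h-swish blocks in the assembly above compute x * clamp(x + 3, 0, 6) / 6; the fp16 constants are built with movi immediates (0x4200 is the half-precision encoding of 3.0 and 0x4600 of 6.0, hence "movi v.8h, #0x42, lsl #8" and "#0x46, lsl #8"). A minimal intrinsics restatement of one such block (a sketch of the math, not part of the patch; assumes ARMv8.2-A half-precision arithmetic, e.g. -march=armv8.2-a+fp16):

#include <arm_neon.h>

// h-swish on eight fp16 lanes: x * clamp(x + 3, 0, 6) / 6.
// Mirrors the fadd/fmax/fmin/fdiv/fmul sequence used in the kernels.
static inline float16x8_t hswish_f16(float16x8_t x)
{
    const float16x8_t zero  = vdupq_n_f16((float16_t)0.0);  // asm: eor v.16b, v.16b, v.16b
    const float16x8_t three = vdupq_n_f16((float16_t)3.0);  // asm: movi v.8h, #0x42, lsl #8
    const float16x8_t six   = vdupq_n_f16((float16_t)6.0);  // asm: movi v.8h, #0x46, lsl #8
    float16x8_t t = vaddq_f16(x, three);
    t = vmaxq_f16(t, zero);
    t = vminq_f16(t, six);
    t = vdivq_f16(t, six);
    return vmulq_f16(x, t);
}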
- - -#ifndef _H_DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT -#define _H_DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT - -#include - -#include "sys.h" -#include "type.h" -#include "error.h" -#include "tensor_desc.h" -#include "tensor_computing_type.h" - - -EE depthwise_pointwise_convolution_direct_A55(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc); - -EE depthwise_pointwise_convolution_direct_A76(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc); - -inline EE depthwise_pointwise_convolution_direct(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc, - Arch arch) -{ - EE ret = SUCCESS; - switch (arch) { - case ARM_A55: - ret = depthwise_pointwise_convolution_direct_A55(inputDesc, inArray, - filterDesc, filterArray, - convDesc, - biasDesc, biasArray, - tmpBytes, tmp, - outputDesc, outArray, - depthwiseActivationDesc, - pointwiseActivationDesc); - break; - case ARM_A76: - ret = depthwise_pointwise_convolution_direct_A76(inputDesc, inArray, - filterDesc, filterArray, - convDesc, - biasDesc, biasArray, - tmpBytes, tmp, - outputDesc, outArray, - depthwiseActivationDesc, - pointwiseActivationDesc); - break; - default: - return NOT_SUPPORTED; - } - return ret; -} -#endif diff --git a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A55.cpp b/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A55.cpp deleted file mode 100644 index b28303ce..00000000 --- a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A55.cpp +++ /dev/null @@ -1,1442 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
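For context, the header deleted here funnels both tuned kernels through one inline dispatcher keyed on Arch, so callers never name a core directly and unknown architectures fail with NOT_SUPPORTED. A caller-side sketch (hedged: the argument variables stand for whatever the caller has already prepared, and the Arch value would normally come from CPU detection rather than a literal):

// Hypothetical call site for the dispatcher declared in this header.
Arch arch = ARM_A76;  // placeholder; normally detected at runtime
CHECK_STATUS(depthwise_pointwise_convolution_direct(inputDesc, inArray,
    filterDesc, filterArray, convDesc, biasDesc, biasArray,
    tmpBytes, tmp, outputDesc, outArray,
    depthwiseActivationDesc, pointwiseActivationDesc, arch));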
- - -#include "cpu/arm/fp16/depthwise_pointwise_convolution_direct.h" - -EE depthwise_pointwise_convolution_direct_A55(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (fdf != DF_CHWC8_NCN16) - CHECK_STATUS(NOT_MATCH); - - oc /= 8; - ic /= 8; - - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - U32 ihiw = ih*iw; - I32 ohow = oh*ow; - F16 *pwArray = (F16*)tmp + ic*ih_pad*iw_pad*8; - - for (U32 n = 0; n < in; n++) { - // copy input into a input with padding - F16 *inArray_pad = (F16*)tmp; - F16 *inArray_pad_mov = inArray_pad; - F16 *inArray_mov = inArray + n*ic*ihiw*8; - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(fdt)); - inArray_pad_mov += iw_pad*8; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*8*bytesOf(fdt)); - inArray_pad_mov += paddingL*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*bytesOf(fdt)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, paddingR*8*bytesOf(fdt)); - inArray_pad_mov += paddingR*8; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(fdt)); - inArray_pad_mov += iw_pad*8; - } - } - - // dw_conv - for (U32 c = 0; c < ic ; c++) { - const F16 *b = biasArray + c*8; - F16 *in_pad = inArray_pad + c*ih_pad*iw_pad*8; - const F16 *f = filterArray + c*fh*fw*8; - // ohow / 8 - for (I32 hw = 0; hw < ohow-7; hw+=8) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - U32 in_h_1 = (hw+1)/ow*strideH; - U32 in_w_1 = (hw+1)%ow*strideW; - U32 in_h_2 = (hw+2)/ow*strideH; - U32 in_w_2 = (hw+2)%ow*strideW; - U32 in_h_3 = (hw+3)/ow*strideH; - U32 in_w_3 = (hw+3)%ow*strideW; - U32 in_h_4 = (hw+4)/ow*strideH; - U32 in_w_4 = (hw+4)%ow*strideW; - U32 in_h_5 = (hw+5)/ow*strideH; - U32 in_w_5 = (hw+5)%ow*strideW; - U32 in_h_6 = (hw+6)/ow*strideH; - U32 in_w_6 = (hw+6)%ow*strideW; - U32 in_h_7 = (hw+7)/ow*strideH; - U32 in_w_7 = (hw+7)%ow*strideW; - F16 *pw_pack_0 = pwArray + hw*ic*8 + c*8*8; - //TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. 
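The TODO above flags a real hazard: the tile is split across several __asm__ blocks, so the accumulators v0-v7 must survive the intervening C++ (the in_0..in_7 pointer arithmetic), and nothing stops the compiler from clobbering those vector registers in between. Functionally, the blocks together compute, for one 8-channel group and eight output pixels, acc[i] = bias + sum over filter taps of w[tap] * in[tap, pixel i], all on 8 fp16 lanes. An intrinsics restatement that keeps the accumulators in C++ values instead (a sketch under the surrounding loop's definitions; in_h[i] and in_w[i] stand for the in_h_0..in_h_7 and in_w_0..in_w_7 computed above):

// Single-block rewrite of the depthwise tile; no cross-asm register assumptions.
float16x8_t acc[8];
for (int i = 0; i < 8; i++) {
    acc[i] = vld1q_f16(b);  // start every pixel from the 8-channel bias
}
for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) {
    for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) {
        float16x8_t w = vld1q_f16(f + fh_idx*fw*8 + fw_idx*8);
        const F16 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8;
        for (int i = 0; i < 8; i++) {
            float16x8_t x = vld1q_f16(in_idx + in_h[i]*iw_pad*8 + in_w[i]*8);
            acc[i] = vfmaq_f16(acc[i], x, w);  // same fmla as the asm
        }
    }
}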
- __asm__ __volatile__( - "ldr q8, [%[b]]\n" - "mov v0.16b, v8.16b\n" - "mov v1.16b, v8.16b\n" - "mov v2.16b, v8.16b\n" - "mov v3.16b, v8.16b\n" - "mov v4.16b, v8.16b\n" - "mov v5.16b, v8.16b\n" - "mov v6.16b, v8.16b\n" - "mov v7.16b, v8.16b\n" - : - :[b]"r"(b) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F16 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F16 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F16 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - F16 *in_1 = in_idx + in_h_1*iw_pad*8 + in_w_1*8; - F16 *in_2 = in_idx + in_h_2*iw_pad*8 + in_w_2*8; - F16 *in_3 = in_idx + in_h_3*iw_pad*8 + in_w_3*8; - F16 *in_4 = in_idx + in_h_4*iw_pad*8 + in_w_4*8; - F16 *in_5 = in_idx + in_h_5*iw_pad*8 + in_w_5*8; - F16 *in_6 = in_idx + in_h_6*iw_pad*8 + in_w_6*8; - F16 *in_7 = in_idx + in_h_7*iw_pad*8 + in_w_7*8; - __asm__ __volatile__( - "ldr q17, [%[f0]]\n" - "ldr q9, [%[in0]]\n" - "ldr q10, [%[in1]]\n" - "ldr q11, [%[in2]]\n" - "ldr q12, [%[in3]]\n" - "ldr q13, [%[in4]]\n" - "ldr q14, [%[in5]]\n" - "ldr q15, [%[in6]]\n" - "ldr q16, [%[in7]]\n" - "fmla v0.8h, v9.8h, v17.8h\n" - "fmla v1.8h, v10.8h, v17.8h\n" - "fmla v2.8h, v11.8h, v17.8h\n" - "fmla v3.8h, v12.8h, v17.8h\n" - "fmla v4.8h, v13.8h, v17.8h\n" - "fmla v5.8h, v14.8h, v17.8h\n" - "fmla v6.8h, v15.8h, v17.8h\n" - "fmla v7.8h, v16.8h, v17.8h\n" - : - :[in0]"r"(in_0), - [in1]"r"(in_1), - [in2]"r"(in_2), - [in3]"r"(in_3), - [in4]"r"(in_4), - [in5]"r"(in_5), - [in6]"r"(in_6), - [in7]"r"(in_7), - [f0]"r"(f_0) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v0.8h, v0.8h, v31.8h\n" - "fmax v1.8h, v1.8h, v31.8h\n" - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - "fmax v6.8h, v6.8h, v31.8h\n" - "fmax v7.8h, v7.8h, v31.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v31" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v0.8h, v0.8h, v31.8h\n" - "fmax v1.8h, v1.8h, v31.8h\n" - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - "fmax v6.8h, v6.8h, v31.8h\n" - "fmax v7.8h, v7.8h, v31.8h\n" - "fmin v0.8h, v0.8h, v30.8h\n" - "fmin v1.8h, v1.8h, v30.8h\n" - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - "fmin v6.8h, v6.8h, v30.8h\n" - "fmin v7.8h, v7.8h, v30.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v30", "v31" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "movi v29.8h, #0x42, lsl #8\n" // three - "movi v30.8h, #0x46, lsl #8\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v21.8h, v0.8h, v29.8h\n" - "fadd v22.8h, v1.8h, v29.8h\n" - "fadd v23.8h, v2.8h, v29.8h\n" - "fadd v24.8h, v3.8h, v29.8h\n" - "fadd v25.8h, v4.8h, v29.8h\n" - "fadd v26.8h, v5.8h, v29.8h\n" - "fadd v27.8h, v6.8h, v29.8h\n" - "fadd v28.8h, v7.8h, v29.8h\n" - "fmax v21.8h, v21.8h, v31.8h\n" - "fmax v22.8h, 
v22.8h, v31.8h\n" - "fmax v23.8h, v23.8h, v31.8h\n" - "fmax v24.8h, v24.8h, v31.8h\n" - "fmax v25.8h, v25.8h, v31.8h\n" - "fmax v26.8h, v26.8h, v31.8h\n" - "fmax v27.8h, v27.8h, v31.8h\n" - "fmax v28.8h, v28.8h, v31.8h\n" - "fmin v21.8h, v21.8h, v30.8h\n" - "fmin v22.8h, v22.8h, v30.8h\n" - "fmin v23.8h, v23.8h, v30.8h\n" - "fmin v24.8h, v24.8h, v30.8h\n" - "fmin v25.8h, v25.8h, v30.8h\n" - "fmin v26.8h, v26.8h, v30.8h\n" - "fmin v27.8h, v27.8h, v30.8h\n" - "fmin v28.8h, v28.8h, v30.8h\n" - "fdiv v21.8h, v21.8h, v30.8h\n" - "fdiv v22.8h, v22.8h, v30.8h\n" - "fdiv v23.8h, v23.8h, v30.8h\n" - "fdiv v24.8h, v24.8h, v30.8h\n" - "fdiv v25.8h, v25.8h, v30.8h\n" - "fdiv v26.8h, v26.8h, v30.8h\n" - "fdiv v27.8h, v27.8h, v30.8h\n" - "fdiv v28.8h, v28.8h, v30.8h\n" - "fmul v0.8h, v0.8h, v21.8h\n" - "fmul v1.8h, v1.8h, v22.8h\n" - "fmul v2.8h, v2.8h, v23.8h\n" - "fmul v3.8h, v3.8h, v24.8h\n" - "fmul v4.8h, v4.8h, v25.8h\n" - "fmul v5.8h, v5.8h, v26.8h\n" - "fmul v6.8h, v6.8h, v27.8h\n" - "fmul v7.8h, v7.8h, v28.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "zip1 v8.8h, v0.8h, v4.8h\n" - "zip1 v9.8h, v2.8h, v6.8h\n" - "zip1 v10.8h, v1.8h, v5.8h\n" - "zip1 v11.8h, v3.8h, v7.8h\n" - "zip2 v0.8h, v0.8h, v4.8h\n" - "zip2 v2.8h, v2.8h, v6.8h\n" - "zip2 v1.8h, v1.8h, v5.8h\n" - "zip2 v3.8h, v3.8h, v7.8h\n" - "zip1 v12.8h, v8.8h, v9.8h\n" - "zip1 v13.8h, v10.8h, v11.8h\n" - "zip2 v8.8h, v8.8h, v9.8h\n" - "zip2 v10.8h, v10.8h, v11.8h\n" - "zip1 v14.8h, v0.8h, v2.8h\n" - "zip1 v15.8h, v1.8h, v3.8h\n" - "zip2 v0.8h, v0.8h, v2.8h\n" - "zip2 v1.8h, v1.8h, v3.8h\n" - "zip1 v16.8h, v12.8h, v13.8h\n" - "zip2 v12.8h, v12.8h, v13.8h\n" - "zip1 v17.8h, v8.8h, v10.8h\n" - "zip2 v8.8h, v8.8h, v10.8h\n" - "zip1 v18.8h, v14.8h, v15.8h\n" - "zip2 v14.8h, v14.8h, v15.8h\n" - "zip1 v19.8h, v0.8h, v1.8h\n" - "zip2 v0.8h, v0.8h, v1.8h\n" - "str q16, [%[pw0]]\n" - "str q12, [%[pw0], #16]\n" - "str q17, [%[pw0], #32]\n" - "str q8, [%[pw0], #48]\n" - "str q18, [%[pw0], #64]\n" - "str q14, [%[pw0], #80]\n" - "str q19, [%[pw0], #96]\n" - "str q0, [%[pw0], #112]\n" - :[pw0]"+r"(pw_pack_0) - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19" - ); - } - - // ohow_reminder % 8 / 4 - U32 ohow_s = (ohow / 8) * 8; - for (I32 hw = ohow_s; hw < ohow-3; hw+=4) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - U32 in_h_1 = (hw+1)/ow*strideH; - U32 in_w_1 = (hw+1)%ow*strideW; - U32 in_h_2 = (hw+2)/ow*strideH; - U32 in_w_2 = (hw+2)%ow*strideW; - U32 in_h_3 = (hw+3)/ow*strideH; - U32 in_w_3 = (hw+3)%ow*strideW; - F16 *pw_pack_0 = pwArray + hw*ic*8 + c*8*4; - //TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. 
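The zip1/zip2 ladder above is an 8x8 fp16 transpose: the depthwise stage leaves v0-v7 holding one output pixel each (channel-major), and the pointwise stage wants the eight pixels of each channel contiguous, so three rounds of zips reorder the tile before the stores to pw0. A scalar statement of the same reorder (reference only; "tile" is a hypothetical name for v0-v7 spilled in order, and the loop matches the commented-out scalar version shown earlier in this diff):

// NCHWc8 tile -> NHWChw8 pack: pw_pack_0[c8][hw8] = tile[hw8][c8]
for (int c8 = 0; c8 < 8; c8++) {
    for (int hw8 = 0; hw8 < 8; hw8++) {
        pw_pack_0[c8*8 + hw8] = tile[hw8*8 + c8];
    }
}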
- __asm__ __volatile__( - "ldr q8, [%[b]]\n" - "mov v0.16b, v8.16b\n" - "mov v1.16b, v8.16b\n" - "mov v2.16b, v8.16b\n" - "mov v3.16b, v8.16b\n" - : - :[b]"r"(b) - :"memory", "cc", "v0", "v1", "v2", "v3", "v8" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F16 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F16 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F16 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - F16 *in_1 = in_idx + in_h_1*iw_pad*8 + in_w_1*8; - F16 *in_2 = in_idx + in_h_2*iw_pad*8 + in_w_2*8; - F16 *in_3 = in_idx + in_h_3*iw_pad*8 + in_w_3*8; - __asm__ __volatile__( - "ldr q17, [%[f0]]\n" - "ldr q9, [%[in0]]\n" - "ldr q10, [%[in1]]\n" - "ldr q11, [%[in2]]\n" - "ldr q12, [%[in3]]\n" - "fmla v0.8h, v9.8h, v17.8h\n" - "fmla v1.8h, v10.8h, v17.8h\n" - "fmla v2.8h, v11.8h, v17.8h\n" - "fmla v3.8h, v12.8h, v17.8h\n" - : - :[in0]"r"(in_0), - [in1]"r"(in_1), - [in2]"r"(in_2), - [in3]"r"(in_3), - [f0]"r"(f_0) - :"memory", "cc", "v0", "v1", "v2", "v3", "v9", "v10", "v11", "v12", "v17" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v0.8h, v0.8h, v31.8h\n" - "fmax v1.8h, v1.8h, v31.8h\n" - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v31" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v0.8h, v0.8h, v31.8h\n" - "fmax v1.8h, v1.8h, v31.8h\n" - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmin v0.8h, v0.8h, v30.8h\n" - "fmin v1.8h, v1.8h, v30.8h\n" - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v30", "v31" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "movi v29.8h, #0x42, lsl #8\n" // three - "movi v30.8h, #0x46, lsl #8\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v25.8h, v0.8h, v29.8h\n" - "fadd v26.8h, v1.8h, v29.8h\n" - "fadd v27.8h, v2.8h, v29.8h\n" - "fadd v28.8h, v3.8h, v29.8h\n" - "fmax v25.8h, v25.8h, v31.8h\n" - "fmax v26.8h, v26.8h, v31.8h\n" - "fmax v27.8h, v27.8h, v31.8h\n" - "fmax v28.8h, v28.8h, v31.8h\n" - "fmin v25.8h, v25.8h, v30.8h\n" - "fmin v26.8h, v26.8h, v30.8h\n" - "fmin v27.8h, v27.8h, v30.8h\n" - "fmin v28.8h, v28.8h, v30.8h\n" - "fdiv v25.8h, v25.8h, v30.8h\n" - "fdiv v26.8h, v26.8h, v30.8h\n" - "fdiv v27.8h, v27.8h, v30.8h\n" - "fdiv v28.8h, v28.8h, v30.8h\n" - "fmul v0.8h, v0.8h, v25.8h\n" - "fmul v1.8h, v1.8h, v26.8h\n" - "fmul v2.8h, v2.8h, v27.8h\n" - "fmul v3.8h, v3.8h, v28.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[pw0]]\n" - :[pw0]"+r"(pw_pack_0) - : - :"memory", "cc", "v0", "v1", "v2", "v3" - ); - } - - // ohow_reminder % 4 - ohow_s = (ohow / 4) * 4; - for (I32 hw = ohow_s; hw < ohow; hw++) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - F16 *pw_pack_0 = pwArray + hw*ic*8 + c*8; - //TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. 
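In the 4-pixel remainder above, the same reorder collapses into a single st4, which interleaves lane j of four vectors into consecutive memory, so the c8-major/hw4-minor pack falls out with no explicit zips. Intrinsics equivalent (acc0..acc3 are hypothetical names for the four accumulators v0-v3):

// st4 {v0.8h-v3.8h} writes out[4*j + i] = acc_i[j] for j = 0..7.
float16x8x4_t t;
t.val[0] = acc0;
t.val[1] = acc1;
t.val[2] = acc2;
t.val[3] = acc3;
vst4q_f16(pw_pack_0, t);  // same interleaved store as the asm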
- __asm__ __volatile__( - "ldr q8, [%[b]]\n" - "mov v0.16b, v8.16b\n" - : - :[b]"r"(b) - :"memory", "cc", "v0" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F16 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F16 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F16 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - __asm__ __volatile__( - "ldr q17, [%[f0]]\n" - "ldr q9, [%[in0]]\n" - "fmla v0.8h, v9.8h, v17.8h\n" - : - :[in0]"r"(in_0), - [f0]"r"(f_0) - :"memory", "cc", "v0", "v9", "v17" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v0.8h, v0.8h, v31.8h\n" - : - : - :"memory", "cc", "v0", "v31" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v0.8h, v0.8h, v31.8h\n" - "fmin v0.8h, v0.8h, v30.8h\n" - : - : - :"memory", "cc", "v0", "v30", "v31" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "movi v29.8h, #0x42, lsl #8\n" // three - "movi v30.8h, #0x46, lsl #8\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v28.8h, v0.8h, v29.8h\n" - "fmax v28.8h, v28.8h, v31.8h\n" - "fmin v28.8h, v28.8h, v30.8h\n" - "fdiv v28.8h, v28.8h, v30.8h\n" - "fmul v0.8h, v0.8h, v28.8h\n" - : - : - :"memory", "cc", "v0", "v28", "v29", "v30", "v31" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "str q0, [%[pw0]]\n" - :[pw0]"+r"(pw_pack_0) - : - :"memory", "cc", "v0" - ); - } - } - - // pw_conv - // ohow / 8 - for (I32 hw = 0; hw < ohow-7; hw+=8) { - const F16 *b0 = biasArray + ic*8; - const F16 *b1 = b0 + 8; - F16 *in_pack = pwArray + hw*ic*8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "ldr d23, [%[b_1]]\n" //b_o1 - "ldr x2, [%[b_1], #8]\n" - "ins v23.d[1], x2\n" - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr x1, [%[in_0], #8]\n" - "mov v4.16b, v22.16b\n" //out_o0hw2 - "ins v0.d[1], x1\n" - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v6.16b, v22.16b\n" //out_o0hw4 - "ldr x2, [%[f_0], #8]\n" - "mov v7.16b, v22.16b\n" //out_o0hw5 - "ins v18.d[1], x2\n" - "mov v8.16b, v22.16b\n" //out_o0hw6 - "ldr d19, [%[f_0], #16]\n" //f_o1c0 - "mov v9.16b, v22.16b\n" //out_o0hw7 - "ldr x3, [%[f_0], #24]\n" - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ins v19.d[1], x3\n" - "mov v11.16b, v23.16b\n" //out_o1hw1 - "mov v12.16b, v23.16b\n" //out_o1hw2 - "mov v13.16b, v23.16b\n" //out_o1hw3 - "mov v14.16b, v23.16b\n" //out_o1hw4 - "mov v15.16b, v23.16b\n" //out_o1hw5 - "mov v16.16b, v23.16b\n" //out_o1hw6 - "mov v17.16b, v23.16b\n" //out_o1hw7 - "0:\n" - "ldr d1, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr x1, [%[in_0], #24]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "ins v1.d[1], x1\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ldr d20, [%[f_0], #32]\n" //f_o0c0 - "fmla v5.8h, v18.8h, v0.h[3]\n" - "ldr x2, [%[f_0], #40]\n" - "fmla v6.8h, v18.8h, v0.h[4]\n" - "ins 
v20.d[1], x2\n" - "fmla v7.8h, v18.8h, v0.h[5]\n" - "ldr d21, [%[f_0], #48]\n" //f_o1c0 - "fmla v8.8h, v18.8h, v0.h[6]\n" - "ldr x3, [%[f_0], #56]\n" - "fmla v9.8h, v18.8h, v0.h[7]\n" - "ins v21.d[1], x3\n" - "fmla v10.8h, v19.8h, v0.h[0]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - "fmla v14.8h, v19.8h, v0.h[4]\n" - "fmla v15.8h, v19.8h, v0.h[5]\n" - "fmla v16.8h, v19.8h, v0.h[6]\n" - "fmla v17.8h, v19.8h, v0.h[7]\n" - - "ldr d0, [%[in_0], #32]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr x1, [%[in_0], #40]\n" - "fmla v3.8h, v20.8h, v1.h[1]\n" - "ins v0.d[1], x1\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr d18, [%[f_0], #64]\n" //f_o0c0 - "fmla v5.8h, v20.8h, v1.h[3]\n" - "ldr x2, [%[f_0], #72]\n" - "fmla v6.8h, v20.8h, v1.h[4]\n" - "ins v18.d[1], x2\n" - "fmla v7.8h, v20.8h, v1.h[5]\n" - "ldr d19, [%[f_0], #80]\n" //f_o1c0 - "fmla v8.8h, v20.8h, v1.h[6]\n" - "ldr x3, [%[f_0], #88]\n" - "fmla v9.8h, v20.8h, v1.h[7]\n" - "ins v19.d[1], x3\n" - "fmla v10.8h, v21.8h, v1.h[0]\n" - "add %[in_0], %[in_0], #32\n" - "fmla v11.8h, v21.8h, v1.h[1]\n" - "add %[f_0], %[f_0], #64\n" - "fmla v12.8h, v21.8h, v1.h[2]\n" - "subs x0, x0, #2\n" - "fmla v13.8h, v21.8h, v1.h[3]\n" - "fmla v14.8h, v21.8h, v1.h[4]\n" - "fmla v15.8h, v21.8h, v1.h[5]\n" - "fmla v16.8h, v21.8h, v1.h[6]\n" - "fmla v17.8h, v21.8h, v1.h[7]\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - "fmin v6.8h, v6.8h, v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmin v16.8h, v16.8h, v1.8h\n" - "fmin v17.8h, v17.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v19.8h, v2.8h, v18.8h\n" - "fadd v20.8h, v3.8h, v18.8h\n" - "fadd v21.8h, v4.8h, v18.8h\n" - "fadd v22.8h, v5.8h, v18.8h\n" - "fadd v23.8h, v6.8h, 
v18.8h\n" - "fadd v24.8h, v7.8h, v18.8h\n" - "fadd v25.8h, v8.8h, v18.8h\n" - "fadd v26.8h, v9.8h, v18.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmax v23.8h, v23.8h, v0.8h\n" - "fmax v24.8h, v24.8h, v0.8h\n" - "fmax v25.8h, v25.8h, v0.8h\n" - "fmax v26.8h, v26.8h, v0.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fmin v23.8h, v23.8h, v1.8h\n" - "fmin v24.8h, v24.8h, v1.8h\n" - "fmin v25.8h, v25.8h, v1.8h\n" - "fmin v26.8h, v26.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fdiv v21.8h, v21.8h, v1.8h\n" - "fdiv v22.8h, v22.8h, v1.8h\n" - "fdiv v23.8h, v23.8h, v1.8h\n" - "fdiv v24.8h, v24.8h, v1.8h\n" - "fdiv v25.8h, v25.8h, v1.8h\n" - "fdiv v26.8h, v26.8h, v1.8h\n" - "fmul v2.8h, v19.8h, v2.8h\n" - "fmul v3.8h, v20.8h, v3.8h\n" - "fmul v4.8h, v21.8h, v4.8h\n" - "fmul v5.8h, v22.8h, v5.8h\n" - "fmul v6.8h, v23.8h, v6.8h\n" - "fmul v7.8h, v24.8h, v7.8h\n" - "fmul v8.8h, v25.8h, v8.8h\n" - "fmul v9.8h, v26.8h, v9.8h\n" - - "fadd v19.8h, v10.8h, v18.8h\n" - "fadd v20.8h, v11.8h, v18.8h\n" - "fadd v21.8h, v12.8h, v18.8h\n" - "fadd v22.8h, v13.8h, v18.8h\n" - "fadd v23.8h, v14.8h, v18.8h\n" - "fadd v24.8h, v15.8h, v18.8h\n" - "fadd v25.8h, v16.8h, v18.8h\n" - "fadd v26.8h, v17.8h, v18.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmax v23.8h, v23.8h, v0.8h\n" - "fmax v24.8h, v24.8h, v0.8h\n" - "fmax v25.8h, v25.8h, v0.8h\n" - "fmax v26.8h, v26.8h, v0.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fmin v23.8h, v23.8h, v1.8h\n" - "fmin v24.8h, v24.8h, v1.8h\n" - "fmin v25.8h, v25.8h, v1.8h\n" - "fmin v26.8h, v26.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fdiv v21.8h, v21.8h, v1.8h\n" - "fdiv v22.8h, v22.8h, v1.8h\n" - "fdiv v23.8h, v23.8h, v1.8h\n" - "fdiv v24.8h, v24.8h, v1.8h\n" - "fdiv v25.8h, v25.8h, v1.8h\n" - "fdiv v26.8h, v26.8h, v1.8h\n" - "fmul v10.8h, v19.8h, v10.8h\n" - "fmul v11.8h, v20.8h, v11.8h\n" - "fmul v12.8h, v21.8h, v12.8h\n" - "fmul v13.8h, v22.8h, v13.8h\n" - "fmul v14.8h, v23.8h, v14.8h\n" - "fmul v15.8h, v24.8h, v15.8h\n" - "fmul v16.8h, v25.8h, v16.8h\n" - "fmul v17.8h, v26.8h, v17.8h\n" - - "13:\n" - "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out_0]], #64\n" - "st1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%[out_0]], #64\n" - "st1 {v10.8h, v11.8h, v12.8h, v13.8h}, [%[out_1]], #64\n" - "st1 {v14.8h, v15.8h, v16.8h, v17.8h}, [%[out_1]], #64\n" - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = 
biasArray + ic*8 + (oc-1)*8; - __asm__ __volatile__( - "ldr q12, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" // ic_blk - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v12.16b\n" //out_o0hw0 - "ldr x1, [%[in_0], #8]\n" - "mov v3.16b, v12.16b\n" //out_o0hw1 - "ins v0.d[1], x1\n" - "mov v4.16b, v12.16b\n" //out_o0hw2 - "ldr d10, [%[f_0]]\n" //f_o0c0 - "mov v5.16b, v12.16b\n" //out_o0hw3 - "ldr x2, [%[f_0], #8]\n" - "mov v6.16b, v12.16b\n" //out_o0hw4 - "ins v10.d[1], x2\n" - "mov v7.16b, v12.16b\n" //out_o0hw5 - "mov v8.16b, v12.16b\n" //out_o0hw6 - "mov v9.16b, v12.16b\n" //out_o0hw7 - "0:\n" - "ldr d1, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v10.8h, v0.h[0]\n" - "ldr x1, [%[in_0], #24]\n" - "fmla v3.8h, v10.8h, v0.h[1]\n" - "ins v1.d[1], x1\n" - "fmla v4.8h, v10.8h, v0.h[2]\n" - "ldr d11, [%[f_0], #16]\n" //f_o0c0 - "fmla v5.8h, v10.8h, v0.h[3]\n" - "ldr x2, [%[f_0], #24]\n" - "fmla v6.8h, v10.8h, v0.h[4]\n" - "ins v11.d[1], x2\n" - "fmla v7.8h, v10.8h, v0.h[5]\n" - "subs x0, x0, #2\n" - "fmla v8.8h, v10.8h, v0.h[6]\n" - "fmla v9.8h, v10.8h, v0.h[7]\n" - - "ldr d0, [%[in_0], #32]\n" //in_hw0 - "fmla v2.8h, v11.8h, v1.h[0]\n" - "ldr x1, [%[in_0], #40]\n" - "fmla v3.8h, v11.8h, v1.h[1]\n" - "ins v0.d[1], x1\n" - "fmla v4.8h, v11.8h, v1.h[2]\n" - "ldr d10, [%[f_0], #32]\n" //f_o0c0 - "fmla v5.8h, v11.8h, v1.h[3]\n" - "ldr x2, [%[f_0], #40]\n" - "fmla v6.8h, v11.8h, v1.h[4]\n" - "ins v10.d[1], x2\n" - "fmla v7.8h, v11.8h, v1.h[5]\n" - "add %[in_0], %[in_0], #32\n" - "fmla v8.8h, v11.8h, v1.h[6]\n" - "add %[f_0], %[f_0], #32\n" - "fmla v9.8h, v11.8h, v1.h[7]\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - "fmin v6.8h, v6.8h, v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v10.8h, #0x42, lsl #8\n" //three - "fadd v11.8h, v2.8h, v10.8h\n" - "fadd v12.8h, v3.8h, v10.8h\n" - "fadd v13.8h, v4.8h, v10.8h\n" - "fadd v14.8h, v5.8h, v10.8h\n" - "fadd v15.8h, v6.8h, v10.8h\n" - "fadd v16.8h, v7.8h, v10.8h\n" - "fadd v17.8h, v8.8h, v10.8h\n" - "fadd v18.8h, v9.8h, v10.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmax v18.8h, v18.8h, v0.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmin v16.8h, v16.8h, v1.8h\n" - "fmin v17.8h, v17.8h, v1.8h\n" - 
"fmin v18.8h, v18.8h, v1.8h\n" - "fdiv v11.8h, v11.8h, v1.8h\n" - "fdiv v12.8h, v12.8h, v1.8h\n" - "fdiv v13.8h, v13.8h, v1.8h\n" - "fdiv v14.8h, v14.8h, v1.8h\n" - "fdiv v15.8h, v15.8h, v1.8h\n" - "fdiv v16.8h, v16.8h, v1.8h\n" - "fdiv v17.8h, v17.8h, v1.8h\n" - "fdiv v18.8h, v18.8h, v1.8h\n" - "fmul v2.8h, v11.8h, v2.8h\n" - "fmul v3.8h, v12.8h, v3.8h\n" - "fmul v4.8h, v13.8h, v4.8h\n" - "fmul v5.8h, v14.8h, v5.8h\n" - "fmul v6.8h, v15.8h, v6.8h\n" - "fmul v7.8h, v16.8h, v7.8h\n" - "fmul v8.8h, v17.8h, v8.8h\n" - "fmul v9.8h, v18.8h, v9.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw0 - "str q4, [%[out_0], #32]\n" //out_o0hw0 - "str q5, [%[out_0], #48]\n" //out_o0hw0 - "str q6, [%[out_0], #64]\n" //out_o0hw0 - "str q7, [%[out_0], #80]\n" //out_o0hw0 - "str q8, [%[out_0], #96]\n" //out_o0hw0 - "str q9, [%[out_0], #112]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2" - ); - } - } - - // ohow_remainder % 8 / 4 - U32 ohow_s = (ohow / 8) * 8; - for (I32 hw = ohow_s; hw < ohow-3; hw+=4) { - const F16 *b0 = biasArray + ic*8; - const F16 *b1 = b0 + 8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - F16 *in_pack = pwArray + hw*ic*8; - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "ldr d23, [%[b_1]]\n" //b_o1 - "ldr x2, [%[b_1], #8]\n" - "ins v23.d[1], x2\n" - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "ldr x2, [%[f_0], #8]\n" - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ins v18.d[1], x2\n" - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ldr d19, [%[f_0], #16]\n" //f_o1c0 - "mov v11.16b, v23.16b\n" //out_o1hw1 - "ldr x3, [%[f_0], #24]\n" - "mov v12.16b, v23.16b\n" //out_o1hw2 - "ins v19.d[1], x3\n" - "mov v13.16b, v23.16b\n" //out_o1hw3 - "0:\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], #32]\n" //f_o0c0 - "fmla v3.8h, v18.8h, v0.h[1]\n" - "ldr x2, [%[f_0], #40]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ins v20.d[1], x2\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "ldr d21, [%[f_0], #48]\n" //f_o1c0 - "fmla v10.8h, v19.8h, v0.h[0]\n" - "ldr x3, [%[f_0], #56]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "ins v21.d[1], x3\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "subs x0, x0, #2\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - - "ldr d0, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f_0], #64]\n" //f_o0c0 - "fmla v3.8h, v20.8h, v1.h[1]\n" - "ldr x2, [%[f_0], #72]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr d19, [%[f_0], #80]\n" //f_o1c0 - "fmla v5.8h, v20.8h, v1.h[3]\n" - "ins v18.d[1], x2\n" - "fmla v10.8h, v21.8h, v1.h[0]\n" - "ldr x3, [%[f_0], #88]\n" - "fmla v11.8h, v21.8h, v1.h[1]\n" - "ins v19.d[1], x3\n" - "fmla v12.8h, v21.8h, v1.h[2]\n" - 
"add %[in_0], %[in_0], #16\n" - "fmla v13.8h, v21.8h, v1.h[3]\n" - "add %[f_0], %[f_0], #64\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v6.8h, v2.8h, v18.8h\n" - "fadd v7.8h, v3.8h, v18.8h\n" - "fadd v8.8h, v4.8h, v18.8h\n" - "fadd v9.8h, v5.8h, v18.8h\n" - "fadd v19.8h, v10.8h, v18.8h\n" - "fadd v20.8h, v11.8h, v18.8h\n" - "fadd v21.8h, v12.8h, v18.8h\n" - "fadd v22.8h, v13.8h, v18.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmin v6.8h, v6.8h, v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fdiv v6.8h, v6.8h, v1.8h\n" - "fdiv v7.8h, v7.8h, v1.8h\n" - "fdiv v8.8h, v8.8h, v1.8h\n" - "fdiv v9.8h, v9.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fdiv v21.8h, v21.8h, v1.8h\n" - "fdiv v22.8h, v22.8h, v1.8h\n" - "fmul v2.8h, v6.8h, v2.8h\n" - "fmul v3.8h, v7.8h, v3.8h\n" - "fmul v4.8h, v8.8h, v4.8h\n" - "fmul v5.8h, v9.8h, v5.8h\n" - "fmul v10.8h, v19.8h, v10.8h\n" - "fmul v11.8h, v20.8h, v11.8h\n" - "fmul v12.8h, v21.8h, v12.8h\n" - "fmul v13.8h, v22.8h, v13.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - "str q10, [%[out_1]]\n" //out_o1hw0 - "str q11, [%[out_1], #16]\n" //out_o1hw1 - "str q12, [%[out_1], #32]\n" //out_o1hw2 - "str q13, [%[out_1], #48]\n" //out_o1hw3 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0", "x1", "x2", "x3" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const 
F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "ldr x2, [%[f_0], #8]\n" - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ins v18.d[1], x2\n" - "0:\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], #16]\n" //f_o0c0 - "fmla v3.8h, v18.8h, v0.h[1]\n" - "ldr x2, [%[f_0], #24]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ins v20.d[1], x2\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "subs x0, x0, #2\n" - - "ldr d0, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f_0], #32]\n" //f_o0c0 - "fmla v3.8h, v20.8h, v1.h[1]\n" - "ldr x2, [%[f_0], #40]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ins v18.d[1], x2\n" - "fmla v5.8h, v20.8h, v1.h[3]\n" - "add %[in_0], %[in_0], #16\n" - "add %[f_0], %[f_0], #32\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v6.8h, v2.8h, v18.8h\n" - "fadd v7.8h, v3.8h, v18.8h\n" - "fadd v8.8h, v4.8h, v18.8h\n" - "fadd v9.8h, v5.8h, v18.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmin v6.8h, v6.8h, v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fdiv v6.8h, v6.8h, v1.8h\n" - "fdiv v7.8h, v7.8h, v1.8h\n" - "fdiv v8.8h, v8.8h, v1.8h\n" - "fdiv v9.8h, v9.8h, v1.8h\n" - "fmul v2.8h, v6.8h, v2.8h\n" - "fmul v3.8h, v7.8h, v3.8h\n" - "fmul v4.8h, v8.8h, v4.8h\n" - "fmul v5.8h, v9.8h, v5.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", - "v18", "v20", "v22", "x0", "x1", "x2" - ); - } - } - - // ohow_reminder % 4 - ohow_s = (ohow / 4) * 4; - for (I32 hw = ohow_s; hw < ohow; hw++) { - const F16 *b0 = biasArray + ic*8; - const F16 *b1 = b0 + 8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - F16 *in_pack = pwArray + hw*ic*8; - for (I32 o = 0; o < 
I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "ldr d23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "ldr x2, [%[b_1], #8]\n" - "ins v23.d[1], x2\n" - "ldr h0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ldr x2, [%[f_0], #8]\n" - "ins v18.d[1], x2\n" - "ldr d19, [%[f_0], #16]\n" //f_o1c0 - "ldr x3, [%[f_0], #24]\n" - "ins v19.d[1], x3\n" - "0:\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], #32]\n" //f_o0c0 - "fmla v10.8h, v19.8h, v0.h[0]\n" - "ldr x2, [%[f_0], #40]\n" - "ins v20.d[1], x2\n" - "ldr d21, [%[f_0], #48]\n" //f_o1c0 - "subs x0, x0, #2\n" - "ldr x3, [%[f_0], #56]\n" - "ins v21.d[1], x3\n" - - "ldr h0, [%[in_0], #4]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f_0], #64]\n" //f_o0c0 - "fmla v10.8h, v21.8h, v1.h[0]\n" - "ldr x2, [%[f_0], #72]\n" - "ins v18.d[1], x2\n" - "ldr d19, [%[f_0], #80]\n" //f_o1c0 - "add %[in_0], %[in_0], #4\n" - "ldr x3, [%[f_0], #88]\n" - "ins v19.d[1], x3\n" - "add %[f_0], %[f_0], #64\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v19.8h, v2.8h, v18.8h\n" - "fadd v20.8h, v10.8h, v18.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fmul v2.8h, v19.8h, v2.8h\n" - "fmul v10.8h, v20.8h, v10.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q10, [%[out_1]]\n" //out_o1hw0 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", "v23", "x0", "x1", "x2", "x3" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" //ic_blk - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "ldr h0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "ldr x2, [%[f_0], #8]\n" - "ins v18.d[1], x2\n" - "0:\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], 
#16]\n" //f_o0c0 - "subs x0, x0, #2\n" - "ldr x2, [%[f_0], #24]\n" - "ins v20.d[1], x2\n" - - "ldr h0, [%[in_0], #4]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f_0], #32]\n" //f_o0c0 - "ldr x2, [%[f_0], #40]\n" - "ins v18.d[1], x2\n" - "add %[in_0], %[in_0], #4\n" - "add %[f_0], %[f_0], #32\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v20.8h, v2.8h, v18.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fmul v2.8h, v20.8h, v2.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0", "x1", "x2" - ); - } - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A76.cpp b/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A76.cpp deleted file mode 100644 index 578d4705..00000000 --- a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A76.cpp +++ /dev/null @@ -1,1360 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
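A note on the magic constants that recur throughout these deleted kernels: instructions like "movi v1.8h, #0x46, lsl #8" and "movi v18.8h, #0x42, lsl #8" materialize 6.0 and 3.0 in every half-precision lane, because the IEEE-754 binary16 bit patterns 0x4600 and 0x4200 encode exactly those values. A standalone check (hypothetical helper, not Bolt code) that decodes the patterns:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Decode an IEEE-754 binary16 bit pattern to float.
static float f16_bits_to_f32(uint16_t h) {
    int sign = (h >> 15) ? -1 : 1;
    uint32_t exp = (h >> 10) & 0x1F;
    uint32_t man = h & 0x3FF;
    if (exp == 0) {                    // subnormal: man * 2^-24
        return sign * std::ldexp((float)man, -24);
    }
    if (exp == 31) {                   // infinity / NaN
        return man ? NAN : sign * INFINITY;
    }
    return sign * std::ldexp(1.0f + man / 1024.0f, (int)exp - 15);
}

int main() {
    std::printf("0x4200 -> %g\n", f16_bits_to_f32(0x4200)); // 3.0, used by h-swish
    std::printf("0x4600 -> %g\n", f16_bits_to_f32(0x4600)); // 6.0, used by relu6 and h-swish
    return 0;
}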
- - -#include "cpu/arm/fp16/depthwise_pointwise_convolution_direct.h" - -EE depthwise_pointwise_convolution_direct_A76(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (fdf != DF_CHWC8_NCN16) - CHECK_STATUS(NOT_MATCH); - - oc /= 8; - ic /= 8; - - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - U32 ihiw = ih*iw; - I32 ohow = oh*ow; - F16 *pwArray = (F16*)tmp + ic*ih_pad*iw_pad*8; - - for (U32 n = 0; n < in; n++) { - // copy input into a input with padding - F16 *inArray_pad = (F16*)tmp; - F16 *inArray_pad_mov = inArray_pad; - F16 *inArray_mov = inArray + n*ic*ihiw*8; - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(fdt)); - inArray_pad_mov += iw_pad*8; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*8*bytesOf(fdt)); - inArray_pad_mov += paddingL*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*bytesOf(fdt)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, paddingR*8*bytesOf(fdt)); - inArray_pad_mov += paddingR*8; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(fdt)); - inArray_pad_mov += iw_pad*8; - } - } - - // dw_conv - for (U32 c = 0; c < ic ; c++) { - const F16 *b = biasArray + c*8; - F16 *in_pad = inArray_pad + c*ih_pad*iw_pad*8; - const F16 *f = filterArray + c*fh*fw*8; - // ohow / 8 - for (I32 hw = 0; hw < ohow-7; hw+=8) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - U32 in_h_1 = (hw+1)/ow*strideH; - U32 in_w_1 = (hw+1)%ow*strideW; - U32 in_h_2 = (hw+2)/ow*strideH; - U32 in_w_2 = (hw+2)%ow*strideW; - U32 in_h_3 = (hw+3)/ow*strideH; - U32 in_w_3 = (hw+3)%ow*strideW; - U32 in_h_4 = (hw+4)/ow*strideH; - U32 in_w_4 = (hw+4)%ow*strideW; - U32 in_h_5 = (hw+5)/ow*strideH; - U32 in_w_5 = (hw+5)%ow*strideW; - U32 in_h_6 = (hw+6)/ow*strideH; - U32 in_w_6 = (hw+6)%ow*strideW; - U32 in_h_7 = (hw+7)/ow*strideH; - U32 in_w_7 = (hw+7)%ow*strideW; - F16 *pw_pack_0 = pwArray + hw*ic*8 + c*8*8; - //TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. 
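Before the asm block that follows, the kernel computes, for each of the eight output positions in the tile, the top-left corner of its filter window inside the zero-padded input. A scalar sketch of that index arithmetic (illustrative only; the U32/F16 aliases mirror Bolt's type.h, redeclared here so the snippet stands alone):

#include <cstdint>
typedef uint32_t U32;   // assumption: matches Bolt's type.h alias
typedef __fp16   F16;   // AArch64 half-precision type

// Flattened output position hw maps to the base of its filter window in the
// zero-padded NCHWc8 buffer, with 8 fp16 channels per c-tile.
static inline const F16 *input_window_base(const F16 *in_pad, U32 hw, U32 ow,
                                           U32 strideH, U32 strideW, U32 iw_pad)
{
    U32 in_h = hw / ow * strideH;                // output row -> padded input row
    U32 in_w = hw % ow * strideW;                // output col -> padded input col
    return in_pad + (in_h * iw_pad + in_w) * 8;  // advance by whole c8 tiles
}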
- __asm__ __volatile__( - "ldr q8, [%[b]]\n" - "mov v0.16b, v8.16b\n" - "mov v1.16b, v8.16b\n" - "mov v2.16b, v8.16b\n" - "mov v3.16b, v8.16b\n" - "mov v4.16b, v8.16b\n" - "mov v5.16b, v8.16b\n" - "mov v6.16b, v8.16b\n" - "mov v7.16b, v8.16b\n" - : - :[b]"r"(b) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F16 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F16 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F16 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - F16 *in_1 = in_idx + in_h_1*iw_pad*8 + in_w_1*8; - F16 *in_2 = in_idx + in_h_2*iw_pad*8 + in_w_2*8; - F16 *in_3 = in_idx + in_h_3*iw_pad*8 + in_w_3*8; - F16 *in_4 = in_idx + in_h_4*iw_pad*8 + in_w_4*8; - F16 *in_5 = in_idx + in_h_5*iw_pad*8 + in_w_5*8; - F16 *in_6 = in_idx + in_h_6*iw_pad*8 + in_w_6*8; - F16 *in_7 = in_idx + in_h_7*iw_pad*8 + in_w_7*8; - __asm__ __volatile__( - "ldr q17, [%[f0]]\n" - "ldr q9, [%[in0]]\n" - "ldr q10, [%[in1]]\n" - "ldr q11, [%[in2]]\n" - "ldr q12, [%[in3]]\n" - "ldr q13, [%[in4]]\n" - "ldr q14, [%[in5]]\n" - "ldr q15, [%[in6]]\n" - "ldr q16, [%[in7]]\n" - "fmla v0.8h, v9.8h, v17.8h\n" - "fmla v1.8h, v10.8h, v17.8h\n" - "fmla v2.8h, v11.8h, v17.8h\n" - "fmla v3.8h, v12.8h, v17.8h\n" - "fmla v4.8h, v13.8h, v17.8h\n" - "fmla v5.8h, v14.8h, v17.8h\n" - "fmla v6.8h, v15.8h, v17.8h\n" - "fmla v7.8h, v16.8h, v17.8h\n" - : - :[in0]"r"(in_0), - [in1]"r"(in_1), - [in2]"r"(in_2), - [in3]"r"(in_3), - [in4]"r"(in_4), - [in5]"r"(in_5), - [in6]"r"(in_6), - [in7]"r"(in_7), - [f0]"r"(f_0) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v0.8h, v0.8h, v31.8h\n" - "fmax v1.8h, v1.8h, v31.8h\n" - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - "fmax v6.8h, v6.8h, v31.8h\n" - "fmax v7.8h, v7.8h, v31.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v31" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v0.8h, v0.8h, v31.8h\n" - "fmax v1.8h, v1.8h, v31.8h\n" - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmax v4.8h, v4.8h, v31.8h\n" - "fmax v5.8h, v5.8h, v31.8h\n" - "fmax v6.8h, v6.8h, v31.8h\n" - "fmax v7.8h, v7.8h, v31.8h\n" - "fmin v0.8h, v0.8h, v30.8h\n" - "fmin v1.8h, v1.8h, v30.8h\n" - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - "fmin v4.8h, v4.8h, v30.8h\n" - "fmin v5.8h, v5.8h, v30.8h\n" - "fmin v6.8h, v6.8h, v30.8h\n" - "fmin v7.8h, v7.8h, v30.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v30", "v31" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "movi v29.8h, #0x42, lsl #8\n" // three - "movi v30.8h, #0x46, lsl #8\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v21.8h, v0.8h, v29.8h\n" - "fadd v22.8h, v1.8h, v29.8h\n" - "fadd v23.8h, v2.8h, v29.8h\n" - "fadd v24.8h, v3.8h, v29.8h\n" - "fadd v25.8h, v4.8h, v29.8h\n" - "fadd v26.8h, v5.8h, v29.8h\n" - "fadd v27.8h, v6.8h, v29.8h\n" - "fadd v28.8h, v7.8h, v29.8h\n" - "fmax v21.8h, v21.8h, v31.8h\n" - "fmax v22.8h, 
v22.8h, v31.8h\n" - "fmax v23.8h, v23.8h, v31.8h\n" - "fmax v24.8h, v24.8h, v31.8h\n" - "fmax v25.8h, v25.8h, v31.8h\n" - "fmax v26.8h, v26.8h, v31.8h\n" - "fmax v27.8h, v27.8h, v31.8h\n" - "fmax v28.8h, v28.8h, v31.8h\n" - "fmin v21.8h, v21.8h, v30.8h\n" - "fmin v22.8h, v22.8h, v30.8h\n" - "fmin v23.8h, v23.8h, v30.8h\n" - "fmin v24.8h, v24.8h, v30.8h\n" - "fmin v25.8h, v25.8h, v30.8h\n" - "fmin v26.8h, v26.8h, v30.8h\n" - "fmin v27.8h, v27.8h, v30.8h\n" - "fmin v28.8h, v28.8h, v30.8h\n" - "fdiv v21.8h, v21.8h, v30.8h\n" - "fdiv v22.8h, v22.8h, v30.8h\n" - "fdiv v23.8h, v23.8h, v30.8h\n" - "fdiv v24.8h, v24.8h, v30.8h\n" - "fdiv v25.8h, v25.8h, v30.8h\n" - "fdiv v26.8h, v26.8h, v30.8h\n" - "fdiv v27.8h, v27.8h, v30.8h\n" - "fdiv v28.8h, v28.8h, v30.8h\n" - "fmul v0.8h, v0.8h, v21.8h\n" - "fmul v1.8h, v1.8h, v22.8h\n" - "fmul v2.8h, v2.8h, v23.8h\n" - "fmul v3.8h, v3.8h, v24.8h\n" - "fmul v4.8h, v4.8h, v25.8h\n" - "fmul v5.8h, v5.8h, v26.8h\n" - "fmul v6.8h, v6.8h, v27.8h\n" - "fmul v7.8h, v7.8h, v28.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "zip1 v8.8h, v0.8h, v4.8h\n" - "zip1 v9.8h, v2.8h, v6.8h\n" - "zip1 v10.8h, v1.8h, v5.8h\n" - "zip1 v11.8h, v3.8h, v7.8h\n" - "zip2 v0.8h, v0.8h, v4.8h\n" - "zip2 v2.8h, v2.8h, v6.8h\n" - "zip2 v1.8h, v1.8h, v5.8h\n" - "zip2 v3.8h, v3.8h, v7.8h\n" - "zip1 v12.8h, v8.8h, v9.8h\n" - "zip1 v13.8h, v10.8h, v11.8h\n" - "zip2 v8.8h, v8.8h, v9.8h\n" - "zip2 v10.8h, v10.8h, v11.8h\n" - "zip1 v14.8h, v0.8h, v2.8h\n" - "zip1 v15.8h, v1.8h, v3.8h\n" - "zip2 v0.8h, v0.8h, v2.8h\n" - "zip2 v1.8h, v1.8h, v3.8h\n" - "zip1 v16.8h, v12.8h, v13.8h\n" - "zip2 v12.8h, v12.8h, v13.8h\n" - "zip1 v17.8h, v8.8h, v10.8h\n" - "zip2 v8.8h, v8.8h, v10.8h\n" - "zip1 v18.8h, v14.8h, v15.8h\n" - "zip2 v14.8h, v14.8h, v15.8h\n" - "zip1 v19.8h, v0.8h, v1.8h\n" - "zip2 v0.8h, v0.8h, v1.8h\n" - "str q16, [%[pw0]]\n" - "str q12, [%[pw0], #16]\n" - "str q17, [%[pw0], #32]\n" - "str q8, [%[pw0], #48]\n" - "str q18, [%[pw0], #64]\n" - "str q14, [%[pw0], #80]\n" - "str q19, [%[pw0], #96]\n" - "str q0, [%[pw0], #112]\n" - :[pw0]"+r"(pw_pack_0) - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19" - ); - } - - // ohow_reminder % 8 / 4 - U32 ohow_s = (ohow / 8) * 8; - for (I32 hw = ohow_s; hw < ohow-3; hw+=4) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - U32 in_h_1 = (hw+1)/ow*strideH; - U32 in_w_1 = (hw+1)%ow*strideW; - U32 in_h_2 = (hw+2)/ow*strideH; - U32 in_w_2 = (hw+2)%ow*strideW; - U32 in_h_3 = (hw+3)/ow*strideH; - U32 in_w_3 = (hw+3)%ow*strideW; - F16 *pw_pack_0 = pwArray + hw*ic*8 + c*8*4; - //TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. 
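The zip1/zip2 cascade a few lines up repacks the tile for the pointwise stage: eight vectors, each holding the 8 channels of one output pixel, become eight vectors each holding one channel of all 8 pixels. A hedged intrinsics sketch of one standard way to express such an 8x8 fp16 transpose (the asm reaches an equivalent permutation with a different zip schedule tuned for register pressure):

#include <arm_neon.h>  // requires __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

// On entry r[i] holds the 8 channels of pixel i; on return r[j] holds
// channel j of all 8 pixels.
static inline void transpose8x8_f16(float16x8_t r[8])
{
    // Stage 1: interleave 16-bit lanes of adjacent rows.
    float16x8_t t0 = vzip1q_f16(r[0], r[1]), t1 = vzip2q_f16(r[0], r[1]);
    float16x8_t t2 = vzip1q_f16(r[2], r[3]), t3 = vzip2q_f16(r[2], r[3]);
    float16x8_t t4 = vzip1q_f16(r[4], r[5]), t5 = vzip2q_f16(r[4], r[5]);
    float16x8_t t6 = vzip1q_f16(r[6], r[7]), t7 = vzip2q_f16(r[6], r[7]);
    // Stage 2: interleave 32-bit pairs.
    uint32x4_t u0 = vzip1q_u32(vreinterpretq_u32_f16(t0), vreinterpretq_u32_f16(t2));
    uint32x4_t u1 = vzip2q_u32(vreinterpretq_u32_f16(t0), vreinterpretq_u32_f16(t2));
    uint32x4_t u2 = vzip1q_u32(vreinterpretq_u32_f16(t1), vreinterpretq_u32_f16(t3));
    uint32x4_t u3 = vzip2q_u32(vreinterpretq_u32_f16(t1), vreinterpretq_u32_f16(t3));
    uint32x4_t u4 = vzip1q_u32(vreinterpretq_u32_f16(t4), vreinterpretq_u32_f16(t6));
    uint32x4_t u5 = vzip2q_u32(vreinterpretq_u32_f16(t4), vreinterpretq_u32_f16(t6));
    uint32x4_t u6 = vzip1q_u32(vreinterpretq_u32_f16(t5), vreinterpretq_u32_f16(t7));
    uint32x4_t u7 = vzip2q_u32(vreinterpretq_u32_f16(t5), vreinterpretq_u32_f16(t7));
    // Stage 3: interleave 64-bit halves into the final columns.
    r[0] = vreinterpretq_f16_u64(vzip1q_u64(vreinterpretq_u64_u32(u0), vreinterpretq_u64_u32(u4)));
    r[1] = vreinterpretq_f16_u64(vzip2q_u64(vreinterpretq_u64_u32(u0), vreinterpretq_u64_u32(u4)));
    r[2] = vreinterpretq_f16_u64(vzip1q_u64(vreinterpretq_u64_u32(u1), vreinterpretq_u64_u32(u5)));
    r[3] = vreinterpretq_f16_u64(vzip2q_u64(vreinterpretq_u64_u32(u1), vreinterpretq_u64_u32(u5)));
    r[4] = vreinterpretq_f16_u64(vzip1q_u64(vreinterpretq_u64_u32(u2), vreinterpretq_u64_u32(u6)));
    r[5] = vreinterpretq_f16_u64(vzip2q_u64(vreinterpretq_u64_u32(u2), vreinterpretq_u64_u32(u6)));
    r[6] = vreinterpretq_f16_u64(vzip1q_u64(vreinterpretq_u64_u32(u3), vreinterpretq_u64_u32(u7)));
    r[7] = vreinterpretq_f16_u64(vzip2q_u64(vreinterpretq_u64_u32(u3), vreinterpretq_u64_u32(u7)));
}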
- __asm__ __volatile__( - "ldr q8, [%[b]]\n" - "mov v0.16b, v8.16b\n" - "mov v1.16b, v8.16b\n" - "mov v2.16b, v8.16b\n" - "mov v3.16b, v8.16b\n" - : - :[b]"r"(b) - :"memory", "cc", "v0", "v1", "v2", "v3", "v8" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F16 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F16 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F16 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - F16 *in_1 = in_idx + in_h_1*iw_pad*8 + in_w_1*8; - F16 *in_2 = in_idx + in_h_2*iw_pad*8 + in_w_2*8; - F16 *in_3 = in_idx + in_h_3*iw_pad*8 + in_w_3*8; - __asm__ __volatile__( - "ldr q17, [%[f0]]\n" - "ldr q9, [%[in0]]\n" - "ldr q10, [%[in1]]\n" - "ldr q11, [%[in2]]\n" - "ldr q12, [%[in3]]\n" - "fmla v0.8h, v9.8h, v17.8h\n" - "fmla v1.8h, v10.8h, v17.8h\n" - "fmla v2.8h, v11.8h, v17.8h\n" - "fmla v3.8h, v12.8h, v17.8h\n" - : - :[in0]"r"(in_0), - [in1]"r"(in_1), - [in2]"r"(in_2), - [in3]"r"(in_3), - [f0]"r"(f_0) - :"memory", "cc", "v0", "v1", "v2", "v3", "v9", "v10", "v11", "v12", "v17" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v0.8h, v0.8h, v31.8h\n" - "fmax v1.8h, v1.8h, v31.8h\n" - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v31" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v0.8h, v0.8h, v31.8h\n" - "fmax v1.8h, v1.8h, v31.8h\n" - "fmax v2.8h, v2.8h, v31.8h\n" - "fmax v3.8h, v3.8h, v31.8h\n" - "fmin v0.8h, v0.8h, v30.8h\n" - "fmin v1.8h, v1.8h, v30.8h\n" - "fmin v2.8h, v2.8h, v30.8h\n" - "fmin v3.8h, v3.8h, v30.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v30", "v31" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "movi v29.8h, #0x42, lsl #8\n" // three - "movi v30.8h, #0x46, lsl #8\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v25.8h, v0.8h, v29.8h\n" - "fadd v26.8h, v1.8h, v29.8h\n" - "fadd v27.8h, v2.8h, v29.8h\n" - "fadd v28.8h, v3.8h, v29.8h\n" - "fmax v25.8h, v25.8h, v31.8h\n" - "fmax v26.8h, v26.8h, v31.8h\n" - "fmax v27.8h, v27.8h, v31.8h\n" - "fmax v28.8h, v28.8h, v31.8h\n" - "fmin v25.8h, v25.8h, v30.8h\n" - "fmin v26.8h, v26.8h, v30.8h\n" - "fmin v27.8h, v27.8h, v30.8h\n" - "fmin v28.8h, v28.8h, v30.8h\n" - "fdiv v25.8h, v25.8h, v30.8h\n" - "fdiv v26.8h, v26.8h, v30.8h\n" - "fdiv v27.8h, v27.8h, v30.8h\n" - "fdiv v28.8h, v28.8h, v30.8h\n" - "fmul v0.8h, v0.8h, v25.8h\n" - "fmul v1.8h, v1.8h, v26.8h\n" - "fmul v2.8h, v2.8h, v27.8h\n" - "fmul v3.8h, v3.8h, v28.8h\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[pw0]]\n" - :[pw0]"+r"(pw_pack_0) - : - :"memory", "cc", "v0", "v1", "v2", "v3" - ); - } - - // ohow_reminder % 4 - ohow_s = (ohow / 4) * 4; - for (I32 hw = ohow_s; hw < ohow; hw++) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - F16 *pw_pack_0 = pwArray + hw*ic*8 + c*8; - //TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. 
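The h-swish branches in the activation blocks above all follow the same sequence: fadd by three, fmax against zero, fmin against six, fdiv by six, then fmul by the original value, i.e. y = x * min(max(x + 3, 0), 6) / 6. The same computation with NEON fp16 intrinsics, as a minimal sketch:

#include <arm_neon.h>  // requires __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

// y = x * clamp(x + 3, 0, 6) / 6, over eight fp16 lanes.
static inline float16x8_t hswish_f16(float16x8_t x)
{
    float16x8_t three = vdupq_n_f16(3.0f);  // asm: movi v.8h, #0x42, lsl #8
    float16x8_t six   = vdupq_n_f16(6.0f);  // asm: movi v.8h, #0x46, lsl #8
    float16x8_t zero  = vdupq_n_f16(0.0f);  // asm: eor v.16b, v.16b, v.16b
    float16x8_t t = vaddq_f16(x, three);
    t = vmaxq_f16(t, zero);
    t = vminq_f16(t, six);
    t = vdivq_f16(t, six);                  // exact division, as in the asm's fdiv
    return vmulq_f16(x, t);
}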
- __asm__ __volatile__( - "ldr q8, [%[b]]\n" - "mov v0.16b, v8.16b\n" - : - :[b]"r"(b) - :"memory", "cc", "v0" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F16 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F16 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F16 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - __asm__ __volatile__( - "ldr q17, [%[f0]]\n" - "ldr q9, [%[in0]]\n" - "fmla v0.8h, v9.8h, v17.8h\n" - : - :[in0]"r"(in_0), - [f0]"r"(f_0) - :"memory", "cc", "v0", "v9", "v17" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v0.8h, v0.8h, v31.8h\n" - : - : - :"memory", "cc", "v0", "v31" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "movi v30.8h, #0x46, lsl #8\n" // six - "fmax v0.8h, v0.8h, v31.8h\n" - "fmin v0.8h, v0.8h, v30.8h\n" - : - : - :"memory", "cc", "v0", "v30", "v31" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "movi v29.8h, #0x42, lsl #8\n" // three - "movi v30.8h, #0x46, lsl #8\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v28.8h, v0.8h, v29.8h\n" - "fmax v28.8h, v28.8h, v31.8h\n" - "fmin v28.8h, v28.8h, v30.8h\n" - "fdiv v28.8h, v28.8h, v30.8h\n" - "fmul v0.8h, v0.8h, v28.8h\n" - : - : - :"memory", "cc", "v0", "v28", "v29", "v30", "v31" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "str q0, [%[pw0]]\n" - :[pw0]"+r"(pw_pack_0) - : - :"memory", "cc", "v0" - ); - } - } - - // pw_conv - // ohow / 8 - for (I32 hw = 0; hw < ohow-7; hw+=8) { - const F16 *b0 = biasArray + ic*8; - const F16 *b1 = b0 + 8; - F16 *in_pack = pwArray + hw*ic*8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "ldr q23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr q0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov v6.16b, v22.16b\n" //out_o0hw4 - "mov v7.16b, v22.16b\n" //out_o0hw5 - "mov v8.16b, v22.16b\n" //out_o0hw6 - "ldr q19, [%[f_0], #16]\n" //f_o1c0 - "mov v9.16b, v22.16b\n" //out_o0hw7 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "mov v11.16b, v23.16b\n" //out_o1hw1 - "mov v12.16b, v23.16b\n" //out_o1hw2 - "mov v13.16b, v23.16b\n" //out_o1hw3 - "mov v14.16b, v23.16b\n" //out_o1hw4 - "mov v15.16b, v23.16b\n" //out_o1hw5 - "mov v16.16b, v23.16b\n" //out_o1hw6 - "mov v17.16b, v23.16b\n" //out_o1hw7 - "0:\n" - "ldr q1, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ldr q20, [%[f_0], #32]\n" //f_o0c0 - "fmla v5.8h, v18.8h, v0.h[3]\n" - "fmla v6.8h, v18.8h, v0.h[4]\n" - "fmla v7.8h, v18.8h, v0.h[5]\n" - "ldr q21, [%[f_0], #48]\n" //f_o1c0 - "fmla v8.8h, v18.8h, v0.h[6]\n" - "fmla v9.8h, v18.8h, v0.h[7]\n" - "fmla v10.8h, v19.8h, v0.h[0]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - "fmla v14.8h, v19.8h, v0.h[4]\n" - "fmla v15.8h, v19.8h, 
v0.h[5]\n" - "fmla v16.8h, v19.8h, v0.h[6]\n" - "fmla v17.8h, v19.8h, v0.h[7]\n" - - "ldr q0, [%[in_0], #32]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "fmla v3.8h, v20.8h, v1.h[1]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr q18, [%[f_0], #64]\n" //f_o0c0 - "fmla v5.8h, v20.8h, v1.h[3]\n" - "fmla v6.8h, v20.8h, v1.h[4]\n" - "fmla v7.8h, v20.8h, v1.h[5]\n" - "ldr q19, [%[f_0], #80]\n" //f_o1c0 - "fmla v8.8h, v20.8h, v1.h[6]\n" - "fmla v9.8h, v20.8h, v1.h[7]\n" - "fmla v10.8h, v21.8h, v1.h[0]\n" - "add %[in_0], %[in_0], #32\n" - "fmla v11.8h, v21.8h, v1.h[1]\n" - "add %[f_0], %[f_0], #64\n" - "fmla v12.8h, v21.8h, v1.h[2]\n" - "subs x0, x0, #2\n" - "fmla v13.8h, v21.8h, v1.h[3]\n" - "fmla v14.8h, v21.8h, v1.h[4]\n" - "fmla v15.8h, v21.8h, v1.h[5]\n" - "fmla v16.8h, v21.8h, v1.h[6]\n" - "fmla v17.8h, v21.8h, v1.h[7]\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - "fmin v6.8h, v6.8h, v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmin v16.8h, v16.8h, v1.8h\n" - "fmin v17.8h, v17.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v19.8h, v2.8h, v18.8h\n" - "fadd v20.8h, v3.8h, v18.8h\n" - "fadd v21.8h, v4.8h, v18.8h\n" - "fadd v22.8h, v5.8h, v18.8h\n" - "fadd v23.8h, v6.8h, v18.8h\n" - "fadd v24.8h, v7.8h, v18.8h\n" - "fadd v25.8h, v8.8h, v18.8h\n" - "fadd v26.8h, v9.8h, v18.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmax v23.8h, v23.8h, v0.8h\n" - "fmax v24.8h, v24.8h, v0.8h\n" - "fmax v25.8h, v25.8h, v0.8h\n" - "fmax v26.8h, v26.8h, v0.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fmin v23.8h, v23.8h, v1.8h\n" - "fmin v24.8h, v24.8h, 
v1.8h\n" - "fmin v25.8h, v25.8h, v1.8h\n" - "fmin v26.8h, v26.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fdiv v21.8h, v21.8h, v1.8h\n" - "fdiv v22.8h, v22.8h, v1.8h\n" - "fdiv v23.8h, v23.8h, v1.8h\n" - "fdiv v24.8h, v24.8h, v1.8h\n" - "fdiv v25.8h, v25.8h, v1.8h\n" - "fdiv v26.8h, v26.8h, v1.8h\n" - "fmul v2.8h, v19.8h, v2.8h\n" - "fmul v3.8h, v20.8h, v3.8h\n" - "fmul v4.8h, v21.8h, v4.8h\n" - "fmul v5.8h, v22.8h, v5.8h\n" - "fmul v6.8h, v23.8h, v6.8h\n" - "fmul v7.8h, v24.8h, v7.8h\n" - "fmul v8.8h, v25.8h, v8.8h\n" - "fmul v9.8h, v26.8h, v9.8h\n" - - "fadd v19.8h, v10.8h, v18.8h\n" - "fadd v20.8h, v11.8h, v18.8h\n" - "fadd v21.8h, v12.8h, v18.8h\n" - "fadd v22.8h, v13.8h, v18.8h\n" - "fadd v23.8h, v14.8h, v18.8h\n" - "fadd v24.8h, v15.8h, v18.8h\n" - "fadd v25.8h, v16.8h, v18.8h\n" - "fadd v26.8h, v17.8h, v18.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmax v23.8h, v23.8h, v0.8h\n" - "fmax v24.8h, v24.8h, v0.8h\n" - "fmax v25.8h, v25.8h, v0.8h\n" - "fmax v26.8h, v26.8h, v0.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fmin v23.8h, v23.8h, v1.8h\n" - "fmin v24.8h, v24.8h, v1.8h\n" - "fmin v25.8h, v25.8h, v1.8h\n" - "fmin v26.8h, v26.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fdiv v21.8h, v21.8h, v1.8h\n" - "fdiv v22.8h, v22.8h, v1.8h\n" - "fdiv v23.8h, v23.8h, v1.8h\n" - "fdiv v24.8h, v24.8h, v1.8h\n" - "fdiv v25.8h, v25.8h, v1.8h\n" - "fdiv v26.8h, v26.8h, v1.8h\n" - "fmul v10.8h, v19.8h, v10.8h\n" - "fmul v11.8h, v20.8h, v11.8h\n" - "fmul v12.8h, v21.8h, v12.8h\n" - "fmul v13.8h, v22.8h, v13.8h\n" - "fmul v14.8h, v23.8h, v14.8h\n" - "fmul v15.8h, v24.8h, v15.8h\n" - "fmul v16.8h, v25.8h, v16.8h\n" - "fmul v17.8h, v26.8h, v17.8h\n" - - "13:\n" - "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out_0]], #64\n" - "st1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%[out_0]], #64\n" - "st1 {v10.8h, v11.8h, v12.8h, v13.8h}, [%[out_1]], #64\n" - "st1 {v14.8h, v15.8h, v16.8h, v17.8h}, [%[out_1]], #64\n" - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; - __asm__ __volatile__( - "ldr q12, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" // ic_blk - "ldr q0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v12.16b\n" //out_o0hw0 - "mov v3.16b, v12.16b\n" //out_o0hw1 - "mov v4.16b, v12.16b\n" //out_o0hw2 - "ldr q10, [%[f_0]]\n" //f_o0c0 - "mov v5.16b, v12.16b\n" //out_o0hw3 - "mov v6.16b, v12.16b\n" //out_o0hw4 - "mov v7.16b, v12.16b\n" //out_o0hw5 - "mov v8.16b, v12.16b\n" //out_o0hw6 - "mov v9.16b, v12.16b\n" //out_o0hw7 - "0:\n" - "ldr q1, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v10.8h, v0.h[0]\n" 
- "fmla v3.8h, v10.8h, v0.h[1]\n" - "fmla v4.8h, v10.8h, v0.h[2]\n" - "ldr q11, [%[f_0], #16]\n" //f_o0c0 - "fmla v5.8h, v10.8h, v0.h[3]\n" - "fmla v6.8h, v10.8h, v0.h[4]\n" - "fmla v7.8h, v10.8h, v0.h[5]\n" - "subs x0, x0, #2\n" - "fmla v8.8h, v10.8h, v0.h[6]\n" - "fmla v9.8h, v10.8h, v0.h[7]\n" - - "ldr q0, [%[in_0], #32]\n" //in_hw0 - "fmla v2.8h, v11.8h, v1.h[0]\n" - "fmla v3.8h, v11.8h, v1.h[1]\n" - "fmla v4.8h, v11.8h, v1.h[2]\n" - "ldr q10, [%[f_0], #32]\n" //f_o0c0 - "fmla v5.8h, v11.8h, v1.h[3]\n" - "fmla v6.8h, v11.8h, v1.h[4]\n" - "fmla v7.8h, v11.8h, v1.h[5]\n" - "add %[in_0], %[in_0], #32\n" - "fmla v8.8h, v11.8h, v1.h[6]\n" - "add %[f_0], %[f_0], #32\n" - "fmla v9.8h, v11.8h, v1.h[7]\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - "fmin v6.8h, v6.8h, v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v10.8h, #0x42, lsl #8\n" //three - "fadd v11.8h, v2.8h, v10.8h\n" - "fadd v12.8h, v3.8h, v10.8h\n" - "fadd v13.8h, v4.8h, v10.8h\n" - "fadd v14.8h, v5.8h, v10.8h\n" - "fadd v15.8h, v6.8h, v10.8h\n" - "fadd v16.8h, v7.8h, v10.8h\n" - "fadd v17.8h, v8.8h, v10.8h\n" - "fadd v18.8h, v9.8h, v10.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmax v18.8h, v18.8h, v0.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmin v16.8h, v16.8h, v1.8h\n" - "fmin v17.8h, v17.8h, v1.8h\n" - "fmin v18.8h, v18.8h, v1.8h\n" - "fdiv v11.8h, v11.8h, v1.8h\n" - "fdiv v12.8h, v12.8h, v1.8h\n" - "fdiv v13.8h, v13.8h, v1.8h\n" - "fdiv v14.8h, v14.8h, v1.8h\n" - "fdiv v15.8h, v15.8h, v1.8h\n" - "fdiv v16.8h, v16.8h, v1.8h\n" - "fdiv v17.8h, v17.8h, v1.8h\n" - "fdiv v18.8h, v18.8h, v1.8h\n" - "fmul v2.8h, v11.8h, v2.8h\n" - "fmul v3.8h, v12.8h, v3.8h\n" - "fmul v4.8h, v13.8h, v4.8h\n" - "fmul v5.8h, v14.8h, v5.8h\n" - "fmul v6.8h, v15.8h, v6.8h\n" - "fmul v7.8h, v16.8h, v7.8h\n" - "fmul v8.8h, v17.8h, v8.8h\n" - "fmul v9.8h, v18.8h, v9.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw0 - "str q4, [%[out_0], #32]\n" //out_o0hw0 - "str q5, [%[out_0], #48]\n" //out_o0hw0 - "str q6, [%[out_0], #64]\n" //out_o0hw0 - "str q7, [%[out_0], #80]\n" //out_o0hw0 - "str q8, [%[out_0], #96]\n" //out_o0hw0 - "str q9, 
[%[out_0], #112]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2" - ); - } - } - - // ohow_remainder % 8 / 4 - U32 ohow_s = (ohow / 8) * 8; - for (I32 hw = ohow_s; hw < ohow-3; hw+=4) { - const F16 *b0 = biasArray + ic*8; - const F16 *b1 = b0 + 8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - F16 *in_pack = pwArray + hw*ic*8; - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "ldr q23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "mov v5.16b, v22.16b\n" //out_o0hw3 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ldr q19, [%[f_0], #16]\n" //f_o1c0 - "mov v11.16b, v23.16b\n" //out_o1hw1 - "mov v12.16b, v23.16b\n" //out_o1hw2 - "mov v13.16b, v23.16b\n" //out_o1hw3 - "0:\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f_0], #32]\n" //f_o0c0 - "fmla v3.8h, v18.8h, v0.h[1]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "ldr q21, [%[f_0], #48]\n" //f_o1c0 - "fmla v10.8h, v19.8h, v0.h[0]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "subs x0, x0, #2\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - - "ldr d0, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f_0], #64]\n" //f_o0c0 - "fmla v3.8h, v20.8h, v1.h[1]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr q19, [%[f_0], #80]\n" //f_o1c0 - "fmla v5.8h, v20.8h, v1.h[3]\n" - "fmla v10.8h, v21.8h, v1.h[0]\n" - "fmla v11.8h, v21.8h, v1.h[1]\n" - "fmla v12.8h, v21.8h, v1.h[2]\n" - "add %[in_0], %[in_0], #16\n" - "fmla v13.8h, v21.8h, v1.h[3]\n" - "add %[f_0], %[f_0], #64\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, 
#0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v6.8h, v2.8h, v18.8h\n" - "fadd v7.8h, v3.8h, v18.8h\n" - "fadd v8.8h, v4.8h, v18.8h\n" - "fadd v9.8h, v5.8h, v18.8h\n" - "fadd v19.8h, v10.8h, v18.8h\n" - "fadd v20.8h, v11.8h, v18.8h\n" - "fadd v21.8h, v12.8h, v18.8h\n" - "fadd v22.8h, v13.8h, v18.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmin v6.8h, v6.8h, v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fdiv v6.8h, v6.8h, v1.8h\n" - "fdiv v7.8h, v7.8h, v1.8h\n" - "fdiv v8.8h, v8.8h, v1.8h\n" - "fdiv v9.8h, v9.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fdiv v21.8h, v21.8h, v1.8h\n" - "fdiv v22.8h, v22.8h, v1.8h\n" - "fmul v2.8h, v6.8h, v2.8h\n" - "fmul v3.8h, v7.8h, v3.8h\n" - "fmul v4.8h, v8.8h, v4.8h\n" - "fmul v5.8h, v9.8h, v5.8h\n" - "fmul v10.8h, v19.8h, v10.8h\n" - "fmul v11.8h, v20.8h, v11.8h\n" - "fmul v12.8h, v21.8h, v12.8h\n" - "fmul v13.8h, v22.8h, v13.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - "str q10, [%[out_1]]\n" //out_o1hw0 - "str q11, [%[out_1], #16]\n" //out_o1hw1 - "str q12, [%[out_1], #32]\n" //out_o1hw2 - "str q13, [%[out_1], #48]\n" //out_o1hw3 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0", "x1", "x2", "x3" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "mov v5.16b, v22.16b\n" //out_o0hw3 - "0:\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f_0], #16]\n" //f_o0c0 - "fmla v3.8h, v18.8h, v0.h[1]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "subs x0, x0, #2\n" - - "ldr d0, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f_0], #32]\n" //f_o0c0 - "fmla v3.8h, v20.8h, v1.h[1]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "fmla v5.8h, v20.8h, v1.h[3]\n" - "add %[in_0], %[in_0], #16\n" - "add %[f_0], %[f_0], #32\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - - 
"11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v6.8h, v2.8h, v18.8h\n" - "fadd v7.8h, v3.8h, v18.8h\n" - "fadd v8.8h, v4.8h, v18.8h\n" - "fadd v9.8h, v5.8h, v18.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmin v6.8h, v6.8h, v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fdiv v6.8h, v6.8h, v1.8h\n" - "fdiv v7.8h, v7.8h, v1.8h\n" - "fdiv v8.8h, v8.8h, v1.8h\n" - "fdiv v9.8h, v9.8h, v1.8h\n" - "fmul v2.8h, v6.8h, v2.8h\n" - "fmul v3.8h, v7.8h, v3.8h\n" - "fmul v4.8h, v8.8h, v4.8h\n" - "fmul v5.8h, v9.8h, v5.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", - "v18", "v20", "v22", "x0", "x1", "x2" - ); - } - } - - // ohow_reminder % 4 - ohow_s = (ohow / 4) * 4; - for (I32 hw = ohow_s; hw < ohow; hw++) { - const F16 *b0 = biasArray + ic*8; - const F16 *b1 = b0 + 8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - F16 *in_pack = pwArray + hw*ic*8; - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "ldr q23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "ldr h0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ldr q19, [%[f_0], #16]\n" //f_o1c0 - "0:\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f_0], #32]\n" //f_o0c0 - "fmla v10.8h, v19.8h, v0.h[0]\n" - "ldr q21, [%[f_0], #48]\n" //f_o1c0 - "subs x0, x0, #2\n" - - "ldr h0, [%[in_0], #4]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f_0], #64]\n" //f_o0c0 - "fmla v10.8h, v21.8h, v1.h[0]\n" - "ldr q19, [%[f_0], #80]\n" //f_o1c0 - "add %[in_0], %[in_0], #4\n" - "add %[f_0], %[f_0], #64\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], 
%[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v19.8h, v2.8h, v18.8h\n" - "fadd v20.8h, v10.8h, v18.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fmul v2.8h, v19.8h, v2.8h\n" - "fmul v10.8h, v20.8h, v10.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q10, [%[out_1]]\n" //out_o1hw0 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", "v23", "x0", "x1", "x2", "x3" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" //ic_blk - "ldr h0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "0:\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f_0], #16]\n" //f_o0c0 - "subs x0, x0, #2\n" - - "ldr h0, [%[in_0], #4]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f_0], #32]\n" //f_o0c0 - "add %[in_0], %[in_0], #4\n" - "add %[f_0], %[f_0], #32\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v20.8h, v2.8h, v18.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fmul v2.8h, v20.8h, v2.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0", "x1", "x2" - ); - } - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h b/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h deleted file mode 100644 index 82877ff9..00000000 --- a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT_NO_PADDING -#define _H_DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT_NO_PADDING - -#include - -#include "sys.h" -#include "type.h" -#include "error.h" -#include "tensor_desc.h" -#include "tensor_computing_type.h" - - -inline void calc_eight_channel_elements(I32 hw, - I32 ih_base, I32 ih, I32 iw, - I32 fh, I32 fw, - I32 ow, - F16 *inArray, - I32 strideH, I32 strideW, I32 paddingT, I32 paddingL, - const F16 *filterArray, - float16x8_t bias, - F16 *output) -{ - I32 h = hw / ow; - I32 w = hw % ow; - float16x8_t v0 = bias; - I32 ih_start = h * strideH - paddingT; - I32 iw_start = w * strideW - paddingL; - I32 fh_start = 0; - if (ih_start < 0) { - fh_start -= ih_start; - } - I32 fw_start = 0; - if (iw_start < 0) { - fw_start -= iw_start; - } - for (I32 fh_idx = fh_start; fh_idx < fh; fh_idx++) { - I32 ih_idx = ih_start + fh_idx; - if (ih_idx >= ih) - break; - I32 iw_base = ((ih_base + ih_idx) * iw); - I32 filter_index = (fh_idx * fw + fw_start) * 8; - for (I32 fw_idx = fw_start; fw_idx < fw; fw_idx++, filter_index+=8) { - I32 iw_idx = iw_start + fw_idx; - if (iw_idx >= iw) - break; - { - U32 in_index = (iw_base + iw_idx) * 8; - float16x8_t v1 = vld1q_f16(inArray + in_index); - float16x8_t v2 = vld1q_f16(filterArray + filter_index); - v0 = vfmaq_f16(v0, v1, v2); - } - } - } - vst1q_f16(output, v0); -} - -EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc); - -EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc); - -inline EE depthwise_pointwise_convolution_direct_no_padding(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc, - Arch arch) -{ - 
EE ret = SUCCESS; - switch (arch) { - case ARM_A55: - ret = depthwise_pointwise_convolution_direct_no_padding_A55(inputDesc, inArray, - filterDesc, filterArray, - convDesc, - biasDesc, biasArray, - tmpBytes, tmp, - outputDesc, outArray, - depthwiseActivationDesc, - pointwiseActivationDesc); - break; - case ARM_A76: - ret = depthwise_pointwise_convolution_direct_no_padding_A76(inputDesc, inArray, - filterDesc, filterArray, - convDesc, - biasDesc, biasArray, - tmpBytes, tmp, - outputDesc, outArray, - depthwiseActivationDesc, - pointwiseActivationDesc); - break; - default: - return NOT_SUPPORTED; - } - return ret; -} -#endif diff --git a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp b/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp deleted file mode 100644 index 270ca2c3..00000000 --- a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp +++ /dev/null @@ -1,1040 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
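The inline wrapper just deleted above shows why these kernels come in _A55 and _A76 pairs: the math is identical, but the A55 variant (below) splits each 128-bit load into "ldr d" + "ldr x" + "ins" so the in-order core can dual-issue loads between fmla ops, while the A76 variant issues plain "ldr q". A standalone sketch of the dispatch pattern (hypothetical names, not Bolt's real signatures):

// One inline wrapper, one tuned kernel per core, NOT_SUPPORTED otherwise.
enum Arch { ARM_A55, ARM_A76 };
enum EE   { SUCCESS, NOT_SUPPORTED };

static EE kernel_A55() { return SUCCESS; }  // stands in for the ldr d/ins variant
static EE kernel_A76() { return SUCCESS; }  // stands in for the ldr q variant

inline EE dw_pw_direct_no_padding(Arch arch)
{
    switch (arch) {
        case ARM_A55: return kernel_A55();
        case ARM_A76: return kernel_A76();
        default:      return NOT_SUPPORTED;
    }
}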
- - -#include "cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h" -#include "cpu/arm/fp16/arm_functions_fp16.h" - -EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingL = convDesc.padding_left; - - if (fdf != DF_CHWC8_NCN16) - CHECK_STATUS(NOT_MATCH); - - oc /= 8; - ic /= 8; - - I32 ohow = oh*ow; - F16 *pwArray = (F16*)tmp; - - F16 buffer[8]; - for (U32 n = 0; n < in; n++) { - for (U32 c = 0; c < ic; c++) { - const F16 *f = filterArray + c*fh*fw*8; - const F16 *b = biasArray + c*8; - float16x8_t vv0 = vld1q_f16(b); - - I32 iter = 0; - U32 ih_base = ((n * ic) + c) * ih; - // nhwchw8 - for (; iter < ohow-7; iter += 8) { - U32 out_base = iter * ic * 8 + c * 8 * 8; - for (I32 j = 0; j < 8; j++) { - I32 hw = iter + j; - calc_eight_channel_elements(hw, - ih_base, ih, iw, - fh, fw, - ow, - inArray, - strideH, strideW, paddingT, paddingL, - f, - vv0, - buffer); - CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationDesc, buffer)); - U32 out_index = out_base + j; - for (I32 i = 0; i < 8; i++, out_index+=8) { - pwArray[out_index] = buffer[i]; - } - } - } - // nhwchw4 - for (; iter < ohow-3; iter += 4) { - U32 out_base = iter * ic * 8 + c * 8 * 4; - for (I32 j = 0; j < 4; j++) { - I32 hw = iter + j; - calc_eight_channel_elements(hw, - ih_base, ih, iw, - fh, fw, - ow, - inArray, - strideH, strideW, paddingT, paddingL, - f, - vv0, - buffer); - CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationDesc, buffer)); - U32 out_index = out_base + j; - for (I32 i = 0; i < 8; i++, out_index+=4) { - pwArray[out_index] = buffer[i]; - } - } - } - // nhwchw1 - for (; iter < ohow; iter ++) { - U32 out_base = iter * ic * 8 + c * 8; - for (I32 j = 0; j < 1; j++) { - I32 hw = iter + j; - calc_eight_channel_elements(hw, - ih_base, ih, iw, - fh, fw, - ow, - inArray, - strideH, strideW, paddingT, paddingL, - f, - vv0, - buffer); - CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationDesc, buffer)); - U32 out_index = out_base + j; - for (I32 i = 0; i < 8; i++, out_index++) { - pwArray[out_index] = buffer[i]; - } - } - } - } - - // pw_conv - // ohow / 8 - for (I32 hw = 0; hw < ohow-7; hw+=8) { - const F16 *b0 = biasArray + ic*8; - const F16 *b1 = b0 + 8; - F16 *in_pack = pwArray + hw*ic*8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "ldr d23, [%[b_1]]\n" //b_o1 - "ldr x2, [%[b_1], #8]\n" - "ins v23.d[1], x2\n" - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, 
v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr x1, [%[in_0], #8]\n" - "mov v4.16b, v22.16b\n" //out_o0hw2 - "ins v0.d[1], x1\n" - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v6.16b, v22.16b\n" //out_o0hw4 - "ldr x2, [%[f_0], #8]\n" - "mov v7.16b, v22.16b\n" //out_o0hw5 - "ins v18.d[1], x2\n" - "mov v8.16b, v22.16b\n" //out_o0hw6 - "ldr d19, [%[f_0], #16]\n" //f_o1c0 - "mov v9.16b, v22.16b\n" //out_o0hw7 - "ldr x3, [%[f_0], #24]\n" - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ins v19.d[1], x3\n" - "mov v11.16b, v23.16b\n" //out_o1hw1 - "mov v12.16b, v23.16b\n" //out_o1hw2 - "mov v13.16b, v23.16b\n" //out_o1hw3 - "mov v14.16b, v23.16b\n" //out_o1hw4 - "mov v15.16b, v23.16b\n" //out_o1hw5 - "mov v16.16b, v23.16b\n" //out_o1hw6 - "mov v17.16b, v23.16b\n" //out_o1hw7 - "0:\n" - "ldr d1, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr x1, [%[in_0], #24]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "ins v1.d[1], x1\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ldr d20, [%[f_0], #32]\n" //f_o0c0 - "fmla v5.8h, v18.8h, v0.h[3]\n" - "ldr x2, [%[f_0], #40]\n" - "fmla v6.8h, v18.8h, v0.h[4]\n" - "ins v20.d[1], x2\n" - "fmla v7.8h, v18.8h, v0.h[5]\n" - "ldr d21, [%[f_0], #48]\n" //f_o1c0 - "fmla v8.8h, v18.8h, v0.h[6]\n" - "ldr x3, [%[f_0], #56]\n" - "fmla v9.8h, v18.8h, v0.h[7]\n" - "ins v21.d[1], x3\n" - "fmla v10.8h, v19.8h, v0.h[0]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - "fmla v14.8h, v19.8h, v0.h[4]\n" - "fmla v15.8h, v19.8h, v0.h[5]\n" - "fmla v16.8h, v19.8h, v0.h[6]\n" - "fmla v17.8h, v19.8h, v0.h[7]\n" - - "ldr d0, [%[in_0], #32]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr x1, [%[in_0], #40]\n" - "fmla v3.8h, v20.8h, v1.h[1]\n" - "ins v0.d[1], x1\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr d18, [%[f_0], #64]\n" //f_o0c0 - "fmla v5.8h, v20.8h, v1.h[3]\n" - "ldr x2, [%[f_0], #72]\n" - "fmla v6.8h, v20.8h, v1.h[4]\n" - "ins v18.d[1], x2\n" - "fmla v7.8h, v20.8h, v1.h[5]\n" - "ldr d19, [%[f_0], #80]\n" //f_o1c0 - "fmla v8.8h, v20.8h, v1.h[6]\n" - "ldr x3, [%[f_0], #88]\n" - "fmla v9.8h, v20.8h, v1.h[7]\n" - "ins v19.d[1], x3\n" - "fmla v10.8h, v21.8h, v1.h[0]\n" - "add %[in_0], %[in_0], #32\n" - "fmla v11.8h, v21.8h, v1.h[1]\n" - "add %[f_0], %[f_0], #64\n" - "fmla v12.8h, v21.8h, v1.h[2]\n" - "subs x0, x0, #2\n" - "fmla v13.8h, v21.8h, v1.h[3]\n" - "fmla v14.8h, v21.8h, v1.h[4]\n" - "fmla v15.8h, v21.8h, v1.h[5]\n" - "fmla v16.8h, v21.8h, v1.h[6]\n" - "fmla v17.8h, v21.8h, v1.h[7]\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, 
v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - "fmin v6.8h, v6.8h, v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmin v16.8h, v16.8h, v1.8h\n" - "fmin v17.8h, v17.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v19.8h, v2.8h, v18.8h\n" - "fadd v20.8h, v3.8h, v18.8h\n" - "fadd v21.8h, v4.8h, v18.8h\n" - "fadd v22.8h, v5.8h, v18.8h\n" - "fadd v23.8h, v6.8h, v18.8h\n" - "fadd v24.8h, v7.8h, v18.8h\n" - "fadd v25.8h, v8.8h, v18.8h\n" - "fadd v26.8h, v9.8h, v18.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmax v23.8h, v23.8h, v0.8h\n" - "fmax v24.8h, v24.8h, v0.8h\n" - "fmax v25.8h, v25.8h, v0.8h\n" - "fmax v26.8h, v26.8h, v0.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fmin v23.8h, v23.8h, v1.8h\n" - "fmin v24.8h, v24.8h, v1.8h\n" - "fmin v25.8h, v25.8h, v1.8h\n" - "fmin v26.8h, v26.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fdiv v21.8h, v21.8h, v1.8h\n" - "fdiv v22.8h, v22.8h, v1.8h\n" - "fdiv v23.8h, v23.8h, v1.8h\n" - "fdiv v24.8h, v24.8h, v1.8h\n" - "fdiv v25.8h, v25.8h, v1.8h\n" - "fdiv v26.8h, v26.8h, v1.8h\n" - "fmul v2.8h, v19.8h, v2.8h\n" - "fmul v3.8h, v20.8h, v3.8h\n" - "fmul v4.8h, v21.8h, v4.8h\n" - "fmul v5.8h, v22.8h, v5.8h\n" - "fmul v6.8h, v23.8h, v6.8h\n" - "fmul v7.8h, v24.8h, v7.8h\n" - "fmul v8.8h, v25.8h, v8.8h\n" - "fmul v9.8h, v26.8h, v9.8h\n" - - "fadd v19.8h, v10.8h, v18.8h\n" - "fadd v20.8h, v11.8h, v18.8h\n" - "fadd v21.8h, v12.8h, v18.8h\n" - "fadd v22.8h, v13.8h, v18.8h\n" - "fadd v23.8h, v14.8h, v18.8h\n" - "fadd v24.8h, v15.8h, v18.8h\n" - "fadd v25.8h, v16.8h, v18.8h\n" - "fadd v26.8h, v17.8h, v18.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmax v23.8h, v23.8h, v0.8h\n" - "fmax v24.8h, v24.8h, v0.8h\n" - "fmax v25.8h, v25.8h, v0.8h\n" - "fmax v26.8h, v26.8h, v0.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fmin v23.8h, v23.8h, v1.8h\n" - "fmin v24.8h, v24.8h, v1.8h\n" - "fmin v25.8h, v25.8h, v1.8h\n" - "fmin v26.8h, v26.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fdiv v21.8h, v21.8h, v1.8h\n" - "fdiv v22.8h, v22.8h, v1.8h\n" - "fdiv v23.8h, v23.8h, v1.8h\n" - "fdiv v24.8h, v24.8h, v1.8h\n" - "fdiv v25.8h, v25.8h, v1.8h\n" - "fdiv v26.8h, v26.8h, v1.8h\n" - "fmul v10.8h, v19.8h, v10.8h\n" - "fmul v11.8h, v20.8h, v11.8h\n" - "fmul v12.8h, v21.8h, v12.8h\n" - "fmul v13.8h, v22.8h, 
v13.8h\n" - "fmul v14.8h, v23.8h, v14.8h\n" - "fmul v15.8h, v24.8h, v15.8h\n" - "fmul v16.8h, v25.8h, v16.8h\n" - "fmul v17.8h, v26.8h, v17.8h\n" - - "13:\n" - "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out_0]], #64\n" - "st1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%[out_0]], #64\n" - "st1 {v10.8h, v11.8h, v12.8h, v13.8h}, [%[out_1]], #64\n" - "st1 {v14.8h, v15.8h, v16.8h, v17.8h}, [%[out_1]], #64\n" - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; - __asm__ __volatile__( - "ldr q12, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" // ic_blk - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v12.16b\n" //out_o0hw0 - "ldr x1, [%[in_0], #8]\n" - "mov v3.16b, v12.16b\n" //out_o0hw1 - "ins v0.d[1], x1\n" - "mov v4.16b, v12.16b\n" //out_o0hw2 - "ldr d10, [%[f_0]]\n" //f_o0c0 - "mov v5.16b, v12.16b\n" //out_o0hw3 - "ldr x2, [%[f_0], #8]\n" - "mov v6.16b, v12.16b\n" //out_o0hw4 - "ins v10.d[1], x2\n" - "mov v7.16b, v12.16b\n" //out_o0hw5 - "mov v8.16b, v12.16b\n" //out_o0hw6 - "mov v9.16b, v12.16b\n" //out_o0hw7 - "0:\n" - "ldr d1, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v10.8h, v0.h[0]\n" - "ldr x1, [%[in_0], #24]\n" - "fmla v3.8h, v10.8h, v0.h[1]\n" - "ins v1.d[1], x1\n" - "fmla v4.8h, v10.8h, v0.h[2]\n" - "ldr d11, [%[f_0], #16]\n" //f_o0c0 - "fmla v5.8h, v10.8h, v0.h[3]\n" - "ldr x2, [%[f_0], #24]\n" - "fmla v6.8h, v10.8h, v0.h[4]\n" - "ins v11.d[1], x2\n" - "fmla v7.8h, v10.8h, v0.h[5]\n" - "subs x0, x0, #2\n" - "fmla v8.8h, v10.8h, v0.h[6]\n" - "fmla v9.8h, v10.8h, v0.h[7]\n" - - "ldr d0, [%[in_0], #32]\n" //in_hw0 - "fmla v2.8h, v11.8h, v1.h[0]\n" - "ldr x1, [%[in_0], #40]\n" - "fmla v3.8h, v11.8h, v1.h[1]\n" - "ins v0.d[1], x1\n" - "fmla v4.8h, v11.8h, v1.h[2]\n" - "ldr d10, [%[f_0], #32]\n" //f_o0c0 - "fmla v5.8h, v11.8h, v1.h[3]\n" - "ldr x2, [%[f_0], #40]\n" - "fmla v6.8h, v11.8h, v1.h[4]\n" - "ins v10.d[1], x2\n" - "fmla v7.8h, v11.8h, v1.h[5]\n" - "add %[in_0], %[in_0], #32\n" - "fmla v8.8h, v11.8h, v1.h[6]\n" - "add %[f_0], %[f_0], #32\n" - "fmla v9.8h, v11.8h, v1.h[7]\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - 
"fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - "fmin v6.8h, v6.8h, v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v10.8h, #0x42, lsl #8\n" //three - "fadd v11.8h, v2.8h, v10.8h\n" - "fadd v12.8h, v3.8h, v10.8h\n" - "fadd v13.8h, v4.8h, v10.8h\n" - "fadd v14.8h, v5.8h, v10.8h\n" - "fadd v15.8h, v6.8h, v10.8h\n" - "fadd v16.8h, v7.8h, v10.8h\n" - "fadd v17.8h, v8.8h, v10.8h\n" - "fadd v18.8h, v9.8h, v10.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmax v18.8h, v18.8h, v0.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmin v16.8h, v16.8h, v1.8h\n" - "fmin v17.8h, v17.8h, v1.8h\n" - "fmin v18.8h, v18.8h, v1.8h\n" - "fdiv v11.8h, v11.8h, v1.8h\n" - "fdiv v12.8h, v12.8h, v1.8h\n" - "fdiv v13.8h, v13.8h, v1.8h\n" - "fdiv v14.8h, v14.8h, v1.8h\n" - "fdiv v15.8h, v15.8h, v1.8h\n" - "fdiv v16.8h, v16.8h, v1.8h\n" - "fdiv v17.8h, v17.8h, v1.8h\n" - "fdiv v18.8h, v18.8h, v1.8h\n" - "fmul v2.8h, v11.8h, v2.8h\n" - "fmul v3.8h, v12.8h, v3.8h\n" - "fmul v4.8h, v13.8h, v4.8h\n" - "fmul v5.8h, v14.8h, v5.8h\n" - "fmul v6.8h, v15.8h, v6.8h\n" - "fmul v7.8h, v16.8h, v7.8h\n" - "fmul v8.8h, v17.8h, v8.8h\n" - "fmul v9.8h, v18.8h, v9.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw0 - "str q4, [%[out_0], #32]\n" //out_o0hw0 - "str q5, [%[out_0], #48]\n" //out_o0hw0 - "str q6, [%[out_0], #64]\n" //out_o0hw0 - "str q7, [%[out_0], #80]\n" //out_o0hw0 - "str q8, [%[out_0], #96]\n" //out_o0hw0 - "str q9, [%[out_0], #112]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2" - ); - } - } - - // ohow_remainder % 8 / 4 - U32 ohow_s = (ohow / 8) * 8; - for (I32 hw = ohow_s; hw < ohow-3; hw+=4) { - const F16 *b0 = biasArray + ic*8; - const F16 *b1 = b0 + 8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - F16 *in_pack = pwArray + hw*ic*8; - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "ldr d23, [%[b_1]]\n" //b_o1 - "ldr x2, [%[b_1], #8]\n" - "ins v23.d[1], x2\n" - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "ldr x2, [%[f_0], #8]\n" - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ins v18.d[1], x2\n" - "mov v10.16b, v23.16b\n" 
//out_o1hw0 - "ldr d19, [%[f_0], #16]\n" //f_o1c0 - "mov v11.16b, v23.16b\n" //out_o1hw1 - "ldr x3, [%[f_0], #24]\n" - "mov v12.16b, v23.16b\n" //out_o1hw2 - "ins v19.d[1], x3\n" - "mov v13.16b, v23.16b\n" //out_o1hw3 - "0:\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], #32]\n" //f_o0c0 - "fmla v3.8h, v18.8h, v0.h[1]\n" - "ldr x2, [%[f_0], #40]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ins v20.d[1], x2\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "ldr d21, [%[f_0], #48]\n" //f_o1c0 - "fmla v10.8h, v19.8h, v0.h[0]\n" - "ldr x3, [%[f_0], #56]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "ins v21.d[1], x3\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "subs x0, x0, #2\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - - "ldr d0, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f_0], #64]\n" //f_o0c0 - "fmla v3.8h, v20.8h, v1.h[1]\n" - "ldr x2, [%[f_0], #72]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr d19, [%[f_0], #80]\n" //f_o1c0 - "fmla v5.8h, v20.8h, v1.h[3]\n" - "ins v18.d[1], x2\n" - "fmla v10.8h, v21.8h, v1.h[0]\n" - "ldr x3, [%[f_0], #88]\n" - "fmla v11.8h, v21.8h, v1.h[1]\n" - "ins v19.d[1], x3\n" - "fmla v12.8h, v21.8h, v1.h[2]\n" - "add %[in_0], %[in_0], #16\n" - "fmla v13.8h, v21.8h, v1.h[3]\n" - "add %[f_0], %[f_0], #64\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v6.8h, v2.8h, v18.8h\n" - "fadd v7.8h, v3.8h, v18.8h\n" - "fadd v8.8h, v4.8h, v18.8h\n" - "fadd v9.8h, v5.8h, v18.8h\n" - "fadd v19.8h, v10.8h, v18.8h\n" - "fadd v20.8h, v11.8h, v18.8h\n" - "fadd v21.8h, v12.8h, v18.8h\n" - "fadd v22.8h, v13.8h, v18.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmin v6.8h, v6.8h, v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fdiv v6.8h, v6.8h, v1.8h\n" - "fdiv v7.8h, v7.8h, v1.8h\n" - "fdiv v8.8h, v8.8h, v1.8h\n" - "fdiv v9.8h, v9.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fdiv v21.8h, v21.8h, v1.8h\n" - "fdiv v22.8h, 
v22.8h, v1.8h\n" - "fmul v2.8h, v6.8h, v2.8h\n" - "fmul v3.8h, v7.8h, v3.8h\n" - "fmul v4.8h, v8.8h, v4.8h\n" - "fmul v5.8h, v9.8h, v5.8h\n" - "fmul v10.8h, v19.8h, v10.8h\n" - "fmul v11.8h, v20.8h, v11.8h\n" - "fmul v12.8h, v21.8h, v12.8h\n" - "fmul v13.8h, v22.8h, v13.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - "str q10, [%[out_1]]\n" //out_o1hw0 - "str q11, [%[out_1], #16]\n" //out_o1hw1 - "str q12, [%[out_1], #32]\n" //out_o1hw2 - "str q13, [%[out_1], #48]\n" //out_o1hw3 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0", "x1", "x2", "x3" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "ldr x2, [%[f_0], #8]\n" - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ins v18.d[1], x2\n" - "0:\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], #16]\n" //f_o0c0 - "fmla v3.8h, v18.8h, v0.h[1]\n" - "ldr x2, [%[f_0], #24]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ins v20.d[1], x2\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "subs x0, x0, #2\n" - - "ldr d0, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f_0], #32]\n" //f_o0c0 - "fmla v3.8h, v20.8h, v1.h[1]\n" - "ldr x2, [%[f_0], #40]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ins v18.d[1], x2\n" - "fmla v5.8h, v20.8h, v1.h[3]\n" - "add %[in_0], %[in_0], #16\n" - "add %[f_0], %[f_0], #32\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v6.8h, v2.8h, v18.8h\n" - "fadd v7.8h, v3.8h, v18.8h\n" - "fadd v8.8h, v4.8h, v18.8h\n" - "fadd v9.8h, v5.8h, v18.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmin v6.8h, v6.8h, 
v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fdiv v6.8h, v6.8h, v1.8h\n" - "fdiv v7.8h, v7.8h, v1.8h\n" - "fdiv v8.8h, v8.8h, v1.8h\n" - "fdiv v9.8h, v9.8h, v1.8h\n" - "fmul v2.8h, v6.8h, v2.8h\n" - "fmul v3.8h, v7.8h, v3.8h\n" - "fmul v4.8h, v8.8h, v4.8h\n" - "fmul v5.8h, v9.8h, v5.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", - "v18", "v20", "v22", "x0", "x1", "x2" - ); - } - } - - // ohow_reminder % 4 - ohow_s = (ohow / 4) * 4; - for (I32 hw = ohow_s; hw < ohow; hw++) { - const F16 *b0 = biasArray + ic*8; - const F16 *b1 = b0 + 8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - F16 *in_pack = pwArray + hw*ic*8; - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "ldr d23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "ldr x2, [%[b_1], #8]\n" - "ins v23.d[1], x2\n" - "ldr h0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ldr x2, [%[f_0], #8]\n" - "ins v18.d[1], x2\n" - "ldr d19, [%[f_0], #16]\n" //f_o1c0 - "ldr x3, [%[f_0], #24]\n" - "ins v19.d[1], x3\n" - "0:\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], #32]\n" //f_o0c0 - "fmla v10.8h, v19.8h, v0.h[0]\n" - "ldr x2, [%[f_0], #40]\n" - "ins v20.d[1], x2\n" - "ldr d21, [%[f_0], #48]\n" //f_o1c0 - "subs x0, x0, #2\n" - "ldr x3, [%[f_0], #56]\n" - "ins v21.d[1], x3\n" - - "ldr h0, [%[in_0], #4]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f_0], #64]\n" //f_o0c0 - "fmla v10.8h, v21.8h, v1.h[0]\n" - "ldr x2, [%[f_0], #72]\n" - "ins v18.d[1], x2\n" - "ldr d19, [%[f_0], #80]\n" //f_o1c0 - "add %[in_0], %[in_0], #4\n" - "ldr x3, [%[f_0], #88]\n" - "ins v19.d[1], x3\n" - "add %[f_0], %[f_0], #64\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v19.8h, v2.8h, v18.8h\n" - "fadd v20.8h, v10.8h, v18.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fmul v2.8h, v19.8h, v2.8h\n" - "fmul v10.8h, 
v20.8h, v10.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q10, [%[out_1]]\n" //out_o1hw0 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", "v23", "x0", "x1", "x2", "x3" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; - __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" //ic_blk - "ldr x1, [%[b_0], #8]\n" - "ins v22.d[1], x1\n" - "ldr h0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "ldr x2, [%[f_0], #8]\n" - "ins v18.d[1], x2\n" - "0:\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], #16]\n" //f_o0c0 - "subs x0, x0, #2\n" - "ldr x2, [%[f_0], #24]\n" - "ins v20.d[1], x2\n" - - "ldr h0, [%[in_0], #4]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f_0], #32]\n" //f_o0c0 - "ldr x2, [%[f_0], #40]\n" - "ins v18.d[1], x2\n" - "add %[in_0], %[in_0], #4\n" - "add %[f_0], %[f_0], #32\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v20.8h, v2.8h, v18.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fmul v2.8h, v20.8h, v2.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0", "x1", "x2" - ); - } - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp b/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp deleted file mode 100644 index bf501f4f..00000000 --- a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp +++ /dev/null @@ -1,958 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h" -#include "cpu/arm/fp16/arm_functions_fp16.h" - -EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingL = convDesc.padding_left; - - if (fdf != DF_CHWC8_NCN16) - CHECK_STATUS(NOT_MATCH); - - oc /= 8; - ic /= 8; - - I32 ohow = oh*ow; - F16 *pwArray = (F16*)tmp; - - F16 buffer[8]; - for (U32 n = 0; n < in; n++) { - for (U32 c = 0; c < ic; c++) { - const F16 *f = filterArray + c*fh*fw*8; - const F16 *b = biasArray + c*8; - float16x8_t vv0 = vld1q_f16(b); - - I32 iter = 0; - U32 ih_base = ((n * ic) + c) * ih; - // nhwchw8 - for (; iter < ohow-7; iter += 8) { - U32 out_base = iter * ic * 8 + c * 8 * 8; - for (I32 j = 0; j < 8; j++) { - I32 hw = iter + j; - calc_eight_channel_elements(hw, - ih_base, ih, iw, - fh, fw, - ow, - inArray, - strideH, strideW, paddingT, paddingL, - f, - vv0, - buffer); - CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationDesc, buffer)); - U32 out_index = out_base + j; - for (I32 i = 0; i < 8; i++, out_index+=8) { - pwArray[out_index] = buffer[i]; - } - } - } - // nhwchw4 - for (; iter < ohow-3; iter += 4) { - U32 out_base = iter * ic * 8 + c * 8 * 4; - for (I32 j = 0; j < 4; j++) { - I32 hw = iter + j; - calc_eight_channel_elements(hw, - ih_base, ih, iw, - fh, fw, - ow, - inArray, - strideH, strideW, paddingT, paddingL, - f, - vv0, - buffer); - CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationDesc, buffer)); - U32 out_index = out_base + j; - for (I32 i = 0; i < 8; i++, out_index+=4) { - pwArray[out_index] = buffer[i]; - } - } - } - // nhwchw1 - for (; iter < ohow; iter ++) { - U32 out_base = iter * ic * 8 + c * 8; - for (I32 j = 0; 
j < 1; j++) { - I32 hw = iter + j; - calc_eight_channel_elements(hw, - ih_base, ih, iw, - fh, fw, - ow, - inArray, - strideH, strideW, paddingT, paddingL, - f, - vv0, - buffer); - CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationDesc, buffer)); - U32 out_index = out_base + j; - for (I32 i = 0; i < 8; i++, out_index++) { - pwArray[out_index] = buffer[i]; - } - } - } - } - - // pw_conv - // ohow / 8 - for (I32 hw = 0; hw < ohow-7; hw+=8) { - const F16 *b0 = biasArray + ic*8; - const F16 *b1 = b0 + 8; - F16 *in_pack = pwArray + hw*ic*8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "ldr q23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr q0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov v6.16b, v22.16b\n" //out_o0hw4 - "mov v7.16b, v22.16b\n" //out_o0hw5 - "mov v8.16b, v22.16b\n" //out_o0hw6 - "ldr q19, [%[f_0], #16]\n" //f_o1c0 - "mov v9.16b, v22.16b\n" //out_o0hw7 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "mov v11.16b, v23.16b\n" //out_o1hw1 - "mov v12.16b, v23.16b\n" //out_o1hw2 - "mov v13.16b, v23.16b\n" //out_o1hw3 - "mov v14.16b, v23.16b\n" //out_o1hw4 - "mov v15.16b, v23.16b\n" //out_o1hw5 - "mov v16.16b, v23.16b\n" //out_o1hw6 - "mov v17.16b, v23.16b\n" //out_o1hw7 - "0:\n" - "ldr q1, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "fmla v3.8h, v18.8h, v0.h[1]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "ldr q20, [%[f_0], #32]\n" //f_o0c0 - "fmla v5.8h, v18.8h, v0.h[3]\n" - "fmla v6.8h, v18.8h, v0.h[4]\n" - "fmla v7.8h, v18.8h, v0.h[5]\n" - "ldr q21, [%[f_0], #48]\n" //f_o1c0 - "fmla v8.8h, v18.8h, v0.h[6]\n" - "fmla v9.8h, v18.8h, v0.h[7]\n" - "fmla v10.8h, v19.8h, v0.h[0]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - "fmla v14.8h, v19.8h, v0.h[4]\n" - "fmla v15.8h, v19.8h, v0.h[5]\n" - "fmla v16.8h, v19.8h, v0.h[6]\n" - "fmla v17.8h, v19.8h, v0.h[7]\n" - - "ldr q0, [%[in_0], #32]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "fmla v3.8h, v20.8h, v1.h[1]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr q18, [%[f_0], #64]\n" //f_o0c0 - "fmla v5.8h, v20.8h, v1.h[3]\n" - "fmla v6.8h, v20.8h, v1.h[4]\n" - "fmla v7.8h, v20.8h, v1.h[5]\n" - "ldr q19, [%[f_0], #80]\n" //f_o1c0 - "fmla v8.8h, v20.8h, v1.h[6]\n" - "fmla v9.8h, v20.8h, v1.h[7]\n" - "fmla v10.8h, v21.8h, v1.h[0]\n" - "add %[in_0], %[in_0], #32\n" - "fmla v11.8h, v21.8h, v1.h[1]\n" - "add %[f_0], %[f_0], #64\n" - "fmla v12.8h, v21.8h, v1.h[2]\n" - "subs x0, x0, #2\n" - "fmla v13.8h, v21.8h, v1.h[3]\n" - "fmla v14.8h, v21.8h, v1.h[4]\n" - "fmla v15.8h, v21.8h, v1.h[5]\n" - "fmla v16.8h, v21.8h, v1.h[6]\n" - "fmla v17.8h, v21.8h, v1.h[7]\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - 
"fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - "fmin v6.8h, v6.8h, v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmin v16.8h, v16.8h, v1.8h\n" - "fmin v17.8h, v17.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v19.8h, v2.8h, v18.8h\n" - "fadd v20.8h, v3.8h, v18.8h\n" - "fadd v21.8h, v4.8h, v18.8h\n" - "fadd v22.8h, v5.8h, v18.8h\n" - "fadd v23.8h, v6.8h, v18.8h\n" - "fadd v24.8h, v7.8h, v18.8h\n" - "fadd v25.8h, v8.8h, v18.8h\n" - "fadd v26.8h, v9.8h, v18.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmax v23.8h, v23.8h, v0.8h\n" - "fmax v24.8h, v24.8h, v0.8h\n" - "fmax v25.8h, v25.8h, v0.8h\n" - "fmax v26.8h, v26.8h, v0.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fmin v23.8h, v23.8h, v1.8h\n" - "fmin v24.8h, v24.8h, v1.8h\n" - "fmin v25.8h, v25.8h, v1.8h\n" - "fmin v26.8h, v26.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fdiv v21.8h, v21.8h, v1.8h\n" - "fdiv v22.8h, v22.8h, v1.8h\n" - "fdiv v23.8h, v23.8h, v1.8h\n" - "fdiv v24.8h, v24.8h, v1.8h\n" - "fdiv v25.8h, v25.8h, v1.8h\n" - "fdiv v26.8h, v26.8h, v1.8h\n" - "fmul v2.8h, v19.8h, v2.8h\n" - "fmul v3.8h, v20.8h, v3.8h\n" - "fmul v4.8h, v21.8h, v4.8h\n" - "fmul v5.8h, v22.8h, v5.8h\n" - "fmul v6.8h, v23.8h, v6.8h\n" - "fmul v7.8h, v24.8h, v7.8h\n" - "fmul v8.8h, v25.8h, v8.8h\n" - "fmul v9.8h, v26.8h, v9.8h\n" - - "fadd v19.8h, v10.8h, v18.8h\n" - "fadd v20.8h, v11.8h, v18.8h\n" - "fadd v21.8h, v12.8h, v18.8h\n" - "fadd v22.8h, v13.8h, v18.8h\n" - "fadd v23.8h, v14.8h, v18.8h\n" - "fadd v24.8h, v15.8h, v18.8h\n" - "fadd v25.8h, v16.8h, v18.8h\n" - "fadd v26.8h, v17.8h, v18.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmax v23.8h, v23.8h, v0.8h\n" - "fmax v24.8h, v24.8h, v0.8h\n" - "fmax v25.8h, v25.8h, v0.8h\n" - "fmax v26.8h, v26.8h, v0.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fmin v23.8h, v23.8h, v1.8h\n" - 
"fmin v24.8h, v24.8h, v1.8h\n" - "fmin v25.8h, v25.8h, v1.8h\n" - "fmin v26.8h, v26.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fdiv v21.8h, v21.8h, v1.8h\n" - "fdiv v22.8h, v22.8h, v1.8h\n" - "fdiv v23.8h, v23.8h, v1.8h\n" - "fdiv v24.8h, v24.8h, v1.8h\n" - "fdiv v25.8h, v25.8h, v1.8h\n" - "fdiv v26.8h, v26.8h, v1.8h\n" - "fmul v10.8h, v19.8h, v10.8h\n" - "fmul v11.8h, v20.8h, v11.8h\n" - "fmul v12.8h, v21.8h, v12.8h\n" - "fmul v13.8h, v22.8h, v13.8h\n" - "fmul v14.8h, v23.8h, v14.8h\n" - "fmul v15.8h, v24.8h, v15.8h\n" - "fmul v16.8h, v25.8h, v16.8h\n" - "fmul v17.8h, v26.8h, v17.8h\n" - - "13:\n" - "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out_0]], #64\n" - "st1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%[out_0]], #64\n" - "st1 {v10.8h, v11.8h, v12.8h, v13.8h}, [%[out_1]], #64\n" - "st1 {v14.8h, v15.8h, v16.8h, v17.8h}, [%[out_1]], #64\n" - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; - __asm__ __volatile__( - "ldr q12, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" // ic_blk - "ldr q0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v12.16b\n" //out_o0hw0 - "mov v3.16b, v12.16b\n" //out_o0hw1 - "mov v4.16b, v12.16b\n" //out_o0hw2 - "ldr q10, [%[f_0]]\n" //f_o0c0 - "mov v5.16b, v12.16b\n" //out_o0hw3 - "mov v6.16b, v12.16b\n" //out_o0hw4 - "mov v7.16b, v12.16b\n" //out_o0hw5 - "mov v8.16b, v12.16b\n" //out_o0hw6 - "mov v9.16b, v12.16b\n" //out_o0hw7 - "0:\n" - "ldr q1, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v10.8h, v0.h[0]\n" - "fmla v3.8h, v10.8h, v0.h[1]\n" - "fmla v4.8h, v10.8h, v0.h[2]\n" - "ldr q11, [%[f_0], #16]\n" //f_o0c0 - "fmla v5.8h, v10.8h, v0.h[3]\n" - "fmla v6.8h, v10.8h, v0.h[4]\n" - "fmla v7.8h, v10.8h, v0.h[5]\n" - "subs x0, x0, #2\n" - "fmla v8.8h, v10.8h, v0.h[6]\n" - "fmla v9.8h, v10.8h, v0.h[7]\n" - - "ldr q0, [%[in_0], #32]\n" //in_hw0 - "fmla v2.8h, v11.8h, v1.h[0]\n" - "fmla v3.8h, v11.8h, v1.h[1]\n" - "fmla v4.8h, v11.8h, v1.h[2]\n" - "ldr q10, [%[f_0], #32]\n" //f_o0c0 - "fmla v5.8h, v11.8h, v1.h[3]\n" - "fmla v6.8h, v11.8h, v1.h[4]\n" - "fmla v7.8h, v11.8h, v1.h[5]\n" - "add %[in_0], %[in_0], #32\n" - "fmla v8.8h, v11.8h, v1.h[6]\n" - "add %[f_0], %[f_0], #32\n" - "fmla v9.8h, v11.8h, v1.h[7]\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax 
v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - "fmin v6.8h, v6.8h, v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v10.8h, #0x42, lsl #8\n" //three - "fadd v11.8h, v2.8h, v10.8h\n" - "fadd v12.8h, v3.8h, v10.8h\n" - "fadd v13.8h, v4.8h, v10.8h\n" - "fadd v14.8h, v5.8h, v10.8h\n" - "fadd v15.8h, v6.8h, v10.8h\n" - "fadd v16.8h, v7.8h, v10.8h\n" - "fadd v17.8h, v8.8h, v10.8h\n" - "fadd v18.8h, v9.8h, v10.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmax v14.8h, v14.8h, v0.8h\n" - "fmax v15.8h, v15.8h, v0.8h\n" - "fmax v16.8h, v16.8h, v0.8h\n" - "fmax v17.8h, v17.8h, v0.8h\n" - "fmax v18.8h, v18.8h, v0.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - "fmin v14.8h, v14.8h, v1.8h\n" - "fmin v15.8h, v15.8h, v1.8h\n" - "fmin v16.8h, v16.8h, v1.8h\n" - "fmin v17.8h, v17.8h, v1.8h\n" - "fmin v18.8h, v18.8h, v1.8h\n" - "fdiv v11.8h, v11.8h, v1.8h\n" - "fdiv v12.8h, v12.8h, v1.8h\n" - "fdiv v13.8h, v13.8h, v1.8h\n" - "fdiv v14.8h, v14.8h, v1.8h\n" - "fdiv v15.8h, v15.8h, v1.8h\n" - "fdiv v16.8h, v16.8h, v1.8h\n" - "fdiv v17.8h, v17.8h, v1.8h\n" - "fdiv v18.8h, v18.8h, v1.8h\n" - "fmul v2.8h, v11.8h, v2.8h\n" - "fmul v3.8h, v12.8h, v3.8h\n" - "fmul v4.8h, v13.8h, v4.8h\n" - "fmul v5.8h, v14.8h, v5.8h\n" - "fmul v6.8h, v15.8h, v6.8h\n" - "fmul v7.8h, v16.8h, v7.8h\n" - "fmul v8.8h, v17.8h, v8.8h\n" - "fmul v9.8h, v18.8h, v9.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw0 - "str q4, [%[out_0], #32]\n" //out_o0hw0 - "str q5, [%[out_0], #48]\n" //out_o0hw0 - "str q6, [%[out_0], #64]\n" //out_o0hw0 - "str q7, [%[out_0], #80]\n" //out_o0hw0 - "str q8, [%[out_0], #96]\n" //out_o0hw0 - "str q9, [%[out_0], #112]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2" - ); - } - } - - // ohow_remainder % 8 / 4 - U32 ohow_s = (ohow / 8) * 8; - for (I32 hw = ohow_s; hw < ohow-3; hw+=4) { - const F16 *b0 = biasArray + ic*8; - const F16 *b1 = b0 + 8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - F16 *in_pack = pwArray + hw*ic*8; - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "ldr q23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "mov v5.16b, v22.16b\n" 
//out_o0hw3 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ldr q19, [%[f_0], #16]\n" //f_o1c0 - "mov v11.16b, v23.16b\n" //out_o1hw1 - "mov v12.16b, v23.16b\n" //out_o1hw2 - "mov v13.16b, v23.16b\n" //out_o1hw3 - "0:\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f_0], #32]\n" //f_o0c0 - "fmla v3.8h, v18.8h, v0.h[1]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "ldr q21, [%[f_0], #48]\n" //f_o1c0 - "fmla v10.8h, v19.8h, v0.h[0]\n" - "fmla v11.8h, v19.8h, v0.h[1]\n" - "fmla v12.8h, v19.8h, v0.h[2]\n" - "subs x0, x0, #2\n" - "fmla v13.8h, v19.8h, v0.h[3]\n" - - "ldr d0, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f_0], #64]\n" //f_o0c0 - "fmla v3.8h, v20.8h, v1.h[1]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr q19, [%[f_0], #80]\n" //f_o1c0 - "fmla v5.8h, v20.8h, v1.h[3]\n" - "fmla v10.8h, v21.8h, v1.h[0]\n" - "fmla v11.8h, v21.8h, v1.h[1]\n" - "fmla v12.8h, v21.8h, v1.h[2]\n" - "add %[in_0], %[in_0], #16\n" - "fmla v13.8h, v21.8h, v1.h[3]\n" - "add %[f_0], %[f_0], #64\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmax v11.8h, v11.8h, v0.8h\n" - "fmax v12.8h, v12.8h, v0.8h\n" - "fmax v13.8h, v13.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - "fmin v11.8h, v11.8h, v1.8h\n" - "fmin v12.8h, v12.8h, v1.8h\n" - "fmin v13.8h, v13.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v6.8h, v2.8h, v18.8h\n" - "fadd v7.8h, v3.8h, v18.8h\n" - "fadd v8.8h, v4.8h, v18.8h\n" - "fadd v9.8h, v5.8h, v18.8h\n" - "fadd v19.8h, v10.8h, v18.8h\n" - "fadd v20.8h, v11.8h, v18.8h\n" - "fadd v21.8h, v12.8h, v18.8h\n" - "fadd v22.8h, v13.8h, v18.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmax v21.8h, v21.8h, v0.8h\n" - "fmax v22.8h, v22.8h, v0.8h\n" - "fmin v6.8h, v6.8h, v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fmin v21.8h, v21.8h, v1.8h\n" - "fmin v22.8h, v22.8h, v1.8h\n" - "fdiv v6.8h, v6.8h, v1.8h\n" - "fdiv v7.8h, v7.8h, v1.8h\n" - "fdiv v8.8h, v8.8h, v1.8h\n" - "fdiv v9.8h, v9.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fdiv v21.8h, v21.8h, v1.8h\n" - "fdiv v22.8h, v22.8h, v1.8h\n" - "fmul v2.8h, v6.8h, v2.8h\n" - "fmul v3.8h, v7.8h, v3.8h\n" - "fmul v4.8h, v8.8h, v4.8h\n" - "fmul v5.8h, v9.8h, v5.8h\n" - "fmul v10.8h, v19.8h, v10.8h\n" - "fmul v11.8h, v20.8h, v11.8h\n" - "fmul 
v12.8h, v21.8h, v12.8h\n" - "fmul v13.8h, v22.8h, v13.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - "str q10, [%[out_1]]\n" //out_o1hw0 - "str q11, [%[out_1], #16]\n" //out_o1hw1 - "str q12, [%[out_1], #32]\n" //out_o1hw2 - "str q13, [%[out_1], #48]\n" //out_o1hw3 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0", "x1", "x2", "x3" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "mov v5.16b, v22.16b\n" //out_o0hw3 - "0:\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f_0], #16]\n" //f_o0c0 - "fmla v3.8h, v18.8h, v0.h[1]\n" - "fmla v4.8h, v18.8h, v0.h[2]\n" - "fmla v5.8h, v18.8h, v0.h[3]\n" - "subs x0, x0, #2\n" - - "ldr d0, [%[in_0], #16]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f_0], #32]\n" //f_o0c0 - "fmla v3.8h, v20.8h, v1.h[1]\n" - "fmla v4.8h, v20.8h, v1.h[2]\n" - "fmla v5.8h, v20.8h, v1.h[3]\n" - "add %[in_0], %[in_0], #16\n" - "add %[f_0], %[f_0], #32\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v3.8h, v3.8h, v0.8h\n" - "fmax v4.8h, v4.8h, v0.8h\n" - "fmax v5.8h, v5.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v3.8h, v3.8h, v1.8h\n" - "fmin v4.8h, v4.8h, v1.8h\n" - "fmin v5.8h, v5.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v6.8h, v2.8h, v18.8h\n" - "fadd v7.8h, v3.8h, v18.8h\n" - "fadd v8.8h, v4.8h, v18.8h\n" - "fadd v9.8h, v5.8h, v18.8h\n" - "fmax v6.8h, v6.8h, v0.8h\n" - "fmax v7.8h, v7.8h, v0.8h\n" - "fmax v8.8h, v8.8h, v0.8h\n" - "fmax v9.8h, v9.8h, v0.8h\n" - "fmin v6.8h, v6.8h, v1.8h\n" - "fmin v7.8h, v7.8h, v1.8h\n" - "fmin v8.8h, v8.8h, v1.8h\n" - "fmin v9.8h, v9.8h, v1.8h\n" - "fdiv v6.8h, v6.8h, v1.8h\n" - "fdiv v7.8h, v7.8h, v1.8h\n" - "fdiv v8.8h, v8.8h, v1.8h\n" - "fdiv v9.8h, v9.8h, v1.8h\n" - "fmul v2.8h, v6.8h, v2.8h\n" - "fmul v3.8h, v7.8h, v3.8h\n" - "fmul v4.8h, v8.8h, v4.8h\n" - "fmul v5.8h, v9.8h, v5.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], 
#16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", - "v18", "v20", "v22", "x0", "x1", "x2" - ); - } - } - - // ohow_reminder % 4 - ohow_s = (ohow / 4) * 4; - for (I32 hw = ohow_s; hw < ohow; hw++) { - const F16 *b0 = biasArray + ic*8; - const F16 *b1 = b0 + 8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - F16 *in_pack = pwArray + hw*ic*8; - for (I32 o = 0; o < I32(oc-1); o+=2) { - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; - // bias - const F16 *b_o0 = b0; - const F16 *b_o1 = b1; - __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "ldr q23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "ldr h0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ldr q19, [%[f_0], #16]\n" //f_o1c0 - "0:\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f_0], #32]\n" //f_o0c0 - "fmla v10.8h, v19.8h, v0.h[0]\n" - "ldr q21, [%[f_0], #48]\n" //f_o1c0 - "subs x0, x0, #2\n" - - "ldr h0, [%[in_0], #4]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f_0], #64]\n" //f_o0c0 - "fmla v10.8h, v21.8h, v1.h[0]\n" - "ldr q19, [%[f_0], #80]\n" //f_o1c0 - "add %[in_0], %[in_0], #4\n" - "add %[f_0], %[f_0], #64\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmax v10.8h, v10.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - "fmin v10.8h, v10.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v19.8h, v2.8h, v18.8h\n" - "fadd v20.8h, v10.8h, v18.8h\n" - "fmax v19.8h, v19.8h, v0.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmin v19.8h, v19.8h, v1.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fdiv v19.8h, v19.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fmul v2.8h, v19.8h, v2.8h\n" - "fmul v10.8h, v20.8h, v10.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q10, [%[out_1]]\n" //out_o1hw0 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", "v23", "x0", "x1", "x2", "x3" - ); - b0 += 16; - b1 += 16; - } - if (oc & 1) { - // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; - F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; - // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; 
- __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" //ic_blk - "ldr h0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "0:\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 - "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f_0], #16]\n" //f_o0c0 - "subs x0, x0, #2\n" - - "ldr h0, [%[in_0], #4]\n" //in_hw0 - "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f_0], #32]\n" //f_o0c0 - "add %[in_0], %[in_0], #4\n" - "add %[f_0], %[f_0], #32\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v2.8h, v2.8h, v0.8h\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "fmax v2.8h, v2.8h, v0.8h\n" - "fmin v2.8h, v2.8h, v1.8h\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three - "fadd v20.8h, v2.8h, v18.8h\n" - "fmax v20.8h, v20.8h, v0.8h\n" - "fmin v20.8h, v20.8h, v1.8h\n" - "fdiv v20.8h, v20.8h, v1.8h\n" - "fmul v2.8h, v20.8h, v2.8h\n" - - "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0", "x1", "x2" - ); - } - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp16/detectionoutput.cpp b/tensor_computing/src/cpu/arm/fp16/detectionoutput.cpp deleted file mode 100644 index 65e5155d..00000000 --- a/tensor_computing/src/cpu/arm/fp16/detectionoutput.cpp +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
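The pointwise stage of the kernels above is a 1x1 convolution over the repacked depthwise result. Below is a plain-C sketch of what one iteration of the paired output-channel loop computes for a single output position, under two stated assumptions: the input is viewed here as a flat [position][ic*8] array for readability (the real pwArray is re-tiled in blocks of 8/4/1 positions by the packing loops), and the weight layout is deduced from the pointer arithmetic in these files (pointwise weights start at filterArray + ic*fh*fw*8, grouped per pair of 8-channel output blocks, input channel major, 16 halfs per channel). Function and variable names are ours, not the library's:

    // One output position, two 8-channel output blocks (o and o+1), matching
    // the fmla pattern "vACC.8h += vWEIGHT.8h * vIN.h[j]" in the assembly;
    // the 8-wide asm path evaluates eight such positions in parallel.
    static void pw_pair_ref(const F16 *in,   // ic*8 depthwise outputs for this position
                            const F16 *f,    // weights for this pair of output blocks
                            const F16 *b0, const F16 *b1,  // 8 biases per block
                            F16 *out0, F16 *out1,          // NCHWc8: 8 halfs per position
                            I32 ic8)                       // = ic*8
    {
        F16 acc0[8], acc1[8];
        for (I32 k = 0; k < 8; k++) {
            acc0[k] = b0[k];  // the asm seeds v2..v9 / v10..v17 from the biases
            acc1[k] = b1[k];
        }
        for (I32 c = 0; c < ic8; c++) {
            for (I32 k = 0; k < 8; k++) {
                acc0[k] += in[c] * f[c * 16 + k];      // weights for block o
                acc1[k] += in[c] * f[c * 16 + 8 + k];  // weights for block o+1
            }
        }
        for (I32 k = 0; k < 8; k++) {
            out0[k] = acc0[k];
            out1[k] = acc1[k];
        }
    }

The 8/4/1 position tiling and the 2/1 output-block tiling in the assembly are this computation unrolled so that all accumulators stay resident in v2-v17 across the whole ic*8 reduction.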
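For reference, the three pointwise activations that the A55 and A76 kernels branch on at runtime (the numeric asm labels 11, 12 and 13) reduce to a handful of vector operations: ReLU is one fmax against zero; ReLU6 adds an fmin against 6.0, whose fp16 bit pattern 0x4600 is what "movi v1.8h, #0x46, lsl #8" builds; h-swish computes x * min(max(x + 3, 0), 6) / 6, with 3.0 encoded as 0x4200. A minimal intrinsics sketch of the h-swish epilogue, assuming an AArch64 compiler targeting ARMv8.2-A with FP16 arithmetic (the helper name is illustrative, not part of this codebase):

    #include <arm_neon.h>

    // h_swish(x) = x * min(max(x + 3, 0), 6) / 6 on eight fp16 lanes,
    // mirroring the fadd/fmax/fmin/fdiv/fmul sequence in the assembly above.
    static inline float16x8_t hswish_f16x8(float16x8_t x)
    {
        float16x8_t zero  = vdupq_n_f16(0);
        float16x8_t three = vdupq_n_f16(3);  // asm: movi v18.8h, #0x42, lsl #8
        float16x8_t six   = vdupq_n_f16(6);  // asm: movi v1.8h, #0x46, lsl #8
        float16x8_t t = vaddq_f16(x, three);
        t = vmaxq_f16(t, zero);              // clamp below at 0
        t = vminq_f16(t, six);               // clamp above at 6
        t = vdivq_f16(t, six);
        return vmulq_f16(x, t);
    }

Keeping the fdiv matches the assembly exactly; a variant could instead multiply by a precomputed 1/6 at a small accuracy difference.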
-
-
-#include "cpu/arm/tensor_computing_arm.h"
-#include "cpu/arm/fp16/tensor_computing_fp16.h"
-
-EE detectionoutput_fp16(std::vector<TensorDesc> inputDesc, std::vector<void*> input, DetectionOutputDesc detectionoutputDesc, TensorDesc outputDesc, F16* output)
-{
-    UNUSED(outputDesc);
-    if (inputDesc.size() != 3) {
-        CHECK_STATUS(NOT_MATCH);
-    }
-    F16* location = (F16*)input[0];
-    F16* confidence = (F16*)input[1];
-    F16* priorbox = (F16*)input[2];
-
-    U32 ilens2 = inputDesc[2].dims[0];
-    U32 num_total_priorbox = ilens2 / 4;
-    U32 num_class = detectionoutputDesc.num_class;
-    F32 nms_threshold = detectionoutputDesc.nms_threshold;
-    U32 nms_top_k = detectionoutputDesc.nms_top_k;
-    U32 keep_top_k = detectionoutputDesc.keep_top_k;
-    F32 confidence_threshold = detectionoutputDesc.confidence_threshold;
-
-    std::vector<std::vector<F32>> boxes;
-    boxes.resize(num_total_priorbox);
-    F16* variance = priorbox + ilens2;
-    // decode each priorbox into a corner-form box
-    for (U32 i = 0; i < num_total_priorbox; i++) {
-        F16* loc = location + i * 4;
-        F16* pb = priorbox + i * 4;
-        F16* var = variance + i * 4;
-
-        F32 pb_w = pb[2] - pb[0];
-        F32 pb_h = pb[3] - pb[1];
-        F32 pb_cx = (pb[0] + pb[2]) * 0.5f;
-        F32 pb_cy = (pb[1] + pb[3]) * 0.5f;
-
-        F32 box_cx = var[0] * loc[0] * pb_w + pb_cx;
-        F32 box_cy = var[1] * loc[1] * pb_h + pb_cy;
-        F32 box_w = static_cast<F32>(exp(var[2] * loc[2]) * pb_w);
-        F32 box_h = static_cast<F32>(exp(var[3] * loc[3]) * pb_h);
-
-        std::vector<F32> box;
-        box.resize(4);
-        box[0] = box_cx - box_w * 0.5f;
-        box[1] = box_cy - box_h * 0.5f;
-        box[2] = box_cx + box_w * 0.5f;
-        box[3] = box_cy + box_h * 0.5f;
-        // store the decoded box
-        boxes[i].assign(box.begin(), box.end());
-    }
-    std::vector<std::vector<BoxRect>> allclass_boxrects;
-    std::vector<std::vector<F32>> allclass_boxscores;
-    allclass_boxrects.resize(num_class);
-    allclass_boxscores.resize(num_class);
-
-    for (U32 i = 1; i < num_class; i++) {
-        std::vector<BoxRect> class_boxrects;
-        std::vector<F32> class_boxscores;
-        for (U32 j = 0; j < num_total_priorbox; j++) {
-            F32 score = confidence[j * num_class + i];
-            if (score > confidence_threshold)
-            {
-                std::vector<F32> inbox;
-                inbox.assign(boxes[j].begin(), boxes[j].end());
-                BoxRect b = { inbox[0], inbox[1], inbox[2], inbox[3], i };
-                class_boxrects.push_back(b);
-                class_boxscores.push_back(score);
-            }
-        }
-        // sort the boxes by score
-        detectionoutput_qsort_descent_arm(class_boxrects, class_boxscores, 0, static_cast<int>(class_boxscores.size() - 1));
-
-        if (nms_top_k < (U32)class_boxrects.size()) {
-            class_boxrects.resize(nms_top_k);
-            class_boxscores.resize(nms_top_k);
-        }
-        // apply nms
-        std::vector<I64> picked;
-        detectionoutput_nms_pickedboxes_arm(class_boxrects, picked, nms_threshold);
-
-        for (I64 j = 0; j < (I64)picked.size(); j++)
-        {
-            I64 picked_box = picked[j];
-            allclass_boxrects[i].push_back(class_boxrects[picked_box]);
-            allclass_boxscores[i].push_back(class_boxscores[picked_box]);
-        }
-    }
-
-    std::vector<BoxRect> boxrects;
-    std::vector<F32> boxscores;
-
-    for (U32 i = 1; i < num_class; i++)
-    {
-        boxrects.insert(boxrects.end(), allclass_boxrects[i].begin(), allclass_boxrects[i].end());
-        boxscores.insert(boxscores.end(), allclass_boxscores[i].begin(), allclass_boxscores[i].end());
-    }
-
-    detectionoutput_qsort_descent_arm(boxrects, boxscores, 0, static_cast<int>(boxscores.size() - 1));
-
-    if (keep_top_k < (U32)boxrects.size())
-    {
-        boxrects.resize(keep_top_k);
-        boxscores.resize(keep_top_k);
-    }
-
-    U32 num_detected = static_cast<U32>(boxrects.size());
-    if (num_detected == 0)
-        return SUCCESS;
-    // the first entry holds the number of available boxes
-    output[0] = num_detected;
-    output[1] = output[2] = output[3] = output[4] = output[5] = 0;
-
-    for (U32 i = 0; i < num_detected; i++) {
-        BoxRect b = boxrects[i];
-        F32 score = boxscores[i];
-
-        output[(i+1)*6] = b.label;
-        output[(i+1)*6+1] = score;
-        output[(i+1)*6+2] = b.xmin;
-        output[(i+1)*6+3] = b.ymin;
-        output[(i+1)*6+4] = b.xmax;
-        output[(i+1)*6+5] = b.ymax;
-    }
-    return SUCCESS;
-}
\ No newline at end of file
diff --git a/tensor_computing/src/cpu/arm/fp16/eltwise.cpp b/tensor_computing/src/cpu/arm/fp16/eltwise.cpp
deleted file mode 100644
index 69be7620..00000000
--- a/tensor_computing/src/cpu/arm/fp16/eltwise.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#include <arm_neon.h>
-#include "cpu/arm/fp16/tensor_computing_fp16.h"
-
-float16x8_t getHalfVector(void* input, int inputSize, int index) {
-    float16x8_t result;
-    if (inputSize == 1) {
-        result = vdupq_n_f16(*((F16*)input));
-        return result;
-    }
-    int local = index % inputSize;
-    int remain = inputSize - local;
-    if (remain >= 8) {
-        result = vld1q_f16((F16*)(input) + local);
-    } else {
-        F16 buffer[8];
-        F16 *ptr = (F16*)input;
-        memcpy(buffer, ptr + local, sizeof(F16) * remain);
-        for (int i = 0; i < 8 - remain; i++) {
-            buffer[remain + i] = ptr[i % inputSize];
-        }
-        result = vld1q_f16(buffer);
-    }
-    return result;
-}
-
-F32 getHalfScalar(void* input, int inputSize, int index) {
-    int local = index % inputSize;
-    return ((F16*)input)[local];
-}
-
-EE eltwise_fp16(std::vector<void*> input, std::vector<int> inputSize, U32 num, U32 len, void *output, EltwiseMode eltwiseMode) {
-    U32 len_tail = len % 8;
-    U32 len_main = len - len_tail;
-    F16 *output_ptr = (F16 *)output;
-    for (U32 i = 0; i < len_main; i += 8) {
-        float16x8_t tmp_v = getHalfVector(input[0], inputSize[0], i);
-        for (U32 j = 1; j < num; j++) {
-            float16x8_t value_v = getHalfVector(input[j], inputSize[j], i);
-            switch (eltwiseMode) {
-                case ELTWISE_SUM:
-                    tmp_v = vaddq_f16(value_v, tmp_v);
-                    break;
-                case ELTWISE_MAX:
-                    tmp_v = vmaxq_f16(value_v, tmp_v);
-                    break;
-                case ELTWISE_PROD:
-                    tmp_v = vmulq_f16(value_v, tmp_v);
-                    break;
-                default:
-                    return NOT_SUPPORTED;
-            }
-        }
-        vst1q_f16(output_ptr + i, tmp_v);
-    }
-    for (U32 i = len_main; i < len; i++) {
-        F32 tmp_s = getHalfScalar(input[0], inputSize[0], i);
-        for (U32 j = 1; j < num; j++) {
-            F32 value_s = getHalfScalar(input[j], inputSize[j], i);
-            switch (eltwiseMode) {
-                case ELTWISE_SUM:
-                    tmp_s = value_s + tmp_s;
-                    break;
-                case ELTWISE_MAX:
-                    tmp_s = (value_s > tmp_s) ?
value_s : tmp_s; - break; - case ELTWISE_PROD: - tmp_s *= value_s; - break; - default: - return NOT_SUPPORTED; - } - } - output_ptr[i] = tmp_s; - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp16/lstm.cpp b/tensor_computing/src/cpu/arm/fp16/lstm.cpp deleted file mode 100644 index 4d9c3c80..00000000 --- a/tensor_computing/src/cpu/arm/fp16/lstm.cpp +++ /dev/null @@ -1,265 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#include "blas-enhance.h" - -void mvm_nkn32(U32 fn, U32 fk, const F16* filterArray, F16* input, F16* output) { - for (U32 n = 0; n < fn; n++) { - F16 *in = input; - const F16 *f = filterArray + n*fk*32; - __asm__ __volatile__( - "ldr s0, [%[in]]\n" - "ldr q1, [%[out]]\n" - "ldr q2, [%[out], #16]\n" - "ldr q3, [%[out], #32]\n" - "ldr q4, [%[out], #48]\n" - "mov x0, %[k]\n" - "ldr q5, [%[f]]\n" - "ldr q6, [%[f], #16]\n" - "ldr q7, [%[f], #32]\n" - "ldr q8, [%[f], #48]\n" - "0:\n" - "prfm pldl2strm, [%[f], #4096]\n" - "prfm pldl1strm, [%[f], #1024]\n" - "ldr d9, [%[f], #64]\n" - "fmla v1.8h, v5.8h, v0.h[0]\n" - "ldr x9, [%[f], #72]\n" - "ins v9.d[1], x9\n" - "ldr d10, [%[f], #80]\n" - "fmla v2.8h, v6.8h, v0.h[0]\n" - "ldr x10, [%[f], #88]\n" - "ins v10.d[1], x10\n" - "ldr d11, [%[f], #96]\n" - "fmla v3.8h, v7.8h, v0.h[0]\n" - "ldr x11, [%[f], #104]\n" - "ins v11.d[1], x11\n" - "ldr d12, [%[f], #112]\n" - "fmla v4.8h, v8.8h, v0.h[0]\n" - "ldr x12, [%[f], #120]\n" - "ins v12.d[1], x12\n" - - "ldr d5, [%[f], #128]\n" - "fmla v1.8h, v9.8h, v0.h[1]\n" - "ldr x5, [%[f], #136]\n" - "ins v5.d[1], x5\n" - "ldr d6, [%[f], #144]\n" - "fmla v2.8h, v10.8h, v0.h[1]\n" - "ldr x6, [%[f], #152]\n" - "ins v6.d[1], x6\n" - "ldr d7, [%[f], #160]\n" - "fmla v3.8h, v11.8h, v0.h[1]\n" - "ldr x7, [%[f], #168]\n" - "ins v7.d[1], x7\n" - "ldr d8, [%[f], #176]\n" - "fmla v4.8h, v12.8h, v0.h[1]\n" - "ldr x8, [%[f], #184]\n" - "add %[in], %[in], #4\n" - "ins v8.d[1], x8\n" - "add %[f], %[f], #128\n" - "ldr s0, [%[in]]\n" - "sub x0, x0, #2\n" - - "cmp x0, #3\n" - "bgt 0b\n" - "ldr q9, [%[f], #64]\n" - "ldr q10, [%[f], #80]\n" - "ldr q11, [%[f], #96]\n" - "ldr q12, [%[f], #112]\n" - "fmla v1.8h, v5.8h, v0.h[0]\n" - "fmla v2.8h, v6.8h, v0.h[0]\n" - "fmla v3.8h, v7.8h, v0.h[0]\n" - "fmla v4.8h, v8.8h, v0.h[0]\n" - "fmla v1.8h, v9.8h, v0.h[1]\n" - "fmla v2.8h, v10.8h, v0.h[1]\n" - "fmla v3.8h, v11.8h, v0.h[1]\n" - "fmla v4.8h, v12.8h, v0.h[1]\n" - "cmp x0, 
#3\n" - "bne 1f\n" - "ldr h0, [%[in], #4]\n" - "ldr q5, [%[f], #128]\n" - "ldr q6, [%[f], #144]\n" - "ldr q7, [%[f], #160]\n" - "ldr q8, [%[f], #176]\n" - "fmla v1.8h, v5.8h, v0.h[0]\n" - "fmla v2.8h, v6.8h, v0.h[0]\n" - "fmla v3.8h, v7.8h, v0.h[0]\n" - "fmla v4.8h, v8.8h, v0.h[0]\n" - - "1:\n" - "str q1, [%[out]]\n" - "str q2, [%[out], #16]\n" - "str q3, [%[out], #32]\n" - "str q4, [%[out], #48]\n" - :[out]"+r"(output), - [f]"+r"(f), - [in]"+r"(in) - :[k]"r"((I64)fk) - :"memory", "cc", "x0", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12" - ); - output += 32; - } -} - -EE lstmcell_fp16(TensorDesc xDesc, const void* currentX, - TensorDesc filterDesc, const void* filter, - TensorDesc biasDesc, const void* bias, - void *state, - U32 tmpBytes, void *tmp, - LSTMDesc lstmDesc, U32 batchStrideX, U32 batchStrideH, - TensorDesc hDesc, void* output, - Arch arch) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - UNUSED(arch); - if (nullptr == currentX - || nullptr == filter - || nullptr == bias - || nullptr == state - || nullptr == tmp - || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ix; - U32 on, oh; - U32 fk, fn; - CHECK_STATUS(tensor2dfGet(xDesc, &idt, &idf, &in, &ix)); - CHECK_STATUS(tensor2dfGet(filterDesc, &fdt, &fdf, &fn, &fk)); - CHECK_STATUS(tensor2dfGet(hDesc, &odt, &odf, &on, &oh)); - if(fdf != DF_NKN32) { - CHECK_STATUS(NOT_MATCH); - } - fn /= 32; - - U32 batch = in; - I32 xDim = ix; - I32 hDim = lstmDesc.numOutput; - I32 column = (lstmDesc.numProjection > 0) ? lstmDesc.numProjection : lstmDesc.numOutput; - if (!(idt == DT_F16 && fdt == DT_F16 && odt == DT_F16)) { - CHECK_STATUS(NOT_MATCH); - } - if (!(4*column == (I32)fn*32 && (ix+oh) == fk && in == on)) { - CHECK_STATUS(NOT_MATCH); - } - F32 forgetBias = lstmDesc.forgetBias; - ActivationMode activationMode = lstmDesc.activationMode; - if (activationMode != ACTIVATION_TANH) - CHECK_STATUS(NOT_SUPPORTED); - - const F16 *currentXArray = (const F16*)currentX; - const F16 *filterArray = (const F16*)filter; - const F16 *biasArray = (const F16*)bias; - const F16 *projectionArray = (const F16*)filter + (fn * 32 * fk); - F16 *lastStateArray = (F16*)state; - F16 *lastHArray = lastStateArray + column; - F16 *tmpArray = (F16*)tmp; - F16 *currentStateArray = (F16*)state; - F16 *currentHArray = currentStateArray + column; - F16 *outputArray = (F16*)output; - F16 *xhArray = tmpArray; - F16 *intermediateH = xhArray + (xDim + hDim); - U32 lastStateStride = column + hDim; - U32 lastHStride = column + hDim; - U32 currentStateStride = column + hDim; - U32 currentHStride = column + hDim; - float16x8_t forgetBiasVector = vdupq_n_f16(forgetBias); - for (U32 m = 0; m < batch; m++) { - F16 *lastBatchH = lastHArray + m * lastHStride; - memcpy(xhArray, currentXArray+m*batchStrideX, xDim*sizeof(F16)); - memcpy(xhArray+xDim, lastBatchH, hDim*sizeof(F16)); - - memcpy(intermediateH, biasArray, column * 4 * sizeof(F16)); - mvm_nkn32(fn, fk, filterArray, xhArray, intermediateH); - - F16 *out_i = intermediateH; - F16 *out_g = out_i + column; - F16 *out_f = out_i + column * 2; - F16 *out_o = out_i + column * 3; - - F16 *lastBatchState = lastStateArray + m * lastStateStride; - F16 *currentBatchState = currentStateArray + m * currentStateStride; - F16 *currentBatchH = currentHArray + m * currentHStride; - F16 *currentOutput = outputArray + m * batchStrideH; - - F16* tmpState, *tmpHH, *tmpH; - if (lstmDesc.zoneoutCell == 
0) { - tmpState = currentBatchState; - } else { - tmpState = out_i; - } - if (lstmDesc.zoneoutOutput != 0) { - tmpHH = out_g; - tmpH = out_f; - } else { - if (lstmDesc.numProjection > 0) { - tmpHH = out_g; - tmpH = out_f; - } else { - tmpHH = currentBatchH; - tmpH = currentBatchH; - } - } - - I32 h = 0; - for (; h < column-7; h+=8) { - float16x8_t out_i_v = vld1q_f16(out_i + h); - float16x8_t out_g_v = vld1q_f16(out_g + h); - float16x8_t out_f_v = vld1q_f16(out_f + h); - float16x8_t out_o_v = vld1q_f16(out_o + h); - float16x8_t C_v = vld1q_f16(lastBatchState + h); - float16x8_t I_v = vsigmoidq_f16(out_i_v); - float16x8_t F_v = vsigmoidq_f16(vaddq_f16(out_f_v, forgetBiasVector)); - float16x8_t O_v = vsigmoidq_f16(out_o_v); - float16x8_t G_v = vtanhq_f16(out_g_v); - C_v = vaddq_f16_f32(vmulq_f16(C_v, F_v), vmulq_f16(I_v, G_v)); - float16x8_t out_hidden_v = vmulq_f16(O_v, vtanhq_f16(C_v)); - vst1q_f16(tmpState + h, C_v); - vst1q_f16(tmpHH + h, out_hidden_v); - } - for (; h < column; h++) { - F16 C_s = lastBatchState[h]; - F16 I_s = 1.0 / (1.0 + exp(-out_i[h])); - F16 F_s = 1.0 / (1.0 + exp(-(out_f[h] + forgetBias))); - F16 O_s = 1.0 / (1.0 + exp(-out_o[h])); - F16 G_s = tanh(out_g[h]); - C_s = C_s * F_s + I_s * G_s; - F16 value = O_s * tanh(C_s); - tmpState[h] = C_s; - tmpHH[h] = value; - } - if (lstmDesc.zoneoutCell != 0) { - array_scale_f16(tmpState, tmpState, column, 1-lstmDesc.zoneoutCell, 0); - array_scale_f16(lastBatchState, lastBatchState, column, lstmDesc.zoneoutCell, 0); - array_add_f16(tmpState, lastBatchState, currentBatchState, column); - } - if (lstmDesc.zoneoutOutput != 0) { - array_scale_f16(tmpHH, tmpH, column, 1-lstmDesc.zoneoutOutput, 0); - array_scale_f16(lastBatchH, lastBatchH, column, lstmDesc.zoneoutOutput, 0); - array_add_f16(tmpH, lastBatchH, currentBatchH, column); - } - if (lstmDesc.numProjection > 0) { - memset(currentBatchH, 0, sizeof(F16) * hDim); - mvm_nkn32(hDim/32, lstmDesc.numProjection, projectionArray, tmpHH, currentBatchH); - tmpHH = currentBatchH; - } - memcpy(currentOutput, tmpHH, sizeof(F16) * hDim); - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp16/normalization.cpp b/tensor_computing/src/cpu/arm/fp16/normalization.cpp deleted file mode 100644 index 7621c132..00000000 --- a/tensor_computing/src/cpu/arm/fp16/normalization.cpp +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
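Before the normalization kernel below, a scalar reference for the recurrence that the removed lstmcell_fp16 above vectorizes is useful: the fused matrix-vector multiply produces the gates in i, g, f, o order, the forget gate receives an extra bias before its sigmoid, and zoneout blends the fresh cell state with the previous one. A hedged sketch in plain C++ (not the library's types or exact signatures):

    #include <cmath>

    // One LSTM cell step, mirroring the removed fp16 kernel's math:
    // C' = sigmoid(f + forgetBias) * C + sigmoid(i) * tanh(g), H = sigmoid(o) * tanh(C').
    static inline float sigmoidf(float x) { return 1.0f / (1.0f + std::exp(-x)); }

    void lstm_step_ref(const float* i_, const float* g_, const float* f_,
                       const float* o_, float* C, float* H, int column,
                       float forgetBias, float zoneoutCell) {
        for (int h = 0; h < column; h++) {
            float I = sigmoidf(i_[h]);
            float F = sigmoidf(f_[h] + forgetBias);
            float O = sigmoidf(o_[h]);
            float G = std::tanh(g_[h]);
            float Cnew = C[h] * F + I * G;            // cell state update
            if (zoneoutCell != 0) {                   // zoneout keeps part of old C
                Cnew = (1 - zoneoutCell) * Cnew + zoneoutCell * C[h];
            }
            C[h] = Cnew;
            H[h] = O * std::tanh(Cnew);               // hidden output
        }
    }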
- - -#include -#include "cpu/arm/fp16/tensor_computing_fp16.h" - -inline void array_norm_scale_fp16(F16 *input, F16 *output, I32 len, F32 mean, F32 var, F16 *alpha, F16 *beta) { - F32 eps = 1e-6; - F32 std_value = sqrt(var + eps); - float16x8_t mean_v = vdupq_n_f16(mean); - float16x8_t std_v = vdupq_n_f16(std_value); - - I32 i = 0; - for(i = 0; i < len - 7; i += 8){ - float16x8_t in = vld1q_f16(input + i); - float16x8_t alpha_v = vld1q_f16(alpha + i); - float16x8_t beta_v = vld1q_f16(beta + i); - - float16x8_t tmp_v = vsubq_f16(in, mean_v); - tmp_v = vdivq_f16(tmp_v, std_v); - tmp_v = vfmaq_f16(beta_v, alpha_v, tmp_v); - vst1q_f16(output+i, tmp_v); - } - for(; i < len; i++){ - output[i] = alpha[i] * (input[i] - mean) / std_value + beta[i]; - } -} - -EE layer_normalization_fp16(F16 *alpha, F16 *beta, - TensorDesc inputDesc, F16* input, - TensorDesc outputDesc, F16* output) -{ - UNUSED(outputDesc); - if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - U32 size = tensorNumElements(inputDesc); - I32 size_inner = inputDesc.dims[0]; - I32 size_outer = size / size_inner; - for(I32 i = 0; i < size_outer; i++) { - F16 *current_input = input + i * size_inner; - F16 *current_output = output + i * size_inner; - F32 mean = array_mean_f16(current_input, size_inner); - F32 var = array_var_f16(current_input, size_inner, mean); - - array_norm_scale_fp16(current_input, current_output, size_inner, mean, var, alpha, beta); - } - - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp16/pooling.cpp b/tensor_computing/src/cpu/arm/fp16/pooling.cpp deleted file mode 100644 index 984e776b..00000000 --- a/tensor_computing/src/cpu/arm/fp16/pooling.cpp +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
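The removed layer_normalization_fp16 above reduces each innermost row to a mean and variance and then applies array_norm_scale_fp16; in scalar form the transform is y[i] = alpha[i] * (x[i] - mean) / sqrt(var + eps) + beta[i] with eps = 1e-6 as in the source. A minimal float reference sketch:

    #include <cmath>

    // Per-row layer normalization, the scalar form of the fp16 kernel above.
    void layer_norm_row_ref(const float* x, const float* alpha, const float* beta,
                            float* y, int len) {
        float mean = 0, var = 0;
        for (int i = 0; i < len; i++) mean += x[i];
        mean /= len;
        for (int i = 0; i < len; i++) { float d = x[i] - mean; var += d * d; }
        var /= len;
        float inv_std = 1.0f / std::sqrt(var + 1e-6f);   // eps matches the source
        for (int i = 0; i < len; i++)
            y[i] = alpha[i] * (x[i] - mean) * inv_std + beta[i];
    }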
- - -#include "cpu/arm/fp16/tensor_computing_fp16.h" - -EE pooling_fp16(TensorDesc inputDesc, const F16* input, PoolingDesc poolingDesc, TensorDesc outputDesc, F16* output) -{ - if (nullptr == input || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt, odt; - DataFormat idf, odf; - U32 in = 0, ic = 0, ih = 0, iw = 0, - on = 0, oc = 0, oh = 0, ow = 0; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - - if (idt != odt || idt != DT_F16) { - CHECK_STATUS(NOT_MATCH); - } - if (in != on || ic != oc) { - CHECK_STATUS(NOT_MATCH); - } - if (idf != DF_NCHWC8 || odf != idf) { - CHECK_STATUS(NOT_MATCH); - } - - PoolingMode pm = poolingDesc.pm; - U32 strideH = poolingDesc.stride_h; - U32 strideW = poolingDesc.stride_w; - U32 paddingT = poolingDesc.padding_top; - U32 paddingL = poolingDesc.padding_left; - U32 kernelSizeH = poolingDesc.kernelSize_h; - U32 kernelSizeW = poolingDesc.kernelSize_w; - if (paddingT >= kernelSizeH || paddingL >= kernelSizeW) { - CHECK_STATUS(NOT_SUPPORTED); - } - - ic /= 8; - for (U32 n = 0; n < in; n++) { - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < oh; h++) { - for (U32 w = 0; w < ow; w++) { - int hstart = (int)h * (int)strideH - (int)paddingT; - int wstart = (int)w * (int)strideW - (int)paddingL; - int hend = UNI_MIN(hstart + kernelSizeH, ih); - int wend = UNI_MIN(wstart + kernelSizeW, iw); - hstart = UNI_MAX(hstart, 0); - wstart = UNI_MAX(wstart, 0); - float16x8_t in1, out1; - float16x8_t poolSize = vdupq_n_f16(float16_t((hend - hstart)*(wend - wstart))); - out1 = vdupq_n_f16(float16_t((pm == POOLING_MAX) ? UNI_F16_MIN : 0)); - for (int kernelH = hstart; kernelH < hend; kernelH++) { - for (int kernelW = wstart; kernelW < wend; kernelW++) { - const U32 index = (kernelH * iw + kernelW) * 8; - in1 = vld1q_f16(input + index); - switch (pm) { - case POOLING_MAX: - out1 = vmaxq_f16(in1, out1); - break; - case POOLING_MEAN: - out1 = vaddq_f16(out1, in1); - break; - default: - CHECK_STATUS(NOT_SUPPORTED); - } - } - } - vst1q_f16(output + (h * ow + w) * 8, ((pm == POOLING_MAX) ? out1 : vdivq_f16(out1, poolSize))); - } - } - input += ih * iw * 8; - output += oh * ow * 8; - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp16/priorbox.cpp b/tensor_computing/src/cpu/arm/fp16/priorbox.cpp deleted file mode 100644 index 7a965b06..00000000 --- a/tensor_computing/src/cpu/arm/fp16/priorbox.cpp +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/fp16/tensor_computing_fp16.h" - -EE priorbox_fp16(std::vector inputDesc, PriorBoxDesc priorboxDesc, TensorDesc outputDesc, F16* output) -{ - UNUSED(outputDesc); - if (nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - U32 num = inputDesc.size(); - if (num != 2) return NOT_MATCH; - DataType idt0, idt1; - DataFormat idf0, idf1; - U32 in0 = 0, ic0 = 0, ih0 = 0, iw0 = 0; - U32 in1 = 0, ic1 = 0, ih1 = 0, iw1 = 0; - CHECK_STATUS(tensor4dGet(inputDesc[0], &idt0, &idf0, &in0, &ic0, &ih0, &iw0)); - CHECK_STATUS(tensor4dGet(inputDesc[1], &idt1, &idf1, &in1, &ic1, &ih1, &iw1)); - - std::vector minsizes = priorboxDesc.min_sizes; - std::vector maxsizes = priorboxDesc.max_sizes; - std::vector ars = priorboxDesc.aspect_ratios; - U32 flip = priorboxDesc.flip; - U32 clip = priorboxDesc.clip; - F32 vars[4]; - for (int i = 0; i < 4 ; i++){ - vars[i] = priorboxDesc.variances[i]; - } - U32 imageH = priorboxDesc.image_h; - U32 imageW = priorboxDesc.image_w; - F32 stepH = priorboxDesc.step_h; - F32 stepW = priorboxDesc.step_w; - F32 offset = priorboxDesc.offset; - - U32 layer_w = iw0; - U32 layer_h = ih0; - - int img_w, img_h; - if(imageH == 0 || imageW == 0){ - img_w = iw1; - img_h = ih1; - } else { - img_w = imageW; - img_h = imageH; - } - F32 stp_h, stp_w; - if (stepW == 0 || stepH == 0){ - stp_w = static_cast(ceil((img_w)/layer_w)); - stp_h = static_cast(ceil((img_h)/layer_h)); - } else{ - stp_w = stepW; - stp_h = stepH; - } - - U32 num_priorboxs = ars.size(); - if(flip){ - num_priorboxs = num_priorboxs * 2; - } - U32 num_minsize = minsizes.size(); - num_priorboxs = (num_priorboxs + 1) * num_minsize; - if(!maxsizes.empty()){ - U32 num_maxsize = priorboxDesc.max_sizes.size(); - num_priorboxs = num_priorboxs + num_maxsize; - } - int dim = layer_h * layer_w * num_priorboxs * 4; - int idx = 0; - for (U32 h = 0 ; h < layer_h ; h++){ - for (U32 w = 0 ; w < layer_w ; w++){ - F32 center_x = (w + offset) * stp_w; - F32 center_y = (h + offset) * stp_h; - F32 box_w , box_h; - for( int n = 0 ; n < (int)minsizes.size() ; n++){ - F32 minsize = minsizes[n]; - box_w = box_h = minsize; - output[idx++] = (center_x - box_w/2) / img_w; - output[idx++] = (center_y - box_h/2) / img_h; - output[idx++] = (center_x + box_w/2) / img_w; - output[idx++] = (center_y + box_h/2) / img_h; - - if ((int)maxsizes.size() > 0) { - F32 maxsize = maxsizes[n]; - box_w = box_h = sqrt(minsize * maxsize); - output[idx++] = (center_x - box_w/2) / img_w; - output[idx++] = (center_y - box_h/2) / img_h; - output[idx++] = (center_x + box_w/2) / img_w; - output[idx++] = (center_y + box_h/2) / img_h; - } - - for (int a = 0; a < (int)ars.size(); a++){ - F32 ar = ars[a]; - box_w = minsize * sqrt(ar); - box_h = minsize / sqrt(ar); - output[idx++] = (center_x - box_w/2) / img_w; - output[idx++] = (center_y - box_h/2) / img_h; - output[idx++] = (center_x + box_w/2) / img_w; - output[idx++] = (center_y + box_h/2) / img_h; - if(flip){ - output[idx++] = (center_x - box_h/2) / img_w; - output[idx++] = (center_y - box_w/2) / img_h; - output[idx++] = (center_x + box_h/2) / img_w; - output[idx++] = (center_y + box_w/2) / img_h; - } - } - } - } - } - - if (clip) { - for (int i = 0; i < dim; i++) { - output[i] = std::min(std::max(output[i], 0.), 1.); - } - } - - //variances - 
for(int i = 0 ; i < dim/4 ; i++){ - output[idx++] = vars[0]; - output[idx++] = vars[1]; - output[idx++] = vars[2]; - output[idx++] = vars[3]; - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp16/quantize.cpp b/tensor_computing/src/cpu/arm/fp16/quantize.cpp deleted file mode 100644 index a9fbf7ea..00000000 --- a/tensor_computing/src/cpu/arm/fp16/quantize.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include -#include "cpu/arm/fp16/tensor_computing_fp16.h" - -inline void apply_scale_f16(U32 numData, F16* array, F16 scale, INT8* qArray) -{ - for (U32 i=0; i 127.0) { - qArray[i] = 127; - } else if (tmp < -127.0) { - qArray[i] = -127; - } else { - qArray[i] = round(tmp); - } - } -} - -EE quantize_tensor_fp16(TensorDesc dDesc, const void* data, TensorDesc* qDesc, void* qData, F16 *scale) -{ - if (nullptr == data || nullptr == qDesc || nullptr == qData || nullptr == scale) { - CHECK_STATUS(NULL_POINTER); - } - DataType dt; - DataFormat df; - U32 n, c, h, w; - if (tensorIs2d(dDesc)) { - CHECK_STATUS(tensor2dfGet(dDesc, &dt, &df, &n, &w)); - c = 1; - h = 1; - } else if (tensorIs3d(dDesc)) { - CHECK_STATUS(tensor3dGet(dDesc, &dt, &df, &n, &h, &w)); - c = 1; - } else { - CHECK_STATUS(tensor4dGet(dDesc, &dt, &df, &n, &c, &h, &w)); - } - - switch (dt) { - case DT_F16: { - switch (df) { - case DF_HWNCN8C4:{ // winograd - F16 *array = (F16*)data; - for (U32 idx=0; idx<36; idx++) { - float16x8_t tmp_v = vld1q_f16(array + idx*8*c); - float16x8_t max_v = tmp_v; - float16x8_t min_v = tmp_v; - - for (U32 o=0; o 0 && min < 0) { - F16 scale_max = 127.0 / max; - F16 scale_min = -127.0 / min; - scale[idx] = (scale_max < scale_min) ? scale_max : scale_min; - } else if (max < 0) { - scale[idx] = -127.0 / min; - } else { // min > 0 - scale[idx] = 127.0 / max; - } - - INT8* qArray = (INT8*)qData; - for (U32 o=0; o= 8); - U32 i = 8; - for (; i < numData - 7; i += 8) { - tmp_v = vld1q_f16(array+i); - max_v = vmaxq_f16(max_v, tmp_v); - min_v = vminq_f16(min_v, tmp_v); - } - - F16 max = vmaxvq_f16(max_v); - F16 min = vminvq_f16(min_v); - - for (; i < numData; i++) { - F16 tmp = array[i]; - if (tmp > max) { - max = tmp; - } - if (tmp < min) { - min = tmp; - } - } - if (max == 0 && min == 0) { - CHECK_STATUS(NOT_SUPPORTED); - } - F16 scaleRaw; - if (max > 0 && min < 0) { - F32 scale_max = 127.0 / max; - F32 scale_min = -127.0 / min; - scaleRaw = (scale_max < scale_min) ? 
scale_max : scale_min; - } else if (max < 0) { - scaleRaw = -127.0 / min; - } else { // min > 0 - scaleRaw = 127.0 / max; - } - DEBUG_info(max << " is the max FP16 value, and min value is " << min); - if (*scale < scaleRaw) { - *scale = scaleRaw; - } - - INT8* qArray = (INT8*)qData; - apply_scale_f16(numData, array, *scale, qArray); - - if (tensorIs2d(dDesc)) { - *qDesc = tensor2df(DT_I8, df, n, w); - } else if (tensorIs3d(dDesc)) { - *qDesc = tensor3df(DT_I8, df, n, h, w); - } else { - *qDesc = tensor4df(DT_I8, df, n, c, h, w); - } - break; - } - } - break; - } - default:{ - CHECK_STATUS(NOT_SUPPORTED); - break; - } - } - DEBUG_info(scale[0] << " is the quantization scale"); - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp16/scale.cpp b/tensor_computing/src/cpu/arm/fp16/scale.cpp deleted file mode 100644 index 351bd63f..00000000 --- a/tensor_computing/src/cpu/arm/fp16/scale.cpp +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include "cpu/arm/fp16/tensor_computing_fp16.h" - -EE scale_nchwc8_fp16(F16* input, F16* alpha, F16* beta, I32 in, I32 ic, I32 elements_per_channel, F16* output) -{ - float16x8_t one = vdupq_n_f16(1.); - float16x8_t zero = vdupq_n_f16(0.); - U32 index = 0; - for (I32 n = 0; n < in; n++) { - for (I32 c = 0; c < ic; c += 8) { - float16x8_t alpha_vec = (alpha == nullptr) ? one : vld1q_f16(alpha + c); - float16x8_t beta_vec = (beta == nullptr) ? zero : vld1q_f16(beta + c); - for (I32 i = 0; i < elements_per_channel; i++) { - float16x8_t in_vec = vld1q_f16(input + index); - float16x8_t out_vec = vfmaq_f16(beta_vec, alpha_vec, in_vec); - vst1q_f16(output+index, out_vec); - index += 8; - } - } - } - return SUCCESS; -} - -EE scale_nchw_fp16(F16* input, F16* alpha, F16* beta, I32 in, I32 ic, I32 elements_per_channel, F16* output) -{ - float16x8_t one = vdupq_n_f16(1.); - float16x8_t zero = vdupq_n_f16(0.); - U32 index = 0; - for (I32 n = 0; n < in; n++) { - for (I32 c = 0; c < ic; c++) { - float16x8_t alpha_vec = (alpha == nullptr) ? one : vdupq_n_f16(alpha[c]); - float16x8_t beta_vec = (beta == nullptr) ? 
zero : vdupq_n_f16(beta[c]); - I32 i = 0; - for (; i < elements_per_channel-7; i += 8) { - float16x8_t in_vec = vld1q_f16(input + index); - float16x8_t out_vec = vfmaq_f16(beta_vec, alpha_vec, in_vec); - vst1q_f16(output+index, out_vec); - index += 8; - } - for (; i < elements_per_channel; i++) { - output[index] = alpha[c] * input[index] + beta[c]; - index++; - } - } - } - return SUCCESS; -} - -EE scale_nhwc_fp16(F16* input, F16* alpha, F16* beta, I32 in, I32 ic, I32 elements_per_channel, F16* output) -{ - float16x8_t one = vdupq_n_f16(1.); - float16x8_t zero = vdupq_n_f16(0.); - U32 index = 0; - for (I32 n = 0; n < in; n++) { - for (I32 i = 0; i < elements_per_channel; i++) { - I32 c = 0; - for (; c < ic-7; c += 8) { - float16x8_t alpha_vec = (alpha == nullptr) ? one : vld1q_f16(alpha+c); - float16x8_t beta_vec = (beta == nullptr) ? zero : vld1q_f16(beta+c); - float16x8_t in_vec = vld1q_f16(input + index); - float16x8_t out_vec = vfmaq_f16(beta_vec, alpha_vec, in_vec); - vst1q_f16(output+index, out_vec); - index += 8; - } - for (; c < ic; c++) { - F32 beta_s = (beta == nullptr) ? 0 : beta[c]; - output[index] = alpha[c] * input[index] + beta_s; - index++; - } - } - } - return SUCCESS; -} - -EE scale_fp16(F16* input, I32 axis, I32 nDims, F16* alpha, F16* beta, I32 in, I32 ic, I32 elements_per_channel, F16* output) -{ - if (nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - EE ret = SUCCESS; - if (axis == 1 || axis == 0) { - ret = scale_nchw_fp16(input, alpha, beta, in, ic, elements_per_channel, output); - CHECK_STATUS(ret); - } else if (axis == nDims - 1) { - ret = scale_nhwc_fp16(input, alpha, beta, in, ic, elements_per_channel, output); - CHECK_STATUS(ret); - } else if (axis == nDims) { - ret = scale_nchwc8_fp16(input, alpha, beta, in, ic, elements_per_channel, output); - CHECK_STATUS(ret); - } else { - ret = NOT_SUPPORTED; - CHECK_STATUS(ret); - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp16/softmax.cpp b/tensor_computing/src/cpu/arm/fp16/softmax.cpp deleted file mode 100644 index df416489..00000000 --- a/tensor_computing/src/cpu/arm/fp16/softmax.cpp +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
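The removed scale_fp16 above dispatches on the axis to one of three layout-specific kernels; the common operation is a per-channel affine y = alpha[c] * x + beta[c], with the NCHWC8 variant applying an 8-wide alpha/beta vector per channel block and the NHWC variant walking channels innermost. A scalar NCHW sketch, defaulting absent alpha/beta to identity as the vectorized paths do:

    // Per-channel affine transform over NCHW data (scalar reference sketch).
    void scale_nchw_ref(const float* in, const float* alpha, const float* beta,
                        int n, int c, int elements_per_channel, float* out) {
        int idx = 0;
        for (int b = 0; b < n; b++) {
            for (int ch = 0; ch < c; ch++) {
                float a = alpha ? alpha[ch] : 1.0f;   // null alpha -> scale of 1
                float s = beta ? beta[ch] : 0.0f;     // null beta  -> shift of 0
                for (int i = 0; i < elements_per_channel; i++, idx++) {
                    out[idx] = a * in[idx] + s;
                }
            }
        }
    }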
- - -#include -#include -#include "cpu/arm/fp16/tensor_computing_fp16.h" - -void softmax_lastAxis_fp16(const F16* input, I32 loopOuter, I32 loops, F16 *output) -{ - for(I32 i = 0; i < loopOuter; i++) { - const F16 *inputPtr = input + i * loops; - F16 *outputPtr = output + i * loops; - - float16x8_t max_v, sub_v, sum_v, tmp_v; - F32 max_s, tmp_s; - max_s = array_max_f16(inputPtr, loops); - max_v = vdupq_n_f16(max_s); - sum_v = vdupq_n_f16(0); - - I32 j = 0; - F32 sum_s = 0; - for(j = 0; j < loops - 7; j += 8) { - float16x8_t in = vld1q_f16(inputPtr + j); - sub_v = vsubq_f16(in, max_v); - tmp_v = vexpq_f16_f32(sub_v); - sum_v = vaddq_f16(sum_v, tmp_v); - vst1q_f16(outputPtr + j, tmp_v); - } - sum_s += vaddvq_f16(sum_v); - for(; j < loops; j++){ - tmp_s = exp(inputPtr[j] - max_s); - outputPtr[j] = tmp_s; - sum_s += tmp_s; - } - array_scale_f16(outputPtr, outputPtr, loops, 1.0 / sum_s, 0); - } -} - -void softmax_anyAxis_fp16(const F16* input, I32 loopOuter, I32 loops, I32 loopInner, F16 *output) -{ - std::vector buffer(loopInner * 2); - F16* maxBuffer = &buffer[0] ; - F16* sumBuffer = &buffer[loopInner] ; - I32 k = 0; - for(I32 i = 0; i < loopOuter; i++) { - const F16* inputPtrBase = input + i * loops * loopInner; - F16* outputPtrBase = output + i * loops * loopInner; - - memcpy(maxBuffer, inputPtrBase, loopInner * sizeof(F16)); - memset(sumBuffer, 0, loopInner * sizeof(F16)); - for (I32 j = 1; j < loops; j++) { - const F16* inputPtr = inputPtrBase + j * loopInner; - for (k = 0; k < loopInner-7; k += 8) { - float16x8_t in_v = vld1q_f16(inputPtr + k); - float16x8_t out_v = vld1q_f16(maxBuffer + k); - float16x8_t max_v = vmaxq_f16(in_v, out_v); - vst1q_f16(maxBuffer + k, max_v); - } - for (; k < loopInner; k++) - maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]); - } - for (I32 j = 0; j < loops; j++) { - const F16* inputPtr = inputPtrBase + j * loopInner; - F16* outputPtr = outputPtrBase + j * loopInner; - for (k = 0; k < loopInner-7; k += 8) { - float16x8_t in_v = vld1q_f16(inputPtr + k); - float16x8_t max_v = vld1q_f16(maxBuffer + k); - float16x8_t sub_v = vsubq_f16(in_v, max_v); - float16x8_t exp_v = vexpq_f16_f32(sub_v); - float16x8_t sum_v = vld1q_f16(sumBuffer + k); - sum_v = vaddq_f16(sum_v, exp_v); - vst1q_f16(sumBuffer + k, sum_v); - vst1q_f16(outputPtr + k, exp_v); - } - for (; k < loopInner; k++) { - outputPtr[k] = exp(inputPtr[k] - maxBuffer[k]); - sumBuffer[k] += outputPtr[k]; - } - } - for (I32 j = 0; j < loops; j++) { - F16* outputPtr = outputPtrBase + j * loopInner; - for (k = 0; k < loopInner-7; k += 8) { - float16x8_t out_v = vld1q_f16(outputPtr + k); - float16x8_t sum_v = vld1q_f16(sumBuffer + k); - out_v = vdivq_f16(out_v, sum_v); - vst1q_f16(outputPtr + k, out_v); - } - for (; k < loopInner; k++) { - outputPtr[k] /= sumBuffer[k]; - } - } - } -} - - -EE softmax_fp16(TensorDesc inputDesc, const F16* input, - int axis, - TensorDesc outputDesc, F16* output) -{ - UNUSED(outputDesc); - if(nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - U32 size = tensorNumElements(inputDesc); - axis = (axis + inputDesc.nDims) % inputDesc.nDims; - axis = inputDesc.nDims - 1 - axis; - I32 loops = inputDesc.dims[axis]; - - I32 loopInner = 1; - for (int i = 0; i < axis; i++) - loopInner *= inputDesc.dims[i]; - U32 loopOuter = size / loops / loopInner; - - if (loopInner == 1) { - if (DF_NCHWC8 == inputDesc.df && 4 == inputDesc.nDims && - (inputDesc.dims[1] != 1 || inputDesc.dims[0] != 1)) { - CHECK_REQUIREMENT(2 != axis); - loopInner *= 8; - loopOuter /= 8; - 
softmax_anyAxis_fp16(input, loopOuter, loops, loopInner, output); - } else { - softmax_lastAxis_fp16(input, loopOuter, loops, output); - } - } else { - CHECK_REQUIREMENT(DF_NCHWC8 != inputDesc.df); - softmax_anyAxis_fp16(input, loopOuter, loops, loopInner, output); - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp16/tensor_computing_fp16.h b/tensor_computing/src/cpu/arm/fp16/tensor_computing_fp16.h deleted file mode 100644 index a4d3e83e..00000000 --- a/tensor_computing/src/cpu/arm/fp16/tensor_computing_fp16.h +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_TENSOR_COMPUTING_FP16 -#define _H_TENSOR_COMPUTING_FP16 -#include - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "cpu/arm/fp16/arm_functions_fp16.h" - -EE convolution_infer_forward_tmp_bytes_fp16(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes); - -EE convolution_transform_filter_fp16(TensorDesc filterDesc, const F16* filter, - ConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, F16* filterTransformed); - -EE convolution_fp16(TensorDesc inputDesc, F16* input, - TensorDesc filterDesc, const F16* filter, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, - TensorDesc biasDesc, const F16* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* output, - ActivationDesc activationDesc, - Arch arch); - -EE deconvolution_infer_forward_algorithm_fp16(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionPolicy policy, ConvolutionForwardAlgorithm *algorithm); - -EE deconvolution_infer_forward_tmp_bytes_fp16(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes); - -EE deconvolution_transform_filter_bytes_fp16(TensorDesc filterDesc, ConvolutionForwardAlgorithm algorithm, U32* bytes); - -EE deconvolution_transform_filter_fp16(TensorDesc filterDesc, const F16* filter, - ConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, F16* filterTransformed); - -EE deconvolution_fp16(TensorDesc inputDesc, F16* input, - TensorDesc filterDesc, const F16* filter, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, - TensorDesc biasDesc, const F16* bias, - 
U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* output, - ActivationDesc activationDesc, - Arch arch); - -EE pooling_fp16(TensorDesc inputDesc, const F16* input, PoolingDesc poolingDesc, TensorDesc outputDesc, F16* output); - -EE softmax_fp16(TensorDesc inputDesc, const F16* input, int axis, TensorDesc outputDesc, F16* output); - -EE attention_fp16(U32 batch, U32 numHeads, I32 fromSequenceLength, I32 toSequenceLength, const F16 *input, F16 *output); - -EE clip_fp16(F16 *input, F16 *output, I32 len, F32 minValue, F32 maxValue); - -EE concat_fp16(std::vector inputDesc, std::vector input, F16* inputScale, - TensorDesc outputDesc, F16* output, F16* outputScale, U32 concatDim); - -EE depthwise_convolution_infer_forward_algorithm_fp16(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionPolicy policy, DepthwiseConvolutionForwardAlgorithm *algorithm, DataType targetDataType); - -EE depthwise_convolution_transform_filter_bytes_fp16(TensorDesc filterDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32* bytes); - -EE depthwise_convolution_transform_filter_fp16(TensorDesc filterDesc, const F16* filter, - DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, F16* filterTransformed); - -EE depthwise_convolution_infer_forward_tmp_bytes_fp16(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32 *bytes); - -EE depthwise_convolution_transform_filter_fp16(TensorDesc filterDesc, const F16* filter, - DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, F16* filterTransformed); - -EE depthwise_convolution_fp16(TensorDesc inputDesc, F16* input, - TensorDesc filterDesc, const F16* filter, - ConvolutionDesc convDesc, DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc biasDesc, const F16* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* output, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc, - Arch arch); - -EE detectionoutput_fp16(std::vector inputDesc, std::vector input, DetectionOutputDesc detectionoutputDesc, TensorDesc outputDesc, F16* output); - -EE eltwise_fp16(std::vectorinput, std::vector inputSize, U32 num, U32 len, void *output, EltwiseMode eltwiseMode); - -EE lstmcell_fp16(TensorDesc xDesc, const void* currentX, - TensorDesc filterDesc, const void* filter, - TensorDesc biasDesc, const void* bias, - void *state, - U32 tmpBytes, void *tmp, - LSTMDesc lstmDesc, U32 batchStrideX, U32 batchStrideH, - TensorDesc hDesc, void* output, - Arch arch); - -EE multiply_fp16(F16 *alpha, F16 *beta, TensorDesc inputDesc, F16* input, TensorDesc outputDesc, F16 *output); - -EE layer_normalization_fp16(F16 *alpha, F16 *beta, - TensorDesc inputDesc, F16* input, - TensorDesc outputDesc, F16* output); - -EE pooling_fp16(TensorDesc inputDesc, const F16* input, PoolingDesc poolingDesc, const F16* scale, TensorDesc outputDesc, F16* output); - -EE priorbox_fp16(std::vector inputDesc, PriorBoxDesc priorboxDesc, TensorDesc outputDesc, F16* output); - -EE scale_fp16(F16* input, I32 axis, I32 nDims, F16* alpha, F16* beta, I32 in, I32 ic, I32 elements_per_channel, F16* output); - -EE softmax_fp16(TensorDesc inputDesc, const F16* input, - TensorDesc outputDesc, F16* output); - -EE check_fp16(TensorDesc inputDescA, const F16* inputA, - TensorDesc inputDescB, const F16* inputB, - CheckMode checkMode, - TensorDesc outputDesc, I32* output); - -EE quantize_tensor_fp16(TensorDesc dDesc, 
const void* data, TensorDesc* qDesc, void* qData, F16 *scale); - -EE attention_mask_fp16(TensorDesc inputDesc, const F16* input, - I32 attentionLength, bool sameLength, float maskValue, - TensorDesc outputDesc, F16* output); -#endif diff --git a/tensor_computing/src/cpu/arm/fp32/arm_functions_fp32.h b/tensor_computing/src/cpu/arm/fp32/arm_functions_fp32.h deleted file mode 100644 index 23da22ec..00000000 --- a/tensor_computing/src/cpu/arm/fp32/arm_functions_fp32.h +++ /dev/null @@ -1,288 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_ARM_FUNCTIONS_FP32 -#define _H_ARM_FUNCTIONS_FP32 - -#ifdef _USE_FP32 -#include "arm_neon_expand.h" -#include -#include "tensor_computing_type.h" - -// array sum -inline F32 array_sum_f32(const F32 *data, I32 len) { - if(len <= 0) return 0; - - I32 i = 0; - F32 sum_s = 0; - float32x4_t sum_v = vdupq_n_f32(0); - for(i = 0; i < len - 3; i+=4){ - float32x4_t in = vld1q_f32(data + i); - sum_v = vaddq_f32(sum_v, in); - } - sum_s += vaddvq_f32(sum_v); - for(; i < len; i++){ - sum_s += data[i]; - } - return sum_s; -} - -// array mean -inline F32 array_mean_f32(const F32 *data, I32 len) { - if(len <= 0) return 0; - return array_sum_f32(data, len) / len; -} - -// array var -inline F32 array_var_f32(const F32 *data, I32 len, F32 mean) { - if(len <= 0) return 0; - - I32 i = 0; - F32 sum_s = 0; - float32x4_t mean_v = vdupq_n_f32(mean); - for(i = 0; i < len - 3; i+=4){ - float32x4_t in = vld1q_f32(data + i); - float32x4_t tmp_v = vsubq_f32(in, mean_v); - float32x4_t sum_v = vmulq_f32(tmp_v, tmp_v); - sum_s += vaddvq_f32(sum_v); - } - for(; i < len; i++){ - F32 in = data[i]; - F32 tmp = in - mean; - sum_s += tmp * tmp; - } - return sum_s / len; -} - -// array max -inline F32 array_max_f32(const F32* data, I32 len) { - F32 max_s = data[0]; - I32 i = 0; - if(len >= 4){ - float32x4_t max_v, tmp_v; - max_v = vld1q_f32(data); - for (i = 4; i < len - 3; i+=4) { - tmp_v = vld1q_f32(data + i); - max_v = vmaxq_f32(tmp_v, max_v); - } - max_s = vmaxvq_f32(max_v); - } - - for (; i < len; i++) { - if(data[i] > max_s) - max_s = data[i]; - } - - return max_s; -} - -inline void array_scale_f32(F32 *input, F32 *output, I32 len, F32 alpha, F32 beta) { - I32 i = 0; - float32x4_t alpha_v = vdupq_n_f32(alpha); - float32x4_t beta_v = vdupq_n_f32(beta); - for (i = 0; i < len-3; i+=4) { - float32x4_t in = vld1q_f32(input + i); - float32x4_t tmp_v = vfmaq_f32(beta_v, alpha_v, in); - vst1q_f32(output+i, 
tmp_v); - } - for (; i < len; i++) { - output[i] = alpha * input[i] + beta; - } -} - -inline EE activation_fp32(F32* input, U32 len, ActivationDesc activationDesc, F32* output) -{ - float32x4_t in, out; - float32x4_t zero = vdupq_n_f32(0.); - float32x4_t one = vdupq_n_f32(1.); - float32x4_t three = vdupq_n_f32(3.); - float32x4_t six = vdupq_n_f32(6.); - U32 len_main = len / 4; - U32 len_tail = len % 4; - - F32 value; - switch (activationDesc.mode){ - case ACTIVATION_NULL: { - break; - } - case ACTIVATION_RELU: { - if (activationDesc.value[0] == 0) { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vmaxq_f32(zero, in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = (input[i] < 0) ? 0 : input[i]; - } - } else { - float32x4_t scale = vdupq_n_f32(activationDesc.value[0]); - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - float32x4_t tmp = vmulq_f32(in, scale); - out = vmaxq_f32(tmp, in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - float tmp = activationDesc.value[0] * input[i]; - output[i] = (input[i] < tmp) ? tmp : input[i]; - } - } - break; - } - case ACTIVATION_RELU6: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vmaxq_f32(zero, in); - out = vminq_f32(six, out); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = (input[i] < 0) ? 0 : input[i]; - if (value > 6) { - value = 6; - } - output[i] = value; - } - break; - } - case ACTIVATION_H_SIGMOID: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vaddq_f32(in, three); - out = vmaxq_f32(out, zero); - out = vminq_f32(out, six); - out = vdivq_f32(out, six); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 6 : value; - value = value / 6; - output[i] = value; - } - break; - } - case ACTIVATION_H_SWISH: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vaddq_f32(in, three); - out = vmaxq_f32(out, zero); - out = vminq_f32(out, six); - out = vdivq_f32(out, six); - out = vmulq_f32(out, in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 
6 : value; - value = input[i] * value; - value = value / 6; - output[i] = value; - } - break; - } - case ACTIVATION_GELU: { - F32 two_div_PI_sqrt = sqrt(2 / 3.14159265358979323846); - float32x4_t vec0 = vdupq_n_f32(two_div_PI_sqrt); - float32x4_t vec1 = vdupq_n_f32(0.044715); - float32x4_t vec2 = vdupq_n_f32(0.5); - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vmulq_f32(in, in); - out = vmulq_f32(out, in); - out = vfmaq_f32(in, vec1, out); - out = vmulq_f32(vec0, out); - out = vtanhq_f32(out); - out = vaddq_f32(one, out); - out = vmulq_f32(vec2, out); - out = vmulq_f32(in, out); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i]; - value = two_div_PI_sqrt * (value + 0.044715 * pow(value, 3)); - value = 1.0 - 2.0 / (exp(2.0 * value) + 1.0); - value = 0.5 * (1.0 + value); - value = input[i] * value; - output[i] = value; - } - break; - } - case ACTIVATION_TANH: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vtanhq_f32(in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = 1.0 - 2.0 / (exp(2.0 * input[i]) + 1.0); - output[i] = value; - } - break; - } - case ACTIVATION_SIGMOID: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vsigmoidq_f32(in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = 1.0 / (1.0 + exp(-1.0 * input[i])); - output[i] = value; - } - break; - } - default: - return NOT_SUPPORTED; - } - - return SUCCESS; -} - -inline void array_add_f32(const F32* inputA, const F32* inputB, F32* output, I32 len) -{ - I32 i = 0; - for(i = 0; i < len - 3; i+=4){ - float32x4_t a = vld1q_f32(inputA + i); - float32x4_t b = vld1q_f32(inputB + i); - float32x4_t c = vaddq_f32(a, b); - vst1q_f32(output+i, c); - } - - for ( ; i < len; i++) { - output[i] = inputA[i] + inputB[i]; - } -} - -#endif -#endif diff --git a/tensor_computing/src/cpu/arm/fp32/attention.cpp b/tensor_computing/src/cpu/arm/fp32/attention.cpp deleted file mode 100644 index 15ee1724..00000000 --- a/tensor_computing/src/cpu/arm/fp32/attention.cpp +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
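Among the activations in the removed arm_functions_fp32.h above, the GELU branch is the least obvious: it uses the tanh approximation with the constants sqrt(2/pi) and 0.044715, and the scalar tail rewrites tanh(x) as 1 - 2/(exp(2x) + 1). A one-function reference sketch of the same approximation:

    #include <cmath>

    // gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))),
    // the approximation the removed NEON path evaluates with vtanhq_f32.
    float gelu_ref(float x) {
        const float k = std::sqrt(2.0f / 3.14159265358979323846f);
        float t = std::tanh(k * (x + 0.044715f * x * x * x));
        return 0.5f * x * (1.0f + t);
    }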
- - -#include -#include "cpu/arm/fp32/tensor_computing_fp32.h" - -EE attention_fp32(U32 batch, U32 numHeads, I32 fromSequenceLength, I32 toSequenceLength, const F32 *input, F32 *output) -{ - if (nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - F32 mask_s = -10000.0; - I32 count = array_sum_f32(input, toSequenceLength); - I32 valid = UNI_MIN(count, fromSequenceLength); - float32x4_t mask_v = vdupq_n_f32(mask_s); - float32x4_t one_v = vdupq_n_f32(1.0); - for(U32 n = 0; n < batch; n++){ - for (U32 i = 0; i < numHeads; i++) { - if (i == 0) { - for (I32 j = 0; j < valid; j++) { - if (j == 0) { - I32 k = 0; - for (; k < toSequenceLength-3; k+=4) { - float32x4_t in_v = vld1q_f32(input + k); - float32x4_t tmp_v = vsubq_f32(one_v, in_v); - tmp_v = vmulq_f32(tmp_v, mask_v); - vst1q_f32(output+k, tmp_v); - } - for (; k < toSequenceLength; k++) { - F32 value = (1 - input[k]) * mask_s; - output[k] = value; - } - } - else { - memcpy(output+j*toSequenceLength, output, toSequenceLength*sizeof(F32)); - } - } - - for (I32 j = valid; j < fromSequenceLength; j++) { - if (j == valid) { - I32 k = 0; - for (; k < toSequenceLength-3; k+=4) { - vst1q_f32(output+j*toSequenceLength+k, mask_v); - } - for (; k < toSequenceLength; k++) { - output[j*toSequenceLength+k] = mask_s; - } - } - else { - memcpy(output+j*toSequenceLength, output+valid*toSequenceLength, toSequenceLength*sizeof(F32)); - } - } - } else { - memcpy(output+i*fromSequenceLength*toSequenceLength, output, fromSequenceLength*toSequenceLength*sizeof(F32)); - } - } - - input += toSequenceLength; - output += numHeads * fromSequenceLength * toSequenceLength; - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp32/attention_mask.cpp b/tensor_computing/src/cpu/arm/fp32/attention_mask.cpp deleted file mode 100644 index 94ee7db0..00000000 --- a/tensor_computing/src/cpu/arm/fp32/attention_mask.cpp +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
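The removed attention_fp32 above builds an additive logit mask from a 0/1 validity row: each entry becomes (1 - m) * -10000, so invalid positions pick up a large negative logit and vanish after softmax, while valid rows are memcpy'd across the from-sequence and across heads instead of being recomputed. A scalar sketch of the row computation:

    // Turn a 0/1 validity row into an additive attention mask row.
    void attention_mask_row_ref(const float* valid01, float* out, int toSeqLen) {
        const float mask = -10000.0f;   // same sentinel as the removed kernel
        for (int k = 0; k < toSeqLen; k++) {
            out[k] = (1.0f - valid01[k]) * mask;
        }
    }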
- - -#include -#include "cpu/arm/fp32/tensor_computing_fp32.h" - -EE attention_mask_fp32(TensorDesc inputDesc, const F32* input, - I32 attentionLength, bool sameLength, float maskValue, - TensorDesc outputDesc, F32* output) -{ - UNUSED(outputDesc); - if (nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - int qlen = inputDesc.dims[1]; - int klen = inputDesc.dims[0]; - int mlen = klen - qlen; - I32 length = qlen * klen; - std::vector mask; - if (attentionLength < 0) { - mask = std::vector(length, 0); - } else { - mask = std::vector(length, 1); - for (int i = 0; i < qlen; i++) { - int start, loops; - if (attentionLength > 0) { - int end = mlen + i; - start = UNI_MAX(end - attentionLength, 0); - loops = end - start + 1; - } else { - if (sameLength) { - start = i; - loops = qlen + 1; - } else { - start = 0; - loops = i + qlen + 1; - } - } - loops = UNI_MAX(loops, 0); - start = UNI_MIN(start, klen); - if (start + loops > klen) - loops = UNI_MAX(klen - start, 0); - memset(&mask[i*klen+start], 0, sizeof(F32)*loops); - } - } - I32 loops = tensorNumElements(inputDesc) / length; - float32x4_t one_v = vdupq_n_f32(1); - float32x4_t mask_value_v = vdupq_n_f32(maskValue); - for (int i = 0, index = 0; i < loops; i++) { - int j = 0; - for (; j < length-3; j+=4) { - float32x4_t in = vld1q_f32(input+index); - float32x4_t mask_v = vld1q_f32(&mask[j]); - float32x4_t tmp_v = vsubq_f32(one_v, mask_v); - tmp_v = vmulq_f32(in, tmp_v); - tmp_v = vfmsq_f32(tmp_v, mask_value_v, mask_v); - vst1q_f32(output+index, tmp_v); - index += 4; - } - for (; j < length; j++) { - output[index] = input[index] * (1 - mask[j]) - maskValue * mask[j]; - index++; - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp32/check.cpp b/tensor_computing/src/cpu/arm/fp32/check.cpp deleted file mode 100644 index 4501f17f..00000000 --- a/tensor_computing/src/cpu/arm/fp32/check.cpp +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include "cpu/arm/fp32/tensor_computing_fp32.h" - -EE check_fp32(TensorDesc inputDescA, const F32* inputA, - TensorDesc inputDescB, const F32* inputB, - CheckMode checkMode, - TensorDesc outputDesc, I32* output) -{ - if (nullptr == inputA || nullptr == inputB || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) - CHECK_STATUS(NOT_MATCH); - - U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims-1]; - I32 length = size / loopOuter; - if (tensorNumElements(outputDesc) != loopOuter) - CHECK_STATUS(NOT_MATCH); - for (U32 j = 0; j < loopOuter; j++) { - const F32 *arrayA = inputA + j * length; - const F32 *arrayB = inputB + j * length; - switch (checkMode) { - case CHECK_GREAT: { - uint32x4_t count_v = vdupq_n_u32(0); - I32 i = 0; - for (; i < length-3; i+=4) { - float32x4_t a = vld1q_f32(arrayA + i); - float32x4_t b = vld1q_f32(arrayA + i); - count_v = vaddq_u32(count_v, vcgtq_f32(a, b)); - } - I32 count = vaddvq_u32(count_v); - for (; i < length; i++) - if (arrayA[i] > arrayB[i]) - count ++; - output[j] = (count == length); - break; - } - case CHECK_GREATEQUAL: { - uint32x4_t count_v = vdupq_n_u32(0); - I32 i = 0; - for (; i < length-3; i+=4) { - float32x4_t a = vld1q_f32(arrayA + i); - float32x4_t b = vld1q_f32(arrayA + i); - count_v = vaddq_u32(count_v, vcgeq_f32(a, b)); - } - I32 count = vaddvq_u32(count_v); - for (; i < length; i++) - if (arrayA[i] >= arrayB[i]) - count ++; - output[j] = (count == length); - break; - } - case CHECK_EQUAL: { - uint32x4_t count_v = vdupq_n_u32(0); - I32 i = 0; - for (; i < length-3; i+=4) { - float32x4_t a = vld1q_f32(arrayA + i); - float32x4_t b = vld1q_f32(arrayA + i); - count_v = vaddq_u32(count_v, vceqq_f32(a, b)); - } - I32 count = vaddvq_u32(count_v); - for (; i < length; i++) - if (arrayA[i] == arrayB[i]) - count ++; - output[j] = (count == length); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp32/clip.cpp b/tensor_computing/src/cpu/arm/fp32/clip.cpp deleted file mode 100644 index 64b3b482..00000000 --- a/tensor_computing/src/cpu/arm/fp32/clip.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include "cpu/arm/fp32/tensor_computing_fp32.h" - -EE clip_fp32(F32 *input, F32 *output, I32 len, F32 minValue, F32 maxValue) -{ - if (nullptr == input - || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - float32x4_t min_v = vdupq_n_f32(minValue); - float32x4_t max_v = vdupq_n_f32(maxValue); - - I32 i = 0; - for (i = 0; i < len - 3; i += 4) { - float32x4_t in = vld1q_f32(input + i); - float32x4_t tmp_v = vminq_f32(max_v, vmaxq_f32(min_v, in)); - vst1q_f32(output+i, tmp_v); - } - for (; i < len; i++) { - F32 value = input[i]; - value = (value > minValue) ? value : minValue; - value = (value < maxValue) ? value : maxValue; - output[i] = value; - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp32/convolution.cpp b/tensor_computing/src/cpu/arm/fp32/convolution.cpp deleted file mode 100644 index c288f101..00000000 --- a/tensor_computing/src/cpu/arm/fp32/convolution.cpp +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" - -#include "cpu/arm/fp32/tensor_computing_fp32.h" - -EE convolution_infer_forward_tmp_bytes_fp32(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes) -{ - if (nullptr == bytes) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - EE ret = SUCCESS; - switch (algorithm) { - case CONVOLUTION_ALGORITHM_DIRECT: - *bytes = ic*ih_pad*iw_pad; - break; - case CONVOLUTION_ALGORITHM_GEMM: -#ifdef __aarch64__ - *bytes = ic*ih_pad*iw_pad + 12*fh*fw*ic; -#else - *bytes = ic*ih_pad*iw_pad + 6*fh*fw*ic; -#endif - break; - case CONVOLUTION_ALGORITHM_WINOGRAD: { - U32 tile_h = (oh + 3) / 4; - U32 tile_w = (ow + 3) / 4; - U32 pad_left = paddingL; - U32 pad_right = paddingR + (tile_w*4 - ow); - U32 pad_top = paddingT; - U32 pad_bottom = paddingB + (tile_h*4 - oh); - ih_pad = ih + pad_top + pad_bottom; - iw_pad = iw + pad_left + pad_right; - *bytes = ic*ih_pad*iw_pad + (ic+8)*6*6*12; - break; - } - case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: -#ifdef __aarch64__ - *bytes = ic*ih_pad*iw_pad + 12*fh*fw*ic; -#else - *bytes = ic*ih_pad*iw_pad + 6*fh*fw*ic; -#endif - break; - default: - ret = NOT_MATCH; - break; - } - *bytes *= bytesOf(idt); - *bytes += 32; - return ret; -} - -EE convolution_fp32(TensorDesc inputDesc, F32* input, - TensorDesc filterDesc, const F32* filter, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, - TensorDesc biasDesc, const F32* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F32* output, - ActivationDesc activationDesc, - Arch arch) -{ - UNUSED(arch); - if (nullptr == input || nullptr == filter || nullptr == output || nullptr == bias || nullptr == tmp) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - - if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { - CHECK_STATUS(NOT_MATCH); - } - if (!(odf == DF_NCHWC8)) { - CHECK_STATUS(NOT_MATCH); - } - if (!(ic == fc && oc == fn)) { - CHECK_STATUS(NOT_MATCH); - } - - // In some cases when we adjust the model input, the input tensor of conv can change from NCHW to NCHWc8 - // In this case we can simply change the algo, because they both require the same filter transform - if (CONVOLUTION_ALGORITHM_GEMM_ICNCHW == algorithm && DF_NCHWC8 == idf) { - algorithm = CONVOLUTION_ALGORITHM_GEMM; - } - - EE ret = SUCCESS; - switch (algorithm) { - case CONVOLUTION_ALGORITHM_GEMM: -#ifdef __aarch64__ - ret = convolution_gemm_V8(inputDesc, input, filterDesc, filter, convDesc, - biasDesc, bias, tmpBytes, tmp, outputDesc, output, 
activationDesc); -#else - ret = convolution_gemm_V7(inputDesc, input, filterDesc, filter, convDesc, - biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); -#endif - break; - case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: -#ifdef __aarch64__ - ret = convolution_gemm_icnchw_V8(inputDesc, input, filterDesc, filter, convDesc, - biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); -#else - ret = convolution_gemm_icnchw_V7(inputDesc, input, filterDesc, filter, convDesc, - biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); -#endif - break; - case CONVOLUTION_ALGORITHM_WINOGRAD: - ret = convolution_winograd_V8(inputDesc, input, filterDesc, filter, convDesc, - biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp32/convolution_gemm_V7.cpp b/tensor_computing/src/cpu/arm/fp32/convolution_gemm_V7.cpp deleted file mode 100644 index 776bc76e..00000000 --- a/tensor_computing/src/cpu/arm/fp32/convolution_gemm_V7.cpp +++ /dev/null @@ -1,631 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
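Both GEMM kernels follow the same scheme: zero-pad the input once per batch, then for each tile of output pixels run im2col packing followed by a register-blocked multiply-accumulate. The ARMv7 kernel below tiles six output pixels at a time because 32-bit NEON exposes only sixteen q-registers, twelve of which hold the 6x8 accumulator block; AArch64, with thirty-two vector registers, widens the tile to twelve. The index mapping the packing loops use to find each output pixel's input origin is simply (a sketch with illustrative names):

// Maps a flattened output index hw to the top-left corner of its input
// window, as done inline throughout the packing loops.
static inline void hw_to_input_origin(unsigned hw, unsigned ow,
    unsigned strideH, unsigned strideW, unsigned *in_h, unsigned *in_w)
{
    *in_h = (hw / ow) * strideH;   // output row times vertical stride
    *in_w = (hw % ow) * strideW;   // output column times horizontal stride
}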
- -#ifndef __aarch64__ -#include - -#include "type.h" -#include "error.h" -#include "tensor_desc.h" - -#include "cpu/arm/fp32/tensor_computing_fp32.h" - -EE convolution_gemm_V7(TensorDesc inputDesc, F32* inArray, - TensorDesc filterDesc, const F32* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F32* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F32* outArray, - ActivationDesc activationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (fdf != DF_NHWCN8) { - CHECK_STATUS(NOT_MATCH); - } - - I64 activation = 0; - switch (activationDesc.mode) { - case ACTIVATION_NULL: - activation = 0; - break; - case ACTIVATION_RELU: - activation = 1; - break; - default: - CHECK_STATUS(NOT_SUPPORTED); - } - - oc /= 8; - ic /= 8; - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - I32 ohow = oh*ow; - U32 ihiw = ih_pad*iw_pad; - F32 *inArray_pad; - EE ret = SUCCESS; - for (U32 n = 0; n < in; n++) { - if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { - inArray_pad = inArray + n*ic*ih*iw*8; - } else { - // copy input into a input with padding - inArray_pad = (F32*)tmp; - F32 *inArray_pad_mov = inArray_pad; - F32 *inArray_mov = inArray + n*ic*ih*iw*8; - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(idt)); - inArray_pad_mov += iw_pad*8; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*8*bytesOf(idt)); - inArray_pad_mov += paddingL*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*bytesOf(idt)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, paddingR*8*bytesOf(idt)); - inArray_pad_mov += paddingR*8; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(idt)); - inArray_pad_mov += iw_pad*8; - } - } - } - // ohow / 6 - for (I32 hw = 0; hw < ohow - 5; hw += 6) { - const F32 *b0 = biasArray; - const F32 *b1 = biasArray + 4; - F32 *in_pack = ((F32*)tmp) + ic * ihiw * 8; - // pack input - // NCHWc8 => NHWChw6 + im2col - U32 in_h[6] = {0}; - U32 in_w[6] = {0}; - for (U32 i = 0; i < 6; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; - } - - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F32 *in_hw6c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F32 *in_0 = in_hw6c8 + in_h[0]*iw_pad*8 + in_w[0]*8; - F32 *in_1 = in_hw6c8 + in_h[1]*iw_pad*8 + in_w[1]*8; - F32 *in_2 = in_hw6c8 + in_h[2]*iw_pad*8 + in_w[2]*8; - F32 *in_3 = in_hw6c8 + in_h[3]*iw_pad*8 + in_w[3]*8; - F32 *in_4 = in_hw6c8 + in_h[4]*iw_pad*8 + in_w[4]*8; - F32 *in_5 = in_hw6c8 + in_h[5]*iw_pad*8 + in_w[5]*8; - - // NHWChw6 - F32 *in_pack_c8hw6 = in_pack + fh_idx*fw*ic*6*8 + fw_idx*ic*6*8 + c*6*8; - - 
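// The vzip.32 sequence below transposes six pixels of one c8 channel group
// so that each channel's six values land contiguously in the packed tile,
// matching the NHWChw6 layout the compute loop consumes.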
__asm__ __volatile__( - "vld1.f32 {d0-d3}, [%[in_0]]\n" - "vld1.f32 {d4-d7}, [%[in_1]]\n" - "vld1.f32 {d8-d11}, [%[in_2]]\n" - "vld1.f32 {d12-d15}, [%[in_3]]\n" - "vld1.f32 {d16-d19}, [%[in_4]]\n" - "vld1.f32 {d20-d23}, [%[in_5]]\n" - - "vzip.32 q0, q2\n" - "vzip.32 q4, q6\n" - "vzip.32 q8, q10\n" - - "vst1.f32 {d0}, [%[pack]]!\n" - "vst1.f32 {d8}, [%[pack]]!\n" - "vst1.f32 {d16}, [%[pack]]!\n" - "vst1.f32 {d1}, [%[pack]]!\n" - "vst1.f32 {d9}, [%[pack]]!\n" - "vst1.f32 {d17}, [%[pack]]!\n" - "vst1.f32 {d4}, [%[pack]]!\n" - "vst1.f32 {d12}, [%[pack]]!\n" - "vst1.f32 {d20}, [%[pack]]!\n" - "vst1.f32 {d5}, [%[pack]]!\n" - "vst1.f32 {d13}, [%[pack]]!\n" - "vst1.f32 {d21}, [%[pack]]!\n" - - "vzip.32 q1, q3\n" - "vzip.32 q5, q7\n" - "vzip.32 q9, q11\n" - - "vst1.f32 {d2}, [%[pack]]!\n" - "vst1.f32 {d10}, [%[pack]]!\n" - "vst1.f32 {d18}, [%[pack]]!\n" - "vst1.f32 {d3}, [%[pack]]!\n" - "vst1.f32 {d11}, [%[pack]]!\n" - "vst1.f32 {d19}, [%[pack]]!\n" - "vst1.f32 {d6}, [%[pack]]!\n" - "vst1.f32 {d14}, [%[pack]]!\n" - "vst1.f32 {d22}, [%[pack]]!\n" - "vst1.f32 {d7}, [%[pack]]!\n" - "vst1.f32 {d15}, [%[pack]]!\n" - "vst1.f32 {d23}, [%[pack]]!\n" - :[pack]"+r"(in_pack_c8hw6), - [in_0]"+r"(in_0), - [in_1]"+r"(in_1), - [in_2]"+r"(in_2), - [in_3]"+r"(in_3), - [in_4]"+r"(in_4), - [in_5]"+r"(in_5) - : - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11" - ); - } - } - } - - // compute - for (I32 o = 0; o < I32(oc); o++) { - F32 *in_hw0 = in_pack; - const F32 *f_o0c0 = filterArray + o*8*fh*fw*ic*8; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const F32 *b_o0 = b0; - const F32 *b_o1 = b1; - __asm__ __volatile__( - "vld1.f32 {d8-d9}, [%[b_0]]\n" - "vld1.f32 {d10-d11}, [%[b_1]]\n" - "vld1.f32 {d0-d3}, [%[in_0]]!\n" - "vld1.f32 {d4-d7}, [%[f_0]]!\n" - - "vmov.f32 q6, q4\n" - "vmov.f32 q8, q4\n" - "vmov.f32 q10, q4\n" - "vmov.f32 q12, q4\n" - "vmov.f32 q14, q4\n" - - "mov r2, %[ic]\n" - - "vmov.f32 q7, q5\n" - "vmov.f32 q9, q5\n" - "vmov.f32 q11, q5\n" - "vmov.f32 q13, q5\n" - "vmov.f32 q15, q5\n" - - "0:\n" - "vmla.f32 q4, q2, d0[0]\n" - "vmla.f32 q6, q2, d0[1]\n" - "vmla.f32 q8, q2, d1[0]\n" - "vmla.f32 q10, q2, d1[1]\n" - "vmla.f32 q12, q2, d2[0]\n" - "vmla.f32 q14, q2, d2[1]\n" - - "vld1.f32 {d4-d5}, [%[f_0]]!\n" - - "vmla.f32 q5, q3, d0[0]\n" - "vmla.f32 q7, q3, d0[1]\n" - "vmla.f32 q9, q3, d1[0]\n" - "vmla.f32 q11, q3, d1[1]\n" - "vld1.f32 {d0-d1}, [%[in_0]]!\n" - "vmla.f32 q13, q3, d2[0]\n" - "vmla.f32 q15, q3, d2[1]\n" - - "vld1.f32 {d6-d7}, [%[f_0]]!\n" - "subs r2, r2, #4\n" - - "vmla.f32 q4, q2, d3[0]\n" - "vmla.f32 q6, q2, d3[1]\n" - "vmla.f32 q8, q2, d0[0]\n" - "vmla.f32 q10, q2, d0[1]\n" - "vmla.f32 q12, q2, d1[0]\n" - "vmla.f32 q14, q2, d1[1]\n" - - "vld1.f32 {d4-d5}, [%[f_0]]!\n" - - "vmla.f32 q5, q3, d3[0]\n" - "vmla.f32 q7, q3, d3[1]\n" - "vld1.f32 {d2-d3}, [%[in_0]]!\n" - "vmla.f32 q9, q3, d0[0]\n" - "vmla.f32 q11, q3, d0[1]\n" - "vmla.f32 q13, q3, d1[0]\n" - "vmla.f32 q15, q3, d1[1]\n" - - "vld1.f32 {d6-d7}, [%[f_0]]!\n" - "vld1.f32 {d0-d1}, [%[in_0]]!\n" - - "vmla.f32 q4, q2, d2[0]\n" - "vmla.f32 q6, q2, d2[1]\n" - "vmla.f32 q8, q2, d3[0]\n" - "vmla.f32 q10, q2, d3[1]\n" - "vmla.f32 q12, q2, d0[0]\n" - "vmla.f32 q14, q2, d0[1]\n" - - "vld1.f32 {d4-d5}, [%[f_0]]!\n" - - "vmla.f32 q5, q3, d2[0]\n" - "vmla.f32 q7, q3, d2[1]\n" - "vmla.f32 q9, q3, d3[0]\n" - "vmla.f32 q11, q3, d3[1]\n" - "vld1.f32 {d2-d3}, [%[in_0]]!\n" - "vmla.f32 q13, q3, d0[0]\n" - "vmla.f32 q15, q3, d0[1]\n" - - "vld1.f32 {d6-d7}, [%[f_0]]!\n" - - "vmla.f32 q4, 
q2, d1[0]\n" - "vmla.f32 q6, q2, d1[1]\n" - "vmla.f32 q8, q2, d2[0]\n" - "vmla.f32 q10, q2, d2[1]\n" - "vmla.f32 q12, q2, d3[0]\n" - "vmla.f32 q14, q2, d3[1]\n" - - "vld1.f32 {d4-d5}, [%[f_0]]!\n" - - "vmla.f32 q5, q3, d1[0]\n" - "vmla.f32 q7, q3, d1[1]\n" - "vld1.f32 {d0-d1}, [%[in_0]]!\n" - "vmla.f32 q9, q3, d2[0]\n" - "vmla.f32 q11, q3, d2[1]\n" - "vmla.f32 q13, q3, d3[0]\n" - "vmla.f32 q15, q3, d3[1]\n" - - "vld1.f32 {d2-d3}, [%[in_0]]!\n" - "vld1.f32 {d6-d7}, [%[f_0]]!\n" - "bne 0b\n" - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [activation]"r"(activation) - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15", "r2" - ); - - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU: { - __asm__ __volatile__( - "veor q1, q1, q1\n" //zero - "vmax.f32 q4, q4, q1\n" - "vmax.f32 q5, q5, q1\n" - "vmax.f32 q6, q6, q1\n" - "vmax.f32 q7, q7, q1\n" - "vmax.f32 q8, q8, q1\n" - "vmax.f32 q9, q9, q1\n" - "vmax.f32 q10, q10, q1\n" - "vmax.f32 q11, q11, q1\n" - "vmax.f32 q12, q12, q1\n" - "vmax.f32 q13, q13, q1\n" - "vmax.f32 q14, q14, q1\n" - "vmax.f32 q15, q15, q1\n" - : - : - :"memory", "cc", "q1", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "vst1.f32 {q4}, [%[out_0]]!\n" - "vst1.f32 {q5}, [%[out_0]]!\n" - "vst1.f32 {q6}, [%[out_0]]!\n" - "vst1.f32 {q7}, [%[out_0]]!\n" - "vst1.f32 {q8}, [%[out_0]]!\n" - "vst1.f32 {q9}, [%[out_0]]!\n" - "vst1.f32 {q10}, [%[out_0]]!\n" - "vst1.f32 {q11}, [%[out_0]]!\n" - "vst1.f32 {q12}, [%[out_0]]!\n" - "vst1.f32 {q13}, [%[out_0]]!\n" - "vst1.f32 {q14}, [%[out_0]]!\n" - "vst1.f32 {q15}, [%[out_0]]!\n" - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15" - ); - b0 += 8; - b1 += 8; - } - } - - U32 ohow_s = (ohow / 6) * 6; - U32 ohow_tail = ohow - ohow_s; - - if (ohow_tail >= 4) { - I32 hw = ohow_s; - const F32 *b0 = biasArray; - const F32 *b1 = biasArray + 4; - F32 *in_pack = ((F32*)tmp) + ic*ih_pad*iw_pad*8; - // pack input - // NCHWc8 => NHWChw4 + im2col - U32 in_h[4] = {0}; - U32 in_w[4] = {0}; - - for (U32 i = 0; i < 4; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; - } - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F32 *in_hw4c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F32 *in_0 = in_hw4c8 + in_h[0]*iw_pad*8 + in_w[0]*8; - F32 *in_1 = in_hw4c8 + in_h[1]*iw_pad*8 + in_w[1]*8; - F32 *in_2 = in_hw4c8 + in_h[2]*iw_pad*8 + in_w[2]*8; - F32 *in_3 = in_hw4c8 + in_h[3]*iw_pad*8 + in_w[3]*8; - F32 *in_pack_c8hw4 = in_pack + fh_idx*fw*ic*8*4 + fw_idx*ic*8*4 + c*8*4; - - __asm__ __volatile__( - "vld1.f32 {d0-d3}, [%[in_0]]\n" - "vld1.f32 {d4-d7}, [%[in_1]]\n" - "vld1.f32 {d8-d11}, [%[in_2]]\n" - "vld1.f32 {d12-d15}, [%[in_3]]\n" - - "vzip.32 q0, q4\n" - "vzip.32 q2, q6\n" - - "vzip.32 q0, q2\n" - "vzip.32 q4, q6\n" - - "vst1.f32 {q0}, [%[pack]]!\n" - "vst1.f32 {q2}, [%[pack]]!\n" - "vst1.f32 {q4}, [%[pack]]!\n" - "vst1.f32 {q6}, [%[pack]]!\n" - - "vzip.32 q1, q5\n" - "vzip.32 q3, q7\n" - - "vzip.32 q1, q3\n" - "vzip.32 q5, q7\n" - - "vst1.f32 {q1}, [%[pack]]!\n" - "vst1.f32 {q3}, [%[pack]]!\n" - "vst1.f32 {q5}, [%[pack]]!\n" - "vst1.f32 {q7}, 
[%[pack]]!\n" - :[pack]"+r"(in_pack_c8hw4), - [in_0]"+r"(in_0), - [in_1]"+r"(in_1), - [in_2]"+r"(in_2), - [in_3]"+r"(in_3) - : - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" - ); - } - } - } - - // compute - for (I32 o = 0; o < I32(oc); o++) { - F32 *in_hw0 = in_pack; - const F32 *f_o0c0 = filterArray + o*8*fh*fw*ic*8; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const F32 *b_o0 = b0; - const F32 *b_o1 = b1; - __asm__ __volatile__( - "vld1.f32 {d8-d9}, [%[b_0]]\n" - "vld1.f32 {d10-d11}, [%[b_1]]\n" - "vld1.f32 {d0-d1}, [%[in_0]]!\n" - "vld1.f32 {d4-d7}, [%[f_0]]!\n" - - "vmov.f32 q6, q4\n" - "vmov.f32 q8, q4\n" - "vmov.f32 q10, q4\n" - - "mov r2, %[ic]\n" - - "vmov.f32 q7, q5\n" - "vmov.f32 q9, q5\n" - "vmov.f32 q11, q5\n" - - "0:\n" - "vmla.f32 q4, q2, d0[0]\n" - "vmla.f32 q6, q2, d0[1]\n" - "vmla.f32 q8, q2, d1[0]\n" - "vmla.f32 q10, q2, d1[1]\n" - - "vld1.f32 {d2-d3}, [%[in_0]]!\n" - "vld1.f32 {d4-d5}, [%[f_0]]!\n" - - "vmla.f32 q5, q3, d0[0]\n" - "vmla.f32 q7, q3, d0[1]\n" - "vmla.f32 q9, q3, d1[0]\n" - "vmla.f32 q11, q3, d1[1]\n" - - "vld1.f32 {d6-d7}, [%[f_0]]!\n" - "subs r2, r2, #2\n" - - "vmla.f32 q4, q2, d2[0]\n" - "vmla.f32 q6, q2, d2[1]\n" - "vmla.f32 q8, q2, d3[0]\n" - "vmla.f32 q10, q2, d3[1]\n" - - "vld1.f32 {d0-d1}, [%[in_0]]!\n" - "vld1.f32 {d4-d5}, [%[f_0]]!\n" - - "vmla.f32 q5, q3, d2[0]\n" - "vmla.f32 q7, q3, d2[1]\n" - "vmla.f32 q9, q3, d3[0]\n" - "vmla.f32 q11, q3, d3[1]\n" - - "vld1.f32 {d6-d7}, [%[f_0]]!\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "r2" - ); - - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU: { - __asm__ __volatile__( - "veor q1, q1, q1\n" //zero - "vmax.f32 q4, q4, q1\n" - "vmax.f32 q5, q5, q1\n" - "vmax.f32 q6, q6, q1\n" - "vmax.f32 q7, q7, q1\n" - "vmax.f32 q8, q8, q1\n" - "vmax.f32 q9, q9, q1\n" - "vmax.f32 q10, q10, q1\n" - "vmax.f32 q11, q11, q1\n" - : - : - :"memory", "cc", "q1", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "vst1.f32 {q4}, [%[out_0]]!\n" - "vst1.f32 {q5}, [%[out_0]]!\n" - "vst1.f32 {q6}, [%[out_0]]!\n" - "vst1.f32 {q7}, [%[out_0]]!\n" - "vst1.f32 {q8}, [%[out_0]]!\n" - "vst1.f32 {q9}, [%[out_0]]!\n" - "vst1.f32 {q10}, [%[out_0]]!\n" - "vst1.f32 {q11}, [%[out_0]]!\n" - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11" - ); - b0 += 8; - b1 += 8; - } - ohow_s += 4; - ohow_tail -= 4; - } - - //I32 ohow_s = (ohow / 4) * 4; - - for (I32 hw = ohow_s; hw < ohow; hw++) { - const F32 *b0 = biasArray; - const F32 *b1 = biasArray + 4; - F32 *in_pack = ((F32*)tmp) + ic*ih_pad*iw_pad*8; - // pack input - // NCHW => NCHWc8hw1 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F32 *in_hw1c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F32 *in_0 = in_hw1c8 + in_h_0*iw_pad*8 + in_w_0*8; - F32 *in_pack_c8hw1 = in_pack + fh_idx*fw*ic*8 + fw_idx*ic*8 + c*8; - - memcpy(in_pack_c8hw1, in_0, 8*bytesOf(idt)); - } - } - } - - // compute - for (I32 o = 0; o < I32(oc); o++) { - F32 *in_hw0 = in_pack; - const F32 *f_o0c0 = filterArray + o*8*fh*fw*ic*8; - F32 *out_o0hw0 = 
outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const F32 *b_o0 = b0; - const F32 *b_o1 = b1; - __asm__ __volatile__( - "vld1.f32 {d8-d9}, [%[b_0]]\n" - "vld1.f32 {d10-d11}, [%[b_1]]\n" - "vld1.f32 {d0}, [%[in_0]]!\n" - "vld1.f32 {d4-d7}, [%[f_0]]!\n" - "mov r2, %[ic]\n" - "0:\n" - "vmla.f32 q4, q2, d0[0]\n" - - "vld1.f32 {d4-d5}, [%[f_0]]!\n" - - "vmla.f32 q5, q3, d0[0]\n" - - "vld1.f32 {d6-d7}, [%[f_0]]!\n" - "subs r2, r2, #2\n" - - "vmla.f32 q4, q2, d0[1]\n" - - "vld1.f32 {d4-d5}, [%[f_0]]!\n" - - "vmla.f32 q5, q3, d0[1]\n" - - "vld1.f32 {d0}, [%[in_0]]!\n" - "vld1.f32 {d6-d7}, [%[f_0]]!\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "q0", "q2", "q3", "q4", "q5", "r2" - ); - - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU: { - __asm__ __volatile__( - "veor q1, q1, q1\n" //zero - "vmax.f32 q4, q4, q1\n" - "vmax.f32 q5, q5, q1\n" - : - : - :"memory", "cc", "q1", "q4", "q5" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "vst1.f32 {q4}, [%[out_0]]!\n" - "vst1.f32 {q5}, [%[out_0]]!\n" - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "q4", "q5" - ); - b0 += 8; - b1 += 8; - } - } - } - return ret; -} -#endif diff --git a/tensor_computing/src/cpu/arm/fp32/convolution_gemm_V8.cpp b/tensor_computing/src/cpu/arm/fp32/convolution_gemm_V8.cpp deleted file mode 100644 index 75d4a127..00000000 --- a/tensor_computing/src/cpu/arm/fp32/convolution_gemm_V8.cpp +++ /dev/null @@ -1,1047 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
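On AArch64 the kernel below keeps a 12x8 FP32 accumulator tile entirely in registers (v5 through v28), so one pass covers twelve output pixels of an eight-channel group. Its zip1/zip2 packing is equivalent to this scalar transpose, sketched under the same layout assumptions (the helper name is not from the sources):

// Twelve pixels of one c8 group are transposed so consecutive memory holds
// one channel across all twelve pixels: pack[c*12 + p] = pixel p, channel c.
static void pack_c8hw12(const float *in[12], float *pack)
{
    for (int c = 0; c < 8; c++) {          // channel within the c8 group
        for (int p = 0; p < 12; p++) {     // pixel within the hw12 tile
            pack[c * 12 + p] = in[p][c];
        }
    }
}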
- -#ifdef __aarch64__ -#include - -#include "type.h" -#include "error.h" -#include "tensor_desc.h" - -#include "cpu/arm/fp32/tensor_computing_fp32.h" - -EE convolution_gemm_V8(TensorDesc inputDesc, F32* inArray, - TensorDesc filterDesc, const F32* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F32* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F32* outArray, - ActivationDesc activationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (fdf != DF_NHWCN8) { - CHECK_STATUS(NOT_MATCH); - } - - oc /= 8; - ic /= 8; - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - I32 ohow = oh*ow; - U32 ihiw = ih_pad*iw_pad; - F32 *inArray_pad; - EE ret = SUCCESS; - for (U32 n = 0; n < in; n++) { - if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { - inArray_pad = inArray + n*ic*ih*iw*8; - } else { - // copy input into a input with padding - inArray_pad = (F32*)tmp; - F32 *inArray_pad_mov = inArray_pad; - F32 *inArray_mov = inArray + n*ic*ih*iw*8; - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(idt)); - inArray_pad_mov += iw_pad*8; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*8*bytesOf(idt)); - inArray_pad_mov += paddingL*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*bytesOf(idt)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, paddingR*8*bytesOf(idt)); - inArray_pad_mov += paddingR*8; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(idt)); - inArray_pad_mov += iw_pad*8; - } - } - } - // ohow / 12 - for (I32 hw = 0; hw < ohow - 11; hw += 12) { - const F32 *b0 = biasArray; - const F32 *b1 = biasArray + 4; - F32 *in_pack = ((F32*)tmp) + ic * ihiw * 8; - // pack input - // NCHWc8 => NHWChw12 + im2col - U32 in_h[12] = {0}; - U32 in_w[12] = {0}; - for (U32 i = 0; i < 12; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; - } - - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F32 *in_hw12c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F32 *in_0 = in_hw12c8 + in_h[0]*iw_pad*8 + in_w[0]*8; - F32 *in_1 = in_hw12c8 + in_h[1]*iw_pad*8 + in_w[1]*8; - F32 *in_2 = in_hw12c8 + in_h[2]*iw_pad*8 + in_w[2]*8; - F32 *in_3 = in_hw12c8 + in_h[3]*iw_pad*8 + in_w[3]*8; - F32 *in_4 = in_hw12c8 + in_h[4]*iw_pad*8 + in_w[4]*8; - F32 *in_5 = in_hw12c8 + in_h[5]*iw_pad*8 + in_w[5]*8; - F32 *in_6 = in_hw12c8 + in_h[6]*iw_pad*8 + in_w[6]*8; - F32 *in_7 = in_hw12c8 + in_h[7]*iw_pad*8 + in_w[7]*8; - F32 *in_8 = in_hw12c8 + in_h[8]*iw_pad*8 + in_w[8]*8; - F32 *in_9 = in_hw12c8 + in_h[9]*iw_pad*8 + in_w[9]*8; - F32 *in_10 = in_hw12c8 + in_h[10]*iw_pad*8 + in_w[10]*8; - F32 
*in_11 = in_hw12c8 + in_h[11]*iw_pad*8 + in_w[11]*8; - - // NHWChw12 - F32 *in_pack_c8hw12 = in_pack + fh_idx*fw*ic*12*8 + fw_idx*ic*12*8 + c*12*8; - - __asm__ __volatile__( - "ldp q0, q1, [%[in_0]]\n" - "ldp q2, q3, [%[in_1]]\n" - "ldp q4, q5, [%[in_2]]\n" - "ldp q6, q7, [%[in_3]]\n" - - "ldp q8, q9, [%[in_4]]\n" - "ldp q10, q11, [%[in_5]]\n" - "ldp q12, q13, [%[in_6]]\n" - "ldp q14, q15, [%[in_7]]\n" - - "ldp q16, q17, [%[in_8]]\n" - "ldp q18, q19, [%[in_9]]\n" - "ldp q20, q21, [%[in_10]]\n" - "ldp q22, q23, [%[in_11]]\n" - - "zip1 v24.4s, v0.4s, v2.4s\n" - "zip2 v25.4s, v0.4s, v2.4s\n" - "zip1 v26.4s, v4.4s, v6.4s\n" - "zip2 v27.4s, v4.4s, v6.4s\n" - - "zip1 v0.2d, v24.2d, v26.2d\n" - "zip2 v2.2d, v24.2d, v26.2d\n" - "zip1 v4.2d, v25.2d, v27.2d\n" - "zip2 v6.2d, v25.2d, v27.2d\n" - - "zip1 v24.4s, v8.4s, v10.4s\n" - "zip2 v25.4s, v8.4s, v10.4s\n" - "zip1 v26.4s, v12.4s, v14.4s\n" - "zip2 v27.4s, v12.4s, v14.4s\n" - - "zip1 v8.2d, v24.2d, v26.2d\n" - "zip2 v10.2d, v24.2d, v26.2d\n" - "zip1 v12.2d, v25.2d, v27.2d\n" - "zip2 v14.2d, v25.2d, v27.2d\n" - - "zip1 v24.4s, v16.4s, v18.4s\n" - "zip2 v25.4s, v16.4s, v18.4s\n" - "zip1 v26.4s, v20.4s, v22.4s\n" - "zip2 v27.4s, v20.4s, v22.4s\n" - - "zip1 v16.2d, v24.2d, v26.2d\n" - "zip2 v18.2d, v24.2d, v26.2d\n" - "zip1 v20.2d, v25.2d, v27.2d\n" - "zip2 v22.2d, v25.2d, v27.2d\n" - - "stp q0, q8, [%[pack]]\n" - "str q16, [%[pack], #32]\n" - "stp q2, q10, [%[pack], 48]\n" - "str q18, [%[pack], #80]\n" - "stp q4, q12, [%[pack], #96]\n" - "str q20, [%[pack], #128]\n" - "stp q6, q14, [%[pack], #144]\n" - "str q22, [%[pack], #176]\n" - - "zip1 v24.4s, v1.4s, v3.4s\n" - "zip2 v25.4s, v1.4s, v3.4s\n" - "zip1 v26.4s, v5.4s, v7.4s\n" - "zip2 v27.4s, v5.4s, v7.4s\n" - - "zip1 v1.2d, v24.2d, v26.2d\n" - "zip2 v3.2d, v24.2d, v26.2d\n" - "zip1 v5.2d, v25.2d, v27.2d\n" - "zip2 v7.2d, v25.2d, v27.2d\n" - - "zip1 v24.4s, v9.4s, v11.4s\n" - "zip2 v25.4s, v9.4s, v11.4s\n" - "zip1 v26.4s, v13.4s, v15.4s\n" - "zip2 v27.4s, v13.4s, v15.4s\n" - - "zip1 v9.2d, v24.2d, v26.2d\n" - "zip2 v11.2d, v24.2d, v26.2d\n" - "zip1 v13.2d, v25.2d, v27.2d\n" - "zip2 v15.2d, v25.2d, v27.2d\n" - - "zip1 v24.4s, v17.4s, v19.4s\n" - "zip2 v25.4s, v17.4s, v19.4s\n" - "zip1 v26.4s, v21.4s, v23.4s\n" - "zip2 v27.4s, v21.4s, v23.4s\n" - - "zip1 v17.2d, v24.2d, v26.2d\n" - "zip2 v19.2d, v24.2d, v26.2d\n" - "zip1 v21.2d, v25.2d, v27.2d\n" - "zip2 v23.2d, v25.2d, v27.2d\n" - - "stp q1, q9, [%[pack], #192]\n" - "str q17, [%[pack], #224]\n" - "stp q3, q11, [%[pack], 240]\n" - "str q19, [%[pack], #272]\n" - "stp q5, q13, [%[pack], 288]\n" - "str q21, [%[pack], #320]\n" - "stp q7, q15, [%[pack], 336]\n" - "str q23, [%[pack], #368]\n" - : - :[pack]"r"(in_pack_c8hw12), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3), - [in_4]"r"(in_4), - [in_5]"r"(in_5), - [in_6]"r"(in_6), - [in_7]"r"(in_7), - [in_8]"r"(in_8), - [in_9]"r"(in_9), - [in_10]"r"(in_10), - [in_11]"r"(in_11) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", - "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27" - ); - } - } - } - - // compute - for (I32 o = 0; o < I32(oc); o++) { - F32 *in_hw0 = in_pack; - const F32 *f_o0c0 = filterArray + o*8*fh*fw*ic*8; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const F32 *b_o0 = b0; - const F32 *b_o1 = b1; - __asm__ __volatile__( - "ldr q27, [%[b_0]]\n" - "ldr q28, [%[b_1]]\n" - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to 
x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" - - "mov v5.16b, v27.16b\n" - "ldr q1, [%[in_0]]\n" //in_hw0 - "mov v7.16b, v27.16b\n" - "mov v9.16b, v27.16b\n" - "mov v11.16b, v27.16b\n" - "ldr q0, [%[f_0]]\n" //f_o0c0 - "mov v13.16b, v27.16b\n" - "mov v15.16b, v27.16b\n" - "mov v17.16b, v27.16b\n" - "ldr q3, [%[in_0], #16]\n" - "mov v19.16b, v27.16b\n" - "mov v21.16b, v27.16b\n" - "mov v23.16b, v27.16b\n" - "mov v25.16b, v27.16b\n" - - "mov v6.16b, v28.16b\n" - "mov v8.16b, v28.16b\n" - "mov v10.16b, v28.16b\n" - "mov v12.16b, v28.16b\n" - "mov v14.16b, v28.16b\n" - "mov v16.16b, v28.16b\n" - "mov v18.16b, v28.16b\n" - "mov v20.16b, v28.16b\n" - "mov v22.16b, v28.16b\n" - "mov v24.16b, v28.16b\n" - "mov v26.16b, v28.16b\n" - "0:\n" - "fmla v5.4s, v0.4s, v1.s[0]\n" - "fmla v7.4s, v0.4s, v1.s[1]\n" - "ldr q2, [x3, 32]\n" - "ldr q29, [x0, 16]\n" - "fmla v9.4s, v0.4s, v1.s[2]\n" - "fmla v11.4s, v0.4s, v1.s[3]\n" - - "fmla v13.4s, v0.4s, v3.s[0]\n" - "fmla v15.4s, v0.4s, v3.s[1]\n" - "fmla v17.4s, v0.4s, v3.s[2]\n" - "fmla v19.4s, v0.4s, v3.s[3]\n" - - "fmla v21.4s, v0.4s, v2.s[0]\n" - "fmla v23.4s, v0.4s, v2.s[1]\n" - "fmla v25.4s, v0.4s, v2.s[2]\n" - "fmla v27.4s, v0.4s, v2.s[3]\n" - - "fmla v6.4s, v29.4s, v1.s[0]\n" - "fmla v8.4s, v29.4s, v1.s[1]\n" - "fmla v10.4s, v29.4s, v1.s[2]\n" - "fmla v12.4s, v29.4s, v1.s[3]\n" - - "fmla v14.4s, v29.4s, v3.s[0]\n" - "fmla v16.4s, v29.4s, v3.s[1]\n" - "ldr q1, [x3, 48]!\n" - "ldr q0, [x0, 32]!\n" - "fmla v18.4s, v29.4s, v3.s[2]\n" - "fmla v20.4s, v29.4s, v3.s[3]\n" - - "fmla v22.4s, v29.4s, v2.s[0]\n" - "fmla v24.4s, v29.4s, v2.s[1]\n" - "ldr q3, [x3, 16]\n" - "subs x2, x2, #1\n" - "fmla v26.4s, v29.4s, v2.s[2]\n" - "fmla v28.4s, v29.4s, v2.s[3]\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", "x1", "x2", "x3" - ); - - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v5.4s, v5.4s, v1.4s\n" - "fmax v6.4s, v6.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v1.4s\n" - "fmax v9.4s, v9.4s, v1.4s\n" - "fmax v10.4s, v10.4s, v1.4s\n" - "fmax v11.4s, v11.4s, v1.4s\n" - "fmax v12.4s, v12.4s, v1.4s\n" - "fmax v13.4s, v13.4s, v1.4s\n" - "fmax v14.4s, v14.4s, v1.4s\n" - "fmax v15.4s, v15.4s, v1.4s\n" - "fmax v16.4s, v16.4s, v1.4s\n" - "fmax v17.4s, v17.4s, v1.4s\n" - "fmax v18.4s, v18.4s, v1.4s\n" - "fmax v19.4s, v19.4s, v1.4s\n" - "fmax v20.4s, v20.4s, v1.4s\n" - "fmax v21.4s, v21.4s, v1.4s\n" - "fmax v22.4s, v22.4s, v1.4s\n" - "fmax v23.4s, v23.4s, v1.4s\n" - "fmax v24.4s, v24.4s, v1.4s\n" - "fmax v25.4s, v25.4s, v1.4s\n" - "fmax v26.4s, v26.4s, v1.4s\n" - "fmax v27.4s, v27.4s, v1.4s\n" - "fmax v28.4s, v28.4s, v1.4s\n" - : - : - :"memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" // zero - "fmov v30.4s, 6.0\n" // six - "fmax v5.4s, v5.4s, v1.4s\n" - "fmax v6.4s, v6.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v1.4s\n" - "fmax v9.4s, v9.4s, v1.4s\n" - "fmax v10.4s, v10.4s, v1.4s\n" - 
"fmax v11.4s, v11.4s, v1.4s\n" - "fmax v12.4s, v12.4s, v1.4s\n" - "fmax v13.4s, v13.4s, v1.4s\n" - "fmax v14.4s, v14.4s, v1.4s\n" - "fmax v15.4s, v15.4s, v1.4s\n" - "fmax v16.4s, v16.4s, v1.4s\n" - "fmax v17.4s, v17.4s, v1.4s\n" - "fmax v18.4s, v18.4s, v1.4s\n" - "fmax v19.4s, v19.4s, v1.4s\n" - "fmax v20.4s, v20.4s, v1.4s\n" - "fmax v21.4s, v21.4s, v1.4s\n" - "fmax v22.4s, v22.4s, v1.4s\n" - "fmax v23.4s, v23.4s, v1.4s\n" - "fmax v24.4s, v24.4s, v1.4s\n" - "fmax v25.4s, v25.4s, v1.4s\n" - "fmax v26.4s, v26.4s, v1.4s\n" - "fmax v27.4s, v27.4s, v1.4s\n" - "fmax v28.4s, v28.4s, v1.4s\n" - - "fmin v5.4s, v5.4s, v30.4s\n" - "fmin v6.4s, v6.4s, v30.4s\n" - "fmin v7.4s, v7.4s, v30.4s\n" - "fmin v8.4s, v8.4s, v30.4s\n" - "fmin v9.4s, v9.4s, v30.4s\n" - "fmin v10.4s, v10.4s, v30.4s\n" - "fmin v11.4s, v11.4s, v30.4s\n" - "fmin v12.4s, v12.4s, v30.4s\n" - "fmin v13.4s, v13.4s, v30.4s\n" - "fmin v14.4s, v14.4s, v30.4s\n" - "fmin v15.4s, v15.4s, v30.4s\n" - "fmin v16.4s, v16.4s, v30.4s\n" - "fmin v17.4s, v17.4s, v30.4s\n" - "fmin v18.4s, v18.4s, v30.4s\n" - "fmin v19.4s, v19.4s, v30.4s\n" - "fmin v20.4s, v20.4s, v30.4s\n" - "fmin v21.4s, v21.4s, v30.4s\n" - "fmin v22.4s, v22.4s, v30.4s\n" - "fmin v23.4s, v23.4s, v30.4s\n" - "fmin v24.4s, v24.4s, v30.4s\n" - "fmin v25.4s, v25.4s, v30.4s\n" - "fmin v26.4s, v26.4s, v30.4s\n" - "fmin v27.4s, v27.4s, v30.4s\n" - "fmin v28.4s, v28.4s, v30.4s\n" - : - : - :"memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v30" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q5, [%[out_0]]\n" - "str q6, [%[out_0], #16]\n" - "str q7, [%[out_0], #32]\n" - "str q8, [%[out_0], #48]\n" - "str q9, [%[out_0], #64]\n" - "str q10, [%[out_0], #80]\n" - "str q11, [%[out_0], #96]\n" - "str q12, [%[out_0], #112]\n" - "str q13, [%[out_0], #128]\n" - "str q14, [%[out_0], #144]\n" - "str q15, [%[out_0], #160]\n" - "str q16, [%[out_0], #176]\n" - "str q17, [%[out_0], #192]\n" - "str q18, [%[out_0], #208]\n" - "str q19, [%[out_0], #224]\n" - "str q20, [%[out_0], #240]\n" - "str q21, [%[out_0], #256]\n" - "str q22, [%[out_0], #272]\n" - "str q23, [%[out_0], #288]\n" - "str q24, [%[out_0], #304]\n" - "str q25, [%[out_0], #320]\n" - "str q26, [%[out_0], #336]\n" - "str q27, [%[out_0], #352]\n" - "str q28, [%[out_0], #368]\n" - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28" - ); - b0 += 8; - b1 += 8; - } - } - - U32 ohow_s = (ohow / 12) * 12; - U32 ohow_tail = ohow - ohow_s; - - if (ohow_tail >= 8) { - I32 hw = ohow_s; - const F32 *b0 = biasArray; - const F32 *b1 = biasArray + 4; - F32 *in_pack = ((F32*)tmp) + ic*ih_pad*iw_pad*8; - // pack input - // NCHWc8 => NHWChw8 + im2col - U32 in_h[8] = {0}; - U32 in_w[8] = {0}; - - for (U32 i = 0; i < 8; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; - } - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F32 *in_hw8c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F32 *in_0 = in_hw8c8 + in_h[0]*iw_pad*8 + in_w[0]*8; - F32 *in_1 = in_hw8c8 + in_h[1]*iw_pad*8 + in_w[1]*8; - F32 *in_2 = in_hw8c8 + in_h[2]*iw_pad*8 + in_w[2]*8; - F32 *in_3 = in_hw8c8 + in_h[3]*iw_pad*8 + 
in_w[3]*8; - F32 *in_4 = in_hw8c8 + in_h[4]*iw_pad*8 + in_w[4]*8; - F32 *in_5 = in_hw8c8 + in_h[5]*iw_pad*8 + in_w[5]*8; - F32 *in_6 = in_hw8c8 + in_h[6]*iw_pad*8 + in_w[6]*8; - F32 *in_7 = in_hw8c8 + in_h[7]*iw_pad*8 + in_w[7]*8; - F32 *in_pack_c8hw8 = in_pack + fh_idx*fw*ic*8*8 + fw_idx*ic*8*8 + c*8*8; - - __asm__ __volatile__( - "ldp q0, q1, [%[in_0]]\n" - "ldp q2, q3, [%[in_1]]\n" - "ldp q4, q5, [%[in_2]]\n" - "ldp q6, q7, [%[in_3]]\n" - - "ldp q8, q9, [%[in_4]]\n" - "ldp q10, q11, [%[in_5]]\n" - "ldp q12, q13, [%[in_6]]\n" - "ldp q14, q15, [%[in_7]]\n" - - "zip1 v24.4s, v0.4s, v2.4s\n" - "zip2 v25.4s, v0.4s, v2.4s\n" - "zip1 v26.4s, v4.4s, v6.4s\n" - "zip2 v27.4s, v4.4s, v6.4s\n" - - "zip1 v0.2d, v24.2d, v26.2d\n" - "zip2 v2.2d, v24.2d, v26.2d\n" - "zip1 v4.2d, v25.2d, v27.2d\n" - "zip2 v6.2d, v25.2d, v27.2d\n" - - "zip1 v24.4s, v8.4s, v10.4s\n" - "zip2 v25.4s, v8.4s, v10.4s\n" - "zip1 v26.4s, v12.4s, v14.4s\n" - "zip2 v27.4s, v12.4s, v14.4s\n" - - "zip1 v8.2d, v24.2d, v26.2d\n" - "zip2 v10.2d, v24.2d, v26.2d\n" - "zip1 v12.2d, v25.2d, v27.2d\n" - "zip2 v14.2d, v25.2d, v27.2d\n" - - "stp q0, q8, [%[pack]]\n" - "stp q2, q10, [%[pack], #32]\n" - "stp q4, q12, [%[pack], #64]\n" - "stp q6, q14, [%[pack], #96]\n" - - "zip1 v24.4s, v1.4s, v3.4s\n" - "zip2 v25.4s, v1.4s, v3.4s\n" - "zip1 v26.4s, v5.4s, v7.4s\n" - "zip2 v27.4s, v5.4s, v7.4s\n" - - "zip1 v1.2d, v24.2d, v26.2d\n" - "zip2 v3.2d, v24.2d, v26.2d\n" - "zip1 v5.2d, v25.2d, v27.2d\n" - "zip2 v7.2d, v25.2d, v27.2d\n" - - "zip1 v24.4s, v9.4s, v11.4s\n" - "zip2 v25.4s, v9.4s, v11.4s\n" - "zip1 v26.4s, v13.4s, v15.4s\n" - "zip2 v27.4s, v13.4s, v15.4s\n" - - "zip1 v9.2d, v24.2d, v26.2d\n" - "zip2 v11.2d, v24.2d, v26.2d\n" - "zip1 v13.2d, v25.2d, v27.2d\n" - "zip2 v15.2d, v25.2d, v27.2d\n" - - "stp q1, q9, [%[pack], #128]\n" - "stp q3, q11, [%[pack], #160]\n" - "stp q5, q13, [%[pack], #192]\n" - "stp q7, q15, [%[pack], #224]\n" - : - :[pack]"r"(in_pack_c8hw8), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3), - [in_4]"r"(in_4), - [in_5]"r"(in_5), - [in_6]"r"(in_6), - [in_7]"r"(in_7) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", - "v12", "v13", "v14", "v15", "v24", "v25", "v26", "v27" - ); - } - } - } - - // compute - for (I32 o = 0; o < I32(oc); o++) { - F32 *in_hw0 = in_pack; - const F32 *f_o0c0 = filterArray + o*8*fh*fw*ic*8; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const F32 *b_o0 = b0; - const F32 *b_o1 = b1; - __asm__ __volatile__( - "ldr q27, [%[b_0]]\n" - "ldr q28, [%[b_1]]\n" - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" - - "mov v5.16b, v27.16b\n" - "ldr q1, [%[in_0]]\n" //in_hw0 - "mov v7.16b, v27.16b\n" - "mov v9.16b, v27.16b\n" - "mov v11.16b, v27.16b\n" - "ldr q0, [%[f_0]]\n" //f_o0c0 - "mov v13.16b, v27.16b\n" - "mov v15.16b, v27.16b\n" - "mov v17.16b, v27.16b\n" - "mov v19.16b, v27.16b\n" - - "mov v6.16b, v28.16b\n" - "mov v8.16b, v28.16b\n" - "mov v10.16b, v28.16b\n" - "mov v12.16b, v28.16b\n" - "mov v14.16b, v28.16b\n" - "mov v16.16b, v28.16b\n" - "mov v18.16b, v28.16b\n" - "mov v20.16b, v28.16b\n" - "0:\n" - "ldr q3, [x3, 16]!\n" - "ldr q29, [x0, 16]\n" - "fmla v5.4s, v0.4s, v1.s[0]\n" - "fmla v7.4s, v0.4s, v1.s[1]\n" - "fmla v9.4s, v0.4s, v1.s[2]\n" - "fmla v11.4s, v0.4s, v1.s[3]\n" - - "fmla v13.4s, v0.4s, v3.s[0]\n" - "fmla v15.4s, v0.4s, v3.s[1]\n" - "fmla v17.4s, v0.4s, v3.s[2]\n" - "fmla v19.4s, v0.4s, v3.s[3]\n" - - "fmla v6.4s, v29.4s, 
v1.s[0]\n" - "fmla v8.4s, v29.4s, v1.s[1]\n" - "fmla v10.4s, v29.4s, v1.s[2]\n" - "fmla v12.4s, v29.4s, v1.s[3]\n" - - "fmla v14.4s, v29.4s, v3.s[0]\n" - "fmla v16.4s, v29.4s, v3.s[1]\n" - "ldr q1, [x3, 16]!\n" - "ldr q0, [x0, 32]!\n" - "subs x2, x2, #1\n" - "fmla v18.4s, v29.4s, v3.s[2]\n" - "fmla v20.4s, v29.4s, v3.s[3]\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", - "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v27", "v28", "v29", "x0", "x1", "x2", "x3" - ); - - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v5.4s, v5.4s, v1.4s\n" - "fmax v6.4s, v6.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v1.4s\n" - "fmax v9.4s, v9.4s, v1.4s\n" - "fmax v10.4s, v10.4s, v1.4s\n" - "fmax v11.4s, v11.4s, v1.4s\n" - "fmax v12.4s, v12.4s, v1.4s\n" - "fmax v13.4s, v13.4s, v1.4s\n" - "fmax v14.4s, v14.4s, v1.4s\n" - "fmax v15.4s, v15.4s, v1.4s\n" - "fmax v16.4s, v16.4s, v1.4s\n" - "fmax v17.4s, v17.4s, v1.4s\n" - "fmax v18.4s, v18.4s, v1.4s\n" - "fmax v19.4s, v19.4s, v1.4s\n" - "fmax v20.4s, v20.4s, v1.4s\n" - : - : - :"memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" // zero - "fmov v30.4s, 6.0\n" // six - "fmax v5.4s, v5.4s, v1.4s\n" - "fmax v6.4s, v6.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v1.4s\n" - "fmax v9.4s, v9.4s, v1.4s\n" - "fmax v10.4s, v10.4s, v1.4s\n" - "fmax v11.4s, v11.4s, v1.4s\n" - "fmax v12.4s, v12.4s, v1.4s\n" - "fmax v13.4s, v13.4s, v1.4s\n" - "fmax v14.4s, v14.4s, v1.4s\n" - "fmax v15.4s, v15.4s, v1.4s\n" - "fmax v16.4s, v16.4s, v1.4s\n" - "fmax v17.4s, v17.4s, v1.4s\n" - "fmax v18.4s, v18.4s, v1.4s\n" - "fmax v19.4s, v19.4s, v1.4s\n" - "fmax v20.4s, v20.4s, v1.4s\n" - - "fmin v5.4s, v5.4s, v30.4s\n" - "fmin v6.4s, v6.4s, v30.4s\n" - "fmin v7.4s, v7.4s, v30.4s\n" - "fmin v8.4s, v8.4s, v30.4s\n" - "fmin v9.4s, v9.4s, v30.4s\n" - "fmin v10.4s, v10.4s, v30.4s\n" - "fmin v11.4s, v11.4s, v30.4s\n" - "fmin v12.4s, v12.4s, v30.4s\n" - "fmin v13.4s, v13.4s, v30.4s\n" - "fmin v14.4s, v14.4s, v30.4s\n" - "fmin v15.4s, v15.4s, v30.4s\n" - "fmin v16.4s, v16.4s, v30.4s\n" - "fmin v17.4s, v17.4s, v30.4s\n" - "fmin v18.4s, v18.4s, v30.4s\n" - "fmin v19.4s, v19.4s, v30.4s\n" - "fmin v20.4s, v20.4s, v30.4s\n" - : - : - :"memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v30" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q5, [%[out_0]]\n" - "str q6, [%[out_0], #16]\n" - "str q7, [%[out_0], #32]\n" - "str q8, [%[out_0], #48]\n" - "str q9, [%[out_0], #64]\n" - "str q10, [%[out_0], #80]\n" - "str q11, [%[out_0], #96]\n" - "str q12, [%[out_0], #112]\n" - "str q13, [%[out_0], #128]\n" - "str q14, [%[out_0], #144]\n" - "str q15, [%[out_0], #160]\n" - "str q16, [%[out_0], #176]\n" - "str q17, [%[out_0], #192]\n" - "str q18, [%[out_0], #208]\n" - "str q19, [%[out_0], #224]\n" - "str q20, [%[out_0], #240]\n" - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20" - ); - 
b0 += 8; - b1 += 8; - } - ohow_s += 8; - ohow_tail -= 8; - } - - if (ohow_tail >= 4) { - I32 hw = ohow_s; - const F32 *b0 = biasArray; - const F32 *b1 = biasArray + 4; - F32 *in_pack = ((F32*)tmp) + ic*ih_pad*iw_pad*8; - // pack input - // NCHWc8 => NHWChw4 + im2col - U32 in_h[4] = {0}; - U32 in_w[4] = {0}; - - for (U32 i = 0; i < 4; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; - } - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F32 *in_hw4c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F32 *in_0 = in_hw4c8 + in_h[0]*iw_pad*8 + in_w[0]*8; - F32 *in_1 = in_hw4c8 + in_h[1]*iw_pad*8 + in_w[1]*8; - F32 *in_2 = in_hw4c8 + in_h[2]*iw_pad*8 + in_w[2]*8; - F32 *in_3 = in_hw4c8 + in_h[3]*iw_pad*8 + in_w[3]*8; - F32 *in_pack_c8hw4 = in_pack + fh_idx*fw*ic*8*4 + fw_idx*ic*8*4 + c*8*4; - - __asm__ __volatile__( - "ldp q0, q4, [%[in_0]]\n" - "ldp q1, q5, [%[in_1]]\n" - "ldp q2, q6, [%[in_2]]\n" - "ldp q3, q7, [%[in_3]]\n" - - "st4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[pack]], #64\n" - "st4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[pack]]\n" - :[pack]"+r"(in_pack_c8hw4) - :[in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); - } - } - } - - // compute - for (I32 o = 0; o < I32(oc); o++) { - F32 *in_hw0 = in_pack; - const F32 *f_o0c0 = filterArray + o*8*fh*fw*ic*8; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const F32 *b_o0 = b0; - const F32 *b_o1 = b1; - __asm__ __volatile__( - "ldr q27, [%[b_0]]\n" - "ldr q28, [%[b_1]]\n" - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" - - "mov v5.16b, v27.16b\n" - "ldr q1, [%[in_0]]\n" //in_hw0 - "mov v7.16b, v27.16b\n" - "mov v9.16b, v27.16b\n" - "mov v11.16b, v27.16b\n" - "ldr q0, [%[f_0]]\n" //f_o0c0 - - "mov v6.16b, v28.16b\n" - "mov v8.16b, v28.16b\n" - "mov v10.16b, v28.16b\n" - "mov v12.16b, v28.16b\n" - "0:\n" - "ldr q3, [x3, 16]!\n" - "ldr q29, [x0, 16]\n" - "fmla v5.4s, v0.4s, v1.s[0]\n" - "fmla v7.4s, v0.4s, v1.s[1]\n" - "fmla v9.4s, v0.4s, v1.s[2]\n" - "fmla v11.4s, v0.4s, v1.s[3]\n" - - "fmla v6.4s, v29.4s, v1.s[0]\n" - "fmla v8.4s, v29.4s, v1.s[1]\n" - "ldr q0, [x0, 32]!\n" - "subs x2, x2, #1\n" - "fmla v10.4s, v29.4s, v1.s[2]\n" - "fmla v12.4s, v29.4s, v1.s[3]\n" - - "mov v1.16b, v3.16b\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", - "v12", "v27", "v28", "v29", "x0", "x1", "x2", "x3" - ); - - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v5.4s, v5.4s, v1.4s\n" - "fmax v6.4s, v6.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v1.4s\n" - "fmax v9.4s, v9.4s, v1.4s\n" - "fmax v10.4s, v10.4s, v1.4s\n" - "fmax v11.4s, v11.4s, v1.4s\n" - "fmax v12.4s, v12.4s, v1.4s\n" - : - : - :"memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" // zero - "fmov v30.4s, 6.0\n" // six - "fmax v5.4s, v5.4s, v1.4s\n" - "fmax v6.4s, v6.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v1.4s\n" - "fmax v9.4s, v9.4s, v1.4s\n" - "fmax v10.4s, v10.4s, 
v1.4s\n" - "fmax v11.4s, v11.4s, v1.4s\n" - "fmax v12.4s, v12.4s, v1.4s\n" - - "fmin v5.4s, v5.4s, v30.4s\n" - "fmin v6.4s, v6.4s, v30.4s\n" - "fmin v7.4s, v7.4s, v30.4s\n" - "fmin v8.4s, v8.4s, v30.4s\n" - "fmin v9.4s, v9.4s, v30.4s\n" - "fmin v10.4s, v10.4s, v30.4s\n" - "fmin v11.4s, v11.4s, v30.4s\n" - "fmin v12.4s, v12.4s, v30.4s\n" - : - : - :"memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v30" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q5, [%[out_0]]\n" - "str q6, [%[out_0], #16]\n" - "str q7, [%[out_0], #32]\n" - "str q8, [%[out_0], #48]\n" - "str q9, [%[out_0], #64]\n" - "str q10, [%[out_0], #80]\n" - "str q11, [%[out_0], #96]\n" - "str q12, [%[out_0], #112]\n" - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12" - ); - b0 += 8; - b1 += 8; - } - ohow_s += 4; - ohow_tail -= 4; - } - - for (I32 hw = ohow_s; hw < ohow; hw++) { - const F32 *b0 = biasArray; - const F32 *b1 = biasArray + 4; - F32 *in_pack = ((F32*)tmp) + ic*ih_pad*iw_pad*8; - // pack input - // NCHW => NCHWc8hw1 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F32 *in_hw1c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F32 *in_0 = in_hw1c8 + in_h_0*iw_pad*8 + in_w_0*8; - F32 *in_pack_c8hw1 = in_pack + fh_idx*fw*ic*8 + fw_idx*ic*8 + c*8; - - memcpy(in_pack_c8hw1, in_0, 8*bytesOf(idt)); - } - } - } - - // compute - for (I32 o = 0; o < I32(oc); o++) { - F32 *in_hw0 = in_pack; - const F32 *f_o0c0 = filterArray + o*8*fh*fw*ic*8; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const F32 *b_o0 = b0; - const F32 *b_o1 = b1; - __asm__ __volatile__( - "ldr q5, [%[b_0]]\n" - "ldr q6, [%[b_1]]\n" - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" - - "ldr s1, [%[in_0]]\n" //in_hw0 - "ldp q0, q29, [%[f_0]]\n" //f_o0c0 - - "0:\n" - "ldp q30, q28, [x0, #32]\n" - "ldr s3, [x3, #4]\n" - "fmla v5.4s, v0.4s, v1.s[0]\n" - "fmla v6.4s, v29.4s, v1.s[0]\n" - - - "ldr q0, [x0, #64]!\n" - "subs x2, x2, #2\n" - "ldr q29, [x0, #16]\n" - "ldr s1, [x3, #8]!\n" - "fmla v5.4s, v30.4s, v3.s[0]\n" - "fmla v6.4s, v28.4s, v3.s[0]\n" - - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v3", "v5", "v6", "v28", "v29", "v30", "x0", "x1", "x2", "x3" - ); - - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v5.4s, v5.4s, v1.4s\n" - "fmax v6.4s, v6.4s, v1.4s\n" - : - : - :"memory", "cc", "v1", "v5", "v6" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" // zero - "fmov v30.4s, 6.0\n" // six - "fmax v5.4s, v5.4s, v1.4s\n" - "fmax v6.4s, v6.4s, v1.4s\n" - - "fmin v5.4s, v5.4s, v30.4s\n" - "fmin v6.4s, v6.4s, v30.4s\n" - : - : - :"memory", "cc", "v1", "v5", "v6", "v30" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q5, [%[out_0]]\n" - "str q6, [%[out_0], #16]\n" - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v5", "v6" - ); - b0 += 8; - b1 += 8; - } - } - } - return ret; -} -#endif diff --git 
a/tensor_computing/src/cpu/arm/fp32/convolution_gemm_icnchw_V7.cpp b/tensor_computing/src/cpu/arm/fp32/convolution_gemm_icnchw_V7.cpp deleted file mode 100644 index 0d90205f..00000000 --- a/tensor_computing/src/cpu/arm/fp32/convolution_gemm_icnchw_V7.cpp +++ /dev/null @@ -1,404 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef __aarch64__ -#include -#include "cpu/arm/fp32/tensor_computing_fp32.h" - -EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, F32* inArray, - TensorDesc filterDesc, const F32* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F32* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F32* outArray, - ActivationDesc activationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (fdf != DF_NHWCN8) { - CHECK_STATUS(NOT_MATCH); - } - - I64 activation = 0; - switch (activationDesc.mode) { - case ACTIVATION_NULL: - activation = 0; - break; - case ACTIVATION_RELU: - activation = 1; - break; - default: - return NOT_SUPPORTED; - } - oc /= 8; - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - I32 ohow = oh*ow; - U32 ihiw = ih_pad*iw_pad; - F32 *inArray_pad; - EE ret = SUCCESS; - for (U32 n = 0; n < in; n++) { - if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { - inArray_pad = inArray + n*ic*ih*iw; - } else { - // copy input into a input with padding - inArray_pad = (F32*)tmp; - F32 *inArray_pad_mov = inArray_pad; - F32 *inArray_mov = inArray + n*ic*ih*iw; - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*bytesOf(idt)); - inArray_pad_mov += iw_pad; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*bytesOf(idt)); - inArray_pad_mov += paddingL; - 
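// [Editorial sketch, not part of the deleted file] The copy loop around this
// point (continued below) embeds each NCHW channel plane in a zero-filled
// (ih+pt+pb) x (iw+pl+pr) plane so the im2col packing never has to branch on
// borders. The same copy in plain C, with illustrative names:
#include <string.h>

static void pad_input_nchw(const float *src, float *dst, int ic, int ih, int iw,
                           int pt, int pb, int pl, int pr)
{
    const int iw_pad = iw + pl + pr;
    for (int c = 0; c < ic; c++) {
        const float *s = src + (size_t)c * ih * iw;
        float *d = dst + (size_t)c * (ih + pt + pb) * iw_pad;
        memset(d, 0, (size_t)pt * iw_pad * sizeof(float));       // top rows
        d += (size_t)pt * iw_pad;
        for (int h = 0; h < ih; h++) {
            memset(d, 0, (size_t)pl * sizeof(float));            // left border
            memcpy(d + pl, s, (size_t)iw * sizeof(float));       // row payload
            memset(d + pl + iw, 0, (size_t)pr * sizeof(float));  // right border
            d += iw_pad;
            s += iw;
        }
        memset(d, 0, (size_t)pb * iw_pad * sizeof(float));       // bottom rows
    }
}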
memcpy(inArray_pad_mov, inArray_mov, iw*bytesOf(idt)); - inArray_pad_mov += iw; - inArray_mov += iw; - memset(inArray_pad_mov, 0, paddingR*bytesOf(idt)); - inArray_pad_mov += paddingR; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*bytesOf(idt)); - inArray_pad_mov += iw_pad; - } - } - } - // ohow / 6 - for (I32 hw = 0; hw < ohow - 5; hw += 6) { - const F32 *b0 = biasArray; - const F32 *b1 = biasArray + 4; - F32 *in_pack = ((F32*)tmp) + ic*ih_pad*iw_pad; - // pack input - // NCHW => NHWChw12 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - U32 in_h_1 = ((hw+1)/ow)*strideH; - U32 in_w_1 = ((hw+1)%ow)*strideW; - U32 in_h_2 = ((hw+2)/ow)*strideH; - U32 in_w_2 = ((hw+2)%ow)*strideW; - U32 in_h_3 = ((hw+3)/ow)*strideH; - U32 in_w_3 = ((hw+3)%ow)*strideW; - U32 in_h_4 = ((hw+4)/ow)*strideH; - U32 in_w_4 = ((hw+4)%ow)*strideW; - U32 in_h_5 = ((hw+5)/ow)*strideH; - U32 in_w_5 = ((hw+5)%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F32 *in_hw = inArray_pad + c*ihiw + fh_idx*dilateH*iw_pad + dilateW*fw_idx; - F32 *in_0 = in_hw + in_h_0*iw_pad + in_w_0; - F32 *in_1 = in_hw + in_h_1*iw_pad + in_w_1; - F32 *in_2 = in_hw + in_h_2*iw_pad + in_w_2; - F32 *in_3 = in_hw + in_h_3*iw_pad + in_w_3; - F32 *in_4 = in_hw + in_h_4*iw_pad + in_w_4; - F32 *in_5 = in_hw + in_h_5*iw_pad + in_w_5; - F32 *in_pack_hw6 = in_pack + (fh_idx*fw*ic + fw_idx*ic + c)*6; - *in_pack_hw6 = *in_0; - *(in_pack_hw6+1) = *in_1; - *(in_pack_hw6+2) = *in_2; - *(in_pack_hw6+3) = *in_3; - *(in_pack_hw6+4) = *in_4; - *(in_pack_hw6+5) = *in_5; - } - } - } - - // compute - for (I32 o = 0; o < I32(oc); o++) { - F32 *in_hw0 = in_pack; - const F32 *f_o0c0 = filterArray + o*8*fh*fw*ic; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - - // bias - const F32 *b_o0 = b0; - const F32 *b_o1 = b1; - __asm__ __volatile__( - "vld1.f32 {d10-d11}, [%[b_0]]\n" - "vld1.f32 {d12-d13}, [%[b_1]]\n" - "mov r2, %[ic]\n" - - "vld1.f32 {d2-d3}, [%[in_0]]!\n" //in_hw0 - "vmov.f32 q7, q5\n" - "vmov.f32 q9, q5\n" - "vmov.f32 q11, q5\n" - "vld1.f32 {d0-d1}, [%[f_0]]!\n" //f_o0c0 - "vmov.f32 q13, q5\n" - "vmov.f32 q15, q5\n" - - "vmov.f32 q8, q6\n" - "vmov.f32 q10, q6\n" - "vmov.f32 q12, q6\n" - "vmov.f32 q14, q6\n" - "vmov.f32 q3, q6\n" - "0:\n" - "vld1.f32 {d4}, [%[in_0]]!\n" - "vld1.f32 {d8-d9}, [%[f_0]]!\n" - "vmla.f32 q5, q0, d2[0]\n" - "vmla.f32 q7, q0, d2[1]\n" - "vmla.f32 q9, q0, d3[0]\n" - "vmla.f32 q11, q0, d3[1]\n" - "vmla.f32 q13, q0, d4[0]\n" - "vmla.f32 q15, q0, d4[1]\n" - "vld1.f32 {d0-d1}, [%[f_0]]!\n" - - "vmla.f32 q6, q4, d2[0]\n" - "vmla.f32 q8, q4, d2[1]\n" - "vmla.f32 q10, q4, d3[0]\n" - "vmla.f32 q12, q4, d3[1]\n" - "vld1.f32 {d2-d3}, [%[in_0]]!\n" - "vmla.f32 q14, q4, d4[0]\n" - "vmla.f32 q3, q4, d4[1]\n" - "subs r2, r2, #1\n" - "bne 0b\n" - - "cmp %[activation], #0\n" - "beq 1f\n" - "veor q1, q1, q1\n" //zero - "vmax.f32 q5, q5, q1\n" - "vmax.f32 q6, q6, q1\n" - "vmax.f32 q7, q7, q1\n" - "vmax.f32 q8, q8, q1\n" - "vmax.f32 q9, q9, q1\n" - "vmax.f32 q10, q10, q1\n" - "vmax.f32 q11, q11, q1\n" - "vmax.f32 q12, q12, q1\n" - "vmax.f32 q13, q13, q1\n" - "vmax.f32 q14, q14, q1\n" - "vmax.f32 q15, q15, q1\n" - "vmax.f32 q3, q3, q1\n" - "1:\n" - "vst1.f32 {d10-d11}, [%[out_0]]!\n" - "vst1.f32 {d12-d13}, [%[out_0]]!\n" - "vst1.f32 {d14-d15}, [%[out_0]]!\n" - "vst1.f32 {d16-d17}, [%[out_0]]!\n" - "vst1.f32 {d18-d19}, [%[out_0]]!\n" - "vst1.f32 {d20-d21}, [%[out_0]]!\n" - 
"vst1.f32 {d22-d23}, [%[out_0]]!\n" - "vst1.f32 {d24-d25}, [%[out_0]]!\n" - "vst1.f32 {d26-d27}, [%[out_0]]!\n" - "vst1.f32 {d28-d29}, [%[out_0]]!\n" - "vst1.f32 {d30-d31}, [%[out_0]]!\n" - "vst1.f32 {d6-d7}, [%[out_0]]\n" - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [activation]"r"(activation) - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15", - "r2" - ); - b0 += 8; - b1 += 8; - } - } - - U32 ohow_s = (ohow / 6) * 6; - U32 ohow_tail = ohow - ohow_s; - - if (ohow_tail >= 4) { - I32 hw = ohow_s; - const F32 *b0 = biasArray; - const F32 *b1 = biasArray + 4; - F32 *in_pack = ((F32*)tmp) + ic*ih_pad*iw_pad; - // pack input - // NCHW => NHWChw4 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - U32 in_h_1 = ((hw+1)/ow)*strideH; - U32 in_w_1 = ((hw+1)%ow)*strideW; - U32 in_h_2 = ((hw+2)/ow)*strideH; - U32 in_w_2 = ((hw+2)%ow)*strideW; - U32 in_h_3 = ((hw+3)/ow)*strideH; - U32 in_w_3 = ((hw+3)%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F32 *in_hw = inArray_pad + c*ihiw + fh_idx*dilateH*iw_pad + dilateW*fw_idx; - F32 *in_0 = in_hw + in_h_0*iw_pad + in_w_0; - F32 *in_1 = in_hw + in_h_1*iw_pad + in_w_1; - F32 *in_2 = in_hw + in_h_2*iw_pad + in_w_2; - F32 *in_3 = in_hw + in_h_3*iw_pad + in_w_3; - F32 *in_pack_hw4 = in_pack + fh_idx*fw*ic*4 + fw_idx*ic*4 + c*4; - *in_pack_hw4 = *in_0; - *(in_pack_hw4+1) = *in_1; - *(in_pack_hw4+2) = *in_2; - *(in_pack_hw4+3) = *in_3; - } - } - } - - // compute - for (I32 o = 0; o < I32(oc); o++) { - F32 *in_hw0 = in_pack; - const F32 *f_o0c0 = filterArray + o*8*fh*fw*ic; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const F32 *b_o0 = b0; - const F32 *b_o1 = b1; - __asm__ __volatile__( - "vld1.f32 {d10-d11}, [%[b_0]]\n" - "vld1.f32 {d12-d13}, [%[b_1]]\n" - "mov r2, %[ic]\n" - - "vld1.f32 {d2-d3}, [%[in_0]]!\n" //in_hw0 - "vmov.f32 q7, q5\n" - "vmov.f32 q9, q5\n" - "vmov.f32 q11, q5\n" - "vld1.f32 {d0-d1}, [%[f_0]]!\n" //f_o0c0 - - "vmov.f32 q8, q6\n" - "vmov.f32 q10, q6\n" - "vmov.f32 q12, q6\n" - "0:\n" - "vld1.f32 {d4-d5}, [%[in_0]]!\n" - "vld1.f32 {d8-d9}, [%[f_0]]!\n" - "vmla.f32 q5, q0, d2[0]\n" - "vmla.f32 q7, q0, d2[1]\n" - "vmla.f32 q9, q0, d3[0]\n" - "vmla.f32 q11, q0, d3[1]\n" - - "vmla.f32 q6, q4, d2[0]\n" - "vmla.f32 q8, q4, d2[1]\n" - "subs r2, r2, #1\n" - "vmla.f32 q10, q4, d3[0]\n" - "vmla.f32 q12, q4, d3[1]\n" - "vmov.f32 q1, q2\n" - "bne 0b\n" - - "cmp %[activation], #0\n" - "beq 1f\n" - "veor q1, q1, q1\n" //zero - "vmax.f32 q5, q5, q1\n" - "vmax.f32 q6, q6, q1\n" - "vmax.f32 q7, q7, q1\n" - "vmax.f32 q8, q8, q1\n" - "vmax.f32 q9, q9, q1\n" - "vmax.f32 q10, q10, q1\n" - "vmax.f32 q11, q11, q1\n" - "vmax.f32 q12, q12, q1\n" - "1:\n" - "vst1.f32 {d10-d11}, [%[out_0]]!\n" - "vst1.f32 {d12-d13}, [%[out_0]]!\n" - "vst1.f32 {d14-d15}, [%[out_0]]!\n" - "vst1.f32 {d16-d17}, [%[out_0]]!\n" - "vst1.f32 {d18-d19}, [%[out_0]]!\n" - "vst1.f32 {d20-d21}, [%[out_0]]!\n" - "vst1.f32 {d22-d23}, [%[out_0]]!\n" - "vst1.f32 {d24-d25}, [%[out_0]]\n" - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [activation]"r"(activation) - :"memory", "cc", "q0", "q1", "q3", "q5", "q6", "q7", "q8", "q9", "q10", "q11", - "q12", "q4", "r2" - ); - b0 += 8; - b1 += 8; - } - ohow_s += 4; - ohow_tail 
-= 4; - } - - for (I32 hw = ohow_s; hw < ohow; hw++) { - const F32 *b0 = biasArray; - const F32 *b1 = biasArray + 4; - F32 *in_pack = ((F32*)tmp) + ic*ih_pad*iw_pad; - // pack input - // NCHW => NCHWc8hw1 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F32 *in_hw = inArray_pad + c*ihiw + fh_idx*dilateH*iw_pad + dilateW*fw_idx; - F32 *in_0 = in_hw + in_h_0*iw_pad + in_w_0; - F32 *in_pack_hw1 = in_pack + fh_idx*fw*ic + fw_idx*ic + c; - *in_pack_hw1 = *in_0; - } - } - } - - // compute - for (I32 o = 0; o < I32(oc); o++) { - F32 *in_hw0 = in_pack; - const F32 *f_o0c0 = filterArray + o*8*fh*fw*ic; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const F32 *b_o0 = b0; - const F32 *b_o1 = b1; - __asm__ __volatile__( - "vld1.f32 {d10-d11}, [%[b_0]]\n" - "vld1.f32 {d12-d13}, [%[b_1]]\n" - "mov r2, %[ic]\n" - - "0:\n" - "vld1.f32 {d0-d1}, [%[f_0]]!\n" - "vld1.f32 {d8-d9}, [%[f_0]]!\n" - "vld1.f32 {d2[0]}, [%[in_0]]!\n" - "subs r2, r2, #1\n" - "vmla.f32 q5, q0, d2[0]\n" - "vmla.f32 q6, q4, d2[0]\n" - "bne 0b\n" - - "cmp %[activation], #0\n" - "beq 1f\n" - "veor q1, q1, q1\n" //zero - "vmax.f32 q5, q5, q1\n" - "vmax.f32 q6, q6, q1\n" - "1:\n" - "vst1.f32 {d10-d11}, [%[out_0]]!\n" - "vst1.f32 {d12-d13}, [%[out_0]]\n" - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [activation]"r"(activation) - :"memory", "cc", "q0", "q1", "q5", "q6", "q4", "r2" - ); - b0 += 8; - b1 += 8; - } - } - } - return ret; -} -#endif diff --git a/tensor_computing/src/cpu/arm/fp32/convolution_gemm_icnchw_V8.cpp b/tensor_computing/src/cpu/arm/fp32/convolution_gemm_icnchw_V8.cpp deleted file mode 100644 index cf3b0378..00000000 --- a/tensor_computing/src/cpu/arm/fp32/convolution_gemm_icnchw_V8.cpp +++ /dev/null @@ -1,874 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
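// [Editorial sketch, not part of the deleted file] convolution_gemm_icnchw_V8,
// deleted below, takes a plain NCHW input, im2col-packs strips of 12/8/4/1
// output pixels into NHWChw{12,8,4,1} order, and multiplies them against
// NHWCN8 filters with AArch64 fmla kernels. A hedged plain-C rendering of the
// 4-pixel packing step (function and parameter names are illustrative):
static void im2col_pack_hw4(const float *in_pad, float *pack,
                            int ic, int ih_pad, int iw_pad, int fh, int fw,
                            int ow, int stride_h, int stride_w,
                            int dilate_h, int dilate_w, int hw)
{
    for (int i = 0; i < 4; i++) {
        // top-left corner of the receptive field of output pixel hw+i
        int h = ((hw + i) / ow) * stride_h;
        int w = ((hw + i) % ow) * stride_w;
        for (int c = 0; c < ic; c++) {
            for (int fy = 0; fy < fh; fy++) {
                for (int fx = 0; fx < fw; fx++) {
                    const float *src = in_pad
                        + ((size_t)c * ih_pad + h + fy * dilate_h) * iw_pad
                        + w + fx * dilate_w;
                    // destination layout: [fy][fx][c][4 pixels]
                    pack[(((size_t)fy * fw + fx) * ic + c) * 4 + i] = *src;
                }
            }
        }
    }
}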
- - -#ifdef __aarch64__ -#include -#include "cpu/arm/fp32/tensor_computing_fp32.h" - -EE convolution_gemm_icnchw_V8(TensorDesc inputDesc, F32* inArray, - TensorDesc filterDesc, const F32* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F32* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F32* outArray, - ActivationDesc activationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (fdf != DF_NHWCN8) { - CHECK_STATUS(NOT_MATCH); - } - - oc /= 8; - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - I32 ohow = oh*ow; - U32 ihiw = ih_pad*iw_pad; - F32 *inArray_pad; - EE ret = SUCCESS; - for (U32 n = 0; n < in; n++) { - if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { - inArray_pad = inArray + n*ic*ih*iw; - } else { - // copy input into a input with padding - inArray_pad = (F32*)tmp; - F32 *inArray_pad_mov = inArray_pad; - F32 *inArray_mov = inArray + n*ic*ih*iw; - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*bytesOf(idt)); - inArray_pad_mov += iw_pad; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*bytesOf(idt)); - inArray_pad_mov += paddingL; - memcpy(inArray_pad_mov, inArray_mov, iw*bytesOf(idt)); - inArray_pad_mov += iw; - inArray_mov += iw; - memset(inArray_pad_mov, 0, paddingR*bytesOf(idt)); - inArray_pad_mov += paddingR; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*bytesOf(idt)); - inArray_pad_mov += iw_pad; - } - } - } - // ohow / 12 - for (I32 hw = 0; hw < ohow - 11; hw += 12) { - const F32 *b0 = biasArray; - const F32 *b1 = biasArray + 4; - F32 *in_pack = ((F32*)tmp) + ic*ih_pad*iw_pad; - // pack input - // NCHW => NHWChw12 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - U32 in_h_1 = ((hw+1)/ow)*strideH; - U32 in_w_1 = ((hw+1)%ow)*strideW; - U32 in_h_2 = ((hw+2)/ow)*strideH; - U32 in_w_2 = ((hw+2)%ow)*strideW; - U32 in_h_3 = ((hw+3)/ow)*strideH; - U32 in_w_3 = ((hw+3)%ow)*strideW; - U32 in_h_4 = ((hw+4)/ow)*strideH; - U32 in_w_4 = ((hw+4)%ow)*strideW; - U32 in_h_5 = ((hw+5)/ow)*strideH; - U32 in_w_5 = ((hw+5)%ow)*strideW; - U32 in_h_6 = ((hw+6)/ow)*strideH; - U32 in_w_6 = ((hw+6)%ow)*strideW; - U32 in_h_7 = ((hw+7)/ow)*strideH; - U32 in_w_7 = ((hw+7)%ow)*strideW; - U32 in_h_8 = ((hw+8)/ow)*strideH; - U32 in_w_8 = ((hw+8)%ow)*strideW; - U32 in_h_9 = ((hw+9)/ow)*strideH; - U32 in_w_9 = ((hw+9)%ow)*strideW; - U32 in_h_10 = ((hw+10)/ow)*strideH; - U32 in_w_10 = ((hw+10)%ow)*strideW; - U32 in_h_11 = ((hw+11)/ow)*strideH; - U32 in_w_11 = ((hw+11)%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F32 *in_hw = inArray_pad + c*ihiw + fh_idx*dilateH*iw_pad + dilateW*fw_idx; - F32 *in_0 = 
in_hw + in_h_0*iw_pad + in_w_0; - F32 *in_1 = in_hw + in_h_1*iw_pad + in_w_1; - F32 *in_2 = in_hw + in_h_2*iw_pad + in_w_2; - F32 *in_3 = in_hw + in_h_3*iw_pad + in_w_3; - F32 *in_4 = in_hw + in_h_4*iw_pad + in_w_4; - F32 *in_5 = in_hw + in_h_5*iw_pad + in_w_5; - F32 *in_6 = in_hw + in_h_6*iw_pad + in_w_6; - F32 *in_7 = in_hw + in_h_7*iw_pad + in_w_7; - F32 *in_8 = in_hw + in_h_8*iw_pad + in_w_8; - F32 *in_9 = in_hw + in_h_9*iw_pad + in_w_9; - F32 *in_10 = in_hw + in_h_10*iw_pad + in_w_10; - F32 *in_11 = in_hw + in_h_11*iw_pad + in_w_11; - F32 *in_pack_hw12 = in_pack + fh_idx*fw*ic*12 + fw_idx*ic*12 + c*12; - *in_pack_hw12 = *in_0; - *(in_pack_hw12+1) = *in_1; - *(in_pack_hw12+2) = *in_2; - *(in_pack_hw12+3) = *in_3; - *(in_pack_hw12+4) = *in_4; - *(in_pack_hw12+5) = *in_5; - *(in_pack_hw12+6) = *in_6; - *(in_pack_hw12+7) = *in_7; - *(in_pack_hw12+8) = *in_8; - *(in_pack_hw12+9) = *in_9; - *(in_pack_hw12+10) = *in_10; - *(in_pack_hw12+11) = *in_11; - } - } - } - - // compute - for (I32 o = 0; o < I32(oc); o++) { - F32 *in_hw0 = in_pack; - const F32 *f_o0c0 = filterArray + o*8*fh*fw*ic; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - - // bias - const F32 *b_o0 = b0; - const F32 *b_o1 = b1; - __asm__ __volatile__( - "ldr q5, [%[b_0]]\n" - "ldr q6, [%[b_1]]\n" - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" - - "ldr q1, [%[in_0]]\n" //in_hw0 - "mov v7.16b, v5.16b\n" - "mov v9.16b, v5.16b\n" - "mov v11.16b, v5.16b\n" - "ldr q0, [%[f_0]]\n" //f_o0c0 - "mov v13.16b, v5.16b\n" - "mov v15.16b, v5.16b\n" - "mov v17.16b, v5.16b\n" - "ldr q3, [%[in_0], #16]\n" - "mov v19.16b, v5.16b\n" - "mov v21.16b, v5.16b\n" - "mov v23.16b, v5.16b\n" - "mov v25.16b, v5.16b\n" - "mov v27.16b, v5.16b\n" - - "mov v8.16b, v6.16b\n" - "mov v10.16b, v6.16b\n" - "mov v12.16b, v6.16b\n" - "mov v14.16b, v6.16b\n" - "mov v16.16b, v6.16b\n" - "mov v18.16b, v6.16b\n" - "mov v20.16b, v6.16b\n" - "mov v22.16b, v6.16b\n" - "mov v24.16b, v6.16b\n" - "mov v26.16b, v6.16b\n" - "mov v28.16b, v6.16b\n" - "0:\n" - "fmla v5.4s, v0.4s, v1.s[0]\n" - "fmla v7.4s, v0.4s, v1.s[1]\n" - "ldr q2, [x3, 32]\n" - "ldr q4, [x0, 16]\n" - "fmla v9.4s, v0.4s, v1.s[2]\n" - "fmla v11.4s, v0.4s, v1.s[3]\n" - - "fmla v13.4s, v0.4s, v3.s[0]\n" - "fmla v15.4s, v0.4s, v3.s[1]\n" - "fmla v17.4s, v0.4s, v3.s[2]\n" - "fmla v19.4s, v0.4s, v3.s[3]\n" - - "fmla v21.4s, v0.4s, v2.s[0]\n" - "fmla v23.4s, v0.4s, v2.s[1]\n" - "fmla v25.4s, v0.4s, v2.s[2]\n" - "fmla v27.4s, v0.4s, v2.s[3]\n" - - "fmla v6.4s, v4.4s, v1.s[0]\n" - "fmla v8.4s, v4.4s, v1.s[1]\n" - "fmla v10.4s, v4.4s, v1.s[2]\n" - "fmla v12.4s, v4.4s, v1.s[3]\n" - - "fmla v14.4s, v4.4s, v3.s[0]\n" - "fmla v16.4s, v4.4s, v3.s[1]\n" - "ldr q1, [x3, 48]!\n" - "ldr q0, [x0, 32]!\n" - "fmla v18.4s, v4.4s, v3.s[2]\n" - "fmla v20.4s, v4.4s, v3.s[3]\n" - - "fmla v22.4s, v4.4s, v2.s[0]\n" - "fmla v24.4s, v4.4s, v2.s[1]\n" - "ldr q3, [x3, 16]\n" - "subs x2, x2, #1\n" - "fmla v26.4s, v4.4s, v2.s[2]\n" - "fmla v28.4s, v4.4s, v2.s[3]\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v4", "v30", "x0", "x1", "x2", "x3" - ); - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, 
v1.16b\n" //zero - "fmax v5.4s, v5.4s, v1.4s\n" - "fmax v6.4s, v6.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v1.4s\n" - "fmax v9.4s, v9.4s, v1.4s\n" - "fmax v10.4s, v10.4s, v1.4s\n" - "fmax v11.4s, v11.4s, v1.4s\n" - "fmax v12.4s, v12.4s, v1.4s\n" - "fmax v13.4s, v13.4s, v1.4s\n" - "fmax v14.4s, v14.4s, v1.4s\n" - "fmax v15.4s, v15.4s, v1.4s\n" - "fmax v16.4s, v16.4s, v1.4s\n" - "fmax v17.4s, v17.4s, v1.4s\n" - "fmax v18.4s, v18.4s, v1.4s\n" - "fmax v19.4s, v19.4s, v1.4s\n" - "fmax v20.4s, v20.4s, v1.4s\n" - "fmax v21.4s, v21.4s, v1.4s\n" - "fmax v22.4s, v22.4s, v1.4s\n" - "fmax v23.4s, v23.4s, v1.4s\n" - "fmax v24.4s, v24.4s, v1.4s\n" - "fmax v25.4s, v25.4s, v1.4s\n" - "fmax v26.4s, v26.4s, v1.4s\n" - "fmax v27.4s, v27.4s, v1.4s\n" - "fmax v28.4s, v28.4s, v1.4s\n" - : - : - :"memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" // zero - "fmov v30.4s, 6.0\n" // six - "fmax v5.4s, v5.4s, v1.4s\n" - "fmax v6.4s, v6.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v1.4s\n" - "fmax v9.4s, v9.4s, v1.4s\n" - "fmax v10.4s, v10.4s, v1.4s\n" - "fmax v11.4s, v11.4s, v1.4s\n" - "fmax v12.4s, v12.4s, v1.4s\n" - "fmax v13.4s, v13.4s, v1.4s\n" - "fmax v14.4s, v14.4s, v1.4s\n" - "fmax v15.4s, v15.4s, v1.4s\n" - "fmax v16.4s, v16.4s, v1.4s\n" - "fmax v17.4s, v17.4s, v1.4s\n" - "fmax v18.4s, v18.4s, v1.4s\n" - "fmax v19.4s, v19.4s, v1.4s\n" - "fmax v20.4s, v20.4s, v1.4s\n" - "fmax v21.4s, v21.4s, v1.4s\n" - "fmax v22.4s, v22.4s, v1.4s\n" - "fmax v23.4s, v23.4s, v1.4s\n" - "fmax v24.4s, v24.4s, v1.4s\n" - "fmax v25.4s, v25.4s, v1.4s\n" - "fmax v26.4s, v26.4s, v1.4s\n" - "fmax v27.4s, v27.4s, v1.4s\n" - "fmax v28.4s, v28.4s, v1.4s\n" - - "fmin v5.4s, v5.4s, v30.4s\n" - "fmin v6.4s, v6.4s, v30.4s\n" - "fmin v7.4s, v7.4s, v30.4s\n" - "fmin v8.4s, v8.4s, v30.4s\n" - "fmin v9.4s, v9.4s, v30.4s\n" - "fmin v10.4s, v10.4s, v30.4s\n" - "fmin v11.4s, v11.4s, v30.4s\n" - "fmin v12.4s, v12.4s, v30.4s\n" - "fmin v13.4s, v13.4s, v30.4s\n" - "fmin v14.4s, v14.4s, v30.4s\n" - "fmin v15.4s, v15.4s, v30.4s\n" - "fmin v16.4s, v16.4s, v30.4s\n" - "fmin v17.4s, v17.4s, v30.4s\n" - "fmin v18.4s, v18.4s, v30.4s\n" - "fmin v19.4s, v19.4s, v30.4s\n" - "fmin v20.4s, v20.4s, v30.4s\n" - "fmin v21.4s, v21.4s, v30.4s\n" - "fmin v22.4s, v22.4s, v30.4s\n" - "fmin v23.4s, v23.4s, v30.4s\n" - "fmin v24.4s, v24.4s, v30.4s\n" - "fmin v25.4s, v25.4s, v30.4s\n" - "fmin v26.4s, v26.4s, v30.4s\n" - "fmin v27.4s, v27.4s, v30.4s\n" - "fmin v28.4s, v28.4s, v30.4s\n" - : - : - :"memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v30" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q5, [%[out_0]]\n" - "str q6, [%[out_0], #16]\n" - "str q7, [%[out_0], #32]\n" - "str q8, [%[out_0], #48]\n" - "str q9, [%[out_0], #64]\n" - "str q10, [%[out_0], #80]\n" - "str q11, [%[out_0], #96]\n" - "str q12, [%[out_0], #112]\n" - "str q13, [%[out_0], #128]\n" - "str q14, [%[out_0], #144]\n" - "str q15, [%[out_0], #160]\n" - "str q16, [%[out_0], #176]\n" - "str q17, [%[out_0], #192]\n" - "str q18, [%[out_0], #208]\n" - "str q19, [%[out_0], #224]\n" - "str q20, [%[out_0], #240]\n" - "str q21, [%[out_0], 
#256]\n" - "str q22, [%[out_0], #272]\n" - "str q23, [%[out_0], #288]\n" - "str q24, [%[out_0], #304]\n" - "str q25, [%[out_0], #320]\n" - "str q26, [%[out_0], #336]\n" - "str q27, [%[out_0], #352]\n" - "str q28, [%[out_0], #368]\n" - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28" - ); - b0 += 8; - b1 += 8; - } - } - - U32 ohow_s = (ohow / 12) * 12; - U32 ohow_tail = ohow - ohow_s; - - if (ohow_tail >= 8) { - I32 hw = ohow_s; - const F32 *b0 = biasArray; - const F32 *b1 = biasArray + 4; - F32 *in_pack = ((F32*)tmp) + ic*ih_pad*iw_pad; - // pack input - // NCHW => NHWChw8 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - U32 in_h_1 = ((hw+1)/ow)*strideH; - U32 in_w_1 = ((hw+1)%ow)*strideW; - U32 in_h_2 = ((hw+2)/ow)*strideH; - U32 in_w_2 = ((hw+2)%ow)*strideW; - U32 in_h_3 = ((hw+3)/ow)*strideH; - U32 in_w_3 = ((hw+3)%ow)*strideW; - U32 in_h_4 = ((hw+4)/ow)*strideH; - U32 in_w_4 = ((hw+4)%ow)*strideW; - U32 in_h_5 = ((hw+5)/ow)*strideH; - U32 in_w_5 = ((hw+5)%ow)*strideW; - U32 in_h_6 = ((hw+6)/ow)*strideH; - U32 in_w_6 = ((hw+6)%ow)*strideW; - U32 in_h_7 = ((hw+7)/ow)*strideH; - U32 in_w_7 = ((hw+7)%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F32 *in_hw = inArray_pad + c*ihiw + fh_idx*dilateH*iw_pad + dilateW*fw_idx; - F32 *in_0 = in_hw + in_h_0*iw_pad + in_w_0; - F32 *in_1 = in_hw + in_h_1*iw_pad + in_w_1; - F32 *in_2 = in_hw + in_h_2*iw_pad + in_w_2; - F32 *in_3 = in_hw + in_h_3*iw_pad + in_w_3; - F32 *in_4 = in_hw + in_h_4*iw_pad + in_w_4; - F32 *in_5 = in_hw + in_h_5*iw_pad + in_w_5; - F32 *in_6 = in_hw + in_h_6*iw_pad + in_w_6; - F32 *in_7 = in_hw + in_h_7*iw_pad + in_w_7; - F32 *in_pack_hw8 = in_pack + fh_idx*fw*ic*8 + fw_idx*ic*8 + c*8; - *in_pack_hw8 = *in_0; - *(in_pack_hw8+1) = *in_1; - *(in_pack_hw8+2) = *in_2; - *(in_pack_hw8+3) = *in_3; - *(in_pack_hw8+4) = *in_4; - *(in_pack_hw8+5) = *in_5; - *(in_pack_hw8+6) = *in_6; - *(in_pack_hw8+7) = *in_7; - } - } - } - - // compute - for (I32 o = 0; o < I32(oc); o++) { - F32 *in_hw0 = in_pack; - const F32 *f_o0c0 = filterArray + o*8*fh*fw*ic; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const F32 *b_o0 = b0; - const F32 *b_o1 = b1; - __asm__ __volatile__( - "ldr q5, [%[b_0]]\n" - "ldr q6, [%[b_1]]\n" - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" - - "ldr q1, [%[in_0]]\n" //in_hw0 - "mov v7.16b, v5.16b\n" - "mov v9.16b, v5.16b\n" - "mov v11.16b, v5.16b\n" - "ldr q0, [%[f_0]]\n" //f_o0c0 - "mov v13.16b, v5.16b\n" - "mov v15.16b, v5.16b\n" - "mov v17.16b, v5.16b\n" - "mov v19.16b, v5.16b\n" - - "mov v6.16b, v6.16b\n" - "mov v8.16b, v6.16b\n" - "mov v10.16b, v6.16b\n" - "mov v12.16b, v6.16b\n" - "mov v14.16b, v6.16b\n" - "mov v16.16b, v6.16b\n" - "mov v18.16b, v6.16b\n" - "mov v20.16b, v6.16b\n" - "0:\n" - "ldr q3, [x3, 16]!\n" - "ldr q4, [x0, 16]\n" - "fmla v5.4s, v0.4s, v1.s[0]\n" - "fmla v7.4s, v0.4s, v1.s[1]\n" - "fmla v9.4s, v0.4s, v1.s[2]\n" - "fmla v11.4s, v0.4s, v1.s[3]\n" - - "fmla v13.4s, v0.4s, v3.s[0]\n" - "fmla v15.4s, v0.4s, v3.s[1]\n" - "fmla v17.4s, v0.4s, v3.s[2]\n" - "fmla v19.4s, v0.4s, v3.s[3]\n" - - "fmla v6.4s, v4.4s, v1.s[0]\n" - "fmla v8.4s, v4.4s, v1.s[1]\n" - "fmla v10.4s, v4.4s, v1.s[2]\n" - "fmla v12.4s, v4.4s, v1.s[3]\n" - - 
"fmla v14.4s, v4.4s, v3.s[0]\n" - "fmla v16.4s, v4.4s, v3.s[1]\n" - "ldr q1, [x3, 16]!\n" - "ldr q0, [x0, 32]!\n" - "subs x2, x2, #1\n" - "fmla v18.4s, v4.4s, v3.s[2]\n" - "fmla v20.4s, v4.4s, v3.s[3]\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", - "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v4", "x0", "x1", "x2", "x3" - ); - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v5.4s, v5.4s, v1.4s\n" - "fmax v6.4s, v6.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v1.4s\n" - "fmax v9.4s, v9.4s, v1.4s\n" - "fmax v10.4s, v10.4s, v1.4s\n" - "fmax v11.4s, v11.4s, v1.4s\n" - "fmax v12.4s, v12.4s, v1.4s\n" - "fmax v13.4s, v13.4s, v1.4s\n" - "fmax v14.4s, v14.4s, v1.4s\n" - "fmax v15.4s, v15.4s, v1.4s\n" - "fmax v16.4s, v16.4s, v1.4s\n" - "fmax v17.4s, v17.4s, v1.4s\n" - "fmax v18.4s, v18.4s, v1.4s\n" - "fmax v19.4s, v19.4s, v1.4s\n" - "fmax v20.4s, v20.4s, v1.4s\n" - : - : - :"memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" // zero - "fmov v30.4s, 6.0\n" // six - "fmax v5.4s, v5.4s, v1.4s\n" - "fmax v6.4s, v6.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v1.4s\n" - "fmax v9.4s, v9.4s, v1.4s\n" - "fmax v10.4s, v10.4s, v1.4s\n" - "fmax v11.4s, v11.4s, v1.4s\n" - "fmax v12.4s, v12.4s, v1.4s\n" - "fmax v13.4s, v13.4s, v1.4s\n" - "fmax v14.4s, v14.4s, v1.4s\n" - "fmax v15.4s, v15.4s, v1.4s\n" - "fmax v16.4s, v16.4s, v1.4s\n" - "fmax v17.4s, v17.4s, v1.4s\n" - "fmax v18.4s, v18.4s, v1.4s\n" - "fmax v19.4s, v19.4s, v1.4s\n" - "fmax v20.4s, v20.4s, v1.4s\n" - - "fmin v5.4s, v5.4s, v30.4s\n" - "fmin v6.4s, v6.4s, v30.4s\n" - "fmin v7.4s, v7.4s, v30.4s\n" - "fmin v8.4s, v8.4s, v30.4s\n" - "fmin v9.4s, v9.4s, v30.4s\n" - "fmin v10.4s, v10.4s, v30.4s\n" - "fmin v11.4s, v11.4s, v30.4s\n" - "fmin v12.4s, v12.4s, v30.4s\n" - "fmin v13.4s, v13.4s, v30.4s\n" - "fmin v14.4s, v14.4s, v30.4s\n" - "fmin v15.4s, v15.4s, v30.4s\n" - "fmin v16.4s, v16.4s, v30.4s\n" - "fmin v17.4s, v17.4s, v30.4s\n" - "fmin v18.4s, v18.4s, v30.4s\n" - "fmin v19.4s, v19.4s, v30.4s\n" - "fmin v20.4s, v20.4s, v30.4s\n" - : - : - :"memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v30" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q5, [%[out_0]]\n" - "str q6, [%[out_0], #16]\n" - "str q7, [%[out_0], #32]\n" - "str q8, [%[out_0], #48]\n" - "str q9, [%[out_0], #64]\n" - "str q10, [%[out_0], #80]\n" - "str q11, [%[out_0], #96]\n" - "str q12, [%[out_0], #112]\n" - "str q13, [%[out_0], #128]\n" - "str q14, [%[out_0], #144]\n" - "str q15, [%[out_0], #160]\n" - "str q16, [%[out_0], #176]\n" - "str q17, [%[out_0], #192]\n" - "str q18, [%[out_0], #208]\n" - "str q19, [%[out_0], #224]\n" - "str q20, [%[out_0], #240]\n" - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20" - ); - b0 += 8; - b1 += 8; - } - ohow_s += 8; - ohow_tail -= 8; - } - - if (ohow_tail >= 4) { - I32 hw = ohow_s; - const F32 *b0 = biasArray; - 
const F32 *b1 = biasArray + 4; - F32 *in_pack = ((F32*)tmp) + ic*ih_pad*iw_pad; - // pack input - // NCHW => NHWChw4 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - U32 in_h_1 = ((hw+1)/ow)*strideH; - U32 in_w_1 = ((hw+1)%ow)*strideW; - U32 in_h_2 = ((hw+2)/ow)*strideH; - U32 in_w_2 = ((hw+2)%ow)*strideW; - U32 in_h_3 = ((hw+3)/ow)*strideH; - U32 in_w_3 = ((hw+3)%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F32 *in_hw = inArray_pad + c*ihiw + fh_idx*dilateH*iw_pad + dilateW*fw_idx; - F32 *in_0 = in_hw + in_h_0*iw_pad + in_w_0; - F32 *in_1 = in_hw + in_h_1*iw_pad + in_w_1; - F32 *in_2 = in_hw + in_h_2*iw_pad + in_w_2; - F32 *in_3 = in_hw + in_h_3*iw_pad + in_w_3; - F32 *in_pack_hw4 = in_pack + fh_idx*fw*ic*4 + fw_idx*ic*4 + c*4; - *in_pack_hw4 = *in_0; - *(in_pack_hw4+1) = *in_1; - *(in_pack_hw4+2) = *in_2; - *(in_pack_hw4+3) = *in_3; - } - } - } - - // compute - for (I32 o = 0; o < I32(oc); o++) { - F32 *in_hw0 = in_pack; - const F32 *f_o0c0 = filterArray + o*8*fh*fw*ic; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const F32 *b_o0 = b0; - const F32 *b_o1 = b1; - __asm__ __volatile__( - "ldr q5, [%[b_0]]\n" - "ldr q6, [%[b_1]]\n" - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" - - "ldr q1, [%[in_0]]\n" //in_hw0 - "mov v7.16b, v5.16b\n" - "mov v9.16b, v5.16b\n" - "mov v11.16b, v5.16b\n" - "ldr q0, [%[f_0]]\n" //f_o0c0 - - "mov v6.16b, v6.16b\n" - "mov v8.16b, v6.16b\n" - "mov v10.16b, v6.16b\n" - "mov v12.16b, v6.16b\n" - "0:\n" - "ldr q3, [x3, 16]!\n" - "ldr q4, [x0, 16]\n" - "fmla v5.4s, v0.4s, v1.s[0]\n" - "fmla v7.4s, v0.4s, v1.s[1]\n" - "fmla v9.4s, v0.4s, v1.s[2]\n" - "fmla v11.4s, v0.4s, v1.s[3]\n" - - "fmla v6.4s, v4.4s, v1.s[0]\n" - "fmla v8.4s, v4.4s, v1.s[1]\n" - "ldr q0, [x0, 32]!\n" - "subs x2, x2, #1\n" - "fmla v10.4s, v4.4s, v1.s[2]\n" - "fmla v12.4s, v4.4s, v1.s[3]\n" - "mov v1.16b, v3.16b\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", - "v12", "v4", "x0", "x1", "x2", "x3" - ); - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v5.4s, v5.4s, v1.4s\n" - "fmax v6.4s, v6.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v1.4s\n" - "fmax v9.4s, v9.4s, v1.4s\n" - "fmax v10.4s, v10.4s, v1.4s\n" - "fmax v11.4s, v11.4s, v1.4s\n" - "fmax v12.4s, v12.4s, v1.4s\n" - : - : - :"memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" // zero - "fmov v30.4s, 6.0\n" // six - "fmax v5.4s, v5.4s, v1.4s\n" - "fmax v6.4s, v6.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v1.4s\n" - "fmax v8.4s, v8.4s, v1.4s\n" - "fmax v9.4s, v9.4s, v1.4s\n" - "fmax v10.4s, v10.4s, v1.4s\n" - "fmax v11.4s, v11.4s, v1.4s\n" - "fmax v12.4s, v12.4s, v1.4s\n" - - "fmin v5.4s, v5.4s, v30.4s\n" - "fmin v6.4s, v6.4s, v30.4s\n" - "fmin v7.4s, v7.4s, v30.4s\n" - "fmin v8.4s, v8.4s, v30.4s\n" - "fmin v9.4s, v9.4s, v30.4s\n" - "fmin v10.4s, v10.4s, v30.4s\n" - "fmin v11.4s, v11.4s, v30.4s\n" - "fmin v12.4s, v12.4s, v30.4s\n" - : - : - :"memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v30" - ); - 
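// [Editorial note] This ReLU6 epilogue is a plain clamp: every accumulator is
// maxed with zero and then minned with the constant 6.0 that
// "fmov v30.4s, 6.0" materialises. With NEON intrinsics the same step would be
//   x = vminq_f32(vmaxq_f32(x, vdupq_n_f32(0.0f)), vdupq_n_f32(6.0f));
// applied to each of v5..v12.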
break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q5, [%[out_0]]\n" - "str q6, [%[out_0], #16]\n" - "str q7, [%[out_0], #32]\n" - "str q8, [%[out_0], #48]\n" - "str q9, [%[out_0], #64]\n" - "str q10, [%[out_0], #80]\n" - "str q11, [%[out_0], #96]\n" - "str q12, [%[out_0], #112]\n" - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12" - ); - b0 += 8; - b1 += 8; - } - ohow_s += 4; - ohow_tail -= 4; - } - - for (I32 hw = ohow_s; hw < ohow; hw++) { - const F32 *b0 = biasArray; - const F32 *b1 = biasArray + 4; - F32 *in_pack = ((F32*)tmp) + ic*ih_pad*iw_pad; - // pack input - // NCHW => NCHWc8hw1 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F32 *in_hw = inArray_pad + c*ihiw + fh_idx*dilateH*iw_pad + dilateW*fw_idx; - F32 *in_0 = in_hw + in_h_0*iw_pad + in_w_0; - F32 *in_pack_hw1 = in_pack + fh_idx*fw*ic + fw_idx*ic + c; - *in_pack_hw1 = *in_0; - } - } - } - - // compute - for (I32 o = 0; o < I32(oc); o++) { - F32 *in_hw0 = in_pack; - const F32 *f_o0c0 = filterArray + o*8*fh*fw*ic; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const F32 *b_o0 = b0; - const F32 *b_o1 = b1; - __asm__ __volatile__( - "ldr q5, [%[b_0]]\n" - "ldr q6, [%[b_1]]\n" - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" - - "0:\n" - "ldr q0, [x0], #16\n" - "subs x2, x2, #1\n" - "ldr q4, [x0], #16\n" - "ldr s1, [x3], #4\n" - "fmla v5.4s, v0.4s, v1.s[0]\n" - "fmla v6.4s, v4.4s, v1.s[0]\n" - - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "v0", "v1", "v5", "v6", "v4", "x0", "x1", "x2", "x3" - ); - switch (activationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v5.4s, v5.4s, v1.4s\n" - "fmax v6.4s, v6.4s, v1.4s\n" - : - : - :"memory", "cc", "v1", "v5", "v6" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v1.16b, v1.16b, v1.16b\n" // zero - "fmov v30.4s, 6.0\n" // six - "fmax v5.4s, v5.4s, v1.4s\n" - "fmax v6.4s, v6.4s, v1.4s\n" - - "fmin v5.4s, v5.4s, v30.4s\n" - "fmin v6.4s, v6.4s, v30.4s\n" - : - : - :"memory", "cc", "v1", "v5", "v6", "v30" - ); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - __asm__ __volatile__( - "str q5, [%[out_0]]\n" - "str q6, [%[out_0], #16]\n" - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "v5", "v6" - ); - b0 += 8; - b1 += 8; - } - } - } - return ret; -} -#endif diff --git a/tensor_computing/src/cpu/arm/fp32/convolution_transform.cpp b/tensor_computing/src/cpu/arm/fp32/convolution_transform.cpp deleted file mode 100644 index 82cd38ec..00000000 --- a/tensor_computing/src/cpu/arm/fp32/convolution_transform.cpp +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
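// [Editorial sketch, not part of the deleted file] convolution_transform.cpp,
// deleted below, repacks NCHW filters into NHWCN8 for the GEMM paths (and
// HWNCN8 for Winograd) so that eight output channels are contiguous for the
// fmla kernels above. The NHWCN8 index mapping in plain C (illustrative names;
// fn is assumed to be a multiple of 8, as in the kernel below):
static void transform_filter_nhwcn8(const float *src, float *dst,
                                    int fn, int fc, int fh, int fw)
{
    for (int o = 0; o < fn / 8; o++)
        for (int hw = 0; hw < fh * fw; hw++)
            for (int c = 0; c < fc; c++)
                for (int o8 = 0; o8 < 8; o8++)
                    dst[(((size_t)o * fh * fw + hw) * fc + c) * 8 + o8] =
                        src[((size_t)(o * 8 + o8) * fc + c) * fh * fw + hw];
}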
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#include "cpu/arm/fp32/convolution_winograd_transform.h" - -inline EE convolution_transform_filter_kernel_fp32(TensorDesc filterDesc, const F32* filterArray, - TensorDesc *ftmDesc, F32* ftmArray, - DataFormat ftmDataFormat) -{ - if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) - CHECK_STATUS(NULL_POINTER); - DataType fdt; - DataFormat fdf; - U32 fn, fc, fh, fw; - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - if (fdf == ftmDataFormat) { - *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, fn*fc*fh*fw*bytesOf(fdt)); - return SUCCESS; - } - if (fdf != DF_NCHW) { - CHECK_STATUS(NOT_SUPPORTED); - } - EE ret = SUCCESS; - switch (ftmDataFormat) { - case DF_NHWCN8: { - /* - * NCHW => NHWCN8 - */ - U32 oc = fn / 8; - for (U32 o = 0; o < oc; o++) { - for (U32 hw = 0; hw < fh*fw; hw++) { - for (U32 c = 0; c < fc; c++) { - for (U32 o8 = 0; o8 < 8; o8++) { - ftmArray[o*fh*fw*fc*8 + hw*fc*8 + c*8 + o8] = filterArray[(o*8+o8)*fc*fh*fw + c*fh*fw + hw]; - } - } - } - } - *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); - break; - } - case DF_HWNCN8: { - for (U32 o = 0; o < fn/8; o++) { - for (U32 c = 0; c < fc; c++) { - // Each time deal with N4; 2 times we have N8 - U32 f_off_0 = (o*8)*fc*fh*fw + c*fh*fw; - U32 f_off_1 = (o*8+4)*fc*fh*fw + c*fh*fw; - - U32 ftm_off_0 = o*36*fc*8 + c*8; - U32 ftm_off_1 = o*36*fc*8 + c*8 + 4; - - F32 F[9][4]; - F32 *F_ptr[9]; - F32 *Fw[36]; - - for (U32 hw = 0; hw < 9; hw++) { - for (U32 oo = 0; oo < 4; oo++) { - F[hw][oo] = filterArray[f_off_0 + hw + oo*fc*fh*fw]; - } - F_ptr[hw] = F[hw]; - } - for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_0 + hw*fc*8; - } - trans_W_4x4_3x3(Fw, F_ptr); - for (U32 hw = 0; hw < 9; hw++) { - for (U32 oo = 0; oo < 4; oo++) { - F[hw][oo] = filterArray[f_off_1 + hw + oo*fc*fh*fw]; - } - F_ptr[hw] = F[hw]; - } - for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_1 + hw*fc*8; - } - trans_W_4x4_3x3(Fw, F_ptr); - } - } - *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, 6, 6); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE convolution_transform_filter_fp32(TensorDesc filterDesc, const F32* filter, - ConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, F32* filterTransformed) -{ - DataFormat ftmDataFormat; - switch (algorithm) { - case CONVOLUTION_ALGORITHM_GEMM: - ftmDataFormat = DF_NHWCN8; - break; - case 
CONVOLUTION_ALGORITHM_GEMM_ICNCHW: - ftmDataFormat = DF_NHWCN8; - break; - case CONVOLUTION_ALGORITHM_WINOGRAD: - ftmDataFormat = DF_HWNCN8; - break; - default: - return NOT_MATCH; - } - EE ret = convolution_transform_filter_kernel_fp32(filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat); - CHECK_STATUS(ret); - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp32/convolution_winograd_V8.cpp b/tensor_computing/src/cpu/arm/fp32/convolution_winograd_V8.cpp deleted file mode 100644 index 6782a3b2..00000000 --- a/tensor_computing/src/cpu/arm/fp32/convolution_winograd_V8.cpp +++ /dev/null @@ -1,868 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/fp32/convolution_winograd_transform.h" -#include "cpu/arm/fp32/tensor_computing_fp32.h" - -EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, - TensorDesc filterDesc, const F32* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F32* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F32* outArray, - ActivationDesc activationDesc) -{ -#ifdef __aarch64__ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - if (fdf != DF_HWNCN8) - CHECK_STATUS(NOT_MATCH); - if (!(fh == 6 && fw == 6)) - CHECK_STATUS(NOT_SUPPORTED); - - oc /= 8; - ic /= 8; - - U32 tile_h = (oh + 3) / 4; - U32 tile_w = (ow + 3) / 4; - // num of 6x6 tiles - I32 tiles = tile_h * tile_w; - U32 pad_left = paddingL; - U32 pad_right = paddingR + (tile_w*4 - ow); - U32 pad_w_mod_4 = tile_w*4 - ow; - U32 pad_top = paddingT; - U32 pad_bottom = paddingB + (tile_h*4 - oh); - U32 pad_h_mod_4 = tile_h*4 - oh; - U32 ih_pad = ih + pad_top + pad_bottom; - U32 iw_pad = iw + pad_left + pad_right; - // tmp = in_pad + itm + otm - // in_pad: ic*ih_pad*iw_pad*8 - // itm: 6*6*ic*12*8 - // otm: 6*6*12*8 - F32* inArray_pad = (F32*)tmp; - F32* itmArray = inArray_pad + ic*ih_pad*iw_pad*8; - F32* otmArray = itmArray + 6*6*ic*12*8; - - EE ret = SUCCESS; - // copy input into a 
input with padding - for (U32 n = 0; n < in; n++) { - F32 *inArray_pad_mov = inArray_pad; - F32 *inArray_mov = inArray + n*ic*ih*iw*8; - for (U32 c = 0; c < ic; c++) { - memset(inArray_pad_mov, 0, pad_top*iw_pad*8*bytesOf(idt)); - inArray_pad_mov += pad_top*iw_pad*8; - for (U32 h = pad_top; h < ih_pad - pad_bottom; h++) { - memset(inArray_pad_mov, 0, pad_left*8*bytesOf(idt)); - inArray_pad_mov += pad_left*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*bytesOf(idt)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, pad_right*8*bytesOf(idt)); - inArray_pad_mov += pad_right*8; - } - memset(inArray_pad_mov, 0, pad_bottom*iw_pad*8*bytesOf(idt)); - inArray_pad_mov += pad_bottom*iw_pad*8; - } - - // tiles / 12 - for (I32 hw = 0; hw < tiles-11; hw+=12) { - // in trans - // NCHWc8 => (6*6)*C*c8*hw12 - for (U32 c = 0; c < ic; c++) { - F32 *inArray_pad_mov = inArray_pad + c*ih_pad*iw_pad*8; - F32 *itmArray_mov = itmArray + c*12*8; - F32 *Iw_ptr0[36]; - F32 *Iw_ptr1[36]; - F32 Iw[12][36][8]; - F32 *I0[12][36]; - F32 *I1[12][36]; - U32 h[12]; - U32 w[12]; - for (U32 index = 0; index < 12; index++) { - h[index] = ((hw + index) / tile_w) * 4; - w[index] = ((hw + index) % tile_w) * 4; - } - for (U32 i = 0; i < 6; i++) { - for (U32 j = 0; j < 6; j++) { - for (U32 index = 0; index < 12; index++) { - I0[index][i*6 + j] = inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8; - I1[index][i*6 + j] = inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8 + 4; - } - } - } - for (U32 index = 0; index < 12; index++) { - for (U32 i = 0; i < 36; i++) { - Iw_ptr0[i] = Iw[index][i]; - Iw_ptr1[i] = Iw_ptr0[i] + 4; - } - trans_I_4x4_3x3(Iw_ptr0, I0[index]); - trans_I_4x4_3x3(Iw_ptr1, I1[index]); - } - for (U32 i = 0; i < 36; i++) { - F32* itm = itmArray_mov + i*ic*8*12; - - __asm__ __volatile__( - "ldp q0, q1, [%[in_0]]\n" - "ldp q2, q3, [%[in_1]]\n" - "ldp q4, q5, [%[in_2]]\n" - "ldp q6, q7, [%[in_3]]\n" - - "ldp q8, q9, [%[in_4]]\n" - "ldp q10, q11, [%[in_5]]\n" - "ldp q12, q13, [%[in_6]]\n" - "ldp q14, q15, [%[in_7]]\n" - - "ldp q16, q17, [%[in_8]]\n" - "ldp q18, q19, [%[in_9]]\n" - "ldp q20, q21, [%[in_10]]\n" - "ldp q22, q23, [%[in_11]]\n" - - "zip1 v24.4s, v0.4s, v2.4s\n" - "zip2 v25.4s, v0.4s, v2.4s\n" - "zip1 v26.4s, v4.4s, v6.4s\n" - "zip2 v27.4s, v4.4s, v6.4s\n" - - "zip1 v0.2d, v24.2d, v26.2d\n" - "zip2 v2.2d, v24.2d, v26.2d\n" - "zip1 v4.2d, v25.2d, v27.2d\n" - "zip2 v6.2d, v25.2d, v27.2d\n" - - "zip1 v24.4s, v8.4s, v10.4s\n" - "zip2 v25.4s, v8.4s, v10.4s\n" - "zip1 v26.4s, v12.4s, v14.4s\n" - "zip2 v27.4s, v12.4s, v14.4s\n" - - "zip1 v8.2d, v24.2d, v26.2d\n" - "zip2 v10.2d, v24.2d, v26.2d\n" - "zip1 v12.2d, v25.2d, v27.2d\n" - "zip2 v14.2d, v25.2d, v27.2d\n" - - "zip1 v24.4s, v16.4s, v18.4s\n" - "zip2 v25.4s, v16.4s, v18.4s\n" - "zip1 v26.4s, v20.4s, v22.4s\n" - "zip2 v27.4s, v20.4s, v22.4s\n" - - "zip1 v16.2d, v24.2d, v26.2d\n" - "zip2 v18.2d, v24.2d, v26.2d\n" - "zip1 v20.2d, v25.2d, v27.2d\n" - "zip2 v22.2d, v25.2d, v27.2d\n" - - "stp q0, q8, [%[pack]]\n" - "str q16, [%[pack], #32]\n" - "stp q2, q10, [%[pack], 48]\n" - "str q18, [%[pack], #80]\n" - "stp q4, q12, [%[pack], #96]\n" - "str q20, [%[pack], #128]\n" - "stp q6, q14, [%[pack], #144]\n" - "str q22, [%[pack], #176]\n" - - "zip1 v24.4s, v1.4s, v3.4s\n" - "zip2 v25.4s, v1.4s, v3.4s\n" - "zip1 v26.4s, v5.4s, v7.4s\n" - "zip2 v27.4s, v5.4s, v7.4s\n" - - "zip1 v1.2d, v24.2d, v26.2d\n" - "zip2 v3.2d, v24.2d, v26.2d\n" - "zip1 v5.2d, v25.2d, v27.2d\n" - "zip2 v7.2d, v25.2d, v27.2d\n" - - "zip1 
v24.4s, v9.4s, v11.4s\n" - "zip2 v25.4s, v9.4s, v11.4s\n" - "zip1 v26.4s, v13.4s, v15.4s\n" - "zip2 v27.4s, v13.4s, v15.4s\n" - - "zip1 v9.2d, v24.2d, v26.2d\n" - "zip2 v11.2d, v24.2d, v26.2d\n" - "zip1 v13.2d, v25.2d, v27.2d\n" - "zip2 v15.2d, v25.2d, v27.2d\n" - - "zip1 v24.4s, v17.4s, v19.4s\n" - "zip2 v25.4s, v17.4s, v19.4s\n" - "zip1 v26.4s, v21.4s, v23.4s\n" - "zip2 v27.4s, v21.4s, v23.4s\n" - - "zip1 v17.2d, v24.2d, v26.2d\n" - "zip2 v19.2d, v24.2d, v26.2d\n" - "zip1 v21.2d, v25.2d, v27.2d\n" - "zip2 v23.2d, v25.2d, v27.2d\n" - - "stp q1, q9, [%[pack], #192]\n" - "str q17, [%[pack], #224]\n" - "stp q3, q11, [%[pack], 240]\n" - "str q19, [%[pack], #272]\n" - "stp q5, q13, [%[pack], 288]\n" - "str q21, [%[pack], #320]\n" - "stp q7, q15, [%[pack], 336]\n" - "str q23, [%[pack], #368]\n" - : - :[pack]"r"(itm), - [in_0]"r"(Iw[0][i]), - [in_1]"r"(Iw[1][i]), - [in_2]"r"(Iw[2][i]), - [in_3]"r"(Iw[3][i]), - [in_4]"r"(Iw[4][i]), - [in_5]"r"(Iw[5][i]), - [in_6]"r"(Iw[6][i]), - [in_7]"r"(Iw[7][i]), - [in_8]"r"(Iw[8][i]), - [in_9]"r"(Iw[9][i]), - [in_10]"r"(Iw[10][i]), - [in_11]"r"(Iw[11][i]) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", - "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27" - ); - } - } - for (I32 o = 0; o < I32(oc); o++) { - const F32 *b_0 = biasArray + o*8; - const F32 *b_1 = b_0 + 4; - // dot prod - // (6*6)*C*c8*hw12 times O*(6*6)*C*c8*o8 = O*(6*6)*hw12*o8 - for (U32 idx = 0; idx < 36; idx++) { - F32 *itm_0 = itmArray + idx*12*ic*8; - const F32 *f_o0c0 = filterArray + o*8*36*ic*8 + idx*8*ic*8; - F32 *out_o0hw0 = otmArray + idx*12*8; - __asm__ __volatile__( - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" - - "eor v5.16b, v5.16b, v5.16b\n" - "ldr q1, [%[in_0]]\n" //in_hw0 - "eor v6.16b, v6.16b, v6.16b\n" - "eor v7.16b, v7.16b, v7.16b\n" - "eor v8.16b, v8.16b, v8.16b\n" - "ldr q0, [%[f_0]]\n" //f_o0c0 - "eor v9.16b, v9.16b, v9.16b\n" - "eor v10.16b, v10.16b, v10.16b\n" - "eor v11.16b, v11.16b, v11.16b\n" - "ldr q3, [%[in_0], #16]\n" - "eor v12.16b, v12.16b, v12.16b\n" - "eor v13.16b, v13.16b, v13.16b\n" - "eor v14.16b, v14.16b, v14.16b\n" - "eor v15.16b, v15.16b, v15.16b\n" - - "eor v16.16b, v16.16b, v16.16b\n" - "eor v17.16b, v17.16b, v17.16b\n" - "eor v18.16b, v18.16b, v18.16b\n" - "eor v19.16b, v19.16b, v19.16b\n" - "eor v20.16b, v20.16b, v20.16b\n" - "eor v21.16b, v21.16b, v21.16b\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - "eor v25.16b, v25.16b, v25.16b\n" - "eor v26.16b, v26.16b, v26.16b\n" - "eor v27.16b, v27.16b, v27.16b\n" - "eor v28.16b, v28.16b, v28.16b\n" - "0:\n" - "fmla v5.4s, v0.4s, v1.s[0]\n" - "fmla v7.4s, v0.4s, v1.s[1]\n" - "ldr q2, [x3, 32]\n" - "ldr q29, [x0, 16]\n" - "fmla v9.4s, v0.4s, v1.s[2]\n" - "fmla v11.4s, v0.4s, v1.s[3]\n" - - "fmla v13.4s, v0.4s, v3.s[0]\n" - "fmla v15.4s, v0.4s, v3.s[1]\n" - "fmla v17.4s, v0.4s, v3.s[2]\n" - "fmla v19.4s, v0.4s, v3.s[3]\n" - - "fmla v21.4s, v0.4s, v2.s[0]\n" - "fmla v23.4s, v0.4s, v2.s[1]\n" - "fmla v25.4s, v0.4s, v2.s[2]\n" - "fmla v27.4s, v0.4s, v2.s[3]\n" - - "fmla v6.4s, v29.4s, v1.s[0]\n" - "fmla v8.4s, v29.4s, v1.s[1]\n" - "fmla v10.4s, v29.4s, v1.s[2]\n" - "fmla v12.4s, v29.4s, v1.s[3]\n" - - "fmla v14.4s, v29.4s, v3.s[0]\n" - "fmla v16.4s, v29.4s, v3.s[1]\n" - "ldr q1, [x3, 48]!\n" - "ldr q0, [x0, 32]!\n" - "fmla v18.4s, v29.4s, v3.s[2]\n" - "fmla v20.4s, v29.4s, 
v3.s[3]\n" - - "fmla v22.4s, v29.4s, v2.s[0]\n" - "fmla v24.4s, v29.4s, v2.s[1]\n" - "ldr q3, [x3, 16]\n" - "subs x2, x2, #1\n" - "fmla v26.4s, v29.4s, v2.s[2]\n" - "fmla v28.4s, v29.4s, v2.s[3]\n" - "bne 0b\n" - "str q5, [%[out_0]]\n" - "str q6, [%[out_0], #16]\n" - "str q7, [%[out_0], #32]\n" - "str q8, [%[out_0], #48]\n" - "str q9, [%[out_0], #64]\n" - "str q10, [%[out_0], #80]\n" - "str q11, [%[out_0], #96]\n" - "str q12, [%[out_0], #112]\n" - "str q13, [%[out_0], #128]\n" - "str q14, [%[out_0], #144]\n" - "str q15, [%[out_0], #160]\n" - "str q16, [%[out_0], #176]\n" - "str q17, [%[out_0], #192]\n" - "str q18, [%[out_0], #208]\n" - "str q19, [%[out_0], #224]\n" - "str q20, [%[out_0], #240]\n" - "str q21, [%[out_0], #256]\n" - "str q22, [%[out_0], #272]\n" - "str q23, [%[out_0], #288]\n" - "str q24, [%[out_0], #304]\n" - "str q25, [%[out_0], #320]\n" - "str q26, [%[out_0], #336]\n" - "str q27, [%[out_0], #352]\n" - "str q28, [%[out_0], #368]\n" - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(itm_0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", "x1", "x2", "x3" - ); - } - // out trans - // O*(6*6)*hw12*o8 => NOHWo8 - for (U32 hw12 = 0; hw12 < 12; hw12++) { - U32 h = (hw+hw12) / tile_w; - U32 w = (hw+hw12) % tile_w; - F32 *out_0 = outArray + n*oc*oh*ow*8 + o*oh*ow*8 + h*4*ow*8 + w*4*8; - - F32 *Ow_0[36]; - F32 *Ow_1[36]; - F32 *O_0[16]; - F32 *O_1[16]; - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + idx*12*8 + hw12*8; - Ow_1[idx] = Ow_0[idx] + 4; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - O_1[i*4 + j] = O_0[i*4 + j] + 4; - } - } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - } - } - } - - // tiles_reminder % 12 / 8 - I32 tiles_s = (tiles / 12) * 12; - I32 tiles_tail = tiles - tiles_s; - - if (tiles_tail >= 8) { - I32 hw = tiles_s; - // in trans - // NCHWc8 => (6*6)*C*c8*hw8 - for (U32 c = 0; c < ic; c++) { - F32 *inArray_pad_mov = inArray_pad + c*ih_pad*iw_pad*8; - F32 *itmArray_mov = itmArray + c*8*8; - F32 *Iw_ptr0[36]; - F32 *Iw_ptr1[36]; - F32 Iw[8][36][8]; - F32 *I0[8][36]; - F32 *I1[8][36]; - U32 h[8]; - U32 w[8]; - for (U32 index = 0; index < 8; index++) { - h[index] = ((hw + index) / tile_w) * 4; - w[index] = ((hw + index) % tile_w) * 4; - } - for (U32 i = 0; i < 6; i++) { - for (U32 j = 0; j < 6; j++) { - for (U32 index = 0; index < 8; index++) { - I0[index][i*6 + j] = inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8; - I1[index][i*6 + j] = inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8 + 4; - } - } - } - for (U32 index = 0; index < 8; index++) { - for (U32 i = 0; i < 36; i++) { - Iw_ptr0[i] = Iw[index][i]; - Iw_ptr1[i] = Iw_ptr0[i] + 4; - } - trans_I_4x4_3x3(Iw_ptr0, I0[index]); - trans_I_4x4_3x3(Iw_ptr1, I1[index]); - } - for (U32 i = 0; i < 36; i++) { - F32* itm = itmArray_mov + i*ic*8*8; - - __asm__ __volatile__( - "ldp q0, q1, [%[in_0]]\n" - "ldp q2, q3, [%[in_1]]\n" - "ldp q4, q5, [%[in_2]]\n" - "ldp q6, q7, [%[in_3]]\n" - - "ldp q8, q9, [%[in_4]]\n" - "ldp q10, q11, [%[in_5]]\n" - "ldp q12, q13, [%[in_6]]\n" - "ldp q14, q15, [%[in_7]]\n" - - "zip1 v24.4s, 
v0.4s, v2.4s\n" - "zip2 v25.4s, v0.4s, v2.4s\n" - "zip1 v26.4s, v4.4s, v6.4s\n" - "zip2 v27.4s, v4.4s, v6.4s\n" - - "zip1 v0.2d, v24.2d, v26.2d\n" - "zip2 v2.2d, v24.2d, v26.2d\n" - "zip1 v4.2d, v25.2d, v27.2d\n" - "zip2 v6.2d, v25.2d, v27.2d\n" - - "zip1 v24.4s, v8.4s, v10.4s\n" - "zip2 v25.4s, v8.4s, v10.4s\n" - "zip1 v26.4s, v12.4s, v14.4s\n" - "zip2 v27.4s, v12.4s, v14.4s\n" - - "zip1 v8.2d, v24.2d, v26.2d\n" - "zip2 v10.2d, v24.2d, v26.2d\n" - "zip1 v12.2d, v25.2d, v27.2d\n" - "zip2 v14.2d, v25.2d, v27.2d\n" - - "stp q0, q8, [%[pack]]\n" - "stp q2, q10, [%[pack], #32]\n" - "stp q4, q12, [%[pack], #64]\n" - "stp q6, q14, [%[pack], #96]\n" - - "zip1 v24.4s, v1.4s, v3.4s\n" - "zip2 v25.4s, v1.4s, v3.4s\n" - "zip1 v26.4s, v5.4s, v7.4s\n" - "zip2 v27.4s, v5.4s, v7.4s\n" - - "zip1 v1.2d, v24.2d, v26.2d\n" - "zip2 v3.2d, v24.2d, v26.2d\n" - "zip1 v5.2d, v25.2d, v27.2d\n" - "zip2 v7.2d, v25.2d, v27.2d\n" - - "zip1 v24.4s, v9.4s, v11.4s\n" - "zip2 v25.4s, v9.4s, v11.4s\n" - "zip1 v26.4s, v13.4s, v15.4s\n" - "zip2 v27.4s, v13.4s, v15.4s\n" - - "zip1 v9.2d, v24.2d, v26.2d\n" - "zip2 v11.2d, v24.2d, v26.2d\n" - "zip1 v13.2d, v25.2d, v27.2d\n" - "zip2 v15.2d, v25.2d, v27.2d\n" - - "stp q1, q9, [%[pack], #128]\n" - "stp q3, q11, [%[pack], #160]\n" - "stp q5, q13, [%[pack], #192]\n" - "stp q7, q15, [%[pack], #224]\n" - : - :[pack]"r"(itm), - [in_0]"r"(Iw[0][i]), - [in_1]"r"(Iw[1][i]), - [in_2]"r"(Iw[2][i]), - [in_3]"r"(Iw[3][i]), - [in_4]"r"(Iw[4][i]), - [in_5]"r"(Iw[5][i]), - [in_6]"r"(Iw[6][i]), - [in_7]"r"(Iw[7][i]) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", - "v12", "v13", "v14", "v15", "v24", "v25", "v26", "v27" - ); - } - } - for (I32 o = 0; o < I32(oc); o++) { - const F32 *b_0 = biasArray + o*8; - const F32 *b_1 = b_0 + 4; - // dot prod - // (6*6)*C*c8*hw8 times O*(6*6)*C*c8*o8 = O*(6*6)*hw8*o8 - for (U32 idx = 0; idx < 36; idx++) { - F32 *itm_0 = itmArray + idx*8*ic*8; - const F32 *f_o0c0 = filterArray + o*8*36*ic*8 + idx*8*ic*8; - F32 *out_o0hw0 = otmArray + idx*8*8; - __asm__ __volatile__( - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" - - "eor v5.16b, v5.16b, v5.16b\n" - "ldr q1, [%[in_0]]\n" //in_hw0 - "eor v6.16b, v6.16b, v6.16b\n" - "eor v7.16b, v7.16b, v7.16b\n" - "eor v8.16b, v8.16b, v8.16b\n" - "ldr q0, [%[f_0]]\n" //f_o0c0 - "eor v9.16b, v9.16b, v9.16b\n" - "eor v10.16b, v10.16b, v10.16b\n" - "eor v11.16b, v11.16b, v11.16b\n" - "eor v12.16b, v12.16b, v12.16b\n" - "eor v13.16b, v13.16b, v13.16b\n" - "eor v14.16b, v14.16b, v14.16b\n" - "eor v15.16b, v15.16b, v15.16b\n" - - "eor v16.16b, v16.16b, v16.16b\n" - "eor v17.16b, v17.16b, v17.16b\n" - "eor v18.16b, v18.16b, v18.16b\n" - "eor v19.16b, v19.16b, v19.16b\n" - "eor v20.16b, v20.16b, v20.16b\n" - "0:\n" - "ldr q3, [x3, 16]!\n" - "ldr q29, [x0, 16]\n" - "fmla v5.4s, v0.4s, v1.s[0]\n" - "fmla v7.4s, v0.4s, v1.s[1]\n" - "fmla v9.4s, v0.4s, v1.s[2]\n" - "fmla v11.4s, v0.4s, v1.s[3]\n" - - "fmla v13.4s, v0.4s, v3.s[0]\n" - "fmla v15.4s, v0.4s, v3.s[1]\n" - "fmla v17.4s, v0.4s, v3.s[2]\n" - "fmla v19.4s, v0.4s, v3.s[3]\n" - - "fmla v6.4s, v29.4s, v1.s[0]\n" - "fmla v8.4s, v29.4s, v1.s[1]\n" - "fmla v10.4s, v29.4s, v1.s[2]\n" - "fmla v12.4s, v29.4s, v1.s[3]\n" - - "fmla v14.4s, v29.4s, v3.s[0]\n" - "fmla v16.4s, v29.4s, v3.s[1]\n" - "ldr q1, [x3, 16]!\n" - "ldr q0, [x0, 32]!\n" - "subs x2, x2, #1\n" - "fmla v18.4s, v29.4s, v3.s[2]\n" - "fmla v20.4s, v29.4s, v3.s[3]\n" - "bne 0b\n" - "str q5, [%[out_0]]\n" - "str q6, 
[%[out_0], #16]\n" - "str q7, [%[out_0], #32]\n" - "str q8, [%[out_0], #48]\n" - "str q9, [%[out_0], #64]\n" - "str q10, [%[out_0], #80]\n" - "str q11, [%[out_0], #96]\n" - "str q12, [%[out_0], #112]\n" - "str q13, [%[out_0], #128]\n" - "str q14, [%[out_0], #144]\n" - "str q15, [%[out_0], #160]\n" - "str q16, [%[out_0], #176]\n" - "str q17, [%[out_0], #192]\n" - "str q18, [%[out_0], #208]\n" - "str q19, [%[out_0], #224]\n" - "str q20, [%[out_0], #240]\n" - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(itm_0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", - "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v27", "v28", "v29", "x0", "x1", "x2", "x3" - ); - } - // out trans - // O*(6*6)*hw8*o8 => NOHWo8 - for (U32 hw8 = 0; hw8 < 8; hw8++) { - U32 h = (hw+hw8) / tile_w; - U32 w = (hw+hw8) % tile_w; - F32 *out_0 = outArray + n*oc*oh*ow*8 + o*oh*ow*8 + h*4*ow*8 + w*4*8; - - F32 *Ow_0[36]; - F32 *Ow_1[36]; - F32 *O_0[16]; - F32 *O_1[16]; - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + idx*8*8 + hw8*8; - Ow_1[idx] = Ow_0[idx] + 4; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - O_1[i*4 + j] = O_0[i*4 + j] + 4; - } - } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - } - } - tiles_s += 8; - tiles_tail -= 8; - } - - if (tiles_tail >= 4) { - I32 hw = tiles_s; - // in trans - // NCHWc8 => (6*6)*C*c8*hw4 - for (U32 c = 0; c < ic; c++) { - F32 *inArray_pad_mov = inArray_pad + c*ih_pad*iw_pad*8; - F32 *itmArray_mov = itmArray + c*8*4; - F32 *Iw_ptr0[36]; - F32 *Iw_ptr1[36]; - F32 Iw[4][36][8]; - F32 *I0[4][36]; - F32 *I1[4][36]; - U32 h[4]; - U32 w[4]; - for (U32 index = 0; index < 4; index++) { - h[index] = ((hw + index) / tile_w) * 4; - w[index] = ((hw + index) % tile_w) * 4; - } - for (U32 i = 0; i < 6; i++) { - for (U32 j = 0; j < 6; j++) { - for (U32 index = 0; index < 4; index++) { - I0[index][i*6 + j] = inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8; - I1[index][i*6 + j] = inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8 + 4; - } - } - } - for (U32 index = 0; index < 4; index++) { - for (U32 i = 0; i < 36; i++) { - Iw_ptr0[i] = Iw[index][i]; - Iw_ptr1[i] = Iw_ptr0[i] + 4; - } - trans_I_4x4_3x3(Iw_ptr0, I0[index]); - trans_I_4x4_3x3(Iw_ptr1, I1[index]); - } - for (U32 i = 0; i < 36; i++) { - F32* itm = itmArray_mov + i*ic*8*4; - - __asm__ __volatile__( - "ldp q0, q4, [%[in_0]]\n" - "ldp q1, q5, [%[in_1]]\n" - "ldp q2, q6, [%[in_2]]\n" - "ldp q3, q7, [%[in_3]]\n" - - "st4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[pack]], #64\n" - "st4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[pack]]\n" - : - :[pack]"r"(itm), - [in_0]"r"(Iw[0][i]), - [in_1]"r"(Iw[1][i]), - [in_2]"r"(Iw[2][i]), - [in_3]"r"(Iw[3][i]) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); - } - } - for (I32 o = 0; o < I32(oc); o++) { - const F32 *b_0 = biasArray + o*8; - const F32 *b_1 = b_0 + 4; - // dot prod - // (6*6)*C*c8*hw4 times O*(6*6)*C*c8*o8 = O*(6*6)*hw4*o8 - for (U32 idx = 0; idx < 36; idx++) { - F32 *itm_0 = itmArray + idx*4*ic*8; - const F32 *f_o0c0 = filterArray + o*8*36*ic*8 + idx*8*ic*8; - F32 *out_o0hw0 = otmArray + idx*4*8; - __asm__ __volatile__( - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, 
%[f_0]\n" - - "mov x2, %[ic]\n" - - "eor v5.16b, v5.16b, v5.16b\n" - "ldr q1, [%[in_0]]\n" //in_hw0 - "eor v6.16b, v6.16b, v6.16b\n" - "eor v7.16b, v7.16b, v7.16b\n" - "eor v8.16b, v8.16b, v8.16b\n" - "ldr q0, [%[f_0]]\n" //f_o0c0 - "eor v9.16b, v9.16b, v9.16b\n" - "eor v10.16b, v10.16b, v10.16b\n" - "eor v11.16b, v11.16b, v11.16b\n" - "eor v12.16b, v12.16b, v12.16b\n" - "0:\n" - "ldr q3, [x3, 16]!\n" - "ldr q29, [x0, 16]\n" - "fmla v5.4s, v0.4s, v1.s[0]\n" - "fmla v7.4s, v0.4s, v1.s[1]\n" - "fmla v9.4s, v0.4s, v1.s[2]\n" - "fmla v11.4s, v0.4s, v1.s[3]\n" - - "fmla v6.4s, v29.4s, v1.s[0]\n" - "fmla v8.4s, v29.4s, v1.s[1]\n" - "ldr q0, [x0, 32]!\n" - "subs x2, x2, #1\n" - "fmla v10.4s, v29.4s, v1.s[2]\n" - "fmla v12.4s, v29.4s, v1.s[3]\n" - - "mov v1.16b, v3.16b\n" - "bne 0b\n" - "str q5, [%[out_0]]\n" - "str q6, [%[out_0], #16]\n" - "str q7, [%[out_0], #32]\n" - "str q8, [%[out_0], #48]\n" - "str q9, [%[out_0], #64]\n" - "str q10, [%[out_0], #80]\n" - "str q11, [%[out_0], #96]\n" - "str q12, [%[out_0], #112]\n" - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(itm_0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", - "v12", "v27", "v28", "v29", "x0", "x1", "x2", "x3" - ); - } - // out trans - // O*(6*6)*hw4*o8 => NOHWo8 - for (U32 hw4 = 0; hw4 < 4; hw4++) { - U32 h = (hw+hw4) / tile_w; - U32 w = (hw+hw4) % tile_w; - F32 *out_0 = outArray + n*oc*oh*ow*8 + o*oh*ow*8 + h*4*ow*8 + w*4*8; - - F32 *Ow_0[36]; - F32 *Ow_1[36]; - F32 *O_0[16]; - F32 *O_1[16]; - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + idx*4*8 + hw4*8; - Ow_1[idx] = Ow_0[idx] + 4; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - O_1[i*4 + j] = O_0[i*4 + j] + 4; - } - } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - } - } - tiles_s += 4; - tiles_tail -= 4; - } - - for (I32 hw = tiles_s; hw < tiles; hw++) { - // in trans - // NCHWc8 => (6*6)*C*c8*hw1 - for (U32 c = 0; c < ic; c++) { - F32 *inArray_pad_mov = inArray_pad + c*ih_pad*iw_pad*8; - F32 *itmArray_mov = itmArray + c*8; - F32 *Iw_ptr0[36]; - F32 *Iw_ptr1[36]; - F32 Iw[36][8]; - F32 *I0[36]; - F32 *I1[36]; - U32 h = (hw / tile_w) * 4; - U32 w = (hw % tile_w) * 4; - for (U32 i = 0; i < 6; i++) { - for (U32 j = 0; j < 6; j++) { - I0[i*6 + j] = inArray_pad_mov + (h + i) * iw_pad * 8 + (w + j) * 8; - I1[i*6 + j] = inArray_pad_mov + (h + i) * iw_pad * 8 + (w + j) * 8 + 4; - } - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr0[i] = Iw[i]; - Iw_ptr1[i] = Iw_ptr0[i] + 4; - } - trans_I_4x4_3x3(Iw_ptr0, I0); - trans_I_4x4_3x3(Iw_ptr1, I1); - for (U32 i = 0; i < 36; i++) { - F32* itm = itmArray_mov + i*ic*8; - memcpy(itm, Iw[i], 8*bytesOf(idt)); - } - } - for (I32 o = 0; o < I32(oc); o++) { - const F32 *b_0 = biasArray + o*8; - const F32 *b_1 = b_0 + 4; - // dot prod - // (6*6)*C*c8*hw1 times O*(6*6)*C*c8*o8 = O*(6*6)*hw1*o8 - for (U32 idx = 0; idx < 36; idx++) { - F32 *itm_0 = itmArray + idx*ic*8; - const F32 *f_o0c0 = filterArray + o*8*36*ic*8 + idx*8*ic*8; - F32 *out_o0hw0 = otmArray + idx*8; - __asm__ __volatile__( - "ldr s1, [%[in_0]]\n" //in_hw0 - "ldp q0, q29, [%[f_0]]\n" //f_o0c0 - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" - - "eor v5.16b, v5.16b, v5.16b\n" - "eor v6.16b, v6.16b, 
v6.16b\n" - "0:\n" - "ldp q30, q28, [x0, #32]\n" - "ldr s3, [x3, #4]\n" - "fmla v5.4s, v0.4s, v1.s[0]\n" - "fmla v6.4s, v29.4s, v1.s[0]\n" - - - "ldr q0, [x0, #64]!\n" - "subs x2, x2, #2\n" - "ldr q29, [x0, #16]\n" - "ldr s1, [x3, #8]!\n" - "fmla v5.4s, v30.4s, v3.s[0]\n" - "fmla v6.4s, v28.4s, v3.s[0]\n" - "bne 0b\n" - "str q5, [%[out_0]]\n" - "str q6, [%[out_0], #16]\n" - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(itm_0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v3", "v5", "v6", "v28", "v29", "v30", "x0", "x1", "x2", "x3" - ); - } - // out trans - // O*(6*6)*hw1*o8 => NOHWo8 - U32 h = hw / tile_w; - U32 w = hw % tile_w; - F32 *out_0 = outArray + n*oc*oh*ow*8 + o*oh*ow*8 + h*4*ow*8 + w*4*8; - - F32 *Ow_0[36]; - F32 *Ow_1[36]; - F32 *O_0[16]; - F32 *O_1[16]; - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + idx*8; - Ow_1[idx] = Ow_0[idx] + 4; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - O_1[i*4 + j] = O_0[i*4 + j] + 4; - } - } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - } - } - } - return ret; -#else - // TODO - std::cerr << "[ERROR] currently does not support ARMv7 convolution winograd" << std::endl; -#include <arm_neon.h> -#include "cpu/arm/fp32/arm_functions_fp32.h" - -inline void trans_W_4x4_3x3(float* WTM[36], float* W[9]) -{ - float T[6][3][4]; - - float32x4_t v_01666 = vmovq_n_f32(0.1666666666666667f); - float32x4_t v_minus_01666 = vmovq_n_f32(-0.1666666666666667f); - float32x4_t v_00833 = vmovq_n_f32(0.0833333333333333f); - float32x4_t v_minus_00833 = vmovq_n_f32(-0.0833333333333333f); - float32x4_t v_004166 = vmovq_n_f32(0.0416666666666667f); - float32x4_t v_025 = vmovq_n_f32(0.25f); - - for (int i = 0; i < 3; i++) { - float32x4_t v_W0 = vld1q_f32(W[0*3+i]); - float32x4_t v_W1 = vld1q_f32(W[1*3+i]); - float32x4_t v_W2 = vld1q_f32(W[2*3+i]); - - float32x4_t v_t0 = vmulq_f32(v_01666, v_W2); - float32x4_t v_t1 = vsubq_f32(vmulq_f32(v_minus_01666, v_W0), v_t0); - float32x4_t v_t2 = vfmaq_f32(v_t0, v_004166, v_W0); - - float32x4_t v_T0 = vmulq_f32(v_025, v_W0); - float32x4_t v_T1 = vfmaq_f32(v_t1, v_minus_01666, v_W1); - float32x4_t v_T2 = vfmaq_f32(v_t1, v_01666, v_W1); - float32x4_t v_T3 = vfmaq_f32(v_t2, v_00833, v_W1); - float32x4_t v_T4 = vfmaq_f32(v_t2, v_minus_00833, v_W1); - - vst1q_f32(T[0][i], v_T0); - vst1q_f32(T[1][i], v_T1); - vst1q_f32(T[2][i], v_T2); - vst1q_f32(T[3][i], v_T3); - vst1q_f32(T[4][i], v_T4); - vst1q_f32(T[5][i], v_W2); - } - for (int i = 0; i < 6; i++) { - float32x4_t v_T0 = vld1q_f32(T[i][0]); - float32x4_t v_T1 = vld1q_f32(T[i][1]); - float32x4_t v_T2 = vld1q_f32(T[i][2]); - - float32x4_t v_t0 = vmulq_f32(v_01666, v_T2); - float32x4_t v_t1 = vsubq_f32(vmulq_f32(v_minus_01666, v_T0), v_t0); - float32x4_t v_t2 = vfmaq_f32(v_t0, v_004166, v_T0); - - float32x4_t v_WTM0 = vmulq_f32(v_025, v_T0); - float32x4_t v_WTM1 = vfmaq_f32(v_t1, v_minus_01666, v_T1); - float32x4_t v_WTM2 = vfmaq_f32(v_t1, v_01666, v_T1); - float32x4_t v_WTM3 = vfmaq_f32(v_t2, v_00833, v_T1); - float32x4_t v_WTM4 = vfmaq_f32(v_t2, v_minus_00833, v_T1); - - vst1q_f32(WTM[i*6+0], v_WTM0); - vst1q_f32(WTM[i*6+1], v_WTM1); - vst1q_f32(WTM[i*6+2], v_WTM2); - vst1q_f32(WTM[i*6+3], v_WTM3); - vst1q_f32(WTM[i*6+4], v_WTM4); - vst1q_f32(WTM[i*6+5], v_T2); - } -} - -inline EE trans_O_4x4_3x3(float* OTM[36], float* O[16], const float*
bias, - U32 h, U32 w, U32 _pad_h_mod_4, U32 _pad_w_mod_4, U32 oh, U32 ow, ActivationDesc activationDesc) -{ - float T[4][6][4]; - // bias - float32x4_t v_b = vld1q_f32(bias); - - float32x4_t v_0 = vmovq_n_f32(0); - float32x4_t v_2 = vmovq_n_f32(2); - float32x4_t v_4 = vmovq_n_f32(4); - float32x4_t v_8 = vmovq_n_f32(8); - - for (int i = 0; i < 6; i++) { - float32x4_t v_OTM0 = vld1q_f32(OTM[i]); - float32x4_t v_OTM1 = vld1q_f32(OTM[1*6+i]); - float32x4_t v_OTM2 = vld1q_f32(OTM[2*6+i]); - float32x4_t v_OTM3 = vld1q_f32(OTM[3*6+i]); - float32x4_t v_OTM4 = vld1q_f32(OTM[4*6+i]); - float32x4_t v_OTM5 = vld1q_f32(OTM[5*6+i]); - - float32x4_t v_t0 = vaddq_f32(v_OTM1, v_OTM2); - float32x4_t v_t1 = vaddq_f32(v_OTM3, v_OTM4); - float32x4_t v_t2 = vsubq_f32(v_OTM1, v_OTM2); - float32x4_t v_t3 = vsubq_f32(v_OTM3, v_OTM4); - - float32x4_t v_T0 = vaddq_f32(vaddq_f32(v_t0, v_t1), v_OTM0); - float32x4_t v_T1 = vfmaq_f32(v_t2, v_t3, v_2); - float32x4_t v_T2 = vfmaq_f32(v_t0, v_t1, v_4); - float32x4_t v_T3 = vaddq_f32(vfmaq_f32(v_t2, v_t3, v_8), v_OTM5); - - vst1q_f32(T[0][i], v_T0); - vst1q_f32(T[1][i], v_T1); - vst1q_f32(T[2][i], v_T2); - vst1q_f32(T[3][i], v_T3); - } - - U32 pad_h_mod_4 = 0, pad_w_mod_4 = 0; - if (h == oh && w == ow) { - pad_h_mod_4 = _pad_h_mod_4; - pad_w_mod_4 = _pad_w_mod_4; - } else if (h == oh) { - pad_h_mod_4 = _pad_h_mod_4; - } else if (w == ow) { - pad_w_mod_4 = _pad_w_mod_4; - } - - for (U32 i = 0; i < 4 - pad_h_mod_4; i++) { - float32x4_t v_T0 = vld1q_f32(T[i][0]); - float32x4_t v_T1 = vld1q_f32(T[i][1]); - float32x4_t v_T2 = vld1q_f32(T[i][2]); - float32x4_t v_T3 = vld1q_f32(T[i][3]); - float32x4_t v_T4 = vld1q_f32(T[i][4]); - float32x4_t v_T5 = vld1q_f32(T[i][5]); - - float32x4_t v_t0 = vaddq_f32(v_T1, v_T2); - float32x4_t v_t1 = vaddq_f32(v_T3, v_T4); - float32x4_t v_t2 = vsubq_f32(v_T1, v_T2); - float32x4_t v_t3 = vsubq_f32(v_T3, v_T4); - - float32x4_t v_O0 = vaddq_f32(vaddq_f32(v_t0, v_t1), v_T0); - float32x4_t v_O1 = vfmaq_f32(v_t2, v_t3, v_2); - float32x4_t v_O2 = vfmaq_f32(v_t0, v_t1, v_4); - float32x4_t v_O3 = vaddq_f32(vfmaq_f32(v_t2, v_t3, v_8), v_T5); - - switch (activationDesc.mode) { - case ACTIVATION_NULL: { - if (pad_w_mod_4 == 0) { - vst1q_f32(O[i*4+0], vaddq_f32(v_O0, v_b)); - vst1q_f32(O[i*4+1], vaddq_f32(v_O1, v_b)); - vst1q_f32(O[i*4+2], vaddq_f32(v_O2, v_b)); - vst1q_f32(O[i*4+3], vaddq_f32(v_O3, v_b)); - } else if (pad_w_mod_4 == 1) { - vst1q_f32(O[i*4+0], vaddq_f32(v_O0, v_b)); - vst1q_f32(O[i*4+1], vaddq_f32(v_O1, v_b)); - vst1q_f32(O[i*4+2], vaddq_f32(v_O2, v_b)); - } else if (pad_w_mod_4 == 2) { - vst1q_f32(O[i*4+0], vaddq_f32(v_O0, v_b)); - vst1q_f32(O[i*4+1], vaddq_f32(v_O1, v_b)); - } else if (pad_w_mod_4 == 3) { - vst1q_f32(O[i*4+0], vaddq_f32(v_O0, v_b)); - } - break; - } - case ACTIVATION_RELU: { - if (pad_w_mod_4 == 0) { - vst1q_f32(O[i*4+0], vmaxq_f32(vaddq_f32(v_O0, v_b), v_0)); - vst1q_f32(O[i*4+1], vmaxq_f32(vaddq_f32(v_O1, v_b), v_0)); - vst1q_f32(O[i*4+2], vmaxq_f32(vaddq_f32(v_O2, v_b), v_0)); - vst1q_f32(O[i*4+3], vmaxq_f32(vaddq_f32(v_O3, v_b), v_0)); - } else if (pad_w_mod_4 == 1) { - vst1q_f32(O[i*4+0], vmaxq_f32(vaddq_f32(v_O0, v_b), v_0)); - vst1q_f32(O[i*4+1], vmaxq_f32(vaddq_f32(v_O1, v_b), v_0)); - vst1q_f32(O[i*4+2], vmaxq_f32(vaddq_f32(v_O2, v_b), v_0)); - } else if (pad_w_mod_4 == 2) { - vst1q_f32(O[i*4+0], vmaxq_f32(vaddq_f32(v_O0, v_b), v_0)); - vst1q_f32(O[i*4+1], vmaxq_f32(vaddq_f32(v_O1, v_b), v_0)); - } else if (pad_w_mod_4 == 3) { - vst1q_f32(O[i*4+0], vmaxq_f32(vaddq_f32(v_O0, v_b), v_0)); - } - break; - } - default: - 
return NOT_SUPPORTED; - } - } - return SUCCESS; -} - -inline void trans_I_4x4_3x3(float* ITM[36], float* I[36]) -{ - float T[6][6][4]; - - float32x4_t v_4 = vmovq_n_f32(4); - float32x4_t v_minus_4 = vmovq_n_f32(-4); - float32x4_t v_2 = vmovq_n_f32(2); - float32x4_t v_minus_5 = vmovq_n_f32(-5); - - for (int i = 0; i < 6; i++) { - float32x4_t v_I0 = vld1q_f32(I[0*6+i]); - float32x4_t v_I1 = vld1q_f32(I[1*6+i]); - float32x4_t v_I2 = vld1q_f32(I[2*6+i]); - float32x4_t v_I3 = vld1q_f32(I[3*6+i]); - float32x4_t v_I4 = vld1q_f32(I[4*6+i]); - float32x4_t v_I5 = vld1q_f32(I[5*6+i]); - - float32x4_t v_t0 = vfmaq_f32(v_I4, v_I2, v_minus_4); - float32x4_t v_t1 = vfmaq_f32(v_I3, v_I1, v_minus_4); - float32x4_t v_t2 = vsubq_f32(v_I4, v_I2); - float32x4_t v_t3 = vmulq_f32(vsubq_f32(v_I3, v_I1), v_2); - float32x4_t v_t4 = vfmaq_f32(v_I4, v_I0, v_4); - float32x4_t v_t5 = vfmaq_f32(v_I5, v_I1, v_4); - - float32x4_t v_T0 = vfmaq_f32(v_t4, v_I2, v_minus_5); - float32x4_t v_T1 = vaddq_f32(v_t1, v_t0); - float32x4_t v_T2 = vsubq_f32(v_t0, v_t1); - float32x4_t v_T3 = vaddq_f32(v_t3, v_t2); - float32x4_t v_T4 = vsubq_f32(v_t2, v_t3); - float32x4_t v_T5 = vfmaq_f32(v_t5, v_I3, v_minus_5); - - vst1q_f32(T[0][i], v_T0); - vst1q_f32(T[1][i], v_T1); - vst1q_f32(T[2][i], v_T2); - vst1q_f32(T[3][i], v_T3); - vst1q_f32(T[4][i], v_T4); - vst1q_f32(T[5][i], v_T5); - } - - for (int i = 0; i < 6; i++) { - float32x4_t v_T0 = vld1q_f32(T[i][0]); - float32x4_t v_T1 = vld1q_f32(T[i][1]); - float32x4_t v_T2 = vld1q_f32(T[i][2]); - float32x4_t v_T3 = vld1q_f32(T[i][3]); - float32x4_t v_T4 = vld1q_f32(T[i][4]); - float32x4_t v_T5 = vld1q_f32(T[i][5]); - - float32x4_t v_t0 = vfmaq_f32(v_T4, v_T2, v_minus_4); - float32x4_t v_t1 = vfmaq_f32(v_T3, v_T1, v_minus_4); - float32x4_t v_t2 = vsubq_f32(v_T4, v_T2); - float32x4_t v_t3 = vmulq_f32(vsubq_f32(v_T3, v_T1), v_2); - float32x4_t v_t4 = vfmaq_f32(v_T4, v_T0, v_4); - float32x4_t v_t5 = vfmaq_f32(v_T5, v_T1, v_4); - - float32x4_t v_ITM0 = vfmaq_f32(v_t4, v_T2, v_minus_5); - float32x4_t v_ITM1 = vaddq_f32(v_t1, v_t0); - float32x4_t v_ITM2 = vsubq_f32(v_t0, v_t1); - float32x4_t v_ITM3 = vaddq_f32(v_t3, v_t2); - float32x4_t v_ITM4 = vsubq_f32(v_t2, v_t3); - float32x4_t v_ITM5 = vfmaq_f32(v_t5, v_T3, v_minus_5); - - vst1q_f32(ITM[i*6+0], v_ITM0); - vst1q_f32(ITM[i*6+1], v_ITM1); - vst1q_f32(ITM[i*6+2], v_ITM2); - vst1q_f32(ITM[i*6+3], v_ITM3); - vst1q_f32(ITM[i*6+4], v_ITM4); - vst1q_f32(ITM[i*6+5], v_ITM5); - } -} -#endif -#endif diff --git a/tensor_computing/src/cpu/arm/fp32/deconvolution.cpp b/tensor_computing/src/cpu/arm/fp32/deconvolution.cpp deleted file mode 100644 index 833e8a77..00000000 --- a/tensor_computing/src/cpu/arm/fp32/deconvolution.cpp +++ /dev/null @@ -1,223 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
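For reference, the three routines in the transform header deleted above are the standard Winograd F(4x4, 3x3) matrices G (filter), B^T (input) and A^T (output), applied once along rows into the T[][][] staging array and once along columns; the constants 0.25f, 0.1666..f, 0.0833..f and 0.0416..f are 1/4, 1/6, 1/12 and 1/24 from G. A scalar 1-D sketch of the same matrices follows, intended only as a cross-check against the NEON code; the helper names are hypothetical, not library API.

// Scalar reference for the 1-D Winograd F(4,3) transforms; the 2-D transform
// is this applied along rows, then along columns, as the deleted header does.
static void winograd_G_row(const float g[3], float Gg[6]) {
    // Matches trans_W_4x4_3x3 (constants 1/4, 1/6, 1/12, 1/24)
    Gg[0] = 0.25f * g[0];
    Gg[1] = -(g[0] + g[1] + g[2]) / 6.0f;
    Gg[2] = -(g[0] - g[1] + g[2]) / 6.0f;
    Gg[3] = g[0] / 24.0f + g[1] / 12.0f + g[2] / 6.0f;
    Gg[4] = g[0] / 24.0f - g[1] / 12.0f + g[2] / 6.0f;
    Gg[5] = g[2];
}
static void winograd_BT_row(const float d[6], float BTd[6]) {
    // Matches the add/sub chains in trans_I_4x4_3x3
    BTd[0] = 4 * d[0] - 5 * d[2] + d[4];
    BTd[1] = -4 * d[1] - 4 * d[2] + d[3] + d[4];
    BTd[2] = 4 * d[1] - 4 * d[2] - d[3] + d[4];
    BTd[3] = -2 * d[1] - d[2] + 2 * d[3] + d[4];
    BTd[4] = 2 * d[1] - d[2] - 2 * d[3] + d[4];
    BTd[5] = 4 * d[1] - 5 * d[3] + d[5];
}
static void winograd_AT_row(const float m[6], float ATm[4]) {
    // Matches trans_O_4x4_3x3, before bias and activation are applied
    ATm[0] = m[0] + m[1] + m[2] + m[3] + m[4];
    ATm[1] = m[1] - m[2] + 2 * m[3] - 2 * m[4];
    ATm[2] = m[1] + m[2] + 4 * m[3] + 4 * m[4];
    ATm[3] = m[1] - m[2] + 8 * m[3] - 8 * m[4] + m[5];
}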
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#include - -EE deconvolution_infer_forward_tmp_bytes_fp32(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes) -{ - if (nullptr == bytes) { - CHECK_STATUS(NULL_POINTER); - } - - DataType idt, fdt; - DataFormat idf, fdf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - U32 tPadding = fh - 1 - paddingT; - U32 bPadding = fh - 1 - paddingB; - U32 lPadding = fw - 1 - paddingL; - U32 rPadding = fw - 1 - paddingR; - - ConvolutionDesc transposedCD; - transposedCD.stride_h = 1; - transposedCD.stride_w = 1; - transposedCD.padding_top = 0; - transposedCD.padding_bottom = 0; - transposedCD.padding_left = 0; - transposedCD.padding_right = 0; - transposedCD.dilatedRate_h = 1; - transposedCD.dilatedRate_w = 1; - - if (CONVOLUTION_ALGORITHM_WINOGRAD == algorithm) { - // For Winograd, move one unit of border padding from the pre-padded input into the convolution descriptor - tPadding--; - bPadding--; - lPadding--; - rPadding--; - transposedCD.padding_top += 1; - transposedCD.padding_bottom += 1; - transposedCD.padding_left += 1; - transposedCD.padding_right += 1; - } - - ih = ih + (ih - 1) * (strideH - 1) + tPadding + bPadding; - iw = iw + (iw - 1) * (strideW - 1) + lPadding + rPadding; - TensorDesc inPaddedDesc = tensor4df(idt, idf, in, ic, ih, iw); - if (DF_NCHW == filterDesc.df) { - // Swap fn and fc - filterDesc.dims[2] = filterDesc.dims[3]; - filterDesc.dims[3] = ic; - } - EE ret = convolution_infer_forward_tmp_bytes_fp32(inPaddedDesc, filterDesc, outputDesc, transposedCD, algorithm, bytes); - *bytes += tensorNumBytes(inPaddedDesc); // for pre-convolution padding - return ret; -} - -EE deconvolution_fp32(TensorDesc inputDesc, F32* input, - TensorDesc filterDesc, const F32* filter, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, - TensorDesc biasDesc, const F32* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F32* output, - ActivationDesc activationDesc, - Arch arch) -{ - UNUSED(arch); - if (nullptr == input || nullptr == filter || nullptr == output || nullptr == bias || nullptr == tmp) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - - if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)){ - CHECK_STATUS(NOT_MATCH); - } - if (!(idf == DF_NCHWC8 && odf == DF_NCHWC8)) { - CHECK_STATUS(NOT_MATCH); - } - - U32
strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - ConvolutionDesc transposedCD; - transposedCD.stride_h = 1; - transposedCD.stride_w = 1; - transposedCD.padding_top = 0; - transposedCD.padding_bottom = 0; - transposedCD.padding_left = 0; - transposedCD.padding_right = 0; - transposedCD.dilatedRate_h = 1; - transposedCD.dilatedRate_w = 1; - - U32 tPadding = fh - 1 - paddingT; - U32 bPadding = fh - 1 - paddingB; - U32 lPadding = fw - 1 - paddingL; - U32 rPadding = fw - 1 - paddingR; - - if (CONVOLUTION_ALGORITHM_WINOGRAD == algorithm) { - // For Winograd, move one unit of border padding from the pre-padded input into the convolution descriptor - tPadding--; - bPadding--; - lPadding--; - rPadding--; - transposedCD.padding_top += 1; - transposedCD.padding_bottom += 1; - transposedCD.padding_left += 1; - transposedCD.padding_right += 1; - } - - U32 stuffH = strideH - 1; - U32 stuffW = strideW - 1; - U32 ihPadded = ih + (ih - 1) * stuffH + tPadding + bPadding; - U32 iwPadded = iw + (iw - 1) * stuffW + lPadding + rPadding; - TensorDesc inPaddedDesc = tensor4df(idt, idf, in, ic, ihPadded, iwPadded); - - F32 *inPad = (F32*)tmp; - F32 *inPadMov = inPad; - F32 *inputMov = input; - - ic /= 8; - - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < tPadding; h++) { - memset(inPadMov, 0, iwPadded*8*bytesOf(idt)); - inPadMov += iwPadded*8; - } - for (U32 h = 0; h < ih - 1; h++) { - memset(inPadMov, 0, lPadding*8*bytesOf(idt)); - inPadMov += lPadding*8; - for (U32 w = 0; w < iw - 1; w++) { - memcpy(inPadMov, inputMov, 8*bytesOf(idt)); - inPadMov += 8; - inputMov += 8; - memset(inPadMov, 0, stuffW*8*bytesOf(idt)); - inPadMov += stuffW * 8; - } - memcpy(inPadMov, inputMov, 8*bytesOf(idt)); - inPadMov += 8; - inputMov += 8; - memset(inPadMov, 0, rPadding*8*bytesOf(idt)); - inPadMov += rPadding*8; - - // stuffH - memset(inPadMov, 0, iwPadded*stuffH*8*bytesOf(idt)); - inPadMov += iwPadded*stuffH*8; - } - memset(inPadMov, 0, lPadding*8*bytesOf(idt)); - inPadMov += lPadding*8; - for (U32 w = 0; w < iw - 1; w++) { - memcpy(inPadMov, inputMov, 8*bytesOf(idt)); - inPadMov += 8; - inputMov += 8; - memset(inPadMov, 0, stuffW*8*bytesOf(idt)); - inPadMov += stuffW * 8; - } - memcpy(inPadMov, inputMov, 8*bytesOf(idt)); - inPadMov += 8; - inputMov += 8; - memset(inPadMov, 0, rPadding*8*bytesOf(idt)); - inPadMov += rPadding*8; - - for (U32 h = ihPadded - bPadding; h < ihPadded; h++) { - memset(inPadMov, 0, iwPadded*8*bytesOf(idt)); - inPadMov += iwPadded*8; - } - } - - EE ret = SUCCESS; - switch (algorithm) { - case CONVOLUTION_ALGORITHM_GEMM: -#ifdef __aarch64__ - ret = convolution_gemm_V8(inPaddedDesc, inPad, filterDesc, filter, transposedCD, - biasDesc, bias, tmpBytes - tensorNumBytes(inPaddedDesc), - inPad + tensorNumElements(inPaddedDesc), outputDesc, output, activationDesc); -#else - ret = convolution_gemm_V7(inPaddedDesc, inPad, filterDesc, filter, transposedCD, - biasDesc, bias, tmpBytes - tensorNumBytes(inPaddedDesc), - inPad + tensorNumElements(inPaddedDesc), outputDesc, output, activationDesc); -#endif - break; - case CONVOLUTION_ALGORITHM_WINOGRAD: - ret = convolution_winograd_V8(inPaddedDesc, inPad, filterDesc, filter, transposedCD, - biasDesc, bias, tmpBytes - tensorNumBytes(inPaddedDesc), - inPad + tensorNumElements(inPaddedDesc), outputDesc, output, activationDesc); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git
a/tensor_computing/src/cpu/arm/fp32/deconvolution_transform.cpp b/tensor_computing/src/cpu/arm/fp32/deconvolution_transform.cpp deleted file mode 100644 index 865ea3e5..00000000 --- a/tensor_computing/src/cpu/arm/fp32/deconvolution_transform.cpp +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#include "cpu/arm/fp32/convolution_winograd_transform.h" - -inline EE deconvolution_transform_filter_kernel_fp32(TensorDesc filterDesc, const F32* filterArray, - TensorDesc *ftmDesc, F32* ftmArray, - DataFormat ftmDataFormat) -{ - if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) - CHECK_STATUS(NULL_POINTER); - DataType fdt; - DataFormat fdf; - U32 fn, fc, fh, fw; - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - if (fdf == ftmDataFormat) { - *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, fn*fc*fh*fw*bytesOf(fdt)); - return SUCCESS; - } - if (fdf != DF_NCHW) { - CHECK_STATUS(NOT_SUPPORTED); - } - EE ret = SUCCESS; - switch (ftmDataFormat) { - case DF_NHWCN8: { - /* - * CNHW => NHWCN8 - */ - U32 oc = fc / 8; - U32 hwMax = fh * fw - 1; - for (U32 o = 0; o < oc; o++) { - for (U32 hw = 0; hw < fh*fw; hw++) { - for (U32 c = 0; c < fn; c++) { - for (U32 o8 = 0; o8 < 8; o8++) { - ftmArray[o*fh*fw*fn*8 + hw*fn*8 + c*8 + o8] = filterArray[c*fc*fh*fw + (o*8+o8)*fh*fw + hwMax-hw]; - } - } - } - } - *ftmDesc = tensor4df(fdt, ftmDataFormat, fc, fn, fh, fw); - break; - } - case DF_HWNCN8: { - U32 hwMax = 8; - for (U32 o = 0; o < fc / 8; o++) { - for (U32 c = 0; c < fn; c++) { - // Each time deal with N4; 2 times we have N8 - U32 f_off_0 = c*fc*fh*fw + (o*8)*fh*fw; - U32 f_off_1 = c*fc*fh*fw + (o*8+4)*fh*fw; - - U32 ftm_off_0 = o*36*fn*8 + c*8; - U32 ftm_off_1 = o*36*fn*8 + c*8 + 4; - - F32 F[9][4]; - F32 *F_ptr[9]; - F32 *Fw[36]; - - for (U32 hw = 0; hw < 9; hw++) { - for (U32 oo = 0; oo < 4; oo++) { - F[hw][oo] = filterArray[f_off_0 + hwMax-hw + oo*fh*fw]; - } - F_ptr[hw] = F[hw]; - } - for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_0 + hw*fn*8; - } - trans_W_4x4_3x3(Fw, F_ptr); - for (U32 hw = 0; hw < 9; hw++) { - for (U32 oo = 0; oo < 4; oo++) { - F[hw][oo] = filterArray[f_off_1 + hwMax-hw + oo*fh*fw]; - } - F_ptr[hw] = F[hw]; - } - for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_1 + hw*fn*8; - } - trans_W_4x4_3x3(Fw, F_ptr); - } - } - *ftmDesc = tensor4df(fdt, ftmDataFormat, 
fc, fn, 6, 6); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE deconvolution_transform_filter_fp32(TensorDesc filterDesc, const F32* filter, - ConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, F32* filterTransformed) -{ - DataFormat ftmDataFormat; - switch (algorithm) { - case CONVOLUTION_ALGORITHM_GEMM: - ftmDataFormat = DF_NHWCN8; - break; - case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: - ftmDataFormat = DF_NHWCN8; - break; - case CONVOLUTION_ALGORITHM_WINOGRAD: - ftmDataFormat = DF_HWNCN8; - break; - default: - return NOT_MATCH; - } - EE ret = deconvolution_transform_filter_kernel_fp32(filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat); - CHECK_STATUS(ret); - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp32/depthwise_convolution.cpp b/tensor_computing/src/cpu/arm/fp32/depthwise_convolution.cpp deleted file mode 100644 index 5deca43a..00000000 --- a/tensor_computing/src/cpu/arm/fp32/depthwise_convolution.cpp +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
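Context for the two deconvolution files removed above: they lower transposed convolution to an ordinary convolution by zero-stuffing the input (strideH-1/strideW-1 zeros between pixels plus an fh-1-padding border, the memset/memcpy loops in deconvolution_fp32) and by rotating the filter 180 degrees (the hwMax-hw indexing in the filter transform). A minimal 1-D scalar sketch of that equivalence; the function and the demo are hypothetical, not library API.

#include <vector>

// 1-D sketch: transposed conv(stride s, pad p, kernel k) equals a stride-1
// convolution over a zero-stuffed input with the kernel flipped.
std::vector<float> deconv1d_ref(const std::vector<float>& x, const std::vector<float>& k, int s, int p) {
    int fw = (int)k.size();
    std::vector<float> stuffed;
    // border of fw-1-p zeros on each side (tPadding/lPadding in the deleted code)
    for (int b = 0; b < fw - 1 - p; b++) stuffed.push_back(0.0f);
    for (size_t i = 0; i < x.size(); i++) {
        stuffed.push_back(x[i]);
        if (i + 1 < x.size()) { // s-1 zeros only between samples (stuffW)
            for (int z = 0; z < s - 1; z++) stuffed.push_back(0.0f);
        }
    }
    for (int b = 0; b < fw - 1 - p; b++) stuffed.push_back(0.0f);
    // ordinary convolution with the flipped kernel (the hwMax - hw indexing)
    std::vector<float> y(stuffed.size() - fw + 1, 0.0f);
    for (size_t o = 0; o < y.size(); o++) {
        for (int i = 0; i < fw; i++) {
            y[o] += stuffed[o + i] * k[fw - 1 - i];
        }
    }
    return y; // length (in-1)*s - 2p + fw, the transposed-conv output size
}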
- - -#include "tensor_computing_type.h" - -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#include "cpu/arm/fp32/depthwise_convolution.h" - -EE depthwise_convolution_fp32(TensorDesc inputDesc, F32* input, - TensorDesc filterDesc, const F32* filter, - ConvolutionDesc convDesc, DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc biasDesc, const F32* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F32* output, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc, - Arch arch) -{ - UNUSED(arch); - if(nullptr == input || nullptr == filter || nullptr == output || nullptr == bias || nullptr == tmp) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - - if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) - CHECK_STATUS(NOT_MATCH); - if (!(idf == DF_NCHWC8 && odf == DF_NCHWC8)) - CHECK_STATUS(NOT_MATCH); - if (!(ic == fc && oc == fn)) - CHECK_STATUS(NOT_MATCH); - - EE ret = SUCCESS; - switch (algorithm) { - case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: -#ifdef __aarch64__ - ret = depthwise_convolution_direct_V8(inputDesc, input, - filterDesc, filter, - convDesc, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - depthwiseActivationDesc); -#else - ret = depthwise_convolution_direct_V7(inputDesc, input, - filterDesc, filter, - convDesc, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - depthwiseActivationDesc); -#endif - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: -#ifdef __aarch64__ - ret = depthwise_pointwise_convolution_direct_V8(inputDesc, input, - filterDesc, filter, - convDesc, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - depthwiseActivationDesc, - pointwiseActivationDesc); -#else - ret = depthwise_pointwise_convolution_direct_V7(inputDesc, input, - filterDesc, filter, - convDesc, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - depthwiseActivationDesc, - pointwiseActivationDesc); -#endif - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp32/depthwise_convolution.h b/tensor_computing/src/cpu/arm/fp32/depthwise_convolution.h deleted file mode 100644 index c13248a4..00000000 --- a/tensor_computing/src/cpu/arm/fp32/depthwise_convolution.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
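depthwise_convolution_fp32 above is only a dispatcher; stripped of the NCHWc8 tiling and assembly, the arithmetic its direct V7/V8 kernels implement is a single small loop nest in which input channel c meets only filter channel c. A plain-NCHW scalar sketch (hypothetical reference, single batch, unit dilation; the real kernels pre-pad the input instead of bounds-checking):

// Scalar depthwise convolution, NCHW layout, one batch.
void depthwise_ref(const float* in, const float* flt, const float* bias,
                   float* out, int C, int H, int W, int fh, int fw,
                   int stride, int pad, int oh, int ow) {
    for (int c = 0; c < C; c++)
        for (int y = 0; y < oh; y++)
            for (int x = 0; x < ow; x++) {
                float acc = bias[c]; // each channel has its own bias
                for (int i = 0; i < fh; i++)
                    for (int j = 0; j < fw; j++) {
                        int iy = y * stride + i - pad;
                        int ix = x * stride + j - pad;
                        if (iy < 0 || iy >= H || ix < 0 || ix >= W) continue;
                        // channel c of the input meets only channel c of the filter
                        acc += in[(c * H + iy) * W + ix] * flt[(c * fh + i) * fw + j];
                    }
                out[(c * oh + y) * ow + x] = acc;
            }
}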
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_DEPTHWISE_CONVOLUTION -#define _H_DEPTHWISE_CONVOLUTION -#include - -#include "sys.h" -#include "type.h" -#include "error.h" -#include "tensor_desc.h" -#include "tensor_computing_type.h" - -#ifdef __aarch64__ -EE depthwise_convolution_direct_V8(TensorDesc inputDesc, F32* inArray, - TensorDesc filterDesc, const F32* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F32* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F32* outArray, - ActivationDesc depthwiseActivationDesc); -#else -EE depthwise_convolution_direct_V7(TensorDesc inputDesc, F32* inArray, - TensorDesc filterDesc, const F32* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F32* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F32* outArray, - ActivationDesc depthwiseActivationDesc); -#endif - -#ifdef __aarch64__ -EE depthwise_pointwise_convolution_direct_V8(TensorDesc inputDesc, F32 *inArray, - TensorDesc filterDesc, const F32 *filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F32 *biasArray, - U32 tmpBytes, void *tmp, - TensorDesc outputDesc, F32 *outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc); -#else -EE depthwise_pointwise_convolution_direct_V7(TensorDesc inputDesc, F32 *inArray, - TensorDesc filterDesc, const F32 *filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F32 *biasArray, - U32 tmpBytes, void *tmp, - TensorDesc outputDesc, F32 *outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc); -#endif - -#endif diff --git a/tensor_computing/src/cpu/arm/fp32/depthwise_convolution_direct_V7.cpp b/tensor_computing/src/cpu/arm/fp32/depthwise_convolution_direct_V7.cpp deleted file mode 100644 index e8cc612a..00000000 --- a/tensor_computing/src/cpu/arm/fp32/depthwise_convolution_direct_V7.cpp +++ /dev/null @@ -1,634 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
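All of the deleted fp32 kernels assume the DF_NCHWC8 blocked layout: channels are processed in groups of 8, with the 8 lanes of a group contiguous in memory, so one pixel of a channel block is exactly a pair of 128-bit q-registers. The inline pointer arithmetic they use (for example inArray + n*ic*ihiw*8 + c*ih_pad*iw_pad*8 after ic /= 8) spells out the indexing below; the helper name is hypothetical:

#include <cstddef>

// Offset of logical element (n, c, h, w) in an NCHWc8 tensor;
// C is the full channel count and is assumed divisible by 8.
inline size_t nchwc8_offset(size_t n, size_t c, size_t h, size_t w,
                            size_t C, size_t H, size_t W) {
    size_t block = c / 8; // which channel block
    size_t lane  = c % 8; // position inside the block
    return (((n * (C / 8) + block) * H + h) * W + w) * 8 + lane;
}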
- - -#ifndef __aarch64__ -#include "cpu/arm/fp32/depthwise_convolution.h" - -EE depthwise_convolution_direct_V7(TensorDesc inputDesc, F32* inArray, - TensorDesc filterDesc, const F32* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F32* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F32* outArray, - ActivationDesc depthwiseActivationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (fdf != DF_NCHWC8) { - CHECK_STATUS(NOT_MATCH); - } - - oc /= 8; - ic /= 8; - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - U32 ihiw = ih*iw; - I32 ohow = oh*ow; - - for (U32 n = 0; n < in; n++) { - F32 *inArray_pad = (F32*)tmp; - F32 *inArray_pad_mov = inArray_pad; - F32 *inArray_mov = inArray + n*ic*ihiw*8; - for (U32 c = 0; c < ic; c++) { - // copy input into a input with padding - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*sizeof(F32)); - inArray_pad_mov += iw_pad*8; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*8*sizeof(F32)); - inArray_pad_mov += paddingL*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*sizeof(F32)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, paddingR*8*sizeof(F32)); - inArray_pad_mov += paddingR*8; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*sizeof(F32)); - inArray_pad_mov += iw_pad*8; - } - - const F32 *b = biasArray + c*8; - F32 *in_pad = inArray_pad + c*ih_pad*iw_pad*8; - const F32 *f = filterArray + c*fh*fw*8; - // ohow / 6 - for (I32 hw = 0; hw < ohow-5; hw+=6) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - U32 in_h_1 = (hw+1)/ow*strideH; - U32 in_w_1 = (hw+1)%ow*strideW; - U32 in_h_2 = (hw+2)/ow*strideH; - U32 in_w_2 = (hw+2)%ow*strideW; - U32 in_h_3 = (hw+3)/ow*strideH; - U32 in_w_3 = (hw+3)%ow*strideW; - U32 in_h_4 = (hw+4)/ow*strideH; - U32 in_w_4 = (hw+4)%ow*strideW; - U32 in_h_5 = (hw+5)/ow*strideH; - U32 in_w_5 = (hw+5)%ow*strideW; - - F32 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; - __asm__ __volatile__( - "vld1.f32 {d0-d3}, [%[b]]\n" - "vmov.f32 q2, q0\n" - "vmov.f32 q3, q1\n" - "vmov.f32 q4, q0\n" - "vmov.f32 q5, q1\n" - "vmov.f32 q6, q0\n" - "vmov.f32 q7, q1\n" - "vmov.f32 q8, q0\n" - "vmov.f32 q9, q1\n" - "vmov.f32 q10, q0\n" - "vmov.f32 q11, q1\n" - : [b] "+r"(b) - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F32 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F32 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; - F32 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; - F32 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; - F32 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; - F32 
*in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; - F32 *in_4 = in_idx + in_h_4 * iw_pad * 8 + in_w_4 * 8; - F32 *in_5 = in_idx + in_h_5 * iw_pad * 8 + in_w_5 * 8; - __asm__ __volatile__( - "vld1.f32 {d24-d27}, [%[f0]]\n" - "vld1.f32 {d28-d29}, [%[in0]]!\n" - "vld1.f32 {d30-d31}, [%[in0]]\n" - - "vmla.f32 q0, q14, q12\n" - "vld1.f32 {d28-d29}, [%[in1]]!\n" - "vmla.f32 q1, q15, q13\n" - "vld1.f32 {d30-d31}, [%[in1]]\n" - "vmla.f32 q2, q14, q12\n" - "vld1.f32 {d28-d29}, [%[in2]]!\n" - "vmla.f32 q3, q15, q13\n" - "vld1.f32 {d30-d31}, [%[in2]]\n" - "vmla.f32 q4, q14, q12\n" - "vld1.f32 {d28-d29}, [%[in3]]!\n" - "vmla.f32 q5, q15, q13\n" - "vld1.f32 {d30-d31}, [%[in3]]\n" - "vmla.f32 q6, q14, q12\n" - "vld1.f32 {d28-d29}, [%[in4]]!\n" - "vmla.f32 q7, q15, q13\n" - "vld1.f32 {d30-d31}, [%[in4]]\n" - "vmla.f32 q8, q14, q12\n" - "vld1.f32 {d28-d29}, [%[in5]]!\n" - "vmla.f32 q9, q15, q13\n" - "vld1.f32 {d30-d31}, [%[in5]]\n" - "vmla.f32 q10, q14, q12\n" - "vmla.f32 q11, q15, q13\n" - : [in0] "+r"(in_0), - [in1] "+r"(in_1), - [in2] "+r"(in_2), - [in3] "+r"(in_3), - [in4] "+r"(in_4), - [in5] "+r"(in_5) - : [f0] "r"(f_0) - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "veor q15, q15, q15\n" // zero - "vmax.f32 q0, q0, q15\n" - "vmax.f32 q1, q1, q15\n" - "vmax.f32 q2, q2, q15\n" - "vmax.f32 q3, q3, q15\n" - "vmax.f32 q4, q4, q15\n" - "vmax.f32 q5, q5, q15\n" - "vmax.f32 q6, q6, q15\n" - "vmax.f32 q7, q7, q15\n" - "vmax.f32 q8, q8, q15\n" - "vmax.f32 q9, q9, q15\n" - "vmax.f32 q10, q10, q15\n" - "vmax.f32 q11, q11, q15\n" - : - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "veor q15, q15, q15\n" // zero - "vmov.f32 q14, #6.0\n" // six - "vmax.f32 q0, q0, q15\n" - "vmax.f32 q1, q1, q15\n" - "vmax.f32 q2, q2, q15\n" - "vmax.f32 q3, q3, q15\n" - "vmax.f32 q4, q4, q15\n" - "vmax.f32 q5, q5, q15\n" - "vmax.f32 q6, q6, q15\n" - "vmax.f32 q7, q7, q15\n" - "vmax.f32 q8, q8, q15\n" - "vmax.f32 q9, q9, q15\n" - "vmax.f32 q10, q10, q15\n" - "vmax.f32 q11, q11, q15\n" - - "vmin.f32 q0, q0, q14\n" - "vmin.f32 q1, q1, q14\n" - "vmin.f32 q2, q2, q14\n" - "vmin.f32 q3, q3, q14\n" - "vmin.f32 q4, q4, q14\n" - "vmin.f32 q5, q5, q14\n" - "vmin.f32 q6, q6, q14\n" - "vmin.f32 q7, q7, q14\n" - "vmin.f32 q8, q8, q14\n" - "vmin.f32 q9, q9, q14\n" - "vmin.f32 q10, q10, q14\n" - "vmin.f32 q11, q11, q14\n" - : - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15" - ); - break; - } - case ACTIVATION_H_SWISH:{ - F32 *out_0 = out_ptr; - F32 *out_24 = out_ptr + 24; - __asm__ __volatile__( - "vstm %[out], {d0-d11}\n" - "vmov.f32 q13, #3.0\n" // three - "vmov.f32 q14, #6.0\n" // six - "veor q15, q15, q15\n" // zero - "vrecpe.f32 q12, q14\n" - "vrecps.f32 q0, q14, q12\n" - "vmul.f32 q12, q0, q12\n" - - "vadd.f32 q0, q6, q13\n" - "vadd.f32 q1, q7, q13\n" - "vadd.f32 q2, q8, q13\n" - "vadd.f32 q3, q9, q13\n" - "vadd.f32 q4, q10, q13\n" - "vadd.f32 q5, q11, q13\n" - "vmax.f32 q0, q0, q15\n" - "vmax.f32 q1, q1, q15\n" - "vmax.f32 q2, q2, q15\n" - "vmax.f32 q3, q3, q15\n" - "vmax.f32 q4, q4, q15\n" - "vmax.f32 q5, q5, q15\n" - "vmin.f32 q0, q0, q14\n" - "vmin.f32 q1, q1, q14\n" - "vmin.f32 q2, 
q2, q14\n" - "vmin.f32 q3, q3, q14\n" - "vmin.f32 q4, q4, q14\n" - "vmin.f32 q5, q5, q14\n" - "vmul.f32 q0, q0, q12\n" - "vmul.f32 q1, q1, q12\n" - "vmul.f32 q2, q2, q12\n" - "vmul.f32 q3, q3, q12\n" - "vmul.f32 q4, q4, q12\n" - "vmul.f32 q5, q5, q12\n" - "vmul.f32 q0, q0, q6\n" - "vld1.f32 {d12-d13}, [%[out_0]]!\n" - "vst1.f32 {d0-d1}, [%[out_24]]!\n" - "vmul.f32 q1, q1, q7\n" - "vld1.f32 {d14-d15}, [%[out_0]]!\n" - "vst1.f32 {d2-d3}, [%[out_24]]!\n" - "vmul.f32 q2, q2, q8\n" - "vld1.f32 {d16-d17}, [%[out_0]]!\n" - "vst1.f32 {d4-d5}, [%[out_24]]!\n" - "vmul.f32 q3, q3, q9\n" - "vld1.f32 {d18-d19}, [%[out_0]]!\n" - "vst1.f32 {d6-d7}, [%[out_24]]!\n" - "vmul.f32 q4, q4, q10\n" - "vld1.f32 {d20-d21}, [%[out_0]]!\n" - "vst1.f32 {d8-d9}, [%[out_24]]!\n" - "vmul.f32 q5, q5, q11\n" - "vld1.f32 {d22-d23}, [%[out_0]]\n" - "vst1.f32 {d10-d11}, [%[out_24]]\n" - - "vadd.f32 q0, q6, q13\n" - "vadd.f32 q1, q7, q13\n" - "vadd.f32 q2, q8, q13\n" - "vadd.f32 q3, q9, q13\n" - "vadd.f32 q4, q10, q13\n" - "vadd.f32 q5, q11, q13\n" - "vmax.f32 q0, q0, q15\n" - "vmax.f32 q1, q1, q15\n" - "vmax.f32 q2, q2, q15\n" - "vmax.f32 q3, q3, q15\n" - "vmax.f32 q4, q4, q15\n" - "vmax.f32 q5, q5, q15\n" - "vmin.f32 q0, q0, q14\n" - "vmin.f32 q1, q1, q14\n" - "vmin.f32 q2, q2, q14\n" - "vmin.f32 q3, q3, q14\n" - "vmin.f32 q4, q4, q14\n" - "vmin.f32 q5, q5, q14\n" - "vmul.f32 q0, q0, q12\n" - "vmul.f32 q1, q1, q12\n" - "vmul.f32 q2, q2, q12\n" - "vmul.f32 q3, q3, q12\n" - "vmul.f32 q4, q4, q12\n" - "vmul.f32 q5, q5, q12\n" - "vmul.f32 q0, q0, q6\n" - "vst1.f32 {d0-d1}, [%[out]]!\n" - "vmul.f32 q1, q1, q7\n" - "vst1.f32 {d2-d3}, [%[out]]!\n" - "vmul.f32 q2, q2, q8\n" - "vst1.f32 {d4-d5}, [%[out]]!\n" - "vmul.f32 q3, q3, q9\n" - "vst1.f32 {d6-d7}, [%[out]]!\n" - "vmul.f32 q4, q4, q10\n" - "vst1.f32 {d8-d9}, [%[out]]!\n" - "vmul.f32 q5, q5, q11\n" - "vst1.f32 {d10-d11}, [%[out]]\n" - : [out] "+r"(out_ptr), - [out_0] "+r"(out_0), - [out_24] "+r"(out_24) - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15" - ); - break; - } - default: - return NOT_SUPPORTED; - } - if (depthwiseActivationDesc.mode != ACTIVATION_H_SWISH) { - __asm__ __volatile__( - "vstm %[out]!, {d0-d15}\n" - "vstm %[out], {d16-d23}\n" - : [out] "+r"(out_ptr) - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15" - ); - } - } - - U32 ohow_s = (ohow / 6) * 6; - U32 ohow_tail = ohow - ohow_s; - if (ohow_tail >= 4) { - I32 hw = ohow_s; - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - U32 in_h_1 = (hw+1)/ow*strideH; - U32 in_w_1 = (hw+1)%ow*strideW; - U32 in_h_2 = (hw+2)/ow*strideH; - U32 in_w_2 = (hw+2)%ow*strideW; - U32 in_h_3 = (hw+3)/ow*strideH; - U32 in_w_3 = (hw+3)%ow*strideW; - F32 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; - - __asm__ __volatile__( - "vld1.f32 {d0-d1}, [%[b]]!\n" - "vld1.f32 {d2-d3}, [%[b]]\n" - "vmov.f32 q2, q0\n" - "vmov.f32 q3, q1\n" - "vmov.f32 q4, q0\n" - "vmov.f32 q5, q1\n" - "vmov.f32 q6, q0\n" - "vmov.f32 q7, q1\n" - :[b]"+r"(b) - : - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F32 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F32 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F32 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - F32 *in_1 = in_idx + in_h_1*iw_pad*8 + in_w_1*8; - F32 *in_2 = in_idx + in_h_2*iw_pad*8 + in_w_2*8; - F32 
*in_3 = in_idx + in_h_3*iw_pad*8 + in_w_3*8; - - __asm__ __volatile__( - "vld1.f32 {d28-d31}, [%[f0]]\n" - "vld1.f32 {d16-d19}, [%[in0]]\n" - "vld1.f32 {d20-d23}, [%[in1]]\n" - "vld1.f32 {d24-d27}, [%[in2]]\n" - - "vmla.f32 q0, q8, q14\n" - "vmla.f32 q1, q9, q15\n" - "vld1.f32 {d16-d19}, [%[in3]]\n" - "vmla.f32 q2, q10, q14\n" - "vmla.f32 q3, q11, q15\n" - "vmla.f32 q4, q12, q14\n" - "vmla.f32 q5, q13, q15\n" - "vmla.f32 q6, q8, q14\n" - "vmla.f32 q7, q9, q15\n" - : - :[in0]"r"(in_0), - [in1]"r"(in_1), - [in2]"r"(in_2), - [in3]"r"(in_3), - [f0]"r"(f_0) - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "veor q15, q15, q15\n" // zero - "vmax.f32 q0, q0, q15\n" - "vmax.f32 q1, q1, q15\n" - "vmax.f32 q2, q2, q15\n" - "vmax.f32 q3, q3, q15\n" - "vmax.f32 q4, q4, q15\n" - "vmax.f32 q5, q5, q15\n" - "vmax.f32 q6, q6, q15\n" - "vmax.f32 q7, q7, q15\n" - : - : - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q15" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "veor q15, q15, q15\n" // zero - "vmov.f32 q14, #6.0\n" // six - "vmax.f32 q0, q0, q15\n" - "vmax.f32 q1, q1, q15\n" - "vmax.f32 q2, q2, q15\n" - "vmax.f32 q3, q3, q15\n" - "vmax.f32 q4, q4, q15\n" - "vmax.f32 q5, q5, q15\n" - "vmax.f32 q6, q6, q15\n" - "vmax.f32 q7, q7, q15\n" - - "vmin.f32 q0, q0, q14\n" - "vmin.f32 q1, q1, q14\n" - "vmin.f32 q2, q2, q14\n" - "vmin.f32 q3, q3, q14\n" - "vmin.f32 q4, q4, q14\n" - "vmin.f32 q5, q5, q14\n" - "vmin.f32 q6, q6, q14\n" - "vmin.f32 q7, q7, q14\n" - : - : - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q14", "q15" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "vmov.f32 q13, #3.0\n" // three - "vmov.f32 q14, #6.0\n" // six - "veor q15, q15, q15\n" // zero - "vadd.f32 q8, q0, q13\n" - "vadd.f32 q9, q1, q13\n" - "vadd.f32 q10, q2, q13\n" - "vadd.f32 q11, q3, q13\n" - "vmax.f32 q8, q8, q15\n" - "vmax.f32 q9, q9, q15\n" - "vmax.f32 q10, q10, q15\n" - "vmax.f32 q11, q11, q15\n" - "vmin.f32 q8, q8, q14\n" - "vmin.f32 q9, q9, q14\n" - "vmin.f32 q10, q10, q14\n" - "vmin.f32 q11, q11, q14\n" - "vrecpe.f32 q12, q14\n" - "vrecps.f32 q14, q14, q12\n" - "vmul.f32 q12, q14, q12\n" - "vmul.f32 q8, q8, q12\n" - "vmul.f32 q9, q9, q12\n" - "vmul.f32 q10, q10, q12\n" - "vmul.f32 q11, q11, q12\n" - "vmov.f32 q14, #6.0\n" // six - "vmul.f32 q0, q0, q8\n" - "vmul.f32 q1, q1, q9\n" - "vmul.f32 q2, q2, q10\n" - "vmul.f32 q3, q3, q11\n" - - "vadd.f32 q8, q4, q13\n" - "vadd.f32 q9, q5, q13\n" - "vadd.f32 q10, q6, q13\n" - "vadd.f32 q11, q7, q13\n" - "vmax.f32 q8, q8, q15\n" - "vmax.f32 q9, q9, q15\n" - "vmax.f32 q10, q10, q15\n" - "vmax.f32 q11, q11, q15\n" - "vmin.f32 q8, q8, q14\n" - "vmin.f32 q9, q9, q14\n" - "vmin.f32 q10, q10, q14\n" - "vmin.f32 q11, q11, q14\n" - "vrecpe.f32 q12, q14\n" - "vrecps.f32 q14, q14, q12\n" - "vmul.f32 q12, q14, q12\n" - "vmul.f32 q8, q8, q12\n" - "vmul.f32 q9, q9, q12\n" - "vmul.f32 q10, q10, q12\n" - "vmul.f32 q11, q11, q12\n" - "vmul.f32 q4, q4, q8\n" - "vmul.f32 q5, q5, q9\n" - "vmul.f32 q6, q6, q10\n" - "vmul.f32 q7, q7, q11\n" - : - : - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "vstm %[out], {d0-d15}\n" - 
:[out]"+r"(out_ptr) - : - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" - ); - - ohow_s += 4; - } - - for (I32 hw = ohow_s; hw < ohow; hw++) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - F32 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; - - __asm__ __volatile__( - "vld1.f32 {d0-d3}, [%[b]]\n" - : - :[b]"r"(b) - :"memory", "cc", "q0", "q1" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F32 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F32 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F32 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - - __asm__ __volatile__( - "vld1.f32 {d28-d31}, [%[f0]]\n" - "vld1.f32 {d24-d27}, [%[in0]]\n" - - "vmla.f32 q0, q12, q14\n" - "vmla.f32 q1, q13, q15\n" - : - :[in0]"r"(in_0), - [f0]"r"(f_0) - :"memory", "cc", "q0", "q1", "q12", "q13", "q14", "q15" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "veor q15, q15, q15\n" // zero - "vmax.f32 q0, q0, q15\n" - "vmax.f32 q1, q1, q15\n" - : - : - :"memory", "cc", "q0", "q1", "q15" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "veor q15, q15, q15\n" // zero - "vmov.f32 q14, #6.0\n" // six - "vmax.f32 q0, q0, q15\n" - "vmax.f32 q1, q1, q15\n" - - "vmin.f32 q0, q0, q14\n" - "vmin.f32 q1, q1, q14\n" - : - : - :"memory", "cc", "q0", "q1", "q14", "q15" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "vmov.f32 q13, #3.0\n" // three - "vmov.f32 q14, #6.0\n" // six - "veor q15, q15, q15\n" // zero - "vadd.f32 q11, q0, q13\n" - "vadd.f32 q12, q1, q13\n" - - "vmax.f32 q11, q11, q15\n" - "vmax.f32 q12, q12, q15\n" - - "vmin.f32 q11, q11, q14\n" - "vmin.f32 q12, q12, q14\n" - - "vrecpe.f32 q13, q14\n" - "vrecps.f32 q14, q14, q13\n" - "vmul.f32 q14, q14, q13\n" - "vmul.f32 q11, q11, q14\n" - "vmul.f32 q12, q12, q14\n" - - "vmul.f32 q0, q0, q11\n" - "vmul.f32 q1, q1, q12\n" - : - : - :"memory", "cc", "q0", "q1", "q11", "q12", "q13", "q14", "q15" - ); - break; - } - default: - return NOT_SUPPORTED; - } - __asm__ __volatile__( - "vst1.f32 {d0-d3}, [%[out]]\n" - :[out]"+r"(out_ptr) - : - :"memory", "cc", "q0", "q1" - ); - } - } - } - return SUCCESS; -} -#endif diff --git a/tensor_computing/src/cpu/arm/fp32/depthwise_convolution_direct_V8.cpp b/tensor_computing/src/cpu/arm/fp32/depthwise_convolution_direct_V8.cpp deleted file mode 100644 index 8da08f90..00000000 --- a/tensor_computing/src/cpu/arm/fp32/depthwise_convolution_direct_V8.cpp +++ /dev/null @@ -1,666 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
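One note on the activation paths in the AArch32 file just above: ACTIVATION_H_SWISH is computed as x * relu6(x + 3) / 6, and since AArch32 NEON has no vector divide, the 1/6 factor comes from vrecpe (reciprocal estimate) refined by one vrecps Newton step, r = r0 * (2 - 6 * r0); the AArch64 file that follows can use fdiv directly. The scalar definition, for reference (hypothetical helper, not library code):

#include <algorithm>

// h-swish as the kernels compute it: x * relu6(x + 3) * (1/6)
inline float hswish_ref(float x) {
    float r6 = std::min(std::max(x + 3.0f, 0.0f), 6.0f); // relu6(x + 3)
    return x * (r6 / 6.0f);
}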
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifdef __aarch64__ -#include "cpu/arm/fp32/depthwise_convolution.h" - -EE depthwise_convolution_direct_V8(TensorDesc inputDesc, F32* inArray, - TensorDesc filterDesc, const F32* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F32* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F32* outArray, - ActivationDesc depthwiseActivationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (fdf != DF_NCHWC8) { - CHECK_STATUS(NOT_MATCH); - } - - oc /= 8; - ic /= 8; - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - U32 ihiw = ih*iw; - I32 ohow = oh*ow; - - for (U32 n = 0; n < in; n++) { - F32 *inArray_pad = (F32*)tmp; - F32 *inArray_pad_mov = inArray_pad; - F32 *inArray_mov = inArray + n*ic*ihiw*8; - for (U32 c = 0; c < ic; c++) { - // copy input into a input with padding - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*sizeof(F32)); - inArray_pad_mov += iw_pad*8; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*8*sizeof(F32)); - inArray_pad_mov += paddingL*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*sizeof(F32)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, paddingR*8*sizeof(F32)); - inArray_pad_mov += paddingR*8; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*sizeof(F32)); - inArray_pad_mov += iw_pad*8; - } - - const F32 *b = biasArray + c*8; - F32 *in_pad = inArray_pad + c*ih_pad*iw_pad*8; - const F32 *f = filterArray + c*fh*fw*8; - // ohow / 8 - for (I32 hw = 0; hw < ohow-7; hw+=8) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - U32 in_h_1 = (hw+1)/ow*strideH; - U32 in_w_1 = (hw+1)%ow*strideW; - U32 in_h_2 = (hw+2)/ow*strideH; - U32 in_w_2 = (hw+2)%ow*strideW; - U32 in_h_3 = (hw+3)/ow*strideH; - U32 in_w_3 = (hw+3)%ow*strideW; - U32 in_h_4 = (hw+4)/ow*strideH; - U32 in_w_4 = (hw+4)%ow*strideW; - U32 in_h_5 = (hw+5)/ow*strideH; - U32 in_w_5 = (hw+5)%ow*strideW; - U32 in_h_6 = (hw+6)/ow*strideH; - U32 in_w_6 = (hw+6)%ow*strideW; - U32 in_h_7 = (hw+7)/ow*strideH; - U32 in_w_7 = (hw+7)%ow*strideW; - - F32 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; - __asm__ __volatile__( - "ldr q14, [%[b]]\n" - "ldr q15, [%[b], #16]\n" - "mov v0.16b, v14.16b\n" - "mov v1.16b, v15.16b\n" - "mov v2.16b, v14.16b\n" - "mov v3.16b, v15.16b\n" - "mov v4.16b, v14.16b\n" - "mov v5.16b, v15.16b\n" - "mov v6.16b, v14.16b\n" - "mov v7.16b, v15.16b\n" - "mov v8.16b, v14.16b\n" - "mov v9.16b, v15.16b\n" - "mov v10.16b, v14.16b\n" - "mov v11.16b, v15.16b\n" - "mov 
v12.16b, v14.16b\n" - "mov v13.16b, v15.16b\n" - : - : [b] "r"(b) - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F32 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F32 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; - F32 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; - F32 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; - F32 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; - F32 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; - F32 *in_4 = in_idx + in_h_4 * iw_pad * 8 + in_w_4 * 8; - F32 *in_5 = in_idx + in_h_5 * iw_pad * 8 + in_w_5 * 8; - F32 *in_6 = in_idx + in_h_6 * iw_pad * 8 + in_w_6 * 8; - F32 *in_7 = in_idx + in_h_7 * iw_pad * 8 + in_w_7 * 8; - - __asm__ __volatile__( - "ldp q16, q17, [%[f0]]\n" - "ldp q30, q31, [%[in0]]\n" - "ldp q18, q19, [%[in1]]\n" - "ldp q20, q21, [%[in2]]\n" - "ldp q22, q23, [%[in3]]\n" - "ldp q24, q25, [%[in4]]\n" - "ldp q26, q27, [%[in5]]\n" - "ldp q28, q29, [%[in6]]\n" - - "fmla v0.4s, v30.4s, v16.4s\n" - "fmla v1.4s, v31.4s, v17.4s\n" - "fmla v2.4s, v18.4s, v16.4s\n" - "ldp q30, q31, [%[in7]]\n" - "fmla v3.4s, v19.4s, v17.4s\n" - "fmla v4.4s, v20.4s, v16.4s\n" - "fmla v5.4s, v21.4s, v17.4s\n" - "fmla v6.4s, v22.4s, v16.4s\n" - "fmla v7.4s, v23.4s, v17.4s\n" - "fmla v8.4s, v24.4s, v16.4s\n" - "fmla v9.4s, v25.4s, v17.4s\n" - "fmla v10.4s, v26.4s, v16.4s\n" - "fmla v11.4s, v27.4s, v17.4s\n" - "fmla v12.4s, v28.4s, v16.4s\n" - "fmla v13.4s, v29.4s, v17.4s\n" - "fmla v14.4s, v30.4s, v16.4s\n" - "fmla v15.4s, v31.4s, v17.4s\n" - : - : [in0] "r"(in_0), - [in1] "r"(in_1), - [in2] "r"(in_2), - [in3] "r"(in_3), - [in4] "r"(in_4), - [in5] "r"(in_5), - [in6] "r"(in_6), - [in7] "r"(in_7), - [f0] "r"(f_0) - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v0.4s, v0.4s, v31.4s\n" - "fmax v1.4s, v1.4s, v31.4s\n" - "fmax v2.4s, v2.4s, v31.4s\n" - "fmax v3.4s, v3.4s, v31.4s\n" - "fmax v4.4s, v4.4s, v31.4s\n" - "fmax v5.4s, v5.4s, v31.4s\n" - "fmax v6.4s, v6.4s, v31.4s\n" - "fmax v7.4s, v7.4s, v31.4s\n" - "fmax v8.4s, v8.4s, v31.4s\n" - "fmax v9.4s, v9.4s, v31.4s\n" - "fmax v10.4s, v10.4s, v31.4s\n" - "fmax v11.4s, v11.4s, v31.4s\n" - "fmax v12.4s, v12.4s, v31.4s\n" - "fmax v13.4s, v13.4s, v31.4s\n" - "fmax v14.4s, v14.4s, v31.4s\n" - "fmax v15.4s, v15.4s, v31.4s\n" - : - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v31" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmov v30.4s, 6.0\n" // six - "fmax v0.4s, v0.4s, v31.4s\n" - "fmax v1.4s, v1.4s, v31.4s\n" - "fmax v2.4s, v2.4s, v31.4s\n" - "fmax v3.4s, v3.4s, v31.4s\n" - "fmax v4.4s, v4.4s, v31.4s\n" - "fmax v5.4s, v5.4s, v31.4s\n" - "fmax v6.4s, v6.4s, v31.4s\n" - "fmax v7.4s, v7.4s, v31.4s\n" - "fmax v8.4s, v8.4s, v31.4s\n" - "fmax v9.4s, v9.4s, v31.4s\n" - "fmax v10.4s, v10.4s, v31.4s\n" - "fmax v11.4s, v11.4s, v31.4s\n" - "fmax v12.4s, v12.4s, v31.4s\n" - "fmax v13.4s, v13.4s, 
v31.4s\n" - "fmax v14.4s, v14.4s, v31.4s\n" - "fmax v15.4s, v15.4s, v31.4s\n" - - "fmin v0.4s, v0.4s, v30.4s\n" - "fmin v1.4s, v1.4s, v30.4s\n" - "fmin v2.4s, v2.4s, v30.4s\n" - "fmin v3.4s, v3.4s, v30.4s\n" - "fmin v4.4s, v4.4s, v30.4s\n" - "fmin v5.4s, v5.4s, v30.4s\n" - "fmin v6.4s, v6.4s, v30.4s\n" - "fmin v7.4s, v7.4s, v30.4s\n" - "fmin v8.4s, v8.4s, v30.4s\n" - "fmin v9.4s, v9.4s, v30.4s\n" - "fmin v10.4s, v10.4s, v30.4s\n" - "fmin v11.4s, v11.4s, v30.4s\n" - "fmin v12.4s, v12.4s, v30.4s\n" - "fmin v13.4s, v13.4s, v30.4s\n" - "fmin v14.4s, v14.4s, v30.4s\n" - "fmin v15.4s, v15.4s, v30.4s\n" - : - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v30", "v31" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "fmov v29.4s, 3.0\n" // three - "fmov v30.4s, 6.0\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v22.4s, v0.4s, v29.4s\n" - "fadd v23.4s, v1.4s, v29.4s\n" - "fadd v16.4s, v2.4s, v29.4s\n" - "fadd v17.4s, v3.4s, v29.4s\n" - "fadd v18.4s, v4.4s, v29.4s\n" - "fadd v19.4s, v5.4s, v29.4s\n" - "fadd v20.4s, v6.4s, v29.4s\n" - "fadd v21.4s, v7.4s, v29.4s\n" - - "fmax v22.4s, v22.4s, v31.4s\n" - "fmax v23.4s, v23.4s, v31.4s\n" - "fmax v16.4s, v16.4s, v31.4s\n" - "fmax v17.4s, v17.4s, v31.4s\n" - "fmax v18.4s, v18.4s, v31.4s\n" - "fmax v19.4s, v19.4s, v31.4s\n" - "fmax v20.4s, v20.4s, v31.4s\n" - "fmax v21.4s, v21.4s, v31.4s\n" - - "fmin v22.4s, v22.4s, v30.4s\n" - "fmin v23.4s, v23.4s, v30.4s\n" - "fmin v16.4s, v16.4s, v30.4s\n" - "fmin v17.4s, v17.4s, v30.4s\n" - "fmin v18.4s, v18.4s, v30.4s\n" - "fmin v19.4s, v19.4s, v30.4s\n" - "fmin v20.4s, v20.4s, v30.4s\n" - "fmin v21.4s, v21.4s, v30.4s\n" - - "fdiv v22.4s, v22.4s, v30.4s\n" - "fdiv v23.4s, v23.4s, v30.4s\n" - "fdiv v16.4s, v16.4s, v30.4s\n" - "fdiv v17.4s, v17.4s, v30.4s\n" - "fdiv v18.4s, v18.4s, v30.4s\n" - "fdiv v19.4s, v19.4s, v30.4s\n" - "fdiv v20.4s, v20.4s, v30.4s\n" - "fdiv v21.4s, v21.4s, v30.4s\n" - - "fmul v0.4s, v0.4s, v22.4s\n" - "fmul v1.4s, v1.4s, v23.4s\n" - "fmul v2.4s, v2.4s, v16.4s\n" - "fmul v3.4s, v3.4s, v17.4s\n" - "fmul v4.4s, v4.4s, v18.4s\n" - "fmul v5.4s, v5.4s, v19.4s\n" - "fmul v6.4s, v6.4s, v20.4s\n" - "fmul v7.4s, v7.4s, v21.4s\n" - - - "fadd v22.4s, v8.4s, v29.4s\n" - "fadd v23.4s, v9.4s, v29.4s\n" - "fadd v16.4s, v10.4s, v29.4s\n" - "fadd v17.4s, v11.4s, v29.4s\n" - "fadd v18.4s, v12.4s, v29.4s\n" - "fadd v19.4s, v13.4s, v29.4s\n" - "fadd v20.4s, v14.4s, v29.4s\n" - "fadd v21.4s, v15.4s, v29.4s\n" - - "fmax v22.4s, v22.4s, v31.4s\n" - "fmax v23.4s, v23.4s, v31.4s\n" - "fmax v16.4s, v16.4s, v31.4s\n" - "fmax v17.4s, v17.4s, v31.4s\n" - "fmax v18.4s, v18.4s, v31.4s\n" - "fmax v19.4s, v19.4s, v31.4s\n" - "fmax v20.4s, v20.4s, v31.4s\n" - "fmax v21.4s, v21.4s, v31.4s\n" - - "fmin v22.4s, v22.4s, v30.4s\n" - "fmin v23.4s, v23.4s, v30.4s\n" - "fmin v16.4s, v16.4s, v30.4s\n" - "fmin v17.4s, v17.4s, v30.4s\n" - "fmin v18.4s, v18.4s, v30.4s\n" - "fmin v19.4s, v19.4s, v30.4s\n" - "fmin v20.4s, v20.4s, v30.4s\n" - "fmin v21.4s, v21.4s, v30.4s\n" - - "fdiv v22.4s, v22.4s, v30.4s\n" - "fdiv v23.4s, v23.4s, v30.4s\n" - "fdiv v16.4s, v16.4s, v30.4s\n" - "fdiv v17.4s, v17.4s, v30.4s\n" - "fdiv v18.4s, v18.4s, v30.4s\n" - "fdiv v19.4s, v19.4s, v30.4s\n" - "fdiv v20.4s, v20.4s, v30.4s\n" - "fdiv v21.4s, v21.4s, v30.4s\n" - - "fmul v8.4s, v8.4s, v22.4s\n" - "fmul v9.4s, v9.4s, v23.4s\n" - "fmul v10.4s, v10.4s, v16.4s\n" - "fmul v11.4s, v11.4s, v17.4s\n" - "fmul v12.4s, v12.4s, v18.4s\n" - "fmul v13.4s, v13.4s, 
v19.4s\n" - "fmul v14.4s, v14.4s, v20.4s\n" - "fmul v15.4s, v15.4s, v21.4s\n" - : - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v29", "v30", "v31" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "stp q0, q1, [%[out]]\n" - "stp q2, q3, [%[out], #32]\n" - "stp q4, q5, [%[out], #64]\n" - "stp q6, q7, [%[out], #96]\n" - "stp q8, q9, [%[out], #128]\n" - "stp q10, q11, [%[out], #160]\n" - "stp q12, q13, [%[out], #192]\n" - "stp q14, q15, [%[out], #224]\n" - : [out] "+r"(out_ptr) - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15" - ); - } - - U32 ohow_s = (ohow / 8) * 8; - U32 ohow_tail = ohow - ohow_s; - if (ohow_tail >= 4) { - I32 hw = ohow_s; - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - U32 in_h_1 = (hw+1)/ow*strideH; - U32 in_w_1 = (hw+1)%ow*strideW; - U32 in_h_2 = (hw+2)/ow*strideH; - U32 in_w_2 = (hw+2)%ow*strideW; - U32 in_h_3 = (hw+3)/ow*strideH; - U32 in_w_3 = (hw+3)%ow*strideW; - F32 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; - - __asm__ __volatile__( - "ldr q14, [%[b]]\n" - "ldr q15, [%[b], #16]\n" - "mov v0.16b, v14.16b\n" - "mov v1.16b, v15.16b\n" - "mov v2.16b, v14.16b\n" - "mov v3.16b, v15.16b\n" - "mov v4.16b, v14.16b\n" - "mov v5.16b, v15.16b\n" - "mov v6.16b, v14.16b\n" - "mov v7.16b, v15.16b\n" - : - :[b]"r"(b) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v14", "v15" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F32 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F32 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F32 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - F32 *in_1 = in_idx + in_h_1*iw_pad*8 + in_w_1*8; - F32 *in_2 = in_idx + in_h_2*iw_pad*8 + in_w_2*8; - F32 *in_3 = in_idx + in_h_3*iw_pad*8 + in_w_3*8; - - __asm__ __volatile__( - "ldp q14, q15, [%[f0]]\n" - "ldp q16, q17, [%[in0]]\n" - "ldp q18, q19, [%[in1]]\n" - "ldp q20, q21, [%[in2]]\n" - "ldp q22, q23, [%[in3]]\n" - - "fmla v0.4s, v16.4s, v14.4s\n" - "fmla v1.4s, v17.4s, v15.4s\n" - "fmla v2.4s, v18.4s, v14.4s\n" - "fmla v3.4s, v19.4s, v15.4s\n" - "fmla v4.4s, v20.4s, v14.4s\n" - "fmla v5.4s, v21.4s, v15.4s\n" - "fmla v6.4s, v22.4s, v14.4s\n" - "fmla v7.4s, v23.4s, v15.4s\n" - : - :[in0]"r"(in_0), - [in1]"r"(in_1), - [in2]"r"(in_2), - [in3]"r"(in_3), - [f0]"r"(f_0) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v14", "v15", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v0.4s, v0.4s, v31.4s\n" - "fmax v1.4s, v1.4s, v31.4s\n" - "fmax v2.4s, v2.4s, v31.4s\n" - "fmax v3.4s, v3.4s, v31.4s\n" - "fmax v4.4s, v4.4s, v31.4s\n" - "fmax v5.4s, v5.4s, v31.4s\n" - "fmax v6.4s, v6.4s, v31.4s\n" - "fmax v7.4s, v7.4s, v31.4s\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v31" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmov v30.4s, 6.0\n" // six - "fmax v0.4s, v0.4s, v31.4s\n" - "fmax v1.4s, v1.4s, v31.4s\n" - "fmax v2.4s, v2.4s, v31.4s\n" - "fmax v3.4s, v3.4s, v31.4s\n" - "fmax v4.4s, v4.4s, v31.4s\n" - "fmax v5.4s, v5.4s, 
v31.4s\n" - "fmax v6.4s, v6.4s, v31.4s\n" - "fmax v7.4s, v7.4s, v31.4s\n" - - "fmin v0.4s, v0.4s, v30.4s\n" - "fmin v1.4s, v1.4s, v30.4s\n" - "fmin v2.4s, v2.4s, v30.4s\n" - "fmin v3.4s, v3.4s, v30.4s\n" - "fmin v4.4s, v4.4s, v30.4s\n" - "fmin v5.4s, v5.4s, v30.4s\n" - "fmin v6.4s, v6.4s, v30.4s\n" - "fmin v7.4s, v7.4s, v30.4s\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v30", "v31" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "fmov v29.4s, 3.0\n" // three - "fmov v30.4s, 6.0\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v14.4s, v0.4s, v29.4s\n" - "fadd v15.4s, v1.4s, v29.4s\n" - "fadd v16.4s, v2.4s, v29.4s\n" - "fadd v17.4s, v3.4s, v29.4s\n" - "fadd v18.4s, v4.4s, v29.4s\n" - "fadd v19.4s, v5.4s, v29.4s\n" - "fadd v20.4s, v6.4s, v29.4s\n" - "fadd v21.4s, v7.4s, v29.4s\n" - - "fmax v14.4s, v14.4s, v31.4s\n" - "fmax v15.4s, v15.4s, v31.4s\n" - "fmax v16.4s, v16.4s, v31.4s\n" - "fmax v17.4s, v17.4s, v31.4s\n" - "fmax v18.4s, v18.4s, v31.4s\n" - "fmax v19.4s, v19.4s, v31.4s\n" - "fmax v20.4s, v20.4s, v31.4s\n" - "fmax v21.4s, v21.4s, v31.4s\n" - - "fmin v14.4s, v14.4s, v30.4s\n" - "fmin v15.4s, v15.4s, v30.4s\n" - "fmin v16.4s, v16.4s, v30.4s\n" - "fmin v17.4s, v17.4s, v30.4s\n" - "fmin v18.4s, v18.4s, v30.4s\n" - "fmin v19.4s, v19.4s, v30.4s\n" - "fmin v20.4s, v20.4s, v30.4s\n" - "fmin v21.4s, v21.4s, v30.4s\n" - - "fdiv v14.4s, v14.4s, v30.4s\n" - "fdiv v15.4s, v15.4s, v30.4s\n" - "fdiv v16.4s, v16.4s, v30.4s\n" - "fdiv v17.4s, v17.4s, v30.4s\n" - "fdiv v18.4s, v18.4s, v30.4s\n" - "fdiv v19.4s, v19.4s, v30.4s\n" - "fdiv v20.4s, v20.4s, v30.4s\n" - "fdiv v21.4s, v21.4s, v30.4s\n" - - "fmul v0.4s, v0.4s, v14.4s\n" - "fmul v1.4s, v1.4s, v15.4s\n" - "fmul v2.4s, v2.4s, v16.4s\n" - "fmul v3.4s, v3.4s, v17.4s\n" - "fmul v4.4s, v4.4s, v18.4s\n" - "fmul v5.4s, v5.4s, v19.4s\n" - "fmul v6.4s, v6.4s, v20.4s\n" - "fmul v7.4s, v7.4s, v21.4s\n" - : - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v14", "v15", "v16", "v17", - "v18", "v19", "v20", "v21", "v29", "v30", "v31" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "stp q0, q1, [%[out]]\n" - "stp q2, q3, [%[out], #32]\n" - "stp q4, q5, [%[out], #64]\n" - "stp q6, q7, [%[out], #96]\n" - :[out]"+r"(out_ptr) - : - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); - - ohow_s += 4; - } - - for (I32 hw = ohow_s; hw < ohow; hw++) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - F32 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; - - __asm__ __volatile__( - "ldr q0, [%[b]]\n" - "ldr q1, [%[b], #16]\n" - : - :[b]"r"(b) - :"memory", "cc", "v0", "v1" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F32 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F32 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F32 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - - __asm__ __volatile__( - "ldp q14, q15, [%[f0]]\n" - "ldp q16, q17, [%[in0]]\n" - - "fmla v0.4s, v16.4s, v14.4s\n" - "fmla v1.4s, v17.4s, v15.4s\n" - : - :[in0]"r"(in_0), - [f0]"r"(f_0) - :"memory", "cc", "v0", "v1", "v14", "v15" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v0.4s, v0.4s, v31.4s\n" - "fmax v1.4s, v1.4s, v31.4s\n" - : - : - :"memory", "cc", "v0", "v1", "v31" - ); - break; - } - case 
ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmov v30.4s, 6.0\n" // six - "fmax v0.4s, v0.4s, v31.4s\n" - "fmax v1.4s, v1.4s, v31.4s\n" - - "fmin v0.4s, v0.4s, v30.4s\n" - "fmin v1.4s, v1.4s, v30.4s\n" - : - : - :"memory", "cc", "v0", "v1", "v30", "v31" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "fmov v29.4s, 3.0\n" // three - "fmov v30.4s, 6.0\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v14.4s, v0.4s, v29.4s\n" - "fadd v15.4s, v1.4s, v29.4s\n" - - "fmax v14.4s, v14.4s, v31.4s\n" - "fmax v15.4s, v15.4s, v31.4s\n" - - "fmin v14.4s, v14.4s, v30.4s\n" - "fmin v15.4s, v15.4s, v30.4s\n" - - "fdiv v14.4s, v14.4s, v30.4s\n" - "fdiv v15.4s, v15.4s, v30.4s\n" - - "fmul v0.4s, v0.4s, v14.4s\n" - "fmul v1.4s, v1.4s, v15.4s\n" - : - : - :"memory", "cc", "v0", "v1", "v14", "v15", "v29", "v30", "v31" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "stp q0, q1, [%[out]]\n" - :[out]"+r"(out_ptr) - : - :"memory", "cc", "v0", "v1" - ); - } - } - } - return SUCCESS; -} -#endif diff --git a/tensor_computing/src/cpu/arm/fp32/depthwise_convolution_transform.cpp b/tensor_computing/src/cpu/arm/fp32/depthwise_convolution_transform.cpp deleted file mode 100644 index 0ff2f2a1..00000000 --- a/tensor_computing/src/cpu/arm/fp32/depthwise_convolution_transform.cpp +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
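[Reference note] The direct kernels deleted above, and the depthwise-pointwise kernels deleted below, all end in one of the same three activation tails. A minimal scalar sketch of the math those NEON sequences implement; the helper names here are illustrative only, not library code:

#include <algorithm>

// Scalar reference for the three activation modes of the deleted kernels.
static inline float relu_ref(float x)   { return std::max(x, 0.0f); }
static inline float relu6_ref(float x)  { return std::min(std::max(x, 0.0f), 6.0f); }
// ACTIVATION_H_SWISH: x * clamp(x + 3, 0, 6) / 6 -- the fadd/fmax/fmin/fdiv/fmul chain
// in the aarch64 (V8) blocks above.
static inline float hswish_ref(float x) { return x * relu6_ref(x + 3.0f) / 6.0f; }
// The ARMv7 kernels have no vector divide: vrecpe yields a rough reciprocal
// estimate e of d, and vrecps returns (2 - d*e), giving one Newton-Raphson
// refinement step e' = e * (2 - d*e) before the multiply by 1/6.
static inline float refined_recip_ref(float d, float e) { return e * (2.0f - d * e); }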
- - -#include <string.h> - -#include "cpu/arm/fp32/tensor_computing_fp32.h" - -inline EE depthwise_convolution_transform_filter_kernel_fp32(TensorDesc filterDesc, const F32* filterArray, - TensorDesc *ftmDesc, F32* ftmArray, - DataFormat ftmDataFormat) -{ - if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) { - CHECK_STATUS(NULL_POINTER); - } - DataType fdt; - DataFormat fdf; - U32 fn, fc, fh, fw; - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - - if (fdf == ftmDataFormat) { - *ftmDesc = filterDesc; - if (fdf == DF_NCHWC8) { - memcpy(ftmArray, filterArray, fn*fc*fh*fw*bytesOf(fdt)); - return SUCCESS; - } - if (fdf == DF_CHWC8_NCN8) { - memcpy(ftmArray, filterArray, (fc*fh*fw + fc*fn)*bytesOf(fdt)); - return SUCCESS; - } - return NOT_SUPPORTED; - } - - switch (fdf) { - case DF_NCHW: { - if (ftmDataFormat == DF_NCHWC8) { - U32 ic = fc / 8; - for (U32 c = 0; c < ic; c++) { - for (U32 hw = 0; hw < fh*fw; hw++) { - for (U32 c8 = 0; c8 < 8; c8++) { - ftmArray[c*fh*fw*8 + hw*8 + c8] = filterArray[(c*8+c8)*fh*fw + hw]; - } - } - } - *ftmDesc = tensor4df(fdt, DF_NCHWC8, fn, fc, fh, fw); - } - else { - return NOT_SUPPORTED; - } - break; - } - case DF_CHW_NC: { - if (ftmDataFormat == DF_CHWC8_NCN8) { - const F32 *pwFilterArray = filterArray + fc*fh*fw; - F32 *pwFtmArray = ftmArray + fc*fh*fw; - U32 oc = fn / 8; - U32 ic = fc / 8; - for (U32 c = 0; c < ic; c++) { - for (U32 hw = 0; hw < fh*fw; hw++) { - for (U32 c8 = 0; c8 < 8; c8++) { - ftmArray[c*fh*fw*8 + hw*8 + c8] = filterArray[(c*8+c8)*fh*fw + hw]; - } - } - } - for (U32 o = 0; o < oc; o++) { - for (U32 c = 0; c < fc; c++) { - for (U32 o8 = 0; o8 < 8; o8++) { - pwFtmArray[o*fc*8 + c*8 + o8] = pwFilterArray[(o*8+o8)*fc + c]; - } - } - } - *ftmDesc = tensor4df(fdt, DF_CHWC8_NCN8, fn, fc, fh, fw); - } - else { - return NOT_SUPPORTED; - } - break; - } - default: - return NOT_SUPPORTED; - } - return SUCCESS; -} - -EE depthwise_convolution_transform_filter_fp32(TensorDesc filterDesc, const F32* filter, - DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, F32* filterTransformed) -{ - DataFormat ftmDataFormat; - switch (algorithm) { - case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: - ftmDataFormat = DF_NCHWC8; - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: - ftmDataFormat = DF_CHWC8_NCN8; - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT_NO_PADDING: - ftmDataFormat = DF_CHWC8_NCN8; - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_3X3S1P1: - ftmDataFormat = DF_CHWC8_NCN8; - break; - default: - return NOT_MATCH; - } - EE ret = depthwise_convolution_transform_filter_kernel_fp32(filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat); - CHECK_STATUS(ret); - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V7.cpp b/tensor_computing/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V7.cpp deleted file mode 100644 index 3fa04b9a..00000000 --- a/tensor_computing/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V7.cpp +++ /dev/null @@ -1,717 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
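[Reference note] The filter transform deleted above regroups weights so that blocks of 8 channels become the innermost dimension (DF_NCHWC8), letting the direct kernels load one 8-channel vector per filter tap; the DF_CHW_NC branch additionally repacks the pointwise weights into the NCN8 half of DF_CHWC8_NCN8. A scalar sketch of the core NCHW-to-NCHWC8 repack, with a hypothetical name and assuming fc is a multiple of 8 as the kernels require:

// Equivalent of the ftmArray[c*fh*fw*8 + hw*8 + c8] = filterArray[...] loop above.
static void nchw_to_nchwc8_ref(const float *src, float *dst, int fc, int fhfw)
{
    for (int cb = 0; cb < fc / 8; cb++)      // blocks of 8 channels
        for (int p = 0; p < fhfw; p++)       // fh*fw filter taps
            for (int c8 = 0; c8 < 8; c8++)
                dst[(cb * fhfw + p) * 8 + c8] = src[(cb * 8 + c8) * fhfw + p];
}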
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef __aarch64__ -#include "cpu/arm/fp32/depthwise_convolution.h" - -EE depthwise_pointwise_convolution_direct_V7(TensorDesc inputDesc, F32* inArray, - TensorDesc filterDesc, const F32* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F32* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F32* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (fdf != DF_CHWC8_NCN8) { - CHECK_STATUS(NOT_MATCH); - } - if (!(ic == fc && oc == fn)) { - CHECK_STATUS(NOT_MATCH); - } - - oc /= 8; - ic /= 8; - - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - U32 ihiw = ih*iw; - I32 ohow = oh*ow; - F32 *pwArray = (F32*)tmp + ic*ih_pad*iw_pad*8; - - for (U32 n = 0; n < in; n++) { - // copy input into a input with padding - F32 *inArray_pad = (F32*)tmp; - F32 *inArray_pad_mov = inArray_pad; - F32 *inArray_mov = inArray + n*ic*ihiw*8; - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(fdt)); - inArray_pad_mov += iw_pad*8; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*8*bytesOf(fdt)); - inArray_pad_mov += paddingL*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*bytesOf(fdt)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, paddingR*8*bytesOf(fdt)); - inArray_pad_mov += paddingR*8; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(fdt)); - inArray_pad_mov += iw_pad*8; - } - } - - // dw_conv - for (U32 c = 0; c < ic ; c++) { - const F32 *b = biasArray + c*8; - F32 *in_pad = inArray_pad + c*ih_pad*iw_pad*8; - const F32 *f = filterArray + c*fh*fw*8; - // ohow / 4 - for (I32 hw = 
0; hw < ohow-3; hw+=4) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - U32 in_h_1 = (hw+1)/ow*strideH; - U32 in_w_1 = (hw+1)%ow*strideW; - U32 in_h_2 = (hw+2)/ow*strideH; - U32 in_w_2 = (hw+2)%ow*strideW; - U32 in_h_3 = (hw+3)/ow*strideH; - U32 in_w_3 = (hw+3)%ow*strideW; - F32 *pw_pack_0 = pwArray + hw*ic*8 + c*4*8; - - __asm__ __volatile__( - "vld1.f32 {d0-d3}, [%[b]]\n" - "vmov.f32 q2, q0\n" - "vmov.f32 q3, q1\n" - "vmov.f32 q4, q0\n" - "vmov.f32 q5, q1\n" - "vmov.f32 q6, q0\n" - "vmov.f32 q7, q1\n" - : - :[b]"r"(b) - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F32 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F32 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F32 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - F32 *in_1 = in_idx + in_h_1*iw_pad*8 + in_w_1*8; - F32 *in_2 = in_idx + in_h_2*iw_pad*8 + in_w_2*8; - F32 *in_3 = in_idx + in_h_3*iw_pad*8 + in_w_3*8; - - __asm__ __volatile__( - "vld1.f32 {d28-d31}, [%[f0]]\n" - "vld1.f32 {d16-d19}, [%[in0]]\n" - "vld1.f32 {d20-d23}, [%[in1]]\n" - "vld1.f32 {d24-d27}, [%[in2]]\n" - - "vmla.f32 q0, q8, q14\n" - "vmla.f32 q1, q9, q15\n" - "vld1.f32 {d16-d19}, [%[in3]]\n" - "vmla.f32 q2, q10, q14\n" - "vmla.f32 q3, q11, q15\n" - "vmla.f32 q4, q12, q14\n" - "vmla.f32 q5, q13, q15\n" - "vmla.f32 q6, q8, q14\n" - "vmla.f32 q7, q9, q15\n" - : - :[in0]"r"(in_0), - [in1]"r"(in_1), - [in2]"r"(in_2), - [in3]"r"(in_3), - [f0]"r"(f_0) - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "veor q15, q15, q15\n" // zero - "vmax.f32 q0, q0, q15\n" - "vmax.f32 q1, q1, q15\n" - "vmax.f32 q2, q2, q15\n" - "vmax.f32 q3, q3, q15\n" - "vmax.f32 q4, q4, q15\n" - "vmax.f32 q5, q5, q15\n" - "vmax.f32 q6, q6, q15\n" - "vmax.f32 q7, q7, q15\n" - : - : - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q15" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "veor q15, q15, q15\n" // zero - "vmov.f32 q14, #6.0\n" // six - "vmax.f32 q0, q0, q15\n" - "vmax.f32 q1, q1, q15\n" - "vmax.f32 q2, q2, q15\n" - "vmax.f32 q3, q3, q15\n" - "vmax.f32 q4, q4, q15\n" - "vmax.f32 q5, q5, q15\n" - "vmax.f32 q6, q6, q15\n" - "vmax.f32 q7, q7, q15\n" - - "vmin.f32 q0, q0, q14\n" - "vmin.f32 q1, q1, q14\n" - "vmin.f32 q2, q2, q14\n" - "vmin.f32 q3, q3, q14\n" - "vmin.f32 q4, q4, q14\n" - "vmin.f32 q5, q5, q14\n" - "vmin.f32 q6, q6, q14\n" - "vmin.f32 q7, q7, q14\n" - : - : - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q14", "q15" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "vmov.f32 q13, #3.0\n" // three - "vmov.f32 q14, #6.0\n" // six - "veor q15, q15, q15\n" // zero - "vadd.f32 q8, q0, q13\n" - "vadd.f32 q9, q1, q13\n" - "vadd.f32 q10, q2, q13\n" - "vadd.f32 q11, q3, q13\n" - "vmax.f32 q8, q8, q15\n" - "vmax.f32 q9, q9, q15\n" - "vmax.f32 q10, q10, q15\n" - "vmax.f32 q11, q11, q15\n" - "vmin.f32 q8, q8, q14\n" - "vmin.f32 q9, q9, q14\n" - "vmin.f32 q10, q10, q14\n" - "vmin.f32 q11, q11, q14\n" - "vrecpe.f32 q12, q14\n" - "vrecps.f32 q14, q14, q12\n" - "vmul.f32 q12, q14, q12\n" - "vmul.f32 q8, q8, q12\n" - "vmul.f32 q9, q9, q12\n" - "vmul.f32 q10, q10, q12\n" - "vmul.f32 q11, q11, q12\n" - "vmov.f32 q14, #6.0\n" // six - "vmul.f32 q0, 
q0, q8\n" - "vmul.f32 q1, q1, q9\n" - "vmul.f32 q2, q2, q10\n" - "vmul.f32 q3, q3, q11\n" - - "vadd.f32 q8, q4, q13\n" - "vadd.f32 q9, q5, q13\n" - "vadd.f32 q10, q6, q13\n" - "vadd.f32 q11, q7, q13\n" - "vmax.f32 q8, q8, q15\n" - "vmax.f32 q9, q9, q15\n" - "vmax.f32 q10, q10, q15\n" - "vmax.f32 q11, q11, q15\n" - "vmin.f32 q8, q8, q14\n" - "vmin.f32 q9, q9, q14\n" - "vmin.f32 q10, q10, q14\n" - "vmin.f32 q11, q11, q14\n" - "vrecpe.f32 q12, q14\n" - "vrecps.f32 q14, q14, q12\n" - "vmul.f32 q12, q14, q12\n" - "vmul.f32 q8, q8, q12\n" - "vmul.f32 q9, q9, q12\n" - "vmul.f32 q10, q10, q12\n" - "vmul.f32 q11, q11, q12\n" - "vmul.f32 q4, q4, q8\n" - "vmul.f32 q5, q5, q9\n" - "vmul.f32 q6, q6, q10\n" - "vmul.f32 q7, q7, q11\n" - : - : - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "vzip.32 q0, q4\n" - "vzip.32 q2, q6\n" - "vzip.32 q1, q5\n" - "vzip.32 q3, q7\n" - - "vzip.32 q0, q2\n" - "vzip.32 q4, q6\n" - "vzip.32 q1, q3\n" - "vzip.32 q5, q7\n" - - "vst1.f32 {q0}, [%[pw0]]!\n" - "vst1.f32 {q2}, [%[pw0]]!\n" - "vst1.f32 {q4}, [%[pw0]]!\n" - "vst1.f32 {q6}, [%[pw0]]!\n" - "vst1.f32 {q1}, [%[pw0]]!\n" - "vst1.f32 {q3}, [%[pw0]]!\n" - "vst1.f32 {q5}, [%[pw0]]!\n" - "vst1.f32 {q7}, [%[pw0]]!\n" - : [pw0] "+r"(pw_pack_0) - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" - ); - } - - // ohow_reminder % 4 - U32 ohow_s = (ohow / 4) * 4; - for (I32 hw = ohow_s; hw < ohow; hw++) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - F32 *pw_pack_0 = pwArray + hw*ic*8 + c*8; - - __asm__ __volatile__( - "vld1.f32 {d0-d3}, [%[b]]\n" - : - :[b]"r"(b) - :"memory", "cc", "q0", "q1" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F32 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F32 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F32 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - __asm__ __volatile__( - "vld1.f32 {d28-d31}, [%[f0]]\n" - "vld1.f32 {d24-d27}, [%[in0]]\n" - - "vmla.f32 q0, q12, q14\n" - "vmla.f32 q1, q13, q15\n" - : - :[in0]"r"(in_0), - [f0]"r"(f_0) - :"memory", "cc", "q0", "q1", "q12", "q13", "q14", "q15" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "veor q15, q15, q15\n" // zero - "vmax.f32 q0, q0, q15\n" - "vmax.f32 q1, q1, q15\n" - : - : - :"memory", "cc", "q0", "q1", "q15" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "veor q15, q15, q15\n" // zero - "vmov.f32 q14, #6.0\n" // six - "vmax.f32 q0, q0, q15\n" - "vmax.f32 q1, q1, q15\n" - - "vmin.f32 q0, q0, q14\n" - "vmin.f32 q1, q1, q14\n" - : - : - :"memory", "cc", "q0", "q1", "q14", "q15" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "vmov.f32 q13, #3.0\n" // three - "vmov.f32 q14, #6.0\n" // six - "veor q15, q15, q15\n" // zero - "vadd.f32 q11, q0, q13\n" - "vadd.f32 q12, q1, q13\n" - - "vmax.f32 q11, q11, q15\n" - "vmax.f32 q12, q12, q15\n" - - "vmin.f32 q11, q11, q14\n" - "vmin.f32 q12, q12, q14\n" - - "vrecpe.f32 q13, q14\n" - "vrecps.f32 q14, q14, q13\n" - "vmul.f32 q14, q14, q13\n" - "vmul.f32 q11, q11, q14\n" - "vmul.f32 q12, q12, q14\n" - - "vmul.f32 q0, q0, q11\n" - "vmul.f32 q1, q1, q12\n" - : - : - :"memory", "cc", "q0", "q1", "q11", "q12", "q13", "q14", "q15" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - 
__asm__ __volatile__( - "vst1.f32 {d0-d3}, [%[pw0]]\n" - : [pw0] "+r"(pw_pack_0) - : - : "memory", "cc", "q0", "q1" - ); - } - } - - // pw_conv - // ohow / 4 - for (I32 hw = 0; hw < ohow-3; hw+=4) { - const F32 *b0 = biasArray + ic*8; - const F32 *b1 = b0 + 4; - F32 *in_pack = pwArray + hw*ic*8; - for (I32 o = 0; o < I32(oc); o++) { - F32 *in_hw0 = in_pack; - const F32 *f_o0c0 = filterArray + ic*fh*fw*8 + o*8*ic*8; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const F32 *b_o0 = b0; - const F32 *b_o1 = b1; - __asm__ __volatile__( - "vld1.f32 {d0-d1}, [%[b_0]]\n" - "vld1.f32 {d2-d3}, [%[b_1]]\n" - "vld1.f32 {d12-d13}, [%[in_0]]!\n" - "vld1.f32 {d20-d23}, [%[f_0]]!\n" - - "vmov.f32 q2, q0\n" - "vmov.f32 q4, q0\n" - "vmov.f32 q8, q0\n" - - "mov r2, %[ic]\n" - - "vmov.f32 q3, q1\n" - "vmov.f32 q5, q1\n" - "vmov.f32 q9, q1\n" - - "0:\n" - "vmla.f32 q0, q10, d12[0]\n" - "vmla.f32 q2, q10, d12[1]\n" - "vmla.f32 q4, q10, d13[0]\n" - "vmla.f32 q8, q10, d13[1]\n" - - "vld1.f32 {d14-d15}, [%[in_0]]!\n" - "vld1.f32 {d20-d21}, [%[f_0]]!\n" - - "vmla.f32 q1, q11, d12[0]\n" - "vmla.f32 q3, q11, d12[1]\n" - "vmla.f32 q5, q11, d13[0]\n" - "vmla.f32 q9, q11, d13[1]\n" - - "vld1.f32 {d22-d23}, [%[f_0]]!\n" - "subs r2, r2, #2\n" - - "vmla.f32 q0, q10, d14[0]\n" - "vmla.f32 q2, q10, d14[1]\n" - "vmla.f32 q4, q10, d15[0]\n" - "vmla.f32 q8, q10, d15[1]\n" - - "vld1.f32 {d12-d13}, [%[in_0]]!\n" - "vld1.f32 {d20-d21}, [%[f_0]]!\n" - - "vmla.f32 q1, q11, d14[0]\n" - "vmla.f32 q3, q11, d14[1]\n" - "vmla.f32 q5, q11, d15[0]\n" - "vmla.f32 q9, q11, d15[1]\n" - - "vld1.f32 {d22-d23}, [%[f_0]]!\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "r2" - ); - - // activation - switch (pointwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "veor q15, q15, q15\n" // zero - "vmax.f32 q0, q0, q15\n" - "vmax.f32 q1, q1, q15\n" - "vmax.f32 q2, q2, q15\n" - "vmax.f32 q3, q3, q15\n" - "vmax.f32 q4, q4, q15\n" - "vmax.f32 q5, q5, q15\n" - "vmax.f32 q8, q8, q15\n" - "vmax.f32 q9, q9, q15\n" - : - : - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9", "q15" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "veor q15, q15, q15\n" // zero - "vmov.f32 q14, #6.0\n" // six - "vmax.f32 q0, q0, q15\n" - "vmax.f32 q1, q1, q15\n" - "vmax.f32 q2, q2, q15\n" - "vmax.f32 q3, q3, q15\n" - "vmax.f32 q4, q4, q15\n" - "vmax.f32 q5, q5, q15\n" - "vmax.f32 q8, q8, q15\n" - "vmax.f32 q9, q9, q15\n" - - "vmin.f32 q0, q0, q14\n" - "vmin.f32 q1, q1, q14\n" - "vmin.f32 q2, q2, q14\n" - "vmin.f32 q3, q3, q14\n" - "vmin.f32 q4, q4, q14\n" - "vmin.f32 q5, q5, q14\n" - "vmin.f32 q8, q8, q14\n" - "vmin.f32 q9, q9, q14\n" - : - : - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9", "q14", "q15" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "vmov.f32 q6, q8\n" - "vmov.f32 q7, q9\n" - - "vmov.f32 q13, #3.0\n" // three - "vmov.f32 q14, #6.0\n" // six - "veor q15, q15, q15\n" // zero - "vadd.f32 q8, q0, q13\n" - "vadd.f32 q9, q1, q13\n" - "vadd.f32 q10, q2, q13\n" - "vadd.f32 q11, q3, q13\n" - "vmax.f32 q8, q8, q15\n" - "vmax.f32 q9, q9, q15\n" - "vmax.f32 q10, q10, q15\n" - "vmax.f32 q11, q11, q15\n" - "vmin.f32 q8, q8, q14\n" - "vmin.f32 q9, q9, q14\n" - "vmin.f32 q10, q10, q14\n" - "vmin.f32 q11, q11, q14\n" - "vrecpe.f32 q12, q14\n" - "vrecps.f32 
q14, q14, q12\n" - "vmul.f32 q12, q14, q12\n" - "vmul.f32 q8, q8, q12\n" - "vmul.f32 q9, q9, q12\n" - "vmul.f32 q10, q10, q12\n" - "vmul.f32 q11, q11, q12\n" - "vmov.f32 q14, #6.0\n" // six - "vmul.f32 q0, q0, q8\n" - "vmul.f32 q1, q1, q9\n" - "vmul.f32 q2, q2, q10\n" - "vmul.f32 q3, q3, q11\n" - - "vadd.f32 q8, q4, q13\n" - "vadd.f32 q9, q5, q13\n" - "vadd.f32 q10, q6, q13\n" - "vadd.f32 q11, q7, q13\n" - "vmax.f32 q8, q8, q15\n" - "vmax.f32 q9, q9, q15\n" - "vmax.f32 q10, q10, q15\n" - "vmax.f32 q11, q11, q15\n" - "vmin.f32 q8, q8, q14\n" - "vmin.f32 q9, q9, q14\n" - "vmin.f32 q10, q10, q14\n" - "vmin.f32 q11, q11, q14\n" - "vrecpe.f32 q12, q14\n" - "vrecps.f32 q14, q14, q12\n" - "vmul.f32 q12, q14, q12\n" - "vmul.f32 q8, q8, q12\n" - "vmul.f32 q9, q9, q12\n" - "vmul.f32 q10, q10, q12\n" - "vmul.f32 q11, q11, q12\n" - "vmul.f32 q4, q4, q8\n" - "vmul.f32 q5, q5, q9\n" - "vmul.f32 q6, q6, q10\n" - "vmul.f32 q7, q7, q11\n" - - "vmov.f32 q8, q6\n" - "vmov.f32 q9, q7\n" - : - : - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "vst1.f32 {d0-d3}, [%[out_0]]!\n" - "vst1.f32 {d4-d7}, [%[out_0]]!\n" - "vst1.f32 {d8-d11}, [%[out_0]]!\n" - "vst1.f32 {d16-d19}, [%[out_0]]!\n" - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9" - ); - b0 += 8; - b1 += 8; - } - } - - // ohow_reminder % 4 - U32 ohow_s = (ohow / 4) * 4; - for (I32 hw = ohow_s; hw < ohow; hw++) { - const F32 *b0 = biasArray + ic*8; - const F32 *b1 = b0 + 4; - F32 *in_pack = pwArray + hw*ic*8; - for (I32 o = 0; o < I32(oc); o++) { - F32 *in_hw0 = in_pack; - const F32 *f_o0c0 = filterArray + ic*fh*fw*8 + o*8*ic*8; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const F32 *b_o0 = b0; - const F32 *b_o1 = b1; - __asm__ __volatile__( - "vld1.f32 {d0-d1}, [%[b_0]]\n" - "vld1.f32 {d2-d3}, [%[b_1]]\n" - "vld1.f32 {d8}, [%[in_0]]!\n" - "vld1.f32 {d4-d7}, [%[f_0]]!\n" - "mov r2, %[ic]\n" - "0:\n" - "vmla.f32 q0, q2, d8[0]\n" - - "vld1.f32 {d4-d5}, [%[f_0]]!\n" - - "vmla.f32 q1, q3, d8[0]\n" - - "vld1.f32 {d6-d7}, [%[f_0]]!\n" - "subs r2, r2, #2\n" - - "vmla.f32 q0, q2, d8[1]\n" - - "vld1.f32 {d4-d5}, [%[f_0]]!\n" - - "vmla.f32 q1, q3, d8[1]\n" - - "vld1.f32 {d8}, [%[in_0]]!\n" - "vld1.f32 {d6-d7}, [%[f_0]]!\n" - "bne 0b\n" - :[in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1) - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "r2" - ); - - switch (pointwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "veor q15, q15, q15\n" // zero - "vmax.f32 q0, q0, q15\n" - "vmax.f32 q1, q1, q15\n" - : - : - :"memory", "cc", "q0", "q1", "q15" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "veor q15, q15, q15\n" // zero - "vmov.f32 q14, #6.0\n" // six - "vmax.f32 q0, q0, q15\n" - "vmax.f32 q1, q1, q15\n" - - "vmin.f32 q0, q0, q14\n" - "vmin.f32 q1, q1, q14\n" - : - : - :"memory", "cc", "q0", "q1", "q14", "q15" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "vmov.f32 q13, #3.0\n" // three - "vmov.f32 q14, #6.0\n" // six - "veor q15, q15, q15\n" // zero - "vadd.f32 q11, q0, q13\n" - "vadd.f32 q12, q1, q13\n" - - "vmax.f32 q11, q11, q15\n" - "vmax.f32 q12, q12, q15\n" - - "vmin.f32 q11, q11, q14\n" - "vmin.f32 q12, q12, q14\n" - - "vrecpe.f32 q13, q14\n" - "vrecps.f32 q14, q14, q13\n" - 
"vmul.f32 q14, q14, q13\n" - "vmul.f32 q11, q11, q14\n" - "vmul.f32 q12, q12, q14\n" - - "vmul.f32 q0, q0, q11\n" - "vmul.f32 q1, q1, q12\n" - : - : - :"memory", "cc", "q0", "q1", "q11", "q12", "q13", "q14", "q15" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "vst1.f32 {d0-d3}, [%[out_0]]\n" - :[out_0]"+r"(out_o0hw0) - : - :"memory", "cc", "q0", "q1" - ); - b0 += 8; - b1 += 8; - } - } - } - return SUCCESS; -} -#endif diff --git a/tensor_computing/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V8.cpp b/tensor_computing/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V8.cpp deleted file mode 100644 index 9b7348e3..00000000 --- a/tensor_computing/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V8.cpp +++ /dev/null @@ -1,1268 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#ifdef __aarch64__ -#include "cpu/arm/fp32/depthwise_convolution.h" - -EE depthwise_pointwise_convolution_direct_V8(TensorDesc inputDesc, F32* inArray, - TensorDesc filterDesc, const F32* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F32* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F32* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (fdf != DF_CHWC8_NCN8) { - CHECK_STATUS(NOT_MATCH); - } - if (!(ic == fc && oc == fn)) { - CHECK_STATUS(NOT_MATCH); - } - - oc /= 8; - ic /= 8; - - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - U32 ihiw = ih*iw; - I32 ohow = oh*ow; - F32 *pwArray = (F32*)tmp + ic*ih_pad*iw_pad*8; - - for (U32 n = 0; n < in; n++) { - // copy input into a input with padding - F32 *inArray_pad = (F32*)tmp; - F32 *inArray_pad_mov = inArray_pad; - F32 *inArray_mov = inArray + n*ic*ihiw*8; - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(fdt)); - inArray_pad_mov += iw_pad*8; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*8*bytesOf(fdt)); - inArray_pad_mov += paddingL*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*bytesOf(fdt)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, paddingR*8*bytesOf(fdt)); - inArray_pad_mov += paddingR*8; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(fdt)); - inArray_pad_mov += iw_pad*8; - } - } - - // dw_conv - for (U32 c = 0; c < ic ; c++) { - const F32 *b = biasArray + c*8; - F32 *in_pad = inArray_pad + c*ih_pad*iw_pad*8; - const F32 *f = filterArray + c*fh*fw*8; - // ohow / 8 - for (I32 hw = 0; hw < ohow-7; hw+=8) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - U32 in_h_1 = (hw+1)/ow*strideH; - U32 in_w_1 = (hw+1)%ow*strideW; - U32 in_h_2 = (hw+2)/ow*strideH; - U32 in_w_2 = (hw+2)%ow*strideW; - U32 in_h_3 = (hw+3)/ow*strideH; - U32 in_w_3 = (hw+3)%ow*strideW; - U32 in_h_4 = (hw+4)/ow*strideH; - U32 in_w_4 = (hw+4)%ow*strideW; - U32 in_h_5 = (hw+5)/ow*strideH; - U32 in_w_5 = (hw+5)%ow*strideW; - U32 in_h_6 = (hw+6)/ow*strideH; - U32 in_w_6 = (hw+6)%ow*strideW; - U32 in_h_7 = (hw+7)/ow*strideH; - U32 in_w_7 = (hw+7)%ow*strideW; - F32 *pw_pack_0 = pwArray + hw*ic*8 + c*8*8; - - __asm__ __volatile__( - "ldr q14, [%[b]]\n" - "ldr q15, [%[b], #16]\n" - "mov v0.16b, v14.16b\n" - "mov v1.16b, v15.16b\n" - "mov v2.16b, v14.16b\n" - "mov v3.16b, v15.16b\n" - "mov v4.16b, v14.16b\n" - "mov v5.16b, v15.16b\n" - "mov v6.16b, v14.16b\n" - "mov v7.16b, v15.16b\n" - "mov v8.16b, v14.16b\n" - "mov v9.16b, v15.16b\n" - "mov v10.16b, v14.16b\n" - "mov v11.16b, v15.16b\n" - "mov v12.16b, v14.16b\n" - "mov v13.16b, v15.16b\n" - : - : [b] 
"r"(b) - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F32 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F32 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F32 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - F32 *in_1 = in_idx + in_h_1*iw_pad*8 + in_w_1*8; - F32 *in_2 = in_idx + in_h_2*iw_pad*8 + in_w_2*8; - F32 *in_3 = in_idx + in_h_3*iw_pad*8 + in_w_3*8; - F32 *in_4 = in_idx + in_h_4*iw_pad*8 + in_w_4*8; - F32 *in_5 = in_idx + in_h_5*iw_pad*8 + in_w_5*8; - F32 *in_6 = in_idx + in_h_6*iw_pad*8 + in_w_6*8; - F32 *in_7 = in_idx + in_h_7*iw_pad*8 + in_w_7*8; - __asm__ __volatile__( - "ldp q16, q17, [%[f0]]\n" - "ldp q30, q31, [%[in0]]\n" - "ldp q18, q19, [%[in1]]\n" - "ldp q20, q21, [%[in2]]\n" - "ldp q22, q23, [%[in3]]\n" - "ldp q24, q25, [%[in4]]\n" - "ldp q26, q27, [%[in5]]\n" - "ldp q28, q29, [%[in6]]\n" - - "fmla v0.4s, v30.4s, v16.4s\n" - "fmla v1.4s, v31.4s, v17.4s\n" - "fmla v2.4s, v18.4s, v16.4s\n" - "ldp q30, q31, [%[in7]]\n" - "fmla v3.4s, v19.4s, v17.4s\n" - "fmla v4.4s, v20.4s, v16.4s\n" - "fmla v5.4s, v21.4s, v17.4s\n" - "fmla v6.4s, v22.4s, v16.4s\n" - "fmla v7.4s, v23.4s, v17.4s\n" - "fmla v8.4s, v24.4s, v16.4s\n" - "fmla v9.4s, v25.4s, v17.4s\n" - "fmla v10.4s, v26.4s, v16.4s\n" - "fmla v11.4s, v27.4s, v17.4s\n" - "fmla v12.4s, v28.4s, v16.4s\n" - "fmla v13.4s, v29.4s, v17.4s\n" - "fmla v14.4s, v30.4s, v16.4s\n" - "fmla v15.4s, v31.4s, v17.4s\n" - : - : [in0] "r"(in_0), - [in1] "r"(in_1), - [in2] "r"(in_2), - [in3] "r"(in_3), - [in4] "r"(in_4), - [in5] "r"(in_5), - [in6] "r"(in_6), - [in7] "r"(in_7), - [f0] "r"(f_0) - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v0.4s, v0.4s, v31.4s\n" - "fmax v1.4s, v1.4s, v31.4s\n" - "fmax v2.4s, v2.4s, v31.4s\n" - "fmax v3.4s, v3.4s, v31.4s\n" - "fmax v4.4s, v4.4s, v31.4s\n" - "fmax v5.4s, v5.4s, v31.4s\n" - "fmax v6.4s, v6.4s, v31.4s\n" - "fmax v7.4s, v7.4s, v31.4s\n" - "fmax v8.4s, v8.4s, v31.4s\n" - "fmax v9.4s, v9.4s, v31.4s\n" - "fmax v10.4s, v10.4s, v31.4s\n" - "fmax v11.4s, v11.4s, v31.4s\n" - "fmax v12.4s, v12.4s, v31.4s\n" - "fmax v13.4s, v13.4s, v31.4s\n" - "fmax v14.4s, v14.4s, v31.4s\n" - "fmax v15.4s, v15.4s, v31.4s\n" - : - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v31" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmov v30.4s, 6.0\n" // six - "fmax v0.4s, v0.4s, v31.4s\n" - "fmax v1.4s, v1.4s, v31.4s\n" - "fmax v2.4s, v2.4s, v31.4s\n" - "fmax v3.4s, v3.4s, v31.4s\n" - "fmax v4.4s, v4.4s, v31.4s\n" - "fmax v5.4s, v5.4s, v31.4s\n" - "fmax v6.4s, v6.4s, v31.4s\n" - "fmax v7.4s, v7.4s, v31.4s\n" - "fmax v8.4s, v8.4s, v31.4s\n" - "fmax v9.4s, v9.4s, v31.4s\n" - "fmax v10.4s, v10.4s, v31.4s\n" - "fmax v11.4s, v11.4s, v31.4s\n" - "fmax v12.4s, v12.4s, v31.4s\n" - "fmax v13.4s, v13.4s, v31.4s\n" - "fmax v14.4s, v14.4s, v31.4s\n" - "fmax v15.4s, v15.4s, v31.4s\n" - - "fmin v0.4s, v0.4s, v30.4s\n" - "fmin 
v1.4s, v1.4s, v30.4s\n" - "fmin v2.4s, v2.4s, v30.4s\n" - "fmin v3.4s, v3.4s, v30.4s\n" - "fmin v4.4s, v4.4s, v30.4s\n" - "fmin v5.4s, v5.4s, v30.4s\n" - "fmin v6.4s, v6.4s, v30.4s\n" - "fmin v7.4s, v7.4s, v30.4s\n" - "fmin v8.4s, v8.4s, v30.4s\n" - "fmin v9.4s, v9.4s, v30.4s\n" - "fmin v10.4s, v10.4s, v30.4s\n" - "fmin v11.4s, v11.4s, v30.4s\n" - "fmin v12.4s, v12.4s, v30.4s\n" - "fmin v13.4s, v13.4s, v30.4s\n" - "fmin v14.4s, v14.4s, v30.4s\n" - "fmin v15.4s, v15.4s, v30.4s\n" - : - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v30", "v31" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "fmov v29.4s, 3.0\n" // three - "fmov v30.4s, 6.0\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v22.4s, v0.4s, v29.4s\n" - "fadd v23.4s, v1.4s, v29.4s\n" - "fadd v16.4s, v2.4s, v29.4s\n" - "fadd v17.4s, v3.4s, v29.4s\n" - "fadd v18.4s, v4.4s, v29.4s\n" - "fadd v19.4s, v5.4s, v29.4s\n" - "fadd v20.4s, v6.4s, v29.4s\n" - "fadd v21.4s, v7.4s, v29.4s\n" - - "fmax v22.4s, v22.4s, v31.4s\n" - "fmax v23.4s, v23.4s, v31.4s\n" - "fmax v16.4s, v16.4s, v31.4s\n" - "fmax v17.4s, v17.4s, v31.4s\n" - "fmax v18.4s, v18.4s, v31.4s\n" - "fmax v19.4s, v19.4s, v31.4s\n" - "fmax v20.4s, v20.4s, v31.4s\n" - "fmax v21.4s, v21.4s, v31.4s\n" - - "fmin v22.4s, v22.4s, v30.4s\n" - "fmin v23.4s, v23.4s, v30.4s\n" - "fmin v16.4s, v16.4s, v30.4s\n" - "fmin v17.4s, v17.4s, v30.4s\n" - "fmin v18.4s, v18.4s, v30.4s\n" - "fmin v19.4s, v19.4s, v30.4s\n" - "fmin v20.4s, v20.4s, v30.4s\n" - "fmin v21.4s, v21.4s, v30.4s\n" - - "fdiv v22.4s, v22.4s, v30.4s\n" - "fdiv v23.4s, v23.4s, v30.4s\n" - "fdiv v16.4s, v16.4s, v30.4s\n" - "fdiv v17.4s, v17.4s, v30.4s\n" - "fdiv v18.4s, v18.4s, v30.4s\n" - "fdiv v19.4s, v19.4s, v30.4s\n" - "fdiv v20.4s, v20.4s, v30.4s\n" - "fdiv v21.4s, v21.4s, v30.4s\n" - - "fmul v0.4s, v0.4s, v22.4s\n" - "fmul v1.4s, v1.4s, v23.4s\n" - "fmul v2.4s, v2.4s, v16.4s\n" - "fmul v3.4s, v3.4s, v17.4s\n" - "fmul v4.4s, v4.4s, v18.4s\n" - "fmul v5.4s, v5.4s, v19.4s\n" - "fmul v6.4s, v6.4s, v20.4s\n" - "fmul v7.4s, v7.4s, v21.4s\n" - - "fadd v22.4s, v8.4s, v29.4s\n" - "fadd v23.4s, v9.4s, v29.4s\n" - "fadd v16.4s, v10.4s, v29.4s\n" - "fadd v17.4s, v11.4s, v29.4s\n" - "fadd v18.4s, v12.4s, v29.4s\n" - "fadd v19.4s, v13.4s, v29.4s\n" - "fadd v20.4s, v14.4s, v29.4s\n" - "fadd v21.4s, v15.4s, v29.4s\n" - - "fmax v22.4s, v22.4s, v31.4s\n" - "fmax v23.4s, v23.4s, v31.4s\n" - "fmax v16.4s, v16.4s, v31.4s\n" - "fmax v17.4s, v17.4s, v31.4s\n" - "fmax v18.4s, v18.4s, v31.4s\n" - "fmax v19.4s, v19.4s, v31.4s\n" - "fmax v20.4s, v20.4s, v31.4s\n" - "fmax v21.4s, v21.4s, v31.4s\n" - - "fmin v22.4s, v22.4s, v30.4s\n" - "fmin v23.4s, v23.4s, v30.4s\n" - "fmin v16.4s, v16.4s, v30.4s\n" - "fmin v17.4s, v17.4s, v30.4s\n" - "fmin v18.4s, v18.4s, v30.4s\n" - "fmin v19.4s, v19.4s, v30.4s\n" - "fmin v20.4s, v20.4s, v30.4s\n" - "fmin v21.4s, v21.4s, v30.4s\n" - - "fdiv v22.4s, v22.4s, v30.4s\n" - "fdiv v23.4s, v23.4s, v30.4s\n" - "fdiv v16.4s, v16.4s, v30.4s\n" - "fdiv v17.4s, v17.4s, v30.4s\n" - "fdiv v18.4s, v18.4s, v30.4s\n" - "fdiv v19.4s, v19.4s, v30.4s\n" - "fdiv v20.4s, v20.4s, v30.4s\n" - "fdiv v21.4s, v21.4s, v30.4s\n" - - "fmul v8.4s, v8.4s, v22.4s\n" - "fmul v9.4s, v9.4s, v23.4s\n" - "fmul v10.4s, v10.4s, v16.4s\n" - "fmul v11.4s, v11.4s, v17.4s\n" - "fmul v12.4s, v12.4s, v18.4s\n" - "fmul v13.4s, v13.4s, v19.4s\n" - "fmul v14.4s, v14.4s, v20.4s\n" - "fmul v15.4s, v15.4s, v21.4s\n" - : - : - : "memory", "cc", "v0", "v1", 
"v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v29", "v30", "v31" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "zip1 v16.4s, v0.4s, v2.4s\n" - "zip2 v17.4s, v0.4s, v2.4s\n" - "zip1 v18.4s, v4.4s, v6.4s\n" - "zip2 v19.4s, v4.4s, v6.4s\n" - "zip1 v0.2d, v16.2d, v18.2d\n" - "zip2 v2.2d, v16.2d, v18.2d\n" - "zip1 v4.2d, v17.2d, v19.2d\n" - "zip2 v6.2d, v17.2d, v19.2d\n" - - "zip1 v16.4s, v8.4s, v10.4s\n" - "zip2 v17.4s, v8.4s, v10.4s\n" - "zip1 v18.4s, v12.4s, v14.4s\n" - "zip2 v19.4s, v12.4s, v14.4s\n" - "zip1 v8.2d, v16.2d, v18.2d\n" - "zip2 v10.2d, v16.2d, v18.2d\n" - "zip1 v12.2d, v17.2d, v19.2d\n" - "zip2 v14.2d, v17.2d, v19.2d\n" - - "zip1 v16.4s, v1.4s, v3.4s\n" - "zip2 v17.4s, v1.4s, v3.4s\n" - "zip1 v18.4s, v5.4s, v7.4s\n" - "zip2 v19.4s, v5.4s, v7.4s\n" - "zip1 v1.2d, v16.2d, v18.2d\n" - "zip2 v3.2d, v16.2d, v18.2d\n" - "zip1 v5.2d, v17.2d, v19.2d\n" - "zip2 v7.2d, v17.2d, v19.2d\n" - - "zip1 v16.4s, v9.4s, v11.4s\n" - "zip2 v17.4s, v9.4s, v11.4s\n" - "zip1 v18.4s, v13.4s, v15.4s\n" - "zip2 v19.4s, v13.4s, v15.4s\n" - "zip1 v9.2d, v16.2d, v18.2d\n" - "zip2 v11.2d, v16.2d, v18.2d\n" - "zip1 v13.2d, v17.2d, v19.2d\n" - "zip2 v15.2d, v17.2d, v19.2d\n" - - "str q0, [%[pw0]]\n" - "str q8, [%[pw0], #16]\n" - "str q2, [%[pw0], #32]\n" - "str q10, [%[pw0], #48]\n" - "str q4, [%[pw0], #64]\n" - "str q12, [%[pw0], #80]\n" - "str q6, [%[pw0], #96]\n" - "str q14, [%[pw0], #112]\n" - "str q1, [%[pw0], #128]\n" - "str q9, [%[pw0], #144]\n" - "str q3, [%[pw0], #160]\n" - "str q11, [%[pw0], #176]\n" - "str q5, [%[pw0], #192]\n" - "str q13, [%[pw0], #208]\n" - "str q7, [%[pw0], #224]\n" - "str q15, [%[pw0], #240]\n" - : [pw0] "+r"(pw_pack_0) - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19" - ); - } - - // ohow_reminder % 8 / 4 - U32 ohow_s = (ohow / 8) * 8; - for (I32 hw = ohow_s; hw < ohow-3; hw+=4) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - U32 in_h_1 = (hw+1)/ow*strideH; - U32 in_w_1 = (hw+1)%ow*strideW; - U32 in_h_2 = (hw+2)/ow*strideH; - U32 in_w_2 = (hw+2)%ow*strideW; - U32 in_h_3 = (hw+3)/ow*strideH; - U32 in_w_3 = (hw+3)%ow*strideW; - F32 *pw_pack_0 = pwArray + hw*ic*8 + c*8*4; - - __asm__ __volatile__( - "ldr q14, [%[b]]\n" - "ldr q15, [%[b], #16]\n" - "mov v0.16b, v14.16b\n" - "mov v1.16b, v15.16b\n" - "mov v2.16b, v14.16b\n" - "mov v3.16b, v15.16b\n" - "mov v4.16b, v14.16b\n" - "mov v5.16b, v15.16b\n" - "mov v6.16b, v14.16b\n" - "mov v7.16b, v15.16b\n" - : - : [b] "r"(b) - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v14", "v15" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F32 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F32 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F32 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - F32 *in_1 = in_idx + in_h_1*iw_pad*8 + in_w_1*8; - F32 *in_2 = in_idx + in_h_2*iw_pad*8 + in_w_2*8; - F32 *in_3 = in_idx + in_h_3*iw_pad*8 + in_w_3*8; - __asm__ __volatile__( - "ldp q14, q15, [%[f0]]\n" - "ldp q16, q17, [%[in0]]\n" - "ldp q18, q19, [%[in1]]\n" - "ldp q20, q21, [%[in2]]\n" - "ldp q22, q23, [%[in3]]\n" - - "fmla v0.4s, v16.4s, v14.4s\n" - "fmla v1.4s, v17.4s, v15.4s\n" - "fmla v2.4s, v18.4s, v14.4s\n" - "fmla v3.4s, v19.4s, v15.4s\n" - "fmla v4.4s, v20.4s, v14.4s\n" - "fmla v5.4s, v21.4s, v15.4s\n" - 
"fmla v6.4s, v22.4s, v14.4s\n" - "fmla v7.4s, v23.4s, v15.4s\n" - : - : [in0] "r"(in_0), - [in1] "r"(in_1), - [in2] "r"(in_2), - [in3] "r"(in_3), - [f0] "r"(f_0) - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v14", "v15", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v0.4s, v0.4s, v31.4s\n" - "fmax v1.4s, v1.4s, v31.4s\n" - "fmax v2.4s, v2.4s, v31.4s\n" - "fmax v3.4s, v3.4s, v31.4s\n" - "fmax v4.4s, v4.4s, v31.4s\n" - "fmax v5.4s, v5.4s, v31.4s\n" - "fmax v6.4s, v6.4s, v31.4s\n" - "fmax v7.4s, v7.4s, v31.4s\n" - : - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v31" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmov v30.4s, 6.0\n" // six - "fmax v0.4s, v0.4s, v31.4s\n" - "fmax v1.4s, v1.4s, v31.4s\n" - "fmax v2.4s, v2.4s, v31.4s\n" - "fmax v3.4s, v3.4s, v31.4s\n" - "fmax v4.4s, v4.4s, v31.4s\n" - "fmax v5.4s, v5.4s, v31.4s\n" - "fmax v6.4s, v6.4s, v31.4s\n" - "fmax v7.4s, v7.4s, v31.4s\n" - - "fmin v0.4s, v0.4s, v30.4s\n" - "fmin v1.4s, v1.4s, v30.4s\n" - "fmin v2.4s, v2.4s, v30.4s\n" - "fmin v3.4s, v3.4s, v30.4s\n" - "fmin v4.4s, v4.4s, v30.4s\n" - "fmin v5.4s, v5.4s, v30.4s\n" - "fmin v6.4s, v6.4s, v30.4s\n" - "fmin v7.4s, v7.4s, v30.4s\n" - : - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v30", "v31" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "fmov v29.4s, 3.0\n" // three - "fmov v30.4s, 6.0\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v14.4s, v0.4s, v29.4s\n" - "fadd v15.4s, v1.4s, v29.4s\n" - "fadd v16.4s, v2.4s, v29.4s\n" - "fadd v17.4s, v3.4s, v29.4s\n" - "fadd v18.4s, v4.4s, v29.4s\n" - "fadd v19.4s, v5.4s, v29.4s\n" - "fadd v20.4s, v6.4s, v29.4s\n" - "fadd v21.4s, v7.4s, v29.4s\n" - - "fmax v14.4s, v14.4s, v31.4s\n" - "fmax v15.4s, v15.4s, v31.4s\n" - "fmax v16.4s, v16.4s, v31.4s\n" - "fmax v17.4s, v17.4s, v31.4s\n" - "fmax v18.4s, v18.4s, v31.4s\n" - "fmax v19.4s, v19.4s, v31.4s\n" - "fmax v20.4s, v20.4s, v31.4s\n" - "fmax v21.4s, v21.4s, v31.4s\n" - - "fmin v14.4s, v14.4s, v30.4s\n" - "fmin v15.4s, v15.4s, v30.4s\n" - "fmin v16.4s, v16.4s, v30.4s\n" - "fmin v17.4s, v17.4s, v30.4s\n" - "fmin v18.4s, v18.4s, v30.4s\n" - "fmin v19.4s, v19.4s, v30.4s\n" - "fmin v20.4s, v20.4s, v30.4s\n" - "fmin v21.4s, v21.4s, v30.4s\n" - - "fdiv v14.4s, v14.4s, v30.4s\n" - "fdiv v15.4s, v15.4s, v30.4s\n" - "fdiv v16.4s, v16.4s, v30.4s\n" - "fdiv v17.4s, v17.4s, v30.4s\n" - "fdiv v18.4s, v18.4s, v30.4s\n" - "fdiv v19.4s, v19.4s, v30.4s\n" - "fdiv v20.4s, v20.4s, v30.4s\n" - "fdiv v21.4s, v21.4s, v30.4s\n" - - "fmul v0.4s, v0.4s, v14.4s\n" - "fmul v1.4s, v1.4s, v15.4s\n" - "fmul v2.4s, v2.4s, v16.4s\n" - "fmul v3.4s, v3.4s, v17.4s\n" - "fmul v4.4s, v4.4s, v18.4s\n" - "fmul v5.4s, v5.4s, v19.4s\n" - "fmul v6.4s, v6.4s, v20.4s\n" - "fmul v7.4s, v7.4s, v21.4s\n" - : - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v14", "v15", "v16", "v17", - "v18", "v19", "v20", "v21", "v29", "v30", "v31" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "zip1 v16.4s, v0.4s, v2.4s\n" - "zip2 v17.4s, v0.4s, v2.4s\n" - "zip1 v18.4s, v4.4s, v6.4s\n" - "zip2 v19.4s, v4.4s, v6.4s\n" - "zip1 v0.2d, v16.2d, v18.2d\n" - "zip2 v2.2d, v16.2d, v18.2d\n" - "zip1 v4.2d, 
v17.2d, v19.2d\n" - "zip2 v6.2d, v17.2d, v19.2d\n" - - "zip1 v16.4s, v1.4s, v3.4s\n" - "zip2 v17.4s, v1.4s, v3.4s\n" - "zip1 v18.4s, v5.4s, v7.4s\n" - "zip2 v19.4s, v5.4s, v7.4s\n" - "zip1 v1.2d, v16.2d, v18.2d\n" - "zip2 v3.2d, v16.2d, v18.2d\n" - "zip1 v5.2d, v17.2d, v19.2d\n" - "zip2 v7.2d, v17.2d, v19.2d\n" - - "str q0, [%[pw0]]\n" - "str q2, [%[pw0], #16]\n" - "str q4, [%[pw0], #32]\n" - "str q6, [%[pw0], #48]\n" - "str q1, [%[pw0], #64]\n" - "str q3, [%[pw0], #80]\n" - "str q5, [%[pw0], #96]\n" - "str q7, [%[pw0], #112]\n" - : [pw0] "+r"(pw_pack_0) - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19" - ); - } - - // ohow_reminder % 4 - ohow_s = (ohow / 4) * 4; - for (I32 hw = ohow_s; hw < ohow; hw++) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - F32 *pw_pack_0 = pwArray + hw*ic*8 + c*8; - - __asm__ __volatile__( - "ldr q0, [%[b]]\n" - "ldr q1, [%[b], #16]\n" - : - : [b] "r"(b) - : "memory", "cc", "v0", "v1" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const F32 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - F32 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - F32 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - __asm__ __volatile__( - "ldp q14, q15, [%[f0]]\n" - "ldp q16, q17, [%[in0]]\n" - - "fmla v0.4s, v16.4s, v14.4s\n" - "fmla v1.4s, v17.4s, v15.4s\n" - : - : [in0] "r"(in_0), - [f0] "r"(f_0) - : "memory", "cc", "v0", "v1", "v14", "v15" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: - break; - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v0.4s, v0.4s, v31.4s\n" - "fmax v1.4s, v1.4s, v31.4s\n" - : - : - : "memory", "cc", "v0", "v1", "v31" - ); - break; - } - case ACTIVATION_RELU6:{ - __asm__ __volatile__( - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmov v30.4s, 6.0\n" // six - "fmax v0.4s, v0.4s, v31.4s\n" - "fmax v1.4s, v1.4s, v31.4s\n" - - "fmin v0.4s, v0.4s, v30.4s\n" - "fmin v1.4s, v1.4s, v30.4s\n" - : - : - : "memory", "cc", "v0", "v1", "v30", "v31" - ); - break; - } - case ACTIVATION_H_SWISH:{ - __asm__ __volatile__( - "fmov v29.4s, 3.0\n" // three - "fmov v30.4s, 6.0\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v14.4s, v0.4s, v29.4s\n" - "fadd v15.4s, v1.4s, v29.4s\n" - - "fmax v14.4s, v14.4s, v31.4s\n" - "fmax v15.4s, v15.4s, v31.4s\n" - - "fmin v14.4s, v14.4s, v30.4s\n" - "fmin v15.4s, v15.4s, v30.4s\n" - - "fdiv v14.4s, v14.4s, v30.4s\n" - "fdiv v15.4s, v15.4s, v30.4s\n" - - "fmul v0.4s, v0.4s, v14.4s\n" - "fmul v1.4s, v1.4s, v15.4s\n" - : - : - : "memory", "cc", "v0", "v1", "v14", "v15", "v29", "v30", "v31" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - __asm__ __volatile__( - "stp q0, q1, [%[pw0]]\n" - : [pw0] "+r"(pw_pack_0) - : - : "memory", "cc", "v0", "v1" - ); - } - } - - // pw_conv - // ohow / 8 - for (I32 hw = 0; hw < ohow-7; hw+=8) { - const F32 *b0 = biasArray + ic*8; - const F32 *b1 = b0 + 4; - F32 *in_pack = pwArray + hw*ic*8; - const F32 *f_o0c0 = filterArray + ic*fh*fw*8; - for (I32 o = 0; o < I32(oc); o++) { - F32 *in_hw0 = in_pack; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const F32 *b_o0 = b0; - const F32 *b_o1 = b1; - __asm__ __volatile__( - "ldr q24, [%[b_0]]\n" // b_O0o[0:3] - "ldr q25, [%[b_1]]\n" // b_O1o[0:3] - "mov x0, %[ic]\n" //ic_blk - "mov v4.16b, v24.16b\n" - "ldr q0, [%[in_0]]\n" //in_hw0 - "mov v5.16b, v24.16b\n" - "ldr q1, [%[in_0], #16]\n" //in_hw0 
- "mov v6.16b, v24.16b\n" - "ldr q20, [%[f_0]]\n" //f_o0c0 - "mov v7.16b, v24.16b\n" - "ldr q21, [%[f_0], #16]\n" //f_o0c0 - "mov v8.16b, v24.16b\n" - "mov v9.16b, v24.16b\n" - "mov v10.16b, v24.16b\n" - "mov v11.16b, v24.16b\n" - "mov v12.16b, v25.16b\n" - "mov v13.16b, v25.16b\n" - "mov v14.16b, v25.16b\n" - "mov v15.16b, v25.16b\n" - "mov v16.16b, v25.16b\n" - "mov v17.16b, v25.16b\n" - "mov v18.16b, v25.16b\n" - "mov v19.16b, v25.16b\n" - - "0:\n" - "fmla v4.4s, v20.4s, v0.s[0]\n" - "ldr q2, [%[in_0], #32]\n" - "fmla v5.4s, v20.4s, v0.s[1]\n" - "ldr q3, [%[in_0], #48]\n" - "fmla v6.4s, v20.4s, v0.s[2]\n" - "ldr q22, [%[f_0], #32]\n" - "fmla v7.4s, v20.4s, v0.s[3]\n" - "ldr q23, [%[f_0], #48]\n" - "fmla v8.4s, v20.4s, v1.s[0]\n" - "fmla v9.4s, v20.4s, v1.s[1]\n" - "fmla v10.4s, v20.4s, v1.s[2]\n" - "fmla v11.4s, v20.4s, v1.s[3]\n" - "fmla v12.4s, v21.4s, v0.s[0]\n" - "fmla v13.4s, v21.4s, v0.s[1]\n" - "fmla v14.4s, v21.4s, v0.s[2]\n" - "fmla v15.4s, v21.4s, v0.s[3]\n" - "fmla v16.4s, v21.4s, v1.s[0]\n" - "fmla v17.4s, v21.4s, v1.s[1]\n" - "fmla v18.4s, v21.4s, v1.s[2]\n" - "fmla v19.4s, v21.4s, v1.s[3]\n" - - "fmla v4.4s, v22.4s, v2.s[0]\n" - "ldr q0, [%[in_0], #64]!\n" - "fmla v5.4s, v22.4s, v2.s[1]\n" - "ldr q1, [%[in_0], #16]\n" - "fmla v6.4s, v22.4s, v2.s[2]\n" - "ldr q20, [%[f_0], #64]!\n" - "fmla v7.4s, v22.4s, v2.s[3]\n" - "ldr q21, [%[f_0], #16]\n" - "fmla v8.4s, v22.4s, v3.s[0]\n" - "fmla v9.4s, v22.4s, v3.s[1]\n" - "fmla v10.4s, v22.4s, v3.s[2]\n" - "fmla v11.4s, v22.4s, v3.s[3]\n" - "fmla v12.4s, v23.4s, v2.s[0]\n" - "fmla v13.4s, v23.4s, v2.s[1]\n" - "fmla v14.4s, v23.4s, v2.s[2]\n" - "fmla v15.4s, v23.4s, v2.s[3]\n" - "fmla v16.4s, v23.4s, v3.s[0]\n" - "fmla v17.4s, v23.4s, v3.s[1]\n" - "fmla v18.4s, v23.4s, v3.s[2]\n" - "fmla v19.4s, v23.4s, v3.s[3]\n" - "subs x0, x0, #2\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v4.4s, v4.4s, v31.4s\n" - "fmax v5.4s, v5.4s, v31.4s\n" - "fmax v6.4s, v6.4s, v31.4s\n" - "fmax v7.4s, v7.4s, v31.4s\n" - "fmax v8.4s, v8.4s, v31.4s\n" - "fmax v9.4s, v9.4s, v31.4s\n" - "fmax v10.4s, v10.4s, v31.4s\n" - "fmax v11.4s, v11.4s, v31.4s\n" - "fmax v12.4s, v12.4s, v31.4s\n" - "fmax v13.4s, v13.4s, v31.4s\n" - "fmax v14.4s, v14.4s, v31.4s\n" - "fmax v15.4s, v15.4s, v31.4s\n" - "fmax v16.4s, v16.4s, v31.4s\n" - "fmax v17.4s, v17.4s, v31.4s\n" - "fmax v18.4s, v18.4s, v31.4s\n" - "fmax v19.4s, v19.4s, v31.4s\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmov v30.4s, 6.0\n" // six - "fmax v4.4s, v4.4s, v31.4s\n" - "fmax v5.4s, v5.4s, v31.4s\n" - "fmax v6.4s, v6.4s, v31.4s\n" - "fmax v7.4s, v7.4s, v31.4s\n" - "fmax v8.4s, v8.4s, v31.4s\n" - "fmax v9.4s, v9.4s, v31.4s\n" - "fmax v10.4s, v10.4s, v31.4s\n" - "fmax v11.4s, v11.4s, v31.4s\n" - "fmax v12.4s, v12.4s, v31.4s\n" - "fmax v13.4s, v13.4s, v31.4s\n" - "fmax v14.4s, v14.4s, v31.4s\n" - "fmax v15.4s, v15.4s, v31.4s\n" - "fmax v16.4s, v16.4s, v31.4s\n" - "fmax v17.4s, v17.4s, v31.4s\n" - "fmax v18.4s, v18.4s, v31.4s\n" - "fmax v19.4s, v19.4s, v31.4s\n" - - "fmin v4.4s, v4.4s, v30.4s\n" - "fmin v5.4s, v5.4s, v30.4s\n" - "fmin v6.4s, v6.4s, v30.4s\n" - "fmin v7.4s, v7.4s, v30.4s\n" - "fmin v8.4s, v8.4s, v30.4s\n" - "fmin v9.4s, v9.4s, v30.4s\n" - "fmin v10.4s, v10.4s, v30.4s\n" - "fmin v11.4s, v11.4s, v30.4s\n" - "fmin v12.4s, v12.4s, v30.4s\n" - "fmin v13.4s, v13.4s, v30.4s\n" - "fmin v14.4s, v14.4s, v30.4s\n" - "fmin v15.4s, v15.4s, 
v30.4s\n" - "fmin v16.4s, v16.4s, v30.4s\n" - "fmin v17.4s, v17.4s, v30.4s\n" - "fmin v18.4s, v18.4s, v30.4s\n" - "fmin v19.4s, v19.4s, v30.4s\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "fmov v29.4s, 3.0\n" // three - "fmov v30.4s, 6.0\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v20.4s, v4.4s, v29.4s\n" - "fadd v21.4s, v5.4s, v29.4s\n" - "fadd v22.4s, v6.4s, v29.4s\n" - "fadd v23.4s, v7.4s, v29.4s\n" - "fadd v24.4s, v8.4s, v29.4s\n" - "fadd v25.4s, v9.4s, v29.4s\n" - "fadd v26.4s, v10.4s, v29.4s\n" - "fadd v27.4s, v11.4s, v29.4s\n" - - "fmax v20.4s, v20.4s, v31.4s\n" - "fmax v21.4s, v21.4s, v31.4s\n" - "fmax v22.4s, v22.4s, v31.4s\n" - "fmax v23.4s, v23.4s, v31.4s\n" - "fmax v24.4s, v24.4s, v31.4s\n" - "fmax v25.4s, v25.4s, v31.4s\n" - "fmax v26.4s, v26.4s, v31.4s\n" - "fmax v27.4s, v27.4s, v31.4s\n" - - "fmin v20.4s, v20.4s, v30.4s\n" - "fmin v21.4s, v21.4s, v30.4s\n" - "fmin v22.4s, v22.4s, v30.4s\n" - "fmin v23.4s, v23.4s, v30.4s\n" - "fmin v24.4s, v24.4s, v30.4s\n" - "fmin v25.4s, v25.4s, v30.4s\n" - "fmin v26.4s, v26.4s, v30.4s\n" - "fmin v27.4s, v27.4s, v30.4s\n" - - "fdiv v20.4s, v20.4s, v30.4s\n" - "fdiv v21.4s, v21.4s, v30.4s\n" - "fdiv v22.4s, v22.4s, v30.4s\n" - "fdiv v23.4s, v23.4s, v30.4s\n" - "fdiv v24.4s, v24.4s, v30.4s\n" - "fdiv v25.4s, v25.4s, v30.4s\n" - "fdiv v26.4s, v26.4s, v30.4s\n" - "fdiv v27.4s, v27.4s, v30.4s\n" - - "fmul v4.4s, v4.4s, v20.4s\n" - "fmul v5.4s, v5.4s, v21.4s\n" - "fmul v6.4s, v6.4s, v22.4s\n" - "fmul v7.4s, v7.4s, v23.4s\n" - "fmul v8.4s, v8.4s, v24.4s\n" - "fmul v9.4s, v9.4s, v25.4s\n" - "fmul v10.4s, v10.4s, v26.4s\n" - "fmul v11.4s, v11.4s, v27.4s\n" - - "fadd v20.4s, v12.4s, v29.4s\n" - "fadd v21.4s, v13.4s, v29.4s\n" - "fadd v22.4s, v14.4s, v29.4s\n" - "fadd v23.4s, v15.4s, v29.4s\n" - "fadd v24.4s, v16.4s, v29.4s\n" - "fadd v25.4s, v17.4s, v29.4s\n" - "fadd v26.4s, v18.4s, v29.4s\n" - "fadd v27.4s, v19.4s, v29.4s\n" - - "fmax v20.4s, v20.4s, v31.4s\n" - "fmax v21.4s, v21.4s, v31.4s\n" - "fmax v22.4s, v22.4s, v31.4s\n" - "fmax v23.4s, v23.4s, v31.4s\n" - "fmax v24.4s, v24.4s, v31.4s\n" - "fmax v25.4s, v25.4s, v31.4s\n" - "fmax v26.4s, v26.4s, v31.4s\n" - "fmax v27.4s, v27.4s, v31.4s\n" - - "fmin v20.4s, v20.4s, v30.4s\n" - "fmin v21.4s, v21.4s, v30.4s\n" - "fmin v22.4s, v22.4s, v30.4s\n" - "fmin v23.4s, v23.4s, v30.4s\n" - "fmin v24.4s, v24.4s, v30.4s\n" - "fmin v25.4s, v25.4s, v30.4s\n" - "fmin v26.4s, v26.4s, v30.4s\n" - "fmin v27.4s, v27.4s, v30.4s\n" - - "fdiv v20.4s, v20.4s, v30.4s\n" - "fdiv v21.4s, v21.4s, v30.4s\n" - "fdiv v22.4s, v22.4s, v30.4s\n" - "fdiv v23.4s, v23.4s, v30.4s\n" - "fdiv v24.4s, v24.4s, v30.4s\n" - "fdiv v25.4s, v25.4s, v30.4s\n" - "fdiv v26.4s, v26.4s, v30.4s\n" - "fdiv v27.4s, v27.4s, v30.4s\n" - - "fmul v12.4s, v12.4s, v20.4s\n" - "fmul v13.4s, v13.4s, v21.4s\n" - "fmul v14.4s, v14.4s, v22.4s\n" - "fmul v15.4s, v15.4s, v23.4s\n" - "fmul v16.4s, v16.4s, v24.4s\n" - "fmul v17.4s, v17.4s, v25.4s\n" - "fmul v18.4s, v18.4s, v26.4s\n" - "fmul v19.4s, v19.4s, v27.4s\n" - - "13:\n" - "str q4, [%[out_0]], #16\n" - "str q12, [%[out_0]], #16\n" - "str q5, [%[out_0]], #16\n" - "str q13, [%[out_0]], #16\n" - "str q6, [%[out_0]], #16\n" - "str q14, [%[out_0]], #16\n" - "str q7, [%[out_0]], #16\n" - "str q15, [%[out_0]], #16\n" - "str q8, [%[out_0]], #16\n" - "str q16, [%[out_0]], #16\n" - "str q9, [%[out_0]], #16\n" - "str q17, [%[out_0]], #16\n" - "str q10, [%[out_0]], #16\n" - "str q18, [%[out_0]], #16\n" - "str q11, [%[out_0]], #16\n" - "str q19, [%[out_0]], #16\n" - 
: [out_0] "+r"(out_o0hw0), - [in_0] "+r"(in_hw0), - [f_0] "+r"(f_o0c0) - : [ic] "r"((I64)ic * 8), - [b_0] "r"(b_o0), - [b_1] "r"(b_o1), - [pointwiseActivationMode] "r"((I64)pointwiseActivationDesc.mode), - [am_relu] "r"((I64)ACTIVATION_RELU), - [am_relu6] "r"((I64)ACTIVATION_RELU6), - [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v29", "v30", "v31", "x0", "x1", "x2", "x3" - ); - b0 += 8; - b1 += 8; - } - } - - // ohow_remainder % 8 / 4 - U32 ohow_s = (ohow / 8) * 8; - for (I32 hw = ohow_s; hw < ohow-3; hw+=4) { - const F32 *b0 = biasArray + ic*8; - const F32 *b1 = b0 + 4; - const F32 *f_o0c0 = filterArray + ic*fh*fw*8; - F32 *in_pack = pwArray + hw*ic*8; - for (I32 o = 0; o < I32(oc); o++) { - F32 *in_hw0 = in_pack; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const F32 *b_o0 = b0; - const F32 *b_o1 = b1; - __asm__ __volatile__( - "ldr q24, [%[b_0]]\n" //b_o0 - "ldr q25, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "mov v4.16b, v24.16b\n" - "ldr q0, [%[in_0]]\n" //in_hw0 - "mov v5.16b, v24.16b\n" - "mov v6.16b, v24.16b\n" - "ldr q20, [%[f_0]]\n" //f_o0c0 - "mov v7.16b, v24.16b\n" - "ldr q21, [%[f_0], #16]\n" //f_o0c0 - "mov v12.16b, v25.16b\n" - "mov v13.16b, v25.16b\n" - "mov v14.16b, v25.16b\n" - "mov v15.16b, v25.16b\n" - - "0:\n" - "fmla v4.4s, v20.4s, v0.s[0]\n" - "ldr q2, [%[in_0], #16]\n" - "fmla v5.4s, v20.4s, v0.s[1]\n" - "ldr q22, [%[f_0], #32]\n" - "fmla v6.4s, v20.4s, v0.s[2]\n" - "ldr q23, [%[f_0], #48]\n" - "fmla v7.4s, v20.4s, v0.s[3]\n" - "fmla v12.4s, v21.4s, v0.s[0]\n" - "fmla v13.4s, v21.4s, v0.s[1]\n" - "fmla v14.4s, v21.4s, v0.s[2]\n" - "fmla v15.4s, v21.4s, v0.s[3]\n" - - "fmla v4.4s, v22.4s, v2.s[0]\n" - "ldr q0, [%[in_0], #32]!\n" - "fmla v5.4s, v22.4s, v2.s[1]\n" - "ldr q20, [%[f_0], #64]!\n" - "fmla v6.4s, v22.4s, v2.s[2]\n" - "ldr q21, [%[f_0], #16]\n" - "fmla v7.4s, v22.4s, v2.s[3]\n" - "fmla v12.4s, v23.4s, v2.s[0]\n" - "fmla v13.4s, v23.4s, v2.s[1]\n" - "fmla v14.4s, v23.4s, v2.s[2]\n" - "fmla v15.4s, v23.4s, v2.s[3]\n" - "subs x0, x0, #2\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmax v4.4s, v4.4s, v31.4s\n" - "fmax v5.4s, v5.4s, v31.4s\n" - "fmax v6.4s, v6.4s, v31.4s\n" - "fmax v7.4s, v7.4s, v31.4s\n" - "fmax v12.4s, v12.4s, v31.4s\n" - "fmax v13.4s, v13.4s, v31.4s\n" - "fmax v14.4s, v14.4s, v31.4s\n" - "fmax v15.4s, v15.4s, v31.4s\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmov v30.4s, 6.0\n" // six - "fmax v4.4s, v4.4s, v31.4s\n" - "fmax v5.4s, v5.4s, v31.4s\n" - "fmax v6.4s, v6.4s, v31.4s\n" - "fmax v7.4s, v7.4s, v31.4s\n" - "fmax v12.4s, v12.4s, v31.4s\n" - "fmax v13.4s, v13.4s, v31.4s\n" - "fmax v14.4s, v14.4s, v31.4s\n" - "fmax v15.4s, v15.4s, v31.4s\n" - - "fmin v4.4s, v4.4s, v30.4s\n" - "fmin v5.4s, v5.4s, v30.4s\n" - "fmin v6.4s, v6.4s, v30.4s\n" - "fmin v7.4s, v7.4s, v30.4s\n" - "fmin v12.4s, v12.4s, v30.4s\n" - "fmin v13.4s, v13.4s, v30.4s\n" - "fmin v14.4s, v14.4s, v30.4s\n" - "fmin v15.4s, v15.4s, v30.4s\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" - "fmov v29.4s, 3.0\n" // three - "fmov v30.4s, 6.0\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v20.4s, v4.4s, v29.4s\n" - "fadd v21.4s, v5.4s, 
v29.4s\n" - "fadd v22.4s, v6.4s, v29.4s\n" - "fadd v23.4s, v7.4s, v29.4s\n" - "fadd v24.4s, v12.4s, v29.4s\n" - "fadd v25.4s, v13.4s, v29.4s\n" - "fadd v26.4s, v14.4s, v29.4s\n" - "fadd v27.4s, v15.4s, v29.4s\n" - - "fmax v20.4s, v20.4s, v31.4s\n" - "fmax v21.4s, v21.4s, v31.4s\n" - "fmax v22.4s, v22.4s, v31.4s\n" - "fmax v23.4s, v23.4s, v31.4s\n" - "fmax v24.4s, v24.4s, v31.4s\n" - "fmax v25.4s, v25.4s, v31.4s\n" - "fmax v26.4s, v26.4s, v31.4s\n" - "fmax v27.4s, v27.4s, v31.4s\n" - - "fmin v20.4s, v20.4s, v30.4s\n" - "fmin v21.4s, v21.4s, v30.4s\n" - "fmin v22.4s, v22.4s, v30.4s\n" - "fmin v23.4s, v23.4s, v30.4s\n" - "fmin v24.4s, v24.4s, v30.4s\n" - "fmin v25.4s, v25.4s, v30.4s\n" - "fmin v26.4s, v26.4s, v30.4s\n" - "fmin v27.4s, v27.4s, v30.4s\n" - - "fdiv v20.4s, v20.4s, v30.4s\n" - "fdiv v21.4s, v21.4s, v30.4s\n" - "fdiv v22.4s, v22.4s, v30.4s\n" - "fdiv v23.4s, v23.4s, v30.4s\n" - "fdiv v24.4s, v24.4s, v30.4s\n" - "fdiv v25.4s, v25.4s, v30.4s\n" - "fdiv v26.4s, v26.4s, v30.4s\n" - "fdiv v27.4s, v27.4s, v30.4s\n" - - "fmul v4.4s, v4.4s, v20.4s\n" - "fmul v5.4s, v5.4s, v21.4s\n" - "fmul v6.4s, v6.4s, v22.4s\n" - "fmul v7.4s, v7.4s, v23.4s\n" - "fmul v12.4s, v12.4s, v24.4s\n" - "fmul v13.4s, v13.4s, v25.4s\n" - "fmul v14.4s, v14.4s, v26.4s\n" - "fmul v15.4s, v15.4s, v27.4s\n" - - "13:\n" - "str q4, [%[out_0]]\n" - "str q12, [%[out_0], #16]\n" - "str q5, [%[out_0], #32]\n" - "str q13, [%[out_0], #48]\n" - "str q6, [%[out_0], #64]\n" - "str q14, [%[out_0], #80]\n" - "str q7, [%[out_0], #96]\n" - "str q15, [%[out_0], #112]\n" - : [out_0] "+r"(out_o0hw0), - [in_0] "+r"(in_hw0), - [f_0] "+r"(f_o0c0) - : [ic] "r"((I64)ic * 8), - [b_0] "r"(b_o0), - [b_1] "r"(b_o1), - [pointwiseActivationMode] "r"((I64)pointwiseActivationDesc.mode), - [am_relu] "r"((I64)ACTIVATION_RELU), - [am_relu6] "r"((I64)ACTIVATION_RELU6), - [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) - : "memory", "cc", "v0", "v2", "v4", "v5", "v6", "v7", - "v12", "v13", "v14", "v15", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v29", "v30", "v31", "x0", "x1", "x2", "x3"); - b0 += 8; - b1 += 8; - } - } - - // ohow_reminder % 4 - ohow_s = (ohow / 4) * 4; - for (I32 hw = ohow_s; hw < ohow; hw++) { - const F32 *b0 = biasArray + ic*8; - const F32 *b1 = b0 + 4; - const F32 *f_o0c0 = filterArray + ic*fh*fw*8; - F32 *in_pack = pwArray + hw*ic*8; - for (I32 o = 0; o < I32(oc); o++) { - F32 *in_hw0 = in_pack; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const F32 *b_o0 = b0; - const F32 *b_o1 = b1; - __asm__ __volatile__( - "ldr q4, [%[b_0]]\n" //b_o0 - "ldr q12, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "ldr s0, [%[in_0]]\n" //in_hw0 - "ldr q20, [%[f_0]]\n" //f_o0c0 - "ldr q21, [%[f_0], #16]\n" - "0:\n" - "ldr s2, [%[in_0], #4]\n" - "ldr q22, [%[f_0], #32]\n" - "ldr q23, [%[f_0], #48]\n" - "fmla v4.4s, v20.4s, v0.s[0]\n" - "fmla v12.4s, v21.4s, v0.s[0]\n" - - "ldr s0, [%[in_0], #8]!\n" - "ldr q20, [%[f_0], #64]!\n" - "ldr q21, [%[f_0], #16]\n" - "fmla v4.4s, v22.4s, v2.s[0]\n" - "fmla v12.4s, v23.4s, v2.s[0]\n" - "subs x0, x0, #2\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "fmax v4.4s, v4.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 12f\n" - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fmov v30.4s, 6.0\n" // six - "fmax v4.4s, v4.4s, v31.4s\n" - "fmax v12.4s, v12.4s, v31.4s\n" - - "12:\n" - "cmp %[pointwiseActivationMode], %[am_h_swish]\n" - "bne 13f\n" 
- "fmov v29.4s, 3.0\n" // three - "fmov v30.4s, 6.0\n" // six - "eor v31.16b, v31.16b, v31.16b\n" // zero - "fadd v20.4s, v4.4s, v29.4s\n" - "fadd v24.4s, v12.4s, v29.4s\n" - - "fmax v20.4s, v20.4s, v31.4s\n" - "fmax v24.4s, v24.4s, v31.4s\n" - - "fmin v20.4s, v20.4s, v30.4s\n" - "fmin v24.4s, v24.4s, v30.4s\n" - - "fdiv v20.4s, v20.4s, v30.4s\n" - "fdiv v24.4s, v24.4s, v30.4s\n" - - "fmul v4.4s, v4.4s, v20.4s\n" - "fmul v12.4s, v12.4s, v24.4s\n" - - "13:\n" - "str q4, [%[out_0]]\n" - "str q12, [%[out_0], #16]\n" - : [out_0] "+r"(out_o0hw0), - [in_0] "+r"(in_hw0), - [f_0] "+r"(f_o0c0) - : [ic] "r"((I64)ic * 8), - [b_0] "r"(b_o0), - [b_1] "r"(b_o1), - [pointwiseActivationMode] "r"((I64)pointwiseActivationDesc.mode), - [am_relu] "r"((I64)ACTIVATION_RELU), - [am_relu6] "r"((I64)ACTIVATION_RELU6), - [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) - : "memory", "cc", "v0", "v1", "v2", "v4", "v12", "v20", "v24", "v29", "v30", "v31", "x0", "x1", "x2", "x3" - ); - b0 += 8; - b1 += 8; - } - } - } - return SUCCESS; -} -#endif diff --git a/tensor_computing/src/cpu/arm/fp32/detectionoutput.cpp b/tensor_computing/src/cpu/arm/fp32/detectionoutput.cpp deleted file mode 100644 index 9bd53f63..00000000 --- a/tensor_computing/src/cpu/arm/fp32/detectionoutput.cpp +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include "cpu/arm/tensor_computing_arm.h" -#include "cpu/arm/fp32/tensor_computing_fp32.h" - -EE detectionoutput_fp32(std::vector inputDesc, std::vector input, DetectionOutputDesc detectionoutputDesc, TensorDesc outputDesc, F32* output) -{ - UNUSED(outputDesc); - if (inputDesc.size() != 3) { - CHECK_STATUS(NOT_MATCH); - } - F32* location = (F32*)input[0]; - F32* confidence = (F32*)input[1]; - F32* priorbox = (F32*)input[2]; - - U32 ilens2 = inputDesc[2].dims[0]; - U32 num_total_priorbox = ilens2 / 4; - U32 num_class = detectionoutputDesc.num_class; - F32 nms_threshold = detectionoutputDesc.nms_threshold; - U32 nms_top_k = detectionoutputDesc.nms_top_k; - U32 keep_top_k = detectionoutputDesc.keep_top_k; - F32 confidence_threshold = detectionoutputDesc.confidence_threshold; - - std::vector> boxes; - boxes.resize(num_total_priorbox); - F32* variance = priorbox + ilens2; - // decode priorbox - for(U32 i = 0 ; i < num_total_priorbox ; i++){ - F32* loc = location + i * 4; - F32* pb = priorbox + i * 4; - F32* var = variance + i * 4; - - F32 pb_w = pb[2] - pb[0]; - F32 pb_h = pb[3] - pb[1]; - F32 pb_cx = (pb[0] + pb[2]) * 0.5f; - F32 pb_cy = (pb[1] + pb[3]) * 0.5f; - - F32 box_cx = var[0] * loc[0] * pb_w + pb_cx; - F32 box_cy = var[1] * loc[1] * pb_h + pb_cy; - F32 box_w = static_cast(exp(var[2] * loc[2]) * pb_w); - F32 box_h = static_cast(exp(var[3] * loc[3]) * pb_h); - - std::vector box; - box.resize(4); - box[0] = box_cx - box_w * 0.5f; - box[1] = box_cy - box_h * 0.5f; - box[2] = box_cx + box_w * 0.5f; - box[3] = box_cy + box_h * 0.5f; - // give box to boxes - boxes[i].assign(box.begin(),box.end()); - } - - std::vector> allclass_boxrects; - std::vector> allclass_boxscores; - allclass_boxrects.resize(num_class); - allclass_boxscores.resize(num_class); - - for(U32 i = 1; i < num_class; i++){ - std::vector class_boxrects; - std::vector class_boxscores; - for(U32 j = 0; j < num_total_priorbox; j++){ - F32 score = confidence[j * num_class + i]; - if (score > confidence_threshold) - { - std::vector inbox; - inbox.assign(boxes[j].begin(),boxes[j].end()); - BoxRect b = { inbox[0], inbox[1], inbox[2], inbox[3], i }; - class_boxrects.push_back(b); - class_boxscores.push_back(score); - } - } - //sort the boxes with scores - detectionoutput_qsort_descent_arm(class_boxrects, class_boxscores, 0, static_cast(class_boxscores.size()-1)); - - if(nms_top_k < (U32)class_boxrects.size()){ - class_boxrects.resize(nms_top_k); - class_boxscores.resize(nms_top_k); - } - //apply nms - std::vector picked; - detectionoutput_nms_pickedboxes_arm(class_boxrects, picked, nms_threshold); - - for(I64 j = 0; j < (I64)picked.size(); j++) - { - I64 picked_box = picked[j]; - allclass_boxrects[i].push_back(class_boxrects[picked_box]); - allclass_boxscores[i].push_back(class_boxscores[picked_box]); - } - } - - std::vector boxrects; - std::vector boxscores; - - for (U32 i = 1; i < num_class ; i++) - { - boxrects.insert(boxrects.end(), allclass_boxrects[i].begin(), allclass_boxrects[i].end()); - boxscores.insert(boxscores.end(), allclass_boxscores[i].begin(), allclass_boxscores[i].end()); - } - - detectionoutput_qsort_descent_arm(boxrects, boxscores, 0, static_cast(boxscores.size()-1)); - - if (keep_top_k < (U32)boxrects.size()) - { - boxrects.resize(keep_top_k); - boxscores.resize(keep_top_k); - } - - U32 num_detected = static_cast(boxrects.size()); - if (num_detected == 0) - return SUCCESS; - - // the first box contains the number of availble boxes - output[0] = num_detected; - output[1] = output[2] = output[3] = output[4] = 
output[5] = 0; - - for(U32 i = 0; i < num_detected ; i++){ - BoxRect b = boxrects[i]; - F32 score = boxscores[i]; - - output[(i+1)*6] = b.label; - output[(i+1)*6+1] = score; - output[(i+1)*6+2] = b.xmin; - output[(i+1)*6+3] = b.ymin; - output[(i+1)*6+4] = b.xmax; - output[(i+1)*6+5] = b.ymax; - } - return SUCCESS; -} \ No newline at end of file diff --git a/tensor_computing/src/cpu/arm/fp32/eltwise.cpp b/tensor_computing/src/cpu/arm/fp32/eltwise.cpp deleted file mode 100644 index 2e82ddc0..00000000 --- a/tensor_computing/src/cpu/arm/fp32/eltwise.cpp +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include <arm_neon.h> -#include "cpu/arm/fp32/tensor_computing_fp32.h" - -float32x4_t getFloatVector(void* input, int inputSize, int index) { - float32x4_t result; - if (inputSize == 1) { - result = vdupq_n_f32(*((F32*)input)); - } - int local = index % inputSize; - int remain = inputSize - local; - if (remain >= 4) { - result = vld1q_f32((F32*)(input) + local); - } else { - F32 buffer[4]; - F32 *ptr = (F32*)input; - memcpy(buffer, ptr+local, sizeof(F32)*remain); - for (int i = 0; i < 4 - remain; i++) { - buffer[remain+i] = ptr[i % inputSize]; - } - result = vld1q_f32(buffer); - } - return result; -} - -F32 getFloatScalar(void* input, int inputSize, int index) { - int local = index % inputSize; - return ((F32*)input)[local]; -} - -EE eltwise_fp32(std::vector<void*> input, std::vector<U32> inputSize, U32 num, U32 len, void *output, EltwiseMode eltwiseMode) { - U32 len_tail = len % 4; - U32 len_main = len - len_tail; - - F32 *output_ptr = (F32 *)output; - for (U32 i = 0; i < len_main; i += 4){ - float32x4_t tmp_v = getFloatVector(input[0], inputSize[0], i); - for (U32 j = 1; j < num; j++) { - float32x4_t value_v = getFloatVector(input[j], inputSize[j], i); - switch (eltwiseMode) { - case ELTWISE_SUM: - tmp_v = vaddq_f32(value_v, tmp_v); - break; - case ELTWISE_MAX: - tmp_v = vmaxq_f32(value_v, tmp_v); - break; - case ELTWISE_PROD: - tmp_v = vmulq_f32(value_v, tmp_v); - break; - default: - return NOT_SUPPORTED; - } - } - vst1q_f32(output_ptr + i, tmp_v); - } - for (U32 i = len_main; i < len; i++){ - F32 tmp_s = getFloatScalar(input[0], inputSize[0], i); - for (U32 j = 1; j < num; j++) { - F32 value_s = getFloatScalar(input[j], inputSize[j], i); - switch (eltwiseMode) { - case ELTWISE_SUM: - tmp_s = value_s + tmp_s; - break; - case ELTWISE_MAX: - tmp_s = (value_s > tmp_s) ?
value_s : tmp_s; - break; - case ELTWISE_PROD: - tmp_s *= value_s; - break; - default: - return NOT_SUPPORTED; - } - } - output_ptr[i] = tmp_s; - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp32/lstm.cpp b/tensor_computing/src/cpu/arm/fp32/lstm.cpp deleted file mode 100644 index 24253bd9..00000000 --- a/tensor_computing/src/cpu/arm/fp32/lstm.cpp +++ /dev/null @@ -1,337 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#include "blas-enhance.h" - -void mvm_nkn32(U32 fn, U32 fk, const F32* filterArray, F32* input, F32* output) { -#ifdef __aarch64__ - for (U32 n = 0; n < fn; n++) { - F32 *in = input; - const F32 *f = filterArray + n*fk*32; - __asm__ __volatile__( - "ldr d0, [%[in]]\n" - "ldr q1, [%[out]]\n" - "ldr q2, [%[out], #16]\n" - "ldr q3, [%[out], #32]\n" - "ldr q4, [%[out], #48]\n" - "ldr q13, [%[out], #64]\n" - "ldr q14, [%[out], #80]\n" - "ldr q15, [%[out], #96]\n" - "ldr q16, [%[out], #112]\n" - "mov x0, %[k]\n" - "ldr q5, [%[f]]\n" - "ldr q6, [%[f], #16]\n" - "ldr q7, [%[f], #32]\n" - "ldr q8, [%[f], #48]\n" - "ldr q17, [%[f], #64]\n" - "ldr q18, [%[f], #80]\n" - "ldr q19, [%[f], #96]\n" - "ldr q20, [%[f], #112]\n" - "0:\n" - "prfm pldl2strm, [%[f], #4096]\n" - "prfm pldl1strm, [%[f], #1024]\n" - "ldr d9, [%[f], #128]\n" - "fmla v1.4s, v5.4s, v0.s[0]\n" - "ldr x9, [%[f], #136]\n" - "ins v9.d[1], x9\n" - "ldr d10, [%[f], #144]\n" - "fmla v2.4s, v6.4s, v0.s[0]\n" - "ldr x10, [%[f], #152]\n" - "ins v10.d[1], x10\n" - "ldr d11, [%[f], #160]\n" - "fmla v3.4s, v7.4s, v0.s[0]\n" - "ldr x11, [%[f], #168]\n" - "ins v11.d[1], x11\n" - "ldr d12, [%[f], #176]\n" - "fmla v4.4s, v8.4s, v0.s[0]\n" - "ldr x12, [%[f], #184]\n" - "ins v12.d[1], x12\n" - "ldr d21, [%[f], #192]\n" - "fmla v13.4s, v17.4s, v0.s[0]\n" - "ldr x9, [%[f], #200]\n" - "ins v21.d[1], x9\n" - "ldr d22, [%[f], #208]\n" - "fmla v14.4s, v18.4s, v0.s[0]\n" - "ldr x10, [%[f], #216]\n" - "ins v22.d[1], x10\n" - "ldr d23, [%[f], #224]\n" - "fmla v15.4s, v19.4s, v0.s[0]\n" - "ldr x11, [%[f], #232]\n" - "ins v23.d[1], x11\n" - "ldr d24, [%[f], #240]\n" - "fmla v16.4s, v20.4s, v0.s[0]\n" - "ldr x12, [%[f], #248]\n" - "ins v24.d[1], x12\n" - - "add %[f], %[f], #256\n" - "ldr d5, [%[f]]\n" - "fmla v1.4s, v9.4s, v0.s[1]\n" - "ldr x5, [%[f], #8]\n" - "ins v5.d[1], x5\n" - "ldr d6, [%[f], #16]\n" - "fmla v2.4s, v10.4s, v0.s[1]\n" - "ldr x6, [%[f], #24]\n" - "ins v6.d[1], x6\n" - "ldr d7, 
[%[f], #32]\n" - "fmla v3.4s, v11.4s, v0.s[1]\n" - "ldr x7, [%[f], #40]\n" - "ins v7.d[1], x7\n" - "ldr d8, [%[f], #48]\n" - "fmla v4.4s, v12.4s, v0.s[1]\n" - "ldr x8, [%[f], #56]\n" - "ins v8.d[1], x8\n" - "ldr d17, [%[f], #64]\n" - "fmla v13.4s, v21.4s, v0.s[1]\n" - "ldr x5, [%[f], #72]\n" - "ins v17.d[1], x5\n" - "ldr d18, [%[f], #80]\n" - "fmla v14.4s, v22.4s, v0.s[1]\n" - "ldr x6, [%[f], #88]\n" - "ins v18.d[1], x6\n" - "ldr d19, [%[f], #96]\n" - "fmla v15.4s, v23.4s, v0.s[1]\n" - "ldr x7, [%[f], #104]\n" - "ins v19.d[1], x7\n" - "ldr d20, [%[f], #112]\n" - "fmla v16.4s, v24.4s, v0.s[1]\n" - "ldr x8, [%[f], #120]\n" - "add %[in], %[in], #8\n" - "ins v20.d[1], x8\n" - - "ldr d0, [%[in]]\n" - "sub x0, x0, #2\n" - - "cmp x0, #3\n" - "bgt 0b\n" - "ldr q9, [%[f], #128]\n" - "ldr q10, [%[f], #144]\n" - "ldr q11, [%[f], #160]\n" - "ldr q12, [%[f], #176]\n" - "ldr q21, [%[f], #192]\n" - "ldr q22, [%[f], #208]\n" - "ldr q23, [%[f], #224]\n" - "ldr q24, [%[f], #240]\n" - "fmla v1.4s, v5.4s, v0.s[0]\n" - "fmla v2.4s, v6.4s, v0.s[0]\n" - "fmla v3.4s, v7.4s, v0.s[0]\n" - "fmla v4.4s, v8.4s, v0.s[0]\n" - "fmla v13.4s, v17.4s, v0.s[0]\n" - "fmla v14.4s, v18.4s, v0.s[0]\n" - "fmla v15.4s, v19.4s, v0.s[0]\n" - "fmla v16.4s, v20.4s, v0.s[0]\n" - "fmla v1.4s, v9.4s, v0.s[1]\n" - "fmla v2.4s, v10.4s, v0.s[1]\n" - "fmla v3.4s, v11.4s, v0.s[1]\n" - "fmla v4.4s, v12.4s, v0.s[1]\n" - "fmla v13.4s, v21.4s, v0.s[1]\n" - "fmla v14.4s, v22.4s, v0.s[1]\n" - "fmla v15.4s, v23.4s, v0.s[1]\n" - "fmla v16.4s, v24.4s, v0.s[1]\n" - "cmp x0, #3\n" - "bne 1f\n" - "add %[f], %[f], #256\n" - "ldr s0, [%[in], #8]\n" - "ldr q5, [%[f]]\n" - "ldr q6, [%[f], #16]\n" - "ldr q7, [%[f], #32]\n" - "ldr q8, [%[f], #48]\n" - "ldr q17, [%[f], #64]\n" - "ldr q18, [%[f], #80]\n" - "ldr q19, [%[f], #96]\n" - "ldr q20, [%[f], #112]\n" - "fmla v1.4s, v5.4s, v0.s[0]\n" - "fmla v2.4s, v6.4s, v0.s[0]\n" - "fmla v3.4s, v7.4s, v0.s[0]\n" - "fmla v4.4s, v8.4s, v0.s[0]\n" - "fmla v13.4s, v17.4s, v0.s[0]\n" - "fmla v14.4s, v18.4s, v0.s[0]\n" - "fmla v15.4s, v19.4s, v0.s[0]\n" - "fmla v16.4s, v20.4s, v0.s[0]\n" - - "1:\n" - "str q1, [%[out]]\n" - "str q2, [%[out], #16]\n" - "str q3, [%[out], #32]\n" - "str q4, [%[out], #48]\n" - "str q13, [%[out], #64]\n" - "str q14, [%[out], #80]\n" - "str q15, [%[out], #96]\n" - "str q16, [%[out], #112]\n" - :[out]"+r"(output), - [f]"+r"(f), - [in]"+r"(in) - :[k]"r"((I64)fk) - :"memory", "cc", "x0", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24" - ); - output += 32; - } -#else - // TODO - std::cerr << "[ERROR] currently not support ARMv7 LSTM" < 0) ? 
lstmDesc.numProjection : lstmDesc.numOutput; - if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { - CHECK_STATUS(NOT_MATCH); - } - if (!(4*column == (I32)fn*32 && (ix+oh) == fk && in == on)) { - CHECK_STATUS(NOT_MATCH); - } - F32 forgetBias = lstmDesc.forgetBias; - ActivationMode activationMode = lstmDesc.activationMode; - if (activationMode != ACTIVATION_TANH) - CHECK_STATUS(NOT_SUPPORTED); - - const F32 *currentXArray = (const F32*)currentX; - const F32 *filterArray = (const F32*)filter; - const F32 *biasArray = (const F32*)bias; - const F32 *projectionArray = (const F32*)filter + (fn * 32 * fk); - F32 *lastStateArray = (F32*)state; - F32 *lastHArray = lastStateArray + column; - F32 *tmpArray = (F32*)tmp; - F32 *currentStateArray = (F32*)state; - F32 *currentHArray = currentStateArray + column; - F32 *outputArray = (F32*)output; - F32 *xhArray = tmpArray; - F32 *intermediateH = xhArray + (xDim + hDim); - U32 lastStateStride = column + hDim; - U32 lastHStride = column + hDim; - U32 currentStateStride = column + hDim; - U32 currentHStride = column + hDim; - float32x4_t forgetBiasVector = vdupq_n_f32(forgetBias); - for (U32 m = 0; m < batch; m++) { - F32 *lastBatchH = lastHArray + m * lastHStride; - memcpy(xhArray, currentXArray+m*batchStrideX, xDim*sizeof(F32)); - memcpy(xhArray+xDim, lastBatchH, hDim*sizeof(F32)); - - memcpy(intermediateH, biasArray, column * 4 * sizeof(F32)); - mvm_nkn32(fn, fk, filterArray, xhArray, intermediateH); - F32 *out_i = intermediateH; - F32 *out_g = out_i + column; - F32 *out_f = out_i + column * 2; - F32 *out_o = out_i + column * 3; - - F32 *lastBatchState = lastStateArray + m * lastStateStride; - F32 *currentBatchState = currentStateArray + m * currentStateStride; - F32 *currentBatchH = currentHArray + m * currentHStride; - F32 *currentOutput = outputArray + m * batchStrideH; - - F32* tmpState, *tmpHH, *tmpH; - if (lstmDesc.zoneoutCell == 0) { - tmpState = currentBatchState; - } else { - tmpState = out_i; - } - if (lstmDesc.zoneoutOutput != 0) { - tmpHH = out_g; - tmpH = out_f; - } else { - if (lstmDesc.numProjection > 0) { - tmpHH = out_g; - tmpH = out_f; - } else { - tmpHH = currentBatchH; - tmpH = currentBatchH; - } - } - - I32 h = 0; - for (; h < column-3; h+=4) { - float32x4_t out_i_v = vld1q_f32(out_i + h); - float32x4_t out_g_v = vld1q_f32(out_g + h); - float32x4_t out_f_v = vld1q_f32(out_f + h); - float32x4_t out_o_v = vld1q_f32(out_o + h); - float32x4_t C_v = vld1q_f32(lastBatchState + h); - float32x4_t I_v = vsigmoidq_f32(out_i_v); - float32x4_t F_v = vsigmoidq_f32(vaddq_f32(out_f_v, forgetBiasVector)); - float32x4_t O_v = vsigmoidq_f32(out_o_v); - float32x4_t G_v = vtanhq_f32(out_g_v); - C_v = vaddq_f32(vmulq_f32(C_v, F_v), vmulq_f32(I_v, G_v)); - float32x4_t out_hidden_v = vmulq_f32(O_v, vtanhq_f32(C_v)); - vst1q_f32(tmpState + h, C_v); - vst1q_f32(tmpHH + h, out_hidden_v); - } - for (; h < column; h++) { - F32 C_s = lastBatchState[h]; - F32 I_s = 1.0 / (1.0 + exp(-out_i[h])); - F32 F_s = 1.0 / (1.0 + exp(-(out_f[h] + forgetBias))); - F32 O_s = 1.0 / (1.0 + exp(-out_o[h])); - F32 G_s = tanh(out_g[h]); - C_s = C_s * F_s + I_s * G_s; - F32 value = O_s * tanh(C_s); - tmpState[h] = C_s; - tmpHH[h] = value; - } - if (lstmDesc.zoneoutCell != 0) { - array_scale_f32(tmpState, tmpState, column, 1-lstmDesc.zoneoutCell, 0); - array_scale_f32(lastBatchState, lastBatchState, column, lstmDesc.zoneoutCell, 0); - array_add_f32(tmpState, lastBatchState, currentBatchState, column); - } - if (lstmDesc.zoneoutOutput != 0) { - array_scale_f32(tmpHH, tmpH, 
column, 1-lstmDesc.zoneoutOutput, 0); - array_scale_f32(lastBatchH, lastBatchH, column, lstmDesc.zoneoutOutput, 0); - array_add_f32(tmpH, lastBatchH, currentBatchH, column); - } - if (lstmDesc.numProjection > 0) { - memset(currentBatchH, 0, sizeof(F32) * hDim); - mvm_nkn32(hDim/32, lstmDesc.numProjection, projectionArray, tmpHH, currentBatchH); - tmpHH = currentBatchH; - } - memcpy(currentOutput, tmpHH, sizeof(F32) * hDim); - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp32/normalization.cpp b/tensor_computing/src/cpu/arm/fp32/normalization.cpp deleted file mode 100644 index c03b6608..00000000 --- a/tensor_computing/src/cpu/arm/fp32/normalization.cpp +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
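Review note — the deleted lstm.cpp above packs all four gates into one buffer (order [i | g | f | o], column elements per gate), seeds it with the bias via memcpy, accumulates W * [x; h] with the mvm_nkn32 matrix-vector kernel, and then applies the gate nonlinearities four lanes at a time (vsigmoidq_f32 / vtanhq_f32) with a scalar tail. A minimal scalar sketch of that per-step cell update, with hypothetical names:

#include <cmath>
#include <cstddef>
#include <vector>

// One cell step over the fused gate buffer [i | g | f | o] (column
// elements per gate), i.e. the layout the deleted code builds before
// this update. Scalar only; the original does the same math per lane.
static void lstm_cell_ref(const std::vector<float> &gates, // 4 * column
                          std::vector<float> &c,           // cell state
                          std::vector<float> &h,           // hidden out
                          float forgetBias)
{
    const size_t column = c.size();
    auto sigmoid = [](float x) { return 1.0f / (1.0f + std::exp(-x)); };
    for (size_t j = 0; j < column; j++) {
        float I = sigmoid(gates[j]);
        float G = std::tanh(gates[column + j]);
        float F = sigmoid(gates[2 * column + j] + forgetBias);
        float O = sigmoid(gates[3 * column + j]);
        c[j] = c[j] * F + I * G;
        h[j] = O * std::tanh(c[j]);
    }
}

The zoneout branches in the deleted code then blend these fresh values with the previous step's state, and the optional projection multiplies h through a second packed weight matrix via the same mvm_nkn32 routine.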
- - -#include -#include "cpu/arm/fp32/tensor_computing_fp32.h" - -inline void array_norm_scale_fp32(F32 *input, F32 *output, I32 len, F32 mean, F32 var, F32 *alpha, F32 *beta) { - F32 eps = 1e-6; - F32 std_value = sqrt(var + eps); - float32x4_t mean_v = vdupq_n_f32(mean); - float32x4_t std_v = vdupq_n_f32(std_value); - - I32 i = 0; - for(i = 0; i < len - 3; i += 4){ - float32x4_t in = vld1q_f32(input + i); - float32x4_t alpha_v = vld1q_f32(alpha + i); - float32x4_t beta_v = vld1q_f32(beta + i); - - float32x4_t tmp_v = vsubq_f32(in, mean_v); - tmp_v = vdivq_f32(tmp_v, std_v); - tmp_v = vfmaq_f32(beta_v, alpha_v, tmp_v); - vst1q_f32(output+i, tmp_v); - } - for(; i < len; i++){ - output[i] = alpha[i] * (input[i] - mean) / std_value + beta[i]; - } -} - -EE layer_normalization_fp32(F32 *alpha, F32 *beta, - TensorDesc inputDesc, F32* input, - TensorDesc outputDesc, F32* output) -{ - UNUSED(outputDesc); - if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - U32 size = tensorNumElements(inputDesc); - I32 size_inner = inputDesc.dims[0]; - I32 size_outer = size / size_inner; - for(I32 i = 0; i < size_outer; i++) { - F32 *current_input = input + i * size_inner; - F32 *current_output = output + i * size_inner; - F32 mean = array_mean_f32(current_input, size_inner); - F32 var = array_var_f32(current_input, size_inner, mean); - - array_norm_scale_fp32(current_input, current_output, size_inner, mean, var, alpha, beta); - } - - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp32/pooling.cpp b/tensor_computing/src/cpu/arm/fp32/pooling.cpp deleted file mode 100644 index ec782db0..00000000 --- a/tensor_computing/src/cpu/arm/fp32/pooling.cpp +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
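Review note — the deleted normalization.cpp above computes a per-vector mean and variance over the innermost axis, then normalizes with eps = 1e-6 and applies the learned affine pair. A scalar sketch, assuming array_var_f32 returns the population variance (which matches dividing by n here); the deleted code divides by sqrt(var + eps), and multiplying by the reciprocal below is equivalent:

#include <cmath>
#include <cstddef>
#include <vector>

// Scalar shape of the deleted layer_normalization_fp32. Illustrative
// names, not the original API; y must be pre-sized to x.size().
static void layer_norm_ref(const std::vector<float> &x,
                           const std::vector<float> &alpha,
                           const std::vector<float> &beta,
                           std::vector<float> &y)
{
    const float eps = 1e-6f;
    const size_t n = x.size();
    float mean = 0.0f, var = 0.0f;
    for (float v : x) mean += v;
    mean /= static_cast<float>(n);
    for (float v : x) var += (v - mean) * (v - mean);
    var /= static_cast<float>(n);
    const float inv_std = 1.0f / std::sqrt(var + eps);
    for (size_t i = 0; i < n; i++) {
        y[i] = alpha[i] * (x[i] - mean) * inv_std + beta[i];
    }
}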
- - -#include -#include "cpu/arm/fp32/tensor_computing_fp32.h" - -EE pooling_fp32(TensorDesc inputDesc, const F32* input, PoolingDesc poolingDesc, TensorDesc outputDesc, F32* output) -{ - if (nullptr == input || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt, odt; - DataFormat idf, odf; - U32 in = 0, ic = 0, ih = 0, iw = 0, - on = 0, oc = 0, oh = 0, ow = 0; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - - if (idt != odt || idt != DT_F32) { - CHECK_STATUS(NOT_MATCH); - } - if (in != on || ic != oc) { - CHECK_STATUS(NOT_MATCH); - } - if (idf != DF_NCHWC8 || odf != idf) { - CHECK_STATUS(NOT_MATCH); - } - - PoolingMode pm = poolingDesc.pm; - U32 strideH = poolingDesc.stride_h; - U32 strideW = poolingDesc.stride_w; - U32 paddingT = poolingDesc.padding_top; - U32 paddingL = poolingDesc.padding_left; - U32 kernelSizeH = poolingDesc.kernelSize_h; - U32 kernelSizeW = poolingDesc.kernelSize_w; - if (paddingT >= kernelSizeH || paddingL >= kernelSizeW) { - CHECK_STATUS(NOT_SUPPORTED); - } - - ic /= 8; - for (U32 n = 0; n < in; n++) { - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < oh; h++) { - for (U32 w = 0; w < ow; w++) { - int hstart = (int)h * (int)strideH - (int)paddingT; - int wstart = (int)w * (int)strideW - (int)paddingL; - int hend = UNI_MIN(hstart + kernelSizeH, ih); - int wend = UNI_MIN(wstart + kernelSizeW, iw); - hstart = UNI_MAX(hstart, 0); - wstart = UNI_MAX(wstart, 0); - float32x4_t in0, in1, out0, out1; - float32x4_t poolSize = vdupq_n_f32((hend - hstart)*(wend - wstart)); - out0 = vdupq_n_f32((pm == POOLING_MAX) ? FLT_MIN : 0); - out1 = out0; - for (int kernelH = hstart; kernelH < hend; kernelH++) { - for (int kernelW = wstart; kernelW < wend; kernelW++) { - const U32 index = (kernelH * iw + kernelW) * 8; - in0 = vld1q_f32(input + index); - in1 = vld1q_f32(input + index + 4); - switch (pm) { - case POOLING_MAX: { - out0 = vmaxq_f32(in0, out0); - out1 = vmaxq_f32(in1, out1); - break; - } - case POOLING_MEAN: { - out0 = vaddq_f32(out0, in0); - out1 = vaddq_f32(out1, in1); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - } - } - } - vst1q_f32(output + (h * ow + w) * 8, ((pm == POOLING_MAX) ? out0 : vdivq_f32(out0, poolSize))); - vst1q_f32(output + (h * ow + w) * 8 + 4, ((pm == POOLING_MAX) ? out1 : vdivq_f32(out1, poolSize))); - } - } - input += ih * iw * 8; - output += oh * ow * 8; - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/fp32/priorbox.cpp b/tensor_computing/src/cpu/arm/fp32/priorbox.cpp deleted file mode 100644 index 1b4caf71..00000000 --- a/tensor_computing/src/cpu/arm/fp32/priorbox.cpp +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
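Review note — in the deleted pooling_fp32 above, the kernel footprint is clamped to the input bounds and POOLING_MEAN divides by the clamped (not nominal) window size. A scalar sketch of that window arithmetic, with illustrative names:

#include <algorithm>

// Mean pooling over one output position of a single-channel plane,
// mirroring the hstart/hend clamping in the deleted NCHWc8 kernel.
static float mean_pool_window(const float *in, int ih, int iw,
                              int oh_idx, int ow_idx,
                              int strideH, int strideW,
                              int paddingT, int paddingL,
                              int kernelH, int kernelW)
{
    int hstart = oh_idx * strideH - paddingT;
    int wstart = ow_idx * strideW - paddingL;
    int hend = std::min(hstart + kernelH, ih);
    int wend = std::min(wstart + kernelW, iw);
    hstart = std::max(hstart, 0);
    wstart = std::max(wstart, 0);
    float sum = 0.0f;
    for (int h = hstart; h < hend; h++)
        for (int w = wstart; w < wend; w++)
            sum += in[h * iw + w];
    return sum / ((hend - hstart) * (wend - wstart));
}

One observation on the deleted max path: it seeds the accumulator with FLT_MIN, the smallest positive normal, so it relies on pooled inputs being non-negative (true after relu); a standalone version would seed with -FLT_MAX.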
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/fp32/tensor_computing_fp32.h" - -EE priorbox_fp32(std::vector inputDesc, PriorBoxDesc priorboxDesc, TensorDesc outputDesc, F32* output) -{ - UNUSED(outputDesc); - if (nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - U32 num = inputDesc.size(); - if (num != 2) return NOT_MATCH; - DataType idt0, idt1; - DataFormat idf0, idf1; - U32 in0 = 0, ic0 = 0, ih0 = 0, iw0 = 0; - U32 in1 = 0, ic1 = 0, ih1 = 0, iw1 = 0; - CHECK_STATUS(tensor4dGet(inputDesc[0], &idt0, &idf0, &in0, &ic0, &ih0, &iw0)); - CHECK_STATUS(tensor4dGet(inputDesc[1], &idt1, &idf1, &in1, &ic1, &ih1, &iw1)); - - std::vector minsizes = priorboxDesc.min_sizes; - std::vector maxsizes = priorboxDesc.max_sizes; - std::vector ars = priorboxDesc.aspect_ratios; - U32 flip = priorboxDesc.flip; - U32 clip = priorboxDesc.clip; - F32 vars[4]; - for (int i = 0; i < 4 ; i++){ - vars[i] = priorboxDesc.variances[i]; - } - U32 imageH = priorboxDesc.image_h; - U32 imageW = priorboxDesc.image_w; - F32 stepH = priorboxDesc.step_h; - F32 stepW = priorboxDesc.step_w; - F32 offset = priorboxDesc.offset; - - U32 layer_w = iw0; - U32 layer_h = ih0; - - int img_w, img_h; - if(imageH == 0 || imageW == 0){ - img_w = iw1; - img_h = ih1; - } else { - img_w = imageW; - img_h = imageH; - } - F32 stp_h, stp_w; - if (stepW == 0 || stepH == 0){ - stp_w = static_cast(ceil((img_w)/layer_w)); - stp_h = static_cast(ceil((img_h)/layer_h)); - } else{ - stp_w = stepW; - stp_h = stepH; - } - - U32 num_priorboxs = ars.size(); - if(flip){ - num_priorboxs = num_priorboxs * 2; - } - U32 num_minsize = minsizes.size(); - num_priorboxs = (num_priorboxs + 1) * num_minsize; - if(!maxsizes.empty()){ - U32 num_maxsize = priorboxDesc.max_sizes.size(); - num_priorboxs = num_priorboxs + num_maxsize; - } - int dim = layer_h * layer_w * num_priorboxs * 4; - int idx = 0; - for (U32 h = 0 ; h < layer_h ; h++){ - for (U32 w = 0 ; w < layer_w ; w++){ - F32 center_x = (w + offset) * stp_w; - F32 center_y = (h + offset) * stp_h; - F32 box_w , box_h; - for( int n = 0 ; n < (int)minsizes.size() ; n++){ - F32 minsize = minsizes[n]; - box_w = box_h = minsize; - output[idx++] = (center_x - box_w/2) / img_w; - output[idx++] = (center_y - box_h/2) / img_h; - output[idx++] = (center_x + box_w/2) / img_w; - output[idx++] = (center_y + box_h/2) / img_h; - - if ((int)maxsizes.size() > 0) { - F32 maxsize = maxsizes[n]; - box_w = box_h = sqrt(minsize * maxsize); - output[idx++] = (center_x - box_w/2) / img_w; - output[idx++] = (center_y - box_h/2) / img_h; - output[idx++] = (center_x + box_w/2) / img_w; - output[idx++] = (center_y + box_h/2) / img_h; - } - - for (int a = 0; a < (int)ars.size(); a++){ - F32 ar = ars[a]; - box_w = minsize * sqrt(ar); - box_h = minsize / sqrt(ar); - output[idx++] = (center_x - box_w/2) / img_w; - output[idx++] = (center_y - box_h/2) / img_h; - output[idx++] = (center_x + box_w/2) / img_w; - output[idx++] = (center_y + box_h/2) / img_h; - if(flip){ - output[idx++] = (center_x - box_h/2) / img_w; - output[idx++] = (center_y - box_w/2) / img_h; - output[idx++] = (center_x + 
box_h/2) / img_w; - output[idx++] = (center_y + box_w/2) / img_h; - } - } - } - } - } - - if (clip) { - for (int i = 0; i < dim; i++) { - output[i] = std::min(std::max(output[i], 0.), 1.); - } - } - - for(int i = 0 ; i < dim/4 ; i++){ - output[idx++] = vars[0]; - output[idx++] = vars[1]; - output[idx++] = vars[2]; - output[idx++] = vars[3]; - } - return SUCCESS; -} \ No newline at end of file diff --git a/tensor_computing/src/cpu/arm/fp32/scale.cpp b/tensor_computing/src/cpu/arm/fp32/scale.cpp deleted file mode 100644 index dbfbe567..00000000 --- a/tensor_computing/src/cpu/arm/fp32/scale.cpp +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include "cpu/arm/fp32/tensor_computing_fp32.h" - -EE scale_nchwc8_fp32(F32* input, F32* alpha, F32* beta, I32 in, I32 ic, I32 elements_per_channel, F32* output) -{ - float32x4_t in_vec, out_vec; - float32x4_t one = vdupq_n_f32(float32_t(1.)); - float32x4_t zero = vdupq_n_f32(float32_t(0.)); - U32 index = 0; - for (I32 n = 0; n < in; n++) { - for (I32 c = 0; c < ic; c += 8) { - float32x4_t alpha_vec0 = (alpha == nullptr) ? one : vld1q_f32(alpha + c); - float32x4_t alpha_vec1 = (alpha == nullptr) ? one : vld1q_f32(alpha + c + 4); - float32x4_t beta_vec0 = (beta == nullptr) ? zero : vld1q_f32(beta + c); - float32x4_t beta_vec1 = (beta == nullptr) ? zero : vld1q_f32(beta + c + 4); - for (I32 i = 0; i < elements_per_channel; i++) { - in_vec = vld1q_f32(input + index); - out_vec = vfmaq_f32(beta_vec0, alpha_vec0, in_vec); - vst1q_f32(output+index, out_vec); - - in_vec = vld1q_f32(input + index + 4); - out_vec = vfmaq_f32(beta_vec1, alpha_vec1, in_vec); - vst1q_f32(output+index+4, out_vec); - index += 8; - } - } - } - return SUCCESS; -} - -EE scale_nchw_fp32(F32* input, F32* alpha, F32* beta, I32 in, I32 ic, I32 elements_per_channel, F32* output) -{ - float32x4_t one = vdupq_n_f32(1.); - float32x4_t zero = vdupq_n_f32(0.); - U32 index = 0; - for (I32 n = 0; n < in; n++) { - for (I32 c = 0; c < ic; c++) { - float32x4_t alpha_vec = (alpha == nullptr) ? one : vdupq_n_f32(alpha[c]); - float32x4_t beta_vec = (beta == nullptr) ? 
zero : vdupq_n_f32(beta[c]); - I32 i = 0; - for (; i < elements_per_channel-3; i += 4) { - float32x4_t in_vec = vld1q_f32(input + index); - float32x4_t out_vec = vfmaq_f32(beta_vec, alpha_vec, in_vec); - vst1q_f32(output+index, out_vec); - index += 4; - } - for (; i < elements_per_channel; i++) { - output[index] = alpha[c] * input[index] + beta[c]; - index++; - } - } - } - return SUCCESS; -} - -EE scale_nhwc_fp32(F32* input, F32* alpha, F32* beta, I32 in, I32 ic, I32 elements_per_channel, F32* output) -{ - float32x4_t one = vdupq_n_f32(1.); - float32x4_t zero = vdupq_n_f32(0.); - U32 index = 0; - for (I32 n = 0; n < in; n++) { - for (I32 i = 0; i < elements_per_channel; i++) { - I32 c = 0; - for (; c < ic-3; c += 4) { - float32x4_t alpha_vec = (alpha == nullptr) ? one : vld1q_f32(alpha+c); - float32x4_t beta_vec = (beta == nullptr) ? zero : vld1q_f32(beta+c); - float32x4_t in_vec = vld1q_f32(input + index); - float32x4_t out_vec = vfmaq_f32(beta_vec, alpha_vec, in_vec); - vst1q_f32(output+index, out_vec); - index += 4; - } - for (; c < ic; c++) { - F32 beta_s = (beta == nullptr) ? 0 : beta[c]; - output[index] = alpha[c] * input[index] + beta_s; - index++; - } - } - } - return SUCCESS; -} - -EE scale_fp32(F32* input, I32 axis, I32 nDims, F32* alpha, F32* beta, I32 in, I32 ic, I32 elements_per_channel, F32* output) -{ - if (nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - EE ret = SUCCESS; - if (axis == 1 || axis == 0) { - ret = scale_nchw_fp32(input, alpha, beta, in, ic, elements_per_channel, output); - } else if (axis == nDims - 1) { - ret = scale_nhwc_fp32(input, alpha, beta, in, ic, elements_per_channel, output); - } else if (axis == nDims) { - ret = scale_nchwc8_fp32(input, alpha, beta, in, ic, elements_per_channel, output); - } else { - CHECK_STATUS(NOT_SUPPORTED); - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/fp32/softmax.cpp b/tensor_computing/src/cpu/arm/fp32/softmax.cpp deleted file mode 100644 index 8e88f58e..00000000 --- a/tensor_computing/src/cpu/arm/fp32/softmax.cpp +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
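Review note — the deleted scale.cpp above dispatches on the channel axis: axis 0/1 takes the NCHW path, the last axis takes NHWC, and axis == nDims selects NCHWc8. All three compute the same per-channel affine y = alpha[c] * x + beta[c]; only the stride that walks the channel axis changes. A scalar sketch of the NCHW shape, with illustrative names:

#include <cstddef>

// Per-channel affine on NCHW data, the scalar form of the deleted
// scale_nchw_fp32 vector loop (which guards nullptr the same way).
static void scale_nchw_ref(const float *in, const float *alpha,
                           const float *beta, int batch, int channels,
                           int elements_per_channel, float *out)
{
    size_t idx = 0;
    for (int n = 0; n < batch; n++) {
        for (int c = 0; c < channels; c++) {
            float a = alpha ? alpha[c] : 1.0f;
            float b = beta ? beta[c] : 0.0f;
            for (int i = 0; i < elements_per_channel; i++, idx++)
                out[idx] = a * in[idx] + b;
        }
    }
}

Worth noting: the deleted NCHW scalar tail dereferences alpha[c] and beta[c] directly, without the nullptr guards its own vector loop has, so it silently assumes both pointers are provided on that path.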
-
-
-#include <math.h>
-#include <string.h>
-#include "cpu/arm/fp32/tensor_computing_fp32.h"
-
-void softmax_lastAxis_fp32(const F32* input, I32 loopOuter, I32 loops, F32 *output)
-{
-    for(I32 i = 0; i < loopOuter; i++) {
-        const F32 *inputPtr = input + i * loops;
-        F32 *outputPtr = output + i * loops;
-
-        float32x4_t max_v, sub_v, sum_v, tmp_v;
-        F32 max_s, tmp_s;
-        max_s = array_max_f32(inputPtr, loops);
-        max_v = vdupq_n_f32(max_s);
-        sum_v = vdupq_n_f32(0);
-
-        I32 j = 0;
-        F32 sum_s = 0;
-        for(j = 0; j < loops-3; j += 4) {
-            float32x4_t in = vld1q_f32(inputPtr + j);
-            sub_v = vsubq_f32(in, max_v);
-            tmp_v = vexpq_f32_03_percent_error(sub_v);
-            sum_v = vaddq_f32(sum_v, tmp_v);
-            vst1q_f32(outputPtr + j, tmp_v);
-        }
-        sum_s += vaddvq_f32(sum_v);
-        for(; j < loops; j++){
-            tmp_s = exp(inputPtr[j] - max_s);
-            outputPtr[j] = tmp_s;
-            sum_s += tmp_s;
-        }
-        array_scale_f32(outputPtr, outputPtr, loops, 1.0 / sum_s, 0);
-    }
-}
-
-void softmax_anyAxis_fp32(const F32* input, I32 loopOuter, I32 loops, I32 loopInner, F32 *output)
-{
-    std::vector<F32> buffer(loopInner * 2);
-    F32* maxBuffer = &buffer[0];
-    F32* sumBuffer = &buffer[loopInner];
-    I32 k = 0;
-    for(I32 i = 0; i < loopOuter; i++) {
-        const F32* inputPtrBase = input + i * loops * loopInner;
-        F32* outputPtrBase = output + i * loops * loopInner;
-
-        memcpy(maxBuffer, inputPtrBase, loopInner * sizeof(F32));
-        memset(sumBuffer, 0, loopInner * sizeof(F32));
-        for (I32 j = 1; j < loops; j++) {
-            const F32* inputPtr = inputPtrBase + j * loopInner;
-            for (k = 0; k < loopInner-3; k += 4) {
-                float32x4_t in_v = vld1q_f32(inputPtr + k);
-                float32x4_t out_v = vld1q_f32(maxBuffer + k);
-                float32x4_t max_v = vmaxq_f32(in_v, out_v);
-                vst1q_f32(maxBuffer + k, max_v);
-            }
-            for (; k < loopInner; k++)
-                maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]);
-        }
-        for (I32 j = 0; j < loops; j++) {
-            const F32* inputPtr = inputPtrBase + j * loopInner;
-            F32* outputPtr = outputPtrBase + j * loopInner;
-            for (k = 0; k < loopInner-3; k += 4) {
-                float32x4_t in_v = vld1q_f32(inputPtr + k);
-                float32x4_t max_v = vld1q_f32(maxBuffer + k);
-                float32x4_t sub_v = vsubq_f32(in_v, max_v);
-                float32x4_t exp_v = vexpq_f32_03_percent_error(sub_v);
-                float32x4_t sum_v = vld1q_f32(sumBuffer + k);
-                sum_v = vaddq_f32(sum_v, exp_v);
-                vst1q_f32(sumBuffer + k, sum_v);
-                vst1q_f32(outputPtr + k, exp_v);
-            }
-            for (; k < loopInner; k++) {
-                outputPtr[k] = exp(inputPtr[k] - maxBuffer[k]);
-                sumBuffer[k] += outputPtr[k];
-            }
-        }
-        for (I32 j = 0; j < loops; j++) {
-            F32* outputPtr = outputPtrBase + j * loopInner;
-            for (k = 0; k < loopInner-3; k += 4) {
-                float32x4_t out_v = vld1q_f32(outputPtr + k);
-                float32x4_t sum_v = vld1q_f32(sumBuffer + k);
-                out_v = vdivq_f32(out_v, sum_v);
-                vst1q_f32(outputPtr + k, out_v);
-            }
-            for (; k < loopInner; k++) {
-                outputPtr[k] /= sumBuffer[k];
-            }
-        }
-    }
-}
-
-
-EE softmax_fp32(TensorDesc inputDesc, const F32* input,
-    int axis,
-    TensorDesc outputDesc, F32* output)
-{
-    UNUSED(outputDesc);
-    if(nullptr == input || nullptr == output)
-        CHECK_STATUS(NULL_POINTER);
-
-    U32 size = tensorNumElements(inputDesc);
-    axis = (axis + inputDesc.nDims) % inputDesc.nDims;
-    axis = inputDesc.nDims - 1 - axis;
-    I32 loops = inputDesc.dims[axis];
-
-    I32 loopInner = 1;
-    for (int i = 0; i < axis; i++)
-        loopInner *= inputDesc.dims[i];
-    U32 loopOuter = size / loops / loopInner;
-
-    if (loopInner == 1) {
-        if (DF_NCHWC8 == inputDesc.df && 4 == inputDesc.nDims &&
-            (inputDesc.dims[1] != 1 || inputDesc.dims[0] != 1)) {
-            CHECK_REQUIREMENT(2 != axis);
-            loopInner *= 8;
-            loopOuter /= 8;
-            softmax_anyAxis_fp32(input, loopOuter, loops, loopInner, output);
-        } else {
-            softmax_lastAxis_fp32(input, loopOuter, loops, output);
-        }
-    } else {
-        CHECK_REQUIREMENT(DF_NCHWC8 != inputDesc.df);
-        softmax_anyAxis_fp32(input, loopOuter, loops, loopInner, output);
-    }
-    return SUCCESS;
-}
diff --git a/tensor_computing/src/cpu/arm/fp32/tensor_computing_fp32.h b/tensor_computing/src/cpu/arm/fp32/tensor_computing_fp32.h
deleted file mode 100644
index 4f7d9191..00000000
--- a/tensor_computing/src/cpu/arm/fp32/tensor_computing_fp32.h
+++ /dev/null
@@ -1,179 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _H_TENSOR_COMPUTING_FP32
-#define _H_TENSOR_COMPUTING_FP32
-#include <vector>
-#include "sys.h"
-#include "type.h"
-#include "tensor_desc.h"
-#include "error.h"
-#include "tensor_computing_type.h"
-#include "cpu/arm/fp32/arm_functions_fp32.h"
-
-EE convolution_transform_filter_fp32(TensorDesc filterDesc, const F32* filter,
-    ConvolutionForwardAlgorithm algorithm,
-    TensorDesc *ftmDesc, F32* filterTransformed);
-
-EE convolution_infer_forward_tmp_bytes_fp32(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc,
-    ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes);
-
-EE convolution_fp32(TensorDesc inputDesc, F32* input,
-    TensorDesc filterDesc, const F32* filter,
-    ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm,
-    TensorDesc biasDesc, const F32* bias,
-    U32 tmpBytes, void* tmp,
-    TensorDesc outputDesc, F32* output,
-    ActivationDesc activationDesc,
-    Arch arch);
-
-#ifdef __aarch64__
-EE convolution_gemm_V8(TensorDesc inputDesc, F32* inArray,
-    TensorDesc filterDesc, const F32* filterArray,
-    ConvolutionDesc convDesc,
-    TensorDesc biasDesc, const F32* biasArray,
-    U32 tmpBytes, void* tmp,
-    TensorDesc outputDesc, F32* outArray,
-    ActivationDesc activationDesc);
-#else
-EE convolution_gemm_V7(TensorDesc inputDesc, F32* inArray,
-    TensorDesc filterDesc, const F32* filterArray,
-    ConvolutionDesc convDesc,
-    TensorDesc biasDesc, const F32* biasArray,
-    U32 tmpBytes, void* tmp,
-    TensorDesc outputDesc, F32* outArray,
-    ActivationDesc activationDesc);
-#endif
-
-#ifdef __aarch64__
-EE convolution_gemm_icnchw_V8(TensorDesc inputDesc, F32* inArray,
-    TensorDesc filterDesc, const F32* filterArray,
-    ConvolutionDesc convDesc,
-    TensorDesc biasDesc, const F32* biasArray,
-    U32 tmpBytes, void* tmp,
-    TensorDesc outputDesc, F32* outArray,
-    ActivationDesc activationDesc);
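The deleted softmax kernels above follow the standard numerically stable formulation: subtract the per-row maximum before exponentiating (so exp never overflows), accumulate the sum four NEON lanes at a time, and finish the loops % 4 remainder with scalar exp. A minimal scalar sketch of the same scheme, using plain C++ types instead of the project's F32/I32 aliases (the function name here is illustrative, not Bolt API):

#include <algorithm>
#include <cmath>

// Numerically stable softmax over the last axis:
// out[j] = exp(in[j] - max) / sum_j exp(in[j] - max).
// Scalar model of softmax_lastAxis_fp32; the NEON version above does the
// same thing 4 floats per iteration plus this scalar path for the tail.
void softmax_last_axis(const float *input, int loopOuter, int loops, float *output)
{
    for (int i = 0; i < loopOuter; i++) {
        const float *in = input + i * loops;
        float *out = output + i * loops;
        float maxv = *std::max_element(in, in + loops);
        float sum = 0;
        for (int j = 0; j < loops; j++) {
            out[j] = std::exp(in[j] - maxv);  // exponent <= 0, cannot overflow
            sum += out[j];
        }
        float inv = 1.0f / sum;               // one division, then scale
        for (int j = 0; j < loops; j++) {
            out[j] *= inv;
        }
    }
}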
-#else
-EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, F32* inArray,
-    TensorDesc filterDesc, const F32* filterArray,
-    ConvolutionDesc convDesc,
-    TensorDesc biasDesc, const F32* biasArray,
-    U32 tmpBytes, void* tmp,
-    TensorDesc outputDesc, F32* outArray,
-    ActivationDesc activationDesc);
-#endif
-
-EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray,
-    TensorDesc filterDesc, const F32* filterArray,
-    ConvolutionDesc convDesc,
-    TensorDesc biasDesc, const F32* biasArray,
-    U32 tmpBytes, void* tmp,
-    TensorDesc outputDesc, F32* outArray,
-    ActivationDesc activationDesc);
-
-EE deconvolution_infer_forward_algorithm_fp32(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc,
-    ConvolutionDesc convDesc, ConvolutionPolicy policy, ConvolutionForwardAlgorithm *algorithm);
-
-EE deconvolution_infer_forward_tmp_bytes_fp32(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc,
-    ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes);
-
-EE deconvolution_transform_filter_bytes_fp32(TensorDesc filterDesc, ConvolutionForwardAlgorithm algorithm, U32* bytes);
-
-EE deconvolution_transform_filter_fp32(TensorDesc filterDesc, const F32* filter,
-    ConvolutionForwardAlgorithm algorithm,
-    TensorDesc *ftmDesc, F32* filterTransformed);
-
-EE deconvolution_fp32(TensorDesc inputDesc, F32* input,
-    TensorDesc filterDesc, const F32* filter,
-    ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm,
-    TensorDesc biasDesc, const F32* bias,
-    U32 tmpBytes, void* tmp,
-    TensorDesc outputDesc, F32* output,
-    ActivationDesc activationDesc,
-    Arch arch);
-
-EE detectionoutput_fp32(std::vector<TensorDesc> inputDesc, std::vector<F32*> input, DetectionOutputDesc detectionoutputDesc, TensorDesc outputDesc, F32* output);
-
-EE pooling_fp32(TensorDesc inputDesc, const F32* input, PoolingDesc poolingDesc, TensorDesc outputDesc, F32* output);
-
-EE softmax_fp32(TensorDesc inputDesc, const F32* input, int axis, TensorDesc outputDesc, F32* output);
-
-EE concat_fp32(std::vector<TensorDesc> inputDesc, std::vector<void*> input, TensorDesc outputDesc, void* output, U32 concatDim);
-
-EE attention_fp32(U32 batch, U32 numHeads, I32 fromSequenceLength, I32 toSequenceLength, const F32 *input, F32 *output);
-
-EE clip_fp32(F32 *input, F32 *output, I32 len, F32 minValue, F32 maxValue);
-
-EE depthwise_convolution_infer_forward_algorithm_fp32(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc,
-    ConvolutionDesc convDesc, ConvolutionPolicy policy, DepthwiseConvolutionForwardAlgorithm *algorithm, DataType targetDataType);
-
-EE depthwise_convolution_transform_filter_bytes_fp32(TensorDesc filterDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32* bytes);
-
-EE depthwise_convolution_transform_filter_fp32(TensorDesc filterDesc, const F32* filter,
-    DepthwiseConvolutionForwardAlgorithm algorithm,
-    TensorDesc *ftmDesc, F32* filterTransformed);
-
-EE depthwise_convolution_infer_forward_tmp_bytes_fp32(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc,
-    ConvolutionDesc convDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32 *bytes);
-
-EE depthwise_convolution_transform_filter_fp32(TensorDesc filterDesc, const F32* filter,
-    DepthwiseConvolutionForwardAlgorithm algorithm,
-    TensorDesc *ftmDesc, F32* filterTransformed);
-
-EE depthwise_convolution_fp32(TensorDesc inputDesc, F32* input,
-    TensorDesc filterDesc, const F32* filter,
-    ConvolutionDesc convDesc,
-    DepthwiseConvolutionForwardAlgorithm algorithm,
-    TensorDesc biasDesc, const F32* bias,
-    U32 tmpBytes, void* tmp,
-    TensorDesc outputDesc, F32* output,
-    ActivationDesc depthwiseActivationDesc,
-    ActivationDesc pointwiseActivationDesc,
-    Arch arch);
-
-EE eltwise_fp32(std::vector<void*> input, std::vector<U32> inputSize, U32 num, U32 len, void *output, EltwiseMode eltwiseMode);
-
-EE lstmcell_fp32(TensorDesc xDesc, const void* currentX,
-    TensorDesc filterDesc, const void* filter,
-    TensorDesc biasDesc, const void* bias,
-    void *state,
-    U32 tmpBytes, void *tmp,
-    LSTMDesc lstmDesc, U32 batchStrideX, U32 batchStrideH,
-    TensorDesc hDesc, void* output,
-    Arch arch);
-
-EE multiply_fp32(F32 *alpha, F32 *beta, TensorDesc inputDesc, F32* input, TensorDesc outputDesc, F32 *output);
-
-EE layer_normalization_fp32(F32 *alpha, F32 *beta,
-    TensorDesc inputDesc, F32* input,
-    TensorDesc outputDesc, F32* output);
-
-EE pooling_fp32(TensorDesc inputDesc, const F32* input, PoolingDesc poolingDesc, const F32* scale, TensorDesc outputDesc, F32* output);
-
-EE priorbox_fp32(std::vector<TensorDesc> inputDesc, PriorBoxDesc priorboxDesc, TensorDesc outputDesc, F32* output);
-
-EE scale_fp32(F32* input, I32 axis, I32 nDims, F32* alpha, F32* beta, I32 in, I32 ic, I32 elements_per_channel, F32* output);
-
-EE softmax_fp32(TensorDesc inputDesc, const F32* input,
-    TensorDesc outputDesc, F32* output);
-
-EE check_fp32(TensorDesc inputDescA, const F32* inputA,
-    TensorDesc inputDescB, const F32* inputB,
-    CheckMode checkMode,
-    TensorDesc outputDesc, I32* output);
-
-EE attention_mask_fp32(TensorDesc inputDesc, const F32* input,
-    I32 attentionLength, bool sameLength, float maskValue,
-    TensorDesc outputDesc, F32* output);
-#endif
diff --git a/tensor_computing/src/cpu/arm/int8/concat.cpp b/tensor_computing/src/cpu/arm/int8/concat.cpp
deleted file mode 100644
index 429d1b60..00000000
--- a/tensor_computing/src/cpu/arm/int8/concat.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
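The deleted concat_int8 that follows cannot simply memcpy its inputs, because each INT8 tensor carries its own quantization scale. It takes the smallest input scale as the output scale and requantizes every other tensor through an int8 fixed-point factor, skipping tensors whose scale ratio is so close to 1 that no stored value would change. A scalar sketch of that per-tensor rescaling, assuming (as the surviving fragments suggest) a factor with 7 fraction bits; the NEON original widens int8x8 lanes to int16x8 for the same multiply-and-shift:

#include <cstddef>
#include <cstdint>

// Requantize one input tensor of a concat to the common (smallest) scale.
// factor = round(min_scale / scale_i * 128) is an int8 ratio with 7 fraction
// bits, so each product is shifted right by 7. Hedged sketch, not Bolt API.
void rescale_to_common_scale(const int8_t *in, int8_t *out, size_t num,
                             float scale_i, float min_scale)
{
    float rescale = min_scale / scale_i;
    if (rescale >= 127.5f / 128) {  // even 128 would not be updated to 127
        for (size_t i = 0; i < num; i++) {
            out[i] = in[i];         // copy through unchanged
        }
        return;
    }
    int8_t factor = (int8_t)(rescale * 128);  // < 127.5, fits in int8
    for (size_t i = 0; i < num; i++) {
        // int16 is wide enough: |in[i] * factor| <= 128 * 127
        out[i] = (int8_t)(((int16_t)in[i] * factor) >> 7);
    }
}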
-
-
-#ifdef _USE_INT8
-#include <string.h>
-#include "cpu/arm/int8/tensor_computing_int8.h"
-
-EE concat_int8(std::vector<TensorDesc> inputDesc, std::vector<void*> input, F32* inputScale,
-    TensorDesc outputDesc, void* output, F32* outputScale, U32 concatDim)
-{
-    if (inputDesc.size() < 1) {
-        CHECK_STATUS(NOT_MATCH);
-    }
-    if(inputDesc.size() == 1) {
-        memcpy(output, input[0], tensorNumBytes(outputDesc));
-        return SUCCESS;
-    }
-    if (concatDim != 0 && concatDim != 1) {
-        CHECK_STATUS(NOT_SUPPORTED);
-    }
-
-    F32 min_scale = inputScale[0];
-    U32 min_idx = 0;
-
-    for (U32 i=1; i<inputDesc.size(); i++) {
-        if (min_scale > inputScale[i]) {
-            min_scale = inputScale[i];
-            min_idx = i;
-        }
-    }
-    *outputScale = min_scale;
-
-    for (U32 i=0; i<inputDesc.size(); i++) {
-        F32 rescale = min_scale / inputScale[i];
-        if (rescale >= 0.9961) {  // Even 128 will not be updated to 127
-            continue;
-        }
-        INT8 factor = rescale * 128;
-
-        if (factor < 2) {
-            continue;
-        }
-
-        int8x8_t fact = vdup_n_s8(factor);
-
-        U32 num = tensorNumElements(inputDesc[i]);
-        U32 i32 = num / 32;
-
-        int8x8_t in[4];
-        int16x8_t in16[4];
-
-        for (U32 i=0; i
-
-#include "sys.h"
-#include "type.h"
-#include "error.h"
-#include "tensor_desc.h"
-#include "tensor_computing_type.h"
-
-
-template <typename OT>
-EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale, TensorDesc filterDesc, const void* filter, F16* filterScale,
-    ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc,
-    void* output, F16* outputScale, ActivationDesc am);
-
-template <typename OT>
-EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale, TensorDesc filterDesc, const void* filter, F16* filterScale,
-    ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc,
-    void* output, F16* outputScale, ActivationDesc am);
-
-inline EE convolution_gemm(TensorDesc inputDesc, const void* input, F16* inputScale, TensorDesc filterDesc, const void* filter, F16* filterScale,
-    ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc,
-    void* output, F16* outputScale, ActivationDesc am, Arch arch)
-{
-    EE ret = SUCCESS;
-    switch (arch) {
-        case ARM_A55:
-            ret = convolution_gemm_A55(inputDesc, input, inputScale,
-                filterDesc, filter, filterScale,
-                convDesc,
-                biasDesc, bias,
-                tmpBytes, tmp,
-                outputDesc, output, outputScale,
-                am);
-            break;
-        case ARM_A76:
-            ret = convolution_gemm_A76(inputDesc, input, inputScale,
-                filterDesc, filter, filterScale,
-                convDesc,
-                biasDesc, bias,
-                tmpBytes, tmp,
-                outputDesc, output, outputScale,
-                am);
-            break;
-        default:
-            return NOT_SUPPORTED;
-    }
-    return ret;
-}
-
-inline EE quantize_I32(U32 num_v, I32* out_d, I32 factor, F32 scale, INT8* out_q)
-{
-    // num_v is the number of q-form vectors (I32)
-    I32 *arr_d = out_d;
-    I32 fact = factor;
-    INT8 *arr_q = out_q;
-    U32 i28 = num_v / 28;  // The number of iterations, each handling 28 vectors
-
-    if (i28 > 0) {
-        __asm__ __volatile__(
-            "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n"
-            "ldr s0, [%[factor]]\n"
-            "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n"
-            "mov x1, %[i]\n"
-            "ld4 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[out_d]], #64\n"
-            "dup v0.4s, v0.s[0]\n"
-            "ld4 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[out_d]], #64\n"
-            "ld4 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[out_d]], #64\n"
-            "ld4 {v21.4s, v22.4s, v23.4s, v24.4s}, [%[out_d]], #64\n"
-
-            "0:\n"
-            "ld4 {v25.4s, v26.4s, v27.4s, v28.4s}, [%[out_d]], #64\n"
-            "subs x1, x1, #1\n"
-
-            "mul v4.4s, v4.4s, v0.4s\n"
-            "mul v3.4s, v3.4s, v0.4s\n"
-            "mul v2.4s, v2.4s, v0.4s\n"
-            "mul v1.4s, v1.4s, v0.4s\n"
-
"mul v8.4s, v8.4s, v0.4s\n" - "sri v4.4s, v3.4s, #8\n" - "mul v7.4s, v7.4s, v0.4s\n" - "sri v2.4s, v1.4s, #8\n" - "mul v6.4s, v6.4s, v0.4s\n" - "mul v5.4s, v5.4s, v0.4s\n" - "sri v4.4s, v2.4s, #16\n" - - "mul v12.4s, v12.4s, v0.4s\n" - "sri v8.4s, v7.4s, #8\n" - "mul v11.4s, v11.4s, v0.4s\n" - "sri v6.4s, v5.4s, #8\n" - "mul v10.4s, v10.4s, v0.4s\n" - "str q4, [%[out_q]], #16\n" - "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" - "mul v9.4s, v9.4s, v0.4s\n" - "sri v8.4s, v6.4s, #16\n" - - "mul v16.4s, v16.4s, v0.4s\n" - "sri v12.4s, v11.4s, #8\n" - "mul v15.4s, v15.4s, v0.4s\n" - "sri v10.4s, v9.4s, #8\n" - "mul v14.4s, v14.4s, v0.4s\n" - "str q8, [%[out_q]], #16\n" - "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n" - "mul v13.4s, v13.4s, v0.4s\n" - "sri v12.4s, v10.4s, #16\n" - - "mul v20.4s, v20.4s, v0.4s\n" - "sri v16.4s, v15.4s, #8\n" - "mul v19.4s, v19.4s, v0.4s\n" - "sri v14.4s, v13.4s, #8\n" - "mul v18.4s, v18.4s, v0.4s\n" - "str q12, [%[out_q]], #16\n" - "ld4 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[out_d]], #64\n" - "mul v17.4s, v17.4s, v0.4s\n" - "sri v16.4s, v14.4s, #16\n" - - "mul v24.4s, v24.4s, v0.4s\n" - "sri v20.4s, v19.4s, #8\n" - "mul v23.4s, v23.4s, v0.4s\n" - "sri v18.4s, v17.4s, #8\n" - "mul v22.4s, v22.4s, v0.4s\n" - "str q16, [%[out_q]], #16\n" - "ld4 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[out_d]], #64\n" - "mul v21.4s, v21.4s, v0.4s\n" - "sri v20.4s, v18.4s, #16\n" - - "mul v28.4s, v28.4s, v0.4s\n" - "sri v24.4s, v23.4s, #8\n" - "mul v27.4s, v27.4s, v0.4s\n" - "sri v22.4s, v21.4s, #8\n" - "mul v26.4s, v26.4s, v0.4s\n" - "str q20, [%[out_q]], #16\n" - "ld4 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[out_d]], #64\n" - "mul v25.4s, v25.4s, v0.4s\n" - "sri v24.4s, v22.4s, #16\n" - - "sri v28.4s, v27.4s, #8\n" - "sri v26.4s, v25.4s, #8\n" - "str q24, [%[out_q]], #16\n" - "sri v28.4s, v26.4s, #16\n" - "ld4 {v21.4s, v22.4s, v23.4s, v24.4s}, [%[out_d]], #64\n" - "str q28, [%[out_q]], #16\n" - "bne 0b\n" - :[out_d]"+r"(arr_d), - [out_q]"+r"(arr_q) - :[factor]"r"(&fact), - [i]"r"((I64)i28) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x1" - ); - arr_d -= 96; // Prefetched 24 extra vectors - } - - U32 remainder = num_v - i28 * 28; - - if (remainder % 4) { - for (U32 i=0; i<8; i++) { - arr_q[i] = arr_d[i] * scale; - } - arr_d += 8; - arr_q += 8; - remainder -= 2; - } - - switch(remainder) { - case 24: { - __asm__ __volatile__( - "ldr s0, [%[factor]]\n" - "dup v0.4s, v0.s[0]\n" - - "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" - "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n" - "ld4 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[out_d]], #64\n" - "ld4 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[out_d]], #64\n" - "ld4 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[out_d]], #64\n" - "ld4 {v21.4s, v22.4s, v23.4s, v24.4s}, [%[out_d]], #64\n" - - "mul v4.4s, v4.4s, v0.4s\n" - "mul v3.4s, v3.4s, v0.4s\n" - "mul v2.4s, v2.4s, v0.4s\n" - "mul v1.4s, v1.4s, v0.4s\n" - - "mul v8.4s, v8.4s, v0.4s\n" - "sri v4.4s, v3.4s, #8\n" - "mul v7.4s, v7.4s, v0.4s\n" - "sri v2.4s, v1.4s, #8\n" - "mul v6.4s, v6.4s, v0.4s\n" - "mul v5.4s, v5.4s, v0.4s\n" - "sri v4.4s, v2.4s, #16\n" - - "mul v12.4s, v12.4s, v0.4s\n" - "sri v8.4s, v7.4s, #8\n" - "mul v11.4s, v11.4s, v0.4s\n" - "sri v6.4s, v5.4s, #8\n" - "mul v10.4s, v10.4s, v0.4s\n" - "str q4, [%[out_q]], #16\n" - "mul v9.4s, v9.4s, v0.4s\n" - "sri v8.4s, v6.4s, #16\n" - - "mul v16.4s, v16.4s, v0.4s\n" - 
"sri v12.4s, v11.4s, #8\n" - "mul v15.4s, v15.4s, v0.4s\n" - "sri v10.4s, v9.4s, #8\n" - "mul v14.4s, v14.4s, v0.4s\n" - "str q8, [%[out_q]], #16\n" - "mul v13.4s, v13.4s, v0.4s\n" - "sri v12.4s, v10.4s, #16\n" - - "mul v20.4s, v20.4s, v0.4s\n" - "sri v16.4s, v15.4s, #8\n" - "mul v19.4s, v19.4s, v0.4s\n" - "sri v14.4s, v13.4s, #8\n" - "mul v18.4s, v18.4s, v0.4s\n" - "str q12, [%[out_q]], #16\n" - "mul v17.4s, v17.4s, v0.4s\n" - "sri v16.4s, v14.4s, #16\n" - - "mul v24.4s, v24.4s, v0.4s\n" - "sri v20.4s, v19.4s, #8\n" - "mul v23.4s, v23.4s, v0.4s\n" - "sri v18.4s, v17.4s, #8\n" - "mul v22.4s, v22.4s, v0.4s\n" - "str q16, [%[out_q]], #16\n" - "mul v21.4s, v21.4s, v0.4s\n" - "sri v20.4s, v18.4s, #16\n" - - "sri v24.4s, v23.4s, #8\n" - "sri v22.4s, v21.4s, #8\n" - "str q20, [%[out_q]], #16\n" - "sri v24.4s, v22.4s, #16\n" - - "str q24, [%[out_q]], #16\n" - :[out_d]"+r"(arr_d), - [out_q]"+r"(arr_q) - :[factor]"r"(&fact) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "x1" - ); - break; - } - case 20: { - __asm__ __volatile__( - "ldr s0, [%[factor]]\n" - "dup v0.4s, v0.s[0]\n" - - "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" - "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n" - "ld4 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[out_d]], #64\n" - "ld4 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[out_d]], #64\n" - "ld4 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[out_d]], #64\n" - - "mul v4.4s, v4.4s, v0.4s\n" - "mul v3.4s, v3.4s, v0.4s\n" - "mul v2.4s, v2.4s, v0.4s\n" - "mul v1.4s, v1.4s, v0.4s\n" - - "mul v8.4s, v8.4s, v0.4s\n" - "sri v4.4s, v3.4s, #8\n" - "mul v7.4s, v7.4s, v0.4s\n" - "sri v2.4s, v1.4s, #8\n" - "mul v6.4s, v6.4s, v0.4s\n" - "mul v5.4s, v5.4s, v0.4s\n" - "sri v4.4s, v2.4s, #16\n" - - "mul v12.4s, v12.4s, v0.4s\n" - "sri v8.4s, v7.4s, #8\n" - "mul v11.4s, v11.4s, v0.4s\n" - "sri v6.4s, v5.4s, #8\n" - "mul v10.4s, v10.4s, v0.4s\n" - "str q4, [%[out_q]], #16\n" - "mul v9.4s, v9.4s, v0.4s\n" - "sri v8.4s, v6.4s, #16\n" - - "mul v16.4s, v16.4s, v0.4s\n" - "sri v12.4s, v11.4s, #8\n" - "mul v15.4s, v15.4s, v0.4s\n" - "sri v10.4s, v9.4s, #8\n" - "mul v14.4s, v14.4s, v0.4s\n" - "str q8, [%[out_q]], #16\n" - "mul v13.4s, v13.4s, v0.4s\n" - "sri v12.4s, v10.4s, #16\n" - - "mul v20.4s, v20.4s, v0.4s\n" - "sri v16.4s, v15.4s, #8\n" - "mul v19.4s, v19.4s, v0.4s\n" - "sri v14.4s, v13.4s, #8\n" - "mul v18.4s, v18.4s, v0.4s\n" - "str q12, [%[out_q]], #16\n" - "mul v17.4s, v17.4s, v0.4s\n" - "sri v16.4s, v14.4s, #16\n" - - "sri v20.4s, v19.4s, #8\n" - "sri v18.4s, v17.4s, #8\n" - "str q16, [%[out_q]], #16\n" - "sri v20.4s, v18.4s, #16\n" - - "str q20, [%[out_q]], #16\n" - :[out_d]"+r"(arr_d), - [out_q]"+r"(arr_q) - :[factor]"r"(&fact) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "x1" - ); - break; - } - case 16: { - __asm__ __volatile__( - "ldr s0, [%[factor]]\n" - "dup v0.4s, v0.s[0]\n" - - "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" - "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n" - "ld4 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[out_d]], #64\n" - "ld4 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[out_d]], #64\n" - - "mul v4.4s, v4.4s, v0.4s\n" - "mul v3.4s, v3.4s, v0.4s\n" - "mul v2.4s, v2.4s, v0.4s\n" - "mul v1.4s, v1.4s, v0.4s\n" - - "mul v8.4s, v8.4s, v0.4s\n" - "sri v4.4s, v3.4s, #8\n" - "mul v7.4s, v7.4s, v0.4s\n" - "sri v2.4s, v1.4s, #8\n" - "mul v6.4s, 
v6.4s, v0.4s\n" - "mul v5.4s, v5.4s, v0.4s\n" - "sri v4.4s, v2.4s, #16\n" - - "mul v12.4s, v12.4s, v0.4s\n" - "sri v8.4s, v7.4s, #8\n" - "mul v11.4s, v11.4s, v0.4s\n" - "sri v6.4s, v5.4s, #8\n" - "mul v10.4s, v10.4s, v0.4s\n" - "str q4, [%[out_q]], #16\n" - "mul v9.4s, v9.4s, v0.4s\n" - "sri v8.4s, v6.4s, #16\n" - - "mul v16.4s, v16.4s, v0.4s\n" - "sri v12.4s, v11.4s, #8\n" - "mul v15.4s, v15.4s, v0.4s\n" - "sri v10.4s, v9.4s, #8\n" - "mul v14.4s, v14.4s, v0.4s\n" - "str q8, [%[out_q]], #16\n" - "mul v13.4s, v13.4s, v0.4s\n" - "sri v12.4s, v10.4s, #16\n" - - "sri v16.4s, v15.4s, #8\n" - "sri v14.4s, v13.4s, #8\n" - "str q12, [%[out_q]], #16\n" - "sri v16.4s, v14.4s, #16\n" - - "str q16, [%[out_q]], #16\n" - :[out_d]"+r"(arr_d), - [out_q]"+r"(arr_q) - :[factor]"r"(&fact) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "x1" - ); - break; - } - case 12: { - __asm__ __volatile__( - "ldr s0, [%[factor]]\n" - "dup v0.4s, v0.s[0]\n" - - "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" - "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n" - "ld4 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[out_d]], #64\n" - - "mul v4.4s, v4.4s, v0.4s\n" - "mul v3.4s, v3.4s, v0.4s\n" - "mul v2.4s, v2.4s, v0.4s\n" - "mul v1.4s, v1.4s, v0.4s\n" - - "mul v8.4s, v8.4s, v0.4s\n" - "sri v4.4s, v3.4s, #8\n" - "mul v7.4s, v7.4s, v0.4s\n" - "sri v2.4s, v1.4s, #8\n" - "mul v6.4s, v6.4s, v0.4s\n" - "mul v5.4s, v5.4s, v0.4s\n" - "sri v4.4s, v2.4s, #16\n" - - "mul v12.4s, v12.4s, v0.4s\n" - "sri v8.4s, v7.4s, #8\n" - "mul v11.4s, v11.4s, v0.4s\n" - "sri v6.4s, v5.4s, #8\n" - "mul v10.4s, v10.4s, v0.4s\n" - "str q4, [%[out_q]], #16\n" - "mul v9.4s, v9.4s, v0.4s\n" - "sri v8.4s, v6.4s, #16\n" - - "sri v12.4s, v11.4s, #8\n" - "sri v10.4s, v9.4s, #8\n" - "str q8, [%[out_q]], #16\n" - "sri v12.4s, v10.4s, #16\n" - - "str q12, [%[out_q]], #16\n" - :[out_d]"+r"(arr_d), - [out_q]"+r"(arr_q) - :[factor]"r"(&fact) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "x1" - ); - break; - } - case 8: { - __asm__ __volatile__( - "ldr s0, [%[factor]]\n" - "dup v0.4s, v0.s[0]\n" - - "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" - "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n" - - "mul v4.4s, v4.4s, v0.4s\n" - "mul v3.4s, v3.4s, v0.4s\n" - "mul v2.4s, v2.4s, v0.4s\n" - "mul v1.4s, v1.4s, v0.4s\n" - - "mul v8.4s, v8.4s, v0.4s\n" - "sri v4.4s, v3.4s, #8\n" - "mul v7.4s, v7.4s, v0.4s\n" - "sri v2.4s, v1.4s, #8\n" - "mul v6.4s, v6.4s, v0.4s\n" - "mul v5.4s, v5.4s, v0.4s\n" - "sri v4.4s, v2.4s, #16\n" - - "sri v8.4s, v7.4s, #8\n" - "sri v6.4s, v5.4s, #8\n" - "str q4, [%[out_q]], #16\n" - "sri v8.4s, v6.4s, #16\n" - - "str q8, [%[out_q]], #16\n" - :[out_d]"+r"(arr_d), - [out_q]"+r"(arr_q) - :[factor]"r"(&fact) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "x1" - ); - break; - } - case 4: { - __asm__ __volatile__( - "ldr s0, [%[factor]]\n" - "dup v0.4s, v0.s[0]\n" - - "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" - - "mul v4.4s, v4.4s, v0.4s\n" - "mul v3.4s, v3.4s, v0.4s\n" - "mul v2.4s, v2.4s, v0.4s\n" - "mul v1.4s, v1.4s, v0.4s\n" - - "sri v4.4s, v3.4s, #8\n" - "sri v2.4s, v1.4s, #8\n" - "sri v4.4s, v2.4s, #16\n" - - "str q4, [%[out_q]], #16\n" - :[out_d]"+r"(arr_d), - [out_q]"+r"(arr_q) - :[factor]"r"(&fact) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "x1" - ); - break; - } - case 0: { - break; - } - default: { - return UNKNOWN; - } - } - return SUCCESS; -} -#endif 
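The quantize_I32 routine above converts 32-bit GEMM accumulators to INT8 without floating-point math: the caller derives factor = 127 * 2^24 / max (the factor_max / factor_min computation at the end of this diff), each value is multiplied by factor so the INT8 result lands in the top byte of the 32-bit product, and the SRI (shift-right-and-insert) instructions merge the top bytes of four deinterleaved vectors so every 16-byte store emits 16 quantized values. A scalar model of the per-element arithmetic under those assumptions (quantize_i32_scalar is an illustrative name, not Bolt API):

#include <cstddef>
#include <cstdint>

// Fixed-point requantization: for |value| <= maxAbs the 32-bit product
// value * factor stays below 2^31, and its top byte is the int8 result.
static int8_t quantize_one(int32_t value, int32_t factor)
{
    return (int8_t)((value * factor) >> 24);
}

// Quantize num I32 values given the recorded absolute maximum.
static void quantize_i32_scalar(const int32_t *in, int8_t *out, size_t num,
                                int32_t maxAbs)
{
    int32_t factor = 127 * 16777216 / maxAbs;  // 16777216 == 2^24
    for (size_t i = 0; i < num; i++) {
        out[i] = quantize_one(in[i], factor);
    }
}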
-#endif diff --git a/tensor_computing/src/cpu/arm/int8/convolution_gemm_A55.cpp b/tensor_computing/src/cpu/arm/int8/convolution_gemm_A55.cpp deleted file mode 100644 index c8fa2045..00000000 --- a/tensor_computing/src/cpu/arm/int8/convolution_gemm_A55.cpp +++ /dev/null @@ -1,1624 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifdef _USE_INT8 -#include -#include "cpu/arm/int8/convolution_gemm.h" - -template -EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale, TensorDesc filterDesc, const void* filter, F16* filterScale, - ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc, - void* output, F16* outputScale, ActivationDesc activationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - // still im2col + gemm with a smaller buffer - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (fdf != DF_NCHWN8C4) { - return NOT_MATCH; - } - - I64 conv_relu_bool = (activationDesc.mode == ACTIVATION_RELU) ? 1 : 0; - I64 out_f16_bool = (odt == DT_F16) ? 
1 : 0; - I64 scale_known_bool = 0; - if (*outputScale > 0 || ACTIVATION_RELU6 == activationDesc.mode) { - scale_known_bool = 1; - } - - INT8* inArray = (INT8*)input; // It will be updated if there is quantization - INT8* filterArray = (INT8*)filter; - F16* outArray = (F16*)output; - F16* biasArray = (F16*)bias; - INT8* in_pad = (INT8*)tmp; - - // both input and output are stored with C8 - oc /= 8; - ic /= 8; - - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - I32 ohow = oh*ow; - U32 ihiw = ih_pad*iw_pad; - - I32* biasScaled = (I32*)(in_pad + ic*ihiw*8 + 12*fh*fw*ic*8); // Initialize - - //double start, end; - I32 max_i32[4] = {0}; // To record max I32 values - I32 min_i32[4] = {0}; // To record min I32 values - - for (U32 n = 0; n < in; n++) {// for each batch - F16 scale_i = 1.0; - - // quantize input if necessary - if (idt == DT_F16) { - //start = get_current_time_int8(); - F16* in = ((F16*)input) + n*ic*ih*iw*8; - inArray = in_pad + ic*ihiw*8 + 12*fh*fw*ic*8; // After the space for padding and packing - - U32 numData = ic*ih*iw*8; - if (*inputScale > 0) { - scale_i = *inputScale; - } else { - float16x8_t temp_v = vld1q_f16(in); - float16x8_t max_v = temp_v; - float16x8_t min_v = temp_v; - - for (U32 i=8; i 0 && min < 0) { - F16 scale_max = 127.0 / max; - F16 scale_min = -127.0 / min; - scale_i = (scale_max < scale_min) ? scale_max : scale_min; - } else if (max < 0) { - scale_i = -127.0 / min; - } else { // min > 0 - scale_i = 127.0 / max; - } - } - for (U32 i = 0; i < numData; i++) { - F32 temp = in[i] * scale_i; - if (temp > 127) { - inArray[i] = 127; - } else if (temp < -127) { - inArray[i] = -127; - } else { - inArray[i] = temp; - } - } - *inputScale = scale_i; - } else { - scale_i = *inputScale; - } - - if (1 == scale_known_bool) { - if (ACTIVATION_RELU6 == activationDesc.mode) { - *outputScale = 127.0 / 6.0; - } - F32 scaleInt = (*outputScale / *inputScale) / *filterScale; - I32 thresholdP = 127.0 / scaleInt; - I32 thresholdN = 0; - if (ACTIVATION_RELU6 != activationDesc.mode) { - thresholdN = thresholdP * -1; - } - - for (U32 i = 0; i < 4; i++) { - max_i32[i] = thresholdP; - min_i32[i] = thresholdN; - } - } - - if (odt == DT_I8) { // Scale the bias - if (idt == DT_F16) { - biasScaled += ic * ih * iw * 8 / bytesOf(DT_I32); // After the quantized input - } - F32 scale = (*inputScale) * (*filterScale); - for (U32 i=0; i NHWChw12c4 + im2col - U32 in_h[12]; - U32 in_w[12]; - - for (U32 i = 0; i < 12; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; - } - for (U32 c = 0; c < ic; c++) {// for each 8 channels - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - INT8 *in_hw12c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - - INT8 *in_0 = in_hw12c8 + in_h[0]*iw_pad*8 + in_w[0]*8; - INT8 *in_1 = in_hw12c8 + in_h[1]*iw_pad*8 + in_w[1]*8; - INT8 *in_2 = in_hw12c8 + in_h[2]*iw_pad*8 + in_w[2]*8; - INT8 *in_3 = in_hw12c8 + in_h[3]*iw_pad*8 + in_w[3]*8; - INT8 *in_4 = in_hw12c8 + in_h[4]*iw_pad*8 + in_w[4]*8; - INT8 *in_5 = in_hw12c8 + in_h[5]*iw_pad*8 + in_w[5]*8; - INT8 *in_6 = in_hw12c8 + in_h[6]*iw_pad*8 + in_w[6]*8; - INT8 *in_7 = in_hw12c8 + in_h[7]*iw_pad*8 + in_w[7]*8; - INT8 *in_8 = in_hw12c8 + in_h[8]*iw_pad*8 + in_w[8]*8; - INT8 *in_9 = in_hw12c8 + in_h[9]*iw_pad*8 + in_w[9]*8; - INT8 *in_10 = in_hw12c8 + in_h[10]*iw_pad*8 + in_w[10]*8; - INT8 *in_11 = in_hw12c8 + in_h[11]*iw_pad*8 + in_w[11]*8; - - // in_pack (tmp) is reused for each tile - // NHWChw12c4 - 
INT8 *in_pack_0 = in_pack + c*fh*fw*12*8 + fh_idx*fw*12*4 + fw_idx*12*4; - INT8 *in_pack_1 = in_pack_0 + fh*fw*12*4; - - __asm__ __volatile__( - "ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - - "ldr d4, [%[in_4]]\n" - "ldr x6, [%[in_6]]\n" - - "ldr d5, [%[in_5]]\n" - "ins v0.d[1], x2\n" - - "ldr x7, [%[in_7]]\n" - "ins v1.d[1], x3\n" - - "ldr d8, [%[in_8]]\n" - "ins v4.d[1], x6\n" - - "trn1 v20.4s, v0.4s, v1.4s\n" - "ins v5.d[1], x7\n" - - "trn2 v21.4s, v0.4s, v1.4s\n" - "ldr x10, [%[in_10]]\n" - - "ldr d9, [%[in_9]]\n" - "trn1 v24.4s, v4.4s, v5.4s\n" - - "trn2 v25.4s, v4.4s, v5.4s\n" - "ldr x11, [%[in_11]]\n" - - "str q20, [%[pack_0]]\n" - "ins v8.d[1], x10\n" - - "str q24, [%[pack_0], #16]\n" - "ins v9.d[1], x11\n" - - "trn1 v28.4s, v8.4s, v9.4s\n" - "str q21, [%[pack_1]]\n" - - "trn2 v29.4s, v8.4s, v9.4s\n" - "str q25, [%[pack_1], #16]\n" - - "str q28, [%[pack_0], #32]\n" - "str q29, [%[pack_1], #32]\n" - : - :[pack_0]"r"(in_pack_0), - [pack_1]"r"(in_pack_1), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3), - [in_4]"r"(in_4), - [in_5]"r"(in_5), - [in_6]"r"(in_6), - [in_7]"r"(in_7), - [in_8]"r"(in_8), - [in_9]"r"(in_9), - [in_10]"r"(in_10), - [in_11]"r"(in_11) - :"memory", "cc", "v0", "v1", "v4", "v5", "v8", "v9", "v20", "v21", "v24", "v25", "v28", "v29", "x2", "x3", "x6", "x7", "x10", "x11" - ); - } - } - } - - // compute - for (U32 o = 0; o < oc; o++) {// 8 output channels at a time - INT8 *in_hw0 = in_pack; - INT8 *f_o0c0 = filterArray + o*8*fh*fw*ic*8; - I32 *out_buf = biasScaled + oc*8 + n*oc*ohow*8 + o*ohow*8 + hw*8;; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - F16 *b_0 = b0; - I32 *b_0_s = b0_s; - __asm__ __volatile__( - "cbz %[out_f16], 8f\n" - "eor v5.16b, v5.16b, v5.16b\n" - "ldr d1, [%[in_0]]\n" //in_0 - "eor v6.16b, v6.16b, v6.16b\n" - "ldr x1, [%[in_0], #8]\n" - "eor v7.16b, v7.16b, v7.16b\n" - "ins v1.d[1], x1\n" - "eor v8.16b, v8.16b, v8.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 - "eor v9.16b, v9.16b, v9.16b\n" - "ldr x2, [%[f_0], #8]\n" - "eor v10.16b, v10.16b, v10.16b\n" - "ins v0.d[1], x2\n" - "eor v11.16b, v11.16b, v11.16b\n" - "ldr d3, [%[in_0], #16]\n" //in_1 - "eor v12.16b, v12.16b, v12.16b\n" - "ldr x3, [%[in_0], #24]\n" - "eor v13.16b, v13.16b, v13.16b\n" - "ins v3.d[1], x3\n" - "eor v14.16b, v14.16b, v14.16b\n" - "eor v15.16b, v15.16b, v15.16b\n" - "eor v16.16b, v16.16b, v16.16b\n" - - "eor v17.16b, v17.16b, v17.16b\n" - "eor v18.16b, v18.16b, v18.16b\n" - "eor v19.16b, v19.16b, v19.16b\n" - "eor v20.16b, v20.16b, v20.16b\n" - "eor v21.16b, v21.16b, v21.16b\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - "eor v25.16b, v25.16b, v25.16b\n" - "eor v26.16b, v26.16b, v26.16b\n" - "eor v27.16b, v27.16b, v27.16b\n" - "eor v28.16b, v28.16b, v28.16b\n" - "b 7f\n" - - "8:\n" - "ldp q29, q30, [%[b_0_s]]\n" - "mov v5.16b, v29.16b\n" - "ldr d1, [%[in_0]]\n" //in_0 - "mov v7.16b, v29.16b\n" - "ldr x1, [%[in_0], #8]\n" - "mov v9.16b, v29.16b\n" - "ins v1.d[1], x1\n" - "mov v11.16b, v29.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 - "mov v13.16b, v29.16b\n" - "ldr x2, [%[f_0], #8]\n" - "mov v15.16b, v29.16b\n" - "ins v0.d[1], x2\n" - "mov v17.16b, v29.16b\n" - "ldr d3, [%[in_0], #16]\n" //in_1 - "mov v19.16b, v29.16b\n" - "ldr x3, [%[in_0], #24]\n" - "mov v21.16b, v29.16b\n" - "ins v3.d[1], x3\n" - "mov v23.16b, v29.16b\n" - "mov v25.16b, v29.16b\n" - "mov v27.16b, v29.16b\n" - - "mov v6.16b, v30.16b\n" - "mov v8.16b, 
v30.16b\n" - "mov v10.16b, v30.16b\n" - "mov v12.16b, v30.16b\n" - "mov v14.16b, v30.16b\n" - "mov v16.16b, v30.16b\n" - "mov v18.16b, v30.16b\n" - "mov v20.16b, v30.16b\n" - "mov v22.16b, v30.16b\n" - "mov v24.16b, v30.16b\n" - "mov v26.16b, v30.16b\n" - "mov v28.16b, v30.16b\n" - - "7:\n" - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - "0:\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d2, [x3, 32]\n" - "ldr x16, [x3, 40]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "ldr d29, [x0, 16]\n" - "ldr x17, [x0, 24]\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "ins v2.d[1], x16\n" - "ldr d30, [x3, 48]!\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - "ins v29.d[1], x17\n" - - "sdot v13.4s, v0.16b, v3.4b[0]\n" - "ldr x16, [x3, 8]\n" - "subs x2, x2, #4\n" - "sdot v15.4s, v0.16b, v3.4b[1]\n" - "sdot v17.4s, v0.16b, v3.4b[2]\n" - "ins v30.d[1], x16\n" - "sdot v19.4s, v0.16b, v3.4b[3]\n" - - "sdot v21.4s, v0.16b, v2.4b[0]\n" - "sdot v23.4s, v0.16b, v2.4b[1]\n" - "sdot v25.4s, v0.16b, v2.4b[2]\n" - "sdot v27.4s, v0.16b, v2.4b[3]\n" - - "sdot v14.4s, v29.16b, v3.4b[0]\n" - "sdot v16.4s, v29.16b, v3.4b[1]\n" - "ldr d0, [x0, 32]!\n" - "ldr x17, [x0, 8]\n" - "sdot v18.4s, v29.16b, v3.4b[2]\n" - "sdot v20.4s, v29.16b, v3.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "ldr d3, [x3, 16]\n" - "ldr x16, [x3, 24]\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - - "ins v0.d[1], x17\n" - "ins v3.d[1], x16\n" - - "sdot v22.4s, v29.16b, v2.4b[0]\n" - "mov v1.16b, v30.16b\n" - "sdot v24.4s, v29.16b, v2.4b[1]\n" - "sdot v26.4s, v29.16b, v2.4b[2]\n" - "sdot v28.4s, v29.16b, v2.4b[3]\n" - - "bne 0b\n" - "cbz %[out_f16], 6f\n" - - "scvtf v5.4s, v5.4s\n" - "scvtf v6.4s, v6.4s\n" - "ldr d1, [%[factor]]\n" - "ldr x1, [%[factor], #8]\n" - "scvtf v7.4s, v7.4s\n" - "scvtf v8.4s, v8.4s\n" - "ins v1.d[1], x1\n" - "scvtf v9.4s, v9.4s\n" - "ldr d0, [%[b_0]]\n" - "ldr x0, [%[b_0], #8]\n" - "scvtf v10.4s, v10.4s\n" - "scvtf v11.4s, v11.4s\n" - "ins v0.d[1], x0\n" - "scvtf v12.4s, v12.4s\n" - "scvtf v13.4s, v13.4s\n" - "scvtf v14.4s, v14.4s\n" - "scvtf v15.4s, v15.4s\n" - "scvtf v16.4s, v16.4s\n" - "scvtf v17.4s, v17.4s\n" - "scvtf v18.4s, v18.4s\n" - "scvtf v19.4s, v19.4s\n" - "scvtf v20.4s, v20.4s\n" - "scvtf v21.4s, v21.4s\n" - "scvtf v22.4s, v22.4s\n" - "scvtf v23.4s, v23.4s\n" - "scvtf v24.4s, v24.4s\n" - "scvtf v25.4s, v25.4s\n" - "scvtf v26.4s, v26.4s\n" - "scvtf v27.4s, v27.4s\n" - "scvtf v28.4s, v28.4s\n" - - "fmul v5.4s, v1.4s, v5.4s\n" - "fmul v6.4s, v1.4s, v6.4s\n" - "fmul v7.4s, v1.4s, v7.4s\n" - "fmul v8.4s, v1.4s, v8.4s\n" - "fmul v9.4s, v1.4s, v9.4s\n" - "fmul v10.4s, v1.4s, v10.4s\n" - "fmul v11.4s, v1.4s, v11.4s\n" - "fmul v12.4s, v1.4s, v12.4s\n" - "fmul v13.4s, v1.4s, v13.4s\n" - "fmul v14.4s, v1.4s, v14.4s\n" - "fmul v15.4s, v1.4s, v15.4s\n" - "fmul v16.4s, v1.4s, v16.4s\n" - "fmul v17.4s, v1.4s, v17.4s\n" - "fmul v18.4s, v1.4s, v18.4s\n" - "fmul v19.4s, v1.4s, v19.4s\n" - "fmul v20.4s, v1.4s, v20.4s\n" - "fmul v21.4s, v1.4s, v21.4s\n" - "fmul v22.4s, v1.4s, v22.4s\n" - "fmul v23.4s, v1.4s, v23.4s\n" - "fmul v24.4s, v1.4s, v24.4s\n" - "fmul v25.4s, v1.4s, v25.4s\n" - "fmul v26.4s, v1.4s, v26.4s\n" - "fmul v27.4s, v1.4s, v27.4s\n" - "fmul v28.4s, v1.4s, v28.4s\n" - - "fcvtn v5.4h, v5.4s\n" - "fcvtn v7.4h, v7.4s\n" - "fcvtn v9.4h, v9.4s\n" - "fcvtn v11.4h, v11.4s\n" - "fcvtn v13.4h, v13.4s\n" - "fcvtn v15.4h, v15.4s\n" - "fcvtn v17.4h, v17.4s\n" - "fcvtn v19.4h, v19.4s\n" - "fcvtn 
v21.4h, v21.4s\n" - "fcvtn v23.4h, v23.4s\n" - "fcvtn v25.4h, v25.4s\n" - "fcvtn v27.4h, v27.4s\n" - - "fcvtn2 v5.8h, v6.4s\n" - "fcvtn2 v7.8h, v8.4s\n" - "fcvtn2 v9.8h, v10.4s\n" - "fcvtn2 v11.8h, v12.4s\n" - "fcvtn2 v13.8h, v14.4s\n" - "fcvtn2 v15.8h, v16.4s\n" - "fcvtn2 v17.8h, v18.4s\n" - "fcvtn2 v19.8h, v20.4s\n" - "fcvtn2 v21.8h, v22.4s\n" - "fcvtn2 v23.8h, v24.4s\n" - "fcvtn2 v25.8h, v26.4s\n" - "fcvtn2 v27.8h, v28.4s\n" - - "fadd v5.8h, v0.8h, v5.8h\n" - "fadd v7.8h, v0.8h, v7.8h\n" - "fadd v9.8h, v0.8h, v9.8h\n" - "fadd v11.8h, v0.8h, v11.8h\n" - "fadd v13.8h, v0.8h, v13.8h\n" - "fadd v15.8h, v0.8h, v15.8h\n" - "fadd v17.8h, v0.8h, v17.8h\n" - "fadd v19.8h, v0.8h, v19.8h\n" - "fadd v21.8h, v0.8h, v21.8h\n" - "fadd v23.8h, v0.8h, v23.8h\n" - "fadd v25.8h, v0.8h, v25.8h\n" - "fadd v27.8h, v0.8h, v27.8h\n" - - "cbz %[conv_relu], 1f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v5.8h, v5.8h, v1.8h\n" - "fmax v7.8h, v7.8h, v1.8h\n" - "fmax v9.8h, v9.8h, v1.8h\n" - "fmax v11.8h, v11.8h, v1.8h\n" - "fmax v13.8h, v13.8h, v1.8h\n" - "fmax v15.8h, v15.8h, v1.8h\n" - "fmax v17.8h, v17.8h, v1.8h\n" - "fmax v19.8h, v19.8h, v1.8h\n" - "fmax v21.8h, v21.8h, v1.8h\n" - "fmax v23.8h, v23.8h, v1.8h\n" - "fmax v25.8h, v25.8h, v1.8h\n" - "fmax v27.8h, v27.8h, v1.8h\n" - - "1:\n" - "str q5, [%[out_0]]\n" - "str q7, [%[out_0], #16]\n" - "str q9, [%[out_0], #32]\n" - "str q11, [%[out_0], #48]\n" - "str q13, [%[out_0], #64]\n" - "str q15, [%[out_0], #80]\n" - "str q17, [%[out_0], #96]\n" - "str q19, [%[out_0], #112]\n" - "str q21, [%[out_0], #128]\n" - "str q23, [%[out_0], #144]\n" - "str q25, [%[out_0], #160]\n" - "str q27, [%[out_0], #176]\n" - "b 5f\n" - - "6:\n" - "ldr q0, [%[min]]\n" - "ldr q30, [%[max]]\n" - "cbz %[conv_relu], 2f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero - "smax v5.4s, v5.4s, v1.4s\n" - "smax v6.4s, v6.4s, v1.4s\n" - "smax v7.4s, v7.4s, v1.4s\n" - "smax v8.4s, v8.4s, v1.4s\n" - "smax v9.4s, v9.4s, v1.4s\n" - "smax v10.4s, v10.4s, v1.4s\n" - "smax v11.4s, v11.4s, v1.4s\n" - "smax v12.4s, v12.4s, v1.4s\n" - "smax v13.4s, v13.4s, v1.4s\n" - "smax v14.4s, v14.4s, v1.4s\n" - "smax v15.4s, v15.4s, v1.4s\n" - "smax v16.4s, v16.4s, v1.4s\n" - "smax v17.4s, v17.4s, v1.4s\n" - "smax v18.4s, v18.4s, v1.4s\n" - "smax v19.4s, v19.4s, v1.4s\n" - "smax v20.4s, v20.4s, v1.4s\n" - "smax v21.4s, v21.4s, v1.4s\n" - "smax v22.4s, v22.4s, v1.4s\n" - "smax v23.4s, v23.4s, v1.4s\n" - "smax v24.4s, v24.4s, v1.4s\n" - "smax v25.4s, v25.4s, v1.4s\n" - "smax v26.4s, v26.4s, v1.4s\n" - "smax v27.4s, v27.4s, v1.4s\n" - "smax v28.4s, v28.4s, v1.4s\n" - - "2:\n" - "cbz %[scale_known], 7f\n" - "smax v5.4s, v5.4s, v0.4s\n" - "smin v5.4s, v5.4s, v30.4s\n" - "smax v6.4s, v6.4s, v0.4s\n" - "smin v6.4s, v6.4s, v30.4s\n" - "smax v7.4s, v7.4s, v0.4s\n" - "smin v7.4s, v7.4s, v30.4s\n" - "smax v8.4s, v8.4s, v0.4s\n" - "smin v8.4s, v8.4s, v30.4s\n" - "smax v9.4s, v9.4s, v0.4s\n" - "smin v9.4s, v9.4s, v30.4s\n" - "smax v10.4s, v10.4s, v0.4s\n" - "smin v10.4s, v10.4s, v30.4s\n" - "smax v11.4s, v11.4s, v0.4s\n" - "smin v11.4s, v11.4s, v30.4s\n" - "smax v12.4s, v12.4s, v0.4s\n" - "smin v12.4s, v12.4s, v30.4s\n" - "smax v13.4s, v13.4s, v0.4s\n" - "smin v13.4s, v13.4s, v30.4s\n" - "smax v14.4s, v14.4s, v0.4s\n" - "smin v14.4s, v14.4s, v30.4s\n" - "smax v15.4s, v15.4s, v0.4s\n" - "smin v15.4s, v15.4s, v30.4s\n" - "smax v16.4s, v16.4s, v0.4s\n" - "smin v16.4s, v16.4s, v30.4s\n" - "smax v17.4s, v17.4s, v0.4s\n" - "smin v17.4s, v17.4s, v30.4s\n" - "smax v18.4s, v18.4s, v0.4s\n" - "smin v18.4s, v18.4s, v30.4s\n" - "smax v19.4s, 
v19.4s, v0.4s\n" - "smin v19.4s, v19.4s, v30.4s\n" - "smax v20.4s, v20.4s, v0.4s\n" - "smin v20.4s, v20.4s, v30.4s\n" - "smax v21.4s, v21.4s, v0.4s\n" - "smin v21.4s, v21.4s, v30.4s\n" - "smax v22.4s, v22.4s, v0.4s\n" - "smin v22.4s, v22.4s, v30.4s\n" - "smax v23.4s, v23.4s, v0.4s\n" - "smin v23.4s, v23.4s, v30.4s\n" - "smax v24.4s, v24.4s, v0.4s\n" - "smin v24.4s, v24.4s, v30.4s\n" - "smax v25.4s, v25.4s, v0.4s\n" - "smin v25.4s, v25.4s, v30.4s\n" - "smax v26.4s, v26.4s, v0.4s\n" - "smin v26.4s, v26.4s, v30.4s\n" - "smax v27.4s, v27.4s, v0.4s\n" - "smin v27.4s, v27.4s, v30.4s\n" - "smax v28.4s, v28.4s, v0.4s\n" - "smin v28.4s, v28.4s, v30.4s\n" - - "str q5, [%[out_buf]]\n" - "str q6, [%[out_buf], 16]\n" - "str q7, [%[out_buf], 32]\n" - "str q8, [%[out_buf], 48]\n" - "str q9, [%[out_buf], 64]\n" - "str q10, [%[out_buf], 80]\n" - "str q11, [%[out_buf], 96]\n" - "str q12, [%[out_buf], 112]\n" - "str q13, [%[out_buf], 128]\n" - "str q14, [%[out_buf], 144]\n" - "str q15, [%[out_buf], 160]\n" - "str q16, [%[out_buf], 176]\n" - "str q17, [%[out_buf], 192]\n" - "str q18, [%[out_buf], 208]\n" - "str q19, [%[out_buf], 224]\n" - "str q20, [%[out_buf], 240]\n" - "str q21, [%[out_buf], 256]\n" - "str q22, [%[out_buf], 272]\n" - "str q23, [%[out_buf], 288]\n" - "str q24, [%[out_buf], 304]\n" - "str q25, [%[out_buf], 320]\n" - "str q26, [%[out_buf], 336]\n" - "str q27, [%[out_buf], 352]\n" - "str q28, [%[out_buf], 368]\n" - "b 5f\n" - - "7:\n" - "smax v30.4s, v5.4s, v30.4s\n" - "smin v0.4s, v5.4s, v0.4s\n" - "str q5, [%[out_buf]]\n" - "smax v30.4s, v6.4s, v30.4s\n" - "smin v0.4s, v6.4s, v0.4s\n" - "str q6, [%[out_buf], 16]\n" - "smax v30.4s, v7.4s, v30.4s\n" - "smin v0.4s, v7.4s, v0.4s\n" - "str q7, [%[out_buf], 32]\n" - "smax v30.4s, v8.4s, v30.4s\n" - "smin v0.4s, v8.4s, v0.4s\n" - "str q8, [%[out_buf], 48]\n" - "smax v30.4s, v9.4s, v30.4s\n" - "smin v0.4s, v9.4s, v0.4s\n" - "str q9, [%[out_buf], 64]\n" - "smax v30.4s, v10.4s, v30.4s\n" - "smin v0.4s, v10.4s, v0.4s\n" - "str q10, [%[out_buf], 80]\n" - "smax v30.4s, v11.4s, v30.4s\n" - "smin v0.4s, v11.4s, v0.4s\n" - "str q11, [%[out_buf], 96]\n" - "smax v30.4s, v12.4s, v30.4s\n" - "smin v0.4s, v12.4s, v0.4s\n" - "str q12, [%[out_buf], 112]\n" - "smax v30.4s, v13.4s, v30.4s\n" - "smin v0.4s, v13.4s, v0.4s\n" - "str q13, [%[out_buf], 128]\n" - - "smax v30.4s, v14.4s, v30.4s\n" - "smin v0.4s, v14.4s, v0.4s\n" - "str q14, [%[out_buf], 144]\n" - "smax v30.4s, v15.4s, v30.4s\n" - "smin v0.4s, v15.4s, v0.4s\n" - "str q15, [%[out_buf], 160]\n" - "smax v30.4s, v16.4s, v30.4s\n" - "smin v0.4s, v16.4s, v0.4s\n" - "str q16, [%[out_buf], 176]\n" - "smax v30.4s, v17.4s, v30.4s\n" - "smin v0.4s, v17.4s, v0.4s\n" - "str q17, [%[out_buf], 192]\n" - "smax v30.4s, v18.4s, v30.4s\n" - "smin v0.4s, v18.4s, v0.4s\n" - "str q18, [%[out_buf], 208]\n" - "smax v30.4s, v19.4s, v30.4s\n" - "smin v0.4s, v19.4s, v0.4s\n" - "str q19, [%[out_buf], 224]\n" - "smax v30.4s, v20.4s, v30.4s\n" - "smin v0.4s, v20.4s, v0.4s\n" - "str q20, [%[out_buf], 240]\n" - "smax v30.4s, v21.4s, v30.4s\n" - "smin v0.4s, v21.4s, v0.4s\n" - "str q21, [%[out_buf], 256]\n" - "smax v30.4s, v22.4s, v30.4s\n" - "smin v0.4s, v22.4s, v0.4s\n" - "str q22, [%[out_buf], 272]\n" - "smax v30.4s, v23.4s, v30.4s\n" - "smin v0.4s, v23.4s, v0.4s\n" - "str q23, [%[out_buf], 288]\n" - "smax v30.4s, v24.4s, v30.4s\n" - "smin v0.4s, v24.4s, v0.4s\n" - "str q24, [%[out_buf], 304]\n" - "smax v30.4s, v25.4s, v30.4s\n" - "smin v0.4s, v25.4s, v0.4s\n" - "str q25, [%[out_buf], 320]\n" - "smax v30.4s, v26.4s, v30.4s\n" - "smin 
v0.4s, v26.4s, v0.4s\n" - "str q26, [%[out_buf], 336]\n" - "smax v30.4s, v27.4s, v30.4s\n" - "smin v0.4s, v27.4s, v0.4s\n" - "str q27, [%[out_buf], 352]\n" - "smax v30.4s, v28.4s, v30.4s\n" - "smin v0.4s, v28.4s, v0.4s\n" - "str q28, [%[out_buf], 368]\n" - - "str q30, [%[max]]\n" - "str q0, [%[min]]\n" - - "5:\n" - : - :[out_0]"r"(out_o0hw0), - [out_buf]"r"(out_buf), - [in_0]"r"(in_hw0), - [f_0]"r"(f_o0c0), - [ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_0), - [b_0_s]"r"(b_0_s), - [factor]"r"(factor_v), - [max]"r"(max_i32), - [min]"r"(min_i32), - [conv_relu]"r"(conv_relu_bool), - [out_f16]"r"(out_f16_bool), - [scale_known]"r"(scale_known_bool) - :"memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", "x1", "x2", "x3","x17","x16" - ); - b0 += 8; - b0_s += 8; - } - } - - // ohow_reminder % 12 / 8 - I32 ohow_s = (ohow / 12) * 12; - I32 ohow_tail = ohow - ohow_s; - - if (ohow_tail >= 8) { - I32 hw = ohow_s; - F16 *b0 = biasArray; - I32 *b0_s = biasScaled; - INT8 *in_pack = ((INT8*)tmp) + ic*ih_pad*iw_pad*8; - // pack input - // NCHWc8 => NHWChw8c4 + im2col - U32 in_h[8]; - U32 in_w[8]; - - for (U32 i = 0; i < 8; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; - } - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - INT8 *in_hw8c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - INT8 *in_0 = in_hw8c8 + in_h[0]*iw_pad*8 + in_w[0]*8; - INT8 *in_1 = in_hw8c8 + in_h[1]*iw_pad*8 + in_w[1]*8; - INT8 *in_2 = in_hw8c8 + in_h[2]*iw_pad*8 + in_w[2]*8; - INT8 *in_3 = in_hw8c8 + in_h[3]*iw_pad*8 + in_w[3]*8; - INT8 *in_4 = in_hw8c8 + in_h[4]*iw_pad*8 + in_w[4]*8; - INT8 *in_5 = in_hw8c8 + in_h[5]*iw_pad*8 + in_w[5]*8; - INT8 *in_6 = in_hw8c8 + in_h[6]*iw_pad*8 + in_w[6]*8; - INT8 *in_7 = in_hw8c8 + in_h[7]*iw_pad*8 + in_w[7]*8; - INT8 *in_pack_0 = in_pack + c*fh*fw*8*8 + fh_idx*fw*8*4 + fw_idx*8*4; - INT8 *in_pack_1 = in_pack_0 + fh*fw*8*4; - - __asm__ __volatile__( - "ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "ldr d4, [%[in_4]]\n" - "ldr x6, [%[in_6]]\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - - "ldr d5, [%[in_5]]\n" - "ldr x7, [%[in_7]]\n" - "ins v4.d[1], x6\n" - "ins v5.d[1], x7\n" - - "str q20, [%[pack_0]]\n" - "trn1 v24.4s, v4.4s, v5.4s\n" - "trn2 v25.4s, v4.4s, v5.4s\n" - "str q21, [%[pack_1]]\n" - "str q24, [%[pack_0], #16]\n" - "str q25, [%[pack_1], #16]\n" - : - :[pack_0]"r"(in_pack_0), - [pack_1]"r"(in_pack_1), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3), - [in_4]"r"(in_4), - [in_5]"r"(in_5), - [in_6]"r"(in_6), - [in_7]"r"(in_7) - :"memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", "v24", "v25", "x2", "x3", "x6", "x7" - ); - } - } - } - - // compute - for (U32 o = 0; o < oc; o++) { - INT8 *in_hw0 = in_pack; - INT8 *f_o0c0 = filterArray + o*8*fh*fw*ic*8; - I32 *out_buf = biasScaled + oc*8 + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - F16 *b_0 = b0; - I32 *b_0_s = b0_s; - __asm__ __volatile__( - "cbz %[out_f16], 8f\n" - "eor v5.16b, v5.16b, v5.16b\n" - "ldr d1, [%[in_0]]\n" //in_0 - "eor v6.16b, v6.16b, v6.16b\n" - "ldr x1, [%[in_0], #8]\n" - "eor v7.16b, v7.16b, v7.16b\n" - "ins v1.d[1], x1\n" - "eor 
v8.16b, v8.16b, v8.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 - "eor v9.16b, v9.16b, v9.16b\n" - "ldr x2, [%[f_0], #8]\n" - "eor v10.16b, v10.16b, v10.16b\n" - "ins v0.d[1], x2\n" - "eor v11.16b, v11.16b, v11.16b\n" - "eor v12.16b, v12.16b, v12.16b\n" - "eor v13.16b, v13.16b, v13.16b\n" - "eor v14.16b, v14.16b, v14.16b\n" - "eor v15.16b, v15.16b, v15.16b\n" - "eor v16.16b, v16.16b, v16.16b\n" - "eor v17.16b, v17.16b, v17.16b\n" - "eor v18.16b, v18.16b, v18.16b\n" - "eor v19.16b, v19.16b, v19.16b\n" - "eor v20.16b, v20.16b, v20.16b\n" - "b 7f\n" - - "8:\n" - "ldp q29, q30, [%[b_0_s]]\n" - "mov v5.16b, v29.16b\n" - "ldr d1, [%[in_0]]\n" //in_0 - "mov v7.16b, v29.16b\n" - "ldr x1, [%[in_0], #8]\n" - "mov v9.16b, v29.16b\n" - "ins v1.d[1], x1\n" - "mov v11.16b, v29.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 - "mov v13.16b, v29.16b\n" - "ldr x2, [%[f_0], #8]\n" - "mov v15.16b, v29.16b\n" - "ins v0.d[1], x2\n" - "mov v17.16b, v29.16b\n" - "mov v19.16b, v29.16b\n" - - "mov v6.16b, v30.16b\n" - "mov v8.16b, v30.16b\n" - "mov v10.16b, v30.16b\n" - "mov v12.16b, v30.16b\n" - "mov v14.16b, v30.16b\n" - "mov v16.16b, v30.16b\n" - "mov v18.16b, v30.16b\n" - "mov v20.16b, v30.16b\n" - - "7:\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - "0:\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d3, [x3, 16]!\n" - "ldr x16, [x3, 8]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "ldr d29, [x0, 16]\n" - "ldr x17, [x0, 24]\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "ins v3.d[1], x16\n" - "ldr d30, [x3, 16]!\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - "ins v29.d[1], x17\n" - - "sdot v13.4s, v0.16b, v3.4b[0]\n" - "ldr x16, [x3, 8]\n" - "subs x2, x2, #4\n" - "sdot v15.4s, v0.16b, v3.4b[1]\n" - "sdot v17.4s, v0.16b, v3.4b[2]\n" - "ins v30.d[1], x16\n" - "sdot v19.4s, v0.16b, v3.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "ldr d0, [x0, 32]!\n" - "ldr x17, [x0, 8]\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - - "sdot v14.4s, v29.16b, v3.4b[0]\n" - "ins v0.d[1], x17\n" - "mov v1.16b, v30.16b\n" - "sdot v16.4s, v29.16b, v3.4b[1]\n" - "sdot v18.4s, v29.16b, v3.4b[2]\n" - "sdot v20.4s, v29.16b, v3.4b[3]\n" - - "bne 0b\n" - "cbz %[out_f16], 6f\n" - "scvtf v5.4s, v5.4s\n" - "scvtf v6.4s, v6.4s\n" - "ldr d1, [%[factor]]\n" - "ldr x1, [%[factor], #8]\n" - "scvtf v7.4s, v7.4s\n" - "scvtf v8.4s, v8.4s\n" - "ins v1.d[1], x1\n" - "scvtf v9.4s, v9.4s\n" - "ldr d0, [%[b_0]]\n" - "ldr x0, [%[b_0], #8]\n" - "scvtf v10.4s, v10.4s\n" - "scvtf v11.4s, v11.4s\n" - "ins v0.d[1], x0\n" - "scvtf v12.4s, v12.4s\n" - "scvtf v13.4s, v13.4s\n" - "scvtf v14.4s, v14.4s\n" - "scvtf v15.4s, v15.4s\n" - "scvtf v16.4s, v16.4s\n" - "scvtf v17.4s, v17.4s\n" - "scvtf v18.4s, v18.4s\n" - "scvtf v19.4s, v19.4s\n" - "scvtf v20.4s, v20.4s\n" - - "fmul v5.4s, v1.4s, v5.4s\n" - "fmul v6.4s, v1.4s, v6.4s\n" - "fmul v7.4s, v1.4s, v7.4s\n" - "fmul v8.4s, v1.4s, v8.4s\n" - "fmul v9.4s, v1.4s, v9.4s\n" - "fmul v10.4s, v1.4s, v10.4s\n" - "fmul v11.4s, v1.4s, v11.4s\n" - "fmul v12.4s, v1.4s, v12.4s\n" - "fmul v13.4s, v1.4s, v13.4s\n" - "fmul v14.4s, v1.4s, v14.4s\n" - "fmul v15.4s, v1.4s, v15.4s\n" - "fmul v16.4s, v1.4s, v16.4s\n" - "fmul v17.4s, v1.4s, v17.4s\n" - "fmul v18.4s, v1.4s, v18.4s\n" - "fmul v19.4s, v1.4s, v19.4s\n" - "fmul v20.4s, v1.4s, v20.4s\n" - - "fcvtn v5.4h, v5.4s\n" - "fcvtn v7.4h, v7.4s\n" - "fcvtn v9.4h, v9.4s\n" - "fcvtn v11.4h, v11.4s\n" - "fcvtn v13.4h, v13.4s\n" - "fcvtn v15.4h, v15.4s\n" - "fcvtn v17.4h, 
v17.4s\n" - "fcvtn v19.4h, v19.4s\n" - - "fcvtn2 v5.8h, v6.4s\n" - "fcvtn2 v7.8h, v8.4s\n" - "fcvtn2 v9.8h, v10.4s\n" - "fcvtn2 v11.8h, v12.4s\n" - "fcvtn2 v13.8h, v14.4s\n" - "fcvtn2 v15.8h, v16.4s\n" - "fcvtn2 v17.8h, v18.4s\n" - "fcvtn2 v19.8h, v20.4s\n" - - "fadd v5.8h, v0.8h, v5.8h\n" - "fadd v7.8h, v0.8h, v7.8h\n" - "fadd v9.8h, v0.8h, v9.8h\n" - "fadd v11.8h, v0.8h, v11.8h\n" - "fadd v13.8h, v0.8h, v13.8h\n" - "fadd v15.8h, v0.8h, v15.8h\n" - "fadd v17.8h, v0.8h, v17.8h\n" - "fadd v19.8h, v0.8h, v19.8h\n" - - "cbz %[conv_relu], 1f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v5.8h, v5.8h, v1.8h\n" - "fmax v7.8h, v7.8h, v1.8h\n" - "fmax v9.8h, v9.8h, v1.8h\n" - "fmax v11.8h, v11.8h, v1.8h\n" - "fmax v13.8h, v13.8h, v1.8h\n" - "fmax v15.8h, v15.8h, v1.8h\n" - "fmax v17.8h, v17.8h, v1.8h\n" - "fmax v19.8h, v19.8h, v1.8h\n" - - "1:\n" - "str q5, [%[out_0]]\n" - "str q7, [%[out_0], #16]\n" - "str q9, [%[out_0], #32]\n" - "str q11, [%[out_0], #48]\n" - "str q13, [%[out_0], #64]\n" - "str q15, [%[out_0], #80]\n" - "str q17, [%[out_0], #96]\n" - "str q19, [%[out_0], #112]\n" - "b 5f\n" - - "6:\n" - "ldr q0, [%[min]]\n" - "ldr q30, [%[max]]\n" - "cbz %[conv_relu], 2f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero - "smax v5.4s, v5.4s, v1.4s\n" - "smax v6.4s, v6.4s, v1.4s\n" - "smax v7.4s, v7.4s, v1.4s\n" - "smax v8.4s, v8.4s, v1.4s\n" - "smax v9.4s, v9.4s, v1.4s\n" - "smax v10.4s, v10.4s, v1.4s\n" - "smax v11.4s, v11.4s, v1.4s\n" - "smax v12.4s, v12.4s, v1.4s\n" - "smax v13.4s, v13.4s, v1.4s\n" - "smax v14.4s, v14.4s, v1.4s\n" - "smax v15.4s, v15.4s, v1.4s\n" - "smax v16.4s, v16.4s, v1.4s\n" - "smax v17.4s, v17.4s, v1.4s\n" - "smax v18.4s, v18.4s, v1.4s\n" - "smax v19.4s, v19.4s, v1.4s\n" - "smax v20.4s, v20.4s, v1.4s\n" - - "2:\n" - "cbz %[scale_known], 7f\n" - "smax v5.4s, v5.4s, v0.4s\n" - "smin v5.4s, v5.4s, v30.4s\n" - "smax v6.4s, v6.4s, v0.4s\n" - "smin v6.4s, v6.4s, v30.4s\n" - "smax v7.4s, v7.4s, v0.4s\n" - "smin v7.4s, v7.4s, v30.4s\n" - "smax v8.4s, v8.4s, v0.4s\n" - "smin v8.4s, v8.4s, v30.4s\n" - "smax v9.4s, v9.4s, v0.4s\n" - "smin v9.4s, v9.4s, v30.4s\n" - "smax v10.4s, v10.4s, v0.4s\n" - "smin v10.4s, v10.4s, v30.4s\n" - "smax v11.4s, v11.4s, v0.4s\n" - "smin v11.4s, v11.4s, v30.4s\n" - "smax v12.4s, v12.4s, v0.4s\n" - "smin v12.4s, v12.4s, v30.4s\n" - "smax v13.4s, v13.4s, v0.4s\n" - "smin v13.4s, v13.4s, v30.4s\n" - "smax v14.4s, v14.4s, v0.4s\n" - "smin v14.4s, v14.4s, v30.4s\n" - "smax v15.4s, v15.4s, v0.4s\n" - "smin v15.4s, v15.4s, v30.4s\n" - "smax v16.4s, v16.4s, v0.4s\n" - "smin v16.4s, v16.4s, v30.4s\n" - "smax v17.4s, v17.4s, v0.4s\n" - "smin v17.4s, v17.4s, v30.4s\n" - "smax v18.4s, v18.4s, v0.4s\n" - "smin v18.4s, v18.4s, v30.4s\n" - "smax v19.4s, v19.4s, v0.4s\n" - "smin v19.4s, v19.4s, v30.4s\n" - "smax v20.4s, v20.4s, v0.4s\n" - "smin v20.4s, v20.4s, v30.4s\n" - - "str q5, [%[out_buf]]\n" - "str q6, [%[out_buf], 16]\n" - "str q7, [%[out_buf], 32]\n" - "str q8, [%[out_buf], 48]\n" - "str q9, [%[out_buf], 64]\n" - "str q10, [%[out_buf], 80]\n" - "str q11, [%[out_buf], 96]\n" - "str q12, [%[out_buf], 112]\n" - "str q13, [%[out_buf], 128]\n" - "str q14, [%[out_buf], 144]\n" - "str q15, [%[out_buf], 160]\n" - "str q16, [%[out_buf], 176]\n" - "str q17, [%[out_buf], 192]\n" - "str q18, [%[out_buf], 208]\n" - "str q19, [%[out_buf], 224]\n" - "str q20, [%[out_buf], 240]\n" - "b 5f\n" - - "7:\n" - "smax v30.4s, v5.4s, v30.4s\n" - "smin v0.4s, v5.4s, v0.4s\n" - "str q5, [%[out_buf]]\n" - "smax v30.4s, v6.4s, v30.4s\n" - "smin v0.4s, v6.4s, v0.4s\n" - "str q6, 
[%[out_buf], 16]\n" - "smax v30.4s, v7.4s, v30.4s\n" - "smin v0.4s, v7.4s, v0.4s\n" - "str q7, [%[out_buf], 32]\n" - "smax v30.4s, v8.4s, v30.4s\n" - "smin v0.4s, v8.4s, v0.4s\n" - "str q8, [%[out_buf], 48]\n" - "smax v30.4s, v9.4s, v30.4s\n" - "smin v0.4s, v9.4s, v0.4s\n" - "str q9, [%[out_buf], 64]\n" - "smax v30.4s, v10.4s, v30.4s\n" - "smin v0.4s, v10.4s, v0.4s\n" - "str q10, [%[out_buf], 80]\n" - "smax v30.4s, v11.4s, v30.4s\n" - "smin v0.4s, v11.4s, v0.4s\n" - "str q11, [%[out_buf], 96]\n" - "smax v30.4s, v12.4s, v30.4s\n" - "smin v0.4s, v12.4s, v0.4s\n" - "str q12, [%[out_buf], 112]\n" - "smax v30.4s, v13.4s, v30.4s\n" - "smin v0.4s, v13.4s, v0.4s\n" - "str q13, [%[out_buf], 128]\n" - - "smax v30.4s, v14.4s, v30.4s\n" - "smin v0.4s, v14.4s, v0.4s\n" - "str q14, [%[out_buf], 144]\n" - "smax v30.4s, v15.4s, v30.4s\n" - "smin v0.4s, v15.4s, v0.4s\n" - "str q15, [%[out_buf], 160]\n" - "smax v30.4s, v16.4s, v30.4s\n" - "smin v0.4s, v16.4s, v0.4s\n" - "str q16, [%[out_buf], 176]\n" - "smax v30.4s, v17.4s, v30.4s\n" - "smin v0.4s, v17.4s, v0.4s\n" - "str q17, [%[out_buf], 192]\n" - "smax v30.4s, v18.4s, v30.4s\n" - "smin v0.4s, v18.4s, v0.4s\n" - "str q18, [%[out_buf], 208]\n" - "smax v30.4s, v19.4s, v30.4s\n" - "smin v0.4s, v19.4s, v0.4s\n" - "str q19, [%[out_buf], 224]\n" - "smax v30.4s, v20.4s, v30.4s\n" - "smin v0.4s, v20.4s, v0.4s\n" - "str q20, [%[out_buf], 240]\n" - - "str q30, [%[max]]\n" - "str q0, [%[min]]\n" - "5:\n" - : - :[out_0]"r"(out_o0hw0), - [out_buf]"r"(out_buf), - [in_0]"r"(in_hw0), - [f_0]"r"(f_o0c0), - [ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_0), - [b_0_s]"r"(b_0_s), - [factor]"r"(factor_v), - [max]"r"(max_i32), - [min]"r"(min_i32), - [conv_relu]"r"(conv_relu_bool), - [out_f16]"r"(out_f16_bool), - [scale_known]"r"(scale_known_bool) - :"memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v29", "v30", "x0", "x1", "x2", "x3","x17","x16" - ); - b0 += 8; - b0_s += 8; - } - ohow_s += 8; - ohow_tail -= 8; - } - - if (ohow_tail >= 4) { - I32 hw = ohow_s; - F16 *b0 = biasArray; - I32 *b0_s = biasScaled; - INT8 *in_pack = ((INT8*)tmp) + ic*ih_pad*iw_pad*8; - // pack input - // NCHWc8 => NHWChw4c4 + im2col - U32 in_h[4]; - U32 in_w[4]; - - for (U32 i=0; i<4; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; - } - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - INT8 *in_hw4c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - INT8 *in_0 = in_hw4c8 + in_h[0]*iw_pad*8 + in_w[0]*8; - INT8 *in_1 = in_hw4c8 + in_h[1]*iw_pad*8 + in_w[1]*8; - INT8 *in_2 = in_hw4c8 + in_h[2]*iw_pad*8 + in_w[2]*8; - INT8 *in_3 = in_hw4c8 + in_h[3]*iw_pad*8 + in_w[3]*8; - INT8 *in_pack_0 = in_pack + c*fh*fw*4*8 + fh_idx*fw*4*4 + fw_idx*4*4; - INT8 *in_pack_1 = in_pack_0 + fh*fw*4*4; - - __asm__ __volatile__( - "ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - "str q20, [%[pack_0]]\n" - "str q21, [%[pack_1]]\n" - : - :[pack_0]"r"(in_pack_0), - [pack_1]"r"(in_pack_1), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3) - :"memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3" - ); - } - } - } - - // compute - for (U32 o = 0; o < oc; o++) { - INT8 *in_hw0 = in_pack; - INT8 *f_o0c0 = filterArray + o*8*fh*fw*ic*8; - I32 *out_buf = 
biasScaled + oc*8 + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - F16 *b_0 = b0; - I32 *b_0_s = b0_s; - __asm__ __volatile__( - "cbz %[out_f16], 8f\n" - "eor v5.16b, v5.16b, v5.16b\n" - "ldr d1, [%[in_0]]\n" //in_0 - "eor v6.16b, v6.16b, v6.16b\n" - "ldr x1, [%[in_0], #8]\n" - "eor v7.16b, v7.16b, v7.16b\n" - "ins v1.d[1], x1\n" - "eor v8.16b, v8.16b, v8.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 - - "eor v9.16b, v9.16b, v9.16b\n" - "ldr x2, [%[f_0], #8]\n" - "eor v10.16b, v10.16b, v10.16b\n" - "ins v0.d[1], x2\n" - "eor v11.16b, v11.16b, v11.16b\n" - "eor v12.16b, v12.16b, v12.16b\n" - "b 7f\n" - - "8:\n" - "ldp q29, q30, [%[b_0_s]]\n" - "ldr d1, [%[in_0]]\n" //in_0 - "mov v5.16b, v29.16b\n" - "ldr x1, [%[in_0], #8]\n" - "mov v7.16b, v29.16b\n" - "ins v1.d[1], x1\n" - "mov v9.16b, v29.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 - "mov v11.16b, v29.16b\n" - "ldr x2, [%[f_0], #8]\n" - - "mov v6.16b, v30.16b\n" - "ins v0.d[1], x2\n" - "mov v8.16b, v30.16b\n" - "mov v10.16b, v30.16b\n" - "mov v12.16b, v30.16b\n" - - "7:\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - "0:\n" - "ldr d29, [x0, 16]\n" - "ldr x17, [x0, 24]\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d3, [x3, 16]!\n" - "ldr x16, [x3, 8]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "ins v29.d[1], x17\n" - "subs x2, x2, #4\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "ins v3.d[1], x16\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "ldr d0, [x0, 32]!\n" - "ldr x17, [x0, 8]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "ins v0.d[1], x17\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - "mov v1.16b, v3.16b\n" - - "bne 0b\n" - "cbz %[out_f16], 6f\n" - - "scvtf v5.4s, v5.4s\n" - "scvtf v6.4s, v6.4s\n" - "ldr d1, [%[factor]]\n" - "ldr x1, [%[factor], #8]\n" - "scvtf v7.4s, v7.4s\n" - "scvtf v8.4s, v8.4s\n" - "ins v1.d[1], x1\n" - "scvtf v9.4s, v9.4s\n" - "ldr d0, [%[b_0]]\n" - "ldr x0, [%[b_0], #8]\n" - "scvtf v10.4s, v10.4s\n" - "scvtf v11.4s, v11.4s\n" - "ins v0.d[1], x0\n" - "scvtf v12.4s, v12.4s\n" - - "fmul v5.4s, v1.4s, v5.4s\n" - "fmul v6.4s, v1.4s, v6.4s\n" - "fmul v7.4s, v1.4s, v7.4s\n" - "fmul v8.4s, v1.4s, v8.4s\n" - "fmul v9.4s, v1.4s, v9.4s\n" - "fmul v10.4s, v1.4s, v10.4s\n" - "fmul v11.4s, v1.4s, v11.4s\n" - "fmul v12.4s, v1.4s, v12.4s\n" - - "fcvtn v5.4h, v5.4s\n" - "fcvtn v7.4h, v7.4s\n" - "fcvtn v9.4h, v9.4s\n" - "fcvtn v11.4h, v11.4s\n" - - "fcvtn2 v5.8h, v6.4s\n" - "fcvtn2 v7.8h, v8.4s\n" - "fcvtn2 v9.8h, v10.4s\n" - "fcvtn2 v11.8h, v12.4s\n" - - "fadd v5.8h, v0.8h, v5.8h\n" - "fadd v7.8h, v0.8h, v7.8h\n" - "fadd v9.8h, v0.8h, v9.8h\n" - "fadd v11.8h, v0.8h, v11.8h\n" - - "cbz %[conv_relu], 1f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v5.8h, v5.8h, v1.8h\n" - "fmax v7.8h, v7.8h, v1.8h\n" - "fmax v9.8h, v9.8h, v1.8h\n" - "fmax v11.8h, v11.8h, v1.8h\n" - - "1:\n" - "str q5, [%[out_0]]\n" - "str q7, [%[out_0], #16]\n" - "str q9, [%[out_0], #32]\n" - "str q11, [%[out_0], #48]\n" - "b 5f\n" - - "6:\n" - "ldr q0, [%[min]]\n" - "ldr q30, [%[max]]\n" - "cbz %[conv_relu], 2f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero - "smax v5.4s, v5.4s, v1.4s\n" - "smax v6.4s, v6.4s, v1.4s\n" - "smax v7.4s, v7.4s, v1.4s\n" - "smax v8.4s, v8.4s, v1.4s\n" - "smax v9.4s, v9.4s, v1.4s\n" - "smax v10.4s, v10.4s, v1.4s\n" - "smax v11.4s, v11.4s, v1.4s\n" - "smax v12.4s, v12.4s, v1.4s\n" - - "2:\n" - "cbz %[scale_known], 7f\n" - "smax v5.4s, v5.4s, v0.4s\n" - "smin 
v5.4s, v5.4s, v30.4s\n" - "smax v6.4s, v6.4s, v0.4s\n" - "smin v6.4s, v6.4s, v30.4s\n" - "smax v7.4s, v7.4s, v0.4s\n" - "smin v7.4s, v7.4s, v30.4s\n" - "smax v8.4s, v8.4s, v0.4s\n" - "smin v8.4s, v8.4s, v30.4s\n" - "smax v9.4s, v9.4s, v0.4s\n" - "smin v9.4s, v9.4s, v30.4s\n" - "smax v10.4s, v10.4s, v0.4s\n" - "smin v10.4s, v10.4s, v30.4s\n" - "smax v11.4s, v11.4s, v0.4s\n" - "smin v11.4s, v11.4s, v30.4s\n" - "smax v12.4s, v12.4s, v0.4s\n" - "smin v12.4s, v12.4s, v30.4s\n" - - "str q5, [%[out_buf]]\n" - "str q6, [%[out_buf], 16]\n" - "str q7, [%[out_buf], 32]\n" - "str q8, [%[out_buf], 48]\n" - "str q9, [%[out_buf], 64]\n" - "str q10, [%[out_buf], 80]\n" - "str q11, [%[out_buf], 96]\n" - "str q12, [%[out_buf], 112]\n" - "b 5f\n" - - "7:\n" - "smax v30.4s, v5.4s, v30.4s\n" - "smin v0.4s, v5.4s, v0.4s\n" - "str q5, [%[out_buf]]\n" - "smax v30.4s, v6.4s, v30.4s\n" - "smin v0.4s, v6.4s, v0.4s\n" - "str q6, [%[out_buf], 16]\n" - "smax v30.4s, v7.4s, v30.4s\n" - "smin v0.4s, v7.4s, v0.4s\n" - "str q7, [%[out_buf], 32]\n" - "smax v30.4s, v8.4s, v30.4s\n" - "smin v0.4s, v8.4s, v0.4s\n" - "str q8, [%[out_buf], 48]\n" - "smax v30.4s, v9.4s, v30.4s\n" - "smin v0.4s, v9.4s, v0.4s\n" - "str q9, [%[out_buf], 64]\n" - "smax v30.4s, v10.4s, v30.4s\n" - "smin v0.4s, v10.4s, v0.4s\n" - "str q10, [%[out_buf], 80]\n" - "smax v30.4s, v11.4s, v30.4s\n" - "smin v0.4s, v11.4s, v0.4s\n" - "str q11, [%[out_buf], 96]\n" - "smax v30.4s, v12.4s, v30.4s\n" - "smin v0.4s, v12.4s, v0.4s\n" - "str q12, [%[out_buf], 112]\n" - - "str q30, [%[max]]\n" - "str q0, [%[min]]\n" - "5:\n" - : - :[out_0]"r"(out_o0hw0), - [out_buf]"r"(out_buf), - [in_0]"r"(in_hw0), - [f_0]"r"(f_o0c0), - [ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_0), - [b_0_s]"r"(b_0_s), - [factor]"r"(factor_v), - [max]"r"(max_i32), - [min]"r"(min_i32), - [conv_relu]"r"(conv_relu_bool), - [out_f16]"r"(out_f16_bool), - [scale_known]"r"(scale_known_bool) - :"memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v29", "x0", "x1", "x2", "x3","x17","x16" - ); - b0 += 8; - b0_s += 8; - } - ohow_s += 4; - } - - for (I32 hw = ohow_s; hw < ohow; hw++) { - F16 *b0 = biasArray; - I32 *b0_s = biasScaled; - INT8 *in_pack = ((INT8*)tmp) + ic*ih_pad*iw_pad*8; - // pack input - // NCHWc8 => NHWChw1c4 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - INT8 *in_hw1c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - INT8 *in_0 = in_hw1c8 + in_h_0*iw_pad*8 + in_w_0*8; - INT8 *in_pack_0 = in_pack + c*fh*fw*8 + fh_idx*fw*4 + fw_idx*4; - INT8 *in_pack_1 = in_pack_0 + fh*fw*4; - - memcpy(in_pack_0, in_0, 4*bytesOf(DT_I8)); - memcpy(in_pack_1, in_0+4, 4*bytesOf(DT_I8)); - } - } - } - - // compute - for (U32 o = 0; o < oc; o++) { - INT8 *in_hw = in_pack; - INT8 *f_o = filterArray + o*8*fh*fw*ic*8; - I32 *out_buf = biasScaled + oc*8 + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - - int32x4_t res[2] = {0}; - if (out_f16_bool == 0) { - res[0] = vld1q_s32(b0_s); - res[1] = vld1q_s32(b0_s + 4); - } - - for(U32 c=0; c min_i32[i]) { - min = min_i32[i]; - } - } - - if (max == 0 && min == 0) { - return NOT_SUPPORTED; - } - - if (max > 0 && min < 0) { - I32 factor_max = 127 * 16777216 / max; - I32 factor_min = -127 * 16777216 / min; - factor = (factor_max < factor_min) ? factor_max : factor_min; - scale_o = (factor_max < factor_min) ? 
(127.0/max) : (-127.0/min); - } else if (max > 0) { - factor = 127 * 16777216 / max; - scale_o = 127.0 / max; - } else { - factor = -127 * 16777216 / min; - scale_o = -127.0 / min; - } - *outputScale = (*inputScale) * (*filterScale) * scale_o; - } - - U32 num_v = oc * ohow * 2; // Number of q-form vectors - I32 *out_buf = biasScaled + oc*8; - INT8 *out_q = (INT8*)output; - - ret = quantize_I32(num_v, out_buf, factor, scale_o, out_q); - } - return ret; -} - -template EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale, TensorDesc filterDesc, const void* filter, F16* filterScale, - ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc, - void* output, F16* outputScale, ActivationDesc activationDesc); - -template EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale, TensorDesc filterDesc, const void* filter, F16* filterScale, - ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc, - void* output, F16* outputScale, ActivationDesc activationDesc); -#endif diff --git a/tensor_computing/src/cpu/arm/int8/convolution_gemm_A76.cpp b/tensor_computing/src/cpu/arm/int8/convolution_gemm_A76.cpp deleted file mode 100644 index b022870e..00000000 --- a/tensor_computing/src/cpu/arm/int8/convolution_gemm_A76.cpp +++ /dev/null @@ -1,1559 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
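Both GEMM kernels in this hunk (the A55 variant that ends above and the A76 variant that begins below) are built around the Armv8.2 SDOT instruction. As a reading aid, here is a scalar model of what a single "sdot vD.4s, vA.16b, vB.4b[lane]" in those kernels computes; the function and names are illustrative only, not part of the deleted source.

#include <cstdint>

// Scalar model of SDOT: each of the four int32 lanes of d accumulates a
// 4-way int8 dot product between 4 consecutive bytes of a and the lane-th
// group of 4 bytes of b.
void sdot_4s_16b_4b(int32_t d[4], const int8_t a[16], const int8_t b[16], int lane)
{
    for (int out = 0; out < 4; out++) {        // 4 int32 accumulator lanes
        for (int k = 0; k < 4; k++) {          // 4 int8 products per lane
            d[out] += (int32_t)a[out * 4 + k] * (int32_t)b[lane * 4 + k];
        }
    }
}

This is why the im2col packing interleaves groups of four input channels (the c4 in NHWChw12c4): v1.4b[i] carries four input channels of pixel i, while each 16-byte filter register carries the matching four input channels for four output channels.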
- - -#ifdef _USE_INT8 -#include <arm_neon.h> -#include "cpu/arm/int8/convolution_gemm.h" - -template<typename OT> -EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale, TensorDesc filterDesc, const void* filter, F16* filterScale, - ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc, - void* output, F16* outputScale, ActivationDesc activationDesc) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - // still im2col + gemm with a smaller buffer - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (fdf != DF_NCHWN8C4) { - return NOT_MATCH; - } - - I64 conv_relu_bool = (activationDesc.mode == ACTIVATION_RELU) ? 1 : 0; - I64 out_f16_bool = (odt == DT_F16) ? 1 : 0; - I64 scale_known_bool = 0; - if (*outputScale > 0 || ACTIVATION_RELU6 == activationDesc.mode) { - scale_known_bool = 1; - } - - INT8* inArray = (INT8*)input; // It will be updated if there is quantization - INT8* filterArray = (INT8*)filter; - F16* outArray = (F16*)output; - F16* biasArray = (F16*)bias; - INT8* in_pad = (INT8*)tmp; - - // both input and output are stored with C8 - oc /= 8; - ic /= 8; - - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - I32 ohow = oh*ow; - U32 ihiw = ih_pad*iw_pad; - - I32* biasScaled = (I32*)(in_pad + ic*ihiw*8 + 12*fh*fw*ic*8); // Initialize - - //double start, end; - I32 max_i32[4] = {0}; // To record max I32 values - I32 min_i32[4] = {0}; // To record min I32 values - - for (U32 n = 0; n < in; n++) {// for each batch - F16 scale_i = 1.0; - - // quantize input if necessary - if (idt == DT_F16) { - //start = get_current_time_int8(); - F16* in = ((F16*)input) + n*ic*ih*iw*8; - inArray = in_pad + ic*ihiw*8 + 12*fh*fw*ic*8; // After the space for padding and packing - - U32 numData = ic*ih*iw*8; - if (*inputScale > 0) { - scale_i = *inputScale; - } else { - float16x8_t temp_v = vld1q_f16(in); - float16x8_t max_v = temp_v; - float16x8_t min_v = temp_v; - - for (U32 i=8; i<numData; i+=8) { - temp_v = vld1q_f16(in+i); - max_v = vmaxq_f16(max_v, temp_v); - min_v = vminq_f16(min_v, temp_v); - } - - F16 max = vmaxvq_f16(max_v); - F16 min = vminvq_f16(min_v); - - if (max == 0 && min == 0) { - return NOT_SUPPORTED; - } - if (max > 0 && min < 0) { - F16 scale_max = 127.0 / max; - F16 scale_min = -127.0 / min; - scale_i = (scale_max < scale_min) ?
scale_max : scale_min; - } else if (max < 0) { - scale_i = -127.0 / min; - } else { // min > 0 - scale_i = 127.0 / max; - } - } - for (U32 i = 0; i < numData; i++) { - F32 temp = in[i] * scale_i; - if (temp > 127) { - inArray[i] = 127; - } else if (temp < -127) { - inArray[i] = -127; - } else { - inArray[i] = temp; - } - } - *inputScale = scale_i; - } else { - scale_i = *inputScale; - } - - if (1 == scale_known_bool) { - if (ACTIVATION_RELU6 == activationDesc.mode) { - *outputScale = 127.0 / 6.0; - } - F32 scaleInt = (*outputScale / *inputScale) / *filterScale; - I32 thresholdP = 127.0 / scaleInt; - I32 thresholdN = 0; - if (ACTIVATION_RELU6 != activationDesc.mode) { - thresholdN = thresholdP * -1; - } - - for (U32 i = 0; i < 4; i++) { - max_i32[i] = thresholdP; - min_i32[i] = thresholdN; - } - } - - if (odt == DT_I8) { // Scale the bias - if (idt == DT_F16) { - biasScaled += ic * ih * iw * 8 / bytesOf(DT_I32); // After the quantized input - } - F32 scale = (*inputScale) * (*filterScale); - for (U32 i=0; i NHWChw12c4 + im2col - U32 in_h[12]; - U32 in_w[12]; - - for (U32 i = 0; i < 12; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; - } - for (U32 c = 0; c < ic; c++) {// for each 8 channels - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - INT8 *in_hw12c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - - INT8 *in_0 = in_hw12c8 + in_h[0]*iw_pad*8 + in_w[0]*8; - INT8 *in_1 = in_hw12c8 + in_h[1]*iw_pad*8 + in_w[1]*8; - INT8 *in_2 = in_hw12c8 + in_h[2]*iw_pad*8 + in_w[2]*8; - INT8 *in_3 = in_hw12c8 + in_h[3]*iw_pad*8 + in_w[3]*8; - INT8 *in_4 = in_hw12c8 + in_h[4]*iw_pad*8 + in_w[4]*8; - INT8 *in_5 = in_hw12c8 + in_h[5]*iw_pad*8 + in_w[5]*8; - INT8 *in_6 = in_hw12c8 + in_h[6]*iw_pad*8 + in_w[6]*8; - INT8 *in_7 = in_hw12c8 + in_h[7]*iw_pad*8 + in_w[7]*8; - INT8 *in_8 = in_hw12c8 + in_h[8]*iw_pad*8 + in_w[8]*8; - INT8 *in_9 = in_hw12c8 + in_h[9]*iw_pad*8 + in_w[9]*8; - INT8 *in_10 = in_hw12c8 + in_h[10]*iw_pad*8 + in_w[10]*8; - INT8 *in_11 = in_hw12c8 + in_h[11]*iw_pad*8 + in_w[11]*8; - - // in_pack (tmp) is reused for each tile - // NHWChw12c4 - INT8 *in_pack_0 = in_pack + c*fh*fw*12*8 + fh_idx*fw*12*4 + fw_idx*12*4; - INT8 *in_pack_1 = in_pack_0 + fh*fw*12*4; - - __asm__ __volatile__( - "ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - - "ldr d4, [%[in_4]]\n" - "ldr x6, [%[in_6]]\n" - - "ldr d5, [%[in_5]]\n" - "ins v0.d[1], x2\n" - - "ldr x7, [%[in_7]]\n" - "ins v1.d[1], x3\n" - - "ldr d8, [%[in_8]]\n" - "ins v4.d[1], x6\n" - - "trn1 v20.4s, v0.4s, v1.4s\n" - "ins v5.d[1], x7\n" - - "trn2 v21.4s, v0.4s, v1.4s\n" - "ldr x10, [%[in_10]]\n" - - "ldr d9, [%[in_9]]\n" - "trn1 v24.4s, v4.4s, v5.4s\n" - - "trn2 v25.4s, v4.4s, v5.4s\n" - "ldr x11, [%[in_11]]\n" - - "str q20, [%[pack_0]]\n" - "ins v8.d[1], x10\n" - - "str q24, [%[pack_0], #16]\n" - "ins v9.d[1], x11\n" - - "trn1 v28.4s, v8.4s, v9.4s\n" - "str q21, [%[pack_1]]\n" - - "trn2 v29.4s, v8.4s, v9.4s\n" - "str q25, [%[pack_1], #16]\n" - - "str q28, [%[pack_0], #32]\n" - "str q29, [%[pack_1], #32]\n" - : - :[pack_0]"r"(in_pack_0), - [pack_1]"r"(in_pack_1), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3), - [in_4]"r"(in_4), - [in_5]"r"(in_5), - [in_6]"r"(in_6), - [in_7]"r"(in_7), - [in_8]"r"(in_8), - [in_9]"r"(in_9), - [in_10]"r"(in_10), - [in_11]"r"(in_11) - :"memory", "cc", "v0", "v1", "v4", "v5", "v8", "v9", "v20", "v21", "v24", "v25", "v28", "v29", "x2", "x3", "x6", "x7", 
"x10", "x11" - ); - } - } - } - - // compute - for (U32 o = 0; o < oc; o++) {// 8 output channels at a time - INT8 *in_hw0 = in_pack; - INT8 *f_o0c0 = filterArray + o*8*fh*fw*ic*8; - I32 *out_buf = biasScaled + oc*8 + n*oc*ohow*8 + o*ohow*8 + hw*8;; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - F16 *b_0 = b0; - I32 *b_0_s = b0_s; - __asm__ __volatile__( - "cbz %[out_f16], 8f\n" - "eor v5.16b, v5.16b, v5.16b\n" - "ldr q1, [%[in_0]]\n" //in_0 - "eor v6.16b, v6.16b, v6.16b\n" - "eor v7.16b, v7.16b, v7.16b\n" - "eor v8.16b, v8.16b, v8.16b\n" - "ldr q0, [%[f_0]]\n" //f_0 - "eor v9.16b, v9.16b, v9.16b\n" - "eor v10.16b, v10.16b, v10.16b\n" - "eor v11.16b, v11.16b, v11.16b\n" - "ldr q3, [%[in_0], #16]\n" //in_1 - "eor v12.16b, v12.16b, v12.16b\n" - "eor v13.16b, v13.16b, v13.16b\n" - "eor v14.16b, v14.16b, v14.16b\n" - "eor v15.16b, v15.16b, v15.16b\n" - "eor v16.16b, v16.16b, v16.16b\n" - - "eor v17.16b, v17.16b, v17.16b\n" - "eor v18.16b, v18.16b, v18.16b\n" - "eor v19.16b, v19.16b, v19.16b\n" - "eor v20.16b, v20.16b, v20.16b\n" - "eor v21.16b, v21.16b, v21.16b\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - "eor v25.16b, v25.16b, v25.16b\n" - "eor v26.16b, v26.16b, v26.16b\n" - "eor v27.16b, v27.16b, v27.16b\n" - "eor v28.16b, v28.16b, v28.16b\n" - "b 7f\n" - - "8:\n" - "ldp q29, q30, [%[b_0_s]]\n" - "mov v5.16b, v29.16b\n" - "ldr q1, [%[in_0]]\n" //in_0 - "mov v7.16b, v29.16b\n" - "mov v9.16b, v29.16b\n" - "mov v11.16b, v29.16b\n" - "ldr q0, [%[f_0]]\n" //f_0 - "mov v13.16b, v29.16b\n" - "mov v15.16b, v29.16b\n" - "mov v17.16b, v29.16b\n" - "ldr q3, [%[in_0], #16]\n" //in_1 - "mov v19.16b, v29.16b\n" - "mov v21.16b, v29.16b\n" - "mov v23.16b, v29.16b\n" - "mov v25.16b, v29.16b\n" - "mov v27.16b, v29.16b\n" - - "mov v6.16b, v30.16b\n" - "mov v8.16b, v30.16b\n" - "mov v10.16b, v30.16b\n" - "mov v12.16b, v30.16b\n" - "mov v14.16b, v30.16b\n" - "mov v16.16b, v30.16b\n" - "mov v18.16b, v30.16b\n" - "mov v20.16b, v30.16b\n" - "mov v22.16b, v30.16b\n" - "mov v24.16b, v30.16b\n" - "mov v26.16b, v30.16b\n" - "mov v28.16b, v30.16b\n" - - "7:\n" - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - "0:\n" - "ldr q2, [x3, 32]\n" - "ldr q29, [x0, 16]\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - - "sdot v13.4s, v0.16b, v3.4b[0]\n" - "sdot v15.4s, v0.16b, v3.4b[1]\n" - "sdot v17.4s, v0.16b, v3.4b[2]\n" - "sdot v19.4s, v0.16b, v3.4b[3]\n" - - "sdot v21.4s, v0.16b, v2.4b[0]\n" - "sdot v23.4s, v0.16b, v2.4b[1]\n" - "sdot v25.4s, v0.16b, v2.4b[2]\n" - "sdot v27.4s, v0.16b, v2.4b[3]\n" - - "sdot v14.4s, v29.16b, v3.4b[0]\n" - "sdot v16.4s, v29.16b, v3.4b[1]\n" - "ldr q0, [x0, 32]!\n" - "subs x2, x2, #4\n" - "sdot v18.4s, v29.16b, v3.4b[2]\n" - "sdot v20.4s, v29.16b, v3.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - - "ldr q1, [x3, 48]!\n" - "ldr q3, [x3, 16]\n" - "sdot v22.4s, v29.16b, v2.4b[0]\n" - "sdot v24.4s, v29.16b, v2.4b[1]\n" - "sdot v26.4s, v29.16b, v2.4b[2]\n" - "sdot v28.4s, v29.16b, v2.4b[3]\n" - - "bne 0b\n" - "cbz %[out_f16], 6f\n" - - "scvtf v5.4s, v5.4s\n" - "scvtf v6.4s, v6.4s\n" - "ldr q1, [%[factor]]\n" - "scvtf v7.4s, v7.4s\n" - "scvtf v8.4s, v8.4s\n" - "scvtf v9.4s, v9.4s\n" - "ldr q0, [%[b_0]]\n" - "scvtf v10.4s, 
v10.4s\n" - "scvtf v11.4s, v11.4s\n" - "scvtf v12.4s, v12.4s\n" - "scvtf v13.4s, v13.4s\n" - "scvtf v14.4s, v14.4s\n" - "scvtf v15.4s, v15.4s\n" - "scvtf v16.4s, v16.4s\n" - "scvtf v17.4s, v17.4s\n" - "scvtf v18.4s, v18.4s\n" - "scvtf v19.4s, v19.4s\n" - "scvtf v20.4s, v20.4s\n" - "scvtf v21.4s, v21.4s\n" - "scvtf v22.4s, v22.4s\n" - "scvtf v23.4s, v23.4s\n" - "scvtf v24.4s, v24.4s\n" - "scvtf v25.4s, v25.4s\n" - "scvtf v26.4s, v26.4s\n" - "scvtf v27.4s, v27.4s\n" - "scvtf v28.4s, v28.4s\n" - - "fmul v5.4s, v1.4s, v5.4s\n" - "fmul v6.4s, v1.4s, v6.4s\n" - "fmul v7.4s, v1.4s, v7.4s\n" - "fmul v8.4s, v1.4s, v8.4s\n" - "fmul v9.4s, v1.4s, v9.4s\n" - "fmul v10.4s, v1.4s, v10.4s\n" - "fmul v11.4s, v1.4s, v11.4s\n" - "fmul v12.4s, v1.4s, v12.4s\n" - "fmul v13.4s, v1.4s, v13.4s\n" - "fmul v14.4s, v1.4s, v14.4s\n" - "fmul v15.4s, v1.4s, v15.4s\n" - "fmul v16.4s, v1.4s, v16.4s\n" - "fmul v17.4s, v1.4s, v17.4s\n" - "fmul v18.4s, v1.4s, v18.4s\n" - "fmul v19.4s, v1.4s, v19.4s\n" - "fmul v20.4s, v1.4s, v20.4s\n" - "fmul v21.4s, v1.4s, v21.4s\n" - "fmul v22.4s, v1.4s, v22.4s\n" - "fmul v23.4s, v1.4s, v23.4s\n" - "fmul v24.4s, v1.4s, v24.4s\n" - "fmul v25.4s, v1.4s, v25.4s\n" - "fmul v26.4s, v1.4s, v26.4s\n" - "fmul v27.4s, v1.4s, v27.4s\n" - "fmul v28.4s, v1.4s, v28.4s\n" - - "fcvtn v5.4h, v5.4s\n" - "fcvtn v7.4h, v7.4s\n" - "fcvtn v9.4h, v9.4s\n" - "fcvtn v11.4h, v11.4s\n" - "fcvtn v13.4h, v13.4s\n" - "fcvtn v15.4h, v15.4s\n" - "fcvtn v17.4h, v17.4s\n" - "fcvtn v19.4h, v19.4s\n" - "fcvtn v21.4h, v21.4s\n" - "fcvtn v23.4h, v23.4s\n" - "fcvtn v25.4h, v25.4s\n" - "fcvtn v27.4h, v27.4s\n" - - "fcvtn2 v5.8h, v6.4s\n" - "fcvtn2 v7.8h, v8.4s\n" - "fcvtn2 v9.8h, v10.4s\n" - "fcvtn2 v11.8h, v12.4s\n" - "fcvtn2 v13.8h, v14.4s\n" - "fcvtn2 v15.8h, v16.4s\n" - "fcvtn2 v17.8h, v18.4s\n" - "fcvtn2 v19.8h, v20.4s\n" - "fcvtn2 v21.8h, v22.4s\n" - "fcvtn2 v23.8h, v24.4s\n" - "fcvtn2 v25.8h, v26.4s\n" - "fcvtn2 v27.8h, v28.4s\n" - - "fadd v5.8h, v0.8h, v5.8h\n" - "fadd v7.8h, v0.8h, v7.8h\n" - "fadd v9.8h, v0.8h, v9.8h\n" - "fadd v11.8h, v0.8h, v11.8h\n" - "fadd v13.8h, v0.8h, v13.8h\n" - "fadd v15.8h, v0.8h, v15.8h\n" - "fadd v17.8h, v0.8h, v17.8h\n" - "fadd v19.8h, v0.8h, v19.8h\n" - "fadd v21.8h, v0.8h, v21.8h\n" - "fadd v23.8h, v0.8h, v23.8h\n" - "fadd v25.8h, v0.8h, v25.8h\n" - "fadd v27.8h, v0.8h, v27.8h\n" - - "cbz %[conv_relu], 1f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v5.8h, v5.8h, v1.8h\n" - "fmax v7.8h, v7.8h, v1.8h\n" - "fmax v9.8h, v9.8h, v1.8h\n" - "fmax v11.8h, v11.8h, v1.8h\n" - "fmax v13.8h, v13.8h, v1.8h\n" - "fmax v15.8h, v15.8h, v1.8h\n" - "fmax v17.8h, v17.8h, v1.8h\n" - "fmax v19.8h, v19.8h, v1.8h\n" - "fmax v21.8h, v21.8h, v1.8h\n" - "fmax v23.8h, v23.8h, v1.8h\n" - "fmax v25.8h, v25.8h, v1.8h\n" - "fmax v27.8h, v27.8h, v1.8h\n" - - "1:\n" - "str q5, [%[out_0]]\n" - "str q7, [%[out_0], #16]\n" - "str q9, [%[out_0], #32]\n" - "str q11, [%[out_0], #48]\n" - "str q13, [%[out_0], #64]\n" - "str q15, [%[out_0], #80]\n" - "str q17, [%[out_0], #96]\n" - "str q19, [%[out_0], #112]\n" - "str q21, [%[out_0], #128]\n" - "str q23, [%[out_0], #144]\n" - "str q25, [%[out_0], #160]\n" - "str q27, [%[out_0], #176]\n" - "b 5f\n" - - "6:\n" - "ldr q0, [%[min]]\n" - "ldr q30, [%[max]]\n" - "cbz %[conv_relu], 2f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero - "smax v5.4s, v5.4s, v1.4s\n" - "smax v6.4s, v6.4s, v1.4s\n" - "smax v7.4s, v7.4s, v1.4s\n" - "smax v8.4s, v8.4s, v1.4s\n" - "smax v9.4s, v9.4s, v1.4s\n" - "smax v10.4s, v10.4s, v1.4s\n" - "smax v11.4s, v11.4s, v1.4s\n" - "smax v12.4s, v12.4s, v1.4s\n" 
- "smax v13.4s, v13.4s, v1.4s\n" - "smax v14.4s, v14.4s, v1.4s\n" - "smax v15.4s, v15.4s, v1.4s\n" - "smax v16.4s, v16.4s, v1.4s\n" - "smax v17.4s, v17.4s, v1.4s\n" - "smax v18.4s, v18.4s, v1.4s\n" - "smax v19.4s, v19.4s, v1.4s\n" - "smax v20.4s, v20.4s, v1.4s\n" - "smax v21.4s, v21.4s, v1.4s\n" - "smax v22.4s, v22.4s, v1.4s\n" - "smax v23.4s, v23.4s, v1.4s\n" - "smax v24.4s, v24.4s, v1.4s\n" - "smax v25.4s, v25.4s, v1.4s\n" - "smax v26.4s, v26.4s, v1.4s\n" - "smax v27.4s, v27.4s, v1.4s\n" - "smax v28.4s, v28.4s, v1.4s\n" - - "2:\n" - "cbz %[scale_known], 7f\n" - "smax v5.4s, v5.4s, v0.4s\n" - "smin v5.4s, v5.4s, v30.4s\n" - "smax v6.4s, v6.4s, v0.4s\n" - "smin v6.4s, v6.4s, v30.4s\n" - "smax v7.4s, v7.4s, v0.4s\n" - "smin v7.4s, v7.4s, v30.4s\n" - "smax v8.4s, v8.4s, v0.4s\n" - "smin v8.4s, v8.4s, v30.4s\n" - "smax v9.4s, v9.4s, v0.4s\n" - "smin v9.4s, v9.4s, v30.4s\n" - "smax v10.4s, v10.4s, v0.4s\n" - "smin v10.4s, v10.4s, v30.4s\n" - "smax v11.4s, v11.4s, v0.4s\n" - "smin v11.4s, v11.4s, v30.4s\n" - "smax v12.4s, v12.4s, v0.4s\n" - "smin v12.4s, v12.4s, v30.4s\n" - "smax v13.4s, v13.4s, v0.4s\n" - "smin v13.4s, v13.4s, v30.4s\n" - "smax v14.4s, v14.4s, v0.4s\n" - "smin v14.4s, v14.4s, v30.4s\n" - "smax v15.4s, v15.4s, v0.4s\n" - "smin v15.4s, v15.4s, v30.4s\n" - "smax v16.4s, v16.4s, v0.4s\n" - "smin v16.4s, v16.4s, v30.4s\n" - "smax v17.4s, v17.4s, v0.4s\n" - "smin v17.4s, v17.4s, v30.4s\n" - "smax v18.4s, v18.4s, v0.4s\n" - "smin v18.4s, v18.4s, v30.4s\n" - "smax v19.4s, v19.4s, v0.4s\n" - "smin v19.4s, v19.4s, v30.4s\n" - "smax v20.4s, v20.4s, v0.4s\n" - "smin v20.4s, v20.4s, v30.4s\n" - "smax v21.4s, v21.4s, v0.4s\n" - "smin v21.4s, v21.4s, v30.4s\n" - "smax v22.4s, v22.4s, v0.4s\n" - "smin v22.4s, v22.4s, v30.4s\n" - "smax v23.4s, v23.4s, v0.4s\n" - "smin v23.4s, v23.4s, v30.4s\n" - "smax v24.4s, v24.4s, v0.4s\n" - "smin v24.4s, v24.4s, v30.4s\n" - "smax v25.4s, v25.4s, v0.4s\n" - "smin v25.4s, v25.4s, v30.4s\n" - "smax v26.4s, v26.4s, v0.4s\n" - "smin v26.4s, v26.4s, v30.4s\n" - "smax v27.4s, v27.4s, v0.4s\n" - "smin v27.4s, v27.4s, v30.4s\n" - "smax v28.4s, v28.4s, v0.4s\n" - "smin v28.4s, v28.4s, v30.4s\n" - - "str q5, [%[out_buf]]\n" - "str q6, [%[out_buf], 16]\n" - "str q7, [%[out_buf], 32]\n" - "str q8, [%[out_buf], 48]\n" - "str q9, [%[out_buf], 64]\n" - "str q10, [%[out_buf], 80]\n" - "str q11, [%[out_buf], 96]\n" - "str q12, [%[out_buf], 112]\n" - "str q13, [%[out_buf], 128]\n" - "str q14, [%[out_buf], 144]\n" - "str q15, [%[out_buf], 160]\n" - "str q16, [%[out_buf], 176]\n" - "str q17, [%[out_buf], 192]\n" - "str q18, [%[out_buf], 208]\n" - "str q19, [%[out_buf], 224]\n" - "str q20, [%[out_buf], 240]\n" - "str q21, [%[out_buf], 256]\n" - "str q22, [%[out_buf], 272]\n" - "str q23, [%[out_buf], 288]\n" - "str q24, [%[out_buf], 304]\n" - "str q25, [%[out_buf], 320]\n" - "str q26, [%[out_buf], 336]\n" - "str q27, [%[out_buf], 352]\n" - "str q28, [%[out_buf], 368]\n" - "b 5f\n" - - "7:\n" - "smax v30.4s, v5.4s, v30.4s\n" - "smin v0.4s, v5.4s, v0.4s\n" - "str q5, [%[out_buf]]\n" - "smax v30.4s, v6.4s, v30.4s\n" - "smin v0.4s, v6.4s, v0.4s\n" - "str q6, [%[out_buf], 16]\n" - "smax v30.4s, v7.4s, v30.4s\n" - "smin v0.4s, v7.4s, v0.4s\n" - "str q7, [%[out_buf], 32]\n" - "smax v30.4s, v8.4s, v30.4s\n" - "smin v0.4s, v8.4s, v0.4s\n" - "str q8, [%[out_buf], 48]\n" - "smax v30.4s, v9.4s, v30.4s\n" - "smin v0.4s, v9.4s, v0.4s\n" - "str q9, [%[out_buf], 64]\n" - "smax v30.4s, v10.4s, v30.4s\n" - "smin v0.4s, v10.4s, v0.4s\n" - "str q10, [%[out_buf], 80]\n" - "smax v30.4s, v11.4s, 
v30.4s\n" - "smin v0.4s, v11.4s, v0.4s\n" - "str q11, [%[out_buf], 96]\n" - "smax v30.4s, v12.4s, v30.4s\n" - "smin v0.4s, v12.4s, v0.4s\n" - "str q12, [%[out_buf], 112]\n" - "smax v30.4s, v13.4s, v30.4s\n" - "smin v0.4s, v13.4s, v0.4s\n" - "str q13, [%[out_buf], 128]\n" - - "smax v30.4s, v14.4s, v30.4s\n" - "smin v0.4s, v14.4s, v0.4s\n" - "str q14, [%[out_buf], 144]\n" - "smax v30.4s, v15.4s, v30.4s\n" - "smin v0.4s, v15.4s, v0.4s\n" - "str q15, [%[out_buf], 160]\n" - "smax v30.4s, v16.4s, v30.4s\n" - "smin v0.4s, v16.4s, v0.4s\n" - "str q16, [%[out_buf], 176]\n" - "smax v30.4s, v17.4s, v30.4s\n" - "smin v0.4s, v17.4s, v0.4s\n" - "str q17, [%[out_buf], 192]\n" - "smax v30.4s, v18.4s, v30.4s\n" - "smin v0.4s, v18.4s, v0.4s\n" - "str q18, [%[out_buf], 208]\n" - "smax v30.4s, v19.4s, v30.4s\n" - "smin v0.4s, v19.4s, v0.4s\n" - "str q19, [%[out_buf], 224]\n" - "smax v30.4s, v20.4s, v30.4s\n" - "smin v0.4s, v20.4s, v0.4s\n" - "str q20, [%[out_buf], 240]\n" - "smax v30.4s, v21.4s, v30.4s\n" - "smin v0.4s, v21.4s, v0.4s\n" - "str q21, [%[out_buf], 256]\n" - "smax v30.4s, v22.4s, v30.4s\n" - "smin v0.4s, v22.4s, v0.4s\n" - "str q22, [%[out_buf], 272]\n" - "smax v30.4s, v23.4s, v30.4s\n" - "smin v0.4s, v23.4s, v0.4s\n" - "str q23, [%[out_buf], 288]\n" - "smax v30.4s, v24.4s, v30.4s\n" - "smin v0.4s, v24.4s, v0.4s\n" - "str q24, [%[out_buf], 304]\n" - "smax v30.4s, v25.4s, v30.4s\n" - "smin v0.4s, v25.4s, v0.4s\n" - "str q25, [%[out_buf], 320]\n" - "smax v30.4s, v26.4s, v30.4s\n" - "smin v0.4s, v26.4s, v0.4s\n" - "str q26, [%[out_buf], 336]\n" - "smax v30.4s, v27.4s, v30.4s\n" - "smin v0.4s, v27.4s, v0.4s\n" - "str q27, [%[out_buf], 352]\n" - "smax v30.4s, v28.4s, v30.4s\n" - "smin v0.4s, v28.4s, v0.4s\n" - "str q28, [%[out_buf], 368]\n" - - "str q30, [%[max]]\n" - "str q0, [%[min]]\n" - - "5:\n" - : - :[out_0]"r"(out_o0hw0), - [out_buf]"r"(out_buf), - [in_0]"r"(in_hw0), - [f_0]"r"(f_o0c0), - [ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_0), - [b_0_s]"r"(b_0_s), - [factor]"r"(factor_v), - [max]"r"(max_i32), - [min]"r"(min_i32), - [conv_relu]"r"(conv_relu_bool), - [out_f16]"r"(out_f16_bool), - [scale_known]"r"(scale_known_bool) - :"memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", "x1", "x2", "x3","x17","x16" - ); - b0 += 8; - b0_s += 8; - } - } - - // ohow_reminder % 12 / 8 - I32 ohow_s = (ohow / 12) * 12; - I32 ohow_tail = ohow - ohow_s; - - if (ohow_tail >= 8) { - I32 hw = ohow_s; - F16 *b0 = biasArray; - I32 *b0_s = biasScaled; - INT8 *in_pack = ((INT8*)tmp) + ic*ih_pad*iw_pad*8; - // pack input - // NCHWc8 => NHWChw8c4 + im2col - U32 in_h[8]; - U32 in_w[8]; - - for (U32 i = 0; i < 8; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; - } - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - INT8 *in_hw8c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - INT8 *in_0 = in_hw8c8 + in_h[0]*iw_pad*8 + in_w[0]*8; - INT8 *in_1 = in_hw8c8 + in_h[1]*iw_pad*8 + in_w[1]*8; - INT8 *in_2 = in_hw8c8 + in_h[2]*iw_pad*8 + in_w[2]*8; - INT8 *in_3 = in_hw8c8 + in_h[3]*iw_pad*8 + in_w[3]*8; - INT8 *in_4 = in_hw8c8 + in_h[4]*iw_pad*8 + in_w[4]*8; - INT8 *in_5 = in_hw8c8 + in_h[5]*iw_pad*8 + in_w[5]*8; - INT8 *in_6 = in_hw8c8 + in_h[6]*iw_pad*8 + in_w[6]*8; - INT8 *in_7 = in_hw8c8 + in_h[7]*iw_pad*8 + in_w[7]*8; - INT8 *in_pack_0 = 
in_pack + c*fh*fw*8*8 + fh_idx*fw*8*4 + fw_idx*8*4; - INT8 *in_pack_1 = in_pack_0 + fh*fw*8*4; - - __asm__ __volatile__( - "ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "ldr d4, [%[in_4]]\n" - "ldr x6, [%[in_6]]\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - - "ldr d5, [%[in_5]]\n" - "ldr x7, [%[in_7]]\n" - "ins v4.d[1], x6\n" - "ins v5.d[1], x7\n" - - "str q20, [%[pack_0]]\n" - "trn1 v24.4s, v4.4s, v5.4s\n" - "trn2 v25.4s, v4.4s, v5.4s\n" - "str q21, [%[pack_1]]\n" - "str q24, [%[pack_0], #16]\n" - "str q25, [%[pack_1], #16]\n" - : - :[pack_0]"r"(in_pack_0), - [pack_1]"r"(in_pack_1), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3), - [in_4]"r"(in_4), - [in_5]"r"(in_5), - [in_6]"r"(in_6), - [in_7]"r"(in_7) - :"memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", "v24", "v25", "x2", "x3", "x6", "x7" - ); - } - } - } - - // compute - for (U32 o = 0; o < oc; o++) { - INT8 *in_hw0 = in_pack; - INT8 *f_o0c0 = filterArray + o*8*fh*fw*ic*8; - I32 *out_buf = biasScaled + oc*8 + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - F16 *b_0 = b0; - I32 *b_0_s = b0_s; - __asm__ __volatile__( - "cbz %[out_f16], 8f\n" - "eor v5.16b, v5.16b, v5.16b\n" - "ldr q1, [%[in_0]]\n" //in_0 - "eor v6.16b, v6.16b, v6.16b\n" - "eor v7.16b, v7.16b, v7.16b\n" - "eor v8.16b, v8.16b, v8.16b\n" - "ldr q0, [%[f_0]]\n" //f_0 - "eor v9.16b, v9.16b, v9.16b\n" - "eor v10.16b, v10.16b, v10.16b\n" - "eor v11.16b, v11.16b, v11.16b\n" - "eor v12.16b, v12.16b, v12.16b\n" - "eor v13.16b, v13.16b, v13.16b\n" - "eor v14.16b, v14.16b, v14.16b\n" - "eor v15.16b, v15.16b, v15.16b\n" - "eor v16.16b, v16.16b, v16.16b\n" - "eor v17.16b, v17.16b, v17.16b\n" - "eor v18.16b, v18.16b, v18.16b\n" - "eor v19.16b, v19.16b, v19.16b\n" - "eor v20.16b, v20.16b, v20.16b\n" - "b 7f\n" - - "8:\n" - "ldp q29, q30, [%[b_0_s]]\n" - "ldr q1, [%[in_0]]\n" //in_0 - "ldr q0, [%[f_0]]\n" //f_0 - "mov v5.16b, v29.16b\n" - "mov v7.16b, v29.16b\n" - "mov v9.16b, v29.16b\n" - "mov v11.16b, v29.16b\n" - "mov v13.16b, v29.16b\n" - "mov v15.16b, v29.16b\n" - "mov v17.16b, v29.16b\n" - "mov v19.16b, v29.16b\n" - - "mov v6.16b, v30.16b\n" - "mov v8.16b, v30.16b\n" - "mov v10.16b, v30.16b\n" - "mov v12.16b, v30.16b\n" - "mov v14.16b, v30.16b\n" - "mov v16.16b, v30.16b\n" - "mov v18.16b, v30.16b\n" - "mov v20.16b, v30.16b\n" - - "7:\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - "0:\n" - "ldr q3, [x3, 16]!\n" - "ldr q29, [x0, 16]\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - - "sdot v13.4s, v0.16b, v3.4b[0]\n" - "sdot v15.4s, v0.16b, v3.4b[1]\n" - "sdot v17.4s, v0.16b, v3.4b[2]\n" - "sdot v19.4s, v0.16b, v3.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - - "ldr q0, [x0, 32]!\n" - "subs x2, x2, #4\n" - "ldr q1, [x3, 16]!\n" - "sdot v14.4s, v29.16b, v3.4b[0]\n" - "sdot v16.4s, v29.16b, v3.4b[1]\n" - "sdot v18.4s, v29.16b, v3.4b[2]\n" - "sdot v20.4s, v29.16b, v3.4b[3]\n" - - "bne 0b\n" - "cbz %[out_f16], 6f\n" - "scvtf v5.4s, v5.4s\n" - "scvtf v6.4s, v6.4s\n" - "ldr q1, [%[factor]]\n" - "scvtf v7.4s, v7.4s\n" - "scvtf v8.4s, v8.4s\n" - "scvtf v9.4s, v9.4s\n" - "ldr q0, [%[b_0]]\n" - 
"scvtf v10.4s, v10.4s\n" - "scvtf v11.4s, v11.4s\n" - "scvtf v12.4s, v12.4s\n" - "scvtf v13.4s, v13.4s\n" - "scvtf v14.4s, v14.4s\n" - "scvtf v15.4s, v15.4s\n" - "scvtf v16.4s, v16.4s\n" - "scvtf v17.4s, v17.4s\n" - "scvtf v18.4s, v18.4s\n" - "scvtf v19.4s, v19.4s\n" - "scvtf v20.4s, v20.4s\n" - - "fmul v5.4s, v1.4s, v5.4s\n" - "fmul v6.4s, v1.4s, v6.4s\n" - "fmul v7.4s, v1.4s, v7.4s\n" - "fmul v8.4s, v1.4s, v8.4s\n" - "fmul v9.4s, v1.4s, v9.4s\n" - "fmul v10.4s, v1.4s, v10.4s\n" - "fmul v11.4s, v1.4s, v11.4s\n" - "fmul v12.4s, v1.4s, v12.4s\n" - "fmul v13.4s, v1.4s, v13.4s\n" - "fmul v14.4s, v1.4s, v14.4s\n" - "fmul v15.4s, v1.4s, v15.4s\n" - "fmul v16.4s, v1.4s, v16.4s\n" - "fmul v17.4s, v1.4s, v17.4s\n" - "fmul v18.4s, v1.4s, v18.4s\n" - "fmul v19.4s, v1.4s, v19.4s\n" - "fmul v20.4s, v1.4s, v20.4s\n" - - "fcvtn v5.4h, v5.4s\n" - "fcvtn v7.4h, v7.4s\n" - "fcvtn v9.4h, v9.4s\n" - "fcvtn v11.4h, v11.4s\n" - "fcvtn v13.4h, v13.4s\n" - "fcvtn v15.4h, v15.4s\n" - "fcvtn v17.4h, v17.4s\n" - "fcvtn v19.4h, v19.4s\n" - - "fcvtn2 v5.8h, v6.4s\n" - "fcvtn2 v7.8h, v8.4s\n" - "fcvtn2 v9.8h, v10.4s\n" - "fcvtn2 v11.8h, v12.4s\n" - "fcvtn2 v13.8h, v14.4s\n" - "fcvtn2 v15.8h, v16.4s\n" - "fcvtn2 v17.8h, v18.4s\n" - "fcvtn2 v19.8h, v20.4s\n" - - "fadd v5.8h, v0.8h, v5.8h\n" - "fadd v7.8h, v0.8h, v7.8h\n" - "fadd v9.8h, v0.8h, v9.8h\n" - "fadd v11.8h, v0.8h, v11.8h\n" - "fadd v13.8h, v0.8h, v13.8h\n" - "fadd v15.8h, v0.8h, v15.8h\n" - "fadd v17.8h, v0.8h, v17.8h\n" - "fadd v19.8h, v0.8h, v19.8h\n" - - "cbz %[conv_relu], 1f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v5.8h, v5.8h, v1.8h\n" - "fmax v7.8h, v7.8h, v1.8h\n" - "fmax v9.8h, v9.8h, v1.8h\n" - "fmax v11.8h, v11.8h, v1.8h\n" - "fmax v13.8h, v13.8h, v1.8h\n" - "fmax v15.8h, v15.8h, v1.8h\n" - "fmax v17.8h, v17.8h, v1.8h\n" - "fmax v19.8h, v19.8h, v1.8h\n" - - "1:\n" - "str q5, [%[out_0]]\n" - "str q7, [%[out_0], #16]\n" - "str q9, [%[out_0], #32]\n" - "str q11, [%[out_0], #48]\n" - "str q13, [%[out_0], #64]\n" - "str q15, [%[out_0], #80]\n" - "str q17, [%[out_0], #96]\n" - "str q19, [%[out_0], #112]\n" - "b 5f\n" - - "6:\n" - "ldr q0, [%[min]]\n" - "ldr q30, [%[max]]\n" - "cbz %[conv_relu], 2f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero - "smax v5.4s, v5.4s, v1.4s\n" - "smax v6.4s, v6.4s, v1.4s\n" - "smax v7.4s, v7.4s, v1.4s\n" - "smax v8.4s, v8.4s, v1.4s\n" - "smax v9.4s, v9.4s, v1.4s\n" - "smax v10.4s, v10.4s, v1.4s\n" - "smax v11.4s, v11.4s, v1.4s\n" - "smax v12.4s, v12.4s, v1.4s\n" - "smax v13.4s, v13.4s, v1.4s\n" - "smax v14.4s, v14.4s, v1.4s\n" - "smax v15.4s, v15.4s, v1.4s\n" - "smax v16.4s, v16.4s, v1.4s\n" - "smax v17.4s, v17.4s, v1.4s\n" - "smax v18.4s, v18.4s, v1.4s\n" - "smax v19.4s, v19.4s, v1.4s\n" - "smax v20.4s, v20.4s, v1.4s\n" - - "2:\n" - "cbz %[scale_known], 7f\n" - "smax v5.4s, v5.4s, v0.4s\n" - "smin v5.4s, v5.4s, v30.4s\n" - "smax v6.4s, v6.4s, v0.4s\n" - "smin v6.4s, v6.4s, v30.4s\n" - "smax v7.4s, v7.4s, v0.4s\n" - "smin v7.4s, v7.4s, v30.4s\n" - "smax v8.4s, v8.4s, v0.4s\n" - "smin v8.4s, v8.4s, v30.4s\n" - "smax v9.4s, v9.4s, v0.4s\n" - "smin v9.4s, v9.4s, v30.4s\n" - "smax v10.4s, v10.4s, v0.4s\n" - "smin v10.4s, v10.4s, v30.4s\n" - "smax v11.4s, v11.4s, v0.4s\n" - "smin v11.4s, v11.4s, v30.4s\n" - "smax v12.4s, v12.4s, v0.4s\n" - "smin v12.4s, v12.4s, v30.4s\n" - "smax v13.4s, v13.4s, v0.4s\n" - "smin v13.4s, v13.4s, v30.4s\n" - "smax v14.4s, v14.4s, v0.4s\n" - "smin v14.4s, v14.4s, v30.4s\n" - "smax v15.4s, v15.4s, v0.4s\n" - "smin v15.4s, v15.4s, v30.4s\n" - "smax v16.4s, v16.4s, v0.4s\n" - "smin v16.4s, v16.4s, 
v30.4s\n" - "smax v17.4s, v17.4s, v0.4s\n" - "smin v17.4s, v17.4s, v30.4s\n" - "smax v18.4s, v18.4s, v0.4s\n" - "smin v18.4s, v18.4s, v30.4s\n" - "smax v19.4s, v19.4s, v0.4s\n" - "smin v19.4s, v19.4s, v30.4s\n" - "smax v20.4s, v20.4s, v0.4s\n" - "smin v20.4s, v20.4s, v30.4s\n" - - "str q5, [%[out_buf]]\n" - "str q6, [%[out_buf], 16]\n" - "str q7, [%[out_buf], 32]\n" - "str q8, [%[out_buf], 48]\n" - "str q9, [%[out_buf], 64]\n" - "str q10, [%[out_buf], 80]\n" - "str q11, [%[out_buf], 96]\n" - "str q12, [%[out_buf], 112]\n" - "str q13, [%[out_buf], 128]\n" - "str q14, [%[out_buf], 144]\n" - "str q15, [%[out_buf], 160]\n" - "str q16, [%[out_buf], 176]\n" - "str q17, [%[out_buf], 192]\n" - "str q18, [%[out_buf], 208]\n" - "str q19, [%[out_buf], 224]\n" - "str q20, [%[out_buf], 240]\n" - "b 5f\n" - - "7:\n" - "smax v30.4s, v5.4s, v30.4s\n" - "smin v0.4s, v5.4s, v0.4s\n" - "str q5, [%[out_buf]]\n" - "smax v30.4s, v6.4s, v30.4s\n" - "smin v0.4s, v6.4s, v0.4s\n" - "str q6, [%[out_buf], 16]\n" - "smax v30.4s, v7.4s, v30.4s\n" - "smin v0.4s, v7.4s, v0.4s\n" - "str q7, [%[out_buf], 32]\n" - "smax v30.4s, v8.4s, v30.4s\n" - "smin v0.4s, v8.4s, v0.4s\n" - "str q8, [%[out_buf], 48]\n" - "smax v30.4s, v9.4s, v30.4s\n" - "smin v0.4s, v9.4s, v0.4s\n" - "str q9, [%[out_buf], 64]\n" - "smax v30.4s, v10.4s, v30.4s\n" - "smin v0.4s, v10.4s, v0.4s\n" - "str q10, [%[out_buf], 80]\n" - "smax v30.4s, v11.4s, v30.4s\n" - "smin v0.4s, v11.4s, v0.4s\n" - "str q11, [%[out_buf], 96]\n" - "smax v30.4s, v12.4s, v30.4s\n" - "smin v0.4s, v12.4s, v0.4s\n" - "str q12, [%[out_buf], 112]\n" - "smax v30.4s, v13.4s, v30.4s\n" - "smin v0.4s, v13.4s, v0.4s\n" - "str q13, [%[out_buf], 128]\n" - - "smax v30.4s, v14.4s, v30.4s\n" - "smin v0.4s, v14.4s, v0.4s\n" - "str q14, [%[out_buf], 144]\n" - "smax v30.4s, v15.4s, v30.4s\n" - "smin v0.4s, v15.4s, v0.4s\n" - "str q15, [%[out_buf], 160]\n" - "smax v30.4s, v16.4s, v30.4s\n" - "smin v0.4s, v16.4s, v0.4s\n" - "str q16, [%[out_buf], 176]\n" - "smax v30.4s, v17.4s, v30.4s\n" - "smin v0.4s, v17.4s, v0.4s\n" - "str q17, [%[out_buf], 192]\n" - "smax v30.4s, v18.4s, v30.4s\n" - "smin v0.4s, v18.4s, v0.4s\n" - "str q18, [%[out_buf], 208]\n" - "smax v30.4s, v19.4s, v30.4s\n" - "smin v0.4s, v19.4s, v0.4s\n" - "str q19, [%[out_buf], 224]\n" - "smax v30.4s, v20.4s, v30.4s\n" - "smin v0.4s, v20.4s, v0.4s\n" - "str q20, [%[out_buf], 240]\n" - - "str q30, [%[max]]\n" - "str q0, [%[min]]\n" - "5:\n" - : - :[out_0]"r"(out_o0hw0), - [out_buf]"r"(out_buf), - [in_0]"r"(in_hw0), - [f_0]"r"(f_o0c0), - [ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_0), - [b_0_s]"r"(b_0_s), - [factor]"r"(factor_v), - [max]"r"(max_i32), - [min]"r"(min_i32), - [conv_relu]"r"(conv_relu_bool), - [out_f16]"r"(out_f16_bool), - [scale_known]"r"(scale_known_bool) - :"memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v29", "v30", "x0", "x1", "x2", "x3","x17","x16" - ); - b0 += 8; - b0_s += 8; - } - ohow_s += 8; - ohow_tail -= 8; - } - - if (ohow_tail >= 4) { - I32 hw = ohow_s; - F16 *b0 = biasArray; - I32 *b0_s = biasScaled; - INT8 *in_pack = ((INT8*)tmp) + ic*ih_pad*iw_pad*8; - // pack input - // NCHWc8 => NHWChw4c4 + im2col - U32 in_h[4]; - U32 in_w[4]; - - for (U32 i=0; i<4; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; - } - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - INT8 *in_hw4c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + 
fw_idx*dilateW*8; - INT8 *in_0 = in_hw4c8 + in_h[0]*iw_pad*8 + in_w[0]*8; - INT8 *in_1 = in_hw4c8 + in_h[1]*iw_pad*8 + in_w[1]*8; - INT8 *in_2 = in_hw4c8 + in_h[2]*iw_pad*8 + in_w[2]*8; - INT8 *in_3 = in_hw4c8 + in_h[3]*iw_pad*8 + in_w[3]*8; - INT8 *in_pack_0 = in_pack + c*fh*fw*4*8 + fh_idx*fw*4*4 + fw_idx*4*4; - INT8 *in_pack_1 = in_pack_0 + fh*fw*4*4; - - __asm__ __volatile__( - "ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - "str q20, [%[pack_0]]\n" - "str q21, [%[pack_1]]\n" - : - :[pack_0]"r"(in_pack_0), - [pack_1]"r"(in_pack_1), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3) - :"memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3" - ); - } - } - } - - // compute - for (U32 o = 0; o < oc; o++) { - INT8 *in_hw0 = in_pack; - INT8 *f_o0c0 = filterArray + o*8*fh*fw*ic*8; - I32 *out_buf = biasScaled + oc*8 + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - F16 *b_0 = b0; - I32 *b_0_s = b0_s; - __asm__ __volatile__( - "cbz %[out_f16], 8f\n" - "eor v5.16b, v5.16b, v5.16b\n" - "ldr q1, [%[in_0]]\n" //in_0 - "eor v6.16b, v6.16b, v6.16b\n" - "eor v7.16b, v7.16b, v7.16b\n" - "eor v8.16b, v8.16b, v8.16b\n" - "ldr q0, [%[f_0]]\n" //f_0 - - "eor v9.16b, v9.16b, v9.16b\n" - "eor v10.16b, v10.16b, v10.16b\n" - "eor v11.16b, v11.16b, v11.16b\n" - "eor v12.16b, v12.16b, v12.16b\n" - "b 7f\n" - - "8:\n" - "ldp q29, q30, [%[b_0_s]]\n" - "ldr q1, [%[in_0]]\n" //in_0 - "ldr q0, [%[f_0]]\n" //f_0 - "mov v5.16b, v29.16b\n" - "mov v7.16b, v29.16b\n" - "mov v9.16b, v29.16b\n" - "mov v11.16b, v29.16b\n" - - "mov v6.16b, v30.16b\n" - "mov v8.16b, v30.16b\n" - "mov v10.16b, v30.16b\n" - "mov v12.16b, v30.16b\n" - - "7:\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - "0:\n" - "ldr q29, [x0, 16]\n" - "ldr q3, [x3, 16]!\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - - "subs x2, x2, #4\n" - "ldr q0, [x0, 32]!\n" - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - "mov v1.16b, v3.16b\n" - - "bne 0b\n" - "cbz %[out_f16], 6f\n" - - "scvtf v5.4s, v5.4s\n" - "scvtf v6.4s, v6.4s\n" - "ldr q1, [%[factor]]\n" - "scvtf v7.4s, v7.4s\n" - "scvtf v8.4s, v8.4s\n" - "scvtf v9.4s, v9.4s\n" - "ldr q0, [%[b_0]]\n" - "scvtf v10.4s, v10.4s\n" - "scvtf v11.4s, v11.4s\n" - "scvtf v12.4s, v12.4s\n" - - "fmul v5.4s, v1.4s, v5.4s\n" - "fmul v6.4s, v1.4s, v6.4s\n" - "fmul v7.4s, v1.4s, v7.4s\n" - "fmul v8.4s, v1.4s, v8.4s\n" - "fmul v9.4s, v1.4s, v9.4s\n" - "fmul v10.4s, v1.4s, v10.4s\n" - "fmul v11.4s, v1.4s, v11.4s\n" - "fmul v12.4s, v1.4s, v12.4s\n" - - "fcvtn v5.4h, v5.4s\n" - "fcvtn v7.4h, v7.4s\n" - "fcvtn v9.4h, v9.4s\n" - "fcvtn v11.4h, v11.4s\n" - - "fcvtn2 v5.8h, v6.4s\n" - "fcvtn2 v7.8h, v8.4s\n" - "fcvtn2 v9.8h, v10.4s\n" - "fcvtn2 v11.8h, v12.4s\n" - - "fadd v5.8h, v0.8h, v5.8h\n" - "fadd v7.8h, v0.8h, v7.8h\n" - "fadd v9.8h, v0.8h, v9.8h\n" - "fadd v11.8h, v0.8h, v11.8h\n" - - "cbz %[conv_relu], 1f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero - "fmax v5.8h, v5.8h, v1.8h\n" - "fmax v7.8h, v7.8h, v1.8h\n" - "fmax v9.8h, v9.8h, v1.8h\n" - "fmax v11.8h, v11.8h, v1.8h\n" - - "1:\n" - "str q5, [%[out_0]]\n" - "str q7, 
[%[out_0], #16]\n" - "str q9, [%[out_0], #32]\n" - "str q11, [%[out_0], #48]\n" - "b 5f\n" - - "6:\n" - "ldr q0, [%[min]]\n" - "ldr q30, [%[max]]\n" - "cbz %[conv_relu], 2f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero - "smax v5.4s, v5.4s, v1.4s\n" - "smax v6.4s, v6.4s, v1.4s\n" - "smax v7.4s, v7.4s, v1.4s\n" - "smax v8.4s, v8.4s, v1.4s\n" - "smax v9.4s, v9.4s, v1.4s\n" - "smax v10.4s, v10.4s, v1.4s\n" - "smax v11.4s, v11.4s, v1.4s\n" - "smax v12.4s, v12.4s, v1.4s\n" - - "2:\n" - "cbz %[scale_known], 7f\n" - "smax v5.4s, v5.4s, v0.4s\n" - "smin v5.4s, v5.4s, v30.4s\n" - "smax v6.4s, v6.4s, v0.4s\n" - "smin v6.4s, v6.4s, v30.4s\n" - "smax v7.4s, v7.4s, v0.4s\n" - "smin v7.4s, v7.4s, v30.4s\n" - "smax v8.4s, v8.4s, v0.4s\n" - "smin v8.4s, v8.4s, v30.4s\n" - "smax v9.4s, v9.4s, v0.4s\n" - "smin v9.4s, v9.4s, v30.4s\n" - "smax v10.4s, v10.4s, v0.4s\n" - "smin v10.4s, v10.4s, v30.4s\n" - "smax v11.4s, v11.4s, v0.4s\n" - "smin v11.4s, v11.4s, v30.4s\n" - "smax v12.4s, v12.4s, v0.4s\n" - "smin v12.4s, v12.4s, v30.4s\n" - - "str q5, [%[out_buf]]\n" - "str q6, [%[out_buf], 16]\n" - "str q7, [%[out_buf], 32]\n" - "str q8, [%[out_buf], 48]\n" - "str q9, [%[out_buf], 64]\n" - "str q10, [%[out_buf], 80]\n" - "str q11, [%[out_buf], 96]\n" - "str q12, [%[out_buf], 112]\n" - "b 5f\n" - - "7:\n" - "smax v30.4s, v5.4s, v30.4s\n" - "smin v0.4s, v5.4s, v0.4s\n" - "str q5, [%[out_buf]]\n" - "smax v30.4s, v6.4s, v30.4s\n" - "smin v0.4s, v6.4s, v0.4s\n" - "str q6, [%[out_buf], 16]\n" - "smax v30.4s, v7.4s, v30.4s\n" - "smin v0.4s, v7.4s, v0.4s\n" - "str q7, [%[out_buf], 32]\n" - "smax v30.4s, v8.4s, v30.4s\n" - "smin v0.4s, v8.4s, v0.4s\n" - "str q8, [%[out_buf], 48]\n" - "smax v30.4s, v9.4s, v30.4s\n" - "smin v0.4s, v9.4s, v0.4s\n" - "str q9, [%[out_buf], 64]\n" - "smax v30.4s, v10.4s, v30.4s\n" - "smin v0.4s, v10.4s, v0.4s\n" - "str q10, [%[out_buf], 80]\n" - "smax v30.4s, v11.4s, v30.4s\n" - "smin v0.4s, v11.4s, v0.4s\n" - "str q11, [%[out_buf], 96]\n" - "smax v30.4s, v12.4s, v30.4s\n" - "smin v0.4s, v12.4s, v0.4s\n" - "str q12, [%[out_buf], 112]\n" - - "str q30, [%[max]]\n" - "str q0, [%[min]]\n" - "5:\n" - : - :[out_0]"r"(out_o0hw0), - [out_buf]"r"(out_buf), - [in_0]"r"(in_hw0), - [f_0]"r"(f_o0c0), - [ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_0), - [b_0_s]"r"(b_0_s), - [factor]"r"(factor_v), - [max]"r"(max_i32), - [min]"r"(min_i32), - [conv_relu]"r"(conv_relu_bool), - [out_f16]"r"(out_f16_bool), - [scale_known]"r"(scale_known_bool) - :"memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v29", "x0", "x1", "x2", "x3","x17","x16" - ); - b0 += 8; - b0_s += 8; - } - ohow_s += 4; - } - - for (I32 hw = ohow_s; hw < ohow; hw++) { - F16 *b0 = biasArray; - I32 *b0_s = biasScaled; - INT8 *in_pack = ((INT8*)tmp) + ic*ih_pad*iw_pad*8; - // pack input - // NCHWc8 => NHWChw1c4 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - for (U32 c = 0; c < ic; c++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - INT8 *in_hw1c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - INT8 *in_0 = in_hw1c8 + in_h_0*iw_pad*8 + in_w_0*8; - INT8 *in_pack_0 = in_pack + c*fh*fw*8 + fh_idx*fw*4 + fw_idx*4; - INT8 *in_pack_1 = in_pack_0 + fh*fw*4; - - memcpy(in_pack_0, in_0, 4*bytesOf(DT_I8)); - memcpy(in_pack_1, in_0+4, 4*bytesOf(DT_I8)); - } - } - } - - // compute - for (U32 o = 0; o < oc; o++) { - INT8 *in_hw = in_pack; - INT8 *f_o = filterArray + o*8*fh*fw*ic*8; - I32 *out_buf = biasScaled + oc*8 + n*oc*ohow*8 + 
o*ohow*8 + hw*8; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - - int32x4_t res[2] = {0}; - if (out_f16_bool == 0) { - res[0] = vld1q_s32(b0_s); - res[1] = vld1q_s32(b0_s + 4); - } - - for(U32 c=0; c min_i32[i]) { - min = min_i32[i]; - } - } - - if (max == 0 && min == 0) { - return NOT_SUPPORTED; - } - - if (max > 0 && min < 0) { - I32 factor_max = 127 * 16777216 / max; - I32 factor_min = -127 * 16777216 / min; - factor = (factor_max < factor_min) ? factor_max : factor_min; - scale_o = (factor_max < factor_min) ? (127.0/max) : (-127.0/min); - } else if (max > 0) { - factor = 127 * 16777216 / max; - scale_o = 127.0 / max; - } else { - factor = -127 * 16777216 / min; - scale_o = -127.0 / min; - } - *outputScale = (*inputScale) * (*filterScale) * scale_o; - } - - U32 num_v = oc * ohow * 2; // Number of q-form vectors - I32 *out_buf = biasScaled + oc*8; - INT8 *out_q = (INT8*)output; - - ret = quantize_I32(num_v, out_buf, factor, scale_o, out_q); - } - return ret; -} - -template EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale, TensorDesc filterDesc, const void* filter, F16* filterScale, - ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc, - void* output, F16* outputScale, ActivationDesc activationDesc); - -template EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale, TensorDesc filterDesc, const void* filter, F16* filterScale, - ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc, - void* output, F16* outputScale, ActivationDesc activationDesc); -#endif diff --git a/tensor_computing/src/cpu/arm/int8/convolution_transform.cpp b/tensor_computing/src/cpu/arm/int8/convolution_transform.cpp deleted file mode 100644 index 83e1ad2d..00000000 --- a/tensor_computing/src/cpu/arm/int8/convolution_transform.cpp +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
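The quantization epilogue at the end of convolution_gemm_A76 above picks factor = 127 * 16777216 / max, i.e. 127/max encoded in Q24 fixed point, and hands it to quantize_I32 together with the int32 result buffer. quantize_I32 itself is not shown in this hunk, so the sketch below is only a scalar model of that multiply-and-shift arithmetic, under the assumption that it behaves as the factor computation implies.

#include <cstdint>

// Scalar model of Q24 requantization: acc * (127/max) computed as an
// integer multiply followed by a 24-bit arithmetic shift.
int8_t requantize_q24(int32_t acc, int32_t factor) // factor ~= 127 * 2^24 / max
{
    int64_t scaled = (int64_t)acc * (int64_t)factor; // widen before multiplying
    return (int8_t)(scaled >> 24);                   // result lands in [-127, 127]
}

With acc bounded by the recorded max/min (the smax/smin bookkeeping in the kernels maintains those bounds), the shifted result cannot overflow int8.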
- - -#ifdef _USE_INT8 -#include "cpu/arm/int8/tensor_computing_int8.h" -#include "cpu/arm/fp16/convolution_winograd_transform.h" - -#include -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing.h" - -inline EE convolution_transform_filter_kernel_int8(TensorDesc filterDesc, const void* filter, - TensorDesc *ftmDesc, void* ftm, - DataFormat ftmDataFormat) -{ - if (nullptr == filter || nullptr == ftmDesc || nullptr == ftm) - CHECK_STATUS(NULL_POINTER); - DataType fdt; - DataFormat fdf; - U32 fn, fc, fh, fw; - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - if (fdf == ftmDataFormat) { - *ftmDesc = filterDesc; - memcpy(ftm, filter, fn*fc*fh*fw*bytesOf(fdt)); - return SUCCESS; - } - if (fdf != DF_NCHW) - CHECK_STATUS(NOT_SUPPORTED); - EE ret = SUCCESS; - switch (ftmDataFormat) { - case DF_NCHWN8C4: { - INT8 *filterArray = (INT8*)filter; - INT8 *ftmArray = (INT8*)ftm; - U32 oc = fn / 8; - U32 fc_quad = fc / 4; - for (U32 o = 0; o < oc; o++) { - for (U32 c = 0; c < fc_quad; c++) { - for (U32 hw = 0; hw < fh*fw; hw++) { - for (U32 o8 = 0; o8 < 8; o8++) { - for (U32 c4 = 0; c4 < 4; c4++) { - ftmArray[o*fh*fw*fc*8 + c*fh*fw*32 + hw*32 + o8*4 + c4] = filterArray[(o*8+o8)*fc*fh*fw + (c*4+c4)*fh*fw + hw]; - } - } - } - } - } - break; - } - case DF_HWNCN8C4: { - F16 *filterArray = (F16*)filter; - F16 *ftmArray = (F16*)ftm; - for (U32 o = 0; o < fn/8; o++) { - for (U32 c = 0; c < fc/4; c++) { - // Each time deal with N2C4; 4 times we have N8C4 - U32 f_off_0 = (o*8)*fc*fh*fw + c*4*fh*fw; - U32 f_off_1 = (o*8+2)*fc*fh*fw + c*4*fh*fw; - U32 f_off_2 = (o*8+4)*fc*fh*fw + c*4*fh*fw; - U32 f_off_3 = (o*8+6)*fc*fh*fw + c*4*fh*fw; - - U32 ftm_off_0 = o*36*fc*8 + c*32; - U32 ftm_off_1 = o*36*fc*8 + c*32 + 8; - U32 ftm_off_2 = o*36*fc*8 + c*32 + 16; - U32 ftm_off_3 = o*36*fc*8 + c*32 + 24; - - F16 F[9][8]; // N2C4 at a time - F16 *F_ptr[9]; - F16 *Fw[36]; - - for (U32 hw = 0; hw < 9; hw++) { - for (U32 oo = 0; oo < 2; oo++) { - for (U32 cc = 0; cc < 4; cc++) { - F[hw][oo*4+cc] = filterArray[f_off_0 + hw + oo*fc*fh*fw + cc*fh*fw]; - } - } - F_ptr[hw] = F[hw]; - } - for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_0 + hw*fc*8; // Each hw fills N8*fc - } - trans_W_4x4_3x3(Fw, F_ptr); - - for (U32 hw = 0; hw < 9; hw++) { - for (U32 oo = 0; oo < 2; oo++) { - for (U32 cc = 0; cc < 4; cc++) { - F[hw][oo*4+cc] = filterArray[f_off_1 + hw + oo*fc*fh*fw + cc*fh*fw]; - } - } - F_ptr[hw] = F[hw]; - } - for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_1 + hw*fc*8; // Each hw fills N8*fc - } - trans_W_4x4_3x3(Fw, F_ptr); - - for (U32 hw = 0; hw < 9; hw++) { - for (U32 oo = 0; oo < 2; oo++) { - for (U32 cc = 0; cc < 4; cc++) { - F[hw][oo*4+cc] = filterArray[f_off_2 + hw + oo*fc*fh*fw + cc*fh*fw]; - } - } - F_ptr[hw] = F[hw]; - } - for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_2 + hw*fc*8; // Each hw fills N8*fc - } - trans_W_4x4_3x3(Fw, F_ptr); - - for (U32 hw = 0; hw < 9; hw++) { - for (U32 oo = 0; oo < 2; oo++) { - for (U32 cc = 0; cc < 4; cc++) { - F[hw][oo*4+cc] = filterArray[f_off_3 + hw + oo*fc*fh*fw + cc*fh*fw]; - } - } - F_ptr[hw] = F[hw]; - } - for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_3 + hw*fc*8; // Each hw fills N8*fc - } - trans_W_4x4_3x3(Fw, F_ptr); - } - } - fdt = DT_F16; - fh = 6; - fw = 6; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); - return ret; -} - - -EE convolution_transform_filter_int8(TensorDesc 
filterDesc, const void* filter, - ConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, void* filterTransformed) -{ - DataFormat ftmDataFormat; - switch (algorithm) { - case CONVOLUTION_ALGORITHM_WINOGRAD: - ftmDataFormat = DF_HWNCN8C4; - break; - case CONVOLUTION_ALGORITHM_GEMM: - ftmDataFormat = DF_NCHWN8C4; - break; - default: - return NOT_MATCH; - } - EE ret = convolution_transform_filter_kernel_int8(filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat); - return ret; -} -#endif diff --git a/tensor_computing/src/cpu/arm/int8/convolution_winograd.h b/tensor_computing/src/cpu/arm/int8/convolution_winograd.h deleted file mode 100644 index f8553a48..00000000 --- a/tensor_computing/src/cpu/arm/int8/convolution_winograd.h +++ /dev/null @@ -1,178 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
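The DF_NCHW to DF_NCHWN8C4 branch of convolution_transform.cpp above is the packing that feeds the SDOT kernels. Read as a standalone function it looks like the sketch below; the index expressions are copied from the deleted loop, only the wrapper and typedefs are new.

#include <cstdint>

typedef int8_t INT8;
typedef uint32_t U32;

// Repack NCHW int8 filters into NCHWN8C4: 8 output channels (o8) and
// 4 input channels (c4) are interleaved so that 32 consecutive bytes are
// exactly the two 16-byte filter vectors one SDOT step consumes.
void repack_nchw_to_nchwn8c4(const INT8 *src, INT8 *dst, U32 fn, U32 fc, U32 fh, U32 fw)
{
    for (U32 o = 0; o < fn / 8; o++)
        for (U32 c = 0; c < fc / 4; c++)
            for (U32 hw = 0; hw < fh * fw; hw++)
                for (U32 o8 = 0; o8 < 8; o8++)
                    for (U32 c4 = 0; c4 < 4; c4++)
                        dst[o*fh*fw*fc*8 + c*fh*fw*32 + hw*32 + o8*4 + c4] =
                            src[(o*8 + o8)*fc*fh*fw + (c*4 + c4)*fh*fw + hw];
}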
- - -#ifndef _H_CONVOLUTION_WINOGRAD -#define _H_CONVOLUTION_WINOGRAD - -#ifdef _USE_INT8 -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "tensor_computing_type.h" - -template -EE convolution_winograd_A55(TensorDesc inputDesc, const void* input, F16* input_scale, TensorDesc filterDesc, const void* filter, F16* filterScale, - ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc, - void* output, F16* outputScale, ActivationDesc am); - -template -EE convolution_winograd_A76(TensorDesc inputDesc, const void* input, F16* input_scale, TensorDesc filterDesc, const void* filter, F16* filterScale, - ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc, - void* output, F16* outputScale, ActivationDesc am); - -inline EE convolution_winograd(TensorDesc inputDesc, const void* input, F16* input_scale, TensorDesc filterDesc, const void* filter, F16* filterScale, - ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc, - void* output, F16* outputScale, ActivationDesc am, Arch arch) -{ - EE ret = SUCCESS; - switch (arch) { - case ARM_A55: - ret = convolution_winograd_A55(inputDesc, input, input_scale, - filterDesc, filter, filterScale, - convDesc, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, outputScale, - am); - break; - case ARM_A76: - ret = convolution_winograd_A76(inputDesc, input, input_scale, - filterDesc, filter, filterScale, - convDesc, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, outputScale, - am); - break; - default: - return NOT_SUPPORTED; - } - return ret; -} - -inline void apply_scale_f16(U32 numData, F16* array, F16 scale, INT8* qArray) -{ - for (U32 i = 0; i < numData; i++) { - F32 tmp = array[i] * scale; - qArray[i] = round(tmp); - } -} - -inline void quantize_wino_input(F16* itmArray, U32 len_per_36, INT8* inQ, F32* inputScale) -{ - U32 numData = len_per_36; - F32 scale; - - for (U32 idx = 0; idx < 36; idx++) { - F16* in = itmArray + idx*numData; - float16x8_t temp_v = vld1q_f16(in); - float16x8_t max_v = temp_v; - float16x8_t min_v = temp_v; - - for (U32 i = 8; i < numData; i += 8) { - temp_v = vld1q_f16(in+i); - max_v = vmaxq_f16(max_v, temp_v); - min_v = vminq_f16(min_v, temp_v); - } - - F16 max = vmaxvq_f16(max_v); - F16 min = vminvq_f16(min_v); - - if (max == 0 && min == 0) { - inputScale[idx] = 0.0; // We can skip this dotprod later - continue; - } - if (max > 0 && min < 0) { - F32 scale_max = 127.0 / max; - F32 scale_min = -128.0 / min; - scale = (scale_max < scale_min) ? 
scale_max : scale_min; - } else if (max < 0) { - scale = -128.0 / min; - } else { // min > 0 - scale = 127.0 / max; - } - - INT8 *base = inQ + idx*numData; - apply_scale_f16(numData, in, scale, base); - inputScale[idx] = scale; - } -} - -inline void quantize_wino_input_s16(short* itmArray, U32 len_per_36, INT8* inQ, F32* inputScale, F16 input_scale) -{ - U32 numData = len_per_36; - short factor; - - for (U32 idx = 0; idx < 36; idx++) { - short* in = itmArray + idx*numData; - int16x8_t temp_v = vld1q_s16(in); - int16x8_t max_v = temp_v; - int16x8_t min_v = temp_v; - - for (U32 i = 8; i < numData; i += 8) { - temp_v = vld1q_s16(in+i); - max_v = vmaxq_s16(max_v, temp_v); - min_v = vminq_s16(min_v, temp_v); - } - - short max = vmaxvq_s16(max_v); - short min = vminvq_s16(min_v); - - if (max == 0 && min == 0) { - inputScale[idx] = 0.0; // We can skip this dotprod later - continue; - } - if (max > 0 && min < 0) { - short factor_max = 127 * 256 / max; - short factor_min = -128 * 256 / min; - factor = (factor_max < factor_min) ? factor_max : factor_min; - } else if (max < 0) { - factor = -128 * 256 / min; - } else { // min > 0 - factor = 127 * 256 / max; - } - - INT8 *base = inQ + idx*numData; - int16x8_t d[4]; - int8x8_t q[4]; - U32 i = 0; - for (; i < numData-31; i += 32) { - for (U32 j = 0; j < 4; j++) { - d[j] = vld1q_s16(in+i+j*8); - } - for (U32 j = 0; j < 4; j++) { - d[j] = vmulq_n_s16(d[j], factor); - } - - q[0] = vshrn_n_s16(d[0], 8); - q[1] = vshrn_n_s16(d[1], 8); - q[2] = vshrn_n_s16(d[2], 8); - vst1_s8(base+i, q[0]); - q[3] = vshrn_n_s16(d[3], 8); - vst1_s8(base+i+8, q[1]); - vst1_s8(base+i+16, q[2]); - vst1_s8(base+i+24, q[3]); - } - - for (; i < numData; i+=8) { - d[0] = vld1q_s16(in+i); - d[0] = vmulq_n_s16(d[0], factor); - q[0] = vshrn_n_s16(d[0], 8); - vst1_s8(base+i, q[0]); - } - inputScale[idx] = (F32)factor * input_scale / 256.0; - } -} -#endif -#endif diff --git a/tensor_computing/src/cpu/arm/int8/convolution_winograd_A55.cpp b/tensor_computing/src/cpu/arm/int8/convolution_winograd_A55.cpp deleted file mode 100644 index 58d3042c..00000000 --- a/tensor_computing/src/cpu/arm/int8/convolution_winograd_A55.cpp +++ /dev/null @@ -1,1472 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
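The header deleted above carried the Winograd input quantizers: for each of the 36 tile components they scan min/max with NEON, derive a symmetric int8 scale, and record a scale of 0 for all-zero components so the matching dot-products can be skipped later. A scalar model of that scale selection is sketched below; it is a simplification under the assumption that plain loops stand in for the vmaxq/vminq reductions, and it folds the max == 0, min < 0 corner into the non-positive branch (the deleted code routes that corner to its "min > 0" branch):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

// Scalar model of quantize_wino_input's per-component scale selection:
// pick the largest scale that maps both extremes of the slice into
// [-128, 127]; return 0 for an all-zero slice so the caller can skip
// the corresponding dot-products entirely.
float quantize_slice(const float *in, size_t n, int8_t *out)
{
    float maxv = *std::max_element(in, in + n);
    float minv = *std::min_element(in, in + n);
    if (maxv == 0 && minv == 0) {
        return 0.0f;                     // dead component, skipped downstream
    }
    float scale;
    if (maxv > 0 && minv < 0) {
        scale = std::min(127.0f / maxv, -128.0f / minv);
    } else if (maxv <= 0) {              // all values non-positive, so minv < 0
        scale = -128.0f / minv;
    } else {                             // all values non-negative, so maxv > 0
        scale = 127.0f / maxv;
    }
    for (size_t i = 0; i < n; i++) {
        out[i] = (int8_t)std::lround(in[i] * scale);
    }
    return scale;
}

The int16 variant in the deleted header applies the same idea in Q8 fixed point: it computes an integer factor as 127*256/max (or -128*256/min), multiplies with vmulq_n_s16, and narrows with vshrn_n_s16(d, 8), so quantization stays in integer arithmetic; the effective scale it records is factor * input_scale / 256.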
- - -#ifdef _USE_INT8 -#include "cpu/arm/int8/convolution_winograd_transform.h" -#include "cpu/arm/int8/convolution_winograd.h" - -template -EE convolution_winograd_A55(TensorDesc inputDesc, const void* input, F16* input_scale, TensorDesc filterDesc, const void* filter, F16* filterScale, - ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc, - void* output, F16* outputScale, ActivationDesc am) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - // not truely one_step. Compute hw12*(6*6)*ic at one time. - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - if (fdf != DF_HWNCN8C4) { - return NOT_MATCH; - } - if (!(fh == 6 && fw == 6)) { - return NOT_MATCH; - } - - // Assume IT is the same as OT - OT* inArray = (OT*)input; - INT8* filterArray = (INT8*)filter; - F16* outArray = (F16*)output; - F16* biasArray = (F16*)bias; - - // both input and output are stored with C8 - oc /= 8; - ic /= 8; - - U32 tile_h = (oh + 3) / 4; - U32 tile_w = (ow + 3) / 4; - I32 tiles = tile_h * tile_w; // num of 6x6 tiles - U32 pad_left = paddingL; - U32 pad_right = paddingR + (tile_w*4 - ow); - U32 pad_w_mod_4 = tile_w*4 - ow; - U32 pad_top = paddingT; - U32 pad_bottom = paddingB + (tile_h*4 - oh); - U32 pad_h_mod_4 = tile_h*4 - oh; - U32 ih_pad = ih + pad_top + pad_bottom; - U32 iw_pad = iw + pad_left + pad_right; - - U32 ohow = oh*ow; - U32 ihiw = ih_pad*iw_pad; - - // tmp = in_pad + itm + otm + inQ + ... 
- // in_pad: ic*ih_pad*iw_pad*8 - // itm: 6*6*ic*12*8 (int16 or fp16) - // otm: 6*6*12*8 (F16) - // inQ: 6*6*ic*12*8 (int8) - OT* inArray_pad = (OT*)tmp; - short* itmArray = (short*)(inArray_pad + ic*ihiw*8); // will be cast to fp16 for fp16 inputs - F16* otmArray = (F16*)(itmArray + 6*6*ic*12*8); - INT8* inQ = (INT8*)(otmArray + 6*6*12*8); - if (DT_I8 == odt) { - outArray = (F16*)(inQ + 6*6*ic*12*8); // After otmArray and pack - } - - // To track the range of the final outputs and prepare for quantization - F16 max[8] = {0}; - F16 min[8] = {0}; - - for (U32 n = 0; n < in; n++) { // for each batch - OT *inArray_pad_mov = inArray_pad; - OT *inArray_mov = inArray + n*ic*ih*iw*8; - for (U32 c = 0; c < ic; c++) { - memset(inArray_pad_mov, 0, pad_top*iw_pad*8*bytesOf(idt)); - inArray_pad_mov += pad_top*iw_pad*8; - for (U32 h = pad_top; h < ih_pad - pad_bottom; h++) { - memset(inArray_pad_mov, 0, pad_left*8*bytesOf(idt)); - inArray_pad_mov += pad_left*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*bytesOf(idt)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, pad_right*8*bytesOf(idt)); - inArray_pad_mov += pad_right*8; - } - memset(inArray_pad_mov, 0, pad_bottom*iw_pad*8*bytesOf(idt)); - inArray_pad_mov += pad_bottom*iw_pad*8; - } - - // tiles / 12 - for (I32 hw = 0; hw < tiles-11; hw+=12) { - // in trans - // NCHWc8 => (6*6)*(C/4)*hw12*c4 - // transform hw1c8 at a time, so we need 12 times to cover hw12c8 - // pack into hw12c4 after quantizing (reuse the space of itmArray) - for (U32 c = 0; c < ic; c++) { - OT *inArray_pad_mov = inArray_pad + c*ihiw*8; - short *Iw_ptr[36]; - short *Iw0[36]; - OT *I0[36]; - short *Iw1[36]; - OT *I1[36]; - short *Iw2[36]; - OT *I2[36]; - short *Iw3[36]; - OT *I3[36]; - short *Iw4[36]; - OT *I4[36]; - short *Iw5[36]; - OT *I5[36]; - short *Iw6[36]; - OT *I6[36]; - short *Iw7[36]; - OT *I7[36]; - short *Iw8[36]; - OT *I8[36]; - short *Iw9[36]; - OT *I9[36]; - short *Iw10[36]; - OT *I10[36]; - short *Iw11[36]; - OT *I11[36]; - - // Store transformed hw12c8 to itmArray - for (U32 i = 0; i < 36; i++) { - Iw0[i] = itmArray + i*12*ic*8 + c*8*12; - Iw1[i] = itmArray + i*12*ic*8 + c*8*12 + 1*8; - Iw2[i] = itmArray + i*12*ic*8 + c*8*12 + 2*8; - Iw3[i] = itmArray + i*12*ic*8 + c*8*12 + 3*8; - Iw4[i] = itmArray + i*12*ic*8 + c*8*12 + 4*8; - Iw5[i] = itmArray + i*12*ic*8 + c*8*12 + 5*8; - Iw6[i] = itmArray + i*12*ic*8 + c*8*12 + 6*8; - Iw7[i] = itmArray + i*12*ic*8 + c*8*12 + 7*8; - Iw8[i] = itmArray + i*12*ic*8 + c*8*12 + 8*8; - Iw9[i] = itmArray + i*12*ic*8 + c*8*12 + 9*8; - Iw10[i] = itmArray + i*12*ic*8 + c*8*12 + 10*8; - Iw11[i] = itmArray + i*12*ic*8 + c*8*12 + 11*8; - } - - U32 h0 = (hw/tile_w)*4; // stride is 4 - U32 w0 = (hw%tile_w)*4; - U32 h1 = ((hw+1)/tile_w)*4; - U32 w1 = ((hw+1)%tile_w)*4; - U32 h2 = ((hw+2)/tile_w)*4; - U32 w2 = ((hw+2)%tile_w)*4; - U32 h3 = ((hw+3)/tile_w)*4; - U32 w3 = ((hw+3)%tile_w)*4; - U32 h4 = ((hw+4)/tile_w)*4; - U32 w4 = ((hw+4)%tile_w)*4; - U32 h5 = ((hw+5)/tile_w)*4; - U32 w5 = ((hw+5)%tile_w)*4; - U32 h6 = ((hw+6)/tile_w)*4; - U32 w6 = ((hw+6)%tile_w)*4; - U32 h7 = ((hw+7)/tile_w)*4; - U32 w7 = ((hw+7)%tile_w)*4; - U32 h8 = ((hw+8)/tile_w)*4; - U32 w8 = ((hw+8)%tile_w)*4; - U32 h9 = ((hw+9)/tile_w)*4; - U32 w9 = ((hw+9)%tile_w)*4; - U32 h10 = ((hw+10)/tile_w)*4; - U32 w10 = ((hw+10)%tile_w)*4; - U32 h11 = ((hw+11)/tile_w)*4; - U32 w11 = ((hw+11)%tile_w)*4; - - for (U32 i = 0; i < 6; i++) { - for (U32 j = 0; j < 6; j++) { - I0[i*6 + j] = inArray_pad_mov + (h0+i)*iw_pad*8 + (w0+j)*8; - I1[i*6 + j] = 
inArray_pad_mov + (h1+i)*iw_pad*8 + (w1+j)*8; - I2[i*6 + j] = inArray_pad_mov + (h2+i)*iw_pad*8 + (w2+j)*8; - I3[i*6 + j] = inArray_pad_mov + (h3+i)*iw_pad*8 + (w3+j)*8; - I4[i*6 + j] = inArray_pad_mov + (h4+i)*iw_pad*8 + (w4+j)*8; - I5[i*6 + j] = inArray_pad_mov + (h5+i)*iw_pad*8 + (w5+j)*8; - I6[i*6 + j] = inArray_pad_mov + (h6+i)*iw_pad*8 + (w6+j)*8; - I7[i*6 + j] = inArray_pad_mov + (h7+i)*iw_pad*8 + (w7+j)*8; - I8[i*6 + j] = inArray_pad_mov + (h8+i)*iw_pad*8 + (w8+j)*8; - I9[i*6 + j] = inArray_pad_mov + (h9+i)*iw_pad*8 + (w9+j)*8; - I10[i*6 + j] = inArray_pad_mov + (h10+i)*iw_pad*8 + (w10+j)*8; - I11[i*6 + j] = inArray_pad_mov + (h11+i)*iw_pad*8 + (w11+j)*8; - } - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw0[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I0); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I0); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw1[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I1); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I1); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw2[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I2); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I2); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw3[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I3); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I3); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw4[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I4); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I4); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw5[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I5); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I5); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw6[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I6); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I6); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw7[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I7); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I7); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw8[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I8); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I8); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw9[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I9); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I9); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw10[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I10); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I10); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw11[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I11); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I11); - } - } - - F32 inputScale[36]; - - if (DT_I8 == idt) { - quantize_wino_input_s16(itmArray, 12*ic*8, inQ, inputScale, *input_scale); - } else { - quantize_wino_input((F16*)itmArray, 12*ic*8, inQ, inputScale); - } - - F32 factor_v[36][4]; - for (U32 i = 0; i < 36; i++) { - if (inputScale[i] == 0) { - factor_v[i][0] = 0; - continue; - } else { - factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; - } - - factor_v[i][1] = factor_v[i][0]; - factor_v[i][2] = factor_v[i][0]; - factor_v[i][3] = factor_v[i][0]; - } - - INT8 *in_pack = (INT8*)itmArray; // Reuse the space - - for (U32 idx=0; idx<36; 
idx++) { - if (factor_v[idx][0] == 0) { // input pixels are all 0 - continue; - } - for (U32 c = 0; c < ic; c++) { // for each 8 channels - INT8 *in_hw12c8 = inQ + idx*12*ic*8 + c*12*8; - - INT8 *in_0 = in_hw12c8; - INT8 *in_1 = in_hw12c8 + 1*8; - INT8 *in_2 = in_hw12c8 + 2*8; - INT8 *in_3 = in_hw12c8 + 3*8; - INT8 *in_4 = in_hw12c8 + 4*8; - INT8 *in_5 = in_hw12c8 + 5*8; - INT8 *in_6 = in_hw12c8 + 6*8; - INT8 *in_7 = in_hw12c8 + 7*8; - INT8 *in_8 = in_hw12c8 + 8*8; - INT8 *in_9 = in_hw12c8 + 9*8; - INT8 *in_10 = in_hw12c8 + 10*8; - INT8 *in_11 = in_hw12c8 + 11*8; - - // NHWChw12c4 - INT8 *in_pack_0 = in_pack + idx*12*ic*8 + c*12*8; - INT8 *in_pack_1 = in_pack_0 + 12*4; - - __asm__ __volatile__( - "ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "ldr d4, [%[in_4]]\n" - "ldr x6, [%[in_6]]\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - - "ldr d5, [%[in_5]]\n" - "ldr x7, [%[in_7]]\n" - "ins v4.d[1], x6\n" - "ins v5.d[1], x7\n" - "ldr d8, [%[in_8]]\n" - "ldr x10, [%[in_10]]\n" - "trn1 v24.4s, v4.4s, v5.4s\n" - "trn2 v25.4s, v4.4s, v5.4s\n" - "ldr d9, [%[in_9]]\n" - "ldr x11, [%[in_11]]\n" - "ins v8.d[1], x10\n" - "ins v9.d[1], x11\n" - - "str q20, [%[pack_0]]\n" - "trn1 v28.4s, v8.4s, v9.4s\n" - "trn2 v29.4s, v8.4s, v9.4s\n" - "str q24, [%[pack_0], #16]\n" - "str q28, [%[pack_0], #32]\n" - "str q21, [%[pack_1]]\n" - "str q25, [%[pack_1], #16]\n" - "str q29, [%[pack_1], #32]\n" - : - :[pack_0]"r"(in_pack_0), - [pack_1]"r"(in_pack_1), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3), - [in_4]"r"(in_4), - [in_5]"r"(in_5), - [in_6]"r"(in_6), - [in_7]"r"(in_7), - [in_8]"r"(in_8), - [in_9]"r"(in_9), - [in_10]"r"(in_10), - [in_11]"r"(in_11) - :"memory", "cc", "v0", "v1", "v4", "v5", "v8", "v9", "v20", "v21", "v24", "v25", "v28", "v29", "x2", "x3", "x6", "x7", "x10", "x11" - ); - } - } - - // compute - for (U32 o = 0; o < oc; o++) { // 8 output channels at a time - // bias - F16 *b_0 = biasArray + o*8; - for (U32 idx = 0; idx < 36; idx++) { - INT8 *in_hw0 = in_pack + idx*12*ic*8; - INT8 *f_o0c0 = filterArray + o*8*36*ic*8 + idx*8*ic*8; - F16 *out_o0hw0 = otmArray + idx*12*8; - if (factor_v[idx][0] == 0) { // input pixels are all 0 - memset(out_o0hw0, 0, 12*8*sizeof(OT)); - continue; - } - F32 *fac = factor_v[idx]; - __asm__ __volatile__( - "eor v5.16b, v5.16b, v5.16b\n" - "ldr d1, [%[in_0]]\n" //in_0 - "eor v6.16b, v6.16b, v6.16b\n" - "ldr x1, [%[in_0], #8]\n" - "eor v7.16b, v7.16b, v7.16b\n" - "ins v1.d[1], x1\n" - "eor v8.16b, v8.16b, v8.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 - "eor v9.16b, v9.16b, v9.16b\n" - "ldr x2, [%[f_0], #8]\n" - "eor v10.16b, v10.16b, v10.16b\n" - "ins v0.d[1], x2\n" - "eor v11.16b, v11.16b, v11.16b\n" - "ldr d3, [%[in_0], #16]\n" //in_1 - "eor v12.16b, v12.16b, v12.16b\n" - "ldr x3, [%[in_0], #24]\n" - "eor v13.16b, v13.16b, v13.16b\n" - "ins v3.d[1], x3\n" - "eor v14.16b, v14.16b, v14.16b\n" - "eor v15.16b, v15.16b, v15.16b\n" - "eor v16.16b, v16.16b, v16.16b\n" - - "eor v17.16b, v17.16b, v17.16b\n" - "eor v18.16b, v18.16b, v18.16b\n" - "eor v19.16b, v19.16b, v19.16b\n" - "eor v20.16b, v20.16b, v20.16b\n" - "eor v21.16b, v21.16b, v21.16b\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - "eor v25.16b, v25.16b, v25.16b\n" - "eor v26.16b, v26.16b, v26.16b\n" - "eor v27.16b, v27.16b, v27.16b\n" - "eor v28.16b, v28.16b, v28.16b\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - 
//give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - "0:\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d2, [x3, 32]\n" - "ldr x16, [x3, 40]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "ldr d29, [x0, 16]\n" - "ldr x17, [x0, 24]\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "ins v2.d[1], x16\n" - "ldr d30, [x3, 48]!\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - "ins v29.d[1], x17\n" - - "sdot v13.4s, v0.16b, v3.4b[0]\n" - "ldr x16, [x3, 8]\n" - "subs x2, x2, #4\n" - "sdot v15.4s, v0.16b, v3.4b[1]\n" - "sdot v17.4s, v0.16b, v3.4b[2]\n" - "ins v30.d[1], x16\n" - "sdot v19.4s, v0.16b, v3.4b[3]\n" - - "sdot v21.4s, v0.16b, v2.4b[0]\n" - "sdot v23.4s, v0.16b, v2.4b[1]\n" - "sdot v25.4s, v0.16b, v2.4b[2]\n" - "sdot v27.4s, v0.16b, v2.4b[3]\n" - - "sdot v14.4s, v29.16b, v3.4b[0]\n" - "sdot v16.4s, v29.16b, v3.4b[1]\n" - "ldr d0, [x0, 32]!\n" - "ldr x17, [x0, 8]\n" - "sdot v18.4s, v29.16b, v3.4b[2]\n" - "sdot v20.4s, v29.16b, v3.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "ldr d3, [x3, 16]\n" - "ldr x16, [x3, 24]\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - - "ins v0.d[1], x17\n" - "ins v3.d[1], x16\n" - - "sdot v22.4s, v29.16b, v2.4b[0]\n" - "mov v1.16b, v30.16b\n" - "sdot v24.4s, v29.16b, v2.4b[1]\n" - "sdot v26.4s, v29.16b, v2.4b[2]\n" - "sdot v28.4s, v29.16b, v2.4b[3]\n" - - "bne 0b\n" - "scvtf v5.4s, v5.4s\n" - "scvtf v6.4s, v6.4s\n" - "ldr d1, [%[factor]]\n" - "ldr x1, [%[factor], #8]\n" - "scvtf v7.4s, v7.4s\n" - "scvtf v8.4s, v8.4s\n" - "ins v1.d[1], x1\n" - "scvtf v9.4s, v9.4s\n" - "scvtf v10.4s, v10.4s\n" - "scvtf v11.4s, v11.4s\n" - "scvtf v12.4s, v12.4s\n" - "scvtf v13.4s, v13.4s\n" - "scvtf v14.4s, v14.4s\n" - "scvtf v15.4s, v15.4s\n" - "scvtf v16.4s, v16.4s\n" - "scvtf v17.4s, v17.4s\n" - "scvtf v18.4s, v18.4s\n" - "scvtf v19.4s, v19.4s\n" - "scvtf v20.4s, v20.4s\n" - "scvtf v21.4s, v21.4s\n" - "scvtf v22.4s, v22.4s\n" - "scvtf v23.4s, v23.4s\n" - "scvtf v24.4s, v24.4s\n" - "scvtf v25.4s, v25.4s\n" - "scvtf v26.4s, v26.4s\n" - "scvtf v27.4s, v27.4s\n" - "scvtf v28.4s, v28.4s\n" - - "fmul v5.4s, v1.4s, v5.4s\n" - "fmul v6.4s, v1.4s, v6.4s\n" - "fmul v7.4s, v1.4s, v7.4s\n" - "fmul v8.4s, v1.4s, v8.4s\n" - "fmul v9.4s, v1.4s, v9.4s\n" - "fmul v10.4s, v1.4s, v10.4s\n" - "fmul v11.4s, v1.4s, v11.4s\n" - "fmul v12.4s, v1.4s, v12.4s\n" - "fmul v13.4s, v1.4s, v13.4s\n" - "fmul v14.4s, v1.4s, v14.4s\n" - "fmul v15.4s, v1.4s, v15.4s\n" - "fmul v16.4s, v1.4s, v16.4s\n" - "fmul v17.4s, v1.4s, v17.4s\n" - "fmul v18.4s, v1.4s, v18.4s\n" - "fmul v19.4s, v1.4s, v19.4s\n" - "fmul v20.4s, v1.4s, v20.4s\n" - "fmul v21.4s, v1.4s, v21.4s\n" - "fmul v22.4s, v1.4s, v22.4s\n" - "fmul v23.4s, v1.4s, v23.4s\n" - "fmul v24.4s, v1.4s, v24.4s\n" - "fmul v25.4s, v1.4s, v25.4s\n" - "fmul v26.4s, v1.4s, v26.4s\n" - "fmul v27.4s, v1.4s, v27.4s\n" - "fmul v28.4s, v1.4s, v28.4s\n" - - "fcvtn v5.4h, v5.4s\n" - "fcvtn v7.4h, v7.4s\n" - "fcvtn v9.4h, v9.4s\n" - "fcvtn v11.4h, v11.4s\n" - "fcvtn v13.4h, v13.4s\n" - "fcvtn v15.4h, v15.4s\n" - "fcvtn v17.4h, v17.4s\n" - "fcvtn v19.4h, v19.4s\n" - "fcvtn v21.4h, v21.4s\n" - "fcvtn v23.4h, v23.4s\n" - "fcvtn v25.4h, v25.4s\n" - "fcvtn v27.4h, v27.4s\n" - - "fcvtn2 v5.8h, v6.4s\n" - "fcvtn2 v7.8h, v8.4s\n" - "fcvtn2 v9.8h, v10.4s\n" - "fcvtn2 v11.8h, v12.4s\n" - "fcvtn2 v13.8h, v14.4s\n" - "fcvtn2 v15.8h, v16.4s\n" - "fcvtn2 v17.8h, v18.4s\n" - "fcvtn2 v19.8h, v20.4s\n" - "fcvtn2 v21.8h, v22.4s\n" - "fcvtn2 v23.8h, v24.4s\n" - "fcvtn2 v25.8h, v26.4s\n" - "fcvtn2 v27.8h, v28.4s\n" - - "str q5, 
[%[out_0]]\n" - "str q7, [%[out_0], #16]\n" - "str q9, [%[out_0], #32]\n" - "str q11, [%[out_0], #48]\n" - "str q13, [%[out_0], #64]\n" - "str q15, [%[out_0], #80]\n" - "str q17, [%[out_0], #96]\n" - "str q19, [%[out_0], #112]\n" - "str q21, [%[out_0], #128]\n" - "str q23, [%[out_0], #144]\n" - "str q25, [%[out_0], #160]\n" - "str q27, [%[out_0], #176]\n" - : - :[out_0]"r"(out_o0hw0), - [in_0]"r"(in_hw0), - [f_0]"r"(f_o0c0), - [ic]"r"((I64)ic*8), - [factor]"r"(fac) - :"memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", "x1", "x2", "x3","x17","x16" - ); - } - // out trans - // (6*6)*hw12*o8 => NOHWo8 - for (U32 hw12 = 0; hw12 < 12; hw12++) { - U32 h = (hw+hw12) / tile_w; - U32 w = (hw+hw12) % tile_w; - F16 *out_0 = outArray + n*oc*ohow*8 + o*ohow*8 + h*4*ow*8 + w*4*8; - - F16 *Ow_0[36]; - F16 *O_0[16]; - - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + idx*12*8 + hw12*8; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - } - } - trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, max, min, am); - } - } - } - - // tiles_reminder % 12 / 8 - I32 tiles_s = (tiles / 12) * 12; - I32 tiles_tail = tiles - tiles_s; - - if (tiles_tail >= 8) { - I32 hw = tiles_s; - // in trans - // NCHWc8 => (6*6)*(C/4)*hw8*c4 - // transform hw1c8 at a time, so we need 8 times to cover hw8c8 - // pack into hw8c4 after quantizing (reuse the space of itmArray) - for (U32 c = 0; c < ic; c++) { - OT *inArray_pad_mov = inArray_pad + c*ihiw*8; - short *Iw_ptr[36]; - short *Iw0[36]; - OT *I0[36]; - short *Iw1[36]; - OT *I1[36]; - short *Iw2[36]; - OT *I2[36]; - short *Iw3[36]; - OT *I3[36]; - short *Iw4[36]; - OT *I4[36]; - short *Iw5[36]; - OT *I5[36]; - short *Iw6[36]; - OT *I6[36]; - short *Iw7[36]; - OT *I7[36]; - - // Store transformed hw12c8 to itmArray - for (U32 i = 0; i < 36; i++) { - Iw0[i] = itmArray + i*8*ic*8 + c*8*8; - Iw1[i] = itmArray + i*8*ic*8 + c*8*8 + 1*8; - Iw2[i] = itmArray + i*8*ic*8 + c*8*8 + 2*8; - Iw3[i] = itmArray + i*8*ic*8 + c*8*8 + 3*8; - Iw4[i] = itmArray + i*8*ic*8 + c*8*8 + 4*8; - Iw5[i] = itmArray + i*8*ic*8 + c*8*8 + 5*8; - Iw6[i] = itmArray + i*8*ic*8 + c*8*8 + 6*8; - Iw7[i] = itmArray + i*8*ic*8 + c*8*8 + 7*8; - } - - U32 h0 = (hw/tile_w)*4; // stride is 4 - U32 w0 = (hw%tile_w)*4; - U32 h1 = ((hw+1)/tile_w)*4; - U32 w1 = ((hw+1)%tile_w)*4; - U32 h2 = ((hw+2)/tile_w)*4; - U32 w2 = ((hw+2)%tile_w)*4; - U32 h3 = ((hw+3)/tile_w)*4; - U32 w3 = ((hw+3)%tile_w)*4; - U32 h4 = ((hw+4)/tile_w)*4; - U32 w4 = ((hw+4)%tile_w)*4; - U32 h5 = ((hw+5)/tile_w)*4; - U32 w5 = ((hw+5)%tile_w)*4; - U32 h6 = ((hw+6)/tile_w)*4; - U32 w6 = ((hw+6)%tile_w)*4; - U32 h7 = ((hw+7)/tile_w)*4; - U32 w7 = ((hw+7)%tile_w)*4; - - for (U32 i = 0; i < 6; i++) { - for (U32 j = 0; j < 6; j++) { - I0[i*6 + j] = inArray_pad_mov + (h0+i)*iw_pad*8 + (w0+j)*8; - I1[i*6 + j] = inArray_pad_mov + (h1+i)*iw_pad*8 + (w1+j)*8; - I2[i*6 + j] = inArray_pad_mov + (h2+i)*iw_pad*8 + (w2+j)*8; - I3[i*6 + j] = inArray_pad_mov + (h3+i)*iw_pad*8 + (w3+j)*8; - I4[i*6 + j] = inArray_pad_mov + (h4+i)*iw_pad*8 + (w4+j)*8; - I5[i*6 + j] = inArray_pad_mov + (h5+i)*iw_pad*8 + (w5+j)*8; - I6[i*6 + j] = inArray_pad_mov + (h6+i)*iw_pad*8 + (w6+j)*8; - I7[i*6 + j] = inArray_pad_mov + (h7+i)*iw_pad*8 + (w7+j)*8; - } - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw0[i]; - } - if (idt == 
DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I0); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I0); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw1[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I1); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I1); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw2[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I2); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I2); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw3[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I3); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I3); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw4[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I4); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I4); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw5[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I5); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I5); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw6[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I6); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I6); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw7[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I7); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I7); - } - } - - F32 inputScale[36]; - - if (idt == DT_I8) { - quantize_wino_input_s16(itmArray, 8*ic*8, inQ, inputScale, *input_scale); - } else { - quantize_wino_input((F16*)itmArray, 8*ic*8, inQ, inputScale); - } - - F32 factor_v[36][4]; - for (U32 i = 0; i < 36; i++) { - if (inputScale[i] == 0) { - factor_v[i][0] = 0; - continue; - } else { - factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; - } - factor_v[i][1] = factor_v[i][0]; - factor_v[i][2] = factor_v[i][0]; - factor_v[i][3] = factor_v[i][0]; - } - - INT8 *in_pack = (INT8*)itmArray; // Reuse the space - - for (U32 idx=0; idx<36; idx++) { - if (factor_v[idx][0] == 0) { // input pixels are all 0 - continue; - } - for (U32 c = 0; c < ic; c++) { // for each 8 channels - INT8 *in_hw8c8 = inQ + idx*8*ic*8 + c*8*8; - - INT8 *in_0 = in_hw8c8; - INT8 *in_1 = in_hw8c8 + 1*8; - INT8 *in_2 = in_hw8c8 + 2*8; - INT8 *in_3 = in_hw8c8 + 3*8; - INT8 *in_4 = in_hw8c8 + 4*8; - INT8 *in_5 = in_hw8c8 + 5*8; - INT8 *in_6 = in_hw8c8 + 6*8; - INT8 *in_7 = in_hw8c8 + 7*8; - - // NHWChw8c4 - INT8 *in_pack_0 = in_pack + idx*8*ic*8 + c*8*8; - INT8 *in_pack_1 = in_pack_0 + 8*4; - - __asm__ __volatile__( - "ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "ldr d4, [%[in_4]]\n" - "ldr x6, [%[in_6]]\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - - "ldr d5, [%[in_5]]\n" - "ldr x7, [%[in_7]]\n" - "ins v4.d[1], x6\n" - "ins v5.d[1], x7\n" - - "str q20, [%[pack_0]]\n" - "trn1 v24.4s, v4.4s, v5.4s\n" - "trn2 v25.4s, v4.4s, v5.4s\n" - "str q21, [%[pack_1]]\n" - "str q24, [%[pack_0], #16]\n" - "str q25, [%[pack_1], #16]\n" - : - :[pack_0]"r"(in_pack_0), - [pack_1]"r"(in_pack_1), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3), - [in_4]"r"(in_4), - [in_5]"r"(in_5), - [in_6]"r"(in_6), - [in_7]"r"(in_7) - :"memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", "v24", "v25", "x2", "x3", "x6", "x7" - ); - } - } - - // compute - for (U32 o = 0; o < oc; o++) { // 8 output channels at a time - // bias - F16 *b_0 = biasArray + 
o*8; - for (U32 idx = 0; idx < 36; idx++) { - INT8 *in_hw0 = in_pack + idx*8*ic*8; - INT8 *f_o0c0 = filterArray + o*8*36*ic*8 + idx*8*ic*8; - F16 *out_o0hw0 = otmArray + idx*8*8; - if (factor_v[idx][0] == 0) { // input pixels are all 0 - memset(out_o0hw0, 0, 8*8*sizeof(OT)); - continue; - } - F32 *fac = factor_v[idx]; - __asm__ __volatile__( - // Bias should be applied after transform - "eor v5.16b, v5.16b, v5.16b\n" - "ldr d1, [%[in_0]]\n" //in_0 - "eor v6.16b, v6.16b, v6.16b\n" - "ldr x1, [%[in_0], #8]\n" - "eor v7.16b, v7.16b, v7.16b\n" - "ins v1.d[1], x1\n" - "eor v8.16b, v8.16b, v8.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 - "eor v9.16b, v9.16b, v9.16b\n" - "ldr x2, [%[f_0], #8]\n" - "eor v10.16b, v10.16b, v10.16b\n" - "ins v0.d[1], x2\n" - "eor v11.16b, v11.16b, v11.16b\n" - "eor v12.16b, v12.16b, v12.16b\n" - "eor v13.16b, v13.16b, v13.16b\n" - "eor v14.16b, v14.16b, v14.16b\n" - "eor v15.16b, v15.16b, v15.16b\n" - "eor v16.16b, v16.16b, v16.16b\n" - "eor v17.16b, v17.16b, v17.16b\n" - "eor v18.16b, v18.16b, v18.16b\n" - "eor v19.16b, v19.16b, v19.16b\n" - "eor v20.16b, v20.16b, v20.16b\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - "0:\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d3, [x3, 16]!\n" - "ldr x16, [x3, 8]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "ldr d29, [x0, 16]\n" - "ldr x17, [x0, 24]\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "ins v3.d[1], x16\n" - "ldr d30, [x3, 16]!\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - "ins v29.d[1], x17\n" - - "sdot v13.4s, v0.16b, v3.4b[0]\n" - "ldr x16, [x3, 8]\n" - "subs x2, x2, #4\n" - "sdot v15.4s, v0.16b, v3.4b[1]\n" - "sdot v17.4s, v0.16b, v3.4b[2]\n" - "ins v30.d[1], x16\n" - "sdot v19.4s, v0.16b, v3.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "ldr d0, [x0, 32]!\n" - "ldr x17, [x0, 8]\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - - "sdot v14.4s, v29.16b, v3.4b[0]\n" - "ins v0.d[1], x17\n" - "mov v1.16b, v30.16b\n" - "sdot v16.4s, v29.16b, v3.4b[1]\n" - "sdot v18.4s, v29.16b, v3.4b[2]\n" - "sdot v20.4s, v29.16b, v3.4b[3]\n" - - "bne 0b\n" - "scvtf v5.4s, v5.4s\n" - "scvtf v6.4s, v6.4s\n" - "ldr d1, [%[factor]]\n" - "ldr x1, [%[factor], #8]\n" - "scvtf v7.4s, v7.4s\n" - "scvtf v8.4s, v8.4s\n" - "ins v1.d[1], x1\n" - "scvtf v9.4s, v9.4s\n" - "scvtf v10.4s, v10.4s\n" - "scvtf v11.4s, v11.4s\n" - "scvtf v12.4s, v12.4s\n" - "scvtf v13.4s, v13.4s\n" - "scvtf v14.4s, v14.4s\n" - "scvtf v15.4s, v15.4s\n" - "scvtf v16.4s, v16.4s\n" - "scvtf v17.4s, v17.4s\n" - "scvtf v18.4s, v18.4s\n" - "scvtf v19.4s, v19.4s\n" - "scvtf v20.4s, v20.4s\n" - - "fmul v5.4s, v1.4s, v5.4s\n" - "fmul v6.4s, v1.4s, v6.4s\n" - "fmul v7.4s, v1.4s, v7.4s\n" - "fmul v8.4s, v1.4s, v8.4s\n" - "fmul v9.4s, v1.4s, v9.4s\n" - "fmul v10.4s, v1.4s, v10.4s\n" - "fmul v11.4s, v1.4s, v11.4s\n" - "fmul v12.4s, v1.4s, v12.4s\n" - "fmul v13.4s, v1.4s, v13.4s\n" - "fmul v14.4s, v1.4s, v14.4s\n" - "fmul v15.4s, v1.4s, v15.4s\n" - "fmul v16.4s, v1.4s, v16.4s\n" - "fmul v17.4s, v1.4s, v17.4s\n" - "fmul v18.4s, v1.4s, v18.4s\n" - "fmul v19.4s, v1.4s, v19.4s\n" - "fmul v20.4s, v1.4s, v20.4s\n" - - "fcvtn v5.4h, v5.4s\n" - "fcvtn v7.4h, v7.4s\n" - "fcvtn v9.4h, v9.4s\n" - "fcvtn v11.4h, v11.4s\n" - "fcvtn v13.4h, v13.4s\n" - "fcvtn v15.4h, v15.4s\n" - "fcvtn v17.4h, v17.4s\n" - "fcvtn v19.4h, v19.4s\n" - - "fcvtn2 v5.8h, v6.4s\n" - "fcvtn2 v7.8h, v8.4s\n" - "fcvtn2 v9.8h, v10.4s\n" - "fcvtn2 v11.8h, v12.4s\n" - "fcvtn2 v13.8h, v14.4s\n" - 
"fcvtn2 v15.8h, v16.4s\n" - "fcvtn2 v17.8h, v18.4s\n" - "fcvtn2 v19.8h, v20.4s\n" - - "str q5, [%[out_0]]\n" - "str q7, [%[out_0], #16]\n" - "str q9, [%[out_0], #32]\n" - "str q11, [%[out_0], #48]\n" - "str q13, [%[out_0], #64]\n" - "str q15, [%[out_0], #80]\n" - "str q17, [%[out_0], #96]\n" - "str q19, [%[out_0], #112]\n" - : - :[out_0]"r"(out_o0hw0), - [in_0]"r"(in_hw0), - [f_0]"r"(f_o0c0), - [ic]"r"((I64)ic*8), - [factor]"r"(fac) - :"memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v29", "v30", "x0", "x1", "x2", "x3","x17","x16" - ); - } - // out trans - // (6*6)*hw8*o8 => NOHWo8 - for (U32 hw8 = 0; hw8 < 8; hw8++) { - U32 h = (hw+hw8) / tile_w; - U32 w = (hw+hw8) % tile_w; - F16 *out_0 = outArray + n*oc*ohow*8 + o*ohow*8 + h*4*ow*8 + w*4*8; - - F16 *Ow_0[36]; - F16 *O_0[16]; - - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + idx*8*8 + hw8*8; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - } - } - trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, max, min, am); - } - } - tiles_s += 8; - tiles_tail -= 8; - } - - if (tiles_tail >= 4) { - I32 hw = tiles_s; - // in trans - // NCHWc8 => (6*6)*(C/4)*hw4*c4 - // transform hw4c8 at a time, so we need 4 times to cover hw4c8 - // pack into hw4c4 after quantizing (reuse the space of itmArray) - for (U32 c = 0; c < ic; c++) { - OT *inArray_pad_mov = inArray_pad + c*ihiw*8; - short *Iw_ptr[36]; - short *Iw0[36]; - OT *I0[36]; - short *Iw1[36]; - OT *I1[36]; - short *Iw2[36]; - OT *I2[36]; - short *Iw3[36]; - OT *I3[36]; - - // Store transformed hw4c8 to itmArray - for (U32 i = 0; i < 36; i++) { - Iw0[i] = itmArray + i*4*ic*8 + c*4*8; - Iw1[i] = itmArray + i*4*ic*8 + c*4*8 + 1*8; - Iw2[i] = itmArray + i*4*ic*8 + c*4*8 + 2*8; - Iw3[i] = itmArray + i*4*ic*8 + c*4*8 + 3*8; - } - - U32 h0 = (hw/tile_w)*4; // stride is 4 - U32 w0 = (hw%tile_w)*4; - U32 h1 = ((hw+1)/tile_w)*4; - U32 w1 = ((hw+1)%tile_w)*4; - U32 h2 = ((hw+2)/tile_w)*4; - U32 w2 = ((hw+2)%tile_w)*4; - U32 h3 = ((hw+3)/tile_w)*4; - U32 w3 = ((hw+3)%tile_w)*4; - - for (U32 i = 0; i < 6; i++) { - for (U32 j = 0; j < 6; j++) { - I0[i*6 + j] = inArray_pad_mov + (h0+i)*iw_pad*8 + (w0+j)*8; - I1[i*6 + j] = inArray_pad_mov + (h1+i)*iw_pad*8 + (w1+j)*8; - I2[i*6 + j] = inArray_pad_mov + (h2+i)*iw_pad*8 + (w2+j)*8; - I3[i*6 + j] = inArray_pad_mov + (h3+i)*iw_pad*8 + (w3+j)*8; - } - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw0[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I0); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I0); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw1[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I1); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I1); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw2[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I2); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I2); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw3[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I3); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I3); - } - } - - F32 inputScale[36]; - - if (idt == DT_I8) { - quantize_wino_input_s16(itmArray, 4*ic*8, inQ, inputScale, *input_scale); - } else { - quantize_wino_input((F16*)itmArray, 4*ic*8, inQ, inputScale); - } - - F32 factor_v[36][4]; - for (U32 i = 0; i < 36; i++) { - if (inputScale[i] == 0) 
{ - factor_v[i][0] = 0; - continue; - } else { - factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; - } - factor_v[i][1] = factor_v[i][0]; - factor_v[i][2] = factor_v[i][0]; - factor_v[i][3] = factor_v[i][0]; - } - - F16 *b0 = biasArray; - INT8 *in_pack = (INT8*)itmArray; // Reuse the space - - for (U32 idx=0; idx<36; idx++) { - if (factor_v[idx][0] == 0) { // input pixels are all 0 - continue; - } - for (U32 c = 0; c < ic; c++) { // for each 8 channels - INT8 *in_hw4c8 = inQ + idx*4*ic*8 + c*4*8; - - INT8 *in_0 = in_hw4c8; - INT8 *in_1 = in_hw4c8 + 1*8; - INT8 *in_2 = in_hw4c8 + 2*8; - INT8 *in_3 = in_hw4c8 + 3*8; - - // NHWChw8c4 - INT8 *in_pack_0 = in_pack + idx*4*ic*8 + c*4*8; - INT8 *in_pack_1 = in_pack_0 + 4*4; - - __asm__ __volatile__( - "ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - "str q20, [%[pack_0]]\n" - "str q21, [%[pack_1]]\n" - : - :[pack_0]"r"(in_pack_0), - [pack_1]"r"(in_pack_1), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3) - :"memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3" - ); - } - } - - // compute - for (U32 o = 0; o < oc; o++) { // 8 output channels at a time - // bias - F16 *b_0 = b0 + o*8; - for (U32 idx = 0; idx < 36; idx++) { - INT8 *in_hw0 = in_pack + idx*4*ic*8; - INT8 *f_o0c0 = filterArray + o*8*36*ic*8 + idx*8*ic*8; - F16 *out_o0hw0 = otmArray + idx*4*8; - if (factor_v[idx][0] == 0) { - memset(out_o0hw0, 0, 4*8*sizeof(OT)); - continue; - } - F32 *fac = factor_v[idx]; - __asm__ __volatile__( - "eor v5.16b, v5.16b, v5.16b\n" - "ldr d1, [%[in_0]]\n" //in_0 - "eor v6.16b, v6.16b, v6.16b\n" - "ldr x1, [%[in_0], #8]\n" - "eor v7.16b, v7.16b, v7.16b\n" - "ins v1.d[1], x1\n" - "eor v8.16b, v8.16b, v8.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 - - "eor v9.16b, v9.16b, v9.16b\n" - "ldr x2, [%[f_0], #8]\n" - "eor v10.16b, v10.16b, v10.16b\n" - "ins v0.d[1], x2\n" - "eor v11.16b, v11.16b, v11.16b\n" - "eor v12.16b, v12.16b, v12.16b\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - "0:\n" - "ldr d29, [x0, 16]\n" - "ldr x17, [x0, 24]\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d3, [x3, 16]!\n" - "ldr x16, [x3, 8]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "ins v29.d[1], x17\n" - "subs x2, x2, #4\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "ins v3.d[1], x16\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "ldr d0, [x0, 32]!\n" - "ldr x17, [x0, 8]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "ins v0.d[1], x17\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - "mov v1.16b, v3.16b\n" - - "bne 0b\n" - - "scvtf v5.4s, v5.4s\n" - "scvtf v6.4s, v6.4s\n" - "ldr d1, [%[factor]]\n" - "ldr x1, [%[factor], #8]\n" - "scvtf v7.4s, v7.4s\n" - "scvtf v8.4s, v8.4s\n" - "ins v1.d[1], x1\n" - "scvtf v9.4s, v9.4s\n" - "scvtf v10.4s, v10.4s\n" - "scvtf v11.4s, v11.4s\n" - "scvtf v12.4s, v12.4s\n" - - "fmul v5.4s, v1.4s, v5.4s\n" - "fmul v6.4s, v1.4s, v6.4s\n" - "fmul v7.4s, v1.4s, v7.4s\n" - "fmul v8.4s, v1.4s, v8.4s\n" - "fmul v9.4s, v1.4s, v9.4s\n" - "fmul v10.4s, v1.4s, v10.4s\n" - "fmul v11.4s, v1.4s, v11.4s\n" - "fmul v12.4s, v1.4s, v12.4s\n" - - "fcvtn v5.4h, v5.4s\n" - "fcvtn v7.4h, v7.4s\n" - "fcvtn v9.4h, v9.4s\n" - "fcvtn v11.4h, v11.4s\n" - - "fcvtn2 v5.8h, v6.4s\n" - "fcvtn2 v7.8h, v8.4s\n" - "fcvtn2 v9.8h, v10.4s\n" - "fcvtn2 v11.8h, v12.4s\n" - - "str q5, 
[%[out_0]]\n" - "str q7, [%[out_0], #16]\n" - "str q9, [%[out_0], #32]\n" - "str q11, [%[out_0], #48]\n" - : - :[out_0]"r"(out_o0hw0), - [in_0]"r"(in_hw0), - [f_0]"r"(f_o0c0), - [ic]"r"((I64)ic*8), - [factor]"r"(fac) - :"memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v29", "x0", "x1", "x2", "x3","x17","x16" - ); - } - // out trans - // (6*6)*hw4*o8 => NOHWo8 - for (U32 hw4 = 0; hw4 < 4; hw4++) { - U32 h = (hw+hw4) / tile_w; - U32 w = (hw+hw4) % tile_w; - F16 *out_0 = outArray + n*oc*ohow*8 + o*ohow*8 + h*4*ow*8 + w*4*8; - - F16 *Ow_0[36]; - F16 *O_0[16]; - - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + idx*4*8 + hw4*8; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - } - } - trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, max, min, am); - } - } - tiles_s += 4; - } - - for (I32 hw = tiles_s; hw < tiles; hw++) { - // in trans - // NCHWc8 => (6*6)*(C/4)*hw1*c4 - // transform hw1c8 - // pack into hw1c4 after quantizing (reuse the space of itmArray) - for (U32 c = 0; c < ic; c++) { - OT *inArray_pad_mov = inArray_pad + c*ihiw*8; - short *Iw_ptr[36]; - short *Iw0[36]; - OT *I0[36]; - - // Store transformed hw12c8 to itmArray - for (U32 i = 0; i < 36; i++) { - Iw0[i] = itmArray + i*ic*8 + c*8; - } - - U32 h0 = (hw/tile_w)*4; // stride is 4 - U32 w0 = (hw%tile_w)*4; - - for (U32 i = 0; i < 6; i++) { - for (U32 j = 0; j < 6; j++) { - I0[i*6 + j] = inArray_pad_mov + (h0+i)*iw_pad*8 + (w0+j)*8; - } - } - - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw0[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I0); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I0); - } - } - - F32 inputScale[36]; - - if (idt == DT_I8) { - quantize_wino_input_s16(itmArray, ic*8, inQ, inputScale, *input_scale); - } else { - quantize_wino_input((F16*)itmArray, ic*8, inQ, inputScale); - } - - F32 factor_v[36][4]; - for (U32 i = 0; i < 36; i++) { - if (inputScale[i] == 0) { - factor_v[i][0] = 0; - continue; - } else { - factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; - } - factor_v[i][1] = factor_v[i][0]; - factor_v[i][2] = factor_v[i][0]; - factor_v[i][3] = factor_v[i][0]; - } - - F16 *b0 = biasArray; - INT8 *in_pack = (INT8*)itmArray; // Reuse the space - - for (U32 idx=0; idx<36; idx++) { - if (factor_v[idx][0] == 0) { - continue; - } - for (U32 c = 0; c < ic; c++) { // for each 8 channels - INT8 *in_0 = inQ + idx*ic*8 + c*8; - - // NHWChw8c4 - INT8 *in_pack_0 = in_pack + idx*ic*8 + c*8; - INT8 *in_pack_1 = in_pack_0 + 4; - - memcpy(in_pack_0, in_0, 4*bytesOf(DT_I8)); - memcpy(in_pack_1, in_0+4, 4*bytesOf(DT_I8)); - } - } - - // compute - for (U32 o = 0; o < oc; o++) { // 8 output channels at a time - // bias - F16 *b_0 = b0 + o*8; - for (U32 idx = 0; idx < 36; idx++) { - INT8 *in_hw = in_pack + idx*ic*8; - INT8 *f_o = filterArray + o*8*36*ic*8 + idx*8*ic*8; - F16 *out_o0hw0 = otmArray + idx*8; - if (factor_v[idx][0] == 0) { - memset(out_o0hw0, 0, 8*sizeof(OT)); - continue; - } - int32x4_t res[2] = {0}; - - for(U32 c = 0; c < ic; c++) { - int8x8_t in_2 = vld1_s8(in_hw); - in_hw += 8; - int8x16_t f_8o[4]; - f_8o[0] = vld1q_s8(f_o); - f_8o[1] = vld1q_s8(f_o+16); - res[0] = vdotq_lane_s32(res[0], f_8o[0], in_2, 0); - res[1] = vdotq_lane_s32(res[1], f_8o[1], in_2, 0); - - f_8o[2] = vld1q_s8(f_o+32); - f_8o[3] = vld1q_s8(f_o+48); - f_o += 64; - res[0] = vdotq_lane_s32(res[0], f_8o[2], in_2, 1); - res[1] = vdotq_lane_s32(res[1], f_8o[3], in_2, 
1); - } - float32x4_t fac = vld1q_f32(factor_v[idx]); - float32x4_t resf0 = vcvtq_f32_s32(res[0]); - float32x4_t resf1 = vcvtq_f32_s32(res[1]); - resf0 = vmulq_f32(resf0, fac); - resf1 = vmulq_f32(resf1, fac); - - float16x4_t resh0 = vcvt_f16_f32(resf0); - float16x4_t resh1 = vcvt_f16_f32(resf1); - - vst1_f16(out_o0hw0, resh0); - vst1_f16(out_o0hw0+4, resh1); - } - // out trans - // (6*6)*hw1*o8 => NOHWo8 - U32 h = hw / tile_w; - U32 w = hw % tile_w; - F16 *out_0 = outArray + n*oc*ohow*8 + o*ohow*8 + h*4*ow*8 + w*4*8; - - F16 *Ow_0[36]; - F16 *O_0[16]; - - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + idx*8; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - } - } - trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, max, min, am); - } - } - } - - if (DT_I8 == odt) { - F16 max_s = max[0]; - F16 min_s = min[0]; - for (U32 i = 1; i < 8; i++) { - if (max_s < max[i]) { - max_s = max[i]; - } - if (min_s > min[i]) { - min_s = min[i]; - } - } - - if (max_s == 0 && min_s == 0) { - return NOT_SUPPORTED; - } - - F16 scale_o; - if (max_s > 0 && min_s < 0) { - F16 scale_max = 127.0 / max_s; - F16 scale_min = -128.0 / min_s; - scale_o = (scale_max < scale_min) ? scale_max : scale_min; - } else if (max_s > 0) { - scale_o = 127.0 / max_s; - } else { - scale_o = -128.0 / min_s; - } - *outputScale = scale_o; - - apply_scale_f16(on*oc*ohow*8, outArray, scale_o, (INT8*)output); - } - return SUCCESS; -} - -template EE convolution_winograd_A55(TensorDesc inputDesc, const void* input, F16* input_scale, TensorDesc filterDesc, const void* filter, F16* filterScale, - ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc, - void* output, F16* outputScale, ActivationDesc am); - -template EE convolution_winograd_A55(TensorDesc inputDesc, const void* input, F16* input_scale, TensorDesc filterDesc, const void* filter, F16* filterScale, - ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc, - void* output, F16* outputScale, ActivationDesc am); -#endif diff --git a/tensor_computing/src/cpu/arm/int8/convolution_winograd_A76.cpp b/tensor_computing/src/cpu/arm/int8/convolution_winograd_A76.cpp deleted file mode 100644 index 7f7f1132..00000000 --- a/tensor_computing/src/cpu/arm/int8/convolution_winograd_A76.cpp +++ /dev/null @@ -1,1427 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifdef _USE_INT8 -#include "cpu/arm/int8/convolution_winograd_transform.h" -#include "cpu/arm/int8/convolution_winograd.h" - -template -EE convolution_winograd_A76(TensorDesc inputDesc, const void* input, F16* input_scale, TensorDesc filterDesc, const void* filter, F16* filterScale, - ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc, - void* output, F16* outputScale, ActivationDesc am) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - - // not truely one_step. Compute hw12*(6*6)*ic at one time. - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - if (fdf != DF_HWNCN8C4) { - return NOT_MATCH; - } - if (!(fh == 6 && fw == 6)) { - return NOT_MATCH; - } - - // Assume IT is the same as OT - OT* inArray = (OT*)input; - INT8* filterArray = (INT8*)filter; - F16* outArray = (F16*)output; - F16* biasArray = (F16*)bias; - - // both input and output are stored with C8 - oc /= 8; - ic /= 8; - - U32 tile_h = (oh + 3) / 4; - U32 tile_w = (ow + 3) / 4; - I32 tiles = tile_h * tile_w; // num of 6x6 tiles - U32 pad_left = paddingL; - U32 pad_right = paddingR + (tile_w*4 - ow); - U32 pad_w_mod_4 = tile_w*4 - ow; - U32 pad_top = paddingT; - U32 pad_bottom = paddingB + (tile_h*4 - oh); - U32 pad_h_mod_4 = tile_h*4 - oh; - U32 ih_pad = ih + pad_top + pad_bottom; - U32 iw_pad = iw + pad_left + pad_right; - - U32 ohow = oh*ow; - U32 ihiw = ih_pad*iw_pad; - - // tmp = in_pad + itm + otm + inQ + ... 
- // in_pad: ic*ih_pad*iw_pad*8 - // itm: 6*6*ic*12*8 (int16 or fp16) - // otm: 6*6*12*8 (F16) - // inQ: 6*6*ic*12*8 (int8) - OT* inArray_pad = (OT*)tmp; - short* itmArray = (short*)(inArray_pad + ic*ihiw*8); // will be cast to fp16 for fp16 inputs - F16* otmArray = (F16*)(itmArray + 6*6*ic*12*8); - INT8* inQ = (INT8*)(otmArray + 6*6*12*8); - if (DT_I8 == odt) { - outArray = (F16*)(inQ + 6*6*ic*12*8); // After otmArray and pack - } - - // To track the range of the final outputs and prepare for quantization - F16 max[8] = {0}; - F16 min[8] = {0}; - - for (U32 n = 0; n < in; n++) { // for each batch - OT *inArray_pad_mov = inArray_pad; - OT *inArray_mov = inArray + n*ic*ih*iw*8; - for (U32 c = 0; c < ic; c++) { - memset(inArray_pad_mov, 0, pad_top*iw_pad*8*bytesOf(idt)); - inArray_pad_mov += pad_top*iw_pad*8; - for (U32 h = pad_top; h < ih_pad - pad_bottom; h++) { - memset(inArray_pad_mov, 0, pad_left*8*bytesOf(idt)); - inArray_pad_mov += pad_left*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*bytesOf(idt)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, pad_right*8*bytesOf(idt)); - inArray_pad_mov += pad_right*8; - } - memset(inArray_pad_mov, 0, pad_bottom*iw_pad*8*bytesOf(idt)); - inArray_pad_mov += pad_bottom*iw_pad*8; - } - - // tiles / 12 - for (I32 hw = 0; hw < tiles-11; hw+=12) { - // in trans - // NCHWc8 => (6*6)*(C/4)*hw12*c4 - // transform hw1c8 at a time, so we need 12 times to cover hw12c8 - // pack into hw12c4 after quantizing (reuse the space of itmArray) - for (U32 c = 0; c < ic; c++) { - OT *inArray_pad_mov = inArray_pad + c*ihiw*8; - short *Iw_ptr[36]; - short *Iw0[36]; - OT *I0[36]; - short *Iw1[36]; - OT *I1[36]; - short *Iw2[36]; - OT *I2[36]; - short *Iw3[36]; - OT *I3[36]; - short *Iw4[36]; - OT *I4[36]; - short *Iw5[36]; - OT *I5[36]; - short *Iw6[36]; - OT *I6[36]; - short *Iw7[36]; - OT *I7[36]; - short *Iw8[36]; - OT *I8[36]; - short *Iw9[36]; - OT *I9[36]; - short *Iw10[36]; - OT *I10[36]; - short *Iw11[36]; - OT *I11[36]; - - // Store transformed hw12c8 to itmArray - for (U32 i = 0; i < 36; i++) { - Iw0[i] = itmArray + i*12*ic*8 + c*8*12; - Iw1[i] = itmArray + i*12*ic*8 + c*8*12 + 1*8; - Iw2[i] = itmArray + i*12*ic*8 + c*8*12 + 2*8; - Iw3[i] = itmArray + i*12*ic*8 + c*8*12 + 3*8; - Iw4[i] = itmArray + i*12*ic*8 + c*8*12 + 4*8; - Iw5[i] = itmArray + i*12*ic*8 + c*8*12 + 5*8; - Iw6[i] = itmArray + i*12*ic*8 + c*8*12 + 6*8; - Iw7[i] = itmArray + i*12*ic*8 + c*8*12 + 7*8; - Iw8[i] = itmArray + i*12*ic*8 + c*8*12 + 8*8; - Iw9[i] = itmArray + i*12*ic*8 + c*8*12 + 9*8; - Iw10[i] = itmArray + i*12*ic*8 + c*8*12 + 10*8; - Iw11[i] = itmArray + i*12*ic*8 + c*8*12 + 11*8; - } - - U32 h0 = (hw/tile_w)*4; // stride is 4 - U32 w0 = (hw%tile_w)*4; - U32 h1 = ((hw+1)/tile_w)*4; - U32 w1 = ((hw+1)%tile_w)*4; - U32 h2 = ((hw+2)/tile_w)*4; - U32 w2 = ((hw+2)%tile_w)*4; - U32 h3 = ((hw+3)/tile_w)*4; - U32 w3 = ((hw+3)%tile_w)*4; - U32 h4 = ((hw+4)/tile_w)*4; - U32 w4 = ((hw+4)%tile_w)*4; - U32 h5 = ((hw+5)/tile_w)*4; - U32 w5 = ((hw+5)%tile_w)*4; - U32 h6 = ((hw+6)/tile_w)*4; - U32 w6 = ((hw+6)%tile_w)*4; - U32 h7 = ((hw+7)/tile_w)*4; - U32 w7 = ((hw+7)%tile_w)*4; - U32 h8 = ((hw+8)/tile_w)*4; - U32 w8 = ((hw+8)%tile_w)*4; - U32 h9 = ((hw+9)/tile_w)*4; - U32 w9 = ((hw+9)%tile_w)*4; - U32 h10 = ((hw+10)/tile_w)*4; - U32 w10 = ((hw+10)%tile_w)*4; - U32 h11 = ((hw+11)/tile_w)*4; - U32 w11 = ((hw+11)%tile_w)*4; - - for (U32 i = 0; i < 6; i++) { - for (U32 j = 0; j < 6; j++) { - I0[i*6 + j] = inArray_pad_mov + (h0+i)*iw_pad*8 + (w0+j)*8; - I1[i*6 + j] = 
inArray_pad_mov + (h1+i)*iw_pad*8 + (w1+j)*8; - I2[i*6 + j] = inArray_pad_mov + (h2+i)*iw_pad*8 + (w2+j)*8; - I3[i*6 + j] = inArray_pad_mov + (h3+i)*iw_pad*8 + (w3+j)*8; - I4[i*6 + j] = inArray_pad_mov + (h4+i)*iw_pad*8 + (w4+j)*8; - I5[i*6 + j] = inArray_pad_mov + (h5+i)*iw_pad*8 + (w5+j)*8; - I6[i*6 + j] = inArray_pad_mov + (h6+i)*iw_pad*8 + (w6+j)*8; - I7[i*6 + j] = inArray_pad_mov + (h7+i)*iw_pad*8 + (w7+j)*8; - I8[i*6 + j] = inArray_pad_mov + (h8+i)*iw_pad*8 + (w8+j)*8; - I9[i*6 + j] = inArray_pad_mov + (h9+i)*iw_pad*8 + (w9+j)*8; - I10[i*6 + j] = inArray_pad_mov + (h10+i)*iw_pad*8 + (w10+j)*8; - I11[i*6 + j] = inArray_pad_mov + (h11+i)*iw_pad*8 + (w11+j)*8; - } - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw0[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I0); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I0); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw1[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I1); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I1); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw2[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I2); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I2); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw3[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I3); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I3); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw4[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I4); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I4); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw5[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I5); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I5); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw6[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I6); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I6); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw7[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I7); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I7); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw8[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I8); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I8); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw9[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I9); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I9); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw10[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I10); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I10); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw11[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I11); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I11); - } - } - - F32 inputScale[36]; - - if (DT_I8 == idt) { - quantize_wino_input_s16(itmArray, 12*ic*8, inQ, inputScale, *input_scale); - } else { - quantize_wino_input((F16*)itmArray, 12*ic*8, inQ, inputScale); - } - - F32 factor_v[36][4]; - for (U32 i = 0; i < 36; i++) { - if (inputScale[i] == 0) { - factor_v[i][0] = 0; - continue; - } else { - factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; - } - - factor_v[i][1] = factor_v[i][0]; - factor_v[i][2] = factor_v[i][0]; - factor_v[i][3] = factor_v[i][0]; - } - - INT8 *in_pack = (INT8*)itmArray; // Reuse the space - - for (U32 idx=0; idx<36; 
idx++) { - if (factor_v[idx][0] == 0) { // input pixels are all 0 - continue; - } - for (U32 c = 0; c < ic; c++) { // for each 8 channels - INT8 *in_hw12c8 = inQ + idx*12*ic*8 + c*12*8; - - INT8 *in_0 = in_hw12c8; - INT8 *in_1 = in_hw12c8 + 1*8; - INT8 *in_2 = in_hw12c8 + 2*8; - INT8 *in_3 = in_hw12c8 + 3*8; - INT8 *in_4 = in_hw12c8 + 4*8; - INT8 *in_5 = in_hw12c8 + 5*8; - INT8 *in_6 = in_hw12c8 + 6*8; - INT8 *in_7 = in_hw12c8 + 7*8; - INT8 *in_8 = in_hw12c8 + 8*8; - INT8 *in_9 = in_hw12c8 + 9*8; - INT8 *in_10 = in_hw12c8 + 10*8; - INT8 *in_11 = in_hw12c8 + 11*8; - - // NHWChw12c4 - INT8 *in_pack_0 = in_pack + idx*12*ic*8 + c*12*8; - INT8 *in_pack_1 = in_pack_0 + 12*4; - - __asm__ __volatile__( - "ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "ldr d4, [%[in_4]]\n" - "ldr x6, [%[in_6]]\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - - "ldr d5, [%[in_5]]\n" - "ldr x7, [%[in_7]]\n" - "ins v4.d[1], x6\n" - "ins v5.d[1], x7\n" - "ldr d8, [%[in_8]]\n" - "ldr x10, [%[in_10]]\n" - "trn1 v24.4s, v4.4s, v5.4s\n" - "trn2 v25.4s, v4.4s, v5.4s\n" - "ldr d9, [%[in_9]]\n" - "ldr x11, [%[in_11]]\n" - "ins v8.d[1], x10\n" - "ins v9.d[1], x11\n" - - "str q20, [%[pack_0]]\n" - "trn1 v28.4s, v8.4s, v9.4s\n" - "trn2 v29.4s, v8.4s, v9.4s\n" - "str q24, [%[pack_0], #16]\n" - "str q28, [%[pack_0], #32]\n" - "str q21, [%[pack_1]]\n" - "str q25, [%[pack_1], #16]\n" - "str q29, [%[pack_1], #32]\n" - : - :[pack_0]"r"(in_pack_0), - [pack_1]"r"(in_pack_1), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3), - [in_4]"r"(in_4), - [in_5]"r"(in_5), - [in_6]"r"(in_6), - [in_7]"r"(in_7), - [in_8]"r"(in_8), - [in_9]"r"(in_9), - [in_10]"r"(in_10), - [in_11]"r"(in_11) - :"memory", "cc", "v0", "v1", "v4", "v5", "v8", "v9", "v20", "v21", "v24", "v25", "v28", "v29", "x2", "x3", "x6", "x7", "x10", "x11" - ); - } - } - - // compute - for (U32 o = 0; o < oc; o++) { // 8 output channels at a time - // bias - F16 *b_0 = biasArray + o*8; - for (U32 idx = 0; idx < 36; idx++) { - INT8 *in_hw0 = in_pack + idx*12*ic*8; - INT8 *f_o0c0 = filterArray + o*8*36*ic*8 + idx*8*ic*8; - F16 *out_o0hw0 = otmArray + idx*12*8; - if (factor_v[idx][0] == 0) { // input pixels are all 0 - memset(out_o0hw0, 0, 12*8*sizeof(OT)); - continue; - } - F32 *fac = factor_v[idx]; - __asm__ __volatile__( - "eor v5.16b, v5.16b, v5.16b\n" - "ldr q1, [%[in_0]]\n" //in_0 - "eor v6.16b, v6.16b, v6.16b\n" - "eor v7.16b, v7.16b, v7.16b\n" - "eor v8.16b, v8.16b, v8.16b\n" - "ldr q0, [%[f_0]]\n" //f_0 - "eor v9.16b, v9.16b, v9.16b\n" - "eor v10.16b, v10.16b, v10.16b\n" - "eor v11.16b, v11.16b, v11.16b\n" - "ldr q3, [%[in_0], #16]\n" //in_1 - "eor v12.16b, v12.16b, v12.16b\n" - "eor v13.16b, v13.16b, v13.16b\n" - "eor v14.16b, v14.16b, v14.16b\n" - "eor v15.16b, v15.16b, v15.16b\n" - "eor v16.16b, v16.16b, v16.16b\n" - - "eor v17.16b, v17.16b, v17.16b\n" - "eor v18.16b, v18.16b, v18.16b\n" - "eor v19.16b, v19.16b, v19.16b\n" - "eor v20.16b, v20.16b, v20.16b\n" - "eor v21.16b, v21.16b, v21.16b\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - "eor v25.16b, v25.16b, v25.16b\n" - "eor v26.16b, v26.16b, v26.16b\n" - "eor v27.16b, v27.16b, v27.16b\n" - "eor v28.16b, v28.16b, v28.16b\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - "0:\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "sdot v7.4s, v0.16b, 
v1.4b[1]\n" - "ldr q2, [x3, 32]\n" - "ldr q29, [x0, 16]\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - - "sdot v13.4s, v0.16b, v3.4b[0]\n" - "sdot v15.4s, v0.16b, v3.4b[1]\n" - "sdot v17.4s, v0.16b, v3.4b[2]\n" - "sdot v19.4s, v0.16b, v3.4b[3]\n" - - "sdot v21.4s, v0.16b, v2.4b[0]\n" - "sdot v23.4s, v0.16b, v2.4b[1]\n" - "sdot v25.4s, v0.16b, v2.4b[2]\n" - "sdot v27.4s, v0.16b, v2.4b[3]\n" - - "sdot v14.4s, v29.16b, v3.4b[0]\n" - "sdot v16.4s, v29.16b, v3.4b[1]\n" - - "sdot v18.4s, v29.16b, v3.4b[2]\n" - "sdot v20.4s, v29.16b, v3.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "ldr q0, [x0, 32]!\n" - "subs x2, x2, #4\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - - "ldr q1, [x3, 48]!\n" - "ldr q3, [x3, 16]\n" - "sdot v22.4s, v29.16b, v2.4b[0]\n" - "sdot v24.4s, v29.16b, v2.4b[1]\n" - "sdot v26.4s, v29.16b, v2.4b[2]\n" - "sdot v28.4s, v29.16b, v2.4b[3]\n" - - "bne 0b\n" - "scvtf v5.4s, v5.4s\n" - "scvtf v6.4s, v6.4s\n" - "ldr q1, [%[factor]]\n" - "scvtf v7.4s, v7.4s\n" - "scvtf v8.4s, v8.4s\n" - "scvtf v9.4s, v9.4s\n" - "scvtf v10.4s, v10.4s\n" - "scvtf v11.4s, v11.4s\n" - "scvtf v12.4s, v12.4s\n" - "scvtf v13.4s, v13.4s\n" - "scvtf v14.4s, v14.4s\n" - "scvtf v15.4s, v15.4s\n" - "scvtf v16.4s, v16.4s\n" - "scvtf v17.4s, v17.4s\n" - "scvtf v18.4s, v18.4s\n" - "scvtf v19.4s, v19.4s\n" - "scvtf v20.4s, v20.4s\n" - "scvtf v21.4s, v21.4s\n" - "scvtf v22.4s, v22.4s\n" - "scvtf v23.4s, v23.4s\n" - "scvtf v24.4s, v24.4s\n" - "scvtf v25.4s, v25.4s\n" - "scvtf v26.4s, v26.4s\n" - "scvtf v27.4s, v27.4s\n" - "scvtf v28.4s, v28.4s\n" - - "fmul v5.4s, v1.4s, v5.4s\n" - "fmul v6.4s, v1.4s, v6.4s\n" - "fmul v7.4s, v1.4s, v7.4s\n" - "fmul v8.4s, v1.4s, v8.4s\n" - "fmul v9.4s, v1.4s, v9.4s\n" - "fmul v10.4s, v1.4s, v10.4s\n" - "fmul v11.4s, v1.4s, v11.4s\n" - "fmul v12.4s, v1.4s, v12.4s\n" - "fmul v13.4s, v1.4s, v13.4s\n" - "fmul v14.4s, v1.4s, v14.4s\n" - "fmul v15.4s, v1.4s, v15.4s\n" - "fmul v16.4s, v1.4s, v16.4s\n" - "fmul v17.4s, v1.4s, v17.4s\n" - "fmul v18.4s, v1.4s, v18.4s\n" - "fmul v19.4s, v1.4s, v19.4s\n" - "fmul v20.4s, v1.4s, v20.4s\n" - "fmul v21.4s, v1.4s, v21.4s\n" - "fmul v22.4s, v1.4s, v22.4s\n" - "fmul v23.4s, v1.4s, v23.4s\n" - "fmul v24.4s, v1.4s, v24.4s\n" - "fmul v25.4s, v1.4s, v25.4s\n" - "fmul v26.4s, v1.4s, v26.4s\n" - "fmul v27.4s, v1.4s, v27.4s\n" - "fmul v28.4s, v1.4s, v28.4s\n" - - "fcvtn v5.4h, v5.4s\n" - "fcvtn v7.4h, v7.4s\n" - "fcvtn v9.4h, v9.4s\n" - "fcvtn v11.4h, v11.4s\n" - "fcvtn v13.4h, v13.4s\n" - "fcvtn v15.4h, v15.4s\n" - "fcvtn v17.4h, v17.4s\n" - "fcvtn v19.4h, v19.4s\n" - "fcvtn v21.4h, v21.4s\n" - "fcvtn v23.4h, v23.4s\n" - "fcvtn v25.4h, v25.4s\n" - "fcvtn v27.4h, v27.4s\n" - - "fcvtn2 v5.8h, v6.4s\n" - "fcvtn2 v7.8h, v8.4s\n" - "fcvtn2 v9.8h, v10.4s\n" - "fcvtn2 v11.8h, v12.4s\n" - "fcvtn2 v13.8h, v14.4s\n" - "fcvtn2 v15.8h, v16.4s\n" - "fcvtn2 v17.8h, v18.4s\n" - "fcvtn2 v19.8h, v20.4s\n" - "fcvtn2 v21.8h, v22.4s\n" - "fcvtn2 v23.8h, v24.4s\n" - "fcvtn2 v25.8h, v26.4s\n" - "fcvtn2 v27.8h, v28.4s\n" - - "str q5, [%[out_0]]\n" - "str q7, [%[out_0], #16]\n" - "str q9, [%[out_0], #32]\n" - "str q11, [%[out_0], #48]\n" - "str q13, [%[out_0], #64]\n" - "str q15, [%[out_0], #80]\n" - "str q17, [%[out_0], #96]\n" - "str q19, [%[out_0], #112]\n" - "str q21, [%[out_0], #128]\n" - "str q23, [%[out_0], #144]\n" - "str q25, [%[out_0], #160]\n" - "str q27, [%[out_0], #176]\n" - : - :[out_0]"r"(out_o0hw0), - [in_0]"r"(in_hw0), - [f_0]"r"(f_o0c0), - [ic]"r"((I64)ic*8), - 
[factor]"r"(fac) - :"memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", - "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", - "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x0", "x1", "x2", "x3", "x17", "x16" - ); - } - // out trans - // (6*6)*hw12*o8 => NOHWo8 - for (U32 hw12 = 0; hw12 < 12; hw12++) { - U32 h = (hw+hw12) / tile_w; - U32 w = (hw+hw12) % tile_w; - F16 *out_0 = outArray + n*oc*ohow*8 + o*ohow*8 + h*4*ow*8 + w*4*8; - - F16 *Ow_0[36]; - F16 *O_0[16]; - - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + idx*12*8 + hw12*8; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - } - } - trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, max, min, am); - } - } - } - - // tiles_reminder % 12 / 8 - I32 tiles_s = (tiles / 12) * 12; - I32 tiles_tail = tiles - tiles_s; - - if (tiles_tail >= 8) { - I32 hw = tiles_s; - // in trans - // NCHWc8 => (6*6)*(C/4)*hw8*c4 - // transform hw1c8 at a time, so we need 8 times to cover hw8c8 - // pack into hw8c4 after quantizing (reuse the space of itmArray) - for (U32 c = 0; c < ic; c++) { - OT *inArray_pad_mov = inArray_pad + c*ihiw*8; - short *Iw_ptr[36]; - short *Iw0[36]; - OT *I0[36]; - short *Iw1[36]; - OT *I1[36]; - short *Iw2[36]; - OT *I2[36]; - short *Iw3[36]; - OT *I3[36]; - short *Iw4[36]; - OT *I4[36]; - short *Iw5[36]; - OT *I5[36]; - short *Iw6[36]; - OT *I6[36]; - short *Iw7[36]; - OT *I7[36]; - - // Store transformed hw12c8 to itmArray - for (U32 i = 0; i < 36; i++) { - Iw0[i] = itmArray + i*8*ic*8 + c*8*8; - Iw1[i] = itmArray + i*8*ic*8 + c*8*8 + 1*8; - Iw2[i] = itmArray + i*8*ic*8 + c*8*8 + 2*8; - Iw3[i] = itmArray + i*8*ic*8 + c*8*8 + 3*8; - Iw4[i] = itmArray + i*8*ic*8 + c*8*8 + 4*8; - Iw5[i] = itmArray + i*8*ic*8 + c*8*8 + 5*8; - Iw6[i] = itmArray + i*8*ic*8 + c*8*8 + 6*8; - Iw7[i] = itmArray + i*8*ic*8 + c*8*8 + 7*8; - } - - U32 h0 = (hw/tile_w)*4; // stride is 4 - U32 w0 = (hw%tile_w)*4; - U32 h1 = ((hw+1)/tile_w)*4; - U32 w1 = ((hw+1)%tile_w)*4; - U32 h2 = ((hw+2)/tile_w)*4; - U32 w2 = ((hw+2)%tile_w)*4; - U32 h3 = ((hw+3)/tile_w)*4; - U32 w3 = ((hw+3)%tile_w)*4; - U32 h4 = ((hw+4)/tile_w)*4; - U32 w4 = ((hw+4)%tile_w)*4; - U32 h5 = ((hw+5)/tile_w)*4; - U32 w5 = ((hw+5)%tile_w)*4; - U32 h6 = ((hw+6)/tile_w)*4; - U32 w6 = ((hw+6)%tile_w)*4; - U32 h7 = ((hw+7)/tile_w)*4; - U32 w7 = ((hw+7)%tile_w)*4; - - for (U32 i = 0; i < 6; i++) { - for (U32 j = 0; j < 6; j++) { - I0[i*6 + j] = inArray_pad_mov + (h0+i)*iw_pad*8 + (w0+j)*8; - I1[i*6 + j] = inArray_pad_mov + (h1+i)*iw_pad*8 + (w1+j)*8; - I2[i*6 + j] = inArray_pad_mov + (h2+i)*iw_pad*8 + (w2+j)*8; - I3[i*6 + j] = inArray_pad_mov + (h3+i)*iw_pad*8 + (w3+j)*8; - I4[i*6 + j] = inArray_pad_mov + (h4+i)*iw_pad*8 + (w4+j)*8; - I5[i*6 + j] = inArray_pad_mov + (h5+i)*iw_pad*8 + (w5+j)*8; - I6[i*6 + j] = inArray_pad_mov + (h6+i)*iw_pad*8 + (w6+j)*8; - I7[i*6 + j] = inArray_pad_mov + (h7+i)*iw_pad*8 + (w7+j)*8; - } - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw0[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I0); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I0); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw1[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I1); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I1); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw2[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I2); - } else { - 
trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I2); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw3[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I3); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I3); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw4[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I4); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I4); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw5[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I5); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I5); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw6[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I6); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I6); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw7[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I7); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I7); - } - } - - F32 inputScale[36]; - - if (idt == DT_I8) { - quantize_wino_input_s16(itmArray, 8*ic*8, inQ, inputScale, *input_scale); - } else { - quantize_wino_input((F16*)itmArray, 8*ic*8, inQ, inputScale); - } - - F32 factor_v[36][4]; - for (U32 i = 0; i < 36; i++) { - if (inputScale[i] == 0) { - factor_v[i][0] = 0; - continue; - } else { - factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; - } - factor_v[i][1] = factor_v[i][0]; - factor_v[i][2] = factor_v[i][0]; - factor_v[i][3] = factor_v[i][0]; - } - - INT8 *in_pack = (INT8*)itmArray; // Reuse the space - - for (U32 idx=0; idx<36; idx++) { - if (factor_v[idx][0] == 0) { // input pixels are all 0 - continue; - } - for (U32 c = 0; c < ic; c++) { // for each 8 channels - INT8 *in_hw8c8 = inQ + idx*8*ic*8 + c*8*8; - - INT8 *in_0 = in_hw8c8; - INT8 *in_1 = in_hw8c8 + 1*8; - INT8 *in_2 = in_hw8c8 + 2*8; - INT8 *in_3 = in_hw8c8 + 3*8; - INT8 *in_4 = in_hw8c8 + 4*8; - INT8 *in_5 = in_hw8c8 + 5*8; - INT8 *in_6 = in_hw8c8 + 6*8; - INT8 *in_7 = in_hw8c8 + 7*8; - - // NHWChw8c4 - INT8 *in_pack_0 = in_pack + idx*8*ic*8 + c*8*8; - INT8 *in_pack_1 = in_pack_0 + 8*4; - - __asm__ __volatile__( - "ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "ldr d4, [%[in_4]]\n" - "ldr x6, [%[in_6]]\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - - "ldr d5, [%[in_5]]\n" - "ldr x7, [%[in_7]]\n" - "ins v4.d[1], x6\n" - "ins v5.d[1], x7\n" - - "str q20, [%[pack_0]]\n" - "trn1 v24.4s, v4.4s, v5.4s\n" - "trn2 v25.4s, v4.4s, v5.4s\n" - "str q21, [%[pack_1]]\n" - "str q24, [%[pack_0], #16]\n" - "str q25, [%[pack_1], #16]\n" - : - :[pack_0]"r"(in_pack_0), - [pack_1]"r"(in_pack_1), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3), - [in_4]"r"(in_4), - [in_5]"r"(in_5), - [in_6]"r"(in_6), - [in_7]"r"(in_7) - :"memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", "v24", "v25", "x2", "x3", "x6", "x7" - ); - } - } - - // compute - for (U32 o = 0; o < oc; o++) { // 8 output channels at a time - // bias - F16 *b_0 = biasArray + o*8; - for (U32 idx = 0; idx < 36; idx++) { - INT8 *in_hw0 = in_pack + idx*8*ic*8; - INT8 *f_o0c0 = filterArray + o*8*36*ic*8 + idx*8*ic*8; - F16 *out_o0hw0 = otmArray + idx*8*8; - if (factor_v[idx][0] == 0) { // input pixels are all 0 - memset(out_o0hw0, 0, 8*8*sizeof(OT)); - continue; - } - F32 *fac = factor_v[idx]; - __asm__ __volatile__( - // Bias should be applied after transform - "eor v5.16b, v5.16b, v5.16b\n" - "ldr q1, 
[%[in_0]]\n" //in_0 - "eor v6.16b, v6.16b, v6.16b\n" - "eor v7.16b, v7.16b, v7.16b\n" - "eor v8.16b, v8.16b, v8.16b\n" - "ldr q0, [%[f_0]]\n" //f_0 - "eor v9.16b, v9.16b, v9.16b\n" - "eor v10.16b, v10.16b, v10.16b\n" - "eor v11.16b, v11.16b, v11.16b\n" - "eor v12.16b, v12.16b, v12.16b\n" - "eor v13.16b, v13.16b, v13.16b\n" - "eor v14.16b, v14.16b, v14.16b\n" - "eor v15.16b, v15.16b, v15.16b\n" - "eor v16.16b, v16.16b, v16.16b\n" - "eor v17.16b, v17.16b, v17.16b\n" - "eor v18.16b, v18.16b, v18.16b\n" - "eor v19.16b, v19.16b, v19.16b\n" - "eor v20.16b, v20.16b, v20.16b\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - "0:\n" - "ldr q3, [x3, 16]!\n" - "ldr q29, [x0, 16]\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - - "sdot v13.4s, v0.16b, v3.4b[0]\n" - "sdot v15.4s, v0.16b, v3.4b[1]\n" - "sdot v17.4s, v0.16b, v3.4b[2]\n" - "sdot v19.4s, v0.16b, v3.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "ldr q0, [x0, 32]!\n" - "subs x2, x2, #4\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - - "sdot v14.4s, v29.16b, v3.4b[0]\n" - "ldr q1, [x3, 16]!\n" - "sdot v16.4s, v29.16b, v3.4b[1]\n" - "sdot v18.4s, v29.16b, v3.4b[2]\n" - "sdot v20.4s, v29.16b, v3.4b[3]\n" - - "bne 0b\n" - "scvtf v5.4s, v5.4s\n" - "scvtf v6.4s, v6.4s\n" - "ldr q1, [%[factor]]\n" - "scvtf v7.4s, v7.4s\n" - "scvtf v8.4s, v8.4s\n" - "scvtf v9.4s, v9.4s\n" - "scvtf v10.4s, v10.4s\n" - "scvtf v11.4s, v11.4s\n" - "scvtf v12.4s, v12.4s\n" - "scvtf v13.4s, v13.4s\n" - "scvtf v14.4s, v14.4s\n" - "scvtf v15.4s, v15.4s\n" - "scvtf v16.4s, v16.4s\n" - "scvtf v17.4s, v17.4s\n" - "scvtf v18.4s, v18.4s\n" - "scvtf v19.4s, v19.4s\n" - "scvtf v20.4s, v20.4s\n" - - "fmul v5.4s, v1.4s, v5.4s\n" - "fmul v6.4s, v1.4s, v6.4s\n" - "fmul v7.4s, v1.4s, v7.4s\n" - "fmul v8.4s, v1.4s, v8.4s\n" - "fmul v9.4s, v1.4s, v9.4s\n" - "fmul v10.4s, v1.4s, v10.4s\n" - "fmul v11.4s, v1.4s, v11.4s\n" - "fmul v12.4s, v1.4s, v12.4s\n" - "fmul v13.4s, v1.4s, v13.4s\n" - "fmul v14.4s, v1.4s, v14.4s\n" - "fmul v15.4s, v1.4s, v15.4s\n" - "fmul v16.4s, v1.4s, v16.4s\n" - "fmul v17.4s, v1.4s, v17.4s\n" - "fmul v18.4s, v1.4s, v18.4s\n" - "fmul v19.4s, v1.4s, v19.4s\n" - "fmul v20.4s, v1.4s, v20.4s\n" - - "fcvtn v5.4h, v5.4s\n" - "fcvtn v7.4h, v7.4s\n" - "fcvtn v9.4h, v9.4s\n" - "fcvtn v11.4h, v11.4s\n" - "fcvtn v13.4h, v13.4s\n" - "fcvtn v15.4h, v15.4s\n" - "fcvtn v17.4h, v17.4s\n" - "fcvtn v19.4h, v19.4s\n" - - "fcvtn2 v5.8h, v6.4s\n" - "fcvtn2 v7.8h, v8.4s\n" - "fcvtn2 v9.8h, v10.4s\n" - "fcvtn2 v11.8h, v12.4s\n" - "fcvtn2 v13.8h, v14.4s\n" - "fcvtn2 v15.8h, v16.4s\n" - "fcvtn2 v17.8h, v18.4s\n" - "fcvtn2 v19.8h, v20.4s\n" - - "str q5, [%[out_0]]\n" - "str q7, [%[out_0], #16]\n" - "str q9, [%[out_0], #32]\n" - "str q11, [%[out_0], #48]\n" - "str q13, [%[out_0], #64]\n" - "str q15, [%[out_0], #80]\n" - "str q17, [%[out_0], #96]\n" - "str q19, [%[out_0], #112]\n" - : - :[out_0]"r"(out_o0hw0), - [in_0]"r"(in_hw0), - [f_0]"r"(f_o0c0), - [ic]"r"((I64)ic*8), - [factor]"r"(fac) - :"memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v29", "v30", "x0", "x1", "x2", "x3","x17","x16" - ); - } - // out trans - // (6*6)*hw8*o8 => NOHWo8 - for (U32 hw8 = 0; hw8 < 8; hw8++) { - U32 h = (hw+hw8) / tile_w; - U32 w = (hw+hw8) % tile_w; - F16 *out_0 = outArray 
+ n*oc*ohow*8 + o*ohow*8 + h*4*ow*8 + w*4*8; - - F16 *Ow_0[36]; - F16 *O_0[16]; - - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + idx*8*8 + hw8*8; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - } - } - trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, max, min, am); - } - } - tiles_s += 8; - tiles_tail -= 8; - } - - if (tiles_tail >= 4) { - I32 hw = tiles_s; - // in trans - // NCHWc8 => (6*6)*(C/4)*hw4*c4 - // transform hw4c8 at a time, so we need 4 times to cover hw4c8 - // pack into hw4c4 after quantizing (reuse the space of itmArray) - for (U32 c = 0; c < ic; c++) { - OT *inArray_pad_mov = inArray_pad + c*ihiw*8; - short *Iw_ptr[36]; - short *Iw0[36]; - OT *I0[36]; - short *Iw1[36]; - OT *I1[36]; - short *Iw2[36]; - OT *I2[36]; - short *Iw3[36]; - OT *I3[36]; - - // Store transformed hw4c8 to itmArray - for (U32 i = 0; i < 36; i++) { - Iw0[i] = itmArray + i*4*ic*8 + c*4*8; - Iw1[i] = itmArray + i*4*ic*8 + c*4*8 + 1*8; - Iw2[i] = itmArray + i*4*ic*8 + c*4*8 + 2*8; - Iw3[i] = itmArray + i*4*ic*8 + c*4*8 + 3*8; - } - - U32 h0 = (hw/tile_w)*4; // stride is 4 - U32 w0 = (hw%tile_w)*4; - U32 h1 = ((hw+1)/tile_w)*4; - U32 w1 = ((hw+1)%tile_w)*4; - U32 h2 = ((hw+2)/tile_w)*4; - U32 w2 = ((hw+2)%tile_w)*4; - U32 h3 = ((hw+3)/tile_w)*4; - U32 w3 = ((hw+3)%tile_w)*4; - - for (U32 i = 0; i < 6; i++) { - for (U32 j = 0; j < 6; j++) { - I0[i*6 + j] = inArray_pad_mov + (h0+i)*iw_pad*8 + (w0+j)*8; - I1[i*6 + j] = inArray_pad_mov + (h1+i)*iw_pad*8 + (w1+j)*8; - I2[i*6 + j] = inArray_pad_mov + (h2+i)*iw_pad*8 + (w2+j)*8; - I3[i*6 + j] = inArray_pad_mov + (h3+i)*iw_pad*8 + (w3+j)*8; - } - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw0[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I0); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I0); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw1[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I1); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I1); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw2[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I2); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I2); - } - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw3[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I3); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I3); - } - } - - F32 inputScale[36]; - - if (idt == DT_I8) { - quantize_wino_input_s16(itmArray, 4*ic*8, inQ, inputScale, *input_scale); - } else { - quantize_wino_input((F16*)itmArray, 4*ic*8, inQ, inputScale); - } - - F32 factor_v[36][4]; - for (U32 i = 0; i < 36; i++) { - if (inputScale[i] == 0) { - factor_v[i][0] = 0; - continue; - } else { - factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; - } - factor_v[i][1] = factor_v[i][0]; - factor_v[i][2] = factor_v[i][0]; - factor_v[i][3] = factor_v[i][0]; - } - - F16 *b0 = biasArray; - INT8 *in_pack = (INT8*)itmArray; // Reuse the space - - for (U32 idx=0; idx<36; idx++) { - if (factor_v[idx][0] == 0) { // input pixels are all 0 - continue; - } - for (U32 c = 0; c < ic; c++) { // for each 8 channels - INT8 *in_hw4c8 = inQ + idx*4*ic*8 + c*4*8; - - INT8 *in_0 = in_hw4c8; - INT8 *in_1 = in_hw4c8 + 1*8; - INT8 *in_2 = in_hw4c8 + 2*8; - INT8 *in_3 = in_hw4c8 + 3*8; - - // NHWChw8c4 - INT8 *in_pack_0 = in_pack + idx*4*ic*8 + c*4*8; - INT8 *in_pack_1 = in_pack_0 + 4*4; - - __asm__ __volatile__( - "ldr d0, [%[in_0]]\n" - "ldr x2, 
[%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - "str q20, [%[pack_0]]\n" - "str q21, [%[pack_1]]\n" - : - :[pack_0]"r"(in_pack_0), - [pack_1]"r"(in_pack_1), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3) - :"memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3" - ); - } - } - - // compute - for (U32 o = 0; o < oc; o++) { // 8 output channels at a time - // bias - F16 *b_0 = b0 + o*8; - for (U32 idx = 0; idx < 36; idx++) { - INT8 *in_hw0 = in_pack + idx*4*ic*8; - INT8 *f_o0c0 = filterArray + o*8*36*ic*8 + idx*8*ic*8; - F16 *out_o0hw0 = otmArray + idx*4*8; - if (factor_v[idx][0] == 0) { - memset(out_o0hw0, 0, 4*8*sizeof(OT)); - continue; - } - F32 *fac = factor_v[idx]; - __asm__ __volatile__( - "eor v5.16b, v5.16b, v5.16b\n" - "eor v6.16b, v6.16b, v6.16b\n" - "ldr q1, [%[in_0]]\n" //in_0 - "ldr q0, [%[f_0]]\n" //f_0 - "eor v7.16b, v7.16b, v7.16b\n" - "eor v8.16b, v8.16b, v8.16b\n" - - "eor v9.16b, v9.16b, v9.16b\n" - "eor v10.16b, v10.16b, v10.16b\n" - "eor v11.16b, v11.16b, v11.16b\n" - "eor v12.16b, v12.16b, v12.16b\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - "0:\n" - "ldr q29, [x0, 16]\n" - "ldr q3, [x3, 16]!\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - - "subs x2, x2, #4\n" - "ldr q0, [x0, 32]!\n" - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - "mov v1.16b, v3.16b\n" - "bne 0b\n" - - "scvtf v5.4s, v5.4s\n" - "scvtf v6.4s, v6.4s\n" - "ldr q1, [%[factor]]\n" - "scvtf v7.4s, v7.4s\n" - "scvtf v8.4s, v8.4s\n" - "scvtf v9.4s, v9.4s\n" - "scvtf v10.4s, v10.4s\n" - "scvtf v11.4s, v11.4s\n" - "scvtf v12.4s, v12.4s\n" - - "fmul v5.4s, v1.4s, v5.4s\n" - "fmul v6.4s, v1.4s, v6.4s\n" - "fmul v7.4s, v1.4s, v7.4s\n" - "fmul v8.4s, v1.4s, v8.4s\n" - "fmul v9.4s, v1.4s, v9.4s\n" - "fmul v10.4s, v1.4s, v10.4s\n" - "fmul v11.4s, v1.4s, v11.4s\n" - "fmul v12.4s, v1.4s, v12.4s\n" - - "fcvtn v5.4h, v5.4s\n" - "fcvtn v7.4h, v7.4s\n" - "fcvtn v9.4h, v9.4s\n" - "fcvtn v11.4h, v11.4s\n" - - "fcvtn2 v5.8h, v6.4s\n" - "fcvtn2 v7.8h, v8.4s\n" - "fcvtn2 v9.8h, v10.4s\n" - "fcvtn2 v11.8h, v12.4s\n" - - "str q5, [%[out_0]]\n" - "str q7, [%[out_0], #16]\n" - "str q9, [%[out_0], #32]\n" - "str q11, [%[out_0], #48]\n" - : - :[out_0]"r"(out_o0hw0), - [in_0]"r"(in_hw0), - [f_0]"r"(f_o0c0), - [ic]"r"((I64)ic*8), - [factor]"r"(fac) - :"memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v29", "x0", "x1", "x2", "x3","x17","x16" - ); - } - // out trans - // (6*6)*hw4*o8 => NOHWo8 - for (U32 hw4 = 0; hw4 < 4; hw4++) { - U32 h = (hw+hw4) / tile_w; - U32 w = (hw+hw4) % tile_w; - F16 *out_0 = outArray + n*oc*ohow*8 + o*ohow*8 + h*4*ow*8 + w*4*8; - - F16 *Ow_0[36]; - F16 *O_0[16]; - - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + idx*4*8 + hw4*8; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - } - } - trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, max, min, am); - } - } - tiles_s += 4; - } - - for (I32 hw = tiles_s; hw < tiles; hw++) { - // in trans - // NCHWc8 => (6*6)*(C/4)*hw1*c4 - // transform hw1c8 - // pack into hw1c4 after quantizing (reuse the space 
of itmArray) - for (U32 c = 0; c < ic; c++) { - OT *inArray_pad_mov = inArray_pad + c*ihiw*8; - short *Iw_ptr[36]; - short *Iw0[36]; - OT *I0[36]; - - // Store transformed hw12c8 to itmArray - for (U32 i = 0; i < 36; i++) { - Iw0[i] = itmArray + i*ic*8 + c*8; - } - - U32 h0 = (hw/tile_w)*4; // stride is 4 - U32 w0 = (hw%tile_w)*4; - - for (U32 i = 0; i < 6; i++) { - for (U32 j = 0; j < 6; j++) { - I0[i*6 + j] = inArray_pad_mov + (h0+i)*iw_pad*8 + (w0+j)*8; - } - } - - for (U32 i = 0; i < 36; i++) { - Iw_ptr[i] = Iw0[i]; - } - if (idt == DT_I8) { - trans_I_int8(Iw_ptr, (INT8* const*)I0); - } else { - trans_I_4x4_3x3((F16**)Iw_ptr, (F16* const*)I0); - } - } - - F32 inputScale[36]; - - if (idt == DT_I8) { - quantize_wino_input_s16(itmArray, ic*8, inQ, inputScale, *input_scale); - } else { - quantize_wino_input((F16*)itmArray, ic*8, inQ, inputScale); - } - - F32 factor_v[36][4]; - for (U32 i = 0; i < 36; i++) { - if (inputScale[i] == 0) { - factor_v[i][0] = 0; - continue; - } else { - factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; - } - factor_v[i][1] = factor_v[i][0]; - factor_v[i][2] = factor_v[i][0]; - factor_v[i][3] = factor_v[i][0]; - } - - F16 *b0 = biasArray; - INT8 *in_pack = (INT8*)itmArray; // Reuse the space - - for (U32 idx=0; idx<36; idx++) { - if (factor_v[idx][0] == 0) { - continue; - } - for (U32 c = 0; c < ic; c++) { // for each 8 channels - INT8 *in_0 = inQ + idx*ic*8 + c*8; - - // NHWChw8c4 - INT8 *in_pack_0 = in_pack + idx*ic*8 + c*8; - INT8 *in_pack_1 = in_pack_0 + 4; - - memcpy(in_pack_0, in_0, 4*bytesOf(DT_I8)); - memcpy(in_pack_1, in_0+4, 4*bytesOf(DT_I8)); - } - } - - // compute - for (U32 o = 0; o < oc; o++) { // 8 output channels at a time - // bias - F16 *b_0 = b0 + o*8; - for (U32 idx = 0; idx < 36; idx++) { - INT8 *in_hw = in_pack + idx*ic*8; - INT8 *f_o = filterArray + o*8*36*ic*8 + idx*8*ic*8; - F16 *out_o0hw0 = otmArray + idx*8; - if (factor_v[idx][0] == 0) { - memset(out_o0hw0, 0, 8*sizeof(OT)); - continue; - } - int32x4_t res[2] = {0}; - - for(U32 c = 0; c < ic; c++) { - int8x8_t in_2 = vld1_s8(in_hw); - in_hw += 8; - int8x16_t f_8o[4]; - f_8o[0] = vld1q_s8(f_o); - f_8o[1] = vld1q_s8(f_o+16); - res[0] = vdotq_lane_s32(res[0], f_8o[0], in_2, 0); - res[1] = vdotq_lane_s32(res[1], f_8o[1], in_2, 0); - - f_8o[2] = vld1q_s8(f_o+32); - f_8o[3] = vld1q_s8(f_o+48); - f_o += 64; - res[0] = vdotq_lane_s32(res[0], f_8o[2], in_2, 1); - res[1] = vdotq_lane_s32(res[1], f_8o[3], in_2, 1); - } - float32x4_t fac = vld1q_f32(factor_v[idx]); - float32x4_t resf0 = vcvtq_f32_s32(res[0]); - float32x4_t resf1 = vcvtq_f32_s32(res[1]); - resf0 = vmulq_f32(resf0, fac); - resf1 = vmulq_f32(resf1, fac); - - float16x4_t resh0 = vcvt_f16_f32(resf0); - float16x4_t resh1 = vcvt_f16_f32(resf1); - - vst1_f16(out_o0hw0, resh0); - vst1_f16(out_o0hw0+4, resh1); - } - // out trans - // (6*6)*hw1*o8 => NOHWo8 - U32 h = hw / tile_w; - U32 w = hw % tile_w; - F16 *out_0 = outArray + n*oc*ohow*8 + o*ohow*8 + h*4*ow*8 + w*4*8; - - F16 *Ow_0[36]; - F16 *O_0[16]; - - for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + idx*8; - } - for (U32 i = 0; i < 4; ++i) { - for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - } - } - trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, max, min, am); - } - } - } - - if (DT_I8 == odt) { - F16 max_s = max[0]; - F16 min_s = min[0]; - for (U32 i = 1; i < 8; i++) { - if (max_s < max[i]) { - max_s = max[i]; - } - if (min_s > min[i]) { - min_s = min[i]; - } - } - - if (max_s == 0 && min_s == 0) { - return 
NOT_SUPPORTED;
-        }
-
-        F16 scale_o;
-        if (max_s > 0 && min_s < 0) {
-            F16 scale_max = 127.0 / max_s;
-            F16 scale_min = -128.0 / min_s;
-            scale_o = (scale_max < scale_min) ? scale_max : scale_min;
-        } else if (max_s > 0) {
-            scale_o = 127.0 / max_s;
-        } else {
-            scale_o = -128.0 / min_s;
-        }
-        *outputScale = scale_o;
-
-        apply_scale_f16(on*oc*ohow*8, outArray, scale_o, (INT8*)output);
-    }
-    return SUCCESS;
-}
-
-template EE convolution_winograd_A76<INT8>(TensorDesc inputDesc, const void* input, F16* input_scale, TensorDesc filterDesc, const void* filter, F16* filterScale,
-    ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc,
-    void* output, F16* outputScale, ActivationDesc am);
-
-template EE convolution_winograd_A76<F16>(TensorDesc inputDesc, const void* input, F16* input_scale, TensorDesc filterDesc, const void* filter, F16* filterScale,
-    ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc,
-    void* output, F16* outputScale, ActivationDesc am);
-#endif
diff --git a/tensor_computing/src/cpu/arm/int8/convolution_winograd_transform.h b/tensor_computing/src/cpu/arm/int8/convolution_winograd_transform.h
deleted file mode 100644
index b903299a..00000000
--- a/tensor_computing/src/cpu/arm/int8/convolution_winograd_transform.h
+++ /dev/null
@@ -1,304 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
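[Editor's note] The output-scale selection at the end of convolution_winograd_A76 above picks the largest symmetric int8 scale that keeps the observed fp16 range representable. A minimal scalar sketch of the same rule; the function name is illustrative, not part of the deleted code:

```cpp
#include <algorithm>

// Choose the largest scale such that scale * max_s <= 127 and
// scale * min_s >= -128, mirroring the scale_o logic above.
static float choose_output_scale(float max_s, float min_s)
{
    if (max_s > 0 && min_s < 0) {
        // Both signs present: the tighter of the two bounds wins.
        return std::min(127.0f / max_s, -128.0f / min_s);
    } else if (max_s > 0) {
        return 127.0f / max_s;  // non-negative outputs (e.g. after ReLU)
    }
    return -128.0f / min_s;     // non-positive outputs
}
```

apply_scale_f16 then maps each fp16 output x to round(x * scale_o) clamped to [-128, 127], which is why an all-zero range (max_s == min_s == 0) is rejected as NOT_SUPPORTED above.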
- - -#ifndef _H_CONVOLUTION_WINOGRAD_TRANSFORM -#define _H_CONVOLUTION_WINOGRAD_TRANSFORM - -#ifdef _USE_INT8 -#include -#include -#include "type.h" -#include "error.h" -#include "cpu/arm/fp16/convolution_winograd_transform.h" - -inline void trans_I_int8(short *Iw[36], INT8* const I[36]) -{ - short T[6][6][8]; - - int8x8_t v_4 = vmov_n_s8(4); - int8x8_t v_minus_4 = vmov_n_s8(-4); - int8x8_t v_minus_5 = vmov_n_s8(-5); - - for (U32 i = 0; i < 6; i++) { - int8x8_t v_I0 = vld1_s8(I[0*6+i]); - int8x8_t v_I1 = vld1_s8(I[1*6+i]); - int8x8_t v_I2 = vld1_s8(I[2*6+i]); - int8x8_t v_I3 = vld1_s8(I[3*6+i]); - int8x8_t v_I4 = vld1_s8(I[4*6+i]); - int8x8_t v_I5 = vld1_s8(I[5*6+i]); - - // Reorder to accelerate - int16x8_t v_t0 = vmull_s8(v_I2, v_minus_4); - - int16x8_t v_t1 = vmull_s8(v_I1, v_minus_4); - - int16x8_t v_t2 = vsubl_s8(v_I4, v_I2); - - int16x8_t v_t3 = vsubl_s8(v_I3, v_I1); - - v_t0 = vaddw_s8(v_t0, v_I4); - - v_t1 = vaddw_s8(v_t1, v_I3); - - v_t3 = vmulq_n_s16(v_t3, 2); - - int16x8_t v_t4 = vmull_s8(v_I0, v_4); - - int16x8_t v_t5 = vmull_s8(v_I1, v_4); - - int16x8_t v_T0 = vmull_s8(v_I2, v_minus_5); - - int16x8_t v_T1 = vaddq_s16(v_t1, v_t0); - - v_t4 = vaddw_s8(v_t4, v_I4); - - v_t5 = vaddw_s8(v_t5, v_I5); - - v_T0 = vaddq_s16(v_T0, v_t4); - - int16x8_t v_T2 = vsubq_s16(v_t0, v_t1); - - int16x8_t v_T3 = vaddq_s16(v_t3, v_t2); - - int16x8_t v_T4 = vsubq_s16(v_t2, v_t3); - - int16x8_t v_T5 = vmull_s8(v_I3, v_minus_5); - - vst1q_s16(T[0][i], v_T0); - vst1q_s16(T[1][i], v_T1); - vst1q_s16(T[2][i], v_T2); - vst1q_s16(T[3][i], v_T3); - v_T5 = vaddq_s16(v_T5, v_t5); - vst1q_s16(T[4][i], v_T4); - vst1q_s16(T[5][i], v_T5); - } - - for (U32 i = 0; i < 6; i++) { - int16x8_t v_T0 = vld1q_s16(T[i][0]); - int16x8_t v_T1 = vld1q_s16(T[i][1]); - int16x8_t v_T2 = vld1q_s16(T[i][2]); - int16x8_t v_T3 = vld1q_s16(T[i][3]); - int16x8_t v_T4 = vld1q_s16(T[i][4]); - int16x8_t v_T5 = vld1q_s16(T[i][5]); - - int16x8_t v_t0 = vmlaq_n_s16(v_T4, v_T2, -4); - int16x8_t v_t1 = vmlaq_n_s16(v_T3, v_T1, -4); - int16x8_t v_t2 = vsubq_s16(v_T4, v_T2); - int16x8_t v_t3 = vsubq_s16(v_T3, v_T1); - int16x8_t v_t4 = vmlaq_n_s16(v_T4, v_T0, 4); - int16x8_t v_t5 = vmlaq_n_s16(v_T5, v_T1, 4); - - v_t3 = vmulq_n_s16(v_t3, 2); - - int16x8_t v_Iw0 = vmlaq_n_s16(v_t4, v_T2, -5); - int16x8_t v_Iw1 = vaddq_s16(v_t1, v_t0); - int16x8_t v_Iw2 = vsubq_s16(v_t0, v_t1); - int16x8_t v_Iw3 = vaddq_s16(v_t3, v_t2); - int16x8_t v_Iw4 = vsubq_s16(v_t2, v_t3); - int16x8_t v_Iw5 = vmlaq_n_s16(v_t5, v_T3, -5); - - vst1q_s16(Iw[i*6+0], v_Iw0); - vst1q_s16(Iw[i*6+1], v_Iw1); - vst1q_s16(Iw[i*6+2], v_Iw2); - vst1q_s16(Iw[i*6+3], v_Iw3); - vst1q_s16(Iw[i*6+4], v_Iw4); - vst1q_s16(Iw[i*6+5], v_Iw5); - } -} - -inline void trans_O(F16* const Ow[36], F16 *O[16], const F16* bias, - U32 h, U32 w, U32 _pad_h_mod_4, U32 _pad_w_mod_4, U32 oh, U32 ow, F16* max, F16* min, ActivationDesc activationDesc) -{ - F16 T[4][6][8]; - // bias - float16x8_t v_b = vld1q_f16(bias); - - float16x8_t v_0 = vmovq_n_f16(0); - float16x8_t v_2 = vmovq_n_f16(2); - float16x8_t v_4 = vmovq_n_f16(4); - float16x8_t v_8 = vmovq_n_f16(8); - - for (U32 i = 0; i < 6; i++) { - float16x8_t v_Ow0 = vld1q_f16(Ow[i]); - float16x8_t v_Ow1 = vld1q_f16(Ow[1*6+i]); - float16x8_t v_Ow2 = vld1q_f16(Ow[2*6+i]); - float16x8_t v_Ow3 = vld1q_f16(Ow[3*6+i]); - float16x8_t v_Ow4 = vld1q_f16(Ow[4*6+i]); - float16x8_t v_Ow5 = vld1q_f16(Ow[5*6+i]); - - float16x8_t v_t0 = vaddq_f16(v_Ow1, v_Ow2); - float16x8_t v_t1 = vaddq_f16(v_Ow3, v_Ow4); - float16x8_t v_t2 = vsubq_f16(v_Ow1, v_Ow2); - float16x8_t v_t3 = 
vsubq_f16(v_Ow3, v_Ow4); - - float16x8_t v_T0 = vaddq_f16(v_t0, v_t1); - float16x8_t v_T1 = vfmaq_f16(v_t2, v_t3, v_2); - float16x8_t v_T2 = vfmaq_f16(v_t0, v_t1, v_4); - float16x8_t v_T3 = vfmaq_f16(v_t2, v_t3, v_8); - v_T0 = vaddq_f16(v_T0, v_Ow0); - v_T3 = vaddq_f16(v_T3, v_Ow5); - - vst1q_f16(T[0][i], v_T0); - vst1q_f16(T[1][i], v_T1); - vst1q_f16(T[2][i], v_T2); - vst1q_f16(T[3][i], v_T3); - } - - float16x8_t max_v = vld1q_f16(max); - float16x8_t min_v = vld1q_f16(min); - - U32 pad_h_mod_4 = 0, pad_w_mod_4 = 0; - if (h == oh && w == ow) { - pad_h_mod_4 = _pad_h_mod_4; - pad_w_mod_4 = _pad_w_mod_4; - } else if (h == oh) { - pad_h_mod_4 = _pad_h_mod_4; - } else if (w == ow) { - pad_w_mod_4 = _pad_w_mod_4; - } - - for (U32 i = 0; i < 4 - pad_h_mod_4; i++) { - float16x8_t v_T0 = vld1q_f16(T[i][0]); - float16x8_t v_T1 = vld1q_f16(T[i][1]); - float16x8_t v_T2 = vld1q_f16(T[i][2]); - float16x8_t v_T3 = vld1q_f16(T[i][3]); - float16x8_t v_T4 = vld1q_f16(T[i][4]); - float16x8_t v_T5 = vld1q_f16(T[i][5]); - - float16x8_t v_t0 = vaddq_f16(v_T1, v_T2); - float16x8_t v_t1 = vaddq_f16(v_T3, v_T4); - float16x8_t v_t2 = vsubq_f16(v_T1, v_T2); - float16x8_t v_t3 = vsubq_f16(v_T3, v_T4); - - float16x8_t v_O0 = vaddq_f16(v_t0, v_t1); - float16x8_t v_O1 = vfmaq_f16(v_t2, v_t3, v_2); - float16x8_t v_O2 = vfmaq_f16(v_t0, v_t1, v_4); - float16x8_t v_O3 = vfmaq_f16(v_t2, v_t3, v_8); - v_O0 = vaddq_f16(v_O0, v_T0); - v_O3 = vaddq_f16(v_O3, v_T5); - - float16x8_t temp; - - if (activationDesc.mode == ACTIVATION_RELU) { - if (pad_w_mod_4 == 0) { - temp = vmaxq_f16(vaddq_f16(v_O0, v_b), v_0); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+0], temp); - - temp = vmaxq_f16(vaddq_f16(v_O1, v_b), v_0); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+1], temp); - - temp = vmaxq_f16(vaddq_f16(v_O2, v_b), v_0); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+2], temp); - - temp = vmaxq_f16(vaddq_f16(v_O3, v_b), v_0); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+3], temp); - } else if (pad_w_mod_4 == 1) { - temp = vmaxq_f16(vaddq_f16(v_O0, v_b), v_0); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+0], temp); - - temp = vmaxq_f16(vaddq_f16(v_O1, v_b), v_0); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+1], temp); - - temp = vmaxq_f16(vaddq_f16(v_O2, v_b), v_0); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+2], temp); - } else if (pad_w_mod_4 == 2) { - temp = vmaxq_f16(vaddq_f16(v_O0, v_b), v_0); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+0], temp); - - temp = vmaxq_f16(vaddq_f16(v_O1, v_b), v_0); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+1], temp); - } else if (pad_w_mod_4 == 3) { - temp = vmaxq_f16(vaddq_f16(v_O0, v_b), v_0); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+0], temp); - } - } else { - if (pad_w_mod_4 == 0) { - temp = vaddq_f16(v_O0, v_b); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+0], temp); - - temp = vaddq_f16(v_O1, v_b); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+1], temp); - - temp = vaddq_f16(v_O2, v_b); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+2], temp); - - temp = 
vaddq_f16(v_O3, v_b); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+3], temp); - } else if (pad_w_mod_4 == 1) { - temp = vaddq_f16(v_O0, v_b); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+0], temp); - - temp = vaddq_f16(v_O1, v_b); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+1], temp); - - temp = vaddq_f16(v_O2, v_b); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+2], temp); - } else if (pad_w_mod_4 == 2) { - temp = vaddq_f16(v_O0, v_b); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+0], temp); - - temp = vaddq_f16(v_O1, v_b); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+1], temp); - } else if (pad_w_mod_4 == 3) { - temp = vaddq_f16(v_O0, v_b); - max_v = vmaxq_f16(max_v, temp); - min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+0], temp); - } - } - } - - vst1q_f16(max, max_v); - vst1q_f16(min, min_v); -} -#endif -#endif diff --git a/tensor_computing/src/cpu/arm/int8/depthwise_convolution.cpp b/tensor_computing/src/cpu/arm/int8/depthwise_convolution.cpp deleted file mode 100644 index d8b5ae7a..00000000 --- a/tensor_computing/src/cpu/arm/int8/depthwise_convolution.cpp +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
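[Editor's note] trans_O above vectorizes the F(4x4, 3x3) Winograd output transform A^T * m over 8 fp16 lanes, applying it first to the rows and then to the columns of the 6x6 tile before adding bias and tracking min/max for output quantization. A scalar sketch of one row step, under the coefficient set (1, 2, 4, 8) visible in the NEON code:

```cpp
// One row of the F(4x4, 3x3) output transform; per lane equivalent of the
// v_T0..v_T3 computation in trans_O above.
static void wino_output_row(const float m[6], float o[4])
{
    float t0 = m[1] + m[2];
    float t1 = m[3] + m[4];
    float t2 = m[1] - m[2];
    float t3 = m[3] - m[4];
    o[0] = m[0] + t0 + t1;          // matches v_T0 = t0 + t1 + Ow0
    o[1] = t2 + 2.0f * t3;          // matches v_T1 = vfmaq(t2, t3, 2)
    o[2] = t0 + 4.0f * t1;          // matches v_T2 = vfmaq(t0, t1, 4)
    o[3] = t2 + 8.0f * t3 + m[5];   // matches v_T3 = vfmaq(t2, t3, 8) + Ow5
}
```

The pad_h_mod_4 / pad_w_mod_4 branches in trans_O simply skip storing the output rows and columns that fall outside the real (unpadded) output tile.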
- - -#ifdef _USE_INT8 -#include "tensor_computing_type.h" -#include "cpu/arm/int8/depthwise_convolution.h" -#include "cpu/arm/int8/tensor_computing_int8.h" - -EE depthwise_convolution_int8(TensorDesc inputDesc, INT8* input, - TensorDesc filterDesc, const INT8* filter, - ConvolutionDesc convDesc, DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc biasDesc, const I32* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, I32* output, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc, - Arch arch) -{ - if(nullptr == input || nullptr == filter || nullptr == output || nullptr == bias || nullptr == tmp) - CHECK_STATUS(NULL_POINTER); - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - - if (!(idt == DT_I8 && fdt == DT_I8 && odt == DT_I32)) - CHECK_STATUS(NOT_MATCH); - if (fh != fw) - CHECK_STATUS(NOT_MATCH); - if (!(idf == DF_NCHWC8 && odf == DF_NCHWC8)) - CHECK_STATUS(NOT_MATCH); - if (!(ic == fc && oc == fn)) - CHECK_STATUS(NOT_MATCH); - - EE ret = SUCCESS; - switch (algorithm) { - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: - ret = depthwise_pointwise_convolution_direct(inputDesc, input, - filterDesc, filter, - convDesc, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - depthwiseActivationDesc, - pointwiseActivationDesc, - arch); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} -#endif diff --git a/tensor_computing/src/cpu/arm/int8/depthwise_convolution.h b/tensor_computing/src/cpu/arm/int8/depthwise_convolution.h deleted file mode 100644 index 96c7b464..00000000 --- a/tensor_computing/src/cpu/arm/int8/depthwise_convolution.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
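[Editor's note] The deleted depthwise_convolution_int8 dispatcher above insists on DF_NCHWC8 for both input and output. A sketch of the addressing that layout implies, matching the recurring c*H*W*8 + h*W*8 + w*8 + c8 indexing in these kernels; the helper name is illustrative:

```cpp
#include <cstddef>

// DF_NCHWC8 blocks channels in groups of 8: the logical NCHW element
// (n, c, h, w) lives at the offset below. The kernels above iterate with
// ic = C / 8 (hence the `ic /= 8` they perform up front).
static inline size_t nchwc8_offset(size_t n, size_t c, size_t h, size_t w,
                                   size_t C, size_t H, size_t W)
{
    size_t c_outer = c / 8;  // which channel block
    size_t c_inner = c % 8;  // position inside the block
    return (((n * (C / 8) + c_outer) * H + h) * W + w) * 8 + c_inner;
}
```

Blocking by 8 keeps the 8 channel values of one spatial position contiguous, which is what lets the NEON code load a whole channel group with a single d/q-register load.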
- - -#ifndef _H_DEPTHWISE_CONVOLUTION -#define _H_DEPTHWISE_CONVOLUTION -#ifdef _USE_INT8 -#include - -#include "sys.h" -#include "type.h" -#include "error.h" -#include "tensor_desc.h" -#include "tensor_computing_type.h" - -EE depthwise_pointwise_convolution_direct(TensorDesc inputDesc, INT8* inArray, - TensorDesc filterDesc, const INT8* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const I32* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, I32* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc, - Arch arch); -#endif -#endif diff --git a/tensor_computing/src/cpu/arm/int8/depthwise_convolution_transform.cpp b/tensor_computing/src/cpu/arm/int8/depthwise_convolution_transform.cpp deleted file mode 100644 index 1623d31d..00000000 --- a/tensor_computing/src/cpu/arm/int8/depthwise_convolution_transform.cpp +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
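[Editor's note] The depthwise_pointwise_convolution_direct declaration above fuses a per-channel KxK convolution with a 1x1 (pointwise) convolution. A reference sketch of that pipeline under simplifying assumptions (plain per-channel layout, pre-padded input, stride 1, a single caller-supplied requantization scale); all names are illustrative, and unlike this sketch the deleted kernel computes the depthwise stage once per spatial position and reuses it across output channels:

```cpp
#include <cstdint>

static void dw_pw_direct_ref(const int8_t *in,    // C x (H+K-1) x (W+K-1), padded
                             const int8_t *dw_f,  // C x K x K depthwise filters
                             const int32_t *dw_b, // C depthwise biases
                             float dw_scale,      // requant scale after stage 1
                             const int8_t *pw_f,  // OC x C pointwise filter
                             const int32_t *pw_b, // OC pointwise biases
                             int32_t *out,        // OC x H x W int32 output
                             int C, int OC, int H, int W, int K)
{
    int iw = W + K - 1;  // padded input width
    for (int o = 0; o < OC; o++) {
        for (int h = 0; h < H; h++) {
            for (int w = 0; w < W; w++) {
                int32_t acc = pw_b[o];
                for (int c = 0; c < C; c++) {
                    // Stage 1: depthwise KxK for channel c at (h, w).
                    int32_t dw = dw_b[c];
                    for (int kh = 0; kh < K; kh++) {
                        for (int kw = 0; kw < K; kw++) {
                            dw += (int32_t)in[(c * (H + K - 1) + h + kh) * iw + w + kw] *
                                  (int32_t)dw_f[(c * K + kh) * K + kw];
                        }
                    }
                    // Requantize to int8 before the pointwise stage.
                    int32_t q = (int32_t)(dw * dw_scale);
                    q = q > 127 ? 127 : (q < -128 ? -128 : q);
                    // Stage 2: accumulate the 1x1 convolution in int32.
                    acc += q * (int32_t)pw_f[o * C + c];
                }
                out[(o * H + h) * W + w] = acc;
            }
        }
    }
}
```

The DF_CHWC8_NCN8C4 filter format required by the real kernel packs the depthwise filters channel-blocked (CHWC8) and the pointwise filters in N8C4 blocks, which is exactly the repacking performed by the transform code that follows.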
- - -#ifdef _USE_INT8 -#include - -#include "tensor_computing_type.h" -#include "cpu/arm/int8/tensor_computing_int8.h" - - -inline EE depthwise_convolution_transform_filter_kernel_int8(TensorDesc filterDesc, const INT8* filterArray, - TensorDesc *ftmDesc, INT8* ftmArray, - DataFormat ftmDataFormat) -{ - if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) - CHECK_STATUS(NULL_POINTER); - DataType fdt; - DataFormat fdf; - U32 fn, fc, fh, fw; - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - - if (fdf == ftmDataFormat) { - *ftmDesc = filterDesc; - if (fdf == DF_NCHW || fdf == DF_NCHWC8) { - memcpy(ftmArray, filterArray, fn*fc*fh*fw*bytesOf(fdt)); - return SUCCESS; - } - if (fdf == DF_CHW_NC || fdf == DF_CHWC8_NCN8C4) { - memcpy(ftmArray, filterArray, (fc*fh*fw + fc*fn)*bytesOf(fdt)); - return SUCCESS; - } - return NOT_SUPPORTED; - } - - switch (fdf) { - case DF_NCHW: { - if (ftmDataFormat == DF_NCHWC8) { - U32 ic = fc / 8; - for (U32 c = 0; c < ic; c++) { - for (U32 hw = 0; hw < fh*fw; hw++) { - for (U32 c8 = 0; c8 < 8; c8++) { - ftmArray[c*fh*fw*8 + hw*8 + c8] = filterArray[(c*8+c8)*fh*fw + hw]; - } - } - } - *ftmDesc = tensor4df(fdt, DF_NCHWC8, fn, fc, fh, fw); - } else { - return NOT_SUPPORTED; - } - break; - } - case DF_CHW_NC: { - if (ftmDataFormat == DF_CHWC8_NCN8C4) { - /* - * CHW_NC => DF_CHWC8_NCN8C4 - */ - const INT8 *pwFilterArray = filterArray + fc*fh*fw; - INT8 *pwFtmArray = ftmArray + fc*fh*fw; - - U32 ic = fc / 8; - for (U32 c = 0; c < ic; c++) { - for (U32 hw = 0; hw < fh*fw; hw++) { - for (U32 c8 = 0; c8 < 8; c8++) { - ftmArray[c*fh*fw*8 + hw*8 + c8] = filterArray[(c*8+c8)*fh*fw + hw]; - } - } - } - - U32 oc = fn / 8; - ic *= 2; // fc / 4 - for (U32 o = 0; o < oc; o++) { - for (U32 c = 0; c < ic; c++) { - for (U32 o8 = 0; o8 < 8; o8++) { - for (U32 c4 = 0; c4 < 4; c4++) { - pwFtmArray[o*fc*8 + c*32 + o8*4 + c4] = pwFilterArray[(o*8+o8)*fc + c*4 + c4]; - } - } - } - } - *ftmDesc = tensor4df(fdt, DF_CHWC8_NCN8C4, fn, fc, fh, fw); - } else { - return NOT_SUPPORTED; - } - break; - } - default: - return NOT_SUPPORTED; - } - return SUCCESS; -} - -EE depthwise_convolution_transform_filter_int8(TensorDesc filterDesc, const INT8* filter, - DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, INT8* filterTransformed) -{ - DataFormat ftmDataFormat; - switch (algorithm) { - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: - ftmDataFormat = DF_CHWC8_NCN8C4; - break; - default: - return NOT_MATCH; - } - EE ret = depthwise_convolution_transform_filter_kernel_int8(filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat); - CHECK_STATUS(ret); - return ret; -} -#endif diff --git a/tensor_computing/src/cpu/arm/int8/depthwise_pointwise_convolution_direct.cpp b/tensor_computing/src/cpu/arm/int8/depthwise_pointwise_convolution_direct.cpp deleted file mode 100644 index f0cdffaa..00000000 --- a/tensor_computing/src/cpu/arm/int8/depthwise_pointwise_convolution_direct.cpp +++ /dev/null @@ -1,1910 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifdef _USE_INT8 -#include "cpu/arm/int8/depthwise_convolution.h" - -EE depthwise_pointwise_convolution_direct(TensorDesc inputDesc, INT8* inArray, - TensorDesc filterDesc, const INT8* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const I32* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, I32* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc, - Arch arch) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - UNUSED(arch); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (fdf != DF_CHWC8_NCN8C4) - CHECK_STATUS(NOT_MATCH); - - oc /= 8; - ic /= 8; - - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - U32 ihiw = ih*iw; - I32 ohow = oh*ow; - INT8 *pwArray = (INT8*)tmp + ic*ih_pad*iw_pad*8; - I32 *dw_out = (I32 *)(pwArray + ic*ohow*8); - - for (U32 n = 0; n < in; n++) { - // copy input into a input with padding - INT8 *inArray_pad = (INT8*)tmp; - INT8 *inArray_pad_mov = inArray_pad; - INT8 *inArray_mov = inArray + n*ic*ihiw*8; - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(idt)); - inArray_pad_mov += iw_pad*8; - } - for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*8*bytesOf(idt)); - inArray_pad_mov += paddingL*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*bytesOf(idt)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, paddingR*8*bytesOf(idt)); - inArray_pad_mov += paddingR*8; - } - for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(idt)); - inArray_pad_mov += iw_pad*8; - } - } - - // dw_conv - for (U32 c = 0; c < ic ; c++) { - const I32 *b = biasArray + c*8; - INT8 *in_pad = inArray_pad + c*ih_pad*iw_pad*8; - const INT8 *f = filterArray + c*fh*fw*8; - - // ohow / 12 - 
for (I32 hw = 0; hw < ohow-11; hw+=12) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - U32 in_h_1 = (hw+1)/ow*strideH; - U32 in_w_1 = (hw+1)%ow*strideW; - U32 in_h_2 = (hw+2)/ow*strideH; - U32 in_w_2 = (hw+2)%ow*strideW; - U32 in_h_3 = (hw+3)/ow*strideH; - U32 in_w_3 = (hw+3)%ow*strideW; - U32 in_h_4 = (hw+4)/ow*strideH; - U32 in_w_4 = (hw+4)%ow*strideW; - U32 in_h_5 = (hw+5)/ow*strideH; - U32 in_w_5 = (hw+5)%ow*strideW; - U32 in_h_6 = (hw+6)/ow*strideH; - U32 in_w_6 = (hw+6)%ow*strideW; - U32 in_h_7 = (hw+7)/ow*strideH; - U32 in_w_7 = (hw+7)%ow*strideW; - U32 in_h_8 = (hw+8)/ow*strideH; - U32 in_w_8 = (hw+8)%ow*strideW; - U32 in_h_9 = (hw+9)/ow*strideH; - U32 in_w_9 = (hw+9)%ow*strideW; - U32 in_h_10 = (hw+10)/ow*strideH; - U32 in_w_10 = (hw+10)%ow*strideW; - U32 in_h_11 = (hw+11)/ow*strideH; - U32 in_w_11 = (hw+11)%ow*strideW; - - I32 *pw_pack_0 = dw_out + hw*ic*8 + c*12*8; - I32 *pw_pack_1 = pw_pack_0 + 48; // Second half - //TODO handle asm combined with c. No guarantee that compiler will not use vec reg in c. - __asm__ __volatile__( - "ldr d29, [%[b]]\n" //b_0 - "ldr x1, [%[b], #8]\n" - "ins v29.d[1], x1\n" - "ldr d30, [%[b], #16]\n" //b_1 - "ldr x2, [%[b], #24]\n" - "ins v30.d[1], x2\n" - "mov v5.16b, v29.16b\n" - "mov v7.16b, v29.16b\n" - "mov v9.16b, v29.16b\n" - "mov v11.16b, v29.16b\n" - "mov v13.16b, v29.16b\n" - "mov v15.16b, v29.16b\n" - "mov v17.16b, v29.16b\n" - "mov v19.16b, v29.16b\n" - "mov v21.16b, v29.16b\n" - "mov v23.16b, v29.16b\n" - "mov v25.16b, v29.16b\n" - "mov v27.16b, v29.16b\n" - - "mov v6.16b, v30.16b\n" - "mov v8.16b, v30.16b\n" - "mov v10.16b, v30.16b\n" - "mov v12.16b, v30.16b\n" - "mov v14.16b, v30.16b\n" - "mov v16.16b, v30.16b\n" - "mov v18.16b, v30.16b\n" - "mov v20.16b, v30.16b\n" - "mov v22.16b, v30.16b\n" - "mov v24.16b, v30.16b\n" - "mov v26.16b, v30.16b\n" - "mov v28.16b, v30.16b\n" - : - :[b]"r"(b) - :"memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x1", "x2" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const INT8 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - INT8 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - INT8 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - INT8 *in_1 = in_idx + in_h_1*iw_pad*8 + in_w_1*8; - INT8 *in_2 = in_idx + in_h_2*iw_pad*8 + in_w_2*8; - INT8 *in_3 = in_idx + in_h_3*iw_pad*8 + in_w_3*8; - INT8 *in_4 = in_idx + in_h_4*iw_pad*8 + in_w_4*8; - INT8 *in_5 = in_idx + in_h_5*iw_pad*8 + in_w_5*8; - INT8 *in_6 = in_idx + in_h_6*iw_pad*8 + in_w_6*8; - INT8 *in_7 = in_idx + in_h_7*iw_pad*8 + in_w_7*8; - INT8 *in_8 = in_idx + in_h_8*iw_pad*8 + in_w_8*8; - INT8 *in_9 = in_idx + in_h_9*iw_pad*8 + in_w_9*8; - INT8 *in_10 = in_idx + in_h_10*iw_pad*8 + in_w_10*8; - INT8 *in_11 = in_idx + in_h_11*iw_pad*8 + in_w_11*8; - __asm__ __volatile__( - "ldr d29, [%[f0]]\n" - "ldr d0, [%[in0]]\n" - "ldr d1, [%[in1]]\n" - "ldr d2, [%[in2]]\n" - "sshll v29.8h, v29.8b, #0\n" - "ldr d30, [%[in3]]\n" - "sshll v0.8h, v0.8b, #0\n" - "sshll v1.8h, v1.8b, #0\n" - - "smlal v5.4s, v29.4h, v0.4h\n" - "sshll v2.8h, v2.8b, #0\n" - "smlal2 v6.4s, v29.8h, v0.8h\n" - "sshll v30.8h, v30.8b, #0\n" - "smlal v7.4s, v29.4h, v1.4h\n" - "ldr d0, [%[in4]]\n" - "smlal2 v8.4s, v29.8h, v1.8h\n" - "smlal v9.4s, v29.4h, v2.4h\n" - "ldr d1, [%[in5]]\n" - "smlal2 v10.4s, v29.8h, v2.8h\n" - "sshll v0.8h, v0.8b, #0\n" - "smlal v11.4s, v29.4h, v30.4h\n" - 
"ldr d2, [%[in6]]\n" - "smlal2 v12.4s, v29.8h, v30.8h\n" - "sshll v1.8h, v1.8b, #0\n" - - "smlal v13.4s, v29.4h, v0.4h\n" - "ldr d30, [%[in7]]\n" - "smlal2 v14.4s, v29.8h, v0.8h\n" - "sshll v2.8h, v2.8b, #0\n" - "smlal v15.4s, v29.4h, v1.4h\n" - "ldr d0, [%[in8]]\n" - "smlal2 v16.4s, v29.8h, v1.8h\n" - "sshll v30.8h, v30.8b, #0\n" - "smlal v17.4s, v29.4h, v2.4h\n" - "ldr d1, [%[in9]]\n" - "smlal2 v18.4s, v29.8h, v2.8h\n" - "sshll v0.8h, v0.8b, #0\n" - "smlal v19.4s, v29.4h, v30.4h\n" - "ldr d2, [%[in10]]\n" - "smlal2 v20.4s, v29.8h, v30.8h\n" - "sshll v1.8h, v1.8b, #0\n" - - "smlal v21.4s, v29.4h, v0.4h\n" - "ldr d30, [%[in11]]\n" - "smlal2 v22.4s, v29.8h, v0.8h\n" - "sshll v2.8h, v2.8b, #0\n" - "smlal v23.4s, v29.4h, v1.4h\n" - "sshll v30.8h, v30.8b, #0\n" - "smlal2 v24.4s, v29.8h, v1.8h\n" - "smlal v25.4s, v29.4h, v2.4h\n" - "smlal2 v26.4s, v29.8h, v2.8h\n" - "smlal v27.4s, v29.4h, v30.4h\n" - "smlal2 v28.4s, v29.8h, v30.8h\n" - : - :[in0]"r"(in_0), - [in1]"r"(in_1), - [in2]"r"(in_2), - [in3]"r"(in_3), - [in4]"r"(in_4), - [in5]"r"(in_5), - [in6]"r"(in_6), - [in7]"r"(in_7), - [in8]"r"(in_8), - [in9]"r"(in_9), - [in10]"r"(in_10), - [in11]"r"(in_11), - [f0]"r"(f_0) - :"memory", "cc", "v0", "v1", "v2", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: { - break; - } - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v0.16b, v0.16b, v0.16b\n" // zero - - "smax v5.4s, v0.4s, v5.4s\n" - "smax v6.4s, v0.4s, v6.4s\n" - "smax v7.4s, v0.4s, v7.4s\n" - "smax v8.4s, v0.4s, v8.4s\n" - "smax v9.4s, v0.4s, v9.4s\n" - "smax v10.4s, v0.4s, v10.4s\n" - "smax v11.4s, v0.4s, v11.4s\n" - "smax v12.4s, v0.4s, v12.4s\n" - "smax v13.4s, v0.4s, v13.4s\n" - "smax v14.4s, v0.4s, v14.4s\n" - "smax v15.4s, v0.4s, v15.4s\n" - "smax v16.4s, v0.4s, v16.4s\n" - "smax v17.4s, v0.4s, v17.4s\n" - "smax v18.4s, v0.4s, v18.4s\n" - "smax v19.4s, v0.4s, v19.4s\n" - "smax v20.4s, v0.4s, v20.4s\n" - "smax v21.4s, v0.4s, v21.4s\n" - "smax v22.4s, v0.4s, v22.4s\n" - "smax v23.4s, v0.4s, v23.4s\n" - "smax v24.4s, v0.4s, v24.4s\n" - "smax v25.4s, v0.4s, v25.4s\n" - "smax v26.4s, v0.4s, v26.4s\n" - "smax v27.4s, v0.4s, v27.4s\n" - "smax v28.4s, v0.4s, v28.4s\n" - : - : - :"memory", "cc", "v0", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28" - ); - break; - } - case ACTIVATION_RELU6:{ - INT8* pw_in0 = pwArray + hw*ic*8 + c*12*8; - INT8* pw_in1 = pw_in0 + 48; - __asm__ __volatile__( - "eor v0.16b, v0.16b, v0.16b\n" // zero - "movi v30.4s, #6\n" // six - - "smax v5.4s, v0.4s, v5.4s\n" - "smax v6.4s, v0.4s, v6.4s\n" - "smax v7.4s, v0.4s, v7.4s\n" - "smax v8.4s, v0.4s, v8.4s\n" - "smax v9.4s, v0.4s, v9.4s\n" - "smax v10.4s, v0.4s, v10.4s\n" - "smax v11.4s, v0.4s, v11.4s\n" - "smax v12.4s, v0.4s, v12.4s\n" - "smax v13.4s, v0.4s, v13.4s\n" - "smax v14.4s, v0.4s, v14.4s\n" - "smax v15.4s, v0.4s, v15.4s\n" - "smax v16.4s, v0.4s, v16.4s\n" - "smax v17.4s, v0.4s, v17.4s\n" - "smax v18.4s, v0.4s, v18.4s\n" - "smax v19.4s, v0.4s, v19.4s\n" - "smax v20.4s, v0.4s, v20.4s\n" - "smax v21.4s, v0.4s, v21.4s\n" - "smax v22.4s, v0.4s, v22.4s\n" - "smax v23.4s, v0.4s, v23.4s\n" - "smax v24.4s, v0.4s, v24.4s\n" - "smax v25.4s, v0.4s, v25.4s\n" - "smax v26.4s, v0.4s, v26.4s\n" - "smax v27.4s, v0.4s, v27.4s\n" - "smax 
v28.4s, v0.4s, v28.4s\n" - - "smin v5.4s, v30.4s, v5.4s\n" - "smin v6.4s, v30.4s, v6.4s\n" - "smin v7.4s, v30.4s, v7.4s\n" - "smin v8.4s, v30.4s, v8.4s\n" - "smin v9.4s, v30.4s, v9.4s\n" - "smin v10.4s, v30.4s, v10.4s\n" - "smin v11.4s, v30.4s, v11.4s\n" - "smin v12.4s, v30.4s, v12.4s\n" - "smin v13.4s, v30.4s, v13.4s\n" - "smin v14.4s, v30.4s, v14.4s\n" - "smin v15.4s, v30.4s, v15.4s\n" - "smin v16.4s, v30.4s, v16.4s\n" - "smin v17.4s, v30.4s, v17.4s\n" - "smin v18.4s, v30.4s, v18.4s\n" - "smin v19.4s, v30.4s, v19.4s\n" - "smin v20.4s, v30.4s, v20.4s\n" - "smin v21.4s, v30.4s, v21.4s\n" - "smin v22.4s, v30.4s, v22.4s\n" - "smin v23.4s, v30.4s, v23.4s\n" - "smin v24.4s, v30.4s, v24.4s\n" - "smin v25.4s, v30.4s, v25.4s\n" - "smin v26.4s, v30.4s, v26.4s\n" - "smin v27.4s, v30.4s, v27.4s\n" - "smin v28.4s, v30.4s, v28.4s\n" - - // No need to quantize for ReLU6 - "sqshl v5.4s, v5.4s, #2\n" - "sqshl v6.4s, v6.4s, #2\n" - "sqshl v7.4s, v7.4s, #2\n" - "sqshl v8.4s, v8.4s, #2\n" - "sqshl v9.4s, v9.4s, #2\n" - "sqshl v10.4s, v10.4s, #2\n" - "sqshl v11.4s, v11.4s, #2\n" - "sqshl v12.4s, v12.4s, #2\n" - "sqshl v13.4s, v13.4s, #2\n" - "sqshl v14.4s, v14.4s, #2\n" - "sqshl v15.4s, v15.4s, #2\n" - "sqshl v16.4s, v16.4s, #2\n" - "sqshl v17.4s, v17.4s, #2\n" - "sqshl v18.4s, v18.4s, #2\n" - "sqshl v19.4s, v19.4s, #2\n" - "sqshl v20.4s, v20.4s, #2\n" - "sqshl v21.4s, v21.4s, #2\n" - "sqshl v22.4s, v22.4s, #2\n" - "sqshl v23.4s, v23.4s, #2\n" - "sqshl v24.4s, v24.4s, #2\n" - "sqshl v25.4s, v25.4s, #2\n" - "sqshl v26.4s, v26.4s, #2\n" - "sqshl v27.4s, v27.4s, #2\n" - "sqshl v28.4s, v28.4s, #2\n" - - "sqshrn v5.4h, v5.4s, #1\n" - "sqshrn v9.4h, v9.4s, #1\n" - "sqshrn2 v5.8h, v7.4s, #1\n" - "sqshrn2 v9.8h, v11.4s, #1\n" - "sqshrn v13.4h, v13.4s, #1\n" - "sqshrn v17.4h, v17.4s, #1\n" - "sqshrn2 v13.8h, v15.4s, #1\n" - "sqshrn2 v17.8h, v19.4s, #1\n" - - "sqshrn v21.4h, v21.4s, #1\n" - "sqshrn v25.4h, v25.4s, #1\n" - "sqshrn2 v21.8h, v23.4s, #1\n" - "sqshrn2 v25.8h, v27.4s, #1\n" - - "sqshrn v5.8b, v5.8h, #1\n" - "sqshrn v13.8b, v13.8h, #1\n" - "sqshrn v21.8b, v21.8h, #1\n" - - "sqshrn2 v5.16b, v9.8h, #1\n" - "sqshrn2 v13.16b, v17.8h, #1\n" - "sqshrn2 v21.16b, v25.8h, #1\n" - "str q5, [%[in0]]\n" - "str q13, [%[in0], #16]\n" - "str q21, [%[in0], #32]\n" - - "sqshrn v6.4h, v6.4s, #1\n" - "sqshrn v10.4h, v10.4s, #1\n" - "sqshrn2 v6.8h, v8.4s, #1\n" - "sqshrn2 v10.8h, v12.4s, #1\n" - - "sqshrn v14.4h, v14.4s, #1\n" - "sqshrn v18.4h, v18.4s, #1\n" - "sqshrn2 v14.8h, v16.4s, #1\n" - "sqshrn2 v18.8h, v20.4s, #1\n" - - "sqshrn v22.4h, v22.4s, #1\n" - "sqshrn v26.4h, v26.4s, #1\n" - "sqshrn2 v22.8h, v24.4s, #1\n" - "sqshrn2 v26.8h, v28.4s, #1\n" - - "sqshrn v6.8b, v6.8h, #1\n" - "sqshrn v14.8b, v14.8h, #1\n" - "sqshrn v22.8b, v22.8h, #1\n" - - "sqshrn2 v6.16b, v10.8h, #1\n" - "sqshrn2 v14.16b, v18.8h, #1\n" - "sqshrn2 v22.16b, v26.8h, #1\n" - "str q6, [%[in1]]\n" - "str q14, [%[in1], #16]\n" - "str q22, [%[in1], #32]\n" - : - :[in0]"r"(pw_in0), - [in1]"r"(pw_in1) - :"memory", "cc", "v0", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v30" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - if (depthwiseActivationDesc.mode != ACTIVATION_RELU6) { - __asm__ __volatile__( - "str q5, [%[pw0]]\n" - "str q7, [%[pw0], #16]\n" - "str q9, [%[pw0], #32]\n" - "str q11, [%[pw0], #48]\n" - "str q13, [%[pw0], #64]\n" - "str q15, [%[pw0], #80]\n" - "str q17, [%[pw0], #96]\n" - "str q19, [%[pw0], #112]\n" - "str 
q21, [%[pw0], #128]\n" - "str q23, [%[pw0], #144]\n" - "str q25, [%[pw0], #160]\n" - "str q27, [%[pw0], #176]\n" - - "str q6, [%[pw1]]\n" - "str q8, [%[pw1], #16]\n" - "str q10, [%[pw1], #32]\n" - "str q12, [%[pw1], #48]\n" - "str q14, [%[pw1], #64]\n" - "str q16, [%[pw1], #80]\n" - "str q18, [%[pw1], #96]\n" - "str q20, [%[pw1], #112]\n" - "str q22, [%[pw1], #128]\n" - "str q24, [%[pw1], #144]\n" - "str q26, [%[pw1], #160]\n" - "str q28, [%[pw1], #176]\n" - : - :[pw0]"r"(pw_pack_0), - [pw1]"r"(pw_pack_1) - :"memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28" - ); - } - } - - // ohow_reminder % 12 / 8 - U32 ohow_s = (ohow / 12) * 12; - U32 ohow_tail = ohow - ohow_s; - - if (ohow_tail >= 8) { - U32 hw = ohow_s; - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - U32 in_h_1 = (hw+1)/ow*strideH; - U32 in_w_1 = (hw+1)%ow*strideW; - U32 in_h_2 = (hw+2)/ow*strideH; - U32 in_w_2 = (hw+2)%ow*strideW; - U32 in_h_3 = (hw+3)/ow*strideH; - U32 in_w_3 = (hw+3)%ow*strideW; - U32 in_h_4 = ((hw+4)/ow)*strideH; - U32 in_w_4 = ((hw+4)%ow)*strideW; - U32 in_h_5 = ((hw+5)/ow)*strideH; - U32 in_w_5 = ((hw+5)%ow)*strideW; - U32 in_h_6 = ((hw+6)/ow)*strideH; - U32 in_w_6 = ((hw+6)%ow)*strideW; - U32 in_h_7 = ((hw+7)/ow)*strideH; - U32 in_w_7 = ((hw+7)%ow)*strideW; - I32 *pw_pack_0 = dw_out + hw*ic*8 + c*8*8; - I32 *pw_pack_1 = pw_pack_0 + 32; - //TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. - __asm__ __volatile__( - "ldr d29, [%[b]]\n" //b_0 - "ldr x1, [%[b], #8]\n" - "ins v29.d[1], x1\n" - "ldr d30, [%[b], #16]\n" //b_1 - "ldr x2, [%[b], #24]\n" - "ins v30.d[1], x2\n" - "mov v5.16b, v29.16b\n" - "mov v7.16b, v29.16b\n" - "mov v9.16b, v29.16b\n" - "mov v11.16b, v29.16b\n" - "mov v13.16b, v29.16b\n" - "mov v15.16b, v29.16b\n" - "mov v17.16b, v29.16b\n" - "mov v19.16b, v29.16b\n" - - "mov v6.16b, v30.16b\n" - "mov v8.16b, v30.16b\n" - "mov v10.16b, v30.16b\n" - "mov v12.16b, v30.16b\n" - "mov v14.16b, v30.16b\n" - "mov v16.16b, v30.16b\n" - "mov v18.16b, v30.16b\n" - "mov v20.16b, v30.16b\n" - : - :[b]"r"(b) - :"memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v29", "v30", "x1", "x2" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const INT8 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - INT8 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - INT8 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - INT8 *in_1 = in_idx + in_h_1*iw_pad*8 + in_w_1*8; - INT8 *in_2 = in_idx + in_h_2*iw_pad*8 + in_w_2*8; - INT8 *in_3 = in_idx + in_h_3*iw_pad*8 + in_w_3*8; - INT8 *in_4 = in_idx + in_h_4*iw_pad*8 + in_w_4*8; - INT8 *in_5 = in_idx + in_h_5*iw_pad*8 + in_w_5*8; - INT8 *in_6 = in_idx + in_h_6*iw_pad*8 + in_w_6*8; - INT8 *in_7 = in_idx + in_h_7*iw_pad*8 + in_w_7*8; - __asm__ __volatile__( - "ldr d29, [%[f0]]\n" - "ldr d0, [%[in0]]\n" - "ldr d1, [%[in1]]\n" - "ldr d2, [%[in2]]\n" - "sshll v29.8h, v29.8b, #0\n" - "ldr d30, [%[in3]]\n" - "sshll v0.8h, v0.8b, #0\n" - "sshll v1.8h, v1.8b, #0\n" - - "smlal v5.4s, v29.4h, v0.4h\n" - "sshll v2.8h, v2.8b, #0\n" - "smlal2 v6.4s, v29.8h, v0.8h\n" - "sshll v30.8h, v30.8b, #0\n" - "smlal v7.4s, v29.4h, v1.4h\n" - "ldr d0, [%[in4]]\n" - "smlal2 v8.4s, v29.8h, v1.8h\n" - "smlal v9.4s, v29.4h, v2.4h\n" - "ldr d1, [%[in5]]\n" - "smlal2 v10.4s, v29.8h, v2.8h\n" - "sshll v0.8h, v0.8b, #0\n" - 
"smlal v11.4s, v29.4h, v30.4h\n" - "ldr d2, [%[in6]]\n" - "smlal2 v12.4s, v29.8h, v30.8h\n" - "sshll v1.8h, v1.8b, #0\n" - - "smlal v13.4s, v29.4h, v0.4h\n" - "ldr d30, [%[in7]]\n" - "smlal2 v14.4s, v29.8h, v0.8h\n" - "sshll v2.8h, v2.8b, #0\n" - "smlal v15.4s, v29.4h, v1.4h\n" - "smlal2 v16.4s, v29.8h, v1.8h\n" - "sshll v30.8h, v30.8b, #0\n" - "smlal v17.4s, v29.4h, v2.4h\n" - "smlal2 v18.4s, v29.8h, v2.8h\n" - "smlal v19.4s, v29.4h, v30.4h\n" - "smlal2 v20.4s, v29.8h, v30.8h\n" - : - :[in0]"r"(in_0), - [in1]"r"(in_1), - [in2]"r"(in_2), - [in3]"r"(in_3), - [in4]"r"(in_4), - [in5]"r"(in_5), - [in6]"r"(in_6), - [in7]"r"(in_7), - [f0]"r"(f_0) - :"memory", "cc", "v0", "v1", "v2", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v29", "v30" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: { - break; - } - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v0.16b, v0.16b, v0.16b\n" // zero - - "smax v5.4s, v0.4s, v5.4s\n" - "smax v6.4s, v0.4s, v6.4s\n" - "smax v7.4s, v0.4s, v7.4s\n" - "smax v8.4s, v0.4s, v8.4s\n" - "smax v9.4s, v0.4s, v9.4s\n" - "smax v10.4s, v0.4s, v10.4s\n" - "smax v11.4s, v0.4s, v11.4s\n" - "smax v12.4s, v0.4s, v12.4s\n" - "smax v13.4s, v0.4s, v13.4s\n" - "smax v14.4s, v0.4s, v14.4s\n" - "smax v15.4s, v0.4s, v15.4s\n" - "smax v16.4s, v0.4s, v16.4s\n" - "smax v17.4s, v0.4s, v17.4s\n" - "smax v18.4s, v0.4s, v18.4s\n" - "smax v19.4s, v0.4s, v19.4s\n" - "smax v20.4s, v0.4s, v20.4s\n" - : - : - :"memory", "cc", "v0", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20" - ); - break; - } - case ACTIVATION_RELU6:{ - INT8* pw_in0 = pwArray + hw*ic*8 + c*8*8; - INT8* pw_in1 = pw_in0 + 32; - __asm__ __volatile__( - "eor v0.16b, v0.16b, v0.16b\n" // zero - "movi v30.4s, #6\n" // six - - "smax v5.4s, v0.4s, v5.4s\n" - "smax v6.4s, v0.4s, v6.4s\n" - "smax v7.4s, v0.4s, v7.4s\n" - "smax v8.4s, v0.4s, v8.4s\n" - "smax v9.4s, v0.4s, v9.4s\n" - "smax v10.4s, v0.4s, v10.4s\n" - "smax v11.4s, v0.4s, v11.4s\n" - "smax v12.4s, v0.4s, v12.4s\n" - "smax v13.4s, v0.4s, v13.4s\n" - "smax v14.4s, v0.4s, v14.4s\n" - "smax v15.4s, v0.4s, v15.4s\n" - "smax v16.4s, v0.4s, v16.4s\n" - "smax v17.4s, v0.4s, v17.4s\n" - "smax v18.4s, v0.4s, v18.4s\n" - "smax v19.4s, v0.4s, v19.4s\n" - "smax v20.4s, v0.4s, v20.4s\n" - - "smin v5.4s, v30.4s, v5.4s\n" - "smin v6.4s, v30.4s, v6.4s\n" - "smin v7.4s, v30.4s, v7.4s\n" - "smin v8.4s, v30.4s, v8.4s\n" - "smin v9.4s, v30.4s, v9.4s\n" - "smin v10.4s, v30.4s, v10.4s\n" - "smin v11.4s, v30.4s, v11.4s\n" - "smin v12.4s, v30.4s, v12.4s\n" - "smin v13.4s, v30.4s, v13.4s\n" - "smin v14.4s, v30.4s, v14.4s\n" - "smin v15.4s, v30.4s, v15.4s\n" - "smin v16.4s, v30.4s, v16.4s\n" - "smin v17.4s, v30.4s, v17.4s\n" - "smin v18.4s, v30.4s, v18.4s\n" - "smin v19.4s, v30.4s, v19.4s\n" - "smin v20.4s, v30.4s, v20.4s\n" - - // No need to quantize for ReLU6 - "sqshl v5.4s, v5.4s, #2\n" - "sqshl v6.4s, v6.4s, #2\n" - "sqshl v7.4s, v7.4s, #2\n" - "sqshl v8.4s, v8.4s, #2\n" - "sqshl v9.4s, v9.4s, #2\n" - "sqshl v10.4s, v10.4s, #2\n" - "sqshl v11.4s, v11.4s, #2\n" - "sqshl v12.4s, v12.4s, #2\n" - "sqshl v13.4s, v13.4s, #2\n" - "sqshl v14.4s, v14.4s, #2\n" - "sqshl v15.4s, v15.4s, #2\n" - "sqshl v16.4s, v16.4s, #2\n" - "sqshl v17.4s, v17.4s, #2\n" - "sqshl v18.4s, v18.4s, #2\n" - "sqshl v19.4s, v19.4s, #2\n" - "sqshl v20.4s, v20.4s, #2\n" - - "sqshrn v5.4h, v5.4s, #1\n" - "sqshrn v9.4h, v9.4s, #1\n" - "sqshrn2 v5.8h, v7.4s, #1\n" - 
"sqshrn2 v9.8h, v11.4s, #1\n" - - "sqshrn v13.4h, v13.4s, #1\n" - "sqshrn v17.4h, v17.4s, #1\n" - "sqshrn2 v13.8h, v15.4s, #1\n" - "sqshrn2 v17.8h, v19.4s, #1\n" - - "sqshrn v5.8b, v5.8h, #1\n" - "sqshrn v13.8b, v13.8h, #1\n" - - "sqshrn2 v5.16b, v9.8h, #1\n" - "sqshrn2 v13.16b, v17.8h, #1\n" - "str q5, [%[in0]]\n" - "str q13, [%[in0], #16]\n" - - "sqshrn v6.4h, v6.4s, #1\n" - "sqshrn v10.4h, v10.4s, #1\n" - "sqshrn2 v6.8h, v8.4s, #1\n" - "sqshrn2 v10.8h, v12.4s, #1\n" - - "sqshrn v14.4h, v14.4s, #1\n" - "sqshrn v18.4h, v18.4s, #1\n" - "sqshrn2 v14.8h, v16.4s, #1\n" - "sqshrn2 v18.8h, v20.4s, #1\n" - - "sqshrn v6.8b, v6.8h, #1\n" - "sqshrn v14.8b, v14.8h, #1\n" - - "sqshrn2 v6.16b, v10.8h, #1\n" - "sqshrn2 v14.16b, v18.8h, #1\n" - "str q6, [%[in1]]\n" - "str q14, [%[in1], #16]\n" - : - :[in0]"r"(pw_in0), - [in1]"r"(pw_in1) - :"memory", "cc", "v0", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v30" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - if (depthwiseActivationDesc.mode != ACTIVATION_RELU6) { - __asm__ __volatile__( - "str q5, [%[pw0]]\n" - "str q7, [%[pw0], #16]\n" - "str q9, [%[pw0], #32]\n" - "str q11, [%[pw0], #48]\n" - "str q13, [%[pw0], #64]\n" - "str q15, [%[pw0], #80]\n" - "str q17, [%[pw0], #96]\n" - "str q19, [%[pw0], #112]\n" - - "str q6, [%[pw1]]\n" - "str q8, [%[pw1], #16]\n" - "str q10, [%[pw1], #32]\n" - "str q12, [%[pw1], #48]\n" - "str q14, [%[pw1], #64]\n" - "str q16, [%[pw1], #80]\n" - "str q18, [%[pw1], #96]\n" - "str q20, [%[pw1], #112]\n" - : - :[pw0]"r"(pw_pack_0), - [pw1]"r"(pw_pack_1) - :"memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20" - ); - } - ohow_s += 8; - ohow_tail -= 8; - } - - if (ohow_tail >= 4) { - U32 hw = ohow_s; - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - U32 in_h_1 = (hw+1)/ow*strideH; - U32 in_w_1 = (hw+1)%ow*strideW; - U32 in_h_2 = (hw+2)/ow*strideH; - U32 in_w_2 = (hw+2)%ow*strideW; - U32 in_h_3 = (hw+3)/ow*strideH; - U32 in_w_3 = (hw+3)%ow*strideW; - I32 *pw_pack_0 = dw_out + hw*ic*8 + c*4*8; - I32 *pw_pack_1 = pw_pack_0 + 16; - //TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. 
- __asm__ __volatile__( - "ldr d29, [%[b]]\n" //b_0 - "ldr x1, [%[b], #8]\n" - "ins v29.d[1], x1\n" - "ldr d30, [%[b], #16]\n" //b_1 - "ldr x2, [%[b], #24]\n" - "ins v30.d[1], x2\n" - "mov v5.16b, v29.16b\n" - "mov v7.16b, v29.16b\n" - "mov v9.16b, v29.16b\n" - "mov v11.16b, v29.16b\n" - - "mov v6.16b, v30.16b\n" - "mov v8.16b, v30.16b\n" - "mov v10.16b, v30.16b\n" - "mov v12.16b, v30.16b\n" - : - :[b]"r"(b) - :"memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v29", "v30", "x1", "x2" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const INT8 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - INT8 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - INT8 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - INT8 *in_1 = in_idx + in_h_1*iw_pad*8 + in_w_1*8; - INT8 *in_2 = in_idx + in_h_2*iw_pad*8 + in_w_2*8; - INT8 *in_3 = in_idx + in_h_3*iw_pad*8 + in_w_3*8; - __asm__ __volatile__( - "ldr d29, [%[f0]]\n" - "ldr d0, [%[in0]]\n" - "ldr d1, [%[in1]]\n" - "ldr d2, [%[in2]]\n" - "sshll v29.8h, v29.8b, #0\n" - "ldr d30, [%[in3]]\n" - "sshll v0.8h, v0.8b, #0\n" - "sshll v1.8h, v1.8b, #0\n" - - "smlal v5.4s, v29.4h, v0.4h\n" - "sshll v2.8h, v2.8b, #0\n" - "smlal2 v6.4s, v29.8h, v0.8h\n" - "sshll v30.8h, v30.8b, #0\n" - "smlal v7.4s, v29.4h, v1.4h\n" - "smlal2 v8.4s, v29.8h, v1.8h\n" - "smlal v9.4s, v29.4h, v2.4h\n" - "smlal2 v10.4s, v29.8h, v2.8h\n" - "smlal v11.4s, v29.4h, v30.4h\n" - "smlal2 v12.4s, v29.8h, v30.8h\n" - : - :[in0]"r"(in_0), - [in1]"r"(in_1), - [in2]"r"(in_2), - [in3]"r"(in_3), - [f0]"r"(f_0) - :"memory", "cc", "v0", "v1", "v2", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v29", "v30" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: { - break; - } - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v0.16b, v0.16b, v0.16b\n" // zero - - "smax v5.4s, v0.4s, v5.4s\n" - "smax v6.4s, v0.4s, v6.4s\n" - "smax v7.4s, v0.4s, v7.4s\n" - "smax v8.4s, v0.4s, v8.4s\n" - "smax v9.4s, v0.4s, v9.4s\n" - "smax v10.4s, v0.4s, v10.4s\n" - "smax v11.4s, v0.4s, v11.4s\n" - "smax v12.4s, v0.4s, v12.4s\n" - : - : - :"memory", "cc", "v0", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12" - ); - break; - } - case ACTIVATION_RELU6:{ - INT8* pw_in0 = pwArray + hw*ic*8 + c*4*8; - INT8* pw_in1 = pw_in0 + 16; - __asm__ __volatile__( - "eor v0.16b, v0.16b, v0.16b\n" // zero - "movi v30.4s, #6\n" // six - - "smax v5.4s, v0.4s, v5.4s\n" - "smax v6.4s, v0.4s, v6.4s\n" - "smax v7.4s, v0.4s, v7.4s\n" - "smax v8.4s, v0.4s, v8.4s\n" - "smax v9.4s, v0.4s, v9.4s\n" - "smax v10.4s, v0.4s, v10.4s\n" - "smax v11.4s, v0.4s, v11.4s\n" - "smax v12.4s, v0.4s, v12.4s\n" - - "smin v5.4s, v30.4s, v5.4s\n" - "smin v6.4s, v30.4s, v6.4s\n" - "smin v7.4s, v30.4s, v7.4s\n" - "smin v8.4s, v30.4s, v8.4s\n" - "smin v9.4s, v30.4s, v9.4s\n" - "smin v10.4s, v30.4s, v10.4s\n" - "smin v11.4s, v30.4s, v11.4s\n" - "smin v12.4s, v30.4s, v12.4s\n" - - // No need to quantize for ReLU6 - "sqshl v5.4s, v5.4s, #2\n" - "sqshl v6.4s, v6.4s, #2\n" - "sqshl v7.4s, v7.4s, #2\n" - "sqshl v8.4s, v8.4s, #2\n" - "sqshl v9.4s, v9.4s, #2\n" - "sqshl v10.4s, v10.4s, #2\n" - "sqshl v11.4s, v11.4s, #2\n" - "sqshl v12.4s, v12.4s, #2\n" - - "sqshrn v5.4h, v5.4s, #1\n" - "sqshrn v9.4h, v9.4s, #1\n" - "sqshrn2 v5.8h, v7.4s, #1\n" - "sqshrn2 v9.8h, v11.4s, #1\n" - - "sqshrn v5.8b, v5.8h, #1\n" - "sqshrn2 v5.16b, v9.8h, #1\n" - "str q5, [%[in0]]\n" - - "sqshrn v6.4h, v6.4s, #1\n" - "sqshrn v10.4h, v10.4s, #1\n" - "sqshrn2 v6.8h, v8.4s, #1\n" - 
"sqshrn2 v10.8h, v12.4s, #1\n" - - "sqshrn v6.8b, v6.8h, #1\n" - - "sqshrn2 v6.16b, v10.8h, #1\n" - "str q6, [%[in1]]\n" - : - :[in0]"r"(pw_in0), - [in1]"r"(pw_in1) - :"memory", "cc", "v0", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v30" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - if (depthwiseActivationDesc.mode != ACTIVATION_RELU6) { - __asm__ __volatile__( - "str q5, [%[pw0]]\n" - "str q7, [%[pw0], #16]\n" - "str q9, [%[pw0], #32]\n" - "str q11, [%[pw0], #48]\n" - - "str q6, [%[pw1]]\n" - "str q8, [%[pw1], #16]\n" - "str q10, [%[pw1], #32]\n" - "str q12, [%[pw1], #48]\n" - : - :[pw0]"r"(pw_pack_0), - [pw1]"r"(pw_pack_1) - :"memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12" - ); - } - ohow_s += 4; - ohow_tail -= 4; - } - - // ohow_reminder % 4 - for (I32 hw = ohow_s; hw < ohow; hw++) { - U32 in_h_0 = hw/ow*strideH; - U32 in_w_0 = hw%ow*strideW; - I32 *pw_pack_0 = dw_out + hw*ic*8 + c*8; - I32 *pw_pack_1 = pw_pack_0 + 4; - //TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. - __asm__ __volatile__( - "ldr d5, [%[b]]\n" //b_0 - "ldr x1, [%[b], #8]\n" - "ins v5.d[1], x1\n" - "ldr d6, [%[b], #16]\n" //b_1 - "ldr x2, [%[b], #24]\n" - "ins v6.d[1], x2\n" - : - :[b]"r"(b) - :"memory", "cc", "v5", "v6", "x1", "x2" - ); - - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - const INT8 *f_0 = f + fh_idx*fw*8 + fw_idx*8; - INT8 *in_idx = in_pad + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - INT8 *in_0 = in_idx + in_h_0*iw_pad*8 + in_w_0*8; - __asm__ __volatile__( - "ldr d29, [%[f0]]\n" - "ldr d0, [%[in0]]\n" - "sshll v29.8h, v29.8b, #0\n" - "sshll v0.8h, v0.8b, #0\n" - "smlal v5.4s, v29.4h, v0.4h\n" - "smlal2 v6.4s, v29.8h, v0.8h\n" - : - :[in0]"r"(in_0), - [f0]"r"(f_0) - :"memory", "cc", "v0", "v5", "v6", "v29" - ); - } - } - - // activation - switch (depthwiseActivationDesc.mode){ - case ACTIVATION_NULL: { - break; - } - case ACTIVATION_RELU:{ - __asm__ __volatile__( - "eor v0.16b, v0.16b, v0.16b\n" // zero - - "smax v5.4s, v0.4s, v5.4s\n" - "smax v6.4s, v0.4s, v6.4s\n" - : - : - :"memory", "cc", "v0", "v5", "v6" - ); - break; - } - case ACTIVATION_RELU6:{ - INT8* pw_in0 = pwArray + hw*ic*8 + c*8; - __asm__ __volatile__( - "eor v0.16b, v0.16b, v0.16b\n" // zero - "movi v30.4s, #6\n" // six - - "smax v5.4s, v0.4s, v5.4s\n" - "smax v6.4s, v0.4s, v6.4s\n" - - "smin v5.4s, v30.4s, v5.4s\n" - "smin v6.4s, v30.4s, v6.4s\n" - - // No need to quantize for ReLU6 - "sqshl v5.4s, v5.4s, #2\n" - "sqshl v6.4s, v6.4s, #2\n" - - "sqshrn v5.4h, v5.4s, #1\n" - "sqshrn2 v5.8h, v6.4s, #1\n" - - "sqshrn v5.8b, v5.8h, #1\n" - "str d5, [%[in0]]\n" - : - :[in0]"r"(pw_in0) - :"memory", "cc", "v0", "v5", "v6", "v30" - ); - break; - } - default: - return NOT_SUPPORTED; - } - - if (depthwiseActivationDesc.mode != ACTIVATION_RELU6) { - __asm__ __volatile__( - "str q5, [%[pw0]]\n" - "str q6, [%[pw1]]\n" - : - :[pw0]"r"(pw_pack_0), - [pw1]"r"(pw_pack_1) - :"memory", "cc", "v5", "v6" - ); - } - } - } - - I32 scale = 1; - if (depthwiseActivationDesc.mode != ACTIVATION_RELU6) { - // quantization - I32 factor = 16777216; // 24 bits - switch (depthwiseActivationDesc.mode) { - case ACTIVATION_NULL: { - I32 max_s = dw_out[0]; - I32 min_s = dw_out[0]; - for (U32 i=1; i max_s) { - max_s = cur; - } - if (cur < min_s) { - min_s = cur; - } - } - - if (max_s <= 127 && min_s >= -128) { // No need to scale - break; - } - - if (max_s == 0 && min_s == 0) { - break; - } - - if (max_s>0 && min_s<0) { - I32 factor_p = (factor 
* 127) / max_s; - I32 factor_n = (factor * -128) / min_s; - factor = (factor_p < factor_n) ? factor_p : factor_n; - } else if (max_s < 0) { - factor = (factor * -128) / min_s; - } else { // min_s > 0 - factor = (factor * 127) / max_s; - } - scale = 16777216 / factor; - break; - } - case ACTIVATION_RELU: { - I32 max_s = dw_out[0]; - for (U32 i=1; i < ohow*ic*8; i++) { - I32 cur = dw_out[i]; - if (cur > max_s) { - max_s = cur; - } - } - if (max_s <= 127) { // No need to scale - break; - } - - if (max_s == 0) { - break; - } - - factor = (factor * 127) / max_s; - scale = 16777216 / factor; - break; - } - default: - return NOT_SUPPORTED; - } - I32 factor_v[4]; - for (U32 i=0; i<4; i++) { - factor_v[i] = factor; - } - __asm__ __volatile__( - "ldr q0, [%[factor]]\n" - "mov x0, %[dw_out]\n" - "mov x1, %[pw_in]\n" - "mov x2, %[num]\n" - "0:\n" - "ldr q1, [x0], #16\n" - "ldr q2, [x0], #16\n" - "mul v1.4s, v0.4s, v1.4s\n" - "mul v2.4s, v0.4s, v2.4s\n" - - "shrn v1.4h, v1.4s, #16\n" - "shrn2 v1.8h, v2.4s, #16\n" - - "shrn v1.8b, v1.8h, #8\n" - "subs x2, x2, #8\n" - - "str d1, [x1], #8\n" - "bne 0b\n" - : - :[factor]"r"(factor_v), - [dw_out]"r"(dw_out), - [pw_in]"r"(pwArray), - [num]"r"((I64)ohow*ic*8) - :"memory", "cc", "v0", "v1", "v2", "x0", "x1", "x2" - ); - } - - I32 scale_v[4]; - for (U32 i=0; i<4; i++) { - scale_v[i] = scale; - } - - // pw_conv - const INT8 *f_base = filterArray + ic*fh*fw*8; - - // ohow / 12 - for (I32 hw = 0; hw < ohow-11; hw+=12) { - const I32 *b0 = biasArray + ic*8; - const I32 *b1 = b0 + 4; - INT8 *in_pack = pwArray + hw*ic*8; - for (U32 o = 0; o < oc; o++) { - INT8 *in_hw0 = in_pack; - const INT8 *f_o0c0 = f_base + o*8*ic*8; - I32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const I32 *b_0 = b0; - const I32 *b_1 = b1; - __asm__ __volatile__( - // Bias should be applied after scaling - "eor v5.16b, v5.16b, v5.16b\n" - "ldr d1, [%[in_0]]\n" //in_0 - "eor v6.16b, v6.16b, v6.16b\n" - "ldr x1, [%[in_0], #8]\n" - "eor v7.16b, v7.16b, v7.16b\n" - "ins v1.d[1], x1\n" - "eor v8.16b, v8.16b, v8.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 - "eor v9.16b, v9.16b, v9.16b\n" - "ldr x2, [%[f_0], #8]\n" - "eor v10.16b, v10.16b, v10.16b\n" - "ins v0.d[1], x2\n" - "eor v11.16b, v11.16b, v11.16b\n" - "ldr d3, [%[in_0], #16]\n" //in_1 - "eor v12.16b, v12.16b, v12.16b\n" - "ldr x3, [%[in_0], #24]\n" - "eor v13.16b, v13.16b, v13.16b\n" - "ins v3.d[1], x3\n" - "eor v14.16b, v14.16b, v14.16b\n" - "eor v15.16b, v15.16b, v15.16b\n" - "eor v16.16b, v16.16b, v16.16b\n" - - "eor v17.16b, v17.16b, v17.16b\n" - "eor v18.16b, v18.16b, v18.16b\n" - "eor v19.16b, v19.16b, v19.16b\n" - "eor v20.16b, v20.16b, v20.16b\n" - "eor v21.16b, v21.16b, v21.16b\n" - "eor v22.16b, v22.16b, v22.16b\n" - "eor v23.16b, v23.16b, v23.16b\n" - "eor v24.16b, v24.16b, v24.16b\n" - "eor v25.16b, v25.16b, v25.16b\n" - "eor v26.16b, v26.16b, v26.16b\n" - "eor v27.16b, v27.16b, v27.16b\n" - "eor v28.16b, v28.16b, v28.16b\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - "0:\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d2, [x3, 32]\n" - "ldr x16, [x3, 40]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "ldr d29, [x0, 16]\n" - "ldr x17, [x0, 24]\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "ins v2.d[1], x16\n" - "ldr d30, [x3, 48]!\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - "ins v29.d[1], x17\n" - - "sdot v13.4s, v0.16b, v3.4b[0]\n" - "ldr x16, [x3, 8]\n" - "subs x2, x2, #4\n" - "sdot v15.4s, v0.16b, v3.4b[1]\n" - "sdot v17.4s, v0.16b, v3.4b[2]\n" - "ins v30.d[1], x16\n" - "sdot v19.4s, v0.16b, 
v3.4b[3]\n" - - "sdot v21.4s, v0.16b, v2.4b[0]\n" - "sdot v23.4s, v0.16b, v2.4b[1]\n" - "sdot v25.4s, v0.16b, v2.4b[2]\n" - "sdot v27.4s, v0.16b, v2.4b[3]\n" - - "sdot v14.4s, v29.16b, v3.4b[0]\n" - "sdot v16.4s, v29.16b, v3.4b[1]\n" - "ldr d0, [x0, 32]!\n" - "ldr x17, [x0, 8]\n" - "sdot v18.4s, v29.16b, v3.4b[2]\n" - "sdot v20.4s, v29.16b, v3.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "ldr d3, [x3, 16]\n" - "ldr x16, [x3, 24]\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - - "ins v0.d[1], x17\n" - "ins v3.d[1], x16\n" - - "sdot v22.4s, v29.16b, v2.4b[0]\n" - "mov v1.16b, v30.16b\n" - "sdot v24.4s, v29.16b, v2.4b[1]\n" - "sdot v26.4s, v29.16b, v2.4b[2]\n" - "sdot v28.4s, v29.16b, v2.4b[3]\n" - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu6]\n" //No need to scale for relu6 - "ldr q3, [%[b_0]]\n" - "ldr q4, [%[b_1]]\n" - "beq 11f\n" - - "ldr q0, [%[scale]]\n" - "mul v5.4s, v0.4s, v5.4s\n" - "mul v6.4s, v0.4s, v6.4s\n" - "mul v7.4s, v0.4s, v7.4s\n" - "mul v8.4s, v0.4s, v8.4s\n" - "mul v9.4s, v0.4s, v9.4s\n" - "mul v10.4s, v0.4s, v10.4s\n" - "mul v11.4s, v0.4s, v11.4s\n" - "mul v12.4s, v0.4s, v12.4s\n" - "mul v13.4s, v0.4s, v13.4s\n" - "mul v14.4s, v0.4s, v14.4s\n" - "mul v15.4s, v0.4s, v15.4s\n" - "mul v16.4s, v0.4s, v16.4s\n" - "mul v17.4s, v0.4s, v17.4s\n" - "mul v18.4s, v0.4s, v18.4s\n" - "mul v19.4s, v0.4s, v19.4s\n" - "mul v20.4s, v0.4s, v20.4s\n" - "mul v21.4s, v0.4s, v21.4s\n" - "mul v22.4s, v0.4s, v22.4s\n" - "mul v23.4s, v0.4s, v23.4s\n" - "mul v24.4s, v0.4s, v24.4s\n" - "mul v25.4s, v0.4s, v25.4s\n" - "mul v26.4s, v0.4s, v26.4s\n" - "mul v27.4s, v0.4s, v27.4s\n" - "mul v28.4s, v0.4s, v28.4s\n" - - "add v5.4s, v3.4s, v5.4s\n" - "add v6.4s, v4.4s, v6.4s\n" - "add v7.4s, v3.4s, v7.4s\n" - "add v8.4s, v4.4s, v8.4s\n" - "add v9.4s, v3.4s, v9.4s\n" - "add v10.4s, v4.4s, v10.4s\n" - "add v11.4s, v3.4s, v11.4s\n" - "add v12.4s, v4.4s, v12.4s\n" - "add v13.4s, v3.4s, v13.4s\n" - "add v14.4s, v4.4s, v14.4s\n" - "add v15.4s, v3.4s, v15.4s\n" - "add v16.4s, v4.4s, v16.4s\n" - "add v17.4s, v3.4s, v17.4s\n" - "add v18.4s, v4.4s, v18.4s\n" - "add v19.4s, v3.4s, v19.4s\n" - "add v20.4s, v4.4s, v20.4s\n" - "add v21.4s, v3.4s, v21.4s\n" - "add v22.4s, v4.4s, v22.4s\n" - "add v23.4s, v3.4s, v23.4s\n" - "add v24.4s, v4.4s, v24.4s\n" - "add v25.4s, v3.4s, v25.4s\n" - "add v26.4s, v4.4s, v26.4s\n" - "add v27.4s, v3.4s, v27.4s\n" - "add v28.4s, v4.4s, v28.4s\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 13f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero - "smax v5.4s, v5.4s, v1.4s\n" - "smax v6.4s, v6.4s, v1.4s\n" - "smax v7.4s, v7.4s, v1.4s\n" - "smax v8.4s, v8.4s, v1.4s\n" - "smax v9.4s, v9.4s, v1.4s\n" - "smax v10.4s, v10.4s, v1.4s\n" - "smax v11.4s, v11.4s, v1.4s\n" - "smax v12.4s, v12.4s, v1.4s\n" - "smax v13.4s, v13.4s, v1.4s\n" - "smax v14.4s, v14.4s, v1.4s\n" - "smax v15.4s, v15.4s, v1.4s\n" - "smax v16.4s, v16.4s, v1.4s\n" - "smax v17.4s, v17.4s, v1.4s\n" - "smax v18.4s, v18.4s, v1.4s\n" - "smax v19.4s, v19.4s, v1.4s\n" - "smax v20.4s, v20.4s, v1.4s\n" - "smax v21.4s, v21.4s, v1.4s\n" - "smax v22.4s, v22.4s, v1.4s\n" - "smax v23.4s, v23.4s, v1.4s\n" - "smax v24.4s, v24.4s, v1.4s\n" - "smax v25.4s, v25.4s, v1.4s\n" - "smax v26.4s, v26.4s, v1.4s\n" - "smax v27.4s, v27.4s, v1.4s\n" - "smax v28.4s, v28.4s, v1.4s\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 13f\n" - // Apply bias - "add v5.4s, v3.4s, v5.4s\n" - "add v6.4s, v4.4s, v6.4s\n" - "add v7.4s, v3.4s, v7.4s\n" - "add 
v8.4s, v4.4s, v8.4s\n" - "add v9.4s, v3.4s, v9.4s\n" - "add v10.4s, v4.4s, v10.4s\n" - "add v11.4s, v3.4s, v11.4s\n" - "add v12.4s, v4.4s, v12.4s\n" - "add v13.4s, v3.4s, v13.4s\n" - "add v14.4s, v4.4s, v14.4s\n" - "add v15.4s, v3.4s, v15.4s\n" - "add v16.4s, v4.4s, v16.4s\n" - "add v17.4s, v3.4s, v17.4s\n" - "add v18.4s, v4.4s, v18.4s\n" - "add v19.4s, v3.4s, v19.4s\n" - "add v20.4s, v4.4s, v20.4s\n" - "add v21.4s, v3.4s, v21.4s\n" - "add v22.4s, v4.4s, v22.4s\n" - "add v23.4s, v3.4s, v23.4s\n" - "add v24.4s, v4.4s, v24.4s\n" - "add v25.4s, v3.4s, v25.4s\n" - "add v26.4s, v4.4s, v26.4s\n" - "add v27.4s, v3.4s, v27.4s\n" - "add v28.4s, v4.4s, v28.4s\n" - - "eor v1.16b, v0.16b, v0.16b\n" //zero - "movi v2.4s, #6\n" //six - "smax v5.4s, v5.4s, v1.4s\n" - "smax v6.4s, v6.4s, v1.4s\n" - "smax v7.4s, v7.4s, v1.4s\n" - "smax v8.4s, v8.4s, v1.4s\n" - "smax v9.4s, v9.4s, v1.4s\n" - "smax v10.4s, v10.4s, v1.4s\n" - "smax v11.4s, v11.4s, v1.4s\n" - "smax v12.4s, v12.4s, v1.4s\n" - "smax v13.4s, v13.4s, v1.4s\n" - "smax v14.4s, v14.4s, v1.4s\n" - "smax v15.4s, v15.4s, v1.4s\n" - "smax v16.4s, v16.4s, v1.4s\n" - "smax v17.4s, v17.4s, v1.4s\n" - "smax v18.4s, v18.4s, v1.4s\n" - "smax v19.4s, v19.4s, v1.4s\n" - "smax v20.4s, v20.4s, v1.4s\n" - "smax v21.4s, v21.4s, v1.4s\n" - "smax v22.4s, v22.4s, v1.4s\n" - "smax v23.4s, v23.4s, v1.4s\n" - "smax v24.4s, v24.4s, v1.4s\n" - "smax v25.4s, v25.4s, v1.4s\n" - "smax v26.4s, v26.4s, v1.4s\n" - "smax v27.4s, v27.4s, v1.4s\n" - "smax v28.4s, v28.4s, v1.4s\n" - - "smin v5.4s, v5.4s, v2.4s\n" - "smin v6.4s, v6.4s, v2.4s\n" - "smin v7.4s, v7.4s, v2.4s\n" - "smin v8.4s, v8.4s, v2.4s\n" - "smin v9.4s, v9.4s, v2.4s\n" - "smin v10.4s, v10.4s, v2.4s\n" - "smin v11.4s, v11.4s, v2.4s\n" - "smin v12.4s, v12.4s, v2.4s\n" - "smin v13.4s, v13.4s, v2.4s\n" - "smin v14.4s, v14.4s, v2.4s\n" - "smin v15.4s, v15.4s, v2.4s\n" - "smin v16.4s, v16.4s, v2.4s\n" - "smin v17.4s, v17.4s, v2.4s\n" - "smin v18.4s, v18.4s, v2.4s\n" - "smin v19.4s, v19.4s, v2.4s\n" - "smin v20.4s, v20.4s, v2.4s\n" - "smin v21.4s, v21.4s, v2.4s\n" - "smin v22.4s, v22.4s, v2.4s\n" - "smin v23.4s, v23.4s, v2.4s\n" - "smin v24.4s, v24.4s, v2.4s\n" - "smin v25.4s, v25.4s, v2.4s\n" - "smin v26.4s, v26.4s, v2.4s\n" - "smin v27.4s, v27.4s, v2.4s\n" - "smin v28.4s, v28.4s, v2.4s\n" - - "13:\n" - "str q5, [%[out_0]]\n" - "str q6, [%[out_0], #16]\n" - "str q7, [%[out_0], #32]\n" - "str q8, [%[out_0], #48]\n" - "str q9, [%[out_0], #64]\n" - "str q10, [%[out_0], #80]\n" - "str q11, [%[out_0], #96]\n" - "str q12, [%[out_0], #112]\n" - "str q13, [%[out_0], #128]\n" - "str q14, [%[out_0], #144]\n" - "str q15, [%[out_0], #160]\n" - "str q16, [%[out_0], #176]\n" - "str q17, [%[out_0], #192]\n" - "str q18, [%[out_0], #208]\n" - "str q19, [%[out_0], #224]\n" - "str q20, [%[out_0], #240]\n" - "str q21, [%[out_0], #256]\n" - "str q22, [%[out_0], #272]\n" - "str q23, [%[out_0], #288]\n" - "str q24, [%[out_0], #304]\n" - "str q25, [%[out_0], #320]\n" - "str q26, [%[out_0], #336]\n" - "str q27, [%[out_0], #352]\n" - "str q28, [%[out_0], #368]\n" - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_0), - [b_1]"r"(b_1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [scale]"r"(scale_v) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", 
"v27", "v28", "v29", "v30", "x0", "x1", "x2", "x3","x17","x16" - ); - b0 += 8; - b1 += 8; - } - } - - // ohow_reminder % 12 / 8 - U32 ohow_s = (ohow / 12) * 12; - U32 ohow_tail = ohow - ohow_s; - - if (ohow_tail >= 8) { - U32 hw = ohow_s; - const I32 *b0 = biasArray + ic*8; - const I32 *b1 = b0 + 4; - INT8 *in_pack = pwArray + hw*ic*8; - for (U32 o = 0; o < oc; o++) { - INT8 *in_hw0 = in_pack; - const INT8 *f_o0c0 = f_base + o*8*ic*8; - I32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const I32 *b_0 = b0; - const I32 *b_1 = b1; - __asm__ __volatile__( - // Bias should be applied after scaling - "eor v5.16b, v5.16b, v5.16b\n" - "ldr d1, [%[in_0]]\n" //in_0 - "eor v6.16b, v6.16b, v6.16b\n" - "ldr x1, [%[in_0], #8]\n" - "eor v7.16b, v7.16b, v7.16b\n" - "ins v1.d[1], x1\n" - "eor v8.16b, v8.16b, v8.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 - "eor v9.16b, v9.16b, v9.16b\n" - "ldr x2, [%[f_0], #8]\n" - "eor v10.16b, v10.16b, v10.16b\n" - "ins v0.d[1], x2\n" - "eor v11.16b, v11.16b, v11.16b\n" - "eor v12.16b, v12.16b, v12.16b\n" - "eor v13.16b, v13.16b, v13.16b\n" - "eor v14.16b, v14.16b, v14.16b\n" - "eor v15.16b, v15.16b, v15.16b\n" - "eor v16.16b, v16.16b, v16.16b\n" - "eor v17.16b, v17.16b, v17.16b\n" - "eor v18.16b, v18.16b, v18.16b\n" - "eor v19.16b, v19.16b, v19.16b\n" - "eor v20.16b, v20.16b, v20.16b\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - "0:\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d3, [x3, 16]!\n" - "ldr x16, [x3, 8]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "ldr d29, [x0, 16]\n" - "ldr x17, [x0, 24]\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "ins v3.d[1], x16\n" - "ldr d30, [x3, 16]!\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - "ins v29.d[1], x17\n" - - "sdot v13.4s, v0.16b, v3.4b[0]\n" - "ldr x16, [x3, 8]\n" - "subs x2, x2, #4\n" - "sdot v15.4s, v0.16b, v3.4b[1]\n" - "sdot v17.4s, v0.16b, v3.4b[2]\n" - "ins v30.d[1], x16\n" - "sdot v19.4s, v0.16b, v3.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "ldr d0, [x0, 32]!\n" - "ldr x17, [x0, 8]\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - - "sdot v14.4s, v29.16b, v3.4b[0]\n" - "ins v0.d[1], x17\n" - "mov v1.16b, v30.16b\n" - "sdot v16.4s, v29.16b, v3.4b[1]\n" - "sdot v18.4s, v29.16b, v3.4b[2]\n" - "sdot v20.4s, v29.16b, v3.4b[3]\n" - - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu6]\n" //No need to scale for relu6 - "ldr q3, [%[b_0]]\n" - "ldr q4, [%[b_1]]\n" - "beq 11f\n" - - "ldr q0, [%[scale]]\n" - "mul v5.4s, v0.4s, v5.4s\n" - "mul v6.4s, v0.4s, v6.4s\n" - "mul v7.4s, v0.4s, v7.4s\n" - "mul v8.4s, v0.4s, v8.4s\n" - "mul v9.4s, v0.4s, v9.4s\n" - "mul v10.4s, v0.4s, v10.4s\n" - "mul v11.4s, v0.4s, v11.4s\n" - "mul v12.4s, v0.4s, v12.4s\n" - "mul v13.4s, v0.4s, v13.4s\n" - "mul v14.4s, v0.4s, v14.4s\n" - "mul v15.4s, v0.4s, v15.4s\n" - "mul v16.4s, v0.4s, v16.4s\n" - "mul v17.4s, v0.4s, v17.4s\n" - "mul v18.4s, v0.4s, v18.4s\n" - "mul v19.4s, v0.4s, v19.4s\n" - "mul v20.4s, v0.4s, v20.4s\n" - - "add v5.4s, v3.4s, v5.4s\n" - "add v6.4s, v4.4s, v6.4s\n" - "add v7.4s, v3.4s, v7.4s\n" - "add v8.4s, v4.4s, v8.4s\n" - "add v9.4s, v3.4s, v9.4s\n" - "add v10.4s, v4.4s, v10.4s\n" - "add v11.4s, v3.4s, v11.4s\n" - "add v12.4s, v4.4s, v12.4s\n" - "add v13.4s, v3.4s, v13.4s\n" - "add v14.4s, v4.4s, v14.4s\n" - "add v15.4s, v3.4s, v15.4s\n" - "add v16.4s, v4.4s, v16.4s\n" - "add v17.4s, v3.4s, v17.4s\n" - "add v18.4s, v4.4s, v18.4s\n" - "add v19.4s, v3.4s, v19.4s\n" 
- "add v20.4s, v4.4s, v20.4s\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 13f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero - "smax v5.4s, v5.4s, v1.4s\n" - "smax v6.4s, v6.4s, v1.4s\n" - "smax v7.4s, v7.4s, v1.4s\n" - "smax v8.4s, v8.4s, v1.4s\n" - "smax v9.4s, v9.4s, v1.4s\n" - "smax v10.4s, v10.4s, v1.4s\n" - "smax v11.4s, v11.4s, v1.4s\n" - "smax v12.4s, v12.4s, v1.4s\n" - "smax v13.4s, v13.4s, v1.4s\n" - "smax v14.4s, v14.4s, v1.4s\n" - "smax v15.4s, v15.4s, v1.4s\n" - "smax v16.4s, v16.4s, v1.4s\n" - "smax v17.4s, v17.4s, v1.4s\n" - "smax v18.4s, v18.4s, v1.4s\n" - "smax v19.4s, v19.4s, v1.4s\n" - "smax v20.4s, v20.4s, v1.4s\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 13f\n" - // Apply bias - "add v5.4s, v3.4s, v5.4s\n" - "add v6.4s, v4.4s, v6.4s\n" - "add v7.4s, v3.4s, v7.4s\n" - "add v8.4s, v4.4s, v8.4s\n" - "add v9.4s, v3.4s, v9.4s\n" - "add v10.4s, v4.4s, v10.4s\n" - "add v11.4s, v3.4s, v11.4s\n" - "add v12.4s, v4.4s, v12.4s\n" - "add v13.4s, v3.4s, v13.4s\n" - "add v14.4s, v4.4s, v14.4s\n" - "add v15.4s, v3.4s, v15.4s\n" - "add v16.4s, v4.4s, v16.4s\n" - "add v17.4s, v3.4s, v17.4s\n" - "add v18.4s, v4.4s, v18.4s\n" - "add v19.4s, v3.4s, v19.4s\n" - "add v20.4s, v4.4s, v20.4s\n" - - "eor v1.16b, v0.16b, v0.16b\n" //zero - "movi v2.4s, #6\n" //six - "smax v5.4s, v5.4s, v1.4s\n" - "smax v6.4s, v6.4s, v1.4s\n" - "smax v7.4s, v7.4s, v1.4s\n" - "smax v8.4s, v8.4s, v1.4s\n" - "smax v9.4s, v9.4s, v1.4s\n" - "smax v10.4s, v10.4s, v1.4s\n" - "smax v11.4s, v11.4s, v1.4s\n" - "smax v12.4s, v12.4s, v1.4s\n" - "smax v13.4s, v13.4s, v1.4s\n" - "smax v14.4s, v14.4s, v1.4s\n" - "smax v15.4s, v15.4s, v1.4s\n" - "smax v16.4s, v16.4s, v1.4s\n" - "smax v17.4s, v17.4s, v1.4s\n" - "smax v18.4s, v18.4s, v1.4s\n" - "smax v19.4s, v19.4s, v1.4s\n" - "smax v20.4s, v20.4s, v1.4s\n" - - "smin v5.4s, v5.4s, v2.4s\n" - "smin v6.4s, v6.4s, v2.4s\n" - "smin v7.4s, v7.4s, v2.4s\n" - "smin v8.4s, v8.4s, v2.4s\n" - "smin v9.4s, v9.4s, v2.4s\n" - "smin v10.4s, v10.4s, v2.4s\n" - "smin v11.4s, v11.4s, v2.4s\n" - "smin v12.4s, v12.4s, v2.4s\n" - "smin v13.4s, v13.4s, v2.4s\n" - "smin v14.4s, v14.4s, v2.4s\n" - "smin v15.4s, v15.4s, v2.4s\n" - "smin v16.4s, v16.4s, v2.4s\n" - "smin v17.4s, v17.4s, v2.4s\n" - "smin v18.4s, v18.4s, v2.4s\n" - "smin v19.4s, v19.4s, v2.4s\n" - "smin v20.4s, v20.4s, v2.4s\n" - - "13:\n" - "str q5, [%[out_0]]\n" - "str q6, [%[out_0], #16]\n" - "str q7, [%[out_0], #32]\n" - "str q8, [%[out_0], #48]\n" - "str q9, [%[out_0], #64]\n" - "str q10, [%[out_0], #80]\n" - "str q11, [%[out_0], #96]\n" - "str q12, [%[out_0], #112]\n" - "str q13, [%[out_0], #128]\n" - "str q14, [%[out_0], #144]\n" - "str q15, [%[out_0], #160]\n" - "str q16, [%[out_0], #176]\n" - "str q17, [%[out_0], #192]\n" - "str q18, [%[out_0], #208]\n" - "str q19, [%[out_0], #224]\n" - "str q20, [%[out_0], #240]\n" - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_0), - [b_1]"r"(b_1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [scale]"r"(scale_v) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v29", "v30", "x0", "x1", "x2", "x3","x17","x16" - ); - b0 += 8; - b1 += 8; - } - ohow_s += 8; - ohow_tail -= 8; - } - - if (ohow_tail >= 4) { - U32 hw = ohow_s; - const I32 *b0 = biasArray + ic*8; - const I32 *b1 = b0 + 4; - INT8 *in_pack = 
pwArray + hw*ic*8; - for (U32 o = 0; o < oc; o++) { - INT8 *in_hw0 = in_pack; - const INT8 *f_o0c0 = f_base + o*8*ic*8; - I32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - // bias - const I32 *b_0 = b0; - const I32 *b_1 = b1; - __asm__ __volatile__( - // Bias should be applied after scaling - "eor v5.16b, v5.16b, v5.16b\n" - "ldr d1, [%[in_0]]\n" //in_0 - "eor v6.16b, v6.16b, v6.16b\n" - "ldr x1, [%[in_0], #8]\n" - "eor v7.16b, v7.16b, v7.16b\n" - "ins v1.d[1], x1\n" - "eor v8.16b, v8.16b, v8.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 - "eor v9.16b, v9.16b, v9.16b\n" - "ldr x2, [%[f_0], #8]\n" - "eor v10.16b, v10.16b, v10.16b\n" - "ins v0.d[1], x2\n" - "eor v11.16b, v11.16b, v11.16b\n" - "eor v12.16b, v12.16b, v12.16b\n" - - //give in address to x3 - "mov x3, %[in_0]\n" - - //give f address to x0 - "mov x0, %[f_0]\n" - - "mov x2, %[ic]\n" //ic_blk - "0:\n" - "ldr d29, [x0, 16]\n" - "ldr x17, [x0, 24]\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d3, [x3, 16]!\n" - "ldr x16, [x3, 8]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "ins v29.d[1], x17\n" - "subs x2, x2, #4\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "ins v3.d[1], x16\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "ldr d0, [x0, 32]!\n" - "ldr x17, [x0, 8]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "ins v0.d[1], x17\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - "mov v1.16b, v3.16b\n" - - "bne 0b\n" - - "cmp %[pointwiseActivationMode], %[am_relu6]\n" //No need to scale for relu6 - "ldr q3, [%[b_0]]\n" - "ldr q4, [%[b_1]]\n" - "beq 11f\n" - - "ldr q0, [%[scale]]\n" - "mul v5.4s, v0.4s, v5.4s\n" - "mul v6.4s, v0.4s, v6.4s\n" - "mul v7.4s, v0.4s, v7.4s\n" - "mul v8.4s, v0.4s, v8.4s\n" - "mul v9.4s, v0.4s, v9.4s\n" - "mul v10.4s, v0.4s, v10.4s\n" - "mul v11.4s, v0.4s, v11.4s\n" - "mul v12.4s, v0.4s, v12.4s\n" - - "add v5.4s, v3.4s, v5.4s\n" - "add v6.4s, v4.4s, v6.4s\n" - "add v7.4s, v3.4s, v7.4s\n" - "add v8.4s, v4.4s, v8.4s\n" - "add v9.4s, v3.4s, v9.4s\n" - "add v10.4s, v4.4s, v10.4s\n" - "add v11.4s, v3.4s, v11.4s\n" - "add v12.4s, v4.4s, v12.4s\n" - - "cmp %[pointwiseActivationMode], %[am_relu]\n" - "bne 13f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero - "smax v5.4s, v5.4s, v1.4s\n" - "smax v6.4s, v6.4s, v1.4s\n" - "smax v7.4s, v7.4s, v1.4s\n" - "smax v8.4s, v8.4s, v1.4s\n" - "smax v9.4s, v9.4s, v1.4s\n" - "smax v10.4s, v10.4s, v1.4s\n" - "smax v11.4s, v11.4s, v1.4s\n" - "smax v12.4s, v12.4s, v1.4s\n" - - "11:\n" - "cmp %[pointwiseActivationMode], %[am_relu6]\n" - "bne 13f\n" - // Apply bias - "add v5.4s, v3.4s, v5.4s\n" - "add v6.4s, v4.4s, v6.4s\n" - "add v7.4s, v3.4s, v7.4s\n" - "add v8.4s, v4.4s, v8.4s\n" - "add v9.4s, v3.4s, v9.4s\n" - "add v10.4s, v4.4s, v10.4s\n" - "add v11.4s, v3.4s, v11.4s\n" - "add v12.4s, v4.4s, v12.4s\n" - - "eor v1.16b, v0.16b, v0.16b\n" //zero - "movi v2.4s, #0x06\n" //six - "smax v5.4s, v5.4s, v1.4s\n" - "smax v6.4s, v6.4s, v1.4s\n" - "smax v7.4s, v7.4s, v1.4s\n" - "smax v8.4s, v8.4s, v1.4s\n" - "smax v9.4s, v9.4s, v1.4s\n" - "smax v10.4s, v10.4s, v1.4s\n" - "smax v11.4s, v11.4s, v1.4s\n" - "smax v12.4s, v12.4s, v1.4s\n" - - "smin v5.4s, v5.4s, v2.4s\n" - "smin v6.4s, v6.4s, v2.4s\n" - "smin v7.4s, v7.4s, v2.4s\n" - "smin v8.4s, v8.4s, v2.4s\n" - "smin v9.4s, v9.4s, v2.4s\n" - "smin v10.4s, v10.4s, v2.4s\n" - "smin v11.4s, v11.4s, v2.4s\n" - "smin v12.4s, v12.4s, v2.4s\n" - - "13:\n" - "str q5, [%[out_0]]\n" - "str q6, [%[out_0], #16]\n" - "str q7, [%[out_0], #32]\n" - "str q8, [%[out_0], #48]\n" - "str q9, [%[out_0], #64]\n" - "str q10, 
[%[out_0], #80]\n" - "str q11, [%[out_0], #96]\n" - "str q12, [%[out_0], #112]\n" - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_0), - [b_1]"r"(b_1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [scale]"r"(scale_v) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v29", "v30", "x0", "x1", "x2", "x3","x17","x16" - ); - b0 += 8; - b1 += 8; - } - ohow_s += 4; - ohow_tail -= 4; - } - - for (I32 hw = ohow_s; hw < ohow; hw++) { - const I32 *b0 = biasArray + ic*8; - INT8 *in_pack = pwArray + hw*ic*8; - - // compute - for (U32 o = 0; o < oc; o++) { - INT8 *in_hw = in_pack; - const INT8 *f_o = f_base + o*8*ic*8; - I32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - - int32x4_t res[2] = {0}; - - for(U32 c=0; c= kernelSizeH || paddingL >= kernelSizeW) { - CHECK_STATUS(NOT_SUPPORTED); - } - - if (kernelSizeH * kernelSizeW > 256 && pm == POOLING_MEAN) { - CHECK_STATUS(NOT_SUPPORTED); - } - - ic /= 8; - - short khkw = kernelSizeH * kernelSizeW; - short factor = 256 / khkw; - - switch (pm) { - case POOLING_MAX: { - *outputScale = *inputScale; - break; - } - case POOLING_MEAN: { - *outputScale = *inputScale * factor * khkw / 256; - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - for (U32 n = 0; n < in; n++) { - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < oh; h++) { - for (U32 w = 0; w < ow; w++) { - int hstart = (int)h * (int)strideH - (int)paddingT; - int wstart = (int)w * (int)strideW - (int)paddingL; - int hend = UNI_MIN(hstart + kernelSizeH, ih); - int wend = UNI_MIN(wstart + kernelSizeW, iw); - hstart = UNI_MAX(hstart, 0); - wstart = UNI_MAX(wstart, 0); - - int8x8_t in1, out1; - int16x8_t out_mean = {0}; - out1 = vdup_n_s8(-128); - short pool_size = (hend-hstart) * (wend-wstart); - for (int kernelH = hstart; kernelH < hend; kernelH++) { - for (int kernelW = wstart; kernelW < wend; kernelW++) { - const U32 index = (kernelH * iw + kernelW) * 8; - in1 = vld1_s8(input + index); - switch (pm) { - case POOLING_MAX: - out1 = vmax_s8(out1, in1); - break; - case POOLING_MEAN: - out_mean = vaddw_s8(out_mean, in1); - break; - default: - CHECK_STATUS(NOT_SUPPORTED); - } - } - } - if (pm == POOLING_MAX) { - vst1_s8(output + (h * ow + w) * 8, out1); - } else { - short pool_factor = factor * khkw / pool_size; - if (pool_factor > 1) { - out_mean = vmulq_n_s16(out_mean, pool_factor); - } - in1 = vshrn_n_s16(out_mean, 8); - vst1_s8(output + (h * ow + w) * 8, in1); - } - } - } - input += ih * iw * 8; - output += oh * ow * 8; - } - } - - return SUCCESS; -} -#endif diff --git a/tensor_computing/src/cpu/arm/int8/quantize.cpp b/tensor_computing/src/cpu/arm/int8/quantize.cpp deleted file mode 100644 index d746c87e..00000000 --- a/tensor_computing/src/cpu/arm/int8/quantize.cpp +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
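The deleted INT8 pooling kernel above handles the mean case by widening the 8 int8 channel lanes into an int16 accumulator, multiplying by a precomputed pool factor of roughly 256/pool_size, and narrowing back with an 8-bit right shift. A minimal standalone sketch of that inner loop under the same NCHWc8 layout assumption; the function and parameter names here are illustrative, not from the source:

#include <arm_neon.h>
#include <stdint.h>

// Mean-pool one 8-channel group; pool_factor ~= 256 / pool_size, so the
// final >>8 narrowing divides the widened sum back down to an int8 average.
static inline void mean_pool_c8(const int8_t *in, int8_t *out, int iw,
    int hstart, int hend, int wstart, int wend, int16_t pool_factor)
{
    int16x8_t acc = vdupq_n_s16(0);
    for (int h = hstart; h < hend; h++) {
        for (int w = wstart; w < wend; w++) {
            acc = vaddw_s8(acc, vld1_s8(in + (h * iw + w) * 8));  // widen and accumulate
        }
    }
    if (pool_factor > 1) {
        acc = vmulq_n_s16(acc, pool_factor);
    }
    vst1_s8(out, vshrn_n_s16(acc, 8));
}

Because a sum of up to 256 int8 values scaled by a factor of at most 256 must stay inside the int16 lanes, the kernel rejects mean-pooling windows larger than 256 elements up front, as the guard at the top of the deleted function shows.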
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include <math.h> -#include <arm_neon.h> -#include "cpu/arm/int8/tensor_computing_int8.h" -#include "cpu/arm/int8/convolution_gemm.h" - - -EE quantize_tensor_int32(TensorDesc dDesc, const void* data, TensorDesc* qDesc, void* qData, F32 *scale) -{ - if (nullptr == data || nullptr == qDesc || nullptr == qData || nullptr == scale) { - CHECK_STATUS(NULL_POINTER); - } - DataType dt; - DataFormat df; - U32 n, c, h, w; - if (tensorIs2d(dDesc)) { - CHECK_STATUS(tensor2dfGet(dDesc, &dt, &df, &n, &w)); - c = 1; - h = 1; - } else if (tensorIs3d(dDesc)) { - CHECK_STATUS(tensor3dGet(dDesc, &dt, &df, &n, &h, &w)); - c = 1; - } else { - CHECK_STATUS(tensor4dGet(dDesc, &dt, &df, &n, &c, &h, &w)); - } - switch (dt) { - case DT_I32: { - I32 *array = (I32*)data; - int32x4_t tmp_v = vld1q_s32(array); - int32x4_t max_v = tmp_v; - int32x4_t min_v = tmp_v; - - U32 numData = n * c * h * w; - CHECK_REQUIREMENT(numData >= 4); - U32 i = 4; - for (; i < numData - 3; i += 4) { - tmp_v = vld1q_s32(array+i); - max_v = vmaxq_s32(max_v, tmp_v); - min_v = vminq_s32(min_v, tmp_v); - } - - I32 max = vmaxvq_s32(max_v); - I32 min = vminvq_s32(min_v); - for (; i < numData; i++) { - I32 tmp = array[i]; - if (tmp > max) { - max = tmp; - } - if (tmp < min) { - min = tmp; - } - } - if (max == 0 && min == 0) { - CHECK_STATUS(NOT_SUPPORTED); - } - - I32 factor; - F32 scaleO; - if (max > 0 && min < 0) { - I32 factor_max = 127 * 16777216 / max; - I32 factor_min = -127 * 16777216 / min; - factor = (factor_max < factor_min) ? factor_max : factor_min; - scaleO = (factor_max < factor_min) ?
(127.0/max) : (-127.0/min); - } else if (max > 0) { - factor = 127 * 16777216 / max; - scaleO = 127.0 / max; - } else { - factor = -127 * 16777216 / min; - scaleO = -127.0 / min; - } - DEBUG_info(max << " is the max I32 value, and min values is " << min); - DEBUG_info(scaleO << " is the derived scale"); - *scale *= scaleO; - - U32 main = numData / 16; - INT8 *qArray = (INT8*)qData; - CHECK_STATUS(quantize_I32(main * 4, array, factor, scaleO, qArray)); - for (U32 i = main * 16; i < numData; i++) { - qArray[i] = array[i] * scaleO; - } - - if (tensorIs2d(dDesc)) { - *qDesc = tensor2df(DT_I8, df, n, w); - } else if (tensorIs3d(dDesc)) { - *qDesc = tensor3df(DT_I8, df, n, h, w); - } else { - *qDesc = tensor4df(DT_I8, df, n, c, h, w); - } - break; - } - default:{ - CHECK_STATUS(NOT_SUPPORTED); - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/int8/tensor_computing_int8.h b/tensor_computing/src/cpu/arm/int8/tensor_computing_int8.h deleted file mode 100644 index 3a6f427a..00000000 --- a/tensor_computing/src/cpu/arm/int8/tensor_computing_int8.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
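quantize_tensor_int32 above derives a symmetric scale so that the extreme int32 value maps onto +/-127, carrying both a Q24 integer multiplier (factor) and the equivalent float scale (scaleO). Condensed into plain C++ for reference; the helper name is mine, and the caller is assumed to have rejected the all-zero case already:

#include <cstdint>

// Pick factor (Q24 fixed point) and scaleO so that max -> +127 or min -> -127,
// whichever is the tighter constraint. Quantization is then
// q = (int32_t)(((int64_t)x * factor) >> 24), equivalently q = x * scaleO.
static void derive_int32_to_int8_scale(int32_t max, int32_t min,
    int32_t &factor, float &scaleO)
{
    if (max > 0 && min < 0) {
        int32_t factor_max = 127 * 16777216 / max;    // 127 * 2^24 stays below 2^31
        int32_t factor_min = -127 * 16777216 / min;
        factor = (factor_max < factor_min) ? factor_max : factor_min;
        scaleO = (factor_max < factor_min) ? (127.0f / max) : (-127.0f / min);
    } else if (max > 0) {
        factor = 127 * 16777216 / max;
        scaleO = 127.0f / max;
    } else {
        factor = -127 * 16777216 / min;
        scaleO = -127.0f / min;
    }
}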
- - -#ifndef _H_TENSOR_COMPUTING_INT8 -#define _H_TENSOR_COMPUTING_INT8 -#ifdef _USE_INT8 -#include <vector> -#include "sys.h" -#include "type.h" -#include "error.h" -#include "tensor_desc.h" -#include "tensor_computing_type.h" -#include "cpu/arm/int8/arm_functions_int8.h" - -EE convolution_infer_forward_algorithm_int8(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionPolicy policy, ConvolutionForwardAlgorithm *algorithm); - -EE convolution_infer_forward_tmp_bytes_int8(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes); - -EE convolution_transform_filter_bytes_int8(TensorDesc filterDesc, ConvolutionForwardAlgorithm algorithm, U32* bytes); - -EE convolution_transform_filter_int8(TensorDesc filterDesc, const void* filter, - ConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, void* filterTransformed); - -EE convolution_int8(TensorDesc inputDesc, const INT8* input, - TensorDesc filterDesc, const INT8* filter, F16* scales, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, - TensorDesc biasDesc, const F16* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, void* output, - ActivationDesc activationDesc, - Arch arch); - - -EE depthwise_convolution_transform_filter_int8(TensorDesc filterDesc, const INT8* filter, - DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, INT8* filterTransformed); - -EE depthwise_convolution_int8(TensorDesc inputDesc, INT8* input, - TensorDesc filterDesc, const INT8* filter, - ConvolutionDesc convDesc, DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc biasDesc, const I32* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, I32* output, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc, - Arch arch); - -EE pooling_int8(TensorDesc inputDesc, const INT8* input, F16* inputScale, - PoolingDesc poolingDesc, - TensorDesc outputDesc, INT8* output, F16* outputScale); - -EE concat_int8(std::vector<TensorDesc> inputDesc, std::vector<void*> input, F32* inputScale, - TensorDesc outputDesc, void* output, F32* outputScale, U32 concatDim); - -EE quantize_tensor_int32(TensorDesc dDesc, const void* data, TensorDesc* qDesc, void* qData, F32 *scale); -#endif -#endif diff --git a/tensor_computing/src/cpu/arm/lstm.cpp b/tensor_computing/src/cpu/arm/lstm.cpp deleted file mode 100644 index 16703c58..00000000 --- a/tensor_computing/src/cpu/arm/lstm.cpp +++ /dev/null @@ -1,272 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include <string.h> - -#include "cpu/arm/tensor_computing_arm.h" -#ifdef _USE_FP32 -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif -#include "blas-enhance.h" - -template<typename T> -EE lstm_transform_filter(TensorDesc filterDesc, const T* filterArray, LSTMDesc lstmDesc, TensorDesc *ftmDesc, T* ftmArray) -{ - if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) - CHECK_STATUS(NULL_POINTER); - DataType fdt; - DataFormat fdf; - U32 fn, fk, ftm_n, ftm_k; - CHECK_STATUS(tensor2dfGet(filterDesc, &fdt, &fdf, &fn, &fk)); - U32 alignSize = 32; - switch(fdf) { - case DF_NKN32: { - ftm_n = fn; - ftm_k = fk; - break; - } - case DF_NK: { - // NK => NKN32 - if (fn % alignSize != 0) { - return NOT_MATCH; - } - ftm_n = fn / alignSize; - ftm_k = fk; - for (U32 n = 0; n < ftm_n; n++) { - for (U32 k = 0; k < ftm_k; k++) { - for (U32 n32 = 0; n32 < alignSize; n32++) { - ftmArray[n*ftm_k*alignSize + k*alignSize + n32] = filterArray[(n*alignSize+n32)*ftm_k + k]; - } - } - } - break; - } - default: - return NOT_MATCH; - } - if (lstmDesc.numProjection > 0) { - U32 offset = fn * fk; - if (lstmDesc.numOutput % alignSize != 0) { - return NOT_MATCH; - } - U32 row = lstmDesc.numOutput / alignSize; - U32 col = lstmDesc.numProjection; - for (U32 n = 0; n < row; n++) { - for (U32 k = 0; k < col; k++) { - for (U32 n32 = 0; n32 < alignSize; n32++) { - ftmArray[offset+n*col*alignSize + k*alignSize + n32] = filterArray[offset+(n*alignSize+n32)*col + k]; - } - } - } - } - *ftmDesc = tensor2df(fdt, DF_NKN32, fn, fk); - return SUCCESS; -} - -EE lstm_transform_filter_arm_kernel(TensorDesc filterDesc, const void* filterArray, LSTMDesc lstmDesc, TensorDesc *ftmDesc, void* ftmArray) -{ - EE ret = SUCCESS; - switch (filterDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = lstm_transform_filter<F32>(filterDesc, (const F32*)filterArray, lstmDesc, ftmDesc, (F32*)ftmArray); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = lstm_transform_filter<F16>(filterDesc, (const F16*)filterArray, lstmDesc, ftmDesc, (F16*)ftmArray); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE lstm_transform_filter_arm(TensorDesc filterDesc, const void* filterArray, LSTMDesc lstmDesc, TensorDesc *ftmDesc, void* ftmArray) -{ - EE ret = SUCCESS; - U32 bytes = tensorNumBytes(filterDesc) + bytesOf(filterDesc.dt) * lstmDesc.numProjection * lstmDesc.numOutput; - int num = lstmDesc.biDirection ? 2 : 1; - for (int i = 0; i < num; i++) { - const U8* filterArrayPtr = (const U8*)filterArray + i * bytes; - U8* ftmArrayPtr = (U8*)ftmArray + i * bytes; - ret = lstm_transform_filter_arm_kernel(filterDesc, filterArrayPtr, lstmDesc, ftmDesc, ftmArrayPtr); - } - return ret; -} - -EE lstm_transform_filter_bytes_arm(TensorDesc filterDesc, LSTMDesc lstmDesc, U32* bytes) -{ - if (nullptr == bytes) - CHECK_STATUS(NULL_POINTER); - *bytes = tensorNumBytes(filterDesc) + bytesOf(filterDesc.dt) * lstmDesc.numProjection * lstmDesc.numOutput; - int num = lstmDesc.biDirection ?
2 : 1; - *bytes *= num; - return SUCCESS; -} - -EE lstmcell_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, LSTMDesc lstmDesc, U32 *bytes, Arch arch) -{ - UNUSED(outputDesc); - if (nullptr == bytes) - CHECK_STATUS(NULL_POINTER); - DataType idt; - DataFormat idf; - U32 batch, xDim; - CHECK_STATUS(tensor2dfGet(inputDesc, &idt, &idf, &batch, &xDim)); - U32 hDim = lstmDesc.numOutput; - U32 column = (lstmDesc.numProjection > 0) ? lstmDesc.numProjection : lstmDesc.numOutput; - TensorDesc projectionMatrixDesc = tensor2df(filterDesc.dt, DF_NORMAL, lstmDesc.numProjection, lstmDesc.numOutput); - TensorDesc projectionVectorDesc = tensor1d(filterDesc.dt, lstmDesc.numProjection); - CHECK_STATUS(matrix_vector_multiply_tmp_bytes(projectionMatrixDesc, projectionVectorDesc, bytes, arch)); - *bytes += (hDim + xDim + column * 4) * bytesOf(idt); - return SUCCESS; -} - -EE lstm_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, LSTMDesc lstmDesc, U32 *bytes, Arch arch) -{ - UNUSED(filterDesc); - UNUSED(outputDesc); - if (nullptr == bytes) - CHECK_STATUS(NULL_POINTER); - DataType idt; - DataFormat idf; - U32 batch, step, xDim; - CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &batch, &step, &xDim)); - U32 hDim = lstmDesc.numOutput; - TensorDesc xDesc = tensor2df(idt, DF_NORMAL, batch, xDim); - CHECK_STATUS(lstmcell_infer_forward_tmp_bytes_arm(xDesc, filterDesc, outputDesc, lstmDesc, bytes, arch)); - U32 column = (lstmDesc.numProjection > 0) ? lstmDesc.numProjection : lstmDesc.numOutput; - *bytes += batch * (column + hDim) * bytesOf(idt); - return SUCCESS; -} - -EE lstmcell_arm(TensorDesc xDesc, const void* currentX, - TensorDesc filterDesc, const void* filter, - TensorDesc biasDesc, const void* bias, - void *state, - U32 tmpBytes, void *tmp, - LSTMDesc lstmDesc, U32 batchStrideX, U32 batchStrideH, - TensorDesc hDesc, void* output, - Arch arch) -{ - EE ret = SUCCESS; - switch (xDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = lstmcell_fp32(xDesc, currentX, - filterDesc, filter, - biasDesc, bias, - state, - tmpBytes, tmp, - lstmDesc, batchStrideX, batchStrideH, - hDesc, output, - arch); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = lstmcell_fp16(xDesc, currentX, - filterDesc, filter, - biasDesc, bias, - state, - tmpBytes, tmp, - lstmDesc, batchStrideX, batchStrideH, - hDesc, output, - arch); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE lstm_arm(TensorDesc inputDesc, const void* input, - TensorDesc filterDesc, const void* filter, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - LSTMDesc lstmDesc, - TensorDesc outputDesc, void* output, - Arch arch) -{ - UNUSED(outputDesc); - - if (nullptr == input - || nullptr == filter - || nullptr == bias - || nullptr == tmp - || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - DataType idt; - DataFormat idf; - U32 batch, step, xDim; - int num = lstmDesc.biDirection ? 2 : 1; - CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &batch, &step, &xDim)); - U32 hDim = lstmDesc.numOutput; - U32 column = (lstmDesc.numProjection > 0) ? 
lstmDesc.numProjection : lstmDesc.numOutput; - - U8 *cellState = (U8*)tmp; - U8 *tmpArray = cellState + batch * (column + hDim) * bytesOf(idt); - U32 batchStrideX = step * xDim; - U32 batchStrideH = step * hDim * num; - TensorDesc xDesc = tensor2df(idt, DF_NORMAL, batch, xDim); - TensorDesc hDesc = tensor2df(idt, DF_NORMAL, batch, hDim); - - memset(cellState, 0, batch * (column + hDim) * bytesOf(idt)); - for (U32 t = 0; t < step; t++) { - const U8* currentX = (const U8*)input + t * xDim * bytesOf(idt); - U8 *currentH = (U8*)output + t * hDim * num * bytesOf(idt); - CHECK_STATUS(lstmcell_arm(xDesc, currentX, - filterDesc, filter, - biasDesc, bias, - cellState, - tmpBytes, tmpArray, - lstmDesc, batchStrideX, batchStrideH, - hDesc, currentH, - arch)); - } - - if (lstmDesc.biDirection) { - memset(cellState, 0, batch * (column + hDim) * bytesOf(idt)); - U32 filterBytes = tensorNumBytes(filterDesc) + bytesOf(filterDesc.dt) * lstmDesc.numProjection * lstmDesc.numOutput; - U32 biasBytes = tensorNumBytes(biasDesc); - const U8* filterPtr = (const U8*)filter + filterBytes; - const U8* biasPtr = (const U8*)bias + biasBytes; - for (I32 t = step-1; t >= 0; t--) { - const U8* currentX = (const U8*)input + t * xDim * bytesOf(idt); - U8 *currentH = (U8*)output + t * hDim * num * bytesOf(idt) + hDim * bytesOf(idt); - CHECK_STATUS(lstmcell_arm(xDesc, currentX, - filterDesc, filterPtr, - biasDesc, biasPtr, - cellState, - tmpBytes, tmpArray, - lstmDesc, batchStrideX, batchStrideH, - hDesc, currentH, - arch)); - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/multiply.cpp b/tensor_computing/src/cpu/arm/multiply.cpp deleted file mode 100644 index 926e9d19..00000000 --- a/tensor_computing/src/cpu/arm/multiply.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
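For the bidirectional case, the deleted lstm_arm above lays each time step's output row out as hDim forward values followed by hDim backward values, with the backward pass walking t from step-1 down to 0 over the same buffer. The pointer arithmetic, condensed into a sketch (the helper name and the eltBytes parameter are illustrative, standing in for bytesOf(idt)):

#include <cstddef>
#include <cstdint>

// num is 2 for a bidirectional LSTM, 1 otherwise.
static uint8_t *step_output_ptr(uint8_t *output, uint32_t t, uint32_t hDim,
    uint32_t num, uint32_t eltBytes, bool backward)
{
    uint8_t *step = output + (size_t)t * hDim * num * eltBytes; // row for step t
    return backward ? step + (size_t)hDim * eltBytes : step;    // second half holds the reverse direction
}

The same split shows up in the weights: the backward direction's filter and bias simply follow the forward ones at tensorNumBytes(filterDesc) plus the projection block, which is why lstm_transform_filter_bytes_arm doubles its estimate when biDirection is set.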
- - -#include "cpu/arm/tensor_computing_arm.h" -#include "arm_functions.h" - -EE multiply_arm(void *alpha, void *beta, TensorDesc inputDesc, void* input, TensorDesc outputDesc, void *output) -{ - UNUSED(outputDesc); - - if (nullptr == alpha - || nullptr == beta - || nullptr == input - || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - array_scale(inputDesc.dt, input, output, tensorNumElements(inputDesc), *((F32 *)alpha), *((F32 *)beta)); - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/normalization.cpp b/tensor_computing/src/cpu/arm/normalization.cpp deleted file mode 100644 index b495ffd0..00000000 --- a/tensor_computing/src/cpu/arm/normalization.cpp +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/tensor_computing_arm.h" -#ifdef _USE_FP32 -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif - -EE layer_normalization_arm(void *alpha, void *beta, - TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output) -{ - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = layer_normalization_fp32((F32*)alpha, (F32*)beta, inputDesc, (F32*)input, outputDesc, (F32*)output); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = layer_normalization_fp16((F16*)alpha, (F16*)beta, inputDesc, (F16*)input, outputDesc, (F16*)output); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/padding.cpp b/tensor_computing/src/cpu/arm/padding.cpp deleted file mode 100644 index c3b29c45..00000000 --- a/tensor_computing/src/cpu/arm/padding.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing_type.h" -#include "cpu/arm/tensor_computing_arm.h" -#include <string.h> - -EE padding_arm(TensorDesc inputDesc, const void* input, PadDesc padDesc, TensorDesc outputDesc, void* output) -{ - DataType idt, odt; - DataFormat idf, odf; - U32 in = 0, ic = 0, ih = 0, iw = 0, - on = 0, oc = 0, oh = 0, ow = 0; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - CHECK_REQUIREMENT(in == on); - CHECK_REQUIREMENT(ic == oc); - U32 alignSize = 1; - if (idf == DF_NCHWC8) - alignSize = 8; - ic /= alignSize; - oc /= alignSize; - for (U32 n = 0; n < in; n++) { - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < ih; h++) { - const U8* inPtr = (const U8*)input + (((n * ic + c) * ih + h) * iw) * alignSize * bytesOf(idt); - U8* outPtr = (U8 *)output + (((n * oc + c) * oh + (padDesc.top+h)) * ow) * alignSize * bytesOf(odt); - if (padDesc.pad_mode == Pad_Constant) { - memset(outPtr, 0, padDesc.left*alignSize*bytesOf(odt)); - outPtr += padDesc.left * alignSize * bytesOf(odt); - memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); - outPtr += iw * alignSize * bytesOf(odt); - memset(outPtr, 0, padDesc.right*alignSize*bytesOf(odt)); - } else { - for (U32 w = 0; w < padDesc.left; w++) { - U32 index = 0; - if (padDesc.pad_mode == Pad_Reflect) { - index = (padDesc.left - w) * alignSize * bytesOf(idt); - } - memcpy(outPtr, inPtr+index, alignSize*bytesOf(idt)); - outPtr += alignSize * bytesOf(idt); - } - memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); - outPtr += iw * alignSize * bytesOf(odt); - for (U32 w = 0; w < padDesc.right; w++) { - U32 index = 0; - if (padDesc.pad_mode == Pad_Reflect) { - index = (iw - w - 2) * alignSize * bytesOf(idt); - } - memcpy(outPtr, inPtr+index, alignSize*bytesOf(idt)); - outPtr += alignSize * bytesOf(idt); - } - } - } - U8* outPtr = (U8*)output + (((n * oc + c) * oh) * ow) * alignSize * bytesOf(odt); - for (U32 h = 0; h < padDesc.top; h++) { - U32 index = h * ow * alignSize * bytesOf(odt); - if (padDesc.pad_mode == Pad_Constant) { - memset(outPtr+index, 0, ow * alignSize * bytesOf(odt)); - } else if (padDesc.pad_mode == Pad_Edge) { - memcpy(outPtr+index, outPtr+(padDesc.top*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); - } else if (padDesc.pad_mode == Pad_Reflect) { - memcpy(outPtr+index, outPtr+((padDesc.top+padDesc.top-h)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); - } else { - return NOT_SUPPORTED; - } - } - for (U32 h = 0; h < padDesc.bottom; h++) { - U32 index = (padDesc.top+ih+h) * ow * alignSize * bytesOf(odt); - if (padDesc.pad_mode == Pad_Constant) { - memset(outPtr+index, 0, ow * alignSize * bytesOf(odt)); - } else if (padDesc.pad_mode == Pad_Edge) { - memcpy(outPtr+index, outPtr+((padDesc.top+ih)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); - } else if (padDesc.pad_mode == Pad_Reflect) { - memcpy(outPtr+index, outPtr+((padDesc.top+ih-2-h)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); - } else { - return NOT_SUPPORTED; - } - } - } - } - return
SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/pooling.cpp b/tensor_computing/src/cpu/arm/pooling.cpp deleted file mode 100644 index 4c89ec47..00000000 --- a/tensor_computing/src/cpu/arm/pooling.cpp +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/tensor_computing_arm.h" -#ifdef _USE_FP32 -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif -#ifdef _USE_INT8 -#include "cpu/arm/int8/tensor_computing_int8.h" -#endif - -EE pooling_arm(TensorDesc inputDesc, const void* input, PoolingDesc poolingDesc, const void* scale, TensorDesc outputDesc, void* output) -{ - EE ret = SUCCESS; - switch (inputDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: { - UNUSED(scale); - ret = pooling_fp32(inputDesc, (const F32*)input, - poolingDesc, - outputDesc, (F32*)output); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = pooling_fp16(inputDesc, (const F16*)input, - poolingDesc, - outputDesc, (F16*)output); - break; - } -#endif -#ifdef _USE_INT8 - case DT_I8: { - ret = pooling_int8(inputDesc, (const INT8*)input, (F16*)scale, - poolingDesc, - outputDesc, (INT8*)output, ((F16*)scale)+1); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/priorbox.cpp b/tensor_computing/src/cpu/arm/priorbox.cpp deleted file mode 100644 index 4565f956..00000000 --- a/tensor_computing/src/cpu/arm/priorbox.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include "cpu/arm/tensor_computing_arm.h" -#ifdef _USE_FP32 -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif - -EE priorbox_arm(std::vector<TensorDesc> inputDesc, PriorBoxDesc priorboxDesc, TensorDesc outputDesc, void* output) -{ - EE ret = SUCCESS; - switch (inputDesc[0].dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = priorbox_fp32(inputDesc, priorboxDesc, outputDesc, (F32*)output); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = priorbox_fp16(inputDesc, priorboxDesc, outputDesc, (F16*)output); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/quantize.cpp b/tensor_computing/src/cpu/arm/quantize.cpp deleted file mode 100644 index 13c1aa94..00000000 --- a/tensor_computing/src/cpu/arm/quantize.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/tensor_computing_arm.h" -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif -#ifdef _USE_INT8 -#include "cpu/arm/int8/tensor_computing_int8.h" -#endif - - -EE quantize_tensor_arm(TensorDesc dDesc, const void* data, TensorDesc* qDesc, void* qData, void *scale) -{ - EE ret = SUCCESS; - switch (dDesc.dt) { -#ifdef _USE_FP16 - case DT_F16: { - ret = quantize_tensor_fp16(dDesc, data, qDesc, qData, (F16*)scale); - break; - } -#endif -#ifdef _USE_INT8 - case DT_I32: { - ret = quantize_tensor_int32(dDesc, data, qDesc, qData, (F32*)scale); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/cpu/arm/reduction.cpp b/tensor_computing/src/cpu/arm/reduction.cpp deleted file mode 100644 index 4c598fc8..00000000 --- a/tensor_computing/src/cpu/arm/reduction.cpp +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
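For context on the quantize_tensor_arm dispatcher removed above: both branches hand off to precision-specific kernels that perform symmetric max-abs quantization, returning an INT8 buffer together with a float scale (the void *scale out-parameter). A minimal standalone sketch of that scheme, with hypothetical names, since the real NEON kernels live under cpu/arm/fp16 and cpu/arm/int8:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Map the float range [-maxAbs, maxAbs] onto int8 [-127, 127] and return
    // the scale, mirroring the (qData, scale) output pair of quantize_tensor_arm.
    static float quantize_symmetric(const std::vector<float> &in, std::vector<int8_t> &out)
    {
        float maxAbs = 0.0f;
        for (float v : in) {
            maxAbs = std::max(maxAbs, std::fabs(v));
        }
        float scale = (maxAbs > 0.0f) ? 127.0f / maxAbs : 1.0f;
        out.resize(in.size());
        for (size_t i = 0; i < in.size(); i++) {
            out[i] = (int8_t)std::lround(in[i] * scale);
        }
        return scale;
    }

Keeping the scale next to the quantized tensor is what lets downstream INT8 ops (see pooling_int8 above, which reads the scale pair) rescale their outputs consistently.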
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include <string.h> -#include "cpu/arm/tensor_computing_arm.h" -#include "cpu/arm/arm_functions.h" - -template <typename T> -EE reduction(TensorDesc inputDesc, const T* input, - TensorDesc maskDesc, const float* mask, - I32 axis, - ReductionMode reductionMode, - F32 coeff, - TensorDesc outputDesc, T* output) -{ - if (nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - if (axis < 0) - axis = inputDesc.nDims + axis; - axis = inputDesc.nDims - 1 - axis; - U32 loopInner = 1; - for (int i = 0; i < axis; i++) { - loopInner *= inputDesc.dims[i]; - } - U32 loopOuter = 1; - for (U32 i = axis+1; i < inputDesc.nDims; i++) { - loopOuter *= inputDesc.dims[i]; - } - - U32 len = inputDesc.dims[axis]; - U32 maskLen = tensorNumElements(maskDesc); - maskLen = (maskLen > 0) ?
maskLen : len; - U32 axisDim = maskLen / len; - for (U32 i = 0; i < loopOuter; i++) { - if (loopInner == 1) { - if (mask != nullptr) { - return NOT_SUPPORTED; - } - const T* array = input + i * len; - switch (reductionMode) { - case REDUCTION_SUM: - output[i] = array_sum(inputDesc.dt, array, len); - break; - case REDUCTION_MEAN: - output[i] = array_mean(inputDesc.dt, array, len); - break; - default: - return NOT_SUPPORTED; - } - } else { - for (U32 j = 0; j < maskLen; j+=len) { - U32 axisIndex = j / len; - U32 outputIndex = (i * axisDim + axisIndex) * loopInner; - if (reductionMode == REDUCTION_SUM || reductionMode == REDUCTION_MEAN) { - memset(output + outputIndex, 0, loopInner*bytesOf(inputDesc.dt)); - } else { - return NOT_SUPPORTED; - } - U32 count = 0; - for (U32 k = 0; k < len; k++) { - if (mask == nullptr || (mask != nullptr && mask[j+k] == 1)) { - if (reductionMode == REDUCTION_SUM || reductionMode == REDUCTION_MEAN) { - array_add(inputDesc.dt, output+outputIndex, - &input[(i * len + k) * loopInner], - output+outputIndex, loopInner); - count++; - } else { - return NOT_SUPPORTED; - } - } - } - if (reductionMode == REDUCTION_MEAN) { - array_scale(inputDesc.dt, output+outputIndex, output+outputIndex, loopInner, 1.0/count, 0); - } - } - } - } - if (coeff != 1) { - array_scale(outputDesc.dt, output, output, tensorNumElements(outputDesc), coeff, 0); - } - return SUCCESS; -} - -EE reduction_arm(TensorDesc inputDesc, const void* input, - TensorDesc maskDesc, const void* mask, - I32 axis, - ReductionMode reductionMode, - float coeff, - TensorDesc outputDesc, void* output) -{ - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = reduction(inputDesc, (const F32*)input, maskDesc, (const float*)mask, axis, reductionMode, coeff, outputDesc, (F32*)output); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = reduction(inputDesc, (const F16*)input, maskDesc, (const float*)mask, axis, reductionMode, coeff, outputDesc, (F16*)output); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - - return ret; -} diff --git a/tensor_computing/src/cpu/arm/reshape.cpp b/tensor_computing/src/cpu/arm/reshape.cpp deleted file mode 100644 index fda17b4b..00000000 --- a/tensor_computing/src/cpu/arm/reshape.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
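The reduction template just removed relies on a standard axis decomposition: with dims stored innermost-first, any tensor is viewed as loopOuter x len x loopInner, and the reduced axis is the middle factor. A small standalone example of that index arithmetic (illustrative only):

    #include <cstdio>

    int main()
    {
        unsigned dims[3] = {4, 3, 2};  // innermost first, matching inputDesc.dims
        int axis = 1;                  // reduce over the length-3 middle axis
        unsigned loopInner = 1, loopOuter = 1, len = dims[axis];
        for (int i = 0; i < axis; i++) {
            loopInner *= dims[i];
        }
        for (unsigned i = axis + 1; i < 3; i++) {
            loopOuter *= dims[i];
        }
        // element (o, k, j) lives at flat offset (o * len + k) * loopInner + j,
        // which is exactly the &input[(i * len + k) * loopInner] indexing above
        printf("loopOuter=%u len=%u loopInner=%u\n", loopOuter, len, loopInner);
        return 0;
    }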
- - -#include <string.h> - -#include "cpu/arm/tensor_computing_arm.h" - -EE reshape_arm(TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output) -{ - if (nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - if (tensorNumElements(inputDesc) != tensorNumElements(outputDesc)) { - // Only allow the removal of padded convolution channels - CHECK_REQUIREMENT(DF_NCHWC8 == inputDesc.df); - CHECK_REQUIREMENT(4 == inputDesc.nDims); - CHECK_REQUIREMENT(1 == inputDesc.dims[1] && 1 == inputDesc.dims[0]); - inputDesc.df = DF_NCHW; - } - if (DF_NCHWC8 != inputDesc.df) { - memcpy(output, input, tensorNumBytes(outputDesc)); - } else { - DataType idt; - DataFormat idf; - U32 in, ic, ih, iw; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - - U32 elementBytes = bytesOf(idt); - ic /= 8; - U8 *inPtr = (U8*)input; - U8 *outPtr = (U8*)output; - for (U32 n = 0; n < in; n++) { - for (U32 c = 0; c < ic; c++) { - for (U32 hw = 0; hw < ih*iw; hw++) { - for (U32 c8 = 0; c8 < 8; c8++) { - memcpy(outPtr + elementBytes * (n*ic*8*ih*iw + (c*8 + c8)*ih*iw + hw), - inPtr + elementBytes * (n*ic*ih*iw*8 + c*ih*iw*8 + hw*8 + c8), - elementBytes); - } - } - } - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/scale.cpp b/tensor_computing/src/cpu/arm/scale.cpp deleted file mode 100644 index 52a96c5c..00000000 --- a/tensor_computing/src/cpu/arm/scale.cpp +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- - -#include "cpu/arm/tensor_computing_arm.h" -#ifdef _USE_FP32 -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif - -EE scale_arm(TensorDesc inputDesc, void* input, - I32 axis, void *alpha, void *beta, - TensorDesc outputDesc, void* output) -{ - UNUSED(outputDesc); - U32 length = tensorNumElements(inputDesc); - axis = (axis + inputDesc.nDims) % inputDesc.nDims; - I32 in = inputDesc.dims[inputDesc.nDims - 1]; - I32 ic = inputDesc.dims[inputDesc.nDims - 1 - axis]; - I32 elements_per_channel = length / (in * ic); - if (inputDesc.df == DF_NCHWC8) - axis = inputDesc.nDims; - EE ret = SUCCESS; - switch (inputDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = scale_fp32((F32*)input, - axis, inputDesc.nDims, (F32*)alpha, (F32*)beta, in, ic, elements_per_channel, - (F32*)output); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = scale_fp16((F16*)input, - axis, inputDesc.nDims, (F16*)alpha, (F16*)beta, in, ic, elements_per_channel, - (F16*)output); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - - return ret; -} diff --git a/tensor_computing/src/cpu/arm/slice.cpp b/tensor_computing/src/cpu/arm/slice.cpp deleted file mode 100644 index 106ea87a..00000000 --- a/tensor_computing/src/cpu/arm/slice.cpp +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
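As a plain-C++ reference for the scale_arm wrapper removed above (which only normalizes the axis and dispatches to NEON kernels): per-channel alpha and beta are broadcast across each channel's elements_per_channel values. A minimal NCHW sketch, illustrative only; the DF_NCHWC8 path interleaves channels in blocks of 8 and is handled differently by the real kernels:

    #include <cstddef>

    // out = alpha[c] * in + beta[c], broadcast over each channel's elements
    static void scale_reference(const float *in, const float *alpha, const float *beta,
        int batch, int channels, int elements_per_channel, float *out)
    {
        for (int n = 0; n < batch; n++) {
            for (int c = 0; c < channels; c++) {
                float a = (alpha != nullptr) ? alpha[c] : 1.0f;
                float b = (beta != nullptr) ? beta[c] : 0.0f;
                for (int i = 0; i < elements_per_channel; i++) {
                    size_t idx = ((size_t)n * channels + c) * (size_t)elements_per_channel + i;
                    out[idx] = a * in[idx] + b;
                }
            }
        }
    }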
- - -#include <string.h> -#include <vector> -#include "cpu/arm/tensor_computing_arm.h" - - -EE slice_arm(TensorDesc inputDesc, void* input, - int axis, - std::vector<TensorDesc> outputDesc, std::vector<void*>* output) -{ - if (nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - U32 num = outputDesc.size(); - if (num < 1) return NOT_MATCH; - - int dim = inputDesc.nDims; - axis = (axis + dim) % dim; - axis = dim - 1 - axis; - U32 tileSize = bytesOf(inputDesc.dt); - for (I32 i = 0; i < axis; i++) { - tileSize *= inputDesc.dims[i]; - } - U32 loops = 1; - for (I32 i = axis + 1; i < dim; i++) { - loops *= inputDesc.dims[i]; - } - - if (inputDesc.df == DF_NCHWC8) { - if (axis < 2) { - tileSize *= 8; - loops /= 8; - } - } - - U8 *ptr = (U8 *)input; - for (U32 i = 0; i < loops; i++) { - for (U32 j = 0; j < num; j++) { - U32 blockSize = outputDesc[j].dims[axis] * tileSize; - if (blockSize > 0 && nullptr == (*output)[j]) - CHECK_STATUS(NULL_POINTER); - U8* dstPtr = (U8*)((*output)[j]) + i * blockSize; - memcpy(dstPtr, ptr, blockSize); - ptr += blockSize; - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/softmax.cpp b/tensor_computing/src/cpu/arm/softmax.cpp deleted file mode 100644 index 9a6f06a8..00000000 --- a/tensor_computing/src/cpu/arm/softmax.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/arm/tensor_computing_arm.h" -#ifdef _USE_FP32 -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif - - -EE softmax_arm(TensorDesc inputDesc, const void* input, - int axis, - TensorDesc outputDesc, void* output) -{ - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = softmax_fp32(inputDesc, (const F32*)input, axis, outputDesc, (F32*)output); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = softmax_fp16(inputDesc, (const F16*)input, axis, outputDesc, (F16*)output); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - - return ret; -} diff --git a/tensor_computing/src/cpu/arm/split.cpp b/tensor_computing/src/cpu/arm/split.cpp deleted file mode 100644 index 682ad152..00000000 --- a/tensor_computing/src/cpu/arm/split.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include <string.h> -#include <vector> - -#include "cpu/arm/tensor_computing_arm.h" - -EE split_arm(TensorDesc inputDesc, void* input, - std::vector<TensorDesc> outputDesc, std::vector<void*>* output) -{ - UNUSED(inputDesc); - if (nullptr == input) - CHECK_STATUS(NULL_POINTER); - if(outputDesc.size() <= 1) return NOT_MATCH; - - for(U32 i = 0; i < (*output).size(); i++) { - if (nullptr == (*output)[i]) - CHECK_STATUS(NULL_POINTER); - memcpy((*output)[i], input, tensorNumBytes(outputDesc[i])); - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/arm/tensor_computing_arm.h b/tensor_computing/src/cpu/arm/tensor_computing_arm.h deleted file mode 100644 index 12836c8a..00000000 --- a/tensor_computing/src/cpu/arm/tensor_computing_arm.h +++ /dev/null @@ -1,188 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
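Worth noting the contrast between the two ops deleted above: slice_arm partitions one buffer along an axis, while split_arm gives every consumer a full copy of the input. A toy standalone illustration (illustrative only):

    #include <cstring>
    #include <vector>

    int main()
    {
        float in[6] = {0, 1, 2, 3, 4, 5};
        // slice: partition the length-6 axis into a 2-element and a 4-element piece
        float a[2], b[4];
        memcpy(a, in, sizeof(a));
        memcpy(b, in + 2, sizeof(b));
        // split: every output receives a full copy of the input
        std::vector<float> c(6), d(6);
        memcpy(c.data(), in, sizeof(in));
        memcpy(d.data(), in, sizeof(in));
        return 0;
    }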
- - -#ifndef _H_TENSOR_COMPUTING_ARM -#define _H_TENSOR_COMPUTING_ARM - -#include <vector> - -#include "sys.h" -#include "tensor_desc.h" -#include "tensor_computing_type.h" - -EE activation_arm(TensorDesc inputDesc, void* input, ActivationDesc activationDesc, TensorDesc outputDesc, void* output); - -EE attention_arm(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output); - -EE clip_arm(void *minValue, void *maxValue, TensorDesc inputDesc, void* input, TensorDesc outputDesc, void *output); - -EE concat_arm(std::vector<TensorDesc> inputDesc, std::vector<void*> input, void* inputScale, - TensorDesc outputDesc, void* output, void* outputScale, int axis); - -EE convolution_infer_forward_algorithm_arm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionPolicy policy, ConvolutionForwardAlgorithm *algorithm, DataType targetDataType); - -EE convolution_transform_filter_bytes_arm(TensorDesc filterDesc, ConvolutionForwardAlgorithm algorithm, U32* bytes); - -EE convolution_transform_filter_arm(TensorDesc filterDesc, const void* filter, - ConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, void* filterTransformed); - -EE convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes); - -EE convolution_arm(TensorDesc inputDesc, void* input, - TensorDesc filterDesc, const void* filter, - ConvolutionDesc convDesc, - ConvolutionForwardAlgorithm algorithm, - TensorDesc scaleDesc, const void* scale, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, void* output, - ActivationDesc activationDesc, - Arch arch); - -EE deconvolution_infer_forward_algorithm_arm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionPolicy policy, ConvolutionForwardAlgorithm *algorithm, DataType targetDataType); - -EE deconvolution_transform_filter_bytes_arm(TensorDesc filterDesc, ConvolutionForwardAlgorithm algorithm, U32* bytes); - -EE deconvolution_transform_filter_arm(TensorDesc filterDesc, const void* filter, - ConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, void* filterTransformed); - -EE deconvolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes); - -EE deconvolution_arm(TensorDesc inputDesc, void* input, - TensorDesc filterDesc, const void* filter, - ConvolutionDesc convDesc, - ConvolutionForwardAlgorithm algorithm, - TensorDesc scaleDesc, const void* scale, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, void* output, - ActivationDesc activationDesc, - Arch arch); - -EE depthwise_convolution_infer_forward_algorithm_arm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionPolicy policy, DepthwiseConvolutionForwardAlgorithm *algorithm, DataType targetDataType); - -EE depthwise_convolution_transform_filter_bytes_arm(TensorDesc filterDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32* bytes); - -EE depthwise_convolution_transform_filter_arm(TensorDesc filterDesc, const void* filter, - DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, void* filterTransformed); - -EE depthwise_convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, -
ConvolutionDesc convDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32 *bytes); - -EE depthwise_convolution_arm(TensorDesc inputDesc, void* input, - TensorDesc filterDesc, const void* filter, - ConvolutionDesc convDesc, - DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, void* output, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc, - Arch arch); - -EE detectionoutput_qsort_descent_arm(std::vector<BoxRect>& boxes, std::vector<F32>& scores, int left, int right); - -F32 detectionoutput_intersectionarea_arm(BoxRect a, BoxRect b); - -EE detectionoutput_nms_pickedboxes_arm(std::vector<BoxRect> boxes, std::vector<I32>& picked, F32 nms_threshold); - -EE detectionoutput_arm(std::vector<TensorDesc> inputDesc, std::vector<void*> input, DetectionOutputDesc detectionoutputDesc, TensorDesc outputDesc, void* output); - -EE eltwise_arm(std::vector<TensorDesc> inputDesc, std::vector<void*> input, - TensorDesc outputDesc, void* output, EltwiseMode eltwiseMode); - -EE lstm_transform_filter_arm(TensorDesc filterDesc, const void* filter, LSTMDesc lstmDesc, TensorDesc *ftmDesc, void* ftm); - -EE lstm_transform_filter_bytes_arm(TensorDesc filterDesc, LSTMDesc lstmDesc, U32* bytes); - -EE lstm_arm(TensorDesc inputDesc, const void* input, - TensorDesc filterDesc, const void* filter, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - LSTMDesc lstmDesc, - TensorDesc outputDesc, void* output, - Arch arch); - -EE lstmcell_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, LSTMDesc lstmDesc, U32 *bytes, Arch arch); - -EE lstmcell_arm(TensorDesc xDesc, const void* currentX, - TensorDesc filterDesc, const void* filter, - TensorDesc biasDesc, const void* bias, - void *state, - U32 tmpBytes, void *tmp, - LSTMDesc lstmDesc, U32 batchStrideX, U32 batchStrideH, - TensorDesc hDesc, void* currentH, - Arch arch); - -EE lstm_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, LSTMDesc lstmDesc, U32 *bytes, Arch arch); - -EE multiply_arm(void *alpha, void *beta, TensorDesc inputDesc, void* input, TensorDesc outputDesc, void *output); - -EE layer_normalization_arm(void *alpha, void *beta, - TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output); - -EE pooling_arm(TensorDesc inputDesc, const void* input, PoolingDesc poolingDesc, const void* scale, TensorDesc outputDesc, void* output); - -EE priorbox_arm(std::vector<TensorDesc> inputDesc, PriorBoxDesc priorboxDesc, TensorDesc outputDesc, void* output); - -EE reshape_arm(TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output); - -EE scale_arm(TensorDesc inputDesc, void* input, I32 axis, void *alpha, void *beta, TensorDesc outputDesc, void* output); - -EE slice_arm(TensorDesc inputDesc, void* input, int axis, - std::vector<TensorDesc> outputDesc, std::vector<void*>* output); - -EE softmax_arm(TensorDesc inputDesc, const void* input, - int axis, - TensorDesc outputDesc, void* output); - -EE split_arm(TensorDesc inputDesc, void* input, - std::vector<TensorDesc> outputDesc, std::vector<void*>* output); - -EE transpose_arm(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output, U32 *dim); - -EE quantize_tensor_arm(TensorDesc dDesc, const void* data, TensorDesc* qDesc, void* qData, void *scale); - -EE argmax_arm(TensorDesc inputDesc, const void* input, - I32 axis, - TensorDesc outputDesc, void* output); - -EE reduction_arm(TensorDesc inputDesc, const void* input, - TensorDesc maskDesc, const void* mask, - I32 axis,
- ReductionMode reductionMode, - float coeff, - TensorDesc outputDesc, void* output); - -EE check_arm(TensorDesc inputDescA, const void* inputA, - TensorDesc inputDescB, const void* inputB, - CheckMode checkMode, - TensorDesc outputDesc, void* output); - -EE attention_mask_arm(TensorDesc inputDesc, const void* input, - I32 attentionLength, bool sameLength, float mask, - TensorDesc outputDesc, void* output); - -EE padding_arm(TensorDesc inputDesc, const void* input, PadDesc padDesc, TensorDesc outputDesc, void* output); -#endif diff --git a/tensor_computing/src/cpu/arm/transpose.cpp b/tensor_computing/src/cpu/arm/transpose.cpp deleted file mode 100644 index cedae053..00000000 --- a/tensor_computing/src/cpu/arm/transpose.cpp +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
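Every *_arm entry point in the header removed above follows the same dispatch idiom: switch on DataType and route to a precision-specific kernel that is only compiled in under its feature macro. A self-contained sketch of the pattern, with stand-in types defined locally (in Bolt they come from sys.h and tensor_desc.h), and a hypothetical op_fp32 kernel:

    #include <cstdio>

    enum EE { SUCCESS, NOT_SUPPORTED };
    enum DataType { DT_F32, DT_F16 };

    static EE op_fp32(float *data) { data[0] += 1.0f; return SUCCESS; }  // hypothetical kernel

    static EE op_arm(DataType dt, void *data)
    {
        EE ret = SUCCESS;
        switch (dt) {
    #ifdef _USE_FP32
            case DT_F32:
                ret = op_fp32((float *)data);
                break;
    #endif
            default:
                ret = NOT_SUPPORTED;  // precision not compiled into this build
                break;
        }
        return ret;
    }

    int main()
    {
        float x[1] = {0};
        printf("%d\n", op_arm(DT_F32, x));  // NOT_SUPPORTED unless built with -D_USE_FP32
        return 0;
    }

The macro guards are what let the same sources build FP32-only (armv7), FP16-capable (armv8.2) or INT8-capable binaries without dead kernel code.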
- - -#include <string.h> - -#include "cpu/arm/tensor_computing_arm.h" - -EE transpose_arm(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output, U32 *dim) { - if (nullptr == input || nullptr == output || nullptr == dim) - CHECK_STATUS(NULL_POINTER); - - I32 inputDim = inputDesc.nDims; - I32 outputDim = outputDesc.nDims; - CHECK_REQUIREMENT(inputDim == outputDim); - - U32 size_inner = 1; - I32 size_inner_index = 0; - for (I32 i = inputDim-1; i >= 0; i--) { - if ((I32)dim[i] == i) { - size_inner *= inputDesc.dims[inputDim-1-i]; - size_inner_index++; - } else { - break; - } - } - U32 inputSize = tensorNumElements(inputDesc) / size_inner; - U32 outputSize = tensorNumElements(outputDesc) / size_inner; - CHECK_REQUIREMENT(inputSize == outputSize); - - std::vector<U32> inputLocalIndex(inputDim); - U8 *input_ptr = (U8 *)input; - U8 *output_ptr = (U8 *)output; - for (U32 i = 0; i < outputSize; i++) { - U32 outputIndex = i; - for (I32 j = size_inner_index; j < outputDim; j++) { - U32 value = outputIndex % outputDesc.dims[j]; - outputIndex /= outputDesc.dims[j]; - inputLocalIndex[inputDim - 1 - dim[outputDim - 1 - j]] = value; - } - U32 inputIndex = 0; - for (I32 j = inputDim-1; j > size_inner_index; j--) { - inputIndex = (inputIndex + inputLocalIndex[j]) * inputDesc.dims[j-1]; - } - inputIndex += inputLocalIndex[size_inner_index]; - memcpy(output_ptr+i*size_inner*bytesOf(outputDesc.dt), - input_ptr+inputIndex*size_inner*bytesOf(inputDesc.dt), - size_inner*bytesOf(inputDesc.dt)); - } - - return SUCCESS; -} - -EE transpose_naive(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output, U32 *dim) { - if (nullptr == input || nullptr == output || nullptr == dim) - CHECK_STATUS(NULL_POINTER); - - U32 inputDim = inputDesc.nDims; - U32 outputDim = outputDesc.nDims; - CHECK_REQUIREMENT(inputDim == outputDim); - - U32 outputSize = tensorNumElements(outputDesc); - CHECK_REQUIREMENT(inputDim == outputDim); - std::vector<U32> inputLocalIndex(inputDim); - U8 *input_ptr = (U8 *)input; - U8 *output_ptr = (U8 *)output; - for (U32 i = 0; i < outputSize; i++) { - U32 outputIndex = i; - for (U32 j = 0; j < outputDim; j++) { - U32 value = outputIndex % outputDesc.dims[j]; - outputIndex /= outputDesc.dims[j]; - inputLocalIndex[inputDim - 1 - dim[outputDim - 1 - j]] = value; - } - U32 inputIndex = 0; - for (U32 j = inputDim-1; j > 0; j--) { - inputIndex = (inputIndex + inputLocalIndex[j]) * inputDesc.dims[j-1]; - } - inputIndex += inputLocalIndex[0]; - memcpy(output_ptr+i*bytesOf(outputDesc.dt), input_ptr+inputIndex*bytesOf(inputDesc.dt), bytesOf(inputDesc.dt)); - } - - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/general/activation.cpp b/tensor_computing/src/cpu/general/activation.cpp deleted file mode 100644 index 88e4693c..00000000 --- a/tensor_computing/src/cpu/general/activation.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/general/tensor_computing_general.h" -#include "cpu/general/general_functions.h" - -EE activation_general(TensorDesc inputDesc, void* input, ActivationDesc activationDesc, TensorDesc outputDesc, void* output) -{ - if (nullptr == input || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - U32 len = tensorNumElements(inputDesc); - CHECK_REQUIREMENT(len == tensorNumElements(outputDesc)); - for (U32 i = 0; i < len; i++) { - switch (idt) { -#ifdef _USE_FP16 - case DT_F16: { - F16* inPtr = (F16 *)input; - F16* outPtr = (F16 *)output; - CHECK_STATUS(activation(activationDesc, inPtr[i], &outPtr[i])); - break; - } -#endif -#ifdef _USE_FP32 - case DT_F32: { - F32* inPtr = (F32 *)input; - F32* outPtr = (F32 *)output; - CHECK_STATUS(activation(activationDesc, inPtr[i], &outPtr[i])); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - } - return ret; -} diff --git a/tensor_computing/src/cpu/general/argmax.cpp b/tensor_computing/src/cpu/general/argmax.cpp deleted file mode 100644 index 8f8653a2..00000000 --- a/tensor_computing/src/cpu/general/argmax.cpp +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include -#include "cpu/general/tensor_computing_general.h" - - -template <typename T> -U32 array_argmax(const T* input, U32 len, U32 stride) { - U32 index = 0; - U32 j = stride; - for (U32 i = 1; i < len; i++, j+=stride) { - if(input[j] > input[index]) - index = j; - } - return index / stride; -} - -template <typename T> -EE argmax(TensorDesc inputDesc, const T* input, - I32 axis, - TensorDesc outputDesc, U32* output) -{ - UNUSED(outputDesc); - - if (nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - if (axis < 0) - axis = inputDesc.nDims + axis; - axis = inputDesc.nDims - 1 - axis; - U32 loopInner = 1; - for (int i = 0; i < axis; i++) { - loopInner *= inputDesc.dims[i]; - } - U32 loopOuter = 1; - for (U32 i = axis+1; i < inputDesc.nDims; i++) { - loopOuter *= inputDesc.dims[i]; - } - - U32 len = inputDesc.dims[axis]; - for (U32 i = 0; i < loopOuter; i++) { - for (U32 j = 0; j < loopInner; j++) { - const T* array = input + i * (len * loopInner) + j; - output[i*loopInner+j] = array_argmax(array, len, loopInner); - } - } - return SUCCESS; -} - -EE argmax_general(TensorDesc inputDesc, const void* input, - I32 axis, - TensorDesc outputDesc, void* output) -{ - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { -#ifdef _USE_FP16 - case DT_F16: { - ret = argmax(inputDesc, (const F16*)input, axis, outputDesc, (U32*)output); - break; - } -#endif -#ifdef _USE_FP32 - case DT_F32: { - ret = argmax(inputDesc, (const F32*)input, axis, outputDesc, (U32*)output); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - - return ret; -} diff --git a/tensor_computing/src/cpu/general/attention.cpp b/tensor_computing/src/cpu/general/attention.cpp deleted file mode 100644 index db10ffb0..00000000 --- a/tensor_computing/src/cpu/general/attention.cpp +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
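The deleted array_argmax walks memory with a stride equal to loopInner, so the same scan serves any reduced axis; dividing the winning flat offset by the stride recovers the index along that axis. A standalone example (illustrative only):

    #include <cstdio>

    int main()
    {
        float m[6] = {1, 9, 2,   // row 0 of a 2x3 row-major matrix
                      7, 4, 8};  // row 1
        unsigned stride = 3, len = 2;  // argmax over rows, scanning column 0
        unsigned index = 0, j = stride;
        for (unsigned i = 1; i < len; i++, j += stride) {
            if (m[j] > m[index]) {
                index = j;  // remember the flat offset of the current maximum
            }
        }
        printf("argmax of column 0 = %u\n", index / stride);  // prints 1 (value 7)
        return 0;
    }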
- - -#include "cpu/general/tensor_computing_general.h" -#include "cpu/general/general_functions.h" - -template -EE attention(U32 batch, U32 numHeads, U32 fromSequenceLength, U32 toSequenceLength, const T *input, T *output) -{ - if (nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - T minValue = -10000.0; - U32 count = array_sum(input, toSequenceLength); - U32 valid = UNI_MIN(count, fromSequenceLength); - for (U32 n = 0; n < batch; n++) { - for (U32 i = 0; i < numHeads; i++) { - for (U32 j = 0; j < valid; j++) { - for (U32 k = 0; k < toSequenceLength; k++) { - T value = input[n*toSequenceLength + k]; - U32 index = (((n * numHeads + i)*fromSequenceLength + j)*toSequenceLength + k); - output[index] = (1 - value) * minValue; - } - } - for (U32 j = valid; j < fromSequenceLength; j++) { - for (U32 k = 0; k < toSequenceLength; k++) { - U32 index = (((n * numHeads + i)*fromSequenceLength + j)*toSequenceLength + k); - output[index] = minValue; - } - } - } - } - return SUCCESS; -} - -EE attention_general(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output) -{ - DataType dt; - DataFormat df; - U32 batch, numHeads, fromSequenceLength, toSequenceLength; - CHECK_REQUIREMENT(tensorIs2d(inputDesc)); - CHECK_REQUIREMENT(tensorIs4d(outputDesc)); - CHECK_STATUS(tensor4dGet(outputDesc, &dt, &df, &batch, &numHeads, &fromSequenceLength, &toSequenceLength)); - - EE ret = SUCCESS; - switch (dt) { -#ifdef _USE_FP16 - case DT_F16: { - ret = attention(batch, numHeads, fromSequenceLength, toSequenceLength, (const F16*)input, (F16*)output); - break; - } -#endif -#ifdef _USE_FP32 - case DT_F32: { - ret = attention(batch, numHeads, fromSequenceLength, toSequenceLength, (const F32*)input, (F32*)output); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/cpu/general/attention_mask.cpp b/tensor_computing/src/cpu/general/attention_mask.cpp deleted file mode 100644 index cf4efbb0..00000000 --- a/tensor_computing/src/cpu/general/attention_mask.cpp +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include <string.h> -#include "cpu/general/tensor_computing_general.h" - - -template <typename T> -EE attention_mask(TensorDesc inputDesc, const T* input, - I32 attentionLength, bool sameLength, float maskValue, - TensorDesc outputDesc, T* output) -{ - UNUSED(outputDesc); - if (nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - int qlen = inputDesc.dims[1]; - int klen = inputDesc.dims[0]; - int mlen = klen - qlen; - std::vector<std::vector<T>> mask; - if (attentionLength < 0) { - mask = std::vector<std::vector<T>>(qlen, std::vector<T>(klen, 0)); - } else { - mask = std::vector<std::vector<T>>(qlen, std::vector<T>(klen, 1)); - for (int i = 0; i < qlen; i++) { - int start, loops; - if (attentionLength > 0) { - int end = mlen + i; - start = UNI_MAX(end - attentionLength, 0); - loops = end - start + 1; - } else { - if (sameLength) { - start = i; - loops = qlen + 1; - } else { - start = 0; - loops = i + qlen + 1; - } - } - loops = UNI_MAX(loops, 0); - start = UNI_MIN(start, klen); - if (start + loops > klen) - loops = UNI_MAX(klen - start, 0); - memset(&mask[i][start], 0, sizeof(T)*loops); - } - } - I32 loops = tensorNumElements(inputDesc) / qlen / klen; - for (int i = 0, index = 0; i < loops; i++) { - for (int j = 0; j < qlen; j++) { - for (int k = 0; k < klen; k++) { - output[index] = input[index] * (1 - mask[j][k]) - maskValue * mask[j][k]; - index++; - } - } - } - return SUCCESS; -} - -EE attention_mask_general(TensorDesc inputDesc, const void* input, - I32 attentionLength, bool sameLength, float mask, - TensorDesc outputDesc, void* output) -{ - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = attention_mask(inputDesc, (const F32*)input, - attentionLength, sameLength, mask, outputDesc, (F32*)output); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = attention_mask(inputDesc, (const F16*)input, - attentionLength, sameLength, mask, outputDesc, (F16*)output); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - - return ret; -} diff --git a/tensor_computing/src/cpu/general/check.cpp b/tensor_computing/src/cpu/general/check.cpp deleted file mode 100644 index 3b17ab43..00000000 --- a/tensor_computing/src/cpu/general/check.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
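The masking arithmetic in the attention_mask kernel removed above is worth spelling out: positions with mask == 1 are replaced by -maskValue (a large negative number) so that a following softmax drives them to ~0, while mask == 0 positions pass through unchanged. A standalone illustration:

    #include <cstdio>

    int main()
    {
        float maskValue = 10000.0f;
        float input[4] = {0.5f, 1.5f, -0.2f, 3.0f};
        int mask[4] = {0, 1, 0, 1};
        for (int i = 0; i < 4; i++) {
            // same formula as the deleted kernel's inner loop
            float out = input[i] * (1 - mask[i]) - maskValue * mask[i];
            printf("%g ", out);  // prints: 0.5 -10000 -0.2 -10000
        }
        printf("\n");
        return 0;
    }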
- - -#include -#include "cpu/general/tensor_computing_general.h" - -template <typename T> -EE check(TensorDesc inputDescA, const T* inputA, - TensorDesc inputDescB, const T* inputB, - CheckMode checkMode, - TensorDesc outputDesc, I32* output) -{ - UNUSED(inputDescB); - UNUSED(outputDesc); - - if (nullptr == inputA || nullptr == inputB || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims-1]; - U32 loopInner = size / loopOuter; - - for (U32 i = 0; i < loopOuter; i++) { - U32 count = 0; - for (U32 j = 0; j < loopInner; j++) { - U32 index = i * loopInner + j; - switch (checkMode) { - case CHECK_EQUAL: { - if (inputA[index] == inputB[index]) - count ++; - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - } - switch (checkMode) { - case CHECK_EQUAL: { - if (count == loopInner) - output[i] = 1; - else - output[i] = 0; - break; - } - default: - break; - } - } - return SUCCESS; -} - -EE check_general(TensorDesc inputDescA, const void* inputA, - TensorDesc inputDescB, const void* inputB, - CheckMode checkMode, - TensorDesc outputDesc, void* output) -{ - DataType idt = inputDescA.dt; - EE ret = SUCCESS; - switch (idt) { -#ifdef _USE_FP16 - case DT_F16: { - ret = check(inputDescA, (const F16*)inputA, - inputDescB, (const F16*)inputB, - checkMode, outputDesc, (I32*)output); - break; - } -#endif -#ifdef _USE_FP32 - case DT_F32: { - ret = check(inputDescA, (const F32*)inputA, - inputDescB, (const F32*)inputB, - checkMode, outputDesc, (I32*)output); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - - return ret; -} diff --git a/tensor_computing/src/cpu/general/clip.cpp b/tensor_computing/src/cpu/general/clip.cpp deleted file mode 100644 index 62c128b5..00000000 --- a/tensor_computing/src/cpu/general/clip.cpp +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/general/tensor_computing_general.h" - -template <typename T> -EE clip(T* input, T* output, U32 len, F32 min_value, F32 max_value) { - if (nullptr == input - || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - for (U32 i = 0; i < len; i++) { - F32 value = input[i]; - value = (value > min_value) ? value : min_value; - value = (value < max_value) ?
value : max_value; - output[i] = value; - } - return SUCCESS; -} - -EE clip_general(void *minValue, void *maxValue, TensorDesc inputDesc, void* input, TensorDesc outputDesc, void *output) -{ - UNUSED(outputDesc); - - if (nullptr == minValue - || nullptr == maxValue) - CHECK_STATUS(NULL_POINTER); - - EE ret = SUCCESS; - switch (inputDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = clip((F32 *)input, (F32 *)output, tensorNumElements(inputDesc), *((F32 *)minValue), *((F32 *)maxValue)); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = clip((F16 *)input, (F16 *)output, tensorNumElements(inputDesc), *((F32 *)minValue), *((F32 *)maxValue)); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/cpu/general/concat.cpp b/tensor_computing/src/cpu/general/concat.cpp deleted file mode 100644 index 2dd28966..00000000 --- a/tensor_computing/src/cpu/general/concat.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include -#include "cpu/general/tensor_computing_general.h" - -EE concat_general(std::vector inputDesc, std::vector input, TensorDesc outputDesc, void* output, int axis) -{ - if (nullptr == output) - CHECK_STATUS(NULL_POINTER); - U32 num = inputDesc.size(); - if (num < 1) return NOT_MATCH; - - int dim = outputDesc.nDims; - axis = (axis + dim) % dim; - axis = dim - 1 - axis; - U32 tileSize = bytesOf(outputDesc.dt); - for (I32 i = 0; i < axis; i++) { - tileSize *= outputDesc.dims[i]; - } - U32 loops = 1; - for (I32 i = axis + 1; i < dim; i++) { - loops *= outputDesc.dims[i]; - } - - if (outputDesc.df == DF_NCHWC8) { - if (axis < 2) { - tileSize *= 8; - loops /= 8; - } - } - - U8 *ptr = (U8 *)output; - for (U32 i = 0; i < loops; i++) { - for (U32 j = 0; j < num; j++) { - if (nullptr == input[j]) - CHECK_STATUS(NULL_POINTER); - U32 blockSize = inputDesc[j].dims[axis] * tileSize; - U8* srcPtr = (U8*)((input)[j]) + i * blockSize; - memcpy(ptr, srcPtr, blockSize); - ptr += blockSize; - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/general/convolution.cpp b/tensor_computing/src/cpu/general/convolution.cpp deleted file mode 100644 index 4a7c1f85..00000000 --- a/tensor_computing/src/cpu/general/convolution.cpp +++ /dev/null @@ -1,243 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "cpu/general/tensor_computing_general.h" -#include "cpu/general/general_functions.h" - -template -inline EE convolution(TensorDesc inputDesc, T1* inArray, - TensorDesc filterDesc, const T2* filterArray, - ConvolutionDesc convDesc, - const T3* biasArray, - const T4* scaleArray, - TensorDesc outputDesc, T4* outArray, - ActivationDesc activationDesc, - T1 paddingValue=0) -{ - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingL = convDesc.padding_left; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - if (idf == DF_NCHWC8) - CHECK_STATUS(from_nchwc8_to_nchw(&inputDesc, inArray)); - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - if (idf != DF_NCHW) - CHECK_STATUS(NOT_MATCH); - - // For BNN, accumulated values are always 0 or 1, which may lead to error if buf is floating point. 
- std::vector outBuf(tensorNumElements(outputDesc)); - - for (U32 n = 0; n < in; n++) { - for (U32 o = 0; o < oc; o++) { - for (U32 h = 0; h < oh; h++) { - for (U32 w = 0; w < ow; w++) { - U32 o_off = n*oc*oh*ow + o*oh*ow + h*ow + w; - outBuf[o_off] = 0; - for (U32 c = 0; c < ic; c++) { - for (I32 fh_idx = 0; fh_idx < (I32)fh; fh_idx++) { - for (I32 fw_idx = 0; fw_idx < (I32)fw; fw_idx++) { - I32 ih_idx = h * strideH - paddingT + fh_idx*dilateH; - I32 iw_idx = w * strideW - paddingL + fw_idx*dilateW; - U32 f_off = o*ic*fh*fw + c*fh*fw + fh_idx*fw + fw_idx; - if (ih_idx >= 0 && ih_idx < (I32)ih && iw_idx >= 0 && iw_idx < (I32)iw) { - U32 i_off = n*ic*ih*iw + c*ih*iw + ih_idx*iw + iw_idx; - outBuf[o_off] += inArray[i_off] * filterArray[f_off]; - } - else { - outBuf[o_off] += paddingValue * filterArray[f_off]; - } - } - } - } - } - } - } - } - // bias - for (U32 n = 0; n < in; n++) { - for (U32 o = 0; o < oc; o++) { - for (U32 h = 0; h < oh; h++) { - for (U32 w = 0; w < ow; w++) { - U32 o_off = n*oc*oh*ow + o*oh*ow + h*ow + w; - U32 b_off = o; - T4 scale = 1; - if (scaleArray != nullptr) - scale = scaleArray[b_off]; - outArray[o_off] = scale * outBuf[o_off] + biasArray[b_off]; - switch (activationDesc.mode) { - case ACTIVATION_NULL: { - break; - } - case ACTIVATION_RELU: { - F32 tmp = activationDesc.value[0] * outArray[o_off]; - if(outArray[o_off] < tmp) { - outArray[o_off] = tmp; - } - break; - } - case ACTIVATION_SIGMOID: { - outArray[o_off] = 1.0f / (1.0f + exp(-1 * outArray[o_off])); - break; - } - default: - return NOT_SUPPORTED; - } - } - } - } - } - - if (odf == DF_NCHWC8) { - outputDesc.df = DF_NCHW; - CHECK_STATUS(from_nchw_to_nchwc8(&outputDesc, outArray)); - } - return SUCCESS; -} - -#ifdef _USE_FP16 -void bnn_input_process(TensorDesc inputDesc, F16 *input, DataType fdt, short *output) { - F16 centerValue = 0.0; - if (fdt == DT_BIN01) { - centerValue = 0.5; - } - short zeroValue = 0; - if (fdt == DT_BIN11) { - zeroValue = -1; - } - U32 len = tensorNumElements(inputDesc); - for (U32 i = 0; i < len; i++) { - if (input[i] >= centerValue) - output[i] = 1; - else - output[i] = zeroValue; - } -} - -void bnn_filter_process(TensorDesc filterDesc, BIN8 *filter, short *filterTransformed) { - short zeroValue = 0; - if (filterDesc.dt == DT_BIN11) { - zeroValue = -1; - } - U32 len = tensorNumElements(filterDesc); - for (U32 i = 0; i < len; i++) { - U32 bitSlot = i / 8; - U32 bitNo = 7 - (i%8); - std::bitset<8> Q(filter[bitSlot]); - if (Q.test(bitNo)) { - filterTransformed[i] = 1; - } else { - filterTransformed[i] = zeroValue; - } - } -} -#endif - -EE convolution_general(TensorDesc inputDesc, void* input, - TensorDesc filterDesc, const void* filter, - ConvolutionDesc convDesc, - TensorDesc scaleDesc, const void* scale, - TensorDesc biasDesc, const void* bias, - TensorDesc outputDesc, void* output, - ActivationDesc activationDesc) -{ - UNUSED(scaleDesc); - UNUSED(biasDesc); - - EE ret = SUCCESS; - switch (filterDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: - ret = convolution(inputDesc, (F32*)input, - filterDesc, (F32*)filter, - convDesc, - (F32*)bias, - (F32*)scale, - outputDesc, (F32*)output, - activationDesc); - break; -#endif -#ifdef _USE_FP16 - case DT_F16: - ret = convolution(inputDesc, (F16*)input, - filterDesc, (F16*)filter, - convDesc, - (F16*)bias, - (F16*)scale, - outputDesc, (F16*)output, - activationDesc); - break; -#endif -#ifdef _USE_INT8 - case DT_I8: - ret = convolution(inputDesc, (INT8*)input, - filterDesc, (F16*)filter, - convDesc, - (F16*)bias, - (F16*)scale, - outputDesc, 
(F16*)output, - activationDesc); - break; -#endif -#ifdef _USE_FP16 - case DT_BIN01: { - std::vector inputTransformed(tensorNumElements(inputDesc)); - std::vector filterTransformed(tensorNumElements(filterDesc)); - bnn_input_process(inputDesc, (F16*)input, filterDesc.dt, inputTransformed.data()); - bnn_filter_process(filterDesc, (BIN8*)filter, filterTransformed.data()); - ret = convolution(inputDesc, inputTransformed.data(), - filterDesc, filterTransformed.data(), - convDesc, - (F16*)bias, - (F16*)scale, - outputDesc, (F16*)output, - activationDesc, 0); - break; - } - case DT_BIN11: { - std::vector inputTransformed(tensorNumElements(inputDesc)); - std::vector filterTransformed(tensorNumElements(filterDesc)); - bnn_input_process(inputDesc, (F16*)input, filterDesc.dt, inputTransformed.data()); - bnn_filter_process(filterDesc, (BIN8*)filter, filterTransformed.data()); - ret = convolution(inputDesc, inputTransformed.data(), - filterDesc, filterTransformed.data(), - convDesc, - (F16*)bias, - (F16*)scale, - outputDesc, (F16*)output, - activationDesc, -1); - break; - } -#endif - default: - return NOT_SUPPORTED; - } - return ret; -} diff --git a/tensor_computing/src/cpu/general/deconvolution.cpp b/tensor_computing/src/cpu/general/deconvolution.cpp deleted file mode 100644 index 99c7b1e8..00000000 --- a/tensor_computing/src/cpu/general/deconvolution.cpp +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
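Note on convolution.cpp deleted above: it is the direct seven-loop reference convolution used to verify the optimized kernels, with NCHWC8 unpacking on entry and, for DT_BIN01/DT_BIN11, bit-packed operands expanded to 0/1 or ±1 shorts first. A condensed sketch of the core loop nest, assuming a single batch, plain NCHW layout, symmetric zero padding, and hypothetical names:

    #include <vector>

    // Sketch of the direct (reference) convolution loop nest from the deleted
    // kernel: no bias, scale, or activation, zero padding only.
    static std::vector<float> conv2d_reference(const std::vector<float> &in,
        const std::vector<float> &flt, int ic, int ih, int iw, int oc, int fh,
        int fw, int strideH, int strideW, int pad)
    {
        const int oh = (ih + 2 * pad - fh) / strideH + 1;
        const int ow = (iw + 2 * pad - fw) / strideW + 1;
        std::vector<float> out(oc * oh * ow, 0.0f);
        for (int o = 0; o < oc; o++)
            for (int h = 0; h < oh; h++)
                for (int w = 0; w < ow; w++)
                    for (int c = 0; c < ic; c++)
                        for (int fy = 0; fy < fh; fy++)
                            for (int fx = 0; fx < fw; fx++) {
                                int y = h * strideH - pad + fy;
                                int x = w * strideW - pad + fx;
                                if (y < 0 || y >= ih || x < 0 || x >= iw)
                                    continue; // zero padding contributes nothing
                                out[(o * oh + h) * ow + w] +=
                                    in[(c * ih + y) * iw + x] *
                                    flt[((o * ic + c) * fh + fy) * fw + fx];
                            }
        return out;
    }

The deleted kernel additionally lets the caller choose the padding value (the BNN path passes 0 or -1), which the sketch omits for brevity.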
- - -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "cpu/general/tensor_computing_general.h" -#include "cpu/general/general_functions.h" - -template -inline EE deconvolution(TensorDesc inputDesc, T* inArray, - TensorDesc filterDesc, const T* filterArray, - ConvolutionDesc convDesc, - const T* biasArray, - TensorDesc outputDesc, T* outArray, - ActivationDesc activationDesc) -{ - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - - if (ic != fn) { - CHECK_STATUS(NOT_SUPPORTED); - } - - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingL = convDesc.padding_left; - - if (idf == DF_NCHWC8) { - CHECK_STATUS(from_nchwc8_to_nchw(&inputDesc, inArray)); - } - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - if (idf != DF_NCHW) { - CHECK_STATUS(NOT_MATCH); - } - - // initialize outputs to 0 - memset(outArray, 0, tensorNumBytes(outputDesc)); - - for (U32 n = 0; n < in; n++) { - for (U32 o = 0; o < oc; o++) { - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < ih; h++) { - for (U32 w = 0; w < iw; w++) { - U32 i_off = n*ic*ih*iw + c*ih*iw + h*iw + w; - for (I32 fh_idx = 0; fh_idx < (I32)fh; fh_idx++) { - for (I32 fw_idx = 0; fw_idx < (I32)fw; fw_idx++) { - I32 oh_idx = fh_idx + strideH * h - paddingT; - I32 ow_idx = fw_idx + strideW * w - paddingL; - if (oh_idx >= 0 && oh_idx < (I32)oh && ow_idx >= 0 && ow_idx < (I32)ow) { - U32 o_off = n*oc*oh*ow + o*oh*ow + oh_idx*ow + ow_idx; - U32 f_off = c*fc*fh*fw + o*fh*fw + fh_idx*fw + fw_idx; - outArray[o_off] += inArray[i_off] * filterArray[f_off]; - } - } - } - } - } - } - } - } - // bias - for (U32 n = 0; n < in; n++) { - for (U32 o = 0; o < oc; o++) { - for (U32 h = 0; h < oh; h++) { - for (U32 w = 0; w < ow; w++) { - U32 o_off = n*oc*oh*ow + o*oh*ow + h*ow + w; - U32 b_off = o; - outArray[o_off] += biasArray[b_off]; - switch (activationDesc.mode) { - case ACTIVATION_NULL: { - break; - } - case ACTIVATION_RELU: { - F32 tmp = activationDesc.value[0] * outArray[o_off]; - if(outArray[o_off] < tmp) { - outArray[o_off] = tmp; - } - break; - } - default: - return NOT_SUPPORTED; - } - } - } - } - } - - if (odf == DF_NCHWC8) { - outputDesc.df = DF_NCHW; - CHECK_STATUS(from_nchw_to_nchwc8(&outputDesc, outArray)); - } - return SUCCESS; -} - -EE deconvolution_general(TensorDesc inputDesc, void* input, - TensorDesc filterDesc, const void* filter, - ConvolutionDesc convDesc, - TensorDesc scaleDesc, const void* scale, - TensorDesc biasDesc, const void* bias, - TensorDesc outputDesc, void* output, - ActivationDesc activationDesc) -{ - UNUSED(scaleDesc); - UNUSED(scale); - UNUSED(biasDesc); - - EE ret = SUCCESS; - switch (inputDesc.dt) { -#ifdef _USE_FP16 - case DT_F16: - ret = deconvolution(inputDesc, (F16*)input, - filterDesc, (F16*)filter, - convDesc, - (F16*)bias, - outputDesc, (F16*)output, - activationDesc); - break; -#endif -#ifdef _USE_FP32 - case DT_F32: - ret = deconvolution(inputDesc, (F32*)input, - filterDesc, (F32*)filter, - convDesc, - (F32*)bias, - outputDesc, (F32*)output, - activationDesc); - break; -#endif - default: - return NOT_SUPPORTED; - } - return ret; -} diff --git 
a/tensor_computing/src/cpu/general/depthwise_convolution.cpp b/tensor_computing/src/cpu/general/depthwise_convolution.cpp deleted file mode 100644 index 3e828b7e..00000000 --- a/tensor_computing/src/cpu/general/depthwise_convolution.cpp +++ /dev/null @@ -1,172 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "cpu/general/tensor_computing_general.h" -#include "cpu/general/general_functions.h" - -template -inline EE depthwise_convolution(TensorDesc inputDesc, T1* inArray, - TensorDesc filterDesc, const T2* filterArray, - ConvolutionDesc convDesc, - const T3* biasArray, - TensorDesc outputDesc, T3* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc) -{ - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingL = convDesc.padding_left; - bool fuseDepthwisePointwise = (fdf == DF_CHW_NC) ? 
true : false; - - if (idf == DF_NCHWC8) - CHECK_STATUS(from_nchwc8_to_nchw(&inputDesc, inArray)); - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - if (idf != DF_NCHW) - CHECK_STATUS(NOT_MATCH); - - T3* pwArray; - if (fuseDepthwisePointwise) { - pwArray = (T3*)malloc(ic * oh * ow * sizeof(T3)); - memset(pwArray, 0, ic * oh * ow * sizeof(T3)); - } - else { - pwArray = outArray; - } - const T1* filterDwArray = (const T1*)filterArray; - const T1* filterPwArray = (const T1*)filterArray + fh*fw*ic; - for (U32 n = 0; n < in; n++) { - // dw conv - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < oh; h++) { - for (U32 w = 0; w < ow; w++) { - for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { - for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - I32 ih_idx = h * strideH - paddingT + fh_idx; - I32 iw_idx = w * strideW - paddingL + fw_idx; - if (ih_idx >= 0 && ih_idx < (I32)ih && iw_idx >= 0 && iw_idx < (I32)iw) { - pwArray[c*oh*ow + h*ow + w] += - inArray[n*ic*ih*iw + c*ih*iw + ih_idx*iw + iw_idx] * - filterDwArray[c*fh*fw + fh_idx*fw +fw_idx]; - } - } - } - } - } - } - // bias - for (U32 c = 0; c < ic; c++) { - for (U32 hw = 0; hw < oh*ow; hw++) { - U32 pw_off = c*oh*ow + hw; - U32 b_off = c; - pwArray[pw_off] += biasArray[b_off]; - CHECK_STATUS(activation(depthwiseActivationDesc, pwArray[pw_off], &pwArray[pw_off])); - } - } - if (fuseDepthwisePointwise) { - // pw conv - for (U32 o = 0; o < oc; o++) { - for (U32 c = 0; c < ic; c++) { - for (U32 hw = 0; hw < oh*ow; hw++) { - outArray[n*oc*oh*ow + o*oh*ow + hw] += pwArray[c*oh*ow + hw] * - filterPwArray[o*ic + c]; - } - } - } - // bias - for (U32 o = 0; o < oc; o++) { - for (U32 h = 0; h < oh; h++) { - for (U32 w = 0; w < ow; w++) { - U32 o_off = n*oc*oh*ow + o*oh*ow + h*ow + w; - U32 b_off = ic + o; - outArray[o_off] += biasArray[b_off]; - CHECK_STATUS(activation(pointwiseActivationDesc, outArray[o_off], &outArray[o_off])); - } - } - } - } - } - - if(fuseDepthwisePointwise) - free(pwArray); - - if (odf == DF_NCHWC8) { - outputDesc.df = DF_NCHW; - CHECK_STATUS(from_nchw_to_nchwc8(&outputDesc, outArray)); - } - return SUCCESS; -} - -EE depthwise_convolution_general(TensorDesc inputDesc, void* input, - TensorDesc filterDesc, const void* filter, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const void* bias, - TensorDesc outputDesc, void* output, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc) -{ - UNUSED(biasDesc); - - EE ret = SUCCESS; - switch (inputDesc.dt) { -#ifdef _USE_FP16 - case DT_F16: - ret = depthwise_convolution(inputDesc, (F16*)input, - filterDesc, (F16*)filter, - convDesc, - (F16*)bias, - outputDesc, (F16*)output, - depthwiseActivationDesc, - pointwiseActivationDesc); - break; -#endif -#ifdef _USE_INT8 - case DT_I8: - ret = depthwise_convolution(inputDesc, (INT8*)input, - filterDesc, (I32*)filter, - convDesc, - (I32*)bias, - outputDesc, (I32*)output, - depthwiseActivationDesc, - pointwiseActivationDesc); - break; -#endif -#ifdef _USE_FP32 - case DT_F32: - ret = depthwise_convolution(inputDesc, (F32*)input, - filterDesc, (F32*)filter, - convDesc, - (F32*)bias, - outputDesc, (F32*)output, - depthwiseActivationDesc, - pointwiseActivationDesc); - break; -#endif - default: - return NOT_SUPPORTED; - } - return ret; -} diff --git a/tensor_computing/src/cpu/general/detectionoutput.cpp b/tensor_computing/src/cpu/general/detectionoutput.cpp deleted file mode 100644 index 24d03e45..00000000 --- a/tensor_computing/src/cpu/general/detectionoutput.cpp +++ /dev/null @@ -1,243 +0,0 @@ -// 
Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "cpu/general/tensor_computing_general.h" - -inline EE qsort_descent(std::vector& boxes, std::vector& scores, int left, int right) -{ - if (boxes.empty() || scores.empty()) - return NOT_SUPPORTED; - - int i = left; - int j = right; - F32 temp = scores[(left+right) / 2]; - - while (i <= j){ - while(scores[i] > temp) - i++; - while(scores[j] < temp) - j--; - if(i<=j){ - std::swap(boxes[i], boxes[j]); - std::swap(scores[i], scores[j]); - i++; - j--; - } - } - - if (left < j) - qsort_descent(boxes, scores, left, j); - if (i < right) - qsort_descent(boxes, scores, i, right); - - return SUCCESS; -} - -inline F32 intersectionarea(BoxRect a, BoxRect b) -{ - if (a.xmin > b.xmax || a.xmax < b.xmin || a.ymin > b.ymax || a.ymax < b.ymin) - { - return 0.f; - } - F32 inter_width = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin); - F32 inter_height = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin); - - return inter_width * inter_height; -} - -inline EE nms_pickedboxes(std::vector boxes, std::vector& picked, F32 nms_threshold) -{ - I64 n = boxes.size(); - - std::vector areas(n); - for(I64 i = 0; i < n; i++){ - BoxRect box = boxes[i]; - - F32 width = box.xmax - box.xmin; - F32 height = box.ymax - box.ymin; - - areas[i] = width * height; - } - for(I64 i = 0; i < n; i++){ - BoxRect a = boxes[i]; - int keep = 1; - for(int j = 0; j < (int)picked.size(); j++){ - BoxRect b = boxes[picked[j]]; - F32 inter_area = intersectionarea(a,b); - F32 union_area = areas[i] + areas[picked[j]] - inter_area; - - if(inter_area / union_area > nms_threshold) - keep = 0; - } - if(keep){ - picked.push_back(i); - } - } - return SUCCESS; -} - -template -EE detectionoutput(std::vector input, T* output, U32 priorbox_width, U32 num_class, F32 nms_threshold, U32 nms_top_k, U32 keep_top_k, F32 confidence_threshold) -{ - T* location = (T*)input[0]; - T* confidence = (T*)input[1]; - T* priorbox = (T*)input[2]; - - U32 num_total_priorbox = priorbox_width / 4; - U32 numclass = num_class; - - std::vector> boxes; - boxes.resize(num_total_priorbox); - T* variance = priorbox + priorbox_width; - // decode priorbox - for(U32 i = 0 ; i < num_total_priorbox ; i++){ - T* loc = location + i * 4; - T* pb = priorbox + i * 4; - T* var = variance + i * 4; - - F32 pb_w = pb[2] - pb[0]; - F32 pb_h = pb[3] - pb[1]; - F32 pb_cx = 
(pb[0] + pb[2]) * 0.5f; - F32 pb_cy = (pb[1] + pb[3]) * 0.5f; - - F32 box_cx = var[0] * loc[0] * pb_w + pb_cx; - F32 box_cy = var[1] * loc[1] * pb_h + pb_cy; - F32 box_w = static_cast(exp(var[2] * loc[2]) * pb_w); - F32 box_h = static_cast(exp(var[3] * loc[3]) * pb_h); - - std::vector box; - box.resize(4); - box[0] = box_cx - box_w * 0.5f; - box[1] = box_cy - box_h * 0.5f; - box[2] = box_cx + box_w * 0.5f; - box[3] = box_cy + box_h * 0.5f; - // give box to boxes - boxes[i].assign(box.begin(),box.end()); - } - - std::vector> allclass_boxrects; - std::vector> allclass_boxscores; - allclass_boxrects.resize(numclass); - allclass_boxscores.resize(numclass); - - for(U32 i = 1; i < numclass; i++){ - std::vector class_boxrects; - std::vector class_boxscores; - for(U32 j = 0; j < num_total_priorbox; j++){ - - F32 score = confidence[j * numclass + i]; - - if (score > confidence_threshold) - { - std::vector inbox; - inbox.assign(boxes[j].begin(),boxes[j].end()); - BoxRect b = { inbox[0], inbox[1], inbox[2], inbox[3], i }; - class_boxrects.push_back(b); - class_boxscores.push_back(score); - } - } - //sort the boxes with scores - qsort_descent(class_boxrects, class_boxscores, 0, static_cast(class_boxscores.size()-1)); - - if(nms_top_k < (U32)class_boxrects.size()){ - class_boxrects.resize(nms_top_k); - class_boxscores.resize(nms_top_k); - } - //apply nms - std::vector picked; - nms_pickedboxes(class_boxrects, picked, nms_threshold); - - for(I64 j = 0; j < (I64)picked.size(); j++) - { - I64 picked_box = picked[j]; - allclass_boxrects[i].push_back(class_boxrects[picked_box]); - allclass_boxscores[i].push_back(class_boxscores[picked_box]); - } - } - - std::vector boxrects; - std::vector boxscores; - - for (U32 i = 1; i < numclass ; i++) - { - boxrects.insert(boxrects.end(), allclass_boxrects[i].begin(), allclass_boxrects[i].end()); - boxscores.insert(boxscores.end(), allclass_boxscores[i].begin(), allclass_boxscores[i].end()); - } - - qsort_descent(boxrects, boxscores, 0, static_cast(boxscores.size()-1)); - - if (keep_top_k < (U32)boxrects.size()) - { - boxrects.resize(keep_top_k); - boxscores.resize(keep_top_k); - } - - U32 num_detected = static_cast(boxrects.size()); - if (num_detected == 0) - return SUCCESS; - // the first box contains the number of availble boxes in the first element. 
- output[0] = num_detected; - output[1] = output[2] = output[3] = output[4] = output[5] = 0; - - for(U32 i = 0; i < num_detected ; i++){ - BoxRect b = boxrects[i]; - F32 score = boxscores[i]; - - output[(i+1)*6] = b.label; - output[(i+1)*6+1] = score; - output[(i+1)*6+2] = b.xmin; - output[(i+1)*6+3] = b.ymin; - output[(i+1)*6+4] = b.xmax; - output[(i+1)*6+5] = b.ymax; - } - return SUCCESS; -} - -EE detectionoutput_general(std::vector inputDesc, std::vector input, DetectionOutputDesc detectionoutputDesc, TensorDesc outputDesc, void* output) -{ - UNUSED(outputDesc); - if (nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - if (inputDesc.size() != 3) { - CHECK_STATUS(NOT_MATCH); - } - DataType idt0 = inputDesc[0].dt; - U32 ilens2 = inputDesc[2].dims[0]; - U32 numclass = detectionoutputDesc.num_class; - F32 nmsthreshold = detectionoutputDesc.nms_threshold; - U32 nmstopk = detectionoutputDesc.nms_top_k; - U32 keeptopk = detectionoutputDesc.keep_top_k; - F32 confidencethreshold = detectionoutputDesc.confidence_threshold; - EE ret = SUCCESS; - switch (idt0) { -#ifdef _USE_FP32 - case DT_F32: - detectionoutput(input, (F32*)output, ilens2, numclass, nmsthreshold, nmstopk, keeptopk, confidencethreshold); - break; -#endif -#ifdef _USE_FP16 - case DT_F16: - detectionoutput(input, (F16*)output, ilens2, numclass, nmsthreshold, nmstopk, keeptopk, confidencethreshold); - break; -#endif - default: - ret = NOT_SUPPORTED; - } - return ret; -} diff --git a/tensor_computing/src/cpu/general/eltwise.cpp b/tensor_computing/src/cpu/general/eltwise.cpp deleted file mode 100644 index abd819e0..00000000 --- a/tensor_computing/src/cpu/general/eltwise.cpp +++ /dev/null @@ -1,161 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/general/tensor_computing_general.h" - -template -T getFloatScalar(void* input, int inputSize, int index) { - int local = index % inputSize; - return ((T*)input)[local]; -} - -template -EE eltwise_general_kernel(std::vectorinput, std::vector inputSize, - U32 num, U32 len, void *output, EltwiseMode eltwiseMode) -{ - T* output_ptr = (T*)output; - for (U32 i = 0; i < len; i++){ - F32 tmp_s = getFloatScalar(input[0], inputSize[0], i); - for (U32 j = 1; j < num; j++) { - F32 value_s = getFloatScalar(input[j], inputSize[j], i); - switch (eltwiseMode) { - case ELTWISE_SUM: - tmp_s = value_s + tmp_s; - break; - case ELTWISE_MAX: - tmp_s = (value_s > tmp_s) ? 
value_s : tmp_s; - break; - case ELTWISE_PROD: - tmp_s *= value_s; - break; - default: - return NOT_SUPPORTED; - } - } - output_ptr[i] = tmp_s; - } - return SUCCESS; -} - -std::vector calculateLocalIndex_general(U32 index, TensorDesc desc) { - std::vector indexes(desc.nDims); - for (U32 i = 0; i < desc.nDims; i++) { - indexes[i] = index % desc.dims[i]; - index /= desc.dims[i]; - } - return indexes; -} - -U32 calculateGlobalIndex_general(std::vector indexes, TensorDesc desc) { - U32 index = 0; - for (int i = ((int)desc.nDims) - 1; i >= 0; i--) { - index = index * desc.dims[i] + indexes[i]; - } - return index; - -} - -std::vector calculateRelativeLocalIndex_general(std::vector indexes, TensorDesc desc) { - std::vector relativeIndexes(desc.nDims); - for (U32 i = 0; i < desc.nDims; i++) { - relativeIndexes[i] = indexes[i] % desc.dims[i]; - } - return relativeIndexes; -} - -// [1, 10, 10] + [1, 10, 10] = [1, 10, 10] -// [1, 10, 1] + [1, 1, 10] = [1, 10, 10] -// [1, 20, 10] + [10] = [1. 20, 10] + [1, 1, 10] = [1, 20, 10] -EE eltwise_general(std::vector inputDesc, std::vector input, - TensorDesc outputDesc, void* output, EltwiseMode eltwiseMode) -{ - U32 num = inputDesc.size(); - if(num <= 1 || outputDesc.nDims < 1) return NOT_MATCH; - I32 oneCount = 0; - for (int i = 0; i < ((int)outputDesc.nDims)-1; i++) { - if(outputDesc.dims[i] == 1) - oneCount ++; - else - break; - } - TensorDesc newOutputDesc = outputDesc; - for (int i = 0; i < (int)outputDesc.nDims - oneCount; i++) - newOutputDesc.dims[i] = outputDesc.dims[oneCount+i]; - newOutputDesc.nDims = outputDesc.nDims - oneCount; - - std::vector newInputDesc(num); - for (U32 i = 0; i < num; i++) { - newInputDesc[i] = inputDesc[i]; - for (int j = 0; j < (int)inputDesc[i].nDims - oneCount; j++) - newInputDesc[i].dims[j] = inputDesc[i].dims[oneCount+j]; - newInputDesc[i].nDims = inputDesc[i].nDims - oneCount; - for (U32 j = newInputDesc[i].nDims; j < newOutputDesc.nDims; j++) { - newInputDesc[i].dims[j] = 1; - } - newInputDesc[i].nDims = newOutputDesc.nDims; - } - U32 size = tensorNumElements(newOutputDesc); - U32 lastDimSize = newOutputDesc.dims[0]; - std::vector lastDimSizes(num); - for (U32 i = 0; i < num; i++) - lastDimSizes[i] = newInputDesc[i].dims[0]; - for (U32 i = 1; i < newOutputDesc.nDims; i++) { - bool sameDim = true; - for (U32 j = 0; j < num; j++) { - if (newInputDesc[j].dims[i] != newOutputDesc.dims[i]) { - sameDim = false; - break; - } - } - if (sameDim) { - lastDimSize *= newOutputDesc.dims[i]; - for (U32 j = 0; j < num; j++) { - lastDimSizes[j] *= newInputDesc[j].dims[i]; - } - } else { - break; - } - } - - std::vector newInput(num); - EE ret = SUCCESS; - for (U32 i = 0; i < size; i+=lastDimSize) { - std::vector index = calculateLocalIndex_general(i, newOutputDesc); - for (U32 j = 0; j < num; j++) { - std::vector relativeIndex = calculateRelativeLocalIndex_general(index, newInputDesc[j]); - U32 globalIndex = calculateGlobalIndex_general(relativeIndex, newInputDesc[j]); - newInput[j] = (U8*)(input[j]) + globalIndex * bytesOf(newInputDesc[j].dt); - } - U8* newOutput = (U8*)output + i * bytesOf(newOutputDesc.dt); - switch (newOutputDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = eltwise_general_kernel(newInput, lastDimSizes, num, lastDimSize, newOutput, eltwiseMode); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = eltwise_general_kernel(newInput, lastDimSizes, num, lastDimSize, newOutput, eltwiseMode); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - - } - return ret; -} diff --git 
a/tensor_computing/src/cpu/general/general_functions.h b/tensor_computing/src/cpu/general/general_functions.h deleted file mode 100644 index 8d5c8360..00000000 --- a/tensor_computing/src/cpu/general/general_functions.h +++ /dev/null @@ -1,197 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_GENERAL_FUNCTIONS -#define _H_GENERAL_FUNCTIONS -#include -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "math.h" -#include "tensor_computing_type.h" - -template -inline EE from_nchwc8_to_nchw(TensorDesc *desc, T *data) { - if (desc == nullptr || data == nullptr) - CHECK_STATUS(NULL_POINTER); - - DataType idt; - DataFormat idf; - U32 in, ic, ih, iw; - CHECK_STATUS(tensor4dGet(*desc, &idt, &idf, &in, &ic, &ih, &iw)); - if (idf != DF_NCHWC8) - CHECK_STATUS(NOT_MATCH); - - *desc = tensor4df(idt, DF_NCHW, in, ic, ih, iw); - - T *tmp = (T *)malloc(tensorNumBytes(*desc)); - ic /= 8; - for (U32 n = 0; n < in; n++) { - for (U32 c = 0; c < ic; c++) { - for (U32 hw = 0; hw < ih*iw; hw++) { - for (U32 c8 = 0; c8 < 8; c8++) { - tmp[n*ic*8*ih*iw + (c*8 + c8)*ih*iw + hw] = data[n*ic*ih*iw*8 + c*ih*iw*8 + hw*8 + c8]; - } - } - } - } - memcpy(data, tmp, tensorNumBytes(*desc)); - free(tmp); - return SUCCESS; -} - -template -inline EE from_nchw_to_nchwc8(TensorDesc *desc, T *data) { - if (desc == nullptr || data == nullptr) - CHECK_STATUS(NULL_POINTER); - - DataType idt; - DataFormat idf; - U32 in, ic, ih, iw; - CHECK_STATUS(tensor4dGet(*desc, &idt, &idf, &in, &ic, &ih, &iw)); - if (idf != DF_NCHW) - CHECK_STATUS(NOT_MATCH); - - *desc = tensor4df(idt, DF_NCHWC8, in, ic, ih, iw); - - T *tmp = (T *)malloc(tensorNumBytes(*desc)); - ic /= 8; - for (U32 n = 0; n < in; n++) { - for (U32 c = 0; c < ic; c++) { - for (U32 hw = 0; hw < ih*iw; hw++) { - for (U32 c8 = 0; c8 < 8; c8++) { - tmp[n*ic*ih*iw*8 + c*ih*iw*8 + hw*8 + c8] = data[n*ic*8*ih*iw + (c*8 + c8)*ih*iw + hw]; - } - } - } - } - memcpy(data, tmp, tensorNumBytes(*desc)); - free(tmp); - return SUCCESS; -} - - -template -EE activation(ActivationDesc activationDesc, F32 input, T* output) { - F32 value, result; - switch (activationDesc.mode){ - case ACTIVATION_NULL:{ - result = input; - break; - } - case ACTIVATION_RELU:{ - value = input; - F32 tmp = activationDesc.value[0] * value; - if(value < tmp) value = tmp; - result = value; - break; - } - case ACTIVATION_RELU6:{ - value = input; - if(value < 0) value = 0; - if(value > 6) value = 6; - result = value; - 
-            break;
-        }
-        case ACTIVATION_H_SIGMOID:{
-            value = input + 3;
-            if(value < 0) value = 0;
-            if(value > 6) value = 6;
-            result = value / 6;
-            break;
-        }
-        case ACTIVATION_H_SWISH:{
-            value = input + 3;
-            if(value < 0) value = 0;
-            if(value > 6) value = 6;
-            result = input * (value / 6);
-            break;
-        }
-        case ACTIVATION_GELU:{
-            value = input;
-            F32 two_div_PI_sqrt = sqrt(2 / 3.14159265358979323846);
-            value = two_div_PI_sqrt * (value + 0.044715 * pow(value, 3));
-            value = 1.0 - 2.0 / (exp(2.0 * value) + 1.0);
-            value = 0.5 * (1.0 + value);
-            value = input * value;
-            result = value;
-            break;
-        }
-        case ACTIVATION_TANH:{
-            value = 1.0 - 2.0 / (exp(2.0 * input) + 1.0);
-            result = value;
-            break;
-        }
-        case ACTIVATION_SIGMOID: {
-            value = 1.0 / (1.0 + exp(-1.0 * input));
-            result = value;
-            break;
-        }
-        default:
-            return NOT_SUPPORTED;
-    }
-    *output = result;
-    return SUCCESS;
-}
-
-template <typename T>
-F32 array_sum(const T* array, U32 length) {
-    F32 sum = 0;
-    for (U32 i=0; i < length; i++)
-        sum += array[i];
-    return sum;
-}
-
-// array mean
-template <typename T>
-F32 array_mean(const T *data, I32 len) {
-    if(len <= 0) return 0;
-    return array_sum(data, len) / len;
-}
-
-// array var
-template <typename T>
-F32 array_var(const T *data, I32 len, F32 mean) {
-    F32 sum_s = 0;
-    for(I32 i = 0; i < len; i++){
-        F32 in = data[i];
-        F32 tmp = in - mean;
-        sum_s += tmp * tmp;
-    }
-    return sum_s / len;
-}
-
-
-template <typename T>
-EE array_scale(T* input, T* output, U32 len, F32 alpha, F32 beta)
-{
-    if (nullptr == input
-        || nullptr == output)
-        CHECK_STATUS(NULL_POINTER);
-
-    for (U32 i = 0; i < len; i++) {
-        F32 value = input[i];
-        output[i] = alpha * value + beta;
-    }
-    return SUCCESS;
-}
-
-template <typename T>
-inline void array_add(const T* inputA, const T* inputB, T* output, I32 len)
-{
-    for (I32 i = 0; i < len; i++) {
-        output[i] = inputA[i] + inputB[i];
-    }
-}
-#endif
diff --git a/tensor_computing/src/cpu/general/lstm.cpp b/tensor_computing/src/cpu/general/lstm.cpp
deleted file mode 100644
index 27fbd9d9..00000000
--- a/tensor_computing/src/cpu/general/lstm.cpp
+++ /dev/null
@@ -1,284 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
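Note on general_functions.h deleted above: its from_nchw_to_nchwc8 / from_nchwc8_to_nchw helpers repack channels into the 8-wide interleaved layout the optimized kernels expect. A sketch of the forward index mapping (hypothetical free function, plain float):

    #include <vector>

    // Sketch of the NCHW -> NCHWC8 repack from general_functions.h above:
    // channels are grouped in blocks of 8 and interleaved innermost, so
    // element (n, c, h, w) moves to channel block c/8, lane c%8.
    static std::vector<float> nchw_to_nchwc8(
        const std::vector<float> &src, int n, int c, int h, int w)
    {
        std::vector<float> dst(src.size());
        const int cb = c / 8; // assumes c % 8 == 0, as the kernels require
        for (int in = 0; in < n; in++)
            for (int ic = 0; ic < c; ic++)
                for (int hw = 0; hw < h * w; hw++)
                    dst[((in * cb + ic / 8) * h * w + hw) * 8 + ic % 8] =
                        src[(in * c + ic) * h * w + hw];
        return dst;
    }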
- - -#include -#include - -#include "cpu/general/tensor_computing_general.h" -#include "cpu/general/general_functions.h" - -template -void mvm_nkn32(U32 fn, U32 fk, const T* filterArray, T* input, T* output) { - for (U32 i = 0; i < fn; i++) { - for (U32 j = 0; j < 32; j++) { - U32 n = i * 32 + j; - F32 value = 0; - for (U32 k = 0; k < fk; k++) { - value += input[k] * filterArray[(i * fk + k) * 32 + j]; - } - output[n] += value; - } - } -} - -template -EE lstmcell(TensorDesc xDesc, const void* currentX, - TensorDesc filterDesc, const void* filter, - TensorDesc biasDesc, const void* bias, - void *state, - U32 tmpBytes, void *tmp, - LSTMDesc lstmDesc, U32 batchStrideX, U32 batchStrideH, - TensorDesc hDesc, void* output) -{ - UNUSED(biasDesc); - UNUSED(tmpBytes); - if (nullptr == currentX - || nullptr == filter - || nullptr == bias - || nullptr == state - || nullptr == tmp - || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - DataType idt, fdt, odt; - DataFormat idf, fdf, odf; - U32 in, ix; - U32 on, oh; - U32 fk, fn; - CHECK_STATUS(tensor2dfGet(xDesc, &idt, &idf, &in, &ix)); - CHECK_STATUS(tensor2dfGet(filterDesc, &fdt, &fdf, &fn, &fk)); - CHECK_STATUS(tensor2dfGet(hDesc, &odt, &odf, &on, &oh)); - if(fdf != DF_NKN32) { - CHECK_STATUS(NOT_MATCH); - } - - U32 batch = in; - U32 xDim = ix; - U32 hDim = lstmDesc.numOutput; - I32 column = (lstmDesc.numProjection > 0) ? lstmDesc.numProjection : lstmDesc.numOutput; - F32 forgetBias = lstmDesc.forgetBias; - ActivationMode activationMode = lstmDesc.activationMode; - if (activationMode != ACTIVATION_TANH) - CHECK_STATUS(NOT_SUPPORTED); - - if (!(idt == fdt && idt == odt)) { - CHECK_STATUS(NOT_MATCH); - } - - const T *currentXArray = (const T*)currentX; - const T *filterArray = (const T*)filter; - const T *biasArray = (const T*)bias; - const T *projectionArray = (const T*)filter + (fn * fk); - T *lastStateArray = (T*)state; - T *lastHArray = lastStateArray + column; - T *tmpArray = (T*)tmp; - T *currentStateArray = (T*)state; - T *currentHArray = currentStateArray + column; - T *outputArray = (T*)output; - T *xhArray = tmpArray; - T *intermediateH = xhArray + (xDim + hDim); - U32 lastStateStride = column + hDim; - U32 lastHStride = column + hDim; - U32 currentStateStride = column + hDim; - U32 currentHStride = column + hDim; - for (U32 m = 0; m < batch; m++) { - T *lastBatchH = lastHArray + m * lastHStride; - memcpy(xhArray, currentXArray+m*batchStrideX, xDim*sizeof(T)); - memcpy(xhArray+xDim, lastBatchH, hDim*sizeof(T)); - - // MVM - memcpy(intermediateH, biasArray, column * 4 * sizeof(T)); - mvm_nkn32(fn/32, fk, filterArray, xhArray, intermediateH); - - T *out_i = intermediateH; - T *out_g = out_i + column; - T *out_f = out_i + column * 2; - T *out_o = out_i + column * 3; - T *lastBatchState = lastStateArray + m * lastStateStride; - T *currentBatchState = currentStateArray + m * currentStateStride; - T *currentBatchH = currentHArray + m * currentHStride; - T *currentOutput = outputArray + m * batchStrideH; - T* tmpState, *tmpHH, *tmpH; - if (lstmDesc.zoneoutCell == 0) { - tmpState = currentBatchState; - } else { - tmpState = out_i; - } - if (lstmDesc.zoneoutOutput != 0) { - tmpHH = out_g; - tmpH = out_f; - } else { - if (lstmDesc.numProjection > 0) { - tmpHH = out_g; - tmpH = out_f; - } else { - tmpHH = currentBatchH; - tmpH = currentBatchH; - } - } - - for (I32 h = 0; h < column; h++) { - F32 C_s = lastBatchState[h]; - F32 I_s = 1.0 / (1.0 + exp(-out_i[h])); - F32 G_s = tanh(out_g[h]); - F32 F_s = 1.0 / (1.0 + exp(-(out_f[h] + forgetBias))); - F32 
O_s = 1.0 / (1.0 + exp(-out_o[h])); - C_s = C_s * F_s + I_s * G_s; - F32 value = O_s * tanh(C_s); - tmpState[h] = C_s; - tmpHH[h] = value; - } - - if (lstmDesc.zoneoutCell != 0) { - array_scale(tmpState, tmpState, column, 1-lstmDesc.zoneoutCell, 0); - array_scale(lastBatchState, lastBatchState, column, lstmDesc.zoneoutCell, 0); - array_add(tmpState, lastBatchState, currentBatchState, column); - } - if (lstmDesc.zoneoutOutput != 0) { - array_scale(tmpHH, tmpH, column, 1-lstmDesc.zoneoutOutput, 0); - array_scale(lastBatchH, lastBatchH, column, lstmDesc.zoneoutOutput, 0); - array_add(tmpH, lastBatchH, currentBatchH, column); - } - - if (lstmDesc.numProjection > 0) { - memset(currentBatchH, 0, sizeof(T) * hDim); - mvm_nkn32(hDim/32, lstmDesc.numProjection, projectionArray, tmpHH, currentBatchH); - tmpHH = currentBatchH; - } - memcpy(currentOutput, tmpHH, sizeof(T) * hDim); - } - return SUCCESS; -} - -EE lstm(TensorDesc inputDesc, const void* input, - TensorDesc filterDesc, const void* filter, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - LSTMDesc lstmDesc, - TensorDesc outputDesc, void* output) -{ - UNUSED(outputDesc); - - if (nullptr == input - || nullptr == filter - || nullptr == bias - || nullptr == tmp - || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - DataType idt; - DataFormat idf; - U32 batch, step, xDim; - int num = lstmDesc.biDirection ? 2 : 1; - CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &batch, &step, &xDim)); - U32 hDim = lstmDesc.numOutput; - U32 column = (lstmDesc.numProjection > 0) ? lstmDesc.numProjection : lstmDesc.numOutput; - - U8 *cellState = (U8*)tmp; - U8 *tmpArray = cellState + batch * (column + hDim) * bytesOf(idt); - U32 batchStrideX = step * xDim; - U32 batchStrideH = step * hDim * num; - TensorDesc xDesc = tensor2df(idt, DF_NORMAL, batch, xDim); - TensorDesc hDesc = tensor2df(idt, DF_NORMAL, batch, hDim); - - memset(cellState, 0, batch * (column + hDim) * bytesOf(idt)); - for (U32 t = 0; t < step; t++) { - const U8* currentX = (const U8*)input + t * xDim * bytesOf(idt); - U8 *currentH = (U8*)output + t * hDim * num * bytesOf(idt); - CHECK_STATUS(lstmcell_general(xDesc, currentX, - filterDesc, filter, - biasDesc, bias, - cellState, - tmpBytes, tmpArray, - lstmDesc, batchStrideX, batchStrideH, - hDesc, currentH)); - } - - if (lstmDesc.biDirection) { - memset(cellState, 0, batch * (column + hDim) * bytesOf(idt)); - U32 filterBytes = tensorNumBytes(filterDesc) + bytesOf(filterDesc.dt) * lstmDesc.numProjection * lstmDesc.numOutput; - U32 biasBytes = tensorNumBytes(biasDesc); - const U8* filterPtr = (const U8*)filter + filterBytes; - const U8* biasPtr = (const U8*)bias + biasBytes; - for (I32 t = step-1; t >= 0; t--) { - const U8* currentX = (const U8*)input + t * xDim * bytesOf(idt); - U8 *currentH = (U8*)output + t * hDim * num * bytesOf(idt) + hDim * bytesOf(idt); - CHECK_STATUS(lstmcell_general(xDesc, currentX, - filterDesc, filterPtr, - biasDesc, biasPtr, - cellState, - tmpBytes, tmpArray, - lstmDesc, batchStrideX, batchStrideH, - hDesc, currentH)); - } - } - return SUCCESS; -} - -EE lstmcell_general(TensorDesc xDesc, const void* currentX, - TensorDesc filterDesc, const void* filter, - TensorDesc biasDesc, const void* bias, - void *state, - U32 tmpBytes, void *tmp, - LSTMDesc lstmDesc, U32 batchStrideX, U32 batchStrideH, - TensorDesc hDesc, void* output) -{ - EE ret = SUCCESS; - switch (xDesc.dt) { -#ifdef _USE_FP16 - case DT_F16: - ret = lstmcell(xDesc, currentX, filterDesc, filter, biasDesc, bias, - state, tmpBytes, tmp, lstmDesc, 
batchStrideX, batchStrideH, hDesc, output); - break; -#endif -#ifdef _USE_FP32 - case DT_F32: - ret = lstmcell(xDesc, currentX, filterDesc, filter, biasDesc, bias, - state, tmpBytes, tmp, lstmDesc, batchStrideX, batchStrideH, hDesc, output); - break; -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE lstm_general(TensorDesc inputDesc, const void* input, - TensorDesc filterDesc, const void* filter, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - LSTMDesc lstmDesc, - TensorDesc outputDesc, void* output) -{ - EE ret = SUCCESS; - switch (inputDesc.dt) { -#ifdef _USE_FP16 - case DT_F16: - ret = lstm(inputDesc, input, filterDesc, filter, biasDesc, bias, - tmpBytes, tmp, lstmDesc, outputDesc, output); - break; -#endif -#ifdef _USE_FP32 - case DT_F32: - ret = lstm(inputDesc, input, filterDesc, filter, biasDesc, bias, - tmpBytes, tmp, lstmDesc, outputDesc, output); - break; -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/cpu/general/multiply.cpp b/tensor_computing/src/cpu/general/multiply.cpp deleted file mode 100644 index 13f888ea..00000000 --- a/tensor_computing/src/cpu/general/multiply.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
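Note on the lstmcell kernel deleted above: it concatenates x_t with h_{t-1}, computes all four gate pre-activations in one NKN32 (32-column panel) matrix-vector product in the order i, g, f, o, and then applies the standard LSTM recurrence, adding forgetBias before the forget gate's sigmoid. A sketch of the per-channel state update under those same conventions (hypothetical helper name):

    #include <cmath>

    // Sketch of the per-channel LSTM update from the deleted lstmcell.
    // Inputs are the four gate pre-activations for one channel; c_state and
    // h_out are that channel's cell state and output.
    static void lstm_update(float &c_state, float &h_out, float i_in,
        float g_in, float f_in, float o_in, float forgetBias)
    {
        auto sigmoid = [](float x) { return 1.0f / (1.0f + std::exp(-x)); };
        float I = sigmoid(i_in);
        float G = std::tanh(g_in);
        float F = sigmoid(f_in + forgetBias); // bias folded in, as above
        float O = sigmoid(o_in);
        c_state = c_state * F + I * G;
        h_out = O * std::tanh(c_state);
    }

The zoneout and projection branches in the original then blend this result with the previous state or multiply by a projection matrix; the sketch covers only the core recurrence.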
- - -#include "cpu/general/tensor_computing_general.h" -#include "cpu/general/general_functions.h" - -EE multiply_general(void *alpha, void *beta, TensorDesc inputDesc, void* input, TensorDesc outputDesc, void *output) -{ - UNUSED(outputDesc); - - if (nullptr == alpha - || nullptr == beta) - CHECK_STATUS(NULL_POINTER); - - EE ret = SUCCESS; - switch (inputDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = array_scale((F32 *)input, (F32 *)output, tensorNumElements(inputDesc), *((F32 *)alpha), *((F32 *)beta)); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = array_scale((F16 *)input, (F16 *)output, tensorNumElements(inputDesc), *((F32 *)alpha), *((F32 *)beta)); - break; - } -#endif - case DT_I32: { - ret = array_scale((I32 *)input, (I32 *)output, tensorNumElements(inputDesc), *((F32 *)alpha), *((F32 *)beta)); - break; - } - case DT_U32: { - ret = array_scale((U32 *)input, (U32 *)output, tensorNumElements(inputDesc), *((F32 *)alpha), *((F32 *)beta)); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - - return ret; -} diff --git a/tensor_computing/src/cpu/general/normalization.cpp b/tensor_computing/src/cpu/general/normalization.cpp deleted file mode 100644 index 871dbb7f..00000000 --- a/tensor_computing/src/cpu/general/normalization.cpp +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include - -#include "cpu/general/general_functions.h" -#include "cpu/general/tensor_computing_general.h" - -template -inline EE array_norm_scale(T *input, T *output, I32 len, F32 mean, F32 var, T *alpha, T *beta) { - F32 eps = 1e-6; - F32 std_value = sqrt(var + eps); - for(I32 i = 0; i < len; i++){ - output[i] = alpha[i] * (input[i] - mean) / std_value + beta[i]; - } - return SUCCESS; -} - -template -inline EE layer_normalization(T *alpha, T *beta, - TensorDesc inputDesc, T* input, - TensorDesc outputDesc, T* output) -{ - if (nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - if(inputDesc.dt != outputDesc.dt || inputDesc.df != outputDesc.df) - CHECK_STATUS(NOT_MATCH); - - U32 size = tensorNumElements(inputDesc); - I32 size_inner = inputDesc.dims[0]; - I32 size_outer = size / size_inner; - for(I32 i = 0; i < size_outer; i++) { - T *current_input = input + i * size_inner; - T *current_output = output + i * size_inner; - F32 mean = array_mean(current_input, size_inner); - F32 var = array_var(current_input, size_inner, mean); - - array_norm_scale(current_input, current_output, size_inner, mean, var, alpha, beta); - } - - return SUCCESS; -} - - -EE layer_normalization_general(void *alpha, void *beta, - TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output) -{ - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = layer_normalization((F32*)alpha, (F32*)beta, inputDesc, (F32*)input, outputDesc, (F32*)output); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = layer_normalization((F16*)alpha, (F16*)beta, inputDesc, (F16*)input, outputDesc, (F16*)output); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/cpu/general/padding.cpp b/tensor_computing/src/cpu/general/padding.cpp deleted file mode 100644 index 38a311f1..00000000 --- a/tensor_computing/src/cpu/general/padding.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
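Note on the layer_normalization kernel deleted above: it treats the innermost dimension as one normalization row, computes that row's mean and variance, and applies a learned per-element scale and shift with eps = 1e-6. A sketch of one row (hypothetical name, plain float):

    #include <cmath>
    #include <cstddef>

    // Sketch of one row of the deleted layer_normalization: normalize by the
    // row's mean and variance, then apply the learned scale/shift.
    static void layer_norm_row(const float *x, float *y, size_t len,
        const float *alpha, const float *beta)
    {
        float mean = 0, var = 0;
        for (size_t i = 0; i < len; i++) mean += x[i];
        mean /= len;
        for (size_t i = 0; i < len; i++) var += (x[i] - mean) * (x[i] - mean);
        var /= len;
        const float inv_std = 1.0f / std::sqrt(var + 1e-6f); // eps as in the original
        for (size_t i = 0; i < len; i++) {
            y[i] = alpha[i] * (x[i] - mean) * inv_std + beta[i];
        }
    }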
- - -#include "tensor_computing_type.h" -#include "cpu/general/tensor_computing_general.h" -#include - -EE padding_general(TensorDesc inputDesc, const void* input, PadDesc padDesc, TensorDesc outputDesc, void* output) -{ - DataType idt, odt; - DataFormat idf, odf; - U32 in = 0, ic = 0, ih = 0, iw = 0, - on = 0, oc = 0, oh = 0, ow = 0; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - CHECK_REQUIREMENT(in == on); - CHECK_REQUIREMENT(ic == oc); - U32 alignSize = 1; - if (idf == DF_NCHWC8) - alignSize = 8; - ic /= alignSize; - oc /= alignSize; - for (U32 n = 0; n < in; n++) { - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < ih; h++) { - const U8* inPtr = (const U8*)input + (((n * ic + c) * ih + h) * iw) * alignSize * bytesOf(idt); - U8* outPtr = (U8 *)output + (((n * oc + c) * oh + (padDesc.top+h)) * ow) * alignSize * bytesOf(odt); - if (padDesc.pad_mode == Pad_Constant) { - memset(outPtr, 0, padDesc.left*alignSize*bytesOf(odt)); - outPtr += padDesc.left * alignSize * bytesOf(odt); - memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); - outPtr += iw * alignSize * bytesOf(odt); - memset(outPtr, 0, padDesc.right*alignSize*bytesOf(odt)); - } else { - for (U32 w = 0; w < padDesc.left; w++) { - U32 index = 0; - if (padDesc.pad_mode == Pad_Reflect) { - index = (padDesc.left - w) * alignSize * bytesOf(idt); - } - memcpy(outPtr, inPtr+index, alignSize*bytesOf(idt)); - outPtr += alignSize * bytesOf(idt); - } - memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); - outPtr += iw * alignSize * bytesOf(odt); - for (U32 w = 0; w < padDesc.right; w++) { - U32 index = 0; - if (padDesc.pad_mode == Pad_Reflect) { - index = (iw - w - 2) * alignSize * bytesOf(idt); - } - memcpy(outPtr, inPtr+index, alignSize*bytesOf(idt)); - outPtr += alignSize * bytesOf(idt); - } - } - } - U8* outPtr = (U8*)output + (((n * oc + c) * oh) * ow) * alignSize * bytesOf(odt); - for (U32 h = 0; h < padDesc.top; h++) { - U32 index = h * ow * alignSize * bytesOf(odt); - if (padDesc.pad_mode == Pad_Constant) { - memset(outPtr+index, 0, ow * alignSize * bytesOf(odt)); - } else if (padDesc.pad_mode == Pad_Edge) { - memcpy(outPtr+index, outPtr+(padDesc.top*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); - } else if (padDesc.pad_mode == Pad_Reflect) { - memcpy(outPtr+index, outPtr+((padDesc.top+padDesc.top-h)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); - } else { - return NOT_SUPPORTED; - } - } - for (U32 h = 0; h < padDesc.bottom; h++) { - U32 index = (padDesc.top+ih+h) * ow * alignSize * bytesOf(odt); - if (padDesc.pad_mode == Pad_Constant) { - memset(outPtr+index, 0, ow * alignSize * bytesOf(odt)); - } else if (padDesc.pad_mode == Pad_Edge) { - memcpy(outPtr+index, outPtr+((padDesc.top+ih)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); - } else if (padDesc.pad_mode == Pad_Reflect) { - memcpy(outPtr+index, outPtr+((padDesc.top+ih-2-h)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); - } else { - return NOT_SUPPORTED; - } - } - } - } - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/general/pooling.cpp b/tensor_computing/src/cpu/general/pooling.cpp deleted file mode 100644 index bcf5a1bb..00000000 --- a/tensor_computing/src/cpu/general/pooling.cpp +++ /dev/null @@ -1,158 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#include <math.h>
-#include "type.h"
-#include "tensor_desc.h"
-#include "error.h"
-#include "tensor_computing_type.h"
-#include "cpu/general/tensor_computing_general.h"
-
-template<typename T>
-EE pooling(T *input, T* output,
-    U32 in, U32 ic, U32 ih, U32 iw,
-    U32 strideH, U32 strideW, U32 paddingT, U32 paddingB, U32 paddingL, U32 paddingR, U32 kernelH, U32 kernelW,
-    PoolingMode pm, RoundMode rm,
-    U32 alignSize,
-    F32 minValue)
-{
-    U32 oh = 0, ow = 0;
-    if (rm == CEIL) {
-        oh = (U32)(ceil((double(ih + paddingT + paddingB - kernelH) / strideH))) + 1;
-        ow = (U32)(ceil((double(iw + paddingL + paddingR - kernelW) / strideW))) + 1;
-    }
-    if (rm == FLOOR) {
-        oh = (U32)(floor((double(ih + paddingT + paddingB - kernelH) / strideH))) + 1;
-        ow = (U32)(floor((double(iw + paddingL + paddingR - kernelW) / strideW))) + 1;
-    }
-
-    CHECK_REQUIREMENT(ic % alignSize == 0);
-    ic = ic / alignSize;
-
-    for (U32 n = 0; n < in; n++) {
-        for (U32 c = 0; c < ic; c++) {
-            for (U32 j = 0; j < alignSize; j++) {
-                for (U32 h = 0; h < oh; h++) {
-                    for (U32 w = 0; w < ow; w++) {
-                        int hstart = (int)(h * strideH) - (int)paddingT;
-                        int wstart = (int)(w * strideW) - (int)paddingL;
-                        int hend = hstart + kernelH;
-                        int wend = wstart + kernelW;
-                        hstart = (hstart < 0) ? 0 : hstart;
-                        wstart = (wstart < 0) ? 0 : wstart;
-                        hend = (hend > (int)ih) ? ih : hend;
-                        wend = (wend > (int)iw) ? iw : wend;
-                        float poolSize = (hend - hstart)*(wend - wstart);
-
-                        T value;
-                        switch(pm){
-                            case POOLING_MAX:
-                                value = minValue;
-                                break;
-                            case POOLING_MEAN:
-                                value = 0;
-                                break;
-                            default:
-                                return NOT_SUPPORTED;
-                        }
-                        for (int x = hstart; x < hend; x++) {
-                            for (int y = wstart; y < wend; y++) {
-                                U32 in_off = ((((n*ic + c)*ih) + x)*iw + y)*alignSize + j;
-                                switch(pm){
-                                    case POOLING_MAX:
-                                        value = (value > input[in_off]) ?
value : input[in_off]; - break; - case POOLING_MEAN: - value += input[in_off]; - break; - default: - return NOT_SUPPORTED; - } - } - } - switch(pm){ - case POOLING_MAX: - break; - case POOLING_MEAN: - value = value / poolSize; - break; - default: - return NOT_SUPPORTED; - } - - U32 out_off = ((((n*ic + c)*oh) + h)*ow + w)*alignSize + j; - output[out_off] = value; - } - } - } - } - } - return SUCCESS; -} - -EE pooling_general(TensorDesc inputDesc, const void* input, PoolingDesc poolingDesc, TensorDesc outputDesc, void* output) -{ - if (nullptr == input || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt, odt; - DataFormat idf, odf; - U32 in = 0, ic = 0, ih = 0, iw = 0, - on = 0, oc = 0, oh = 0, ow = 0; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - - if (in != on || ic != oc) { - CHECK_STATUS(NOT_MATCH); - } - if (idf != DF_NCHWC8 || odf != idf) { - CHECK_STATUS(NOT_MATCH); - } - - U32 strideH = poolingDesc.stride_h; - U32 strideW = poolingDesc.stride_w; - U32 paddingT = poolingDesc.padding_top; - U32 paddingB = poolingDesc.padding_bottom; - U32 paddingL = poolingDesc.padding_left; - U32 paddingR = poolingDesc.padding_right; - U32 kernelSizeH = poolingDesc.kernelSize_h; - U32 kernelSizeW = poolingDesc.kernelSize_w; - - EE ret = SUCCESS; - switch (idt) { -#ifdef _USE_FP32 - case DT_F32: - ret = pooling((F32*)input, (F32*)output, - in, ic, ih, iw, - strideH, strideW, paddingT, paddingB, paddingL, paddingR, - kernelSizeH, kernelSizeW, - poolingDesc.pm, poolingDesc.rm, - 8, FLT_MIN); - break; -#endif -#ifdef _USE_FP16 - case DT_F16: - ret = pooling((F16*)input, (F16*)output, - in, ic, ih, iw, - strideH, strideW, paddingT, paddingB, paddingL, paddingR, - kernelSizeH, kernelSizeW, - poolingDesc.pm, poolingDesc.rm, - 8, UNI_F16_MIN); - break; -#endif - default: - ret = NOT_SUPPORTED; - } - return ret; -} diff --git a/tensor_computing/src/cpu/general/priorbox.cpp b/tensor_computing/src/cpu/general/priorbox.cpp deleted file mode 100644 index 6bd14df0..00000000 --- a/tensor_computing/src/cpu/general/priorbox.cpp +++ /dev/null @@ -1,171 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
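The general pooling kernel deleted above clips each window against the input borders and then takes either the max or the mean of the surviving elements. Note that it seeds max pooling with FLT_MIN, the smallest positive normalized float, so an all-negative window returns FLT_MIN rather than its true maximum; -FLT_MAX is the conventional identity. A standalone single-window sketch on a plain row-major plane (helper name illustrative):

#include <float.h>

// Max or mean over one clipped window [hstart,hend) x [wstart,wend)
// of a plane with row stride `iw`.
static float pool_window(const float *plane, int iw,
                         int hstart, int hend, int wstart, int wend, int is_max)
{
    float value = is_max ? -FLT_MAX : 0.0f;
    for (int h = hstart; h < hend; h++) {
        for (int w = wstart; w < wend; w++) {
            float v = plane[h * iw + w];
            value = is_max ? (v > value ? v : value) : value + v;
        }
    }
    if (!is_max)
        value /= (float)((hend - hstart) * (wend - wstart));
    return value;
}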
- - -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "cpu/general/tensor_computing_general.h" - - -template -EE priorbox(T* output, - U32 ih_layer, U32 iw_layer, U32 ih_img, U32 iw_img, - std::vector minsizes, std::vector maxsizes, std::vector ars, U32 flip, U32 clip, F32* vars, I32 imageW, I32 imageH, - F32 stepW, F32 stepH, - F32 offset) -{ - U32 layer_w = iw_layer; - U32 layer_h = ih_layer; - - int img_w, img_h; - if(imageH == 0 || imageW == 0){ - img_w = iw_img; - img_h = ih_img; - } else { - img_w = imageW; - img_h = imageH; - } - F32 stp_h, stp_w; - if (stepW == 0 || stepH == 0){ - stp_w = static_cast(ceil((img_w)/layer_w)); - stp_h = static_cast(ceil((img_h)/layer_h)); - } else{ - stp_w = stepW; - stp_h = stepH; - } - - U32 num_priorboxs = ars.size(); - if(flip){ - num_priorboxs = num_priorboxs * 2; - } - U32 num_minsize = minsizes.size(); - num_priorboxs = (num_priorboxs + 1) * num_minsize; - if(!maxsizes.empty()){ - U32 num_maxsize = maxsizes.size(); - num_priorboxs = num_priorboxs + num_maxsize; - } - int dim = layer_h * layer_w * num_priorboxs * 4; - int idx = 0; - for (U32 h = 0 ; h < layer_h ; h++){ - for (U32 w = 0 ; w < layer_w ; w++){ - F32 center_x = (w + offset) * stp_w; - F32 center_y = (h + offset) * stp_h; - F32 box_w , box_h; - for( int n = 0 ; n < (int)minsizes.size() ; n++){ - F32 minsize = minsizes[n]; - box_w = box_h = minsize; - output[idx++] = (center_x - box_w/2) / img_w; - output[idx++] = (center_y - box_h/2) / img_h; - output[idx++] = (center_x + box_w/2) / img_w; - output[idx++] = (center_y + box_h/2) / img_h; - - if ((int)maxsizes.size() > 0) { - F32 maxsize = maxsizes[n]; - box_w = box_h = sqrt(minsize * maxsize); - output[idx++] = (center_x - box_w/2) / img_w; - output[idx++] = (center_y - box_h/2) / img_h; - output[idx++] = (center_x + box_w/2) / img_w; - output[idx++] = (center_y + box_h/2) / img_h; - } - - for (int a = 0; a < (int)ars.size(); a++){ - F32 ar = ars[a]; - box_w = minsize * sqrt(ar); - box_h = minsize / sqrt(ar); - output[idx++] = (center_x - box_w/2) / img_w; - output[idx++] = (center_y - box_h/2) / img_h; - output[idx++] = (center_x + box_w/2) / img_w; - output[idx++] = (center_y + box_h/2) / img_h; - if(flip){ - output[idx++] = (center_x - box_h/2) / img_w; - output[idx++] = (center_y - box_w/2) / img_h; - output[idx++] = (center_x + box_h/2) / img_w; - output[idx++] = (center_y + box_w/2) / img_h; - } - } - } - } - } - - if (clip) { - for (int i = 0; i < dim; i++) { - output[i] = std::min(std::max(output[i], 0.), 1.); - } - } - - for(int i = 0 ; i < dim/4 ; i++){ - output[idx++] = vars[0]; - output[idx++] = vars[1]; - output[idx++] = vars[2]; - output[idx++] = vars[3]; - } - return SUCCESS; -} - -EE priorbox_general(std::vector inputDesc, PriorBoxDesc priorboxDesc, TensorDesc outputDesc, void* output) -{ - UNUSED(outputDesc); - if (nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - U32 num = inputDesc.size(); - if (num != 2) return NOT_MATCH; - DataType idt0, idt1; - DataFormat idf0, idf1; - U32 in0 = 0, ic0 = 0, ih0 = 0, iw0 = 0; - U32 in1 = 0, ic1 = 0, ih1 = 0, iw1 = 0; - CHECK_STATUS(tensor4dGet(inputDesc[0], &idt0, &idf0, &in0, &ic0, &ih0, &iw0)); - CHECK_STATUS(tensor4dGet(inputDesc[1], &idt1, &idf1, &in1, &ic1, &ih1, &iw1)); - - std::vector minsizes = priorboxDesc.min_sizes; - std::vector maxsizes = priorboxDesc.max_sizes; - std::vector ars = priorboxDesc.aspect_ratios; - U32 flip = priorboxDesc.flip; - U32 clip = priorboxDesc.clip; - F32 vars[4]; - for (int i = 
0; i < 4 ; i++){ - vars[i] = priorboxDesc.variances[i]; - } - U32 imageH = priorboxDesc.image_h; - U32 imageW = priorboxDesc.image_w; - F32 stepH = priorboxDesc.step_h; - F32 stepW = priorboxDesc.step_w; - F32 offset = priorboxDesc.offset; - - EE ret = SUCCESS; - switch (idt0) { -#ifdef _USE_FP32 - case DT_F32: - ret = priorbox((F32*)output, - ih0, iw0, ih1, iw1, - minsizes, maxsizes, ars, flip, clip, vars, imageW, imageH, - stepW, stepH, - offset); - break; -#endif -#ifdef _USE_FP16 - case DT_F16: - ret = priorbox((F16*)output, - ih0, iw0, ih1, iw1, - minsizes, maxsizes, ars, flip, clip, vars, imageW, imageH, - stepW, stepH, - offset); - break; -#endif - default: - ret = NOT_SUPPORTED; - } - return ret; -} \ No newline at end of file diff --git a/tensor_computing/src/cpu/general/reduction.cpp b/tensor_computing/src/cpu/general/reduction.cpp deleted file mode 100644 index 7bcddb08..00000000 --- a/tensor_computing/src/cpu/general/reduction.cpp +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/general/tensor_computing_general.h" - -template -F32 reductionKernel(const T* input, const float *mask, ReductionMode reductionMode, float coeff, U32 len, U32 stride) { - F32 sum = 0; - U32 j = 0; - U32 count = 0; - for (U32 i = 0; i < len; i++, j+=stride) { - if (mask == nullptr || (mask != nullptr && mask[i] == 1)) { - if (reductionMode == REDUCTION_SUM || reductionMode == REDUCTION_MEAN) - sum += input[j]; - else - CHECK_STATUS(NOT_SUPPORTED); - count ++; - } - } - F32 result = sum; - if (reductionMode == REDUCTION_MEAN) - result /= count; - result *= coeff; - return result; -} - -template -EE reduction(TensorDesc inputDesc, const T* input, - TensorDesc maskDesc, const float* mask, - I32 axis, - ReductionMode reductionMode, - float coeff, - TensorDesc outputDesc, T* output) -{ - UNUSED(outputDesc); - - if (nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - if (axis < 0) - axis = inputDesc.nDims + axis; - axis = inputDesc.nDims - 1 - axis; - U32 loopInner = 1; - for (int i = 0; i < axis; i++) { - loopInner *= inputDesc.dims[i]; - } - U32 loopOuter = 1; - for (U32 i = axis+1; i < inputDesc.nDims; i++) { - loopOuter *= inputDesc.dims[i]; - } - - U32 len = inputDesc.dims[axis]; - U32 maskLen = tensorNumElements(maskDesc); - maskLen = (maskLen > 0) ? 
maskLen : len; - U32 axisDim = maskLen / len; - for (U32 i = 0; i < loopOuter; i++) { - for (U32 j = 0; j < maskLen; j += len) { - U32 axisIndex = j / len; - U32 outputIndex = (i * axisDim + axisIndex) * loopInner; - for (U32 k = 0; k < loopInner; k++) { - const T* array = input + i * (len * loopInner) + k; - const float *maskPtr = (mask == nullptr) ? nullptr : mask + j; - output[outputIndex+k] = reductionKernel(array, maskPtr, reductionMode, coeff, len, loopInner); - } - } - } - return SUCCESS; -} - -EE reduction_general(TensorDesc inputDesc, const void* input, - TensorDesc maskDesc, const void* mask, - I32 axis, - ReductionMode reductionMode, - float coeff, - TensorDesc outputDesc, void* output) -{ - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = reduction(inputDesc, (const F32*)input, maskDesc, (const float*)mask, axis, reductionMode, coeff, outputDesc, (F32*)output); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = reduction(inputDesc, (const F16*)input, maskDesc, (const float *)mask, axis, reductionMode, coeff, outputDesc, (F16*)output); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - - return ret; -} diff --git a/tensor_computing/src/cpu/general/reshape.cpp b/tensor_computing/src/cpu/general/reshape.cpp deleted file mode 100644 index b59ecc37..00000000 --- a/tensor_computing/src/cpu/general/reshape.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include - -#include "cpu/arm/tensor_computing_arm.h" - -EE reshape_general(TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output) -{ - if (nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - CHECK_REQUIREMENT(DF_NCHWC8 != inputDesc.df); - - CHECK_REQUIREMENT(tensorNumElements(inputDesc) == tensorNumElements(outputDesc)); - memcpy(output, input, tensorNumBytes(inputDesc)); - return SUCCESS; -} diff --git a/tensor_computing/src/cpu/general/scale.cpp b/tensor_computing/src/cpu/general/scale.cpp deleted file mode 100644 index 8f5c7ec3..00000000 --- a/tensor_computing/src/cpu/general/scale.cpp +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "cpu/general/tensor_computing_general.h" - -template -EE scale_nchw(T* input, T* alpha, T* beta, U32 in, - U32 ic, U32 elements_per_channel, U32 align_size, T*output) -{ - ic = ic / align_size; - for (U32 n = 0; n < in; n++) { - for (U32 c = 0; c < ic; c++) { - for (U32 i = 0; i < elements_per_channel; i++) { - for (U32 k = 0; k < align_size; k++) { - T alphaValue = alpha[c * align_size + k]; - T betaValue = (nullptr == beta) ? 0 : beta[c * align_size + k]; - U32 index = ((n * ic + c) * elements_per_channel + i) * align_size + k; - output[index] = alphaValue * input[index] + betaValue; - } - } - } - } - return SUCCESS; -} - -template -EE scale_nhwc(T* input, T* alpha, T* beta, U32 in, - U32 ic, U32 elements_per_channel, T*output) -{ - for (U32 n = 0; n < in; n++) { - for (U32 i = 0; i < elements_per_channel; i++) { - for (U32 c = 0; c < ic; c++) { - T alphaValue = alpha[c]; - T betaValue = (nullptr == beta) ? 
0 : beta[c]; - U32 index = ((n * elements_per_channel) + i) * ic + c; - output[index] = alphaValue * input[index] + betaValue; - } - } - } - return SUCCESS; -} - -template -EE scale(T* input, I32 axis, I32 nDims, T* alpha, T* beta, - U32 in, U32 ic, U32 elements_per_channel, U32 align_size, T*output) -{ - EE ret = SUCCESS; - if (axis == 1) { - ret = scale_nchw(input, alpha, beta, in, - ic, elements_per_channel, align_size, output); - } else if (axis == nDims-1) { - ret = scale_nhwc(input, alpha, beta, in, - ic, elements_per_channel, output); - } else { - ret = NOT_SUPPORTED; - } - return ret; -} - -EE scale_general(TensorDesc inputDesc, void* input, - I32 axis, void *alpha, void *beta, - TensorDesc outputDesc, void* output) -{ - UNUSED(outputDesc); - if (nullptr == input || nullptr == output || nullptr == alpha) - CHECK_STATUS(NULL_POINTER); - - U32 length = tensorNumElements(inputDesc); - axis = (axis + inputDesc.nDims) % inputDesc.nDims; - I32 in = inputDesc.dims[inputDesc.nDims - 1]; - I32 ic = inputDesc.dims[inputDesc.nDims - 1 - axis]; - I32 elements_per_channel = length / (in * ic); - I32 align_size = 1; - if (inputDesc.df == DF_NCHWC8) - align_size = 8; - EE ret = SUCCESS; - switch (inputDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = scale((F32*)input, axis, inputDesc.nDims, (F32*)alpha, (F32*)beta, - in, ic, elements_per_channel, align_size, (F32*)output); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = scale((F16*)input, axis, inputDesc.nDims, (F16*)alpha, (F16*)beta, - in, ic, elements_per_channel, align_size, (F16*)output); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - - return ret; -} diff --git a/tensor_computing/src/cpu/general/slice.cpp b/tensor_computing/src/cpu/general/slice.cpp deleted file mode 100644 index 4b550b78..00000000 --- a/tensor_computing/src/cpu/general/slice.cpp +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
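The scale kernels deleted above compute y = alpha[c] * x + beta[c] per channel; the NCHW and NHWC paths differ only in which loop index selects the channel, and the NCHWC8 case folds the align factor of 8 into the channel index. A plain NCHW float sketch (names illustrative):

// y = alpha[c] * x + beta[c], NCHW layout, hw = h * w elements per channel.
static void scale_nchw_plain(const float *x, const float *alpha, const float *beta,
                             int n, int c, int hw, float *y)
{
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < c; j++) {
            float b = beta ? beta[j] : 0.0f;  // beta is optional, as above
            for (int k = 0; k < hw; k++) {
                int idx = (i * c + j) * hw + k;
                y[idx] = alpha[j] * x[idx] + b;
            }
        }
    }
}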
-
-
-#include <string.h>
-#include <vector>
-#include "cpu/general/tensor_computing_general.h"
-
-EE slice_general(TensorDesc inputDesc, void* input,
-    int axis,
-    std::vector<TensorDesc> outputDesc, std::vector<void*>* output)
-{
-    if (nullptr == input || nullptr == output)
-        CHECK_STATUS(NULL_POINTER);
-    U32 num = outputDesc.size();
-    if (num < 1) return NOT_MATCH;
-
-    int dim = inputDesc.nDims;
-    axis = (axis + dim) % dim;
-    axis = dim - 1 - axis;
-    U32 tileSize = bytesOf(inputDesc.dt);
-    for (I32 i = 0; i < axis; i++) {
-        tileSize *= inputDesc.dims[i];
-    }
-    U32 loops = 1;
-    for (I32 i = axis + 1; i < dim; i++) {
-        loops *= inputDesc.dims[i];
-    }
-
-    if (inputDesc.df == DF_NCHWC8) {
-        if (axis < 2) {
-            tileSize *= 8;
-            loops /= 8;
-        }
-    }
-
-    U8 *ptr = (U8 *)input;
-    for (U32 i = 0; i < loops; i++) {
-        for (U32 j = 0; j < num; j++) {
-            U32 blockSize = outputDesc[j].dims[axis] * tileSize;
-            if (blockSize > 0 && nullptr == (*output)[j])
-                CHECK_STATUS(NULL_POINTER);
-            U8* dstPtr = (U8*)((*output)[j]) + i * blockSize;
-            memcpy(dstPtr, ptr, blockSize);
-            ptr += blockSize;
-        }
-    }
-    return SUCCESS;
-}
diff --git a/tensor_computing/src/cpu/general/softmax.cpp b/tensor_computing/src/cpu/general/softmax.cpp
deleted file mode 100644
index e489fc98..00000000
--- a/tensor_computing/src/cpu/general/softmax.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
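slice_general above reduces slicing along an axis to interleaved block copies: tileSize bytes cover all dimensions below the axis, `loops` iterations cover everything above it, and each output takes dims[axis] * tileSize bytes per iteration. An equivalent standalone sketch (names illustrative):

#include <string.h>

// For each of `loops` outer iterations, hand each output its own block
// of the contiguous source buffer, in order.
static void slice_blocks(const unsigned char *src, size_t loops,
                         unsigned char **dst, const size_t *blockBytes,
                         size_t numOutputs)
{
    for (size_t i = 0; i < loops; i++) {
        for (size_t j = 0; j < numOutputs; j++) {
            memcpy(dst[j] + i * blockBytes[j], src, blockBytes[j]);
            src += blockBytes[j];
        }
    }
}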
- - -#include -#include "cpu/general/tensor_computing_general.h" - - -template -F32 array_max(const T* input, U32 len, U32 stride) { - F32 tmp = input[0]; - for (U32 i = 1; i < len; i++) { - if(input[i * stride] > tmp) - tmp = input[i * stride]; - } - return tmp; -} - -template -EE softmax(TensorDesc inputDesc, const T* input, - int axis, - TensorDesc outputDesc, T* output) -{ - UNUSED(outputDesc); - if (nullptr == input || nullptr == output) - CHECK_STATUS(NULL_POINTER); - - CHECK_REQUIREMENT(DF_NCHWC8 != inputDesc.df); - - U32 size = tensorNumElements(inputDesc); - axis = (axis + inputDesc.nDims) % inputDesc.nDims; - axis = inputDesc.nDims - 1 - axis; - U32 loops = inputDesc.dims[axis]; - - U32 loop_inner = 1; - for (int i = 0; i < axis; i++) - loop_inner *= inputDesc.dims[i]; - U32 loop_outer = size / loops / loop_inner; - - for (U32 i = 0; i < loop_outer; i++) { - for (U32 j = 0; j < loop_inner; j++) { - const T *in = input + i * loops * loop_inner + j; - T *out = output + i * loops * loop_inner + j; - F32 max_value = array_max(in, loops, loop_inner); - F32 sum = 0; - for (U32 i = 0; i < loops; i++) { - F32 tmp = exp(in[i*loop_inner] - max_value); - sum += tmp; - out[i*loop_inner] = tmp; - } - sum = 1 / sum; - for (U32 i = 0; i < loops; i++) { - out[i*loop_inner] *= sum; - } - } - } - return SUCCESS; -} - -EE softmax_general(TensorDesc inputDesc, const void* input, - int axis, - TensorDesc outputDesc, void* output) -{ - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { -#ifdef _USE_FP16 - case DT_F16: { - ret = softmax(inputDesc, (const F16*)input, axis, outputDesc, (F16*)output); - break; - } -#endif -#ifdef _USE_FP32 - case DT_F32: { - ret = softmax(inputDesc, (const F32*)input, axis, outputDesc, (F32*)output); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - - return ret; -} diff --git a/tensor_computing/src/cpu/general/split.cpp b/tensor_computing/src/cpu/general/split.cpp deleted file mode 100644 index 5c5f4dec..00000000 --- a/tensor_computing/src/cpu/general/split.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
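softmax_general above follows the standard numerically stable recipe: subtract the per-slice maximum before exponentiating (so exp never overflows), then normalize by the sum. A standalone strided sketch (names illustrative):

#include <math.h>

// Softmax over `len` elements spaced `stride` apart, written to `out`
// with the same stride.
static void softmax_strided(const float *in, float *out, int len, int stride)
{
    float max_v = in[0];
    for (int i = 1; i < len; i++)
        if (in[i * stride] > max_v)
            max_v = in[i * stride];
    float sum = 0;
    for (int i = 0; i < len; i++) {
        out[i * stride] = expf(in[i * stride] - max_v);
        sum += out[i * stride];
    }
    for (int i = 0; i < len; i++)
        out[i * stride] /= sum;
}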
-
-
-#include <string.h>
-#include <vector>
-
-#include "cpu/general/tensor_computing_general.h"
-
-EE split_general(TensorDesc inputDesc, void* input,
-    std::vector<TensorDesc> outputDesc, std::vector<void*>* output)
-{
-    UNUSED(inputDesc);
-    if (nullptr == input || nullptr == output)
-        CHECK_STATUS(NULL_POINTER);
-    if(outputDesc.size() <= 1) return NOT_MATCH;
-
-    for(U32 i = 0; i < (*output).size(); i++) {
-        if (nullptr == (*output)[i])
-            CHECK_STATUS(NULL_POINTER);
-        memcpy((*output)[i], input, tensorNumBytes(outputDesc[i]));
-    }
-    return SUCCESS;
-}
diff --git a/tensor_computing/src/cpu/general/tensor_computing_general.h b/tensor_computing/src/cpu/general/tensor_computing_general.h
deleted file mode 100644
index cba6db67..00000000
--- a/tensor_computing/src/cpu/general/tensor_computing_general.h
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
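split_general above is a fan-out rather than a partition: every output receives a complete copy of the input, sized by that output's own descriptor, which is why the code requires at least two outputs and non-null destinations. A minimal sketch (names illustrative):

#include <string.h>

// Duplicate `src` into each of `numOutputs` destination buffers.
static void split_copy(const void *src, void **dst,
                       const size_t *dstBytes, size_t numOutputs)
{
    for (size_t i = 0; i < numOutputs; i++)
        memcpy(dst[i], src, dstBytes[i]);
}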
- - -#ifndef _H_TENSOR_COMPUTING_GENERAL -#define _H_TENSOR_COMPUTING_GENERAL - -#include - -#include "error.h" -#include "sys.h" -#include "tensor_desc.h" -#include "tensor_computing_type.h" - -EE convolution_general(TensorDesc inputDesc, void* input, - TensorDesc filterDesc, const void* filter, - ConvolutionDesc convDesc, - TensorDesc scaleDesc, const void* scale, - TensorDesc biasDesc, const void* bias, - TensorDesc outputDesc, void* output, - ActivationDesc activationDesc); - -EE deconvolution_general(TensorDesc inputDesc, void* input, - TensorDesc filterDesc, const void* filter, - ConvolutionDesc convDesc, - TensorDesc scaleDesc, const void* scale, - TensorDesc biasDesc, const void* bias, - TensorDesc outputDesc, void* output, - ActivationDesc activationDesc); - -EE depthwise_convolution_general(TensorDesc inputDesc, void* input, - TensorDesc filterDesc, const void* filter, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const void* bias, - TensorDesc outputDesc, void* output, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc); - -EE detectionoutput_general(std::vector inputDesc, std::vector input, DetectionOutputDesc detectionoutputDesc, TensorDesc outputDesc, void* output); - -EE pooling_general(TensorDesc inputDesc, const void* input, PoolingDesc poolingDesc, TensorDesc outputDesc, void* output); - -EE priorbox_general(std::vector inputDesc, PriorBoxDesc priorboxDesc, TensorDesc outputDesc, void* output); - -EE activation_general(TensorDesc inputDesc, void* input, ActivationDesc activationDesc, TensorDesc outputDesc, void* output); - -EE attention_general(TensorDesc inputDesc, const void *input, - TensorDesc outputDesc, void *output); - -EE clip_general(void *minValue, void *maxValue, TensorDesc inputDesc, void* input, TensorDesc outputDesc, void *output); - -EE eltwise_general(std::vector inputDesc, std::vector input, - TensorDesc outputDesc, void* output, EltwiseMode eltwiseMode); - -EE lstmcell_general(TensorDesc xDesc, const void* currentX, - TensorDesc filterDesc, const void* filter, - TensorDesc biasDesc, const void* bias, - void *state, - U32 tmpBytes, void *tmp, - LSTMDesc lstmDesc, U32 batchStrideX, U32 batchStrideH, - TensorDesc hDesc, void* currentH); - -EE lstm_general(TensorDesc inputDesc, const void* input, - TensorDesc filterDesc, const void* filter, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - LSTMDesc lstmDesc, - TensorDesc outputDesc, void* output); - -EE transpose_general(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output, U32 *dim); - -EE slice_general(TensorDesc inputDesc, void* input, int axis, - std::vector outputDesc, std::vector* output); - -EE split_general(TensorDesc inputDesc, void* input, - std::vector outputDesc, std::vector* output); - -EE multiply_general(void *alpha, void *beta, TensorDesc inputDesc, void* input, TensorDesc outputDesc, void *output); - -EE scale_general(TensorDesc inputDesc, void* input, - I32 axis, void *alpha, void *beta, - TensorDesc outputDesc, void* output); - -EE softmax_general(TensorDesc inputDesc, const void* input, - int axis, - TensorDesc outputDesc, void* output); - -EE reshape_general(TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output); - -EE argmax_general(TensorDesc inputDesc, const void* input, - I32 axis, - TensorDesc outputDesc, void* output); - -EE reduction_general(TensorDesc inputDesc, const void* input, - TensorDesc maskDesc, const void* mask, - I32 axis, - ReductionMode reductionMode, - float coeff, - 
TensorDesc outputDesc, void* output); - -EE check_general(TensorDesc inputDescA, const void* inputA, - TensorDesc inputDescB, const void* inputB, - CheckMode checkMode, - TensorDesc outputDesc, void* output); - -EE layer_normalization_general(void *alpha, void *beta, - TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output); - -EE attention_mask_general(TensorDesc inputDesc, const void* input, - I32 attentionLength, bool sameLength, float mask, - TensorDesc outputDesc, void* output); - -EE concat_general(std::vector inputDesc, std::vector input, TensorDesc outputDesc, void* output, int axis); - -EE padding_general(TensorDesc inputDesc, const void* input, PadDesc padDesc, TensorDesc outputDesc, void* output); -#endif diff --git a/tensor_computing/src/cpu/general/transpose.cpp b/tensor_computing/src/cpu/general/transpose.cpp deleted file mode 100644 index 63e4a396..00000000 --- a/tensor_computing/src/cpu/general/transpose.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include - -#include "cpu/general/tensor_computing_general.h" - -EE transpose_general(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output, U32 *dim) { - if (nullptr == input || nullptr == output || nullptr == dim) - CHECK_STATUS(NULL_POINTER); - - U32 inputDim = inputDesc.nDims; - U32 outputDim = outputDesc.nDims; - CHECK_REQUIREMENT(inputDim == outputDim); - - U32 outputSize = tensorNumElements(outputDesc); - CHECK_REQUIREMENT(inputDim == outputDim); - std::vector inputLocalIndex(inputDim); - U8 *input_ptr = (U8 *)input; - U8 *output_ptr = (U8 *)output; - for (U32 i = 0; i < outputSize; i++) { - U32 outputIndex = i; - for (U32 j = 0; j < outputDim; j++) { - U32 value = outputIndex % outputDesc.dims[j]; - outputIndex /= outputDesc.dims[j]; - inputLocalIndex[inputDim - 1 - dim[outputDim - 1 - j]] = value; - } - U32 inputIndex = 0; - for (U32 j = inputDim-1; j > 0; j--) { - inputIndex = (inputIndex + inputLocalIndex[j]) * inputDesc.dims[j-1]; - } - inputIndex += inputLocalIndex[0]; - memcpy(output_ptr+i*bytesOf(outputDesc.dt), input_ptr+inputIndex*bytesOf(inputDesc.dt), bytesOf(inputDesc.dt)); - } - - return SUCCESS; -} diff --git a/tensor_computing/src/deconvolution.cpp b/tensor_computing/src/deconvolution.cpp deleted file mode 100644 index 13c6bb3a..00000000 --- a/tensor_computing/src/deconvolution.cpp +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. 
All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif - -EE deconvolution_infer_output_size(TensorDesc inputDesc, TensorDesc filterDesc, ConvolutionDesc convDesc, - TensorDesc* outputDesc, DataType targetDataType, U32* outputBytes) -{ - if (nullptr == outputDesc || nullptr == outputBytes) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt, fdt; - DataFormat idf, fdf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - - if (fc % 8 != 0) { - CHECK_STATUS(NOT_SUPPORTED); - } - - if (fh < 1 || fw < 1) { - CHECK_STATUS(NOT_SUPPORTED); - } - - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - - oh = fh + strideH * (ih - 1) - paddingT - paddingB; - ow = fw + strideW * (iw - 1) - paddingL - paddingR; - - *outputDesc = tensor4df(targetDataType, DF_NCHWC8, in, fc, oh, ow); - *outputBytes = tensorNumBytes(*outputDesc); - return SUCCESS; -} - -EE deconvolution_infer_forward_algorithm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionPolicy policy, ConvolutionForwardAlgorithm *algorithm, DataType targetDataType, Arch arch) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = SUCCESS; -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = deconvolution_infer_forward_algorithm_arm(inputDesc, filterDesc, outputDesc, convDesc, policy, algorithm, targetDataType); -#endif - } - return ret; -} - -EE deconvolution_transform_filter_bytes(TensorDesc filterDesc, ConvolutionForwardAlgorithm algorithm, U32* bytes, Arch arch) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = SUCCESS; -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = deconvolution_transform_filter_bytes_arm(filterDesc, algorithm, bytes); -#endif - } - return ret; -} - -EE deconvolution_transform_filter(TensorDesc filterDesc, const void* filter, ConvolutionForwardAlgorithm 
algorithm, - TensorDesc *ftmDesc, void* filterTransformed, Arch arch) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = SUCCESS; -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = deconvolution_transform_filter_arm(filterDesc, filter, algorithm, ftmDesc, filterTransformed); -#endif - } - return ret; -} - -EE deconvolution_infer_forward_tmp_bytes(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes, Arch arch) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = SUCCESS; -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = deconvolution_infer_forward_tmp_bytes_arm(inputDesc, filterDesc, outputDesc, convDesc, algorithm, bytes); -#endif - } - return ret; -} - -EE deconvolution(TensorDesc inputDesc, void* input, - TensorDesc filterDesc, const void* filter, - ConvolutionDesc convDesc, - ConvolutionForwardAlgorithm algorithm, - TensorDesc scaleDesc, const void* scale, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, void* output, - ActivationDesc activationDesc, - Arch arch) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = deconvolution_general(inputDesc, input, - filterDesc, filter, - convDesc, - scaleDesc, scale, - biasDesc, bias, - outputDesc, output, - activationDesc); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = deconvolution_arm(inputDesc, input, - filterDesc, filter, - convDesc, - algorithm, - scaleDesc, scale, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - activationDesc, - arch); -#endif - } - return ret; -} diff --git a/tensor_computing/src/depth2space.cpp b/tensor_computing/src/depth2space.cpp deleted file mode 100644 index 6870774d..00000000 --- a/tensor_computing/src/depth2space.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
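deconvolution_infer_output_size above inverts the usual convolution arithmetic: every input step advances the kernel window by one stride, and padding is trimmed from both ends, giving oh = fh + strideH * (ih - 1) - paddingT - paddingB. A worked one-axis sketch (function name illustrative):

// Transposed-convolution output extent along one axis.
static int deconv_out_dim(int in_dim, int kernel, int stride,
                          int pad_begin, int pad_end)
{
    return kernel + stride * (in_dim - 1) - pad_begin - pad_end;
}

// Example: in_dim = 4, kernel = 3, stride = 2, pad 1 + 1
// -> 3 + 2 * 3 - 2 = 7 output elements.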
- - -#include "tensor_computing.h" -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -EE depth2space_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if(arch == MALI){ -#ifdef _USE_MALI - ret = depth2space_infer_output_size_mali(inputDesc, outputDesc, extInfo->maliInfo.gclmemInputDesc, extInfo->maliInfo.gclmemOutputDesc); -#endif - } - return ret; -} - -EE depth2space(TensorDesc inputDesc, const void* input, TensorDesc outputDesc, void* output, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if(arch == MALI){ -#ifdef _USE_MALI - ret = depth2space_mali(extInfo->maliInfo.handle, inputDesc, (GCLMem_t)input, outputDesc, (GCLMem_t)output); -#endif - } - return ret; -} diff --git a/tensor_computing/src/depthwise_convolution.cpp b/tensor_computing/src/depthwise_convolution.cpp deleted file mode 100644 index 08bc8d2d..00000000 --- a/tensor_computing/src/depthwise_convolution.cpp +++ /dev/null @@ -1,223 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -inline EE depthwise_convolution_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc filterDesc, ConvolutionDesc convDesc, - TensorDesc* outputDesc, DataType targetDataType, U32* outputBytes) -{ - if (nullptr == outputDesc || nullptr == outputBytes) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt, fdt; - DataFormat idf, fdf; - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); - if (fh < 1 || fw < 1) { - CHECK_STATUS(NOT_SUPPORTED); - } - - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; - - U32 fhDilated = (fh - 1) * dilateH + 1; - U32 fwDilated = (fw - 1) * dilateW + 1; - - if (fdf == DF_NCHW || fdf == DF_NCHWC8) { - oc = ic; - } else { - oc = fn; - } - oh = (ih + paddingT + paddingB - fhDilated) / strideH + 1; - ow = (iw + paddingL + paddingR - fwDilated) / strideW + 1; - - if (fn % 8 != 0) { - CHECK_STATUS(NOT_SUPPORTED); - } - - *outputDesc = tensor4df(targetDataType, DF_NCHWC8, in, oc, oh, ow); - *outputBytes = tensorNumBytes(*outputDesc); - return SUCCESS; -} - -EE depthwise_convolution_infer_output_size(TensorDesc inputDesc, TensorDesc filterDesc, ConvolutionDesc convDesc, - TensorDesc* outputDesc, DataType targetDataType, U32* outputBytes, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = SUCCESS; -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = depthwise_convolution_infer_output_size_cpu(inputDesc, filterDesc, convDesc, outputDesc, targetDataType, outputBytes); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = depthwise_convolution_infer_output_size_mali(inputDesc, filterDesc, convDesc, outputDesc, - extInfo->maliInfo.gclmemInputDesc, extInfo->maliInfo.gclmemOutputDesc, extInfo->maliInfo.forwardRunInfo); -#endif - } - return ret; -} - -EE depthwise_convolution_infer_forward_algorithm(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionPolicy policy, DepthwiseConvolutionForwardAlgorithm *algorithm, DataType targetDataType, - ActivationDesc depthwiseActivationDesc, ActivationDesc pointwiseActivationDesc, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = SUCCESS; -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = depthwise_convolution_infer_forward_algorithm_arm(inputDesc, filterDesc, outputDesc, convDesc, policy, algorithm, targetDataType); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = depthwise_convolution_infer_forward_algorithm_mali(extInfo->maliInfo.handle, inputDesc, filterDesc, outputDesc, convDesc, policy, - depthwiseActivationDesc.mode, pointwiseActivationDesc.mode, extInfo->maliInfo.forwardRunInfo); -#endif - } - return ret; -} - -EE 
depthwise_convolution_transform_filter_bytes(TensorDesc filterDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32* bytes, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = SUCCESS; -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = depthwise_convolution_transform_filter_bytes_arm(filterDesc, algorithm, bytes); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = depthwise_convolution_transform_filter_bytes_mali(filterDesc, extInfo->maliInfo.forwardRunInfo, extInfo->maliInfo.gclmemFilterDesc, bytes); -#endif - } - return ret; -} - -EE depthwise_convolution_transform_filter(TensorDesc filterDesc, const void* filter, DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, void* filterTransformed, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = SUCCESS; -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = depthwise_convolution_transform_filter_arm(filterDesc, filter, algorithm, ftmDesc, filterTransformed); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = depthwise_convolution_transform_filter_mali(extInfo->maliInfo.handle, filterDesc, (GCLMem_t)filter, extInfo->maliInfo.forwardRunInfo, ftmDesc, - (GCLMem_t)filterTransformed); -#endif - } - return ret; -} - -EE depthwise_convolution_infer_forward_tmp_bytes(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32 *bytes, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = SUCCESS; -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = depthwise_convolution_infer_forward_tmp_bytes_arm(inputDesc, filterDesc, outputDesc, convDesc, algorithm, bytes); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = depthwise_convolution_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, outputDesc, convDesc, extInfo->maliInfo.forwardRunInfo, bytes); -#endif - } - return ret; -} - -EE depthwise_convolution(TensorDesc inputDesc, void* input, - TensorDesc filterDesc, const void* filter, - ConvolutionDesc convDesc, - DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, void* output, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc, - Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = depthwise_convolution_general(inputDesc, input, - filterDesc, filter, - convDesc, - biasDesc, bias, - outputDesc, output, - depthwiseActivationDesc, - pointwiseActivationDesc); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = depthwise_convolution_arm(inputDesc, input, - filterDesc, filter, - convDesc, - algorithm, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - depthwiseActivationDesc, - pointwiseActivationDesc, - arch); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = depthwise_convolution_mali(extInfo->maliInfo.handle, inputDesc, (GCLMem_t)input, - filterDesc, (GCLMem_t)filter, - convDesc, - extInfo->maliInfo.forwardRunInfo, - biasDesc, (GCLMem_t)bias, - 
tmpBytes, (GCLMem_t)tmp, - outputDesc, (GCLMem_t)output, - depthwiseActivationDesc.mode, - pointwiseActivationDesc.mode); -#endif - } - return ret; -} diff --git a/tensor_computing/src/detectionoutput.cpp b/tensor_computing/src/detectionoutput.cpp deleted file mode 100644 index fd649408..00000000 --- a/tensor_computing/src/detectionoutput.cpp +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif - -inline EE detectionoutput_infer_output_size_cpu(std::vector inputDesc, DetectionOutputDesc detectionoutputDesc, TensorDesc* outputDesc) -{ - if (inputDesc.size() != 3) { - CHECK_STATUS(NOT_MATCH); - } - if (nullptr == outputDesc) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt0, idt2; - DataFormat idf0, idf2; - U32 ih0, iw0; - U32 in2, ic2, ilens2; - //loc - CHECK_STATUS(tensor2dfGet(inputDesc[0], &idt0, &idf0, &ih0, &iw0)); - //priorbox - CHECK_STATUS(tensor3dGet(inputDesc[2], &idt2, &idf2, &in2, &ic2, &ilens2)); - CHECK_REQUIREMENT(iw0 == ilens2); - //output size - U32 oh, ow; - //oh = the first box for saving the number of available boxes(1) + the maximum number of dectected boxes(keep_top_k) - U32 num_detected_max = detectionoutputDesc.keep_top_k; - oh = 1 + num_detected_max; - //Each width is a 6 dimension vector, which stores [label, confidence, xmin, ymin, xmax, ymax] -> 6 - //The first box is [ number of available boxes, 0, 0, 0, 0, 0 ] - ow = 6; - *outputDesc = tensor2df(idt0, idf2, oh, ow); - return SUCCESS; -} - -EE detectionoutput_infer_output_size(std::vector inputDesc, DetectionOutputDesc detectionoutputDesc, TensorDesc* outputDesc, Arch arch, ExtInfo_t extInfo) -{ - UNUSED(arch); - UNUSED(extInfo); - UNUSED(detectionoutputDesc); - CHECK_STATUS(detectionoutput_infer_output_size_cpu(inputDesc, detectionoutputDesc, outputDesc)); - return SUCCESS; -} - -EE detectionoutput(std::vector inputDesc, std::vector input, DetectionOutputDesc detectionoutputDesc, TensorDesc outputDesc, void* output, Arch arch, ExtInfo_t extInfo) -{ - UNUSED(extInfo); - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = detectionoutput_general(inputDesc, input, detectionoutputDesc, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == 
ARM_V7) { - ret = detectionoutput_arm(inputDesc, input, detectionoutputDesc, outputDesc, output); -#endif - } - return ret; -} diff --git a/tensor_computing/src/eltwise.cpp b/tensor_computing/src/eltwise.cpp deleted file mode 100644 index bb1da6a9..00000000 --- a/tensor_computing/src/eltwise.cpp +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -// [1, 10, 10] + [1, 10, 10] = [1, 10, 10] -// [1, 10, 1] + [1, 1, 10] = [1, 10, 10] -// [1, 20, 10] + [10] = [1, 20, 10] -inline EE eltwise_infer_output_size_cpu(std::vector inputDesc, TensorDesc* outputDesc) -{ - if (nullptr == outputDesc) - CHECK_STATUS(NULL_POINTER); - U32 num = inputDesc.size(); - if (num <= 0) - return NOT_MATCH; - - U32 arrayDimMax = 0; - for (U32 i = 1; i < num; i++) { - if (inputDesc[i].nDims > inputDesc[arrayDimMax].nDims) - arrayDimMax = i; - } - - U32 dim = inputDesc[arrayDimMax].nDims; - *outputDesc = inputDesc[arrayDimMax]; - - // DF should either all be NCHWC8, or all be non-C8 - bool isC8 = DF_NCHWC8 == (*outputDesc).df; - for (U32 i = 0; i < num; i++) { - if (isC8) { - CHECK_REQUIREMENT(DF_NCHWC8 == inputDesc[i].df); - } else { - CHECK_REQUIREMENT(DF_NCHWC8 != inputDesc[i].df); - } - } - for (U32 i = 0; i < dim; i++) { - for (U32 j = 0; j < num; j++) { - if (inputDesc[j].nDims > i) { - outputDesc->dims[i] = UNI_MAX(outputDesc->dims[i], inputDesc[j].dims[i]); - } - } - } - return SUCCESS; -} - -EE eltwise_infer_output_size(std::vector inputDesc, TensorDesc* outputDesc, Arch arch, ExtInfo_t extInfo) { - EE ret = NOT_SUPPORTED; - if(arch == MALI){ -#ifdef _USE_MALI - ret = eltwise_infer_output_size_mali(inputDesc, outputDesc, extInfo->maliInfo.gclmemInputDesc, extInfo->maliInfo.gclmemOutputDesc); -#endif - } else { - ret = eltwise_infer_output_size_cpu(inputDesc, outputDesc); - } - return ret; -} - -EE eltwise(std::vector inputDesc, std::vector input, - TensorDesc outputDesc, void* output, EltwiseMode eltwiseMode, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = eltwise_general(inputDesc, input, outputDesc, output, eltwiseMode); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || 
arch == ARM_V8 || arch == ARM_V7) { - ret = eltwise_arm(inputDesc, input, outputDesc, output, eltwiseMode); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = eltwise_mali(extInfo->maliInfo.handle, inputDesc, input, outputDesc, (GCLMem_t)output, eltwiseMode); -#endif - } - return ret; -} - diff --git a/tensor_computing/src/embedding.cpp b/tensor_computing/src/embedding.cpp deleted file mode 100644 index 1a7d2bbd..00000000 --- a/tensor_computing/src/embedding.cpp +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#include "cpu/general/tensor_computing_general.h" -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -EE embedding_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, U32 inputDim, U32 numOutput, DataType dt, Arch arch, ExtInfo_t extInfo) -{ -#ifdef _USE_MALI - if(arch == MALI){ - CHECK_STATUS(embedding_infer_output_size_mali(inputDesc, outputDesc, inputDim, numOutput, dt, extInfo->maliInfo.gclmemInputDesc, extInfo->maliInfo.gclmemOutputDesc)); - } else { -#endif - UNUSED(inputDesc); - UNUSED(outputDesc); - UNUSED(inputDim); - UNUSED(numOutput); - UNUSED(arch); - UNUSED(extInfo); - return NOT_SUPPORTED; -#ifdef _USE_MALI - } -#endif - return SUCCESS; -} - -EE embedding(TensorDesc inputDesc, void* input, TensorDesc weightDesc, void* weight, TensorDesc outputDesc, void *output, U32 inputDim, U32 numOutput, bool transpose, DataType dt, Arch arch, ExtInfo_t extInfo) -{ - EE ret = SUCCESS; - switch (arch) { -#ifdef _USE_MALI - case MALI: - ret = embedding_mali(extInfo->maliInfo.handle, inputDesc, (GCLMem_t)input, weightDesc, (GCLMem_t)weight, outputDesc, (GCLMem_t)output, inputDim, numOutput, transpose, dt); - break; -#endif - default: - UNUSED(inputDesc); - UNUSED(input); - UNUSED(weightDesc); - UNUSED(weight); - UNUSED(outputDesc); - UNUSED(output); - UNUSED(inputDim); - UNUSED(numOutput); - UNUSED(transpose); - UNUSED(extInfo); - ret = NOT_SUPPORTED; - } - return ret; -} - diff --git a/tensor_computing/src/fully_connected.cpp b/tensor_computing/src/fully_connected.cpp deleted file mode 100644 index cdad9d8b..00000000 --- a/tensor_computing/src/fully_connected.cpp +++ /dev/null @@ -1,318 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include <string.h> - -#include "tensor_computing.h" -#include "blas-enhance.h" -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -// input format: NCHW|NCHWC8|NORMAL -// weight(filter) format: NORMAL -// result format: NORMAL - -inline EE fully_connected_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc *outputDesc) -{ - if(outputDesc == nullptr) - CHECK_STATUS(NULL_POINTER); - - DataType idt, fdt; - DataFormat idf, fdf; - U32 in, ic, ih, iw; - U32 fh, fw; - if (tensorIs2d(inputDesc)) { - CHECK_STATUS(tensor2dfGet(inputDesc, &idt, &idf, &in, &iw)); - ic = 1; - ih = 1; - } else if (tensorIs4d(inputDesc)) { - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - if (idf != DF_NCHW && idf != DF_NCHWC8) { - CHECK_STATUS(NOT_MATCH); - } - } else { - CHECK_STATUS(NOT_MATCH); - } - - CHECK_REQUIREMENT(tensorIs2d(filterDesc)); - CHECK_STATUS(tensor2dfGet(filterDesc, &fdt, &fdf, &fh, &fw)); - if (fdf != DF_NORMAL) - CHECK_STATUS(NOT_MATCH); - - if (fw != ic * ih * iw) - CHECK_STATUS(NOT_MATCH); - - *outputDesc = tensor2df(idt, DF_NORMAL, in, fh); - return SUCCESS; -} - -EE fully_connected_infer_output_size(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if(arch == MALI){ -#ifdef _USE_MALI - ret = fully_connected_infer_output_size_mali(inputDesc, filterDesc, outputDesc, extInfo->maliInfo.gclmemInputDesc, extInfo->maliInfo.gclmemOutputDesc, extInfo->maliInfo.forwardRunInfo); -#endif - } else { - ret = fully_connected_infer_output_size_cpu(inputDesc, filterDesc, outputDesc); - } - return ret; -} - -EE fully_connected_infer_forward_algorithm(TensorDesc inputDesc, TensorDesc filterDesc, std::vector<TensorDesc> outputDescs, Arch arch, ExtInfo_t extInfo) { - EE ret = NOT_SUPPORTED; - if(arch == MALI){ -#ifdef _USE_MALI - ret = fully_connected_infer_forward_algorithm_mali(extInfo->maliInfo.handle, inputDesc, filterDesc, outputDescs, extInfo->maliInfo.forwardRunInfo); -#endif - } else { - UNUSED(inputDesc); - UNUSED(filterDesc); - UNUSED(outputDescs); - UNUSED(extInfo); - } - return ret; -} -EE fully_connected_infer_forward_tmp_bytes(TensorDesc inputDesc, TensorDesc filterDesc, U32 *bytes, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if(arch == MALI){ -#ifdef _USE_MALI - ret = fully_connected_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, bytes, extInfo->maliInfo.forwardRunInfo); -#endif - } else { - if(bytes ==
nullptr) CHECK_STATUS(NULL_POINTER); - DataType idt; - DataFormat idf; - U32 in, ic, ih, iw; - if (tensorIs2d(inputDesc)) { - CHECK_STATUS(tensor2dfGet(inputDesc, &idt, &idf, &in, &iw)); - ic = ih = 1; - } - else if (tensorIs4d(inputDesc)) { - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - } - else { - return NOT_MATCH; - } - - if(in != 1){ - // call gemm - TensorDesc in_desc = tensor2df(idt, DF_NORMAL, in, ic*ih*iw); - ret = matrix_matrix_multiply_tmp_bytes(in_desc, filterDesc, bytes, arch); - } - else{ - // call gemv - TensorDesc in_desc = tensor1d(idt, ic*ih*iw); - ret = matrix_vector_multiply_tmp_bytes(filterDesc, in_desc, bytes, arch); - } - } - return ret; -} - - -EE fully_connected_transform_filter_bytes(TensorDesc filterDesc, U32* bytes, Arch arch, ExtInfo_t extInfo) -{ - if(arch == MALI){ -#ifdef _USE_MALI - CHECK_STATUS(fully_connected_transform_filter_bytes_mali(filterDesc, extInfo->maliInfo.gclmemFilterDesc, bytes, extInfo->maliInfo.forwardRunInfo)); -#endif - } else { - if (bytes == nullptr) CHECK_STATUS(NULL_POINTER); - *bytes = tensorNumBytes(filterDesc) + 32; - } - return SUCCESS; -} - - -template <typename T> -EE fully_connected_transform_filter_kernel(TensorDesc inputDesc, TensorDesc filterDesc, const void* filter, - TensorDesc *ftmDesc, void* filterTransformed) -{ - if (filter == nullptr || ftmDesc == nullptr || filterTransformed == nullptr) { - CHECK_STATUS(NULL_POINTER); - } - - DataType idt, fdt; - DataFormat idf, fdf; - U32 in, ic, ih, iw; - U32 fh, fw; - if (tensorIs2d(inputDesc)) { - CHECK_STATUS(tensor2dfGet(inputDesc, &idt, &idf, &in, &iw)); - ic = ih = 1; - } else if (tensorIs4d(inputDesc)) { - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - } else { - CHECK_STATUS(NOT_MATCH); - } - CHECK_STATUS(tensor2dfGet(filterDesc, &fdt, &fdf, &fh, &fw)); - - if (fw != ic*ih*iw) - CHECK_STATUS(NOT_MATCH); - bool need_transpose = false; - if (in > 1) - need_transpose = true; - - if (idf == DF_NCHW || idf == DF_NORMAL) { - if (need_transpose) { - T *f_ptr = (T *)filter; - T *ftm_ptr = (T *)filterTransformed; - for (U32 h = 0; h < fh; h++) { - for(U32 w = 0; w < fw; w++){ - U32 f_index = h * fw + w; - U32 ftm_index = w * fh + h; - ftm_ptr[ftm_index] = f_ptr[f_index]; - } - } - } else { - memcpy(filterTransformed, filter, tensorNumBytes(filterDesc)); - } - } - if (idf == DF_NCHWC8) { - U32 align = 8; - U32 ic_new = ic / align; - T *f_ptr = (T *)filter; - T *ftm_ptr = (T *)filterTransformed; - for (U32 h = 0; h < fh; h++) { - for(U32 w = 0; w < fw; w++){ - U32 i_n = w / (ic * ih * iw); - U32 remain = w % (ic * ih * iw); - U32 i_c = remain / (ih * iw); - remain = remain % (ih * iw); - U32 i_h = remain / iw; - U32 i_w = remain % iw; - U32 i_c_outer = i_c / align; - U32 i_c_inner = i_c % align; - U32 h_new = h; - U32 w_new = (((i_n * ic_new + i_c_outer) * ih + i_h) * iw + i_w) * align + i_c_inner; - U32 ld = fw; - if (need_transpose) { - U32 tmp = h_new; - h_new = w_new; - w_new = tmp; - ld = fh; - } - U32 f_index = h * fw + w; - U32 ftm_index = h_new * ld + w_new; - ftm_ptr[ftm_index] = f_ptr[f_index]; - } - } - } - - DataFormat fdf_after = fdf; - U32 fh_after = fh; - U32 fw_after = fw; - if (need_transpose) { - fh_after = fw; - fw_after = fh; - } - *ftmDesc = tensor2df(fdt, fdf_after, fh_after, fw_after); - return SUCCESS; -} - -EE fully_connected_transform_filter(TensorDesc inputDesc, TensorDesc filterDesc, const void* filter, - TensorDesc *ftmDesc, void* filterTransformed, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if(arch
== MALI){ -#ifdef _USE_MALI - ret = fully_connected_transform_filter_mali(extInfo->maliInfo.handle, filterDesc, (GCLMem_t)filter, ftmDesc, (std::vector<GCLMem_t>*)(filterTransformed), extInfo->maliInfo.forwardRunInfo); -#endif - } else { - switch(filterDesc.dt) { -#ifdef _USE_FP16 - case DT_F16: { - ret = fully_connected_transform_filter_kernel<F16>(inputDesc, filterDesc, filter, ftmDesc, filterTransformed); - break; - } -#endif -#ifdef _USE_FP32 - case DT_F32: { - ret = fully_connected_transform_filter_kernel<F32>(inputDesc, filterDesc, filter, ftmDesc, filterTransformed); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - } - return ret; -} - -EE fully_connected(TensorDesc inputDesc, const void* input, - TensorDesc filterDesc, const void* filter, - void* tmp, U32 bytes, - TensorDesc outputDesc, void* output, - TensorDesc biasDesc, const void* bias, - Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if(arch == MALI) { -#ifdef _USE_MALI - ret = fully_connected_mali(extInfo->maliInfo.handle, inputDesc, (GCLMem_t)input, - filterDesc, (std::vector<GCLMem_t>*) filter, - biasDesc, (std::vector<GCLMem_t>*) bias, - bytes, (GCLMem_t) tmp, - outputDesc, (std::vector<GCLMem_t>*) output, extInfo->maliInfo.forwardRunInfo); -#endif - } else { - if(input == nullptr || filter == nullptr || output == nullptr) { - CHECK_STATUS(NULL_POINTER); - } - - U32 in, ic, ih, iw; - U32 oh, ow; - U32 fh, fw, bw; - DataType idt, fdt, odt, bdt; - DataFormat idf, fdf, odf; - if (tensorIs2d(inputDesc)) { - CHECK_STATUS(tensor2dfGet(inputDesc, &idt, &idf, &in, &iw)); - ic = ih = 1; - } else if (tensorIs4d(inputDesc)){ - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - } else { - CHECK_STATUS(NOT_MATCH); - } - - CHECK_REQUIREMENT(tensorIs2d(filterDesc)); - CHECK_STATUS(tensor2dfGet(filterDesc, &fdt, &fdf, &fh, &fw)); - CHECK_STATUS(tensor2dfGet(outputDesc, &odt, &odf, &oh, &ow)); - - if (bias != nullptr) { - CHECK_STATUS(tensor1dGet(biasDesc, &bdt, &bw)); - - if (bw != ow) { - CHECK_STATUS(NOT_MATCH); - } else { - U8 *outArray = (U8*)output; - U32 size = tensorNumBytes(biasDesc); - for (U32 i = 0; i < in; i++) { - memcpy(outArray + i*size, bias, size); - } - } - } - if (in == 1 && (fdf == DF_NORMAL || fdf == DF_TRANSPOSE)) { - TensorDesc vectorDesc = tensor1d(idt, ic*ih*iw); - TensorDesc resultDesc = tensor1d(odt, ow); - ret = matrix_vector_multiply(filterDesc, filter, vectorDesc, input, bytes, tmp, resultDesc, output, arch); - } else { - if (idf == DF_TRANSPOSE || fdf == DF_TRANSPOSE) CHECK_STATUS(NOT_MATCH); - TensorDesc in_desc = tensor2df(idt, DF_NORMAL, in, ic*ih*iw); - ret = matrix_matrix_multiply(in_desc, input, filterDesc, filter, bytes, tmp, outputDesc, output, arch); - } - } - return ret; -} diff --git a/tensor_computing/src/get_output.cpp b/tensor_computing/src/get_output.cpp deleted file mode 100644 index c59e7978..00000000 --- a/tensor_computing/src/get_output.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -EE tensor_computing_get_output_infer_tmpBuf_size(const void* input, TensorDesc hostDesc, U32* tmpBufSize, Arch arch) -{ - EE ret = NOT_SUPPORTED; - if(arch == MALI) { -#ifdef _USE_MALI - ret = tensor_computing_get_output_infer_tmpBuf_size_mali((const GCLMem_t)input, hostDesc, tmpBufSize); -#endif - } - return ret; -} - -EE tensor_computing_get_output(const void* input, TensorDesc hostDesc, void** hostPtr, void* tmpBuf, bool blocking, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == MALI) { -#ifdef _USE_MALI - ret = tensor_computing_get_output_mali(extInfo->maliInfo.handle, (const GCLMem_t)input, hostDesc, (U8**)hostPtr, (GCLMem_t)tmpBuf, blocking); -#endif - } - return ret; -} diff --git a/tensor_computing/src/gpu/mali/activation.cpp b/tensor_computing/src/gpu/mali/activation.cpp deleted file mode 100644 index 742eae35..00000000 --- a/tensor_computing/src/gpu/mali/activation.cpp +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/fp16/activation_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -EE activation_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc){ - /*tensorDesc record cpu org data format info*/ - /*gclmemDesc record gpu trans data format info*/ - if(outputDesc) *outputDesc = inputDesc; - - if(inputDesc.df == DF_NCHW || inputDesc.df == DF_MKT) { - DataType idt; - DataFormat idf; - U32 iw, ih, ic, in; - if(inputDesc.df == DF_NCHW) tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); - if(inputDesc.df == DF_MKT) { - U32 m, k, t; - get_nlp_mkt_val(inputDesc, &idt, &m, &k, &t); - map_nlp_mkt_to_ncwhc4(m, k, t, &iw, &ih, &ic); - ic = 4 * ic; - } - CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); - if(gclmemInputDesc && gclmemOutputDesc) *gclmemOutputDesc = *gclmemInputDesc; - return SUCCESS; - } - return NOT_SUPPORTED; -} - -inline EE activation_checkpara_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode activationMode) { - - if(handle == nullptr || nullptr == input || nullptr == output) return NULL_POINTER; - if(inputDesc.df != outputDesc.df) return NOT_SUPPORTED; - if(outputDesc.df != DF_NCHW && outputDesc.df != DF_MKT) return NOT_SUPPORTED; - if(input->desc.memFormat != DF_NCWHC4) return NOT_SUPPORTED; - if(output->desc.memFormat != DF_NCWHC4) return NOT_SUPPORTED; - if(activationMode != ACTIVATION_NULL && - activationMode != ACTIVATION_RELU && - activationMode != ACTIVATION_RELU6 && - activationMode != ACTIVATION_H_SIGMOID && - activationMode != ACTIVATION_H_SWISH && - activationMode != ACTIVATION_GELU && - activationMode != ACTIVATION_TANH && - activationMode != ACTIVATION_SIGMOID) return NOT_SUPPORTED; - return SUCCESS; -} - -EE activation_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode activationMode) { - EE ret = SUCCESS; - CHECK_STATUS(activation_checkpara_mali(handle, inputDesc, input, outputDesc, output, activationMode)); - switch(inputDesc.dt){ - case DT_F16:{ - ret = activation_mali_fp16(handle, inputDesc, input, outputDesc, output, activationMode); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - - - diff --git a/tensor_computing/src/gpu/mali/bilateral_slice_apply.cpp b/tensor_computing/src/gpu/mali/bilateral_slice_apply.cpp deleted file mode 100644 index 5a22fc80..00000000 --- a/tensor_computing/src/gpu/mali/bilateral_slice_apply.cpp +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h" -#include "gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -inline EE bilateral_slice_apply_checkpara_mali_common(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc guideDesc, - const GCLMem_t guide, - TensorDesc gridDesc, - const GCLMem_t grid, - BilateralSliceApplyDesc bilateralSliceApplyDesc, - TensorDesc outputDesc, - GCLMem_t output){ - - if (nullptr == handle || nullptr == input || nullptr == grid || nullptr == output) return NULL_POINTER; - if (bilateralSliceApplyDesc.mode == BSliceApply_NULL && nullptr == guide) return NULL_POINTER; - if (inputDesc.df != guideDesc.df || inputDesc.df != gridDesc.df) return NOT_SUPPORTED; - if (inputDesc.df != outputDesc.df || inputDesc.df != DF_NHWC) return NOT_SUPPORTED; - if (inputDesc.dims[0] != guideDesc.dims[0] || inputDesc.dims[1] != guideDesc.dims[1]) return NOT_MATCH; - if (inputDesc.dims[0] != outputDesc.dims[0] || inputDesc.dims[1] != outputDesc.dims[1]) return NOT_MATCH; - if (inputDesc.dims[2] != outputDesc.dims[2]) return NOT_MATCH; - if ((gridDesc.dims[2] % bilateralSliceApplyDesc.coefficient_len) != 0) return NOT_MATCH; - if (bilateralSliceApplyDesc.has_offset == true){ - if(bilateralSliceApplyDesc.coefficient_len != inputDesc.dims[2] * (inputDesc.dims[2] + 1)) return NOT_MATCH; - if(bilateralSliceApplyDesc.coefficient_len != 12) return NOT_SUPPORTED; - } else { - return NOT_SUPPORTED; - //if(bilateralSliceApplyDesc.coefficient_len != inputDesc.dims[2] * inputDesc.dims[2]) return NOT_MATCH; - //if(bilateralSliceApplyDesc.coefficient_len != 9) return NOT_SUPPORTED; - } - return SUCCESS; -} - -EE bilateral_slice_apply_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc guideDesc, - TensorDesc gridDesc, - BilateralSliceApplyDesc bilateralSliceApplyDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemGuideDesc, - GCLMemDesc_t gclmemGridDesc, - GCLMemDesc_t gclmemOutputDesc) { - UNUSED(bilateralSliceApplyDesc); - DataType idt, gdt, guide_dt; - DataFormat idf, gdf; - U32 guide_w, guide_h, guide_c, guide_n; - U32 iw, ih, ic, in; - U32 ow, oh, oc, on; - U32 gw, gh, gc, gn; - - if(inputDesc.df != DF_NHWC || guideDesc.df != DF_NHWC) return NOT_MATCH; - tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); - tensorSelectGet(guideDesc, &guide_dt, &gdf, &guide_n, &guide_c, &guide_h, &guide_w); - tensorSelectGet(gridDesc, &gdt, &gdf, &gn, &gc, &gh, &gw); - ow = guide_w; - oh = guide_h; - oc = ic; - on = guide_n; - if(outputDesc) *outputDesc = tensor4df(idt, idf, on, oc, oh, ow); - CHECK_STATUS(infer_gclmem_desc_nhwc(iw, ih, ic, 0, 0, ow, oh, oc, idt, idt, gclmemInputDesc, gclmemOutputDesc)); - - if(gclmemGridDesc && gclmemGuideDesc) { - U32 s0, s1, s2; - U32 num, byteSize; - s0 = gc; - s1 = gw; - s2 = gh; - num = s0 * s1 * s2; - byteSize = s0 * s1 * s2 * bytesOf(gdt); 
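- // grid GCLMem layout: DF_NHWC buffer, channel fastest, stride = (gc, gw, gh), zero offsets, dense (byteSize = num * bytesOf(gdt))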
- gclmemGridDesc->stride[0] = s0; - gclmemGridDesc->stride[1] = s1; - gclmemGridDesc->stride[2] = s2; - gclmemGridDesc->offset[0] = 0; - gclmemGridDesc->offset[1] = 0; - gclmemGridDesc->offset[2] = 0; - gclmemGridDesc->num = num; - gclmemGridDesc->byteSize = byteSize; - gclmemGridDesc->memType = GCL_MEM_BUF; - gclmemGridDesc->memFormat = DF_NHWC; - gclmemGridDesc->flags = CL_MEM_READ_WRITE; - gclmemGridDesc->host_ptr = NULL; - - if(bilateralSliceApplyDesc.mode == BSliceApply_NULL) { - s0 = guide_c; - s1 = guide_w; - s2 = guide_h; - num = s0 * s1 * s2; - byteSize = s0 * s1 * s2 * bytesOf(guide_dt); - gclmemGuideDesc->stride[0] = s0; - gclmemGuideDesc->stride[1] = s1; - gclmemGuideDesc->stride[2] = s2; - gclmemGuideDesc->offset[0] = 0; - gclmemGuideDesc->offset[1] = 0; - gclmemGuideDesc->offset[2] = 0; - gclmemGuideDesc->num = num; - gclmemGuideDesc->byteSize = byteSize; - gclmemGuideDesc->memType = GCL_MEM_BUF; - gclmemGuideDesc->memFormat = DF_NHWC; - gclmemGuideDesc->flags = CL_MEM_READ_WRITE; - gclmemGuideDesc->host_ptr = NULL; - } - } - return SUCCESS; -} - -EE bilateral_slice_apply_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, - TensorDesc guideDesc, - TensorDesc gridDesc, - BilateralSliceApplyDesc bilateralSliceApplyDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes){ - - UNUSED(inputDesc); - UNUSED(guideDesc); - UNUSED(gridDesc); - UNUSED(bilateralSliceApplyDesc); - UNUSED(forwardRunInfo); - - DataType dt; - U32 gc, gw; - U32 ih; - tensorSelectGet(gridDesc, &dt, NULL, NULL, &gc, NULL, &gw); - tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, NULL); - *bytes = gc * gw * ih * bytesOf(dt); - return SUCCESS; -} - -EE bilateral_slice_apply_mali(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc guideDesc, - const GCLMem_t guide, - TensorDesc gridDesc, - const GCLMem_t grid, - BilateralSliceApplyDesc bilateralSliceApplyDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output){ - EE ret = SUCCESS; - CHECK_STATUS(bilateral_slice_apply_checkpara_mali_common(handle, inputDesc, input, guideDesc, guide, gridDesc, grid, bilateralSliceApplyDesc, outputDesc, output)); - switch(inputDesc.dt){ - case DT_F16:{ - ret = bilateral_slice_apply_mali_fp16(handle, inputDesc, input, guideDesc, guide, gridDesc, grid, bilateralSliceApplyDesc, forwardRunInfo, - tmpBytes, tmpBuf, outputDesc, output); - break; - } - case DT_U8:{ - ret = bilateral_slice_apply_mali_uchar(handle, inputDesc, input, guideDesc, guide, gridDesc, grid, bilateralSliceApplyDesc, forwardRunInfo, - tmpBytes, tmpBuf, outputDesc, output); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - - - diff --git a/tensor_computing/src/gpu/mali/cl/activation.cl b/tensor_computing/src/gpu/mali/cl/activation.cl deleted file mode 100644 index 4e0d9297..00000000 --- a/tensor_computing/src/gpu/mali/cl/activation.cl +++ /dev/null @@ -1,262 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - - -#define MANGLE_NAME_IMPL(base, AC, H) base ## AC ## H -#define MANGLE_NAME(base, AC, H) MANGLE_NAME_IMPL(base, AC, H) - -#if(H == 1) -#if defined(USE_RELU) -#define calCore(val) {\ - if(val.s0 < 0) val.s0 = 0;\ - if(val.s1 < 0) val.s1 = 0;\ - if(val.s2 < 0) val.s2 = 0;\ - if(val.s3 < 0) val.s3 = 0;\ -} -#elif defined(USE_RELU6) -#define calCore(val) {\ - val.s0 = clamp(val.s0, (T)0, (T)6.0);\ - val.s1 = clamp(val.s1, (T)0, (T)6.0);\ - val.s2 = clamp(val.s2, (T)0, (T)6.0);\ - val.s3 = clamp(val.s3, (T)0, (T)6.0);\ -} -#elif defined(USE_HSIGMOID) -#define calCore(val) {\ - val.s0 = val.s0 + (T)3.0;\ - val.s1 = val.s1 + (T)3.0;\ - val.s2 = val.s2 + (T)3.0;\ - val.s3 = val.s3 + (T)3.0;\ - val.s0 = clamp(val.s0, (T)0, (T)6.0);\ - val.s1 = clamp(val.s1, (T)0, (T)6.0);\ - val.s2 = clamp(val.s2, (T)0, (T)6.0);\ - val.s3 = clamp(val.s3, (T)0, (T)6.0);\ - val.s0 = val.s0 * 0.166667;\ - val.s1 = val.s1 * 0.166667;\ - val.s2 = val.s2 * 0.166667;\ - val.s3 = val.s3 * 0.166667;\ -} -#elif defined(USE_HSWISH) -#define calCore(val) {\ - T4 tmp = val;\ - val.s0 = val.s0 + (T)3.0;\ - val.s1 = val.s1 + (T)3.0;\ - val.s2 = val.s2 + (T)3.0;\ - val.s3 = val.s3 + (T)3.0;\ - val.s0 = clamp(val.s0, (T)0, (T)6.0);\ - val.s1 = clamp(val.s1, (T)0, (T)6.0);\ - val.s2 = clamp(val.s2, (T)0, (T)6.0);\ - val.s3 = clamp(val.s3, (T)0, (T)6.0);\ - val.s0 = tmp.s0 * (val.s0 * 0.166667);\ - val.s1 = tmp.s1 * (val.s1 * 0.166667);\ - val.s2 = tmp.s2 * (val.s2 * 0.166667);\ - val.s3 = tmp.s3 * (val.s3 * 0.166667);\ -} -#elif defined(USE_GELU) -#define calCore(val) {\ - T4 tmp = val;\ - val.s0 = 0.797885 * (val.s0 + 0.044715 * pown(val.s0, 3));\ - val.s1 = 0.797885 * (val.s1 + 0.044715 * pown(val.s1, 3));\ - val.s2 = 0.797885 * (val.s2 + 0.044715 * pown(val.s2, 3));\ - val.s3 = 0.797885 * (val.s3 + 0.044715 * pown(val.s3, 3));\ - val.s0 = 1.0 - 2.0 / (exp(2.0 * val.s0) + 1.0);\ - val.s1 = 1.0 - 2.0 / (exp(2.0 * val.s1) + 1.0);\ - val.s2 = 1.0 - 2.0 / (exp(2.0 * val.s2) + 1.0);\ - val.s3 = 1.0 - 2.0 / (exp(2.0 * val.s3) + 1.0);\ - val.s0 = (val.s0 + (T)1.0) * (T)0.5;\ - val.s1 = (val.s1 + (T)1.0) * (T)0.5;\ - val.s2 = (val.s2 + (T)1.0) * (T)0.5;\ - val.s3 = (val.s3 + (T)1.0) * (T)0.5;\ - val.s0 = val.s0 * tmp.s0;\ - val.s1 = val.s1 * tmp.s1;\ - val.s2 = val.s2 * tmp.s2;\ - val.s3 = val.s3 * tmp.s3;\ -} -#elif defined(USE_TANH) -#define calCore(val) {\ - val.s0 = 1.0 - 2.0 / (exp(2.0 * val.s0) + 1.0);\ - val.s1 = 1.0 - 2.0 / (exp(2.0 * val.s1) + 1.0);\ - val.s2 = 1.0 - 2.0 / (exp(2.0 
* val.s2) + 1.0);\ - val.s3 = 1.0 - 2.0 / (exp(2.0 * val.s3) + 1.0);\ -} -#elif defined(USE_SIGMOID) -#define calCore(val) {\ - val.s0 = 1.0 / (1.0 + exp(-1.0 * val.s0));\ - val.s1 = 1.0 / (1.0 + exp(-1.0 * val.s1));\ - val.s2 = 1.0 / (1.0 + exp(-1.0 * val.s2));\ - val.s3 = 1.0 / (1.0 + exp(-1.0 * val.s3));\ -} -#endif -#endif - -#if(H == 2) -#if defined(USE_RELU) -#define calCore(val) {\ - if(val.s0 < 0) val.s0 = 0;\ - if(val.s1 < 0) val.s1 = 0;\ - if(val.s2 < 0) val.s2 = 0;\ - if(val.s3 < 0) val.s3 = 0;\ - if(val.s4 < 0) val.s4 = 0;\ - if(val.s5 < 0) val.s5 = 0;\ - if(val.s6 < 0) val.s6 = 0;\ - if(val.s7 < 0) val.s7 = 0;\ -} -#elif defined(USE_RELU6) -#define calCore(val) {\ - val.s0 = clamp(val.s0, (T)0, (T)6.0);\ - val.s1 = clamp(val.s1, (T)0, (T)6.0);\ - val.s2 = clamp(val.s2, (T)0, (T)6.0);\ - val.s3 = clamp(val.s3, (T)0, (T)6.0);\ - val.s4 = clamp(val.s4, (T)0, (T)6.0);\ - val.s5 = clamp(val.s5, (T)0, (T)6.0);\ - val.s6 = clamp(val.s6, (T)0, (T)6.0);\ - val.s7 = clamp(val.s7, (T)0, (T)6.0);\ -} -#elif defined(USE_HSIGMOID) -#define calCore(val) {\ - val.s0 = val.s0 + (T)3.0;\ - val.s1 = val.s1 + (T)3.0;\ - val.s2 = val.s2 + (T)3.0;\ - val.s3 = val.s3 + (T)3.0;\ - val.s4 = val.s4 + (T)3.0;\ - val.s5 = val.s5 + (T)3.0;\ - val.s6 = val.s6 + (T)3.0;\ - val.s7 = val.s7 + (T)3.0;\ - val.s0 = clamp(val.s0, (T)0, (T)6.0);\ - val.s1 = clamp(val.s1, (T)0, (T)6.0);\ - val.s2 = clamp(val.s2, (T)0, (T)6.0);\ - val.s3 = clamp(val.s3, (T)0, (T)6.0);\ - val.s4 = clamp(val.s4, (T)0, (T)6.0);\ - val.s5 = clamp(val.s5, (T)0, (T)6.0);\ - val.s6 = clamp(val.s6, (T)0, (T)6.0);\ - val.s7 = clamp(val.s7, (T)0, (T)6.0);\ - val.s0 = val.s0 * 0.166667;\ - val.s1 = val.s1 * 0.166667;\ - val.s2 = val.s2 * 0.166667;\ - val.s3 = val.s3 * 0.166667;\ - val.s4 = val.s4 * 0.166667;\ - val.s5 = val.s5 * 0.166667;\ - val.s6 = val.s6 * 0.166667;\ - val.s7 = val.s7 * 0.166667;\ -} -#elif defined(USE_HSWISH) -#define calCore(val) {\ - T8 tmp = val;\ - val.s0 = val.s0 + (T)3.0;\ - val.s1 = val.s1 + (T)3.0;\ - val.s2 = val.s2 + (T)3.0;\ - val.s3 = val.s3 + (T)3.0;\ - val.s4 = val.s4 + (T)3.0;\ - val.s5 = val.s5 + (T)3.0;\ - val.s6 = val.s6 + (T)3.0;\ - val.s7 = val.s7 + (T)3.0;\ - val.s0 = clamp(val.s0, (T)0, (T)6.0);\ - val.s1 = clamp(val.s1, (T)0, (T)6.0);\ - val.s2 = clamp(val.s2, (T)0, (T)6.0);\ - val.s3 = clamp(val.s3, (T)0, (T)6.0);\ - val.s4 = clamp(val.s4, (T)0, (T)6.0);\ - val.s5 = clamp(val.s5, (T)0, (T)6.0);\ - val.s6 = clamp(val.s6, (T)0, (T)6.0);\ - val.s7 = clamp(val.s7, (T)0, (T)6.0);\ - val.s0 = tmp.s0 * (val.s0 * 0.166667);\ - val.s1 = tmp.s1 * (val.s1 * 0.166667);\ - val.s2 = tmp.s2 * (val.s2 * 0.166667);\ - val.s3 = tmp.s3 * (val.s3 * 0.166667);\ - val.s4 = tmp.s4 * (val.s4 * 0.166667);\ - val.s5 = tmp.s5 * (val.s5 * 0.166667);\ - val.s6 = tmp.s6 * (val.s6 * 0.166667);\ - val.s7 = tmp.s7 * (val.s7 * 0.166667);\ -} -#elif defined(USE_GELU) -#define calCore(val) {\ - T8 tmp = val;\ - val.s0 = 0.797885 * (val.s0 + 0.044715 * pown(val.s0, 3));\ - val.s1 = 0.797885 * (val.s1 + 0.044715 * pown(val.s1, 3));\ - val.s2 = 0.797885 * (val.s2 + 0.044715 * pown(val.s2, 3));\ - val.s3 = 0.797885 * (val.s3 + 0.044715 * pown(val.s3, 3));\ - val.s4 = 0.797885 * (val.s4 + 0.044715 * pown(val.s4, 3));\ - val.s5 = 0.797885 * (val.s5 + 0.044715 * pown(val.s5, 3));\ - val.s6 = 0.797885 * (val.s6 + 0.044715 * pown(val.s6, 3));\ - val.s7 = 0.797885 * (val.s7 + 0.044715 * pown(val.s7, 3));\ - val.s0 = 1.0 - 2.0 / (exp(2.0 * val.s0) + 1.0);\ - val.s1 = 1.0 - 2.0 / (exp(2.0 * val.s1) + 1.0);\ - val.s2 = 1.0 - 2.0 / (exp(2.0 * 
val.s2) + 1.0);\ - val.s3 = 1.0 - 2.0 / (exp(2.0 * val.s3) + 1.0);\ - val.s4 = 1.0 - 2.0 / (exp(2.0 * val.s4) + 1.0);\ - val.s5 = 1.0 - 2.0 / (exp(2.0 * val.s5) + 1.0);\ - val.s6 = 1.0 - 2.0 / (exp(2.0 * val.s6) + 1.0);\ - val.s7 = 1.0 - 2.0 / (exp(2.0 * val.s7) + 1.0);\ - val.s0 = (val.s0 + (T)1.0) * (T)0.5;\ - val.s1 = (val.s1 + (T)1.0) * (T)0.5;\ - val.s2 = (val.s2 + (T)1.0) * (T)0.5;\ - val.s3 = (val.s3 + (T)1.0) * (T)0.5;\ - val.s4 = (val.s4 + (T)1.0) * (T)0.5;\ - val.s5 = (val.s5 + (T)1.0) * (T)0.5;\ - val.s6 = (val.s6 + (T)1.0) * (T)0.5;\ - val.s7 = (val.s7 + (T)1.0) * (T)0.5;\ - val.s0 = val.s0 * tmp.s0;\ - val.s1 = val.s1 * tmp.s1;\ - val.s2 = val.s2 * tmp.s2;\ - val.s3 = val.s3 * tmp.s3;\ - val.s4 = val.s4 * tmp.s4;\ - val.s5 = val.s5 * tmp.s5;\ - val.s6 = val.s6 * tmp.s6;\ - val.s7 = val.s7 * tmp.s7;\ -} -#elif defined(USE_TANH) -#define calCore(val) {\ - val.s0 = 1.0 - 2.0 / (exp(2.0 * val.s0) + 1.0);\ - val.s1 = 1.0 - 2.0 / (exp(2.0 * val.s1) + 1.0);\ - val.s2 = 1.0 - 2.0 / (exp(2.0 * val.s2) + 1.0);\ - val.s3 = 1.0 - 2.0 / (exp(2.0 * val.s3) + 1.0);\ - val.s4 = 1.0 - 2.0 / (exp(2.0 * val.s4) + 1.0);\ - val.s5 = 1.0 - 2.0 / (exp(2.0 * val.s5) + 1.0);\ - val.s6 = 1.0 - 2.0 / (exp(2.0 * val.s6) + 1.0);\ - val.s7 = 1.0 - 2.0 / (exp(2.0 * val.s7) + 1.0);\ -} -#elif defined(USE_SIGMOID) -#define calCore(val) {\ - val.s0 = 1.0 / (1.0 + exp(-1.0 * val.s0));\ - val.s1 = 1.0 / (1.0 + exp(-1.0 * val.s1));\ - val.s2 = 1.0 / (1.0 + exp(-1.0 * val.s2));\ - val.s3 = 1.0 / (1.0 + exp(-1.0 * val.s3));\ - val.s4 = 1.0 / (1.0 + exp(-1.0 * val.s4));\ - val.s5 = 1.0 / (1.0 + exp(-1.0 * val.s5));\ - val.s6 = 1.0 / (1.0 + exp(-1.0 * val.s6));\ - val.s7 = 1.0 / (1.0 + exp(-1.0 * val.s7));\ -} -#endif -#endif - -__kernel void MANGLE_NAME(activation_, AC, H)(const int h, const int w, const int cd4, const int ce4, const int ih_str, const int iw_str, const int ih_off, const int iw_off, const int oh_str, const int ow_str, const int oh_off, const int ow_off, __global T* input, __global T* output) { - int idx = get_global_id(0); - int idy = get_global_id(1); - int idz = get_global_id(2); - if(idx >= h || idy >= w) return; - - T4 val; - int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; - val = vload4(in_off, input); - calCore(val); -#if defined(USE_TANH) || defined(USE_SIGMOID) || defined(USE_HSIGMOID) || defined(USE_GELU) - if(idz == cd4 - 1) { - if(ce4 < 2) val.y = 0; - if(ce4 < 3) val.z = 0; - if(ce4 < 4) val.w = 0; - } -#endif - int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; - vstore4(val, out_off, output); -} diff --git a/tensor_computing/src/gpu/mali/cl/bilateral_slice_apply_c12.cl b/tensor_computing/src/gpu/mali/cl/bilateral_slice_apply_c12.cl deleted file mode 100644 index b43bf2a9..00000000 --- a/tensor_computing/src/gpu/mali/cl/bilateral_slice_apply_c12.cl +++ /dev/null @@ -1,164 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - - -#if defined(USE_HALF) -#define READ_IMAGE(image, sampler, coord) read_imageh(image, sampler, coord) -#define WRITE_IMAGE(image, coord, data) write_imageh(image, coord, data) -#else -#define READ_IMAGE(image, sampler, coord) read_imagef(image, sampler, coord) -#define WRITE_IMAGE(image, coord, data) write_imagef(image, coord, data) -#endif -__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - -/*These parameters belong to the matrix mult/add and conv steps.*/ -/*They are extracted from the HDR model.*/ -/*They may change for a different model.*/ -#define guide_cal(v, g){\ - T3 tmp;\ - tmp.x = v.x * (T)0.900616 - v.y * (T)0.1006 - v.z * (T)0.058384 + (T)0.072721;\ - tmp.y =-v.x * (T)0.079311 + v.y * (T)0.91976 - v.z * (T)0.037624 + (T)0.124359;\ - tmp.z =-v.x * (T)0.068347 - v.y * (T)0.069032 + v.z * (T)0.975032 + (T)0.129721;\ - tmp.x = (tmp.x < 0) ? 0 : tmp.x;\ - tmp.y = (tmp.y < 0) ? 0 : tmp.y;\ - tmp.z = (tmp.z < 0) ? 0 : tmp.z;\ - tmp.x = tmp.x * (T)0.003211 * 16;\ - tmp.y = tmp.y * (T)0.007948 * 16;\ - tmp.z = tmp.z * (T)0.046259 * 16;\ - g = tmp.x * (T)0.249512 + tmp.y * (T)0.274577 + tmp.z * (T)0.324276 + (T)0.078941;\ -} - -#if defined(CONV) -#if defined(UCHAR) -__kernel void bilateral_slice_apply_c12_conv_uchar -#else -__kernel void bilateral_slice_apply_c12_conv -#endif -#else -#if defined(UCHAR) -__kernel void bilateral_slice_apply_c12_uchar -#else -__kernel void bilateral_slice_apply_c12 -#endif -#endif - (const int w, const int wh, const int gc, const int gw, const int gh, const int gcw, const int gd, const int coe, const int bx, const int by, - const float scale_x, const float scale_y, global const T* guide, global const T* grid, -#if defined (UCHAR) - global const uchar* input, - global uchar* out){ -#else - global const T* input, - global T* out){ -#endif - - const int x = get_global_id(0); - const int y = get_global_id(1); - if(x >= bx || y >= by) return; - int in_off = y * w + x; - T3 in_val; -#if defined (UCHAR) - uchar3 tmp = vload3(0, input + in_off * 3); - in_val.x = tmp.x / 256.0; - in_val.y = tmp.y / 256.0; - in_val.z = tmp.z / 256.0; -#else - in_val = vload3(0, input + in_off * 3); -#endif - - T gx = (x + (T)0.5) * (T)scale_x; - T gz; -#if defined (CONV) - guide_cal(in_val, gz); -#else - gz = guide[in_off]; -#endif - gz = gz * gd; - char fx = (char)floor(gx - (T)0.5); - char fz = (char)floor(gz - (T)0.5); - - char i = 0; - char k = 0; - char x_ = fx; - char z_ = fz; - if(fx < 0){x_ = 0; i = 1;} - if(fz < 0){z_ = 0; k = 1;} - if(fx == gw - 1) i = 1; - if(fz == gd - 1) k = 1; - - T8 g_val[3]; - T4 p; - T4 sum[3]; - T2 wx, wz; - sum[0] = (T4)0; - sum[1] = (T4)0; - sum[2] = (T4)0; - - wx.s0 = (T)1 - fabs(fx + (T)0.5 - gx); - wx.s1 = (T)1 - fabs(fx + (T)1.5 - gx); - wz.s0 = (T)1 - fabs(fz + (T)0.5 - gz); - wz.s1 = (T)1 - fabs(fz + (T)1.5 - gz); - - if(wx.s0 < 0) wx.s0 = 0; - if(wx.s1 < 0) wx.s1 = 0; - if(wz.s0 < 0) wz.s0 = 0; - if(wz.s1 < 0) wz.s1 = 0; - - p.xy = wx.s0 * wz; - p.zw = wx.s1 * wz; - - int grid_off = y * gcw + x_ * gc + z_ * coe; -
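// bilinear blend in x and z only (y was pre-interpolated by bilateral_slice_apply_pre): gather the coe = 12 affine coefficients (3 outputs x 4) of the neighbouring grid cells, weighted by p; flags i/k fold clamped edge cells into p -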
g_val[0] = vload8(0, grid + grid_off); - g_val[1] = vload8(0, grid + grid_off + 8); - p.x = p.x + (T)k * p.y + (T)i * (p.z + (T)k * p.w); - sum[0] += g_val[0].s0123 * p.x; - sum[1] += g_val[0].s4567 * p.x; - sum[2] += g_val[1].s0123 * p.x; - if(k == 0) { - p.y = p.y + (T)i * p.w; - g_val[2] = vload8(0, grid + grid_off + 16); - sum[0] += g_val[1].s4567 * p.y; - sum[1] += g_val[2].s0123 * p.y; - sum[2] += g_val[2].s4567 * p.y; - } - - if(i == 0){ - grid_off += gc; - p.z = p.z + (T)k * p.w; - g_val[0] = vload8(0, grid + grid_off); - g_val[1] = vload8(0, grid + grid_off + 8); - sum[0] += g_val[0].s0123 * p.z; - sum[1] += g_val[0].s4567 * p.z; - sum[2] += g_val[1].s0123 * p.z; - if(k == 0){ - g_val[2] = vload8(0, grid + grid_off + 16); - sum[0] += g_val[1].s4567 * p.w; - sum[1] += g_val[2].s0123 * p.w; - sum[2] += g_val[2].s4567 * p.w; - } - } - - sum[0].x = sum[0].x * in_val.x + sum[0].y * in_val.y + sum[0].z * in_val.z + sum[0].w; - sum[1].x = sum[1].x * in_val.x + sum[1].y * in_val.y + sum[1].z * in_val.z + sum[1].w; - sum[2].x = sum[2].x * in_val.x + sum[2].y * in_val.y + sum[2].z * in_val.z + sum[2].w; -#if defined (UCHAR) - tmp.x = (uchar)(sum[0].x * 256.0); - tmp.y = (uchar)(sum[1].x * 256.0); - tmp.z = (uchar)(sum[2].x * 256.0); - vstore3(tmp, 0, out + in_off * 3); -#else - vstore3((T3)(sum[0].x, sum[1].x, sum[2].x), 0, out + in_off * 3); -#endif -} diff --git a/tensor_computing/src/gpu/mali/cl/bilateral_slice_apply_pre.cl b/tensor_computing/src/gpu/mali/cl/bilateral_slice_apply_pre.cl deleted file mode 100644 index 89d37654..00000000 --- a/tensor_computing/src/gpu/mali/cl/bilateral_slice_apply_pre.cl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - - - -__kernel void bilateral_slice_apply_pre(const int gh, const int gc, const int gcw, const int bx, const int bw, const float scale_y, global const T* grid, global T* gridTran){ - const int idx = get_global_id(0);//dep * coe / 4 - const int idw = get_global_id(1);//gw - const int idh = get_global_id(2);//H - if(idx >= bx || idw >= bw) return; - char j = 1; - - T2 wy; - T gy = (idh + (T)0.5) * (T)scale_y; - char fy = floor(gy - (T)0.5); - char y_ = fy; - if(fy < 0) {y_ = 0; j = 0;} - if(fy == gh - 1) j = 0; - wy.x = (T)1 - fabs(fy + (T)0.5 - gy); - wy.y = (T)1 - fabs(fy + (T)1.5 - gy); - - int grid_off = y_ * gcw + idw * gc + (idx << 2); - T4 val0; - T4 val1; - T4 res; - val0 = vload4(0, grid + grid_off); - val1 = (j == 0) ? 
val0 : vload4(0, grid + grid_off + gcw); - res = wy.x * val0 + wy.y * val1; - - int gridTran_off = idh * gcw + idw * gc + (idx << 2); - vstore4(res, 0, gridTran + gridTran_off); -} diff --git a/tensor_computing/src/gpu/mali/cl/concat.cl b/tensor_computing/src/gpu/mali/cl/concat.cl deleted file mode 100644 index c5ba578d..00000000 --- a/tensor_computing/src/gpu/mali/cl/concat.cl +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - - - -#define MANGLE_NAME_IMPL(base, A, N) base ## A ## N -#define MANGLE_NAME(base, A, N) MANGLE_NAME_IMPL(base, A, N) - -__kernel void MANGLE_NAME(concat_, A, N)(const int ih_str, const int iw_str, const int ih_off, const int iw_off, - const int oh_str, const int ow_str, const int oh_off, const int ow_off, const int cmax, const int nmax, const int out_size, const int bx, const int by, - __global const T* in0, -#if (N > 1) - const int c0, - __global const T* in1, -#endif -#if (N > 2) - const int c1, - __global const T* in2, -#endif -#if (N > 3) - const int c2, - __global const T* in3, -#endif -#if (N > 4) - const int c3, - __global const T* in4, -#endif -#if (N > 5 ) - const int c4, - __global const T* in5, -#endif -#if (N > 6) - const int c5, - __global const T* in6, -#endif -#if (N > 7) - const int c6, - __global const T* in7, -#endif - __global T* out) { - const int idx = get_global_id(0); - const int idy = get_global_id(1); - const int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; - int idc = idz - cmax; - int idn = nmax; - int out_c = cmax; -#if (N > 7) - if(idc < 0) {idc += c6; idn = 6; out_c -= c6;} -#endif -#if (N > 6) - if(idc < 0) {idc += c5; idn = 5; out_c -= c5;} -#endif -#if (N > 5) - if(idc < 0) {idc += c4; idn = 4; out_c -= c4;} -#endif -#if (N > 4) - if(idc < 0) {idc += c3; idn = 3; out_c -= c3;} -#endif -#if (N > 3) - if(idc < 0) {idc += c2; idn = 2; out_c -= c2;} -#endif -#if (N > 2) - if(idc < 0) {idc += c1; idn = 1; out_c -= c1;} -#endif -#if (N > 1) - if(idc < 0) {idc += c0; idn = 0; out_c -= c0;} -#endif - T4 val; - int in_off = (idc * iw_str + idy + iw_off) * ih_str + idx + ih_off; - if(idn == 0) val = vload4(in_off, in0); -#if (N > 1) - if(idn == 1) val = vload4(in_off, in1); -#endif -#if (N > 2) - if(idn == 2) val = vload4(in_off, in2); -#endif -#if (N > 3) - if(idn == 3) val = vload4(in_off, in3); -#endif -#if (N > 4) - if(idn == 4) val = vload4(in_off, in4); -#endif -#if (N > 5) - if(idn == 5) val = vload4(in_off, in5); 
-#endif -#if (N > 6) - if(idn == 6) val = vload4(in_off, in6); -#endif -#if (N > 7) - if(idn == 7) val = vload4(in_off, in7); -#endif - int out_off = ((out_c + idc) * ow_str + idy + ow_off) * oh_str + idx + oh_off; - vstore4(val, out_off, out + out_size); -} - diff --git a/tensor_computing/src/gpu/mali/cl/conv_depthwise_s1.cl b/tensor_computing/src/gpu/mali/cl/conv_depthwise_s1.cl deleted file mode 100644 index b42c4ce5..00000000 --- a/tensor_computing/src/gpu/mali/cl/conv_depthwise_s1.cl +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - -#include"kernel_def.h" -#define MANGLE_NAME_IMPL(base, F, ON) base ## F ## ON -#define MANGLE_NAME(base, F, ON) MANGLE_NAME_IMPL(base, F, ON) - - - -#if defined(USE_NCWH) -#if defined(USE_RELU) -__kernel void MANGLE_NAME(conv_depthwise_s1_relu_ncwh_, F, ON) -#else -__kernel void MANGLE_NAME(conv_depthwise_s1_ncwh_, F, ON) -#endif -#else -#if defined(USE_RELU) -__kernel void MANGLE_NAME(conv_depthwise_s1_relu_, F, ON) -#else -__kernel void MANGLE_NAME(conv_depthwise_s1_, F, ON) -#endif -#endif -(const int ih_str, const int ihw_str, const int ic_str, const int ih_off, const int iw_off, const int oh_str, const int ow_str, const int ohw_str, const int oh_off, const int ow_off, const int ow, const int bx , const int by, - __global const T* in, __global const T* flt, __read_only image1d_t bias, __global T* out) -{ - const int idx = get_global_id(0); - const int idy = get_global_id(1); - const int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; - - T4 in_val[IN]; - T4 flt_val; - T4 out_val[ON]; - - LOADBIAS_IMAGE_ARRAY_V4(out_val, idz, bias); - int in_off = idz * ihw_str + (idy * ON + iw_off) * ih_str + idx + ih_off; - int flt_off = idz * Fsq; - - for(uchar i = 0; i < F; ++i) { - LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + i, ih_str, in); - for(uchar j = 0; j < F; ++j) { -#if defined(BASICE_REG) - in_val[LN] = vload4(in_off + i + (LN + j) * ih_str, in); -#endif - flt_val = vload4(flt_off + j, flt); - DEPTHWISE_CAL_CORE_S1(in_val, flt_val, out_val); - UPDATE_REG(in_val); - } - flt_off += F; - } -#if defined(USE_NCWH) - int out_off = (idz << 2) * ohw_str + (idy * ON + ow_off) * oh_str + idx + oh_off; - STORE_OUTPUT_BUF_ARRAY_V4_NCWH(out_val, out_off, oh_str, ohw_str, idy * ON, ow, out); -#else - int out_off = (idz * ow_str + idy * ON + ow_off) * oh_str + idx + oh_off; - STORE_OUTPUT_BUF_ARRAY_V4(out_val, out_off, oh_str, idy * ON, ow, out); -#endif -} 
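Setting aside the NCWHC4 vector layout and the ON-wide register blocking, the stride-1 kernel above and the stride-2 kernel below both compute an ordinary depthwise convolution: one F x F filter plus one bias per channel. A minimal scalar C++ sketch of that contract, assuming an unpadded NCHW layout with N = 1 (the function name and signature here are illustrative, not part of the deleted sources):

#include <cstddef>
#include <vector>

// Scalar reference for the depthwise kernels: each channel is convolved
// with its own F x F filter. stride = 1 corresponds to conv_depthwise_s1,
// stride = 2 to conv_depthwise_s2.
static void depthwise_conv_reference(const std::vector<float> &in,   // C*H*W, NCHW
                                     const std::vector<float> &flt,  // C*F*F
                                     const std::vector<float> &bias, // C
                                     std::vector<float> &out,        // C*OH*OW
                                     size_t C, size_t H, size_t W, size_t F, size_t stride)
{
    const size_t OH = (H - F) / stride + 1;
    const size_t OW = (W - F) / stride + 1;
    for (size_t c = 0; c < C; ++c) {
        for (size_t oh = 0; oh < OH; ++oh) {
            for (size_t ow = 0; ow < OW; ++ow) {
                float acc = bias[c]; // the GPU kernels load this via LOADBIAS_IMAGE_ARRAY_V4
                for (size_t fh = 0; fh < F; ++fh) {
                    for (size_t fw = 0; fw < F; ++fw) {
                        acc += in[(c * H + oh * stride + fh) * W + (ow * stride + fw)] *
                               flt[(c * F + fh) * F + fw];
                    }
                }
                out[(c * OH + oh) * OW + ow] = acc;
            }
        }
    }
}

The GPU versions trade this simplicity for data reuse: each work-item produces ON adjacent outputs and shifts the in_val register window with UPDATE_REG so rows loaded once serve all of them, and the s2 variant additionally splits even and odd filter columns so the stride-2 walk can still reuse that window.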
diff --git a/tensor_computing/src/gpu/mali/cl/conv_depthwise_s2.cl b/tensor_computing/src/gpu/mali/cl/conv_depthwise_s2.cl deleted file mode 100644 index e5f368be..00000000 --- a/tensor_computing/src/gpu/mali/cl/conv_depthwise_s2.cl +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - -#include "kernel_def.h" -#define MANGLE_NAME_IMPL(base, F, ON) base ## F ## ON -#define MANGLE_NAME(base, F, ON) MANGLE_NAME_IMPL(base, F, ON) - - -#if defined(USE_NCWH) -#if defined(USE_RELU) -__kernel void MANGLE_NAME(conv_depthwise_s2_relu_ncwh_, F, ON) -#else -__kernel void MANGLE_NAME(conv_depthwise_s2_ncwh_, F, ON) -#endif -#else -#if defined(USE_RELU) -__kernel void MANGLE_NAME(conv_depthwise_s2_relu_, F, ON) -#else -__kernel void MANGLE_NAME(conv_depthwise_s2_, F, ON) -#endif -#endif -(const int ih_str, const int ihw_str, const int ic_str, const int ih_off, const int iw_off, const int oh_str, const int ow_str, const int ohw_str, const int oh_off, const int ow_off, const int ow, const int bx, const int by, - __global const T* in, __global const T* flt, __read_only image1d_t bias, __global T* out) -{ - const int idx = get_global_id(0); - const int idy = get_global_id(1); - const int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; - - T4 in_val[IN]; - T4 flt_val; - T4 out_val[ON]; - - LOADBIAS_IMAGE_ARRAY_V4(out_val, idz, bias); - int in_off = idz * ihw_str + ((idy << 1) * ON + iw_off) * ih_str + (idx << 1) + ih_off; - int flt_off = idz * Fsq; - for(uchar i = 0; i < F; ++i) { -#if defined(BASIC_REG) - LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + i, (ih_str << 1), in); - for(uchar j = 0; j < F; j += 2) { - flt_val = vload4(flt_off + j, flt); - in_val[LN] = vload4(in_off + i + ((LN << 1) + j) * ih_str , in); - DEPTHWISE_CAL_CORE_S1(in_val, flt_val, out_val); - UPDATE_REG(in_val); - } - LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + i + ih_str, (ih_str << 1), in); - for(uchar j = 1; j < F; j += 2) { - flt_val = vload4(flt_off + j, flt); - in_val[LN] = vload4(in_off + i + ((LN << 1) + j) * ih_str , in); - DEPTHWISE_CAL_CORE_S1(in_val, flt_val, out_val) - UPDATE_REG(in_val); - } -#else - LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + i, ih_str, in); - for(uchar j = 0; j < F; ++j) { - flt_val = vload4(flt_off + j, flt); - DEPTHWISE_CAL_CORE_S2(in_val, flt_val, out_val); - UPDATE_REG(in_val); - } -#endif - flt_off += F; - } -#if defined(USE_NCWH) - int out_off = (idz << 2) * ohw_str + (idy * ON + ow_off) * oh_str + 
idx + oh_off; - STORE_OUTPUT_BUF_ARRAY_V4_NCWH(out_val, out_off, oh_str, ohw_str, idy * ON, ow, out); -#else - int out_off = (idz * ow_str + idy * ON + ow_off) * oh_str + idx + oh_off; - STORE_OUTPUT_BUF_ARRAY_V4(out_val, out_off, oh_str, idy * ON, ow, out); -#endif -} diff --git a/tensor_computing/src/gpu/mali/cl/conv_depthwise_trans_fltbuf.cl b/tensor_computing/src/gpu/mali/cl/conv_depthwise_trans_fltbuf.cl deleted file mode 100644 index 7eac54fe..00000000 --- a/tensor_computing/src/gpu/mali/cl/conv_depthwise_trans_fltbuf.cl +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - - -#define MANGLE_NAME_IMPL(base, K) base ## K -#define MANGLE_NAME(base, K) MANGLE_NAME_IMPL(base, K) -#if(K == 4) -#define loadFltval(off, str, flt, val){\ - val.x = flt[off];\ - val.y = flt[off + str];\ - val.z = flt[off +(str << 1)];\ - val.w = flt[off + str * 3];\ -} - -#define loadFltvalEdge(off, str, flt, val, edge){\ - val.x = flt[off];\ - if(edge > 1) val.y = flt[off + str];\ - if(edge > 2) val.z = flt[off + (str << 1)];\ -} -#endif - -#if(K == 8) -#define loadFltval(off, str, flt, val){\ - val.s0 = flt[off];\ - val.s1 = flt[off + str];\ - val.s2 = flt[off +(str << 1)];\ - val.s3 = flt[off + str * 3];\ - val.s4 = flt[off +(str << 2)];\ - val.s5 = flt[off + str * 5];\ - val.s6 = flt[off + str * 6];\ - val.s7 = flt[off + str * 7];\ -} -#define loadFltvalEdge(off, str, flt, val, edge){\ - val.s0 = flt[off];\ - if(edge > 1) val.s1 = flt[off + str];\ - if(edge > 2) val.s2 = flt[off +(str << 1)];\ - if(edge > 3) val.s3 = flt[off + str * 3];\ - if(edge > 4) val.s4 = flt[off +(str << 2)];\ - if(edge > 5) val.s5 = flt[off + str * 5];\ - if(edge > 6) val.s6 = flt[off + str * 6];\ -} -#endif - -__kernel void MANGLE_NAME(conv_depthwise_trans_fltbuf_, K)(const int fwh, const int fn, __global const T* fltdata, __global T* fltbuf){ - const int idx = get_global_id(0); - const int idy = get_global_id(1); - const int flt_off = idy * K * fwh + idx; - int ek = ((idy + 1) * K <= fn) ?
K : (fn % K); -#if (K == 4) - T4 val = 0; -#elif (K == 8) - T8 val = 0; -#endif - if(ek == K){ - loadFltval(flt_off, fwh, fltdata, val); - } else { - loadFltvalEdge(flt_off, fwh, fltdata, val, ek); - } - const int out_off = idy * fwh + idx; -#if (K == 4) - vstore4(val, out_off, fltbuf); -#elif (K == 8) - vstore8(val, out_off, fltbuf); -#endif -} diff --git a/tensor_computing/src/gpu/mali/cl/conv_direct_s1.cl b/tensor_computing/src/gpu/mali/cl/conv_direct_s1.cl deleted file mode 100644 index d830f8eb..00000000 --- a/tensor_computing/src/gpu/mali/cl/conv_direct_s1.cl +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - - -#include "kernel_def.h" -#define MANGLE_NAME_IMPL(base, F, ON, KN) base ## F ## ON ## KN -#define MANGLE_NAME(base, F, ON, KN) MANGLE_NAME_IMPL(base, F, ON, KN) - - - -#if defined(USE_RELU) -__kernel void MANGLE_NAME(conv_direct_s1_relu_, F, ON, KN) -#else -__kernel void MANGLE_NAME(conv_direct_s1_, F, ON, KN) -#endif -(const int ih_str, const int ihw_str, const int ic_str, const int ih_off, const int iw_off, const int oh_str, const int ohw_str, const int oh_off, const int ow_off, -const int ow, const int bx, const int by, __global const T* in, __global const T* flt, __read_only image1d_t bias, __global T* out) -{ - - const int idx = get_global_id(0); - const int idy = get_global_id(1); - const int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; - - T4 in_val[IN]; - T16 flt_val; - T4 out_val[KN][ON]; - LOADBIAS_IMAGE_ARRAY_V4(out_val[0], idz * KN, bias); -#if(KN > 1) - LOADBIAS_IMAGE_ARRAY_V4(out_val[1], idz * KN + 1, bias); -#endif -#if(KN > 2) - LOADBIAS_IMAGE_ARRAY_V4(out_val[2], idz * KN + 2, bias); - LOADBIAS_IMAGE_ARRAY_V4(out_val[3], idz * KN + 3, bias); -#endif - - int in_off = (idy * ON + iw_off) * ih_str + idx + ih_off; - int flt_off = idz * ic_str * Fsq * KN; - - for(int i = 0; i < ic_str; ++i) { -#if(F == 1) - LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off, ih_str, in); - flt_val = vload16(flt_off, flt); - DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); -#if(KN > 1) - flt_val = vload16(flt_off + 1, flt); - DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); -#endif -#if(KN > 2) - flt_val = vload16(flt_off + 2, flt); - DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]); - flt_val = vload16(flt_off + 3, flt); - DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[3]); -#endif - flt_off += KN; -#else - for(uchar j = 0; j < F; ++j) { - 
LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + j, ih_str, in); - for(uchar k = 0; k < F; ++k) { -#if defined(BASICE_REG) - in_val[LN] = vload4(in_off + j + (LN + k) * ih_str, in); -#endif - flt_val = vload16(flt_off + k * KN, flt); - DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); -#if(KN > 1) - flt_val = vload16(flt_off + k * KN + 1, flt); - DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); -#endif -#if(KN > 2) - flt_val = vload16(flt_off + k * KN + 2, flt); - DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]); - flt_val = vload16(flt_off + k * KN + 3, flt); - DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[3]); -#endif - UPDATE_REG(in_val); - } - flt_off += F * KN; - } -#endif - in_off += ihw_str; - } - - int out_off = idz * KN * ohw_str + (idy * ON + ow_off) * oh_str + idx + oh_off; - STORE_OUTPUT_BUF_ARRAY_V4(out_val[0], out_off, oh_str, idy * ON, ow, out); -#if(KN > 1) - out_off += ohw_str; - STORE_OUTPUT_BUF_ARRAY_V4(out_val[1], out_off, oh_str, idy * ON, ow, out); -#endif -#if(KN > 2) - out_off += ohw_str; - STORE_OUTPUT_BUF_ARRAY_V4(out_val[2], out_off, oh_str, idy * ON, ow, out); - out_off += ohw_str; - STORE_OUTPUT_BUF_ARRAY_V4(out_val[3], out_off, oh_str, idy * ON, ow, out); -#endif -} diff --git a/tensor_computing/src/gpu/mali/cl/conv_direct_s1_nchw_to_ncwhc4.cl b/tensor_computing/src/gpu/mali/cl/conv_direct_s1_nchw_to_ncwhc4.cl deleted file mode 100644 index 900d6718..00000000 --- a/tensor_computing/src/gpu/mali/cl/conv_direct_s1_nchw_to_ncwhc4.cl +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - - - - -#include "kernel_def.h" -#define MANGLE_NAME_IMPL(base, F, ON) base ## F ## ON -#define MANGLE_NAME(base, F, ON) MANGLE_NAME_IMPL(base, F, ON) - -#if(F == 1) -#define calCore(A, B, C) {\ - C[0] += A.s0 * B;\ - C[1] += A.s1 * B;\ - C[2] += A.s2 * B;\ - C[3] += A.s3 * B;\ - C[4] += A.s4 * B;\ - C[5] += A.s5 * B;\ - C[6] += A.s6 * B;\ - C[7] += A.s7 * B;\ -} -#elif (F == 3) -#define calCore(a0, a1, a2, a3, a4, a5, B, C) {\ - C[0] += a0 * B;\ - C[1] += a1 * B;\ - C[2] += a2 * B;\ - C[3] += a3 * B;\ - C[4] += a4 * B;\ - C[5] += a5 * B;\ -} -#define calCore0(A, B, C) calCore(A.s0, A.s1, A.s2, A.s3, A.s4, A.s5, B, C) -#define calCore1(A, B, C) calCore(A.s1, A.s2, A.s3, A.s4, A.s5, A.s6, B, C) -#define calCore2(A, B, C) calCore(A.s2, A.s3, A.s4, A.s5, A.s6, A.s7, B, C) -#elif (F == 5) -#define calCore(a0, a1, a2, a3, B, C) {\ - C[0] += a0 * B;\ - C[1] += a1 * B;\ - C[2] += a2 * B;\ - C[3] += a3 * B;\ -} -#define calCore0(A, B, C) calCore(A.s0, A.s1, A.s2, A.s3, B, C) -#define calCore1(A, B, C) calCore(A.s1, A.s2, A.s3, A.s4, B, C) -#define calCore2(A, B, C) calCore(A.s2, A.s3, A.s4, A.s5, B, C) -#define calCore3(A, B, C) calCore(A.s3, A.s4, A.s5, A.s6, B, C) -#define calCore4(A, B, C) calCore(A.s4, A.s5, A.s6, A.s7, B, C) -#endif - - -#if defined(USE_RELU) -__kernel void MANGLE_NAME(conv_direct_s1_nchw_to_ncwhc4_relu_, F, ON) -#else -__kernel void MANGLE_NAME(conv_direct_s1_nchw_to_ncwhc4_, F, ON) -#endif -(const int iw_str, const int iwh_str, const int ic_str, const int iw_off, const int ih_off, const int oh_str, const int ow_str, const int oh_off, const int ow_off, const int ow, const int bx, const int by, - __global const T* in, __global const T* flt, __read_only image1d_t bias, __global T* out) { - const int idx = get_global_id(0); - const int idy = get_global_id(1); - const int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; - - T8 in_val; - T4 flt_val; - T4 out_val[ON]; - - LOADBIAS_IMAGE_ARRAY_V4(out_val, idz, bias); - int in_off = (idy + ih_off) * iw_str + idx * ON + iw_off; - int flt_off = idz * ic_str * Fsq; - - for(int i = 0; i < ic_str; ++i) { -#if(F == 1) - flt_val = vload4(flt_off, flt); - in_val = vload8(0, in + in_off); - calCore(in_val, flt_val, out_val); - flt_off++; -#else - for(uchar j = 0; j < F; ++j) { - in_val = vload8(0, in + in_off + j * iw_str); - for(uchar k = 0; k < F; ++k) { - flt_val = vload4(flt_off + k, flt); - if(k == 0) calCore0(in_val, flt_val, out_val); - if(k == 1) calCore1(in_val, flt_val, out_val); - if(k == 2) calCore2(in_val, flt_val, out_val); -#if(F == 5) - if(k == 3) calCore3(in_val, flt_val, out_val); - if(k == 4) calCore4(in_val, flt_val, out_val); -#endif - } - flt_off += F; - } -#endif - in_off += iwh_str; - } - - int xn = idx * ON; - int out_off = (idz * ow_str + xn + ow_off) * oh_str + idy + oh_off; - STORE_OUTPUT_BUF_ARRAY_V4(out_val, out_off, oh_str, xn, ow, out); -} diff --git a/tensor_computing/src/gpu/mali/cl/conv_direct_s2_nchw_to_ncwhc4.cl b/tensor_computing/src/gpu/mali/cl/conv_direct_s2_nchw_to_ncwhc4.cl deleted file mode 100644 index 36f31ffb..00000000 --- a/tensor_computing/src/gpu/mali/cl/conv_direct_s2_nchw_to_ncwhc4.cl +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - - -#include "kernel_def.h" -#define MANGLE_NAME_IMPL(base, F, ON) base ## F ## ON -#define MANGLE_NAME(base, F, ON) MANGLE_NAME_IMPL(base, F, ON) - -#if(F == 1) -#define calCore(A, B, C) {\ - C[0] += A.s0 * B;\ - C[1] += A.s2 * B;\ - C[2] += A.s4 * B;\ - C[3] += A.s6 * B;\ - C[4] += A.s8 * B;\ - C[5] += A.sa * B;\ - C[6] += A.sc * B;\ - C[7] += A.se * B;\ -} -#elif (F == 3) -#define calCore(a0, a1, a2, a3, a4, a5, a6, B, C) {\ - C[0] += a0 * B;\ - C[1] += a1 * B;\ - C[2] += a2 * B;\ - C[3] += a3 * B;\ - C[4] += a4 * B;\ - C[5] += a5 * B;\ - C[6] += a6 * B;\ -} -#define calCore0(A, B, C) calCore(A.s0, A.s2, A.s4, A.s6, A.s8, A.sa, A.sc, B, C) -#define calCore1(A, B, C) calCore(A.s1, A.s3, A.s5, A.s7, A.s9, A.sb, A.sd, B, C) -#define calCore2(A, B, C) calCore(A.s2, A.s4, A.s6, A.s8, A.sa, A.sc, A.se, B, C) -#elif (F == 5) -#define calCore(a0, a1, a2, a3, a4, a5, B, C) {\ - C[0] += a0 * B;\ - C[1] += a1 * B;\ - C[2] += a2 * B;\ - C[3] += a3 * B;\ - C[4] += a4 * B;\ - C[5] += a5 * B;\ -} -#define calCore0(A, B, C) calCore(A.s0, A.s2, A.s4, A.s6, A.s8, A.sa, B, C) -#define calCore1(A, B, C) calCore(A.s1, A.s3, A.s5, A.s7, A.s9, A.sb, B, C) -#define calCore2(A, B, C) calCore(A.s2, A.s4, A.s6, A.s8, A.sa, A.sc, B, C) -#define calCore3(A, B, C) calCore(A.s3, A.s5, A.s7, A.s9, A.sb, A.sd, B, C) -#define calCore4(A, B, C) calCore(A.s4, A.s6, A.s8, A.sa, A.sc, A.se, B, C) -#endif - - -#if defined(USE_RELU) -__kernel void MANGLE_NAME(conv_direct_s2_nchw_to_ncwhc4_relu_, F, ON) -#else -__kernel void MANGLE_NAME(conv_direct_s2_nchw_to_ncwhc4_, F, ON) -#endif -(const int iw_str, const int iwh_str, const int ic_str, const int iw_off, const int ih_off, const int oh_str, const int ow_str, const int oh_off, const int ow_off, const int ow, const int bx, const int by, - __global const T* in, __global const T* flt, __read_only image1d_t bias, __global T* out) { - const int idx = get_global_id(0); - const int idy = get_global_id(1); - const int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; - - T16 in_val; - T4 flt_val; - T4 out_val[ON]; - - LOADBIAS_IMAGE_ARRAY_V4(out_val, idz, bias); - int in_off = ((idy << 1) + ih_off) * iw_str + (idx << 1) * ON + iw_off; - int flt_off = idz * ic_str * Fsq; - - for(int i = 0; i < ic_str; ++i) { -#if(F == 1) - flt_val = vload4(flt_off, flt); - in_val = vload16(0, in + in_off); - calCore(in_val, flt_val, out_val); - flt_off++; -#else - for(uchar j = 0; j < F; ++j) { - in_val = vload16(0, in + in_off + j * iw_str); - 
for(uchar k = 0; k < F; ++k) { - flt_val = vload4(flt_off + k, flt); - if(k == 0) calCore0(in_val, flt_val, out_val); - if(k == 1) calCore1(in_val, flt_val, out_val); - if(k == 2) calCore2(in_val, flt_val, out_val); -#if(F == 5) - if(k == 3) calCore3(in_val, flt_val, out_val); - if(k == 4) calCore4(in_val, flt_val, out_val); -#endif - } - flt_off += F; - } -#endif - in_off += iwh_str; - } - - int xn = idx * ON; - int out_off = (idz * ow_str + xn + ow_off) * oh_str + idy + oh_off; - STORE_OUTPUT_BUF_ARRAY_V4(out_val, out_off, oh_str, xn, ow, out); -} diff --git a/tensor_computing/src/gpu/mali/cl/conv_direct_spe_fwhs1.cl b/tensor_computing/src/gpu/mali/cl/conv_direct_spe_fwhs1.cl deleted file mode 100644 index 710bca4f..00000000 --- a/tensor_computing/src/gpu/mali/cl/conv_direct_spe_fwhs1.cl +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - - - - -#include "kernel_def.h" -#define MANGLE_NAME_IMPL(base, OC) base ## OC -#define MANGLE_NAME(base, OC) MANGLE_NAME_IMPL(base, OC) - -#if(OC == 1) - #define calCore(ov, i_off, f_off, in, flt) {\ - T iv = in[i_off];\ - T fv = flt[f_off];\ - ov += iv * fv;\ - } -#endif - -#if(OC == 2) - #define calCore(ov, i_off, f_off, in, flt) {\ - T2 iv = vload2(i_off, in);\ - T2 fv = vload2(f_off, flt);\ - ov += iv.x * fv.x + iv.y * fv.y;\ - } -#endif - -#if(OC == 3) - #define calCore(ov, i_off, f_off, in, flt) {\ - T3 iv = vload3(i_off, in);\ - T3 fv = vload3(f_off, flt);\ - ov += iv.x * fv.x + iv.y * fv.y + iv.z * fv.z;\ - } -#endif - -#if(OC == 4) - #define calCore(ov, i_off, f_off, in, flt) {\ - T4 iv = vload4(i_off, in);\ - T4 fv = vload4(f_off, flt);\ - DOT_A4B4C1(iv, fv, ov);\ - } -#endif - -#if(OC == 8) - #define calCore(ov, i_off, f_off, in, flt) {\ - T8 iv = vload8(i_off, in);\ - T8 fv = vload8(f_off, flt);\ - DOT_A4B4C1(iv.s0123, fv.s0123, ov);\ - DOT_A4B4C1(iv.s4567, fv.s4567, ov);\ - } -#endif - -#if(OC == 16) - #define calCore(ov, i_off, f_off, in, flt) {\ - T16 iv = vload16(i_off, in);\ - T16 fv = vload16(f_off, flt);\ - DOT_A4B4C1(iv.s0123, fv.s0123, ov);\ - DOT_A4B4C1(iv.s4567, fv.s4567, ov);\ - DOT_A4B4C1(iv.s89ab, fv.s89ab, ov);\ - DOT_A4B4C1(iv.scdef, fv.scdef, ov);\ - } -#endif - -#if defined(USE_RELU) -__kernel MANGLE_NAME(void conv_direct_spe_fwhs1_relu_, OC) -#else -__kernel MANGLE_NAME(void conv_direct_spe_fwhs1_, OC) -#endif -(const int ih_str, const int ihw_str, const int ic_str, const int ih_off, const int iw_off, const int oh_str, const int ow_str, const int oh_off, const int ow_off, - const int flt_str, const int bx, const int by, __global const T* in, __global const T* flt, __global const T* bias, __global T* out) -{ - - const int idx = get_global_id(0); - if(idx >= bx) return; - - T out_val = bias[idx]; - int in_off = iw_off * ih_str + ih_off; - int flt_off = idx; - for(int i = 0; i < ic_str; ++i) { - calCore(out_val, in_off, flt_off, in, flt); - in_off += ihw_str; - flt_off += flt_str; - } - - ACTIVATION_V1(out_val); - const int ox = idx >> 2; - const int oy = idx & 3; - int out_off = (ox * ow_str + ow_off) * oh_str + oh_off; - out[out_off * 4 + oy] = out_val; -} diff --git a/tensor_computing/src/gpu/mali/cl/conv_direct_trans_fltbuf.cl b/tensor_computing/src/gpu/mali/cl/conv_direct_trans_fltbuf.cl deleted file mode 100644 index 6b6b46eb..00000000 --- a/tensor_computing/src/gpu/mali/cl/conv_direct_trans_fltbuf.cl +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - - -#define MANGLE_NAME_IMPL(base, C, K) base ## C ## K -#define MANGLE_NAME(base, C, K) MANGLE_NAME_IMPL(base, C, K) -#if(C == 1) -#define loadFltval(off, str, flt, val){\ - val = flt[off];\ -} - -#define loadFltvalEdge(off, str, flt, val, edge){\ -} -#endif - -#if(C == 2) -#define loadFltval(off, str, flt, val){\ - val.x = flt[off];\ - val.y = flt[off + str];\ -} - -#define loadFltvalEdge(off, str, flt, val, edge){\ - val.x = flt[off];\ -} -#endif - -#if(C == 3) -#define loadFltval(off, str, flt, val){\ - val.x = flt[off];\ - val.y = flt[off + str];\ - val.z = flt[off + str * 2];\ -} - -#define loadFltvalEdge(off, str, flt, val, edge){\ - val.x = flt[off];\ - if(edge > 1) val.y = flt[off + str];\ -} -#endif - -#if(C == 4) -#define loadFltval(off, str, flt, val){\ - val.x = flt[off];\ - val.y = flt[off + str];\ - val.z = flt[off + str * 2];\ - val.w = flt[off + str * 3];\ -} - -#define loadFltvalEdge(off, str, flt, val, edge){\ - val.x = flt[off];\ - if(edge > 1) val.y = flt[off + str];\ - if(edge > 2) val.z = flt[off + str * 2];\ -} -#endif - -#if(C == 8) -#define loadFltval(off, str, flt, val){\ - val.s0 = flt[off];\ - val.s1 = flt[off + str];\ - val.s2 = flt[off + str * 2];\ - val.s3 = flt[off + str * 3];\ - val.s4 = flt[off + str * 4];\ - val.s5 = flt[off + str * 5];\ - val.s6 = flt[off + str * 6];\ - val.s7 = flt[off + str * 7];\ -} -#define loadFltvalEdge(off, str, flt, val, edge){\ - val.s0 = flt[off];\ - if(edge > 1) val.s1 = flt[off + str];\ - if(edge > 2) val.s2 = flt[off + str * 2];\ - if(edge > 3) val.s3 = flt[off + str * 3];\ - if(edge > 4) val.s4 = flt[off + str * 4];\ - if(edge > 5) val.s5 = flt[off + str * 5];\ - if(edge > 6) val.s6 = flt[off + str * 6];\ -} -#endif - -#if(C == 16) -#define loadFltval(off, str, flt, val){\ - val.s0 = flt[off];\ - val.s1 = flt[off + str];\ - val.s2 = flt[off + str * 2];\ - val.s3 = flt[off + str * 3];\ - val.s4 = flt[off + str * 4];\ - val.s5 = flt[off + str * 5];\ - val.s6 = flt[off + str * 6];\ - val.s7 = flt[off + str * 7];\ - val.s8 = flt[off + str * 8];\ - val.s9 = flt[off + str * 9];\ - val.sa = flt[off + str * 10];\ - val.sb = flt[off + str * 11];\ - val.sc = flt[off + str * 12];\ - val.sd = flt[off + str * 13];\ - val.se = flt[off + str * 14];\ - val.sf = flt[off + str * 15];\ -} -#define loadFltvalEdge(off, str, flt, val, edge){\ - val.s0 = flt[off];\ - if(edge > 1) val.s1 = flt[off + str];\ - if(edge > 2) val.s2 = flt[off + str * 2];\ - if(edge > 3) val.s3 = flt[off + str * 3];\ - if(edge > 4) val.s4 = flt[off + str * 4];\ - if(edge > 5) val.s5 = flt[off + str * 5];\ - if(edge > 6) val.s6 = flt[off + str * 6];\ - if(edge > 7) val.s7 = flt[off + str * 7];\ - if(edge > 8) val.s8 = flt[off + str * 8];\ - if(edge > 9) val.s9 = flt[off + str * 9];\ - if(edge > 10) val.sa = flt[off + str * 10];\ - if(edge > 11) val.sb = flt[off + str * 11];\ - if(edge > 12) val.sc = flt[off + str * 12];\ - if(edge > 13) val.sd = flt[off + str * 13];\ - if(edge > 14) val.se = flt[off + str * 14];\ -} -#endif - -__kernel void MANGLE_NAME(conv_direct_trans_fltbuf_, C, K)(const int fwh, const int fc, const int fn, __global const T* fltdata, __global T* fltbuf) { - const int idx = get_global_id(0); - const int idy = get_global_id(1); - const int idz = get_global_id(2); - short ec = ((idy + 
1) * C <= fc) ? C : (fc % C); - - const int flt_off = (idz * fc + idy * C) * fwh + idx; -#if (C == 1) - T val = 0; -#elif (C == 2) - T2 val = 0; -#elif (C == 3) - T3 val = 0; -#elif (C == 4) - T4 val = 0; -#elif (C == 8) - T8 val = 0; -#elif (C == 16) - T16 val = 0; -#endif - if(idz < fn){ - if(ec == C){ - loadFltval(flt_off, fwh, fltdata, val); - } else { - loadFltvalEdge(flt_off, fwh, fltdata, val, ec); - } - } - const int bc = (fc + C - 1) / C; - int out_off; -#if(K == 0) - out_off = (idy * fwh + idx) * fn + idz; -#else - out_off = (idz / K * bc + idy) * fwh * K + idx * K + (idz % K); -#endif -#if (C == 1) - fltbuf[out_off] = val; -#elif (C == 2) - vstore2(val, out_off, fltbuf); -#elif (C == 3) - vstore3(val, out_off, fltbuf); -#elif (C == 4) - vstore4(val, out_off, fltbuf); -#elif (C == 8) - vstore8(val, out_off, fltbuf); -#elif (C == 16) - vstore16(val, out_off, fltbuf); -#endif - -} diff --git a/tensor_computing/src/gpu/mali/cl/conv_wino_gemm36_tn.cl b/tensor_computing/src/gpu/mali/cl/conv_wino_gemm36_tn.cl deleted file mode 100644 index d1419203..00000000 --- a/tensor_computing/src/gpu/mali/cl/conv_wino_gemm36_tn.cl +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - - -#include"kernel_def.h" -#define MANGLE_NAME_LMPL(base, LM, LN) base ## LM ## LN -#define MANGLE_NAME(base, LM, LN) MANGLE_NAME_LMPL(base, LM, LN) - - -__kernel void MANGLE_NAME(conv_wino_gemm36_tn_, LM, LN) -(int M, int N, int K, int a_str, int b_str, int c_str, const int bx, const int by, __global const T* A, __global const T* B, global T* C) -{ - const int idx = get_global_id(0); - const int idy = get_global_id(1); - if(idx >= bx || idy >= by) return; - const int ix = idx * LN; - const int iy = idy * LM; - - T a[LM]; - T b[LN]; - T c[LM][LN]; - GEMM_SET_C_ZERO(c); - - int a_off = iy + a_str; - int b_off = ix + b_str; - for(int i = 0; i < K; i++) { - GEMM_LOAD_A(a, a_off, A); - GEMM_LOAD_B(b, b_off, B); - GEMM_CALCORE(a, b, c); - a_off += M; - b_off += N; - } - - int c_off = iy * N + ix + c_str; - GEMM_MUL_C((float)(0.1111111111), c); - GEMM_STORE_C(c, c_off, N, C); -} diff --git a/tensor_computing/src/gpu/mali/cl/conv_wino_trans_fltbuf_3x3.cl b/tensor_computing/src/gpu/mali/cl/conv_wino_trans_fltbuf_3x3.cl deleted file mode 100644 index b4d307d0..00000000 --- a/tensor_computing/src/gpu/mali/cl/conv_wino_trans_fltbuf_3x3.cl +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. 
All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#define loadG(val, str, off, flt) {\ - val[0] = flt[off];\ - val[1] = flt[off + str];\ - val[2] = flt[off + str * 2];\ -} - -#define setReg6(reg0, reg1) {\ - reg1[0] = reg0[0];\ - reg1[1] = reg0[1];\ - reg1[2] = reg0[2];\ - reg1[3] = reg0[3];\ - reg1[4] = reg0[4];\ - reg1[5] = reg0[5];\ -} - -#define addReg6(reg0, reg1) {\ - reg1[0] += reg0[0];\ - reg1[1] += reg0[1];\ - reg1[2] += reg0[2];\ - reg1[3] += reg0[3];\ - reg1[4] += reg0[4];\ - reg1[5] += reg0[5];\ -} - -#define minReg6(reg0, reg1) {\ - reg1[0] -= reg0[0];\ - reg1[1] -= reg0[1];\ - reg1[2] -= reg0[2];\ - reg1[3] -= reg0[3];\ - reg1[4] -= reg0[4];\ - reg1[5] -= reg0[5];\ -} - -#define mulReg6(s, reg0, reg1) {\ - reg1[0] = s * reg0[0];\ - reg1[1] = s * reg0[1];\ - reg1[2] = s * reg0[2];\ - reg1[3] = s * reg0[3];\ - reg1[4] = s * reg0[4];\ - reg1[5] = s * reg0[5];\ -} - -#define calCore(g, t) {\ - t[0] = (T)(0.75) * g[0];\ - t[1] = (g[0] + g[1] + g[2]) * (T)(-0.5);\ - t[2] = (g[0] - g[1] + g[2]) * (T)(-0.5);\ - t[3] = ((T)(0.125) * g[0] + (T)(0.25) * g[1] + (T)(0.5) * g[2]);\ - t[4] = ((T)(0.125) * g[0] - (T)(0.25) * g[1] + (T)(0.5) * g[2]);\ - t[5] = (T)(3.0) * g[2];\ -} - -#define storeReg6(reg, off, str, flt) {\ - flt[off] = reg[0];\ - flt[off + str] = reg[1];\ - flt[off + str * 2] = reg[2];\ - flt[off + str * 3] = reg[3];\ - flt[off + str * 4] = reg[4];\ - flt[off + str * 5] = reg[5];\ -} - - - -__kernel void conv_wino_trans_fltbuf_3x3(const int fn, const int fc, const int fnc, __global const T* fltbuf, __global T* flttran) { - const int idx = get_global_id(0); - const int idy = get_global_id(1); - const int in_off = idy * fn + idx; - - T g[3]; - T h0[6], h1[6], h2[6], h3[6], h4[6], h5[6], t[6], tmp[6]; - loadG(g, fnc, in_off, fltbuf); - calCore(g, tmp); - mulReg6((T)(0.75), tmp, h0); - mulReg6((T)(-0.5), tmp, t); - setReg6(t, h1); - setReg6(t, h2); - mulReg6((T)(0.125),tmp, t); - setReg6(t, h3); - setReg6(t, h4); - - loadG(g, fnc, in_off + 3 * fnc, fltbuf); - calCore(g, tmp); - mulReg6((T)(0.5), tmp, t); - minReg6(t, h1); - addReg6(t, h2); - mulReg6((T)(0.25), tmp, t); - addReg6(t, h3); - minReg6(t, h4); - - loadG(g, fnc, in_off + 6 * fnc, fltbuf); - calCore(g, tmp); - mulReg6((T)(0.5), tmp, t); - minReg6(t, h1); - minReg6(t, h2); - addReg6(t, h3); - addReg6(t, h4); - mulReg6((T)(3.0), tmp, h5); - - storeReg6(h0, in_off, fnc, flttran); - storeReg6(h1, in_off + 6 * fnc, fnc, flttran); - storeReg6(h2, in_off + 12 * fnc, fnc, flttran); - storeReg6(h3, in_off + 18 * 
fnc, fnc, flttran); - storeReg6(h4, in_off + 24 * fnc, fnc, flttran); - storeReg6(h5, in_off + 30 * fnc, fnc, flttran); -} diff --git a/tensor_computing/src/gpu/mali/cl/conv_wino_trans_outbuf.cl b/tensor_computing/src/gpu/mali/cl/conv_wino_trans_outbuf.cl deleted file mode 100644 index e4d8c635..00000000 --- a/tensor_computing/src/gpu/mali/cl/conv_wino_trans_outbuf.cl +++ /dev/null @@ -1,204 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include"kernel_def.h" -#define loadR(val, str, off, in) {\ - val[0] = in[off];\ - val[1] = in[off + str];\ - val[2] = in[off + str * 2];\ - val[3] = in[off + str * 3];\ - val[4] = in[off + str * 4];\ - val[5] = in[off + str * 5];\ -} - -#define calCore(s, t, tmp){\ - t.x = s[1] + s[2];\ - t.y = s[3] + s[4];\ - t.z = s[1] - s[2];\ - t.w = s[3] - s[4];\ - tmp[0] = s[0] + t.x + t.y;\ - tmp[1] = t.z + (T)(2.0) * t.w;\ - tmp[2] = t.x + (T)(4.0) * t.y;\ - tmp[3] = t.z + (T)(8.0) * t.w + s[5];\ -} - - -#if defined(ALIGN) -#if defined(USE_RELU) -__kernel void conv_wino_trans_outbuf_relu_align -#else -__kernel void conv_wino_trans_outbuf_align -#endif -#else -#if defined(USE_RELU) -__kernel void conv_wino_trans_outbuf_relu -#else -__kernel void conv_wino_trans_outbuf -#endif -#endif -(const int wino_h, const int wino_w, const int pw_str, const int pwh_str, const int oh_str, const int ow_str, const int oh_off, const int ow_off, - const int oh, const int ow, __read_only image1d_t bias, __global const T* in, __global T* out) { - const int idx = get_global_id(0); - const int idy = get_global_id(1); - const int idz = get_global_id(2); - if(idx >= wino_h || idy >= wino_w) return; - - T4 r0, r1, r2, r3; - T4 r4, r5, r6, r7; - T4 r8, r9, ra, rb; - T4 rc, rd, re, rf; - T4 bias_v4 = READ_IMAGE(bias, sampler, idz); - - int in_off = (idz << 2) * pw_str + idy * wino_h + idx; - for(uchar ii = 0; ii < 4; ii++) { - r0 = r4; - r1 = r5; - r2 = r6; - r3 = r7; - - r4 = r8; - r5 = r9; - r6 = ra; - r7 = rb; - - r8 = rc; - r9 = rd; - ra = re; - rb = rf; - - T s[6]; - T4 t; - T bias_val; - if(ii == 0) bias_val = bias_v4.x; - if(ii == 1) bias_val = bias_v4.y; - if(ii == 2) bias_val = bias_v4.z; - if(ii == 3) bias_val = bias_v4.w; - - rd = (T4)bias_val; - re = (T4)bias_val; - for(uchar i = 0; i < 2; ++i) { - rc.x = rf.x; - rc.y = rf.y; - rc.z = rf.z; - rc.w = rf.w; - loadR(s, pwh_str, in_off + i * 30 * pwh_str, in); - for(uchar j = 0; j < 4; ++j) { - rf.x = rf.y; - rf.y = rf.z; - rf.z = rf.w; - rf.w = bias_val; - 
if(j == 0) rf.w += s[0] + s[1] + s[2] + s[3] + s[4]; - if(j == 1) rf.w += s[1] - s[2] + (T)2 * (s[3] - s[4]); - if(j == 2) rf.w += s[1] + s[2] + (T)4 * (s[3] + s[4]); - if(j == 3) rf.w += s[1] - s[2] + (T)8 * (s[3] - s[4]) + s[5]; - } - } - - for(uchar i = 0; i < 4; ++ i) { - loadR(s, pwh_str, in_off + (i + 1) * 6 * pwh_str, in); - for(uchar j = 0; j < 4; ++j) { - t.x = t.y; - t.y = t.z; - t.z = t.w; - if(j == 0) t.w = s[0] + s[1] + s[2] + s[3] + s[4]; - if(j == 1) t.w = s[1] - s[2] + (T)2 * (s[3] - s[4]); - if(j == 2) t.w = s[1] + s[2] + (T)4 * (s[3] + s[4]); - if(j == 3) t.w = s[1] - s[2] + (T)8 * (s[3] - s[4]) + s[5]; - } - if(i == 0) { - rc += t; - rd += t; - re += t; - rf += t; - } - if(i == 1) { - rc += t; - rd -= t; - re += t; - rf -= t; - } - if(i == 2) { - rc += t; - rd += (T)2 * t; - re += (T)4 * t; - rf += (T)8 * t; - } - if(i == 3) { - rc += t; - rd -= (T)2 * t; - re += (T)4 * t; - rf -= (T)8 * t; - } - - } - ACTIVATION_V4(rc); - ACTIVATION_V4(rd); - ACTIVATION_V4(re); - ACTIVATION_V4(rf); - in_off += pw_str; - } - - const int x_off = idx << 2; - const int y_off = idy << 2; - int out_off = (idz * ow_str + y_off + ow_off) * (oh_str << 2) + (x_off << 2) + (oh_off << 2); -#if defined(ALIGN) - vstore16((T16)(r0.x, r4.x, r8.x, rc.x, - r1.x, r5.x, r9.x, rd.x, - r2.x, r6.x, ra.x, re.x, - r3.x, r7.x, rb.x, rf.x), 0, out + out_off); - out_off += (oh_str << 2); - vstore16((T16)(r0.y, r4.y, r8.y, rc.y, - r1.y, r5.y, r9.y, rd.y, - r2.y, r6.y, ra.y, re.y, - r3.y, r7.y, rb.y, rf.y), 0, out + out_off); - out_off += (oh_str << 2); - vstore16((T16)(r0.z, r4.z, r8.z, rc.z, - r1.z, r5.z, r9.z, rd.z, - r2.z, r6.z, ra.z, re.z, - r3.z, r7.z, rb.z, rf.z), 0, out + out_off); - out_off += (oh_str << 2); - vstore16((T16)(r0.w, r4.w, r8.w, rc.w, - r1.w, r5.w, r9.w, rd.w, - r2.w, r6.w, ra.w, re.w, - r3.w, r7.w, rb.w, rf.w), 0, out + out_off); -#else - vstore4((T4)(r0.x, r4.x, r8.x, rc.x), 0, out + out_off); - if(x_off + 1 < oh) vstore4((T4)(r1.x, r5.x, r9.x, rd.x), 0, out + out_off + 4); - if(x_off + 2 < oh) vstore4((T4)(r2.x, r6.x, ra.x, re.x), 0, out + out_off + 8); - if(x_off + 3 < oh) vstore4((T4)(r3.x, r7.x, rb.x, rf.x), 0, out + out_off + 12); - - if(y_off + 1 < ow) { - out_off += (oh_str << 2); - vstore4((T4)(r0.y, r4.y, r8.y, rc.y), 0, out + out_off); - if(x_off + 1 < oh) vstore4((T4)(r1.y, r5.y, r9.y, rd.y), 0, out + out_off + 4); - if(x_off + 2 < oh) vstore4((T4)(r2.y, r6.y, ra.y, re.y), 0, out + out_off + 8); - if(x_off + 3 < oh) vstore4((T4)(r3.y, r7.y, rb.y, rf.y), 0, out + out_off + 12); - } - - if(y_off + 2 < ow) { - out_off += (oh_str << 2); - vstore4((T4)(r0.z, r4.z, r8.z, rc.z), 0, out + out_off); - if(x_off + 1 < oh) vstore4((T4)(r1.z, r5.z, r9.z, rd.z), 0, out + out_off + 4); - if(x_off + 2 < oh) vstore4((T4)(r2.z, r6.z, ra.z, re.z), 0, out + out_off + 8); - if(x_off + 3 < oh) vstore4((T4)(r3.z, r7.z, rb.z, rf.z), 0, out + out_off + 12); - } - - if(y_off + 3 < ow) { - out_off += (oh_str << 2); - vstore4((T4)(r0.w, r4.w, r8.w, rc.w), 0, out + out_off); - if(x_off + 1 < oh) vstore4((T4)(r1.w, r5.w, r9.w, rd.w), 0, out + out_off + 4); - if(x_off + 2 < oh) vstore4((T4)(r2.w, r6.w, ra.w, re.w), 0, out + out_off + 8); - if(x_off + 3 < oh) vstore4((T4)(r3.w, r7.w, rb.w, rf.w), 0, out + out_off + 12); - } -#endif -} diff --git a/tensor_computing/src/gpu/mali/cl/conv_wino_trans_outbuf_right.cl b/tensor_computing/src/gpu/mali/cl/conv_wino_trans_outbuf_right.cl deleted file mode 100644 index 5fbfe37e..00000000 --- a/tensor_computing/src/gpu/mali/cl/conv_wino_trans_outbuf_right.cl +++ /dev/null @@ 
-1,65 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include"kernel_def.h" -#define loadR(val, str, off, in) {\ - val[0] = in[off];\ - val[1] = in[off + str];\ - val[2] = in[off + str * 2];\ - val[3] = in[off + str * 3];\ - val[4] = in[off + str * 4];\ - val[5] = in[off + str * 5];\ -} - -#define calCore(s, t, tmp){\ - t[0] = s[1] + s[2];\ - t[1] = s[3] + s[4];\ - t[2] = s[1] - s[2];\ - t[3] = s[3] - s[4];\ - tmp[0] = s[0] + t[0] + t[1];\ - tmp[1] = t[2] + (T)(2.0) * t[3];\ - tmp[2] = t[0] + (T)(4.0) * t[1];\ - tmp[3] = t[2] + (T)(8.0) * t[3] + s[5];\ -} - - -__kernel void conv_wino_trans_outbuf_right - (const int iw_str, const int iwh_str, const int wino_h, const int wino_w, const int wino_h6, const int wino_hw, __global const T* in, __global T* out) { - const int idx = get_global_id(0); - const int idy = get_global_id(1); - const int idz = get_global_id(2); - if(idx >= wino_hw) return; - - int in_off = idz * iwh_str * 6 + (idy << 2) * iw_str + idx; - T s[6]; - T4 res[4]; - for(int ii = 0; ii < 4; ++ii) { - loadR(s, iwh_str, in_off, in); - res[0] = res[1]; - res[1] = res[2]; - res[2] = res[3]; - res[3].x = s[0] + s[1] + s[2] + s[3] + s[4]; - res[3].y = s[1] - s[2] + (T)(2) * (s[3] - s[4]); - res[3].z = s[1] + s[2] + (T)(4) * (s[3] + s[4]); - res[3].w = s[1] - s[2] + (T)(8) * (s[3] - s[4]) + s[5]; - in_off += iw_str; - } - - const int idx_i = idx % wino_h; - const int idx_j = idx / wino_h; - const int out_off = (idy * 24 * wino_w + idx_j * 24 + idz) * wino_h + idx_i; - vstore4((T4)(res[0].x, res[1].x, res[2].x, res[3].x), out_off, out); - vstore4((T4)(res[0].y, res[1].y, res[2].y, res[3].y), out_off + wino_h6, out); - vstore4((T4)(res[0].z, res[1].z, res[2].z, res[3].z), out_off + wino_h6 * 2, out); - vstore4((T4)(res[0].w, res[1].w, res[2].w, res[3].w), out_off + wino_h6 * 3, out); -} diff --git a/tensor_computing/src/gpu/mali/cl/conv_wino_trans_picbuf.cl b/tensor_computing/src/gpu/mali/cl/conv_wino_trans_picbuf.cl deleted file mode 100644 index 8c4fb969..00000000 --- a/tensor_computing/src/gpu/mali/cl/conv_wino_trans_picbuf.cl +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#define loadH(val, off, pic) {\ - val[0] = pic[off];\ - val[1] = pic[off + 4];\ - val[2] = pic[off + 8];\ - val[3] = pic[off + 12];\ - val[4] = pic[off + 16];\ - val[5] = pic[off + 20];\ -} - -__kernel void conv_wino_trans_picbuf(const int ih_str4, const int iw_str, const int ih_off, const int iw_off, const int oh4, const int pw_str, const int pwh_str, - __global const T* in, __global T* pictran) { - const int id = get_global_id(0); - const int idhc = id % oh4; - const int idx = idhc >> 2; - const int idc = idhc & 3; - const int idy = id / oh4; - const int idz = get_global_id(1); - - const int in_off = (idz * iw_str + (idy << 2) + iw_off) * ih_str4 + (idx << 4) + idc + (ih_off << 2); - const int pictran_off = ((idz << 2) + idc) * pw_str + (id >> 2); - T tmp[16]; - T h0[6], h1[6], h2[6], h3[6], h4[6], h5[6]; - - loadH(h0, in_off, in); - loadH(h1, in_off + ih_str4, in); - loadH(h2, in_off + ih_str4 * 2, in); - loadH(h3, in_off + ih_str4 * 3, in); - loadH(h4, in_off + ih_str4 * 4, in); - loadH(h5, in_off + ih_str4 * 5, in); - - h1[0] = (T)(4.0) * h1[0] - (T)(5.0) * h1[2] + h1[4]; - h2[0] = (T)(4.0) * h2[0] - (T)(5.0) * h2[2] + h2[4]; - h3[0] = (T)(4.0) * h3[0] - (T)(5.0) * h3[2] + h3[4]; - h4[0] = (T)(4.0) * h4[0] - (T)(5.0) * h4[2] + h4[4]; - - tmp[0] = (T)(-4.0) * (h1[1] + h1[2]) + h1[3] + h1[4]; - tmp[1] = (T)(-4.0) * (h2[1] + h2[2]) + h2[3] + h2[4]; - tmp[2] = (T)(-4.0) * (h3[1] + h3[2]) + h3[3] + h3[4]; - tmp[3] = (T)(-4.0) * (h4[1] + h4[2]) + h4[3] + h4[4]; - - tmp[4] = (T)( 4.0) * (h1[1] - h1[2]) - h1[3] + h1[4]; - tmp[5] = (T)( 4.0) * (h2[1] - h2[2]) - h2[3] + h2[4]; - tmp[6] = (T)( 4.0) * (h3[1] - h3[2]) - h3[3] + h3[4]; - tmp[7] = (T)( 4.0) * (h4[1] - h4[2]) - h4[3] + h4[4]; - - tmp[8] = (T)(2.0) * (h1[3] - h1[1]) - h1[2] + h1[4]; - tmp[9] = (T)(2.0) * (h2[3] - h2[1]) - h2[2] + h2[4]; - tmp[10] = (T)(2.0) * (h3[3] - h3[1]) - h3[2] + h3[4]; - tmp[11] = (T)(2.0) * (h4[3] - h4[1]) - h4[2] + h4[4]; - - tmp[12] = (T)(2.0) * (h1[1] - h1[3]) - h1[2] + h1[4]; - tmp[13] = (T)(2.0) * (h2[1] - h2[3]) - h2[2] + h2[4]; - tmp[14] = (T)(2.0) * (h3[1] - h3[3]) - h3[2] + h3[4]; - tmp[15] = (T)(2.0) * (h4[1] - h4[3]) - h4[2] + h4[4]; - - h1[5] = (T)(4.0) * h1[1] - (T)(5.0) * h1[3] + h1[5]; - h2[5] = (T)(4.0) * h2[1] - (T)(5.0) * h2[3] + h2[5]; - h3[5] = (T)(4.0) * h3[1] - (T)(5.0) * h3[3] + h3[5]; - h4[5] = (T)(4.0) * h4[1] - (T)(5.0) * h4[3] + h4[5]; - - pictran[pictran_off] = (T)(16.0) * h0[0] - (T)(20.0) * h0[2] + (T)(4.0) * h0[4] - (T)(5.0) * h2[0] + 
h4[0]; - pictran[pictran_off + pwh_str] = (T)(-4.0) * (h1[0] + h2[0]) + h3[0] + h4[0]; - pictran[pictran_off + pwh_str * 2] = (T)( 4.0) * (h1[0] - h2[0]) - h3[0] + h4[0]; - pictran[pictran_off + pwh_str * 3] = (T)( 2.0) * (h3[0] - h1[0]) - h2[0] + h4[0]; - pictran[pictran_off + pwh_str * 4] = (T)( 2.0) * (h1[0] - h3[0]) - h2[0] + h4[0]; - pictran[pictran_off + pwh_str * 5] = (T)( 4.0) * (h1[0] + h5[0]) - (T)(5.0) * (h3[0] + h5[2]) + h5[4]; - - pictran[pictran_off + pwh_str * 6] = (T)(-16.0) * (h0[1] + h0[2]) + (T)(4.0) * (h0[3] + h0[4]) - (T)(5.0) * tmp[1] + tmp[3]; - pictran[pictran_off + pwh_str * 7] = (T)(-4.0) * (tmp[0] + tmp[1]) + tmp[2] + tmp[3]; - pictran[pictran_off + pwh_str * 8] = (T)( 4.0) * (tmp[0] - tmp[1]) - tmp[2] + tmp[3]; - pictran[pictran_off + pwh_str * 9] = (T)( 2.0) * (tmp[2] - tmp[0]) - tmp[1] + tmp[3]; - pictran[pictran_off + pwh_str * 10] = (T)( 2.0) * (tmp[0] - tmp[2]) - tmp[1] + tmp[3]; - pictran[pictran_off + pwh_str * 11] = (T)( 4.0) * (tmp[0] - h5[1] - h5[2]) - (T)(5.0) * tmp[2] + h5[3] + h5[4]; - - pictran[pictran_off + pwh_str * 12] = (T)(16.0) * (h0[1] - h0[2]) + (T)(4.0) * (h0[4] - h0[3]) - (T)(5.0) * tmp[5] + tmp[7]; - pictran[pictran_off + pwh_str * 13] = (T)(-4.0) * (tmp[4] + tmp[5]) + tmp[6] + tmp[7]; - pictran[pictran_off + pwh_str * 14] = (T)( 4.0) * (tmp[4] - tmp[5]) - tmp[6] + tmp[7]; - pictran[pictran_off + pwh_str * 15] = (T)( 2.0) * (tmp[6] - tmp[4]) - tmp[5] + tmp[7]; - pictran[pictran_off + pwh_str * 16] = (T)( 2.0) * (tmp[4] - tmp[6]) - tmp[5] + tmp[7]; - pictran[pictran_off + pwh_str * 17] = (T)( 4.0) * (tmp[4] + h5[1] - h5[2]) - (T)(5.0) * tmp[6] - h5[3] + h5[4]; - - pictran[pictran_off + pwh_str * 18] = (T)( 8.0) * (h0[3] - h0[1]) + (T)(4.0) * (h0[4] - h0[2]) -(T)(5.0) * tmp[9] + tmp[11]; - pictran[pictran_off + pwh_str * 19] = (T)(-4.0) * (tmp[8] + tmp[9]) + tmp[10] + tmp[11]; - pictran[pictran_off + pwh_str * 20] = (T)( 4.0) * (tmp[8] - tmp[9]) - tmp[10] + tmp[11]; - pictran[pictran_off + pwh_str * 21] = (T)( 2.0) * (tmp[10] - tmp[8]) - tmp[9] + tmp[11]; - pictran[pictran_off + pwh_str * 22] = (T)( 2.0) * (tmp[8] - tmp[10]) - tmp[9] + tmp[11]; - pictran[pictran_off + pwh_str * 23] = (T)( 4.0) * tmp[8] + (T)(2.0) * (h5[3] - h5[1]) - h5[2] - (T)(5.0) * tmp[10] + h5[4]; - - pictran[pictran_off + pwh_str * 24] = (T)( 8.0) * (h0[1] - h0[3]) + (T)(4.0) * (h0[4] - h0[2]) -(T)(5.0) * tmp[13] + tmp[15]; - pictran[pictran_off + pwh_str * 25] = (T)(-4.0) * (tmp[12] + tmp[13]) + tmp[14] + tmp[15]; - pictran[pictran_off + pwh_str * 26] = (T)( 4.0) * (tmp[12] - tmp[13]) - tmp[14] + tmp[15]; - pictran[pictran_off + pwh_str * 27] = (T)( 2.0) * (tmp[14] - tmp[12]) - tmp[13] + tmp[15]; - pictran[pictran_off + pwh_str * 28] = (T)( 2.0) * (tmp[12] - tmp[14]) - tmp[13] + tmp[15]; - pictran[pictran_off + pwh_str * 29] = (T)( 4.0) * tmp[12] + (T)(2.0) * (h5[1] - h5[3]) - h5[2] - (T)(5.0) * tmp[14] + h5[4]; - - pictran[pictran_off + pwh_str * 30] = (T)(16.0) * h0[1] - (T)(20.0) * h0[3] + (T)(4.0) * h0[5] - (T)(5.0) * h2[5] + h4[5]; - pictran[pictran_off + pwh_str * 31] = (T)(-4.0) *(h1[5] + h2[5])+ h3[5] + h4[5]; - pictran[pictran_off + pwh_str * 32] = (T)( 4.0) *(h1[5] - h2[5])- h3[5] + h4[5]; - pictran[pictran_off + pwh_str * 33] = (T)( 2.0) *(h3[5] - h1[5])- h2[5] + h4[5]; - pictran[pictran_off + pwh_str * 34] = (T)( 2.0) *(h1[5] - h3[5])- h2[5] + h4[5]; - pictran[pictran_off + pwh_str * 35] = (T)( 4.0) *(h1[5] + h5[1])- (T)(5.0) * (h3[5] + h5[3]) + h5[5]; -} diff --git a/tensor_computing/src/gpu/mali/cl/depth2space.cl 
b/tensor_computing/src/gpu/mali/cl/depth2space.cl deleted file mode 100644 index cb855786..00000000 --- a/tensor_computing/src/gpu/mali/cl/depth2space.cl +++ /dev/null @@ -1,20 +0,0 @@ -__kernel void depth2space(const int iw, const int ih, const int iw_str, const int ih_str, const int iw_off, const int ih_off, - const int ow_str, const int oh_str, const int ow_off, const int oh_off, __global const T* in, __global uchar* out) { - const int idx = get_global_id(0); - const int idy = get_global_id(1); - if(idx >= ih || idy >= (iw << 2)) return; - const int ix = idx; - const int iy = idy % iw; - const int iz = idy / iw; - - const int in_off = (iz * iw_str + iy + iw_off) * ih_str + ix + ih_off; - T4 tmp = vload4(in_off, in); - uchar4 val; - val.x = tmp.x * 255.0; - val.y = tmp.y * 255.0; - val.z = tmp.z * 255.0; - val.w = tmp.w * 255.0; - - const int out_off = ((ix << 2) + iz + oh_off) * ow_str + (iy << 2) + ow_off; - vstore4(val, 0, out + out_off); -} diff --git a/tensor_computing/src/gpu/mali/cl/eltwise.cl b/tensor_computing/src/gpu/mali/cl/eltwise.cl deleted file mode 100644 index 0179604d..00000000 --- a/tensor_computing/src/gpu/mali/cl/eltwise.cl +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - - - - - -#define MANGLE_NAME_IMPL(base, TP, N) base ## TP ## N -#define MANGLE_NAME(base, TP, N) MANGLE_NAME_IMPL(base, TP, N) - -#if defined(USE_SUM) -#define calCore(in, off, v, res) {\ - v = vload8(0, in + (off << 2));\ - res.s0 += v.s0;\ - res.s1 += v.s1;\ - res.s2 += v.s2;\ - res.s3 += v.s3;\ - res.s4 += v.s4;\ - res.s5 += v.s5;\ - res.s6 += v.s6;\ - res.s7 += v.s7;\ -} -#endif - -#if defined(USE_MAX) -#define calCore(in, off, v, res) {\ - v = vload8(0, in + (off << 2));\ - res = fmax(res, v);\ -} -#endif - -#if defined(USE_PROD) -#define calCore(in, off, v, res) {\ - v = vload8(0, in + (off << 2));\ - res.s0 *= v.s0;\ - res.s1 *= v.s1;\ - res.s2 *= v.s2;\ - res.s3 *= v.s3;\ - res.s4 *= v.s4;\ - res.s5 *= v.s5;\ - res.s6 *= v.s6;\ - res.s7 *= v.s7;\ -} -#endif - -__kernel void MANGLE_NAME(eltwise_, TP, N)(const int h, const int w, const int c, const int ih_str, const int iw_str, const int ih_off, const int iw_off, - const int oh_str, const int ow_str, const int oh_off, const int ow_off, - __global const T* in0, -#if (N > 1) - __global const T* in1, -#endif -#if (N > 2) - __global const T* in2, -#endif -#if (N > 3) - __global const T* in3, -#endif -#if (N > 4) - __global const T* in4, -#endif -#if (N > 5 ) - __global const T* in5, -#endif -#if (N > 6) - __global const T* in6, -#endif -#if (N > 7) - __global const T* in7, -#endif - __global T* out){ - const int idx = get_global_id(0); - const int idy = get_global_id(1); - const int idz = get_global_id(2); - if(idx >= ((h + 1) >> 1) || idy >= w) return; - - T8 val; - T8 res; - const int in_off = (idz * iw_str + idy + iw_off) * ih_str + (idx << 1) + ih_off; - res = vload8(0, in0 + (in_off << 2)); -#if(N > 1) - calCore(in1, in_off, val, res); -#endif -#if(N > 2) - calCore(in2, in_off, val, res); -#endif -#if(N > 3) - calCore(in3, in_off, val, res); -#endif -#if(N > 4) - calCore(in4, in_off, val, res); -#endif -#if(N > 5) - calCore(in5, in_off, val, res); -#endif -#if(N > 6) - calCore(in6, in_off, val, res); -#endif -#if(N > 7) - calCore(in7, in_off, val, res); -#endif - - const int out_off = (idz * ow_str + idy + ow_off) * oh_str + (idx << 1) + oh_off; - if((idx << 1) + 1 < h){ - vstore8(res, 0, out + (out_off << 2)); - } else { - vstore4(res.s0123, 0, out + (out_off << 2)); - } -} - diff --git a/tensor_computing/src/gpu/mali/cl/fc_p1.cl b/tensor_computing/src/gpu/mali/cl/fc_p1.cl deleted file mode 100644 index c2c7f565..00000000 --- a/tensor_computing/src/gpu/mali/cl/fc_p1.cl +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - - -#define calCore(iv, fv, res){\ - res.x += iv.x * fv.s0 + iv.y * fv.s1 + iv.z * fv.s2 + iv.w * fv.s3;\ - res.y += iv.x * fv.s4 + iv.y * fv.s5 + iv.z * fv.s6 + iv.w * fv.s7;\ - res.z += iv.x * fv.s8 + iv.y * fv.s9 + iv.z * fv.sa + iv.w * fv.sb;\ - res.w += iv.x * fv.sc + iv.y * fv.sd + iv.z * fv.se + iv.w * fv.sf;\ -} -__kernel void fc_p1(const int item_y, const int ih_str, const int iw_str, const int ih_off, const int iw_off, const int ihy_str, const int ihw_str, - const int fh, const int fw, const int fc, const int fn, const int fhy_str, const int fhw_str, const int fwc_str, __global const T* flt, __global const T* in, __global T* out){ - const int idx = get_global_id(0); - const int idy = get_global_id(1); - const int idz = get_global_id(2); - if(idx >= fh || idy >= item_y) return; - - T4 in_val; - T16 flt_val; - T4 sum = 0; - int in_off = (idy + iw_off) * ih_str + idx + ih_off; - int flt_off = (idz * fwc_str + idy) * fh + idx; - - for(int i = 0; i < fc; i++){ - int k = 0; - for(int j = idy; j < fw; j += item_y){ - in_val = vload4 (in_off + k * ihy_str, in); - flt_val = vload16(flt_off + k * fhy_str, flt); - calCore(in_val, flt_val, sum); - k++; - } - in_off += ihw_str; - flt_off += fhw_str; - } - - const int out_off = (idy * fh + idx) * fn + idz; - vstore4(sum, out_off, out); -} diff --git a/tensor_computing/src/gpu/mali/cl/fc_p2.cl b/tensor_computing/src/gpu/mali/cl/fc_p2.cl deleted file mode 100644 index cc5944d1..00000000 --- a/tensor_computing/src/gpu/mali/cl/fc_p2.cl +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - - - - -#if defined(USE_HALF) -#define READ_IMAGE(image, sampler, coord) read_imageh(image, sampler, coord) -#define WRITE_IMAGE(image, coord, data) write_imageh(image, coord, data) -#else -#define READ_IMAGE(image, sampler, coord) read_imagef(image, sampler, coord) -#define WRITE_IMAGE(image, coord, data) write_imagef(image, coord, data) -#endif -__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - -#define calCore(iv, fv, res) {\ - res.x += iv.x * fv.s0 + iv.y * fv.s1 + iv.z * fv.s2 + iv.w * fv.s3;\ - res.y += iv.x * fv.s4 + iv.y * fv.s5 + iv.z * fv.s6 + iv.w * fv.s7;\ - res.z += iv.x * fv.s8 + iv.y * fv.s9 + iv.z * fv.sa + iv.w * fv.sb;\ - res.w += iv.x * fv.sc + iv.y * fv.sd + iv.z * fv.se + iv.w * fv.sf;\ -} -__kernel void fc_p2(const int loop, const int len, const int oh_str, const int ow_str, const int oh_off, const int ow_off, - __global const T* in, __global const T* bias, __global T* out) { - const int idx = get_global_id(0); - if(idx >= len) return; - - T4 sum = vload4(idx, bias); - T4 val; - for(int i = 0; i < loop; i++) { - val = vload4(idx + i * len, in); - sum.x += val.x; - sum.y += val.y; - sum.z += val.z; - sum.w += val.w; - } - - const int out_off = (idx * ow_str + ow_off) * oh_str + oh_off; - vstore4(sum, out_off, out); -} diff --git a/tensor_computing/src/gpu/mali/cl/fc_trans_fltbuf.cl b/tensor_computing/src/gpu/mali/cl/fc_trans_fltbuf.cl deleted file mode 100644 index 6dd192fd..00000000 --- a/tensor_computing/src/gpu/mali/cl/fc_trans_fltbuf.cl +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - - - - -#define MANGLE_NAME_IMPL(base, C, K) base ## C ## K -#define MANGLE_NAME(base, C, K) MANGLE_NAME_IMPL(base, C, K) -#if(C == 4) -#define loadFltval(off, str, flt, val){\ - val.x = flt[off];\ - val.y = flt[off + str];\ - val.z = flt[off + (str << 1)];\ - val.w = flt[off + str * 3];\ -} - -#define loadFltvalEdge(off, str, flt, val, edge){\ - val.x = flt[off];\ - if(edge > 1) val.y = flt[off + str];\ - if(edge > 2) val.z = flt[off + (str << 1)];\ -} -#endif - -#if(C == 8) -#define loadFltval(off, str, flt, val){\ - val.s0 = flt[off];\ - val.s1 = flt[off + str];\ - val.s2 = flt[off +(str << 1)];\ - val.s3 = flt[off + str * 3];\ - val.s4 = flt[off +(str << 2)];\ - val.s5 = flt[off + str * 5];\ - val.s6 = flt[off + str * 6];\ - val.s7 = flt[off + str * 7];\ -} -#define loadFltvalEdge(off, str, flt, val, edge){\ - val.s0 = flt[off];\ - if(edge > 1) val.s1 = flt[off + str];\ - if(edge > 2) val.s2 = flt[off +(str << 1)];\ - if(edge > 3) val.s3 = flt[off + str * 3];\ - if(edge > 4) val.s4 = flt[off +(str << 2)];\ - if(edge > 5) val.s5 = flt[off + str * 5];\ - if(edge > 6) val.s6 = flt[off + str * 6];\ -} -#endif - -__kernel void MANGLE_NAME(fc_trans_fltbuf_, C, K)(const int fw, const int fh, const int fwh, const int fc, const int fn, - __global const T* fltdata, __global T* fltbuf){ - const int idx = get_global_id(0); - const int idy = get_global_id(1); - const int idz = get_global_id(2); - const int bc = (fc + C - 1) / C; - const int idc = idz % bc; - const int idn = idz / bc; - short ec = ((idc + 1) * C <= fc) ? C : (fc % C); - - const int flt_off = ((idn * fc + idc * C) * fh + idy) * fw + idx; -#if (C == 4) - T4 val = 0; -#elif (C == 8) - T8 val = 0; -#endif - if(idn < fn){ - if(ec == C){ - loadFltval(flt_off, fwh, fltdata, val); - } else { - loadFltvalEdge(flt_off, fwh, fltdata, val, ec); - } - } - const int out_off = ((idn / K * bc + idc) * fh + idx) * fw * K + idy * K + (idn % K); -#if (C == 4) - vstore4(val, out_off, fltbuf); -#elif (C == 8) - vstore8(val, out_off, fltbuf); -#endif -} diff --git a/tensor_computing/src/gpu/mali/cl/gemm_nt.cl b/tensor_computing/src/gpu/mali/cl/gemm_nt.cl deleted file mode 100644 index 08c2c5e7..00000000 --- a/tensor_computing/src/gpu/mali/cl/gemm_nt.cl +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
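gemm_nt below multiplies a non-transposed A by a transposed B: both operands are laid out K-major, so the inner loop walks K in VN = 1 << LK wide vector strips while an LM x LN accumulator tile stays in registers. A scalar C model of the tile each work-item computes (LM/LN/LK are compile-time constants in the kernel; here they are plain parameters, and the vectorization is elided):

```c
/* Sketch of one gemm_nt work-item: A has LM rows of stride KA, B has
 * LN rows of stride KB, and the LM x LN result lands at row stride
 * ow_str, mirroring GEMM_SET_C_ZERO / GEMM_CALCORE / GEMM_STORE_C. */
void gemm_nt_tile(const float *A, const float *B, float *C,
                  int LM, int LN, int K, int KA, int KB, int ow_str)
{
    for (int m = 0; m < LM; ++m) {
        for (int n = 0; n < LN; ++n) {
            float acc = 0.0f;              /* NO_BIAS variant starts at zero */
            for (int k = 0; k < K; ++k)    /* kernel does this VN lanes at a time */
                acc += A[m * KA + k] * B[n * KB + k];
            C[m * ow_str + n] = acc;
        }
    }
}
```

With a bias, the accumulators start at the per-row bias values instead of zero, and the idz dimension adds A_str/B_str/C_str offsets so one launch can batch several matrices.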
- - - - - -#include"kernel_def.h" -#define MANGLE_NAME_LMPL(base, LM, LN, LK) base ## LM ## LN ## LK -#define MANGLE_NAME(base, LM, LN, LK) MANGLE_NAME_LMPL(base, LM, LN, LK) - -#if defined(NO_BIAS) -__kernel void MANGLE_NAME(gemm_nt_nobias_, LM, LN, LK) -#else -__kernel void MANGLE_NAME(gemm_nt_, LM, LN, LK) -#endif -#if defined(NO_BIAS) -(const int KA, const int KB, const int K, const int ow_str, const int A_str, const int B_str, const int C_str, const int bx, const int by, __global const T* A, __global const T* B, __global T* C) -#else -(const int KA, const int KB, const int K, const int ow_str, const int A_str, const int B_str, const int C_str, const int bx, const int by, __global const T* A, __global const T* B, __global const T* bias, __global T* C) -#endif -{ - const int idx = get_global_id(0); - const int idy = get_global_id(1); - const int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; - const int ix = idx * LN; - const int iy = idy * LM; - const int L = K >> LK; - const int VN = 1 << LK; - - T c[LM][LN]; -#if(LK == 0) - T a[LM]; - T b[LN]; -#elif(LK == 1) - T2 a[LM]; - T2 b[LN]; -#elif(LK == 2) - T4 a[LM]; - T4 b[LN]; -#elif(LK == 3) - T8 a[LM]; - T8 b[LN]; -#elif(LK == 4) - T16 a[LM]; - T16 b[LN]; -#endif - -#if defined(NO_BIAS) - GEMM_SET_C_ZERO(c); -#else - GEMM_LOAD_A(a, iy, bias); - GEMM_SET_C_BIAS(a, c); -#endif - - int a_off = iy * KA + idz * A_str; - int b_off = ix * KB + idz * B_str; - for(int i = 0; i < L; ++i) { - GEMM_NT_LOAD_A(a, a_off, KA, A); - GEMM_NT_LOAD_B(b, b_off, KB, B); - GEMM_CALCORE(a, b, c); - a_off += VN; - b_off += VN; - } - int c_off = iy * ow_str + ix + idz * C_str; - GEMM_STORE_C(c, c_off, ow_str, C); -} diff --git a/tensor_computing/src/gpu/mali/cl/gemm_tn.cl b/tensor_computing/src/gpu/mali/cl/gemm_tn.cl deleted file mode 100644 index 18ae8f19..00000000 --- a/tensor_computing/src/gpu/mali/cl/gemm_tn.cl +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
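gemm_tn below consumes A pre-transposed (K x M): at each step k it loads LM consecutive elements of A and LN consecutive elements of B, both unit-stride, and applies a rank-1 update to the register tile, with the bias (when present) preloading the accumulators. A C model of that accumulation for the plain buffer output path, in tile-local indices:

```c
/* Sketch of one gemm_tn work-item with bias: A is K x M (transposed),
 * B is K x N; indices are tile-local, so bias[m] stands in for the
 * GEMM_LOAD_A(a, iy, bias) row load. LM, LN <= 8 as in the kernel macros. */
void gemm_tn_tile(const float *A, const float *B, const float *bias,
                  float *C, int LM, int LN, int K, int M, int N, int ow_str)
{
    float acc[8][8];
    for (int m = 0; m < LM; ++m)
        for (int n = 0; n < LN; ++n)
            acc[m][n] = bias[m];                    /* GEMM_SET_C_BIAS */
    for (int k = 0; k < K; ++k)                     /* rank-1 update per k */
        for (int m = 0; m < LM; ++m)
            for (int n = 0; n < LN; ++n)
                acc[m][n] += A[k * M + m] * B[k * N + n];
    for (int m = 0; m < LM; ++m)
        for (int n = 0; n < LN; ++n)
            C[m * ow_str + n] = acc[m][n];          /* plain (non-NCWHC4) store */
}
```

The USE_NCWHC4 path stores the same tile but regroups it into 4-channel vectors, using UPDATE_REG to shift each accumulator row after every column is written out.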
- - - - - -#include"kernel_def.h" -#define MANGLE_NAME_LMPL(base, LM, LN) base ## LM ## LN -#define MANGLE_NAME(base, LM, LN) MANGLE_NAME_LMPL(base, LM, LN) - -#if defined(USE_NCWHC4) -#if defined(USE_RELU) -__kernel void MANGLE_NAME(gemm_tn_relu_ncwhc4_, LM, LN) -#else -__kernel void MANGLE_NAME(gemm_tn_ncwhc4_, LM, LN) -#endif -#else -#if defined(USE_RELU) -__kernel void MANGLE_NAME(gemm_tn_relu_, LM, LN) -#elif defined(NO_BIAS) -__kernel void MANGLE_NAME(gemm_tn_nobias_, LM, LN) -#else -__kernel void MANGLE_NAME(gemm_tn_, LM, LN) -#endif -#endif -#if defined(USE_NCWHC4) -(const int M, const int N, const int K, const int oh, const int ow, const int oc, const int oh_str, const int ow_str, const int ohw_str, - const int oh_off, const int ow_off, const int bx, const int by, __global const T* A, __global const T* B, __global const T* bias, __global T* C) -#else -#if defined(NO_BIAS) -(const int M, const int N, const int K, const int ow_str, const int A_str, const int B_str, const int C_str, const int bx, const int by, __global const T* A, __global const T* B, __global T* C) -#else -(const int M, const int N, const int K, const int ow_str, const int bx, const int by, __global const T* A, __global const T* B, __global const T* bias, __global T* C) -#endif -#endif -{ - const int idx = get_global_id(0); - const int idy = get_global_id(1); - if(idx >= bx || idy >= by) return; - const int ix = idx * LN; - const int iy = idy * LM; - - T a[LM]; - T b[LN]; - T c[LM][LN]; - int a_off = iy; - int b_off = ix; -#if defined(NO_BIAS) - const int idz = get_global_id(2); - a_off += idz * A_str; - b_off += idz * B_str; - GEMM_SET_C_ZERO(c); -#else - GEMM_LOAD_A(a, iy, bias); - GEMM_SET_C_BIAS(a, c); -#endif - - for(int i = 0; i < K; ++i) { - GEMM_LOAD_A(a, a_off, A); - GEMM_LOAD_B(b, b_off, B); - GEMM_CALCORE(a, b, c); - a_off += M; - b_off += N; - } - -#if defined(USE_NCWHC4) - /*LM = 4 or LM = 8*/ - int c_base = (iy >> 2) * ohw_str; - for(uchar i = 0; i < LN; ++i) { - int oxh = (ix + i) % oh; - int oxw = (ix + i) / oh; - if(oxw >= ow) break; - int c_off = c_base + (oxw + ow_off) * oh_str + oxh + oh_off; - T4 tmp; - tmp.x = c[0][0]; - tmp.y = c[1][0]; - tmp.z = c[2][0]; - tmp.w = c[3][0]; - ACTIVATION_V4(tmp); - vstore4(tmp, c_off, C); - UPDATE_REG(c[0]); - UPDATE_REG(c[1]); - UPDATE_REG(c[2]); - UPDATE_REG(c[3]); -#if (LM == 8) - if(iy + 4 >= oc) continue; - c_off += ohw_str; - tmp.x = c[4][0]; - tmp.y = c[5][0]; - tmp.z = c[6][0]; - tmp.w = c[7][0]; - ACTIVATION_V4(tmp); - vstore4(tmp, c_off, C); - UPDATE_REG(c[4]); - UPDATE_REG(c[5]); - UPDATE_REG(c[6]); - UPDATE_REG(c[7]); -#endif - } -#else - int c_off = iy * ow_str + ix; -#if defined(NO_BIAS) - c_off += idz * C_str; -#endif - GEMM_STORE_C(c, c_off, ow_str, C); -#endif -} diff --git a/tensor_computing/src/gpu/mali/cl/kernel_def.h b/tensor_computing/src/gpu/mali/cl/kernel_def.h deleted file mode 100644 index d67dad82..00000000 --- a/tensor_computing/src/gpu/mali/cl/kernel_def.h +++ /dev/null @@ -1,2163 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - - -#ifndef _KERNEL_DEF -#define _KERNEL_DEF - -/* - * READ IMAGE - */ -#if defined(USE_HALF) -#define READ_IMAGE(image, sampler, coord) read_imageh(image, sampler, coord) -#define WRITE_IMAGE(image, coord, data) write_imageh(image, coord, data) -#else -#define READ_IMAGE(image, sampler, coord) read_imagef(image, sampler, coord) -#define WRITE_IMAGE(image, coord, data) write_imagef(image, coord, data) -#endif - - -__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - -#if defined(USE_V1) - #define READ_BUF(v, off, buf) {\ - v = buf[off];\ - } -#elif defined (USE_V2) - #define READ_BUF(v, off, buf) {\ - v = vload2(0, buf + off);\ - } -#elif defined (USE_V3) - #define READ_BUF(v, off, buf) {\ - v = vload3(0, buf + off);\ - } -#elif defined (USE_V4) - #define READ_BUF(v, off, buf) {\ - v = vload4(0, buf + off);\ - } -#elif defined (USE_V8) - #define READ_BUF(v, off, buf) {\ - v = vload8(0, buf + off);\ - } -#elif defined (USE_V16) - #define READ_BUF(v, off, buf) {\ - v = vload16(0, buf + off);\ - } -#endif - -/* - * load data from buffer to reg array - */ -#define LOAD_BUF_ARRAY1(v, off, buf) {\ - v[0] = buf[off];\ -} - -#define LOAD_BUF_ARRAY2(v, off, buf) {\ - T2 tmp = vload2(0, buf + off);\ - v[0] = tmp.x;\ - v[1] = tmp.y;\ -} - -#define LOAD_BUF_ARRAY3(v, off, buf) {\ - T3 tmp = vload3(0, buf + off);\ - v[0] = tmp.x;\ - v[1] = tmp.y;\ - v[2] = tmp.z;\ -} - -#define LOAD_BUF_ARRAY4(v, off, buf) {\ - T4 tmp = vload4(0, buf + off);\ - v[0] = tmp.x;\ - v[1] = tmp.y;\ - v[2] = tmp.z;\ - v[3] = tmp.w;\ -} - -#define LOAD_BUF_ARRAY5(v, off, buf) {\ - T8 tmp = vload8(0, buf + off);\ - v[0] = tmp.s0;\ - v[1] = tmp.s1;\ - v[2] = tmp.s2;\ - v[3] = tmp.s3;\ - v[4] = tmp.s4;\ -} - -#define LOAD_BUF_ARRAY6(v, off, buf) {\ - T8 tmp = vload8(0, buf + off);\ - v[0] = tmp.s0;\ - v[1] = tmp.s1;\ - v[2] = tmp.s2;\ - v[3] = tmp.s3;\ - v[4] = tmp.s4;\ - v[5] = tmp.s5;\ -} - -#define LOAD_BUF_ARRAY7(v, off, buf) {\ - T8 tmp = vload8(0, buf + off);\ - v[0] = tmp.s0;\ - v[1] = tmp.s1;\ - v[2] = tmp.s2;\ - v[3] = tmp.s3;\ - v[4] = tmp.s4;\ - v[5] = tmp.s5;\ - v[6] = tmp.s6;\ -} - -#define LOAD_BUF_ARRAY8(v, off, buf) {\ - T8 tmp= vload8(0, buf + off);\ - v[0] = tmp.s0;\ - v[1] = tmp.s1;\ - v[2] = tmp.s2;\ - v[3] = tmp.s3;\ - v[4] = tmp.s4;\ - v[5] = tmp.s5;\ - v[6] = tmp.s6;\ - v[7] = tmp.s7;\ -} - -/* - * set reg array to normal val - */ -#define SET_REG_ARRAY1(v, reg) {\ - reg[0] = v;\ -} - -#define SET_REG_ARRAY2(v, 
reg) {\ - reg[0] = v;\ - reg[1] = v;\ -} - -#define SET_REG_ARRAY3(v, reg) {\ - reg[0] = v;\ - reg[1] = v;\ - reg[2] = v;\ -} - -#define SET_REG_ARRAY4(v, reg) {\ - reg[0] = v;\ - reg[1] = v;\ - reg[2] = v;\ - reg[3] = v;\ -} -#define SET_REG_ARRAY5(v, reg) {\ - reg[0] = v;\ - reg[1] = v;\ - reg[2] = v;\ - reg[3] = v;\ - reg[4] = v;\ -} - -#define SET_REG_ARRAY6(v, reg) {\ - reg[0] = v;\ - reg[1] = v;\ - reg[2] = v;\ - reg[3] = v;\ - reg[4] = v;\ - reg[5] = v;\ -} - -#define SET_REG_ARRAY7(v, reg) {\ - reg[0] = v;\ - reg[1] = v;\ - reg[2] = v;\ - reg[3] = v;\ - reg[4] = v;\ - reg[5] = v;\ - reg[6] = v;\ -} - -#define SET_REG_ARRAY8(v, reg) {\ - reg[0] = v;\ - reg[1] = v;\ - reg[2] = v;\ - reg[3] = v;\ - reg[4] = v;\ - reg[5] = v;\ - reg[6] = v;\ - reg[7] = v;\ -} - -#define MUL_REG_NORMAL_ARRAY1(s, reg) {\ - reg[0] = s * reg[0];\ -} - -#define MUL_REG_NORMAL_ARRAY2(s, reg) {\ - reg[0] = s * reg[0];\ - reg[1] = s * reg[1];\ -} - -#define MUL_REG_NORMAL_ARRAY3(s, reg) {\ - reg[0] = s * reg[0];\ - reg[1] = s * reg[1];\ - reg[2] = s * reg[2];\ -} - -#define MUL_REG_NORMAL_ARRAY4(s, reg) {\ - reg[0] = s * reg[0];\ - reg[1] = s * reg[1];\ - reg[2] = s * reg[2];\ - reg[3] = s * reg[3];\ -} - -#define MUL_REG_NORMAL_ARRAY5(s, reg) {\ - reg[0] = s * reg[0];\ - reg[1] = s * reg[1];\ - reg[2] = s * reg[2];\ - reg[3] = s * reg[3];\ - reg[4] = s * reg[4];\ -} - -#define MUL_REG_NORMAL_ARRAY6(s, reg) {\ - reg[0] = s * reg[0];\ - reg[1] = s * reg[1];\ - reg[2] = s * reg[2];\ - reg[3] = s * reg[3];\ - reg[4] = s * reg[4];\ - reg[5] = s * reg[5];\ -} - -#define MUL_REG_NORMAL_ARRAY7(s, reg) {\ - reg[0] = s * reg[0];\ - reg[1] = s * reg[1];\ - reg[2] = s * reg[2];\ - reg[3] = s * reg[3];\ - reg[4] = s * reg[4];\ - reg[5] = s * reg[5];\ - reg[6] = s * reg[6];\ -} - -#define MUL_REG_NORMAL_ARRAY8(s, reg) {\ - reg[0] = s * reg[0];\ - reg[1] = s * reg[1];\ - reg[2] = s * reg[2];\ - reg[3] = s * reg[3];\ - reg[4] = s * reg[4];\ - reg[5] = s * reg[5];\ - reg[6] = s * reg[6];\ - reg[7] = s * reg[7];\ -} - -#define ADD_REG_ARRAY4(reg0, reg1) {\ - reg1[0] += reg0[0];\ - reg1[1] += reg0[1];\ - reg1[2] += reg0[2];\ - reg1[3] += reg0[3];\ -} - -#define MINUS_REG_ARRAY4(reg0, reg1) {\ - reg1[0] -= reg0[0];\ - reg1[1] -= reg0[1];\ - reg1[2] -= reg0[2];\ - reg1[3] -= reg0[3];\ -} - -/* - * DOT - */ -#define DOT_A4B16C4(a, b, c) {\ - c.x += (a.x * b.s0 + a.y * b.s1 + a.z * b.s2 + a.w * b.s3);\ - c.y += (a.x * b.s4 + a.y * b.s5 + a.z * b.s6 + a.w * b.s7);\ - c.z += (a.x * b.s8 + a.y * b.s9 + a.z * b.sa + a.w * b.sb);\ - c.w += (a.x * b.sc + a.y * b.sd + a.z * b.se + a.w * b.sf);\ -} - -#define DOT_A4B4C1(a, b, c) {\ - c += (a.x * b.s0 + a.y * b.s1 + a.z * b.s2 + a.w * b.s3);\ -} - -#define DOT_A4B4C4(a, b, c) {\ - c.x += a.x * b.x;\ - c.y += a.y * b.y;\ - c.z += a.z * b.z;\ - c.w += a.w * b.w;\ -} - -#define DOT_A2B2C1(a, b, c) {\ - c += (a.s0 * b.s0 + a.s1 * b.s1);\ -} - -#define DOT_A8B8C1(a, b, c) {\ - c += (a.s0 * b.s0 + a.s1 * b.s1 + a.s2 * b.s2 + a.s3 * b.s3);\ - c += (a.s4 * b.s4 + a.s5 * b.s5 + a.s6 * b.s6 + a.s7 * b.s7);\ -} - -#define DOT_A16B16C1(a, b, c) {\ - c += (a.s0 * b.s0 + a.s1 * b.s1 + a.s2 * b.s2 + a.s3 * b.s3);\ - c += (a.s4 * b.s4 + a.s5 * b.s5 + a.s6 * b.s6 + a.s7 * b.s7);\ - c += (a.s8 * b.s8 + a.s9 * b.s9 + a.sa * b.sa + a.sb * b.sb);\ - c += (a.sc * b.sc + a.sd * b.sd + a.se * b.se + a.sf * b.sf);\ -} - -#define DOT_A_NORMAL_B1C1_ARRAY(a, b, c) {\ - c[0] += a * b[0];\ -} - -#define DOT_A_NORMAL_B2C2_ARRAY(a, b, c) {\ - c[0] += a * b[0];\ - c[1] += a * b[1];\ -} - -#define DOT_A_NORMAL_B3C3_ARRAY(a, b, c) {\ - c[0]
+= a * b[0];\ - c[1] += a * b[1];\ - c[2] += a * b[2];\ -} - -#define DOT_A_NORMAL_B4C4_ARRAY(a, b, c) {\ - c[0] += a * b[0];\ - c[1] += a * b[1];\ - c[2] += a * b[2];\ - c[3] += a * b[3];\ -} - -#define DOT_A_NORMAL_B5C5_ARRAY(a, b, c) {\ - c[0] += a * b[0];\ - c[1] += a * b[1];\ - c[2] += a * b[2];\ - c[3] += a * b[3];\ - c[4] += a * b[4];\ -} - -#define DOT_A_NORMAL_B6C6_ARRAY(a, b, c) {\ - c[0] += a * b[0];\ - c[1] += a * b[1];\ - c[2] += a * b[2];\ - c[3] += a * b[3];\ - c[4] += a * b[4];\ - c[5] += a * b[5];\ -} - -#define DOT_A_NORMAL_B7C7_ARRAY(a, b, c) {\ - c[0] += a * b[0];\ - c[1] += a * b[1];\ - c[2] += a * b[2];\ - c[3] += a * b[3];\ - c[4] += a * b[4];\ - c[5] += a * b[5];\ - c[6] += a * b[6];\ -} - -#define DOT_A_NORMAL_B8C8_ARRAY(a, b, c) {\ - c[0] += a * b[0];\ - c[1] += a * b[1];\ - c[2] += a * b[2];\ - c[3] += a * b[3];\ - c[4] += a * b[4];\ - c[5] += a * b[5];\ - c[6] += a * b[6];\ - c[7] += a * b[7];\ -} - -#if defined(USE_V2) -#define DOT_VEC(a, b, c) DOT_A2B2C1(a, b, c) -#elif defined(USE_V4) -#define DOT_VEC(a, b, c) DOT_A4B4C1(a, b, c) -#elif defined(USE_V8) -#define DOT_VEC(a, b, c) DOT_A8B8C1(a, b, c) -#elif defined(USE_V16) -#define DOT_VEC(a, b, c) DOT_A16B16C1(a, b, c) -#else -#define DOT_A_VEC_B1C1_ARRAY(a, b, c) DOT_A_NORMAL_B1C1_ARRAY(a, b, c) -#define DOT_A_VEC_B2C2_ARRAY(a, b, c) DOT_A_NORMAL_B2C2_ARRAY(a, b, c) -#define DOT_A_VEC_B3C3_ARRAY(a, b, c) DOT_A_NORMAL_B3C3_ARRAY(a, b, c) -#define DOT_A_VEC_B4C4_ARRAY(a, b, c) DOT_A_NORMAL_B4C4_ARRAY(a, b, c) -#define DOT_A_VEC_B5C5_ARRAY(a, b, c) DOT_A_NORMAL_B5C5_ARRAY(a, b, c) -#define DOT_A_VEC_B6C6_ARRAY(a, b, c) DOT_A_NORMAL_B6C6_ARRAY(a, b, c) -#define DOT_A_VEC_B7C7_ARRAY(a, b, c) DOT_A_NORMAL_B7C7_ARRAY(a, b, c) -#define DOT_A_VEC_B8C8_ARRAY(a, b, c) DOT_A_NORMAL_B8C8_ARRAY(a, b, c) -#endif - -#if defined(USE_V2) || defined(USE_V4) || defined(USE_V8) || defined(USE_V16) -#define DOT_A_VEC_B1C1_ARRAY(a, b, c) {\ - DOT_VEC(a, b[0], c[0]);\ -} - -#define DOT_A_VEC_B2C2_ARRAY(a, b, c) {\ - DOT_VEC(a, b[0], c[0]);\ - DOT_VEC(a, b[1], c[1]);\ -} - -#define DOT_A_VEC_B3C3_ARRAY(a, b, c) {\ - DOT_VEC(a, b[0], c[0]);\ - DOT_VEC(a, b[1], c[1]);\ - DOT_VEC(a, b[2], c[2]);\ -} - -#define DOT_A_VEC_B4C4_ARRAY(a, b, c) {\ - DOT_VEC(a, b[0], c[0]);\ - DOT_VEC(a, b[1], c[1]);\ - DOT_VEC(a, b[2], c[2]);\ - DOT_VEC(a, b[3], c[3]);\ -} - -#define DOT_A_VEC_B5C5_ARRAY(a, b, c) {\ - DOT_VEC(a, b[0], c[0]);\ - DOT_VEC(a, b[1], c[1]);\ - DOT_VEC(a, b[2], c[2]);\ - DOT_VEC(a, b[3], c[3]);\ - DOT_VEC(a, b[4], c[4]);\ -} - -#define DOT_A_VEC_B6C6_ARRAY(a, b, c) {\ - DOT_VEC(a, b[0], c[0]);\ - DOT_VEC(a, b[1], c[1]);\ - DOT_VEC(a, b[2], c[2]);\ - DOT_VEC(a, b[3], c[3]);\ - DOT_VEC(a, b[4], c[4]);\ - DOT_VEC(a, b[5], c[5]);\ -} - -#define DOT_A_VEC_B7C7_ARRAY(a, b, c) {\ - DOT_VEC(a, b[0], c[0]);\ - DOT_VEC(a, b[1], c[1]);\ - DOT_VEC(a, b[2], c[2]);\ - DOT_VEC(a, b[3], c[3]);\ - DOT_VEC(a, b[4], c[4]);\ - DOT_VEC(a, b[5], c[5]);\ - DOT_VEC(a, b[6], c[6]);\ -} - -#define DOT_A_VEC_B8C8_ARRAY(a, b, c) {\ - DOT_VEC(a, b[0], c[0]);\ - DOT_VEC(a, b[1], c[1]);\ - DOT_VEC(a, b[2], c[2]);\ - DOT_VEC(a, b[3], c[3]);\ - DOT_VEC(a, b[4], c[4]);\ - DOT_VEC(a, b[5], c[5]);\ - DOT_VEC(a, b[6], c[6]);\ - DOT_VEC(a, b[7], c[7]);\ -} -#endif -/* - * ACTIVATION - */ -#if defined(USE_RELU) -#define ACTIVATION_V4(v) {\ - if(v.x < 0) v.x = (T)0;\ - if(v.y < 0) v.y = (T)0;\ - if(v.z < 0) v.z = (T)0;\ - if(v.w < 0) v.w = (T)0;\ -} - -#define ACTIVATION_V1(v) {\ - if(v < 0) v = (T)0;\ -} - -#define ACTIVATION_ARRAY1(v) {\ - if(v[0] < 0) v[0] = (T)0;\ -} 
- -#define ACTIVATION_ARRAY2(v) {\ - if(v[0] < 0) v[0] = (T)0;\ - if(v[1] < 0) v[1] = (T)0;\ -} - -#define ACTIVATION_ARRAY3(v) {\ - if(v[0] < 0) v[0] = (T)0;\ - if(v[1] < 0) v[1] = (T)0;\ - if(v[2] < 0) v[2] = (T)0;\ -} - -#define ACTIVATION_ARRAY4(v) {\ - if(v[0] < 0) v[0] = (T)0;\ - if(v[1] < 0) v[1] = (T)0;\ - if(v[2] < 0) v[2] = (T)0;\ - if(v[3] < 0) v[3] = (T)0;\ -} - -#define ACTIVATION_ARRAY5(v) {\ - if(v[0] < 0) v[0] = (T)0;\ - if(v[1] < 0) v[1] = (T)0;\ - if(v[2] < 0) v[2] = (T)0;\ - if(v[3] < 0) v[3] = (T)0;\ - if(v[4] < 0) v[4] = (T)0;\ -} - -#define ACTIVATION_ARRAY6(v) {\ - if(v[0] < 0) v[0] = (T)0;\ - if(v[1] < 0) v[1] = (T)0;\ - if(v[2] < 0) v[2] = (T)0;\ - if(v[3] < 0) v[3] = (T)0;\ - if(v[4] < 0) v[4] = (T)0;\ - if(v[5] < 0) v[5] = (T)0;\ -} - -#define ACTIVATION_ARRAY7(v) {\ - if(v[0] < 0) v[0] = (T)0;\ - if(v[1] < 0) v[1] = (T)0;\ - if(v[2] < 0) v[2] = (T)0;\ - if(v[3] < 0) v[3] = (T)0;\ - if(v[4] < 0) v[4] = (T)0;\ - if(v[5] < 0) v[5] = (T)0;\ - if(v[6] < 0) v[6] = (T)0;\ -} - -#define ACTIVATION_ARRAY8(v) {\ - if(v[0] < 0) v[0] = (T)0;\ - if(v[1] < 0) v[1] = (T)0;\ - if(v[2] < 0) v[2] = (T)0;\ - if(v[3] < 0) v[3] = (T)0;\ - if(v[4] < 0) v[4] = (T)0;\ - if(v[5] < 0) v[5] = (T)0;\ - if(v[6] < 0) v[6] = (T)0;\ - if(v[7] < 0) v[7] = (T)0;\ -} -#else -#define ACTIVATION_V1(v) {\ -} - -#define ACTIVATION_V4(v) {\ -} - -#define ACTIVATION_ARRAY1(v) {\ -} - -#define ACTIVATION_ARRAY2(v) {\ -} - -#define ACTIVATION_ARRAY3(v) {\ -} - -#define ACTIVATION_ARRAY4(v) {\ -} - -#define ACTIVATION_ARRAY5(v) {\ -} - -#define ACTIVATION_ARRAY6(v) {\ -} - -#define ACTIVATION_ARRAY7(v) {\ -} - -#define ACTIVATION_ARRAY8(v) {\ -} -#endif - -/* - * store data reg array to buffer - */ -#define STORE_BUF_ARRAY1(v, off, buf) {\ - ACTIVATION_ARRAY1(v);\ - buf[off] = v[0];\ -} - -#define STORE_BUF_ARRAY2(v, off, buf) {\ - ACTIVATION_ARRAY2(v);\ - vstore2((T2)(v[0], v[1]), 0, buf + off);\ -} - -#define STORE_BUF_ARRAY3(v, off, buf) {\ - ACTIVATION_ARRAY3(v);\ - vstore3((T3)(v[0], v[1], v[2]), 0, buf + off);\ -} - -#define STORE_BUF_ARRAY4(v, off, buf) {\ - ACTIVATION_ARRAY4(v);\ - vstore4((T4)(v[0], v[1], v[2], v[3]), 0, buf + off);\ -} - -#define STORE_BUF_ARRAY5(v, off, buf) {\ - ACTIVATION_ARRAY5(v);\ - vstore4((T4)(v[0], v[1], v[2], v[3]), 0, buf + off);\ - buf[off + 4] = v[4];\ -} - -#define STORE_BUF_ARRAY6(v, off, buf) {\ - ACTIVATION_ARRAY6(v);\ - vstore3((T3)(v[0], v[1], v[2]), 0, buf + off);\ - vstore3((T3)(v[3], v[4], v[5]), 0, buf + off + 3);\ -} - -#define STORE_BUF_ARRAY7(v, off, buf) {\ - ACTIVATION_ARRAY7(v);\ - vstore4((T4)(v[0], v[1], v[2], v[3]), 0, buf + off);\ - vstore3((T3)(v[4], v[5], v[6]), 0, buf + off + 4);\ -} - -#define STORE_BUF_ARRAY8(v, off, buf) {\ - ACTIVATION_ARRAY8(v);\ - vstore8((T8)(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]), 0, buf + off);\ -} -/* - * LOAD BIAS - * Load bias from image 1D based on out number - * ON is out number - */ - -#if(ON == 1) - #define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) {\ - V[0] = READ_IMAGE(img, sampler, id);\ - } -#elif(ON == 2) - #define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) {\ - V[0] = READ_IMAGE(img, sampler, id);\ - V[1] = V[0];\ - } -#elif(ON == 3) - #define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) {\ - V[0] = READ_IMAGE(img, sampler, id);\ - V[1] = V[0];\ - V[2] = V[0];\ - } -#elif(ON == 4) - #define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) {\ - V[0] = READ_IMAGE(img, sampler, id);\ - V[1] = V[0];\ - V[2] = V[0];\ - V[3] = V[0];\ - } -#elif(ON == 5) - #define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) {\ - V[0] = READ_IMAGE(img, sampler, 
id);\ - V[1] = V[0];\ - V[2] = V[0];\ - V[3] = V[0];\ - V[4] = V[0];\ - } -#elif(ON == 6) - #define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) {\ - V[0] = READ_IMAGE(img, sampler, id);\ - V[1] = V[0];\ - V[2] = V[0];\ - V[3] = V[0];\ - V[4] = V[0];\ - V[5] = V[0];\ - } -#elif(ON == 7) - #define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) {\ - V[0] = READ_IMAGE(img, sampler, id);\ - V[1] = V[0];\ - V[2] = V[0];\ - V[3] = V[0];\ - V[4] = V[0];\ - V[5] = V[0];\ - V[6] = V[0];\ - } -#elif(ON == 8) - #define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) {\ - V[0] = READ_IMAGE(img, sampler, id);\ - V[1] = V[0];\ - V[2] = V[0];\ - V[3] = V[0];\ - V[4] = V[0];\ - V[5] = V[0];\ - V[6] = V[0];\ - V[7] = V[0];\ - } -#elif(ON == 9) - #define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) {\ - V[0] = READ_IMAGE(img, sampler, id);\ - V[1] = V[0];\ - V[2] = V[0];\ - V[3] = V[0];\ - V[4] = V[0];\ - V[5] = V[0];\ - V[6] = V[0];\ - V[7] = V[0];\ - V[8] = V[0];\ - } -#elif(ON == 10) - #define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) {\ - V[0] = READ_IMAGE(img, sampler, id);\ - V[1] = V[0];\ - V[2] = V[0];\ - V[3] = V[0];\ - V[4] = V[0];\ - V[5] = V[0];\ - V[6] = V[0];\ - V[7] = V[0];\ - V[8] = V[0];\ - V[9] = V[0];\ - } -#elif(ON == 11) - #define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) {\ - V[0] = READ_IMAGE(img, sampler, id);\ - V[1] = V[0];\ - V[2] = V[0];\ - V[3] = V[0];\ - V[4] = V[0];\ - V[5] = V[0];\ - V[6] = V[0];\ - V[7] = V[0];\ - V[8] = V[0];\ - V[9] = V[0];\ - V[10] = V[0];\ - } -#elif(ON == 12) - #define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) {\ - V[0] = READ_IMAGE(img, sampler, id);\ - V[1] = V[0];\ - V[2] = V[0];\ - V[3] = V[0];\ - V[4] = V[0];\ - V[5] = V[0];\ - V[6] = V[0];\ - V[7] = V[0];\ - V[8] = V[0];\ - V[9] = V[0];\ - V[10] = V[0];\ - V[11] = V[0];\ - } -#endif - -/* - * LOAD INPUT - * load input from buffer based on len of array vector 4 - * len = N; - * N is usually associated with number W - * - * - * GEMM TN A x B = C - * Matrix A has been transposed - * Operator define for Matrix B and Matrix C - */ -#if(LN == 0) - #define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) {\ - } -#elif(LN == 1) - #define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) {\ - V[0] = vload4(off, buf);\ - } - - #define GEMM_SET_C_BIAS_X(v, reg) {\ - SET_REG_ARRAY1(v, reg);\ - } - - #define GEMM_LOAD_B(v, off, buf) {\ - LOAD_BUF_ARRAY1(v, off, buf);\ - } - - #define GEMM_CALCORE_X(a, b, c) {\ - DOT_A_VEC_B1C1_ARRAY(a, b, c);\ - } - - #define GEMM_MUL_C_X(s, reg) {\ - MUL_REG_NORMAL_ARRAY1(s, reg);\ - } - - #define GEMM_STORE_C_X(v, off, buf) {\ - STORE_BUF_ARRAY1(v, off, buf);\ - } - - #define GEMM_NT_LOAD_B(v, off, str, buf) {\ - READ_BUF(v[0], off, buf);\ - } -#elif(LN == 2) - #define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) {\ - V[0] = vload4(off, buf);\ - V[1] = vload4(off + str, buf);\ - } - - #define GEMM_SET_C_BIAS_X(v, reg) {\ - SET_REG_ARRAY2(v, reg);\ - } - - #define GEMM_LOAD_B(v, off, buf) {\ - LOAD_BUF_ARRAY2(v, off, buf);\ - } - - #define GEMM_CALCORE_X(a, b, c) {\ - DOT_A_VEC_B2C2_ARRAY(a, b, c);\ - } - - #define GEMM_MUL_C_X(s, reg) {\ - MUL_REG_NORMAL_ARRAY2(s, reg);\ - } - - #define GEMM_STORE_C_X(v, off, buf) {\ - STORE_BUF_ARRAY2(v, off, buf);\ - } - - #define GEMM_NT_LOAD_B(v, off, str, buf) {\ - READ_BUF(v[0], off, buf);\ - READ_BUF(v[1], off + str, buf);\ - } -#elif(LN == 3) - #define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) {\ - V[0] = vload4(off, buf);\ - V[1] = vload4(off + str, buf);\ - V[2] = vload4(off + str * 2, buf);\ - } - - #define GEMM_SET_C_BIAS_X(v, reg) {\ - SET_REG_ARRAY3(v, reg);\ - } - - #define GEMM_LOAD_B(v, off, buf) {\ 
- LOAD_BUF_ARRAY3(v, off, buf);\ - } - - #define GEMM_CALCORE_X(a, b, c) {\ - DOT_A_VEC_B3C3_ARRAY(a, b, c);\ - } - - #define GEMM_MUL_C_X(s, reg) {\ - MUL_REG_NORMAL_ARRAY3(s, reg);\ - } - - #define GEMM_STORE_C_X(v, off, buf) {\ - STORE_BUF_ARRAY3(v, off, buf);\ - } - - #define GEMM_NT_LOAD_B(v, off, str, buf) {\ - READ_BUF(v[0], off, buf);\ - READ_BUF(v[1], off + str, buf);\ - READ_BUF(v[2], off + str * 2, buf);\ - } -#elif(LN == 4) - #define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) {\ - V[0] = vload4(off, buf);\ - V[1] = vload4(off + str, buf);\ - V[2] = vload4(off + str * 2, buf);\ - V[3] = vload4(off + str * 3, buf);\ - } - - #define GEMM_SET_C_BIAS_X(v, reg) {\ - SET_REG_ARRAY4(v, reg);\ - } - - #define GEMM_LOAD_B(v, off, buf) {\ - LOAD_BUF_ARRAY4(v, off, buf);\ - } - - #define GEMM_CALCORE_X(a, b, c) {\ - DOT_A_VEC_B4C4_ARRAY(a, b, c);\ - } - - #define GEMM_MUL_C_X(s, reg) {\ - MUL_REG_NORMAL_ARRAY4(s, reg);\ - } - - #define GEMM_STORE_C_X(v, off, buf) {\ - STORE_BUF_ARRAY4(v, off, buf);\ - } - - #define GEMM_NT_LOAD_B(v, off, str, buf) {\ - READ_BUF(v[0], off, buf);\ - READ_BUF(v[1], off + str, buf);\ - READ_BUF(v[2], off + str * 2, buf);\ - READ_BUF(v[3], off + str * 3, buf);\ - } -#elif(LN == 5) - #define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) {\ - V[0] = vload4(off, buf);\ - V[1] = vload4(off + str, buf);\ - V[2] = vload4(off + str * 2, buf);\ - V[3] = vload4(off + str * 3, buf);\ - V[4] = vload4(off + str * 4, buf);\ - } - - #define GEMM_SET_C_BIAS_X(v, reg) {\ - SET_REG_ARRAY5(v, reg);\ - } - - #define GEMM_LOAD_B(v, off, buf) {\ - LOAD_BUF_ARRAY5(v, off, buf);\ - } - - #define GEMM_CALCORE_X(a, b, c) {\ - DOT_A_VEC_B5C5_ARRAY(a, b, c);\ - } - - #define GEMM_MUL_C_X(s, reg) {\ - MUL_REG_NORMAL_ARRAY5(s, reg);\ - } - - #define GEMM_STORE_C_X(v, off, buf) {\ - STORE_BUF_ARRAY5(v, off, buf);\ - } - - #define GEMM_NT_LOAD_B(v, off, str, buf) {\ - READ_BUF(v[0], off, buf);\ - READ_BUF(v[1], off + str, buf);\ - READ_BUF(v[2], off + str * 2, buf);\ - READ_BUF(v[3], off + str * 3, buf);\ - READ_BUF(v[4], off + str * 4, buf);\ - } -#elif(LN == 6) - #define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) {\ - V[0] = vload4(off, buf);\ - V[1] = vload4(off + str, buf);\ - V[2] = vload4(off + str * 2, buf);\ - V[3] = vload4(off + str * 3, buf);\ - V[4] = vload4(off + str * 4, buf);\ - V[5] = vload4(off + str * 5, buf);\ - } - - #define GEMM_SET_C_BIAS_X(v, reg) {\ - SET_REG_ARRAY6(v, reg);\ - } - - #define GEMM_LOAD_B(v, off, buf) {\ - LOAD_BUF_ARRAY6(v, off, buf);\ - } - - #define GEMM_CALCORE_X(a, b, c) {\ - DOT_A_VEC_B6C6_ARRAY(a, b, c);\ - } - - #define GEMM_MUL_C_X(s, reg) {\ - MUL_REG_NORMAL_ARRAY6(s, reg);\ - } - - #define GEMM_STORE_C_X(v, off, buf) {\ - STORE_BUF_ARRAY6(v, off, buf);\ - } - - #define GEMM_NT_LOAD_B(v, off, str, buf) {\ - READ_BUF(v[0], off, buf);\ - READ_BUF(v[1], off + str, buf);\ - READ_BUF(v[2], off + str * 2, buf);\ - READ_BUF(v[3], off + str * 3, buf);\ - READ_BUF(v[4], off + str * 4, buf);\ - READ_BUF(v[5], off + str * 5, buf);\ - } -#elif(LN == 7) - #define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) {\ - V[0] = vload4(off, buf);\ - V[1] = vload4(off + str, buf);\ - V[2] = vload4(off + str * 2, buf);\ - V[3] = vload4(off + str * 3, buf);\ - V[4] = vload4(off + str * 4, buf);\ - V[5] = vload4(off + str * 5, buf);\ - V[6] = vload4(off + str * 6, buf);\ - } - - #define GEMM_SET_C_BIAS_X(v, reg) {\ - SET_REG_ARRAY7(v, reg);\ - } - - #define GEMM_LOAD_B(v, off, buf) {\ - LOAD_BUF_ARRAY7(v, off, buf);\ - } - - #define GEMM_CALCORE_X(a, b, c) {\ - 
DOT_A_VEC_B7C7_ARRAY(a, b, c);\ - } - - #define GEMM_MUL_C_X(s, reg) {\ - MUL_REG_NORMAL_ARRAY7(s, reg);\ - } - - #define GEMM_STORE_C_X(v, off, buf) {\ - STORE_BUF_ARRAY7(v, off, buf);\ - } - - #define GEMM_NT_LOAD_B(v, off, str, buf) {\ - READ_BUF(v[0], off, buf);\ - READ_BUF(v[1], off + str, buf);\ - READ_BUF(v[2], off + str * 2, buf);\ - READ_BUF(v[3], off + str * 3, buf);\ - READ_BUF(v[4], off + str * 4, buf);\ - READ_BUF(v[5], off + str * 5, buf);\ - READ_BUF(v[6], off + str * 6, buf);\ - } -#elif(LN == 8) - #define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) {\ - V[0] = vload4(off, buf);\ - V[1] = vload4(off + str, buf);\ - V[2] = vload4(off + str * 2, buf);\ - V[3] = vload4(off + str * 3, buf);\ - V[4] = vload4(off + str * 4, buf);\ - V[5] = vload4(off + str * 5, buf);\ - V[6] = vload4(off + str * 6, buf);\ - V[7] = vload4(off + str * 7, buf);\ - } - - #define GEMM_SET_C_BIAS_X(v, reg) {\ - SET_REG_ARRAY8(v, reg);\ - } - - #define GEMM_LOAD_B(v, off, buf) {\ - LOAD_BUF_ARRAY8(v, off, buf);\ - } - - #define GEMM_CALCORE_X(a, b, c) {\ - DOT_A_VEC_B8C8_ARRAY(a, b, c);\ - } - - #define GEMM_MUL_C_X(s, reg) {\ - MUL_REG_NORMAL_ARRAY8(s, reg);\ - } - - #define GEMM_STORE_C_X(v, off, buf) {\ - STORE_BUF_ARRAY8(v, off, buf);\ - } - - #define GEMM_NT_LOAD_B(v, off, str, buf) {\ - READ_BUF(v[0], off, buf);\ - READ_BUF(v[1], off + str, buf);\ - READ_BUF(v[2], off + str * 2, buf);\ - READ_BUF(v[3], off + str * 3, buf);\ - READ_BUF(v[4], off + str * 4, buf);\ - READ_BUF(v[5], off + str * 5, buf);\ - READ_BUF(v[6], off + str * 6, buf);\ - READ_BUF(v[7], off + str * 7, buf);\ - } -#elif(LN == 9) - #define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) {\ - V[0] = vload4(off, buf);\ - V[1] = vload4(off + str, buf);\ - V[2] = vload4(off + str * 2, buf);\ - V[3] = vload4(off + str * 3, buf);\ - V[4] = vload4(off + str * 4, buf);\ - V[5] = vload4(off + str * 5, buf);\ - V[6] = vload4(off + str * 6, buf);\ - V[7] = vload4(off + str * 7, buf);\ - V[8] = vload4(off + str * 8, buf);\ - } -#elif(LN == 10) - #define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) {\ - V[0] = vload4(off, buf);\ - V[1] = vload4(off + str, buf);\ - V[2] = vload4(off + str * 2, buf);\ - V[3] = vload4(off + str * 3, buf);\ - V[4] = vload4(off + str * 4, buf);\ - V[5] = vload4(off + str * 5, buf);\ - V[6] = vload4(off + str * 6, buf);\ - V[7] = vload4(off + str * 7, buf);\ - V[8] = vload4(off + str * 8, buf);\ - V[9] = vload4(off + str * 9, buf);\ - } -#elif(LN == 11) - #define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) {\ - V[0] = vload4(off, buf);\ - V[1] = vload4(off + str, buf);\ - V[2] = vload4(off + str * 2, buf);\ - V[3] = vload4(off + str * 3, buf);\ - V[4] = vload4(off + str * 4, buf);\ - V[5] = vload4(off + str * 5, buf);\ - V[6] = vload4(off + str * 6, buf);\ - V[7] = vload4(off + str * 7, buf);\ - V[8] = vload4(off + str * 8, buf);\ - V[9] = vload4(off + str * 9, buf);\ - V[10] = vload4(off + str * 10, buf);\ - } -#elif(LN == 12) - #define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) {\ - V[0] = vload4(off, buf);\ - V[1] = vload4(off + str, buf);\ - V[2] = vload4(off + str * 2, buf);\ - V[3] = vload4(off + str * 3, buf);\ - V[4] = vload4(off + str * 4, buf);\ - V[5] = vload4(off + str * 5, buf);\ - V[6] = vload4(off + str * 6, buf);\ - V[7] = vload4(off + str * 7, buf);\ - V[8] = vload4(off + str * 8, buf);\ - V[9] = vload4(off + str * 9, buf);\ - V[10] = vload4(off + str * 10, buf);\ - V[11] = vload4(off + str * 11, buf);\ - } -#endif - - -/* - * GEMM A x B = C - */ -#if (LM == 1) - #define GEMM_LOAD_A(v, off, buf) {\ -
LOAD_BUF_ARRAY1(v, off, buf);\ - } - - #define GEMM_SET_C_BIAS(v, reg) {\ - GEMM_SET_C_BIAS_X(v[0], reg[0]);\ - } - - #define GEMM_SET_C_ZERO(reg) {\ - GEMM_SET_C_BIAS_X(0, reg[0]);\ - } - - #define GEMM_CALCORE(a, b, c) {\ - GEMM_CALCORE_X(a[0], b, c[0]);\ - } - - #define GEMM_MUL_C(s, reg) {\ - GEMM_MUL_C_X(s, reg[0]);\ - } - - #define GEMM_STORE_C(v, off, str, buf) {\ - GEMM_STORE_C_X(v[0], off, buf);\ - } - - #define GEMM_NT_LOAD_A(v, off, str, buf) {\ - READ_BUF(v[0], off, buf);\ - } -#elif (LM == 2) - #define GEMM_LOAD_A(v, off, buf) {\ - LOAD_BUF_ARRAY2(v, off, buf);\ - } - - #define GEMM_SET_C_BIAS(v, reg) {\ - GEMM_SET_C_BIAS_X(v[0], reg[0]);\ - GEMM_SET_C_BIAS_X(v[1], reg[1]);\ - } - - #define GEMM_SET_C_ZERO(reg) {\ - GEMM_SET_C_BIAS_X(0, reg[0]);\ - GEMM_SET_C_BIAS_X(0, reg[1]);\ - } - - #define GEMM_CALCORE(a, b, c) {\ - GEMM_CALCORE_X(a[0], b, c[0]);\ - GEMM_CALCORE_X(a[1], b, c[1]);\ - } - - #define GEMM_MUL_C(s, reg) {\ - GEMM_MUL_C_X(s, reg[0]);\ - GEMM_MUL_C_X(s, reg[1]);\ - } - - #define GEMM_STORE_C(v, off, str, buf) {\ - GEMM_STORE_C_X(v[0], off, buf);\ - GEMM_STORE_C_X(v[1], off + str, buf);\ - } - - #define GEMM_NT_LOAD_A(v, off, str, buf) {\ - READ_BUF(v[0], off, buf);\ - READ_BUF(v[1], off + str, buf);\ - } -#elif (LM == 3) - #define GEMM_LOAD_A(v, off, buf) {\ - LOAD_BUF_ARRAY3(v, off, buf);\ - } - - #define GEMM_SET_C_BIAS(v, reg) {\ - GEMM_SET_C_BIAS_X(v[0], reg[0]);\ - GEMM_SET_C_BIAS_X(v[1], reg[1]);\ - GEMM_SET_C_BIAS_X(v[2], reg[2]);\ - } - - #define GEMM_SET_C_ZERO(reg) {\ - GEMM_SET_C_BIAS_X(0, reg[0]);\ - GEMM_SET_C_BIAS_X(0, reg[1]);\ - GEMM_SET_C_BIAS_X(0, reg[2]);\ - } - - #define GEMM_CALCORE(a, b, c) {\ - GEMM_CALCORE_X(a[0], b, c[0]);\ - GEMM_CALCORE_X(a[1], b, c[1]);\ - GEMM_CALCORE_X(a[2], b, c[2]);\ - } - - #define GEMM_MUL_C(s, reg) {\ - GEMM_MUL_C_X(s, reg[0]);\ - GEMM_MUL_C_X(s, reg[1]);\ - GEMM_MUL_C_X(s, reg[2]);\ - } - - #define GEMM_STORE_C(v, off, str, buf) {\ - GEMM_STORE_C_X(v[0], off, buf);\ - GEMM_STORE_C_X(v[1], off + str, buf);\ - GEMM_STORE_C_X(v[2], off + str * 2, buf);\ - } - - #define GEMM_NT_LOAD_A(v, off, str, buf) {\ - READ_BUF(v[0], off, buf);\ - READ_BUF(v[1], off + str, buf);\ - READ_BUF(v[2], off + str * 2, buf);\ - } -#elif (LM == 4) - #define GEMM_LOAD_A(v, off, buf) {\ - LOAD_BUF_ARRAY4(v, off, buf);\ - } - - #define GEMM_SET_C_BIAS(v, reg) {\ - GEMM_SET_C_BIAS_X(v[0], reg[0]);\ - GEMM_SET_C_BIAS_X(v[1], reg[1]);\ - GEMM_SET_C_BIAS_X(v[2], reg[2]);\ - GEMM_SET_C_BIAS_X(v[3], reg[3]);\ - } - - #define GEMM_SET_C_ZERO(reg) {\ - GEMM_SET_C_BIAS_X(0, reg[0]);\ - GEMM_SET_C_BIAS_X(0, reg[1]);\ - GEMM_SET_C_BIAS_X(0, reg[2]);\ - GEMM_SET_C_BIAS_X(0, reg[3]);\ - } - - #define GEMM_CALCORE(a, b, c) {\ - GEMM_CALCORE_X(a[0], b, c[0]);\ - GEMM_CALCORE_X(a[1], b, c[1]);\ - GEMM_CALCORE_X(a[2], b, c[2]);\ - GEMM_CALCORE_X(a[3], b, c[3]);\ - } - - #define GEMM_MUL_C(s, reg) {\ - GEMM_MUL_C_X(s, reg[0]);\ - GEMM_MUL_C_X(s, reg[1]);\ - GEMM_MUL_C_X(s, reg[2]);\ - GEMM_MUL_C_X(s, reg[3]);\ - } - - #define GEMM_STORE_C(v, off, str, buf) {\ - GEMM_STORE_C_X(v[0], off, buf);\ - GEMM_STORE_C_X(v[1], off + str, buf);\ - GEMM_STORE_C_X(v[2], off + str * 2, buf);\ - GEMM_STORE_C_X(v[3], off + str * 3, buf);\ - } - - #define GEMM_NT_LOAD_A(v, off, str, buf) {\ - READ_BUF(v[0], off, buf);\ - READ_BUF(v[1], off + str, buf);\ - READ_BUF(v[2], off + str * 2, buf);\ - READ_BUF(v[3], off + str * 3, buf);\ - } -#elif (LM == 5) - #define GEMM_LOAD_A(v,
off, buf) {\ - LOAD_BUF_ARRAY5(v, off, buf);\ - } - - #define GEMM_SET_C_BIAS(v, reg) {\ - GEMM_SET_C_BIAS_X(v[0], reg[0]);\ - GEMM_SET_C_BIAS_X(v[1], reg[1]);\ - GEMM_SET_C_BIAS_X(v[2], reg[2]);\ - GEMM_SET_C_BIAS_X(v[3], reg[3]);\ - GEMM_SET_C_BIAS_X(v[4], reg[4]);\ - } - - #define GEMM_SET_C_ZERO(reg) {\ - GEMM_SET_C_BIAS_X(0, reg[0]);\ - GEMM_SET_C_BIAS_X(0, reg[1]);\ - GEMM_SET_C_BIAS_X(0, reg[2]);\ - GEMM_SET_C_BIAS_X(0, reg[3]);\ - GEMM_SET_C_BIAS_X(0, reg[4]);\ - } - - #define GEMM_CALCORE(a, b, c) {\ - GEMM_CALCORE_X(a[0], b, c[0]);\ - GEMM_CALCORE_X(a[1], b, c[1]);\ - GEMM_CALCORE_X(a[2], b, c[2]);\ - GEMM_CALCORE_X(a[3], b, c[3]);\ - GEMM_CALCORE_X(a[4], b, c[4]);\ - } - - #define GEMM_MUL_C(s, reg) {\ - GEMM_MUL_C_X(s, reg[0]);\ - GEMM_MUL_C_X(s, reg[1]);\ - GEMM_MUL_C_X(s, reg[2]);\ - GEMM_MUL_C_X(s, reg[3]);\ - GEMM_MUL_C_X(s, reg[4]);\ - } - - #define GEMM_STORE_C(v, off, str, buf) {\ - GEMM_STORE_C_X(v[0], off, buf);\ - GEMM_STORE_C_X(v[1], off + str, buf);\ - GEMM_STORE_C_X(v[2], off + str * 2, buf);\ - GEMM_STORE_C_X(v[3], off + str * 3, buf);\ - GEMM_STORE_C_X(v[4], off + str * 4, buf);\ - } - - #define GEMM_NT_LOAD_A(v, off, str, buf) {\ - READ_BUF(v[0], off, buf);\ - READ_BUF(v[1], off + str, buf);\ - READ_BUF(v[2], off + str * 2, buf);\ - READ_BUF(v[3], off + str * 3, buf);\ - READ_BUF(v[4], off + str * 4, buf);\ - } -#elif (LM == 6) - #define GEMM_LOAD_A(v, off, buf) {\ - LOAD_BUF_ARRAY6(v, off, buf);\ - } - - #define GEMM_SET_C_BIAS(v, reg) {\ - GEMM_SET_C_BIAS_X(v[0], reg[0]);\ - GEMM_SET_C_BIAS_X(v[1], reg[1]);\ - GEMM_SET_C_BIAS_X(v[2], reg[2]);\ - GEMM_SET_C_BIAS_X(v[3], reg[3]);\ - GEMM_SET_C_BIAS_X(v[4], reg[4]);\ - GEMM_SET_C_BIAS_X(v[5], reg[5]);\ - } - - #define GEMM_SET_C_ZERO(reg) {\ - GEMM_SET_C_BIAS_X(0, reg[0]);\ - GEMM_SET_C_BIAS_X(0, reg[1]);\ - GEMM_SET_C_BIAS_X(0, reg[2]);\ - GEMM_SET_C_BIAS_X(0, reg[3]);\ - GEMM_SET_C_BIAS_X(0, reg[4]);\ - GEMM_SET_C_BIAS_X(0, reg[5]);\ - } - - #define GEMM_CALCORE(a, b, c) {\ - GEMM_CALCORE_X(a[0], b, c[0]);\ - GEMM_CALCORE_X(a[1], b, c[1]);\ - GEMM_CALCORE_X(a[2], b, c[2]);\ - GEMM_CALCORE_X(a[3], b, c[3]);\ - GEMM_CALCORE_X(a[4], b, c[4]);\ - GEMM_CALCORE_X(a[5], b, c[5]);\ - } - - #define GEMM_MUL_C(s, reg) {\ - GEMM_MUL_C_X(s, reg[0]);\ - GEMM_MUL_C_X(s, reg[1]);\ - GEMM_MUL_C_X(s, reg[2]);\ - GEMM_MUL_C_X(s, reg[3]);\ - GEMM_MUL_C_X(s, reg[4]);\ - GEMM_MUL_C_X(s, reg[5]);\ - } - - #define GEMM_STORE_C(v, off, str, buf) {\ - GEMM_STORE_C_X(v[0], off, buf);\ - GEMM_STORE_C_X(v[1], off + str, buf);\ - GEMM_STORE_C_X(v[2], off + str * 2, buf);\ - GEMM_STORE_C_X(v[3], off + str * 3, buf);\ - GEMM_STORE_C_X(v[4], off + str * 4, buf);\ - GEMM_STORE_C_X(v[5], off + str * 5, buf);\ - } - - #define GEMM_NT_LOAD_A(v, off, str, buf) {\ - READ_BUF(v[0], off, buf);\ - READ_BUF(v[1], off + str, buf);\ - READ_BUF(v[2], off + str * 2, buf);\ - READ_BUF(v[3], off + str * 3, buf);\ - READ_BUF(v[4], off + str * 4, buf);\ - READ_BUF(v[5], off + str * 5, buf);\ - } -#elif (LM == 7) - #define GEMM_LOAD_A(v, off, buf) {\ - LOAD_BUF_ARRAY7(v, off, buf);\ - } - - #define GEMM_SET_C_BIAS(v, reg) {\ - GEMM_SET_C_BIAS_X(v[0], reg[0]);\ - GEMM_SET_C_BIAS_X(v[1], reg[1]);\ - GEMM_SET_C_BIAS_X(v[2], reg[2]);\ - GEMM_SET_C_BIAS_X(v[3], reg[3]);\ - GEMM_SET_C_BIAS_X(v[4], reg[4]);\ - GEMM_SET_C_BIAS_X(v[5], reg[5]);\ - GEMM_SET_C_BIAS_X(v[6], reg[6]);\ - } - - #define GEMM_SET_C_ZERO(reg) {\ - GEMM_SET_C_BIAS_X(0, reg[0]);\ - GEMM_SET_C_BIAS_X(0, reg[1]);\ - GEMM_SET_C_BIAS_X(0, reg[2]);\ - GEMM_SET_C_BIAS_X(0, reg[3]);\ - 
GEMM_SET_C_BIAS_X(0, reg[4]);\ - GEMM_SET_C_BIAS_X(0, reg[5]);\ - GEMM_SET_C_BIAS_X(0, reg[6]);\ - } - - #define GEMM_CALCORE(a, b, c) {\ - GEMM_CALCORE_X(a[0], b, c[0]);\ - GEMM_CALCORE_X(a[1], b, c[1]);\ - GEMM_CALCORE_X(a[2], b, c[2]);\ - GEMM_CALCORE_X(a[3], b, c[3]);\ - GEMM_CALCORE_X(a[4], b, c[4]);\ - GEMM_CALCORE_X(a[5], b, c[5]);\ - GEMM_CALCORE_X(a[6], b, c[6]);\ - } - - #define GEMM_MUL_C(s, reg) {\ - GEMM_MUL_C_X(s, reg[0]);\ - GEMM_MUL_C_X(s, reg[1]);\ - GEMM_MUL_C_X(s, reg[2]);\ - GEMM_MUL_C_X(s, reg[3]);\ - GEMM_MUL_C_X(s, reg[4]);\ - GEMM_MUL_C_X(s, reg[5]);\ - GEMM_MUL_C_X(s, reg[6]);\ - } - - #define GEMM_STORE_C(v, off, str, buf) {\ - GEMM_STORE_C_X(v[0], off, buf);\ - GEMM_STORE_C_X(v[1], off + str, buf);\ - GEMM_STORE_C_X(v[2], off + str * 2, buf);\ - GEMM_STORE_C_X(v[3], off + str * 3, buf);\ - GEMM_STORE_C_X(v[4], off + str * 4, buf);\ - GEMM_STORE_C_X(v[5], off + str * 5, buf);\ - GEMM_STORE_C_X(v[6], off + str * 6, buf);\ - } - - #define GEMM_NT_LOAD_A(v, off, str, buf) {\ - READ_BUF(v[0], off, buf);\ - READ_BUF(v[1], off + str, buf);\ - READ_BUF(v[2], off + str * 2, buf);\ - READ_BUF(v[3], off + str * 3, buf);\ - READ_BUF(v[4], off + str * 4, buf);\ - READ_BUF(v[5], off + str * 5, buf);\ - READ_BUF(v[6], off + str * 6, buf);\ - } -#elif (LM == 8) - #define GEMM_LOAD_A(v, off, buf) {\ - LOAD_BUF_ARRAY8(v, off, buf);\ - } - - #define GEMM_SET_C_BIAS(v, reg) {\ - GEMM_SET_C_BIAS_X(v[0], reg[0]);\ - GEMM_SET_C_BIAS_X(v[1], reg[1]);\ - GEMM_SET_C_BIAS_X(v[2], reg[2]);\ - GEMM_SET_C_BIAS_X(v[3], reg[3]);\ - GEMM_SET_C_BIAS_X(v[4], reg[4]);\ - GEMM_SET_C_BIAS_X(v[5], reg[5]);\ - GEMM_SET_C_BIAS_X(v[6], reg[6]);\ - GEMM_SET_C_BIAS_X(v[7], reg[7]);\ - } - - #define GEMM_SET_C_ZERO(reg) {\ - GEMM_SET_C_BIAS_X(0, reg[0]);\ - GEMM_SET_C_BIAS_X(0, reg[1]);\ - GEMM_SET_C_BIAS_X(0, reg[2]);\ - GEMM_SET_C_BIAS_X(0, reg[3]);\ - GEMM_SET_C_BIAS_X(0, reg[4]);\ - GEMM_SET_C_BIAS_X(0, reg[5]);\ - GEMM_SET_C_BIAS_X(0, reg[6]);\ - GEMM_SET_C_BIAS_X(0, reg[7]);\ - } - - #define GEMM_CALCORE(a, b, c) {\ - GEMM_CALCORE_X(a[0], b, c[0]);\ - GEMM_CALCORE_X(a[1], b, c[1]);\ - GEMM_CALCORE_X(a[2], b, c[2]);\ - GEMM_CALCORE_X(a[3], b, c[3]);\ - GEMM_CALCORE_X(a[4], b, c[4]);\ - GEMM_CALCORE_X(a[5], b, c[5]);\ - GEMM_CALCORE_X(a[6], b, c[6]);\ - GEMM_CALCORE_X(a[7], b, c[7]);\ - } - - #define GEMM_MUL_C(s, reg) {\ - GEMM_MUL_C_X(s, reg[0]);\ - GEMM_MUL_C_X(s, reg[1]);\ - GEMM_MUL_C_X(s, reg[2]);\ - GEMM_MUL_C_X(s, reg[3]);\ - GEMM_MUL_C_X(s, reg[4]);\ - GEMM_MUL_C_X(s, reg[5]);\ - GEMM_MUL_C_X(s, reg[6]);\ - GEMM_MUL_C_X(s, reg[7]);\ - } - - #define GEMM_STORE_C(v, off, str, buf) {\ - GEMM_STORE_C_X(v[0], off, buf);\ - GEMM_STORE_C_X(v[1], off + str, buf);\ - GEMM_STORE_C_X(v[2], off + str * 2, buf);\ - GEMM_STORE_C_X(v[3], off + str * 3, buf);\ - GEMM_STORE_C_X(v[4], off + str * 4, buf);\ - GEMM_STORE_C_X(v[5], off + str * 5, buf);\ - GEMM_STORE_C_X(v[6], off + str * 6, buf);\ - GEMM_STORE_C_X(v[7], off + str * 7, buf);\ - } - - #define GEMM_NT_LOAD_A(v, off, str, buf) {\ - READ_BUF(v[0], off, buf);\ - READ_BUF(v[1], off + str, buf);\ - READ_BUF(v[2], off + str * 2, buf);\ - READ_BUF(v[3], off + str * 3, buf);\ - READ_BUF(v[4], off + str * 4, buf);\ - READ_BUF(v[5], off + str * 5, buf);\ - READ_BUF(v[6], off + str * 6, buf);\ - READ_BUF(v[7], off + str * 7, buf);\ - } -#endif - -/* - * UPDATE VALUE OF REG - */ -#if(UN == 0) - #define UPDATE_REG(A) {\ - } -#elif(UN == 1) - #define UPDATE_REG(A) {\ - A[0] = A[1];\ - } -#elif(UN == 2) - #define UPDATE_REG(A) {\ - A[0] = A[1];\ - A[1] = A[2];\ - } 
-#elif(UN == 3) - #define UPDATE_REG(A) {\ - A[0] = A[1];\ - A[1] = A[2];\ - A[2] = A[3];\ - } -#elif(UN == 4) - #define UPDATE_REG(A) {\ - A[0] = A[1];\ - A[1] = A[2];\ - A[2] = A[3];\ - A[3] = A[4];\ - } -#elif(UN == 5) - #define UPDATE_REG(A) {\ - A[0] = A[1];\ - A[1] = A[2];\ - A[2] = A[3];\ - A[3] = A[4];\ - A[4] = A[5];\ - } -#elif(UN == 6) - #define UPDATE_REG(A) {\ - A[0] = A[1];\ - A[1] = A[2];\ - A[2] = A[3];\ - A[3] = A[4];\ - A[4] = A[5];\ - A[5] = A[6];\ - } -#elif(UN == 7) - #define UPDATE_REG(A) {\ - A[0] = A[1];\ - A[1] = A[2];\ - A[2] = A[3];\ - A[3] = A[4];\ - A[4] = A[5];\ - A[5] = A[6];\ - A[6] = A[7];\ - } -#elif(UN == 8) - #define UPDATE_REG(A) {\ - A[0] = A[1];\ - A[1] = A[2];\ - A[2] = A[3];\ - A[3] = A[4];\ - A[4] = A[5];\ - A[5] = A[6];\ - A[6] = A[7];\ - A[7] = A[8];\ - } -#elif(UN == 9) - #define UPDATE_REG(A) {\ - A[0] = A[1];\ - A[1] = A[2];\ - A[2] = A[3];\ - A[3] = A[4];\ - A[4] = A[5];\ - A[5] = A[6];\ - A[6] = A[7];\ - A[7] = A[8];\ - A[8] = A[9];\ - } -#elif(UN == 10) - #define UPDATE_REG(A) {\ - A[0] = A[1];\ - A[1] = A[2];\ - A[2] = A[3];\ - A[3] = A[4];\ - A[4] = A[5];\ - A[5] = A[6];\ - A[6] = A[7];\ - A[7] = A[8];\ - A[8] = A[9];\ - A[9] = A[10];\ - } -#elif(UN == 11) - #define UPDATE_REG(A) {\ - A[0] = A[1];\ - A[1] = A[2];\ - A[2] = A[3];\ - A[3] = A[4];\ - A[4] = A[5];\ - A[5] = A[6];\ - A[6] = A[7];\ - A[7] = A[8];\ - A[8] = A[9];\ - A[9] = A[10];\ - A[10] = A[11];\ - } -#elif(UN == 12) - #define UPDATE_REG(A) {\ - A[0] = A[1];\ - A[1] = A[2];\ - A[2] = A[3];\ - A[3] = A[4];\ - A[4] = A[5];\ - A[5] = A[6];\ - A[6] = A[7];\ - A[7] = A[8];\ - A[8] = A[9];\ - A[9] = A[10];\ - A[10] = A[11];\ - A[11] = A[12];\ - } -#endif - -/* - * Direct convolution calculate core - * Depthwise calculate core - */ -#if(ON == 1) - #define DIRECT_CONV_CAL_CORE_S1(A, B, C) {\ - DOT_A4B16C4(A[0], B, C[0]);\ - } - #define DIRECT_CONV_CAL_CORE_S2(A, B, C) {\ - DOT_A4B16C4(A[0], B, C[0]);\ - } - #define DEPTHWISE_CAL_CORE_S1(A, B, C) {\ - DOT_A4B4C4(A[0], B, C[0]);\ - } - #define DEPTHWISE_CAL_CORE_S2(A, B, C) {\ - DOT_A4B4C4(A[0], B, C[0]);\ - } -#elif(ON == 2) - #define DIRECT_CONV_CAL_CORE_S1(A, B, C) {\ - DOT_A4B16C4(A[0], B, C[0]);\ - DOT_A4B16C4(A[1], B, C[1]);\ - } - #define DIRECT_CONV_CAL_CORE_S2(A, B, C) {\ - DOT_A4B16C4(A[0], B, C[0]);\ - DOT_A4B16C4(A[2], B, C[1]);\ - } - #define DEPTHWISE_CAL_CORE_S1(A, B, C) {\ - DOT_A4B4C4(A[0], B, C[0]);\ - DOT_A4B4C4(A[1], B, C[1]);\ - } - #define DEPTHWISE_CAL_CORE_S2(A, B, C) {\ - DOT_A4B4C4(A[0], B, C[0]);\ - DOT_A4B4C4(A[2], B, C[1]);\ - } -#elif(ON == 3) - #define DIRECT_CONV_CAL_CORE_S1(A, B, C) {\ - DOT_A4B16C4(A[0], B, C[0]);\ - DOT_A4B16C4(A[1], B, C[1]);\ - DOT_A4B16C4(A[2], B, C[2]);\ - } - #define DIRECT_CONV_CAL_CORE_S2(A, B, C) {\ - DOT_A4B16C4(A[0], B, C[0]);\ - DOT_A4B16C4(A[2], B, C[1]);\ - DOT_A4B16C4(A[4], B, C[2]);\ - } - #define DEPTHWISE_CAL_CORE_S1(A, B, C) {\ - DOT_A4B4C4(A[0], B, C[0]);\ - DOT_A4B4C4(A[1], B, C[1]);\ - DOT_A4B4C4(A[2], B, C[2]);\ - } - #define DEPTHWISE_CAL_CORE_S2(A, B, C) {\ - DOT_A4B4C4(A[0], B, C[0]);\ - DOT_A4B4C4(A[2], B, C[1]);\ - DOT_A4B4C4(A[4], B, C[2]);\ - } -#elif(ON == 4) - #define DIRECT_CONV_CAL_CORE_S1(A, B, C) {\ - DOT_A4B16C4(A[0], B, C[0]);\ - DOT_A4B16C4(A[1], B, C[1]);\ - DOT_A4B16C4(A[2], B, C[2]);\ - DOT_A4B16C4(A[3], B, C[3]);\ - } - #define DIRECT_CONV_CAL_CORE_S2(A, B, C) {\ - DOT_A4B16C4(A[0], B, C[0]);\ - DOT_A4B16C4(A[2], B, C[1]);\ - DOT_A4B16C4(A[4], B, C[2]);\ - DOT_A4B16C4(A[6], B, C[3]);\ - } - #define DEPTHWISE_CAL_CORE_S1(A, B, C) {\ -
DOT_A4B4C4(A[0], B, C[0]);\ - DOT_A4B4C4(A[1], B, C[1]);\ - DOT_A4B4C4(A[2], B, C[2]);\ - DOT_A4B4C4(A[3], B, C[3]);\ - } - #define DEPTHWISE_CAL_CORE_S2(A, B, C) {\ - DOT_A4B4C4(A[0], B, C[0]);\ - DOT_A4B4C4(A[2], B, C[1]);\ - DOT_A4B4C4(A[4], B, C[2]);\ - DOT_A4B4C4(A[6], B, C[3]);\ - } -#elif(ON == 5) - #define DIRECT_CONV_CAL_CORE_S1(A, B, C) {\ - DOT_A4B16C4(A[0], B, C[0]);\ - DOT_A4B16C4(A[1], B, C[1]);\ - DOT_A4B16C4(A[2], B, C[2]);\ - DOT_A4B16C4(A[3], B, C[3]);\ - DOT_A4B16C4(A[4], B, C[4]);\ - } - #define DEPTHWISE_CAL_CORE_S1(A, B, C) {\ - DOT_A4B4C4(A[0], B, C[0]);\ - DOT_A4B4C4(A[1], B, C[1]);\ - DOT_A4B4C4(A[2], B, C[2]);\ - DOT_A4B4C4(A[3], B, C[3]);\ - DOT_A4B4C4(A[4], B, C[4]);\ - } -#elif(ON == 6) - #define DIRECT_CONV_CAL_CORE_S1(A, B, C) {\ - DOT_A4B16C4(A[0], B, C[0]);\ - DOT_A4B16C4(A[1], B, C[1]);\ - DOT_A4B16C4(A[2], B, C[2]);\ - DOT_A4B16C4(A[3], B, C[3]);\ - DOT_A4B16C4(A[4], B, C[4]);\ - DOT_A4B16C4(A[5], B, C[5]);\ - } - #define DEPTHWISE_CAL_CORE_S1(A, B, C) {\ - DOT_A4B4C4(A[0], B, C[0]);\ - DOT_A4B4C4(A[1], B, C[1]);\ - DOT_A4B4C4(A[2], B, C[2]);\ - DOT_A4B4C4(A[3], B, C[3]);\ - DOT_A4B4C4(A[4], B, C[4]);\ - DOT_A4B4C4(A[5], B, C[5]);\ - } -#elif(ON == 7) - #define DIRECT_CONV_CAL_CORE_S1(A, B, C) {\ - DOT_A4B16C4(A[0], B, C[0]);\ - DOT_A4B16C4(A[1], B, C[1]);\ - DOT_A4B16C4(A[2], B, C[2]);\ - DOT_A4B16C4(A[3], B, C[3]);\ - DOT_A4B16C4(A[4], B, C[4]);\ - DOT_A4B16C4(A[5], B, C[5]);\ - DOT_A4B16C4(A[6], B, C[6]);\ - } - #define DEPTHWISE_CAL_CORE_S1(A, B, C) {\ - DOT_A4B4C4(A[0], B, C[0]);\ - DOT_A4B4C4(A[1], B, C[1]);\ - DOT_A4B4C4(A[2], B, C[2]);\ - DOT_A4B4C4(A[3], B, C[3]);\ - DOT_A4B4C4(A[4], B, C[4]);\ - DOT_A4B4C4(A[5], B, C[5]);\ - DOT_A4B4C4(A[6], B, C[6]);\ - } -#elif(ON == 8) - #define DIRECT_CONV_CAL_CORE_S1(A, B, C) {\ - DOT_A4B16C4(A[0], B, C[0]);\ - DOT_A4B16C4(A[1], B, C[1]);\ - DOT_A4B16C4(A[2], B, C[2]);\ - DOT_A4B16C4(A[3], B, C[3]);\ - DOT_A4B16C4(A[4], B, C[4]);\ - DOT_A4B16C4(A[5], B, C[5]);\ - DOT_A4B16C4(A[6], B, C[6]);\ - DOT_A4B16C4(A[7], B, C[7]);\ - } - #define DEPTHWISE_CAL_CORE_S1(A, B, C) {\ - DOT_A4B4C4(A[0], B, C[0]);\ - DOT_A4B4C4(A[1], B, C[1]);\ - DOT_A4B4C4(A[2], B, C[2]);\ - DOT_A4B4C4(A[3], B, C[3]);\ - DOT_A4B4C4(A[4], B, C[4]);\ - DOT_A4B4C4(A[5], B, C[5]);\ - DOT_A4B4C4(A[6], B, C[6]);\ - DOT_A4B4C4(A[7], B, C[7]);\ - } -#endif - -/* - * STORE_OUTPUT_BUF_ARRAY_V4 WITH ACTIVATION - */ -#if(ON == 1) - #define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) {\ - ACTIVATION_V4(V[0]);\ - vstore4(V[0], off, buf);\ - } - - #define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) {\ - ACTIVATION_V4(V[0]);\ - buf[off] = V[0].x;\ - buf[off + str_hw] = V[0].y;\ - buf[off + str_hw * 2] = V[0].z;\ - buf[off + str_hw * 3] = V[0].w;\ - } - - #define STORE_OUTPUT_BUF_ARRAY_ALIGN(val, off, str, out) {\ - out[off] = val[0];\ - } - - #define SET_REG_ARRAY(v, reg) {\ - SET_REG_ARRAY1(v, reg);\ - } -#elif(ON == 2) - #define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) {\ - ACTIVATION_V4(V[0]);\ - ACTIVATION_V4(V[1]);\ - vstore4(V[0], off, buf);\ - if(id + 1 < bd) vstore4(V[1], off + str, buf);\ - } - - #define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) {\ - ACTIVATION_V4(V[0]);\ - ACTIVATION_V4(V[1]);\ - buf[off] = V[0].x;\ - buf[off + str_hw] = V[0].y;\ - buf[off + str_hw * 2] = V[0].z;\ - buf[off + str_hw * 3] = V[0].w;\ - if(id + 1 < bd) {\ - buf[off + str_h] = V[1].x;\ - buf[off + str_h + str_hw ] = V[1].y;\ - buf[off + str_h + str_hw * 2] = V[1].z;\ - buf[off + str_h + str_hw * 3] = 
V[1].w;\ - }\ - } - - #define STORE_OUTPUT_BUF_ARRAY_ALIGN(val, off, str, out) {\ - out[off] = val[0];\ - out[off + str] = val[1];\ - } - - #define SET_REG_ARRAY(v, reg) {\ - SET_REG_ARRAY2(v, reg);\ - } -#elif(ON == 3) - #define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) {\ - ACTIVATION_V4(V[0]);\ - ACTIVATION_V4(V[1]);\ - ACTIVATION_V4(V[2]);\ - vstore4(V[0], off, buf);\ - if(id + 1 < bd) vstore4(V[1], off + str, buf);\ - if(id + 2 < bd) vstore4(V[2], off + str * 2, buf);\ - } - - #define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) {\ - ACTIVATION_V4(V[0]);\ - ACTIVATION_V4(V[1]);\ - ACTIVATION_V4(V[2]);\ - buf[off] = V[0].x;\ - buf[off + str_hw] = V[0].y;\ - buf[off + str_hw * 2] = V[0].z;\ - buf[off + str_hw * 3] = V[0].w;\ - if(id + 1 < bd) {\ - buf[off + str_h] = V[1].x;\ - buf[off + str_h + str_hw ] = V[1].y;\ - buf[off + str_h + str_hw * 2] = V[1].z;\ - buf[off + str_h + str_hw * 3] = V[1].w;\ - }\ - if(id + 2 < bd) {\ - buf[off + str_h * 2] = V[2].x;\ - buf[off + str_h * 2 + str_hw ] = V[2].y;\ - buf[off + str_h * 2 + str_hw * 2] = V[2].z;\ - buf[off + str_h * 2 + str_hw * 3] = V[2].w;\ - }\ - } - - #define STORE_OUTPUT_BUF_ARRAY_ALIGN(val, off, str, out) {\ - out[off] = val[0];\ - out[off + str] = val[1];\ - out[off + str * 2] = val[2];\ - } - - #define SET_REG_ARRAY(v, reg) {\ - SET_REG_ARRAY3(v, reg);\ - } -#elif(ON == 4) - #define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) {\ - ACTIVATION_V4(V[0]);\ - ACTIVATION_V4(V[1]);\ - ACTIVATION_V4(V[2]);\ - ACTIVATION_V4(V[3]);\ - vstore4(V[0], off, buf);\ - if(id + 1 < bd) vstore4(V[1], off + str, buf);\ - if(id + 2 < bd) vstore4(V[2], off + str * 2, buf);\ - if(id + 3 < bd) vstore4(V[3], off + str * 3, buf);\ - } - - #define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) {\ - ACTIVATION_V4(V[0]);\ - ACTIVATION_V4(V[1]);\ - ACTIVATION_V4(V[2]);\ - ACTIVATION_V4(V[3]);\ - buf[off] = V[0].x;\ - buf[off + str_hw] = V[0].y;\ - buf[off + str_hw * 2] = V[0].z;\ - buf[off + str_hw * 3] = V[0].w;\ - if(id + 1 < bd) {\ - buf[off + str_h] = V[1].x;\ - buf[off + str_h + str_hw ] = V[1].y;\ - buf[off + str_h + str_hw * 2] = V[1].z;\ - buf[off + str_h + str_hw * 3] = V[1].w;\ - }\ - if(id + 2 < bd) {\ - buf[off + str_h * 2] = V[2].x;\ - buf[off + str_h * 2 + str_hw ] = V[2].y;\ - buf[off + str_h * 2 + str_hw * 2] = V[2].z;\ - buf[off + str_h * 2 + str_hw * 3] = V[2].w;\ - }\ - if(id + 3 < bd) {\ - buf[off + str_h * 3] = V[3].x;\ - buf[off + str_h * 3 + str_hw ] = V[3].y;\ - buf[off + str_h * 3 + str_hw * 2] = V[3].z;\ - buf[off + str_h * 3 + str_hw * 3] = V[3].w;\ - }\ - } - - #define STORE_OUTPUT_BUF_ARRAY_ALIGN(val, off, str, out) {\ - out[off] = val[0];\ - out[off + str] = val[1];\ - out[off + str * 2] = val[2];\ - out[off + str * 3] = val[3];\ - } - - #define SET_REG_ARRAY(v, reg) {\ - SET_REG_ARRAY4(v, reg);\ - } -#elif(ON == 5) - #define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) {\ - ACTIVATION_V4(V[0]);\ - ACTIVATION_V4(V[1]);\ - ACTIVATION_V4(V[2]);\ - ACTIVATION_V4(V[3]);\ - ACTIVATION_V4(V[4]);\ - vstore4(V[0], off, buf);\ - if(id + 1 < bd) vstore4(V[1], off + str, buf);\ - if(id + 2 < bd) vstore4(V[2], off + str * 2, buf);\ - if(id + 3 < bd) vstore4(V[3], off + str * 3, buf);\ - if(id + 4 < bd) vstore4(V[4], off + str * 4, buf);\ - } - - #define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) {\ - ACTIVATION_V4(V[0]);\ - ACTIVATION_V4(V[1]);\ - ACTIVATION_V4(V[2]);\ - ACTIVATION_V4(V[3]);\ - ACTIVATION_V4(V[4]);\ - buf[off] = V[0].x;\ - buf[off + 
str_hw] = V[0].y;\ - buf[off + str_hw * 2] = V[0].z;\ - buf[off + str_hw * 3] = V[0].w;\ - if(id + 1 < bd) {\ - buf[off + str_h] = V[1].x;\ - buf[off + str_h + str_hw ] = V[1].y;\ - buf[off + str_h + str_hw * 2] = V[1].z;\ - buf[off + str_h + str_hw * 3] = V[1].w;\ - }\ - if(id + 2 < bd) {\ - buf[off + str_h * 2] = V[2].x;\ - buf[off + str_h * 2 + str_hw ] = V[2].y;\ - buf[off + str_h * 2 + str_hw * 2] = V[2].z;\ - buf[off + str_h * 2 + str_hw * 3] = V[2].w;\ - }\ - if(id + 3 < bd) {\ - buf[off + str_h * 3] = V[3].x;\ - buf[off + str_h * 3 + str_hw ] = V[3].y;\ - buf[off + str_h * 3 + str_hw * 2] = V[3].z;\ - buf[off + str_h * 3 + str_hw * 3] = V[3].w;\ - }\ - if(id + 4 < bd) {\ - buf[off + str_h * 4] = V[4].x;\ - buf[off + str_h * 4 + str_hw ] = V[4].y;\ - buf[off + str_h * 4 + str_hw * 2] = V[4].z;\ - buf[off + str_h * 4 + str_hw * 3] = V[4].w;\ - }\ - } - - #define SET_REG_ARRAY(v, reg) {\ - SET_REG_ARRAY5(v, reg);\ - } -#elif(ON == 6) - #define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) {\ - ACTIVATION_V4(V[0]);\ - ACTIVATION_V4(V[1]);\ - ACTIVATION_V4(V[2]);\ - ACTIVATION_V4(V[3]);\ - ACTIVATION_V4(V[4]);\ - ACTIVATION_V4(V[5]);\ - vstore4(V[0], off, buf);\ - if(id + 1 < bd) vstore4(V[1], off + str, buf);\ - if(id + 2 < bd) vstore4(V[2], off + str * 2, buf);\ - if(id + 3 < bd) vstore4(V[3], off + str * 3, buf);\ - if(id + 4 < bd) vstore4(V[4], off + str * 4, buf);\ - if(id + 5 < bd) vstore4(V[5], off + str * 5, buf);\ - } - - #define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) {\ - ACTIVATION_V4(V[0]);\ - ACTIVATION_V4(V[1]);\ - ACTIVATION_V4(V[2]);\ - ACTIVATION_V4(V[3]);\ - ACTIVATION_V4(V[4]);\ - ACTIVATION_V4(V[5]);\ - buf[off] = V[0].x;\ - buf[off + str_hw] = V[0].y;\ - buf[off + str_hw * 2] = V[0].z;\ - buf[off + str_hw * 3] = V[0].w;\ - if(id + 1 < bd) {\ - buf[off + str_h] = V[1].x;\ - buf[off + str_h + str_hw ] = V[1].y;\ - buf[off + str_h + str_hw * 2] = V[1].z;\ - buf[off + str_h + str_hw * 3] = V[1].w;\ - }\ - if(id + 2 < bd) {\ - buf[off + str_h * 2] = V[2].x;\ - buf[off + str_h * 2 + str_hw ] = V[2].y;\ - buf[off + str_h * 2 + str_hw * 2] = V[2].z;\ - buf[off + str_h * 2 + str_hw * 3] = V[2].w;\ - }\ - if(id + 3 < bd) {\ - buf[off + str_h * 3] = V[3].x;\ - buf[off + str_h * 3 + str_hw ] = V[3].y;\ - buf[off + str_h * 3 + str_hw * 2] = V[3].z;\ - buf[off + str_h * 3 + str_hw * 3] = V[3].w;\ - }\ - if(id + 4 < bd) {\ - buf[off + str_h * 4] = V[4].x;\ - buf[off + str_h * 4 + str_hw ] = V[4].y;\ - buf[off + str_h * 4 + str_hw * 2] = V[4].z;\ - buf[off + str_h * 4 + str_hw * 3] = V[4].w;\ - }\ - if(id + 5 < bd) {\ - buf[off + str_h * 5] = V[5].x;\ - buf[off + str_h * 5 + str_hw ] = V[5].y;\ - buf[off + str_h * 5 + str_hw * 2] = V[5].z;\ - buf[off + str_h * 5 + str_hw * 3] = V[5].w;\ - }\ - } - - #define SET_REG_ARRAY(v, reg) {\ - SET_REG_ARRAY6(v, reg);\ - } -#elif(ON == 7) - #define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) {\ - ACTIVATION_V4(V[0]);\ - ACTIVATION_V4(V[1]);\ - ACTIVATION_V4(V[2]);\ - ACTIVATION_V4(V[3]);\ - ACTIVATION_V4(V[4]);\ - ACTIVATION_V4(V[5]);\ - ACTIVATION_V4(V[6]);\ - vstore4(V[0], off, buf);\ - if(id + 1 < bd) vstore4(V[1], off + str, buf);\ - if(id + 2 < bd) vstore4(V[2], off + str * 2, buf);\ - if(id + 3 < bd) vstore4(V[3], off + str * 3, buf);\ - if(id + 4 < bd) vstore4(V[4], off + str * 4, buf);\ - if(id + 5 < bd) vstore4(V[5], off + str * 5, buf);\ - if(id + 6 < bd) vstore4(V[6], off + str * 6, buf);\ - } - - #define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) {\ - 
ACTIVATION_V4(V[0]);\ - ACTIVATION_V4(V[1]);\ - ACTIVATION_V4(V[2]);\ - ACTIVATION_V4(V[3]);\ - ACTIVATION_V4(V[4]);\ - ACTIVATION_V4(V[5]);\ - ACTIVATION_V4(V[6]);\ - buf[off] = V[0].x;\ - buf[off + str_hw] = V[0].y;\ - buf[off + str_hw * 2] = V[0].z;\ - buf[off + str_hw * 3] = V[0].w;\ - if(id + 1 < bd) {\ - buf[off + str_h] = V[1].x;\ - buf[off + str_h + str_hw ] = V[1].y;\ - buf[off + str_h + str_hw * 2] = V[1].z;\ - buf[off + str_h + str_hw * 3] = V[1].w;\ - }\ - if(id + 2 < bd) {\ - buf[off + str_h * 2] = V[2].x;\ - buf[off + str_h * 2 + str_hw ] = V[2].y;\ - buf[off + str_h * 2 + str_hw * 2] = V[2].z;\ - buf[off + str_h * 2 + str_hw * 3] = V[2].w;\ - }\ - if(id + 3 < bd) {\ - buf[off + str_h * 3] = V[3].x;\ - buf[off + str_h * 3 + str_hw ] = V[3].y;\ - buf[off + str_h * 3 + str_hw * 2] = V[3].z;\ - buf[off + str_h * 3 + str_hw * 3] = V[3].w;\ - }\ - if(id + 4 < bd) {\ - buf[off + str_h * 4] = V[4].x;\ - buf[off + str_h * 4 + str_hw ] = V[4].y;\ - buf[off + str_h * 4 + str_hw * 2] = V[4].z;\ - buf[off + str_h * 4 + str_hw * 3] = V[4].w;\ - }\ - if(id + 5 < bd) {\ - buf[off + str_h * 5] = V[5].x;\ - buf[off + str_h * 5 + str_hw ] = V[5].y;\ - buf[off + str_h * 5 + str_hw * 2] = V[5].z;\ - buf[off + str_h * 5 + str_hw * 3] = V[5].w;\ - }\ - if(id + 6 < bd) {\ - buf[off + str_h * 6] = V[6].x;\ - buf[off + str_h * 6 + str_hw ] = V[6].y;\ - buf[off + str_h * 6 + str_hw * 2] = V[6].z;\ - buf[off + str_h * 6 + str_hw * 3] = V[6].w;\ - }\ - } - - #define SET_REG_ARRAY(v, reg) {\ - SET_REG_ARRAY7(v, reg);\ - } -#elif(ON == 8) - #define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) {\ - ACTIVATION_V4(V[0]);\ - ACTIVATION_V4(V[1]);\ - ACTIVATION_V4(V[2]);\ - ACTIVATION_V4(V[3]);\ - ACTIVATION_V4(V[4]);\ - ACTIVATION_V4(V[5]);\ - ACTIVATION_V4(V[6]);\ - ACTIVATION_V4(V[7]);\ - vstore4(V[0], off, buf);\ - if(id + 1 < bd) vstore4(V[1], off + str, buf);\ - if(id + 2 < bd) vstore4(V[2], off + str * 2, buf);\ - if(id + 3 < bd) vstore4(V[3], off + str * 3, buf);\ - if(id + 4 < bd) vstore4(V[4], off + str * 4, buf);\ - if(id + 5 < bd) vstore4(V[5], off + str * 5, buf);\ - if(id + 6 < bd) vstore4(V[6], off + str * 6, buf);\ - if(id + 7 < bd) vstore4(V[7], off + str * 7, buf);\ - } - - #define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) {\ - ACTIVATION_V4(V[0]);\ - ACTIVATION_V4(V[1]);\ - ACTIVATION_V4(V[2]);\ - ACTIVATION_V4(V[3]);\ - ACTIVATION_V4(V[4]);\ - ACTIVATION_V4(V[5]);\ - ACTIVATION_V4(V[6]);\ - ACTIVATION_V4(V[7]);\ - buf[off] = V[0].x;\ - buf[off + str_hw] = V[0].y;\ - buf[off + str_hw * 2] = V[0].z;\ - buf[off + str_hw * 3] = V[0].w;\ - if(id + 1 < bd) {\ - buf[off + str_h] = V[1].x;\ - buf[off + str_h + str_hw ] = V[1].y;\ - buf[off + str_h + str_hw * 2] = V[1].z;\ - buf[off + str_h + str_hw * 3] = V[1].w;\ - }\ - if(id + 2 < bd) {\ - buf[off + str_h * 2] = V[2].x;\ - buf[off + str_h * 2 + str_hw ] = V[2].y;\ - buf[off + str_h * 2 + str_hw * 2] = V[2].z;\ - buf[off + str_h * 2 + str_hw * 3] = V[2].w;\ - }\ - if(id + 3 < bd) {\ - buf[off + str_h * 3] = V[3].x;\ - buf[off + str_h * 3 + str_hw ] = V[3].y;\ - buf[off + str_h * 3 + str_hw * 2] = V[3].z;\ - buf[off + str_h * 3 + str_hw * 3] = V[3].w;\ - }\ - if(id + 4 < bd) {\ - buf[off + str_h * 4] = V[4].x;\ - buf[off + str_h * 4 + str_hw ] = V[4].y;\ - buf[off + str_h * 4 + str_hw * 2] = V[4].z;\ - buf[off + str_h * 4 + str_hw * 3] = V[4].w;\ - }\ - if(id + 5 < bd) {\ - buf[off + str_h * 5] = V[5].x;\ - buf[off + str_h * 5 + str_hw ] = V[5].y;\ - buf[off + str_h * 5 + str_hw * 2] = V[5].z;\ - buf[off + str_h * 5 + 
str_hw * 3] = V[5].w;\ - }\ - if(id + 6 < bd) {\ - buf[off + str_h * 6] = V[6].x;\ - buf[off + str_h * 6 + str_hw ] = V[6].y;\ - buf[off + str_h * 6 + str_hw * 2] = V[6].z;\ - buf[off + str_h * 6 + str_hw * 3] = V[6].w;\ - }\ - if(id + 7 < bd) {\ - buf[off + str_h * 7] = V[7].x;\ - buf[off + str_h * 7 + str_hw ] = V[7].y;\ - buf[off + str_h * 7 + str_hw * 2] = V[7].z;\ - buf[off + str_h * 7 + str_hw * 3] = V[7].w;\ - }\ - } - - #define SET_REG_ARRAY(v, reg) {\ - SET_REG_ARRAY8(v, reg);\ - } -#endif -#endif - - diff --git a/tensor_computing/src/gpu/mali/cl/mem_trans_nchw_to_ncwhc4.cl b/tensor_computing/src/gpu/mali/cl/mem_trans_nchw_to_ncwhc4.cl deleted file mode 100644 index 36888a98..00000000 --- a/tensor_computing/src/gpu/mali/cl/mem_trans_nchw_to_ncwhc4.cl +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - - -__kernel void mem_trans_nchw_to_ncwhc4(const int iw, const int ih, const int ic, const int iwh_str, const int pw, const int ph, - const int ow, const int oh, const __global const T* in, __global T* out){ - - const int idx = get_global_id(0); - const int idy = get_global_id(1); - const int idz = get_global_id(2); - if(idx >= (iw + 3) >> 2 || idy >= ih) return; - - short ew = ((idx << 2) + 4 <= iw) ? 4 : (iw & 3); - short ec = ((idz << 2) + 4 <= ic) ? 
4 : (ic & 3); - T4 val[4]; - val[0] = 0; - val[1] = 0; - val[2] = 0; - val[3] = 0; - - int in_off = ((idz << 2) * ih + idy) * iw + (idx << 2); - - if(ew == 4){ - val[0] = vload4(0, in + in_off); - if(ec > 1) val[1] = vload4(0, in + in_off + iwh_str); - if(ec > 2) val[2] = vload4(0, in + in_off +(iwh_str << 1)); - if(ec > 3) val[3] = vload4(0, in + in_off + iwh_str * 3); - } else { - if(ew == 1){ - val[0].x = in[in_off]; - if(ec > 1) val[1].x = in[in_off + iwh_str]; - if(ec > 2) val[2].x = in[in_off +(iwh_str << 1)]; - if(ec > 3) val[3].x = in[in_off + iwh_str * 3]; - } - if(ew == 2){ - val[0].xy = vload2(0, in + in_off); - if(ec > 1) val[1].xy = vload2(0, in + in_off + iwh_str); - if(ec > 2) val[2].xy = vload2(0, in + in_off +(iwh_str << 1)); - if(ec > 3) val[3].xy = vload2(0, in + in_off + iwh_str * 3); - } - if(ew == 3){ - val[0].xyz = vload3(0, in + in_off); - if(ec > 1) val[1].xyz = vload3(0, in + in_off + iwh_str); - if(ec > 2) val[2].xyz = vload3(0, in + in_off +(iwh_str << 1)); - if(ec > 3) val[3].xyz = vload3(0, in + in_off + iwh_str * 3); - } - } - - int out_off = (idz * ow + (idx << 2) + pw) * oh + idy + ph; - vstore4((T4)(val[0].x, val[1].x, val[2].x, val[3].x), out_off, out); - if(ew > 1) vstore4((T4)(val[0].y, val[1].y, val[2].y, val[3].y), out_off + oh, out); - if(ew > 2) vstore4((T4)(val[0].z, val[1].z, val[2].z, val[3].z), out_off +(oh << 1), out); - if(ew > 3) vstore4((T4)(val[0].w, val[1].w, val[2].w, val[3].w), out_off + oh * 3, out); -} - diff --git a/tensor_computing/src/gpu/mali/cl/mem_trans_ncwhc4_to_mtk.cl b/tensor_computing/src/gpu/mali/cl/mem_trans_ncwhc4_to_mtk.cl deleted file mode 100644 index 91a51f20..00000000 --- a/tensor_computing/src/gpu/mali/cl/mem_trans_ncwhc4_to_mtk.cl +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - - -__kernel void mem_trans_ncwhc4_to_mtk(const int ih_str, const int iw_str, const int ih_off, const int iw_off, const int k, const int offset, const int bx, const int by, - __global T* in, __global T* out){ - const int idx = get_global_id(0); - const int idy = get_global_id(1); - if(idx >= bx || idy >= by) return; - uchar ek = ((idy << 2) + 4 <= k) ? 
4 : (k & 3); - const int in_off = (idy * iw_str + iw_off) * ih_str + idx + ih_off; - T4 val = vload4(in_off, in); - const int out_off = idx * k + (idy << 2) + offset; - if(ek == 4) { - vstore4(val, 0, out + out_off); - } else { - if(ek == 1) out[out_off] = val.x; - if(ek == 2) vstore2((T2)(val.x, val.y), 0, out + out_off); - if(ek == 3) vstore3((T3)(val.x, val.y, val.z), 0, out + out_off); - } -} - diff --git a/tensor_computing/src/gpu/mali/cl/mem_trans_ncwhc4_to_nchw.cl b/tensor_computing/src/gpu/mali/cl/mem_trans_ncwhc4_to_nchw.cl deleted file mode 100644 index a50a6b5d..00000000 --- a/tensor_computing/src/gpu/mali/cl/mem_trans_ncwhc4_to_nchw.cl +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - - -__kernel void mem_trans_ncwhc4_to_nchw(const int iw, const int ih, const int pw, const int ph, - const int ow, const int oh, const int oc, const int owh_str, const int offset, __global T* in, __global T* out){ - - const int idx = get_global_id(0); - const int idy = get_global_id(1); - const int idz = get_global_id(2); - if(idx >= oh || idy >= (ow + 3) >> 2) return; - - short ew = ((idy << 2) + 4 <= ow) ? 4 : (ow & 3); - short ec = ((idz << 2) + 4 <= oc) ? 
4 : (oc & 3); - T4 val[4]; - val[0] = 0; - val[1] = 0; - val[2] = 0; - val[3] = 0; - - int in_off = (idz * iw + (idy << 2) + pw) * ih + idx + ph; - - val[0] = vload4(in_off, in); - if(ew > 1) val[1] = vload4(in_off + ih, in); - if(ew > 2) val[2] = vload4(in_off +(ih << 1), in); - if(ew > 3) val[3] = vload4(in_off + ih * 3, in); - - int out_off = ((idz << 2) * oh + idx) * ow + (idy << 2) + offset; - - if(ew == 4){ - vstore4((T4)(val[0].x, val[1].x, val[2].x, val[3].x), 0, out + out_off); - if(ec > 1) vstore4((T4)(val[0].y, val[1].y, val[2].y, val[3].y), 0, out + out_off + owh_str); - if(ec > 2) vstore4((T4)(val[0].z, val[1].z, val[2].z, val[3].z), 0, out + out_off +(owh_str << 1)); - if(ec > 3) vstore4((T4)(val[0].w, val[1].w, val[2].w, val[3].w), 0, out + out_off + owh_str * 3); - } else { - if(ew == 1){ - out[out_off] = val[0].x; - if(ec > 1) out[out_off + owh_str] = val[0].y; - if(ec > 2) out[out_off +(owh_str << 1)] = val[0].z; - if(ec > 3) out[out_off + owh_str * 3] = val[0].w; - } - if(ew == 2){ - vstore2((T2)(val[0].x, val[1].x), 0, out + out_off); - if(ec > 1) vstore2((T2)(val[0].y, val[1].y), 0, out + out_off + owh_str); - if(ec > 2) vstore2((T2)(val[0].z, val[1].z), 0, out + out_off +(owh_str << 1)); - if(ec > 3) vstore2((T2)(val[0].w, val[1].w), 0, out + out_off + owh_str * 3); - } - if(ew == 3){ - vstore3((T3)(val[0].x, val[1].x, val[2].x), 0, out + out_off); - if(ec > 1) vstore3((T3)(val[0].y, val[1].y, val[2].y), 0, out + out_off + owh_str); - if(ec > 2) vstore3((T3)(val[0].z, val[1].z, val[2].z), 0, out + out_off +(owh_str << 1)); - if(ec > 3) vstore3((T3)(val[0].w, val[1].w, val[2].w), 0, out + out_off + owh_str * 3); - } - } - -} - diff --git a/tensor_computing/src/gpu/mali/cl/multiply_align_nchw.cl b/tensor_computing/src/gpu/mali/cl/multiply_align_nchw.cl deleted file mode 100644 index d21cbaee..00000000 --- a/tensor_computing/src/gpu/mali/cl/multiply_align_nchw.cl +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
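// The multiply_align_nchw kernel below applies the elementwise affine map
// y = alpha * x + beta to four consecutive NCHW elements per work-item,
// computing in float and storing the result back as T.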
- - - - - -__kernel void multiply_align_nchw(const int ih_str, const int iw_str, const int ih_off, const int iw_off, const int oh_str, const int ow_str, const int oh_off, const int ow_off, - const int bx, const int by, const float alp, const float bet, __global T* input, __global T* output) { - int idx = get_global_id(0); - int idy = get_global_id(1); - int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; - T4 val; - int in_off = (idz * ih_str + idy + ih_off) * iw_str + (idx << 2) + iw_off; - val = vload4(0, input + in_off); - val.x = ((float)val.x) * alp + bet; - val.y = ((float)val.y) * alp + bet; - val.z = ((float)val.z) * alp + bet; - val.w = ((float)val.w) * alp + bet; - int out_off = (idz * oh_str + idy + oh_off) * ow_str + (idx << 2) + ow_off; - vstore4(val, 0, output + out_off); -} diff --git a/tensor_computing/src/gpu/mali/cl/normalization.cl b/tensor_computing/src/gpu/mali/cl/normalization.cl deleted file mode 100644 index e7b6eb46..00000000 --- a/tensor_computing/src/gpu/mali/cl/normalization.cl +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
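// The normalization kernel below implements LayerNorm along the channel (k)
// axis of the NCWHC4/MKT layout. In its own terms (on = normalized element
// count, ic_str = channel blocks of four):
//   mean = (1/on) * sum_i x_i
//   var  = (1/on) * sum_i (x_i - mean)^2
//   y_i  = alpha_i * (x_i - mean) / sqrt(var + 1e-6) + beta_i
// where the 1e-6 term guards the division for near-constant inputs.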
- - - - - -__kernel void normalization(const int len, const int on, const int ih_str, const int ic_str, const int ih_off, const int iw_off, const int oh_str, const int oh_off, const int ow_off, - __global const T* alpha, __global const T* beta, __global const T* in, __global T* out) { - int idx = get_global_id(0); - if(idx >= len) return; - - float mean = 0; - float var = 0; - float para = 1.0 / on; - - int in_off = iw_off * ih_str + idx + ih_off; - for(int i = 0; i < ic_str; ++i) { - T4 tmp = vload4(in_off + i * ih_str, in); - float4 tmpf; - tmpf.x = tmp.x; - tmpf.y = tmp.y; - tmpf.z = tmp.z; - tmpf.w = tmp.w; - mean += (float)(tmpf.x + tmpf.y + tmpf.z + tmpf.w); - } - mean = mean * para; - - for(int i = 0; i < ic_str; ++i) { - T4 tmp = vload4(in_off + i * ih_str, in); - float4 tmpf; - tmpf.x = tmp.x; - tmpf.y = tmp.y; - tmpf.z = tmp.z; - tmpf.w = tmp.w; - tmpf.x = tmpf.x - mean; - tmpf.y = tmpf.y - mean; - tmpf.z = tmpf.z - mean; - tmpf.w = tmpf.w - mean; - var += tmpf.x * tmpf.x + tmpf.y * tmpf.y + tmpf.z * tmpf.z + tmpf.w * tmpf.w; - } - var = var * para; - - float std_val = sqrt(var + 1e-6); - std_val = 1.0 / std_val; - int out_off = ow_off * oh_str + idx + oh_off; - for(int i = 0; i < ic_str; ++i) { - T4 out_val = vload4(in_off + i * ih_str, in); - T4 alp = vload4(i, alpha); - T4 bet = vload4(i, beta); - out_val.x = alp.x * (out_val.x - mean) * std_val + bet.x; - out_val.y = alp.y * (out_val.y - mean) * std_val + bet.y; - out_val.z = alp.z * (out_val.z - mean) * std_val + bet.z; - out_val.w = alp.w * (out_val.w - mean) * std_val + bet.w; - vstore4(out_val, out_off + i * oh_str, out); - } -} diff --git a/tensor_computing/src/gpu/mali/cl/padding_input_gclmem.cl b/tensor_computing/src/gpu/mali/cl/padding_input_gclmem.cl deleted file mode 100644 index b839466c..00000000 --- a/tensor_computing/src/gpu/mali/cl/padding_input_gclmem.cl +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
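// padding_input_gclmem below copies an iw x ih NCHW image into a zero-padded
// ow x oh buffer. Each work-item handles four consecutive output columns:
// a vectorized vload4/vstore4 on the fully-interior fast path, and scalar,
// bounds-checked accesses (leaving the zero fill in place) along the borders.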
- - - - - -__kernel void padding_input_gclmem(const int iw, const int ih, const int pw, const int ph, - const int ow, const int oh, const __global const T* in, __global T* out) { - - int idx = get_global_id(0) << 2; - int idy = get_global_id(1); - int idz = get_global_id(2); - if(idx >= ow || idy >= oh) return; - - int in_y = idy - ph; - int be_x = idx - pw; - int en_x = be_x + 4; - T4 val = 0; - if(in_y >= 0 && in_y < ih) { - int in_off = (idz * ih + in_y) * iw; - if(be_x >= 0 && en_x < iw) { - val = vload4(0, in + in_off + be_x); - } else { - if(be_x >= 0 && be_x < iw) val.x = in[in_off + be_x]; - if(be_x + 1 >= 0 && be_x + 1 < iw) val.y = in[in_off + be_x + 1]; - if(be_x + 2 >= 0 && be_x + 2 < iw) val.z = in[in_off + be_x + 2]; - if(be_x + 3 >= 0 && be_x + 3 < iw) val.w = in[in_off + be_x + 3]; - } - } - - int out_off = (idz * oh + idy) * ow + idx; - if(idx + 3 >= ow) { - out[out_off] = val.x; - if(idx + 1 < ow) out[out_off + 1] = val.y; - if(idx + 2 < ow) out[out_off + 2] = val.z; - } else { - vstore4(val, 0, out + out_off); - } -} - diff --git a/tensor_computing/src/gpu/mali/cl/pooling_max.cl b/tensor_computing/src/gpu/mali/cl/pooling_max.cl deleted file mode 100644 index 7e8be2ae..00000000 --- a/tensor_computing/src/gpu/mali/cl/pooling_max.cl +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - - -#define maxvec4(x , y){\ - x.s0 = (x.s0 > y.s0) ? x.s0 : y.s0;\ - x.s1 = (x.s1 > y.s1) ? x.s1 : y.s1;\ - x.s2 = (x.s2 > y.s2) ? x.s2 : y.s2;\ - x.s3 = (x.s3 > y.s3) ? x.s3 : y.s3;\ -} - -__kernel void pooling_max(const int ih, const int iw, const int ih_off, const int iw_off, const int ih_str, const int iw_str, - const int oh, const int ow, const int oh_off, const int ow_off, const int oh_str, const int ow_str, - const int sh, const int sw, const int ph, const int pw, const int kh, const int kw, - __global const T* in, __global T* out) { - const int idx = get_global_id(0); - const int idy = get_global_id(1); - const int idz = get_global_id(2); - if(idx >= oh || idy >= ow) return; - - int bh = idx * sh - ph; - int bw = idy * sw - pw; - int eh = bh + kh; - int ew = bw + kw; - bh = (bh < 0) ? 0 : bh; - bw = (bw < 0) ? 0 : bw; - eh = (eh < ih) ? eh : ih; - ew = (ew < iw) ? 
ew : iw; - - bh += ih_off; - bw += iw_off; - eh += ih_off; - ew += iw_off; - int in_off = (idz * iw_str + bw) * ih_str; - - T4 val = -FLT_MAX; - T4 maxval = -FLT_MAX; - for(int i = bw; i < ew; ++i){ - for(int j = bh; j < eh; ++j){ - val = vload4(in_off + j, in); - maxvec4(maxval, val); - } - in_off += ih_str; - } - int out_off = (idz * ow_str + ow_off + idy) * oh_str + oh_off + idx; - vstore4(maxval, out_off, out); -} diff --git a/tensor_computing/src/gpu/mali/cl/pooling_mean.cl b/tensor_computing/src/gpu/mali/cl/pooling_mean.cl deleted file mode 100644 index e2aab879..00000000 --- a/tensor_computing/src/gpu/mali/cl/pooling_mean.cl +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - - -#define sumvec4(x, y){\ - x.s0 += (float)y.s0;\ - x.s1 += (float)y.s1;\ - x.s2 += (float)y.s2;\ - x.s3 += (float)y.s3;\ -} - -__kernel void pooling_mean(const int ih, const int iw, const int ih_off, const int iw_off, const int ih_str, const int iw_str, - const int oh, const int ow, const int oh_off, const int ow_off, const int oh_str, const int ow_str, - const int sh, const int sw, const int ph, const int pw, const int kh, const int kw, - __global const T* in, __global T* out){ - const int idx = get_global_id(0); - const int idy = get_global_id(1); - const int idz = get_global_id(2); - if(idx >= oh || idy >= ow) return; - - int bh = idx * sh - ph; - int bw = idy * sw - pw; - int eh = bh + kh; - int ew = bw + kw; - bh = (bh < 0) ? 0 : bh; - bw = (bw < 0) ? 0 : bw; - eh = (eh < ih) ? eh : ih; - ew = (ew < iw) ? ew : iw; - float psize = (eh - bh) * (ew - bw); - - bh += ih_off; - bw += iw_off; - eh += ih_off; - ew += iw_off; - int in_off = (idz * iw_str + bw) * ih_str; - - T4 val; - float4 sum = 0; - for(int i = bw; i< ew; ++i){ - for(int j = bh; j < eh; ++j){ - val = vload4(in_off + j, in); - sumvec4(sum, val); - } - in_off += ih_str; - } - sum = sum / psize; - int out_off = (idz * ow_str + ow_off + idy) * oh_str + oh_off + idx; - vstore4((T4)(sum.x, sum.y, sum.z , sum.w), out_off, out); -} diff --git a/tensor_computing/src/gpu/mali/cl/reshape_mkt_to_nchw.cl b/tensor_computing/src/gpu/mali/cl/reshape_mkt_to_nchw.cl deleted file mode 100644 index 83c1c8af..00000000 --- a/tensor_computing/src/gpu/mali/cl/reshape_mkt_to_nchw.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - -__kernel void reshape_mkt_to_nchw(const int ih_str, const int iw_str, const int ih_off, const int iw_off, const int ow_str, const int oh_str, - const int ow_off, const int oh_off, const int oh, const int bx, const int by, __global const T* in, __global T* out) { - int idx = get_global_id(0); - int idy = get_global_id(1); - if(idx >= bx || idy >= by) return; - T4 val; - const int in_off = (idy * iw_str + iw_off) * ih_str + idx + ih_off; - val = vload4(in_off, in); - - int idk = (idy << 2); - int ox = idx; - int4 oy; - int4 oz; - oy.s0 = idk % oh; - oy.s1 = (idk + 1) % oh; - oy.s2 = (idk + 2) % oh; - oy.s3 = (idk + 3) % oh; - oz.s0 = idk / oh; - oz.s1 = (idk + 1) / oh; - oz.s2 = (idk + 2) / oh; - oz.s3 = (idk + 3) / oh; - out[(oz.s0 * oh_str + oy.s0 + oh_off) * ow_str + ox + ow_off] = val.x; - out[(oz.s1 * oh_str + oy.s1 + oh_off) * ow_str + ox + ow_off] = val.y; - out[(oz.s2 * oh_str + oy.s2 + oh_off) * ow_str + ox + ow_off] = val.z; - out[(oz.s3 * oh_str + oy.s3 + oh_off) * ow_str + ox + ow_off] = val.w; -} diff --git a/tensor_computing/src/gpu/mali/cl/reshape_nchw_to_mkt.cl b/tensor_computing/src/gpu/mali/cl/reshape_nchw_to_mkt.cl deleted file mode 100644 index 302a3762..00000000 --- a/tensor_computing/src/gpu/mali/cl/reshape_nchw_to_mkt.cl +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
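// reshape_nchw_to_mkt below gathers an NCHW tensor into the MKT layout by
// flattening (c, h) into a single k index; each work-item produces four
// consecutive k values via the inverse mapping
//   iy = idk % ih  (row within a channel),  iz = idk / ih  (channel),
// with lanes at or beyond k left zero.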
- - - - -__kernel void reshape_nchw_to_mkt(const int iw_str, const int ih_str, const int iw_off, const int ih_off, const int ih, const int k, const int oh_str, const int ow_str, const int oh_off, - const int ow_off, const int bx, const int by, __global const T* in, __global T* out) { - int idx = get_global_id(0); - int idy = get_global_id(1); - if(idx >= bx || idy >= by) return; - T4 val = 0; - int idk = (idy << 2); - int ix = idx; - int4 iy; - int4 iz; - iy.s0 = idk % ih; - iy.s1 = (idk + 1) % ih; - iy.s2 = (idk + 2) % ih; - iy.s3 = (idk + 3) % ih; - iz.s0 = idk / ih; - iz.s1 = (idk + 1) / ih; - iz.s2 = (idk + 2) / ih; - iz.s3 = (idk + 3) / ih; - val.x = in[(iz.s0 * ih_str + iy.s0 + ih_off) * iw_str + ix + iw_off]; - if(idk + 1 < k) val.y = in[(iz.s1 * ih_str + iy.s1 + ih_off) * iw_str + ix + iw_off]; - if(idk + 2 < k) val.z = in[(iz.s2 * ih_str + iy.s2 + ih_off) * iw_str + ix + iw_off]; - if(idk + 3 < k) val.w = in[(iz.s3 * ih_str + iy.s3 + ih_off) * iw_str + ix + iw_off]; - const int out_off = (idy * ow_str + ow_off) * oh_str + idx + oh_off; - vstore4(val, out_off, out); -} diff --git a/tensor_computing/src/gpu/mali/cl/scale.cl b/tensor_computing/src/gpu/mali/cl/scale.cl deleted file mode 100644 index 3b227d6b..00000000 --- a/tensor_computing/src/gpu/mali/cl/scale.cl +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
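// The MANGLE_NAME macros below assemble the kernel name by preprocessor token
// pasting. The two-level indirection is deliberate: operands of ## are not
// macro-expanded, so the extra MANGLE_NAME_IMPL hop forces MD (typically
// supplied as a -D build definition) to expand first. For example, if MD were
// defined as f16, MANGLE_NAME(scale_, MD) would expand to scale_f16 (f16 is
// only an illustrative value here).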
- - - - - -#define MANGLE_NAME_IMPL(base, MD) base ## MD -#define MANGLE_NAME(base, MD) MANGLE_NAME_IMPL(base, MD) - - -__kernel void MANGLE_NAME(scale_, MD)(const int h, const int ih_str, const int iw_str, const int ih_off, const int iw_off, - const int oh_str, const int ow_str, const int oh_off, const int ow_off, const int bx, const int by, __global const T* alpha, __global const T* beta, __global T* input, __global T* output) { - - int idx = get_global_id(0); - int idy = get_global_id(1); - int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; - - T4 alp = vload4(idz, alpha); - T4 bet = 0; -#if defined(USE_BETA) - bet = vload4(idz, beta); -#endif - T8 val; - int in_off = (idz * iw_str + idy + iw_off) * ih_str + (idx << 1) + ih_off; - val = vload8(0, input + (in_off << 2)); - val.s0 = val.s0 * alp.x + bet.x; - val.s1 = val.s1 * alp.y + bet.y; - val.s2 = val.s2 * alp.z + bet.z; - val.s3 = val.s3 * alp.w + bet.w; - val.s4 = val.s4 * alp.x + bet.x; - val.s5 = val.s5 * alp.y + bet.y; - val.s6 = val.s6 * alp.z + bet.z; - val.s7 = val.s7 * alp.w + bet.w; - - int out_off = (idz * ow_str + idy + ow_off) * oh_str + (idx << 1) + oh_off; - if((idx << 1) + 1 < h){ - vstore8(val, 0, output + (out_off << 2)); - } else { - vstore4(val.s0123, 0, output + (out_off << 2)); - } -} diff --git a/tensor_computing/src/gpu/mali/cl/slice_h.cl b/tensor_computing/src/gpu/mali/cl/slice_h.cl deleted file mode 100644 index 4ab95e97..00000000 --- a/tensor_computing/src/gpu/mali/cl/slice_h.cl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - - - -#define MANGLE_NAME_IMPL(base, N) base ## N -#define MANGLE_NAME(base, N) MANGLE_NAME_IMPL(base, N) - -__kernel void MANGLE_NAME(slice_h_, N)(const int ih_str, const int iw_str, const int ih_off, const int iw_off, const int bx, const int by, __global T* input, - const int oh_str0, const int ow_str0, const int oh_off0, const int ow_off0, const int slice_end0, __global T* output0, - const int oh_str1, const int ow_str1, const int oh_off1, const int ow_off1, const int slice_end1, __global T* output1 - ) { - int idx = get_global_id(0); - int idy = get_global_id(1); - int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; - - T4 val; - int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; - val = vload4(in_off, input); - if(idx < slice_end0) { - int out_off = (idz * ow_str0 + idy + ow_off0) * oh_str0 + idx + oh_off0; - vstore4(val, out_off, output0); - return; - } - if(idx < slice_end1) { - int out_off = (idz * ow_str1 + idy + ow_off1) * oh_str1 + idx + oh_off1; - vstore4(val, out_off, output1); - return; - } -} diff --git a/tensor_computing/src/gpu/mali/cl/softmax.cl b/tensor_computing/src/gpu/mali/cl/softmax.cl deleted file mode 100644 index 19f8a1ef..00000000 --- a/tensor_computing/src/gpu/mali/cl/softmax.cl +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
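// The softmax kernel below applies the usual max-subtraction trick for
// numerical stability:
//   softmax(x)_i = exp(x_i - m) / sum_j exp(x_j - m),  m = max_j x_j,
// which equals the textbook definition but keeps exp() in range for fp16.
// cd4 is the channel length in vec4 blocks; ce4 (1..4) is the number of
// valid lanes in the final block.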
- - - - - -__kernel void softmax(const int cd4, const int ce4, const int ih_str, const int ihw_str, const int ih_off, const int iw_off, - const int oh_str, const int ohw_str, const int oh_off, const int ow_off, const int bx, const int by, __global const T* in, __global T* out) { - int idx = get_global_id(0); - int idy = get_global_id(1); - if(idx >= bx || idy >= by) return; - - float4 maxval = (float4)(-FLT_MAX); - float4 tmp; - T4 val; - int index = (idy + iw_off) * ih_str + idx + ih_off; - for (int i = 0; i < cd4; i++) { - val = vload4(index + i * ihw_str, in); - tmp.x = (float)val.x; - tmp.y = (float)val.y; - tmp.z = (float)val.z; - tmp.w = (float)val.w; - maxval = fmax(maxval, tmp); - } - if(maxval.x < maxval.y) maxval.x = maxval.y; - if(maxval.x < maxval.z) maxval.x = maxval.z; - if(maxval.x < maxval.w) maxval.x = maxval.w; - float sumexp = 0; - for (int i = 0; i < cd4 - 1; i++) { - val = vload4(index + i * ihw_str, in); - sumexp += exp((float)val.x - maxval.x); - sumexp += exp((float)val.y - maxval.x); - sumexp += exp((float)val.z - maxval.x); - sumexp += exp((float)val.w - maxval.x); - } - - val = vload4(index + (cd4 - 1) * ihw_str, in); - sumexp += exp((float)val.x - maxval.x); - if(ce4 > 1) sumexp += exp((float)val.y - maxval.x); - if(ce4 > 2) sumexp += exp((float)val.z - maxval.x); - if(ce4 > 3) sumexp += exp((float)val.w - maxval.x); - - sumexp = 1.0 / sumexp; - T4 res; - int out_off = (idy + ow_off) * oh_str + idx + oh_off; - for (int i = 0; i < cd4; i++) { - val = vload4(index + i * ihw_str, in); - res.x = (T)exp(val.x - maxval.x) * sumexp; - res.y = (T)exp(val.y - maxval.x) * sumexp; - res.z = (T)exp(val.z - maxval.x) * sumexp; - res.w = (T)exp(val.w - maxval.x) * sumexp; - vstore4(res, out_off + i * ohw_str, out); - } -} diff --git a/tensor_computing/src/gpu/mali/cl/softmax_nchw_c.cl b/tensor_computing/src/gpu/mali/cl/softmax_nchw_c.cl deleted file mode 100644 index ea50ad50..00000000 --- a/tensor_computing/src/gpu/mali/cl/softmax_nchw_c.cl +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - - - - -__kernel void softmax_nchw_c(const int c, const int iw_str, const int ihw_str, const int iw_off, const int ih_off, - const int ow_str, const int ohw_str, const int ow_off, const int oh_off, const int bx, const int by, __global const T* in, __global T* out) { - int idx = get_global_id(0); - int idy = get_global_id(1); - if(idx >= bx || idy >= by) return; - - float4 maxval = (float4)(-FLT_MAX); - float4 tmp; - T4 val; - int index = (idy + ih_off) * iw_str + (idx << 2) + iw_off; - for (int i = 0; i < c; i++) { - val = vload4(0, in + index + i * ihw_str); - tmp.x = (float)val.x; - tmp.y = (float)val.y; - tmp.z = (float)val.z; - tmp.w = (float)val.w; - maxval = fmax(maxval, tmp); - } - - float4 sumexp = 0; - for (int i = 0; i < c; i++) { - val = vload4(0, in + index + i * ihw_str); - sumexp.x += exp((float)val.x - maxval.x); - sumexp.y += exp((float)val.y - maxval.y); - sumexp.z += exp((float)val.z - maxval.z); - sumexp.w += exp((float)val.w - maxval.w); - } - - sumexp.x = 1.0 / sumexp.x; - sumexp.y = 1.0 / sumexp.y; - sumexp.z = 1.0 / sumexp.z; - sumexp.w = 1.0 / sumexp.w; - T4 res; - int out_off = (idy + oh_off) * ow_str + (idx << 2) + ow_off; - for (int i = 0; i < c; i++) { - val = vload4(0, in + index + i * ihw_str); - res.x = (T)exp(val.x - maxval.x) * sumexp.x; - res.y = (T)exp(val.y - maxval.y) * sumexp.y; - res.z = (T)exp(val.z - maxval.z) * sumexp.z; - res.w = (T)exp(val.w - maxval.w) * sumexp.w; - vstore4(res, 0, out + out_off + i * ohw_str); - } -} diff --git a/tensor_computing/src/gpu/mali/cl/softmax_nchw_w.cl b/tensor_computing/src/gpu/mali/cl/softmax_nchw_w.cl deleted file mode 100644 index 6557598f..00000000 --- a/tensor_computing/src/gpu/mali/cl/softmax_nchw_w.cl +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - - - - -__kernel void softmax_nchw_w(const int wd4, const int we4, const int iw_str, const int ih_str, const int iw_off, const int ih_off, - const int ow_str, const int oh_str, const int ow_off, const int oh_off, const int bx, const int by, __global const T* in, __global T* out) { - int idx = get_global_id(0); - int idy = get_global_id(1); - if(idx >= bx || idy >= by) return; - - float4 maxval = (float4)(-FLT_MAX); - float4 tmp; - T4 val; - int index = (idy * ih_str + idx + ih_off) * iw_str + iw_off; - for (int i = 0; i < wd4; i++) { - val = vload4(0, in + index + i * 4); - tmp.x = (float)val.x; - tmp.y = (float)val.y; - tmp.z = (float)val.z; - tmp.w = (float)val.w; - maxval = fmax(maxval, tmp); - } - if(maxval.x < maxval.y) maxval.x = maxval.y; - if(maxval.x < maxval.z) maxval.x = maxval.z; - if(maxval.x < maxval.w) maxval.x = maxval.w; - float sumexp = 0; - for (int i = 0; i < wd4 - 1; i++) { - val = vload4(0, in + index + i * 4); - sumexp += exp((float)val.x - maxval.x); - sumexp += exp((float)val.y - maxval.x); - sumexp += exp((float)val.z - maxval.x); - sumexp += exp((float)val.w - maxval.x); - } - val = vload4(0, in + index + (wd4 - 1) * 4); - sumexp += exp((float)val.x - maxval.x); - if(we4 > 1) sumexp += exp((float)val.y - maxval.x); - if(we4 > 2) sumexp += exp((float)val.z - maxval.x); - if(we4 > 3) sumexp += exp((float)val.w - maxval.x); - - sumexp = 1.0 / sumexp; - T4 res; - int out_off = (idy * oh_str + idx + oh_off) * ow_str + ow_off; - for (int i = 0; i < wd4; i++) { - val = vload4(0, in + index + i * 4); - tmp.x = (float)val.x; - tmp.y = (float)val.y; - tmp.z = (float)val.z; - tmp.w = (float)val.w; - tmp.x = exp(tmp.x - maxval.x) * sumexp; - tmp.y = exp(tmp.y - maxval.x) * sumexp; - tmp.z = exp(tmp.z - maxval.x) * sumexp; - tmp.w = exp(tmp.w - maxval.x) * sumexp; - res.x = (T)tmp.x; - res.y = (T)tmp.y; - res.z = (T)tmp.z; - res.w = (T)tmp.w; - vstore4(res, 0, out + out_off + i * 4); - } -} diff --git a/tensor_computing/src/gpu/mali/cl/space2depth.cl b/tensor_computing/src/gpu/mali/cl/space2depth.cl deleted file mode 100644 index 6cbcbaee..00000000 --- a/tensor_computing/src/gpu/mali/cl/space2depth.cl +++ /dev/null @@ -1,36 +0,0 @@ -__kernel void space2depth(const int iw_str, const int ih_str, const int iw_off, const int ih_off, - const int oh_str, const int ohw_str, const int ow_off, const int oh_off, - const int bx, const int by, __global const uchar* in, __global T* out) { - const int idx = get_global_id(0); - const int idy = get_global_id(1); - if(idx >= bx || idy >= by) return; - - const int in_off = ((idx << 2)+ ih_off) * iw_str + (idy << 2) + iw_off; - uchar4 tmp0 = vload4(0, in + in_off); - uchar4 tmp1 = vload4(0, in + in_off + iw_str); - uchar4 tmp2 = vload4(0, in + in_off + (iw_str << 1)); - uchar4 tmp3 = vload4(0, in + in_off + iw_str * 3); - T4 val0, val1, val2, val3; - val0.x = tmp0.x / (T)(255); - val0.y = tmp0.y / (T)(255); - val0.z = tmp0.z / (T)(255); - val0.w = tmp0.w / (T)(255); - val1.x = tmp1.x / (T)(255); - val1.y = tmp1.y / (T)(255); - val1.z = tmp1.z / (T)(255); - val1.w = tmp1.w / (T)(255); - val2.x = tmp2.x / (T)(255); - val2.y = tmp2.y / (T)(255); - val2.z = tmp2.z / (T)(255); - val2.w = tmp2.w / (T)(255); - val3.x = tmp3.x / (T)(255); - val3.y = tmp3.y / (T)(255); - val3.z = tmp3.z / (T)(255); - val3.w = tmp3.w / (T)(255); - - const int out_off = (idy + ow_off) * oh_str + idx + oh_off; - vstore4(val0, out_off, out); - vstore4(val1, out_off + ohw_str, out); - vstore4(val2, out_off + ohw_str * 2, out); - vstore4(val3, out_off + ohw_str * 3, out); 
-} diff --git a/tensor_computing/src/gpu/mali/cl/transpose_nchw_0132.cl b/tensor_computing/src/gpu/mali/cl/transpose_nchw_0132.cl deleted file mode 100644 index 81a3f571..00000000 --- a/tensor_computing/src/gpu/mali/cl/transpose_nchw_0132.cl +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - - -__kernel void transpose_nchw_0132(const int ih_str, const int iw_str, const int ih_off, const int iw_off, const int oh_str, const int ow_str, - const int oh_off, const int ow_off, const int oh, const int bx, const int by, __global const T* input, __global T* output) { - - int idx = get_global_id(0); - int idy = get_global_id(1); - int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; - T4 val; - int idx4 = idx << 2; - int in_off = (idz * ih_str + idy + ih_off) * iw_str + idx4 + iw_off; - val = vload4(0, input + in_off); - int out_off = (idz * oh_str + idx4 + oh_off) * ow_str + idy + ow_off; - output[out_off] = val.x; - if(idx4 + 1 < oh) output[out_off + ow_str] = val.y; - if(idx4 + 2 < oh) output[out_off + ow_str * 2] = val.z; - if(idx4 + 3 < oh) output[out_off + ow_str * 3] = val.w; -} diff --git a/tensor_computing/src/gpu/mali/clip.cpp b/tensor_computing/src/gpu/mali/clip.cpp deleted file mode 100644 index 5b45d1da..00000000 --- a/tensor_computing/src/gpu/mali/clip.cpp +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/fp16/clip_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -EE clip_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc) { - /*tensorDesc record cpu org data format info*/ - /*gclmemDesc record gpu trans data format info*/ - if(outputDesc) *outputDesc = inputDesc; - DataType idt; - DataFormat idf; - U32 iw, ih, ic, in; - tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); - - if(idf == DF_NCHW) { - CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); - if(gclmemInputDesc && gclmemOutputDesc) *gclmemOutputDesc = *gclmemInputDesc;//the input and output mem maybe the same - return SUCCESS; - } - return NOT_SUPPORTED; -} - -inline EE clip_checkpara_mali(GCLHandle_t handle, - void* min_value, - void* max_value, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - if(handle == nullptr || min_value == nullptr || max_value == nullptr || nullptr == input || nullptr == output) return NULL_POINTER; - if(inputDesc.df != outputDesc.df || inputDesc.df != DF_NCHW) return NOT_SUPPORTED; - if(input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCWHC4) return NOT_SUPPORTED; - return SUCCESS; -} - -EE clip_mali(GCLHandle_t handle, - void* min_value, - void* max_value, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - EE ret = SUCCESS; - CHECK_STATUS(clip_checkpara_mali(handle, min_value, max_value, inputDesc, input, outputDesc, output)); - switch(inputDesc.dt){ - case DT_F16:{ - ret = clip_mali_fp16(handle, min_value, max_value, inputDesc, input, outputDesc, output); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - - - diff --git a/tensor_computing/src/gpu/mali/concat.cpp b/tensor_computing/src/gpu/mali/concat.cpp deleted file mode 100644 index 9600e673..00000000 --- a/tensor_computing/src/gpu/mali/concat.cpp +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/fp16/concat_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -EE concat_infer_output_size_mali(std::vector<TensorDesc> inputDesc, - TensorDesc* outputDesc, - U32 concatDim, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc){ - /*tensorDesc record cpu org data format info*/ - /*gclmemDesc record gpu trans data format info*/ - if(outputDesc) *outputDesc = inputDesc[0]; - U32 sumDimSize = 0; - for(U32 i = 0; i < inputDesc.size(); i++) sumDimSize += inputDesc[i].dims[3 - concatDim]; - if(concatDim == 1) { - if(outputDesc) (*outputDesc).dims[2] = sumDimSize; - DataType idt; - DataFormat idf; - U32 iw, ih, ic, in; - sumDimSize = 0; - if(gclmemInputDesc && gclmemOutputDesc) { - for(U32 i = 0; i < inputDesc.size(); i++){ - tensorSelectGet(inputDesc[i], &idt, &idf, &in, &ic, &ih, &iw); - CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, &gclmemInputDesc[i], gclmemOutputDesc)); - sumDimSize += (ic + 3) / 4; - } - U32 s0 = gclmemOutputDesc->stride[0]; - U32 s1 = gclmemOutputDesc->stride[1]; - U32 s2 = sumDimSize; - gclmemOutputDesc->stride[2] = s2; - gclmemOutputDesc->num = s0 * s1 * s2 * 4; - gclmemOutputDesc->byteSize = s0 * s1 * s2 * 4 * bytesOf(idt); - - U32 off0, off1, num, byteSize; - s0 = gclmemInputDesc[0].stride[0]; - s1 = gclmemInputDesc[0].stride[1]; - off0 = gclmemInputDesc[0].offset[0]; - off1 = gclmemInputDesc[0].offset[1]; - for(U32 i = 1; i < inputDesc.size(); i++) { - s0 = (s0 >= gclmemInputDesc[i].stride[0]) ? s0 : gclmemInputDesc[i].stride[0]; - s1 = (s1 >= gclmemInputDesc[i].stride[1]) ? s1 : gclmemInputDesc[i].stride[1]; - off0 = (off0 >= gclmemInputDesc[i].offset[0]) ? off0 : gclmemInputDesc[i].offset[0]; - off1 = (off1 >= gclmemInputDesc[i].offset[1]) ? 
off1 : gclmemInputDesc[i].offset[1]; - } - for(U32 i = 0; i < inputDesc.size(); i++) { - s2 = gclmemInputDesc[i].stride[2]; - num = s0 * s1 * s2 * 4; - byteSize = num * bytesOf(idt); - gclmemInputDesc[i].stride[0] = s0; - gclmemInputDesc[i].stride[1] = s1; - gclmemInputDesc[i].offset[0] = off0; - gclmemInputDesc[i].offset[1] = off1; - gclmemInputDesc[i].num = num; - gclmemInputDesc[i].byteSize = byteSize; - } - } - return SUCCESS; - } - return NOT_SUPPORTED; -} - -inline EE concat_checkpara_mali(GCLHandle_t handle, - std::vector<TensorDesc> inputDesc, - std::vector<void*> input, - TensorDesc outputDesc, - GCLMem_t output, - U32 concatDim) { - - if(handle == nullptr || nullptr == output) return NULL_POINTER; - if(input.size() < 1) return NOT_MATCH; - for(auto it : input) { - GCLMem_t ptr = (GCLMem_t)it; - if(ptr == nullptr) return NULL_POINTER; - if(ptr->desc.memFormat != output->desc.memFormat) return NOT_SUPPORTED; - } - U32 sumDimSize = 0; - if(concatDim != 1) return NOT_SUPPORTED; - for(auto it : inputDesc) { - if(it.df != outputDesc.df) return NOT_SUPPORTED; - if(it.dims[0] != outputDesc.dims[0]) return NOT_SUPPORTED; - if(it.dims[1] != outputDesc.dims[1]) return NOT_SUPPORTED; - if(it.dims[3] != outputDesc.dims[3]) return NOT_SUPPORTED; - sumDimSize += it.dims[2]; - } - if(sumDimSize != outputDesc.dims[2]) return NOT_MATCH; - if(outputDesc.df != DF_NCHW) return NOT_SUPPORTED; - if(output->desc.memFormat != DF_NCWHC4) return NOT_SUPPORTED; - return SUCCESS; -} - -EE concat_mali(GCLHandle_t handle, - std::vector<TensorDesc> inputDesc, - std::vector<void*> input, - GCLMem_t inputScale, - TensorDesc outputDesc, - GCLMem_t output, - GCLMem_t outputScale, - U32 concatDim) { - UNUSED(inputScale); - UNUSED(outputScale); - EE ret = SUCCESS; - CHECK_STATUS(concat_checkpara_mali(handle, inputDesc, input, outputDesc, output, concatDim)); - switch(inputDesc[0].dt){ - case DT_F16:{ - ret = concat_mali_fp16(handle, inputDesc, input, outputDesc, output, concatDim); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - - - diff --git a/tensor_computing/src/gpu/mali/convolution.cpp b/tensor_computing/src/gpu/mali/convolution.cpp deleted file mode 100644 index 80e9ec95..00000000 --- a/tensor_computing/src/gpu/mali/convolution.cpp +++ /dev/null @@ -1,450 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
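// In convolution_infer_output_size_mali below, the output spatial sizes
// follow the standard convolution formula per axis:
//   o = (i + 2*p - ((f - 1)*d + 1)) / s + 1
// (input size i, padding p, filter size f, dilation d, stride s). The guard
// checks ahead of it restrict this path to symmetric padding and equal
// strides/paddings in w and h.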
- -#include <float.h> -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/fp16/convolution_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -EE convolution_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc filterDesc, - ConvolutionDesc convDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc, - ForwardRunInfoMali_t forwardRunInfo) { - DataType idt, fdt; - DataFormat idf, fdf; - U32 iw, ih, ic, in; - U32 fw, fh, fc, fn; - U32 ow, oh; - U32 sw, sh, pw, ph, dw, dh, fdw, fdh; - U32 pr, pb; - tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); - tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); - pw = convDesc.padding_left; - pr = convDesc.padding_right; - ph = convDesc.padding_top; - pb = convDesc.padding_bottom; - sw = convDesc.stride_w; - sh = convDesc.stride_h; - dw = convDesc.dilatedRate_w; - dh = convDesc.dilatedRate_h; - if (fw != 1 && fw != 3 && fw != 5) CHECK_STATUS(NOT_SUPPORTED); - if (in != 1) CHECK_STATUS(NOT_SUPPORTED); - if (fw < 1 || fh < 1) CHECK_STATUS(NOT_SUPPORTED); - if (dw != 1 || dh != 1) CHECK_STATUS(NOT_SUPPORTED); - if (pw != ph || sw != sh) CHECK_STATUS(NOT_SUPPORTED); - if (pb != ph || pr != pw) CHECK_STATUS(NOT_SUPPORTED); - //if ((fn & 3) != 0 && fn != 1) CHECK_STATUS(NOT_SUPPORTED); - fdw = (fw - 1) * dw + 1; - fdh = (fh - 1) * dh + 1; - ow = (iw + 2 * pw - fdw) / sw + 1; - oh = (ih + 2 * ph - fdh) / sh + 1; - U32 iw_align, ih_align, item_w, item_h, ext_w, ext_h; - - if(idf == DF_NCHW || (fw == 1 && fh == 1 && iw == 1 && ih == 1)) { - if(outputDesc) *outputDesc = tensor4df(idt, DF_NCHW, in, fn, oh, ow); - item_w = forwardRunInfo->best_w[0]; - item_h = 1; - ih_align = ih; - ext_h = ph; - if(forwardRunInfo->algorithm == CONVOLUTION_ALGORITHM_WINOGRAD) { - item_w = 16; - ext_h = 0;//no need to pad h axis - } - iw_align = (ow + item_w - 1) / item_w * item_w; - iw_align = iw_align * sw; - ext_w = (fw / 2 < pw) ? pw : fw / 2;//if fw / 2 < pw, use pw as offset - if(pw < ext_w) {//if fw / 2 > pw, use pw as offset, and pad (ext_w - pw) * 2 in the end - iw_align = iw_align + 2 * (ext_w - pw); - ext_w = pw; - } - CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw_align, ih_align, ic, ext_w, ext_h, ow, oh, fn, idt, idt, gclmemInputDesc, gclmemOutputDesc)); - return SUCCESS; - } - - if(idf == DF_NCHW_ORG_MALI) { - if(outputDesc) *outputDesc = tensor4df(idt, DF_NCHW, in, fn, oh, ow); - item_w = forwardRunInfo->best_w[0]; - item_h = 1; - ih_align = ih; - ext_h = ph; - iw_align = (ow + item_w - 1) / item_w * item_w; - iw_align = iw_align * sw; - ext_w = (fw / 2 < pw) ?
pw : fw / 2;//if fw / 2 < pw, use pw as offset - if(pw < ext_w) {//if fw / 2 > pw, use pw as offset, and pad (ext_w - pw) * 2 in the end - iw_align = iw_align + 2 * (ext_w - pw); - ext_w = pw; - } - CHECK_STATUS(infer_gclmem_desc_nchw(iw_align, ih_align, ic, ext_w, ext_h, ow, oh, fn, idt, idt, gclmemInputDesc, NULL)); - CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw_align, ih_align, ic, ext_w, ext_h, ow, oh, fn, idt, idt, NULL, gclmemOutputDesc)); - return SUCCESS; - } - - if(idf == DF_NCHWC3) { - if(fn == 1 && fc == 3 && fw == 1){ - if(outputDesc) *outputDesc = tensor4df(idt, DF_NCHW, in, fn, oh, ow); - item_w = 2; - item_h = 1; - iw_align = (iw + item_w - 1) / item_w * item_w; - ih_align = (ih + item_h - 1) / item_h * item_h; - CHECK_STATUS(infer_gclmem_desc_nchwc3_to_nchw(iw_align, ih_align, ic, pw, ph, ow, oh, fn, gclmemInputDesc, gclmemOutputDesc)); - return SUCCESS; - } - } - return NOT_SUPPORTED; -} - -EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, - TensorDesc inputDesc, - TensorDesc filterDesc, - ConvolutionDesc convDesc, - TensorDesc outputDesc, - ConvolutionPolicy policy, - ActivationMode activationMode, - ForwardRunInfoMali_t forwardRunInfo){ - if(forwardRunInfo == nullptr) CHECK_STATUS(NULL_POINTER); - ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); - if(algorithm != CONVOLUTION_ALGORITHM_NULL) return SUCCESS; - if(policy == CONVOLUTION_LIBRARY_SEARCH) CHECK_STATUS(NOT_SUPPORTED); - if(policy == CONVOLUTION_FASTEST) CHECK_STATUS(NOT_SUPPORTED); - DataType dt; - DataFormat idf; - U32 ic, ih, iw, fn, fh, fw, sh, sw; - tensorSelectGet(inputDesc, NULL, &idf, NULL, &ic, &ih, &iw); - tensorSelectGet(filterDesc, &dt, NULL, &fn, NULL, &fh, &fw); - sh = convDesc.stride_h; - sw = convDesc.stride_w; - if(idf == DF_NCHW_ORG_MALI && (ih != 1 || iw != 1 || fw != 1 || fh != 1)) { - U32 item_w = (8 * sw - (fw - 1)) / sw; - forwardRunInfo->best_w[0] = item_w; - forwardRunInfo->best_k[0] = 4; - forwardRunInfo->best_c[0] = 1; - forwardRunInfo->algorithm = CONVOLUTION_ALGORITHM_DIRECT; - return SUCCESS; - } - - if(policy == CONVOLUTION_TUNNING) { - GCLHandle_t handle_tun; - CHECK_STATUS(gcl_create_handle_profiling(&handle_tun)); - handle_tun->binMapPtr = handle->binMapPtr; - GCLMem_t input = gcl_create_gclmem(); - GCLMem_t filter = gcl_create_gclmem(); - GCLMem_t output = gcl_create_gclmem(); - GCLMem_t bias = gcl_create_gclmem(); - GCLMem_t tmpbuf = gcl_create_gclmem(); - std::vector<ConvolutionForwardAlgorithm> convolutionAlgorithms; - U32 maxInputSize = 0; - U32 maxOutputSize = 0; - U32 maxFilterSize = 0; - U32 maxBytes = 0; - U32 algosNum = 0; - U32 biasNum; - std::vector<ForwardRunInfoMali> runInfos; - std::vector<GCLMemDesc> inputMemDescs; - std::vector<GCLMemDesc> outputMemDescs; - std::vector<GCLMemDesc> filterMemDescs; - U32 configInfo[3][64]; - convolutionAlgorithms.push_back(CONVOLUTION_ALGORITHM_DIRECT); - if(fw == 3 && fh == 3 && sw == 1 && sh == 1)convolutionAlgorithms.push_back(CONVOLUTION_ALGORITHM_WINOGRAD); - - for(auto p : convolutionAlgorithms) { - ForwardRunInfoMali runInfo; - U32 configNum = 0; - U32 bytes; - U32 stride[3] = {0, 0, 0}; - U32 offset[3] = {0, 0, 0}; - runInfo.algorithm = (I32)(p); - if(p == CONVOLUTION_ALGORITHM_DIRECT) { - if(ih == 1 && iw == 1 && fh == 1 && fw == 1) { - configNum = 3; - if((ic & 15) != 0) configNum = 2; - if((ic & 7) != 0) configNum = 1; - for(U32 i = 0; i < configNum; i++) { - configInfo[0][i] = 1; - configInfo[1][i] = 1 << (2 + i); - configInfo[2][i] = 0; - } - } else { - configNum = 8; - for(U32 i = 0; i < configNum; i++) { - configInfo[0][i] = i + 1; -
configInfo[1][i] = 4; - configInfo[2][i] = 4; - } - if(fn % 8 == 0) { - for(U32 i = 0; i < 4; i++) { - configInfo[0][i + configNum] = i + 1; - configInfo[1][i + configNum] = 4; - configInfo[2][i + configNum] = 8; - } - configNum += 4; - } - } - } - - if(p == CONVOLUTION_ALGORITHM_WINOGRAD) { - biasNum = fn; - bias->desc.byteSize = biasNum * bytesOf(dt); - bias->desc.memType = GCL_MEM_BUF; - configNum = 0; - for(U32 i = 1; i <= 8; i++) { - for(U32 j = 1; j <= 8; j++) { - if(i * j <= 2) continue; - configInfo[0][configNum] = i; - configInfo[1][configNum] = 1; - configInfo[2][configNum] = j; - configNum++; - } - } - } - - for(U32 i = 0; i < configNum; ++i) { - GCLMemDesc inputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); - GCLMemDesc outputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); - GCLMemDesc filterMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); - runInfo.best_w[0] = configInfo[0][i]; - runInfo.best_c[0] = configInfo[1][i]; - runInfo.best_k[0] = configInfo[2][i]; - if(convolution_infer_output_size_mali(inputDesc, filterDesc, convDesc, NULL, &inputMemDesc, &outputMemDesc, &runInfo) != SUCCESS) continue; - if(convolution_transform_filter_bytes_mali(filterDesc, &runInfo, &filterMemDesc, &bytes) != SUCCESS) continue; - if(maxBytes < bytes) maxBytes= bytes; - if(convolution_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, outputDesc, convDesc, &runInfo, &bytes) != SUCCESS) continue; - if(maxBytes < bytes) maxBytes= bytes; - if(maxInputSize < inputMemDesc.byteSize) maxInputSize = inputMemDesc.byteSize; - if(maxOutputSize < outputMemDesc.byteSize) maxOutputSize = outputMemDesc.byteSize; - if(maxFilterSize < filterMemDesc.byteSize) maxFilterSize = filterMemDesc.byteSize; - inputMemDescs.push_back(inputMemDesc); - outputMemDescs.push_back(outputMemDesc); - filterMemDescs.push_back(filterMemDesc); - runInfos.push_back(runInfo); - } - } - - if(ih == 1 && iw == 1 && fh == 1 && fw == 1) { - biasNum = fn; - bias->desc.byteSize = biasNum * bytesOf(dt); - bias->desc.memType = GCL_MEM_BUF; - } else { - biasNum = (fn + 3) / 4; - bias->desc.byteSize = biasNum * 4 * bytesOf(dt); - bias->desc.memType = GCL_MEM_IMG_1D; - } - algosNum = runInfos.size(); - if(algosNum == 0) CHECK_STATUS(NOT_SUPPORTED); - TensorDesc scaleDesc = tensor1d(DT_F32, 0); - TensorDesc biasDesc = tensor1d(dt, fn); - inputMemDescs[0].byteSize = maxInputSize; - outputMemDescs[0].byteSize = maxOutputSize; - filterMemDescs[0].byteSize = maxFilterSize; - input->desc = inputMemDescs[0]; - output->desc = outputMemDescs[0]; - filter->desc = filterMemDescs[0]; - bias->desc.stride[0] = biasNum; - bias->desc.stride[1] = 1; - bias->desc.stride[2] = 1; - bias->desc.offset[0] = 0; - bias->desc.offset[1] = 0; - bias->desc.offset[2] = 0; - bias->desc.num = biasNum; - bias->desc.memFormat = DF_NHWC; - tmpbuf->desc.byteSize = maxBytes; - gcl_create_memory(handle_tun, input); - gcl_create_memory(handle_tun, output); - gcl_create_memory(handle_tun, filter); - gcl_create_memory(handle_tun, bias); - if(maxBytes) gcl_create_memory(handle_tun, tmpbuf); - - double minTimeDirect = DBL_MAX; - double minTimeWinograd = DBL_MAX; - double minTime = DBL_MAX; - double winogradPicTranTime = DBL_MAX; - double winogradOutTranTime = DBL_MAX; - U32 runKernelBe = 0; - U32 runKernelEnd = 0; - ForwardRunInfoMali bestRunInfo; - ForwardRunInfoMali bestRunInfoDirect; - ForwardRunInfoMali bestRunInfoWinograd; - for(U32 i = 0; i < algosNum; i++) { - input->desc = inputMemDescs[i]; - output->desc = outputMemDescs[i]; - filter->desc = 
filterMemDescs[i]; - if(convolution_mali(handle_tun, inputDesc, input, filterDesc, filter, convDesc, &runInfos[i], scaleDesc, NULL, biasDesc, bias, - maxBytes, tmpbuf, outputDesc, output, activationMode) == SUCCESS) { - if(runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_DIRECT) { - runKernelEnd = handle_tun->kernelVec.size(); - gcl_run_kernelVec_timing(handle_tun, runKernelBe, runKernelEnd); - runKernelBe = runKernelEnd; - if(minTimeDirect > handle_tun->t_execute) { - minTimeDirect = handle_tun->t_execute; - bestRunInfoDirect = runInfos[i]; - } - } - - if(runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_WINOGRAD) { - if(winogradPicTranTime == DBL_MAX) { - runKernelEnd = runKernelBe + 2; - gcl_run_kernelVec_timing(handle_tun, runKernelBe, runKernelEnd); - winogradPicTranTime = handle_tun->t_execute; - } - runKernelBe += 2; - runKernelEnd = runKernelBe + 1; - gcl_run_kernelVec_timing(handle_tun, runKernelBe, runKernelEnd); - if(minTimeWinograd > handle_tun->t_execute) { - minTimeWinograd = handle_tun->t_execute; - bestRunInfoWinograd = runInfos[i]; - } - runKernelBe += 36; - if(winogradOutTranTime == DBL_MAX) { - runKernelEnd = runKernelBe + 1; - gcl_run_kernelVec_timing(handle_tun, runKernelBe, runKernelEnd); - winogradOutTranTime = handle_tun->t_execute; - } - runKernelBe = handle_tun->kernelVec.size(); - } - } - } - if(minTimeWinograd != DBL_MAX) minTimeWinograd = 36 * minTimeWinograd + winogradPicTranTime + winogradOutTranTime; - minTime = minTimeDirect; - bestRunInfo = bestRunInfoDirect; - if(minTimeWinograd < minTime) { - minTime = minTimeWinograd; - bestRunInfo = bestRunInfoWinograd; - } - if(minTime == DBL_MAX) CHECK_STATUS(NOT_SUPPORTED); - *forwardRunInfo = bestRunInfo; - CHECK_STATUS(gcl_finish(handle_tun)); - gcl_destroy_gclmem(input); - gcl_destroy_gclmem(filter); - gcl_destroy_gclmem(output); - gcl_destroy_gclmem(bias); - gcl_destroy_gclmem(tmpbuf); - convolutionAlgorithms.clear(); - runInfos.clear(); - inputMemDescs.clear(); - outputMemDescs.clear(); - filterMemDescs.clear(); - gcl_destroy_handle(handle_tun); - return SUCCESS; - - } - return NOT_SUPPORTED; -} - -EE convolution_transform_filter_bytes_mali(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes){ - EE ret = SUCCESS; - switch(filterDesc.dt){ - case DT_F16:{ - ret = convolution_transform_filter_bytes_mali_fp16(filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE convolution_transform_filter_mali(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem, - GCLMem_t tmp){ - EE ret = SUCCESS; - switch(filterDesc.dt){ - case DT_F16:{ - ret = convolution_transform_filter_mali_fp16(handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem, tmp); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE convolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes){ - EE ret = SUCCESS; - switch(inputDesc.dt){ - case DT_F16:{ - ret = convolution_infer_forward_tmp_bytes_mali_fp16(inputDesc, filterDesc, outputDesc, convDesc, forwardRunInfo, bytes); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = 
NOT_SUPPORTED; - break; - } - return ret; -} -EE convolution_mali(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc scaleDesc, - const GCLMem_t scale, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode activationMode){ - UNUSED(scaleDesc); - UNUSED(scale); - EE ret = SUCCESS; - switch(inputDesc.dt){ - case DT_F16:{ - ret = convolution_mali_fp16(handle, inputDesc, input, filterDesc, filter, convDesc, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, activationMode); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - diff --git a/tensor_computing/src/gpu/mali/depth2space.cpp b/tensor_computing/src/gpu/mali/depth2space.cpp deleted file mode 100644 index 1a043033..00000000 --- a/tensor_computing/src/gpu/mali/depth2space.cpp +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
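Note on the convolution.cpp hunk above: the deleted convolution_infer_output_size_mali uses the standard dilated-convolution extent formula, then pads the input width so whole item_w-sized work items tile each output row. A hedged restatement of that math follows; the helper names are illustrative, not Bolt's API.

    #include <cstdint>

    // Output extent with dilation folded into the effective filter footprint,
    // matching ow = (iw + 2 * pw - fdw) / sw + 1 in the deleted code.
    inline uint32_t convOutExtent(uint32_t in, uint32_t pad, uint32_t filt,
                                  uint32_t dilation, uint32_t stride) {
        uint32_t effFilt = (filt - 1) * dilation + 1;
        return (in + 2 * pad - effFilt) / stride + 1;
    }
    // e.g. iw = 224, pw = 1, fw = 3, dw = 1, sw = 2 -> (224 + 2 - 3) / 2 + 1 = 112

    // Aligned input width: round the output row up to a multiple of item_w,
    // scale back by the stride, and add tail padding when fw / 2 exceeds pw.
    inline uint32_t alignedInputWidth(uint32_t ow, uint32_t itemW, uint32_t sw,
                                      uint32_t fw, uint32_t pw) {
        uint32_t iwAlign = (ow + itemW - 1) / itemW * itemW * sw;
        uint32_t extW = (fw / 2 < pw) ? pw : fw / 2;
        if (pw < extW) {
            iwAlign += 2 * (extW - pw);  // extra padding carried at the end
        }
        return iwAlign;
    }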
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -inline EE depth2space_checkpara_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - if(handle == nullptr || nullptr == input || nullptr == output) return NULL_POINTER; - if(input->desc.memFormat != DF_NCWHC4) return NOT_SUPPORTED; - if(output->desc.memFormat != DF_NCHW) return NOT_SUPPORTED; - if(inputDesc.df != DF_NCHW) return NOT_SUPPORTED; - if(outputDesc.df != DF_NCHW) return NOT_SUPPORTED; - if(inputDesc.dt != DT_F16) return NOT_SUPPORTED; - if(outputDesc.dt != DT_U8) return NOT_SUPPORTED; - if(inputDesc.dims[0] * 4 != outputDesc.dims[0]) return NOT_SUPPORTED; - if(inputDesc.dims[1] * 4 != outputDesc.dims[1]) return NOT_SUPPORTED; - if(inputDesc.dims[2] / 16 != outputDesc.dims[2]) return NOT_SUPPORTED; - if(inputDesc.dims[3] != outputDesc.dims[3]) return NOT_SUPPORTED; - return SUCCESS; -} - -inline EE depth2space_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - UNUSED(outputDesc); - U32 iw, ih, ic, in; - tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); - U32 iw_str, ih_str, iw_off, ih_off; - ih_str = input->desc.stride[0]; - iw_str = input->desc.stride[1]; - ih_off = input->desc.offset[0]; - iw_off = input->desc.offset[1]; - U32 ow_str, oh_str, ow_off, oh_off; - ow_str = output->desc.stride[0]; - oh_str = output->desc.stride[1]; - ow_off = output->desc.offset[0]; - oh_off = output->desc.offset[1]; - - cl_mem inbuf, outbuf; - inbuf = input->mem; - outbuf = output->mem; - - U32 gs[2] = {ih, iw * 4}; - U32 ls[2] = {0, 0}; - U32 dim = 2; - Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, "depth2space", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, iw, ih, iw_str, ih_str, iw_off, ih_off, - ow_str, oh_str, ow_off, oh_off, inbuf, outbuf)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, "depth2space"); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "depth2space")); - CHECK_STATUS(gcl_print_memory(handle, input, "depth2space_input")); - CHECK_STATUS(gcl_print_memory(handle, output, "depth2space_output")); -#endif - return SUCCESS; -} - -EE depth2space_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc){ - /*tensorDesc record cpu org data format info*/ - /*gclmemDesc record gpu trans data format info*/ - if(outputDesc) *outputDesc = inputDesc; - - DataType idt; - DataFormat idf; - U32 iw, ih, ic, in; - U32 ow, oh, oc, on; - tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); - if(idt != DT_F16) return NOT_SUPPORTED; - if(ic != 16) return NOT_SUPPORTED; - on = in; - oc = 1; - oh = ih * 4; - ow = iw * 4; - - if(idf == DF_NCHW) { - *outputDesc = tensor4df(DT_U8, idf, on, oc, oh, ow); - CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw, ih, ic, 0, 0, 0, 0, 0, DT_F16, DT_F16, gclmemInputDesc, NULL)); - CHECK_STATUS(infer_gclmem_desc_nchw( 0, 0, 0, 0, 0, ow, oh, oc, DT_U8, DT_U8, NULL, gclmemOutputDesc)); - return SUCCESS; - } - return NOT_SUPPORTED; -} - -EE depth2space_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - EE ret = SUCCESS; - CHECK_STATUS(depth2space_checkpara_mali(handle, inputDesc, input, outputDesc, output)); - CHECK_STATUS(depth2space_core_mali_fp16(handle, 
inputDesc, input, outputDesc, output)); - return ret; -} - - - diff --git a/tensor_computing/src/gpu/mali/depthwise_convolution.cpp b/tensor_computing/src/gpu/mali/depthwise_convolution.cpp deleted file mode 100644 index a50bfd39..00000000 --- a/tensor_computing/src/gpu/mali/depthwise_convolution.cpp +++ /dev/null @@ -1,509 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/fp16/depthwise_convolution_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -EE depthwise_convolution_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc filterDesc, - ConvolutionDesc convDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc, - ForwardRunInfoMali_t forwardRunInfo) { - UNUSED(forwardRunInfo); - DataType idt, fdt; - DataFormat idf, fdf; - U32 iw, ih, ic, in; - U32 fw, fh, fc, fn; - U32 ow, oh; - U32 sw, sh, pw, ph, dw, dh, pr, pb; - tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); - tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); - pw = convDesc.padding_left; - pr = convDesc.padding_right; - ph = convDesc.padding_top; - pb = convDesc.padding_bottom; - sw = convDesc.stride_w; - sh = convDesc.stride_h; - dw = convDesc.dilatedRate_w; - dh = convDesc.dilatedRate_h; - if (fw < 1 || fh < 1) return NOT_SUPPORTED; - if (dw != 1 || dh != 1) return NOT_SUPPORTED; - if (pw != ph || sw != sh) return NOT_SUPPORTED; - if (pb != ph || pr != pw) return NOT_SUPPORTED; - if ((fn & 3) != 0) return NOT_SUPPORTED; - ow = (iw + 2 * pw - fw) / sw + 1; - oh = (ih + 2 * ph - fh) / sh + 1; - if(outputDesc) *outputDesc = tensor4df(idt, idf, in, fn, oh, ow); - - U32 iw_align, item_w, ext_w; - if(idf == DF_NCHW) { - item_w = forwardRunInfo->best_w[0]; - ext_w = (fw / 2 < pw) ? 
pw : fw / 2; - iw_align = (ow + item_w - 1) / item_w * item_w; - iw_align = iw_align * sw; - if(pw < ext_w) { - iw_align = iw_align + 2 * (ext_w - pw); - ext_w = pw; - } - CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw_align, ih, ic, ext_w, ph, ow, oh, fn, idt, idt, gclmemInputDesc, gclmemOutputDesc)); - return SUCCESS; - } - return NOT_SUPPORTED; -} - -EE depthwise_convolution_infer_forward_algorithm_mali(GCLHandle_t handle, - TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ConvolutionPolicy policy, - ActivationMode depthwiseActivationMode, - ActivationMode pointwiseActivationMode, - ForwardRunInfoMali_t forwardRunInfo) { - - if(forwardRunInfo == nullptr) CHECK_STATUS(NULL_POINTER); - DepthwiseConvolutionForwardAlgorithm algorithm = (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); - if(algorithm != DEPTHWISE_CONVOLUTION_ALGORITHM_NULL) return SUCCESS; - if(policy == CONVOLUTION_LIBRARY_SEARCH) CHECK_STATUS(NOT_SUPPORTED); - if(policy == CONVOLUTION_FASTEST) CHECK_STATUS(NOT_SUPPORTED); - - if(policy == CONVOLUTION_TUNNING) { - GCLHandle_t handle_tun; - CHECK_STATUS(gcl_create_handle_profiling(&handle_tun)); - handle_tun->binMapPtr = handle->binMapPtr; - GCLMem_t input = gcl_create_gclmem(); - GCLMem_t filter = gcl_create_gclmem(); - GCLMem_t output = gcl_create_gclmem(); - GCLMem_t bias = gcl_create_gclmem(); - GCLMem_t tmpbuf = gcl_create_gclmem(); - GCLMem_t filter_dp = gcl_create_gclmem(); - GCLMem_t bias_dp = gcl_create_gclmem(); - GCLMem_t bias_buf = gcl_create_gclmem(); - std::vector<DepthwiseConvolutionForwardAlgorithm> depthwiseConvolutionAlgorithms; - DataFormat filterDF = filterDesc.df; - if(filterDF == DF_NCHW) { - depthwiseConvolutionAlgorithms.push_back(DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT); - } else if (filterDF == DF_CHW_NC) { - depthwiseConvolutionAlgorithms.push_back(DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT); - depthwiseConvolutionAlgorithms.push_back(DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM); - } else { - CHECK_STATUS(NOT_SUPPORTED); - } - U32 maxInputSize = 0; - U32 maxOutputSize = 0; - U32 maxFilterSize = 0; - U32 maxBytes = 0; - U32 algosNum = 0; - U32 biasNum; - U32 loop = 1; - U32 ic, ih, iw, fn, fc, fh, fw; - DataType dt; - std::vector<ForwardRunInfoMali> runInfos; - std::vector<GCLMemDesc> inputMemDescs; - std::vector<GCLMemDesc> outputMemDescs; - std::vector<GCLMemDesc> filterMemDescs; - std::vector<GCLMemDesc> filterMemDescs_dp; - std::vector<double> kernelTimeArray; - U32 configInfo[3][8] = {{0}}; - U32 configInfo_dp[3][16] = {{0}}; - tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, &ih, &iw); - tensorSelectGet(filterDesc, &dt, NULL, &fn, &fc, &fh, &fw); - - for(auto p : depthwiseConvolutionAlgorithms) { - ForwardRunInfoMali runInfo; - U32 bytes; - U32 stride[3] = {0, 0, 0}; - U32 offset[3] = {0, 0, 0}; - runInfo.algorithm = (I32)p; - U32 configNum = 8; - for(U32 i = 0; i < configNum; i++) { - configInfo[0][i] = i + 1;//w - configInfo[1][i] = 1;//c - configInfo[2][i] = 4;//k - } - - if(p == DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT) { - for(U32 i = 0; i < configNum; i++) { - configInfo_dp[0][i] = i + 1; - configInfo_dp[1][i] = 4; - configInfo_dp[2][i] = 4; - } - if(fn % 8 == 0) { - for(U32 i = 0; i < configNum; i++) configInfo_dp[2][i + configNum] = 8;//k - loop = 2; - } - } - - if(p == DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM) { - for(U32 i = 0; i < configNum; i++) { - configInfo_dp[0][i] = i + 1;//w - configInfo_dp[1][i] = 1;//c - configInfo_dp[2][i] = 4;//k - } - if(fn % 8 == 0) { - for(U32 i = 0; i < configNum; i++) configInfo_dp[2][i + configNum] = 8;//k - loop = 2; - } - }
- - for(U32 j = 0; j < loop; j++) { - for(U32 i = 0; i < configNum; ++i) { - GCLMemDesc inputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); - GCLMemDesc outputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);; - GCLMemDesc filterMemDesc[2]; - filterMemDesc[0] = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); - filterMemDesc[1] = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); - runInfo.best_w[0] = configInfo[0][i]; - runInfo.best_c[0] = configInfo[1][i]; - runInfo.best_k[0] = configInfo[2][i]; - runInfo.best_w[1] = configInfo_dp[0][i]; - runInfo.best_c[1] = configInfo_dp[1][i]; - runInfo.best_k[1] = configInfo_dp[2][i + j * 8]; - if(depthwise_convolution_infer_output_size_mali(inputDesc, filterDesc, convDesc, NULL, &inputMemDesc, &outputMemDesc, &runInfo) != SUCCESS) continue; - if(depthwise_convolution_transform_filter_bytes_mali(filterDesc, &runInfo, filterMemDesc, &bytes) != SUCCESS) continue; - if(maxBytes < bytes) maxBytes= bytes; - if(depthwise_convolution_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, outputDesc, convDesc, &runInfo, &bytes) != SUCCESS) continue; - if(maxBytes < bytes) maxBytes= bytes; - if(maxInputSize < inputMemDesc.byteSize) maxInputSize = inputMemDesc.byteSize; - if(maxOutputSize < outputMemDesc.byteSize) maxOutputSize = outputMemDesc.byteSize; - if(maxFilterSize < filterMemDesc[0].byteSize) maxFilterSize = filterMemDesc[0].byteSize; - if(maxFilterSize < filterMemDesc[1].byteSize) maxFilterSize = filterMemDesc[1].byteSize; - inputMemDescs.push_back(inputMemDesc); - outputMemDescs.push_back(outputMemDesc); - filterMemDescs.push_back(filterMemDesc[0]); - filterMemDescs_dp.push_back(filterMemDesc[1]); - runInfos.push_back(runInfo); - } - } - } - - biasNum = (fn + 3) / 4; - TensorDesc biasDesc = tensor1d(dt, fn); - if (filterDF == DF_CHW_NC) { - bias_dp->desc.memType = GCL_MEM_IMG_1D; - bias_dp->desc.byteSize = biasNum * 4 * bytesOf(dt); - bias_dp->desc.stride[0] = biasNum; - bias_dp->desc.stride[1] = 1; - bias_dp->desc.stride[2] = 1; - bias_dp->desc.offset[0] = 0; - bias_dp->desc.offset[1] = 0; - bias_dp->desc.offset[2] = 0; - bias_dp->desc.num = biasNum; - bias_dp->desc.memFormat = DF_NHWC; - gcl_create_memory(handle_tun, bias_dp); - } - - biasNum = (fn + 7) / 8 * 8; - if (filterDF == DF_CHW_NC) { - bias_buf->desc.memType = GCL_MEM_BUF; - bias_buf->desc.byteSize = biasNum * bytesOf(dt); - bias_buf->desc.stride[0] = biasNum; - bias_buf->desc.stride[1] = 1; - bias_buf->desc.stride[2] = 1; - bias_buf->desc.offset[0] = 0; - bias_buf->desc.offset[1] = 0; - bias_buf->desc.offset[2] = 0; - bias_buf->desc.num = biasNum; - bias_buf->desc.memFormat = DF_NHWC; - gcl_create_memory(handle_tun, bias_buf); - biasNum = (fc + 3) / 4; - biasDesc = tensor1d(dt, fn + fc); - } - - algosNum = runInfos.size(); - if(algosNum == 0) CHECK_STATUS(NOT_SUPPORTED); - inputMemDescs[0].byteSize = maxInputSize; - outputMemDescs[0].byteSize = maxOutputSize; - filterMemDescs[0].byteSize = maxFilterSize; - input->desc = inputMemDescs[0]; - output->desc = outputMemDescs[0]; - filter->desc = filterMemDescs[0]; - bias->desc.memType = GCL_MEM_IMG_1D; - bias->desc.byteSize = biasNum * 4 * bytesOf(dt); - bias->desc.stride[0] = biasNum; - bias->desc.stride[1] = 1; - bias->desc.stride[2] = 1; - bias->desc.offset[0] = 0; - bias->desc.offset[1] = 0; - bias->desc.offset[2] = 0; - bias->desc.num = biasNum; - bias->desc.memFormat = DF_NHWC; - tmpbuf->desc.byteSize = maxBytes; - gcl_create_memory(handle_tun, input); - gcl_create_memory(handle_tun, output); - gcl_create_memory(handle_tun, 
filter); - gcl_create_memory(handle_tun, bias); - if (filterDF == DF_CHW_NC) { - filterMemDescs_dp[0].byteSize = maxFilterSize; - filter_dp->desc = filterMemDescs_dp[0]; - gcl_create_memory(handle_tun, filter_dp); - } - if(maxBytes) gcl_create_memory(handle_tun, tmpbuf); - - double minTime = DBL_MAX; - double minTime_d_direct = DBL_MAX; - double minTime_p_direct = DBL_MAX; - double minTime_d_gemm = DBL_MAX; - double minTime_p_gemm = DBL_MAX; - U32 runKernelBe = 0; - U32 runKernelEnd = 0; - ForwardRunInfoMali bestRunInfo; - ForwardRunInfoMali bestRunInfoDirect; - ForwardRunInfoMali bestRunInfoGEMM; - CHECK_STATUS(gcl_finish(handle_tun)); - for(U32 i = 0; i < algosNum; i++) { - I32 algo = runInfos[i].algorithm; - U32 best_wp = runInfos[i].best_w[1]; - U32 best_kp = runInfos[i].best_k[1]; - input->desc = inputMemDescs[i]; - output->desc = outputMemDescs[i]; - filter->desc = filterMemDescs[i]; - filter_dp->desc = filterMemDescs_dp[i]; - GCLMem filterArray[2]; - GCLMem biasArray[2]; - filterArray[0] = *filter; - filterArray[1] = *filter_dp; - biasArray[0] = *bias; - biasArray[1] = *bias_dp; - if(algo == (I32)(DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM)) biasArray[1] = *bias_buf; - if(algo == (I32)(DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT) && best_kp == 8 && best_wp > 4) continue; - if(depthwise_convolution_mali(handle_tun, inputDesc, input, filterDesc, filterArray, convDesc, &runInfos[i], biasDesc, biasArray, - maxBytes, tmpbuf, outputDesc, output, depthwiseActivationMode, pointwiseActivationMode) == SUCCESS) { - if(algo == DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT) { - runKernelEnd = handle_tun->kernelVec.size(); - gcl_run_kernelVec_timing(handle_tun, runKernelBe, runKernelEnd); - if(minTime > handle_tun->t_execute) { - minTime = handle_tun->t_execute; - bestRunInfo.algorithm = runInfos[i].algorithm; - bestRunInfo.best_w[0] = runInfos[i].best_w[0]; - bestRunInfo.best_c[0] = runInfos[i].best_c[0]; - bestRunInfo.best_k[0] = runInfos[i].best_k[0]; - } - runKernelBe = runKernelEnd; - } - - if(algo == DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT) { - if(best_kp == 4) { - runKernelEnd = handle_tun->kernelVec.size(); - gcl_run_kernelVec_timing(handle_tun, runKernelBe, runKernelEnd, &kernelTimeArray); - if(minTime_d_direct > kernelTimeArray[0]) { - minTime_d_direct = kernelTimeArray[0]; - bestRunInfoDirect.algorithm = runInfos[i].algorithm; - bestRunInfoDirect.best_w[0] = runInfos[i].best_w[0]; - bestRunInfoDirect.best_c[0] = runInfos[i].best_c[0]; - bestRunInfoDirect.best_k[0] = runInfos[i].best_k[0]; - } - if(minTime_p_direct > kernelTimeArray[1]) { - minTime_p_direct = kernelTimeArray[1]; - bestRunInfoDirect.best_w[1] = runInfos[i].best_w[1]; - bestRunInfoDirect.best_c[1] = runInfos[i].best_c[1]; - bestRunInfoDirect.best_k[1] = runInfos[i].best_k[1]; - } - runKernelBe = runKernelEnd; - kernelTimeArray.clear(); - } - if(best_kp == 8) { - runKernelEnd = handle_tun->kernelVec.size(); - runKernelBe = runKernelBe + 1; - gcl_run_kernelVec_timing(handle_tun, runKernelBe, runKernelEnd); - if(minTime_p_direct > handle_tun->t_execute) { - minTime_p_direct = handle_tun->t_execute; - bestRunInfoDirect.best_w[1] = runInfos[i].best_w[1]; - bestRunInfoDirect.best_c[1] = runInfos[i].best_c[1]; - bestRunInfoDirect.best_k[1] = runInfos[i].best_k[1]; - } - runKernelBe = runKernelEnd; - } - } - - if(algo == DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM) { - if(best_kp == 4) { - runKernelEnd = handle_tun->kernelVec.size(); - gcl_run_kernelVec_timing(handle_tun, runKernelBe, runKernelEnd, 
&kernelTimeArray); - if(minTime_d_gemm > kernelTimeArray[0]) { - minTime_d_gemm = kernelTimeArray[0]; - bestRunInfoGEMM.algorithm = runInfos[i].algorithm; - bestRunInfoGEMM.best_w[0] = runInfos[i].best_w[0]; - bestRunInfoGEMM.best_c[0] = runInfos[i].best_c[0]; - bestRunInfoGEMM.best_k[0] = runInfos[i].best_k[0]; - } - if(minTime_p_gemm > kernelTimeArray[1]) { - minTime_p_gemm = kernelTimeArray[1]; - bestRunInfoGEMM.best_w[1] = runInfos[i].best_w[1]; - bestRunInfoGEMM.best_c[1] = runInfos[i].best_c[1]; - bestRunInfoGEMM.best_k[1] = runInfos[i].best_k[1]; - runKernelBe = runKernelEnd; - } - kernelTimeArray.clear(); - } - if(best_kp == 8) { - runKernelEnd = handle_tun->kernelVec.size(); - runKernelBe = runKernelBe + 1; - gcl_run_kernelVec_timing(handle_tun, runKernelBe, runKernelEnd); - if(minTime_p_gemm > handle_tun->t_execute) { - minTime_p_gemm = handle_tun->t_execute; - bestRunInfoGEMM.best_w[1] = runInfos[i].best_w[1]; - bestRunInfoGEMM.best_c[1] = runInfos[i].best_c[1]; - bestRunInfoGEMM.best_k[1] = runInfos[i].best_k[1]; - } - runKernelBe = runKernelEnd; - } - } - } - } - - if (filterDF == DF_CHW_NC) { - double minTime_direct = minTime_d_direct + minTime_p_direct; - double minTime_gemm = minTime_d_gemm + minTime_p_gemm; - if(minTime_direct <= minTime_gemm) { - minTime = minTime_direct; - bestRunInfo = bestRunInfoDirect; - } else { - minTime = minTime_gemm; - bestRunInfo = bestRunInfoGEMM; - } - - } - if(minTime == DBL_MAX) CHECK_STATUS(NOT_SUPPORTED); - *forwardRunInfo = bestRunInfo; - CHECK_STATUS(gcl_finish(handle_tun)); - gcl_destroy_gclmem(input); - gcl_destroy_gclmem(filter); - gcl_destroy_gclmem(output); - gcl_destroy_gclmem(bias); - gcl_destroy_gclmem(tmpbuf); - if (filterDF == DF_CHW_NC) { - gcl_destroy_gclmem(filter_dp); - gcl_destroy_gclmem(bias_dp); - gcl_destroy_gclmem(bias_buf); - } - depthwiseConvolutionAlgorithms.clear(); - runInfos.clear(); - inputMemDescs.clear(); - outputMemDescs.clear(); - filterMemDescs.clear(); - filterMemDescs_dp.clear(); - kernelTimeArray.clear(); - gcl_destroy_handle(handle_tun); - return SUCCESS; - } - return NOT_SUPPORTED; -} - -EE depthwise_convolution_transform_filter_bytes_mali(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes) { - EE ret = SUCCESS; - switch(filterDesc.dt) { - case DT_F16:{ - ret = depthwise_convolution_transform_filter_bytes_mali_fp16(filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE depthwise_convolution_transform_filter_mali(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem) { - EE ret = SUCCESS; - switch(filterDesc.dt) { - case DT_F16:{ - ret = depthwise_convolution_transform_filter_mali_fp16(handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE depthwise_convolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes) { - EE ret = SUCCESS; - switch(inputDesc.dt) { - case DT_F16:{ - ret = depthwise_convolution_infer_forward_tmp_bytes_mali_fp16(inputDesc, filterDesc, outputDesc, convDesc, forwardRunInfo, bytes); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - 
break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} -EE depthwise_convolution_mali(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode depthwiseActivationMode, - ActivationMode pointwiseActivationMode) { - EE ret = SUCCESS; - switch(inputDesc.dt) { - case DT_F16:{ - ret = depthwise_convolution_mali_fp16(handle, inputDesc, input, filterDesc, filter, convDesc, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, - depthwiseActivationMode, pointwiseActivationMode); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - diff --git a/tensor_computing/src/gpu/mali/eltwise.cpp b/tensor_computing/src/gpu/mali/eltwise.cpp deleted file mode 100644 index bf0199e9..00000000 --- a/tensor_computing/src/gpu/mali/eltwise.cpp +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
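Note on the two tuning routines deleted above (convolution.cpp and depthwise_convolution.cpp): both follow the same measure-and-pick scheme. They enumerate (w, c, k) register-blocking candidates per algorithm, time one dispatch of each on a profiling handle, and keep the fastest; the Winograd total is assembled as 36 tile kernels plus the picture and output transform kernels, and the depthwise-pointwise variants sum separately timed depthwise and pointwise kernels before comparing DIRECT against GEMM. A compressed sketch of the selection loop under those assumptions; Config and runOnce are placeholders for the real run-info structs and profiled dispatch, not Bolt's API.

    #include <cfloat>
    #include <vector>

    struct Config { int w, c, k; };  // register-blocking candidate

    // Time every candidate once and keep the fastest; returns false when no
    // candidate ran successfully (mirrors the DBL_MAX check in the originals).
    inline bool pickFastest(const std::vector<Config> &candidates,
                            double (*runOnce)(const Config &), Config &best) {
        double minTime = DBL_MAX;
        for (const Config &cfg : candidates) {
            double t = runOnce(cfg);  // one profiled dispatch per config
            if (t < minTime) {
                minTime = t;
                best = cfg;
            }
        }
        return minTime != DBL_MAX;
    }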
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/fp16/eltwise_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -EE eltwise_infer_output_size_mali(std::vector inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc){ - /*tensorDesc record cpu org data format info*/ - /*gclmemDesc record gpu trans data format info*/ - U32 size = inputDesc.size(); - U32 arrayDimMax = 0; - for (U32 i = 1; i < size; i++) { - if (inputDesc[i].nDims > inputDesc[arrayDimMax].nDims) arrayDimMax = i; - if (inputDesc[i].df != inputDesc[0].df) CHECK_STATUS(NOT_SUPPORTED); - } - if(outputDesc) *outputDesc = inputDesc[arrayDimMax]; - - if(inputDesc[0].df == DF_NCHW || inputDesc[0].df == DF_MKT) { - DataType idt; - DataFormat idf; - U32 iw, ih, ic, in; - if(inputDesc[0].df == DF_NCHW) tensorSelectGet(inputDesc[0], &idt, &idf, &in, &ic, &ih, &iw); - if(inputDesc[0].df == DF_MKT) { - U32 m, k, t; - get_nlp_mkt_val(inputDesc[0], &idt, &m, &k, &t); - map_nlp_mkt_to_ncwhc4(m, k, t, &iw, &ih, &ic); - ic = 4 * ic; - } - U32 ih_align = (ih + 1) / 2 * 2; - CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw, ih_align, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); - if(gclmemInputDesc) { - U32 s0 = gclmemInputDesc[0].stride[0]; - U32 s1 = gclmemInputDesc[0].stride[1]; - U32 s2 = gclmemInputDesc[0].stride[2]; - U32 off0 = gclmemInputDesc[0].offset[0]; - U32 off1 = gclmemInputDesc[0].offset[1]; - U32 off2 = gclmemInputDesc[0].offset[2]; - for(U32 i = 1; i < size; i++) { - s0 = (s0 >= gclmemInputDesc[i].stride[0]) ? s0 : gclmemInputDesc[i].stride[0]; - s1 = (s1 >= gclmemInputDesc[i].stride[1]) ? s1 : gclmemInputDesc[i].stride[1]; - s2 = (s2 >= gclmemInputDesc[i].stride[2]) ? s2 : gclmemInputDesc[i].stride[2]; - off0 = (off0 >= gclmemInputDesc[i].offset[0]) ? off0 : gclmemInputDesc[i].offset[0]; - off1 = (off1 >= gclmemInputDesc[i].offset[1]) ? off1 : gclmemInputDesc[i].offset[1]; - off2 = (off2 >= gclmemInputDesc[i].offset[2]) ? 
off2 : gclmemInputDesc[i].offset[2]; - } - U32 num = s0 * s1 * s2 * 4; - U32 byteSize = num * bytesOf(idt); - for(U32 i = 0; i < size; i++) { - gclmemInputDesc[i].stride[0] = s0; - gclmemInputDesc[i].stride[1] = s1; - gclmemInputDesc[i].stride[2] = s2; - gclmemInputDesc[i].offset[0] = off0; - gclmemInputDesc[i].offset[1] = off1; - gclmemInputDesc[i].offset[2] = off2; - gclmemInputDesc[i].num = num; - gclmemInputDesc[i].byteSize = byteSize; - } - } - return SUCCESS; - } - return NOT_SUPPORTED; -} - -inline EE eltwise_checkpara_mali(GCLHandle_t handle, - std::vector inputDesc, - std::vector input, - TensorDesc outputDesc, - GCLMem_t output, - EltwiseMode eltwiseMode) { - - if(handle == nullptr || nullptr == output) return NULL_POINTER; - for(auto it : input) { - GCLMem_t ptr = (GCLMem_t)it; - if(ptr == nullptr) return NULL_POINTER; - if(ptr->desc.memFormat != output->desc.memFormat) return NOT_SUPPORTED; - } - for(auto it : inputDesc) { - if(it.df != outputDesc.df) return NOT_SUPPORTED; - if(it.dims[0] != outputDesc.dims[0]) return NOT_SUPPORTED; - if(it.dims[1] != outputDesc.dims[1]) return NOT_SUPPORTED; - if(it.dims[2] != outputDesc.dims[2]) return NOT_SUPPORTED; - if(it.dims[3] != outputDesc.dims[3]) return NOT_SUPPORTED; - } - if(outputDesc.df != DF_NCHW && outputDesc.df != DF_MKT) return NOT_SUPPORTED; - if(output->desc.memFormat != DF_NCWHC4) return NOT_SUPPORTED; - if(eltwiseMode != ELTWISE_SUM && eltwiseMode != ELTWISE_MAX && eltwiseMode != ELTWISE_PROD) return NOT_SUPPORTED; - return SUCCESS; -} - -EE eltwise_mali(GCLHandle_t handle, - std::vector inputDesc, - std::vector input, - TensorDesc outputDesc, - GCLMem_t output, - EltwiseMode eltwiseMode) { - EE ret = SUCCESS; - CHECK_STATUS(eltwise_checkpara_mali(handle, inputDesc, input, outputDesc, output, eltwiseMode)); - switch(inputDesc[0].dt){ - case DT_F16:{ - ret = eltwise_mali_fp16(handle, inputDesc, input, outputDesc, output, eltwiseMode); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - - - diff --git a/tensor_computing/src/gpu/mali/embedding.cpp b/tensor_computing/src/gpu/mali/embedding.cpp deleted file mode 100644 index 834546af..00000000 --- a/tensor_computing/src/gpu/mali/embedding.cpp +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
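Note on the eltwise.cpp hunk above: because a single kernel indexes every operand, the sizing pass widens all input descriptors to the element-wise maximum of their strides and offsets, then stamps the same num and byteSize on each. A simplified sketch of that unification, with the descriptor reduced to the fields the logic touches:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct Desc {
        uint32_t stride[3];
        uint32_t offset[3];
        uint32_t num;
        uint32_t byteSize;
    };

    // Widen every descriptor to the shared maximum so one kernel pitch fits all.
    inline void unifyInputDescs(std::vector<Desc> &descs, uint32_t bytesPerElem) {
        Desc m = descs[0];
        for (const Desc &d : descs) {
            for (int i = 0; i < 3; i++) {
                m.stride[i] = std::max(m.stride[i], d.stride[i]);
                m.offset[i] = std::max(m.offset[i], d.offset[i]);
            }
        }
        uint32_t num = m.stride[0] * m.stride[1] * m.stride[2] * 4;  // channels packed by 4
        for (Desc &d : descs) {
            for (int i = 0; i < 3; i++) {
                d.stride[i] = m.stride[i];
                d.offset[i] = m.offset[i];
            }
            d.num = num;
            d.byteSize = num * bytesPerElem;
        }
    }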
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/fp16/embedding_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -EE embedding_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - U32 inputDim, - U32 numOutput, - DataType dt, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc) { - UNUSED(inputDim); - DataType idt; - DataFormat df; - U32 batch, step; - CHECK_REQUIREMENT(tensorIs2d(inputDesc)); - CHECK_STATUS(tensor2dfGet(inputDesc, &idt, &df, &batch, &step)); - if(outputDesc) *outputDesc = tensor3df(dt, DF_MKT, batch, numOutput, step); - - if(df == DF_NORMAL) { - U32 iw = step; - U32 ih = batch; - U32 ic = 1; - CHECK_STATUS(infer_gclmem_desc_nchw(iw, ih, ic, 0, 0, 0, 0, 0, idt, dt, gclmemInputDesc, NULL)); - - U32 m = 1; - U32 ow, oh, oc; - map_nlp_mkt_to_ncwhc4(m, numOutput, step, &ow, &oh, &oc); - /*oc has been divided 4 in map_nlp_xxx, need to mul 4 for infer_xxx_ncwhc4*/ - CHECK_STATUS(infer_gclmem_desc_ncwhc4(0, 0, 0, 0, 0, ow, oh, oc * 4, idt, dt, NULL, gclmemOutputDesc)); - return SUCCESS; - } - return NOT_SUPPORTED; -} - -inline EE embedding_checkpara_mali(GCLHandle_t handle, - GCLMem_t input, - GCLMem_t weight, - GCLMem_t output) { - if(nullptr == handle || nullptr == input || nullptr == weight || nullptr == output) return NULL_POINTER; - return SUCCESS; -} - -EE embedding_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc weightDesc, - GCLMem_t weight, - TensorDesc outputDesc, - GCLMem_t output, - U32 inputDim, - U32 numOutput, - bool transpose, - DataType dt) { - EE ret = SUCCESS; - CHECK_STATUS(embedding_checkpara_mali(handle, input, weight, output)); - switch(dt) { - case DT_F16:{ - ret = embedding_mali_fp16(handle, inputDesc, input, weightDesc, weight, outputDesc, output, inputDim, numOutput, transpose); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - - - diff --git a/tensor_computing/src/gpu/mali/fp16/activation_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/activation_mali_fp16.cpp deleted file mode 100644 index 6da804d9..00000000 --- a/tensor_computing/src/gpu/mali/fp16/activation_mali_fp16.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
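Note on the embedding.cpp hunk above: DF_MKT sequence tensors reach the GPU through map_nlp_mkt_to_ncwhc4, which returns the channel extent already divided by 4; that is why the deleted code multiplies oc by 4 before sizing the NCWHC4 buffer. The mapping below is a hypothetical restatement inferred from the call sites in these hunks (m to w, t to h, k to packed slices); the real helper lives elsewhere in the tree.

    #include <cstdint>

    // Assumed (m, k, t) -> (w, h, c/4) mapping, as the surrounding call sites imply.
    inline void mktToNcwhc4(uint32_t m, uint32_t k, uint32_t t,
                            uint32_t *w, uint32_t *h, uint32_t *cSlices) {
        *w = m;                  // batch-like axis
        *h = t;                  // sequence steps
        *cSlices = (k + 3) / 4;  // hidden size packed four channels per slice
    }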
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/activation_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -inline EE activation_checkpara_mali_fp16(TensorDesc inputDesc) { - if(inputDesc.dt != DT_F16) return NOT_SUPPORTED; - return SUCCESS; -} - -inline EE activation_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode activationMode) { - UNUSED(inputDesc); - U32 ow, oh, oc, on; - if(outputDesc.df == DF_NCHW) { - tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); - } else if(outputDesc.df == DF_MKT) { - get_nlp_mkt_val(outputDesc, NULL, &on, &oc, &oh); - ow = 1; - } else { - CHECK_STATUS(NOT_SUPPORTED); - } - - U32 iw_str, ih_str, iw_off, ih_off; - U32 ow_str, oh_str, ow_off, oh_off; - ih_str = input->desc.stride[0]; - iw_str = input->desc.stride[1]; - ih_off = input->desc.offset[0]; - iw_off = input->desc.offset[1]; - oh_str = output->desc.stride[0]; - ow_str = output->desc.stride[1]; - oh_off = output->desc.offset[0]; - ow_off = output->desc.offset[1]; - cl_mem inbuf, outbuf; - inbuf = input->mem; - outbuf = output->mem; - - char modeName[16]; - switch(activationMode) { - case ACTIVATION_NULL: return SUCCESS; - case ACTIVATION_RELU: strcpy(modeName, "relu"); break; - case ACTIVATION_RELU6: strcpy(modeName, "relu6"); break; - case ACTIVATION_H_SIGMOID: strcpy(modeName, "hsigmoid"); break; - case ACTIVATION_H_SWISH: strcpy(modeName, "hswish"); break; - case ACTIVATION_GELU: strcpy(modeName, "gelu"); break; - case ACTIVATION_TANH: strcpy(modeName, "tanh"); break; - case ACTIVATION_SIGMOID: strcpy(modeName, "sigmoid"); break; - default: - return NOT_SUPPORTED; - } - char kernelName[128]; - U32 H = 1; - sprintf(kernelName, "activation_%s%d", modeName, H); - Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelName, &kernel)); - U32 cd4 = (oc + 3) / 4; - U32 ce4 = (oc & 3) == 0 ? 4 : (oc & 3); - CHECK_STATUS(gcl_set_kernelArgs(kernel, oh, ow, cd4, ce4, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, oh_off, ow_off, inbuf, outbuf)); - U32 gs[3] = {oh, ow, (oc + 3) / 4}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, input, "activation_input")); - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); - CHECK_STATUS(gcl_print_memory(handle, input, "activation_output")); -#endif - return SUCCESS; -} - - -EE activation_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode activationMode) { - CHECK_STATUS(activation_checkpara_mali_fp16(inputDesc)); - CHECK_STATUS(activation_core_mali_fp16(handle, inputDesc, input, outputDesc, output, activationMode)); - return SUCCESS; -} - diff --git a/tensor_computing/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.cpp deleted file mode 100644 index de4b7f0b..00000000 --- a/tensor_computing/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.cpp +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h" - -inline EE bilateral_slice_apply_checkpara_mali_fp16(TensorDesc inputDesc, - TensorDesc guideDesc, - TensorDesc gridDesc, - TensorDesc outputDesc) { - if (inputDesc.dt != guideDesc.dt || inputDesc.dt != DT_F16) return NOT_SUPPORTED; - if (inputDesc.dt != gridDesc.dt || inputDesc.dt != outputDesc.dt) return NOT_SUPPORTED; - return SUCCESS; -} - -inline EE bilateral_slice_apply_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc guideDesc, - const GCLMem_t guide, - TensorDesc gridDesc, - const GCLMem_t grid, - BilateralSliceApplyDesc bilateralSliceApplyDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output){ - UNUSED(guideDesc); - UNUSED(forwardRunInfo); - U32 iw, ih, ic, in; - U32 gw, gh, gc, gn; - U32 ow, oh, oc, on; - tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); - tensorSelectGet(gridDesc, NULL, NULL, &gn, &gc, &gh, &gw); - tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); - - U32 coe = bilateralSliceApplyDesc.coefficient_len; - BilateralSliceApplyMode mode = bilateralSliceApplyDesc.mode; -// bool has_offset = bilateralSliceApplyDesc.has_offset; - U32 dep = gc / coe; - U32 gcw = gc * gw; - U32 wh = iw * ih; - F32 scale_x = (F32)gw / iw; - F32 scale_y = (F32)gh / ih; - Mem inbuf, gridbuf, guidebuf, outbuf, gridTran; - inbuf = input->mem; - gridbuf = grid->mem; - outbuf = output->mem; - gridTran = tmpBuf->mem; - if(mode == BSliceApply_NULL) { - guidebuf = guide->mem; - } else { - guidebuf = inbuf; - } - - U32 gs0[3] = {gc / 4, gw, ih}; - U32 ls0[3] = {0, 0, 0}; - U32 dim0 = 3; - Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, "bilateral_slice_apply_pre", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, gh, gc, gcw, gs0[0], gs0[1], scale_y, gridbuf, gridTran)); - gcl_set_kernelVec(handle, kernel, dim0, gs0, ls0, "bilateral_slice_apply_pre"); - -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel_profiling(handle, kernel, dim0, gs0, ls0, "bilateral_slice_apply_pre")); - CHECK_STATUS(gcl_print_memory(handle, grid, "bilateral_slice_apply_grid")); -#endif - char kernelname[128]; - if(mode == BSliceApply_CONV) { - sprintf(kernelname, "bilateral_slice_apply_c12_conv"); - } else { - sprintf(kernelname, "bilateral_slice_apply_c12"); - } - U32 gs[2] 
= {ow, oh}; - U32 ls[2] = {0, 0}; - U32 dim = 2; - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, iw, wh, gc, gw, gh, gcw, dep, coe, gs[0], gs[1], scale_x, scale_y, guidebuf, gridTran, inbuf, outbuf)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); - -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel_profiling(handle, kernel, dim, gs, ls, kernelname)); - CHECK_STATUS(gcl_print_memory(handle, input, "bilateral_slice_apply_input")); - CHECK_STATUS(gcl_print_memory(handle, output, "bilateral_slice_apply_output")); - if(mode == BSliceApply_NULL)CHECK_STATUS(gcl_print_memory(handle, guide, "bilateral_slice_apply_guide")); -#endif - return SUCCESS; - -} - -EE bilateral_slice_apply_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc guideDesc, - const GCLMem_t guide, - TensorDesc gridDesc, - const GCLMem_t grid, - BilateralSliceApplyDesc bilateralSliceApplyDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output){ - UNUSED(tmpBytes); - CHECK_STATUS(bilateral_slice_apply_checkpara_mali_fp16(inputDesc, guideDesc, gridDesc, outputDesc)); - CHECK_STATUS(bilateral_slice_apply_core_mali_fp16(handle, inputDesc, input, guideDesc, guide, gridDesc, grid, bilateralSliceApplyDesc, forwardRunInfo, tmpBuf, outputDesc, output)); - return SUCCESS; -} - diff --git a/tensor_computing/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h b/tensor_computing/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h deleted file mode 100644 index 16297560..00000000 --- a/tensor_computing/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
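Note on the bilateral_slice_apply hunks above: the host code only stages the dispatch, but its grid bookkeeping is visible. A grid with gc channels stores coe coefficients per depth cell, so the depth resolution is dep = gc / coe (coe = 12 would be a 3x4 affine color transform, consistent with the "_c12" kernel names), and scale_x/scale_y map image pixels onto grid cells. Restated as a small helper with illustrative names:

    #include <cstdint>

    struct BilateralGridInfo {
        uint32_t dep;  // depth cells in the grid
        float scaleX;  // image x -> grid x
        float scaleY;  // image y -> grid y
    };

    inline BilateralGridInfo gridInfo(uint32_t gw, uint32_t gh, uint32_t gc,
                                      uint32_t coe, uint32_t iw, uint32_t ih) {
        BilateralGridInfo info;
        info.dep = gc / coe;       // coefficients per cell -> depth cells
        info.scaleX = (float)gw / iw;
        info.scaleY = (float)gh / ih;
        return info;
    }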
- -#ifndef _BILATERAL_SLICE_APPLY_MALI_FP16 -#define _BILATERAL_SLICE_APPLY_MALI_FP16 -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" - -EE bilateral_slice_apply_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc guideDesc, - const GCLMem_t guide, - TensorDesc gridDesc, - const GCLMem_t grid, - BilateralSliceApplyDesc bilateralSliceApplyDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output); -#endif - diff --git a/tensor_computing/src/gpu/mali/fp16/clip_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/clip_mali_fp16.cpp deleted file mode 100644 index 25f0a554..00000000 --- a/tensor_computing/src/gpu/mali/fp16/clip_mali_fp16.cpp +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/clip_mali_fp16.h" - -inline EE clip_checkpara_mali_fp16(TensorDesc inputDesc, - TensorDesc outputDesc) { - if(inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) return NOT_SUPPORTED; - return SUCCESS; -} - -inline EE clip_core_mali_fp16(GCLHandle_t handle, - float* min_value, - float* max_value, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - UNUSED(outputDesc); - U32 iw, ih, ic, in; - tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); - U32 iw_str, ih_str, iw_off, ih_off; - U32 ow_str, oh_str, ow_off, oh_off; - ih_str = input->desc.stride[0]; - iw_str = input->desc.stride[1]; - ih_off = input->desc.offset[0]; - iw_off = input->desc.offset[1]; - oh_str = output->desc.stride[0]; - ow_str = output->desc.stride[1]; - oh_off = output->desc.offset[0]; - ow_off = output->desc.offset[1]; - cl_mem inbuf, outbuf; - inbuf = input->mem; - outbuf = output->mem; - float min = *min_value; - float max = *max_value; - - U32 gs[3] = {ih, iw, (ic + 3) / 4}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, "clip", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, oh_off, ow_off, min, max, inbuf, outbuf)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, "clip"); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, input, "clip_input")); - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "clip")); - CHECK_STATUS(gcl_print_memory(handle, output, "clip_output")); -#endif - return SUCCESS; -} - - -EE clip_mali_fp16(GCLHandle_t handle, - void* min_value, - void* max_value, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - CHECK_STATUS(clip_checkpara_mali_fp16(inputDesc, outputDesc)); - CHECK_STATUS(clip_core_mali_fp16(handle, (float*)min_value, (float*)max_value, inputDesc, input, outputDesc, output)); - return SUCCESS; -} - diff --git a/tensor_computing/src/gpu/mali/fp16/concat_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/concat_mali_fp16.cpp deleted file mode 100644 index 2d0da399..00000000 --- a/tensor_computing/src/gpu/mali/fp16/concat_mali_fp16.cpp +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
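The clip file above shows the dispatch pattern shared by all of these sources: a 3-D global work size of {h, w, ceil(c/4)} over the NCWHC4-packed tensor, with ls = {0, 0, 0} meaning the OpenCL runtime chooses the work-group size (a NULL local_work_size). A minimal sketch of that geometry, with illustrative dimensions:

    #include <cstdio>

    int main() {
        // Tensors are stored NCWHC4 (4 channels packed per element), so one
        // work-item per (h, w, c/4) position covers the whole tensor.
        unsigned ih = 64, iw = 64, ic = 30;       // example tensor dims
        unsigned gs[3] = {ih, iw, (ic + 3) / 4};  // same formula as the clip kernel
        printf("global size = {%u, %u, %u}\n", gs[0], gs[1], gs[2]);
        return 0;
    }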
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/concat_mali_fp16.h" - -inline EE concat_checkpara_mali_fp16(std::vector<TensorDesc> inputDesc, - TensorDesc outputDesc) { - for(auto it : inputDesc) { - if(it.dt != outputDesc.dt) return NOT_SUPPORTED; - } - if(outputDesc.dt != DT_F16) return NOT_SUPPORTED; - return SUCCESS; -} - -inline EE concat_core_mali_fp16(GCLHandle_t handle, - std::vector<TensorDesc> inputDesc, - std::vector<void*> input, - TensorDesc outputDesc, - GCLMem_t output, - U32 concatDim) { - UNUSED(inputDesc); - U32 ow, oh; - tensorSelectGet(outputDesc, NULL, NULL, NULL, NULL, &oh, &ow); - U32 iw_str, ih_str, iw_off, ih_off; - U32 ow_str, oh_str, oc_str, ow_off, oh_off; - oh_str = output->desc.stride[0]; - ow_str = output->desc.stride[1]; - oc_str = output->desc.stride[2]; - oh_off = output->desc.offset[0]; - ow_off = output->desc.offset[1]; - U32 num = input.size(); - GCLMem_t inputMem[8]; - cl_mem inbuf[8]; - cl_mem outbuf = output->mem; - U32 c[7]; - U32 bn = (num + 7) / 8; - U32 en; - U32 nmax; - U32 cmax; - U32 out_size = 0; - inputMem[0] = (GCLMem_t)input[0]; - ih_str = inputMem[0]->desc.stride[0]; - iw_str = inputMem[0]->desc.stride[1]; - ih_off = inputMem[0]->desc.offset[0]; - iw_off = inputMem[0]->desc.offset[1]; - for(U32 i = 0; i < bn; i++) { - en = (i * 8 + 8 <= num) ? 8 : (num & 7); - cmax = 0; - nmax = en - 1; - for(U32 j = 0; j < en; ++j) { - inputMem[j] = (GCLMem_t)input[i * 8 + j]; - inbuf[j] = inputMem[j]->mem; - } - for(U32 j = 0; j < nmax; ++j) { - c[j] = inputMem[j]->desc.stride[2]; - cmax += c[j]; - } - char kernelName[128]; - sprintf(kernelName, "concat_%d%d", concatDim, en); - Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelName, &kernel)); - U32 gs[3] = {oh, ow, cmax + inputMem[nmax]->desc.stride[2]}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - switch(en) { - case 1: - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iw_str, ih_off, iw_off, - oh_str, ow_str, oh_off, ow_off, cmax, nmax, out_size, gs[0], gs[1], inbuf[0], outbuf));break; - case 2: - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iw_str, ih_off, iw_off, - oh_str, ow_str, oh_off, ow_off, cmax, nmax, out_size, gs[0], gs[1], inbuf[0], c[0], inbuf[1], outbuf));break; - case 3: - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iw_str, ih_off, iw_off, - oh_str, ow_str, oh_off, ow_off, cmax, nmax, out_size, gs[0], gs[1], inbuf[0], c[0], inbuf[1], c[1], inbuf[2], - outbuf));break; - case 4: - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iw_str, ih_off, iw_off, - oh_str, ow_str, oh_off, ow_off, cmax, nmax, out_size, gs[0], gs[1], inbuf[0], c[0], inbuf[1], c[1], inbuf[2], - c[2], inbuf[3], outbuf));break; - case 5: - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iw_str, ih_off, iw_off, - oh_str, ow_str, oh_off, ow_off, cmax, nmax, out_size, gs[0], gs[1], inbuf[0], c[0], inbuf[1], c[1], inbuf[2], - c[2], inbuf[3], c[3], inbuf[4], outbuf));break; - case 6: - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iw_str, ih_off, iw_off, - oh_str, ow_str, oh_off, ow_off, cmax, nmax, out_size, gs[0], gs[1], inbuf[0], c[0], inbuf[1], c[1], inbuf[2], - c[2], inbuf[3], c[3], inbuf[4], c[4], inbuf[5], outbuf));break; - case 7: - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iw_str, ih_off, iw_off, - oh_str, ow_str, oh_off, ow_off, cmax, nmax, out_size, gs[0], gs[1], inbuf[0], c[0], inbuf[1], c[1], inbuf[2], - c[2], inbuf[3], c[3], inbuf[4], c[4], inbuf[5], c[5], inbuf[6], outbuf));break; - case 8: -
CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iw_str, ih_off, iw_off, - oh_str, ow_str, oh_off, ow_off, cmax, nmax, out_size, gs[0], gs[1], inbuf[0], c[0], inbuf[1], c[1], inbuf[2], - c[2], inbuf[3], c[3], inbuf[4], c[4], inbuf[5], c[5], inbuf[6], c[6], inbuf[7], outbuf));break; - default: - return NOT_SUPPORTED; - } - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); - out_size += ow_str * oh_str * gs[2] * 4; -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); - for(U32 i = 0; i < en; ++i){ - std::cout << "concat_input " << i << " " << std::endl; - CHECK_STATUS(gcl_print_memory(handle, inputMem[i], "concat_input")); - } -#endif - } -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, output, "concat_output")); -#endif - return SUCCESS; -} - - -EE concat_mali_fp16(GCLHandle_t handle, - std::vector<TensorDesc> inputDesc, - std::vector<void*> input, - TensorDesc outputDesc, - GCLMem_t output, - U32 concatDim) { - CHECK_STATUS(concat_checkpara_mali_fp16(inputDesc, outputDesc)); - CHECK_STATUS(concat_core_mali_fp16(handle, inputDesc, input, outputDesc, output, concatDim)); - return SUCCESS; -} - diff --git a/tensor_computing/src/gpu/mali/fp16/convolution_direct_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/convolution_direct_mali_fp16.cpp deleted file mode 100644 index 48468c43..00000000 --- a/tensor_computing/src/gpu/mali/fp16/convolution_direct_mali_fp16.cpp +++ /dev/null @@ -1,333 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
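The concat implementation above batches at most 8 input tensors per kernel launch: bn is the number of launches, en the inputs handled by each, and out_size advances by the number of fp16 values written (oh_str * ow_str * 4 times the c/4 groups in gs[2]). A standalone sketch of just that grouping arithmetic, with an illustrative input count:

    #include <cstdio>

    int main() {
        unsigned num = 11;                 // total inputs to concatenate
        unsigned bn = (num + 7) / 8;       // number of kernel launches
        for (unsigned i = 0; i < bn; i++) {
            // 8 inputs per launch, except a final partial group of (num & 7)
            unsigned en = (i * 8 + 8 <= num) ? 8 : (num & 7);
            printf("launch %u handles %u inputs\n", i, en);
        }
        return 0;
    }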
- - -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" - -#include "gpu/mali/fp16/convolution_mali_fp16.h" -#include "gpu/mali/fp16/convolution_direct_mali_fp16.h" - -inline EE direct_core_nchw_to_ncwhc4_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode activationMode){ - UNUSED(biasDesc); - UNUSED(tmpBytes); - UNUSED(tmpBuf); - cl_mem inbuf, biasmem, outbuf, fltbuf; - inbuf = input->mem; - fltbuf = filter->mem; - biasmem = bias->mem; - outbuf = output->mem; - U32 iw, ih; - U32 fw, fh, fn, sw, pw, ph; - U32 ow, oh, oc, on; - sw = convDesc.stride_w; - ph = convDesc.padding_bottom; - pw = convDesc.padding_left; - tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); - tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, &fh, &fw); - tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); - U32 iw_str, ih_str, iwh_str, ic_str, iw_off, ih_off; - iw_str = input->desc.stride[0]; - ih_str = input->desc.stride[1]; - ic_str = input->desc.stride[2]; - iw_off = input->desc.offset[0] - pw; - ih_off = input->desc.offset[1] - ph; - iwh_str = iw_str * ih_str; - - U32 ow_str, oh_str, ow_off, oh_off; - oh_str = output->desc.stride[0]; - ow_str = output->desc.stride[1]; - oh_off = output->desc.offset[0]; - ow_off = output->desc.offset[1]; - - U32 item_w = forwardRunInfo->best_w[0]; - char kernelname[128]; - char modeName[16]; - U32 gs[3]; - U32 ls[3] = {0, 0, 0}; - U32 dim; - Kernel kernel; - switch(activationMode) { - case ACTIVATION_RELU: - strcpy(modeName, "relu_"); - break; - case ACTIVATION_NULL: - strcpy(modeName, ""); - break; - default: - return NOT_SUPPORTED; - } - sprintf(kernelname, "conv_direct_s%d_nchw_to_ncwhc4_%s%d%d",sw, modeName, fw, item_w); - gs[0] = (ow + item_w - 1) / item_w; - gs[1] = oh; - gs[2] = (oc + 3) / 4 * on; - dim = 3; - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, iwh_str, ic_str, iw_off, ih_off, oh_str, ow_str, oh_off, ow_off, ow, gs[0], gs[1], inbuf, fltbuf, biasmem, outbuf)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); - -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - CHECK_STATUS(gcl_print_memory(handle, input, "conv_direct_input")); - CHECK_STATUS(gcl_print_memory(handle, filter, "conv_direct_filter")); - CHECK_STATUS(gcl_print_memory(handle, bias, "conv_direct_bias")); - CHECK_STATUS(gcl_print_memory(handle, output, "conv_direct_output")); - handle->t_total += handle->t_execute; -#endif - return SUCCESS; -} - -inline EE direct_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode activationMode){ - UNUSED(biasDesc); - UNUSED(tmpBytes); - UNUSED(tmpBuf); - cl_mem inbuf, biasmem, outbuf, fltbuf; - inbuf = input->mem; - fltbuf = filter->mem; - biasmem = bias->mem; - outbuf = output->mem; - U32 iw, ih; - U32 fw, fh, fn, sw, pw, ph; - U32 ow, oh, oc, on; - sw = convDesc.stride_w; - ph = 
convDesc.padding_bottom; - pw = convDesc.padding_left; - tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); - tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, &fh, &fw); - tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); - - U32 iw_str, ih_str, ihw_str, ic_str, iw_off, ih_off; - ih_str = input->desc.stride[0]; - iw_str = input->desc.stride[1]; - ic_str = input->desc.stride[2]; - ih_off = input->desc.offset[0] - ph; - iw_off = input->desc.offset[1] - pw; - ihw_str = ih_str * iw_str; - - U32 ow_str, oh_str, ohw_str, ow_off, oh_off; - oh_str = output->desc.stride[0]; - ow_str = output->desc.stride[1]; - oh_off = output->desc.offset[0]; - ow_off = output->desc.offset[1]; - ohw_str = oh_str * ow_str; - - U32 item_w = forwardRunInfo->best_w[0]; - U32 item_c = forwardRunInfo->best_c[0]; - U32 item_k = forwardRunInfo->best_k[0]; - char kernelname[128]; - char modeName[16]; - U32 gs[3]; - U32 ls[3] = {0, 0, 0}; - U32 dim; - Kernel kernel; - switch(activationMode) { - case ACTIVATION_RELU: - strcpy(modeName, "relu_"); - break; - case ACTIVATION_NULL: - strcpy(modeName, ""); - break; - default: - return NOT_SUPPORTED; - } - if(item_k == 0) { - if((ih_str > 1 || iw_str > 1) && (item_c != 4)) CHECK_STATUS(NOT_SUPPORTED); - sprintf(kernelname, "conv_direct_spe_fwhs1_%s%d", modeName, item_c); - ic_str = filter->desc.stride[1]; - ow = fn; - gs[0] = fn; - gs[1] = 1; - gs[2] = 1; - dim = 1; - } else { - item_k = item_k >> 2; - sprintf(kernelname, "conv_direct_s%d_%s%d%d%d",sw, modeName, fw, item_w, item_k); - gs[0] = oh; - gs[1] = (ow + item_w - 1) / item_w; - gs[2] = (oc + 3) / 4 * on / item_k; - dim = 3; - } - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, oh_str, ohw_str, oh_off, ow_off, ow, gs[0], gs[1], inbuf, fltbuf, biasmem, outbuf)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); - -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - CHECK_STATUS(gcl_print_memory(handle, input, "conv_direct_input")); - CHECK_STATUS(gcl_print_memory(handle, filter, "conv_direct_filter")); - CHECK_STATUS(gcl_print_memory(handle, bias, "conv_direct_bias")); - CHECK_STATUS(gcl_print_memory(handle, output, "conv_direct_output")); - handle->t_total += handle->t_execute; -#endif - return SUCCESS; -} - - -EE convolution_direct_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes) -{ - U32 fw, fh, fc, fn; - tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); - U32 item_c = forwardRunInfo->best_c[0]; - U32 item_k = forwardRunInfo->best_k[0]; - U32 s0 = 0; - U32 s1 = 0; - U32 s2 = 0; - U32 num = 0; - U32 byteSize; - if(item_k == 0) { - s0 = fn; - s1 = (fc + item_c - 1) / item_c; - s2 = 1; - DataFormat df = DF_CHWNC4; - if(item_c == 8) df = DF_CHWNC8; - if(item_c == 16) df = DF_CHWNC16; - gclmemFilterDesc->memFormat = df; - num = s0 * s1 * s2 * item_c; - } else if(item_c == 4) { - s0 = fw * fh * (item_k >> 2); - s1 = (fc + item_c - 1) / item_c; - s2 = (fn + item_k - 1) / item_k; - gclmemFilterDesc->memFormat = DF_NCHWN4C4; - num = s0 * s1 * s2 * item_c * item_k / (item_k >> 2); - } else if(item_c == 1) { - s0 = fw * fh; - s1 = fc; - s2 = (fn + item_k - 1) / item_k; - gclmemFilterDesc->memFormat = DF_NCHWN4; - num = s0 * s1 * s2 * item_k; - } - byteSize = num * bytesOf(DT_F16); - gclmemFilterDesc->stride[0] = s0; - gclmemFilterDesc->stride[1] = s1; - 
gclmemFilterDesc->stride[2] = s2; - gclmemFilterDesc->offset[0] = 0; - gclmemFilterDesc->offset[1] = 0; - gclmemFilterDesc->offset[2] = 0; - gclmemFilterDesc->num = num; - gclmemFilterDesc->byteSize = byteSize; - gclmemFilterDesc->memType = GCL_MEM_BUF; - gclmemFilterDesc->flags = CL_MEM_READ_WRITE; - gclmemFilterDesc->host_ptr = NULL; - *bytes = 0; - return SUCCESS; -} - -EE convolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem) -{ - DataType fdt; - DataFormat fdf; - U32 fw, fh, fc, fn; - tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); - U32 fwh = fw * fh; - U32 item_c = forwardRunInfo->best_c[0]; - U32 item_k = forwardRunInfo->best_k[0]; - U32 nk = item_k; - if(item_k == 0) item_k = fn; - char kernelname[128]; - Kernel kernel; - sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d",item_c, nk); - CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn, filter->mem, fltmem->mem)); - U32 gs[3] = {fwh, (fc + item_c - 1) / item_c, (fn + item_k - 1) / item_k * item_k}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - *fltmemDesc = tensor4df(fdt, fdf, fn, fc, fh, fw); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, filter, "conv_direct_filter_org")); - CHECK_STATUS(gcl_print_memory(handle, fltmem, "conv_direct_filter_tran")); -#endif - return SUCCESS; -} - -EE convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes) -{ - UNUSED(inputDesc); - UNUSED(filterDesc); - UNUSED(outputDesc); - UNUSED(convDesc); - UNUSED(forwardRunInfo); - *bytes = 0; - return SUCCESS; -} - -EE convolution_direct_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode activationMode){ - U32 fw, fh, ih, iw; - tensorSelectGet(filterDesc, NULL, NULL, NULL, NULL, &fh, &fw); - tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); - if(inputDesc.df == DF_NCHW || (fw == 1 && fh == 1 && ih == 1 && iw == 1)) { - CHECK_STATUS(direct_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, convDesc, forwardRunInfo, - biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, activationMode)); - } else if(inputDesc.df == DF_NCHW_ORG_MALI){ - CHECK_STATUS(direct_core_nchw_to_ncwhc4_mali_fp16(handle, inputDesc, input, filterDesc, filter, convDesc, forwardRunInfo, - biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, activationMode)); - } else { - CHECK_STATUS(NOT_SUPPORTED); - } - return SUCCESS; -} diff --git a/tensor_computing/src/gpu/mali/fp16/convolution_direct_mali_fp16.h b/tensor_computing/src/gpu/mali/fp16/convolution_direct_mali_fp16.h deleted file mode 100644 index 45ef8df8..00000000 --- a/tensor_computing/src/gpu/mali/fp16/convolution_direct_mali_fp16.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#ifndef _H_CONVOLUTION_DIRECT_MALI_FP16 -#define _H_CONVOLUTION_DIRECT_MALI_FP16 -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" - -EE convolution_direct_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes); - -EE convolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem); - -EE convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes); - -EE convolution_direct_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode activationMode); - -#endif diff --git a/tensor_computing/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.cpp deleted file mode 100644 index 164f5bbc..00000000 --- a/tensor_computing/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.cpp +++ /dev/null @@ -1,185 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" - -#include "gpu/mali/fp16/convolution_mali_fp16.h" -#include "gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.h" - -inline EE direct_spe_ck_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode activationMode){ - UNUSED(inputDesc); - UNUSED(forwardRunInfo); - UNUSED(biasDesc); - UNUSED(bias); - UNUSED(tmpBytes); - UNUSED(tmpBuf); - UNUSED(activationMode); - - cl_mem inbuf, outbuf, fltbuf; - inbuf = input->mem; - fltbuf = filter->mem; - outbuf = output->mem; - U32 fn, fc, fw, sw; - U32 ow, oh, oc, on; - sw = convDesc.stride_w; - tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, NULL, &fw); - tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); - Kernel kernel; - U32 gs[3]; - U32 ls[3] = {0, 0, 0}; - U32 dim; - char kernelname[128]; - - if(fn == 1 && fc == 4 && fw == 1){//fc = orgfc + fn - U32 iw_str, ih_str; - iw_str = input->desc.stride[0]; - ih_str = input->desc.stride[1]; - U32 ow_str, oh_str, ow_off, oh_off; - ow_str = output->desc.stride[0]; - oh_str = output->desc.stride[1]; - ow_off = output->desc.offset[0]; - oh_off = output->desc.offset[1]; - if(output->desc.memFormat != DF_NCHW) return NOT_SUPPORTED; - U32 item_w = 2; - U32 item_h = 1; - U32 ew = ow % item_w; - gs[0] = (ow + item_w - 1) / item_w; - gs[1] = (oh + item_h - 1) / item_h; - dim = 2; - sprintf(kernelname, "conv_direct_s%d_spe_f1c3k1_%d", sw, ew); - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ow_str, ow_off, oh_off, ow >> 1, gs[0], gs[1], inbuf, fltbuf, outbuf));//c = 3 k = 1, bias val has been set in fltbuf - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); - } else { - return NOT_SUPPORTED; - } - -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - CHECK_STATUS(gcl_print_memory(handle, input, "conv_direct_spe_ck_input")); - CHECK_STATUS(gcl_print_memory(handle, filter, "conv_direct_spe_ck_filter")); - CHECK_STATUS(gcl_print_memory(handle, output, "conv_direct_spe_ck_output")); -#endif - return SUCCESS; -} - -EE convolution_direct_spe_ck_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes) -{ - UNUSED(forwardRunInfo); - U32 fw, fh, fc, fn; - tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); - U32 s0, s1, s2; - U32 num, byteSize; - if(fn == 1 && fc == 3 && fw == 1){ - s0 = fw * fh; - s1 = fc + fn;//set bias val in flt - s2 = fn; - gclmemFilterDesc->memFormat = DF_NCHW; - } else { - return NOT_SUPPORTED; - } - num = s0 * s1 * s2; - byteSize = num * bytesOf(DT_F16); - gclmemFilterDesc->stride[0] = s0; - gclmemFilterDesc->stride[1] = s1; - gclmemFilterDesc->stride[2] = s2; - gclmemFilterDesc->offset[0] = 0; - gclmemFilterDesc->offset[1] = 0; - gclmemFilterDesc->offset[2] = 0; - gclmemFilterDesc->num = num; - 
gclmemFilterDesc->byteSize = byteSize; - gclmemFilterDesc->memType = GCL_MEM_BUF; - gclmemFilterDesc->flags = CL_MEM_READ_ONLY; - gclmemFilterDesc->host_ptr = NULL; - *bytes = 0; - return SUCCESS; -} - -EE convolution_direct_spe_ck_transform_filter_mali_fp16(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem) -{ - UNUSED(forwardRunInfo); - DataType fdt; - DataFormat fdf; - U32 fw, fh, fc, fn; - tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); - *fltmemDesc = tensor4df(fdt, fdf, fn, fc + fn, fh, fw);//set bias val in flt - U32 size = tensorNumBytes(*fltmemDesc); - CHECK_STATUS(gcl_trans_memory(handle, filter, fltmem, &size, DEVICE_BUF_TO_BUF, CL_FALSE)); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, filter, "conv_direct_spe_ck_filter_org")); - CHECK_STATUS(gcl_print_memory(handle, fltmem, "conv_direct_spe_ck_filter_tran")); -#endif - return SUCCESS; -} - -EE convolution_direct_spe_ck_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes) -{ - UNUSED(inputDesc); - UNUSED(filterDesc); - UNUSED(outputDesc); - UNUSED(convDesc); - UNUSED(forwardRunInfo); - *bytes = 0; - return SUCCESS; -} - -EE convolution_direct_spe_ck_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode activationMode){ - CHECK_STATUS(direct_spe_ck_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, convDesc, forwardRunInfo, - biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, activationMode)); - - return SUCCESS; -} diff --git a/tensor_computing/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.h b/tensor_computing/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.h deleted file mode 100644 index cebbcf22..00000000 --- a/tensor_computing/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- -#ifndef _H_CONVOLUTION_DIRECT_SPE_CK_MALI_FP16 -#define _H_CONVOLUTION_DIRECT_SPE_CK_MALI_FP16 -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" - -EE convolution_direct_spe_ck_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes); - -EE convolution_direct_spe_ck_transform_filter_mali_fp16(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem); - -EE convolution_direct_spe_ck_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes); - -EE convolution_direct_spe_ck_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode activationMode); - -#endif diff --git a/tensor_computing/src/gpu/mali/fp16/convolution_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/convolution_mali_fp16.cpp deleted file mode 100644 index fef3b623..00000000 --- a/tensor_computing/src/gpu/mali/fp16/convolution_mali_fp16.cpp +++ /dev/null @@ -1,178 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
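The direct_spe_ck path above covers the special case fn == 1, fc == 3, fw == fh == 1, and folds the bias into the filter buffer, which is why its bytes function allocates s1 = fc + fn rows ("fc = orgfc + fn" in the comments). A sketch of that packed layout, with the sizes hard-coded for clarity:

    #include <cstdio>

    int main() {
        // 1x1 convolution, 3 input channels, 1 output channel: the bias value
        // is appended to the 3 weights, so the device buffer holds fc + fn = 4
        // fp16 values per filter.
        unsigned fw = 1, fh = 1, fc = 3, fn = 1;
        unsigned s0 = fw * fh, s1 = fc + fn, s2 = fn;  // strides as in the code above
        printf("packed filter elements: %u\n", s0 * s1 * s2);  // prints 4
        return 0;
    }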
- - -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" - -#include "gpu/mali/fp16/convolution_mali_fp16.h" -#include "gpu/mali/fp16/convolution_direct_mali_fp16.h" -#include "gpu/mali/fp16/convolution_wino_mali_fp16.h" -#include "gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.h" - -inline EE convolution_checkpara_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - const GCLMem_t bias, - TensorDesc outputDesc, - GCLMem_t output){ - if(nullptr == handle || nullptr == input || nullptr == filter || nullptr == output || nullptr == bias) return NULL_POINTER; - if (inputDesc.dt != outputDesc.dt || inputDesc.dt != filterDesc.dt || inputDesc.dt != DT_F16) return NOT_MATCH; - - U32 ic, fc, fn, fh, fw, oc; - CHECK_STATUS(tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, NULL, NULL)); - CHECK_STATUS(tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh , &fw)); - CHECK_STATUS(tensorSelectGet(outputDesc, NULL, NULL, NULL, &oc, NULL, NULL)); - - if(input->desc.memFormat == DF_NCWHC4){ - if(output->desc.memFormat != DF_NCWHC4) return NOT_MATCH; - } - - if(fw != 1 && fw != 3 && fw != 5) return NOT_SUPPORTED; - if(fn != oc) return NOT_MATCH; - if(input->desc.memFormat == DF_NCHWC3){ - if(ic != 3) return NOT_MATCH; - if(output->desc.memFormat == DF_NCHW && fn != 1) return NOT_MATCH; - return SUCCESS; - } - if(ic != fc) return NOT_MATCH; - return SUCCESS; -} - -EE convolution_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes){ - EE ret = SUCCESS; - ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); - switch (algorithm) { - case CONVOLUTION_ALGORITHM_DIRECT: - ret = convolution_direct_transform_filter_bytes_mali_fp16(filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); - break; - case CONVOLUTION_ALGORITHM_DIRECT_SPE_CK: - ret = convolution_direct_spe_ck_transform_filter_bytes_mali_fp16(filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); - break; - case CONVOLUTION_ALGORITHM_GEMM: - ret = NOT_SUPPORTED; - break; - case CONVOLUTION_ALGORITHM_WINOGRAD: - ret = convolution_wino_transform_filter_bytes_mali_fp16(filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE convolution_transform_filter_mali_fp16(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem, - GCLMem_t tmp){ - EE ret = SUCCESS; - ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); - switch (algorithm) { - case CONVOLUTION_ALGORITHM_DIRECT: - ret = convolution_direct_transform_filter_mali_fp16(handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); - break; - case CONVOLUTION_ALGORITHM_DIRECT_SPE_CK: - ret = convolution_direct_spe_ck_transform_filter_mali_fp16(handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); - break; - case CONVOLUTION_ALGORITHM_GEMM: - ret = NOT_SUPPORTED; - break; - case CONVOLUTION_ALGORITHM_WINOGRAD: - ret = convolution_wino_transform_filter_mali_fp16(handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem, tmp); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE convolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc 
outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes){ - EE ret = SUCCESS; - ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); - switch (algorithm) { - case CONVOLUTION_ALGORITHM_DIRECT: - ret = convolution_direct_infer_forward_tmp_bytes_mali_fp16(inputDesc, filterDesc, outputDesc, convDesc, forwardRunInfo, bytes); - break; - case CONVOLUTION_ALGORITHM_DIRECT_SPE_CK: - ret = convolution_direct_spe_ck_infer_forward_tmp_bytes_mali_fp16(inputDesc, filterDesc, outputDesc, convDesc, forwardRunInfo, bytes); - break; - case CONVOLUTION_ALGORITHM_GEMM: - ret = NOT_SUPPORTED; - break; - case CONVOLUTION_ALGORITHM_WINOGRAD: - ret = convolution_wino_infer_forward_tmp_bytes_mali_fp16(inputDesc, filterDesc, outputDesc, convDesc, forwardRunInfo, bytes); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE convolution_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode activationMode){ - CHECK_STATUS(convolution_checkpara_mali_fp16(handle, inputDesc, input, filterDesc, filter, bias, outputDesc, output)); - EE ret = SUCCESS; - ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); - switch (algorithm) { - case CONVOLUTION_ALGORITHM_DIRECT: - ret = convolution_direct_mali_fp16(handle, inputDesc, input, filterDesc, filter, convDesc, forwardRunInfo, - biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, activationMode); - break; - case CONVOLUTION_ALGORITHM_DIRECT_SPE_CK: - ret = convolution_direct_spe_ck_mali_fp16(handle, inputDesc, input, filterDesc, filter, convDesc, forwardRunInfo, - biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, activationMode); - break; - case CONVOLUTION_ALGORITHM_GEMM: - ret = NOT_SUPPORTED; - break; - case CONVOLUTION_ALGORITHM_WINOGRAD: - ret = convolution_wino_mali_fp16(handle, inputDesc, input, filterDesc, filter, convDesc, forwardRunInfo, - biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, activationMode); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/gpu/mali/fp16/convolution_mali_fp16.h b/tensor_computing/src/gpu/mali/fp16/convolution_mali_fp16.h deleted file mode 100644 index 319a9b6a..00000000 --- a/tensor_computing/src/gpu/mali/fp16/convolution_mali_fp16.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#ifndef _CONVOLUTION_MALI_FP16 -#define _CONVOLUTION_MALI_FP16 -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" - -EE convolution_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes); - -EE convolution_transform_filter_mali_fp16(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem, - GCLMem_t tmp); - -EE convolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes); - -EE convolution_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode activationMode); -#endif diff --git a/tensor_computing/src/gpu/mali/fp16/convolution_wino_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/convolution_wino_mali_fp16.cpp deleted file mode 100644 index a2966932..00000000 --- a/tensor_computing/src/gpu/mali/fp16/convolution_wino_mali_fp16.cpp +++ /dev/null @@ -1,391 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/convolution_mali_fp16.h" -#include "gpu/mali/fp16/convolution_wino_mali_fp16.h" - -#define calPicTranRDesc(wino_h, wino_w, wino_num, ic, fh, ph, dt, prh, prw, prc, prn, prh_off, prw_off, prhwc, prsize) {\ - U32 ext_h = (fh / 2 < ph) ? 
ph : fh / 2;\ - prh = wino_h * 4 + 2 * ext_h;\ - prw = ((wino_w + 3) / 4 * 4);\ - prc = ic;\ - prn = wino_num;\ - prhwc = prh * prw * prc;\ - prsize = prhwc * prn * bytesOf(dt);\ - prh_off = ph;\ - prw_off = 0;\ -} - -#define calPtrTranRLDesc(wino_h, wino_w, wino_num, ic, item_n, dt, prlh, prlw, prlc, prln, prlhw, prlhwc, prlsize) {\ - prlh = wino_h;\ - prlw = wino_w;\ - prlc = ic;\ - prln = wino_num * wino_num;\ - prlhw = (wino_h * wino_w + item_n - 1) / item_n * item_n;\ - prlhwc = prlhw * ic;\ - prlsize = prlhwc * prln * bytesOf(dt);\ -} - -#define calGemmOutDesc(wino_num, fn, phw, ic, item_m, dt, M, N, C, MC, NC, MN, gSize) {\ - M = (fn + item_m - 1) / item_m * item_m;\ - N = prlhw_str;\ - C = ic;\ - MC = M * C;\ - NC = N * C;\ - MN = M * N;\ - gSize = MN * wino_num * wino_num * bytesOf(dt);\ -} -inline EE wino_trans_pic(GCLHandle_t handle, - U32 ih_str, - U32 iw_str, - U32 ih_off, - U32 iw_off, - U32 ic_str, - U32 prh_str, - U32 prw_str, - U32 prc_str, - U32 prhwc_str, - U32 prh_off, - U32 prw_off, - U32 prlh_str, - U32 prlw_str, - U32 prlc_str, - U32 prlhw_str, - U32 prlhwc_str, - Mem pic, - Mem picTranR, - Mem picTranRL) - -{ - UNUSED(prw_str); - UNUSED(prw_off); - Kernel kernel; - char kernelname[128]; - U32 ih_str4 = ih_str * 4; - U32 ih_off4 = ih_off * 4; - U32 prh_off4 = prh_off * 4; - U32 gs[3] = {prh_str * 4, (prw_str / 4 + 3) / 4 * 4, ic_str}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - sprintf(kernelname, "conv_wino_trans_picbuf_right"); - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str4, iw_str, ih_off4, iw_off, prh_str, prw_str, prhwc_str, prh_off4, gs[0], gs[1], pic, picTranR)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - handle->t_total += handle->t_execute; -#endif - U32 item_h = 1; - if(prlh_str % 2 == 0) item_h = 2; - if(prlh_str % 3 == 0) item_h = 3; - if(prlh_str % 4 == 0) item_h = 4; - gs[0] = (prlh_str / item_h + 3) / 4 * 4; - gs[1] = prlw_str; - gs[2] = prlc_str * 6; - sprintf(kernelname, "conv_wino_trans_picbuf_left_%d", item_h); - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, prh_str, prw_str, prc_str, prlh_str, prlw_str, prlhw_str, prlhwc_str, gs[0], gs[1], picTranR, picTranRL)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - handle->t_total += handle->t_execute; -#endif - return SUCCESS; -} - -inline EE wino_gemm(GCLHandle_t handle, - U32 M, - U32 N, - U32 C, - U32 item_m, - U32 item_n, - U32 flttran_str, - U32 pictran_str, - U32 out_str, - U32 wino_num, - Mem flttran, - Mem pictran, - Mem out) -{ - Kernel kernel; - wino_num = wino_num * wino_num; - char kernelname[128]; - sprintf(kernelname, "conv_wino_gemm%d_tn_%d%d", wino_num, item_m, item_n); - U32 gs[2] = {(N + item_n - 1) / item_n, (M + item_m - 1) / item_m}; - U32 ls[2] = {0, 0}; - U32 dim = 2; - for(U32 i = 0; i < wino_num; i++) { - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, C, i * flttran_str, i * pictran_str, i * out_str, gs[0], gs[1], flttran, pictran, out)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - handle->t_total += handle->t_execute; -#endif - } - return SUCCESS; -} - 
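The macros and helpers above implement Winograd F(4x4, 3x3): each 4x4 output tile is produced from a 6x6 transformed input tile, so there are ceil(oh/4) x ceil(ow/4) tiles and 6*6 = 36 per-frequency GEMMs (winoTransNum in the filter-bytes function below). A minimal sketch of the tile arithmetic, with illustrative output sizes:

    #include <cstdio>

    int main() {
        unsigned oh = 57, ow = 111;         // example output height and width
        unsigned wino_h = (oh + 3) / 4;     // tiles along h, as in the code above
        unsigned wino_w = (ow + 3) / 4;     // tiles along w
        unsigned wino_num = 6;              // transformed tile edge length
        printf("%u x %u tiles, %u GEMMs\n", wino_h, wino_w, wino_num * wino_num);
        return 0;
    }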
-inline EE wino_trans_out(GCLHandle_t handle, - U32 wino_h, - U32 wino_w, - U32 pw_str, - U32 pwh_str, - U32 oh_str, - U32 ow_str, - U32 oh_off, - U32 ow_off, - U32 oh, - U32 ow, - U32 oc, - ActivationMode activationMode, - Mem bias, - Mem gemm_out, - Mem output) -{ - Kernel kernel; - char kernelname[128]; - char modeName[16]; - switch(activationMode) { - case ACTIVATION_RELU: - strcpy(modeName, "_relu"); - break; - case ACTIVATION_NULL: - strcpy(modeName, ""); - break; - default: - return NOT_SUPPORTED; - } - sprintf(kernelname, "conv_wino_trans_outbuf%s", modeName); - if((oh & 3) == 0 && (ow & 3) == 0) sprintf(kernelname, "conv_wino_trans_outbuf%s_align", modeName); - U32 gs[3] = {(wino_h + 3) / 4 * 4, (wino_w + 3) / 4 * 4, oc / 4}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, wino_h, wino_w, pw_str, pwh_str, oh_str, ow_str, oh_off, ow_off, oh, ow, bias, gemm_out, output)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - handle->t_total += handle->t_execute; -#endif - return SUCCESS; -} - -EE convolution_wino_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes) -{ - U32 item_k = forwardRunInfo->best_k[0]; - U32 fw, fh, fc, fn; - U32 winoTransNum = 36; - tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); - U32 s0 = (fn + item_k - 1) / item_k * item_k; - U32 s1 = fc; - U32 s2 = winoTransNum; - U32 num = s0 * s1 * s2; - U32 byteSize = num * bytesOf(DT_F16); - gclmemFilterDesc->stride[0] = s0; - gclmemFilterDesc->stride[1] = s1; - gclmemFilterDesc->stride[2] = s2; - gclmemFilterDesc->offset[0] = 0; - gclmemFilterDesc->offset[1] = 0; - gclmemFilterDesc->offset[2] = 0; - gclmemFilterDesc->num = num; - gclmemFilterDesc->byteSize = byteSize; - gclmemFilterDesc->memType = GCL_MEM_BUF; - gclmemFilterDesc->flags = CL_MEM_READ_WRITE; - gclmemFilterDesc->memFormat = DF_HWCN; - gclmemFilterDesc->host_ptr = NULL; - *bytes = fn * fc * fh * fw * bytesOf(DT_F16); - return SUCCESS; -} - -EE convolution_wino_transform_filter_mali_fp16(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem, - GCLMem_t tmp) -{ - UNUSED(forwardRunInfo); - DataType fdt; - DataFormat fdf; - U32 fw, fh, fc, fn; - tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); - U32 item_k = forwardRunInfo->best_k[0]; - U32 fn_align = (fn + item_k - 1) / item_k * item_k; - U32 fwhc = fw * fh * fc; - U32 fnc = fn_align * fc; - - char kernelname[128]; - Kernel kernel; - sprintf(kernelname, "conv_wino_rotate_fltbuf_%d",fw); - CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, fwhc, fnc, fn, filter->mem, tmp->mem)); - U32 gs[2] = {fwhc, fn_align}; - U32 ls[2] = {0, 0}; - U32 dim = 2; - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, filter, "conv_wino_filter_org")); - CHECK_STATUS(gcl_print_buffer(handle, tmp->mem, fn_align * fc * fw * fh, "conv_wino_filter_tmp")); -#endif - sprintf(kernelname, "conv_wino_trans_fltbuf_3x3"); - CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, fn_align, fc, fnc, tmp->mem, fltmem->mem)); - gs[0] = fn_align; - gs[1] = 
fc; - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, fltmem, "conv_wino_filter_tran")); -#endif - *fltmemDesc = tensor4df(fdt, fdf, fn, fc, fh, fw); - return SUCCESS; -} - -EE convolution_wino_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes) -{ - UNUSED(inputDesc); - UNUSED(outputDesc); - UNUSED(convDesc); - DataType fdt; - DataFormat fdf; - U32 fw, fh, fc, fn; - tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); - U32 item_k = forwardRunInfo->best_k[0]; - U32 fn_align = (fn + item_k - 1) / item_k * item_k; - U32 tempBufNum = fn_align * fc * fw * fh; - U32 fltTempBufSize = tempBufNum * bytesOf(fdt); - - DataType odt; - U32 ow, oh, oc, on; - tensorSelectGet(outputDesc, &odt, NULL, &on, &oc, &oh, &ow); - U32 ph = convDesc.padding_bottom; - U32 wino_num = 6; - U32 wino_h = (oh + 3) / 4; - U32 wino_w = (ow + 3) / 4; - U32 prh_str, prw_str, prc_str, prn_str, prh_off, prw_off, prhwc_str, prSize; - calPicTranRDesc(wino_h, wino_w, wino_num, fc, fh, ph, odt, prh_str, prw_str, prc_str, prn_str, prh_off, prw_off, prhwc_str, prSize); - - U32 item_n = forwardRunInfo->best_w[0]; - U32 item_m = forwardRunInfo->best_k[0]; - U32 prlh_str, prlw_str, prlc_str, prln_str, prlhw_str, prlhwc_str, prlSize; - calPtrTranRLDesc(wino_h, wino_w, wino_num, fc, item_n, odt, prlh_str, prlw_str, prlc_str, prln_str, prlhw_str, prlhwc_str, prlSize); - - U32 M, N, C, MC, NC, MN, gemmOutSize; - calGemmOutDesc(wino_num, fn, prlhw_str, fc, item_m, odt, M, N, C, MC, NC, MN, gemmOutSize); - - U32 tempBufSize = (prSize + 1023) / 1024 * 1024; - tempBufSize += (prlSize + 1023) / 1024 * 1024; - tempBufSize += gemmOutSize; - if(tempBufSize < fltTempBufSize) tempBufSize = fltTempBufSize; - *bytes = tempBufSize; - return SUCCESS; -} - -EE convolution_wino_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode activationMode){ - UNUSED(biasDesc); - UNUSED(tmpBytes); - U32 wino_num = 6; - DataType idt; - U32 iw, ih, ic; - U32 fw, fh, fc, fn, pw, ph; - U32 ow, oh, oc, on; - ph = convDesc.padding_bottom; - pw = convDesc.padding_left; - tensorSelectGet(inputDesc, &idt, NULL, NULL, &ic, &ih, &iw); - tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); - tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); - - U32 iw_str, ih_str, ih_str4, ic_str, iw_off, ih_off; - ih_str = input->desc.stride[0]; - iw_str = input->desc.stride[1]; - ic_str = input->desc.stride[2]; - ih_off = input->desc.offset[0];//input have not pad in h axis - iw_off = input->desc.offset[1] - pw; - ih_str4 = ih_str * 4; - - U32 ow_str, oh_str, ow_off, oh_off; - oh_str = output->desc.stride[0]; - ow_str = output->desc.stride[1]; - oh_off = output->desc.offset[0]; - ow_off = output->desc.offset[1]; - - Mem pic = input->mem; - Mem picTranR, picTranRL, gemmOut; - U32 wino_h = (oh + 3) / 4; - U32 wino_w = (ow + 3) / 4; - U32 offset = 0; - U32 prh_str, prw_str, prc_str, prn_str, prh_off, prw_off, prhwc_str, prSize; - calPicTranRDesc(wino_h, wino_w, wino_num, ic, fh, ph, idt, prh_str, prw_str, prc_str, prn_str, prh_off, prw_off, prhwc_str, prSize); - 
CHECK_STATUS(gcl_create_sub_buffer(prSize, &offset, tmpBuf, &picTranR)); - - U32 item_n = forwardRunInfo->best_w[0]; - U32 item_m = forwardRunInfo->best_k[0]; - U32 prlh_str, prlw_str, prlc_str, prln_str, prlhw_str, prlhwc_str, prlSize; - calPtrTranRLDesc(wino_h, wino_w, wino_num, ic, item_n, idt, prlh_str, prlw_str, prlc_str, prln_str, prlhw_str, prlhwc_str, prlSize); - CHECK_STATUS(gcl_create_sub_buffer(prlSize, &offset, tmpBuf, &picTranRL)); - - U32 M, N, C, MC, NC, MN, gemmOutSize; - calGemmOutDesc(wino_num, fn, prlhw_str, ic, item_m, idt, M, N, C, MC, NC, MN, gemmOutSize); - CHECK_STATUS(gcl_create_sub_buffer(gemmOutSize, &offset, tmpBuf, &gemmOut)); - - CHECK_STATUS(wino_trans_pic(handle, ih_str, iw_str, ih_off, iw_off, ic_str, prh_str, prw_str, prc_str, prhwc_str, prh_off, prw_off, - prlh_str, prlw_str, prlc_str, prlhw_str, prlhwc_str, pic, picTranR, picTranRL)); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, input, "conv_wino_input")); - CHECK_STATUS(gcl_print_buffer(handle, picTranR, prSize / bytesOf(idt), "conv_wino_pictran_right")); - CHECK_STATUS(gcl_print_buffer(handle, picTranRL, prlSize / bytesOf(idt), "conv_wino_pictran_left")); -#endif - - Mem fltTran = filter->mem; - CHECK_STATUS(wino_gemm(handle, M, N, C, item_m, item_n, MC, NC, MN, wino_num, fltTran, picTranRL, gemmOut)); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, filter, "conv_wino_flttran")); - CHECK_STATUS(gcl_print_buffer(handle, gemmOut, gemmOutSize / bytesOf(idt), "conv_wino_gemm_out")); -#endif - - Mem biasbuf = bias->mem; - Mem outbuf = output->mem; - CHECK_STATUS(wino_trans_out(handle, wino_h, wino_w, N, MN, oh_str, ow_str, oh_off, ow_off, oh, ow, oc, activationMode, biasbuf, gemmOut, outbuf)); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, output, "conv_wino_output")); -#endif - return SUCCESS; -} diff --git a/tensor_computing/src/gpu/mali/fp16/convolution_wino_mali_fp16.h b/tensor_computing/src/gpu/mali/fp16/convolution_wino_mali_fp16.h deleted file mode 100644 index 347f6932..00000000 --- a/tensor_computing/src/gpu/mali/fp16/convolution_wino_mali_fp16.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#ifndef _H_CONVOLUTION_WINO_MALI_FP16 -#define _H_CONVOLUTION_WINO_MALI_FP16 -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" - - -EE convolution_wino_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes); - -EE convolution_wino_transform_filter_mali_fp16(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem, - GCLMem_t tmp); - -EE convolution_wino_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes); - -EE convolution_wino_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode activationMode); -#endif diff --git a/tensor_computing/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.cpp deleted file mode 100644 index 995a8a07..00000000 --- a/tensor_computing/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.cpp +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
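// --- A sketch, under illustrative names, of the carve-out pattern in the deleted
// --- Winograd body above: one pre-sized tmp buffer is split into picTranR /
// --- picTranRL / gemmOut by advancing a running byte offset, which is what the
// --- gcl_create_sub_buffer(..., &offset, ...) calls do on the device buffer.
#include <cassert>
#include <cstdint>
#include <vector>

struct Sub { uint8_t* ptr; uint32_t bytes; };

static Sub carve(uint8_t* base, uint32_t capacity, uint32_t* offset, uint32_t bytes) {
    assert(*offset + bytes <= capacity);  // tmpBytes must be computed up front
    Sub s{base + *offset, bytes};
    *offset += bytes;                     // bump the shared offset
    return s;
}

int main() {
    std::vector<uint8_t> tmp(8192);       // stands in for tmpBuf
    uint32_t offset = 0;
    Sub picTranR  = carve(tmp.data(), 8192, &offset, 2048);
    Sub picTranRL = carve(tmp.data(), 8192, &offset, 2048);
    Sub gemmOut   = carve(tmp.data(), 8192, &offset, 4096);
    (void)picTranR; (void)picTranRL; (void)gemmOut;
    return 0;
}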
- - -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.h" - -inline EE depthwise_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode depthwiseActivationMode){ - UNUSED(inputDesc); - UNUSED(forwardRunInfo); - UNUSED(biasDesc); - UNUSED(tmpBytes); - UNUSED(tmpBuf); - - cl_mem inbuf, biasimg, outbuf, fltbuf; - inbuf = input->mem; - fltbuf = filter->mem; - biasimg = bias->mem; - outbuf = output->mem; - U32 fw, sw, pw, ph; - U32 ow, oh, oc, on; - sw = convDesc.stride_w; - ph = convDesc.padding_bottom; - pw = convDesc.padding_left; - tensorSelectGet(filterDesc, NULL, NULL, NULL, NULL, NULL, &fw); - tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); - - U32 iw_str, ih_str, ihw_str, ic_str, ih_off, iw_off; - ih_str = input->desc.stride[0]; - iw_str = input->desc.stride[1]; - ic_str = input->desc.stride[2]; - ih_off = input->desc.offset[0] - ph; - iw_off = input->desc.offset[1] - pw; - ihw_str = ih_str * iw_str; - - U32 ow_str, oh_str, ow_off, oh_off, ohw_str; - oh_str = output->desc.stride[0]; - ow_str = output->desc.stride[1]; - oh_off = output->desc.offset[0]; - ow_off = output->desc.offset[1]; - ohw_str = oh_str * ow_str; - - U32 item_w = forwardRunInfo->best_w[0]; - U32 gs[3] = {oh, (ow + item_w - 1) / item_w, (oc + 3) / 4 * on}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - char kernelname[128]; - Kernel kernel; - if(depthwiseActivationMode == ACTIVATION_NULL){ - sprintf(kernelname, "conv_depthwise_s%d_%d%d",sw, fw, item_w); - } else if (depthwiseActivationMode == ACTIVATION_RELU) { - sprintf(kernelname, "conv_depthwise_s%d_relu_%d%d",sw, fw, item_w); - } else { - CHECK_STATUS(NOT_SUPPORTED); - return NOT_SUPPORTED; - } - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, oh_str, ow_str, ohw_str, oh_off, ow_off, ow, gs[0], gs[1], inbuf, fltbuf, biasimg, outbuf)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); - -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - CHECK_STATUS(gcl_print_memory(handle, input, "conv_depthwise_input")); - CHECK_STATUS(gcl_print_memory(handle, filter, "conv_depthwise_filter")); - CHECK_STATUS(gcl_print_memory(handle, bias, "conv_depthwise_bias")); - CHECK_STATUS(gcl_print_memory(handle, output, "conv_depthwise_output")); - handle->t_total += handle->t_execute; -#endif - return SUCCESS; -} - -EE depthwise_convolution_direct_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes) -{ - UNUSED(forwardRunInfo); - U32 fw, fh, fn; - tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, &fh, &fw); - U32 item_k = forwardRunInfo->best_k[0]; - U32 s0, s1, s2; - U32 num, byteSize; - s0 = fw * fh; - s1 = (fn + item_k - 1) / item_k; - s2 = 1; - num = s0 * s1 * s2 * item_k; - byteSize = num * bytesOf(DT_F16); - gclmemFilterDesc->stride[0] = s0; - gclmemFilterDesc->stride[1] = s1; - gclmemFilterDesc->stride[2] = s2; - gclmemFilterDesc->offset[0] = 0; - gclmemFilterDesc->offset[1] = 0; - gclmemFilterDesc->offset[2] = 0; - 
gclmemFilterDesc->num = num; - gclmemFilterDesc->byteSize = byteSize; - gclmemFilterDesc->memType = GCL_MEM_BUF; - gclmemFilterDesc->memFormat = DF_NHWCN4; - gclmemFilterDesc->flags = CL_MEM_READ_WRITE; - gclmemFilterDesc->host_ptr = NULL; - - *bytes = 0; - return SUCCESS; -} - -EE depthwise_convolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem) -{ - UNUSED(forwardRunInfo); - DataType fdt; - DataFormat fdf; - U32 fw, fh, fn; - tensorSelectGet(filterDesc, &fdt, &fdf, &fn, NULL, &fh, &fw); - U32 fwh = fw * fh; - U32 item_k = forwardRunInfo->best_k[0]; - char kernelname[128]; - Kernel kernel; - sprintf(kernelname, "conv_depthwise_trans_fltbuf_%d", item_k); - CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fn, filter->mem, fltmem->mem)); - U32 gs[3] = {fwh, (fn + item_k - 1) / item_k}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 2; - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - *fltmemDesc = tensor4df(fdt, fdf, fn, 1, fh, fw); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, filter, "conv_depthwise_filter_org")); - CHECK_STATUS(gcl_print_memory(handle, fltmem, "conv_depthwise_filter_tran")); -#endif - - return SUCCESS; -} - -EE depthwise_convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes) -{ - UNUSED(inputDesc); - UNUSED(filterDesc); - UNUSED(outputDesc); - UNUSED(convDesc); - UNUSED(forwardRunInfo); - *bytes = 0; - return SUCCESS; -} - -EE depthwise_convolution_direct_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode depthwiseActivationMode){ - CHECK_STATUS(depthwise_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, convDesc, forwardRunInfo, - biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, depthwiseActivationMode)); - - return SUCCESS; -} diff --git a/tensor_computing/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.h b/tensor_computing/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.h deleted file mode 100644 index 7f7d7146..00000000 --- a/tensor_computing/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#ifndef _DEPTHWISE_CONVOLUTION_DIRECT_MALI_FP16 -#define _DEPTHWISE_CONVOLUTION_DIRECT_MALI_FP16 -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" - -EE depthwise_convolution_direct_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes); - -EE depthwise_convolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem); - -EE depthwise_convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes) ; - -EE depthwise_convolution_direct_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode depthwiseActivationMode); -#endif - diff --git a/tensor_computing/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.cpp deleted file mode 100644 index 7953ede2..00000000 --- a/tensor_computing/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.cpp +++ /dev/null @@ -1,162 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/depthwise_convolution_mali_fp16.h" -#include "gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.h" -#include "gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.h" -#include "gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.h" - -inline EE depthwise_convolution_checkpara_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - const GCLMem_t bias, - TensorDesc outputDesc, - GCLMem_t output){ - if(nullptr == handle || nullptr == input || nullptr == filter || nullptr == output || nullptr == bias) return NULL_POINTER; - if (inputDesc.dt != outputDesc.dt || inputDesc.dt != filterDesc.dt || inputDesc.dt != DT_F16) return NOT_MATCH; - - DataFormat fdf; - U32 ic, fc, fn, fh, fw, oc; - CHECK_STATUS(tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, NULL, NULL)); - CHECK_STATUS(tensorSelectGet(filterDesc, NULL, &fdf, &fn, &fc, &fh , &fw)); - CHECK_STATUS(tensorSelectGet(outputDesc, NULL, NULL, NULL, &oc, NULL, NULL)); - if(input->desc.memFormat == DF_NCWHC4){ - if(filter->desc.memFormat != DF_NHWCN4) return NOT_MATCH; - if(output->desc.memFormat != DF_NCWHC4) return NOT_MATCH; - } - if(fw != 3 && fw != 5) return NOT_MATCH; - if(fdf == DF_NCHW && ic != fn) return NOT_MATCH; - if(fn != oc) return NOT_MATCH; - return SUCCESS; -} - - -EE depthwise_convolution_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes){ - EE ret = SUCCESS; - DepthwiseConvolutionForwardAlgorithm algorithm = (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); - switch (algorithm) { - case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: - ret = depthwise_convolution_direct_transform_filter_bytes_mali_fp16(filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: - ret = depthwise_pointwise_convolution_direct_transform_filter_bytes_mali_fp16(filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM: - ret = depthwise_pointwise_convolution_gemm_transform_filter_bytes_mali_fp16(filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE depthwise_convolution_transform_filter_mali_fp16(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem){ - EE ret = SUCCESS; - DepthwiseConvolutionForwardAlgorithm algorithm = (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); - switch (algorithm) { - case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: - ret = depthwise_convolution_direct_transform_filter_mali_fp16(handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: - ret = depthwise_pointwise_convolution_direct_transform_filter_mali_fp16(handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM: - ret = depthwise_pointwise_convolution_gemm_transform_filter_mali_fp16(handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE depthwise_convolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc 
inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes){ - EE ret = SUCCESS; - DepthwiseConvolutionForwardAlgorithm algorithm = (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); - switch (algorithm) { - case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: - ret = depthwise_convolution_direct_infer_forward_tmp_bytes_mali_fp16(inputDesc, filterDesc, outputDesc, convDesc, forwardRunInfo, bytes); - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: - ret = depthwise_pointwise_convolution_direct_infer_forward_tmp_bytes_mali_fp16(inputDesc, filterDesc, outputDesc, convDesc, forwardRunInfo, bytes); - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM: - ret = depthwise_pointwise_convolution_gemm_infer_forward_tmp_bytes_mali_fp16(inputDesc, filterDesc, outputDesc, convDesc, forwardRunInfo, bytes); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE depthwise_convolution_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode depthwiseActivationMode, - ActivationMode pointwiseActivationMode){ - UNUSED(pointwiseActivationMode); - EE ret = SUCCESS; - CHECK_STATUS(depthwise_convolution_checkpara_mali_fp16(handle, inputDesc, input, filterDesc, filter, bias, outputDesc, output)); - DepthwiseConvolutionForwardAlgorithm algorithm = (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); - switch (algorithm) { - case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: - ret = depthwise_convolution_direct_mali_fp16(handle, inputDesc, input, filterDesc, filter, convDesc, forwardRunInfo, - biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, depthwiseActivationMode); - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: - ret = depthwise_pointwise_convolution_direct_mali_fp16(handle, inputDesc, input, filterDesc, filter, convDesc, forwardRunInfo, - biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, depthwiseActivationMode, pointwiseActivationMode); - break; - case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM: - ret = depthwise_pointwise_convolution_gemm_mali_fp16(handle, inputDesc, input, filterDesc, filter, convDesc, forwardRunInfo, - biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, depthwiseActivationMode, pointwiseActivationMode); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} diff --git a/tensor_computing/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.h b/tensor_computing/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.h deleted file mode 100644 index 21b635c5..00000000 --- a/tensor_computing/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
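// --- The dispatch shape used by the four depthwise wrappers above, reduced to
// --- a standalone sketch: a tuner-chosen algorithm enum is switched on, and
// --- every unknown value degrades to NOT_SUPPORTED instead of asserting.
// --- The stand-in functions below are illustrative, not the repo entry points.
#include <cstdio>

enum EE { SUCCESS = 0, NOT_SUPPORTED = 1 };
enum Algorithm { DW_DIRECT, DW_PW_DIRECT, DW_PW_GEMM };

static EE run_direct()    { return SUCCESS; }  // stand-ins for the per-algorithm
static EE run_pw_direct() { return SUCCESS; }  // implementations selected at
static EE run_pw_gemm()   { return SUCCESS; }  // tuning time

static EE dispatch(Algorithm algorithm) {
    switch (algorithm) {
        case DW_DIRECT:    return run_direct();
        case DW_PW_DIRECT: return run_pw_direct();
        case DW_PW_GEMM:   return run_pw_gemm();
        default:           return NOT_SUPPORTED;  // same fallback as the diff
    }
}

int main() {
    printf("status=%d\n", dispatch(DW_PW_GEMM));
    return 0;
}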
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#ifndef _DEPTHWISE_CONVOLUTION_MALI_FP16 -#define _DEPTHWISE_CONVOLUTION_MALI_FP16 -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" - -EE depthwise_convolution_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes); - -EE depthwise_convolution_transform_filter_mali_fp16(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem); - -EE depthwise_convolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes); - -EE depthwise_convolution_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode depthwiseActivationMode, - ActivationMode pointwiseActivationMode); -#endif diff --git a/tensor_computing/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.cpp deleted file mode 100644 index d2081b6b..00000000 --- a/tensor_computing/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.cpp +++ /dev/null @@ -1,278 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.h" - -inline EE depthwise_pointwise_direct_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode depthwiseActivationMode, - ActivationMode pointwiseActivationMode) { - UNUSED(inputDesc); - UNUSED(forwardRunInfo); - UNUSED(biasDesc); - UNUSED(tmpBytes); - UNUSED(tmpBuf); - - cl_mem inbuf, biasimg0, biasimg1, outbuf, fltbuf0, fltbuf1, tmp; - inbuf = input->mem; - fltbuf0 = filter[0].mem; - fltbuf1 = filter[1].mem; - biasimg0 = bias[0].mem; - biasimg1 = bias[1].mem; - outbuf = output->mem; - tmp = tmpBuf->mem; - U32 fw, sw, pw, ph, fc; - U32 ow, oh, oc, on; - sw = convDesc.stride_w; - ph = convDesc.padding_bottom; - pw = convDesc.padding_left; - tensorSelectGet(filterDesc, NULL, NULL, NULL, &fc, NULL, &fw); - tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); - - U32 iw_str, ih_str, ihw_str, ic_str, ih_off, iw_off; - ih_str = input->desc.stride[0]; - iw_str = input->desc.stride[1]; - ic_str = input->desc.stride[2]; - ih_off = input->desc.offset[0] - ph; - iw_off = input->desc.offset[1] - pw; - ihw_str = ih_str * iw_str; - - U32 th_str, tw_str, th_off, tw_off, thw_str; - U32 w_align, item_wd, item_wp; - item_wd = forwardRunInfo->best_w[0]; - item_wp = forwardRunInfo->best_w[1]; - w_align = (ow + item_wp - 1) / item_wp * item_wp; - th_str = oh; - tw_str = w_align; - th_off = 0; - tw_off = 0; - thw_str = th_str * tw_str; - - U32 ow_str, oh_str, ow_off, oh_off, ohw_str; - oh_str = output->desc.stride[0]; - ow_str = output->desc.stride[1]; - oh_off = output->desc.offset[0]; - ow_off = output->desc.offset[1]; - ohw_str = oh_str * ow_str; - - U32 gs[3] = {oh, (ow + item_wd - 1) / item_wd, (fc + 3) / 4}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - char kernelname[128]; - Kernel kernel; - if(depthwiseActivationMode == ACTIVATION_NULL) { - sprintf(kernelname, "conv_depthwise_s%d_%d%d",sw, fw, item_wd); - } else if (depthwiseActivationMode == ACTIVATION_RELU) { - sprintf(kernelname, "conv_depthwise_s%d_relu_%d%d",sw, fw, item_wd); - } else { - CHECK_STATUS(NOT_SUPPORTED); - return NOT_SUPPORTED; - } - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, th_str, tw_str, thw_str, th_off, tw_off, ow, gs[0], gs[1], inbuf, fltbuf0, biasimg0, tmp)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); - -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - CHECK_STATUS(gcl_print_memory(handle, input, "conv_depthwise_input")); - CHECK_STATUS(gcl_print_memory(handle, &filter[0], "conv_depthwise_filter")); - CHECK_STATUS(gcl_print_memory(handle, &bias[0], "conv_depthwise_bias")); - CHECK_STATUS(gcl_print_buffer(handle, tmp, thw_str * fc, "conv_depthwise_output_tmp")); - handle->t_total += handle->t_execute; -#endif - fw = 1; 
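// --- A sketch of why the fused depthwise+pointwise path above pads its scratch
// --- tensor: the intermediate width is rounded up to a multiple of the
// --- pointwise tile item_wp (w_align above) so the second kernel can read full
// --- vectors without edge checks. Shapes below are illustrative.
#include <cstdio>

int main() {
    unsigned oh = 28, ow = 30, fc = 64;   // depthwise output shape (example)
    unsigned item_wp = 8;                 // pointwise tile width (tuner pick)
    unsigned w_align = (ow + item_wp - 1) / item_wp * item_wp;  // 30 -> 32
    unsigned c4 = (fc + 3) / 4 * 4;       // channels padded for NCWHC4 vectors
    unsigned elems = oh * w_align * c4;   // fp16 elements in the tmp tensor
    printf("tmp: %u x %u x %u = %u elements (%u bytes)\n",
           oh, w_align, c4, elems, elems * 2);
    return 0;
}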
- sw = 1; - U32 item_kp = forwardRunInfo->best_k[1]; - item_kp = item_kp >> 2; - if(pointwiseActivationMode == ACTIVATION_NULL) { - sprintf(kernelname, "conv_direct_s%d_%d%d%d",sw, fw, item_wp, item_kp); - } else if (pointwiseActivationMode == ACTIVATION_RELU) { - sprintf(kernelname, "conv_direct_s%d_relu_%d%d%d",sw, fw, item_wp, item_kp); - } else { - CHECK_STATUS(NOT_SUPPORTED); - return NOT_SUPPORTED; - } - - U32 gsp[3] = {oh, (ow + item_wp - 1) / item_wp, (oc + 3) / 4 * on / item_kp}; - U32 lsp[3] = {0, 0, 0}; - U32 dimp = 3; - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, th_str, thw_str, ic_str, th_off, tw_off, oh_str, ohw_str, oh_off, ow_off, ow, gsp[0], gsp[1], tmp, fltbuf1, biasimg1, outbuf)); - gcl_set_kernelVec(handle, kernel, dimp, gsp, lsp, kernelname); - -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dimp, gsp, lsp, kernelname)); - CHECK_STATUS(gcl_print_memory(handle, &filter[1], "conv_direct_filter")); - CHECK_STATUS(gcl_print_memory(handle, &bias[1], "conv_direct_bias")); - CHECK_STATUS(gcl_print_memory(handle, output, "conv_direct_output")); - handle->t_total += handle->t_execute; -#endif - return SUCCESS; -} - -EE depthwise_pointwise_convolution_direct_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes) -{ - UNUSED(forwardRunInfo); - U32 fw, fh, fc, fn; - tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); - U32 item_kd = forwardRunInfo->best_k[0]; - U32 item_kp = forwardRunInfo->best_k[1]; - U32 item_c = forwardRunInfo->best_c[1]; - U32 s0, s1, s2; - U32 num, byteSize; - s0 = fw * fh; - s1 = (fc + item_kd - 1) / item_kd; - s2 = 1; - num = s0 * s1 * s2 * item_kd; - byteSize = num * bytesOf(DT_F16); - gclmemFilterDesc[0].stride[0] = s0; - gclmemFilterDesc[0].stride[1] = s1; - gclmemFilterDesc[0].stride[2] = s2; - gclmemFilterDesc[0].offset[0] = 0; - gclmemFilterDesc[0].offset[1] = 0; - gclmemFilterDesc[0].offset[2] = 0; - gclmemFilterDesc[0].num = num; - gclmemFilterDesc[0].byteSize = byteSize; - gclmemFilterDesc[0].memType = GCL_MEM_BUF; - gclmemFilterDesc[0].memFormat = DF_NHWCN4; - gclmemFilterDesc[0].flags = CL_MEM_READ_WRITE; - gclmemFilterDesc[0].host_ptr = NULL; - - s0 = item_kp >> 2; - s1 = (fc + item_c - 1) / item_c; - s2 = (fn + item_kp - 1) / item_kp; - num = s0 * s1 * s2 * item_c * item_kp / (item_kp >> 2); - byteSize = num * bytesOf(DT_F16); - gclmemFilterDesc[1].stride[0] = s0; - gclmemFilterDesc[1].stride[1] = s1; - gclmemFilterDesc[1].stride[2] = s2; - gclmemFilterDesc[1].offset[0] = 0; - gclmemFilterDesc[1].offset[1] = 0; - gclmemFilterDesc[1].offset[2] = 0; - gclmemFilterDesc[1].num = num; - gclmemFilterDesc[1].byteSize = byteSize; - gclmemFilterDesc[1].memType = GCL_MEM_BUF; - gclmemFilterDesc[1].memFormat = DF_NCHWN4C4; - gclmemFilterDesc[1].flags = CL_MEM_READ_WRITE; - gclmemFilterDesc[1].host_ptr = NULL; - *bytes = 0; - return SUCCESS; -} - -EE depthwise_pointwise_convolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem) -{ - UNUSED(forwardRunInfo); - DataType fdt; - DataFormat fdf; - U32 fw, fh, fc, fn; - tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); - U32 fwh = fw * fh; - U32 item_kd = forwardRunInfo->best_k[0]; - U32 item_kp = forwardRunInfo->best_k[1]; - U32 item_c = forwardRunInfo->best_c[1]; - char kernelname[128]; - 
Kernel kernel; - sprintf(kernelname, "conv_depthwise_trans_fltbuf_%d", item_kd); - CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, filter[0].mem, fltmem[0].mem)); - U32 gs[3] = {fwh, (fc + item_kd - 1) / item_kd}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 2; - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - *fltmemDesc = tensor4df(fdt, fdf, fn, fc, fh, fw); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, &filter[0], "conv_depthwise_filter_org")); - CHECK_STATUS(gcl_print_memory(handle, &fltmem[0], "conv_depthwise_filter_tran")); -#endif - - fwh = 1; - sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d",item_c, item_kp); - CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn, filter[1].mem, fltmem[1].mem)); - U32 gsc[3] = {fwh, (fc + item_c - 1) / item_c, (fn + item_kp - 1) / item_kp * item_kp}; - U32 lsc[3] = {0, 0, 0}; - U32 dimc = 3; - CHECK_STATUS(gcl_run_kernel(handle, kernel, dimc, gsc, lsc, kernelname)); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, &filter[1], "conv_direct_filter_org")); - CHECK_STATUS(gcl_print_memory(handle, &fltmem[1], "conv_direct_filter_tran")); -#endif - return SUCCESS; -} - -EE depthwise_pointwise_convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes) -{ - DataType odt; - U32 oh, ow, fc; - tensorSelectGet(filterDesc, NULL, NULL, NULL, &fc, NULL, NULL); - tensorSelectGet(outputDesc, &odt, NULL, NULL, NULL, &oh, &ow); - UNUSED(inputDesc); - UNUSED(convDesc); - - U32 w_align; - U32 item_w = forwardRunInfo->best_w[1]; - w_align = (ow + item_w - 1) / item_w * item_w; - *bytes = oh * w_align * ((fc + 3) / 4) * 4 * bytesOf(odt); - return SUCCESS; -} - -EE depthwise_pointwise_convolution_direct_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode depthwiseActivationMode, - ActivationMode pointwiseActivationMode) { - CHECK_STATUS(depthwise_pointwise_direct_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, convDesc, forwardRunInfo, - biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, depthwiseActivationMode, pointwiseActivationMode)); - - return SUCCESS; -} diff --git a/tensor_computing/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.h b/tensor_computing/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.h deleted file mode 100644 index 449d2d26..00000000 --- a/tensor_computing/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.h +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT_MALI_FP16 -#define _DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT_MALI_FP16 -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" - -EE depthwise_pointwise_convolution_direct_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes); - -EE depthwise_pointwise_convolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem); - -EE depthwise_pointwise_convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes); - -EE depthwise_pointwise_convolution_direct_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode depthwiseActivationMode, - ActivationMode pointwiseActivationMode); -#endif diff --git a/tensor_computing/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.cpp deleted file mode 100644 index 4e52dbb1..00000000 --- a/tensor_computing/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.cpp +++ /dev/null @@ -1,279 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.h" - - -inline EE depthwise_pointwise_gemm_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode depthwiseActivationMode, - ActivationMode pointwiseActivationMode) { - UNUSED(inputDesc); - UNUSED(forwardRunInfo); - UNUSED(biasDesc); - UNUSED(tmpBytes); - UNUSED(tmpBuf); - - cl_mem inbuf, biasimg, biasbuf, outbuf, fltbuf0, fltbuf1, tmp; - inbuf = input->mem; - fltbuf0 = filter[0].mem; - fltbuf1 = filter[1].mem; - biasimg = bias[0].mem; - biasbuf = bias[1].mem; - outbuf = output->mem; - tmp = tmpBuf->mem; - U32 fw, sw, pw, ph, fc; - U32 ow, oh, oc, on; - sw = convDesc.stride_w; - ph = convDesc.padding_bottom; - pw = convDesc.padding_left; - tensorSelectGet(filterDesc, NULL, NULL, NULL, &fc, NULL, &fw); - tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); - - U32 iw_str, ih_str, ihw_str, ic_str, ih_off, iw_off; - ih_str = input->desc.stride[0]; - iw_str = input->desc.stride[1]; - ic_str = input->desc.stride[2]; - ih_off = input->desc.offset[0] - ph; - iw_off = input->desc.offset[1] - pw; - ihw_str = ih_str * iw_str; - - U32 th_str, tw_str, th_off, tw_off, thw_str; - U32 item_wd, item_whp, item_kp; - item_wd = forwardRunInfo->best_w[0]; - item_whp = forwardRunInfo->best_w[1]; - item_kp = forwardRunInfo->best_k[1]; - th_str = oh; - tw_str = ow; - th_off = 0; - tw_off = 0; - thw_str = ALIGN(th_str * tw_str, item_whp); - - U32 ow_str, oh_str, ow_off, oh_off, ohw_str; - oh_str = output->desc.stride[0]; - ow_str = output->desc.stride[1]; - oh_off = output->desc.offset[0]; - ow_off = output->desc.offset[1]; - ohw_str = oh_str * ow_str; - - U32 gs[3] = {oh, ALIGN(ow, item_wd) / item_wd, ALIGN(fc, 4) / 4}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - char kernelname[128]; - Kernel kernel; - if(depthwiseActivationMode == ACTIVATION_NULL) { - sprintf(kernelname, "conv_depthwise_s%d_ncwh_%d%d",sw, fw, item_wd); - } else if (depthwiseActivationMode == ACTIVATION_RELU) { - sprintf(kernelname, "conv_depthwise_s%d_relu_ncwh_%d%d",sw, fw, item_wd); - } else { - CHECK_STATUS(NOT_SUPPORTED); - return NOT_SUPPORTED; - } - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, th_str, tw_str, thw_str, th_off, tw_off, ow, gs[0], gs[1], inbuf, fltbuf0, biasimg, tmp)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); - -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - CHECK_STATUS(gcl_print_memory(handle, input, "conv_depthwise_input")); - CHECK_STATUS(gcl_print_memory(handle, &filter[0], "conv_depthwise_filter")); - 
CHECK_STATUS(gcl_print_memory(handle, &bias[0], "conv_depthwise_bias")); - CHECK_STATUS(gcl_print_buffer(handle, tmp, thw_str * fc, "conv_depthwise_output_tmp")); - handle->t_total += handle->t_execute; -#endif - if(pointwiseActivationMode == ACTIVATION_NULL) { - sprintf(kernelname, "gemm_tn_ncwhc4_%d%d", item_kp, item_whp); - } else if (pointwiseActivationMode == ACTIVATION_RELU) { - sprintf(kernelname, "gemm_tn_relu_ncwhc4_%d%d", item_kp, item_whp); - } else { - CHECK_STATUS(NOT_SUPPORTED); - return NOT_SUPPORTED; - } - - U32 M, N, K; - M = ALIGN(oc, item_kp);; - N = thw_str; - K = fc; - U32 gsp[3] = {N / item_whp, M / item_kp}; - U32 lsp[3] = {0, 0}; - U32 dimp = 2; - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, K, oh, ow, oc, oh_str, ow_str, ohw_str, oh_off, ow_off, gsp[0], gsp[1], fltbuf1, tmp, biasbuf, outbuf)); - gcl_set_kernelVec(handle, kernel, dimp, gsp, lsp, kernelname); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dimp, gsp, lsp, kernelname)); - CHECK_STATUS(gcl_print_memory(handle, &filter[1], "conv_direct_filter")); - CHECK_STATUS(gcl_print_memory(handle, &bias[1], "conv_direct_bias")); - CHECK_STATUS(gcl_print_memory(handle, output, "conv_direct_output")); - handle->t_total += handle->t_execute; -#endif - return SUCCESS; -} - -EE depthwise_pointwise_convolution_gemm_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes) -{ - UNUSED(forwardRunInfo); - U32 fw, fh, fc, fn; - tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); - U32 item_kd = forwardRunInfo->best_k[0]; - U32 item_kp = forwardRunInfo->best_k[1]; - U32 item_c = forwardRunInfo->best_c[1]; - U32 s0, s1, s2; - U32 num, byteSize; - s0 = fw * fh; - s1 = ALIGN(fc, item_kd) / item_kd; - s2 = 1; - num = s0 * s1 * s2 * item_kd; - byteSize = num * bytesOf(DT_F16); - gclmemFilterDesc[0].stride[0] = s0; - gclmemFilterDesc[0].stride[1] = s1; - gclmemFilterDesc[0].stride[2] = s2; - gclmemFilterDesc[0].offset[0] = 0; - gclmemFilterDesc[0].offset[1] = 0; - gclmemFilterDesc[0].offset[2] = 0; - gclmemFilterDesc[0].num = num; - gclmemFilterDesc[0].byteSize = byteSize; - gclmemFilterDesc[0].memType = GCL_MEM_BUF; - gclmemFilterDesc[0].memFormat = DF_NHWCN4; - gclmemFilterDesc[0].flags = CL_MEM_READ_WRITE; - gclmemFilterDesc[0].host_ptr = NULL; - - s0 = ALIGN(fn, item_kp); - s1 = ALIGN(fc, item_c); - s2 = 1; - num = s0 * s1 * s2; - byteSize = num * bytesOf(DT_F16); - gclmemFilterDesc[1].stride[0] = s0; - gclmemFilterDesc[1].stride[1] = s1; - gclmemFilterDesc[1].stride[2] = s2; - gclmemFilterDesc[1].offset[0] = 0; - gclmemFilterDesc[1].offset[1] = 0; - gclmemFilterDesc[1].offset[2] = 0; - gclmemFilterDesc[1].num = num; - gclmemFilterDesc[1].byteSize = byteSize; - gclmemFilterDesc[1].memType = GCL_MEM_BUF; - gclmemFilterDesc[1].memFormat = DF_HWCN; - gclmemFilterDesc[1].flags = CL_MEM_READ_WRITE; - gclmemFilterDesc[1].host_ptr = NULL; - *bytes = 0; - return SUCCESS; -} - -EE depthwise_pointwise_convolution_gemm_transform_filter_mali_fp16(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem) -{ - UNUSED(forwardRunInfo); - DataType fdt; - DataFormat fdf; - U32 fw, fh, fc, fn; - tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); - U32 fwh = fw * fh; - U32 item_kd = forwardRunInfo->best_k[0]; - U32 item_kp = forwardRunInfo->best_k[1]; - U32 item_c = 
forwardRunInfo->best_c[1]; - char kernelname[128]; - Kernel kernel; - sprintf(kernelname, "conv_depthwise_trans_fltbuf_%d", item_kd); - CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, filter[0].mem, fltmem[0].mem)); - U32 gs[3] = {fwh, ALIGN(fc, item_kd) / item_kd}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 2; - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - *fltmemDesc = tensor4df(fdt, fdf, fn, fc, fh, fw); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, &filter[0], "conv_depthwise_filter_org")); - CHECK_STATUS(gcl_print_memory(handle, &fltmem[0], "conv_depthwise_filter_tran")); -#endif - - fwh = 1; - U32 fn_align = ALIGN(fn, item_kp); - sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d",item_c, 0); - CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn_align, filter[1].mem, fltmem[1].mem)); - U32 gsc[3] = {fwh, ALIGN(fc, item_c) / item_c, fn_align}; - U32 lsc[3] = {0, 0, 0}; - U32 dimc = 3; - CHECK_STATUS(gcl_run_kernel(handle, kernel, dimc, gsc, lsc, kernelname)); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, &filter[1], "conv_direct_filter_org")); - CHECK_STATUS(gcl_print_memory(handle, &fltmem[1], "conv_direct_filter_tran")); -#endif - return SUCCESS; -} - -EE depthwise_pointwise_convolution_gemm_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes) -{ - DataType odt; - U32 oh, ow, fc; - tensorSelectGet(filterDesc, NULL, NULL, NULL, &fc, NULL, NULL); - tensorSelectGet(outputDesc, &odt, NULL, NULL, NULL, &oh, &ow); - UNUSED(inputDesc); - UNUSED(convDesc); - - U32 N; - U32 item_wh = forwardRunInfo->best_w[1]; - N = ALIGN(oh * ow, item_wh); - *bytes = N * ALIGN(fc, 4) * bytesOf(odt); - return SUCCESS; -} - -EE depthwise_pointwise_convolution_gemm_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode depthwiseActivationMode, - ActivationMode pointwiseActivationMode) { - CHECK_STATUS(depthwise_pointwise_gemm_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, convDesc, forwardRunInfo, - biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, depthwiseActivationMode, pointwiseActivationMode)); - - return SUCCESS; -} diff --git a/tensor_computing/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.h b/tensor_computing/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.h deleted file mode 100644 index 910722c8..00000000 --- a/tensor_computing/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.h +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
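// --- The pointwise stage above runs as a TN GEMM; this standalone sketch
// --- reproduces its dimension math: M spans output channels padded to the
// --- kernel tile item_kp, N spans spatial positions padded to item_whp, and
// --- K is the depthwise channel count. Tile sizes here are example picks.
#include <cstdio>

#define ALIGN(x, a) (((x) + (a) - 1) / (a) * (a))  // same round-up as the diff

int main() {
    unsigned oc = 96, oh = 28, ow = 28, fc = 96;   // illustrative shapes
    unsigned item_kp = 4, item_whp = 8;            // tuner-chosen tiles
    unsigned M = ALIGN(oc, item_kp);
    unsigned N = ALIGN(oh * ow, item_whp);         // thw_str above
    unsigned K = fc;
    printf("gemm_tn: M=%u N=%u K=%u grid=%ux%u\n",
           M, N, K, N / item_whp, M / item_kp);    // gsp = {N/item_whp, M/item_kp}
    return 0;
}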
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _DEPTHWISE_POINTWISE_CONVOLUTION_GEMM_MALI_FP16 -#define _DEPTHWISE_POINTWISE_CONVOLUTION_GEMM_MALI_FP16 -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" - -EE depthwise_pointwise_convolution_gemm_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes); - -EE depthwise_pointwise_convolution_gemm_transform_filter_mali_fp16(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem); - -EE depthwise_pointwise_convolution_gemm_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes); - -EE depthwise_pointwise_convolution_gemm_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode depthwiseActivationMode, - ActivationMode pointwiseActivationMode); -#endif diff --git a/tensor_computing/src/gpu/mali/fp16/eltwise_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/eltwise_mali_fp16.cpp deleted file mode 100644 index 32e30f08..00000000 --- a/tensor_computing/src/gpu/mali/fp16/eltwise_mali_fp16.cpp +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/eltwise_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -inline EE eltwise_checkpara_mali_fp16(std::vector inputDesc, - std::vector input, - TensorDesc outputDesc) { - for(auto it : inputDesc) { - if(it.dt != outputDesc.dt) return NOT_SUPPORTED; - } - U32 num = input.size(); - if(num > 8) return NOT_SUPPORTED; - if(outputDesc.dt != DT_F16) return NOT_SUPPORTED; - return SUCCESS; -} - -inline EE eltwise_core_mali_fp16(GCLHandle_t handle, - std::vector inputDesc, - std::vector input, - TensorDesc outputDesc, - GCLMem_t output, - EltwiseMode eltwiseMode) { - UNUSED(outputDesc); - U32 iw, ih, ic, in; - if(inputDesc[0].df == DF_NCHW) { - tensorSelectGet(inputDesc[0], NULL, NULL, &in, &ic, &ih, &iw); - } else if(inputDesc[0].df == DF_MKT) { - get_nlp_mkt_val(inputDesc[0], NULL, &in, &ic, &ih); - iw = 1; - } else { - CHECK_STATUS(NOT_SUPPORTED); - } - U32 iw_str, ih_str, iw_off, ih_off; - U32 num = input.size(); - GCLMem_t inputMem[8]; - for(U32 i = 0; i < num; ++i) inputMem[i] = (GCLMem_t)input[i]; - ih_str = inputMem[0]->desc.stride[0]; - iw_str = inputMem[0]->desc.stride[1]; - ih_off = inputMem[0]->desc.offset[0]; - iw_off = inputMem[0]->desc.offset[1]; - U32 ow_str, oh_str, ow_off, oh_off; - oh_str = output->desc.stride[0]; - ow_str = output->desc.stride[1]; - oh_off = output->desc.offset[0]; - ow_off = output->desc.offset[1]; - - cl_mem outbuf; - outbuf = output->mem; - - char modeName[16]; - if(eltwiseMode == ELTWISE_MAX) strcpy(modeName, "max"); - if(eltwiseMode == ELTWISE_SUM) strcpy(modeName, "sum"); - if(eltwiseMode == ELTWISE_PROD) strcpy(modeName, "prod"); - - char kernelName[128]; - sprintf(kernelName, "eltwise_%s%d", modeName, num); - Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelName, &kernel)); - switch(num) { - case 1: - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ic, ih_str, iw_str, ih_off, iw_off, - oh_str, ow_str, oh_off, ow_off, inputMem[0]->mem, outbuf));break; - case 2: - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ic, ih_str, iw_str, ih_off, iw_off, - oh_str, ow_str, oh_off, ow_off, inputMem[0]->mem, inputMem[1]->mem, outbuf));break; - case 3: - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ic, ih_str, iw_str, ih_off, iw_off, - oh_str, ow_str, oh_off, ow_off, inputMem[0]->mem, inputMem[1]->mem, inputMem[2]->mem, outbuf));break; - case 4: - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ic, ih_str, iw_str, ih_off, iw_off, - oh_str, ow_str, oh_off, ow_off, inputMem[0]->mem, inputMem[1]->mem, inputMem[2]->mem, - inputMem[3]->mem, outbuf));break; - case 5: - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ic, ih_str, iw_str, ih_off, iw_off, - oh_str, ow_str, oh_off, ow_off, inputMem[0]->mem, inputMem[1]->mem, inputMem[2]->mem, - inputMem[3]->mem, inputMem[4]->mem, outbuf));break; - case 6: - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ic, ih_str, iw_str, ih_off, iw_off, - oh_str, ow_str, oh_off, ow_off, inputMem[0]->mem, inputMem[1]->mem, inputMem[2]->mem, - inputMem[3]->mem, inputMem[4]->mem, inputMem[5]->mem, outbuf));break; - case 7: - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ic, ih_str, iw_str, ih_off, 
iw_off, - oh_str, ow_str, oh_off, ow_off, inputMem[0]->mem, inputMem[1]->mem, inputMem[2]->mem, - inputMem[3]->mem, inputMem[4]->mem, inputMem[5]->mem, inputMem[6]->mem, outbuf));break; - case 8: - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ic, ih_str, iw_str, ih_off, iw_off, - oh_str, ow_str, oh_off, ow_off, inputMem[0]->mem, inputMem[1]->mem, inputMem[2]->mem, - inputMem[3]->mem, inputMem[4]->mem, inputMem[5]->mem, inputMem[6]->mem, inputMem[7]->mem, outbuf));break; - default: - return NOT_SUPPORTED; - } - - - U32 gs[3] = {(ih + 1) / 2, iw, (ic + 3) / 4 * in}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); - for(U32 i = 0; i < num; ++i){ - std::cout << "eltwise_input " << i << " " << std::endl; - CHECK_STATUS(gcl_print_memory(handle, inputMem[i], "eltwise_input")); - } - CHECK_STATUS(gcl_print_memory(handle, output, "eltwise_output")); -#endif - return SUCCESS; -} - - -EE eltwise_mali_fp16(GCLHandle_t handle, - std::vector inputDesc, - std::vector input, - TensorDesc outputDesc, - GCLMem_t output, - EltwiseMode eltwiseMode) { - CHECK_STATUS(eltwise_checkpara_mali_fp16(inputDesc, input, outputDesc)); - CHECK_STATUS(eltwise_core_mali_fp16 (handle, inputDesc, input, outputDesc, output, eltwiseMode)); - return SUCCESS; -} - diff --git a/tensor_computing/src/gpu/mali/fp16/embedding_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/embedding_mali_fp16.cpp deleted file mode 100644 index d628ae95..00000000 --- a/tensor_computing/src/gpu/mali/fp16/embedding_mali_fp16.cpp +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
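// --- The eltwise path above specializes one OpenCL kernel per (mode, arity)
// --- pair, which is why its argument-setting switch enumerates 1..8 inputs.
// --- This sketch shows just the name construction it relies on; everything
// --- here is illustrative host-side C++.
#include <cstdio>

enum EltwiseMode { ELTWISE_MAX, ELTWISE_SUM, ELTWISE_PROD };

int main() {
    EltwiseMode mode = ELTWISE_SUM;
    unsigned num = 3;                       // input tensor count, capped at 8 above
    const char* m = (mode == ELTWISE_MAX) ? "max"
                  : (mode == ELTWISE_SUM) ? "sum" : "prod";
    char kernelName[128];
    snprintf(kernelName, sizeof(kernelName), "eltwise_%s%u", m, num);
    printf("%s\n", kernelName);             // -> eltwise_sum3
    return 0;
}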
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/embedding_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -inline EE embedding_checkpara_mali_fp16(TensorDesc weightDesc, - TensorDesc outputDesc) { - if(weightDesc.dt != outputDesc.dt || weightDesc.dt != DT_F16) return NOT_SUPPORTED; - return SUCCESS; -} - -inline EE embedding_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc weightDesc, - GCLMem_t weight, - TensorDesc outputDesc, - GCLMem_t output, - U32 inputDim, - U32 numOutput, - bool transpose) { - UNUSED(weightDesc); - UNUSED(outputDesc); - UNUSED(inputDim); - UNUSED(numOutput); - U32 step = inputDesc.dims[0]; - U32 on = numOutput; - U32 oh_str = output->desc.stride[0]; - U32 ow_str = output->desc.stride[1]; - U32 oc_str = output->desc.stride[2]; - U32 oh_off = output->desc.offset[0]; - U32 ow_off = output->desc.offset[1]; - if(ow_str != 1 || oh_off != 0 || ow_off != 0) CHECK_STATUS(NOT_SUPPORTED); - cl_mem inbuf, weibuf, outbuf; - inbuf = input->mem; - weibuf = weight->mem; - outbuf = output->mem; - - if(!transpose) { - U32 gs[2] = {oc_str, step}; - U32 ls[2] = {0, 0}; - U32 dim = 2; - Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, "embedding", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, step, on, oc_str, oh_str, oh_off, ow_off, inbuf, weibuf, outbuf)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, "embedding"); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "embedding")); - CHECK_STATUS(gcl_print_memory(handle, output, "embedding_output")); -#endif - return SUCCESS; - } else { - return NOT_SUPPORTED; - } -} - - -EE embedding_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc weightDesc, - GCLMem_t weight, - TensorDesc outputDesc, - GCLMem_t output, - U32 inputDim, - U32 numOutput, - bool transpose) { - CHECK_STATUS(embedding_checkpara_mali_fp16(weightDesc, outputDesc)); - CHECK_STATUS(embedding_core_mali_fp16(handle, inputDesc, input, weightDesc, weight, outputDesc, output, inputDim, numOutput, transpose)); - return SUCCESS; -} - diff --git a/tensor_computing/src/gpu/mali/fp16/embedding_mali_fp16.h b/tensor_computing/src/gpu/mali/fp16/embedding_mali_fp16.h deleted file mode 100644 index 383134e6..00000000 --- a/tensor_computing/src/gpu/mali/fp16/embedding_mali_fp16.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#ifndef _EMBEDDING_MALI_FP16 -#define _EMBEDDING_MALI_FP16 -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" - -EE embedding_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc weightDesc, - GCLMem_t weight, - TensorDesc outputDesc, - GCLMem_t output, - U32 inputDim, - U32 numOutput, - bool transpose); -#endif - diff --git a/tensor_computing/src/gpu/mali/fp16/fully_connected_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/fully_connected_mali_fp16.cpp deleted file mode 100644 index 29490a89..00000000 --- a/tensor_computing/src/gpu/mali/fp16/fully_connected_mali_fp16.cpp +++ /dev/null @@ -1,330 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
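// [Editor's note] A plain CPU reference for what the deleted "embedding" kernel
// computes in the non-transposed case: each of `step` token ids selects one
// `numOutput`-wide row of the weight table. This is a hedged sketch of the
// semantics only; the GPU kernel additionally honours the NCWHC4 strides/offsets.
#include <vector>
#include <cstdint>
#include <cstddef>

std::vector<float> embedding_lookup(const std::vector<uint32_t>& tokens,  // `step` token ids
                                    const std::vector<float>& weight,     // inputDim x numOutput
                                    uint32_t numOutput) {
    std::vector<float> out(tokens.size() * numOutput);
    for (size_t t = 0; t < tokens.size(); ++t)
        for (uint32_t k = 0; k < numOutput; ++k)
            out[t * numOutput + k] = weight[tokens[t] * numOutput + k];  // row gather
    return out;
}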
- - -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/fully_connected_mali_fp16.h" - -inline EE fully_connected_checkpara_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc) { - if(inputDesc.dt != outputDesc.dt || inputDesc.dt != filterDesc.dt || inputDesc.dt != DT_F16) return NOT_MATCH; - return SUCCESS; -} - - -inline EE fully_connected_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - std::vector filter, - TensorDesc biasDesc, - std::vector bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - std::vector output, - ForwardRunInfoMali_t forwardRunInfo) { - UNUSED(biasDesc); - UNUSED(tmpBytes); - UNUSED(outputDesc); - - U32 ih_str, iw_str, ih_off, iw_off, ihw_str; - U32 oh_str, ow_str, oh_off, ow_off; - U32 fw, fh, fc, fn; - cl_mem inbuf, fltbuf, biasmem, outbuf, tmp; - inbuf = input->mem; - fltbuf = filter[0]->mem; - biasmem = bias[0]->mem; - outbuf = output[0]->mem; - tmp = tmpBuf->mem; - - tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); - ih_str = input->desc.stride[0]; - iw_str = input->desc.stride[1]; - ih_off = input->desc.offset[0]; - iw_off = input->desc.offset[1]; - oh_str = output[0]->desc.stride[0]; - ow_str = output[0]->desc.stride[1]; - oh_off = output[0]->desc.offset[0]; - ow_off = output[0]->desc.offset[1]; - ihw_str = ih_str * iw_str; - char kernelname[128]; - Kernel kernel; - U32 gs[3]; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - U32 item_w = forwardRunInfo->best_w[0]; - U32 item_c = forwardRunInfo->best_c[0]; - U32 item_k = forwardRunInfo->best_k[0]; - - if(fw == 1 && fh == 1) { - if(inputDesc.df == DF_NCHW) { - U32 ic_str; - ic_str = filter[0]->desc.stride[1]; - if(ih_str > 1 || iw_str > 1) CHECK_STATUS(NOT_SUPPORTED); - sprintf(kernelname, "conv_direct_spe_fwhs1_%d", item_c); - gs[0] = fn; - gs[1] = 1; - gs[2] = 1; - dim = 1; - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, oh_str, ow_str, oh_off, ow_off, fn, gs[0], gs[1], inbuf, fltbuf, biasmem, outbuf)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - CHECK_STATUS(gcl_print_memory(handle, input, "fc_wh1_input")); - CHECK_STATUS(gcl_print_memory(handle, filter[0], "fc_wh1_filter")); - CHECK_STATUS(gcl_print_memory(handle, bias[0], "fc_wh1_bias")); - CHECK_STATUS(gcl_print_memory(handle, output[0], "fc_wh1_output")); - handle->t_total += handle->t_execute; -#endif - } - if(inputDesc.df == DF_MKT) { - item_k = item_k >> 2; - U32 ic_str = input->desc.stride[2]; - U32 ohw_str; - U32 step = inputDesc.dims[0]; - sprintf(kernelname, "conv_direct_s%d_%d%d%d", 1, 1, item_w, item_k); - for(U32 i = 0; i < filter.size(); ++i) { - fltbuf = filter[i]->mem; - biasmem = bias[i]->mem; - outbuf = output[i]->mem; - iw_str = input->desc.stride[0]; - ih_str = input->desc.stride[1]; - iw_off = input->desc.offset[0]; - ih_off = input->desc.offset[1]; - ow_str = output[i]->desc.stride[0]; - oh_str = output[i]->desc.stride[1]; - ow_off = output[i]->desc.offset[0]; - oh_off = output[i]->desc.offset[1]; - ohw_str = oh_str * ow_str; - if(ih_str != 1 || ih_off != 0) CHECK_STATUS(NOT_SUPPORTED); - gs[0] = 1; - gs[1] = (step + item_w - 1) / item_w; - gs[2] = output[i]->desc.stride[2] / item_k; - 
CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, oh_str, ohw_str, oh_off, ow_off, step, gs[0], gs[1], inbuf, fltbuf, biasmem, outbuf)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - CHECK_STATUS(gcl_print_memory(handle, input, "conv_direct_input")); - CHECK_STATUS(gcl_print_memory(handle, filter[i], "conv_direct_filter")); - CHECK_STATUS(gcl_print_memory(handle, bias[i], "conv_direct_bias")); - CHECK_STATUS(gcl_print_memory(handle, output[i], "conv_direct_output")); - handle->t_total += handle->t_execute; -#endif - } - } - } else { - U32 ihy_str, fhy_str, fhw_str, fwc_str; - ihy_str = ih_str * item_w; - fc = (fc + item_c - 1) / item_c; - fn = (fn + item_k - 1) / item_k; - fhy_str = fh * item_w; - fhw_str = fh * fw; - fwc_str = fw * fc; - CHECK_STATUS(gcl_create_kernel_binary(handle, "fc_p1", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, item_w, ih_str, iw_str, ih_off, iw_off, ihy_str, ihw_str, fh, fw, fc, fn, fhy_str, fhw_str, fwc_str, fltbuf, inbuf, tmp)); - gs[0] = fh; - gs[1] = item_w; - gs[2] = fn; - gcl_set_kernelVec(handle, kernel, dim, gs, ls, "fc_p1"); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "fc_p1")); - CHECK_STATUS(gcl_print_memory(handle, input, "fc_p1_input")); - CHECK_STATUS(gcl_print_memory(handle, filter[0], "fc_p1_filter")); - CHECK_STATUS(gcl_print_buffer(handle, tmp, fh * item_w * fn * item_k, "fc_p1_output")); - handle->t_total += handle->t_execute; -#endif - CHECK_STATUS(gcl_create_kernel_binary(handle, "fc_p2", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, fh * item_w, fn, oh_str, ow_str, oh_off, ow_off, tmp, biasmem, outbuf)); - U32 gs2 = fn; - U32 ls2 = 0; - dim = 1; - gcl_set_kernelVec(handle, kernel, dim, &gs2, &ls2, "fc_p2"); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs2, &ls2, "fc_p2")); - CHECK_STATUS(gcl_print_memory(handle, bias[0], "fc_p2_bias")); - CHECK_STATUS(gcl_print_memory(handle, output[0], "fc_p2_output")); - handle->t_total += handle->t_execute; -#endif - } - return SUCCESS; -} - -EE fully_connected_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes, - ForwardRunInfoMali_t forwardRunInfo) { - U32 fw, fh, fc, fn; - tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); - U32 item_c = forwardRunInfo->best_c[0]; - U32 item_k = forwardRunInfo->best_k[0]; - U32 s0 = 0; - U32 s1 = 0; - U32 s2 = 0; - U32 num = 0; - U32 byteSize; - - if(item_k == 0) { - s0 = fn; - s1 = (fc + item_c - 1) / item_c; - s2 = 1; - DataFormat df = DF_CHWNC4; - if(item_c == 8) df = DF_CHWNC8; - if(item_c == 16) df = DF_CHWNC16; - gclmemFilterDesc->memFormat = df; - num = s0 * s1 * s2 * item_c; - } else if(fw == 1 && fh == 1) { - s0 = item_k >> 2; - s1 = (fc + item_c - 1) / item_c; - s2 = (fn + item_k - 1) / item_k; - gclmemFilterDesc->memFormat = DF_NCHWN4C4; - num = s0 * s1 * s2 * item_c * item_k / (item_k >> 2); - } else { - s0 = fh; - s1 = fw; - s2 = ((fc + item_c - 1) / item_c) * ((fn + item_k - 1) / item_k); - num = s0 * s1 * s2 * item_c * item_k; - gclmemFilterDesc->memFormat = DF_NCWHN4C4; - } - byteSize = num * bytesOf(DT_F16); - gclmemFilterDesc->stride[0] = s0; - gclmemFilterDesc->stride[1] = s1; - gclmemFilterDesc->stride[2] = s2; - gclmemFilterDesc->offset[0] = 0; - gclmemFilterDesc->offset[1] = 0; - gclmemFilterDesc->offset[2] = 0; - 
gclmemFilterDesc->num = num; - gclmemFilterDesc->byteSize = byteSize; - gclmemFilterDesc->memType = GCL_MEM_BUF; - gclmemFilterDesc->flags = CL_MEM_READ_WRITE; - gclmemFilterDesc->host_ptr = NULL; - *bytes = 0; - return SUCCESS; -} - -EE fully_connected_transform_filter_mali_fp16(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - TensorDesc* fltmemDesc, - std::vector fltmem, - ForwardRunInfoMali_t forwardRunInfo) { - DataType fdt; - DataFormat fdf; - U32 fw, fh, fc, fn; - tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); - char kernelname[128]; - Kernel kernel; - U32 gs[3]; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - U32 fwh = fw * fh; - U32 item_c = forwardRunInfo->best_c[0]; - U32 item_k = forwardRunInfo->best_k[0]; - if(fw == 1 && fh == 1) { - if(item_k == 0) { - sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d",item_c, item_k); - CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn, filter->mem, fltmem[0]->mem)); - gs[0] = fwh; - gs[1] = (fc + item_c - 1) / item_c; - gs[2] = fn; - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - } else { - sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d",item_c, item_k); - CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); - if(fltmem.size() == 1) { - CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn, filter->mem, fltmem[0]->mem)); - gs[0] = fwh; - gs[1] = (fc + item_c - 1) / item_c; - gs[2] = (fn + item_k - 1) / item_k * item_k; - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - } else { - GCLMem_t tmp = gcl_create_gclmem(); - tmp->desc.byteSize = 0; - for(U32 i = 0; i < fltmem.size(); ++i) tmp->desc.byteSize += fltmem[i]->desc.byteSize; - tmp->desc.memType = GCL_MEM_BUF; - tmp->desc.flags = CL_MEM_READ_WRITE; - CHECK_STATUS(gcl_create_memory(handle, tmp)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn, filter->mem, tmp->mem)); - gs[0] = fwh; - gs[1] = (fc + item_c - 1) / item_c; - gs[2] = (fn + item_k - 1) / item_k * item_k; - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - U32 offset[2] = {0, 0}; - for(U32 i = 0; i < fltmem.size(); i++) { - U32 size = fltmem[i]->desc.byteSize; - CHECK_STATUS(gcl_trans_memory(handle, tmp, fltmem[i], &size, DEVICE_BUF_TO_BUF, CL_TRUE, offset)); - offset[0] += size; - } - gcl_destroy_gclmem(tmp); - } - } - } else { - sprintf(kernelname, "fc_trans_fltbuf_%d%d",item_c, item_k); - CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, fw, fh, fwh, fc, fn, filter->mem, fltmem[0]->mem)); - gs[0] = fw; - gs[1] = fh; - gs[2] = (fc + item_c - 1) / item_c * ((fn + item_k - 1) / item_k) * item_k; - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - } - *fltmemDesc = tensor4df(fdt, fdf, fn, fc, fh, fw); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, filter, "fc_filter_org")); - for(U32 i = 0; i < fltmem.size(); ++i) CHECK_STATUS(gcl_print_memory(handle, fltmem[i], "fc_filter_tran")); -#endif - return SUCCESS; -} - -EE fully_connected_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - U32* bytes, - ForwardRunInfoMali_t forwardRunInfo) { - U32 fn, fw, fh; - tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, &fh, &fw); - if(fh == 1 && fw == 1) { - *bytes = 0; - } else { - DataType dt; - U32 ic, ih, iw; - tensorSelectGet(inputDesc, &dt, NULL, NULL, &ic, &ih, &iw); - U32 item_w = forwardRunInfo->best_w[0]; - U32 item_k = 
forwardRunInfo->best_k[0]; - *bytes = ih * item_w * ((fn + item_k - 1) / item_k * item_k) * bytesOf(dt); - } - return SUCCESS; -} - -EE fully_connected_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - std::vector filter, - TensorDesc biasDesc, - std::vector bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - std::vector output, - ForwardRunInfoMali_t forwardRunInfo) { - CHECK_STATUS(fully_connected_checkpara_mali_fp16(inputDesc, filterDesc, outputDesc)); - CHECK_STATUS(fully_connected_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, forwardRunInfo)); - return SUCCESS; -} diff --git a/tensor_computing/src/gpu/mali/fp16/fully_connected_mali_fp16.h b/tensor_computing/src/gpu/mali/fp16/fully_connected_mali_fp16.h deleted file mode 100644 index 1aff5c9d..00000000 --- a/tensor_computing/src/gpu/mali/fp16/fully_connected_mali_fp16.h +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#ifndef _FC_MALI_FP16 -#define _FC_MALI_FP16 -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" - -EE fully_connected_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes, - ForwardRunInfoMali_t forwardRunInfo); - -EE fully_connected_transform_filter_mali_fp16(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - TensorDesc* fltmemDesc, - std::vector fltmem, - ForwardRunInfoMali_t forwardRunInfo); - -EE fully_connected_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - U32* bytes, - ForwardRunInfoMali_t forwardRunInfo); - -EE fully_connected_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - std::vector filter, - TensorDesc biasDesc, - std::vector bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - std::vector output, - ForwardRunInfoMali_t forwardRunInfo); -#endif diff --git a/tensor_computing/src/gpu/mali/fp16/matmul_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/matmul_mali_fp16.cpp deleted file mode 100644 index 8c6fbaa0..00000000 --- a/tensor_computing/src/gpu/mali/fp16/matmul_mali_fp16.cpp +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/matmul_mali_fp16.h" - -inline EE matmul_checkpara_mali_fp16(TensorDesc matrixADesc, - TensorDesc matrixBDesc, - TensorDesc matrixCDesc) { - if(matrixADesc.dt != matrixBDesc.dt || matrixADesc.dt != matrixCDesc.dt || matrixADesc.dt != DT_F16) return NOT_MATCH; - return SUCCESS; -} - -inline EE matmul_core_mali_fp16(GCLHandle_t handle, - TensorDesc matrixADesc, - bool transposeA, - const GCLMem_t matrixA, - TensorDesc matrixBDesc, - bool transposeB, - const GCLMem_t matrixB, - GCLMem_t tmp, - TensorDesc matrixCDesc, - GCLMem_t matrixC, - ForwardRunInfoMali_t forwardRunInfo) { - UNUSED(tmp); - UNUSED(matrixCDesc); - U32 aw, ah, bw, bh; - tensorSelectGet(matrixADesc, NULL, NULL, NULL, NULL, &ah, &aw); - tensorSelectGet(matrixBDesc, NULL, NULL, NULL, NULL, &bh, &bw); - U32 item_w = forwardRunInfo->best_w[0]; - U32 item_c = forwardRunInfo->best_c[0]; - U32 item_k = forwardRunInfo->best_k[0]; - cl_mem A, B, C; - A = matrixA->mem; - B = matrixB->mem; - C = matrixC->mem; - char kernelname[128]; - Kernel kernel; - U32 gs[3]; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - if(matrixA->desc.offset[0] != 0 || - matrixA->desc.offset[1] != 0 || - matrixB->desc.offset[0] != 0 || - matrixB->desc.offset[1] != 0 || - matrixC->desc.offset[0] != 0 || - matrixC->desc.offset[1] != 0 - ) CHECK_STATUS(NOT_SUPPORTED); - if(transposeA && !transposeB) { - U32 M = matrixA->desc.stride[0]; - U32 N = matrixB->desc.stride[0]; - U32 K = ah; - U32 ow_str = matrixC->desc.stride[0]; - U32 A_str = M * matrixA->desc.stride[1]; - U32 B_str = N * matrixB->desc.stride[1]; - U32 C_str = ow_str * matrixC->desc.stride[1]; - U32 batch = matrixA->desc.stride[2]; - gs[0] = (bw + item_w - 1) / item_w; - gs[1] = (aw + item_k - 1) / item_k; - gs[2] = batch; - sprintf(kernelname, "gemm_tn_nobias_%d%d", item_k, item_w); - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, K, ow_str, A_str, B_str, C_str, gs[0], gs[1], A, B, C)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - CHECK_STATUS(gcl_print_memory(handle, matrixA, "gemm_tn_a")); - CHECK_STATUS(gcl_print_memory(handle, matrixB, "gemm_tn_b")); - CHECK_STATUS(gcl_print_memory(handle, matrixC, "gemm_tn_c")); - handle->t_total += 
handle->t_execute; -#endif - return SUCCESS; - } - - if(!transposeA && transposeB) { - U32 KA = matrixA->desc.stride[0]; - U32 KB = matrixB->desc.stride[0]; - U32 K = (aw + item_c - 1) / item_c * item_c; - U32 ow_str = matrixC->desc.stride[0]; - U32 A_str = KA * matrixA->desc.stride[1]; - U32 B_str = KB * matrixB->desc.stride[1]; - U32 C_str = ow_str * matrixC->desc.stride[1]; - U32 batch = matrixA->desc.stride[2]; - gs[0] = (bh + item_w - 1) / item_w; - gs[1] = (ah + item_k - 1) / item_k; - gs[2] = batch; - sprintf(kernelname, "gemm_nt_nobias_%d%d%d", item_k, item_w, (item_c >> 1)); - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, KA, KB, K, ow_str, A_str, B_str, C_str, gs[0], gs[1], A, B, C)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - CHECK_STATUS(gcl_print_memory(handle, matrixA, "gemm_nt_a")); - CHECK_STATUS(gcl_print_memory(handle, matrixB, "gemm_nt_b")); - CHECK_STATUS(gcl_print_memory(handle, matrixC, "gemm_nt_c")); - handle->t_total += handle->t_execute; -#endif - return SUCCESS; - } - return NOT_SUPPORTED; -} - - -EE matmul_infer_forward_tmp_bytes_mali_fp16(TensorDesc matrixADesc, - bool transposeA, - TensorDesc matrixBDesc, - bool transposeB, - U32* bytes, - ForwardRunInfoMali_t forwardRunInfo) { - UNUSED(matrixADesc); - UNUSED(transposeA); - UNUSED(matrixBDesc); - UNUSED(transposeB); - UNUSED(forwardRunInfo); - *bytes = 0; - return SUCCESS; -} - -EE matmul_mali_fp16(GCLHandle_t handle, - TensorDesc matrixADesc, - bool transposeA, - const GCLMem_t matrixA, - TensorDesc matrixBDesc, - bool transposeB, - const GCLMem_t matrixB, - GCLMem_t tmp, - TensorDesc matrixCDesc, - GCLMem_t matrixC, - ForwardRunInfoMali_t forwardRunInfo) { - CHECK_STATUS(matmul_checkpara_mali_fp16(matrixADesc, matrixBDesc, matrixCDesc)); - CHECK_STATUS(matmul_core_mali_fp16(handle, matrixADesc, transposeA, matrixA, matrixBDesc, transposeB, matrixB, tmp, matrixCDesc, matrixC, forwardRunInfo)); - return SUCCESS; -} diff --git a/tensor_computing/src/gpu/mali/fp16/matmul_mali_fp16.h b/tensor_computing/src/gpu/mali/fp16/matmul_mali_fp16.h deleted file mode 100644 index cc81f8e0..00000000 --- a/tensor_computing/src/gpu/mali/fp16/matmul_mali_fp16.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
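// [Editor's note] A CPU reference for the "gemm_tn_nobias" case deleted above
// (transposeA && !transposeB): C[m][n] = sum_k A[k][m] * B[k][n], i.e. A is stored
// K x M and read transposed. In the real code the leading dimensions come from the
// padded GCLMem strides; here they are simply the matrix widths. A sketch under
// those assumptions, not the original kernel.
#include <vector>

void gemm_tn(const std::vector<float>& A,  // K x M, row-major
             const std::vector<float>& B,  // K x N, row-major
             std::vector<float>& C,        // M x N, row-major
             unsigned M, unsigned N, unsigned K) {
    for (unsigned m = 0; m < M; ++m)
        for (unsigned n = 0; n < N; ++n) {
            float acc = 0.0f;              // "nobias": accumulator starts at zero
            for (unsigned k = 0; k < K; ++k)
                acc += A[k * M + m] * B[k * N + n];
            C[m * N + n] = acc;
        }
}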
- - -#ifndef _MATMUL_MALI_FP16 -#define _MATMUL_MALI_FP16 -#include "sys.h" -#include "tensor_desc.h" -#include "type.h" -#include "error.h" -#include "tensor_computing_type.h" - -EE matmul_infer_forward_tmp_bytes_mali_fp16(TensorDesc matrixADesc, - bool transposeA, - TensorDesc matrixBDesc, - bool transposeB, - U32* bytes, - ForwardRunInfoMali_t forwardRunInfo); - -EE matmul_mali_fp16(GCLHandle_t handle, - TensorDesc matrixADesc, - bool transposeA, - const GCLMem_t matrixA, - TensorDesc matrixBDesc, - bool transposeB, - const GCLMem_t matrixB, - GCLMem_t tmp, - TensorDesc matrixCDesc, - GCLMem_t matrixC, - ForwardRunInfoMali_t forwardRunInfo); -#endif diff --git a/tensor_computing/src/gpu/mali/fp16/multiply_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/multiply_mali_fp16.cpp deleted file mode 100644 index 54dc4a5f..00000000 --- a/tensor_computing/src/gpu/mali/fp16/multiply_mali_fp16.cpp +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
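// [Editor's note] The deleted matmul launch tiles the output with item_w x item_k
// blocks chosen by the tuner (ForwardRunInfoMali), so the grid is
// gs[0] = ceil(bw / item_w), gs[1] = ceil(aw / item_k), gs[2] = batch.
// A small illustrative computation of that grid; the names are not original API.
#include <cstdio>

int main() {
    unsigned aw = 100, bw = 64, batch = 4;   // example dims (A: K x aw, B: K x bw)
    unsigned item_w = 4, item_k = 8;         // tuner-selected tile sizes
    unsigned gs0 = (bw + item_w - 1) / item_w;
    unsigned gs1 = (aw + item_k - 1) / item_k;
    printf("grid = {%u, %u, %u}; each work-item computes a %ux%u tile of C\n",
           gs0, gs1, batch, item_k, item_w);
    return 0;
}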
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/multiply_mali_fp16.h" - -inline EE multiply_checkpara_mali_fp16(TensorDesc inputDesc, - TensorDesc outputDesc) { - if(inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) return NOT_SUPPORTED; - return SUCCESS; -} - -inline EE multiply_core_mali_fp16(GCLHandle_t handle, - float* alpha, - float* beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - UNUSED(outputDesc); - U32 iw, ih, ic, in; - tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); - U32 iw_str, ih_str, iw_off, ih_off; - U32 ow_str, oh_str, ow_off, oh_off; - iw_str = input->desc.stride[0]; - ih_str = input->desc.stride[1]; - iw_off = input->desc.offset[0]; - ih_off = input->desc.offset[1]; - ow_str = output->desc.stride[0]; - oh_str = output->desc.stride[1]; - ow_off = output->desc.offset[0]; - oh_off = output->desc.offset[1]; - cl_mem inbuf, outbuf; - inbuf = input->mem; - outbuf = output->mem; - float alp = *alpha; - float bet = *beta; - U32 gs[3] = {(iw + 3) / 4, ih, ic}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, "multiply_align_nchw", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, oh_off, ow_off, gs[0], gs[1], alp, bet, inbuf, outbuf)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, "multiply_align_nchw"); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, input, "multiply_align_nchw")); - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "multiply_align_nchw")); - CHECK_STATUS(gcl_print_memory(handle, output, "multiply_align_output")); -#endif - return SUCCESS; -} - - -EE multiply_mali_fp16(GCLHandle_t handle, - void* alpha, - void* beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - CHECK_STATUS(multiply_checkpara_mali_fp16(inputDesc, outputDesc)); - CHECK_STATUS(multiply_core_mali_fp16(handle, (float*)alpha, (float*)beta, inputDesc, input, outputDesc, output)); - return SUCCESS; -} - diff --git a/tensor_computing/src/gpu/mali/fp16/multiply_mali_fp16.h b/tensor_computing/src/gpu/mali/fp16/multiply_mali_fp16.h deleted file mode 100644 index 9060a2a6..00000000 --- a/tensor_computing/src/gpu/mali/fp16/multiply_mali_fp16.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _MULTIPLY_MALI_FP16 -#define _MULTIPLY_MALI_FP16 -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" - - -EE multiply_mali_fp16(GCLHandle_t handle, - void* alpha, - void* beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output); -#endif diff --git a/tensor_computing/src/gpu/mali/fp16/normalization_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/normalization_mali_fp16.cpp deleted file mode 100644 index a84c00fe..00000000 --- a/tensor_computing/src/gpu/mali/fp16/normalization_mali_fp16.cpp +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
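// [Editor's note] A CPU sketch of what the deleted "multiply_align_nchw" kernel
// appears to compute: judging by the scalar alp/bet arguments it is an elementwise
// affine map, presumably out = alpha * in + beta. Treat the exact formula as an
// assumption; only the scalar-alpha/beta interface is visible in this diff.
#include <vector>
#include <cstddef>

void multiply_scalar(const std::vector<float>& in, std::vector<float>& out,
                     float alpha, float beta) {
    out.resize(in.size());
    for (size_t i = 0; i < in.size(); ++i)
        out[i] = alpha * in[i] + beta;   // assumed semantics, see note above
}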
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/normalization_mali_fp16.h" - -inline EE normalization_checkpara_mali_fp16(TensorDesc inputDesc, - TensorDesc outputDesc) { - if(inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) return NOT_SUPPORTED; - return SUCCESS; -} - -inline EE normalization_core_mali_fp16(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - UNUSED(outputDesc); - U32 step = inputDesc.dims[0]; - U32 numOutput = inputDesc.dims[1]; - U32 iw_str, ih_str, ic_str, iw_off, ih_off; - U32 oh_str, ow_off, oh_off; - ih_str = input->desc.stride[0]; - iw_str = input->desc.stride[1]; - ic_str = input->desc.stride[2]; - ih_off = input->desc.offset[0]; - iw_off = input->desc.offset[1]; - oh_str = output->desc.stride[0]; - oh_off = output->desc.offset[0]; - ow_off = output->desc.offset[1]; - if(iw_str != 1 || ih_off != 0 || iw_off != 0) CHECK_STATUS(NOT_SUPPORTED); - cl_mem alpbuf, betbuf, inbuf, outbuf; - alpbuf = alpha->mem; - betbuf = beta->mem; - inbuf = input->mem; - outbuf = output->mem; - - U32 gs = step; - U32 ls = 0; - U32 dim = 1; - Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, "normalization", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, step, numOutput, ih_str, ic_str, ih_off, iw_off, oh_str, oh_off, ow_off, alpbuf, betbuf, inbuf, outbuf)); - gcl_set_kernelVec(handle, kernel, dim, &gs, &ls, "normalization"); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, input, "normalization_input")); - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs, &ls, "normalization")); - CHECK_STATUS(gcl_print_memory(handle, output, "normalization_output")); -#endif - return SUCCESS; -} - - -EE normalization_mali_fp16(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - CHECK_STATUS(normalization_checkpara_mali_fp16(inputDesc, outputDesc)); - CHECK_STATUS(normalization_core_mali_fp16(handle, alpha, beta, inputDesc, input, outputDesc, output)); - return SUCCESS; -} - diff --git a/tensor_computing/src/gpu/mali/fp16/pooling_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/pooling_mali_fp16.cpp deleted file mode 100644 index 1dd359a1..00000000 --- a/tensor_computing/src/gpu/mali/fp16/pooling_mali_fp16.cpp +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/pooling_mali_fp16.h" - -inline EE pooling_checkpara_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - PoolingDesc poolingDesc, - TensorDesc outputDesc, - GCLMem_t output){ - if (handle == nullptr || nullptr == input || nullptr == output) return NULL_POINTER; - if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) return NOT_SUPPORTED; - if (inputDesc.df != outputDesc.df || inputDesc.df != DF_NCHW) return NOT_SUPPORTED; - if (inputDesc.dims[2] != outputDesc.dims[2] || inputDesc.dims[3] != outputDesc.dims[3]) return NOT_SUPPORTED; - if (poolingDesc.padding_top >= poolingDesc.kernelSize_h) return NOT_SUPPORTED; - if (poolingDesc.padding_bottom >= poolingDesc.kernelSize_w) return NOT_SUPPORTED; - if (input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCWHC4) return NOT_SUPPORTED; - return SUCCESS; -} - -inline EE pooling_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - PoolingDesc poolingDesc, - TensorDesc outputDesc, - GCLMem_t output){ - - - U32 iw, ih, ic, in; - U32 ow, oh, oc, on; - tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); - tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); - - cl_mem inbuf, outbuf; - inbuf = input->mem; - outbuf = output->mem; - - U32 iw_str, ih_str, iw_off, ih_off; - ih_str = input->desc.stride[0]; - iw_str = input->desc.stride[1]; - ih_off = input->desc.offset[0]; - iw_off = input->desc.offset[1]; - - U32 ow_str, oh_str, ow_off, oh_off; - oh_str = output->desc.stride[0]; - ow_str = output->desc.stride[1]; - oh_off = output->desc.offset[0]; - ow_off = output->desc.offset[1]; - - U32 sw, sh, pw, ph, kw, kh; - sw = poolingDesc.stride_w; - sh = poolingDesc.stride_h; - pw = poolingDesc.padding_left; - ph = poolingDesc.padding_top; - kw = poolingDesc.kernelSize_w; - kh = poolingDesc.kernelSize_h; - - Kernel kernel; - switch(poolingDesc.pm){ - case POOLING_MAX:{ - CHECK_STATUS(gcl_create_kernel_binary(handle, "pooling_max", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_off, iw_off, ih_str, iw_str, - oh, ow, oh_off, ow_off, oh_str, ow_str, - sh, sw, ph, pw, kh, kw, inbuf, outbuf)); - - U32 gs[3] = {oh, ow, (oc + 3) / 4 * on}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - gcl_set_kernelVec(handle, kernel, dim, gs, ls, "pooling_max"); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "pooling_max")); - CHECK_STATUS(gcl_print_memory(handle, input, "pooling_max_input")); - CHECK_STATUS(gcl_print_memory(handle, output, "pooling_max_output")); -#endif - break; - } - case POOLING_MEAN:{ - CHECK_STATUS(gcl_create_kernel_binary(handle, "pooling_mean", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_off, iw_off, ih_str, iw_str, - oh, ow, oh_off, ow_off, oh_str, ow_str, - sh, sw, ph, pw, kh, kw, inbuf, outbuf)); - - U32 gs[3] = {oh, ow, (oc + 3) / 4 * on}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - gcl_set_kernelVec(handle, kernel, dim, gs, ls, "pooling_mean"); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "pooling_mean")); - CHECK_STATUS(gcl_print_memory(handle, input, 
"pooling_mean_input")); - CHECK_STATUS(gcl_print_memory(handle, output, "pooling_mean_output")); -#endif - break; - } - default: - {CHECK_STATUS(NOT_SUPPORTED);} - } - return SUCCESS; - -} - -EE pooling_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - PoolingDesc poolingDesc, - TensorDesc outputDesc, - GCLMem_t output){ - CHECK_STATUS(pooling_checkpara_mali_fp16(handle, inputDesc, input, poolingDesc, outputDesc, output)); - CHECK_STATUS(pooling_core_mali_fp16 (handle, inputDesc, input, poolingDesc, outputDesc, output)); - return SUCCESS; -} - diff --git a/tensor_computing/src/gpu/mali/fp16/reshape_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/reshape_mali_fp16.cpp deleted file mode 100644 index 0f0b0b84..00000000 --- a/tensor_computing/src/gpu/mali/fp16/reshape_mali_fp16.cpp +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/reshape_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -inline EE reshape_checkpara_mali_fp16(TensorDesc inputDesc, - TensorDesc outputDesc) { - if(inputDesc.dt != outputDesc.dt) return NOT_SUPPORTED; - if(outputDesc.dt != DT_F16) return NOT_SUPPORTED; - return SUCCESS; -} - -inline EE reshape_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - DataFormat idf, odf; - idf = inputDesc.df; - odf = outputDesc.df; - U32 iw_str, ih_str, ic_str, iw_off, ih_off; - U32 ow_str, oh_str, oc_str, ow_off, oh_off; - get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); - get_gclmem_dim(output->desc, &ow_str, &oh_str, &oc_str, &ow_off, &oh_off); - cl_mem inbuf = input->mem; - cl_mem outbuf = output->mem; - if(idf == DF_NCHW) { - U32 iw, ih, ic, in; - tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); - if(odf == DF_NCHW) { - if(inbuf == outbuf) return SUCCESS; - U32 gs[3] = {ih, iw, (ic + 3) / 4}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, "reshape", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, oh_off, ow_off, gs[0], gs[1], inbuf, outbuf)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, "reshape"); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "reshape")); - CHECK_STATUS(gcl_print_memory(handle, input, "reshape_input")); - CHECK_STATUS(gcl_print_memory(handle, output, "reshape_output")); -#endif - return SUCCESS; - } - if(odf == DF_MKT) { - iw_str = input->desc.stride[0]; - ih_str = input->desc.stride[1]; - iw_off = input->desc.offset[0]; - ih_off = input->desc.offset[1]; - U32 m, k, t; - get_nlp_mkt_val(outputDesc, NULL, &m, &k, &t); - U32 gs[2] = {t, (k + 3) / 4}; - U32 ls[2] = {0, 0}; - U32 dim = 2; - Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, "reshape_nchw_to_mkt", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, ih, k, oh_str, ow_str, oh_off, ow_off, gs[0], gs[1], inbuf, outbuf)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, "reshape_nchw_to_mkt"); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "reshape_nchw_to_mkt")); - CHECK_STATUS(gcl_print_memory(handle, input, "reshape_nchw_to_mkt_input")); - CHECK_STATUS(gcl_print_memory(handle, output, "reshape_nchw_to_mkt_output")); -#endif - return SUCCESS; - } - } - - if(idf == DF_MKT && odf == DF_NCHW) { - U32 m, k, t; - U32 oh; - get_nlp_mkt_val(inputDesc, NULL, &m, &k, &t); - tensorSelectGet(outputDesc, NULL, NULL, NULL, NULL, &oh, NULL); - U32 gs[2] = {t, (k + 3) / 4}; - U32 ls[2] = {0, 0}; - U32 dim = 2; - Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, "reshape_mkt_to_nchw", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iw_str, ih_off, iw_off, ow_str, oh_str, ow_off, oh_off, oh, gs[0], gs[1], inbuf, outbuf)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, "reshape_mkt_to_nchw"); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "reshape_mkt_to_nchw")); - CHECK_STATUS(gcl_print_memory(handle, input, "reshape_mkt_to_nchw_input")); - CHECK_STATUS(gcl_print_memory(handle, output, "reshape_mkt_to_nchw_output")); -#endif - return SUCCESS; - } - return NOT_SUPPORTED; -} - -EE reshape_mali_fp16(GCLHandle_t 
handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - - CHECK_STATUS(reshape_checkpara_mali_fp16(inputDesc, outputDesc)); - CHECK_STATUS(reshape_core_mali_fp16(handle, inputDesc, input, outputDesc, output)); - return SUCCESS; -} - diff --git a/tensor_computing/src/gpu/mali/fp16/scale_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/scale_mali_fp16.cpp deleted file mode 100644 index 06bf46f0..00000000 --- a/tensor_computing/src/gpu/mali/fp16/scale_mali_fp16.cpp +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/scale_mali_fp16.h" - -inline EE scale_checkpara_mali_fp16(TensorDesc inputDesc, - TensorDesc outputDesc) { - if(inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) return NOT_SUPPORTED; - return SUCCESS; -} - -inline EE scale_core_mali_fp16(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - UNUSED(outputDesc); - U32 iw, ih, ic, in; - tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); - U32 iw_str, ih_str, iw_off, ih_off; - U32 ow_str, oh_str, ow_off, oh_off; - ih_str = input->desc.stride[0]; - iw_str = input->desc.stride[1]; - ih_off = input->desc.offset[0]; - iw_off = input->desc.offset[1]; - oh_str = output->desc.stride[0]; - ow_str = output->desc.stride[1]; - oh_off = output->desc.offset[0]; - ow_off = output->desc.offset[1]; - cl_mem inbuf, outbuf, albuf, bebuf; - inbuf = input->mem; - outbuf = output->mem; - albuf = alpha->mem; - bebuf = (beta) ? 
beta->mem : albuf; - - char modeName[16]; - if(beta){ - strcpy(modeName, "beta"); - } else { - strcpy(modeName, "nobeta"); - } - char kernelName[128]; - sprintf(kernelName, "scale_%s", modeName); - U32 gs[3] = {(ih + 1) / 2, iw, (ic + 3) / 4}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelName, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, oh_off, ow_off, gs[0], gs[1], albuf, bebuf, inbuf, outbuf)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, input, "scale_input")); - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); - CHECK_STATUS(gcl_print_memory(handle, alpha, "scale_alpha")); - if(beta) - CHECK_STATUS(gcl_print_memory(handle, beta, "scale_beta")); - CHECK_STATUS(gcl_print_memory(handle, output, "scale_output")); -#endif - return SUCCESS; -} - - -EE scale_mali_fp16(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - CHECK_STATUS(scale_checkpara_mali_fp16(inputDesc, outputDesc)); - CHECK_STATUS(scale_core_mali_fp16(handle, alpha, beta, inputDesc, input, outputDesc, output)); - return SUCCESS; -} - diff --git a/tensor_computing/src/gpu/mali/fp16/slice_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/slice_mali_fp16.cpp deleted file mode 100644 index 8bb38559..00000000 --- a/tensor_computing/src/gpu/mali/fp16/slice_mali_fp16.cpp +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
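// [Editor's note] A CPU reference for the deleted scale kernels
// ("scale_beta" / "scale_nobeta"): a per-channel affine transform
// out[c][i] = alpha[c] * in[c][i] (+ beta[c] when beta is provided). The original
// passes the alpha buffer twice and selects the "nobeta" kernel when beta is
// absent; here a null pointer plays that role. Semantics sketch only.
#include <vector>
#include <cstddef>

void scale_nchw(const std::vector<float>& in, std::vector<float>& out,
                const float* alpha, const float* beta,   // beta may be nullptr
                size_t channels, size_t hw) {
    out.resize(in.size());
    for (size_t c = 0; c < channels; ++c)
        for (size_t i = 0; i < hw; ++i) {
            float v = alpha[c] * in[c * hw + i];
            out[c * hw + i] = beta ? v + beta[c] : v;
        }
}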
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/slice_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" -#define MAX_SLICE_NUM 2 - -inline EE slice_checkpara_mali_fp16(TensorDesc inputDesc, - std::vector outputDesc) { - if(inputDesc.dt != DT_F16) return NOT_SUPPORTED; - for(auto p : outputDesc) { - if(p.dt != DT_F16) return NOT_SUPPORTED; - } - return SUCCESS; -} - -inline EE slice_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - I32 axis, - std::vector outputDesc, - std::vector* output) { - if(inputDesc.df == DF_MKT) { - U32 m, k, t; - U32 gw, gh, gc; - get_nlp_mkt_val(inputDesc, NULL, &m, &k, &t); - map_nlp_mkt_to_ncwhc4(m, k, t, &gw, &gh, &gc); - if(axis == 2) { - U32 iw_str, ih_str, iw_off, ih_off; - ih_str = input->desc.stride[0]; - iw_str = input->desc.stride[1]; - ih_off = input->desc.offset[0]; - iw_off = input->desc.offset[1]; - U32 ow_str[MAX_SLICE_NUM]; - U32 oh_str[MAX_SLICE_NUM]; - U32 ow_off[MAX_SLICE_NUM]; - U32 oh_off[MAX_SLICE_NUM]; - cl_mem outbuf[MAX_SLICE_NUM]; - U32 sliceEnd[MAX_SLICE_NUM]; - U32 sliceNum = (*output).size(); - if(sliceNum > MAX_SLICE_NUM) CHECK_STATUS(NOT_SUPPORTED); - U32 j = 0; - std::vector outputArray = *output; - for(U32 i = 0; i < sliceNum; ++i) { - oh_str[i] = ((GCLMem_t)outputArray[i])->desc.stride[0]; - ow_str[i] = ((GCLMem_t)outputArray[i])->desc.stride[1]; - oh_off[i] = ((GCLMem_t)outputArray[i])->desc.offset[0]; - ow_off[i] = ((GCLMem_t)outputArray[i])->desc.offset[1]; - outbuf[i] = ((GCLMem_t)outputArray[i])->mem; - get_nlp_mkt_val(outputDesc[i], NULL, NULL, NULL, &t); - j += t; - sliceEnd[i] = j; - } - char kernelName[128]; - sprintf(kernelName, "slice_h_%d", sliceNum); - Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelName, &kernel)); - U32 gs[3] = {gh, gw, gc}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - switch(sliceNum) { - case 2: - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iw_str, ih_off, iw_off, gs[0], gs[1], input->mem, - oh_str[0], ow_str[0], oh_off[0], ow_off[0], sliceEnd[0], outbuf[0], - oh_str[1], ow_str[1], oh_off[1], ow_off[1], sliceEnd[1], outbuf[1])); - break; - default: - return NOT_SUPPORTED; - } - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); - CHECK_STATUS(gcl_print_memory(handle, input, "slice_input")); - for(U32 i = 0; i < sliceNum; ++i) CHECK_STATUS(gcl_print_memory(handle, (GCLMem_t)(outputArray[i]), "slice_output")); -#endif - return SUCCESS; - } - return NOT_SUPPORTED; - } - return NOT_SUPPORTED; -} - - -EE slice_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - I32 axis, - std::vector outputDesc, - std::vector* output) { - CHECK_STATUS(slice_checkpara_mali_fp16(inputDesc, outputDesc)); - CHECK_STATUS(slice_core_mali_fp16(handle, inputDesc, input, axis, outputDesc, output)); - return SUCCESS; -} - diff --git a/tensor_computing/src/gpu/mali/fp16/softmax_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/softmax_mali_fp16.cpp deleted file mode 100644 index 56af18f7..00000000 --- a/tensor_computing/src/gpu/mali/fp16/softmax_mali_fp16.cpp +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/softmax_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -inline EE softmax_checkpara_mali_fp16(TensorDesc inputDesc, - TensorDesc outputDesc) { - if(inputDesc.dt != outputDesc.dt) return NOT_SUPPORTED; - if(outputDesc.dt != DT_F16) return NOT_SUPPORTED; - return SUCCESS; -} - -inline EE softmax_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - int axis, - TensorDesc outputDesc, - GCLMem_t output) { - UNUSED(axis); - UNUSED(outputDesc); - U32 iw, ih, ic, in; - if(inputDesc.df == DF_NCHW) { - tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); - } else if(inputDesc.df == DF_MKT) { - get_nlp_mkt_val(inputDesc, NULL, NULL, &ic, &ih); - iw = 1; - in = 1; - } else { - return NOT_SUPPORTED; - } - U32 iw_str, ih_str, iw_off, ih_off, ihw_str; - U32 ow_str, oh_str, ow_off, oh_off, ohw_str; - get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, &iw_off, &ih_off); - get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); - ihw_str = ih_str * iw_str; - ohw_str = oh_str * ow_str; - - cl_mem inbuf, outbuf; - inbuf = input->mem; - outbuf = output->mem; - U32 gs[2]; - U32 ls[2] = {0, 0}; - U32 dim = 2; - Kernel kernel; - char kernelname[128]; - if(input->desc.memFormat == DF_NCWHC4) { - if(axis != 1) CHECK_STATUS(NOT_SUPPORTED); - gs[0] = ih; - gs[1] = iw; - I32 icd4 = (ic + 3) >> 2; - I32 ice4 = ((ic & 3) == 0) ? 4 : (ic & 3); - sprintf(kernelname, "softmax"); - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, icd4, ice4, ih_str, ihw_str, ih_off, iw_off, - oh_str, ohw_str, oh_off, ow_off, gs[0], gs[1], inbuf, outbuf)); - } else if(input->desc.memFormat == DF_NCHW) { - I32 axisTran = (axis + 4) % 4; - if(axisTran == 1) {//on c axis - gs[0] = (iw + 3) / 4; - gs[1] = ih; - sprintf(kernelname, "softmax_nchw_c"); - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ic, iw_str, ihw_str, iw_off, ih_off, - ow_str, ohw_str, ow_off, oh_off, gs[0], gs[1], inbuf, outbuf)); - } - if(axisTran == 3) {//on w axis - gs[0] = ih; - gs[1] = ic; - I32 iwd4 = (iw + 3) >> 2; - I32 iwe4 = ((iw & 3) == 0) ? 
4 : (iw & 3); - sprintf(kernelname, "softmax_nchw_w"); - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, iwd4, iwe4, iw_str, ih_str, iw_off, ih_off, - ow_str, oh_str, ow_off, oh_off, gs[0], gs[1], inbuf, outbuf)); - } - } else { - return NOT_SUPPORTED; - } - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - CHECK_STATUS(gcl_print_memory(handle, input, "softmax_nchw_input")); - CHECK_STATUS(gcl_print_memory(handle, output, "softmax_nchw_output")); -#endif - return SUCCESS; -} - - -EE softmax_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - int axis, - TensorDesc outputDesc, - GCLMem_t output) { - - CHECK_STATUS(softmax_checkpara_mali_fp16(inputDesc, outputDesc)); - CHECK_STATUS(softmax_core_mali_fp16(handle, inputDesc, input, axis, outputDesc, output)); - return SUCCESS; -} - diff --git a/tensor_computing/src/gpu/mali/fp16/squeeze_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/squeeze_mali_fp16.cpp deleted file mode 100644 index fcf5077d..00000000 --- a/tensor_computing/src/gpu/mali/fp16/squeeze_mali_fp16.cpp +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
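// [Editor's note] The deleted softmax code packs the reduced axis in groups of four
// (icd4 = (ic + 3) >> 2) and computes the size of the final, possibly partial group
// as ice4 = ((ic & 3) == 0) ? 4 : (ic & 3), so the kernel never reads padding lanes.
// A CPU sketch of a numerically stable softmax over a length-ic axis with the same
// block/tail split; the blocking is reproduced for illustration only.
#include <cmath>
#include <algorithm>

void softmax(const float* x, float* y, int ic) {
    int icd4 = (ic + 3) >> 2;                    // number of 4-wide groups
    int ice4 = ((ic & 3) == 0) ? 4 : (ic & 3);   // valid lanes in the last group
    float mx = x[0];
    for (int b = 0; b < icd4; ++b) {
        int lanes = (b == icd4 - 1) ? ice4 : 4;  // skip padding in the tail group
        for (int l = 0; l < lanes; ++l)
            mx = std::max(mx, x[b * 4 + l]);
    }
    float sum = 0.0f;
    for (int i = 0; i < ic; ++i)
        sum += std::exp(x[i] - mx);              // subtract max for stability
    for (int i = 0; i < ic; ++i)
        y[i] = std::exp(x[i] - mx) / sum;
}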
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/squeeze_mali_fp16.h" - -inline EE squeeze_checkpara_mali_fp16(TensorDesc inputDesc, - TensorDesc outputDesc) { - if(inputDesc.dt != outputDesc.dt) return NOT_SUPPORTED; - if(outputDesc.dt != DT_F16) return NOT_SUPPORTED; - return SUCCESS; -} - -inline EE squeeze_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - UNUSED(outputDesc); - U32 iw, ih, ic, in; - tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); - U32 iw_str, ih_str, iw_off, ih_off; - ih_str = input->desc.stride[0]; - iw_str = input->desc.stride[1]; - ih_off = input->desc.offset[0]; - iw_off = input->desc.offset[1]; - U32 ow_str, oh_str, ow_off, oh_off; - oh_str = output->desc.stride[0]; - ow_str = output->desc.stride[1]; - oh_off = output->desc.offset[0]; - ow_off = output->desc.offset[1]; - - cl_mem inbuf, outbuf; - inbuf = input->mem; - outbuf = output->mem; - - U32 gs[3] = {ih, iw, (ic + 3) / 4}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, "squeeze", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, oh_off, ow_off, inbuf, outbuf)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, "squeeze"); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "squeeze")); - CHECK_STATUS(gcl_print_memory(handle, input, "squeeze_input")); - CHECK_STATUS(gcl_print_memory(handle, output, "squeeze_output")); -#endif - return SUCCESS; -} - -EE squeeze_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - - CHECK_STATUS(squeeze_checkpara_mali_fp16(inputDesc, outputDesc)); - CHECK_STATUS(squeeze_core_mali_fp16(handle, inputDesc, input, outputDesc, output)); - return SUCCESS; -} - diff --git a/tensor_computing/src/gpu/mali/fp16/transpose_mali_fp16.cpp b/tensor_computing/src/gpu/mali/fp16/transpose_mali_fp16.cpp deleted file mode 100644 index 154efb45..00000000 --- a/tensor_computing/src/gpu/mali/fp16/transpose_mali_fp16.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
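The squeeze code deleted above never works from logical shapes directly; it reads per-dimension strides and pad offsets out of the GCLMem descriptor and hands them to the kernel. Below is a sketch of the linear indexing those fields imply for the h-major NCWHC4 layout; the exact kernel-side formula is not in the diff, so this is an assumption consistent with stride[0]=h, stride[1]=w, stride[2]=c/4 as used above:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Stride/offset fields as carried by the deleted GCLMemDesc usage.
    struct MemDim {
        uint32_t h_str, w_str; // padded extents of h and w
        uint32_t h_off, w_off; // leading pad inside the buffer
    };

    // Hypothetical linear index of logical element (c, h, w) in an NCWHC4
    // buffer: vec4 channel lanes innermost, h varying faster than w.
    static size_t ncwhc4_index(const MemDim& d, uint32_t c, uint32_t h, uint32_t w) {
        uint32_t cBlock = c >> 2; // which group of 4 channels
        uint32_t cLane = c & 3;   // lane inside the vec4
        return (((size_t)cBlock * d.w_str + (w + d.w_off)) * d.h_str
                + (h + d.h_off)) * 4 + cLane;
    }

    int main() {
        MemDim d{16, 16, 1, 1};
        std::printf("index(c=5,h=0,w=0) = %zu\n", ncwhc4_index(d, 5, 0, 0));
        return 0;
    }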
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/fp16/transpose_mali_fp16.h" - -inline EE transpose_checkpara_mali_fp16(TensorDesc inputDesc, - TensorDesc outputDesc) { - if(inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) return NOT_SUPPORTED; - return SUCCESS; -} - -inline EE transpose_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output, - U32* dim) { - UNUSED(inputDesc); - U32 iw_str, ih_str, iw_off, ih_off; - U32 ow_str, oh_str, ow_off, oh_off; - if(input->desc.memFormat == DF_NCHW) { - iw_str = input->desc.stride[0]; - ih_str = input->desc.stride[1]; - iw_off = input->desc.offset[0]; - ih_off = input->desc.offset[1]; - ow_str = output->desc.stride[0]; - oh_str = output->desc.stride[1]; - ow_off = output->desc.offset[0]; - oh_off = output->desc.offset[1]; - cl_mem inbuf = input->mem; - cl_mem outbuf = output->mem; - U32 gs[3] = {0, 0, 0}; - U32 ls[3] = {0, 0, 0}; - U32 kdim = 3; - Kernel kernel; - if(dim[0] == 0 && dim[1] == 1 && dim[2] == 3 && dim[3] == 2) { - U32 ow, oh, oc, on; - tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); - gs[0] =(oh + 3) / 4; - gs[1] = ow; - gs[2] = oc * on; - CHECK_STATUS(gcl_create_kernel_binary(handle, "transpose_nchw_0132", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, oh_off, ow_off, oh, gs[0], gs[1], inbuf, outbuf)); - gcl_set_kernelVec(handle, kernel, kdim, gs, ls, "tranpose_nchw_0132"); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, input, "transpose_nchw_0132_input")); - CHECK_STATUS(gcl_run_kernel(handle, kernel, kdim, gs, ls, "transpose_nchw_0132")); - CHECK_STATUS(gcl_print_memory(handle, output, "transpose_nchw_0132_output")); -#endif - return SUCCESS; - } - return NOT_SUPPORTED; - } - return NOT_SUPPORTED; -} - - -EE transpose_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output, - U32* dim) { - CHECK_STATUS(transpose_checkpara_mali_fp16(inputDesc, outputDesc)); - CHECK_STATUS(transpose_core_mali_fp16(handle, inputDesc, input, outputDesc, output, dim)); - return SUCCESS; -} - diff --git a/tensor_computing/src/gpu/mali/fully_connected.cpp b/tensor_computing/src/gpu/mali/fully_connected.cpp deleted file mode 100644 index ee518bb6..00000000 --- a/tensor_computing/src/gpu/mali/fully_connected.cpp +++ /dev/null @@ -1,389 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/fp16/fully_connected_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" -inline EE fully_connected_checkpara_mali(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - std::vector* filter, - std::vector* bias, - TensorDesc outputDesc, - std::vector* output) { - if(nullptr == handle || nullptr == input || nullptr == filter || nullptr == output || nullptr == bias) return NULL_POINTER; - if(filter->size() != output->size() || filter->size() != bias->size() || bias->size() == 0) return NOT_MATCH; - for(U32 i = 0; i < filter->size(); ++i) { - if(nullptr == (*filter)[i] || nullptr == (*output)[i] || nullptr == (*bias)[i]) return NULL_POINTER; - } - if(inputDesc.df == DF_NCHW) { - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 oc; - CHECK_STATUS(tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw)); - CHECK_STATUS(tensorSelectGet(outputDesc, NULL, NULL, NULL, &oc, NULL, NULL)); - if(filterDesc.df != DF_NCHW) return NOT_SUPPORTED; - if(input->desc.memFormat != DF_NCWHC4) return NOT_SUPPORTED; - if((*filter)[0]->desc.memFormat != DF_NCWHN4C4) return NOT_SUPPORTED; - if((*output)[0]->desc.memFormat != DF_NCWHC4) return NOT_SUPPORTED; - if(in > 1) return NOT_SUPPORTED; - if(filter->size() > 1) return NOT_SUPPORTED; - if(fw != iw) return NOT_MATCH; - if(fh != ih) return NOT_MATCH; - if(fc != ic) return NOT_MATCH; - if(fn != oc) return NOT_MATCH; - } - if(inputDesc.df == DF_MKT) { - U32 k; - U32 fw, fh, fc, fn; - k = inputDesc.dims[1]; - CHECK_STATUS(tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw)); - if(fh != 1 || fw != 1) return NOT_MATCH; - if(k != fc) return NOT_MATCH; - } - return SUCCESS; -} -EE fully_connected_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc, - ForwardRunInfoMali_t forwardRunInfo) { - U32 fn; - tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, NULL, NULL); - if(inputDesc.df == DF_NCHW) { - DataType idt; - DataFormat idf; - U32 iw, ih, ic, in; - tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); - if(outputDesc) *outputDesc = tensor4df(idt, idf, in, fn, 1, 1); - CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw, ih, ic, 0, 0, 1, 1, fn, idt, idt, gclmemInputDesc, gclmemOutputDesc)); - return SUCCESS; - } else if(inputDesc.df == DF_MKT) { - DataType dt; - U32 m, k, t; - get_nlp_mkt_val(inputDesc, &dt, &m, &k, &t); - if(outputDesc) { - *outputDesc = inputDesc; - (*outputDesc).dims[1] = fn; - } - U32 item_wh = forwardRunInfo->best_w[0]; - U32 igw, igh, igc; - U32 ogw, ogh, ogc; - map_nlp_mkt_to_ncwhc4(m, k, (t + item_wh - 1) / item_wh * item_wh, &igw, &igh, &igc); - map_nlp_mkt_to_ncwhc4(m, fn, t, &ogw, &ogh, &ogc); - igc = igc * 4; - ogc = ogc * 4; - CHECK_STATUS(infer_gclmem_desc_ncwhc4(igw, igh, igc, 0, 0, ogw, ogh, ogc, dt, dt, gclmemInputDesc, gclmemOutputDesc)); - return SUCCESS; - } - CHECK_STATUS(NOT_SUPPORTED); - return NOT_SUPPORTED; -} - -EE fully_connected_infer_forward_algorithm_mali(GCLHandle_t handle, - 
TensorDesc inputDesc, - TensorDesc filterDesc, - std::vector outputDescs, - ForwardRunInfoMali_t forwardRunInfo) { - if(forwardRunInfo == nullptr) CHECK_STATUS(NULL_POINTER); - ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); - if(algorithm != CONVOLUTION_ALGORITHM_NULL) return SUCCESS; - DataType dt; - U32 iw, ih, ic, fw, fh, fn; - tensorSelectGet(filterDesc, &dt, NULL, &fn, NULL, &fh, &fw); - if(inputDesc.df == DF_NCHW) { - tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, &ih, &iw); - if(ih != 1 || iw != 1 || fh != 1 || fw != 1) { - U32 item_w = (64 + ih - 1) / ih; - item_w = (item_w > iw) ? iw : item_w; - forwardRunInfo->best_w[0] = item_w; - forwardRunInfo->best_c[0] = 4; - forwardRunInfo->best_k[0] = 4; - forwardRunInfo->algorithm = CONVOLUTION_ALGORITHM_DIRECT; - return SUCCESS; - } - } - - GCLHandle_t handle_tun; - CHECK_STATUS(gcl_create_handle_profiling(&handle_tun)); - handle_tun->binMapPtr = handle->binMapPtr; - U32 sliceNum = outputDescs.size(); - GCLMem_t input = gcl_create_gclmem(); - GCLMem_t tmpbuf = gcl_create_gclmem(); - std::vector filter; - std::vector bias; - std::vector output; - for(U32 i = 0; i < sliceNum; ++i) { - GCLMem_t filterTmp = gcl_create_gclmem(); - GCLMem_t biasTmp = gcl_create_gclmem(); - GCLMem_t outTmp = gcl_create_gclmem(); - filter.push_back(filterTmp); - bias.push_back(biasTmp); - output.push_back(outTmp); - } - - std::vector runInfos; - ForwardRunInfoMali runInfo; - runInfo.algorithm = (I32)CONVOLUTION_ALGORITHM_DIRECT; - std::vector inputMemDescs; - std::vector outputMemDescs; - std::vector filterMemDescs; - U32 configInfo[3][64]; - U32 configNum, bytes; - U32 maxBytes = 0; - U32 maxInputSize = 0; - U32 maxOutputSize = 0; - U32 maxFilterSize = 0; - U32 biasNum; - U32 stride[3] = {0, 0, 0}; - U32 offset[3] = {0, 0, 0}; - - if(inputDesc.df == DF_NCHW) { - configNum = 3; - if((ic & 15) != 0) configNum = 2; - if((ic & 7) != 0) configNum = 1; - for(U32 i =0; i < configNum; ++i) { - configInfo[0][i] = 1; - configInfo[1][i] = 1 << (2 + i); - configInfo[2][i] = 0; - } - } else if(inputDesc.df == DF_MKT) { - configNum = 8; - bool align8 = true; - for(U32 i = 0; i < configNum; i++) { - configInfo[0][i] = i + 1; - configInfo[1][i] = 4; - configInfo[2][i] = 4; - if(outputDescs[i].dims[1] % 8 != 0) align8 = false; - } - if(align8) { - for(U32 i = 0; i < 4; i++) { - configInfo[0][i + configNum] = i + 1; - configInfo[1][i + configNum] = 4; - configInfo[2][i + configNum] = 8; - } - configNum += 4; - } - } else {return NOT_SUPPORTED;} - - for(U32 i = 0; i < configNum; ++i) { - GCLMemDesc inputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); - GCLMemDesc outputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); - GCLMemDesc filterMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); - runInfo.best_w[0] = configInfo[0][i]; - runInfo.best_c[0] = configInfo[1][i]; - runInfo.best_k[0] = configInfo[2][i]; - if(fully_connected_infer_output_size_mali(inputDesc, filterDesc, NULL, &inputMemDesc, &outputMemDesc, &runInfo) != SUCCESS) continue; - if(fully_connected_transform_filter_bytes_mali(filterDesc, &filterMemDesc, &bytes, &runInfo) != SUCCESS) continue; - if(maxBytes < bytes) maxBytes= bytes; - if(fully_connected_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, &bytes, &runInfo) != SUCCESS) continue; - if(maxBytes < bytes) maxBytes= bytes; - if(maxInputSize < inputMemDesc.byteSize) maxInputSize = inputMemDesc.byteSize; - if(maxOutputSize < outputMemDesc.byteSize) maxOutputSize = 
outputMemDesc.byteSize; - if(maxFilterSize < filterMemDesc.byteSize) maxFilterSize = filterMemDesc.byteSize; - inputMemDescs.push_back(inputMemDesc); - outputMemDescs.push_back(outputMemDesc); - filterMemDescs.push_back(filterMemDesc); - runInfos.push_back(runInfo); - } - - if(inputDesc.df == DF_NCHW) { - biasNum = fn; - bias[0]->desc.byteSize = biasNum * bytesOf(dt); - bias[0]->desc.memType = GCL_MEM_BUF; - } - - if(inputDesc.df == DF_MKT) { - biasNum = (fn + 3) / 4; - bias[0]->desc.byteSize = biasNum * 4 * bytesOf(dt); - bias[0]->desc.memType = GCL_MEM_IMG_1D; - } - U32 algosNum = runInfos.size(); - if(algosNum == 0) CHECK_STATUS(NOT_SUPPORTED); - TensorDesc biasDesc = tensor1d(dt, fn); - inputMemDescs[0].byteSize = maxInputSize; - outputMemDescs[0].byteSize = maxOutputSize; - filterMemDescs[0].byteSize = maxFilterSize; - input->desc = inputMemDescs[0]; - output[0]->desc = outputMemDescs[0]; - filter[0]->desc = filterMemDescs[0]; - bias[0]->desc.stride[0] = biasNum; - bias[0]->desc.stride[1] = 1; - bias[0]->desc.stride[2] = 1; - bias[0]->desc.offset[0] = 0; - bias[0]->desc.offset[1] = 0; - bias[0]->desc.offset[2] = 0; - bias[0]->desc.num = biasNum; - bias[0]->desc.memFormat = DF_NHWC; - tmpbuf->desc.byteSize = maxBytes; - gcl_create_memory(handle_tun, input); - for(U32 i = 0; i < sliceNum; ++i) { - filter[i]->desc = filter[0]->desc; - bias[i]->desc = bias[0]->desc; - output[i]->desc = output[0]->desc; - filter[i]->desc.has_alloc = false; - bias[i]->desc.has_alloc = false; - output[i]->desc.has_alloc = false; - gcl_create_memory(handle_tun, filter[i]); - gcl_create_memory(handle_tun, bias[i]); - gcl_create_memory(handle_tun, output[i]); - } - if(maxBytes) gcl_create_memory(handle_tun, tmpbuf); - - U32 runKernelBe = 0; - U32 runKernelEnd = 0; - double minTime = DBL_MAX; - ForwardRunInfoMali bestRunInfo; - for(U32 i = 0; i < algosNum; i++) { - input->desc = inputMemDescs[i]; - output[0]->desc = outputMemDescs[i]; - filter[0]->desc = filterMemDescs[i]; - if(sliceNum > 1) { - U32 item_k = runInfos[i].best_k[0]; - for(U32 j = 0; j < sliceNum; j++) { - U32 fn = outputDescs[j].dims[1]; - output[j]->desc.stride[2] = (fn + 3) / 4; - filter[j]->desc.stride[2] = (fn + item_k - 1) / item_k; - biasNum = (inputDesc.df == DF_NCHW) ? 
fn : (fn + 3) / 4; - bias[j]->desc.stride[0] = biasNum; - } - } - if(fully_connected_mali(handle_tun, inputDesc, input, filterDesc, &filter, biasDesc, &bias, - maxBytes, tmpbuf, outputDescs[0], &output, &runInfos[i]) == SUCCESS) { - runKernelEnd = handle_tun->kernelVec.size(); - gcl_run_kernelVec_timing(handle_tun, runKernelBe, runKernelEnd); - runKernelBe = runKernelEnd; - if(minTime > handle_tun->t_execute) { - minTime = handle_tun->t_execute; - bestRunInfo = runInfos[i]; - } - } - } - if(minTime == DBL_MAX) CHECK_STATUS(NOT_SUPPORTED); - *forwardRunInfo = bestRunInfo; - CHECK_STATUS(gcl_finish(handle_tun)); - gcl_destroy_gclmem(input); - gcl_destroy_gclmem(tmpbuf); - for(auto p : filter) gcl_destroy_gclmem(p); - for(auto p : output) gcl_destroy_gclmem(p); - for(auto p : bias) gcl_destroy_gclmem(p); - runInfos.clear(); - inputMemDescs.clear(); - outputMemDescs.clear(); - filterMemDescs.clear(); - gcl_destroy_handle(handle_tun); - return SUCCESS; -} -EE fully_connected_transform_filter_bytes_mali(TensorDesc filterDesc, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes, - ForwardRunInfoMali_t forwardRunInfo) { - EE ret = SUCCESS; - switch(filterDesc.dt) { - case DT_F16:{ - ret = fully_connected_transform_filter_bytes_mali_fp16(filterDesc, gclmemFilterDesc, bytes, forwardRunInfo); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE fully_connected_transform_filter_mali(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - TensorDesc* fltmemDesc, - std::vector* fltmem, - ForwardRunInfoMali_t forwardRunInfo) { - EE ret = SUCCESS; - switch(filterDesc.dt) { - case DT_F16:{ - ret = fully_connected_transform_filter_mali_fp16(handle, filterDesc, filter, fltmemDesc, *fltmem, forwardRunInfo); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE fully_connected_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, - TensorDesc filterDesc, - U32* bytes, - ForwardRunInfoMali_t forwardRunInfo) { - EE ret = SUCCESS; - switch(inputDesc.dt) { - case DT_F16:{ - ret = fully_connected_infer_forward_tmp_bytes_mali_fp16(inputDesc, filterDesc, bytes, forwardRunInfo); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE fully_connected_mali(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - std::vector* filter, - TensorDesc biasDesc, - std::vector* bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - std::vector* output, - ForwardRunInfoMali_t forwardRunInfo) { - EE ret = SUCCESS; - ret = fully_connected_checkpara_mali(handle, inputDesc, input, filterDesc, filter, bias, outputDesc, output); - switch(inputDesc.dt) { - case DT_F16:{ - ret = fully_connected_mali_fp16(handle, inputDesc, input, filterDesc, *filter, biasDesc, *bias, tmpBytes, tmpBuf, outputDesc, *output, forwardRunInfo); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - diff --git a/tensor_computing/src/gpu/mali/infer_gclmem_desc_mali.h b/tensor_computing/src/gpu/mali/infer_gclmem_desc_mali.h deleted file mode 100644 index b104abb0..00000000 --- a/tensor_computing/src/gpu/mali/infer_gclmem_desc_mali.h +++ /dev/null @@ -1,451 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _INFER_GCLMEM_DESC_MALI_F16 -#define _INFER_GCLMEM_DESC_MALI_F16 - -inline EE infer_gclmem_desc_nchw_to_ncwhc4(U32 iw, U32 ih, U32 ic, U32 pw, U32 ph, U32 ow, U32 oh, U32 oc, - GCLMemDesc_t gclmemInputDesc, GCLMemDesc_t gclmemOutputDesc){ - if(gclmemInputDesc == nullptr || gclmemOutputDesc == nullptr) return NULL_POINTER; - U32 s0, s1, s2; - s0 = ow; - s1 = oh; - s2 = (oc + 3) / 4; - - U32 num, byteSize; - num = s0 * s1 * s2 * 4; - byteSize = num * bytesOf(DT_F16); - gclmemOutputDesc->stride[0] = s0; - gclmemOutputDesc->stride[1] = s1; - gclmemOutputDesc->stride[2] = s2; - gclmemOutputDesc->offset[0] = 0; - gclmemOutputDesc->offset[1] = 0; - gclmemOutputDesc->offset[2] = 0; - gclmemOutputDesc->num = num; - gclmemOutputDesc->byteSize = byteSize; - - U32 pw_org, ph_org; - U32 s0_org, s1_org, s2_org; - U32 byteSize_org; - - s0_org = gclmemInputDesc->stride[0]; - s1_org = gclmemInputDesc->stride[1]; - s2_org = gclmemInputDesc->stride[2]; - pw_org = gclmemInputDesc->offset[0]; - ph_org = gclmemInputDesc->offset[1]; - byteSize_org = gclmemInputDesc->byteSize; - if(byteSize_org != 0 && gclmemInputDesc->memFormat != DF_NCWHC4) return NOT_SUPPORTED; - - pw = (pw > pw_org) ? pw : pw_org; - ph = (ph > ph_org) ? ph : ph_org; - - s0 = iw + (pw << 1); - s1 = ih + (ph << 1); - s2 = (ic + 3) >> 2; - s0 = (s0 > s0_org) ? s0 : s0_org; - s1 = (s1 > s1_org) ? s1 : s1_org; - s2 = (s2 > s2_org) ? s2 : s2_org; - - num = s0 * s1 * s2 * 4; - byteSize = num * bytesOf(DT_F16); - byteSize = (byteSize > byteSize_org) ? 
byteSize : byteSize_org; - - gclmemInputDesc->stride[0] = s0; - gclmemInputDesc->stride[1] = s1; - gclmemInputDesc->stride[2] = s2; - gclmemInputDesc->offset[0] = pw; - gclmemInputDesc->offset[1] = ph; - gclmemInputDesc->offset[2] = 0; - gclmemInputDesc->num = num; - gclmemInputDesc->byteSize = byteSize; - - gclmemInputDesc->memType = GCL_MEM_BUF; - gclmemInputDesc->memFormat = DF_NCWHC4; - gclmemInputDesc->flags = CL_MEM_READ_WRITE; - gclmemInputDesc->host_ptr = NULL; - gclmemOutputDesc->memType = GCL_MEM_BUF; - gclmemOutputDesc->memFormat = DF_NCWHC4; - gclmemOutputDesc->flags = CL_MEM_READ_WRITE; - gclmemOutputDesc->host_ptr = NULL; - return SUCCESS; -} - -inline EE infer_gclmem_desc_ncwhc4_to_ncwhc4(U32 iw, U32 ih, U32 ic, U32 pw, U32 ph, U32 ow, U32 oh, U32 oc, - GCLMemDesc_t gclmemInputDesc, GCLMemDesc_t gclmemOutputDesc){ - if(gclmemInputDesc == nullptr || gclmemOutputDesc == nullptr) return NULL_POINTER; - U32 s0, s1, s2; - s0 = ow; - s1 = oh; - s2 = (oc + 3) / 4; - - U32 num, byteSize; - num = s0 * s1 * s2 * 4; - byteSize = num * bytesOf(DT_F16); - gclmemOutputDesc->stride[0] = s0; - gclmemOutputDesc->stride[1] = s1; - gclmemOutputDesc->stride[2] = s2; - gclmemOutputDesc->offset[0] = 0; - gclmemOutputDesc->offset[1] = 0; - gclmemOutputDesc->offset[2] = 0; - gclmemOutputDesc->num = num; - gclmemOutputDesc->byteSize = byteSize; - - U32 pw_org, ph_org; - U32 s0_org, s1_org, s2_org; - U32 byteSize_org; - - s0_org = gclmemInputDesc->stride[0]; - s1_org = gclmemInputDesc->stride[1]; - s2_org = gclmemInputDesc->stride[2]; - pw_org = gclmemInputDesc->offset[0]; - ph_org = gclmemInputDesc->offset[1]; - byteSize_org = gclmemInputDesc->byteSize; - if(byteSize_org != 0 && gclmemInputDesc->memFormat != DF_NCWHC4) return NOT_SUPPORTED; - - pw = (pw > pw_org) ? pw : pw_org; - ph = (ph > ph_org) ? ph : ph_org; - - s0 = iw + (pw << 1); - s1 = ih + (ph << 1); - s2 = (ic + 3) >> 2; - s0 = (s0 > s0_org) ? s0 : s0_org; - s1 = (s1 > s1_org) ? s1 : s1_org; - s2 = (s2 > s2_org) ? s2 : s2_org; - - num = s0 * s1 * s2 * 4; - byteSize = num * bytesOf(DT_F16); - byteSize = (byteSize > byteSize_org) ? 
byteSize : byteSize_org; - - gclmemInputDesc->stride[0] = s0; - gclmemInputDesc->stride[1] = s1; - gclmemInputDesc->stride[2] = s2; - gclmemInputDesc->offset[0] = pw; - gclmemInputDesc->offset[1] = ph; - gclmemInputDesc->offset[2] = 0; - gclmemInputDesc->num = num; - gclmemInputDesc->byteSize = byteSize; - - gclmemInputDesc->memType = GCL_MEM_BUF; - gclmemInputDesc->memFormat = DF_NCWHC4; - gclmemInputDesc->flags = CL_MEM_READ_WRITE; - gclmemInputDesc->host_ptr = NULL; - gclmemOutputDesc->memType = GCL_MEM_BUF; - gclmemOutputDesc->memFormat = DF_NCWHC4; - gclmemOutputDesc->flags = CL_MEM_READ_WRITE; - gclmemOutputDesc->host_ptr = NULL; - return SUCCESS; -} -inline EE infer_gclmem_desc_nchwc3_to_nchw(U32 iw, U32 ih, U32 ic, U32 pw, U32 ph, U32 ow, U32 oh, U32 oc, - GCLMemDesc_t gclmemInputDesc, GCLMemDesc_t gclmemOutputDesc){ - if(gclmemInputDesc == nullptr || gclmemOutputDesc == nullptr) return NULL_POINTER; - U32 s0, s1, s2; - s0 = ow; - s1 = oh; - s2 = oc; - - U32 num, byteSize; - num = s0 * s1 * s2; - byteSize = num * bytesOf(DT_F16); - gclmemOutputDesc->stride[0] = s0; - gclmemOutputDesc->stride[1] = s1; - gclmemOutputDesc->stride[2] = s2; - gclmemOutputDesc->offset[0] = 0; - gclmemOutputDesc->offset[1] = 0; - gclmemOutputDesc->offset[2] = 0; - gclmemOutputDesc->num = num; - gclmemOutputDesc->byteSize = byteSize; - - U32 pw_org, ph_org; - U32 s0_org, s1_org, s2_org; - U32 byteSize_org; - - s0_org = gclmemInputDesc->stride[0]; - s1_org = gclmemInputDesc->stride[1]; - s2_org = gclmemInputDesc->stride[2]; - pw_org = gclmemInputDesc->offset[0]; - ph_org = gclmemInputDesc->offset[1]; - byteSize_org = gclmemInputDesc->byteSize; - if(byteSize_org != 0 && gclmemInputDesc->memFormat != DF_NCHWC3) return NOT_SUPPORTED; - - pw = (pw > pw_org) ? pw : pw_org; - ph = (ph > ph_org) ? ph : ph_org; - - s0 = iw + (pw << 1); - s1 = ih + (ph << 1); - s2 = (ic + 2) / 3; - s0 = (s0 > s0_org) ? s0 : s0_org; - s1 = (s1 > s1_org) ? s1 : s1_org; - s2 = (s2 > s2_org) ? s2 : s2_org; - - num = s0 * s1 * s2 * 3; - byteSize = num * bytesOf(DT_F16); - byteSize = (byteSize > byteSize_org) ? 
byteSize : byteSize_org; - - gclmemInputDesc->stride[0] = s0; - gclmemInputDesc->stride[1] = s1; - gclmemInputDesc->stride[2] = s2; - gclmemInputDesc->offset[0] = pw; - gclmemInputDesc->offset[1] = ph; - gclmemInputDesc->offset[2] = 0; - gclmemInputDesc->num = num; - gclmemInputDesc->byteSize = byteSize; - - gclmemInputDesc->memType = GCL_MEM_BUF; - gclmemInputDesc->memFormat = DF_NCHWC3; - gclmemInputDesc->flags = CL_MEM_READ_WRITE; - gclmemInputDesc->host_ptr = NULL; - gclmemOutputDesc->memType = GCL_MEM_BUF; - gclmemOutputDesc->memFormat = DF_NCHW; - gclmemOutputDesc->flags = CL_MEM_READ_WRITE; - gclmemOutputDesc->host_ptr = NULL; - return SUCCESS; -} - -inline EE infer_gclmem_desc_ncwhc4(U32 iw, U32 ih, U32 ic, U32 pw, U32 ph, U32 ow, U32 oh, U32 oc, DataType idt, DataType odt, - GCLMemDesc_t gclmemInputDesc, GCLMemDesc_t gclmemOutputDesc){ - U32 s0, s1, s2; - U32 num, byteSize; - U32 pw_org, ph_org; - U32 s0_org, s1_org, s2_org; - U32 byteSize_org; - if(gclmemOutputDesc) { - s0 = oh; - s1 = ow; - s2 = (oc + 3) / 4; - num = s0 * s1 * s2 * 4; - byteSize = num * bytesOf(odt); - gclmemOutputDesc->stride[0] = s0; - gclmemOutputDesc->stride[1] = s1; - gclmemOutputDesc->stride[2] = s2; - gclmemOutputDesc->offset[0] = 0; - gclmemOutputDesc->offset[1] = 0; - gclmemOutputDesc->offset[2] = 0; - gclmemOutputDesc->num = num; - gclmemOutputDesc->byteSize = byteSize; - gclmemOutputDesc->memType = GCL_MEM_BUF; - gclmemOutputDesc->memFormat = DF_NCWHC4; - gclmemOutputDesc->flags = CL_MEM_READ_WRITE; - gclmemOutputDesc->host_ptr = NULL; - } - - if(gclmemInputDesc) { - s0_org = gclmemInputDesc->stride[0]; - s1_org = gclmemInputDesc->stride[1]; - s2_org = gclmemInputDesc->stride[2]; - ph_org = gclmemInputDesc->offset[0]; - pw_org = gclmemInputDesc->offset[1]; - byteSize_org = gclmemInputDesc->byteSize; - if(byteSize_org != 0 && gclmemInputDesc->memFormat != DF_NCWHC4) return NOT_SUPPORTED; - - ph = (ph > ph_org) ? ph : ph_org; - pw = (pw > pw_org) ? pw : pw_org; - - s0 = ih + (ph << 1); - s1 = iw + (pw << 1); - s2 = (ic + 3) / 4; - s0 = (s0 > s0_org) ? s0 : s0_org; - s1 = (s1 > s1_org) ? s1 : s1_org; - s2 = (s2 > s2_org) ? s2 : s2_org; - num = s0 * s1 * s2 * 4; - byteSize = num * bytesOf(idt); - byteSize = (byteSize > byteSize_org) ? 
byteSize : byteSize_org; - - gclmemInputDesc->stride[0] = s0; - gclmemInputDesc->stride[1] = s1; - gclmemInputDesc->stride[2] = s2; - gclmemInputDesc->offset[0] = ph; - gclmemInputDesc->offset[1] = pw; - gclmemInputDesc->offset[2] = 0; - gclmemInputDesc->num = num; - gclmemInputDesc->byteSize = byteSize; - gclmemInputDesc->memType = GCL_MEM_BUF; - gclmemInputDesc->memFormat = DF_NCWHC4; - gclmemInputDesc->flags = CL_MEM_READ_WRITE; - gclmemInputDesc->host_ptr = NULL; - } - return SUCCESS; -} - -inline EE infer_gclmem_desc_nhwc(U32 iw, U32 ih, U32 ic, U32 pc, U32 pw, U32 ow, U32 oh, U32 oc, DataType idt, DataType odt, - GCLMemDesc_t gclmemInputDesc, GCLMemDesc_t gclmemOutputDesc){ - U32 s0, s1, s2; - U32 num, byteSize; - U32 pc_org, pw_org; - U32 s0_org, s1_org, s2_org; - U32 byteSize_org; - - if(gclmemOutputDesc) { - s0 = oc; - s1 = ow; - s2 = oh; - num = s0 * s1 * s2; - byteSize = num * bytesOf(odt); - gclmemOutputDesc->stride[0] = s0; - gclmemOutputDesc->stride[1] = s1; - gclmemOutputDesc->stride[2] = s2; - gclmemOutputDesc->offset[0] = 0; - gclmemOutputDesc->offset[1] = 0; - gclmemOutputDesc->offset[2] = 0; - gclmemOutputDesc->num = num; - gclmemOutputDesc->byteSize = byteSize; - gclmemOutputDesc->memType = GCL_MEM_BUF; - gclmemOutputDesc->memFormat = DF_NHWC; - gclmemOutputDesc->flags = CL_MEM_READ_WRITE; - gclmemOutputDesc->host_ptr = NULL; - } - - if(gclmemInputDesc) { - s0_org = gclmemInputDesc->stride[0]; - s1_org = gclmemInputDesc->stride[1]; - s2_org = gclmemInputDesc->stride[2]; - pc_org = gclmemInputDesc->offset[0]; - pw_org = gclmemInputDesc->offset[1]; - byteSize_org = gclmemInputDesc->byteSize; - if(byteSize_org != 0 && gclmemInputDesc->memFormat != DF_NHWC) return NOT_SUPPORTED; - - pc = (pc > pc_org) ? pc : pc_org; - pw = (pw > pw_org) ? pw : pw_org; - s0 = ic + (pc << 1); - s1 = iw + (pw << 1); - s2 = ih; - s0 = (s0 > s0_org) ? s0 : s0_org; - s1 = (s1 > s1_org) ? s1 : s1_org; - s2 = (s2 > s2_org) ? s2 : s2_org; - - num = s0 * s1 * s2; - byteSize = num * bytesOf(idt); - byteSize = (byteSize > byteSize_org) ? 
byteSize : byteSize_org; - gclmemInputDesc->stride[0] = s0; - gclmemInputDesc->stride[1] = s1; - gclmemInputDesc->stride[2] = s2; - gclmemInputDesc->offset[0] = pc; - gclmemInputDesc->offset[1] = pw; - gclmemInputDesc->offset[2] = 0; - gclmemInputDesc->num = num; - gclmemInputDesc->byteSize = byteSize; - gclmemInputDesc->memType = GCL_MEM_BUF; - gclmemInputDesc->memFormat = DF_NHWC; - gclmemInputDesc->flags = CL_MEM_READ_WRITE; - gclmemInputDesc->host_ptr = NULL; - } - return SUCCESS; -} - -inline EE infer_gclmem_desc_nchw(U32 iw, U32 ih, U32 ic, U32 pw, U32 ph, U32 ow, U32 oh, U32 oc, DataType idt, DataType odt, - GCLMemDesc_t gclmemInputDesc, GCLMemDesc_t gclmemOutputDesc){ - U32 s0, s1, s2; - U32 num, byteSize; - U32 pw_org, ph_org; - U32 s0_org, s1_org, s2_org; - U32 byteSize_org; - - if(gclmemOutputDesc) { - s0 = ow; - s1 = oh; - s2 = oc; - num = s0 * s1 * s2; - byteSize = num * bytesOf(odt); - gclmemOutputDesc->stride[0] = s0; - gclmemOutputDesc->stride[1] = s1; - gclmemOutputDesc->stride[2] = s2; - gclmemOutputDesc->offset[0] = 0; - gclmemOutputDesc->offset[1] = 0; - gclmemOutputDesc->offset[2] = 0; - gclmemOutputDesc->num = num; - gclmemOutputDesc->byteSize = byteSize; - gclmemOutputDesc->memType = GCL_MEM_BUF; - gclmemOutputDesc->memFormat = DF_NCHW; - gclmemOutputDesc->flags = CL_MEM_READ_WRITE; - gclmemOutputDesc->host_ptr = NULL; - } - - if(gclmemInputDesc) { - s0_org = gclmemInputDesc->stride[0]; - s1_org = gclmemInputDesc->stride[1]; - s2_org = gclmemInputDesc->stride[2]; - pw_org = gclmemInputDesc->offset[0]; - ph_org = gclmemInputDesc->offset[1]; - byteSize_org = gclmemInputDesc->byteSize; - if(byteSize_org != 0 && gclmemInputDesc->memFormat != DF_NCHW) return NOT_SUPPORTED; - - pw = (pw > pw_org) ? pw : pw_org; - ph = (ph > ph_org) ? ph : ph_org; - s0 = iw + (pw << 1); - s1 = ih + (ph << 1); - s2 = ic; - s0 = (s0 > s0_org) ? s0 : s0_org; - s1 = (s1 > s1_org) ? s1 : s1_org; - s2 = (s2 > s2_org) ? s2 : s2_org; - - num = s0 * s1 * s2; - byteSize = num * bytesOf(idt); - byteSize = (byteSize > byteSize_org) ? 
byteSize : byteSize_org; - gclmemInputDesc->stride[0] = s0; - gclmemInputDesc->stride[1] = s1; - gclmemInputDesc->stride[2] = s2; - gclmemInputDesc->offset[0] = pw; - gclmemInputDesc->offset[1] = ph; - gclmemInputDesc->offset[2] = 0; - gclmemInputDesc->num = num; - gclmemInputDesc->byteSize = byteSize; - gclmemInputDesc->memType = GCL_MEM_BUF; - gclmemInputDesc->memFormat = DF_NCHW; - gclmemInputDesc->flags = CL_MEM_READ_WRITE; - gclmemInputDesc->host_ptr = NULL; - } - return SUCCESS; -} - -inline void get_nlp_mkt_val(TensorDesc desc, DataType* dt, U32* m, U32* k, U32* t) { - if(dt) *dt = desc.dt; - if(desc.df == DF_MTK) { - if(m) *m = desc.dims[2]; - if(t) *t = desc.dims[1]; - if(k) *k = desc.dims[0]; - } else if(desc.df == DF_MKT) { - if(m) *m = desc.dims[2]; - if(k) *k = desc.dims[1]; - if(t) *t = desc.dims[0]; - } else { - CHECK_STATUS(NOT_MATCH); - } - -} - -inline void map_nlp_mkt_to_ncwhc4(U32 m, U32 k, U32 t, U32* gw, U32* gh, U32* gc) { - if(gw) *gw = 1; - if(gh) *gh = t; - if(gc) *gc = (k + 3) / 4 * m; -} - -inline void get_gclmem_dim(GCLMemDesc desc, U32* w_str, U32* h_str, U32* c_str, U32* w_off, U32* h_off) { - if(desc.memFormat == DF_NCHW) { - if(w_str) *w_str = desc.stride[0]; - if(h_str) *h_str = desc.stride[1]; - if(c_str) *c_str = desc.stride[2]; - if(w_off) *w_off = desc.offset[0]; - if(h_off) *h_off = desc.offset[1]; - } - else if(desc.memFormat == DF_NCWHC4) { - if(w_str) *w_str = desc.stride[1]; - if(h_str) *h_str = desc.stride[0]; - if(c_str) *c_str = desc.stride[2]; - if(w_off) *w_off = desc.offset[1]; - if(h_off) *h_off = desc.offset[0]; - } else { - CHECK_STATUS(NOT_SUPPORTED); - } -} - -#endif diff --git a/tensor_computing/src/gpu/mali/matmul.cpp b/tensor_computing/src/gpu/mali/matmul.cpp deleted file mode 100644 index 9ae7bb7e..00000000 --- a/tensor_computing/src/gpu/mali/matmul.cpp +++ /dev/null @@ -1,293 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
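Every infer_gclmem_desc_* helper in the header deleted above applies the same merge rule to the input descriptor: pads, strides, and byte size are only ever grown, so one buffer reused by several operators ends up sized for the most demanding of them. The rule restated compactly (BufDesc is a simplified stand-in for GCLMemDesc):

    #include <algorithm>
    #include <cstdint>

    struct BufDesc {
        uint32_t stride[3] = {0, 0, 0};
        uint32_t offset[2] = {0, 0};
        uint32_t byteSize = 0;
    };

    // Grow desc so it can hold an (h + 2*ph) x (w + 2*pw) x ceil(c/4) NCWHC4
    // view, never shrinking anything already recorded (the max-merge above).
    static void grow_ncwhc4(BufDesc& d, uint32_t w, uint32_t h, uint32_t c,
                            uint32_t pw, uint32_t ph, uint32_t bytesPerElem) {
        uint32_t ph2 = std::max(ph, d.offset[0]);         // offset[0] pads h
        uint32_t pw2 = std::max(pw, d.offset[1]);         // offset[1] pads w
        uint32_t s0 = std::max(h + 2 * ph2, d.stride[0]); // h is innermost
        uint32_t s1 = std::max(w + 2 * pw2, d.stride[1]);
        uint32_t s2 = std::max((c + 3) / 4, d.stride[2]);
        d.offset[0] = ph2;
        d.offset[1] = pw2;
        d.stride[0] = s0;
        d.stride[1] = s1;
        d.stride[2] = s2;
        d.byteSize = std::max(s0 * s1 * s2 * 4 * bytesPerElem, d.byteSize);
    }

    int main() {
        BufDesc d;
        grow_ncwhc4(d, 28, 28, 3, 1, 1, 2); // first consumer, fp16
        grow_ncwhc4(d, 28, 28, 3, 2, 2, 2); // larger pad only grows it
        return 0;
    }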
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/fp16/matmul_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" -inline EE matmul_checkpara_mali(GCLHandle_t handle, - TensorDesc matrixADesc, - bool transposeA, - const GCLMem_t matrixA, - TensorDesc matrixBDesc, - bool transposeB, - const GCLMem_t matrixB, - TensorDesc matrixCDesc, - GCLMem_t matrixC) { - if(nullptr == handle || nullptr == matrixA || nullptr == matrixB || nullptr == matrixC) return NULL_POINTER; - if(matrixADesc.df != matrixBDesc.df || matrixADesc.df != matrixCDesc.df || matrixADesc.df != DF_NCHW) return NOT_SUPPORTED; - if(matrixA->desc.memFormat != DF_NCHW || matrixB->desc.memFormat != DF_NCHW || matrixC->desc.memFormat != DF_NCHW) return NOT_SUPPORTED; - if(transposeA && transposeB) return NOT_SUPPORTED; - if(!transposeA && !transposeB) return NOT_SUPPORTED; - if(matrixA->desc.stride[2] != matrixB->desc.stride[2]) return NOT_MATCH; - if(matrixA->desc.offset[0] != 0 || matrixA->desc.offset[1] != 0) return NOT_SUPPORTED; - if(matrixB->desc.offset[0] != 0 || matrixB->desc.offset[1] != 0) return NOT_SUPPORTED; - if(matrixC->desc.offset[0] != 0 || matrixC->desc.offset[1] != 0) return NOT_SUPPORTED; - return SUCCESS; -} - -EE matmul_infer_output_size_mali(TensorDesc matrixADesc, - bool transposeA, - TensorDesc matrixBDesc, - bool transposeB, - TensorDesc* matrixCDesc, - GCLMemDesc_t gclmemMatrixADesc, - GCLMemDesc_t gclmemMatrixBDesc, - GCLMemDesc_t gclmemMatrixCDesc, - ForwardRunInfoMali_t forwardRunInfo) { - U32 adims = matrixADesc.nDims; - U32 bdims = matrixBDesc.nDims; - DataType adt = matrixADesc.dt; - DataType bdt = matrixBDesc.dt; - if(adims < 2 || bdims < 2) CHECK_STATUS(NOT_MATCH); - if(adt != bdt) CHECK_STATUS(NOT_MATCH); - U32 ac = (adims > 2) ? matrixADesc.dims[2] : 1; - U32 ah = matrixADesc.dims[1]; - U32 aw = matrixADesc.dims[0]; - U32 bc = (bdims > 2) ? 
matrixBDesc.dims[2] : 1; - U32 bh = matrixBDesc.dims[1]; - U32 bw = matrixBDesc.dims[0]; - if(ac != bc) CHECK_STATUS(NOT_SUPPORTED); - if(transposeA && transposeB) CHECK_STATUS(NOT_SUPPORTED); - if(!transposeA && !transposeB) CHECK_STATUS(NOT_SUPPORTED); - if(transposeA && !transposeB) { - /*TN*/ - if(ah != bh) CHECK_STATUS(NOT_SUPPORTED); - if(matrixCDesc) { - *matrixCDesc = matrixADesc; - (*matrixCDesc).dims[0] = bw; - (*matrixCDesc).dims[1] = aw; - } - U32 item_w = forwardRunInfo->best_w[0]; - U32 item_k = forwardRunInfo->best_k[0]; - U32 aw_align = (aw + item_k - 1) / item_k * item_k; - U32 bw_align = (bw + item_w - 1) / item_w * item_w; - CHECK_STATUS(infer_gclmem_desc_nchw(aw_align, ah, ac, 0, 0, bw_align, aw_align, ac, adt, adt, gclmemMatrixADesc, gclmemMatrixCDesc)); - CHECK_STATUS(infer_gclmem_desc_nchw(bw_align, bh, bc, 0, 0, 0, 0, 0, adt, adt, gclmemMatrixBDesc, NULL)); - return SUCCESS; - } - if(!transposeA && transposeB) { - /*NT*/ - if(aw != bw) CHECK_STATUS(NOT_SUPPORTED); - if(matrixCDesc) { - *matrixCDesc = matrixADesc; - (*matrixCDesc).dims[0] = bh; - (*matrixCDesc).dims[1] = ah; - } - U32 item_w = forwardRunInfo->best_w[0]; - U32 item_c = forwardRunInfo->best_c[0]; - U32 item_k = forwardRunInfo->best_k[0]; - U32 ah_align = (ah + item_k - 1) / item_k * item_k; - U32 bh_align = (bh + item_w - 1) / item_w * item_w; - U32 aw_align = (aw + item_c - 1) / item_c * item_c; - CHECK_STATUS(infer_gclmem_desc_nchw(aw_align, ah_align, ac, 0, 0, bh_align, ah_align, ac, adt, adt, gclmemMatrixADesc, gclmemMatrixCDesc)); - CHECK_STATUS(infer_gclmem_desc_nchw(aw_align, bh_align, bc, 0, 0, 0, 0, 0, adt, adt, gclmemMatrixBDesc, NULL)); - return SUCCESS; - } - return NOT_SUPPORTED; -} - -EE matmul_infer_forward_algorithm_mali(GCLHandle_t handle, - TensorDesc matrixADesc, - bool transposeA, - TensorDesc matrixBDesc, - bool transposeB, - TensorDesc matrixCDesc, - ForwardRunInfoMali_t forwardRunInfo) { - if(forwardRunInfo == nullptr) CHECK_STATUS(NULL_POINTER); - ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); - if(algorithm != CONVOLUTION_ALGORITHM_NULL) return SUCCESS; - GCLHandle_t handle_tun; - CHECK_STATUS(gcl_create_handle_profiling(&handle_tun)); - handle_tun->binMapPtr = handle->binMapPtr; - GCLMem_t matrixA = gcl_create_gclmem(); - GCLMem_t matrixB = gcl_create_gclmem(); - GCLMem_t matrixC = gcl_create_gclmem(); - GCLMem_t tmpbuf = gcl_create_gclmem(); - std::vector runInfos; - ForwardRunInfoMali runInfo; - runInfo.algorithm = (I32)CONVOLUTION_ALGORITHM_GEMM; - std::vector matrixAMemDescs; - std::vector matrixBMemDescs; - std::vector matrixCMemDescs; - U32 configInfo[3][192]; - U32 configNum = 0; - U32 bytes; - U32 maxBytes = 0; - U32 maxASize = 0; - U32 maxBSize = 0; - U32 maxCSize = 0; - U32 stride[3] = {0, 0, 0}; - U32 offset[3] = {0, 0, 0}; - for(U32 i = 1; i <= 8; ++i) { - for(U32 j = 1; j <= 8; ++j) { - if(i * j <= 2) continue; - configInfo[0][configNum] = j;//w - configInfo[1][configNum] = 1;//c - configInfo[2][configNum] = i;//k - configNum++; - } - } - - if(!transposeA && transposeB) { - for(U32 i = 1; i <= 8; ++i) { - for(U32 j = 1; j <= 8; ++j) { - if(i * j <= 2) continue; - if(i == 6 && j > 7) continue; - if(i == 7 && j > 6) continue; - if(i == 8 && j > 5) continue; - configInfo[0][configNum] = j;//w - configInfo[1][configNum] = 2;//c - configInfo[2][configNum] = i;//k - configNum++; - } - } - - for(U32 i = 1; i <= 8; ++i) { - for(U32 j = 1; j <= 8; ++j) { - if(i * j <= 2) continue; - if(i == 5 && j > 6) continue; - if(i == 
6 && j > 5) continue; - if(i == 7 && j > 4) continue; - if(i == 8 && j > 3) continue; - configInfo[0][configNum] = j;//w - configInfo[1][configNum] = 4;//c - configInfo[2][configNum] = i;//k - configNum++; - } - } - } - - for(U32 i = 0; i < configNum; ++i) { - GCLMemDesc matrixAMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); - GCLMemDesc matrixBMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); - GCLMemDesc matrixCMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); - runInfo.best_w[0] = configInfo[0][i]; - runInfo.best_c[0] = configInfo[1][i]; - runInfo.best_k[0] = configInfo[2][i]; - if(matmul_infer_output_size_mali(matrixADesc, transposeA, matrixBDesc, transposeB, NULL, &matrixAMemDesc, &matrixBMemDesc, &matrixCMemDesc, &runInfo) != SUCCESS) continue; - if(matmul_infer_forward_tmp_bytes_mali(matrixADesc, transposeA, matrixBDesc, transposeB, &bytes, &runInfo) != SUCCESS) continue; - if(maxBytes < bytes) maxBytes= bytes; - if(maxASize < matrixAMemDesc.byteSize) maxASize = matrixAMemDesc.byteSize; - if(maxBSize < matrixBMemDesc.byteSize) maxBSize = matrixBMemDesc.byteSize; - if(maxCSize < matrixCMemDesc.byteSize) maxCSize = matrixCMemDesc.byteSize; - matrixAMemDescs.push_back(matrixAMemDesc); - matrixBMemDescs.push_back(matrixBMemDesc); - matrixCMemDescs.push_back(matrixCMemDesc); - runInfos.push_back(runInfo); - } - U32 algosNum = runInfos.size(); - if(algosNum == 0) CHECK_STATUS(NOT_SUPPORTED); - matrixAMemDescs[0].byteSize = maxASize; - matrixBMemDescs[0].byteSize = maxBSize; - matrixCMemDescs[0].byteSize = maxCSize; - matrixA->desc = matrixAMemDescs[0]; - matrixB->desc = matrixBMemDescs[0]; - matrixC->desc = matrixCMemDescs[0]; - tmpbuf->desc.byteSize = maxBytes; - gcl_create_memory(handle_tun, matrixA); - gcl_create_memory(handle_tun, matrixB); - gcl_create_memory(handle_tun, matrixC); - if(maxBytes) gcl_create_memory(handle_tun, tmpbuf); - - U32 runKernelBe = 0; - U32 runKernelEnd = 0; - double minTime = DBL_MAX; - ForwardRunInfoMali bestRunInfo; - for(U32 i = 0; i < algosNum; i++) { - matrixA->desc = matrixAMemDescs[i]; - matrixB->desc = matrixBMemDescs[i]; - matrixC->desc = matrixCMemDescs[i]; - if(matmul_mali(handle_tun, matrixADesc, transposeA, matrixA, matrixBDesc, transposeB, matrixB, tmpbuf, - matrixCDesc, matrixC, &runInfos[i]) == SUCCESS) { - runKernelEnd = handle_tun->kernelVec.size(); - gcl_run_kernelVec_timing(handle_tun, runKernelBe, runKernelEnd); - runKernelBe = runKernelEnd; - if(minTime > handle_tun->t_execute) { - minTime = handle_tun->t_execute; - bestRunInfo = runInfos[i]; - } - } - } - if(minTime == DBL_MAX) CHECK_STATUS(NOT_SUPPORTED); - *forwardRunInfo = bestRunInfo; - CHECK_STATUS(gcl_finish(handle_tun)); - gcl_destroy_gclmem(matrixA); - gcl_destroy_gclmem(matrixB); - gcl_destroy_gclmem(matrixC); - gcl_destroy_gclmem(tmpbuf); - runInfos.clear(); - matrixAMemDescs.clear(); - matrixBMemDescs.clear(); - matrixCMemDescs.clear(); - gcl_destroy_handle(handle_tun); - return SUCCESS; -} - - -EE matmul_infer_forward_tmp_bytes_mali(TensorDesc matrixADesc, - bool transposeA, - TensorDesc matrixBDesc, - bool transposeB, - U32* bytes, - ForwardRunInfoMali_t forwardRunInfo) { - EE ret = SUCCESS; - switch(matrixADesc.dt) { - case DT_F16:{ - ret = matmul_infer_forward_tmp_bytes_mali_fp16(matrixADesc, transposeA, matrixBDesc, transposeB, bytes, forwardRunInfo); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE matmul_mali(GCLHandle_t handle, - TensorDesc matrixADesc, - 
bool transposeA, - const GCLMem_t matrixA, - TensorDesc matrixBDesc, - bool transposeB, - const GCLMem_t matrixB, - GCLMem_t tmp, - TensorDesc matrixCDesc, - GCLMem_t matrixC, - ForwardRunInfoMali_t forwardRunInfo) { - EE ret = SUCCESS; - ret = matmul_checkpara_mali(handle, matrixADesc, transposeA, matrixA, matrixBDesc, transposeB, matrixB, matrixCDesc, matrixC); - switch(matrixADesc.dt) { - case DT_F16:{ - ret = matmul_mali_fp16(handle, matrixADesc, transposeA, matrixA, matrixBDesc, transposeB, matrixB, tmp, matrixCDesc, matrixC, forwardRunInfo); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - diff --git a/tensor_computing/src/gpu/mali/multiply.cpp b/tensor_computing/src/gpu/mali/multiply.cpp deleted file mode 100644 index f4a7eefc..00000000 --- a/tensor_computing/src/gpu/mali/multiply.cpp +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
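matmul_infer_forward_algorithm_mali above selects its (w, c, k) register-tiling configuration by exhaustive profiling: enumerate the candidates, run each once on a timing handle, keep the fastest. With the GCL plumbing removed, the selection reduces to the loop below; timeConfig is a toy stand-in for the real kernel run:

    #include <cfloat>
    #include <cstdio>
    #include <vector>

    struct Config { unsigned w, c, k; };

    // Placeholder for "run the kernel with this config and measure it";
    // the deleted code times an OpenCL kernelVec on a profiling handle.
    static double timeConfig(const Config& c) {
        return 1.0 / (c.w * c.k) + 0.01 * c.c; // toy cost model
    }

    int main() {
        std::vector<Config> candidates;
        for (unsigned k = 1; k <= 8; ++k)
            for (unsigned w = 1; w <= 8; ++w)
                if (k * w > 2)                 // same pruning as the diff
                    candidates.push_back({w, 1, k});

        double best = DBL_MAX;
        Config bestCfg{};
        for (const Config& c : candidates) {
            double t = timeConfig(c);
            if (t < best) { best = t; bestCfg = c; }
        }
        std::printf("best w=%u c=%u k=%u\n", bestCfg.w, bestCfg.c, bestCfg.k);
        return 0;
    }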
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/fp16/multiply_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -EE multiply_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc){ - /*tensorDesc record cpu org data format info*/ - /*gclmemDesc record gpu trans data format info*/ - if(outputDesc) *outputDesc = inputDesc; - DataType idt; - DataFormat idf; - U32 iw, ih, ic, in; - tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); - - if(idf == DF_NCHW) { - if(gclmemInputDesc) { - U32 iw_align = (iw + 3) / 4 * 4; - if(gclmemInputDesc->memFormat == DF_NCHW) { - CHECK_STATUS(infer_gclmem_desc_nchw(iw_align, ih, ic, 0, 0, iw_align, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); - if(gclmemInputDesc && gclmemOutputDesc) *gclmemOutputDesc = *gclmemInputDesc; - } else { - return NOT_SUPPORTED; - } - } - return SUCCESS; - } - return NOT_SUPPORTED; -} - -inline EE multiply_checkpara_mali(GCLHandle_t handle, - void* alpha, - void* beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - if(handle == nullptr || alpha == nullptr || beta == nullptr || nullptr == input || nullptr == output) return NULL_POINTER; - if(inputDesc.df != outputDesc.df || inputDesc.df != DF_NCHW) return NOT_SUPPORTED; - if(input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCHW) return NOT_SUPPORTED; - return SUCCESS; -} - -EE multiply_mali(GCLHandle_t handle, - void* alpha, - void* beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - EE ret = SUCCESS; - CHECK_STATUS(multiply_checkpara_mali(handle, alpha, beta, inputDesc, input, outputDesc, output)); - switch(inputDesc.dt){ - case DT_F16:{ - ret = multiply_mali_fp16(handle, alpha, beta, inputDesc, input, outputDesc, output); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - - - diff --git a/tensor_computing/src/gpu/mali/normalization.cpp b/tensor_computing/src/gpu/mali/normalization.cpp deleted file mode 100644 index 6621de14..00000000 --- a/tensor_computing/src/gpu/mali/normalization.cpp +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
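multiply_mali, like every public entry point in these deleted files, validates its parameters once and then switches on the element type, with only DT_F16 actually implemented and DT_I8 reserved. The dispatcher skeleton, reduced to what the pattern shares across files (bodies stubbed):

    #include <cstdio>

    enum EE { SUCCESS = 0, NOT_SUPPORTED = 1, NULL_POINTER = 2 };
    enum DataType { DT_F16, DT_I8, DT_F32 };

    static EE multiply_fp16(/* handle, alpha, beta, tensors... */) {
        return SUCCESS; // real work lives in the fp16 backend
    }

    // Skeleton of the per-precision dispatch used throughout the deleted
    // files: parameters are checked first, then routed by element type.
    static EE multiply(DataType dt) {
        switch (dt) {
            case DT_F16: return multiply_fp16();
            case DT_I8:  return NOT_SUPPORTED; // reserved, not implemented
            default:     return NOT_SUPPORTED;
        }
    }

    int main() {
        std::printf("fp16 -> %d, int8 -> %d\n", multiply(DT_F16), multiply(DT_I8));
        return 0;
    }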
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/fp16/normalization_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -EE normalization_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc) { - if(outputDesc) *outputDesc = inputDesc; - if(inputDesc.df == DF_MKT) { - DataType dt; - U32 m, k, t; - U32 w, h, c; - get_nlp_mkt_val(inputDesc, &dt, &m, &k, &t); - map_nlp_mkt_to_ncwhc4(m, k, t, &w, &h, &c); - c = c * 4; - CHECK_STATUS(infer_gclmem_desc_ncwhc4(w, h, c, 0, 0, w, h, c, dt, dt, gclmemInputDesc, gclmemOutputDesc)); - return SUCCESS; - } - return NOT_SUPPORTED; -} - -inline EE normalization_checkpara_mali(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - if(nullptr == handle || nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) return NULL_POINTER; - if(inputDesc.df != outputDesc.df || inputDesc.df != DF_MKT) return NOT_SUPPORTED; - if(input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCWHC4) return NOT_SUPPORTED; - return SUCCESS; -} - -EE layer_normalization_mali(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - EE ret = SUCCESS; - CHECK_STATUS(normalization_checkpara_mali(handle, alpha, beta, inputDesc, input, outputDesc, output)); - switch(inputDesc.dt){ - case DT_F16:{ - ret = normalization_mali_fp16(handle, alpha, beta, inputDesc, input, outputDesc, output); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - - - diff --git a/tensor_computing/src/gpu/mali/pooling.cpp b/tensor_computing/src/gpu/mali/pooling.cpp deleted file mode 100644 index f4d6000c..00000000 --- a/tensor_computing/src/gpu/mali/pooling.cpp +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
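The NLP paths above store DF_MKT tensors (matrix, key, time) in the same NCWHC4 blocks as images: width is fixed at 1, h carries the time steps, and the channel blocks pack k in groups of four per matrix. map_nlp_mkt_to_ncwhc4, restated as a standalone helper:

    #include <cstdint>
    #include <cstdio>

    // MKT -> NCWHC4 extents, as in the deleted map_nlp_mkt_to_ncwhc4.
    static void mkt_to_ncwhc4(uint32_t m, uint32_t k, uint32_t t,
                              uint32_t* gw, uint32_t* gh, uint32_t* gc) {
        *gw = 1;               // no spatial width for sequences
        *gh = t;               // one row per time step
        *gc = (k + 3) / 4 * m; // k packed into vec4 blocks, per matrix
    }

    int main() {
        uint32_t w, h, c;
        mkt_to_ncwhc4(/*m=*/1, /*k=*/312, /*t=*/9, &w, &h, &c);
        std::printf("gw=%u gh=%u gc=%u (stored channels = %u)\n", w, h, c, c * 4);
        return 0;
    }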
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/fp16/pooling_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -EE pooling_infer_output_size_mali(TensorDesc inputDesc, - PoolingDesc poolingDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc){ - /*tensorDesc record cpu org data format info*/ - /*gclmemDesc record gpu trans data format info*/ - DataType idt; - DataFormat idf; - U32 iw, ih, ic, in; - U32 ow, oh; - U32 kw, kh, sw, sh, pw, ph, pr, pb; - tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); - pw = poolingDesc.padding_left; - pr = poolingDesc.padding_right; - ph = poolingDesc.padding_top; - pb = poolingDesc.padding_bottom; - kw = poolingDesc.kernelSize_w; - kh = poolingDesc.kernelSize_h; - sw = poolingDesc.stride_w; - sh = poolingDesc.stride_h; - if(kw != kh || sw != sh) CHECK_STATUS(NOT_SUPPORTED); - if(pw != ph || ph != pb || pw != pr) CHECK_STATUS( NOT_SUPPORTED); - switch (poolingDesc.rm){ - case CEIL: { - ow = (U32)(ceil((double(iw + 2 * pw - kw) / sw))) + 1; - oh = (U32)(ceil((double(ih + 2 * ph - kh) / sh))) + 1; - break; - } - case FLOOR: { - ow = (U32)(floor((double(iw + 2 * pw - kw) / sw))) + 1; - oh = (U32)(floor((double(ih + 2 * ph - kh) / sh))) + 1; - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - - CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw, ih, ic, pw, ph, ow, oh, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); - if(outputDesc) *outputDesc = tensor4df(idt, idf, in, ic, oh, ow); - return SUCCESS; -} -EE pooling_mali(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - PoolingDesc poolingDesc, - const void* scale, - TensorDesc outputDesc, - GCLMem_t output){ - UNUSED(scale); - EE ret = SUCCESS; - switch(inputDesc.dt){ - case DT_F16:{ - ret = pooling_mali_fp16(handle, inputDesc, input, poolingDesc, outputDesc, output); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - - - diff --git a/tensor_computing/src/gpu/mali/reshape.cpp b/tensor_computing/src/gpu/mali/reshape.cpp deleted file mode 100644 index 6fbb54c9..00000000 --- a/tensor_computing/src/gpu/mali/reshape.cpp +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
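pooling_infer_output_size_mali above requires square kernels and symmetric padding, then derives each output extent from the usual pooling formula with a CEIL or FLOOR rounding mode. The same arithmetic as a self-contained helper:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    enum RoundMode { CEIL, FLOOR };

    // Output extent along one axis for pooling with symmetric padding p,
    // window k, stride s (mirrors the deleted CEIL/FLOOR cases).
    static uint32_t pooled_extent(uint32_t in, uint32_t k, uint32_t s,
                                  uint32_t p, RoundMode rm) {
        double span = double(in + 2 * p - k) / s;
        return uint32_t(rm == CEIL ? std::ceil(span) : std::floor(span)) + 1;
    }

    int main() {
        std::printf("ceil:  %u\n", pooled_extent(8, 3, 2, 0, CEIL));  // 4
        std::printf("floor: %u\n", pooled_extent(8, 3, 2, 0, FLOOR)); // 3
        return 0;
    }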
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/fp16/reshape_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -EE reshape_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - I32* dims, - I32 shapeSize, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc){ - /*tensorDesc record cpu org data format info*/ - /*gclmemDesc record gpu trans data format info*/ - I32 dimTran[4] = {1, 1, 1, 1}; - U32 factor = 1; - U32 count = 0; - for(I32 i = 0; i < shapeSize; i++){ - I32 value = dims[i]; - if(value == 0) value = inputDesc.dims[3 - i]; - if(value == -1) { - value = 0; - count++; - } else { - factor *=value; - } - dimTran[3 - i] = value; - } - - for(I32 i = 0; i < 4; i++) { - if(dimTran[i] == 0) dimTran[i] = tensorNumElements(inputDesc) / factor; - } - - - if(inputDesc.df == DF_NCHW) { - DataType idt; - DataFormat idf; - U32 iw, ih, ic, in; - tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); - if(shapeSize == 2 || shapeSize == 4) { - if(dimTran[2] != (I32)ic) CHECK_STATUS(NOT_SUPPORTED);//gpu use ncwhc4, if reshape on axis c, need to reset data - if(outputDesc) { - *outputDesc = inputDesc; - (*outputDesc).dims[0] = dimTran[0]; - (*outputDesc).dims[1] = dimTran[1]; - (*outputDesc).dims[2] = dimTran[2]; - (*outputDesc).dims[3] = dimTran[3]; - } - CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw, ih, ic, 0, 0, dimTran[0], dimTran[1], dimTran[2], idt, idt, gclmemInputDesc, gclmemOutputDesc)); - } - if(shapeSize == 3) { - U32 m = dimTran[3]; - U32 k = dimTran[2]; - U32 t = dimTran[1]; - if(outputDesc) *outputDesc = tensor3df(idt, DF_MKT, m, k, t); - U32 ogw, ogh, ogc; - map_nlp_mkt_to_ncwhc4(m, k, t, &ogw, &ogh, &ogc); - CHECK_STATUS(infer_gclmem_desc_nchw(iw, ih, ic, 0, 0, 0, 0, 0, idt, idt, gclmemInputDesc, NULL)); - CHECK_STATUS(infer_gclmem_desc_ncwhc4(0, 0, 0, 0, 0, ogw, ogh, ogc * 4, idt, idt, NULL, gclmemOutputDesc)); - } - return SUCCESS; - } - - if(inputDesc.df == DF_MKT) { - DataType idt; - U32 m, k, t; - get_nlp_mkt_val(inputDesc, &idt, &m, &k, &t); - if(outputDesc) *outputDesc = tensor4df(idt, DF_NCHW, dimTran[3], dimTran[2], dimTran[1], dimTran[0]); - U32 igw, igh, igc; - map_nlp_mkt_to_ncwhc4(m, k, t, &igw, &igh, &igc); - if((I32)igh != dimTran[0]) CHECK_STATUS(NOT_MATCH); - if((I32)igc != (dimTran[1] * dimTran[2] + 3) / 4) CHECK_STATUS(NOT_MATCH); - CHECK_STATUS(infer_gclmem_desc_ncwhc4(igw, igh, igc * 4, 0, 0, 0, 0, 0, idt, idt, gclmemInputDesc, NULL)); - CHECK_STATUS(infer_gclmem_desc_nchw(0, 0, 0, 0, 0, dimTran[0], dimTran[1], dimTran[2], idt, idt, NULL, gclmemOutputDesc)); - return SUCCESS; - } - return NOT_SUPPORTED; -} - -inline EE reshape_checkpara_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - - if(handle == nullptr || nullptr == input || nullptr == output) return NULL_POINTER; - if(inputDesc.df != DF_NCHW && inputDesc.df != DF_MKT) return NOT_SUPPORTED; - if(outputDesc.df != DF_NCHW && outputDesc.df != DF_MKT) return NOT_SUPPORTED; - if(output->desc.memFormat != DF_NCWHC4 && output->desc.memFormat != DF_NCHW) return NOT_SUPPORTED; - if(input->desc.offset[0] != 0 || input->desc.offset[1] != 0) return NOT_SUPPORTED; - return SUCCESS; -} - -EE reshape_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - EE ret = SUCCESS; - CHECK_STATUS(reshape_checkpara_mali(handle, inputDesc, input, outputDesc, output)); - 
switch(inputDesc.dt){ - case DT_F16:{ - ret = reshape_mali_fp16(handle, inputDesc, input, outputDesc, output); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - - - diff --git a/tensor_computing/src/gpu/mali/scale.cpp b/tensor_computing/src/gpu/mali/scale.cpp deleted file mode 100644 index 6fb2312d..00000000 --- a/tensor_computing/src/gpu/mali/scale.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/fp16/scale_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -EE scale_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc){ - /*tensorDesc record cpu org data format info*/ - /*gclmemDesc record gpu trans data format info*/ - if(outputDesc) *outputDesc = inputDesc; - - DataType idt; - DataFormat idf; - U32 iw, ih, ic, in; - tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); - - if(idf == DF_NCHW) { - U32 ih_align = (ih + 1) / 2 * 2; - CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw, ih_align, ic, 0, 0, iw, ih_align, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); - if(gclmemInputDesc && gclmemOutputDesc) *gclmemOutputDesc = *gclmemInputDesc;//the input and output mem maybe the same - return SUCCESS; - } - return NOT_SUPPORTED; -} - -inline EE scale_checkpara_mali(GCLHandle_t handle, - GCLMem_t alpha, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - if(handle == nullptr || nullptr == alpha || nullptr == input || nullptr == output) return NULL_POINTER; - if(inputDesc.df != outputDesc.df || inputDesc.df != DF_NCHW) return NOT_SUPPORTED; - if(input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCWHC4) return NOT_SUPPORTED; - return SUCCESS; -} - -EE scale_mali(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - EE ret = SUCCESS; - CHECK_STATUS(scale_checkpara_mali(handle, alpha, inputDesc, input, outputDesc, output)); - switch(inputDesc.dt){ - case DT_F16:{ - ret = scale_mali_fp16(handle, alpha, beta, inputDesc, input, outputDesc, output); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret 
= NOT_SUPPORTED; - break; - } - return ret; -} - - - diff --git a/tensor_computing/src/gpu/mali/slice.cpp b/tensor_computing/src/gpu/mali/slice.cpp deleted file mode 100644 index 39a5bf02..00000000 --- a/tensor_computing/src/gpu/mali/slice.cpp +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/fp16/slice_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -EE slice_infer_output_size_mali(TensorDesc inputDesc, - std::vector* outputDesc, - I32 axis, - I32* slice_point, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc) { - if(outputDesc == NULL) CHECK_STATUS(NULL_POINTER); - U32 num = (*outputDesc).size(); - axis = (axis + inputDesc.nDims) % inputDesc.nDims; - I32 target_axis = inputDesc.nDims - 1 - axis; - for (U32 i = 0; i < num; i++) { - (*outputDesc)[i] = inputDesc; - - I32 prev_point = 0; - if (i > 0) { - prev_point = slice_point[i-1]; - } - I32 next_point = inputDesc.dims[target_axis]; - if (i < num - 1) { - next_point = slice_point[i]; - } - if (prev_point < 0) { - prev_point = (prev_point + inputDesc.dims[target_axis]) % inputDesc.dims[target_axis]; - } - if (next_point < 0) { - next_point = (next_point + inputDesc.dims[target_axis]) % inputDesc.dims[target_axis]; - } - (*outputDesc)[i].dims[target_axis] = next_point - prev_point; - } - if(inputDesc.df == DF_MKT) { - if(axis == 2) {//slice on T - DataType dt; - U32 m, k, t; - U32 gw, gh, gc; - get_nlp_mkt_val(inputDesc, &dt, &m, &k, &t); - map_nlp_mkt_to_ncwhc4(m, k, t, &gw, &gh, &gc); - CHECK_STATUS(infer_gclmem_desc_ncwhc4(gw, gh, gc * 4, 0, 0, 0, 0, 0, dt, dt, gclmemInputDesc, NULL)); - if(gclmemOutputDesc) { - for(U32 i = 0; i < num; ++i) { - get_nlp_mkt_val((*outputDesc)[i], NULL, &m, &k, &t); - map_nlp_mkt_to_ncwhc4(m, k, t, &gw, &gh, &gc); - CHECK_STATUS(infer_gclmem_desc_ncwhc4(0, 0, 0, 0, 0, gw, gh, gc * 4, dt, dt, NULL, &gclmemOutputDesc[i])); - } - } - } - return SUCCESS; - } - return NOT_SUPPORTED; -} - -inline EE slice_checkpara_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - I32 axis, - std::vector outputDesc, - std::vector* output) { - if(handle == nullptr || input == nullptr) return NULL_POINTER; - if(input->desc.memFormat != DF_NCWHC4) return NOT_SUPPORTED; - for(auto p : (*output)) { - if(p == nullptr) return NULL_POINTER; - 
if(((GCLMem_t)p)->desc.memFormat != input->desc.memFormat) return NOT_MATCH; - } - if(inputDesc.df != DF_MKT) return NOT_SUPPORTED; - if(inputDesc.df == DF_MKT && axis != 2) return NOT_SUPPORTED; - for(auto p : outputDesc) { - if(p.df != inputDesc.df) return NOT_MATCH; - } - return SUCCESS; -} - -EE slice_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - I32 axis, - std::vector outputDesc, - std::vector* output) { - EE ret = SUCCESS; - CHECK_STATUS(slice_checkpara_mali(handle, inputDesc, input, axis, outputDesc, output)); - switch(inputDesc.dt){ - case DT_F16:{ - ret = slice_mali_fp16(handle, inputDesc, input, axis, outputDesc, output); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - - - diff --git a/tensor_computing/src/gpu/mali/softmax.cpp b/tensor_computing/src/gpu/mali/softmax.cpp deleted file mode 100644 index a202c407..00000000 --- a/tensor_computing/src/gpu/mali/softmax.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
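The slice_point arithmetic in the deleted slice_infer_output_size_mali is worth restating on its own: output i spans [slice_point[i-1], slice_point[i]) along the target axis, the first output starts at 0, the last ends at the axis length, and negative points wrap modulo the axis length. A standalone sketch of that rule:

```cpp
#include <cstdio>
#include <vector>

// Mirrors the deleted per-output size computation along the sliced axis.
static std::vector<int> sliceSizes(int dim, const std::vector<int> &points, int numOutputs)
{
    std::vector<int> sizes;
    for (int i = 0; i < numOutputs; i++) {
        int prev = (i > 0) ? points[i - 1] : 0;           // first output starts at 0
        int next = (i < numOutputs - 1) ? points[i] : dim; // last output ends at dim
        if (prev < 0) prev = (prev + dim) % dim;           // negative points wrap
        if (next < 0) next = (next + dim) % dim;
        sizes.push_back(next - prev);
    }
    return sizes;
}

int main()
{
    // Slicing a length-100 axis at {30, -40}: -40 wraps to 60, giving 30, 30, 40.
    for (int s : sliceSizes(100, {30, -40}, 3)) printf("%d ", s);
    printf("\n");
    return 0;
}
```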
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/fp16/softmax_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -EE softmax_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc){ - /*tensorDesc record cpu org data format info*/ - /*gclmemDesc record gpu trans data format info*/ - if(outputDesc) *outputDesc = inputDesc; - - DataType idt; - DataFormat idf; - U32 iw, ih, ic, in; - if(inputDesc.df == DF_NCHW) tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); - if(inputDesc.df == DF_MKT) { - U32 m, k, t; - get_nlp_mkt_val(inputDesc, &idt, &m, &k, &t); - map_nlp_mkt_to_ncwhc4(m, k, t, &iw, &ih, &ic); - ic = 4 * ic; - idf = DF_MKT; - } - if(idf == DF_NCHW || idf == DF_MKT) { - if(gclmemInputDesc) { - if(gclmemInputDesc->memFormat == DF_NCHW) { - U32 iw_align = (iw + 3) / 4 * 4; - CHECK_STATUS(infer_gclmem_desc_nchw(iw_align, ih, ic, 0, 0, iw_align, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); - } else if(gclmemInputDesc->memFormat == DF_NCWHC4) { - CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); - } else { - CHECK_STATUS(NOT_SUPPORTED); - } - } - return SUCCESS; - } - return NOT_SUPPORTED; -} - -inline EE softmax_checkpara_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - int axis, - TensorDesc outputDesc, - GCLMem_t output) { - - if(handle == nullptr || nullptr == input || nullptr == output) return NULL_POINTER; - if(input->desc.memFormat != output->desc.memFormat) return NOT_SUPPORTED; - if(inputDesc.df != outputDesc.df) return NOT_SUPPORTED; - if(inputDesc.dims[0] != outputDesc.dims[0]) return NOT_SUPPORTED; - if(inputDesc.dims[1] != outputDesc.dims[1]) return NOT_SUPPORTED; - if(inputDesc.dims[2] != outputDesc.dims[2]) return NOT_SUPPORTED; - if(inputDesc.dims[3] != outputDesc.dims[3]) return NOT_SUPPORTED; - if(outputDesc.df != DF_NCHW && outputDesc.df != DF_MKT) return NOT_SUPPORTED; - if(output->desc.memFormat != DF_NCWHC4 && output->desc.memFormat != DF_NCHW) return NOT_SUPPORTED; - if(axis != 1 && axis != 3 && axis != -1) return NOT_SUPPORTED; - return SUCCESS; -} - -EE softmax_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - int axis, - TensorDesc outputDesc, - GCLMem_t output) { - EE ret = SUCCESS; - CHECK_STATUS(softmax_checkpara_mali(handle, inputDesc, input, axis, outputDesc, output)); - switch(inputDesc.dt){ - case DT_F16:{ - ret = softmax_mali_fp16(handle, inputDesc, input, axis, outputDesc, output); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - - - diff --git a/tensor_computing/src/gpu/mali/space2depth.cpp b/tensor_computing/src/gpu/mali/space2depth.cpp deleted file mode 100644 index 667fd26d..00000000 --- a/tensor_computing/src/gpu/mali/space2depth.cpp +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -inline EE space2depth_checkpara_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - if(handle == nullptr || nullptr == input || nullptr == output) return NULL_POINTER; - if(input->desc.memFormat != DF_NCHW) return NOT_SUPPORTED; - if(output->desc.memFormat != DF_NCWHC4) return NOT_SUPPORTED; - if(inputDesc.df != DF_NCHW) return NOT_SUPPORTED; - if(outputDesc.df != DF_NCHW) return NOT_SUPPORTED; - if(inputDesc.dt != DT_U8) return NOT_SUPPORTED; - if(outputDesc.dt != DT_F16) return NOT_SUPPORTED; - if(inputDesc.dims[0] != outputDesc.dims[0] * 4) return NOT_SUPPORTED; - if(inputDesc.dims[1] != outputDesc.dims[1] * 4) return NOT_SUPPORTED; - if(inputDesc.dims[2] != outputDesc.dims[2] / 16) return NOT_SUPPORTED; - if(inputDesc.dims[3] != outputDesc.dims[3]) return NOT_SUPPORTED; - return SUCCESS; -} - -inline EE space2depth_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - UNUSED(outputDesc); - U32 iw, ih, ic, in; - tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); - U32 iw_str, ih_str, iw_off, ih_off; - iw_str = input->desc.stride[0]; - ih_str = input->desc.stride[1]; - iw_off = input->desc.offset[0]; - ih_off = input->desc.offset[1]; - U32 ow_str, oh_str, ow_off, oh_off, ohw_str; - oh_str = output->desc.stride[0]; - ow_str = output->desc.stride[1]; - oh_off = output->desc.offset[0]; - ow_off = output->desc.offset[1]; - ohw_str = oh_str * ow_str; - - cl_mem inbuf, outbuf; - inbuf = input->mem; - outbuf = output->mem; - - U32 gs[3] = {(ih + 3) / 4, (iw + 3) / 4}; - U32 ls[3] = {0, 0}; - U32 dim = 2; - Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, "space2depth", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, oh_str, ohw_str, - ow_off, oh_off, gs[0], gs[1], inbuf, outbuf)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, "space2depth"); -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "space2depth")); - CHECK_STATUS(gcl_print_memory(handle, input, "space2depth_input")); - CHECK_STATUS(gcl_print_memory(handle, output, "space2depth_output")); -#endif - return SUCCESS; -} - -EE space2depth_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, 
- GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc){ - /*tensorDesc record cpu org data format info*/ - /*gclmemDesc record gpu trans data format info*/ - *outputDesc = inputDesc; - - DataType idt; - DataFormat idf; - U32 iw, ih, ic, in; - U32 ow, oh, oc, on; - tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); - if(idt != DT_U8) return NOT_SUPPORTED; - if(ic != 1) return NOT_SUPPORTED; - on = in; - oc = ic * 16; - oh = ih / 4; - ow = iw / 4; - - if(idf == DF_NCHW) { - if(outputDesc) *outputDesc = tensor4df(DT_F16, idf, on, oc, oh, ow); - CHECK_STATUS(infer_gclmem_desc_nchw( iw, ih, ic, 0, 0, 0, 0, 0, DT_U8, DT_U8, gclmemInputDesc, NULL)); - CHECK_STATUS(infer_gclmem_desc_ncwhc4(0, 0, 0, 0, 0, ow, oh, oc, DT_F16, DT_F16, NULL, gclmemOutputDesc)); - return SUCCESS; - } - return NOT_SUPPORTED; -} - -EE space2depth_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - EE ret = SUCCESS; - CHECK_STATUS(space2depth_checkpara_mali(handle, inputDesc, input, outputDesc, output)); - CHECK_STATUS(space2depth_core_mali_fp16(handle, inputDesc, input, outputDesc, output)); - return ret; -} - - - diff --git a/tensor_computing/src/gpu/mali/squeeze.cpp b/tensor_computing/src/gpu/mali/squeeze.cpp deleted file mode 100644 index 8fd6a9ca..00000000 --- a/tensor_computing/src/gpu/mali/squeeze.cpp +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
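The shape checks in the deleted space2depth path encode a fixed 4x4 block size: a single-channel U8 image is cut into 4x4 blocks, and each block's 16 pixels become 16 FP16 channels. A small sketch of that dimension arithmetic:

```cpp
#include <cstdio>

struct Shape { unsigned n, c, h, w; };

// Shape arithmetic of the deleted space2depth_infer_output_size_mali:
// requires c == 1 and U8 input; the block size is fixed at 4.
static Shape space2depthOut(Shape in)
{
    return Shape{in.n, in.c * 16, in.h / 4, in.w / 4};
}

int main()
{
    Shape out = space2depthOut(Shape{1, 1, 224, 224});
    printf("out: %ux%ux%ux%u\n", out.n, out.c, out.h, out.w);  // 1x16x56x56
    return 0;
}
```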
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/fp16/squeeze_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -EE squeeze_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc){ - /*tensorDesc record cpu org data format info*/ - /*gclmemDesc record gpu trans data format info*/ - if(outputDesc) *outputDesc = inputDesc; - - DataType idt; - DataFormat idf; - U32 iw, ih, ic, in; - tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); - - if(idf == DF_NCHW) { - CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); - return SUCCESS; - } - return NOT_SUPPORTED; -} - -inline EE squeeze_checkpara_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - - if(handle == nullptr || nullptr == input || nullptr == output) return NULL_POINTER; - if(input->desc.memFormat != output->desc.memFormat) return NOT_SUPPORTED; - if(inputDesc.df != outputDesc.df) return NOT_SUPPORTED; - if(inputDesc.dims[0] != outputDesc.dims[0]) return NOT_SUPPORTED; - if(inputDesc.dims[1] != outputDesc.dims[1]) return NOT_SUPPORTED; - if(inputDesc.dims[2] != outputDesc.dims[2]) return NOT_SUPPORTED; - if(inputDesc.dims[3] != outputDesc.dims[3]) return NOT_SUPPORTED; - if(outputDesc.df != DF_NCHW) return NOT_SUPPORTED; - if(output->desc.memFormat != DF_NCWHC4) return NOT_SUPPORTED; - return SUCCESS; -} - -EE squeeze_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - EE ret = SUCCESS; - CHECK_STATUS(squeeze_checkpara_mali(handle, inputDesc, input, outputDesc, output)); - switch(inputDesc.dt){ - case DT_F16:{ - ret = squeeze_mali_fp16(handle, inputDesc, input, outputDesc, output); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - - - diff --git a/tensor_computing/src/gpu/mali/tensor_computing_get_output.cpp b/tensor_computing/src/gpu/mali/tensor_computing_get_output.cpp deleted file mode 100644 index d5f26824..00000000 --- a/tensor_computing/src/gpu/mali/tensor_computing_get_output.cpp +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -EE tensor_computing_get_output_infer_tmpBuf_size_mali(const GCLMem_t input, - TensorDesc hostDesc, - U32* tmpBufSize) { - UNUSED(input); - UNUSED(hostDesc); - *tmpBufSize = 0; -// if(input->desc.memFormat == DF_NCWHC4) {*tmpBufSize = tensorNumBytes(hostDesc);} - return SUCCESS; -} - -EE tensor_computing_get_output_mali(GCLHandle_t handle, - const GCLMem_t input, - TensorDesc hostDesc, - U8** hostPtr, - GCLMem_t tmpBuf, - bool blocking) { - UNUSED(hostPtr); - UNUSED(tmpBuf); - GCLMemDesc desc = input->desc; - Kernel kernel; - DataType host_dt; - DataFormat host_df, device_df; - U32 ow, oh, oc, on; - U32 iw, ih, ic, pw, ph; - if(hostDesc.df == DF_NCHW) { - tensorSelectGet(hostDesc, &host_dt, &host_df, &on, &oc, &oh, &ow); - } else if (hostDesc.df == DF_MKT) { - get_nlp_mkt_val(hostDesc, &host_dt, &on, &oc, &oh); - ow = 1; - host_df = DF_MKT; - } - U32 size = tensorNumBytes(hostDesc); - U32 offset = 0; - ih = desc.stride[0]; - iw = desc.stride[1]; - ic = desc.stride[2]; - ph = desc.offset[0]; - pw = desc.offset[1]; - device_df = desc.memFormat; - if(pw != 0 || ph != 0) CHECK_STATUS(NOT_SUPPORTED); - if(desc.byteSize < size) CHECK_STATUS(NOT_MATCH); - if(desc.use_map == false) CHECK_STATUS(NOT_MATCH); - - if(device_df == DF_NCWHC4 && host_df == DF_NCHW && - host_dt == DT_F16 && (ih != 1 || iw != 1)) { - if(desc.byteSize < size * 2) CHECK_STATUS(NOT_MATCH); - U32 owh_str = ow * oh; - offset = iw * ih * ic * 4; - CHECK_STATUS(gcl_get_kernel_from_map(handle, "mem_trans_ncwhc4_to_nchw", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, iw, ih, pw, ph, ow, oh, oc, owh_str, offset, input->mem, input->mem)); - U32 gs[3] = {oh, (ow + 3) >> 2, (oc + 3) / 4 * on}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "mem_trans_ncwhc4_to_nchw")); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, input, "input")); -#endif - offset = offset * bytesOf(host_dt); - } - - if(device_df == DF_NCWHC4 && host_df == DF_MKT) { - if(desc.byteSize < size * 2) CHECK_STATUS(NOT_MATCH); - offset = iw * ih * ic * 4; - U32 gs[2] = {oh, (oc + 3) / 4}; - U32 ls[2] = {0, 0}; - U32 dim = 2; - CHECK_STATUS(gcl_get_kernel_from_map(handle, "mem_trans_ncwhc4_to_mtk", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ph, pw, oc, offset, gs[0], gs[1], input->mem, input->mem)); - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "mem_trans_ncwhc4_to_mtk")); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, input, "input")); -#endif - offset = offset * bytesOf(host_dt); - } - - CHECK_STATUS(gcl_map_memory(handle, input, &offset, &size, CL_MAP_READ, blocking)); - return SUCCESS; -} - - - diff --git a/tensor_computing/src/gpu/mali/tensor_computing_mali.h b/tensor_computing/src/gpu/mali/tensor_computing_mali.h deleted file mode 100644 index 9e4b052a..00000000 --- a/tensor_computing/src/gpu/mali/tensor_computing_mali.h +++ /dev/null @@ -1,482 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#ifndef _H_TENSOR_COMPUTING_MALI -#define _H_TENSOR_COMPUTING_MALI -#include "tensor_desc.h" -#include "tensor_computing_type.h" -EE tensor_computing_set_input_infer_tmpBuf_size_mali(GCLMem_t input, - TensorDesc hostDesc, - U32* tmpBufSize); - - -EE tensor_computing_set_input_mali(GCLHandle_t handle, - GCLMem_t input, - TensorDesc hostDesc, - const U8* hostPtr, - GCLMem_t tmpBuf, - bool blocking); - - -EE tensor_computing_get_output_infer_tmpBuf_size_mali(const GCLMem_t input, - TensorDesc hostDesc, - U32* tmpBufSize); - - -EE tensor_computing_get_output_mali(GCLHandle_t handle, - const GCLMem_t input, - TensorDesc hostDesc, - U8** hostPtr, - GCLMem_t tmpBuf, - bool blocking); - - -EE pooling_mali(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - PoolingDesc poolingDesc, - const void* scale, - TensorDesc outputDesc, - GCLMem_t output); - - -EE pooling_infer_output_size_mali(TensorDesc inputDesc, - PoolingDesc poolingDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc); - - -EE convolution_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc filterDesc, - ConvolutionDesc convDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc, - ForwardRunInfoMali_t forwardRunInfo); - - -EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, - TensorDesc inputDesc, - TensorDesc filterDesc, - ConvolutionDesc convDesc, - TensorDesc outputDesc, - ConvolutionPolicy policy, - ActivationMode activationMode, - ForwardRunInfoMali_t forwardRunInfo); - - -EE convolution_transform_filter_bytes_mali(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes); - - -EE convolution_transform_filter_mali(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem, - GCLMem_t tmp); - - -EE convolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes); - - -EE convolution_mali(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc scaleDesc, - const GCLMem_t scale, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc 
outputDesc, - GCLMem_t output, - ActivationMode activationMode); - - -EE depthwise_convolution_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc filterDesc, - ConvolutionDesc convDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc, - ForwardRunInfoMali_t forwardRunInfo); - -EE depthwise_convolution_infer_forward_algorithm_mali(GCLHandle_t handle, - TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ConvolutionPolicy policy, - ActivationMode depthwiseActivationMode, - ActivationMode pointwiseActivationMode, - ForwardRunInfoMali_t forwardRunInfo); - -EE depthwise_convolution_transform_filter_bytes_mali(TensorDesc filterDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes); - -EE depthwise_convolution_transform_filter_mali(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc* fltmemDesc, - GCLMem_t fltmem); - -EE depthwise_convolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes); - -EE depthwise_convolution_mali(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - const GCLMem_t filter, - ConvolutionDesc convDesc, - ForwardRunInfoMali_t forwardRunInfo, - TensorDesc biasDesc, - const GCLMem_t bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode depthwiseActivationMode, - ActivationMode pointwiseActivationMode); - -EE bilateral_slice_apply_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc guideDesc, - TensorDesc gridDesc, - BilateralSliceApplyDesc bilateralSliceApplyDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemGuideDesc, - GCLMemDesc_t gclmemGridDesc, - GCLMemDesc_t gclmemOutputDesc); - -EE bilateral_slice_apply_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, - TensorDesc guideDesc, - TensorDesc gridDesc, - BilateralSliceApplyDesc bilateralSliceApplyDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32* bytes); - -EE bilateral_slice_apply_mali(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc guideDesc, - const GCLMem_t guide, - TensorDesc gridDesc, - const GCLMem_t grid, - BilateralSliceApplyDesc bilateralSliceApplyDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output); - - -EE eltwise_infer_output_size_mali(std::vector inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc); - -EE eltwise_mali(GCLHandle_t handle, - std::vector inputDesc, - std::vector input, - TensorDesc outputDesc, - GCLMem_t output, - EltwiseMode eltwiseMode); - -EE softmax_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc); - -EE softmax_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - int axis, - TensorDesc outputDesc, - GCLMem_t output); - -EE activation_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc); - -EE activation_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode activationMode); - -EE fully_connected_infer_output_size_mali(TensorDesc 
inputDesc, - TensorDesc filterDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc, - ForwardRunInfoMali_t forwardRunInfo); - -EE fully_connected_infer_forward_algorithm_mali(GCLHandle_t handle, - TensorDesc inputDesc, - TensorDesc filterDesc, - std::vector outputDescs, - ForwardRunInfoMali_t forwardRunInfo); - -EE fully_connected_transform_filter_bytes_mali(TensorDesc filterDesc, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes, - ForwardRunInfoMali_t forwardRunInfo); - -EE fully_connected_transform_filter_mali(GCLHandle_t handle, - TensorDesc filterDesc, - GCLMem_t filter, - TensorDesc* fltmemDesc, - std::vector* fltmem, - ForwardRunInfoMali_t forwardRunInfo); - -EE fully_connected_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, - TensorDesc filterDesc, - U32* bytes, - ForwardRunInfoMali_t forwardRunInfo); - -EE fully_connected_mali(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - std::vector* filter, - TensorDesc biasDesc, - std::vector* bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - std::vector* output, - ForwardRunInfoMali_t forwardRunInfo); - -EE scale_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc); - -EE scale_mali(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output); - -EE concat_infer_output_size_mali(std::vector inputDesc, - TensorDesc* outputDesc, - U32 concatDim, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc); - -EE concat_mali(GCLHandle_t handle, - std::vector inputDesc, - std::vector input, - GCLMem_t inputScale, - TensorDesc outputDesc, - GCLMem_t output, - GCLMem_t outputScale, - U32 concatDim); - -EE clip_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc); - -EE clip_mali(GCLHandle_t handle, - void* min_value, - void* max_value, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output); - -EE squeeze_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc); - -EE squeeze_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output); - -EE reshape_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - I32* dims, - I32 shapeSize, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc); - -EE reshape_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output); - -EE space2depth_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc); - -EE space2depth_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output); - -EE depth2space_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc); - -EE depth2space_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output); - -EE embedding_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - U32 inputDim, - U32 numOutput, - DataType dt, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc); - -EE embedding_mali(GCLHandle_t handle, - 
TensorDesc inputDesc, - GCLMem_t input, - TensorDesc weightDesc, - GCLMem_t weight, - TensorDesc outputDesc, - GCLMem_t output, - U32 inputDim, - U32 numOutput, - bool transpose, - DataType dt); - -EE normalization_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc); - -EE layer_normalization_mali(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output); - -EE matmul_infer_output_size_mali(TensorDesc matrixADesc, - bool transposeA, - TensorDesc matrixBDesc, - bool transposeB, - TensorDesc* matrixCDesc, - GCLMemDesc_t gclmemMatrixADesc, - GCLMemDesc_t gclmemMatrixBDesc, - GCLMemDesc_t gclmemMatrixCDesc, - ForwardRunInfoMali_t forwardRunInfo); - -EE matmul_infer_forward_algorithm_mali(GCLHandle_t handle, - TensorDesc matrixADesc, - bool TransposeA, - TensorDesc matrixBDesc, - bool TransposeB, - TensorDesc matrixCDesc, - ForwardRunInfoMali_t forwardRunInfo); - -EE matmul_infer_forward_tmp_bytes_mali(TensorDesc matrixADesc, - bool transposeA, - TensorDesc matrixBDesc, - bool transposeB, - U32* bytes, - ForwardRunInfoMali_t forwardRunInfo); - -EE matmul_mali(GCLHandle_t handle, - TensorDesc matrixADesc, - bool transposeA, - const GCLMem_t matrixA, - TensorDesc matrixBDesc, - bool transposeB, - const GCLMem_t matrixB, - GCLMem_t tmp, - TensorDesc matrixCDesc, - GCLMem_t matrixC, - ForwardRunInfoMali_t forwardRunInfo); - -EE multiply_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc); - -EE multiply_mali(GCLHandle_t handle, - void* alpha, - void* beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output); - -EE transpose_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - U32* dim, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc); - -EE transpose_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output, - U32* dim); - -EE slice_infer_output_size_mali(TensorDesc inputDesc, - std::vector* outputDesc, - I32 axis, - I32* slice_point, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc); - -EE slice_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - I32 axis, - std::vector outputDesc, - std::vector* output); -#endif - - diff --git a/tensor_computing/src/gpu/mali/tensor_computing_set_input.cpp b/tensor_computing/src/gpu/mali/tensor_computing_set_input.cpp deleted file mode 100644 index 83da091a..00000000 --- a/tensor_computing/src/gpu/mali/tensor_computing_set_input.cpp +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -EE tensor_computing_set_input_infer_tmpBuf_size_mali(GCLMem_t input, - TensorDesc hostDesc, - U32* tmpBufSize) { - UNUSED(input); - *tmpBufSize = tensorNumBytes(hostDesc); -// if(input->desc.memFormat == DF_NCWHC4) {*tmpBufSize = tensorNumBytes(hostDesc);} - return SUCCESS; -} - -EE tensor_computing_set_input_mali(GCLHandle_t handle, - GCLMem_t input, - TensorDesc hostDesc, - const U8* hostPtr, - GCLMem_t tmpBuf, - bool blocking) { - GCLMemDesc desc = input->desc; - if(desc.memType == GCL_MEM_BUF) { - U32 size = tensorNumBytes(hostDesc); - Kernel kernel; - U32 iw, ih, ic, in; - DataType hdt; - DataFormat hdf; - if(hostDesc.df == DF_NCHW || hostDesc.df == DF_NHWC || hostDesc.df == DF_NCHW_ORG_MALI) { - tensorSelectGet(hostDesc, &hdt, &hdf, &in, &ic, &ih, &iw); - } else if(hostDesc.df == DF_NORMAL){ - tensor2dfGet(hostDesc, &hdt, &hdf, &ih, &iw); - ic = 1; - in = 1; - hdf = DF_NORMAL; - } else { - return NOT_SUPPORTED; - } - if(hdf == DF_NCHW_ORG_MALI || hdf == DF_NCHW) { - U32 ow, oh, pw, ph; - ow = input->desc.stride[0]; - oh = input->desc.stride[1]; - pw = input->desc.offset[0]; - ph = input->desc.offset[1]; - if(desc.memFormat == DF_NCHW || (ow == 1 && oh == 1 && pw == 0 && ph == 0)) { - GCLMem_t dst = (iw == ow && ih == oh) ? 
input : tmpBuf; - CHECK_STATUS(gcl_trans_memory(handle, (void*)hostPtr, (void*)dst, &size, HOST_TO_DEVICE_BUF, CL_TRUE)); - if(iw != ow || ih != oh) { - CHECK_STATUS(gcl_get_kernel_from_map(handle, "padding_input_gclmem", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, iw, ih, pw, ph, ow, oh, tmpBuf->mem, input->mem)); - U32 gs[3] = {((ow + 3) / 4 + 3) / 4 * 4, (oh + 3) / 4 * 4, ic}; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "padding_input_gclmem")); - } -#ifdef _DEBUG - if(hdt == DT_F16) { - CHECK_STATUS(gcl_print_memory(handle, input, "padding output")); - } else if(hdt == DT_U8) { - CHECK_STATUS(gcl_print_memory(handle, input, "padding output")); - } -#endif - return SUCCESS; - } - - if(desc.memFormat == DF_NCWHC4) { - if(hdt != DT_F16) return NOT_SUPPORTED; - oh = input->desc.stride[0]; - ow = input->desc.stride[1]; - ph = input->desc.offset[0]; - pw = input->desc.offset[1]; - gcl_trans_memory(handle, (void*)hostPtr, (void*)tmpBuf, &size, HOST_TO_DEVICE_BUF, blocking); - U32 iwh_str = iw * ih; - CHECK_STATUS(gcl_get_kernel_from_map(handle, "mem_trans_nchw_to_ncwhc4", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, iw, ih, ic, iwh_str, pw, ph, ow, oh, tmpBuf->mem, input->mem)); - U32 gs[3] = {(iw + 3) / 4, ih, (ic + 3) / 4 * in }; - U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_ncwhc4")); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, input, "ncwhc4 output")); -#endif - return SUCCESS; - } - return NOT_SUPPORTED; - } - - if(hdf == DF_NHWC) { - U32 oc, ow, pc, pw; - oc = input->desc.stride[0]; - ow = input->desc.stride[1]; - pc = input->desc.offset[0]; - pw = input->desc.offset[1]; - if(desc.memFormat == DF_NHWC) { - if(ic == oc && iw == ow && pc == 0 && pw == 0) { - gcl_trans_memory(handle, (void*)hostPtr, (void*)input, &size, HOST_TO_DEVICE_BUF, blocking); - return SUCCESS; - } - } - return NOT_SUPPORTED; - } - - if(hdf == DF_NORMAL) { - U32 oh, ow, ph, pw; - ow = input->desc.stride[0]; - oh = input->desc.stride[1]; - pw = input->desc.offset[0]; - ph = input->desc.offset[1]; - if(desc.memFormat == DF_NCHW) { - if(iw == ow && ih == oh && pw == 0 && ph == 0) { - gcl_trans_memory(handle, (void*)hostPtr, (void*)input, &size, HOST_TO_DEVICE_BUF, blocking); - return SUCCESS; - } - } - return NOT_SUPPORTED; - } - } - return NOT_SUPPORTED; -} - - - diff --git a/tensor_computing/src/gpu/mali/transpose.cpp b/tensor_computing/src/gpu/mali/transpose.cpp deleted file mode 100644 index 8aa31f1b..00000000 --- a/tensor_computing/src/gpu/mali/transpose.cpp +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "gpu/mali/tensor_computing_mali.h" -#include "gpu/mali/fp16/transpose_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" - -EE transpose_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - U32* dim, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc) { - U32 dimTran[4] = {1, 1, 1, 1}; - U32 nDims = inputDesc.nDims; - for(U32 i = 0; i < nDims; ++i) dimTran[nDims - 1 - i] = inputDesc.dims[nDims - 1 - dim[i]]; - if(outputDesc) { - *outputDesc = inputDesc; - for(U32 i = 0; i < nDims; ++i) (*outputDesc).dims[i] = dimTran[i]; - } - - if(inputDesc.df == DF_NCHW) { - DataType idt; - U32 iw, ih, ic, in; - tensorSelectGet(inputDesc, &idt, NULL, &in, &ic, &ih, &iw); - U32 iw_align = (iw + 3) / 4 * 4; - CHECK_STATUS(infer_gclmem_desc_nchw(iw_align, ih, ic, 0, 0, dimTran[0], dimTran[1], dimTran[2], idt, idt, gclmemInputDesc, gclmemOutputDesc)); - return SUCCESS; - } - return NOT_SUPPORTED; -} - -inline EE transpose_checkpara_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output, - U32* dim) { - if(handle == nullptr || input == nullptr || output == nullptr || dim == nullptr) return NULL_POINTER; - if(inputDesc.df != outputDesc.df || inputDesc.df != DF_NCHW) return NOT_SUPPORTED; - if(input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCHW) return NOT_SUPPORTED; - if(dim[0] != 0 || dim[1] != 1 || dim[2] != 3 || dim[3] != 2) return NOT_SUPPORTED; - return SUCCESS; -} - -EE transpose_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output, - U32* dim) { - EE ret = SUCCESS; - CHECK_STATUS(transpose_checkpara_mali(handle, inputDesc, input, outputDesc, output, dim)); - switch(inputDesc.dt) { - case DT_F16:{ - ret = transpose_mali_fp16(handle, inputDesc, input, outputDesc, output, dim); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - - - diff --git a/tensor_computing/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.cpp b/tensor_computing/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.cpp deleted file mode 100644 index 85661246..00000000 --- a/tensor_computing/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.cpp +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" -#include "gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h" - -inline EE bilateral_slice_apply_checkpara_mali_uchar(TensorDesc inputDesc, - TensorDesc guideDesc, - TensorDesc gridDesc, - TensorDesc outputDesc) { - if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_U8) return NOT_SUPPORTED; - if (gridDesc.dt != guideDesc.dt || gridDesc.dt != DT_F16) return NOT_SUPPORTED; - return SUCCESS; -} - -inline EE bilateral_slice_apply_core_mali_uchar(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc guideDesc, - const GCLMem_t guide, - TensorDesc gridDesc, - const GCLMem_t grid, - BilateralSliceApplyDesc bilateralSliceApplyDesc, - ForwardRunInfoMali_t forwardRunInfo, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output){ - UNUSED(guideDesc); - UNUSED(forwardRunInfo); - U32 iw, ih, ic, in; - U32 gw, gh, gc, gn; - U32 ow, oh, oc, on; - tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); - tensorSelectGet(gridDesc, NULL, NULL, &gn, &gc, &gh, &gw); - tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); - - U32 coe = bilateralSliceApplyDesc.coefficient_len; - BilateralSliceApplyMode mode = bilateralSliceApplyDesc.mode; - U32 dep = gc / coe; - U32 gcw = gc * gw; - U32 wh = iw * ih; - F32 scale_x = (F32)gw / iw; - F32 scale_y = (F32)gh / ih; - Mem inbuf, gridbuf, guidebuf, outbuf, gridTran; - inbuf = input->mem; - gridbuf = grid->mem; - outbuf = output->mem; - gridTran = tmpBuf->mem; - if(mode == BSliceApply_NULL) { - guidebuf = guide->mem; - } else { - guidebuf = inbuf; - } - - U32 gs0[3] = {gc / 4, gw, ih}; - U32 ls0[3] = {0, 0, 0}; - U32 dim0 = 3; - Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, "bilateral_slice_apply_pre", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, gh, gc, gcw, gs0[0], gs0[1], scale_y, gridbuf, gridTran)); - gcl_set_kernelVec(handle, kernel, dim0, gs0, ls0, "bilateral_slice_apply_pre"); - -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel_profiling(handle, kernel, dim0, gs0, ls0, "bilateral_slice_apply_pre")); - CHECK_STATUS(gcl_print_memory(handle, grid, "bilateral_slice_apply_grid")); -#endif - U32 gs[2] = {ow, oh}; - U32 ls[2] = {0, 0}; - U32 dim = 2; - char kernelname[128]; - if(mode == BSliceApply_CONV) { - sprintf(kernelname, "bilateral_slice_apply_c12_conv_uchar"); - } else { - sprintf(kernelname, "bilateral_slice_apply_c12_uchar"); - } - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, iw, wh, gc, gw, gh, gcw, dep, coe, gs[0], gs[1], scale_x, scale_y, guidebuf, gridTran, inbuf, outbuf)); - gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); - -#ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel_profiling(handle, kernel, dim, gs, ls, kernelname)); - CHECK_STATUS(gcl_print_memory(handle, input, "bilateral_slice_apply_input")); - CHECK_STATUS(gcl_print_memory(handle, output, "bilateral_slice_apply_output")); - if(mode == 
BSliceApply_NULL)CHECK_STATUS(gcl_print_memory(handle, guide, "bilateral_slice_apply_guide")); -#endif - return SUCCESS; - -} - -EE bilateral_slice_apply_mali_uchar(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc guideDesc, - const GCLMem_t guide, - TensorDesc gridDesc, - const GCLMem_t grid, - BilateralSliceApplyDesc bilateralSliceApplyDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output){ - UNUSED(tmpBytes); - CHECK_STATUS(bilateral_slice_apply_checkpara_mali_uchar(inputDesc, guideDesc, gridDesc, outputDesc)); - CHECK_STATUS(bilateral_slice_apply_core_mali_uchar(handle, inputDesc, input, guideDesc, guide, gridDesc, grid, bilateralSliceApplyDesc, forwardRunInfo, tmpBuf, outputDesc, output)); - return SUCCESS; -} - diff --git a/tensor_computing/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h b/tensor_computing/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h deleted file mode 100644 index 25aa85c7..00000000 --- a/tensor_computing/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#ifndef _BILATERAL_SLICE_APPLY_MALI_UCHAR -#define _BILATERAL_SLICE_APPLY_MALI_UCHAR -#include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" -#include "tensor_computing_type.h" - -EE bilateral_slice_apply_mali_uchar(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc guideDesc, - const GCLMem_t guide, - TensorDesc gridDesc, - const GCLMem_t grid, - BilateralSliceApplyDesc bilateralSliceApplyDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output); -#endif - diff --git a/tensor_computing/src/library_algorithm_search.cpp b/tensor_computing/src/library_algorithm_search.cpp deleted file mode 100644 index 442f3864..00000000 --- a/tensor_computing/src/library_algorithm_search.cpp +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifdef _USE_LIBRARY_TUNING
-#include <map>
-#include <atomic>
-#include <iostream>
-#include "tensor_desc.h"
-#include "tensor_computing_library_algorithm_search.h"
-
-std::map<std::string, int> libraryAlgorithmMap;
-std::atomic<int> libraryAlgorithmMapLoadFlag(0);
-std::map<std::string, int> libraryAlgorithmParameters = {
-    {"convolution_ic_step", 16},
-    {"convolution_ic_max", 640},
-    {"convolution_ih_step", 16},
-    {"convolution_ih_max", 230},
-    {"convolution_fn_step", 16},
-    {"convolution_fn_max", 320},
-    {"convolution_fh_step", 1},
-    {"convolution_fh_max", 11}
-};
-
-void loadLibraryAlgorithmMapFromTxt() {
-    if (atomic_exchange(&libraryAlgorithmMapLoadFlag, 1))
-        return;
-    char* algorithmMapPath = getenv("Bolt_TensorComputing_LibraryAlgoritmMap");
-    if (algorithmMapPath == NULL || std::string(algorithmMapPath) == std::string("")) {
-        std::cerr << "[ERROR] need to set shell environment variable Bolt_TensorComputing_LibraryAlgoritmMap"
-            " to the library algorithm map file generated by tensor_computing_library_search program"
-            " when use CONVOLUTION_LIBRARY_SEARCH policy" << std::endl;
-        exit(1);
-    }
-    FILE *file = fopen(algorithmMapPath, "r");
-    if (!file || feof(file))
-        return;
-    int num = 0;
-    fscanf(file, "%d", &num);
-    char operatorName[100];
-    int algorithm;
-    for (int i = 0; i < num; i++) {
-        fscanf(file, "%s %d", operatorName, &algorithm);
-        libraryAlgorithmMap[operatorName] = algorithm;
-    }
-    fclose(file);
-}
-
-void saveLibraryAlgorithmMapToTxt() {
-    char* algorithmMapPath = getenv("Bolt_TensorComputing_LibraryAlgoritmMap");
-    if (algorithmMapPath == NULL || std::string(algorithmMapPath) == std::string("")) {
-        std::cerr << "[ERROR] need to set shell environment variable Bolt_TensorComputing_LibraryAlgoritmMap"
-            " to save the search result" << std::endl;
-        exit(1);
-    }
-
-    FILE *file = fopen(algorithmMapPath, "w");
-    fprintf(file, "%ld\n", (I64)(libraryAlgorithmMap.size()));
-    for (auto iter: libraryAlgorithmMap) {
-        fprintf(file, "%s %d\n", iter.first.c_str(), iter.second);
-    }
-    fclose(file);
-}
-
-std::string getConvolutionAlgorithmMapNameFromInput(TensorDesc inputDesc,
-    TensorDesc filterDesc,
-    ConvolutionDesc convDesc,
-    DataType targetDataType)
-{
-    std::string name = "";
-    switch (targetDataType) {
-        case DT_F32:
-            name = name + "_float32";
-            break;
-        case DT_F16:
-            name = name + "_float16";
-            break;
-        case DT_I8:
-            name = name + "_int8";
-            break;
-        case DT_BIN01:
-            name = name + "_bnn01";
-            break;
-        case DT_BIN11:
-            name = name + "_bnn11";
-            break;
-        default:
-            std::cerr << "[ERROR] unsupported data type in " << __func__ << std::endl;
-            exit(1);
-    }
-    name = name + "_in1"
-        + "c" + std::to_string(inputDesc.dims[2]/libraryAlgorithmParameters["convolution_ic_step"])
-        + "h" + std::to_string(inputDesc.dims[1]/libraryAlgorithmParameters["convolution_ih_step"])
-        + "w" + std::to_string(inputDesc.dims[1]/libraryAlgorithmParameters["convolution_ih_step"])
-        + "fn" + std::to_string(filterDesc.dims[3]/libraryAlgorithmParameters["convolution_fn_step"])
-        + "c" + std::to_string(inputDesc.dims[2]/libraryAlgorithmParameters["convolution_ic_step"])
-        + "h" + std::to_string(filterDesc.dims[1]/libraryAlgorithmParameters["convolution_fh_step"])
-        + "w" + std::to_string(filterDesc.dims[1]/libraryAlgorithmParameters["convolution_fh_step"])
-        + "sh" + std::to_string(convDesc.stride_h)
-        + "w" + std::to_string(convDesc.stride_w)
-        + "pt" + std::to_string(convDesc.padding_top)
-        + "b" + std::to_string(convDesc.padding_bottom)
-        + "l" + std::to_string(convDesc.padding_left)
-        + "r" + std::to_string(convDesc.padding_right);
-
-    return name;
-}
-#endif
diff --git a/tensor_computing/src/lstm.cpp b/tensor_computing/src/lstm.cpp
deleted file mode 100644
index 9cca196a..00000000
--- a/tensor_computing/src/lstm.cpp
+++ /dev/null
@@ -1,176 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
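The deleted loadLibraryAlgorithmMapFromTxt/saveLibraryAlgorithmMapToTxt pair above round-trips a plain text format: an entry count on the first line, then one "operatorName algorithm" pair per line. For inspecting a tuning file outside of Bolt, a minimal standalone reader of that format could look as follows (a sketch; readAlgorithmMap and its bounds checks are illustrative, not part of Bolt's API):

    #include <cstdio>
    #include <map>
    #include <string>

    // Sketch: parse the "count\nname id\n..." layout that the deleted
    // load/save pair above reads and writes. Returns an empty map on failure.
    std::map<std::string, int> readAlgorithmMap(const char *path)
    {
        std::map<std::string, int> result;
        FILE *file = fopen(path, "r");
        if (file == NULL) {
            return result;
        }
        int num = 0;
        if (fscanf(file, "%d", &num) == 1) {
            char name[100];
            int algorithm = 0;
            for (int i = 0; i < num; i++) {
                if (fscanf(file, "%99s %d", name, &algorithm) != 2) {
                    break;
                }
                result[name] = algorithm;
            }
        }
        fclose(file);
        return result;
    }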
- - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif - -EE lstm_transform_filter(TensorDesc filterDesc, const void* filter, - LSTMDesc lstmDesc, - TensorDesc *ftmDesc, void* filterTransformed, - Arch arch) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL || arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { -#if defined(_USE_GENERAL) || defined(_USE_NEON) - ret = lstm_transform_filter_arm(filterDesc, filter, lstmDesc, ftmDesc, filterTransformed); -#endif - } - return ret; -} - -EE lstm_transform_filter_bytes(TensorDesc filterDesc, LSTMDesc lstmDesc, U32* bytes, Arch arch) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL || arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { -#if defined(_USE_GENERAL) || defined(_USE_NEON) - ret = lstm_transform_filter_bytes_arm(filterDesc, lstmDesc, bytes); -#endif - } - return ret; -} - -EE lstm_infer_output_size(TensorDesc inputDesc, TensorDesc filterDesc, - LSTMDesc lstmDesc, - TensorDesc* outputDesc, U32* outputBytes) -{ - UNUSED(filterDesc); - - if (nullptr == outputDesc || nullptr == outputBytes) - CHECK_STATUS(NULL_POINTER); - DataType idt; - DataFormat idf; - U32 batch, step, xDim; - CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &batch, &step, &xDim)); - U32 num = (lstmDesc.biDirection) ? 2 : 1; - U32 hDim = num * lstmDesc.numOutput; - *outputDesc = tensor3df(idt, idf, batch, step, hDim); - *outputBytes = batch * step * hDim * bytesOf(idt); - return SUCCESS; -} - -EE lstm_infer_forward_tmp_bytes(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - LSTMDesc lstmDesc, - U32 *bytes, Arch arch) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL || arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { -#if defined(_USE_GENERAL) || defined(_USE_NEON) - ret = lstm_infer_forward_tmp_bytes_arm(inputDesc, filterDesc, outputDesc, lstmDesc, bytes, arch); -#endif - } - return ret; -} - -EE lstm(TensorDesc inputDesc, const void* input, - TensorDesc filterDesc, const void* filter, - TensorDesc biasDesc, const void* bias, - U32 tmpBytes, void* tmp, - LSTMDesc lstmDesc, - TensorDesc outputDesc, void* output, - Arch arch) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = lstm_general(inputDesc, input, - filterDesc, filter, - biasDesc, bias, - tmpBytes, tmp, - lstmDesc, - outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = lstm_arm(inputDesc, input, - filterDesc, filter, - biasDesc, bias, - tmpBytes, tmp, - lstmDesc, - outputDesc, output, - arch); -#endif - } - return ret; -} - -EE lstmcell_infer_output_size(TensorDesc inputDesc, TensorDesc filterDesc, - LSTMDesc lstmDesc, - TensorDesc* outputDesc, U32* outputBytes) -{ - UNUSED(filterDesc); - - if (nullptr == outputDesc || nullptr == outputBytes) - CHECK_STATUS(NULL_POINTER); - DataType idt; - DataFormat idf; - U32 batch, xDim; - CHECK_STATUS(tensor2dfGet(inputDesc, &idt, &idf, &batch, &xDim)); - U32 hDim = lstmDesc.numOutput; - *outputDesc = tensor2df(idt, idf, batch, hDim); - *outputBytes = batch * hDim * bytesOf(idt); - return SUCCESS; -} - -EE lstmcell_infer_forward_tmp_bytes(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - LSTMDesc lstmDesc, - U32 *bytes, Arch arch) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL || arch == ARM_A55 
|| arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { -#if defined(_USE_GENERAL) || defined(_USE_NEON) - ret = lstmcell_infer_forward_tmp_bytes_arm(inputDesc, filterDesc, outputDesc, lstmDesc, bytes, arch); -#endif - } - return ret; -} - -EE lstmcell(TensorDesc xDesc, const void* currentX, - TensorDesc filterDesc, const void* filter, - TensorDesc biasDesc, const void* bias, - void *state, - U32 tmpBytes, void *tmp, - LSTMDesc lstmDesc, U32 batchStrideX, U32 batchStrideH, - TensorDesc hDesc, void* currentH, - Arch arch) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = lstmcell_general(xDesc, currentX, - filterDesc, filter, - biasDesc, bias, - state, - tmpBytes, tmp, - lstmDesc, batchStrideX, batchStrideH, - hDesc, currentH); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = lstmcell_arm(xDesc, currentX, - filterDesc, filter, - biasDesc, bias, - state, - tmpBytes, tmp, - lstmDesc, batchStrideX, batchStrideH, - hDesc, currentH, - arch); -#endif - } - return ret; -} diff --git a/tensor_computing/src/matmul.cpp b/tensor_computing/src/matmul.cpp deleted file mode 100644 index 53a6f5bf..00000000 --- a/tensor_computing/src/matmul.cpp +++ /dev/null @@ -1,298 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
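A note on the shape logic in the lstm.cpp deletion above: lstm_infer_output_size maps a (batch, step, xDim) input to (batch, step, hDim), where hDim is lstmDesc.numOutput doubled when biDirection is set; lstmcell_infer_output_size is the single-step analogue without the step axis. A worked sketch of just that rule, using plain integers instead of Bolt's TensorDesc:

    #include <cstdio>

    // Sketch of the rule in lstm_infer_output_size: hDim = numOutput * (bi ? 2 : 1);
    // batch and step pass through unchanged. Values below are illustrative.
    int main()
    {
        unsigned batch = 1, step = 32, xDim = 128;
        unsigned numOutput = 256;
        bool biDirection = true;
        unsigned hDim = (biDirection ? 2 : 1) * numOutput;
        printf("input  : %u x %u x %u\n", batch, step, xDim); // 1 x 32 x 128
        printf("output : %u x %u x %u\n", batch, step, hDim); // 1 x 32 x 512
        return 0;
    }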
-
-#include "tensor_computing.h"
-#include "blas-enhance.h"
-#include <string.h>
-#ifdef _USE_MALI
-#include "gpu/mali/tensor_computing_mali.h"
-#endif
-
-EE matmul_infer_output_size_cpu(TensorDesc matrixADesc, bool transposeA,
-    TensorDesc matrixBDesc, bool transposeB,
-    TensorDesc *matrixCDesc)
-{
-    if (matrixCDesc == nullptr)
-        CHECK_STATUS(NULL_POINTER);
-
-    if (DT_I8 == matrixADesc.dt || DT_I8 == matrixBDesc.dt) {
-        matrixADesc.dt = DT_I8;
-        matrixBDesc.dt = DT_I8;
-    }
-
-    if (matrixADesc.dt != matrixBDesc.dt
-        || matrixADesc.nDims < 2) {
-        CHECK_STATUS(NOT_MATCH);
-    }
-
-    if (DF_NCHWC8 == matrixADesc.df && 4 == matrixADesc.nDims) {
-        CHECK_REQUIREMENT(1 == matrixADesc.dims[1] && 1 == matrixADesc.dims[0]);
-    }
-
-    if (DF_NCHWC8 == matrixBDesc.df && 4 == matrixBDesc.nDims) {
-        CHECK_REQUIREMENT(1 == matrixBDesc.dims[1] && 1 == matrixBDesc.dims[0]);
-    }
-
-    int i = 0;
-    int j = 0;
-    int dimA = matrixADesc.nDims;
-    int dimB = matrixBDesc.nDims;
-    while (i < dimA-2 || j < dimB-2) {
-        if (matrixADesc.dims[dimA-1-i] != matrixBDesc.dims[dimB-1-j]) {
-            if (matrixADesc.dims[dimA-1-i] == 1) {
-                i++;
-                continue;
-            }
-            if(matrixBDesc.dims[dimB-1-j] == 1) {
-                j++;
-                continue;
-            }
-            CHECK_STATUS(NOT_MATCH);
-        }
-        else {
-            i++;
-            j++;
-        }
-    }
-    if (i != dimA-2 || j != dimB-2)
-        CHECK_STATUS(NOT_MATCH);
-
-    U32 kDimA, kDimB;
-    if (transposeA) {
-        kDimA = 1;
-    } else {
-        kDimA = 0;
-    }
-    if (transposeB) {
-        kDimB = 0;
-    } else {
-        kDimB = 1;
-    }
-
-    if (matrixADesc.dims[kDimA] != matrixBDesc.dims[kDimB]) {
-        CHECK_STATUS(NOT_MATCH);
-    }
-
-    *matrixCDesc = matrixADesc;
-    (*matrixCDesc).dims[kDimA] = matrixBDesc.dims[1-kDimB];
-    if (transposeA) {
-        U32 tmp = (*matrixCDesc).dims[0];
-        (*matrixCDesc).dims[0] = (*matrixCDesc).dims[1];
-        (*matrixCDesc).dims[1] = tmp;
-    }
-    return SUCCESS;
-}
-
-EE matmul_infer_output_size(TensorDesc matrixADesc, bool transposeA,
-    TensorDesc matrixBDesc, bool transposeB,
-    TensorDesc *matrixCDesc, Arch arch, ExtInfo_t extInfo)
-{
-#ifdef _USE_MALI
-    if(arch == MALI) {
-        GCLMemDesc_t gclmemMatrixADesc = NULL;
-        GCLMemDesc_t gclmemMatrixBDesc = NULL;
-        GCLMemDesc_t gclmemMatrixCDesc = NULL;
-        if(extInfo->maliInfo.gclmemInputDesc) {
-            gclmemMatrixADesc = &extInfo->maliInfo.gclmemInputDesc[0];
-            gclmemMatrixBDesc = &extInfo->maliInfo.gclmemInputDesc[1];
-        }
-        if(extInfo->maliInfo.gclmemOutputDesc) gclmemMatrixCDesc = extInfo->maliInfo.gclmemOutputDesc;
-        CHECK_STATUS(matmul_infer_output_size_mali(matrixADesc, transposeA, matrixBDesc, transposeB, matrixCDesc, gclmemMatrixADesc, gclmemMatrixBDesc, gclmemMatrixCDesc, extInfo->maliInfo.forwardRunInfo));
-    } else {
-#endif
-        UNUSED(arch);
-        UNUSED(extInfo);
-        CHECK_STATUS(matmul_infer_output_size_cpu(matrixADesc, transposeA, matrixBDesc, transposeB, matrixCDesc));
-#ifdef _USE_MALI
-    }
-#endif
-    return SUCCESS;
-}
-
-EE matmul_infer_forward_algorithm(TensorDesc matrixADesc, bool transposeA, TensorDesc matrixBDesc, bool transposeB, TensorDesc matrixCDesc, Arch arch, ExtInfo_t extInfo) {
-#ifdef _USE_MALI
-    if(arch == MALI) {
-        CHECK_STATUS(matmul_infer_forward_algorithm_mali(extInfo->maliInfo.handle, matrixADesc, transposeA, matrixBDesc, transposeB, matrixCDesc, extInfo->maliInfo.forwardRunInfo));
-    } else {
-#endif
-        return NOT_SUPPORTED;
-#ifdef _USE_MALI
-    }
-#endif
-    return SUCCESS;
-}
-
-EE matmul_infer_forward_tmp_bytes(TensorDesc matrixADesc, bool transposeA,
-    TensorDesc matrixBDesc, bool transposeB,
-    U32 *bytes, Arch arch, ExtInfo_t extInfo)
-{
-    if (bytes == nullptr)
-        CHECK_STATUS(NULL_POINTER);
-#ifdef _USE_MALI
-    if(arch ==
MALI) { - CHECK_STATUS(matmul_infer_forward_tmp_bytes_mali(matrixADesc, transposeA, matrixBDesc, transposeB, bytes, extInfo->maliInfo.forwardRunInfo)); - return SUCCESS; - } -#endif - bool quantA = false; - bool quantB = false; - if (DT_I8 == matrixADesc.dt || DT_I8 == matrixBDesc.dt) { - if (DT_F16 == matrixADesc.dt) { - quantA = true; - matrixADesc.dt = DT_I8; - } - - if (DT_F16 == matrixBDesc.dt) { - quantB = true; - matrixBDesc.dt = DT_I8; - } - } - - EE ret = SUCCESS; - U32 kDimA, kDimB; - DataFormat dataFormatA, dataFormatB; - if (transposeA) { - kDimA = 1; - dataFormatA = DF_TRANSPOSE; - } else { - kDimA = 0; - dataFormatA = DF_NORMAL; - } - if (transposeB) { - kDimB = 0; - dataFormatB = DF_TRANSPOSE; - } else { - kDimB = 1; - dataFormatB = DF_NORMAL; - } - if (matrixADesc.dims[1-kDimA] == 1 || matrixBDesc.dims[1-kDimB] == 1) { - TensorDesc matrixDesc, vectorDesc; - if (matrixADesc.dims[1-kDimA] == 1) { - matrixDesc = tensor2df(matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); - vectorDesc = tensor1d(matrixADesc.dt, matrixADesc.dims[kDimA]); - } else { - matrixDesc = tensor2df(matrixADesc.dt, dataFormatA, matrixADesc.dims[1], matrixADesc.dims[0]); - vectorDesc = tensor1d(matrixBDesc.dt, matrixBDesc.dims[kDimB]); - } - ret = matrix_vector_multiply_tmp_bytes(matrixDesc, vectorDesc, bytes, arch); - } else { - TensorDesc matrixA2DDesc = tensor2df(matrixADesc.dt, dataFormatA, matrixADesc.dims[1], matrixADesc.dims[0]); - TensorDesc matrixB2Ddesc = tensor2df(matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); - ret = matrix_matrix_multiply_tmp_bytes(matrixA2DDesc, matrixB2Ddesc, bytes, arch); - } - - if (quantA) { - *bytes += tensorNumBytes(matrixADesc); - } - if (quantB) { - *bytes += tensorNumBytes(matrixBDesc); - } - return ret; -} - -EE matmul(TensorDesc matrixADesc, bool transposeA, const void* matrixA, - TensorDesc matrixBDesc, bool transposeB, const void* matrixB, - void* tmp, U32 bytes, - TensorDesc matrixCDesc, void* matrixC, - Arch arch, ExtInfo_t extInfo) -{ - if (matrixA == nullptr || matrixB == nullptr || matrixC == nullptr) { - CHECK_STATUS(NULL_POINTER); - } -#ifdef _USE_MALI - if(arch == MALI) { - CHECK_STATUS(matmul_mali(extInfo->maliInfo.handle, matrixADesc, transposeA, (GCLMem_t)matrixA, matrixBDesc, transposeB, (GCLMem_t)matrixB, (GCLMem_t) tmp, matrixCDesc, (GCLMem_t)matrixC, extInfo->maliInfo.forwardRunInfo)); - return SUCCESS; - } -#else - UNUSED(extInfo); -#endif - - U32 sizeA = tensorNumElements(matrixADesc); - U32 loops = sizeA / (matrixADesc.dims[1] * matrixADesc.dims[0]); - U32 kDimA, kDimB; - if (transposeA) { - kDimA = 1; - } else { - kDimA = 0; - } - if (transposeB) { - kDimB = 0; - } else { - kDimB = 1; - } - - U32 matrixA2DBytes = (matrixADesc.dims[1] * matrixADesc.dims[0]) * bytesOf(matrixADesc.dt); - U32 matrixB2DBytes = (matrixBDesc.dims[1] * matrixBDesc.dims[0]) * bytesOf(matrixBDesc.dt); - U32 matrixC2DBytes = (matrixCDesc.dims[1] * matrixCDesc.dims[0]) * bytesOf(matrixCDesc.dt); - U8* matrixAPtr = (U8 *)matrixA; - U8* matrixBPtr = (U8 *)matrixB; - U8* matrixCPtr = (U8 *)matrixC; - memset(matrixC, 0, tensorNumBytes(matrixCDesc)); - for (U32 i = 0; i < loops; i++) { - if (matrixADesc.dims[1-kDimA] == 1) { - TensorDesc matrixA1DDesc = tensor1d(matrixADesc.dt, matrixADesc.dims[kDimA]); - TensorDesc matrixB2DDesc; - if (transposeB) { - matrixB2DDesc = tensor2df(matrixBDesc.dt, DF_NORMAL, matrixBDesc.dims[1], matrixBDesc.dims[0]); - } else { - matrixB2DDesc = tensor2df(matrixBDesc.dt, DF_TRANSPOSE, 
matrixBDesc.dims[0], matrixBDesc.dims[1]); - } - TensorDesc matrixC1DDesc = tensor1d(matrixCDesc.dt, matrixCDesc.dims[0]); - CHECK_STATUS(matrix_vector_multiply( - matrixB2DDesc, matrixBPtr, - matrixA1DDesc, matrixAPtr, - bytes, tmp, - matrixC1DDesc, matrixCPtr, arch)); - } else { - if (matrixBDesc.dims[1-kDimB] == 1) { - TensorDesc matrixA2DDesc; - if (transposeA) { - matrixA2DDesc = tensor2df(matrixADesc.dt, DF_TRANSPOSE, matrixADesc.dims[0], matrixADesc.dims[1]); - } else { - matrixA2DDesc = tensor2df(matrixADesc.dt, DF_NORMAL, matrixADesc.dims[1], matrixADesc.dims[0]); - } - TensorDesc matrixB1DDesc = tensor1d(matrixBDesc.dt, matrixBDesc.dims[kDimB]); - TensorDesc matrixC1DDesc = tensor1d(matrixCDesc.dt, matrixCDesc.dims[1]); - CHECK_STATUS(matrix_vector_multiply(matrixA2DDesc, matrixAPtr, - matrixB1DDesc, matrixBPtr, - bytes, tmp, - matrixC1DDesc, matrixCPtr, arch)); - } else { - DataFormat dataFormatA, dataFormatB; - if (transposeA) { - dataFormatA = DF_TRANSPOSE; - } else { - dataFormatA = DF_NORMAL; - } - if (transposeB) { - dataFormatB = DF_TRANSPOSE; - } else { - dataFormatB = DF_NORMAL; - } - TensorDesc matrixA2DDesc = tensor2df(matrixADesc.dt, dataFormatA, matrixADesc.dims[1], matrixADesc.dims[0]); - TensorDesc matrixB2DDesc = tensor2df(matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); - TensorDesc matrixC2DDesc = tensor2df(matrixCDesc.dt, DF_NORMAL, matrixCDesc.dims[1], matrixCDesc.dims[0]); - CHECK_STATUS(matrix_matrix_multiply(matrixA2DDesc, matrixAPtr, - matrixB2DDesc, matrixBPtr, - bytes, tmp, - matrixC2DDesc, matrixCPtr, arch)); - } - } - matrixAPtr += matrixA2DBytes; - matrixBPtr += matrixB2DBytes; - matrixCPtr += matrixC2DBytes; - } - return SUCCESS; -} diff --git a/tensor_computing/src/multiply.cpp b/tensor_computing/src/multiply.cpp deleted file mode 100644 index 20847067..00000000 --- a/tensor_computing/src/multiply.cpp +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
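A note on the matmul.cpp deletion above: matmul treats the trailing two dims of each operand as a 2-D matrix and loops over the leading (batch) dims, and per slice it degenerates to matrix_vector_multiply whenever the non-reduction dim of A or B is 1; in the A-is-vector case the operands are swapped and B's descriptor is re-tagged (DF_NORMAL vs DF_TRANSPOSE flipped) so the vector always sits on the right. A condensed sketch of only that dispatch decision, with dims[1]/dims[0] read as (rows, cols) per Bolt's convention:

    #include <cstdio>

    // Condensed sketch of the per-slice dispatch in the deleted matmul():
    // kDim is the reduction axis selected by the transpose flag; when the
    // opposite axis has extent 1, the product collapses to matrix * vector.
    void dispatchSlice(const unsigned dimsA[2], bool transposeA,
                       const unsigned dimsB[2], bool transposeB)
    {
        unsigned kDimA = transposeA ? 1u : 0u;
        unsigned kDimB = transposeB ? 0u : 1u;
        if (dimsA[1 - kDimA] == 1) {
            printf("matrix_vector_multiply(B, A)\n"); // A is effectively a vector
        } else if (dimsB[1 - kDimB] == 1) {
            printf("matrix_vector_multiply(A, B)\n"); // B is effectively a vector
        } else {
            printf("matrix_matrix_multiply(A, B)\n");
        }
    }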
-
-
-#include "tensor_computing.h"
-#ifdef _USE_GENERAL
-#include "cpu/general/tensor_computing_general.h"
-#endif
-#ifdef _USE_NEON
-#include "cpu/arm/tensor_computing_arm.h"
-#endif
-#ifdef _USE_MALI
-#include "gpu/mali/tensor_computing_mali.h"
-#endif
-
-inline EE multiply_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc)
-{
-    if (nullptr == outputDesc) CHECK_STATUS(NULL_POINTER);
-    *outputDesc = inputDesc;
-    return SUCCESS;
-}
-
-EE multiply_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo)
-{
-#ifdef _USE_MALI
-    if(arch == MALI){
-        CHECK_STATUS(multiply_infer_output_size_mali(inputDesc, outputDesc, extInfo->maliInfo.gclmemInputDesc, extInfo->maliInfo.gclmemOutputDesc));
-    } else {
-#endif
-        UNUSED(arch);
-        UNUSED(extInfo);
-        CHECK_STATUS(multiply_infer_output_size_cpu(inputDesc, outputDesc));
-#ifdef _USE_MALI
-    }
-#endif
-    return SUCCESS;
-}
-
-EE multiply(void *alpha, void *beta, TensorDesc inputDesc, void* input, TensorDesc outputDesc, void *output, Arch arch, ExtInfo_t extInfo)
-{
-    EE ret = SUCCESS;
-    if (arch == CPU_GENERAL) {
-#ifdef _USE_GENERAL
-        ret = multiply_general(alpha, beta, inputDesc, input, outputDesc, output);
-#endif
-#ifdef _USE_NEON
-    } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) {
-        ret = multiply_arm(alpha, beta, inputDesc, input, outputDesc, output);
-#endif
-#ifdef _USE_MALI
-    } else if(arch == MALI) {
-        ret = multiply_mali(extInfo->maliInfo.handle, alpha, beta, inputDesc, (GCLMem_t)input, outputDesc, (GCLMem_t)output);
-#endif
-    }
-    return ret;
-}
-
diff --git a/tensor_computing/src/normalization.cpp b/tensor_computing/src/normalization.cpp
deleted file mode 100644
index 9a78c393..00000000
--- a/tensor_computing/src/normalization.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
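The multiply wrappers above only dispatch; the kernel bodies are not part of this diff. Reading the (alpha, beta) pair conventionally, the op computes y[i] = alpha * x[i] + beta elementwise, with the scalars passed as void* so one signature serves every precision; that reading is an assumption here, not something this diff states. An F32-flavoured sketch:

    // Assumed semantics of multiply(): y = alpha * x + beta, elementwise.
    // multiplyF32 is an illustrative name, not a Bolt function.
    void multiplyF32(const float *alpha, const float *beta,
                     const float *x, float *y, unsigned len)
    {
        for (unsigned i = 0; i < len; i++) {
            y[i] = (*alpha) * x[i] + (*beta);
        }
    }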
- - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -EE layer_normalization(void *alpha, void *beta, - TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output, - Arch arch, ExtInfo_t extInfo) -{ -#ifndef _USE_MALI - UNUSED(extInfo); -#endif - EE ret = SUCCESS; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = layer_normalization_general(alpha, beta, inputDesc, input, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = layer_normalization_arm(alpha, beta, inputDesc, input, outputDesc, output); -#endif -#ifdef _USE_MALI - } else if (arch == MALI){ - ret = layer_normalization_mali(extInfo->maliInfo.handle, (GCLMem_t)alpha, (GCLMem_t)beta, inputDesc, (GCLMem_t)input, outputDesc, (GCLMem_t)output); -#endif - } - return ret; -} - -EE normalization_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo) -{ -#ifdef _USE_MALI - if(arch == MALI) { - CHECK_STATUS(normalization_infer_output_size_mali(inputDesc, outputDesc, extInfo->maliInfo.gclmemInputDesc, extInfo->maliInfo.gclmemOutputDesc)); - } else { -#endif - if (nullptr == outputDesc) CHECK_STATUS(NULL_POINTER); - *outputDesc = inputDesc; -#ifdef _USE_MALI - } -#endif - return SUCCESS; -} diff --git a/tensor_computing/src/padding.cpp b/tensor_computing/src/padding.cpp deleted file mode 100644 index 52d69fb9..00000000 --- a/tensor_computing/src/padding.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
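The layer_normalization wrapper above follows the same dispatch idiom as every file in this batch: initialize ret to NOT_SUPPORTED, branch on Arch, and compile each branch (including its closing brace) only under the matching feature macro, so a build without a given backend falls through and returns the error code instead of referencing missing symbols. The skeleton, with some_op as a placeholder name:

    // Skeleton of the Arch-dispatch idiom used throughout these deleted
    // wrappers; some_op_* are placeholders for the per-backend kernels.
    EE some_op(Arch arch /* , tensor arguments... */)
    {
        EE ret = NOT_SUPPORTED;
        if (arch == CPU_GENERAL) {
    #ifdef _USE_GENERAL
            ret = some_op_general(/* ... */);
    #endif
    #ifdef _USE_NEON
        } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) {
            ret = some_op_arm(/* ... */);
    #endif
    #ifdef _USE_MALI
        } else if (arch == MALI) {
            ret = some_op_mali(/* ... */);
    #endif
        }
        return ret;
    }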
- - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -EE padding_infer_output_size(TensorDesc inputDesc, PadDesc padDesc, TensorDesc* outputDesc) -{ - if (nullptr == outputDesc) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt; - DataFormat idf; - U32 in, ic, ih, iw; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - int out_n = in; - int out_c = ic; - int out_h = ih + padDesc.top + padDesc.bottom; - int out_w = iw + padDesc.left + padDesc.right; - *outputDesc = tensor4df(idt, idf, out_n, out_c, out_h, out_w); - return SUCCESS; -} - -EE padding(TensorDesc inputDesc, const void* input, PadDesc padDesc, TensorDesc outputDesc, void* output, Arch arch) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = padding_general(inputDesc, input, padDesc, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = padding_arm(inputDesc, input, padDesc, outputDesc, output); -#endif - } - return ret; -} diff --git a/tensor_computing/src/pooling.cpp b/tensor_computing/src/pooling.cpp deleted file mode 100644 index e2d6e488..00000000 --- a/tensor_computing/src/pooling.cpp +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
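For the padding.cpp deletion above, the output-size rule is pure arithmetic on the spatial dims: oh = ih + top + bottom and ow = iw + left + right, with N and C untouched; e.g. a 1x3x224x224 NCHW tensor padded by 1 on every side becomes 1x3x226x226. A one-line sketch of the per-dimension rule:

    // Per-dimension rule of padding_infer_output_size above:
    // e.g. padOut(224, 1, 1) == 226. padOut is an illustrative helper.
    unsigned padOut(unsigned dim, unsigned before, unsigned after)
    {
        return dim + before + after;
    }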
- - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -inline EE pooling_infer_output_size_cpu(TensorDesc inputDesc, PoolingDesc poolingDesc, TensorDesc* outputDesc) -{ - if (nullptr == outputDesc) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt; - DataFormat idf; - U32 in, ic, ih, iw; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - U32 strideH = poolingDesc.stride_h; - U32 strideW = poolingDesc.stride_w; - U32 paddingT = poolingDesc.padding_top; - U32 paddingB = poolingDesc.padding_bottom; - U32 paddingL = poolingDesc.padding_left; - U32 paddingR = poolingDesc.padding_right; - U32 kernelSizeH = poolingDesc.kernelSize_h; - U32 kernelSizeW = poolingDesc.kernelSize_w; - RoundMode rm = poolingDesc.rm; - U32 oh = 0, ow = 0; - switch (rm) { - case CEIL: { - oh = (U32)(ceil((double(ih + paddingT + paddingB - kernelSizeH) / strideH))) + 1; - ow = (U32)(ceil((double(iw + paddingL + paddingR - kernelSizeW) / strideW))) + 1; - break; - } - case FLOOR: { - oh = (U32)(floor((double(ih + paddingT + paddingB - kernelSizeH) / strideH))) + 1; - ow = (U32)(floor((double(iw + paddingL + paddingR - kernelSizeW) / strideW))) + 1; - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - *outputDesc = tensor4df(idt, idf, in, ic, oh, ow); - return SUCCESS; -} -EE pooling_infer_output_size(TensorDesc inputDesc, PoolingDesc poolingDesc, TensorDesc* outputDesc, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (0 == poolingDesc.kernelSize_h && 0 == poolingDesc.kernelSize_w) { // Global pooling - CHECK_REQUIREMENT(4 == inputDesc.nDims); - poolingDesc.kernelSize_h = inputDesc.dims[1]; - poolingDesc.kernelSize_w = inputDesc.dims[0]; - } - if(arch == MALI){ -#ifdef _USE_MALI - ret = pooling_infer_output_size_mali(inputDesc, poolingDesc, outputDesc, extInfo->maliInfo.gclmemInputDesc, extInfo->maliInfo.gclmemOutputDesc); -#endif - } else { - ret = pooling_infer_output_size_cpu(inputDesc, poolingDesc, outputDesc); - } - return ret; -} - -EE pooling(TensorDesc inputDesc, const void* input, PoolingDesc poolingDesc, const void* scale, TensorDesc outputDesc, void* output, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (0 == poolingDesc.kernelSize_h && 0 == poolingDesc.kernelSize_w) { // Global pooling - CHECK_REQUIREMENT(4 == inputDesc.nDims); - poolingDesc.kernelSize_h = inputDesc.dims[1]; - poolingDesc.kernelSize_w = inputDesc.dims[0]; - } - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = pooling_general(inputDesc, input, - poolingDesc, - outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = pooling_arm(inputDesc, input, - poolingDesc, scale, - outputDesc, output); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = pooling_mali(extInfo->maliInfo.handle, inputDesc, (const GCLMem_t)input, poolingDesc, scale, outputDesc, (GCLMem_t)output); -#endif - } - return ret; -} diff --git a/tensor_computing/src/priorbox.cpp b/tensor_computing/src/priorbox.cpp deleted file mode 100644 index ee37080c..00000000 --- a/tensor_computing/src/priorbox.cpp +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#include "tensor_computing.h"
-#ifdef _USE_GENERAL
-#include "cpu/general/tensor_computing_general.h"
-#endif
-#ifdef _USE_NEON
-#include "cpu/arm/tensor_computing_arm.h"
-#endif
-
-inline EE priorbox_infer_output_size_cpu(std::vector<TensorDesc> inputDesc, PriorBoxDesc priorboxDesc, TensorDesc* outputDesc)
-{
-    if (nullptr == outputDesc) {
-        CHECK_STATUS(NULL_POINTER);
-    }
-    DataType idt;
-    DataFormat idf;
-    U32 in, ic, ih, iw;
-    CHECK_STATUS(tensor4dGet(inputDesc[0], &idt, &idf, &in, &ic, &ih, &iw));
-    CHECK_REQUIREMENT(!priorboxDesc.aspect_ratios.empty());
-    U32 num_priorboxs = priorboxDesc.aspect_ratios.size();
-    if(priorboxDesc.flip){
-        num_priorboxs = num_priorboxs * 2;
-    }
-    CHECK_REQUIREMENT(!priorboxDesc.min_sizes.empty());
-    U32 num_minsize = priorboxDesc.min_sizes.size();
-    num_priorboxs = num_priorboxs * num_minsize + num_minsize;
-    if(!priorboxDesc.max_sizes.empty()){
-        U32 num_maxsize = priorboxDesc.max_sizes.size();
-        CHECK_REQUIREMENT(num_minsize == num_maxsize);
-        num_priorboxs = num_priorboxs + num_maxsize;
-    }
-    DEBUG_info(" Number of priorboxs per pixel : " << num_priorboxs);
-    //on = 1, oc = 2, ol= 4*num_priorboxs*ih*iw
-    *outputDesc = tensor3d(idt, 1, 2, 4*num_priorboxs*ih*iw);
-    return SUCCESS;
-}
-
-EE priorbox_infer_output_size(std::vector<TensorDesc> inputDesc, PriorBoxDesc priorboxDesc, TensorDesc* outputDesc, Arch arch, ExtInfo_t extInfo)
-{
-    UNUSED(arch);
-    UNUSED(extInfo);
-    CHECK_STATUS(priorbox_infer_output_size_cpu(inputDesc, priorboxDesc, outputDesc));
-    return SUCCESS;
-}
-
-EE priorbox(std::vector<TensorDesc> inputDesc, PriorBoxDesc priorboxDesc, TensorDesc outputDesc, void* output, Arch arch, ExtInfo_t extInfo)
-{
-    UNUSED(extInfo);
-    EE ret = NOT_SUPPORTED;
-    if (arch == CPU_GENERAL) {
-#ifdef _USE_GENERAL
-        ret = priorbox_general(inputDesc, priorboxDesc, outputDesc, output);
-#endif
-#ifdef _USE_NEON
-    } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) {
-        ret = priorbox_arm(inputDesc, priorboxDesc, outputDesc, output);
-#endif
-    }
-    return ret;
-}
diff --git a/tensor_computing/src/quantize.cpp b/tensor_computing/src/quantize.cpp
deleted file mode 100644
index 2d12f60d..00000000
--- a/tensor_computing/src/quantize.cpp
+++ /dev/null
@@ -1,269 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - -#include "tensor_computing.h" -#include "cpu/arm/tensor_computing_arm.h" -#ifdef _USE_FP16 -#include "cpu/arm/fp16/arm_functions_fp16.h" -#endif -#ifdef _USE_FP32 -#include "cpu/arm/fp32/arm_functions_fp32.h" -#endif -#define BINS 2048 - -EE quantize_tensor(TensorDesc dDesc, const void* data, TensorDesc* qDesc, void* qData, void *scale) -{ - EE ret = quantize_tensor_arm(dDesc, data, qDesc, qData, scale); - return ret; -} - -#ifdef _USE_INT8 -void dequantize_int8_to_fp16(U32 len, INT8* q, F32 scale, F16* d) -{ - F16 factor = 1 / scale; - int i = 0; - for (; i < ((int)len) - 15; i += 16) { - int8x8_t in0 = vld1_s8(q + i); - int8x8_t in1 = vld1_s8(q + i + 8); - int16x8_t s0 = vmovl_s8(in0); - int16x8_t s1 = vmovl_s8(in1); - float16x8_t f0 = vcvtq_f16_s16(s0); - float16x8_t f1 = vcvtq_f16_s16(s1); - f0 = vmulq_n_f16(f0, factor); - f1 = vmulq_n_f16(f1, factor); - vst1q_f16(d + i, f0); - vst1q_f16(d + i + 8, f1); - } - - for (; i < (int)len; i++) { - d[i] = q[i] * factor; - } -} - -void dequantize_int32_to_fp16(U32 len, I32* q, F32 scale, F16* d, U32 biasLen, F16* biasPtr) -{ - if (0 != biasLen) { - CHECK_REQUIREMENT(nullptr != biasPtr); - CHECK_REQUIREMENT(len % biasLen == 0); - CHECK_REQUIREMENT(biasLen % 4 == 0); - } - float16x4_t bias[4]; - - F32 factor = 1 / scale; - int i = 0; - for (; i < ((int)len) - 15; i += 16) { - int32x4_t in0 = vld1q_s32(q + i); - int32x4_t in1 = vld1q_s32(q + i + 4); - int32x4_t in2 = vld1q_s32(q + i + 8); - int32x4_t in3 = vld1q_s32(q + i + 12); - if (0 != biasLen) { - U32 offset = i % biasLen; - for (U32 j = 0; j < 4; j++) { - bias[j] = vld1_f16(biasPtr + offset); - offset += 4; - if (offset >= biasLen) { - offset = 0; - } - } - } - float32x4_t f0 = vcvtq_f32_s32(in0); - float32x4_t f1 = vcvtq_f32_s32(in1); - float32x4_t f2 = vcvtq_f32_s32(in2); - float32x4_t f3 = vcvtq_f32_s32(in3); - f0 = vmulq_n_f32(f0, factor); - f1 = vmulq_n_f32(f1, factor); - f2 = vmulq_n_f32(f2, factor); - f3 = vmulq_n_f32(f3, factor); - float16x4_t h0 = vcvt_f16_f32(f0); - float16x4_t h1 = vcvt_f16_f32(f1); - float16x4_t h2 = vcvt_f16_f32(f2); - float16x4_t h3 = vcvt_f16_f32(f3); - if (0 != biasLen) { - h0 = vadd_f16(h0, bias[0]); - h1 = vadd_f16(h1, bias[1]); - h2 = vadd_f16(h2, bias[2]); - h3 = vadd_f16(h3, bias[3]); - } - vst1_f16(d + i, h0); - vst1_f16(d + i + 4, h1); - vst1_f16(d + i + 8, h2); - vst1_f16(d + i + 12, h3); - } - - for (; i < (int)len; i++) { - d[i] = q[i] * factor; - if (0 != biasLen) { - d[i] += biasPtr[i % biasLen]; - } - 
}
-    }
-}
-
-void update_histogram(U32 len, const F16* data, int numBins, F32 interval, F32* histo)
-{
-    for (U32 i = 0; i < len; i++) {
-        F32 tmp = data[i];
-        int index = std::floor(std::abs(tmp) / interval);
-        if (index >= numBins) {
-            index = numBins - 1;
-        }
-        histo[index] += 1;
-    }
-}
-
-std::vector<F32> compress_histogram(std::vector<F32> &histogram, F32 numPerBin, F32 last_max)
-{
-    std::vector<F32> newhistogram(2048, 0);
-    for (U32 q = 0; q < std::ceil(2048/numPerBin) ; q++) {
-
-        F32 start = q * numPerBin;
-        F32 end = start + numPerBin;
-        int left = std::ceil(start);
-        if (left > start) {
-            newhistogram[q] += ((F32)left - start) * histogram[left - 1];
-        }
-        if( end <= last_max){
-            int right = std::floor(end);
-            if (right < end) {
-                newhistogram[q] += (end - (F32)right) * histogram[right];
-            }
-
-            for (int k = left; k < right; k++) {
-                newhistogram[q] += histogram[k];
-            }
-        }
-        else{
-            for (int k = left; k < 2048; k++) {
-                newhistogram[q] += histogram[k];
-            }
-        }
-    }
-    histogram.assign(newhistogram.begin(), newhistogram.end());
-    return histogram;
-}
-
-F32 compute_KLD(U32 len, const F32* p, const F32* q)
-{
-    F32 kld = 0;
-
-    for (U32 i = 0; i < len; i++) {
-        if (0 != p[i]) {
-            if (0 == q[i]) {
-                kld += 1;
-            } else {
-                kld += p[i] * std::log(p[i] / q[i]);
-            }
-        }
-    }
-
-    return kld;
-}
-#endif
-
-std::vector<F32> compute_scale_with_KL(std::vector<F32> &histogram, F32 interval)
-{
-    std::vector<F32> scale;
-#ifdef _USE_INT8
-    F32 histoSum = array_sum_f32(histogram.data(), BINS);
-    array_scale_f32(histogram.data(), histogram.data(), BINS, 1 / histoSum, 0);
-
-    F32 minKLD = 2048;
-    int bestThreshold = 128;
-    F32 sumBin = array_sum_f32(histogram.data(), 128);
-    DEBUG_info("First 128 bins contain " << sumBin << " of values");
-    F32 sumOver = 1 - sumBin;
-
-    for (U32 i = 128; i < 2048; i++) {
-        std::vector<F32> clipDist(histogram.begin(), histogram.begin() + i);
-        clipDist[i - 1] += sumOver;
-        sumOver -= histogram[i]; // Prepare for next round
-
-        std::vector<F32> quantDist(128, 0);
-
-        F32 numPerBin = (F32)i / 128.0;
-
-        for (U32 j = 0; j < 128; j++) {
-            F32 start = j * numPerBin;
-            F32 end = start + numPerBin;
-
-            int left = std::ceil(start);
-            if (left > start) {
-                quantDist[j] += ((F32)left - start) * histogram[left - 1];
-            }
-
-            int right = std::floor(end);
-            if (right < end) {
-                quantDist[j] += (end - (F32)right) * histogram[right];
-            }
-
-            for (int k = left; k < right; k++) {
-                quantDist[j] += histogram[k];
-            }
-        }
-
-        std::vector<F32> qExpand(i, 0);
-
-        for (U32 j = 0; j < 128; j++) {
-            F32 start = j * numPerBin;
-            F32 end = start + numPerBin;
-
-            F32 count = 0;
-
-            int left = std::ceil(start);
-            if (left > start && 0 != histogram[left - 1]) {
-                count += (F32)left - start;
-            }
-
-            int right = std::floor(end);
-            if (right < end && 0 != histogram[right]) {
-                count += end - (F32)right;
-            }
-
-            for (int k = left; k < right; k++) {
-                if (0 != histogram[k]) {
-                    count += 1;
-                }
-            }
-
-            F32 expandVal = quantDist[j] / count;
-
-            if (left > start && 0 != histogram[left - 1]) {
-                qExpand[left - 1] += expandVal * ((F32)left - start);
-            }
-
-            if (right < end && 0 != histogram[right]) {
-                qExpand[right] += expandVal * (end - (F32)right);
-            }
-
-            for (int k = left; k < right; k++) {
-                if (0 != histogram[k]) {
-                    qExpand[k] += expandVal;
-                }
-            }
-        }
-
-        F32 kld = compute_KLD(i, clipDist.data(), qExpand.data());
-
-        if (kld < minKLD) {
-            minKLD = kld;
-            bestThreshold = i;
-        }
-    }
-    DEBUG_info(bestThreshold << "/2048");
-    F32 threshold = (F32)bestThreshold * interval;
-    F32 quantScale = 127.99 / threshold;
-    scale.push_back(quantScale);
-#endif
-
return scale; -} diff --git a/tensor_computing/src/reduction.cpp b/tensor_computing/src/reduction.cpp deleted file mode 100644 index dd56ad7f..00000000 --- a/tensor_computing/src/reduction.cpp +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif - -EE reduction(TensorDesc inputDesc, const void* input, - TensorDesc maskDesc, const void* mask, - I32 axis, - ReductionMode reductionMode, - float coeff, - TensorDesc outputDesc, void* output, Arch arch) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = reduction_general(inputDesc, input, maskDesc, mask, axis, reductionMode, coeff, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = reduction_arm(inputDesc, input, maskDesc, mask, axis, reductionMode, coeff, outputDesc, output); -#endif - } - return ret; -} - -EE reduction_infer_output_size(TensorDesc inputDesc, TensorDesc maskDesc, int axis, bool keepDim, TensorDesc *outputDesc) -{ - if (nullptr == outputDesc) - CHECK_STATUS(NULL_POINTER); - - *outputDesc = inputDesc; - if (axis < 0) - axis += inputDesc.nDims; - axis = inputDesc.nDims - 1 - axis; - int num = 1; - if (tensorNumElements(maskDesc) == 0) - (*outputDesc).dims[axis] = 1; - else { - num = maskDesc.dims[1]; - (*outputDesc).dims[axis] = num; - } - - if (!keepDim && num < 2) { - for (int i = axis; i < (I32)(inputDesc.nDims)-1; i++) { - (*outputDesc).dims[i] = (*outputDesc).dims[i+1]; - } - (*outputDesc).nDims = inputDesc.nDims - 1; - } - return SUCCESS; -} diff --git a/tensor_computing/src/reshape.cpp b/tensor_computing/src/reshape.cpp deleted file mode 100644 index 3b90c67f..00000000 --- a/tensor_computing/src/reshape.cpp +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - - -inline EE reshape_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc* outputDesc, I32 *shape, I32 shape_size) -{ - if (nullptr == outputDesc || nullptr == shape) { - return NULL_POINTER; - } - - *outputDesc = inputDesc; - (*outputDesc).nDims = shape_size; - if (shape_size == 2) - (*outputDesc).df = DF_NORMAL; - if (shape_size == 4) - (*outputDesc).df = DF_NCHW; - - U32 factor = 1; - I32 count = 0; - for(I32 i = 0; i < shape_size; i++) { - I32 value = shape[i]; - if (value == 0) { - value = inputDesc.dims[inputDesc.nDims-1-i]; - } - if (value == -1) { - value = 0; - count ++; - } else { - factor *= value; - } - - (*outputDesc).dims[shape_size-1-i] = value; - } - if (count > 1) { - return NOT_SUPPORTED; - } - - for (I32 i = 0; i < shape_size; i++) { - if ((*outputDesc).dims[i] == 0) { - (*outputDesc).dims[i] = tensorNumElements(inputDesc) / factor; - } - } - - return SUCCESS; -} - -EE reshape_infer_output_size(TensorDesc inputDesc, TensorDesc* outputDesc, I32 *shape, I32 shape_size, Arch arch, ExtInfo_t extInfo) { - EE ret = NOT_SUPPORTED; - if(arch == MALI){ -#ifdef _USE_MALI - ret = reshape_infer_output_size_mali(inputDesc, outputDesc, shape, shape_size, extInfo->maliInfo.gclmemInputDesc, extInfo->maliInfo.gclmemOutputDesc); -#endif - } else { - ret = reshape_infer_output_size_cpu(inputDesc, outputDesc, shape, shape_size); - } - return ret; -} - -EE reshape(TensorDesc inputDesc, void* input, TensorDesc outputDesc, void* output, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = reshape_general(inputDesc, input, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = reshape_arm(inputDesc, input, outputDesc, output); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = reshape_mali(extInfo->maliInfo.handle, inputDesc, (GCLMem_t)input, outputDesc, (GCLMem_t)output); -#endif - } - return ret; -} diff --git a/tensor_computing/src/scale.cpp b/tensor_computing/src/scale.cpp deleted file mode 100644 index 587f8a03..00000000 --- a/tensor_computing/src/scale.cpp +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (C) 2019. 
Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -inline EE scale_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) -{ - if (nullptr == outputDesc) - CHECK_STATUS(NULL_POINTER); - - *outputDesc = inputDesc; - return SUCCESS; -} - -EE scale_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if(arch == MALI){ -#ifdef _USE_MALI - ret = scale_infer_output_size_mali(inputDesc, outputDesc, extInfo->maliInfo.gclmemInputDesc, extInfo->maliInfo.gclmemOutputDesc); -#endif - } else { - ret = scale_infer_output_size_cpu(inputDesc, outputDesc); - } - return ret; -} - -EE scale(TensorDesc inputDesc, void* input, - I32 axis, void *alpha, void *beta, - TensorDesc outputDesc, void* output, - Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = scale_general(inputDesc, input, axis, alpha, beta, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = scale_arm(inputDesc, input, axis, alpha, beta, outputDesc, output); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = scale_mali(extInfo->maliInfo.handle, (GCLMem_t)alpha, (GCLMem_t)beta, - inputDesc, (GCLMem_t)input, outputDesc, (GCLMem_t)output); -#endif - } - return ret; -} diff --git a/tensor_computing/src/set_input.cpp b/tensor_computing/src/set_input.cpp deleted file mode 100644 index fae848d2..00000000 --- a/tensor_computing/src/set_input.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -EE tensor_computing_set_input_infer_tmpBuf_size(void* input, TensorDesc hostDesc, U32* tmpBufSize, Arch arch) -{ - EE ret = NOT_SUPPORTED; - if(arch == MALI){ -#ifdef _USE_MALI - ret = tensor_computing_set_input_infer_tmpBuf_size_mali((GCLMem_t)input, hostDesc, tmpBufSize); -#endif - } - return ret; -} - -EE tensor_computing_set_input(void* input, TensorDesc hostDesc, const void* hostPtr, void* tmpBuf, bool blocking, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == MALI) { -#ifdef _USE_MALI - ret = tensor_computing_set_input_mali(extInfo->maliInfo.handle, (GCLMem_t)input, hostDesc, (const U8*)hostPtr, (GCLMem_t)tmpBuf, blocking); -#endif - } - return ret; -} diff --git a/tensor_computing/src/slice.cpp b/tensor_computing/src/slice.cpp deleted file mode 100644 index bcd65e68..00000000 --- a/tensor_computing/src/slice.cpp +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
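Ahead of the slice.cpp body below: slice_infer_output_size_cpu cuts the chosen axis at the given slice_point positions, so output i spans [slice_point[i-1], slice_point[i]) with 0 and the axis length as the implicit ends, and negative cut points wrap by the axis length. A worked sketch of that arithmetic (sliceSizes is illustrative, not Bolt API):

    #include <vector>

    // Sketch of the cut-point arithmetic in slice_infer_output_size_cpu below.
    // e.g. sliceSizes(10, {3, -3}) -> {3, 4, 3}, since -3 wraps to 7.
    std::vector<int> sliceSizes(int axisLen, const std::vector<int> &points)
    {
        std::vector<int> sizes;
        int prev = 0;
        for (size_t i = 0; i <= points.size(); i++) {
            int next = (i < points.size()) ? points[i] : axisLen;
            if (next < 0) {
                next += axisLen; // negative cut points wrap around
            }
            sizes.push_back(next - prev);
            prev = next;
        }
        return sizes;
    }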
- - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -inline EE slice_infer_output_size_cpu(TensorDesc inputDesc, std::vector* outputDesc, I32 axis, I32 *slice_point) -{ - if (nullptr == outputDesc) - CHECK_STATUS(NULL_POINTER); - - U32 num = (*outputDesc).size(); - axis = (axis + inputDesc.nDims) % inputDesc.nDims; - I32 target_axis = inputDesc.nDims - 1 - axis; - for (U32 i = 0; i < num; i++) { - (*outputDesc)[i] = inputDesc; - - I32 prev_point = 0; - if (i > 0) { - prev_point = slice_point[i-1]; - } - I32 next_point = inputDesc.dims[target_axis]; - if (i < num - 1) { - next_point = slice_point[i]; - } - if (prev_point < 0) { - prev_point = prev_point + inputDesc.dims[target_axis]; - if (prev_point < 0) - prev_point = 0; - } - if (next_point < 0) { - next_point = next_point + inputDesc.dims[target_axis]; - if (next_point < 0) - next_point = 0; - } - (*outputDesc)[i].dims[target_axis] = next_point - prev_point; - } - return SUCCESS; -} - -EE slice_infer_output_size(TensorDesc inputDesc, std::vector* outputDesc, I32 axis, I32* slice_point, Arch arch, ExtInfo_t extInfo) -{ -#ifdef _USE_MALI - if(arch == MALI){ - CHECK_STATUS(slice_infer_output_size_mali(inputDesc, outputDesc, axis, slice_point, extInfo->maliInfo.gclmemInputDesc, extInfo->maliInfo.gclmemOutputDesc)); - } else { -#endif - UNUSED(arch); - UNUSED(extInfo); - CHECK_STATUS(slice_infer_output_size_cpu(inputDesc, outputDesc, axis, slice_point)); -#ifdef _USE_MALI - } -#endif - return SUCCESS; -} - -EE slice(TensorDesc inputDesc, void* input, int axis, - std::vector outputDesc, std::vector* output, Arch arch, ExtInfo_t extInfo) -{ -#ifndef _USE_MALI - UNUSED(extInfo); -#endif - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = slice_general(inputDesc, input, axis, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = slice_arm(inputDesc, input, axis, outputDesc, output); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = slice_mali(extInfo->maliInfo.handle, inputDesc, (GCLMem_t)input, axis, outputDesc, output); -#endif - } - return ret; -} diff --git a/tensor_computing/src/softmax.cpp b/tensor_computing/src/softmax.cpp deleted file mode 100644 index 4a2b43ad..00000000 --- a/tensor_computing/src/softmax.cpp +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -EE softmax(TensorDesc inputDesc, const void* input, int axis, TensorDesc outputDesc, void* output, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = softmax_general(inputDesc, input, axis, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = softmax_arm(inputDesc, input, axis, outputDesc, output); -#endif -#ifdef _USE_MALI - } else if (arch == MALI) { - ret = softmax_mali(extInfo->maliInfo.handle, inputDesc, (GCLMem_t)input, axis, outputDesc, (GCLMem_t)output); -#endif - } - return ret; -} - -inline EE softmax_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) -{ - if (nullptr == outputDesc) CHECK_STATUS(NULL_POINTER); - *outputDesc = inputDesc; - if (DF_NCHWC8 == (*outputDesc).df) { - (*outputDesc).df = DF_NCHW; - } - return SUCCESS; -} - -EE softmax_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if(arch == MALI){ -#ifdef _USE_MALI - ret = softmax_infer_output_size_mali(inputDesc, outputDesc, extInfo->maliInfo.gclmemInputDesc, extInfo->maliInfo.gclmemOutputDesc); -#endif - } else { - ret = softmax_infer_output_size_cpu(inputDesc, outputDesc); - } - return ret; -} diff --git a/tensor_computing/src/space2depth.cpp b/tensor_computing/src/space2depth.cpp deleted file mode 100644 index 22f93bb2..00000000 --- a/tensor_computing/src/space2depth.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
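slice_infer_output_size_cpu() in the slice.cpp deletion above derives each output's length on the sliced axis from the slice points, wrapping negative points to index from the end and clamping at zero. A standalone re-derivation of just that arithmetic follows; it is simplified in that it folds the original's separate prev/next wrapping into one pass, which gives the same result.

#include <cstdio>
#include <vector>

// Simplified re-derivation of the slice-point math in
// slice_infer_output_size_cpu() above: given the size of the sliced axis and
// the slice points, compute each output's length along that axis.
std::vector<int> sliceLengths(int axisLen, const std::vector<int> &points) {
    std::vector<int> lengths;
    int prev = 0;
    for (size_t i = 0; i <= points.size(); i++) {
        int next = (i < points.size()) ? points[i] : axisLen;
        if (next < 0) next += axisLen;  // negative points count from the end
        if (next < 0) next = 0;         // clamp at zero, as the original does
        lengths.push_back(next - prev);
        prev = next;
    }
    return lengths;
}

int main() {
    // Axis of length 10 cut at {3, -2}: -2 wraps to 8, giving lengths 3, 5, 2.
    for (int len : sliceLengths(10, {3, -2})) printf("%d ", len);
    printf("\n");
    return 0;
}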
- - -#include "tensor_computing.h" -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -EE space2depth_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if(arch == MALI){ -#ifdef _USE_MALI - ret = space2depth_infer_output_size_mali(inputDesc, outputDesc, extInfo->maliInfo.gclmemInputDesc, extInfo->maliInfo.gclmemOutputDesc); -#endif - } - return ret; -} - -EE space2depth(TensorDesc inputDesc, const void* input, TensorDesc outputDesc, void* output, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == MALI) { -#ifdef _USE_MALI - ret = space2depth_mali(extInfo->maliInfo.handle, inputDesc, (GCLMem_t)input, outputDesc, (GCLMem_t)output); -#endif - } - return ret; -} diff --git a/tensor_computing/src/split.cpp b/tensor_computing/src/split.cpp deleted file mode 100644 index d1cff984..00000000 --- a/tensor_computing/src/split.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif -#include - -EE split_infer_output_size(TensorDesc inputDesc, std::vector* outputDesc) -{ - if (nullptr == outputDesc) - CHECK_STATUS(NULL_POINTER); - - for(U32 i = 0; i < (*outputDesc).size(); i++) { - (*outputDesc)[i] = inputDesc; - } - return SUCCESS; -} - -EE split(TensorDesc inputDesc, void* input, - std::vector outputDesc, std::vector* output, Arch arch) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = split_general(inputDesc, input, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = split_arm(inputDesc, input, outputDesc, output); -#endif - } - return ret; -} diff --git a/tensor_computing/src/squeeze.cpp b/tensor_computing/src/squeeze.cpp deleted file mode 100644 index f9ccde25..00000000 --- a/tensor_computing/src/squeeze.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -EE squeeze(TensorDesc inputDesc, const void* input, TensorDesc outputDesc, void* output, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == MALI) { -#ifdef _USE_MALI - ret = squeeze_mali(extInfo->maliInfo.handle, inputDesc, (GCLMem_t)input, outputDesc, (GCLMem_t)output); -#endif - } - return ret; -} - -EE squeeze_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if(arch == MALI){ -#ifdef _USE_MALI - ret = squeeze_infer_output_size_mali(inputDesc, outputDesc, extInfo->maliInfo.gclmemInputDesc, extInfo->maliInfo.gclmemOutputDesc); -#endif - } - return ret; -} diff --git a/tensor_computing/src/transpose.cpp b/tensor_computing/src/transpose.cpp deleted file mode 100644 index ac12383a..00000000 --- a/tensor_computing/src/transpose.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
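space2depth.cpp and squeeze.cpp above dispatch only to the MALI (GPU) backend; on every other arch the `if` chain falls through and the functions return NOT_SUPPORTED, so callers must check the status code rather than assume support. A toy model of that fall-through dispatch, with the enums reduced to the two cases that matter here (everything else is illustrative):

#include <cstdio>

// Mirrors the dispatch shape of squeeze()/space2depth() above: the default
// return value is NOT_SUPPORTED, and only a matching backend overwrites it.
enum EE { SUCCESS = 0, NOT_SUPPORTED = 1 };
enum Arch { CPU_GENERAL, MALI };

EE squeeze_like(Arch arch) {
    EE ret = NOT_SUPPORTED;   // default when no backend matches
    if (arch == MALI) {
        ret = SUCCESS;        // stands in for the squeeze_mali(...) call
    }
    return ret;
}

int main() {
    printf("CPU: %s\n", squeeze_like(CPU_GENERAL) == SUCCESS ? "ok" : "NOT_SUPPORTED");
    printf("GPU: %s\n", squeeze_like(MALI) == SUCCESS ? "ok" : "NOT_SUPPORTED");
    return 0;
}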
- - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif -#ifdef _USE_MALI -#include "gpu/mali/tensor_computing_mali.h" -#endif - -EE transpose(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output, U32 *dim, Arch arch, ExtInfo_t extInfo) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = transpose_general(inputDesc, input, outputDesc, output, dim); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = transpose_arm(inputDesc, input, outputDesc, output, dim); -#endif -#ifdef _USE_MALI - } else if(arch == MALI) { - ret = transpose_mali(extInfo->maliInfo.handle, inputDesc, (GCLMem_t)input, outputDesc, (GCLMem_t)output, dim); -#endif - } - return ret; -} - -inline EE transpose_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc, U32 *dim) { - if (nullptr == outputDesc || nullptr == dim) - CHECK_STATUS(NULL_POINTER); - - *outputDesc = inputDesc; - U32 inputDim = inputDesc.nDims; - if (4 == inputDim) { - (*outputDesc).df = DF_NCHW; - } - U32 outputDim = (*outputDesc).nDims; - for (U32 i = 0; i < inputDim; i++) { - CHECK_REQUIREMENT(dim[i] < inputDim); - // NOTE: TensorDesc.dims array is in [W H C N] order. - // so if you want to transpose [N C H W] format data, we use (dims - 1 - *) - // [5 6 7 8] + [0 3 2 1] = [5 8 7 6] - // [8 7 6 5] + [0 3 2 1] = [6 7 8 5] - (*outputDesc).dims[outputDim - 1 - i] = inputDesc.dims[inputDim - 1 - dim[i]]; - } - if ((*outputDesc).nDims >= 4) - (*outputDesc).df = DF_NCHW; - return SUCCESS; -} - -EE transpose_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc, U32* dim, Arch arch, ExtInfo_t extInfo) -{ -#ifdef _USE_MALI - if(arch == MALI){ - CHECK_STATUS(transpose_infer_output_size_mali(inputDesc, outputDesc, dim, extInfo->maliInfo.gclmemInputDesc, extInfo->maliInfo.gclmemOutputDesc)) - } else { -#endif - UNUSED(arch); - UNUSED(extInfo); - CHECK_STATUS(transpose_infer_output_size_cpu(inputDesc, outputDesc, dim)); -#ifdef _USE_MALI - } -#endif - return SUCCESS; -} - diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt deleted file mode 100644 index 91a6961a..00000000 --- a/tests/CMakeLists.txt +++ /dev/null @@ -1,84 +0,0 @@ -cmake_minimum_required(VERSION 3.2) - -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) -if (BOLT_CONFIGURE_FILE) - include(${BOLT_CONFIGURE_FILE}) -else (BOLT_CONFIGURE_FILE) - message(FATAL_ERROR " -FATAL: can not find bolt.cmake in directory, - please set shell or cmake environment variable BOLT_ROOT. 
- ") -endif (BOLT_CONFIGURE_FILE) - -project(tests) - -set_policy() - -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") -find_package(Uni) -find_package(Image) -find_package(TensorComputing) -find_package(Inference) -find_package(jpeg) -if(USE_MALI) - find_package(Gcl) -endif(USE_MALI) - -set_project_install_directory() -set_c_cxx_flags() -set_test_c_cxx_flags() - -if (BUILD_TEST) - if (USE_NEON) - blas_enhance(test_mmm) - blas_enhance(test_mvm) - tensor_computing(test_activation) - tensor_computing(test_attention) - tensor_computing(test_clip) - tensor_computing(test_concat) - tensor_computing(test_convolution) - tensor_computing(test_deconvolution) - tensor_computing(test_depthwise_convolution) - tensor_computing(test_dilated_convolution) - tensor_computing(test_detectionoutput) - tensor_computing(test_eltwise) - tensor_computing(test_fully_connected) - tensor_computing(test_lstm) - tensor_computing(test_multiply) - tensor_computing(test_reduction) - tensor_computing(test_pooling) - tensor_computing(test_priorbox) - tensor_computing(test_reshape) - tensor_computing(test_softmax) - tensor_computing(test_split) - tensor_computing(test_slice) - tensor_computing(test_scale) - tensor_computing(test_transpose) - - if (USE_INT8) - blas_enhance(test_mmm_int8) - blas_enhance(test_mvm_int8) - tensor_computing(test_convolution_int8) - tensor_computing(test_depthwise_convolution_int8) - tensor_computing(test_concat_int8) - tensor_computing(test_pooling_int8) - endif(USE_INT8) - - if (USE_FP16) - tensor_computing(test_convolution_bnn) - endif(USE_FP16) - endif (USE_NEON) - - image(test_image_processing) - image(test_image_resize) - - inference(test_api_c test_api_c.c) - if (USE_MALI) - if (USE_FP16) - inference(test_pipeline_ocl test_pipeline_ocl.cpp) - inference(test_convolution_ocl test_convolution_ocl.cpp) - inference(test_depthwise_convolution_ocl test_depthwise_convolution_ocl.cpp) - inference(test_fully_connected_ocl test_fully_connected_ocl.cpp) - endif (USE_FP16) - endif (USE_MALI) -endif (BUILD_TEST) diff --git a/tests/test_activation.cpp b/tests/test_activation.cpp deleted file mode 100644 index ab0fdec8..00000000 --- a/tests/test_activation.cpp +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include -#include "tensor_computing.h" -#include "ut_util.h" - -int activationTest(int argc, char** argv, DataType dt) { - CHECK_REQUIREMENT(argc == 5); - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - - DataFormat df = DF_NCHWC8; - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_RELU; - memset(activationDesc.value, 0, sizeof(activationDesc.value)); - - TensorDesc dataDesc = tensor4df(dt, df, in, ic, ih, iw); - U32 len = tensorNumElements(dataDesc); - - U8* data = ut_input_v(len, dt, UT_INIT_RANDOM); - U8* dataRef = ut_input_v(len, dt, UT_INIT_ZERO); - memcpy(dataRef, data, len*bytesOf(dt)); - - if (UT_CHECK) { - CHECK_STATUS(activation(dataDesc, data, activationDesc, dataDesc, data, UT_ARCH)); - - // naive implement - CHECK_STATUS(activation(dataDesc, dataRef, activationDesc, dataDesc, dataRef, CPU_GENERAL)); - - // check - ut_check_v(data, dataRef, len, dt, 0, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(activation(dataDesc, data, activationDesc, dataDesc, data, UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u %u %u)=(%u %u %u %u)", - in, ic, ih, iw, - in, ic, ih, iw); - sprintf(buffer, "%20s, %80s", "Activation", params); - double ops = 1.0 * in * ic * ih * iw; - ut_log(dt, buffer, ops, time/UT_LOOPS); - - free(data); - free(dataRef); - - return 0; -} - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - activationTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - activationTest(argc, argv, DT_F32); -#endif - return 0; -} diff --git a/tests/test_api_c.c b/tests/test_api_c.c deleted file mode 100644 index 90731987..00000000 --- a/tests/test_api_c.c +++ /dev/null @@ -1,163 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <float.h> -#include "type.h" -#include "../exports/c/bolt.h" -#include <sys/time.h> -inline double ut_time_ms() { - struct timeval tv; - gettimeofday(&tv, NULL); - double time = tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0; - return time; -} - -void print_help(char* argv[]) { - printf("usage: %s modelPath \n", argv[0]); -} - -void classification(const char* modelPath, DEVICE_TYPE device, const char* algoPath) { - const char* precision_BNN = "f16.bolt"; - const char* precision_INT8 = "int8_q.bolt"; - const char* precision_FP16 = "f16.bolt"; - const char* precision_FP32 = "f32.bolt"; - DATA_TYPE precisionMode = FP_32; - if (strstr(modelPath, precision_BNN) - || strstr(modelPath, precision_INT8) - || strstr(modelPath, precision_FP16)) - { - precisionMode = FP_16; - } else if (strstr(modelPath, precision_FP32)) { - precisionMode = FP_32; - } else { - printf("[ERROR] inference precision currently support " - "FP32(_f32.bolt)/FP16(_f16.bolt)/INT8(_int8_q.bolt)/BNN(_f16.bolt), " - "unsupported %s\n", modelPath); - exit(1); - } - - ModelHandle model_address; - model_address = CreateModel(modelPath, HIGH_PERFORMANCE, device, algoPath); - - int num_input = 1; - int n[1] = {1}; - int c[1] = {3}; - int h[1] = {224}; - int w[1] = {224}; - //char* firstName = "MobileNetV2/MobileNetV2/Conv2d_0/Conv2D__6:0"; - char* firstName = "data"; - char* name[1]; - name[0] = firstName; - DATA_TYPE dt_input[1] = {precisionMode}; - DATA_FORMAT df_input[1] = {NCHW}; - PrepareModel(model_address, num_input, n, c, h, w, name, dt_input, df_input); - ResultHandle model_result = AllocAllResultHandle(model_address); - - void* mem[1]; - int length = 3 * 224 * 224; - switch (precisionMode) { -#ifdef _USE_FP32 - case FP_32: { - F32 *ptr = (F32 *)malloc(sizeof(F32) * length); - for (int i = 0; i < length; i++) { - ptr[i] = 1; - } - mem[0] = (void*)ptr; - break; - } -#endif -#ifdef _USE_FP16 - case FP_16: { - F16 *ptr = (F16 *)malloc(sizeof(F16) * length); - for (int i = 0; i < length; i++) { - ptr[i] = 1; - } - mem[0] = (void*)ptr; - break; - } -#endif - default: - printf("[ERROR] unsupported data precision in C API test\n"); - exit(1); - } - double totalTime = 0; - double max_time = -DBL_MAX; - double min_time = DBL_MAX; - int loop = 10; - - for(int i = 0; i < loop; i++) { - double timeBegin = ut_time_ms(); - RunModel(model_address, model_result, 1, name, mem); - double timeEnd = ut_time_ms(); - double t = timeEnd - timeBegin; - totalTime += t; - if(t < min_time) min_time = t; - if(t > max_time) max_time = t; - } - - int model_result_num = GetNumOutputsFromResultHandle(model_result); - - char* outputNames[1]; - void* outputData[1]; - int output_n[1]; - int output_c[1]; - int output_h[1]; - int output_w[1]; - DATA_TYPE dt_output[1]; - DATA_FORMAT df_output[1]; - GetPtrFromResultHandle(model_result, model_result_num, outputNames, outputData, output_n, output_c, output_h, output_w, dt_output, df_output); - FreeResultHandle(model_result); - DestroyModel(model_address); - free(mem[0]); - if(device == CPU) printf("DeviceType = CPU, Model = %s\n", modelPath); - if(device == GPU) printf("DeviceType = GPU, Model = %s\n", modelPath); - printf("avg_time: %lf ms\n", 1.0 * totalTime / loop); - printf("max_time: %lf ms\n", 1.0 * max_time); - printf("min_time: %lf ms\n", 1.0 * min_time); -} - -int main() { - const char* mobilenet_v1_fp16_modelPath = "/data/local/tmp/CI/model_zoo/caffe_models/mobilenet_v1/mobilenet_v1_f16.bolt"; - classification(mobilenet_v1_fp16_modelPath, CPU, NULL); - - const char* mobilenet_v1_fp32_modelPath = 
"/data/local/tmp/CI/model_zoo/caffe_models/mobilenet_v1/mobilenet_v1_f32.bolt"; - classification(mobilenet_v1_fp32_modelPath, CPU, NULL); - - const char* mobilenet_v2_fp16_modelPath = "/data/local/tmp/CI/model_zoo/caffe_models/mobilenet_v2/mobilenet_v2_f16.bolt"; - classification(mobilenet_v2_fp16_modelPath, CPU, NULL); - - const char* mobilenet_v3_fp16_modelPath = "/data/local/tmp/CI/model_zoo/caffe_models/mobilenet_v3/mobilenet_v3_f16.bolt"; - classification(mobilenet_v3_fp16_modelPath, CPU, NULL); - - const char* mobilenet_v2_fp32_modelPath = "/data/local/tmp/CI/model_zoo/caffe_models/mobilenet_v2/mobilenet_v2_f32.bolt"; - classification(mobilenet_v2_fp32_modelPath, CPU, NULL); - - const char* resnet50_fp16_modelPath = "/data/local/tmp/CI/model_zoo/caffe_models/resnet50/resnet50_f16.bolt"; - classification(resnet50_fp16_modelPath, CPU, NULL); - - const char* resnet50_fp32_modelPath = "/data/local/tmp/CI/model_zoo/caffe_models/resnet50/resnet50_f32.bolt"; - classification(resnet50_fp32_modelPath, CPU, NULL); - -// const char* squeezenet_fp16_modelPath = "/data/local/tmp/CI/model_zoo/caffe_models/squeezenet/squeezenet_f16.bolt"; -// const char* ghostnet_fp16_modelPath = "/data/local/tmp/xyf/model/ghostnet_f16.bolt"; - const char* algoPath = "./"; - classification(mobilenet_v1_fp16_modelPath, GPU, algoPath); - classification(mobilenet_v2_fp16_modelPath, GPU, algoPath); - classification(mobilenet_v3_fp16_modelPath, GPU, algoPath); -// classification(squeezenet_fp16_modelPath, GPU, algoPath); -// classification(ghostnet_fp16_modelPath, GPU, algoPath); - return 0; -} diff --git a/tests/test_api_java.java b/tests/test_api_java.java deleted file mode 100644 index 6adc7644..00000000 --- a/tests/test_api_java.java +++ /dev/null @@ -1,640 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.IOException; - -public final class test_api_java { - public static float[] readSequenceDataFromFile(String pathName, int lineNumber) { - float[] array = {}; - try (FileReader reader = new FileReader(pathName); - BufferedReader br = new BufferedReader(reader) - ) { - String line; - int lineIndex = 0; - while ((line = br.readLine()) != null) { - if (lineIndex == lineNumber) { - String[] strArray = line.split(" "); - int arraySize = Integer.valueOf(strArray[0]); - array = new float[arraySize]; - for (int i = 0; i < arraySize; i++) - array[i] = Float.valueOf(strArray[1+i]); - } else { - lineIndex++; - } - } - } catch (IOException e) { - e.printStackTrace(); - } - return array; - } - - public static double getMillisTime() { - return System.nanoTime() / 1000.0 / 1000.0; - } - - public static void verify(int[] arrayA, int[] arrayB, int length) { - for (int j = 0; j < length; j++) { - if (arrayA[j] != arrayB[j]) { - System.err.println("[ERROR] verify failed " + j + " @ "+ arrayA[j] + " " - + arrayB[j] + ", in Java API test"); - System.exit(1); - } - } - } - - public static void verify(float[] arrayA, float[] arrayB, int length, float threshold) { - for (int j = 0; j < arrayA.length; j++) { - if (Math.abs(arrayA[j] - arrayB[j]) > threshold) { - System.err.println("[ERROR] verify failed " + j + " @ "+ arrayA[j] + " " - + arrayB[j] + ", in Java API test"); - System.exit(1); - } - } - } - - public static int verify(float[][] arrayA, float[][] arrayB, int[][] dimensions, float threshold) { - if (arrayA.length != arrayB.length - || arrayA.length != dimensions.length) { - System.err.println("[ERROR] unmatch data to verify, in Java API test"); - System.exit(1); - } - - int sum = 0; - for (int i = 0; i < dimensions.length; i++) { - int length = BoltResult.calculateLength(dimensions[i]); - verify(arrayA[i], arrayB[i], length, threshold); - sum += length; - } - return sum; - } - - public static int top1(float[] array, int offset, int length) { - int maxIndex = offset; - for (int i = offset + 1; i < offset + length; i++) { - if (array[i] > array[maxIndex]) - maxIndex = i; - } - return maxIndex; - } - - public static void tinybert_intent_slot(String outputPrefix, DeviceType device, AffinityType affinity, String modelPath) { - int num_input = 3; - int num_output = 2; - String[] input_names = {"tinybert_words", "tinybert_positions", "tinybert_token_type"}; - String[] output_names = {"intent_softmax", "slot_softmax"}; - int[] n = {1, 1, 1}; - int[] c_max = {32, 32, 32}; - int[] h = {1, 1, 1}; - int[] w = {1, 1, 1}; - DataType[] dts = {DataType.UINT32, DataType.UINT32, DataType.UINT32}; - DataFormat[] dfs = {DataFormat.NORMAL, DataFormat.NORMAL, DataFormat.NORMAL}; - BoltModel bolt_model = new BoltModel(modelPath, affinity, device, - num_input, input_names, n, c_max, h, w, dts, dfs, - num_output, output_names); - - int[] c_act = {9, 9, 9}; - float[][] inputData = { - {101, 2224, 8224, 7341, 2000, 22149, 2000, 2899, 102}, - {0, 1, 2, 3, 4, 5, 6, 7, 8}, - {0, 0, 0, 0, 0, 0, 0, 0, 0}}; - float[][] resultData = { - {22, 0.999023f}, - {44, 44, 1, 23, 44, 44, 44, 8, 44}}; - - double startTime = getMillisTime(); - BoltResult bolt_result = bolt_model.Run(num_input, input_names, - n, c_act, h, w, dts, dfs, - inputData); - double endTime = getMillisTime(); - System.out.println(outputPrefix + bolt_model.DeviceMapping(device) + ", " + bolt_model.AffinityMapping(affinity) - + ", tinybert " + String.format("%.3f", endTime - startTime) - + " 
ms/sequence, model " + modelPath); - float[][] result = bolt_result.getResultData(); - int[][] dimension = bolt_result.getResultDimension(); - int intentIndex = top1(result[0], 0, result[0].length); - float[][] finalResult = new float[2][dimension[1][1]]; - finalResult[0][0] = intentIndex; - finalResult[0][1] = result[0][intentIndex]; - for (int i = 0; i < dimension[1][1]; i++) { - finalResult[1][i] = top1(result[1], i*dimension[1][2], dimension[1][2]) - i*dimension[1][2]; - } - int[][] finalDimension = {{1, 2}, {1, dimension[1][1]}}; - int length = verify(resultData, finalResult, finalDimension, 0.1f); - if (length == 0) { - System.err.println("[ERROR] verify null data in tinybert, in Java API test"); - System.exit(1); - } - - // model destroy - bolt_model.Destructor(); - } - - public static void tinybert_disambiguate(String outputPrefix, DeviceType device, AffinityType affinity, String modelPath, DataType dt) { - int num_input = 5; - int num_output = 1; - String[] input_names = {"tinybert_words", "tinybert_positions", "tinybert_token_type", "tinybert_words_mask", "tinybert_dict_type"}; - String[] output_names = {"slot_softmax"}; - int[] n = {1, 1, 1, 1, 1}; - int[] c_max = {32, 32, 32, 511, 511}; - int[] h = {1, 1, 1, 32, 1}; - int[] w = {1, 1, 1, 1, 1}; - DataType[] dts = {DataType.UINT32, DataType.UINT32, DataType.UINT32, dt, DataType.UINT32}; - DataFormat[] dfs = {DataFormat.NORMAL, DataFormat.NORMAL, DataFormat.NORMAL, DataFormat.MTK, DataFormat.NORMAL}; - BoltModel bolt_model = new BoltModel(modelPath, affinity, device, - num_input, input_names, n, c_max, h, w, dts, dfs, - num_output, output_names); - - int[] c_act = {27, 27, 27, 1, 1}; - float[][] inputData = { - {101, 3017, 5164, 678, 5341, 5686, 5688, 4680, 5564, 6577, 1920, 1104, 2773, 5018, 671, 2108, - 2001, 3813, 3924, 2193, 4028, 3330, 3247, 712, 2898, 4638, 102}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0}, - {5}}; - float[][] resultData = {{0.796903967857f, 0.203096017241f}}; - - double startTime = getMillisTime(); - BoltResult bolt_result = bolt_model.Run(num_input, input_names, - n, c_act, h, w, dts, dfs, - inputData); - double endTime = getMillisTime(); - System.out.println(outputPrefix + bolt_model.DeviceMapping(device) + ", " + bolt_model.AffinityMapping(affinity) - + ", tinybert " + String.format("%.3f", endTime - startTime) - + " ms/sequence, model " + modelPath); - float[][] result = bolt_result.getResultData(); - int[][] dimension = bolt_result.getResultDimension(); - int length = verify(resultData, result, dimension, 0.1f); - if (length == 0) { - System.err.println("[ERROR] verify null data in tinybert, in Java API test"); - System.exit(1); - } - - // model destroy - bolt_model.Destructor(); - } - - public static void nmt(String outputPrefix, DeviceType device, AffinityType affinity, String modelPath) { - int num_input = 2; - int num_output = 1; - String[] input_names = {"nmt_words", "nmt_positions"}; - String[] output_names = {"decoder_output"}; - int[] n = {1, 1}; - int[] c_max = {128, 128}; - int[] h = {1, 1, 1}; - int[] w = {1, 1, 1}; - DataType[] dts = {DataType.UINT32, DataType.UINT32, DataType.UINT32}; - DataFormat[] dfs = {DataFormat.NORMAL, DataFormat.NORMAL, DataFormat.NORMAL}; - BoltModel bolt_model = new BoltModel(modelPath, affinity, device, - num_input, input_names, n, c_max, 
h, w, dts, dfs, - num_output, output_names); - - int[] c_act = {28, 28}; - float[][] inputData = { - {1977, 1788, 2061, 3911, 248, 734, 1330, 1111, 1307, 729, 411, 383, 101, 713, - 5640, 627, 1330, 37, 282, 352, 438, 94, 1111, 729, 1103, 72, 133, 2}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}}; - float[][] resultData = { - {7456, 40, 1788, 2061, 3911, 248, 734, 140, 4667, 1307, 5365, 411, 383, 1244, - 206, 2669, 5640, 627, 50, 236, 37, 63, 48, 352, 94, 4667, 53, 287, 1763, 72, - 133, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}; - - double startTime = getMillisTime(); - BoltResult bolt_result = bolt_model.Run(num_input, input_names, - n, c_act, h, w, dts, dfs, - inputData); - double endTime = getMillisTime(); - System.out.println(outputPrefix + bolt_model.DeviceMapping(device) + ", " + bolt_model.AffinityMapping(affinity) - + ", machine translation " + String.format("%.3f", endTime - startTime) - + " ms/sequence, model " + modelPath); - int length = verify(resultData, bolt_result.getResultData(), bolt_result.getResultDimension(), 0); - if (length == 0) { - System.err.println("[ERROR] verify null data in machine translation, in Java API test"); - System.exit(1); - } - - // model destroy - bolt_model.Destructor(); - } - - public static void nmt_tsc(String outputPrefix, DeviceType device, AffinityType affinity, DataType dataType, - String encoderModelPath, String decoderModelPath) - { - int encoderInputNum = 2; - String[] encoderInputNames = {"encoder_words", "encoder_positions"}; - int[] encoderNs = {1, 1}; - int[] encoderCMaxs = {128, 128}; - int[] encoderHs = {1, 1, 1}; - int[] encoderWs = {1, 1, 1}; - DataType[] encoderDataTypes = {DataType.UINT32, DataType.UINT32, DataType.UINT32}; - DataFormat[] encoderDataFormats = {DataFormat.NORMAL, DataFormat.NORMAL, DataFormat.NORMAL}; - BoltModel encoderModel = new BoltModel(encoderModelPath, affinity, device, - encoderInputNum, encoderInputNames, encoderNs, encoderCMaxs, encoderHs, encoderWs, encoderDataTypes, encoderDataFormats); - - int[] encoderCActs = {4, 4}; - float[][] encoderInputData = {{13024, 1657, 35399, 0}, {0, 1, 2, 3}}; - int[] result = {6160, 3057, 113, 157, 0}; - - double startTime = getMillisTime(); - BoltResult encoderResult = encoderModel.Run(encoderInputNum, encoderInputNames, - encoderNs, encoderCActs, encoderHs, encoderWs, encoderDataTypes, encoderDataFormats, - encoderInputData); - double endTime = getMillisTime(); - double encoderTime = endTime - startTime; - - int decoderInputNum = 26; - int decoderOutputNum = 13; - int maxDecodeLength = 128; - String[] decoderInputNames = {"decoder_words", "decoder_positions", - "decoder_layer0_multihead_k", "decoder_layer0_multihead_v", - "decoder_layer1_multihead_k", "decoder_layer1_multihead_v", - "decoder_layer2_multihead_k", "decoder_layer2_multihead_v", - "decoder_layer3_multihead_k", "decoder_layer3_multihead_v", - "decoder_layer4_multihead_k", "decoder_layer4_multihead_v", - "decoder_layer5_multihead_k", "decoder_layer5_multihead_v", - "decoder_layer0_kmem", "decoder_layer0_vmem", - "decoder_layer1_kmem", "decoder_layer1_vmem", - "decoder_layer2_kmem", "decoder_layer2_vmem", - "decoder_layer3_kmem", "decoder_layer3_vmem", - "decoder_layer4_kmem", "decoder_layer4_vmem", 
- "decoder_layer5_kmem", "decoder_layer5_vmem" - }; - String[] decoderOutputNames = { - "transformer_decoder_embedding_argmax", - "transformer_decoder_layer_0_self_attention_multihead_k_cache", "transformer_decoder_layer_0_self_attention_multihead_v_cache", - "transformer_decoder_layer_1_self_attention_multihead_k_cache", "transformer_decoder_layer_1_self_attention_multihead_v_cache", - "transformer_decoder_layer_2_self_attention_multihead_k_cache", "transformer_decoder_layer_2_self_attention_multihead_v_cache", - "transformer_decoder_layer_3_self_attention_multihead_k_cache", "transformer_decoder_layer_3_self_attention_multihead_v_cache", - "transformer_decoder_layer_4_self_attention_multihead_k_cache", "transformer_decoder_layer_4_self_attention_multihead_v_cache", - "transformer_decoder_layer_5_self_attention_multihead_k_cache", "transformer_decoder_layer_5_self_attention_multihead_v_cache", - }; - int[] decoderNs = new int[decoderInputNum]; - int[] decoderCMaxs = new int[decoderInputNum]; - int[] decoderHs = new int[decoderInputNum]; - int[] decoderWs = new int[decoderInputNum]; - DataType[] decoderDataTypes = new DataType[decoderInputNum]; - DataFormat[] decoderDataFormats = new DataFormat[decoderInputNum]; - double decoderTime = 0; - for (int i = 0; i < 2; i++) { - decoderNs[i] = 1; - decoderCMaxs[i] = 1; - decoderHs[i] = 1; - decoderWs[i] = 1; - decoderDataTypes[i] = DataType.UINT32; - decoderDataFormats[i] = DataFormat.NORMAL; - } - for (int i = 2; i < decoderInputNum; i++) { - decoderNs[i] = 1; - if (i - 2 < 12) - decoderCMaxs[i] = 4; - else - decoderCMaxs[i] = maxDecodeLength - 1; - decoderHs[i] = 512; - decoderWs[i] = 1; - decoderDataTypes[i] = dataType; - decoderDataFormats[i] = DataFormat.MTK; - } - BoltModel decoderModel = new BoltModel(decoderModelPath, affinity, device, - decoderInputNum, decoderInputNames, decoderNs, decoderCMaxs, decoderHs, decoderWs, - decoderDataTypes, decoderDataFormats, decoderOutputNum, decoderOutputNames); - float[][] encoderResultData = encoderResult.getResultData(); - float[][] decoderStates = {{}, {}, {}, {}, {}, {}, {}, - {}, {}, {}, {}, {}, {}, {}}; - int word = 0, i; - int[] words = new int[maxDecodeLength]; - for (i = 0; i < maxDecodeLength; i++) { - int[] decoderCActs = {1, 1, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - i, i, i, i, i, i, i, i, i, i, i, i - }; - float[][] decoderInputData = {{word}, {i}, - encoderResultData[0], encoderResultData[1], - encoderResultData[2], encoderResultData[3], - encoderResultData[4], encoderResultData[5], - encoderResultData[6], encoderResultData[7], - encoderResultData[8], encoderResultData[9], - encoderResultData[10], encoderResultData[11], - decoderStates[0], decoderStates[1], - decoderStates[2], decoderStates[3], - decoderStates[4], decoderStates[5], - decoderStates[6], decoderStates[7], - decoderStates[8], decoderStates[9], - decoderStates[10], decoderStates[11], - }; - startTime = getMillisTime(); - BoltResult decoderResult = decoderModel.Run(decoderInputNum, decoderInputNames, - decoderNs, decoderCActs, decoderHs, decoderWs, decoderDataTypes, decoderDataFormats, - decoderInputData); - endTime = getMillisTime(); - decoderTime += endTime - startTime; - float[][] decoderResultData = decoderResult.getResultData(); - for (int j = 0; j < 12; j++) - decoderStates[j] = decoderResultData[j+1]; - word = (int)decoderResultData[0][0]; - words[i] = word; - if (word == 0) - break; - } - System.out.println(outputPrefix + encoderModel.DeviceMapping(device) + ", " + encoderModel.AffinityMapping(affinity) - + ", machine 
translation " + String.format("%.3f", encoderTime+decoderTime) - + " ms/sequence, encoder model " + encoderModelPath - + ", decoder model " + decoderModelPath); - verify(result, words, result.length); - - // model destroy - encoderModel.Destructor(); - decoderModel.Destructor(); - } - - public static void tts(String outputPrefix, DeviceType device, AffinityType affinity, - String encoderDecoderModelPath, String postnetModelPath, String melganModelPath, DataType dataType) { - int numMels = 80; - int maxResult = 2000 * 3; - int encoderDecoderInputNum = 2; - int encoderDecoderOutputNum = 2; - String[] encoderDecoderInputNames = {"tts_words", "tts_alignments"}; - String[] encoderDecoderOutputNames = {"decoder_position", "decoder_result"}; - int[] encoderDecoderNs = {1, 1}; - int[] encoderDecoderCMaxs = {128, 128}; - int[] encoderDecoderHs = {1, 1}; - int[] encoderDecoderWs = {1, 1}; - DataType[] encoderDecoderDataTypes = {DataType.UINT32, dataType}; - DataFormat[] encoderDecoderDataFormats = {DataFormat.NORMAL, DataFormat.NORMAL}; - BoltModel encoderDecoderModel = new BoltModel(encoderDecoderModelPath, affinity, device, - encoderDecoderInputNum, encoderDecoderInputNames, - encoderDecoderNs, encoderDecoderCMaxs, encoderDecoderHs, encoderDecoderWs, - encoderDecoderDataTypes, encoderDecoderDataFormats, - encoderDecoderOutputNum, encoderDecoderOutputNames); - int[] encoderDecoderCActs = {50, 50}; - float[][] encoderDecoderInputData = {{4, 25, 14, 33, 11, 20, 1, 9, 14, 33, - 27, 2, 20, 35, 15, 1, 10, 37, 11, 2, - 30, 34, 15, 7, 21, 1, 25, 14, 35, 21, - 27, 3, 25, 14, 34, 27, 1, 25, 14, 35, - 27, 1, 17, 36, 7, 20, 1, 37, 7, 0}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}; - - int postnetInputNum = 1; - int postnetOutputNum = 1; - String[] postnetInputNames = {"tts_decoder"}; - String[] postnetOutputNames = {"mel"}; - int[] postnetNs = {1}; - int[] postnetCMaxs = {maxResult}; - int[] postnetHs = {numMels}; - int[] postnetWs = {1}; - DataType[] postnetDataTypes = {dataType}; - DataFormat[] postnetDataFormats = {DataFormat.MTK}; - BoltModel postnetModel = new BoltModel(postnetModelPath, affinity, device, - postnetInputNum, postnetInputNames, - postnetNs, postnetCMaxs, postnetHs, postnetWs, - postnetDataTypes, postnetDataFormats, - postnetOutputNum, postnetOutputNames); - - int melganInputNum = 1; - int melganOutputNum = 1; - String[] melganInputNames = {"input"}; - String[] melganOutputNames = {"output"}; - int[] melganNs = {1}; - int[] melganCs = {numMels}; - int[] melganHMaxs = {maxResult}; - int[] melganWs = {1}; - DataType[] melganDataTypes = {dataType}; - DataFormat[] melganDataFormats = {DataFormat.NCHW}; - BoltModel melganModel = new BoltModel(melganModelPath, affinity, device, - melganInputNum, melganInputNames, - melganNs, melganCs, melganHMaxs, melganWs, - melganDataTypes, melganDataFormats, - melganOutputNum, melganOutputNames); - - double startTime = getMillisTime(); - BoltResult encoderDecoderResult = encoderDecoderModel.Run(encoderDecoderInputNum, encoderDecoderInputNames, - encoderDecoderNs, encoderDecoderCActs, encoderDecoderHs, encoderDecoderWs, - encoderDecoderDataTypes, encoderDecoderDataFormats, - encoderDecoderInputData); - float[][] encoderDecoderResultData = encoderDecoderResult.getResultData(); - - int frameNum = ((int)encoderDecoderResultData[0][0] + 1) * 3; - int[] postnetCActs = {frameNum}; - float[][] postnetInputData = {encoderDecoderResultData[1]}; - 
BoltResult postnetResult = postnetModel.Run(postnetInputNum, postnetInputNames, - postnetNs, postnetCActs, postnetHs, postnetWs, - postnetDataTypes, postnetDataFormats, - postnetInputData); - int[][] postnetResultDimension = postnetResult.getResultDimension(); - float[][] postnetResultData = postnetResult.getResultData(); - - if (postnetResultDimension[0][0] != 1 || postnetResultDimension[0][1] != numMels - || postnetResultDimension[0][2] != frameNum) { - System.out.println("[ERROR] unmatched dimension of postnet"); - System.exit(1); - } - int[] melganHActs = {frameNum}; - float[][] melganInputData = {postnetResultData[0]}; - BoltResult melganResult = melganModel.Run(melganInputNum, melganInputNames, - melganNs, melganCs, melganHActs, melganWs, - melganDataTypes, melganDataFormats, - melganInputData); - int[][] melganResultDimension = melganResult.getResultDimension(); - float[][] melganResultData = melganResult.getResultData(); - int length = (int)melganResultDimension[0][2]; - float[] resultSum = {180.83719f}; - float[] result = new float[length]; - float[] sum = {0}; - for (int i = 0; i < length; i++) { - result[i] = melganResultData[0][i*8]; - sum[0] += result[i]; - } - double endTime = getMillisTime(); - System.out.println(outputPrefix + encoderDecoderModel.DeviceMapping(device) + ", " + encoderDecoderModel.AffinityMapping(affinity) - + ", text to speech " + String.format("%.3f", endTime - startTime) - + " ms/sequence, encoder decoder model " + encoderDecoderModelPath - + ", postnet model " + postnetModelPath - + ", melgan vocoder model " + melganModelPath); - verify(sum, resultSum, 1, 8); - - // model destroy - encoderDecoderModel.Destructor(); - postnetModel.Destructor(); - melganModel.Destructor(); - } - - public static void asr(String outputPrefix, DeviceType device, AffinityType affinity, String modelPath, DataType dataType) { - int num_input = 1; - int num_output = 1; - String[] input_names = {"sounds"}; - String[] output_names = {"labels"}; - int[] n = {1}; - int[] c_max = {128}; - int[] h = {240}; - int[] w = {1}; - DataType[] dts = {dataType}; - DataFormat[] dfs = {DataFormat.NCHW}; - BoltModel bolt_model = new BoltModel(modelPath, affinity, device, - num_input, input_names, n, c_max, h, w, dts, dfs, - num_output, output_names); - - String soundDataPath = "/data/local/tmp/CI/testing_data/nlp/asr/asr_rnnt/input/1.seq"; - String resultDataPath = "/data/local/tmp/CI/testing_data/nlp/asr/asr_rnnt/result/1.seq"; - float[] sound = readSequenceDataFromFile(soundDataPath, 0); - float[] result = readSequenceDataFromFile(resultDataPath, 0); - int[] c_act = {sound.length / h[0]}; - float[][] inputData = {sound}; - float[][] resultData = {result}; - - double startTime = getMillisTime(); - BoltResult bolt_result = bolt_model.Run(num_input, input_names, - n, c_act, h, w, dts, dfs, - inputData); - double endTime = getMillisTime(); - System.out.println(outputPrefix + bolt_model.DeviceMapping(device) + ", " + bolt_model.AffinityMapping(affinity) - + ", speech recognization " + String.format("%.3f", endTime - startTime) - + " ms/sequence, model " + modelPath); - int length = verify(resultData, bolt_result.getResultData(), bolt_result.getResultDimension(), 0); - if (length == 0) { - System.err.println("[ERROR] verify null data in speech recognize, in Java API test"); - System.exit(1); - } - - // model destroy - bolt_model.Destructor(); - } - - public static void classification(String outputPrefix, DeviceType device, AffinityType affinity, String modelPath, - String inputName, DataType dataType, 
int[] imageSize, float initValue, int topIndex) - { - int num_input = 1; - String[] input_names = {inputName}; - int[] n = {1}; - int[] c = {imageSize[0]}; - int[] h = {imageSize[1]}; - int[] w = {imageSize[2]}; - DataType[] dts = {dataType}; - DataFormat[] dfs = {DataFormat.NCHW}; - // constructor(modelCreate + ready) - BoltModel bolt_model = new BoltModel(modelPath, affinity, device, - num_input, input_names, n, c, h, w, dts, dfs); - - int length = imageSize[0]*imageSize[1]*imageSize[2]; - float[][] inputData = new float[1][length]; - for (int i = 0; i < length; i++) { - inputData[0][i] = initValue; - } - // model run - double startTime = getMillisTime(); - BoltResult bolt_result = bolt_model.Run(num_input, input_names, inputData); - double endTime = getMillisTime(); - System.out.println(outputPrefix + bolt_model.DeviceMapping(device) + ", " + bolt_model.AffinityMapping(affinity) - + ", classification " + String.format("%.3f", endTime - startTime) - + " ms/image, model " + modelPath); - - float[][] result = bolt_result.getResultData(); - int labelIndex = top1(result[0], 0, result[0].length); - if (labelIndex != topIndex) { - System.err.println("[ERROR] verify data classfication label failed " + labelIndex - + " " + topIndex + ", in Java API test"); - System.exit(1); - } - - // model destroy - bolt_model.Destructor(); - } - - public static void testSuites(String outputPrefix, DeviceType device, AffinityType affinity) { - String prefix = "/data/local/tmp/CI/java/tmp"; - - int[] image_3x224x224 = {3, 224, 224}; - int[] image_2x188x188 = {2, 188, 188}; - classification(outputPrefix, device, affinity, prefix+"/caffe_models/mobilenet_v1/mobilenet_v1_f16.bolt", - "data", DataType.FP16, image_3x224x224, 1, 499); - classification(outputPrefix, device, affinity, prefix+"/caffe_models/mobilenet_v2/mobilenet_v2_f16.bolt", - "data", DataType.FP16, image_3x224x224, 1, 813); - classification(outputPrefix, device, affinity, prefix+"/caffe_models/mobilenet_v3/mobilenet_v3_f16.bolt", - "data", DataType.FP16, image_3x224x224, 1, 892); - if (device == DeviceType.GPU) - return; - classification(outputPrefix, device, affinity, prefix+"/caffe_models/resnet50/resnet50_f16.bolt", - "data", DataType.FP16, image_3x224x224, 255, 506); - classification(outputPrefix, device, affinity, prefix+"/caffe_models/squeezenet/squeezenet_f16.bolt", - "data", DataType.FP16, image_3x224x224, 255, 310); - classification(outputPrefix, device, affinity, prefix+"/caffe_models/mobilenet_v1/mobilenet_v1_f32.bolt", - "data", DataType.FP32, image_3x224x224, 1, 499); - classification(outputPrefix, device, affinity, prefix+"/caffe_models/mobilenet_v2/mobilenet_v2_f32.bolt", - "data", DataType.FP32, image_3x224x224, 1, 813); - classification(outputPrefix, device, affinity, prefix+"/caffe_models/mobilenet_v3/mobilenet_v3_f32.bolt", - "data", DataType.FP32, image_3x224x224, 1, 892); - classification(outputPrefix, device, affinity, prefix+"/caffe_models/resnet50/resnet50_f32.bolt", - "data", DataType.FP32, image_3x224x224, 255, 506); - classification(outputPrefix, device, affinity, prefix+"/caffe_models/squeezenet/squeezenet_f32.bolt", - "data", DataType.FP32, image_3x224x224, 255, 310); - classification(outputPrefix, device, affinity, prefix+"/onnx_models/ghostnet/ghostnet_f16.bolt", - "MobileNetV2/MobileNetV2/Conv2d_0/Conv2D__6:0", DataType.FP16, image_3x224x224, 255, 789); - classification(outputPrefix, device, affinity, prefix+"/onnx_models/ghostnet/ghostnet_f32.bolt", - "MobileNetV2/MobileNetV2/Conv2d_0/Conv2D__6:0", DataType.FP32, 
image_3x224x224, 255, 789); - classification(outputPrefix, device, affinity, prefix+"/caffe_models/fingerprint_resnet18/fingerprint_resnet18_f16.bolt", - "Data", DataType.FP16, image_2x188x188, 1, 0); - classification(outputPrefix, device, affinity, prefix+"/caffe_models/fingerprint_resnet18/fingerprint_resnet18_f32.bolt", - "Data", DataType.FP32, image_2x188x188, 1, 0); - - tinybert_intent_slot(outputPrefix, device, affinity, prefix+"/caffe_models/tinybert384/tinybert384_int8_q.bolt"); - tinybert_intent_slot(outputPrefix, device, affinity, prefix+"/caffe_models/tinybert384/tinybert384_f16.bolt"); - tinybert_intent_slot(outputPrefix, device, affinity, prefix+"/caffe_models/tinybert384/tinybert384_f32.bolt"); - - tinybert_intent_slot(outputPrefix, device, affinity, prefix+"/caffe_models/tinybert/tinybert_f16.bolt"); - tinybert_intent_slot(outputPrefix, device, affinity, prefix+"/caffe_models/tinybert/tinybert_f32.bolt"); - tinybert_disambiguate(outputPrefix, device, affinity, prefix+"/caffe_models/tinybert_disambiguate/tinybert_disambiguate_f16.bolt", DataType.FP16); - tinybert_disambiguate(outputPrefix, device, affinity, prefix+"/caffe_models/tinybert_disambiguate/tinybert_disambiguate_f32.bolt", DataType.FP32); - nmt(outputPrefix, device, affinity, prefix+"/caffe_models/nmt/nmt_f16.bolt"); - nmt(outputPrefix, device, affinity, prefix+"/caffe_models/nmt/nmt_f32.bolt"); - nmt_tsc(outputPrefix, device, affinity, DataType.FP16, prefix+"/caffe_models/nmt_tsc_encoder/nmt_tsc_encoder_f16.bolt", - prefix+"/caffe_models/nmt_tsc_decoder/nmt_tsc_decoder_f16.bolt"); - nmt_tsc(outputPrefix, device, affinity, DataType.FP32, prefix+"/caffe_models/nmt_tsc_encoder/nmt_tsc_encoder_f32.bolt", - prefix+"/caffe_models/nmt_tsc_decoder/nmt_tsc_decoder_f32.bolt"); - - classification(outputPrefix, device, affinity, prefix+"/caffe_models/squeezenet/squeezenet_int8_q.bolt", - "data", DataType.FP16, image_3x224x224, 255, 310); - classification(outputPrefix, device, affinity, prefix+"/onnx_models/birealnet18/birealnet18_f16.bolt", - "0", DataType.FP16, image_3x224x224, 255, 565); - - asr(outputPrefix, device, affinity, prefix+"/caffe_models/asr_rnnt/asr_rnnt_f16.bolt", DataType.FP16); - asr(outputPrefix, device, affinity, prefix+"/caffe_models/asr_rnnt/asr_rnnt_f32.bolt", DataType.FP32); - tts(outputPrefix, device, affinity, - prefix+"/caffe_models/tts_encoder_decoder/tts_encoder_decoder_f16.bolt", - prefix+"/caffe_models/tts_postnet/tts_postnet_f16.bolt", - prefix+"/onnx_models/tts_melgan_vocoder/tts_melgan_vocoder_f16.bolt", - DataType.FP16); - tts(outputPrefix, device, affinity, - prefix+"/caffe_models/tts_encoder_decoder/tts_encoder_decoder_f32.bolt", - prefix+"/caffe_models/tts_postnet/tts_postnet_f32.bolt", - prefix+"/onnx_models/tts_melgan_vocoder/tts_melgan_vocoder_f32.bolt", - DataType.FP32); - } - - public static void main(String[] args) { - String outputPrefix = "[INFO] "; - if (args.length > 0) { - outputPrefix += args[0] + ", "; - } - testSuites(outputPrefix, DeviceType.CPU, AffinityType.HIGH_PERFORMANCE); - testSuites(outputPrefix, DeviceType.CPU, AffinityType.LOW_POWER); - testSuites(outputPrefix, DeviceType.GPU, AffinityType.LOW_POWER); - } -} diff --git a/tests/test_argmax.cpp b/tests/test_argmax.cpp deleted file mode 100644 index 3b5abc28..00000000 --- a/tests/test_argmax.cpp +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include "tensor_computing.h" -#include "ut_util.h" - -int argmaxTest(int argc, char** argv, DataType dt) { - CHECK_REQUIREMENT(argc == 6); - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - I32 axis = atoi(argv[5]); - - DataFormat df = DF_NCHW; - TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw); - U8* input = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM); - TensorDesc outDesc; - CHECK_STATUS(argmax_infer_output_size(inDesc, axis, &outDesc)); - U8* output = ut_input_v(tensorNumElements(outDesc), DT_U32, UT_INIT_ZERO); - U8* outputRef = ut_input_v(tensorNumElements(outDesc), DT_U32, UT_INIT_ZERO); - - if (UT_CHECK) { - CHECK_STATUS(argmax(inDesc, input, axis, outDesc, output, UT_ARCH)); - - // naive implement - CHECK_STATUS(argmax(inDesc, input, axis, outDesc, outputRef, CPU_GENERAL)); - - // check - ut_check_v(output, outputRef, tensorNumElements(outDesc), DT_U32, 0, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(argmax(inDesc, input, axis, outDesc, output, UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - U32 on, oh, ow; - CHECK_STATUS(tensor3dGet(outDesc, &dt, &df, &on, &oh, &ow)); - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u %u %u) %d =(%u %u %u)", - in, ic, ih, iw, axis, - on, oh, ow); - sprintf(buffer, "%20s, %80s", "Argmax", params); - double ops = 1.0 * in * ic * ih * iw; - ut_log(dt, buffer, ops, time/UT_LOOPS); - - free(output); - free(outputRef); - - return 0; -} - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - argmaxTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - argmaxTest(argc, argv, DT_F32); -#endif - return 0; -} diff --git a/tests/test_attention.cpp b/tests/test_attention.cpp deleted file mode 100644 index 50935509..00000000 --- a/tests/test_attention.cpp +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
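test_argmax.cpp, removed above, verified an argmax along one tensor axis against the naive CPU_GENERAL result. A self-contained sketch of the reference semantics for the channel axis of an NCHW buffer (plain int output here instead of the test's DT_U32):

#include <cstdio>
#include <vector>

// For every (n, h, w) position, emit the index of the largest value
// across the c channels of an NCHW tensor.
std::vector<int> argmax_channel(const std::vector<float> &x, int n, int c, int h, int w) {
    std::vector<int> out(n * h * w);
    for (int ni = 0; ni < n; ni++)
        for (int hi = 0; hi < h; hi++)
            for (int wi = 0; wi < w; wi++) {
                int best = 0;
                for (int ci = 1; ci < c; ci++) {
                    size_t cur = ((size_t)(ni * c + ci) * h + hi) * w + wi;
                    size_t bst = ((size_t)(ni * c + best) * h + hi) * w + wi;
                    if (x[cur] > x[bst]) best = ci;
                }
                out[((size_t)ni * h + hi) * w + wi] = best;
            }
    return out;
}

int main() {
    // One 2-channel 2x2 sample: channel 0 then channel 1, row-major.
    std::vector<float> x = {0.1f, 0.9f, 0.4f, 0.2f, 0.8f, 0.3f, 0.7f, 0.5f};
    for (int v : argmax_channel(x, 1, 2, 2, 2)) std::printf("%d ", v);  // 1 0 1 1
    std::printf("\n");
    return 0;
}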
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include "tensor_computing.h" -#include "ut_util.h" - -int attentionTest(int argc, char** argv, DataType dt) { - CHECK_REQUIREMENT(argc == 5); - U32 batch = atoi(argv[1]); - U32 numHeads = atoi(argv[2]); - U32 fromSequenceLength = atoi(argv[3]); - U32 toSequenceLength = atoi(argv[4]); - - DataFormat df = DF_NORMAL; - TensorDesc inputDesc = tensor2df(dt, df, batch, toSequenceLength); - TensorDesc outputDesc; - CHECK_STATUS(attention_infer_output_size(inputDesc, numHeads, fromSequenceLength, toSequenceLength, &outputDesc)); - U32 inputLength = tensorNumElements(inputDesc); - U32 outputLength = tensorNumElements(outputDesc); - - U8* input = ut_input_v(inputLength, dt, UT_INIT_ZERO); - for (U32 i = 0; i < batch; i++) { - U32 threshold = toSequenceLength / 2 + i; - for (U32 j = 0; j < toSequenceLength; j++) { - if (j < threshold) { - switch (dt) { -#ifdef _USE_FP32 - case DT_F32: - ((F32*)input)[i * toSequenceLength + j] = 1; - break; -#endif -#ifdef _USE_FP16 - case DT_F16: - ((F16*)input)[i * toSequenceLength + j] = 1; - break; -#endif - default: - break; - } - } - } - } - U8* output = ut_input_v(outputLength, dt, UT_INIT_ZERO); - U8* outputRef = ut_input_v(outputLength, dt, UT_INIT_ZERO); - if(UT_CHECK) { - CHECK_STATUS(attention(inputDesc, input, outputDesc, output, UT_ARCH)); - - // naive implement - CHECK_STATUS(attention(inputDesc, input, outputDesc, outputRef, CPU_GENERAL)); - - // check - ut_check_v(output, outputRef, outputLength, dt, 0, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for(int iter=0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(attention(inputDesc, input, outputDesc, output, UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u)=(%u %u %u %u)", - batch, fromSequenceLength, - batch, numHeads, fromSequenceLength, toSequenceLength); - sprintf(buffer, "%20s, %80s", "Attention", params); - double ops = 3.0 * outputLength; - ut_log(dt, buffer, ops, time/UT_LOOPS); - - free(input); - free(output); - free(outputRef); - - return 0; -} - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - attentionTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - attentionTest(argc, argv, DT_F32); -#endif - return 0; -} diff --git a/tests/test_check.cpp b/tests/test_check.cpp deleted file mode 100644 index e14428bf..00000000 --- a/tests/test_check.cpp +++ /dev/null @@ 
-1,80 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include "tensor_computing.h" -#include "ut_util.h" - -int checkTest(int argc, char** argv, DataType dt) { - CHECK_REQUIREMENT(argc == 5); - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - - DataFormat df = DF_NCHW; - CheckMode checkMode = CHECK_EQUAL; - - TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw); - U8* inputA = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM); - U8* inputB = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM); - TensorDesc outDesc; - CHECK_STATUS(check_infer_output_size(inDesc, &outDesc)); - I32* output = (I32*)ut_input_v(tensorNumElements(outDesc), DT_I32, UT_INIT_ZERO); - I32* outputRef = (I32*)ut_input_v(tensorNumElements(outDesc), DT_I32, UT_INIT_ZERO); - - if (UT_CHECK) { - CHECK_STATUS(check(inDesc, inputA, inDesc, inputB, checkMode, outDesc, output, UT_ARCH)); - - // naive implement - CHECK_STATUS(check(inDesc, inputA, inDesc, inputB, checkMode, outDesc, outputRef, CPU_GENERAL)); - - // check - ut_check_v(output, outputRef, tensorNumElements(outDesc), DT_I32, 0, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(check(inDesc, inputA, inDesc, inputB, checkMode, outDesc, output, UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u %u %u)=(%u)", - in, ic, ih, iw, - in); - sprintf(buffer, "%20s, %80s", "Check", params); - double ops = 1.0 * in * ic * ih * iw; - ut_log(dt, buffer, ops, time/UT_LOOPS); - - free(output); - free(outputRef); - - return 0; -} - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - checkTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - checkTest(argc, argv, DT_F32); -#endif - checkTest(argc, argv, DT_U32); - return 0; -} diff --git a/tests/test_clip.cpp b/tests/test_clip.cpp deleted file mode 100644 index b677224d..00000000 --- a/tests/test_clip.cpp +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
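test_check.cpp, removed above, compared two tensors under CHECK_EQUAL; its log line maps an (n c h w) input pair to an output of length n, which suggests one I32 flag per batch sample. That per-sample "all equal" reduction is an assumption, not something the test spells out; a sketch under it:

#include <cstdio>
#include <vector>

// One flag per batch sample: 1 if every element of a matches b, else 0.
std::vector<int> check_equal(const std::vector<float> &a, const std::vector<float> &b,
                             int n, int per_sample) {
    std::vector<int> out(n, 1);
    for (int ni = 0; ni < n; ni++)
        for (int i = 0; i < per_sample; i++)
            if (a[ni * per_sample + i] != b[ni * per_sample + i]) { out[ni] = 0; break; }
    return out;
}

int main() {
    std::vector<float> a = {1, 2, 3, 4}, b = {1, 2, 0, 4};
    for (int v : check_equal(a, b, 2, 2)) std::printf("%d ", v);  // 1 0
    std::printf("\n");
    return 0;
}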
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#include "ut_util.h" - -int clipTest(int argc, char** argv, DataType dt) { - CHECK_REQUIREMENT(argc == 4); - U32 len = atoi(argv[1]); - F32 min_value = atof(argv[2]); - F32 max_value = atof(argv[3]); - - TensorDesc input_desc = tensor1d(dt, len); - TensorDesc output_desc; - CHECK_STATUS(clip_infer_output_size(input_desc, &output_desc, UT_ARCH)); - - U8* input = ut_input_v(len, dt, UT_INIT_RANDOM); - U8* output = ut_input_v(len, dt, UT_INIT_ZERO); - U8* output_ref = ut_input_v(len, dt, UT_INIT_ZERO); - - if (UT_CHECK) { - CHECK_STATUS(clip(&min_value, &max_value, input_desc, input, output_desc, output, UT_ARCH)); - - // naive implement - CHECK_STATUS(clip(&min_value, &max_value, input_desc, input, output_desc, output_ref, CPU_GENERAL)); - - // check - ut_check_v(output, output_ref, len, dt, 0, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter ++) { - CHECK_STATUS(clip(&min_value, &max_value, input_desc, input, output_desc, output, UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u)=(%u)", - len, len); - sprintf(buffer, "%20s, %80s", "Clip", params); - double ops = 2.0 * len; - ut_log(dt, buffer, ops, time/UT_LOOPS); - - free(input); - free(output); - free(output_ref); - - return 0; -} - - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - clipTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - clipTest(argc, argv, DT_F32); -#endif - return 0; -} diff --git a/tests/test_concat.cpp b/tests/test_concat.cpp deleted file mode 100644 index 809cec07..00000000 --- a/tests/test_concat.cpp +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
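The clip test deleted above clamps a 1-D tensor into [min_value, max_value]; its "ops = 2.0 * len" estimate counts the two comparisons per element. The reference behavior in standard C++:

#include <algorithm>
#include <cstdio>
#include <vector>

// y[i] = min(max(x[i], lo), hi) -- one comparison against each bound.
void clip(const std::vector<float> &x, std::vector<float> &y, float lo, float hi) {
    for (size_t i = 0; i < x.size(); i++) y[i] = std::min(std::max(x[i], lo), hi);
}

int main() {
    std::vector<float> x = {-2.f, -0.5f, 0.3f, 1.7f}, y(4);
    clip(x, y, -1.f, 1.f);
    for (float v : y) std::printf("%g ", v);  // -1 -0.5 0.3 1
    std::printf("\n");
    return 0;
}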
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include - -#include "tensor_computing.h" -#include "ut_util.h" - -int concatTest(int argc, char** argv, DataType dt) { - CHECK_REQUIREMENT(argc > 2); - int num = atoi(argv[1]); - int axis = atoi(argv[2]); - CHECK_REQUIREMENT(axis == 0 || axis == 1); - CHECK_REQUIREMENT(argc == 1 + 2 + (num+1)*4); - - std::vector in_desc(num); - TensorDesc out_desc; - std::vector> in_dims(num); - for(int i=0; i in_dim(4); - in_dim[0] = atoi(argv[3 + i * 4]); - in_dim[1] = atoi(argv[3 + i * 4 + 1]); - in_dim[2] = atoi(argv[3 + i * 4 + 2]); - in_dim[3] = atoi(argv[3 + i * 4 + 3]); - in_dims[i] = in_dim; - in_desc[i] = tensor4df(dt, DF_NCHWC8, in_dim[0], in_dim[1], in_dim[2], in_dim[3]); - } - U32 on = atoi(argv[3 + num * 4]); - U32 oc = atoi(argv[3 + num * 4 + 1]); - U32 oh = atoi(argv[3 + num * 4 + 2]); - U32 ow = atoi(argv[3 + num * 4 + 3]); - - CHECK_STATUS(concat_infer_output_size(in_desc, &out_desc, axis, UT_ARCH)); - - U32 in_len = 0; - for(int i=0; i input(num); - U8 *tmp = ut_input_v(in_len, dt, UT_INIT_RANDOM); - U32 count = 0; - for(int i=0; i - -#include "tensor_computing.h" -#include "ut_util.h" - -#ifdef _USE_INT8 -int int8ConcatTest(int argc, char** argv, DataType dt){ - CHECK_REQUIREMENT(argc > 2); - int num = atoi(argv[1]); - U32 axis = atoi(argv[2]); - CHECK_REQUIREMENT(axis == 0 || axis == 1); - CHECK_REQUIREMENT(argc == 1 + 2 + (num+1)*4); - - std::vector in_desc(num); - std::vector in_desc_ref(num); - TensorDesc out_desc; - std::vector> in_dims(num); - for (int i = 0; i < num; i++){ - std::vector in_dim(4); - in_dim[0] = atoi(argv[3 + i * 4]); - in_dim[1] = atoi(argv[3 + i * 4 + 1]); - in_dim[2] = atoi(argv[3 + i * 4 + 2]); - in_dim[3] = atoi(argv[3 + i * 4 + 3]); - in_dims[i] = in_dim; - in_desc[i] = tensor4df(DT_I8, DF_NCHWC8, in_dim[0], in_dim[1], in_dim[2], in_dim[3]); - in_desc_ref[i] = in_desc[i]; - in_desc_ref[i].dt = dt; - } - U32 on = atoi(argv[3 + num * 4]); - U32 oc = atoi(argv[3 + num * 4 + 1]); - U32 oh = atoi(argv[3 + num * 4 + 2]); - U32 ow = atoi(argv[3 + num * 4 + 3]); - - CHECK_STATUS(concat_infer_output_size(in_desc, &out_desc, axis, UT_ARCH)); - - U32 in_len = 0; - for (int i = 0; i < num; i++){ - in_len += tensorNumElements(in_desc[i]); - } - U32 out_len = tensorNumElements(out_desc); - CHECK_REQUIREMENT(in_len == out_len && out_len == on * oc * oh * ow); - - std::vector input_ref(num); - std::vector input(num); - - U8 *tmp = ut_input_v(in_len, dt, UT_INIT_RANDOM); - INT8 *quant = (INT8*)ut_input_v(in_len, DT_I8, UT_INIT_RANDOM); - - U32 count = 0; - std::vector scale_i(num); - - for (int i = 0; i < num; i++){ - input_ref[i] = (void *)(tmp + count * bytesOf(dt)); - input[i] = (void *)(quant + count); - F16 scale = -1; - quantize_tensor(in_desc_ref[i], input_ref[i], &(in_desc[i]), input[i], &scale); - scale_i[i] = scale; - count += tensorNumElements(in_desc[i]); - } - - INT8 *output = (INT8*)ut_input_v(out_len, DT_I8, UT_INIT_ZERO); - U8 *out_d = ut_input_v(out_len, dt, UT_INIT_ZERO); - F32 scale_o; - - if (UT_CHECK) { - CHECK_STATUS(concat(in_desc, input, scale_i.data(), out_desc, output, 
&scale_o, axis, UT_ARCH)); - - for (U32 i = 0; i < out_len; i++) { - switch (dt) { -#ifdef _USE_FP16 - case DT_F16: - ((F16*)out_d)[i] = output[i] / scale_o; - break; -#endif -#ifdef _USE_FP32 - case DT_F32: - ((F32*)out_d)[i] = output[i] / scale_o; - break; -#endif - default: - break; - } - } - - // check - ut_check_v(out_d, tmp, in_len, dt, 0.05, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for(int iter = 0; iter < UT_LOOPS; iter++){ - CHECK_STATUS(concat(in_desc, input, scale_i.data(), out_desc, output, &scale_o, axis, UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "%d (*)/%u=(%u %u %u %u)", - num, axis, on, oc, oh, ow); - sprintf(buffer, "%20s, %80s", "Concat", params); - double ops = 1.0 * out_len; - ut_log(DT_I8, buffer, ops, time); - - free(tmp); - free(output); - free(out_d); - return 0; -} -#endif - -int main(int argc, char** argv) { -#ifdef _USE_INT8 - int8ConcatTest(argc, argv, DT_F16); -#endif - return 0; -} diff --git a/tests/test_convolution.cpp b/tests/test_convolution.cpp deleted file mode 100644 index e1e158cb..00000000 --- a/tests/test_convolution.cpp +++ /dev/null @@ -1,171 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
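The int8 concat test above gives every input its own quantization scale but produces a single output scale, then divides the int8 output by scale_o before comparing with the FP reference at tolerance 0.05. Concatenation therefore has to requantize each input into the common scale. A sketch of that idea; choosing the smallest input scale (widest representable range) as the output scale is one safe policy, not necessarily the exact one Bolt's kernel uses:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Concatenate int8 tensors carrying different scales (q = x * scale):
// pick one output scale, requantize every input into it.
std::vector<int8_t> concat_int8(const std::vector<std::vector<int8_t>> &ins,
                                const std::vector<float> &scales, float *scale_o) {
    *scale_o = scales[0];
    for (float s : scales) *scale_o = std::min(*scale_o, s);
    std::vector<int8_t> out;
    for (size_t k = 0; k < ins.size(); k++) {
        float r = *scale_o / scales[k];  // requantization ratio for input k
        for (int8_t q : ins[k]) out.push_back((int8_t)std::lrintf(q * r));
    }
    return out;
}

int main() {
    float so;
    auto out = concat_int8({{100, -60}, {20, 40}}, {200.f, 50.f}, &so);
    // Dequantize with the single output scale, as the deleted test does
    // with output[i] / scale_o before comparing against the FP reference.
    for (int8_t q : out) std::printf("%.3f ", q / so);  // 0.500 -0.300 0.400 0.800
    std::printf("\n");
    return 0;
}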
- - -#include -#include "tensor_computing.h" -#include "ut_util.h" - -int convolutionTest(int argc, char* argv[], DataType dt) -{ - CHECK_REQUIREMENT(argc == 15); - // in data - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - // weight - U32 fn = atoi(argv[5]); - U32 fc = atoi(argv[6]); - U32 fh = atoi(argv[7]); - U32 fw = atoi(argv[8]); - // stride & padding - U32 stride = atoi(argv[9]); - U32 padding = atoi(argv[10]); - // output - U32 on = atoi(argv[11]); - U32 oc = atoi(argv[12]); - U32 oh = atoi(argv[13]); - U32 ow = atoi(argv[14]); - - CHECK_REQUIREMENT(in == 1 && on == 1); - - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_RELU; - activationDesc.value[0] = 0; - - TensorDesc inputDesc, filterDesc, outputDesc, biasDesc; - ConvolutionDesc convDesc; - if (ic % 8 != 0) { - inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); - } else { - inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); - } - filterDesc = tensor4df(dt, DF_NCHW, oc, ic, fh, fw); - biasDesc = tensor1d(dt, oc); - convDesc.stride_h = stride; - convDesc.stride_w = stride; - convDesc.padding_top = padding; - convDesc.padding_bottom = padding; - convDesc.padding_left = padding; - convDesc.padding_right = padding; - convDesc.dilatedRate_h = 1; - convDesc.dilatedRate_w = 1; - - // setup input, filter, bias - U8 *input = ut_input_v(in*ic*ih*iw, dt, UT_INIT_RANDOM); - U8 *filter = ut_input_v(fn*fc*fh*fw, dt, UT_INIT_RANDOM); - U8 *bias = ut_input_v(oc, dt, UT_INIT_RANDOM); - U8 *input_ref = ut_input_v(in*ic*ih*iw, dt, UT_INIT_ZERO); - U8 *filter_ref = ut_input_v(fn*fc*fh*fw, dt, UT_INIT_ZERO); - memcpy(input_ref, input, bytesOf(dt)*in*ic*ih*iw); - memcpy(filter_ref, filter, bytesOf(dt)*fn*fc*fh*fw); - - // setup output, bias - U32 outputBytes; - CHECK_STATUS(convolution_infer_output_size(inputDesc, filterDesc, convDesc, &outputDesc, dt, &outputBytes, UT_ARCH)); - U32 output_size = outputBytes / bytesOf(dt); - U8 *output = ut_input_v(output_size, dt, UT_INIT_ZERO); - U8 *output_ref = ut_input_v(output_size, dt, UT_INIT_ZERO); - - // setup alg - ConvolutionPolicy policy = CONVOLUTION_FASTEST; - ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL; - CHECK_STATUS(convolution_infer_forward_algorithm(inputDesc, filterDesc, outputDesc, convDesc, policy, &alg, dt, activationDesc, UT_ARCH)); - - // setup tmp - U32 tmpBytes; - CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputDesc, filterDesc, outputDesc, convDesc, alg, &tmpBytes, UT_ARCH)); - U8 *tmp = ut_input_v(tmpBytes/bytesOf(dt), dt, UT_INIT_ZERO); - - // setup filter trans - U32 ftmBytes; - CHECK_STATUS(convolution_transform_filter_bytes(filterDesc, alg, &ftmBytes, UT_ARCH)); - U8 *ftm = ut_input_v(ftmBytes/bytesOf(dt), dt, UT_INIT_ZERO); - // trans filter - TensorDesc ftmDesc; - CHECK_STATUS(convolution_transform_filter(filterDesc, filter, alg, &ftmDesc, ftm, tmp, UT_ARCH)); - - if (UT_CHECK) { - CHECK_STATUS(convolution(inputDesc, input, - ftmDesc, ftm, - convDesc, alg, - biasDesc, nullptr, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - activationDesc, UT_ARCH)); - - // naive implement - CHECK_STATUS(convolution(inputDesc, input_ref, - filterDesc, filter_ref, - convDesc, alg, - biasDesc, nullptr, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output_ref, - activationDesc, CPU_GENERAL)); - - // check - ut_check_v(output, output_ref, output_size, dt, 5, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++) { - 
CHECK_STATUS(convolution(inputDesc, input, - ftmDesc, ftm, - convDesc, alg, - biasDesc, nullptr, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - activationDesc, UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", - in, ic, ih, iw, - fn, fc, fh, fw, - stride, padding, - on, oc, oh, ow); - sprintf(buffer, "%20s, %80s", "Convolution", params); - double ops = (1.0 * on * oc * oh * ow) * (2.0 * ic * fh * fw + 1); - ut_log(dt, buffer, ops, time); - - free(input); - free(filter); - free(bias); - free(output); - free(input_ref); - free(filter_ref); - free(output_ref); - free(tmp); - free(ftm); - return 0; -} - - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - convolutionTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - convolutionTest(argc, argv, DT_F32); -#endif - return 0; -} diff --git a/tests/test_convolution_bnn.cpp b/tests/test_convolution_bnn.cpp deleted file mode 100644 index 0cd4a897..00000000 --- a/tests/test_convolution_bnn.cpp +++ /dev/null @@ -1,188 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
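The convolution tests (this one and the variants that follow) report performance from an analytic operation count: each output element costs ic*fh*fw multiply-adds, at two flops each, plus one bias add — hence ops = on*oc*oh*ow * (2*ic*fh*fw + 1). A worked example of that arithmetic:

#include <cstdio>

// Flop count behind the tests' GFLOPS log line.
double conv_flops(double on, double oc, double oh, double ow,
                  double ic, double fh, double fw) {
    return on * oc * oh * ow * (2.0 * ic * fh * fw + 1.0);
}

int main() {
    // A 3x3, stride-1, 64-channel layer on a 56x56 map (ResNet-style shape).
    double f = conv_flops(1, 64, 56, 56, 64, 3, 3);
    std::printf("%.3f GFLOP per inference\n", f / 1e9);  // ~0.231
    return 0;
}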
- - -#include -#include "tensor_computing.h" -#include "ut_util.h" - -int bnnConvolutionTest(int argc, char* argv[], DataType dt) { - CHECK_REQUIREMENT(argc == 15); - // in data - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - // weight - U32 fn = atoi(argv[5]); - U32 fc = atoi(argv[6]); - U32 fh = atoi(argv[7]); - U32 fw = atoi(argv[8]); - // stride & padding - U32 stride = atoi(argv[9]); - U32 padding = atoi(argv[10]); - // output - U32 on = atoi(argv[11]); - U32 oc = atoi(argv[12]); - U32 oh = atoi(argv[13]); - U32 ow = atoi(argv[14]); - - CHECK_REQUIREMENT(in == 1 && on == 1); - - DataType fdt = DT_BIN11; // Use dt to distinguish DoReFa and XNOR - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_NULL; - - TensorDesc inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); - TensorDesc filterDesc = tensor4df(fdt, DF_NCHW, oc, ic, fh, fw); - TensorDesc scaleDesc = tensor1d(dt, oc); - TensorDesc biasDesc = tensor1d(dt, oc); - ConvolutionDesc convDesc; - convDesc.stride_h = stride; - convDesc.stride_w = stride; - convDesc.padding_top = padding; - convDesc.padding_bottom = padding; - convDesc.padding_left = padding; - convDesc.padding_right = padding; - convDesc.dilatedRate_h = 1; - convDesc.dilatedRate_w = 1; - - // setup input, filter, bias - U8 *input = ut_input_v(in*ic*ih*iw, dt, UT_INIT_RANDOM); - if (fdt == DT_BIN01) { - for (U32 i = 0; i < in*ic*ih*iw; i++) { - switch (dt) { -#ifdef _USE_FP16 - case DT_F16: - ((F16*)input)[i] += 0.5; - break; -#endif -#ifdef _USE_FP32 - case DT_F32: - ((F32*)input)[i] += 0.5; - break; -#endif - default: - break; - } - } - } - - BIN8 *filter = (BIN8*)ut_input_v(fn*fc*fh*fw/8, fdt, UT_INIT_POS); - U8 *scale = ut_input_v(oc, dt, UT_INIT_RANDOM); - U8 *bias = ut_input_v(oc, dt, UT_INIT_RANDOM); - U8 *input_ref = ut_input_v(in*ic*ih*iw, dt, UT_INIT_ZERO); - BIN8 *filter_ref = (BIN8*)ut_input_v(fn*fc*fh*fw/8, fdt, UT_INIT_ZERO); - memcpy(input_ref, input, bytesOf(dt)*in*ic*ih*iw); - memcpy(filter_ref, filter, sizeof(BIN8)*fn*fc*fh*fw/8); - - // setup output, bias - U32 outputBytes; - TensorDesc outputDesc; - CHECK_STATUS(convolution_infer_output_size(inputDesc, filterDesc, convDesc, &outputDesc, dt, &outputBytes, UT_ARCH)); - U32 output_size = outputBytes / bytesOf(dt); - - U8 *output = ut_input_v(output_size, dt, UT_INIT_ZERO); - U8 *output_ref = ut_input_v(output_size, dt, UT_INIT_ZERO); - - // setup alg - ConvolutionPolicy policy = CONVOLUTION_FASTEST; - ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL; - CHECK_STATUS(convolution_infer_forward_algorithm(inputDesc, filterDesc, outputDesc, convDesc, policy, &alg, fdt, activationDesc, UT_ARCH)); - - // setup tmp - U32 tmpBytes; - CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputDesc, filterDesc, outputDesc, convDesc, alg, &tmpBytes, UT_ARCH)); - - BIN8 *tmp = (BIN8*)ut_input_v(tmpBytes/sizeof(BIN8), fdt, UT_INIT_ZERO); - - // setup filter trans - U32 ftmBytes; - CHECK_STATUS(convolution_transform_filter_bytes(filterDesc, alg, &ftmBytes, UT_ARCH)); - BIN8 *ftm = (BIN8*)ut_input_v(ftmBytes/sizeof(BIN8), fdt, UT_INIT_ZERO); - // trans filter - TensorDesc ftmDesc; - CHECK_STATUS(convolution_transform_filter(filterDesc, filter, alg, &ftmDesc, ftm, tmp, UT_ARCH)); - - if (UT_CHECK) { - CHECK_STATUS(convolution(inputDesc, input, - ftmDesc, ftm, - convDesc, alg, - scaleDesc, scale, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - activationDesc, UT_ARCH)); - - // naive implement - CHECK_STATUS(convolution(inputDesc, 
input_ref, - filterDesc, filter, - convDesc, alg, - scaleDesc, scale, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output_ref, - activationDesc, CPU_GENERAL)); - - // check - ut_check_v(output, output_ref, output_size, dt, 1, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(convolution(inputDesc, input, - ftmDesc, ftm, - convDesc, alg, - scaleDesc, scale, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - activationDesc, UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", - in, ic, ih, iw, - fn, fc, fh, fw, - stride, padding, - on, oc, oh, ow); - sprintf(buffer, "%20s, %80s", "BNN Convolution", params); - double ops = (1.0 * on * oc * oh * ow) * (2.0 * ic * fh * fw + 1); - ut_log(DT_I8, buffer, ops, time); - - free(input); - free(filter); - free(bias); - free(output); - free(input_ref); - free(filter_ref); - free(output_ref); - free(tmp); - free(ftm); - return 0; -} - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - bnnConvolutionTest(argc, argv, DT_F16); -#endif -/*#ifdef _USE_FP32 - bnnConvolutionTest(argc, argv, DT_F32); -#endif*/ - return 0; -} diff --git a/tests/test_convolution_int8.cpp b/tests/test_convolution_int8.cpp deleted file mode 100644 index 9732ed22..00000000 --- a/tests/test_convolution_int8.cpp +++ /dev/null @@ -1,231 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
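The BNN test above packs binary weights eight per byte (BIN8 buffers of size fn*fc*fh*fw/8) and distinguishes DT_BIN11 (±1 values) from DT_BIN01 (0/1 — the reason the test shifts inputs by +0.5 on that path). For ±1 operands the dot product reduces to XNOR plus popcount; a sketch of that arithmetic, with the bit order inside a byte being an assumption rather than Bolt's documented layout:

#include <bitset>
#include <cstdint>
#include <cstdio>

// Binary (+1/-1) dot product of 8 lanes packed one bit per lane:
// matches = popcount(~(a ^ b)); dot = matches - mismatches = 2*matches - 8.
int xnor_dot(uint8_t a, uint8_t b) {
    int matches = std::bitset<8>((uint8_t)~(a ^ b)).count();
    return 2 * matches - 8;
}

int main() {
    // a = +1 -1 +1 +1 -1 -1 +1 -1 ; b = +1 -1 -1 +1 -1 +1 +1 -1
    uint8_t a = 0b10110010, b = 0b10010110;
    std::printf("%d\n", xnor_dot(a, b));  // 4 (6 matches, 2 mismatches)
    return 0;
}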
- - -#include -#include "tensor_computing.h" -#include "ut_util.h" - -#ifdef _USE_INT8 -int int8ConvolutionTest(int argc, char* argv[], DataType dt, DataType filterDataType) { - CHECK_REQUIREMENT(argc == 15); - // in data - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - // weight - U32 fn = atoi(argv[5]); - U32 fc = atoi(argv[6]); - U32 fh = atoi(argv[7]); - U32 fw = atoi(argv[8]); - // stride & padding - U32 stride = atoi(argv[9]); - U32 padding = atoi(argv[10]); - // output - U32 on = atoi(argv[11]); - U32 oc = atoi(argv[12]); - U32 oh = atoi(argv[13]); - U32 ow = atoi(argv[14]); - - CHECK_REQUIREMENT(in == 1 && on == 1); - - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_RELU; - activationDesc.value[0] = 0; - - TensorDesc inputDesc, filterDesc, outputDesc, biasDesc; - ConvolutionDesc convDesc; - if (ic == 3 || ic == 1) { - printf("[WARN] can not quantize the first layer\n"); - return 0; - } else { - DataType qdt = DT_I8; - TensorDesc inputDesc_ref = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); - filterDesc = tensor4df(dt, DF_NCHW, oc, ic, fh, fw); - biasDesc = tensor1d(dt, oc); - convDesc.stride_h = stride; - convDesc.stride_w = stride; - convDesc.padding_top = padding; - convDesc.padding_bottom = padding; - convDesc.padding_left = padding; - convDesc.padding_right = padding; - convDesc.dilatedRate_h = 1; - convDesc.dilatedRate_w = 1; - - // setup input, filter, bias - U8 *input_ref = ut_input_v(in*ic*ih*iw, dt, UT_INIT_RANDOM); - U8 *filter = ut_input_v(fn*fc*fh*fw, dt, UT_INIT_RANDOM); - U8 *bias = ut_input_v(oc, dt, UT_INIT_RANDOM); - - INT8 *input = (INT8*)ut_input_v(in*ic*ih*iw, DT_I8, UT_INIT_ZERO); - F16 scale_i = -1; - quantize_tensor(inputDesc_ref, input_ref, &inputDesc, input, &scale_i); - - U8 *filter_ref = ut_input_v(fn*fc*fh*fw, dt, UT_INIT_ZERO); - memcpy(filter_ref, filter, bytesOf(dt)*fn*fc*fh*fw); - - // setup output, bias - U32 outputBytes; - CHECK_STATUS(convolution_infer_output_size(inputDesc, filterDesc, convDesc, &outputDesc, qdt, &outputBytes, UT_ARCH)); - TensorDesc outputDesc_ref = outputDesc; - outputDesc_ref.dt = dt; - U32 output_size = outputBytes / bytesOf(qdt); - INT8 *output = (INT8*)ut_input_v(output_size, DT_I8, UT_INIT_ZERO); - U8 *output_ref = ut_input_v(output_size, dt, UT_INIT_ZERO); - - // setup alg - ConvolutionPolicy policy = CONVOLUTION_FASTEST; - ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL; - CHECK_STATUS(convolution_infer_forward_algorithm(inputDesc, filterDesc, outputDesc, convDesc, policy, &alg, DT_I8, activationDesc, UT_ARCH)); - - F16 *scales; - - // setup filter trans - U32 ftBytes; - - TensorDesc ftmDesc; - INT8 *ftm; - - switch (alg) { - case CONVOLUTION_ALGORITHM_WINOGRAD: { - CHECK_STATUS(convolution_transform_filter_bytes(filterDesc, alg, &ftBytes, UT_ARCH)); - - TensorDesc tFilterDesc; - U8 *tFilter = ut_input_v(ftBytes/bytesOf(dt), dt, UT_INIT_ZERO); - - filterDesc.dt = filterDataType; // To label as int8 - CHECK_STATUS(convolution_transform_filter(filterDesc, filter, alg, &tFilterDesc, tFilter, NULL, UT_ARCH)); - filterDesc.dt = dt; - - ftm = (INT8*)ut_input_v(fn*fc*6*6, DT_I8, UT_INIT_ZERO); - - scales = (F16*)ut_input_v(38, DT_F16, UT_INIT_ZERO); // 1 for input, 1 for output and 36 for filter - CHECK_STATUS(quantize_tensor(tFilterDesc, tFilter, &ftmDesc, ftm, scales+2)); - - free(tFilter); - break; - } - default: { - TensorDesc qFilterDesc; - INT8 *qFilter = (INT8*)ut_input_v(fn*fc*fh*fw, DT_I8, UT_INIT_ZERO); - scales = (F16*)ut_input_v(3, 
DT_F16, UT_INIT_ZERO); - CHECK_STATUS(quantize_tensor(filterDesc, filter, &qFilterDesc, qFilter, scales+2)); - - CHECK_STATUS(convolution_transform_filter_bytes(qFilterDesc, alg, &ftBytes, UT_ARCH)); - - ftm = (INT8*)ut_input_v(ftBytes/sizeof(INT8), DT_I8, UT_INIT_ZERO); - // trans filter - CHECK_STATUS(convolution_transform_filter(qFilterDesc, qFilter, alg, &ftmDesc, ftm, NULL, UT_ARCH)); - - free(qFilter); - break; - } - } - - scales[0] = scale_i; - - // setup tmp - U32 tmpBytes; - CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputDesc, ftmDesc, outputDesc, convDesc, alg, &tmpBytes, UT_ARCH)); - INT8 *tmp = (INT8*)ut_input_v(tmpBytes/sizeof(INT8), DT_I8, UT_INIT_ZERO); - - if (UT_CHECK) { - CHECK_STATUS(convolution(inputDesc, input, - ftmDesc, ftm, - convDesc, alg, - biasDesc, scales, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - activationDesc, UT_ARCH)); - - // naive implement - CHECK_STATUS(convolution(inputDesc_ref, input_ref, - filterDesc, filter_ref, - convDesc, alg, - biasDesc, nullptr, - biasDesc, bias, - tmpBytes, tmp, - outputDesc_ref, output_ref, - activationDesc, CPU_GENERAL)); - - U8 *out_d = ut_input_v(output_size, dt, UT_INIT_ZERO); - for (U32 i = 0; i < output_size; i++) { - switch (dt) { -#ifdef _USE_FP32 - case DT_F32: - ((F32*)out_d)[i] = output[i] / scales[1]; - break; -#endif -#ifdef _USE_FP16 - case DT_F16: - ((F16*)out_d)[i] = output[i] / scales[1]; - break; -#endif - default: - break; - } - } - ut_check_v(out_d, output_ref, output_size, dt, 8, __FILE__, __LINE__); - free(out_d); - } - - // benchmark - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++){ - CHECK_STATUS(convolution(inputDesc, input, - ftmDesc, ftm, - convDesc, alg, - biasDesc, scales, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - activationDesc, UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", - in, ic, ih, iw, - fn, fc, fh, fw, - stride, padding, - on, oc, oh, ow); - sprintf(buffer, "%20s, %80s", "Convolution", params); - double ops = (1.0 * on * oc * oh * ow) * (2.0 * ic * fh * fw + 1); - ut_log(DT_I8, buffer, ops, time); - - free(input); - free(filter); - free(bias); - free(output); - free(input_ref); - free(filter_ref); - free(output_ref); - free(tmp); - free(ftm); - free(scales); - } - return 0; -} -#endif - -int main(int argc, char** argv) { -#ifdef _USE_INT8 - int8ConvolutionTest(argc, argv, DT_F16, DT_F16_8Q); -#endif - return 0; -} diff --git a/tests/test_convolution_ocl.cpp b/tests/test_convolution_ocl.cpp deleted file mode 100644 index 4df9ae52..00000000 --- a/tests/test_convolution_ocl.cpp +++ /dev/null @@ -1,240 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
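The int8 convolution test quantizes tensors through quantize_tensor(desc, data, &qdesc, qdata, &scale) and recovers real values as output[i] / scales[1]; its winograd branch keeps 38 scales (one for the input, one for the output, and one per position of the 6x6 transformed filter). A plausible symmetric per-tensor quantizer consistent with that q/scale convention — the exact rule inside Bolt is an assumption:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Symmetric int8: scale = 127 / max|x|, q = round(x * scale), x ~ q / scale.
float quantize(const std::vector<float> &x, std::vector<int8_t> &q) {
    float maxabs = 0;
    for (float v : x) maxabs = std::max(maxabs, std::fabs(v));
    float scale = maxabs > 0 ? 127.0f / maxabs : 1.0f;
    q.resize(x.size());
    for (size_t i = 0; i < x.size(); i++) q[i] = (int8_t)std::lrintf(x[i] * scale);
    return scale;
}

int main() {
    std::vector<float> x = {0.5f, -2.0f, 1.0f};
    std::vector<int8_t> q;
    float s = quantize(x, q);
    std::printf("scale=%.2f q=%d %d %d\n", s, q[0], q[1], q[2]);  // scale=63.50 q=32 -127 64
    return 0;
}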
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include "tensor_computing.h" -#include "ut_util.h" -#include "gcl.h" -#include "libkernelbin.h" - -int convolutionTest(int argc, char* argv[], DataType dt) -{ - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 stride, padding; - U32 on, oc, oh, ow; - U32 biasNum; - Arch arch = MALI; - - in = 1; - ic = 4; - ih = 4; - iw = 4; - fn = 4; - fh = 3; - fw = 3; - stride = 1; - padding = 1; - - if(argc == 9) { - ic = atoi(argv[1]); - ih = atoi(argv[2]); - iw = atoi(argv[3]); - fn = atoi(argv[4]); - fh = atoi(argv[5]); - fw = atoi(argv[6]); - stride = atoi(argv[7]); - padding = atoi(argv[8]); - } - - fc = ic; - - on = 1; - oc = fn; - oh = (ih + padding * 2 - fh) / stride + 1; - ow = (iw + padding * 2 - fw) / stride + 1; - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_NULL; - - TensorDesc inputDesc, inputDesc_gpu, filterDesc, outputDesc, biasDesc; - ConvolutionDesc convDesc; - convDesc.stride_h = stride; - convDesc.stride_w = stride; - convDesc.padding_top = padding; - convDesc.padding_bottom = padding; - convDesc.padding_left = padding; - convDesc.padding_right = padding; - convDesc.dilatedRate_h = 1; - convDesc.dilatedRate_w = 1; - - inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); - filterDesc = tensor4df(dt, DF_NCHW, fn, fc, fh, fw); - biasDesc = tensor1d(dt, oc); - U8 *input_cpu = ut_input_v(in*ic*ih*iw, dt, UT_INIT_RANDOM); - U8 *filter_cpu = ut_input_v(fn*fc*fh*fw, dt, UT_INIT_RANDOM); - U8 *bias_cpu = ut_input_v(oc, dt, UT_INIT_RANDOM); - U8 *output_cpu = ut_input_v(on*oc*oh*ow, dt, UT_INIT_ZERO); -// U8 *output_gpu = ut_input_v(on*oc*oh*ow, dt, UT_INIT_ZERO); - U8 *output_gpu = NULL; -// inputDesc_gpu = tensor4df(dt, DF_NCHW_ORG_MALI, in, ic, ih, iw);//first layer test - inputDesc_gpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); - - GCLHandle_t handle; - CHECK_STATUS(gcl_create_handle(&handle)); - CHECK_STATUS(gcl_regist_binMap(handle)); - - ExtInfo extInfo; - U32 str[3] = {0, 0, 0}; - U32 off[3] = {0, 0, 0}; - GCLMemDesc inputMemDesc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); - GCLMemDesc outputMemDesc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); - GCLMemDesc filterMemDesc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); - ForwardRunInfoMali runInfo; - runInfo.algorithm = (I32)(CONVOLUTION_ALGORITHM_NULL); - extInfo.maliInfo.handle = handle; - extInfo.maliInfo.gclmemInputDesc = NULL; - extInfo.maliInfo.gclmemOutputDesc = NULL; - extInfo.maliInfo.gclmemFilterDesc = NULL; - extInfo.maliInfo.forwardRunInfo = &runInfo; - - U32 outputBytes; - CHECK_STATUS(convolution_infer_output_size(inputDesc_gpu, filterDesc, convDesc, &outputDesc, dt, &outputBytes, arch, &extInfo)); - - ConvolutionPolicy policy = CONVOLUTION_TUNNING; - ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL; - CHECK_STATUS(convolution_infer_forward_algorithm(inputDesc_gpu, filterDesc, outputDesc, convDesc, policy, &alg, dt, activationDesc, arch, &extInfo)); - - extInfo.maliInfo.gclmemInputDesc = &inputMemDesc; - extInfo.maliInfo.gclmemOutputDesc = &outputMemDesc; - extInfo.maliInfo.gclmemFilterDesc = &filterMemDesc; - 
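// Aside: the OCL test just above sizes its buffers inline with the standard
// convolution shape formula, oh = (ih + padding*2 - fh) / stride + 1.
// A tiny standalone check of that geometry:

#include <cstdio>

int conv_out(int i, int f, int stride, int pad) { return (i + 2 * pad - f) / stride + 1; }

int main() {
    // The test's defaults: 4x4 input, 3x3 filter, stride 1, padding 1 -> 4x4 output.
    std::printf("%d\n", conv_out(4, 3, 1, 1));  // 4
    return 0;
}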
CHECK_STATUS(convolution_infer_output_size(inputDesc_gpu, filterDesc, convDesc, NULL, dt, NULL, arch, &extInfo)); - - U32 maxBytes = 0; - U32 tmpBytes; - CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputDesc_gpu, filterDesc, outputDesc, convDesc, alg, &tmpBytes, arch, &extInfo)); - maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; - - U32 ftmBytes; - CHECK_STATUS(convolution_transform_filter_bytes(filterDesc, alg, &ftmBytes, arch, &extInfo)); - - GCLMem_t input = gcl_create_gclmem(); - GCLMem_t filter = gcl_create_gclmem(); - GCLMem_t output = gcl_create_gclmem(); - GCLMem_t bias = gcl_create_gclmem(); - GCLMem_t tmpbuf = gcl_create_gclmem(); - GCLMem_t filter_org = gcl_create_gclmem(); - outputMemDesc.use_map = true; - outputMemDesc.flags = CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; - outputMemDesc.byteSize = 2 * outputMemDesc.byteSize; - input->desc = inputMemDesc; - filter->desc = filterMemDesc; - output->desc = outputMemDesc; - if(fh == 1 && fw == 1 && ih == 1 && iw == 1) { - biasNum = oc; - bias->desc.memType = GCL_MEM_BUF; - bias->desc.byteSize = biasNum * bytesOf(dt); - } else { - biasNum = (oc + 3) / 4; - bias->desc.memType = GCL_MEM_IMG_1D; - bias->desc.byteSize = biasNum * 4 * bytesOf(dt); - } - bias->desc.stride[0] = biasNum; - bias->desc.stride[1] = 1; - bias->desc.stride[2] = 1; - bias->desc.offset[0] = 0; - bias->desc.offset[1] = 0; - bias->desc.offset[2] = 0; - bias->desc.num = biasNum; - bias->desc.memFormat = DF_NHWC; - bias->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; - U8* bias_cpu_align = NULL; - if((oc & 3) != 0) { - U8 *bias_cpu_align = ut_input_v((oc + 3) / 4 * 4, dt, UT_INIT_ZERO); - memcpy(bias_cpu_align, bias_cpu, (oc + 3) / 4 * 4 * bytesOf(dt)); - bias->desc.host_ptr = bias_cpu_align; - } else { - bias->desc.host_ptr = bias_cpu; - } - - - filter_org->desc.stride[0] = fw * fh; - filter_org->desc.stride[1] = fc; - filter_org->desc.stride[2] = fn; - filter_org->desc.offset[0] = 0; - filter_org->desc.offset[1] = 0; - filter_org->desc.offset[2] = 0; - filter_org->desc.byteSize = fw * fh * fc * fn * bytesOf(dt); - filter_org->desc.num = fw * fh * fc * fn; - filter_org->desc.memType = GCL_MEM_BUF; - filter_org->desc.memFormat = DF_NCHW; - filter_org->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; - filter_org->desc.host_ptr = filter_cpu; - - gcl_create_memory(handle, input); - gcl_create_memory(handle, output); - gcl_create_memory(handle, filter); - gcl_create_memory(handle, bias); - gcl_create_memory(handle, filter_org); - CHECK_STATUS(gcl_fill_memory_zero(handle, input)); - - CHECK_STATUS(tensor_computing_set_input_infer_tmpBuf_size(input, inputDesc_gpu, &tmpBytes, arch)); - maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; - CHECK_STATUS(tensor_computing_get_output_infer_tmpBuf_size(output, outputDesc, &tmpBytes, arch)); - maxBytes = (tmpBytes > maxBytes) ? 
tmpBytes : maxBytes; - tmpbuf->desc.byteSize = maxBytes; - if(maxBytes) gcl_create_memory(handle, tmpbuf); - - TensorDesc filterDescTran; - CHECK_STATUS(convolution_transform_filter(filterDesc, filter_org, alg, &filterDescTran, filter, tmpbuf, arch, &extInfo)); - - CHECK_STATUS(tensor_computing_set_input(input, inputDesc_gpu, input_cpu, tmpbuf, true, arch, &extInfo)); - CHECK_STATUS(convolution(inputDesc_gpu, input, filterDesc, filter, convDesc, alg, biasDesc, nullptr, - biasDesc, bias, tmpBytes, tmpbuf, outputDesc, output, activationDesc, arch, &extInfo)); -#ifndef _DEBUG - CHECK_STATUS(gcl_run_kernelVec(handle)); -#endif - CHECK_STATUS(tensor_computing_get_output(output, outputDesc, NULL, NULL, true, arch, &extInfo)); - output_gpu = output->desc.map_ptr; -#ifdef _DEBUG - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", - in, ic, ih, iw, - fn, fc, fh, fw, - stride, padding, - on, oc, oh, ow); - sprintf(buffer, "%20s, %80s", "Convolution", params); - double time = handle->t_total * 0.001; - double ops = (1.0 * on * oc * oh * ow) * (2.0 * ic * fh * fw + 1); - ut_log(dt, buffer, ops, time); -#endif - CHECK_STATUS(convolution(inputDesc, input_cpu, filterDesc, filter_cpu, convDesc, CONVOLUTION_ALGORITHM_GEMM, - biasDesc, nullptr, biasDesc, bias_cpu, tmpBytes, NULL, outputDesc, output_cpu, activationDesc, CPU_GENERAL)); - ut_check_a(output_gpu, output_cpu, on * oc * ow * oh, dt); - - CHECK_STATUS(gcl_finish(handle)); - free(input_cpu); - free(filter_cpu); - free(bias_cpu); - free(output_cpu); - if(bias_cpu_align) free(bias_cpu_align); -// free(output_gpu); - CHECK_STATUS(gcl_unmap_memory(handle, output)); - gcl_destroy_gclmem(input); - gcl_destroy_gclmem(filter); - gcl_destroy_gclmem(output); - gcl_destroy_gclmem(bias); - gcl_destroy_gclmem(tmpbuf); - gcl_destroy_gclmem(filter_org); - gcl_destroy_handle(handle); - return 0; -} - - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - convolutionTest(argc, argv, DT_F16); -#endif - return 0; -} diff --git a/tests/test_deconvolution.cpp b/tests/test_deconvolution.cpp deleted file mode 100644 index 989519c1..00000000 --- a/tests/test_deconvolution.cpp +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
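test_deconvolution.cpp, whose deletion starts above, allocates its outputs from deconvolution_infer_output_size. For a transposed convolution the standard shape relation is oh = (ih - 1)*stride + fh - 2*pad; that Bolt's size inference matches it exactly for these cases is inferred from the operator, not printed by the test:

#include <cstdio>

// Transposed-convolution output extent: the conv formula run in reverse.
int deconv_out(int i, int f, int stride, int pad) { return (i - 1) * stride + f - 2 * pad; }

int main() {
    std::printf("%d\n", deconv_out(4, 3, 2, 1));  // 7: a 4x4 map upsampled ~2x
    return 0;
}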
- - -#include -#include "tensor_computing.h" -#include "ut_util.h" - -int deconvolutionTest(int argc, char** argv, DataType dt) { - CHECK_REQUIREMENT(argc == 15); - // in data - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - // weight - U32 fn = atoi(argv[5]); - U32 fc = atoi(argv[6]); - U32 fh = atoi(argv[7]); - U32 fw = atoi(argv[8]); - // stride & padding - U32 stride = atoi(argv[9]); - U32 padding = atoi(argv[10]); - // output - U32 on = atoi(argv[11]); - U32 oc = atoi(argv[12]); - U32 oh = atoi(argv[13]); - U32 ow = atoi(argv[14]); - - CHECK_REQUIREMENT(in == 1 && on == 1); - CHECK_REQUIREMENT(ic % 8 == 0 && oc % 8 == 0); - - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_NULL; - - TensorDesc inputDesc, filterDesc, outputDesc, biasDesc; - ConvolutionDesc convDesc; - inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); - filterDesc = tensor4df(dt, DF_NCHW, fn, fc, fh, fw); - biasDesc = tensor1d(dt, oc); - convDesc.stride_h = stride; - convDesc.stride_w = stride; - convDesc.padding_top = padding; - convDesc.padding_bottom = padding; - convDesc.padding_left = padding; - convDesc.padding_right = padding; - convDesc.dilatedRate_h = 1; - convDesc.dilatedRate_w = 1; - - // setup input, filter, bias - U8 *input = ut_input_v(in*ic*ih*iw, dt, UT_INIT_RANDOM); - U8 *filter = ut_input_v(fn*fc*fh*fw, dt, UT_INIT_RANDOM); - U8 *bias = ut_input_v(oc, dt, UT_INIT_RANDOM); - U8 *input_ref = ut_input_v(in*ic*ih*iw, dt, UT_INIT_ZERO); - U8 *filter_ref = ut_input_v(fn*fc*fh*fw, dt, UT_INIT_ZERO); - memcpy(input_ref, input, bytesOf(dt)*in*ic*ih*iw); - memcpy(filter_ref, filter, bytesOf(dt)*fn*fc*fh*fw); - - // setup output, bias - U32 outputBytes; - CHECK_STATUS(deconvolution_infer_output_size(inputDesc, filterDesc, convDesc, &outputDesc, dt, &outputBytes)); - U32 output_size = outputBytes / bytesOf(dt); - U8 *output = ut_input_v(output_size, dt, UT_INIT_ZERO); - U8 *output_ref = ut_input_v(output_size, dt, UT_INIT_ZERO); - - // setup alg - ConvolutionPolicy policy = CONVOLUTION_FASTEST; - ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL; - CHECK_STATUS(deconvolution_infer_forward_algorithm(inputDesc, filterDesc, outputDesc, convDesc, policy, &alg, dt, UT_ARCH)); - - // setup tmp - U32 tmpBytes; - CHECK_STATUS(deconvolution_infer_forward_tmp_bytes(inputDesc, filterDesc, outputDesc, convDesc, alg, &tmpBytes, UT_ARCH)); - U8 *tmp = ut_input_v(tmpBytes/bytesOf(dt), dt, UT_INIT_ZERO); - - // setup filter trans - U32 ftmBytes; - CHECK_STATUS(deconvolution_transform_filter_bytes(filterDesc, alg, &ftmBytes, UT_ARCH)); - U8 *ftm = ut_input_v(ftmBytes/bytesOf(dt), dt, UT_INIT_ZERO); - - // trans filter - TensorDesc ftmDesc; - CHECK_STATUS(deconvolution_transform_filter(filterDesc, filter, alg, &ftmDesc, ftm, UT_ARCH)); - - if (UT_CHECK) { - CHECK_STATUS(deconvolution(inputDesc, input, - ftmDesc, ftm, - convDesc, alg, - biasDesc, nullptr, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - activationDesc, UT_ARCH)); - - // naive implement - CHECK_STATUS(deconvolution(inputDesc, input_ref, - filterDesc, filter_ref, - convDesc, alg, - biasDesc, nullptr, - biasDesc, bias, - tmpBytes, nullptr, - outputDesc, output_ref, - activationDesc, CPU_GENERAL)); - - // check - ut_check_v(output, output_ref, output_size, dt, 1, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(deconvolution(inputDesc, input, - ftmDesc, ftm, - convDesc, alg, - 
biasDesc, nullptr, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - activationDesc, UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", - in, ic, ih, iw, - fn, fc, fh, fw, - stride, padding, - on, oc, oh, ow); - sprintf(buffer, "%20s, %80s", "Deconvolution", params); - double ops = (1.0 * on * oc * ih * iw) * (2.0 * ic * fh * fw + fh * fw); - ut_log(dt, buffer, ops, time); - - free(input); - free(filter); - free(bias); - free(output); - free(input_ref); - free(filter_ref); - free(output_ref); - free(tmp); - free(ftm); - return 0; -} - - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - deconvolutionTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - deconvolutionTest(argc, argv, DT_F32); -#endif - return 0; -} diff --git a/tests/test_depthwise_convolution.cpp b/tests/test_depthwise_convolution.cpp deleted file mode 100644 index 731be832..00000000 --- a/tests/test_depthwise_convolution.cpp +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
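The depthwise test deleted below supports two filter layouts: DF_NCHW (pure depthwise, fn = 1 and oc = fc = ic) and DF_CHW_NC (depthwise fused with a pointwise 1x1, which is why that path's filter buffer holds fc*fh*fw + fn*fc weights and its bias ic + oc entries). A minimal sketch of the fused path — stride 1, no padding, one batch:

#include <cstdio>
#include <vector>

// Depthwise stage: one fh x fw filter per channel. Pointwise stage: a 1x1
// convolution mixing the c intermediate channels into oc outputs.
void depthwise_pointwise(const std::vector<float> &x, int c, int h, int w,
                         const std::vector<float> &dw, int fh, int fw,
                         const std::vector<float> &pw, int oc,
                         std::vector<float> &y) {
    int oh = h - fh + 1, ow = w - fw + 1;
    std::vector<float> mid(c * oh * ow, 0.f);
    for (int ci = 0; ci < c; ci++)                       // depthwise
        for (int yi = 0; yi < oh; yi++)
            for (int xi = 0; xi < ow; xi++)
                for (int ky = 0; ky < fh; ky++)
                    for (int kx = 0; kx < fw; kx++)
                        mid[(ci * oh + yi) * ow + xi] +=
                            x[(ci * h + yi + ky) * w + xi + kx] * dw[(ci * fh + ky) * fw + kx];
    y.assign(oc * oh * ow, 0.f);
    for (int o = 0; o < oc; o++)                         // pointwise 1x1
        for (int ci = 0; ci < c; ci++)
            for (int p = 0; p < oh * ow; p++)
                y[o * oh * ow + p] += pw[o * c + ci] * mid[ci * oh * ow + p];
}

int main() {
    // 2-channel 3x3 ones, 2x2 depthwise ones, one output channel with 0.5 weights.
    std::vector<float> x(2 * 3 * 3, 1.f), dw(2 * 2 * 2, 1.f), pw(1 * 2, 0.5f), y;
    depthwise_pointwise(x, 2, 3, 3, dw, 2, 2, pw, 1, y);
    for (float v : y) std::printf("%g ", v);  // 4 4 4 4
    std::printf("\n");
    return 0;
}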
- - -#include - -#include "tensor_computing.h" -#include "ut_util.h" - -int depthwiseConvolutionTest(int argc, char *argv[], DataFormat filterDataFormat, DataType dt) -{ - CHECK_REQUIREMENT(argc == 15); - // in data - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - // weight - U32 fn = atoi(argv[5]); - U32 fc = atoi(argv[6]); - U32 fh = atoi(argv[7]); - U32 fw = atoi(argv[8]); - // stride & padding - U32 stride = atoi(argv[9]); - U32 padding = atoi(argv[10]); - // output - U32 on = atoi(argv[11]); - U32 oc = atoi(argv[12]); - U32 oh = atoi(argv[13]); - U32 ow = atoi(argv[14]); - - CHECK_REQUIREMENT(in == 1 && on == 1); - - ActivationDesc dwActivationDesc; - ActivationDesc pwActivationDesc; - dwActivationDesc.mode = ACTIVATION_NULL; - pwActivationDesc.mode = ACTIVATION_NULL; - - U32 filterLength = 0; - U32 biasLength = 0; - if (filterDataFormat == DF_CHW_NC) { - filterLength = fc*fh*fw + fn*fc; - biasLength = ic + oc; - } - if (filterDataFormat == DF_NCHW) { - oc = fc = ic; - fn = 1; - filterLength = fc*fh*fw; - biasLength = ic; - } - if (filterLength == 0) { - exit(1); - } - TensorDesc inputDesc, filterDesc, outputDesc, biasDesc; - ConvolutionDesc convDesc; - inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); - filterDesc = tensor4df(dt, filterDataFormat, oc, ic, fh, fw); - biasDesc = tensor1d(dt, biasLength); - convDesc.stride_h = stride; - convDesc.stride_w = stride; - convDesc.padding_top = padding; - convDesc.padding_bottom = padding; - convDesc.padding_left = padding; - convDesc.padding_right = padding; - convDesc.dilatedRate_h = 1; - convDesc.dilatedRate_w = 1; - - // setup input, filter, bias - U8 *input = ut_input_v(in*ic*ih*iw, dt, UT_INIT_RANDOM); - U8 *filter = ut_input_v(filterLength, dt, UT_INIT_RANDOM); - U8 *bias = ut_input_v(biasLength, dt, UT_INIT_RANDOM); - U8 *inputRef = ut_input_v(in*ic*ih*iw, dt, UT_INIT_ZERO); - U8 *filterRef = ut_input_v(filterLength, dt, UT_INIT_ZERO); - U8 *biasRef = ut_input_v(biasLength, dt, UT_INIT_ZERO); - memcpy(inputRef, input, bytesOf(dt)*in*ic*ih*iw); - memcpy(filterRef, filter, bytesOf(dt)*(filterLength)); - memcpy(biasRef, bias, bytesOf(dt)*(biasLength)); - - // setup output, bias - U32 outputBytes; - CHECK_STATUS(depthwise_convolution_infer_output_size(inputDesc, filterDesc, convDesc, &outputDesc, dt, &outputBytes, UT_ARCH)); - U32 output_size = outputBytes / bytesOf(dt); - U8 *output = ut_input_v(output_size, dt, UT_INIT_ZERO); - U8 *outputRef = ut_input_v(output_size, dt, UT_INIT_ZERO); - - // setup alg - ConvolutionPolicy policy = CONVOLUTION_FASTEST; - DepthwiseConvolutionForwardAlgorithm alg = DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; - CHECK_STATUS(depthwise_convolution_infer_forward_algorithm(inputDesc, filterDesc, outputDesc, convDesc, policy, &alg, dt, dwActivationDesc, pwActivationDesc, UT_ARCH)); - - // setup tmp - U32 tmpBytes; - CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes(inputDesc, filterDesc, outputDesc, convDesc, alg, &tmpBytes, UT_ARCH)); - U8 *tmp = ut_input_v(tmpBytes/bytesOf(dt), dt, UT_INIT_ZERO); - - // setup filter trans - U32 ftmBytes; - CHECK_STATUS(depthwise_convolution_transform_filter_bytes(filterDesc, alg, &ftmBytes, UT_ARCH)); - U8 *ftm = ut_input_v(ftmBytes/bytesOf(dt), dt, UT_INIT_ZERO); - // trans filter - TensorDesc ftmDesc; - CHECK_STATUS(depthwise_convolution_transform_filter(filterDesc, filter, alg, &ftmDesc, ftm, UT_ARCH)); - - if (UT_CHECK) { - CHECK_STATUS(depthwise_convolution(inputDesc, input, - ftmDesc, ftm, - convDesc, alg, - 
biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - dwActivationDesc, pwActivationDesc, - UT_ARCH)); - - // naive implement - CHECK_STATUS(depthwise_convolution(inputDesc, inputRef, - filterDesc, filterRef, - convDesc, alg, - biasDesc, biasRef, - tmpBytes, tmp, - outputDesc, outputRef, - dwActivationDesc, pwActivationDesc, - CPU_GENERAL)); - - // check - ut_check_v(output, outputRef, output_size, dt, 0.1, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for(int iter = 0; iter < UT_LOOPS; iter++){ - CHECK_STATUS(depthwise_convolution(inputDesc, input, - ftmDesc, ftm, - convDesc, alg, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - dwActivationDesc, pwActivationDesc, - UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", - in, ic, ih, iw, - fn, fc, fh, fw, - stride, padding, - on, oc, oh, ow); - double ops = 0; - if (filterDataFormat == DF_CHW_NC) { - sprintf(buffer, "%20s, %80s", "DepthwisePointwise", params); - ops = 2.0 * in * ic * ih * iw * fh * fw + in * ic * oh * ow + - 2.0 * on * oc * oh * ow * ic + on * oc * oh * ow; - } - if (filterDataFormat == DF_NCHW) { - sprintf(buffer, "%20s, %80s", "DepthwiseConvolution", params); - ops = 2.0 * in * ic * ih * iw * fh * fw + in * ic * oh * ow; - } - ut_log(dt, buffer, ops, time); - - free(input); - free(filter); - free(bias); - free(output); - free(inputRef); - free(filterRef); - free(biasRef); - free(outputRef); - free(tmp); - free(ftm); - - return 0; -} - -int main(int argc, char *argv[]) -{ -#ifdef _USE_FP16 - depthwiseConvolutionTest(argc, argv, DF_CHW_NC, DT_F16); - depthwiseConvolutionTest(argc, argv, DF_NCHW, DT_F16); -#endif -#ifdef _USE_FP32 - depthwiseConvolutionTest(argc, argv, DF_CHW_NC, DT_F32); - depthwiseConvolutionTest(argc, argv, DF_NCHW, DT_F32); -#endif - return 0; -} diff --git a/tests/test_depthwise_convolution_int8.cpp b/tests/test_depthwise_convolution_int8.cpp deleted file mode 100644 index d0c49778..00000000 --- a/tests/test_depthwise_convolution_int8.cpp +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
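A note on the deleted depthwise test above: its GFLOPS figures come from closed-form operation counts, not instrumentation. The following standalone sketch restates those counts with the exact formulas the test used (the helper names are hypothetical; the `oh = (ih + 2*padding - fh)/stride + 1` output-size rule and the two ops expressions are copied from the test body, including its use of `ih*iw` rather than `oh*ow` in the depthwise term).

#include <cstdio>

// Output spatial size for symmetric padding, as in the deleted test:
// oh = (ih + 2*padding - fh) / stride + 1
static unsigned convOut(unsigned i, unsigned f, unsigned stride, unsigned pad) {
    return (i + 2 * pad - f) / stride + 1;
}

// Operation count mirroring the test's DF_CHW_NC (depthwise + pointwise) case:
//   depthwise: 2*in*ic*ih*iw*fh*fw MACs + in*ic*oh*ow bias adds
//   pointwise: 2*on*oc*oh*ow*ic MACs + on*oc*oh*ow bias adds
static double dwPwOps(unsigned in, unsigned ic, unsigned ih, unsigned iw,
                      unsigned fh, unsigned fw, unsigned oc,
                      unsigned oh, unsigned ow) {
    double dw = 2.0 * in * ic * ih * iw * fh * fw + 1.0 * in * ic * oh * ow;
    double pw = 2.0 * in * oc * oh * ow * ic + 1.0 * in * oc * oh * ow;
    return dw + pw;
}

int main() {
    unsigned ih = 56, iw = 56, ic = 32, oc = 64, fh = 3, fw = 3;
    unsigned oh = convOut(ih, fh, 1, 1), ow = convOut(iw, fw, 1, 1);
    printf("out: %ux%u, ops: %.0f\n", oh, ow,
           dwPwOps(1, ic, ih, iw, fh, fw, oc, oh, ow));
    return 0;
}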
- - -#include - -#include "tensor_computing.h" -#include "ut_util.h" - -int main(int argc, char *argv[]){ - CHECK_REQUIREMENT(argc == 15); - // in data - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - // weight - U32 fn = atoi(argv[5]); - U32 fc = atoi(argv[6]); - U32 fh = atoi(argv[7]); - U32 fw = atoi(argv[8]); - // stride & padding - U32 stride = atoi(argv[9]); - U32 padding = atoi(argv[10]); - // output - U32 on = atoi(argv[11]); - U32 oc = atoi(argv[12]); - U32 oh = atoi(argv[13]); - U32 ow = atoi(argv[14]); - - CHECK_REQUIREMENT(in == 1 && on == 1); - - DataType dt = DT_I8; - DataType odt = DT_I32; - ActivationDesc dwActivationDesc; - ActivationDesc pwActivationDesc; - dwActivationDesc.mode = ACTIVATION_RELU6; - pwActivationDesc.mode = ACTIVATION_RELU6; - - TensorDesc inputDesc, filterDesc, outputDesc, biasDesc; - ConvolutionDesc convDesc; - inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); - filterDesc = tensor4df(dt, DF_CHW_NC, oc, ic, fh, fw); - biasDesc = tensor1d(odt, ic+oc); - convDesc.stride_h = stride; - convDesc.stride_w = stride; - convDesc.padding_top = padding; - convDesc.padding_bottom = padding; - convDesc.padding_left = padding; - convDesc.padding_right = padding; - convDesc.dilatedRate_h = 1; - convDesc.dilatedRate_w = 1; - - U32 filter_size = fc*fh*fw + fn*fc; - - // setup input, filter, bias - INT8 *input = (INT8*)ut_input_v(in*ic*ih*iw, DT_I8, UT_INIT_RANDOM); - INT8 *filter = (INT8*)ut_input_v(filter_size, DT_I8, UT_INIT_RANDOM); - I32 *bias = (I32*)ut_input_v(ic+oc, DT_I32, UT_INIT_RANDOM); - INT8 *input_ref = (INT8*)ut_input_v(in*ic*ih*iw, DT_I8, UT_INIT_ZERO); - INT8 *filter_ref = (INT8*)ut_input_v(filter_size, DT_I8, UT_INIT_ZERO); - I32 *bias_ref = (I32*)ut_input_v(ic+oc, DT_I32, UT_INIT_ZERO); - memcpy(input_ref, input, bytesOf(dt)*in*ic*ih*iw); - memcpy(filter_ref, filter, bytesOf(dt)*filter_size); - memcpy(bias_ref, bias, bytesOf(odt)*(ic+oc)); - - // setup output, bias - U32 outputBytes; - CHECK_STATUS(depthwise_convolution_infer_output_size(inputDesc, filterDesc, convDesc, &outputDesc, odt, &outputBytes, UT_ARCH)); - U32 output_size = outputBytes / bytesOf(odt); - I32 *output = (I32*)ut_input_v(output_size, DT_I32, UT_INIT_ZERO); - I32 *output_ref = (I32*)ut_input_v(output_size, DT_I32, UT_INIT_ZERO); - - // setup alg - ConvolutionPolicy policy = CONVOLUTION_FASTEST; - DepthwiseConvolutionForwardAlgorithm alg = DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; - CHECK_STATUS(depthwise_convolution_infer_forward_algorithm(inputDesc, filterDesc, outputDesc, convDesc, policy, &alg, dt, dwActivationDesc, pwActivationDesc, UT_ARCH)); - - // setup tmp - U32 tmpBytes; - CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes(inputDesc, filterDesc, outputDesc, convDesc, alg, &tmpBytes, UT_ARCH)); - INT8 *tmp = (INT8*)ut_input_v(tmpBytes/bytesOf(dt), DT_I8, UT_INIT_ZERO); - - // setup filter trans - U32 ftmBytes; - CHECK_STATUS(depthwise_convolution_transform_filter_bytes(filterDesc, alg, &ftmBytes, UT_ARCH)); - INT8 *ftm = (INT8*)ut_input_v(ftmBytes/bytesOf(dt), DT_I8, UT_INIT_ZERO); - // trans filter - TensorDesc ftmDesc; - CHECK_STATUS(depthwise_convolution_transform_filter(filterDesc, filter, alg, &ftmDesc, ftm, UT_ARCH)); - - if(UT_CHECK){ - CHECK_STATUS(depthwise_convolution(inputDesc, input, - ftmDesc, ftm, - convDesc, alg, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - dwActivationDesc, pwActivationDesc, - UT_ARCH)); - - // naive implement - CHECK_STATUS(depthwise_convolution(inputDesc, 
input_ref, - filterDesc, filter_ref, - convDesc, alg, - biasDesc, bias_ref, - tmpBytes, tmp, - outputDesc, output_ref, - dwActivationDesc, pwActivationDesc, - CPU_GENERAL)); - - // check - ut_check_v(output, output_ref, output_size, DT_I32, 1, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for(int iter=0; iter -#include "tensor_computing.h" -#include "ut_util.h" -#include "gcl.h" -#include "libkernelbin.h" - -int depthwiseConvolutionTest(int argc, char* argv[], DataFormat filterDataFormat, DataType dt) -{ - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 stride, padding; - U32 on, oc, oh, ow; - U32 biasNum; - Arch arch = MALI; - - in = 1; - ic = 8; - ih = 4; - iw = 4; - fn = 8; - fh = 3; - fw = 3; - stride = 1; - padding = 1; - - if(argc == 9) { - ic = atoi(argv[1]); - ih = atoi(argv[2]); - iw = atoi(argv[3]); - fn = atoi(argv[4]); - fh = atoi(argv[5]); - fw = atoi(argv[6]); - stride = atoi(argv[7]); - padding = atoi(argv[8]); - } - - if(filterDataFormat == DF_NCHW) { - if(fn != ic) { - std::cout << "ignored depthwise convolution for para fn != ic" << std::endl; - return 0; - } - fc = 1; - } else { - fc = ic; - } - - - on = 1; - oc = fn; - oh = (ih + padding * 2 - fh) / stride + 1; - ow = (iw + padding * 2 - fw) / stride + 1; - ActivationDesc dwActivationDesc; - ActivationDesc pwActivationDesc; - dwActivationDesc.mode = ACTIVATION_NULL; - pwActivationDesc.mode = ACTIVATION_NULL; - - TensorDesc inputDesc, filterDesc, outputDesc, biasDesc; - ConvolutionDesc convDesc; - convDesc.stride_h = stride; - convDesc.stride_w = stride; - convDesc.padding_top = padding; - convDesc.padding_bottom = padding; - convDesc.padding_left = padding; - convDesc.padding_right = padding; - convDesc.dilatedRate_h = 1; - convDesc.dilatedRate_w = 1; - - U32 filterLen = fn * fc * fh * fw; - U32 biasLen = oc; - if(filterDataFormat == DF_CHW_NC) { - filterLen = fc * fh * fw + fn * fc; - biasLen = ic + oc; - } - - inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); - filterDesc = tensor4df(dt, filterDataFormat, fn, fc, fh, fw); - biasDesc = tensor1d(dt, biasLen); - U8 *input_cpu = ut_input_v(in*ic*ih*iw, dt, UT_INIT_RANDOM); - U8 *filter_cpu = ut_input_v(filterLen, dt, UT_INIT_RANDOM); - U8 *bias_cpu = ut_input_v(biasLen, dt, UT_INIT_RANDOM); - U8 *output_cpu = ut_input_v(on*oc*oh*ow, dt, UT_INIT_ZERO); - //U8 *output_gpu = ut_input_v(on*oc*oh*ow, dt, UT_INIT_ZERO); - U8 *output_gpu = NULL; - - - GCLHandle_t handle; - CHECK_STATUS(gcl_create_handle(&handle)); - CHECK_STATUS(gcl_regist_binMap(handle)); - - ExtInfo extInfo; - U32 str[3] = {0, 0, 0}; - U32 off[3] = {0, 0, 0}; - GCLMemDesc inputMemDesc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); - GCLMemDesc outputMemDesc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); - GCLMemDesc filterMemDesc[2]; - filterMemDesc[0] = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); - filterMemDesc[1] = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); - ForwardRunInfoMali runInfo; - runInfo.algorithm = (I32)(DEPTHWISE_CONVOLUTION_ALGORITHM_NULL); - extInfo.maliInfo.handle = handle; - extInfo.maliInfo.gclmemInputDesc = NULL; - extInfo.maliInfo.gclmemOutputDesc = NULL; - extInfo.maliInfo.gclmemFilterDesc = NULL; - extInfo.maliInfo.forwardRunInfo = &runInfo; - - U32 outputBytes; - CHECK_STATUS(depthwise_convolution_infer_output_size(inputDesc, filterDesc, convDesc, &outputDesc, dt, &outputBytes, arch, &extInfo)); - ConvolutionPolicy policy = CONVOLUTION_TUNNING; - DepthwiseConvolutionForwardAlgorithm alg = DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; - 
CHECK_STATUS(depthwise_convolution_infer_forward_algorithm(inputDesc, filterDesc, outputDesc, convDesc, policy, &alg, dt, dwActivationDesc, pwActivationDesc, arch, &extInfo)); - - extInfo.maliInfo.gclmemInputDesc = &inputMemDesc; - extInfo.maliInfo.gclmemOutputDesc = &outputMemDesc; - extInfo.maliInfo.gclmemFilterDesc = filterMemDesc; - CHECK_STATUS(depthwise_convolution_infer_output_size(inputDesc, filterDesc, convDesc, NULL, dt, NULL, arch, &extInfo)); - - U32 maxBytes = 0; - U32 tmpBytes; - CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes(inputDesc, filterDesc, outputDesc, convDesc, alg, &tmpBytes, arch, &extInfo)); - maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; - - U32 ftmBytes; - CHECK_STATUS(depthwise_convolution_transform_filter_bytes(filterDesc, alg, &ftmBytes, arch, &extInfo)); - - GCLMem_t input = gcl_create_gclmem(); - GCLMem_t filter = gcl_create_gclmem(); - GCLMem_t filter_dp = gcl_create_gclmem(); - GCLMem_t output = gcl_create_gclmem(); - GCLMem_t bias = gcl_create_gclmem(); - GCLMem_t bias_dp = gcl_create_gclmem(); - GCLMem_t bias_buf = gcl_create_gclmem(); - GCLMem_t tmpbuf = gcl_create_gclmem(); - GCLMem_t filter_org = gcl_create_gclmem(); - GCLMem_t filter_org_dp = gcl_create_gclmem(); - - outputMemDesc.use_map = true; - outputMemDesc.flags = CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; - outputMemDesc.byteSize = 2 * outputMemDesc.byteSize; - input->desc = inputMemDesc; - filter->desc = filterMemDesc[0]; - filter_dp->desc = filterMemDesc[1]; - output->desc = outputMemDesc; - - biasNum = (oc + 3) / 4 ; - if(filterDataFormat == DF_CHW_NC) { - bias_dp->desc.memType = GCL_MEM_IMG_1D; - bias_dp->desc.byteSize = biasNum * 4 * bytesOf(dt); - bias_dp->desc.stride[0] = biasNum; - bias_dp->desc.stride[1] = 1; - bias_dp->desc.stride[2] = 1; - bias_dp->desc.offset[0] = 0; - bias_dp->desc.offset[1] = 0; - bias_dp->desc.offset[2] = 0; - bias_dp->desc.num = biasNum; - bias_dp->desc.memFormat = DF_NHWC; - bias_dp->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; - bias_dp->desc.host_ptr = bias_cpu + ic * bytesOf(dt); - gcl_create_memory(handle, bias_dp); - } - - biasNum = (oc + 7) / 8 * 8; - if(filterDataFormat == DF_CHW_NC) { - bias_buf->desc.memType = GCL_MEM_BUF; - bias_buf->desc.byteSize = biasNum * bytesOf(dt); - bias_buf->desc.stride[0] = biasNum; - bias_buf->desc.stride[1] = 1; - bias_buf->desc.stride[2] = 1; - bias_buf->desc.offset[0] = 0; - bias_buf->desc.offset[1] = 0; - bias_buf->desc.offset[2] = 0; - bias_buf->desc.num = biasNum; - bias_buf->desc.memFormat = DF_NHWC; - bias_buf->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; - bias_buf->desc.host_ptr = bias_cpu + ic * bytesOf(dt); - biasNum = (ic + 3) / 4; - gcl_create_memory(handle, bias_buf); - } - - bias->desc.memType = GCL_MEM_IMG_1D; - bias->desc.byteSize = biasNum * 4 * bytesOf(dt); - bias->desc.stride[0] = biasNum; - bias->desc.stride[1] = 1; - bias->desc.stride[2] = 1; - bias->desc.offset[0] = 0; - bias->desc.offset[1] = 0; - bias->desc.offset[2] = 0; - bias->desc.num = biasNum; - bias->desc.memFormat = DF_NHWC; - bias->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; - bias->desc.host_ptr = bias_cpu; - - - if(filterDataFormat == DF_CHW_NC) { - filter_org->desc.stride[0] = fw * fh; - filter_org->desc.stride[1] = 1; - filter_org->desc.stride[2] = fc; - filter_org->desc.offset[0] = 0; - filter_org->desc.offset[1] = 0; - filter_org->desc.offset[2] = 0; - filter_org->desc.byteSize = fw * fh * fc * bytesOf(dt); - filter_org->desc.num = fw * fh * fc; - filter_org->desc.memType = 
GCL_MEM_BUF; - filter_org->desc.memFormat = DF_NCHW; - filter_org->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; - filter_org->desc.host_ptr = filter_cpu; - - filter_org_dp->desc.stride[0] = 1; - filter_org_dp->desc.stride[1] = fc; - filter_org_dp->desc.stride[2] = fn; - filter_org_dp->desc.offset[0] = 0; - filter_org_dp->desc.offset[1] = 0; - filter_org_dp->desc.offset[2] = 0; - filter_org_dp->desc.byteSize = fn * fc * bytesOf(dt); - filter_org_dp->desc.num = fn * fc; - filter_org_dp->desc.memType = GCL_MEM_BUF; - filter_org_dp->desc.memFormat = DF_NCHW; - filter_org_dp->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; - filter_org_dp->desc.host_ptr = filter_cpu + fw * fh * fc * bytesOf(dt); - gcl_create_memory(handle, filter_org_dp); - gcl_create_memory(handle, filter_dp); - } else { - filter_org->desc.stride[0] = fw * fh; - filter_org->desc.stride[1] = fc; - filter_org->desc.stride[2] = fn; - filter_org->desc.offset[0] = 0; - filter_org->desc.offset[1] = 0; - filter_org->desc.offset[2] = 0; - filter_org->desc.byteSize = fw * fh * fc * fn * bytesOf(dt); - filter_org->desc.num = fw * fh * fc * fn; - filter_org->desc.memType = GCL_MEM_BUF; - filter_org->desc.memFormat = DF_NCHW; - filter_org->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; - filter_org->desc.host_ptr = filter_cpu; - } - - gcl_create_memory(handle, input); - gcl_create_memory(handle, output); - gcl_create_memory(handle, filter); - gcl_create_memory(handle, bias); - gcl_create_memory(handle, filter_org); - CHECK_STATUS(gcl_fill_memory_zero(handle, input)); - - CHECK_STATUS(tensor_computing_set_input_infer_tmpBuf_size(input, inputDesc, &tmpBytes, arch)); - maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; - CHECK_STATUS(tensor_computing_get_output_infer_tmpBuf_size(output, outputDesc, &tmpBytes, arch)); - maxBytes = (tmpBytes > maxBytes) ? 
tmpBytes : maxBytes; - tmpbuf->desc.byteSize = maxBytes; - if(maxBytes) gcl_create_memory(handle, tmpbuf); - - TensorDesc filterDescTran; - GCLMem filter_org_array[2]; - filter_org_array[0] = *filter_org; - filter_org_array[1] = *filter_org_dp; - GCLMem filter_array[2]; - filter_array[0] = *filter; - filter_array[1] = *filter_dp; - CHECK_STATUS(depthwise_convolution_transform_filter(filterDesc, filter_org_array, alg, &filterDescTran, filter_array, arch, &extInfo)); - GCLMem bias_array[2]; - bias_array[0] = *bias; - bias_array[1] = *bias_dp; - if(runInfo.algorithm == (I32)(DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM)) bias_array[1] = *bias_buf; - - - CHECK_STATUS(tensor_computing_set_input(input, inputDesc, input_cpu, tmpbuf, true, arch, &extInfo)); - CHECK_STATUS(depthwise_convolution(inputDesc, input, filterDesc, filter_array, convDesc, alg, - biasDesc, bias_array, tmpBytes, tmpbuf, outputDesc, output, dwActivationDesc, pwActivationDesc, arch, &extInfo)); -#ifndef _DEBUG - CHECK_STATUS(gcl_run_kernelVec(handle)); -#endif - CHECK_STATUS(tensor_computing_get_output(output, outputDesc, NULL, NULL, true, arch, &extInfo)); - output_gpu = output->desc.map_ptr; -#ifdef _DEBUG - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", - in, ic, ih, iw, - fn, fc, fh, fw, - stride, padding, - on, oc, oh, ow); - double time = handle->t_total * 0.001; - double ops; - if (filterDataFormat == DF_CHW_NC) { - sprintf(buffer, "%20s, %80s", "DepthwisePointwise", params); - ops = 2.0 * in * ic * ih * iw * fh * fw + in * ic * oh * ow + - 2.0 * on * oc * oh * ow * ic + on * oc * oh * ow; - } - if (filterDataFormat == DF_NCHW) { - sprintf(buffer, "%20s, %80s", "DepthwiseConvolution", params); - ops = 2.0 * in * ic * ih * iw * fh * fw + in * ic * oh * ow; - } - ut_log(dt, buffer, ops, time); -#endif - CHECK_STATUS(depthwise_convolution(inputDesc, input_cpu, filterDesc, filter_cpu, convDesc, DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, - biasDesc, bias_cpu, tmpBytes, NULL, outputDesc, output_cpu, dwActivationDesc, pwActivationDesc, CPU_GENERAL)); - ut_check_a(output_gpu, output_cpu, on * oc * ow * oh, dt); - - CHECK_STATUS(gcl_finish(handle)); - free(input_cpu ); - free(filter_cpu); - free(bias_cpu ); - free(output_cpu); -// free(output_gpu); - CHECK_STATUS(gcl_unmap_memory(handle, output)); - gcl_destroy_gclmem(input); - gcl_destroy_gclmem(filter); - gcl_destroy_gclmem(output); - gcl_destroy_gclmem(bias); - gcl_destroy_gclmem(tmpbuf); - gcl_destroy_gclmem(filter_org); - if(filterDataFormat == DF_CHW_NC) { - gcl_destroy_gclmem(filter_org_dp); - gcl_destroy_gclmem(filter_dp); - gcl_destroy_gclmem(bias_dp); - gcl_destroy_gclmem(bias_buf); - } - gcl_destroy_handle(handle); - return 0; -} - - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - depthwiseConvolutionTest(argc, argv, DF_CHW_NC, DT_F16); - depthwiseConvolutionTest(argc, argv, DF_NCHW, DT_F16); -#endif - return 0; -} diff --git a/tests/test_detectionoutput.cpp b/tests/test_detectionoutput.cpp deleted file mode 100644 index ce730425..00000000 --- a/tests/test_detectionoutput.cpp +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
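The deleted MALI test reads its result back through a mapped output buffer (`use_map = true` with `CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR`) instead of a separate copy-out. Below is a minimal sketch of that zero-copy pattern written against the standard OpenCL C API rather than Bolt's `gcl_*` wrappers, so the wrapper-specific details are an assumption here; only the flag combination and the map/unmap sequence are taken from the test.

// Map/unmap readback pattern, plain OpenCL (not Bolt's GCL layer).
#include <CL/cl.h>
#include <cstdio>

int main() {
    cl_platform_id plat; cl_device_id dev; cl_int err;
    clGetPlatformIDs(1, &plat, NULL);
    clGetDeviceIDs(plat, CL_DEVICE_TYPE_DEFAULT, 1, &dev, NULL);
    cl_context ctx = clCreateContext(NULL, 1, &dev, NULL, NULL, &err);
    cl_command_queue q = clCreateCommandQueueWithProperties(ctx, dev, NULL, &err);

    size_t bytes = 1024;
    // ALLOC_HOST_PTR requests host-visible memory, so on mobile GPUs with
    // unified memory the later map is a pointer exchange, not a copy.
    cl_mem buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                                bytes, NULL, &err);

    // ... kernels writing to buf would be enqueued here ...

    unsigned char *p = (unsigned char *)clEnqueueMapBuffer(
        q, buf, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &err);
    p[0] = 42;                       // host touches device memory directly
    printf("first byte: %d\n", p[0]);
    clEnqueueUnmapMemObject(q, buf, p, 0, NULL, NULL);
    clFinish(q);
    clReleaseMemObject(buf); clReleaseCommandQueue(q); clReleaseContext(ctx);
    return 0;
}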
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#include "ut_util.h" - -int detectionoutputTest(int argc, char **argv, DataType dt){ - CHECK_REQUIREMENT(argc == 11); - // in0 loc - U32 ih0 = atoi(argv[1]); - U32 iw0 = atoi(argv[2]); - // in1 conf - U32 ih1 = atoi(argv[3]); - U32 iw1 = atoi(argv[4]); - // in2 priorbox - U32 in2 = atoi(argv[5]); - U32 ic2 = atoi(argv[6]); - U32 ilens2 = atoi(argv[7]); - // output - U32 oh = atoi(argv[8]); - U32 ow = atoi(argv[9]); - U32 num_class = atoi(argv[10]); - - DetectionOutputDesc detectionoutput_desc; - detectionoutput_desc.num_class = num_class; - detectionoutput_desc.nms_top_k = 400; - detectionoutput_desc.nms_threshold = 0.449999988079; - detectionoutput_desc.keep_top_k = 200; - detectionoutput_desc.confidence_threshold = 0.00999999977648; - - std::vector input_descs; - TensorDesc output_desc; - TensorDesc input_desc_loc = tensor2d(dt, ih0, iw0); - TensorDesc input_desc_conf = tensor2d(dt, ih1, iw1); - TensorDesc input_desc_priorbox = tensor3d(dt, in2, ic2, ilens2); - input_descs.push_back(input_desc_loc); - input_descs.push_back(input_desc_conf); - input_descs.push_back(input_desc_priorbox); - CHECK_STATUS(detectionoutput_infer_output_size(input_descs, detectionoutput_desc, &output_desc, UT_ARCH)); - U32 input_len_loc = tensorNumElements(input_descs[0]); - U32 input_len_conf = tensorNumElements(input_descs[1]); - U32 input_len_priorbox = tensorNumElements(input_descs[2]); - U32 output_len = tensorNumElements(output_desc); - CHECK_REQUIREMENT(input_len_loc == ih0 * iw0 && input_len_conf == ih1 * iw1 && input_len_priorbox == in2 * ic2 * ilens2 && output_len == oh * ow); - - std::vector input(3); - U8* input_loc = ut_input_v(input_len_loc, dt, UT_INIT_RANDOM); - U8* input_conf = ut_input_v(input_len_conf, dt, UT_INIT_RANDOM); - U8* input_priorbox = ut_input_v(input_len_priorbox, dt, UT_INIT_RANDOM); - input[0] = (void*)input_loc; - input[1] = (void*)input_conf; - input[2] = (void*)input_priorbox; - - U8* output = ut_input_v(output_len, dt, UT_INIT_ZERO); - U8* output_ref = ut_input_v(output_len, dt, UT_INIT_ZERO); - - if (UT_CHECK) { - - CHECK_STATUS(detectionoutput(input_descs, input, detectionoutput_desc, output_desc, output, UT_ARCH)); - CHECK_STATUS(detectionoutput(input_descs, input, detectionoutput_desc, output_desc, output_ref, CPU_GENERAL)); - // check - ut_check_v(output, output_ref, output_len, dt, 0.05, __FILE__, __LINE__); - } - - U32 num_detected_max = detectionoutput_desc.keep_top_k; -#ifdef _USE_FP16 - if 
(dt == DT_F16) { - F16* output_f16 = reinterpret_cast(output); - int idx = 0; - for (U32 i = 0 ; i < 1 + num_detected_max ; i++){ - if( i >= 1 && output_f16[idx] == 0) { - break; - } - std::cout << " 1 : " << output_f16[idx] << " 2 : " << output_f16[idx+1] << " 3 : " << output_f16[idx+2] << " 4 : " << output_f16[idx+3] << " 5 : " << output_f16[idx+4] << " 6 : " << output_f16[idx+5] << std::endl; - idx = idx + 6; - } - } -#endif - if (dt == DT_F32) { - F32* output_f32 = reinterpret_cast(output_ref); - int idx = 0; - for (U32 i = 0 ; i < 1 + num_detected_max ; i++){ - if( i >= 1 && output_f32[idx] == 0) { - break; - } - std::cout << " 1 : " << output_f32[idx] << " 2 : " << output_f32[idx+1] << " 3 : " << output_f32[idx+2] << " 4 : " << output_f32[idx+3] << " 5 : " << output_f32[idx+4] << " 6 : " << output_f32[idx+5] << std::endl; - idx = idx + 6; - } - } - - free(input_loc); - free(input_conf); - free(input_priorbox); - free(output); - free(output_ref); - return 0; -} - -int main(int argc, char** argv){ -#ifdef _USE_FP16 - std::cout << "----- Testing FP16 Detectionoutput -----" < -#include "tensor_computing.h" -#include "ut_util.h" - -int dilatedConvolutionTest(int argc, char** argv, DataType dt) { - CHECK_REQUIREMENT(argc == 16); - // in data - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - // weight - U32 fn = atoi(argv[5]); - U32 fc = atoi(argv[6]); - U32 fh = atoi(argv[7]); - U32 fw = atoi(argv[8]); - // stride & padding - U32 stride = atoi(argv[9]); - U32 padding = atoi(argv[10]); - - // dilation rate - U32 rate = atoi(argv[11]); - - // output - U32 on = atoi(argv[12]); - U32 oc = atoi(argv[13]); - U32 oh = atoi(argv[14]); - U32 ow = atoi(argv[15]); - - CHECK_REQUIREMENT(in == 1 && on == 1); - - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_RELU; - activationDesc.value[0] = 0; - - TensorDesc inputDesc, filterDesc, outputDesc, biasDesc; - ConvolutionDesc convDesc; - inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); - filterDesc = tensor4df(dt, DF_NCHW, oc, ic, fh, fw); - biasDesc = tensor1d(dt, oc); - convDesc.stride_h = stride; - convDesc.stride_w = stride; - convDesc.padding_top = padding; - convDesc.padding_bottom = padding; - convDesc.padding_left = padding; - convDesc.padding_right = padding; - convDesc.dilatedRate_h = rate; - convDesc.dilatedRate_w = rate; - - // setup input, filter, bias - U8 *input = ut_input_v(in*ic*ih*iw, dt, UT_INIT_RANDOM); - U8 *filter = ut_input_v(fn*fc*fh*fw, dt, UT_INIT_RANDOM); - U8 *bias = ut_input_v(oc, dt, UT_INIT_RANDOM); - U8 *input_ref = ut_input_v(in*ic*ih*iw, dt, UT_INIT_ZERO); - U8 *filter_ref = ut_input_v(fn*fc*fh*fw, dt, UT_INIT_ZERO); - memcpy(input_ref, input, bytesOf(dt)*in*ic*ih*iw); - memcpy(filter_ref, filter, bytesOf(dt)*fn*fc*fh*fw); - - // setup output, bias - U32 outputBytes; - CHECK_STATUS(convolution_infer_output_size(inputDesc, filterDesc, convDesc, &outputDesc, dt, &outputBytes, UT_ARCH)); - U32 output_size = outputBytes / bytesOf(dt); - U8 *output = ut_input_v(output_size, dt, UT_INIT_ZERO); - U8 *output_ref = ut_input_v(output_size, dt, UT_INIT_ZERO); - - // setup alg - ConvolutionPolicy policy = CONVOLUTION_FASTEST; - ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL; - CHECK_STATUS(convolution_infer_forward_algorithm(inputDesc, filterDesc, outputDesc, convDesc, policy, &alg, dt, activationDesc, UT_ARCH)); - - // setup tmp - U32 tmpBytes; - CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputDesc, filterDesc, outputDesc, convDesc, 
alg, &tmpBytes, UT_ARCH)); - U8 *tmp = ut_input_v(tmpBytes/bytesOf(dt), dt, UT_INIT_ZERO); - - // setup filter trans - U32 ftmBytes; - CHECK_STATUS(convolution_transform_filter_bytes(filterDesc, alg, &ftmBytes, UT_ARCH)); - U8 *ftm = ut_input_v(ftmBytes/bytesOf(dt), dt, UT_INIT_ZERO); - // trans filter - TensorDesc ftmDesc; - CHECK_STATUS(convolution_transform_filter(filterDesc, filter, alg, &ftmDesc, ftm, tmp, UT_ARCH)); - - if(UT_CHECK){ - CHECK_STATUS(convolution(inputDesc, input, - ftmDesc, ftm, - convDesc, alg, - biasDesc, nullptr, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - activationDesc, UT_ARCH)); - - // naive implement - CHECK_STATUS(convolution(inputDesc, input_ref, - filterDesc, filter_ref, - convDesc, alg, - biasDesc, nullptr, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output_ref, - activationDesc, CPU_GENERAL)); - - // check - ut_check_v(output, output_ref, output_size, dt, 1, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for(int iter=0; iter -#include - -#include "tensor_computing.h" -#include "ut_util.h" - -int eltwiseTest(int argc, char** argv, DataType dt) { - CHECK_REQUIREMENT(argc == 6); - U32 num = atoi(argv[1]); - U32 in = atoi(argv[2]); - U32 ic = atoi(argv[3]); - U32 ih = atoi(argv[4]); - U32 iw = atoi(argv[5]); - - U32 len = in * ic * ih * iw; - EltwiseMode eltwiseMode = ELTWISE_MAX; - - std::vector inputDesc(num); - std::vector input(num); - for (U32 i = 0; i < num; i++) { - inputDesc[i] = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); - input[i] = (void*)ut_input_v(len, dt, UT_INIT_RANDOM); - } - TensorDesc outputDesc; - CHECK_STATUS(eltwise_infer_output_size(inputDesc, &outputDesc, UT_ARCH)); - CHECK_REQUIREMENT(len == tensorNumElements(outputDesc)); - U8 *output = ut_input_v(len, dt, UT_INIT_ZERO); - U8 *output_ref = ut_input_v(len, dt, UT_INIT_ZERO); - - if (UT_CHECK) { - CHECK_STATUS(eltwise(inputDesc, input, outputDesc, output, eltwiseMode, UT_ARCH)); - - CHECK_STATUS(eltwise(inputDesc, input, outputDesc, output_ref, eltwiseMode, CPU_GENERAL)); - - // check - ut_check_v(output, output_ref, len, dt, 1, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(eltwise(inputDesc, input, outputDesc, output, eltwiseMode, UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "%u (%u %u %u %u)=(%u %u %u %u)", - num, in, ic, ih, iw, - in, ic, ih, iw); - sprintf(buffer, "%20s, %80s", "Eltwise", params); - double ops = 1.0 * num * in * ic * ih * iw; - ut_log(dt, buffer, ops, time); - - for(U32 i=0; i -#include "tensor_computing.h" -#include "ut_util.h" -#include "gcl.h" -#include "libkernelbin.h" - -#ifdef _USE_FP16 -int fullyConnectedTest(int argc, char* argv[], DataType dt) -{ - U32 in, ic, ih, iw; - U32 fn, fc, fh, fw; - U32 on, oc, oh, ow; - U32 biasNum; - Arch arch = MALI; - - in = 1; - ic = 4; - ih = 4; - iw = 4; - fn = 4; - - if(argc == 5) { - ic = atoi(argv[1]); - ih = atoi(argv[2]); - iw = atoi(argv[3]); - fn = atoi(argv[4]); - } - fc = ic; - fh = ih; - fw = iw; - - on = 1; - oc = fn; - oh = 1; - ow = 1; - - TensorDesc inputDesc, filterDesc, outputDesc, biasDesc; - TensorDesc filterDesc_cpu, outputDesc_cpu; - - inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); - filterDesc = tensor4df(dt, DF_NCHW, fn, fc, fh, fw); - filterDesc_cpu = tensor2df(dt, DF_NORMAL, fn, fc * fh * fw); - outputDesc_cpu = 
tensor2df(dt, DF_NORMAL, 1, fn); - biasDesc = tensor1d(dt, oc); - - U8 *input_cpu = ut_input_v(in*ic*ih*iw, dt, UT_INIT_RANDOM); - U8 *filter_cpu = ut_input_v(fn*fc*fh*fw, dt, UT_INIT_RANDOM); - U8 *bias_cpu = ut_input_v(oc, dt, UT_INIT_RANDOM); - U8 *output_cpu = ut_input_v(on*oc*oh*ow, dt, UT_INIT_ZERO); -// U8 *output_gpu = ut_input_v(on*oc*oh*ow, dt, UT_INIT_ZERO); - U8 *output_gpu = NULL; - - GCLHandle_t handle; - CHECK_STATUS(gcl_create_handle(&handle)); - CHECK_STATUS(gcl_regist_binMap(handle)); - - ExtInfo extInfo; - U32 str[3] = {0, 0, 0}; - U32 off[3] = {0, 0, 0}; - GCLMemDesc inputMemDesc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); - GCLMemDesc outputMemDesc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); - GCLMemDesc filterMemDesc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); - ForwardRunInfoMali runInfo; - runInfo.algorithm = (I32)(CONVOLUTION_ALGORITHM_NULL); - runInfo.best_w[0] = 1; - runInfo.best_c[0] = 1; - runInfo.best_k[0] = 1; - extInfo.maliInfo.handle = handle; - extInfo.maliInfo.gclmemInputDesc = NULL; - extInfo.maliInfo.gclmemOutputDesc = NULL; - extInfo.maliInfo.gclmemFilterDesc = NULL; - extInfo.maliInfo.forwardRunInfo = &runInfo; - - CHECK_STATUS(fully_connected_infer_output_size(inputDesc, filterDesc, &outputDesc, arch, &extInfo)); - std::vector outputDescs; - outputDescs.push_back(outputDesc); - CHECK_STATUS(fully_connected_infer_forward_algorithm(inputDesc, filterDesc, outputDescs, arch, &extInfo)); - extInfo.maliInfo.gclmemInputDesc = &inputMemDesc; - extInfo.maliInfo.gclmemOutputDesc = &outputMemDesc; - extInfo.maliInfo.gclmemFilterDesc = &filterMemDesc; - CHECK_STATUS(fully_connected_infer_output_size(inputDesc, filterDesc, NULL, arch, &extInfo)); - - U32 maxBytes = 0; - U32 tmpBytes; - CHECK_STATUS(fully_connected_infer_forward_tmp_bytes(inputDesc, filterDesc, &tmpBytes, arch, &extInfo)); - maxBytes = (tmpBytes > maxBytes) ? 
tmpBytes : maxBytes; - U8 *tmp_cpu = ut_input_v(tmpBytes/bytesOf(dt), dt, UT_INIT_ZERO); - - U32 ftmBytes; - CHECK_STATUS(fully_connected_transform_filter_bytes(filterDesc, &ftmBytes, arch, &extInfo)); - - GCLMem_t input = gcl_create_gclmem(); - GCLMem_t filter = gcl_create_gclmem(); - GCLMem_t output = gcl_create_gclmem(); - GCLMem_t bias = gcl_create_gclmem(); - GCLMem_t tmpbuf = gcl_create_gclmem(); - GCLMem_t filter_org = gcl_create_gclmem(); - outputMemDesc.use_map = true; - outputMemDesc.flags = CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; - outputMemDesc.byteSize = 2 * outputMemDesc.byteSize; - input->desc = inputMemDesc; - filter->desc = filterMemDesc; - output->desc = outputMemDesc; - biasNum = oc; - bias->desc.memType = GCL_MEM_BUF; - bias->desc.byteSize = biasNum * bytesOf(dt); - bias->desc.stride[0] = biasNum; - bias->desc.stride[1] = 1; - bias->desc.stride[2] = 1; - bias->desc.offset[0] = 0; - bias->desc.offset[1] = 0; - bias->desc.offset[2] = 0; - bias->desc.num = biasNum; - bias->desc.memFormat = DF_NHWC; - bias->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; - bias->desc.host_ptr = bias_cpu; - - filter_org->desc.stride[0] = fw * fh * fc; - filter_org->desc.stride[1] = fn; - filter_org->desc.stride[2] = 1; - filter_org->desc.offset[0] = 0; - filter_org->desc.offset[1] = 0; - filter_org->desc.offset[2] = 0; - filter_org->desc.byteSize = fw * fh * fc * fn * bytesOf(dt); - filter_org->desc.num = fw * fh * fc * fn; - filter_org->desc.memType = GCL_MEM_BUF; - filter_org->desc.memFormat = DF_NCHW; - filter_org->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; - filter_org->desc.host_ptr = filter_cpu; - - gcl_create_memory(handle, input); - gcl_create_memory(handle, output); - gcl_create_memory(handle, filter); - gcl_create_memory(handle, bias); - gcl_create_memory(handle, filter_org); - CHECK_STATUS(gcl_fill_memory_zero(handle, input)); - - CHECK_STATUS(tensor_computing_set_input_infer_tmpBuf_size(input, inputDesc, &tmpBytes, arch)); - maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; - CHECK_STATUS(tensor_computing_get_output_infer_tmpBuf_size(output, outputDesc, &tmpBytes, arch)); - maxBytes = (tmpBytes > maxBytes) ? 
tmpBytes : maxBytes; - tmpbuf->desc.byteSize = maxBytes; - if(maxBytes) gcl_create_memory(handle, tmpbuf); - - TensorDesc filterDescTran; - std::vector filterArray; - std::vector outputArray; - std::vector biasArray; - filterArray.push_back(filter); - outputArray.push_back(output); - biasArray.push_back(bias); - - CHECK_STATUS(fully_connected_transform_filter(inputDesc, filterDesc, filter_org, &filterDescTran, &filterArray, arch, &extInfo)); - - CHECK_STATUS(tensor_computing_set_input(input, inputDesc, input_cpu, tmpbuf, true, arch, &extInfo)); - CHECK_STATUS(fully_connected(inputDesc, input, filterDesc, &filterArray, tmpbuf, tmpBytes, outputDesc, &outputArray, biasDesc, &biasArray, arch, &extInfo)); -#ifndef _DEBUG - CHECK_STATUS(gcl_run_kernelVec(handle)); -#endif - CHECK_STATUS(tensor_computing_get_output(output, outputDesc, NULL, NULL, true, arch, &extInfo)); - output_gpu = output->desc.map_ptr; -#ifdef _DEBUG - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u %u %u)+(%u %u %u %u)=(%u %u %u %u)", - in, ic, ih, iw, - fn, fc, fh, fw, - on, oc, oh, ow); - sprintf(buffer, "%20s, %80s", "InnerProdect", params); - double time = handle->t_total * 0.001; - double ops = 2.0 * fn * fc * fh * fw + 1.0 * fn; - ut_log(dt, buffer, ops, time); -#endif - - CHECK_STATUS(fully_connected(inputDesc, input_cpu, filterDesc_cpu, filter_cpu, tmp_cpu, tmpBytes, outputDesc_cpu, output_cpu, biasDesc, bias_cpu, CPU_GENERAL)); - ut_check_a(output_gpu, output_cpu, on * oc * ow * oh, dt); - - CHECK_STATUS(gcl_finish(handle)); - free(input_cpu ); - free(filter_cpu); - free(bias_cpu ); - free(output_cpu); -// free(output_gpu); - free(tmp_cpu); - CHECK_STATUS(gcl_unmap_memory(handle, output)); - gcl_destroy_gclmem(input); - gcl_destroy_gclmem(filter); - gcl_destroy_gclmem(filter_org); - gcl_destroy_gclmem(output); - gcl_destroy_gclmem(bias); - gcl_destroy_gclmem(tmpbuf); - gcl_destroy_handle(handle); - return 0; -} -#endif - - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - fullyConnectedTest(argc, argv, DT_F16); -#endif - return 0; -} diff --git a/tests/test_image_resize.cpp b/tests/test_image_resize.cpp deleted file mode 100644 index 5780aec5..00000000 --- a/tests/test_image_resize.cpp +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
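The fully-connected test just removed checks the GPU path against CPU_GENERAL with the filter flattened to a 2-D matrix: its `tensor2df(dt, DF_NORMAL, fn, fc*fh*fw)` descriptor treats each output neuron as one row of length fc*fh*fw. A naive sketch of that reference inner product (hypothetical helper name, row-major layout assumed):

#include <cstdio>
#include <vector>

// Naive fully-connected reference: y[o] = sum_i W[o][i] * x[i] + b[o],
// with W flattened to fn rows of k = fc*fh*fw, as the deleted test's
// 2-D filter descriptor implies.
static void fcReference(const float *x, const float *w, const float *b,
                        float *y, unsigned fn, unsigned k) {
    for (unsigned o = 0; o < fn; o++) {
        float acc = b ? b[o] : 0.0f;
        for (unsigned i = 0; i < k; i++) {
            acc += w[o * k + i] * x[i];
        }
        y[o] = acc;
    }
}

int main() {
    unsigned fn = 4, k = 4 * 4 * 4;  // fn x (fc*fh*fw), the test's defaults
    std::vector<float> x(k, 1.0f), w(fn * k, 0.5f), b(fn, 0.1f), y(fn);
    fcReference(x.data(), w.data(), b.data(), y.data(), fn, k);
    printf("y[0] = %f\n", y[0]);     // expect 0.5*64 + 0.1 = 32.1
    return 0;
}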
- - -#include -#include "image.h" -#include "ut_util.h" - - -int resizeTest(int argc, char* argv[], DataType dt) -{ - CHECK_REQUIREMENT(argc == 9); - // in data - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - // output - U32 on = atoi(argv[5]); - U32 oc = atoi(argv[6]); - U32 oh = atoi(argv[7]); - U32 ow = atoi(argv[8]); - - CHECK_REQUIREMENT(in == 1 && on == 1); - CHECK_REQUIREMENT(ic % 8 == 0 && oc % 8 == 0); - - TensorDesc inputDesc, outputDesc; - ResizeDesc resizeDesc; - inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); - - resizeDesc.paramDT = DT_F32; - F32 scales[2]; - scales[0] = (F32)oh / (F32)ih; - scales[1] = (F32)ow / (F32)iw; - - // setup input, filter - U8 *input = ut_input_v(in*ic*ih*iw, dt, UT_INIT_RANDOM); - U8 *input_ref = ut_input_v(in*ic*ih*iw, dt, UT_INIT_ZERO); - memcpy(input_ref, input, bytesOf(dt)*in*ic*ih*iw); - - // setup output - U32 outputBytes; - CHECK_STATUS(resize_infer_output_size(inputDesc, resizeDesc, scales, &outputDesc, &outputBytes)); - CHECK_REQUIREMENT(tensorNumElements(outputDesc) == on*oc*oh*ow); - U32 output_size = outputBytes / bytesOf(dt); - U8 *output = ut_input_v(output_size, dt, UT_INIT_ZERO); - U8 *output_ref = ut_input_v(output_size, dt, UT_INIT_ZERO); - - if (UT_CHECK) { - CHECK_STATUS(resize(inputDesc, input, - outputDesc, output, - UT_ARCH)); - - // naive implement - CHECK_STATUS(resize(inputDesc, input_ref, - outputDesc, output_ref, - CPU_GENERAL)); - - // check - ut_check_v(output, output_ref, output_size, dt, 0.05, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(resize(inputDesc, input_ref, - outputDesc, output_ref, - CPU_GENERAL)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u %u %u)=>(%u %u %u %u)", - in, ic, ih, iw, - on, oc, oh, ow); - sprintf(buffer, "%20s, %80s", "Resize", params); - double ops = 15.0 * on * oc * oh * ow; - ut_log(dt, buffer, ops, time); - - free(input); - free(output); - free(input_ref); - free(output_ref); - return 0; -} - -int main(int argc, char* argv[]) -{ -#ifdef _USE_FP16 - resizeTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - resizeTest(argc, argv, DT_F32); -#endif - return 0; -} diff --git a/tests/test_lstm.cpp b/tests/test_lstm.cpp deleted file mode 100644 index 0952873b..00000000 --- a/tests/test_lstm.cpp +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
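The resize test above derives `scales[0] = oh/ih` and `scales[1] = ow/iw` and hands them to the library. The sketch below is only a nearest-neighbour stand-in to show how those scales map output pixels back to input coordinates; the library's actual kernel is presumably bilinear (its cost model charges 15 ops per output pixel), so the interpolation choice here is an assumption.

#include <cstdio>
#include <vector>

// Nearest-neighbour resize for one channel plane; sh/sw invert the
// scales[] the deleted test passed to resize_infer_output_size.
static void resizeNearest(const float *in, unsigned ih, unsigned iw,
                          float *out, unsigned oh, unsigned ow) {
    float sh = (float)ih / (float)oh;   // inverse of scales[0] = oh/ih
    float sw = (float)iw / (float)ow;
    for (unsigned y = 0; y < oh; y++) {
        unsigned sy = (unsigned)(y * sh);
        if (sy >= ih) sy = ih - 1;      // clamp at the border
        for (unsigned x = 0; x < ow; x++) {
            unsigned sx = (unsigned)(x * sw);
            if (sx >= iw) sx = iw - 1;
            out[y * ow + x] = in[sy * iw + sx];
        }
    }
}

int main() {
    std::vector<float> in(4 * 4);
    for (unsigned i = 0; i < in.size(); i++) in[i] = (float)i;
    std::vector<float> out(8 * 8);
    resizeNearest(in.data(), 4, 4, out.data(), 8, 8);
    printf("out[0]=%f out[63]=%f\n", out[0], out[63]);
    return 0;
}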
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include - -#include "tensor_computing.h" -#include "ut_util.h" - -int lstmTest(int argc, char **argv, DataType dt) { - CHECK_REQUIREMENT(argc == 5); - U32 batch = atoi(argv[1]); - U32 step = atoi(argv[2]); - U32 xDim = atoi(argv[3]); - U32 hDim = atoi(argv[4]); - - LSTMDesc lstmDesc; - lstmDesc.numOutput = hDim; - lstmDesc.numProjection = 1024; - lstmDesc.forgetBias = 1.0; - lstmDesc.activationMode = ACTIVATION_TANH; - - U32 column = (lstmDesc.numProjection > 0) ? lstmDesc.numProjection : lstmDesc.numOutput; - TensorDesc inputDesc = tensor3df(dt, DF_MTK, batch, step, xDim); - TensorDesc filterDesc = tensor2df(dt, DF_NK, 4*column, xDim+hDim); - TensorDesc biasDesc = tensor1d(dt, column*4); - U32 outputBytes, tmpBytes, ftmBytes; - TensorDesc outputDesc; - CHECK_STATUS(lstm_infer_output_size(inputDesc, filterDesc, lstmDesc, &outputDesc, &outputBytes)); - CHECK_STATUS(lstm_infer_forward_tmp_bytes(inputDesc, filterDesc, outputDesc, lstmDesc, &tmpBytes, UT_ARCH)); - CHECK_STATUS(lstm_transform_filter_bytes(filterDesc, lstmDesc, &ftmBytes, UT_ARCH)); - - U32 inputLength = batch * step * xDim; - U32 filterLength = (xDim + hDim) * column * 4 + lstmDesc.numProjection * lstmDesc.numOutput; - U32 biasLength = column * 4; - U32 outputLength = outputBytes / bytesOf(dt); - U8* input = ut_input_v(inputLength, dt, UT_INIT_RANDOM); - U8* filter = ut_input_v(filterLength, dt, UT_INIT_RANDOM); - U8* bias = ut_input_v(biasLength, dt, UT_INIT_RANDOM); - U8* output = ut_input_v(outputLength, dt, UT_INIT_ZERO); - U8* outputRef = ut_input_v(outputLength, dt, UT_INIT_ZERO); - U8* tmp = ut_input_v(tmpBytes/bytesOf(dt), dt, UT_INIT_ZERO); - U8* ftm = ut_input_v(ftmBytes/bytesOf(dt), dt, UT_INIT_ZERO); - - TensorDesc ftmDesc; - CHECK_STATUS(lstm_transform_filter(filterDesc, filter, lstmDesc, &ftmDesc, ftm, UT_ARCH)); - - if (UT_CHECK) { - CHECK_STATUS(lstm(inputDesc, input, ftmDesc, ftm, biasDesc, bias, tmpBytes, tmp, lstmDesc, outputDesc, output, UT_ARCH)); - - // naive implement - CHECK_STATUS(lstm(inputDesc, input, ftmDesc, ftm, biasDesc, bias, tmpBytes, tmp, lstmDesc, outputDesc, outputRef, CPU_GENERAL)); - - // check - ut_check_v(output, outputRef, outputLength, dt, 10, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(lstm(inputDesc, input, ftmDesc, ftm, biasDesc, bias, tmpBytes, tmp, lstmDesc, outputDesc, output, UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "%u (%u %u %u)=(%u %u)", - batch, step, xDim, hDim, - batch, hDim); - sprintf(buffer, "%20s, %80s", "LSTM", params); - double hxDim = hDim + xDim; - double ops = 1.0 * batch * step * (2.0 * hxDim * column * 4 + column * 4 + lstmDesc.numProjection * lstmDesc.numOutput); - ut_log(dt, buffer, ops, time); - - free(input); - free(filter); - free(bias); - free(output); - free(outputRef); - free(tmp); - free(ftm); - - return 0; -} - - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - lstmTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - lstmTest(argc, argv, DT_F32); -#endif - return 0; -} diff --git a/tests/test_mmm.cpp b/tests/test_mmm.cpp deleted file mode 
100644 index b55481d4..00000000 --- a/tests/test_mmm.cpp +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include "blas-enhance.h" -#include "ut_util.h" - -int mmmTest(int argc, char** argv, DataType dt) -{ - CHECK_REQUIREMENT(argc == 4); - U32 m = atoi(argv[1]); - U32 k = atoi(argv[2]); - U32 n = atoi(argv[3]); - - TensorDesc A_desc = tensor2df(dt, DF_NORMAL, m, k); - TensorDesc B_desc = tensor2df(dt, DF_NORMAL, k, n); - TensorDesc tranDescB; - TensorDesc C_desc = tensor2df(dt, DF_NORMAL, m, n); - - U32 bytes = 0; - U8* A = ut_input_v(m * k, dt, UT_INIT_RANDOM); - U8* B = ut_input_v(k * n, dt, UT_INIT_RANDOM); - U8* B_tran = ut_input_v(k * n + 32, dt, UT_INIT_ZERO); - U8* C = ut_input_v(m * n, dt, UT_INIT_ZERO); - U8* C_ref = ut_input_v(m * n, dt, UT_INIT_ZERO); - CHECK_STATUS(matrix_matrix_multiply_tmp_bytes(A_desc, B_desc, &bytes, UT_ARCH)); - U8* tmp = ut_input_v(bytes/bytesOf(dt), dt, UT_INIT_ZERO); - - matrix_matrix_multiply_transform_rhs(B_desc, B, &tranDescB, B_tran); - if (UT_CHECK) { - CHECK_STATUS(matrix_matrix_multiply(A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, UT_ARCH)); - - // naive implement - CHECK_STATUS(matrix_matrix_multiply(A_desc, A, B_desc, B, bytes, tmp, C_desc, C_ref, CPU_GENERAL)); - - // check - ut_check_v(C, C_ref, m*n, dt, 10, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++) { - matrix_matrix_multiply(A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, UT_ARCH); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u)+(%u %u)=(%u %u)", - m, k, k, n, m, n); - sprintf(buffer, "%20s, %80s", "MatrixMultiply", params); - double ops = 2.0 * m * n * k + 1.0 * m * n; - ut_log(dt, buffer, ops, time); - - free(A); - free(B); - free(B_tran); - free(C); - free(C_ref); - free(tmp); - - return 0; -} - -int main(int argc, char** argv) -{ -#ifdef _USE_FP16 - mmmTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - mmmTest(argc, argv, DT_F32); -#endif - return 0; -} diff --git a/tests/test_mmm_int8.cpp b/tests/test_mmm_int8.cpp deleted file mode 100644 index 51686b1a..00000000 --- a/tests/test_mmm_int8.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
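The matrix-matrix test above repacks B once up front (`matrix_matrix_multiply_transform_rhs`) so the timed loop pays only for the multiply, and it reports 2*m*n*k + m*n ops. A minimal naive reference for the same product, matching that count (helper name hypothetical, row-major layout assumed):

#include <cstdio>
#include <vector>

// Naive C = A * B with A (m x k), B (k x n): one multiply and one add
// per (i, j, p) triple, i.e. the 2*m*n*k the deleted test charges.
static void mmmReference(const float *a, const float *b, float *c,
                         unsigned m, unsigned k, unsigned n) {
    for (unsigned i = 0; i < m; i++) {
        for (unsigned j = 0; j < n; j++) {
            float acc = 0.0f;
            for (unsigned p = 0; p < k; p++) {
                acc += a[i * k + p] * b[p * n + j];
            }
            c[i * n + j] = acc;
        }
    }
}

int main() {
    unsigned m = 2, k = 3, n = 2;
    std::vector<float> a(m * k, 1.0f), b(k * n, 2.0f), c(m * n);
    mmmReference(a.data(), b.data(), c.data(), m, k, n);
    printf("c[0] = %f\n", c[0]);  // expect k * 1 * 2 = 6
    return 0;
}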
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include "blas-enhance.h" -#include "ut_util.h" - - -int main(int argc, char** argv) -{ - CHECK_REQUIREMENT(argc == 4); - U32 m = atoi(argv[1]); - U32 k = atoi(argv[2]); - U32 n = atoi(argv[3]); - - DataType dt = DT_I8; - DataType odt = DT_I32; - TensorDesc A_desc = tensor2df(dt, DF_NORMAL, m, k); - TensorDesc B_desc = tensor2df(dt, DF_NORMAL, k, n); - TensorDesc tranDescB; - TensorDesc C_desc = tensor2df(odt, DF_NORMAL, m, n); - - U32 bytes = 0; - U32 k4 = k; - if (k4 % 4 != 0) { - k4 = (k4 / 4) * 4 + 4; - } - INT8* A = (INT8*)ut_input_v(m * k, DT_I8, UT_INIT_RANDOM); - INT8* B = (INT8*)ut_input_v(k * n, DT_I8, UT_INIT_RANDOM); - INT8* B_tran = (INT8*)ut_input_v(k4 * n + 32, DT_I8, UT_INIT_ZERO); - I32* C = (I32*)ut_input_v(m * n, DT_I32, UT_INIT_ZERO); - I32* C_ref = (I32*)ut_input_v(m * n, DT_I32, UT_INIT_ZERO); - CHECK_STATUS(matrix_matrix_multiply_tmp_bytes(A_desc, B_desc, &bytes, UT_ARCH)); - INT8* tmp = (INT8 *)ut_input_v(bytes, DT_I8, UT_INIT_ZERO); - - matrix_matrix_multiply_transform_rhs(B_desc, B, &tranDescB, B_tran); - if (UT_CHECK){ - CHECK_STATUS(matrix_matrix_multiply(A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, UT_ARCH)); - - // naive implement - CHECK_STATUS(matrix_matrix_multiply(A_desc, A, B_desc, B, bytes, tmp, C_desc, C_ref, CPU_GENERAL)); - - // check - ut_check_v(C, C_ref, m*n, DT_I32, 1, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++) { - matrix_matrix_multiply(A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, UT_ARCH); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u)+(%u %u)=(%u %u)", - m, k, k, n, m, n); - sprintf(buffer, "%20s, %80s", "MatrixMultiply", params); - double ops = 2.0 * m * n * k + 1.0 * m * n; - ut_log(DT_I8, buffer, ops, time); - - free(A); - free(B); - free(B_tran); - free(C); - free(C_ref); - free(tmp); - - return 0; -} diff --git a/tests/test_multiply.cpp b/tests/test_multiply.cpp deleted file mode 100644 index ad489ced..00000000 --- a/tests/test_multiply.cpp +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
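Note the INT8 variant above allocates `B_tran` with k rounded up via `k4 = (k4/4)*4 + 4`. The likely reason (an inference, not stated in the test) is that fixed-point dot-product kernels such as ARM's SDOT consume int8 values in groups of four and widen into int32 accumulators, so the packed operand is zero-padded to the next multiple of 4. A small sketch of that padding and grouping:

#include <cstdint>
#include <cstdio>

// Mirrors the deleted test's k4 rounding: only pad when k % 4 != 0.
static uint32_t roundUp4(uint32_t k) {
    return (k % 4 == 0) ? k : (k / 4) * 4 + 4;
}

// 4-wide int8 dot product with an int32 accumulator; zero padding beyond
// the true k leaves the result unchanged.
static int32_t dotI8(const int8_t *a, const int8_t *b, uint32_t k4) {
    int32_t acc = 0;
    for (uint32_t i = 0; i < k4; i += 4) {
        for (uint32_t j = 0; j < 4; j++) {
            acc += (int32_t)a[i + j] * (int32_t)b[i + j];
        }
    }
    return acc;
}

int main() {
    printf("roundUp4(6) = %u\n", roundUp4(6));  // 8
    int8_t a[8] = {1, 2, 3, 4, 5, 6, 0, 0};     // k = 6, padded to 8
    int8_t b[8] = {1, 1, 1, 1, 1, 1, 0, 0};
    printf("dot = %d\n", dotI8(a, b, 8));       // 21
    return 0;
}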
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#include "ut_util.h" - -int multiplyTest(int argc, char** argv, DataType dt) { - CHECK_REQUIREMENT(argc == 4); - U32 len = atoi(argv[1]); - F32 alpha = atof(argv[2]); - F32 beta = atof(argv[3]); - - TensorDesc input_desc = tensor1d(dt, len); - TensorDesc output_desc; - CHECK_STATUS(multiply_infer_output_size(input_desc, &output_desc, UT_ARCH)); - - U8* input = ut_input_v(len, dt, UT_INIT_RANDOM); - U8* output = ut_input_v(len, dt, UT_INIT_ZERO); - U8* output_ref = ut_input_v(len, dt, UT_INIT_ZERO); - - if (UT_CHECK) { - CHECK_STATUS(multiply(&alpha, &beta, input_desc, input, output_desc, output, UT_ARCH)); - - // naive implement - CHECK_STATUS(multiply(&alpha, &beta, input_desc, input, output_desc, output_ref, CPU_GENERAL)); - - // check - ut_check_v(output, output_ref, len, dt, 0.1, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter ++) { - CHECK_STATUS(multiply(&alpha, &beta, input_desc, input, output_desc, output, UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u)=(%u)", - len, len); - sprintf(buffer, "%20s, %80s", "Multiply", params); - double ops = 2.0 * len; - ut_log(dt, buffer, ops, time/UT_LOOPS); - - free(input); - free(output); - free(output_ref); - - return 0; -} - - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - multiplyTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - multiplyTest(argc, argv, DT_F32); -#endif - return 0; -} diff --git a/tests/test_mvm.cpp b/tests/test_mvm.cpp deleted file mode 100644 index 9d8b55c4..00000000 --- a/tests/test_mvm.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
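The multiply test above passes scalar `alpha` and `beta` and charges two ops per element, which is consistent with an elementwise y[i] = alpha * x[i] + beta; that reading is an assumption here, since the authoritative definition is the library's own multiply kernel. A sketch under that assumption:

#include <cstdio>
#include <vector>

// Assumed semantics of the tested op: y[i] = alpha * x[i] + beta,
// i.e. one multiply and one add per element (ops = 2 * len).
static void multiplyReference(float alpha, float beta,
                              const float *x, float *y, unsigned len) {
    for (unsigned i = 0; i < len; i++) {
        y[i] = alpha * x[i] + beta;
    }
}

int main() {
    std::vector<float> x(4, 2.0f), y(4);
    multiplyReference(3.0f, 1.0f, x.data(), y.data(), 4);
    printf("y[0] = %f\n", y[0]);  // 3*2 + 1 = 7
    return 0;
}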
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include "blas-enhance.h" -#include "ut_util.h" - - -int mvmTest(int argc, char** argv, DataType dt) -{ - CHECK_REQUIREMENT(argc == 3); - U32 m = atoi(argv[1]); - U32 k = atoi(argv[2]); - - DataFormat df = DF_NORMAL; - U32 vc, rc; - if (df == DF_NORMAL) { - vc = k; - rc = m; - } - else { - vc = m; - rc = k; - } - - TensorDesc mat_desc = tensor2df(dt, df, m, k); - TensorDesc vec_desc = tensor1d(dt, vc); - TensorDesc res_desc = tensor1d(dt, rc); - - U8* mat = ut_input_v(m * k, dt, UT_INIT_RANDOM); - U8* vec = ut_input_v(vc, dt, UT_INIT_RANDOM); - U8* res = ut_input_v(rc, dt, UT_INIT_ZERO); - U8* res_ref = ut_input_v(rc, dt, UT_INIT_ZERO); - - U32 bytes = 0; - CHECK_STATUS(matrix_vector_multiply_tmp_bytes(mat_desc, vec_desc, &bytes, UT_ARCH)); - U8* tmp = ut_input_v(bytes/bytesOf(dt), dt, UT_INIT_ZERO); - // check - if (UT_CHECK) { - CHECK_STATUS(matrix_vector_multiply(mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res, UT_ARCH)); - - // naive implement - CHECK_STATUS(matrix_vector_multiply(mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res_ref, CPU_GENERAL)); - - ut_check_v(res, res_ref, rc, dt, 1, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++) { - matrix_vector_multiply(mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res, UT_ARCH); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u)+(%u)=(%u)", - m, k, vc, rc); - sprintf(buffer, "%20s, %80s", "MatrixVectorMultiply", params); - double ops = 2.0 * m * k; - ut_log(dt, buffer, ops, time); - - free(mat); - free(vec); - free(tmp); - free(res); - free(res_ref); - - return 0; -} - -int main(int argc, char** argv) -{ -#ifdef _USE_FP16 - mvmTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - mvmTest(argc, argv, DT_F32); -#endif - return 0; -} diff --git a/tests/test_mvm_int8.cpp b/tests/test_mvm_int8.cpp deleted file mode 100644 index 955540bf..00000000 --- a/tests/test_mvm_int8.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
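The matrix-vector test above sizes its operands from the matrix layout: with DF_NORMAL the matrix is m x k, the vector has vc = k entries, and the result rc = m (the transposed branch swaps them). A naive reference for the DF_NORMAL case, matching the 2*m*k op count it logs:

#include <cstdio>
#include <vector>

// Naive MVM for a row-major m x k matrix: res[i] = sum_j mat[i][j] * vec[j].
static void mvmReference(const float *mat, const float *vec, float *res,
                         unsigned m, unsigned k) {
    for (unsigned i = 0; i < m; i++) {
        float acc = 0.0f;
        for (unsigned j = 0; j < k; j++) {
            acc += mat[i * k + j] * vec[j];
        }
        res[i] = acc;
    }
}

int main() {
    unsigned m = 2, k = 3;
    std::vector<float> mat(m * k, 1.0f), vec(k, 2.0f), res(m);
    mvmReference(mat.data(), vec.data(), res.data(), m, k);
    printf("res[0] = %f\n", res[0]);  // 3 * 2 = 6
    return 0;
}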
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include "blas-enhance.h" -#include "ut_util.h" - - -int main(int argc, char** argv) { - CHECK_REQUIREMENT(argc == 3); - U32 m = atoi(argv[1]); - U32 k = atoi(argv[2]); - - DataFormat df = DF_NORMAL; - DataType dt = DT_I8; - DataType odt = DT_I32; - U32 vc, rc; - if (df == DF_NORMAL) { - vc = k; - rc = m; - } - else { - vc = m; - rc = k; - } - - TensorDesc mat_desc = tensor2df(dt, df, m, k); - TensorDesc vec_desc = tensor1d(dt, vc); - TensorDesc res_desc = tensor1d(odt, rc); - - INT8* mat = (INT8*)ut_input_v(m * k, DT_I8, UT_INIT_RANDOM); - INT8* vec = (INT8*)ut_input_v(vc, DT_I8, UT_INIT_RANDOM); - I32* res = (I32*)ut_input_v(rc, DT_I32, UT_INIT_ZERO); - I32* res_ref = (I32*)ut_input_v(rc, DT_I32, UT_INIT_ZERO); - - U32 bytes; - CHECK_STATUS(matrix_vector_multiply_tmp_bytes(mat_desc, vec_desc, &bytes, UT_ARCH)); - I32* tmp = (I32*)ut_input_v(bytes/bytesOf(DT_I32), DT_I32, UT_INIT_ZERO); - // check - if (UT_CHECK) { - CHECK_STATUS(matrix_vector_multiply(mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res, UT_ARCH)); - - // naive implement - CHECK_STATUS(matrix_vector_multiply(mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res_ref, CPU_GENERAL)); - - ut_check_v(res, res_ref, rc, DT_I32, 10, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++) { - matrix_vector_multiply(mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res, UT_ARCH); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u)+(%u)=(%u)", - m, k, vc, rc); - sprintf(buffer, "%20s, %80s", "MatrixVectorMultiply", params); - double ops = 2.0 * m * k; - ut_log(DT_I8, buffer, ops, time); - - free(mat); - free(vec); - free(res); - free(res_ref); - - return 0; -} diff --git a/tests/test_padding.cpp b/tests/test_padding.cpp deleted file mode 100644 index fc9d553b..00000000 --- a/tests/test_padding.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
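The int8 variant above allocates DT_I32 outputs even though both operands are DT_I8: the product of two int8 values does not fit in int8, and a long dot product can exceed int16 as well, so accumulation widens to int32. A standalone sketch of that accumulation pattern:

#include <cstdint>
#include <cstdio>
#include <vector>

// int8 matrix-vector multiply with int32 accumulators, illustrating why the
// deleted test pairs DT_I8 inputs with a DT_I32 result tensor.
static void mvm_int8_ref(const int8_t *mat, const int8_t *vec, int32_t *res, int m, int k) {
    for (int i = 0; i < m; i++) {
        int32_t acc = 0;
        for (int j = 0; j < k; j++) {
            acc += (int32_t)mat[i * k + j] * (int32_t)vec[j];
        }
        res[i] = acc;
    }
}

int main() {
    const int m = 1, k = 4;
    std::vector<int8_t> mat = {127, 127, 127, 127};
    std::vector<int8_t> vec = {127, 127, 127, 127};
    std::vector<int32_t> res(m);
    mvm_int8_ref(mat.data(), vec.data(), res.data(), m, k);
    printf("%d\n", res[0]);  // 64516 = 4 * 127 * 127, already past the int16 range
    return 0;
}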
- - -#include "tensor_computing.h" -#include "ut_util.h" - -int paddingTest(int argc, char **argv, DataType dt) { - // input dim - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - - // padding info - U32 n_fir = atoi(argv[5]); - U32 c_fir = atoi(argv[6]); - U32 h_fir = atoi(argv[7]); - U32 w_fir = atoi(argv[8]); - U32 n_sec = atoi(argv[9]); - U32 c_sec = atoi(argv[10]); - U32 h_sec = atoi(argv[11]); - U32 w_sec = atoi(argv[12]); - - // output dim - U32 on = in + n_fir + n_sec; - U32 oc = ic + c_fir + c_sec; - U32 oh = ih + h_fir + h_sec; - U32 ow = iw + w_fir + w_sec; - - PadDesc padDesc; - - padDesc.top = h_fir; - padDesc.bottom = h_sec; - padDesc.left = w_fir; - padDesc.right = w_sec; - padDesc.constant_value = 0.0; - // padDesc.pad_mode = Pad_Constant; - // padDesc.pad_mode = Pad_Reflect; //limitation: the h_fir and the h_sec should lower than 0 - padDesc.pad_mode = Pad_Edge; - - TensorDesc input_desc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); - TensorDesc output_desc; - CHECK_STATUS(padding_infer_output_size(input_desc, padDesc, &output_desc)); - U32 input_len = tensorNumElements(input_desc); - U32 output_len = tensorNumElements(output_desc); - U8* input = (U8*)malloc(input_len * sizeof(dt)); - F16* input_assign = (F16*)input; - for (int i=0; i -#include "type.h" -#include "tensor_desc.h" -#include "sequential_ocl.hpp" -#include "factory.hpp" -#include "ocl/factory_ocl.hpp" -#include "tensor.hpp" -#include "data_loader.hpp" - - -void print_help() { - - std::cout << "please set argvs: " < -void buildInputTensor(DataType dt, DataFormat df, U32 n, U32 c, U32 h, U32 w, Vec* dims, Vec* inputTensors){ - TensorDesc inputDesc = tensor4df(dt, df, n, c, h, w); - U32 inputNum = tensorNumElements(inputDesc); - U32 inputSize = tensorNumBytes(inputDesc); - U8* inputVal = (U8*) operator new (inputSize); - - T* data = (T*) inputVal; - if(dt == DT_F16){ - for(U32 i = 0; i < inputNum; i++) data[i] = (T)(rand() & 255) / 256.0 - 0.5; - //for(U32 i = 0; i < inputNum; i++) data[i] = (T)(i & 255) / 255.0; - } - if(dt == DT_U8){ - for(U32 i = 0; i < inputNum; i++) { - data[i] = (T)(i & 255); - } - } - std::shared_ptr inputTensor = std::shared_ptr(new Tensor()); - inputTensor->set_desc(inputDesc); - inputTensor->set_shared_ptr(std::shared_ptr(inputVal)); - - dims->push_back(inputDesc); - inputTensors->push_back(*inputTensor.get()); -} - -int main(int argc, char* argv[]) { - - if(argc != 16 && argc != 17 && argc != 18 &&argc != 19) { - printf("%d\n", argc); - print_help(); - return 0; - } - - U32 inputNum = 1; - std::string pm = "NULL"; - std::string DT_NAME = "F16"; - std::string opName = argv[1]; - - - U32 in = atoi(argv[2]); - U32 ic = atoi(argv[3]); - U32 ih = atoi(argv[4]); - U32 iw = atoi(argv[5]); - - U32 fn = atoi(argv[6]); - U32 fc = atoi(argv[7]); - U32 fh = atoi(argv[8]); - U32 fw = atoi(argv[9]); - - U32 sw = atoi(argv[10]); - U32 sh = atoi(argv[11]); - U32 pl = atoi(argv[12]); - U32 pr = atoi(argv[13]); - U32 pt = atoi(argv[14]); - U32 pb = atoi(argv[15]); - if(argc == 17){ - inputNum = atoi(argv[16]); - } - if(argc == 18){ - pm = argv[17]; - } - if(argc == 19){ - DT_NAME = argv[18]; - } - - - const Arch A = MALI; - DataType dt = DT_F16; - auto model = new SequentialOcl(A, dt, opName); - std::shared_ptr model_ptr = std::shared_ptr(model); - - OperatorType OType; - if(opName == "OT_Pooling") OType = OT_Pooling; - if(opName == "OT_Conv") OType = OT_Conv; - if(opName == "OT_Eltwise") OType = OT_Eltwise; - if(opName == "OT_Softmax") OType = 
OT_Softmax; - if(opName == "OT_Relu") OType = OT_Relu; - if(opName == "OT_Relu6") OType = OT_Relu6; - if(opName == "OT_HSwish") OType = OT_HSwish; - if(opName == "OT_HSigmoid") OType = OT_HSigmoid; - if(opName == "OT_Gelu") OType = OT_Gelu; - if(opName == "OT_TanH") OType = OT_TanH; - if(opName == "OT_Sigmoid") OType = OT_Sigmoid; - if(opName == "OT_FC") OType = OT_FC; - if(opName == "OT_Scale") OType = OT_Scale; - if(opName == "OT_Concat") OType = OT_Concat; - if(opName == "OT_Clip") OType = OT_Clip; - if(opName == "OT_Squeeze") OType = OT_Squeeze; - if(opName == "OT_Reshape") OType = OT_Reshape; - if(opName == "OT_Space2Depth") OType = OT_Space2Depth; - if(opName == "OT_Depth2Space") OType = OT_Depth2Space; - Factory* factory_ocl = (Factory*)(new FactoryOCL()); - std::shared_ptr factory; - factory = std::shared_ptr(factory_ocl); - ConvolutionMode convMode; -// convMode = Convolution_Depthwise_Pointwise; - convMode = Convolution_Pointwise; - - switch(OType) { - case OT_Pooling: { - auto op = factory->createPooling(PoolingMode::POOLING_MAX, fh, fw, sh, sw, pt, pb, pl, pr, RoundMode::CEIL); - model_ptr->add(op); - break; - } - case OT_Eltwise: { - auto op = factory->createEltwise(EltwiseMode::ELTWISE_SUM, 0, NULL); - model_ptr->add(op); - break; - } - case OT_Softmax: { - auto op = factory->createSoftmax(dt, -1); - model_ptr->add(op); - break; - } - case OT_Conv: { - if(pm == "NULL") { - ActivationDesc dwActivationDesc, pwActivationDesc; - dwActivationDesc.mode = ACTIVATION_NULL; - pwActivationDesc.mode = ACTIVATION_NULL; - auto op = factory->createConvolution(dt, fn, fh, fw, sh, sw, pt, pb, pl, pr, dwActivationDesc, pwActivationDesc, convMode, 1, 1, 1); - model_ptr->add(op); - } - - if(pm == "RELU") { - ActivationDesc dwActivationDesc, pwActivationDesc; - dwActivationDesc.mode = ACTIVATION_RELU; - dwActivationDesc.value[0] = 0; - pwActivationDesc.mode = ACTIVATION_NULL; - auto op = factory->createConvolution(dt, fn, fh, fw, sh, sw, pt, pb, pl, pr, dwActivationDesc, pwActivationDesc, convMode, 1, 1, 1); - model_ptr->add(op); - } - break; - } - case OT_Relu: { - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_RELU; - activationDesc.value[0] = 0; - auto op = factory->createActivation(activationDesc); - model_ptr->add(op); - break; - } - case OT_Relu6: { - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_RELU6; - auto op = factory->createActivation(activationDesc); - model_ptr->add(op); - break; - } - case OT_HSwish: { - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_H_SWISH; - auto op = factory->createActivation(activationDesc); - model_ptr->add(op); - break; - } - case OT_HSigmoid: { - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_H_SIGMOID; - auto op = factory->createActivation(activationDesc); - model_ptr->add(op); - break; - } - case OT_Gelu: { - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_GELU; - auto op = factory->createActivation(activationDesc); - model_ptr->add(op); - break; - } - case OT_TanH: { - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_TANH; - auto op = factory->createActivation(activationDesc); - model_ptr->add(op); - break; - } - case OT_Sigmoid: { - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_SIGMOID; - auto op = factory->createActivation(activationDesc); - model_ptr->add(op); - break; - } - case OT_FC: { - auto op = factory->createFullyConnected(dt, ih * iw * ic, fn, 1, nullptr); - model_ptr->add(op); - break; - } - case OT_Scale: { - 
auto op = factory->createScale(dt, 1, fc, inputNum); - model_ptr->add(op); - break; - } - case OT_Concat: { - auto op = factory->createConcat(1); - model_ptr->add(op); - break; - } - case OT_Clip: { - auto op = factory->createClip(dt, 0, 0.5); - model_ptr->add(op); - break; - } - case OT_Squeeze: { - int dim[4] = {1, 1, 1, 1}; - auto op = factory->createSqueeze(dt, 0, dim, 4); - model_ptr->add(op); - break; - } - case OT_Reshape: { - int dim[2] = {-1, 8}; - auto op = factory->createReshape(dt, dim, 2, 0, 0); - model_ptr->add(op); - break; - } - case OT_Space2Depth: { - auto op = factory->createSpace2Depth(dt); - model_ptr->add(op); - break; - } - case OT_Depth2Space: { - auto op = factory->createDepth2Space(dt); - model_ptr->add(op); - break; - } - default: std::cout << "not support op" << std::endl; - } - - Vec dims; - Vec inputTensors; - if(OType == OT_Space2Depth){ - for(U32 i = 0; i < inputNum; i++){ - buildInputTensor(DT_U8, DF_NCHW, in, ic, ih, iw, &dims, &inputTensors); - } - } else { - for(U32 i = 0; i < inputNum; i++){ - buildInputTensor(DT_F16, DF_NCHW, in, ic, ih, iw, &dims, &inputTensors); - } - } - - U8* weightVal = NULL; - if(OType == OT_Conv){ - TensorDesc weightDesc = tensor4df(dt, DF_NCHW, fn, fc, fh, fw); - U32 weightNum = tensorNumElements(weightDesc); - U32 vectorNum = fn; - if(convMode == Convolution_Depthwise_Pointwise) vectorNum = fc + fn + fn * fc; - U32 weightSize = tensorNumBytes(weightDesc) + vectorNum * bytesOf(dt); - weightVal = (U8*) operator new (weightSize); - F16* weight = (F16*) weightVal; - for(U32 i = 0; i < weightNum + vectorNum; i++){ - weight[i] = (F16)(rand() & 255) / 256.0; - } - } - - if(OType == OT_FC){ - U32 weightNum = iw * ih * ic * fn; - U32 biasNum = fn; - U32 weightSize = (weightNum + biasNum) * bytesOf(dt); - weightVal = (U8*) operator new (weightSize); - F16* weight = (F16*) weightVal; - for(U32 i = 0; i < weightNum + biasNum; i++){ - weight[i] = (F16)(rand() & 255) / 256.0; - } - } - - if(OType == OT_Scale){ - U32 weightNum = fc; - U32 biasNum = fc; - U32 weightSize = (weightNum + biasNum) * bytesOf(dt); - weightVal = (U8*) operator new (weightSize); - F16* weight = (F16*) weightVal; - for(U32 i = 0; i < weightNum + biasNum; i++){ - weight[i] = (F16)(rand() & 255) / 256.0; - } - } - - if(weightVal){ - std::shared_ptr modelPtr(weightVal); - model_ptr->ready(dims, modelPtr, 1); - } else { - model_ptr->ready(dims, NULL, 1); - } - model_ptr->mark_input_output(); - model_ptr->mali_prepare(); - model_ptr->set_input_tensors(inputTensors); - model_ptr->run(); - - auto output = model_ptr->get_output_tensors(); - std::shared_ptr oclMem = output[0]->get_shared_ptr(); - F16* val = (F16*) (oclMem->desc.map_ptr); - for(int i = 0; i < 64; i++) std::cout << val[i] << " "; - std::cout << std::endl; - return 0; -} -#endif diff --git a/tests/test_pooling.cpp b/tests/test_pooling.cpp deleted file mode 100644 index 074e62b7..00000000 --- a/tests/test_pooling.cpp +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#include "ut_util.h" - -int poolingTest(int argc, char **argv, DataType dt) { - CHECK_REQUIREMENT(argc == 15); - // in data - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - // weight - U32 fn = atoi(argv[5]); - U32 fc = atoi(argv[6]); - U32 fh = atoi(argv[7]); - U32 fw = atoi(argv[8]); - // stride & padding - U32 stride = atoi(argv[9]); - U32 padding = atoi(argv[10]); - // output - U32 on = atoi(argv[11]); - U32 oc = atoi(argv[12]); - U32 oh = atoi(argv[13]); - U32 ow = atoi(argv[14]); - CHECK_REQUIREMENT(in == 1 && fn == 1 && fc == 1); - CHECK_REQUIREMENT(ic == oc && ic % 8 == 0); - - PoolingDesc pooling_desc; - pooling_desc.pm = POOLING_MAX; - - pooling_desc.stride_h = stride; - pooling_desc.stride_w = stride; - pooling_desc.padding_top = padding; - pooling_desc.padding_bottom = padding; - pooling_desc.padding_left = padding; - pooling_desc.padding_right = padding; - pooling_desc.kernelSize_h = fh; - pooling_desc.kernelSize_w = fw; - pooling_desc.rm = CEIL; - - TensorDesc input_desc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); - TensorDesc output_desc; - CHECK_STATUS(pooling_infer_output_size(input_desc, pooling_desc, &output_desc, UT_ARCH)); - U32 input_len = tensorNumElements(input_desc); - U32 output_len = tensorNumElements(output_desc); - CHECK_REQUIREMENT(input_len == in*ic*ih*iw && output_len == on*oc*oh*ow); - - U8* input = ut_input_v(input_len, dt, UT_INIT_RANDOM); - U8* output = ut_input_v(output_len, dt, UT_INIT_ZERO); - U8* output_ref = ut_input_v(output_len, dt, UT_INIT_ZERO); - - if (UT_CHECK) { - CHECK_STATUS(pooling(input_desc, input, - pooling_desc, nullptr, - output_desc, output, - UT_ARCH)); - - CHECK_STATUS(pooling(input_desc, input, - pooling_desc, nullptr, - output_desc, output_ref, - CPU_GENERAL)); - - // check - ut_check_v(output, output_ref, output_len, dt, 0.05, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for(int iter = 0; iter < UT_LOOPS; iter++){ - CHECK_STATUS(pooling(input_desc, input, - pooling_desc, nullptr, - output_desc, output, - UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", - in, ic, ih, iw, - fn, fc, fh, fw, - stride, padding, - on, oc, oh, ow); - sprintf(buffer, "%20s, %80s", "Pooling", params); - 
double ops = 1.0 * on * oc * oh * ow * fh * fw; - ut_log(dt, buffer, ops, time); - - free(input); - free(output); - free(output_ref); - - return 0; -} - - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - poolingTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - poolingTest(argc, argv, DT_F32); -#endif - return 0; -} diff --git a/tests/test_pooling_int8.cpp b/tests/test_pooling_int8.cpp deleted file mode 100644 index d7f468fd..00000000 --- a/tests/test_pooling_int8.cpp +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#include "ut_util.h" - -#ifdef _USE_INT8 -int int8PoolingTest(int argc, char **argv, DataType dt) { - CHECK_REQUIREMENT(argc == 15); - // in data - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - // weight - U32 fn = atoi(argv[5]); - U32 fc = atoi(argv[6]); - U32 fh = atoi(argv[7]); - U32 fw = atoi(argv[8]); - // stride & padding - U32 stride = atoi(argv[9]); - U32 padding = atoi(argv[10]); - // output - U32 on = atoi(argv[11]); - U32 oc = atoi(argv[12]); - U32 oh = atoi(argv[13]); - U32 ow = atoi(argv[14]); - CHECK_REQUIREMENT(in == 1 && fn == 1 && fc == 1); - CHECK_REQUIREMENT(ic == oc && ic % 8 == 0); - - PoolingDesc pooling_desc; - pooling_desc.pm = POOLING_MEAN; - - pooling_desc.stride_h = stride; - pooling_desc.stride_w = stride; - pooling_desc.padding_top = padding; - pooling_desc.padding_bottom = padding; - pooling_desc.padding_left = padding; - pooling_desc.padding_right = padding; - pooling_desc.kernelSize_h = fh; - pooling_desc.kernelSize_w = fw; - pooling_desc.rm = CEIL; - - TensorDesc input_desc= tensor4df(DT_I8, DF_NCHWC8, in, ic, ih, iw); - TensorDesc in_desc_ref = input_desc; - in_desc_ref.dt = dt; - - TensorDesc output_desc; - CHECK_STATUS(pooling_infer_output_size(input_desc, pooling_desc, &output_desc, UT_ARCH)); - U32 input_len = tensorNumElements(input_desc); - U32 output_len = tensorNumElements(output_desc); - CHECK_REQUIREMENT(input_len == in*ic*ih*iw && output_len == on*oc*oh*ow); - - U8* input_ref = ut_input_v(input_len, dt, UT_INIT_RANDOM); - INT8* input = (INT8*)ut_input_v(input_len, DT_I8, UT_INIT_ZERO); - F16 scales[2] = {-1}; - quantize_tensor(in_desc_ref, input_ref, &input_desc, input, scales); - - INT8* output = (INT8*)ut_input_v(output_len, DT_I8, UT_INIT_ZERO); - U8* out_d = ut_input_v(output_len, dt, UT_INIT_ZERO); - U8* output_ref = 
ut_input_v(output_len, dt, UT_INIT_ZERO); - - if(UT_CHECK){ - CHECK_STATUS(pooling(input_desc, input, - pooling_desc, scales, - output_desc, output, - UT_ARCH)); - - for (U32 i=0; i - -int priorboxTest(int argc, char **argv, DataType dt){ - CHECK_REQUIREMENT(argc == 18 || argc == 19 || argc == 20 || argc == 21); - // in0 feature map - U32 in0 = atoi(argv[1]); - U32 ic0 = atoi(argv[2]); - U32 ih0 = atoi(argv[3]); - U32 iw0 = atoi(argv[4]); - // in1 data - U32 in1 = atoi(argv[5]); - U32 ic1 = atoi(argv[6]); - U32 ih1 = atoi(argv[7]); - U32 iw1 = atoi(argv[8]); - // param priorbox - F32 min_size = (F32)atof(argv[9]); - F32 max_size = (F32)atof(argv[10]); - U32 flip = atoi(argv[11]); - U32 clip = atoi(argv[12]); - F32 step = (F32)atof(argv[13]); - // output - U32 on = atoi(argv[14]); - U32 oc = atoi(argv[15]); - U32 olens = atoi(argv[16]); - // multi param priorbox - F32 ar1 = (F32)atof(argv[17]); - F32 ar2; - F32 min_size1; - F32 max_size1; - if(argc == 19 || argc == 21){ - ar2 = (F32)atof(argv[18]); - if(argc == 21){ - min_size1 = (F32)atof(argv[19]); - max_size1 = (F32)atof(argv[20]); - } - } - if(argc == 20){ - min_size1 = (F32)atof(argv[18]); - max_size1 = (F32)atof(argv[19]); - } - - CHECK_REQUIREMENT(in0 == 1 && in1 == 1 && on == 1 && oc == 2); - - PriorBoxDesc priorbox_desc; - priorbox_desc.min_sizes.push_back(min_size); - priorbox_desc.max_sizes.push_back(max_size); - priorbox_desc.aspect_ratios.push_back(ar1); - if(argc == 19 || argc == 21){ - priorbox_desc.aspect_ratios.push_back(ar2); - if(argc == 21){ - priorbox_desc.min_sizes.push_back(min_size1); - priorbox_desc.max_sizes.push_back(max_size1); - } - } - if(argc == 20){ - priorbox_desc.min_sizes.push_back(min_size1); - priorbox_desc.max_sizes.push_back(max_size1); - } - priorbox_desc.flip = flip; - priorbox_desc.clip = clip; - priorbox_desc.image_h = ih1; - priorbox_desc.image_w = iw1; - priorbox_desc.step_h = step; - priorbox_desc.step_w = step; - priorbox_desc.variances[0] = 0.10000000149; - priorbox_desc.variances[1] = 0.10000000149; - priorbox_desc.variances[2] = 0.20000000298; - priorbox_desc.variances[3] = 0.20000000298; - priorbox_desc.offset = 0.5; - - std::vector input_descs; - TensorDesc output_desc; - TensorDesc input_desc_fm = tensor4df(dt, DF_NCHWC8, in0, ic0, ih0, iw0); - TensorDesc input_desc_data = tensor4df(dt, DF_NCHWC8, in1, ic1, ih1, iw1); - input_descs.push_back(input_desc_fm); - input_descs.push_back(input_desc_data); - CHECK_STATUS(priorbox_infer_output_size(input_descs, priorbox_desc, &output_desc, UT_ARCH)); - U32 input_len_fm = tensorNumElements(input_descs[0]); - U32 input_len_data = tensorNumElements(input_descs[1]); - U32 output_len = tensorNumElements(output_desc); - CHECK_REQUIREMENT(input_len_fm == in0*ic0*ih0*iw0 && input_len_data == in1*ic1*ih1*iw1 && output_len == on*oc*olens); - - U8* output = ut_input_v(output_len, dt, UT_INIT_ZERO); - U8* output_ref = ut_input_v(output_len, dt, UT_INIT_ZERO); - - if (UT_CHECK) { - CHECK_STATUS(priorbox(input_descs, - priorbox_desc, - output_desc, output, - UT_ARCH)); - - CHECK_STATUS(priorbox(input_descs, - priorbox_desc, - output_desc, output_ref, - CPU_GENERAL)); - // check - ut_check_v(output, output_ref, output_len, dt, 0.05, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for(int iter = 0; iter < UT_LOOPS; iter++){ - CHECK_STATUS(priorbox(input_descs, priorbox_desc, output_desc, output, UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - U32 
num_priorboxs = priorbox_desc.aspect_ratios.size(); - if(priorbox_desc.flip){ - num_priorboxs = num_priorboxs * 2; - } - U32 num_minsize = priorbox_desc.min_sizes.size(); - num_priorboxs = (num_priorboxs + 1) * num_minsize; - if(!priorbox_desc.max_sizes.empty()){ - U32 num_maxsize = priorbox_desc.max_sizes.size(); - num_priorboxs = num_priorboxs + num_maxsize; - } - U32 ochannel = 2; - U32 numperbox = 4; - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u %u %u) * (%u %u %u) = (%u %u %u)", in0, ic0, ih0, iw0, ochannel, numperbox, num_priorboxs, on, oc, olens); - sprintf(buffer, "%20s, %80s", "Priorbox", params); - double ops = 1.0 * output_len; - ut_log(dt, buffer, ops, time); - - free(output); - free(output_ref); - return 0; -} - - -int main(int argc, char** argv){ -#ifdef _USE_FP16 - priorboxTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - priorboxTest(argc, argv, DT_F32); -#endif - return 0; -} \ No newline at end of file diff --git a/tests/test_reduction.cpp b/tests/test_reduction.cpp deleted file mode 100644 index 57e6cb66..00000000 --- a/tests/test_reduction.cpp +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
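The prior-box count computed above follows the Caffe PriorBox convention: each min_size contributes one box per (possibly flipped) aspect ratio plus one square box, and each max_size adds one more. Restated as a standalone helper:

#include <cstdio>

// Same counting rule as the deleted test's logging code.
static unsigned priorbox_count(unsigned num_aspect_ratios, bool flip,
    unsigned num_min_sizes, unsigned num_max_sizes) {
    unsigned per_location = num_aspect_ratios * (flip ? 2 : 1);
    per_location = (per_location + 1) * num_min_sizes;
    return per_location + num_max_sizes;
}

int main() {
    // e.g. two aspect ratios with flip, one min_size and one max_size:
    printf("%u\n", priorbox_count(2, true, 1, 1));  // (2*2 + 1)*1 + 1 = 6
    return 0;
}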
- - -#include -#include "tensor_computing.h" -#include "ut_util.h" - -int reductionTest(int argc, char** argv, DataType dt) { - CHECK_REQUIREMENT(argc == 6); - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - I32 axis = atoi(argv[5]); - ReductionMode reductionMode = REDUCTION_MEAN; - float coeff = 1.0; - DataFormat df = DF_NCHW; - TensorDesc maskDesc; - maskDesc.nDims = 0; - U8 *mask = nullptr; - - TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw); - U8* input = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM); - TensorDesc outDesc; - CHECK_STATUS(reduction_infer_output_size(inDesc, maskDesc, axis, false, &outDesc)); - U8* output = ut_input_v(tensorNumElements(outDesc), dt, UT_INIT_ZERO); - U8* outputRef = ut_input_v(tensorNumElements(outDesc), dt, UT_INIT_ZERO); - - if (UT_CHECK) { - CHECK_STATUS(reduction(inDesc, input, maskDesc, mask, axis, reductionMode, coeff, outDesc, output, UT_ARCH)); - - // naive implement - CHECK_STATUS(reduction(inDesc, input, maskDesc, mask, axis, reductionMode, coeff, outDesc, outputRef, CPU_GENERAL)); - - // check - ut_check_v(output, outputRef, tensorNumElements(outDesc), dt, 1, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(reduction(inDesc, input, maskDesc, mask, axis, reductionMode, coeff, outDesc, output, UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - U32 on = 1, oh = 1, ow = 1; - CHECK_STATUS(tensor3dGet(outDesc, &dt, &df, &on, &oh, &ow)); - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u %u %u) %d =(%u %u %u)", - in, ic, ih, iw, axis, - on, oh, ow); - sprintf(buffer, "%20s, %80s", "AxisMean", params); - double ops = 1.0 * in * ic * ih * iw; - ut_log(dt, buffer, ops, time/UT_LOOPS); - - free(output); - free(outputRef); - - return 0; -} - - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - reductionTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - reductionTest(argc, argv, DT_F32); -#endif - return 0; -} diff --git a/tests/test_reshape.cpp b/tests/test_reshape.cpp deleted file mode 100644 index 7ac1f090..00000000 --- a/tests/test_reshape.cpp +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
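The reduction test above exercises REDUCTION_MEAN over one axis of a dense 4-D tensor. A common way to state such a reduction is to view the shape as (outer, reduce, inner); a sketch in that style, mirroring the role of the CPU_GENERAL reference rather than the library kernel itself:

#include <cstdio>
#include <vector>

// Mean over the middle extent of an (outer, reduce, inner) view.
static void reduce_mean(const float *in, float *out, int outer, int reduce, int inner) {
    for (int o = 0; o < outer; o++) {
        for (int i = 0; i < inner; i++) {
            float sum = 0.0f;
            for (int r = 0; r < reduce; r++) {
                sum += in[(o * reduce + r) * inner + i];
            }
            out[o * inner + i] = sum / reduce;
        }
    }
}

int main() {
    // A 1x2x2 view reduced over the middle axis:
    std::vector<float> in = {1, 2, 3, 4};
    std::vector<float> out(2);
    reduce_mean(in.data(), out.data(), 1, 2, 2);
    printf("%f %f\n", out[0], out[1]);  // 2, 3
    return 0;
}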
- - -#include -#include - -#include "tensor_computing.h" -#include "ut_util.h" - -int reshapeTest(int argc, char** argv, DataType dt) { - CHECK_REQUIREMENT(argc > 4); - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - I32 shape_size = atoi(argv[5]); - CHECK_REQUIREMENT(argc == 6+shape_size); - std::vector shape(shape_size); - for (I32 i = 0; i < shape_size; i++) { - shape[i] = atoi(argv[6+i]); - } - - DataFormat df = DF_NCHW; - TensorDesc in_desc = tensor4df(dt, df, in, ic, ih, iw); - TensorDesc out_desc; - - CHECK_STATUS(reshape_infer_output_size(in_desc, &out_desc, shape.data(), shape_size, UT_ARCH)); - - U32 len = tensorNumElements(in_desc); - U8* input = ut_input_v(len, dt, UT_INIT_RANDOM); - U8* output = ut_input_v(len, dt, UT_INIT_RANDOM); - - if (UT_CHECK) { - CHECK_STATUS(reshape(in_desc, input, out_desc, output, UT_ARCH)); - - CHECK_REQUIREMENT(tensorNumElements(out_desc) == len); - } - - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(reshape(in_desc, input, out_desc, output, UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - memset(params, 0, 120); - sprintf(params, "(%u %u %u %u)=(", - in, ic, ih, iw); - for(I32 i = 0; i < shape_size; i++) { - I32 index = 0; - for (; index < 120; index++) { - if (params[index] == '\0') { - break; - } - } - if (i != shape_size-1) { - sprintf(params+index, "%d ", out_desc.dims[out_desc.nDims-1-i]); - } - else { - sprintf(params+index, "%d)", out_desc.dims[out_desc.nDims-1-i]); - } - } - sprintf(buffer, "%20s, %80s", "Reshape", params); - double ops = len; - ut_log(dt, buffer, ops, time); - - free(input); - free(output); - - return 0; -} - - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - reshapeTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - reshapeTest(argc, argv, DT_F32); -#endif - return 0; -} diff --git a/tests/test_scale.cpp b/tests/test_scale.cpp deleted file mode 100644 index 0d8ccd22..00000000 --- a/tests/test_scale.cpp +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
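Reshape only rewrites the tensor descriptor; the flat buffer is untouched, which is why the deleted test merely asserts that tensorNumElements(out_desc) equals the input length. The one non-trivial piece is resolving a -1 wildcard in the requested shape; a sketch of that step:

#include <cstdio>
#include <vector>

// Resolve at most one -1 entry so the shape's product matches the element count.
static bool resolve_shape(size_t total, std::vector<int> &shape) {
    size_t known = 1;
    int wildcard = -1;
    for (size_t i = 0; i < shape.size(); i++) {
        if (shape[i] == -1) {
            wildcard = (int)i;
        } else {
            known *= shape[i];
        }
    }
    if (wildcard >= 0) {
        if (total % known != 0) {
            return false;
        }
        shape[wildcard] = (int)(total / known);
        known *= shape[wildcard];
    }
    return known == total;
}

int main() {
    std::vector<int> shape = {-1, 8};
    if (resolve_shape(32, shape)) {
        printf("(%d %d)\n", shape[0], shape[1]);  // (4 8)
    }
    return 0;
}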
- - -#include -#include "tensor_computing.h" -#include "ut_util.h" - -int scaleTest(int argc, char** argv, DataType dt){ - CHECK_REQUIREMENT(argc == 5); - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - - I32 axis = 1; - DataFormat df = DF_NCHWC8; - TensorDesc data_desc = tensor4df(dt, df, in, ic, ih, iw); - U32 len = tensorNumElements(data_desc); - - U8* alpha = ut_input_v(ic, dt, UT_INIT_RANDOM); - U8* beta = ut_input_v(ic, dt, UT_INIT_RANDOM); - U8* data = ut_input_v(len, dt, UT_INIT_RANDOM); - U8* data_ref = ut_input_v(len, dt, UT_INIT_ZERO); - memcpy(data_ref, data, len*bytesOf(dt)); - - if (UT_CHECK) { - CHECK_STATUS(scale(data_desc, data, axis, alpha, beta, data_desc, data, UT_ARCH)); - - // naive implement - CHECK_STATUS(scale(data_desc, data_ref, axis, alpha, beta, data_desc, data_ref, CPU_GENERAL)); - - // check - ut_check_v(data, data_ref, len, dt, 1.0, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(scale(data_desc, data, axis, alpha, beta, data_desc, data, UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u %u %u)=(%u %u %u %u)", - in, ic, ih, iw, - in, ic, ih, iw); - sprintf(buffer, "%20s, %80s", "Scale", params); - double ops = 2.0 * in * ic * ih * iw; - ut_log(dt, buffer, ops, time/UT_LOOPS); - - free(data); - free(data_ref); - - return 0; -} - - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - scaleTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - scaleTest(argc, argv, DT_F32); -#endif - return 0; -} diff --git a/tests/test_slice.cpp b/tests/test_slice.cpp deleted file mode 100644 index 17736755..00000000 --- a/tests/test_slice.cpp +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
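The scale test above applies y = alpha[c] * x + beta[c] along axis 1 of an NCHW tensor, hence the logged 2 * n*c*h*w op count. A plain NCHW sketch (the library kernel operates on NCHWC8 tiles, which this deliberately ignores):

#include <cstdio>
#include <vector>

// In-place per-channel scale and shift over an NCHW buffer.
static void scale_ref(float *data, int n, int c, int hw, const float *alpha, const float *beta) {
    for (int ni = 0; ni < n; ni++) {
        for (int ci = 0; ci < c; ci++) {
            float *p = data + (ni * c + ci) * hw;
            for (int i = 0; i < hw; i++) {
                p[i] = alpha[ci] * p[i] + beta[ci];
            }
        }
    }
}

int main() {
    std::vector<float> data = {1, 1, 2, 2};  // n=1, c=2, h*w=2
    std::vector<float> alpha = {2, 3}, beta = {0, 1};
    scale_ref(data.data(), 1, 2, 2, alpha.data(), beta.data());
    printf("%f %f %f %f\n", data[0], data[1], data[2], data[3]);  // 2 2 7 7
    return 0;
}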
- - -#include - -#include "tensor_computing.h" -#include "ut_util.h" - -int sliceTest(int argc, char** argv, DataType dt) { - CHECK_REQUIREMENT(argc > 2); - I32 num = atoi(argv[1]); - CHECK_REQUIREMENT(argc == 2+4+1+num-1); - U32 in = atoi(argv[2]); - U32 ic = atoi(argv[3]); - U32 ih = atoi(argv[4]); - U32 iw = atoi(argv[5]); - I32 axis= atoi(argv[6]); - std::vector slice_point(num); - for (I32 i = 0; i < num-1; i++) { - slice_point[i] = atoi(argv[7+i]); - } - - DataFormat df = DF_NCHW; - TensorDesc in_desc = tensor4df(dt, df, in, ic, ih, iw); - std::vector out_desc(num); - - CHECK_STATUS(slice_infer_output_size(in_desc, &out_desc, axis, slice_point.data(), UT_ARCH)); - std::vector output(num); - for (I32 i = 0; i < num; i++) { - output[i] = (void*)ut_input_v(tensorNumElements(out_desc[i]), dt, UT_INIT_ZERO); - } - - U32 len = tensorNumElements(in_desc); - U8* input = ut_input_v(len, dt, UT_INIT_RANDOM); - - if (UT_CHECK) { - CHECK_STATUS(slice(in_desc, input, axis, out_desc, &output, UT_ARCH)); - - U32 tmp = 0; - for (I32 i = 0; i < num; i++) { - tmp += tensorNumElements(out_desc[i]); - } - CHECK_REQUIREMENT(tmp == len); - } - - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(slice(in_desc, input, axis, out_desc, &output, UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u %u %u)=(%u %u %u %u)/%u", - in, ic, ih, iw, - in, ic, ih, iw, num); - sprintf(buffer, "%20s, %80s", "Slice", params); - double ops = num * len; - ut_log(dt, buffer, ops, time); - - free(input); - - return 0; -} - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - sliceTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - sliceTest(argc, argv, DT_F32); -#endif - return 0; -} diff --git a/tests/test_softmax.cpp b/tests/test_softmax.cpp deleted file mode 100644 index a011c446..00000000 --- a/tests/test_softmax.cpp +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
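The slice test above partitions one axis at the given slice points and asserts that the output element counts sum back to the input length. With the same (outer, axis, inner) view as before, that invariant holds by construction; a sketch:

#include <cstdio>
#include <vector>

// Split the axis extent at the given end offsets; each piece keeps the
// outer and inner extents, so the piece sizes sum to the input size.
static void slice_axis(const float *in, int outer, int axis_len, int inner,
    const std::vector<int> &ends, std::vector<std::vector<float>> &outs) {
    outs.clear();
    int begin = 0;
    for (int end : ends) {
        std::vector<float> piece((size_t)outer * (end - begin) * inner);
        for (int o = 0; o < outer; o++) {
            for (int a = begin; a < end; a++) {
                for (int i = 0; i < inner; i++) {
                    piece[((size_t)o * (end - begin) + (a - begin)) * inner + i] =
                        in[((size_t)o * axis_len + a) * inner + i];
                }
            }
        }
        outs.push_back(piece);
        begin = end;
    }
}

int main() {
    std::vector<float> in = {0, 1, 2, 3, 4, 5};
    std::vector<std::vector<float>> outs;
    slice_axis(in.data(), 1, 6, 1, {2, 6}, outs);  // pieces [0,1] and [2..5]
    printf("%zu %zu\n", outs[0].size(), outs[1].size());  // 2 4
    return 0;
}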
- - -#include "tensor_computing.h" -#include "ut_util.h" - -int softmaxTest(int argc, char** argv, DataType dt){ - CHECK_REQUIREMENT(argc == 2); - U32 len = atoi(argv[1]); - U32 axis = 1; - - TensorDesc in_desc, out_desc; - in_desc = tensor2df(dt, DF_NORMAL, 1, len); - CHECK_STATUS(softmax_infer_output_size(in_desc, &out_desc, UT_ARCH)); - - U8* in = ut_input_v(len, dt, UT_INIT_RANDOM); - U8* out = ut_input_v(len, dt, UT_INIT_ZERO); - U8* out_ref = ut_input_v(len, dt, UT_INIT_ZERO); - - if(UT_CHECK){ - CHECK_STATUS(softmax(in_desc, in, axis, out_desc, out, UT_ARCH)); - - // naive implement - CHECK_STATUS(softmax(in_desc, in, axis, out_desc, out_ref, CPU_GENERAL)); - - // check - ut_check_v(out, out_ref, len, dt, 0.1, __FILE__, __LINE__); - } - - // benchmark - double time_start = ut_time_ms(); - for(int iter=0; iter - -#include "tensor_computing.h" -#include "ut_util.h" - -int splitTest(int argc, char** argv, DataType dt) { - CHECK_REQUIREMENT(argc == 6); - I32 num = atoi(argv[1]); - U32 in = atoi(argv[2]); - U32 ic = atoi(argv[3]); - U32 ih = atoi(argv[4]); - U32 iw = atoi(argv[5]); - - DataFormat df = DF_NCHWC8; - TensorDesc in_desc = tensor4df(dt, df, in, ic, ih, iw); - std::vector out_desc(num); - - CHECK_STATUS(split_infer_output_size(in_desc, &out_desc)); - std::vector output(num); - for (I32 i = 0; i < num; i++) { - output[i] = (void*)ut_input_v(tensorNumElements(out_desc[i]), dt, UT_INIT_ZERO); - } - U32 len = tensorNumElements(in_desc); - U8* input = ut_input_v(len, dt, UT_INIT_RANDOM); - - if (UT_CHECK) { - CHECK_STATUS(split(in_desc, input, out_desc, &output, UT_ARCH)); - - for (I32 i = 0; i < num; i++) { - ut_check_v(output[i], input, len, dt, 0, __FILE__, __LINE__); - } - } - - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(split(in_desc, input, out_desc, &output, UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u %u %u)=(%u %u %u %u)*%u", - in, ic, ih, iw, - in, ic, ih, iw, num); - sprintf(buffer, "%20s, %80s", "Split", params); - double ops = num * len; - ut_log(dt, buffer, ops, time); - - free(input); - - return 0; -} - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - splitTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - splitTest(argc, argv, DT_F32); -#endif - return 0; -} diff --git a/tests/test_transpose.cpp b/tests/test_transpose.cpp deleted file mode 100644 index 587a4a50..00000000 --- a/tests/test_transpose.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
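The softmax test above checks the optimized kernel against the CPU_GENERAL path within a 0.1 tolerance. A numerically stable reference of the usual kind subtracts the row maximum before exponentiation, which avoids overflow without changing the result; a sketch:

#include <cmath>
#include <cstdio>
#include <vector>

// Max-subtracted softmax: exp(x - max) / sum(exp(x - max)).
static void softmax_ref(const float *in, float *out, int len) {
    float maxv = in[0];
    for (int i = 1; i < len; i++) {
        if (in[i] > maxv) {
            maxv = in[i];
        }
    }
    float sum = 0.0f;
    for (int i = 0; i < len; i++) {
        out[i] = std::exp(in[i] - maxv);
        sum += out[i];
    }
    for (int i = 0; i < len; i++) {
        out[i] /= sum;
    }
}

int main() {
    std::vector<float> in = {1.0f, 2.0f, 3.0f};
    std::vector<float> out(in.size());
    softmax_ref(in.data(), out.data(), (int)in.size());
    printf("%f %f %f\n", out[0], out[1], out[2]);  // ~0.090 0.245 0.665
    return 0;
}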
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include - -#include "tensor_computing.h" -#include "ut_util.h" - -int transposeTest(int argc, char** argv, DataType dt) { - CHECK_REQUIREMENT(argc == 9); - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - std::vector dim(4, 0); - std::vector inv_dim(4, 0); - for (int i = 0; i < 4; i++) { - I32 value = atoi(argv[5+i]); - dim[i] = value; - inv_dim[value] = i; - } - - DataFormat df = DF_NCHW; - TensorDesc in_desc = tensor4df(dt, df, in, ic, ih, iw); - TensorDesc out_1_desc; - TensorDesc out_2_desc; - - CHECK_STATUS(transpose_infer_output_size(in_desc, &out_1_desc, dim.data(), UT_ARCH)); - CHECK_STATUS(transpose_infer_output_size(out_1_desc, &out_2_desc, inv_dim.data(), UT_ARCH)); - - U32 len = tensorNumElements(in_desc); - U8* input = ut_input_v(len, dt, UT_INIT_RANDOM); - U8* out_1 = ut_input_v(len, dt, UT_INIT_RANDOM); - U8* out_2 = ut_input_v(len, dt, UT_INIT_RANDOM); - - if (UT_CHECK) { - CHECK_STATUS(transpose(in_desc, input, out_1_desc, out_1, dim.data(), UT_ARCH)); - - CHECK_STATUS(transpose(out_1_desc, out_1, out_2_desc, out_2, inv_dim.data(), UT_ARCH)); - - // check - ut_check_v(input, out_2, len, dt, 0.0001, __FILE__, __LINE__); - } - - double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(transpose(in_desc, input, out_1_desc, out_1, dim.data(), UT_ARCH)); - } - double time_end = ut_time_ms(); - double time = (time_end - time_start) / UT_LOOPS; - - U32 on = 0; - U32 oc = 0; - U32 oh = 0; - U32 ow = 0; - CHECK_STATUS(tensor4dGet(out_1_desc, &dt, &df, &on, &oc, &oh, &ow)); - // log performance data - char buffer[150]; - char params[120]; - sprintf(params, "(%u %u %u %u)=(%u %u %u %u)", - in, ic, ih, iw, - on, oc, oh, ow); - sprintf(buffer, "%20s, %80s", "Transpose", params); - double ops = len; - ut_log(dt, buffer, ops, time); - - free(input); - free(out_1); - free(out_2); - - return 0; -} - - -int main(int argc, char** argv) { -#ifdef _USE_FP16 - transposeTest(argc, argv, DT_F16); -#endif -#ifdef _USE_FP32 - transposeTest(argc, argv, DT_F32); -#endif - return 0; -} diff --git a/third_party/install.sh b/third_party/install.sh index 656341f4..9eba1c40 100644 --- a/third_party/install.sh +++ b/third_party/install.sh @@ -1,7 +1,7 @@ #!/bin/bash script_name=$0 -compiler_arch="gnu" +compiler_arch="arm_gnu" build_threads="8" print_help() { @@ -11,7 +11,7 @@ Build third party library. Mandatory arguments to long options are mandatory for short options too. -h, --help display this help and exit. - -c, --compiler use to set compiler(default: gnu). + -c, --compiler use to set compiler(default: arm_gnu). -t, --threads use parallel build(default: 8). EOF exit 1; @@ -41,7 +41,7 @@ while true ; do done exeIsValid(){ - if type $1 2>/dev/null; + if type $1 &> /dev/null; then return 1 else @@ -73,7 +73,9 @@ if [ $? == 0 ] ; then exit 1 fi -if [ "${compiler_arch}" == "llvm" ] ; then +configure_flags="" +dynamic_library_suffix="so" +if [ "${compiler_arch}" == "arm_llvm" ] ; then exeIsValid aarch64-linux-android21-clang && exeIsValid aarch64-linux-android21-clang++ if [ $? 
== 0 ] ; then echo "[ERROR] please install android ndk aarch64-linux-android21-clang++ compiler and set shell environment PATH to find it" @@ -81,8 +83,10 @@ if [ "${compiler_arch}" == "llvm" ] ; then fi export CC=aarch64-linux-android21-clang export CXX=aarch64-linux-android21-clang++ + export AR=aarch64-linux-android-ar + configure_flags="--host=arm-linux --enable-neon " fi -if [ "${compiler_arch}" == "gnu" ] ; then +if [ "${compiler_arch}" == "arm_gnu" ] ; then exeIsValid aarch64-linux-gnu-gcc && exeIsValid aarch64-linux-gnu-g++ if [ $? == 0 ] ; then echo "[ERROR] please install GNU gcc ARM compiler and set shell environment PATH to find it" @@ -90,8 +94,10 @@ if [ "${compiler_arch}" == "gnu" ] ; then fi export CC=aarch64-linux-gnu-gcc export CXX=aarch64-linux-gnu-g++ + export AR=aarch64-linux-gnu-ar + configure_flags="--host=arm-linux " fi -if [ "${compiler_arch}" == "himix100" ] ; then +if [ "${compiler_arch}" == "arm_himix100" ] ; then exeIsValid arm-himix100-linux-gcc && exeIsValid arm-himix100-linux-g++ if [ $? == 0 ] ; then echo "[ERROR] please install Himix100 GNU gcc ARM compiler and set shell environment PATH to find it" @@ -99,15 +105,52 @@ if [ "${compiler_arch}" == "himix100" ] ; then fi export CC=arm-himix100-linux-gcc export CXX=arm-himix100-linux-g++ + export AR=arm-himix100-linux-ar + configure_flags="--host=arm-linux " fi -if [ "${compiler_arch}" == "ndkv7" ] ; then +if [ "${compiler_arch}" == "arm_ndkv7" ] ; then exeIsValid armv7a-linux-androideabi16-clang && exeIsValid armv7a-linux-androideabi16-clang++ if [ $? == 0 ] ; then - echo "[ERROR] please install Himix100 GNU gcc ARM compiler and set shell environment PATH to find it" + echo "[ERROR] please install android ndk armv7a-linux-androideabi19-clang++ compiler and set shell environment PATH to find it" exit 1 fi export CC=armv7a-linux-androideabi16-clang export CXX=armv7a-linux-androideabi16-clang++ + export AR=arm-linux-androideabi-ar + configure_flags="--host=arm-linux " +fi +if [ "${compiler_arch}" == "x86_gnu" ] ; then + exeIsValid gcc && exeIsValid g++ + if [ $? == 0 ] ; then + echo "[ERROR] please install x86 gnu compiler and set shell environment PATH to find it" + exit 1 + fi + export CC=gcc + export CXX=g++ + export AR=ar +fi +if [ "${compiler_arch}" == "x86_ndk" ] ; then + exeIsValid x86_64-linux-android21-clang && exeIsValid x86_64-linux-android21-clang++ + if [ $? == 0 ] ; then + echo "[ERROR] please install android ndk x86_64-linux-android21-clang++ compiler and set shell environment PATH to find it" + exit 1 + fi + export CC=x86_64-linux-android21-clang + export CXX=x86_64-linux-android21-clang++ + export AR=x86_64-linux-android-ar + configure_flags="--host=x86-linux" +fi +if [ "${compiler_arch}" == "arm_ios" ] ; then + exeIsValid arm-apple-darwin11-clang && exeIsValid arm-apple-darwin11-clang++ + if [ $? 
== 0 ] ; then + echo "[ERROR] please install ios arm-apple-darwin11-clang++ compiler and set shell environment PATH to find it" + exit 1 + fi + export CC=arm-apple-darwin11-clang + export CXX=arm-apple-darwin11-clang++ + export AR=arm-apple-darwin11-ar + configure_flags="--host=arm-apple-darwin11 " + dynamic_library_suffix="dylib" fi script_abs=$(readlink -f "$0") @@ -127,7 +170,8 @@ FlatBuffers_ROOT=${script_dir}/${compiler_arch}/flatbuffers TFLite_ROOT=${script_dir}/${compiler_arch}/tflite OpenCL_ROOT=${script_dir}/${compiler_arch}/opencl JPEG_ROOT=${script_dir}/${compiler_arch}/jpeg - +FFTW_ROOT=${script_dir}/${compiler_arch}/fftw +JSONCPP_ROOT=${script_dir}/${compiler_arch}/jsoncpp # download prebuilt protoc echo "[INFO] install protoc in ${script_dir}..." @@ -144,7 +188,6 @@ unzip protoc-3.1.0-linux-x86_64.zip rm protoc-3.1.0-linux-x86_64.zip export PATH=${PROTOC_ROOT}/bin:$PATH - # download and build protobuf echo "[INFO] install protobuf in ${script_dir}..." rm -rf ${Protobuf_ROOT} @@ -159,9 +202,9 @@ fi tar xzf v3.1.0.tar.gz cd protobuf-3.1.0 if [ ! -f "./configure" ]; then - ./autogen.sh || (echo "./autogen.sh failed for protobuf"; exit 1) ; + ./autogen.sh || (echo "./autogen.sh failed for protobuf, If it is related to curl for download, you can add -k parameter for curl in autogen.sh"; exit 1) ; fi -./configure --host=arm-linux --with-protoc=${PROTOC_ROOT}/bin/protoc\ +./configure ${configure_flags} --with-protoc=${PROTOC_ROOT}/bin/protoc\ --prefix=${Protobuf_ROOT} make -j${build_threads} || exit 1 make install -j${build_threads} || exit 1 @@ -169,7 +212,6 @@ cp ${PROTOC_ROOT}/bin/protoc ${Protobuf_ROOT}/bin cd .. rm -rf v3.1.0.tar.gz protobuf-3.1.0 - # download flatbuffers header file echo "[INFO] install flatbuffers in ${script_dir}..." rm -rf ${FlatBuffers_ROOT} @@ -187,7 +229,6 @@ else cp -r ${script_dir}/sources/flatbuffers/* . fi - # download tensorflow-lite header file echo "[INFO] install TFLite in ${script_dir}..." rm -rf ${TFLite_ROOT} @@ -207,33 +248,33 @@ else cp -r ${script_dir}/sources/tflite/* . fi - # download and install OpenCL -echo "[INFO] install opencl in ${script_dir}..." -rm -rf ${OpenCL_ROOT} -mkdir ${OpenCL_ROOT} -cd ${OpenCL_ROOT} -if [ ! -d "${script_dir}/sources/opencl" ]; then - mkdir include - cd include - git init - git remote add -f origin https://github.com/KhronosGroup/OpenCL-Headers || exit 1 - git config core.sparsecheckout true - echo "CL" >> .git/info/sparse-checkout - git pull origin master || exit 1 - rm -rf .git* - cd .. - - mkdir lib64 - android_device=`adb devices | head -n 2 | tail -n 1 | awk '{print $1}'` - adb -s ${android_device} pull /vendor/lib64/libOpenCL.so lib64/ - adb -s ${android_device} pull /vendor/lib64/egl/libGLES_mali.so lib64/ - cp -r ../opencl ${script_dir}/sources/ -else - cp -r ${script_dir}/sources/opencl/* . +if [ "${compiler_arch}" == "arm_llvm" ] ; then + echo "[INFO] install opencl in ${script_dir}..." + rm -rf ${OpenCL_ROOT} + mkdir ${OpenCL_ROOT} + cd ${OpenCL_ROOT} + if [ ! -d "${script_dir}/sources/opencl" ]; then + mkdir include + cd include + git init + git remote add -f origin https://github.com/KhronosGroup/OpenCL-Headers || exit 1 + git config core.sparsecheckout true + echo "CL" >> .git/info/sparse-checkout + git pull origin master || exit 1 + rm -rf .git* + cd .. 
+ + mkdir lib64 + android_device=`adb devices | head -n 2 | tail -n 1 | awk '{print $1}'` + adb -s ${android_device} pull /vendor/lib64/libOpenCL.so lib64/ + adb -s ${android_device} pull /vendor/lib64/egl/libGLES_mali.so lib64/ + cp -r ../opencl ${script_dir}/sources/ + else + cp -r ${script_dir}/sources/opencl/* . + fi fi - # download and build jpeg echo "[INFO] install jpeg in ${script_dir}..." rm -rf ${JPEG_ROOT} @@ -248,24 +289,65 @@ fi tar xzf jpegsrc.v9c.tar.gz cd jpeg-9c if [ ! -f "./configure" ]; then - ./autogen.sh || (echo "./autogen.sh failed for libjpeg; exit 1") ; + ./autogen.sh || (echo "./autogen.sh failed for libjpeg"; exit 1) ; fi -./configure --host=arm-linux --prefix=${JPEG_ROOT} +./configure ${configure_flags} --prefix=${JPEG_ROOT} make -j${build_threads} || exit 1 make install -j${build_threads} || exit 1 cd .. rm -rf jpeg-9c jpegsrc.v9c.tar.gz +# download and build jsoncpp +echo "[INFO] install jsoncpp in ${script_dir}..." +rm -rf ${JSONCPP_ROOT} +mkdir ${JSONCPP_ROOT} +cd ${JSONCPP_ROOT} +if [ ! -f "${script_dir}/sources/jsoncpp-master.zip" ]; then + wget https://github.com/open-source-parsers/jsoncpp/archive/master.zip || exit 1 + cp jsoncpp-master.zip ${script_dir}/sources/ +else + cp ${script_dir}/sources/jsoncpp-master.zip . +fi +unzip jsoncpp-master.zip +cd jsoncpp-master +cp -r include ${JSONCPP_ROOT}/ +mkdir ${JSONCPP_ROOT}/lib +${CXX} -shared -fPIC src/lib_json/*.cpp -Iinclude -o ${JSONCPP_ROOT}/lib/libjsoncpp.${dynamic_library_suffix} +${CXX} -c src/lib_json/*.cpp -Iinclude +${AR} -crv ${JSONCPP_ROOT}/lib/libjsoncpp.a ./*.o +cd .. +rm -rf jsoncpp-master* + +# download fftw +echo "[INFO] install fftw in ${script_dir}..." +rm -rf ${FFTW_ROOT} +mkdir ${FFTW_ROOT} +cd ${FFTW_ROOT} +if [ ! -f "${script_dir}/sources/fftw-3.3.8.tar.gz" ]; then + wget http://www.fftw.org/fftw-3.3.8.tar.gz || exit 1 + cp fftw-3.3.8.tar.gz ${script_dir}/sources/ +else + cp -r ${script_dir}/sources/fftw-3.3.8.tar.gz . +fi +tar xzf fftw-3.3.8.tar.gz +cd fftw-3.3.8 +./configure ${configure_flags} --enable-shared=yes --enable-single --enable-fma --prefix=${FFTW_ROOT} +make -j${build_threads} || exit 1 +make install -j${build_threads} || exit 1 +cd .. +rm -rf fftw-3.3.8 fftw-3.3.8.tar.gz echo "[INFO] generate environment file to ${env_file}..." echo "#!/bin/bash -export Protobuf_ROOT=${script_dir}/${compiler_arch}/protobuf -export FlatBuffers_ROOT=${script_dir}/${compiler_arch}/flatbuffers -export TFLite_ROOT=${script_dir}/${compiler_arch}/tflite -export OpenCL_ROOT=${script_dir}/${compiler_arch}/opencl -export JPEG_ROOT=${script_dir}/${compiler_arch}/jpeg +export Protobuf_ROOT=${Protobuf_ROOT} +export FlatBuffers_ROOT=${FlatBuffers_ROOT} +export TFLite_ROOT=${TFLite_ROOT} +export OpenCL_ROOT=${OpenCL_ROOT} +export JPEG_ROOT=${JPEG_ROOT} +export JSONCPP_ROOT=${JSONCPP_ROOT} +export FFTW_ROOT=${FFTW_ROOT} export PATH=\${Protobuf_ROOT}/bin:\$PATH -export LD_LIBRARY_PATH=\${Protobuf_ROOT}/lib:\${OpenCL_ROOT}/lib64:\${JPEG_ROOT}/lib:\$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=\${Protobuf_ROOT}/lib:\${OpenCL_ROOT}/lib64:\${JPEG_ROOT}/lib:\${JSONCPP_ROOT}/lib:\${FFTW_ROOT}/lib:\$LD_LIBRARY_PATH " > ${env_file} chmod a+x ${env_file} echo "[INFO] please source ${env_file} before use..." 
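The install script now builds libjsoncpp.a and libjsoncpp.${dynamic_library_suffix} from the jsoncpp master branch and exports JSONCPP_ROOT. A minimal consumer to sanity-check that build, assuming the default json/json.h header layout of a jsoncpp checkout; compile with -I${JSONCPP_ROOT}/include -L${JSONCPP_ROOT}/lib -ljsoncpp:

#include <json/json.h>
#include <iostream>
#include <sstream>

int main() {
    // Parse a small document and read two fields back out.
    std::istringstream text("{\"model\": \"mbmelgan\", \"loops\": 6}");
    Json::CharReaderBuilder builder;
    Json::Value root;
    std::string errs;
    if (!Json::parseFromStream(builder, text, &root, &errs)) {
        std::cerr << errs << std::endl;
        return 1;
    }
    std::cout << root["model"].asString() << " " << root["loops"].asInt() << std::endl;
    return 0;
}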
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt deleted file mode 100644 index 03d56f39..00000000 --- a/tools/CMakeLists.txt +++ /dev/null @@ -1,87 +0,0 @@ -cmake_minimum_required(VERSION 3.2) - -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) -if (BOLT_CONFIGURE_FILE) - include(${BOLT_CONFIGURE_FILE}) -else (BOLT_CONFIGURE_FILE) - message(FATAL_ERROR " -FATAL: can not find bolt.cmake in directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (BOLT_CONFIGURE_FILE) - -project(tools) - -set_policy() - -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") -find_package(Uni) -find_package(ModelTools) -find_package(Image) -find_package(TensorComputing) -find_package(Inference) -find_package(jpeg) -if(USE_MALI) - find_package(Gcl) -endif(USE_MALI) - -set_project_install_directory() -set_c_cxx_flags() -set_test_c_cxx_flags() - -function(tensor_computing name) - add_executable(${name} ${name}.cpp) - add_dependencies(${name} tensor_computing) - add_dependencies(${name} tensor_computing_static) - target_link_libraries(${name} ${TENSOR_COMPUTING_LIBRARIES}) - if(USE_MALI) - target_link_libraries(${name} ${OPENCL_LIBRARIES}) - endif(USE_MALI) -endfunction() - -function(model_tools name) - add_executable(${name} ${name}.cpp) - if (USE_CAFFE) - add_dependencies(${name} model-tools) - add_dependencies(${name} model-tools_static) - add_dependencies(${name} model-tools_caffe) - add_dependencies(${name} model-tools_caffe_static) - TARGET_LINK_LIBRARIES(${name} ${MODEL_TOOLS_LIBRARIES}) - endif (USE_CAFFE) - - if (USE_ONNX) - add_dependencies(${name} model-tools) - add_dependencies(${name} model-tools_static) - add_dependencies(${name} model-tools_onnx) - add_dependencies(${name} model-tools_onnx_static) - TARGET_LINK_LIBRARIES(${name} ${MODEL_TOOLS_LIBRARIES}) - endif (USE_ONNX) - - if (USE_TFLITE) - add_dependencies(${name} model-tools) - add_dependencies(${name} model-tools_static) - add_dependencies(${name} model-tools_tflite) - add_dependencies(${name} model-tools_tflite_static) - TARGET_LINK_LIBRARIES(${name} ${MODEL_TOOLS_LIBRARIES}) - endif (USE_TFLITE) -endfunction() - -if (USE_CAFFE) - model_tools(caffe2bolt) -endif (USE_CAFFE) -if (USE_ONNX) - model_tools(onnx2bolt) -endif (USE_ONNX) -if (USE_TFLITE) - model_tools(tflite2bolt) -endif (USE_TFLITE) - -if (USE_LIBRARY_TUNING) - tensor_computing(tensor_computing_library_search) -endif (USE_LIBRARY_TUNING) - -if (BUILD_TEST) - if (USE_INT8) - inference(ptq_calibration ./ptq_calibration.cpp) - endif (USE_INT8) -endif (BUILD_TEST) diff --git a/tools/caffe2bolt.cpp b/tools/caffe2bolt.cpp deleted file mode 100644 index 298317c9..00000000 --- a/tools/caffe2bolt.cpp +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include -#include -#include "model_tools.h" -#include "model_serialize_deserialize.hpp" -#include "model_optimizer.hpp" -#include "converter.h" -#include "model_print.h" - - -int main(int argc, char* argv[]) -{ - std::string dir = std::string(argv[1]); - std::string mfn = std::string(argv[2]); - InferencePrecision ip = FP16; - DataConvertType converterMode = F32_to_F16; - bool quantStorage = false; - F32 clipVal = 0; - - if (argc > 3) { - if (std::string(argv[3]) == std::string("INT8_Q")) { - ip = INT8_Q; - converterMode = F32_to_F16; - } else if (std::string(argv[3]) == std::string("FP32")) { - ip = FP32; - converterMode = F32_to_F32; - } - } - - if (argc > 4) { - if (std::string(argv[4]) == std::string("QSTORE")) { - quantStorage = true; - } - } - - if (argc > 5) { - clipVal = atof(argv[5]); - if (clipVal > 0) { - std::cout << "Inputs to all gemm will be clipped between -" << clipVal << " and " << clipVal << std::endl; - } - } - - ModelSpec originalMs, targetMs, resultMs; - CHECK_STATUS(mt_create_model(&originalMs)); - CHECK_STATUS(mt_create_model(&targetMs)); - CHECK_STATUS(mt_create_model(&resultMs)); - - CHECK_STATUS(caffe_converter(dir, mfn, &originalMs)); - - //graph_optimizer - ModelSpecOptimizer ms_optimizer; - ms_optimizer.suggest(clipVal); - ms_optimizer.optimize(&originalMs); -#ifdef _DEBUG - print_ms(originalMs); -#endif - //datatype converter - CHECK_STATUS(ms_datatype_converter(&originalMs, &targetMs, converterMode, quantStorage)); -#ifdef _DEBUG - print_ms(targetMs); -#endif - - //serialize ms to ./bolt - std::string modelStorePath = std::string(argv[1]) + "/" + std::string(argv[2]); - if (quantStorage) { - modelStorePath += std::string("_qstore"); - } - switch (ip) { - case INT8_Q: { - modelStorePath += std::string("_int8_q.bolt"); - targetMs.dt = DT_F16_8Q; - CHECK_STATUS(serialize_model_to_file(&targetMs, modelStorePath.c_str())); - break; - } - case FP16: { - modelStorePath += std::string("_f16.bolt"); - CHECK_STATUS(serialize_model_to_file(&targetMs, modelStorePath.c_str())); - break; - } - case FP32: { - modelStorePath += std::string("_f32.bolt"); - CHECK_STATUS(serialize_model_to_file(&targetMs, modelStorePath.c_str())); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - return 0; - } - } - - // deserialize ./bolt to ms in memory - CHECK_STATUS(deserialize_model_from_file(modelStorePath.c_str(), &resultMs)); - print_ms(resultMs); - - CHECK_STATUS(mt_destroy_model(&originalMs)); - CHECK_STATUS(mt_destroy_model(&targetMs)); - CHECK_STATUS(mt_destroy_model(&resultMs)); - - return 0; -} diff --git a/tools/onnx2bolt.cpp b/tools/onnx2bolt.cpp deleted file mode 100644 index 17f1923e..00000000 --- a/tools/onnx2bolt.cpp +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
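// [editor's aside] The converters in this removed tools/ directory (caffe2bolt
// above, onnx2bolt and tflite2bolt below) share the same command-line precision
// handling. Below is a self-contained sketch of that mapping, distilled from the
// branches in caffe2bolt.cpp; it is illustrative only, not part of the Bolt API.
#include <string>
struct PrecisionChoice {
    const char *suffix;  // appended to the serialized .bolt path
    bool keepFP32;       // selects F32_to_F32 instead of F32_to_F16
};
inline PrecisionChoice choosePrecision(const std::string &flag)
{
    if (flag == "INT8_Q") return {"_int8_q.bolt", false};  // model dt set to DT_F16_8Q
    if (flag == "FP32")   return {"_f32.bolt", true};
    return {"_f16.bolt", false};                           // default precision: FP16
}
// Usage: choosePrecision(argv[3]).suffix reproduces the modelStorePath suffixes.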
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include -#include "model_tools.h" -#include "model_serialize_deserialize.hpp" -#include "model_optimizer.hpp" -#include "converter.h" -#include "model_print.h" - - -int main(int argc, char* argv[]) -{ - CHECK_REQUIREMENT(argc >= 3); - std::string dir = argv[1]; - std::string mfn = argv[2]; - - int removePreprocessOpNum = 0; - if (argc > 3) { - removePreprocessOpNum = atoi(argv[3]); - } - - InferencePrecision ip = FP16; - DataConvertType converterMode = F32_to_F16; - if (argc > 4) { - if (std::string(argv[4]) == std::string("INT8_Q")) { - ip = INT8_Q; - converterMode = F32_to_F16; - } else if (std::string(argv[4]) == std::string("FP32")) { - ip = FP32; - converterMode = F32_to_F32; - } - } - - ModelSpec originalMs; - ModelSpec targetMs; - ModelSpec resultMs; - CHECK_STATUS(mt_create_model(&originalMs)); - CHECK_STATUS(mt_create_model(&targetMs)); - CHECK_STATUS(mt_create_model(&resultMs)); - CHECK_STATUS(onnx_converter(dir, mfn, removePreprocessOpNum, &originalMs)); -#ifdef _DEBUG - print_ms(originalMs); -#endif - - - ModelSpecOptimizer msOptimizer; - msOptimizer.suggest(); - msOptimizer.optimize(&originalMs); -#ifdef _DEBUG - print_ms(originalMs); -#endif - - CHECK_STATUS(ms_datatype_converter(&originalMs, &targetMs, converterMode)); -#ifdef _DEBUG - print_ms(targetMs); -#endif - - //serialize ms to ./bolt - std::string modelStorePath = std::string(argv[1]) + "/" + std::string(argv[2]); - switch (ip) { - case INT8_Q: { - modelStorePath += std::string("_int8_q.bolt"); - targetMs.dt = DT_F16_8Q; - CHECK_STATUS(serialize_model_to_file(&targetMs, modelStorePath.c_str())); - break; - } - case FP16: { - modelStorePath += std::string("_f16.bolt"); - CHECK_STATUS(serialize_model_to_file(&targetMs, modelStorePath.c_str())); - break; - } - case FP32: { - modelStorePath += std::string("_f32.bolt"); - CHECK_STATUS(serialize_model_to_file(&targetMs, modelStorePath.c_str())); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - return 0; - } - } - - //deserialize ./bolt to ms in memory - CHECK_STATUS(deserialize_model_from_file(modelStorePath.c_str(), &resultMs)); - print_ms(resultMs); - - CHECK_STATUS(mt_destroy_model(&originalMs)); - CHECK_STATUS(mt_destroy_model(&targetMs)); - CHECK_STATUS(mt_destroy_model(&resultMs)); - - return 0; -} - diff --git a/tools/ptq_calibration.cpp b/tools/ptq_calibration.cpp deleted file mode 100644 index 2ac68fdb..00000000 --- a/tools/ptq_calibration.cpp +++ /dev/null @@ -1,417 +0,0 @@ -// Copyright (C) 2019. 
Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include -#include -#include "inference.hpp" -#include "tensor.hpp" -#include "data_loader.hpp" -#include "result_format.hpp" -#include "utils.hpp" -#include "tensor_computing.h" -#include "model_print.h" -#ifdef _USE_FP16 -#include "../tensor_computing/src/cpu/arm/fp16/arm_functions_fp16.h" -#endif -#ifdef _USE_FP32 -#include "../tensor_computing/src/cpu/arm/fp32/arm_functions_fp32.h" -#endif - -#define BINS 2048 -#define NUM_IMAGES_INPUT 100 - -void print_help(char* argv[]) -{ - std::cout << "usage: " << argv[0] << " modelPath dataDirectory dataFormat scaleValue cpuAffinityPolicyName algorithmMapPath" << std::endl; -} - -int main(int argc, char* argv[]) -{ -#ifdef _USE_FP16 - UTIL_TIME_INIT - - char* modelPath = (char*)""; - char* dataDir = (char*)""; - char* cpuAffinityPolicyName = (char*)""; - char* algorithmMapPath = (char*)""; - ImageFormat imageFormat = RGB; - DeviceTypeIn device = d_CPU; - F32 scaleValue = 1; - if (argc < 5) { - print_help(argv); - return 1; - } - modelPath = argv[1]; - dataDir = argv[2]; - - imageFormat = (std::string(argv[3]) == std::string("BGR") ? 
BGR : RGB); - if (std::string(argv[3]) == std::string("RGB_SC")) { - imageFormat = RGB_SC; - } else if (std::string(argv[3]) == std::string("BGR_SC_RAW")) { - imageFormat = BGR_SC_RAW; - } else if (std::string(argv[3]) == std::string("RGB_SC_RAW")) { - imageFormat = RGB_SC_RAW; - } - - scaleValue = atof(argv[4]); - - if (argc > 5) { - const char* deviceName = "GPU"; - const char* argvName = argv[5]; - if(strcmp(deviceName, argvName) == 0) { - CHECK_STATUS(NOT_SUPPORTED); - } else { - cpuAffinityPolicyName = argv[5]; - } - } - - if (argc > 6) { - algorithmMapPath = argv[6]; - } - - ModelSpec int8Ms; - deserialize_model_from_file(modelPath, &int8Ms); - CHECK_REQUIREMENT(DT_F16_8Q == int8Ms.dt || DT_F16 == int8Ms.dt); - int8Ms.dt = DT_F16_8Q; - - ModelSpec f16Ms; - deserialize_model_from_file(modelPath, &f16Ms); - f16Ms.dt = DT_F16; - - ModelSpec resultMs; - deserialize_model_from_file(modelPath, &resultMs); - resultMs.dt = DT_F16_8Q; - - auto relationNum = resultMs.num_op_tensor_entries; - auto relationPtr = resultMs.op_relationship_entries; - resultMs.num_op_tensor_entries = 0; - resultMs.op_relationship_entries = nullptr; - - Arch arch = getArch(cpuAffinityPolicyName, device); - - auto int8CNN = createPipelinefromMs(arch, &int8Ms, algorithmMapPath); - auto f16CNN = createPipelinefromMs(arch, &f16Ms, algorithmMapPath); - - // load images - HashMap> inMap = int8CNN->get_inputs(); - TensorDesc imageDesc = (*(inMap.begin()->second)).get_desc(); - Vec imageDescs; - imageDescs.push_back(imageDesc); - Vec> images; - Vec imagePaths = load_image_with_scale(dataDir, imageDescs, &images, imageFormat, scaleValue); - - std::cout << "[Calibration]:" << std::endl; - - Vec dBuf; - //Vec qBuf; - Vec calibratedOpIdx; - - auto curModelInputTensorNames = int8CNN->get_model_input_tensor_names(); - for (int index = 0; index < (int)curModelInputTensorNames.size(); index++) { - int8CNN->copy_to_named_input(curModelInputTensorNames[index], images[0][index].get_val()); - } - - U32 opIdx = int8CNN->find_next_dynamic_scale_op(calibratedOpIdx, 0); - std::map> tensorScale; - - while (0 != opIdx) { - auto op = int8CNN->get_op_by_index(opIdx); - std::string opName = op->get_op_name(); - std::cout << "Calibrating OP " << opIdx << ": " << opName << std::endl; - std::string opsName = int8Ms.ops[opIdx].name; - CHECK_REQUIREMENT(opName == opsName); - - Vec> scales; - auto inputTensors = op->get_input_tensors(); - auto outputTensors = op->get_output_tensors(); - std::cout << " Inputs:\n"; - - for (U32 i = 0; i < int8Ms.ops[opIdx].num_inputs; i++) { - std::string tensorName = int8Ms.ops[opIdx].input_tensors_name[i]; - TensorDesc inDesc = inputTensors[i].get_desc(); - - auto it = tensorScale.find(tensorName); - if (it != tensorScale.end()) { - scales.push_back(tensorScale[tensorName]); - std::cout << " InputTensor " << i << " " << tensorName << " inherits scale " << tensorScale[tensorName][0] << std::endl; - continue; - } - - if (DT_I8 == inDesc.dt) { // Gets scale from int8 pooling or concat. 
Label with -1 - Vec scale; - scale.push_back(-1); - scales.push_back(scale); - tensorScale[tensorName] = scale; - std::cout << " InputTensor " << i << " " << tensorName << " inherits transformed scale " << std::endl; - continue; - } - - U32 dBytes = tensorNumBytes(inDesc); - dBuf.resize(dBytes * NUM_IMAGES_INPUT); - U8 *d = dBuf.data(); - std::vector histogram; - F32 last_max = 0; - F32 interval = 0; - - for (U32 j = 0; j < images.size() ; j++) { - for (int index = 0; index < (int)curModelInputTensorNames.size(); index++) { - int8CNN->copy_to_named_input(curModelInputTensorNames[index], images[j][index].get_val()); - } - - int8CNN->run_till_breakpoint(opIdx); - memcpy(d, inputTensors[i].get_val(), dBytes); - d += dBytes; - - if ((j != images.size()-1) && ((j+1)%NUM_IMAGES_INPUT != 0)) { - continue; - } - - if (j == NUM_IMAGES_INPUT - 1 || ((j == images.size()-1) && (j < NUM_IMAGES_INPUT - 1))) { - DEBUG_info("---------- start getting 1 - "<< j+1 <<" images input tensors ----------"); - F16* ptr_d = (F16*)dBuf.data(); - F32 max = array_maxabs_f16(ptr_d, (I32)(tensorNumElements(inDesc) * (j+1))) ; - DEBUG_info(" " << max << " is the maximum value"); - interval = max / BINS; - histogram.resize(BINS, 0.00001f); - //update histogram first time - update_histogram(tensorNumElements(inDesc)*(j+1), ptr_d , BINS, interval, histogram.data()); - last_max = max; - d = dBuf.data(); - dBuf.clear(); - continue; - } - - if((j+1)%NUM_IMAGES_INPUT == 0 && j != (NUM_IMAGES_INPUT -1)) { - DEBUG_info("---------- start getting " << j+1-100 << " - " << j+1 << " images input tensors ----------"); - F16 *ptr_d = (F16*)dBuf.data(); - F32 max = array_maxabs_f16(ptr_d, (I32)(tensorNumElements(inDesc) * NUM_IMAGES_INPUT)); - if(max <= last_max) { - DEBUG_info(" " << last_max << " is the maximum value"); - interval = last_max / BINS; - //update histogram if no new max - update_histogram(tensorNumElements(inDesc) * NUM_IMAGES_INPUT, ptr_d , BINS, interval, histogram.data()); - } - else { - DEBUG_info(" " << max << " is the maximum value"); - interval = max / BINS; - F32 numPerBin = (F32) max / last_max; - //last_max = max; -> may optimize accuracy. 
- histogram = compress_histogram(histogram, numPerBin, last_max); - last_max = max; - update_histogram((tensorNumElements(inDesc) * NUM_IMAGES_INPUT), ptr_d , BINS, interval, histogram.data()); - } - d = dBuf.data(); - dBuf.clear(); - continue; - } - - if((j == images.size()-1) && ((j+1)%NUM_IMAGES_INPUT != 0)) { - DEBUG_info("---------- start getting " << j+1-((j+1)%NUM_IMAGES_INPUT) << " - " << j+1 << " images input tensors ----------"); - dBuf.resize(dBytes * ((j+1)%NUM_IMAGES_INPUT)); - F16*ptr_d = (F16*)dBuf.data(); - F32 max = array_maxabs_f16(ptr_d, (I32)(tensorNumElements(inDesc) * ((j+1)%NUM_IMAGES_INPUT))); - if(max <= last_max) { - DEBUG_info(" " << last_max << " is the maximum value"); - interval = last_max / BINS; - //update histogram if no new max - update_histogram(tensorNumElements(inDesc) * ((j+1)%NUM_IMAGES_INPUT), ptr_d , BINS, interval, histogram.data()); - } - else { - DEBUG_info(" " << max << " is the maximum value"); - interval = max / BINS; - F32 numPerBin = (F32) max / last_max; - //last_max = max; -> may optimize accuracy - histogram = compress_histogram(histogram, numPerBin, last_max); - last_max = max; - update_histogram((tensorNumElements(inDesc) * NUM_IMAGES_INPUT), ptr_d , BINS, interval, histogram.data()); - } - d = dBuf.data(); - dBuf.clear(); - continue; - } - } - - DEBUG_info("---------- compute KL ----------"); - Vec scale = compute_scale_with_KL(histogram, interval); - DEBUG_info("--------- finish compute KL ---------"); - scales.push_back(scale); - tensorScale[tensorName] = scale; - std::cout << " InputTensor " << i << " " << tensorName << " gets scale " << tensorScale[tensorName][0] << std::endl; - } - - op->set_feature_scale(scales); - std::cout << " Outputs:\n"; - - for (U32 i = 0; i < int8Ms.ops[opIdx].num_outputs; i++) { - std::string tensorName = int8Ms.ops[opIdx].output_tensors_name[i]; - TensorDesc desc = outputTensors[i].get_desc(); - - auto it = tensorScale.find(tensorName); - CHECK_REQUIREMENT(it == tensorScale.end()); - - if (DT_F16 == desc.dt) { - continue; - } - - CHECK_REQUIREMENT(DT_I8 == desc.dt); - - auto opF16 = f16CNN->get_op_by_index(opIdx); - auto outputs = opF16->get_output_tensors(); - - TensorDesc outDesc = outputs[i].get_desc(); - U32 dBytes = tensorNumBytes(outDesc); - dBuf.resize(dBytes * NUM_IMAGES_INPUT); - std::vector histogram; - F32 last_max = 0; - F32 interval = 0; - - U8 *d = dBuf.data(); - - for (U32 j = 0; j < images.size(); j++) { - for (int index = 0; index < (int)curModelInputTensorNames.size(); index++) { - f16CNN->copy_to_named_input(curModelInputTensorNames[index], images[j][index].get_val()); - } - - f16CNN->run_till_breakpoint(opIdx); - memcpy(d, outputs[i].get_val(), dBytes); - d += dBytes; - - - if ((j != images.size()-1) && ((j+1)%NUM_IMAGES_INPUT != 0 )){ - continue; - } - - if (j == NUM_IMAGES_INPUT - 1 || ((j == images.size()-1) && (j < NUM_IMAGES_INPUT - 1))) { - DEBUG_info("---------- start getting 1 - "<< j+1 <<" images output tensors ----------"); - - F16 *ptr_d = (F16*)dBuf.data(); - F32 max = array_maxabs_f16(ptr_d, (I32)(tensorNumElements(outDesc) * (j+1))); - DEBUG_info(" " << max << " is the maximum value"); - interval = max / BINS; - histogram.resize(BINS, 0.00001f); - //update histogram first time - update_histogram(tensorNumElements(outDesc)*(j+1), ptr_d , BINS, interval, histogram.data()); - last_max = max; - d = dBuf.data(); - dBuf.clear(); - continue; - } - - if((j+1)%NUM_IMAGES_INPUT == 0 && j != (NUM_IMAGES_INPUT -1)) { - F16 *ptr_d = (F16*)dBuf.data(); - F32 max = 
array_maxabs_f16(ptr_d, (I32)tensorNumElements(outDesc) * NUM_IMAGES_INPUT); - - DEBUG_info("---------- start getting " << j+1-100 << " - " << j+1 << " images output tensors ----------"); - - if(max <= last_max) { - DEBUG_info(" " << last_max << " is the maximum value"); - interval = last_max / BINS; - //update histogram if no new max - update_histogram(tensorNumElements(outDesc) * NUM_IMAGES_INPUT, ptr_d , BINS, interval, histogram.data()); - } - else { - DEBUG_info(" " << max << " is the maximum value"); - interval = max / BINS; - F32 numPerBin = (F32) max / last_max; - //last_max = max; -> may optimize accuracy - histogram = compress_histogram(histogram, numPerBin, last_max); - last_max = max; - update_histogram(tensorNumElements(outDesc) * NUM_IMAGES_INPUT, ptr_d , BINS, interval, histogram.data()); - } - d = dBuf.data(); - dBuf.clear(); - continue; - } - - if((j == images.size()-1) && ((j+1)%NUM_IMAGES_INPUT != 0)) { - DEBUG_info("---------- start getting " << j+1-((j+1)%NUM_IMAGES_INPUT) << " - " << j+1 << " images output tensors ----------"); - dBuf.resize(dBytes * ((j+1)%NUM_IMAGES_INPUT)); - F16 *ptr_d = (F16*)dBuf.data(); - F32 max = array_maxabs_f16(ptr_d, (I32)(tensorNumElements(outDesc)*((j+1)%NUM_IMAGES_INPUT))); - if(max <= last_max ){ - DEBUG_info(" " << last_max << " is the maximum value"); - interval = last_max / BINS; - //update histogram if no new max - update_histogram(tensorNumElements(outDesc)*((j+1)%NUM_IMAGES_INPUT), ptr_d , BINS, interval, histogram.data()); - } - else { - DEBUG_info(" " << max << " is the maximum value"); - interval = max / BINS; - F32 numPerBin = (F32) max / last_max; - //last_max = max; -> may optimize accuracy - histogram = compress_histogram(histogram, numPerBin, last_max); - last_max = max; - update_histogram(tensorNumElements(outDesc)*((j+1)%NUM_IMAGES_INPUT), ptr_d , BINS, interval, histogram.data()); - } - d = dBuf.data(); - dBuf.clear(); - continue; - } - } - DEBUG_info("---------- compute KL ----------"); - Vec scale = compute_scale_with_KL(histogram,interval); - DEBUG_info("---------- finish compute KL ---------"); - scales.push_back(scale); - tensorScale[tensorName] = scale; - std::cout << " OutputTensor " << i << " " << tensorName << " gets scale " << tensorScale[tensorName][0] << std::endl; - } - if (int8Ms.ops[opIdx].num_quant_feature == 1 && -2 == int8Ms.ops[opIdx].feature_scale[0].scale[0]) { - Vec outputScale; - outputScale.push_back(-2); - scales.push_back(outputScale); - } - - op->set_feature_scale(scales); - - // Store scales into result model - if (nullptr != resultMs.ops[opIdx].feature_scale) { // Could be labelled with -2 - for (U32 i = 0; i < resultMs.ops[opIdx].num_quant_feature; i++) { - if (nullptr != resultMs.ops[opIdx].feature_scale[i].scale) { - delete [] resultMs.ops[opIdx].feature_scale[i].scale; - } - } - delete [] resultMs.ops[opIdx].feature_scale; - } - - resultMs.ops[opIdx].num_quant_feature = scales.size(); - resultMs.ops[opIdx].feature_scale = (QuantSpec*)mt_new_storage(scales.size() * sizeof(QuantSpec)); - - for (U32 i = 0; i < scales.size(); i++) { - resultMs.ops[opIdx].feature_scale[i].num_scale = scales[i].size(); - U32 scaleBytes = scales[i].size() * sizeof(F32); - resultMs.ops[opIdx].feature_scale[i].scale = (F32*)mt_new_storage(scaleBytes); - memcpy(resultMs.ops[opIdx].feature_scale[i].scale, scales[i].data(), scaleBytes); - } - - calibratedOpIdx.push_back(opIdx); - opIdx = int8CNN->find_next_dynamic_scale_op(calibratedOpIdx, opIdx); - } - - print_ms(resultMs); - - std::string modelStorePath = 
std::string(argv[1]); - auto suffixPos = modelStorePath.find(".bolt"); - modelStorePath.erase(suffixPos, 5); - modelStorePath += "_KL.bolt"; - CHECK_STATUS(serialize_model_to_file(&resultMs, modelStorePath.c_str())); - - CHECK_STATUS(mt_destroy_model(&int8Ms)); - CHECK_STATUS(mt_destroy_model(&f16Ms)); - resultMs.num_op_tensor_entries = relationNum; - resultMs.op_relationship_entries = relationPtr; - CHECK_STATUS(mt_destroy_model(&resultMs)); -#endif - return 0; -} diff --git a/tools/tensor_computing_library_search.cpp b/tools/tensor_computing_library_search.cpp deleted file mode 100644 index c101b23c..00000000 --- a/tools/tensor_computing_library_search.cpp +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
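// [editor's aside] ptq_calibration.cpp above builds a per-tensor histogram over
// calibration batches, widening the range whenever a batch yields a new absolute
// maximum (the update_histogram / compress_histogram pair), then derives the
// quantization scale via compute_scale_with_KL. A minimal scalar sketch of the
// histogram bookkeeping; this is an illustrative reimplementation, not the Bolt
// functions (hist must be pre-sized to `bins` and zero-filled by the caller):
#include <algorithm>
#include <cmath>
#include <vector>
inline void accumulate(std::vector<float> &hist, float &curMax,
                       const float *x, int n, int bins)
{
    float batchMax = 0.0f;
    for (int i = 0; i < n; i++) batchMax = std::max(batchMax, std::fabs(x[i]));
    if (curMax <= 0.0f) {
        curMax = batchMax;  // first batch fixes the initial range
    } else if (batchMax > curMax) {
        // a larger maximum widens every bin, so existing counts are
        // redistributed into the wider range (compress_histogram's role)
        std::vector<float> wider(bins, 0.0f);
        for (int b = 0; b < bins; b++) {
            float center = (b + 0.5f) * curMax / bins;       // center of old bin b
            int nb = std::min(bins - 1, (int)(center / (batchMax / bins)));
            wider[nb] += hist[b];
        }
        hist.swap(wider);
        curMax = batchMax;
    }
    if (curMax <= 0.0f) return;  // all-zero tensor, nothing to bin
    float interval = curMax / bins;
    for (int i = 0; i < n; i++) {
        int b = std::min(bins - 1, (int)(std::fabs(x[i]) / interval));
        hist[b] += 1.0f;         // update_histogram equivalent
    }
}
// compute_scale_with_KL then picks the clipping threshold whose quantized
// histogram is closest (in KL divergence) to the original distribution.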
- - -#include "ut_util.h" -#include "tensor_computing.h" -#include "tensor_computing_library_algorithm_search.h" - - -int convolutionCPUFloatAlgorithmSearch(DataType dt){ - TensorDesc inputDesc, filterDesc, outputDesc; - ConvolutionPolicy policy = CONVOLUTION_TUNNING; - ActivationDesc activationDesc; - activationDesc.mode = ACTIVATION_RELU; - activationDesc.value[0] = 0; - ConvolutionDesc convDesc; - convDesc.dilatedRate_h = 1; - convDesc.dilatedRate_w = 1; - U32 in = 1; - for (int ic = 8; ic < libraryAlgorithmParameters["convolution_ic_max"]; - ic+=libraryAlgorithmParameters["convolution_ic_step"]) { - for (int ih = 8; ih < libraryAlgorithmParameters["convolution_ih_max"]; - ih+=libraryAlgorithmParameters["convolution_ih_step"]) { - for (int fn = 8; fn < libraryAlgorithmParameters["convolution_fn_max"]; - fn+=libraryAlgorithmParameters["convolution_fn_step"]) { - for (int fh = 8; fh < libraryAlgorithmParameters["convolution_fh_max"]; - fh+=libraryAlgorithmParameters["convolution_fh_step"]) { - for (int sh = 1; sh < fh; sh++) { - for (int ph = 0; ph < fh; ph++) { - if (ic % 8 != 0) { - inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, ih); - } else { - inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, ih); - } - convDesc.stride_h = sh; - convDesc.stride_w = sh; - convDesc.padding_top = ph; - convDesc.padding_bottom = ph; - convDesc.padding_left = ph; - convDesc.padding_right = ph; - filterDesc = tensor4df(dt, DF_NCHW, fn, ic, fh, fh); - U32 outputBytes = 0; - CHECK_STATUS(convolution_infer_output_size(inputDesc, - filterDesc, convDesc, &outputDesc, dt, &outputBytes, UT_ARCH)); - ConvolutionForwardAlgorithm algorithm = CONVOLUTION_ALGORITHM_NULL; - CHECK_STATUS(convolution_infer_forward_algorithm(inputDesc, - filterDesc, outputDesc, convDesc, policy, &algorithm, dt, activationDesc, UT_ARCH)); - - std::string name = getConvolutionAlgorithmMapNameFromInput(inputDesc, - filterDesc, convDesc, dt); - libraryAlgorithmMap[name] = algorithm; - } - } - } - } - } - } - return 0; -} - - -int main() { -#ifdef _USE_FP16 - convolutionCPUFloatAlgorithmSearch(DT_F16); -#endif -#ifdef _USE_FP32 - convolutionCPUFloatAlgorithmSearch(DT_F32); -#endif - saveLibraryAlgorithmMapToTxt(); - return 0; -} diff --git a/tools/tflite2bolt.cpp b/tools/tflite2bolt.cpp deleted file mode 100644 index 45095abf..00000000 --- a/tools/tflite2bolt.cpp +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
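// [editor's aside] The tuner in tensor_computing_library_search.cpp above fills
// libraryAlgorithmMap keyed by a string derived from the convolution shapes and
// dumps it with saveLibraryAlgorithmMapToTxt. A self-contained sketch of the
// lookup side; key format and names here are illustrative, not the Bolt API:
#include <map>
#include <sstream>
#include <string>
inline std::string convKey(int ic, int ih, int fn, int fh, int sh, int ph)
{
    std::ostringstream os;  // mirrors the idea of getConvolutionAlgorithmMapNameFromInput
    os << "conv_" << ic << '_' << ih << '_' << fn << '_' << fh << '_' << sh << '_' << ph;
    return os.str();
}
static std::map<std::string, int> gAlgoMap;  // tuned algorithm id per shape key
inline int pickAlgorithm(int ic, int ih, int fn, int fh, int sh, int ph)
{
    auto it = gAlgoMap.find(convKey(ic, ih, fn, fh, sh, ph));
    return it == gAlgoMap.end() ? -1 : it->second;  // -1: fall back to online search
}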
- -#include -#include -#include "model_tools.h" -#include "model_serialize_deserialize.hpp" -#include "model_optimizer.hpp" -#include "converter.h" -#include "model_print.h" - -int main(int argc, char* argv[]) { - std::string dir = std::string(argv[1]); - std::string mfn = std::string(argv[2]); - InferencePrecision ip = FP16; - DataConvertType converterMode = F32_to_F16; - if (argc > 3) { - if (std::string(argv[3]) == std::string("INT8_Q")) { - ip = INT8_Q; - converterMode = F32_to_F16; - } else if (std::string(argv[3]) == std::string("FP32")) { - ip = FP32; - converterMode = F32_to_F32; - } - } - - ModelSpec originalMs, targetMs, resultMs; - CHECK_STATUS(mt_create_model(&originalMs)); - CHECK_STATUS(mt_create_model(&targetMs)); - CHECK_STATUS(mt_create_model(&resultMs)); - - CHECK_STATUS(tflite_converter(dir, mfn, &originalMs)); -#ifdef _DEBUG - print_ms(originalMs); -#endif - - //graph_optimizer - ModelSpecOptimizer ms_optimizer; - ms_optimizer.suggest(); - ms_optimizer.optimize(&originalMs); -#ifdef _DEBUG - print_ms(originalMs); -#endif - - //datatype converter - CHECK_STATUS(ms_datatype_converter(&originalMs, &targetMs, converterMode)); -#ifdef _DEBUG - print_ms(targetMs); -#endif - - //serialize ms to ./bolt - std::string modelStorePath = std::string(argv[1]) + "/" + std::string(argv[2]); - switch (ip) { - case INT8_Q: { - modelStorePath += std::string("_int8_q.bolt"); - targetMs.dt = DT_F16_8Q; - CHECK_STATUS(serialize_model_to_file(&targetMs, modelStorePath.c_str())); - break; - } - case FP16: { - modelStorePath += std::string("_f16.bolt"); - CHECK_STATUS(serialize_model_to_file(&targetMs, modelStorePath.c_str())); - break; - } - case FP32: { - modelStorePath += std::string("_f32.bolt"); - CHECK_STATUS(serialize_model_to_file(&targetMs, modelStorePath.c_str())); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - return 0; - } - } - - //deserialize ./bolt to ms in memory - CHECK_STATUS(deserialize_model_from_file(modelStorePath.c_str(), &resultMs)); - print_ms(resultMs); - - CHECK_STATUS(mt_destroy_model(&originalMs)); - CHECK_STATUS(mt_destroy_model(&targetMs)); - CHECK_STATUS(mt_destroy_model(&resultMs)); - - return 0; -} diff --git a/uni/include/arm_neon_expand.h b/uni/include/arm_neon_expand.h deleted file mode 100644 index 489d76b0..00000000 --- a/uni/include/arm_neon_expand.h +++ /dev/null @@ -1,307 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#ifndef _H_ARM_NEON_EXPAND -#define _H_ARM_NEON_EXPAND -#include -#include -#include - -#include "type.h" -#include "error.h" - -#ifndef __aarch64__ -inline float32x4_t vdivq_f32(float32x4_t a, float32x4_t b) -{ - float32x4_t b_recip = vrecpeq_f32(b); - b_recip = vmulq_f32(vrecpsq_f32(b, b_recip), b_recip); - return vmulq_f32(a, b_recip); -} - -inline float vmaxvq_f32(float32x4_t x) -{ - float32x2_t max = vmax_f32(vget_low_f32(x), vget_high_f32(x)); - max = vpmax_f32(max, max); - return vget_lane_f32(max, 0); -} - -#ifndef __ANDROID__ -inline float32x4_t vfmaq_f32(float32x4_t c, float32x4_t a, float32_t b) -{ - return vmlaq_f32(c, a, vdupq_n_f32(b)); -} - -inline float32x4_t vfmaq_n_f32(float32x4_t c, float32x4_t a, float32_t b) -{ - return vfmaq_f32(c, a, vdupq_n_f32(b)); -} -#endif - -inline float vaddvq_f32(float32x4_t x) -{ - float32x2_t sum = vadd_f32(vget_low_f32(x), vget_high_f32(x)); - sum = vpadd_f32(sum, sum); - return vget_lane_f32(sum, 0); -} - -inline unsigned int vaddvq_u32(uint32x4_t x) -{ - uint32x2_t sum = vadd_u32(vget_low_u32(x), vget_high_u32(x)); - sum = vpadd_u32(sum, sum); - return vget_lane_u32(sum, 0); -} -#endif - -inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array &coeffs) -{ - float32x4_t A = vfmaq_f32(coeffs[0], coeffs[4], x); - float32x4_t B = vfmaq_f32(coeffs[2], coeffs[6], x); - float32x4_t C = vfmaq_f32(coeffs[1], coeffs[5], x); - float32x4_t D = vfmaq_f32(coeffs[3], coeffs[7], x); - float32x4_t x2 = vmulq_f32(x, x); - float32x4_t x4 = vmulq_f32(x2, x2); - float32x4_t res = vfmaq_f32(vfmaq_f32(A, B, x2), - vfmaq_f32(C, D, x2), - x4); - return res; -} - -inline float32x4_t vexpq_f32_03_percent_error(float32x4_t x) -{ - const std::array exp_tab = - { - { - vdupq_n_f32(1.f), - vdupq_n_f32(0.0416598916054f), - vdupq_n_f32(0.500000596046f), - vdupq_n_f32(0.0014122662833f), - vdupq_n_f32(1.00000011921f), - vdupq_n_f32(0.00833693705499f), - vdupq_n_f32(0.166665703058f), - vdupq_n_f32(0.000195780929062f), - } - }; - - x = vminq_f32(x, vdupq_n_f32(88.3762626647949f)); - - static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); - static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); - static const float32x4_t CONST_0 = vdupq_n_f32(0.f); - static const int32x4_t CONST_NEGATIVE_14 = vdupq_n_s32(-14); - - int32x4_t m = vcvtq_s32_f32(vmulq_f32(x, CONST_INV_LN2)); - float32x4_t val = vfmsq_f32(x, vcvtq_f32_s32(m), CONST_LN2); - - float32x4_t poly = vtaylor_polyq_f32(val, exp_tab); - - poly = vreinterpretq_f32_s32(vqaddq_s32(vreinterpretq_s32_f32(poly), vqshlq_n_s32(m, 23))); - poly = vbslq_f32(vcltq_s32(m, CONST_NEGATIVE_14), CONST_0, poly); - - return poly; -} - -inline float32x4_t vsigmoidq_f32(float32x4_t x) -{ - float32x4_t one_v = vdupq_n_f32(1.f); - return vrecpeq_f32(vaddq_f32(vexpq_f32_03_percent_error(vnegq_f32(x)), one_v)); -} - -inline float32x4_t vtanhq_f32(float32x4_t x) -{ - float32x4_t one_v = vdupq_n_f32(1.f); - float32x4_t two_v = vdupq_n_f32(2.f); - float32x4_t e_2G_v = vexpq_f32_03_percent_error(vmulq_f32(two_v, x)); - //float32x4_t result_v = vfmsq_f32(one_v, two_v, vrecpeq_f32(vaddq_f32(e_2G_v, one_v))); - float32x4_t result_v = vsubq_f32(one_v, vdivq_f32(two_v, vaddq_f32(one_v, e_2G_v))); - return result_v; -} - -#ifdef _USE_FP16 - -inline float16x8_t vaddq_f16_f32(float16x8_t a, float16x8_t b) -{ -#ifdef _USE_F16_MIX_PRECISION - float32x4_t a0 = vcvt_f32_f16(vget_low_f16(a)); - float32x4_t a1 = vcvt_f32_f16(vget_high_f16(a)); - float32x4_t b0 = vcvt_f32_f16(vget_low_f16(b)); - float32x4_t b1 = 
vcvt_f32_f16(vget_high_f16(b)); - return vcombine_f16(vcvt_f16_f32(vaddq_f32(a0, b0)), vcvt_f16_f32(vaddq_f32(a1, b1))); -#else - return vaddq_f16(a, b); -#endif -} - -inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const std::array &coeffs) -{ - float16x8_t A = vfmaq_f16(coeffs[0], coeffs[4], x); - float16x8_t B = vfmaq_f16(coeffs[2], coeffs[6], x); - float16x8_t C = vfmaq_f16(coeffs[1], coeffs[5], x); - float16x8_t D = vfmaq_f16(coeffs[3], coeffs[7], x); - float16x8_t x2 = vmulq_f16(x, x); - float16x8_t x4 = vmulq_f16(x2, x2); - float16x8_t res = vfmaq_f16(vfmaq_f16(A, B, x2), - vfmaq_f16(C, D, x2), - x4); - return res; -} - -inline float16x8_t vexpq_f16_03_percent_error(float16x8_t x) -{ - const std::array exp_tab = - { - { - vdupq_n_f16(1.f), - vdupq_n_f16(0.0416598916054f), - vdupq_n_f16(0.500000596046f), - vdupq_n_f16(0.0014122662833f), - vdupq_n_f16(1.00000011921f), - vdupq_n_f16(0.00833693705499f), - vdupq_n_f16(0.166665703058f), - vdupq_n_f16(0.000195780929062f), - } - }; - - x = vminq_f16(x, vdupq_n_f16(11.0898664884f)); - - static const float16x8_t CONST_LN2 = vdupq_n_f16(0.6931471805f); - static const float16x8_t CONST_INV_LN2 = vdupq_n_f16(1.4426950408f); - static const float16x8_t CONST_0 = vdupq_n_f16(0.f); - static const int16x8_t CONST_NEGATIVE_14 = vdupq_n_s16(-14); - - int16x8_t m = vcvtq_s16_f16(vmulq_f16(x, CONST_INV_LN2)); - float16x8_t val = vfmsq_f16(x, vcvtq_f16_s16(m), CONST_LN2); - - float16x8_t poly = vtaylor_polyq_f16(val, exp_tab); - - poly = vreinterpretq_f16_s16(vqaddq_s16(vreinterpretq_s16_f16(poly), vqshlq_n_s16(m, 10))); - poly = vbslq_f16(vcltq_s16(m, CONST_NEGATIVE_14), CONST_0, poly); - - return poly; -} - -inline float16x8_t vexpq_f16_4_percent_error_half_time(float16x8_t x) -{ - x = vminq_f16(x, vdupq_n_f16(11.0898664884f)); - static const float16x8_t CONST_Y = vdupq_n_f16(1477.3197217792); - static const float16x8_t CONST_B = vdupq_n_f16(15301.3197217792); - float16x8_t in1, in3; - int16x8_t in2; - x = vmaxq_f16(x, vdupq_n_f16(-10)); - in1 = vfmaq_f16(CONST_B, CONST_Y, x); - in2 = vcvtq_s16_f16(in1); - in3 = vreinterpretq_f16_s16(in2); - return in3; -} - - -inline float16x8_t vexpq_f16_f32(float16x8_t a) -{ -#ifdef _USE_F16_MIX_PRECISION - float32x4_t a0 = vcvt_f32_f16(vget_low_f16(a)); - float32x4_t a1 = vcvt_f32_f16(vget_high_f16(a)); - return vcombine_f16(vcvt_f16_f32(vexpq_f32_03_percent_error(a0)), vcvt_f16_f32(vexpq_f32_03_percent_error(a1))); -#else - return vexpq_f16_03_percent_error(a); -#endif -} - -inline float16x8_t vsigmoidq_f16(float16x8_t x) -{ -#ifdef _USE_F16_MIX_PRECISION - float32x4_t x0 = vcvt_f32_f16(vget_low_f16(x)); - float32x4_t x1 = vcvt_f32_f16(vget_high_f16(x)); - float16x8_t y = vcombine_f16(vcvt_f16_f32(vsigmoidq_f32(x0)), - vcvt_f16_f32(vsigmoidq_f32(x1))); - return y; -#else - float16x8_t one_v = vdupq_n_f16(1.f); - return vrecpeq_f16(vaddq_f16_f32(vexpq_f16_03_percent_error(vnegq_f16(x)), one_v)); -#endif -} - -inline float16x8_t vtanhq_f16(float16x8_t x) -{ -#ifdef _USE_F16_MIX_PRECISION - float32x4_t x0 = vcvt_f32_f16(vget_low_f16(x)); - float32x4_t x1 = vcvt_f32_f16(vget_high_f16(x)); - float16x8_t y = vcombine_f16(vcvt_f16_f32(vtanhq_f32(x0)), - vcvt_f16_f32(vtanhq_f32(x1))); - return y; -#else - float16x8_t one_v = vdupq_n_f16(1.f); - float16x8_t two_v = vdupq_n_f16(2.f); - float16x8_t e_2G_v = vexpq_f16_03_percent_error(vmulq_f16(two_v, x)); - //float16x8_t result_v = vfmsq_f16(one_v, two_v, vrecpeq_f16(vaddq_f16(e_2G_v, one_v))); - float16x8_t result_v = vsubq_f16(one_v, vdivq_f16(two_v, vaddq_f16(one_v, 
e_2G_v))); - return result_v; -#endif -} - -inline F32 vaddvq_f16(float16x8_t x) -{ - float32x4_t a = vcvt_f32_f16(vget_high_f16(x)); - float32x4_t b = vcvt_f32_f16(vget_low_f16(x)); - F32 sum = vaddvq_f32(vaddq_f32(a, b)); - return sum; -} - -inline void vst1q_lane_f16_builtin(F16* address, float16x8_t vec, const int laneId) { - switch (laneId) { - case 0: - vst1q_lane_f16(address, vec, 0); - break; - case 1: - vst1q_lane_f16(address, vec, 1); - break; - case 2: - vst1q_lane_f16(address, vec, 2); - break; - case 3: - vst1q_lane_f16(address, vec, 3); - break; - case 4: - vst1q_lane_f16(address, vec, 4); - break; - case 5: - vst1q_lane_f16(address, vec, 5); - break; - case 6: - vst1q_lane_f16(address, vec, 6); - break; - case 7: - vst1q_lane_f16(address, vec, 7); - break; - default: - CHECK_REQUIREMENT(0); - } -} -#endif - -#ifdef _USE_INT8 -inline int32x4_t vdotq_laneq_s32_builtin(int32x4_t c, int8x16_t a, int8x16_t b, const int laneId) { - switch (laneId) { - case 0: - return vdotq_laneq_s32(c, a, b, 0); - case 1: - return vdotq_laneq_s32(c, a, b, 1); - case 2: - return vdotq_laneq_s32(c, a, b, 2); - case 3: - return vdotq_laneq_s32(c, a, b, 3); - default: - CHECK_REQUIREMENT(0); - } -} -#endif -#endif diff --git a/uni/include/error.h b/uni/include/error.h deleted file mode 100644 index e12b32e3..00000000 --- a/uni/include/error.h +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_ERROR -#define _H_ERROR - -#include -#include -#include -#include - -#if defined(_DEBUG) && defined(__ANDROID__) -#include -#define LOG_TAG "Bolt" -#define LOGD(...) __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, __VA_ARGS__) -#else -#define LOGD(...) 
printf(__VA_ARGS__) -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -#if defined(_DEBUG) && defined(__ANDROID__) - #define CHECK_REQUIREMENT(status) if (!(status)) {\ - LOGD("[ERROR] %s %s line %d requirement mismatch\n", __FILE__, __func__, __LINE__);\ - } -#else - #define CHECK_REQUIREMENT(status) if (!(status)) {\ - LOGD("[ERROR] %s %s line %d requirement mismatch\n", __FILE__, __func__, __LINE__);\ - exit(1);\ - } -#endif - -#if defined(_DEBUG) && defined(__ANDROID__) - #define CHECK_STATUS(ee) {\ - EE status = (ee); \ - if (status != SUCCESS) {\ - LOGD("[ERROR] %s %s line %d got an error: %s\n", __FILE__, __func__, __LINE__, ee2str(status));\ - }\ - } -#else - #define CHECK_STATUS(ee) {\ - EE status = (ee); \ - if (status != SUCCESS) {\ - LOGD("[ERROR] %s %s line %d got an error: %s\n", __FILE__, __func__, __LINE__, ee2str(status));\ - exit(1);\ - }\ - } -#endif - - typedef enum { - SUCCESS = 0, - NULL_POINTER = 1, - NOT_MATCH = 2, - NOT_FOUND = 3, - ALLOC_FAILED = 4, - NOT_IMPLEMENTED = 50, - NOT_SUPPORTED = 51, - GCL_ERROR = 52, - UNKNOWN = 99 - } EE; - - inline const char* ee2str(EE ee) { - const char* ret = 0; - switch (ee) { - case SUCCESS: - ret = "SUCCESS"; - break; - case NULL_POINTER: - ret = "Null Pointer"; - break; - case NOT_MATCH: - ret = "Not Match"; - break; - case NOT_FOUND: - ret = "Not Found"; - break; - case NOT_IMPLEMENTED: - ret = "Not Implemented"; - break; - case NOT_SUPPORTED: - ret = "Not Supported"; - break; - default: - ret = "Unknown"; - break; - } - return ret; - } - - #define CI_info(x) do { std::cout << x << std::endl; } while (0) - - #ifdef _DEBUG - #define DEBUG_info(x) do { std::cout << x << std::endl; } while (0) - #define DEBUG_info_s(x) do { std::cout << x << " "; } while (0) - #else - #define DEBUG_info(x) do { } while (0) - #define DEBUG_info_s(x) do { } while (0) - #endif - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/uni/include/op_type.h b/uni/include/op_type.h deleted file mode 100644 index 6731297c..00000000 --- a/uni/include/op_type.h +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
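// [editor's aside] Typical use of the error.h facilities deleted above (a sketch
// assuming error.h is included; do_work is a hypothetical helper returning EE):
#include <stdio.h>
static EE do_work(int *p) { return (p != NULL) ? SUCCESS : NULL_POINTER; }
static void demo()
{
    int x = 0;
    CHECK_STATUS(do_work(&x));              // logs and exits on failure in release builds
    CHECK_REQUIREMENT(x == 0);              // requirement check with file/line logging
    printf("%s\n", ee2str(NOT_SUPPORTED));  // prints "Not Supported"
}
// Note the _DEBUG + __ANDROID__ variants only log via LOGD instead of exiting.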
- - -#ifndef _H_OP_TYPE -#define _H_OP_TYPE - -#ifdef __cplusplus -extern "C" { -#endif - -// please add OperatorType and OperatorTypeName at the same time - typedef enum { - OT_Conv, - OT_FC, - OT_Pooling, - OT_Relu, - OT_Relu6, - OT_HSwish, - OT_HSigmoid, - OT_Eltwise, - OT_Softmax, - OT_Concat, - - OT_MaxOut, - OT_BatchNorm, - OT_Sigmoid, - OT_Scale, - OT_Clip, - OT_LSTM, - OT_Embedding, - OT_SoftmaxWithLoss, - OT_Pad, - OT_Gelu, - - OT_TanH, - OT_LayerNorm, - OT_MatMul, - OT_Multiply, - OT_Reshape, - OT_Slice, - OT_Transpose, - OT_Attention, - OT_Input, - OT_Squeeze, - - OT_Gather, - OT_Unsqueeze, - OT_Upsample, - OT_Cast, - OT_Logistic, - OT_BilateralSliceApply, - OT_Resize, - OT_Deconvolution, - OT_Constant, - OT_ResizeBilinear, - - OT_PreAllocatedMemory, - OT_SharedWeight, - OT_Copy, - OT_Check, - OT_Repeat, - OT_Reduction, - OT_ArgMax, - OT_None, - OT_Interp, - OT_Flatten, - - OT_Jump, - OT_Space2Depth, - OT_Depth2Space, - OT_AttentionMask, - OT_RelativePositionEmbedding, - OT_RelativeShift, - OT_TfSlice, - OT_Permute, - OT_LogSoftmax, - OT_PriorBox, - OT_DetectionOutput - } OperatorType; - - inline const char * const *OperatorTypeName() { - static const char * const names[] = { - "OT_Conv", - "OT_FC", - "OT_Pooling", - "OT_Relu", - "OT_Relu6", - "OT_HSwish", - "OT_HSigmoid", - "OT_Eltwise", - "OT_Softmax", - "OT_Concat", - - "OT_MaxOut", - "OT_BatchNorm", - "OT_Sigmoid", - "OT_Scale", - "OT_Clip", - "OT_LSTM", - "OT_Embedding", - "OT_SoftmaxWithLoss", - "OT_Pad", - "OT_Gelu", - - "OT_TanH", - "OT_LayerNorm", - "OT_MatMul", - "OT_Multiply", - "OT_Reshape", - "OT_Slice", - "OT_Transpose", - "OT_Attention", - "OT_Input", - "OT_Squeeze", - - "OT_Gather", - "OT_Unsqueeze", - "OT_Upsample", - "OT_Cast", - "OT_Logistic", - "OT_BilateralSliceApply", - "OT_Resize", - "OT_Deconvolution", - "OT_Constant", - "OT_ResizeBilinear", - - "OT_PreAllocatedMemory", - "OT_SharedWeight", - "OT_Copy", - "OT_Check", - "OT_Repeat", - "OT_Reduction", - "OT_ArgMax", - "OT_None", - "OT_Interp", - "OT_Flatten", - - "OT_Jump", - "OT_Space2Depth", - "OT_Depth2Space", - "OT_AttentionMask", - "OT_RelativePositionEmbedding", - "OT_RelativeShift", - "OT_TfSlice", - "OT_Permute", - "OT_LogSoftmax", - "OT_PriorBox", - "OT_DetectionOutput" - }; - return names; - } - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/uni/include/sys.h b/uni/include/sys.h deleted file mode 100644 index 3f376930..00000000 --- a/uni/include/sys.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
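// [editor's aside] The OperatorType enum and OperatorTypeName() table in
// op_type.h above are kept aligned by position, which is why the header asks
// that both lists be extended together. A usage sketch (assumes op_type.h
// is included):
#include <stdio.h>
static void print_op(OperatorType t)
{
    printf("%s\n", OperatorTypeName()[t]);  // e.g. OT_Conv prints "OT_Conv"
}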
IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _H_SYS
-#define _H_SYS
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-    typedef enum {
-        CPU_GENERAL = 1,
-        MALI = 2,
-        ARM_V7 = 3,
-        ARM_V8 = 4,
-        ARM_A55 = 5,
-        ARM_A76 = 6,
-    } Arch;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/uni/include/tensor_desc.h b/uni/include/tensor_desc.h
deleted file mode 100644
index 553d570c..00000000
--- a/uni/include/tensor_desc.h
+++ /dev/null
@@ -1,315 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- - -#ifndef _H_TENSOR_DESC -#define _H_TENSOR_DESC -#include -#include -#include -#include "type.h" -#include "error.h" - - typedef enum { - DF_NCHW, - DF_NCHWN16, //vectorize for N=16, for filter - DF_NCHWC8, //vectorize for C=8, for input and output - DF_HWNCN16, //vectorize for N=16, for filter in winograd - DF_NHWCN16, // im2col + GEMM, for filter - DF_NHWCN8, //vectorize for N=8, not used - DF_HWNCN8C4, //int8 filter for winograd - DF_NCHWN8C4, //int8 im2col + GEMM, for filter - DF_NCHWN8HW4, //int8 im2col + GEMM in the first layer, for filter - DF_NCHWN16C8, //bnn im2col + GEMM, for filter - DF_TRANSPOSE, //vectorize for COL_MAJOR - DF_NORMAL, //vectorize for ROW_MAJOR - DF_MTK, // LSTM input, M: batch, T: step, K: x_dim - DF_MKT, // LSTM input, M: batch, T: step, K: x_dim - DF_NK, // MMM/MVM filter, N: col_num, K: row_num - DF_NKN32, // MMM/MVM filter, vectorized for N=32 - DF_CHW_NC, // dw_conv, CHW means dw part, NC means pw part - DF_CHWC8_NCN16, // dw_conv, vectorized for C8 and N16 - DF_CHWC8_NCN8C4, // int8 dw_conv, vectorized for C4 and N8 - DF_NCWHC4, //ocl mali input and output - DF_NCHWC3, //ocl mali support input rgb - DF_NHWC, //ocl mali support input/output - DF_NCHWN4C4, //ocl mali conv filter - DF_NCHWN4, //ocl mali conv filter - DF_HWCN , //ocl mali filter - DF_NCWHN4C4, //ocl mali fc filter - DF_NHWCN4, //ocl mali filter - DF_CHWNC4, //ocl mali filter - DF_CHWNC8, //ocl mali filter - DF_CHWNC16, //ocl mali filter - DF_CHWC8_NCN8, // fp32 dw_conv, vectorized for C8 and N8 - DF_RGB, - DF_HWNCN8, // fp32 filter for winograd - DF_NKN24, // Optimized MMM filter for FP16 -#ifdef __aarch64__ - DF_NKN12, // Optimized MMM filter for FP32 -#else - DF_NKN8, // Optimized MMM filter for FP32 -#endif - DF_NKN12K4, // Optimized MMM filter for INT8 - DF_NCHW_ORG_MALI // mark first layer for mali - } DataFormat; - - typedef struct { - DataType dt = DT_U8; - DataFormat df; - U32 nDims = 0; - U32 dims[6] = {0}; - } TensorDesc; - - /** - * @param num the number of filter or image, not count for the last dim for vectorize - * - **/ - inline TensorDesc tensor4df(DataType dt, DataFormat df, U32 num, U32 numChannels, U32 height, U32 width) { - TensorDesc ret; - ret.dt = dt; - ret.df = df; - ret.nDims = 4; - ret.dims[0] = width; - ret.dims[1] = height; - ret.dims[2] = numChannels; - ret.dims[3] = num; - return ret; - } - - inline TensorDesc tensor4d(DataType dt, U32 num, U32 numChannels, U32 height, U32 width) - { - return tensor4df(dt, DF_NCHW, num, numChannels, height, width); - } - - inline TensorDesc tensor3df(DataType dt, DataFormat df, U32 numChannels, U32 height, U32 width) - { - TensorDesc ret = tensor4df(dt, df, 1, numChannels, height, width); - ret.nDims = 3; - return ret; - } - - inline TensorDesc tensor3d(DataType dt, U32 numChannels, U32 height, U32 width) - { - return tensor3df(dt, DF_NCHW, numChannels, height, width); - } - - inline TensorDesc tensor2df(DataType dt, DataFormat df, U32 numRows, U32 numColumns) - { - TensorDesc ret = tensor3df(dt, df, 1, numRows, numColumns); - ret.nDims = 2; - return ret; - } - - inline TensorDesc tensor2d(DataType dt, U32 numRows, U32 numColumns) - { - TensorDesc ret = tensor3d(dt, 1, numRows, numColumns); - ret.nDims = 2; - return ret; - } - - inline TensorDesc tensor1d(DataType dt, U32 len) - { - TensorDesc ret = tensor2d(dt, 1, len); - ret.nDims = 1; - return ret; - } - - inline EE tensor1dGet(TensorDesc desc, DataType* dt, U32* len) - { - if (nullptr == len || nullptr == dt) { - return NULL_POINTER; - } - if (1 != desc.nDims) { - return 
NOT_MATCH; - } - - *dt = desc.dt; - *len = desc.dims[0]; - return SUCCESS; - } - - inline EE tensor2dfGet(TensorDesc desc, DataType* dt, DataFormat *df, U32* numRows, U32* numColumns) - { - if (nullptr == numColumns || nullptr == numRows || nullptr == dt || nullptr == df) { - return NULL_POINTER; - } - if (2 != desc.nDims) { - return NOT_MATCH; - } - - *df = desc.df; - *dt = desc.dt; - *numColumns = desc.dims[0]; - *numRows = desc.dims[1]; - return SUCCESS; - } - - inline EE tensor2dGet(TensorDesc desc, DataType* dt, U32* numRows, U32* numColumns) - { - if (nullptr == numColumns || nullptr == numRows || nullptr == dt) { - return NULL_POINTER; - } - if (2 != desc.nDims) { - return NOT_MATCH; - } - - *dt = desc.dt; - *numColumns = desc.dims[0]; - *numRows = desc.dims[1]; - return SUCCESS; - } - - inline EE tensor3dGet(TensorDesc desc, DataType* dt, DataFormat *df, U32* numChannels, U32* height, U32* width) - { - if (nullptr == numChannels || nullptr == height || nullptr == width || nullptr == dt || nullptr == df) { - return NULL_POINTER; - } - if (3 != desc.nDims) { - return NOT_MATCH; - } - - *dt = desc.dt; - *df = desc.df; - *width = desc.dims[0]; - *height = desc.dims[1]; - *numChannels = desc.dims[2]; - return SUCCESS; - } - - inline EE tensor4dGet(TensorDesc desc, DataType* dt, DataFormat *df, U32* num, U32* numChannels, U32* height, U32* width) - { - if (nullptr == num || nullptr == numChannels || nullptr == height || nullptr == width || nullptr == dt || nullptr == df) { - return NULL_POINTER; - } - if (4 != desc.nDims) { - return NOT_MATCH; - } - - *dt = desc.dt; - *df = desc.df; - *width = desc.dims[0]; - *height = desc.dims[1]; - *numChannels = desc.dims[2]; - *num = desc.dims[3]; - return SUCCESS; - } - - inline EE tensorSelectGet(TensorDesc desc, DataType* dt, DataFormat *df, U32* num, U32* numChannels, U32* height, U32* width) - { - if (dt) *dt = desc.dt; - if (df) *df = desc.df; - if (width) *width = desc.dims[0]; - if (height) *height = desc.dims[1]; - if (numChannels) *numChannels = desc.dims[2]; - if (num) *num = desc.dims[3]; - return SUCCESS; - } - - inline U32 tensorNumElements(TensorDesc desc) - { - if (desc.nDims == 0) return 0; - U32 ret = 1; - if (1 <= desc.nDims) ret *= desc.dims[0]; - if (2 <= desc.nDims) ret *= desc.dims[1]; - if (3 <= desc.nDims) ret *= desc.dims[2]; - if (4 <= desc.nDims) ret *= desc.dims[3]; - if (5 <= desc.nDims) ret *= desc.dims[4]; - - return ret; - } - - inline U32 tensorNumBytes(TensorDesc desc) - { - if (desc.dt == DT_BIN01 || desc.dt == DT_BIN11) { - return tensorNumElements(desc) / 8; - } else { - return tensorNumElements(desc) * bytesOf(desc.dt); - } - } - - inline U8 tensorIs1d(TensorDesc desc) { - return 1 == desc.nDims; - } - - inline U8 tensorIs2d(TensorDesc desc) { - return 2 == desc.nDims; - } - - inline U8 tensorIs3d(TensorDesc desc) { - return 3 == desc.nDims; - } - - inline U8 tensorIs4d(TensorDesc desc) { - return 4 == desc.nDims; - } - - inline std::string tensorDesc2Str(TensorDesc desc) - { - char buff[128]; - snprintf(buff, sizeof(buff), "dt:%d df:%d dims:%d", desc.dt, desc.df, desc.nDims); - std::string descStr = buff; - - if (desc.nDims > 0) { - descStr += "("; - } - for (I32 i = int(desc.nDims) - 1; i >= 0; i--) { - descStr += std::to_string(desc.dims[i]); - if (i > 0) { - descStr += ","; - } else { - descStr += ")"; - } - } - - return descStr; - } - - inline int tensorDescIsValid(TensorDesc desc) - { - if (desc.dt < 0 || desc.dt >= 10) - return 0; - - if (desc.df < 0 || desc.df >= 30) - return 0; - - if (desc.nDims > 6) 
-            return 0;
-
-        for (U32 i = 0; i < desc.nDims; i++) {
-            if (desc.dims[i] > INT_MAX)
-                return 0;
-        }
-
-        return 1;
-    }
-
-    inline DataFormat getTensorDefaultDataFormat(int nDims)
-    {
-        DataFormat df = DF_NORMAL;
-        switch (nDims) {
-            case 2:
-                df = DF_NORMAL;
-                break;
-            case 3:
-                df = DF_MTK;
-                break;
-            case 4:
-                df = DF_NCHW;
-                break;
-            default:
-                break;
-        }
-        return df;
-    }
-#endif
diff --git a/uni/include/thread_affinity.h b/uni/include/thread_affinity.h
deleted file mode 100644
index 8d00986c..00000000
--- a/uni/include/thread_affinity.h
+++ /dev/null
@@ -1,431 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _H_THREAD_AFFINITY
-#define _H_THREAD_AFFINITY
-
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/syscall.h>
-#include <sched.h>
-#include "sys.h"
-
-
-typedef enum {
-    CPU_AFFINITY_LOW_POWER = 0,
-    CPU_AFFINITY_HIGH_PERFORMANCE = 1
-} CpuAffinityPolicy;
-
-typedef struct CpuStat {
-    unsigned long idle;
-    unsigned long total;
-} CpuStat;
-
-inline const char * const *CpuAffinityPolicyNames()
-{
-    static const char * const names[] = {
-        "CPU_AFFINITY_LOW_POWER",
-        "CPU_AFFINITY_HIGH_PERFORMANCE"
-    };
-    return names;
-}
-
-inline const CpuAffinityPolicy* CpuAffinityPolicies()
-{
-    static const CpuAffinityPolicy policies[] = {
-        CPU_AFFINITY_LOW_POWER,
-        CPU_AFFINITY_HIGH_PERFORMANCE
-    };
-    return policies;
-}
-
-inline int get_cpus_num()
-{
-    const int bufferSize = 1024;
-    char buffer[bufferSize];
-    FILE* fp = fopen("/proc/cpuinfo", "rb");
-    if (!fp) {
-        return 1;
-    }
-
-    int cpuNum = 0;
-    while (!feof(fp)) {
-        char* status = fgets(buffer, bufferSize, fp);
-        if (!status)
-            break;
-
-        if (memcmp(buffer, "processor", 9) == 0) {
-            cpuNum++;
-        }
-    }
-    fclose(fp);
-    return cpuNum;
-}
-
-inline void get_cpus_arch(int cpuNum, Arch *archs)
-{
-    const int bufferSize = 1024;
-    char buffer[bufferSize];
-    FILE* fp = fopen("/proc/cpuinfo", "rb");
-    *archs = CPU_GENERAL;
-    if (!fp) {
-        return;
-    }
-
-    int cpuid = 0;
-    while (!feof(fp)) {
-        char* status = fgets(buffer, bufferSize, fp);
-        if (!status)
-            break;
-
-        if (memcmp(buffer, "CPU part", 8) == 0) {
-            Arch arch = ARM_V8;
-            int id = 0;
-            sscanf(buffer, "CPU part\t: %x", &id);
-            switch (id) {
-                case 0xc07:
-                    arch = ARM_V7;
-                    break;
-                case 0xd03:
-                    arch = ARM_V8;
-                    break;
-                case 0xd05:
-                    arch = ARM_A55;
-                    break;
-                case 0xd07:
-                    arch = ARM_V8;
-                    break;
-                case 0xd08:
-                    arch = ARM_V8;
-                    break;
-                case 0xd09:
-                    arch = ARM_V8;
-                    break;
-                case 0xd0a:
-                    arch = ARM_A76;
-                    break;
-                case 0xd40:
-                    arch = ARM_A76;
-                    break;
-                case 0xd41:
-                    arch = ARM_A76;
-                    break;
-                case 0xd44:
-                    arch = ARM_A76;
-                    break;
-                case 0x804:
-                    arch = ARM_A76;
-                    break;
-                case 0x805:
-                    arch = ARM_A55;
-                    break;
-                case 0x802:
-                    arch = ARM_A76;
-                    break;
-                case 0x803:
-                    arch = ARM_A55;
-                    break;
-                case 0x801:
-                    arch = ARM_V8;
-                    break;
-                case 0x800:
-                    arch = ARM_V8;
-                    break;
-                case 0x205:
-                    arch = ARM_V8;
-                    break;
-                default:
-                    printf("[WARNING] unknown CPU %d arch %x\n Default to ARM_V8\n", cpuid, id);
-                    break;
-            }
-            archs[cpuid++] = arch;
-        }
-    }
-    for (; cpuid < cpuNum; cpuid++) {
-        archs[cpuid] = archs[0];
-    }
-    fclose(fp);
-}
-
-inline long get_cpu_freq(int cpuid)
-{
-    char path[256];
-    FILE *fp = NULL;
-    if (fp == NULL) {
-        snprintf(path,
-            sizeof(path),
-            "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state",
-            cpuid);
-        fp = fopen(path, "rb");
-    }
-    if (fp == NULL) {
-        snprintf(path,
-            sizeof(path),
-            "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",
-            cpuid);
-        fp = fopen(path, "rb");
-    }
-    if (fp == NULL) {
-        snprintf(path,
-            sizeof(path),
-            "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
-            cpuid);
-        fp = fopen(path, "rb");
-    }
-
-    long maxFrequency = -1;
-    if (fp == NULL) {
-        printf("[WARNING] can not get CPU max frequency\n");
-    } else {
-        fscanf(fp, "%ld", &maxFrequency);
-        fclose(fp);
-    }
-    return maxFrequency;
-}
-
-inline void get_cpus_freq(long *freqs, int cpuNum)
-{
-    for (int i = 0; i < cpuNum; i++) {
-        freqs[i] = get_cpu_freq(i);
-    }
-}
-
-inline int get_cpus_stat(CpuStat *cpuStat, int cpuNum)
-{
-    const int bufferSize = 1024;
-    char buffer[bufferSize];
-    char name[32];
-    unsigned long user, nice, system, idle, iowait, irq, softirq;
-    FILE* fp = fopen("/proc/stat", "rb");
-    if (!fp) {
-        return 0;
-    }
-
-    // skip total statistics
-    fgets(buffer, bufferSize, fp);
-
-    for (int i = 0; i < cpuNum; i++) {
-        fgets(buffer, bufferSize, fp);
-        sscanf(buffer, "%s %lu %lu %lu %lu %lu %lu %lu", name, &user, &nice,
-            &system, &idle, &iowait, &irq, &softirq);
-        cpuStat[i].idle = idle;
-        cpuStat[i].total = user + nice + system + idle + iowait + irq + softirq;
-    }
-    fclose(fp);
-    return cpuNum;
-}
-
-inline void get_cpus_occupy(CpuStat *firstCpuStat, CpuStat *secondCpuStat, int cpuNum, float* cpuOccupy)
-{
-    for (int i = 0; i < cpuNum; i++) {
-        float idle = secondCpuStat[i].idle - firstCpuStat[i].idle;
-        float total = secondCpuStat[i].total - firstCpuStat[i].total;
-        if (total != 0) {
-            cpuOccupy[i] = 1.0 - idle / total;
-        } else {
-            cpuOccupy[i] = 0;
-        }
-    }
-}
-
-inline void swap_variable(void* a, void *b, const int size)
-{
-    char buffer[size];
-    memcpy(buffer, a, size);
-    memcpy(a, b, size);
-    memcpy(b, buffer, size);
-}
-
-inline void disable_cpus(float *occupys, int *cpuids, int cpuNum, float cpuOccupyMax)
-{
-    for (int i = 0; i < cpuNum; i++) {
-        if (occupys[i] > cpuOccupyMax)
-            cpuids[i] = -1;
-    }
-}
-
-inline void sort_cpus_by_arch_freq_occupy(Arch *archs, long *freqs, float *occupys,
-    int *cpuids, int cpuNum, float cpuOccupyMax)
-{
-    for (int i = 0; i < cpuNum; i++) {
-        cpuids[i] = i;
-    }
-
-    for (int i = 1; i < cpuNum; i++) {
-        for (int j = i - 1; j >= 0; j--) {
-            if (archs[j+1] < archs[j]) {
-                swap_variable(&archs[j], &archs[j+1], sizeof(Arch));
-                swap_variable(&freqs[j], &freqs[j+1], sizeof(long));
-                swap_variable(&cpuids[j], &cpuids[j+1], sizeof(int));
-                swap_variable(&occupys[j], &occupys[j+1], sizeof(float));
-                continue;
-            }
-            if (archs[j+1] == archs[j]) {
-                if (freqs[j+1] < freqs[j]) {
-                    swap_variable(&archs[j], &archs[j+1], sizeof(Arch));
-                    swap_variable(&freqs[j], &freqs[j+1], sizeof(long));
-                    swap_variable(&cpuids[j], &cpuids[j+1], sizeof(int));
-                    swap_variable(&occupys[j], &occupys[j+1], sizeof(float));
-                    continue;
-                }
-                if (freqs[j+1] >= freqs[j]) {
-                    continue;
-                }
-            }
-            if (archs[j+1] > archs[j]) {
-                continue;
-            }
-        }
-    }
-    disable_cpus(occupys, cpuids, cpuNum, cpuOccupyMax);
-}
-
-inline int set_thread_affinity(int threadid, int cpuid) {
-#ifdef _DEBUG
-    printf("[INFO] bind thread %d to core %d\n", threadid, cpuid);
-#else
-    UNUSED(threadid);
-#endif
-#ifdef __GLIBC__
-    pid_t tid = syscall(SYS_gettid);
-#else
-    pid_t tid = gettid();
-#endif
-    cpu_set_t mask;
-    CPU_ZERO(&mask);
-    CPU_SET(cpuid, &mask);
-    int status = syscall(__NR_sched_setaffinity, tid, sizeof(mask), &mask);
-    if (status) {
-        printf("[WARNING] fail to set affinity %d\n", status);
-        return -1;
-    }
-    return 0;
-}
-
-inline CpuAffinityPolicy thread_affinity_get_policy_by_name(const char *name)
-{
-    int nameLength = strlen(name);
-    for (int i = 0; i < 2; i++) {
-        const char *target = CpuAffinityPolicyNames()[i];
-        int targetLength = strlen(target);
-        if (nameLength < targetLength) continue;
-        int match = 1;
-        for (int j = 0; j < targetLength; j++) {
-            if (name[j] == target[j] || name[j] == target[j] + 32) {
-                continue;
-            } else {
-                match = 0;
-                break;
-            }
-        }
-        if (match) {
-            return CpuAffinityPolicies()[i];
-        }
-    }
-    return CPU_AFFINITY_HIGH_PERFORMANCE;
-}
-
-inline void thread_affinity_init(int *cpuNum, Arch **archs, int **cpuids) {
-    float cpuOccupyMax = 0.5;
-    *cpuNum = get_cpus_num();
-    CpuStat* firstCpuStats = (CpuStat *)malloc(sizeof(CpuStat) * (*cpuNum));
-    get_cpus_stat(firstCpuStats, *cpuNum);
-    CpuStat* secondCpuStats = (CpuStat *)malloc(sizeof(CpuStat) * (*cpuNum));
-    float* occupys = (float *)malloc(sizeof(float) * (*cpuNum));
-    *archs = (Arch *)malloc(sizeof(Arch) * (*cpuNum));
-    *cpuids = (int *)malloc(sizeof(int) * (*cpuNum));
-    long *freqs = (long *)malloc(sizeof(long) * (*cpuNum));
-    get_cpus_arch(*cpuNum, *archs);
-    get_cpus_freq(freqs, *cpuNum);
-    get_cpus_stat(secondCpuStats, *cpuNum);
-    get_cpus_occupy(firstCpuStats, secondCpuStats, *cpuNum, occupys);
-    sort_cpus_by_arch_freq_occupy(*archs, freqs, occupys, *cpuids, *cpuNum, cpuOccupyMax);
-    free(firstCpuStats);
-    free(secondCpuStats);
-    free(occupys);
-    free(freqs);
-}
-
-inline Arch thread_affinity_set_by_policy(int cpuNum, Arch *archs, int *cpuids, CpuAffinityPolicy policy, int threadId) {
-    if (threadId >= cpuNum) {
-        printf("[WARNING] can not allocate more cores for thread %d\n", threadId);
-        return CPU_GENERAL;
-    }
-
-    int cpuid;
-    Arch arch;
-    int i = cpuNum - 1 - threadId;
-    switch (policy) {
-        case CPU_AFFINITY_LOW_POWER: {
-            i = threadId;
-            while(cpuids[i] == -1 && i < cpuNum - 1) {
-                i++;
-            }
-            break;
-        }
-        case CPU_AFFINITY_HIGH_PERFORMANCE: {
-            i = cpuNum - 1 - threadId;
-            while(cpuids[i] == -1 && i > 0) {
-                i--;
-            }
-            break;
-        }
-        default: {
-            break;
-        }
-    }
-    cpuid = cpuids[i];
-    arch = archs[i];
-    set_thread_affinity(threadId, cpuid);
-    return arch;
-}
-
-inline void thread_affinity_set_by_arch(int cpuNum, Arch *archs, int *cpuids, Arch arch, int threadId)
-{
-    if (threadId >= cpuNum) {
-        printf("[WARNING] can not allocate more cores for thread %d\n", threadId);
-        return;
-    }
-    int count = 0;
-    int cpuid = -1;
-    for (int i=0; i < cpuNum; i++) {
-        if (archs[i] == arch && cpuids[i] != -1) {
-            if (count == threadId) {
-                cpuid = cpuids[i];
-                break;
-            } else {
-                count++;
-            }
-        }
-    }
-    if (cpuid != -1) {
-        set_thread_affinity(threadId, cpuid);
-    } else {
-        printf("[WARNING] there is not enough %d arch cores for thread %d", arch, threadId);
-    }
-}
-
-inline void thread_affinity_destroy(int *cpuNum, Arch **archs, int **cpuids) {
-    if (*cpuids != NULL) {
-        free(*cpuids);
-        *cpuids = NULL;
-    }
-    if (*archs != NULL) {
-        free(*archs);
-        *archs = NULL;
-    }
-    *cpuNum = 0;
-}
-#endif
diff --git a/uni/include/type.h b/uni/include/type.h
deleted file mode 100644
index 31fc588f..00000000
--- a/uni/include/type.h
+++ /dev/null
@@ -1,154 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _H_TYPE
-#define _H_TYPE
-
-    #include <stdint.h>
-    #define UNUSED(x) (void)x
-    #define UNI_MIN(a, b) (((a) < (b))?(a):(b))
-    #define UNI_MAX(a, b) (((a) > (b))?(a):(b))
-    #define UNI_F16_MIN -65504.0f
-    #define UNI_F16_MAX 65504.0f
-    #define NAME_LEN 128
-
-    #include <math.h>
-#ifdef __clang__
-    #define UNI_ISNAN(a) isnan((a))
-    #define UNI_ISINF(a) isinf((a))
-#else
-    #define UNI_ISNAN(a) std::isnan((a))
-    #define UNI_ISINF(a) std::isinf((a))
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-    typedef enum {
-        RGB_SC = 0, // scale and center crop
-        RGB = 1,
-        BGR = 2,
-        RGB_RAW = 3,
-        RGB_SC_RAW = 4,
-        BGR_SC_RAW = 5
-    } ImageFormat;
-
-    typedef enum {
-        DT_U8 = 0,
-        DT_I8 = 1,
-        DT_U32 = 2,
-        DT_I32 = 3,
-        DT_F16 = 4,
-        DT_F16_8Q = 5,
-        DT_F32 = 6,
-        DT_BIN01 = 7,
-        DT_BIN11 = 8,
-        DT_NUM = 9
-    } DataType;
-
-    typedef unsigned char U8;
-    typedef const unsigned char CU8;
-    typedef char I8;
-    typedef const char CI8;
-    typedef int8_t INT8;
-    typedef unsigned int U32;
-    typedef const unsigned int CU32;
-    typedef int I32;
-    typedef const int CI32;
-    typedef float F32;
-    typedef double F64;
-    typedef long I64;
-#ifdef __aarch64__
-    typedef __fp16 F16;
-#endif
-    typedef unsigned char BIN8;
-
-    inline U32 bytesOf(DataType dt) {
-        U32 bytes[] = {1, 1, 4, 4, 2, 2, 4, 1, 1, 8}; // Please divide number of elements by 8 first in the case of binary data types
-        return dt < DT_NUM ? bytes[dt] : 0;
-    }
-
-    typedef enum {
-        POOLING_MAX,
-        POOLING_MEAN
-    } PoolingMode;
-
-    typedef enum {
-        CEIL,
-        FLOOR
-    } RoundMode;
-
-    typedef enum {
-        Relu
-    } EltwiseType;
-
-    typedef enum {
-        ELTWISE_SUM,
-        ELTWISE_MAX,
-        ELTWISE_PROD
-    } EltwiseMode;
-
-    typedef enum {
-        ACTIVATION_RELU,
-        ACTIVATION_RELU6,
-        ACTIVATION_H_SWISH,
-        ACTIVATION_H_SIGMOID,
-        ACTIVATION_SIGMOID,
-        ACTIVATION_TANH,
-        ACTIVATION_GELU,
-        ACTIVATION_NULL
-    } ActivationMode;
-
-    typedef enum{
-        BSliceApply_CONV,
-        BSliceApply_NULL
-    } BilateralSliceApplyMode;
-
-    typedef enum {
-        Convolution_Pointwise,
-        Convolution_Dilation,
-        Convolution_Depthwise,
-        Convolution_Depthwise_Pointwise,
-        Convolution_Deconvolution
-    } ConvolutionMode;
-
-    typedef enum {
-        Pad_Constant,
-        Pad_Reflect,
-        Pad_Edge
-    } PadMode;
-
-    typedef enum {
-        FP16,
-        INT8_Q,
-        FP32
-    } InferencePrecision;
-
-    typedef enum {
-        CHECK_EQUAL,
-        CHECK_GREATEQUAL,
-        CHECK_GREAT
-    } CheckMode;
-
-    typedef enum {
-        REDUCTION_SUM,
-        REDUCTION_MEAN
-    } ReductionMode;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/uni/include/ut_util.h b/uni/include/ut_util.h
deleted file mode 100644
index 5ac8dde7..00000000
--- a/uni/include/ut_util.h
+++ /dev/null
@@ -1,332 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#ifndef _H_UT_UTIL
-#define _H_UT_UTIL
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string>
-#include <iostream>
-#include <sys/time.h>
-
-#include "sys.h"
-#include "type.h"
-#include "error.h"
-
-const Arch UT_ARCH = ARM_A76;
-
-// whether to check right
-const int UT_CHECK = 1;
-
-// loop times to benchmark
-const int UT_LOOPS = 6;
-
-// init data type
-typedef enum UT_RANDOM_TYPE{
-    UT_INIT_RANDOM, // random
-    UT_INIT_NEG, // random & < 0
-    UT_INIT_POS, // random & > 0
-    UT_INIT_ZERO // 0
-} UT_RANDOM_TYPE;
-
-// generate random data
-inline F32 ut_init_s(DataType dt, UT_RANDOM_TYPE type) {
-    if (type == UT_INIT_ZERO) {
-        return 0;
-    }
-
-    F32 s = 0;
-    if (0
-#ifdef _USE_FP32
-        || dt == DT_F32
-#endif
-#ifdef _USE_FP16
-        || dt == DT_F16
-#endif
-    ) {
-        s = rand() % 1000 / 1000.0 - 0.5;
-    } else {
-        s = rand() % 100 - 50;
-    }
-
-    if (type == UT_INIT_NEG) {
-        s = (s > 0) ? (s * -1) : s;
-    }
-    if (type == UT_INIT_POS) {
-        s = (s < 0) ? (s * -1) : s;
-    }
-    return s;
-}
-
-
-// generate random array
-inline void ut_init_v(U8* data, U32 len, DataType dt, UT_RANDOM_TYPE type) {
-    if (data == nullptr)
-        return;
-
-    for (U32 i = 0; i < len; i++) {
-        switch (dt) {
-#ifdef _USE_FP32
-            case DT_F32: {
-                F32 *dataPtr = (F32 *)data;
-                dataPtr[i] = ut_init_s(dt, type);
-                break;
-            }
-#endif
-#ifdef _USE_FP16
-            case DT_F16: {
-                F16 *dataPtr = (F16 *)data;
-                dataPtr[i] = ut_init_s(dt, type);
-                break;
-            }
-#endif
-            case DT_I32: {
-                I32 *dataPtr = (I32 *)data;
-                dataPtr[i] = ut_init_s(dt, type);
-                break;
-            }
-            case DT_U32: {
-                U32 *dataPtr = (U32 *)data;
-                dataPtr[i] = ut_init_s(dt, type);
-                break;
-            }
-            case DT_I8: {
-                INT8 *dataPtr = (INT8 *)data;
-                dataPtr[i] = ut_init_s(dt, type);
-                break;
-            }
-            case DT_BIN11: {
-                BIN8 *dataPtr = (BIN8 *)data;
-                dataPtr[i] = ut_init_s(dt, type);
-                break;
-            }
-            case DT_BIN01: {
-                BIN8 *dataPtr = (BIN8 *)data;
-                dataPtr[i] = ut_init_s(dt, type);
-                break;
-            }
-            default:
-                std::cerr << "[ERROR] unsupported data type in ut_init_v" << std::endl;
-                exit(1);
-        }
-    }
-}
-
-inline U8* ut_input_v(U32 len, DataType dt, UT_RANDOM_TYPE type) {
-    U8* data = (U8*)malloc(len * bytesOf(dt));
-    ut_init_v(data, len, dt, type);
-
-    return data;
-}
-
-
-// unit test element check
-inline void ut_check_s(F32 a, F32 b, F32 threshold, std::string file, int line)
-{
-    if (!((a <= b + threshold) && (a >= b - threshold))) {
-        std::cerr << "[ERROR] check in " << file << " at line " << line << " " \
-            << a << " " << b << std::endl;
-        exit(1);
-    }
-}
-
-
-// unit test array check
-inline void ut_check_v(void *A, void *B, U32 len, DataType dt, F32 threshold, std::string file, int line) {
-    F32 a = 0, b = 0;
-    for (U32 i = 0; i < len; i++) {
-        switch (dt) {
-#ifdef _USE_FP32
-            case DT_F32:
-                a = ((F32 *)A)[i];
-                b = ((F32 *)B)[i];
-                break;
-#endif
-#ifdef _USE_FP16
-            case DT_F16:
-                a = ((F16 *)A)[i];
-                b = ((F16 *)B)[i];
-                break;
-#endif
-            case DT_I32:
-                a = ((I32 *)A)[i];
-                b = ((I32 *)B)[i];
-                break;
-            case DT_U32:
-                a = ((U32 *)A)[i];
-                b = ((U32 *)B)[i];
-                break;
-            case DT_I8:
-                a = ((INT8 *)A)[i];
-                b = ((INT8 *)B)[i];
-                break;
-            case DT_BIN11:
-                a = ((BIN8 *)A)[i];
-                b = ((BIN8 *)B)[i];
-                break;
-            case DT_BIN01:
-                a = ((BIN8 *)A)[i];
-                b = ((BIN8 *)B)[i];
-                break;
-            default:
-                std::cerr << "[ERROR] unsupported data type in ut_check_v(array, array) " << std::endl;
-                exit(1);
-        }
-        ut_check_s(a, b, threshold, file, line);
-    }
-}
-
-inline void ut_check_v(void *A, F32 val, U32 len, DataType dt, std::string file, int line) {
-    F32 a;
-    for (U32 i = 0; i < len; i++) {
-        switch (dt) {
-#ifdef _USE_FP32
-            case DT_F32:
-                a = ((F32 *)A)[i];
-                break;
-#endif
-#ifdef _USE_FP16
-            case DT_F16:
-                a = ((F16 *)A)[i];
-                break;
-#endif
-            case DT_I32:
-                a = ((I32 *)A)[i];
-                break;
-            case DT_U32:
-                a = ((U32 *)A)[i];
-                break;
-            case DT_BIN11:
-                a = ((BIN8 *)A)[i];
-                break;
-            case DT_BIN01:
-                a = ((BIN8 *)A)[i];
-                break;
-            default:
-                std::cerr << "[ERROR] unsupported data type in ut_check_v(array, scalar) " << std::endl;
-                exit(1);
-        }
-        ut_check_s(a, val, 0, file, line);
-    }
-}
-
-inline void ut_check_a(void* A, void* B, U32 len, DataType dt) {
-    U32 e0, e1, e2, e3, e4, e5, e6;
-    e0 = 0; e1 = 0; e2 = 0; e3 = 0; e4 = 0; e5 = 0; e6 = 0;
-    F32 a, b, diff;
-    F32 d0, d1, d2, d3, d4, d5;
-    F32 maxrel = -1.0;
-    F32 maxabs = -1.0;
-    F32 max_a0, max_b0, max_a1, max_b1;
-    U32 max_n0, max_n1;
-
-
-    switch(dt) {
-#ifdef _USE_FP16
-        case DT_F16:
-            d0 = 1; d1 = 0.1; d2 = 0.01; d3 = 0.001; d4 = 0.0001; d5 = 0.00001;
-            break;
-#endif
-        case DT_U8:
-            d0 = 30; d1 = 20; d2 = 10; d3 = 5; d4 = 3; d5 = 2;
-            break;
-
-        default:
-            std::cerr << "[ERROR] unsupported data type in ut_check_a(array, array) " << std::endl;
-            exit(1);
-    }
-
-    for(U32 i = 0; i < len; i++) {
-        switch(dt) {
-#ifdef _USE_FP16
-            case DT_F16:
-                a = ((F16*)A)[i];
-                b = ((F16*)B)[i];
-
-                break;
-#endif
-            case DT_U8:
-                a = ((U8*)A)[i];
-                b = ((U8*)B)[i];
-                diff = a - b;
-                break;
-            default:
-                break;
-        }
-        diff = a - b;
-        if(diff < 0) diff = -diff;
-        if(diff > maxabs) {
-            maxabs = diff;
-            max_a0 = a;
-            max_b0 = b;
-            max_n0 = i;
-        }
-        F32 tmp = diff * 2 / (a + b + 0.000001);
-        if(tmp > maxrel) {
-            maxrel = tmp;
-            max_a1 = a;
-            max_b1 = b;
-            max_n1 = i;
-        }
-        if(diff >= d0) {e0++; continue;}
-        if(diff >= d1) {e1++; continue;}
-        if(diff >= d2) {e2++; continue;}
-        if(diff >= d3) {e3++; continue;}
-        if(diff >= d4) {e4++; continue;}
-        if(diff >= d5) {e5++; continue;}
-        e6++;
-    }
-    std::cout << "abs(diff) >= " << std::scientific << d0 << " number = " << std::dec << e0 << std::endl;
-    std::cout << "abs(diff) >= " << std::scientific << d1 << " number = " << std::dec << e1 << std::endl;
-    std::cout << "abs(diff) >= " << std::scientific << d2 << " number = " << std::dec << e2 << std::endl;
-    std::cout << "abs(diff) >= " << std::scientific << d3 << " number = " << std::dec << e3 << std::endl;
-    std::cout << "abs(diff) >= " << std::scientific << d4 << " number = " << std::dec << e4 << std::endl;
-    std::cout << "abs(diff) >= " << std::scientific << d5 << " number = " << std::dec << e5 << std::endl;
-    std::cout << "others number = " << e6 << std::endl;
-    std::cout << "number " << max_n0 << " is "<< "maxabs = " << std::fixed << maxabs << " a = " << max_a0 << " b = " << max_b0 <